prithivMLmods committed on
Commit
1c5b159
·
verified ·
1 Parent(s): 0d38f12

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -30
app.py CHANGED
@@ -79,28 +79,25 @@ docscopeocr_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
79
 
80
  # Main Inference Function
81
  @spaces.GPU
82
- def model_inference(message, history, use_docscopeocr):
83
- text = message["text"].strip()
84
- files = message.get("files", [])
85
-
86
  if not text and not files:
87
  yield "Error: Please input a text query or provide files (images, videos, PDFs)."
88
  return
89
 
90
  # Process files: images, videos, PDFs
91
  image_list = []
92
- for idx, file in enumerate(files):
93
- if file.lower().endswith(".pdf"):
94
  try:
95
- pdf_images = convert_from_path(file)
96
  for page_num, img in enumerate(pdf_images, start=1):
97
  label = f"PDF {idx+1} Page {page_num}:"
98
  image_list.append((label, img))
99
  except Exception as e:
100
  yield f"Error converting PDF: {str(e)}"
101
  return
102
- elif file.lower().endswith((".mp4", ".avi", ".mov")):
103
- frames = downsample_video(file)
104
  if not frames:
105
  yield "Error: Could not extract frames from the video."
106
  return
@@ -109,7 +106,7 @@ def model_inference(message, history, use_docscopeocr):
109
  image_list.append((label, frame))
110
  else:
111
  try:
112
- img = load_image(file)
113
  label = f"Image {idx+1}:"
114
  image_list.append((label, img))
115
  except Exception as e:
@@ -156,28 +153,42 @@ def model_inference(message, history, use_docscopeocr):
156
  yield buffer
157
 
158
  # Gradio Interface
 
 
 
 
 
159
  examples = [
160
- [{"text": "OCR the Text in the Image", "files": ["rolm/1.jpeg"]}],
161
- [{"text": "Explain the Ad in Detail", "files": ["examples/videoplayback.mp4"]}],
162
- [{"text": "OCR the Image", "files": ["rolm/3.jpeg"]}],
163
- [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
164
  ]
165
 
166
# Previous revision's UI (removed by this commit): a single gr.ChatInterface
# whose multimodal textbox accepted image/video/pdf uploads and streamed
# model_inference output; the model toggle rode in via additional_inputs.
demo = gr.ChatInterface(
    fn=model_inference,
    description="# **DocScope OCR `VL/OCR`**",
    examples=examples,
    textbox=gr.MultimodalTextbox(
        label="Query Input",
        file_types=["image", "video", "pdf"],
        file_count="multiple",
        placeholder="Input your query and optionally upload image(s), video(s), or PDF(s). Select the model using the checkbox."
    ),
    stop_btn="Stop Generation",
    multimodal=True,
    cache_examples=False,
    theme="bethecloud/storj_theme",
    additional_inputs=[gr.Checkbox(label="Use DocScopeOCR", value=True, info="Check to use DocScopeOCR, uncheck to use Qwen2VL OCR")],
)
 
 
 
 
 
 
 
 
 
182
 
183
  demo.launch(debug=True, ssr_mode=False)
 
79
 
80
  # Main Inference Function
81
  @spaces.GPU
82
+ def model_inference(text, files, history, use_docscopeocr):
 
 
 
83
  if not text and not files:
84
  yield "Error: Please input a text query or provide files (images, videos, PDFs)."
85
  return
86
 
87
  # Process files: images, videos, PDFs
88
  image_list = []
89
+ for idx, file in enumerate(files or []):
90
+ if file.name.lower().endswith(".pdf"):
91
  try:
92
+ pdf_images = convert_from_path(file.name)
93
  for page_num, img in enumerate(pdf_images, start=1):
94
  label = f"PDF {idx+1} Page {page_num}:"
95
  image_list.append((label, img))
96
  except Exception as e:
97
  yield f"Error converting PDF: {str(e)}"
98
  return
99
+ elif file.name.lower().endswith((".mp4", ".avi", ".mov")):
100
+ frames = downsample_video(file.name)
101
  if not frames:
102
  yield "Error: Could not extract frames from the video."
103
  return
 
106
  image_list.append((label, frame))
107
  else:
108
  try:
109
+ img = load_image(file.name)
110
  label = f"Image {idx+1}:"
111
  image_list.append((label, img))
112
  except Exception as e:
 
153
  yield buffer
154
 
155
  # Gradio Interface
156
def chat_interface(text, files, use_docscopeocr, history):
    """Validate the user query, then delegate to model_inference.

    Returns either an error string or the lazy generator produced by
    model_inference; the caller is expected to iterate it for streaming.
    """
    # Truthiness check ("not text") also rejects empty strings and empty
    # file lists, matching model_inference's own input guard; the previous
    # "text is None and files is None" test let an all-empty submission
    # (e.g. text == "") fall through to the model.
    if not text and not files:
        return "Error: Please input a text query or provide files."
    return model_inference(text, files, history, use_docscopeocr)
160
+
161
# Sample prompts shown in the UI. The "files" paths are repo-relative assets
# (rolm/, examples/) and must match the files shipped with the Space exactly.
examples = [
    {"text": "OCR the Text in the Image", "files": ["rolm/1.jpeg"]},
    {"text": "Explain the Ad in Detail", "files": ["examples/videoplayback.mp4"]},
    {"text": "OCR the Image", "files": ["rolm/3.jpeg"]},
    {"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]},
]
167
 
168
# Gradio Blocks UI: a query textbox plus file uploads feed model_inference,
# with streamed responses rendered into a Chatbot.
with gr.Blocks(theme="bethecloud/storj_theme") as demo:
    gr.Markdown("# **DocScope OCR `VL/OCR`**")
    with gr.Row():
        text_input = gr.Textbox(label="Query Input", placeholder="Input your query here.")
        file_input = gr.File(label="Upload Files", file_count="multiple", file_types=["image", "video", "pdf"])
    use_docscopeocr = gr.Checkbox(label="Use DocScopeOCR", value=True, info="Check to use DocScopeOCR, uncheck to use Qwen2VL OCR")
    # type="messages" because the handlers append {"role": ..., "content": ...}
    # dicts; the default tuple format would reject them.
    chat = gr.Chatbot(type="messages")
    submit_btn = gr.Button("Submit")
    stop_btn = gr.Button("Stop Generation")

    def submit(text, files, use_docscopeocr, history):
        # Record the user's turn. Copy instead of mutating the component's
        # list in place so Gradio reliably detects the state change.
        history = list(history or [])
        history.append({"role": "user", "content": text})
        return history

    def generate(history, text, files, use_docscopeocr):
        # Stream model output into a SINGLE assistant message: model_inference
        # yields a cumulative buffer, so we overwrite the last message rather
        # than appending one message per chunk.
        history = list(history or [])
        history.append({"role": "assistant", "content": ""})
        for response in model_inference(text, files, history, use_docscopeocr):
            history[-1]["content"] = response
            yield history

    # Chain the two steps so generate() sees the history submit() produced;
    # two independent .click() handlers would run concurrently on stale state.
    # (Also drops the old submit() outputs that permanently disabled the
    # Submit button via gr.update(interactive=False).)
    gen_event = submit_btn.click(
        submit, [text_input, file_input, use_docscopeocr, chat], chat
    ).then(
        generate, [chat, text_input, file_input, use_docscopeocr], chat
    )
    # Wire the stop button to actually cancel in-flight generation.
    stop_btn.click(None, None, None, cancels=[gen_event])
193
 
194
  demo.launch(debug=True, ssr_mode=False)