prithivMLmods committed on
Commit
1c5b159
·
verified ·
1 Parent(s): 0d38f12

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -30
app.py CHANGED
@@ -79,28 +79,25 @@ docscopeocr_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
79
 
80
  # Main Inference Function
81
  @spaces.GPU
82
- def model_inference(message, history, use_docscopeocr):
83
- text = message["text"].strip()
84
- files = message.get("files", [])
85
-
86
  if not text and not files:
87
  yield "Error: Please input a text query or provide files (images, videos, PDFs)."
88
  return
89
 
90
  # Process files: images, videos, PDFs
91
  image_list = []
92
- for idx, file in enumerate(files):
93
- if file.lower().endswith(".pdf"):
94
  try:
95
- pdf_images = convert_from_path(file)
96
  for page_num, img in enumerate(pdf_images, start=1):
97
  label = f"PDF {idx+1} Page {page_num}:"
98
  image_list.append((label, img))
99
  except Exception as e:
100
  yield f"Error converting PDF: {str(e)}"
101
  return
102
- elif file.lower().endswith((".mp4", ".avi", ".mov")):
103
- frames = downsample_video(file)
104
  if not frames:
105
  yield "Error: Could not extract frames from the video."
106
  return
@@ -109,7 +106,7 @@ def model_inference(message, history, use_docscopeocr):
109
  image_list.append((label, frame))
110
  else:
111
  try:
112
- img = load_image(file)
113
  label = f"Image {idx+1}:"
114
  image_list.append((label, img))
115
  except Exception as e:
@@ -156,28 +153,42 @@ def model_inference(message, history, use_docscopeocr):
156
  yield buffer
157
 
158
  # Gradio Interface
 
 
 
 
 
159
  examples = [
160
- [{"text": "OCR the Text in the Image", "files": ["rolm/1.jpeg"]}],
161
- [{"text": "Explain the Ad in Detail", "files": ["examples/videoplayback.mp4"]}],
162
- [{"text": "OCR the Image", "files": ["rolm/3.jpeg"]}],
163
- [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
164
  ]
165
 
166
# Previous revision's UI (removed by this commit): a single gr.ChatInterface
# whose multimodal textbox accepted image/video/pdf uploads and streamed
# model_inference output; the model toggle rode in via additional_inputs.
demo = gr.ChatInterface(
    fn=model_inference,
    description="# **DocScope OCR `VL/OCR`**",
    examples=examples,
    textbox=gr.MultimodalTextbox(
        label="Query Input",
        file_types=["image", "video", "pdf"],
        file_count="multiple",
        placeholder="Input your query and optionally upload image(s), video(s), or PDF(s). Select the model using the checkbox."
    ),
    stop_btn="Stop Generation",
    multimodal=True,
    cache_examples=False,
    theme="bethecloud/storj_theme",
    additional_inputs=[gr.Checkbox(label="Use DocScopeOCR", value=True, info="Check to use DocScopeOCR, uncheck to use Qwen2VL OCR")],
)
 
 
 
 
 
 
 
 
 
182
 
183
  demo.launch(debug=True, ssr_mode=False)
 
79
 
80
  # Main Inference Function
81
  @spaces.GPU
82
+ def model_inference(text, files, history, use_docscopeocr):
 
 
 
83
  if not text and not files:
84
  yield "Error: Please input a text query or provide files (images, videos, PDFs)."
85
  return
86
 
87
  # Process files: images, videos, PDFs
88
  image_list = []
89
+ for idx, file in enumerate(files or []):
90
+ if file.name.lower().endswith(".pdf"):
91
  try:
92
+ pdf_images = convert_from_path(file.name)
93
  for page_num, img in enumerate(pdf_images, start=1):
94
  label = f"PDF {idx+1} Page {page_num}:"
95
  image_list.append((label, img))
96
  except Exception as e:
97
  yield f"Error converting PDF: {str(e)}"
98
  return
99
+ elif file.name.lower().endswith((".mp4", ".avi", ".mov")):
100
+ frames = downsample_video(file.name)
101
  if not frames:
102
  yield "Error: Could not extract frames from the video."
103
  return
 
106
  image_list.append((label, frame))
107
  else:
108
  try:
109
+ img = load_image(file.name)
110
  label = f"Image {idx+1}:"
111
  image_list.append((label, img))
112
  except Exception as e:
 
153
  yield buffer
154
 
155
  # Gradio Interface
156
def chat_interface(text, files, use_docscopeocr, history):
    """Validate the user query, then delegate to model_inference.

    Returns either an error string or the lazy generator produced by
    model_inference; the caller is expected to iterate it for streaming.
    """
    # Truthiness check ("not text") also rejects empty strings and empty
    # file lists, matching model_inference's own input guard; the previous
    # "text is None and files is None" test let an all-empty submission
    # (e.g. text == "") fall through to the model.
    if not text and not files:
        return "Error: Please input a text query or provide files."
    return model_inference(text, files, history, use_docscopeocr)
160
+
161
# Sample prompts shown in the UI. The "files" paths are repo-relative assets
# (rolm/, examples/) and must match the files shipped with the Space exactly.
examples = [
    {"text": "OCR the Text in the Image", "files": ["rolm/1.jpeg"]},
    {"text": "Explain the Ad in Detail", "files": ["examples/videoplayback.mp4"]},
    {"text": "OCR the Image", "files": ["rolm/3.jpeg"]},
    {"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]},
]
167
 
168
# Gradio Blocks UI: a query textbox plus file uploads feed model_inference,
# with streamed responses rendered into a Chatbot.
with gr.Blocks(theme="bethecloud/storj_theme") as demo:
    gr.Markdown("# **DocScope OCR `VL/OCR`**")
    with gr.Row():
        text_input = gr.Textbox(label="Query Input", placeholder="Input your query here.")
        file_input = gr.File(label="Upload Files", file_count="multiple", file_types=["image", "video", "pdf"])
    use_docscopeocr = gr.Checkbox(label="Use DocScopeOCR", value=True, info="Check to use DocScopeOCR, uncheck to use Qwen2VL OCR")
    # type="messages" because the handlers append {"role": ..., "content": ...}
    # dicts; the default tuple format would reject them.
    chat = gr.Chatbot(type="messages")
    submit_btn = gr.Button("Submit")
    stop_btn = gr.Button("Stop Generation")

    def submit(text, files, use_docscopeocr, history):
        # Record the user's turn. Copy instead of mutating the component's
        # list in place so Gradio reliably detects the state change.
        history = list(history or [])
        history.append({"role": "user", "content": text})
        return history

    def generate(history, text, files, use_docscopeocr):
        # Stream model output into a SINGLE assistant message: model_inference
        # yields a cumulative buffer, so we overwrite the last message rather
        # than appending one message per chunk.
        history = list(history or [])
        history.append({"role": "assistant", "content": ""})
        for response in model_inference(text, files, history, use_docscopeocr):
            history[-1]["content"] = response
            yield history

    # Chain the two steps so generate() sees the history submit() produced;
    # two independent .click() handlers would run concurrently on stale state.
    # (Also drops the old submit() outputs that permanently disabled the
    # Submit button via gr.update(interactive=False).)
    gen_event = submit_btn.click(
        submit, [text_input, file_input, use_docscopeocr, chat], chat
    ).then(
        generate, [chat, text_input, file_input, use_docscopeocr], chat
    )
    # Wire the stop button to actually cancel in-flight generation.
    stop_btn.click(None, None, None, cancels=[gen_event])
193
 
194
  demo.launch(debug=True, ssr_mode=False)