xiaoyao9184 committed on
Commit
3bbc408
·
verified ·
1 Parent(s): 2f98410

Synced repo using 'sync_with_huggingface' Github Action

Browse files

original:
- remote: "https://github.com/xiaoyao9184/docker-marker"
- commit: "53922256b5d246d3f11148c1a3dac0048a6fef4f"
sync_with_huggingface:
- repository: ""
- ref: ""

Files changed (2) hide show
  1. gradio_app.py +40 -3
  2. requirements.txt +1 -1
gradio_app.py CHANGED
@@ -117,7 +117,7 @@ with gr.Blocks(title="Marker") as demo:
117
  )
118
 
119
  page_range_txt = gr.Textbox(label="Page range to parse, comma separated like 0,5-10,20", value=f"")
120
- output_format_dd = gr.Dropdown(label="Output format", choices=["markdown", "json", "html"], value="markdown")
121
 
122
  use_llm_ckb = gr.Checkbox(label="Use LLM", value=False, info="Use LLM for higher quality processing")
123
  force_ocr_ckb = gr.Checkbox(label="Force OCR", value=True, info="Force OCR on all pages")
@@ -186,7 +186,7 @@ with gr.Blocks(title="Marker") as demo:
186
  )
187
 
188
  output_format_dd.change(
189
- fn=lambda x: gr.update(interactive=x == "json", value=x == "json"),
190
  inputs=[output_format_dd],
191
  outputs=[show_blocks_ckb],
192
  api_name=False
@@ -201,7 +201,7 @@ with gr.Blocks(title="Marker") as demo:
201
  filename (str): Path to the input PDF file.
202
  page_range (str): Page range to process (e.g., "0-5").
203
  force_ocr (bool, optional): If True (default), force OCR even on text-based PDFs.
204
- output_format (str, optional): Output format. One of: "markdown", "html", "json".
205
  Defaults to "markdown".
206
  show_blocks (bool, optional): If True, show blocks in preview image with JSON output.
207
  Defaults to False.
@@ -314,6 +314,43 @@ with gr.Blocks(title="Marker") as demo:
314
  gr_debug_lay,
315
  gr_img
316
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
 
318
  run_marker_btn.click(
319
  fn=run_marker_img,
 
117
  )
118
 
119
  page_range_txt = gr.Textbox(label="Page range to parse, comma separated like 0,5-10,20", value=f"")
120
+ output_format_dd = gr.Dropdown(label="Output format", choices=["markdown", "json", "html", "chunks"], value="markdown")
121
 
122
  use_llm_ckb = gr.Checkbox(label="Use LLM", value=False, info="Use LLM for higher quality processing")
123
  force_ocr_ckb = gr.Checkbox(label="Force OCR", value=True, info="Force OCR on all pages")
 
186
  )
187
 
188
  output_format_dd.change(
189
+ fn=lambda x: gr.update(interactive=x == "json" or x == "chunks", value=x == "json" or x == "chunks",),
190
  inputs=[output_format_dd],
191
  outputs=[show_blocks_ckb],
192
  api_name=False
 
201
  filename (str): Path to the input PDF file.
202
  page_range (str): Page range to process (e.g., "0-5").
203
  force_ocr (bool, optional): If True (default), force OCR even on text-based PDFs.
204
+ output_format (str, optional): Output format. One of: "markdown", "html", "json", "chunks".
205
  Defaults to "markdown".
206
  show_blocks (bool, optional): If True, show blocks in preview image with JSON output.
207
  Defaults to False.
 
314
  gr_debug_lay,
315
  gr_img
316
  ]
317
+ elif output_format == "chunks":
318
+ if show_blocks:
319
+ doc_json = json.loads(text)
320
+ color_map = {}
321
+ sections = []
322
+ def traverse(block):
323
+ if "block_type" in block:
324
+ try:
325
+ index = list(BlockTypes.__members__).index(block["block_type"])
326
+ color = COLORS[index % len(COLORS)]
327
+ except (ValueError, IndexError):
328
+ color = "#cccccc" # fallback color
329
+
330
+ label = block["id"].replace("/page/0/", "")
331
+ color_map[label] = color
332
+
333
+ bbox = tuple(int(x) for x in block["bbox"])
334
+ sections.append((bbox, label))
335
+ if "blocks" in block and isinstance(block["blocks"], list):
336
+ for child in block["blocks"]:
337
+ traverse(child)
338
+ traverse(doc_json)
339
+
340
+ page_range = config_parser.generate_config_dict()["page_range"]
341
+ first_page = page_range[0] if page_range else 0
342
+ img = get_page_image(filename, first_page + 1, dpi=72)
343
+
344
+ gr_img = gr.update(value=(img, sections), color_map=color_map)
345
+
346
+ return [
347
+ gr.update(visible=False),
348
+ gr.update(visible=True, value=text),
349
+ gr.update(visible=False),
350
+ gr_debug_pdf,
351
+ gr_debug_lay,
352
+ gr_img
353
+ ]
354
 
355
  run_marker_btn.click(
356
  fn=run_marker_img,
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
  torch==2.7.0
2
- marker-pdf[full]==1.7.5
3
  gradio[mcp]==5.28.0
4
 
5
  # transformers 4.52.4 depends on huggingface-hub>=0.30.0
 
1
  torch==2.7.0
2
+ marker-pdf[full]==1.8.0
3
  gradio[mcp]==5.28.0
4
 
5
  # transformers 4.52.4 depends on huggingface-hub>=0.30.0