Synced repo using 'sync_with_huggingface' Github Action
Browse files
original:
- remote: "https://github.com/xiaoyao9184/docker-marker"
- commit: "53922256b5d246d3f11148c1a3dac0048a6fef4f"
sync_with_huggingface:
- repository: ""
- ref: ""
- gradio_app.py +40 -3
- requirements.txt +1 -1
gradio_app.py
CHANGED
@@ -117,7 +117,7 @@ with gr.Blocks(title="Marker") as demo:
|
|
117 |
)
|
118 |
|
119 |
page_range_txt = gr.Textbox(label="Page range to parse, comma separated like 0,5-10,20", value=f"")
|
120 |
-
output_format_dd = gr.Dropdown(label="Output format", choices=["markdown", "json", "html"], value="markdown")
|
121 |
|
122 |
use_llm_ckb = gr.Checkbox(label="Use LLM", value=False, info="Use LLM for higher quality processing")
|
123 |
force_ocr_ckb = gr.Checkbox(label="Force OCR", value=True, info="Force OCR on all pages")
|
@@ -186,7 +186,7 @@ with gr.Blocks(title="Marker") as demo:
|
|
186 |
)
|
187 |
|
188 |
output_format_dd.change(
|
189 |
-
fn=lambda x: gr.update(interactive=x == "json", value=x == "json"),
|
190 |
inputs=[output_format_dd],
|
191 |
outputs=[show_blocks_ckb],
|
192 |
api_name=False
|
@@ -201,7 +201,7 @@ with gr.Blocks(title="Marker") as demo:
|
|
201 |
filename (str): Path to the input PDF file.
|
202 |
page_range (str): Page range to process (e.g., "0-5").
|
203 |
force_ocr (bool, optional): If True (default), force OCR even on text-based PDFs.
|
204 |
-
output_format (str, optional): Output format. One of: "markdown", "html", "json".
|
205 |
Defaults to "markdown".
|
206 |
show_blocks (bool, optional): If True, show blocks in preview image with JSON output.
|
207 |
Defaults to False.
|
@@ -314,6 +314,43 @@ with gr.Blocks(title="Marker") as demo:
|
|
314 |
gr_debug_lay,
|
315 |
gr_img
|
316 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
317 |
|
318 |
run_marker_btn.click(
|
319 |
fn=run_marker_img,
|
|
|
117 |
)
|
118 |
|
119 |
page_range_txt = gr.Textbox(label="Page range to parse, comma separated like 0,5-10,20", value=f"")
|
120 |
+
output_format_dd = gr.Dropdown(label="Output format", choices=["markdown", "json", "html", "chunks"], value="markdown")
|
121 |
|
122 |
use_llm_ckb = gr.Checkbox(label="Use LLM", value=False, info="Use LLM for higher quality processing")
|
123 |
force_ocr_ckb = gr.Checkbox(label="Force OCR", value=True, info="Force OCR on all pages")
|
|
|
186 |
)
|
187 |
|
188 |
output_format_dd.change(
|
189 |
+
fn=lambda x: gr.update(interactive=x == "json" or x == "chunks", value=x == "json" or x == "chunks",),
|
190 |
inputs=[output_format_dd],
|
191 |
outputs=[show_blocks_ckb],
|
192 |
api_name=False
|
|
|
201 |
filename (str): Path to the input PDF file.
|
202 |
page_range (str): Page range to process (e.g., "0-5").
|
203 |
force_ocr (bool, optional): If True (default), force OCR even on text-based PDFs.
|
204 |
+
output_format (str, optional): Output format. One of: "markdown", "html", "json", "chunks".
|
205 |
Defaults to "markdown".
|
206 |
show_blocks (bool, optional): If True, show blocks in preview image with JSON output.
|
207 |
Defaults to False.
|
|
|
314 |
gr_debug_lay,
|
315 |
gr_img
|
316 |
]
|
317 |
+
elif output_format == "chunks":
|
318 |
+
if show_blocks:
|
319 |
+
doc_json = json.loads(text)
|
320 |
+
color_map = {}
|
321 |
+
sections = []
|
322 |
+
def traverse(block):
|
323 |
+
if "block_type" in block:
|
324 |
+
try:
|
325 |
+
index = list(BlockTypes.__members__).index(block["block_type"])
|
326 |
+
color = COLORS[index % len(COLORS)]
|
327 |
+
except (ValueError, IndexError):
|
328 |
+
color = "#cccccc" # fallback color
|
329 |
+
|
330 |
+
label = block["id"].replace("/page/0/", "")
|
331 |
+
color_map[label] = color
|
332 |
+
|
333 |
+
bbox = tuple(int(x) for x in block["bbox"])
|
334 |
+
sections.append((bbox, label))
|
335 |
+
if "blocks" in block and isinstance(block["blocks"], list):
|
336 |
+
for child in block["blocks"]:
|
337 |
+
traverse(child)
|
338 |
+
traverse(doc_json)
|
339 |
+
|
340 |
+
page_range = config_parser.generate_config_dict()["page_range"]
|
341 |
+
first_page = page_range[0] if page_range else 0
|
342 |
+
img = get_page_image(filename, first_page + 1, dpi=72)
|
343 |
+
|
344 |
+
gr_img = gr.update(value=(img, sections), color_map=color_map)
|
345 |
+
|
346 |
+
return [
|
347 |
+
gr.update(visible=False),
|
348 |
+
gr.update(visible=True, value=text),
|
349 |
+
gr.update(visible=False),
|
350 |
+
gr_debug_pdf,
|
351 |
+
gr_debug_lay,
|
352 |
+
gr_img
|
353 |
+
]
|
354 |
|
355 |
run_marker_btn.click(
|
356 |
fn=run_marker_img,
|
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
torch==2.7.0
|
2 |
-
marker-pdf[full]==1.
|
3 |
gradio[mcp]==5.28.0
|
4 |
|
5 |
# transformers 4.52.4 depends on huggingface-hub>=0.30.0
|
|
|
1 |
torch==2.7.0
|
2 |
+
marker-pdf[full]==1.8.0
|
3 |
gradio[mcp]==5.28.0
|
4 |
|
5 |
# transformers 4.52.4 depends on huggingface-hub>=0.30.0
|