# Hugging Face Spaces app (ZeroGPU): extract text from PDFs with the olmOCR model
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from pdf2image import convert_from_path
import base64
import io
import spaces
# Load the OCR model and processor from Hugging Face
try:
    processor = AutoProcessor.from_pretrained("allenai/olmOCR-7B-0225-preview")
    # bfloat16 halves the 7B model's memory footprint compared to float32
    model = AutoModelForVision2Seq.from_pretrained(
        "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16
    )
except ImportError as e:
    processor = None
    model = None
    print(f"Error loading model: {str(e)}. Please ensure PyTorch is installed.")
except ValueError as e:
    processor = None
    model = None
    print(f"Error with model configuration: {str(e)}")
@spaces.GPU  # request ZeroGPU hardware while a PDF is being processed
def process_pdf(pdf_file):
    """
    Process the uploaded PDF file one page at a time, yielding HTML for each page
    with its image and extracted text.
    """
    if processor is None or model is None:
        yield "<p>Error: Model could not be loaded. Check environment setup (PyTorch may be missing) or model compatibility.</p>"
        return

    # Check if a PDF file was uploaded
    if pdf_file is None:
        yield "<p>Please upload a PDF file.</p>"
        return

    # Convert PDF to images (gr.File may hand back a path string or a tempfile-like object)
    pdf_path = getattr(pdf_file, "name", pdf_file)
    try:
        pages = convert_from_path(pdf_path)
    except Exception as e:
        yield f"<p>Error converting PDF to images: {str(e)}</p>"
        return

    # Move the model onto the GPU when one is available (i.e. inside the ZeroGPU allocation)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Initial HTML with "Copy All" button and container for pages
    html = '<div><button onclick="copyAll()" style="margin-bottom: 10px;">Copy All</button></div><div id="pages">'
    yield html  # Start with the header
    # Process each page incrementally
    for i, page in enumerate(pages):
        # Convert the page image to base64 for embedding in HTML
        buffered = io.BytesIO()
        page.save(buffered, format="PNG")
        img_str = base64.b64encode(buffered.getvalue()).decode()
        img_data = f"data:image/png;base64,{img_str}"

        # Extract text from the page using the OCR model
        try:
            inputs = processor(text="Extract the text from this image.", images=page, return_tensors="pt").to(device)
            # Without max_new_tokens, generate() falls back to a very short default output length
            outputs = model.generate(**inputs, max_new_tokens=1024)
            text = processor.decode(outputs[0], skip_special_tokens=True)
        except Exception as e:
            text = f"Error extracting text: {str(e)}"

        # Generate HTML for this page's section
        textarea_id = f"text{i+1}"
        page_html = f'''
        <div class="page" style="margin-bottom: 20px; border-bottom: 1px solid #ccc; padding-bottom: 20px;">
            <h3>Page {i+1}</h3>
            <div style="display: flex; align-items: flex-start;">
                <img src="{img_data}" alt="Page {i+1}" style="max-width: 300px; margin-right: 20px;">
                <div style="flex-grow: 1;">
                    <textarea id="{textarea_id}" rows="10" style="width: 100%;">{text}</textarea>
                    <button onclick="copyText('{textarea_id}')" style="margin-top: 5px;">Copy</button>
                </div>
            </div>
        </div>
        '''

        # Append this page to the existing HTML and yield the updated content
        html += page_html
        yield html
    # After all pages are processed, close the div and add JavaScript
    html += '</div>'
    html += '''
    <script>
    function copyText(id) {
        var text = document.getElementById(id);
        text.select();
        document.execCommand("copy");
    }
    function copyAll() {
        var texts = document.querySelectorAll("#pages textarea");
        var allText = Array.from(texts).map(t => t.value).join("\\n\\n");
        navigator.clipboard.writeText(allText);
    }
    </script>
    '''
    yield html  # Final yield with complete content and scripts
# Define the Gradio interface
with gr.Blocks(title="PDF Text Extractor") as demo:
    gr.Markdown("# PDF Text Extractor")
    gr.Markdown("Upload a PDF file and click 'Extract Text' to see each page's image and extracted text incrementally.")
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        submit_btn = gr.Button("Extract Text")
    output_html = gr.HTML()
    submit_btn.click(fn=process_pdf, inputs=pdf_input, outputs=output_html)

# Launch the interface
demo.launch()
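
# A minimal sketch of the companion dependency files this Space would need.
# The file names follow the standard Hugging Face Spaces convention; the
# unpinned package list is an assumption derived from the imports above,
# not taken from the original repository.
#
# requirements.txt:
#     gradio
#     spaces
#     transformers
#     torch
#     pdf2image
#     pillow
#
# packages.txt (system packages; pdf2image needs the poppler utilities):
#     poppler-utils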