"""Gradio app that transcribes the text of an uploaded PDF with an OpenAI model.

Each PDF page is rendered to an image, encoded as a base64 PNG data URL and
sent to the chat-completions API with a transcription prompt. The per-page
results are concatenated into a single string shown in the UI.
"""

import asyncio
import base64
import io
import os
from datetime import datetime

import gradio as gr
from openai import OpenAI
from pdf2image import convert_from_path

# We'll use an environment variable for the API key in Spaces
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")


class PDFTextExtractor:
    """Transcribe PDF pages by sending page images to an OpenAI chat model."""

    def __init__(self, api_key):
        """Create the OpenAI client used for every page request.

        Args:
            api_key: OpenAI API key passed straight to ``OpenAI``.
        """
        self.client = OpenAI(api_key=api_key)

    async def extract_text_from_pdf(self, pdf_path):
        """Transcribe every page of the PDF at *pdf_path*.

        Args:
            pdf_path: Filesystem path of the PDF to process.

        Returns:
            A list with one ``{'page': int, 'text': str}`` dict per page
            (pages numbered from 1), or ``None`` if anything failed. Errors
            (including a missing file) are printed rather than raised.
        """
        # NOTE: the OpenAI client call below is synchronous (nothing is
        # awaited), so pages are processed strictly one after another.
        try:
            if not os.path.exists(pdf_path):
                raise FileNotFoundError(f"PDF file not found: {pdf_path}")

            print(f"Processing PDF: {pdf_path}")
            images = convert_from_path(pdf_path)

            extracted_texts = []
            for page_number, image in enumerate(images, start=1):
                print(f"Processing page {page_number}...")

                # Serialise the page image to PNG in memory and base64-encode
                # it so it can be embedded in a data URL.
                with io.BytesIO() as img_buffer:
                    image.save(img_buffer, format='PNG')
                    img_base64 = base64.b64encode(
                        img_buffer.getvalue()
                    ).decode('utf-8')

                response = self.client.chat.completions.create(
                    model="gpt-4o",
                    messages=[
                        {
                            "role": "system",
                            "content": (
                                "Extract ALL text from this image exactly as it appears, "
                                "preserving all formatting, numbers, and special characters. "
                                "Include everything you can see, from headers to footers, "
                                "timestamps to footnotes. "
                                "Also include the tickmarks present in the forms."
                            ),
                        },
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": (
                                        "Please extract and transcribe ALL text visible in this image, "
                                        "exactly as it appears. "
                                        "Include every piece of text you can see, "
                                        "maintaining the exact formatting, spacing, and line breaks."
                                    ),
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": f"data:image/png;base64,{img_base64}"
                                    },
                                },
                            ],
                        },
                    ],
                    max_tokens=4096,
                )

                extracted_texts.append({
                    'page': page_number,
                    # The message content can be None; normalise to "" so
                    # callers can always concatenate the page text.
                    'text': response.choices[0].message.content or "",
                })

            return extracted_texts

        except Exception as e:
            # Top-level boundary for the UI: report the problem and signal
            # failure with None instead of propagating.
            print(f"Error in text extraction: {str(e)}")
            return None


def extract_text(pdf_file):
    """Gradio handler: return the transcribed text of the uploaded PDF.

    Args:
        pdf_file: The value delivered by the ``gr.File`` input. Either a
            filesystem path (``str`` / ``os.PathLike``) or an object exposing
            the path as ``.name``; ``None`` when nothing was uploaded.

    Returns:
        The text of all pages, each preceded by a ``=== Page N ===`` header,
        or a human-readable error message.
    """
    # An empty key is as unusable as a missing one.
    if not OPENAI_API_KEY:
        return "Error: OpenAI API key not found. Please set the OPENAI_API_KEY environment variable."

    if pdf_file is None:
        return "Error: No PDF file provided."

    # Accept both a plain path and a file-like wrapper with a ``.name`` path.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    extractor = PDFTextExtractor(OPENAI_API_KEY)
    extracted_texts = asyncio.run(extractor.extract_text_from_pdf(pdf_path))

    # None (failure) and [] (no pages) are both reported as a failure.
    if not extracted_texts:
        return "Failed to extract text from PDF"

    return "".join(
        f"\n\n=== Page {page['page']} ===\n\n{page['text']}"
        for page in extracted_texts
    )


iface = gr.Interface(
    fn=extract_text,
    inputs=gr.File(label="Upload PDF"),
    outputs="text",
    title="PDF Text Extractor",
    description="Upload a PDF file to extract all text using OpenAI's GPT-4 Vision.",
)

iface.launch()