# Hugging Face Space: Tanish28/Text_Extract (status: Sleeping)
# File size: 3,636 Bytes

import os
import io
from pdf2image import convert_from_path
from openai import OpenAI
import base64
import asyncio
from datetime import datetime
import gradio as gr

# On Spaces the API key is supplied through the environment; this is None
# when the variable is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

class PDFTextExtractor:
    """Transcribe a PDF by sending an image of each page to an OpenAI chat model.

    Each page is rendered with ``pdf2image.convert_from_path``, encoded as a
    base64 PNG data URL, and submitted to the chat completions endpoint
    together with fixed transcription prompts.
    """

    # Request settings shared by every page; kept as class attributes so they
    # can be overridden on a subclass or instance without editing the method.
    MODEL = "gpt-4o"
    MAX_TOKENS = 4096
    SYSTEM_PROMPT = "Extract ALL text from this image exactly as it appears, preserving all formatting, numbers, and special characters. Include everything you can see, from headers to footers, timestamps to footnotes. Also include the tickmarks present in the forms."
    USER_PROMPT = "Please extract and transcribe ALL text visible in this image, exactly as it appears. Include every piece of text you can see, maintaining the exact formatting, spacing, and line breaks."

    def __init__(self, api_key):
        """Create the OpenAI client used for all requests.

        Args:
            api_key: OpenAI API key passed straight to ``OpenAI(api_key=...)``.
        """
        self.client = OpenAI(api_key=api_key)

    @staticmethod
    def _image_to_data_url(image):
        """Return *image* serialized as a ``data:image/png;base64,...`` URL."""
        # The context manager releases the in-memory buffer as soon as the
        # bytes have been encoded.
        with io.BytesIO() as buffer:
            image.save(buffer, format='PNG')
            encoded = base64.b64encode(buffer.getvalue()).decode('utf-8')
        return f"data:image/png;base64,{encoded}"

    def _transcribe_page(self, image):
        """Send one page image to the model and return the transcribed text."""
        response = self.client.chat.completions.create(
            model=self.MODEL,
            messages=[
                {
                    "role": "system",
                    "content": self.SYSTEM_PROMPT,
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": self.USER_PROMPT,
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": self._image_to_data_url(image),
                            },
                        },
                    ],
                },
            ],
            max_tokens=self.MAX_TOKENS,
        )
        # The message content is optional in the API response; normalize a
        # missing value to "" so callers can always concatenate page text.
        return response.choices[0].message.content or ""

    async def extract_text_from_pdf(self, pdf_path):
        """Extract the text of every page of the PDF at *pdf_path*.

        NOTE: the PDF conversion and the OpenAI request are synchronous calls,
        so this coroutine does not yield to the event loop while it works.

        Args:
            pdf_path: Filesystem path of the PDF to process.

        Returns:
            A list of ``{'page': <1-based page number>, 'text': <str>}`` dicts,
            one per page in page order, or ``None`` if anything fails
            (including a missing file). Failures are printed, not raised.
        """
        try:
            if not os.path.exists(pdf_path):
                raise FileNotFoundError(f"PDF file not found: {pdf_path}")

            print(f"Processing PDF: {pdf_path}")

            images = convert_from_path(pdf_path)

            extracted_texts = []
            for page_number, image in enumerate(images, start=1):
                print(f"Processing page {page_number}...")
                extracted_texts.append({
                    'page': page_number,
                    'text': self._transcribe_page(image),
                })

            return extracted_texts

        except Exception as e:
            # Deliberate best-effort boundary: report the problem and signal
            # failure with None, which the caller turns into a user message.
            print(f"Error in text extraction: {str(e)}")
            return None

def extract_text(pdf_file):
    """Gradio handler: transcribe an uploaded PDF and return the text.

    Args:
        pdf_file: The value supplied by the ``gr.File`` input. May be ``None``
            when nothing was uploaded, a filesystem path, or an object whose
            ``name`` attribute holds the path of the uploaded file.

    Returns:
        The text of all pages, each preceded by a ``=== Page N ===`` header,
        or a human-readable error message if the API key is missing, no file
        was provided, or extraction failed.
    """
    if OPENAI_API_KEY is None:
        return "Error: OpenAI API key not found. Please set the OPENAI_API_KEY environment variable."

    # Submitting the form without a file passes None; report that instead of
    # failing on the attribute access below.
    if pdf_file is None:
        return "Error: No PDF file provided. Please upload a PDF file."

    # Accept either a plain path or a file-like wrapper exposing `.name`.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    extractor = PDFTextExtractor(OPENAI_API_KEY)
    extracted_texts = asyncio.run(extractor.extract_text_from_pdf(pdf_path))

    # None (extraction error) and [] (no pages) are both reported as failure.
    if not extracted_texts:
        return "Failed to extract text from PDF"

    # A page's text may be None if the model returned no content; treat it as
    # empty so joining never raises TypeError.
    return "".join(
        f"\n\n=== Page {page['page']} ===\n\n{page['text'] or ''}"
        for page in extracted_texts
    )

# Web UI: a single PDF upload wired to extract_text, with the result shown
# as plain text.
pdf_upload = gr.File(label="Upload PDF")

iface = gr.Interface(
    extract_text,
    pdf_upload,
    "text",
    title="PDF Text Extractor",
    description="Upload a PDF file to extract all text using OpenAI's GPT-4 Vision.",
)

# Start serving the interface as soon as the module is executed.
iface.launch()