OmkarGhugarkar's picture
Upload 3 files
3905e66 verified
import string
import random
import fitz
from PIL import Image as Img
import os
import shutil
import base64
from openai import OpenAI
import string
import random
import fitz
from PIL import Image as Img
import os
import tqdm
import shutil
import base64
from openai import OpenAI
import streamlit as st
# Instruction sent to the vision model with every page image.
_OCR_PROMPT = """
You are provided with an image that may contain handwritten text in a local Indian language or English, along with possible table structures. Your task is to extract all text using OCR, ensuring that:
- Regular text is returned as plain text.
- Any detected tables are reconstructed using proper markdown table formatting (using pipes "|" for columns and dashes "-" for row separators).
Return only the extracted text in markdown format, with no additional commentary. If no text is detected, return an empty response.
"""


def process_pdf_with_ocr(pdf_path, api_key):
    """Run GPT-4o OCR over every page of a PDF, showing progress in Streamlit.

    Each page is rendered to a PNG at 150 DPI inside a throwaway folder under
    ``Images/``, sent to the ``gpt-4o`` chat-completions endpoint together
    with the OCR prompt, and the model's reply is collected per page.

    Progress bar layout: 0-15% setup, 15-40% PDF-to-image conversion,
    40-95% OCR, 100% once everything succeeded.

    Args:
        pdf_path: Path of the PDF, passed to ``fitz.open``.
        api_key: OpenAI API key used for the OCR requests.

    Returns:
        dict mapping the 1-based page number to the text returned by the
        model for that page (``""`` when the model returns no content).

    Raises:
        Any exception raised while opening/rendering the PDF or calling the
        OpenAI API is propagated. The PDF handle is closed and the temporary
        image folder is removed before it propagates.
    """

    def generate_random_string(length=10):
        """Return a random alphanumeric string of ``length`` characters."""
        characters = string.ascii_letters + string.digits
        return ''.join(random.choices(characters, k=length))

    def encode_image(image_path):
        """Return the file at ``image_path`` as a base64-encoded str."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def get_ocr_text(image_path, client):
        """Send one page image to the model and return the extracted text."""
        base64_image = encode_image(image_path)
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": _OCR_PROMPT},
                    # Pages are written as PNG below, so declare PNG here
                    # (the data URL previously claimed image/jpeg).
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{base64_image}"
                        },
                    },
                ],
            }],
        )
        # The API may return None as content; normalise to an empty string.
        return response.choices[0].message.content or ""

    # Initialize progress tracking
    progress_bar = st.progress(0)
    status_text = st.empty()
    progress_info = st.empty()

    # Initialize OpenAI client
    status_text.text("Initializing OpenAI client...")
    progress_bar.progress(5)
    # NOTE(review): writing the key into os.environ is process-wide and is
    # shared by every Streamlit session. Kept for backward compatibility with
    # any code that reads it; the client below no longer depends on it.
    os.environ["OPENAI_API_KEY"] = api_key
    client = OpenAI(api_key=api_key)

    # Create temp folder for images
    temp_folder = f"Images/{generate_random_string()}"
    os.makedirs(temp_folder, exist_ok=True)
    progress_bar.progress(10)

    result = {}
    try:
        # Open PDF and get total pages
        status_text.text("Opening PDF document...")
        pdf_document = fitz.open(pdf_path)
        try:
            total_pages = len(pdf_document)
            progress_bar.progress(15)

            # Convert PDF to images (15-40% of the bar)
            for page_num in range(total_pages):
                current_progress = 15 + (page_num / total_pages * 25)
                status_text.text(
                    f"Converting page {page_num + 1}/{total_pages} to image"
                )
                progress_info.text(
                    f"PDF to Image conversion: {int(current_progress)}%"
                )
                progress_bar.progress(int(current_progress))

                page = pdf_document[page_num]
                pix = page.get_pixmap(dpi=150)
                image_path = f"{temp_folder}/page_{page_num + 1}.png"
                image = Img.frombytes(
                    "RGB", (pix.width, pix.height), pix.samples
                )
                image.save(image_path)
        finally:
            # Close the document even if rendering fails; the OCR stage only
            # needs the PNG files already written to disk.
            pdf_document.close()

        # Process OCR for each image (40-95% of the bar)
        status_text.text("Starting OCR processing...")
        progress_bar.progress(40)
        for page_num in range(total_pages):
            current_page = page_num + 1
            current_progress = 40 + (page_num / total_pages * 55)
            image_path = f"{temp_folder}/page_{current_page}.png"
            status_text.text(
                f"Processing page {current_page}/{total_pages} with OCR"
            )
            progress_info.text(f"OCR Processing: {int(current_progress)}%")
            # Drive the bar from the same 40-95% value shown in the text so
            # it never moves backwards.
            progress_bar.progress(int(current_progress))
            result[current_page] = get_ocr_text(image_path, client)

        status_text.text("Finalizing...")
        progress_bar.progress(95)
    finally:
        # Clean up on success and on failure alike.
        if os.path.exists(temp_folder):
            status_text.text("Cleaning up temporary files...")
            shutil.rmtree(temp_folder)
        progress_info.empty()

    # Only reached when no exception occurred, so "complete" is truthful.
    progress_bar.progress(100)
    status_text.text("Processing complete!")
    return result
# Disabled legacy code: the bare string literal below holds an earlier version
# of process_pdf_with_ocr. Because it is only a string expression, it is never
# executed. Compared with the active function above, this version has no
# Streamlit progress widgets, runs OCR on each page right after rendering it
# (single loop), and prints the image path and the model output for every page.
'''
def process_pdf_with_ocr(pdf_path, api_key):
def generate_random_string(length=10):
characters = string.ascii_letters + string.digits
return ''.join(random.choices(characters, k=length))
def encode_image(image_path):
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
def get_ocr_text(image_path, client):
prompt = """
You are provided with an image that may contain handwritten text in a local Indian language or English, along with possible table structures. Your task is to extract all text using OCR, ensuring that:
- Regular text is returned as plain text.
- Any detected tables are reconstructed using proper markdown table formatting (using pipes "|" for columns and dashes "-" for row separators).
Return only the extracted text in markdown format, with no additional commentary. If no text is detected, return an empty response.
"""
base64_image = encode_image(image_path)
response = client.chat.completions.create(
model="gpt-4o",
messages=[{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
]
}]
)
print(image_path)
print(response.choices[0].message.content)
return response.choices[0].message.content
# Initialize OpenAI client
os.environ["OPENAI_API_KEY"] = api_key
client = OpenAI()
# Create temp folder for images
temp_folder = f"Images/{generate_random_string()}"
os.makedirs(temp_folder, exist_ok=True)
# Process PDF
result = {}
try:
# Convert PDF to images
pdf_document = fitz.open(pdf_path)
for page_num in range(len(pdf_document)):
page = pdf_document[page_num]
pix = page.get_pixmap(dpi=150)
image_path = f"{temp_folder}/page_{page_num + 1}.png"
image = Img.frombytes("RGB", [pix.width, pix.height], pix.samples)
image.save(image_path)
# Process each image with OCR
ocr_text = get_ocr_text(image_path, client)
result[page_num + 1] = ocr_text
pdf_document.close()
finally:
# Clean up temporary files
if os.path.exists(temp_folder):
shutil.rmtree(temp_folder)
return result
'''