|
import string |
|
import random |
|
import fitz |
|
from PIL import Image as Img |
|
import os |
|
import shutil |
|
import base64 |
|
from openai import OpenAI |
|
|
|
import string |
|
import random |
|
import fitz |
|
from PIL import Image as Img |
|
import os |
|
import tqdm |
|
import shutil |
|
import base64 |
|
from openai import OpenAI |
|
import streamlit as st |
|
|
|
def process_pdf_with_ocr(pdf_path, api_key):
    """Render each page of a PDF to an image, OCR it with GPT-4o, and return the text.

    Progress is reported through Streamlit widgets created inside this function,
    so it must be called from within a running Streamlit app.

    Args:
        pdf_path: Path to the PDF file to process.
        api_key: OpenAI API key used for the vision/OCR requests.

    Returns:
        dict mapping 1-based page number -> extracted markdown text for that page.

    Raises:
        Whatever ``fitz.open`` / the OpenAI client raise on failure; temporary
        image files are always cleaned up, and the success banner is only shown
        when processing actually completed.
    """

    def generate_random_string(length=10):
        # Random folder suffix so concurrent runs don't collide on disk.
        characters = string.ascii_letters + string.digits
        return ''.join(random.choices(characters, k=length))

    def encode_image(image_path):
        # Base64-encode the image so it can be embedded in a data: URL.
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def get_ocr_text(image_path, client, current_page, total_pages):
        # Update the progress UI, then send the page image to GPT-4o for OCR.
        # Closes over progress_bar / status_text defined below (bound before
        # the first call, so the closure resolves correctly at call time).
        progress = (current_page / total_pages) * 100
        status_text.text(f"Processing page {current_page}/{total_pages} with OCR")
        progress_bar.progress(int(progress))

        prompt = """
        You are provided with an image that may contain handwritten text in a local Indian language or English, along with possible table structures. Your task is to extract all text using OCR, ensuring that:
        - Regular text is returned as plain text.
        - Any detected tables are reconstructed using proper markdown table formatting (using pipes "|" for columns and dashes "-" for row separators).
        Return only the extracted text in markdown format, with no additional commentary. If no text is detected, return an empty response.
        """

        base64_image = encode_image(image_path)
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
                ]
            }]
        )
        return response.choices[0].message.content

    # Streamlit progress widgets (referenced by get_ocr_text above).
    progress_bar = st.progress(0)
    status_text = st.empty()
    progress_info = st.empty()

    status_text.text("Initializing OpenAI client...")
    progress_bar.progress(5)
    # The client reads the key from the environment.
    os.environ["OPENAI_API_KEY"] = api_key
    client = OpenAI()

    # Per-run scratch directory for rendered page images.
    temp_folder = f"Images/{generate_random_string()}"
    os.makedirs(temp_folder, exist_ok=True)
    progress_bar.progress(10)

    result = {}
    try:
        status_text.text("Opening PDF document...")
        pdf_document = fitz.open(pdf_path)
        try:
            total_pages = len(pdf_document)
            progress_bar.progress(15)

            # Phase 1 (15% -> 40%): render every page to a PNG on disk.
            for page_num in range(total_pages):
                current_progress = 15 + (page_num / total_pages * 25)
                status_text.text(f"Converting page {page_num + 1}/{total_pages} to image")
                progress_info.text(f"PDF to Image conversion: {int(current_progress)}%")
                progress_bar.progress(int(current_progress))

                page = pdf_document[page_num]
                pix = page.get_pixmap(dpi=150)
                image_path = f"{temp_folder}/page_{page_num + 1}.png"
                image = Img.frombytes("RGB", [pix.width, pix.height], pix.samples)
                image.save(image_path)

            # Phase 2 (40% -> 95%): OCR each rendered page.
            status_text.text("Starting OCR processing...")
            progress_bar.progress(40)

            for page_num in range(total_pages):
                current_progress = 40 + (page_num / total_pages * 55)
                image_path = f"{temp_folder}/page_{page_num + 1}.png"
                progress_info.text(f"OCR Processing: {int(current_progress)}%")

                ocr_text = get_ocr_text(image_path, client, page_num + 1, total_pages)
                result[page_num + 1] = ocr_text
        finally:
            # Close the document even when rendering/OCR raised (was previously
            # only closed on the success path, leaking the handle on errors).
            pdf_document.close()

        status_text.text("Finalizing...")
        progress_bar.progress(95)

    finally:
        # Always remove the scratch images, success or failure.
        if os.path.exists(temp_folder):
            status_text.text("Cleaning up temporary files...")
            shutil.rmtree(temp_folder)

    # Success banner only on the success path (bug fix: it used to live in the
    # finally block, so a failed run still displayed "Processing complete!").
    progress_bar.progress(100)
    status_text.text("Processing complete!")
    progress_info.empty()

    return result
|
|
|
# NOTE: A module-level string literal containing an older, non-Streamlit copy of
# process_pdf_with_ocr previously lived here as commented-out dead code. It was an
# exact superseded duplicate of the live function above and has been removed;
# recover it from version control if ever needed.