|
import string |
|
import random |
|
import fitz |
|
from PIL import Image as Img |
|
import os |
|
import shutil |
|
import base64 |
|
from openai import OpenAI |
|
|
|
import string |
|
import random |
|
import fitz |
|
from PIL import Image as Img |
|
import os |
|
import tqdm |
|
import shutil |
|
import base64 |
|
from openai import OpenAI |
|
import streamlit as st |
|
|
|
def process_pdf_with_ocr(pdf_path, api_key):
    """Render each page of a PDF to an image, OCR it with GPT-4o, and return the text.

    Progress is reported through Streamlit widgets created inside this function,
    so it must be called from within a running Streamlit app.

    Args:
        pdf_path: Path to the PDF file to process.
        api_key: OpenAI API key used for the vision/OCR requests.

    Returns:
        dict mapping 1-based page number -> extracted markdown text for that page.

    Raises:
        Whatever ``fitz.open`` / the OpenAI client raise on failure; temporary
        image files are always cleaned up, and the success banner is only shown
        when processing actually completed.
    """

    def generate_random_string(length=10):
        # Random folder suffix so concurrent runs don't collide on disk.
        characters = string.ascii_letters + string.digits
        return ''.join(random.choices(characters, k=length))

    def encode_image(image_path):
        # Base64-encode the image so it can be embedded in a data: URL.
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def get_ocr_text(image_path, client, current_page, total_pages):
        # Update the progress UI, then send the page image to GPT-4o for OCR.
        # Closes over progress_bar / status_text defined below (bound before
        # the first call, so the closure resolves correctly at call time).
        progress = (current_page / total_pages) * 100
        status_text.text(f"Processing page {current_page}/{total_pages} with OCR")
        progress_bar.progress(int(progress))

        prompt = """
        You are provided with an image that may contain handwritten text in a local Indian language or English, along with possible table structures. Your task is to extract all text using OCR, ensuring that:
        - Regular text is returned as plain text.
        - Any detected tables are reconstructed using proper markdown table formatting (using pipes "|" for columns and dashes "-" for row separators).
        Return only the extracted text in markdown format, with no additional commentary. If no text is detected, return an empty response.
        """

        base64_image = encode_image(image_path)
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
                ]
            }]
        )
        return response.choices[0].message.content

    # Streamlit progress widgets (referenced by get_ocr_text above).
    progress_bar = st.progress(0)
    status_text = st.empty()
    progress_info = st.empty()

    status_text.text("Initializing OpenAI client...")
    progress_bar.progress(5)
    # The client reads the key from the environment.
    os.environ["OPENAI_API_KEY"] = api_key
    client = OpenAI()

    # Per-run scratch directory for rendered page images.
    temp_folder = f"Images/{generate_random_string()}"
    os.makedirs(temp_folder, exist_ok=True)
    progress_bar.progress(10)

    result = {}
    try:
        status_text.text("Opening PDF document...")
        pdf_document = fitz.open(pdf_path)
        try:
            total_pages = len(pdf_document)
            progress_bar.progress(15)

            # Phase 1 (15% -> 40%): render every page to a PNG on disk.
            for page_num in range(total_pages):
                current_progress = 15 + (page_num / total_pages * 25)
                status_text.text(f"Converting page {page_num + 1}/{total_pages} to image")
                progress_info.text(f"PDF to Image conversion: {int(current_progress)}%")
                progress_bar.progress(int(current_progress))

                page = pdf_document[page_num]
                pix = page.get_pixmap(dpi=150)
                image_path = f"{temp_folder}/page_{page_num + 1}.png"
                image = Img.frombytes("RGB", [pix.width, pix.height], pix.samples)
                image.save(image_path)

            # Phase 2 (40% -> 95%): OCR each rendered page.
            status_text.text("Starting OCR processing...")
            progress_bar.progress(40)

            for page_num in range(total_pages):
                current_progress = 40 + (page_num / total_pages * 55)
                image_path = f"{temp_folder}/page_{page_num + 1}.png"
                progress_info.text(f"OCR Processing: {int(current_progress)}%")

                ocr_text = get_ocr_text(image_path, client, page_num + 1, total_pages)
                result[page_num + 1] = ocr_text
        finally:
            # Close the document even when rendering/OCR raised (was previously
            # only closed on the success path, leaking the handle on errors).
            pdf_document.close()

        status_text.text("Finalizing...")
        progress_bar.progress(95)

    finally:
        # Always remove the scratch images, success or failure.
        if os.path.exists(temp_folder):
            status_text.text("Cleaning up temporary files...")
            shutil.rmtree(temp_folder)

    # Success banner only on the success path (bug fix: it used to live in the
    # finally block, so a failed run still displayed "Processing complete!").
    progress_bar.progress(100)
    status_text.text("Processing complete!")
    progress_info.empty()

    return result
|
|
|
# NOTE: A module-level string literal containing an older, non-Streamlit copy of
# process_pdf_with_ocr previously lived here as commented-out dead code. It was an
# exact superseded duplicate of the live function above and has been removed;
# recover it from version control if ever needed.