Spaces:

Tanish28
/

New_Space

Sleeping

App Files Files Community

New_Space / app.py

Tanish28

Update app.py

8083d4b verified 29 days ago

raw

history blame contribute delete

7.34 kB

	import os
	import io
	from pdf2image import convert_from_path
	from openai import OpenAI
	import base64
	import asyncio
	from datetime import datetime
	import gradio as gr

	# The key is read from the process environment (configured as a secret in
	# Spaces); the value is None when the variable is not set.
	OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

	class PDFTextExtractor:
	    """Transcribe a scanned PDF form page by page with an OpenAI vision model.

	    Each page is rendered to an image with ``convert_from_path``, encoded as
	    a base64 PNG and sent to the ``gpt-4o`` chat-completions endpoint together
	    with a fixed markdown template the model is asked to fill in.
	    """

	    # System prompt sent with every page. It is constant, so it is defined once
	    # here instead of being rebuilt inside the per-page loop.
	    _SYSTEM_PROMPT = """You are a doctor at a hospital. You can understand sloppy handwriting and convert it to readable text. Extract all the data from the form according to the markdown structure given below.
	Follow this exact markdown structure:
	# ER - DOCTORS INITIAL ASSESSMENT FORM
	## DR.KAMAKSHI MEMORIAL HOSPITAL, PALLIKARANAI, CHENNAI.

	### PATIENT PROFILE
	Please paste the sticker within the box

	* UHID: ______
	* Date: [DD/MM/YYYY]
	* Patient Name: ______
	* Age/Gender: ______
	* Doctor Name: ______

	### CASE INFORMATION
	* MLC: □ No □ Yes, AR No.: ________________
	* Information Provided By: □ Self □ Care Taker
	* Ambulation: □ Walking □ Wheelchair □ Stretcher
	* Triage Code: □ Red □ Orange □ Yellow □ Green □ Blue

	### ALLERGIES / INTOLERANCES
	* □ Nil Known / □ Yes (See below)
	* Drug & description of allergy / intolerance: ________________

	### CHIEF COMPLAINTS:
	________________
	________________
	________________

	### ASSESSMENT
	* Pain Score: ______ □ NRS □ NPRS-R

	### PAST MEDICAL HISTORY:
	* □ SHTN □ DM □ CAD □ CKD □ STROKE □ ASTHMA □ COPD □ SEIZURE □ HYPOTHYROIDISM
	* □ OTHERS: ________________

	### PAST SURGERIES:
	________________
	________________

	### CURRENT MEDICATIONS:
	* □ Regular □ Irregular □ Nil □ AYUSH: ________________

	### GENERAL EXAMINATION
	* Weight: ______ kgs
	* CBG: ______ mg/dL
	* □ Pallor □ Icterus □ Cyanosis □ Clubbing □ Lymphadenopathy □ Edema

	### VITALS
	* Temp: ______ °F
	* BP: ______/______ mmHg
	* HR: ______/min
	* RR: ______/min
	* SpO₂: ______%
	* NEWS: ______

	### SYSTEMIC EXAMINATION
	* RS: ________________
	* CVS: ________________
	* P/A: ________________
	* CNS: ________________
	* GCS: E____ V____ M____
	* Local O/E: ________________

	Form No: KMHPF190V1
	"""

	    # Text part of the user message that accompanies each page image.
	    _USER_PROMPT = "Extract and format the Patient Admission Form from this image according to the specified markdown format. Even if the handwriting is sloppy, try to extract the text accurately. Preserve all form fields and checkboxes (as □)."

	    def __init__(self, api_key):
	        """Create the OpenAI client used for every page request.

	        Args:
	            api_key: OpenAI API key passed straight to ``OpenAI(api_key=...)``.
	        """
	        self.client = OpenAI(api_key=api_key)

	    @staticmethod
	    def _encode_page(image):
	        """Return *image* serialised as PNG and base64-encoded to a str."""
	        # The buffer is only needed until the bytes are encoded; the context
	        # manager releases it deterministically.
	        with io.BytesIO() as png_buffer:
	            image.save(png_buffer, format='PNG')
	            return base64.b64encode(png_buffer.getvalue()).decode('utf-8')

	    def _transcribe_page(self, img_base64):
	        """Send one base64 PNG page to the model and return the reply text."""
	        response = self.client.chat.completions.create(
	            model="gpt-4o",
	            messages=[
	                {
	                    "role": "system",
	                    "content": self._SYSTEM_PROMPT,
	                },
	                {
	                    "role": "user",
	                    "content": [
	                        {
	                            "type": "text",
	                            "text": self._USER_PROMPT,
	                        },
	                        {
	                            "type": "image_url",
	                            "image_url": {
	                                "url": f"data:image/png;base64,{img_base64}"
	                            },
	                        },
	                    ],
	                },
	            ],
	            max_tokens=4096,
	        )
	        # The SDK types ``message.content`` as optional. Normalise a missing
	        # value to "" so callers can always treat the page text as a string.
	        return response.choices[0].message.content or ""

	    async def extract_text_from_pdf(self, pdf_path):
	        """Transcribe every page of the PDF at *pdf_path*.

	        The OpenAI client used here is the synchronous one and nothing is
	        awaited, so the coroutine runs to completion without yielding.

	        Args:
	            pdf_path: Filesystem path of the PDF to process.

	        Returns:
	            A list with one ``{'page': n, 'text': str}`` dict per page, pages
	            numbered from 1. Failures are not raised: any exception (including
	            a missing file) is reported as a single
	            ``{'page': 0, 'text': 'Error: ...'}`` entry.
	        """
	        try:
	            if not os.path.exists(pdf_path):
	                raise FileNotFoundError(f"PDF file not found: {pdf_path}")

	            print(f"Processing PDF: {pdf_path}")

	            images = convert_from_path(pdf_path)

	            extracted_texts = []
	            for page_number, image in enumerate(images, start=1):
	                print(f"Processing page {page_number}...")
	                extracted_texts.append({
	                    'page': page_number,
	                    'text': self._transcribe_page(self._encode_page(image)),
	                })

	            return extracted_texts

	        except Exception as e:
	            # Deliberate catch-all: the caller reports the failure in the UI by
	            # looking for the 'Error:' prefix rather than handling exceptions.
	            print(f"Error in text extraction: {str(e)}")
	            return [{'page': 0, 'text': f"Error: {str(e)}"}]

	def extract_text(pdf_file):
	    """Gradio callback: transcribe the uploaded PDF and return display text.

	    Args:
	        pdf_file: Value supplied by the ``gr.File`` input. Handled as either a
	            path string or an object exposing the path as ``.name``
	            (which form Gradio passes depends on its version/configuration
	            - verify against the installed release). ``None`` means nothing
	            was uploaded.

	    Returns:
	        The text of all pages, each preceded by a ``=== Page N ===`` header,
	        or a human-readable error message when the input, the API key or the
	        extraction itself is unusable.
	    """
	    # Submitting the form without a file would otherwise fail on ``.name``.
	    if pdf_file is None:
	        return "Error: No PDF file uploaded. Please upload a PDF file."

	    # An empty key is as unusable as a missing one.
	    if not OPENAI_API_KEY:
	        return "Error: OpenAI API key not found. Please set the OPENAI_API_KEY environment variable."

	    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

	    extractor = PDFTextExtractor(OPENAI_API_KEY)
	    extracted_texts = asyncio.run(extractor.extract_text_from_pdf(pdf_path))

	    if not extracted_texts:
	        return "Failed to extract text from PDF"

	    # The extractor reports failures as a single entry whose text starts with
	    # 'Error:'; pass that message straight through to the UI.
	    first_text = extracted_texts[0].get('text') or ''
	    if first_text.startswith('Error:'):
	        return first_text

	    # ``or ''`` guards against a page whose text is None.
	    return "".join(
	        f"\n\n=== Page {page['page']} ===\n\n{page['text'] or ''}"
	        for page in extracted_texts
	    )

	# Single-function web UI: one PDF upload in, the transcription out as text.
	iface = gr.Interface(
	    extract_text,
	    gr.File(label="Upload PDF"),
	    "text",
	    title="PDF Text Extractor",
	    description="Upload a PDF file to extract text using OpenAI's GPT 4o model.",
	)

	# Start serving the interface as soon as the script is executed.
	iface.launch()