Spaces:

Tanish28
/

New_Space

Sleeping

App Files Files Community

Tanish28 committed on Feb 25

Commit

463fe4b

verified ·

1 Parent(s): 11d03c2

Update app.py

Browse files

Files changed (1) hide show

app.py +187 -131

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ from pdf2image import convert_from_path
 from anthropic import Anthropic
 import base64
 import asyncio
 from datetime import datetime
 import gradio as gr
@@ -21,138 +22,183 @@ class PDFTextExtractor:
             print(f"Processing PDF: {pdf_path}")
-            images = convert_from_path(pdf_path)
             extracted_texts = []
             for i, image in enumerate(images):
                 print(f"Processing page {i+1}...")
-                img_buffer = io.BytesIO()
-                image.save(img_buffer, format='PNG')
-                img_base64 = base64.b64encode(img_buffer.getvalue()).decode('utf-8')
-                response = self.client.messages.create(
-                    model="claude-3-7-sonnet-20250219",
-                    system="""You are a doctor at a hospital. You can understand sloppy handwriting and convert it to readable text.
-                            Extract all the text from the form according to the markdown structure given below. The number in the form can be different from what they look. For example: 9 might look like 7. 2 might look like 1. Keep this in mind.
-                            Follow this exact markdown structure:
-                            # PATIENT ADMISSION FORM
-                            ## DR.KAMAKSHI MEMORIAL HOSPITAL, PALLIKARANAI, CHENNAI.
-                            ### PATIENT PROFILE
-                            *Please paste the sticker within the box*
-                            * UHID: ______
-                            * Patient Name: ______
-                            * Age/Gender: ______
-                            * Doctor Name: ______
-                            ### BASIC INFORMATION
-                            * Date & Time of Admission: [DD/MM/YYYY]
-                            * Date of Birth: [DD/MM/YYYY]
-                            ### IDENTIFICATION
-                            **ID Proof Already Registered**: □ Yes □ No
-                            **Type of ID**:
-                            * □ Aadhar
-                            * □ Passport
-                            * □ Voter ID
-                            * □ Driving License
-                            * □ Others
-                            ID No.: ________________
-                            Contact No.: ________________
-                            ### MEDICAL DETAILS
-                            * Provisional Diagnosis:
-                            * Reason for Admission:
-                            * Plan of Care:
-                            * Expected Outcome*:
-                            ### CONSULTANT DETAILS
-                            * Primary Consultant Name:
-                            * Speciality:
-                            ### PATIENT REFERENCE INFORMATION
-                            *(To be filled by Front Office)*
-                            **Reference Via**:
-                            * □ Doctor
-                            * □ Hospital
-                            * □ Ambulance
-                            * □ DRKMH Employee
-                            * □ Self / Walk In
-                            **Referrer Details**:
-                            * Name: ________________
-                            * Contact No.: ________________
-                            ### TYPE OF ADMISSION
-                            * □ Emergency
-                            * □ Elective
-                            * □ MLC
-                            * □ Surgery
-                            * □ Medical
-                            * □ Others: ________________
-                            ### TREATMENT TYPE
-                            * □ In Patient
-                            * □ Day Care
-                            * Transfer To: ________________
-                            ### CONTACT DETAILS
-                            **Person to Contact (Next of Kin)**:
-                            * Name: ________________
-                            * Relationship with Patient: ________________
-                            * Address: ________________
-                            * Pincode: ________________
-                            * Mobile: ________________
-                            * Email: ________________
-                            ### OFFICIAL USE
-                            * Front Office Executive Name:
-                            * Front Office Executive Signature:
-                            * Advance Amount Paid:
-                            * ICD-10 Code (For Medical Records Section):
-                            *\\* Subject to change during the course of diseases*
-                            ---
-                            *Form No: KMHIPF002V3*
-                            """,
-                    max_tokens=4096,
-                    messages=[
-                        {
-                            "role": "user",
-                            "content": [
-                                {
-                                    "type": "text",
-                                    "text": "Extract and format the Patient Admission Form from this image according to the specified markdown format. Even if the handwriting is sloppy, try to extract the text accurately. Preserve all form fields and checkboxes (as □)."
-                                },
-                                {
-                                    "type": "image",
-                                    "source": {
-                                        "type": "base64",
-                                        "media_type": "image/png",
-                                        "data": img_base64
                                     }
-                                }
-                            ]
-                        }
-                    ]
-                )
-                extracted_texts.append({
-                    'page': i + 1,
-                    'text': response.content[0].text
-                })
             return extracted_texts
         except Exception as e:
             print(f"Error in text extraction: {str(e)}")
-            return None
 def extract_text(pdf_file):
     if ANTHROPIC_API_KEY is None:
@@ -160,18 +206,28 @@ def extract_text(pdf_file):
     extractor = PDFTextExtractor(ANTHROPIC_API_KEY)
-    pdf_path = pdf_file.name
-    extracted_texts = asyncio.run(extractor.extract_text_from_pdf(pdf_path))
-    if extracted_texts:
-        output = ""
-        for page in extracted_texts:
-            output += f"\n\n=== Page {page['page']} ===\n\n"
-            output += page['text']
-        return output
-    else:
-        return "Failed to extract text from PDF"
 iface = gr.Interface(
     fn=extract_text,

 from anthropic import Anthropic
 import base64
 import asyncio
+import traceback
 from datetime import datetime
 import gradio as gr
             print(f"Processing PDF: {pdf_path}")
+            # Add more parameters to control image quality and size
+            try:
+                images = convert_from_path(
+                    pdf_path,
+                    dpi=150,  # Lower DPI for smaller images
+                    fmt="png"
+                )
+                print(f"Successfully converted PDF to {len(images)} images")
+            except Exception as e:
+                print(f"Error converting PDF to images: {str(e)}")
+                print(traceback.format_exc())
+                return None
             extracted_texts = []
             for i, image in enumerate(images):
                 print(f"Processing page {i+1}...")
+                try:
+                    img_buffer = io.BytesIO()
+                    image.save(img_buffer, format='PNG')
+                    img_base64 = base64.b64encode(img_buffer.getvalue()).decode('utf-8')
+                    img_size = len(img_base64)
+                    print(f"Image {i+1} converted to base64. Size: {img_size} bytes")
+                    # Check if image size exceeds limits
+                    if img_size > 10000000:  # 10MB example limit
+                        print(f"Warning: Image size ({img_size} bytes) might be too large")
+                    print(f"Sending request to Claude API for page {i+1}...")
+                    response = self.client.messages.create(
+                        model="claude-3-7-sonnet-20250219",
+                        system="""You are a doctor at a hospital. You can understand sloppy handwriting and convert it to readable text. Extract all the data from the form according to the markdown structure given below.
+                                Follow this exact markdown structure:
+                                # PATIENT ADMISSION FORM
+                                ## DR.KAMAKSHI MEMORIAL HOSPITAL, PALLIKARANAI, CHENNAI.
+                                ### PATIENT PROFILE
+                                *Please paste the sticker within the box*
+                                * UHID: ______
+                                * Patient Name: ______
+                                * Age/Gender: ______
+                                * Doctor Name: ______
+                                ### BASIC INFORMATION
+                                * Date & Time of Admission: [DD/MM/YYYY]
+                                * Date of Birth: [DD/MM/YYYY]
+                                ### IDENTIFICATION
+                                **ID Proof Already Registered**: □ Yes □ No
+                                **Type of ID**:
+                                * □ Aadhar
+                                * □ Passport
+                                * □ Voter ID
+                                * □ Driving License
+                                * □ Others
+                                ID No.: ________________
+                                Contact No.: ________________
+                                ### MEDICAL DETAILS
+                                * Provisional Diagnosis:
+                                * Reason for Admission:
+                                * Plan of Care:
+                                * Expected Outcome*:
+                                ### CONSULTANT DETAILS
+                                * Primary Consultant Name:
+                                * Speciality:
+                                ### PATIENT REFERENCE INFORMATION
+                                *(To be filled by Front Office)*
+                                **Reference Via**:
+                                * □ Doctor
+                                * □ Hospital
+                                * □ Ambulance
+                                * □ DRKMH Employee
+                                * □ Self / Walk In
+                                **Referrer Details**:
+                                * Name: ________________
+                                * Contact No.: ________________
+                                ### TYPE OF ADMISSION
+                                * □ Emergency
+                                * □ Elective
+                                * □ MLC
+                                * □ Surgery
+                                * □ Medical
+                                * □ Others: ________________
+                                ### TREATMENT TYPE
+                                * □ In Patient
+                                * □ Day Care
+                                * Transfer To: ________________
+                                ### CONTACT DETAILS
+                                **Person to Contact (Next of Kin)**:
+                                * Name: ________________
+                                * Relationship with Patient: ________________
+                                * Address: ________________
+                                * Pincode: ________________
+                                * Mobile: ________________
+                                * Email: ________________
+                                ### OFFICIAL USE
+                                * Front Office Executive Name:
+                                * Front Office Executive Signature:
+                                * Advance Amount Paid:
+                                * ICD-10 Code (For Medical Records Section):
+                                *\\* Subject to change during the course of diseases*
+                                ---
+                                *Form No: KMHIPF002V3*
+                                """,
+                        max_tokens=4096,
+                        messages=[
+                            {
+                                "role": "user",
+                                "content": [
+                                    {
+                                        "type": "text",
+                                        "text": "Extract and format the Patient Admission Form from this image according to the specified markdown format. Even if the handwriting is sloppy, try to extract the text accurately. Preserve all form fields and checkboxes (as □)."
+                                    },
+                                    {
+                                        "type": "image",
+                                        "source": {
+                                            "type": "base64",
+                                            "media_type": "image/png",
+                                            "data": img_base64
+                                        }
                                     }
+                                ]
+                            }
+                        ]
+                    )
+                    print(f"Received response for page {i+1}")
+                    # Print response structure for debugging
+                    print(f"Response type: {type(response)}")
+                    print(f"Response attributes: {dir(response)}")
+                    # Safely extract content
+                    try:
+                        if hasattr(response, 'content') and response.content:
+                            content = response.content[0].text
+                            print(f"Successfully extracted text content from API response")
+                        else:
+                            print(f"No content found in response: {response}")
+                            content = "Error: No content found in API response"
+                    except Exception as e:
+                        print(f"Error extracting content from response: {str(e)}")
+                        print(f"Raw response: {response}")
+                        content = f"Error extracting content: {str(e)}"
+                    extracted_texts.append({
+                        'page': i + 1,
+                        'text': content
+                    })
+                except Exception as e:
+                    print(f"Error processing page {i+1}: {str(e)}")
+                    print(traceback.format_exc())
+                    extracted_texts.append({
+                        'page': i + 1,
+                        'text': f"Error processing this page: {str(e)}"
+                    })
             return extracted_texts
         except Exception as e:
             print(f"Error in text extraction: {str(e)}")
+            print(traceback.format_exc())
+            return [{'page': 1, 'text': f"Error in text extraction: {str(e)}"}]
 def extract_text(pdf_file):
     if ANTHROPIC_API_KEY is None:
     extractor = PDFTextExtractor(ANTHROPIC_API_KEY)
+    try:
+        pdf_path = pdf_file.name
+        print(f"Starting extraction from {pdf_path}")
+        extracted_texts = asyncio.run(extractor.extract_text_from_pdf(pdf_path))
+        if extracted_texts:
+            output = ""
+            for page in extracted_texts:
+                output += f"\n\n=== Page {page['page']} ===\n\n"
+                output += page['text']
+            return output
+        else:
+            return "Failed to extract text from PDF (no texts returned)"
+    except Exception as e:
+        error_trace = traceback.format_exc()
+        print(f"Error in extract_text function: {str(e)}")
+        print(error_trace)
+        return f"Failed to extract text from PDF: {str(e)}\n\nStacktrace:\n{error_trace}"
+# Report at import time whether an Anthropic API key is configured (the key itself is not printed)
+print(f"Anthropic API key present: {'Yes' if ANTHROPIC_API_KEY else 'No'}")
 iface = gr.Interface(
     fn=extract_text,