Tanish28 committed on
Commit
95dfd20
·
verified ·
1 Parent(s): 463fe4b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +130 -187
app.py CHANGED
@@ -4,7 +4,6 @@ from pdf2image import convert_from_path
4
  from anthropic import Anthropic
5
  import base64
6
  import asyncio
7
- import traceback
8
  from datetime import datetime
9
  import gradio as gr
10
 
@@ -22,183 +21,137 @@ class PDFTextExtractor:
22
 
23
  print(f"Processing PDF: {pdf_path}")
24
 
25
- # Add more parameters to control image quality and size
26
- try:
27
- images = convert_from_path(
28
- pdf_path,
29
- dpi=150, # Lower DPI for smaller images
30
- fmt="png"
31
- )
32
- print(f"Successfully converted PDF to {len(images)} images")
33
- except Exception as e:
34
- print(f"Error converting PDF to images: {str(e)}")
35
- print(traceback.format_exc())
36
- return None
37
 
38
  extracted_texts = []
39
  for i, image in enumerate(images):
40
  print(f"Processing page {i+1}...")
41
 
42
- try:
43
- img_buffer = io.BytesIO()
44
- image.save(img_buffer, format='PNG')
45
- img_base64 = base64.b64encode(img_buffer.getvalue()).decode('utf-8')
46
- img_size = len(img_base64)
47
- print(f"Image {i+1} converted to base64. Size: {img_size} bytes")
48
-
49
- # Check if image size exceeds limits
50
- if img_size > 10000000: # 10MB example limit
51
- print(f"Warning: Image size ({img_size} bytes) might be too large")
52
-
53
- print(f"Sending request to Claude API for page {i+1}...")
54
- response = self.client.messages.create(
55
- model="claude-3-7-sonnet-20250219",
56
- system="""You are a doctor at a hospital. You can understand sloppy handwriting and convert it to readable text. Extract all the data from the form according to the markdown structure given below.
57
- Follow this exact markdown structure:
58
- # PATIENT ADMISSION FORM
59
- ## DR.KAMAKSHI MEMORIAL HOSPITAL, PALLIKARANAI, CHENNAI.
60
-
61
- ### PATIENT PROFILE
62
- *Please paste the sticker within the box*
63
-
64
- * UHID: ______
65
- * Patient Name: ______
66
- * Age/Gender: ______
67
- * Doctor Name: ______
68
-
69
- ### BASIC INFORMATION
70
- * Date & Time of Admission: [DD/MM/YYYY]
71
- * Date of Birth: [DD/MM/YYYY]
72
-
73
- ### IDENTIFICATION
74
- **ID Proof Already Registered**: □ Yes □ No
75
-
76
- **Type of ID**:
77
- * □ Aadhar
78
- * Passport
79
- * Voter ID
80
- * Driving License
81
- * Others
82
-
83
- ID No.: ________________
84
- Contact No.: ________________
85
-
86
- ### MEDICAL DETAILS
87
- * Provisional Diagnosis:
88
- * Reason for Admission:
89
- * Plan of Care:
90
- * Expected Outcome*:
91
-
92
- ### CONSULTANT DETAILS
93
- * Primary Consultant Name:
94
- * Speciality:
95
-
96
- ### PATIENT REFERENCE INFORMATION
97
- *(To be filled by Front Office)*
98
-
99
- **Reference Via**:
100
- * Doctor
101
- * □ Hospital
102
- * Ambulance
103
- * □ DRKMH Employee
104
- * □ Self / Walk In
105
-
106
- **Referrer Details**:
107
- * Name: ________________
108
- * Contact No.: ________________
109
-
110
- ### TYPE OF ADMISSION
111
- * □ Emergency
112
- * □ Elective
113
- * MLC
114
- * □ Surgery
115
- * Medical
116
- * Others: ________________
117
-
118
- ### TREATMENT TYPE
119
- * In Patient
120
- * Day Care
121
- * Transfer To: ________________
122
-
123
- ### CONTACT DETAILS
124
- **Person to Contact (Next of Kin)**:
125
- * Name: ________________
126
- * Relationship with Patient: ________________
127
- * Address: ________________
128
- * Pincode: ________________
129
- * Mobile: ________________
130
- * Email: ________________
131
-
132
- ### OFFICIAL USE
133
- * Front Office Executive Name:
134
- * Front Office Executive Signature:
135
- * Advance Amount Paid:
136
- * ICD-10 Code (For Medical Records Section):
137
-
138
- *\\* Subject to change during the course of diseases*
139
-
140
- ---
141
- *Form No: KMHIPF002V3*
142
- """,
143
- max_tokens=4096,
144
- messages=[
145
- {
146
- "role": "user",
147
- "content": [
148
- {
149
- "type": "text",
150
- "text": "Extract and format the Patient Admission Form from this image according to the specified markdown format. Even if the handwriting is sloppy, try to extract the text accurately. Preserve all form fields and checkboxes (as □)."
151
- },
152
- {
153
- "type": "image",
154
- "source": {
155
- "type": "base64",
156
- "media_type": "image/png",
157
- "data": img_base64
158
- }
159
  }
160
- ]
161
- }
162
- ]
163
- )
164
-
165
- print(f"Received response for page {i+1}")
166
- # Print response structure for debugging
167
- print(f"Response type: {type(response)}")
168
- print(f"Response attributes: {dir(response)}")
169
-
170
- # Safely extract content
171
- try:
172
- if hasattr(response, 'content') and response.content:
173
- content = response.content[0].text
174
- print(f"Successfully extracted text content from API response")
175
- else:
176
- print(f"No content found in response: {response}")
177
- content = "Error: No content found in API response"
178
- except Exception as e:
179
- print(f"Error extracting content from response: {str(e)}")
180
- print(f"Raw response: {response}")
181
- content = f"Error extracting content: {str(e)}"
182
 
183
- extracted_texts.append({
184
- 'page': i + 1,
185
- 'text': content
186
- })
187
-
188
- except Exception as e:
189
- print(f"Error processing page {i+1}: {str(e)}")
190
- print(traceback.format_exc())
191
- extracted_texts.append({
192
- 'page': i + 1,
193
- 'text': f"Error processing this page: {str(e)}"
194
- })
195
 
196
  return extracted_texts
197
 
198
  except Exception as e:
199
  print(f"Error in text extraction: {str(e)}")
200
- print(traceback.format_exc())
201
- return [{'page': 1, 'text': f"Error in text extraction: {str(e)}"}]
202
 
203
  def extract_text(pdf_file):
204
  if ANTHROPIC_API_KEY is None:
@@ -206,28 +159,18 @@ def extract_text(pdf_file):
206
 
207
  extractor = PDFTextExtractor(ANTHROPIC_API_KEY)
208
 
209
- try:
210
- pdf_path = pdf_file.name
211
- print(f"Starting extraction from {pdf_path}")
212
- extracted_texts = asyncio.run(extractor.extract_text_from_pdf(pdf_path))
 
 
 
 
213
 
214
- if extracted_texts:
215
- output = ""
216
- for page in extracted_texts:
217
- output += f"\n\n=== Page {page['page']} ===\n\n"
218
- output += page['text']
219
-
220
- return output
221
- else:
222
- return "Failed to extract text from PDF (no texts returned)"
223
- except Exception as e:
224
- error_trace = traceback.format_exc()
225
- print(f"Error in extract_text function: {str(e)}")
226
- print(error_trace)
227
- return f"Failed to extract text from PDF: {str(e)}\n\nStacktrace:\n{error_trace}"
228
-
229
- # Additional info
230
- print(f"Anthropic API key present: {'Yes' if ANTHROPIC_API_KEY else 'No'}")
231
 
232
  iface = gr.Interface(
233
  fn=extract_text,
 
4
  from anthropic import Anthropic
5
  import base64
6
  import asyncio
 
7
  from datetime import datetime
8
  import gradio as gr
9
 
 
21
 
22
  print(f"Processing PDF: {pdf_path}")
23
 
24
+ images = convert_from_path(pdf_path)
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  extracted_texts = []
27
  for i, image in enumerate(images):
28
  print(f"Processing page {i+1}...")
29
 
30
+ img_buffer = io.BytesIO()
31
+ image.save(img_buffer, format='PNG')
32
+ img_base64 = base64.b64encode(img_buffer.getvalue()).decode('utf-8')
33
+
34
+ response = self.client.messages.create(
35
+ model="claude-3-7-sonnet-20250219",
36
+ system="""You are a doctor at a hospital. You can understand sloppy handwriting and convert it to readable text. Extract all the data from the form according to the markdown structure given below.
37
+ Follow this exact markdown structure:
38
+ # PATIENT ADMISSION FORM
39
+ ## DR.KAMAKSHI MEMORIAL HOSPITAL, PALLIKARANAI, CHENNAI.
40
+
41
+ ### PATIENT PROFILE
42
+ *Please paste the sticker within the box*
43
+
44
+ * UHID: ______
45
+ * Patient Name: ______
46
+ * Age/Gender: ______
47
+ * Doctor Name: ______
48
+
49
+ ### BASIC INFORMATION
50
+ * Date & Time of Admission: [DD/MM/YYYY]
51
+ * Date of Birth: [DD/MM/YYYY]
52
+
53
+ ### IDENTIFICATION
54
+ **ID Proof Already Registered**: □ Yes □ No
55
+
56
+ **Type of ID**:
57
+ * Aadhar
58
+ * Passport
59
+ * Voter ID
60
+ * □ Driving License
61
+ * □ Others
62
+
63
+ ID No.: ________________
64
+ Contact No.: ________________
65
+
66
+ ### MEDICAL DETAILS
67
+ * Provisional Diagnosis:
68
+ * Reason for Admission:
69
+ * Plan of Care:
70
+ * Expected Outcome*:
71
+
72
+ ### CONSULTANT DETAILS
73
+ * Primary Consultant Name:
74
+ * Speciality:
75
+
76
+ ### PATIENT REFERENCE INFORMATION
77
+ *(To be filled by Front Office)*
78
+
79
+ **Reference Via**:
80
+ * Doctor
81
+ * Hospital
82
+ * □ Ambulance
83
+ * □ DRKMH Employee
84
+ * Self / Walk In
85
+
86
+ **Referrer Details**:
87
+ * Name: ________________
88
+ * Contact No.: ________________
89
+
90
+ ### TYPE OF ADMISSION
91
+ * □ Emergency
92
+ * □ Elective
93
+ * □ MLC
94
+ * □ Surgery
95
+ * Medical
96
+ * Others: ________________
97
+
98
+ ### TREATMENT TYPE
99
+ * □ In Patient
100
+ * □ Day Care
101
+ * Transfer To: ________________
102
+
103
+ ### CONTACT DETAILS
104
+ **Person to Contact (Next of Kin)**:
105
+ * Name: ________________
106
+ * Relationship with Patient: ________________
107
+ * Address: ________________
108
+ * Pincode: ________________
109
+ * Mobile: ________________
110
+ * Email: ________________
111
+
112
+ ### OFFICIAL USE
113
+ * Front Office Executive Name:
114
+ * Front Office Executive Signature:
115
+ * Advance Amount Paid:
116
+ * ICD-10 Code (For Medical Records Section):
117
+
118
+ *\\* Subject to change during the course of diseases*
119
+
120
+ ---
121
+ *Form No: KMHIPF002V3*
122
+ """,
123
+ max_tokens=4096,
124
+ messages=[
125
+ {
126
+ "role": "user",
127
+ "content": [
128
+ {
129
+ "type": "text",
130
+ "text": "Extract and format the Patient Admission Form from this image according to the specified markdown format. Even if the handwriting is sloppy, try to extract the text accurately. Preserve all form fields and checkboxes (as □)."
131
+ },
132
+ {
133
+ "type": "image",
134
+ "source": {
135
+ "type": "base64",
136
+ "media_type": "image/png",
137
+ "data": img_base64
 
 
 
 
 
 
 
 
 
138
  }
139
+ }
140
+ ]
141
+ }
142
+ ]
143
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
+ extracted_texts.append({
146
+ 'page': i + 1,
147
+ 'text': response.content[0].text
148
+ })
 
 
 
 
 
 
 
 
149
 
150
  return extracted_texts
151
 
152
  except Exception as e:
153
  print(f"Error in text extraction: {str(e)}")
154
+ return None
 
155
 
156
  def extract_text(pdf_file):
157
  if ANTHROPIC_API_KEY is None:
 
159
 
160
  extractor = PDFTextExtractor(ANTHROPIC_API_KEY)
161
 
162
+ pdf_path = pdf_file.name
163
+ extracted_texts = asyncio.run(extractor.extract_text_from_pdf(pdf_path))
164
+
165
+ if extracted_texts:
166
+ output = ""
167
+ for page in extracted_texts:
168
+ output += f"\n\n=== Page {page['page']} ===\n\n"
169
+ output += page['text']
170
 
171
+ return output
172
+ else:
173
+ return "Failed to extract text from PDF"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
 
175
  iface = gr.Interface(
176
  fn=extract_text,