MonilM committed on
Commit
6e4ec8a
·
1 Parent(s): 1704a67
Files changed (2) hide show
  1. app.py +46 -27
  2. nlp_service.py +13 -2
app.py CHANGED
@@ -132,58 +132,77 @@ def find_main_amount(ocr_results):
132
 
133
  # --- Strategy to find the best candidate ---
134
 
135
- # 1. Look for numbers on the SAME line as PRIORITY keywords
136
  priority_candidates = []
137
- for line in parsed_lines:
138
- if line["has_priority_keyword"] and line["numbers"]:
139
- priority_candidates.extend(line["numbers"])
 
 
 
 
 
140
  if priority_candidates:
141
- # Often the largest number on these lines is the final total
142
- return max(priority_candidates)
143
 
144
- # 2. Look for numbers on the SAME line as SECONDARY keywords
145
  secondary_candidates = []
146
- for line in parsed_lines:
147
- if line["has_secondary_keyword"] and line["numbers"]:
148
- secondary_candidates.extend(line["numbers"])
 
 
 
 
 
149
  if secondary_candidates:
150
- # If we only found secondary keywords, return the largest number found on those lines
151
- # This might catch 'Net Total' or 'Total' when 'Grand Total' isn't present
152
  return max(secondary_candidates)
153
 
154
- # 3. Look near priority/secondary keywords (less reliable, might pick up tax/service charge)
155
- # Consider removing or deprioritizing this 'near' logic if same-line logic is sufficient
156
-
157
- # 4. Look for numbers on the SAME line as LOWER PRIORITY keywords (Subtotal)
158
  lower_priority_candidates = []
159
- for line in parsed_lines:
160
- if line["has_lower_priority_keyword"] and line["numbers"]:
161
- lower_priority_candidates.extend(line["numbers"])
 
 
 
 
162
  # Don't return subtotal directly unless it's the only thing found later
163
 
164
  # 5. Fallback: Largest plausible number overall (excluding subtotals if other numbers exist)
165
- print("Warning: No numbers found on priority/secondary keyword lines. Using fallback.")
166
  all_numbers = []
167
- subtotal_numbers = set(lower_priority_candidates) # Keep track of subtotals
 
 
 
 
 
 
168
 
169
  for line in parsed_lines:
170
  all_numbers.extend(line["numbers"])
171
 
172
  if all_numbers:
173
  unique_numbers = list(set(all_numbers))
174
-
175
  # Filter out potential quantities/years/small irrelevant numbers
176
- plausible_numbers = [n for n in unique_numbers if n >= 1.0 or '.' in str(n)]
177
- # Filter out very large numbers unlikely to be totals unless they have decimals?
178
- plausible_numbers = [n for n in plausible_numbers if n < 100000 or '.' in str(n)]
 
179
 
180
  # If we have plausible numbers other than subtotals, prefer them
181
  non_subtotal_plausible = [n for n in plausible_numbers if n not in subtotal_numbers]
182
-
183
  if non_subtotal_plausible:
184
  return max(non_subtotal_plausible)
185
  elif plausible_numbers: # Only subtotals (or nothing else plausible) were found
186
- return max(plausible_numbers) # Return the largest subtotal as last resort
187
 
188
  # 6. If still nothing, return None
189
  print("Warning: Could not determine main amount.")
 
132
 
133
  # --- Strategy to find the best candidate ---
134
 
135
+ # 1. Look for numbers on the SAME line as PRIORITY keywords OR the line IMMEDIATELY AFTER
136
  priority_candidates = []
137
+ for i, line in enumerate(parsed_lines):
138
+ if line["has_priority_keyword"]:
139
+ if line["numbers"]:
140
+ priority_candidates.extend(line["numbers"])
141
+ # Check next line if current line has no numbers and next line exists
142
+ elif i + 1 < len(parsed_lines) and parsed_lines[i+1]["numbers"]:
143
+ priority_candidates.extend(parsed_lines[i+1]["numbers"])
144
+
145
  if priority_candidates:
146
+ # Often the largest number on/near these lines is the final total
147
+ return max(priority_candidates)
148
 
149
+ # 2. Look for numbers on the SAME line as SECONDARY keywords OR the line IMMEDIATELY AFTER
150
  secondary_candidates = []
151
+ for i, line in enumerate(parsed_lines):
152
+ if line["has_secondary_keyword"]:
153
+ if line["numbers"]:
154
+ secondary_candidates.extend(line["numbers"])
155
+ # Check next line if current line has no numbers and next line exists
156
+ elif i + 1 < len(parsed_lines) and parsed_lines[i+1]["numbers"]:
157
+ secondary_candidates.extend(parsed_lines[i+1]["numbers"])
158
+
159
  if secondary_candidates:
160
+ # If we only found secondary keywords, return the largest number found on/near those lines
 
161
  return max(secondary_candidates)
162
 
163
+ # 3. Look near priority/secondary keywords (REMOVED - less reliable, covered by step 1 & 2)
164
+
165
+ # 4. Look for numbers on the SAME line as LOWER PRIORITY keywords (Subtotal) OR the line IMMEDIATELY AFTER
 
166
  lower_priority_candidates = []
167
+ for i, line in enumerate(parsed_lines):
168
+ if line["has_lower_priority_keyword"]:
169
+ if line["numbers"]:
170
+ lower_priority_candidates.extend(line["numbers"])
171
+ # Check next line if current line has no numbers and next line exists
172
+ elif i + 1 < len(parsed_lines) and parsed_lines[i+1]["numbers"]:
173
+ lower_priority_candidates.extend(parsed_lines[i+1]["numbers"])
174
  # Don't return subtotal directly unless it's the only thing found later
175
 
176
  # 5. Fallback: Largest plausible number overall (excluding subtotals if other numbers exist)
177
+ print("Warning: No numbers found on/near priority/secondary keyword lines. Using fallback.")
178
  all_numbers = []
179
+ # Use set comprehension for efficiency
180
+ subtotal_numbers = {num for line in parsed_lines if line["has_lower_priority_keyword"] for num in line["numbers"]}
181
+ # Also add numbers from the line after lower priority keywords to subtotals
182
+ for i, line in enumerate(parsed_lines):
183
+ if line["has_lower_priority_keyword"] and not line["numbers"] and i + 1 < len(parsed_lines):
184
+ subtotal_numbers.update(parsed_lines[i+1]["numbers"])
185
+
186
 
187
  for line in parsed_lines:
188
  all_numbers.extend(line["numbers"])
189
 
190
  if all_numbers:
191
  unique_numbers = list(set(all_numbers))
192
+
193
  # Filter out potential quantities/years/small irrelevant numbers
194
+ plausible_numbers = [n for n in unique_numbers if n >= 0.01] # Keep small decimals too
195
+ # Stricter filter for large numbers: exclude large integers (likely IDs, phone numbers)
196
+ # Keep numbers < 50000 OR numbers that have a non-zero decimal part
197
+ plausible_numbers = [n for n in plausible_numbers if n < 50000 or (n != int(n))]
198
 
199
  # If we have plausible numbers other than subtotals, prefer them
200
  non_subtotal_plausible = [n for n in plausible_numbers if n not in subtotal_numbers]
201
+
202
  if non_subtotal_plausible:
203
  return max(non_subtotal_plausible)
204
  elif plausible_numbers: # Only subtotals (or nothing else plausible) were found
205
+ return max(plausible_numbers) # Return the largest subtotal/plausible as last resort
206
 
207
  # 6. If still nothing, return None
208
  print("Warning: Could not determine main amount.")
nlp_service.py CHANGED
@@ -599,9 +599,20 @@ def analyze_expense_text(text):
599
  next_expense_id += 1
600
  logging.info(f"Added expense (from Gemini): {new_expense}")
601
  # Update message for consistency
602
- response_data["message"] = f"✅ Expense added (via Gemini): {new_expense['currency']}{new_expense['amount']:.2f} for {new_expense['category']} on {new_expense['date'].strftime('%b %d, %Y')}."
 
 
 
 
 
 
603
  # Make details serializable for JSON response
604
- response_data["details"]["date"] = response_data["details"]["date"].isoformat()
 
 
 
 
 
605
  else:
606
  logging.warning("Gemini add_expense result missing required fields.")
607
  response_data = {"action": "unknown", "status": "failed", "message": "Gemini suggested adding an expense, but details were incomplete."}
 
599
  next_expense_id += 1
600
  logging.info(f"Added expense (from Gemini): {new_expense}")
601
  # Update message for consistency
602
+ # --- FIX: Check if date is valid before formatting ---
603
+ if isinstance(new_expense.get('date'), datetime.date):
604
+ date_str = new_expense['date'].strftime('%b %d, %Y')
605
+ response_data["message"] = f"✅ Expense added (via Gemini): {new_expense['currency']}{new_expense['amount']:.2f} for {new_expense['category']} on {date_str}."
606
+ else:
607
+ logging.warning(f"Gemini add_expense result had invalid date type: {type(new_expense.get('date'))}. Using default message.")
608
+ response_data["message"] = f"✅ Expense added (via Gemini): {new_expense['currency']}{new_expense['amount']:.2f} for {new_expense['category']} (date missing/invalid)."
609
  # Make details serializable for JSON response
610
+ # Ensure date is serializable even if it was invalid earlier
611
+ if isinstance(response_data["details"].get("date"), datetime.date):
612
+ response_data["details"]["date"] = response_data["details"]["date"].isoformat()
613
+ else:
614
+ # Handle case where date might be None or wrong type after processing
615
+ response_data["details"]["date"] = None # Or some indicator of invalidity
616
  else:
617
  logging.warning("Gemini add_expense result missing required fields.")
618
  response_data = {"action": "unknown", "status": "failed", "message": "Gemini suggested adding an expense, but details were incomplete."}