ClearSpend

Sleeping

App Files Community

MonilM committed on Apr 26

Commit

6e4ec8a

1 Parent(s): 1704a67

Api#4

Browse files

Files changed (2) hide show

app.py +46 -27
nlp_service.py +13 -2

app.py CHANGED Viewed

@@ -132,58 +132,77 @@ def find_main_amount(ocr_results):
     # --- Strategy to find the best candidate ---
-    # 1. Look for numbers on the SAME line as PRIORITY keywords
     priority_candidates = []
-    for line in parsed_lines:
-        if line["has_priority_keyword"] and line["numbers"]:
-            priority_candidates.extend(line["numbers"])
     if priority_candidates:
-        # Often the largest number on these lines is the final total
-        return max(priority_candidates)
-    # 2. Look for numbers on the SAME line as SECONDARY keywords
     secondary_candidates = []
-    for line in parsed_lines:
-        if line["has_secondary_keyword"] and line["numbers"]:
-            secondary_candidates.extend(line["numbers"])
     if secondary_candidates:
-         # If we only found secondary keywords, return the largest number found on those lines
-         # This might catch 'Net Total' or 'Total' when 'Grand Total' isn't present
         return max(secondary_candidates)
-    # 3. Look near priority/secondary keywords (less reliable, might pick up tax/service charge)
-    # Consider removing or deprioritizing this 'near' logic if same-line logic is sufficient
-    # 4. Look for numbers on the SAME line as LOWER PRIORITY keywords (Subtotal)
     lower_priority_candidates = []
-    for line in parsed_lines:
-        if line["has_lower_priority_keyword"] and line["numbers"]:
-            lower_priority_candidates.extend(line["numbers"])
     # Don't return subtotal directly unless it's the only thing found later
     # 5. Fallback: Largest plausible number overall (excluding subtotals if other numbers exist)
-    print("Warning: No numbers found on priority/secondary keyword lines. Using fallback.")
     all_numbers = []
-    subtotal_numbers = set(lower_priority_candidates) # Keep track of subtotals
     for line in parsed_lines:
         all_numbers.extend(line["numbers"])
     if all_numbers:
         unique_numbers = list(set(all_numbers))
         # Filter out potential quantities/years/small irrelevant numbers
-        plausible_numbers = [n for n in unique_numbers if n >= 1.0 or '.' in str(n)]
-        # Filter out very large numbers unlikely to be totals unless they have decimals?
-        plausible_numbers = [n for n in plausible_numbers if n < 100000 or '.' in str(n)]
         # If we have plausible numbers other than subtotals, prefer them
         non_subtotal_plausible = [n for n in plausible_numbers if n not in subtotal_numbers]
         if non_subtotal_plausible:
             return max(non_subtotal_plausible)
         elif plausible_numbers: # Only subtotals (or nothing else plausible) were found
-             return max(plausible_numbers) # Return the largest subtotal as last resort
     # 6. If still nothing, return None
     print("Warning: Could not determine main amount.")

     # --- Strategy to find the best candidate ---
+    # 1. Look for numbers on the SAME line as PRIORITY keywords OR the line IMMEDIATELY AFTER
     priority_candidates = []
+    for i, line in enumerate(parsed_lines):
+        if line["has_priority_keyword"]:
+            if line["numbers"]:
+                priority_candidates.extend(line["numbers"])
+            # Check next line if current line has no numbers and next line exists
+            elif i + 1 < len(parsed_lines) and parsed_lines[i+1]["numbers"]:
+                 priority_candidates.extend(parsed_lines[i+1]["numbers"])
     if priority_candidates:
+        # Often the largest number on/near these lines is the final total
+        return max(priority_candidates)
+    # 2. Look for numbers on the SAME line as SECONDARY keywords OR the line IMMEDIATELY AFTER
     secondary_candidates = []
+    for i, line in enumerate(parsed_lines):
+         if line["has_secondary_keyword"]:
+            if line["numbers"]:
+                secondary_candidates.extend(line["numbers"])
+            # Check next line if current line has no numbers and next line exists
+            elif i + 1 < len(parsed_lines) and parsed_lines[i+1]["numbers"]:
+                 secondary_candidates.extend(parsed_lines[i+1]["numbers"])
     if secondary_candidates:
+         # If we only found secondary keywords, return the largest number found on/near those lines
         return max(secondary_candidates)
+    # 3. Look near priority/secondary keywords (REMOVED - less reliable, covered by step 1 & 2)
+    # 4. Look for numbers on the SAME line as LOWER PRIORITY keywords (Subtotal) OR the line IMMEDIATELY AFTER
     lower_priority_candidates = []
+    for i, line in enumerate(parsed_lines):
+        if line["has_lower_priority_keyword"]:
+            if line["numbers"]:
+                lower_priority_candidates.extend(line["numbers"])
+            # Check next line if current line has no numbers and next line exists
+            elif i + 1 < len(parsed_lines) and parsed_lines[i+1]["numbers"]:
+                 lower_priority_candidates.extend(parsed_lines[i+1]["numbers"])
     # Don't return subtotal directly unless it's the only thing found later
     # 5. Fallback: Largest plausible number overall (excluding subtotals if other numbers exist)
+    print("Warning: No numbers found on/near priority/secondary keyword lines. Using fallback.")
     all_numbers = []
+    # Use set comprehension for efficiency
+    subtotal_numbers = {num for line in parsed_lines if line["has_lower_priority_keyword"] for num in line["numbers"]}
+    # Also add numbers from the line after lower priority keywords to subtotals
+    for i, line in enumerate(parsed_lines):
+        if line["has_lower_priority_keyword"] and not line["numbers"] and i + 1 < len(parsed_lines):
+             subtotal_numbers.update(parsed_lines[i+1]["numbers"])
     for line in parsed_lines:
         all_numbers.extend(line["numbers"])
     if all_numbers:
         unique_numbers = list(set(all_numbers))
         # Filter out potential quantities/years/small irrelevant numbers
+        plausible_numbers = [n for n in unique_numbers if n >= 0.01] # Keep small decimals too
+        # Stricter filter for large numbers: exclude large integers (likely IDs, phone numbers)
+        # Keep numbers < 50000 OR numbers that have a non-zero decimal part
+        plausible_numbers = [n for n in plausible_numbers if n < 50000 or (n != int(n))]
         # If we have plausible numbers other than subtotals, prefer them
         non_subtotal_plausible = [n for n in plausible_numbers if n not in subtotal_numbers]
         if non_subtotal_plausible:
             return max(non_subtotal_plausible)
         elif plausible_numbers: # Only subtotals (or nothing else plausible) were found
+             return max(plausible_numbers) # Return the largest subtotal/plausible as last resort
     # 6. If still nothing, return None
     print("Warning: Could not determine main amount.")

nlp_service.py CHANGED Viewed

@@ -599,9 +599,20 @@ def analyze_expense_text(text):
                          next_expense_id += 1
                          logging.info(f"Added expense (from Gemini): {new_expense}")
                          # Update message for consistency
-                         response_data["message"] = f"✅ Expense added (via Gemini): {new_expense['currency']}{new_expense['amount']:.2f} for {new_expense['category']} on {new_expense['date'].strftime('%b %d, %Y')}."
                          # Make details serializable for JSON response
-                         response_data["details"]["date"] = response_data["details"]["date"].isoformat()
                      else:
                          logging.warning("Gemini add_expense result missing required fields.")
                          response_data = {"action": "unknown", "status": "failed", "message": "Gemini suggested adding an expense, but details were incomplete."}

                          next_expense_id += 1
                          logging.info(f"Added expense (from Gemini): {new_expense}")
                          # Update message for consistency
+                         # --- FIX: Check if date is valid before formatting ---
+                         if isinstance(new_expense.get('date'), datetime.date):
+                             date_str = new_expense['date'].strftime('%b %d, %Y')
+                             response_data["message"] = f"✅ Expense added (via Gemini): {new_expense['currency']}{new_expense['amount']:.2f} for {new_expense['category']} on {date_str}."
+                         else:
+                             logging.warning(f"Gemini add_expense result had invalid date type: {type(new_expense.get('date'))}. Using default message.")
+                             response_data["message"] = f"✅ Expense added (via Gemini): {new_expense['currency']}{new_expense['amount']:.2f} for {new_expense['category']} (date missing/invalid)."
                          # Make details serializable for JSON response
+                         # Ensure date is serializable even if it was invalid earlier
+                         if isinstance(response_data["details"].get("date"), datetime.date):
+                             response_data["details"]["date"] = response_data["details"]["date"].isoformat()
+                         else:
+                             # Handle case where date might be None or wrong type after processing
+                             response_data["details"]["date"] = None # Or some indicator of invalidity
                      else:
                          logging.warning("Gemini add_expense result missing required fields.")
                          response_data = {"action": "unknown", "status": "failed", "message": "Gemini suggested adding an expense, but details were incomplete."}