Spaces:
Running
Running
Api#4
Browse files
- app.py +46 -27
- nlp_service.py +13 -2
app.py
CHANGED
@@ -132,58 +132,77 @@ def find_main_amount(ocr_results):
|
|
132 |
|
133 |
# --- Strategy to find the best candidate ---
|
134 |
|
135 |
-
# 1. Look for numbers on the SAME line as PRIORITY keywords
|
136 |
priority_candidates = []
|
137 |
-
for line in parsed_lines:
|
138 |
-
if line["has_priority_keyword"]
|
139 |
-
|
|
|
|
|
|
|
|
|
|
|
140 |
if priority_candidates:
|
141 |
-
# Often the largest number on these lines is the final total
|
142 |
-
return max(priority_candidates)
|
143 |
|
144 |
-
# 2. Look for numbers on the SAME line as SECONDARY keywords
|
145 |
secondary_candidates = []
|
146 |
-
for line in parsed_lines:
|
147 |
-
|
148 |
-
|
|
|
|
|
|
|
|
|
|
|
149 |
if secondary_candidates:
|
150 |
-
# If we only found secondary keywords, return the largest number found on those lines
|
151 |
-
# This might catch 'Net Total' or 'Total' when 'Grand Total' isn't present
|
152 |
return max(secondary_candidates)
|
153 |
|
154 |
-
# 3. Look near priority/secondary keywords (less reliable,
|
155 |
-
|
156 |
-
|
157 |
-
# 4. Look for numbers on the SAME line as LOWER PRIORITY keywords (Subtotal)
|
158 |
lower_priority_candidates = []
|
159 |
-
for line in parsed_lines:
|
160 |
-
if line["has_lower_priority_keyword"]
|
161 |
-
|
|
|
|
|
|
|
|
|
162 |
# Don't return subtotal directly unless it's the only thing found later
|
163 |
|
164 |
# 5. Fallback: Largest plausible number overall (excluding subtotals if other numbers exist)
|
165 |
-
print("Warning: No numbers found on priority/secondary keyword lines. Using fallback.")
|
166 |
all_numbers = []
|
167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
168 |
|
169 |
for line in parsed_lines:
|
170 |
all_numbers.extend(line["numbers"])
|
171 |
|
172 |
if all_numbers:
|
173 |
unique_numbers = list(set(all_numbers))
|
174 |
-
|
175 |
# Filter out potential quantities/years/small irrelevant numbers
|
176 |
-
plausible_numbers = [n for n in unique_numbers if n >=
|
177 |
-
#
|
178 |
-
|
|
|
179 |
|
180 |
# If we have plausible numbers other than subtotals, prefer them
|
181 |
non_subtotal_plausible = [n for n in plausible_numbers if n not in subtotal_numbers]
|
182 |
-
|
183 |
if non_subtotal_plausible:
|
184 |
return max(non_subtotal_plausible)
|
185 |
elif plausible_numbers: # Only subtotals (or nothing else plausible) were found
|
186 |
-
return max(plausible_numbers) # Return the largest subtotal as last resort
|
187 |
|
188 |
# 6. If still nothing, return None
|
189 |
print("Warning: Could not determine main amount.")
|
|
|
132 |
|
133 |
# --- Strategy to find the best candidate ---
|
134 |
|
135 |
+
# 1. Look for numbers on the SAME line as PRIORITY keywords OR the line IMMEDIATELY AFTER
|
136 |
priority_candidates = []
|
137 |
+
for i, line in enumerate(parsed_lines):
|
138 |
+
if line["has_priority_keyword"]:
|
139 |
+
if line["numbers"]:
|
140 |
+
priority_candidates.extend(line["numbers"])
|
141 |
+
# Check next line if current line has no numbers and next line exists
|
142 |
+
elif i + 1 < len(parsed_lines) and parsed_lines[i+1]["numbers"]:
|
143 |
+
priority_candidates.extend(parsed_lines[i+1]["numbers"])
|
144 |
+
|
145 |
if priority_candidates:
|
146 |
+
# Often the largest number on/near these lines is the final total
|
147 |
+
return max(priority_candidates)
|
148 |
|
149 |
+
# 2. Look for numbers on the SAME line as SECONDARY keywords OR the line IMMEDIATELY AFTER
|
150 |
secondary_candidates = []
|
151 |
+
for i, line in enumerate(parsed_lines):
|
152 |
+
if line["has_secondary_keyword"]:
|
153 |
+
if line["numbers"]:
|
154 |
+
secondary_candidates.extend(line["numbers"])
|
155 |
+
# Check next line if current line has no numbers and next line exists
|
156 |
+
elif i + 1 < len(parsed_lines) and parsed_lines[i+1]["numbers"]:
|
157 |
+
secondary_candidates.extend(parsed_lines[i+1]["numbers"])
|
158 |
+
|
159 |
if secondary_candidates:
|
160 |
+
# If we only found secondary keywords, return the largest number found on/near those lines
|
|
|
161 |
return max(secondary_candidates)
|
162 |
|
163 |
+
# 3. Look near priority/secondary keywords (REMOVED - less reliable, covered by step 1 & 2)
|
164 |
+
|
165 |
+
# 4. Look for numbers on the SAME line as LOWER PRIORITY keywords (Subtotal) OR the line IMMEDIATELY AFTER
|
|
|
166 |
lower_priority_candidates = []
|
167 |
+
for i, line in enumerate(parsed_lines):
|
168 |
+
if line["has_lower_priority_keyword"]:
|
169 |
+
if line["numbers"]:
|
170 |
+
lower_priority_candidates.extend(line["numbers"])
|
171 |
+
# Check next line if current line has no numbers and next line exists
|
172 |
+
elif i + 1 < len(parsed_lines) and parsed_lines[i+1]["numbers"]:
|
173 |
+
lower_priority_candidates.extend(parsed_lines[i+1]["numbers"])
|
174 |
# Don't return subtotal directly unless it's the only thing found later
|
175 |
|
176 |
# 5. Fallback: Largest plausible number overall (excluding subtotals if other numbers exist)
|
177 |
+
print("Warning: No numbers found on/near priority/secondary keyword lines. Using fallback.")
|
178 |
all_numbers = []
|
179 |
+
# Use set comprehension for efficiency
|
180 |
+
subtotal_numbers = {num for line in parsed_lines if line["has_lower_priority_keyword"] for num in line["numbers"]}
|
181 |
+
# Also add numbers from the line after lower priority keywords to subtotals
|
182 |
+
for i, line in enumerate(parsed_lines):
|
183 |
+
if line["has_lower_priority_keyword"] and not line["numbers"] and i + 1 < len(parsed_lines):
|
184 |
+
subtotal_numbers.update(parsed_lines[i+1]["numbers"])
|
185 |
+
|
186 |
|
187 |
for line in parsed_lines:
|
188 |
all_numbers.extend(line["numbers"])
|
189 |
|
190 |
if all_numbers:
|
191 |
unique_numbers = list(set(all_numbers))
|
192 |
+
|
193 |
# Filter out potential quantities/years/small irrelevant numbers
|
194 |
+
plausible_numbers = [n for n in unique_numbers if n >= 0.01] # Keep small decimals too
|
195 |
+
# Stricter filter for large numbers: exclude large integers (likely IDs, phone numbers)
|
196 |
+
# Keep numbers < 50000 OR numbers that have a non-zero decimal part
|
197 |
+
plausible_numbers = [n for n in plausible_numbers if n < 50000 or (n != int(n))]
|
198 |
|
199 |
# If we have plausible numbers other than subtotals, prefer them
|
200 |
non_subtotal_plausible = [n for n in plausible_numbers if n not in subtotal_numbers]
|
201 |
+
|
202 |
if non_subtotal_plausible:
|
203 |
return max(non_subtotal_plausible)
|
204 |
elif plausible_numbers: # Only subtotals (or nothing else plausible) were found
|
205 |
+
return max(plausible_numbers) # Return the largest subtotal/plausible as last resort
|
206 |
|
207 |
# 6. If still nothing, return None
|
208 |
print("Warning: Could not determine main amount.")
|
nlp_service.py
CHANGED
@@ -599,9 +599,20 @@ def analyze_expense_text(text):
|
|
599 |
next_expense_id += 1
|
600 |
logging.info(f"Added expense (from Gemini): {new_expense}")
|
601 |
# Update message for consistency
|
602 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
603 |
# Make details serializable for JSON response
|
604 |
-
|
|
|
|
|
|
|
|
|
|
|
605 |
else:
|
606 |
logging.warning("Gemini add_expense result missing required fields.")
|
607 |
response_data = {"action": "unknown", "status": "failed", "message": "Gemini suggested adding an expense, but details were incomplete."}
|
|
|
599 |
next_expense_id += 1
|
600 |
logging.info(f"Added expense (from Gemini): {new_expense}")
|
601 |
# Update message for consistency
|
602 |
+
# --- FIX: Check if date is valid before formatting ---
|
603 |
+
if isinstance(new_expense.get('date'), datetime.date):
|
604 |
+
date_str = new_expense['date'].strftime('%b %d, %Y')
|
605 |
+
response_data["message"] = f"✅ Expense added (via Gemini): {new_expense['currency']}{new_expense['amount']:.2f} for {new_expense['category']} on {date_str}."
|
606 |
+
else:
|
607 |
+
logging.warning(f"Gemini add_expense result had invalid date type: {type(new_expense.get('date'))}. Using default message.")
|
608 |
+
response_data["message"] = f"✅ Expense added (via Gemini): {new_expense['currency']}{new_expense['amount']:.2f} for {new_expense['category']} (date missing/invalid)."
|
609 |
# Make details serializable for JSON response
|
610 |
+
# Ensure date is serializable even if it was invalid earlier
|
611 |
+
if isinstance(response_data["details"].get("date"), datetime.date):
|
612 |
+
response_data["details"]["date"] = response_data["details"]["date"].isoformat()
|
613 |
+
else:
|
614 |
+
# Handle case where date might be None or wrong type after processing
|
615 |
+
response_data["details"]["date"] = None # Or some indicator of invalidity
|
616 |
else:
|
617 |
logging.warning("Gemini add_expense result missing required fields.")
|
618 |
response_data = {"action": "unknown", "status": "failed", "message": "Gemini suggested adding an expense, but details were incomplete."}
|