Added NLP
Files changed:
- app.py +121 -41
- nlp_service.py +611 -0
- requirements.txt +6 -2
app.py
CHANGED
@@ -11,6 +11,9 @@ from flask import Flask, request, jsonify
 from paddleocr import PaddleOCR
 from PIL import Image
 
+# --- NEW: Import the NLP analysis function ---
+from nlp_service import analyze_expense_text  # Import the core analysis function
+
 # --- Configuration ---
 LANG = 'en'  # Default language, can be overridden if needed
 NUM_WORKERS = 2  # Number of OCR worker threads
@@ -85,9 +88,12 @@ def find_main_amount(ocr_results):
     if not ocr_results:
         return None
 
+    amount_regex = re.compile(r'(?<!%)\b\d{1,3}(?:,?\d{3})*(?:\.\d{2})\b|\b\d+\.\d{2}\b|\b\d+\b(?!\.\d{1})')
+
+    # Prioritized keywords
+    priority_keywords = ['grand total', 'total amount', 'amount due', 'to pay', 'bill total', 'total payable']
+    secondary_keywords = ['total', 'balance', 'net amount', 'paid', 'charge', 'net total']  # Added 'net total'
+    lower_priority_keywords = ['subtotal', 'sub total']  # Added 'sub total'
 
     parsed_lines = []
     for i, line_info in enumerate(ocr_results):
@@ -100,62 +106,95 @@ def find_main_amount(ocr_results):
         float_numbers = []
         for num_str in numbers_in_line:
             try:
+                # Avoid converting year-like numbers if they stand alone on short lines
+                if len(text) < 7 and '.' not in num_str and 1900 < int(num_str.replace(',', '')) < 2100:
+                    # More robust check: avoid if it's the only thing and looks like a year
+                    if len(numbers_in_line) == 1 and len(num_str) == 4:
+                        continue
                 float_numbers.append(float(num_str.replace(',', '')))
             except ValueError:
                 continue
 
+        # Check for keywords
+        has_priority_keyword = any(re.search(r'\b' + re.escape(kw) + r'\b', text) for kw in priority_keywords)
+        has_secondary_keyword = any(re.search(r'\b' + re.escape(kw) + r'\b', text) for kw in secondary_keywords)
+        has_lower_priority_keyword = any(re.search(r'\b' + re.escape(kw) + r'\b', text) for kw in lower_priority_keywords)
 
         parsed_lines.append({
             "index": i,
             "text": text,
            "numbers": float_numbers,
+            "has_priority_keyword": has_priority_keyword,
+            "has_secondary_keyword": has_secondary_keyword,
+            "has_lower_priority_keyword": has_lower_priority_keyword,
             "confidence": confidence
         })
 
+    # --- Strategy to find the best candidate ---
+
+    # 1. Look for numbers on the SAME line as PRIORITY keywords
+    priority_candidates = []
+    for line in parsed_lines:
+        if line["has_priority_keyword"] and line["numbers"]:
+            priority_candidates.extend(line["numbers"])
+    if priority_candidates:
+        # Often the largest number on these lines is the final total
+        return max(priority_candidates)
+
+    # 2. Look for numbers on the SAME line as SECONDARY keywords
+    secondary_candidates = []
+    for line in parsed_lines:
+        if line["has_secondary_keyword"] and line["numbers"]:
+            secondary_candidates.extend(line["numbers"])
+    if secondary_candidates:
+        # If we only found secondary keywords, return the largest number found on those lines.
+        # This might catch 'Net Total' or 'Total' when 'Grand Total' isn't present.
+        return max(secondary_candidates)
+
+    # 3. Look near priority/secondary keywords (less reliable, might pick up tax/service charge).
+    #    Consider removing or deprioritizing this 'near' logic if same-line logic is sufficient.
+
+    # 4. Look for numbers on the SAME line as LOWER PRIORITY keywords (subtotal)
+    lower_priority_candidates = []
+    for line in parsed_lines:
+        if line["has_lower_priority_keyword"] and line["numbers"]:
+            lower_priority_candidates.extend(line["numbers"])
+    # Don't return a subtotal directly unless it's the only thing found later.
+
+    # 5. Fallback: largest plausible number overall (excluding subtotals if other numbers exist)
+    print("Warning: No numbers found on priority/secondary keyword lines. Using fallback.")
     all_numbers = []
+    subtotal_numbers = set(lower_priority_candidates)  # Keep track of subtotals
+
     for line in parsed_lines:
         all_numbers.extend(line["numbers"])
 
     if all_numbers:
         unique_numbers = list(set(all_numbers))
+
+        # Filter out potential quantities/years/small irrelevant numbers
+        plausible_numbers = [n for n in unique_numbers if n >= 1.0 or '.' in str(n)]
+        # Filter out very large numbers unlikely to be totals unless they have decimals?
+        plausible_numbers = [n for n in plausible_numbers if n < 100000 or '.' in str(n)]
+
+        # If we have plausible numbers other than subtotals, prefer them
+        non_subtotal_plausible = [n for n in plausible_numbers if n not in subtotal_numbers]
+
+        if non_subtotal_plausible:
+            return max(non_subtotal_plausible)
+        elif plausible_numbers:  # Only subtotals (or nothing else plausible) were found
+            return max(plausible_numbers)  # Return the largest subtotal as a last resort
 
+    # 6. If still nothing, return None
     print("Warning: Could not determine main amount.")
     return None
 
 # --- Flask App Setup ---
 app = Flask(__name__)
 
+# --- REMOVED: Register the NLP Blueprint ---
+# app.register_blueprint(nlp_bp)  # No longer needed as we call the function directly
+
 # --- Initialize OCR Manager ---
 ocr_model_factory = functools.partial(PaddleOCR, lang=LANG, use_angle_cls=True, use_gpu=False, show_log=False)
 ocr_manager = PaddleOCRModelManager(num_workers=NUM_WORKERS, model_factory=ocr_model_factory)
@@ -185,20 +224,54 @@ def extract_expense():
         # Perform OCR
         ocr_result = ocr_manager.infer(temp_file_path, cls=True)
 
+        # Process OCR results
+        extracted_text = ""
+        main_amount_ocr = None
+        if ocr_result:
             extracted_lines = [line[1][0] for line in ocr_result if line and len(line) > 1 and len(line[1]) > 0]
             extracted_text = "\n".join(extracted_lines)
+            main_amount_ocr = find_main_amount(ocr_result)  # Keep OCR amount extraction
+
+        # --- NEW: Call NLP Function Directly ---
+        nlp_analysis_result = None
+        nlp_error = None
+        if extracted_text:
+            try:
+                # Call the imported analysis function
+                nlp_analysis_result = analyze_expense_text(extracted_text)
+                print(f"NLP Service Analysis Result: {nlp_analysis_result}")
+                # Check if the NLP analysis itself reported an error/failure
+                if nlp_analysis_result.get("status") == "failed":
+                    nlp_error = nlp_analysis_result.get("message", "NLP processing failed")
+                    # Keep the result structure but note the failure
+            except Exception as nlp_e:
+                nlp_error = f"Error calling NLP analysis function: {nlp_e}"
+                print(f"Error calling NLP function: {nlp_error}")
+                nlp_analysis_result = None  # Ensure result is None on exception during call
+        else:
+            nlp_error = "No text extracted from image for NLP analysis."
+        # --- End NLP Call ---
 
+        # Construct the response
         response_data = {
             "type": "photo",
             "extracted_text": extracted_text,
+            "main_amount_ocr": main_amount_ocr,   # Amount found by OCR regex logic
+            "nlp_analysis": nlp_analysis_result,  # Include the full NLP analysis result (or None)
+            "nlp_error": nlp_error                # Include any error from NLP call/processing
         }
+
+        # Optional: Add top-level convenience fields based on successful NLP analysis
+        if nlp_analysis_result and nlp_analysis_result.get("status") == "success":
+            if nlp_analysis_result.get("action") == "add_expense":
+                response_data['confirmed_expense_details'] = nlp_analysis_result.get('details')
+                response_data['confirmation_message'] = nlp_analysis_result.get('message')
+            elif nlp_analysis_result.get("action") == "query_expense":
+                # Include query results if applicable (depends on nlp_service structure)
+                response_data['query_message'] = nlp_analysis_result.get('message')
+                response_data['query_criteria'] = nlp_analysis_result.get('criteria')
+                response_data['query_results_count'] = nlp_analysis_result.get('results_count')
+
         return jsonify(response_data)
 
     except Exception as e:
@@ -212,6 +285,13 @@ def extract_expense():
 
         return jsonify({"error": "File processing failed"}), 500
 
+# --- NEW: Health Check Endpoint ---
+@app.route('/health', methods=['GET'])
+def health_check():
+    # You could add more checks here (e.g., if OCR workers are alive)
+    return jsonify({"status": "ok"}), 200
+
+
 # --- Run the App ---
 if __name__ == '__main__':
     # Use port 7860 as expected by Hugging Face Spaces
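
Reviewer note: the tiered keyword heuristic above can be exercised without running OCR. A minimal smoke-test sketch, not part of the commit; it assumes the elided context in find_main_amount unpacks each PaddleOCR entry as [bbox, (text, confidence)] and lower-cases the text before keyword matching (the keyword lists are lower-case), so the mock lines are written lower-case:

# Hypothetical smoke test for find_main_amount (illustrative only).
from app import find_main_amount  # note: importing app also spins up the OCR worker threads

mock_receipt = [
    [[[0, 0], [1, 0], [1, 1], [0, 1]], ("subtotal 40.00", 0.99)],
    [[[0, 1], [1, 1], [1, 2], [0, 2]], ("service charge 5.00", 0.98)],
    [[[0, 2], [1, 2], [1, 3], [0, 3]], ("grand total 45.00", 0.97)],
]

# 'grand total' is a priority keyword, so 45.0 should win over the subtotal.
print(find_main_amount(mock_receipt))  # expected: 45.0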
nlp_service.py
ADDED
@@ -0,0 +1,611 @@
import re
import datetime
import dateparser  # Still essential for interpreting date strings
import spacy  # Import spaCy
from flask import Blueprint, request, jsonify
from collections import defaultdict
import logging
import os  # To handle potential model loading issues

# --- Setup ---
logging.basicConfig(level=logging.INFO)

# --- Load spaCy Model ---
# Using medium model for better accuracy and word vectors (though not used explicitly yet)
# Handle potential errors during model loading
try:
    # Check if running in an environment where models might be linked differently
    # (e.g., Google Cloud Functions sometimes needs explicit path)
    model_name = "en_core_web_md"
    if not spacy.util.is_package(model_name):
        print(f"spaCy model '{model_name}' not found as package. Attempting download...")
        spacy.cli.download(model_name)

    nlp = spacy.load(model_name)
    logging.info(f"Successfully loaded spaCy model '{model_name}'")
except (OSError, ImportError) as e:
    logging.error(f"Could not load spaCy model '{model_name}'. Error: {e}")
    logging.error("Ensure the model is downloaded: python -m spacy download en_core_web_md")
    # Fallback or exit - for now, we'll log and potentially fail later if nlp isn't loaded
    nlp = None  # Indicate model loading failed

# --- In-Memory Data Storage (Replace with Database) ---
expenses = []
next_expense_id = 1

# --- NLP Configuration & Helpers ---
CURRENCY_SYMBOLS = ["₹", "$", "€", "£"]  # Expand as needed
# More robust regex to find monetary values even if spaCy misses MONEY entity
FALLBACK_AMOUNT_REGEX = re.compile(r'([\$€£₹]|\b(?:rs|usd|eur|gbp))\s?([\d,]+(?:\.\d{1,2})?)\b|\b([\d,]+(?:\.\d{1,2})?)\s?([\$€£₹]|\b(?:rupees|rs|dollars|euros|pounds|usd|eur|gbp))\b', re.IGNORECASE)

# Category keywords remain useful
CATEGORY_KEYWORDS = {
    "food": ["food", "meal", "lunch", "dinner", "snack", "restaurant", "dining", "groceries", "sandwich", "burger", "pizza"],
    "coffee": ["coffee", "latte", "cappuccino", "espresso", "cafe", "starbucks", "ccd", "café", "mocha"],
    "travel": ["travel", "taxi", "flight", "train", "bus", "uber", "ola", "fuel", "gas", "lyft", "cab", "ticket"],
    "shopping": ["shop", "shopping", "clothes", "electronics", "mall", "amazon", "flipkart", "purchase", "order", "store"],
    "groceries": ["groceries", "supermarket", "zepto", "blinkit", "bigbasket", "vegetables", "milk", "market"],
    "utilities": ["utility", "utilities", "bill", "electricity", "water", "internet", "phone", "recharge"],
    "entertainment": ["movie", "cinema", "concert", "game", "fun", "netflix", "spotify", "tickets"],
    "rent": ["rent", "lease"],
    "transport": ["transport", "metro", "auto", "rickshaw", "commute"]
}

# Keywords for intent detection (can be less critical now, intent inferred more from entities)
QUERY_KEYWORDS = ["how much", "show me", "list", "what are", "total", "summary", "spending", "history", "report", "biggest", "view"]
ADD_EXPENSE_VERBS = ["spent", "bought", "paid", "cost", "charged", "expensed", "got", "had"]  # Verbs often associated with spending


def parse_money_entity(text, doc):
    """
    Extracts amount using spaCy MONEY entities first, then falls back to regex.
    Returns the amount as float and identified currency symbol/code.
    """
    amount = None
    currency = None
    text = text.replace(',', '')  # Remove commas for easier parsing

    # 1. Try spaCy MONEY entities first
    money_ents = [ent for ent in doc.ents if ent.label_ == "MONEY"]
    if money_ents:
        # Prioritize longer entities or ones closer to verbs like 'spent' if multiple found
        # Simple approach: take the first one for now
        ent_text = money_ents[0].text.replace(',', '')
        # Try to extract number and symbol/code from the entity text
        num_match = re.search(r'([\d\.]+)', ent_text)
        if num_match:
            try:
                amount = float(num_match.group(1))
                # Try to find a known symbol or code within the entity text
                symbol_match = re.search(r'([\$€£₹])', ent_text)
                if symbol_match:
                    currency = symbol_match.group(1)
                else:
                    # Check for codes like USD, GBP etc. (simple check)
                    code_match = re.search(r'\b(USD|EUR|GBP|INR|RS)\b', ent_text, re.IGNORECASE)
                    if code_match:
                        currency = code_match.group(1).upper()
                        # Standardize common ones
                        if currency == "RS": currency = "INR"

                # If amount found but no currency symbol in entity, check doc context
                if amount is not None and currency is None:
                    for token in doc:
                        if token.text in CURRENCY_SYMBOLS:
                            currency = token.text
                            break
                return amount, currency
            except ValueError:
                pass  # Failed to convert number

    # 2. Fallback Regex (if spaCy missed it or parsing failed)
    match = FALLBACK_AMOUNT_REGEX.search(text)
    if match:
        try:
            if match.group(2):  # Format: $100 or Rs 100
                amount = float(match.group(2))
                currency_text = match.group(1)
            elif match.group(3):  # Format: 100 dollars or 100 Rs
                amount = float(match.group(3))
                currency_text = match.group(4)
            else:  # Should not happen with this regex, but safety first
                return None, None

            # Normalize currency symbol/code
            if currency_text in CURRENCY_SYMBOLS:
                currency = currency_text
            else:
                currency_text = currency_text.lower()
                if currency_text in ["rs", "rupees"]: currency = "₹"  # Or INR
                elif currency_text in ["dollars", "usd"]: currency = "$"  # Or USD
                elif currency_text in ["pounds", "gbp"]: currency = "£"  # Or GBP
                elif currency_text in ["euros", "eur"]: currency = "€"  # Or EUR

            return amount, currency

        except (ValueError, IndexError):
            logging.warning(f"Regex fallback failed to parse amount from: {text}")
            return None, None

    return None, None  # No amount found

def parse_date_entities(doc):
    """
    Uses dateparser to interpret spaCy DATE entities.
    Returns the *most likely* date found, defaulting to today.
    """
    dates = []
    # Settings for dateparser: prefer past dates for expenses
    settings = {'PREFER_DATES_FROM': 'past', 'RELATIVE_BASE': datetime.datetime.now()}

    date_ents = [ent.text for ent in doc.ents if ent.label_ == "DATE"]
    logging.debug(f"Found DATE entities: {date_ents}")

    if date_ents:
        for date_str in date_ents:
            # Sometimes spaCy includes words like "on", "last" in the entity; dateparser handles this
            parsed = dateparser.parse(date_str, settings=settings)
            if parsed:
                dates.append(parsed.date())

    if dates:
        # Heuristic: if multiple dates, prefer the one closest to today? Or just the first?
        # Let's prefer the latest valid past date found (most recent expense)
        past_dates = [d for d in dates if d <= datetime.date.today()]
        if past_dates:
            return max(past_dates)  # Return the most recent valid date
        elif dates:
            return min(dates)  # If only future dates found, return the earliest one (less likely for expense)

    # Fallback if no DATE entity found or parsed
    logging.debug("No valid DATE entity found or parsed, defaulting to today.")
    return datetime.date.today()

def identify_merchant_and_category(doc):
    """
    Identifies merchant using ORG/PERSON/GPE entities and context.
    Identifies category using keywords and context around amount/merchant.
    """
    merchant = None
    category = "Uncategorized"  # Default

    money_token_indices = [token.i for token in doc if token.like_num or token.text in CURRENCY_SYMBOLS or any(sym in token.text for sym in CURRENCY_SYMBOLS) or (token.ent_type_ == "MONEY")]

    potential_merchants = []
    for ent in doc.ents:
        if ent.label_ in ["ORG", "PERSON", "GPE", "FAC"]:  # Facility might also be relevant
            # Check context: is it preceded by "at", "from", "in"? Is it near the money amount?
            prepositions = {"at", "from", "in", "on", "with"}
            # Check token before the entity start
            if ent.start > 0 and doc[ent.start - 1].lower_ in prepositions:
                potential_merchants.append(ent.text)
                continue
            # Check dependency relation (e.g., object of preposition)
            if ent.root.head.lemma_ in prepositions:
                potential_merchants.append(ent.text)
                continue
            # Check proximity to money amount if indices available
            if money_token_indices:
                min_dist = min(abs(ent.start - idx) for idx in money_token_indices)
                if min_dist < 5:  # Arbitrary proximity threshold
                    potential_merchants.append(ent.text)
                    continue

    if potential_merchants:
        # Simple heuristic: choose the first likely one. Could be refined.
        # Filter out very common words or locations if needed (e.g., "City", "Bank" if too generic)
        merchant = potential_merchants[0].strip()
        logging.debug(f"Identified potential merchant: {merchant} from entities {potential_merchants}")

    # --- Category Identification ---
    text_lower = doc.text.lower()

    # 1. Check explicit category keywords
    found_category = None
    matched_keywords = []
    for cat, keywords in CATEGORY_KEYWORDS.items():
        if any(keyword in text_lower for keyword in keywords):
            # If multiple categories match, prioritize based on merchant or context?
            # Simple approach: store all matches for now
            matched_keywords.append(cat)

    if len(matched_keywords) == 1:
        found_category = matched_keywords[0]
    elif len(matched_keywords) > 1:
        # Ambiguity - requires smarter logic. E.g., "Coffee at Food court" -> coffee or food?
        # Prioritize based on merchant if known, e.g. if merchant is Starbucks -> coffee
        if merchant:
            merchant_lower = merchant.lower()
            if "starbucks" in merchant_lower or "ccd" in merchant_lower or "café" in merchant_lower:
                if "coffee" in matched_keywords: found_category = "coffee"
            elif "amazon" in merchant_lower or "flipkart" in merchant_lower:
                if "shopping" in matched_keywords: found_category = "shopping"
            elif "zepto" in merchant_lower or "blinkit" in merchant_lower or "groceries" in merchant_lower:
                if "groceries" in matched_keywords: found_category = "groceries"
                elif "food" in matched_keywords: found_category = "groceries"  # Prefer specific

        # If still ambiguous, maybe pick the most specific one (e.g., prefer 'coffee' over 'food')
        if not found_category:
            if "coffee" in matched_keywords: found_category = "coffee"
            elif "groceries" in matched_keywords: found_category = "groceries"
            elif "transport" in matched_keywords: found_category = "transport"
            # Add more specific priorities if needed
            elif "food" in matched_keywords: found_category = "food"  # More general last
            else: found_category = matched_keywords[0]  # Default to first match if no rules apply

    if found_category:
        category = found_category
    # 2. (Optional/Advanced) Infer from merchant if category is Uncategorized
    elif merchant and category == "Uncategorized":
        merchant_lower = merchant.lower()
        if "starbucks" in merchant_lower or "ccd" in merchant_lower or "café" in merchant_lower: category = "coffee"
        elif "amazon" in merchant_lower or "flipkart" in merchant_lower: category = "shopping"
        elif "zepto" in merchant_lower or "blinkit" in merchant_lower: category = "groceries"
        elif "uber" in merchant_lower or "ola" in merchant_lower: category = "travel"
        elif "netflix" in merchant_lower or "spotify" in merchant_lower: category = "entertainment"
        # Add more merchant->category mappings

    # 3. (Optional/Advanced) Use dependency parsing or word vectors.
    # Example: look for nouns that are objects of spending verbs near the amount.
    # This requires more complex linguistic analysis.

    logging.debug(f"Identified Category: {category}")
    return merchant, category

def determine_intent(doc):
    """Determines intent: 'add_expense', 'query_expense', or 'unknown'."""
    text_lower = doc.text.lower()

    has_query_keyword = any(keyword in text_lower for keyword in QUERY_KEYWORDS)
    has_add_verb = any(verb.lemma_ in ADD_EXPENSE_VERBS for verb in doc if verb.pos_ == "VERB")
    has_money_entity = any(ent.label_ == "MONEY" for ent in doc.ents) or FALLBACK_AMOUNT_REGEX.search(text_lower) is not None

    # More explicit questions are likely queries
    if doc[0].pos_ == "AUX" or doc[0].lemma_ in ["what", "how", "show", "list", "view"]:  # Starts like a question
        return "query_expense"

    if has_query_keyword:
        return "query_expense"

    # If it has a spending verb and a money amount, likely adding expense
    if has_add_verb and has_money_entity:
        return "add_expense"

    # If it just has a money amount and maybe date/merchant, could be adding expense (implicit verb)
    if has_money_entity and not has_query_keyword:
        # Check if there are nouns suggesting items bought
        has_object_noun = any(tok.pos_ == "NOUN" and tok.dep_ in ["dobj", "pobj", "attr"] for tok in doc)
        if has_object_noun or any(ent.label_ in ["ORG", "PRODUCT"] for ent in doc.ents):
            return "add_expense"

    # If only query keywords or unclear structure, lean towards query or unknown
    if has_query_keyword:
        return "query_expense"

    return "unknown"

# --- Filtering and Formatting (largely reused, minor adjustments) ---

def filter_expenses(criteria):
    """Filters the global 'expenses' list based on criteria."""
    # (This function remains largely the same as the previous version)
    filtered = expenses

    # Filter by Category
    if 'category' in criteria and criteria['category'] is not None:
        target_cat = criteria['category'].lower()
        # Handle general 'food' query including 'coffee', 'groceries' etc.
        food_related_cats = {'food', 'coffee', 'groceries', 'restaurant'}  # Define food-related categories
        if target_cat == 'food':
            filtered = [e for e in filtered if e['category'].lower() in food_related_cats]
        else:
            filtered = [e for e in filtered if e['category'].lower() == target_cat]

    # Filter by Date Range (start_date and end_date are inclusive)
    if 'start_date' in criteria and criteria['start_date'] is not None:
        filtered = [e for e in filtered if e['date'] >= criteria['start_date']]
    if 'end_date' in criteria and criteria['end_date'] is not None:
        filtered = [e for e in filtered if e['date'] <= criteria['end_date']]

    # Filter by Merchant (case-insensitive substring match)
    if 'merchant' in criteria and criteria['merchant'] is not None:
        target_merchant = criteria['merchant'].lower()
        filtered = [e for e in filtered if e['merchant'] and target_merchant in e['merchant'].lower()]

    return filtered

def parse_date_range_from_query(doc):
    """Parses date ranges specifically for queries (e.g., 'this month', 'last week')."""
    # (This function remains largely the same, using dateparser on DATE entities or keywords)
    today = datetime.date.today()
    text_lower = doc.text.lower()  # Use full text for keywords like "this month"
    start_date, end_date = None, None

    # Prioritize DATE entities found by spaCy
    date_ents_text = [ent.text for ent in doc.ents if ent.label_ == "DATE"]
    parsed_dates = []
    settings = {'PREFER_DATES_FROM': 'past', 'RELATIVE_BASE': datetime.datetime.now()}

    for date_str in date_ents_text:
        # Try parsing as a potential range using dateparser's experimental range feature (or parse single dates).
        # For simplicity, we'll stick to parsing single points and let keyword logic handle ranges.
        parsed = dateparser.parse(date_str, settings=settings)
        if parsed:
            parsed_dates.append(parsed.date())

    # If spaCy found specific dates, use them
    if len(parsed_dates) == 1:
        start_date = end_date = parsed_dates[0]
    elif len(parsed_dates) > 1:
        # Ambiguous; maybe take min/max? Or rely on keywords below?
        start_date = min(parsed_dates)
        end_date = max(parsed_dates)
        if start_date > end_date:  # Swap if order is wrong
            start_date, end_date = end_date, start_date

    # If no specific date entities, check for range keywords
    if start_date is None and end_date is None:
        if "today" in text_lower:
            start_date = end_date = today
        elif "yesterday" in text_lower:
            start_date = end_date = today - datetime.timedelta(days=1)
        elif "this week" in text_lower:
            start_of_week = today - datetime.timedelta(days=today.weekday())  # Monday
            end_of_week = start_of_week + datetime.timedelta(days=6)  # Sunday
            start_date = start_of_week
            end_date = end_of_week
        elif "last week" in text_lower:
            end_of_last_week = today - datetime.timedelta(days=today.weekday() + 1)  # Last Sunday
            start_of_last_week = end_of_last_week - datetime.timedelta(days=6)  # Last Monday
            start_date = start_of_last_week
            end_date = end_of_last_week
        elif "this month" in text_lower:
            start_date = today.replace(day=1)
            next_month = today.replace(day=28) + datetime.timedelta(days=4)
            last_day_of_month = next_month - datetime.timedelta(days=next_month.day)
            end_date = last_day_of_month
        elif "last month" in text_lower:
            first_day_of_current_month = today.replace(day=1)
            last_day_of_last_month = first_day_of_current_month - datetime.timedelta(days=1)
            first_day_of_last_month = last_day_of_last_month.replace(day=1)
            start_date = first_day_of_last_month
            end_date = last_day_of_last_month
        elif "year" in text_lower:  # e.g., "this year", "last year"
            if "this year" in text_lower:
                start_date = datetime.date(today.year, 1, 1)
                end_date = datetime.date(today.year, 12, 31)
            elif "last year" in text_lower:
                start_date = datetime.date(today.year - 1, 1, 1)
                end_date = datetime.date(today.year - 1, 12, 31)
            # Check for specific year like "in 2023"
            year_match = re.search(r'\b(in|for)\s+(\d{4})\b', text_lower)
            if year_match:
                year = int(year_match.group(2))
                start_date = datetime.date(year, 1, 1)
                end_date = datetime.date(year, 12, 31)
        # Add specific month parsing ("in January") if needed (similar to previous version)
        else:
            month_match = re.search(r'\b(in|for)\s+(january|february|march|april|may|june|july|august|september|october|november|december)\b', text_lower)
            if month_match:
                month_name = month_match.group(2)
                year_context = today.year  # Assume current year
                # Check if a year was mentioned nearby
                year_ent = [e.text for e in doc.ents if e.label_ == "DATE" and e.text.isdigit() and len(e.text) == 4]
                if year_ent:
                    year_context = int(year_ent[0])
                try:
                    month_num = list(datetime.date(2000, i, 1).strftime('%B').lower() for i in range(1, 13)).index(month_name) + 1
                    start_date = datetime.date(year_context, month_num, 1)
                    next_m = (start_date.replace(day=28) + datetime.timedelta(days=4))
                    end_date = next_m - datetime.timedelta(days=next_m.day)
                except (ValueError, IndexError): pass  # Ignore invalid month/year

    logging.debug(f"Parsed date range for query: {start_date} to {end_date}")
    return start_date, end_date

def format_expense_list(expense_list, title="Here are the expenses:"):
    """Formats a list of expenses into a user-friendly string."""
    # (This function remains largely the same)
    if not expense_list:
        return "No expenses found matching your criteria."

    total_amount = sum(e['amount'] for e in expense_list)
    # Try to get a consistent currency symbol; default to first expense's symbol or fallback
    currency_symbol = (expense_list[0].get("currency") or "₹") if expense_list else "₹"

    response_lines = [title]
    expense_list.sort(key=lambda x: x['date'], reverse=True)

    for expense in expense_list:
        cur = expense.get("currency") or currency_symbol  # Use expense-specific or default
        amount_str = f"{cur}{expense['amount']:.2f}"
        merchant_part = f" at {expense['merchant']}" if expense['merchant'] else ""
        category_part = f" ({expense['category']})" if expense['category'] != 'Uncategorized' else ""
        date_str = expense['date'].strftime("%b %d, %Y")
        response_lines.append(f"- {amount_str}{category_part}{merchant_part} - {date_str}")

    if len(expense_list) > 1:
        total_str = f"{currency_symbol}{total_amount:.2f}"
        response_lines.append(f"Total: {total_str}")

    return "\n".join(response_lines)

# --- NEW: Core NLP Processing Function ---
def analyze_expense_text(text):
    """
    Analyzes text to extract expense details or understand queries using spaCy.
    Returns a dictionary with action, status, and extracted details/message.
    """
    global next_expense_id  # Allow modification of the global counter

    if nlp is None:
        logging.error("spaCy model not loaded. Cannot process text.")
        return {"action": "error", "status": "failed", "message": "NLP model not available"}

    logging.info(f"Analyzing text: {text[:100]}...")  # Log snippet
    doc = nlp(text)
    logging.debug(f"spaCy Entities: {[(ent.text, ent.label_) for ent in doc.ents]}")

    intent = determine_intent(doc)
    logging.info(f"Determined Intent: {intent}")
    response_data = {}

    if intent == "add_expense":
        amount, currency = parse_money_entity(text, doc)
        expense_date = parse_date_entities(doc)
        merchant, category = identify_merchant_and_category(doc)

        if amount is not None:
            currency_symbol = currency or "₹"  # Default currency
            new_expense = {
                "id": next_expense_id,
                "amount": amount,
                "currency": currency_symbol,
                "category": category,
                "merchant": merchant,
                "date": expense_date,  # Keep as date object internally
                "original_message": text
            }
            expenses.append(new_expense)
            next_expense_id += 1
            logging.info(f"Added expense (in-memory): {new_expense}")

            merchant_part = f" at {merchant}" if merchant else ""
            date_str = expense_date.strftime('%b %d, %Y')
            confirmation_msg = f"✅ Expense added: {currency_symbol}{amount:.2f} for {category}{merchant_part} on {date_str}."

            new_expense_serializable = new_expense.copy()
            new_expense_serializable["date"] = new_expense["date"].isoformat()

            response_data = {
                "action": "add_expense",
                "status": "success",
                "message": confirmation_msg,
                "details": new_expense_serializable
            }
        else:
            logging.warning(f"Could not extract amount reliably from: {text}")
            response_data = {
                "action": "add_expense",
                "status": "failed",
                "message": "Sorry, I couldn't understand the amount. Please include it clearly (e.g., '₹500', '$20', '15 pounds')."
            }

    elif intent == "query_expense":
        logging.info("Processing query intent.")
        query_criteria = {}
        _q_merchant, q_category = identify_merchant_and_category(doc)

        # ... (rest of query criteria extraction logic remains the same) ...
        query_cat_found = None
        text_lower = doc.text.lower()
        for cat, keywords in CATEGORY_KEYWORDS.items():
            if any(keyword in text_lower for keyword in keywords):
                if cat == 'food' or q_category == 'food':
                    query_cat_found = 'food'
                    break
                query_cat_found = q_category if q_category != 'Uncategorized' else cat
                break

        query_criteria['category'] = query_cat_found
        query_criteria['merchant'] = _q_merchant
        start_date, end_date = parse_date_range_from_query(doc)
        query_criteria['start_date'] = start_date
        query_criteria['end_date'] = end_date

        logging.info(f"Query Criteria: {query_criteria}")
        results = filter_expenses(query_criteria)
        response_message = ""

        # ... (rest of query response formatting logic remains the same) ...
        if results and ("total" in text_lower or "sum" in text_lower or "how much" in doc[0].lower_):
            total_amount = sum(e['amount'] for e in results)
            currency_symbol = results[0].get("currency") or "₹"
            category_filter_text = f" on {query_criteria['category']}" if query_criteria['category'] else ""
            date_filter_text = ""
            if start_date and end_date and start_date == end_date: date_filter_text = f" for {start_date.strftime('%b %d, %Y')}"
            elif start_date and end_date: date_filter_text = f" from {start_date.strftime('%b %d')} to {end_date.strftime('%b %d, %Y')}"
            elif start_date: date_filter_text = f" since {start_date.strftime('%b %d, %Y')}"
            elif end_date: date_filter_text = f" until {end_date.strftime('%b %d, %Y')}"
            response_message = f"Your total spending{category_filter_text}{date_filter_text} is {currency_symbol}{total_amount:.2f}."
            if len(results) <= 10:
                response_message += "\n" + format_expense_list(results, "Details:")
            else:
                response_message += f" (from {len(results)} transactions)"
        elif results and ("biggest" in text_lower or "largest" in text_lower or "top" in text_lower):
            top_n = 3
            top_expenses = sorted(results, key=lambda x: x['amount'], reverse=True)[:top_n]
            response_message = format_expense_list(top_expenses, f"Your top {len(top_expenses)} expenses:")
        else:
            date_filter_desc = ""
            if start_date and end_date and start_date == end_date: date_filter_desc = f" from {start_date.strftime('%b %d, %Y')}"
            elif start_date or end_date: date_filter_desc = " matching the date criteria"
            category_filter_desc = f" for {query_criteria['category']}" if query_criteria['category'] else ""
            merchant_filter_desc = f" at {query_criteria['merchant']}" if query_criteria['merchant'] else ""
            title = f"Expenses{category_filter_desc}{merchant_filter_desc}{date_filter_desc}:"
            response_message = format_expense_list(results, title)

        response_data = {
            "action": "query_expense",
            "status": "success",
            "message": response_message,
            "criteria": {k: v.isoformat() if isinstance(v, datetime.date) else v for k, v in query_criteria.items() if v is not None},
            "results_count": len(results)
        }

    else:  # intent == "unknown"
        logging.info(f"Could not determine intent for: {text}")
        response_data = {
            "action": "unknown",
            "status": "failed",
            "message": "Sorry, I couldn't quite understand that. Please try phrasing your expense or query differently. \nExamples:\n- 'Spent ₹50 on coffee yesterday at Starbucks'\n- 'Show my food expenses last week'\n- 'What was my total spending last month?'"
        }

    logging.info(f"Analysis complete. Action: {response_data.get('action')}, Status: {response_data.get('status')}")
    return response_data


# --- Flask Blueprint Setup (Optional: Keep if direct API access is needed) ---
nlp_bp = Blueprint('nlp_service', __name__)

@nlp_bp.route('/process_nlp', methods=['POST'])
def process_nlp_expense_route():
    """Flask route handler that calls the core analysis function."""
    data = request.get_json()
    if not data or 'message' not in data:
        logging.warning("Received request without 'message' field.")
        return jsonify({"error": "Missing 'message' in request body"}), 400

    user_message = data['message']
    result = analyze_expense_text(user_message)  # Call the core function

    # Determine status code based on result
    status_code = 200
    if result.get("status") == "failed":
        status_code = 400  # Or 500 if it's an internal NLP model error
        if result.get("message") == "NLP model not available":
            status_code = 500

    return jsonify(result), status_code


# --- Example Usage / Testing Setup ---
if __name__ == '__main__':
    from flask import Flask

    app = Flask(__name__)
    app.register_blueprint(nlp_bp)  # Register the blueprint

    # Dummy data removed

    print("Starting Flask server for testing NLP service...")
    # print("Registered expenses:", expenses)  # Can be long
    if nlp is None:
        print("WARNING: spaCy model failed to load. /process_nlp endpoint will return errors.")
    app.run(debug=True, host='0.0.0.0', port=5001)
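
Usage sketch for the new module (illustrative, not part of the commit; the exact fields returned depend on what en_core_web_md extracts from the text):

# Direct call into the core function, as app.py now does:
from nlp_service import analyze_expense_text

result = analyze_expense_text("Spent ₹450 on groceries at Zepto yesterday")
print(result["action"], result["status"])  # e.g. add_expense success
print(result.get("message"))               # human-readable confirmation or error

# Or over HTTP against the test server started by `python nlp_service.py`
# (needs the 'requests' package, which this commit removes from requirements.txt):
import requests

resp = requests.post(
    "http://localhost:5001/process_nlp",
    json={"message": "What was my total spending last month?"},
)
print(resp.status_code, resp.json().get("message"))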
requirements.txt
CHANGED
@@ -1,5 +1,9 @@
 Pillow
 flask
-requests
+# requests  # Removed as NLP is called directly now
 paddlepaddle
-paddleocr
+paddleocr
+spacy>=3.0.0  # Added spaCy
+dateparser>=1.0.0  # Added dateparser
+# Note: spaCy model 'en_core_web_md' needs to be downloaded separately:
+# python -m spacy download en_core_web_md
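
Since Hugging Face Spaces rebuilds the environment from requirements.txt, the runtime spacy.cli.download in nlp_service.py could be avoided by installing the model wheel at build time instead; spaCy publishes its models as pip-installable wheels on the spacy-models GitHub releases. A possible extra requirements line (the 3.7.1 version is an assumption, pick a release matching the installed spaCy):

# Hypothetical: install en_core_web_md at build time instead of at startup
https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl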