MonilM committed
Commit 4c9f681
·
1 Parent(s): 3b392a9
Files changed (3):
  1. app.py +121 -41
  2. nlp_service.py +611 -0
  3. requirements.txt +6 -2
app.py CHANGED
@@ -11,6 +11,9 @@ from flask import Flask, request, jsonify
 from paddleocr import PaddleOCR
 from PIL import Image
 
+# --- NEW: Import the NLP analysis function ---
+from nlp_service import analyze_expense_text  # Import the core analysis function
+
 # --- Configuration ---
 LANG = 'en'  # Default language, can be overridden if needed
 NUM_WORKERS = 2  # Number of OCR worker threads
@@ -85,9 +88,12 @@ def find_main_amount(ocr_results):
     if not ocr_results:
         return None
 
-    potential_amounts = []
-    amount_regex = re.compile(r'(?<!\%)\b\d{1,3}(?:,?\d{3})*(?:\.\d{2})\b|\b\d+\.\d{2}\b|\b\d+\b(?!\.\d{1})')
-    total_keywords = ['total', 'grand total', 'amount due', 'balance', 'net amount', 'paid', 'charge', 'subtotal', 'total amount', 'to pay']
+    amount_regex = re.compile(r'(?<!%)\b\d{1,3}(?:,?\d{3})*(?:\.\d{2})\b|\b\d+\.\d{2}\b|\b\d+\b(?!\.\d{1})')
+
+    # Prioritized keywords
+    priority_keywords = ['grand total', 'total amount', 'amount due', 'to pay', 'bill total', 'total payable']
+    secondary_keywords = ['total', 'balance', 'net amount', 'paid', 'charge', 'net total']  # Added 'net total'
+    lower_priority_keywords = ['subtotal', 'sub total']  # Added 'sub total'
 
     parsed_lines = []
     for i, line_info in enumerate(ocr_results):
@@ -100,62 +106,95 @@ def find_main_amount(ocr_results):
         float_numbers = []
         for num_str in numbers_in_line:
             try:
-                if len(text) < 6 and '.' not in num_str and 1900 < int(num_str.replace(',', '')) < 2100:
-                    continue
+                # Avoid converting year-like numbers if they stand alone on short lines
+                if len(text) < 7 and '.' not in num_str and 1900 < int(num_str.replace(',', '')) < 2100:
+                    # More robust check: avoid if it's the only thing and looks like a year
+                    if len(numbers_in_line) == 1 and len(num_str) == 4:
+                        continue
                 float_numbers.append(float(num_str.replace(',', '')))
             except ValueError:
                 continue
 
-        has_keyword = False
-        for keyword in total_keywords:
-            if re.search(r'\b' + re.escape(keyword) + r'\b', text):
-                has_keyword = True
-                break
+        # Check for keywords
+        has_priority_keyword = any(re.search(r'\b' + re.escape(kw) + r'\b', text) for kw in priority_keywords)
+        has_secondary_keyword = any(re.search(r'\b' + re.escape(kw) + r'\b', text) for kw in secondary_keywords)
+        has_lower_priority_keyword = any(re.search(r'\b' + re.escape(kw) + r'\b', text) for kw in lower_priority_keywords)
 
         parsed_lines.append({
             "index": i,
             "text": text,
             "numbers": float_numbers,
-            "has_keyword": has_keyword,
+            "has_priority_keyword": has_priority_keyword,
+            "has_secondary_keyword": has_secondary_keyword,
+            "has_lower_priority_keyword": has_lower_priority_keyword,
             "confidence": confidence
         })
 
-    keyword_candidates = []
-    keyword_line_indices = {line["index"] for line in parsed_lines if line["has_keyword"]}
-    checked_indices_near_keywords = set()
-
-    for line_idx in keyword_line_indices:
-        indices_to_check = {line_idx, line_idx - 1, line_idx + 1}
-        for check_idx in indices_to_check:
-            if 0 <= check_idx < len(parsed_lines) and check_idx not in checked_indices_near_keywords:
-                line_to_check = parsed_lines[check_idx]
-                if line_to_check["numbers"]:
-                    keyword_candidates.extend(line_to_check["numbers"])
-                checked_indices_near_keywords.add(check_idx)
-
-    if keyword_candidates:
-        unique_candidates = list(set(keyword_candidates))
-        if unique_candidates:
-            return max(unique_candidates)
-
-    print("Warning: No numbers found near keywords. Using fallback (largest overall).")
+    # --- Strategy to find the best candidate ---
+
+    # 1. Look for numbers on the SAME line as PRIORITY keywords
+    priority_candidates = []
+    for line in parsed_lines:
+        if line["has_priority_keyword"] and line["numbers"]:
+            priority_candidates.extend(line["numbers"])
+    if priority_candidates:
+        # Often the largest number on these lines is the final total
+        return max(priority_candidates)
+
+    # 2. Look for numbers on the SAME line as SECONDARY keywords
+    secondary_candidates = []
+    for line in parsed_lines:
+        if line["has_secondary_keyword"] and line["numbers"]:
+            secondary_candidates.extend(line["numbers"])
+    if secondary_candidates:
+        # If we only found secondary keywords, return the largest number found on those lines
+        # This might catch 'Net Total' or 'Total' when 'Grand Total' isn't present
+        return max(secondary_candidates)
+
+    # 3. Look near priority/secondary keywords (less reliable, might pick up tax/service charge)
+    # Consider removing or deprioritizing this 'near' logic if same-line logic is sufficient
+
+    # 4. Look for numbers on the SAME line as LOWER PRIORITY keywords (Subtotal)
+    lower_priority_candidates = []
+    for line in parsed_lines:
+        if line["has_lower_priority_keyword"] and line["numbers"]:
+            lower_priority_candidates.extend(line["numbers"])
+    # Don't return a subtotal directly unless it's the only thing found later
+
+    # 5. Fallback: largest plausible number overall (excluding subtotals if other numbers exist)
+    print("Warning: No numbers found on priority/secondary keyword lines. Using fallback.")
     all_numbers = []
+    subtotal_numbers = set(lower_priority_candidates)  # Keep track of subtotals
+
     for line in parsed_lines:
         all_numbers.extend(line["numbers"])
 
     if all_numbers:
         unique_numbers = list(set(all_numbers))
-        plausible_numbers = [n for n in unique_numbers if n < 100000 or '.' in str(n)]
-        plausible_numbers = [n for n in plausible_numbers if n >= 1.0 or '.' in str(n)]
-        if plausible_numbers:
-            return max(plausible_numbers)
+
+        # Filter out potential quantities/years/small irrelevant numbers
+        plausible_numbers = [n for n in unique_numbers if n >= 1.0 or '.' in str(n)]
+        # Filter out very large numbers unlikely to be totals unless they have decimals?
+        plausible_numbers = [n for n in plausible_numbers if n < 100000 or '.' in str(n)]
+
+        # If we have plausible numbers other than subtotals, prefer them
+        non_subtotal_plausible = [n for n in plausible_numbers if n not in subtotal_numbers]
+
+        if non_subtotal_plausible:
+            return max(non_subtotal_plausible)
+        elif plausible_numbers:  # Only subtotals (or nothing else plausible) were found
+            return max(plausible_numbers)  # Return the largest subtotal as a last resort
 
+    # 6. If still nothing, return None
     print("Warning: Could not determine main amount.")
     return None
 
 # --- Flask App Setup ---
 app = Flask(__name__)
 
+# --- REMOVED: Register the NLP Blueprint ---
+# app.register_blueprint(nlp_bp)  # No longer needed as we call the function directly
+
 # --- Initialize OCR Manager ---
 ocr_model_factory = functools.partial(PaddleOCR, lang=LANG, use_angle_cls=True, use_gpu=False, show_log=False)
 ocr_manager = PaddleOCRModelManager(num_workers=NUM_WORKERS, model_factory=ocr_model_factory)
@@ -185,20 +224,54 @@ def extract_expense():
         # Perform OCR
         ocr_result = ocr_manager.infer(temp_file_path, cls=True)
 
-        # Process results
-        if not ocr_result:
-            extracted_text = ""
-            main_amount = None
-        else:
+        # Process OCR results
+        extracted_text = ""
+        main_amount_ocr = None
+        if ocr_result:
             extracted_lines = [line[1][0] for line in ocr_result if line and len(line) > 1 and len(line[1]) > 0]
             extracted_text = "\n".join(extracted_lines)
-            main_amount = find_main_amount(ocr_result)
+            main_amount_ocr = find_main_amount(ocr_result)  # Keep OCR amount extraction
+
+        # --- NEW: Call NLP Function Directly ---
+        nlp_analysis_result = None
+        nlp_error = None
+        if extracted_text:
+            try:
+                # Call the imported analysis function
+                nlp_analysis_result = analyze_expense_text(extracted_text)
+                print(f"NLP Service Analysis Result: {nlp_analysis_result}")
+                # Check if the NLP analysis itself reported an error/failure
+                if nlp_analysis_result.get("status") == "failed":
+                    nlp_error = nlp_analysis_result.get("message", "NLP processing failed")
+                    # Keep the result structure but note the failure
+            except Exception as nlp_e:
+                nlp_error = f"Error calling NLP analysis function: {nlp_e}"
+                print(f"Error calling NLP function: {nlp_error}")
+                nlp_analysis_result = None  # Ensure result is None on exception during call
+        else:
+            nlp_error = "No text extracted from image for NLP analysis."
+        # --- End NLP Call ---
 
+        # Construct the response
         response_data = {
             "type": "photo",
             "extracted_text": extracted_text,
-            "main_amount": main_amount
+            "main_amount_ocr": main_amount_ocr,  # Amount found by OCR regex logic
+            "nlp_analysis": nlp_analysis_result,  # Include the full NLP analysis result (or None)
+            "nlp_error": nlp_error  # Include any error from NLP call/processing
         }
+
+        # Optional: Add top-level convenience fields based on successful NLP analysis
+        if nlp_analysis_result and nlp_analysis_result.get("status") == "success":
+            if nlp_analysis_result.get("action") == "add_expense":
+                response_data['confirmed_expense_details'] = nlp_analysis_result.get('details')
+                response_data['confirmation_message'] = nlp_analysis_result.get('message')
+            elif nlp_analysis_result.get("action") == "query_expense":
+                # Include query results if applicable (depends on nlp_service structure)
+                response_data['query_message'] = nlp_analysis_result.get('message')
+                response_data['query_criteria'] = nlp_analysis_result.get('criteria')
+                response_data['query_results_count'] = nlp_analysis_result.get('results_count')
+
         return jsonify(response_data)
 
     except Exception as e:
@@ -212,6 +285,13 @@ def extract_expense():
 
         return jsonify({"error": "File processing failed"}), 500
 
+# --- NEW: Health Check Endpoint ---
+@app.route('/health', methods=['GET'])
+def health_check():
+    # You could add more checks here (e.g., if OCR workers are alive)
+    return jsonify({"status": "ok"}), 200
+
+
 # --- Run the App ---
 if __name__ == '__main__':
     # Use port 7860 as expected by Hugging Face Spaces
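To make the new tiering in find_main_amount concrete: on a receipt carrying both a subtotal and a grand total, the priority tier should win. A minimal sketch with stand-in data, assuming line_info unpacks to PaddleOCR's usual [box, (text, confidence)] shape and that the unshown parsing step lower-cases the text:

    # Stand-in for PaddleOCR output; boxes omitted, text already lower-cased.
    fake_ocr = [
        [None, ("subtotal 450.00", 0.99)],
        [None, ("service charge 45.00", 0.98)],
        [None, ("grand total 495.00", 0.99)],
    ]
    print(find_main_amount(fake_ocr))  # 495.0 - the 'grand total' line beats the subtotal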
 
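For reference, the enriched response now looks roughly like this (field names come from the code above; the values here are made up):

    example_response = {
        "type": "photo",
        "extracted_text": "starbucks\nlatte 250.00\ntotal 250.00",
        "main_amount_ocr": 250.0,                  # from find_main_amount()
        "nlp_analysis": {"action": "add_expense",  # full result from analyze_expense_text()
                         "status": "success"},
        "nlp_error": None,
        # Convenience fields, present only when the NLP action was add_expense:
        "confirmed_expense_details": {"amount": 250.0, "category": "coffee"},
        "confirmation_message": "✅ Expense added: ...",
    }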
nlp_service.py ADDED
@@ -0,0 +1,611 @@
+import re
+import datetime
+import dateparser  # Still essential for interpreting date strings
+import spacy  # Import spaCy
+from flask import Blueprint, request, jsonify
+from collections import defaultdict
+import logging
+import os  # To handle potential model loading issues
+
+# --- Setup ---
+logging.basicConfig(level=logging.INFO)
+
+# --- Load spaCy Model ---
+# Using the medium model for better accuracy and word vectors (though not used explicitly yet)
+# Handle potential errors during model loading
+try:
+    # Check if running in an environment where models might be linked differently
+    # (e.g., Google Cloud Functions sometimes needs an explicit path)
+    model_name = "en_core_web_md"
+    if not spacy.util.is_package(model_name):
+        print(f"spaCy model '{model_name}' not found as package. Attempting download...")
+        spacy.cli.download(model_name)
+
+    nlp = spacy.load(model_name)
+    logging.info(f"Successfully loaded spaCy model '{model_name}'")
+except (OSError, ImportError) as e:
+    logging.error(f"Could not load spaCy model '{model_name}'. Error: {e}")
+    logging.error("Ensure the model is downloaded: python -m spacy download en_core_web_md")
+    # Fallback or exit - for now, we'll log and potentially fail later if nlp isn't loaded
+    nlp = None  # Indicate model loading failed
+
+# --- In-Memory Data Storage (Replace with Database) ---
+expenses = []
+next_expense_id = 1
+
+# --- NLP Configuration & Helpers ---
+CURRENCY_SYMBOLS = ["₹", "$", "€", "£"]  # Expand as needed
+# More robust regex to find monetary values even if spaCy misses the MONEY entity
+FALLBACK_AMOUNT_REGEX = re.compile(r'([\$€£₹]|\b(?:rs|usd|eur|gbp))\s?([\d,]+(?:\.\d{1,2})?)\b|\b([\d,]+(?:\.\d{1,2})?)\s?([\$€£₹]|\b(?:rupees|rs|dollars|euros|pounds|usd|eur|gbp))\b', re.IGNORECASE)
+
+# Category keywords remain useful
+CATEGORY_KEYWORDS = {
+    "food": ["food", "meal", "lunch", "dinner", "snack", "restaurant", "dining", "groceries", "sandwich", "burger", "pizza"],
+    "coffee": ["coffee", "latte", "cappuccino", "espresso", "cafe", "starbucks", "ccd", "café", "mocha"],
+    "travel": ["travel", "taxi", "flight", "train", "bus", "uber", "ola", "fuel", "gas", "lyft", "cab", "ticket"],
+    "shopping": ["shop", "shopping", "clothes", "electronics", "mall", "amazon", "flipkart", "purchase", "order", "store"],
+    "groceries": ["groceries", "supermarket", "zepto", "blinkit", "bigbasket", "vegetables", "milk", "market"],
+    "utilities": ["utility", "utilities", "bill", "electricity", "water", "internet", "phone", "recharge"],
+    "entertainment": ["movie", "cinema", "concert", "game", "fun", "netflix", "spotify", "tickets"],
+    "rent": ["rent", "lease"],
+    "transport": ["transport", "metro", "auto", "rickshaw", "commute"]
+}
+
+# Keywords for intent detection (less critical now; intent is inferred more from entities)
+QUERY_KEYWORDS = ["how much", "show me", "list", "what are", "total", "summary", "spending", "history", "report", "biggest", "view"]
+ADD_EXPENSE_VERBS = ["spent", "bought", "paid", "cost", "charged", "expensed", "got", "had"]  # Verbs often associated with spending
+
+
+def parse_money_entity(text, doc):
+    """
+    Extracts the amount using spaCy MONEY entities first, then falls back to regex.
+    Returns the amount as float and the identified currency symbol/code.
+    """
+    amount = None
+    currency = None
+    text = text.replace(',', '')  # Remove commas for easier parsing
+
+    # 1. Try spaCy MONEY entities first
+    money_ents = [ent for ent in doc.ents if ent.label_ == "MONEY"]
+    if money_ents:
+        # Prioritize longer entities or ones closer to verbs like 'spent' if multiple found
+        # Simple approach: take the first one for now
+        ent_text = money_ents[0].text.replace(',', '')
+        # Try to extract number and symbol/code from the entity text
+        num_match = re.search(r'([\d\.]+)', ent_text)
+        if num_match:
+            try:
+                amount = float(num_match.group(1))
+                # Try to find a known symbol or code within the entity text
+                symbol_match = re.search(r'([\$€£₹])', ent_text)
+                if symbol_match:
+                    currency = symbol_match.group(1)
+                else:
+                    # Check for codes like USD, GBP etc. (simple check)
+                    code_match = re.search(r'\b(USD|EUR|GBP|INR|RS)\b', ent_text, re.IGNORECASE)
+                    if code_match:
+                        currency = code_match.group(1).upper()
+                        # Standardize common ones
+                        if currency == "RS": currency = "INR"
+
+                # If an amount was found but no currency symbol in the entity, check doc context
+                if amount is not None and currency is None:
+                    for token in doc:
+                        if token.text in CURRENCY_SYMBOLS:
+                            currency = token.text
+                            break
+                return amount, currency
+            except ValueError:
+                pass  # Failed to convert number
+
+    # 2. Fallback regex (if spaCy missed it or parsing failed)
+    match = FALLBACK_AMOUNT_REGEX.search(text)
+    if match:
+        try:
+            if match.group(2):  # Format: $100 or Rs 100
+                amount = float(match.group(2))
+                currency_text = match.group(1)
+            elif match.group(3):  # Format: 100 dollars or 100 Rs
+                amount = float(match.group(3))
+                currency_text = match.group(4)
+            else:  # Should not happen with this regex, but safety first
+                return None, None
+
+            # Normalize currency symbol/code
+            if currency_text in CURRENCY_SYMBOLS:
+                currency = currency_text
+            else:
+                currency_text = currency_text.lower()
+                if currency_text in ["rs", "rupees"]: currency = "₹"  # Or INR
+                elif currency_text in ["dollars", "usd"]: currency = "$"  # Or USD
+                elif currency_text in ["pounds", "gbp"]: currency = "£"  # Or GBP
+                elif currency_text in ["euros", "eur"]: currency = "€"  # Or EUR
+
+            return amount, currency
+
+        except (ValueError, IndexError):
+            logging.warning(f"Regex fallback failed to parse amount from: {text}")
+            return None, None
+
+    return None, None  # No amount found
+
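The fallback pattern can be sanity-checked without the spaCy model; a minimal sketch (test strings are illustrative):

    import re

    pattern = re.compile(  # same pattern as FALLBACK_AMOUNT_REGEX above
        r'([\$€£₹]|\b(?:rs|usd|eur|gbp))\s?([\d,]+(?:\.\d{1,2})?)\b'
        r'|\b([\d,]+(?:\.\d{1,2})?)\s?([\$€£₹]|\b(?:rupees|rs|dollars|euros|pounds|usd|eur|gbp))\b',
        re.IGNORECASE)

    for text in ["paid $12.50 for lunch", "spent 450 rupees at Zepto"]:
        m = pattern.search(text.replace(',', ''))
        amount = float(m.group(2) or m.group(3))  # prefix form vs. suffix form
        currency = m.group(1) or m.group(4)
        print(amount, currency)  # 12.5 $ / 450.0 rupees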
+def parse_date_entities(doc):
+    """
+    Uses dateparser to interpret spaCy DATE entities.
+    Returns the *most likely* date found, defaulting to today.
+    """
+    dates = []
+    # Settings for dateparser: prefer past dates for expenses
+    settings = {'PREFER_DATES_FROM': 'past', 'RELATIVE_BASE': datetime.datetime.now()}
+
+    date_ents = [ent.text for ent in doc.ents if ent.label_ == "DATE"]
+    logging.debug(f"Found DATE entities: {date_ents}")
+
+    if date_ents:
+        for date_str in date_ents:
+            # Sometimes spaCy includes words like "on" or "last" in the entity; dateparser handles this
+            parsed = dateparser.parse(date_str, settings=settings)
+            if parsed:
+                dates.append(parsed.date())
+
+    if dates:
+        # Heuristic: if multiple dates were found, prefer the latest valid past date
+        # (the most recent expense)
+        past_dates = [d for d in dates if d <= datetime.date.today()]
+        if past_dates:
+            return max(past_dates)  # Return the most recent valid date
+        elif dates:
+            return min(dates)  # If only future dates were found, return the earliest (less likely for an expense)
+
+    # Fallback if no DATE entity was found or parsed
+    logging.debug("No valid DATE entity found or parsed, defaulting to today.")
+    return datetime.date.today()
+
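The PREFER_DATES_FROM setting is what makes relative phrases resolve backwards in time; for example (output depends on the current date):

    import datetime
    import dateparser

    settings = {'PREFER_DATES_FROM': 'past', 'RELATIVE_BASE': datetime.datetime.now()}
    for phrase in ["yesterday", "last friday", "march 3"]:
        parsed = dateparser.parse(phrase, settings=settings)
        print(phrase, "->", parsed.date() if parsed else None)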
+def identify_merchant_and_category(doc):
+    """
+    Identifies the merchant using ORG/PERSON/GPE entities and context.
+    Identifies the category using keywords and context around the amount/merchant.
+    """
+    merchant = None
+    category = "Uncategorized"  # Default
+
+    money_token_indices = [token.i for token in doc if token.like_num or token.text in CURRENCY_SYMBOLS or any(sym in token.text for sym in CURRENCY_SYMBOLS) or (token.ent_type_ == "MONEY")]
+
+    potential_merchants = []
+    for ent in doc.ents:
+        if ent.label_ in ["ORG", "PERSON", "GPE", "FAC"]:  # Facility might also be relevant
+            # Check context: is it preceded by "at", "from", "in"? Is it near the money amount?
+            prepositions = {"at", "from", "in", "on", "with"}
+            # Check the token before the entity start
+            if ent.start > 0 and doc[ent.start - 1].lower_ in prepositions:
+                potential_merchants.append(ent.text)
+                continue
+            # Check dependency relation (e.g., object of preposition)
+            if ent.root.head.lemma_ in prepositions:
+                potential_merchants.append(ent.text)
+                continue
+            # Check proximity to the money amount if indices are available
+            if money_token_indices:
+                min_dist = min(abs(ent.start - idx) for idx in money_token_indices)
+                if min_dist < 5:  # Arbitrary proximity threshold
+                    potential_merchants.append(ent.text)
+                    continue
+
+    if potential_merchants:
+        # Simple heuristic: choose the first likely one. Could be refined.
+        # Filter out very common words or locations if needed (e.g., "City", "Bank" if too generic)
+        merchant = potential_merchants[0].strip()
+        logging.debug(f"Identified potential merchant: {merchant} from entities {potential_merchants}")
+
+    # --- Category Identification ---
+    text_lower = doc.text.lower()
+
+    # 1. Check explicit category keywords
+    found_category = None
+    matched_keywords = []
+    for cat, keywords in CATEGORY_KEYWORDS.items():
+        if any(keyword in text_lower for keyword in keywords):
+            # If multiple categories match, prioritize based on merchant or context?
+            # Simple approach: store all matches for now
+            matched_keywords.append(cat)
+
+    if len(matched_keywords) == 1:
+        found_category = matched_keywords[0]
+    elif len(matched_keywords) > 1:
+        # Ambiguity - requires smarter logic. E.g., "Coffee at Food court" -> Coffee or Food?
+        # Prioritize based on the merchant if known, e.g. merchant Starbucks -> Coffee
+        if merchant:
+            merchant_lower = merchant.lower()
+            if "starbucks" in merchant_lower or "ccd" in merchant_lower or "café" in merchant_lower:
+                if "coffee" in matched_keywords: found_category = "coffee"
+            elif "amazon" in merchant_lower or "flipkart" in merchant_lower:
+                if "shopping" in matched_keywords: found_category = "shopping"
+            elif "zepto" in merchant_lower or "blinkit" in merchant_lower or "groceries" in merchant_lower:
+                if "groceries" in matched_keywords: found_category = "groceries"
+                elif "food" in matched_keywords: found_category = "groceries"  # Prefer specific
+
+        # If still ambiguous, pick the most specific one (e.g., prefer 'coffee' over 'food')
+        if not found_category:
+            if "coffee" in matched_keywords: found_category = "coffee"
+            elif "groceries" in matched_keywords: found_category = "groceries"
+            elif "transport" in matched_keywords: found_category = "transport"
+            # Add more specific priorities if needed
+            elif "food" in matched_keywords: found_category = "food"  # More general last
+            else: found_category = matched_keywords[0]  # Default to first match if no rules apply
+
+    if found_category:
+        category = found_category
+    # 2. (Optional/Advanced) Infer from the merchant if the category is Uncategorized
+    elif merchant and category == "Uncategorized":
+        merchant_lower = merchant.lower()
+        if "starbucks" in merchant_lower or "ccd" in merchant_lower or "café" in merchant_lower: category = "coffee"
+        elif "amazon" in merchant_lower or "flipkart" in merchant_lower: category = "shopping"
+        elif "zepto" in merchant_lower or "blinkit" in merchant_lower: category = "groceries"
+        elif "uber" in merchant_lower or "ola" in merchant_lower: category = "travel"
+        elif "netflix" in merchant_lower or "spotify" in merchant_lower: category = "entertainment"
+        # Add more merchant->category mappings
+
+    # 3. (Optional/Advanced) Use dependency parsing or word vectors,
+    # e.g. look for nouns that are objects of spending verbs near the amount.
+    # This requires more complex linguistic analysis.
+
+    logging.debug(f"Identified Category: {category}")
+    return merchant, category
+
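Assuming the model loaded, a quick check of these heuristics (entity labels, and therefore the merchant pick, can vary across model versions):

    doc = nlp("Spent ₹300 on coffee at Starbucks yesterday")
    merchant, category = identify_merchant_and_category(doc)
    print(merchant, category)  # expected: Starbucks coffee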
+def determine_intent(doc):
+    """Determines intent: 'add_expense', 'query_expense', or 'unknown'."""
+    text_lower = doc.text.lower()
+
+    has_query_keyword = any(keyword in text_lower for keyword in QUERY_KEYWORDS)
+    has_add_verb = any(token.lemma_ in ADD_EXPENSE_VERBS for token in doc if token.pos_ == "VERB")
+    has_money_entity = any(ent.label_ == "MONEY" for ent in doc.ents) or FALLBACK_AMOUNT_REGEX.search(text_lower) is not None
+
+    # More explicit questions are likely queries
+    if doc[0].pos_ == "AUX" or doc[0].lemma_ in ["what", "how", "show", "list", "view"]:  # Starts like a question
+        return "query_expense"
+
+    if has_query_keyword:
+        return "query_expense"
+
+    # If it has a spending verb and a money amount, it's likely adding an expense
+    if has_add_verb and has_money_entity:
+        return "add_expense"
+
+    # If it just has a money amount and maybe a date/merchant, it could be adding an expense (implicit verb)
+    if has_money_entity and not has_query_keyword:
+        # Check if there are nouns suggesting items bought
+        has_object_noun = any(tok.pos_ == "NOUN" and tok.dep_ in ["dobj", "pobj", "attr"] for tok in doc)
+        if has_object_noun or any(ent.label_ in ["ORG", "PRODUCT"] for ent in doc.ents):
+            return "add_expense"
+
+    # Otherwise the structure is unclear
+    return "unknown"
+
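A couple of illustrative intent checks (again requires the loaded model; results depend on the tagger):

    for msg in ["Spent $20 on lunch", "Show my food expenses last week"]:
        print(msg, "->", determine_intent(nlp(msg)))
    # expected: add_expense, query_expense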
+# --- Filtering and Formatting (largely reused, minor adjustments) ---
+
+def filter_expenses(criteria):
+    """Filters the global 'expenses' list based on criteria."""
+    # (This function remains largely the same as the previous version)
+    filtered = expenses
+
+    # Filter by Category
+    if 'category' in criteria and criteria['category'] is not None:
+        target_cat = criteria['category'].lower()
+        # Handle a general 'food' query as including 'coffee', 'groceries', etc.
+        food_related_cats = {'food', 'coffee', 'groceries', 'restaurant'}  # Define food-related categories
+        if target_cat == 'food':
+            filtered = [e for e in filtered if e['category'].lower() in food_related_cats]
+        else:
+            filtered = [e for e in filtered if e['category'].lower() == target_cat]
+
+    # Filter by Date Range (start_date and end_date are inclusive)
+    if 'start_date' in criteria and criteria['start_date'] is not None:
+        filtered = [e for e in filtered if e['date'] >= criteria['start_date']]
+    if 'end_date' in criteria and criteria['end_date'] is not None:
+        filtered = [e for e in filtered if e['date'] <= criteria['end_date']]
+
+    # Filter by Merchant (case-insensitive substring match)
+    if 'merchant' in criteria and criteria['merchant'] is not None:
+        target_merchant = criteria['merchant'].lower()
+        filtered = [e for e in filtered if e['merchant'] and target_merchant in e['merchant'].lower()]
+
+    return filtered
+
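All criteria keys are optional and None values are skipped; a hypothetical criteria dict (dates and merchant made up for illustration) might look like:

    import datetime

    criteria = {
        "category": "food",                       # 'food' also matches coffee/groceries/restaurant
        "start_date": datetime.date(2024, 1, 1),  # inclusive
        "end_date": datetime.date(2024, 1, 31),   # inclusive
        "merchant": "starbucks",                  # case-insensitive substring match
    }
    matching = filter_expenses(criteria)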
+def parse_date_range_from_query(doc):
+    """Parses date ranges specifically for queries (e.g., 'this month', 'last week')."""
+    # (This function remains largely the same, using dateparser on DATE entities or keywords)
+    today = datetime.date.today()
+    text_lower = doc.text.lower()  # Use the full text for keywords like "this month"
+    start_date, end_date = None, None
+
+    # Prioritize DATE entities found by spaCy
+    date_ents_text = [ent.text for ent in doc.ents if ent.label_ == "DATE"]
+    parsed_dates = []
+    settings = {'PREFER_DATES_FROM': 'past', 'RELATIVE_BASE': datetime.datetime.now()}
+
+    for date_str in date_ents_text:
+        # dateparser has an experimental range feature; for simplicity, parse single
+        # points here and let the keyword logic below handle ranges
+        parsed = dateparser.parse(date_str, settings=settings)
+        if parsed:
+            parsed_dates.append(parsed.date())
+
+    # If spaCy found specific dates, use them
+    if len(parsed_dates) == 1:
+        start_date = end_date = parsed_dates[0]
+    elif len(parsed_dates) > 1:
+        # Ambiguous; take the min/max and rely on the keywords below if needed
+        start_date = min(parsed_dates)
+        end_date = max(parsed_dates)
+        if start_date > end_date:  # Swap if the order is wrong
+            start_date, end_date = end_date, start_date
+
+    # If no specific date entities, check for range keywords
+    if start_date is None and end_date is None:
+        if "today" in text_lower:
+            start_date = end_date = today
+        elif "yesterday" in text_lower:
+            start_date = end_date = today - datetime.timedelta(days=1)
+        elif "this week" in text_lower:
+            start_of_week = today - datetime.timedelta(days=today.weekday())  # Monday
+            end_of_week = start_of_week + datetime.timedelta(days=6)  # Sunday
+            start_date = start_of_week
+            end_date = end_of_week
+        elif "last week" in text_lower:
+            end_of_last_week = today - datetime.timedelta(days=today.weekday() + 1)  # Last Sunday
+            start_of_last_week = end_of_last_week - datetime.timedelta(days=6)  # Last Monday
+            start_date = start_of_last_week
+            end_date = end_of_last_week
+        elif "this month" in text_lower:
+            start_date = today.replace(day=1)
+            next_month = today.replace(day=28) + datetime.timedelta(days=4)
+            last_day_of_month = next_month - datetime.timedelta(days=next_month.day)
+            end_date = last_day_of_month
+        elif "last month" in text_lower:
+            first_day_of_current_month = today.replace(day=1)
+            last_day_of_last_month = first_day_of_current_month - datetime.timedelta(days=1)
+            first_day_of_last_month = last_day_of_last_month.replace(day=1)
+            start_date = first_day_of_last_month
+            end_date = last_day_of_last_month
+        elif "year" in text_lower:  # e.g., "this year", "last year"
+            if "this year" in text_lower:
+                start_date = datetime.date(today.year, 1, 1)
+                end_date = datetime.date(today.year, 12, 31)
+            elif "last year" in text_lower:
+                start_date = datetime.date(today.year - 1, 1, 1)
+                end_date = datetime.date(today.year - 1, 12, 31)
+            # Check for a specific year like "in 2023"
+            year_match = re.search(r'\b(in|for)\s+(\d{4})\b', text_lower)
+            if year_match:
+                year = int(year_match.group(2))
+                start_date = datetime.date(year, 1, 1)
+                end_date = datetime.date(year, 12, 31)
+        else:
+            # Specific month parsing ("in January"), similar to the previous version
+            month_match = re.search(r'\b(in|for)\s+(january|february|march|april|may|june|july|august|september|october|november|december)\b', text_lower)
+            if month_match:
+                month_name = month_match.group(2)
+                year_context = today.year  # Assume the current year
+                # Check if a year was mentioned nearby
+                year_ent = [e.text for e in doc.ents if e.label_ == "DATE" and e.text.isdigit() and len(e.text) == 4]
+                if year_ent:
+                    year_context = int(year_ent[0])
+                try:
+                    month_num = [datetime.date(2000, i, 1).strftime('%B').lower() for i in range(1, 13)].index(month_name) + 1
+                    start_date = datetime.date(year_context, month_num, 1)
+                    next_m = (start_date.replace(day=28) + datetime.timedelta(days=4))
+                    end_date = next_m - datetime.timedelta(days=next_m.day)
+                except (ValueError, IndexError):
+                    pass  # Ignore invalid month/year
+
+    logging.debug(f"Parsed date range for query: {start_date} to {end_date}")
+    return start_date, end_date
+
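The day=28 trick used above is worth a worked example: adding four days to the 28th always lands in the next month, and subtracting that date's day-of-month walks back to the last day of the original month:

    import datetime

    today = datetime.date(2024, 3, 15)  # illustrative date
    next_month = today.replace(day=28) + datetime.timedelta(days=4)  # 2024-04-01
    last_day = next_month - datetime.timedelta(days=next_month.day)  # 2024-03-31
    print(today.replace(day=1), last_day)  # month range: 2024-03-01 2024-03-31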
+def format_expense_list(expense_list, title="Here are the expenses:"):
+    """Formats a list of expenses into a user-friendly string."""
+    # (This function remains largely the same)
+    if not expense_list:
+        return "No expenses found matching your criteria."
+
+    total_amount = sum(e['amount'] for e in expense_list)
+    # Use the first expense's currency symbol as the default, falling back to '₹'
+    currency_symbol = expense_list[0].get("currency") or "₹"
+
+    response_lines = [title]
+    expense_list.sort(key=lambda x: x['date'], reverse=True)
+
+    for expense in expense_list:
+        cur = expense.get("currency") or currency_symbol  # Use the expense-specific symbol or the default
+        amount_str = f"{cur}{expense['amount']:.2f}"
+        merchant_part = f" at {expense['merchant']}" if expense['merchant'] else ""
+        category_part = f" ({expense['category']})" if expense['category'] != 'Uncategorized' else ""
+        date_str = expense['date'].strftime("%b %d, %Y")
+        response_lines.append(f"- {amount_str}{category_part}{merchant_part} - {date_str}")
+
+    if len(expense_list) > 1:
+        total_str = f"{currency_symbol}{total_amount:.2f}"
+        response_lines.append(f"Total: {total_str}")
+
+    return "\n".join(response_lines)
+
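Sample output for a single illustrative expense record:

    import datetime

    sample = [{"amount": 250.0, "currency": "₹", "category": "coffee",
               "merchant": "Starbucks", "date": datetime.date(2024, 3, 14)}]
    print(format_expense_list(sample, "Recent expenses:"))
    # Recent expenses:
    # - ₹250.00 (coffee) at Starbucks - Mar 14, 2024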
+# --- NEW: Core NLP Processing Function ---
+def analyze_expense_text(text):
+    """
+    Analyzes text to extract expense details or understand queries using spaCy.
+    Returns a dictionary with action, status, and extracted details/message.
+    """
+    global next_expense_id  # Allow modification of the global counter
+
+    if nlp is None:
+        logging.error("spaCy model not loaded. Cannot process text.")
+        return {"action": "error", "status": "failed", "message": "NLP model not available"}
+
+    logging.info(f"Analyzing text: {text[:100]}...")  # Log snippet
+    doc = nlp(text)
+    logging.debug(f"spaCy Entities: {[(ent.text, ent.label_) for ent in doc.ents]}")
+
+    intent = determine_intent(doc)
+    logging.info(f"Determined Intent: {intent}")
+    response_data = {}
+
+    if intent == "add_expense":
+        amount, currency = parse_money_entity(text, doc)
+        expense_date = parse_date_entities(doc)
+        merchant, category = identify_merchant_and_category(doc)
+
+        if amount is not None:
+            currency_symbol = currency or "₹"  # Default currency
+            new_expense = {
+                "id": next_expense_id,
+                "amount": amount,
+                "currency": currency_symbol,
+                "category": category,
+                "merchant": merchant,
+                "date": expense_date,  # Keep as date object internally
+                "original_message": text
+            }
+            expenses.append(new_expense)
+            next_expense_id += 1
+            logging.info(f"Added expense (in-memory): {new_expense}")
+
+            merchant_part = f" at {merchant}" if merchant else ""
+            date_str = expense_date.strftime('%b %d, %Y')
+            confirmation_msg = f"✅ Expense added: {currency_symbol}{amount:.2f} for {category}{merchant_part} on {date_str}."
+
+            new_expense_serializable = new_expense.copy()
+            new_expense_serializable["date"] = new_expense["date"].isoformat()
+
+            response_data = {
+                "action": "add_expense",
+                "status": "success",
+                "message": confirmation_msg,
+                "details": new_expense_serializable
+            }
+        else:
+            logging.warning(f"Could not extract amount reliably from: {text}")
+            response_data = {
+                "action": "add_expense",
+                "status": "failed",
+                "message": "Sorry, I couldn't understand the amount. Please include it clearly (e.g., '₹500', '$20', '15 pounds')."
+            }
+
+    elif intent == "query_expense":
+        logging.info("Processing query intent.")
+        query_criteria = {}
+        _q_merchant, q_category = identify_merchant_and_category(doc)
+
+        # ... (rest of query criteria extraction logic remains the same) ...
+        query_cat_found = None
+        text_lower = doc.text.lower()
+        for cat, keywords in CATEGORY_KEYWORDS.items():
+            if any(keyword in text_lower for keyword in keywords):
+                if cat == 'food' or q_category == 'food':
+                    query_cat_found = 'food'
+                    break
+                query_cat_found = q_category if q_category != 'Uncategorized' else cat
+                break
+
+        query_criteria['category'] = query_cat_found
+        query_criteria['merchant'] = _q_merchant
+        start_date, end_date = parse_date_range_from_query(doc)
+        query_criteria['start_date'] = start_date
+        query_criteria['end_date'] = end_date
+
+        logging.info(f"Query Criteria: {query_criteria}")
+        results = filter_expenses(query_criteria)
+        response_message = ""
+
+        # ... (rest of query response formatting logic remains the same) ...
+        if results and ("total" in text_lower or "sum" in text_lower or "how much" in text_lower):
+            total_amount = sum(e['amount'] for e in results)
+            currency_symbol = results[0].get("currency") or "₹"
+            category_filter_text = f" on {query_criteria['category']}" if query_criteria['category'] else ""
+            date_filter_text = ""
+            if start_date and end_date and start_date == end_date: date_filter_text = f" for {start_date.strftime('%b %d, %Y')}"
+            elif start_date and end_date: date_filter_text = f" from {start_date.strftime('%b %d')} to {end_date.strftime('%b %d, %Y')}"
+            elif start_date: date_filter_text = f" since {start_date.strftime('%b %d, %Y')}"
+            elif end_date: date_filter_text = f" until {end_date.strftime('%b %d, %Y')}"
+            response_message = f"Your total spending{category_filter_text}{date_filter_text} is {currency_symbol}{total_amount:.2f}."
+            if len(results) <= 10:
+                response_message += "\n" + format_expense_list(results, "Details:")
+            else:
+                response_message += f" (from {len(results)} transactions)"
+        elif results and ("biggest" in text_lower or "largest" in text_lower or "top" in text_lower):
+            top_n = 3
+            top_expenses = sorted(results, key=lambda x: x['amount'], reverse=True)[:top_n]
+            response_message = format_expense_list(top_expenses, f"Your top {len(top_expenses)} expenses:")
+        else:
+            date_filter_desc = ""
+            if start_date and end_date and start_date == end_date: date_filter_desc = f" from {start_date.strftime('%b %d, %Y')}"
+            elif start_date or end_date: date_filter_desc = " matching the date criteria"
+            category_filter_desc = f" for {query_criteria['category']}" if query_criteria['category'] else ""
+            merchant_filter_desc = f" at {query_criteria['merchant']}" if query_criteria['merchant'] else ""
+            title = f"Expenses{category_filter_desc}{merchant_filter_desc}{date_filter_desc}:"
+            response_message = format_expense_list(results, title)
+
+        response_data = {
+            "action": "query_expense",
+            "status": "success",
+            "message": response_message,
+            "criteria": {k: v.isoformat() if isinstance(v, datetime.date) else v for k, v in query_criteria.items() if v is not None},
+            "results_count": len(results)
+        }
+
+    else:  # intent == "unknown"
+        logging.info(f"Could not determine intent for: {text}")
+        response_data = {
+            "action": "unknown",
+            "status": "failed",
+            "message": "Sorry, I couldn't quite understand that. Please try phrasing your expense or query differently. \nExamples:\n- 'Spent ₹50 on coffee yesterday at Starbucks'\n- 'Show my food expenses last week'\n- 'What was my total spending last month?'"
+        }
+
+    logging.info(f"Analysis complete. Action: {response_data.get('action')}, Status: {response_data.get('status')}")
+    return response_data
+
+
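Direct usage, mirroring the examples in the fallback message above (requires the loaded model):

    result = analyze_expense_text("Spent ₹50 on coffee yesterday at Starbucks")
    print(result["action"], result["status"])  # add_expense success (when parsing works)
    print(result["message"])                   # "✅ Expense added: ₹50.00 for coffee at Starbucks on ..."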
+# --- Flask Blueprint Setup (Optional: keep if direct API access is needed) ---
+nlp_bp = Blueprint('nlp_service', __name__)
+
+@nlp_bp.route('/process_nlp', methods=['POST'])
+def process_nlp_expense_route():
+    """Flask route handler that calls the core analysis function."""
+    data = request.get_json()
+    if not data or 'message' not in data:
+        logging.warning("Received request without 'message' field.")
+        return jsonify({"error": "Missing 'message' in request body"}), 400
+
+    user_message = data['message']
+    result = analyze_expense_text(user_message)  # Call the core function
+
+    # Determine the status code based on the result
+    status_code = 200
+    if result.get("status") == "failed":
+        status_code = 400  # Or 500 if it's an internal NLP model error
+        if result.get("message") == "NLP model not available":
+            status_code = 500
+
+    return jsonify(result), status_code
+
+
+# --- Example Usage / Testing Setup ---
+if __name__ == '__main__':
+    from flask import Flask
+
+    app = Flask(__name__)
+    app.register_blueprint(nlp_bp)  # Register the blueprint
+
+    # Dummy data removed
+
+    print("Starting Flask server for testing NLP service...")
+    # print("Registered expenses:", expenses)  # Can be long
+    if nlp is None:
+        print("WARNING: spaCy model failed to load. /process_nlp endpoint will return errors.")
+    app.run(debug=True, host='0.0.0.0', port=5001)
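A hedged client sketch for the optional blueprint route (note that `requests` was dropped from requirements.txt, so install it separately for this kind of test):

    import requests

    resp = requests.post("http://localhost:5001/process_nlp",
                         json={"message": "Spent ₹50 on coffee yesterday"})
    print(resp.status_code, resp.json())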
requirements.txt CHANGED
@@ -1,5 +1,9 @@
 Pillow
 flask
-requests
+# requests  # Removed as NLP is called directly now
 paddlepaddle
-paddleocr
+paddleocr
+spacy>=3.0.0  # Added spaCy
+dateparser>=1.0.0  # Added dateparser
+# Note: spaCy model 'en_core_web_md' needs to be downloaded separately:
+# python -m spacy download en_core_web_md
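Because the model isn't a pip requirement, one way to make it available at startup is the same load-or-download pattern nlp_service.py already uses:

    import spacy

    if not spacy.util.is_package("en_core_web_md"):
        spacy.cli.download("en_core_web_md")
    nlp = spacy.load("en_core_web_md")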