MonilM committed on
Commit 07b50c0 · Parent(s): 6e4ec8a

Improved NLP Logic
Files changed (7):
  1. app.py +9 -5
  2. config.py +27 -0
  3. handler.py +105 -0
  4. model_setup.py +5 -0
  5. nlp_service.py +107 -801
  6. requirements.txt +6 -1
  7. utils.py +141 -0
app.py CHANGED
@@ -12,7 +12,7 @@ from paddleocr import PaddleOCR
 from PIL import Image
 
 # --- NEW: Import the NLP analysis function ---
-from nlp_service import analyze_expense_text # Import the core analysis function
+from nlp_service import analyze_text # Corrected import
 
 # --- Configuration ---
 LANG = 'en' # Default language, can be overridden if needed
@@ -292,13 +292,17 @@ def process_message():
     nlp_error = None
     try:
         # Call the imported analysis function
-        nlp_analysis_result = analyze_expense_text(text_message)
+        nlp_analysis_result = analyze_text(text_message) # Corrected function call
         print(f"NLP Service Analysis Result: {nlp_analysis_result}")
-        # Check if the NLP analysis itself reported an error/failure
-        if nlp_analysis_result.get("status") == "failed":
+        # Check if the NLP analysis itself reported an error/failure or requires fallback
+        status = nlp_analysis_result.get("status")
+        if status == "failed":
             nlp_error = nlp_analysis_result.get("message", "NLP processing failed")
             # Return the failure result from NLP service
-            return jsonify(nlp_analysis_result), 400 # Or 200 with error status? Let's use 200 for now.
+            return jsonify(nlp_analysis_result), 400 # Use 400 for client-side errors like empty text
+        elif status == "fallback_required":
+            # Return the fallback result (e.g., for queries)
+            return jsonify(nlp_analysis_result), 200 # Return 200, but indicate fallback needed
 
         # Return the successful analysis result
         return jsonify(nlp_analysis_result)
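
Reviewer note: the route now distinguishes three status values coming back from analyze_text. A summary of the mapping implied by the branches above (annotation only, not part of the commit):

    # status value from analyze_text -> HTTP code returned by process_message()
    STATUS_TO_HTTP = {
        "failed": 400,             # e.g. empty input text or classifier error
        "fallback_required": 200,  # body tells the caller to take the fallback path
        "success": 200,
    }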
config.py ADDED
@@ -0,0 +1,27 @@
+import re
+
+# --- NLP Configuration ---
+CURRENCY_SYMBOLS = ["₹", "$", "€", "£"] # Expand as needed
+
+# More robust regex to find monetary values even if spaCy misses MONEY entity
+# Added a group to capture standalone numbers potentially without currency symbols nearby
+FALLBACK_AMOUNT_REGEX = re.compile(r'([\$€£₹]|\b(?:rs|usd|eur|gbp))\s?([\d,]+(?:\.\d{1,2})?)\b|\b([\d,]+(?:\.\d{1,2})?)\s?([\$€£₹]|\b(?:rupees|rs|dollars|euros|pounds|usd|eur|gbp))\b|\b([\d,]+(?:\.\d{1,2})?)\b', re.IGNORECASE)
+
+# Consolidated Category Keywords
+CATEGORY_KEYWORDS = {
+    "Coffee": ["coffee", "latte", "cappuccino", "starbucks", "cafe", "café", "espresso", "mocha", "ccd"],
+    "Food": ["food", "meal", "lunch", "dinner", "snack", "restaurant", "dining", "sandwich", "burger", "pizza"],
+    "Groceries": ["groceries", "supermarket", "vegetables", "milk", "market", "zepto", "blinkit", "bigbasket"],
+    "Entertainment": ["movie", "cinema", "concert", "game", "netflix", "spotify", "tickets", "fun"],
+    "Transport": ["travel", "taxi", "flight", "train", "bus", "uber", "ola", "fuel", "gas", "lyft", "cab", "ticket", "metro", "auto", "rickshaw", "commute"], # Combined Travel/Transport
+    "Shopping": ["shop", "shopping", "clothes", "electronics", "mall", "amazon", "flipkart", "purchase", "order", "store"],
+    "Utilities": ["utility", "utilities", "bill", "electricity", "water", "internet", "phone", "recharge"],
+    "Rent": ["rent", "lease"],
+    "Income": ["salary", "received", "credited", "deposit", "income"], # Added income keyword
+    "Investment": ["invest", "stock", "shares", "mutual fund", "sip", "investment"], # Added investment keyword
+    # "Misc" can be the default if no keywords match
+}
+
+# Keywords for intent detection (less critical if using zero-shot, but can be helpers)
+QUERY_KEYWORDS = ["how much", "show me", "list", "what are", "total", "summary", "spending", "history", "report", "biggest", "view"]
+ADD_EXPENSE_VERBS = ["spent", "bought", "paid", "cost", "charged", "expensed", "got", "had"] # Verbs often associated with spending
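
Reviewer note: a quick sanity check of the capture-group layout in FALLBACK_AMOUNT_REGEX (a sketch, assuming config.py is importable; groups 1/2 fire for a leading symbol, 3/4 for a trailing code/word, 5 for a standalone number):

    from config import FALLBACK_AMOUNT_REGEX

    for text in ["spent $100 on shoes", "coffee for 250 rs", "paid 250"]:
        for m in FALLBACK_AMOUNT_REGEX.finditer(text):
            print(m.groups())
    # "$100"   -> ('$', '100', None, None, None)
    # "250 rs" -> (None, None, '250', 'rs', None)
    # "250"    -> (None, None, None, None, '250')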
handler.py ADDED
@@ -0,0 +1,105 @@
+import json
+# Remove direct model/util imports if calling analyze_text
+# from model_setup import zero_shot, ner
+# from utils import parse_entities
+# from config import CATEGORY_KEYWORDS
+
+# Import the centralized analysis function
+from nlp_service import analyze_text
+
+def lambda_handler(event, context):
+    # ... (Keep body parsing logic) ...
+    body_str = event.get("body", "{}")
+    try:
+        body = json.loads(body_str)
+    except json.JSONDecodeError:
+        print(f"Error decoding JSON body: {body_str}")
+        return {
+            "statusCode": 400,
+            "body": json.dumps({"error": "Invalid JSON in request body"})
+        }
+
+    text = body.get("text", "")
+    if not text:
+        return {
+            "statusCode": 400,
+            "body": json.dumps({"error": "Missing 'text' field in request body"})
+        }
+
+    print(f"Processing text via nlp_service: {text}") # Log input
+
+    # Call the centralized NLP service function
+    try:
+        analysis_result = analyze_text(text)
+        status = analysis_result.get("status")
+
+        if status == "failed":
+            print(f"NLP analysis failed: {analysis_result.get('message')}")
+            # Return 400 for input errors, 500 for internal NLP errors?
+            # Let's return 400 if it's a known failure from analyze_text
+            return {
+                "statusCode": 400,
+                "body": json.dumps(analysis_result)
+            }
+        elif status == "fallback_required":
+            print(f"NLP analysis requires fallback: {analysis_result.get('message')}")
+            # Return 200 but indicate fallback needed
+            return {
+                "statusCode": 200,
+                "body": json.dumps(analysis_result)
+            }
+        elif status == "success":
+            print(f"NLP analysis successful: {analysis_result}")
+            # Return the successful analysis result
+            return {
+                "statusCode": 200,
+                "body": json.dumps(analysis_result) # Already contains status
+            }
+        else:
+            # Should not happen if analyze_text always returns a status
+            print(f"Error: Unknown status from analyze_text: {status}")
+            return {
+                "statusCode": 500,
+                "body": json.dumps({"error": "Internal server error: Unexpected NLP response"})
+            }
+
+    except Exception as e:
+        print(f"Error calling analyze_text from handler: {e}")
+        import traceback
+        traceback.print_exc()
+        return {
+            "statusCode": 500,
+            "body": json.dumps({"error": "Internal server error during NLP processing", "details": str(e)})
+        }
+
+# Example event structure (for local testing if needed)
+if __name__ == '__main__':
+    # ... (Keep example test cases, they should still work) ...
+    example_event = {
+        "body": json.dumps({
+            "text": "spent 5 eur on coffee"
+        })
+    }
+    context = {}
+    response = lambda_handler(example_event, context)
+    print("\n--- Lambda Response ---")
+    # The body is already a JSON string containing the result from analyze_text
+    print(json.dumps(json.loads(response['body']), indent=2))
+
+    example_event_query = {
+        "body": json.dumps({
+            "text": "how much did I spend last month"
+        })
+    }
+    response_query = lambda_handler(example_event_query, context)
+    print("\n--- Lambda Response (Query) ---")
+    print(json.dumps(json.loads(response_query['body']), indent=2))
+
+    example_event_income = {
+        "body": json.dumps({
+            "text": "salary credited 50000"
+        })
+    }
+    response_income = lambda_handler(example_event_income, context)
+    print("\n--- Lambda Response (Income) ---")
+    print(json.dumps(json.loads(response_income['body']), indent=2))
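
Reviewer note: a minimal local invocation mirroring the __main__ block above (assumes the transformers models in model_setup.py can be loaded in the local environment):

    import json
    from handler import lambda_handler

    resp = lambda_handler({"body": json.dumps({"text": "paid 20 usd for lunch"})}, None)
    print(resp["statusCode"])                  # 200 on success
    print(json.loads(resp["body"])["status"])  # "success", "failed", or "fallback_required"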
model_setup.py ADDED
@@ -0,0 +1,5 @@
+from transformers import pipeline
+
+# Load once and reuse
+zero_shot = pipeline("zero-shot-classification", model="joeddav/xlm-roberta-large-xnli")
+ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
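
Reviewer note: both pipelines download weights on first use and are constructed once at import time, which keeps repeat calls fast at the cost of a slower cold start. A small smoke test (actual labels/entities depend on the models):

    from model_setup import zero_shot, ner

    print(zero_shot("spent 5 eur on coffee",
                    candidate_labels=["expense", "query", "income"])["labels"][0])
    print(ner("paid 20 usd at Starbucks"))  # list of {'entity_group', 'word', 'score', ...} dicts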
nlp_service.py CHANGED
@@ -1,814 +1,120 @@
-import re
-import datetime
-import dateparser # Still essential for interpreting date strings
-import spacy # Import spaCy
-from flask import Blueprint, request, jsonify
-from collections import defaultdict
-import logging
-import os # To handle potential model loading issues
-import requests # Add requests for API calls
-import json # For handling JSON data
-import os # Already imported, needed for API key
-
-# --- Setup ---
-logging.basicConfig(level=logging.INFO)
-
-# --- Load spaCy Model ---
-# Using medium model for better accuracy and word vectors (though not used explicitly yet)
-# Handle potential errors during model loading
-try:
-    # Check if running in an environment where models might be linked differently
-    # (e.g., Google Cloud Functions sometimes needs explicit path)
-    model_name = "en_core_web_md"
-    if not spacy.util.is_package(model_name):
-        print(f"spaCy model '{model_name}' not found as package. Attempting download...")
-        spacy.cli.download(model_name)
-
-    nlp = spacy.load(model_name)
-    logging.info(f"Successfully loaded spaCy model '{model_name}'")
-except (OSError, ImportError) as e:
-    logging.error(f"Could not load spaCy model '{model_name}'. Error: {e}")
-    logging.error("Ensure the model is downloaded: python -m spacy download en_core_web_md")
-    # Fallback or exit - for now, we'll log and potentially fail later if nlp isn't loaded
-    nlp = None # Indicate model loading failed
-
-# --- In-Memory Data Storage (Replace with Database) ---
-expenses = []
-next_expense_id = 1
-
-# --- NLP Configuration & Helpers ---
-CURRENCY_SYMBOLS = ["₹", "$", "€", "£"] # Expand as needed
-# More robust regex to find monetary values even if spaCy misses MONEY entity
-FALLBACK_AMOUNT_REGEX = re.compile(r'([\$€£₹]|\b(?:rs|usd|eur|gbp))\s?([\d,]+(?:\.\d{1,2})?)\b|\b([\d,]+(?:\.\d{1,2})?)\s?([\$€£₹]|\b(?:rupees|rs|dollars|euros|pounds|usd|eur|gbp))\b', re.IGNORECASE)
-
-# Category keywords remain useful
-CATEGORY_KEYWORDS = {
-    "food": ["food", "meal", "lunch", "dinner", "snack", "restaurant", "dining", "groceries", "sandwich", "burger", "pizza"],
-    "coffee": ["coffee", "latte", "cappuccino", "espresso", "cafe", "starbucks", "ccd", "café", "mocha"],
-    "travel": ["travel", "taxi", "flight", "train", "bus", "uber", "ola", "fuel", "gas", "lyft", "cab", "ticket"],
-    "shopping": ["shop", "shopping", "clothes", "electronics", "mall", "amazon", "flipkart", "purchase", "order", "store"],
-    "groceries": ["groceries", "supermarket", "zepto", "blinkit", "bigbasket", "vegetables", "milk", "market"],
-    "utilities": ["utility", "utilities", "bill", "electricity", "water", "internet", "phone", "recharge"],
-    "entertainment": ["movie", "cinema", "concert", "game", "fun", "netflix", "spotify", "tickets"],
-    "rent": ["rent", "lease"],
-    "transport": ["transport", "metro", "auto", "rickshaw", "commute"]
-}
-
-# Keywords for intent detection (can be less critical now, intent inferred more from entities)
-QUERY_KEYWORDS = ["how much", "show me", "list", "what are", "total", "summary", "spending", "history", "report", "biggest", "view"]
-ADD_EXPENSE_VERBS = ["spent", "bought", "paid", "cost", "charged", "expensed", "got", "had"] # Verbs often associated with spending
-
-
-def parse_money_entity(text, doc):
-    """
-    Extracts amount using spaCy MONEY entities first, then falls back to regex.
-    Returns the amount as float and identified currency symbol/code.
-    """
-    amount = None
-    currency = None
-    text = text.replace(',', '') # Remove commas for easier parsing
-
-    # 1. Try spaCy MONEY entities first
-    money_ents = [ent for ent in doc.ents if ent.label_ == "MONEY"]
-    if money_ents:
-        # Prioritize longer entities or ones closer to verbs like 'spent' if multiple found
-        # Simple approach: take the first one for now
-        ent_text = money_ents[0].text.replace(',', '')
-        # Try to extract number and symbol/code from the entity text
-        num_match = re.search(r'([\d\.]+)', ent_text)
-        if num_match:
-            try:
-                amount = float(num_match.group(1))
-                # Try to find a known symbol or code within the entity text
-                symbol_match = re.search(r'([\$€£₹])', ent_text)
-                if symbol_match:
-                    currency = symbol_match.group(1)
-                else:
-                    # Check for codes like USD, GBP etc. (simple check)
-                    code_match = re.search(r'\b(USD|EUR|GBP|INR|RS)\b', ent_text, re.IGNORECASE)
-                    if code_match:
-                        currency = code_match.group(1).upper()
-                        # Standardize common ones
-                        if currency == "RS": currency = "INR"
-
-                # If amount found but no currency symbol in entity, check doc context
-                if amount is not None and currency is None:
-                    for token in doc:
-                        if token.text in CURRENCY_SYMBOLS:
-                            currency = token.text
-                            break
-                return amount, currency
-            except ValueError:
-                pass # Failed to convert number
-
-    # 2. Fallback Regex (if spaCy missed it or parsing failed)
-    match = FALLBACK_AMOUNT_REGEX.search(text)
-    if match:
-        try:
-            if match.group(2): # Format: $100 or Rs 100
-                amount = float(match.group(2))
-                currency_text = match.group(1)
-            elif match.group(3): # Format: 100 dollars or 100 Rs
-                amount = float(match.group(3))
-                currency_text = match.group(4)
-            else: # Should not happen with this regex, but safety first
-                return None, None
-
-            # Normalize currency symbol/code
-            if currency_text in CURRENCY_SYMBOLS:
-                currency = currency_text
-            else:
-                currency_text = currency_text.lower()
-                if currency_text in ["rs", "rupees"]: currency = "₹" # Or INR
-                elif currency_text in ["dollars", "usd"]: currency = "$" # Or USD
-                elif currency_text in ["pounds", "gbp"]: currency = "£" # Or GBP
-                elif currency_text in ["euros", "eur"]: currency = "€" # Or EUR
-
-            return amount, currency
-
-        except (ValueError, IndexError):
-            logging.warning(f"Regex fallback failed to parse amount from: {text}")
-            return None, None
-
-    return None, None # No amount found
-
-def parse_date_entities(doc):
-    """
-    Uses dateparser to interpret spaCy DATE entities.
-    Returns the *most likely* date found, defaulting to today.
-    """
-    dates = []
-    # Settings for dateparser: prefer past dates for expenses
-    settings = {'PREFER_DATES_FROM': 'past', 'RELATIVE_BASE': datetime.datetime.now()}
-
-    date_ents = [ent.text for ent in doc.ents if ent.label_ == "DATE"]
-    logging.debug(f"Found DATE entities: {date_ents}")
-
-    if date_ents:
-        for date_str in date_ents:
-            # Sometimes spaCy includes words like "on", "last" in the entity, dateparser handles this
-            parsed = dateparser.parse(date_str, settings=settings)
-            if parsed:
-                dates.append(parsed.date())
-
-    if dates:
-        # Heuristic: If multiple dates, prefer the one closest to today? Or just the first?
-        # Let's prefer the latest valid past date found (most recent expense)
-        past_dates = [d for d in dates if d <= datetime.date.today()]
-        if past_dates:
-            return max(past_dates) # Return the most recent valid date
-        elif dates:
-            return min(dates) # If only future dates found, return the earliest one (less likely for expense)
-
-    # Fallback if no DATE entity found or parsed
-    logging.debug("No valid DATE entity found or parsed, defaulting to today.")
-    return datetime.date.today()
-
-def identify_merchant_and_category(doc):
-    """
-    Identifies merchant using ORG/PERSON/GPE entities and context.
-    Identifies category using keywords and context around amount/merchant.
-    """
-    merchant = None
-    category = "Uncategorized" # Default
-
-    money_token_indices = [token.i for token in doc if token.like_num or token.text in CURRENCY_SYMBOLS or any(sym in token.text for sym in CURRENCY_SYMBOLS) or (token.ent_type_ == "MONEY")]
-
-    potential_merchants = []
-    for ent in doc.ents:
-        if ent.label_ in ["ORG", "PERSON", "GPE", "FAC"]: # Facility might also be relevant
-            # Check context: is it preceded by "at", "from", "in"? Is it near the money amount?
-            prepositions = {"at", "from", "in", "on", "with"}
-            # Check token before the entity start
-            if ent.start > 0 and doc[ent.start - 1].lower_ in prepositions:
-                potential_merchants.append(ent.text)
-                continue
-            # Check dependency relation (e.g., object of preposition)
-            if ent.root.head.lemma_ in prepositions:
-                potential_merchants.append(ent.text)
-                continue
-            # Check proximity to money amount if indices available
-            if money_token_indices:
-                min_dist = min(abs(ent.start - idx) for idx in money_token_indices)
-                if min_dist < 5: # Arbitrary proximity threshold
-                    potential_merchants.append(ent.text)
-                    continue
-
-    if potential_merchants:
-        # Simple heuristic: choose the first likely one. Could be refined.
-        # Filter out very common words or locations if needed (e.g., "City", "Bank" if too generic)
-        merchant = potential_merchants[0].strip()
-        logging.debug(f"Identified potential merchant: {merchant} from entities {potential_merchants}")
-
-    # --- Category Identification ---
-    text_lower = doc.text.lower()
-
-    # 1. Check explicit category keywords
-    found_category = None
-    matched_keywords = []
-    for cat, keywords in CATEGORY_KEYWORDS.items():
-        if any(keyword in text_lower for keyword in keywords):
-            # If multiple categories match, prioritize based on merchant or context?
-            # Simple approach: Store all matches for now
-            matched_keywords.append(cat)
-
-    if len(matched_keywords) == 1:
-        found_category = matched_keywords[0]
-    elif len(matched_keywords) > 1:
-        # Ambiguity - Requires smarter logic. E.g., "Coffee at Food court" -> Coffee or Food?
-        # Prioritize based on merchant if known? E.g. if merchant is Starbucks -> Coffee
-        if merchant:
-            merchant_lower = merchant.lower()
-            if "starbucks" in merchant_lower or "ccd" in merchant_lower or "café" in merchant_lower:
-                if "coffee" in matched_keywords: found_category = "coffee"
-            elif "amazon" in merchant_lower or "flipkart" in merchant_lower:
-                if "shopping" in matched_keywords: found_category = "shopping"
-            elif "zepto" in merchant_lower or "blinkit" in merchant_lower or "groceries" in merchant_lower:
-                if "groceries" in matched_keywords: found_category = "groceries"
-                elif "food" in matched_keywords: found_category = "groceries" # Prefer specific
-
-        # If still ambiguous, maybe pick the most specific one (e.g., prefer 'coffee' over 'food')
-        if not found_category:
-            if "coffee" in matched_keywords: found_category = "coffee"
-            elif "groceries" in matched_keywords: found_category = "groceries"
-            elif "transport" in matched_keywords: found_category = "transport"
-            # Add more specific priorities if needed
-            elif "food" in matched_keywords: found_category = "food" # More general last
-            else: found_category = matched_keywords[0] # Default to first match if no rules apply
-
-    if found_category:
-        category = found_category
-    # 2. (Optional/Advanced) Infer from merchant if category is Uncategorized
-    elif merchant and category == "Uncategorized":
-        merchant_lower = merchant.lower()
-        if "starbucks" in merchant_lower or "ccd" in merchant_lower or "café" in merchant_lower: category = "coffee"
-        elif "amazon" in merchant_lower or "flipkart" in merchant_lower: category = "shopping"
-        elif "zepto" in merchant_lower or "blinkit" in merchant_lower: category = "groceries"
-        elif "uber" in merchant_lower or "ola" in merchant_lower: category = "travel"
-        elif "netflix" in merchant_lower or "spotify" in merchant_lower: category = "entertainment"
-        # Add more merchant->category mappings
-
-    # 3. (Optional/Advanced) Use Dependency Parsing or Word Vectors
-    # Example: Look for nouns that are objects of spending verbs near the amount
-    # This requires more complex linguistic analysis.
-
-    logging.debug(f"Identified Category: {category}")
-    return merchant, category
-
-def determine_intent(doc):
-    """Determines intent: 'add_expense', 'query_expense', or 'unknown'."""
-    text_lower = doc.text.lower()
-
-    has_query_keyword = any(keyword in text_lower for keyword in QUERY_KEYWORDS)
-    has_add_verb = any(verb.lemma_ in ADD_EXPENSE_VERBS for verb in doc if verb.pos_ == "VERB")
-    has_money_entity = any(ent.label_ == "MONEY" for ent in doc.ents) or FALLBACK_AMOUNT_REGEX.search(text_lower) is not None
-
-    # More explicit questions are likely queries
-    if doc[0].pos_ == "AUX" or doc[0].lemma_ in ["what", "how", "show", "list", "view"]: # Starts like a question
-        return "query_expense"
-
-    if has_query_keyword:
-        return "query_expense"
-
-    # If it has a spending verb and a money amount, likely adding expense
-    if has_add_verb and has_money_entity:
-        return "add_expense"
-
-    # If it just has a money amount and maybe date/merchant, could be adding expense (implicit verb)
-    if has_money_entity and not has_query_keyword:
-        # Check if there are nouns suggesting items bought
-        has_object_noun = any(tok.pos_ == "NOUN" and tok.dep_ in ["dobj", "pobj", "attr"] for tok in doc)
-        if has_object_noun or any(ent.label_ in ["ORG", "PRODUCT"] for ent in doc.ents):
-            return "add_expense"
-
-    # If only query keywords or unclear structure, lean towards query or unknown
-    if has_query_keyword:
-        return "query_expense"
-
-    return "unknown"
-
-# --- Filtering and Formatting (largely reused, minor adjustments) ---
-
-def filter_expenses(criteria):
-    """Filters the global 'expenses' list based on criteria."""
-    # (This function remains largely the same as the previous version)
-    filtered = expenses
-
-    # Filter by Category
-    if 'category' in criteria and criteria['category'] is not None:
-        target_cat = criteria['category'].lower()
-        # Handle general 'food' query including 'coffee', 'groceries' etc.
-        food_related_cats = {'food', 'coffee', 'groceries', 'restaurant'} # Define food-related categories
-        if target_cat == 'food':
-            filtered = [e for e in filtered if e['category'].lower() in food_related_cats]
-        else:
-            filtered = [e for e in filtered if e['category'].lower() == target_cat]
-
-    # Filter by Date Range (start_date and end_date are inclusive)
-    if 'start_date' in criteria and criteria['start_date'] is not None:
-        filtered = [e for e in filtered if e['date'] >= criteria['start_date']]
-    if 'end_date' in criteria and criteria['end_date'] is not None:
-        filtered = [e for e in filtered if e['date'] <= criteria['end_date']]
-
-    # Filter by Merchant (case-insensitive substring match)
-    if 'merchant' in criteria and criteria['merchant'] is not None:
-        target_merchant = criteria['merchant'].lower()
-        filtered = [e for e in filtered if e['merchant'] and target_merchant in e['merchant'].lower()]
-
-    return filtered
-
-def parse_date_range_from_query(doc):
-    """Parses date ranges specifically for queries (e.g., 'this month', 'last week')."""
-    # (This function remains largely the same, using dateparser on DATE entities or keywords)
-    today = datetime.date.today()
-    text_lower = doc.text.lower() # Use full text for keywords like "this month"
-    start_date, end_date = None, None
-
-    # Prioritize DATE entities found by spaCy
-    date_ents_text = [ent.text for ent in doc.ents if ent.label_ == "DATE"]
-    parsed_dates = []
-    settings = {'PREFER_DATES_FROM': 'past', 'RELATIVE_BASE': datetime.datetime.now()}
-
-    for date_str in date_ents_text:
-        # Try parsing as a potential range using dateparser's experimental range feature (or parse single dates)
-        # For simplicity, we'll stick to parsing single points and let keyword logic handle ranges
-        parsed = dateparser.parse(date_str, settings=settings)
-        if parsed:
-            parsed_dates.append(parsed.date())
-
-    # If spaCy found specific dates, use them
-    if len(parsed_dates) == 1:
-        start_date = end_date = parsed_dates[0]
-    elif len(parsed_dates) > 1:
-        # Ambiguous, maybe take min/max? Or rely on keywords below?
-        start_date = min(parsed_dates)
-        end_date = max(parsed_dates)
-        if start_date > end_date: # Swap if order is wrong
-            start_date, end_date = end_date, start_date
-
-    # If no specific date entities, check for range keywords
-    if start_date is None and end_date is None:
-        if "today" in text_lower:
-            start_date = end_date = today
-        elif "yesterday" in text_lower:
-            start_date = end_date = today - datetime.timedelta(days=1)
-        elif "this week" in text_lower:
-            start_of_week = today - datetime.timedelta(days=today.weekday()) # Monday
-            end_of_week = start_of_week + datetime.timedelta(days=6) # Sunday
-            start_date = start_of_week
-            end_date = end_of_week
-        elif "last week" in text_lower:
-            end_of_last_week = today - datetime.timedelta(days=today.weekday() + 1) # Last Sunday
-            start_of_last_week = end_of_last_week - datetime.timedelta(days=6) # Last Monday
-            start_date = start_of_last_week
-            end_date = end_of_last_week
-        elif "this month" in text_lower:
-            start_date = today.replace(day=1)
-            next_month = today.replace(day=28) + datetime.timedelta(days=4)
-            last_day_of_month = next_month - datetime.timedelta(days=next_month.day)
-            end_date = last_day_of_month
-        elif "last month" in text_lower:
-            first_day_of_current_month = today.replace(day=1)
-            last_day_of_last_month = first_day_of_current_month - datetime.timedelta(days=1)
-            first_day_of_last_month = last_day_of_last_month.replace(day=1)
-            start_date = first_day_of_last_month
-            end_date = last_day_of_last_month
-        elif "year" in text_lower: # e.g., "this year", "last year"
-            if "this year" in text_lower:
-                start_date = datetime.date(today.year, 1, 1)
-                end_date = datetime.date(today.year, 12, 31)
-            elif "last year" in text_lower:
-                start_date = datetime.date(today.year - 1, 1, 1)
-                end_date = datetime.date(today.year - 1, 12, 31)
-            # Check for specific year like "in 2023"
-            year_match = re.search(r'\b(in|for)\s+(\d{4})\b', text_lower)
-            if year_match:
-                year = int(year_match.group(2))
-                start_date = datetime.date(year, 1, 1)
-                end_date = datetime.date(year, 12, 31)
-
-        # Add specific month parsing ("in January") if needed (similar to previous version)
-        else:
-            month_match = re.search(r'\b(in|for)\s+(january|february|march|april|may|june|july|august|september|october|november|december)\b', text_lower)
-            if month_match:
-                month_name = month_match.group(2)
-                year_context = today.year # Assume current year
-                # Check if a year was mentioned nearby
-                year_ent = [e.text for e in doc.ents if e.label_ == "DATE" and e.text.isdigit() and len(e.text)==4]
-                if year_ent:
-                    year_context = int(year_ent[0])
-                try:
-                    month_num = list(datetime.date(2000, i, 1).strftime('%B').lower() for i in range(1, 13)).index(month_name) + 1
-                    start_date = datetime.date(year_context, month_num, 1)
-                    next_m = (start_date.replace(day=28) + datetime.timedelta(days=4))
-                    end_date = next_m - datetime.timedelta(days=next_m.day)
-                except (ValueError, IndexError): pass # Ignore invalid month/year
-
-    logging.debug(f"Parsed date range for query: {start_date} to {end_date}")
-    return start_date, end_date
-
-def format_expense_list(expense_list, title="Here are the expenses:"):
-    """Formats a list of expenses into a user-friendly string."""
-    # (This function remains largely the same)
-    if not expense_list:
-        return "No expenses found matching your criteria."
-
-    total_amount = sum(e['amount'] for e in expense_list)
-    # Try to get a consistent currency symbol, default to first expense's symbol or fallback
-    currency_symbol = expense_list[0].get("currency") or "₹" if expense_list else "₹"
-
-    response_lines = [title]
-    expense_list.sort(key=lambda x: x['date'], reverse=True)
-
-    for expense in expense_list:
-        cur = expense.get("currency") or currency_symbol # Use expense specific or default
-        amount_str = f"{cur}{expense['amount']:.2f}"
-        merchant_part = f" at {expense['merchant']}" if expense['merchant'] else ""
-        category_part = f" ({expense['category']})" if expense['category'] != 'Uncategorized' else ""
-        date_str = expense['date'].strftime("%b %d, %Y")
-        response_lines.append(f"- {amount_str}{category_part}{merchant_part} - {date_str}")
-
-    if len(expense_list) > 1:
-        total_str = f"{currency_symbol}{total_amount:.2f}"
-        response_lines.append(f"Total: {total_str}")
-
-    return "\n".join(response_lines)
-
-# --- NEW: Core NLP Processing Function ---
-def analyze_expense_text(text):
-    """
-    Analyzes text to extract expense details or understand queries using spaCy.
-    Returns a dictionary with action, status, and extracted details/message.
-    """
-    global next_expense_id # Allow modification of the global counter
-
-    if nlp is None:
-        logging.error("spaCy model not loaded. Cannot process text.")
-        return {"action": "error", "status": "failed", "message": "NLP model not available"}
-
-    logging.info(f"Analyzing text: {text[:100]}...") # Log snippet
-    doc = nlp(text)
-    logging.debug(f"spaCy Entities: {[(ent.text, ent.label_) for ent in doc.ents]}")
-
-    intent = determine_intent(doc)
-    logging.info(f"Determined Intent: {intent}")
-    response_data = {}
-
-    if intent == "add_expense":
-        amount, currency = parse_money_entity(text, doc)
-        expense_date = parse_date_entities(doc)
-        merchant, category = identify_merchant_and_category(doc)
-
-        if amount is not None:
-            currency_symbol = currency or "₹" # Default currency
-            new_expense = {
-                "id": next_expense_id,
-                "amount": amount,
-                "currency": currency_symbol,
-                "category": category,
-                "merchant": merchant,
-                "date": expense_date, # Keep as date object internally
-                "original_message": text
-            }
-            expenses.append(new_expense)
-            next_expense_id += 1
-            logging.info(f"Added expense (in-memory): {new_expense}")
-
-            merchant_part = f" at {merchant}" if merchant else ""
-            date_str = expense_date.strftime('%b %d, %Y')
-            confirmation_msg = f"✅ Expense added: {currency_symbol}{amount:.2f} for {category}{merchant_part} on {date_str}."
-
-            new_expense_serializable = new_expense.copy()
-            new_expense_serializable["date"] = new_expense["date"].isoformat()
-
-            response_data = {
-                "action": "add_expense",
-                "status": "success",
-                "message": confirmation_msg,
-                "details": new_expense_serializable
-            }
-        else:
-            logging.warning(f"Could not extract amount reliably from: {text}")
-            response_data = {
-                "action": "add_expense",
-                "status": "failed",
-                "message": f"Sorry, I couldn't understand the amount. Please include it clearly (e.g., '₹500', '$20', '15 pounds')."
-            }
-
-    elif intent == "query_expense":
-        logging.info("Processing query intent.")
-        query_criteria = {}
-        _q_merchant, q_category = identify_merchant_and_category(doc)
-
-        # ... (rest of query criteria extraction logic remains the same) ...
-        query_cat_found = None
-        text_lower = doc.text.lower()
-        for cat, keywords in CATEGORY_KEYWORDS.items():
-            if any(keyword in text_lower for keyword in keywords):
-                if cat == 'food' or q_category == 'food':
-                    query_cat_found = 'food'
-                    break
-                query_cat_found = q_category if q_category != 'Uncategorized' else cat
-                break
-
-        query_criteria['category'] = query_cat_found
-        query_criteria['merchant'] = _q_merchant
-        start_date, end_date = parse_date_range_from_query(doc)
-        query_criteria['start_date'] = start_date
-        query_criteria['end_date'] = end_date
-
-        logging.info(f"Query Criteria: {query_criteria}")
-        results = filter_expenses(query_criteria)
-        response_message = ""
-
-        # ... (rest of query response formatting logic remains the same) ...
-        if results and ("total" in text_lower or "sum" in text_lower or "how much" in doc[0].lower_):
-            total_amount = sum(e['amount'] for e in results)
-            currency_symbol = results[0].get("currency") or "₹"
-            category_filter_text = f" on {query_criteria['category']}" if query_criteria['category'] else ""
-            date_filter_text = ""
-            if start_date and end_date and start_date == end_date: date_filter_text = f" for {start_date.strftime('%b %d, %Y')}"
-            elif start_date and end_date: date_filter_text = f" from {start_date.strftime('%b %d')} to {end_date.strftime('%b %d, %Y')}"
-            elif start_date: date_filter_text = f" since {start_date.strftime('%b %d, %Y')}"
-            elif end_date: date_filter_text = f" until {end_date.strftime('%b %d, %Y')}"
-            response_message = f"Your total spending{category_filter_text}{date_filter_text} is {currency_symbol}{total_amount:.2f}."
-            if len(results) <= 10:
-                response_message += "\n" + format_expense_list(results, "Details:")
-            else:
-                response_message += f" (from {len(results)} transactions)"
-        elif results and ("biggest" in text_lower or "largest" in text_lower or "top" in text_lower):
-            top_n = 3
-            top_expenses = sorted(results, key=lambda x: x['amount'], reverse=True)[:top_n]
-            response_message = format_expense_list(top_expenses, f"Your top {len(top_expenses)} expenses:")
-        else:
-            date_filter_desc = ""
-            if start_date and end_date and start_date == end_date: date_filter_desc = f" from {start_date.strftime('%b %d, %Y')}"
-            elif start_date or end_date: date_filter_desc = " matching the date criteria"
-            category_filter_desc = f" for {query_criteria['category']}" if query_criteria['category'] else ""
-            merchant_filter_desc = f" at {query_criteria['merchant']}" if query_criteria['merchant'] else ""
-            title = f"Expenses{category_filter_desc}{merchant_filter_desc}{date_filter_desc}:"
-            response_message = format_expense_list(results, title)
-
-        response_data = {
-            "action": "query_expense",
-            "status": "success",
-            "message": response_message,
-            "criteria": {k: v.isoformat() if isinstance(v, datetime.date) else v for k, v in query_criteria.items() if v is not None},
-            "results_count": len(results)
-        }
-
-    else: # intent == "unknown"
-        logging.info(f"Local NLP intent unknown for: {text}. Attempting Gemini API call.")
-
-        # --- Call Gemini API ---
-        gemini_result = call_gemini_api(text, GEMINI_API_KEY)
-
-        if (gemini_result and isinstance(gemini_result, dict) and gemini_result.get("action") in ["add_expense", "query_expense", "info"]):
-            # If Gemini returned a structured result we can use (or an info message), return it
-            logging.info(f"Using result from Gemini API. Action: {gemini_result.get('action')}")
-            response_data = gemini_result
-            # TODO: Potentially re-validate or re-process gemini_result here if needed
-            # For example, if action is add_expense, ensure data types are correct, parse date string etc.
-            # If action is query_expense, parse date strings etc.
-            if response_data.get("action") == "add_expense" and "details" in response_data:
-                # Basic post-processing/validation for added expense
-                details = response_data["details"]
-                try:
-                    if "date" in details and isinstance(details["date"], str):
-                        details["date"] = datetime.datetime.fromisoformat(details["date"].split("T")[0]).date()
-                    if "amount" in details:
-                        details["amount"] = float(details["amount"])
-                    # Add expense to memory if Gemini successfully added it
-                    # Note: This assumes Gemini provides all necessary fields correctly
-                    if all(k in details for k in ["amount", "currency", "category", "date"]):
-                        new_expense = {
-                            "id": next_expense_id,
-                            "amount": details["amount"],
-                            "currency": details.get("currency", "₹"),
-                            "category": details.get("category", "Uncategorized"),
-                            "merchant": details.get("merchant"),
-                            "date": details["date"],
-                            "original_message": text
-                        }
-                        expenses.append(new_expense)
-                        next_expense_id += 1
-                        logging.info(f"Added expense (from Gemini): {new_expense}")
-                        # Update message for consistency
-                        # --- FIX: Check if date is valid before formatting ---
-                        if isinstance(new_expense.get('date'), datetime.date):
-                            date_str = new_expense['date'].strftime('%b %d, %Y')
-                            response_data["message"] = f"✅ Expense added (via Gemini): {new_expense['currency']}{new_expense['amount']:.2f} for {new_expense['category']} on {date_str}."
-                        else:
-                            logging.warning(f"Gemini add_expense result had invalid date type: {type(new_expense.get('date'))}. Using default message.")
-                            response_data["message"] = f"✅ Expense added (via Gemini): {new_expense['currency']}{new_expense['amount']:.2f} for {new_expense['category']} (date missing/invalid)."
-                        # Make details serializable for JSON response
-                        # Ensure date is serializable even if it was invalid earlier
-                        if isinstance(response_data["details"].get("date"), datetime.date):
-                            response_data["details"]["date"] = response_data["details"]["date"].isoformat()
-                        else:
-                            # Handle case where date might be None or wrong type after processing
-                            response_data["details"]["date"] = None # Or some indicator of invalidity
-                    else:
-                        logging.warning("Gemini add_expense result missing required fields.")
-                        response_data = {"action": "unknown", "status": "failed", "message": "Gemini suggested adding an expense, but details were incomplete."}
-
-                except (ValueError, TypeError) as e:
-                    logging.warning(f"Error processing Gemini add_expense details: {e}")
-                    response_data = {"action": "unknown", "status": "failed", "message": "Could not process expense details suggested by Gemini."}
-
-            elif response_data.get("action") == "query_expense" and "criteria" in response_data:
-                # Basic post-processing for query
-                criteria = response_data["criteria"]
-                try:
-                    if "start_date" in criteria and isinstance(criteria["start_date"], str):
-                        criteria["start_date"] = datetime.datetime.fromisoformat(criteria["start_date"].split("T")[0]).date()
-                    if "end_date" in criteria and isinstance(criteria["end_date"], str):
-                        criteria["end_date"] = datetime.datetime.fromisoformat(criteria["end_date"].split("T")[0]).date()
-                    # Execute the query based on Gemini's criteria
-                    results = filter_expenses(criteria)
-                    # Use Gemini's message or generate a new one
-                    if not response_data.get("message"):
-                        response_data["message"] = format_expense_list(results, "Query results (via Gemini):")
-                    response_data["results_count"] = len(results)
-                    # Make criteria serializable
-                    response_data["criteria"] = {k: v.isoformat() if isinstance(v, datetime.date) else v for k, v in criteria.items() if v is not None}
-
-                except (ValueError, TypeError) as e:
-                    logging.warning(f"Error processing Gemini query_expense criteria: {e}")
-                    response_data = {"action": "unknown", "status": "failed", "message": "Could not process query criteria suggested by Gemini."}
-
-        else:
-            # Fallback to original unknown message if Gemini fails or returns unusable data
-            logging.info("Gemini API did not provide a usable structured result. Falling back to default unknown message.")
-            response_data = {
-                "action": "unknown",
-                "status": "failed",
-                "message": "Sorry, I couldn't quite understand that. Please try phrasing your expense or query differently. \nExamples:\n- 'Spent ₹50 on coffee yesterday at Starbucks'\n- 'Show my food expenses last week'\n- 'What was my total spending last month?'"
-            }
-            # Optionally include Gemini's raw suggestion if available and not structured
-            if gemini_result and isinstance(gemini_result, dict) and "message" in gemini_result:
-                response_data["message"] += f"\n\nGemini suggestion: {gemini_result['message']}"
-
-    logging.info(f"Analysis complete. Action: {response_data.get('action')}, Status: {response_data.get('status')}") # Corrected closing parenthesis
-    return response_data
-
-
-# Placeholder for Gemini API Key - Load from environment variable
-GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
-
-# Placeholder function for Gemini API call
-def call_gemini_api(text, api_key):
-    """
-    Placeholder function to call the Gemini API.
-    Replace with actual implementation.
-    Should ideally return a dictionary similar to analyze_expense_text's output
-    or None if the call fails or response is unusable.
-    """
-    if not api_key:
-        logging.warning("GEMINI_API_KEY not set. Skipping Gemini API call.")
-        return None
-
-    # --- Replace with actual Gemini API endpoint and request structure ---
-    # Example using Google AI Generative Language API (adjust model and endpoint as needed)
-    # Ensure you have the google-generativeai library installed (`pip install google-generativeai`)
-    # and the API key is correctly set as an environment variable.
-    # Use a current model and the v1 endpoint
-    model_name = "gemini-2.0-flash-lite" # Updated model name
-    api_endpoint = f"https://generativelanguage.googleapis.com/v1/models/{model_name}:generateContent?key={api_key}"
-    headers = {
-        "Content-Type": "application/json"
-    }
-    # Construct the payload based on Gemini API requirements
-    # This prompt asks Gemini to act like the existing NLP service
-    # Corrected indentation for the prompt string
-    prompt = f"""Analyze the following text for expense tracking. Determine the intent ('add_expense' or 'query_expense') and extract relevant details.
-
-Text: "{text}"
-
-Desired JSON output format:
-{{
-  "action": "add_expense" | "query_expense" | "unknown" | "info",
-  "status": "success" | "failed",
-  "message": "Confirmation or result summary or explanation",
-  "details": {{ // Only for add_expense if successful
-    "amount": <float>,
-    "currency": "<string>", // e.g., "₹", "$", "EUR"
-    "category": "<string>", // e.g., "food", "travel", "shopping"
-    "merchant": "<string>", // e.g., "Starbucks", "Amazon"
-    "date": "YYYY-MM-DD"
-  }},
-  "criteria": {{ // Only for query_expense if successful
-    "category": "<string>",
-    "merchant": "<string>",
-    "start_date": "YYYY-MM-DD",
-    "end_date": "YYYY-MM-DD"
-  }}
-}}
-
-- If the intent is clearly 'add_expense' and details can be extracted, use action "add_expense" and status "success". Include extracted details.
-- If the intent is clearly 'query_expense' and criteria can be extracted, use action "query_expense" and status "success". Include extracted criteria.
-- If the intent is unclear, details are missing for adding, or it's a general question/statement not related to adding/querying expenses, use action "unknown" or "info" and status "failed" or "success" respectively. Provide a helpful message.
-- Ensure date format is YYYY-MM-DD.
-- Default currency to "₹" if not specified.
-- Default category to "Uncategorized" if not specified.
-Provide only the JSON output.
-"""
-
-    payload = json.dumps({
-        "contents": [{
-            "parts":[{ "text": prompt }]
-        }]
-        # Add generationConfig if needed (e.g., temperature, max output tokens)
-        # "generationConfig": {
-        #     "temperature": 0.7,
-        #     "maxOutputTokens": 256
-        # }
-    })
-    # --- End of placeholder section ---
-
-    try:
-        response = requests.post(api_endpoint, headers=headers, data=payload, timeout=20) # Increased timeout
-        response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
-
-        gemini_response_raw = response.json()
-        logging.debug(f"Raw Gemini API response: {gemini_response_raw}")
-
-        # --- Process gemini_response ---
-        content = None # Initialize content to None
-        content_cleaned = None # Initialize content_cleaned to None
-        # Extract the text content which should contain the JSON
-        if 'candidates' in gemini_response_raw and len(gemini_response_raw['candidates']) > 0:
-            content = gemini_response_raw['candidates'][0].get('content', {}).get('parts', [{}])[0].get('text')
-            if content:
-                logging.info(f"Gemini suggested JSON: {content}")
-                # Clean potential markdown/code block formatting
-                content_cleaned = content.strip().strip('```json').strip('```').strip()
-                try:
-                    # Attempt to parse the JSON string from Gemini
-                    parsed_result = json.loads(content_cleaned)
-                    # Basic validation of the parsed structure
-                    if isinstance(parsed_result, dict) and "action" in parsed_result:
-                        logging.info("Successfully parsed structured data from Gemini.")
-                        # Add further validation/sanitization if needed
-                        return parsed_result
-                    else:
-                        logging.warning("Gemini response parsed but lacks expected structure.")
-                        # Return info message if structure is wrong but content exists
-                        return {"action": "info", "status": "success", "message": f"Gemini suggestion: {content_cleaned}"}
-                except json.JSONDecodeError as json_err:
-                    logging.warning(f"Failed to decode JSON from Gemini response: {json_err}. Raw content: {content_cleaned}")
-                    # Return the raw text as a message if JSON parsing fails but content exists
-                    return {"action": "info", "status": "success", "message": f"Gemini suggestion: {content_cleaned}"}
-            else:
-                logging.warning("No text content found in Gemini response candidates.")
-                return None
-        else:
-            logging.warning("No candidates found in Gemini API response.")
-            return None
-
-    except requests.exceptions.Timeout:
-        logging.error("Gemini API call timed out.")
-        return None
-    except requests.exceptions.RequestException as e:
-        logging.error(f"Gemini API call failed: {e}")
-        # Log response body if available and indicates an API error
-        if e.response is not None:
-            try:
-                logging.error(f"Gemini API error response: {e.response.json()}")
-            except json.JSONDecodeError:
-                logging.error(f"Gemini API error response (non-JSON): {e.response.text}")
-        return None
-    except Exception as e:
-        # Include content_cleaned in the log if available during unexpected errors
-        error_context = f"Raw content (if available): {content_cleaned}" if content_cleaned else "No raw content parsed."
-        logging.error(f"An unexpected error occurred during Gemini API call or processing: {e}. {error_context}")
-        return None
-
-
-# --- Flask Blueprint Setup (Optional: Keep if direct API access is needed) ---
-nlp_bp = Blueprint('nlp_service', __name__)
-
-@nlp_bp.route('/process_nlp', methods=['POST'])
-def process_nlp_expense_route():
-    """Flask route handler that calls the core analysis function."""
-    data = request.get_json()
-    if not data or 'message' not in data:
-        logging.warning("Received request without 'message' field.")
-        return jsonify({"error": "Missing 'message' in request body"}), 400
-
-    user_message = data['message']
-    result = analyze_expense_text(user_message) # Call the core function
-
-    # Determine status code based on result
-    status_code = 200
-    if result.get("status") == "failed":
-        status_code = 400 # Or 500 if it's an internal NLP model error
-        if result.get("message") == "NLP model not available":
-            status_code = 500
-
-    return jsonify(result), status_code
+# filepath: c:\Users\Dell\Monil\Apps\code\Projects\space-songporter\OCR\nlp_service.py
+import json
+from model_setup import zero_shot, ner # Assuming model_setup.py exists and is correct
+from utils import parse_entities # Assuming utils.py exists and is correct
+from config import CATEGORY_KEYWORDS # Import categories from config
+
+def analyze_text(text: str) -> dict:
+    """
+    Analyzes the input text for intent, entities, and category.
+
+    Args:
+        text: The input text string.
+
+    Returns:
+        A dictionary containing the analysis results (intent, category, amount, etc.)
+        or an error message.
+    """
+    if not text:
+        return {
+            "status": "failed",
+            "message": "Input text cannot be empty."
+        }
+
+    print(f"NLP Service: Processing text: {text}")
+
+    # Step 1: Intent classification
+    try:
+        candidate_labels = ["expense", "investment", "query", "limit-setting", "income", "other"]
+        intent_result = zero_shot(text, candidate_labels=candidate_labels)
+        intent = intent_result["labels"][0]
+        score = intent_result["scores"][0]
+        print(f"NLP Service: Intent classification: {intent} (Score: {score:.2f})")
+    except Exception as e:
+        print(f"NLP Service: Error during intent classification: {e}")
+        return {
+            "status": "failed",
+            "message": "Intent classification failed",
+            "error": str(e)
+        }
+
+    # Step 2: Check if intent requires fallback (e.g., Gemini route)
+    if intent == "query":
+        print(f"NLP Service: Intent classified as '{intent}'. Fallback route triggered.")
+        # Placeholder for potential future Gemini integration
+        return {
+            "status": "fallback_required", # Use a specific status
+            "message": "Intent requires further processing (e.g., query engine - not implemented).",
+            "original_text": text,
+            "classified_intent": intent
+        }
+
+    # Step 3: Entity extraction (for non-fallback intents)
+    try:
+        entities = ner(text)
+        print(f"NLP Service: NER entities: {entities}")
+        amount, currency, item = parse_entities(entities, text) # Pass the full text as well, matching utils.parse_entities' signature
+        print(f"NLP Service: Parsed entities: Amount={amount}, Currency={currency}, Item={item}")
+    except Exception as e:
+        print(f"NLP Service: Error during entity extraction: {e}")
+        # Decide if you want to return an error or proceed with partial data
+        amount, currency, item = None, None, None # Default to None on error
+
+    # Step 4: Category matching using config.py
+    category = "Misc" # Default
+    text_lower = text.lower()
+    item_lower = item.lower() if item else ""
+
+    # Check intent first for Income/Investment categories
+    if intent == "income":
+        category = "Income"
+    elif intent == "investment":
+        category = "Investment"
+    else: # Only check keywords if not already classified as Income/Investment by intent
+        for cat, keywords in CATEGORY_KEYWORDS.items():
+            # Skip Income/Investment keywords here as intent handles them
+            if cat in ["Income", "Investment"]:
+                continue
+            if any(kw in text_lower or (item_lower and kw in item_lower) for kw in keywords):
+                category = cat
+                break # Stop after first match
+
+    # Refine intent based on keywords if initial classification was 'other' or potentially wrong
+    if intent != "income" and category == "Income":
+        print(f"NLP Service: Correcting intent to 'income' based on keywords/category.")
+        intent = "income"
+    elif intent != "investment" and category == "Investment":
+        print(f"NLP Service: Correcting intent to 'investment' based on keywords/category.")
+        intent = "investment"
+    # If no specific category matched but intent is expense/other, ensure category isn't Income/Investment
+    elif category in ["Income", "Investment"] and intent not in ["income", "investment"]:
+        category = "Misc" # Revert category if intent doesn't match
+
+    print(f"NLP Service: Assigned category: {category}")
+
+    # Final successful response structure
+    return {
+        "status": "success",
+        "type": intent,
+        "category": category,
+        "amount": amount,
+        "currency": currency,
+        "item": item
+    }
+
+# Example usage (for testing nlp_service.py directly)
+if __name__ == '__main__':
+    test_cases = [
+        "spent 5 eur on coffee",
+        "how much did I spend last month",
+        "salary credited 50000",
+        "invested 1000 in stocks",
+        "paid 20 usd for lunch",
+        "got groceries for 50 dollars",
+        "what was my total spending on food?",
+        "received 200 GBP deposit"
+    ]
+    for case in test_cases:
+        print(f"\n--- Testing: '{case}' ---")
+        result = analyze_text(case)
+        print(json.dumps(result, indent=2))
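
Reviewer note: the shape a successful call is expected to produce (field values here are illustrative; actual values depend on the zero-shot and NER models):

    result = analyze_text("spent 5 eur on coffee")
    # {
    #   "status": "success",
    #   "type": "expense",
    #   "category": "Coffee",
    #   "amount": 5.0,
    #   "currency": "EUR",
    #   "item": "coffee"
    # }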
requirements.txt CHANGED
@@ -5,5 +5,10 @@ paddlepaddle
 paddleocr
 spacy>=3.0.0 # Added spaCy
 dateparser>=1.0.0 # Added dateparser
+google-generativeai # Added for Gemini API
+python-dotenv # Added for loading .env files
 # Note: spaCy model 'en_core_web_md' needs to be downloaded separately:
-# python -m spacy download en_core_web_md
+# python -m spacy download en_core_web_md
+transformers
+torch
+sentencepiece
utils.py ADDED
@@ -0,0 +1,141 @@
+import re
+import json
+from config import FALLBACK_AMOUNT_REGEX, CURRENCY_SYMBOLS # Import regex and symbols
+
+def parse_entities(entities, full_text: str):
+    """
+    Extracts amount, currency, and item description from NER entities and full text.
+
+    Args:
+        entities: List of dictionaries from the NER pipeline.
+        full_text: The original input text string.
+
+    Returns:
+        A tuple: (amount, currency, item)
+    """
+    amount, currency, item = None, None, None
+    potential_amounts = []
+
+    # 1. Use the FALLBACK_AMOUNT_REGEX on the full text first - it's often more reliable
+    # Regex groups:
+    #   1: Symbol/Code before number ($, EUR, etc.)
+    #   2: Number when symbol/code is before
+    #   3: Number when symbol/code is after
+    #   4: Symbol/Code after number (rs, dollars, etc.)
+    #   5: Standalone number
+    for match in FALLBACK_AMOUNT_REGEX.finditer(full_text):
+        num_str = None
+        curr_symbol = None
+        curr_code = None
+
+        if match.group(1) and match.group(2): # Symbol/Code before
+            curr_symbol = match.group(1)
+            num_str = match.group(2)
+        elif match.group(3) and match.group(4): # Symbol/Code after
+            num_str = match.group(3)
+            curr_code = match.group(4)
+        elif match.group(5) and not match.group(1) and not match.group(4): # Standalone number
+            num_str = match.group(5)
+
+        if num_str:
+            try:
+                value = float(num_str.replace(",", ""))
+                # Basic validation: avoid huge numbers unless they have decimals (might be IDs)
+                if value < 1_000_000 or '.' in num_str:
+                    potential_amounts.append({
+                        "value": value,
+                        "currency_symbol": curr_symbol,
+                        "currency_code": curr_code,
+                        "match_obj": match # Store match object for position info later if needed
+                    })
+            except ValueError:
+                continue # Ignore invalid numbers like "1,2,3"
+
+    # 2. Determine Amount and Currency from regex matches
+    if potential_amounts:
+        # Prioritize matches that included a currency symbol/code
+        currency_matches = [p for p in potential_amounts if p["currency_symbol"] or p["currency_code"]]
+        if currency_matches:
+            # Often the largest value with currency is the main one
+            best_match = max(currency_matches, key=lambda x: x["value"])
+            amount = best_match["value"]
+            # Determine currency from symbol/code
+            symbol = best_match["currency_symbol"]
+            code = best_match["currency_code"]
+            if symbol:
+                if "₹" in symbol: currency = "INR"
+                elif "$" in symbol: currency = "USD"
+                elif "€" in symbol: currency = "EUR"
+                elif "£" in symbol: currency = "GBP"
+            elif code:
+                code_lower = code.lower()
+                if code_lower in ["inr", "rs", "rupees"]: currency = "INR"
+                elif code_lower in ["usd", "dollars"]: currency = "USD"
+                elif code_lower in ["eur", "euros"]: currency = "EUR"
+                elif code_lower in ["gbp", "pounds"]: currency = "GBP"
+        else:
+            # If no currency found, take the largest standalone number as amount
+            best_match = max(potential_amounts, key=lambda x: x["value"])
+            amount = best_match["value"]
+            currency = None # Explicitly None if not found
+
+    # 3. Extract Item using NER entities (excluding amounts/currency)
+    item_parts = []
+    if entities:
+        # Get text segments identified as potential amounts by the regex
+        amount_texts = set()
+        for p in potential_amounts:
+            amount_texts.add(p["match_obj"].group(0)) # Add the full matched string
+
+        for entity in entities:
+            entity_group = entity.get("entity_group", "")
+            word = entity.get("word", "")
+
+            # Skip if the entity word is part of a detected amount or is just a currency symbol
+            if word in amount_texts or word in CURRENCY_SYMBOLS:
+                continue
+
+            # Skip if it's classified as MONEY by NER (already handled by regex)
+            # Allow CARDINAL if it wasn't part of a regex match (e.g., quantity "2 coffees")
+            if "MONEY" in entity_group:
+                continue
+
+            # Include relevant entity types for item description
+            if entity_group in ["MISC", "ORG", "PRODUCT", "EVENT", "WORK_OF_ART", "LOC", "PER", "CARDINAL", "QUANTITY"]:
+                # Clean up sub-word tokens like ##ing
+                cleaned_word = word.replace(" ##", "").strip()
+                if cleaned_word:
+                    item_parts.append(cleaned_word)
+
+    if item_parts:
+        item = " ".join(item_parts).strip()
+        # Further clean-up (optional): remove leading/trailing punctuation if desired
+        item = re.sub(r"^[^\w]+|[^\w]+$", "", item)
+
+    # 4. Final checks and return
+    # If amount is found but currency is None, consider a default (optional, decided against for now)
+    # if amount is not None and currency is None:
+    #     currency = "INR" # Or keep as None
+
+    print(f"Utils: Parsed-> Amount: {amount}, Currency: {currency}, Item: {item}")
+    return amount, currency, item
+
+# ... (keep parse_gemini_response as is) ...
+def parse_gemini_response(response_text):
+    """
+    Parses a structured string response from Gemini (expected JSON-like).
+    Example expected format:
+    "{ \"type\": \"expense\", \"category\": \"Food\", \"amount\": 5.50, \"currency\": \"USD\", \"item\": \"coffee\" }"
+    """
+    try:
+        # Clean the response text if it's wrapped in markdown code blocks
+        response_text = re.sub(r"^```json\s*|\s*```$", "", response_text.strip())
+        data = json.loads(response_text)
+        return data
+    except json.JSONDecodeError:
+        print(f"Warning: Could not parse Gemini response: {response_text}")
+        return None
+    except Exception as e:
+        print(f"Error parsing Gemini response: {e}")
+        return None
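
Reviewer note: a standalone check of parse_entities without loading any model (the entity dict mimics the output format of transformers' aggregation_strategy="simple"; the score/offsets are illustrative):

    from utils import parse_entities

    entities = [{"entity_group": "MISC", "word": "coffee", "score": 0.99, "start": 15, "end": 21}]
    amount, currency, item = parse_entities(entities, "spent 5 eur on coffee")
    print(amount, currency, item)  # 5.0 EUR coffee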