Improved NLP Logic
Files changed:
- app.py +9 -5
- config.py +27 -0
- handler.py +105 -0
- model_setup.py +5 -0
- nlp_service.py +107 -801
- requirements.txt +6 -1
- utils.py +141 -0
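This commit replaces the monolithic spaCy + Gemini implementation in nlp_service.py (801 lines removed) with a slimmer design built on Hugging Face transformer pipelines: model loading moves to model_setup.py, shared configuration to config.py, entity parsing to utils.py, and an AWS-Lambda-style entry point is added in handler.py. app.py is updated to branch on the new three-way status contract ("success", "failed", "fallback_required") that analyze_text returns.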
app.py
CHANGED
@@ -12,7 +12,7 @@ from paddleocr import PaddleOCR
 from PIL import Image

 # --- NEW: Import the NLP analysis function ---
-from nlp_service import
+from nlp_service import analyze_text # Corrected import

 # --- Configuration ---
 LANG = 'en' # Default language, can be overridden if needed
@@ -292,13 +292,17 @@ def process_message():
     nlp_error = None
     try:
         # Call the imported analysis function
-        nlp_analysis_result =
+        nlp_analysis_result = analyze_text(text_message) # Corrected function call
         print(f"NLP Service Analysis Result: {nlp_analysis_result}")
-        # Check if the NLP analysis itself reported an error/failure
-
+        # Check if the NLP analysis itself reported an error/failure or requires fallback
+        status = nlp_analysis_result.get("status")
+        if status == "failed":
             nlp_error = nlp_analysis_result.get("message", "NLP processing failed")
             # Return the failure result from NLP service
-            return jsonify(nlp_analysis_result), 400 #
+            return jsonify(nlp_analysis_result), 400 # Use 400 for client-side errors like empty text
+        elif status == "fallback_required":
+            # Return the fallback result (e.g., for queries)
+            return jsonify(nlp_analysis_result), 200 # Return 200, but indicate fallback needed

         # Return the successful analysis result
         return jsonify(nlp_analysis_result)
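With this change app.py simply relays whatever analyze_text returns, choosing the HTTP code from the status field. For illustration (values are examples, not captured output), a fallback response passed through with HTTP 200 would look like:

    {
      "status": "fallback_required",
      "message": "Intent requires further processing (e.g., query engine - not implemented).",
      "original_text": "how much did I spend last month",
      "classified_intent": "query"
    }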
config.py
ADDED

import re

# --- NLP Configuration ---
CURRENCY_SYMBOLS = ["₹", "$", "€", "£"] # Expand as needed

# More robust regex to find monetary values even if spaCy misses MONEY entity
# Added a group to capture standalone numbers potentially without currency symbols nearby
FALLBACK_AMOUNT_REGEX = re.compile(r'([\$€£₹]|\b(?:rs|usd|eur|gbp))\s?([\d,]+(?:\.\d{1,2})?)\b|\b([\d,]+(?:\.\d{1,2})?)\s?([\$€£₹]|\b(?:rupees|rs|dollars|euros|pounds|usd|eur|gbp))\b|\b([\d,]+(?:\.\d{1,2})?)\b', re.IGNORECASE)

# Consolidated Category Keywords
CATEGORY_KEYWORDS = {
    "Coffee": ["coffee", "latte", "cappuccino", "starbucks", "cafe", "café", "espresso", "mocha", "ccd"],
    "Food": ["food", "meal", "lunch", "dinner", "snack", "restaurant", "dining", "sandwich", "burger", "pizza"],
    "Groceries": ["groceries", "supermarket", "vegetables", "milk", "market", "zepto", "blinkit", "bigbasket"],
    "Entertainment": ["movie", "cinema", "concert", "game", "netflix", "spotify", "tickets", "fun"],
    "Transport": ["travel", "taxi", "flight", "train", "bus", "uber", "ola", "fuel", "gas", "lyft", "cab", "ticket", "metro", "auto", "rickshaw", "commute"], # Combined Travel/Transport
    "Shopping": ["shop", "shopping", "clothes", "electronics", "mall", "amazon", "flipkart", "purchase", "order", "store"],
    "Utilities": ["utility", "utilities", "bill", "electricity", "water", "internet", "phone", "recharge"],
    "Rent": ["rent", "lease"],
    "Income": ["salary", "received", "credited", "deposit", "income"], # Added income keyword
    "Investment": ["invest", "stock", "shares", "mutual fund", "sip", "investment"], # Added investment keyword
    # "Misc" can be the default if no keywords match
}

# Keywords for intent detection (less critical if using zero-shot, but can be helpers)
QUERY_KEYWORDS = ["how much", "show me", "list", "what are", "total", "summary", "spending", "history", "report", "biggest", "view"]
ADD_EXPENSE_VERBS = ["spent", "bought", "paid", "cost", "charged", "expensed", "got", "had"] # Verbs often associated with spending
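A quick sketch of how FALLBACK_AMOUNT_REGEX resolves its three alternatives; the sample strings are illustrative, not from the repo:

    from config import FALLBACK_AMOUNT_REGEX

    for sample in ["$100.50", "spent 100 rupees", "paid 250 for lunch"]:
        m = FALLBACK_AMOUNT_REGEX.search(sample)
        if m:
            # groups 1/2: symbol-then-amount ("$100.50"); groups 3/4:
            # amount-then-symbol ("100 rupees"); group 5: bare number
            # with no currency marker nearby ("250")
            amount = m.group(2) or m.group(3) or m.group(5)
            currency = m.group(1) or m.group(4)
            print(f"{sample!r} -> amount={amount}, currency={currency}")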
handler.py
ADDED

import json
# Remove direct model/util imports if calling analyze_text
# from model_setup import zero_shot, ner
# from utils import parse_entities
# from config import CATEGORY_KEYWORDS

# Import the centralized analysis function
from nlp_service import analyze_text

def lambda_handler(event, context):
    # ... (Keep body parsing logic) ...
    body_str = event.get("body", "{}")
    try:
        body = json.loads(body_str)
    except json.JSONDecodeError:
        print(f"Error decoding JSON body: {body_str}")
        return {
            "statusCode": 400,
            "body": json.dumps({"error": "Invalid JSON in request body"})
        }

    text = body.get("text", "")
    if not text:
        return {
            "statusCode": 400,
            "body": json.dumps({"error": "Missing 'text' field in request body"})
        }

    print(f"Processing text via nlp_service: {text}") # Log input

    # Call the centralized NLP service function
    try:
        analysis_result = analyze_text(text)
        status = analysis_result.get("status")

        if status == "failed":
            print(f"NLP analysis failed: {analysis_result.get('message')}")
            # Return 400 for input errors, 500 for internal NLP errors?
            # Let's return 400 if it's a known failure from analyze_text
            return {
                "statusCode": 400,
                "body": json.dumps(analysis_result)
            }
        elif status == "fallback_required":
            print(f"NLP analysis requires fallback: {analysis_result.get('message')}")
            # Return 200 but indicate fallback needed
            return {
                "statusCode": 200,
                "body": json.dumps(analysis_result)
            }
        elif status == "success":
            print(f"NLP analysis successful: {analysis_result}")
            # Return the successful analysis result
            return {
                "statusCode": 200,
                "body": json.dumps(analysis_result) # Already contains status
            }
        else:
            # Should not happen if analyze_text always returns a status
            print(f"Error: Unknown status from analyze_text: {status}")
            return {
                "statusCode": 500,
                "body": json.dumps({"error": "Internal server error: Unexpected NLP response"})
            }

    except Exception as e:
        print(f"Error calling analyze_text from handler: {e}")
        import traceback
        traceback.print_exc()
        return {
            "statusCode": 500,
            "body": json.dumps({"error": "Internal server error during NLP processing", "details": str(e)})
        }

# Example event structure (for local testing if needed)
if __name__ == '__main__':
    # ... (Keep example test cases, they should still work) ...
    example_event = {
        "body": json.dumps({
            "text": "spent 5 eur on coffee"
        })
    }
    context = {}
    response = lambda_handler(example_event, context)
    print("\n--- Lambda Response ---")
    # The body is already a JSON string containing the result from analyze_text
    print(json.dumps(json.loads(response['body']), indent=2))

    example_event_query = {
        "body": json.dumps({
            "text": "how much did I spend last month"
        })
    }
    response_query = lambda_handler(example_event_query, context)
    print("\n--- Lambda Response (Query) ---")
    print(json.dumps(json.loads(response_query['body']), indent=2))

    example_event_income = {
        "body": json.dumps({
            "text": "salary credited 50000"
        })
    }
    response_income = lambda_handler(example_event_income, context)
    print("\n--- Lambda Response (Income) ---")
    print(json.dumps(json.loads(response_income['body']), indent=2))
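For reference, a successful invocation wraps the analyze_text result in the Lambda body, so the decoded payload has the shape below. Field values are illustrative; the exact amount, currency, and item depend on the NER output and utils.parse_entities:

    {
      "status": "success",
      "type": "expense",
      "category": "Coffee",
      "amount": 5.0,
      "currency": "eur",
      "item": "coffee"
    }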
model_setup.py
ADDED

from transformers import pipeline

# Load once and reuse
zero_shot = pipeline("zero-shot-classification", model="joeddav/xlm-roberta-large-xnli")
ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
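Both pipelines are built at import time, so the first import of model_setup pays the full model download/load cost. If cold-start time matters (e.g., on Lambda), a lazy variant is one option; this is a sketch, not part of the commit:

    from functools import lru_cache
    from transformers import pipeline

    @lru_cache(maxsize=None)
    def get_zero_shot():
        # Built on first call, then cached for the life of the process
        return pipeline("zero-shot-classification", model="joeddav/xlm-roberta-large-xnli")

    @lru_cache(maxsize=None)
    def get_ner():
        return pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")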
nlp_service.py
CHANGED
@@ -1,814 +1,120 @@

Removed (old spaCy + Gemini implementation):

import
import
import
from
from collections import defaultdict
import logging
import os # To handle potential model loading issues
import requests # Add requests for API calls
import json # For handling JSON data
import os # Already imported, needed for API key

logging.basicConfig(level=logging.INFO)

# --- Load spaCy Model ---
# Using medium model for better accuracy and word vectors (though not used explicitly yet)
# Handle potential errors during model loading
try:
    # Check if running in an environment where models might be linked differently
    # (e.g., Google Cloud Functions sometimes needs explicit path)
    model_name = "en_core_web_md"
    if not spacy.util.is_package(model_name):
        print(f"spaCy model '{model_name}' not found as package. Attempting download...")
        spacy.cli.download(model_name)

    nlp = spacy.load(model_name)
    logging.info(f"Successfully loaded spaCy model '{model_name}'")
except (OSError, ImportError) as e:
    logging.error(f"Could not load spaCy model '{model_name}'. Error: {e}")
    logging.error("Ensure the model is downloaded: python -m spacy download en_core_web_md")
    # Fallback or exit - for now, we'll log and potentially fail later if nlp isn't loaded
    nlp = None # Indicate model loading failed

# --- In-Memory Data Storage (Replace with Database) ---
expenses = []
next_expense_id = 1

# --- NLP Configuration & Helpers ---
CURRENCY_SYMBOLS = ["₹", "$", "€", "£"] # Expand as needed
# More robust regex to find monetary values even if spaCy misses MONEY entity
FALLBACK_AMOUNT_REGEX = re.compile(r'([\$€£₹]|\b(?:rs|usd|eur|gbp))\s?([\d,]+(?:\.\d{1,2})?)\b|\b([\d,]+(?:\.\d{1,2})?)\s?([\$€£₹]|\b(?:rupees|rs|dollars|euros|pounds|usd|eur|gbp))\b', re.IGNORECASE)

# Category keywords remain useful
CATEGORY_KEYWORDS = {
    "food": ["food", "meal", "lunch", "dinner", "snack", "restaurant", "dining", "groceries", "sandwich", "burger", "pizza"],
    "coffee": ["coffee", "latte", "cappuccino", "espresso", "cafe", "starbucks", "ccd", "café", "mocha"],
    "travel": ["travel", "taxi", "flight", "train", "bus", "uber", "ola", "fuel", "gas", "lyft", "cab", "ticket"],
    "shopping": ["shop", "shopping", "clothes", "electronics", "mall", "amazon", "flipkart", "purchase", "order", "store"],
    "groceries": ["groceries", "supermarket", "zepto", "blinkit", "bigbasket", "vegetables", "milk", "market"],
    "utilities": ["utility", "utilities", "bill", "electricity", "water", "internet", "phone", "recharge"],
    "entertainment": ["movie", "cinema", "concert", "game", "fun", "netflix", "spotify", "tickets"],
    "rent": ["rent", "lease"],
    "transport": ["transport", "metro", "auto", "rickshaw", "commute"]
}

# Keywords for intent detection (can be less critical now, intent inferred more from entities)
QUERY_KEYWORDS = ["how much", "show me", "list", "what are", "total", "summary", "spending", "history", "report", "biggest", "view"]
ADD_EXPENSE_VERBS = ["spent", "bought", "paid", "cost", "charged", "expensed", "got", "had"] # Verbs often associated with spending


def parse_money_entity(text, doc):
    """
    Returns the amount as float and identified currency symbol/code.
    """
    amount = None
    currency = None
    text = text.replace(',', '') # Remove commas for easier parsing

    # 1. Try spaCy MONEY entities first
    money_ents = [ent for ent in doc.ents if ent.label_ == "MONEY"]
    if money_ents:
        # Prioritize longer entities or ones closer to verbs like 'spent' if multiple found
        # Simple approach: take the first one for now
        ent_text = money_ents[0].text.replace(',', '')
        # Try to extract number and symbol/code from the entity text
        num_match = re.search(r'([\d\.]+)', ent_text)
        if num_match:
            try:
                amount = float(num_match.group(1))
                # Try to find a known symbol or code within the entity text
                symbol_match = re.search(r'([\$€£₹])', ent_text)
                if symbol_match:
                    currency = symbol_match.group(1)
                else:
                    # Check for codes like USD, GBP etc. (simple check)
                    code_match = re.search(r'\b(USD|EUR|GBP|INR|RS)\b', ent_text, re.IGNORECASE)
                    if code_match:
                        currency = code_match.group(1).upper()
                        # Standardize common ones
                        if currency == "RS": currency = "INR"

                # If amount found but no currency symbol in entity, check doc context
                if amount is not None and currency is None:
                    for token in doc:
                        if token.text in CURRENCY_SYMBOLS:
                            currency = token.text
                            break
                return amount, currency
            except ValueError:
                pass # Failed to convert number

    # 2. Fallback Regex (if spaCy missed it or parsing failed)
    match = FALLBACK_AMOUNT_REGEX.search(text)
    if match:
        try:
            if match.group(2): # Format: $100 or Rs 100
                amount = float(match.group(2))
                currency_text = match.group(1)
            elif match.group(3): # Format: 100 dollars or 100 Rs
                amount = float(match.group(3))
                currency_text = match.group(4)
            else: # Should not happen with this regex, but safety first
                return None, None

            # Normalize currency symbol/code
            if currency_text in CURRENCY_SYMBOLS:
                currency = currency_text
            else:
                currency_text = currency_text.lower()
                if currency_text in ["rs", "rupees"]: currency = "₹" # Or INR
                elif currency_text in ["dollars", "usd"]: currency = "$" # Or USD
                elif currency_text in ["pounds", "gbp"]: currency = "£" # Or GBP
                elif currency_text in ["euros", "eur"]: currency = "€" # Or EUR

            return amount, currency

        except (ValueError, IndexError):
            logging.warning(f"Regex fallback failed to parse amount from: {text}")
            return None, None

    return None, None # No amount found

def parse_date_entities(doc):
    """
    Uses dateparser to interpret spaCy DATE entities.
    Returns the *most likely* date found, defaulting to today.
    """
    dates = []
    # Settings for dateparser: prefer past dates for expenses
    settings = {'PREFER_DATES_FROM': 'past', 'RELATIVE_BASE': datetime.datetime.now()}

    date_ents = [ent.text for ent in doc.ents if ent.label_ == "DATE"]
    logging.debug(f"Found DATE entities: {date_ents}")

    if date_ents:
        for date_str in date_ents:
            # Sometimes spaCy includes words like "on", "last" in the entity, dateparser handles this
            parsed = dateparser.parse(date_str, settings=settings)
            if parsed:
                dates.append(parsed.date())

    if dates:
        # Heuristic: If multiple dates, prefer the one closest to today? Or just the first?
        # Let's prefer the latest valid past date found (most recent expense)
        past_dates = [d for d in dates if d <= datetime.date.today()]
        if past_dates:
            return max(past_dates) # Return the most recent valid date
        elif dates:
            return min(dates) # If only future dates found, return the earliest one (less likely for expense)

    # Fallback if no DATE entity found or parsed
    logging.debug("No valid DATE entity found or parsed, defaulting to today.")
    return datetime.date.today()

def identify_merchant_and_category(doc):
    """
    Identifies merchant using ORG/PERSON/GPE entities and context.
    Identifies category using keywords and context around amount/merchant.
    """
    merchant = None
    category = "Uncategorized" # Default

    money_token_indices = [token.i for token in doc if token.like_num or token.text in CURRENCY_SYMBOLS or any(sym in token.text for sym in CURRENCY_SYMBOLS) or (token.ent_type_ == "MONEY")]

    potential_merchants = []
    for ent in doc.ents:
        if ent.label_ in ["ORG", "PERSON", "GPE", "FAC"]: # Facility might also be relevant
            # Check context: is it preceded by "at", "from", "in"? Is it near the money amount?
            prepositions = {"at", "from", "in", "on", "with"}
            # Check token before the entity start
            if ent.start > 0 and doc[ent.start - 1].lower_ in prepositions:
                potential_merchants.append(ent.text)
                continue
            # Check dependency relation (e.g., object of preposition)
            if ent.root.head.lemma_ in prepositions:
                potential_merchants.append(ent.text)
                continue
            # Check proximity to money amount if indices available
            if money_token_indices:
                min_dist = min(abs(ent.start - idx) for idx in money_token_indices)
                if min_dist < 5: # Arbitrary proximity threshold
                    potential_merchants.append(ent.text)
                    continue

    if potential_merchants:
        # Simple heuristic: choose the first likely one. Could be refined.
        # Filter out very common words or locations if needed (e.g., "City", "Bank" if too generic)
        merchant = potential_merchants[0].strip()
        logging.debug(f"Identified potential merchant: {merchant} from entities {potential_merchants}")

    # --- Category Identification ---
    text_lower = doc.text.lower()

    # 1. Check explicit category keywords
    found_category = None
    matched_keywords = []
    for cat, keywords in CATEGORY_KEYWORDS.items():
        if any(keyword in text_lower for keyword in keywords):
            # If multiple categories match, prioritize based on merchant or context?
            # Simple approach: Store all matches for now
            matched_keywords.append(cat)

    if len(matched_keywords) == 1:
        found_category = matched_keywords[0]
    elif len(matched_keywords) > 1:
        # Ambiguity - Requires smarter logic. E.g., "Coffee at Food court" -> Coffee or Food?
        # Prioritize based on merchant if known? E.g. if merchant is Starbucks -> Coffee
        if merchant:
            merchant_lower = merchant.lower()
            if "starbucks" in merchant_lower or "ccd" in merchant_lower or "café" in merchant_lower:
                if "coffee" in matched_keywords: found_category = "coffee"
            elif "amazon" in merchant_lower or "flipkart" in merchant_lower:
                if "shopping" in matched_keywords: found_category = "shopping"
            elif "zepto" in merchant_lower or "blinkit" in merchant_lower or "groceries" in merchant_lower:
                if "groceries" in matched_keywords: found_category = "groceries"
                elif "food" in matched_keywords: found_category = "groceries" # Prefer specific

        # If still ambiguous, maybe pick the most specific one (e.g., prefer 'coffee' over 'food')
        if not found_category:
            if "coffee" in matched_keywords: found_category = "coffee"
            elif "groceries" in matched_keywords: found_category = "groceries"
            elif "transport" in matched_keywords: found_category = "transport"
            # Add more specific priorities if needed
            elif "food" in matched_keywords: found_category = "food" # More general last
            else: found_category = matched_keywords[0] # Default to first match if no rules apply

    if found_category:
        category = found_category
    # 2. (Optional/Advanced) Infer from merchant if category is Uncategorized
    elif merchant and category == "Uncategorized":
        merchant_lower = merchant.lower()
        if "starbucks" in merchant_lower or "ccd" in merchant_lower or "café" in merchant_lower: category = "coffee"
        elif "amazon" in merchant_lower or "flipkart" in merchant_lower: category = "shopping"
        elif "zepto" in merchant_lower or "blinkit" in merchant_lower: category = "groceries"
        elif "uber" in merchant_lower or "ola" in merchant_lower: category = "travel"
        elif "netflix" in merchant_lower or "spotify" in merchant_lower: category = "entertainment"
        # Add more merchant->category mappings

    # 3. (Optional/Advanced) Use Dependency Parsing or Word Vectors
    # Example: Look for nouns that are objects of spending verbs near the amount
    # This requires more complex linguistic analysis.

    logging.debug(f"Identified Category: {category}")
    return merchant, category

def determine_intent(doc):
    """Determines intent: 'add_expense', 'query_expense', or 'unknown'."""
    text_lower = doc.text.lower()

    has_query_keyword = any(keyword in text_lower for keyword in QUERY_KEYWORDS)
    has_add_verb = any(verb.lemma_ in ADD_EXPENSE_VERBS for verb in doc if verb.pos_ == "VERB")
    has_money_entity = any(ent.label_ == "MONEY" for ent in doc.ents) or FALLBACK_AMOUNT_REGEX.search(text_lower) is not None

    # More explicit questions are likely queries
    if doc[0].pos_ == "AUX" or doc[0].lemma_ in ["what", "how", "show", "list", "view"]: # Starts like a question
        return "query_expense"

    if has_query_keyword:
        return "query_expense"

    # If it has a spending verb and a money amount, likely adding expense
    if has_add_verb and has_money_entity:
        return "add_expense"

    # If it just has a money amount and maybe date/merchant, could be adding expense (implicit verb)
    if has_money_entity and not has_query_keyword:
        # Check if there are nouns suggesting items bought
        has_object_noun = any(tok.pos_ == "NOUN" and tok.dep_ in ["dobj", "pobj", "attr"] for tok in doc)
        if has_object_noun or any(ent.label_ in ["ORG", "PRODUCT"] for ent in doc.ents):
            return "add_expense"

    return "unknown"

# --- Filtering and Formatting (largely reused, minor adjustments) ---

def filter_expenses(criteria):
    """Filters the global 'expenses' list based on criteria."""
    # (This function remains largely the same as the previous version)
    filtered = expenses

    # Filter by Category
    if 'category' in criteria and criteria['category'] is not None:
        target_cat = criteria['category'].lower()
        # Handle general 'food' query including 'coffee', 'groceries' etc.
        food_related_cats = {'food', 'coffee', 'groceries', 'restaurant'} # Define food-related categories
        if target_cat == 'food':
            filtered = [e for e in filtered if e['category'].lower() in food_related_cats]
        else:
            filtered = [e for e in filtered if e['category'].lower() == target_cat]

    # Filter by Date Range (start_date and end_date are inclusive)
    if 'start_date' in criteria and criteria['start_date'] is not None:
        filtered = [e for e in filtered if e['date'] >= criteria['start_date']]
    if 'end_date' in criteria and criteria['end_date'] is not None:
        filtered = [e for e in filtered if e['date'] <= criteria['end_date']]

    # Filter by Merchant (case-insensitive substring match)
    if 'merchant' in criteria and criteria['merchant'] is not None:
        target_merchant = criteria['merchant'].lower()
        filtered = [e for e in filtered if e['merchant'] and target_merchant in e['merchant'].lower()]

    return filtered

def parse_date_range_from_query(doc):
    """Parses date ranges specifically for queries (e.g., 'this month', 'last week')."""
    # (This function remains largely the same, using dateparser on DATE entities or keywords)
    today = datetime.date.today()
    text_lower = doc.text.lower() # Use full text for keywords like "this month"
    start_date, end_date = None, None

    # Prioritize DATE entities found by spaCy
    date_ents_text = [ent.text for ent in doc.ents if ent.label_ == "DATE"]
    parsed_dates = []
    settings = {'PREFER_DATES_FROM': 'past', 'RELATIVE_BASE': datetime.datetime.now()}

    for date_str in date_ents_text:
        # Try parsing as a potential range using dateparser's experimental range feature (or parse single dates)
        # For simplicity, we'll stick to parsing single points and let keyword logic handle ranges
        parsed = dateparser.parse(date_str, settings=settings)
        if parsed:
            parsed_dates.append(parsed.date())

    # If spaCy found specific dates, use them
    if len(parsed_dates) == 1:
        start_date = end_date = parsed_dates[0]
    elif len(parsed_dates) > 1:
        # Ambiguous, maybe take min/max? Or rely on keywords below?
        start_date = min(parsed_dates)
        end_date = max(parsed_dates)
        if start_date > end_date: # Swap if order is wrong
            start_date, end_date = end_date, start_date

    # If no specific date entities, check for range keywords
    if start_date is None and end_date is None:
        if "today" in text_lower:
            start_date = end_date = today
        elif "yesterday" in text_lower:
            start_date = end_date = today - datetime.timedelta(days=1)
        elif "this week" in text_lower:
            start_of_week = today - datetime.timedelta(days=today.weekday()) # Monday
            end_of_week = start_of_week + datetime.timedelta(days=6) # Sunday
            start_date = start_of_week
            end_date = end_of_week
        elif "last week" in text_lower:
            end_of_last_week = today - datetime.timedelta(days=today.weekday() + 1) # Last Sunday
            start_of_last_week = end_of_last_week - datetime.timedelta(days=6) # Last Monday
            start_date = start_of_last_week
            end_date = end_of_last_week
        elif "this month" in text_lower:
            start_date = today.replace(day=1)
            next_month = today.replace(day=28) + datetime.timedelta(days=4)
            last_day_of_month = next_month - datetime.timedelta(days=next_month.day)
            end_date = last_day_of_month
        elif "last month" in text_lower:
            first_day_of_current_month = today.replace(day=1)
            last_day_of_last_month = first_day_of_current_month - datetime.timedelta(days=1)
            first_day_of_last_month = last_day_of_last_month.replace(day=1)
            start_date = first_day_of_last_month
            end_date = last_day_of_last_month
        elif "year" in text_lower: # e.g., "this year", "last year"
            if "this year" in text_lower:
                start_date = datetime.date(today.year, 1, 1)
                end_date = datetime.date(today.year, 12, 31)
            elif "last year" in text_lower:
                start_date = datetime.date(today.year - 1, 1, 1)
                end_date = datetime.date(today.year - 1, 12, 31)
            # Check for specific year like "in 2023"
            year_match = re.search(r'\b(in|for)\s+(\d{4})\b', text_lower)
            if year_match:
                year = int(year_match.group(2))
                start_date = datetime.date(year, 1, 1)
                end_date = datetime.date(year, 12, 31)

        # Add specific month parsing ("in January") if needed (similar to previous version)
        else:
            month_match = re.search(r'\b(in|for)\s+(january|february|march|april|may|june|july|august|september|october|november|december)\b', text_lower)
            if month_match:
                month_name = month_match.group(2)
                year_context = today.year # Assume current year
                # Check if a year was mentioned nearby
                year_ent = [e.text for e in doc.ents if e.label_ == "DATE" and e.text.isdigit() and len(e.text)==4]
                if year_ent:
                    year_context = int(year_ent[0])
                try:
                    month_num = list(datetime.date(2000, i, 1).strftime('%B').lower() for i in range(1, 13)).index(month_name) + 1
                    start_date = datetime.date(year_context, month_num, 1)
                    next_m = (start_date.replace(day=28) + datetime.timedelta(days=4))
                    end_date = next_m - datetime.timedelta(days=next_m.day)
                except (ValueError, IndexError): pass # Ignore invalid month/year

    logging.debug(f"Parsed date range for query: {start_date} to {end_date}")
    return start_date, end_date

def format_expense_list(expense_list, title="Here are the expenses:"):
    """Formats a list of expenses into a user-friendly string."""
    # (This function remains largely the same)
    if not expense_list:
        return "No expenses found matching your criteria."

    total_amount = sum(e['amount'] for e in expense_list)
    # Try to get a consistent currency symbol, default to first expense's symbol or fallback
    currency_symbol = expense_list[0].get("currency") or "₹" if expense_list else "₹"

    response_lines = [title]
    for expense in expense_list:
        amount_str = f"{expense.get('currency') or currency_symbol}{expense['amount']:.2f}"
        merchant_part = f" at {expense['merchant']}" if expense['merchant'] else ""
        category_part = f" ({expense['category']})" if expense['category'] != 'Uncategorized' else ""
        date_str = expense['date'].strftime("%b %d, %Y")
        response_lines.append(f"- {amount_str}{category_part}{merchant_part} - {date_str}")

    if len(expense_list) > 1:
        total_str = f"{currency_symbol}{total_amount:.2f}"
        response_lines.append(f"Total: {total_str}")

    return "\n".join(response_lines)

# --- NEW: Core NLP Processing Function ---
def analyze_expense_text(text):
    """
    ...
    """
    if nlp is None:
        logging.error("spaCy model not loaded. Cannot process text.")
        return {"action": "error", "status": "failed", "message": "NLP model not available"}

    logging.info(f"Analyzing text: {text[:100]}...") # Log snippet
    doc = nlp(text)
    logging.debug(f"spaCy Entities: {[(ent.text, ent.label_) for ent in doc.ents]}")

    intent = determine_intent(doc)
    logging.info(f"Determined Intent: {intent}")
    response_data = {}

    if intent == "add_expense":
        amount, currency = parse_money_entity(text, doc)
        expense_date = parse_date_entities(doc)
        merchant, category = identify_merchant_and_category(doc)

        if amount is not None:
            currency_symbol = currency or "₹" # Default currency
            new_expense = {
                "id": next_expense_id,
                "amount": amount,
                "currency": currency_symbol,
                "category": category,
                "merchant": merchant,
                "date": expense_date, # Keep as date object internally
                "original_message": text
            }
            expenses.append(new_expense)
            next_expense_id += 1
            logging.info(f"Added expense (in-memory): {new_expense}")

            merchant_part = f" at {merchant}" if merchant else ""
            date_str = expense_date.strftime('%b %d, %Y')
            confirmation_msg = f"✅ Expense added: {currency_symbol}{amount:.2f} for {category}{merchant_part} on {date_str}."

            new_expense_serializable = new_expense.copy()
            new_expense_serializable["date"] = new_expense["date"].isoformat()

            response_data = {
                "action": "add_expense",
                "status": "success",
                "message": confirmation_msg,
                "details": new_expense_serializable
            }
        else:
            logging.warning(f"Could not extract amount reliably from: {text}")
            response_data = {
                "action": "add_expense",
                "status": "failed",
                "message": f"Sorry, I couldn't understand the amount. Please include it clearly (e.g., '₹500', '$20', '15 pounds')."
            }

    elif intent == "query_expense":
        logging.info("Processing query intent.")
        query_criteria = {}
        _q_merchant, q_category = identify_merchant_and_category(doc)

        # ... (rest of query criteria extraction logic remains the same) ...
        query_cat_found = None
        text_lower = doc.text.lower()
        for cat, keywords in CATEGORY_KEYWORDS.items():
            if any(keyword in text_lower for keyword in keywords):
                if cat == 'food' or q_category == 'food':
                    query_cat_found = 'food'
                    break
                query_cat_found = q_category if q_category != 'Uncategorized' else cat
                break

        query_criteria['category'] = query_cat_found
        query_criteria['merchant'] = _q_merchant
        start_date, end_date = parse_date_range_from_query(doc)
        query_criteria['start_date'] = start_date
        query_criteria['end_date'] = end_date

        logging.info(f"Query Criteria: {query_criteria}")
        results = filter_expenses(query_criteria)
        response_message = ""

        # ... (rest of query response formatting logic remains the same) ...
        if results and ("total" in text_lower or "sum" in text_lower or "how much" in doc[0].lower_):
            total_amount = sum(e['amount'] for e in results)
            currency_symbol = results[0].get("currency") or "₹"
            category_filter_text = f" on {query_criteria['category']}" if query_criteria['category'] else ""
            date_filter_text = ""
            if start_date and end_date and start_date == end_date: date_filter_text = f" for {start_date.strftime('%b %d, %Y')}"
            elif start_date and end_date: date_filter_text = f" from {start_date.strftime('%b %d')} to {end_date.strftime('%b %d, %Y')}"
            elif start_date: date_filter_text = f" since {start_date.strftime('%b %d, %Y')}"
            elif end_date: date_filter_text = f" until {end_date.strftime('%b %d, %Y')}"
            response_message = f"Your total spending{category_filter_text}{date_filter_text} is {currency_symbol}{total_amount:.2f}."
            if len(results) <= 10:
                response_message += "\n" + format_expense_list(results, "Details:")
            else:
                response_message += f" (from {len(results)} transactions)"
        elif results and ("biggest" in text_lower or "largest" in text_lower or "top" in text_lower):
            top_n = 3
            top_expenses = sorted(results, key=lambda x: x['amount'], reverse=True)[:top_n]
            response_message = format_expense_list(top_expenses, f"Your top {len(top_expenses)} expenses:")
        else:
            date_filter_desc = ""
            if start_date and end_date and start_date == end_date: date_filter_desc = f" from {start_date.strftime('%b %d, %Y')}"
            elif start_date or end_date: date_filter_desc = " matching the date criteria"
            category_filter_desc = f" for {query_criteria['category']}" if query_criteria['category'] else ""
            merchant_filter_desc = f" at {query_criteria['merchant']}" if query_criteria['merchant'] else ""
            title = f"Expenses{category_filter_desc}{merchant_filter_desc}{date_filter_desc}:"
            response_message = format_expense_list(results, title)

        response_data = {
            "action": "query_expense",
            "status": "success",
            "message": response_message,
            "criteria": {k: v.isoformat() if isinstance(v, datetime.date) else v for k, v in query_criteria.items() if v is not None},
            "results_count": len(results)
        }

    else: # intent == "unknown"
        logging.info(f"Local NLP intent unknown for: {text}. Attempting Gemini API call.")

        # --- Call Gemini API ---
        gemini_result = call_gemini_api(text, GEMINI_API_KEY)

        if (gemini_result and isinstance(gemini_result, dict) and gemini_result.get("action") in ["add_expense", "query_expense", "info"]):
            # If Gemini returned a structured result we can use (or an info message), return it
            logging.info(f"Using result from Gemini API. Action: {gemini_result.get('action')}")
            response_data = gemini_result
            # TODO: Potentially re-validate or re-process gemini_result here if needed
            # For example, if action is add_expense, ensure data types are correct, parse date string etc.
            # If action is query_expense, parse date strings etc.
            if response_data.get("action") == "add_expense" and "details" in response_data:
                # Basic post-processing/validation for added expense
                details = response_data["details"]
                try:
                    if "date" in details and isinstance(details["date"], str):
                        details["date"] = datetime.datetime.fromisoformat(details["date"].split("T")[0]).date()
                    if "amount" in details:
                        details["amount"] = float(details["amount"])
                    # Add expense to memory if Gemini successfully added it
                    # Note: This assumes Gemini provides all necessary fields correctly
                    if all(k in details for k in ["amount", "currency", "category", "date"]):
                        new_expense = {
                            "id": next_expense_id,
                            "amount": details["amount"],
                            "currency": details.get("currency", "₹"),
                            "category": details.get("category", "Uncategorized"),
                            "merchant": details.get("merchant"),
                            "date": details["date"],
                            "original_message": text
                        }
                        expenses.append(new_expense)
                        next_expense_id += 1
                        logging.info(f"Added expense (from Gemini): {new_expense}")
                        # Update message for consistency
                        # --- FIX: Check if date is valid before formatting ---
                        if isinstance(new_expense.get('date'), datetime.date):
                            date_str = new_expense['date'].strftime('%b %d, %Y')
                            response_data["message"] = f"✅ Expense added (via Gemini): {new_expense['currency']}{new_expense['amount']:.2f} for {new_expense['category']} on {date_str}."
                        else:
                            logging.warning(f"Gemini add_expense result had invalid date type: {type(new_expense.get('date'))}. Using default message.")
                            response_data["message"] = f"✅ Expense added (via Gemini): {new_expense['currency']}{new_expense['amount']:.2f} for {new_expense['category']} (date missing/invalid)."
                        # Make details serializable for JSON response
                        # Ensure date is serializable even if it was invalid earlier
                        if isinstance(response_data["details"].get("date"), datetime.date):
                            response_data["details"]["date"] = response_data["details"]["date"].isoformat()
                        else:
                            # Handle case where date might be None or wrong type after processing
                            response_data["details"]["date"] = None # Or some indicator of invalidity
                    else:
                        logging.warning("Gemini add_expense result missing required fields.")
                        response_data = {"action": "unknown", "status": "failed", "message": "Gemini suggested adding an expense, but details were incomplete."}

                except (ValueError, TypeError) as e:
                    logging.warning(f"Error processing Gemini add_expense details: {e}")
                    response_data = {"action": "unknown", "status": "failed", "message": "Could not process expense details suggested by Gemini."}

            elif response_data.get("action") == "query_expense" and "criteria" in response_data:
                # Basic post-processing for query
                criteria = response_data["criteria"]
                try:
                    if "start_date" in criteria and isinstance(criteria["start_date"], str):
                        criteria["start_date"] = datetime.datetime.fromisoformat(criteria["start_date"].split("T")[0]).date()
                    if "end_date" in criteria and isinstance(criteria["end_date"], str):
                        criteria["end_date"] = datetime.datetime.fromisoformat(criteria["end_date"].split("T")[0]).date()
                    # Execute the query based on Gemini's criteria
                    results = filter_expenses(criteria)
                    # Use Gemini's message or generate a new one
                    if not response_data.get("message"):
                        response_data["message"] = format_expense_list(results, "Query results (via Gemini):")
                    response_data["results_count"] = len(results)
                    # Make criteria serializable
                    response_data["criteria"] = {k: v.isoformat() if isinstance(v, datetime.date) else v for k, v in criteria.items() if v is not None}

                except (ValueError, TypeError) as e:
                    logging.warning(f"Error processing Gemini query_expense criteria: {e}")
                    response_data = {"action": "unknown", "status": "failed", "message": "Could not process query criteria suggested by Gemini."}

        else:
            # Fallback to original unknown message if Gemini fails or returns unusable data
            logging.info("Gemini API did not provide a usable structured result. Falling back to default unknown message.")
            response_data = {
                "action": "unknown",
                "status": "failed",
                "message": "Sorry, I couldn't quite understand that. Please try phrasing your expense or query differently. \nExamples:\n- 'Spent ₹50 on coffee yesterday at Starbucks'\n- 'Show my food expenses last week'\n- 'What was my total spending last month?'"
            }
            # Optionally include Gemini's raw suggestion if available and not structured
            if gemini_result and isinstance(gemini_result, dict) and "message" in gemini_result:
                response_data["message"] += f"\n\nGemini suggestion: {gemini_result['message']}"

    logging.info(f"Analysis complete. Action: {response_data.get('action')}, Status: {response_data.get('status')}") # Corrected closing parenthesis
    return response_data


# Placeholder for Gemini API Key - Load from environment variable
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

# Placeholder function for Gemini API call
def call_gemini_api(text, api_key):
    """
    Placeholder function to call the Gemini API.
    Replace with actual implementation.
    Should ideally return a dictionary similar to analyze_expense_text's output
    or None if the call fails or response is unusable.
    """
    if not api_key:
        logging.warning("GEMINI_API_KEY not set. Skipping Gemini API call.")
        return None

    # --- Replace with actual Gemini API endpoint and request structure ---
    # Example using Google AI Generative Language API (adjust model and endpoint as needed)
    # Ensure you have the google-generativeai library installed (`pip install google-generativeai`)
    # and the API key is correctly set as an environment variable.
    # Use a current model and the v1 endpoint
    model_name = "gemini-2.0-flash-lite" # Updated model name
    api_endpoint = f"https://generativelanguage.googleapis.com/v1/models/{model_name}:generateContent?key={api_key}"
    headers = {
        "Content-Type": "application/json"
    }
    # Construct the payload based on Gemini API requirements
    # This prompt asks Gemini to act like the existing NLP service
    # Corrected indentation for the prompt string
    prompt = f"""Analyze the following text for expense tracking. Determine the intent ('add_expense' or 'query_expense') and extract relevant details.

Text: "{text}"

Desired JSON output format:
{{
    "action": "add_expense" | "query_expense" | "unknown" | "info",
    "status": "success" | "failed",
    "message": "Confirmation or result summary or explanation",
    "details": {{ // Only for add_expense if successful
        "amount": <float>,
        "currency": "<string>", // e.g., "₹", "$", "EUR"
        "category": "<string>", // e.g., "food", "travel", "shopping"
        "merchant": "<string>", // e.g., "Starbucks", "Amazon"
        "date": "YYYY-MM-DD"
    }},
    "criteria": {{ // Only for query_expense if successful
        "category": "<string>",
        "merchant": "<string>",
        "start_date": "YYYY-MM-DD",
        "end_date": "YYYY-MM-DD"
    }}
}}

- If the intent is clearly 'add_expense' and details can be extracted, use action "add_expense" and status "success". Include extracted details.
- If the intent is clearly 'query_expense' and criteria can be extracted, use action "query_expense" and status "success". Include extracted criteria.
- If the intent is unclear, details are missing for adding, or it's a general question/statement not related to adding/querying expenses, use action "unknown" or "info" and status "failed" or "success" respectively. Provide a helpful message.
- Ensure date format is YYYY-MM-DD.
- Default currency to "₹" if not specified.
- Default category to "Uncategorized" if not specified.
Provide only the JSON output.
"""

    payload = json.dumps({
        "contents": [{
            "parts":[{ "text": prompt }]
        }]
        # Add generationConfig if needed (e.g., temperature, max output tokens)
        # "generationConfig": {
        #     "temperature": 0.7,
        #     "maxOutputTokens": 256
        # }
    })
    # --- End of placeholder section ---

    try:
        response = requests.post(api_endpoint, headers=headers, data=payload, timeout=30)
        response.raise_for_status()
        gemini_response_raw = response.json()

        # --- Process gemini_response ---
        content = None # Initialize content to None
        content_cleaned = None # Initialize content_cleaned to None
        # Extract the text content which should contain the JSON
        if 'candidates' in gemini_response_raw and len(gemini_response_raw['candidates']) > 0:
            content = gemini_response_raw['candidates'][0].get('content', {}).get('parts', [{}])[0].get('text')
            if content:
                logging.info(f"Gemini suggested JSON: {content}")
                # Clean potential markdown/code block formatting
                content_cleaned = content.strip().strip('```json').strip('```').strip()
                try:
                    # Attempt to parse the JSON string from Gemini
                    parsed_result = json.loads(content_cleaned)
                    # Basic validation of the parsed structure
                    if isinstance(parsed_result, dict) and "action" in parsed_result:
                        logging.info("Successfully parsed structured data from Gemini.")
                        # Add further validation/sanitization if needed
                        return parsed_result
                    else:
                        logging.warning("Gemini response parsed but lacks expected structure.")
                        # Return info message if structure is wrong but content exists
                        return {"action": "info", "status": "success", "message": f"Gemini suggestion: {content_cleaned}"}
                except json.JSONDecodeError as json_err:
                    logging.warning(f"Failed to decode JSON from Gemini response: {json_err}. Raw content: {content_cleaned}")
                    # Return the raw text as a message if JSON parsing fails but content exists
                    return {"action": "info", "status": "success", "message": f"Gemini suggestion: {content_cleaned}"}
            else:
                logging.warning("No text content found in Gemini response candidates.")
                return None
        else:
            logging.warning("No candidates found in Gemini API response.")
            return None

    except requests.exceptions.Timeout:
        logging.error("Gemini API call timed out.")
        return None
    except requests.exceptions.RequestException as e:
        logging.error(f"Gemini API call failed: {e}")
        # Log response body if available and indicates an API error
        if e.response is not None:
            try:
                logging.error(f"Gemini API error response: {e.response.json()}")
            except json.JSONDecodeError:
                logging.error(f"Gemini API error response (non-JSON): {e.response.text}")
        return None
    except Exception as e:
        logging.error(f"Unexpected error during Gemini API call: {e}")
        return None

# --- Flask Blueprint Setup (Optional: Keep if direct API access is needed) ---
nlp_bp = Blueprint('nlp_service', __name__)

@nlp_bp.route('/process_nlp', methods=['POST'])
def process_nlp_expense_route():
    """Flask route handler that calls the core analysis function."""
    data = request.get_json()
    if not data or 'message' not in data:
        logging.warning("Received request without 'message' field.")
        return jsonify({"error": "Missing 'message' in request body"}), 400

    result = analyze_expense_text(data['message'])
    return jsonify(result)
1 |
+
# filepath: c:\Users\Dell\Monil\Apps\code\Projects\space-songporter\OCR\nlp_service.py
|
2 |
+
import json
|
3 |
+
from model_setup import zero_shot, ner # Assuming model_setup.py exists and is correct
|
4 |
+
from utils import parse_entities # Assuming utils.py exists and is correct
|
5 |
+
from config import CATEGORY_KEYWORDS # Import categories from config
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
+
def analyze_text(text: str) -> dict:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
"""
|
9 |
+
Analyzes the input text for intent, entities, and category.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+    Args:
+        text: The input text string.
 
+    Returns:
+        A dictionary containing the analysis results (intent, category, amount, etc.)
+        or an error message.
     """
+    if not text:
+        return {
+            "status": "failed",
+            "message": "Input text cannot be empty."
         }
 
+    print(f"NLP Service: Processing text: {text}")
 
+    # Step 1: Intent classification
     try:
+        candidate_labels = ["expense", "investment", "query", "limit-setting", "income", "other"]
+        intent_result = zero_shot(text, candidate_labels=candidate_labels)
+        intent = intent_result["labels"][0]
+        score = intent_result["scores"][0]
+        print(f"NLP Service: Intent classification: {intent} (Score: {score:.2f})")
     except Exception as e:
+        print(f"NLP Service: Error during intent classification: {e}")
+        return {
+            "status": "failed",
+            "message": "Intent classification failed",
+            "error": str(e)
+        }
 
+    # Step 2: Check if intent requires fallback (e.g., Gemini route)
+    if intent == "query":
+        print(f"NLP Service: Intent classified as '{intent}'. Fallback route triggered.")
+        # Placeholder for potential future Gemini integration
+        return {
+            "status": "fallback_required",  # Use a specific status
+            "message": "Intent requires further processing (e.g., query engine - not implemented).",
+            "original_text": text,
+            "classified_intent": intent
+        }
 
+    # Step 3: Entity extraction (for non-fallback intents)
+    try:
+        entities = ner(text)
+        print(f"NLP Service: NER entities: {entities}")
+        amount, currency, item = parse_entities(entities, text)  # parse_entities also needs the full text for its regex pass
+        print(f"NLP Service: Parsed entities: Amount={amount}, Currency={currency}, Item={item}")
+    except Exception as e:
+        print(f"NLP Service: Error during entity extraction: {e}")
+        # Proceed with partial data rather than failing the whole request
+        amount, currency, item = None, None, None  # Default to None on error
+
+    # Step 4: Category matching using config.py
+    category = "Misc"  # Default
+    text_lower = text.lower()
+    item_lower = item.lower() if item else ""
+
+    # Check intent first for Income/Investment categories
+    if intent == "income":
+        category = "Income"
+    elif intent == "investment":
+        category = "Investment"
+    else:  # Only check keywords if not already classified as Income/Investment by intent
+        for cat, keywords in CATEGORY_KEYWORDS.items():
+            # Skip Income/Investment keywords here as intent handles them
+            if cat in ["Income", "Investment"]:
+                continue
+            if any(kw in text_lower or (item_lower and kw in item_lower) for kw in keywords):
+                category = cat
+                break  # Stop after first match
+
+    # Refine intent based on keywords if initial classification was 'other' or potentially wrong
+    if intent != "income" and category == "Income":
+        print("NLP Service: Correcting intent to 'income' based on keywords/category.")
+        intent = "income"
+    elif intent != "investment" and category == "Investment":
+        print("NLP Service: Correcting intent to 'investment' based on keywords/category.")
+        intent = "investment"
+    # If no specific category matched but intent is expense/other, ensure category isn't Income/Investment
+    elif category in ["Income", "Investment"] and intent not in ["income", "investment"]:
+        category = "Misc"  # Revert category if intent doesn't match
+
+    print(f"NLP Service: Assigned category: {category}")
+
+    # Final successful response structure
+    return {
+        "status": "success",
+        "type": intent,
+        "category": category,
+        "amount": amount,
+        "currency": currency,
+        "item": item
+    }
 
+# Example usage (for testing nlp_service.py directly)
+if __name__ == '__main__':
+    test_cases = [
+        "spent 5 eur on coffee",
+        "how much did I spend last month",
+        "salary credited 50000",
+        "invested 1000 in stocks",
+        "paid 20 usd for lunch",
+        "got groceries for 50 dollars",
+        "what was my total spending on food?",
+        "received 200 GBP deposit"
+    ]
+    for case in test_cases:
+        print(f"\n--- Testing: '{case}' ---")
+        result = analyze_text(case)
+        print(json.dumps(result, indent=2))
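For context, analyze_text depends on zero_shot and ner, which come from model_setup.py (changed in this commit but not shown in this excerpt). A minimal sketch of what that module presumably provides, using Hugging Face transformers pipelines; the model names below are assumptions, not taken from the commit:

# model_setup.py -- illustrative sketch only; the real file is not shown in this
# excerpt, and the model choices below are assumptions.
from transformers import pipeline

# Zero-shot classifier used for intent detection in analyze_text()
zero_shot = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# NER pipeline; aggregation_strategy="simple" yields dicts with the
# "entity_group" and "word" keys that utils.parse_entities() expects
ner = pipeline("ner", aggregation_strategy="simple")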
requirements.txt
CHANGED
@@ -5,5 +5,10 @@ paddlepaddle
 paddleocr
 spacy>=3.0.0 # Added spaCy
 dateparser>=1.0.0 # Added dateparser
+google-generativeai # Added for Gemini API
+python-dotenv # Added for loading .env files
 # Note: spaCy model 'en_core_web_md' needs to be downloaded separately:
-# python -m spacy download en_core_web_md
+# python -m spacy download en_core_web_md
+transformers
+torch
+sentencepiece
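The two new dependencies support the planned Gemini fallback for "query" intents. A minimal sketch of how they are typically wired together; the environment variable name and model choice are assumptions, and the actual integration would belong in handler.py, which is not shown in this excerpt:

# Hedged sketch: typical google-generativeai + python-dotenv wiring.
# GEMINI_API_KEY and the model name are assumptions, not taken from this commit.
import os

import google.generativeai as genai
from dotenv import load_dotenv

load_dotenv()  # Reads GEMINI_API_KEY from a local .env file; it must be set
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

model = genai.GenerativeModel("gemini-1.5-flash")
response = model.generate_content("spent 5 eur on coffee -- classify as JSON")
print(response.text)  # Raw text; see utils.parse_gemini_response below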
utils.py
ADDED
@@ -0,0 +1,141 @@
+import re
+import json
+from config import FALLBACK_AMOUNT_REGEX, CURRENCY_SYMBOLS  # Import regex and symbols
+
+def parse_entities(entities, full_text: str):
+    """
+    Extracts amount, currency, and item description from NER entities and full text.
+
+    Args:
+        entities: List of dictionaries from the NER pipeline.
+        full_text: The original input text string.
+
+    Returns:
+        A tuple: (amount, currency, item)
+    """
+    amount, currency, item = None, None, None
+    potential_amounts = []
+
+    # 1. Use the FALLBACK_AMOUNT_REGEX on the full text first - it's often more reliable
+    # Regex groups:
+    #   1: Symbol/Code before number ($, EUR, etc.)
+    #   2: Number when symbol/code is before
+    #   3: Number when symbol/code is after
+    #   4: Symbol/Code after number (rs, dollars, etc.)
+    #   5: Standalone number
+    for match in FALLBACK_AMOUNT_REGEX.finditer(full_text):
+        num_str = None
+        curr_symbol = None
+        curr_code = None
+
+        if match.group(1) and match.group(2):  # Symbol/Code before
+            curr_symbol = match.group(1)
+            num_str = match.group(2)
+        elif match.group(3) and match.group(4):  # Symbol/Code after
+            num_str = match.group(3)
+            curr_code = match.group(4)
+        elif match.group(5) and not match.group(1) and not match.group(4):  # Standalone number
+            num_str = match.group(5)
+
+        if num_str:
+            try:
+                value = float(num_str.replace(",", ""))
+                # Basic validation: avoid huge numbers unless they have decimals (might be IDs)
+                if value < 1_000_000 or '.' in num_str:
+                    potential_amounts.append({
+                        "value": value,
+                        "currency_symbol": curr_symbol,
+                        "currency_code": curr_code,
+                        "match_obj": match  # Store match object for position info later if needed
+                    })
+            except ValueError:
+                continue  # Ignore invalid numbers like "1,2,3"
+
+    # 2. Determine Amount and Currency from regex matches
+    if potential_amounts:
+        # Prioritize matches that included a currency symbol/code
+        currency_matches = [p for p in potential_amounts if p["currency_symbol"] or p["currency_code"]]
+        if currency_matches:
+            # Often the largest value with currency is the main one
+            best_match = max(currency_matches, key=lambda x: x["value"])
+            amount = best_match["value"]
+            # Determine currency from symbol/code
+            symbol = best_match["currency_symbol"]
+            code = best_match["currency_code"]
+            if symbol:
+                if "₹" in symbol: currency = "INR"
+                elif "$" in symbol: currency = "USD"
+                elif "€" in symbol: currency = "EUR"
+                elif "£" in symbol: currency = "GBP"
+            elif code:
+                code_lower = code.lower()
+                if code_lower in ["inr", "rs", "rupees"]: currency = "INR"
+                elif code_lower in ["usd", "dollars"]: currency = "USD"
+                elif code_lower in ["eur", "euros"]: currency = "EUR"
+                elif code_lower in ["gbp", "pounds"]: currency = "GBP"
+        else:
+            # If no currency found, take the largest standalone number as amount
+            best_match = max(potential_amounts, key=lambda x: x["value"])
+            amount = best_match["value"]
+            currency = None  # Explicitly None if not found
+
+    # 3. Extract Item using NER entities (excluding amounts/currency)
+    item_parts = []
+    if entities:
+        # Get text segments identified as potential amounts by the regex
+        amount_texts = set()
+        for p in potential_amounts:
+            amount_texts.add(p["match_obj"].group(0))  # Add the full matched string
+
+        for entity in entities:
+            entity_group = entity.get("entity_group", "")
+            word = entity.get("word", "")
+
+            # Skip if the entity word is part of a detected amount or is just a currency symbol
+            if word in amount_texts or word in CURRENCY_SYMBOLS:
+                continue
+
+            # Skip if it's classified as MONEY by NER (already handled by regex)
+            # Allow CARDINAL if it wasn't part of a regex match (e.g., quantity "2 coffees")
+            if "MONEY" in entity_group:
+                continue
+
+            # Include relevant entity types for item description
+            if entity_group in ["MISC", "ORG", "PRODUCT", "EVENT", "WORK_OF_ART", "LOC", "PER", "CARDINAL", "QUANTITY"]:
+                # Clean up sub-word tokens like ##ing
+                cleaned_word = word.replace(" ##", "").strip()
+                if cleaned_word:
+                    item_parts.append(cleaned_word)
+
+    if item_parts:
+        item = " ".join(item_parts).strip()
+        # Further clean-up (optional): remove leading/trailing punctuation if desired
+        item = re.sub(r"^[^\w]+|[^\w]+$", "", item)
+
+
+    # 4. Final checks and return
+    # If amount is found but currency is None, consider a default (optional, decided against for now)
+    # if amount is not None and currency is None:
+    #     currency = "INR"  # Or keep as None
+
+    print(f"Utils: Parsed-> Amount: {amount}, Currency: {currency}, Item: {item}")
+    return amount, currency, item
+
+# ... (keep parse_gemini_response as is) ...
+def parse_gemini_response(response_text):
+    """
+    Parses a structured string response from Gemini (expected JSON-like).
+    Example expected format:
+    "{ \"type\": \"expense\", \"category\": \"Food\", \"amount\": 5.50, \"currency\": \"USD\", \"item\": \"coffee\" }"
+    """
+    try:
+        # Clean the response text if it's wrapped in markdown code blocks
+        response_text = re.sub(r"^```json\s*|\s*```$", "", response_text.strip())
+        data = json.loads(response_text)
+        return data
+    except json.JSONDecodeError:
+        print(f"Warning: Could not parse Gemini response: {response_text}")
+        return None
+    except Exception as e:
+        print(f"Error parsing Gemini response: {e}")
+        return None
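A short usage sketch of the two helpers above; the NER entity dict mimics the aggregated output of a transformers NER pipeline, and the Gemini string is hand-built, so real values will vary by model:

# Quick check of parse_entities with a hand-built NER result
from utils import parse_entities, parse_gemini_response

# The regex matches "20 usd" (number + code-after groups); the ORG entity
# survives the filters and becomes the item description
text = "paid 20 usd for lunch at Subway"
entities = [{"entity_group": "ORG", "word": "Subway", "score": 0.99, "start": 25, "end": 31}]
amount, currency, item = parse_entities(entities, text)
# -> amount=20.0, currency="USD", item="Subway"

# parse_gemini_response strips the markdown fence and parses the JSON payload
raw = '```json\n{ "type": "expense", "category": "Food", "amount": 5.5 }\n```'
print(parse_gemini_response(raw))
# -> {'type': 'expense', 'category': 'Food', 'amount': 5.5}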