MonilM committed on
Commit 07b50c0 · Parent(s): 6e4ec8a

Improved NLP Logic
Files changed (7):
  1. app.py +9 -5
  2. config.py +27 -0
  3. handler.py +105 -0
  4. model_setup.py +5 -0
  5. nlp_service.py +107 -801
  6. requirements.txt +6 -1
  7. utils.py +141 -0
app.py CHANGED
@@ -12,7 +12,7 @@ from paddleocr import PaddleOCR
 from PIL import Image
 
 # --- NEW: Import the NLP analysis function ---
-from nlp_service import analyze_expense_text # Import the core analysis function
+from nlp_service import analyze_text # Corrected import
 
 # --- Configuration ---
 LANG = 'en' # Default language, can be overridden if needed
@@ -292,13 +292,17 @@ def process_message():
     nlp_error = None
     try:
         # Call the imported analysis function
-        nlp_analysis_result = analyze_expense_text(text_message)
+        nlp_analysis_result = analyze_text(text_message) # Corrected function call
         print(f"NLP Service Analysis Result: {nlp_analysis_result}")
-        # Check if the NLP analysis itself reported an error/failure
-        if nlp_analysis_result.get("status") == "failed":
+        # Check if the NLP analysis itself reported an error/failure or requires fallback
+        status = nlp_analysis_result.get("status")
+        if status == "failed":
             nlp_error = nlp_analysis_result.get("message", "NLP processing failed")
             # Return the failure result from NLP service
-            return jsonify(nlp_analysis_result), 400 # Or 200 with error status? Let's use 200 for now.
+            return jsonify(nlp_analysis_result), 400 # Use 400 for client-side errors like empty text
+        elif status == "fallback_required":
+            # Return the fallback result (e.g., for queries)
+            return jsonify(nlp_analysis_result), 200 # Return 200, but indicate fallback needed
 
         # Return the successful analysis result
         return jsonify(nlp_analysis_result)
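
Reviewer note: the route now distinguishes three status values coming back from analyze_text. A summary of the mapping implied by the branches above (annotation only, not part of the commit):

    # status value from analyze_text -> HTTP code returned by process_message()
    STATUS_TO_HTTP = {
        "failed": 400,             # e.g. empty input text or classifier error
        "fallback_required": 200,  # body tells the caller to take the fallback path
        "success": 200,
    }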
config.py ADDED
@@ -0,0 +1,27 @@
+import re
+
+# --- NLP Configuration ---
+CURRENCY_SYMBOLS = ["₹", "$", "€", "£"] # Expand as needed
+
+# More robust regex to find monetary values even if spaCy misses MONEY entity
+# Added a group to capture standalone numbers potentially without currency symbols nearby
+FALLBACK_AMOUNT_REGEX = re.compile(r'([\$€£₹]|\b(?:rs|usd|eur|gbp))\s?([\d,]+(?:\.\d{1,2})?)\b|\b([\d,]+(?:\.\d{1,2})?)\s?([\$€£₹]|\b(?:rupees|rs|dollars|euros|pounds|usd|eur|gbp))\b|\b([\d,]+(?:\.\d{1,2})?)\b', re.IGNORECASE)
+
+# Consolidated Category Keywords
+CATEGORY_KEYWORDS = {
+    "Coffee": ["coffee", "latte", "cappuccino", "starbucks", "cafe", "café", "espresso", "mocha", "ccd"],
+    "Food": ["food", "meal", "lunch", "dinner", "snack", "restaurant", "dining", "sandwich", "burger", "pizza"],
+    "Groceries": ["groceries", "supermarket", "vegetables", "milk", "market", "zepto", "blinkit", "bigbasket"],
+    "Entertainment": ["movie", "cinema", "concert", "game", "netflix", "spotify", "tickets", "fun"],
+    "Transport": ["travel", "taxi", "flight", "train", "bus", "uber", "ola", "fuel", "gas", "lyft", "cab", "ticket", "metro", "auto", "rickshaw", "commute"], # Combined Travel/Transport
+    "Shopping": ["shop", "shopping", "clothes", "electronics", "mall", "amazon", "flipkart", "purchase", "order", "store"],
+    "Utilities": ["utility", "utilities", "bill", "electricity", "water", "internet", "phone", "recharge"],
+    "Rent": ["rent", "lease"],
+    "Income": ["salary", "received", "credited", "deposit", "income"], # Added income keyword
+    "Investment": ["invest", "stock", "shares", "mutual fund", "sip", "investment"], # Added investment keyword
+    # "Misc" can be the default if no keywords match
+}
+
+# Keywords for intent detection (less critical if using zero-shot, but can be helpers)
+QUERY_KEYWORDS = ["how much", "show me", "list", "what are", "total", "summary", "spending", "history", "report", "biggest", "view"]
+ADD_EXPENSE_VERBS = ["spent", "bought", "paid", "cost", "charged", "expensed", "got", "had"] # Verbs often associated with spending
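
Reviewer note: a quick sanity check of the capture-group layout in FALLBACK_AMOUNT_REGEX (a sketch, assuming config.py is importable; groups 1/2 fire for a leading symbol, 3/4 for a trailing code/word, 5 for a standalone number):

    from config import FALLBACK_AMOUNT_REGEX

    for text in ["spent $100 on shoes", "coffee for 250 rs", "paid 250"]:
        for m in FALLBACK_AMOUNT_REGEX.finditer(text):
            print(m.groups())
    # "$100"   -> ('$', '100', None, None, None)
    # "250 rs" -> (None, None, '250', 'rs', None)
    # "250"    -> (None, None, None, None, '250')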
handler.py ADDED
@@ -0,0 +1,105 @@
+import json
+# Remove direct model/util imports if calling analyze_text
+# from model_setup import zero_shot, ner
+# from utils import parse_entities
+# from config import CATEGORY_KEYWORDS
+
+# Import the centralized analysis function
+from nlp_service import analyze_text
+
+def lambda_handler(event, context):
+    # ... (Keep body parsing logic) ...
+    body_str = event.get("body", "{}")
+    try:
+        body = json.loads(body_str)
+    except json.JSONDecodeError:
+        print(f"Error decoding JSON body: {body_str}")
+        return {
+            "statusCode": 400,
+            "body": json.dumps({"error": "Invalid JSON in request body"})
+        }
+
+    text = body.get("text", "")
+    if not text:
+        return {
+            "statusCode": 400,
+            "body": json.dumps({"error": "Missing 'text' field in request body"})
+        }
+
+    print(f"Processing text via nlp_service: {text}") # Log input
+
+    # Call the centralized NLP service function
+    try:
+        analysis_result = analyze_text(text)
+        status = analysis_result.get("status")
+
+        if status == "failed":
+            print(f"NLP analysis failed: {analysis_result.get('message')}")
+            # Return 400 for input errors, 500 for internal NLP errors?
+            # Let's return 400 if it's a known failure from analyze_text
+            return {
+                "statusCode": 400,
+                "body": json.dumps(analysis_result)
+            }
+        elif status == "fallback_required":
+            print(f"NLP analysis requires fallback: {analysis_result.get('message')}")
+            # Return 200 but indicate fallback needed
+            return {
+                "statusCode": 200,
+                "body": json.dumps(analysis_result)
+            }
+        elif status == "success":
+            print(f"NLP analysis successful: {analysis_result}")
+            # Return the successful analysis result
+            return {
+                "statusCode": 200,
+                "body": json.dumps(analysis_result) # Already contains status
+            }
+        else:
+            # Should not happen if analyze_text always returns a status
+            print(f"Error: Unknown status from analyze_text: {status}")
+            return {
+                "statusCode": 500,
+                "body": json.dumps({"error": "Internal server error: Unexpected NLP response"})
+            }
+
+    except Exception as e:
+        print(f"Error calling analyze_text from handler: {e}")
+        import traceback
+        traceback.print_exc()
+        return {
+            "statusCode": 500,
+            "body": json.dumps({"error": "Internal server error during NLP processing", "details": str(e)})
+        }
+
+# Example event structure (for local testing if needed)
+if __name__ == '__main__':
+    # ... (Keep example test cases, they should still work) ...
+    example_event = {
+        "body": json.dumps({
+            "text": "spent 5 eur on coffee"
+        })
+    }
+    context = {}
+    response = lambda_handler(example_event, context)
+    print("\n--- Lambda Response ---")
+    # The body is already a JSON string containing the result from analyze_text
+    print(json.dumps(json.loads(response['body']), indent=2))
+
+    example_event_query = {
+        "body": json.dumps({
+            "text": "how much did I spend last month"
+        })
+    }
+    response_query = lambda_handler(example_event_query, context)
+    print("\n--- Lambda Response (Query) ---")
+    print(json.dumps(json.loads(response_query['body']), indent=2))
+
+    example_event_income = {
+        "body": json.dumps({
+            "text": "salary credited 50000"
+        })
+    }
+    response_income = lambda_handler(example_event_income, context)
+    print("\n--- Lambda Response (Income) ---")
+    print(json.dumps(json.loads(response_income['body']), indent=2))
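
Reviewer note: a minimal local invocation mirroring the __main__ block above (assumes the transformers models in model_setup.py can be loaded in the local environment):

    import json
    from handler import lambda_handler

    resp = lambda_handler({"body": json.dumps({"text": "paid 20 usd for lunch"})}, None)
    print(resp["statusCode"])                  # 200 on success
    print(json.loads(resp["body"])["status"])  # "success", "failed", or "fallback_required"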
model_setup.py ADDED
@@ -0,0 +1,5 @@
+from transformers import pipeline
+
+# Load once and reuse
+zero_shot = pipeline("zero-shot-classification", model="joeddav/xlm-roberta-large-xnli")
+ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
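
Reviewer note: both pipelines download weights on first use and are constructed once at import time, which keeps repeat calls fast at the cost of a slower cold start. A small smoke test (actual labels/entities depend on the models):

    from model_setup import zero_shot, ner

    print(zero_shot("spent 5 eur on coffee",
                    candidate_labels=["expense", "query", "income"])["labels"][0])
    print(ner("paid 20 usd at Starbucks"))  # list of {'entity_group', 'word', 'score', ...} dicts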
nlp_service.py CHANGED
@@ -1,814 +1,120 @@
-import re
-import datetime
-import dateparser # Still essential for interpreting date strings
-import spacy # Import spaCy
-from flask import Blueprint, request, jsonify
-from collections import defaultdict
-import logging
-import os # To handle potential model loading issues
-import requests # Add requests for API calls
-import json # For handling JSON data
-import os # Already imported, needed for API key
-
-# --- Setup ---
-logging.basicConfig(level=logging.INFO)
-
-# --- Load spaCy Model ---
-# Using medium model for better accuracy and word vectors (though not used explicitly yet)
-# Handle potential errors during model loading
-try:
-    # Check if running in an environment where models might be linked differently
-    # (e.g., Google Cloud Functions sometimes needs explicit path)
-    model_name = "en_core_web_md"
-    if not spacy.util.is_package(model_name):
-        print(f"spaCy model '{model_name}' not found as package. Attempting download...")
-        spacy.cli.download(model_name)
-
-    nlp = spacy.load(model_name)
-    logging.info(f"Successfully loaded spaCy model '{model_name}'")
-except (OSError, ImportError) as e:
-    logging.error(f"Could not load spaCy model '{model_name}'. Error: {e}")
-    logging.error("Ensure the model is downloaded: python -m spacy download en_core_web_md")
-    # Fallback or exit - for now, we'll log and potentially fail later if nlp isn't loaded
-    nlp = None # Indicate model loading failed
-
-# --- In-Memory Data Storage (Replace with Database) ---
-expenses = []
-next_expense_id = 1
-
-# --- NLP Configuration & Helpers ---
-CURRENCY_SYMBOLS = ["₹", "$", "€", "£"] # Expand as needed
-# More robust regex to find monetary values even if spaCy misses MONEY entity
-FALLBACK_AMOUNT_REGEX = re.compile(r'([\$€£₹]|\b(?:rs|usd|eur|gbp))\s?([\d,]+(?:\.\d{1,2})?)\b|\b([\d,]+(?:\.\d{1,2})?)\s?([\$€£₹]|\b(?:rupees|rs|dollars|euros|pounds|usd|eur|gbp))\b', re.IGNORECASE)
-
-# Category keywords remain useful
-CATEGORY_KEYWORDS = {
-    "food": ["food", "meal", "lunch", "dinner", "snack", "restaurant", "dining", "groceries", "sandwich", "burger", "pizza"],
-    "coffee": ["coffee", "latte", "cappuccino", "espresso", "cafe", "starbucks", "ccd", "café", "mocha"],
-    "travel": ["travel", "taxi", "flight", "train", "bus", "uber", "ola", "fuel", "gas", "lyft", "cab", "ticket"],
-    "shopping": ["shop", "shopping", "clothes", "electronics", "mall", "amazon", "flipkart", "purchase", "order", "store"],
-    "groceries": ["groceries", "supermarket", "zepto", "blinkit", "bigbasket", "vegetables", "milk", "market"],
-    "utilities": ["utility", "utilities", "bill", "electricity", "water", "internet", "phone", "recharge"],
-    "entertainment": ["movie", "cinema", "concert", "game", "fun", "netflix", "spotify", "tickets"],
-    "rent": ["rent", "lease"],
-    "transport": ["transport", "metro", "auto", "rickshaw", "commute"]
-}
-
-# Keywords for intent detection (can be less critical now, intent inferred more from entities)
-QUERY_KEYWORDS = ["how much", "show me", "list", "what are", "total", "summary", "spending", "history", "report", "biggest", "view"]
-ADD_EXPENSE_VERBS = ["spent", "bought", "paid", "cost", "charged", "expensed", "got", "had"] # Verbs often associated with spending
-
-
-def parse_money_entity(text, doc):
-    """
-    Extracts amount using spaCy MONEY entities first, then falls back to regex.
-    Returns the amount as float and identified currency symbol/code.
-    """
-    amount = None
-    currency = None
-    text = text.replace(',', '') # Remove commas for easier parsing
-
-    # 1. Try spaCy MONEY entities first
-    money_ents = [ent for ent in doc.ents if ent.label_ == "MONEY"]
-    if money_ents:
-        # Prioritize longer entities or ones closer to verbs like 'spent' if multiple found
-        # Simple approach: take the first one for now
-        ent_text = money_ents[0].text.replace(',', '')
-        # Try to extract number and symbol/code from the entity text
-        num_match = re.search(r'([\d\.]+)', ent_text)
-        if num_match:
-            try:
-                amount = float(num_match.group(1))
-                # Try to find a known symbol or code within the entity text
-                symbol_match = re.search(r'([\$€£₹])', ent_text)
-                if symbol_match:
-                    currency = symbol_match.group(1)
-                else:
-                    # Check for codes like USD, GBP etc. (simple check)
-                    code_match = re.search(r'\b(USD|EUR|GBP|INR|RS)\b', ent_text, re.IGNORECASE)
-                    if code_match:
-                        currency = code_match.group(1).upper()
-                        # Standardize common ones
-                        if currency == "RS": currency = "INR"
-
-                # If amount found but no currency symbol in entity, check doc context
-                if amount is not None and currency is None:
-                    for token in doc:
-                        if token.text in CURRENCY_SYMBOLS:
-                            currency = token.text
-                            break
-                return amount, currency
-            except ValueError:
-                pass # Failed to convert number
-
-    # 2. Fallback Regex (if spaCy missed it or parsing failed)
-    match = FALLBACK_AMOUNT_REGEX.search(text)
-    if match:
-        try:
-            if match.group(2): # Format: $100 or Rs 100
-                amount = float(match.group(2))
-                currency_text = match.group(1)
-            elif match.group(3): # Format: 100 dollars or 100 Rs
-                amount = float(match.group(3))
-                currency_text = match.group(4)
-            else: # Should not happen with this regex, but safety first
-                return None, None
-
-            # Normalize currency symbol/code
-            if currency_text in CURRENCY_SYMBOLS:
-                currency = currency_text
-            else:
-                currency_text = currency_text.lower()
-                if currency_text in ["rs", "rupees"]: currency = "₹" # Or INR
-                elif currency_text in ["dollars", "usd"]: currency = "$" # Or USD
-                elif currency_text in ["pounds", "gbp"]: currency = "£" # Or GBP
-                elif currency_text in ["euros", "eur"]: currency = "€" # Or EUR
-
-            return amount, currency
-
-        except (ValueError, IndexError):
-            logging.warning(f"Regex fallback failed to parse amount from: {text}")
-            return None, None
-
-    return None, None # No amount found
-
-def parse_date_entities(doc):
-    """
-    Uses dateparser to interpret spaCy DATE entities.
-    Returns the *most likely* date found, defaulting to today.
-    """
-    dates = []
-    # Settings for dateparser: prefer past dates for expenses
-    settings = {'PREFER_DATES_FROM': 'past', 'RELATIVE_BASE': datetime.datetime.now()}
-
-    date_ents = [ent.text for ent in doc.ents if ent.label_ == "DATE"]
-    logging.debug(f"Found DATE entities: {date_ents}")
-
-    if date_ents:
-        for date_str in date_ents:
-            # Sometimes spaCy includes words like "on", "last" in the entity, dateparser handles this
-            parsed = dateparser.parse(date_str, settings=settings)
-            if parsed:
-                dates.append(parsed.date())
-
-    if dates:
-        # Heuristic: If multiple dates, prefer the one closest to today? Or just the first?
-        # Let's prefer the latest valid past date found (most recent expense)
-        past_dates = [d for d in dates if d <= datetime.date.today()]
-        if past_dates:
-            return max(past_dates) # Return the most recent valid date
-        elif dates:
-            return min(dates) # If only future dates found, return the earliest one (less likely for expense)
-
-    # Fallback if no DATE entity found or parsed
-    logging.debug("No valid DATE entity found or parsed, defaulting to today.")
-    return datetime.date.today()
-
-def identify_merchant_and_category(doc):
-    """
-    Identifies merchant using ORG/PERSON/GPE entities and context.
-    Identifies category using keywords and context around amount/merchant.
-    """
-    merchant = None
-    category = "Uncategorized" # Default
-
-    money_token_indices = [token.i for token in doc if token.like_num or token.text in CURRENCY_SYMBOLS or any(sym in token.text for sym in CURRENCY_SYMBOLS) or (token.ent_type_ == "MONEY")]
-
-    potential_merchants = []
-    for ent in doc.ents:
-        if ent.label_ in ["ORG", "PERSON", "GPE", "FAC"]: # Facility might also be relevant
-            # Check context: is it preceded by "at", "from", "in"? Is it near the money amount?
-            prepositions = {"at", "from", "in", "on", "with"}
-            # Check token before the entity start
-            if ent.start > 0 and doc[ent.start - 1].lower_ in prepositions:
-                potential_merchants.append(ent.text)
-                continue
-            # Check dependency relation (e.g., object of preposition)
-            if ent.root.head.lemma_ in prepositions:
-                potential_merchants.append(ent.text)
-                continue
-            # Check proximity to money amount if indices available
-            if money_token_indices:
-                min_dist = min(abs(ent.start - idx) for idx in money_token_indices)
-                if min_dist < 5: # Arbitrary proximity threshold
-                    potential_merchants.append(ent.text)
-                    continue
-
-    if potential_merchants:
-        # Simple heuristic: choose the first likely one. Could be refined.
-        # Filter out very common words or locations if needed (e.g., "City", "Bank" if too generic)
-        merchant = potential_merchants[0].strip()
-        logging.debug(f"Identified potential merchant: {merchant} from entities {potential_merchants}")
-
-    # --- Category Identification ---
-    text_lower = doc.text.lower()
-
-    # 1. Check explicit category keywords
-    found_category = None
-    matched_keywords = []
-    for cat, keywords in CATEGORY_KEYWORDS.items():
-        if any(keyword in text_lower for keyword in keywords):
-            # If multiple categories match, prioritize based on merchant or context?
-            # Simple approach: Store all matches for now
-            matched_keywords.append(cat)
-
-    if len(matched_keywords) == 1:
-        found_category = matched_keywords[0]
-    elif len(matched_keywords) > 1:
-        # Ambiguity - Requires smarter logic. E.g., "Coffee at Food court" -> Coffee or Food?
-        # Prioritize based on merchant if known? E.g. if merchant is Starbucks -> Coffee
-        if merchant:
-            merchant_lower = merchant.lower()
-            if "starbucks" in merchant_lower or "ccd" in merchant_lower or "café" in merchant_lower:
-                if "coffee" in matched_keywords: found_category = "coffee"
-            elif "amazon" in merchant_lower or "flipkart" in merchant_lower:
-                if "shopping" in matched_keywords: found_category = "shopping"
-            elif "zepto" in merchant_lower or "blinkit" in merchant_lower or "groceries" in merchant_lower:
-                if "groceries" in matched_keywords: found_category = "groceries"
-                elif "food" in matched_keywords: found_category = "groceries" # Prefer specific
-
-        # If still ambiguous, maybe pick the most specific one (e.g., prefer 'coffee' over 'food')
-        if not found_category:
-            if "coffee" in matched_keywords: found_category = "coffee"
-            elif "groceries" in matched_keywords: found_category = "groceries"
-            elif "transport" in matched_keywords: found_category = "transport"
-            # Add more specific priorities if needed
-            elif "food" in matched_keywords: found_category = "food" # More general last
-            else: found_category = matched_keywords[0] # Default to first match if no rules apply
-
-    if found_category:
-        category = found_category
-    # 2. (Optional/Advanced) Infer from merchant if category is Uncategorized
-    elif merchant and category == "Uncategorized":
-        merchant_lower = merchant.lower()
-        if "starbucks" in merchant_lower or "ccd" in merchant_lower or "café" in merchant_lower: category = "coffee"
-        elif "amazon" in merchant_lower or "flipkart" in merchant_lower: category = "shopping"
-        elif "zepto" in merchant_lower or "blinkit" in merchant_lower: category = "groceries"
-        elif "uber" in merchant_lower or "ola" in merchant_lower: category = "travel"
-        elif "netflix" in merchant_lower or "spotify" in merchant_lower: category = "entertainment"
-        # Add more merchant->category mappings
-
-    # 3. (Optional/Advanced) Use Dependency Parsing or Word Vectors
-    # Example: Look for nouns that are objects of spending verbs near the amount
-    # This requires more complex linguistic analysis.
-
-    logging.debug(f"Identified Category: {category}")
-    return merchant, category
-
-def determine_intent(doc):
-    """Determines intent: 'add_expense', 'query_expense', or 'unknown'."""
-    text_lower = doc.text.lower()
-
-    has_query_keyword = any(keyword in text_lower for keyword in QUERY_KEYWORDS)
-    has_add_verb = any(verb.lemma_ in ADD_EXPENSE_VERBS for verb in doc if verb.pos_ == "VERB")
-    has_money_entity = any(ent.label_ == "MONEY" for ent in doc.ents) or FALLBACK_AMOUNT_REGEX.search(text_lower) is not None
-
-    # More explicit questions are likely queries
-    if doc[0].pos_ == "AUX" or doc[0].lemma_ in ["what", "how", "show", "list", "view"]: # Starts like a question
-        return "query_expense"
-
-    if has_query_keyword:
-        return "query_expense"
-
-    # If it has a spending verb and a money amount, likely adding expense
-    if has_add_verb and has_money_entity:
-        return "add_expense"
-
-    # If it just has a money amount and maybe date/merchant, could be adding expense (implicit verb)
-    if has_money_entity and not has_query_keyword:
-        # Check if there are nouns suggesting items bought
-        has_object_noun = any(tok.pos_ == "NOUN" and tok.dep_ in ["dobj", "pobj", "attr"] for tok in doc)
-        if has_object_noun or any(ent.label_ in ["ORG", "PRODUCT"] for ent in doc.ents):
-            return "add_expense"
-
-    # If only query keywords or unclear structure, lean towards query or unknown
-    if has_query_keyword:
-        return "query_expense"
-
-    return "unknown"
-
-# --- Filtering and Formatting (largely reused, minor adjustments) ---
-
-def filter_expenses(criteria):
-    """Filters the global 'expenses' list based on criteria."""
-    # (This function remains largely the same as the previous version)
-    filtered = expenses
-
-    # Filter by Category
-    if 'category' in criteria and criteria['category'] is not None:
-        target_cat = criteria['category'].lower()
-        # Handle general 'food' query including 'coffee', 'groceries' etc.
-        food_related_cats = {'food', 'coffee', 'groceries', 'restaurant'} # Define food-related categories
-        if target_cat == 'food':
-            filtered = [e for e in filtered if e['category'].lower() in food_related_cats]
-        else:
-            filtered = [e for e in filtered if e['category'].lower() == target_cat]
-
-    # Filter by Date Range (start_date and end_date are inclusive)
-    if 'start_date' in criteria and criteria['start_date'] is not None:
-        filtered = [e for e in filtered if e['date'] >= criteria['start_date']]
-    if 'end_date' in criteria and criteria['end_date'] is not None:
-        filtered = [e for e in filtered if e['date'] <= criteria['end_date']]
-
-    # Filter by Merchant (case-insensitive substring match)
-    if 'merchant' in criteria and criteria['merchant'] is not None:
-        target_merchant = criteria['merchant'].lower()
-        filtered = [e for e in filtered if e['merchant'] and target_merchant in e['merchant'].lower()]
-
-    return filtered
-
-def parse_date_range_from_query(doc):
-    """Parses date ranges specifically for queries (e.g., 'this month', 'last week')."""
-    # (This function remains largely the same, using dateparser on DATE entities or keywords)
-    today = datetime.date.today()
-    text_lower = doc.text.lower() # Use full text for keywords like "this month"
-    start_date, end_date = None, None
-
-    # Prioritize DATE entities found by spaCy
-    date_ents_text = [ent.text for ent in doc.ents if ent.label_ == "DATE"]
-    parsed_dates = []
-    settings = {'PREFER_DATES_FROM': 'past', 'RELATIVE_BASE': datetime.datetime.now()}
-
-    for date_str in date_ents_text:
-        # Try parsing as a potential range using dateparser's experimental range feature (or parse single dates)
-        # For simplicity, we'll stick to parsing single points and let keyword logic handle ranges
-        parsed = dateparser.parse(date_str, settings=settings)
-        if parsed:
-            parsed_dates.append(parsed.date())
-
-    # If spaCy found specific dates, use them
-    if len(parsed_dates) == 1:
-        start_date = end_date = parsed_dates[0]
-    elif len(parsed_dates) > 1:
-        # Ambiguous, maybe take min/max? Or rely on keywords below?
-        start_date = min(parsed_dates)
-        end_date = max(parsed_dates)
-        if start_date > end_date: # Swap if order is wrong
-            start_date, end_date = end_date, start_date
-
-    # If no specific date entities, check for range keywords
-    if start_date is None and end_date is None:
-        if "today" in text_lower:
-            start_date = end_date = today
-        elif "yesterday" in text_lower:
-            start_date = end_date = today - datetime.timedelta(days=1)
-        elif "this week" in text_lower:
-            start_of_week = today - datetime.timedelta(days=today.weekday()) # Monday
-            end_of_week = start_of_week + datetime.timedelta(days=6) # Sunday
-            start_date = start_of_week
-            end_date = end_of_week
-        elif "last week" in text_lower:
-            end_of_last_week = today - datetime.timedelta(days=today.weekday() + 1) # Last Sunday
-            start_of_last_week = end_of_last_week - datetime.timedelta(days=6) # Last Monday
-            start_date = start_of_last_week
-            end_date = end_of_last_week
-        elif "this month" in text_lower:
-            start_date = today.replace(day=1)
-            next_month = today.replace(day=28) + datetime.timedelta(days=4)
-            last_day_of_month = next_month - datetime.timedelta(days=next_month.day)
-            end_date = last_day_of_month
-        elif "last month" in text_lower:
-            first_day_of_current_month = today.replace(day=1)
-            last_day_of_last_month = first_day_of_current_month - datetime.timedelta(days=1)
-            first_day_of_last_month = last_day_of_last_month.replace(day=1)
-            start_date = first_day_of_last_month
-            end_date = last_day_of_last_month
-        elif "year" in text_lower: # e.g., "this year", "last year"
-            if "this year" in text_lower:
-                start_date = datetime.date(today.year, 1, 1)
-                end_date = datetime.date(today.year, 12, 31)
-            elif "last year" in text_lower:
-                start_date = datetime.date(today.year - 1, 1, 1)
-                end_date = datetime.date(today.year - 1, 12, 31)
-            # Check for specific year like "in 2023"
-            year_match = re.search(r'\b(in|for)\s+(\d{4})\b', text_lower)
-            if year_match:
-                year = int(year_match.group(2))
-                start_date = datetime.date(year, 1, 1)
-                end_date = datetime.date(year, 12, 31)
-
-        # Add specific month parsing ("in January") if needed (similar to previous version)
-        else:
-            month_match = re.search(r'\b(in|for)\s+(january|february|march|april|may|june|july|august|september|october|november|december)\b', text_lower)
-            if month_match:
-                month_name = month_match.group(2)
-                year_context = today.year # Assume current year
-                # Check if a year was mentioned nearby
-                year_ent = [e.text for e in doc.ents if e.label_ == "DATE" and e.text.isdigit() and len(e.text)==4]
-                if year_ent:
-                    year_context = int(year_ent[0])
-                try:
-                    month_num = list(datetime.date(2000, i, 1).strftime('%B').lower() for i in range(1, 13)).index(month_name) + 1
-                    start_date = datetime.date(year_context, month_num, 1)
-                    next_m = (start_date.replace(day=28) + datetime.timedelta(days=4))
-                    end_date = next_m - datetime.timedelta(days=next_m.day)
-                except (ValueError, IndexError): pass # Ignore invalid month/year
-
-    logging.debug(f"Parsed date range for query: {start_date} to {end_date}")
-    return start_date, end_date
-
-def format_expense_list(expense_list, title="Here are the expenses:"):
-    """Formats a list of expenses into a user-friendly string."""
-    # (This function remains largely the same)
-    if not expense_list:
-        return "No expenses found matching your criteria."
-
-    total_amount = sum(e['amount'] for e in expense_list)
-    # Try to get a consistent currency symbol, default to first expense's symbol or fallback
-    currency_symbol = expense_list[0].get("currency") or "₹" if expense_list else "₹"
-
-    response_lines = [title]
-    expense_list.sort(key=lambda x: x['date'], reverse=True)
-
-    for expense in expense_list:
-        cur = expense.get("currency") or currency_symbol # Use expense specific or default
-        amount_str = f"{cur}{expense['amount']:.2f}"
-        merchant_part = f" at {expense['merchant']}" if expense['merchant'] else ""
-        category_part = f" ({expense['category']})" if expense['category'] != 'Uncategorized' else ""
-        date_str = expense['date'].strftime("%b %d, %Y")
-        response_lines.append(f"- {amount_str}{category_part}{merchant_part} - {date_str}")
-
-    if len(expense_list) > 1:
-        total_str = f"{currency_symbol}{total_amount:.2f}"
-        response_lines.append(f"Total: {total_str}")
-
-    return "\n".join(response_lines)
-
-# --- NEW: Core NLP Processing Function ---
-def analyze_expense_text(text):
-    """
-    Analyzes text to extract expense details or understand queries using spaCy.
-    Returns a dictionary with action, status, and extracted details/message.
-    """
-    global next_expense_id # Allow modification of the global counter
-
-    if nlp is None:
-        logging.error("spaCy model not loaded. Cannot process text.")
-        return {"action": "error", "status": "failed", "message": "NLP model not available"}
-
-    logging.info(f"Analyzing text: {text[:100]}...") # Log snippet
-    doc = nlp(text)
-    logging.debug(f"spaCy Entities: {[(ent.text, ent.label_) for ent in doc.ents]}")
-
-    intent = determine_intent(doc)
-    logging.info(f"Determined Intent: {intent}")
-    response_data = {}
-
-    if intent == "add_expense":
-        amount, currency = parse_money_entity(text, doc)
-        expense_date = parse_date_entities(doc)
-        merchant, category = identify_merchant_and_category(doc)
-
-        if amount is not None:
-            currency_symbol = currency or "₹" # Default currency
-            new_expense = {
-                "id": next_expense_id,
-                "amount": amount,
-                "currency": currency_symbol,
-                "category": category,
-                "merchant": merchant,
-                "date": expense_date, # Keep as date object internally
-                "original_message": text
-            }
-            expenses.append(new_expense)
-            next_expense_id += 1
-            logging.info(f"Added expense (in-memory): {new_expense}")
-
-            merchant_part = f" at {merchant}" if merchant else ""
-            date_str = expense_date.strftime('%b %d, %Y')
-            confirmation_msg = f"✅ Expense added: {currency_symbol}{amount:.2f} for {category}{merchant_part} on {date_str}."
-
-            new_expense_serializable = new_expense.copy()
-            new_expense_serializable["date"] = new_expense["date"].isoformat()
-
-            response_data = {
-                "action": "add_expense",
-                "status": "success",
-                "message": confirmation_msg,
-                "details": new_expense_serializable
-            }
-        else:
-            logging.warning(f"Could not extract amount reliably from: {text}")
-            response_data = {
-                "action": "add_expense",
-                "status": "failed",
-                "message": f"Sorry, I couldn't understand the amount. Please include it clearly (e.g., '₹500', '$20', '15 pounds')."
-            }
-
-    elif intent == "query_expense":
-        logging.info("Processing query intent.")
-        query_criteria = {}
-        _q_merchant, q_category = identify_merchant_and_category(doc)
-
-        # ... (rest of query criteria extraction logic remains the same) ...
-        query_cat_found = None
-        text_lower = doc.text.lower()
-        for cat, keywords in CATEGORY_KEYWORDS.items():
-            if any(keyword in text_lower for keyword in keywords):
-                if cat == 'food' or q_category == 'food':
-                    query_cat_found = 'food'
-                    break
-                query_cat_found = q_category if q_category != 'Uncategorized' else cat
-                break
-
-        query_criteria['category'] = query_cat_found
-        query_criteria['merchant'] = _q_merchant
-        start_date, end_date = parse_date_range_from_query(doc)
-        query_criteria['start_date'] = start_date
-        query_criteria['end_date'] = end_date
-
-        logging.info(f"Query Criteria: {query_criteria}")
-        results = filter_expenses(query_criteria)
-        response_message = ""
-
-        # ... (rest of query response formatting logic remains the same) ...
-        if results and ("total" in text_lower or "sum" in text_lower or "how much" in doc[0].lower_):
-            total_amount = sum(e['amount'] for e in results)
-            currency_symbol = results[0].get("currency") or "₹"
-            category_filter_text = f" on {query_criteria['category']}" if query_criteria['category'] else ""
-            date_filter_text = ""
-            if start_date and end_date and start_date == end_date: date_filter_text = f" for {start_date.strftime('%b %d, %Y')}"
-            elif start_date and end_date: date_filter_text = f" from {start_date.strftime('%b %d')} to {end_date.strftime('%b %d, %Y')}"
-            elif start_date: date_filter_text = f" since {start_date.strftime('%b %d, %Y')}"
-            elif end_date: date_filter_text = f" until {end_date.strftime('%b %d, %Y')}"
-            response_message = f"Your total spending{category_filter_text}{date_filter_text} is {currency_symbol}{total_amount:.2f}."
-            if len(results) <= 10:
-                response_message += "\n" + format_expense_list(results, "Details:")
-            else:
-                response_message += f" (from {len(results)} transactions)"
-        elif results and ("biggest" in text_lower or "largest" in text_lower or "top" in text_lower):
-            top_n = 3
-            top_expenses = sorted(results, key=lambda x: x['amount'], reverse=True)[:top_n]
-            response_message = format_expense_list(top_expenses, f"Your top {len(top_expenses)} expenses:")
-        else:
-            date_filter_desc = ""
-            if start_date and end_date and start_date == end_date: date_filter_desc = f" from {start_date.strftime('%b %d, %Y')}"
-            elif start_date or end_date: date_filter_desc = " matching the date criteria"
-            category_filter_desc = f" for {query_criteria['category']}" if query_criteria['category'] else ""
-            merchant_filter_desc = f" at {query_criteria['merchant']}" if query_criteria['merchant'] else ""
-            title = f"Expenses{category_filter_desc}{merchant_filter_desc}{date_filter_desc}:"
-            response_message = format_expense_list(results, title)
-
-        response_data = {
-            "action": "query_expense",
-            "status": "success",
-            "message": response_message,
-            "criteria": {k: v.isoformat() if isinstance(v, datetime.date) else v for k, v in query_criteria.items() if v is not None},
-            "results_count": len(results)
-        }
-
-    else: # intent == "unknown"
-        logging.info(f"Local NLP intent unknown for: {text}. Attempting Gemini API call.")
-
-        # --- Call Gemini API ---
-        gemini_result = call_gemini_api(text, GEMINI_API_KEY)
-
-        if (gemini_result and isinstance(gemini_result, dict) and gemini_result.get("action") in ["add_expense", "query_expense", "info"]):
-            # If Gemini returned a structured result we can use (or an info message), return it
-            logging.info(f"Using result from Gemini API. Action: {gemini_result.get('action')}")
-            response_data = gemini_result
-            # TODO: Potentially re-validate or re-process gemini_result here if needed
-            # For example, if action is add_expense, ensure data types are correct, parse date string etc.
-            # If action is query_expense, parse date strings etc.
-            if response_data.get("action") == "add_expense" and "details" in response_data:
-                # Basic post-processing/validation for added expense
-                details = response_data["details"]
-                try:
-                    if "date" in details and isinstance(details["date"], str):
-                        details["date"] = datetime.datetime.fromisoformat(details["date"].split("T")[0]).date()
-                    if "amount" in details:
-                        details["amount"] = float(details["amount"])
-                    # Add expense to memory if Gemini successfully added it
-                    # Note: This assumes Gemini provides all necessary fields correctly
-                    if all(k in details for k in ["amount", "currency", "category", "date"]):
-                        new_expense = {
-                            "id": next_expense_id,
-                            "amount": details["amount"],
-                            "currency": details.get("currency", "₹"),
-                            "category": details.get("category", "Uncategorized"),
-                            "merchant": details.get("merchant"),
-                            "date": details["date"],
-                            "original_message": text
-                        }
-                        expenses.append(new_expense)
-                        next_expense_id += 1
-                        logging.info(f"Added expense (from Gemini): {new_expense}")
-                        # Update message for consistency
-                        # --- FIX: Check if date is valid before formatting ---
-                        if isinstance(new_expense.get('date'), datetime.date):
-                            date_str = new_expense['date'].strftime('%b %d, %Y')
-                            response_data["message"] = f"✅ Expense added (via Gemini): {new_expense['currency']}{new_expense['amount']:.2f} for {new_expense['category']} on {date_str}."
-                        else:
-                            logging.warning(f"Gemini add_expense result had invalid date type: {type(new_expense.get('date'))}. Using default message.")
-                            response_data["message"] = f"✅ Expense added (via Gemini): {new_expense['currency']}{new_expense['amount']:.2f} for {new_expense['category']} (date missing/invalid)."
-                        # Make details serializable for JSON response
-                        # Ensure date is serializable even if it was invalid earlier
-                        if isinstance(response_data["details"].get("date"), datetime.date):
-                            response_data["details"]["date"] = response_data["details"]["date"].isoformat()
-                        else:
-                            # Handle case where date might be None or wrong type after processing
-                            response_data["details"]["date"] = None # Or some indicator of invalidity
-                    else:
-                        logging.warning("Gemini add_expense result missing required fields.")
-                        response_data = {"action": "unknown", "status": "failed", "message": "Gemini suggested adding an expense, but details were incomplete."}
-
-                except (ValueError, TypeError) as e:
-                    logging.warning(f"Error processing Gemini add_expense details: {e}")
-                    response_data = {"action": "unknown", "status": "failed", "message": "Could not process expense details suggested by Gemini."}
-
-            elif response_data.get("action") == "query_expense" and "criteria" in response_data:
-                # Basic post-processing for query
-                criteria = response_data["criteria"]
-                try:
-                    if "start_date" in criteria and isinstance(criteria["start_date"], str):
-                        criteria["start_date"] = datetime.datetime.fromisoformat(criteria["start_date"].split("T")[0]).date()
-                    if "end_date" in criteria and isinstance(criteria["end_date"], str):
-                        criteria["end_date"] = datetime.datetime.fromisoformat(criteria["end_date"].split("T")[0]).date()
-                    # Execute the query based on Gemini's criteria
-                    results = filter_expenses(criteria)
-                    # Use Gemini's message or generate a new one
-                    if not response_data.get("message"):
-                        response_data["message"] = format_expense_list(results, "Query results (via Gemini):")
-                    response_data["results_count"] = len(results)
-                    # Make criteria serializable
-                    response_data["criteria"] = {k: v.isoformat() if isinstance(v, datetime.date) else v for k, v in criteria.items() if v is not None}
-
-                except (ValueError, TypeError) as e:
-                    logging.warning(f"Error processing Gemini query_expense criteria: {e}")
-                    response_data = {"action": "unknown", "status": "failed", "message": "Could not process query criteria suggested by Gemini."}
-
-        else:
-            # Fallback to original unknown message if Gemini fails or returns unusable data
-            logging.info("Gemini API did not provide a usable structured result. Falling back to default unknown message.")
-            response_data = {
-                "action": "unknown",
-                "status": "failed",
-                "message": "Sorry, I couldn't quite understand that. Please try phrasing your expense or query differently. \nExamples:\n- 'Spent ₹50 on coffee yesterday at Starbucks'\n- 'Show my food expenses last week'\n- 'What was my total spending last month?'"
-            }
-            # Optionally include Gemini's raw suggestion if available and not structured
-            if gemini_result and isinstance(gemini_result, dict) and "message" in gemini_result:
-                response_data["message"] += f"\n\nGemini suggestion: {gemini_result['message']}"
-
-    logging.info(f"Analysis complete. Action: {response_data.get('action')}, Status: {response_data.get('status')}") # Corrected closing parenthesis
-    return response_data
-
-
-# Placeholder for Gemini API Key - Load from environment variable
-GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
-
-# Placeholder function for Gemini API call
-def call_gemini_api(text, api_key):
-    """
-    Placeholder function to call the Gemini API.
-    Replace with actual implementation.
-    Should ideally return a dictionary similar to analyze_expense_text's output
-    or None if the call fails or response is unusable.
-    """
-    if not api_key:
-        logging.warning("GEMINI_API_KEY not set. Skipping Gemini API call.")
-        return None
-
-    # --- Replace with actual Gemini API endpoint and request structure ---
-    # Example using Google AI Generative Language API (adjust model and endpoint as needed)
-    # Ensure you have the google-generativeai library installed (`pip install google-generativeai`)
-    # and the API key is correctly set as an environment variable.
-    # Use a current model and the v1 endpoint
-    model_name = "gemini-2.0-flash-lite" # Updated model name
-    api_endpoint = f"https://generativelanguage.googleapis.com/v1/models/{model_name}:generateContent?key={api_key}"
-    headers = {
-        "Content-Type": "application/json"
-    }
-    # Construct the payload based on Gemini API requirements
-    # This prompt asks Gemini to act like the existing NLP service
-    # Corrected indentation for the prompt string
-    prompt = f"""Analyze the following text for expense tracking. Determine the intent ('add_expense' or 'query_expense') and extract relevant details.
-
-Text: "{text}"
-
-Desired JSON output format:
-{{
-  "action": "add_expense" | "query_expense" | "unknown" | "info",
-  "status": "success" | "failed",
-  "message": "Confirmation or result summary or explanation",
-  "details": {{ // Only for add_expense if successful
-    "amount": <float>,
-    "currency": "<string>", // e.g., "₹", "$", "EUR"
-    "category": "<string>", // e.g., "food", "travel", "shopping"
-    "merchant": "<string>", // e.g., "Starbucks", "Amazon"
-    "date": "YYYY-MM-DD"
-  }},
-  "criteria": {{ // Only for query_expense if successful
-    "category": "<string>",
-    "merchant": "<string>",
-    "start_date": "YYYY-MM-DD",
-    "end_date": "YYYY-MM-DD"
-  }}
-}}
-
-- If the intent is clearly 'add_expense' and details can be extracted, use action "add_expense" and status "success". Include extracted details.
-- If the intent is clearly 'query_expense' and criteria can be extracted, use action "query_expense" and status "success". Include extracted criteria.
-- If the intent is unclear, details are missing for adding, or it's a general question/statement not related to adding/querying expenses, use action "unknown" or "info" and status "failed" or "success" respectively. Provide a helpful message.
-- Ensure date format is YYYY-MM-DD.
-- Default currency to "₹" if not specified.
-- Default category to "Uncategorized" if not specified.
-Provide only the JSON output.
-"""
-
-    payload = json.dumps({
-        "contents": [{
-            "parts":[{ "text": prompt }]
-        }]
-        # Add generationConfig if needed (e.g., temperature, max output tokens)
-        # "generationConfig": {
-        #     "temperature": 0.7,
-        #     "maxOutputTokens": 256
-        # }
-    })
-    # --- End of placeholder section ---
-
-    try:
-        response = requests.post(api_endpoint, headers=headers, data=payload, timeout=20) # Increased timeout
-        response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
-
-        gemini_response_raw = response.json()
-        logging.debug(f"Raw Gemini API response: {gemini_response_raw}")
-
-        # --- Process gemini_response ---
-        content = None # Initialize content to None
-        content_cleaned = None # Initialize content_cleaned to None
-        # Extract the text content which should contain the JSON
-        if 'candidates' in gemini_response_raw and len(gemini_response_raw['candidates']) > 0:
-            content = gemini_response_raw['candidates'][0].get('content', {}).get('parts', [{}])[0].get('text')
-            if content:
-                logging.info(f"Gemini suggested JSON: {content}")
-                # Clean potential markdown/code block formatting
-                content_cleaned = content.strip().strip('```json').strip('```').strip()
-                try:
-                    # Attempt to parse the JSON string from Gemini
-                    parsed_result = json.loads(content_cleaned)
-                    # Basic validation of the parsed structure
-                    if isinstance(parsed_result, dict) and "action" in parsed_result:
-                        logging.info("Successfully parsed structured data from Gemini.")
-                        # Add further validation/sanitization if needed
-                        return parsed_result
-                    else:
-                        logging.warning("Gemini response parsed but lacks expected structure.")
-                        # Return info message if structure is wrong but content exists
-                        return {"action": "info", "status": "success", "message": f"Gemini suggestion: {content_cleaned}"}
-                except json.JSONDecodeError as json_err:
-                    logging.warning(f"Failed to decode JSON from Gemini response: {json_err}. Raw content: {content_cleaned}")
-                    # Return the raw text as a message if JSON parsing fails but content exists
-                    return {"action": "info", "status": "success", "message": f"Gemini suggestion: {content_cleaned}"}
-            else:
-                logging.warning("No text content found in Gemini response candidates.")
-                return None
-        else:
-            logging.warning("No candidates found in Gemini API response.")
-            return None
-
-    except requests.exceptions.Timeout:
-        logging.error("Gemini API call timed out.")
-        return None
-    except requests.exceptions.RequestException as e:
-        logging.error(f"Gemini API call failed: {e}")
-        # Log response body if available and indicates an API error
-        if e.response is not None:
-            try:
-                logging.error(f"Gemini API error response: {e.response.json()}")
-            except json.JSONDecodeError:
-                logging.error(f"Gemini API error response (non-JSON): {e.response.text}")
-        return None
-    except Exception as e:
-        # Include content_cleaned in the log if available during unexpected errors
-        error_context = f"Raw content (if available): {content_cleaned}" if content_cleaned else "No raw content parsed."
-        logging.error(f"An unexpected error occurred during Gemini API call or processing: {e}. {error_context}")
-        return None
-
-
-# --- Flask Blueprint Setup (Optional: Keep if direct API access is needed) ---
-nlp_bp = Blueprint('nlp_service', __name__)
-
-@nlp_bp.route('/process_nlp', methods=['POST'])
-def process_nlp_expense_route():
-    """Flask route handler that calls the core analysis function."""
-    data = request.get_json()
-    if not data or 'message' not in data:
-        logging.warning("Received request without 'message' field.")
-        return jsonify({"error": "Missing 'message' in request body"}), 400
-
-    user_message = data['message']
-    result = analyze_expense_text(user_message) # Call the core function
-
-    # Determine status code based on result
-    status_code = 200
-    if result.get("status") == "failed":
-        status_code = 400 # Or 500 if it's an internal NLP model error
-        if result.get("message") == "NLP model not available":
-            status_code = 500
-
-    return jsonify(result), status_code
+# filepath: c:\Users\Dell\Monil\Apps\code\Projects\space-songporter\OCR\nlp_service.py
+import json
+from model_setup import zero_shot, ner # Assuming model_setup.py exists and is correct
+from utils import parse_entities # Assuming utils.py exists and is correct
+from config import CATEGORY_KEYWORDS # Import categories from config
+
+def analyze_text(text: str) -> dict:
+    """
+    Analyzes the input text for intent, entities, and category.
+
+    Args:
+        text: The input text string.
+
+    Returns:
+        A dictionary containing the analysis results (intent, category, amount, etc.)
+        or an error message.
+    """
+    if not text:
+        return {
+            "status": "failed",
+            "message": "Input text cannot be empty."
+        }
+
+    print(f"NLP Service: Processing text: {text}")
+
+    # Step 1: Intent classification
+    try:
+        candidate_labels = ["expense", "investment", "query", "limit-setting", "income", "other"]
+        intent_result = zero_shot(text, candidate_labels=candidate_labels)
+        intent = intent_result["labels"][0]
+        score = intent_result["scores"][0]
+        print(f"NLP Service: Intent classification: {intent} (Score: {score:.2f})")
+    except Exception as e:
+        print(f"NLP Service: Error during intent classification: {e}")
+        return {
+            "status": "failed",
+            "message": "Intent classification failed",
+            "error": str(e)
+        }
+
+    # Step 2: Check if intent requires fallback (e.g., Gemini route)
+    if intent == "query":
+        print(f"NLP Service: Intent classified as '{intent}'. Fallback route triggered.")
+        # Placeholder for potential future Gemini integration
+        return {
+            "status": "fallback_required", # Use a specific status
+            "message": "Intent requires further processing (e.g., query engine - not implemented).",
+            "original_text": text,
+            "classified_intent": intent
+        }
+
+    # Step 3: Entity extraction (for non-fallback intents)
+    try:
+        entities = ner(text)
+        print(f"NLP Service: NER entities: {entities}")
+        amount, currency, item = parse_entities(entities, text) # Pass the full text as well, matching utils.parse_entities' signature
+        print(f"NLP Service: Parsed entities: Amount={amount}, Currency={currency}, Item={item}")
+    except Exception as e:
+        print(f"NLP Service: Error during entity extraction: {e}")
+        # Decide if you want to return an error or proceed with partial data
+        amount, currency, item = None, None, None # Default to None on error
+
+    # Step 4: Category matching using config.py
+    category = "Misc" # Default
+    text_lower = text.lower()
+    item_lower = item.lower() if item else ""
+
+    # Check intent first for Income/Investment categories
+    if intent == "income":
+        category = "Income"
+    elif intent == "investment":
+        category = "Investment"
+    else: # Only check keywords if not already classified as Income/Investment by intent
+        for cat, keywords in CATEGORY_KEYWORDS.items():
+            # Skip Income/Investment keywords here as intent handles them
+            if cat in ["Income", "Investment"]:
+                continue
+            if any(kw in text_lower or (item_lower and kw in item_lower) for kw in keywords):
+                category = cat
+                break # Stop after first match
+
+    # Refine intent based on keywords if initial classification was 'other' or potentially wrong
+    if intent != "income" and category == "Income":
+        print(f"NLP Service: Correcting intent to 'income' based on keywords/category.")
+        intent = "income"
+    elif intent != "investment" and category == "Investment":
+        print(f"NLP Service: Correcting intent to 'investment' based on keywords/category.")
+        intent = "investment"
+    # If no specific category matched but intent is expense/other, ensure category isn't Income/Investment
+    elif category in ["Income", "Investment"] and intent not in ["income", "investment"]:
+        category = "Misc" # Revert category if intent doesn't match
+
+    print(f"NLP Service: Assigned category: {category}")
+
+    # Final successful response structure
+    return {
+        "status": "success",
+        "type": intent,
+        "category": category,
+        "amount": amount,
+        "currency": currency,
+        "item": item
+    }
+
+# Example usage (for testing nlp_service.py directly)
+if __name__ == '__main__':
+    test_cases = [
+        "spent 5 eur on coffee",
+        "how much did I spend last month",
+        "salary credited 50000",
+        "invested 1000 in stocks",
+        "paid 20 usd for lunch",
+        "got groceries for 50 dollars",
+        "what was my total spending on food?",
+        "received 200 GBP deposit"
+    ]
+    for case in test_cases:
+        print(f"\n--- Testing: '{case}' ---")
+        result = analyze_text(case)
+        print(json.dumps(result, indent=2))
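
Reviewer note: the shape a successful call is expected to produce (field values here are illustrative; actual values depend on the zero-shot and NER models):

    result = analyze_text("spent 5 eur on coffee")
    # {
    #   "status": "success",
    #   "type": "expense",
    #   "category": "Coffee",
    #   "amount": 5.0,
    #   "currency": "EUR",
    #   "item": "coffee"
    # }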
requirements.txt CHANGED
@@ -5,5 +5,10 @@ paddlepaddle
 paddleocr
 spacy>=3.0.0 # Added spaCy
 dateparser>=1.0.0 # Added dateparser
+google-generativeai # Added for Gemini API
+python-dotenv # Added for loading .env files
 # Note: spaCy model 'en_core_web_md' needs to be downloaded separately:
-# python -m spacy download en_core_web_md
+# python -m spacy download en_core_web_md
+transformers
+torch
+sentencepiece
utils.py ADDED
@@ -0,0 +1,141 @@
+import re
+import json
+from config import FALLBACK_AMOUNT_REGEX, CURRENCY_SYMBOLS # Import regex and symbols
+
+def parse_entities(entities, full_text: str):
+    """
+    Extracts amount, currency, and item description from NER entities and full text.
+
+    Args:
+        entities: List of dictionaries from the NER pipeline.
+        full_text: The original input text string.
+
+    Returns:
+        A tuple: (amount, currency, item)
+    """
+    amount, currency, item = None, None, None
+    potential_amounts = []
+
+    # 1. Use the FALLBACK_AMOUNT_REGEX on the full text first - it's often more reliable
+    # Regex groups:
+    #   1: Symbol/Code before number ($, EUR, etc.)
+    #   2: Number when symbol/code is before
+    #   3: Number when symbol/code is after
+    #   4: Symbol/Code after number (rs, dollars, etc.)
+    #   5: Standalone number
+    for match in FALLBACK_AMOUNT_REGEX.finditer(full_text):
+        num_str = None
+        curr_symbol = None
+        curr_code = None
+
+        if match.group(1) and match.group(2): # Symbol/Code before
+            curr_symbol = match.group(1)
+            num_str = match.group(2)
+        elif match.group(3) and match.group(4): # Symbol/Code after
+            num_str = match.group(3)
+            curr_code = match.group(4)
+        elif match.group(5) and not match.group(1) and not match.group(4): # Standalone number
+            num_str = match.group(5)
+
+        if num_str:
+            try:
+                value = float(num_str.replace(",", ""))
+                # Basic validation: avoid huge numbers unless they have decimals (might be IDs)
+                if value < 1_000_000 or '.' in num_str:
+                    potential_amounts.append({
+                        "value": value,
+                        "currency_symbol": curr_symbol,
+                        "currency_code": curr_code,
+                        "match_obj": match # Store match object for position info later if needed
+                    })
+            except ValueError:
+                continue # Ignore invalid numbers like "1,2,3"
+
+    # 2. Determine Amount and Currency from regex matches
+    if potential_amounts:
+        # Prioritize matches that included a currency symbol/code
+        currency_matches = [p for p in potential_amounts if p["currency_symbol"] or p["currency_code"]]
+        if currency_matches:
+            # Often the largest value with currency is the main one
+            best_match = max(currency_matches, key=lambda x: x["value"])
+            amount = best_match["value"]
+            # Determine currency from symbol/code
+            symbol = best_match["currency_symbol"]
+            code = best_match["currency_code"]
+            if symbol:
+                if "₹" in symbol: currency = "INR"
+                elif "$" in symbol: currency = "USD"
+                elif "€" in symbol: currency = "EUR"
+                elif "£" in symbol: currency = "GBP"
+            elif code:
+                code_lower = code.lower()
+                if code_lower in ["inr", "rs", "rupees"]: currency = "INR"
+                elif code_lower in ["usd", "dollars"]: currency = "USD"
+                elif code_lower in ["eur", "euros"]: currency = "EUR"
+                elif code_lower in ["gbp", "pounds"]: currency = "GBP"
+        else:
+            # If no currency found, take the largest standalone number as amount
+            best_match = max(potential_amounts, key=lambda x: x["value"])
+            amount = best_match["value"]
+            currency = None # Explicitly None if not found
+
+    # 3. Extract Item using NER entities (excluding amounts/currency)
+    item_parts = []
+    if entities:
+        # Get text segments identified as potential amounts by the regex
+        amount_texts = set()
+        for p in potential_amounts:
+            amount_texts.add(p["match_obj"].group(0)) # Add the full matched string
+
+        for entity in entities:
+            entity_group = entity.get("entity_group", "")
+            word = entity.get("word", "")
+
+            # Skip if the entity word is part of a detected amount or is just a currency symbol
+            if word in amount_texts or word in CURRENCY_SYMBOLS:
+                continue
+
+            # Skip if it's classified as MONEY by NER (already handled by regex)
+            # Allow CARDINAL if it wasn't part of a regex match (e.g., quantity "2 coffees")
+            if "MONEY" in entity_group:
+                continue
+
+            # Include relevant entity types for item description
+            if entity_group in ["MISC", "ORG", "PRODUCT", "EVENT", "WORK_OF_ART", "LOC", "PER", "CARDINAL", "QUANTITY"]:
+                # Clean up sub-word tokens like ##ing
+                cleaned_word = word.replace(" ##", "").strip()
+                if cleaned_word:
+                    item_parts.append(cleaned_word)
+
+    if item_parts:
+        item = " ".join(item_parts).strip()
+        # Further clean-up (optional): remove leading/trailing punctuation if desired
+        item = re.sub(r"^[^\w]+|[^\w]+$", "", item)
+
+    # 4. Final checks and return
+    # If amount is found but currency is None, consider a default (optional, decided against for now)
+    # if amount is not None and currency is None:
+    #     currency = "INR" # Or keep as None
+
+    print(f"Utils: Parsed-> Amount: {amount}, Currency: {currency}, Item: {item}")
+    return amount, currency, item
+
+# ... (keep parse_gemini_response as is) ...
+def parse_gemini_response(response_text):
+    """
+    Parses a structured string response from Gemini (expected JSON-like).
+    Example expected format:
+    "{ \"type\": \"expense\", \"category\": \"Food\", \"amount\": 5.50, \"currency\": \"USD\", \"item\": \"coffee\" }"
+    """
+    try:
+        # Clean the response text if it's wrapped in markdown code blocks
+        response_text = re.sub(r"^```json\s*|\s*```$", "", response_text.strip())
+        data = json.loads(response_text)
+        return data
+    except json.JSONDecodeError:
+        print(f"Warning: Could not parse Gemini response: {response_text}")
+        return None
+    except Exception as e:
+        print(f"Error parsing Gemini response: {e}")
+        return None
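
Reviewer note: a standalone check of parse_entities without loading any model (the entity dict mimics the output format of transformers' aggregation_strategy="simple"; the score/offsets are illustrative):

    from utils import parse_entities

    entities = [{"entity_group": "MISC", "word": "coffee", "score": 0.99, "start": 15, "end": 21}]
    amount, currency, item = parse_entities(entities, "spent 5 eur on coffee")
    print(amount, currency, item)  # 5.0 EUR coffee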