MonilM committed
Commit 4c9f681
·
1 Parent(s): 3b392a9
Files changed (3):
  1. app.py +121 -41
  2. nlp_service.py +611 -0
  3. requirements.txt +6 -2
app.py CHANGED
@@ -11,6 +11,9 @@ from flask import Flask, request, jsonify
 from paddleocr import PaddleOCR
 from PIL import Image
 
+# --- NEW: Import the NLP analysis function ---
+from nlp_service import analyze_expense_text  # Import the core analysis function
+
 # --- Configuration ---
 LANG = 'en'  # Default language, can be overridden if needed
 NUM_WORKERS = 2  # Number of OCR worker threads
@@ -85,9 +88,12 @@ def find_main_amount(ocr_results):
     if not ocr_results:
         return None
 
-    potential_amounts = []
-    amount_regex = re.compile(r'(?<!\%)\b\d{1,3}(?:,?\d{3})*(?:\.\d{2})\b|\b\d+\.\d{2}\b|\b\d+\b(?!\.\d{1})')
-    total_keywords = ['total', 'grand total', 'amount due', 'balance', 'net amount', 'paid', 'charge', 'subtotal', 'total amount', 'to pay']
+    amount_regex = re.compile(r'(?<!%)\b\d{1,3}(?:,?\d{3})*(?:\.\d{2})\b|\b\d+\.\d{2}\b|\b\d+\b(?!\.\d{1})')
+
+    # Prioritized keywords
+    priority_keywords = ['grand total', 'total amount', 'amount due', 'to pay', 'bill total', 'total payable']
+    secondary_keywords = ['total', 'balance', 'net amount', 'paid', 'charge', 'net total']  # Added 'net total'
+    lower_priority_keywords = ['subtotal', 'sub total']  # Added 'sub total'
 
     parsed_lines = []
     for i, line_info in enumerate(ocr_results):
@@ -100,62 +106,95 @@ def find_main_amount(ocr_results):
         float_numbers = []
         for num_str in numbers_in_line:
             try:
-                if len(text) < 6 and '.' not in num_str and 1900 < int(num_str.replace(',', '')) < 2100:
-                    continue
+                # Avoid converting year-like numbers if they stand alone on short lines
+                if len(text) < 7 and '.' not in num_str and 1900 < int(num_str.replace(',', '')) < 2100:
+                    # More robust check: avoid if it's the only thing and looks like a year
+                    if len(numbers_in_line) == 1 and len(num_str) == 4:
+                        continue
                 float_numbers.append(float(num_str.replace(',', '')))
             except ValueError:
                 continue
 
-        has_keyword = False
-        for keyword in total_keywords:
-            if re.search(r'\b' + re.escape(keyword) + r'\b', text):
-                has_keyword = True
-                break
+        # Check for keywords
+        has_priority_keyword = any(re.search(r'\b' + re.escape(kw) + r'\b', text) for kw in priority_keywords)
+        has_secondary_keyword = any(re.search(r'\b' + re.escape(kw) + r'\b', text) for kw in secondary_keywords)
+        has_lower_priority_keyword = any(re.search(r'\b' + re.escape(kw) + r'\b', text) for kw in lower_priority_keywords)
 
         parsed_lines.append({
             "index": i,
             "text": text,
             "numbers": float_numbers,
-            "has_keyword": has_keyword,
+            "has_priority_keyword": has_priority_keyword,
+            "has_secondary_keyword": has_secondary_keyword,
+            "has_lower_priority_keyword": has_lower_priority_keyword,
             "confidence": confidence
         })
 
-    keyword_candidates = []
-    keyword_line_indices = {line["index"] for line in parsed_lines if line["has_keyword"]}
-    checked_indices_near_keywords = set()
-
-    for line_idx in keyword_line_indices:
-        indices_to_check = {line_idx, line_idx - 1, line_idx + 1}
-        for check_idx in indices_to_check:
-            if 0 <= check_idx < len(parsed_lines) and check_idx not in checked_indices_near_keywords:
-                line_to_check = parsed_lines[check_idx]
-                if line_to_check["numbers"]:
-                    keyword_candidates.extend(line_to_check["numbers"])
-                checked_indices_near_keywords.add(check_idx)
-
-    if keyword_candidates:
-        unique_candidates = list(set(keyword_candidates))
-        if unique_candidates:
-            return max(unique_candidates)
-
-    print("Warning: No numbers found near keywords. Using fallback (largest overall).")
+    # --- Strategy to find the best candidate ---
+
+    # 1. Look for numbers on the SAME line as PRIORITY keywords
+    priority_candidates = []
+    for line in parsed_lines:
+        if line["has_priority_keyword"] and line["numbers"]:
+            priority_candidates.extend(line["numbers"])
+    if priority_candidates:
+        # Often the largest number on these lines is the final total
+        return max(priority_candidates)
+
+    # 2. Look for numbers on the SAME line as SECONDARY keywords
+    secondary_candidates = []
+    for line in parsed_lines:
+        if line["has_secondary_keyword"] and line["numbers"]:
+            secondary_candidates.extend(line["numbers"])
+    if secondary_candidates:
+        # If we only found secondary keywords, return the largest number found on those lines
+        # This might catch 'Net Total' or 'Total' when 'Grand Total' isn't present
+        return max(secondary_candidates)
+
+    # 3. Look near priority/secondary keywords (less reliable, might pick up tax/service charge)
+    # Consider removing or deprioritizing this 'near' logic if same-line logic is sufficient
+
+    # 4. Look for numbers on the SAME line as LOWER PRIORITY keywords (Subtotal)
+    lower_priority_candidates = []
+    for line in parsed_lines:
+        if line["has_lower_priority_keyword"] and line["numbers"]:
+            lower_priority_candidates.extend(line["numbers"])
+    # Don't return a subtotal directly unless it's the only thing found later
+
+    # 5. Fallback: largest plausible number overall (excluding subtotals if other numbers exist)
+    print("Warning: No numbers found on priority/secondary keyword lines. Using fallback.")
     all_numbers = []
+    subtotal_numbers = set(lower_priority_candidates)  # Keep track of subtotals
+
     for line in parsed_lines:
         all_numbers.extend(line["numbers"])
 
     if all_numbers:
         unique_numbers = list(set(all_numbers))
-        plausible_numbers = [n for n in unique_numbers if n < 100000 or '.' in str(n)]
-        plausible_numbers = [n for n in plausible_numbers if n >= 1.0 or '.' in str(n)]
-        if plausible_numbers:
-            return max(plausible_numbers)
+
+        # Filter out potential quantities/years/small irrelevant numbers
+        plausible_numbers = [n for n in unique_numbers if n >= 1.0 or '.' in str(n)]
+        # Filter out very large numbers unlikely to be totals unless they have decimals?
+        plausible_numbers = [n for n in plausible_numbers if n < 100000 or '.' in str(n)]
+
+        # If we have plausible numbers other than subtotals, prefer them
+        non_subtotal_plausible = [n for n in plausible_numbers if n not in subtotal_numbers]
+
+        if non_subtotal_plausible:
+            return max(non_subtotal_plausible)
+        elif plausible_numbers:  # Only subtotals (or nothing else plausible) were found
+            return max(plausible_numbers)  # Return the largest subtotal as a last resort
 
+    # 6. If still nothing, return None
     print("Warning: Could not determine main amount.")
     return None
 
 # --- Flask App Setup ---
 app = Flask(__name__)
 
+# --- REMOVED: Register the NLP Blueprint ---
+# app.register_blueprint(nlp_bp)  # No longer needed as we call the function directly
+
 # --- Initialize OCR Manager ---
 ocr_model_factory = functools.partial(PaddleOCR, lang=LANG, use_angle_cls=True, use_gpu=False, show_log=False)
 ocr_manager = PaddleOCRModelManager(num_workers=NUM_WORKERS, model_factory=ocr_model_factory)
@@ -185,20 +224,54 @@ def extract_expense():
         # Perform OCR
         ocr_result = ocr_manager.infer(temp_file_path, cls=True)
 
-        # Process results
-        if not ocr_result:
-            extracted_text = ""
-            main_amount = None
-        else:
+        # Process OCR results
+        extracted_text = ""
+        main_amount_ocr = None
+        if ocr_result:
             extracted_lines = [line[1][0] for line in ocr_result if line and len(line) > 1 and len(line[1]) > 0]
             extracted_text = "\n".join(extracted_lines)
-            main_amount = find_main_amount(ocr_result)
+            main_amount_ocr = find_main_amount(ocr_result)  # Keep OCR amount extraction
+
+        # --- NEW: Call NLP Function Directly ---
+        nlp_analysis_result = None
+        nlp_error = None
+        if extracted_text:
+            try:
+                # Call the imported analysis function
+                nlp_analysis_result = analyze_expense_text(extracted_text)
+                print(f"NLP Service Analysis Result: {nlp_analysis_result}")
+                # Check if the NLP analysis itself reported an error/failure
+                if nlp_analysis_result.get("status") == "failed":
+                    nlp_error = nlp_analysis_result.get("message", "NLP processing failed")
+                    # Keep the result structure but note the failure
+            except Exception as nlp_e:
+                nlp_error = f"Error calling NLP analysis function: {nlp_e}"
+                print(f"Error calling NLP function: {nlp_error}")
+                nlp_analysis_result = None  # Ensure result is None on exception during call
+        else:
+            nlp_error = "No text extracted from image for NLP analysis."
+        # --- End NLP Call ---
 
+        # Construct the response
         response_data = {
             "type": "photo",
             "extracted_text": extracted_text,
-            "main_amount": main_amount
+            "main_amount_ocr": main_amount_ocr,  # Amount found by OCR regex logic
+            "nlp_analysis": nlp_analysis_result,  # Include the full NLP analysis result (or None)
+            "nlp_error": nlp_error  # Include any error from NLP call/processing
         }
+
+        # Optional: Add top-level convenience fields based on successful NLP analysis
+        if nlp_analysis_result and nlp_analysis_result.get("status") == "success":
+            if nlp_analysis_result.get("action") == "add_expense":
+                response_data['confirmed_expense_details'] = nlp_analysis_result.get('details')
+                response_data['confirmation_message'] = nlp_analysis_result.get('message')
+            elif nlp_analysis_result.get("action") == "query_expense":
+                # Include query results if applicable (depends on nlp_service structure)
+                response_data['query_message'] = nlp_analysis_result.get('message')
+                response_data['query_criteria'] = nlp_analysis_result.get('criteria')
+                response_data['query_results_count'] = nlp_analysis_result.get('results_count')
+
         return jsonify(response_data)
 
     except Exception as e:
@@ -212,6 +285,13 @@ def extract_expense():
 
         return jsonify({"error": "File processing failed"}), 500
 
+# --- NEW: Health Check Endpoint ---
+@app.route('/health', methods=['GET'])
+def health_check():
+    # You could add more checks here (e.g., if OCR workers are alive)
+    return jsonify({"status": "ok"}), 200
+
+
 # --- Run the App ---
 if __name__ == '__main__':
     # Use port 7860 as expected by Hugging Face Spaces
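To make the new tiering in find_main_amount concrete: on a receipt carrying both a subtotal and a grand total, the priority tier should win. A minimal sketch with stand-in data, assuming line_info unpacks to PaddleOCR's usual [box, (text, confidence)] shape and that the unshown parsing step lower-cases the text:

    # Stand-in for PaddleOCR output; boxes omitted, text already lower-cased.
    fake_ocr = [
        [None, ("subtotal 450.00", 0.99)],
        [None, ("service charge 45.00", 0.98)],
        [None, ("grand total 495.00", 0.99)],
    ]
    print(find_main_amount(fake_ocr))  # 495.0 - the 'grand total' line beats the subtotal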
 
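For reference, the enriched response now looks roughly like this (field names come from the code above; the values here are made up):

    example_response = {
        "type": "photo",
        "extracted_text": "starbucks\nlatte 250.00\ntotal 250.00",
        "main_amount_ocr": 250.0,                  # from find_main_amount()
        "nlp_analysis": {"action": "add_expense",  # full result from analyze_expense_text()
                         "status": "success"},
        "nlp_error": None,
        # Convenience fields, present only when the NLP action was add_expense:
        "confirmed_expense_details": {"amount": 250.0, "category": "coffee"},
        "confirmation_message": "✅ Expense added: ...",
    }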
nlp_service.py ADDED
@@ -0,0 +1,611 @@
+import re
+import datetime
+import dateparser  # Still essential for interpreting date strings
+import spacy  # Import spaCy
+from flask import Blueprint, request, jsonify
+from collections import defaultdict
+import logging
+import os  # To handle potential model loading issues
+
+# --- Setup ---
+logging.basicConfig(level=logging.INFO)
+
+# --- Load spaCy Model ---
+# Using the medium model for better accuracy and word vectors (though not used explicitly yet)
+# Handle potential errors during model loading
+try:
+    # Check if running in an environment where models might be linked differently
+    # (e.g., Google Cloud Functions sometimes needs an explicit path)
+    model_name = "en_core_web_md"
+    if not spacy.util.is_package(model_name):
+        print(f"spaCy model '{model_name}' not found as package. Attempting download...")
+        spacy.cli.download(model_name)
+
+    nlp = spacy.load(model_name)
+    logging.info(f"Successfully loaded spaCy model '{model_name}'")
+except (OSError, ImportError) as e:
+    logging.error(f"Could not load spaCy model '{model_name}'. Error: {e}")
+    logging.error("Ensure the model is downloaded: python -m spacy download en_core_web_md")
+    # Fallback or exit - for now, we'll log and potentially fail later if nlp isn't loaded
+    nlp = None  # Indicate model loading failed
+
+# --- In-Memory Data Storage (Replace with Database) ---
+expenses = []
+next_expense_id = 1
+
+# --- NLP Configuration & Helpers ---
+CURRENCY_SYMBOLS = ["₹", "$", "€", "£"]  # Expand as needed
+# More robust regex to find monetary values even if spaCy misses the MONEY entity
+FALLBACK_AMOUNT_REGEX = re.compile(r'([\$€£₹]|\b(?:rs|usd|eur|gbp))\s?([\d,]+(?:\.\d{1,2})?)\b|\b([\d,]+(?:\.\d{1,2})?)\s?([\$€£₹]|\b(?:rupees|rs|dollars|euros|pounds|usd|eur|gbp))\b', re.IGNORECASE)
+
+# Category keywords remain useful
+CATEGORY_KEYWORDS = {
+    "food": ["food", "meal", "lunch", "dinner", "snack", "restaurant", "dining", "groceries", "sandwich", "burger", "pizza"],
+    "coffee": ["coffee", "latte", "cappuccino", "espresso", "cafe", "starbucks", "ccd", "café", "mocha"],
+    "travel": ["travel", "taxi", "flight", "train", "bus", "uber", "ola", "fuel", "gas", "lyft", "cab", "ticket"],
+    "shopping": ["shop", "shopping", "clothes", "electronics", "mall", "amazon", "flipkart", "purchase", "order", "store"],
+    "groceries": ["groceries", "supermarket", "zepto", "blinkit", "bigbasket", "vegetables", "milk", "market"],
+    "utilities": ["utility", "utilities", "bill", "electricity", "water", "internet", "phone", "recharge"],
+    "entertainment": ["movie", "cinema", "concert", "game", "fun", "netflix", "spotify", "tickets"],
+    "rent": ["rent", "lease"],
+    "transport": ["transport", "metro", "auto", "rickshaw", "commute"]
+}
+
+# Keywords for intent detection (less critical now; intent is inferred more from entities)
+QUERY_KEYWORDS = ["how much", "show me", "list", "what are", "total", "summary", "spending", "history", "report", "biggest", "view"]
+ADD_EXPENSE_VERBS = ["spent", "bought", "paid", "cost", "charged", "expensed", "got", "had"]  # Verbs often associated with spending
+
+
+def parse_money_entity(text, doc):
+    """
+    Extracts the amount using spaCy MONEY entities first, then falls back to regex.
+    Returns the amount as float and the identified currency symbol/code.
+    """
+    amount = None
+    currency = None
+    text = text.replace(',', '')  # Remove commas for easier parsing
+
+    # 1. Try spaCy MONEY entities first
+    money_ents = [ent for ent in doc.ents if ent.label_ == "MONEY"]
+    if money_ents:
+        # Prioritize longer entities or ones closer to verbs like 'spent' if multiple found
+        # Simple approach: take the first one for now
+        ent_text = money_ents[0].text.replace(',', '')
+        # Try to extract number and symbol/code from the entity text
+        num_match = re.search(r'([\d\.]+)', ent_text)
+        if num_match:
+            try:
+                amount = float(num_match.group(1))
+                # Try to find a known symbol or code within the entity text
+                symbol_match = re.search(r'([\$€£₹])', ent_text)
+                if symbol_match:
+                    currency = symbol_match.group(1)
+                else:
+                    # Check for codes like USD, GBP etc. (simple check)
+                    code_match = re.search(r'\b(USD|EUR|GBP|INR|RS)\b', ent_text, re.IGNORECASE)
+                    if code_match:
+                        currency = code_match.group(1).upper()
+                        # Standardize common ones
+                        if currency == "RS": currency = "INR"
+
+                # If an amount was found but no currency symbol in the entity, check doc context
+                if amount is not None and currency is None:
+                    for token in doc:
+                        if token.text in CURRENCY_SYMBOLS:
+                            currency = token.text
+                            break
+                return amount, currency
+            except ValueError:
+                pass  # Failed to convert number
+
+    # 2. Fallback regex (if spaCy missed it or parsing failed)
+    match = FALLBACK_AMOUNT_REGEX.search(text)
+    if match:
+        try:
+            if match.group(2):  # Format: $100 or Rs 100
+                amount = float(match.group(2))
+                currency_text = match.group(1)
+            elif match.group(3):  # Format: 100 dollars or 100 Rs
+                amount = float(match.group(3))
+                currency_text = match.group(4)
+            else:  # Should not happen with this regex, but safety first
+                return None, None
+
+            # Normalize currency symbol/code
+            if currency_text in CURRENCY_SYMBOLS:
+                currency = currency_text
+            else:
+                currency_text = currency_text.lower()
+                if currency_text in ["rs", "rupees"]: currency = "₹"  # Or INR
+                elif currency_text in ["dollars", "usd"]: currency = "$"  # Or USD
+                elif currency_text in ["pounds", "gbp"]: currency = "£"  # Or GBP
+                elif currency_text in ["euros", "eur"]: currency = "€"  # Or EUR
+
+            return amount, currency
+
+        except (ValueError, IndexError):
+            logging.warning(f"Regex fallback failed to parse amount from: {text}")
+            return None, None
+
+    return None, None  # No amount found
+
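The fallback pattern can be sanity-checked without the spaCy model; a minimal sketch (test strings are illustrative):

    import re

    pattern = re.compile(  # same pattern as FALLBACK_AMOUNT_REGEX above
        r'([\$€£₹]|\b(?:rs|usd|eur|gbp))\s?([\d,]+(?:\.\d{1,2})?)\b'
        r'|\b([\d,]+(?:\.\d{1,2})?)\s?([\$€£₹]|\b(?:rupees|rs|dollars|euros|pounds|usd|eur|gbp))\b',
        re.IGNORECASE)

    for text in ["paid $12.50 for lunch", "spent 450 rupees at Zepto"]:
        m = pattern.search(text.replace(',', ''))
        amount = float(m.group(2) or m.group(3))  # prefix form vs. suffix form
        currency = m.group(1) or m.group(4)
        print(amount, currency)  # 12.5 $ / 450.0 rupees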
+def parse_date_entities(doc):
+    """
+    Uses dateparser to interpret spaCy DATE entities.
+    Returns the *most likely* date found, defaulting to today.
+    """
+    dates = []
+    # Settings for dateparser: prefer past dates for expenses
+    settings = {'PREFER_DATES_FROM': 'past', 'RELATIVE_BASE': datetime.datetime.now()}
+
+    date_ents = [ent.text for ent in doc.ents if ent.label_ == "DATE"]
+    logging.debug(f"Found DATE entities: {date_ents}")
+
+    if date_ents:
+        for date_str in date_ents:
+            # Sometimes spaCy includes words like "on" or "last" in the entity; dateparser handles this
+            parsed = dateparser.parse(date_str, settings=settings)
+            if parsed:
+                dates.append(parsed.date())
+
+    if dates:
+        # Heuristic: if multiple dates were found, prefer the latest valid past date
+        # (the most recent expense)
+        past_dates = [d for d in dates if d <= datetime.date.today()]
+        if past_dates:
+            return max(past_dates)  # Return the most recent valid date
+        elif dates:
+            return min(dates)  # If only future dates were found, return the earliest (less likely for an expense)
+
+    # Fallback if no DATE entity was found or parsed
+    logging.debug("No valid DATE entity found or parsed, defaulting to today.")
+    return datetime.date.today()
+
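The PREFER_DATES_FROM setting is what makes relative phrases resolve backwards in time; for example (output depends on the current date):

    import datetime
    import dateparser

    settings = {'PREFER_DATES_FROM': 'past', 'RELATIVE_BASE': datetime.datetime.now()}
    for phrase in ["yesterday", "last friday", "march 3"]:
        parsed = dateparser.parse(phrase, settings=settings)
        print(phrase, "->", parsed.date() if parsed else None)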
+def identify_merchant_and_category(doc):
+    """
+    Identifies the merchant using ORG/PERSON/GPE entities and context.
+    Identifies the category using keywords and context around the amount/merchant.
+    """
+    merchant = None
+    category = "Uncategorized"  # Default
+
+    money_token_indices = [token.i for token in doc if token.like_num or token.text in CURRENCY_SYMBOLS or any(sym in token.text for sym in CURRENCY_SYMBOLS) or (token.ent_type_ == "MONEY")]
+
+    potential_merchants = []
+    for ent in doc.ents:
+        if ent.label_ in ["ORG", "PERSON", "GPE", "FAC"]:  # Facility might also be relevant
+            # Check context: is it preceded by "at", "from", "in"? Is it near the money amount?
+            prepositions = {"at", "from", "in", "on", "with"}
+            # Check the token before the entity start
+            if ent.start > 0 and doc[ent.start - 1].lower_ in prepositions:
+                potential_merchants.append(ent.text)
+                continue
+            # Check dependency relation (e.g., object of preposition)
+            if ent.root.head.lemma_ in prepositions:
+                potential_merchants.append(ent.text)
+                continue
+            # Check proximity to the money amount if indices are available
+            if money_token_indices:
+                min_dist = min(abs(ent.start - idx) for idx in money_token_indices)
+                if min_dist < 5:  # Arbitrary proximity threshold
+                    potential_merchants.append(ent.text)
+                    continue
+
+    if potential_merchants:
+        # Simple heuristic: choose the first likely one. Could be refined.
+        # Filter out very common words or locations if needed (e.g., "City", "Bank" if too generic)
+        merchant = potential_merchants[0].strip()
+        logging.debug(f"Identified potential merchant: {merchant} from entities {potential_merchants}")
+
+    # --- Category Identification ---
+    text_lower = doc.text.lower()
+
+    # 1. Check explicit category keywords
+    found_category = None
+    matched_keywords = []
+    for cat, keywords in CATEGORY_KEYWORDS.items():
+        if any(keyword in text_lower for keyword in keywords):
+            # If multiple categories match, prioritize based on merchant or context?
+            # Simple approach: store all matches for now
+            matched_keywords.append(cat)
+
+    if len(matched_keywords) == 1:
+        found_category = matched_keywords[0]
+    elif len(matched_keywords) > 1:
+        # Ambiguity - requires smarter logic. E.g., "Coffee at Food court" -> Coffee or Food?
+        # Prioritize based on the merchant if known, e.g. merchant Starbucks -> Coffee
+        if merchant:
+            merchant_lower = merchant.lower()
+            if "starbucks" in merchant_lower or "ccd" in merchant_lower or "café" in merchant_lower:
+                if "coffee" in matched_keywords: found_category = "coffee"
+            elif "amazon" in merchant_lower or "flipkart" in merchant_lower:
+                if "shopping" in matched_keywords: found_category = "shopping"
+            elif "zepto" in merchant_lower or "blinkit" in merchant_lower or "groceries" in merchant_lower:
+                if "groceries" in matched_keywords: found_category = "groceries"
+                elif "food" in matched_keywords: found_category = "groceries"  # Prefer specific
+
+        # If still ambiguous, pick the most specific one (e.g., prefer 'coffee' over 'food')
+        if not found_category:
+            if "coffee" in matched_keywords: found_category = "coffee"
+            elif "groceries" in matched_keywords: found_category = "groceries"
+            elif "transport" in matched_keywords: found_category = "transport"
+            # Add more specific priorities if needed
+            elif "food" in matched_keywords: found_category = "food"  # More general last
+            else: found_category = matched_keywords[0]  # Default to first match if no rules apply
+
+    if found_category:
+        category = found_category
+    # 2. (Optional/Advanced) Infer from the merchant if the category is Uncategorized
+    elif merchant and category == "Uncategorized":
+        merchant_lower = merchant.lower()
+        if "starbucks" in merchant_lower or "ccd" in merchant_lower or "café" in merchant_lower: category = "coffee"
+        elif "amazon" in merchant_lower or "flipkart" in merchant_lower: category = "shopping"
+        elif "zepto" in merchant_lower or "blinkit" in merchant_lower: category = "groceries"
+        elif "uber" in merchant_lower or "ola" in merchant_lower: category = "travel"
+        elif "netflix" in merchant_lower or "spotify" in merchant_lower: category = "entertainment"
+        # Add more merchant->category mappings
+
+    # 3. (Optional/Advanced) Use dependency parsing or word vectors,
+    # e.g. look for nouns that are objects of spending verbs near the amount.
+    # This requires more complex linguistic analysis.
+
+    logging.debug(f"Identified Category: {category}")
+    return merchant, category
+
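Assuming the model loaded, a quick check of these heuristics (entity labels, and therefore the merchant pick, can vary across model versions):

    doc = nlp("Spent ₹300 on coffee at Starbucks yesterday")
    merchant, category = identify_merchant_and_category(doc)
    print(merchant, category)  # expected: Starbucks coffee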
+def determine_intent(doc):
+    """Determines intent: 'add_expense', 'query_expense', or 'unknown'."""
+    text_lower = doc.text.lower()
+
+    has_query_keyword = any(keyword in text_lower for keyword in QUERY_KEYWORDS)
+    has_add_verb = any(token.lemma_ in ADD_EXPENSE_VERBS for token in doc if token.pos_ == "VERB")
+    has_money_entity = any(ent.label_ == "MONEY" for ent in doc.ents) or FALLBACK_AMOUNT_REGEX.search(text_lower) is not None
+
+    # More explicit questions are likely queries
+    if doc[0].pos_ == "AUX" or doc[0].lemma_ in ["what", "how", "show", "list", "view"]:  # Starts like a question
+        return "query_expense"
+
+    if has_query_keyword:
+        return "query_expense"
+
+    # If it has a spending verb and a money amount, it's likely adding an expense
+    if has_add_verb and has_money_entity:
+        return "add_expense"
+
+    # If it just has a money amount and maybe a date/merchant, it could be adding an expense (implicit verb)
+    if has_money_entity and not has_query_keyword:
+        # Check if there are nouns suggesting items bought
+        has_object_noun = any(tok.pos_ == "NOUN" and tok.dep_ in ["dobj", "pobj", "attr"] for tok in doc)
+        if has_object_noun or any(ent.label_ in ["ORG", "PRODUCT"] for ent in doc.ents):
+            return "add_expense"
+
+    # Otherwise the structure is unclear
+    return "unknown"
+
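A couple of illustrative intent checks (again requires the loaded model; results depend on the tagger):

    for msg in ["Spent $20 on lunch", "Show my food expenses last week"]:
        print(msg, "->", determine_intent(nlp(msg)))
    # expected: add_expense, query_expense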
+# --- Filtering and Formatting (largely reused, minor adjustments) ---
+
+def filter_expenses(criteria):
+    """Filters the global 'expenses' list based on criteria."""
+    # (This function remains largely the same as the previous version)
+    filtered = expenses
+
+    # Filter by Category
+    if 'category' in criteria and criteria['category'] is not None:
+        target_cat = criteria['category'].lower()
+        # Handle a general 'food' query as including 'coffee', 'groceries', etc.
+        food_related_cats = {'food', 'coffee', 'groceries', 'restaurant'}  # Define food-related categories
+        if target_cat == 'food':
+            filtered = [e for e in filtered if e['category'].lower() in food_related_cats]
+        else:
+            filtered = [e for e in filtered if e['category'].lower() == target_cat]
+
+    # Filter by Date Range (start_date and end_date are inclusive)
+    if 'start_date' in criteria and criteria['start_date'] is not None:
+        filtered = [e for e in filtered if e['date'] >= criteria['start_date']]
+    if 'end_date' in criteria and criteria['end_date'] is not None:
+        filtered = [e for e in filtered if e['date'] <= criteria['end_date']]
+
+    # Filter by Merchant (case-insensitive substring match)
+    if 'merchant' in criteria and criteria['merchant'] is not None:
+        target_merchant = criteria['merchant'].lower()
+        filtered = [e for e in filtered if e['merchant'] and target_merchant in e['merchant'].lower()]
+
+    return filtered
+
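All criteria keys are optional and None values are skipped; a hypothetical criteria dict (dates and merchant made up for illustration) might look like:

    import datetime

    criteria = {
        "category": "food",                       # 'food' also matches coffee/groceries/restaurant
        "start_date": datetime.date(2024, 1, 1),  # inclusive
        "end_date": datetime.date(2024, 1, 31),   # inclusive
        "merchant": "starbucks",                  # case-insensitive substring match
    }
    matching = filter_expenses(criteria)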
+def parse_date_range_from_query(doc):
+    """Parses date ranges specifically for queries (e.g., 'this month', 'last week')."""
+    # (This function remains largely the same, using dateparser on DATE entities or keywords)
+    today = datetime.date.today()
+    text_lower = doc.text.lower()  # Use the full text for keywords like "this month"
+    start_date, end_date = None, None
+
+    # Prioritize DATE entities found by spaCy
+    date_ents_text = [ent.text for ent in doc.ents if ent.label_ == "DATE"]
+    parsed_dates = []
+    settings = {'PREFER_DATES_FROM': 'past', 'RELATIVE_BASE': datetime.datetime.now()}
+
+    for date_str in date_ents_text:
+        # dateparser has an experimental range feature; for simplicity, parse single
+        # points here and let the keyword logic below handle ranges
+        parsed = dateparser.parse(date_str, settings=settings)
+        if parsed:
+            parsed_dates.append(parsed.date())
+
+    # If spaCy found specific dates, use them
+    if len(parsed_dates) == 1:
+        start_date = end_date = parsed_dates[0]
+    elif len(parsed_dates) > 1:
+        # Ambiguous; take the min/max and rely on the keywords below if needed
+        start_date = min(parsed_dates)
+        end_date = max(parsed_dates)
+        if start_date > end_date:  # Swap if the order is wrong
+            start_date, end_date = end_date, start_date
+
+    # If no specific date entities, check for range keywords
+    if start_date is None and end_date is None:
+        if "today" in text_lower:
+            start_date = end_date = today
+        elif "yesterday" in text_lower:
+            start_date = end_date = today - datetime.timedelta(days=1)
+        elif "this week" in text_lower:
+            start_of_week = today - datetime.timedelta(days=today.weekday())  # Monday
+            end_of_week = start_of_week + datetime.timedelta(days=6)  # Sunday
+            start_date = start_of_week
+            end_date = end_of_week
+        elif "last week" in text_lower:
+            end_of_last_week = today - datetime.timedelta(days=today.weekday() + 1)  # Last Sunday
+            start_of_last_week = end_of_last_week - datetime.timedelta(days=6)  # Last Monday
+            start_date = start_of_last_week
+            end_date = end_of_last_week
+        elif "this month" in text_lower:
+            start_date = today.replace(day=1)
+            next_month = today.replace(day=28) + datetime.timedelta(days=4)
+            last_day_of_month = next_month - datetime.timedelta(days=next_month.day)
+            end_date = last_day_of_month
+        elif "last month" in text_lower:
+            first_day_of_current_month = today.replace(day=1)
+            last_day_of_last_month = first_day_of_current_month - datetime.timedelta(days=1)
+            first_day_of_last_month = last_day_of_last_month.replace(day=1)
+            start_date = first_day_of_last_month
+            end_date = last_day_of_last_month
+        elif "year" in text_lower:  # e.g., "this year", "last year"
+            if "this year" in text_lower:
+                start_date = datetime.date(today.year, 1, 1)
+                end_date = datetime.date(today.year, 12, 31)
+            elif "last year" in text_lower:
+                start_date = datetime.date(today.year - 1, 1, 1)
+                end_date = datetime.date(today.year - 1, 12, 31)
+            # Check for a specific year like "in 2023"
+            year_match = re.search(r'\b(in|for)\s+(\d{4})\b', text_lower)
+            if year_match:
+                year = int(year_match.group(2))
+                start_date = datetime.date(year, 1, 1)
+                end_date = datetime.date(year, 12, 31)
+        else:
+            # Specific month parsing ("in January"), similar to the previous version
+            month_match = re.search(r'\b(in|for)\s+(january|february|march|april|may|june|july|august|september|october|november|december)\b', text_lower)
+            if month_match:
+                month_name = month_match.group(2)
+                year_context = today.year  # Assume the current year
+                # Check if a year was mentioned nearby
+                year_ent = [e.text for e in doc.ents if e.label_ == "DATE" and e.text.isdigit() and len(e.text) == 4]
+                if year_ent:
+                    year_context = int(year_ent[0])
+                try:
+                    month_num = [datetime.date(2000, i, 1).strftime('%B').lower() for i in range(1, 13)].index(month_name) + 1
+                    start_date = datetime.date(year_context, month_num, 1)
+                    next_m = (start_date.replace(day=28) + datetime.timedelta(days=4))
+                    end_date = next_m - datetime.timedelta(days=next_m.day)
+                except (ValueError, IndexError):
+                    pass  # Ignore invalid month/year
+
+    logging.debug(f"Parsed date range for query: {start_date} to {end_date}")
+    return start_date, end_date
+
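The day=28 trick used above is worth a worked example: adding four days to the 28th always lands in the next month, and subtracting that date's day-of-month walks back to the last day of the original month:

    import datetime

    today = datetime.date(2024, 3, 15)  # illustrative date
    next_month = today.replace(day=28) + datetime.timedelta(days=4)  # 2024-04-01
    last_day = next_month - datetime.timedelta(days=next_month.day)  # 2024-03-31
    print(today.replace(day=1), last_day)  # month range: 2024-03-01 2024-03-31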
+def format_expense_list(expense_list, title="Here are the expenses:"):
+    """Formats a list of expenses into a user-friendly string."""
+    # (This function remains largely the same)
+    if not expense_list:
+        return "No expenses found matching your criteria."
+
+    total_amount = sum(e['amount'] for e in expense_list)
+    # Use the first expense's currency symbol as the default, falling back to '₹'
+    currency_symbol = expense_list[0].get("currency") or "₹"
+
+    response_lines = [title]
+    expense_list.sort(key=lambda x: x['date'], reverse=True)
+
+    for expense in expense_list:
+        cur = expense.get("currency") or currency_symbol  # Use the expense-specific symbol or the default
+        amount_str = f"{cur}{expense['amount']:.2f}"
+        merchant_part = f" at {expense['merchant']}" if expense['merchant'] else ""
+        category_part = f" ({expense['category']})" if expense['category'] != 'Uncategorized' else ""
+        date_str = expense['date'].strftime("%b %d, %Y")
+        response_lines.append(f"- {amount_str}{category_part}{merchant_part} - {date_str}")
+
+    if len(expense_list) > 1:
+        total_str = f"{currency_symbol}{total_amount:.2f}"
+        response_lines.append(f"Total: {total_str}")
+
+    return "\n".join(response_lines)
+
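Sample output for a single illustrative expense record:

    import datetime

    sample = [{"amount": 250.0, "currency": "₹", "category": "coffee",
               "merchant": "Starbucks", "date": datetime.date(2024, 3, 14)}]
    print(format_expense_list(sample, "Recent expenses:"))
    # Recent expenses:
    # - ₹250.00 (coffee) at Starbucks - Mar 14, 2024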
+# --- NEW: Core NLP Processing Function ---
+def analyze_expense_text(text):
+    """
+    Analyzes text to extract expense details or understand queries using spaCy.
+    Returns a dictionary with action, status, and extracted details/message.
+    """
+    global next_expense_id  # Allow modification of the global counter
+
+    if nlp is None:
+        logging.error("spaCy model not loaded. Cannot process text.")
+        return {"action": "error", "status": "failed", "message": "NLP model not available"}
+
+    logging.info(f"Analyzing text: {text[:100]}...")  # Log snippet
+    doc = nlp(text)
+    logging.debug(f"spaCy Entities: {[(ent.text, ent.label_) for ent in doc.ents]}")
+
+    intent = determine_intent(doc)
+    logging.info(f"Determined Intent: {intent}")
+    response_data = {}
+
+    if intent == "add_expense":
+        amount, currency = parse_money_entity(text, doc)
+        expense_date = parse_date_entities(doc)
+        merchant, category = identify_merchant_and_category(doc)
+
+        if amount is not None:
+            currency_symbol = currency or "₹"  # Default currency
+            new_expense = {
+                "id": next_expense_id,
+                "amount": amount,
+                "currency": currency_symbol,
+                "category": category,
+                "merchant": merchant,
+                "date": expense_date,  # Keep as date object internally
+                "original_message": text
+            }
+            expenses.append(new_expense)
+            next_expense_id += 1
+            logging.info(f"Added expense (in-memory): {new_expense}")
+
+            merchant_part = f" at {merchant}" if merchant else ""
+            date_str = expense_date.strftime('%b %d, %Y')
+            confirmation_msg = f"✅ Expense added: {currency_symbol}{amount:.2f} for {category}{merchant_part} on {date_str}."
+
+            new_expense_serializable = new_expense.copy()
+            new_expense_serializable["date"] = new_expense["date"].isoformat()
+
+            response_data = {
+                "action": "add_expense",
+                "status": "success",
+                "message": confirmation_msg,
+                "details": new_expense_serializable
+            }
+        else:
+            logging.warning(f"Could not extract amount reliably from: {text}")
+            response_data = {
+                "action": "add_expense",
+                "status": "failed",
+                "message": "Sorry, I couldn't understand the amount. Please include it clearly (e.g., '₹500', '$20', '15 pounds')."
+            }
+
+    elif intent == "query_expense":
+        logging.info("Processing query intent.")
+        query_criteria = {}
+        _q_merchant, q_category = identify_merchant_and_category(doc)
+
+        # ... (rest of query criteria extraction logic remains the same) ...
+        query_cat_found = None
+        text_lower = doc.text.lower()
+        for cat, keywords in CATEGORY_KEYWORDS.items():
+            if any(keyword in text_lower for keyword in keywords):
+                if cat == 'food' or q_category == 'food':
+                    query_cat_found = 'food'
+                    break
+                query_cat_found = q_category if q_category != 'Uncategorized' else cat
+                break
+
+        query_criteria['category'] = query_cat_found
+        query_criteria['merchant'] = _q_merchant
+        start_date, end_date = parse_date_range_from_query(doc)
+        query_criteria['start_date'] = start_date
+        query_criteria['end_date'] = end_date
+
+        logging.info(f"Query Criteria: {query_criteria}")
+        results = filter_expenses(query_criteria)
+        response_message = ""
+
+        # ... (rest of query response formatting logic remains the same) ...
+        if results and ("total" in text_lower or "sum" in text_lower or "how much" in text_lower):
+            total_amount = sum(e['amount'] for e in results)
+            currency_symbol = results[0].get("currency") or "₹"
+            category_filter_text = f" on {query_criteria['category']}" if query_criteria['category'] else ""
+            date_filter_text = ""
+            if start_date and end_date and start_date == end_date: date_filter_text = f" for {start_date.strftime('%b %d, %Y')}"
+            elif start_date and end_date: date_filter_text = f" from {start_date.strftime('%b %d')} to {end_date.strftime('%b %d, %Y')}"
+            elif start_date: date_filter_text = f" since {start_date.strftime('%b %d, %Y')}"
+            elif end_date: date_filter_text = f" until {end_date.strftime('%b %d, %Y')}"
+            response_message = f"Your total spending{category_filter_text}{date_filter_text} is {currency_symbol}{total_amount:.2f}."
+            if len(results) <= 10:
+                response_message += "\n" + format_expense_list(results, "Details:")
+            else:
+                response_message += f" (from {len(results)} transactions)"
+        elif results and ("biggest" in text_lower or "largest" in text_lower or "top" in text_lower):
+            top_n = 3
+            top_expenses = sorted(results, key=lambda x: x['amount'], reverse=True)[:top_n]
+            response_message = format_expense_list(top_expenses, f"Your top {len(top_expenses)} expenses:")
+        else:
+            date_filter_desc = ""
+            if start_date and end_date and start_date == end_date: date_filter_desc = f" from {start_date.strftime('%b %d, %Y')}"
+            elif start_date or end_date: date_filter_desc = " matching the date criteria"
+            category_filter_desc = f" for {query_criteria['category']}" if query_criteria['category'] else ""
+            merchant_filter_desc = f" at {query_criteria['merchant']}" if query_criteria['merchant'] else ""
+            title = f"Expenses{category_filter_desc}{merchant_filter_desc}{date_filter_desc}:"
+            response_message = format_expense_list(results, title)
+
+        response_data = {
+            "action": "query_expense",
+            "status": "success",
+            "message": response_message,
+            "criteria": {k: v.isoformat() if isinstance(v, datetime.date) else v for k, v in query_criteria.items() if v is not None},
+            "results_count": len(results)
+        }
+
+    else:  # intent == "unknown"
+        logging.info(f"Could not determine intent for: {text}")
+        response_data = {
+            "action": "unknown",
+            "status": "failed",
+            "message": "Sorry, I couldn't quite understand that. Please try phrasing your expense or query differently. \nExamples:\n- 'Spent ₹50 on coffee yesterday at Starbucks'\n- 'Show my food expenses last week'\n- 'What was my total spending last month?'"
+        }
+
+    logging.info(f"Analysis complete. Action: {response_data.get('action')}, Status: {response_data.get('status')}")
+    return response_data
+
+
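Direct usage, mirroring the examples in the fallback message above (requires the loaded model):

    result = analyze_expense_text("Spent ₹50 on coffee yesterday at Starbucks")
    print(result["action"], result["status"])  # add_expense success (when parsing works)
    print(result["message"])                   # "✅ Expense added: ₹50.00 for coffee at Starbucks on ..."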
+# --- Flask Blueprint Setup (Optional: keep if direct API access is needed) ---
+nlp_bp = Blueprint('nlp_service', __name__)
+
+@nlp_bp.route('/process_nlp', methods=['POST'])
+def process_nlp_expense_route():
+    """Flask route handler that calls the core analysis function."""
+    data = request.get_json()
+    if not data or 'message' not in data:
+        logging.warning("Received request without 'message' field.")
+        return jsonify({"error": "Missing 'message' in request body"}), 400
+
+    user_message = data['message']
+    result = analyze_expense_text(user_message)  # Call the core function
+
+    # Determine the status code based on the result
+    status_code = 200
+    if result.get("status") == "failed":
+        status_code = 400  # Or 500 if it's an internal NLP model error
+        if result.get("message") == "NLP model not available":
+            status_code = 500
+
+    return jsonify(result), status_code
+
+
+# --- Example Usage / Testing Setup ---
+if __name__ == '__main__':
+    from flask import Flask
+
+    app = Flask(__name__)
+    app.register_blueprint(nlp_bp)  # Register the blueprint
+
+    # Dummy data removed
+
+    print("Starting Flask server for testing NLP service...")
+    # print("Registered expenses:", expenses)  # Can be long
+    if nlp is None:
+        print("WARNING: spaCy model failed to load. /process_nlp endpoint will return errors.")
+    app.run(debug=True, host='0.0.0.0', port=5001)
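A hedged client sketch for the optional blueprint route (note that `requests` was dropped from requirements.txt, so install it separately for this kind of test):

    import requests

    resp = requests.post("http://localhost:5001/process_nlp",
                         json={"message": "Spent ₹50 on coffee yesterday"})
    print(resp.status_code, resp.json())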
requirements.txt CHANGED
@@ -1,5 +1,9 @@
 Pillow
 flask
-requests
+# requests  # Removed as NLP is called directly now
 paddlepaddle
-paddleocr
+paddleocr
+spacy>=3.0.0  # Added spaCy
+dateparser>=1.0.0  # Added dateparser
+# Note: spaCy model 'en_core_web_md' needs to be downloaded separately:
+# python -m spacy download en_core_web_md
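Because the model isn't a pip requirement, one way to make it available at startup is the same load-or-download pattern nlp_service.py already uses:

    import spacy

    if not spacy.util.is_package("en_core_web_md"):
        spacy.cli.download("en_core_web_md")
    nlp = spacy.load("en_core_web_md")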