# ClearSpend / utils.py
# MonilM - "Improved NLP Logic" (commit 07b50c0)
import re
import json
from config import FALLBACK_AMOUNT_REGEX, CURRENCY_SYMBOLS # Import regex and symbols
# Currency glyphs that may appear inside a matched token, checked in this order.
_CURRENCY_GLYPHS = (("₹", "INR"), ("$", "USD"), ("€", "EUR"), ("£", "GBP"))

# Lower-cased currency codes/words mapped to the currency code that is returned.
_CURRENCY_WORDS = {
    "inr": "INR", "rs": "INR", "rupees": "INR",
    "usd": "USD", "dollars": "USD",
    "eur": "EUR", "euros": "EUR",
    "gbp": "GBP", "pounds": "GBP",
}

# NER entity groups whose words may contribute to the item description.
_ITEM_ENTITY_GROUPS = frozenset({
    "MISC", "ORG", "PRODUCT", "EVENT", "WORK_OF_ART",
    "LOC", "PER", "CARDINAL", "QUANTITY",
})


def _normalize_currency(token):
    """Map a currency symbol or code/word to "INR"/"USD"/"EUR"/"GBP", else None."""
    if not token:
        return None
    for glyph, iso_code in _CURRENCY_GLYPHS:
        if glyph in token:
            return iso_code
    # The token may be a code or word ("EUR", "rs", "dollars") regardless of
    # whether it was written before or after the number.
    return _CURRENCY_WORDS.get(token.strip().rstrip(".").lower())


def _collect_amounts(text):
    """Return a dict for every plausible amount FALLBACK_AMOUNT_REGEX finds in *text*."""
    # Regex groups (as documented for the pattern):
    #   1: symbol/code before the number ($, EUR, etc.)
    #   2: number when the symbol/code is before
    #   3: number when the symbol/code is after
    #   4: symbol/code after the number (rs, dollars, etc.)
    #   5: standalone number
    found = []
    for match in FALLBACK_AMOUNT_REGEX.finditer(text):
        num_str = None
        currency_token = None
        if match.group(1) and match.group(2):
            currency_token, num_str = match.group(1), match.group(2)
        elif match.group(3) and match.group(4):
            num_str, currency_token = match.group(3), match.group(4)
        elif match.group(5) and not match.group(1) and not match.group(4):
            num_str = match.group(5)

        if not num_str:
            continue
        try:
            value = float(num_str.replace(",", ""))
        except ValueError:
            continue  # Ignore invalid numbers like "1,2,3"

        # Basic validation: avoid huge numbers unless they have decimals
        # (large integers might be IDs rather than amounts).
        if value < 1_000_000 or "." in num_str:
            found.append({
                "value": value,
                "currency_token": currency_token,
                "text": match.group(0),  # full matched string, used to filter entities
            })
    return found


def _pick_amount_and_currency(potential_amounts):
    """Choose the (amount, currency) pair from the collected regex matches."""
    if not potential_amounts:
        return None, None

    # Prioritize matches that carried a currency symbol/code; among those (or
    # among all matches when none has one) the largest value wins.
    with_currency = [p for p in potential_amounts if p["currency_token"]]
    best = max(with_currency or potential_amounts, key=lambda p: p["value"])
    return best["value"], _normalize_currency(best["currency_token"])


def _build_item(entities, potential_amounts):
    """Join the words of item-like NER entities into a description, or return None."""
    if not entities:
        return None

    amount_texts = {p["text"] for p in potential_amounts}
    parts = []
    for entity in entities:
        group = entity.get("entity_group", "")
        word = entity.get("word", "")
        # Skip words that are a detected amount or just a currency symbol.
        if word in amount_texts or word in CURRENCY_SYMBOLS:
            continue
        # MONEY entities are already handled by the regex. CARDINAL/QUANTITY
        # are kept when they were not a regex match (e.g. "2 coffees").
        if "MONEY" in group or group not in _ITEM_ENTITY_GROUPS:
            continue
        # Clean up sub-word token markers inside the word.
        cleaned = word.replace(" ##", "").strip()
        if cleaned:
            parts.append(cleaned)

    if not parts:
        return None
    # Remove leading/trailing non-word characters from the joined description.
    return re.sub(r"^[^\w]+|[^\w]+$", "", " ".join(parts).strip())


def parse_entities(entities, full_text: str):
    """
    Extract amount, currency, and item description from NER entities and full text.

    The amount and currency come from FALLBACK_AMOUNT_REGEX matches over
    ``full_text``; the item description is assembled from the NER entities.
    A currency symbol or code is recognised whether it appears before or after
    the number (e.g. "$5", "USD 5", "5 dollars").

    Args:
        entities: List of dictionaries from the NER pipeline; each is read
            through its "entity_group" and "word" keys. May be None or empty.
        full_text: The original input text string. None is treated as "".

    Returns:
        A tuple: (amount, currency, item)
        - amount: float value of the chosen match, or None if no number was found.
        - currency: "INR", "USD", "EUR" or "GBP", or None if it is not recognised.
        - item: description string built from the entities, or None.
    """
    potential_amounts = _collect_amounts(full_text or "")
    amount, currency = _pick_amount_and_currency(potential_amounts)
    item = _build_item(entities, potential_amounts)

    # If an amount is found but currency is None, no default currency is applied.
    print(f"Utils: Parsed-> Amount: {amount}, Currency: {currency}, Item: {item}")
    return amount, currency, item
# --- Gemini response parsing ---
def parse_gemini_response(response_text):
    """
    Parses a structured string response from Gemini (expected JSON-like).

    The response may be wrapped in a Markdown code fence, with or without a
    ``json`` language tag (in any letter case); the fence is removed before
    the text is decoded.

    Example expected format:
    "{ \"type\": \"expense\", \"category\": \"Food\", \"amount\": 5.50, \"currency\": \"USD\", \"item\": \"coffee\" }"

    Args:
        response_text: Raw text of the model response.

    Returns:
        The decoded JSON value (a dict for the expected format), or None when
        the text cannot be parsed.
    """
    try:
        # Strip an opening ``` / ```json fence and a closing ``` fence. The
        # language tag is optional and case-insensitive so that bare fences
        # and "```JSON" are unwrapped too.
        response_text = re.sub(
            r"^```(?:json)?\s*|\s*```$",
            "",
            response_text.strip(),
            flags=re.IGNORECASE,
        )
        return json.loads(response_text)
    except json.JSONDecodeError:
        print(f"Warning: Could not parse Gemini response: {response_text}")
        return None
    except Exception as e:
        # Best-effort boundary: any other failure (e.g. a non-string input)
        # is reported and turned into None rather than propagated.
        print(f"Error parsing Gemini response: {e}")
        return None