ClearSpend

Running

App Files Files Community

ClearSpend / utils.py

MonilM

Improved NLP Logic

07b50c0 15 days ago

raw

history blame contribute delete

6.16 kB

	import re
	import json
	from config import FALLBACK_AMOUNT_REGEX, CURRENCY_SYMBOLS # Import regex and symbols

	# Currency glyphs looked for (by containment) inside a matched currency token.
	_SYMBOL_TO_CURRENCY = (("₹", "INR"), ("$", "USD"), ("€", "EUR"), ("£", "GBP"))

	# Lower-cased textual currency codes/words mapped to their ISO code.
	_CODE_TO_CURRENCY = {
		"inr": "INR",
		"rs": "INR",
		"rupees": "INR",
		"usd": "USD",
		"dollars": "USD",
		"eur": "EUR",
		"euros": "EUR",
		"gbp": "GBP",
		"pounds": "GBP",
	}

	# NER labels whose words are allowed to contribute to the item description.
	_ITEM_ENTITY_GROUPS = (
		"MISC", "ORG", "PRODUCT", "EVENT", "WORK_OF_ART",
		"LOC", "PER", "CARDINAL", "QUANTITY",
	)


	def _currency_from_token(token):
		"""Map a matched currency glyph or code (e.g. "$", "EUR", "rs") to an ISO code, or None."""
		if not token:
			return None
		for glyph, iso_code in _SYMBOL_TO_CURRENCY:
			if glyph in token:
				return iso_code
		# The token may be a textual code regardless of whether it appeared
		# before or after the number; tolerate an abbreviation dot ("Rs.").
		return _CODE_TO_CURRENCY.get(token.strip().rstrip(".").lower())


	def _collect_amount_candidates(full_text):
		"""Return one dict per plausible amount that FALLBACK_AMOUNT_REGEX finds in full_text."""
		candidates = []
		# Group layout expected from FALLBACK_AMOUNT_REGEX (defined in config):
		#   1: symbol/code before the number    2: number following that symbol/code
		#   3: number preceding a symbol/code   4: symbol/code after the number
		#   5: standalone number
		for match in FALLBACK_AMOUNT_REGEX.finditer(full_text):
			num_str = None
			curr_symbol = None
			curr_code = None

			if match.group(1) and match.group(2):
				curr_symbol = match.group(1)
				num_str = match.group(2)
			elif match.group(3) and match.group(4):
				num_str = match.group(3)
				curr_code = match.group(4)
			elif match.group(5) and not match.group(1) and not match.group(4):
				num_str = match.group(5)

			if not num_str:
				continue

			try:
				value = float(num_str.replace(",", ""))
			except ValueError:
				# Not a parsable number even after removing thousands separators.
				continue

			# Very large integers are more likely IDs than prices; a decimal
			# point is taken as a sign that the number really is an amount.
			if value < 1_000_000 or "." in num_str:
				candidates.append({
					"value": value,
					"currency_symbol": curr_symbol,
					"currency_code": curr_code,
					"match_obj": match,  # kept so the matched text can be compared with NER words
				})
		return candidates


	def _extract_item(entities, candidates):
		"""Join the non-monetary NER entity words into an item description, or return None."""
		if not entities:
			return None

		# Exact strings the regex matched as amounts; NER words equal to one of
		# these are the amount itself, not part of the item.
		amount_texts = {c["match_obj"].group(0) for c in candidates}

		parts = []
		for entity in entities:
			entity_group = entity.get("entity_group", "")
			word = entity.get("word", "")

			if word in amount_texts or word in CURRENCY_SYMBOLS:
				continue
			# MONEY entities are already covered by the regex pass.
			if "MONEY" in entity_group:
				continue
			if entity_group in _ITEM_ENTITY_GROUPS:
				# Merge sub-word pieces such as "runn ##ing" back together.
				cleaned_word = word.replace(" ##", "").strip()
				if cleaned_word:
					parts.append(cleaned_word)

		if not parts:
			return None

		# Strip punctuation from both ends. The two alternatives must be joined
		# by a real regex alternation; an escaped pipe would only match a
		# literal "|" and leave the punctuation in place.
		item = re.sub(r"^\W+|\W+$", "", " ".join(parts).strip())
		return item or None


	def parse_entities(entities, full_text: str):
		"""
		Extract the amount, currency and item description of an expense.

		The amount and currency come from running FALLBACK_AMOUNT_REGEX over
		the full text; the item is assembled from the NER entities that are
		neither amounts nor currency symbols.

		Args:
			entities: List of dictionaries from the NER pipeline. Each entry is
				read through its "entity_group" and "word" keys. May be empty
				or None, in which case no item is produced.
			full_text: The original input text string.

		Returns:
			A tuple (amount, currency, item):
			amount is the chosen value as a float, or None if no number matched;
			currency is "INR", "USD", "EUR" or "GBP", or None if none was
			recognised; item is the joined entity words, or None if nothing
			usable was found.
		"""
		amount, currency = None, None

		candidates = _collect_amount_candidates(full_text)
		if candidates:
			# Prefer matches that carry a currency marker; within the chosen
			# pool the largest value is assumed to be the main amount.
			with_currency = [
				c for c in candidates
				if c["currency_symbol"] or c["currency_code"]
			]
			best_match = max(with_currency or candidates, key=lambda c: c["value"])
			amount = best_match["value"]
			currency = _currency_from_token(
				best_match["currency_symbol"] or best_match["currency_code"]
			)

		item = _extract_item(entities, candidates)

		# No default currency is applied when only an amount was found.
		print(f"Utils: Parsed-> Amount: {amount}, Currency: {currency}, Item: {item}")
		return amount, currency, item

	# Helper for decoding the JSON payload returned by Gemini.
	def parse_gemini_response(response_text):
		"""
		Parse a JSON response string from Gemini into a Python object.

		The text may be bare JSON or JSON wrapped in a Markdown code fence
		(an opening ```json or ``` line and a closing ``` line); the fence is
		removed before decoding.

		Example of an expected payload:
			{ "type": "expense", "category": "Food", "amount": 5.50, "currency": "USD", "item": "coffee" }

		Args:
			response_text: The raw response string.

		Returns:
			The decoded JSON value, or None if the text is not valid JSON or
			any other error occurs (a message is printed in both cases).
		"""
		try:
			# Remove an opening fence (with optional "json" tag) and a closing
			# fence, together with the whitespace next to them. The two
			# alternatives need a real "|" alternation to match independently.
			response_text = re.sub(
				r"^```(?:json)?\s*|\s*```$",
				"",
				response_text.strip(),
				flags=re.IGNORECASE,
			)
			data = json.loads(response_text)
			return data
		except json.JSONDecodeError:
			print(f"Warning: Could not parse Gemini response: {response_text}")
			return None
		except Exception as e:
			# Best-effort boundary: e.g. a non-string input ends up here.
			print(f"Error parsing Gemini response: {e}")
			return None