Spaces:
Running
Running
import re | |
import json | |
from config import FALLBACK_AMOUNT_REGEX, CURRENCY_SYMBOLS # Import regex and symbols | |
# Currency symbols that may appear inside a captured token, mapped to ISO codes.
# Checked by substring so tokens such as "US$" still resolve.
_CURRENCY_SYMBOL_TO_ISO = (
    ("₹", "INR"),
    ("$", "USD"),
    ("€", "EUR"),
    ("£", "GBP"),
)

# Currency codes/words (lower-cased, surrounding dots removed) mapped to ISO codes.
_CURRENCY_WORD_TO_ISO = {
    "inr": "INR",
    "rs": "INR",
    "rupee": "INR",
    "rupees": "INR",
    "usd": "USD",
    "dollar": "USD",
    "dollars": "USD",
    "eur": "EUR",
    "euro": "EUR",
    "euros": "EUR",
    "gbp": "GBP",
    "pound": "GBP",
    "pounds": "GBP",
}

# NER entity groups that are allowed to contribute to the item description.
# "MONEY" is deliberately absent: amounts are handled by the regex pass.
_ITEM_ENTITY_GROUPS = frozenset({
    "MISC", "ORG", "PRODUCT", "EVENT", "WORK_OF_ART",
    "LOC", "PER", "CARDINAL", "QUANTITY",
})


def _currency_from_token(token):
    """Return the ISO code for a currency symbol/code token, or None if unknown."""
    if not token:
        return None
    for symbol, iso_code in _CURRENCY_SYMBOL_TO_ISO:
        if symbol in token:
            return iso_code
    # Not a symbol: treat it as a code/word. This also covers codes written
    # *before* the number ("EUR 20", "Rs. 500"), which used to be dropped.
    return _CURRENCY_WORD_TO_ISO.get(token.strip().strip(".").lower())


def _collect_amount_candidates(full_text):
    """Return one candidate dict per usable FALLBACK_AMOUNT_REGEX match in the text."""
    if not full_text:
        return []

    candidates = []
    # Group layout, per this file's notes on FALLBACK_AMOUNT_REGEX (defined in config):
    #   1: symbol/code before the number    2: number following that symbol/code
    #   3: number preceding a symbol/code   4: symbol/code after the number
    #   5: standalone number
    for match in FALLBACK_AMOUNT_REGEX.finditer(full_text):
        num_str = None
        curr_symbol = None
        curr_code = None
        if match.group(1) and match.group(2):
            curr_symbol = match.group(1)
            num_str = match.group(2)
        elif match.group(3) and match.group(4):
            num_str = match.group(3)
            curr_code = match.group(4)
        elif match.group(5) and not match.group(1) and not match.group(4):
            num_str = match.group(5)

        if not num_str:
            continue

        try:
            value = float(num_str.replace(",", ""))
        except ValueError:
            # Skip captures that are still not a valid float once the
            # thousands separators have been removed.
            continue

        # Very large integers are more likely IDs than prices; keep them only
        # when they carry a decimal point.
        if value < 1_000_000 or "." in num_str:
            candidates.append({
                "value": value,
                "currency_symbol": curr_symbol,
                "currency_code": curr_code,
                "match_obj": match,  # kept so the matched text can be excluded from the item
            })
    return candidates


def parse_entities(entities, full_text: str):
    """
    Extracts amount, currency, and item description from NER entities and full text.

    The amount and currency come from FALLBACK_AMOUNT_REGEX matches on the full
    text: the largest value that has a currency symbol/code attached wins; if no
    match carries a currency, the largest plain number is used and the currency
    stays None. The item is built by joining the words of NER entities whose
    group is in the allowed set, skipping words that equal a matched amount
    string or appear in CURRENCY_SYMBOLS.

    Args:
        entities: List of dictionaries from the NER pipeline; the keys read
            here are "entity_group" and "word". May be None or empty.
        full_text: The original input text string. None or "" yields no amount.

    Returns:
        A tuple: (amount, currency, item). amount is a float or None, currency
        is one of "INR"/"USD"/"EUR"/"GBP" or None, item is a non-empty string
        or None.
    """
    amount, currency = None, None
    potential_amounts = _collect_amount_candidates(full_text)

    # 1. Determine amount and currency from the regex matches.
    if potential_amounts:
        currency_matches = [
            p for p in potential_amounts
            if p["currency_symbol"] or p["currency_code"]
        ]
        # Prefer matches with a currency attached; otherwise fall back to all numbers.
        pool = currency_matches or potential_amounts
        best_match = max(pool, key=lambda p: p["value"])
        amount = best_match["value"]
        # Only one of the two fields is ever set for a given match.
        currency = _currency_from_token(
            best_match["currency_symbol"] or best_match["currency_code"]
        )

    # 2. Build the item description from the NER entities.
    item_parts = []
    if entities:
        # Exact strings the regex matched, so they are not repeated in the item.
        amount_texts = {p["match_obj"].group(0) for p in potential_amounts}
        for entity in entities:
            entity_group = entity.get("entity_group") or ""
            word = entity.get("word") or ""
            # Only allowed groups describe the item; this also drops MONEY entities.
            if entity_group not in _ITEM_ENTITY_GROUPS:
                continue
            if word in amount_texts or word in CURRENCY_SYMBOLS:
                continue
            # Re-attach sub-word pieces such as "runn ##ing".
            cleaned_word = word.replace(" ##", "").strip()
            if cleaned_word:
                item_parts.append(cleaned_word)

    item = None
    if item_parts:
        joined = " ".join(item_parts).strip()
        # Trim leading/trailing punctuation; an item that was only punctuation
        # is reported as None rather than as an empty string.
        item = re.sub(r"^[^\w]+|[^\w]+$", "", joined) or None

    print(f"Utils: Parsed-> Amount: {amount}, Currency: {currency}, Item: {item}")
    return amount, currency, item
# Parsing of structured (JSON) responses returned by Gemini.
def parse_gemini_response(response_text):
    """
    Parses a structured string response from Gemini (expected JSON-like).

    The text may be wrapped in a Markdown code fence. The opening fence is
    accepted with or without a language tag and in any letter case
    ("```json", "```JSON" or a bare "```"); both fences are removed before
    decoding.

    Example expected format:
    "{ \"type\": \"expense\", \"category\": \"Food\", \"amount\": 5.50, \"currency\": \"USD\", \"item\": \"coffee\" }"

    Args:
        response_text: The raw response string.

    Returns:
        The decoded JSON value (a dict for the format above), or None when the
        text is not valid JSON or the input cannot be processed (e.g. None).
        A message is printed in either failure case.
    """
    try:
        # Strip an optional leading ```/```json fence and a trailing ``` fence.
        response_text = re.sub(
            r"^```(?:json)?\s*|\s*```$",
            "",
            response_text.strip(),
            flags=re.IGNORECASE,
        )
        return json.loads(response_text)
    except json.JSONDecodeError:
        print(f"Warning: Could not parse Gemini response: {response_text}")
        return None
    except Exception as e:
        # Best-effort boundary: non-string input (e.g. None) ends up here.
        print(f"Error parsing Gemini response: {e}")
        return None