# app.py
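#
# Context-aware translation demo: input text is translated with SeamlessM4T,
# named entities are extracted from the English version, and each entity is
# enriched with OpenStreetMap / Wikipedia lookups before the result is
# returned as JSON through a Gradio interface.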

import os
import requests
import wikipedia
import gradio as gr
import torch

from functools import lru_cache
from concurrent.futures import ThreadPoolExecutor
from typing import List

from transformers import (
    SeamlessM4TTokenizer,
    SeamlessM4TProcessor,
    SeamlessM4TForTextToText,
    pipeline as hf_pipeline
)

# ── 1) Model setup ────────────────────────────────────────────────────────────

MODEL = "facebook/hf-seamless-m4t-medium"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = SeamlessM4TTokenizer.from_pretrained(MODEL, use_fast=False)
processor = SeamlessM4TProcessor.from_pretrained(MODEL, tokenizer=tokenizer)

m4t_model = SeamlessM4TForTextToText.from_pretrained(MODEL).to(device)
if device == "cuda":
    m4t_model = m4t_model.half()   # FP16 for faster inference on GPU
m4t_model.eval()

def translate_m4t(text: str, src_iso3: str, tgt_iso3: str, auto_detect=False) -> str:
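    """Translate one string between ISO 639-3 language codes with SeamlessM4T.

    Note: when auto_detect is True, src_lang is simply left unset and the
    processor falls back to its default source language; SeamlessM4T itself
    does not perform language identification.
    """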
    src = None if auto_detect else src_iso3
    inputs = processor(text=text, src_lang=src, return_tensors="pt").to(device)
    tokens = m4t_model.generate(**inputs, tgt_lang=tgt_iso3)
    return processor.decode(tokens[0].tolist(), skip_special_tokens=True)

def translate_m4t_batch(
    texts: List[str], src_iso3: str, tgt_iso3: str, auto_detect=False
) -> List[str]:
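    """Translate a list of strings in a single padded generate() call.

    Generation is capped (max_new_tokens=60, greedy num_beams=1) to keep
    latency low for short strings such as place names and summaries.
    """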
    src = None if auto_detect else src_iso3
    inputs = processor(
        text=texts, src_lang=src, return_tensors="pt", padding=True
    ).to(device)
    tokens = m4t_model.generate(
        **inputs,
        tgt_lang=tgt_iso3,
        max_new_tokens=60,
        num_beams=1
    )
    return processor.batch_decode(tokens, skip_special_tokens=True)


# ── 2) NER pipeline (updated for deprecation) ────────────────────────────────

ner = hf_pipeline(
    "ner",
    model="dslim/bert-base-NER-uncased",
    aggregation_strategy="simple"
)
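# aggregation_strategy="simple" merges word-piece tokens into whole entity
# spans, replacing the deprecated grouped_entities=True flag.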


# ── 3) CACHING helpers ──────────────────────────────────────────────────────

@lru_cache(maxsize=256)
def geocode_cache(place: str):
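    """Resolve a place name to {"lat", "lon"} via Nominatim; results are cached."""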
    r = requests.get(
        "https://nominatim.openstreetmap.org/search",
        params={"q": place, "format": "json", "limit": 1},
        headers={"User-Agent": "iVoiceContext/1.0"}
    ).json()
    if not r:
        return None
    return {"lat": float(r[0]["lat"]), "lon": float(r[0]["lon"])}

@lru_cache(maxsize=256)
def fetch_osm_cache(lat: float, lon: float, osm_filter: str, limit: int = 5):
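    """Query the Overpass API for named nodes/ways matching osm_filter within
    roughly 1 km of (lat, lon), returning at most `limit` names.
    """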
    payload = f"""
      [out:json][timeout:25];
      (
        node{osm_filter}(around:1000,{lat},{lon});
        way{osm_filter}(around:1000,{lat},{lon});
      );
      out center {limit};
    """
    resp = requests.post(
        "https://overpass-api.de/api/interpreter",
        data={"data": payload}
    )
    elems = resp.json().get("elements", [])
    return [
        {"name": e["tags"]["name"]}
        for e in elems
        if e.get("tags", {}).get("name")
    ]

@lru_cache(maxsize=256)
def wiki_summary_cache(name: str) -> str:
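    """Return a two-sentence Wikipedia summary, or a fallback message on error."""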
    try:
        return wikipedia.summary(name, sentences=2)
    except Exception:
        return "No summary available."


# ── 4) Per-entity worker ────────────────────────────────────────────────────

def process_entity(ent) -> dict:
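    """Enrich one NER entity.

    Locations (LOC) are geocoded and paired with nearby restaurants and
    attractions from OSM; all other entity types get a Wikipedia summary.
    """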
    w = ent["word"]
    lbl = ent["entity_group"]

    if lbl == "LOC":
        geo = geocode_cache(w)
        if not geo:
            return {
                "text": w,
                "label": lbl,
                "type": "location",
                "error": "could not geocode"
            }

        restaurants = fetch_osm_cache(geo["lat"], geo["lon"], '["amenity"="restaurant"]')
        attractions = fetch_osm_cache(geo["lat"], geo["lon"], '["tourism"="attraction"]')

        return {
            "text": w,
            "label": lbl,
            "type": "location",
            "geo": geo,
            "restaurants": restaurants,
            "attractions": attractions
        }

    # PERSON / ORG / MISC → Wikipedia
    summary = wiki_summary_cache(w)
    return {"text": w, "label": lbl, "type": "wiki", "summary": summary}


# ── 5) Main function ────────────────────────────────────────────────────────

def get_context(
    text: str,
    source_lang: str,
    output_lang: str,
    auto_detect: bool
):
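    """End-to-end pipeline: translate the input to English if needed, run NER,
    enrich each unique entity in parallel, and batch-translate the textual
    fields into output_lang when it differs from English.
    """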
    # a) Ensure English for NER
    if auto_detect or source_lang != "eng":
        en = translate_m4t(text, source_lang, "eng", auto_detect=auto_detect)
    else:
        en = text

    # b) Run NER + dedupe
    ner_out = ner(en)
    seen = set()
    unique_ents = []
    for ent in ner_out:
        w = ent["word"]
        if w in seen:
            continue
        seen.add(w)
        unique_ents.append(ent)

    # c) Parallel I/O
    entities = []
    with ThreadPoolExecutor(max_workers=8) as exe:
        futures = [exe.submit(process_entity, ent) for ent in unique_ents]
        for fut in futures:
            entities.append(fut.result())

    # d) Batch-translate non-English fields
    if output_lang != "eng":
        to_translate = []
        translations_info = []

        for i, e in enumerate(entities):
            if e["type"] == "wiki":
                translations_info.append(("summary", i))
                to_translate.append(e["summary"])
            elif e["type"] == "location":
                for j, r in enumerate(e["restaurants"]):
                    translations_info.append(("restaurant", i, j))
                    to_translate.append(r["name"])
                for j, a in enumerate(e["attractions"]):
                    translations_info.append(("attraction", i, j))
                    to_translate.append(a["name"])

        # Guard against an empty batch (no translatable fields found).
        translated = translate_m4t_batch(to_translate, "eng", output_lang) if to_translate else []

        for txt, info in zip(translated, translations_info):
            kind = info[0]
            if kind == "summary":
                _, ei = info
                entities[ei]["summary"] = txt
            elif kind == "restaurant":
                _, ei, ri = info
                entities[ei]["restaurants"][ri]["name"] = txt
            elif kind == "attraction":
                _, ei, ai = info
                entities[ei]["attractions"][ai]["name"] = txt

    return {"entities": entities}


# ── 6) Gradio interface ─────────────────────────────────────────────────────

iface = gr.Interface(
    fn=get_context,
    inputs=[
        gr.Textbox(lines=3, placeholder="Enter text…"),
        gr.Textbox(label="Source Language (ISO 639-3)"),
        gr.Textbox(label="Target Language (ISO 639-3)"),
        gr.Checkbox(label="Auto-detect source language")
    ],
    outputs="json",
    title="iVoice Context-Aware",
    description="Returns only the detected entities and their related info."
).queue()

if __name__ == "__main__":
    iface.launch(
        server_name="0.0.0.0",
        server_port=int(os.environ.get("PORT", 7860)),
        share=True
    )