Spaces:

Yeetek
/

insightflowv2

Runtime error

App Files Files Community

Yeetek committed on Jun 6

Commit

7b5322f

verified ·

1 Parent(s): efa5b1a

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -8

app.py CHANGED Viewed

@@ -1,25 +1,29 @@
-# ---------- BEGIN app.py (diagnostic build) ----------
 import os, sys, json, uuid
-#  DEBUG – capture the first 20 env-vars Hugging Face passes in
 print("ENV-snapshot:", json.dumps(dict(list(os.environ.items())[:20])))
-sys.stdout.flush()          # make sure it appears in HF build logs
-#  Optional: leave the numba lines in place for the real fix
-os.environ["NUMBA_DISABLE_CACHE"] = "1"
-os.environ.setdefault("NUMBA_CACHE_DIR", "/tmp/numba")
 os.makedirs(os.environ["NUMBA_CACHE_DIR"], exist_ok=True)
 from typing import List
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from bertopic import BERTopic
 from sentence_transformers import SentenceTransformer
 MODEL_NAME = os.getenv("EMBED_MODEL", "Seznam/simcse-small-e-czech")
 MIN_TOPIC  = int(os.getenv("MIN_TOPIC_SIZE", "10"))
 MAX_DOCS   = int(os.getenv("MAX_DOCS", "5000"))
 embeddings  = SentenceTransformer(MODEL_NAME)
 topic_model = BERTopic(
     embedding_model=embeddings,
@@ -27,7 +31,7 @@ topic_model = BERTopic(
     calculate_probabilities=True,
 )
-# ----- FastAPI schema & endpoint (unchanged) -----
 class Sentence(BaseModel):
     text: str
     start: float
@@ -36,4 +40,56 @@ class Sentence(BaseModel):
 class Segment(BaseModel):
     topic_id: int
-    label:

+# ---------- BEGIN app.py ----------
 import os, sys, json, uuid
+# ── 1. Diagnostics ────────────────────────────────────────────────────────────
+# Log the first 20 environment variables to the Space logs (delete later).
+# Secrets and tokens are passed to the process as environment variables, so
+# the value of any secret-looking name is masked before it reaches the logs.
+_SECRET_MARKERS = ("TOKEN", "SECRET", "KEY", "PASSWORD", "PASSWD", "CREDENTIAL")
+_env_snapshot = {
+    name: "<redacted>" if any(m in name.upper() for m in _SECRET_MARKERS) else value
+    for name, value in list(os.environ.items())[:20]
+}
+# flush=True makes the line show up in the logs immediately.
+print("ENV-snapshot:", json.dumps(_env_snapshot), flush=True)
+# ── 2. Numba cache workaround (must run BEFORE bertopic import) ───────────────
+# The variables are set here, ahead of the heavy imports below, presumably
+# because bertopic's dependencies import numba, which picks up its settings
+# from the environment.
+# NOTE(review): NUMBA_CACHE_DIR is a documented Numba setting; confirm that
+# NUMBA_DISABLE_CACHE is actually honoured by the installed Numba version.
+os.environ["NUMBA_DISABLE_CACHE"] = "1"                   # hard off-switch
+os.environ.setdefault("NUMBA_CACHE_DIR", "/tmp/numba")    # fallback dir
+# Create the cache directory up front (no error if it already exists).
 os.makedirs(os.environ["NUMBA_CACHE_DIR"], exist_ok=True)
+# ── 3. Heavy imports ──────────────────────────────────────────────────────────
 from typing import List
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from bertopic import BERTopic
 from sentence_transformers import SentenceTransformer
+# ── 4. Configuration via env vars ─────────────────────────────────────────────
+# Each setting falls back to the literal default when its variable is unset.
+# The int() conversions raise ValueError at import time on a malformed value.
+# EMBED_MODEL: model name handed to SentenceTransformer below.
 MODEL_NAME = os.getenv("EMBED_MODEL", "Seznam/simcse-small-e-czech")
+# MIN_TOPIC_SIZE: not referenced in the visible excerpt; presumably passed to
+# BERTopic as its minimum topic size (verify against the full file).
 MIN_TOPIC  = int(os.getenv("MIN_TOPIC_SIZE", "10"))
+# MAX_DOCS: upper bound on the number of sentences accepted by /segment.
 MAX_DOCS   = int(os.getenv("MAX_DOCS", "5000"))
+# ── 5. Initialise models once at container start ─────────────────────────────
 embeddings  = SentenceTransformer(MODEL_NAME)
 topic_model = BERTopic(
     embedding_model=embeddings,
     calculate_probabilities=True,
 )
+# ── 6. Pydantic schemas ──────────────────────────────────────────────────────
 class Sentence(BaseModel):
     text: str
     start: float
 class Segment(BaseModel):
+    # One contiguous run of sentences that were assigned the same topic id.
     topic_id: int
+    label: str | None            # space-joined top keywords; None for topic -1
+    keywords: list[str]
+    start: float
+    end: float
+    probability: float | None
+    sentences: list[int]         # indices into the request's sentence list
+class SegmentationResponse(BaseModel):
+    # Response envelope of POST /segment.
+    run_id: str                  # identifier generated per request
+    segments: list[Segment]
+# ── 7. FastAPI app and endpoint ──────────────────────────────────────────────
+# Application object on which the /segment route is registered.
+app = FastAPI(
+    title="CZ Topic Segmenter",
+    version="1.0",
+)
+def _topic_probability(prob, topic_id):
+    """Return a scalar probability for one document's topic assignment.
+
+    ``prob`` is one element of the probabilities returned by
+    ``fit_transform``: a scalar, a per-topic row (the model is built with
+    ``calculate_probabilities=True``), or ``None``. Rows are indexed by
+    ``topic_id``; the outlier topic (-1) has no column and yields 0.0.
+    """
+    if prob is None:
+        return 0.0
+    try:
+        width = len(prob)
+    except TypeError:
+        # Scalars (Python float, NumPy scalar) have no len().
+        return float(prob)
+    if 0 <= topic_id < width:
+        return float(prob[topic_id])
+    return 0.0
+@app.post("/segment", response_model=SegmentationResponse)
+def segment(sentences: List[Sentence]):
+    """Group consecutive sentences that share a topic into segments.
+
+    Fits the module-level ``topic_model`` on the request's sentence texts,
+    then walks the sentences in order and opens a new segment whenever the
+    assigned topic id changes.
+
+    Args:
+        sentences: Request body; each item supplies ``text``, ``start`` and
+            ``end``. NOTE(review): ``end`` is not declared in the visible
+            part of ``Sentence`` -- confirm the full model defines it.
+
+    Returns:
+        A dict with a fresh ``run_id`` and the list of ``segments``. Each
+        segment's ``probability`` is that of its first sentence. An empty
+        request yields an empty segment list.
+
+    Raises:
+        HTTPException: 413 when more than ``MAX_DOCS`` sentences are sent.
+    """
+    # Guardrail: avoid oversize requests
+    if len(sentences) > MAX_DOCS:
+        raise HTTPException(
+            status_code=413,
+            detail=f"Too many sentences ({len(sentences)} > {MAX_DOCS})"
+        )
+    if not sentences:
+        # Nothing to cluster, so skip fitting altogether.
+        return {"run_id": str(uuid.uuid4()), "segments": []}
+    docs = [s.text for s in sentences]
+    # NOTE(review): this refits the shared model on every request; concurrent
+    # requests would mutate it at the same time -- confirm that is acceptable.
+    topics, probs = topic_model.fit_transform(docs)
+    if probs is None:
+        # Keep zip() below well-defined when no probabilities are returned.
+        probs = [None] * len(topics)
+    segments, cur = [], None
+    for idx, (t_id, prob) in enumerate(zip(topics, probs)):
+        t_id = int(t_id)  # plain int for comparison and serialisation
+        if cur is not None and t_id == cur["topic_id"]:
+            # Same topic as the open segment: extend it.
+            cur["end"] = sentences[idx].end
+            cur["sentences"].append(idx)
+            continue
+        if cur is not None:
+            segments.append(cur)
+        # Top-5 keywords for this topic; get_topic may return False for an
+        # unknown id, hence the ``or []``.
+        words = [w for w, _ in (topic_model.get_topic(t_id) or [])[:5]]
+        cur = dict(
+            topic_id=t_id,
+            label=" ".join(words) if t_id != -1 else None,
+            keywords=words,
+            start=sentences[idx].start,
+            end=sentences[idx].end,
+            probability=_topic_probability(prob, t_id),
+            sentences=[idx],
+        )
+    if cur is not None:
+        segments.append(cur)
+    return {"run_id": str(uuid.uuid4()), "segments": segments}
+# ---------- END app.py ----------