Yeetek committed on
Commit
0d226e8
Β·
verified Β·
1 Parent(s): a290aa5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -0
app.py CHANGED
@@ -41,3 +41,82 @@ from bertopic import BERTopic
41
  from sentence_transformers import SentenceTransformer
42
  # ---------- the rest of your file (config, model init, endpoint) stays unchanged ----------
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  from sentence_transformers import SentenceTransformer
42
  # ---------- the rest of your file (config, model init, endpoint) stays unchanged ----------
43
 
44
+ ...
45
+ # ---------- rest of the file unchanged ----------
46
+
47
+
48
# ── 4. Configuration via env vars ─────────────────────────────────────────────
# Sentence-embedding model id; defaults to a small Czech SimCSE model.
MODEL_NAME = os.getenv("EMBED_MODEL", "Seznam/simcse-small-e-czech")
# Minimum documents needed to form a topic (passed to BERTopic's min_topic_size).
MIN_TOPIC = int(os.getenv("MIN_TOPIC_SIZE", "10"))
# Hard cap on sentences accepted by one /segment request (guards request size).
MAX_DOCS = int(os.getenv("MAX_DOCS", "5000"))
52
+
53
# ── 5. Initialise models once at container start ─────────────────────────────
# Loading at import time means weights are read once per process, not per request.
embeddings = SentenceTransformer(MODEL_NAME)
topic_model = BERTopic(
    embedding_model=embeddings,      # reuse the shared sentence encoder
    min_topic_size=MIN_TOPIC,
    calculate_probabilities=True,    # per-document probabilities used by /segment
)
60
+
61
+ # ── 6. Pydantic schemas ──────────────────────────────────────────────────────
62
class Sentence(BaseModel):
    """One input sentence of the request payload."""
    text: str
    start: float                 # start offset — presumably seconds; confirm with caller
    end: float                   # end offset — presumably seconds; confirm with caller
    speaker: str | None = None   # optional speaker label
67
+
68
class Segment(BaseModel):
    """One contiguous run of sentences assigned the same topic."""
    topic_id: int                # BERTopic topic id; -1 denotes the outlier topic
    label: str | None            # space-joined top keywords; None for the outlier topic
    keywords: List[str]
    start: float                 # start of the first sentence in the segment
    end: float                   # end of the last sentence in the segment
    probability: float | None
    sentences: List[int]         # indices into the request's sentence list
76
+
77
class SegmentationResponse(BaseModel):
    """Response envelope: a unique run id plus the ordered segments."""
    run_id: str
    segments: List[Segment]
80
+
81
# ── 7. FastAPI app and endpoint ──────────────────────────────────────────────
# Single-purpose service; the only route is POST /segment (defined below).
app = FastAPI(title="CZ Topic Segmenter", version="1.0")
83
+
84
def _to_probability(prob) -> float:
    """Collapse one BERTopic probability entry to a single float.

    With ``calculate_probabilities=True`` BERTopic yields a per-document
    *array* of topic probabilities; ``float(prob or 0)`` raised
    "truth value of an array is ambiguous" on such input.  We report the
    maximum (the assigned topic's probability); scalars pass through.
    """
    if prob is None:
        return 0.0
    try:
        return float(prob)           # plain scalar (or size-1 array)
    except TypeError:
        return float(max(prob))      # array of per-topic probabilities


@app.post("/segment", response_model=SegmentationResponse)
def segment(sentences: List[Sentence]):
    """Cluster the sentences into topics and return contiguous topic segments.

    Consecutive sentences sharing a topic id are merged into one Segment.
    Raises HTTP 413 when more than MAX_DOCS sentences are submitted.

    NOTE(review): fit_transform() re-fits the shared global `topic_model` on
    every request — results are request-dependent and this is not safe under
    concurrent requests; consider a per-request model or a lock.
    """
    # Guardrail: avoid oversize requests
    if len(sentences) > MAX_DOCS:
        raise HTTPException(
            status_code=413,
            detail=f"Too many sentences ({len(sentences)} > {MAX_DOCS})"
        )

    docs = [s.text for s in sentences]
    topics, probs = topic_model.fit_transform(docs)

    segments, cur = [], None
    for idx, (t_id, prob) in enumerate(zip(topics, probs)):
        if cur is None or t_id != cur["topic_id"]:
            # Topic changed: flush the previous segment and start a new one.
            if cur:
                segments.append(cur)

            # Top-5 keywords for this topic.  get_topic() returns False for an
            # unknown id, which would crash the slice — fall back to no words.
            topic_words = topic_model.get_topic(t_id) or []
            words = [w for w, _ in topic_words[:5]]

            cur = dict(
                topic_id=t_id,
                label=" ".join(words) if t_id != -1 else None,  # -1 = outlier topic
                keywords=words,
                start=sentences[idx].start,
                end=sentences[idx].end,
                probability=_to_probability(prob),
                sentences=[idx],
            )
        else:
            # Same topic: extend the current segment.
            cur["end"] = sentences[idx].end
            cur["sentences"].append(idx)

    if cur:
        segments.append(cur)

    return {"run_id": str(uuid.uuid4()), "segments": segments}
122
+ # ---------- END app.py ----------