Spaces:

Yeetek
/

insightflowv2

Runtime error

App Files Community

Yeetek committed on Jun 6

Commit

a48a75f

verified ·

1 Parent(s): 0655a6c

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -5

app.py CHANGED Viewed

@@ -1,7 +1,69 @@
-from fastapi import FastAPI
-app = FastAPI()
-@app.get("/")
-def greet_json():
-    return {"Hello": "World!"}

+import os, uuid
+from typing import List
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from bertopic import BERTopic
+from sentence_transformers import SentenceTransformer
+# Runtime configuration; every value can be overridden through an environment variable.
+# Sentence-embedding model identifier handed to SentenceTransformer.
+MODEL_NAME = os.environ.get("EMBED_MODEL", "Seznam/simcse-small-e-czech")
+# Forwarded to BERTopic as ``min_topic_size``.
+MIN_TOPIC = int(os.environ.get("MIN_TOPIC_SIZE", "10"))
+# Largest number of sentences the /segment endpoint accepts in one request.
+MAX_DOCS = int(os.environ.get("MAX_DOCS", "5000"))
+# --- build the models a single time, when this module is imported ---
+# The embedding model is created first so it can be injected into BERTopic.
+embeddings = SentenceTransformer(MODEL_NAME)
+topic_model = BERTopic(
+    embedding_model=embeddings,
+    min_topic_size=MIN_TOPIC,
+    # Ask BERTopic for probabilities in addition to the topic assignments.
+    calculate_probabilities=True,
+)
+# -------- FastAPI schema ----------
+class Sentence(BaseModel):
+    # One input sentence of the request body.
+    # (Plain comments instead of a docstring so the generated schema stays unchanged.)
+
+    # Raw sentence text; this is the document given to the topic model.
+    text: str
+    # Start / end position of the sentence; copied onto the resulting segment.
+    # NOTE(review): presumably seconds within a recording - confirm with the caller.
+    start: float
+    end: float
+    # Optional speaker identifier; the segmentation endpoint does not read it.
+    speaker: None | str = None
+class Segment(BaseModel):
+    # A run of consecutive sentences that were assigned the same topic.
+    # (Plain comments instead of a docstring so the generated schema stays unchanged.)
+
+    # Topic id reported by the topic model; the endpoint treats -1 specially (no label).
+    topic_id: int
+    # Space-joined keywords, or None when topic_id is -1.
+    label: None | str
+    # Up to five top words of the topic.
+    keywords: list[str]
+    # ``start`` of the first sentence and ``end`` of the last sentence in the run.
+    start: float
+    end: float
+    # Probability taken from the first sentence of the run.
+    probability: None | float
+    # Indices into the request's sentence list, in input order.
+    sentences: list[int]
+class SegmentationResponse(BaseModel):
+    # Response body of the /segment endpoint.
+    # (Plain comments instead of a docstring so the generated schema stays unchanged.)
+
+    # Random UUID string generated for each call.
+    run_id: str
+    # Segments in the order they occur in the input.
+    segments: list[Segment]
+# ASGI application object; the route below registers itself on it.
+app = FastAPI(
+    title="CZ Topic Segmenter",
+    version="1.0",
+)
+def _assigned_probability(prob, topic_id: int) -> float:
+    """Return the probability of ``topic_id`` for one document as a plain float.
+
+    ``prob`` is one element of the probabilities returned by
+    ``topic_model.fit_transform``. Because the model is built with
+    ``calculate_probabilities=True`` that element is normally a row holding one
+    probability per topic, not a scalar, so ``float(prob or 0)`` would raise
+    ``ValueError`` (ambiguous truth value) as soon as there are two topics.
+
+    Falls back to ``0.0`` when no probability is available (``prob`` is None,
+    or the topic has no column, e.g. the outlier topic ``-1``).
+    """
+    if prob is None:
+        return 0.0
+    if getattr(prob, "ndim", 0) >= 1:
+        # Per-topic row. Assumes column i belongs to topic i and that the
+        # outlier topic has no column - TODO confirm against the BERTopic docs.
+        if 0 <= topic_id < len(prob):
+            return float(prob[topic_id])
+        return 0.0
+    return float(prob)
+
+
+@app.post("/segment", response_model=SegmentationResponse)
+def segment(sentences: List[Sentence]):
+    """Group consecutive sentences that share a topic into segments.
+
+    The sentence texts are fitted with the module-level ``topic_model``; every
+    maximal run of neighbouring sentences with the same topic id becomes one
+    segment. A segment carries up to five topic keywords, the ``start`` of its
+    first sentence, the ``end`` of its last sentence, the probability of its
+    first sentence and the indices of its sentences.
+
+    Args:
+        sentences: Request body; the sentences in their original order.
+
+    Returns:
+        A dict with a fresh ``run_id`` (UUID4 string) and the list of
+        ``segments``; the list is empty for an empty request.
+
+    Raises:
+        HTTPException: 413 when more than ``MAX_DOCS`` sentences are sent.
+    """
+    if len(sentences) > MAX_DOCS:
+        raise HTTPException(413, f"Too many sentences ({len(sentences)} > {MAX_DOCS})")
+
+    run_id = str(uuid.uuid4())
+    if not sentences:
+        # Nothing to cluster; do not hand an empty corpus to the topic model.
+        return {"run_id": run_id, "segments": []}
+
+    # NOTE(review): this refits the shared global model on every request and
+    # then reads topics back from it; concurrent requests could interfere.
+    docs = [s.text for s in sentences]
+    topics, probs = topic_model.fit_transform(docs)
+    if probs is None:
+        # Keep zip() below working when the model reports no probabilities.
+        probs = [None] * len(topics)
+
+    segments, cur = [], None
+    for idx, (t_id, prob) in enumerate(zip(topics, probs)):
+        t_id = int(t_id)  # plain int, safe for comparison and serialisation
+        sentence = sentences[idx]
+
+        if cur is not None and t_id == cur["topic_id"]:
+            # Same topic as the previous sentence: extend the open segment.
+            cur["end"] = sentence.end
+            cur["sentences"].append(idx)
+            continue
+
+        if cur is not None:
+            segments.append(cur)
+
+        # top-5 keywords for the cluster; get_topic() returns False for an
+        # unknown topic id, hence the ``or []`` guard before slicing.
+        words = [w for w, _ in (topic_model.get_topic(t_id) or [])[:5]]
+        cur = {
+            "topic_id": t_id,
+            "label": " ".join(words) if t_id != -1 else None,
+            "keywords": words,
+            "start": sentence.start,
+            "end": sentence.end,
+            "probability": _assigned_probability(prob, t_id),
+            "sentences": [idx],
+        }
+
+    if cur is not None:
+        segments.append(cur)
+    return {"run_id": run_id, "segments": segments}