Yeetek committed on
Commit 65c2b1d · verified · 1 Parent(s): 8fc16d0

Update app.py

Files changed (1)
app.py +46 -32
app.py CHANGED
@@ -42,11 +42,8 @@ MAX_DOCS = int(os.getenv("MAX_DOCS", "5000"))
 
 # 3) Initialise once
 embeddings = SentenceTransformer(MODEL_NAME, cache_folder="/tmp/hfcache")
-topic_model = BERTopic(
-    embedding_model=embeddings,
-    min_topic_size=MIN_TOPIC,
-    calculate_probabilities=True,
-)
+
+# We will not build a global topic_model because we need sorting per-request
 
 # 4) Schemas
 class Sentence(BaseModel):
@@ -74,40 +71,57 @@ app = FastAPI(title="CZ Topic Segmenter", version="1.0")
 
 @app.post("/segment", response_model=SegmentationResponse)
 def segment(sentences: List[Sentence]):
+    # Guardrail: avoid oversize requests
     if len(sentences) > MAX_DOCS:
         raise HTTPException(413, f"Too many sentences ({len(sentences)} > {MAX_DOCS})")
 
-    # If chunk_index is present, use it to preserve original ordering
-    sorted_sent = sorted(
-        sentences,
-        key=lambda s: s.chunk_index if s.chunk_index is not None else 0
-    )
-    docs = [s.text for s in sorted_sent]
+    # Sort by chunk_index if available, else maintain original order
+    sorted_sent = sorted(
+        sentences,
+        key=lambda s: s.chunk_index if s.chunk_index is not None else 0
+    )
+    docs = [s.text for s in sorted_sent]
+
+    # Build topic model per request to preserve order mapping
+    from bertopic import BERTopic
+    topic_model = BERTopic(
+        embedding_model=embeddings,
+        min_topic_size=MIN_TOPIC,
+        calculate_probabilities=True,
+    )
+
+    # Fit-transform
     topics, probs = topic_model.fit_transform(docs)
 
-    segments, cur = [], None
+    # Assemble segments
+    segments = []
+    cur = None
     for idx, (t_id, prob) in enumerate(zip(topics, probs)):
-        # compute the original index
-        orig_idx = (
-            sorted_sent[idx].chunk_index
-            if sorted_sent[idx].chunk_index is not None
-            else idx
-        )
-
-        if cur is None or t_id != cur["topic_id"]:
-            # …
-            cur = dict(
-                topic_id=t_id,
-                label=" ".join(words) if t_id != -1 else None,
-                keywords=words,
-                start=sorted_sent[idx].start,
-                end=sorted_sent[idx].end,
-                probability=float(prob or 0),
-                sentences=[orig_idx],  # use chunk_index here
+        # Map back to original chunk_index or positional idx
+        orig_idx = (
+            sorted_sent[idx].chunk_index
+            if sorted_sent[idx].chunk_index is not None
+            else idx
         )
-        else:
-            cur["end"] = sorted_sent[idx].end
-            cur["sentences"].append(orig_idx)
+        # When topic changes, push previous segment
+        if cur is None or t_id != cur["topic_id"]:
+            if cur:
+                segments.append(cur)
+            # Top-5 keywords for this topic
+            words = [w for w, _ in topic_model.get_topic(t_id)[:5]]
+            cur = {
+                "topic_id": t_id,
+                "label": " ".join(words) if t_id != -1 else None,
+                "keywords": words,
+                "start": sorted_sent[idx].start,
+                "end": sorted_sent[idx].end,
+                "probability": float(prob or 0),
+                "sentences": [orig_idx],
+            }
+        else:
+            cur["end"] = sorted_sent[idx].end
+            cur["sentences"].append(orig_idx)
+    # Append last
     if cur:
         segments.append(cur)
 
127