Yeetek committed
Commit 6948a45 · verified · 1 Parent(s): 1570c6c

Update app.py

Files changed (1)
  1. app.py +18 -11
app.py CHANGED

@@ -25,6 +25,7 @@ from pydantic import BaseModel
 from bertopic import BERTopic
 from sentence_transformers import SentenceTransformer
 from umap import UMAP
+from hdbscan import HDBSCAN
 
 # 0) Quick env dump
 print("ENV-snapshot:", json.dumps({k: os.environ[k] for k in list(os.environ)[:10]}))
@@ -40,8 +41,8 @@ MODEL_NAME = os.getenv("EMBED_MODEL", "Seznam/simcse-small-e-czech")
 MIN_TOPIC = int(os.getenv("MIN_TOPIC_SIZE", "10"))
 MAX_DOCS = int(os.getenv("MAX_DOCS", "5000"))
 
-# 3) Initialise once
-embeddings = SentenceTransformer(MODEL_NAME, cache_folder="/tmp/hfcache")
+# 3) Initialise embeddings once
+embeddings = SentenceTransformer(MODEL_NAME, cache_folder="/tmp/hfcache")
 
 # 4) Schemas
 class Sentence(BaseModel):
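Note (not part of the diff): the model name and thresholds above are read from environment variables, so a deployment can change them without touching the code. A small illustration of overriding the defaults before the module is imported; the override values here are assumptions, not from the commit:

# Illustration only: override the env-driven defaults before importing app
import os
os.environ["EMBED_MODEL"] = "Seznam/simcse-small-e-czech"  # the default shown in the diff
os.environ["MIN_TOPIC_SIZE"] = "5"
os.environ["MAX_DOCS"] = "2000"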
@@ -69,21 +70,20 @@ app = FastAPI(title="CZ Topic Segmenter", version="1.0")
 
 @app.post("/segment", response_model=SegmentationResponse)
 def segment(sentences: List[Sentence]):
-    # Guardrail: avoid oversize requests
+    # Guardrail
     if len(sentences) > MAX_DOCS:
         raise HTTPException(413, f"Too many sentences ({len(sentences)} > {MAX_DOCS})")
 
-    # Sort by chunk_index if available, else maintain original order
+    # Sort by chunk_index if present
     sorted_sent = sorted(
         sentences,
         key=lambda s: s.chunk_index if s.chunk_index is not None else 0
     )
     docs = [s.text for s in sorted_sent]
 
-    # Choose dynamic n_neighbors <= n_samples-1
+    # UMAP with cosine, init random, dynamic neighbors
     n_samples = len(docs)
     n_neighbors = min(15, max(2, n_samples - 1))
-    # UMAP with cosine and random init to avoid spectral errors on tiny N
     umap_model = UMAP(
         n_neighbors=n_neighbors,
         metric="cosine",
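Note (not part of the diff): the clamp above caps n_neighbors at 15 and scales it down with the batch size so UMAP still runs on very small requests. A quick illustration of how the same expression evaluates:

# Illustration only; mirrors n_neighbors = min(15, max(2, n_samples - 1))
for n_samples in (2, 3, 10, 16, 500):
    print(n_samples, "->", min(15, max(2, n_samples - 1)))
# 2 -> 2, 3 -> 2, 10 -> 9, 16 -> 15, 500 -> 15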
@@ -91,12 +91,21 @@ def segment(sentences: List[Sentence]):
         random_state=42
     )
 
-    # Build BERTopic per request with dynamic UMAP
+    # HDBSCAN with dynamic cluster sizes
+    cluster_size = min(MIN_TOPIC, n_samples) if n_samples >= 2 else 2
+    hdbscan_model = HDBSCAN(
+        min_cluster_size=cluster_size,
+        min_samples=min(cluster_size, n_samples),
+        metric="euclidean",
+        cluster_selection_method="eom"
+    )
+
+    # Build BERTopic per request
     topic_model = BERTopic(
         embedding_model=embeddings,
         umap_model=umap_model,
-        min_topic_size=MIN_TOPIC,
-        calculate_probabilities=True,
+        hdbscan_model=hdbscan_model,
+        calculate_probabilities=True
     )
 
     # Fit-transform
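Note (not part of the diff): the new HDBSCAN block above clamps min_cluster_size and min_samples to the batch size, so a small request cannot ask for clusters larger than the data. A short sketch of the resulting values, assuming the default MIN_TOPIC_SIZE of 10 from the config above:

# Illustration only; mirrors the cluster_size / min_samples logic in the hunk above
MIN_TOPIC = 10
for n_samples in (1, 2, 5, 10, 50):
    cluster_size = min(MIN_TOPIC, n_samples) if n_samples >= 2 else 2
    print(n_samples, "->", cluster_size, min(cluster_size, n_samples))
# 1 -> 2 1, 2 -> 2 2, 5 -> 5 5, 10 -> 10 10, 50 -> 10 10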
@@ -106,7 +115,6 @@ def segment(sentences: List[Sentence]):
     segments = []
     cur = None
     for idx, (t_id, prob) in enumerate(zip(topics, probs)):
-        # Map back to original chunk_index or positional idx
         orig_idx = (
             sorted_sent[idx].chunk_index
             if sorted_sent[idx].chunk_index is not None
@@ -115,7 +123,6 @@ def segment(sentences: List[Sentence]):
         if cur is None or t_id != cur["topic_id"]:
             if cur:
                 segments.append(cur)
-            # Top-5 keywords
             words = [w for w, _ in topic_model.get_topic(t_id)[:5]]
             cur = {
                 "topic_id": t_id,
 
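Note (not part of the diff): the loop in the last two hunks walks the topic assignments in order, starts a new segment whenever the topic id changes, and attaches the topic's top-5 keywords to each segment. A minimal, self-contained sketch of the same consecutive-grouping idea on hypothetical data:

# Illustration only: group adjacent items that share a topic id
from itertools import groupby

topics = [0, 0, 1, 1, 1, 0]
docs = ["a", "b", "c", "d", "e", "f"]
segments, pos = [], 0
for t_id, grp in groupby(topics):
    size = len(list(grp))
    segments.append({"topic_id": t_id, "sentences": docs[pos:pos + size]})
    pos += size
print(segments)  # three segments: [0: a,b], [1: c,d,e], [0: f]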
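Note (not part of the diff): a minimal sketch of calling the updated endpoint. Only the text and optional chunk_index fields come from the Sentence schema shown in the diff; the host, port, and sample sentences are assumptions, and the response shape is whatever SegmentationResponse defines in app.py.

# Illustration only: POST a small batch of sentences to /segment
import requests  # assumed to be available on the client side

payload = [
    {"text": "Dobrý den, dnes probereme rozpočet.", "chunk_index": 0},
    {"text": "Nejprve se podíváme na příjmy.", "chunk_index": 1},
    {"text": "Nyní přejdeme k plánu na příští rok.", "chunk_index": 2},
]
resp = requests.post("http://localhost:7860/segment", json=payload)  # port 7860 is an assumption
resp.raise_for_status()
print(resp.json())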