Yeetek committed
Commit 1570c6c · verified · 1 Parent(s): 65c2b1d

Update app.py


I’ve updated app.py to:

- Compute n_neighbors dynamically (≤ n_samples − 1); see the sketch after this list.
- Set init="random" on UMAP to skip spectral initialization (avoids the k >= N eigen error).
- Build a fresh BERTopic per request with these settings.

Redeploy, and your n8n 3-chunk payload will now go through UMAP without the sparse-eigh error.
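For intuition: UMAP's spectral initialization runs a sparse eigendecomposition that needs k < N, so a 3-document request with the default n_neighbors=15 blows up. The clamp below is a minimal sketch of the same expression the diff introduces; the helper name pick_n_neighbors is mine, not part of app.py:

def pick_n_neighbors(n_samples: int, cap: int = 15) -> int:
    # umap-learn needs n_neighbors >= 2, and it must stay below n_samples
    return min(cap, max(2, n_samples - 1))

print(pick_n_neighbors(3))    # 2  -> the 3-chunk payload stays below N
print(pick_n_neighbors(10))   # 9
print(pick_n_neighbors(500))  # 15 -> large requests keep UMAP's usual default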

Files changed (1): app.py (+15 −8)

app.py CHANGED
@@ -17,7 +17,6 @@ if "StaticEmbedding" not in models:
     sys.stdout.flush()
 # ──────────────────────────────────────────────────────────────────────────────
 
-
 # ── REST OF YOUR APP.PY ──────────────────────────────────────────────────────
 import os, uuid
 from typing import List
@@ -25,6 +24,7 @@ from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from bertopic import BERTopic
 from sentence_transformers import SentenceTransformer
+from umap import UMAP
 
 # 0) Quick env dump
 print("ENV-snapshot:", json.dumps({k: os.environ[k] for k in list(os.environ)[:10]}))
@@ -43,8 +43,6 @@ MAX_DOCS = int(os.getenv("MAX_DOCS", "5000"))
 # 3) Initialise once
 embeddings = SentenceTransformer(MODEL_NAME, cache_folder="/tmp/hfcache")
 
-# We will not build a global topic_model because we need sorting per-request
-
 # 4) Schemas
 class Sentence(BaseModel):
     text: str
@@ -82,10 +80,21 @@ def segment(sentences: List[Sentence]):
     )
     docs = [s.text for s in sorted_sent]
 
-    # Build topic model per request to preserve order mapping
-    from bertopic import BERTopic
+    # Choose dynamic n_neighbors <= n_samples-1
+    n_samples = len(docs)
+    n_neighbors = min(15, max(2, n_samples - 1))
+    # UMAP with cosine and random init to avoid spectral errors on tiny N
+    umap_model = UMAP(
+        n_neighbors=n_neighbors,
+        metric="cosine",
+        init="random",
+        random_state=42
+    )
+
+    # Build BERTopic per request with dynamic UMAP
     topic_model = BERTopic(
         embedding_model=embeddings,
+        umap_model=umap_model,
         min_topic_size=MIN_TOPIC,
         calculate_probabilities=True,
     )
@@ -103,11 +112,10 @@ def segment(sentences: List[Sentence]):
             if sorted_sent[idx].chunk_index is not None
             else idx
         )
-        # When topic changes, push previous segment
         if cur is None or t_id != cur["topic_id"]:
             if cur:
                 segments.append(cur)
-            # Top-5 keywords for this topic
+            # Top-5 keywords
             words = [w for w, _ in topic_model.get_topic(t_id)[:5]]
             cur = {
                 "topic_id": t_id,
@@ -121,7 +129,6 @@ def segment(sentences: List[Sentence]):
         else:
            cur["end"] = sorted_sent[idx].end
            cur["sentences"].append(orig_idx)
-    # Append last
     if cur:
         segments.append(cur)
 
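After redeploying, a tiny payload is enough to confirm the fix. This is a hypothetical smoke test: the POST path /segment, the port 7860, and the chunk_index/start/end fields are my assumptions read off the handler above, not guaranteed by the diff:

import requests

payload = [  # three chunks: the size that used to hit the sparse-eigh error
    {"text": "First chunk of the call.",  "chunk_index": 0, "start": 0.0,  "end": 12.5},
    {"text": "Second chunk, same topic.", "chunk_index": 1, "start": 12.5, "end": 24.0},
    {"text": "Third chunk, new topic.",   "chunk_index": 2, "start": 24.0, "end": 37.0},
]

resp = requests.post("http://localhost:7860/segment", json=payload)  # path/port assumed
resp.raise_for_status()
print(resp.json())  # expect segments with topic_id, top-5 keywords, sentence indices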