Yeetek committed
Commit 1570c6c · verified · 1 Parent(s): 65c2b1d

Update app.py


I’ve updated app.py to:

- Compute n_neighbors dynamically (≤ n_samples − 1); see the sketch after this list.
- Set init="random" on UMAP to skip spectral initialization (avoids the k >= N eigen error).
- Build a fresh BERTopic per request with these settings.

Redeploy, and your n8n 3-chunk payload will now go through UMAP without the sparse-eigh error.
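For intuition: UMAP's spectral initialization runs a sparse eigendecomposition that needs k < N, so a 3-document request with the default n_neighbors=15 blows up. The clamp below is a minimal sketch of the same expression the diff introduces; the helper name pick_n_neighbors is mine, not part of app.py:

def pick_n_neighbors(n_samples: int, cap: int = 15) -> int:
    # umap-learn needs n_neighbors >= 2, and it must stay below n_samples
    return min(cap, max(2, n_samples - 1))

print(pick_n_neighbors(3))    # 2  -> the 3-chunk payload stays below N
print(pick_n_neighbors(10))   # 9
print(pick_n_neighbors(500))  # 15 -> large requests keep UMAP's usual default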

Files changed (1): app.py (+15 −8)

app.py CHANGED
@@ -17,7 +17,6 @@ if "StaticEmbedding" not in models:
     sys.stdout.flush()
 # ──────────────────────────────────────────────────────────────────────────────
 
-
 # ── REST OF YOUR APP.PY ──────────────────────────────────────────────────────
 import os, uuid
 from typing import List
@@ -25,6 +24,7 @@ from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from bertopic import BERTopic
 from sentence_transformers import SentenceTransformer
+from umap import UMAP
 
 # 0) Quick env dump
 print("ENV-snapshot:", json.dumps({k: os.environ[k] for k in list(os.environ)[:10]}))
@@ -43,8 +43,6 @@ MAX_DOCS = int(os.getenv("MAX_DOCS", "5000"))
 # 3) Initialise once
 embeddings = SentenceTransformer(MODEL_NAME, cache_folder="/tmp/hfcache")
 
-# We will not build a global topic_model because we need sorting per-request
-
 # 4) Schemas
 class Sentence(BaseModel):
     text: str
@@ -82,10 +80,21 @@ def segment(sentences: List[Sentence]):
     )
     docs = [s.text for s in sorted_sent]
 
-    # Build topic model per request to preserve order mapping
-    from bertopic import BERTopic
+    # Choose dynamic n_neighbors <= n_samples-1
+    n_samples = len(docs)
+    n_neighbors = min(15, max(2, n_samples - 1))
+    # UMAP with cosine and random init to avoid spectral errors on tiny N
+    umap_model = UMAP(
+        n_neighbors=n_neighbors,
+        metric="cosine",
+        init="random",
+        random_state=42
+    )
+
+    # Build BERTopic per request with dynamic UMAP
     topic_model = BERTopic(
         embedding_model=embeddings,
+        umap_model=umap_model,
         min_topic_size=MIN_TOPIC,
         calculate_probabilities=True,
     )
@@ -103,11 +112,10 @@ def segment(sentences: List[Sentence]):
             if sorted_sent[idx].chunk_index is not None
             else idx
         )
-        # When topic changes, push previous segment
         if cur is None or t_id != cur["topic_id"]:
             if cur:
                 segments.append(cur)
-            # Top-5 keywords for this topic
+            # Top-5 keywords
             words = [w for w, _ in topic_model.get_topic(t_id)[:5]]
             cur = {
                 "topic_id": t_id,
@@ -121,7 +129,6 @@ def segment(sentences: List[Sentence]):
         else:
            cur["end"] = sorted_sent[idx].end
            cur["sentences"].append(orig_idx)
-    # Append last
     if cur:
         segments.append(cur)
 
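After redeploying, a tiny payload is enough to confirm the fix. This is a hypothetical smoke test: the POST path /segment, the port 7860, and the chunk_index/start/end fields are my assumptions read off the handler above, not guaranteed by the diff:

import requests

payload = [  # three chunks: the size that used to hit the sparse-eigh error
    {"text": "First chunk of the call.",  "chunk_index": 0, "start": 0.0,  "end": 12.5},
    {"text": "Second chunk, same topic.", "chunk_index": 1, "start": 12.5, "end": 24.0},
    {"text": "Third chunk, new topic.",   "chunk_index": 2, "start": 24.0, "end": 37.0},
]

resp = requests.post("http://localhost:7860/segment", json=payload)  # path/port assumed
resp.raise_for_status()
print(resp.json())  # expect segments with topic_id, top-5 keywords, sentence indices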