Yeetek committed on
Commit
a02884f
Β·
verified Β·
1 Parent(s): 98c2919

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -81
app.py CHANGED
@@ -1,83 +1,46 @@
1
- # ── DIAGNOSTICS ─────────────────────────────────────────────
2
- import pkgutil, sentence_transformers, bertopic, sys
 
 
3
  print("ST version:", sentence_transformers.__version__)
4
  print("BERTopic version:", bertopic.__version__)
5
- print("ST models:", [m.name for m in pkgutil.iter_modules(sentence_transformers.models.__path__)])
 
6
  sys.stdout.flush()
7
 
8
- # ── STATICEMBEDDING SHIM ────────────────────────────────────
9
- if "StaticEmbedding" not in [m.name for m in pkgutil.iter_modules(sentence_transformers.models.__path__)]:
10
  from sentence_transformers.models import Transformer
11
- import bertopic.backend._sentencetransformers as st_back
12
- st_back.StaticEmbedding = Transformer
13
-
14
- # ── regular imports and monkey-patch if you choose ───────────────────────
15
- from bertopic import BERTopic
16
- # optionally:
17
- # import bertopic.backend._sentencetransformers as st_back
18
- # from sentence_transformers.models import Transformer
19
- # st_back.StaticEmbedding = Transformer
20
-
21
- from fastapi import FastAPI, HTTPException
22
-
23
- # ---------- BEGIN app.py ----------
24
- import os, sys, json, uuid, types
25
-
26
- # ── 0. Quick env print – delete later if you like ───────────────────────
27
- print("ENV-snapshot:", json.dumps(dict(list(os.environ.items())[:25])))
28
- sys.stdout.flush()
29
-
30
- # ── 1. Ensure a writable dir (good housekeeping) ────────────────────────
31
- os.environ.setdefault("NUMBA_CACHE_DIR", "/tmp/numba_cache")
32
- os.makedirs(os.environ["NUMBA_CACHE_DIR"], exist_ok=True)
33
-
34
- # ── 2. FINAL numba cache kill-switch ────────────────────────────────────
35
- try:
36
- import importlib, numba, types
37
- from numba.core import dispatcher, caching
38
- import numba.np.ufunc.ufuncbuilder as ufuncbuilder
39
 
40
- # 2-a UMAP path: no-op dispatcher method
41
- dispatcher.Dispatcher.enable_caching = lambda self: None
42
 
43
- # 2-b Build a stub that pretends to be a FunctionCache
44
- class _NoCache(types.SimpleNamespace):
45
- def __init__(self, *_, **__): pass
46
- load_overload = lambda *_, **__: False
47
- save_overload = lambda *_, **__: None
48
- enable_caching = lambda *_, **__: None
49
-
50
- # 2-c Patch *every* place that still holds a reference
51
- caching.FunctionCache = _NoCache # core path
52
- ufuncbuilder.FunctionCache = _NoCache # PyNNDescent path
53
-
54
- # 2-d Extra belt-and-braces flag
55
- os.environ["NUMBA_DISABLE_CACHE"] = "1"
56
-
57
- except ImportError:
58
- # numba isn't installed yet during first pip install – harmless
59
- pass
60
- # ─────────────────────────────────────────────────────────────────────────
61
-
62
-
63
- # ── 3. Heavy imports (UMAP, BERTopic, FastAPI, …) ───────────────────────
64
  from typing import List
65
  from fastapi import FastAPI, HTTPException
66
  from pydantic import BaseModel
67
  from bertopic import BERTopic
68
  from sentence_transformers import SentenceTransformer
69
- # ---------- the rest of your file (config, model init, endpoint) stays unchanged ----------
70
 
71
- ...
72
- # ---------- rest of the file unchanged ----------
 
73
 
 
 
 
 
74
 
75
- # ── 4. Configuration via env vars ─────────────────────────────────────────────
76
  MODEL_NAME = os.getenv("EMBED_MODEL", "Seznam/simcse-small-e-czech")
77
  MIN_TOPIC = int(os.getenv("MIN_TOPIC_SIZE", "10"))
78
  MAX_DOCS = int(os.getenv("MAX_DOCS", "5000"))
79
 
80
- # ── 5. Initialise models once at container start ─────────────────────────────
81
  embeddings = SentenceTransformer(MODEL_NAME, cache_folder="/tmp/hfcache")
82
  topic_model = BERTopic(
83
  embedding_model=embeddings,
@@ -85,7 +48,7 @@ topic_model = BERTopic(
85
  calculate_probabilities=True,
86
  )
87
 
88
- # ── 6. Pydantic schemas ──────────────────────────────────────────────────────
89
  class Sentence(BaseModel):
90
  text: str
91
  start: float
@@ -105,17 +68,13 @@ class SegmentationResponse(BaseModel):
105
  run_id: str
106
  segments: List[Segment]
107
 
108
- # ── 7. FastAPI app and endpoint ──────────────────────────────────────────────
109
  app = FastAPI(title="CZ Topic Segmenter", version="1.0")
110
 
111
  @app.post("/segment", response_model=SegmentationResponse)
112
  def segment(sentences: List[Sentence]):
113
- # Guardrail: avoid oversize requests
114
  if len(sentences) > MAX_DOCS:
115
- raise HTTPException(
116
- status_code=413,
117
- detail=f"Too many sentences ({len(sentences)} > {MAX_DOCS})"
118
- )
119
 
120
  docs = [s.text for s in sentences]
121
  topics, probs = topic_model.fit_transform(docs)
@@ -125,25 +84,21 @@ def segment(sentences: List[Sentence]):
125
  if cur is None or t_id != cur["topic_id"]:
126
  if cur:
127
  segments.append(cur)
128
-
129
- # Top-5 keywords for this topic
130
- words = [w for w, _ in topic_model.get_topic(t_id)[:5]]
131
-
132
  cur = dict(
133
- topic_id=t_id,
134
- label=" ".join(words) if t_id != -1 else None, # βœ“ fixed β€˜=’
135
- keywords=words,
136
- start=sentences[idx].start,
137
- end=sentences[idx].end,
138
- probability=float(prob or 0),
139
- sentences=[idx],
140
  )
141
  else:
142
  cur["end"] = sentences[idx].end
143
  cur["sentences"].append(idx)
144
-
145
  if cur:
146
  segments.append(cur)
147
 
148
  return {"run_id": str(uuid.uuid4()), "segments": segments}
149
- # ---------- END app.py ----------
 
1
# ── DIAGNOSTICS & SHIM (must come before any BERTopic import) ─────────────
import pkgutil
import sys
import json

import sentence_transformers
import bertopic

# 1) Report library versions and the embedding-model modules that ship with
#    this sentence-transformers install, so container logs show exactly what
#    is deployed.
print("ST version:", sentence_transformers.__version__)
print("BERTopic version:", bertopic.__version__)
models = [mod.name for mod in pkgutil.iter_modules(sentence_transformers.models.__path__)]
print("ST models:", models)
sys.stdout.flush()

# 2) Older sentence-transformers releases have no StaticEmbedding model
#    module; alias the plain Transformer class under that name so code that
#    looks it up on sentence_transformers.models still resolves.
if "StaticEmbedding" not in models:
    from sentence_transformers.models import Transformer
    import sentence_transformers.models as _st_mod
    setattr(_st_mod, "StaticEmbedding", Transformer)
    print("🔧 Shim applied: StaticEmbedding → Transformer")
    sys.stdout.flush()
# ──────────────────────────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
 
 
20
 
21
# ── REST OF YOUR APP.PY ──────────────────────────────────────────────────────
# Stdlib (json/sys are imported here explicitly — this section uses them and
# must not depend on the diagnostics block above having run first).
import json
import os
import sys
import uuid

from typing import List

# Third-party
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# 0) Quick env dump — first 10 variables only, for startup debugging.
# NOTE(review): environment variables can hold secrets (tokens, keys);
# consider deleting this print before production use.
print("ENV-snapshot:", json.dumps({k: os.environ[k] for k in list(os.environ)[:10]}))
sys.stdout.flush()

# 1) Tidy numba cache: point numba at a writable directory (container app
#    dirs are often read-only) and disable on-disk caching outright.
# NOTE(review): bertopic is already imported above, so numba may be loaded
# before these env vars are set — confirm they still take effect.
os.environ.setdefault("NUMBA_CACHE_DIR", "/tmp/numba_cache")
os.makedirs(os.environ["NUMBA_CACHE_DIR"], exist_ok=True)
os.environ["NUMBA_DISABLE_CACHE"] = "1"

# 2) Config from ENV
MODEL_NAME = os.getenv("EMBED_MODEL", "Seznam/simcse-small-e-czech")  # embedding model id
MIN_TOPIC = int(os.getenv("MIN_TOPIC_SIZE", "10"))   # BERTopic min cluster size
MAX_DOCS = int(os.getenv("MAX_DOCS", "5000"))        # request-size guardrail

# 3) Initialise once at container start so every request reuses the model.
embeddings = SentenceTransformer(MODEL_NAME, cache_folder="/tmp/hfcache")
45
  topic_model = BERTopic(
46
  embedding_model=embeddings,
 
48
  calculate_probabilities=True,
49
  )
50
 
51
+ # 4) Schemas
52
  class Sentence(BaseModel):
53
  text: str
54
  start: float
 
68
  run_id: str
69
  segments: List[Segment]
70
 
# 5) FastAPI
# Application object served by the ASGI server; the /segment endpoint is
# registered on it below.
app = FastAPI(title="CZ Topic Segmenter", version="1.0")
74
  @app.post("/segment", response_model=SegmentationResponse)
75
  def segment(sentences: List[Sentence]):
 
76
  if len(sentences) > MAX_DOCS:
77
+ raise HTTPException(413, f"Too many sentences ({len(sentences)} > {MAX_DOCS})")
 
 
 
78
 
79
  docs = [s.text for s in sentences]
80
  topics, probs = topic_model.fit_transform(docs)
 
84
  if cur is None or t_id != cur["topic_id"]:
85
  if cur:
86
  segments.append(cur)
87
+ words = [w for w,_ in topic_model.get_topic(t_id)[:5]]
 
 
 
88
  cur = dict(
89
+ topic_id = t_id,
90
+ label = None if t_id == -1 else " ".join(words),
91
+ keywords = words,
92
+ start = sentences[idx].start,
93
+ end = sentences[idx].end,
94
+ probability= float(prob or 0),
95
+ sentences = [idx],
96
  )
97
  else:
98
  cur["end"] = sentences[idx].end
99
  cur["sentences"].append(idx)
 
100
  if cur:
101
  segments.append(cur)
102
 
103
  return {"run_id": str(uuid.uuid4()), "segments": segments}
104
+ # ──────────────────────────────────────────────────────────────────────────────