Yeetek committed
Commit 7b5322f · verified · 1 Parent(s): efa5b1a

Update app.py

Files changed (1):
  1. app.py +64 -8
app.py CHANGED
@@ -1,25 +1,29 @@
-# ---------- BEGIN app.py (diagnostic build) ----------
+# ---------- BEGIN app.py ----------
 import os, sys, json, uuid
 
-# DEBUG – capture the first 20 env-vars Hugging Face passes in
+# ── 1. Diagnostics ────────────────────────────────────────────────────────────
+# Print the first 20 environment variables to the Space logs (delete later)
 print("ENV-snapshot:", json.dumps(dict(list(os.environ.items())[:20])))
-sys.stdout.flush()  # make sure it appears in HF build logs
+sys.stdout.flush()
 
-# Optional: leave the numba lines in place for the real fix
-os.environ["NUMBA_DISABLE_CACHE"] = "1"
-os.environ.setdefault("NUMBA_CACHE_DIR", "/tmp/numba")
+# ── 2. Numba cache workaround (must run BEFORE bertopic import) ───────────────
+os.environ["NUMBA_DISABLE_CACHE"] = "1"                 # hard off-switch
+os.environ.setdefault("NUMBA_CACHE_DIR", "/tmp/numba")  # fallback dir
 os.makedirs(os.environ["NUMBA_CACHE_DIR"], exist_ok=True)
 
+# ── 3. Heavy imports ──────────────────────────────────────────────────────────
 from typing import List
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from bertopic import BERTopic
 from sentence_transformers import SentenceTransformer
 
+# ── 4. Configuration via env vars ─────────────────────────────────────────────
 MODEL_NAME = os.getenv("EMBED_MODEL", "Seznam/simcse-small-e-czech")
 MIN_TOPIC = int(os.getenv("MIN_TOPIC_SIZE", "10"))
 MAX_DOCS = int(os.getenv("MAX_DOCS", "5000"))
 
+# ── 5. Initialise models once at container start ─────────────────────────────
 embeddings = SentenceTransformer(MODEL_NAME)
 topic_model = BERTopic(
     embedding_model=embeddings,
@@ -27,7 +31,7 @@ topic_model = BERTopic(
     calculate_probabilities=True,
 )
 
-# ----- FastAPI schema & endpoint (unchanged) -----
+# ── 6. Pydantic schemas ──────────────────────────────────────────────────────
 class Sentence(BaseModel):
     text: str
     start: float
@@ -36,4 +40,56 @@ class Sentence(BaseModel):
 
 class Segment(BaseModel):
     topic_id: int
-    label:
+    label: str | None
+    keywords: List[str]
+    start: float
+    end: float
+    probability: float | None
+    sentences: List[int]
+
+class SegmentationResponse(BaseModel):
+    run_id: str
+    segments: List[Segment]
+
+# ── 7. FastAPI app and endpoint ──────────────────────────────────────────────
+app = FastAPI(title="CZ Topic Segmenter", version="1.0")
+
+@app.post("/segment", response_model=SegmentationResponse)
+def segment(sentences: List[Sentence]):
+    # Guardrail: avoid oversize requests
+    if len(sentences) > MAX_DOCS:
+        raise HTTPException(
+            status_code=413,
+            detail=f"Too many sentences ({len(sentences)} > {MAX_DOCS})"
+        )
+
+    docs = [s.text for s in sentences]
+    topics, probs = topic_model.fit_transform(docs)
+
+    segments, cur = [], None
+    for idx, (t_id, prob) in enumerate(zip(topics, probs)):
+        if cur is None or t_id != cur["topic_id"]:
+            if cur:
+                segments.append(cur)
+
+            # Top-5 keywords for this topic
+            words = [w for w, _ in topic_model.get_topic(t_id)[:5]]
+
+            cur = dict(
+                topic_id=t_id,
+                label=" ".join(words) if t_id != -1 else None,  # ✓ fixed '='
+                keywords=words,
+                start=sentences[idx].start,
+                end=sentences[idx].end,
+                # prob is a per-topic vector when calculate_probabilities=True,
+                # so take its max rather than truth-testing the array
+                probability=float(prob.max()) if prob is not None else None,
+                sentences=[idx],
+            )
+        else:
+            cur["end"] = sentences[idx].end
+            cur["sentences"].append(idx)
+
+    if cur:
+        segments.append(cur)
+
+    return {"run_id": str(uuid.uuid4()), "segments": segments}
+# ---------- END app.py ----------
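
For reviewers skimming the diff: the core of segment() is the merge rule that collapses consecutive sentences sharing a topic id into one segment. A minimal standalone sketch of just that rule, with made-up topic ids and the keyword/timing fields left out:

# Hypothetical per-sentence topic ids (as fit_transform would return them)
topics = [0, 0, 1, 1, 1, -1]

segments, cur = [], None
for idx, t_id in enumerate(topics):
    if cur is None or t_id != cur["topic_id"]:
        if cur:
            segments.append(cur)          # close the previous segment
        cur = {"topic_id": t_id, "sentences": [idx]}
    else:
        cur["sentences"].append(idx)      # same topic: extend current segment
if cur:
    segments.append(cur)                  # flush the final segment

print(segments)
# [{'topic_id': 0, 'sentences': [0, 1]},
#  {'topic_id': 1, 'sentences': [2, 3, 4]},
#  {'topic_id': -1, 'sentences': [5]}]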
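A quick way to smoke-test the Space after this commit is a POST to /segment. Everything below is a placeholder sketch (the URL, port, and generated Czech sentences are invented); note that the endpoint fits a BERTopic model per request, so a very small payload may fail before any topics are found:

import requests

# Hypothetical transcript: (text, start, end) triples. BERTopic's UMAP/HDBSCAN
# stack generally needs dozens of documents before it can fit at all.
transcript = [(f"Věta číslo {i} o cenách energií.", float(i), float(i + 1))
              for i in range(80)]
payload = [{"text": t, "start": s, "end": e} for t, s, e in transcript]

resp = requests.post("http://localhost:7860/segment", json=payload, timeout=300)
resp.raise_for_status()
data = resp.json()
print(data["run_id"], len(data["segments"]))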