Yeetek committed on
Commit
0d226e8
Β·
verified Β·
1 Parent(s): a290aa5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -0
app.py CHANGED
@@ -41,3 +41,82 @@ from bertopic import BERTopic
41
  from sentence_transformers import SentenceTransformer
42
  # ---------- the rest of your file (config, model init, endpoint) stays unchanged ----------
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  from sentence_transformers import SentenceTransformer
42
  # ---------- the rest of your file (config, model init, endpoint) stays unchanged ----------
43
 
44
+ ...
45
+ # ---------- rest of the file unchanged ----------
46
+
47
+
48
# ── 4. Configuration via env vars ─────────────────────────────────────────────
# Sentence-embedding model id; defaults to a small Czech SimCSE model.
MODEL_NAME = os.getenv("EMBED_MODEL", "Seznam/simcse-small-e-czech")
# Minimum documents needed to form a topic (passed to BERTopic's min_topic_size).
MIN_TOPIC = int(os.getenv("MIN_TOPIC_SIZE", "10"))
# Hard cap on sentences accepted by one /segment request (guards request size).
MAX_DOCS = int(os.getenv("MAX_DOCS", "5000"))
52
+
53
# ── 5. Initialise models once at container start ─────────────────────────────
# Loading at import time means weights are read once per process, not per request.
embeddings = SentenceTransformer(MODEL_NAME)
topic_model = BERTopic(
    embedding_model=embeddings,      # reuse the shared sentence encoder
    min_topic_size=MIN_TOPIC,
    calculate_probabilities=True,    # per-document probabilities used by /segment
)
60
+
61
+ # ── 6. Pydantic schemas ──────────────────────────────────────────────────────
62
class Sentence(BaseModel):
    """One input sentence of the request payload."""
    text: str
    start: float                 # start offset — presumably seconds; confirm with caller
    end: float                   # end offset — presumably seconds; confirm with caller
    speaker: str | None = None   # optional speaker label
67
+
68
class Segment(BaseModel):
    """One contiguous run of sentences assigned the same topic."""
    topic_id: int                # BERTopic topic id; -1 denotes the outlier topic
    label: str | None            # space-joined top keywords; None for the outlier topic
    keywords: List[str]
    start: float                 # start of the first sentence in the segment
    end: float                   # end of the last sentence in the segment
    probability: float | None
    sentences: List[int]         # indices into the request's sentence list
76
+
77
class SegmentationResponse(BaseModel):
    """Response envelope: a unique run id plus the ordered segments."""
    run_id: str
    segments: List[Segment]
80
+
81
# ── 7. FastAPI app and endpoint ──────────────────────────────────────────────
# Single-purpose service; the only route is POST /segment (defined below).
app = FastAPI(title="CZ Topic Segmenter", version="1.0")
83
+
84
def _to_probability(prob) -> float:
    """Collapse one BERTopic probability entry to a single float.

    With ``calculate_probabilities=True`` BERTopic yields a per-document
    *array* of topic probabilities; ``float(prob or 0)`` raised
    "truth value of an array is ambiguous" on such input.  We report the
    maximum (the assigned topic's probability); scalars pass through.
    """
    if prob is None:
        return 0.0
    try:
        return float(prob)           # plain scalar (or size-1 array)
    except TypeError:
        return float(max(prob))      # array of per-topic probabilities


@app.post("/segment", response_model=SegmentationResponse)
def segment(sentences: List[Sentence]):
    """Cluster the sentences into topics and return contiguous topic segments.

    Consecutive sentences sharing a topic id are merged into one Segment.
    Raises HTTP 413 when more than MAX_DOCS sentences are submitted.

    NOTE(review): fit_transform() re-fits the shared global `topic_model` on
    every request — results are request-dependent and this is not safe under
    concurrent requests; consider a per-request model or a lock.
    """
    # Guardrail: avoid oversize requests
    if len(sentences) > MAX_DOCS:
        raise HTTPException(
            status_code=413,
            detail=f"Too many sentences ({len(sentences)} > {MAX_DOCS})"
        )

    docs = [s.text for s in sentences]
    topics, probs = topic_model.fit_transform(docs)

    segments, cur = [], None
    for idx, (t_id, prob) in enumerate(zip(topics, probs)):
        if cur is None or t_id != cur["topic_id"]:
            # Topic changed: flush the previous segment and start a new one.
            if cur:
                segments.append(cur)

            # Top-5 keywords for this topic.  get_topic() returns False for an
            # unknown id, which would crash the slice — fall back to no words.
            topic_words = topic_model.get_topic(t_id) or []
            words = [w for w, _ in topic_words[:5]]

            cur = dict(
                topic_id=t_id,
                label=" ".join(words) if t_id != -1 else None,  # -1 = outlier topic
                keywords=words,
                start=sentences[idx].start,
                end=sentences[idx].end,
                probability=_to_probability(prob),
                sentences=[idx],
            )
        else:
            # Same topic: extend the current segment.
            cur["end"] = sentences[idx].end
            cur["sentences"].append(idx)

    if cur:
        segments.append(cur)

    return {"run_id": str(uuid.uuid4()), "segments": segments}
122
+ # ---------- END app.py ----------