Commit 47b2637 · 1 Parent(s): 55d889f
Alex Godelashvili committed

updated streamlit
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,5 +1,6 @@
 import streamlit as st
 import faiss
+import random
 import numpy as np
 import pandas as pd
 import requests
@@ -44,6 +45,10 @@ LABEL_TO_EMOTION = {
     "reaction_disgusted": "отвратительный",
 }

+placeholder_url = (
+    "https://critics.io/img/movies/poster-placeholder.png?text=Нет+изображения"
+)
+

 # --- Load index and metadata
 @st.cache_resource
@@ -104,11 +109,26 @@ def extract_named_entities(text):


 # --- Helper: Rerank and boost candidates
-def rerank_and_boost(query, df, I, rerank_scores, k=10):
+def rerank_and_boost(query, df, I, faiss_scores=None, rerank_scores=None, k=10):
     """
-    Applies post-reranking genre filtering and emotion-based boosting
-    on top of cross-encoder scores.
+    Applies genre filtering and emotion-based boosting.
+
+    Parameters:
+        query (str): The raw user query.
+        df (pd.DataFrame): Corpus DataFrame.
+        I (ndarray): FAISS indices, shape (1, N).
+        faiss_scores (ndarray): FAISS similarity scores, shape (1, N). Optional.
+        rerank_scores (List[float]): Optional. Cross-encoder scores aligned to I.
+        k (int): Number of final top results to return.
+
+    Returns:
+        List[Tuple[int, float, float]]: List of (index, final_score, boost) triples.
     """
+
+    assert I.ndim == 2 and I.shape[0] == 1, "I must be shape (1, N)"
+    if rerank_scores is not None:
+        assert len(rerank_scores) == I.shape[1], "rerank_scores must match length of I"
+
     entities = extract_named_entities(query)
     filtered_genres = set(entities.get("genres", []))
     emotion = entities.get("emotion", None)
@@ -116,11 +136,15 @@ def rerank_and_boost(query, df, I, rerank_scores, k=10):
     EMOTION_TO_LABEL = {v: k for k, v in LABEL_TO_EMOTION.items()}
     emotion_col = EMOTION_TO_LABEL.get(emotion, None)

-    # Pair FAISS results with reranked scores
-    results = list(zip(I[0], rerank_scores))
-    adjusted = []
-
-    for idx, base_score in results:
+    results = []
+    for i, idx in enumerate(I[0]):
+        base_score = (
+            rerank_scores[i]
+            if rerank_scores is not None
+            else faiss_scores[0][i]
+            if faiss_scores is not None
+            else 0
+        )
         row = df.iloc[idx]
         row_genres = set(str(row["genres"]).lower().split(","))

@@ -129,44 +153,36 @@ def rerank_and_boost(query, df, I, rerank_scores, k=10):
             continue

         score = base_score
-
         boost = 0
-        # Emotion boost (small)
+
         if emotion_col and emotion_col in df.columns:
             raw_val = row.get(emotion_col, 0)
             boost += 0.1 * raw_val
             score += boost

-        adjusted.append((idx, score, boost))
+        results.append((idx, score, boost))

-    # Sort by final adjusted score
-    adjusted.sort(key=lambda x: x[1], reverse=True)
-    final_indices = [idx for idx, _, _ in adjusted]
+    results.sort(key=lambda x: x[1], reverse=True)
+    final_indices = [idx for idx, _, _ in results]

-    # Fill if needed
+    # Fill to k
     if len(final_indices) < k:
         seen = set(final_indices)
-        for idx, base_score in results:
+        for i, idx in enumerate(I[0]):
             if idx not in seen:
-                final_indices.append(idx)
-                if len(final_indices) == k:
+                score = (
+                    rerank_scores[i]
+                    if rerank_scores is not None
+                    else faiss_scores[0][i]
+                    if faiss_scores is not None
+                    else 0
+                )
+                results.append((idx, score, 0))
+                seen.add(idx)
+                if len(seen) == k:
                     break

-    return adjusted[:k]
-
-
-# --- Local method
-# stopwords = ["хочу", "посмотреть", "смотреть", "нашел", "фильм", "покажи"]
-stopwords = []
-
-query = "хочу посмотреть фильм про кибер пиратов"
-query_cleaned = "query: " + "".join([word for word in query if word not in stopwords])
-query_vector = encoder.encode(query_cleaned, convert_to_numpy=True)
-query_vector = query_vector / np.linalg.norm(query_vector)
-query_vector = query_vector.astype("float32").reshape(1, -1)
-
-# --- Search FAISS
-D, I = index.search(query_vector, k=10)
+    return results[:k]


 # ---
@@ -176,40 +192,144 @@ D, I = index.search(query_vector, k=10)
 # ---


-st.title(query)
+st.title("Ищем фильмы и бананы шучу не бананы")
 st.divider()

-col1, col2, col3 = st.columns(3)
-
-with col1:
-    for idx, score in zip(I[0], D[0]):
-        title = df.iloc[idx]["title"]
-        desc = df.iloc[idx]["description"]
-        st.write(f"{title} | Score: {score:.4f}")
-        st.write(f"Description: {desc}")
-        st.write("")
-
-D, I = index.search(query_vector, k=50)
-candidate_pairs = [(query, df.iloc[idx]["description"]) for idx in I[0]]
-rerank_scores = cross_encoder.predict(candidate_pairs)
-results = list(zip(I[0], rerank_scores))
-results.sort(key=lambda x: x[1], reverse=True)
-
-with col2:
-    for idx, score in results[:10]:
-        title = df.iloc[idx]["title"]
-        desc = df.iloc[idx]["description"]
-        st.write(f"{title} | R-Score: {score:.4f}")
-        st.write(f"Description: {desc}")
-        st.write("")
-
-with col3:
-    final_idxs = rerank_and_boost(query, df, I, rerank_scores, k=10)
-    for idx, score, boost in final_idxs:
+search_col, worm_col = st.columns([6, 1])
+
+with search_col:
+    query = st.text_input(
+        "Что хотите посмотреть? 👇",
+        placeholder="хочу посмотреть фильм про кибер пиратов",
+    )
+
+left, right, _ = search_col.columns([4, 2, 6])
+
+with left:
+    option_map = {
+        0: ":material/pan_tool: Стоп-лист",
+        1: ":material/shuffle: Кросс-энкодинг",
+        2: ":material/add_reaction: Эмоция + Жанр",
+    }
+
+    selection = st.pills(
+        "",
+        options=option_map.keys(),
+        format_func=lambda option: option_map[option],
+        selection_mode="multi",
+    )
+    button = st.button("Подобрать фильмецы", icon=":material/movie:")
+
+with right:
+    user_k = st.number_input(
+        label="Сколько фильмов отобразить?",
+        min_value=3,
+        max_value=15,
+        value=5,
+        step=1,
+    )
+
+
+with worm_col:
+    st.image(
+        "data/worm.gif",
+        caption="Добрый день, моя девочка... Спешу пожелать тебе самого прекрасного невероятного поиска. Хочу чтобы твой фильм был самым прекрасным...",
+    )
+
+
+if button:
+    stopwords = []
+    if 0 in selection:
+        stopwords = ["хочу", "посмотреть", "смотреть", "нашел", "фильм", "покажи"]
+
+    query_cleaned = "query: " + " ".join(
+        [word for word in query.split() if word not in stopwords]
+    )
+    query_vector = encoder.encode(query_cleaned, convert_to_numpy=True)
+    query_vector = query_vector / np.linalg.norm(query_vector)
+    query_vector = query_vector.astype("float32").reshape(1, -1)
+
+    # --- Search FAISS
+    D, I = index.search(query_vector, k=50)
+
+    if any((1, 2)) not in selection:
+        boost = [0] * len(D[0])
+        results = list(zip(I[0], D[0], boost))
+
+    rerank_scores = None
+    if 1 in selection:
+        candidate_pairs = [(query, df.iloc[idx]["description"]) for idx in I[0]]
+        rerank_scores = cross_encoder.predict(candidate_pairs)
+        boost = [0] * len(rerank_scores)
+        results = list(zip(I[0], rerank_scores, boost))
+        results.sort(key=lambda x: x[1], reverse=True)
+
+    show_boost = False
+    if 2 in selection:
+        ners = extract_named_entities(query_cleaned)
+        if not any(ners.values()):
+            st.warning("Жанров и эмоций не обнаружено! Зачем опцию выбирал, умник?")
+        else:
+            colors = ["blue", "green", "orange", "red", "violet", "gray"]
+            badges = ""
+            for item in ners["genres"]:
+                random_color = random.choice(colors)
+                badges += f":{random_color}-badge[{item}]"
+            badges += f":{random.choice(colors)}-badge[{ners['emotion']}]"
+            info_cont = st.container(border=True)
+            info_cont.markdown(f"Дополнительно фильтруем по: {badges}")
+            # if ners["genres"]:
+            #     st.wr
+        results = rerank_and_boost(query, df, I, rerank_scores=rerank_scores, k=user_k)
+        show_boost = True
+
+    rank = 1
+    for idx, score, boost in results[:user_k]:
+        container = st.container(border=True)
+        col_rank, col_img, col_main = container.columns([2, 3, 10])
         title = df.iloc[idx]["title"]
         desc = df.iloc[idx]["description"]
         url = df.iloc[idx]["url"]
-        st.write(f"{title} | {score} | {boost}")
-        st.write(f"Description: {desc}")
-        st.write(f"URL: {url}")
-        st.write("")
+        kp_rating = df.iloc[idx]["kp_rating"]
+        imdb_rating = df.iloc[idx]["imdb_rating"]
+
+        rank_cont = col_rank.container(border=True)
+        score_cont = col_rank.container(border=True)
+        boost_cont = col_rank.container(border=True)
+        rank_cont.title(f"#{rank}")
+        if show_boost:
+            boost_cont.metric(label="Score", value=f"{score:.2f}", delta=f"{boost:.2f}")
+        else:
+            boost_cont.metric(label="Score", value=f"{score:.2f}")
+        image_url = df.iloc[idx]["poster_url"]
+        runtime = df.iloc[idx]["Длительность:"]
+        actors = df.iloc[idx]["actors"].replace("'", "")
+        year = df.iloc[idx]["release_date"].year
+        if (
+            pd.notna(image_url)
+            and isinstance(image_url, str)
+            and image_url.strip() != ""
+        ):
+            col_img.image(image_url, use_container_width=True)
+        else:
+            col_img.image(placeholder_url, use_container_width=True)
+        # --- Additional columns inside the container
+        col_title, col_ratings = col_main.columns([7, 3])
+        col_title.markdown(
+            f"<h3><a href='{url}' target='_blank'>{title}</a> ({year})</h3>",
+            unsafe_allow_html=True,
+        )
+        ratings_container = col_ratings.container(border=True)
+        ratings_container.markdown(
+            f"<h4>Кинопоиск: {kp_rating:.1f}</h4>",
+            unsafe_allow_html=True,
+        )
+        ratings_container.markdown(
+            f"<h4>IMDB: {imdb_rating:.1f}</h4>",
+            unsafe_allow_html=True,
+        )
+        col_title.write(f"Длительность: {runtime}")
+        col_title.write(f"Актеры: {actors[1:-1]}")
+        col_main.divider()
+        col_main.success(desc)
+        rank += 1
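Reviewer note on the new `if button:` block above (not part of the commit): `any((1, 2))` always evaluates to `True`, so the guard `if any((1, 2)) not in selection:` reduces to `if True not in selection:`, which Python treats as `if 1 not in selection:` because `True == 1`. If the intent is to fall back to plain FAISS ranking only when neither pill 1 (cross-encoding) nor pill 2 (emotion + genre) is selected, an explicit membership test says so directly. The snippet below is a hypothetical rewrite of that guard, assuming the same `selection`, `D`, and `I` variables as in the committed code:

    # Hypothetical clarification of the committed guard, not part of this commit:
    # build the FAISS-only result list only when neither rerank option is chosen.
    if not any(opt in selection for opt in (1, 2)):
        boost = [0] * len(D[0])
        results = list(zip(I[0], D[0], boost))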
backups/app-0.1.py ADDED
@@ -0,0 +1,294 @@
+import streamlit as st
+import faiss
+import numpy as np
+import pandas as pd
+import requests
+from sentence_transformers import SentenceTransformer, CrossEncoder
+import os
+
+from natasha import (
+    Segmenter,
+    MorphVocab,
+    NewsEmbedding,
+    NewsMorphTagger,
+    NewsNERTagger,
+    NamesExtractor,
+    Doc,
+)
+
+segmenter = Segmenter()
+morph_vocab = MorphVocab()
+emb = NewsEmbedding()
+morph_tagger = NewsMorphTagger(emb)
+ner_tagger = NewsNERTagger(emb)
+names_extractor = NamesExtractor(morph_vocab)
+
+st.set_page_config(page_title="test", page_icon=None, layout="wide")
+HF_TOKEN = os.getenv("HF_TOKEN")
+
+LABEL_TO_EMOTION = {
+    "reaction_relaxed": "легкий",
+    "reaction_hugging": "душевный",
+    "reaction_starstruck": "восхитительный",
+    "reaction_laughing": "смешной",
+    "reaction_thinking": "сложный",
+    "reaction_flushed": "неожиданный",
+    "reaction_grimacing": "напряженный",
+    "reaction_unamused": "слабый",
+    "reaction_loving": "романтический",
+    "reaction_grateful": "обнадеживающий",
+    "reaction_crying": "грустный",
+    "reaction_mindblown": "шокирующий",
+    "reaction_anxious": "страшный",
+    "reaction_silent": "депрессивный",
+    "reaction_disgusted": "отвратительный",
+}
+
+
+# --- Load index and metadata
+@st.cache_resource
+def load_faiss_index():
+    return faiss.read_index("data/index_e5_large.bin")
+
+
+@st.cache_data
+def load_dataframe():
+    return pd.read_parquet("data/films_fin.parquet")
+
+
+index = load_faiss_index()
+df = load_dataframe()
+
+
+# --- Load models
+@st.cache_resource
+def load_models():
+    encoder = SentenceTransformer("intfloat/multilingual-e5-large")
+    cross_encoder = CrossEncoder("DiTy/cross-encoder-russian-msmarco")
+    return encoder, cross_encoder
+
+
+encoder, cross_encoder = load_models()
+
+# --- Get genres list
+genres_series = df["genres"].dropna()
+all_genres = genres_series.str.split(",").explode()
+unique_genres = all_genres.str.strip().unique()
+
+# --- Helper: return named entities
+emotion_keywords = set(LABEL_TO_EMOTION.values())
+
+
+def extract_named_entities(text):
+    doc = Doc(text)
+    doc.segment(segmenter)
+    doc.tag_morph(morph_tagger)
+    doc.tag_ner(ner_tagger)
+
+    # --- Genres ---
+    genres = []
+    genre_keywords = set(g.lower() for g in unique_genres)
+    emotion = None  # ✅ fix: always define before the loop
+
+    for token in doc.tokens:
+        if not token.pos:  # Skip if no POS (e.g., punctuation)
+            continue
+        token.lemmatize(morph_vocab)
+        lemma = token.lemma.lower()
+        if lemma in genre_keywords:
+            genres.append(lemma)
+        if lemma in emotion_keywords:
+            emotion = lemma
+
+    return {"genres": genres, "emotion": emotion}
+
+
+# --- Helper: Rerank and boost candidates
+def rerank_and_boost(query, df, I, faiss_scores=None, rerank_scores=None, k=10):
+    """
+    Applies genre filtering and emotion-based boosting.
+
+    Parameters:
+        query (str): The raw user query.
+        df (pd.DataFrame): Corpus DataFrame.
+        I (ndarray): FAISS indices, shape (1, N).
+        faiss_scores (ndarray): FAISS similarity scores, shape (1, N). Optional.
+        rerank_scores (List[float]): Optional. Cross-encoder scores aligned to I.
+        k (int): Number of final top results to return.
+
+    Returns:
+        List[Tuple[int, float, float]]: List of (index, final_score, boost) triples.
+    """
+
+    assert I.ndim == 2 and I.shape[0] == 1, "I must be shape (1, N)"
+    if rerank_scores is not None:
+        assert len(rerank_scores) == I.shape[1], "rerank_scores must match length of I"
+
+    entities = extract_named_entities(query)
+    filtered_genres = set(entities.get("genres", []))
+    emotion = entities.get("emotion", None)
+
+    EMOTION_TO_LABEL = {v: k for k, v in LABEL_TO_EMOTION.items()}
+    emotion_col = EMOTION_TO_LABEL.get(emotion, None)
+
+    results = []
+    for i, idx in enumerate(I[0]):
+        base_score = (
+            rerank_scores[i]
+            if rerank_scores is not None
+            else faiss_scores[0][i]
+            if faiss_scores is not None
+            else 0
+        )
+        row = df.iloc[idx]
+        row_genres = set(str(row["genres"]).lower().split(","))
+
+        genre_match = not filtered_genres or (filtered_genres & row_genres)
+        if not genre_match:
+            continue
+
+        score = base_score
+        boost = 0
+
+        if emotion_col and emotion_col in df.columns:
+            raw_val = row.get(emotion_col, 0)
+            boost += 0.1 * raw_val
+            score += boost
+
+        results.append((idx, score, boost))
+
+    results.sort(key=lambda x: x[1], reverse=True)
+    final_indices = [idx for idx, _, _ in results]
+
+    # Fill to k
+    if len(final_indices) < k:
+        seen = set(final_indices)
+        for i, idx in enumerate(I[0]):
+            if idx not in seen:
+                score = (
+                    rerank_scores[i]
+                    if rerank_scores is not None
+                    else faiss_scores[0][i]
+                    if faiss_scores is not None
+                    else 0
+                )
+                results.append((idx, score, 0))
+                seen.add(idx)
+                if len(seen) == k:
+                    break
+
+    return results[:k]
+
+
+# ---
+# ---
+# ---
+# ---
+# ---
+
+
+# st.title(query)
+# st.divider()
+
+search_1, search_2 = st.columns([3, 1])
+
+with search_1:
+    query = st.text_input(
+        "Что хотите посмотреть? 👇",
+        placeholder="хочу посмотреть фильм про кибер пиратов",
+    )
+
+with search_2:
+    user_k = st.number_input(
+        label="Сколько фильмов отобразить?", min_value=3, max_value=15, value=5, step=1
+    )
+
+option_map = {
+    0: ":material/pan_tool: Стоп-лист",
+    1: ":material/shuffle: Кросс-энкодинг",
+    2: ":material/add_reaction: Эмоция + Жанр",
+}
+
+selection = st.pills(
+    "",
+    options=option_map.keys(),
+    format_func=lambda option: option_map[option],
+    selection_mode="multi",
+)
+
+button = st.button("Подобрать фильмецы", icon=":material/movie:")
+
+if button:
+    stopwords = []
+    if 0 in selection:
+        stopwords = ["хочу", "посмотреть", "смотреть", "нашел", "фильм", "покажи"]
+
+    query_cleaned = "query: " + " ".join(
+        [word for word in query.split() if word not in stopwords]
+    )
+    query_vector = encoder.encode(query_cleaned, convert_to_numpy=True)
+    query_vector = query_vector / np.linalg.norm(query_vector)
+    query_vector = query_vector.astype("float32").reshape(1, -1)
+
+    # --- Search FAISS
+    D, I = index.search(query_vector, k=50)
+
+    if any((1, 2)) not in selection:
+        boost = [0] * len(D[0])
+        results = list(zip(I[0], D[0], boost))
+
+    rerank_scores = None
+    if 1 in selection:
+        candidate_pairs = [(query, df.iloc[idx]["description"]) for idx in I[0]]
+        rerank_scores = cross_encoder.predict(candidate_pairs)
+        boost = [0] * len(rerank_scores)
+        results = list(zip(I[0], rerank_scores, boost))
+        results.sort(key=lambda x: x[1], reverse=True)
+
+    if 2 in selection:
+        results = rerank_and_boost(query, df, I, rerank_scores=rerank_scores, k=user_k)
+
+    for idx, score, boost in results[:user_k]:
+        container = st.container(border=True)
+        title = df.iloc[idx]["title"]
+        desc = df.iloc[idx]["description"]
+        url = df.iloc[idx]["url"]
+        container.write(f"{title} | {score} | {boost}")
+        container.write(f"Description: {desc}")
+        container.write(f"URL: {url}")
+        container.write("")
+
+# col1, col2, col3 = st.columns(3)
+
+# with col1:
+#     for idx, score in zip(I[0], D[0]):
+#         title = df.iloc[idx]["title"]
+#         desc = df.iloc[idx]["description"]
+#         st.write(f"{title} | Score: {score:.4f}")
+#         st.write(f"Description: {desc}")
+#         st.write("")
+
+# D, I = index.search(query_vector, k=50)
+# candidate_pairs = [(query, df.iloc[idx]["description"]) for idx in I[0]]
+# rerank_scores = cross_encoder.predict(candidate_pairs)
+# results = list(zip(I[0], rerank_scores))
+# results.sort(key=lambda x: x[1], reverse=True)
+
+# with col2:
+#     for idx, score in results[:10]:
+#         title = df.iloc[idx]["title"]
+#         desc = df.iloc[idx]["description"]
+#         st.write(f"{title} | R-Score: {score:.4f}")
+#         st.write(f"Description: {desc}")
+#         st.write("")
+
+# with col3:
+#     final_idxs = rerank_and_boost(query, df, I, rerank_scores, k=10)
+#     for idx, score, boost in final_idxs:
+#         title = df.iloc[idx]["title"]
+#         desc = df.iloc[idx]["description"]
+#         url = df.iloc[idx]["url"]
+#         st.write(f"{title} | {score} | {boost}")
+#         st.write(f"Description: {desc}")
+#         st.write(f"URL: {url}")
+#         st.write("")
data/films_fin-bk.parquet ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d8188bcba6a37caf0bc12037fa47fcae4b2b00dbf7cd4c17a393f1d1234da2f3
+size 40296924
data/films_fin.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e64a358ad21171dbbdc0771a0bbd832923acfe2cdbb43393e85be7b0a9fd7e1e
-size 39640033
+oid sha256:71ffa296fc49e95a075d3458ebdec1388c82118ed11a2a74373539afb82ec612
+size 40248713
data/worm.gif ADDED

Git LFS Details

  • SHA256: 5a64a542849aa4056c8d8b62e02c11d083c2525b5c871ad40c8875b30c310fd9
  • Pointer size: 132 Bytes
  • Size of remote file: 6.87 MB
notebooks/preproc.ipynb CHANGED
The diff for this file is too large to render. See raw diff