Spaces:

aeternii
/

Mama-Mia-Movie-Posmotria

Sleeping

App Files Files Community

Alex Godelashvili committed on Apr 18

Commit

47b2637

1 Parent(s): 55d889f

updated streamlit

Browse files

Files changed (7) hide show

.gitattributes +1 -0
app.py +185 -65
backups/app-0.1.py +294 -0
data/films_fin-bk.parquet +3 -0
data/films_fin.parquet +2 -2
data/worm.gif +3 -0
notebooks/preproc.ipynb +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import streamlit as st
 import faiss
 import numpy as np
 import pandas as pd
 import requests
@@ -44,6 +45,10 @@ LABEL_TO_EMOTION = {
     "reaction_disgusted": "отвратительный",
 }
 # --- Load index and metadata
 @st.cache_resource
@@ -104,11 +109,26 @@ def extract_named_entities(text):
 # --- Helper: Rerank and boost candidates
-def rerank_and_boost(query, df, I, rerank_scores, k=10):
     """
-    Applies post-reranking genre filtering and emotion-based boosting
-    on top of cross-encoder scores.
     """
     entities = extract_named_entities(query)
     filtered_genres = set(entities.get("genres", []))
     emotion = entities.get("emotion", None)
@@ -116,11 +136,15 @@ def rerank_and_boost(query, df, I, rerank_scores, k=10):
     EMOTION_TO_LABEL = {v: k for k, v in LABEL_TO_EMOTION.items()}
     emotion_col = EMOTION_TO_LABEL.get(emotion, None)
-    # Pair FAISS results with reranked scores
-    results = list(zip(I[0], rerank_scores))
-    adjusted = []
-    for idx, base_score in results:
         row = df.iloc[idx]
         row_genres = set(str(row["genres"]).lower().split(","))
@@ -129,44 +153,36 @@ def rerank_and_boost(query, df, I, rerank_scores, k=10):
             continue
         score = base_score
         boost = 0
-        # Emotion boost (small)
         if emotion_col and emotion_col in df.columns:
             raw_val = row.get(emotion_col, 0)
             boost += 0.1 * raw_val
             score += boost
-        adjusted.append((idx, score, boost))
-    # Sort by final adjusted score
-    adjusted.sort(key=lambda x: x[1], reverse=True)
-    final_indices = [idx for idx, _, _ in adjusted]
-    # Fill if needed
     if len(final_indices) < k:
         seen = set(final_indices)
-        for idx, base_score in results:
             if idx not in seen:
-                final_indices.append(idx)
-            if len(final_indices) == k:
                 break
-    return adjusted[:k]
-# --- Local method
-# stopwords = ["хочу", "посмотреть", "смотреть", "нашел", "фильм", "покажи"]
-stopwords = []
-query = "хочу посмотреть фильм про кибер пиратов"
-query_cleaned = "query: " + "".join([word for word in query if word not in stopwords])
-query_vector = encoder.encode(query_cleaned, convert_to_numpy=True)
-query_vector = query_vector / np.linalg.norm(query_vector)
-query_vector = query_vector.astype("float32").reshape(1, -1)
-# --- Search FAISS
-D, I = index.search(query_vector, k=10)
 # ---
@@ -176,40 +192,144 @@ D, I = index.search(query_vector, k=10)
 # ---
-st.title(query)
 st.divider()
-col1, col2, col3 = st.columns(3)
-with col1:
-    for idx, score in zip(I[0], D[0]):
-        title = df.iloc[idx]["title"]
-        desc = df.iloc[idx]["description"]
-        st.write(f"{title} | Score: {score:.4f}")
-        st.write(f"Description: {desc}")
-        st.write("")
-D, I = index.search(query_vector, k=50)
-candidate_pairs = [(query, df.iloc[idx]["description"]) for idx in I[0]]
-rerank_scores = cross_encoder.predict(candidate_pairs)
-results = list(zip(I[0], rerank_scores))
-results.sort(key=lambda x: x[1], reverse=True)
-with col2:
-    for idx, score in results[:10]:
-        title = df.iloc[idx]["title"]
-        desc = df.iloc[idx]["description"]
-        st.write(f"{title} | R-Score: {score:.4f}")
-        st.write(f"Description: {desc}")
-        st.write("")
-with col3:
-    final_idxs = rerank_and_boost(query, df, I, rerank_scores, k=10)
-    for idx, score, boost in final_idxs:
         title = df.iloc[idx]["title"]
         desc = df.iloc[idx]["description"]
         url = df.iloc[idx]["url"]
-        st.write(f"{title} | {score} | {boost}")
-        st.write(f"Description: {desc}")
-        st.write(f"URL: {url}")
-        st.write("")

 import streamlit as st
 import faiss
+import random
 import numpy as np
 import pandas as pd
 import requests
     "reaction_disgusted": "отвратительный",
 }
+# Fallback poster image, used by the result renderer when a film has no usable poster_url.
+placeholder_url = (
+    "https://critics.io/img/movies/poster-placeholder.png?text=Нет+изображения"
+)
 # --- Load index and metadata
 @st.cache_resource
 # --- Helper: Rerank and boost candidates
+def rerank_and_boost(query, df, I, faiss_scores=None, rerank_scores=None, k=10):
     """
+    Applies genre filtering and emotion-based boosting.
+    Parameters:
+        query (str): The raw user query.
+        df (pd.DataFrame): Corpus DataFrame.
+        I (ndarray): FAISS indices, shape (1, N).
+        faiss_scores (ndarray): FAISS similarity scores, shape (1, N). Optional.
+        rerank_scores (List[float]): Optional. Cross-encoder scores aligned to I.
+        k (int): Number of final top results to return.
+    Returns:
+        List[Tuple[int, float, float]]: List of (index, final_score, boost) triples.
     """
+    assert I.ndim == 2 and I.shape[0] == 1, "I must be shape (1, N)"
+    if rerank_scores is not None:
+        assert len(rerank_scores) == I.shape[1], "rerank_scores must match length of I"
     entities = extract_named_entities(query)
     filtered_genres = set(entities.get("genres", []))
     emotion = entities.get("emotion", None)
     EMOTION_TO_LABEL = {v: k for k, v in LABEL_TO_EMOTION.items()}
     emotion_col = EMOTION_TO_LABEL.get(emotion, None)
+    results = []
+    for i, idx in enumerate(I[0]):
+        base_score = (
+            rerank_scores[i]
+            if rerank_scores is not None
+            else faiss_scores[0][i]
+            if faiss_scores is not None
+            else 0
+        )
         row = df.iloc[idx]
         row_genres = set(str(row["genres"]).lower().split(","))
             continue
         score = base_score
         boost = 0
         if emotion_col and emotion_col in df.columns:
             raw_val = row.get(emotion_col, 0)
             boost += 0.1 * raw_val
             score += boost
+        results.append((idx, score, boost))
+    results.sort(key=lambda x: x[1], reverse=True)
+    final_indices = [idx for idx, _, _ in results]
+    # Fill to k
     if len(final_indices) < k:
         seen = set(final_indices)
+        for i, idx in enumerate(I[0]):
             if idx not in seen:
+                score = (
+                    rerank_scores[i]
+                    if rerank_scores is not None
+                    else faiss_scores[0][i]
+                    if faiss_scores is not None
+                    else 0
+                )
+                results.append((idx, score, 0))
+                seen.add(idx)
+            if len(seen) == k:
                 break
+    return results[:k]
 # ---
 # ---
+st.title("Ищем фильмы и бананы шучу не бананы")
 st.divider()
+# Layout: a wide column for the search controls and a narrow one for the GIF.
+search_col, worm_col = st.columns([6, 1])
+with search_col:
+    query = st.text_input(
+        "Что хотите посмотреть? 👇",
+        placeholder="хочу посмотреть фильм про кибер пиратов",
+    )
+    left, right, _ = search_col.columns([4, 2, 6])
+    with left:
+        # Keys are the option ids later tested with `in selection`;
+        # values are the pill captions shown to the user.
+        option_map = {
+            0: ":material/pan_tool: Стоп-лист",
+            1: ":material/shuffle: Кросс-энкодинг",
+            2: ":material/add_reaction: Эмоция + Жанр",
+        }
+        # Streamlit discourages an empty widget label (accessibility), so
+        # give the pills a real label and hide it instead of passing "".
+        selection = st.pills(
+            "Опции поиска",
+            options=list(option_map),
+            format_func=lambda option: option_map[option],
+            selection_mode="multi",
+            label_visibility="collapsed",
+        )
+        button = st.button("Подобрать фильмецы", icon=":material/movie:")
+    with right:
+        # Number of result cards to render (3..15, default 5).
+        user_k = st.number_input(
+            label="Сколько фильмов отобразить?",
+            min_value=3,
+            max_value=15,
+            value=5,
+            step=1,
+        )
+with worm_col:
+    st.image(
+        "data/worm.gif",
+        caption="Добрый день, моя девочка... Спешу пожелать тебе самого прекрасного невероятного поиска. Хочу чтобы твой фильм был самым прекрасным...",
+    )
+# Search pipeline, run once per click: encode the query, fetch 50 FAISS
+# candidates, optionally rerank / boost them, then render the top user_k cards.
+if button:
+    # Option 0 (stop-list): drop filler words before encoding. Matching is
+    # case-insensitive so that "Хочу" is removed as well as "хочу".
+    stopwords = set()
+    if 0 in selection:
+        stopwords = {"хочу", "посмотреть", "смотреть", "нашел", "фильм", "покажи"}
+    # NOTE(review): the "query: " prefix is presumably what the e5 encoder
+    # expects for search queries - confirm against the model card.
+    query_cleaned = "query: " + " ".join(
+        word for word in query.split() if word.lower() not in stopwords
+    )
+    query_vector = encoder.encode(query_cleaned, convert_to_numpy=True)
+    # L2-normalise and reshape to the (1, dim) float32 row passed to FAISS.
+    query_vector = query_vector / np.linalg.norm(query_vector)
+    query_vector = query_vector.astype("float32").reshape(1, -1)
+    # --- Search FAISS
+    D, I = index.search(query_vector, k=50)
+    # Baseline ranking: FAISS order and scores with zero boost. Options 1
+    # and 2 below overwrite it. (The former `any((1, 2)) not in selection`
+    # test evaluated `True not in selection`, not "neither option chosen".)
+    results = list(zip(I[0], D[0], [0] * len(D[0])))
+    rerank_scores = None
+    if 1 in selection:
+        # Option 1: rescore every candidate with the cross-encoder.
+        candidate_pairs = [(query, df.iloc[idx]["description"]) for idx in I[0]]
+        rerank_scores = cross_encoder.predict(candidate_pairs)
+        results = list(zip(I[0], rerank_scores, [0] * len(rerank_scores)))
+        results.sort(key=lambda x: x[1], reverse=True)
+    show_boost = False
+    if 2 in selection:
+        # Option 2: genre filter + emotion boost on top of the current scores.
+        ners = extract_named_entities(query_cleaned)
+        if not any(ners.values()):
+            st.warning("Жанров и эмоций не обнаружено! Зачем опцию выбирал, умник?")
+        else:
+            colors = ["blue", "green", "orange", "red", "violet", "gray"]
+            badge_labels = list(ners["genres"])
+            # Only show an emotion badge when one was found; otherwise the
+            # literal text "None" would be rendered as a badge.
+            if ners["emotion"]:
+                badge_labels.append(ners["emotion"])
+            badges = "".join(
+                f":{random.choice(colors)}-badge[{item}]" for item in badge_labels
+            )
+            info_cont = st.container(border=True)
+            info_cont.markdown(f"Дополнительно фильтруем по: {badges}")
+        # Pass the FAISS scores too: without them, and without cross-encoder
+        # scores, every base score silently falls back to 0.
+        results = rerank_and_boost(
+            query, df, I, faiss_scores=D, rerank_scores=rerank_scores, k=user_k
+        )
+        show_boost = True
+    for rank, (idx, score, boost) in enumerate(results[:user_k], start=1):
+        container = st.container(border=True)
+        col_rank, col_img, col_main = container.columns([2, 3, 10])
         title = df.iloc[idx]["title"]
         desc = df.iloc[idx]["description"]
         url = df.iloc[idx]["url"]
+        kp_rating = df.iloc[idx]["kp_rating"]
+        imdb_rating = df.iloc[idx]["imdb_rating"]
+        rank_cont = col_rank.container(border=True)
+        # NOTE(review): score_cont is never written to, so it looks like it
+        # only renders an empty bordered box - confirm it is intentional.
+        score_cont = col_rank.container(border=True)
+        boost_cont = col_rank.container(border=True)
+        rank_cont.title(f"#{rank}")
+        if show_boost:
+            boost_cont.metric(label="Score", value=f"{score:.2f}", delta=f"{boost:.2f}")
+        else:
+            boost_cont.metric(label="Score", value=f"{score:.2f}")
+        image_url = df.iloc[idx]["poster_url"]
+        runtime = df.iloc[idx]["Длительность:"]
+        # NOTE(review): assumes "actors" is a stringified list such as
+        # "['A', 'B']" (quotes removed here, brackets sliced off below) and
+        # that "release_date" is a datetime - confirm against the parquet.
+        actors = df.iloc[idx]["actors"].replace("'", "")
+        year = df.iloc[idx]["release_date"].year
+        if (
+            pd.notna(image_url)
+            and isinstance(image_url, str)
+            and image_url.strip() != ""
+        ):
+            col_img.image(image_url, use_container_width=True)
+        else:
+            col_img.image(placeholder_url, use_container_width=True)
+        # --- Additional columns inside the container
+        col_title, col_ratings = col_main.columns([7, 3])
+        # NOTE(review): url and title are interpolated into raw HTML without
+        # escaping; fine only while the dataset is trusted.
+        col_title.markdown(
+            f"<h3><a href='{url}' target='_blank'>{title}</a> ({year})</h3>",
+            unsafe_allow_html=True,
+        )
+        ratings_container = col_ratings.container(border=True)
+        ratings_container.markdown(
+            f"<h4>Кинопоиск: {kp_rating:.1f}</h4>",
+            unsafe_allow_html=True,
+        )
+        ratings_container.markdown(
+            f"<h4>IMDB: {imdb_rating:.1f}</h4>",
+            unsafe_allow_html=True,
+        )
+        col_title.write(f"Длительность: {runtime}")
+        col_title.write(f"Актеры: {actors[1:-1]}")
+        col_main.divider()
+        col_main.success(desc)

backups/app-0.1.py ADDED Viewed

	@@ -0,0 +1,294 @@

+import streamlit as st
+import faiss
+import numpy as np
+import pandas as pd
+import requests
+from sentence_transformers import SentenceTransformer, CrossEncoder
+import os
+from natasha import (
+    Segmenter,
+    MorphVocab,
+    NewsEmbedding,
+    NewsMorphTagger,
+    NewsNERTagger,
+    NamesExtractor,
+    Doc,
+)
+# Natasha pipeline objects shared by extract_named_entities().
+segmenter = Segmenter()
+morph_vocab = MorphVocab()
+emb = NewsEmbedding()
+morph_tagger = NewsMorphTagger(emb)
+ner_tagger = NewsNERTagger(emb)
+# NOTE(review): names_extractor is not referenced anywhere else in the visible code.
+names_extractor = NamesExtractor(morph_vocab)
+st.set_page_config(page_title="test", page_icon=None, layout="wide")
+# NOTE(review): HF_TOKEN is read here but not used in the visible code.
+HF_TOKEN = os.getenv("HF_TOKEN")
+# Maps a "reaction_*" label to the Russian adjective users may type in a query.
+# The values serve as emotion keywords; rerank_and_boost() inverts the mapping
+# to look the label up as a DataFrame column name.
+LABEL_TO_EMOTION = {
+    "reaction_relaxed": "легкий",
+    "reaction_hugging": "душевный",
+    "reaction_starstruck": "восхитительный",
+    "reaction_laughing": "смешной",
+    "reaction_thinking": "сложный",
+    "reaction_flushed": "неожиданный",
+    "reaction_grimacing": "напряженный",
+    "reaction_unamused": "слабый",
+    "reaction_loving": "романтический",
+    "reaction_grateful": "обнадеживающий",
+    "reaction_crying": "грустный",
+    "reaction_mindblown": "шокирующий",
+    "reaction_anxious": "страшный",
+    "reaction_silent": "депрессивный",
+    "reaction_disgusted": "отвратительный",
+}
+# --- Load index and metadata
+@st.cache_resource
+def load_faiss_index():
+    """Read the FAISS index stored at ``data/index_e5_large.bin``.
+
+    Wrapped in ``st.cache_resource``, so the loaded index object is
+    cached by Streamlit rather than re-read on every call.
+    """
+    index_path = "data/index_e5_large.bin"
+    faiss_index = faiss.read_index(index_path)
+    return faiss_index
+@st.cache_data
+def load_dataframe():
+    """Load the film metadata table from ``data/films_fin.parquet``.
+
+    Wrapped in ``st.cache_data``, so Streamlit caches the returned
+    DataFrame between calls.
+    """
+    parquet_path = "data/films_fin.parquet"
+    films = pd.read_parquet(parquet_path)
+    return films
+# Module-level handles used by the search code below; both loaders are
+# wrapped in Streamlit cache decorators.
+index = load_faiss_index()
+df = load_dataframe()
+# --- Load models
+@st.cache_resource
+def load_models():
+    """Instantiate the retrieval models and return them as a pair.
+
+    Returns:
+        tuple: ``(encoder, cross_encoder)`` - the SentenceTransformer used
+        to embed queries and the CrossEncoder used to rescore candidates.
+    """
+    return (
+        SentenceTransformer("intfloat/multilingual-e5-large"),
+        CrossEncoder("DiTy/cross-encoder-russian-msmarco"),
+    )
+encoder, cross_encoder = load_models()
+# --- Get genres list
+# Split the comma-separated "genres" column into individual names and trim
+# the surrounding whitespace, keeping each distinct genre once.
+genres_series = df["genres"].dropna()
+all_genres = genres_series.str.split(",").explode()
+unique_genres = all_genres.str.strip().unique()
+# --- Helper: return named entities
+# Emotion adjectives recognised in queries (the values of LABEL_TO_EMOTION).
+emotion_keywords = set(LABEL_TO_EMOTION.values())
+def extract_named_entities(text):
+    """Pick genre and emotion keywords out of *text* by lemma lookup.
+
+    The text is segmented and morph/NER-tagged with the module-level
+    Natasha objects; each token that has a POS tag is lemmatized and its
+    lowercased lemma is checked against the known genres and emotions.
+
+    Returns:
+        dict: ``{"genres": [...], "emotion": ...}`` where ``genres`` lists
+        every matching lemma in token order (duplicates kept) and
+        ``emotion`` is the last matching emotion lemma, or ``None``.
+    """
+    parsed = Doc(text)
+    parsed.segment(segmenter)
+    parsed.tag_morph(morph_tagger)
+    parsed.tag_ner(ner_tagger)
+    known_genres = {name.lower() for name in unique_genres}
+    found_genres = []
+    found_emotion = None  # stays None when no emotion word is present
+    for tok in parsed.tokens:
+        # Tokens without a POS tag are ignored.
+        if tok.pos:
+            tok.lemmatize(morph_vocab)
+            word = tok.lemma.lower()
+            if word in known_genres:
+                found_genres.append(word)
+            if word in emotion_keywords:
+                found_emotion = word
+    return {"genres": found_genres, "emotion": found_emotion}
+# --- Helper: Rerank and boost candidates
+def rerank_and_boost(query, df, I, faiss_scores=None, rerank_scores=None, k=10):
+    """
+    Filter candidates by the query's genres and boost them by its emotion.
+
+    Genres and the emotion are taken from extract_named_entities(query).
+    A candidate is kept when the query names no genre, or when at least one
+    named genre appears in the row's comma-separated "genres" value (tokens
+    are lowercased and whitespace-trimmed before comparison). If the query
+    emotion maps to a column present in df, 0.1 * that column's value is
+    added to the candidate's score. When fewer than k candidates survive
+    the filter, the list is padded with unboosted candidates in FAISS order.
+
+    Parameters:
+        query (str): The raw user query.
+        df (pd.DataFrame): Corpus DataFrame.
+        I (ndarray): FAISS indices, shape (1, N).
+        faiss_scores (ndarray): FAISS similarity scores, shape (1, N). Optional.
+        rerank_scores (List[float]): Optional. Cross-encoder scores aligned to I.
+        k (int): Number of final top results to return.
+    Returns:
+        List[Tuple[int, float, float]]: List of (index, final_score, boost)
+        triples, at most k long. The base score is the rerank score if
+        given, else the FAISS score if given, else 0.
+    Raises:
+        AssertionError: If I is not shape (1, N) or rerank_scores does not
+        have N entries.
+    """
+    assert I.ndim == 2 and I.shape[0] == 1, "I must be shape (1, N)"
+    if rerank_scores is not None:
+        assert len(rerank_scores) == I.shape[1], "rerank_scores must match length of I"
+
+    def base_score_at(pos):
+        # Score source priority: cross-encoder, then FAISS, then 0.
+        if rerank_scores is not None:
+            return rerank_scores[pos]
+        if faiss_scores is not None:
+            return faiss_scores[0][pos]
+        return 0
+
+    entities = extract_named_entities(query)
+    filtered_genres = set(entities.get("genres", []))
+    emotion = entities.get("emotion", None)
+    # Invert label -> adjective so the query's adjective yields a column name.
+    emotion_to_label = {word: label for label, word in LABEL_TO_EMOTION.items()}
+    emotion_col = emotion_to_label.get(emotion, None)
+    use_emotion = bool(emotion_col) and emotion_col in df.columns
+    results = []
+    for i, idx in enumerate(I[0]):
+        row = df.iloc[idx]
+        # Trim each token: with a ", " separator the pieces would otherwise
+        # keep a leading space and never match the extracted genre lemmas.
+        row_genres = {g.strip() for g in str(row["genres"]).lower().split(",")}
+        if filtered_genres and not (filtered_genres & row_genres):
+            continue
+        score = base_score_at(i)
+        boost = 0
+        if use_emotion:
+            raw_val = row.get(emotion_col, 0)
+            # A missing value would turn the score into NaN and break sorting.
+            if pd.notna(raw_val):
+                boost += 0.1 * raw_val
+                score += boost
+        results.append((idx, score, boost))
+    results.sort(key=lambda x: x[1], reverse=True)
+    # Fill to k
+    if len(results) < k:
+        seen = {idx for idx, _, _ in results}
+        for i, idx in enumerate(I[0]):
+            if idx not in seen:
+                results.append((idx, base_score_at(i), 0))
+                seen.add(idx)
+            if len(seen) == k:
+                break
+    return results[:k]
+# ---
+# ---
+# ---
+# ---
+# ---
+# st.title(query)
+# st.divider()
+# Search bar: free-text query on the left, number of results on the right.
+search_1, search_2 = st.columns([3, 1])
+with search_1:
+    query = st.text_input(
+        "Что хотите посмотреть? 👇",
+        placeholder="хочу посмотреть фильм про кибер пиратов",
+    )
+with search_2:
+    user_k = st.number_input(
+        label="Сколько фильмов отобразить?", min_value=3, max_value=15, value=5, step=1
+    )
+# Keys are the option ids later tested with `in selection`; values are the
+# pill captions shown to the user.
+option_map = {
+    0: ":material/pan_tool: Стоп-лист",
+    1: ":material/shuffle: Кросс-энкодинг",
+    2: ":material/add_reaction: Эмоция + Жанр",
+}
+# Streamlit discourages an empty widget label (accessibility), so give the
+# pills a real label and hide it instead of passing "".
+selection = st.pills(
+    "Опции поиска",
+    options=list(option_map),
+    format_func=lambda option: option_map[option],
+    selection_mode="multi",
+    label_visibility="collapsed",
+)
+button = st.button("Подобрать фильмецы", icon=":material/movie:")
+# Search pipeline, run once per click: encode the query, fetch 50 FAISS
+# candidates, optionally rerank / boost them, then print the top user_k.
+if button:
+    # Option 0 (stop-list): drop filler words before encoding. Matching is
+    # case-insensitive so that "Хочу" is removed as well as "хочу".
+    stopwords = set()
+    if 0 in selection:
+        stopwords = {"хочу", "посмотреть", "смотреть", "нашел", "фильм", "покажи"}
+    # NOTE(review): the "query: " prefix is presumably what the e5 encoder
+    # expects for search queries - confirm against the model card.
+    query_cleaned = "query: " + " ".join(
+        word for word in query.split() if word.lower() not in stopwords
+    )
+    query_vector = encoder.encode(query_cleaned, convert_to_numpy=True)
+    # L2-normalise and reshape to the (1, dim) float32 row passed to FAISS.
+    query_vector = query_vector / np.linalg.norm(query_vector)
+    query_vector = query_vector.astype("float32").reshape(1, -1)
+    # --- Search FAISS
+    D, I = index.search(query_vector, k=50)
+    # Baseline ranking: FAISS order and scores with zero boost. Options 1
+    # and 2 below overwrite it. (The former `any((1, 2)) not in selection`
+    # test evaluated `True not in selection`, not "neither option chosen".)
+    results = list(zip(I[0], D[0], [0] * len(D[0])))
+    rerank_scores = None
+    if 1 in selection:
+        # Option 1: rescore every candidate with the cross-encoder.
+        candidate_pairs = [(query, df.iloc[idx]["description"]) for idx in I[0]]
+        rerank_scores = cross_encoder.predict(candidate_pairs)
+        results = list(zip(I[0], rerank_scores, [0] * len(rerank_scores)))
+        results.sort(key=lambda x: x[1], reverse=True)
+    if 2 in selection:
+        # Option 2: genre filter + emotion boost. Pass the FAISS scores too:
+        # without them, and without cross-encoder scores, every base score
+        # silently falls back to 0.
+        results = rerank_and_boost(
+            query, df, I, faiss_scores=D, rerank_scores=rerank_scores, k=user_k
+        )
+    for idx, score, boost in results[:user_k]:
+        container = st.container(border=True)
+        row = df.iloc[idx]
+        title = row["title"]
+        desc = row["description"]
+        url = row["url"]
+        container.write(f"{title} | {score} | {boost}")
+        container.write(f"Description: {desc}")
+        container.write(f"URL: {url}")
+        container.write("")
+    # col1, col2, col3 = st.columns(3)
+    # with col1:
+    #     for idx, score in zip(I[0], D[0]):
+    #         title = df.iloc[idx]["title"]
+    #         desc = df.iloc[idx]["description"]
+    #         st.write(f"{title} | Score: {score:.4f}")
+    #         st.write(f"Description: {desc}")
+    #         st.write("")
+    # D, I = index.search(query_vector, k=50)
+    # candidate_pairs = [(query, df.iloc[idx]["description"]) for idx in I[0]]
+    # rerank_scores = cross_encoder.predict(candidate_pairs)
+    # results = list(zip(I[0], rerank_scores))
+    # results.sort(key=lambda x: x[1], reverse=True)
+    # with col2:
+    #     for idx, score in results[:10]:
+    #         title = df.iloc[idx]["title"]
+    #         desc = df.iloc[idx]["description"]
+    #         st.write(f"{title} | R-Score: {score:.4f}")
+    #         st.write(f"Description: {desc}")
+    #         st.write("")
+    # with col3:
+    #     final_idxs = rerank_and_boost(query, df, I, rerank_scores, k=10)
+    #     for idx, score, boost in final_idxs:
+    #         title = df.iloc[idx]["title"]
+    #         desc = df.iloc[idx]["description"]
+    #         url = df.iloc[idx]["url"]
+    #         st.write(f"{title} | {score} | {boost}")
+    #         st.write(f"Description: {desc}")
+    #         st.write(f"URL: {url}")
+    #         st.write("")

data/films_fin-bk.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d8188bcba6a37caf0bc12037fa47fcae4b2b00dbf7cd4c17a393f1d1234da2f3
+size 40296924

data/films_fin.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e64a358ad21171dbbdc0771a0bbd832923acfe2cdbb43393e85be7b0a9fd7e1e
-size 39640033

 version https://git-lfs.github.com/spec/v1
+oid sha256:71ffa296fc49e95a075d3458ebdec1388c82118ed11a2a74373539afb82ec612
+size 40248713

data/worm.gif ADDED Viewed

Git LFS Details

SHA256: 5a64a542849aa4056c8d8b62e02c11d083c2525b5c871ad40c8875b30c310fd9
Pointer size: 132 Bytes
Size of remote file: 6.87 MB

notebooks/preproc.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff