# Hugging Face Space (page status when captured: Sleeping)
import random | |
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
import torch | |
from transformers import AutoTokenizer, AutoModel | |
import numpy as np | |
from sklearn.metrics.pairwise import cosine_similarity | |
from sklearn.metrics.pairwise import pairwise_distances | |
import faiss | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
import pickle | |
# Movie catalogue; one row per movie, read once per script run.
movies = pd.read_csv('data/data.csv')
# Labeling mode is hard-coded off; it was originally driven by a sidebar checkbox.
toggle_state = False
# Free-text query shown in the main page area, pre-filled with a sample request.
input_search = st.text_input('Search', value='собака очень преданно ждала хозяина на вокзале')
# Sidebar weights for each similarity signal blended in predict_rating().
tfidf_slider = st.sidebar.slider("tf_idf_description", 0.0, 1.0, 0.9)
tf_idf_name = st.sidebar.slider("tf_idf_name", 0.0, 1.0, 0.66/100)
tf_idf_actors = st.sidebar.slider("tf_idf_actors", 0.0, 1.0, 0.9)
bert_weight = st.sidebar.slider("bert_weight", 0.0, 1.0, 0.5)
# Number of movie cards to display.
show_num = st.sidebar.slider("show_num", 1, 100, 10)
# Precomputed BERT embeddings; presumably one row per row of `movies` -- TODO confirm.
data = np.load('data/embeddings_bert.npy')
def top_indices(array, n, upsc=False):
    """Return the indices of the `n` best entries of `array`.

    With `upsc=False` (default) the largest values come first; with
    `upsc=True` the smallest values come first. As a side effect the raw
    scores are stored in ``st.session_state["pred"]``.
    """
    st.session_state["pred"] = array
    order = np.argsort(array)
    if not upsc:
        # argsort is ascending; flip it to rank from highest to lowest score.
        order = order[::-1]
    return order[:n]
@st.cache_resource
def get_embeddings():
    """Load the rubert-tiny2 encoder and its tokenizer.

    The result is cached with ``st.cache_resource`` (Streamlit >= 1.18), so
    the weights are read once per server process instead of on every call;
    previously each query re-instantiated both objects. The model is left
    on its default device (it is not moved to the GPU).

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
    model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
    return model, tokenizer
def embed_bert_cls(text):
    """Embed `text` as the L2-normalized [CLS] vector of the BERT encoder.

    Returns a 1-D NumPy array for the first (only) sequence in the batch.
    """
    model, tokenizer = get_embeddings()
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    # Move every tokenizer output onto the same device as the model.
    model_inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    with torch.no_grad():
        output = model(**model_inputs)
    # Position 0 of the last hidden state is the [CLS] token.
    cls_vectors = output.last_hidden_state[:, 0, :]
    cls_vectors = torch.nn.functional.normalize(cls_vectors)
    return cls_vectors[0].cpu().numpy()
@st.cache_resource
def getmodels():
    """Load the pickled models and vectorize the movie catalogue.

    The result is cached with ``st.cache_resource`` (Streamlit >= 1.18):
    previously every query re-read three pickles and re-ran three TF-IDF
    transforms over the whole catalogue.

    Returns:
        tuple: ``(logreg, vectorizer, vectorizer_actors, tfidf_matrix,
        tfidf_matrix2, tfidf_actors)`` where the three matrices are the
        TF-IDF transforms of the 'description', 'name' and 'actors'
        columns of the module-level ``movies`` frame.
    """
    def _unpickle(path):
        # NOTE: pickle executes arbitrary code on load; only ever point this
        # at artifacts shipped with the app, never at user-supplied files.
        with open(path, 'rb') as f:
            return pickle.load(f)

    logreg = _unpickle('data/logreg.pkl')
    vectorizer = _unpickle('data/tf_idf_vectorizer.pkl')
    vectorizer_actors = _unpickle('data/vectorizer_actors.pkl')

    # Descriptions and names share one vectorizer; actors have their own.
    tfidf_matrix = vectorizer.transform(movies['description'])
    tfidf_matrix2 = vectorizer.transform(movies['name'])
    tfidf_actors = vectorizer_actors.transform(movies['actors'].fillna(''))
    return logreg, vectorizer, vectorizer_actors, tfidf_matrix, tfidf_matrix2, tfidf_actors
def predict_rating(input_search, tfidf_slider, tf_idf_name, tf_idf_actors, bert_weight):
    """Rank the catalogue against a free-text query.

    The score of each movie is a weighted sum of four cosine similarities:
    TF-IDF over descriptions, TF-IDF over names, TF-IDF over actors and
    BERT [CLS] embeddings.

    Args:
        input_search: The user's query text.
        tfidf_slider: Weight of the description TF-IDF similarity.
        tf_idf_name: Weight of the name TF-IDF similarity.
        tf_idf_actors: Weight of the actors TF-IDF similarity.
        bert_weight: Weight of the BERT embedding similarity.

    Returns:
        Indices of the top ``show_num`` movies, best first. The full score
        vector is stored in ``st.session_state["pred"]``.

    Reads the module-level ``data``, ``show_num`` and ``toggle_state``.
    """
    (_logreg, vectorizer, vectorizer_actors,
     desc_matrix, name_matrix, actors_matrix) = getmodels()
    emb = embed_bert_cls(input_search)

    if toggle_state:
        # Labeling mode reads st.session_state["X"][index] to save training
        # rows, but nothing ever stored it. Each row is
        # [movie embedding | query embedding].
        st.session_state["X"] = np.column_stack(
            (data, np.tile(emb, (data.shape[0], 1)))
        )
    # The former logreg.predict(X) call was dropped: its result was never
    # used in the score, so it only cost a full pass over the catalogue.

    query_tfidf = vectorizer.transform([input_search])
    query_actors = vectorizer_actors.transform([input_search])

    # Flatten every similarity to 1-D so the weighted sum is shape-safe.
    sim_desc = cosine_similarity(query_tfidf, desc_matrix).reshape(-1)
    sim_name = cosine_similarity(query_tfidf, name_matrix).reshape(-1)
    sim_actors = cosine_similarity(query_actors, actors_matrix).reshape(-1)
    sim_bert = cosine_similarity(data, emb.reshape(1, -1)).reshape(-1)

    y = (sim_desc * tfidf_slider
         + sim_name * tf_idf_name
         + sim_bert * bert_weight
         + sim_actors * tf_idf_actors)
    st.session_state["pred"] = y
    return top_indices(y, show_num, upsc=False)
def saverank(index, new_X, new_y):
    """Append one labeled example to the on-disk training set.

    The feature vector is appended as a new row of ``X.npy`` and the label
    as a new element of ``y.npy`` (both relative to the working directory).
    A file that does not exist yet is treated as an empty dataset, so the
    very first save no longer fails with ``FileNotFoundError``.

    Args:
        index: Movie index the example refers to (currently unused).
        new_X: 1-D feature vector for the example.
        new_y: Label (rating) for the example.
    """
    row = np.asarray(new_X).reshape(1, -1)
    label = np.array([new_y])

    try:
        dx = np.load('X.npy')
    except FileNotFoundError:
        dx = row[:0]  # zero rows, same width and dtype as the new row
    try:
        dy = np.load('y.npy')
    except FileNotFoundError:
        dy = label[:0]

    np.save('X.npy', np.concatenate((dx, row)))
    np.save('y.npy', np.concatenate((dy, label)))
def ask_rating(movie, index):
    """Show five rating buttons (1-5) in a row and persist a clicked rating.

    Button keys are the score digit followed by `index`, so several cards
    can coexist on one page. When a button is pressed, the feature row
    ``st.session_state["X"][index]`` is saved together with the rating.
    """
    pressed = []
    # One column per score so the buttons sit side by side.
    for score, column in enumerate(st.columns(5), start=1):
        with column:
            pressed.append(st.button(str(score), key=str(score) + str(index)))

    rating = 0
    for score, was_pressed in enumerate(pressed, start=1):
        if was_pressed:
            rating = score

    if rating > 0:
        saverank(index, st.session_state["X"][index], rating)
def display_rating(rating):
    """Render a 0-10 rating as moon emojis on a 0-5 scale.

    One full moon is emitted per whole two points of rating; a half moon is
    appended when the leftover (``rating % 2``) is at least 0.5.
    """
    full_moons = int(rating / 2)  # truncates: 0-10 scale -> whole stars
    leftover = rating % 2  # part of the rating not covered by whole stars
    half_moon = '🌗' if leftover >= 0.5 else ''
    return '🌕' * full_moons + half_moon
def display_movie_card(df, index):
    """Render one movie as a card: poster and metadata left, details right.

    Args:
        df: Movie catalogue frame.
        index: Positional row index of the movie in `df`; also used to make
            widget keys unique per card.
    """
    movie = df.iloc[index]
    poster_col, info_col = st.columns([1, 3])

    with poster_col:
        st.image(movie['poster'], use_column_width=True)
        st.write(f"Жанр: {movie['genres']}")
        st.write(f"Страна: {movie['country']}")
        st.write(f"рейтинг: {movie['age']}")
        # Show the relevance score when a ranking has been computed.
        if "pred" in st.session_state:
            st.write(st.session_state["pred"][index])

    with info_col:
        if np.isnan(movie['year']):
            year = ""
        else:
            year = str(int(movie['year']))
        st.markdown(f"<h2 style='text-align: left;'>{movie['name']} ({year})</h2>", unsafe_allow_html=True)

        full_text = movie['description']
        if len(full_text) > 200:
            # Cut at 200 characters, then drop the last (possibly partial) word.
            kept_words = full_text[:200].split(" ")[:-1]
            description = ' '.join(kept_words) + '...'
        else:
            description = full_text

        # Placeholder created before the button so the text renders above it.
        text_slot = st.empty()
        show_full = toggle_state
        if not toggle_state and full_text != description:
            show_full = st.button("раскрыть описание", key=index)
        with text_slot:
            if show_full:
                st.write(full_text)
            else:
                st.write(description)

        if toggle_state:
            ask_rating(movie, index)
            extra_query = st.text_input(' ', key="search" + str(index))
            if extra_query:
                # Pair the first 312 features of this movie's row with the
                # typed query's embedding and store it as a top-rated example.
                query_emb = embed_bert_cls(extra_query)
                movie_part = st.session_state["X"][index, :312]
                saverank(index, np.concatenate((movie_part, query_emb)), 5)

        st.write(f"Актеры: {movie['actors']}")
        imdb_col, kp_col = st.columns([1, 2])
        with imdb_col:
            if np.isnan(movie['imdb']):
                st.write("")
            else:
                st.write(f"IMDB: {display_rating(movie['imdb'])}")
        with kp_col:
            if np.isnan(movie['kp']):
                st.write("")
            else:
                st.write(f"Кинопоиск: {display_rating(movie['kp'])}")
        st.write(f"[смотреть]({movie['link']})")

    st.write("----------------------")
# Per-session cache of random index lists, keyed by request text.
if "reqs" in st.session_state:
    reqs = st.session_state["reqs"]
else:
    reqs = {}
def getnums(df, size=0, text=''):
    """Return a list of random row indices into `df`, remembered per `text`.

    The first call for a given `text` draws `size` random integers in
    ``[0, len(df))`` and stores them in the module-level ``reqs`` cache
    (mirrored to ``st.session_state["reqs"]``); later calls with the same
    `text` return the stored list regardless of `size`.
    """
    if text not in reqs:
        reqs[text] = list(np.random.randint(len(df), size=size))
        st.session_state["reqs"] = reqs
    return reqs[text]
# Page body: rank the catalogue for a non-empty query and render the top hits.
if input_search:
    ranked_indices = predict_rating(
        input_search, tfidf_slider, tf_idf_name, tf_idf_actors, bert_weight
    )
    for movie_index in ranked_indices:
        display_movie_card(movies, movie_index)
def ask_rating(movie):
    """Show five rating buttons (1-5) in a row and return the clicked score.

    Returns 0 when no button was pressed on this run. The buttons carry no
    explicit keys.
    """
    rating = 0
    # One column per score so the buttons sit side by side.
    for score, column in enumerate(st.columns(5), start=1):
        with column:
            if st.button(str(score)):
                rating = score
    return rating