Spaces:
Running
Running
kdramas random
Browse files
app.py
CHANGED
@@ -10,6 +10,7 @@ import os
|
|
10 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
11 |
import torch
|
12 |
import unicodedata
|
|
|
13 |
|
14 |
|
15 |
# ==================================================
|
@@ -45,11 +46,11 @@ kdramas1 = filter_kdramas(netflix_movies_shows)
|
|
45 |
kdramas2 = filter_kdramas(mydramalist)
|
46 |
|
47 |
# Eliminar columnas innecesarias (incluyendo 'rating')
|
48 |
-
columns_to_remove = ["Unnamed: 0", "no_of_reviews", "aka_names", "screenwriter", "director",
|
49 |
-
"no_of_viewers", "end_date", "start_date", "year", "duration", "no_of_rating",
|
50 |
-
"rank", "popularity", "content_rating", "where_to_watch", "main_role",
|
51 |
-
"support_role", "no_of_extracted_reviews", "Total_sentences",
|
52 |
-
"POSITIVE_people_sentiment", "POSITIVE_sentences", "NEGATIVE_people_sentiment",
|
53 |
"NEGATIVE_sentences", "rating"] # Eliminar 'rating'
|
54 |
|
55 |
for dataset in [kdramas1, kdramas2]:
|
@@ -77,11 +78,11 @@ kdramas2 = add_genres_column(kdramas2, "Unknown")
|
|
77 |
def align_datasets(dataset1, dataset2):
|
78 |
# Obtener las columnas comunes
|
79 |
common_columns = set(dataset1.features.keys()).intersection(set(dataset2.features.keys()))
|
80 |
-
|
81 |
# Mantener solo las columnas comunes
|
82 |
dataset1 = dataset1.select_columns(list(common_columns))
|
83 |
dataset2 = dataset2.select_columns(list(common_columns))
|
84 |
-
|
85 |
return dataset1, dataset2
|
86 |
|
87 |
# Alinear los datasets
|
@@ -103,10 +104,20 @@ kdramas = Dataset.from_pandas(kdramas_df)
|
|
103 |
# ==================================================
|
104 |
|
105 |
# Cargar el modelo de embeddings
|
106 |
-
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
|
107 |
-
|
108 |
-
#
|
109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
|
111 |
# Crear un índice FAISS para búsqueda eficiente
|
112 |
dimension = embeddings_np.shape[1]
|
@@ -118,10 +129,15 @@ def recommend_kdramas(title, k=5):
|
|
118 |
title_indices = [i for i, t in enumerate(kdramas['title']) if title.lower() in t.lower()]
|
119 |
if not title_indices:
|
120 |
return f"No se encontraron títulos similares a '{title}'."
|
|
|
121 |
query_embedding = embeddings_np[title_indices[0]].reshape(1, -1)
|
122 |
-
distances, similar_indices = index.search(query_embedding, k +
|
|
|
|
|
|
|
|
|
123 |
recommendations = []
|
124 |
-
for i in
|
125 |
recommended_title = kdramas["title"][i]
|
126 |
recommended_type = kdramas["type"][i] if "type" in kdramas.features else "Unknown"
|
127 |
recommended_genres = kdramas["genres"][i] if "genres" in kdramas.features else "Unknown"
|
@@ -152,7 +168,6 @@ mapeo_generos = {
|
|
152 |
"fantasia": "fantasy",
|
153 |
"thriller": "thriller",
|
154 |
"romance": "romance",
|
155 |
-
"drama": "drama",
|
156 |
# Palabras clave adicionales (sin tildes)
|
157 |
"aventura": "adventure",
|
158 |
"historico": "historical",
|
@@ -182,24 +197,30 @@ def buscar_por_genero(genero, k=5):
|
|
182 |
genre_embedding = model.encode(genero, convert_to_tensor=True).cpu().numpy()
|
183 |
genre_embedding = genre_embedding.reshape(1, -1)
|
184 |
|
185 |
-
#
|
186 |
-
D, I = index.search(genre_embedding, k) #
|
187 |
|
188 |
-
#
|
189 |
-
|
190 |
|
191 |
# Formatear las recomendaciones
|
192 |
recommendations = []
|
193 |
-
for
|
194 |
-
|
195 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
196 |
return "\n".join(recommendations) if recommendations else f"No se encontraron K-Dramas del género '{genero}'."
|
197 |
|
198 |
# Función para recomendar K-Dramas basado en preferencias
|
199 |
def recomendar_kdramas_chat(entrada_usuario):
|
200 |
# Traducir preferencia del usuario
|
201 |
preferencia_traducida = traducir_preferencia(entrada_usuario)
|
202 |
-
|
203 |
if preferencia_traducida:
|
204 |
return buscar_por_genero(preferencia_traducida, k=5)
|
205 |
else:
|
@@ -208,7 +229,7 @@ def recomendar_kdramas_chat(entrada_usuario):
|
|
208 |
# Función para generar respuestas del chatbot
|
209 |
def generar_respuesta(entrada_usuario, historial_chat=""):
|
210 |
inputs = tokenizer.encode(entrada_usuario + historial_chat, return_tensors="pt")
|
211 |
-
respuesta_ids = model_chat.generate(inputs, max_length=1000, pad_token_id=tokenizer.eos_token_id)
|
212 |
respuesta = tokenizer.decode(respuesta_ids[:, inputs.shape[-1]:][0], skip_special_tokens=True)
|
213 |
return respuesta
|
214 |
|
@@ -240,6 +261,15 @@ def chat(entrada_usuario, historial_chat=""):
|
|
240 |
# Interfaz de Gradio
|
241 |
# ==================================================
|
242 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
243 |
# Interfaz para el recomendador tradicional
|
244 |
interfaz_recomendador = gr.Interface(
|
245 |
theme=gr.themes.Citrus(),
|
@@ -254,16 +284,6 @@ interfaz_recomendador = gr.Interface(
|
|
254 |
allow_flagging="never",
|
255 |
)
|
256 |
|
257 |
-
# Interfaz para el chatbot
|
258 |
-
interfaz_chatbot = gr.Interface(
|
259 |
-
fn=chat,
|
260 |
-
inputs=[gr.Textbox(label="Escribe tu mensaje"), gr.Textbox(label="Historial", visible=False)],
|
261 |
-
outputs=[gr.Textbox(label="Respuesta del chatbot"), gr.Textbox(label="Mensaje", visible=False)],
|
262 |
-
title="Chatbot Recomendador de K-Dramas",
|
263 |
-
description="Habla con el chatbot para obtener recomendaciones personalizadas de K-Dramas.",
|
264 |
-
allow_flagging="never"
|
265 |
-
)
|
266 |
-
|
267 |
# Lanzar ambas interfaces
|
268 |
gr.TabbedInterface(
|
269 |
[interfaz_recomendador, interfaz_chatbot], # Chatbot en la segunda pestaña
|
|
|
10 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
11 |
import torch
|
12 |
import unicodedata
|
13 |
+
import random
|
14 |
|
15 |
|
16 |
# ==================================================
|
|
|
46 |
kdramas2 = filter_kdramas(mydramalist)
|
47 |
|
48 |
# Eliminar columnas innecesarias (incluyendo 'rating')
|
49 |
+
columns_to_remove = ["Unnamed: 0", "no_of_reviews", "aka_names", "screenwriter", "director",
|
50 |
+
"no_of_viewers", "end_date", "start_date", "year", "duration", "no_of_rating",
|
51 |
+
"rank", "popularity", "content_rating", "where_to_watch", "main_role",
|
52 |
+
"support_role", "no_of_extracted_reviews", "Total_sentences",
|
53 |
+
"POSITIVE_people_sentiment", "POSITIVE_sentences", "NEGATIVE_people_sentiment",
|
54 |
"NEGATIVE_sentences", "rating"] # Eliminar 'rating'
|
55 |
|
56 |
for dataset in [kdramas1, kdramas2]:
|
|
|
78 |
def align_datasets(dataset1, dataset2):
    """Restrict two datasets to the columns they have in common.

    The shared columns are kept in the order in which they appear in
    ``dataset1``. Building the list from a ``set`` would make the column
    order depend on string hashing, which changes between interpreter
    runs; deriving it from ``dataset1`` keeps the schema reproducible.

    Args:
        dataset1: Object exposing ``features`` (a mapping keyed by column
            name) and ``select_columns(names)``. Presumably a Hugging Face
            ``datasets.Dataset`` -- confirm against the caller.
        dataset2: Object with the same interface as ``dataset1``.

    Returns:
        A ``(dataset1, dataset2)`` tuple in which both datasets contain
        only the common columns, requested in the same order.
    """
    columns_in_second = set(dataset2.features.keys())
    # Ordered intersection: walk dataset1's columns, keep the shared ones.
    common_columns = [
        name for name in dataset1.features.keys() if name in columns_in_second
    ]

    dataset1 = dataset1.select_columns(common_columns)
    dataset2 = dataset2.select_columns(common_columns)
    return dataset1, dataset2
|
87 |
|
88 |
# Alinear los datasets
|
|
|
104 |
# ==================================================
|
105 |
|
106 |
# Cargar el modelo de embeddings
|
107 |
+
model = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')

# Compute the description embeddings, or reuse the copy cached on disk.
embeddings_path = "kdrama_embeddings.npy"

embeddings_np = None
if os.path.exists(embeddings_path):
    cached_embeddings = np.load(embeddings_path)
    # Row i of the matrix is used as the embedding of row i of `kdramas`
    # (see the index lookups further down), so a cache written for a
    # different number of rows would silently return the wrong titles or
    # point past the end of the dataset. Only accept a matching cache.
    if cached_embeddings.ndim == 2 and cached_embeddings.shape[0] == len(kdramas):
        embeddings_np = cached_embeddings

if embeddings_np is None:
    # No usable cache: encode every description and store the result so
    # the next start-up can skip this step.
    descriptions = kdramas["description"]
    embeddings = model.encode(descriptions, convert_to_tensor=True)
    embeddings_np = embeddings.cpu().numpy()
    np.save(embeddings_path, embeddings_np)

print("¡Embeddings listos! Cada descripción ahora es un vector numérico.")
|
121 |
|
122 |
# Crear un índice FAISS para búsqueda eficiente
|
123 |
dimension = embeddings_np.shape[1]
|
|
|
129 |
title_indices = [i for i, t in enumerate(kdramas['title']) if title.lower() in t.lower()]
|
130 |
if not title_indices:
|
131 |
return f"No se encontraron títulos similares a '{title}'."
|
132 |
+
|
133 |
query_embedding = embeddings_np[title_indices[0]].reshape(1, -1)
|
134 |
+
distances, similar_indices = index.search(query_embedding, k + 10) # Ampliar el rango de búsqueda
|
135 |
+
|
136 |
+
# Randomly pick k of the k + 9 nearest neighbours: the search asks for k + 10 hits and the first one (presumably the queried title itself) is skipped
|
137 |
+
selected_indices = random.sample(list(similar_indices[0][1:]), k)
|
138 |
+
|
139 |
recommendations = []
|
140 |
+
for i in selected_indices:
|
141 |
recommended_title = kdramas["title"][i]
|
142 |
recommended_type = kdramas["type"][i] if "type" in kdramas.features else "Unknown"
|
143 |
recommended_genres = kdramas["genres"][i] if "genres" in kdramas.features else "Unknown"
|
|
|
168 |
"fantasia": "fantasy",
|
169 |
"thriller": "thriller",
|
170 |
"romance": "romance",
|
|
|
171 |
# Palabras clave adicionales (sin tildes)
|
172 |
"aventura": "adventure",
|
173 |
"historico": "historical",
|
|
|
197 |
genre_embedding = model.encode(genero, convert_to_tensor=True).cpu().numpy()
|
198 |
genre_embedding = genre_embedding.reshape(1, -1)
|
199 |
|
200 |
+
# Buscar en el índice FAISS
|
201 |
+
D, I = index.search(genre_embedding, k * 2) # Ampliar el rango de búsqueda
|
202 |
|
203 |
+
# Seleccionar aleatoriamente k índices de los resultados
|
204 |
+
selected_indices = random.sample(list(I[0]), k)
|
205 |
|
206 |
# Formatear las recomendaciones
|
207 |
recommendations = []
|
208 |
+
for i in selected_indices:
|
209 |
+
recommended_title = kdramas["title"][i]
|
210 |
+
recommended_type = kdramas["type"][i] if "type" in kdramas.features else "Unknown"
|
211 |
+
recommended_genres = kdramas["genres"][i] if "genres" in kdramas.features else "Unknown"
|
212 |
+
recommendations.append(
|
213 |
+
f"{recommended_title}\n"
|
214 |
+
f"- **Tipo**: {recommended_type}\n"
|
215 |
+
f"- **Géneros**: {recommended_genres}\n"
|
216 |
+
)
|
217 |
return "\n".join(recommendations) if recommendations else f"No se encontraron K-Dramas del género '{genero}'."
|
218 |
|
219 |
# Función para recomendar K-Dramas basado en preferencias
|
220 |
def recomendar_kdramas_chat(entrada_usuario):
|
221 |
# Traducir preferencia del usuario
|
222 |
preferencia_traducida = traducir_preferencia(entrada_usuario)
|
223 |
+
|
224 |
if preferencia_traducida:
|
225 |
return buscar_por_genero(preferencia_traducida, k=5)
|
226 |
else:
|
|
|
229 |
# Función para generar respuestas del chatbot
|
230 |
def generar_respuesta(entrada_usuario, historial_chat=""):
    """Generate the chatbot's reply to a user message.

    The prompt is ``entrada_usuario`` immediately followed by
    ``historial_chat`` (no separator is inserted). It is tokenised with the
    module-level ``tokenizer`` and passed to ``model_chat.generate``; only
    the tokens produced after the prompt are decoded.

    Args:
        entrada_usuario: Text of the user's current message.
        historial_chat: Text appended after the message before tokenising.
            NOTE(review): history is placed *after* the new message --
            confirm this ordering is what the caller intends.

    Returns:
        The decoded reply, with special tokens removed.
    """
    prompt_text = entrada_usuario + historial_chat
    prompt_ids = tokenizer.encode(prompt_text, return_tensors="pt")
    prompt_length = prompt_ids.shape[-1]

    # NOTE(review): per the transformers generation docs, max_length counts
    # the prompt tokens as well as the generated ones -- confirm that a
    # 1000-token total budget is intended.
    generated_ids = model_chat.generate(
        prompt_ids,
        max_length=1000,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Skip the leading prompt_length positions so only what follows the
    # prompt is decoded.
    reply_ids = generated_ids[:, prompt_length:][0]
    return tokenizer.decode(reply_ids, skip_special_tokens=True)
|
235 |
|
|
|
261 |
# Interfaz de Gradio
|
262 |
# ==================================================
|
263 |
|
264 |
+
# Interfaz para el chatbot
|
265 |
+
interfaz_chatbot = gr.Interface(
|
266 |
+
fn=chat,
|
267 |
+
inputs=[gr.Textbox(label="Escribe tu mensaje"), gr.Textbox(label="Historial", visible=False)],
|
268 |
+
outputs=[gr.Textbox(label="Respuesta del chatbot"), gr.Textbox(label="Mensaje", visible=False)],
|
269 |
+
title="Chatbot Recomendador de K-Dramas",
|
270 |
+
description="Habla con el chatbot para obtener recomendaciones personalizadas de K-Dramas.",
|
271 |
+
allow_flagging="never"
|
272 |
+
)
|
273 |
# Interfaz para el recomendador tradicional
|
274 |
interfaz_recomendador = gr.Interface(
|
275 |
theme=gr.themes.Citrus(),
|
|
|
284 |
allow_flagging="never",
|
285 |
)
|
286 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
287 |
# Lanzar ambas interfaces
|
288 |
gr.TabbedInterface(
|
289 |
[interfaz_recomendador, interfaz_chatbot], # Chatbot en la segunda pestaña
|