romnatall committed on
Commit
fe4db0a
·
1 Parent(s): 4eb227a

ребаланс весов

Browse files
Files changed (3) hide show
  1. app.py +21 -7
  2. data/books_model (2).ipynb +43 -2
  3. data/vectorizer_actors.pkl +3 -0
app.py CHANGED
@@ -19,6 +19,8 @@ input_search = st.text_input('Search')
19
 
20
 
21
 
 
 
22
  data = np.load('data/embeddings_bert.npy')
23
 
24
  def top_indices(array, n,upsc=False):
@@ -44,6 +46,8 @@ def embed_bert_cls(text, ):
44
  model_output = model(**{k: v.to(model.device) for k, v in t.items()})
45
  embeddings = model_output.last_hidden_state[:, 0, :]
46
  embeddings = torch.nn.functional.normalize(embeddings)
 
 
47
  return embeddings[0].cpu().numpy()
48
 
49
  @st.cache_resource
@@ -53,21 +57,31 @@ def getmodels():
53
  logreg = pickle.load(f)
54
  with open('data/tf_idf_vectorizer.pkl', 'rb') as f:
55
  vectorizer = pickle.load(f)
56
- return logreg, vectorizer
 
 
 
 
 
 
 
 
 
57
 
58
  @st.cache_data
59
  def predict_rating(input_search):
60
 
61
 
62
- logreg, vectorizer=getmodels()
63
 
64
  emb = embed_bert_cls(input_search)
65
  X=np.column_stack((data, np.tile(emb, (data.shape[0], 1))))
66
 
67
  user_tfidf = vectorizer.transform([input_search])
68
- tfidf_matrix = vectorizer.transform(movies['description'])
69
- tfidf_matrix2 = vectorizer.transform(movies['name'])
70
-
 
71
  similarity_scores_desc = cosine_similarity(user_tfidf, tfidf_matrix)
72
  similarity_scores_name = cosine_similarity(user_tfidf, tfidf_matrix2)
73
 
@@ -75,10 +89,10 @@ def predict_rating(input_search):
75
  y_emb = cosine_similarity(data, emb.reshape(1, -1)).reshape(-1)
76
 
77
 
78
- y=(similarity_scores_desc*0.9+similarity_scores_name*0.035+y_emb*.4+y_log*0.4).reshape(-1)
79
  st.session_state["pred"]=y
80
 
81
- return top_indices(y, 10,upsc=False)
82
 
83
 
84
 
 
19
 
20
 
21
 
22
+
23
+
24
  data = np.load('data/embeddings_bert.npy')
25
 
26
  def top_indices(array, n,upsc=False):
 
46
  model_output = model(**{k: v.to(model.device) for k, v in t.items()})
47
  embeddings = model_output.last_hidden_state[:, 0, :]
48
  embeddings = torch.nn.functional.normalize(embeddings)
49
+
50
+
51
  return embeddings[0].cpu().numpy()
52
 
53
  @st.cache_resource
 
57
  logreg = pickle.load(f)
58
  with open('data/tf_idf_vectorizer.pkl', 'rb') as f:
59
  vectorizer = pickle.load(f)
60
+
61
+ with open('data/vectorizer_actors.pkl', 'rb') as f:
62
+ vectorizer_actors = pickle.load(f)
63
+
64
+ tfidf_matrix = vectorizer.transform(movies['description'])
65
+ tfidf_matrix2 = vectorizer.transform(movies['name'])
66
+ tfidf_actors = vectorizer_actors.transform(movies['actors'].fillna(''))
67
+
68
+
69
+ return logreg, vectorizer,vectorizer_actors ,tfidf_matrix,tfidf_matrix2,tfidf_actors
70
 
71
  @st.cache_data
72
  def predict_rating(input_search):
73
 
74
 
75
+ logreg, vectorizer,vectorizer_actors,tfidf_matrix,tfidf_matrix2,tfidf_actors=getmodels()
76
 
77
  emb = embed_bert_cls(input_search)
78
  X=np.column_stack((data, np.tile(emb, (data.shape[0], 1))))
79
 
80
  user_tfidf = vectorizer.transform([input_search])
81
+ user_actors = vectorizer_actors.transform([input_search])
82
+
83
+ similarity_actors=cosine_similarity(user_actors, tfidf_actors).reshape(-1)
84
+
85
  similarity_scores_desc = cosine_similarity(user_tfidf, tfidf_matrix)
86
  similarity_scores_name = cosine_similarity(user_tfidf, tfidf_matrix2)
87
 
 
89
  y_emb = cosine_similarity(data, emb.reshape(1, -1)).reshape(-1)
90
 
91
 
92
+ y=(similarity_scores_desc*0.9+similarity_scores_name*0.0083+y_emb*0.9+similarity_actors*0.5).reshape(-1)
93
  st.session_state["pred"]=y
94
 
95
+ return top_indices(y, 20,upsc=False)
96
 
97
 
98
 
data/books_model (2).ipynb CHANGED
@@ -355,6 +355,47 @@
355
  "data.head()"
356
  ]
357
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
  {
359
  "cell_type": "code",
360
  "execution_count": 3,
@@ -418,12 +459,12 @@
418
  },
419
  {
420
  "cell_type": "code",
421
- "execution_count": 56,
422
  "metadata": {},
423
  "outputs": [],
424
  "source": [
425
  "import pickle\n",
426
- "with open('vectorizer.pkl', 'wb') as f:\n",
427
  " pickle.dump(vectorizer, f)"
428
  ]
429
  },
 
355
  "data.head()"
356
  ]
357
  },
358
+ {
359
+ "cell_type": "code",
360
+ "execution_count": 6,
361
+ "metadata": {},
362
+ "outputs": [
363
+ {
364
+ "data": {
365
+ "text/plain": [
366
+ "0 В американской хоррор-комедии показана детект...\n",
367
+ "1 Перестройка уже шагнула с кремлевских трибун ...\n",
368
+ "2 В Городе Стихий обитатели огня, воды, земли и...\n",
369
+ "3 Свои незабываемые каникулы, в которых есть ме...\n",
370
+ "4 Увлекательные приключения скандинавского бога...\n",
371
+ " ... \n",
372
+ "28443 Петер фон Кант - успешный режиссёр. Его прият...\n",
373
+ "28444 Объединившись с бывшим полицейским Зоуи Кэсс,...\n",
374
+ "28445 13 перетекающих из одного в другой эпизодов, ...\n",
375
+ "28446 Стремясь спасти свою сестру Софию, попавшую п...\n",
376
+ "28447 Три друга, Арав, Бхушан и Сунил, оказываются ...\n",
377
+ "Name: description, Length: 28448, dtype: object"
378
+ ]
379
+ },
380
+ "execution_count": 6,
381
+ "metadata": {},
382
+ "output_type": "execute_result"
383
+ }
384
+ ],
385
+ "source": [
386
+ "data['description']"
387
+ ]
388
+ },
389
+ {
390
+ "cell_type": "code",
391
+ "execution_count": 8,
392
+ "metadata": {},
393
+ "outputs": [],
394
+ "source": [
395
+ "vectorizer = TfidfVectorizer()\n",
396
+ "a=vectorizer.fit_transform(data['actors'].fillna(''))\n"
397
+ ]
398
+ },
399
  {
400
  "cell_type": "code",
401
  "execution_count": 3,
 
459
  },
460
  {
461
  "cell_type": "code",
462
+ "execution_count": 9,
463
  "metadata": {},
464
  "outputs": [],
465
  "source": [
466
  "import pickle\n",
467
+ "with open('vectorizer_actors.pkl', 'wb') as f:\n",
468
  " pickle.dump(vectorizer, f)"
469
  ]
470
  },
data/vectorizer_actors.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a171b209102d3999b7c6a5f91f26f02d7506c870ad740cb3c87b4a03593c4f68
3
+ size 2967595