romnatall commited on
Commit
e633090
·
1 Parent(s): f82d1d4

разделил данные по папкам

Browse files
app.py CHANGED
@@ -8,14 +8,14 @@ from transformers import AutoTokenizer, AutoModel
8
  import numpy as np
9
  from sklearn.metrics.pairwise import cosine_similarity
10
 
11
- movies = pd.read_csv('data.csv')
12
 
13
  toggle_state = st.sidebar.checkbox("режим разметки")
14
  input_search = st.text_input('Search')
15
 
16
 
17
 
18
- data = np.load('embeddings.npy')
19
 
20
 
21
 
 
8
  import numpy as np
9
  from sklearn.metrics.pairwise import cosine_similarity
10
 
11
+ movies = pd.read_csv('data/data.csv')
12
 
13
  toggle_state = st.sidebar.checkbox("режим разметки")
14
  input_search = st.text_input('Search')
15
 
16
 
17
 
18
+ data = np.load('data/embeddings.npy')
19
 
20
 
21
 
catboost.ipynb ADDED
@@ -0,0 +1,309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "Результат:\n",
13
+ "[[1 2 3 7 8]\n",
14
+ " [4 5 6 7 8]]\n"
15
+ ]
16
+ }
17
+ ],
18
+ "source": [
19
+ "import numpy as np\n",
20
+ "\n",
21
+ "# Создаем матрицу (2D массив)\n",
22
+ "matrix = np.array([[1, 2, 3],\n",
23
+ " [4, 5, 6]])\n",
24
+ "\n",
25
+ "# Создаем вектор (1D массив)\n",
26
+ "vector = np.array([7, 8])\n",
27
+ "\n",
28
+ "# Сконкатенируем каждый вектор матрицы с вектором\n",
29
+ "result = np.column_stack((matrix, np.tile(vector, (matrix.shape[0], 1))))\n",
30
+ "\n",
31
+ "print(\"Результат:\")\n",
32
+ "print(result)\n"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": 63,
38
+ "metadata": {},
39
+ "outputs": [
40
+ {
41
+ "data": {
42
+ "text/plain": [
43
+ "137"
44
+ ]
45
+ },
46
+ "execution_count": 63,
47
+ "metadata": {},
48
+ "output_type": "execute_result"
49
+ }
50
+ ],
51
+ "source": []
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": 93,
56
+ "metadata": {},
57
+ "outputs": [],
58
+ "source": [
59
+ "X = np.load('X.npy')\n",
60
+ "Y = np.load('y.npy')\n",
61
+ "X=X[-2:]\n",
62
+ "Y=Y[-2:]\n",
63
+ "np.save('X',X)\n",
64
+ "np.save('y',Y)"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "execution_count": 104,
70
+ "metadata": {},
71
+ "outputs": [
72
+ {
73
+ "data": {
74
+ "text/plain": [
75
+ "(29263, 624)"
76
+ ]
77
+ },
78
+ "execution_count": 104,
79
+ "metadata": {},
80
+ "output_type": "execute_result"
81
+ }
82
+ ],
83
+ "source": [
84
+ "data.shape"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": 106,
90
+ "metadata": {},
91
+ "outputs": [
92
+ {
93
+ "data": {
94
+ "text/plain": [
95
+ "(29265, 624)"
96
+ ]
97
+ },
98
+ "execution_count": 106,
99
+ "metadata": {},
100
+ "output_type": "execute_result"
101
+ }
102
+ ],
103
+ "source": [
104
+ ".shape"
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": 109,
110
+ "metadata": {},
111
+ "outputs": [
112
+ {
113
+ "data": {
114
+ "text/plain": [
115
+ "(29265,)"
116
+ ]
117
+ },
118
+ "execution_count": 109,
119
+ "metadata": {},
120
+ "output_type": "execute_result"
121
+ }
122
+ ],
123
+ "source": [
124
+ "Y.shape"
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "code",
129
+ "execution_count": 4,
130
+ "metadata": {},
131
+ "outputs": [],
132
+ "source": [
133
+ "\n",
134
+ "from sklearn.linear_model import LinearRegression\n",
135
+ "\n",
136
+ "dat = np.load('embeddings.npy')\n",
137
+ "data =np.column_stack((dat, dat))\n",
138
+ "datay = np.ones((data.shape[0]))*5\n",
139
+ "\n",
140
+ "data1 = np.column_stack((dat[1:], dat[:-1]))\n",
141
+ "datay1 = np.ones((data1.shape[0]))\n",
142
+ "\n",
143
+ "\n",
144
+ "X = np.load('X.npy') \n",
145
+ "Y = np.load('y.npy')\n",
146
+ "\n",
147
+ "\n",
148
+ "\n",
149
+ "X=np.concatenate((data,X))\n",
150
+ "Y=np.concatenate((datay,Y))\n",
151
+ "X = np.concatenate((data1,X))\n",
152
+ "Y = np.concatenate((datay1,Y))"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "code",
157
+ "execution_count": 132,
158
+ "metadata": {},
159
+ "outputs": [
160
+ {
161
+ "data": {
162
+ "text/plain": [
163
+ "(29263, 624)"
164
+ ]
165
+ },
166
+ "execution_count": 132,
167
+ "metadata": {},
168
+ "output_type": "execute_result"
169
+ }
170
+ ],
171
+ "source": [
172
+ "data.shape"
173
+ ]
174
+ },
175
+ {
176
+ "cell_type": "code",
177
+ "execution_count": 5,
178
+ "metadata": {},
179
+ "outputs": [
180
+ {
181
+ "data": {
182
+ "text/plain": [
183
+ "4.5227014967230694e-05"
184
+ ]
185
+ },
186
+ "execution_count": 5,
187
+ "metadata": {},
188
+ "output_type": "execute_result"
189
+ }
190
+ ],
191
+ "source": [
192
+ "\n",
193
+ "logreg = LinearRegression()\n",
194
+ "logreg.fit(X, Y)\n",
195
+ "\n",
196
+ "import pickle\n",
197
+ "with open('logreg.pkl', 'wb') as f:\n",
198
+ " pickle.dump(logreg, f)\n",
199
+ "\n",
200
+ "logreg.score(X, Y)"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "code",
205
+ "execution_count": 6,
206
+ "metadata": {},
207
+ "outputs": [
208
+ {
209
+ "name": "stdout",
210
+ "output_type": "stream",
211
+ "text": [
212
+ "0:\ttest: 0.9786223\tbest: 0.9786223 (0)\ttotal: 51.1s\tremaining: 7m 39s\n",
213
+ "1:\ttest: 0.9950170\tbest: 0.9950170 (1)\ttotal: 1m 13s\tremaining: 4m 54s\n",
214
+ "2:\ttest: 0.9966407\tbest: 0.9966407 (2)\ttotal: 1m 35s\tremaining: 3m 42s\n",
215
+ "3:\ttest: 0.9982912\tbest: 0.9982912 (3)\ttotal: 1m 56s\tremaining: 2m 55s\n",
216
+ "4:\ttest: 0.9988039\tbest: 0.9988039 (4)\ttotal: 2m 18s\tremaining: 2m 18s\n",
217
+ "5:\ttest: 0.9992459\tbest: 0.9992459 (5)\ttotal: 2m 39s\tremaining: 1m 46s\n",
218
+ "6:\ttest: 0.9997030\tbest: 0.9997030 (6)\ttotal: 3m 1s\tremaining: 1m 17s\n",
219
+ "7:\ttest: 0.9998173\tbest: 0.9998173 (7)\ttotal: 3m 22s\tremaining: 50.7s\n",
220
+ "8:\ttest: 0.9998216\tbest: 0.9998216 (8)\ttotal: 3m 44s\tremaining: 24.9s\n",
221
+ "9:\ttest: 0.9998608\tbest: 0.9998608 (9)\ttotal: 4m 5s\tremaining: 0us\n",
222
+ "\n",
223
+ "bestTest = 0.9998607928\n",
224
+ "bestIteration = 9\n",
225
+ "\n"
226
+ ]
227
+ },
228
+ {
229
+ "ename": "",
230
+ "evalue": "",
231
+ "output_type": "error",
232
+ "traceback": [
233
+ "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
234
+ "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
235
+ "\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
236
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
237
+ ]
238
+ }
239
+ ],
240
+ "source": [
241
+ "from catboost import CatBoostRanker,Pool\n",
242
+ "from sklearn.model_selection import train_test_split\n",
243
+ "\n",
244
+ "X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)\n",
245
+ "classes_test = np.ones(len(Y_test)).astype(int)\n",
246
+ "test_pool = Pool(data=X_test, label=Y_test, group_id=classes_test)\n",
247
+ "\n",
248
+ "\n",
249
+ "classes_train = np.ones(len(Y)).astype(int)\n",
250
+ "train_pool = Pool(data=X, label=Y, group_id=classes_train,)\n",
251
+ "\n",
252
+ "\n",
253
+ "cb = CatBoostRanker(iterations=10,)\n",
254
+ "cb.fit(train_pool,eval_set=test_pool)\n",
255
+ "cb.save_model('model.cbm')\n"
256
+ ]
257
+ },
258
+ {
259
+ "cell_type": "code",
260
+ "execution_count": 1,
261
+ "metadata": {},
262
+ "outputs": [
263
+ {
264
+ "name": "stdout",
265
+ "output_type": "stream",
266
+ "text": [
267
+ "[[0.82140051]\n",
268
+ " [0.91314228]\n",
269
+ " [0.92991252]]\n"
270
+ ]
271
+ }
272
+ ],
273
+ "source": [
274
+ "import numpy as np\n",
275
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
276
+ "\n",
277
+ "# Пример данных\n",
278
+ "matrix = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n",
279
+ "vector = np.array([0.5, 0.7, 0.3])\n",
280
+ "\n",
281
+ "# Вычисление косинусного сходства между матрицей и вектором\n",
282
+ "similarity = cosine_similarity(matrix, vector.reshape(1, -1))\n",
283
+ "\n",
284
+ "print(similarity)\n"
285
+ ]
286
+ }
287
+ ],
288
+ "metadata": {
289
+ "kernelspec": {
290
+ "display_name": "cv",
291
+ "language": "python",
292
+ "name": "python3"
293
+ },
294
+ "language_info": {
295
+ "codemirror_mode": {
296
+ "name": "ipython",
297
+ "version": 3
298
+ },
299
+ "file_extension": ".py",
300
+ "mimetype": "text/x-python",
301
+ "name": "python",
302
+ "nbconvert_exporter": "python",
303
+ "pygments_lexer": "ipython3",
304
+ "version": "3.12.2"
305
+ }
306
+ },
307
+ "nbformat": 4,
308
+ "nbformat_minor": 2
309
+ }
README.md → data/README.md RENAMED
File without changes
data/X.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27096da97e7c093830ae02cb275c732f43a59c3aa96db08297466544f92a9b58
3
+ size 37568
data.csv → data/data.csv RENAMED
File without changes
embeddings.npy → data/embeddings.npy RENAMED
File without changes
data/logreg.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ecd4a43c1bd0b0f35262c896637f2beb4e020d3a26a28dec808b5c13ec67e093
3
+ size 5407
data/model.cbm ADDED
Binary file (32.6 kB). View file
 
movies_data.csv → data/movies_data.csv RENAMED
File without changes
requirements.txt → data/requirements.txt RENAMED
File without changes
data/y.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce8faa4674f06c82833ba7fddbd5e6ffb3a98bfcac2bad53facb1782a848f34a
3
+ size 248
search.ipynb ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 5,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "data": {
10
+ "text/html": [
11
+ "<div>\n",
12
+ "<style scoped>\n",
13
+ " .dataframe tbody tr th:only-of-type {\n",
14
+ " vertical-align: middle;\n",
15
+ " }\n",
16
+ "\n",
17
+ " .dataframe tbody tr th {\n",
18
+ " vertical-align: top;\n",
19
+ " }\n",
20
+ "\n",
21
+ " .dataframe thead th {\n",
22
+ " text-align: right;\n",
23
+ " }\n",
24
+ "</style>\n",
25
+ "<table border=\"1\" class=\"dataframe\">\n",
26
+ " <thead>\n",
27
+ " <tr style=\"text-align: right;\">\n",
28
+ " <th></th>\n",
29
+ " <th>name</th>\n",
30
+ " <th>description</th>\n",
31
+ " <th>link</th>\n",
32
+ " <th>year</th>\n",
33
+ " <th>imdb</th>\n",
34
+ " <th>kp</th>\n",
35
+ " <th>country</th>\n",
36
+ " <th>age</th>\n",
37
+ " <th>actors</th>\n",
38
+ " <th>genres</th>\n",
39
+ " <th>poster</th>\n",
40
+ " </tr>\n",
41
+ " </thead>\n",
42
+ " <tbody>\n",
43
+ " <tr>\n",
44
+ " <th>28</th>\n",
45
+ " <td>Мстители: Война бесконечности</td>\n",
46
+ " <td>В то время как отважные Мстители с союзниками...</td>\n",
47
+ " <td>https://www.lordfilm.bot/3670-mstiteli-vojna-b...</td>\n",
48
+ " <td>2018.0</td>\n",
49
+ " <td>8.4</td>\n",
50
+ " <td>8.1</td>\n",
51
+ " <td>США</td>\n",
52
+ " <td>18+</td>\n",
53
+ " <td>Роберт Дауни мл., Крис Хемсворт, Марк Руффало,...</td>\n",
54
+ " <td>Фильмы, Фильмы Marvel, Боевики, Приключения, Ф...</td>\n",
55
+ " <td>https://www.lordfilm.bot/uploads/posts/2020-10...</td>\n",
56
+ " </tr>\n",
57
+ " <tr>\n",
58
+ " <th>4286</th>\n",
59
+ " <td>LEGO Мстители Марвел: Код красный</td>\n",
60
+ " <td>Супергерои объединяются, чтобы противостоять ...</td>\n",
61
+ " <td>https://www.lordfilm.bot/49932-lego-mstiteli-m...</td>\n",
62
+ " <td>2023.0</td>\n",
63
+ " <td>NaN</td>\n",
64
+ " <td>NaN</td>\n",
65
+ " <td>США</td>\n",
66
+ " <td>0+</td>\n",
67
+ " <td>NaN</td>\n",
68
+ " <td>Мультфильмы</td>\n",
69
+ " <td>https://www.lordfilm.bot/uploads/posts/2023-10...</td>\n",
70
+ " </tr>\n",
71
+ " <tr>\n",
72
+ " <th>13384</th>\n",
73
+ " <td>Могучие рейнджеры: Потерянная галактика</td>\n",
74
+ " <td>Казалось бы всё зло уже побеждено, однако в н...</td>\n",
75
+ " <td>https://www.lordfilm.bot/18827-moguchie-rejndz...</td>\n",
76
+ " <td>1999.0</td>\n",
77
+ " <td>6.8</td>\n",
78
+ " <td>4.2</td>\n",
79
+ " <td>США, Франция, Япония</td>\n",
80
+ " <td>0+</td>\n",
81
+ " <td>Арчи Као, Регги Ролли, Дэнни Славин, Серина Ви...</td>\n",
82
+ " <td>Сериалы</td>\n",
83
+ " <td>https://www.lordfilm.bot/uploads/posts/2021-03...</td>\n",
84
+ " </tr>\n",
85
+ " <tr>\n",
86
+ " <th>2609</th>\n",
87
+ " <td>Стражи терракоты</td>\n",
88
+ " <td>Стражи волшебной Терракоты и магические сущес...</td>\n",
89
+ " <td>https://www.lordfilm.bot/46847-strazhi-terrako...</td>\n",
90
+ " <td>2021.0</td>\n",
91
+ " <td>6.2</td>\n",
92
+ " <td>6.7</td>\n",
93
+ " <td>Китай</td>\n",
94
+ " <td>12+</td>\n",
95
+ " <td>Тань Сяо</td>\n",
96
+ " <td>Мультфильмы</td>\n",
97
+ " <td>https://www.lordfilm.bot/uploads/posts/2022-01...</td>\n",
98
+ " </tr>\n",
99
+ " <tr>\n",
100
+ " <th>1156</th>\n",
101
+ " <td>Царство падальщиков</td>\n",
102
+ " <td>Грузовой корабль «Деметра» терпит аварию на н...</td>\n",
103
+ " <td>https://www.lordfilm.bot/49892-carstvo-padalsc...</td>\n",
104
+ " <td>2023.0</td>\n",
105
+ " <td>8.8</td>\n",
106
+ " <td>NaN</td>\n",
107
+ " <td>США</td>\n",
108
+ " <td>0+</td>\n",
109
+ " <td>Вунми Моссаку, Алиа Шокат, Сунита Мани, Боб Ст...</td>\n",
110
+ " <td>Мультфильмы</td>\n",
111
+ " <td>https://www.lordfilm.bot/uploads/posts/2023-10...</td>\n",
112
+ " </tr>\n",
113
+ " <tr>\n",
114
+ " <th>3907</th>\n",
115
+ " <td>Железный человек и Халк: Союз героев</td>\n",
116
+ " <td>Когда питающийся электричеством и неуязвимый ...</td>\n",
117
+ " <td>https://www.lordfilm.bot/21296-zheleznyj-chelo...</td>\n",
118
+ " <td>2013.0</td>\n",
119
+ " <td>4.6</td>\n",
120
+ " <td>4.2</td>\n",
121
+ " <td>США</td>\n",
122
+ " <td>12+</td>\n",
123
+ " <td>Адриан Пасдар, Фред Таташиор, Ди Брэдли Бейкер...</td>\n",
124
+ " <td>Мультфильмы</td>\n",
125
+ " <td>https://www.lordfilm.bot/uploads/posts/2021-04...</td>\n",
126
+ " </tr>\n",
127
+ " <tr>\n",
128
+ " <th>27972</th>\n",
129
+ " <td>Потерянное львиное королевство</td>\n",
130
+ " <td>Мультфильм о борьбе добра со злом на простора...</td>\n",
131
+ " <td>https://www.lordfilm.bot/24407-poterjannoe-lvi...</td>\n",
132
+ " <td>2019.0</td>\n",
133
+ " <td>3.8</td>\n",
134
+ " <td>NaN</td>\n",
135
+ " <td>США</td>\n",
136
+ " <td>18+</td>\n",
137
+ " <td>Kj Schrock, Сара Тейлор</td>\n",
138
+ " <td>Мультфильмы</td>\n",
139
+ " <td>https://www.lordfilm.bot/uploads/posts/2021-05...</td>\n",
140
+ " </tr>\n",
141
+ " <tr>\n",
142
+ " <th>297</th>\n",
143
+ " <td>Мир Юрского периода 3: Господство</td>\n",
144
+ " <td>Катастрофическое извержение вулкана Сибо на И...</td>\n",
145
+ " <td>https://www.lordfilm.bot/47499-mir-jurskogo-pe...</td>\n",
146
+ " <td>2022.0</td>\n",
147
+ " <td>5.6</td>\n",
148
+ " <td>5.7</td>\n",
149
+ " <td>США, Мальта</td>\n",
150
+ " <td>12+</td>\n",
151
+ " <td>Крис Пратт, Брайс Даллас Ховард, Лора Дерн, Сэ...</td>\n",
152
+ " <td>Фильмы, 2022 год, Боевики, Приключения, Трилле...</td>\n",
153
+ " <td>https://www.lordfilm.bot/uploads/posts/2022-06...</td>\n",
154
+ " </tr>\n",
155
+ " <tr>\n",
156
+ " <th>7626</th>\n",
157
+ " <td>Лузеры</td>\n",
158
+ " <td>«Лузеры» – сумасшедший экшн о предательстве и...</td>\n",
159
+ " <td>https://www.lordfilm.bot/6139-luzery-2010.html</td>\n",
160
+ " <td>2010.0</td>\n",
161
+ " <td>6.2</td>\n",
162
+ " <td>6.3</td>\n",
163
+ " <td>США, Франция</td>\n",
164
+ " <td>16+</td>\n",
165
+ " <td>Джеффри Дин Морган, Зои Салдана, Крис Эванс, И...</td>\n",
166
+ " <td>Фильмы, Боевики, Детективы, Комедии, Криминаль...</td>\n",
167
+ " <td>https://www.lordfilm.bot/uploads/posts/2021-01...</td>\n",
168
+ " </tr>\n",
169
+ " <tr>\n",
170
+ " <th>25332</th>\n",
171
+ " <td>Ancestral World</td>\n",
172
+ " <td>Пытаясь спасти своего брата и королевство сво...</td>\n",
173
+ " <td>https://www.lordfilm.bot/10306-ancestral-world...</td>\n",
174
+ " <td>2020.0</td>\n",
175
+ " <td>2.5</td>\n",
176
+ " <td>NaN</td>\n",
177
+ " <td>США</td>\n",
178
+ " <td>0+</td>\n",
179
+ " <td>Jennifer Mischiati, Джо Морелли, Райан А. Филл...</td>\n",
180
+ " <td>Фильмы, Боевики, 2020 год, Фильмы про монстров</td>\n",
181
+ " <td>https://www.lordfilm.bot/uploads/posts/2021-02...</td>\n",
182
+ " </tr>\n",
183
+ " </tbody>\n",
184
+ "</table>\n",
185
+ "</div>"
186
+ ],
187
+ "text/plain": [
188
+ " name \\\n",
189
+ "28 Мстители: Война бесконечности \n",
190
+ "4286 LEGO Мстители Марвел: Код красный \n",
191
+ "13384 Могучие рейнджеры: Потерянная галактика \n",
192
+ "2609 Стражи терракоты \n",
193
+ "1156 Царство падальщиков \n",
194
+ "3907 Железный человек и Халк: Союз героев \n",
195
+ "27972 Потерянное львиное королевство \n",
196
+ "297 Мир Юрского периода 3: Господство \n",
197
+ "7626 Лузеры \n",
198
+ "25332 Ancestral World \n",
199
+ "\n",
200
+ " description \\\n",
201
+ "28 В то время как отважные Мстители с союзниками... \n",
202
+ "4286 Супергерои объединяются, чтобы противостоять ... \n",
203
+ "13384 Казалось бы всё зло уже побеждено, однако в н... \n",
204
+ "2609 Стражи волшебной Терракоты и магические сущес... \n",
205
+ "1156 Грузовой корабль «Деметра» терпит аварию на н... \n",
206
+ "3907 Когда питающийся электричеством и неуязвимый ... \n",
207
+ "27972 Мультфильм о борьбе добра со злом на простора... \n",
208
+ "297 Катастрофическое извержение вулкана Сибо на И... \n",
209
+ "7626 «Лузеры» – сумасшедший экшн о предательстве и... \n",
210
+ "25332 Пытаясь спасти своего брата и королевство сво... \n",
211
+ "\n",
212
+ " link year imdb kp \\\n",
213
+ "28 https://www.lordfilm.bot/3670-mstiteli-vojna-b... 2018.0 8.4 8.1 \n",
214
+ "4286 https://www.lordfilm.bot/49932-lego-mstiteli-m... 2023.0 NaN NaN \n",
215
+ "13384 https://www.lordfilm.bot/18827-moguchie-rejndz... 1999.0 6.8 4.2 \n",
216
+ "2609 https://www.lordfilm.bot/46847-strazhi-terrako... 2021.0 6.2 6.7 \n",
217
+ "1156 https://www.lordfilm.bot/49892-carstvo-padalsc... 2023.0 8.8 NaN \n",
218
+ "3907 https://www.lordfilm.bot/21296-zheleznyj-chelo... 2013.0 4.6 4.2 \n",
219
+ "27972 https://www.lordfilm.bot/24407-poterjannoe-lvi... 2019.0 3.8 NaN \n",
220
+ "297 https://www.lordfilm.bot/47499-mir-jurskogo-pe... 2022.0 5.6 5.7 \n",
221
+ "7626 https://www.lordfilm.bot/6139-luzery-2010.html 2010.0 6.2 6.3 \n",
222
+ "25332 https://www.lordfilm.bot/10306-ancestral-world... 2020.0 2.5 NaN \n",
223
+ "\n",
224
+ " country age \\\n",
225
+ "28 США 18+ \n",
226
+ "4286 США 0+ \n",
227
+ "13384 США, Франция, Япония 0+ \n",
228
+ "2609 Китай 12+ \n",
229
+ "1156 США 0+ \n",
230
+ "3907 США 12+ \n",
231
+ "27972 США 18+ \n",
232
+ "297 США, Мальта 12+ \n",
233
+ "7626 США, Франция 16+ \n",
234
+ "25332 США 0+ \n",
235
+ "\n",
236
+ " actors \\\n",
237
+ "28 Роберт Дауни мл., Крис Хемсворт, Марк Руффало,... \n",
238
+ "4286 NaN \n",
239
+ "13384 Арчи Као, Регги Ролли, Дэнни Славин, Серина Ви... \n",
240
+ "2609 Тань Сяо \n",
241
+ "1156 Вунми Моссаку, Алиа Шокат, Сунита Мани, Боб Ст... \n",
242
+ "3907 Адриан Пасдар, Фред Таташиор, Ди Брэдли Бейкер... \n",
243
+ "27972 Kj Schrock, Сара Тейлор \n",
244
+ "297 Крис Пратт, Брайс Даллас Ховард, Лора Дерн, Сэ... \n",
245
+ "7626 Джеффри Дин Морган, Зои Салдана, Крис Эванс, И... \n",
246
+ "25332 Jennifer Mischiati, Джо Морелли, Райан А. Филл... \n",
247
+ "\n",
248
+ " genres \\\n",
249
+ "28 Фильмы, Фильмы Marvel, Боевики, Приключения, Ф... \n",
250
+ "4286 Мультфильмы \n",
251
+ "13384 Сериалы \n",
252
+ "2609 Мультфильмы \n",
253
+ "1156 Мультфильмы \n",
254
+ "3907 Мультфильмы \n",
255
+ "27972 Мультфильмы \n",
256
+ "297 Фильмы, 2022 год, Боевики, Приключения, Трилле... \n",
257
+ "7626 Фильмы, Боевики, Детективы, Комедии, Криминаль... \n",
258
+ "25332 Фильмы, Боевики, 2020 год, Фильмы про монстров \n",
259
+ "\n",
260
+ " poster \n",
261
+ "28 https://www.lordfilm.bot/uploads/posts/2020-10... \n",
262
+ "4286 https://www.lordfilm.bot/uploads/posts/2023-10... \n",
263
+ "13384 https://www.lordfilm.bot/uploads/posts/2021-03... \n",
264
+ "2609 https://www.lordfilm.bot/uploads/posts/2022-01... \n",
265
+ "1156 https://www.lordfilm.bot/uploads/posts/2023-10... \n",
266
+ "3907 https://www.lordfilm.bot/uploads/posts/2021-04... \n",
267
+ "27972 https://www.lordfilm.bot/uploads/posts/2021-05... \n",
268
+ "297 https://www.lordfilm.bot/uploads/posts/2022-06... \n",
269
+ "7626 https://www.lordfilm.bot/uploads/posts/2021-01... \n",
270
+ "25332 https://www.lordfilm.bot/uploads/posts/2021-02... "
271
+ ]
272
+ },
273
+ "execution_count": 5,
274
+ "metadata": {},
275
+ "output_type": "execute_result"
276
+ }
277
+ ],
278
+ "source": [
279
+ "\n",
280
+ "\n",
281
+ "from transformers import AutoTokenizer, AutoModel\n",
282
+ "import numpy as np\n",
283
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
284
+ "import torch\n",
285
+ "import pandas as pd\n",
286
+ "\n",
287
+ "\n",
288
+ "data = np.load('embeddings.npy')\n",
289
+ "movies = pd.read_csv('data.csv')\n",
290
+ "\n",
291
+ "def get_embeddings():\n",
292
+ " tokenizer = AutoTokenizer.from_pretrained(\"cointegrated/rubert-tiny2\")\n",
293
+ " model = AutoModel.from_pretrained(\"cointegrated/rubert-tiny2\")\n",
294
+ " # model.cuda() \n",
295
+ " return model, tokenizer\n",
296
+ "\n",
297
+ "def embed_bert_cls(text ):\n",
298
+ " model, tokenizer = get_embeddings()\n",
299
+ " t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')\n",
300
+ " with torch.no_grad():\n",
301
+ " model_output = model(**{k: v.to(model.device) for k, v in t.items()})\n",
302
+ " embeddings = model_output.last_hidden_state[:, 0, :]\n",
303
+ " embeddings = torch.nn.functional.normalize(embeddings)\n",
304
+ " return embeddings[0].cpu().numpy()\n",
305
+ "\n",
306
+ "def top_indices(array, n):\n",
307
+ "\n",
308
+ " sorted_indices = np.argsort(array)[::-1]\n",
309
+ " # Выбираем первые n индексов\n",
310
+ " top_n_indices = sorted_indices[:n]\n",
311
+ " return top_n_indices\n",
312
+ "\n",
313
+ "\n",
314
+ "def predict_rating(input_search):\n",
315
+ "\n",
316
+ " emb = embed_bert_cls(input_search)\n",
317
+ " X=np.column_stack((data, np.tile(emb, (data.shape[0], 1))))\n",
318
+ "\n",
319
+ "\n",
320
+ " # from catboost import CatBoostRanker\n",
321
+ " # cb= CatBoostRanker()\n",
322
+ " # cb.load_model('model.cbm')\n",
323
+ " # y = cb.predict(X)\n",
324
+ "\n",
325
+ " # import pickle\n",
326
+ " # with open('logreg.pkl', 'rb') as f:\n",
327
+ " # logreg = pickle.load(f)\n",
328
+ " # y = logreg.predict(X)\n",
329
+ "\n",
330
+ " y= cosine_similarity(data, emb.reshape(1, -1)).reshape(-1)\n",
331
+ "\n",
332
+ " return top_indices(y, 10)\n",
333
+ "\n",
334
+ "\n",
335
+ "preds=predict_rating(\"Пока Мстители и их союзники продолжают защищать мир от различных опасностей, с которыми не смог бы справиться один супергерой, новая угроза возникает из космоса: Танос. Межгалактический тиран преследует цель \")\n",
336
+ "\n",
337
+ "movies.iloc[preds]"
338
+ ]
339
+ }
340
+ ],
341
+ "metadata": {
342
+ "kernelspec": {
343
+ "display_name": "cv",
344
+ "language": "python",
345
+ "name": "python3"
346
+ },
347
+ "language_info": {
348
+ "codemirror_mode": {
349
+ "name": "ipython",
350
+ "version": 3
351
+ },
352
+ "file_extension": ".py",
353
+ "mimetype": "text/x-python",
354
+ "name": "python",
355
+ "nbconvert_exporter": "python",
356
+ "pygments_lexer": "ipython3",
357
+ "version": "3.12.2"
358
+ }
359
+ },
360
+ "nbformat": 4,
361
+ "nbformat_minor": 2
362
+ }