DanielIglesias97 committed on
Commit
b8f4ebc
·
1 Parent(s): d1447f8

First upload to the repo

Browse files
Dockerfile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use an official Python runtime as a parent image
FROM python:3.9-slim

# Send Python output straight to the container logs instead of buffering it
ENV PYTHONUNBUFFERED=1

# git is needed because requirements.txt installs a package from a git URL.
# Skip recommended packages and remove the apt package lists in the same
# layer so they do not end up in the final image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends git && \
    rm -rf /var/lib/apt/lists/*

# Set the working directory in the container
WORKDIR /app

# Copy the requirements file on its own first so the dependency layer below
# is only rebuilt when requirements.txt changes
COPY requirements.txt .

# Install dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code into the container
COPY . .

# Port that server.py passes to app.run()
EXPOSE 5000

# Run the application
CMD ["python", "server.py"]
config.cfg ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ [DEFAULT]
2
+ embeddings_csv_path = /app/data/movie_embeddings.csv
3
+
4
+ [TEST]
5
+ query = A good film that you would recommend to your friends
data/movie_embeddings.csv ADDED
The diff for this file is too large to render. See raw diff
 
image_search_engine.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Step 1: Import required packages
2
+ import configparser
3
+ from datasets import load_dataset
4
+ from sentence_transformers import SentenceTransformer
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ import numpy as np
7
+ import os
8
+ import pandas as pd
9
+
10
class ImageSearchEngine():
    """Semantic search over movie-review text backed by a CSV of embeddings.

    Despite its name, the class works on review text, not images: reviews are
    embedded with a sentence-transformer model, the vectors are stored in a
    CSV file, and queries are ranked by cosine similarity against them.
    """

    def __init__(self, embeddings_csv_path):
        """Remember where the embeddings CSV is (or will be) stored.

        Args:
            embeddings_csv_path: Path of the CSV file written by
                ``generate_embeddings`` and read by ``semantic_search``.
        """
        self.embeddings_csv_path = embeddings_csv_path

    def load_data_and_model(self):
        """Load the review dataset and the embedding model.

        Returns:
            A ``(df, model)`` tuple: a DataFrame with the ``text`` and
            ``label`` columns of the dataset, and the sentence-transformer
            model used to encode both reviews and queries.
        """
        # Load a sample dataset (Stanford Movie Review Dataset)
        dataset = load_dataset('imdb', split='train[:1000]')  # Using first 1000 examples
        df = pd.DataFrame(dataset)[['text', 'label']]

        # Load a small model that fits in 4GB VRAM
        model = SentenceTransformer('all-MiniLM-L6-v2')  # 384-dimensional embeddings

        return df, model

    def generate_embeddings(self, df, model, overwrite=False):
        """Encode every review and persist the result as a CSV file.

        Nothing is computed when the CSV already exists, unless ``overwrite``
        is true; in that case ``df`` is returned untouched.

        Args:
            df: DataFrame with at least a ``text`` column. When embeddings are
                generated, an ``embedding`` column is added to it in place.
            model: Object exposing ``encode(texts, batch_size=...,
                show_progress_bar=...)`` that returns one vector per text.
            overwrite: Regenerate the CSV even if it already exists.

        Returns:
            The same ``df`` object that was passed in.
        """
        if os.path.exists(self.embeddings_csv_path) and not overwrite:
            return df

        texts = df['text'].tolist()

        # Generate embeddings in batches for efficiency
        embeddings = model.encode(texts, batch_size=32, show_progress_bar=True)

        # Convert numpy array to string representation for CSV storage
        df['embedding'] = [','.join(map(str, emb)) for emb in embeddings]

        # Create the target directory first: to_csv does not create missing
        # parents, and failing here would throw away the encoding work above.
        parent_dir = os.path.dirname(self.embeddings_csv_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        df.to_csv(self.embeddings_csv_path, index=False)

        return df

    def semantic_search(self, query, model, top_k=5):
        """Return the stored reviews most similar to ``query``.

        The embeddings CSV must already exist (see ``generate_embeddings``);
        it is re-read from disk on every call.

        Args:
            query: Free-text search string.
            model: The same kind of model used to build the CSV, so the query
                vector is comparable with the stored ones.
            top_k: Maximum number of rows to return.

        Returns:
            A DataFrame with the columns ``text``, ``similarity`` and
            ``label``, sorted by descending cosine similarity.
        """
        # Load embeddings from CSV
        df = pd.read_csv(self.embeddings_csv_path)

        # Convert string embeddings back to numpy arrays
        df['embedding'] = df['embedding'].apply(lambda x: np.fromstring(x, sep=','))

        # Encode query (a one-element batch, so the result is 2-D)
        query_embedding = model.encode([query])

        # Calculate similarities between the query and every stored vector
        embeddings_matrix = np.vstack(df['embedding'].values)
        similarities = cosine_similarity(query_embedding, embeddings_matrix).flatten()

        # Create and sort results
        df['similarity'] = similarities
        results = df.sort_values('similarity', ascending=False).head(top_k)

        return results[['text', 'similarity', 'label']]
57
+
58
def _run_demo():
    """Build the embeddings CSV if needed and print one example search.

    Both the CSV location and the example query come from ``config.cfg``
    (``[DEFAULT] embeddings_csv_path`` and ``[TEST] query``).
    """
    settings = configparser.ConfigParser()
    settings.read('config.cfg')

    engine = ImageSearchEngine(settings['DEFAULT']['embeddings_csv_path'])

    # Generate and save embeddings (skipped when the CSV already exists)
    reviews, encoder = engine.load_data_and_model()
    engine.generate_embeddings(reviews, encoder, overwrite=False)

    # Example search
    hits = engine.semantic_search(settings['TEST']['query'], encoder)
    print('Results -> ', hits)


# Execution flow
if __name__ == "__main__":
    _run_demo()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ datasets==3.2.0
2
+ flask==3.1.0
3
+ numpy==2.0.2
4
+ pandas==2.2.3
5
+ scikit-learn==1.6.1
6
+ git+https://github.com/UKPLab/sentence-transformers.git@e2a0098b0fbe10bf9a140a9b1d4c2a3451f1571f
7
+ faiss-cpu==1.9.0.post1
server.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import configparser
2
+ from flask import Flask, request, render_template
3
+ from image_search_engine import ImageSearchEngine
4
+ import os
5
+
6
# Flask application; templates live in /app/views (the container layout set
# up by the Dockerfile) and static assets are served under /static.
app = Flask(__name__, template_folder='/app/views', static_url_path='/static')

# ConfigParser.read() silently skips files it cannot open and returns the
# list of files it did read, so check it to fail with a clear message instead
# of an opaque KeyError on the lookup below.
config = configparser.ConfigParser()
if not config.read('config.cfg'):
    raise FileNotFoundError("config.cfg could not be read from the working directory")
embeddings_csv_path = config['DEFAULT']['embeddings_csv_path']

# Load the dataset and model once at start-up and make sure the embeddings
# CSV exists; the route handler reuses `image_search_engine_manager` and
# `model` for every request.
image_search_engine_manager = ImageSearchEngine(embeddings_csv_path)
df, model = image_search_engine_manager.load_data_and_model()
image_search_engine_manager.generate_embeddings(df, model, overwrite=False)
15
+
16
@app.route('/')
def search():
    """Render the search page, with results when a query was submitted.

    Reads the ``query`` URL parameter. A missing or whitespace-only query
    renders the page with an empty result list; otherwise the query is passed
    unchanged to the search engine and its result is handed to the template.
    """
    query = request.args.get('query')
    results = []

    # `is not None` is the identity check PEP 8 asks for; the strip() guards
    # against running a search for a blank query.
    if query is not None and query.strip():
        results = image_search_engine_manager.semantic_search(query, model)

    return render_template("search.html", results=results)
25
+
26
def main():
    """Start Flask's built-in server on all interfaces, port 5000.

    Debug mode is off unless the FLASK_DEBUG environment variable is set to
    ``1``, ``true`` or ``yes``: the server listens on 0.0.0.0, and the
    interactive debugger must not be reachable from other hosts.
    """
    debug = os.environ.get("FLASK_DEBUG", "").strip().lower() in ("1", "true", "yes")
    app.run(host="0.0.0.0", port=5000, debug=debug)


# Only start the server when executed as a script, so importing this module
# (e.g. from a WSGI server or a test) does not block on app.run().
if __name__ == "__main__":
    main()
static/css/styles.css ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Page-wide defaults */
body {
    margin: 0;
    padding: 0;
    color: #333;
    background-color: #f4f4f4;
    font-family: Arial, sans-serif;
}

/* Centered white card that holds the page content */
.container {
    max-width: 800px;
    margin: 50px auto;
    padding: 20px;
    border-radius: 8px;
    background-color: #fff;
    box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
}

h1 {
    margin-bottom: 30px;
    color: #444;
    text-align: center;
}

/* Horizontal separator: 0.5em above and below, centered horizontally */
hr {
    display: block;
    margin: 0.5em auto;
    border-width: 1px;
    border-style: inset;
}

/* Review list */
.review-list {
    padding: 0;
    list-style: none;
}

/* One review card */
.review-item {
    margin-bottom: 20px;
    padding: 20px;
    border: 1px solid #ddd;
    border-radius: 8px;
    background-color: #f9f9f9;
    transition: transform 0.2s ease, box-shadow 0.2s ease;
}

/* Lift the card slightly on hover */
.review-item:hover {
    box-shadow: 0 5px 15px rgba(0, 0, 0, 0.1);
    transform: translateY(-5px);
}

.review-text {
    margin: 0 0 10px;
    font-size: 16px;
    line-height: 1.6;
}

/* Label badge; blue unless a modifier class below overrides it */
.review-label {
    display: inline-block;
    padding: 5px 10px;
    border-radius: 4px;
    color: #fff;
    background-color: #007bff;
    font-size: 14px;
    font-weight: bold;
}

.review-label.positive {
    background-color: #28a745;
}

.review-label.negative {
    background-color: #dc3545;
}
views/search.html ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Movie Reviews</title>
    <link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='css/styles.css') }}">
</head>
<body>
    {# GET form: the text is sent back to "/" as the `query` URL parameter. #}
    <form action="/">
        {# aria-label gives the field an accessible name; `value` keeps the
           submitted query visible (Flask exposes `request` to templates and
           autoescapes the output). #}
        <input type="text" placeholder="Search.." name="query" aria-label="Search movie reviews" value="{{ request.args.get('query', '') }}">
        <button type="submit">Submit</button>
    </form>
    <div class="container">
        <h1>Movie Reviews</h1>
        <ul class="review-list">
            {# `results` is either an empty list or the DataFrame returned by
               the search; the `length` filter works on both. #}
            {% if results|length > 0 %}
            {% for item in results['text'].values %}
            {# Only <li> may be a direct child of <ul>, so the former <hr>
               separators are replaced by the card styling of .review-item. #}
            <li class="review-item">
                <p class="review-text">{{ item }}</p>
            </li>
            {% endfor %}
            {% endif %}
        </ul>
    </div>
</body>
</html>