Vivien committed on
Commit
afb8825
·
1 Parent(s): 2b2d081

App moved to another Space

Files changed (6)
  1. README.md +1 -1
  2. app.py +1 -146
  3. embeddings.npy +0 -3
  4. embeddings2.npy +0 -3
  5. movies.csv +0 -3
  6. requirements.txt +0 -5
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Semantic Search
+title: Semantic Search (obsolete)
 emoji: 📖
 colorFrom: purple
 colorTo: red
app.py CHANGED
@@ -1,147 +1,2 @@
-import time
-import re
-import pandas as pd
-import numpy as np
-import torch
-import torch.nn.functional as F
-from transformers import AutoTokenizer, AutoModel
-from tokenizers import Tokenizer, AddedToken
 import streamlit as st
-from st_click_detector import click_detector
-
-DEVICE = "cpu"
-MODEL_OPTIONS = ["msmarco-distilbert-base-tas-b", "all-mpnet-base-v2"]
-DESCRIPTION = """
-# Semantic search
-
-**Enter your query and hit enter**
-
-Built with 🤗 Hugging Face's [transformers](https://huggingface.co/transformers/) library, [SentenceBert](https://www.sbert.net/) models, [Streamlit](https://streamlit.io/) and 44k movie descriptions from the Kaggle [Movies Dataset](https://www.kaggle.com/rounakbanik/the-movies-dataset)
-"""
-
-
-@st.cache(
-    show_spinner=False,
-    hash_funcs={
-        AutoModel: lambda _: None,
-        AutoTokenizer: lambda _: None,
-        dict: lambda _: None,
-    },
-)
-def load():
-    models, tokenizers, embeddings = [], [], []
-    for model_option in MODEL_OPTIONS:
-        tokenizers.append(
-            AutoTokenizer.from_pretrained(f"sentence-transformers/{model_option}")
-        )
-        models.append(
-            AutoModel.from_pretrained(f"sentence-transformers/{model_option}").to(
-                DEVICE
-            )
-        )
-    embeddings.append(np.load("embeddings.npy"))
-    embeddings.append(np.load("embeddings2.npy"))
-    df = pd.read_csv("movies.csv")
-    return tokenizers, models, embeddings, df
-
-
-tokenizers, models, embeddings, df = load()
-
-
-def pooling(model_output):
-    return model_output.last_hidden_state[:, 0]
-
-
-def compute_embeddings(texts):
-    encoded_input = tokenizers[0](
-        texts, padding=True, truncation=True, return_tensors="pt"
-    ).to(DEVICE)
-
-    with torch.no_grad():
-        model_output = models[0](**encoded_input, return_dict=True)
-
-    embeddings = pooling(model_output)
-
-    return embeddings.cpu().numpy()
-
-
-def pooling2(model_output, attention_mask):
-    token_embeddings = model_output[0]
-    input_mask_expanded = (
-        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-    )
-    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
-        input_mask_expanded.sum(1), min=1e-9
-    )
-
-
-def compute_embeddings2(list_of_strings):
-    encoded_input = tokenizers[1](
-        list_of_strings, padding=True, truncation=True, return_tensors="pt"
-    ).to(DEVICE)
-    with torch.no_grad():
-        model_output = models[1](**encoded_input)
-    sentence_embeddings = pooling2(model_output, encoded_input["attention_mask"])
-    return F.normalize(sentence_embeddings, p=2, dim=1).cpu().numpy()
-
-
-@st.cache(
-    show_spinner=False,
-    hash_funcs={Tokenizer: lambda _: None, AddedToken: lambda _: None},
-)
-def semantic_search(query, model_id):
-    start = time.time()
-    if len(query.strip()) == 0:
-        return ""
-    if "[Similar:" not in query:
-        if model_id == 0:
-            query_embedding = compute_embeddings([query])
-        else:
-            query_embedding = compute_embeddings2([query])
-    else:
-        match = re.match(r"\[Similar:(\d{1,5}).*", query)
-        if match:
-            idx = int(match.groups()[0])
-            query_embedding = embeddings[model_id][idx : idx + 1, :]
-            if query_embedding.shape[0] == 0:
-                return ""
-        else:
-            return ""
-    indices = np.argsort(embeddings[model_id] @ np.transpose(query_embedding)[:, 0])[
-        -1:-11:-1
-    ]
-    if len(indices) == 0:
-        return ""
-    result = "<ol>"
-    for i in indices:
-        result += f"<li style='padding-top: 10px'><b>{df.iloc[i].title}</b> ({df.iloc[i].release_date}). {df.iloc[i].overview} "
-        result += f"<a id='{i}' href='#'>Similar movies</a></li>"
-    delay = "%.3f" % (time.time() - start)
-    return f"<p><i>Computation time: {delay} seconds</i></p>{result}</ol>"
-
-
-st.sidebar.markdown(DESCRIPTION)
-
-model_choice = st.sidebar.selectbox("Similarity model", options=MODEL_OPTIONS)
-model_id = 0 if model_choice == MODEL_OPTIONS[0] else 1
-
-if "query" in st.session_state:
-    query = st.text_input("", value=st.session_state["query"])
-else:
-    query = st.text_input("", value="time travel")
-
-clicked = click_detector(semantic_search(query, model_id))
-
-if clicked != "":
-    st.markdown(clicked)
-    change_query = False
-    if "last_clicked" not in st.session_state:
-        st.session_state["last_clicked"] = clicked
-        change_query = True
-    else:
-        if clicked != st.session_state["last_clicked"]:
-            st.session_state["last_clicked"] = clicked
-            change_query = True
-    if change_query:
-        st.session_state["query"] = f"[Similar:{clicked}] {df.iloc[int(clicked)].title}"
-        st.experimental_rerun()
+st.markdown("Web app moved [here](https://huggingface.co/spaces/vivien/semantic-search2)")
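For reference, the retrieval logic this commit removes from app.py reduces to embedding the query and ranking the precomputed movie embeddings by dot product. Below is a minimal, self-contained sketch of that step, assuming the same sentence-transformers checkpoint and the embeddings.npy matrix that this commit also deletes; the query string and variable names are illustrative only.

```python
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

# Same checkpoint as MODEL_OPTIONS[0] in the deleted app.py.
MODEL = "sentence-transformers/msmarco-distilbert-base-tas-b"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModel.from_pretrained(MODEL)

def embed(texts):
    # CLS-token pooling, as in the deleted pooling()/compute_embeddings().
    encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        output = model(**encoded)
    return output.last_hidden_state[:, 0].numpy()

# embeddings.npy held the precomputed corpus embeddings (removed in this commit);
# "time travel" is just the example default query from the old app.
corpus = np.load("embeddings.npy")     # shape (n_movies, dim)
query = embed(["time travel"])         # shape (1, dim)
scores = corpus @ query[0]             # dot-product relevance scores
top10 = np.argsort(scores)[-1:-11:-1]  # indices of the ten best matches
```

The deleted app then rendered those indices as an HTML list of titles and overviews, with st_click_detector links used to trigger "similar movies" lookups via the [Similar:idx] query syntax.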
embeddings.npy DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:64495712bf1903dd04604cd5641f5b521912d8938339e9e9e3071dad8952b34a
-size 134876288
embeddings2.npy DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:136aa7ffd5630d19dc88f1e779dbeb04011ef918ac3fba2148a8f5d58303d736
-size 134876288
movies.csv DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1da4fb07829b3f57bce3fa663641c50b3d3e65cdf949f6e6f340960a5acc1005
-size 16293996
requirements.txt CHANGED
@@ -1,5 +0,0 @@
1
- torch
2
- transformers
3
- numpy
4
- pandas
5
- st-click-detector