Owos committed on
Commit
8231ebb
·
1 Parent(s): 5506d51

initial commit of the app

Files changed (2)
  1. app.py +201 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,201 @@
+ import pandas as pd
+ import streamlit as st
+ import numpy as np
+ import torch
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
+ from huggingface_hub import HfApi
+ from torch.utils.data import Dataset, DataLoader
+
+ st.set_page_config(
+     page_title="Koya Recommendation System",
+     initial_sidebar_state="auto",
+ )
+
+
+ st.markdown(
+     """
+ # Koya Recommender System
+ ### 👋 Welcome to the Koya recommendation system. This system recommends an LLM for you based on the parameters you set.
+ You can try it below."""
+ )
+
+ @st.cache
+ def get_model_infos(multilingual="multilingual"):
+     api = HfApi()
+     # Only add the multilingual tag to the filter when it is given.
+     filters = ["fill-mask"] if multilingual is None else ["fill-mask", multilingual]
+     model_infos = api.list_models(filter=filters, cardData=True)
+     data = [['id', 'task', 'lang', 'sha']]
+     for model in model_infos:
+         try:
+             data.append([model.modelId, model.pipeline_tag, model.cardData['language'], model.sha])
+         except Exception:
+             data.append([model.modelId, model.pipeline_tag, None, model.sha])
+
+     df = pd.DataFrame.from_records(data[1:], columns=data[0])
+     return df
+
+ class MLMDataset(Dataset):
+     def __init__(self, sentence, tokenizer, num_samples, MLM_MASK_TOKEN, MLM_UNK_TOKEN):
+         self.sentence = sentence
+         self.tokenizer = tokenizer
+
+         self.tensor_input = self.tokenizer(sentence, return_tensors='pt')['input_ids']
+         # If num_samples is not given, mask each non-special token position once.
+         if num_samples is None:
+             num_samples = self.tensor_input.size(1) - 2
+         self.num_samples = num_samples
+         self.batch_input = self.tensor_input.repeat(self.num_samples, 1)
+
+         # Ensure that masking is not done on the BOS and EOS tokens, since they are not part of the sentence itself.
+         self.random_ids = np.random.choice([i for i in range(1, self.tensor_input.size(1) - 1)], self.num_samples, replace=False)
+         self.random_ids = torch.Tensor(self.random_ids).long().unsqueeze(0).T
+
+         # Added by Chris Emezue on 29.01.2023
+         # Add an unk_mask term so that p(w|...) is 0 if w is the UNK token and unchanged otherwise.
+         unk_mask = torch.ones(self.batch_input.size()[0], self.batch_input.size()[1], self.tokenizer.vocab_size)
+         batch_input_for_unk = self.batch_input.unsqueeze(-1).expand(unk_mask.size())
+         self.unk_mask = unk_mask.masked_fill(batch_input_for_unk == MLM_UNK_TOKEN, 0)
+
+         self.mask = torch.zeros(self.batch_input.size())
+         src = torch.ones(self.batch_input.size(0)).unsqueeze(0).T
+
+         self.mask.scatter_(1, self.random_ids, src)
+         self.masked_input = self.batch_input.masked_fill(self.mask == 1, MLM_MASK_TOKEN)
+         self.labels = self.batch_input.masked_fill(self.masked_input != MLM_MASK_TOKEN, -100)
+         # Setting labels to -100 at unmasked positions only affects the loss;
+         # the logits, which are what we use below, are unchanged.
+
+         assert self.masked_input.shape[0] == self.labels.shape[0] == self.mask.shape[0] == self.unk_mask.shape[0]
+
+     def __len__(self):
+         return self.masked_input.shape[0]
+
+     def __getitem__(self, idx):
+         return self.masked_input[idx], self.mask[idx], self.labels[idx], self.unk_mask[idx]
+
+
+ def get_sense_score_batched(sentence, tokenizer, model, MLM_MASK_TOKEN, MLM_UNK_TOKEN, num_samples, BATCH_SIZE):
+     mlm_dataset = MLMDataset(sentence, tokenizer, num_samples, MLM_MASK_TOKEN, MLM_UNK_TOKEN)
+     dataloader = DataLoader(mlm_dataset, batch_size=BATCH_SIZE)
+
+     score = 1
+
+     for batch in dataloader:
+         masked_input, mask, labels, unk_mask = batch
+         output = model(masked_input, labels=labels)
+
+         logits_ = output['logits']
+         logits = logits_ * unk_mask  # Penalize the UNK token by zeroing out its scores
+
+         indices = torch.nonzero(mask)
+         logits_of_interest = logits[indices[:, 0], indices[:, 1], :]
+
+         labels_of_interest = labels[indices[:, 0], indices[:, 1]]
+         log_probs = logits_of_interest.gather(1, labels_of_interest.view(-1, 1))
+
+         # exp(x + y) = exp(x) * exp(y), so multiplying the per-batch scores is equivalent
+         # to exponentiating the sum over all masked positions.
+         batch_score = (log_probs.sum() / (-1 * mlm_dataset.num_samples)).exp().item()
+         score *= batch_score
+     return score
+
+
+ def get_sense_score(sentence, tokenizer, model, MLM_MASK_TOKEN, MLM_UNK_TOKEN, num_samples):
+     '''
+     IDEA
+     -----------------
+     PP = perplexity(P), where the perplexity(P) function simply computes:
+         (p_1 * p_2 * p_3 * ... * p_N) ^ (-1/N) for p_i in P
+
+     In practice the computation is done in log space to avoid underflow:
+         e^(-(log(p_1) + log(p_2) + ... + log(p_N)) / N)
+
+     Note: every time you run this function the result changes slightly (though the ordering should stay
+     roughly the same), because the tokens to mask are chosen randomly.
+     '''
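+     # For example (illustrative numbers only): with masked-token probabilities
+     # p_1 = 0.5 and p_2 = 0.25, perplexity = (0.5 * 0.25) ** (-1/2) ≈ 2.83,
+     # which is the same as exp(-(log(0.5) + log(0.25)) / 2) computed in log space.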
+
+     tensor_input = tokenizer(sentence, return_tensors='pt')['input_ids']
+     batch_input = tensor_input.repeat(num_samples, 1)
+
+     # Ensure that masking is not done on the BOS and EOS tokens, since they are not part of the sentence itself.
+     random_ids = np.random.choice([i for i in range(1, tensor_input.size(1) - 1)], num_samples, replace=False)
+     random_ids = torch.Tensor(random_ids).long().unsqueeze(0).T
+
+     # Added by Chris Emezue on 29.01.2023
+     # Add an unk_mask term so that p(w|...) is 0 if w is the UNK token and unchanged otherwise.
+     unk_mask = torch.ones(batch_input.size()[0], batch_input.size()[1], tokenizer.vocab_size)
+     batch_input_for_unk = batch_input.unsqueeze(-1).expand(unk_mask.size())
+     unk_mask = unk_mask.masked_fill(batch_input_for_unk == MLM_UNK_TOKEN, 0)
+
+     mask = torch.zeros(batch_input.size())
+     src = torch.ones(batch_input.size(0)).unsqueeze(0).T
+
+     mask.scatter_(1, random_ids, src)
+     masked_input = batch_input.masked_fill(mask == 1, MLM_MASK_TOKEN)
+     labels = batch_input.masked_fill(masked_input != MLM_MASK_TOKEN, -100)
+     # Setting labels to -100 at unmasked positions only affects the loss;
+     # the logits, which are what we use below, are unchanged.
+
+     output = model(masked_input, labels=labels)
+
+     logits_ = output['logits']
+     logits = logits_ * unk_mask  # Penalize the UNK token by zeroing out its scores
+
+     indices = torch.nonzero(mask)
+     logits_of_interest = logits[indices[:, 0], indices[:, 1], :]
+
+     labels_of_interest = labels[indices[:, 0], indices[:, 1]]
+     log_probs = logits_of_interest.gather(1, labels_of_interest.view(-1, 1))
+
+     score = (log_probs.sum() / (-1 * num_samples)).exp().item()
+
+     return score
+
+
+ def sort_dictionary(d):
+     keys = list(d.keys())
+     values = list(d.values())
+     sorted_value_index = np.argsort(values)
+     sorted_dict = {keys[i]: values[i] for i in sorted_value_index}
+     return sorted_dict
+
+ def set_seed():
+     np.random.seed(2023)
+     torch.manual_seed(2023)
+
+
+ sentence = st.text_input("Please input a sample sentence in the target language")
+
+ models = get_model_infos(multilingual=None)
+ selected_models = st.multiselect("Select the models you would like to compare", models['id'])
+
+ progress_text = "Computing recommendation scores"
+ my_bar = st.progress(0, text=progress_text)
+
+ scores = {}
+ for index, model_id in enumerate(selected_models):
+     tokenizer = AutoTokenizer.from_pretrained(model_id)
+     model = AutoModelForMaskedLM.from_pretrained(model_id)
+     if model_id == 'castorini/afriberta_base':
+         tokenizer.model_max_length = 512
+     MLM_MASK_TOKEN = tokenizer.mask_token_id  # e.g. (103, '[MASK]')
+     MLM_UNK_TOKEN = tokenizer.unk_token_id
+
+     BATCH_SIZE = 1
+     score = get_sense_score_batched(sentence, tokenizer, model, MLM_MASK_TOKEN, MLM_UNK_TOKEN, None, BATCH_SIZE)
+     scores[model_id] = score
+     # Report progress as the fraction of selected models processed so far.
+     my_bar.progress((index + 1) / len(selected_models), text=progress_text)
+
+ st.write("Our recommendation is:", scores)
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ pandas
+ streamlit
+ numpy
+ torch
+ huggingface_hub
+ git+https://github.com/huggingface/transformers