Owos committed on
Commit
8231ebb
·
1 Parent(s): 5506d51

initial commit of the app

Files changed (2)
  1. app.py +201 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,201 @@
+ import pandas as pd
+ import streamlit as st
+ import numpy as np
+ import torch
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
+ from huggingface_hub import HfApi
+ from torch.utils.data import Dataset, DataLoader
+
+ st.set_page_config(
+     page_title="Koya Recommendation System",
+     initial_sidebar_state="auto",
+ )
+
+
+ st.markdown(
+     """
+ # Koya Recommender System
+ ### 👋 Welcome to the Koya recommendation system. This system recommends an LLM for you based on the parameters you set.
+ You can try it below."""
+ )
+
+ @st.cache
+ def get_model_infos(multilingual="multilingual"):
+     api = HfApi()
+     # Only add the multilingual tag to the filter when it is given.
+     filters = ["fill-mask"] if multilingual is None else ["fill-mask", multilingual]
+     model_infos = api.list_models(filter=filters, cardData=True)
+     data = [['id', 'task', 'lang', 'sha']]
+     for model in model_infos:
+         try:
+             data.append([model.modelId, model.pipeline_tag, model.cardData['language'], model.sha])
+         except Exception:
+             data.append([model.modelId, model.pipeline_tag, None, model.sha])
+
+     df = pd.DataFrame.from_records(data[1:], columns=data[0])
+     return df
+
+ class MLMDataset(Dataset):
+     def __init__(self, sentence, tokenizer, num_samples, MLM_MASK_TOKEN, MLM_UNK_TOKEN):
+         self.sentence = sentence
+         self.tokenizer = tokenizer
+
+         self.tensor_input = self.tokenizer(sentence, return_tensors='pt')['input_ids']
+         # If num_samples is not given, mask each non-special token position once.
+         if num_samples is None:
+             num_samples = self.tensor_input.size(1) - 2
+         self.num_samples = num_samples
+         self.batch_input = self.tensor_input.repeat(self.num_samples, 1)
+
+         # Ensure that masking is not done on the BOS and EOS tokens, since they are not part of the sentence itself.
+         self.random_ids = np.random.choice([i for i in range(1, self.tensor_input.size(1) - 1)], self.num_samples, replace=False)
+         self.random_ids = torch.Tensor(self.random_ids).long().unsqueeze(0).T
+
+         # Added by Chris Emezue on 29.01.2023
+         # Add an unk_mask term so that p(w|...) is 0 if w is the UNK token and unchanged otherwise.
+         unk_mask = torch.ones(self.batch_input.size()[0], self.batch_input.size()[1], self.tokenizer.vocab_size)
+         batch_input_for_unk = self.batch_input.unsqueeze(-1).expand(unk_mask.size())
+         self.unk_mask = unk_mask.masked_fill(batch_input_for_unk == MLM_UNK_TOKEN, 0)
+
+         self.mask = torch.zeros(self.batch_input.size())
+         src = torch.ones(self.batch_input.size(0)).unsqueeze(0).T
+
+         self.mask.scatter_(1, self.random_ids, src)
+         self.masked_input = self.batch_input.masked_fill(self.mask == 1, MLM_MASK_TOKEN)
+         self.labels = self.batch_input.masked_fill(self.masked_input != MLM_MASK_TOKEN, -100)
+         # Setting labels to -100 at unmasked positions only affects the loss;
+         # the logits, which are what we use below, are unchanged.
+
+         assert self.masked_input.shape[0] == self.labels.shape[0] == self.mask.shape[0] == self.unk_mask.shape[0]
+
+     def __len__(self):
+         return self.masked_input.shape[0]
+
+     def __getitem__(self, idx):
+         return self.masked_input[idx], self.mask[idx], self.labels[idx], self.unk_mask[idx]
+
+
+ def get_sense_score_batched(sentence, tokenizer, model, MLM_MASK_TOKEN, MLM_UNK_TOKEN, num_samples, BATCH_SIZE):
+     mlm_dataset = MLMDataset(sentence, tokenizer, num_samples, MLM_MASK_TOKEN, MLM_UNK_TOKEN)
+     dataloader = DataLoader(mlm_dataset, batch_size=BATCH_SIZE)
+
+     score = 1
+
+     for batch in dataloader:
+         masked_input, mask, labels, unk_mask = batch
+         output = model(masked_input, labels=labels)
+
+         logits_ = output['logits']
+         logits = logits_ * unk_mask  # Penalize the UNK token by zeroing out its scores
+
+         indices = torch.nonzero(mask)
+         logits_of_interest = logits[indices[:, 0], indices[:, 1], :]
+
+         labels_of_interest = labels[indices[:, 0], indices[:, 1]]
+         log_probs = logits_of_interest.gather(1, labels_of_interest.view(-1, 1))
+
+         # exp(x + y) = exp(x) * exp(y), so multiplying the per-batch scores is equivalent
+         # to exponentiating the sum over all masked positions.
+         batch_score = (log_probs.sum() / (-1 * mlm_dataset.num_samples)).exp().item()
+         score *= batch_score
+     return score
+
+
+ def get_sense_score(sentence, tokenizer, model, MLM_MASK_TOKEN, MLM_UNK_TOKEN, num_samples):
+     '''
+     IDEA
+     -----------------
+     PP = perplexity(P), where the perplexity(P) function simply computes:
+         (p_1 * p_2 * p_3 * ... * p_N) ^ (-1/N) for p_i in P
+
+     In practice the computation is done in log space to avoid underflow:
+         e^(-(log(p_1) + log(p_2) + ... + log(p_N)) / N)
+
+     Note: every time you run this function the result changes slightly (though the ordering should stay
+     roughly the same), because the tokens to mask are chosen randomly.
+     '''
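+     # For example (illustrative numbers only): with masked-token probabilities
+     # p_1 = 0.5 and p_2 = 0.25, perplexity = (0.5 * 0.25) ** (-1/2) ≈ 2.83,
+     # which is the same as exp(-(log(0.5) + log(0.25)) / 2) computed in log space.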
+
+     tensor_input = tokenizer(sentence, return_tensors='pt')['input_ids']
+     batch_input = tensor_input.repeat(num_samples, 1)
+
+     # Ensure that masking is not done on the BOS and EOS tokens, since they are not part of the sentence itself.
+     random_ids = np.random.choice([i for i in range(1, tensor_input.size(1) - 1)], num_samples, replace=False)
+     random_ids = torch.Tensor(random_ids).long().unsqueeze(0).T
+
+     # Added by Chris Emezue on 29.01.2023
+     # Add an unk_mask term so that p(w|...) is 0 if w is the UNK token and unchanged otherwise.
+     unk_mask = torch.ones(batch_input.size()[0], batch_input.size()[1], tokenizer.vocab_size)
+     batch_input_for_unk = batch_input.unsqueeze(-1).expand(unk_mask.size())
+     unk_mask = unk_mask.masked_fill(batch_input_for_unk == MLM_UNK_TOKEN, 0)
+
+     mask = torch.zeros(batch_input.size())
+     src = torch.ones(batch_input.size(0)).unsqueeze(0).T
+
+     mask.scatter_(1, random_ids, src)
+     masked_input = batch_input.masked_fill(mask == 1, MLM_MASK_TOKEN)
+     labels = batch_input.masked_fill(masked_input != MLM_MASK_TOKEN, -100)
+     # Setting labels to -100 at unmasked positions only affects the loss;
+     # the logits, which are what we use below, are unchanged.
+
+     output = model(masked_input, labels=labels)
+
+     logits_ = output['logits']
+     logits = logits_ * unk_mask  # Penalize the UNK token by zeroing out its scores
+
+     indices = torch.nonzero(mask)
+     logits_of_interest = logits[indices[:, 0], indices[:, 1], :]
+
+     labels_of_interest = labels[indices[:, 0], indices[:, 1]]
+     log_probs = logits_of_interest.gather(1, labels_of_interest.view(-1, 1))
+
+     score = (log_probs.sum() / (-1 * num_samples)).exp().item()
+
+     return score
+
+
+ def sort_dictionary(d):
+     keys = list(d.keys())
+     values = list(d.values())
+     sorted_value_index = np.argsort(values)
+     sorted_dict = {keys[i]: values[i] for i in sorted_value_index}
+     return sorted_dict
+
+ def set_seed():
+     np.random.seed(2023)
+     torch.manual_seed(2023)
+
+
+ sentence = st.text_input("Please input a sample sentence in the target language")
+
+ models = get_model_infos(multilingual=None)
+ selected_models = st.multiselect("Select the models you would like to compare", models['id'])
+
+ progress_text = "Computing recommendation scores"
+ my_bar = st.progress(0, text=progress_text)
+
+ scores = {}
+ for index, model_id in enumerate(selected_models):
+     tokenizer = AutoTokenizer.from_pretrained(model_id)
+     model = AutoModelForMaskedLM.from_pretrained(model_id)
+     if model_id == 'castorini/afriberta_base':
+         tokenizer.model_max_length = 512
+     MLM_MASK_TOKEN = tokenizer.mask_token_id  # e.g. (103, '[MASK]')
+     MLM_UNK_TOKEN = tokenizer.unk_token_id
+
+     BATCH_SIZE = 1
+     score = get_sense_score_batched(sentence, tokenizer, model, MLM_MASK_TOKEN, MLM_UNK_TOKEN, None, BATCH_SIZE)
+     scores[model_id] = score
+     # Report progress as the fraction of selected models processed so far.
+     my_bar.progress((index + 1) / len(selected_models), text=progress_text)
+
+ st.write("Our recommendation is:", scores)
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ pandas
+ streamlit
+ numpy
+ torch
+ huggingface_hub
+ git+https://github.com/huggingface/transformers