Owos committed on
Commit
06809f2
·
1 Parent(s): 0078df4

updated token error

Browse files
Files changed (1) hide show
  1. app.py +8 -7
app.py CHANGED
@@ -39,14 +39,15 @@ def get_model_infos(multilingual="multilingual"):
39
  return df
40
 
41
  class MLMDataset(Dataset):
42
- def __init__(self,sentence,tokenizer,num_samples,MLM_MASK_TOKEN,MLM_UNK_TOKEN):
43
  self.sentence = sentence
44
  self.tokenizer = tokenizer
45
- self.num_samples = len(self.sentence) - 2
46
 
47
  self.tensor_input = self.tokenizer(sentence, return_tensors='pt')['input_ids']
48
- self.batch_input = self.tensor_input.repeat(self.num_samples, 1)
49
 
 
 
 
50
  self.random_ids = np.random.choice([i for i in range(1,self.tensor_input.size(1)-1)],self.num_samples,replace=False) # ensuring that the masking is not done on the BOS and EOS tokens since they are not connected to the sentence itself.
51
  self.random_ids = torch.Tensor(self.random_ids).long().unsqueeze(0).T
52
 
@@ -77,8 +78,8 @@ class MLMDataset(Dataset):
77
  return self.masked_input[idx], self.mask[idx],self.labels[idx], self.unk_mask[idx]
78
 
79
 
80
- def get_sense_score_batched(sentence,tokenizer,model,MLM_MASK_TOKEN,MLM_UNK_TOKEN,num_samples,BATCH_SIZE):
81
- mlm_dataset = MLMDataset(sentence,tokenizer,num_samples,MLM_MASK_TOKEN,MLM_UNK_TOKEN)
82
  dataloader = DataLoader(mlm_dataset,batch_size=BATCH_SIZE)
83
 
84
  score =1
@@ -119,7 +120,7 @@ def get_sense_score(sentence,tokenizer,model,MLM_MASK_TOKEN,MLM_UNK_TOKEN,num_sa
119
 
120
  tensor_input = tokenizer(sentence, return_tensors='pt')['input_ids']
121
  batch_input = tensor_input.repeat(num_samples, 1)
122
-
123
  random_ids = np.random.choice([i for i in range(1,tensor_input.size(1)-1)],num_samples,replace=False) # ensuring that the masking is not done on the BOS and EOS tokens since they are not connected to the sentence itself.
124
  random_ids = torch.Tensor(random_ids).long().unsqueeze(0).T
125
 
@@ -190,7 +191,7 @@ if run:
190
  for index, model_id in enumerate(selected_models):
191
  tokenizer = AutoTokenizer.from_pretrained(model_id)
192
  model = AutoModelWithLMHead.from_pretrained(model_id)
193
- if model_id == 'castorini/afriberta_base':
194
  tokenizer.model_max_length = 512
195
  MLM_MASK_TOKEN = tokenizer.mask_token_id #[(103, '[MASK]')]
196
  MLM_UNK_TOKEN = tokenizer.unk_token_id
 
39
  return df
40
 
41
  class MLMDataset(Dataset):
42
+ def __init__(self,sentence,tokenizer,MLM_MASK_TOKEN,MLM_UNK_TOKEN):
43
  self.sentence = sentence
44
  self.tokenizer = tokenizer
 
45
 
46
  self.tensor_input = self.tokenizer(sentence, return_tensors='pt')['input_ids']
 
47
 
48
+ self.num_samples = self.tensor_input.size()[-1] - 2
49
+
50
+ self.batch_input = self.tensor_input.repeat(self.num_samples, 1)
51
  self.random_ids = np.random.choice([i for i in range(1,self.tensor_input.size(1)-1)],self.num_samples,replace=False) # ensuring that the masking is not done on the BOS and EOS tokens since they are not connected to the sentence itself.
52
  self.random_ids = torch.Tensor(self.random_ids).long().unsqueeze(0).T
53
 
 
78
  return self.masked_input[idx], self.mask[idx],self.labels[idx], self.unk_mask[idx]
79
 
80
 
81
+ def get_sense_score_batched(sentence,tokenizer,model,MLM_MASK_TOKEN,MLM_UNK_TOKEN,BATCH_SIZE):
82
+ mlm_dataset = MLMDataset(sentence,tokenizer,MLM_MASK_TOKEN,MLM_UNK_TOKEN)
83
  dataloader = DataLoader(mlm_dataset,batch_size=BATCH_SIZE)
84
 
85
  score =1
 
120
 
121
  tensor_input = tokenizer(sentence, return_tensors='pt')['input_ids']
122
  batch_input = tensor_input.repeat(num_samples, 1)
123
+
124
  random_ids = np.random.choice([i for i in range(1,tensor_input.size(1)-1)],num_samples,replace=False) # ensuring that the masking is not done on the BOS and EOS tokens since they are not connected to the sentence itself.
125
  random_ids = torch.Tensor(random_ids).long().unsqueeze(0).T
126
 
 
191
  for index, model_id in enumerate(selected_models):
192
  tokenizer = AutoTokenizer.from_pretrained(model_id)
193
  model = AutoModelWithLMHead.from_pretrained(model_id)
194
+ if model_id.startswith("castorini"):
195
  tokenizer.model_max_length = 512
196
  MLM_MASK_TOKEN = tokenizer.mask_token_id #[(103, '[MASK]')]
197
  MLM_UNK_TOKEN = tokenizer.unk_token_id