Commit · d0a63c1
Parent(s): 6fdffe1
code added
README.md CHANGED
@@ -35,6 +35,64 @@ evaluation dataset: UKPLab/beir test data but we have used first 2lac passage on

Note: * means we have evaluated on the same eval dataset.

### Usage (HuggingFace Transformers)

```python
import numpy as np
from transformers import TFAutoModel, AutoTokenizer

# Load the passage (context) and question encoders
passage_encoder = TFAutoModel.from_pretrained("nlpconnect/dpr-ctx_encoder_bert_uncased_L-12_H-128_A-2")
query_encoder = TFAutoModel.from_pretrained("nlpconnect/dpr-question_encoder_bert_uncased_L-12_H-128_A-2")

p_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/dpr-ctx_encoder_bert_uncased_L-12_H-128_A-2")
q_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/dpr-question_encoder_bert_uncased_L-12_H-128_A-2")

# Note: `passage_dicts` (a list of {'title': ..., 'text': ...} dicts), `queries`
# (a list of strings) and `model_config` (providing passage_max_seq_len and
# query_max_seq_len) are expected to be defined by the caller.

def get_title_text_combined(passage_dicts):
    # Combine each passage's title and text into a (title, text) pair so the
    # tokenizer encodes them as a single sequence pair.
    res = []
    for p in passage_dicts:
        res.append((p['title'], p['text']))
    return res

processed_passages = get_title_text_combined(passage_dicts)

def extracted_passage_embeddings(processed_passages, model_config):
    # Tokenize the passages and run them through the passage encoder in batches
    passage_inputs = p_tokenizer.batch_encode_plus(
        processed_passages,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=model_config.passage_max_seq_len,
        return_token_type_ids=True
    )
    passage_embeddings = passage_encoder.predict(
        [np.array(passage_inputs['input_ids']),
         np.array(passage_inputs['attention_mask']),
         np.array(passage_inputs['token_type_ids'])],
        batch_size=512,
        verbose=1)
    return passage_embeddings

passage_embeddings = extracted_passage_embeddings(processed_passages, model_config)

def extracted_query_embeddings(queries, model_config):
    # Tokenize the queries and run them through the question encoder in batches
    query_inputs = q_tokenizer.batch_encode_plus(
        queries,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=model_config.query_max_seq_len,
        return_token_type_ids=True
    )
    query_embeddings = query_encoder.predict(
        [np.array(query_inputs['input_ids']),
         np.array(query_inputs['attention_mask']),
         np.array(query_inputs['token_type_ids'])],
        batch_size=512,
        verbose=1)
    return query_embeddings

query_embeddings = extracted_query_embeddings(queries, model_config)
```
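
The snippet above only produces the embeddings; the retrieval step is not shown here. Below is a minimal sketch of DPR-style ranking, assuming the pooled query and passage vectors have already been pulled out of the encoder outputs into plain 2-D NumPy arrays (the names `query_vecs`, `passage_vecs` and the helper `rank_passages` are hypothetical, not part of the model card): passages are scored by the dot product between query and passage vectors.

```python
import numpy as np

def rank_passages(query_vecs, passage_vecs, top_k=5):
    # query_vecs: (num_queries, dim), passage_vecs: (num_passages, dim)
    # Dot-product similarity between every query and every passage
    scores = query_vecs @ passage_vecs.T            # (num_queries, num_passages)
    # Indices of the top_k highest-scoring passages per query, best first
    top_idx = np.argsort(-scores, axis=1)[:, :top_k]
    return top_idx, scores

# Toy example with random vectors (replace with the real encoder outputs)
rng = np.random.default_rng(0)
query_vecs = rng.normal(size=(3, 128)).astype("float32")
passage_vecs = rng.normal(size=(100, 128)).astype("float32")

top_idx, scores = rank_passages(query_vecs, passage_vecs, top_k=5)
print(top_idx)  # (3, 5) array of passage indices, best match first
```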

### Training hyperparameters

The following hyperparameters were used during training: