Commit 38f51c4
Parent(s): fb13d71

Add semantic highlighter model files with LFS for large files

Files changed:
- .gitattributes +1 -0
- README.md +157 -3
- config.json +26 -0
- model.safetensors +3 -0
- tokenizer.json +0 -0
- tokenizer_config.json +1 -0
- vocab.txt +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model.safetensors filter=lfs diff=lfs merge=lfs -text
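The added rule routes `model.safetensors` through Git LFS, so the repository stores a small pointer file instead of the ~438 MB weights. As an illustrative aside (not part of the commit), the sketch below roughly approximates how these basename globs select files for LFS; real `.gitattributes` matching follows gitignore semantics, and the candidate file names are hypothetical.

```python
from fnmatch import fnmatch

# Patterns from the .gitattributes rules above (basename globs only; this is
# only a rough approximation of gitattributes/gitignore matching).
lfs_patterns = ["*.zip", "*.zst", "*tfevents*", "model.safetensors"]

# Hypothetical file names, for illustration.
candidates = ["model.safetensors", "tokenizer.json", "events.out.tfevents.123"]

for name in candidates:
    tracked = any(fnmatch(name, pattern) for pattern in lfs_patterns)
    print(f"{name}: {'tracked by LFS' if tracked else 'stored as a normal git blob'}")
```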
README.md
CHANGED
@@ -1,3 +1,157 @@
# opensearch-semantic-highlighter

## Overview

The OpenSearch semantic highlighter is a trained classifier that takes a document and query as input and returns a binary score for each sentence in the document indicating its relevance to the query.

## Usage

The model is intended to be used within the OpenSearch cluster. However, for illustrative purposes, we include an example of how it can be used outside of OpenSearch:

```python
import nltk
import torch
import numpy as np
from datasets import Dataset
from functools import partial
from torch.utils.data import DataLoader
from dataclasses import dataclass, field
from typing import Any, Dict, List, Union
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer
from highlighter_model_tracing import TraceableBertTaggerForSentenceExtractionWithBackoff


@dataclass
class DataCollatorWithPadding:
    pad_kvs: Dict[str, Union[int, float]] = field(default_factory=dict)

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        first = features[0]
        batch = {}

        # pad and collate keys in self.pad_kvs
        for key, pad_value in self.pad_kvs.items():
            if key in first and first[key] is not None:
                batch[key] = pad_sequence(
                    [torch.tensor(f[key]) for f in features],
                    batch_first=True,
                    padding_value=pad_value,
                )

        # collate remaining keys assuming that the values can be stacked
        for k, v in first.items():
            if k not in self.pad_kvs and v is not None and isinstance(v, torch.Tensor):
                batch[k] = torch.stack([f[k] for f in features])

        return batch


def prepare_input_features(
    tokenizer, examples, max_seq_length=512, stride=128, padding=False
):
    # jointly tokenize questions and context
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_seq_length,
        stride=stride,
        return_overflowing_tokens=True,
        padding=padding,
        is_split_into_words=True,
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    tokenized_examples["example_id"] = []
    tokenized_examples["word_ids"] = []
    tokenized_examples["sentence_ids"] = []

    # process model inputs
    for i, sample_index in enumerate(sample_mapping):
        word_ids = tokenized_examples.word_ids(i)
        word_level_sentence_ids = examples["word_level_sentence_ids"][sample_index]

        sequence_ids = tokenized_examples.sequence_ids(i)
        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1

        sentences_ids = [-100] * token_start_index
        for word_idx in word_ids[token_start_index:]:
            if word_idx is not None:
                sentences_ids.append(word_level_sentence_ids[word_idx])
            else:
                sentences_ids.append(-100)

        tokenized_examples["sentence_ids"].append(sentences_ids)
        tokenized_examples["example_id"].append(examples["id"][sample_index])
        tokenized_examples["word_ids"].append(word_ids)

    return tokenized_examples


# example highlighting case, from OpenSearch documentation
query = "When does OpenSearch use text reanalysis for highlighting?"
document = "To highlight the search terms, the highlighter needs the start and end character offsets of each term. The offsets mark the term’s position in the original text. The highlighter can obtain the offsets from the following sources: Postings: When documents are indexed, OpenSearch creates an inverted search index—a core data structure used to search for documents. Postings represent the inverted search index and store the mapping of each analyzed term to the list of documents in which it occurs. If you set the index_options parameter to offsets when mapping a text field, OpenSearch adds each term’s start and end character offsets to the inverted index. During highlighting, the highlighter reruns the original query directly on the postings to locate each term. Thus, storing offsets makes highlighting more efficient for large fields because it does not require reanalyzing the text. Storing term offsets requires additional disk space, but uses less disk space than storing term vectors. Text reanalysis: In the absence of both postings and term vectors, the highlighter reanalyzes text in order to highlight it. For every document and every field that needs highlighting, the highlighter creates a small in-memory index and reruns the original query through Lucene’s query execution planner to access low-level match information for the current document. Reanalyzing the text works well in most use cases. However, this method is more memory and time intensive for large fields."

# sentence-level parsing
sentence_ids = []
context = []
document_sents = nltk.sent_tokenize(document)
for sent_id, sent in enumerate(document_sents):
    sent_words = sent.split(' ')
    context += sent_words
    sentence_ids += [sent_id] * len(sent_words)

# format example highlighting case as a Dataset
example_dataset = Dataset.from_dict({'question': [[query]],
                                     'context': [context],
                                     'word_level_sentence_ids': [sentence_ids],
                                     'id': [0]})

# prepare to featurize the raw text data
base_model_id = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
collator = DataCollatorWithPadding(
    pad_kvs={
        "input_ids": 0,
        "token_type_ids": 0,
        "attention_mask": 0,
        "sentence_ids": -100,
        "sentence_labels": -100,
    }
)
preprocess_fn = partial(prepare_input_features, tokenizer)

# featurize
example_dataset = example_dataset.map(
    preprocess_fn,
    batched=True,
    remove_columns=example_dataset.column_names,
    desc="Preparing model inputs",
)
loader = DataLoader(example_dataset, batch_size=1, collate_fn=collator)
batch = next(iter(loader))

# load model and get sentence highlights
model = TraceableBertTaggerForSentenceExtractionWithBackoff.from_pretrained('opensearch-project/opensearch-semantic-highlighter-v1')
highlights = model(
    batch["input_ids"],
    batch["attention_mask"],
    batch["token_type_ids"],
    batch["sentence_ids"],
)
highlighted_sentences = [document_sents[x] for x in highlights[0]]
print(highlighted_sentences)
```

## License

This project is licensed under the [Apache v2.0 License](https://github.com/opensearch-project/neural-search/blob/main/LICENSE).

## Copyright

Copyright OpenSearch Contributors. See [NOTICE](https://github.com/opensearch-project/neural-search/blob/main/NOTICE) for details.

config.json
ADDED
@@ -0,0 +1,26 @@
{
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertTaggerForSentenceExtraction"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.44.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}
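The configuration describes a standard BERT-base encoder (12 layers, hidden size 768, 512 position embeddings) with the sentence-extraction head `BertTaggerForSentenceExtraction`. As a minimal illustrative sketch, it can be inspected with the Hugging Face `AutoConfig` API; using the hub id from the README example is an assumption about how the repository is published.

```python
from transformers import AutoConfig

# Assumption: the model id below (taken from the README example) resolves to
# this repository; a local checkout path would work the same way.
config = AutoConfig.from_pretrained("opensearch-project/opensearch-semantic-highlighter-v1")

print(config.model_type)               # "bert"
print(config.num_hidden_layers)        # 12
print(config.hidden_size)              # 768
print(config.max_position_embeddings)  # 512, matching tokenizer_config.json's model_max_length
```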
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ae65ebbec6562c9badc6ae4bd5be5d1b55536ec840658c1d2821d0c6258b7146
size 437958648
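Because `model.safetensors` is tracked by Git LFS, the committed file is only a pointer carrying the object id and size; the actual weights are fetched on checkout or download. A hedged sketch of verifying that a downloaded copy matches the pointer above (the local path is an assumption):

```python
import hashlib
from pathlib import Path

# Values copied from the LFS pointer above.
expected_oid = "ae65ebbec6562c9badc6ae4bd5be5d1b55536ec840658c1d2821d0c6258b7146"
expected_size = 437958648

# Assumed local path to the downloaded weights.
weights = Path("model.safetensors")

digest = hashlib.sha256()
with weights.open("rb") as f:
    # Hash in 1 MiB chunks so the ~438 MB file is never held in memory at once.
    for chunk in iter(lambda: f.read(1 << 20), b""):
        digest.update(chunk)

assert weights.stat().st_size == expected_size, "size does not match the LFS pointer"
assert digest.hexdigest() == expected_oid, "sha256 does not match the LFS pointer"
print("model.safetensors matches the LFS pointer")
```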
tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
{"do_lower_case": true, "model_max_length": 512}
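`tokenizer_config.json` pins only lowercasing and the 512-token limit; the vocabulary and fast-tokenizer definition live in `vocab.txt` and `tokenizer.json`. A minimal sketch of how these settings surface when the tokenizer is loaded (the hub id is taken from the README example and is an assumption here):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("opensearch-project/opensearch-semantic-highlighter-v1")

# do_lower_case=true: text is lowercased before WordPiece tokenization.
print(tokenizer.tokenize("OpenSearch Semantic Highlighter"))

# model_max_length=512: with truncation enabled, longer inputs are cut off,
# which is why the README example windows long documents with stride=128.
encoded = tokenizer("word " * 1000, truncation=True)
print(len(encoded["input_ids"]))  # at most 512
```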
vocab.txt
ADDED
The diff for this file is too large to render.