junqiu-dev committed
Commit 38f51c4 · 1 Parent(s): fb13d71

Add semantic highlighter model files with LFS for large files

Files changed (7)
  1. .gitattributes +1 -0
  2. README.md +157 -3
  3. config.json +26 -0
  4. model.safetensors +3 -0
  5. tokenizer.json +0 -0
  6. tokenizer_config.json +1 -0
  7. vocab.txt +0 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,157 @@
- ---
- license: apache-2.0
- ---
+ # opensearch-semantic-highlighter
+
+ ## Overview
+
+ The OpenSearch semantic highlighter is a trained classifier that takes a document and a query as input and returns a binary score for each sentence in the document indicating its relevance to the query.
+
+ ## Usage
+
+ The model is intended to be used within an OpenSearch cluster. However, for illustrative purposes, we include an example of how it can be used outside of OpenSearch:
+
+ ```python
+ import nltk
+ import torch
+ from datasets import Dataset
+ from functools import partial
+ from torch.utils.data import DataLoader
+ from dataclasses import dataclass, field
+ from typing import Any, Dict, List, Union
+ from torch.nn.utils.rnn import pad_sequence
+ from transformers import AutoTokenizer
+ from highlighter_model_tracing import TraceableBertTaggerForSentenceExtractionWithBackoff
+
+ nltk.download("punkt", quiet=True)  # sentence tokenizer data used by nltk.sent_tokenize below
+
+ @dataclass
+ class DataCollatorWithPadding:
+     # maps feature name -> padding value for batching variable-length sequences
+     pad_kvs: Dict[str, Union[int, float]] = field(default_factory=dict)
+
+     def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
+         first = features[0]
+         batch = {}
+
+         # pad and collate keys in self.pad_kvs
+         for key, pad_value in self.pad_kvs.items():
+             if key in first and first[key] is not None:
+                 batch[key] = pad_sequence(
+                     [torch.tensor(f[key]) for f in features],
+                     batch_first=True,
+                     padding_value=pad_value,
+                 )
+
+         # collate remaining keys, assuming the values can be stacked
+         for k, v in first.items():
+             if k not in self.pad_kvs and v is not None and isinstance(v, torch.Tensor):
+                 batch[k] = torch.stack([f[k] for f in features])
+
+         return batch
+
+
+ def prepare_input_features(
+     tokenizer, examples, max_seq_length=512, stride=128, padding=False
+ ):
+     # jointly tokenize questions and contexts; contexts longer than
+     # max_seq_length overflow into extra windows that overlap by `stride` tokens
+     tokenized_examples = tokenizer(
+         examples["question"],
+         examples["context"],
+         truncation="only_second",
+         max_length=max_seq_length,
+         stride=stride,
+         return_overflowing_tokens=True,
+         padding=padding,
+         is_split_into_words=True,
+     )
+
+     sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+     tokenized_examples["example_id"] = []
+     tokenized_examples["word_ids"] = []
+     tokenized_examples["sentence_ids"] = []
+
+     # process model inputs: map each context token back to its sentence id
+     for i, sample_index in enumerate(sample_mapping):
+         word_ids = tokenized_examples.word_ids(i)
+         word_level_sentence_ids = examples["word_level_sentence_ids"][sample_index]
+
+         # advance past the question tokens (sequence id 0) to the first context token
+         sequence_ids = tokenized_examples.sequence_ids(i)
+         token_start_index = 0
+         while sequence_ids[token_start_index] != 1:
+             token_start_index += 1
+
+         # -100 marks question and special tokens to be ignored by the model
+         sentence_ids = [-100] * token_start_index
+         for word_idx in word_ids[token_start_index:]:
+             if word_idx is not None:
+                 sentence_ids.append(word_level_sentence_ids[word_idx])
+             else:
+                 sentence_ids.append(-100)
+
+         tokenized_examples["sentence_ids"].append(sentence_ids)
+         tokenized_examples["example_id"].append(examples["id"][sample_index])
+         tokenized_examples["word_ids"].append(word_ids)
+
+     return tokenized_examples
+
+
+ # example highlighting case, from the OpenSearch documentation
+ query = "When does OpenSearch use text reanalysis for highlighting?"
+ document = "To highlight the search terms, the highlighter needs the start and end character offsets of each term. The offsets mark the term’s position in the original text. The highlighter can obtain the offsets from the following sources: Postings: When documents are indexed, OpenSearch creates an inverted search index—a core data structure used to search for documents. Postings represent the inverted search index and store the mapping of each analyzed term to the list of documents in which it occurs. If you set the index_options parameter to offsets when mapping a text field, OpenSearch adds each term’s start and end character offsets to the inverted index. During highlighting, the highlighter reruns the original query directly on the postings to locate each term. Thus, storing offsets makes highlighting more efficient for large fields because it does not require reanalyzing the text. Storing term offsets requires additional disk space, but uses less disk space than storing term vectors. Text reanalysis: In the absence of both postings and term vectors, the highlighter reanalyzes text in order to highlight it. For every document and every field that needs highlighting, the highlighter creates a small in-memory index and reruns the original query through Lucene’s query execution planner to access low-level match information for the current document. Reanalyzing the text works well in most use cases. However, this method is more memory and time intensive for large fields."
+
+ # sentence-level parsing: record which sentence each word belongs to
+ sentence_ids = []
+ context = []
+ document_sents = nltk.sent_tokenize(document)
+ for sent_id, sent in enumerate(document_sents):
+     sent_words = sent.split(' ')
+     context += sent_words
+     sentence_ids += [sent_id] * len(sent_words)
+
+ # format the example highlighting case as a Dataset
+ example_dataset = Dataset.from_dict({'question': [[query]],
+                                      'context': [context],
+                                      'word_level_sentence_ids': [sentence_ids],
+                                      'id': [0]})
+
+ # prepare to featurize the raw text data
+ base_model_id = "bert-base-uncased"
+ tokenizer = AutoTokenizer.from_pretrained(base_model_id)
+ collator = DataCollatorWithPadding(
+     pad_kvs={
+         "input_ids": 0,
+         "token_type_ids": 0,
+         "attention_mask": 0,
+         "sentence_ids": -100,
+         "sentence_labels": -100,
+     }
+ )
+ preprocess_fn = partial(prepare_input_features, tokenizer)
+
+ # featurize
+ example_dataset = example_dataset.map(
+     preprocess_fn,
+     batched=True,
+     remove_columns=example_dataset.column_names,
+     desc="Preparing model inputs",
+ )
+ loader = DataLoader(example_dataset, batch_size=1, collate_fn=collator)
+ batch = next(iter(loader))
+
+ # load the model and get sentence highlights
+ model = TraceableBertTaggerForSentenceExtractionWithBackoff.from_pretrained('opensearch-project/opensearch-semantic-highlighter-v1')
+ highlights = model(
+     batch["input_ids"],
+     batch["attention_mask"],
+     batch["token_type_ids"],
+     batch["sentence_ids"],
+ )
+ highlighted_sentences = [document_sents[x] for x in highlights[0]]
+ print(highlighted_sentences)  # the sentences scored as relevant to the query
+ ```
+
+ ## License
+
+ This project is licensed under the [Apache v2.0 License](https://github.com/opensearch-project/neural-search/blob/main/LICENSE).
+
+ ## Copyright
+
+ Copyright OpenSearch Contributors. See [NOTICE](https://github.com/opensearch-project/neural-search/blob/main/NOTICE) for details.
config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "_name_or_path": "bert-base-uncased",
+   "architectures": [
+     "BertTaggerForSentenceExtraction"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.44.2",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 30522
+ }
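
For a quick sanity check of these values outside the cluster, the config can be read back with transformers. A minimal sketch, assuming the hub repo id opensearch-project/opensearch-semantic-highlighter-v1 used in the README example above:

```python
from transformers import AutoConfig

# model_type "bert" maps this file onto a standard BertConfig
config = AutoConfig.from_pretrained("opensearch-project/opensearch-semantic-highlighter-v1")
print(config.architectures)            # ["BertTaggerForSentenceExtraction"]
print(config.hidden_size)              # 768
print(config.max_position_embeddings)  # 512, the effective input length cap
```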
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ae65ebbec6562c9badc6ae4bd5be5d1b55536ec840658c1d2821d0c6258b7146
+ size 437958648
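
The weights themselves live in Git LFS; this pointer only records their SHA-256 digest and byte size. A downloaded copy can be verified against the pointer using the standard library alone. A minimal sketch, assuming the file sits at the hypothetical local path model.safetensors:

```python
import hashlib
import os

path = "model.safetensors"  # hypothetical local path to the downloaded weights

digest = hashlib.sha256()
with open(path, "rb") as f:
    # hash in 1 MiB chunks to keep memory flat for the ~438 MB file
    for chunk in iter(lambda: f.read(1 << 20), b""):
        digest.update(chunk)

# both expected values come from the LFS pointer above
assert os.path.getsize(path) == 437958648
assert digest.hexdigest() == "ae65ebbec6562c9badc6ae4bd5be5d1b55536ec840658c1d2821d0c6258b7146"
```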
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": true, "model_max_length": 512}
vocab.txt ADDED
The diff for this file is too large to render. See raw diff