Update spaCy pipeline
- README.md +16 -7
- config.cfg +17 -7
- meta.json +19 -10
- relationFactory.py +152 -17
- relation_extractor/cfg +2 -2
- relation_extractor/model +0 -0
- ru_patents_rel-any-py3-none-any.whl +2 -2
- transformer/cfg +1 -1
- transformer/model +2 -2
- vocab/strings.json +0 -0
README.md CHANGED
@@ -11,7 +11,7 @@ model-index:
 | --- | --- |
 | **Name** | `ru_patents_rel` |
 | **Version** | `1.0.0` |
-| **spaCy** | `>=3.8.
+| **spaCy** | `>=3.8.5,<3.9.0` |
 | **Default Pipeline** | `transformer`, `relation_extractor` |
 | **Components** | `transformer`, `relation_extractor` |
 | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
@@ -27,7 +27,7 @@ model-index:
 | Component | Labels |
 | --- | --- |
-| **`relation_extractor`** | `PART-OF`, `LOCATED-AT`, `CONNECTED-WITH`, `
+| **`relation_extractor`** | `PART-OF`, `LOCATED-AT`, `CONNECTED-WITH`, `IN-MANNER-OF`, `ATTRIBUTE-FOR` |

 </details>

@@ -35,8 +35,17 @@ model-index:
 | Type | Score |
 | --- | --- |
-| `REL_MICRO_P` |
-| `REL_MICRO_R` |
-| `REL_MICRO_F` |
-| `
-| `
+| `REL_MICRO_P` | 56.34 |
+| `REL_MICRO_R` | 21.41 |
+| `REL_MICRO_F` | 31.03 |
+| `REL_MACRO_F` | 22.09 |
+| `REL_WEIGHTED_F` | 29.80 |
+| `F1_PART-OF` | 46.48 |
+| `F1_LOCATED-AT` | 20.86 |
+| `F1_CONNECTED-WITH` | 13.81 |
+| `F1_IN-MANNER-OF` | 11.96 |
+| `F1_ATTRIBUTE-FOR` | 17.36 |
+| `F1_MACRO` | 0.00 |
+| `F1_WEIGHTED` | 0.00 |
+| `TRANSFORMER_LOSS` | 0.77 |
+| `RELATION_EXTRACTOR_LOSS` | 111.45 |
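The updated README advertises the packaged pipeline (`transformer` + `relation_extractor`) with the five relation labels. A minimal usage sketch follows; it assumes the bundled wheel is installed and that the package registers the custom `relation_extractor` factory from `relationFactory.py` on import. The example sentence and the 0.5 cutoff (taken from `config.cfg`) are illustrative only.

```python
# Minimal usage sketch (assumes: pip install ru_patents_rel-any-py3-none-any.whl,
# and that the custom "relation_extractor" factory is registered when the
# package is loaded).
import spacy

nlp = spacy.load("ru_patents_rel")  # pipeline: transformer, relation_extractor
doc = nlp("Крышка корпуса соединена с основанием устройства.")  # illustrative text

# The component writes label scores for entity pairs to the custom doc._.rel extension.
for pair, label_scores in doc._.rel.items():
    for label, score in label_scores.items():
        if score >= 0.5:  # same threshold as in config.cfg
            print(pair, label, round(score, 2))
```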
config.cfg CHANGED
@@ -17,13 +17,14 @@ before_creation = null
 after_creation = null
 after_pipeline_creation = null
 tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
-batch_size =
+batch_size = 200
 vectors = {"@vectors":"spacy.Vectors.v1"}

 [components]

 [components.relation_extractor]
 factory = "relation_extractor"
+eval_frequency = ${training.eval_frequency}
 threshold = 0.5

 [components.relation_extractor.model]
@@ -40,7 +41,7 @@ pooling = {"@layers":"reduce_mean.v1"}

 [components.relation_extractor.model.create_instance_tensor.get_instances]
 @misc = "rel_instance_generator.v1"
-max_length =
+max_length = 100

 [components.relation_extractor.model.create_instance_tensor.tok2vec]
 @architectures = "spacy-transformers.TransformerListener.v1"
@@ -50,12 +51,12 @@ upstream = "*"

 [components.transformer]
 factory = "transformer"
-max_batch_items =
+max_batch_items = 2096
 set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}

 [components.transformer.model]
 @architectures = "spacy-transformers.TransformerModel.v3"
-name = "
+name = "DeepPavlov/rubert-base-cased"
 mixed_precision = false

 [components.transformer.model.get_spans]
@@ -87,8 +88,8 @@ dropout = 0.2
 accumulate_gradient = 1
 patience = 1600000
 max_epochs = 0
-max_steps =
-eval_frequency =
+max_steps = 5000
+eval_frequency = 50
 frozen_components = []
 dev_corpus = "corpora.dev"
 train_corpus = "corpora.train"
@@ -126,7 +127,16 @@ initial_rate = 0.00005
 [training.score_weights]
 rel_micro_p = 0.0
 rel_micro_r = 0.0
-rel_micro_f = 1
+rel_micro_f = 0.1
+rel_macro_f = 0.1
+rel_weighted_f = 0.1
+f1_PART-OF = 0.1
+f1_LOCATED-AT = 0.1
+f1_CONNECTED-WITH = 0.1
+f1_IN-MANNER-OF = 0.1
+f1_ATTRIBUTE-FOR = 0.1
+f1_macro = 0.1
+f1_weighted = 0.1

 [pretraining]
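Because the component's `eval_frequency` is now set through variable interpolation, it always tracks `training.eval_frequency` when the config is loaded. A small sketch of how the interpolated values resolve, assuming this `config.cfg` sits in the working directory:

```python
# Sketch: parse the updated config with spaCy's config loader (no pipeline
# construction, so the custom factory does not need to be registered here).
import spacy

cfg = spacy.util.load_config("config.cfg", interpolate=True)

print(cfg["training"]["max_steps"])                               # 5000
print(cfg["training"]["eval_frequency"])                          # 50
print(cfg["components"]["relation_extractor"]["eval_frequency"])  # 50, via ${training.eval_frequency}
print(cfg["components"]["transformer"]["model"]["name"])          # DeepPavlov/rubert-base-cased
```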
meta.json CHANGED
@@ -7,8 +7,8 @@
 "email":"",
 "url":"",
 "license":"",
-"spacy_version":">=3.8.
-"spacy_git_version":"
+"spacy_version":">=3.8.5,<3.9.0",
+"spacy_git_version":"d0c705c",
 "vectors":{
 "width":0,
 "vectors":0,
@@ -23,8 +23,8 @@
 "PART-OF",
 "LOCATED-AT",
 "CONNECTED-WITH",
-"
-"
+"IN-MANNER-OF",
+"ATTRIBUTE-FOR"
 ]
 },
 "pipeline":[
@@ -39,14 +39,23 @@
 ],
 "performance":{
-"rel_micro_p":0.
-"rel_micro_r":0.
-"rel_micro_f":0.
-"
-"
+"rel_micro_p":0.5634422111,
+"rel_micro_r":0.2141322511,
+"rel_micro_f":0.3103269331,
+"rel_macro_f":0.2209389374,
+"rel_weighted_f":0.2980346898,
+"f1_PART-OF":0.4647938709,
+"f1_LOCATED-AT":0.2086049544,
+"f1_CONNECTED-WITH":0.1381294964,
+"f1_IN-MANNER-OF":0.1195652174,
+"f1_ATTRIBUTE-FOR":0.1736011478,
+"f1_macro":0.0,
+"f1_weighted":0.0,
+"transformer_loss":0.0076538723,
+"relation_extractor_loss":1.1145009976
 },
 "requirements":[
 "spacy-transformers>=1.3.8,<1.4.0",
-"spacy>=3.8.
+"spacy>=3.8.5,<3.9.0"
 ]
 }
relationFactory.py CHANGED
@@ -20,6 +20,15 @@ from spacy.tokens import Doc, Span
 from thinc.types import Floats2d, Ints1d, Ragged, cast
 from thinc.api import Model, Linear, chain, Logistic

+import json
+import os
+import time
+from pathlib import Path
+
+from sklearn.metrics import precision_recall_fscore_support, f1_score
+import plotly.express as px
+import plotly.graph_objects as go
+
 @spacy.registry.architectures("rel_model.v1")
 def create_relation_model(
     create_instance_tensor: Model[List[Doc], Floats2d],
@@ -123,7 +132,6 @@ def instance_init(model: Model, X: List[Doc] = None, Y: Floats2d = None) -> Model
     tok2vec.initialize(X)
     return model

-
 Doc.set_extension("rel", default={}, force=True)
 msg = Printer()

@@ -133,16 +141,23 @@ msg = Printer()
     requires=["doc.ents", "token.ent_iob", "token.ent_type"],
     assigns=["doc._.rel"],
     default_score_weights={
-        "rel_micro_p":
-        "rel_micro_r":
-        "rel_micro_f":
+        "rel_micro_p": 0.0,
+        "rel_micro_r": 0.0,
+        "rel_micro_f": 1.0,
+        "rel_macro_f": 1.0,
+        "rel_weighted_f": 1.0,
+        "f1_PART-OF": 1.0,
+        "f1_LOCATED-AT": 1.0,
+        "f1_CONNECTED-WITH": 1.0,
+        "f1_IN-MANNER-OF": 1.0,
+        "f1_ATTRIBUTE-FOR": 1.0
     },
 )
 def make_relation_extractor(
-    nlp: Language, name: str, model: Model, *, threshold: float
+    nlp: Language, name: str, model: Model, eval_frequency, *, threshold: float
 ):
     """Construct a RelationExtractor component."""
-    return RelationExtractor(nlp.vocab, model, name, threshold=threshold)
+    return RelationExtractor(nlp.vocab, model, name, threshold=threshold, eval_frequency=eval_frequency)


 class RelationExtractor(TrainablePipe):
@@ -153,12 +168,18 @@ class RelationExtractor(TrainablePipe):
         name: str = "rel",
         *,
         threshold: float,
+        eval_frequency = 100
     ) -> None:
         """Initialize a relation extractor."""
         self.vocab = vocab
         self.model = model
         self.name = name
         self.cfg = {"labels": [], "threshold": threshold}
+        self.eval_frequency = eval_frequency
+        self.start_learning_time = None
+        self.metric_history = []
+        self.max_f1 = 0
+        self.max_f1_step = 0

     @property
     def labels(self) -> Tuple[str]:
@@ -249,6 +270,17 @@ class RelationExtractor(TrainablePipe):
         self.set_annotations(docs, predictions)
         return losses

+    def get_focal_loss(self, examples: Iterable[Example], scores, gamma=3.0, alpha=0.25, eps=1e-8) -> Tuple[float, float]:
+        truths = self._examples_to_truth(examples)
+        scores_2 = numpy.clip(scores, eps, 1. - eps)
+        p_t = numpy.clip(scores_2 * truths + (1 - scores_2) * (1 - truths), eps, 1. - eps)
+
+        focal_loss = -(1 - p_t) ** gamma * numpy.log(p_t)
+        loss = numpy.mean(numpy.sum(focal_loss, axis=1))
+        gradient = focal_loss * (1 - 2 * truths)
+        return float(loss), gradient
+
+
     def get_loss(self, examples: Iterable[Example], scores) -> Tuple[float, float]:
         """Find the loss and gradient of loss for the batch of documents and
         their predicted scores."""
@@ -308,28 +340,131 @@ class RelationExtractor(TrainablePipe):

     def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
         """Score a batch of examples."""
-
+        scores = score_relations(examples, self.threshold)
+
+        tmp_scores = scores.copy()
+        tmp_scores["step"] = len(self.metric_history) * self.eval_frequency
+        if tmp_scores["rel_macro_f"] > self.max_f1:
+            self.max_f1 = tmp_scores["rel_macro_f"]
+            self.max_f1_step = tmp_scores["step"]
+        self.metric_history.append(tmp_scores)
+
+        return scores
+
+    def preprocess_metric_history(self):
+        result = {
+            "metric_name": [],
+            "metric_value": [],
+            "step": []
+        }
+        for cur_metrics in self.metric_history:
+            cur_step = cur_metrics["step"]
+            for key, value in cur_metrics.items():
+                if key != "step" and isinstance(value, float):
+                    result["metric_name"].append(key)
+                    result["metric_value"].append(value)
+                    result["step"].append(cur_step)
+        return result
+
+    def save_metrics_history(self, path):
+        if self.start_learning_time is None:
+            self.start_learning_time = time.monotonic()
+
+        if self.metric_history:
+
+            metrics_history_to_save = self.preprocess_metric_history()
+            fig = px.line(metrics_history_to_save, x="step", y="metric_value", color="metric_name")
+            for trace in fig.data:
+                if trace.name in ["rel_micro_f", "rel_macro_f", "rel_weighted_f"]:
+                    trace.line.width = 6
+                else:
+                    trace.line.width = 1
+
+                idx = list(trace.x).index(self.max_f1_step)
+                highlight_y = list(trace.y)[idx]
+                line_color = trace.line.color
+                line_name = trace.name
+                fig.add_trace(go.Scatter(
+                    x=[self.max_f1_step], y=[highlight_y],
+                    mode='markers+text',
+                    marker=dict(
+                        color=line_color, size=10),
+                    text=[f"{round(highlight_y, 2)}"],
+                    textposition="top center",
+                    name=f"{line_name} best"
+                ))
+
+            current_time = time.monotonic()
+            current_time_of_training = current_time - self.start_learning_time
+            current_time_of_training_text = f"{int(current_time_of_training // 3600)} hrs {int(current_time_of_training % 3600) // 60} min {round(current_time_of_training % 60)} sec"
+
+            fig.update_layout(title = dict(
+                text="Training statistics",
+                subtitle=dict(
+                    text=f"Training time amounted to {current_time_of_training_text}",
+                    font=dict(color="gray", size=13),
+                )
+            ))
+
+            output_dir = os.path.join(str(path), "logs")
+            os.makedirs(output_dir, exist_ok=True)
+            fig_path = os.path.join(output_dir, "training_metrics.html")
+            json_path = os.path.join(output_dir, "training_metrics.json")
+            fig.write_html(fig_path)
+            with open(json_path, "w", encoding="utf-8") as f:
+                json.dump({
+                    "data": metrics_history_to_save,
+                    "train_time_s": current_time_of_training
+                }, f, indent=2, ensure_ascii=False)
+
+    def to_disk(self, path, *args, **kwargs):
+        super().to_disk(path, *args, **kwargs)
+        output_dir = Path(path)
+        output_dir_metrics = output_dir.parent.parent
+        self.save_metrics_history(output_dir_metrics)


 def score_relations(examples: Iterable[Example], threshold: float) -> Dict[str, Any]:
     """Score a batch of examples."""
-
+
+    y_true = []
+    y_pred = []
     for example in examples:
         gold = example.reference._.rel
         pred = example.predicted._.rel
         for key, pred_dict in pred.items():
-            gold_labels =
+            gold_labels = {k for (k, v) in gold.get(key, {}).items() if v == 1.0}
             for k, v in pred_dict.items():
                 if v >= threshold:
                     if k in gold_labels:
-
+                        y_true.append(k)
+                        y_pred.append(k)
                     else:
-
+                        y_true.append("O")
+                        y_pred.append(k)
                 else:
                     if k in gold_labels:
-
-
-
-
-
-
+                        y_true.append(k)
+                        y_pred.append("O")
+
+
+    labels = sorted({label for label in y_true if label != "O"})
+
+    precision, recall, f1, support = precision_recall_fscore_support(
+        y_true, y_pred, labels=labels, zero_division=0, average=None
+    )
+    result = {}
+    for l, p, r, f in zip(labels, precision, recall, f1):
+        result[f"f1_{l}"] = f
+
+    p, r, f1_micro, _ = precision_recall_fscore_support(
+        y_true, y_pred, labels=labels, zero_division=0, average="micro", beta=1
+    )
+
+    result["rel_micro_p"] = p
+    result["rel_micro_r"] = r
+    result["rel_micro_f"] = f1_micro
+    result["rel_macro_f"] = f1_score(y_true, y_pred, average="macro", labels=labels, zero_division=0)
+    result["rel_weighted_f"] = f1_score(y_true, y_pred, average="weighted", labels=labels, zero_division=0)
+
+    return result
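The new `get_focal_loss` method sits alongside the existing `get_loss` and down-weights cells the model already classifies confidently. A standalone sketch of the same arithmetic on toy arrays; the array values below are made up, while `gamma` and `eps` mirror the defaults in the diff:

```python
# Standalone sketch of the focal-loss arithmetic added in get_focal_loss,
# run on illustrative toy arrays (not taken from the model).
import numpy

def focal_loss(scores, truths, gamma=3.0, eps=1e-8):
    scores = numpy.clip(scores, eps, 1.0 - eps)
    # p_t: probability assigned to the correct class of each (instance, label) cell
    p_t = numpy.clip(scores * truths + (1 - scores) * (1 - truths), eps, 1.0 - eps)
    per_cell = -(1 - p_t) ** gamma * numpy.log(p_t)
    loss = numpy.mean(numpy.sum(per_cell, axis=1))
    gradient = per_cell * (1 - 2 * truths)  # sign flips for gold-positive cells, as in the diff
    return float(loss), gradient

scores = numpy.array([[0.9, 0.2, 0.7],
                      [0.1, 0.8, 0.4]])    # predicted probabilities per relation label
truths = numpy.array([[1.0, 0.0, 1.0],
                      [0.0, 1.0, 0.0]])    # gold 0/1 targets
loss, grad = focal_loss(scores, truths)
print(round(loss, 4), grad.shape)          # scalar loss and a (2, 3) gradient
```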
relation_extractor/cfg CHANGED
@@ -3,8 +3,8 @@
 "PART-OF",
 "LOCATED-AT",
 "CONNECTED-WITH",
-"
-"
+"IN-MANNER-OF",
+"ATTRIBUTE-FOR"
 ],
 "threshold":0.5
 }
relation_extractor/model CHANGED
Binary files a/relation_extractor/model and b/relation_extractor/model differ
ru_patents_rel-any-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:01a4762ef635162b3d38a964bf53b46c50d350f75b69c96116caecc4e4660464
+size 661156608
transformer/cfg CHANGED
@@ -1,3 +1,3 @@
 {
-"max_batch_items":
+"max_batch_items":2096
 }
transformer/model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:96e76a1b01f5978e003b6acf1e0f20cc51201f638c85fb239e6819b9a444b6f8
+size 716719271
vocab/strings.json CHANGED
The diff for this file is too large to render.