Delicalib committed
Commit 5e63477 · verified · 1 Parent(s): 935c348

Update spaCy pipeline

README.md CHANGED
@@ -11,7 +11,7 @@ model-index:
 | --- | --- |
 | **Name** | `ru_patents_rel` |
 | **Version** | `1.0.0` |
-| **spaCy** | `>=3.8.4,<3.9.0` |
+| **spaCy** | `>=3.8.5,<3.9.0` |
 | **Default Pipeline** | `transformer`, `relation_extractor` |
 | **Components** | `transformer`, `relation_extractor` |
 | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
@@ -27,7 +27,7 @@ model-index:
 
 | Component | Labels |
 | --- | --- |
-| **`relation_extractor`** | `PART-OF`, `LOCATED-AT`, `CONNECTED-WITH`, `ATTRIBUTE-FOR`, `IN-MANNER-OF` |
+| **`relation_extractor`** | `PART-OF`, `LOCATED-AT`, `CONNECTED-WITH`, `IN-MANNER-OF`, `ATTRIBUTE-FOR` |
 
 </details>
 
@@ -35,8 +35,17 @@ model-index:
 
 | Type | Score |
 | --- | --- |
-| `REL_MICRO_P` | 67.16 |
-| `REL_MICRO_R` | 28.99 |
-| `REL_MICRO_F` | 40.50 |
-| `TRANSFORMER_LOSS` | 1.05 |
-| `RELATION_EXTRACTOR_LOSS` | 3137.42 |
+| `REL_MICRO_P` | 56.34 |
+| `REL_MICRO_R` | 21.41 |
+| `REL_MICRO_F` | 31.03 |
+| `REL_MACRO_F` | 22.09 |
+| `REL_WEIGHTED_F` | 29.80 |
+| `F1_PART-OF` | 46.48 |
+| `F1_LOCATED-AT` | 20.86 |
+| `F1_CONNECTED-WITH` | 13.81 |
+| `F1_IN-MANNER-OF` | 11.96 |
+| `F1_ATTRIBUTE-FOR` | 17.36 |
+| `F1_MACRO` | 0.00 |
+| `F1_WEIGHTED` | 0.00 |
+| `TRANSFORMER_LOSS` | 0.77 |
+| `RELATION_EXTRACTOR_LOSS` | 111.45 |
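
Usage sketch (not from this commit): the model card above only lists components and scores, so the snippet below shows how the packaged pipeline might be loaded and queried. It assumes the wheel is installed, the custom `relation_extractor` factory from `relationFactory.py` is importable, and `doc._.rel` follows the `{(head_ent_start, child_ent_start): {label: score}}` layout of spaCy's relation-extraction tutorial; the entity label `TERM` and the example sentence are purely illustrative. Because the pipeline ships only `transformer` and `relation_extractor`, entities have to be set by an upstream step before relations can be predicted.

```python
# Minimal sketch, not from the repo: load the packaged pipeline and read relations.
import spacy
from spacy.tokens import Span

import relationFactory  # assumed import path; registers the custom "relation_extractor" factory

nlp = spacy.load("ru_patents_rel")                    # transformer + relation_extractor
doc = nlp.make_doc("Ротор соединен с валом.")         # "The rotor is connected to the shaft."
doc.ents = [Span(doc, 0, 1, label="TERM"),            # "TERM" is an illustrative entity label
            Span(doc, 3, 4, label="TERM")]
doc = nlp(doc)                                        # run the pipeline on the pre-annotated Doc

# Assumed layout (spaCy rel tutorial): (head_ent_start, child_ent_start) -> {label: score}
for (head, child), scores in doc._.rel.items():
    label, score = max(scores.items(), key=lambda kv: kv[1])
    if score >= 0.5:                                  # threshold from config.cfg
        print(doc[head].text, "->", doc[child].text, label, round(score, 2))
```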
config.cfg CHANGED
@@ -17,13 +17,14 @@ before_creation = null
 after_creation = null
 after_pipeline_creation = null
 tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
-batch_size = 300
+batch_size = 200
 vectors = {"@vectors":"spacy.Vectors.v1"}
 
 [components]
 
 [components.relation_extractor]
 factory = "relation_extractor"
+eval_frequency = ${training.eval_frequency}
 threshold = 0.5
 
 [components.relation_extractor.model]
@@ -40,7 +41,7 @@ pooling = {"@layers":"reduce_mean.v1"}
 
 [components.relation_extractor.model.create_instance_tensor.get_instances]
 @misc = "rel_instance_generator.v1"
-max_length = 200
+max_length = 100
 
 [components.relation_extractor.model.create_instance_tensor.tok2vec]
 @architectures = "spacy-transformers.TransformerListener.v1"
@@ -50,12 +51,12 @@ upstream = "*"
 
 [components.transformer]
 factory = "transformer"
-max_batch_items = 4096
+max_batch_items = 2096
 set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
 
 [components.transformer.model]
 @architectures = "spacy-transformers.TransformerModel.v3"
-name = "ai-forever/ruRoberta-large"
+name = "DeepPavlov/rubert-base-cased"
 mixed_precision = false
 
 [components.transformer.model.get_spans]
@@ -87,8 +88,8 @@ dropout = 0.2
 accumulate_gradient = 1
 patience = 1600000
 max_epochs = 0
-max_steps = 20000
-eval_frequency = 100
+max_steps = 5000
+eval_frequency = 50
 frozen_components = []
 dev_corpus = "corpora.dev"
 train_corpus = "corpora.train"
@@ -126,7 +127,16 @@ initial_rate = 0.00005
 [training.score_weights]
 rel_micro_p = 0.0
 rel_micro_r = 0.0
-rel_micro_f = 1.0
+rel_micro_f = 0.1
+rel_macro_f = 0.1
+rel_weighted_f = 0.1
+f1_PART-OF = 0.1
+f1_LOCATED-AT = 0.1
+f1_CONNECTED-WITH = 0.1
+f1_IN-MANNER-OF = 0.1
+f1_ATTRIBUTE-FOR = 0.1
+f1_macro = 0.1
+f1_weighted = 0.1
 
 [pretraining]
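
A note on the new `[training.score_weights]` block (not from this commit): spaCy folds the weighted metrics into the single `score` used to pick the best checkpoint, so ten weights of 0.1 make that score the mean of the ten listed metrics. The sketch below reproduces that combination with the dev values from `meta.json`; the updated scorer emits `rel_macro_f`/`rel_weighted_f` rather than `f1_macro`/`f1_weighted`, which would explain the 0.00 reported for those two entries.

```python
# Sketch of the combined "score" under the new weights, assuming spaCy's usual
# weighted-sum combination of [training.score_weights].
weights = {
    "rel_micro_f": 0.1, "rel_macro_f": 0.1, "rel_weighted_f": 0.1,
    "f1_PART-OF": 0.1, "f1_LOCATED-AT": 0.1, "f1_CONNECTED-WITH": 0.1,
    "f1_IN-MANNER-OF": 0.1, "f1_ATTRIBUTE-FOR": 0.1,
    "f1_macro": 0.1, "f1_weighted": 0.1,
}
metrics = {                       # dev metrics reported in meta.json "performance"
    "rel_micro_f": 0.3103, "rel_macro_f": 0.2209, "rel_weighted_f": 0.2980,
    "f1_PART-OF": 0.4648, "f1_LOCATED-AT": 0.2086, "f1_CONNECTED-WITH": 0.1381,
    "f1_IN-MANNER-OF": 0.1196, "f1_ATTRIBUTE-FOR": 0.1736,
    "f1_macro": 0.0, "f1_weighted": 0.0,   # keys the scorer never emits, hence 0.0
}
score = sum(w * metrics[name] for name, w in weights.items())
print(round(score, 4))  # ~0.1934
```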
meta.json CHANGED
@@ -7,8 +7,8 @@
 "email":"",
 "url":"",
 "license":"",
-"spacy_version":">=3.8.4,<3.9.0",
-"spacy_git_version":"85cc763",
+"spacy_version":">=3.8.5,<3.9.0",
+"spacy_git_version":"d0c705c",
 "vectors":{
 "width":0,
 "vectors":0,
@@ -23,8 +23,8 @@
 "PART-OF",
 "LOCATED-AT",
 "CONNECTED-WITH",
-"ATTRIBUTE-FOR",
-"IN-MANNER-OF"
+"IN-MANNER-OF",
+"ATTRIBUTE-FOR"
 ]
 },
 "pipeline":[
@@ -39,14 +39,23 @@
 
 ],
 "performance":{
-"rel_micro_p":0.6715583508,
-"rel_micro_r":0.2898944193,
-"rel_micro_f":0.4049726085,
-"transformer_loss":0.0105294202,
-"relation_extractor_loss":31.3741520271
+"rel_micro_p":0.5634422111,
+"rel_micro_r":0.2141322511,
+"rel_micro_f":0.3103269331,
+"rel_macro_f":0.2209389374,
+"rel_weighted_f":0.2980346898,
+"f1_PART-OF":0.4647938709,
+"f1_LOCATED-AT":0.2086049544,
+"f1_CONNECTED-WITH":0.1381294964,
+"f1_IN-MANNER-OF":0.1195652174,
+"f1_ATTRIBUTE-FOR":0.1736011478,
+"f1_macro":0.0,
+"f1_weighted":0.0,
+"transformer_loss":0.0076538723,
+"relation_extractor_loss":1.1145009976
 },
 "requirements":[
 "spacy-transformers>=1.3.8,<1.4.0",
-"spacy>=3.8.4,<3.9.0"
+"spacy>=3.8.5,<3.9.0"
 ]
 }
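
For reference (not from this commit): the `performance` block stores the same numbers as the README table, only as fractions rather than percentages, and it is available at runtime on the loaded pipeline. A small sketch, assuming the wheel is installed and that the labels dict is keyed by component name:

```python
# Small sketch: reading meta.json values from the loaded pipeline.
import spacy

nlp = spacy.load("ru_patents_rel")
print(nlp.meta["spacy_version"])                 # ">=3.8.5,<3.9.0"
print(nlp.meta["performance"]["rel_micro_f"])    # 0.3103... -> 31.03 in the README
print(nlp.meta["labels"]["relation_extractor"])  # assumed key; ['PART-OF', 'LOCATED-AT', ...]
```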
relationFactory.py CHANGED
@@ -20,6 +20,15 @@ from spacy.tokens import Doc, Span
 from thinc.types import Floats2d, Ints1d, Ragged, cast
 from thinc.api import Model, Linear, chain, Logistic
 
+import json
+import os
+import time
+from pathlib import Path
+
+from sklearn.metrics import precision_recall_fscore_support, f1_score
+import plotly.express as px
+import plotly.graph_objects as go
+
 @spacy.registry.architectures("rel_model.v1")
 def create_relation_model(
     create_instance_tensor: Model[List[Doc], Floats2d],
@@ -123,7 +132,6 @@ def instance_init(model: Model, X: List[Doc] = None, Y: Floats2d = None) -> Model:
     tok2vec.initialize(X)
     return model
 
-
 Doc.set_extension("rel", default={}, force=True)
 msg = Printer()
 
@@ -133,16 +141,23 @@ msg = Printer()
     requires=["doc.ents", "token.ent_iob", "token.ent_type"],
     assigns=["doc._.rel"],
     default_score_weights={
-        "rel_micro_p": None,
-        "rel_micro_r": None,
-        "rel_micro_f": None,
+        "rel_micro_p": 0.0,
+        "rel_micro_r": 0.0,
+        "rel_micro_f": 1.0,
+        "rel_macro_f": 1.0,
+        "rel_weighted_f": 1.0,
+        "f1_PART-OF": 1.0,
+        "f1_LOCATED-AT": 1.0,
+        "f1_CONNECTED-WITH": 1.0,
+        "f1_IN-MANNER-OF": 1.0,
+        "f1_ATTRIBUTE-FOR": 1.0
     },
 )
 def make_relation_extractor(
-    nlp: Language, name: str, model: Model, *, threshold: float
+    nlp: Language, name: str, model: Model, eval_frequency, *, threshold: float
 ):
     """Construct a RelationExtractor component."""
-    return RelationExtractor(nlp.vocab, model, name, threshold=threshold)
+    return RelationExtractor(nlp.vocab, model, name, threshold=threshold, eval_frequency=eval_frequency)
 
 
 class RelationExtractor(TrainablePipe):
@@ -153,12 +168,18 @@ class RelationExtractor(TrainablePipe):
         name: str = "rel",
         *,
         threshold: float,
+        eval_frequency = 100
     ) -> None:
         """Initialize a relation extractor."""
        self.vocab = vocab
        self.model = model
        self.name = name
        self.cfg = {"labels": [], "threshold": threshold}
+        self.eval_frequency = eval_frequency
+        self.start_learning_time = None
+        self.metric_history = []
+        self.max_f1 = 0
+        self.max_f1_step = 0
 
     @property
     def labels(self) -> Tuple[str]:
@@ -249,6 +270,17 @@ class RelationExtractor(TrainablePipe):
         self.set_annotations(docs, predictions)
         return losses
 
+    def get_focal_loss(self, examples: Iterable[Example], scores, gamma=3.0, alpha=0.25, eps=1e-8) -> Tuple[float, float]:
+        truths = self._examples_to_truth(examples)
+        scores_2 = numpy.clip(scores, eps, 1. - eps)
+        p_t = numpy.clip(scores_2 * truths + (1 - scores_2) * (1 - truths), eps, 1. - eps)
+
+        focal_loss = -(1 - p_t) ** gamma * numpy.log(p_t)
+        loss = numpy.mean(numpy.sum(focal_loss, axis=1))
+        gradient = focal_loss * (1 - 2 * truths)
+        return float(loss), gradient
+
+
     def get_loss(self, examples: Iterable[Example], scores) -> Tuple[float, float]:
         """Find the loss and gradient of loss for the batch of documents and
         their predicted scores."""
@@ -308,28 +340,131 @@ class RelationExtractor(TrainablePipe):
 
     def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
         """Score a batch of examples."""
-        return score_relations(examples, self.threshold)
+        scores = score_relations(examples, self.threshold)
+
+        tmp_scores = scores.copy()
+        tmp_scores["step"] = len(self.metric_history) * self.eval_frequency
+        if tmp_scores["rel_macro_f"] > self.max_f1:
+            self.max_f1 = tmp_scores["rel_macro_f"]
+            self.max_f1_step = tmp_scores["step"]
+        self.metric_history.append(tmp_scores)
+
+        return scores
+
+    def preprocess_metric_history(self):
+        result = {
+            "metric_name": [],
+            "metric_value": [],
+            "step": []
+        }
+        for cur_metrics in self.metric_history:
+            cur_step = cur_metrics["step"]
+            for key, value in cur_metrics.items():
+                if key != "step" and isinstance(value, float):
+                    result["metric_name"].append(key)
+                    result["metric_value"].append(value)
+                    result["step"].append(cur_step)
+        return result
+
+    def save_metrics_history(self, path):
+        if self.start_learning_time is None:
+            self.start_learning_time = time.monotonic()
+
+        if self.metric_history:
+
+            metrics_history_to_save = self.preprocess_metric_history()
+            fig = px.line(metrics_history_to_save, x="step", y="metric_value", color="metric_name")
+            for trace in fig.data:
+                if trace.name in ["rel_micro_f", "rel_macro_f", "rel_weighted_f"]:
+                    trace.line.width = 6
+                else:
+                    trace.line.width = 1
+
+                idx = list(trace.x).index(self.max_f1_step)
+                highlight_y = list(trace.y)[idx]
+                line_color = trace.line.color
+                line_name = trace.name
+                fig.add_trace(go.Scatter(
+                    x=[self.max_f1_step], y=[highlight_y],
+                    mode='markers+text',
+                    marker=dict(
+                        color=line_color, size=10),
+                    text=[f"{round(highlight_y, 2)}"],
+                    textposition="top center",
+                    name=f"{line_name} best"
+                ))
+
+            current_time = time.monotonic()
+            current_time_of_training = current_time - self.start_learning_time
+            current_time_of_training_text = f"{int(current_time_of_training // 3600)} hrs {int(current_time_of_training % 3600) // 60} min {round(current_time_of_training % 60)} sec"
+
+            fig.update_layout(title = dict(
+                text="Training statistics",
+                subtitle=dict(
+                    text=f"Training time amounted to {current_time_of_training_text}",
+                    font=dict(color="gray", size=13),
+                )
+            ))
+
+            output_dir = os.path.join(str(path), "logs")
+            os.makedirs(output_dir, exist_ok=True)
+            fig_path = os.path.join(output_dir, "training_metrics.html")
+            json_path = os.path.join(output_dir, "training_metrics.json")
+            fig.write_html(fig_path)
+            with open(json_path, "w", encoding="utf-8") as f:
+                json.dump({
+                    "data": metrics_history_to_save,
+                    "train_time_s": current_time_of_training
+                }, f, indent=2, ensure_ascii=False)
+
+    def to_disk(self, path, *args, **kwargs):
+        super().to_disk(path, *args, **kwargs)
+        output_dir = Path(path)
+        output_dir_metrics = output_dir.parent.parent
+        self.save_metrics_history(output_dir_metrics)
 
 
 def score_relations(examples: Iterable[Example], threshold: float) -> Dict[str, Any]:
     """Score a batch of examples."""
-    micro_prf = PRFScore()
+
+    y_true = []
+    y_pred = []
     for example in examples:
         gold = example.reference._.rel
         pred = example.predicted._.rel
         for key, pred_dict in pred.items():
-            gold_labels = [k for (k, v) in gold.get(key, {}).items() if v == 1.0]
+            gold_labels = {k for (k, v) in gold.get(key, {}).items() if v == 1.0}
             for k, v in pred_dict.items():
                 if v >= threshold:
                     if k in gold_labels:
-                        micro_prf.tp += 1
+                        y_true.append(k)
+                        y_pred.append(k)
                     else:
-                        micro_prf.fp += 1
+                        y_true.append("O")
+                        y_pred.append(k)
                 else:
                     if k in gold_labels:
-                        micro_prf.fn += 1
-    return {
-        "rel_micro_p": micro_prf.precision,
-        "rel_micro_r": micro_prf.recall,
-        "rel_micro_f": micro_prf.fscore,
-    }
+                        y_true.append(k)
+                        y_pred.append("O")
+
+
+    labels = sorted({label for label in y_true if label != "O"})
+
+    precision, recall, f1, support = precision_recall_fscore_support(
+        y_true, y_pred, labels=labels, zero_division=0, average=None
+    )
+    result = {}
+    for l, p, r, f in zip(labels, precision, recall, f1):
+        result[f"f1_{l}"] = f
+
+    p, r, f1_micro, _ = precision_recall_fscore_support(
+        y_true, y_pred, labels=labels, zero_division=0, average="micro", beta=1
+    )
+
+    result["rel_micro_p"] = p
+    result["rel_micro_r"] = r
+    result["rel_micro_f"] = f1_micro
+    result["rel_macro_f"] = f1_score(y_true, y_pred, average="macro", labels=labels, zero_division=0)
+    result["rel_weighted_f"] = f1_score(y_true, y_pred, average="weighted", labels=labels, zero_division=0)
+
+    return result
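
A brief aside on the new `get_focal_loss` method above (not from this commit): it applies the focal-loss shape FL(p_t) = -(1 - p_t)^gamma * log(p_t) to the label-probability matrix, so confidently correct cells contribute almost nothing to the loss; note the `alpha` argument is accepted but not used in the committed body. A standalone sketch of the same computation on a toy score matrix:

```python
# Standalone sketch of the focal-loss computation used in get_focal_loss
# (toy inputs; gamma=3.0 as in the committed default).
import numpy

def focal_loss(scores, truths, gamma=3.0, eps=1e-8):
    scores = numpy.clip(scores, eps, 1.0 - eps)
    # p_t is the probability assigned to the correct decision for each cell
    p_t = numpy.clip(scores * truths + (1 - scores) * (1 - truths), eps, 1.0 - eps)
    per_cell = -(1 - p_t) ** gamma * numpy.log(p_t)
    return float(numpy.mean(numpy.sum(per_cell, axis=1)))

scores = numpy.array([[0.9, 0.1, 0.6]])   # predicted probabilities for three relation labels
truths = numpy.array([[1.0, 0.0, 1.0]])   # gold 0/1 assignments
print(focal_loss(scores, truths))         # ~0.033: the confident cells (0.9, 0.1) contribute ~0
```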
relation_extractor/cfg CHANGED
@@ -3,8 +3,8 @@
 "PART-OF",
 "LOCATED-AT",
 "CONNECTED-WITH",
-"ATTRIBUTE-FOR",
-"IN-MANNER-OF"
+"IN-MANNER-OF",
+"ATTRIBUTE-FOR"
 ],
 "threshold":0.5
 }
relation_extractor/model CHANGED
Binary files a/relation_extractor/model and b/relation_extractor/model differ
 
ru_patents_rel-any-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:54dd6f43fca1c1cef88866cbbaa4dc8b2e450c19b3d11c0b5f3cdc893f96058f
-size 1321313598
+oid sha256:01a4762ef635162b3d38a964bf53b46c50d350f75b69c96116caecc4e4660464
+size 661156608
transformer/cfg CHANGED
@@ -1,3 +1,3 @@
 {
-"max_batch_items":4096
+"max_batch_items":2096
 }
transformer/model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9be3ac022e449e0ce8bfbb1551933557fc5bad91718decd70fdd0e00ac36aeb5
-size 1430062590
+oid sha256:96e76a1b01f5978e003b6acf1e0f20cc51201f638c85fb239e6819b9a444b6f8
+size 716719271
vocab/strings.json CHANGED
The diff for this file is too large to render. See raw diff