Delicalib committed
Commit 5e63477 · verified · 1 Parent(s): 935c348

Update spaCy pipeline

README.md CHANGED
@@ -11,7 +11,7 @@ model-index:
 | --- | --- |
 | **Name** | `ru_patents_rel` |
 | **Version** | `1.0.0` |
-| **spaCy** | `>=3.8.4,<3.9.0` |
+| **spaCy** | `>=3.8.5,<3.9.0` |
 | **Default Pipeline** | `transformer`, `relation_extractor` |
 | **Components** | `transformer`, `relation_extractor` |
 | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
@@ -27,7 +27,7 @@ model-index:
 
 | Component | Labels |
 | --- | --- |
-| **`relation_extractor`** | `PART-OF`, `LOCATED-AT`, `CONNECTED-WITH`, `ATTRIBUTE-FOR`, `IN-MANNER-OF` |
+| **`relation_extractor`** | `PART-OF`, `LOCATED-AT`, `CONNECTED-WITH`, `IN-MANNER-OF`, `ATTRIBUTE-FOR` |
 
 </details>
 
@@ -35,8 +35,17 @@ model-index:
 
 | Type | Score |
 | --- | --- |
-| `REL_MICRO_P` | 67.16 |
-| `REL_MICRO_R` | 28.99 |
-| `REL_MICRO_F` | 40.50 |
-| `TRANSFORMER_LOSS` | 1.05 |
-| `RELATION_EXTRACTOR_LOSS` | 3137.42 |
+| `REL_MICRO_P` | 56.34 |
+| `REL_MICRO_R` | 21.41 |
+| `REL_MICRO_F` | 31.03 |
+| `REL_MACRO_F` | 22.09 |
+| `REL_WEIGHTED_F` | 29.80 |
+| `F1_PART-OF` | 46.48 |
+| `F1_LOCATED-AT` | 20.86 |
+| `F1_CONNECTED-WITH` | 13.81 |
+| `F1_IN-MANNER-OF` | 11.96 |
+| `F1_ATTRIBUTE-FOR` | 17.36 |
+| `F1_MACRO` | 0.00 |
+| `F1_WEIGHTED` | 0.00 |
+| `TRANSFORMER_LOSS` | 0.77 |
+| `RELATION_EXTRACTOR_LOSS` | 111.45 |
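
Usage sketch (not from this commit): the model card above only lists components and scores, so the snippet below shows how the packaged pipeline might be loaded and queried. It assumes the wheel is installed, the custom `relation_extractor` factory from `relationFactory.py` is importable, and `doc._.rel` follows the `{(head_ent_start, child_ent_start): {label: score}}` layout of spaCy's relation-extraction tutorial; the entity label `TERM` and the example sentence are purely illustrative. Because the pipeline ships only `transformer` and `relation_extractor`, entities have to be set by an upstream step before relations can be predicted.

```python
# Minimal sketch, not from the repo: load the packaged pipeline and read relations.
import spacy
from spacy.tokens import Span

import relationFactory  # assumed import path; registers the custom "relation_extractor" factory

nlp = spacy.load("ru_patents_rel")                    # transformer + relation_extractor
doc = nlp.make_doc("Ротор соединен с валом.")         # "The rotor is connected to the shaft."
doc.ents = [Span(doc, 0, 1, label="TERM"),            # "TERM" is an illustrative entity label
            Span(doc, 3, 4, label="TERM")]
doc = nlp(doc)                                        # run the pipeline on the pre-annotated Doc

# Assumed layout (spaCy rel tutorial): (head_ent_start, child_ent_start) -> {label: score}
for (head, child), scores in doc._.rel.items():
    label, score = max(scores.items(), key=lambda kv: kv[1])
    if score >= 0.5:                                  # threshold from config.cfg
        print(doc[head].text, "->", doc[child].text, label, round(score, 2))
```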
config.cfg CHANGED
@@ -17,13 +17,14 @@ before_creation = null
 after_creation = null
 after_pipeline_creation = null
 tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
-batch_size = 300
+batch_size = 200
 vectors = {"@vectors":"spacy.Vectors.v1"}
 
 [components]
 
 [components.relation_extractor]
 factory = "relation_extractor"
+eval_frequency = ${training.eval_frequency}
 threshold = 0.5
 
 [components.relation_extractor.model]
@@ -40,7 +41,7 @@ pooling = {"@layers":"reduce_mean.v1"}
 
 [components.relation_extractor.model.create_instance_tensor.get_instances]
 @misc = "rel_instance_generator.v1"
-max_length = 200
+max_length = 100
 
 [components.relation_extractor.model.create_instance_tensor.tok2vec]
 @architectures = "spacy-transformers.TransformerListener.v1"
@@ -50,12 +51,12 @@ upstream = "*"
 
 [components.transformer]
 factory = "transformer"
-max_batch_items = 4096
+max_batch_items = 2096
 set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
 
 [components.transformer.model]
 @architectures = "spacy-transformers.TransformerModel.v3"
-name = "ai-forever/ruRoberta-large"
+name = "DeepPavlov/rubert-base-cased"
 mixed_precision = false
 
 [components.transformer.model.get_spans]
@@ -87,8 +88,8 @@ dropout = 0.2
 accumulate_gradient = 1
 patience = 1600000
 max_epochs = 0
-max_steps = 20000
-eval_frequency = 100
+max_steps = 5000
+eval_frequency = 50
 frozen_components = []
 dev_corpus = "corpora.dev"
 train_corpus = "corpora.train"
@@ -126,7 +127,16 @@ initial_rate = 0.00005
 [training.score_weights]
 rel_micro_p = 0.0
 rel_micro_r = 0.0
-rel_micro_f = 1.0
+rel_micro_f = 0.1
+rel_macro_f = 0.1
+rel_weighted_f = 0.1
+f1_PART-OF = 0.1
+f1_LOCATED-AT = 0.1
+f1_CONNECTED-WITH = 0.1
+f1_IN-MANNER-OF = 0.1
+f1_ATTRIBUTE-FOR = 0.1
+f1_macro = 0.1
+f1_weighted = 0.1
 
 [pretraining]
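
A note on the new `[training.score_weights]` block (not from this commit): spaCy folds the weighted metrics into the single `score` used to pick the best checkpoint, so ten weights of 0.1 make that score the mean of the ten listed metrics. The sketch below reproduces that combination with the dev values from `meta.json`; the updated scorer emits `rel_macro_f`/`rel_weighted_f` rather than `f1_macro`/`f1_weighted`, which would explain the 0.00 reported for those two entries.

```python
# Sketch of the combined "score" under the new weights, assuming spaCy's usual
# weighted-sum combination of [training.score_weights].
weights = {
    "rel_micro_f": 0.1, "rel_macro_f": 0.1, "rel_weighted_f": 0.1,
    "f1_PART-OF": 0.1, "f1_LOCATED-AT": 0.1, "f1_CONNECTED-WITH": 0.1,
    "f1_IN-MANNER-OF": 0.1, "f1_ATTRIBUTE-FOR": 0.1,
    "f1_macro": 0.1, "f1_weighted": 0.1,
}
metrics = {                       # dev metrics reported in meta.json "performance"
    "rel_micro_f": 0.3103, "rel_macro_f": 0.2209, "rel_weighted_f": 0.2980,
    "f1_PART-OF": 0.4648, "f1_LOCATED-AT": 0.2086, "f1_CONNECTED-WITH": 0.1381,
    "f1_IN-MANNER-OF": 0.1196, "f1_ATTRIBUTE-FOR": 0.1736,
    "f1_macro": 0.0, "f1_weighted": 0.0,   # keys the scorer never emits, hence 0.0
}
score = sum(w * metrics[name] for name, w in weights.items())
print(round(score, 4))  # ~0.1934
```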
meta.json CHANGED
@@ -7,8 +7,8 @@
 "email":"",
 "url":"",
 "license":"",
-"spacy_version":">=3.8.4,<3.9.0",
-"spacy_git_version":"85cc763",
+"spacy_version":">=3.8.5,<3.9.0",
+"spacy_git_version":"d0c705c",
 "vectors":{
 "width":0,
 "vectors":0,
@@ -23,8 +23,8 @@
 "PART-OF",
 "LOCATED-AT",
 "CONNECTED-WITH",
-"ATTRIBUTE-FOR",
-"IN-MANNER-OF"
+"IN-MANNER-OF",
+"ATTRIBUTE-FOR"
 ]
 },
 "pipeline":[
@@ -39,14 +39,23 @@
 
 ],
 "performance":{
-"rel_micro_p":0.6715583508,
-"rel_micro_r":0.2898944193,
-"rel_micro_f":0.4049726085,
-"transformer_loss":0.0105294202,
-"relation_extractor_loss":31.3741520271
+"rel_micro_p":0.5634422111,
+"rel_micro_r":0.2141322511,
+"rel_micro_f":0.3103269331,
+"rel_macro_f":0.2209389374,
+"rel_weighted_f":0.2980346898,
+"f1_PART-OF":0.4647938709,
+"f1_LOCATED-AT":0.2086049544,
+"f1_CONNECTED-WITH":0.1381294964,
+"f1_IN-MANNER-OF":0.1195652174,
+"f1_ATTRIBUTE-FOR":0.1736011478,
+"f1_macro":0.0,
+"f1_weighted":0.0,
+"transformer_loss":0.0076538723,
+"relation_extractor_loss":1.1145009976
 },
 "requirements":[
 "spacy-transformers>=1.3.8,<1.4.0",
-"spacy>=3.8.4,<3.9.0"
+"spacy>=3.8.5,<3.9.0"
 ]
 }
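
For reference (not from this commit): the `performance` block stores the same numbers as the README table, only as fractions rather than percentages, and it is available at runtime on the loaded pipeline. A small sketch, assuming the wheel is installed and that the labels dict is keyed by component name:

```python
# Small sketch: reading meta.json values from the loaded pipeline.
import spacy

nlp = spacy.load("ru_patents_rel")
print(nlp.meta["spacy_version"])                 # ">=3.8.5,<3.9.0"
print(nlp.meta["performance"]["rel_micro_f"])    # 0.3103... -> 31.03 in the README
print(nlp.meta["labels"]["relation_extractor"])  # assumed key; ['PART-OF', 'LOCATED-AT', ...]
```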
relationFactory.py CHANGED
@@ -20,6 +20,15 @@ from spacy.tokens import Doc, Span
 from thinc.types import Floats2d, Ints1d, Ragged, cast
 from thinc.api import Model, Linear, chain, Logistic
 
+import json
+import os
+import time
+from pathlib import Path
+
+from sklearn.metrics import precision_recall_fscore_support, f1_score
+import plotly.express as px
+import plotly.graph_objects as go
+
 @spacy.registry.architectures("rel_model.v1")
 def create_relation_model(
     create_instance_tensor: Model[List[Doc], Floats2d],
@@ -123,7 +132,6 @@ def instance_init(model: Model, X: List[Doc] = None, Y: Floats2d = None) -> Model:
     tok2vec.initialize(X)
     return model
 
-
 Doc.set_extension("rel", default={}, force=True)
 msg = Printer()
 
@@ -133,16 +141,23 @@ msg = Printer()
     requires=["doc.ents", "token.ent_iob", "token.ent_type"],
     assigns=["doc._.rel"],
     default_score_weights={
-        "rel_micro_p": None,
-        "rel_micro_r": None,
-        "rel_micro_f": None,
+        "rel_micro_p": 0.0,
+        "rel_micro_r": 0.0,
+        "rel_micro_f": 1.0,
+        "rel_macro_f": 1.0,
+        "rel_weighted_f": 1.0,
+        "f1_PART-OF": 1.0,
+        "f1_LOCATED-AT": 1.0,
+        "f1_CONNECTED-WITH": 1.0,
+        "f1_IN-MANNER-OF": 1.0,
+        "f1_ATTRIBUTE-FOR": 1.0
     },
 )
 def make_relation_extractor(
-    nlp: Language, name: str, model: Model, *, threshold: float
+    nlp: Language, name: str, model: Model, eval_frequency, *, threshold: float
 ):
     """Construct a RelationExtractor component."""
-    return RelationExtractor(nlp.vocab, model, name, threshold=threshold)
+    return RelationExtractor(nlp.vocab, model, name, threshold=threshold, eval_frequency=eval_frequency)
 
 
 class RelationExtractor(TrainablePipe):
@@ -153,12 +168,18 @@ class RelationExtractor(TrainablePipe):
         name: str = "rel",
         *,
         threshold: float,
+        eval_frequency = 100
     ) -> None:
         """Initialize a relation extractor."""
        self.vocab = vocab
        self.model = model
        self.name = name
        self.cfg = {"labels": [], "threshold": threshold}
+        self.eval_frequency = eval_frequency
+        self.start_learning_time = None
+        self.metric_history = []
+        self.max_f1 = 0
+        self.max_f1_step = 0
 
     @property
     def labels(self) -> Tuple[str]:
@@ -249,6 +270,17 @@ class RelationExtractor(TrainablePipe):
         self.set_annotations(docs, predictions)
         return losses
 
+    def get_focal_loss(self, examples: Iterable[Example], scores, gamma=3.0, alpha=0.25, eps=1e-8) -> Tuple[float, float]:
+        truths = self._examples_to_truth(examples)
+        scores_2 = numpy.clip(scores, eps, 1. - eps)
+        p_t = numpy.clip(scores_2 * truths + (1 - scores_2) * (1 - truths), eps, 1. - eps)
+
+        focal_loss = -(1 - p_t) ** gamma * numpy.log(p_t)
+        loss = numpy.mean(numpy.sum(focal_loss, axis=1))
+        gradient = focal_loss * (1 - 2 * truths)
+        return float(loss), gradient
+
+
     def get_loss(self, examples: Iterable[Example], scores) -> Tuple[float, float]:
         """Find the loss and gradient of loss for the batch of documents and
         their predicted scores."""
@@ -308,28 +340,131 @@ class RelationExtractor(TrainablePipe):
 
     def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
         """Score a batch of examples."""
-        return score_relations(examples, self.threshold)
+        scores = score_relations(examples, self.threshold)
+
+        tmp_scores = scores.copy()
+        tmp_scores["step"] = len(self.metric_history) * self.eval_frequency
+        if tmp_scores["rel_macro_f"] > self.max_f1:
+            self.max_f1 = tmp_scores["rel_macro_f"]
+            self.max_f1_step = tmp_scores["step"]
+        self.metric_history.append(tmp_scores)
+
+        return scores
+
+    def preprocess_metric_history(self):
+        result = {
+            "metric_name": [],
+            "metric_value": [],
+            "step": []
+        }
+        for cur_metrics in self.metric_history:
+            cur_step = cur_metrics["step"]
+            for key, value in cur_metrics.items():
+                if key != "step" and isinstance(value, float):
+                    result["metric_name"].append(key)
+                    result["metric_value"].append(value)
+                    result["step"].append(cur_step)
+        return result
+
+    def save_metrics_history(self, path):
+        if self.start_learning_time is None:
+            self.start_learning_time = time.monotonic()
+
+        if self.metric_history:
+
+            metrics_history_to_save = self.preprocess_metric_history()
+            fig = px.line(metrics_history_to_save, x="step", y="metric_value", color="metric_name")
+            for trace in fig.data:
+                if trace.name in ["rel_micro_f", "rel_macro_f", "rel_weighted_f"]:
+                    trace.line.width = 6
+                else:
+                    trace.line.width = 1
+
+                idx = list(trace.x).index(self.max_f1_step)
+                highlight_y = list(trace.y)[idx]
+                line_color = trace.line.color
+                line_name = trace.name
+                fig.add_trace(go.Scatter(
+                    x=[self.max_f1_step], y=[highlight_y],
+                    mode='markers+text',
+                    marker=dict(
+                        color=line_color, size=10),
+                    text=[f"{round(highlight_y, 2)}"],
+                    textposition="top center",
+                    name=f"{line_name} best"
+                ))
+
+            current_time = time.monotonic()
+            current_time_of_training = current_time - self.start_learning_time
+            current_time_of_training_text = f"{int(current_time_of_training // 3600)} hrs {int(current_time_of_training % 3600) // 60} min {round(current_time_of_training % 60)} sec"
+
+            fig.update_layout(title = dict(
+                text="Training statistics",
+                subtitle=dict(
+                    text=f"Training time amounted to {current_time_of_training_text}",
+                    font=dict(color="gray", size=13),
+                )
+            ))
+
+            output_dir = os.path.join(str(path), "logs")
+            os.makedirs(output_dir, exist_ok=True)
+            fig_path = os.path.join(output_dir, "training_metrics.html")
+            json_path = os.path.join(output_dir, "training_metrics.json")
+            fig.write_html(fig_path)
+            with open(json_path, "w", encoding="utf-8") as f:
+                json.dump({
+                    "data": metrics_history_to_save,
+                    "train_time_s": current_time_of_training
+                }, f, indent=2, ensure_ascii=False)
+
+    def to_disk(self, path, *args, **kwargs):
+        super().to_disk(path, *args, **kwargs)
+        output_dir = Path(path)
+        output_dir_metrics = output_dir.parent.parent
+        self.save_metrics_history(output_dir_metrics)
 
 
 def score_relations(examples: Iterable[Example], threshold: float) -> Dict[str, Any]:
     """Score a batch of examples."""
-    micro_prf = PRFScore()
+
+    y_true = []
+    y_pred = []
     for example in examples:
         gold = example.reference._.rel
         pred = example.predicted._.rel
         for key, pred_dict in pred.items():
-            gold_labels = [k for (k, v) in gold.get(key, {}).items() if v == 1.0]
+            gold_labels = {k for (k, v) in gold.get(key, {}).items() if v == 1.0}
             for k, v in pred_dict.items():
                 if v >= threshold:
                     if k in gold_labels:
-                        micro_prf.tp += 1
+                        y_true.append(k)
+                        y_pred.append(k)
                     else:
-                        micro_prf.fp += 1
+                        y_true.append("O")
+                        y_pred.append(k)
                 else:
                     if k in gold_labels:
-                        micro_prf.fn += 1
-    return {
-        "rel_micro_p": micro_prf.precision,
-        "rel_micro_r": micro_prf.recall,
-        "rel_micro_f": micro_prf.fscore,
-    }
+                        y_true.append(k)
+                        y_pred.append("O")
+
+
+    labels = sorted({label for label in y_true if label != "O"})
+
+    precision, recall, f1, support = precision_recall_fscore_support(
+        y_true, y_pred, labels=labels, zero_division=0, average=None
+    )
+    result = {}
+    for l, p, r, f in zip(labels, precision, recall, f1):
+        result[f"f1_{l}"] = f
+
+    p, r, f1_micro, _ = precision_recall_fscore_support(
+        y_true, y_pred, labels=labels, zero_division=0, average="micro", beta=1
+    )
+
+    result["rel_micro_p"] = p
+    result["rel_micro_r"] = r
+    result["rel_micro_f"] = f1_micro
+    result["rel_macro_f"] = f1_score(y_true, y_pred, average="macro", labels=labels, zero_division=0)
+    result["rel_weighted_f"] = f1_score(y_true, y_pred, average="weighted", labels=labels, zero_division=0)
+
+    return result
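
A brief aside on the new `get_focal_loss` method above (not from this commit): it applies the focal-loss shape FL(p_t) = -(1 - p_t)^gamma * log(p_t) to the label-probability matrix, so confidently correct cells contribute almost nothing to the loss; note the `alpha` argument is accepted but not used in the committed body. A standalone sketch of the same computation on a toy score matrix:

```python
# Standalone sketch of the focal-loss computation used in get_focal_loss
# (toy inputs; gamma=3.0 as in the committed default).
import numpy

def focal_loss(scores, truths, gamma=3.0, eps=1e-8):
    scores = numpy.clip(scores, eps, 1.0 - eps)
    # p_t is the probability assigned to the correct decision for each cell
    p_t = numpy.clip(scores * truths + (1 - scores) * (1 - truths), eps, 1.0 - eps)
    per_cell = -(1 - p_t) ** gamma * numpy.log(p_t)
    return float(numpy.mean(numpy.sum(per_cell, axis=1)))

scores = numpy.array([[0.9, 0.1, 0.6]])   # predicted probabilities for three relation labels
truths = numpy.array([[1.0, 0.0, 1.0]])   # gold 0/1 assignments
print(focal_loss(scores, truths))         # ~0.033: the confident cells (0.9, 0.1) contribute ~0
```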
relation_extractor/cfg CHANGED
@@ -3,8 +3,8 @@
 "PART-OF",
 "LOCATED-AT",
 "CONNECTED-WITH",
-"ATTRIBUTE-FOR",
-"IN-MANNER-OF"
+"IN-MANNER-OF",
+"ATTRIBUTE-FOR"
 ],
 "threshold":0.5
 }
relation_extractor/model CHANGED
Binary files a/relation_extractor/model and b/relation_extractor/model differ
 
ru_patents_rel-any-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:54dd6f43fca1c1cef88866cbbaa4dc8b2e450c19b3d11c0b5f3cdc893f96058f
-size 1321313598
+oid sha256:01a4762ef635162b3d38a964bf53b46c50d350f75b69c96116caecc4e4660464
+size 661156608
transformer/cfg CHANGED
@@ -1,3 +1,3 @@
 {
-"max_batch_items":4096
+"max_batch_items":2096
 }
transformer/model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9be3ac022e449e0ce8bfbb1551933557fc5bad91718decd70fdd0e00ac36aeb5
-size 1430062590
+oid sha256:96e76a1b01f5978e003b6acf1e0f20cc51201f638c85fb239e6819b9a444b6f8
+size 716719271
vocab/strings.json CHANGED
The diff for this file is too large to render. See raw diff