Upload folder using huggingface_hub

Browse files

Files changed (12) hide show

README.md +66 -0
best_hyperparameters.json +7 -0
config.json +32 -0
model.safetensors +3 -0
performance_report.json +221 -0
performance_report.md +72 -0
rng_state.pth +3 -0
special_tokens_map.json +7 -0
tokenizer.json +0 -0
tokenizer_config.json +58 -0
trainer_state.json +162 -0
vocab.txt +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,66 @@

+---
+task: token-classification
+tags:
+- biomedical
+- bionlp
+license: mit
+base_model: microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext
+---
+# bioner_ncbi_disease
+This is a named entity recognition model fine-tuned from the [microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext](https://huggingface.co/microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext) model. It predicts spans with 2 possible labels. The labels are **DiseaseClass and SpecificDisease**.
+The code used for training this model can be found at https://github.com/Glasgow-AI4BioMed/bioner along with links to other biomedical NER models trained on well-known biomedical corpora. The source dataset information is below.
+## Example Usage
+The code below loads the model and applies it to the provided text. It uses the `max` aggregation strategy to merge the individual token predictions into larger multi-token entities where needed.
+```python
+from transformers import pipeline
+# Load the model as part of an NER pipeline
+ner_pipeline = pipeline("token-classification",
+                        model="Glasgow-AI4BioMed/bioner_ncbi_disease",
+                        aggregation_strategy="max")
+# Apply it to some text
+ner_pipeline("EGFR T790M mutations have been known to affect treatment outcomes for NSCLC patients receiving erlotinib.")
+# Output:
+```
+## Dataset Info
+**Source:** The NCBI Disease dataset was downloaded from: https://www.ncbi.nlm.nih.gov/CBBresearch/Dogan/DISEASE/
+The dataset should be cited with: Doğan, Rezarta Islamaj, Robert Leaman, and Zhiyong Lu. "NCBI disease corpus: a resource for disease name recognition and concept normalization." Journal of Biomedical Informatics 47 (2014): 1-10. DOI: [10.1016/j.jbi.2013.12.006](https://doi.org/10.1016/j.jbi.2013.12.006)
+**Preprocessing:** The training/validation/test split was maintained from the original dataset. The annotations were filtered down to only 'DiseaseClass' and 'SpecificDisease'. The preprocessing script for this dataset is [prepare_ncbi_disease.py](https://github.com/Glasgow-AI4BioMed/bioner/blob/main/prepare_ncbi_disease.py).
+## Performance
+The span-level performance on the test split for each label is shown in the table below. The full performance results are available in the model repo in Markdown format (`performance_report.md`) for viewing and in JSON format (`performance_report.json`) for easier loading. These include the performance at token level (with individual B- and I- labels, as the token classifier uses IOB2 token labelling).
+| Label | Precision | Recall | F1-score | Support |
+| --- | --- | --- | --- | --- |
+| DiseaseClass | 0.592 | 0.769 | 0.669 | 121 |
+| SpecificDisease | 0.816 | 0.809 | 0.813 | 555 |
+| macro avg | 0.704 | 0.789 | 0.741 | 676 |
+| weighted avg | 0.776 | 0.802 | 0.787 | 676 |
+## Hyperparameters
+Hyperparameter tuning was done with [optuna](https://optuna.org/) and the [hyperparameter_search](https://huggingface.co/docs/transformers/en/hpo_train) functionality. 100 trials were run. Early stopping was applied during training. The best-performing model was selected using the token-level macro F1 score on the validation set. The selected hyperparameters are in the table below.
+| Hyperparameter | Value |
+|----------------|-------|
+| epochs | 9.0 |
+| learning_rate | 4.2369194386745274e-05 |
+| per_device_train_batch_size | 8 |
+| weight_decay | 0.11095292966544487 |
+| warmup_ratio | 0.009641097927077978 |

best_hyperparameters.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "epochs": 9.0,
+  "learning_rate": 4.2369194386745274e-05,
+  "per_device_train_batch_size": 8,
+  "weight_decay": 0.11095292966544487,
+  "warmup_ratio": 0.009641097927077978
+}

config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "_name_or_path": "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext",
+  "architectures": [
+    "BertForTokenClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "O",
+    "1": "B-DiseaseClass",
+    "2": "I-DiseaseClass",
+    "3": "B-SpecificDisease",
+    "4": "I-SpecificDisease"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.1",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:41f40476612ba87cd2da8e641f59d9e2e82a56d63662eab279b6d045a80a3198
+size 435605316

performance_report.json ADDED Viewed

	@@ -0,0 +1,221 @@

+{
+  "train": {
+    "token_level": {
+      "O": {
+        "precision": 0.9998746044508043,
+        "recall": 0.9994322831800989,
+        "f1-score": 0.9996533948864684,
+        "support": 135631.0
+      },
+      "B-DiseaseClass": {
+        "precision": 0.9935064935064936,
+        "recall": 0.9947984395318595,
+        "f1-score": 0.9941520467836257,
+        "support": 769.0
+      },
+      "I-DiseaseClass": {
+        "precision": 0.9964080459770115,
+        "recall": 0.9914224446032881,
+        "f1-score": 0.9939089931924041,
+        "support": 1399.0
+      },
+      "B-SpecificDisease": {
+        "precision": 0.9986518368722616,
+        "recall": 0.9976430976430977,
+        "f1-score": 0.9981472123968335,
+        "support": 2970.0
+      },
+      "I-SpecificDisease": {
+        "precision": 0.9895332390381896,
+        "recall": 0.9992858163119555,
+        "f1-score": 0.9943856158055575,
+        "support": 7001.0
+      },
+      "accuracy": 0.9992894362861203,
+      "macro avg": {
+        "precision": 0.9955948439689521,
+        "recall": 0.99651641625406,
+        "f1-score": 0.9960494526129778,
+        "support": 147770.0
+      },
+      "weighted avg": {
+        "precision": 0.9992941191589114,
+        "recall": 0.9992894362861203,
+        "f1-score": 0.9992905334260858,
+        "support": 147770.0
+      }
+    },
+    "span_level": {
+      "DiseaseClass": {
+        "precision": 0.9883419689119171,
+        "recall": 0.9921976592977894,
+        "f1-score": 0.9902660609993511,
+        "support": 769
+      },
+      "SpecificDisease": {
+        "precision": 0.973927392739274,
+        "recall": 0.9929340511440108,
+        "f1-score": 0.9833388870376542,
+        "support": 2972
+      },
+      "macro avg": {
+        "precision": 0.9811346808255955,
+        "recall": 0.9925658552209,
+        "f1-score": 0.9868024740185026,
+        "support": 3741
+      },
+      "weighted avg": {
+        "precision": 0.976890453171448,
+        "recall": 0.9927826784282278,
+        "f1-score": 0.9847628369912882,
+        "support": 3741
+      }
+    }
+  },
+  "val": {
+    "token_level": {
+      "O": {
+        "precision": 0.9942266157168965,
+        "recall": 0.9965860360547899,
+        "f1-score": 0.995404927747167,
+        "support": 24019.0
+      },
+      "B-DiseaseClass": {
+        "precision": 0.8490566037735849,
+        "recall": 0.7142857142857143,
+        "f1-score": 0.7758620689655172,
+        "support": 126.0
+      },
+      "I-DiseaseClass": {
+        "precision": 0.821917808219178,
+        "recall": 0.6857142857142857,
+        "f1-score": 0.7476635514018691,
+        "support": 175.0
+      },
+      "B-SpecificDisease": {
+        "precision": 0.8813559322033898,
+        "recall": 0.883495145631068,
+        "f1-score": 0.8824242424242424,
+        "support": 412.0
+      },
+      "I-SpecificDisease": {
+        "precision": 0.922992299229923,
+        "recall": 0.9139433551198257,
+        "f1-score": 0.9184455391351943,
+        "support": 918.0
+      },
+      "accuracy": 0.9883040935672515,
+      "macro avg": {
+        "precision": 0.8939098518285944,
+        "recall": 0.8388049073611367,
+        "f1-score": 0.863960065934798,
+        "support": 25650.0
+      },
+      "weighted avg": {
+        "precision": 0.9879754934182778,
+        "recall": 0.9883040935672515,
+        "f1-score": 0.9880671537835901,
+        "support": 25650.0
+      }
+    },
+    "span_level": {
+      "DiseaseClass": {
+        "precision": 0.7631578947368421,
+        "recall": 0.6904761904761905,
+        "f1-score": 0.725,
+        "support": 126
+      },
+      "SpecificDisease": {
+        "precision": 0.8411214953271028,
+        "recall": 0.8737864077669902,
+        "f1-score": 0.8571428571428571,
+        "support": 412
+      },
+      "macro avg": {
+        "precision": 0.8021396950319725,
+        "recall": 0.7821312991215903,
+        "f1-score": 0.7910714285714285,
+        "support": 538
+      },
+      "weighted avg": {
+        "precision": 0.8228623621033615,
+        "recall": 0.8308550185873605,
+        "f1-score": 0.8261949017525225,
+        "support": 538
+      }
+    }
+  },
+  "test": {
+    "token_level": {
+      "O": {
+        "precision": 0.9955296723126769,
+        "recall": 0.9908159516714968,
+        "f1-score": 0.9931672190172252,
+        "support": 24499.0
+      },
+      "B-DiseaseClass": {
+        "precision": 0.62,
+        "recall": 0.768595041322314,
+        "f1-score": 0.6863468634686347,
+        "support": 121.0
+      },
+      "I-DiseaseClass": {
+        "precision": 0.5545454545454546,
+        "recall": 0.7484662576687117,
+        "f1-score": 0.6370757180156658,
+        "support": 163.0
+      },
+      "B-SpecificDisease": {
+        "precision": 0.8444444444444444,
+        "recall": 0.8216216216216217,
+        "f1-score": 0.8328767123287671,
+        "support": 555.0
+      },
+      "I-SpecificDisease": {
+        "precision": 0.8679549114331723,
+        "recall": 0.9005847953216374,
+        "f1-score": 0.8839688396883969,
+        "support": 1197.0
+      },
+      "accuracy": 0.980704729602412,
+      "macro avg": {
+        "precision": 0.7764948965471496,
+        "recall": 0.8460167335211564,
+        "f1-score": 0.8066870705037379,
+        "support": 26535.0
+      },
+      "weighted avg": {
+        "precision": 0.9821933690119222,
+        "recall": 0.980704729602412,
+        "f1-score": 0.9813021401043428,
+        "support": 26535.0
+      }
+    },
+    "span_level": {
+      "DiseaseClass": {
+        "precision": 0.5923566878980892,
+        "recall": 0.768595041322314,
+        "f1-score": 0.6690647482014389,
+        "support": 121
+      },
+      "SpecificDisease": {
+        "precision": 0.8163636363636364,
+        "recall": 0.809009009009009,
+        "f1-score": 0.8126696832579184,
+        "support": 555
+      },
+      "macro avg": {
+        "precision": 0.7043601621308628,
+        "recall": 0.7888020251656616,
+        "f1-score": 0.7408672157296787,
+        "support": 676
+      },
+      "weighted avg": {
+        "precision": 0.7762677180732056,
+        "recall": 0.8017751479289941,
+        "f1-score": 0.7869652496161522,
+        "support": 676
+      }
+    }
+  }
+}

performance_report.md ADDED Viewed

	@@ -0,0 +1,72 @@

+# Performance on Training Set
+## Span Level
+| Label | Precision | Recall | F1-score | Support |
+| --- | --- | --- | --- | --- |
+| DiseaseClass | 0.988 | 0.992 | 0.990 | 769 |
+| SpecificDisease | 0.974 | 0.993 | 0.983 | 2972 |
+| macro avg | 0.981 | 0.993 | 0.987 | 3741 |
+| weighted avg | 0.977 | 0.993 | 0.985 | 3741 |
+## Token Level
+| Label | Precision | Recall | F1-score | Support |
+| --- | --- | --- | --- | --- |
+| O | 1.000 | 0.999 | 1.000 | 135631 |
+| B-DiseaseClass | 0.994 | 0.995 | 0.994 | 769 |
+| I-DiseaseClass | 0.996 | 0.991 | 0.994 | 1399 |
+| B-SpecificDisease | 0.999 | 0.998 | 0.998 | 2970 |
+| I-SpecificDisease | 0.990 | 0.999 | 0.994 | 7001 |
+| macro avg | 0.996 | 0.997 | 0.996 | 147770 |
+| weighted avg | 0.999 | 0.999 | 0.999 | 147770 |
+# Performance on Validation Set
+## Span Level
+| Label | Precision | Recall | F1-score | Support |
+| --- | --- | --- | --- | --- |
+| DiseaseClass | 0.763 | 0.690 | 0.725 | 126 |
+| SpecificDisease | 0.841 | 0.874 | 0.857 | 412 |
+| macro avg | 0.802 | 0.782 | 0.791 | 538 |
+| weighted avg | 0.823 | 0.831 | 0.826 | 538 |
+## Token Level
+| Label | Precision | Recall | F1-score | Support |
+| --- | --- | --- | --- | --- |
+| O | 0.994 | 0.997 | 0.995 | 24019 |
+| B-DiseaseClass | 0.849 | 0.714 | 0.776 | 126 |
+| I-DiseaseClass | 0.822 | 0.686 | 0.748 | 175 |
+| B-SpecificDisease | 0.881 | 0.883 | 0.882 | 412 |
+| I-SpecificDisease | 0.923 | 0.914 | 0.918 | 918 |
+| macro avg | 0.894 | 0.839 | 0.864 | 25650 |
+| weighted avg | 0.988 | 0.988 | 0.988 | 25650 |
+# Performance on Testing Set
+## Span Level
+| Label | Precision | Recall | F1-score | Support |
+| --- | --- | --- | --- | --- |
+| DiseaseClass | 0.592 | 0.769 | 0.669 | 121 |
+| SpecificDisease | 0.816 | 0.809 | 0.813 | 555 |
+| macro avg | 0.704 | 0.789 | 0.741 | 676 |
+| weighted avg | 0.776 | 0.802 | 0.787 | 676 |
+## Token Level
+| Label | Precision | Recall | F1-score | Support |
+| --- | --- | --- | --- | --- |
+| O | 0.996 | 0.991 | 0.993 | 24499 |
+| B-DiseaseClass | 0.620 | 0.769 | 0.686 | 121 |
+| I-DiseaseClass | 0.555 | 0.748 | 0.637 | 163 |
+| B-SpecificDisease | 0.844 | 0.822 | 0.833 | 555 |
+| I-SpecificDisease | 0.868 | 0.901 | 0.884 | 1197 |
+| macro avg | 0.776 | 0.846 | 0.807 | 26535 |
+| weighted avg | 0.982 | 0.981 | 0.981 | 26535 |

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c325025280b89c0d4da3c9c2ad88816a8a0cc2c4c2de7729901f3971a2bc79ff
+size 14244

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,162 @@

+{
+  "best_metric": 0.863960065934798,
+  "best_model_checkpoint": "tmp_ner_fantastic-bale-09_44/run-61/checkpoint-675",
+  "epoch": 9.0,
+  "eval_steps": 500,
+  "global_step": 675,
+  "is_hyper_param_search": true,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.9783235867446394,
+      "eval_loss": 0.06952951103448868,
+      "eval_macro_f1": 0.6359234331864915,
+      "eval_macro_precision": 0.8484880913225343,
+      "eval_macro_recall": 0.621665816946364,
+      "eval_runtime": 0.6216,
+      "eval_samples_per_second": 160.873,
+      "eval_steps_per_second": 20.913,
+      "step": 75
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.983391812865497,
+      "eval_loss": 0.056511040776968,
+      "eval_macro_f1": 0.7614548378523822,
+      "eval_macro_precision": 0.8731710257534685,
+      "eval_macro_recall": 0.7203600182300229,
+      "eval_runtime": 0.6015,
+      "eval_samples_per_second": 166.249,
+      "eval_steps_per_second": 21.612,
+      "step": 150
+    },
+    {
+      "epoch": 3.0,
+      "eval_accuracy": 0.9857309941520468,
+      "eval_loss": 0.05191269889473915,
+      "eval_macro_f1": 0.8172545857097553,
+      "eval_macro_precision": 0.8413749539052772,
+      "eval_macro_recall": 0.7970101814719535,
+      "eval_runtime": 0.6777,
+      "eval_samples_per_second": 147.557,
+      "eval_steps_per_second": 19.182,
+      "step": 225
+    },
+    {
+      "epoch": 4.0,
+      "eval_accuracy": 0.9871734892787525,
+      "eval_loss": 0.055695317685604095,
+      "eval_macro_f1": 0.8405769067306196,
+      "eval_macro_precision": 0.8557959065634769,
+      "eval_macro_recall": 0.8269452880439045,
+      "eval_runtime": 0.6377,
+      "eval_samples_per_second": 156.826,
+      "eval_steps_per_second": 20.387,
+      "step": 300
+    },
+    {
+      "epoch": 5.0,
+      "eval_accuracy": 0.9875243664717349,
+      "eval_loss": 0.055003080517053604,
+      "eval_macro_f1": 0.8540405601187896,
+      "eval_macro_precision": 0.8983484203175353,
+      "eval_macro_recall": 0.8221656291922347,
+      "eval_runtime": 0.6394,
+      "eval_samples_per_second": 156.406,
+      "eval_steps_per_second": 20.333,
+      "step": 375
+    },
+    {
+      "epoch": 6.0,
+      "eval_accuracy": 0.9878362573099415,
+      "eval_loss": 0.060502711683511734,
+      "eval_macro_f1": 0.8610460620824656,
+      "eval_macro_precision": 0.8865353357904526,
+      "eval_macro_recall": 0.8387695106889209,
+      "eval_runtime": 0.6171,
+      "eval_samples_per_second": 162.056,
+      "eval_steps_per_second": 21.067,
+      "step": 450
+    },
+    {
+      "epoch": 6.666666666666667,
+      "grad_norm": 0.02602095529437065,
+      "learning_rate": 3.3881089787380476e-05,
+      "loss": 0.08,
+      "step": 500
+    },
+    {
+      "epoch": 7.0,
+      "eval_accuracy": 0.9864327485380117,
+      "eval_loss": 0.06403131783008575,
+      "eval_macro_f1": 0.836324040811885,
+      "eval_macro_precision": 0.8309543877948833,
+      "eval_macro_recall": 0.8430665699031182,
+      "eval_runtime": 0.6657,
+      "eval_samples_per_second": 150.212,
+      "eval_steps_per_second": 19.528,
+      "step": 525
+    },
+    {
+      "epoch": 8.0,
+      "eval_accuracy": 0.9874463937621832,
+      "eval_loss": 0.06809797883033752,
+      "eval_macro_f1": 0.853127015308474,
+      "eval_macro_precision": 0.871470843170948,
+      "eval_macro_recall": 0.8370896203600997,
+      "eval_runtime": 0.6629,
+      "eval_samples_per_second": 150.862,
+      "eval_steps_per_second": 19.612,
+      "step": 600
+    },
+    {
+      "epoch": 9.0,
+      "eval_accuracy": 0.9883040935672515,
+      "eval_loss": 0.06698578596115112,
+      "eval_macro_f1": 0.863960065934798,
+      "eval_macro_precision": 0.8939098518285944,
+      "eval_macro_recall": 0.8388049073611367,
+      "eval_runtime": 0.6334,
+      "eval_samples_per_second": 157.89,
+      "eval_steps_per_second": 20.526,
+      "step": 675
+    }
+  ],
+  "logging_steps": 500,
+  "max_steps": 2400,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 32,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 3,
+        "early_stopping_threshold": 0.001
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 0
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 768333841626390.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": {
+    "learning_rate": 4.2369194386745274e-05,
+    "per_device_train_batch_size": 8,
+    "warmup_ratio": 0.009641097927077978,
+    "weight_decay": 0.11095292966544487
+  }
+}

vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff