maximuspowers commited on
Commit
84f46c0
·
verified ·
1 Parent(s): 7d3b7c8

Model save

Browse files
README.md CHANGED
@@ -16,15 +16,15 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  This model is a fine-tuned version of [maximuspowers/bert-philosophy-adapted](https://huggingface.co/maximuspowers/bert-philosophy-adapted) on the None dataset.
18
  It achieves the following results on the evaluation set:
19
- - Loss: 0.7200
20
- - Exact Match Accuracy: 0.2
21
- - Macro Precision: 0.1583
22
- - Macro Recall: 0.0909
23
- - Macro F1: 0.1152
24
- - Micro Precision: 0.8571
25
- - Micro Recall: 0.2105
26
- - Micro F1: 0.3380
27
- - Hamming Loss: 0.0691
28
 
29
  ## Model description
30
 
@@ -44,11 +44,11 @@ More information needed
44
 
45
  The following hyperparameters were used during training:
46
  - learning_rate: 2e-05
47
- - train_batch_size: 16
48
- - eval_batch_size: 16
49
  - seed: 42
50
  - gradient_accumulation_steps: 2
51
- - total_train_batch_size: 32
52
  - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
53
  - lr_scheduler_type: linear
54
  - lr_scheduler_warmup_steps: 100
@@ -59,8 +59,15 @@ The following hyperparameters were used during training:
59
 
60
  | Training Loss | Epoch | Step | Validation Loss | Exact Match Accuracy | Macro Precision | Macro Recall | Macro F1 | Micro Precision | Micro Recall | Micro F1 | Hamming Loss |
61
  |:-------------:|:-----:|:----:|:---------------:|:--------------------:|:---------------:|:------------:|:--------:|:---------------:|:------------:|:--------:|:------------:|
62
- | 0.811 | 25.0 | 250 | 0.7701 | 0.1 | 0.1092 | 0.0615 | 0.0784 | 0.875 | 0.1228 | 0.2154 | 0.075 |
63
- | 0.58 | 50.0 | 500 | 0.7200 | 0.2 | 0.1583 | 0.0909 | 0.1152 | 0.8571 | 0.2105 | 0.3380 | 0.0691 |
 
 
 
 
 
 
 
64
 
65
 
66
  ### Framework versions
 
16
 
17
  This model is a fine-tuned version of [maximuspowers/bert-philosophy-adapted](https://huggingface.co/maximuspowers/bert-philosophy-adapted) on the None dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Loss: 0.8156
20
+ - Exact Match Accuracy: 0.275
21
+ - Macro Precision: 0.1574
22
+ - Macro Recall: 0.1134
23
+ - Macro F1: 0.1298
24
+ - Micro Precision: 0.8421
25
+ - Micro Recall: 0.2807
26
+ - Micro F1: 0.4211
27
+ - Hamming Loss: 0.0647
28
 
29
  ## Model description
30
 
 
44
 
45
  The following hyperparameters were used during training:
46
  - learning_rate: 2e-05
47
+ - train_batch_size: 8
48
+ - eval_batch_size: 8
49
  - seed: 42
50
  - gradient_accumulation_steps: 2
51
+ - total_train_batch_size: 16
52
  - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
53
  - lr_scheduler_type: linear
54
  - lr_scheduler_warmup_steps: 100
 
59
 
60
  | Training Loss | Epoch | Step | Validation Loss | Exact Match Accuracy | Macro Precision | Macro Recall | Macro F1 | Micro Precision | Micro Recall | Micro F1 | Hamming Loss |
61
  |:-------------:|:-----:|:----:|:---------------:|:--------------------:|:---------------:|:------------:|:--------:|:---------------:|:------------:|:--------:|:------------:|
62
+ | 1.7889 | 5.0 | 100 | 1.0021 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0853 |
63
+ | 1.156 | 10.0 | 200 | 0.8631 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0838 |
64
+ | 0.8775 | 15.0 | 300 | 0.9324 | 0.05 | 0.0588 | 0.0267 | 0.0368 | 1.0 | 0.0877 | 0.1613 | 0.0765 |
65
+ | 0.7747 | 20.0 | 400 | 0.7537 | 0.1 | 0.1092 | 0.0615 | 0.0784 | 0.875 | 0.1228 | 0.2154 | 0.075 |
66
+ | 0.7074 | 25.0 | 500 | 0.8191 | 0.175 | 0.1487 | 0.0845 | 0.1056 | 0.7857 | 0.1930 | 0.3099 | 0.0721 |
67
+ | 0.6281 | 30.0 | 600 | 0.8507 | 0.275 | 0.1574 | 0.1134 | 0.1298 | 0.8421 | 0.2807 | 0.4211 | 0.0647 |
68
+ | 0.5506 | 35.0 | 700 | 0.7439 | 0.25 | 0.1563 | 0.1075 | 0.1256 | 0.8333 | 0.2632 | 0.4 | 0.0662 |
69
+ | 0.5091 | 40.0 | 800 | 0.7972 | 0.275 | 0.1574 | 0.1134 | 0.1298 | 0.8421 | 0.2807 | 0.4211 | 0.0647 |
70
+ | 0.5038 | 45.0 | 900 | 0.8156 | 0.275 | 0.1574 | 0.1134 | 0.1298 | 0.8421 | 0.2807 | 0.4211 | 0.0647 |
71
 
72
 
73
  ### Framework versions
all_results.json CHANGED
@@ -1,20 +1,20 @@
1
  {
2
- "epoch": 50.0,
3
- "eval_exact_match_accuracy": 0.2,
4
- "eval_hamming_loss": 0.075,
5
- "eval_loss": 0.8420153856277466,
6
- "eval_macro_f1": 0.09192664920219099,
7
- "eval_macro_precision": 0.09243697478991597,
8
- "eval_macro_recall": 0.09215686274509804,
9
- "eval_micro_f1": 0.4,
10
- "eval_micro_precision": 0.6071428571428571,
11
- "eval_micro_recall": 0.2982456140350877,
12
- "eval_runtime": 0.2221,
13
- "eval_samples_per_second": 180.125,
14
- "eval_steps_per_second": 13.509,
15
  "total_flos": 0.0,
16
- "train_loss": 1.1355848159790038,
17
- "train_runtime": 246.5817,
18
- "train_samples_per_second": 64.076,
19
- "train_steps_per_second": 2.028
20
  }
 
1
  {
2
+ "epoch": 45.0,
3
+ "eval_exact_match_accuracy": 0.375,
4
+ "eval_hamming_loss": 0.052941176470588235,
5
+ "eval_loss": 0.5750908255577087,
6
+ "eval_macro_f1": 0.13746934180370715,
7
+ "eval_macro_precision": 0.17058823529411765,
8
+ "eval_macro_recall": 0.12058823529411763,
9
+ "eval_micro_f1": 0.55,
10
+ "eval_micro_precision": 0.9565217391304348,
11
+ "eval_micro_recall": 0.38596491228070173,
12
+ "eval_runtime": 0.2248,
13
+ "eval_samples_per_second": 177.928,
14
+ "eval_steps_per_second": 13.345,
15
  "total_flos": 0.0,
16
+ "train_loss": 0.9705644819471572,
17
+ "train_runtime": 232.6541,
18
+ "train_samples_per_second": 67.912,
19
+ "train_steps_per_second": 4.298
20
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2d090d5e2966f2091768ffc79be690bc37433eae61a697aae158a43c8f2c1826
3
  size 441154988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:890f9065c802bc97c554e035af9eaa8ef8da20f13c0f284d224585cdb51a36aa
3
  size 441154988
runs/Jun15_00-40-18_92b2e0e6fb20/events.out.tfevents.1749948301.92b2e0e6fb20.2194.9 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:106c9524987fb915bca106b13f7d5b464b289c45f4975ddfcbcff0c2f11c817f
3
+ size 3837
runs/Jun15_00-47-20_92b2e0e6fb20/events.out.tfevents.1749948441.92b2e0e6fb20.2194.10 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b7d3f157a6e34681655d66b8adcdcc4d0909527514c24f24be959d00ecde095
3
+ size 57078
test_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
- "epoch": 5.0,
3
- "eval_exact_match_accuracy": 0.2,
4
- "eval_hamming_loss": 0.075,
5
- "eval_loss": 0.8420153856277466,
6
- "eval_macro_f1": 0.09192664920219099,
7
- "eval_macro_precision": 0.09243697478991597,
8
- "eval_macro_recall": 0.09215686274509804,
9
- "eval_micro_f1": 0.4,
10
- "eval_micro_precision": 0.6071428571428571,
11
- "eval_micro_recall": 0.2982456140350877,
12
- "eval_runtime": 0.2221,
13
- "eval_samples_per_second": 180.125,
14
- "eval_steps_per_second": 13.509
15
  }
 
1
  {
2
+ "epoch": 50.0,
3
+ "eval_exact_match_accuracy": 0.375,
4
+ "eval_hamming_loss": 0.052941176470588235,
5
+ "eval_loss": 0.5750908255577087,
6
+ "eval_macro_f1": 0.13746934180370715,
7
+ "eval_macro_precision": 0.17058823529411765,
8
+ "eval_macro_recall": 0.12058823529411763,
9
+ "eval_micro_f1": 0.55,
10
+ "eval_micro_precision": 0.9565217391304348,
11
+ "eval_micro_recall": 0.38596491228070173,
12
+ "eval_runtime": 0.2248,
13
+ "eval_samples_per_second": 177.928,
14
+ "eval_steps_per_second": 13.345
15
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 50.0,
3
  "total_flos": 0.0,
4
- "train_loss": 1.1355848159790038,
5
- "train_runtime": 246.5817,
6
- "train_samples_per_second": 64.076,
7
- "train_steps_per_second": 2.028
8
  }
 
1
  {
2
+ "epoch": 45.0,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.9705644819471572,
5
+ "train_runtime": 232.6541,
6
+ "train_samples_per_second": 67.912,
7
+ "train_steps_per_second": 4.298
8
  }
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 5.0,
6
  "eval_steps": 250,
7
- "global_step": 50,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -12,46 +12,405 @@
12
  {
13
  "epoch": 0,
14
  "step": 0,
15
- "train/classification_loss": 0.6251798272132874,
16
- "train/contrastive_loss": 1.386080265045166,
17
- "train/negative_loss": 1.1070373058319092,
18
  "train/num_negatives": 190,
19
  "train/num_positives": 50,
20
- "train/positive_loss": 0.27904292941093445,
21
- "train/total_loss": 0.9023958444595337
22
  },
23
  {
24
  "epoch": 0,
25
  "step": 0,
26
- "train/classification_loss": 0.6276130080223083,
27
- "train/contrastive_loss": 1.6681630611419678,
28
- "train/negative_loss": 1.269258737564087,
29
  "train/num_negatives": 192,
30
  "train/num_positives": 48,
31
- "train/positive_loss": 0.39890438318252563,
32
- "train/total_loss": 0.9612456560134888
33
  },
34
  {
35
  "epoch": 5.0,
36
- "grad_norm": 10.065888404846191,
37
- "learning_rate": 9.800000000000001e-06,
38
- "loss": 1.6828,
39
  "step": 50
40
  },
41
  {
42
  "epoch": 5.0,
43
  "step": 50,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  "total_flos": 0.0,
45
- "train_loss": 1.6828109741210937,
46
- "train_runtime": 29.5351,
47
- "train_samples_per_second": 53.496,
48
- "train_steps_per_second": 1.693
49
  }
50
  ],
51
  "logging_steps": 50,
52
- "max_steps": 50,
53
  "num_input_tokens_seen": 0,
54
- "num_train_epochs": 5,
55
  "save_steps": 500,
56
  "stateful_callbacks": {
57
  "EarlyStoppingCallback": {
 
1
  {
2
+ "best_global_step": 500,
3
+ "best_metric": 0.3380281690140845,
4
+ "best_model_checkpoint": "./bert-philosophy-classifier/checkpoint-500",
5
+ "epoch": 50.0,
6
  "eval_steps": 250,
7
+ "global_step": 500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
12
  {
13
  "epoch": 0,
14
  "step": 0,
15
+ "train/classification_loss": 0.679158627986908,
16
+ "train/contrastive_loss": 9.516982078552246,
17
+ "train/negative_loss": 9.516908645629883,
18
  "train/num_negatives": 190,
19
  "train/num_positives": 50,
20
+ "train/positive_loss": 7.310241926461458e-05,
21
+ "train/total_loss": 2.582555055618286
22
  },
23
  {
24
  "epoch": 0,
25
  "step": 0,
26
+ "train/classification_loss": 0.6693864464759827,
27
+ "train/contrastive_loss": 9.331222534179688,
28
+ "train/negative_loss": 9.331130981445312,
29
  "train/num_negatives": 192,
30
  "train/num_positives": 48,
31
+ "train/positive_loss": 9.195055463351309e-05,
32
+ "train/total_loss": 2.535630941390991
33
  },
34
  {
35
  "epoch": 5.0,
36
+ "grad_norm": 17.358003616333008,
37
+ "learning_rate": 9.600000000000001e-06,
38
+ "loss": 3.6257,
39
  "step": 50
40
  },
41
  {
42
  "epoch": 5.0,
43
  "step": 50,
44
+ "train/classification_loss": 0.6362661719322205,
45
+ "train/contrastive_loss": 1.4868279695510864,
46
+ "train/negative_loss": 1.3825407028198242,
47
+ "train/num_negatives": 170,
48
+ "train/num_positives": 66,
49
+ "train/positive_loss": 0.10428724437952042,
50
+ "train/total_loss": 0.9336317777633667
51
+ },
52
+ {
53
+ "epoch": 5.0,
54
+ "step": 50,
55
+ "train/classification_loss": 0.6401901245117188,
56
+ "train/contrastive_loss": 1.6219159364700317,
57
+ "train/negative_loss": 1.440779447555542,
58
+ "train/num_negatives": 186,
59
+ "train/num_positives": 54,
60
+ "train/positive_loss": 0.18113651871681213,
61
+ "train/total_loss": 0.964573323726654
62
+ },
63
+ {
64
+ "epoch": 10.0,
65
+ "grad_norm": 20.2652645111084,
66
+ "learning_rate": 1.9600000000000002e-05,
67
+ "loss": 1.6163,
68
+ "step": 100
69
+ },
70
+ {
71
+ "epoch": 10.0,
72
+ "step": 100,
73
+ "train/classification_loss": 0.44592994451522827,
74
+ "train/contrastive_loss": 0.9996287822723389,
75
+ "train/negative_loss": 0.8505972027778625,
76
+ "train/num_negatives": 190,
77
+ "train/num_positives": 50,
78
+ "train/positive_loss": 0.1490315943956375,
79
+ "train/total_loss": 0.645855724811554
80
+ },
81
+ {
82
+ "epoch": 10.0,
83
+ "step": 100,
84
+ "train/classification_loss": 0.4469062089920044,
85
+ "train/contrastive_loss": 1.1934728622436523,
86
+ "train/negative_loss": 0.9309344291687012,
87
+ "train/num_negatives": 206,
88
+ "train/num_positives": 30,
89
+ "train/positive_loss": 0.26253849267959595,
90
+ "train/total_loss": 0.685600757598877
91
+ },
92
+ {
93
+ "epoch": 15.0,
94
+ "grad_norm": 8.280580520629883,
95
+ "learning_rate": 1.76e-05,
96
+ "loss": 1.1607,
97
+ "step": 150
98
+ },
99
+ {
100
+ "epoch": 15.0,
101
+ "step": 150,
102
+ "train/classification_loss": 0.32877856492996216,
103
+ "train/contrastive_loss": 1.1336479187011719,
104
+ "train/negative_loss": 0.9981224536895752,
105
+ "train/num_negatives": 170,
106
+ "train/num_positives": 66,
107
+ "train/positive_loss": 0.1355254054069519,
108
+ "train/total_loss": 0.5555081367492676
109
+ },
110
+ {
111
+ "epoch": 15.0,
112
+ "step": 150,
113
+ "train/classification_loss": 0.3285317122936249,
114
+ "train/contrastive_loss": 0.8869010210037231,
115
+ "train/negative_loss": 0.591576099395752,
116
+ "train/num_negatives": 202,
117
+ "train/num_positives": 38,
118
+ "train/positive_loss": 0.2953248918056488,
119
+ "train/total_loss": 0.5059119462966919
120
+ },
121
+ {
122
+ "epoch": 20.0,
123
+ "grad_norm": 7.707197189331055,
124
+ "learning_rate": 1.5100000000000001e-05,
125
+ "loss": 0.9196,
126
+ "step": 200
127
+ },
128
+ {
129
+ "epoch": 20.0,
130
+ "step": 200,
131
+ "train/classification_loss": 0.293140172958374,
132
+ "train/contrastive_loss": 0.7223706245422363,
133
+ "train/negative_loss": 0.5778605937957764,
134
+ "train/num_negatives": 202,
135
+ "train/num_positives": 30,
136
+ "train/positive_loss": 0.14451001584529877,
137
+ "train/total_loss": 0.4376143217086792
138
+ },
139
+ {
140
+ "epoch": 20.0,
141
+ "step": 200,
142
+ "train/classification_loss": 0.2644300162792206,
143
+ "train/contrastive_loss": 0.4585617780685425,
144
+ "train/negative_loss": 0.39372602105140686,
145
+ "train/num_negatives": 184,
146
+ "train/num_positives": 56,
147
+ "train/positive_loss": 0.06483575701713562,
148
+ "train/total_loss": 0.3561423718929291
149
+ },
150
+ {
151
+ "epoch": 25.0,
152
+ "grad_norm": 6.953479766845703,
153
+ "learning_rate": 1.2600000000000001e-05,
154
+ "loss": 0.811,
155
+ "step": 250
156
+ },
157
+ {
158
+ "epoch": 25.0,
159
+ "step": 250,
160
+ "train/classification_loss": 0.2595597505569458,
161
+ "train/contrastive_loss": 2.3272764682769775,
162
+ "train/negative_loss": 1.8330672979354858,
163
+ "train/num_negatives": 196,
164
+ "train/num_positives": 44,
165
+ "train/positive_loss": 0.4942092299461365,
166
+ "train/total_loss": 0.7250150442123413
167
+ },
168
+ {
169
+ "epoch": 25.0,
170
+ "step": 250,
171
+ "train/classification_loss": 0.2660799026489258,
172
+ "train/contrastive_loss": 3.3698394298553467,
173
+ "train/negative_loss": 1.8154842853546143,
174
+ "train/num_negatives": 210,
175
+ "train/num_positives": 30,
176
+ "train/positive_loss": 1.5543551445007324,
177
+ "train/total_loss": 0.9400478005409241
178
+ },
179
+ {
180
+ "epoch": 25.0,
181
+ "step": 250,
182
+ "train/classification_loss": 0.2840481698513031,
183
+ "train/contrastive_loss": 1.1826257705688477,
184
+ "train/negative_loss": 1.1373339891433716,
185
+ "train/num_negatives": 46,
186
+ "train/num_positives": 8,
187
+ "train/positive_loss": 0.045291826128959656,
188
+ "train/total_loss": 0.5205733180046082
189
+ },
190
+ {
191
+ "epoch": 25.0,
192
+ "eval_exact_match_accuracy": 0.1,
193
+ "eval_hamming_loss": 0.075,
194
+ "eval_loss": 0.7701398134231567,
195
+ "eval_macro_f1": 0.0784313725490196,
196
+ "eval_macro_precision": 0.1092436974789916,
197
+ "eval_macro_recall": 0.06149732620320855,
198
+ "eval_micro_f1": 0.2153846153846154,
199
+ "eval_micro_precision": 0.875,
200
+ "eval_micro_recall": 0.12280701754385964,
201
+ "eval_runtime": 0.219,
202
+ "eval_samples_per_second": 182.685,
203
+ "eval_steps_per_second": 13.701,
204
+ "step": 250
205
+ },
206
+ {
207
+ "epoch": 25.0,
208
+ "step": 250,
209
+ "train/classification_loss": 0.25078481435775757,
210
+ "train/contrastive_loss": 0.9467111825942993,
211
+ "train/negative_loss": 0.8433182835578918,
212
+ "train/num_negatives": 198,
213
+ "train/num_positives": 40,
214
+ "train/positive_loss": 0.10339287668466568,
215
+ "train/total_loss": 0.44012707471847534
216
+ },
217
+ {
218
+ "epoch": 25.0,
219
+ "step": 250,
220
+ "train/classification_loss": 0.23322956264019012,
221
+ "train/contrastive_loss": 0.4987642168998718,
222
+ "train/negative_loss": 0.48307880759239197,
223
+ "train/num_negatives": 172,
224
+ "train/num_positives": 68,
225
+ "train/positive_loss": 0.015685414895415306,
226
+ "train/total_loss": 0.3329824209213257
227
+ },
228
+ {
229
+ "epoch": 30.0,
230
+ "grad_norm": 11.7496976852417,
231
+ "learning_rate": 1.0100000000000002e-05,
232
+ "loss": 0.7395,
233
+ "step": 300
234
+ },
235
+ {
236
+ "epoch": 30.0,
237
+ "step": 300,
238
+ "train/classification_loss": 0.22414086759090424,
239
+ "train/contrastive_loss": 0.9544009566307068,
240
+ "train/negative_loss": 0.6044885516166687,
241
+ "train/num_negatives": 186,
242
+ "train/num_positives": 54,
243
+ "train/positive_loss": 0.3499124050140381,
244
+ "train/total_loss": 0.41502106189727783
245
+ },
246
+ {
247
+ "epoch": 30.0,
248
+ "step": 300,
249
+ "train/classification_loss": 0.21396367251873016,
250
+ "train/contrastive_loss": 0.4959838390350342,
251
+ "train/negative_loss": 0.4717627763748169,
252
+ "train/num_negatives": 198,
253
+ "train/num_positives": 42,
254
+ "train/positive_loss": 0.02422107383608818,
255
+ "train/total_loss": 0.3131604492664337
256
+ },
257
+ {
258
+ "epoch": 35.0,
259
+ "grad_norm": 5.532268047332764,
260
+ "learning_rate": 7.600000000000001e-06,
261
+ "loss": 0.6737,
262
+ "step": 350
263
+ },
264
+ {
265
+ "epoch": 35.0,
266
+ "step": 350,
267
+ "train/classification_loss": 0.21886315941810608,
268
+ "train/contrastive_loss": 0.5652549266815186,
269
+ "train/negative_loss": 0.4682881832122803,
270
+ "train/num_negatives": 172,
271
+ "train/num_positives": 68,
272
+ "train/positive_loss": 0.09696672856807709,
273
+ "train/total_loss": 0.33191415667533875
274
+ },
275
+ {
276
+ "epoch": 35.0,
277
+ "step": 350,
278
+ "train/classification_loss": 0.1887310892343521,
279
+ "train/contrastive_loss": 0.18129800260066986,
280
+ "train/negative_loss": 0.17543496191501617,
281
+ "train/num_negatives": 152,
282
+ "train/num_positives": 88,
283
+ "train/positive_loss": 0.005863038823008537,
284
+ "train/total_loss": 0.22499069571495056
285
+ },
286
+ {
287
+ "epoch": 40.0,
288
+ "grad_norm": 5.668190002441406,
289
+ "learning_rate": 5.1e-06,
290
+ "loss": 0.6269,
291
+ "step": 400
292
+ },
293
+ {
294
+ "epoch": 40.0,
295
+ "step": 400,
296
+ "train/classification_loss": 0.18238115310668945,
297
+ "train/contrastive_loss": 0.33620232343673706,
298
+ "train/negative_loss": 0.2550373077392578,
299
+ "train/num_negatives": 156,
300
+ "train/num_positives": 84,
301
+ "train/positive_loss": 0.08116500079631805,
302
+ "train/total_loss": 0.24962162971496582
303
+ },
304
+ {
305
+ "epoch": 40.0,
306
+ "step": 400,
307
+ "train/classification_loss": 0.22312195599079132,
308
+ "train/contrastive_loss": 0.6893786191940308,
309
+ "train/negative_loss": 0.6744635105133057,
310
+ "train/num_negatives": 206,
311
+ "train/num_positives": 34,
312
+ "train/positive_loss": 0.014915116131305695,
313
+ "train/total_loss": 0.36099767684936523
314
+ },
315
+ {
316
+ "epoch": 45.0,
317
+ "grad_norm": 4.168755054473877,
318
+ "learning_rate": 2.6e-06,
319
+ "loss": 0.6025,
320
+ "step": 450
321
+ },
322
+ {
323
+ "epoch": 45.0,
324
+ "step": 450,
325
+ "train/classification_loss": 0.1997791826725006,
326
+ "train/contrastive_loss": 0.606022834777832,
327
+ "train/negative_loss": 0.5423316955566406,
328
+ "train/num_negatives": 216,
329
+ "train/num_positives": 24,
330
+ "train/positive_loss": 0.0636911541223526,
331
+ "train/total_loss": 0.32098376750946045
332
+ },
333
+ {
334
+ "epoch": 45.0,
335
+ "step": 450,
336
+ "train/classification_loss": 0.21638567745685577,
337
+ "train/contrastive_loss": 0.37323933839797974,
338
+ "train/negative_loss": 0.3397449851036072,
339
+ "train/num_negatives": 164,
340
+ "train/num_positives": 62,
341
+ "train/positive_loss": 0.03349434956908226,
342
+ "train/total_loss": 0.291033536195755
343
+ },
344
+ {
345
+ "epoch": 50.0,
346
+ "grad_norm": 5.069293022155762,
347
+ "learning_rate": 1.0000000000000001e-07,
348
+ "loss": 0.58,
349
+ "step": 500
350
+ },
351
+ {
352
+ "epoch": 50.0,
353
+ "step": 500,
354
+ "train/classification_loss": 0.22550146281719208,
355
+ "train/contrastive_loss": 2.474167823791504,
356
+ "train/negative_loss": 1.7999987602233887,
357
+ "train/num_negatives": 196,
358
+ "train/num_positives": 44,
359
+ "train/positive_loss": 0.6741690635681152,
360
+ "train/total_loss": 0.7203350067138672
361
+ },
362
+ {
363
+ "epoch": 50.0,
364
+ "step": 500,
365
+ "train/classification_loss": 0.23388545215129852,
366
+ "train/contrastive_loss": 3.272613763809204,
367
+ "train/negative_loss": 1.7668838500976562,
368
+ "train/num_negatives": 210,
369
+ "train/num_positives": 30,
370
+ "train/positive_loss": 1.5057299137115479,
371
+ "train/total_loss": 0.8884082436561584
372
+ },
373
+ {
374
+ "epoch": 50.0,
375
+ "step": 500,
376
+ "train/classification_loss": 0.2511661648750305,
377
+ "train/contrastive_loss": 0.6579197645187378,
378
+ "train/negative_loss": 0.537192702293396,
379
+ "train/num_negatives": 46,
380
+ "train/num_positives": 8,
381
+ "train/positive_loss": 0.12072707712650299,
382
+ "train/total_loss": 0.38275012373924255
383
+ },
384
+ {
385
+ "epoch": 50.0,
386
+ "eval_exact_match_accuracy": 0.2,
387
+ "eval_hamming_loss": 0.06911764705882353,
388
+ "eval_loss": 0.7200472950935364,
389
+ "eval_macro_f1": 0.11519607843137254,
390
+ "eval_macro_precision": 0.15826330532212887,
391
+ "eval_macro_recall": 0.09090909090909091,
392
+ "eval_micro_f1": 0.3380281690140845,
393
+ "eval_micro_precision": 0.8571428571428571,
394
+ "eval_micro_recall": 0.21052631578947367,
395
+ "eval_runtime": 0.219,
396
+ "eval_samples_per_second": 182.641,
397
+ "eval_steps_per_second": 13.698,
398
+ "step": 500
399
+ },
400
+ {
401
+ "epoch": 50.0,
402
+ "step": 500,
403
  "total_flos": 0.0,
404
+ "train_loss": 1.1355848159790038,
405
+ "train_runtime": 246.5817,
406
+ "train_samples_per_second": 64.076,
407
+ "train_steps_per_second": 2.028
408
  }
409
  ],
410
  "logging_steps": 50,
411
+ "max_steps": 500,
412
  "num_input_tokens_seen": 0,
413
+ "num_train_epochs": 50,
414
  "save_steps": 500,
415
  "stateful_callbacks": {
416
  "EarlyStoppingCallback": {
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03e2d330b9dd8fe925b85bea0db478c22579b7da080ec4cac0c4183a4c7358e0
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66e01343304a8027b49b07fccbfd92f2c7fc70a061de471b2412977d28ec9eac
3
  size 5368