maximuspowers commited on
Commit
a31df60
·
verified ·
1 Parent(s): 84f46c0

End of training

Browse files
README.md CHANGED
@@ -16,15 +16,15 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  This model is a fine-tuned version of [maximuspowers/bert-philosophy-adapted](https://huggingface.co/maximuspowers/bert-philosophy-adapted) on the None dataset.
18
  It achieves the following results on the evaluation set:
19
- - Loss: 0.8156
20
- - Exact Match Accuracy: 0.275
21
- - Macro Precision: 0.1574
22
- - Macro Recall: 0.1134
23
- - Macro F1: 0.1298
24
- - Micro Precision: 0.8421
25
- - Micro Recall: 0.2807
26
- - Micro F1: 0.4211
27
- - Hamming Loss: 0.0647
28
 
29
  ## Model description
30
 
 
16
 
17
  This model is a fine-tuned version of [maximuspowers/bert-philosophy-adapted](https://huggingface.co/maximuspowers/bert-philosophy-adapted) on the None dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Loss: 0.5291
20
+ - Exact Match Accuracy: 0.4
21
+ - Macro Precision: 0.1658
22
+ - Macro Recall: 0.1265
23
+ - Macro F1: 0.1410
24
+ - Micro Precision: 0.92
25
+ - Micro Recall: 0.4035
26
+ - Micro F1: 0.5610
27
+ - Hamming Loss: 0.0529
28
 
29
  ## Model description
30
 
all_results.json CHANGED
@@ -1,17 +1,17 @@
1
  {
2
  "epoch": 45.0,
3
- "eval_exact_match_accuracy": 0.375,
4
  "eval_hamming_loss": 0.052941176470588235,
5
- "eval_loss": 0.5750908255577087,
6
- "eval_macro_f1": 0.13746934180370715,
7
- "eval_macro_precision": 0.17058823529411765,
8
- "eval_macro_recall": 0.12058823529411763,
9
- "eval_micro_f1": 0.55,
10
- "eval_micro_precision": 0.9565217391304348,
11
- "eval_micro_recall": 0.38596491228070173,
12
- "eval_runtime": 0.2248,
13
- "eval_samples_per_second": 177.928,
14
- "eval_steps_per_second": 13.345,
15
  "total_flos": 0.0,
16
  "train_loss": 0.9705644819471572,
17
  "train_runtime": 232.6541,
 
1
  {
2
  "epoch": 45.0,
3
+ "eval_exact_match_accuracy": 0.4,
4
  "eval_hamming_loss": 0.052941176470588235,
5
+ "eval_loss": 0.5290737152099609,
6
+ "eval_macro_f1": 0.14097904608067482,
7
+ "eval_macro_precision": 0.1657754010695187,
8
+ "eval_macro_recall": 0.1264705882352941,
9
+ "eval_micro_f1": 0.5609756097560976,
10
+ "eval_micro_precision": 0.92,
11
+ "eval_micro_recall": 0.40350877192982454,
12
+ "eval_runtime": 0.2121,
13
+ "eval_samples_per_second": 188.615,
14
+ "eval_steps_per_second": 23.577,
15
  "total_flos": 0.0,
16
  "train_loss": 0.9705644819471572,
17
  "train_runtime": 232.6541,
runs/Jun15_00-47-20_92b2e0e6fb20/events.out.tfevents.1749948709.92b2e0e6fb20.2194.11 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3a7d599c58099528756c657a42cc2454d32e18e282d12c575b8ebcb6fe242d8
3
+ size 5853
test_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
- "epoch": 50.0,
3
- "eval_exact_match_accuracy": 0.375,
4
  "eval_hamming_loss": 0.052941176470588235,
5
- "eval_loss": 0.5750908255577087,
6
- "eval_macro_f1": 0.13746934180370715,
7
- "eval_macro_precision": 0.17058823529411765,
8
- "eval_macro_recall": 0.12058823529411763,
9
- "eval_micro_f1": 0.55,
10
- "eval_micro_precision": 0.9565217391304348,
11
- "eval_micro_recall": 0.38596491228070173,
12
- "eval_runtime": 0.2248,
13
- "eval_samples_per_second": 177.928,
14
- "eval_steps_per_second": 13.345
15
  }
 
1
  {
2
+ "epoch": 45.0,
3
+ "eval_exact_match_accuracy": 0.4,
4
  "eval_hamming_loss": 0.052941176470588235,
5
+ "eval_loss": 0.5290737152099609,
6
+ "eval_macro_f1": 0.14097904608067482,
7
+ "eval_macro_precision": 0.1657754010695187,
8
+ "eval_macro_recall": 0.1264705882352941,
9
+ "eval_micro_f1": 0.5609756097560976,
10
+ "eval_micro_precision": 0.92,
11
+ "eval_micro_recall": 0.40350877192982454,
12
+ "eval_runtime": 0.2121,
13
+ "eval_samples_per_second": 188.615,
14
+ "eval_steps_per_second": 23.577
15
  }
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 500,
3
- "best_metric": 0.3380281690140845,
4
- "best_model_checkpoint": "./bert-philosophy-classifier/checkpoint-500",
5
- "epoch": 50.0,
6
- "eval_steps": 250,
7
- "global_step": 500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -12,406 +12,1179 @@
12
  {
13
  "epoch": 0,
14
  "step": 0,
15
- "train/classification_loss": 0.679158627986908,
16
- "train/contrastive_loss": 9.516982078552246,
17
- "train/negative_loss": 9.516908645629883,
18
- "train/num_negatives": 190,
19
- "train/num_positives": 50,
20
- "train/positive_loss": 7.310241926461458e-05,
21
- "train/total_loss": 2.582555055618286
22
  },
23
  {
24
  "epoch": 0,
25
  "step": 0,
26
- "train/classification_loss": 0.6693864464759827,
27
- "train/contrastive_loss": 9.331222534179688,
28
- "train/negative_loss": 9.331130981445312,
29
- "train/num_negatives": 192,
30
- "train/num_positives": 48,
31
- "train/positive_loss": 9.195055463351309e-05,
32
- "train/total_loss": 2.535630941390991
33
  },
34
  {
35
- "epoch": 5.0,
36
- "grad_norm": 17.358003616333008,
37
  "learning_rate": 9.600000000000001e-06,
38
- "loss": 3.6257,
39
  "step": 50
40
  },
41
  {
42
- "epoch": 5.0,
43
  "step": 50,
44
- "train/classification_loss": 0.6362661719322205,
45
- "train/contrastive_loss": 1.4868279695510864,
46
- "train/negative_loss": 1.3825407028198242,
47
- "train/num_negatives": 170,
48
- "train/num_positives": 66,
49
- "train/positive_loss": 0.10428724437952042,
50
- "train/total_loss": 0.9336317777633667
51
  },
52
  {
53
- "epoch": 5.0,
54
  "step": 50,
55
- "train/classification_loss": 0.6401901245117188,
56
- "train/contrastive_loss": 1.6219159364700317,
57
- "train/negative_loss": 1.440779447555542,
58
- "train/num_negatives": 186,
59
- "train/num_positives": 54,
60
- "train/positive_loss": 0.18113651871681213,
61
- "train/total_loss": 0.964573323726654
62
  },
63
  {
64
- "epoch": 10.0,
65
- "grad_norm": 20.2652645111084,
66
  "learning_rate": 1.9600000000000002e-05,
67
- "loss": 1.6163,
68
  "step": 100
69
  },
70
  {
71
- "epoch": 10.0,
72
  "step": 100,
73
- "train/classification_loss": 0.44592994451522827,
74
- "train/contrastive_loss": 0.9996287822723389,
75
- "train/negative_loss": 0.8505972027778625,
76
- "train/num_negatives": 190,
77
- "train/num_positives": 50,
78
- "train/positive_loss": 0.1490315943956375,
79
- "train/total_loss": 0.645855724811554
80
  },
81
  {
82
- "epoch": 10.0,
83
  "step": 100,
84
- "train/classification_loss": 0.4469062089920044,
85
- "train/contrastive_loss": 1.1934728622436523,
86
- "train/negative_loss": 0.9309344291687012,
87
- "train/num_negatives": 206,
88
- "train/num_positives": 30,
89
- "train/positive_loss": 0.26253849267959595,
90
- "train/total_loss": 0.685600757598877
91
  },
92
  {
93
- "epoch": 15.0,
94
- "grad_norm": 8.280580520629883,
95
- "learning_rate": 1.76e-05,
96
- "loss": 1.1607,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  "step": 150
98
  },
99
  {
100
- "epoch": 15.0,
101
  "step": 150,
102
- "train/classification_loss": 0.32877856492996216,
103
- "train/contrastive_loss": 1.1336479187011719,
104
- "train/negative_loss": 0.9981224536895752,
105
- "train/num_negatives": 170,
106
- "train/num_positives": 66,
107
- "train/positive_loss": 0.1355254054069519,
108
- "train/total_loss": 0.5555081367492676
109
  },
110
  {
111
- "epoch": 15.0,
112
  "step": 150,
113
- "train/classification_loss": 0.3285317122936249,
114
- "train/contrastive_loss": 0.8869010210037231,
115
- "train/negative_loss": 0.591576099395752,
116
- "train/num_negatives": 202,
117
- "train/num_positives": 38,
118
- "train/positive_loss": 0.2953248918056488,
119
- "train/total_loss": 0.5059119462966919
120
  },
121
  {
122
- "epoch": 20.0,
123
- "grad_norm": 7.707197189331055,
124
- "learning_rate": 1.5100000000000001e-05,
125
- "loss": 0.9196,
126
  "step": 200
127
  },
128
  {
129
- "epoch": 20.0,
130
  "step": 200,
131
- "train/classification_loss": 0.293140172958374,
132
- "train/contrastive_loss": 0.7223706245422363,
133
- "train/negative_loss": 0.5778605937957764,
134
- "train/num_negatives": 202,
135
- "train/num_positives": 30,
136
- "train/positive_loss": 0.14451001584529877,
137
- "train/total_loss": 0.4376143217086792
138
  },
139
  {
140
- "epoch": 20.0,
141
  "step": 200,
142
- "train/classification_loss": 0.2644300162792206,
143
- "train/contrastive_loss": 0.4585617780685425,
144
- "train/negative_loss": 0.39372602105140686,
145
- "train/num_negatives": 184,
146
- "train/num_positives": 56,
147
- "train/positive_loss": 0.06483575701713562,
148
- "train/total_loss": 0.3561423718929291
149
  },
150
  {
151
- "epoch": 25.0,
152
- "grad_norm": 6.953479766845703,
153
- "learning_rate": 1.2600000000000001e-05,
154
- "loss": 0.811,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  "step": 250
156
  },
157
  {
158
- "epoch": 25.0,
159
  "step": 250,
160
- "train/classification_loss": 0.2595597505569458,
161
- "train/contrastive_loss": 2.3272764682769775,
162
- "train/negative_loss": 1.8330672979354858,
163
- "train/num_negatives": 196,
164
- "train/num_positives": 44,
165
- "train/positive_loss": 0.4942092299461365,
166
- "train/total_loss": 0.7250150442123413
167
  },
168
  {
169
- "epoch": 25.0,
170
  "step": 250,
171
- "train/classification_loss": 0.2660799026489258,
172
- "train/contrastive_loss": 3.3698394298553467,
173
- "train/negative_loss": 1.8154842853546143,
174
- "train/num_negatives": 210,
175
- "train/num_positives": 30,
176
- "train/positive_loss": 1.5543551445007324,
177
- "train/total_loss": 0.9400478005409241
178
  },
179
  {
180
- "epoch": 25.0,
181
- "step": 250,
182
- "train/classification_loss": 0.2840481698513031,
183
- "train/contrastive_loss": 1.1826257705688477,
184
- "train/negative_loss": 1.1373339891433716,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  "train/num_negatives": 46,
186
  "train/num_positives": 8,
187
- "train/positive_loss": 0.045291826128959656,
188
- "train/total_loss": 0.5205733180046082
189
  },
190
  {
191
- "epoch": 25.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  "eval_exact_match_accuracy": 0.1,
193
  "eval_hamming_loss": 0.075,
194
- "eval_loss": 0.7701398134231567,
195
  "eval_macro_f1": 0.0784313725490196,
196
  "eval_macro_precision": 0.1092436974789916,
197
  "eval_macro_recall": 0.06149732620320855,
198
  "eval_micro_f1": 0.2153846153846154,
199
  "eval_micro_precision": 0.875,
200
  "eval_micro_recall": 0.12280701754385964,
201
- "eval_runtime": 0.219,
202
- "eval_samples_per_second": 182.685,
203
- "eval_steps_per_second": 13.701,
204
- "step": 250
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  },
206
  {
207
  "epoch": 25.0,
208
- "step": 250,
209
- "train/classification_loss": 0.25078481435775757,
210
- "train/contrastive_loss": 0.9467111825942993,
211
- "train/negative_loss": 0.8433182835578918,
212
- "train/num_negatives": 198,
213
- "train/num_positives": 40,
214
- "train/positive_loss": 0.10339287668466568,
215
- "train/total_loss": 0.44012707471847534
216
  },
217
  {
218
  "epoch": 25.0,
219
- "step": 250,
220
- "train/classification_loss": 0.23322956264019012,
221
- "train/contrastive_loss": 0.4987642168998718,
222
- "train/negative_loss": 0.48307880759239197,
223
- "train/num_negatives": 172,
224
- "train/num_positives": 68,
225
- "train/positive_loss": 0.015685414895415306,
226
- "train/total_loss": 0.3329824209213257
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  },
228
  {
229
  "epoch": 30.0,
230
- "grad_norm": 11.7496976852417,
231
- "learning_rate": 1.0100000000000002e-05,
232
- "loss": 0.7395,
233
- "step": 300
234
  },
235
  {
236
  "epoch": 30.0,
237
- "step": 300,
238
- "train/classification_loss": 0.22414086759090424,
239
- "train/contrastive_loss": 0.9544009566307068,
240
- "train/negative_loss": 0.6044885516166687,
241
- "train/num_negatives": 186,
242
- "train/num_positives": 54,
243
- "train/positive_loss": 0.3499124050140381,
244
- "train/total_loss": 0.41502106189727783
245
  },
246
  {
247
  "epoch": 30.0,
248
- "step": 300,
249
- "train/classification_loss": 0.21396367251873016,
250
- "train/contrastive_loss": 0.4959838390350342,
251
- "train/negative_loss": 0.4717627763748169,
252
- "train/num_negatives": 198,
253
- "train/num_positives": 42,
254
- "train/positive_loss": 0.02422107383608818,
255
- "train/total_loss": 0.3131604492664337
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
  },
257
  {
258
  "epoch": 35.0,
259
- "grad_norm": 5.532268047332764,
260
- "learning_rate": 7.600000000000001e-06,
261
- "loss": 0.6737,
262
- "step": 350
263
  },
264
  {
265
  "epoch": 35.0,
266
- "step": 350,
267
- "train/classification_loss": 0.21886315941810608,
268
- "train/contrastive_loss": 0.5652549266815186,
269
- "train/negative_loss": 0.4682881832122803,
270
- "train/num_negatives": 172,
271
- "train/num_positives": 68,
272
- "train/positive_loss": 0.09696672856807709,
273
- "train/total_loss": 0.33191415667533875
274
  },
275
  {
276
  "epoch": 35.0,
277
- "step": 350,
278
- "train/classification_loss": 0.1887310892343521,
279
- "train/contrastive_loss": 0.18129800260066986,
280
- "train/negative_loss": 0.17543496191501617,
281
- "train/num_negatives": 152,
282
- "train/num_positives": 88,
283
- "train/positive_loss": 0.005863038823008537,
284
- "train/total_loss": 0.22499069571495056
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  },
286
  {
287
  "epoch": 40.0,
288
- "grad_norm": 5.668190002441406,
289
- "learning_rate": 5.1e-06,
290
- "loss": 0.6269,
291
- "step": 400
292
  },
293
  {
294
  "epoch": 40.0,
295
- "step": 400,
296
- "train/classification_loss": 0.18238115310668945,
297
- "train/contrastive_loss": 0.33620232343673706,
298
- "train/negative_loss": 0.2550373077392578,
299
- "train/num_negatives": 156,
300
- "train/num_positives": 84,
301
- "train/positive_loss": 0.08116500079631805,
302
- "train/total_loss": 0.24962162971496582
303
  },
304
  {
305
  "epoch": 40.0,
306
- "step": 400,
307
- "train/classification_loss": 0.22312195599079132,
308
- "train/contrastive_loss": 0.6893786191940308,
309
- "train/negative_loss": 0.6744635105133057,
310
- "train/num_negatives": 206,
311
- "train/num_positives": 34,
312
- "train/positive_loss": 0.014915116131305695,
313
- "train/total_loss": 0.36099767684936523
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  },
315
  {
316
  "epoch": 45.0,
317
- "grad_norm": 4.168755054473877,
318
- "learning_rate": 2.6e-06,
319
- "loss": 0.6025,
320
- "step": 450
321
  },
322
  {
323
  "epoch": 45.0,
324
- "step": 450,
325
- "train/classification_loss": 0.1997791826725006,
326
- "train/contrastive_loss": 0.606022834777832,
327
- "train/negative_loss": 0.5423316955566406,
328
- "train/num_negatives": 216,
329
- "train/num_positives": 24,
330
- "train/positive_loss": 0.0636911541223526,
331
- "train/total_loss": 0.32098376750946045
332
  },
333
  {
334
  "epoch": 45.0,
335
- "step": 450,
336
- "train/classification_loss": 0.21638567745685577,
337
- "train/contrastive_loss": 0.37323933839797974,
338
- "train/negative_loss": 0.3397449851036072,
339
- "train/num_negatives": 164,
340
- "train/num_positives": 62,
341
- "train/positive_loss": 0.03349434956908226,
342
- "train/total_loss": 0.291033536195755
343
- },
344
- {
345
- "epoch": 50.0,
346
- "grad_norm": 5.069293022155762,
347
- "learning_rate": 1.0000000000000001e-07,
348
- "loss": 0.58,
349
- "step": 500
350
  },
351
  {
352
- "epoch": 50.0,
353
- "step": 500,
354
- "train/classification_loss": 0.22550146281719208,
355
- "train/contrastive_loss": 2.474167823791504,
356
- "train/negative_loss": 1.7999987602233887,
357
- "train/num_negatives": 196,
358
- "train/num_positives": 44,
359
- "train/positive_loss": 0.6741690635681152,
360
- "train/total_loss": 0.7203350067138672
361
  },
362
  {
363
- "epoch": 50.0,
364
- "step": 500,
365
- "train/classification_loss": 0.23388545215129852,
366
- "train/contrastive_loss": 3.272613763809204,
367
- "train/negative_loss": 1.7668838500976562,
368
- "train/num_negatives": 210,
369
- "train/num_positives": 30,
370
- "train/positive_loss": 1.5057299137115479,
371
- "train/total_loss": 0.8884082436561584
372
  },
373
  {
374
- "epoch": 50.0,
375
- "step": 500,
376
- "train/classification_loss": 0.2511661648750305,
377
- "train/contrastive_loss": 0.6579197645187378,
378
- "train/negative_loss": 0.537192702293396,
379
  "train/num_negatives": 46,
380
  "train/num_positives": 8,
381
- "train/positive_loss": 0.12072707712650299,
382
- "train/total_loss": 0.38275012373924255
383
- },
384
- {
385
- "epoch": 50.0,
386
- "eval_exact_match_accuracy": 0.2,
387
- "eval_hamming_loss": 0.06911764705882353,
388
- "eval_loss": 0.7200472950935364,
389
- "eval_macro_f1": 0.11519607843137254,
390
- "eval_macro_precision": 0.15826330532212887,
391
- "eval_macro_recall": 0.09090909090909091,
392
- "eval_micro_f1": 0.3380281690140845,
393
- "eval_micro_precision": 0.8571428571428571,
394
- "eval_micro_recall": 0.21052631578947367,
395
- "eval_runtime": 0.219,
396
- "eval_samples_per_second": 182.641,
397
- "eval_steps_per_second": 13.698,
398
- "step": 500
399
  },
400
  {
401
- "epoch": 50.0,
402
- "step": 500,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
403
  "total_flos": 0.0,
404
- "train_loss": 1.1355848159790038,
405
- "train_runtime": 246.5817,
406
- "train_samples_per_second": 64.076,
407
- "train_steps_per_second": 2.028
408
  }
409
  ],
410
  "logging_steps": 50,
411
- "max_steps": 500,
412
  "num_input_tokens_seen": 0,
413
  "num_train_epochs": 50,
414
- "save_steps": 500,
415
  "stateful_callbacks": {
416
  "EarlyStoppingCallback": {
417
  "args": {
@@ -427,14 +1200,14 @@
427
  "should_epoch_stop": false,
428
  "should_evaluate": false,
429
  "should_log": false,
430
- "should_save": true,
431
- "should_training_stop": true
432
  },
433
  "attributes": {}
434
  }
435
  },
436
  "total_flos": 0.0,
437
- "train_batch_size": 16,
438
  "trial_name": null,
439
  "trial_params": null
440
  }
 
1
  {
2
+ "best_global_step": 600,
3
+ "best_metric": 0.42105263157894735,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 45.0,
6
+ "eval_steps": 100,
7
+ "global_step": 900,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
12
  {
13
  "epoch": 0,
14
  "step": 0,
15
+ "train/classification_loss": 0.7007833123207092,
16
+ "train/contrastive_loss": 9.572936058044434,
17
+ "train/negative_loss": 9.572856903076172,
18
+ "train/num_negatives": 46,
19
+ "train/num_positives": 10,
20
+ "train/positive_loss": 7.908708357717842e-05,
21
+ "train/total_loss": 2.615370512008667
22
  },
23
  {
24
  "epoch": 0,
25
  "step": 0,
26
+ "train/classification_loss": 0.7040252089500427,
27
+ "train/contrastive_loss": 9.725648880004883,
28
+ "train/negative_loss": 9.725597381591797,
29
+ "train/num_negatives": 46,
30
+ "train/num_positives": 10,
31
+ "train/positive_loss": 5.152364246896468e-05,
32
+ "train/total_loss": 2.6491549015045166
33
  },
34
  {
35
+ "epoch": 2.5,
36
+ "grad_norm": 18.165754318237305,
37
  "learning_rate": 9.600000000000001e-06,
38
+ "loss": 3.7884,
39
  "step": 50
40
  },
41
  {
42
+ "epoch": 2.5,
43
  "step": 50,
44
+ "train/classification_loss": 0.6516271233558655,
45
+ "train/contrastive_loss": 1.8623473644256592,
46
+ "train/negative_loss": 1.4998806715011597,
47
+ "train/num_negatives": 38,
48
+ "train/num_positives": 18,
49
+ "train/positive_loss": 0.36246663331985474,
50
+ "train/total_loss": 1.0240966081619263
51
  },
52
  {
53
+ "epoch": 2.5,
54
  "step": 50,
55
+ "train/classification_loss": 0.6451266407966614,
56
+ "train/contrastive_loss": 1.7446768283843994,
57
+ "train/negative_loss": 1.668500304222107,
58
+ "train/num_negatives": 50,
59
+ "train/num_positives": 6,
60
+ "train/positive_loss": 0.07617650926113129,
61
+ "train/total_loss": 0.9940620064735413
62
  },
63
  {
64
+ "epoch": 5.0,
65
+ "grad_norm": 16.30499839782715,
66
  "learning_rate": 1.9600000000000002e-05,
67
+ "loss": 1.7889,
68
  "step": 100
69
  },
70
  {
71
+ "epoch": 5.0,
72
  "step": 100,
73
+ "train/classification_loss": 0.4927652180194855,
74
+ "train/contrastive_loss": 2.06559419631958,
75
+ "train/negative_loss": 1.8223981857299805,
76
+ "train/num_negatives": 44,
77
+ "train/num_positives": 12,
78
+ "train/positive_loss": 0.24319612979888916,
79
+ "train/total_loss": 0.9058840274810791
80
  },
81
  {
82
+ "epoch": 5.0,
83
  "step": 100,
84
+ "train/classification_loss": 0.5166366100311279,
85
+ "train/contrastive_loss": 3.1131491661071777,
86
+ "train/negative_loss": 3.112612724304199,
87
+ "train/num_negatives": 44,
88
+ "train/num_positives": 12,
89
+ "train/positive_loss": 0.0005365243996493518,
90
+ "train/total_loss": 1.1392664909362793
91
  },
92
  {
93
+ "epoch": 5.0,
94
+ "step": 100,
95
+ "train/classification_loss": 0.4922243058681488,
96
+ "train/contrastive_loss": 1.7773994207382202,
97
+ "train/negative_loss": 1.7757622003555298,
98
+ "train/num_negatives": 52,
99
+ "train/num_positives": 4,
100
+ "train/positive_loss": 0.0016372093232348561,
101
+ "train/total_loss": 0.8477041721343994
102
+ },
103
+ {
104
+ "epoch": 5.0,
105
+ "step": 100,
106
+ "train/classification_loss": 0.5021852850914001,
107
+ "train/contrastive_loss": 3.2608611583709717,
108
+ "train/negative_loss": 2.6507816314697266,
109
+ "train/num_negatives": 48,
110
+ "train/num_positives": 8,
111
+ "train/positive_loss": 0.6100795269012451,
112
+ "train/total_loss": 1.1543575525283813
113
+ },
114
+ {
115
+ "epoch": 5.0,
116
+ "step": 100,
117
+ "train/classification_loss": 0.4944652020931244,
118
+ "train/contrastive_loss": 2.343458652496338,
119
+ "train/negative_loss": 1.1261494159698486,
120
+ "train/num_negatives": 46,
121
+ "train/num_positives": 8,
122
+ "train/positive_loss": 1.2173092365264893,
123
+ "train/total_loss": 0.9631569385528564
124
+ },
125
+ {
126
+ "epoch": 5.0,
127
+ "eval_exact_match_accuracy": 0.0,
128
+ "eval_hamming_loss": 0.08529411764705883,
129
+ "eval_loss": 1.002073884010315,
130
+ "eval_macro_f1": 0.0,
131
+ "eval_macro_precision": 0.0,
132
+ "eval_macro_recall": 0.0,
133
+ "eval_micro_f1": 0.0,
134
+ "eval_micro_precision": 0.0,
135
+ "eval_micro_recall": 0.0,
136
+ "eval_runtime": 0.2122,
137
+ "eval_samples_per_second": 188.524,
138
+ "eval_steps_per_second": 23.566,
139
+ "step": 100
140
+ },
141
+ {
142
+ "epoch": 5.0,
143
+ "step": 100,
144
+ "train/classification_loss": 0.5136489272117615,
145
+ "train/contrastive_loss": 1.3931580781936646,
146
+ "train/negative_loss": 1.0857487916946411,
147
+ "train/num_negatives": 42,
148
+ "train/num_positives": 12,
149
+ "train/positive_loss": 0.30740925669670105,
150
+ "train/total_loss": 0.7922805547714233
151
+ },
152
+ {
153
+ "epoch": 5.0,
154
+ "step": 100,
155
+ "train/classification_loss": 0.5019432902336121,
156
+ "train/contrastive_loss": 0.7777740955352783,
157
+ "train/negative_loss": 0.5738040804862976,
158
+ "train/num_negatives": 40,
159
+ "train/num_positives": 16,
160
+ "train/positive_loss": 0.20396998524665833,
161
+ "train/total_loss": 0.6574981212615967
162
+ },
163
+ {
164
+ "epoch": 7.5,
165
+ "grad_norm": 12.862069129943848,
166
+ "learning_rate": 1.8933333333333334e-05,
167
+ "loss": 1.396,
168
  "step": 150
169
  },
170
  {
171
+ "epoch": 7.5,
172
  "step": 150,
173
+ "train/classification_loss": 0.3993939161300659,
174
+ "train/contrastive_loss": 1.1298128366470337,
175
+ "train/negative_loss": 0.8979541659355164,
176
+ "train/num_negatives": 44,
177
+ "train/num_positives": 8,
178
+ "train/positive_loss": 0.23185870051383972,
179
+ "train/total_loss": 0.6253564953804016
180
  },
181
  {
182
+ "epoch": 7.5,
183
  "step": 150,
184
+ "train/classification_loss": 0.393746018409729,
185
+ "train/contrastive_loss": 1.2817302942276,
186
+ "train/negative_loss": 1.1390491724014282,
187
+ "train/num_negatives": 44,
188
+ "train/num_positives": 10,
189
+ "train/positive_loss": 0.14268112182617188,
190
+ "train/total_loss": 0.6500921249389648
191
  },
192
  {
193
+ "epoch": 10.0,
194
+ "grad_norm": 23.76511573791504,
195
+ "learning_rate": 1.782222222222222e-05,
196
+ "loss": 1.156,
197
  "step": 200
198
  },
199
  {
200
+ "epoch": 10.0,
201
  "step": 200,
202
+ "train/classification_loss": 0.30127307772636414,
203
+ "train/contrastive_loss": 1.8966163396835327,
204
+ "train/negative_loss": 1.7765212059020996,
205
+ "train/num_negatives": 44,
206
+ "train/num_positives": 12,
207
+ "train/positive_loss": 0.12009509652853012,
208
+ "train/total_loss": 0.6805963516235352
209
  },
210
  {
211
+ "epoch": 10.0,
212
  "step": 200,
213
+ "train/classification_loss": 0.3545684218406677,
214
+ "train/contrastive_loss": 3.0673155784606934,
215
+ "train/negative_loss": 3.067002058029175,
216
+ "train/num_negatives": 44,
217
+ "train/num_positives": 12,
218
+ "train/positive_loss": 0.0003136020968668163,
219
+ "train/total_loss": 0.9680315256118774
220
  },
221
  {
222
+ "epoch": 10.0,
223
+ "step": 200,
224
+ "train/classification_loss": 0.32993215322494507,
225
+ "train/contrastive_loss": 2.723947048187256,
226
+ "train/negative_loss": 2.7232000827789307,
227
+ "train/num_negatives": 52,
228
+ "train/num_positives": 4,
229
+ "train/positive_loss": 0.0007470683194696903,
230
+ "train/total_loss": 0.8747215867042542
231
+ },
232
+ {
233
+ "epoch": 10.0,
234
+ "step": 200,
235
+ "train/classification_loss": 0.32662659883499146,
236
+ "train/contrastive_loss": 3.6181204319000244,
237
+ "train/negative_loss": 3.4868171215057373,
238
+ "train/num_negatives": 48,
239
+ "train/num_positives": 8,
240
+ "train/positive_loss": 0.1313033103942871,
241
+ "train/total_loss": 1.050250768661499
242
+ },
243
+ {
244
+ "epoch": 10.0,
245
+ "step": 200,
246
+ "train/classification_loss": 0.33877384662628174,
247
+ "train/contrastive_loss": 2.0167577266693115,
248
+ "train/negative_loss": 1.4009835720062256,
249
+ "train/num_negatives": 46,
250
+ "train/num_positives": 8,
251
+ "train/positive_loss": 0.6157740950584412,
252
+ "train/total_loss": 0.742125391960144
253
+ },
254
+ {
255
+ "epoch": 10.0,
256
+ "eval_exact_match_accuracy": 0.0,
257
+ "eval_hamming_loss": 0.0838235294117647,
258
+ "eval_loss": 0.8631451725959778,
259
+ "eval_macro_f1": 0.0,
260
+ "eval_macro_precision": 0.0,
261
+ "eval_macro_recall": 0.0,
262
+ "eval_micro_f1": 0.0,
263
+ "eval_micro_precision": 0.0,
264
+ "eval_micro_recall": 0.0,
265
+ "eval_runtime": 0.202,
266
+ "eval_samples_per_second": 198.018,
267
+ "eval_steps_per_second": 24.752,
268
+ "step": 200
269
+ },
270
+ {
271
+ "epoch": 10.0,
272
+ "step": 200,
273
+ "train/classification_loss": 0.34425634145736694,
274
+ "train/contrastive_loss": 1.217971682548523,
275
+ "train/negative_loss": 0.8478565216064453,
276
+ "train/num_negatives": 42,
277
+ "train/num_positives": 14,
278
+ "train/positive_loss": 0.37011516094207764,
279
+ "train/total_loss": 0.5878506898880005
280
+ },
281
+ {
282
+ "epoch": 10.0,
283
+ "step": 200,
284
+ "train/classification_loss": 0.3122542202472687,
285
+ "train/contrastive_loss": 0.6875693798065186,
286
+ "train/negative_loss": 0.686568558216095,
287
+ "train/num_negatives": 42,
288
+ "train/num_positives": 14,
289
+ "train/positive_loss": 0.0010008324170485139,
290
+ "train/total_loss": 0.4497680962085724
291
+ },
292
+ {
293
+ "epoch": 12.5,
294
+ "grad_norm": 7.754025459289551,
295
+ "learning_rate": 1.6711111111111112e-05,
296
+ "loss": 1.0042,
297
  "step": 250
298
  },
299
  {
300
+ "epoch": 12.5,
301
  "step": 250,
302
+ "train/classification_loss": 0.24464763700962067,
303
+ "train/contrastive_loss": 0.6364108920097351,
304
+ "train/negative_loss": 0.4732590615749359,
305
+ "train/num_negatives": 44,
306
+ "train/num_positives": 12,
307
+ "train/positive_loss": 0.163151815533638,
308
+ "train/total_loss": 0.3719298243522644
309
  },
310
  {
311
+ "epoch": 12.5,
312
  "step": 250,
313
+ "train/classification_loss": 0.3077278137207031,
314
+ "train/contrastive_loss": 0.4854884147644043,
315
+ "train/negative_loss": 0.33287519216537476,
316
+ "train/num_negatives": 42,
317
+ "train/num_positives": 8,
318
+ "train/positive_loss": 0.15261322259902954,
319
+ "train/total_loss": 0.40482550859451294
320
  },
321
  {
322
+ "epoch": 15.0,
323
+ "grad_norm": 16.385419845581055,
324
+ "learning_rate": 1.5600000000000003e-05,
325
+ "loss": 0.8775,
326
+ "step": 300
327
+ },
328
+ {
329
+ "epoch": 15.0,
330
+ "step": 300,
331
+ "train/classification_loss": 0.23176752030849457,
332
+ "train/contrastive_loss": 3.0300073623657227,
333
+ "train/negative_loss": 2.5657095909118652,
334
+ "train/num_negatives": 44,
335
+ "train/num_positives": 12,
336
+ "train/positive_loss": 0.4642978310585022,
337
+ "train/total_loss": 0.8377690315246582
338
+ },
339
+ {
340
+ "epoch": 15.0,
341
+ "step": 300,
342
+ "train/classification_loss": 0.3002067804336548,
343
+ "train/contrastive_loss": 3.911193609237671,
344
+ "train/negative_loss": 3.911159038543701,
345
+ "train/num_negatives": 44,
346
+ "train/num_positives": 12,
347
+ "train/positive_loss": 3.4572090953588486e-05,
348
+ "train/total_loss": 1.082445502281189
349
+ },
350
+ {
351
+ "epoch": 15.0,
352
+ "step": 300,
353
+ "train/classification_loss": 0.2759508788585663,
354
+ "train/contrastive_loss": 2.758004665374756,
355
+ "train/negative_loss": 2.7472095489501953,
356
+ "train/num_negatives": 52,
357
+ "train/num_positives": 4,
358
+ "train/positive_loss": 0.010795066133141518,
359
+ "train/total_loss": 0.8275518417358398
360
+ },
361
+ {
362
+ "epoch": 15.0,
363
+ "step": 300,
364
+ "train/classification_loss": 0.27436333894729614,
365
+ "train/contrastive_loss": 4.6799540519714355,
366
+ "train/negative_loss": 4.251977920532227,
367
+ "train/num_negatives": 48,
368
+ "train/num_positives": 8,
369
+ "train/positive_loss": 0.4279760420322418,
370
+ "train/total_loss": 1.2103540897369385
371
+ },
372
+ {
373
+ "epoch": 15.0,
374
+ "step": 300,
375
+ "train/classification_loss": 0.2907729744911194,
376
+ "train/contrastive_loss": 2.066659927368164,
377
+ "train/negative_loss": 1.4751646518707275,
378
  "train/num_negatives": 46,
379
  "train/num_positives": 8,
380
+ "train/positive_loss": 0.5914952754974365,
381
+ "train/total_loss": 0.7041049599647522
382
  },
383
  {
384
+ "epoch": 15.0,
385
+ "eval_exact_match_accuracy": 0.05,
386
+ "eval_hamming_loss": 0.07647058823529412,
387
+ "eval_loss": 0.9324451684951782,
388
+ "eval_macro_f1": 0.03676470588235294,
389
+ "eval_macro_precision": 0.058823529411764705,
390
+ "eval_macro_recall": 0.026737967914438502,
391
+ "eval_micro_f1": 0.16129032258064516,
392
+ "eval_micro_precision": 1.0,
393
+ "eval_micro_recall": 0.08771929824561403,
394
+ "eval_runtime": 0.207,
395
+ "eval_samples_per_second": 193.242,
396
+ "eval_steps_per_second": 24.155,
397
+ "step": 300
398
+ },
399
+ {
400
+ "epoch": 15.0,
401
+ "step": 300,
402
+ "train/classification_loss": 0.26088976860046387,
403
+ "train/contrastive_loss": 0.5876651406288147,
404
+ "train/negative_loss": 0.5801823139190674,
405
+ "train/num_negatives": 32,
406
+ "train/num_positives": 20,
407
+ "train/positive_loss": 0.007482839282602072,
408
+ "train/total_loss": 0.3784227967262268
409
+ },
410
+ {
411
+ "epoch": 15.0,
412
+ "step": 300,
413
+ "train/classification_loss": 0.2548993229866028,
414
+ "train/contrastive_loss": 1.50383722782135,
415
+ "train/negative_loss": 1.4029262065887451,
416
+ "train/num_negatives": 46,
417
+ "train/num_positives": 10,
418
+ "train/positive_loss": 0.10091102123260498,
419
+ "train/total_loss": 0.5556668043136597
420
+ },
421
+ {
422
+ "epoch": 17.5,
423
+ "grad_norm": 16.8145809173584,
424
+ "learning_rate": 1.448888888888889e-05,
425
+ "loss": 0.827,
426
+ "step": 350
427
+ },
428
+ {
429
+ "epoch": 17.5,
430
+ "step": 350,
431
+ "train/classification_loss": 0.24073848128318787,
432
+ "train/contrastive_loss": 0.6602705717086792,
433
+ "train/negative_loss": 0.4261236786842346,
434
+ "train/num_negatives": 42,
435
+ "train/num_positives": 14,
436
+ "train/positive_loss": 0.23414692282676697,
437
+ "train/total_loss": 0.3727926015853882
438
+ },
439
+ {
440
+ "epoch": 17.5,
441
+ "step": 350,
442
+ "train/classification_loss": 0.2786425054073334,
443
+ "train/contrastive_loss": 0.29787567257881165,
444
+ "train/negative_loss": 0.2970171868801117,
445
+ "train/num_negatives": 44,
446
+ "train/num_positives": 6,
447
+ "train/positive_loss": 0.0008584868628531694,
448
+ "train/total_loss": 0.3382176458835602
449
+ },
450
+ {
451
+ "epoch": 20.0,
452
+ "grad_norm": 18.7548828125,
453
+ "learning_rate": 1.3377777777777778e-05,
454
+ "loss": 0.7747,
455
+ "step": 400
456
+ },
457
+ {
458
+ "epoch": 20.0,
459
+ "step": 400,
460
+ "train/classification_loss": 0.20234902203083038,
461
+ "train/contrastive_loss": 1.7551348209381104,
462
+ "train/negative_loss": 1.1972665786743164,
463
+ "train/num_negatives": 44,
464
+ "train/num_positives": 12,
465
+ "train/positive_loss": 0.557868242263794,
466
+ "train/total_loss": 0.5533760190010071
467
+ },
468
+ {
469
+ "epoch": 20.0,
470
+ "step": 400,
471
+ "train/classification_loss": 0.27940884232521057,
472
+ "train/contrastive_loss": 3.1547234058380127,
473
+ "train/negative_loss": 3.1508476734161377,
474
+ "train/num_negatives": 44,
475
+ "train/num_positives": 12,
476
+ "train/positive_loss": 0.0038756858557462692,
477
+ "train/total_loss": 0.9103535413742065
478
+ },
479
+ {
480
+ "epoch": 20.0,
481
+ "step": 400,
482
+ "train/classification_loss": 0.2563062608242035,
483
+ "train/contrastive_loss": 3.433103084564209,
484
+ "train/negative_loss": 3.146430253982544,
485
+ "train/num_negatives": 52,
486
+ "train/num_positives": 4,
487
+ "train/positive_loss": 0.2866727411746979,
488
+ "train/total_loss": 0.9429268836975098
489
+ },
490
+ {
491
+ "epoch": 20.0,
492
+ "step": 400,
493
+ "train/classification_loss": 0.24611052870750427,
494
+ "train/contrastive_loss": 3.2940289974212646,
495
+ "train/negative_loss": 2.822859525680542,
496
+ "train/num_negatives": 48,
497
+ "train/num_positives": 8,
498
+ "train/positive_loss": 0.47116944193840027,
499
+ "train/total_loss": 0.9049162864685059
500
+ },
501
+ {
502
+ "epoch": 20.0,
503
+ "step": 400,
504
+ "train/classification_loss": 0.26658472418785095,
505
+ "train/contrastive_loss": 0.9518164992332458,
506
+ "train/negative_loss": 0.90028977394104,
507
+ "train/num_negatives": 46,
508
+ "train/num_positives": 8,
509
+ "train/positive_loss": 0.051526736468076706,
510
+ "train/total_loss": 0.45694804191589355
511
+ },
512
+ {
513
+ "epoch": 20.0,
514
  "eval_exact_match_accuracy": 0.1,
515
  "eval_hamming_loss": 0.075,
516
+ "eval_loss": 0.7537041902542114,
517
  "eval_macro_f1": 0.0784313725490196,
518
  "eval_macro_precision": 0.1092436974789916,
519
  "eval_macro_recall": 0.06149732620320855,
520
  "eval_micro_f1": 0.2153846153846154,
521
  "eval_micro_precision": 0.875,
522
  "eval_micro_recall": 0.12280701754385964,
523
+ "eval_runtime": 0.2075,
524
+ "eval_samples_per_second": 192.749,
525
+ "eval_steps_per_second": 24.094,
526
+ "step": 400
527
+ },
528
+ {
529
+ "epoch": 20.0,
530
+ "step": 400,
531
+ "train/classification_loss": 0.24341967701911926,
532
+ "train/contrastive_loss": 0.7751690149307251,
533
+ "train/negative_loss": 0.517412543296814,
534
+ "train/num_negatives": 42,
535
+ "train/num_positives": 10,
536
+ "train/positive_loss": 0.25775647163391113,
537
+ "train/total_loss": 0.3984534740447998
538
+ },
539
+ {
540
+ "epoch": 20.0,
541
+ "step": 400,
542
+ "train/classification_loss": 0.2490834891796112,
543
+ "train/contrastive_loss": 0.44699349999427795,
544
+ "train/negative_loss": 0.4466739594936371,
545
+ "train/num_negatives": 52,
546
+ "train/num_positives": 4,
547
+ "train/positive_loss": 0.0003195433528162539,
548
+ "train/total_loss": 0.33848220109939575
549
+ },
550
+ {
551
+ "epoch": 22.5,
552
+ "grad_norm": 3.832901954650879,
553
+ "learning_rate": 1.2266666666666667e-05,
554
+ "loss": 0.6929,
555
+ "step": 450
556
+ },
557
+ {
558
+ "epoch": 22.5,
559
+ "step": 450,
560
+ "train/classification_loss": 0.23125219345092773,
561
+ "train/contrastive_loss": 0.7171761393547058,
562
+ "train/negative_loss": 0.7147800922393799,
563
+ "train/num_negatives": 50,
564
+ "train/num_positives": 4,
565
+ "train/positive_loss": 0.0023960734251886606,
566
+ "train/total_loss": 0.37468743324279785
567
+ },
568
+ {
569
+ "epoch": 22.5,
570
+ "step": 450,
571
+ "train/classification_loss": 0.20611771941184998,
572
+ "train/contrastive_loss": 0.7421404123306274,
573
+ "train/negative_loss": 0.4822021722793579,
574
+ "train/num_negatives": 50,
575
+ "train/num_positives": 6,
576
+ "train/positive_loss": 0.2599382698535919,
577
+ "train/total_loss": 0.35454580187797546
578
  },
579
  {
580
  "epoch": 25.0,
581
+ "grad_norm": 11.741353988647461,
582
+ "learning_rate": 1.1155555555555556e-05,
583
+ "loss": 0.7074,
584
+ "step": 500
 
 
 
 
585
  },
586
  {
587
  "epoch": 25.0,
588
+ "step": 500,
589
+ "train/classification_loss": 0.1859707236289978,
590
+ "train/contrastive_loss": 2.448401689529419,
591
+ "train/negative_loss": 1.6338316202163696,
592
+ "train/num_negatives": 44,
593
+ "train/num_positives": 12,
594
+ "train/positive_loss": 0.8145700693130493,
595
+ "train/total_loss": 0.6756510734558105
596
+ },
597
+ {
598
+ "epoch": 25.0,
599
+ "step": 500,
600
+ "train/classification_loss": 0.2645534574985504,
601
+ "train/contrastive_loss": 2.718876838684082,
602
+ "train/negative_loss": 2.718696355819702,
603
+ "train/num_negatives": 44,
604
+ "train/num_positives": 12,
605
+ "train/positive_loss": 0.00018060117145068944,
606
+ "train/total_loss": 0.8083288669586182
607
+ },
608
+ {
609
+ "epoch": 25.0,
610
+ "step": 500,
611
+ "train/classification_loss": 0.244839608669281,
612
+ "train/contrastive_loss": 4.527173042297363,
613
+ "train/negative_loss": 2.9302120208740234,
614
+ "train/num_negatives": 52,
615
+ "train/num_positives": 4,
616
+ "train/positive_loss": 1.5969611406326294,
617
+ "train/total_loss": 1.1502742767333984
618
+ },
619
+ {
620
+ "epoch": 25.0,
621
+ "step": 500,
622
+ "train/classification_loss": 0.22313973307609558,
623
+ "train/contrastive_loss": 3.153029203414917,
624
+ "train/negative_loss": 2.193068504333496,
625
+ "train/num_negatives": 48,
626
+ "train/num_positives": 8,
627
+ "train/positive_loss": 0.9599607586860657,
628
+ "train/total_loss": 0.8537455797195435
629
+ },
630
+ {
631
+ "epoch": 25.0,
632
+ "step": 500,
633
+ "train/classification_loss": 0.25405386090278625,
634
+ "train/contrastive_loss": 1.7682042121887207,
635
+ "train/negative_loss": 0.7665292620658875,
636
+ "train/num_negatives": 46,
637
+ "train/num_positives": 8,
638
+ "train/positive_loss": 1.001675009727478,
639
+ "train/total_loss": 0.6076947450637817
640
+ },
641
+ {
642
+ "epoch": 25.0,
643
+ "eval_exact_match_accuracy": 0.175,
644
+ "eval_hamming_loss": 0.07205882352941176,
645
+ "eval_loss": 0.8191388845443726,
646
+ "eval_macro_f1": 0.10560224089635854,
647
+ "eval_macro_precision": 0.14869281045751634,
648
+ "eval_macro_recall": 0.08449197860962566,
649
+ "eval_micro_f1": 0.30985915492957744,
650
+ "eval_micro_precision": 0.7857142857142857,
651
+ "eval_micro_recall": 0.19298245614035087,
652
+ "eval_runtime": 0.2062,
653
+ "eval_samples_per_second": 193.983,
654
+ "eval_steps_per_second": 24.248,
655
+ "step": 500
656
+ },
657
+ {
658
+ "epoch": 25.0,
659
+ "step": 500,
660
+ "train/classification_loss": 0.16842614114284515,
661
+ "train/contrastive_loss": 0.2924913763999939,
662
+ "train/negative_loss": 0.28081196546554565,
663
+ "train/num_negatives": 42,
664
+ "train/num_positives": 14,
665
+ "train/positive_loss": 0.011679417453706264,
666
+ "train/total_loss": 0.22692441940307617
667
+ },
668
+ {
669
+ "epoch": 25.0,
670
+ "step": 500,
671
+ "train/classification_loss": 0.24865391850471497,
672
+ "train/contrastive_loss": 1.3489311933517456,
673
+ "train/negative_loss": 0.7319411635398865,
674
+ "train/num_negatives": 44,
675
+ "train/num_positives": 12,
676
+ "train/positive_loss": 0.6169900298118591,
677
+ "train/total_loss": 0.5184401273727417
678
+ },
679
+ {
680
+ "epoch": 27.5,
681
+ "grad_norm": 6.354713439941406,
682
+ "learning_rate": 1.0044444444444446e-05,
683
+ "loss": 0.6366,
684
+ "step": 550
685
+ },
686
+ {
687
+ "epoch": 27.5,
688
+ "step": 550,
689
+ "train/classification_loss": 0.23189660906791687,
690
+ "train/contrastive_loss": 0.8256056308746338,
691
+ "train/negative_loss": 0.8255752325057983,
692
+ "train/num_negatives": 54,
693
+ "train/num_positives": 2,
694
+ "train/positive_loss": 3.039883085875772e-05,
695
+ "train/total_loss": 0.3970177173614502
696
+ },
697
+ {
698
+ "epoch": 27.5,
699
+ "step": 550,
700
+ "train/classification_loss": 0.2541985809803009,
701
+ "train/contrastive_loss": 1.3727329969406128,
702
+ "train/negative_loss": 1.0316259860992432,
703
+ "train/num_negatives": 40,
704
+ "train/num_positives": 12,
705
+ "train/positive_loss": 0.34110698103904724,
706
+ "train/total_loss": 0.528745174407959
707
  },
708
  {
709
  "epoch": 30.0,
710
+ "grad_norm": 12.662666320800781,
711
+ "learning_rate": 8.933333333333333e-06,
712
+ "loss": 0.6281,
713
+ "step": 600
714
  },
715
  {
716
  "epoch": 30.0,
717
+ "step": 600,
718
+ "train/classification_loss": 0.17969225347042084,
719
+ "train/contrastive_loss": 2.1998844146728516,
720
+ "train/negative_loss": 1.262956976890564,
721
+ "train/num_negatives": 44,
722
+ "train/num_positives": 12,
723
+ "train/positive_loss": 0.9369274973869324,
724
+ "train/total_loss": 0.6196691393852234
725
  },
726
  {
727
  "epoch": 30.0,
728
+ "step": 600,
729
+ "train/classification_loss": 0.2561105787754059,
730
+ "train/contrastive_loss": 3.2061843872070312,
731
+ "train/negative_loss": 3.2013988494873047,
732
+ "train/num_negatives": 44,
733
+ "train/num_positives": 12,
734
+ "train/positive_loss": 0.00478551909327507,
735
+ "train/total_loss": 0.8973474502563477
736
+ },
737
+ {
738
+ "epoch": 30.0,
739
+ "step": 600,
740
+ "train/classification_loss": 0.24141749739646912,
741
+ "train/contrastive_loss": 4.734986305236816,
742
+ "train/negative_loss": 3.0736501216888428,
743
+ "train/num_negatives": 52,
744
+ "train/num_positives": 4,
745
+ "train/positive_loss": 1.6613364219665527,
746
+ "train/total_loss": 1.1884148120880127
747
+ },
748
+ {
749
+ "epoch": 30.0,
750
+ "step": 600,
751
+ "train/classification_loss": 0.22462235391139984,
752
+ "train/contrastive_loss": 4.2080254554748535,
753
+ "train/negative_loss": 3.6183528900146484,
754
+ "train/num_negatives": 48,
755
+ "train/num_positives": 8,
756
+ "train/positive_loss": 0.5896727442741394,
757
+ "train/total_loss": 1.0662274360656738
758
+ },
759
+ {
760
+ "epoch": 30.0,
761
+ "step": 600,
762
+ "train/classification_loss": 0.24234618246555328,
763
+ "train/contrastive_loss": 1.1968506574630737,
764
+ "train/negative_loss": 0.8532204031944275,
765
+ "train/num_negatives": 46,
766
+ "train/num_positives": 8,
767
+ "train/positive_loss": 0.34363028407096863,
768
+ "train/total_loss": 0.4817163348197937
769
+ },
770
+ {
771
+ "epoch": 30.0,
772
+ "eval_exact_match_accuracy": 0.275,
773
+ "eval_hamming_loss": 0.06470588235294118,
774
+ "eval_loss": 0.8506749868392944,
775
+ "eval_macro_f1": 0.12978524743230624,
776
+ "eval_macro_precision": 0.15735294117647058,
777
+ "eval_macro_recall": 0.11336898395721925,
778
+ "eval_micro_f1": 0.42105263157894735,
779
+ "eval_micro_precision": 0.8421052631578947,
780
+ "eval_micro_recall": 0.2807017543859649,
781
+ "eval_runtime": 0.2062,
782
+ "eval_samples_per_second": 193.977,
783
+ "eval_steps_per_second": 24.247,
784
+ "step": 600
785
+ },
786
+ {
787
+ "epoch": 30.0,
788
+ "step": 600,
789
+ "train/classification_loss": 0.16461151838302612,
790
+ "train/contrastive_loss": 0.5128712058067322,
791
+ "train/negative_loss": 0.23724108934402466,
792
+ "train/num_negatives": 38,
793
+ "train/num_positives": 18,
794
+ "train/positive_loss": 0.2756301164627075,
795
+ "train/total_loss": 0.2671857476234436
796
+ },
797
+ {
798
+ "epoch": 30.0,
799
+ "step": 600,
800
+ "train/classification_loss": 0.2038976103067398,
801
+ "train/contrastive_loss": 1.0636850595474243,
802
+ "train/negative_loss": 0.5897871255874634,
803
+ "train/num_negatives": 48,
804
+ "train/num_positives": 8,
805
+ "train/positive_loss": 0.47389790415763855,
806
+ "train/total_loss": 0.41663461923599243
807
+ },
808
+ {
809
+ "epoch": 32.5,
810
+ "grad_norm": 11.408817291259766,
811
+ "learning_rate": 7.822222222222224e-06,
812
+ "loss": 0.5854,
813
+ "step": 650
814
+ },
815
+ {
816
+ "epoch": 32.5,
817
+ "step": 650,
818
+ "train/classification_loss": 0.1786508709192276,
819
+ "train/contrastive_loss": 0.5145746469497681,
820
+ "train/negative_loss": 0.14455223083496094,
821
+ "train/num_negatives": 36,
822
+ "train/num_positives": 18,
823
+ "train/positive_loss": 0.3700224459171295,
824
+ "train/total_loss": 0.28156578540802
825
+ },
826
+ {
827
+ "epoch": 32.5,
828
+ "step": 650,
829
+ "train/classification_loss": 0.2088967263698578,
830
+ "train/contrastive_loss": 1.686006784439087,
831
+ "train/negative_loss": 1.5359704494476318,
832
+ "train/num_negatives": 50,
833
+ "train/num_positives": 6,
834
+ "train/positive_loss": 0.1500363051891327,
835
+ "train/total_loss": 0.5460981130599976
836
  },
837
  {
838
  "epoch": 35.0,
839
+ "grad_norm": 5.847558975219727,
840
+ "learning_rate": 6.711111111111111e-06,
841
+ "loss": 0.5506,
842
+ "step": 700
843
  },
844
  {
845
  "epoch": 35.0,
846
+ "step": 700,
847
+ "train/classification_loss": 0.17107558250427246,
848
+ "train/contrastive_loss": 2.287914276123047,
849
+ "train/negative_loss": 1.0503000020980835,
850
+ "train/num_negatives": 44,
851
+ "train/num_positives": 12,
852
+ "train/positive_loss": 1.237614393234253,
853
+ "train/total_loss": 0.6286584138870239
854
  },
855
  {
856
  "epoch": 35.0,
857
+ "step": 700,
858
+ "train/classification_loss": 0.2500559389591217,
859
+ "train/contrastive_loss": 2.9361915588378906,
860
+ "train/negative_loss": 2.824162721633911,
861
+ "train/num_negatives": 44,
862
+ "train/num_positives": 12,
863
+ "train/positive_loss": 0.1120288297533989,
864
+ "train/total_loss": 0.8372942209243774
865
+ },
866
+ {
867
+ "epoch": 35.0,
868
+ "step": 700,
869
+ "train/classification_loss": 0.23473431169986725,
870
+ "train/contrastive_loss": 4.627296447753906,
871
+ "train/negative_loss": 3.135880708694458,
872
+ "train/num_negatives": 52,
873
+ "train/num_positives": 4,
874
+ "train/positive_loss": 1.4914155006408691,
875
+ "train/total_loss": 1.160193681716919
876
+ },
877
+ {
878
+ "epoch": 35.0,
879
+ "step": 700,
880
+ "train/classification_loss": 0.20871424674987793,
881
+ "train/contrastive_loss": 2.308566093444824,
882
+ "train/negative_loss": 1.4599714279174805,
883
+ "train/num_negatives": 48,
884
+ "train/num_positives": 8,
885
+ "train/positive_loss": 0.8485947251319885,
886
+ "train/total_loss": 0.6704274415969849
887
+ },
888
+ {
889
+ "epoch": 35.0,
890
+ "step": 700,
891
+ "train/classification_loss": 0.23494853079319,
892
+ "train/contrastive_loss": 0.9399895071983337,
893
+ "train/negative_loss": 0.4285624623298645,
894
+ "train/num_negatives": 46,
895
+ "train/num_positives": 8,
896
+ "train/positive_loss": 0.5114270448684692,
897
+ "train/total_loss": 0.4229464530944824
898
+ },
899
+ {
900
+ "epoch": 35.0,
901
+ "eval_exact_match_accuracy": 0.25,
902
+ "eval_hamming_loss": 0.0661764705882353,
903
+ "eval_loss": 0.7439039945602417,
904
+ "eval_macro_f1": 0.12555610479485912,
905
+ "eval_macro_precision": 0.15630252100840336,
906
+ "eval_macro_recall": 0.10748663101604278,
907
+ "eval_micro_f1": 0.4,
908
+ "eval_micro_precision": 0.8333333333333334,
909
+ "eval_micro_recall": 0.2631578947368421,
910
+ "eval_runtime": 0.2043,
911
+ "eval_samples_per_second": 195.76,
912
+ "eval_steps_per_second": 24.47,
913
+ "step": 700
914
+ },
915
+ {
916
+ "epoch": 35.0,
917
+ "step": 700,
918
+ "train/classification_loss": 0.18580235540866852,
919
+ "train/contrastive_loss": 0.5447431802749634,
920
+ "train/negative_loss": 0.26964136958122253,
921
+ "train/num_negatives": 40,
922
+ "train/num_positives": 16,
923
+ "train/positive_loss": 0.27510178089141846,
924
+ "train/total_loss": 0.29475098848342896
925
+ },
926
+ {
927
+ "epoch": 35.0,
928
+ "step": 700,
929
+ "train/classification_loss": 0.19202794134616852,
930
+ "train/contrastive_loss": 1.1511749029159546,
931
+ "train/negative_loss": 0.8905836343765259,
932
+ "train/num_negatives": 40,
933
+ "train/num_positives": 16,
934
+ "train/positive_loss": 0.2605912685394287,
935
+ "train/total_loss": 0.42226290702819824
936
+ },
937
+ {
938
+ "epoch": 37.5,
939
+ "grad_norm": 5.041801452636719,
940
+ "learning_rate": 5.600000000000001e-06,
941
+ "loss": 0.5486,
942
+ "step": 750
943
+ },
944
+ {
945
+ "epoch": 37.5,
946
+ "step": 750,
947
+ "train/classification_loss": 0.18688128888607025,
948
+ "train/contrastive_loss": 0.7704952955245972,
949
+ "train/negative_loss": 0.7704557180404663,
950
+ "train/num_negatives": 48,
951
+ "train/num_positives": 8,
952
+ "train/positive_loss": 3.9578346331836656e-05,
953
+ "train/total_loss": 0.3409803509712219
954
+ },
955
+ {
956
+ "epoch": 37.5,
957
+ "step": 750,
958
+ "train/classification_loss": 0.19349302351474762,
959
+ "train/contrastive_loss": 0.4310402274131775,
960
+ "train/negative_loss": 0.4191313683986664,
961
+ "train/num_negatives": 46,
962
+ "train/num_positives": 10,
963
+ "train/positive_loss": 0.011908866465091705,
964
+ "train/total_loss": 0.2797010540962219
965
  },
966
  {
967
  "epoch": 40.0,
968
+ "grad_norm": 8.75462532043457,
969
+ "learning_rate": 4.488888888888889e-06,
970
+ "loss": 0.5091,
971
+ "step": 800
972
  },
973
  {
974
  "epoch": 40.0,
975
+ "step": 800,
976
+ "train/classification_loss": 0.1726197898387909,
977
+ "train/contrastive_loss": 1.8750531673431396,
978
+ "train/negative_loss": 0.9292571544647217,
979
+ "train/num_negatives": 44,
980
+ "train/num_positives": 12,
981
+ "train/positive_loss": 0.9457959532737732,
982
+ "train/total_loss": 0.5476304292678833
983
  },
984
  {
985
  "epoch": 40.0,
986
+ "step": 800,
987
+ "train/classification_loss": 0.24574041366577148,
988
+ "train/contrastive_loss": 3.074068784713745,
989
+ "train/negative_loss": 3.055785894393921,
990
+ "train/num_negatives": 44,
991
+ "train/num_positives": 12,
992
+ "train/positive_loss": 0.018282821401953697,
993
+ "train/total_loss": 0.8605541586875916
994
+ },
995
+ {
996
+ "epoch": 40.0,
997
+ "step": 800,
998
+ "train/classification_loss": 0.2323407232761383,
999
+ "train/contrastive_loss": 4.7350592613220215,
1000
+ "train/negative_loss": 2.995204210281372,
1001
+ "train/num_negatives": 52,
1002
+ "train/num_positives": 4,
1003
+ "train/positive_loss": 1.739855170249939,
1004
+ "train/total_loss": 1.1793526411056519
1005
+ },
1006
+ {
1007
+ "epoch": 40.0,
1008
+ "step": 800,
1009
+ "train/classification_loss": 0.20833879709243774,
1010
+ "train/contrastive_loss": 3.4109816551208496,
1011
+ "train/negative_loss": 2.4856531620025635,
1012
+ "train/num_negatives": 48,
1013
+ "train/num_positives": 8,
1014
+ "train/positive_loss": 0.9253284931182861,
1015
+ "train/total_loss": 0.8905351161956787
1016
+ },
1017
+ {
1018
+ "epoch": 40.0,
1019
+ "step": 800,
1020
+ "train/classification_loss": 0.23090128600597382,
1021
+ "train/contrastive_loss": 1.3861477375030518,
1022
+ "train/negative_loss": 0.5846720933914185,
1023
+ "train/num_negatives": 46,
1024
+ "train/num_positives": 8,
1025
+ "train/positive_loss": 0.8014755845069885,
1026
+ "train/total_loss": 0.5081308484077454
1027
+ },
1028
+ {
1029
+ "epoch": 40.0,
1030
+ "eval_exact_match_accuracy": 0.275,
1031
+ "eval_hamming_loss": 0.06470588235294118,
1032
+ "eval_loss": 0.7972406148910522,
1033
+ "eval_macro_f1": 0.12978524743230624,
1034
+ "eval_macro_precision": 0.15735294117647058,
1035
+ "eval_macro_recall": 0.11336898395721925,
1036
+ "eval_micro_f1": 0.42105263157894735,
1037
+ "eval_micro_precision": 0.8421052631578947,
1038
+ "eval_micro_recall": 0.2807017543859649,
1039
+ "eval_runtime": 0.2048,
1040
+ "eval_samples_per_second": 195.328,
1041
+ "eval_steps_per_second": 24.416,
1042
+ "step": 800
1043
+ },
1044
+ {
1045
+ "epoch": 40.0,
1046
+ "step": 800,
1047
+ "train/classification_loss": 0.16540196537971497,
1048
+ "train/contrastive_loss": 0.28263404965400696,
1049
+ "train/negative_loss": 0.28260505199432373,
1050
+ "train/num_negatives": 44,
1051
+ "train/num_positives": 12,
1052
+ "train/positive_loss": 2.9008200726821087e-05,
1053
+ "train/total_loss": 0.22192877531051636
1054
+ },
1055
+ {
1056
+ "epoch": 40.0,
1057
+ "step": 800,
1058
+ "train/classification_loss": 0.12730641663074493,
1059
+ "train/contrastive_loss": 0.2957398295402527,
1060
+ "train/negative_loss": 0.29565563797950745,
1061
+ "train/num_negatives": 30,
1062
+ "train/num_positives": 26,
1063
+ "train/positive_loss": 8.41914297780022e-05,
1064
+ "train/total_loss": 0.1864543855190277
1065
+ },
1066
+ {
1067
+ "epoch": 42.5,
1068
+ "grad_norm": 6.981760501861572,
1069
+ "learning_rate": 3.377777777777778e-06,
1070
+ "loss": 0.4948,
1071
+ "step": 850
1072
+ },
1073
+ {
1074
+ "epoch": 42.5,
1075
+ "step": 850,
1076
+ "train/classification_loss": 0.14926917850971222,
1077
+ "train/contrastive_loss": 0.2675209641456604,
1078
+ "train/negative_loss": 0.22713389992713928,
1079
+ "train/num_negatives": 40,
1080
+ "train/num_positives": 16,
1081
+ "train/positive_loss": 0.04038705304265022,
1082
+ "train/total_loss": 0.20277337729930878
1083
+ },
1084
+ {
1085
+ "epoch": 42.5,
1086
+ "step": 850,
1087
+ "train/classification_loss": 0.17404112219810486,
1088
+ "train/contrastive_loss": 0.08631884306669235,
1089
+ "train/negative_loss": 0.08508215099573135,
1090
+ "train/num_negatives": 40,
1091
+ "train/num_positives": 16,
1092
+ "train/positive_loss": 0.0012366925366222858,
1093
+ "train/total_loss": 0.19130489230155945
1094
  },
1095
  {
1096
  "epoch": 45.0,
1097
+ "grad_norm": 3.230358839035034,
1098
+ "learning_rate": 2.266666666666667e-06,
1099
+ "loss": 0.5038,
1100
+ "step": 900
1101
  },
1102
  {
1103
  "epoch": 45.0,
1104
+ "step": 900,
1105
+ "train/classification_loss": 0.17026303708553314,
1106
+ "train/contrastive_loss": 2.131587266921997,
1107
+ "train/negative_loss": 0.9242226481437683,
1108
+ "train/num_negatives": 44,
1109
+ "train/num_positives": 12,
1110
+ "train/positive_loss": 1.207364559173584,
1111
+ "train/total_loss": 0.5965805053710938
1112
  },
1113
  {
1114
  "epoch": 45.0,
1115
+ "step": 900,
1116
+ "train/classification_loss": 0.24070139229297638,
1117
+ "train/contrastive_loss": 3.244175910949707,
1118
+ "train/negative_loss": 3.2388288974761963,
1119
+ "train/num_negatives": 44,
1120
+ "train/num_positives": 12,
1121
+ "train/positive_loss": 0.005346930585801601,
1122
+ "train/total_loss": 0.8895365595817566
 
 
 
 
 
 
 
1123
  },
1124
  {
1125
+ "epoch": 45.0,
1126
+ "step": 900,
1127
+ "train/classification_loss": 0.2271755486726761,
1128
+ "train/contrastive_loss": 4.544618129730225,
1129
+ "train/negative_loss": 2.900235891342163,
1130
+ "train/num_negatives": 52,
1131
+ "train/num_positives": 4,
1132
+ "train/positive_loss": 1.6443822383880615,
1133
+ "train/total_loss": 1.1360992193222046
1134
  },
1135
  {
1136
+ "epoch": 45.0,
1137
+ "step": 900,
1138
+ "train/classification_loss": 0.20477482676506042,
1139
+ "train/contrastive_loss": 3.36401104927063,
1140
+ "train/negative_loss": 2.652341365814209,
1141
+ "train/num_negatives": 48,
1142
+ "train/num_positives": 8,
1143
+ "train/positive_loss": 0.7116697430610657,
1144
+ "train/total_loss": 0.8775770664215088
1145
  },
1146
  {
1147
+ "epoch": 45.0,
1148
+ "step": 900,
1149
+ "train/classification_loss": 0.22650422155857086,
1150
+ "train/contrastive_loss": 1.7576167583465576,
1151
+ "train/negative_loss": 0.546459436416626,
1152
  "train/num_negatives": 46,
1153
  "train/num_positives": 8,
1154
+ "train/positive_loss": 1.2111573219299316,
1155
+ "train/total_loss": 0.578027606010437
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1156
  },
1157
  {
1158
+ "epoch": 45.0,
1159
+ "eval_exact_match_accuracy": 0.275,
1160
+ "eval_hamming_loss": 0.06470588235294118,
1161
+ "eval_loss": 0.8155641555786133,
1162
+ "eval_macro_f1": 0.12978524743230624,
1163
+ "eval_macro_precision": 0.15735294117647058,
1164
+ "eval_macro_recall": 0.11336898395721925,
1165
+ "eval_micro_f1": 0.42105263157894735,
1166
+ "eval_micro_precision": 0.8421052631578947,
1167
+ "eval_micro_recall": 0.2807017543859649,
1168
+ "eval_runtime": 0.2057,
1169
+ "eval_samples_per_second": 194.486,
1170
+ "eval_steps_per_second": 24.311,
1171
+ "step": 900
1172
+ },
1173
+ {
1174
+ "epoch": 45.0,
1175
+ "step": 900,
1176
  "total_flos": 0.0,
1177
+ "train_loss": 0.9705644819471572,
1178
+ "train_runtime": 232.6541,
1179
+ "train_samples_per_second": 67.912,
1180
+ "train_steps_per_second": 4.298
1181
  }
1182
  ],
1183
  "logging_steps": 50,
1184
+ "max_steps": 1000,
1185
  "num_input_tokens_seen": 0,
1186
  "num_train_epochs": 50,
1187
+ "save_steps": 50000,
1188
  "stateful_callbacks": {
1189
  "EarlyStoppingCallback": {
1190
  "args": {
 
1200
  "should_epoch_stop": false,
1201
  "should_evaluate": false,
1202
  "should_log": false,
1203
+ "should_save": false,
1204
+ "should_training_stop": false
1205
  },
1206
  "attributes": {}
1207
  }
1208
  },
1209
  "total_flos": 0.0,
1210
+ "train_batch_size": 8,
1211
  "trial_name": null,
1212
  "trial_params": null
1213
  }