wxnfifth committed on
Commit
1beb838
·
verified ·
1 Parent(s): c8752f1

Model save

Browse files
README.md CHANGED
@@ -27,7 +27,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/wxnfifth/huggingface/runs/pu6294f1)
31
 
32
 
33
  This model was trained with SFT.
@@ -35,7 +35,7 @@ This model was trained with SFT.
35
  ### Framework versions
36
 
37
  - TRL: 0.15.0.dev0
38
- - Transformers: 4.49.0.dev0
39
  - Pytorch: 2.5.1
40
  - Datasets: 3.2.0
41
  - Tokenizers: 0.21.0
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/wxnfifth/huggingface/runs/j55yx0w5)
31
 
32
 
33
  This model was trained with SFT.
 
35
  ### Framework versions
36
 
37
  - TRL: 0.15.0.dev0
38
+ - Transformers: 4.48.2
39
  - Pytorch: 2.5.1
40
  - Datasets: 3.2.0
41
  - Tokenizers: 0.21.0
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.997779422649889,
3
  "total_flos": 76745898196992.0,
4
- "train_loss": 0.7814551896086787,
5
- "train_runtime": 4577.2961,
6
  "train_samples": 16610,
7
- "train_samples_per_second": 4.721,
8
- "train_steps_per_second": 0.074
9
  }
 
1
  {
2
  "epoch": 0.997779422649889,
3
  "total_flos": 76745898196992.0,
4
+ "train_loss": 0.7870922902217604,
5
+ "train_runtime": 4743.2538,
6
  "train_samples": 16610,
7
+ "train_samples_per_second": 4.556,
8
+ "train_steps_per_second": 0.071
9
  }
config.json CHANGED
@@ -22,7 +22,7 @@
22
  "sliding_window": null,
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
- "transformers_version": "4.49.0.dev0",
26
  "use_cache": false,
27
  "use_sliding_window": false,
28
  "vocab_size": 151936
 
22
  "sliding_window": null,
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.48.2",
26
  "use_cache": false,
27
  "use_sliding_window": false,
28
  "vocab_size": 151936
generation_config.json CHANGED
@@ -10,5 +10,5 @@
10
  "temperature": 0.7,
11
  "top_k": 20,
12
  "top_p": 0.8,
13
- "transformers_version": "4.49.0.dev0"
14
  }
 
10
  "temperature": 0.7,
11
  "top_k": 20,
12
  "top_p": 0.8,
13
+ "transformers_version": "4.48.2"
14
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a72b2d5bb187809d5bb9e78e96673a081df0491bc4402af7547768018f99387f
3
  size 3087467144
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c30079c59ee74a761bb370b7482619c19b48ace8a8c63e17b6b0cadc3f47833
3
  size 3087467144
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.997779422649889,
3
  "total_flos": 76745898196992.0,
4
- "train_loss": 0.7814551896086787,
5
- "train_runtime": 4577.2961,
6
  "train_samples": 16610,
7
- "train_samples_per_second": 4.721,
8
- "train_steps_per_second": 0.074
9
  }
 
1
  {
2
  "epoch": 0.997779422649889,
3
  "total_flos": 76745898196992.0,
4
+ "train_loss": 0.7870922902217604,
5
+ "train_runtime": 4743.2538,
6
  "train_samples": 16610,
7
+ "train_samples_per_second": 4.556,
8
+ "train_steps_per_second": 0.071
9
  }
trainer_state.json CHANGED
@@ -10,505 +10,505 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.014803849000740192,
13
- "grad_norm": 2.6030564308166504,
14
  "learning_rate": 2.9411764705882355e-06,
15
  "loss": 1.09,
16
  "step": 5
17
  },
18
  {
19
  "epoch": 0.029607698001480384,
20
- "grad_norm": 1.507163405418396,
21
  "learning_rate": 5.882352941176471e-06,
22
- "loss": 1.0787,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.04441154700222058,
27
- "grad_norm": 1.2673176527023315,
28
  "learning_rate": 8.823529411764707e-06,
29
- "loss": 1.0147,
30
  "step": 15
31
  },
32
  {
33
  "epoch": 0.05921539600296077,
34
- "grad_norm": 0.8998919725418091,
35
  "learning_rate": 1.1764705882352942e-05,
36
- "loss": 0.9334,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.07401924500370097,
41
- "grad_norm": 0.7936584949493408,
42
  "learning_rate": 1.4705882352941179e-05,
43
- "loss": 0.9,
44
  "step": 25
45
  },
46
  {
47
  "epoch": 0.08882309400444116,
48
- "grad_norm": 0.6800016164779663,
49
  "learning_rate": 1.7647058823529414e-05,
50
- "loss": 0.8802,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.10362694300518134,
55
- "grad_norm": 0.6241341829299927,
56
  "learning_rate": 1.9999462497359468e-05,
57
- "loss": 0.8536,
58
  "step": 35
59
  },
60
  {
61
  "epoch": 0.11843079200592153,
62
- "grad_norm": 0.542241096496582,
63
  "learning_rate": 1.9980655971335944e-05,
64
- "loss": 0.8348,
65
  "step": 40
66
  },
67
  {
68
  "epoch": 0.13323464100666174,
69
- "grad_norm": 0.511304497718811,
70
  "learning_rate": 1.993503206718859e-05,
71
- "loss": 0.8136,
72
  "step": 45
73
  },
74
  {
75
  "epoch": 0.14803849000740193,
76
- "grad_norm": 0.6061602234840393,
77
  "learning_rate": 1.986271337340182e-05,
78
- "loss": 0.8197,
79
  "step": 50
80
  },
81
  {
82
  "epoch": 0.16284233900814213,
83
- "grad_norm": 0.5142273306846619,
84
  "learning_rate": 1.976389420563607e-05,
85
- "loss": 0.8027,
86
  "step": 55
87
  },
88
  {
89
  "epoch": 0.17764618800888232,
90
- "grad_norm": 0.494143009185791,
91
  "learning_rate": 1.9638840084614182e-05,
92
- "loss": 0.7893,
93
  "step": 60
94
  },
95
  {
96
  "epoch": 0.19245003700962252,
97
- "grad_norm": 0.5752012729644775,
98
  "learning_rate": 1.9487887022684336e-05,
99
- "loss": 0.7993,
100
  "step": 65
101
  },
102
  {
103
  "epoch": 0.20725388601036268,
104
- "grad_norm": 0.5229590535163879,
105
  "learning_rate": 1.9311440620976597e-05,
106
- "loss": 0.7927,
107
  "step": 70
108
  },
109
  {
110
  "epoch": 0.22205773501110287,
111
- "grad_norm": 0.5390455722808838,
112
  "learning_rate": 1.9109974979578852e-05,
113
- "loss": 0.7836,
114
  "step": 75
115
  },
116
  {
117
  "epoch": 0.23686158401184307,
118
- "grad_norm": 0.5431194305419922,
119
  "learning_rate": 1.8884031423660492e-05,
120
- "loss": 0.8122,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 0.25166543301258326,
125
- "grad_norm": 0.5343985557556152,
126
  "learning_rate": 1.8634217048966638e-05,
127
- "loss": 0.795,
128
  "step": 85
129
  },
130
  {
131
  "epoch": 0.2664692820133235,
132
- "grad_norm": 0.5408906936645508,
133
  "learning_rate": 1.836120309059107e-05,
134
- "loss": 0.7778,
135
  "step": 90
136
  },
137
  {
138
  "epoch": 0.28127313101406365,
139
- "grad_norm": 0.5261913537979126,
140
  "learning_rate": 1.8065723119410885e-05,
141
- "loss": 0.7756,
142
  "step": 95
143
  },
144
  {
145
  "epoch": 0.29607698001480387,
146
- "grad_norm": 0.5498437881469727,
147
  "learning_rate": 1.77485710710289e-05,
148
- "loss": 0.7824,
149
  "step": 100
150
  },
151
  {
152
  "epoch": 0.29607698001480387,
153
- "eval_loss": 0.7986361980438232,
154
- "eval_runtime": 5.6695,
155
- "eval_samples_per_second": 22.577,
156
- "eval_steps_per_second": 1.411,
157
  "step": 100
158
  },
159
  {
160
  "epoch": 0.31088082901554404,
161
- "grad_norm": 0.4975713789463043,
162
  "learning_rate": 1.741059911251997e-05,
163
- "loss": 0.7731,
164
  "step": 105
165
  },
166
  {
167
  "epoch": 0.32568467801628426,
168
- "grad_norm": 0.5265988707542419,
169
  "learning_rate": 1.7052715352713076e-05,
170
- "loss": 0.7674,
171
  "step": 110
172
  },
173
  {
174
  "epoch": 0.3404885270170244,
175
- "grad_norm": 0.5293793082237244,
176
  "learning_rate": 1.667588140216154e-05,
177
- "loss": 0.7943,
178
  "step": 115
179
  },
180
  {
181
  "epoch": 0.35529237601776464,
182
- "grad_norm": 0.5241897702217102,
183
  "learning_rate": 1.628110978935756e-05,
184
- "loss": 0.7687,
185
  "step": 120
186
  },
187
  {
188
  "epoch": 0.3700962250185048,
189
- "grad_norm": 0.537781834602356,
190
  "learning_rate": 1.586946124013354e-05,
191
- "loss": 0.7683,
192
  "step": 125
193
  },
194
  {
195
  "epoch": 0.38490007401924503,
196
- "grad_norm": 0.5170732736587524,
197
  "learning_rate": 1.5442041827560274e-05,
198
- "loss": 0.7449,
199
  "step": 130
200
  },
201
  {
202
  "epoch": 0.3997039230199852,
203
- "grad_norm": 0.5464586019515991,
204
  "learning_rate": 1.5000000000000002e-05,
205
- "loss": 0.7558,
206
  "step": 135
207
  },
208
  {
209
  "epoch": 0.41450777202072536,
210
- "grad_norm": 0.5261128544807434,
211
  "learning_rate": 1.4544523495299843e-05,
212
- "loss": 0.762,
213
  "step": 140
214
  },
215
  {
216
  "epoch": 0.4293116210214656,
217
- "grad_norm": 0.565876305103302,
218
  "learning_rate": 1.4076836149416889e-05,
219
- "loss": 0.778,
220
  "step": 145
221
  },
222
  {
223
  "epoch": 0.44411547002220575,
224
- "grad_norm": 0.5250110030174255,
225
  "learning_rate": 1.3598194608050011e-05,
226
- "loss": 0.7627,
227
  "step": 150
228
  },
229
  {
230
  "epoch": 0.45891931902294597,
231
- "grad_norm": 0.47735342383384705,
232
  "learning_rate": 1.3109884950114007e-05,
233
- "loss": 0.7521,
234
  "step": 155
235
  },
236
  {
237
  "epoch": 0.47372316802368614,
238
- "grad_norm": 0.4897947609424591,
239
  "learning_rate": 1.2613219232128608e-05,
240
- "loss": 0.752,
241
  "step": 160
242
  },
243
  {
244
  "epoch": 0.48852701702442636,
245
- "grad_norm": 0.49062302708625793,
246
  "learning_rate": 1.2109531962807333e-05,
247
- "loss": 0.7536,
248
  "step": 165
249
  },
250
  {
251
  "epoch": 0.5033308660251665,
252
- "grad_norm": 0.47276487946510315,
253
  "learning_rate": 1.1600176517318742e-05,
254
- "loss": 0.7581,
255
  "step": 170
256
  },
257
  {
258
  "epoch": 0.5181347150259067,
259
- "grad_norm": 0.5006165504455566,
260
  "learning_rate": 1.1086521500854746e-05,
261
- "loss": 0.7452,
262
  "step": 175
263
  },
264
  {
265
  "epoch": 0.532938564026647,
266
- "grad_norm": 0.49434661865234375,
267
  "learning_rate": 1.0569947071276847e-05,
268
- "loss": 0.766,
269
  "step": 180
270
  },
271
  {
272
  "epoch": 0.5477424130273871,
273
- "grad_norm": 0.5508667826652527,
274
  "learning_rate": 1.0051841230721065e-05,
275
- "loss": 0.759,
276
  "step": 185
277
  },
278
  {
279
  "epoch": 0.5625462620281273,
280
- "grad_norm": 0.49814572930336,
281
  "learning_rate": 9.533596096125826e-06,
282
- "loss": 0.7656,
283
  "step": 190
284
  },
285
  {
286
  "epoch": 0.5773501110288675,
287
- "grad_norm": 0.48042887449264526,
288
  "learning_rate": 9.016604158703654e-06,
289
- "loss": 0.7397,
290
  "step": 195
291
  },
292
  {
293
  "epoch": 0.5921539600296077,
294
- "grad_norm": 0.4838988482952118,
295
  "learning_rate": 8.502254542407186e-06,
296
- "loss": 0.7375,
297
  "step": 200
298
  },
299
  {
300
  "epoch": 0.5921539600296077,
301
- "eval_loss": 0.7734147310256958,
302
- "eval_runtime": 5.714,
303
- "eval_samples_per_second": 22.401,
304
- "eval_steps_per_second": 1.4,
305
  "step": 200
306
  },
307
  {
308
  "epoch": 0.6069578090303479,
309
- "grad_norm": 0.47851860523223877,
310
  "learning_rate": 7.991929271442817e-06,
311
- "loss": 0.7415,
312
  "step": 205
313
  },
314
  {
315
  "epoch": 0.6217616580310881,
316
- "grad_norm": 0.49189141392707825,
317
  "learning_rate": 7.48699955686089e-06,
318
- "loss": 0.7437,
319
  "step": 210
320
  },
321
  {
322
  "epoch": 0.6365655070318282,
323
- "grad_norm": 0.4642721116542816,
324
  "learning_rate": 6.988822112200157e-06,
325
- "loss": 0.7517,
326
  "step": 215
327
  },
328
  {
329
  "epoch": 0.6513693560325685,
330
- "grad_norm": 0.5027055144309998,
331
  "learning_rate": 6.498735508086094e-06,
332
- "loss": 0.755,
333
  "step": 220
334
  },
335
  {
336
  "epoch": 0.6661732050333087,
337
- "grad_norm": 0.44715380668640137,
338
  "learning_rate": 6.018056575578075e-06,
339
- "loss": 0.7487,
340
  "step": 225
341
  },
342
  {
343
  "epoch": 0.6809770540340488,
344
- "grad_norm": 0.41352778673171997,
345
  "learning_rate": 5.548076867929331e-06,
346
- "loss": 0.7455,
347
  "step": 230
348
  },
349
  {
350
  "epoch": 0.695780903034789,
351
- "grad_norm": 0.4372292757034302,
352
  "learning_rate": 5.090059190266779e-06,
353
- "loss": 0.7336,
354
  "step": 235
355
  },
356
  {
357
  "epoch": 0.7105847520355293,
358
- "grad_norm": 0.43178072571754456,
359
  "learning_rate": 4.645234206515171e-06,
360
- "loss": 0.7387,
361
  "step": 240
362
  },
363
  {
364
  "epoch": 0.7253886010362695,
365
- "grad_norm": 0.4522764980792999,
366
  "learning_rate": 4.214797132682597e-06,
367
- "loss": 0.7352,
368
  "step": 245
369
  },
370
  {
371
  "epoch": 0.7401924500370096,
372
- "grad_norm": 0.4542970359325409,
373
  "learning_rate": 3.799904525392251e-06,
374
- "loss": 0.7421,
375
  "step": 250
376
  },
377
  {
378
  "epoch": 0.7549962990377498,
379
- "grad_norm": 0.443574458360672,
380
  "learning_rate": 3.401671174289469e-06,
381
- "loss": 0.7323,
382
  "step": 255
383
  },
384
  {
385
  "epoch": 0.7698001480384901,
386
- "grad_norm": 0.4417092502117157,
387
  "learning_rate": 3.021167106673928e-06,
388
- "loss": 0.7483,
389
  "step": 260
390
  },
391
  {
392
  "epoch": 0.7846039970392302,
393
- "grad_norm": 0.435789555311203,
394
  "learning_rate": 2.6594147124053983e-06,
395
- "loss": 0.7371,
396
  "step": 265
397
  },
398
  {
399
  "epoch": 0.7994078460399704,
400
- "grad_norm": 0.4441350996494293,
401
  "learning_rate": 2.317385996808195e-06,
402
- "loss": 0.7488,
403
  "step": 270
404
  },
405
  {
406
  "epoch": 0.8142116950407106,
407
- "grad_norm": 0.41490909457206726,
408
  "learning_rate": 1.9959999689556407e-06,
409
- "loss": 0.7413,
410
  "step": 275
411
  },
412
  {
413
  "epoch": 0.8290155440414507,
414
- "grad_norm": 0.4214578866958618,
415
  "learning_rate": 1.6961201723520248e-06,
416
- "loss": 0.7272,
417
  "step": 280
418
  },
419
  {
420
  "epoch": 0.843819393042191,
421
- "grad_norm": 0.4023810923099518,
422
  "learning_rate": 1.4185523646469822e-06,
423
- "loss": 0.752,
424
  "step": 285
425
  },
426
  {
427
  "epoch": 0.8586232420429312,
428
- "grad_norm": 0.4226447641849518,
429
  "learning_rate": 1.1640423526166987e-06,
430
- "loss": 0.7298,
431
  "step": 290
432
  },
433
  {
434
  "epoch": 0.8734270910436713,
435
- "grad_norm": 0.405353307723999,
436
  "learning_rate": 9.332739882292752e-07,
437
- "loss": 0.7559,
438
  "step": 295
439
  },
440
  {
441
  "epoch": 0.8882309400444115,
442
- "grad_norm": 0.4032580256462097,
443
  "learning_rate": 7.268673311786378e-07,
444
- "loss": 0.7455,
445
  "step": 300
446
  },
447
  {
448
  "epoch": 0.8882309400444115,
449
- "eval_loss": 0.7648502588272095,
450
- "eval_runtime": 5.5578,
451
- "eval_samples_per_second": 23.031,
452
- "eval_steps_per_second": 1.439,
453
  "step": 300
454
  },
455
  {
456
  "epoch": 0.9030347890451518,
457
- "grad_norm": 0.3890097737312317,
458
  "learning_rate": 5.453769828241872e-07,
459
- "loss": 0.7294,
460
  "step": 305
461
  },
462
  {
463
  "epoch": 0.9178386380458919,
464
- "grad_norm": 0.4142417311668396,
465
  "learning_rate": 3.8929059601275463e-07,
466
- "loss": 0.7617,
467
  "step": 310
468
  },
469
  {
470
  "epoch": 0.9326424870466321,
471
- "grad_norm": 0.3959888219833374,
472
  "learning_rate": 2.5902756478688674e-07,
473
- "loss": 0.7442,
474
  "step": 315
475
  },
476
  {
477
  "epoch": 0.9474463360473723,
478
- "grad_norm": 0.39996781945228577,
479
  "learning_rate": 1.5493789750014032e-07,
480
- "loss": 0.7459,
481
  "step": 320
482
  },
483
  {
484
  "epoch": 0.9622501850481125,
485
- "grad_norm": 0.42691072821617126,
486
  "learning_rate": 7.730127636723539e-08,
487
- "loss": 0.7266,
488
  "step": 325
489
  },
490
  {
491
  "epoch": 0.9770540340488527,
492
- "grad_norm": 0.4197230637073517,
493
  "learning_rate": 2.6326305976001054e-08,
494
- "loss": 0.7312,
495
  "step": 330
496
  },
497
  {
498
  "epoch": 0.9918578830495929,
499
- "grad_norm": 0.44191622734069824,
500
  "learning_rate": 2.149952780321485e-09,
501
- "loss": 0.7525,
502
  "step": 335
503
  },
504
  {
505
  "epoch": 0.997779422649889,
506
  "step": 337,
507
  "total_flos": 76745898196992.0,
508
- "train_loss": 0.7814551896086787,
509
- "train_runtime": 4577.2961,
510
- "train_samples_per_second": 4.721,
511
- "train_steps_per_second": 0.074
512
  }
513
  ],
514
  "logging_steps": 5,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.014803849000740192,
13
+ "grad_norm": 0.6502017974853516,
14
  "learning_rate": 2.9411764705882355e-06,
15
  "loss": 1.09,
16
  "step": 5
17
  },
18
  {
19
  "epoch": 0.029607698001480384,
20
+ "grad_norm": 0.3823283016681671,
21
  "learning_rate": 5.882352941176471e-06,
22
+ "loss": 1.0792,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.04441154700222058,
27
+ "grad_norm": 0.3939237594604492,
28
  "learning_rate": 8.823529411764707e-06,
29
+ "loss": 1.0223,
30
  "step": 15
31
  },
32
  {
33
  "epoch": 0.05921539600296077,
34
+ "grad_norm": 0.2809950113296509,
35
  "learning_rate": 1.1764705882352942e-05,
36
+ "loss": 0.9451,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.07401924500370097,
41
+ "grad_norm": 0.22726929187774658,
42
  "learning_rate": 1.4705882352941179e-05,
43
+ "loss": 0.9125,
44
  "step": 25
45
  },
46
  {
47
  "epoch": 0.08882309400444116,
48
+ "grad_norm": 0.17815199494361877,
49
  "learning_rate": 1.7647058823529414e-05,
50
+ "loss": 0.893,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.10362694300518134,
55
+ "grad_norm": 0.1736987829208374,
56
  "learning_rate": 1.9999462497359468e-05,
57
+ "loss": 0.8651,
58
  "step": 35
59
  },
60
  {
61
  "epoch": 0.11843079200592153,
62
+ "grad_norm": 0.14923641085624695,
63
  "learning_rate": 1.9980655971335944e-05,
64
+ "loss": 0.8452,
65
  "step": 40
66
  },
67
  {
68
  "epoch": 0.13323464100666174,
69
+ "grad_norm": 0.12440051883459091,
70
  "learning_rate": 1.993503206718859e-05,
71
+ "loss": 0.8228,
72
  "step": 45
73
  },
74
  {
75
  "epoch": 0.14803849000740193,
76
+ "grad_norm": 0.14250266551971436,
77
  "learning_rate": 1.986271337340182e-05,
78
+ "loss": 0.8278,
79
  "step": 50
80
  },
81
  {
82
  "epoch": 0.16284233900814213,
83
+ "grad_norm": 0.12134958058595657,
84
  "learning_rate": 1.976389420563607e-05,
85
+ "loss": 0.8105,
86
  "step": 55
87
  },
88
  {
89
  "epoch": 0.17764618800888232,
90
+ "grad_norm": 0.1252334713935852,
91
  "learning_rate": 1.9638840084614182e-05,
92
+ "loss": 0.7963,
93
  "step": 60
94
  },
95
  {
96
  "epoch": 0.19245003700962252,
97
+ "grad_norm": 0.12292112410068512,
98
  "learning_rate": 1.9487887022684336e-05,
99
+ "loss": 0.8063,
100
  "step": 65
101
  },
102
  {
103
  "epoch": 0.20725388601036268,
104
+ "grad_norm": 0.13962922990322113,
105
  "learning_rate": 1.9311440620976597e-05,
106
+ "loss": 0.799,
107
  "step": 70
108
  },
109
  {
110
  "epoch": 0.22205773501110287,
111
+ "grad_norm": 0.1244792640209198,
112
  "learning_rate": 1.9109974979578852e-05,
113
+ "loss": 0.7898,
114
  "step": 75
115
  },
116
  {
117
  "epoch": 0.23686158401184307,
118
+ "grad_norm": 0.12688860297203064,
119
  "learning_rate": 1.8884031423660492e-05,
120
+ "loss": 0.8185,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 0.25166543301258326,
125
+ "grad_norm": 0.12278366833925247,
126
  "learning_rate": 1.8634217048966638e-05,
127
+ "loss": 0.801,
128
  "step": 85
129
  },
130
  {
131
  "epoch": 0.2664692820133235,
132
+ "grad_norm": 0.11767291277647018,
133
  "learning_rate": 1.836120309059107e-05,
134
+ "loss": 0.7838,
135
  "step": 90
136
  },
137
  {
138
  "epoch": 0.28127313101406365,
139
+ "grad_norm": 0.12599965929985046,
140
  "learning_rate": 1.8065723119410885e-05,
141
+ "loss": 0.7808,
142
  "step": 95
143
  },
144
  {
145
  "epoch": 0.29607698001480387,
146
+ "grad_norm": 0.13216127455234528,
147
  "learning_rate": 1.77485710710289e-05,
148
+ "loss": 0.7879,
149
  "step": 100
150
  },
151
  {
152
  "epoch": 0.29607698001480387,
153
+ "eval_loss": 0.80430006980896,
154
+ "eval_runtime": 5.8595,
155
+ "eval_samples_per_second": 21.845,
156
+ "eval_steps_per_second": 1.365,
157
  "step": 100
158
  },
159
  {
160
  "epoch": 0.31088082901554404,
161
+ "grad_norm": 0.12097267806529999,
162
  "learning_rate": 1.741059911251997e-05,
163
+ "loss": 0.7786,
164
  "step": 105
165
  },
166
  {
167
  "epoch": 0.32568467801628426,
168
+ "grad_norm": 0.12593290209770203,
169
  "learning_rate": 1.7052715352713076e-05,
170
+ "loss": 0.7727,
171
  "step": 110
172
  },
173
  {
174
  "epoch": 0.3404885270170244,
175
+ "grad_norm": 0.12331051379442215,
176
  "learning_rate": 1.667588140216154e-05,
177
+ "loss": 0.7994,
178
  "step": 115
179
  },
180
  {
181
  "epoch": 0.35529237601776464,
182
+ "grad_norm": 0.1359540820121765,
183
  "learning_rate": 1.628110978935756e-05,
184
+ "loss": 0.774,
185
  "step": 120
186
  },
187
  {
188
  "epoch": 0.3700962250185048,
189
+ "grad_norm": 0.13382965326309204,
190
  "learning_rate": 1.586946124013354e-05,
191
+ "loss": 0.7734,
192
  "step": 125
193
  },
194
  {
195
  "epoch": 0.38490007401924503,
196
+ "grad_norm": 0.12126770615577698,
197
  "learning_rate": 1.5442041827560274e-05,
198
+ "loss": 0.7497,
199
  "step": 130
200
  },
201
  {
202
  "epoch": 0.3997039230199852,
203
+ "grad_norm": 0.1166161447763443,
204
  "learning_rate": 1.5000000000000002e-05,
205
+ "loss": 0.7607,
206
  "step": 135
207
  },
208
  {
209
  "epoch": 0.41450777202072536,
210
+ "grad_norm": 0.12959471344947815,
211
  "learning_rate": 1.4544523495299843e-05,
212
+ "loss": 0.7669,
213
  "step": 140
214
  },
215
  {
216
  "epoch": 0.4293116210214656,
217
+ "grad_norm": 0.134933203458786,
218
  "learning_rate": 1.4076836149416889e-05,
219
+ "loss": 0.7829,
220
  "step": 145
221
  },
222
  {
223
  "epoch": 0.44411547002220575,
224
+ "grad_norm": 0.13227751851081848,
225
  "learning_rate": 1.3598194608050011e-05,
226
+ "loss": 0.7677,
227
  "step": 150
228
  },
229
  {
230
  "epoch": 0.45891931902294597,
231
+ "grad_norm": 0.11633738875389099,
232
  "learning_rate": 1.3109884950114007e-05,
233
+ "loss": 0.7567,
234
  "step": 155
235
  },
236
  {
237
  "epoch": 0.47372316802368614,
238
+ "grad_norm": 0.12226972728967667,
239
  "learning_rate": 1.2613219232128608e-05,
240
+ "loss": 0.7568,
241
  "step": 160
242
  },
243
  {
244
  "epoch": 0.48852701702442636,
245
+ "grad_norm": 0.11851673573255539,
246
  "learning_rate": 1.2109531962807333e-05,
247
+ "loss": 0.7583,
248
  "step": 165
249
  },
250
  {
251
  "epoch": 0.5033308660251665,
252
+ "grad_norm": 0.118564672768116,
253
  "learning_rate": 1.1600176517318742e-05,
254
+ "loss": 0.7631,
255
  "step": 170
256
  },
257
  {
258
  "epoch": 0.5181347150259067,
259
+ "grad_norm": 0.12198054790496826,
260
  "learning_rate": 1.1086521500854746e-05,
261
+ "loss": 0.75,
262
  "step": 175
263
  },
264
  {
265
  "epoch": 0.532938564026647,
266
+ "grad_norm": 0.1204327940940857,
267
  "learning_rate": 1.0569947071276847e-05,
268
+ "loss": 0.7708,
269
  "step": 180
270
  },
271
  {
272
  "epoch": 0.5477424130273871,
273
+ "grad_norm": 0.13036802411079407,
274
  "learning_rate": 1.0051841230721065e-05,
275
+ "loss": 0.764,
276
  "step": 185
277
  },
278
  {
279
  "epoch": 0.5625462620281273,
280
+ "grad_norm": 0.13102592527866364,
281
  "learning_rate": 9.533596096125826e-06,
282
+ "loss": 0.7705,
283
  "step": 190
284
  },
285
  {
286
  "epoch": 0.5773501110288675,
287
+ "grad_norm": 0.12162639945745468,
288
  "learning_rate": 9.016604158703654e-06,
289
+ "loss": 0.7444,
290
  "step": 195
291
  },
292
  {
293
  "epoch": 0.5921539600296077,
294
+ "grad_norm": 0.13102519512176514,
295
  "learning_rate": 8.502254542407186e-06,
296
+ "loss": 0.7423,
297
  "step": 200
298
  },
299
  {
300
  "epoch": 0.5921539600296077,
301
+ "eval_loss": 0.7782641053199768,
302
+ "eval_runtime": 5.9255,
303
+ "eval_samples_per_second": 21.601,
304
+ "eval_steps_per_second": 1.35,
305
  "step": 200
306
  },
307
  {
308
  "epoch": 0.6069578090303479,
309
+ "grad_norm": 0.11822319775819778,
310
  "learning_rate": 7.991929271442817e-06,
311
+ "loss": 0.7461,
312
  "step": 205
313
  },
314
  {
315
  "epoch": 0.6217616580310881,
316
+ "grad_norm": 0.11643572896718979,
317
  "learning_rate": 7.48699955686089e-06,
318
+ "loss": 0.7483,
319
  "step": 210
320
  },
321
  {
322
  "epoch": 0.6365655070318282,
323
+ "grad_norm": 0.11460445076227188,
324
  "learning_rate": 6.988822112200157e-06,
325
+ "loss": 0.7567,
326
  "step": 215
327
  },
328
  {
329
  "epoch": 0.6513693560325685,
330
+ "grad_norm": 0.12710332870483398,
331
  "learning_rate": 6.498735508086094e-06,
332
+ "loss": 0.7597,
333
  "step": 220
334
  },
335
  {
336
  "epoch": 0.6661732050333087,
337
+ "grad_norm": 0.11362723261117935,
338
  "learning_rate": 6.018056575578075e-06,
339
+ "loss": 0.7537,
340
  "step": 225
341
  },
342
  {
343
  "epoch": 0.6809770540340488,
344
+ "grad_norm": 0.10726357251405716,
345
  "learning_rate": 5.548076867929331e-06,
346
+ "loss": 0.7503,
347
  "step": 230
348
  },
349
  {
350
  "epoch": 0.695780903034789,
351
+ "grad_norm": 0.11590282618999481,
352
  "learning_rate": 5.090059190266779e-06,
353
+ "loss": 0.7384,
354
  "step": 235
355
  },
356
  {
357
  "epoch": 0.7105847520355293,
358
+ "grad_norm": 0.10790540277957916,
359
  "learning_rate": 4.645234206515171e-06,
360
+ "loss": 0.7436,
361
  "step": 240
362
  },
363
  {
364
  "epoch": 0.7253886010362695,
365
+ "grad_norm": 0.11345735192298889,
366
  "learning_rate": 4.214797132682597e-06,
367
+ "loss": 0.7401,
368
  "step": 245
369
  },
370
  {
371
  "epoch": 0.7401924500370096,
372
+ "grad_norm": 0.11736641824245453,
373
  "learning_rate": 3.799904525392251e-06,
374
+ "loss": 0.747,
375
  "step": 250
376
  },
377
  {
378
  "epoch": 0.7549962990377498,
379
+ "grad_norm": 0.11365947127342224,
380
  "learning_rate": 3.401671174289469e-06,
381
+ "loss": 0.7371,
382
  "step": 255
383
  },
384
  {
385
  "epoch": 0.7698001480384901,
386
+ "grad_norm": 0.11066755652427673,
387
  "learning_rate": 3.021167106673928e-06,
388
+ "loss": 0.7531,
389
  "step": 260
390
  },
391
  {
392
  "epoch": 0.7846039970392302,
393
+ "grad_norm": 0.10806834697723389,
394
  "learning_rate": 2.6594147124053983e-06,
395
+ "loss": 0.742,
396
  "step": 265
397
  },
398
  {
399
  "epoch": 0.7994078460399704,
400
+ "grad_norm": 0.1102728471159935,
401
  "learning_rate": 2.317385996808195e-06,
402
+ "loss": 0.7536,
403
  "step": 270
404
  },
405
  {
406
  "epoch": 0.8142116950407106,
407
+ "grad_norm": 0.10305000841617584,
408
  "learning_rate": 1.9959999689556407e-06,
409
+ "loss": 0.7463,
410
  "step": 275
411
  },
412
  {
413
  "epoch": 0.8290155440414507,
414
+ "grad_norm": 0.10573720186948776,
415
  "learning_rate": 1.6961201723520248e-06,
416
+ "loss": 0.732,
417
  "step": 280
418
  },
419
  {
420
  "epoch": 0.843819393042191,
421
+ "grad_norm": 0.10039414465427399,
422
  "learning_rate": 1.4185523646469822e-06,
423
+ "loss": 0.757,
424
  "step": 285
425
  },
426
  {
427
  "epoch": 0.8586232420429312,
428
+ "grad_norm": 0.10779191553592682,
429
  "learning_rate": 1.1640423526166987e-06,
430
+ "loss": 0.7348,
431
  "step": 290
432
  },
433
  {
434
  "epoch": 0.8734270910436713,
435
+ "grad_norm": 0.1016748696565628,
436
  "learning_rate": 9.332739882292752e-07,
437
+ "loss": 0.7608,
438
  "step": 295
439
  },
440
  {
441
  "epoch": 0.8882309400444115,
442
+ "grad_norm": 0.10107531398534775,
443
  "learning_rate": 7.268673311786378e-07,
444
+ "loss": 0.7507,
445
  "step": 300
446
  },
447
  {
448
  "epoch": 0.8882309400444115,
449
+ "eval_loss": 0.7698361873626709,
450
+ "eval_runtime": 5.8048,
451
+ "eval_samples_per_second": 22.051,
452
+ "eval_steps_per_second": 1.378,
453
  "step": 300
454
  },
455
  {
456
  "epoch": 0.9030347890451518,
457
+ "grad_norm": 0.09836234152317047,
458
  "learning_rate": 5.453769828241872e-07,
459
+ "loss": 0.7343,
460
  "step": 305
461
  },
462
  {
463
  "epoch": 0.9178386380458919,
464
+ "grad_norm": 0.10397884249687195,
465
  "learning_rate": 3.8929059601275463e-07,
466
+ "loss": 0.7668,
467
  "step": 310
468
  },
469
  {
470
  "epoch": 0.9326424870466321,
471
+ "grad_norm": 0.09893100708723068,
472
  "learning_rate": 2.5902756478688674e-07,
473
+ "loss": 0.749,
474
  "step": 315
475
  },
476
  {
477
  "epoch": 0.9474463360473723,
478
+ "grad_norm": 0.1005384624004364,
479
  "learning_rate": 1.5493789750014032e-07,
480
+ "loss": 0.7509,
481
  "step": 320
482
  },
483
  {
484
  "epoch": 0.9622501850481125,
485
+ "grad_norm": 0.10756874084472656,
486
  "learning_rate": 7.730127636723539e-08,
487
+ "loss": 0.7315,
488
  "step": 325
489
  },
490
  {
491
  "epoch": 0.9770540340488527,
492
+ "grad_norm": 0.1049792617559433,
493
  "learning_rate": 2.6326305976001054e-08,
494
+ "loss": 0.7362,
495
  "step": 330
496
  },
497
  {
498
  "epoch": 0.9918578830495929,
499
+ "grad_norm": 0.11218578368425369,
500
  "learning_rate": 2.149952780321485e-09,
501
+ "loss": 0.7576,
502
  "step": 335
503
  },
504
  {
505
  "epoch": 0.997779422649889,
506
  "step": 337,
507
  "total_flos": 76745898196992.0,
508
+ "train_loss": 0.7870922902217604,
509
+ "train_runtime": 4743.2538,
510
+ "train_samples_per_second": 4.556,
511
+ "train_steps_per_second": 0.071
512
  }
513
  ],
514
  "logging_steps": 5,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6e01b3694b919c97bbee46918cea0ad569de1f5a7e12fd01da965ead09ddea96
3
  size 7352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0516457824250b3c72e8f3cba31e7ed9ce0733070d5423a99e9bb91778da8840
3
  size 7352