nimitt0 commited on
Commit
4ef6f3d
·
verified ·
1 Parent(s): b354593

Add files using upload-large-folder tool

Browse files
experiment_cfg/metadata.json CHANGED
@@ -4,139 +4,147 @@
4
  "state": {
5
  "single_arm": {
6
  "max": [
7
- 2.0003252029418945,
8
- 2.8696048259735107,
9
- -2.973998198285699e-05,
10
- 1.2001643180847168,
11
- 1.636069416999817
12
  ],
13
  "min": [
14
- -1.6366058588027954,
15
- -0.0013483419315889478,
16
- -2.867553949356079,
17
- -0.15347807109355927,
18
- -2.408719539642334
19
  ],
20
  "mean": [
21
- 0.5613881349563599,
22
- 1.2009493112564087,
23
- -1.1912317276000977,
24
- 0.7247916460037231,
25
- -0.5904473662376404
26
  ],
27
  "std": [
28
- 0.9785857200622559,
29
- 0.7331446409225464,
30
- 0.6282459497451782,
31
- 0.23302637040615082,
32
- 0.9657405614852905
33
  ],
34
  "q01": [
35
- -1.5449657726287842,
36
- -1.0683193213480989e-05,
37
- -2.7843262195587157,
38
- 0.2979250168800354,
39
- -1.9947426080703736
40
  ],
41
  "q99": [
42
- 1.9999996423721313,
43
- 2.7311124897003163,
44
- -0.3246664524078371,
45
  1.2000004053115845,
46
- 1.5377632951736446
47
  ]
48
  },
49
  "gripper": {
50
  "max": [
51
- 0.703035831451416
52
  ],
53
  "min": [
54
- -0.06299631297588348
55
  ],
56
  "mean": [
57
- 0.024504341185092926
58
  ],
59
  "std": [
60
- 0.08595108985900879
61
  ],
62
  "q01": [
63
- -0.010759906060993672
64
  ],
65
  "q99": [
66
- 0.4408635914325714
67
  ]
68
  }
69
  },
70
  "action": {
71
  "single_arm": {
72
  "max": [
73
- 2.000000476837158,
74
- 2.8696048259735107,
75
- -0.029101531952619553,
76
- 1.2001643180847168,
77
- 1.6363924741744995
78
  ],
79
  "min": [
80
- -1.6367615461349487,
81
- -0.0013483419315889478,
82
- -2.870177745819092,
83
- -0.15347807109355927,
84
- -2.378706455230713
85
  ],
86
  "mean": [
87
- 0.5605649352073669,
88
- 1.2173237800598145,
89
- -1.1972321271896362,
90
- 0.7227405309677124,
91
- -0.5876930356025696
92
  ],
93
  "std": [
94
- 0.9839946627616882,
95
- 0.7347026467323303,
96
- 0.6269937753677368,
97
- 0.23072293400764465,
98
- 0.9713127017021179
99
  ],
100
  "q01": [
101
- -1.5492996978759765,
102
- -8.676056131662335e-06,
103
- -2.7688085556030275,
104
- 0.30216108322143553,
105
- -1.9937553739547729
106
  ],
107
  "q99": [
108
- 1.9999995231628418,
109
- 2.744625654220581,
110
- -0.33259372472763077,
111
  1.200000286102295,
112
- 1.5410190296173096
113
  ]
114
  },
115
  "gripper": {
116
  "max": [
117
- 0.703035831451416
118
  ],
119
  "min": [
120
- -0.06299631297588348
121
  ],
122
  "mean": [
123
- 0.02159128151834011
124
  ],
125
  "std": [
126
- 0.07854700088500977
127
  ],
128
  "q01": [
129
- -0.01098821684718132
130
  ],
131
  "q99": [
132
- 0.41087187886238097
133
  ]
134
  }
135
  }
136
  },
137
  "modalities": {
138
  "video": {
139
- "webcam": {
 
 
 
 
 
 
 
 
140
  "resolution": [
141
  640,
142
  480
 
4
  "state": {
5
  "single_arm": {
6
  "max": [
7
+ 2.0000243186950684,
8
+ 2.9169764518737793,
9
+ -0.038691196590662,
10
+ 1.2000755071640015,
11
+ 1.5494728088378906
12
  ],
13
  "min": [
14
+ -1.6060552597045898,
15
+ -0.0013021699851378798,
16
+ -2.8603312969207764,
17
+ -0.24804846942424774,
18
+ -2.4647815227508545
19
  ],
20
  "mean": [
21
+ 0.5791597962379456,
22
+ 1.1458725929260254,
23
+ -1.20786452293396,
24
+ 0.7232474684715271,
25
+ -0.6134248375892639
26
  ],
27
  "std": [
28
+ 0.9811665415763855,
29
+ 0.7337602376937866,
30
+ 0.6142615079879761,
31
+ 0.23586146533489227,
32
+ 0.966107964515686
33
  ],
34
  "q01": [
35
+ -1.4408677732944488,
36
+ -1.425088475116354e-05,
37
+ -2.781974565982819,
38
+ 0.27418837130069734,
39
+ -1.9999565029144286
40
  ],
41
  "q99": [
42
+ 1.9999998807907104,
43
+ 2.6177230525016784,
44
+ -0.3294296935200691,
45
  1.2000004053115845,
46
+ 1.4374910080432892
47
  ]
48
  },
49
  "gripper": {
50
  "max": [
51
+ 0.7455036640167236
52
  ],
53
  "min": [
54
+ -0.09471277892589569
55
  ],
56
  "mean": [
57
+ 0.000568521732930094
58
  ],
59
  "std": [
60
+ 0.14351940155029297
61
  ],
62
  "q01": [
63
+ -0.05334045834839344
64
  ],
65
  "q99": [
66
+ 0.5978925746679306
67
  ]
68
  }
69
  },
70
  "action": {
71
  "single_arm": {
72
  "max": [
73
+ 2.0000085830688477,
74
+ 2.9173049926757812,
75
+ -0.0662246122956276,
76
+ 1.2000755071640015,
77
+ 1.549481987953186
78
  ],
79
  "min": [
80
+ -1.5781737565994263,
81
+ -0.0013021699851378798,
82
+ -2.8603312969207764,
83
+ -0.24804846942424774,
84
+ -2.4347522258758545
85
  ],
86
  "mean": [
87
+ 0.5763245820999146,
88
+ 1.1609699726104736,
89
+ -1.2109930515289307,
90
+ 0.7210843563079834,
91
+ -0.6085058450698853
92
  ],
93
  "std": [
94
+ 0.9860737323760986,
95
+ 0.7361332774162292,
96
+ 0.6108555197715759,
97
+ 0.23312774300575256,
98
+ 0.9712501168251038
99
  ],
100
  "q01": [
101
+ -1.4462056386470794,
102
+ -1.4080941818974679e-05,
103
+ -2.7542264652252197,
104
+ 0.27880221664905547,
105
+ -1.9997829270362855
106
  ],
107
  "q99": [
108
+ 1.9999998807907104,
109
+ 2.6269060873985284,
110
+ -0.33764528483152395,
111
  1.200000286102295,
112
+ 1.4436372423171995
113
  ]
114
  },
115
  "gripper": {
116
  "max": [
117
+ 0.7455036640167236
118
  ],
119
  "min": [
120
+ -0.07934218645095825
121
  ],
122
  "mean": [
123
+ -0.0036394144408404827
124
  ],
125
  "std": [
126
+ 0.13472281396389008
127
  ],
128
  "q01": [
129
+ -0.05393668111413717
130
  ],
131
  "q99": [
132
+ 0.56789559841156
133
  ]
134
  }
135
  }
136
  },
137
  "modalities": {
138
  "video": {
139
+ "front": {
140
+ "resolution": [
141
+ 640,
142
+ 480
143
+ ],
144
+ "channels": 3,
145
+ "fps": 30.0
146
+ },
147
+ "wrist": {
148
  "resolution": [
149
  640,
150
  480
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4a5a2e3982d2f0d475a6bbfd4b7eb7ace8d29cb89aead3eba6a96736e3138ec5
3
  size 4999367032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e14836ce74a4e9b04da62ca4114faf735a2809c0b08a9108f38af75d77cd30d7
3
  size 4999367032
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0f8f6e3d4c9ae4e9a3ed1d025d893b99a236845708cd12495469812f41b57a3d
3
- size 2586508600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0453b04145be502d125d4232207a885382b890f223ea28e76aaa9b9dec0b7a7d
3
+ size 2586705312
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "metadata": {
3
- "total_size": 7585742720
4
  },
5
  "weight_map": {
6
  "action_head.action_decoder.layer1.W": "model-00002-of-00002.safetensors",
@@ -13,6 +13,7 @@
13
  "action_head.action_encoder.W2.b": "model-00002-of-00002.safetensors",
14
  "action_head.action_encoder.W3.W": "model-00002-of-00002.safetensors",
15
  "action_head.action_encoder.W3.b": "model-00002-of-00002.safetensors",
 
16
  "action_head.model.proj_out_1.bias": "model-00002-of-00002.safetensors",
17
  "action_head.model.proj_out_1.weight": "model-00002-of-00002.safetensors",
18
  "action_head.model.proj_out_2.bias": "model-00002-of-00002.safetensors",
 
1
  {
2
  "metadata": {
3
+ "total_size": 7585939328
4
  },
5
  "weight_map": {
6
  "action_head.action_decoder.layer1.W": "model-00002-of-00002.safetensors",
 
13
  "action_head.action_encoder.W2.b": "model-00002-of-00002.safetensors",
14
  "action_head.action_encoder.W3.W": "model-00002-of-00002.safetensors",
15
  "action_head.action_encoder.W3.b": "model-00002-of-00002.safetensors",
16
+ "action_head.future_tokens.weight": "model-00002-of-00002.safetensors",
17
  "action_head.model.proj_out_1.bias": "model-00002-of-00002.safetensors",
18
  "action_head.model.proj_out_1.weight": "model-00002-of-00002.safetensors",
19
  "action_head.model.proj_out_2.bias": "model-00002-of-00002.safetensors",
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f75938496ae2e6484380c63bc241c2aa134598e299c3cde3af74c90448f6a3cc
3
- size 8550325978
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4fff16306a8567006573dec88809781da838424eb8893b8ac50c99c1707d8bf
3
+ size 8550720062
rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6942e7d0fad9ea5ca8d6849b863c5fca113d1802c0c6b4b6cb63f75db30e17a1
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa987f049de3910e95fa2a955bc4440367b40904739310b87f0dcc5b65565405
3
  size 14512
rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:675c736ed11ba57de7a85176857b7317f9f79aec1c909ebf5cc00810df70079a
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a53f5bc2d14df97b7c41aafe1a226f125dc1e3b00e93b40e785ab358650080f3
3
  size 14512
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:057194249d9cdc822a0752df7fedc436fc30dce92062cd380d9a3f5704199672
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3bbae53c08b86f610006e2dd1b837f5d64ea03e1ff2ff6562b0e6794c52c092
3
  size 1064
trainer_state.json CHANGED
@@ -2,718 +2,2118 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.8090614886731392,
6
  "eval_steps": 500,
7
- "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.008090614886731391,
14
- "grad_norm": 7.823570251464844,
15
- "learning_rate": 6e-06,
16
- "loss": 0.8903,
17
  "step": 10
18
  },
19
  {
20
- "epoch": 0.016181229773462782,
21
- "grad_norm": 2.272317886352539,
22
- "learning_rate": 1.2666666666666668e-05,
23
- "loss": 0.4601,
24
  "step": 20
25
  },
26
  {
27
- "epoch": 0.024271844660194174,
28
- "grad_norm": 1.5555341243743896,
29
- "learning_rate": 1.9333333333333333e-05,
30
- "loss": 0.276,
31
  "step": 30
32
  },
33
  {
34
- "epoch": 0.032362459546925564,
35
- "grad_norm": 1.7244459390640259,
36
- "learning_rate": 2.6000000000000002e-05,
37
- "loss": 0.1865,
38
  "step": 40
39
  },
40
  {
41
- "epoch": 0.040453074433656956,
42
- "grad_norm": 1.4668376445770264,
43
- "learning_rate": 3.266666666666667e-05,
44
- "loss": 0.1422,
45
  "step": 50
46
  },
47
  {
48
- "epoch": 0.04854368932038835,
49
- "grad_norm": 2.02225661277771,
50
- "learning_rate": 3.933333333333333e-05,
51
- "loss": 0.1238,
52
  "step": 60
53
  },
54
  {
55
- "epoch": 0.05663430420711974,
56
- "grad_norm": 1.1487993001937866,
57
- "learning_rate": 4.600000000000001e-05,
58
- "loss": 0.0791,
59
  "step": 70
60
  },
61
  {
62
- "epoch": 0.06472491909385113,
63
- "grad_norm": 0.5569996237754822,
64
- "learning_rate": 5.266666666666666e-05,
65
- "loss": 0.0819,
66
  "step": 80
67
  },
68
  {
69
- "epoch": 0.07281553398058252,
70
- "grad_norm": 1.194751262664795,
71
- "learning_rate": 5.9333333333333343e-05,
72
- "loss": 0.0698,
73
  "step": 90
74
  },
75
  {
76
- "epoch": 0.08090614886731391,
77
- "grad_norm": 0.8545956015586853,
78
- "learning_rate": 6.6e-05,
79
- "loss": 0.0732,
80
  "step": 100
81
  },
82
  {
83
- "epoch": 0.0889967637540453,
84
- "grad_norm": 0.6914941072463989,
85
- "learning_rate": 7.266666666666667e-05,
86
- "loss": 0.0641,
87
  "step": 110
88
  },
89
  {
90
- "epoch": 0.0970873786407767,
91
- "grad_norm": 0.4049356281757355,
92
- "learning_rate": 7.933333333333334e-05,
93
- "loss": 0.0565,
94
  "step": 120
95
  },
96
  {
97
- "epoch": 0.10517799352750809,
98
- "grad_norm": 0.7143370509147644,
99
- "learning_rate": 8.6e-05,
100
- "loss": 0.0555,
101
  "step": 130
102
  },
103
  {
104
- "epoch": 0.11326860841423948,
105
- "grad_norm": 0.9746547937393188,
106
- "learning_rate": 9.266666666666666e-05,
107
- "loss": 0.0523,
108
  "step": 140
109
  },
110
  {
111
- "epoch": 0.12135922330097088,
112
- "grad_norm": 0.6225072145462036,
113
- "learning_rate": 9.933333333333334e-05,
114
- "loss": 0.0492,
115
  "step": 150
116
  },
117
  {
118
- "epoch": 0.12944983818770225,
119
- "grad_norm": 0.6568852066993713,
120
- "learning_rate": 9.999753945398704e-05,
121
- "loss": 0.0478,
122
  "step": 160
123
  },
124
  {
125
- "epoch": 0.13754045307443366,
126
- "grad_norm": 0.345377653837204,
127
- "learning_rate": 9.998903417374228e-05,
128
- "loss": 0.0465,
129
  "step": 170
130
  },
131
  {
132
- "epoch": 0.14563106796116504,
133
- "grad_norm": 0.756610095500946,
134
- "learning_rate": 9.997445481536973e-05,
135
- "loss": 0.0467,
136
  "step": 180
137
  },
138
  {
139
- "epoch": 0.15372168284789645,
140
- "grad_norm": 0.494722843170166,
141
- "learning_rate": 9.995380315038119e-05,
142
- "loss": 0.0407,
143
  "step": 190
144
  },
145
  {
146
- "epoch": 0.16181229773462782,
147
- "grad_norm": 0.7570058107376099,
148
- "learning_rate": 9.99270816881235e-05,
149
- "loss": 0.042,
150
  "step": 200
151
  },
152
  {
153
- "epoch": 0.16990291262135923,
154
- "grad_norm": 0.3776613473892212,
155
- "learning_rate": 9.989429367547377e-05,
156
- "loss": 0.0429,
157
  "step": 210
158
  },
159
  {
160
- "epoch": 0.1779935275080906,
161
- "grad_norm": 0.5018635988235474,
162
- "learning_rate": 9.985544309644475e-05,
163
- "loss": 0.0367,
164
  "step": 220
165
  },
166
  {
167
- "epoch": 0.18608414239482202,
168
- "grad_norm": 0.3979959487915039,
169
- "learning_rate": 9.98105346717008e-05,
170
- "loss": 0.0341,
171
  "step": 230
172
  },
173
  {
174
- "epoch": 0.1941747572815534,
175
- "grad_norm": 0.5319691300392151,
176
- "learning_rate": 9.97595738579843e-05,
177
- "loss": 0.0347,
178
  "step": 240
179
  },
180
  {
181
- "epoch": 0.2022653721682848,
182
- "grad_norm": 0.5348716974258423,
183
- "learning_rate": 9.970256684745258e-05,
184
- "loss": 0.0308,
185
  "step": 250
186
  },
187
  {
188
- "epoch": 0.21035598705501618,
189
- "grad_norm": 0.6536835432052612,
190
- "learning_rate": 9.963952056692549e-05,
191
- "loss": 0.0301,
192
  "step": 260
193
  },
194
  {
195
- "epoch": 0.21844660194174756,
196
- "grad_norm": 0.5518924593925476,
197
- "learning_rate": 9.957044267704384e-05,
198
- "loss": 0.0364,
199
  "step": 270
200
  },
201
  {
202
- "epoch": 0.22653721682847897,
203
- "grad_norm": 0.40961554646492004,
204
- "learning_rate": 9.949534157133844e-05,
205
- "loss": 0.0293,
206
  "step": 280
207
  },
208
  {
209
- "epoch": 0.23462783171521034,
210
- "grad_norm": 0.7217941880226135,
211
- "learning_rate": 9.941422637521035e-05,
212
- "loss": 0.0363,
213
  "step": 290
214
  },
215
  {
216
- "epoch": 0.24271844660194175,
217
- "grad_norm": 0.5057093501091003,
218
- "learning_rate": 9.932710694482191e-05,
219
- "loss": 0.0295,
220
  "step": 300
221
  },
222
  {
223
- "epoch": 0.25080906148867316,
224
- "grad_norm": 0.75019770860672,
225
- "learning_rate": 9.923399386589933e-05,
226
- "loss": 0.033,
227
  "step": 310
228
  },
229
  {
230
- "epoch": 0.2588996763754045,
231
- "grad_norm": 0.48032259941101074,
232
- "learning_rate": 9.913489845244626e-05,
233
- "loss": 0.0277,
234
  "step": 320
235
  },
236
  {
237
- "epoch": 0.2669902912621359,
238
- "grad_norm": 0.5418844223022461,
239
- "learning_rate": 9.902983274536912e-05,
240
- "loss": 0.0264,
241
  "step": 330
242
  },
243
  {
244
- "epoch": 0.2750809061488673,
245
- "grad_norm": 0.5400299429893494,
246
- "learning_rate": 9.891880951101407e-05,
247
- "loss": 0.028,
248
  "step": 340
249
  },
250
  {
251
- "epoch": 0.28317152103559873,
252
- "grad_norm": 0.7953261733055115,
253
- "learning_rate": 9.880184223961573e-05,
254
- "loss": 0.0245,
255
  "step": 350
256
  },
257
  {
258
- "epoch": 0.2912621359223301,
259
- "grad_norm": 0.47858819365501404,
260
- "learning_rate": 9.867894514365802e-05,
261
- "loss": 0.0272,
262
  "step": 360
263
  },
264
  {
265
- "epoch": 0.2993527508090615,
266
- "grad_norm": 0.3962852954864502,
267
- "learning_rate": 9.855013315614725e-05,
268
- "loss": 0.0262,
269
  "step": 370
270
  },
271
  {
272
- "epoch": 0.3074433656957929,
273
- "grad_norm": 0.840124785900116,
274
- "learning_rate": 9.841542192879762e-05,
275
- "loss": 0.0249,
276
  "step": 380
277
  },
278
  {
279
- "epoch": 0.3155339805825243,
280
- "grad_norm": 0.6563257575035095,
281
- "learning_rate": 9.82748278301294e-05,
282
- "loss": 0.0268,
283
  "step": 390
284
  },
285
  {
286
- "epoch": 0.32362459546925565,
287
- "grad_norm": 0.4959515929222107,
288
- "learning_rate": 9.812836794348004e-05,
289
- "loss": 0.0241,
290
  "step": 400
291
  },
292
  {
293
- "epoch": 0.33171521035598706,
294
- "grad_norm": 0.5108354091644287,
295
- "learning_rate": 9.797606006492841e-05,
296
- "loss": 0.0329,
297
  "step": 410
298
  },
299
  {
300
- "epoch": 0.33980582524271846,
301
- "grad_norm": 0.5321593880653381,
302
- "learning_rate": 9.781792270113241e-05,
303
- "loss": 0.0248,
304
  "step": 420
305
  },
306
  {
307
- "epoch": 0.3478964401294498,
308
- "grad_norm": 0.7112411856651306,
309
- "learning_rate": 9.765397506708023e-05,
310
- "loss": 0.0266,
311
  "step": 430
312
  },
313
  {
314
- "epoch": 0.3559870550161812,
315
- "grad_norm": 0.4580034911632538,
316
- "learning_rate": 9.748423708375563e-05,
317
- "loss": 0.0228,
318
  "step": 440
319
  },
320
  {
321
- "epoch": 0.3640776699029126,
322
- "grad_norm": 0.43798476457595825,
323
- "learning_rate": 9.730872937571739e-05,
324
- "loss": 0.0239,
325
  "step": 450
326
  },
327
  {
328
- "epoch": 0.37216828478964403,
329
- "grad_norm": 0.5347399711608887,
330
- "learning_rate": 9.712747326859315e-05,
331
- "loss": 0.0265,
332
  "step": 460
333
  },
334
  {
335
- "epoch": 0.3802588996763754,
336
- "grad_norm": 0.5635089874267578,
337
- "learning_rate": 9.69404907864883e-05,
338
- "loss": 0.027,
339
  "step": 470
340
  },
341
  {
342
- "epoch": 0.3883495145631068,
343
- "grad_norm": 0.3755838871002197,
344
- "learning_rate": 9.674780464930979e-05,
345
- "loss": 0.0234,
346
  "step": 480
347
  },
348
  {
349
- "epoch": 0.3964401294498382,
350
- "grad_norm": 0.522113561630249,
351
- "learning_rate": 9.654943827000548e-05,
352
- "loss": 0.021,
353
  "step": 490
354
  },
355
  {
356
- "epoch": 0.4045307443365696,
357
- "grad_norm": 0.43958115577697754,
358
- "learning_rate": 9.634541575171929e-05,
359
- "loss": 0.0214,
360
  "step": 500
361
  },
362
  {
363
- "epoch": 0.41262135922330095,
364
- "grad_norm": 0.4480895698070526,
365
- "learning_rate": 9.613576188486253e-05,
366
- "loss": 0.0258,
367
  "step": 510
368
  },
369
  {
370
- "epoch": 0.42071197411003236,
371
- "grad_norm": 0.410576730966568,
372
- "learning_rate": 9.59205021441015e-05,
373
- "loss": 0.0226,
374
  "step": 520
375
  },
376
  {
377
- "epoch": 0.42880258899676377,
378
- "grad_norm": 0.45604780316352844,
379
- "learning_rate": 9.569966268526232e-05,
380
- "loss": 0.0256,
381
  "step": 530
382
  },
383
  {
384
- "epoch": 0.4368932038834951,
385
- "grad_norm": 0.30307430028915405,
386
- "learning_rate": 9.54732703421526e-05,
387
- "loss": 0.0204,
388
  "step": 540
389
  },
390
  {
391
- "epoch": 0.4449838187702265,
392
- "grad_norm": 0.29722708463668823,
393
- "learning_rate": 9.524135262330098e-05,
394
- "loss": 0.0198,
395
  "step": 550
396
  },
397
  {
398
- "epoch": 0.45307443365695793,
399
- "grad_norm": 0.38580235838890076,
400
- "learning_rate": 9.50039377086147e-05,
401
- "loss": 0.0168,
402
  "step": 560
403
  },
404
  {
405
- "epoch": 0.46116504854368934,
406
- "grad_norm": 0.39507967233657837,
407
- "learning_rate": 9.476105444595534e-05,
408
- "loss": 0.0157,
409
  "step": 570
410
  },
411
  {
412
- "epoch": 0.4692556634304207,
413
- "grad_norm": 0.34573355317115784,
414
- "learning_rate": 9.451273234763371e-05,
415
- "loss": 0.0176,
416
  "step": 580
417
  },
418
  {
419
- "epoch": 0.4773462783171521,
420
- "grad_norm": 0.2983342111110687,
421
- "learning_rate": 9.425900158682385e-05,
422
- "loss": 0.0177,
423
  "step": 590
424
  },
425
  {
426
- "epoch": 0.4854368932038835,
427
- "grad_norm": 0.38746461272239685,
428
- "learning_rate": 9.399989299389661e-05,
429
- "loss": 0.0216,
430
  "step": 600
431
  },
432
  {
433
- "epoch": 0.4935275080906149,
434
- "grad_norm": 0.39340198040008545,
435
- "learning_rate": 9.373543805267368e-05,
436
- "loss": 0.0221,
437
  "step": 610
438
  },
439
  {
440
- "epoch": 0.5016181229773463,
441
- "grad_norm": 0.47980770468711853,
442
- "learning_rate": 9.346566889660193e-05,
443
- "loss": 0.0172,
444
  "step": 620
445
  },
446
  {
447
- "epoch": 0.5097087378640777,
448
- "grad_norm": 0.421115905046463,
449
- "learning_rate": 9.319061830484898e-05,
450
- "loss": 0.0156,
451
  "step": 630
452
  },
453
  {
454
- "epoch": 0.517799352750809,
455
- "grad_norm": 0.3385259509086609,
456
- "learning_rate": 9.291031969832026e-05,
457
- "loss": 0.0176,
458
  "step": 640
459
  },
460
  {
461
- "epoch": 0.5258899676375405,
462
- "grad_norm": 0.32277145981788635,
463
- "learning_rate": 9.262480713559808e-05,
464
- "loss": 0.0169,
465
  "step": 650
466
  },
467
  {
468
- "epoch": 0.5339805825242718,
469
- "grad_norm": 0.32181084156036377,
470
- "learning_rate": 9.233411530880326e-05,
471
- "loss": 0.0187,
472
  "step": 660
473
  },
474
  {
475
- "epoch": 0.5420711974110033,
476
- "grad_norm": 0.5838663578033447,
477
- "learning_rate": 9.20382795393797e-05,
478
- "loss": 0.0228,
479
  "step": 670
480
  },
481
  {
482
- "epoch": 0.5501618122977346,
483
- "grad_norm": 0.30314013361930847,
484
- "learning_rate": 9.173733577380258e-05,
485
- "loss": 0.02,
486
  "step": 680
487
  },
488
  {
489
- "epoch": 0.558252427184466,
490
- "grad_norm": 0.5964832901954651,
491
- "learning_rate": 9.143132057921058e-05,
492
- "loss": 0.018,
493
  "step": 690
494
  },
495
  {
496
- "epoch": 0.5663430420711975,
497
- "grad_norm": 0.4126530587673187,
498
- "learning_rate": 9.112027113896262e-05,
499
- "loss": 0.0173,
500
  "step": 700
501
  },
502
  {
503
- "epoch": 0.5744336569579288,
504
- "grad_norm": 0.4253070652484894,
505
- "learning_rate": 9.080422524811982e-05,
506
- "loss": 0.0257,
507
  "step": 710
508
  },
509
  {
510
- "epoch": 0.5825242718446602,
511
- "grad_norm": 0.5417248606681824,
512
- "learning_rate": 9.048322130885305e-05,
513
- "loss": 0.0177,
514
  "step": 720
515
  },
516
  {
517
- "epoch": 0.5906148867313916,
518
- "grad_norm": 0.3459491431713104,
519
- "learning_rate": 9.015729832577681e-05,
520
- "loss": 0.019,
521
  "step": 730
522
  },
523
  {
524
- "epoch": 0.598705501618123,
525
- "grad_norm": 0.3335317373275757,
526
- "learning_rate": 8.982649590120982e-05,
527
- "loss": 0.0169,
528
  "step": 740
529
  },
530
  {
531
- "epoch": 0.6067961165048543,
532
- "grad_norm": 0.6572862267494202,
533
- "learning_rate": 8.949085423036296e-05,
534
- "loss": 0.0198,
535
  "step": 750
536
  },
537
  {
538
- "epoch": 0.6148867313915858,
539
- "grad_norm": 0.540212869644165,
540
- "learning_rate": 8.91504140964553e-05,
541
- "loss": 0.016,
542
  "step": 760
543
  },
544
  {
545
- "epoch": 0.6229773462783171,
546
- "grad_norm": 0.24550016224384308,
547
- "learning_rate": 8.880521686575857e-05,
548
- "loss": 0.0168,
549
  "step": 770
550
  },
551
  {
552
- "epoch": 0.6310679611650486,
553
- "grad_norm": 0.5790821313858032,
554
- "learning_rate": 8.845530448257085e-05,
555
- "loss": 0.0184,
556
  "step": 780
557
  },
558
  {
559
- "epoch": 0.63915857605178,
560
- "grad_norm": 0.6583966612815857,
561
- "learning_rate": 8.810071946411989e-05,
562
- "loss": 0.0167,
563
  "step": 790
564
  },
565
  {
566
- "epoch": 0.6472491909385113,
567
- "grad_norm": 0.3252386450767517,
568
- "learning_rate": 8.774150489539707e-05,
569
  "loss": 0.0173,
570
  "step": 800
571
  },
572
  {
573
- "epoch": 0.6553398058252428,
574
- "grad_norm": 0.37483498454093933,
575
- "learning_rate": 8.737770442392212e-05,
576
- "loss": 0.0169,
577
  "step": 810
578
  },
579
  {
580
- "epoch": 0.6634304207119741,
581
- "grad_norm": 0.29752904176712036,
582
- "learning_rate": 8.700936225443959e-05,
583
- "loss": 0.018,
584
  "step": 820
585
  },
586
  {
587
- "epoch": 0.6715210355987055,
588
- "grad_norm": 0.37370291352272034,
589
- "learning_rate": 8.663652314354765e-05,
590
- "loss": 0.0178,
591
  "step": 830
592
  },
593
  {
594
- "epoch": 0.6796116504854369,
595
- "grad_norm": 0.24446973204612732,
596
- "learning_rate": 8.625923239425978e-05,
597
- "loss": 0.0132,
598
  "step": 840
599
  },
600
  {
601
- "epoch": 0.6877022653721683,
602
- "grad_norm": 0.45145106315612793,
603
- "learning_rate": 8.587753585050004e-05,
604
- "loss": 0.0196,
605
  "step": 850
606
  },
607
  {
608
- "epoch": 0.6957928802588996,
609
- "grad_norm": 0.24976196885108948,
610
- "learning_rate": 8.549147989153276e-05,
611
- "loss": 0.017,
612
  "step": 860
613
  },
614
  {
615
- "epoch": 0.7038834951456311,
616
- "grad_norm": 0.41941019892692566,
617
- "learning_rate": 8.510111142632698e-05,
618
- "loss": 0.0153,
619
  "step": 870
620
  },
621
  {
622
- "epoch": 0.7119741100323624,
623
- "grad_norm": 0.3423904776573181,
624
- "learning_rate": 8.470647788785665e-05,
625
- "loss": 0.0143,
626
  "step": 880
627
  },
628
  {
629
- "epoch": 0.7200647249190939,
630
- "grad_norm": 0.28540822863578796,
631
- "learning_rate": 8.430762722733714e-05,
632
- "loss": 0.0125,
633
  "step": 890
634
  },
635
  {
636
- "epoch": 0.7281553398058253,
637
- "grad_norm": 0.38557255268096924,
638
- "learning_rate": 8.390460790839882e-05,
639
- "loss": 0.0193,
640
  "step": 900
641
  },
642
  {
643
- "epoch": 0.7362459546925566,
644
- "grad_norm": 0.496142715215683,
645
- "learning_rate": 8.349746890119826e-05,
646
- "loss": 0.0137,
647
  "step": 910
648
  },
649
  {
650
- "epoch": 0.7443365695792881,
651
- "grad_norm": 0.42488303780555725,
652
- "learning_rate": 8.308625967646795e-05,
653
- "loss": 0.0159,
654
  "step": 920
655
  },
656
  {
657
- "epoch": 0.7524271844660194,
658
- "grad_norm": 0.28551360964775085,
659
- "learning_rate": 8.267103019950529e-05,
660
- "loss": 0.0155,
661
  "step": 930
662
  },
663
  {
664
- "epoch": 0.7605177993527508,
665
- "grad_norm": 0.3401723802089691,
666
- "learning_rate": 8.225183092410128e-05,
667
- "loss": 0.0133,
668
  "step": 940
669
  },
670
  {
671
- "epoch": 0.7686084142394822,
672
- "grad_norm": 0.34012413024902344,
673
- "learning_rate": 8.182871278641009e-05,
674
- "loss": 0.0191,
675
  "step": 950
676
  },
677
  {
678
- "epoch": 0.7766990291262136,
679
- "grad_norm": 0.385560005903244,
680
- "learning_rate": 8.140172719875979e-05,
681
- "loss": 0.0108,
682
  "step": 960
683
  },
684
  {
685
- "epoch": 0.7847896440129449,
686
- "grad_norm": 0.3427627980709076,
687
- "learning_rate": 8.097092604340542e-05,
688
- "loss": 0.0121,
689
  "step": 970
690
  },
691
  {
692
- "epoch": 0.7928802588996764,
693
- "grad_norm": 0.45653820037841797,
694
- "learning_rate": 8.053636166622476e-05,
695
- "loss": 0.0154,
696
  "step": 980
697
  },
698
  {
699
- "epoch": 0.8009708737864077,
700
- "grad_norm": 0.3446105122566223,
701
- "learning_rate": 8.009808687035798e-05,
702
- "loss": 0.0171,
703
  "step": 990
704
  },
705
  {
706
- "epoch": 0.8090614886731392,
707
- "grad_norm": 0.25365254282951355,
708
- "learning_rate": 7.965615490979163e-05,
709
- "loss": 0.014,
710
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
711
  }
712
  ],
713
  "logging_steps": 10,
714
- "max_steps": 3000,
715
  "num_input_tokens_seen": 0,
716
- "num_train_epochs": 3,
717
  "save_steps": 1000,
718
  "stateful_callbacks": {
719
  "TrainerControl": {
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.5188916876574305,
6
  "eval_steps": 500,
7
+ "global_step": 3000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.008396305625524769,
14
+ "grad_norm": 10.171950340270996,
15
+ "learning_rate": 4.5e-06,
16
+ "loss": 1.1261,
17
  "step": 10
18
  },
19
  {
20
+ "epoch": 0.016792611251049538,
21
+ "grad_norm": 2.374096632003784,
22
+ "learning_rate": 9.5e-06,
23
+ "loss": 0.5171,
24
  "step": 20
25
  },
26
  {
27
+ "epoch": 0.02518891687657431,
28
+ "grad_norm": 1.9998104572296143,
29
+ "learning_rate": 1.45e-05,
30
+ "loss": 0.2565,
31
  "step": 30
32
  },
33
  {
34
+ "epoch": 0.033585222502099076,
35
+ "grad_norm": 1.4298408031463623,
36
+ "learning_rate": 1.9500000000000003e-05,
37
+ "loss": 0.1707,
38
  "step": 40
39
  },
40
  {
41
+ "epoch": 0.041981528127623846,
42
+ "grad_norm": 1.2781989574432373,
43
+ "learning_rate": 2.45e-05,
44
+ "loss": 0.1302,
45
  "step": 50
46
  },
47
  {
48
+ "epoch": 0.05037783375314862,
49
+ "grad_norm": 1.4599987268447876,
50
+ "learning_rate": 2.95e-05,
51
+ "loss": 0.1016,
52
  "step": 60
53
  },
54
  {
55
+ "epoch": 0.05877413937867338,
56
+ "grad_norm": 0.7745999097824097,
57
+ "learning_rate": 3.45e-05,
58
+ "loss": 0.0848,
59
  "step": 70
60
  },
61
  {
62
+ "epoch": 0.06717044500419815,
63
+ "grad_norm": 1.022971272468567,
64
+ "learning_rate": 3.9500000000000005e-05,
65
+ "loss": 0.0818,
66
  "step": 80
67
  },
68
  {
69
+ "epoch": 0.07556675062972293,
70
+ "grad_norm": 0.4050329923629761,
71
+ "learning_rate": 4.4500000000000004e-05,
72
+ "loss": 0.0635,
73
  "step": 90
74
  },
75
  {
76
+ "epoch": 0.08396305625524769,
77
+ "grad_norm": 0.7355945110321045,
78
+ "learning_rate": 4.9500000000000004e-05,
79
+ "loss": 0.0647,
80
  "step": 100
81
  },
82
  {
83
+ "epoch": 0.09235936188077246,
84
+ "grad_norm": 0.9530194997787476,
85
+ "learning_rate": 5.45e-05,
86
+ "loss": 0.0634,
87
  "step": 110
88
  },
89
  {
90
+ "epoch": 0.10075566750629723,
91
+ "grad_norm": 0.8249333500862122,
92
+ "learning_rate": 5.95e-05,
93
+ "loss": 0.0556,
94
  "step": 120
95
  },
96
  {
97
+ "epoch": 0.109151973131822,
98
+ "grad_norm": 0.6259945631027222,
99
+ "learning_rate": 6.450000000000001e-05,
100
+ "loss": 0.0554,
101
  "step": 130
102
  },
103
  {
104
+ "epoch": 0.11754827875734676,
105
+ "grad_norm": 0.8876557350158691,
106
+ "learning_rate": 6.95e-05,
107
+ "loss": 0.0495,
108
  "step": 140
109
  },
110
  {
111
+ "epoch": 0.12594458438287154,
112
+ "grad_norm": 0.777101993560791,
113
+ "learning_rate": 7.450000000000001e-05,
114
+ "loss": 0.0518,
115
  "step": 150
116
  },
117
  {
118
+ "epoch": 0.1343408900083963,
119
+ "grad_norm": 0.9956705570220947,
120
+ "learning_rate": 7.950000000000001e-05,
121
+ "loss": 0.0456,
122
  "step": 160
123
  },
124
  {
125
+ "epoch": 0.14273719563392107,
126
+ "grad_norm": 0.5946229696273804,
127
+ "learning_rate": 8.450000000000001e-05,
128
+ "loss": 0.042,
129
  "step": 170
130
  },
131
  {
132
+ "epoch": 0.15113350125944586,
133
+ "grad_norm": 0.7513177990913391,
134
+ "learning_rate": 8.950000000000001e-05,
135
+ "loss": 0.0416,
136
  "step": 180
137
  },
138
  {
139
+ "epoch": 0.15952980688497062,
140
+ "grad_norm": 1.2336539030075073,
141
+ "learning_rate": 9.449999999999999e-05,
142
+ "loss": 0.04,
143
  "step": 190
144
  },
145
  {
146
+ "epoch": 0.16792611251049538,
147
+ "grad_norm": 0.6550417542457581,
148
+ "learning_rate": 9.95e-05,
149
+ "loss": 0.0419,
150
  "step": 200
151
  },
152
  {
153
+ "epoch": 0.17632241813602015,
154
+ "grad_norm": 0.7052122354507446,
155
+ "learning_rate": 9.999861593790126e-05,
156
+ "loss": 0.0444,
157
  "step": 210
158
  },
159
  {
160
+ "epoch": 0.1847187237615449,
161
+ "grad_norm": 0.5741733908653259,
162
+ "learning_rate": 9.999383162408304e-05,
163
+ "loss": 0.043,
164
  "step": 220
165
  },
166
  {
167
+ "epoch": 0.19311502938706968,
168
+ "grad_norm": 0.7300972938537598,
169
+ "learning_rate": 9.998563029828259e-05,
170
+ "loss": 0.0396,
171
  "step": 230
172
  },
173
  {
174
+ "epoch": 0.20151133501259447,
175
+ "grad_norm": 0.35928529500961304,
176
+ "learning_rate": 9.997401252104962e-05,
177
+ "loss": 0.0382,
178
  "step": 240
179
  },
180
  {
181
+ "epoch": 0.20990764063811923,
182
+ "grad_norm": 0.5260980129241943,
183
+ "learning_rate": 9.995897908644378e-05,
184
+ "loss": 0.0332,
185
  "step": 250
186
  },
187
  {
188
+ "epoch": 0.218303946263644,
189
+ "grad_norm": 0.4173150956630707,
190
+ "learning_rate": 9.994053102198034e-05,
191
+ "loss": 0.0314,
192
  "step": 260
193
  },
194
  {
195
+ "epoch": 0.22670025188916876,
196
+ "grad_norm": 0.6619486808776855,
197
+ "learning_rate": 9.991866958856003e-05,
198
+ "loss": 0.0375,
199
  "step": 270
200
  },
201
  {
202
+ "epoch": 0.23509655751469352,
203
+ "grad_norm": 0.6853200197219849,
204
+ "learning_rate": 9.989339628038276e-05,
205
+ "loss": 0.0348,
206
  "step": 280
207
  },
208
  {
209
+ "epoch": 0.2434928631402183,
210
+ "grad_norm": 0.7677571177482605,
211
+ "learning_rate": 9.98647128248456e-05,
212
+ "loss": 0.0434,
213
  "step": 290
214
  },
215
  {
216
+ "epoch": 0.2518891687657431,
217
+ "grad_norm": 0.47874388098716736,
218
+ "learning_rate": 9.98326211824246e-05,
219
+ "loss": 0.0339,
220
  "step": 300
221
  },
222
  {
223
+ "epoch": 0.26028547439126787,
224
+ "grad_norm": 0.5458245277404785,
225
+ "learning_rate": 9.979712354654091e-05,
226
+ "loss": 0.0365,
227
  "step": 310
228
  },
229
  {
230
+ "epoch": 0.2686817800167926,
231
+ "grad_norm": 0.7390539646148682,
232
+ "learning_rate": 9.975822234341079e-05,
233
+ "loss": 0.0319,
234
  "step": 320
235
  },
236
  {
237
+ "epoch": 0.2770780856423174,
238
+ "grad_norm": 0.9258743524551392,
239
+ "learning_rate": 9.97159202318798e-05,
240
+ "loss": 0.0313,
241
  "step": 330
242
  },
243
  {
244
+ "epoch": 0.28547439126784213,
245
+ "grad_norm": 0.4790763556957245,
246
+ "learning_rate": 9.967022010324105e-05,
247
+ "loss": 0.0308,
248
  "step": 340
249
  },
250
  {
251
+ "epoch": 0.2938706968933669,
252
+ "grad_norm": 0.5265215635299683,
253
+ "learning_rate": 9.962112508103765e-05,
254
+ "loss": 0.0292,
255
  "step": 350
256
  },
257
  {
258
+ "epoch": 0.3022670025188917,
259
+ "grad_norm": 0.5428227782249451,
260
+ "learning_rate": 9.956863852084914e-05,
261
+ "loss": 0.0286,
262
  "step": 360
263
  },
264
  {
265
+ "epoch": 0.31066330814441645,
266
+ "grad_norm": 0.5066298246383667,
267
+ "learning_rate": 9.951276401006221e-05,
268
+ "loss": 0.0316,
269
  "step": 370
270
  },
271
  {
272
+ "epoch": 0.31905961376994124,
273
+ "grad_norm": 0.3620965778827667,
274
+ "learning_rate": 9.945350536762543e-05,
275
+ "loss": 0.0251,
276
  "step": 380
277
  },
278
  {
279
+ "epoch": 0.327455919395466,
280
+ "grad_norm": 0.44671374559402466,
281
+ "learning_rate": 9.939086664378829e-05,
282
+ "loss": 0.0271,
283
  "step": 390
284
  },
285
  {
286
+ "epoch": 0.33585222502099077,
287
+ "grad_norm": 0.4878353178501129,
288
+ "learning_rate": 9.932485211982437e-05,
289
+ "loss": 0.0228,
290
  "step": 400
291
  },
292
  {
293
+ "epoch": 0.34424853064651556,
294
+ "grad_norm": 0.43716397881507874,
295
+ "learning_rate": 9.92554663077387e-05,
296
+ "loss": 0.0308,
297
  "step": 410
298
  },
299
  {
300
+ "epoch": 0.3526448362720403,
301
+ "grad_norm": 0.5659756064414978,
302
+ "learning_rate": 9.918271394995935e-05,
303
+ "loss": 0.0244,
304
  "step": 420
305
  },
306
  {
307
+ "epoch": 0.3610411418975651,
308
+ "grad_norm": 0.4486585259437561,
309
+ "learning_rate": 9.910660001901335e-05,
310
+ "loss": 0.028,
311
  "step": 430
312
  },
313
  {
314
+ "epoch": 0.3694374475230898,
315
+ "grad_norm": 0.4010409116744995,
316
+ "learning_rate": 9.902712971718675e-05,
317
+ "loss": 0.0268,
318
  "step": 440
319
  },
320
  {
321
+ "epoch": 0.3778337531486146,
322
+ "grad_norm": 0.48508334159851074,
323
+ "learning_rate": 9.894430847616915e-05,
324
+ "loss": 0.0258,
325
  "step": 450
326
  },
327
  {
328
+ "epoch": 0.38623005877413935,
329
+ "grad_norm": 0.5513168573379517,
330
+ "learning_rate": 9.885814195668232e-05,
331
+ "loss": 0.0288,
332
  "step": 460
333
  },
334
  {
335
+ "epoch": 0.39462636439966414,
336
+ "grad_norm": 0.3650225102901459,
337
+ "learning_rate": 9.876863604809344e-05,
338
+ "loss": 0.0284,
339
  "step": 470
340
  },
341
  {
342
+ "epoch": 0.40302267002518893,
343
+ "grad_norm": 0.35296711325645447,
344
+ "learning_rate": 9.867579686801245e-05,
345
+ "loss": 0.0246,
346
  "step": 480
347
  },
348
  {
349
+ "epoch": 0.41141897565071367,
350
+ "grad_norm": 0.3748459219932556,
351
+ "learning_rate": 9.8579630761874e-05,
352
+ "loss": 0.0233,
353
  "step": 490
354
  },
355
  {
356
+ "epoch": 0.41981528127623846,
357
+ "grad_norm": 0.41755491495132446,
358
+ "learning_rate": 9.848014430250367e-05,
359
+ "loss": 0.023,
360
  "step": 500
361
  },
362
  {
363
+ "epoch": 0.4282115869017632,
364
+ "grad_norm": 0.43207406997680664,
365
+ "learning_rate": 9.837734428966885e-05,
366
+ "loss": 0.0245,
367
  "step": 510
368
  },
369
  {
370
+ "epoch": 0.436607892527288,
371
+ "grad_norm": 0.27464979887008667,
372
+ "learning_rate": 9.827123774961383e-05,
373
+ "loss": 0.0216,
374
  "step": 520
375
  },
376
  {
377
+ "epoch": 0.4450041981528128,
378
+ "grad_norm": 0.5782731771469116,
379
+ "learning_rate": 9.816183193457968e-05,
380
+ "loss": 0.0245,
381
  "step": 530
382
  },
383
  {
384
+ "epoch": 0.4534005037783375,
385
+ "grad_norm": 0.559138834476471,
386
+ "learning_rate": 9.804913432230856e-05,
387
+ "loss": 0.024,
388
  "step": 540
389
  },
390
  {
391
+ "epoch": 0.4617968094038623,
392
+ "grad_norm": 0.6747875809669495,
393
+ "learning_rate": 9.793315261553252e-05,
394
+ "loss": 0.0256,
395
  "step": 550
396
  },
397
  {
398
+ "epoch": 0.47019311502938704,
399
+ "grad_norm": 0.55149245262146,
400
+ "learning_rate": 9.781389474144717e-05,
401
+ "loss": 0.0234,
402
  "step": 560
403
  },
404
  {
405
+ "epoch": 0.47858942065491183,
406
+ "grad_norm": 0.32782289385795593,
407
+ "learning_rate": 9.76913688511698e-05,
408
+ "loss": 0.0224,
409
  "step": 570
410
  },
411
  {
412
+ "epoch": 0.4869857262804366,
413
+ "grad_norm": 0.5114140510559082,
414
+ "learning_rate": 9.756558331918227e-05,
415
+ "loss": 0.0236,
416
  "step": 580
417
  },
418
  {
419
+ "epoch": 0.49538203190596136,
420
+ "grad_norm": 0.6146140098571777,
421
+ "learning_rate": 9.743654674275855e-05,
422
+ "loss": 0.0255,
423
  "step": 590
424
  },
425
  {
426
+ "epoch": 0.5037783375314862,
427
+ "grad_norm": 0.7248744964599609,
428
+ "learning_rate": 9.730426794137727e-05,
429
+ "loss": 0.0266,
430
  "step": 600
431
  },
432
  {
433
+ "epoch": 0.5121746431570109,
434
+ "grad_norm": 0.389180451631546,
435
+ "learning_rate": 9.716875595611879e-05,
436
+ "loss": 0.0267,
437
  "step": 610
438
  },
439
  {
440
+ "epoch": 0.5205709487825357,
441
+ "grad_norm": 0.49693188071250916,
442
+ "learning_rate": 9.703002004904729e-05,
443
+ "loss": 0.0227,
444
  "step": 620
445
  },
446
  {
447
+ "epoch": 0.5289672544080605,
448
+ "grad_norm": 0.5026288628578186,
449
+ "learning_rate": 9.688806970257773e-05,
450
+ "loss": 0.0197,
451
  "step": 630
452
  },
453
  {
454
+ "epoch": 0.5373635600335852,
455
+ "grad_norm": 0.35680270195007324,
456
+ "learning_rate": 9.674291461882774e-05,
457
+ "loss": 0.0195,
458
  "step": 640
459
  },
460
  {
461
+ "epoch": 0.5457598656591099,
462
+ "grad_norm": 0.3147616386413574,
463
+ "learning_rate": 9.659456471895445e-05,
464
+ "loss": 0.0212,
465
  "step": 650
466
  },
467
  {
468
+ "epoch": 0.5541561712846348,
469
+ "grad_norm": 0.25688570737838745,
470
+ "learning_rate": 9.644303014247648e-05,
471
+ "loss": 0.0184,
472
  "step": 660
473
  },
474
  {
475
+ "epoch": 0.5625524769101595,
476
+ "grad_norm": 0.33005818724632263,
477
+ "learning_rate": 9.628832124658085e-05,
478
+ "loss": 0.0248,
479
  "step": 670
480
  },
481
  {
482
+ "epoch": 0.5709487825356843,
483
+ "grad_norm": 0.3768410384654999,
484
+ "learning_rate": 9.613044860541507e-05,
485
+ "loss": 0.0207,
486
  "step": 680
487
  },
488
  {
489
+ "epoch": 0.5793450881612091,
490
+ "grad_norm": 0.27856314182281494,
491
+ "learning_rate": 9.596942300936445e-05,
492
+ "loss": 0.0174,
493
  "step": 690
494
  },
495
  {
496
+ "epoch": 0.5877413937867338,
497
+ "grad_norm": 0.44634294509887695,
498
+ "learning_rate": 9.580525546431459e-05,
499
+ "loss": 0.0194,
500
  "step": 700
501
  },
502
  {
503
+ "epoch": 0.5961376994122586,
504
+ "grad_norm": 0.35559549927711487,
505
+ "learning_rate": 9.563795719089911e-05,
506
+ "loss": 0.0236,
507
  "step": 710
508
  },
509
  {
510
+ "epoch": 0.6045340050377834,
511
+ "grad_norm": 0.3311902582645416,
512
+ "learning_rate": 9.546753962373281e-05,
513
+ "loss": 0.0175,
514
  "step": 720
515
  },
516
  {
517
+ "epoch": 0.6129303106633082,
518
+ "grad_norm": 0.2235281765460968,
519
+ "learning_rate": 9.529401441062997e-05,
520
+ "loss": 0.018,
521
  "step": 730
522
  },
523
  {
524
+ "epoch": 0.6213266162888329,
525
+ "grad_norm": 0.3235013782978058,
526
+ "learning_rate": 9.511739341180842e-05,
527
+ "loss": 0.0142,
528
  "step": 740
529
  },
530
  {
531
+ "epoch": 0.6297229219143576,
532
+ "grad_norm": 0.4594292640686035,
533
+ "learning_rate": 9.493768869907886e-05,
534
+ "loss": 0.0194,
535
  "step": 750
536
  },
537
  {
538
+ "epoch": 0.6381192275398825,
539
+ "grad_norm": 0.4702000617980957,
540
+ "learning_rate": 9.475491255501968e-05,
541
+ "loss": 0.0149,
542
  "step": 760
543
  },
544
  {
545
+ "epoch": 0.6465155331654072,
546
+ "grad_norm": 0.3968784511089325,
547
+ "learning_rate": 9.456907747213748e-05,
548
+ "loss": 0.0164,
549
  "step": 770
550
  },
551
  {
552
+ "epoch": 0.654911838790932,
553
+ "grad_norm": 0.5368077754974365,
554
+ "learning_rate": 9.438019615201336e-05,
555
+ "loss": 0.0183,
556
  "step": 780
557
  },
558
  {
559
+ "epoch": 0.6633081444164568,
560
+ "grad_norm": 0.32421955466270447,
561
+ "learning_rate": 9.418828150443469e-05,
562
+ "loss": 0.0154,
563
  "step": 790
564
  },
565
  {
566
+ "epoch": 0.6717044500419815,
567
+ "grad_norm": 0.36530694365501404,
568
+ "learning_rate": 9.399334664651262e-05,
569
  "loss": 0.0173,
570
  "step": 800
571
  },
572
  {
573
+ "epoch": 0.6801007556675063,
574
+ "grad_norm": 0.24784396588802338,
575
+ "learning_rate": 9.379540490178581e-05,
576
+ "loss": 0.0199,
577
  "step": 810
578
  },
579
  {
580
+ "epoch": 0.6884970612930311,
581
+ "grad_norm": 0.4272478520870209,
582
+ "learning_rate": 9.359446979930955e-05,
583
+ "loss": 0.0178,
584
  "step": 820
585
  },
586
  {
587
+ "epoch": 0.6968933669185559,
588
+ "grad_norm": 0.39507076144218445,
589
+ "learning_rate": 9.33905550727312e-05,
590
+ "loss": 0.0183,
591
  "step": 830
592
  },
593
  {
594
+ "epoch": 0.7052896725440806,
595
+ "grad_norm": 0.20049844682216644,
596
+ "learning_rate": 9.318367465935142e-05,
597
+ "loss": 0.0156,
598
  "step": 840
599
  },
600
  {
601
+ "epoch": 0.7136859781696053,
602
+ "grad_norm": 0.8157637119293213,
603
+ "learning_rate": 9.29738426991717e-05,
604
+ "loss": 0.0207,
605
  "step": 850
606
  },
607
  {
608
+ "epoch": 0.7220822837951302,
609
+ "grad_norm": 0.5720648765563965,
610
+ "learning_rate": 9.276107353392774e-05,
611
+ "loss": 0.02,
612
  "step": 860
613
  },
614
  {
615
+ "epoch": 0.7304785894206549,
616
+ "grad_norm": 0.7491711974143982,
617
+ "learning_rate": 9.254538170610938e-05,
618
+ "loss": 0.0215,
619
  "step": 870
620
  },
621
  {
622
+ "epoch": 0.7388748950461796,
623
+ "grad_norm": 0.6429204344749451,
624
+ "learning_rate": 9.232678195796654e-05,
625
+ "loss": 0.0161,
626
  "step": 880
627
  },
628
  {
629
+ "epoch": 0.7472712006717045,
630
+ "grad_norm": 0.5460074543952942,
631
+ "learning_rate": 9.210528923050164e-05,
632
+ "loss": 0.014,
633
  "step": 890
634
  },
635
  {
636
+ "epoch": 0.7556675062972292,
637
+ "grad_norm": 0.5716153383255005,
638
+ "learning_rate": 9.188091866244834e-05,
639
+ "loss": 0.0209,
640
  "step": 900
641
  },
642
  {
643
+ "epoch": 0.764063811922754,
644
+ "grad_norm": 0.5055357813835144,
645
+ "learning_rate": 9.165368558923695e-05,
646
+ "loss": 0.0143,
647
  "step": 910
648
  },
649
  {
650
+ "epoch": 0.7724601175482787,
651
+ "grad_norm": 0.5202294588088989,
652
+ "learning_rate": 9.142360554194618e-05,
653
+ "loss": 0.0173,
654
  "step": 920
655
  },
656
  {
657
+ "epoch": 0.7808564231738035,
658
+ "grad_norm": 0.2911999821662903,
659
+ "learning_rate": 9.119069424624163e-05,
660
+ "loss": 0.0152,
661
  "step": 930
662
  },
663
  {
664
+ "epoch": 0.7892527287993283,
665
+ "grad_norm": 0.28226837515830994,
666
+ "learning_rate": 9.0954967621301e-05,
667
+ "loss": 0.0125,
668
  "step": 940
669
  },
670
  {
671
+ "epoch": 0.797649034424853,
672
+ "grad_norm": 0.24133360385894775,
673
+ "learning_rate": 9.071644177872594e-05,
674
+ "loss": 0.018,
675
  "step": 950
676
  },
677
  {
678
+ "epoch": 0.8060453400503779,
679
+ "grad_norm": 0.28321564197540283,
680
+ "learning_rate": 9.047513302144095e-05,
681
+ "loss": 0.0126,
682
  "step": 960
683
  },
684
  {
685
+ "epoch": 0.8144416456759026,
686
+ "grad_norm": 0.3964134752750397,
687
+ "learning_rate": 9.023105784257906e-05,
688
+ "loss": 0.0131,
689
  "step": 970
690
  },
691
  {
692
+ "epoch": 0.8228379513014273,
693
+ "grad_norm": 0.4938385486602783,
694
+ "learning_rate": 8.998423292435454e-05,
695
+ "loss": 0.0166,
696
  "step": 980
697
  },
698
  {
699
+ "epoch": 0.8312342569269522,
700
+ "grad_norm": 0.38572144508361816,
701
+ "learning_rate": 8.973467513692265e-05,
702
+ "loss": 0.0194,
703
  "step": 990
704
  },
705
  {
706
+ "epoch": 0.8396305625524769,
707
+ "grad_norm": 0.5354552268981934,
708
+ "learning_rate": 8.94824015372267e-05,
709
+ "loss": 0.0159,
710
  "step": 1000
711
+ },
712
+ {
713
+ "epoch": 0.8480268681780017,
714
+ "grad_norm": 0.5144203305244446,
715
+ "learning_rate": 8.922742936783207e-05,
716
+ "loss": 0.0142,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 0.8564231738035264,
721
+ "grad_norm": 0.49846023321151733,
722
+ "learning_rate": 8.896977605574788e-05,
723
+ "loss": 0.015,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 0.8648194794290512,
728
+ "grad_norm": 0.48377203941345215,
729
+ "learning_rate": 8.870945921123576e-05,
730
+ "loss": 0.0151,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 0.873215785054576,
735
+ "grad_norm": 0.45066890120506287,
736
+ "learning_rate": 8.844649662660624e-05,
737
+ "loss": 0.0145,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 0.8816120906801007,
742
+ "grad_norm": 0.4206666946411133,
743
+ "learning_rate": 8.818090627500266e-05,
744
+ "loss": 0.0129,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 0.8900083963056256,
749
+ "grad_norm": 0.41736483573913574,
750
+ "learning_rate": 8.791270630917275e-05,
751
+ "loss": 0.0192,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 0.8984047019311503,
756
+ "grad_norm": 0.3356010615825653,
757
+ "learning_rate": 8.764191506022795e-05,
758
+ "loss": 0.0148,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 0.906801007556675,
763
+ "grad_norm": 0.3665725886821747,
764
+ "learning_rate": 8.736855103639037e-05,
765
+ "loss": 0.0155,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 0.9151973131821999,
770
+ "grad_norm": NaN,
771
+ "learning_rate": 8.709263292172794e-05,
772
+ "loss": 0.0082,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 0.9235936188077246,
777
+ "grad_norm": NaN,
778
+ "learning_rate": 8.681417957487729e-05,
779
+ "loss": 0.0,
780
+ "step": 1100
781
+ },
782
+ {
783
+ "epoch": 0.9319899244332494,
784
+ "grad_norm": NaN,
785
+ "learning_rate": 8.653321002775478e-05,
786
+ "loss": 0.0,
787
+ "step": 1110
788
+ },
789
+ {
790
+ "epoch": 0.9403862300587741,
791
+ "grad_norm": NaN,
792
+ "learning_rate": 8.624974348425574e-05,
793
+ "loss": 0.0,
794
+ "step": 1120
795
+ },
796
+ {
797
+ "epoch": 0.9487825356842989,
798
+ "grad_norm": NaN,
799
+ "learning_rate": 8.596379931894188e-05,
800
+ "loss": 0.0,
801
+ "step": 1130
802
+ },
803
+ {
804
+ "epoch": 0.9571788413098237,
805
+ "grad_norm": NaN,
806
+ "learning_rate": 8.567539707571703e-05,
807
+ "loss": 0.0,
808
+ "step": 1140
809
+ },
810
+ {
811
+ "epoch": 0.9655751469353484,
812
+ "grad_norm": NaN,
813
+ "learning_rate": 8.538455646649146e-05,
814
+ "loss": 0.0,
815
+ "step": 1150
816
+ },
817
+ {
818
+ "epoch": 0.9739714525608733,
819
+ "grad_norm": NaN,
820
+ "learning_rate": 8.509129736983446e-05,
821
+ "loss": 0.0,
822
+ "step": 1160
823
+ },
824
+ {
825
+ "epoch": 0.982367758186398,
826
+ "grad_norm": NaN,
827
+ "learning_rate": 8.479563982961571e-05,
828
+ "loss": 0.0,
829
+ "step": 1170
830
+ },
831
+ {
832
+ "epoch": 0.9907640638119227,
833
+ "grad_norm": NaN,
834
+ "learning_rate": 8.449760405363539e-05,
835
+ "loss": 0.0,
836
+ "step": 1180
837
+ },
838
+ {
839
+ "epoch": 0.9991603694374476,
840
+ "grad_norm": NaN,
841
+ "learning_rate": 8.419721041224287e-05,
842
+ "loss": 0.0,
843
+ "step": 1190
844
+ },
845
+ {
846
+ "epoch": 1.0075566750629723,
847
+ "grad_norm": NaN,
848
+ "learning_rate": 8.389447943694451e-05,
849
+ "loss": 0.0,
850
+ "step": 1200
851
+ },
852
+ {
853
+ "epoch": 1.015952980688497,
854
+ "grad_norm": NaN,
855
+ "learning_rate": 8.358943181900032e-05,
856
+ "loss": 0.0,
857
+ "step": 1210
858
+ },
859
+ {
860
+ "epoch": 1.0243492863140218,
861
+ "grad_norm": NaN,
862
+ "learning_rate": 8.328208840800981e-05,
863
+ "loss": 0.0,
864
+ "step": 1220
865
+ },
866
+ {
867
+ "epoch": 1.0327455919395465,
868
+ "grad_norm": NaN,
869
+ "learning_rate": 8.297247021048686e-05,
870
+ "loss": 0.0,
871
+ "step": 1230
872
+ },
873
+ {
874
+ "epoch": 1.0411418975650715,
875
+ "grad_norm": NaN,
876
+ "learning_rate": 8.266059838842396e-05,
877
+ "loss": 0.0,
878
+ "step": 1240
879
+ },
880
+ {
881
+ "epoch": 1.0495382031905962,
882
+ "grad_norm": NaN,
883
+ "learning_rate": 8.23464942578459e-05,
884
+ "loss": 0.0,
885
+ "step": 1250
886
+ },
887
+ {
888
+ "epoch": 1.057934508816121,
889
+ "grad_norm": NaN,
890
+ "learning_rate": 8.203017928735277e-05,
891
+ "loss": 0.0,
892
+ "step": 1260
893
+ },
894
+ {
895
+ "epoch": 1.0663308144416457,
896
+ "grad_norm": NaN,
897
+ "learning_rate": 8.17116750966526e-05,
898
+ "loss": 0.0,
899
+ "step": 1270
900
+ },
901
+ {
902
+ "epoch": 1.0747271200671704,
903
+ "grad_norm": NaN,
904
+ "learning_rate": 8.139100345508377e-05,
905
+ "loss": 0.0,
906
+ "step": 1280
907
+ },
908
+ {
909
+ "epoch": 1.0831234256926952,
910
+ "grad_norm": NaN,
911
+ "learning_rate": 8.106818628012697e-05,
912
+ "loss": 0.0,
913
+ "step": 1290
914
+ },
915
+ {
916
+ "epoch": 1.0915197313182199,
917
+ "grad_norm": NaN,
918
+ "learning_rate": 8.074324563590736e-05,
919
+ "loss": 0.0,
920
+ "step": 1300
921
+ },
922
+ {
923
+ "epoch": 1.0999160369437448,
924
+ "grad_norm": NaN,
925
+ "learning_rate": 8.041620373168628e-05,
926
+ "loss": 0.0,
927
+ "step": 1310
928
+ },
929
+ {
930
+ "epoch": 1.1083123425692696,
931
+ "grad_norm": NaN,
932
+ "learning_rate": 8.008708292034349e-05,
933
+ "loss": 0.0,
934
+ "step": 1320
935
+ },
936
+ {
937
+ "epoch": 1.1167086481947943,
938
+ "grad_norm": NaN,
939
+ "learning_rate": 7.975590569684925e-05,
940
+ "loss": 0.0,
941
+ "step": 1330
942
+ },
943
+ {
944
+ "epoch": 1.125104953820319,
945
+ "grad_norm": NaN,
946
+ "learning_rate": 7.942269469672687e-05,
947
+ "loss": 0.0,
948
+ "step": 1340
949
+ },
950
+ {
951
+ "epoch": 1.1335012594458438,
952
+ "grad_norm": NaN,
953
+ "learning_rate": 7.908747269450558e-05,
954
+ "loss": 0.0,
955
+ "step": 1350
956
+ },
957
+ {
958
+ "epoch": 1.1418975650713685,
959
+ "grad_norm": NaN,
960
+ "learning_rate": 7.875026260216393e-05,
961
+ "loss": 0.0,
962
+ "step": 1360
963
+ },
964
+ {
965
+ "epoch": 1.1502938706968933,
966
+ "grad_norm": NaN,
967
+ "learning_rate": 7.841108746756382e-05,
968
+ "loss": 0.0,
969
+ "step": 1370
970
+ },
971
+ {
972
+ "epoch": 1.1586901763224182,
973
+ "grad_norm": NaN,
974
+ "learning_rate": 7.806997047287516e-05,
975
+ "loss": 0.0,
976
+ "step": 1380
977
+ },
978
+ {
979
+ "epoch": 1.167086481947943,
980
+ "grad_norm": NaN,
981
+ "learning_rate": 7.772693493299138e-05,
982
+ "loss": 0.0,
983
+ "step": 1390
984
+ },
985
+ {
986
+ "epoch": 1.1754827875734677,
987
+ "grad_norm": NaN,
988
+ "learning_rate": 7.7382004293936e-05,
989
+ "loss": 0.0,
990
+ "step": 1400
991
+ },
992
+ {
993
+ "epoch": 1.1838790931989924,
994
+ "grad_norm": NaN,
995
+ "learning_rate": 7.703520213126e-05,
996
+ "loss": 0.0,
997
+ "step": 1410
998
+ },
999
+ {
1000
+ "epoch": 1.1922753988245172,
1001
+ "grad_norm": NaN,
1002
+ "learning_rate": 7.66865521484305e-05,
1003
+ "loss": 0.0,
1004
+ "step": 1420
1005
+ },
1006
+ {
1007
+ "epoch": 1.200671704450042,
1008
+ "grad_norm": NaN,
1009
+ "learning_rate": 7.633607817521074e-05,
1010
+ "loss": 0.0,
1011
+ "step": 1430
1012
+ },
1013
+ {
1014
+ "epoch": 1.2090680100755669,
1015
+ "grad_norm": NaN,
1016
+ "learning_rate": 7.598380416603119e-05,
1017
+ "loss": 0.0,
1018
+ "step": 1440
1019
+ },
1020
+ {
1021
+ "epoch": 1.2174643157010916,
1022
+ "grad_norm": NaN,
1023
+ "learning_rate": 7.562975419835247e-05,
1024
+ "loss": 0.0,
1025
+ "step": 1450
1026
+ },
1027
+ {
1028
+ "epoch": 1.2258606213266163,
1029
+ "grad_norm": NaN,
1030
+ "learning_rate": 7.527395247101956e-05,
1031
+ "loss": 0.0,
1032
+ "step": 1460
1033
+ },
1034
+ {
1035
+ "epoch": 1.234256926952141,
1036
+ "grad_norm": NaN,
1037
+ "learning_rate": 7.491642330260789e-05,
1038
+ "loss": 0.0,
1039
+ "step": 1470
1040
+ },
1041
+ {
1042
+ "epoch": 1.2426532325776658,
1043
+ "grad_norm": NaN,
1044
+ "learning_rate": 7.45571911297612e-05,
1045
+ "loss": 0.0,
1046
+ "step": 1480
1047
+ },
1048
+ {
1049
+ "epoch": 1.2510495382031905,
1050
+ "grad_norm": NaN,
1051
+ "learning_rate": 7.419628050552131e-05,
1052
+ "loss": 0.0,
1053
+ "step": 1490
1054
+ },
1055
+ {
1056
+ "epoch": 1.2594458438287153,
1057
+ "grad_norm": NaN,
1058
+ "learning_rate": 7.383371609764999e-05,
1059
+ "loss": 0.0,
1060
+ "step": 1500
1061
+ },
1062
+ {
1063
+ "epoch": 1.26784214945424,
1064
+ "grad_norm": NaN,
1065
+ "learning_rate": 7.346952268694288e-05,
1066
+ "loss": 0.0,
1067
+ "step": 1510
1068
+ },
1069
+ {
1070
+ "epoch": 1.276238455079765,
1071
+ "grad_norm": NaN,
1072
+ "learning_rate": 7.310372516553585e-05,
1073
+ "loss": 0.0,
1074
+ "step": 1520
1075
+ },
1076
+ {
1077
+ "epoch": 1.2846347607052897,
1078
+ "grad_norm": NaN,
1079
+ "learning_rate": 7.273634853520356e-05,
1080
+ "loss": 0.0,
1081
+ "step": 1530
1082
+ },
1083
+ {
1084
+ "epoch": 1.2930310663308144,
1085
+ "grad_norm": NaN,
1086
+ "learning_rate": 7.236741790565072e-05,
1087
+ "loss": 0.0,
1088
+ "step": 1540
1089
+ },
1090
+ {
1091
+ "epoch": 1.3014273719563392,
1092
+ "grad_norm": NaN,
1093
+ "learning_rate": 7.199695849279576e-05,
1094
+ "loss": 0.0,
1095
+ "step": 1550
1096
+ },
1097
+ {
1098
+ "epoch": 1.309823677581864,
1099
+ "grad_norm": NaN,
1100
+ "learning_rate": 7.162499561704747e-05,
1101
+ "loss": 0.0,
1102
+ "step": 1560
1103
+ },
1104
+ {
1105
+ "epoch": 1.3182199832073889,
1106
+ "grad_norm": NaN,
1107
+ "learning_rate": 7.125155470157429e-05,
1108
+ "loss": 0.0,
1109
+ "step": 1570
1110
+ },
1111
+ {
1112
+ "epoch": 1.3266162888329136,
1113
+ "grad_norm": NaN,
1114
+ "learning_rate": 7.087666127056675e-05,
1115
+ "loss": 0.0,
1116
+ "step": 1580
1117
+ },
1118
+ {
1119
+ "epoch": 1.3350125944584383,
1120
+ "grad_norm": NaN,
1121
+ "learning_rate": 7.050034094749286e-05,
1122
+ "loss": 0.0,
1123
+ "step": 1590
1124
+ },
1125
+ {
1126
+ "epoch": 1.343408900083963,
1127
+ "grad_norm": NaN,
1128
+ "learning_rate": 7.012261945334683e-05,
1129
+ "loss": 0.0,
1130
+ "step": 1600
1131
+ },
1132
+ {
1133
+ "epoch": 1.3518052057094878,
1134
+ "grad_norm": NaN,
1135
+ "learning_rate": 6.974352260489103e-05,
1136
+ "loss": 0.0,
1137
+ "step": 1610
1138
+ },
1139
+ {
1140
+ "epoch": 1.3602015113350125,
1141
+ "grad_norm": NaN,
1142
+ "learning_rate": 6.936307631289148e-05,
1143
+ "loss": 0.0,
1144
+ "step": 1620
1145
+ },
1146
+ {
1147
+ "epoch": 1.3685978169605373,
1148
+ "grad_norm": NaN,
1149
+ "learning_rate": 6.898130658034685e-05,
1150
+ "loss": 0.0,
1151
+ "step": 1630
1152
+ },
1153
+ {
1154
+ "epoch": 1.376994122586062,
1155
+ "grad_norm": NaN,
1156
+ "learning_rate": 6.859823950071127e-05,
1157
+ "loss": 0.0,
1158
+ "step": 1640
1159
+ },
1160
+ {
1161
+ "epoch": 1.385390428211587,
1162
+ "grad_norm": NaN,
1163
+ "learning_rate": 6.821390125611078e-05,
1164
+ "loss": 0.0,
1165
+ "step": 1650
1166
+ },
1167
+ {
1168
+ "epoch": 1.3937867338371117,
1169
+ "grad_norm": NaN,
1170
+ "learning_rate": 6.782831811555385e-05,
1171
+ "loss": 0.0,
1172
+ "step": 1660
1173
+ },
1174
+ {
1175
+ "epoch": 1.4021830394626364,
1176
+ "grad_norm": NaN,
1177
+ "learning_rate": 6.744151643313597e-05,
1178
+ "loss": 0.0,
1179
+ "step": 1670
1180
+ },
1181
+ {
1182
+ "epoch": 1.4105793450881612,
1183
+ "grad_norm": NaN,
1184
+ "learning_rate": 6.705352264623828e-05,
1185
+ "loss": 0.0,
1186
+ "step": 1680
1187
+ },
1188
+ {
1189
+ "epoch": 1.418975650713686,
1190
+ "grad_norm": NaN,
1191
+ "learning_rate": 6.666436327372078e-05,
1192
+ "loss": 0.0,
1193
+ "step": 1690
1194
+ },
1195
+ {
1196
+ "epoch": 1.4273719563392109,
1197
+ "grad_norm": NaN,
1198
+ "learning_rate": 6.62740649141096e-05,
1199
+ "loss": 0.0,
1200
+ "step": 1700
1201
+ },
1202
+ {
1203
+ "epoch": 1.4357682619647356,
1204
+ "grad_norm": NaN,
1205
+ "learning_rate": 6.588265424377919e-05,
1206
+ "loss": 0.0,
1207
+ "step": 1710
1208
+ },
1209
+ {
1210
+ "epoch": 1.4441645675902604,
1211
+ "grad_norm": NaN,
1212
+ "learning_rate": 6.549015801512895e-05,
1213
+ "loss": 0.0,
1214
+ "step": 1720
1215
+ },
1216
+ {
1217
+ "epoch": 1.452560873215785,
1218
+ "grad_norm": NaN,
1219
+ "learning_rate": 6.509660305475468e-05,
1220
+ "loss": 0.0,
1221
+ "step": 1730
1222
+ },
1223
+ {
1224
+ "epoch": 1.4609571788413098,
1225
+ "grad_norm": NaN,
1226
+ "learning_rate": 6.47020162616152e-05,
1227
+ "loss": 0.0,
1228
+ "step": 1740
1229
+ },
1230
+ {
1231
+ "epoch": 1.4693534844668346,
1232
+ "grad_norm": NaN,
1233
+ "learning_rate": 6.430642460519365e-05,
1234
+ "loss": 0.0,
1235
+ "step": 1750
1236
+ },
1237
+ {
1238
+ "epoch": 1.4777497900923593,
1239
+ "grad_norm": NaN,
1240
+ "learning_rate": 6.390985512365426e-05,
1241
+ "loss": 0.0,
1242
+ "step": 1760
1243
+ },
1244
+ {
1245
+ "epoch": 1.486146095717884,
1246
+ "grad_norm": NaN,
1247
+ "learning_rate": 6.351233492199431e-05,
1248
+ "loss": 0.0,
1249
+ "step": 1770
1250
+ },
1251
+ {
1252
+ "epoch": 1.4945424013434088,
1253
+ "grad_norm": NaN,
1254
+ "learning_rate": 6.311389117019155e-05,
1255
+ "loss": 0.0,
1256
+ "step": 1780
1257
+ },
1258
+ {
1259
+ "epoch": 1.5029387069689337,
1260
+ "grad_norm": NaN,
1261
+ "learning_rate": 6.271455110134713e-05,
1262
+ "loss": 0.0,
1263
+ "step": 1790
1264
+ },
1265
+ {
1266
+ "epoch": 1.5113350125944585,
1267
+ "grad_norm": NaN,
1268
+ "learning_rate": 6.231434200982428e-05,
1269
+ "loss": 0.0,
1270
+ "step": 1800
1271
+ },
1272
+ {
1273
+ "epoch": 1.5197313182199832,
1274
+ "grad_norm": NaN,
1275
+ "learning_rate": 6.191329124938285e-05,
1276
+ "loss": 0.0,
1277
+ "step": 1810
1278
+ },
1279
+ {
1280
+ "epoch": 1.528127623845508,
1281
+ "grad_norm": NaN,
1282
+ "learning_rate": 6.15114262313095e-05,
1283
+ "loss": 0.0,
1284
+ "step": 1820
1285
+ },
1286
+ {
1287
+ "epoch": 1.536523929471033,
1288
+ "grad_norm": NaN,
1289
+ "learning_rate": 6.110877442254444e-05,
1290
+ "loss": 0.0,
1291
+ "step": 1830
1292
+ },
1293
+ {
1294
+ "epoch": 1.5449202350965576,
1295
+ "grad_norm": NaN,
1296
+ "learning_rate": 6.0705363343803946e-05,
1297
+ "loss": 0.0,
1298
+ "step": 1840
1299
+ },
1300
+ {
1301
+ "epoch": 1.5533165407220824,
1302
+ "grad_norm": NaN,
1303
+ "learning_rate": 6.030122056769934e-05,
1304
+ "loss": 0.0,
1305
+ "step": 1850
1306
+ },
1307
+ {
1308
+ "epoch": 1.561712846347607,
1309
+ "grad_norm": NaN,
1310
+ "learning_rate": 5.989637371685257e-05,
1311
+ "loss": 0.0,
1312
+ "step": 1860
1313
+ },
1314
+ {
1315
+ "epoch": 1.5701091519731318,
1316
+ "grad_norm": NaN,
1317
+ "learning_rate": 5.949085046200808e-05,
1318
+ "loss": 0.0,
1319
+ "step": 1870
1320
+ },
1321
+ {
1322
+ "epoch": 1.5785054575986566,
1323
+ "grad_norm": NaN,
1324
+ "learning_rate": 5.908467852014169e-05,
1325
+ "loss": 0.0,
1326
+ "step": 1880
1327
+ },
1328
+ {
1329
+ "epoch": 1.5869017632241813,
1330
+ "grad_norm": NaN,
1331
+ "learning_rate": 5.867788565256607e-05,
1332
+ "loss": 0.0,
1333
+ "step": 1890
1334
+ },
1335
+ {
1336
+ "epoch": 1.595298068849706,
1337
+ "grad_norm": NaN,
1338
+ "learning_rate": 5.827049966303335e-05,
1339
+ "loss": 0.0,
1340
+ "step": 1900
1341
+ },
1342
+ {
1343
+ "epoch": 1.6036943744752308,
1344
+ "grad_norm": NaN,
1345
+ "learning_rate": 5.786254839583478e-05,
1346
+ "loss": 0.0,
1347
+ "step": 1910
1348
+ },
1349
+ {
1350
+ "epoch": 1.6120906801007555,
1351
+ "grad_norm": NaN,
1352
+ "learning_rate": 5.745405973389757e-05,
1353
+ "loss": 0.0,
1354
+ "step": 1920
1355
+ },
1356
+ {
1357
+ "epoch": 1.6204869857262805,
1358
+ "grad_norm": NaN,
1359
+ "learning_rate": 5.7045061596879134e-05,
1360
+ "loss": 0.0,
1361
+ "step": 1930
1362
+ },
1363
+ {
1364
+ "epoch": 1.6288832913518052,
1365
+ "grad_norm": NaN,
1366
+ "learning_rate": 5.6635581939258855e-05,
1367
+ "loss": 0.0,
1368
+ "step": 1940
1369
+ },
1370
+ {
1371
+ "epoch": 1.63727959697733,
1372
+ "grad_norm": NaN,
1373
+ "learning_rate": 5.622564874842742e-05,
1374
+ "loss": 0.0,
1375
+ "step": 1950
1376
+ },
1377
+ {
1378
+ "epoch": 1.645675902602855,
1379
+ "grad_norm": NaN,
1380
+ "learning_rate": 5.5815290042773836e-05,
1381
+ "loss": 0.0,
1382
+ "step": 1960
1383
+ },
1384
+ {
1385
+ "epoch": 1.6540722082283796,
1386
+ "grad_norm": NaN,
1387
+ "learning_rate": 5.540453386977058e-05,
1388
+ "loss": 0.0,
1389
+ "step": 1970
1390
+ },
1391
+ {
1392
+ "epoch": 1.6624685138539044,
1393
+ "grad_norm": NaN,
1394
+ "learning_rate": 5.4993408304056425e-05,
1395
+ "loss": 0.0,
1396
+ "step": 1980
1397
+ },
1398
+ {
1399
+ "epoch": 1.670864819479429,
1400
+ "grad_norm": NaN,
1401
+ "learning_rate": 5.458194144551768e-05,
1402
+ "loss": 0.0,
1403
+ "step": 1990
1404
+ },
1405
+ {
1406
+ "epoch": 1.6792611251049538,
1407
+ "grad_norm": NaN,
1408
+ "learning_rate": 5.417016141736756e-05,
1409
+ "loss": 0.0,
1410
+ "step": 2000
1411
+ },
1412
+ {
1413
+ "epoch": 1.6876574307304786,
1414
+ "grad_norm": NaN,
1415
+ "learning_rate": 5.375809636422399e-05,
1416
+ "loss": 0.0,
1417
+ "step": 2010
1418
+ },
1419
+ {
1420
+ "epoch": 1.6960537363560033,
1421
+ "grad_norm": NaN,
1422
+ "learning_rate": 5.334577445018599e-05,
1423
+ "loss": 0.0,
1424
+ "step": 2020
1425
+ },
1426
+ {
1427
+ "epoch": 1.704450041981528,
1428
+ "grad_norm": NaN,
1429
+ "learning_rate": 5.293322385690867e-05,
1430
+ "loss": 0.0,
1431
+ "step": 2030
1432
+ },
1433
+ {
1434
+ "epoch": 1.7128463476070528,
1435
+ "grad_norm": NaN,
1436
+ "learning_rate": 5.252047278167709e-05,
1437
+ "loss": 0.0,
1438
+ "step": 2040
1439
+ },
1440
+ {
1441
+ "epoch": 1.7212426532325775,
1442
+ "grad_norm": NaN,
1443
+ "learning_rate": 5.210754943547893e-05,
1444
+ "loss": 0.0,
1445
+ "step": 2050
1446
+ },
1447
+ {
1448
+ "epoch": 1.7296389588581025,
1449
+ "grad_norm": NaN,
1450
+ "learning_rate": 5.169448204107643e-05,
1451
+ "loss": 0.0,
1452
+ "step": 2060
1453
+ },
1454
+ {
1455
+ "epoch": 1.7380352644836272,
1456
+ "grad_norm": NaN,
1457
+ "learning_rate": 5.128129883107729e-05,
1458
+ "loss": 0.0,
1459
+ "step": 2070
1460
+ },
1461
+ {
1462
+ "epoch": 1.746431570109152,
1463
+ "grad_norm": NaN,
1464
+ "learning_rate": 5.086802804600505e-05,
1465
+ "loss": 0.0,
1466
+ "step": 2080
1467
+ },
1468
+ {
1469
+ "epoch": 1.7548278757346767,
1470
+ "grad_norm": NaN,
1471
+ "learning_rate": 5.045469793236892e-05,
1472
+ "loss": 0.0,
1473
+ "step": 2090
1474
+ },
1475
+ {
1476
+ "epoch": 1.7632241813602016,
1477
+ "grad_norm": NaN,
1478
+ "learning_rate": 5.00413367407331e-05,
1479
+ "loss": 0.0,
1480
+ "step": 2100
1481
+ },
1482
+ {
1483
+ "epoch": 1.7716204869857264,
1484
+ "grad_norm": NaN,
1485
+ "learning_rate": 4.9627972723785964e-05,
1486
+ "loss": 0.0,
1487
+ "step": 2110
1488
+ },
1489
+ {
1490
+ "epoch": 1.7800167926112511,
1491
+ "grad_norm": NaN,
1492
+ "learning_rate": 4.921463413440898e-05,
1493
+ "loss": 0.0,
1494
+ "step": 2120
1495
+ },
1496
+ {
1497
+ "epoch": 1.7884130982367759,
1498
+ "grad_norm": NaN,
1499
+ "learning_rate": 4.8801349223745654e-05,
1500
+ "loss": 0.0,
1501
+ "step": 2130
1502
+ },
1503
+ {
1504
+ "epoch": 1.7968094038623006,
1505
+ "grad_norm": NaN,
1506
+ "learning_rate": 4.838814623927067e-05,
1507
+ "loss": 0.0,
1508
+ "step": 2140
1509
+ },
1510
+ {
1511
+ "epoch": 1.8052057094878253,
1512
+ "grad_norm": NaN,
1513
+ "learning_rate": 4.797505342285912e-05,
1514
+ "loss": 0.0,
1515
+ "step": 2150
1516
+ },
1517
+ {
1518
+ "epoch": 1.81360201511335,
1519
+ "grad_norm": NaN,
1520
+ "learning_rate": 4.756209900885628e-05,
1521
+ "loss": 0.0,
1522
+ "step": 2160
1523
+ },
1524
+ {
1525
+ "epoch": 1.8219983207388748,
1526
+ "grad_norm": NaN,
1527
+ "learning_rate": 4.714931122214781e-05,
1528
+ "loss": 0.0,
1529
+ "step": 2170
1530
+ },
1531
+ {
1532
+ "epoch": 1.8303946263643995,
1533
+ "grad_norm": NaN,
1534
+ "learning_rate": 4.673671827623058e-05,
1535
+ "loss": 0.0,
1536
+ "step": 2180
1537
+ },
1538
+ {
1539
+ "epoch": 1.8387909319899243,
1540
+ "grad_norm": NaN,
1541
+ "learning_rate": 4.632434837128443e-05,
1542
+ "loss": 0.0,
1543
+ "step": 2190
1544
+ },
1545
+ {
1546
+ "epoch": 1.8471872376154492,
1547
+ "grad_norm": NaN,
1548
+ "learning_rate": 4.591222969224453e-05,
1549
+ "loss": 0.0,
1550
+ "step": 2200
1551
+ },
1552
+ {
1553
+ "epoch": 1.855583543240974,
1554
+ "grad_norm": NaN,
1555
+ "learning_rate": 4.550039040687518e-05,
1556
+ "loss": 0.0,
1557
+ "step": 2210
1558
+ },
1559
+ {
1560
+ "epoch": 1.8639798488664987,
1561
+ "grad_norm": NaN,
1562
+ "learning_rate": 4.508885866384446e-05,
1563
+ "loss": 0.0,
1564
+ "step": 2220
1565
+ },
1566
+ {
1567
+ "epoch": 1.8723761544920237,
1568
+ "grad_norm": NaN,
1569
+ "learning_rate": 4.4677662590800355e-05,
1570
+ "loss": 0.0,
1571
+ "step": 2230
1572
+ },
1573
+ {
1574
+ "epoch": 1.8807724601175484,
1575
+ "grad_norm": NaN,
1576
+ "learning_rate": 4.426683029244825e-05,
1577
+ "loss": 0.0,
1578
+ "step": 2240
1579
+ },
1580
+ {
1581
+ "epoch": 1.8891687657430731,
1582
+ "grad_norm": NaN,
1583
+ "learning_rate": 4.385638984863e-05,
1584
+ "loss": 0.0,
1585
+ "step": 2250
1586
+ },
1587
+ {
1588
+ "epoch": 1.8975650713685979,
1589
+ "grad_norm": NaN,
1590
+ "learning_rate": 4.3446369312404745e-05,
1591
+ "loss": 0.0,
1592
+ "step": 2260
1593
+ },
1594
+ {
1595
+ "epoch": 1.9059613769941226,
1596
+ "grad_norm": NaN,
1597
+ "learning_rate": 4.3036796708131474e-05,
1598
+ "loss": 0.0,
1599
+ "step": 2270
1600
+ },
1601
+ {
1602
+ "epoch": 1.9143576826196473,
1603
+ "grad_norm": NaN,
1604
+ "learning_rate": 4.262770002955363e-05,
1605
+ "loss": 0.0,
1606
+ "step": 2280
1607
+ },
1608
+ {
1609
+ "epoch": 1.922753988245172,
1610
+ "grad_norm": NaN,
1611
+ "learning_rate": 4.221910723788578e-05,
1612
+ "loss": 0.0,
1613
+ "step": 2290
1614
+ },
1615
+ {
1616
+ "epoch": 1.9311502938706968,
1617
+ "grad_norm": NaN,
1618
+ "learning_rate": 4.1811046259902474e-05,
1619
+ "loss": 0.0,
1620
+ "step": 2300
1621
+ },
1622
+ {
1623
+ "epoch": 1.9395465994962215,
1624
+ "grad_norm": NaN,
1625
+ "learning_rate": 4.140354498602952e-05,
1626
+ "loss": 0.0,
1627
+ "step": 2310
1628
+ },
1629
+ {
1630
+ "epoch": 1.9479429051217463,
1631
+ "grad_norm": NaN,
1632
+ "learning_rate": 4.099663126843769e-05,
1633
+ "loss": 0.0,
1634
+ "step": 2320
1635
+ },
1636
+ {
1637
+ "epoch": 1.9563392107472712,
1638
+ "grad_norm": NaN,
1639
+ "learning_rate": 4.059033291913902e-05,
1640
+ "loss": 0.0,
1641
+ "step": 2330
1642
+ },
1643
+ {
1644
+ "epoch": 1.964735516372796,
1645
+ "grad_norm": NaN,
1646
+ "learning_rate": 4.0184677708086014e-05,
1647
+ "loss": 0.0,
1648
+ "step": 2340
1649
+ },
1650
+ {
1651
+ "epoch": 1.9731318219983207,
1652
+ "grad_norm": NaN,
1653
+ "learning_rate": 3.977969336127348e-05,
1654
+ "loss": 0.0,
1655
+ "step": 2350
1656
+ },
1657
+ {
1658
+ "epoch": 1.9815281276238457,
1659
+ "grad_norm": NaN,
1660
+ "learning_rate": 3.937540755884357e-05,
1661
+ "loss": 0.0,
1662
+ "step": 2360
1663
+ },
1664
+ {
1665
+ "epoch": 1.9899244332493704,
1666
+ "grad_norm": NaN,
1667
+ "learning_rate": 3.897184793319384e-05,
1668
+ "loss": 0.0,
1669
+ "step": 2370
1670
+ },
1671
+ {
1672
+ "epoch": 1.9983207388748951,
1673
+ "grad_norm": NaN,
1674
+ "learning_rate": 3.856904206708863e-05,
1675
+ "loss": 0.0,
1676
+ "step": 2380
1677
+ },
1678
+ {
1679
+ "epoch": 2.00671704450042,
1680
+ "grad_norm": NaN,
1681
+ "learning_rate": 3.8167017491773847e-05,
1682
+ "loss": 0.0,
1683
+ "step": 2390
1684
+ },
1685
+ {
1686
+ "epoch": 2.0151133501259446,
1687
+ "grad_norm": NaN,
1688
+ "learning_rate": 3.776580168509516e-05,
1689
+ "loss": 0.0,
1690
+ "step": 2400
1691
+ },
1692
+ {
1693
+ "epoch": 2.0235096557514693,
1694
+ "grad_norm": NaN,
1695
+ "learning_rate": 3.736542206962e-05,
1696
+ "loss": 0.0,
1697
+ "step": 2410
1698
+ },
1699
+ {
1700
+ "epoch": 2.031905961376994,
1701
+ "grad_norm": NaN,
1702
+ "learning_rate": 3.696590601076326e-05,
1703
+ "loss": 0.0,
1704
+ "step": 2420
1705
+ },
1706
+ {
1707
+ "epoch": 2.040302267002519,
1708
+ "grad_norm": NaN,
1709
+ "learning_rate": 3.656728081491686e-05,
1710
+ "loss": 0.0,
1711
+ "step": 2430
1712
+ },
1713
+ {
1714
+ "epoch": 2.0486985726280436,
1715
+ "grad_norm": NaN,
1716
+ "learning_rate": 3.6169573727583405e-05,
1717
+ "loss": 0.0,
1718
+ "step": 2440
1719
+ },
1720
+ {
1721
+ "epoch": 2.0570948782535683,
1722
+ "grad_norm": NaN,
1723
+ "learning_rate": 3.5772811931514036e-05,
1724
+ "loss": 0.0,
1725
+ "step": 2450
1726
+ },
1727
+ {
1728
+ "epoch": 2.065491183879093,
1729
+ "grad_norm": NaN,
1730
+ "learning_rate": 3.5377022544850505e-05,
1731
+ "loss": 0.0,
1732
+ "step": 2460
1733
+ },
1734
+ {
1735
+ "epoch": 2.0738874895046178,
1736
+ "grad_norm": NaN,
1737
+ "learning_rate": 3.498223261927158e-05,
1738
+ "loss": 0.0,
1739
+ "step": 2470
1740
+ },
1741
+ {
1742
+ "epoch": 2.082283795130143,
1743
+ "grad_norm": NaN,
1744
+ "learning_rate": 3.4588469138144295e-05,
1745
+ "loss": 0.0,
1746
+ "step": 2480
1747
+ },
1748
+ {
1749
+ "epoch": 2.0906801007556677,
1750
+ "grad_norm": NaN,
1751
+ "learning_rate": 3.419575901467952e-05,
1752
+ "loss": 0.0,
1753
+ "step": 2490
1754
+ },
1755
+ {
1756
+ "epoch": 2.0990764063811924,
1757
+ "grad_norm": NaN,
1758
+ "learning_rate": 3.380412909009254e-05,
1759
+ "loss": 0.0,
1760
+ "step": 2500
1761
+ },
1762
+ {
1763
+ "epoch": 2.107472712006717,
1764
+ "grad_norm": NaN,
1765
+ "learning_rate": 3.3413606131768475e-05,
1766
+ "loss": 0.0,
1767
+ "step": 2510
1768
+ },
1769
+ {
1770
+ "epoch": 2.115869017632242,
1771
+ "grad_norm": NaN,
1772
+ "learning_rate": 3.302421683143279e-05,
1773
+ "loss": 0.0,
1774
+ "step": 2520
1775
+ },
1776
+ {
1777
+ "epoch": 2.1242653232577666,
1778
+ "grad_norm": NaN,
1779
+ "learning_rate": 3.2635987803326896e-05,
1780
+ "loss": 0.0,
1781
+ "step": 2530
1782
+ },
1783
+ {
1784
+ "epoch": 2.1326616288832914,
1785
+ "grad_norm": NaN,
1786
+ "learning_rate": 3.224894558238918e-05,
1787
+ "loss": 0.0,
1788
+ "step": 2540
1789
+ },
1790
+ {
1791
+ "epoch": 2.141057934508816,
1792
+ "grad_norm": NaN,
1793
+ "learning_rate": 3.18631166224413e-05,
1794
+ "loss": 0.0,
1795
+ "step": 2550
1796
+ },
1797
+ {
1798
+ "epoch": 2.149454240134341,
1799
+ "grad_norm": NaN,
1800
+ "learning_rate": 3.147852729438017e-05,
1801
+ "loss": 0.0,
1802
+ "step": 2560
1803
+ },
1804
+ {
1805
+ "epoch": 2.1578505457598656,
1806
+ "grad_norm": NaN,
1807
+ "learning_rate": 3.109520388437548e-05,
1808
+ "loss": 0.0,
1809
+ "step": 2570
1810
+ },
1811
+ {
1812
+ "epoch": 2.1662468513853903,
1813
+ "grad_norm": NaN,
1814
+ "learning_rate": 3.0713172592073116e-05,
1815
+ "loss": 0.0,
1816
+ "step": 2580
1817
+ },
1818
+ {
1819
+ "epoch": 2.174643157010915,
1820
+ "grad_norm": NaN,
1821
+ "learning_rate": 3.0332459528804457e-05,
1822
+ "loss": 0.0,
1823
+ "step": 2590
1824
+ },
1825
+ {
1826
+ "epoch": 2.1830394626364398,
1827
+ "grad_norm": NaN,
1828
+ "learning_rate": 2.9953090715801634e-05,
1829
+ "loss": 0.0,
1830
+ "step": 2600
1831
+ },
1832
+ {
1833
+ "epoch": 2.1914357682619645,
1834
+ "grad_norm": NaN,
1835
+ "learning_rate": 2.9575092082419086e-05,
1836
+ "loss": 0.0,
1837
+ "step": 2610
1838
+ },
1839
+ {
1840
+ "epoch": 2.1998320738874897,
1841
+ "grad_norm": NaN,
1842
+ "learning_rate": 2.9198489464361288e-05,
1843
+ "loss": 0.0,
1844
+ "step": 2620
1845
+ },
1846
+ {
1847
+ "epoch": 2.2082283795130144,
1848
+ "grad_norm": NaN,
1849
+ "learning_rate": 2.8823308601916948e-05,
1850
+ "loss": 0.0,
1851
+ "step": 2630
1852
+ },
1853
+ {
1854
+ "epoch": 2.216624685138539,
1855
+ "grad_norm": NaN,
1856
+ "learning_rate": 2.8449575138199613e-05,
1857
+ "loss": 0.0,
1858
+ "step": 2640
1859
+ },
1860
+ {
1861
+ "epoch": 2.225020990764064,
1862
+ "grad_norm": NaN,
1863
+ "learning_rate": 2.807731461739509e-05,
1864
+ "loss": 0.0,
1865
+ "step": 2650
1866
+ },
1867
+ {
1868
+ "epoch": 2.2334172963895886,
1869
+ "grad_norm": NaN,
1870
+ "learning_rate": 2.7706552483015485e-05,
1871
+ "loss": 0.0,
1872
+ "step": 2660
1873
+ },
1874
+ {
1875
+ "epoch": 2.2418136020151134,
1876
+ "grad_norm": NaN,
1877
+ "learning_rate": 2.733731407616018e-05,
1878
+ "loss": 0.0,
1879
+ "step": 2670
1880
+ },
1881
+ {
1882
+ "epoch": 2.250209907640638,
1883
+ "grad_norm": NaN,
1884
+ "learning_rate": 2.6969624633783806e-05,
1885
+ "loss": 0.0,
1886
+ "step": 2680
1887
+ },
1888
+ {
1889
+ "epoch": 2.258606213266163,
1890
+ "grad_norm": NaN,
1891
+ "learning_rate": 2.660350928697134e-05,
1892
+ "loss": 0.0,
1893
+ "step": 2690
1894
+ },
1895
+ {
1896
+ "epoch": 2.2670025188916876,
1897
+ "grad_norm": NaN,
1898
+ "learning_rate": 2.6238993059220395e-05,
1899
+ "loss": 0.0,
1900
+ "step": 2700
1901
+ },
1902
+ {
1903
+ "epoch": 2.2753988245172123,
1904
+ "grad_norm": NaN,
1905
+ "learning_rate": 2.5876100864730933e-05,
1906
+ "loss": 0.0,
1907
+ "step": 2710
1908
+ },
1909
+ {
1910
+ "epoch": 2.283795130142737,
1911
+ "grad_norm": NaN,
1912
+ "learning_rate": 2.5514857506702405e-05,
1913
+ "loss": 0.0,
1914
+ "step": 2720
1915
+ },
1916
+ {
1917
+ "epoch": 2.292191435768262,
1918
+ "grad_norm": NaN,
1919
+ "learning_rate": 2.5155287675638474e-05,
1920
+ "loss": 0.0,
1921
+ "step": 2730
1922
+ },
1923
+ {
1924
+ "epoch": 2.3005877413937865,
1925
+ "grad_norm": NaN,
1926
+ "learning_rate": 2.4797415947659457e-05,
1927
+ "loss": 0.0,
1928
+ "step": 2740
1929
+ },
1930
+ {
1931
+ "epoch": 2.3089840470193117,
1932
+ "grad_norm": NaN,
1933
+ "learning_rate": 2.4441266782822588e-05,
1934
+ "loss": 0.0,
1935
+ "step": 2750
1936
+ },
1937
+ {
1938
+ "epoch": 2.3173803526448364,
1939
+ "grad_norm": NaN,
1940
+ "learning_rate": 2.4086864523450183e-05,
1941
+ "loss": 0.0,
1942
+ "step": 2760
1943
+ },
1944
+ {
1945
+ "epoch": 2.325776658270361,
1946
+ "grad_norm": NaN,
1947
+ "learning_rate": 2.3734233392465903e-05,
1948
+ "loss": 0.0,
1949
+ "step": 2770
1950
+ },
1951
+ {
1952
+ "epoch": 2.334172963895886,
1953
+ "grad_norm": NaN,
1954
+ "learning_rate": 2.3383397491739145e-05,
1955
+ "loss": 0.0,
1956
+ "step": 2780
1957
+ },
1958
+ {
1959
+ "epoch": 2.3425692695214106,
1960
+ "grad_norm": NaN,
1961
+ "learning_rate": 2.3034380800437678e-05,
1962
+ "loss": 0.0,
1963
+ "step": 2790
1964
+ },
1965
+ {
1966
+ "epoch": 2.3509655751469354,
1967
+ "grad_norm": NaN,
1968
+ "learning_rate": 2.2687207173388743e-05,
1969
+ "loss": 0.0,
1970
+ "step": 2800
1971
+ },
1972
+ {
1973
+ "epoch": 2.35936188077246,
1974
+ "grad_norm": NaN,
1975
+ "learning_rate": 2.234190033944858e-05,
1976
+ "loss": 0.0,
1977
+ "step": 2810
1978
+ },
1979
+ {
1980
+ "epoch": 2.367758186397985,
1981
+ "grad_norm": NaN,
1982
+ "learning_rate": 2.1998483899880596e-05,
1983
+ "loss": 0.0,
1984
+ "step": 2820
1985
+ },
1986
+ {
1987
+ "epoch": 2.3761544920235096,
1988
+ "grad_norm": NaN,
1989
+ "learning_rate": 2.1656981326742266e-05,
1990
+ "loss": 0.0,
1991
+ "step": 2830
1992
+ },
1993
+ {
1994
+ "epoch": 2.3845507976490343,
1995
+ "grad_norm": NaN,
1996
+ "learning_rate": 2.1317415961280824e-05,
1997
+ "loss": 0.0,
1998
+ "step": 2840
1999
+ },
2000
+ {
2001
+ "epoch": 2.392947103274559,
2002
+ "grad_norm": NaN,
2003
+ "learning_rate": 2.097981101233794e-05,
2004
+ "loss": 0.0,
2005
+ "step": 2850
2006
+ },
2007
+ {
2008
+ "epoch": 2.401343408900084,
2009
+ "grad_norm": NaN,
2010
+ "learning_rate": 2.0644189554763417e-05,
2011
+ "loss": 0.0,
2012
+ "step": 2860
2013
+ },
2014
+ {
2015
+ "epoch": 2.4097397145256085,
2016
+ "grad_norm": NaN,
2017
+ "learning_rate": 2.0310574527838072e-05,
2018
+ "loss": 0.0,
2019
+ "step": 2870
2020
+ },
2021
+ {
2022
+ "epoch": 2.4181360201511337,
2023
+ "grad_norm": NaN,
2024
+ "learning_rate": 1.9978988733705807e-05,
2025
+ "loss": 0.0,
2026
+ "step": 2880
2027
+ },
2028
+ {
2029
+ "epoch": 2.4265323257766584,
2030
+ "grad_norm": NaN,
2031
+ "learning_rate": 1.9649454835815202e-05,
2032
+ "loss": 0.0,
2033
+ "step": 2890
2034
+ },
2035
+ {
2036
+ "epoch": 2.434928631402183,
2037
+ "grad_norm": NaN,
2038
+ "learning_rate": 1.932199535737045e-05,
2039
+ "loss": 0.0,
2040
+ "step": 2900
2041
+ },
2042
+ {
2043
+ "epoch": 2.443324937027708,
2044
+ "grad_norm": NaN,
2045
+ "learning_rate": 1.8996632679791914e-05,
2046
+ "loss": 0.0,
2047
+ "step": 2910
2048
+ },
2049
+ {
2050
+ "epoch": 2.4517212426532327,
2051
+ "grad_norm": NaN,
2052
+ "learning_rate": 1.8673389041186418e-05,
2053
+ "loss": 0.0,
2054
+ "step": 2920
2055
+ },
2056
+ {
2057
+ "epoch": 2.4601175482787574,
2058
+ "grad_norm": NaN,
2059
+ "learning_rate": 1.8352286534827274e-05,
2060
+ "loss": 0.0,
2061
+ "step": 2930
2062
+ },
2063
+ {
2064
+ "epoch": 2.468513853904282,
2065
+ "grad_norm": NaN,
2066
+ "learning_rate": 1.803334710764426e-05,
2067
+ "loss": 0.0,
2068
+ "step": 2940
2069
+ },
2070
+ {
2071
+ "epoch": 2.476910159529807,
2072
+ "grad_norm": NaN,
2073
+ "learning_rate": 1.7716592558723556e-05,
2074
+ "loss": 0.0,
2075
+ "step": 2950
2076
+ },
2077
+ {
2078
+ "epoch": 2.4853064651553316,
2079
+ "grad_norm": NaN,
2080
+ "learning_rate": 1.7402044537817824e-05,
2081
+ "loss": 0.0,
2082
+ "step": 2960
2083
+ },
2084
+ {
2085
+ "epoch": 2.4937027707808563,
2086
+ "grad_norm": NaN,
2087
+ "learning_rate": 1.7089724543866465e-05,
2088
+ "loss": 0.0,
2089
+ "step": 2970
2090
+ },
2091
+ {
2092
+ "epoch": 2.502099076406381,
2093
+ "grad_norm": NaN,
2094
+ "learning_rate": 1.6779653923526188e-05,
2095
+ "loss": 0.0,
2096
+ "step": 2980
2097
+ },
2098
+ {
2099
+ "epoch": 2.510495382031906,
2100
+ "grad_norm": NaN,
2101
+ "learning_rate": 1.6471853869712023e-05,
2102
+ "loss": 0.0,
2103
+ "step": 2990
2104
+ },
2105
+ {
2106
+ "epoch": 2.5188916876574305,
2107
+ "grad_norm": NaN,
2108
+ "learning_rate": 1.6166345420148787e-05,
2109
+ "loss": 0.0,
2110
+ "step": 3000
2111
  }
2112
  ],
2113
  "logging_steps": 10,
2114
+ "max_steps": 4000,
2115
  "num_input_tokens_seen": 0,
2116
+ "num_train_epochs": 4,
2117
  "save_steps": 1000,
2118
  "stateful_callbacks": {
2119
  "TrainerControl": {