jnmrr committed on
Commit
3a530fe
·
verified ·
1 Parent(s): 978a129

Upload RT-DETRv2 voucher classifier

Browse files
Files changed (41) hide show
  1. README.md +14 -14
  2. checkpoint-1600/config.json +129 -0
  3. checkpoint-1600/model.safetensors +3 -0
  4. checkpoint-1600/optimizer.pt +3 -0
  5. checkpoint-1600/preprocessor_config.json +26 -0
  6. checkpoint-1600/rng_state.pth +3 -0
  7. checkpoint-1600/scheduler.pt +3 -0
  8. checkpoint-1600/trainer_state.json +258 -0
  9. checkpoint-1600/training_args.bin +3 -0
  10. checkpoint-1800/config.json +129 -0
  11. checkpoint-1800/model.safetensors +3 -0
  12. checkpoint-1800/optimizer.pt +3 -0
  13. checkpoint-1800/preprocessor_config.json +26 -0
  14. checkpoint-1800/rng_state.pth +3 -0
  15. checkpoint-1800/scheduler.pt +3 -0
  16. checkpoint-1800/trainer_state.json +286 -0
  17. checkpoint-1800/training_args.bin +3 -0
  18. checkpoint-2000/model.safetensors +1 -1
  19. checkpoint-2000/optimizer.pt +1 -1
  20. checkpoint-2000/rng_state.pth +1 -1
  21. checkpoint-2000/trainer_state.json +179 -2699
  22. checkpoint-2000/training_args.bin +1 -1
  23. checkpoint-2200/config.json +129 -0
  24. checkpoint-2200/model.safetensors +3 -0
  25. checkpoint-2200/optimizer.pt +3 -0
  26. checkpoint-2200/preprocessor_config.json +26 -0
  27. checkpoint-2200/rng_state.pth +3 -0
  28. checkpoint-2200/scheduler.pt +3 -0
  29. checkpoint-2200/trainer_state.json +342 -0
  30. checkpoint-2200/training_args.bin +3 -0
  31. checkpoint-2250/config.json +129 -0
  32. checkpoint-2250/model.safetensors +3 -0
  33. checkpoint-2250/optimizer.pt +3 -0
  34. checkpoint-2250/preprocessor_config.json +26 -0
  35. checkpoint-2250/rng_state.pth +3 -0
  36. checkpoint-2250/scheduler.pt +3 -0
  37. checkpoint-2250/trainer_state.json +349 -0
  38. checkpoint-2250/training_args.bin +3 -0
  39. model.safetensors +1 -1
  40. runs/Aug14_17-42-57_2676026c4495/events.out.tfevents.1755193378.2676026c4495.6591.0 +3 -0
  41. training_args.bin +1 -1
README.md CHANGED
@@ -36,11 +36,11 @@ This model is a fine-tuned version of [PekingU/rtdetr_v2_r101vd](https://hugging
36
  ### Training Details
37
 
38
  **Training Dataset:**
39
- - **Total Samples**: 2186
40
  - **Class Distribution**:
41
- - **fisico** (id: 1): 1023 samples (46.8%)
42
- - **digital** (id: 0): 626 samples (28.6%)
43
- - **tesoreria** (id: 2): 537 samples (24.6%)
44
 
45
 
46
  **Training Configuration:**
@@ -91,15 +91,15 @@ This model is a fine-tuned version of [PekingU/rtdetr_v2_r101vd](https://hugging
91
  - **Tesoreria receipts mean confidence**: 0.0000 (low)
92
 
93
  **Performance by Object Size:**
94
- - **Small objects**: 0.0000
95
- - **Medium objects**: -1.0000
96
  - **Large objects**: 0.0000
97
 
98
  **Evaluation Dataset:**
99
- - **Digital invoices**: 157 samples (28.5%)
100
- - **Fisico receipts**: 261 samples (47.4%)
101
- - **Tesoreria receipts**: 133 samples (24.1%)
102
- - **Total evaluation samples**: 551
103
 
104
  **Model Configuration:**
105
  - **Base model**: PekingU/rtdetr_v2_r101vd
@@ -114,16 +114,16 @@ This model is a fine-tuned version of [PekingU/rtdetr_v2_r101vd](https://hugging
114
  - **RAM**: 83.5 GB
115
  - **GPU configuration**: A100 optimized
116
 
117
- **Training Time**: 1.19 hours
118
 
119
  **Training Summary:**
120
- - **Final training loss**: 175.9119
121
- - **Final learning rate**: 3.00e-07
122
 
123
 
124
  ### MLflow Tracking
125
 
126
- - **MLflow Run ID**: 7407ca0e0c584be4988a47523f45fabd
127
  - **MLflow Experiment**: RT-DETRv2_Voucher_Classification
128
 
129
 
 
36
  ### Training Details
37
 
38
  **Training Dataset:**
39
+ - **Total Samples**: 2178
40
  - **Class Distribution**:
41
+ - **digital** (id: 0): 626 samples (28.7%)
42
+ - **tesoreria** (id: 2): 537 samples (24.7%)
43
+ - **fisico** (id: 1): 1015 samples (46.6%)
44
 
45
 
46
  **Training Configuration:**
 
91
  - **Tesoreria receipts mean confidence**: 0.0000 (low)
92
 
93
  **Performance by Object Size:**
94
+ - **Small objects**: -1.0000
95
+ - **Medium objects**: 0.0000
96
  - **Large objects**: 0.0000
97
 
98
  **Evaluation Dataset:**
99
+ - **Digital invoices**: 157 samples (28.8%)
100
+ - **Fisico receipts**: 255 samples (46.8%)
101
+ - **Tesoreria receipts**: 133 samples (24.4%)
102
+ - **Total evaluation samples**: 545
103
 
104
  **Model Configuration:**
105
  - **Base model**: PekingU/rtdetr_v2_r101vd
 
114
  - **RAM**: 83.5 GB
115
  - **GPU configuration**: A100 optimized
116
 
117
+ **Training Time**: 1.18 hours
118
 
119
  **Training Summary:**
120
+ - **Final training loss**: 196.0164
121
+ - **Final learning rate**: 6.00e-08
122
 
123
 
124
  ### MLflow Tracking
125
 
126
+ - **MLflow Run ID**: fe7bd26bd1b344c292d9b485139be46c
127
  - **MLflow Experiment**: RT-DETRv2_Voucher_Classification
128
 
129
 
checkpoint-1600/config.json ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "activation_function": "silu",
4
+ "anchor_image_size": null,
5
+ "architectures": [
6
+ "RTDetrV2ForObjectDetection"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "auxiliary_loss": true,
10
+ "backbone": null,
11
+ "backbone_config": {
12
+ "depths": [
13
+ 3,
14
+ 4,
15
+ 23,
16
+ 3
17
+ ],
18
+ "downsample_in_bottleneck": false,
19
+ "downsample_in_first_stage": false,
20
+ "embedding_size": 64,
21
+ "hidden_act": "relu",
22
+ "hidden_sizes": [
23
+ 256,
24
+ 512,
25
+ 1024,
26
+ 2048
27
+ ],
28
+ "layer_type": "bottleneck",
29
+ "model_type": "rt_detr_resnet",
30
+ "num_channels": 3,
31
+ "out_features": [
32
+ "stage2",
33
+ "stage3",
34
+ "stage4"
35
+ ],
36
+ "out_indices": [
37
+ 2,
38
+ 3,
39
+ 4
40
+ ],
41
+ "stage_names": [
42
+ "stem",
43
+ "stage1",
44
+ "stage2",
45
+ "stage3",
46
+ "stage4"
47
+ ],
48
+ "torch_dtype": "float32"
49
+ },
50
+ "backbone_kwargs": null,
51
+ "batch_norm_eps": 1e-05,
52
+ "box_noise_scale": 1.0,
53
+ "d_model": 256,
54
+ "decoder_activation_function": "relu",
55
+ "decoder_attention_heads": 8,
56
+ "decoder_ffn_dim": 1024,
57
+ "decoder_in_channels": [
58
+ 384,
59
+ 384,
60
+ 384
61
+ ],
62
+ "decoder_layers": 6,
63
+ "decoder_method": "default",
64
+ "decoder_n_levels": 3,
65
+ "decoder_n_points": 4,
66
+ "decoder_offset_scale": 0.5,
67
+ "disable_custom_kernels": true,
68
+ "dropout": 0.0,
69
+ "encode_proj_layers": [
70
+ 2
71
+ ],
72
+ "encoder_activation_function": "gelu",
73
+ "encoder_attention_heads": 8,
74
+ "encoder_ffn_dim": 2048,
75
+ "encoder_hidden_dim": 384,
76
+ "encoder_in_channels": [
77
+ 512,
78
+ 1024,
79
+ 2048
80
+ ],
81
+ "encoder_layers": 1,
82
+ "eos_coefficient": 0.0001,
83
+ "eval_size": null,
84
+ "feat_strides": [
85
+ 8,
86
+ 16,
87
+ 32
88
+ ],
89
+ "focal_loss_alpha": 0.75,
90
+ "focal_loss_gamma": 2.0,
91
+ "freeze_backbone_batch_norms": true,
92
+ "hidden_expansion": 1.0,
93
+ "id2label": {
94
+ "0": "LABEL_0",
95
+ "1": "LABEL_1",
96
+ "2": "LABEL_2"
97
+ },
98
+ "initializer_bias_prior_prob": null,
99
+ "initializer_range": 0.01,
100
+ "is_encoder_decoder": true,
101
+ "label2id": {
102
+ "LABEL_0": 0,
103
+ "LABEL_1": 1,
104
+ "LABEL_2": 2
105
+ },
106
+ "label_noise_ratio": 0.5,
107
+ "layer_norm_eps": 1e-05,
108
+ "learn_initial_query": false,
109
+ "matcher_alpha": 0.25,
110
+ "matcher_bbox_cost": 5.0,
111
+ "matcher_class_cost": 2.0,
112
+ "matcher_gamma": 2.0,
113
+ "matcher_giou_cost": 2.0,
114
+ "model_type": "rt_detr_v2",
115
+ "normalize_before": false,
116
+ "num_denoising": 100,
117
+ "num_feature_levels": 3,
118
+ "num_queries": 300,
119
+ "positional_encoding_temperature": 10000,
120
+ "torch_dtype": "float32",
121
+ "transformers_version": "4.55.0",
122
+ "use_focal_loss": true,
123
+ "use_pretrained_backbone": false,
124
+ "use_timm_backbone": false,
125
+ "weight_loss_bbox": 5.0,
126
+ "weight_loss_giou": 2.0,
127
+ "weight_loss_vfl": 1.0,
128
+ "with_box_refine": true
129
+ }
checkpoint-1600/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:226b3771abee1cf8215eee8ff3a6c866930ba69be4144e554d6ad8d45317d5a4
3
+ size 306699044
checkpoint-1600/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e429f0d5be726eb26d197c034148dc43ca3de0a5498a36ae73ffb251656d55e0
3
+ size 611580433
checkpoint-1600/preprocessor_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_annotations": true,
3
+ "do_normalize": false,
4
+ "do_pad": false,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "format": "coco_detection",
8
+ "image_mean": [
9
+ 0.485,
10
+ 0.456,
11
+ 0.406
12
+ ],
13
+ "image_processor_type": "RTDetrImageProcessor",
14
+ "image_std": [
15
+ 0.229,
16
+ 0.224,
17
+ 0.225
18
+ ],
19
+ "pad_size": null,
20
+ "resample": 2,
21
+ "rescale_factor": 0.00392156862745098,
22
+ "size": {
23
+ "height": 640,
24
+ "width": 640
25
+ }
26
+ }
checkpoint-1600/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ec603acfcf0594fe5e0f4a5b622df7eaf620ca0be75f37087ee07cf6a8bc746
3
+ size 14244
checkpoint-1600/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:294a2ee5d41cd38781c829986780523b3ca2c9ddcff37ad7f87c59ebe379a29a
3
+ size 1064
checkpoint-1600/trainer_state.json ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 35.55555555555556,
6
+ "eval_steps": 500,
7
+ "global_step": 1600,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 1.1111111111111112,
14
+ "grad_norm": 3663.33642578125,
15
+ "learning_rate": 3.6750000000000003e-07,
16
+ "loss": 1356.1239,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 2.2222222222222223,
21
+ "grad_norm": 3973.032470703125,
22
+ "learning_rate": 7.425000000000001e-07,
23
+ "loss": 1275.5178,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 3.3333333333333335,
28
+ "grad_norm": 2667.0068359375,
29
+ "learning_rate": 1.1174999999999999e-06,
30
+ "loss": 1123.6059,
31
+ "step": 150
32
+ },
33
+ {
34
+ "epoch": 4.444444444444445,
35
+ "grad_norm": 2320.48486328125,
36
+ "learning_rate": 1.4925000000000001e-06,
37
+ "loss": 922.8917,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 5.555555555555555,
42
+ "grad_norm": 1611.3345947265625,
43
+ "learning_rate": 1.8675000000000001e-06,
44
+ "loss": 714.5638,
45
+ "step": 250
46
+ },
47
+ {
48
+ "epoch": 6.666666666666667,
49
+ "grad_norm": 1395.7064208984375,
50
+ "learning_rate": 2.2425e-06,
51
+ "loss": 542.1251,
52
+ "step": 300
53
+ },
54
+ {
55
+ "epoch": 7.777777777777778,
56
+ "grad_norm": 1019.4860229492188,
57
+ "learning_rate": 2.6175e-06,
58
+ "loss": 411.1387,
59
+ "step": 350
60
+ },
61
+ {
62
+ "epoch": 8.88888888888889,
63
+ "grad_norm": 888.315185546875,
64
+ "learning_rate": 2.9925e-06,
65
+ "loss": 318.4567,
66
+ "step": 400
67
+ },
68
+ {
69
+ "epoch": 10.0,
70
+ "grad_norm": 2590.113525390625,
71
+ "learning_rate": 3.3675000000000004e-06,
72
+ "loss": 261.6995,
73
+ "step": 450
74
+ },
75
+ {
76
+ "epoch": 11.11111111111111,
77
+ "grad_norm": 711.677734375,
78
+ "learning_rate": 3.7425e-06,
79
+ "loss": 220.2936,
80
+ "step": 500
81
+ },
82
+ {
83
+ "epoch": 12.222222222222221,
84
+ "grad_norm": 548.9371948242188,
85
+ "learning_rate": 4.117500000000001e-06,
86
+ "loss": 187.9833,
87
+ "step": 550
88
+ },
89
+ {
90
+ "epoch": 13.333333333333334,
91
+ "grad_norm": 1127.1611328125,
92
+ "learning_rate": 4.4925e-06,
93
+ "loss": 159.1351,
94
+ "step": 600
95
+ },
96
+ {
97
+ "epoch": 14.444444444444445,
98
+ "grad_norm": 426.074951171875,
99
+ "learning_rate": 4.8675e-06,
100
+ "loss": 137.1092,
101
+ "step": 650
102
+ },
103
+ {
104
+ "epoch": 15.555555555555555,
105
+ "grad_norm": 348.842529296875,
106
+ "learning_rate": 5.2425e-06,
107
+ "loss": 119.822,
108
+ "step": 700
109
+ },
110
+ {
111
+ "epoch": 16.666666666666668,
112
+ "grad_norm": 373.2169189453125,
113
+ "learning_rate": 5.6175e-06,
114
+ "loss": 104.3366,
115
+ "step": 750
116
+ },
117
+ {
118
+ "epoch": 17.77777777777778,
119
+ "grad_norm": 324.51702880859375,
120
+ "learning_rate": 5.992500000000001e-06,
121
+ "loss": 90.8788,
122
+ "step": 800
123
+ },
124
+ {
125
+ "epoch": 18.88888888888889,
126
+ "grad_norm": 269.91827392578125,
127
+ "learning_rate": 6.3675e-06,
128
+ "loss": 78.4644,
129
+ "step": 850
130
+ },
131
+ {
132
+ "epoch": 20.0,
133
+ "grad_norm": 1744.54052734375,
134
+ "learning_rate": 6.7425e-06,
135
+ "loss": 70.3526,
136
+ "step": 900
137
+ },
138
+ {
139
+ "epoch": 21.11111111111111,
140
+ "grad_norm": 369.39837646484375,
141
+ "learning_rate": 7.1175e-06,
142
+ "loss": 63.9417,
143
+ "step": 950
144
+ },
145
+ {
146
+ "epoch": 22.22222222222222,
147
+ "grad_norm": 303.95977783203125,
148
+ "learning_rate": 7.4925e-06,
149
+ "loss": 63.4575,
150
+ "step": 1000
151
+ },
152
+ {
153
+ "epoch": 23.333333333333332,
154
+ "grad_norm": 187.462890625,
155
+ "learning_rate": 7.8675e-06,
156
+ "loss": 54.7417,
157
+ "step": 1050
158
+ },
159
+ {
160
+ "epoch": 24.444444444444443,
161
+ "grad_norm": 165.56666564941406,
162
+ "learning_rate": 8.2425e-06,
163
+ "loss": 49.6842,
164
+ "step": 1100
165
+ },
166
+ {
167
+ "epoch": 25.555555555555557,
168
+ "grad_norm": 147.3148193359375,
169
+ "learning_rate": 8.6175e-06,
170
+ "loss": 43.027,
171
+ "step": 1150
172
+ },
173
+ {
174
+ "epoch": 26.666666666666668,
175
+ "grad_norm": 125.53775024414062,
176
+ "learning_rate": 8.9925e-06,
177
+ "loss": 38.2579,
178
+ "step": 1200
179
+ },
180
+ {
181
+ "epoch": 27.77777777777778,
182
+ "grad_norm": 109.85810089111328,
183
+ "learning_rate": 9.367500000000001e-06,
184
+ "loss": 34.3957,
185
+ "step": 1250
186
+ },
187
+ {
188
+ "epoch": 28.88888888888889,
189
+ "grad_norm": 98.328369140625,
190
+ "learning_rate": 9.7425e-06,
191
+ "loss": 31.4378,
192
+ "step": 1300
193
+ },
194
+ {
195
+ "epoch": 30.0,
196
+ "grad_norm": 109.77750396728516,
197
+ "learning_rate": 1.01175e-05,
198
+ "loss": 28.5084,
199
+ "step": 1350
200
+ },
201
+ {
202
+ "epoch": 31.11111111111111,
203
+ "grad_norm": 112.47483825683594,
204
+ "learning_rate": 1.04925e-05,
205
+ "loss": 26.1671,
206
+ "step": 1400
207
+ },
208
+ {
209
+ "epoch": 32.22222222222222,
210
+ "grad_norm": 85.60242462158203,
211
+ "learning_rate": 1.08675e-05,
212
+ "loss": 24.2309,
213
+ "step": 1450
214
+ },
215
+ {
216
+ "epoch": 33.333333333333336,
217
+ "grad_norm": 73.19799041748047,
218
+ "learning_rate": 1.1242500000000001e-05,
219
+ "loss": 22.6248,
220
+ "step": 1500
221
+ },
222
+ {
223
+ "epoch": 34.44444444444444,
224
+ "grad_norm": 80.70884704589844,
225
+ "learning_rate": 1.16175e-05,
226
+ "loss": 21.1187,
227
+ "step": 1550
228
+ },
229
+ {
230
+ "epoch": 35.55555555555556,
231
+ "grad_norm": 110.98326110839844,
232
+ "learning_rate": 1.19925e-05,
233
+ "loss": 20.4828,
234
+ "step": 1600
235
+ }
236
+ ],
237
+ "logging_steps": 50,
238
+ "max_steps": 2250,
239
+ "num_input_tokens_seen": 0,
240
+ "num_train_epochs": 50,
241
+ "save_steps": 200,
242
+ "stateful_callbacks": {
243
+ "TrainerControl": {
244
+ "args": {
245
+ "should_epoch_stop": false,
246
+ "should_evaluate": false,
247
+ "should_log": false,
248
+ "should_save": true,
249
+ "should_training_stop": false
250
+ },
251
+ "attributes": {}
252
+ }
253
+ },
254
+ "total_flos": 4.31006450959319e+19,
255
+ "train_batch_size": 24,
256
+ "trial_name": null,
257
+ "trial_params": null
258
+ }
checkpoint-1600/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7b84367094b7487f77de50fba614a6c6667e9cf018b77ee5bfc158268fc5eaf
3
+ size 5368
checkpoint-1800/config.json ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "activation_function": "silu",
4
+ "anchor_image_size": null,
5
+ "architectures": [
6
+ "RTDetrV2ForObjectDetection"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "auxiliary_loss": true,
10
+ "backbone": null,
11
+ "backbone_config": {
12
+ "depths": [
13
+ 3,
14
+ 4,
15
+ 23,
16
+ 3
17
+ ],
18
+ "downsample_in_bottleneck": false,
19
+ "downsample_in_first_stage": false,
20
+ "embedding_size": 64,
21
+ "hidden_act": "relu",
22
+ "hidden_sizes": [
23
+ 256,
24
+ 512,
25
+ 1024,
26
+ 2048
27
+ ],
28
+ "layer_type": "bottleneck",
29
+ "model_type": "rt_detr_resnet",
30
+ "num_channels": 3,
31
+ "out_features": [
32
+ "stage2",
33
+ "stage3",
34
+ "stage4"
35
+ ],
36
+ "out_indices": [
37
+ 2,
38
+ 3,
39
+ 4
40
+ ],
41
+ "stage_names": [
42
+ "stem",
43
+ "stage1",
44
+ "stage2",
45
+ "stage3",
46
+ "stage4"
47
+ ],
48
+ "torch_dtype": "float32"
49
+ },
50
+ "backbone_kwargs": null,
51
+ "batch_norm_eps": 1e-05,
52
+ "box_noise_scale": 1.0,
53
+ "d_model": 256,
54
+ "decoder_activation_function": "relu",
55
+ "decoder_attention_heads": 8,
56
+ "decoder_ffn_dim": 1024,
57
+ "decoder_in_channels": [
58
+ 384,
59
+ 384,
60
+ 384
61
+ ],
62
+ "decoder_layers": 6,
63
+ "decoder_method": "default",
64
+ "decoder_n_levels": 3,
65
+ "decoder_n_points": 4,
66
+ "decoder_offset_scale": 0.5,
67
+ "disable_custom_kernels": true,
68
+ "dropout": 0.0,
69
+ "encode_proj_layers": [
70
+ 2
71
+ ],
72
+ "encoder_activation_function": "gelu",
73
+ "encoder_attention_heads": 8,
74
+ "encoder_ffn_dim": 2048,
75
+ "encoder_hidden_dim": 384,
76
+ "encoder_in_channels": [
77
+ 512,
78
+ 1024,
79
+ 2048
80
+ ],
81
+ "encoder_layers": 1,
82
+ "eos_coefficient": 0.0001,
83
+ "eval_size": null,
84
+ "feat_strides": [
85
+ 8,
86
+ 16,
87
+ 32
88
+ ],
89
+ "focal_loss_alpha": 0.75,
90
+ "focal_loss_gamma": 2.0,
91
+ "freeze_backbone_batch_norms": true,
92
+ "hidden_expansion": 1.0,
93
+ "id2label": {
94
+ "0": "LABEL_0",
95
+ "1": "LABEL_1",
96
+ "2": "LABEL_2"
97
+ },
98
+ "initializer_bias_prior_prob": null,
99
+ "initializer_range": 0.01,
100
+ "is_encoder_decoder": true,
101
+ "label2id": {
102
+ "LABEL_0": 0,
103
+ "LABEL_1": 1,
104
+ "LABEL_2": 2
105
+ },
106
+ "label_noise_ratio": 0.5,
107
+ "layer_norm_eps": 1e-05,
108
+ "learn_initial_query": false,
109
+ "matcher_alpha": 0.25,
110
+ "matcher_bbox_cost": 5.0,
111
+ "matcher_class_cost": 2.0,
112
+ "matcher_gamma": 2.0,
113
+ "matcher_giou_cost": 2.0,
114
+ "model_type": "rt_detr_v2",
115
+ "normalize_before": false,
116
+ "num_denoising": 100,
117
+ "num_feature_levels": 3,
118
+ "num_queries": 300,
119
+ "positional_encoding_temperature": 10000,
120
+ "torch_dtype": "float32",
121
+ "transformers_version": "4.55.0",
122
+ "use_focal_loss": true,
123
+ "use_pretrained_backbone": false,
124
+ "use_timm_backbone": false,
125
+ "weight_loss_bbox": 5.0,
126
+ "weight_loss_giou": 2.0,
127
+ "weight_loss_vfl": 1.0,
128
+ "with_box_refine": true
129
+ }
checkpoint-1800/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07ca2b0b335e81ed89a2f7f930b576eee925506e057c93002392f1e2376180f0
3
+ size 306699044
checkpoint-1800/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9bd5f81a1ac758d99c2a50f00a75bf78c82cda77f464f12cdda567e7a65306d7
3
+ size 611580433
checkpoint-1800/preprocessor_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_annotations": true,
3
+ "do_normalize": false,
4
+ "do_pad": false,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "format": "coco_detection",
8
+ "image_mean": [
9
+ 0.485,
10
+ 0.456,
11
+ 0.406
12
+ ],
13
+ "image_processor_type": "RTDetrImageProcessor",
14
+ "image_std": [
15
+ 0.229,
16
+ 0.224,
17
+ 0.225
18
+ ],
19
+ "pad_size": null,
20
+ "resample": 2,
21
+ "rescale_factor": 0.00392156862745098,
22
+ "size": {
23
+ "height": 640,
24
+ "width": 640
25
+ }
26
+ }
checkpoint-1800/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0f22c99e3d52b982805eed64c46ed79d1c93982c6433c732f7f4d4a2ab58f93
3
+ size 14244
checkpoint-1800/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7ef919f119174a0bac8b992747548bc0ab5a6be59a5cfc5a6c8633fade536fa
3
+ size 1064
checkpoint-1800/trainer_state.json ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 40.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1800,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 1.1111111111111112,
14
+ "grad_norm": 3663.33642578125,
15
+ "learning_rate": 3.6750000000000003e-07,
16
+ "loss": 1356.1239,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 2.2222222222222223,
21
+ "grad_norm": 3973.032470703125,
22
+ "learning_rate": 7.425000000000001e-07,
23
+ "loss": 1275.5178,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 3.3333333333333335,
28
+ "grad_norm": 2667.0068359375,
29
+ "learning_rate": 1.1174999999999999e-06,
30
+ "loss": 1123.6059,
31
+ "step": 150
32
+ },
33
+ {
34
+ "epoch": 4.444444444444445,
35
+ "grad_norm": 2320.48486328125,
36
+ "learning_rate": 1.4925000000000001e-06,
37
+ "loss": 922.8917,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 5.555555555555555,
42
+ "grad_norm": 1611.3345947265625,
43
+ "learning_rate": 1.8675000000000001e-06,
44
+ "loss": 714.5638,
45
+ "step": 250
46
+ },
47
+ {
48
+ "epoch": 6.666666666666667,
49
+ "grad_norm": 1395.7064208984375,
50
+ "learning_rate": 2.2425e-06,
51
+ "loss": 542.1251,
52
+ "step": 300
53
+ },
54
+ {
55
+ "epoch": 7.777777777777778,
56
+ "grad_norm": 1019.4860229492188,
57
+ "learning_rate": 2.6175e-06,
58
+ "loss": 411.1387,
59
+ "step": 350
60
+ },
61
+ {
62
+ "epoch": 8.88888888888889,
63
+ "grad_norm": 888.315185546875,
64
+ "learning_rate": 2.9925e-06,
65
+ "loss": 318.4567,
66
+ "step": 400
67
+ },
68
+ {
69
+ "epoch": 10.0,
70
+ "grad_norm": 2590.113525390625,
71
+ "learning_rate": 3.3675000000000004e-06,
72
+ "loss": 261.6995,
73
+ "step": 450
74
+ },
75
+ {
76
+ "epoch": 11.11111111111111,
77
+ "grad_norm": 711.677734375,
78
+ "learning_rate": 3.7425e-06,
79
+ "loss": 220.2936,
80
+ "step": 500
81
+ },
82
+ {
83
+ "epoch": 12.222222222222221,
84
+ "grad_norm": 548.9371948242188,
85
+ "learning_rate": 4.117500000000001e-06,
86
+ "loss": 187.9833,
87
+ "step": 550
88
+ },
89
+ {
90
+ "epoch": 13.333333333333334,
91
+ "grad_norm": 1127.1611328125,
92
+ "learning_rate": 4.4925e-06,
93
+ "loss": 159.1351,
94
+ "step": 600
95
+ },
96
+ {
97
+ "epoch": 14.444444444444445,
98
+ "grad_norm": 426.074951171875,
99
+ "learning_rate": 4.8675e-06,
100
+ "loss": 137.1092,
101
+ "step": 650
102
+ },
103
+ {
104
+ "epoch": 15.555555555555555,
105
+ "grad_norm": 348.842529296875,
106
+ "learning_rate": 5.2425e-06,
107
+ "loss": 119.822,
108
+ "step": 700
109
+ },
110
+ {
111
+ "epoch": 16.666666666666668,
112
+ "grad_norm": 373.2169189453125,
113
+ "learning_rate": 5.6175e-06,
114
+ "loss": 104.3366,
115
+ "step": 750
116
+ },
117
+ {
118
+ "epoch": 17.77777777777778,
119
+ "grad_norm": 324.51702880859375,
120
+ "learning_rate": 5.992500000000001e-06,
121
+ "loss": 90.8788,
122
+ "step": 800
123
+ },
124
+ {
125
+ "epoch": 18.88888888888889,
126
+ "grad_norm": 269.91827392578125,
127
+ "learning_rate": 6.3675e-06,
128
+ "loss": 78.4644,
129
+ "step": 850
130
+ },
131
+ {
132
+ "epoch": 20.0,
133
+ "grad_norm": 1744.54052734375,
134
+ "learning_rate": 6.7425e-06,
135
+ "loss": 70.3526,
136
+ "step": 900
137
+ },
138
+ {
139
+ "epoch": 21.11111111111111,
140
+ "grad_norm": 369.39837646484375,
141
+ "learning_rate": 7.1175e-06,
142
+ "loss": 63.9417,
143
+ "step": 950
144
+ },
145
+ {
146
+ "epoch": 22.22222222222222,
147
+ "grad_norm": 303.95977783203125,
148
+ "learning_rate": 7.4925e-06,
149
+ "loss": 63.4575,
150
+ "step": 1000
151
+ },
152
+ {
153
+ "epoch": 23.333333333333332,
154
+ "grad_norm": 187.462890625,
155
+ "learning_rate": 7.8675e-06,
156
+ "loss": 54.7417,
157
+ "step": 1050
158
+ },
159
+ {
160
+ "epoch": 24.444444444444443,
161
+ "grad_norm": 165.56666564941406,
162
+ "learning_rate": 8.2425e-06,
163
+ "loss": 49.6842,
164
+ "step": 1100
165
+ },
166
+ {
167
+ "epoch": 25.555555555555557,
168
+ "grad_norm": 147.3148193359375,
169
+ "learning_rate": 8.6175e-06,
170
+ "loss": 43.027,
171
+ "step": 1150
172
+ },
173
+ {
174
+ "epoch": 26.666666666666668,
175
+ "grad_norm": 125.53775024414062,
176
+ "learning_rate": 8.9925e-06,
177
+ "loss": 38.2579,
178
+ "step": 1200
179
+ },
180
+ {
181
+ "epoch": 27.77777777777778,
182
+ "grad_norm": 109.85810089111328,
183
+ "learning_rate": 9.367500000000001e-06,
184
+ "loss": 34.3957,
185
+ "step": 1250
186
+ },
187
+ {
188
+ "epoch": 28.88888888888889,
189
+ "grad_norm": 98.328369140625,
190
+ "learning_rate": 9.7425e-06,
191
+ "loss": 31.4378,
192
+ "step": 1300
193
+ },
194
+ {
195
+ "epoch": 30.0,
196
+ "grad_norm": 109.77750396728516,
197
+ "learning_rate": 1.01175e-05,
198
+ "loss": 28.5084,
199
+ "step": 1350
200
+ },
201
+ {
202
+ "epoch": 31.11111111111111,
203
+ "grad_norm": 112.47483825683594,
204
+ "learning_rate": 1.04925e-05,
205
+ "loss": 26.1671,
206
+ "step": 1400
207
+ },
208
+ {
209
+ "epoch": 32.22222222222222,
210
+ "grad_norm": 85.60242462158203,
211
+ "learning_rate": 1.08675e-05,
212
+ "loss": 24.2309,
213
+ "step": 1450
214
+ },
215
+ {
216
+ "epoch": 33.333333333333336,
217
+ "grad_norm": 73.19799041748047,
218
+ "learning_rate": 1.1242500000000001e-05,
219
+ "loss": 22.6248,
220
+ "step": 1500
221
+ },
222
+ {
223
+ "epoch": 34.44444444444444,
224
+ "grad_norm": 80.70884704589844,
225
+ "learning_rate": 1.16175e-05,
226
+ "loss": 21.1187,
227
+ "step": 1550
228
+ },
229
+ {
230
+ "epoch": 35.55555555555556,
231
+ "grad_norm": 110.98326110839844,
232
+ "learning_rate": 1.19925e-05,
233
+ "loss": 20.4828,
234
+ "step": 1600
235
+ },
236
+ {
237
+ "epoch": 36.666666666666664,
238
+ "grad_norm": 113.65286254882812,
239
+ "learning_rate": 1.23675e-05,
240
+ "loss": 19.7366,
241
+ "step": 1650
242
+ },
243
+ {
244
+ "epoch": 37.77777777777778,
245
+ "grad_norm": 77.65855407714844,
246
+ "learning_rate": 1.27425e-05,
247
+ "loss": 18.6632,
248
+ "step": 1700
249
+ },
250
+ {
251
+ "epoch": 38.888888888888886,
252
+ "grad_norm": 88.96723175048828,
253
+ "learning_rate": 1.3117500000000001e-05,
254
+ "loss": 18.0793,
255
+ "step": 1750
256
+ },
257
+ {
258
+ "epoch": 40.0,
259
+ "grad_norm": 79.1690902709961,
260
+ "learning_rate": 1.34925e-05,
261
+ "loss": 17.0667,
262
+ "step": 1800
263
+ }
264
+ ],
265
+ "logging_steps": 50,
266
+ "max_steps": 2250,
267
+ "num_input_tokens_seen": 0,
268
+ "num_train_epochs": 50,
269
+ "save_steps": 200,
270
+ "stateful_callbacks": {
271
+ "TrainerControl": {
272
+ "args": {
273
+ "should_epoch_stop": false,
274
+ "should_evaluate": false,
275
+ "should_log": false,
276
+ "should_save": true,
277
+ "should_training_stop": false
278
+ },
279
+ "attributes": {}
280
+ }
281
+ },
282
+ "total_flos": 4.848540943358362e+19,
283
+ "train_batch_size": 24,
284
+ "trial_name": null,
285
+ "trial_params": null
286
+ }
checkpoint-1800/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7b84367094b7487f77de50fba614a6c6667e9cf018b77ee5bfc158268fc5eaf
3
+ size 5368
checkpoint-2000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f77f03ccce08e6fc68356cfd96e1e595cd091842dfe691b57b0240f9d0ec534d
3
  size 306699044
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74e332bd7772e2e7ca2e34234d7da49a711abdbfb273e02189615d3a85b39c3b
3
  size 306699044
checkpoint-2000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:232a5353d5019158bc112b85a00cc968eabb74598e1ef8a15e7f726735442ca6
3
  size 611580433
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04408b4cb5317b335a1dfa8034af865e68e9535b6db6f797f7454b76a0dbb0fd
3
  size 611580433
checkpoint-2000/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b759482d4a8ad0299b9805d5a497036b386a5a5b40224d0e72b701c28b2f5ca
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf38b27cb6d82f34ae1038db7a7ddef1794dabfa258a278a90cdfdf859a01777
3
  size 14244
checkpoint-2000/trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 43.505494505494504,
6
  "eval_steps": 500,
7
  "global_step": 2000,
8
  "is_hyper_param_search": false,
@@ -10,2811 +10,291 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.47619047619047616,
14
- "grad_norm": 5716.10205078125,
15
- "learning_rate": 3.0000000000000004e-08,
16
- "loss": 1383.1007,
17
- "step": 5
18
- },
19
- {
20
- "epoch": 0.9523809523809523,
21
- "grad_norm": 12343.9111328125,
22
- "learning_rate": 6.75e-08,
23
- "loss": 1349.0135,
24
- "step": 10
25
- },
26
- {
27
- "epoch": 1.380952380952381,
28
- "grad_norm": 10210.48046875,
29
- "learning_rate": 1.05e-07,
30
- "loss": 1340.3351,
31
- "step": 15
32
- },
33
- {
34
- "epoch": 1.8571428571428572,
35
- "grad_norm": 7633.39599609375,
36
- "learning_rate": 1.425e-07,
37
- "loss": 1369.3844,
38
- "step": 20
39
- },
40
- {
41
- "epoch": 0.5494505494505495,
42
- "grad_norm": 12217.7294921875,
43
- "learning_rate": 1.8e-07,
44
- "loss": 1362.663,
45
- "step": 25
46
- },
47
- {
48
- "epoch": 0.6593406593406593,
49
- "grad_norm": 31348.806640625,
50
- "learning_rate": 2.175e-07,
51
- "loss": 1361.9012,
52
- "step": 30
53
- },
54
- {
55
- "epoch": 0.7692307692307693,
56
- "grad_norm": 7385.05322265625,
57
- "learning_rate": 2.5500000000000005e-07,
58
- "loss": 1365.5517,
59
- "step": 35
60
- },
61
- {
62
- "epoch": 0.8791208791208791,
63
- "grad_norm": 15594.4677734375,
64
- "learning_rate": 2.925e-07,
65
- "loss": 1341.6402,
66
- "step": 40
67
- },
68
- {
69
- "epoch": 0.989010989010989,
70
- "grad_norm": 6404.94775390625,
71
- "learning_rate": 3.2999999999999996e-07,
72
- "loss": 1343.9832,
73
- "step": 45
74
- },
75
- {
76
- "epoch": 1.10989010989011,
77
- "grad_norm": 6927.478515625,
78
  "learning_rate": 3.6750000000000003e-07,
79
- "loss": 1611.4768,
80
  "step": 50
81
  },
82
  {
83
- "epoch": 1.2197802197802199,
84
- "grad_norm": 12092.9287109375,
85
- "learning_rate": 4.05e-07,
86
- "loss": 1330.508,
87
- "step": 55
88
- },
89
- {
90
- "epoch": 1.3296703296703296,
91
- "grad_norm": 5830.10693359375,
92
- "learning_rate": 4.425e-07,
93
- "loss": 1333.5215,
94
- "step": 60
95
- },
96
- {
97
- "epoch": 1.4395604395604396,
98
- "grad_norm": 5059.5302734375,
99
- "learning_rate": 4.800000000000001e-07,
100
- "loss": 1305.5257,
101
- "step": 65
102
- },
103
- {
104
- "epoch": 1.5494505494505495,
105
- "grad_norm": 14086.837890625,
106
- "learning_rate": 5.175e-07,
107
- "loss": 1293.7212,
108
- "step": 70
109
- },
110
- {
111
- "epoch": 1.6593406593406592,
112
- "grad_norm": 4606.49462890625,
113
- "learning_rate": 5.55e-07,
114
- "loss": 1293.7292,
115
- "step": 75
116
- },
117
- {
118
- "epoch": 1.7692307692307692,
119
- "grad_norm": 7120.25244140625,
120
- "learning_rate": 5.925e-07,
121
- "loss": 1283.8888,
122
- "step": 80
123
- },
124
- {
125
- "epoch": 1.879120879120879,
126
- "grad_norm": 4402.51513671875,
127
- "learning_rate": 6.3e-07,
128
- "loss": 1270.6494,
129
- "step": 85
130
- },
131
- {
132
- "epoch": 1.989010989010989,
133
- "grad_norm": 4826.724609375,
134
- "learning_rate": 6.675e-07,
135
- "loss": 1263.9209,
136
- "step": 90
137
- },
138
- {
139
- "epoch": 2.087912087912088,
140
- "grad_norm": 4356.83056640625,
141
- "learning_rate": 7.05e-07,
142
- "loss": 1255.111,
143
- "step": 95
144
- },
145
- {
146
- "epoch": 2.197802197802198,
147
- "grad_norm": 6895.61962890625,
148
- "learning_rate": 7.425000000000001e-07,
149
- "loss": 1224.4364,
150
- "step": 100
151
- },
152
- {
153
- "epoch": 2.3076923076923075,
154
- "grad_norm": 4793.59375,
155
- "learning_rate": 7.799999999999999e-07,
156
- "loss": 1211.3252,
157
- "step": 105
158
- },
159
- {
160
- "epoch": 2.4175824175824174,
161
- "grad_norm": 6701.6357421875,
162
- "learning_rate": 8.175e-07,
163
- "loss": 1183.1627,
164
- "step": 110
165
- },
166
- {
167
- "epoch": 2.5274725274725274,
168
- "grad_norm": 4253.81005859375,
169
- "learning_rate": 8.550000000000001e-07,
170
- "loss": 1179.29,
171
- "step": 115
172
- },
173
- {
174
- "epoch": 2.6373626373626373,
175
- "grad_norm": 4965.86181640625,
176
- "learning_rate": 8.925e-07,
177
- "loss": 1168.3469,
178
- "step": 120
179
- },
180
- {
181
- "epoch": 2.7472527472527473,
182
- "grad_norm": 3376.816650390625,
183
- "learning_rate": 9.3e-07,
184
- "loss": 1139.6433,
185
- "step": 125
186
- },
187
- {
188
- "epoch": 2.857142857142857,
189
- "grad_norm": 3226.073486328125,
190
- "learning_rate": 9.675e-07,
191
- "loss": 1133.0777,
192
- "step": 130
193
- },
194
- {
195
- "epoch": 2.967032967032967,
196
- "grad_norm": 3560.005126953125,
197
- "learning_rate": 1.0050000000000001e-06,
198
- "loss": 1113.3471,
199
- "step": 135
200
- },
201
- {
202
- "epoch": 3.065934065934066,
203
- "grad_norm": 2917.787109375,
204
- "learning_rate": 1.0425000000000002e-06,
205
- "loss": 1094.2038,
206
- "step": 140
207
- },
208
- {
209
- "epoch": 3.1758241758241756,
210
- "grad_norm": 3291.990478515625,
211
- "learning_rate": 1.08e-06,
212
- "loss": 1069.3087,
213
- "step": 145
214
- },
215
- {
216
- "epoch": 3.2857142857142856,
217
- "grad_norm": 3082.956298828125,
218
- "learning_rate": 1.1174999999999999e-06,
219
- "loss": 1043.3319,
220
- "step": 150
221
- },
222
- {
223
- "epoch": 3.3956043956043955,
224
- "grad_norm": 2947.577392578125,
225
- "learning_rate": 1.155e-06,
226
- "loss": 1040.2479,
227
- "step": 155
228
- },
229
- {
230
- "epoch": 3.5054945054945055,
231
- "grad_norm": 2917.072021484375,
232
- "learning_rate": 1.1925e-06,
233
- "loss": 1014.5039,
234
- "step": 160
235
- },
236
- {
237
- "epoch": 3.6153846153846154,
238
- "grad_norm": 2572.1064453125,
239
- "learning_rate": 1.23e-06,
240
- "loss": 980.6475,
241
- "step": 165
242
- },
243
- {
244
- "epoch": 3.7252747252747254,
245
- "grad_norm": 3038.7041015625,
246
- "learning_rate": 1.2675000000000001e-06,
247
- "loss": 969.8687,
248
- "step": 170
249
- },
250
- {
251
- "epoch": 3.8351648351648353,
252
- "grad_norm": 2459.19482421875,
253
- "learning_rate": 1.305e-06,
254
- "loss": 946.8183,
255
- "step": 175
256
- },
257
- {
258
- "epoch": 3.9450549450549453,
259
- "grad_norm": 2485.954345703125,
260
- "learning_rate": 1.3425e-06,
261
- "loss": 923.6708,
262
- "step": 180
263
- },
264
- {
265
- "epoch": 4.043956043956044,
266
- "grad_norm": 2574.14599609375,
267
- "learning_rate": 1.38e-06,
268
- "loss": 906.8975,
269
- "step": 185
270
- },
271
- {
272
- "epoch": 4.153846153846154,
273
- "grad_norm": 2498.670166015625,
274
- "learning_rate": 1.4175e-06,
275
- "loss": 879.6545,
276
- "step": 190
277
- },
278
- {
279
- "epoch": 4.263736263736264,
280
- "grad_norm": 2621.924072265625,
281
- "learning_rate": 1.455e-06,
282
- "loss": 859.6516,
283
- "step": 195
284
- },
285
- {
286
- "epoch": 4.373626373626374,
287
- "grad_norm": 2298.13671875,
288
- "learning_rate": 1.4925000000000001e-06,
289
- "loss": 835.168,
290
- "step": 200
291
- },
292
- {
293
- "epoch": 4.483516483516484,
294
- "grad_norm": 2228.95458984375,
295
- "learning_rate": 1.53e-06,
296
- "loss": 817.8176,
297
- "step": 205
298
- },
299
- {
300
- "epoch": 4.593406593406593,
301
- "grad_norm": 2075.57568359375,
302
- "learning_rate": 1.5675e-06,
303
- "loss": 795.5578,
304
- "step": 210
305
- },
306
- {
307
- "epoch": 4.7032967032967035,
308
- "grad_norm": 2308.193603515625,
309
- "learning_rate": 1.605e-06,
310
- "loss": 769.8123,
311
- "step": 215
312
- },
313
- {
314
- "epoch": 4.813186813186813,
315
- "grad_norm": 2495.91259765625,
316
- "learning_rate": 1.6425e-06,
317
- "loss": 745.4971,
318
- "step": 220
319
- },
320
- {
321
- "epoch": 4.923076923076923,
322
- "grad_norm": 2490.6796875,
323
- "learning_rate": 1.68e-06,
324
- "loss": 724.5927,
325
- "step": 225
326
- },
327
- {
328
- "epoch": 5.021978021978022,
329
- "grad_norm": 1986.7320556640625,
330
- "learning_rate": 1.7175e-06,
331
- "loss": 711.5865,
332
- "step": 230
333
- },
334
- {
335
- "epoch": 5.131868131868132,
336
- "grad_norm": 1901.4664306640625,
337
- "learning_rate": 1.7550000000000001e-06,
338
- "loss": 692.5993,
339
- "step": 235
340
- },
341
- {
342
- "epoch": 5.241758241758242,
343
- "grad_norm": 2567.854248046875,
344
- "learning_rate": 1.7925e-06,
345
- "loss": 665.2945,
346
- "step": 240
347
- },
348
- {
349
- "epoch": 5.351648351648351,
350
- "grad_norm": 1668.3482666015625,
351
- "learning_rate": 1.83e-06,
352
- "loss": 648.977,
353
- "step": 245
354
- },
355
- {
356
- "epoch": 5.461538461538462,
357
- "grad_norm": 1845.254150390625,
358
- "learning_rate": 1.8675000000000001e-06,
359
- "loss": 627.8005,
360
- "step": 250
361
- },
362
- {
363
- "epoch": 5.571428571428571,
364
- "grad_norm": 1811.66845703125,
365
- "learning_rate": 1.905e-06,
366
- "loss": 606.1213,
367
- "step": 255
368
- },
369
- {
370
- "epoch": 5.681318681318682,
371
- "grad_norm": 1720.802734375,
372
- "learning_rate": 1.9425e-06,
373
- "loss": 587.5198,
374
- "step": 260
375
- },
376
- {
377
- "epoch": 5.791208791208791,
378
- "grad_norm": 1535.228759765625,
379
- "learning_rate": 1.98e-06,
380
- "loss": 570.1765,
381
- "step": 265
382
- },
383
- {
384
- "epoch": 5.9010989010989015,
385
- "grad_norm": 1655.658447265625,
386
- "learning_rate": 2.0175e-06,
387
- "loss": 551.0042,
388
- "step": 270
389
- },
390
- {
391
- "epoch": 6.0,
392
- "grad_norm": 3979.0732421875,
393
- "learning_rate": 2.0550000000000002e-06,
394
- "loss": 535.4148,
395
- "step": 275
396
- },
397
- {
398
- "epoch": 6.1098901098901095,
399
- "grad_norm": 1727.4771728515625,
400
- "learning_rate": 2.0925000000000003e-06,
401
- "loss": 519.8219,
402
- "step": 280
403
- },
404
- {
405
- "epoch": 6.21978021978022,
406
- "grad_norm": 1350.4666748046875,
407
- "learning_rate": 2.13e-06,
408
- "loss": 510.5004,
409
- "step": 285
410
- },
411
- {
412
- "epoch": 6.329670329670329,
413
- "grad_norm": 2242.578857421875,
414
- "learning_rate": 2.1675e-06,
415
- "loss": 497.497,
416
- "step": 290
417
- },
418
- {
419
- "epoch": 6.43956043956044,
420
- "grad_norm": 1353.7908935546875,
421
- "learning_rate": 2.205e-06,
422
- "loss": 476.5748,
423
- "step": 295
424
- },
425
- {
426
- "epoch": 6.549450549450549,
427
- "grad_norm": 1527.2796630859375,
428
- "learning_rate": 2.2425e-06,
429
- "loss": 468.7679,
430
- "step": 300
431
- },
432
- {
433
- "epoch": 6.65934065934066,
434
- "grad_norm": 1145.3853759765625,
435
- "learning_rate": 2.28e-06,
436
- "loss": 454.7695,
437
- "step": 305
438
- },
439
- {
440
- "epoch": 6.769230769230769,
441
- "grad_norm": 1233.435302734375,
442
- "learning_rate": 2.3175e-06,
443
- "loss": 438.3474,
444
- "step": 310
445
- },
446
- {
447
- "epoch": 6.8791208791208796,
448
- "grad_norm": 1669.4859619140625,
449
- "learning_rate": 2.355e-06,
450
- "loss": 424.2438,
451
- "step": 315
452
- },
453
- {
454
- "epoch": 6.989010989010989,
455
- "grad_norm": 1667.7239990234375,
456
- "learning_rate": 2.3925e-06,
457
- "loss": 412.884,
458
- "step": 320
459
- },
460
- {
461
- "epoch": 7.087912087912088,
462
- "grad_norm": 1087.6927490234375,
463
- "learning_rate": 2.43e-06,
464
- "loss": 399.3542,
465
- "step": 325
466
- },
467
- {
468
- "epoch": 7.197802197802198,
469
- "grad_norm": 1415.59765625,
470
- "learning_rate": 2.4675e-06,
471
- "loss": 387.3436,
472
- "step": 330
473
- },
474
- {
475
- "epoch": 7.3076923076923075,
476
- "grad_norm": 1952.5645751953125,
477
- "learning_rate": 2.505e-06,
478
- "loss": 377.6118,
479
- "step": 335
480
- },
481
- {
482
- "epoch": 7.417582417582418,
483
- "grad_norm": 1404.5712890625,
484
- "learning_rate": 2.5425000000000002e-06,
485
- "loss": 371.8316,
486
- "step": 340
487
- },
488
- {
489
- "epoch": 7.527472527472527,
490
- "grad_norm": 1185.5135498046875,
491
- "learning_rate": 2.58e-06,
492
- "loss": 363.4571,
493
- "step": 345
494
- },
495
- {
496
- "epoch": 7.637362637362637,
497
- "grad_norm": 4727.9638671875,
498
- "learning_rate": 2.6175e-06,
499
- "loss": 354.3981,
500
- "step": 350
501
- },
502
- {
503
- "epoch": 7.747252747252747,
504
- "grad_norm": 937.0252075195312,
505
- "learning_rate": 2.655e-06,
506
- "loss": 344.7615,
507
- "step": 355
508
- },
509
- {
510
- "epoch": 7.857142857142857,
511
- "grad_norm": 1180.9298095703125,
512
- "learning_rate": 2.6925e-06,
513
- "loss": 340.5888,
514
- "step": 360
515
- },
516
- {
517
- "epoch": 7.967032967032967,
518
- "grad_norm": 1405.458984375,
519
- "learning_rate": 2.73e-06,
520
- "loss": 326.6432,
521
- "step": 365
522
- },
523
- {
524
- "epoch": 8.065934065934066,
525
- "grad_norm": 936.8318481445312,
526
- "learning_rate": 2.7675e-06,
527
- "loss": 317.4954,
528
- "step": 370
529
- },
530
- {
531
- "epoch": 8.175824175824175,
532
- "grad_norm": 888.236328125,
533
- "learning_rate": 2.8050000000000002e-06,
534
- "loss": 309.4422,
535
- "step": 375
536
- },
537
- {
538
- "epoch": 8.285714285714286,
539
- "grad_norm": 970.7135620117188,
540
- "learning_rate": 2.8425e-06,
541
- "loss": 301.2726,
542
- "step": 380
543
- },
544
- {
545
- "epoch": 8.395604395604396,
546
- "grad_norm": 1607.8035888671875,
547
- "learning_rate": 2.88e-06,
548
- "loss": 295.7361,
549
- "step": 385
550
- },
551
- {
552
- "epoch": 8.505494505494505,
553
- "grad_norm": 814.48486328125,
554
- "learning_rate": 2.9175e-06,
555
- "loss": 290.0745,
556
- "step": 390
557
- },
558
- {
559
- "epoch": 8.615384615384615,
560
- "grad_norm": 803.1909790039062,
561
- "learning_rate": 2.955e-06,
562
- "loss": 279.9774,
563
- "step": 395
564
- },
565
- {
566
- "epoch": 8.725274725274724,
567
- "grad_norm": 1034.652099609375,
568
- "learning_rate": 2.9925e-06,
569
- "loss": 277.7123,
570
- "step": 400
571
- },
572
- {
573
- "epoch": 8.835164835164836,
574
- "grad_norm": 993.9649658203125,
575
- "learning_rate": 3.0300000000000002e-06,
576
- "loss": 267.6938,
577
- "step": 405
578
- },
579
- {
580
- "epoch": 8.945054945054945,
581
- "grad_norm": 1010.8787841796875,
582
- "learning_rate": 3.0675e-06,
583
- "loss": 266.7134,
584
- "step": 410
585
- },
586
- {
587
- "epoch": 9.043956043956044,
588
- "grad_norm": 762.2285766601562,
589
- "learning_rate": 3.105e-06,
590
- "loss": 264.4567,
591
- "step": 415
592
- },
593
- {
594
- "epoch": 9.153846153846153,
595
- "grad_norm": 5807.85791015625,
596
- "learning_rate": 3.1425e-06,
597
- "loss": 254.0208,
598
- "step": 420
599
- },
600
- {
601
- "epoch": 9.263736263736265,
602
- "grad_norm": 717.2619018554688,
603
- "learning_rate": 3.18e-06,
604
- "loss": 249.0201,
605
- "step": 425
606
- },
607
- {
608
- "epoch": 9.373626373626374,
609
- "grad_norm": 725.6996459960938,
610
- "learning_rate": 3.2175e-06,
611
- "loss": 243.3289,
612
- "step": 430
613
- },
614
- {
615
- "epoch": 9.483516483516484,
616
- "grad_norm": 761.1790161132812,
617
- "learning_rate": 3.255e-06,
618
- "loss": 238.8994,
619
- "step": 435
620
- },
621
- {
622
- "epoch": 9.593406593406593,
623
- "grad_norm": 792.3602905273438,
624
- "learning_rate": 3.2925000000000002e-06,
625
- "loss": 231.8852,
626
- "step": 440
627
- },
628
- {
629
- "epoch": 9.703296703296703,
630
- "grad_norm": 744.7413330078125,
631
- "learning_rate": 3.3300000000000003e-06,
632
- "loss": 229.136,
633
- "step": 445
634
- },
635
- {
636
- "epoch": 9.813186813186814,
637
- "grad_norm": 673.6207885742188,
638
- "learning_rate": 3.3675000000000004e-06,
639
- "loss": 221.0368,
640
- "step": 450
641
- },
642
- {
643
- "epoch": 9.923076923076923,
644
- "grad_norm": 966.46630859375,
645
- "learning_rate": 3.405e-06,
646
- "loss": 217.8422,
647
- "step": 455
648
- },
649
- {
650
- "epoch": 10.021978021978022,
651
- "grad_norm": 740.294921875,
652
- "learning_rate": 3.4425e-06,
653
- "loss": 228.7085,
654
- "step": 460
655
- },
656
- {
657
- "epoch": 10.131868131868131,
658
- "grad_norm": 629.9981689453125,
659
- "learning_rate": 3.48e-06,
660
- "loss": 211.0439,
661
- "step": 465
662
- },
663
- {
664
- "epoch": 10.241758241758241,
665
- "grad_norm": 809.6885375976562,
666
- "learning_rate": 3.5174999999999998e-06,
667
- "loss": 204.4649,
668
- "step": 470
669
- },
670
- {
671
- "epoch": 10.351648351648352,
672
- "grad_norm": 1631.996337890625,
673
- "learning_rate": 3.555e-06,
674
- "loss": 204.0642,
675
- "step": 475
676
- },
677
- {
678
- "epoch": 10.461538461538462,
679
- "grad_norm": 958.5594482421875,
680
- "learning_rate": 3.5925e-06,
681
- "loss": 199.6854,
682
- "step": 480
683
- },
684
- {
685
- "epoch": 10.571428571428571,
686
- "grad_norm": 588.5241088867188,
687
- "learning_rate": 3.63e-06,
688
- "loss": 196.1547,
689
- "step": 485
690
- },
691
- {
692
- "epoch": 10.68131868131868,
693
- "grad_norm": 1439.742919921875,
694
- "learning_rate": 3.6675e-06,
695
- "loss": 194.8075,
696
- "step": 490
697
- },
698
- {
699
- "epoch": 10.791208791208792,
700
- "grad_norm": 547.2682495117188,
701
- "learning_rate": 3.705e-06,
702
- "loss": 190.5422,
703
- "step": 495
704
- },
705
- {
706
- "epoch": 10.901098901098901,
707
- "grad_norm": 550.932373046875,
708
- "learning_rate": 3.7425e-06,
709
- "loss": 188.0797,
710
- "step": 500
711
- },
712
- {
713
- "epoch": 11.0,
714
- "grad_norm": 1168.6212158203125,
715
- "learning_rate": 3.7800000000000002e-06,
716
- "loss": 180.8073,
717
- "step": 505
718
- },
719
- {
720
- "epoch": 11.10989010989011,
721
- "grad_norm": 613.2390747070312,
722
- "learning_rate": 3.8175e-06,
723
- "loss": 181.0391,
724
- "step": 510
725
- },
726
- {
727
- "epoch": 11.219780219780219,
728
- "grad_norm": 589.2745361328125,
729
- "learning_rate": 3.855e-06,
730
- "loss": 175.6959,
731
- "step": 515
732
- },
733
- {
734
- "epoch": 11.32967032967033,
735
- "grad_norm": 531.6744995117188,
736
- "learning_rate": 3.8925000000000004e-06,
737
- "loss": 174.1553,
738
- "step": 520
739
- },
740
- {
741
- "epoch": 11.43956043956044,
742
- "grad_norm": 514.2449340820312,
743
- "learning_rate": 3.9300000000000005e-06,
744
- "loss": 172.9302,
745
- "step": 525
746
- },
747
- {
748
- "epoch": 11.54945054945055,
749
- "grad_norm": 532.4719848632812,
750
- "learning_rate": 3.9675000000000006e-06,
751
- "loss": 168.4628,
752
- "step": 530
753
- },
754
- {
755
- "epoch": 11.659340659340659,
756
- "grad_norm": 475.0870361328125,
757
- "learning_rate": 4.005000000000001e-06,
758
- "loss": 164.1821,
759
- "step": 535
760
- },
761
- {
762
- "epoch": 11.76923076923077,
763
- "grad_norm": 531.968994140625,
764
- "learning_rate": 4.042500000000001e-06,
765
- "loss": 160.9798,
766
- "step": 540
767
- },
768
- {
769
- "epoch": 11.87912087912088,
770
- "grad_norm": 1276.7720947265625,
771
- "learning_rate": 4.080000000000001e-06,
772
- "loss": 161.732,
773
- "step": 545
774
- },
775
- {
776
- "epoch": 11.989010989010989,
777
- "grad_norm": 566.782958984375,
778
- "learning_rate": 4.117500000000001e-06,
779
- "loss": 157.6948,
780
- "step": 550
781
- },
782
- {
783
- "epoch": 12.087912087912088,
784
- "grad_norm": 749.6525268554688,
785
- "learning_rate": 4.155000000000001e-06,
786
- "loss": 157.8984,
787
- "step": 555
788
- },
789
- {
790
- "epoch": 12.197802197802197,
791
- "grad_norm": 465.9924011230469,
792
- "learning_rate": 4.1925e-06,
793
- "loss": 151.8522,
794
- "step": 560
795
- },
796
- {
797
- "epoch": 12.307692307692308,
798
- "grad_norm": 508.3736572265625,
799
- "learning_rate": 4.229999999999999e-06,
800
- "loss": 148.1901,
801
- "step": 565
802
- },
803
- {
804
- "epoch": 12.417582417582418,
805
- "grad_norm": 1925.7998046875,
806
- "learning_rate": 4.267499999999999e-06,
807
- "loss": 148.9536,
808
- "step": 570
809
- },
810
- {
811
- "epoch": 12.527472527472527,
812
- "grad_norm": 528.435546875,
813
- "learning_rate": 4.3049999999999994e-06,
814
- "loss": 146.6833,
815
- "step": 575
816
- },
817
- {
818
- "epoch": 12.637362637362637,
819
- "grad_norm": 629.6632690429688,
820
- "learning_rate": 4.3424999999999995e-06,
821
- "loss": 141.5915,
822
- "step": 580
823
- },
824
- {
825
- "epoch": 12.747252747252748,
826
- "grad_norm": 475.8123474121094,
827
- "learning_rate": 4.3799999999999996e-06,
828
- "loss": 139.1265,
829
- "step": 585
830
- },
831
- {
832
- "epoch": 12.857142857142858,
833
- "grad_norm": 545.7217407226562,
834
- "learning_rate": 4.4175e-06,
835
- "loss": 135.9861,
836
- "step": 590
837
- },
838
- {
839
- "epoch": 12.967032967032967,
840
- "grad_norm": 2502.422607421875,
841
- "learning_rate": 4.455e-06,
842
- "loss": 138.432,
843
- "step": 595
844
- },
845
- {
846
- "epoch": 13.065934065934066,
847
- "grad_norm": 432.36279296875,
848
- "learning_rate": 4.4925e-06,
849
- "loss": 144.0422,
850
- "step": 600
851
- },
852
- {
853
- "epoch": 13.175824175824175,
854
- "grad_norm": 429.08685302734375,
855
- "learning_rate": 4.53e-06,
856
- "loss": 132.7908,
857
- "step": 605
858
- },
859
- {
860
- "epoch": 13.285714285714286,
861
- "grad_norm": 467.1208801269531,
862
- "learning_rate": 4.5675e-06,
863
- "loss": 130.9718,
864
- "step": 610
865
- },
866
- {
867
- "epoch": 13.395604395604396,
868
- "grad_norm": 1329.150634765625,
869
- "learning_rate": 4.605e-06,
870
- "loss": 134.7514,
871
- "step": 615
872
- },
873
- {
874
- "epoch": 13.505494505494505,
875
- "grad_norm": 404.3941650390625,
876
- "learning_rate": 4.6425e-06,
877
- "loss": 127.4091,
878
- "step": 620
879
- },
880
- {
881
- "epoch": 13.615384615384615,
882
- "grad_norm": 439.34234619140625,
883
- "learning_rate": 4.68e-06,
884
- "loss": 123.8929,
885
- "step": 625
886
- },
887
- {
888
- "epoch": 13.725274725274724,
889
- "grad_norm": 418.9570007324219,
890
- "learning_rate": 4.7175e-06,
891
- "loss": 122.8744,
892
- "step": 630
893
- },
894
- {
895
- "epoch": 13.835164835164836,
896
- "grad_norm": 396.3748474121094,
897
- "learning_rate": 4.755e-06,
898
- "loss": 121.2023,
899
- "step": 635
900
- },
901
- {
902
- "epoch": 13.945054945054945,
903
- "grad_norm": 617.2871704101562,
904
- "learning_rate": 4.7925e-06,
905
- "loss": 121.1829,
906
- "step": 640
907
- },
908
- {
909
- "epoch": 14.043956043956044,
910
- "grad_norm": 409.8064270019531,
911
- "learning_rate": 4.83e-06,
912
- "loss": 117.2707,
913
- "step": 645
914
- },
915
- {
916
- "epoch": 14.153846153846153,
917
- "grad_norm": 516.0888671875,
918
- "learning_rate": 4.8675e-06,
919
- "loss": 118.2377,
920
- "step": 650
921
- },
922
- {
923
- "epoch": 14.263736263736265,
924
- "grad_norm": 505.36480712890625,
925
- "learning_rate": 4.9050000000000005e-06,
926
- "loss": 116.6757,
927
- "step": 655
928
- },
929
- {
930
- "epoch": 14.373626373626374,
931
- "grad_norm": 449.5859680175781,
932
- "learning_rate": 4.9425000000000005e-06,
933
- "loss": 114.7447,
934
- "step": 660
935
- },
936
- {
937
- "epoch": 14.483516483516484,
938
- "grad_norm": 420.5121765136719,
939
- "learning_rate": 4.980000000000001e-06,
940
- "loss": 113.2027,
941
- "step": 665
942
- },
943
- {
944
- "epoch": 14.593406593406593,
945
- "grad_norm": 345.0196838378906,
946
- "learning_rate": 5.017500000000001e-06,
947
- "loss": 111.8875,
948
- "step": 670
949
- },
950
- {
951
- "epoch": 14.703296703296703,
952
- "grad_norm": 377.2486877441406,
953
- "learning_rate": 5.055000000000001e-06,
954
- "loss": 107.6133,
955
- "step": 675
956
- },
957
- {
958
- "epoch": 14.813186813186814,
959
- "grad_norm": 384.3365783691406,
960
- "learning_rate": 5.092500000000001e-06,
961
- "loss": 109.3438,
962
- "step": 680
963
- },
964
- {
965
- "epoch": 14.923076923076923,
966
- "grad_norm": 385.41998291015625,
967
- "learning_rate": 5.130000000000001e-06,
968
- "loss": 103.7929,
969
- "step": 685
970
- },
971
- {
972
- "epoch": 15.021978021978022,
973
- "grad_norm": 362.2362976074219,
974
- "learning_rate": 5.1675e-06,
975
- "loss": 110.7998,
976
- "step": 690
977
- },
978
- {
979
- "epoch": 15.131868131868131,
980
- "grad_norm": 351.775146484375,
981
- "learning_rate": 5.205e-06,
982
- "loss": 100.9369,
983
- "step": 695
984
- },
985
- {
986
- "epoch": 15.241758241758241,
987
- "grad_norm": 390.7868347167969,
988
- "learning_rate": 5.2425e-06,
989
- "loss": 103.8922,
990
- "step": 700
991
- },
992
- {
993
- "epoch": 15.351648351648352,
994
- "grad_norm": 344.78564453125,
995
- "learning_rate": 5.279999999999999e-06,
996
- "loss": 99.1799,
997
- "step": 705
998
- },
999
- {
1000
- "epoch": 15.461538461538462,
1001
- "grad_norm": 378.85675048828125,
1002
- "learning_rate": 5.3174999999999995e-06,
1003
- "loss": 98.0914,
1004
- "step": 710
1005
- },
1006
- {
1007
- "epoch": 15.571428571428571,
1008
- "grad_norm": 342.2828063964844,
1009
- "learning_rate": 5.3549999999999996e-06,
1010
- "loss": 97.6728,
1011
- "step": 715
1012
- },
1013
- {
1014
- "epoch": 15.68131868131868,
1015
- "grad_norm": 341.1894226074219,
1016
- "learning_rate": 5.3925e-06,
1017
- "loss": 98.8686,
1018
- "step": 720
1019
- },
1020
- {
1021
- "epoch": 15.791208791208792,
1022
- "grad_norm": 336.378662109375,
1023
- "learning_rate": 5.43e-06,
1024
- "loss": 93.9906,
1025
- "step": 725
1026
- },
1027
- {
1028
- "epoch": 15.901098901098901,
1029
- "grad_norm": 334.4036560058594,
1030
- "learning_rate": 5.4675e-06,
1031
- "loss": 93.7508,
1032
- "step": 730
1033
- },
1034
- {
1035
- "epoch": 16.0,
1036
- "grad_norm": 1449.2354736328125,
1037
- "learning_rate": 5.505e-06,
1038
- "loss": 96.8374,
1039
- "step": 735
1040
- },
1041
- {
1042
- "epoch": 16.10989010989011,
1043
- "grad_norm": 341.5977478027344,
1044
- "learning_rate": 5.5425e-06,
1045
- "loss": 90.1757,
1046
- "step": 740
1047
- },
1048
- {
1049
- "epoch": 16.21978021978022,
1050
- "grad_norm": 562.0274658203125,
1051
- "learning_rate": 5.58e-06,
1052
- "loss": 89.7033,
1053
- "step": 745
1054
- },
1055
- {
1056
- "epoch": 16.32967032967033,
1057
- "grad_norm": 341.64959716796875,
1058
- "learning_rate": 5.6175e-06,
1059
- "loss": 90.8916,
1060
- "step": 750
1061
- },
1062
- {
1063
- "epoch": 16.439560439560438,
1064
- "grad_norm": 305.3785400390625,
1065
- "learning_rate": 5.655e-06,
1066
- "loss": 87.8087,
1067
- "step": 755
1068
- },
1069
- {
1070
- "epoch": 16.54945054945055,
1071
- "grad_norm": 317.1134338378906,
1072
- "learning_rate": 5.6925e-06,
1073
- "loss": 87.2254,
1074
- "step": 760
1075
- },
1076
- {
1077
- "epoch": 16.65934065934066,
1078
- "grad_norm": 305.0978088378906,
1079
- "learning_rate": 5.73e-06,
1080
- "loss": 86.9592,
1081
- "step": 765
1082
- },
1083
- {
1084
- "epoch": 16.76923076923077,
1085
- "grad_norm": 280.41778564453125,
1086
- "learning_rate": 5.7675e-06,
1087
- "loss": 83.7875,
1088
- "step": 770
1089
- },
1090
- {
1091
- "epoch": 16.87912087912088,
1092
- "grad_norm": 318.2386169433594,
1093
- "learning_rate": 5.805e-06,
1094
- "loss": 87.919,
1095
- "step": 775
1096
- },
1097
- {
1098
- "epoch": 16.98901098901099,
1099
- "grad_norm": 278.9162902832031,
1100
- "learning_rate": 5.8425e-06,
1101
- "loss": 82.1755,
1102
- "step": 780
1103
- },
1104
- {
1105
- "epoch": 17.087912087912088,
1106
- "grad_norm": 267.6657409667969,
1107
- "learning_rate": 5.8800000000000005e-06,
1108
- "loss": 84.2234,
1109
- "step": 785
1110
- },
1111
- {
1112
- "epoch": 17.197802197802197,
1113
- "grad_norm": 286.16162109375,
1114
- "learning_rate": 5.9175000000000005e-06,
1115
- "loss": 81.0474,
1116
- "step": 790
1117
- },
1118
- {
1119
- "epoch": 17.307692307692307,
1120
- "grad_norm": 288.74151611328125,
1121
- "learning_rate": 5.955000000000001e-06,
1122
- "loss": 80.2032,
1123
- "step": 795
1124
- },
1125
- {
1126
- "epoch": 17.417582417582416,
1127
- "grad_norm": 293.4186096191406,
1128
- "learning_rate": 5.992500000000001e-06,
1129
- "loss": 79.8146,
1130
- "step": 800
1131
- },
1132
- {
1133
- "epoch": 17.52747252747253,
1134
- "grad_norm": 294.4632873535156,
1135
- "learning_rate": 6.030000000000001e-06,
1136
- "loss": 78.9017,
1137
- "step": 805
1138
- },
1139
- {
1140
- "epoch": 17.63736263736264,
1141
- "grad_norm": 278.7751159667969,
1142
- "learning_rate": 6.067500000000001e-06,
1143
- "loss": 74.4998,
1144
- "step": 810
1145
- },
1146
- {
1147
- "epoch": 17.747252747252748,
1148
- "grad_norm": 281.1651306152344,
1149
- "learning_rate": 6.105e-06,
1150
- "loss": 73.4448,
1151
- "step": 815
1152
- },
1153
- {
1154
- "epoch": 17.857142857142858,
1155
- "grad_norm": 253.44654846191406,
1156
- "learning_rate": 6.1425e-06,
1157
- "loss": 72.5747,
1158
- "step": 820
1159
- },
1160
- {
1161
- "epoch": 17.967032967032967,
1162
- "grad_norm": 270.03265380859375,
1163
- "learning_rate": 6.18e-06,
1164
- "loss": 72.7589,
1165
- "step": 825
1166
- },
1167
- {
1168
- "epoch": 18.065934065934066,
1169
- "grad_norm": 275.12652587890625,
1170
- "learning_rate": 6.2175e-06,
1171
- "loss": 72.1071,
1172
- "step": 830
1173
- },
1174
- {
1175
- "epoch": 18.175824175824175,
1176
- "grad_norm": 267.95965576171875,
1177
- "learning_rate": 6.255e-06,
1178
- "loss": 71.183,
1179
- "step": 835
1180
- },
1181
- {
1182
- "epoch": 18.285714285714285,
1183
- "grad_norm": 250.47987365722656,
1184
- "learning_rate": 6.2925e-06,
1185
- "loss": 72.3531,
1186
- "step": 840
1187
- },
1188
- {
1189
- "epoch": 18.395604395604394,
1190
- "grad_norm": 284.7598876953125,
1191
- "learning_rate": 6.3299999999999995e-06,
1192
- "loss": 75.0338,
1193
- "step": 845
1194
- },
1195
- {
1196
- "epoch": 18.505494505494504,
1197
- "grad_norm": 303.2804260253906,
1198
- "learning_rate": 6.3675e-06,
1199
- "loss": 73.4008,
1200
- "step": 850
1201
- },
1202
- {
1203
- "epoch": 18.615384615384617,
1204
- "grad_norm": 245.9633026123047,
1205
- "learning_rate": 6.405e-06,
1206
- "loss": 69.7974,
1207
- "step": 855
1208
- },
1209
- {
1210
- "epoch": 18.725274725274726,
1211
- "grad_norm": 243.55807495117188,
1212
- "learning_rate": 6.4425e-06,
1213
- "loss": 67.0503,
1214
- "step": 860
1215
- },
1216
- {
1217
- "epoch": 18.835164835164836,
1218
- "grad_norm": 274.70330810546875,
1219
- "learning_rate": 6.48e-06,
1220
- "loss": 67.1543,
1221
- "step": 865
1222
- },
1223
- {
1224
- "epoch": 18.945054945054945,
1225
- "grad_norm": 233.40606689453125,
1226
- "learning_rate": 6.5175e-06,
1227
- "loss": 64.5814,
1228
- "step": 870
1229
- },
1230
- {
1231
- "epoch": 19.043956043956044,
1232
- "grad_norm": 240.22552490234375,
1233
- "learning_rate": 6.555e-06,
1234
- "loss": 79.7664,
1235
- "step": 875
1236
- },
1237
- {
1238
- "epoch": 19.153846153846153,
1239
- "grad_norm": 242.1979522705078,
1240
- "learning_rate": 6.5925e-06,
1241
- "loss": 64.3813,
1242
- "step": 880
1243
- },
1244
- {
1245
- "epoch": 19.263736263736263,
1246
- "grad_norm": 231.37562561035156,
1247
- "learning_rate": 6.63e-06,
1248
- "loss": 65.6568,
1249
- "step": 885
1250
- },
1251
- {
1252
- "epoch": 19.373626373626372,
1253
- "grad_norm": 235.3353271484375,
1254
- "learning_rate": 6.6675e-06,
1255
- "loss": 63.7893,
1256
- "step": 890
1257
- },
1258
- {
1259
- "epoch": 19.483516483516482,
1260
- "grad_norm": 322.8166198730469,
1261
- "learning_rate": 6.705e-06,
1262
- "loss": 64.4515,
1263
- "step": 895
1264
- },
1265
- {
1266
- "epoch": 19.593406593406595,
1267
- "grad_norm": 226.3159637451172,
1268
- "learning_rate": 6.7425e-06,
1269
- "loss": 63.5289,
1270
- "step": 900
1271
- },
1272
- {
1273
- "epoch": 19.703296703296704,
1274
- "grad_norm": 218.580322265625,
1275
- "learning_rate": 6.78e-06,
1276
- "loss": 61.9305,
1277
- "step": 905
1278
- },
1279
- {
1280
- "epoch": 19.813186813186814,
1281
- "grad_norm": 263.286376953125,
1282
- "learning_rate": 6.8175e-06,
1283
- "loss": 58.6838,
1284
- "step": 910
1285
- },
1286
- {
1287
- "epoch": 19.923076923076923,
1288
- "grad_norm": 208.91209411621094,
1289
- "learning_rate": 6.8550000000000004e-06,
1290
- "loss": 57.9055,
1291
- "step": 915
1292
- },
1293
- {
1294
- "epoch": 20.021978021978022,
1295
- "grad_norm": 211.89749145507812,
1296
- "learning_rate": 6.8925000000000005e-06,
1297
- "loss": 59.9611,
1298
- "step": 920
1299
- },
1300
- {
1301
- "epoch": 20.13186813186813,
1302
- "grad_norm": 203.60804748535156,
1303
- "learning_rate": 6.9300000000000006e-06,
1304
- "loss": 57.4417,
1305
- "step": 925
1306
- },
1307
- {
1308
- "epoch": 20.24175824175824,
1309
- "grad_norm": 217.0232696533203,
1310
- "learning_rate": 6.967500000000001e-06,
1311
- "loss": 57.6954,
1312
- "step": 930
1313
- },
1314
- {
1315
- "epoch": 20.35164835164835,
1316
- "grad_norm": 215.05755615234375,
1317
- "learning_rate": 7.005000000000001e-06,
1318
- "loss": 58.4159,
1319
- "step": 935
1320
- },
1321
- {
1322
- "epoch": 20.46153846153846,
1323
- "grad_norm": 208.3353729248047,
1324
- "learning_rate": 7.0425e-06,
1325
- "loss": 54.717,
1326
- "step": 940
1327
- },
1328
- {
1329
- "epoch": 20.571428571428573,
1330
- "grad_norm": 205.00282287597656,
1331
- "learning_rate": 7.08e-06,
1332
- "loss": 53.7791,
1333
- "step": 945
1334
- },
1335
- {
1336
- "epoch": 20.681318681318682,
1337
- "grad_norm": 197.68243408203125,
1338
- "learning_rate": 7.1175e-06,
1339
- "loss": 53.9224,
1340
- "step": 950
1341
- },
1342
- {
1343
- "epoch": 20.791208791208792,
1344
- "grad_norm": 183.80923461914062,
1345
- "learning_rate": 7.155e-06,
1346
- "loss": 51.6971,
1347
- "step": 955
1348
- },
1349
- {
1350
- "epoch": 20.9010989010989,
1351
- "grad_norm": 201.22625732421875,
1352
- "learning_rate": 7.1925e-06,
1353
- "loss": 50.1511,
1354
- "step": 960
1355
- },
1356
- {
1357
- "epoch": 21.0,
1358
- "grad_norm": 433.5864562988281,
1359
- "learning_rate": 7.23e-06,
1360
- "loss": 53.6671,
1361
- "step": 965
1362
- },
1363
- {
1364
- "epoch": 21.10989010989011,
1365
- "grad_norm": 180.5747833251953,
1366
- "learning_rate": 7.2675e-06,
1367
- "loss": 51.5476,
1368
- "step": 970
1369
- },
1370
- {
1371
- "epoch": 21.21978021978022,
1372
- "grad_norm": 187.3274383544922,
1373
- "learning_rate": 7.305e-06,
1374
- "loss": 49.8905,
1375
- "step": 975
1376
- },
1377
- {
1378
- "epoch": 21.32967032967033,
1379
- "grad_norm": 175.39138793945312,
1380
- "learning_rate": 7.3425000000000004e-06,
1381
- "loss": 50.4588,
1382
- "step": 980
1383
- },
1384
- {
1385
- "epoch": 21.439560439560438,
1386
- "grad_norm": 173.7222900390625,
1387
- "learning_rate": 7.3800000000000005e-06,
1388
- "loss": 48.5218,
1389
- "step": 985
1390
- },
1391
- {
1392
- "epoch": 21.54945054945055,
1393
- "grad_norm": 429.6211242675781,
1394
- "learning_rate": 7.4175e-06,
1395
- "loss": 48.636,
1396
- "step": 990
1397
- },
1398
- {
1399
- "epoch": 21.65934065934066,
1400
- "grad_norm": 178.93069458007812,
1401
- "learning_rate": 7.455e-06,
1402
- "loss": 49.8087,
1403
- "step": 995
1404
- },
1405
- {
1406
- "epoch": 21.76923076923077,
1407
- "grad_norm": 232.20291137695312,
1408
- "learning_rate": 7.4925e-06,
1409
- "loss": 47.1187,
1410
- "step": 1000
1411
- },
1412
- {
1413
- "epoch": 21.87912087912088,
1414
- "grad_norm": 181.4752655029297,
1415
- "learning_rate": 7.53e-06,
1416
- "loss": 51.2776,
1417
- "step": 1005
1418
- },
1419
- {
1420
- "epoch": 21.98901098901099,
1421
- "grad_norm": 167.86483764648438,
1422
- "learning_rate": 7.567499999999999e-06,
1423
- "loss": 46.6794,
1424
- "step": 1010
1425
- },
1426
- {
1427
- "epoch": 22.087912087912088,
1428
- "grad_norm": 172.2127685546875,
1429
- "learning_rate": 7.605e-06,
1430
- "loss": 44.9643,
1431
- "step": 1015
1432
- },
1433
- {
1434
- "epoch": 22.197802197802197,
1435
- "grad_norm": 184.90206909179688,
1436
- "learning_rate": 7.6425e-06,
1437
- "loss": 45.4152,
1438
- "step": 1020
1439
- },
1440
- {
1441
- "epoch": 22.307692307692307,
1442
- "grad_norm": 155.5305938720703,
1443
- "learning_rate": 7.680000000000001e-06,
1444
- "loss": 45.4304,
1445
- "step": 1025
1446
- },
1447
- {
1448
- "epoch": 22.417582417582416,
1449
- "grad_norm": 670.8698120117188,
1450
- "learning_rate": 7.7175e-06,
1451
- "loss": 44.4255,
1452
- "step": 1030
1453
- },
1454
- {
1455
- "epoch": 22.52747252747253,
1456
- "grad_norm": 162.875244140625,
1457
- "learning_rate": 7.755000000000001e-06,
1458
- "loss": 43.0364,
1459
- "step": 1035
1460
- },
1461
- {
1462
- "epoch": 22.63736263736264,
1463
- "grad_norm": 159.903076171875,
1464
- "learning_rate": 7.7925e-06,
1465
- "loss": 44.4935,
1466
- "step": 1040
1467
- },
1468
- {
1469
- "epoch": 22.747252747252748,
1470
- "grad_norm": 161.0757598876953,
1471
- "learning_rate": 7.830000000000001e-06,
1472
- "loss": 43.3238,
1473
- "step": 1045
1474
- },
1475
- {
1476
- "epoch": 22.857142857142858,
1477
- "grad_norm": 140.3941650390625,
1478
- "learning_rate": 7.8675e-06,
1479
- "loss": 42.2547,
1480
- "step": 1050
1481
- },
1482
- {
1483
- "epoch": 22.967032967032967,
1484
- "grad_norm": 164.62034606933594,
1485
- "learning_rate": 7.905000000000001e-06,
1486
- "loss": 42.7775,
1487
- "step": 1055
1488
- },
1489
- {
1490
- "epoch": 23.065934065934066,
1491
- "grad_norm": 148.73211669921875,
1492
- "learning_rate": 7.942499999999999e-06,
1493
- "loss": 42.7699,
1494
- "step": 1060
1495
- },
1496
- {
1497
- "epoch": 23.175824175824175,
1498
- "grad_norm": 137.26893615722656,
1499
- "learning_rate": 7.98e-06,
1500
- "loss": 40.5411,
1501
- "step": 1065
1502
- },
1503
- {
1504
- "epoch": 23.285714285714285,
1505
- "grad_norm": 143.8221893310547,
1506
- "learning_rate": 8.017499999999999e-06,
1507
- "loss": 39.9001,
1508
- "step": 1070
1509
- },
1510
- {
1511
- "epoch": 23.395604395604394,
1512
- "grad_norm": 160.19586181640625,
1513
- "learning_rate": 8.055e-06,
1514
- "loss": 39.8523,
1515
- "step": 1075
1516
- },
1517
- {
1518
- "epoch": 23.505494505494504,
1519
- "grad_norm": 146.14981079101562,
1520
- "learning_rate": 8.0925e-06,
1521
- "loss": 39.7639,
1522
- "step": 1080
1523
- },
1524
- {
1525
- "epoch": 23.615384615384617,
1526
- "grad_norm": 132.55712890625,
1527
- "learning_rate": 8.13e-06,
1528
- "loss": 39.7051,
1529
- "step": 1085
1530
- },
1531
- {
1532
- "epoch": 23.725274725274726,
1533
- "grad_norm": 137.0689239501953,
1534
- "learning_rate": 8.1675e-06,
1535
- "loss": 39.1014,
1536
- "step": 1090
1537
- },
1538
- {
1539
- "epoch": 23.835164835164836,
1540
- "grad_norm": 125.66040802001953,
1541
- "learning_rate": 8.205e-06,
1542
- "loss": 38.4976,
1543
- "step": 1095
1544
- },
1545
- {
1546
- "epoch": 23.945054945054945,
1547
- "grad_norm": 118.39325714111328,
1548
- "learning_rate": 8.2425e-06,
1549
- "loss": 39.2203,
1550
- "step": 1100
1551
- },
1552
- {
1553
- "epoch": 24.043956043956044,
1554
- "grad_norm": 131.4208221435547,
1555
- "learning_rate": 8.28e-06,
1556
- "loss": 37.6028,
1557
- "step": 1105
1558
- },
1559
- {
1560
- "epoch": 24.153846153846153,
1561
- "grad_norm": 131.80392456054688,
1562
- "learning_rate": 8.3175e-06,
1563
- "loss": 38.8382,
1564
- "step": 1110
1565
- },
1566
- {
1567
- "epoch": 24.263736263736263,
1568
- "grad_norm": 120.43126678466797,
1569
- "learning_rate": 8.355e-06,
1570
- "loss": 37.7083,
1571
- "step": 1115
1572
- },
1573
- {
1574
- "epoch": 24.373626373626372,
1575
- "grad_norm": 147.37368774414062,
1576
- "learning_rate": 8.3925e-06,
1577
- "loss": 36.4178,
1578
- "step": 1120
1579
- },
1580
- {
1581
- "epoch": 24.483516483516482,
1582
- "grad_norm": 162.19711303710938,
1583
- "learning_rate": 8.43e-06,
1584
- "loss": 36.3008,
1585
- "step": 1125
1586
- },
1587
- {
1588
- "epoch": 24.593406593406595,
1589
- "grad_norm": 128.9326171875,
1590
- "learning_rate": 8.4675e-06,
1591
- "loss": 35.7747,
1592
- "step": 1130
1593
- },
1594
- {
1595
- "epoch": 24.703296703296704,
1596
- "grad_norm": 123.9283218383789,
1597
- "learning_rate": 8.504999999999999e-06,
1598
- "loss": 33.9809,
1599
- "step": 1135
1600
- },
1601
- {
1602
- "epoch": 24.813186813186814,
1603
- "grad_norm": 125.92423248291016,
1604
- "learning_rate": 8.5425e-06,
1605
- "loss": 35.4138,
1606
- "step": 1140
1607
- },
1608
- {
1609
- "epoch": 24.923076923076923,
1610
- "grad_norm": 129.04722595214844,
1611
- "learning_rate": 8.58e-06,
1612
- "loss": 35.177,
1613
- "step": 1145
1614
- },
1615
- {
1616
- "epoch": 25.021978021978022,
1617
- "grad_norm": 126.06185913085938,
1618
- "learning_rate": 8.6175e-06,
1619
- "loss": 34.5535,
1620
- "step": 1150
1621
- },
1622
- {
1623
- "epoch": 25.13186813186813,
1624
- "grad_norm": 122.82085418701172,
1625
- "learning_rate": 8.655e-06,
1626
- "loss": 34.716,
1627
- "step": 1155
1628
- },
1629
- {
1630
- "epoch": 25.24175824175824,
1631
- "grad_norm": 110.43183898925781,
1632
- "learning_rate": 8.6925e-06,
1633
- "loss": 33.8336,
1634
- "step": 1160
1635
- },
1636
- {
1637
- "epoch": 25.35164835164835,
1638
- "grad_norm": 111.39581298828125,
1639
- "learning_rate": 8.73e-06,
1640
- "loss": 33.2828,
1641
- "step": 1165
1642
- },
1643
- {
1644
- "epoch": 25.46153846153846,
1645
- "grad_norm": 133.24420166015625,
1646
- "learning_rate": 8.7675e-06,
1647
- "loss": 33.4885,
1648
- "step": 1170
1649
- },
1650
- {
1651
- "epoch": 25.571428571428573,
1652
- "grad_norm": 117.64434051513672,
1653
- "learning_rate": 8.805e-06,
1654
- "loss": 33.8398,
1655
- "step": 1175
1656
- },
1657
- {
1658
- "epoch": 25.681318681318682,
1659
- "grad_norm": 117.87113952636719,
1660
- "learning_rate": 8.8425e-06,
1661
- "loss": 31.6979,
1662
- "step": 1180
1663
- },
1664
- {
1665
- "epoch": 25.791208791208792,
1666
- "grad_norm": 115.08634948730469,
1667
- "learning_rate": 8.88e-06,
1668
- "loss": 32.0385,
1669
- "step": 1185
1670
- },
1671
- {
1672
- "epoch": 25.9010989010989,
1673
- "grad_norm": 127.63653564453125,
1674
- "learning_rate": 8.9175e-06,
1675
- "loss": 31.6513,
1676
- "step": 1190
1677
- },
1678
- {
1679
- "epoch": 26.0,
1680
- "grad_norm": 283.0741271972656,
1681
- "learning_rate": 8.955e-06,
1682
- "loss": 33.2661,
1683
- "step": 1195
1684
- },
1685
- {
1686
- "epoch": 26.10989010989011,
1687
- "grad_norm": 110.21710968017578,
1688
- "learning_rate": 8.9925e-06,
1689
- "loss": 30.7823,
1690
- "step": 1200
1691
- },
1692
- {
1693
- "epoch": 26.21978021978022,
1694
- "grad_norm": 100.66688537597656,
1695
- "learning_rate": 9.03e-06,
1696
- "loss": 31.5021,
1697
- "step": 1205
1698
- },
1699
- {
1700
- "epoch": 26.32967032967033,
1701
- "grad_norm": 105.822021484375,
1702
- "learning_rate": 9.067500000000001e-06,
1703
- "loss": 30.0609,
1704
- "step": 1210
1705
- },
1706
- {
1707
- "epoch": 26.439560439560438,
1708
- "grad_norm": 99.89389038085938,
1709
- "learning_rate": 9.105e-06,
1710
- "loss": 30.5733,
1711
- "step": 1215
1712
- },
1713
- {
1714
- "epoch": 26.54945054945055,
1715
- "grad_norm": 96.10974884033203,
1716
- "learning_rate": 9.142500000000001e-06,
1717
- "loss": 29.9534,
1718
- "step": 1220
1719
- },
1720
- {
1721
- "epoch": 26.65934065934066,
1722
- "grad_norm": 112.5010986328125,
1723
- "learning_rate": 9.18e-06,
1724
- "loss": 30.4645,
1725
- "step": 1225
1726
- },
1727
- {
1728
- "epoch": 26.76923076923077,
1729
- "grad_norm": 113.7469253540039,
1730
- "learning_rate": 9.217500000000001e-06,
1731
- "loss": 29.8336,
1732
- "step": 1230
1733
- },
1734
- {
1735
- "epoch": 26.87912087912088,
1736
- "grad_norm": 99.2851791381836,
1737
- "learning_rate": 9.255e-06,
1738
- "loss": 28.8012,
1739
- "step": 1235
1740
- },
1741
- {
1742
- "epoch": 26.98901098901099,
1743
- "grad_norm": 92.29454040527344,
1744
- "learning_rate": 9.292500000000001e-06,
1745
- "loss": 29.5607,
1746
- "step": 1240
1747
- },
1748
- {
1749
- "epoch": 27.087912087912088,
1750
- "grad_norm": 368.2291564941406,
1751
- "learning_rate": 9.33e-06,
1752
- "loss": 57.1964,
1753
- "step": 1245
1754
- },
1755
- {
1756
- "epoch": 27.197802197802197,
1757
- "grad_norm": 87.99185943603516,
1758
- "learning_rate": 9.367500000000001e-06,
1759
- "loss": 29.8244,
1760
- "step": 1250
1761
- },
1762
- {
1763
- "epoch": 27.307692307692307,
1764
- "grad_norm": 87.86891174316406,
1765
- "learning_rate": 9.405e-06,
1766
- "loss": 29.159,
1767
- "step": 1255
1768
- },
1769
- {
1770
- "epoch": 27.417582417582416,
1771
- "grad_norm": 106.52743530273438,
1772
- "learning_rate": 9.4425e-06,
1773
- "loss": 28.5417,
1774
- "step": 1260
1775
- },
1776
- {
1777
- "epoch": 27.52747252747253,
1778
- "grad_norm": 105.37548828125,
1779
- "learning_rate": 9.48e-06,
1780
- "loss": 29.6107,
1781
- "step": 1265
1782
- },
1783
- {
1784
- "epoch": 27.63736263736264,
1785
- "grad_norm": 126.07006072998047,
1786
- "learning_rate": 9.5175e-06,
1787
- "loss": 28.5252,
1788
- "step": 1270
1789
- },
1790
- {
1791
- "epoch": 27.747252747252748,
1792
- "grad_norm": 102.43346405029297,
1793
- "learning_rate": 9.555e-06,
1794
- "loss": 26.8626,
1795
- "step": 1275
1796
- },
1797
- {
1798
- "epoch": 27.857142857142858,
1799
- "grad_norm": 95.87886047363281,
1800
- "learning_rate": 9.5925e-06,
1801
- "loss": 27.4449,
1802
- "step": 1280
1803
- },
1804
- {
1805
- "epoch": 27.967032967032967,
1806
- "grad_norm": 183.47979736328125,
1807
- "learning_rate": 9.630000000000001e-06,
1808
- "loss": 26.4121,
1809
- "step": 1285
1810
- },
1811
- {
1812
- "epoch": 28.065934065934066,
1813
- "grad_norm": 95.40010070800781,
1814
- "learning_rate": 9.6675e-06,
1815
- "loss": 35.9095,
1816
- "step": 1290
1817
- },
1818
- {
1819
- "epoch": 28.175824175824175,
1820
- "grad_norm": 90.59423828125,
1821
- "learning_rate": 9.705000000000001e-06,
1822
- "loss": 26.6954,
1823
- "step": 1295
1824
- },
1825
- {
1826
- "epoch": 28.285714285714285,
1827
- "grad_norm": 91.06011199951172,
1828
- "learning_rate": 9.7425e-06,
1829
- "loss": 26.3503,
1830
- "step": 1300
1831
- },
1832
- {
1833
- "epoch": 28.395604395604394,
1834
- "grad_norm": 88.42830657958984,
1835
- "learning_rate": 9.780000000000001e-06,
1836
- "loss": 26.0797,
1837
- "step": 1305
1838
- },
1839
- {
1840
- "epoch": 28.505494505494504,
1841
- "grad_norm": 80.00355529785156,
1842
- "learning_rate": 9.8175e-06,
1843
- "loss": 25.8798,
1844
- "step": 1310
1845
- },
1846
- {
1847
- "epoch": 28.615384615384617,
1848
- "grad_norm": 81.79782104492188,
1849
- "learning_rate": 9.855000000000001e-06,
1850
- "loss": 26.4297,
1851
- "step": 1315
1852
  },
1853
  {
1854
- "epoch": 28.725274725274726,
1855
- "grad_norm": 75.8064193725586,
1856
- "learning_rate": 9.8925e-06,
1857
- "loss": 26.3461,
1858
- "step": 1320
1859
  },
1860
  {
1861
- "epoch": 28.835164835164836,
1862
- "grad_norm": 85.9918212890625,
1863
- "learning_rate": 9.930000000000001e-06,
1864
- "loss": 26.0146,
1865
- "step": 1325
1866
  },
1867
  {
1868
- "epoch": 28.945054945054945,
1869
- "grad_norm": 94.3312759399414,
1870
- "learning_rate": 9.9675e-06,
1871
- "loss": 25.2894,
1872
- "step": 1330
1873
  },
1874
  {
1875
- "epoch": 29.043956043956044,
1876
- "grad_norm": 126.92018127441406,
1877
- "learning_rate": 1.0005000000000002e-05,
1878
- "loss": 24.2071,
1879
- "step": 1335
1880
  },
1881
  {
1882
- "epoch": 29.153846153846153,
1883
- "grad_norm": 81.69698333740234,
1884
- "learning_rate": 1.00425e-05,
1885
- "loss": 25.3439,
1886
- "step": 1340
1887
  },
1888
  {
1889
- "epoch": 29.263736263736263,
1890
- "grad_norm": 74.29949188232422,
1891
- "learning_rate": 1.008e-05,
1892
- "loss": 25.7623,
1893
- "step": 1345
1894
  },
1895
  {
1896
- "epoch": 29.373626373626372,
1897
- "grad_norm": 93.50883483886719,
1898
- "learning_rate": 1.01175e-05,
1899
- "loss": 25.0202,
1900
- "step": 1350
1901
  },
1902
  {
1903
- "epoch": 29.483516483516482,
1904
- "grad_norm": 90.64075469970703,
1905
- "learning_rate": 1.0155e-05,
1906
- "loss": 25.1573,
1907
- "step": 1355
1908
  },
1909
  {
1910
- "epoch": 29.593406593406595,
1911
- "grad_norm": 89.82637786865234,
1912
- "learning_rate": 1.01925e-05,
1913
- "loss": 24.7463,
1914
- "step": 1360
1915
  },
1916
  {
1917
- "epoch": 29.703296703296704,
1918
- "grad_norm": 83.58129119873047,
1919
- "learning_rate": 1.023e-05,
1920
- "loss": 23.3669,
1921
- "step": 1365
1922
  },
1923
  {
1924
- "epoch": 29.813186813186814,
1925
- "grad_norm": 91.34410095214844,
1926
- "learning_rate": 1.02675e-05,
1927
- "loss": 25.5278,
1928
- "step": 1370
1929
  },
1930
  {
1931
- "epoch": 29.923076923076923,
1932
- "grad_norm": 118.92642211914062,
1933
- "learning_rate": 1.0305e-05,
1934
- "loss": 24.4363,
1935
- "step": 1375
1936
  },
1937
  {
1938
- "epoch": 30.021978021978022,
1939
- "grad_norm": 75.5335693359375,
1940
- "learning_rate": 1.03425e-05,
1941
- "loss": 32.3354,
1942
- "step": 1380
1943
  },
1944
  {
1945
- "epoch": 30.13186813186813,
1946
- "grad_norm": 76.67096710205078,
1947
- "learning_rate": 1.0379999999999999e-05,
1948
- "loss": 24.7026,
1949
- "step": 1385
1950
  },
1951
  {
1952
- "epoch": 30.24175824175824,
1953
- "grad_norm": 76.3076400756836,
1954
- "learning_rate": 1.04175e-05,
1955
- "loss": 23.3343,
1956
- "step": 1390
1957
  },
1958
  {
1959
- "epoch": 30.35164835164835,
1960
- "grad_norm": 76.03828430175781,
1961
- "learning_rate": 1.0454999999999999e-05,
1962
- "loss": 23.62,
1963
- "step": 1395
1964
  },
1965
  {
1966
- "epoch": 30.46153846153846,
1967
- "grad_norm": 86.96392059326172,
1968
- "learning_rate": 1.04925e-05,
1969
- "loss": 23.4644,
1970
- "step": 1400
1971
  },
1972
  {
1973
- "epoch": 30.571428571428573,
1974
- "grad_norm": 76.41240692138672,
1975
- "learning_rate": 1.0529999999999999e-05,
1976
- "loss": 22.895,
1977
- "step": 1405
1978
  },
1979
  {
1980
- "epoch": 30.681318681318682,
1981
- "grad_norm": 67.73085021972656,
1982
- "learning_rate": 1.05675e-05,
1983
- "loss": 23.6923,
1984
- "step": 1410
1985
  },
1986
  {
1987
- "epoch": 30.791208791208792,
1988
- "grad_norm": 141.68394470214844,
1989
- "learning_rate": 1.0605e-05,
1990
- "loss": 22.9296,
1991
- "step": 1415
1992
  },
1993
  {
1994
- "epoch": 30.9010989010989,
1995
- "grad_norm": 102.01689910888672,
1996
- "learning_rate": 1.06425e-05,
1997
- "loss": 22.6854,
1998
- "step": 1420
1999
  },
2000
  {
2001
- "epoch": 31.0,
2002
- "grad_norm": 252.77476501464844,
2003
- "learning_rate": 1.068e-05,
2004
- "loss": 24.2562,
2005
- "step": 1425
2006
  },
2007
  {
2008
- "epoch": 31.10989010989011,
2009
- "grad_norm": 79.69669342041016,
2010
- "learning_rate": 1.07175e-05,
2011
- "loss": 22.6885,
2012
- "step": 1430
2013
  },
2014
  {
2015
- "epoch": 31.21978021978022,
2016
- "grad_norm": 73.90184783935547,
2017
- "learning_rate": 1.0755e-05,
2018
- "loss": 21.5442,
2019
- "step": 1435
2020
  },
2021
  {
2022
- "epoch": 31.32967032967033,
2023
- "grad_norm": 100.4890365600586,
2024
- "learning_rate": 1.07925e-05,
2025
- "loss": 22.8439,
2026
- "step": 1440
2027
  },
2028
  {
2029
- "epoch": 31.439560439560438,
2030
- "grad_norm": 73.010009765625,
2031
- "learning_rate": 1.083e-05,
2032
- "loss": 22.7848,
2033
- "step": 1445
2034
  },
2035
  {
2036
- "epoch": 31.54945054945055,
2037
- "grad_norm": 107.31619262695312,
2038
  "learning_rate": 1.08675e-05,
2039
- "loss": 22.1164,
2040
  "step": 1450
2041
  },
2042
  {
2043
- "epoch": 31.65934065934066,
2044
- "grad_norm": 72.39157104492188,
2045
- "learning_rate": 1.0905e-05,
2046
- "loss": 21.4437,
2047
- "step": 1455
2048
- },
2049
- {
2050
- "epoch": 31.76923076923077,
2051
- "grad_norm": 79.01976013183594,
2052
- "learning_rate": 1.09425e-05,
2053
- "loss": 22.4324,
2054
- "step": 1460
2055
- },
2056
- {
2057
- "epoch": 31.87912087912088,
2058
- "grad_norm": 94.18357849121094,
2059
- "learning_rate": 1.098e-05,
2060
- "loss": 23.0076,
2061
- "step": 1465
2062
- },
2063
- {
2064
- "epoch": 31.98901098901099,
2065
- "grad_norm": 76.7142333984375,
2066
- "learning_rate": 1.10175e-05,
2067
- "loss": 21.8061,
2068
- "step": 1470
2069
- },
2070
- {
2071
- "epoch": 32.08791208791209,
2072
- "grad_norm": 65.66316986083984,
2073
- "learning_rate": 1.1055e-05,
2074
- "loss": 22.7557,
2075
- "step": 1475
2076
- },
2077
- {
2078
- "epoch": 32.1978021978022,
2079
- "grad_norm": 85.76339721679688,
2080
- "learning_rate": 1.1092500000000001e-05,
2081
- "loss": 20.9964,
2082
- "step": 1480
2083
- },
2084
- {
2085
- "epoch": 32.30769230769231,
2086
- "grad_norm": 84.71622467041016,
2087
- "learning_rate": 1.113e-05,
2088
- "loss": 21.0421,
2089
- "step": 1485
2090
- },
2091
- {
2092
- "epoch": 32.417582417582416,
2093
- "grad_norm": 94.56781768798828,
2094
- "learning_rate": 1.1167500000000001e-05,
2095
- "loss": 21.6047,
2096
- "step": 1490
2097
- },
2098
- {
2099
- "epoch": 32.527472527472526,
2100
- "grad_norm": 71.27996063232422,
2101
- "learning_rate": 1.1205e-05,
2102
- "loss": 21.281,
2103
- "step": 1495
2104
- },
2105
- {
2106
- "epoch": 32.637362637362635,
2107
- "grad_norm": 58.75380325317383,
2108
  "learning_rate": 1.1242500000000001e-05,
2109
- "loss": 20.1862,
2110
  "step": 1500
2111
  },
2112
  {
2113
- "epoch": 32.747252747252745,
2114
- "grad_norm": 73.71588134765625,
2115
- "learning_rate": 1.128e-05,
2116
- "loss": 20.9145,
2117
- "step": 1505
2118
- },
2119
- {
2120
- "epoch": 32.857142857142854,
2121
- "grad_norm": 109.56021881103516,
2122
- "learning_rate": 1.13175e-05,
2123
- "loss": 21.6375,
2124
- "step": 1510
2125
- },
2126
- {
2127
- "epoch": 32.967032967032964,
2128
- "grad_norm": 63.0958251953125,
2129
- "learning_rate": 1.1355e-05,
2130
- "loss": 21.3934,
2131
- "step": 1515
2132
- },
2133
- {
2134
- "epoch": 33.065934065934066,
2135
- "grad_norm": 72.25486755371094,
2136
- "learning_rate": 1.13925e-05,
2137
- "loss": 20.5177,
2138
- "step": 1520
2139
- },
2140
- {
2141
- "epoch": 33.175824175824175,
2142
- "grad_norm": 79.46709442138672,
2143
- "learning_rate": 1.143e-05,
2144
- "loss": 20.1796,
2145
- "step": 1525
2146
- },
2147
- {
2148
- "epoch": 33.285714285714285,
2149
- "grad_norm": 80.37206268310547,
2150
- "learning_rate": 1.14675e-05,
2151
- "loss": 20.519,
2152
- "step": 1530
2153
- },
2154
- {
2155
- "epoch": 33.395604395604394,
2156
- "grad_norm": 88.30254364013672,
2157
- "learning_rate": 1.1505e-05,
2158
- "loss": 20.5289,
2159
- "step": 1535
2160
- },
2161
- {
2162
- "epoch": 33.505494505494504,
2163
- "grad_norm": 59.99192428588867,
2164
- "learning_rate": 1.15425e-05,
2165
- "loss": 20.5064,
2166
- "step": 1540
2167
- },
2168
- {
2169
- "epoch": 33.61538461538461,
2170
- "grad_norm": 71.29468536376953,
2171
- "learning_rate": 1.1580000000000001e-05,
2172
- "loss": 19.2822,
2173
- "step": 1545
2174
- },
2175
- {
2176
- "epoch": 33.72527472527472,
2177
- "grad_norm": 67.42713928222656,
2178
  "learning_rate": 1.16175e-05,
2179
- "loss": 20.9368,
2180
  "step": 1550
2181
  },
2182
  {
2183
- "epoch": 33.83516483516483,
2184
- "grad_norm": 78.21910858154297,
2185
- "learning_rate": 1.1655000000000001e-05,
2186
- "loss": 19.9386,
2187
- "step": 1555
2188
- },
2189
- {
2190
- "epoch": 33.94505494505494,
2191
- "grad_norm": 133.632080078125,
2192
- "learning_rate": 1.16925e-05,
2193
- "loss": 20.3614,
2194
- "step": 1560
2195
- },
2196
- {
2197
- "epoch": 34.043956043956044,
2198
- "grad_norm": 56.151954650878906,
2199
- "learning_rate": 1.1730000000000001e-05,
2200
- "loss": 18.905,
2201
- "step": 1565
2202
- },
2203
- {
2204
- "epoch": 34.15384615384615,
2205
- "grad_norm": 72.71345520019531,
2206
- "learning_rate": 1.17675e-05,
2207
- "loss": 20.4241,
2208
- "step": 1570
2209
- },
2210
- {
2211
- "epoch": 34.26373626373626,
2212
- "grad_norm": 74.98194122314453,
2213
- "learning_rate": 1.1805000000000001e-05,
2214
- "loss": 19.3519,
2215
- "step": 1575
2216
- },
2217
- {
2218
- "epoch": 34.37362637362637,
2219
- "grad_norm": 67.34236145019531,
2220
- "learning_rate": 1.18425e-05,
2221
- "loss": 18.7936,
2222
- "step": 1580
2223
- },
2224
- {
2225
- "epoch": 34.48351648351648,
2226
- "grad_norm": 72.32279205322266,
2227
- "learning_rate": 1.1880000000000001e-05,
2228
- "loss": 20.0671,
2229
- "step": 1585
2230
- },
2231
- {
2232
- "epoch": 34.59340659340659,
2233
- "grad_norm": 54.22994613647461,
2234
- "learning_rate": 1.19175e-05,
2235
- "loss": 19.3878,
2236
- "step": 1590
2237
- },
2238
- {
2239
- "epoch": 34.7032967032967,
2240
- "grad_norm": 72.89130401611328,
2241
- "learning_rate": 1.1955000000000002e-05,
2242
- "loss": 19.4714,
2243
- "step": 1595
2244
- },
2245
- {
2246
- "epoch": 34.81318681318681,
2247
- "grad_norm": 65.4543685913086,
2248
  "learning_rate": 1.19925e-05,
2249
- "loss": 20.4795,
2250
  "step": 1600
2251
  },
2252
  {
2253
- "epoch": 34.92307692307692,
2254
- "grad_norm": 65.6441650390625,
2255
- "learning_rate": 1.2030000000000002e-05,
2256
- "loss": 18.6467,
2257
- "step": 1605
2258
- },
2259
- {
2260
- "epoch": 35.02197802197802,
2261
- "grad_norm": 82.81990051269531,
2262
- "learning_rate": 1.2067500000000001e-05,
2263
- "loss": 26.0395,
2264
- "step": 1610
2265
- },
2266
- {
2267
- "epoch": 35.13186813186813,
2268
- "grad_norm": 62.68242645263672,
2269
- "learning_rate": 1.2105000000000002e-05,
2270
- "loss": 19.2074,
2271
- "step": 1615
2272
- },
2273
- {
2274
- "epoch": 35.24175824175824,
2275
- "grad_norm": 79.71977996826172,
2276
- "learning_rate": 1.2142500000000001e-05,
2277
- "loss": 20.164,
2278
- "step": 1620
2279
- },
2280
- {
2281
- "epoch": 35.35164835164835,
2282
- "grad_norm": 82.37974548339844,
2283
- "learning_rate": 1.2180000000000002e-05,
2284
- "loss": 18.7425,
2285
- "step": 1625
2286
- },
2287
- {
2288
- "epoch": 35.46153846153846,
2289
- "grad_norm": 68.9244155883789,
2290
- "learning_rate": 1.22175e-05,
2291
- "loss": 18.8626,
2292
- "step": 1630
2293
- },
2294
- {
2295
- "epoch": 35.57142857142857,
2296
- "grad_norm": 72.45255279541016,
2297
- "learning_rate": 1.2254999999999999e-05,
2298
- "loss": 18.4994,
2299
- "step": 1635
2300
- },
2301
- {
2302
- "epoch": 35.68131868131868,
2303
- "grad_norm": 76.19093322753906,
2304
- "learning_rate": 1.22925e-05,
2305
- "loss": 18.911,
2306
- "step": 1640
2307
- },
2308
- {
2309
- "epoch": 35.79120879120879,
2310
- "grad_norm": 50.99027633666992,
2311
- "learning_rate": 1.2329999999999999e-05,
2312
- "loss": 18.2682,
2313
- "step": 1645
2314
- },
2315
- {
2316
- "epoch": 35.9010989010989,
2317
- "grad_norm": 79.6258316040039,
2318
  "learning_rate": 1.23675e-05,
2319
- "loss": 19.1667,
2320
  "step": 1650
2321
  },
2322
  {
2323
- "epoch": 36.0,
2324
- "grad_norm": 198.5669403076172,
2325
- "learning_rate": 1.2404999999999999e-05,
2326
- "loss": 18.7113,
2327
- "step": 1655
2328
- },
2329
- {
2330
- "epoch": 36.10989010989011,
2331
- "grad_norm": 52.13261032104492,
2332
- "learning_rate": 1.24425e-05,
2333
- "loss": 17.3056,
2334
- "step": 1660
2335
- },
2336
- {
2337
- "epoch": 36.21978021978022,
2338
- "grad_norm": 64.57988739013672,
2339
- "learning_rate": 1.2479999999999999e-05,
2340
- "loss": 17.8365,
2341
- "step": 1665
2342
- },
2343
- {
2344
- "epoch": 36.32967032967033,
2345
- "grad_norm": 111.00614929199219,
2346
- "learning_rate": 1.25175e-05,
2347
- "loss": 18.5584,
2348
- "step": 1670
2349
- },
2350
- {
2351
- "epoch": 36.43956043956044,
2352
- "grad_norm": 77.30622863769531,
2353
- "learning_rate": 1.2555e-05,
2354
- "loss": 18.0333,
2355
- "step": 1675
2356
- },
2357
- {
2358
- "epoch": 36.54945054945055,
2359
- "grad_norm": 60.33699417114258,
2360
- "learning_rate": 1.25925e-05,
2361
- "loss": 18.5811,
2362
- "step": 1680
2363
- },
2364
- {
2365
- "epoch": 36.65934065934066,
2366
- "grad_norm": 84.54650115966797,
2367
- "learning_rate": 1.263e-05,
2368
- "loss": 18.7701,
2369
- "step": 1685
2370
- },
2371
- {
2372
- "epoch": 36.76923076923077,
2373
- "grad_norm": 73.11846923828125,
2374
- "learning_rate": 1.26675e-05,
2375
- "loss": 18.816,
2376
- "step": 1690
2377
- },
2378
- {
2379
- "epoch": 36.879120879120876,
2380
- "grad_norm": 61.516761779785156,
2381
- "learning_rate": 1.2705e-05,
2382
- "loss": 17.5013,
2383
- "step": 1695
2384
- },
2385
- {
2386
- "epoch": 36.98901098901099,
2387
- "grad_norm": 74.83867645263672,
2388
  "learning_rate": 1.27425e-05,
2389
- "loss": 17.6083,
2390
  "step": 1700
2391
  },
2392
  {
2393
- "epoch": 37.08791208791209,
2394
- "grad_norm": 69.51549530029297,
2395
- "learning_rate": 1.278e-05,
2396
- "loss": 20.9143,
2397
- "step": 1705
2398
- },
2399
- {
2400
- "epoch": 37.1978021978022,
2401
- "grad_norm": 59.48684310913086,
2402
- "learning_rate": 1.28175e-05,
2403
- "loss": 17.725,
2404
- "step": 1710
2405
- },
2406
- {
2407
- "epoch": 37.30769230769231,
2408
- "grad_norm": 71.78469848632812,
2409
- "learning_rate": 1.2855e-05,
2410
- "loss": 17.7316,
2411
- "step": 1715
2412
- },
2413
- {
2414
- "epoch": 37.417582417582416,
2415
- "grad_norm": 82.73149871826172,
2416
- "learning_rate": 1.28925e-05,
2417
- "loss": 17.8607,
2418
- "step": 1720
2419
- },
2420
- {
2421
- "epoch": 37.527472527472526,
2422
- "grad_norm": 104.99292755126953,
2423
- "learning_rate": 1.293e-05,
2424
- "loss": 17.7329,
2425
- "step": 1725
2426
- },
2427
- {
2428
- "epoch": 37.637362637362635,
2429
- "grad_norm": 56.97990417480469,
2430
- "learning_rate": 1.29675e-05,
2431
- "loss": 17.7514,
2432
- "step": 1730
2433
- },
2434
- {
2435
- "epoch": 37.747252747252745,
2436
- "grad_norm": 76.46744537353516,
2437
- "learning_rate": 1.3005e-05,
2438
- "loss": 17.467,
2439
- "step": 1735
2440
- },
2441
- {
2442
- "epoch": 37.857142857142854,
2443
- "grad_norm": 67.00051879882812,
2444
- "learning_rate": 1.3042500000000001e-05,
2445
- "loss": 17.9124,
2446
- "step": 1740
2447
- },
2448
- {
2449
- "epoch": 37.967032967032964,
2450
- "grad_norm": 73.53226470947266,
2451
- "learning_rate": 1.308e-05,
2452
- "loss": 17.2669,
2453
- "step": 1745
2454
- },
2455
- {
2456
- "epoch": 38.065934065934066,
2457
- "grad_norm": 58.128150939941406,
2458
  "learning_rate": 1.3117500000000001e-05,
2459
- "loss": 18.5807,
2460
  "step": 1750
2461
  },
2462
  {
2463
- "epoch": 38.175824175824175,
2464
- "grad_norm": 66.3156509399414,
2465
- "learning_rate": 1.3155e-05,
2466
- "loss": 17.0461,
2467
- "step": 1755
2468
- },
2469
- {
2470
- "epoch": 38.285714285714285,
2471
- "grad_norm": 64.85994720458984,
2472
- "learning_rate": 1.31925e-05,
2473
- "loss": 16.7421,
2474
- "step": 1760
2475
- },
2476
- {
2477
- "epoch": 38.395604395604394,
2478
- "grad_norm": 65.9932861328125,
2479
- "learning_rate": 1.323e-05,
2480
- "loss": 16.7541,
2481
- "step": 1765
2482
- },
2483
- {
2484
- "epoch": 38.505494505494504,
2485
- "grad_norm": 60.61685562133789,
2486
- "learning_rate": 1.32675e-05,
2487
- "loss": 17.6106,
2488
- "step": 1770
2489
- },
2490
- {
2491
- "epoch": 38.61538461538461,
2492
- "grad_norm": 67.54483032226562,
2493
- "learning_rate": 1.3305e-05,
2494
- "loss": 16.9768,
2495
- "step": 1775
2496
- },
2497
- {
2498
- "epoch": 38.72527472527472,
2499
- "grad_norm": 74.80374908447266,
2500
- "learning_rate": 1.33425e-05,
2501
- "loss": 16.6762,
2502
- "step": 1780
2503
- },
2504
- {
2505
- "epoch": 38.83516483516483,
2506
- "grad_norm": 50.045692443847656,
2507
- "learning_rate": 1.338e-05,
2508
- "loss": 16.3018,
2509
- "step": 1785
2510
- },
2511
- {
2512
- "epoch": 38.94505494505494,
2513
- "grad_norm": 75.77816009521484,
2514
- "learning_rate": 1.34175e-05,
2515
- "loss": 17.1515,
2516
- "step": 1790
2517
- },
2518
- {
2519
- "epoch": 39.043956043956044,
2520
- "grad_norm": 48.32164764404297,
2521
- "learning_rate": 1.3455e-05,
2522
- "loss": 15.5191,
2523
- "step": 1795
2524
- },
2525
- {
2526
- "epoch": 39.15384615384615,
2527
- "grad_norm": 69.80590057373047,
2528
  "learning_rate": 1.34925e-05,
2529
- "loss": 16.7698,
2530
  "step": 1800
2531
  },
2532
  {
2533
- "epoch": 39.26373626373626,
2534
- "grad_norm": 58.667015075683594,
2535
- "learning_rate": 1.3530000000000001e-05,
2536
- "loss": 16.3254,
2537
- "step": 1805
2538
- },
2539
- {
2540
- "epoch": 39.37362637362637,
2541
- "grad_norm": 52.71421432495117,
2542
- "learning_rate": 1.35675e-05,
2543
- "loss": 16.1323,
2544
- "step": 1810
2545
- },
2546
- {
2547
- "epoch": 39.48351648351648,
2548
- "grad_norm": 68.22982025146484,
2549
- "learning_rate": 1.3605000000000001e-05,
2550
- "loss": 16.3548,
2551
- "step": 1815
2552
- },
2553
- {
2554
- "epoch": 39.59340659340659,
2555
- "grad_norm": 82.97122192382812,
2556
- "learning_rate": 1.36425e-05,
2557
- "loss": 16.154,
2558
- "step": 1820
2559
- },
2560
- {
2561
- "epoch": 39.7032967032967,
2562
- "grad_norm": 78.51126861572266,
2563
- "learning_rate": 1.3680000000000001e-05,
2564
- "loss": 16.1856,
2565
- "step": 1825
2566
- },
2567
- {
2568
- "epoch": 39.81318681318681,
2569
- "grad_norm": 84.22386932373047,
2570
- "learning_rate": 1.37175e-05,
2571
- "loss": 16.1977,
2572
- "step": 1830
2573
- },
2574
- {
2575
- "epoch": 39.92307692307692,
2576
- "grad_norm": 72.61677551269531,
2577
- "learning_rate": 1.3755000000000001e-05,
2578
- "loss": 15.5414,
2579
- "step": 1835
2580
- },
2581
- {
2582
- "epoch": 40.02197802197802,
2583
- "grad_norm": 60.4188117980957,
2584
- "learning_rate": 1.37925e-05,
2585
- "loss": 17.9055,
2586
- "step": 1840
2587
- },
2588
- {
2589
- "epoch": 40.13186813186813,
2590
- "grad_norm": 55.08159637451172,
2591
- "learning_rate": 1.3830000000000001e-05,
2592
- "loss": 15.7148,
2593
- "step": 1845
2594
- },
2595
- {
2596
- "epoch": 40.24175824175824,
2597
- "grad_norm": 58.98347091674805,
2598
  "learning_rate": 1.38675e-05,
2599
- "loss": 16.586,
2600
  "step": 1850
2601
  },
2602
  {
2603
- "epoch": 40.35164835164835,
2604
- "grad_norm": 59.37347412109375,
2605
- "learning_rate": 1.3905000000000002e-05,
2606
- "loss": 15.9047,
2607
- "step": 1855
2608
- },
2609
- {
2610
- "epoch": 40.46153846153846,
2611
- "grad_norm": 64.99933624267578,
2612
- "learning_rate": 1.39425e-05,
2613
- "loss": 15.9342,
2614
- "step": 1860
2615
- },
2616
- {
2617
- "epoch": 40.57142857142857,
2618
- "grad_norm": 63.66476058959961,
2619
- "learning_rate": 1.3980000000000002e-05,
2620
- "loss": 15.6676,
2621
- "step": 1865
2622
- },
2623
- {
2624
- "epoch": 40.68131868131868,
2625
- "grad_norm": 62.42832565307617,
2626
- "learning_rate": 1.4017500000000001e-05,
2627
- "loss": 16.0192,
2628
- "step": 1870
2629
- },
2630
- {
2631
- "epoch": 40.79120879120879,
2632
- "grad_norm": 64.26334381103516,
2633
- "learning_rate": 1.4055000000000002e-05,
2634
- "loss": 15.4637,
2635
- "step": 1875
2636
- },
2637
- {
2638
- "epoch": 40.9010989010989,
2639
- "grad_norm": 53.68131637573242,
2640
- "learning_rate": 1.4092500000000001e-05,
2641
- "loss": 15.4847,
2642
- "step": 1880
2643
- },
2644
- {
2645
- "epoch": 41.0,
2646
- "grad_norm": 133.40896606445312,
2647
- "learning_rate": 1.413e-05,
2648
- "loss": 15.1,
2649
- "step": 1885
2650
- },
2651
- {
2652
- "epoch": 41.10989010989011,
2653
- "grad_norm": 64.47783660888672,
2654
- "learning_rate": 1.4167500000000001e-05,
2655
- "loss": 15.8261,
2656
- "step": 1890
2657
- },
2658
- {
2659
- "epoch": 41.21978021978022,
2660
- "grad_norm": 53.95432662963867,
2661
- "learning_rate": 1.4205e-05,
2662
- "loss": 15.3971,
2663
- "step": 1895
2664
- },
2665
- {
2666
- "epoch": 41.32967032967033,
2667
- "grad_norm": 60.42184829711914,
2668
  "learning_rate": 1.4242500000000001e-05,
2669
- "loss": 15.6888,
2670
  "step": 1900
2671
  },
2672
  {
2673
- "epoch": 41.43956043956044,
2674
- "grad_norm": 65.85467529296875,
2675
- "learning_rate": 1.428e-05,
2676
- "loss": 16.5622,
2677
- "step": 1905
2678
- },
2679
- {
2680
- "epoch": 41.54945054945055,
2681
- "grad_norm": 71.76943969726562,
2682
- "learning_rate": 1.4317500000000001e-05,
2683
- "loss": 15.9214,
2684
- "step": 1910
2685
- },
2686
- {
2687
- "epoch": 41.65934065934066,
2688
- "grad_norm": 56.51178741455078,
2689
- "learning_rate": 1.4355e-05,
2690
- "loss": 15.283,
2691
- "step": 1915
2692
- },
2693
- {
2694
- "epoch": 41.76923076923077,
2695
- "grad_norm": 77.75347900390625,
2696
- "learning_rate": 1.43925e-05,
2697
- "loss": 15.6863,
2698
- "step": 1920
2699
- },
2700
- {
2701
- "epoch": 41.879120879120876,
2702
- "grad_norm": 69.0566635131836,
2703
- "learning_rate": 1.4429999999999999e-05,
2704
- "loss": 14.7517,
2705
- "step": 1925
2706
- },
2707
- {
2708
- "epoch": 41.98901098901099,
2709
- "grad_norm": 52.313724517822266,
2710
- "learning_rate": 1.44675e-05,
2711
- "loss": 14.5892,
2712
- "step": 1930
2713
- },
2714
- {
2715
- "epoch": 42.08791208791209,
2716
- "grad_norm": 46.05543518066406,
2717
- "learning_rate": 1.4505e-05,
2718
- "loss": 17.2186,
2719
- "step": 1935
2720
- },
2721
- {
2722
- "epoch": 42.1978021978022,
2723
- "grad_norm": 60.843135833740234,
2724
- "learning_rate": 1.45425e-05,
2725
- "loss": 15.052,
2726
- "step": 1940
2727
- },
2728
- {
2729
- "epoch": 42.30769230769231,
2730
- "grad_norm": 69.2349853515625,
2731
- "learning_rate": 1.458e-05,
2732
- "loss": 14.8454,
2733
- "step": 1945
2734
- },
2735
- {
2736
- "epoch": 42.417582417582416,
2737
- "grad_norm": 63.5760383605957,
2738
  "learning_rate": 1.46175e-05,
2739
- "loss": 15.1218,
2740
  "step": 1950
2741
  },
2742
  {
2743
- "epoch": 42.527472527472526,
2744
- "grad_norm": 61.12398147583008,
2745
- "learning_rate": 1.4655e-05,
2746
- "loss": 15.7076,
2747
- "step": 1955
2748
- },
2749
- {
2750
- "epoch": 42.637362637362635,
2751
- "grad_norm": 67.10089111328125,
2752
- "learning_rate": 1.46925e-05,
2753
- "loss": 14.6074,
2754
- "step": 1960
2755
- },
2756
- {
2757
- "epoch": 42.747252747252745,
2758
- "grad_norm": 56.15253448486328,
2759
- "learning_rate": 1.473e-05,
2760
- "loss": 14.7393,
2761
- "step": 1965
2762
- },
2763
- {
2764
- "epoch": 42.857142857142854,
2765
- "grad_norm": 62.48304748535156,
2766
- "learning_rate": 1.47675e-05,
2767
- "loss": 15.0895,
2768
- "step": 1970
2769
- },
2770
- {
2771
- "epoch": 42.967032967032964,
2772
- "grad_norm": 64.34083557128906,
2773
- "learning_rate": 1.4805e-05,
2774
- "loss": 14.4191,
2775
- "step": 1975
2776
- },
2777
- {
2778
- "epoch": 43.065934065934066,
2779
- "grad_norm": 72.964111328125,
2780
- "learning_rate": 1.48425e-05,
2781
- "loss": 15.1835,
2782
- "step": 1980
2783
- },
2784
- {
2785
- "epoch": 43.175824175824175,
2786
- "grad_norm": 64.65727233886719,
2787
- "learning_rate": 1.488e-05,
2788
- "loss": 14.8041,
2789
- "step": 1985
2790
- },
2791
- {
2792
- "epoch": 43.285714285714285,
2793
- "grad_norm": 58.626461029052734,
2794
- "learning_rate": 1.49175e-05,
2795
- "loss": 15.048,
2796
- "step": 1990
2797
- },
2798
- {
2799
- "epoch": 43.395604395604394,
2800
- "grad_norm": 66.43981170654297,
2801
- "learning_rate": 1.4955e-05,
2802
- "loss": 14.752,
2803
- "step": 1995
2804
- },
2805
- {
2806
- "epoch": 43.505494505494504,
2807
- "grad_norm": 62.531463623046875,
2808
  "learning_rate": 1.4992500000000001e-05,
2809
- "loss": 14.5582,
2810
  "step": 2000
2811
  }
2812
  ],
2813
- "logging_steps": 5,
2814
- "max_steps": 2300,
2815
  "num_input_tokens_seen": 0,
2816
  "num_train_epochs": 50,
2817
- "save_steps": 500,
2818
  "stateful_callbacks": {
2819
  "TrainerControl": {
2820
  "args": {
@@ -2827,7 +307,7 @@
2827
  "attributes": {}
2828
  }
2829
  },
2830
- "total_flos": 5.300275357458432e+19,
2831
  "train_batch_size": 24,
2832
  "trial_name": null,
2833
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 44.44444444444444,
6
  "eval_steps": 500,
7
  "global_step": 2000,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 1.1111111111111112,
14
+ "grad_norm": 3663.33642578125,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  "learning_rate": 3.6750000000000003e-07,
16
+ "loss": 1356.1239,
17
  "step": 50
18
  },
19
  {
20
+ "epoch": 2.2222222222222223,
21
+ "grad_norm": 3973.032470703125,
22
+ "learning_rate": 7.425000000000001e-07,
23
+ "loss": 1275.5178,
24
+ "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  },
26
  {
27
+ "epoch": 3.3333333333333335,
28
+ "grad_norm": 2667.0068359375,
29
+ "learning_rate": 1.1174999999999999e-06,
30
+ "loss": 1123.6059,
31
+ "step": 150
32
  },
33
  {
34
+ "epoch": 4.444444444444445,
35
+ "grad_norm": 2320.48486328125,
36
+ "learning_rate": 1.4925000000000001e-06,
37
+ "loss": 922.8917,
38
+ "step": 200
39
  },
40
  {
41
+ "epoch": 5.555555555555555,
42
+ "grad_norm": 1611.3345947265625,
43
+ "learning_rate": 1.8675000000000001e-06,
44
+ "loss": 714.5638,
45
+ "step": 250
46
  },
47
  {
48
+ "epoch": 6.666666666666667,
49
+ "grad_norm": 1395.7064208984375,
50
+ "learning_rate": 2.2425e-06,
51
+ "loss": 542.1251,
52
+ "step": 300
53
  },
54
  {
55
+ "epoch": 7.777777777777778,
56
+ "grad_norm": 1019.4860229492188,
57
+ "learning_rate": 2.6175e-06,
58
+ "loss": 411.1387,
59
+ "step": 350
60
  },
61
  {
62
+ "epoch": 8.88888888888889,
63
+ "grad_norm": 888.315185546875,
64
+ "learning_rate": 2.9925e-06,
65
+ "loss": 318.4567,
66
+ "step": 400
67
  },
68
  {
69
+ "epoch": 10.0,
70
+ "grad_norm": 2590.113525390625,
71
+ "learning_rate": 3.3675000000000004e-06,
72
+ "loss": 261.6995,
73
+ "step": 450
74
  },
75
  {
76
+ "epoch": 11.11111111111111,
77
+ "grad_norm": 711.677734375,
78
+ "learning_rate": 3.7425e-06,
79
+ "loss": 220.2936,
80
+ "step": 500
81
  },
82
  {
83
+ "epoch": 12.222222222222221,
84
+ "grad_norm": 548.9371948242188,
85
+ "learning_rate": 4.117500000000001e-06,
86
+ "loss": 187.9833,
87
+ "step": 550
88
  },
89
  {
90
+ "epoch": 13.333333333333334,
91
+ "grad_norm": 1127.1611328125,
92
+ "learning_rate": 4.4925e-06,
93
+ "loss": 159.1351,
94
+ "step": 600
95
  },
96
  {
97
+ "epoch": 14.444444444444445,
98
+ "grad_norm": 426.074951171875,
99
+ "learning_rate": 4.8675e-06,
100
+ "loss": 137.1092,
101
+ "step": 650
102
  },
103
  {
104
+ "epoch": 15.555555555555555,
105
+ "grad_norm": 348.842529296875,
106
+ "learning_rate": 5.2425e-06,
107
+ "loss": 119.822,
108
+ "step": 700
109
  },
110
  {
111
+ "epoch": 16.666666666666668,
112
+ "grad_norm": 373.2169189453125,
113
+ "learning_rate": 5.6175e-06,
114
+ "loss": 104.3366,
115
+ "step": 750
116
  },
117
  {
118
+ "epoch": 17.77777777777778,
119
+ "grad_norm": 324.51702880859375,
120
+ "learning_rate": 5.992500000000001e-06,
121
+ "loss": 90.8788,
122
+ "step": 800
123
  },
124
  {
125
+ "epoch": 18.88888888888889,
126
+ "grad_norm": 269.91827392578125,
127
+ "learning_rate": 6.3675e-06,
128
+ "loss": 78.4644,
129
+ "step": 850
130
  },
131
  {
132
+ "epoch": 20.0,
133
+ "grad_norm": 1744.54052734375,
134
+ "learning_rate": 6.7425e-06,
135
+ "loss": 70.3526,
136
+ "step": 900
137
  },
138
  {
139
+ "epoch": 21.11111111111111,
140
+ "grad_norm": 369.39837646484375,
141
+ "learning_rate": 7.1175e-06,
142
+ "loss": 63.9417,
143
+ "step": 950
144
  },
145
  {
146
+ "epoch": 22.22222222222222,
147
+ "grad_norm": 303.95977783203125,
148
+ "learning_rate": 7.4925e-06,
149
+ "loss": 63.4575,
150
+ "step": 1000
151
  },
152
  {
153
+ "epoch": 23.333333333333332,
154
+ "grad_norm": 187.462890625,
155
+ "learning_rate": 7.8675e-06,
156
+ "loss": 54.7417,
157
+ "step": 1050
158
  },
159
  {
160
+ "epoch": 24.444444444444443,
161
+ "grad_norm": 165.56666564941406,
162
+ "learning_rate": 8.2425e-06,
163
+ "loss": 49.6842,
164
+ "step": 1100
165
  },
166
  {
167
+ "epoch": 25.555555555555557,
168
+ "grad_norm": 147.3148193359375,
169
+ "learning_rate": 8.6175e-06,
170
+ "loss": 43.027,
171
+ "step": 1150
172
  },
173
  {
174
+ "epoch": 26.666666666666668,
175
+ "grad_norm": 125.53775024414062,
176
+ "learning_rate": 8.9925e-06,
177
+ "loss": 38.2579,
178
+ "step": 1200
179
  },
180
  {
181
+ "epoch": 27.77777777777778,
182
+ "grad_norm": 109.85810089111328,
183
+ "learning_rate": 9.367500000000001e-06,
184
+ "loss": 34.3957,
185
+ "step": 1250
186
  },
187
  {
188
+ "epoch": 28.88888888888889,
189
+ "grad_norm": 98.328369140625,
190
+ "learning_rate": 9.7425e-06,
191
+ "loss": 31.4378,
192
+ "step": 1300
193
  },
194
  {
195
+ "epoch": 30.0,
196
+ "grad_norm": 109.77750396728516,
197
+ "learning_rate": 1.01175e-05,
198
+ "loss": 28.5084,
199
+ "step": 1350
200
  },
201
  {
202
+ "epoch": 31.11111111111111,
203
+ "grad_norm": 112.47483825683594,
204
+ "learning_rate": 1.04925e-05,
205
+ "loss": 26.1671,
206
+ "step": 1400
207
  },
208
  {
209
+ "epoch": 32.22222222222222,
210
+ "grad_norm": 85.60242462158203,
211
  "learning_rate": 1.08675e-05,
212
+ "loss": 24.2309,
213
  "step": 1450
214
  },
215
  {
216
+ "epoch": 33.333333333333336,
217
+ "grad_norm": 73.19799041748047,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  "learning_rate": 1.1242500000000001e-05,
219
+ "loss": 22.6248,
220
  "step": 1500
221
  },
222
  {
223
+ "epoch": 34.44444444444444,
224
+ "grad_norm": 80.70884704589844,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  "learning_rate": 1.16175e-05,
226
+ "loss": 21.1187,
227
  "step": 1550
228
  },
229
  {
230
+ "epoch": 35.55555555555556,
231
+ "grad_norm": 110.98326110839844,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  "learning_rate": 1.19925e-05,
233
+ "loss": 20.4828,
234
  "step": 1600
235
  },
236
  {
237
+ "epoch": 36.666666666666664,
238
+ "grad_norm": 113.65286254882812,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  "learning_rate": 1.23675e-05,
240
+ "loss": 19.7366,
241
  "step": 1650
242
  },
243
  {
244
+ "epoch": 37.77777777777778,
245
+ "grad_norm": 77.65855407714844,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  "learning_rate": 1.27425e-05,
247
+ "loss": 18.6632,
248
  "step": 1700
249
  },
250
  {
251
+ "epoch": 38.888888888888886,
252
+ "grad_norm": 88.96723175048828,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  "learning_rate": 1.3117500000000001e-05,
254
+ "loss": 18.0793,
255
  "step": 1750
256
  },
257
  {
258
+ "epoch": 40.0,
259
+ "grad_norm": 79.1690902709961,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  "learning_rate": 1.34925e-05,
261
+ "loss": 17.0667,
262
  "step": 1800
263
  },
264
  {
265
+ "epoch": 41.111111111111114,
266
+ "grad_norm": 93.60108184814453,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
  "learning_rate": 1.38675e-05,
268
+ "loss": 16.6106,
269
  "step": 1850
270
  },
271
  {
272
+ "epoch": 42.22222222222222,
273
+ "grad_norm": 66.16129302978516,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
  "learning_rate": 1.4242500000000001e-05,
275
+ "loss": 16.4905,
276
  "step": 1900
277
  },
278
  {
279
+ "epoch": 43.333333333333336,
280
+ "grad_norm": 62.41362380981445,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
  "learning_rate": 1.46175e-05,
282
+ "loss": 15.6427,
283
  "step": 1950
284
  },
285
  {
286
+ "epoch": 44.44444444444444,
287
+ "grad_norm": 107.30168151855469,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  "learning_rate": 1.4992500000000001e-05,
289
+ "loss": 15.0731,
290
  "step": 2000
291
  }
292
  ],
293
+ "logging_steps": 50,
294
+ "max_steps": 2250,
295
  "num_input_tokens_seen": 0,
296
  "num_train_epochs": 50,
297
+ "save_steps": 200,
298
  "stateful_callbacks": {
299
  "TrainerControl": {
300
  "args": {
 
307
  "attributes": {}
308
  }
309
  },
310
+ "total_flos": 5.387467985017897e+19,
311
  "train_batch_size": 24,
312
  "trial_name": null,
313
  "trial_params": null
checkpoint-2000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e1ce90134704b5a6d94e1dd2b2ac60499368f272f8eda658e4e1ca0663c44cd
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7b84367094b7487f77de50fba614a6c6667e9cf018b77ee5bfc158268fc5eaf
3
  size 5368
checkpoint-2200/config.json ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "activation_function": "silu",
4
+ "anchor_image_size": null,
5
+ "architectures": [
6
+ "RTDetrV2ForObjectDetection"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "auxiliary_loss": true,
10
+ "backbone": null,
11
+ "backbone_config": {
12
+ "depths": [
13
+ 3,
14
+ 4,
15
+ 23,
16
+ 3
17
+ ],
18
+ "downsample_in_bottleneck": false,
19
+ "downsample_in_first_stage": false,
20
+ "embedding_size": 64,
21
+ "hidden_act": "relu",
22
+ "hidden_sizes": [
23
+ 256,
24
+ 512,
25
+ 1024,
26
+ 2048
27
+ ],
28
+ "layer_type": "bottleneck",
29
+ "model_type": "rt_detr_resnet",
30
+ "num_channels": 3,
31
+ "out_features": [
32
+ "stage2",
33
+ "stage3",
34
+ "stage4"
35
+ ],
36
+ "out_indices": [
37
+ 2,
38
+ 3,
39
+ 4
40
+ ],
41
+ "stage_names": [
42
+ "stem",
43
+ "stage1",
44
+ "stage2",
45
+ "stage3",
46
+ "stage4"
47
+ ],
48
+ "torch_dtype": "float32"
49
+ },
50
+ "backbone_kwargs": null,
51
+ "batch_norm_eps": 1e-05,
52
+ "box_noise_scale": 1.0,
53
+ "d_model": 256,
54
+ "decoder_activation_function": "relu",
55
+ "decoder_attention_heads": 8,
56
+ "decoder_ffn_dim": 1024,
57
+ "decoder_in_channels": [
58
+ 384,
59
+ 384,
60
+ 384
61
+ ],
62
+ "decoder_layers": 6,
63
+ "decoder_method": "default",
64
+ "decoder_n_levels": 3,
65
+ "decoder_n_points": 4,
66
+ "decoder_offset_scale": 0.5,
67
+ "disable_custom_kernels": true,
68
+ "dropout": 0.0,
69
+ "encode_proj_layers": [
70
+ 2
71
+ ],
72
+ "encoder_activation_function": "gelu",
73
+ "encoder_attention_heads": 8,
74
+ "encoder_ffn_dim": 2048,
75
+ "encoder_hidden_dim": 384,
76
+ "encoder_in_channels": [
77
+ 512,
78
+ 1024,
79
+ 2048
80
+ ],
81
+ "encoder_layers": 1,
82
+ "eos_coefficient": 0.0001,
83
+ "eval_size": null,
84
+ "feat_strides": [
85
+ 8,
86
+ 16,
87
+ 32
88
+ ],
89
+ "focal_loss_alpha": 0.75,
90
+ "focal_loss_gamma": 2.0,
91
+ "freeze_backbone_batch_norms": true,
92
+ "hidden_expansion": 1.0,
93
+ "id2label": {
94
+ "0": "LABEL_0",
95
+ "1": "LABEL_1",
96
+ "2": "LABEL_2"
97
+ },
98
+ "initializer_bias_prior_prob": null,
99
+ "initializer_range": 0.01,
100
+ "is_encoder_decoder": true,
101
+ "label2id": {
102
+ "LABEL_0": 0,
103
+ "LABEL_1": 1,
104
+ "LABEL_2": 2
105
+ },
106
+ "label_noise_ratio": 0.5,
107
+ "layer_norm_eps": 1e-05,
108
+ "learn_initial_query": false,
109
+ "matcher_alpha": 0.25,
110
+ "matcher_bbox_cost": 5.0,
111
+ "matcher_class_cost": 2.0,
112
+ "matcher_gamma": 2.0,
113
+ "matcher_giou_cost": 2.0,
114
+ "model_type": "rt_detr_v2",
115
+ "normalize_before": false,
116
+ "num_denoising": 100,
117
+ "num_feature_levels": 3,
118
+ "num_queries": 300,
119
+ "positional_encoding_temperature": 10000,
120
+ "torch_dtype": "float32",
121
+ "transformers_version": "4.55.0",
122
+ "use_focal_loss": true,
123
+ "use_pretrained_backbone": false,
124
+ "use_timm_backbone": false,
125
+ "weight_loss_bbox": 5.0,
126
+ "weight_loss_giou": 2.0,
127
+ "weight_loss_vfl": 1.0,
128
+ "with_box_refine": true
129
+ }
checkpoint-2200/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cb217c7b7445eb9473cddef9f4ea779004ca224f4615b6f69aa70c7cc0b781f
3
+ size 306699044
checkpoint-2200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ca9f9b9c554485c8d65a25449ddeb6582be923a8692ab873d9c1e6a21062298
3
+ size 611580433
checkpoint-2200/preprocessor_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_annotations": true,
3
+ "do_normalize": false,
4
+ "do_pad": false,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "format": "coco_detection",
8
+ "image_mean": [
9
+ 0.485,
10
+ 0.456,
11
+ 0.406
12
+ ],
13
+ "image_processor_type": "RTDetrImageProcessor",
14
+ "image_std": [
15
+ 0.229,
16
+ 0.224,
17
+ 0.225
18
+ ],
19
+ "pad_size": null,
20
+ "resample": 2,
21
+ "rescale_factor": 0.00392156862745098,
22
+ "size": {
23
+ "height": 640,
24
+ "width": 640
25
+ }
26
+ }
checkpoint-2200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90a75f76146ccf9511e324998dc7ec6df7764e7adf2655174e5f732a90e23392
3
+ size 14244
checkpoint-2200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4be819b970a626d9fdb18122878b575ce46f2fa37fc360325a23e4cfdc87bcd1
3
+ size 1064
checkpoint-2200/trainer_state.json ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 48.888888888888886,
6
+ "eval_steps": 500,
7
+ "global_step": 2200,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 1.1111111111111112,
14
+ "grad_norm": 3663.33642578125,
15
+ "learning_rate": 3.6750000000000003e-07,
16
+ "loss": 1356.1239,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 2.2222222222222223,
21
+ "grad_norm": 3973.032470703125,
22
+ "learning_rate": 7.425000000000001e-07,
23
+ "loss": 1275.5178,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 3.3333333333333335,
28
+ "grad_norm": 2667.0068359375,
29
+ "learning_rate": 1.1174999999999999e-06,
30
+ "loss": 1123.6059,
31
+ "step": 150
32
+ },
33
+ {
34
+ "epoch": 4.444444444444445,
35
+ "grad_norm": 2320.48486328125,
36
+ "learning_rate": 1.4925000000000001e-06,
37
+ "loss": 922.8917,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 5.555555555555555,
42
+ "grad_norm": 1611.3345947265625,
43
+ "learning_rate": 1.8675000000000001e-06,
44
+ "loss": 714.5638,
45
+ "step": 250
46
+ },
47
+ {
48
+ "epoch": 6.666666666666667,
49
+ "grad_norm": 1395.7064208984375,
50
+ "learning_rate": 2.2425e-06,
51
+ "loss": 542.1251,
52
+ "step": 300
53
+ },
54
+ {
55
+ "epoch": 7.777777777777778,
56
+ "grad_norm": 1019.4860229492188,
57
+ "learning_rate": 2.6175e-06,
58
+ "loss": 411.1387,
59
+ "step": 350
60
+ },
61
+ {
62
+ "epoch": 8.88888888888889,
63
+ "grad_norm": 888.315185546875,
64
+ "learning_rate": 2.9925e-06,
65
+ "loss": 318.4567,
66
+ "step": 400
67
+ },
68
+ {
69
+ "epoch": 10.0,
70
+ "grad_norm": 2590.113525390625,
71
+ "learning_rate": 3.3675000000000004e-06,
72
+ "loss": 261.6995,
73
+ "step": 450
74
+ },
75
+ {
76
+ "epoch": 11.11111111111111,
77
+ "grad_norm": 711.677734375,
78
+ "learning_rate": 3.7425e-06,
79
+ "loss": 220.2936,
80
+ "step": 500
81
+ },
82
+ {
83
+ "epoch": 12.222222222222221,
84
+ "grad_norm": 548.9371948242188,
85
+ "learning_rate": 4.117500000000001e-06,
86
+ "loss": 187.9833,
87
+ "step": 550
88
+ },
89
+ {
90
+ "epoch": 13.333333333333334,
91
+ "grad_norm": 1127.1611328125,
92
+ "learning_rate": 4.4925e-06,
93
+ "loss": 159.1351,
94
+ "step": 600
95
+ },
96
+ {
97
+ "epoch": 14.444444444444445,
98
+ "grad_norm": 426.074951171875,
99
+ "learning_rate": 4.8675e-06,
100
+ "loss": 137.1092,
101
+ "step": 650
102
+ },
103
+ {
104
+ "epoch": 15.555555555555555,
105
+ "grad_norm": 348.842529296875,
106
+ "learning_rate": 5.2425e-06,
107
+ "loss": 119.822,
108
+ "step": 700
109
+ },
110
+ {
111
+ "epoch": 16.666666666666668,
112
+ "grad_norm": 373.2169189453125,
113
+ "learning_rate": 5.6175e-06,
114
+ "loss": 104.3366,
115
+ "step": 750
116
+ },
117
+ {
118
+ "epoch": 17.77777777777778,
119
+ "grad_norm": 324.51702880859375,
120
+ "learning_rate": 5.992500000000001e-06,
121
+ "loss": 90.8788,
122
+ "step": 800
123
+ },
124
+ {
125
+ "epoch": 18.88888888888889,
126
+ "grad_norm": 269.91827392578125,
127
+ "learning_rate": 6.3675e-06,
128
+ "loss": 78.4644,
129
+ "step": 850
130
+ },
131
+ {
132
+ "epoch": 20.0,
133
+ "grad_norm": 1744.54052734375,
134
+ "learning_rate": 6.7425e-06,
135
+ "loss": 70.3526,
136
+ "step": 900
137
+ },
138
+ {
139
+ "epoch": 21.11111111111111,
140
+ "grad_norm": 369.39837646484375,
141
+ "learning_rate": 7.1175e-06,
142
+ "loss": 63.9417,
143
+ "step": 950
144
+ },
145
+ {
146
+ "epoch": 22.22222222222222,
147
+ "grad_norm": 303.95977783203125,
148
+ "learning_rate": 7.4925e-06,
149
+ "loss": 63.4575,
150
+ "step": 1000
151
+ },
152
+ {
153
+ "epoch": 23.333333333333332,
154
+ "grad_norm": 187.462890625,
155
+ "learning_rate": 7.8675e-06,
156
+ "loss": 54.7417,
157
+ "step": 1050
158
+ },
159
+ {
160
+ "epoch": 24.444444444444443,
161
+ "grad_norm": 165.56666564941406,
162
+ "learning_rate": 8.2425e-06,
163
+ "loss": 49.6842,
164
+ "step": 1100
165
+ },
166
+ {
167
+ "epoch": 25.555555555555557,
168
+ "grad_norm": 147.3148193359375,
169
+ "learning_rate": 8.6175e-06,
170
+ "loss": 43.027,
171
+ "step": 1150
172
+ },
173
+ {
174
+ "epoch": 26.666666666666668,
175
+ "grad_norm": 125.53775024414062,
176
+ "learning_rate": 8.9925e-06,
177
+ "loss": 38.2579,
178
+ "step": 1200
179
+ },
180
+ {
181
+ "epoch": 27.77777777777778,
182
+ "grad_norm": 109.85810089111328,
183
+ "learning_rate": 9.367500000000001e-06,
184
+ "loss": 34.3957,
185
+ "step": 1250
186
+ },
187
+ {
188
+ "epoch": 28.88888888888889,
189
+ "grad_norm": 98.328369140625,
190
+ "learning_rate": 9.7425e-06,
191
+ "loss": 31.4378,
192
+ "step": 1300
193
+ },
194
+ {
195
+ "epoch": 30.0,
196
+ "grad_norm": 109.77750396728516,
197
+ "learning_rate": 1.01175e-05,
198
+ "loss": 28.5084,
199
+ "step": 1350
200
+ },
201
+ {
202
+ "epoch": 31.11111111111111,
203
+ "grad_norm": 112.47483825683594,
204
+ "learning_rate": 1.04925e-05,
205
+ "loss": 26.1671,
206
+ "step": 1400
207
+ },
208
+ {
209
+ "epoch": 32.22222222222222,
210
+ "grad_norm": 85.60242462158203,
211
+ "learning_rate": 1.08675e-05,
212
+ "loss": 24.2309,
213
+ "step": 1450
214
+ },
215
+ {
216
+ "epoch": 33.333333333333336,
217
+ "grad_norm": 73.19799041748047,
218
+ "learning_rate": 1.1242500000000001e-05,
219
+ "loss": 22.6248,
220
+ "step": 1500
221
+ },
222
+ {
223
+ "epoch": 34.44444444444444,
224
+ "grad_norm": 80.70884704589844,
225
+ "learning_rate": 1.16175e-05,
226
+ "loss": 21.1187,
227
+ "step": 1550
228
+ },
229
+ {
230
+ "epoch": 35.55555555555556,
231
+ "grad_norm": 110.98326110839844,
232
+ "learning_rate": 1.19925e-05,
233
+ "loss": 20.4828,
234
+ "step": 1600
235
+ },
236
+ {
237
+ "epoch": 36.666666666666664,
238
+ "grad_norm": 113.65286254882812,
239
+ "learning_rate": 1.23675e-05,
240
+ "loss": 19.7366,
241
+ "step": 1650
242
+ },
243
+ {
244
+ "epoch": 37.77777777777778,
245
+ "grad_norm": 77.65855407714844,
246
+ "learning_rate": 1.27425e-05,
247
+ "loss": 18.6632,
248
+ "step": 1700
249
+ },
250
+ {
251
+ "epoch": 38.888888888888886,
252
+ "grad_norm": 88.96723175048828,
253
+ "learning_rate": 1.3117500000000001e-05,
254
+ "loss": 18.0793,
255
+ "step": 1750
256
+ },
257
+ {
258
+ "epoch": 40.0,
259
+ "grad_norm": 79.1690902709961,
260
+ "learning_rate": 1.34925e-05,
261
+ "loss": 17.0667,
262
+ "step": 1800
263
+ },
264
+ {
265
+ "epoch": 41.111111111111114,
266
+ "grad_norm": 93.60108184814453,
267
+ "learning_rate": 1.38675e-05,
268
+ "loss": 16.6106,
269
+ "step": 1850
270
+ },
271
+ {
272
+ "epoch": 42.22222222222222,
273
+ "grad_norm": 66.16129302978516,
274
+ "learning_rate": 1.4242500000000001e-05,
275
+ "loss": 16.4905,
276
+ "step": 1900
277
+ },
278
+ {
279
+ "epoch": 43.333333333333336,
280
+ "grad_norm": 62.41362380981445,
281
+ "learning_rate": 1.46175e-05,
282
+ "loss": 15.6427,
283
+ "step": 1950
284
+ },
285
+ {
286
+ "epoch": 44.44444444444444,
287
+ "grad_norm": 107.30168151855469,
288
+ "learning_rate": 1.4992500000000001e-05,
289
+ "loss": 15.0731,
290
+ "step": 2000
291
+ },
292
+ {
293
+ "epoch": 45.55555555555556,
294
+ "grad_norm": 100.18666076660156,
295
+ "learning_rate": 1.2060000000000001e-05,
296
+ "loss": 14.6973,
297
+ "step": 2050
298
+ },
299
+ {
300
+ "epoch": 46.666666666666664,
301
+ "grad_norm": 68.78209686279297,
302
+ "learning_rate": 9.06e-06,
303
+ "loss": 13.8928,
304
+ "step": 2100
305
+ },
306
+ {
307
+ "epoch": 47.77777777777778,
308
+ "grad_norm": 65.85958099365234,
309
+ "learning_rate": 6.0600000000000004e-06,
310
+ "loss": 13.4318,
311
+ "step": 2150
312
+ },
313
+ {
314
+ "epoch": 48.888888888888886,
315
+ "grad_norm": 72.31432342529297,
316
+ "learning_rate": 3.06e-06,
317
+ "loss": 12.6856,
318
+ "step": 2200
319
+ }
320
+ ],
321
+ "logging_steps": 50,
322
+ "max_steps": 2250,
323
+ "num_input_tokens_seen": 0,
324
+ "num_train_epochs": 50,
325
+ "save_steps": 200,
326
+ "stateful_callbacks": {
327
+ "TrainerControl": {
328
+ "args": {
329
+ "should_epoch_stop": false,
330
+ "should_evaluate": false,
331
+ "should_log": false,
332
+ "should_save": true,
333
+ "should_training_stop": false
334
+ },
335
+ "attributes": {}
336
+ }
337
+ },
338
+ "total_flos": 5.926395026677432e+19,
339
+ "train_batch_size": 24,
340
+ "trial_name": null,
341
+ "trial_params": null
342
+ }
checkpoint-2200/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7b84367094b7487f77de50fba614a6c6667e9cf018b77ee5bfc158268fc5eaf
3
+ size 5368
checkpoint-2250/config.json ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "activation_function": "silu",
4
+ "anchor_image_size": null,
5
+ "architectures": [
6
+ "RTDetrV2ForObjectDetection"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "auxiliary_loss": true,
10
+ "backbone": null,
11
+ "backbone_config": {
12
+ "depths": [
13
+ 3,
14
+ 4,
15
+ 23,
16
+ 3
17
+ ],
18
+ "downsample_in_bottleneck": false,
19
+ "downsample_in_first_stage": false,
20
+ "embedding_size": 64,
21
+ "hidden_act": "relu",
22
+ "hidden_sizes": [
23
+ 256,
24
+ 512,
25
+ 1024,
26
+ 2048
27
+ ],
28
+ "layer_type": "bottleneck",
29
+ "model_type": "rt_detr_resnet",
30
+ "num_channels": 3,
31
+ "out_features": [
32
+ "stage2",
33
+ "stage3",
34
+ "stage4"
35
+ ],
36
+ "out_indices": [
37
+ 2,
38
+ 3,
39
+ 4
40
+ ],
41
+ "stage_names": [
42
+ "stem",
43
+ "stage1",
44
+ "stage2",
45
+ "stage3",
46
+ "stage4"
47
+ ],
48
+ "torch_dtype": "float32"
49
+ },
50
+ "backbone_kwargs": null,
51
+ "batch_norm_eps": 1e-05,
52
+ "box_noise_scale": 1.0,
53
+ "d_model": 256,
54
+ "decoder_activation_function": "relu",
55
+ "decoder_attention_heads": 8,
56
+ "decoder_ffn_dim": 1024,
57
+ "decoder_in_channels": [
58
+ 384,
59
+ 384,
60
+ 384
61
+ ],
62
+ "decoder_layers": 6,
63
+ "decoder_method": "default",
64
+ "decoder_n_levels": 3,
65
+ "decoder_n_points": 4,
66
+ "decoder_offset_scale": 0.5,
67
+ "disable_custom_kernels": true,
68
+ "dropout": 0.0,
69
+ "encode_proj_layers": [
70
+ 2
71
+ ],
72
+ "encoder_activation_function": "gelu",
73
+ "encoder_attention_heads": 8,
74
+ "encoder_ffn_dim": 2048,
75
+ "encoder_hidden_dim": 384,
76
+ "encoder_in_channels": [
77
+ 512,
78
+ 1024,
79
+ 2048
80
+ ],
81
+ "encoder_layers": 1,
82
+ "eos_coefficient": 0.0001,
83
+ "eval_size": null,
84
+ "feat_strides": [
85
+ 8,
86
+ 16,
87
+ 32
88
+ ],
89
+ "focal_loss_alpha": 0.75,
90
+ "focal_loss_gamma": 2.0,
91
+ "freeze_backbone_batch_norms": true,
92
+ "hidden_expansion": 1.0,
93
+ "id2label": {
94
+ "0": "LABEL_0",
95
+ "1": "LABEL_1",
96
+ "2": "LABEL_2"
97
+ },
98
+ "initializer_bias_prior_prob": null,
99
+ "initializer_range": 0.01,
100
+ "is_encoder_decoder": true,
101
+ "label2id": {
102
+ "LABEL_0": 0,
103
+ "LABEL_1": 1,
104
+ "LABEL_2": 2
105
+ },
106
+ "label_noise_ratio": 0.5,
107
+ "layer_norm_eps": 1e-05,
108
+ "learn_initial_query": false,
109
+ "matcher_alpha": 0.25,
110
+ "matcher_bbox_cost": 5.0,
111
+ "matcher_class_cost": 2.0,
112
+ "matcher_gamma": 2.0,
113
+ "matcher_giou_cost": 2.0,
114
+ "model_type": "rt_detr_v2",
115
+ "normalize_before": false,
116
+ "num_denoising": 100,
117
+ "num_feature_levels": 3,
118
+ "num_queries": 300,
119
+ "positional_encoding_temperature": 10000,
120
+ "torch_dtype": "float32",
121
+ "transformers_version": "4.55.0",
122
+ "use_focal_loss": true,
123
+ "use_pretrained_backbone": false,
124
+ "use_timm_backbone": false,
125
+ "weight_loss_bbox": 5.0,
126
+ "weight_loss_giou": 2.0,
127
+ "weight_loss_vfl": 1.0,
128
+ "with_box_refine": true
129
+ }
checkpoint-2250/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3a52d4cd4386295eedbfb267bc679eca4b27864d745fff06694c0f9dbf823a6
3
+ size 306699044
checkpoint-2250/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05dd6b927d3bf0bbfe4b8e26ceb5b08121784a39348599920d07011f715be702
3
+ size 611580433
checkpoint-2250/preprocessor_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_annotations": true,
3
+ "do_normalize": false,
4
+ "do_pad": false,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "format": "coco_detection",
8
+ "image_mean": [
9
+ 0.485,
10
+ 0.456,
11
+ 0.406
12
+ ],
13
+ "image_processor_type": "RTDetrImageProcessor",
14
+ "image_std": [
15
+ 0.229,
16
+ 0.224,
17
+ 0.225
18
+ ],
19
+ "pad_size": null,
20
+ "resample": 2,
21
+ "rescale_factor": 0.00392156862745098,
22
+ "size": {
23
+ "height": 640,
24
+ "width": 640
25
+ }
26
+ }
checkpoint-2250/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66787fd9597d0c4424fa82a41803d924585f86f960101c754c7f5cfada26d864
3
+ size 14244
checkpoint-2250/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e97b09861aa673666d96648b6cbe41d7c63c06fd70e7ca49c43cf61264897a95
3
+ size 1064
checkpoint-2250/trainer_state.json ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 50.0,
6
+ "eval_steps": 500,
7
+ "global_step": 2250,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 1.1111111111111112,
14
+ "grad_norm": 3663.33642578125,
15
+ "learning_rate": 3.6750000000000003e-07,
16
+ "loss": 1356.1239,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 2.2222222222222223,
21
+ "grad_norm": 3973.032470703125,
22
+ "learning_rate": 7.425000000000001e-07,
23
+ "loss": 1275.5178,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 3.3333333333333335,
28
+ "grad_norm": 2667.0068359375,
29
+ "learning_rate": 1.1174999999999999e-06,
30
+ "loss": 1123.6059,
31
+ "step": 150
32
+ },
33
+ {
34
+ "epoch": 4.444444444444445,
35
+ "grad_norm": 2320.48486328125,
36
+ "learning_rate": 1.4925000000000001e-06,
37
+ "loss": 922.8917,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 5.555555555555555,
42
+ "grad_norm": 1611.3345947265625,
43
+ "learning_rate": 1.8675000000000001e-06,
44
+ "loss": 714.5638,
45
+ "step": 250
46
+ },
47
+ {
48
+ "epoch": 6.666666666666667,
49
+ "grad_norm": 1395.7064208984375,
50
+ "learning_rate": 2.2425e-06,
51
+ "loss": 542.1251,
52
+ "step": 300
53
+ },
54
+ {
55
+ "epoch": 7.777777777777778,
56
+ "grad_norm": 1019.4860229492188,
57
+ "learning_rate": 2.6175e-06,
58
+ "loss": 411.1387,
59
+ "step": 350
60
+ },
61
+ {
62
+ "epoch": 8.88888888888889,
63
+ "grad_norm": 888.315185546875,
64
+ "learning_rate": 2.9925e-06,
65
+ "loss": 318.4567,
66
+ "step": 400
67
+ },
68
+ {
69
+ "epoch": 10.0,
70
+ "grad_norm": 2590.113525390625,
71
+ "learning_rate": 3.3675000000000004e-06,
72
+ "loss": 261.6995,
73
+ "step": 450
74
+ },
75
+ {
76
+ "epoch": 11.11111111111111,
77
+ "grad_norm": 711.677734375,
78
+ "learning_rate": 3.7425e-06,
79
+ "loss": 220.2936,
80
+ "step": 500
81
+ },
82
+ {
83
+ "epoch": 12.222222222222221,
84
+ "grad_norm": 548.9371948242188,
85
+ "learning_rate": 4.117500000000001e-06,
86
+ "loss": 187.9833,
87
+ "step": 550
88
+ },
89
+ {
90
+ "epoch": 13.333333333333334,
91
+ "grad_norm": 1127.1611328125,
92
+ "learning_rate": 4.4925e-06,
93
+ "loss": 159.1351,
94
+ "step": 600
95
+ },
96
+ {
97
+ "epoch": 14.444444444444445,
98
+ "grad_norm": 426.074951171875,
99
+ "learning_rate": 4.8675e-06,
100
+ "loss": 137.1092,
101
+ "step": 650
102
+ },
103
+ {
104
+ "epoch": 15.555555555555555,
105
+ "grad_norm": 348.842529296875,
106
+ "learning_rate": 5.2425e-06,
107
+ "loss": 119.822,
108
+ "step": 700
109
+ },
110
+ {
111
+ "epoch": 16.666666666666668,
112
+ "grad_norm": 373.2169189453125,
113
+ "learning_rate": 5.6175e-06,
114
+ "loss": 104.3366,
115
+ "step": 750
116
+ },
117
+ {
118
+ "epoch": 17.77777777777778,
119
+ "grad_norm": 324.51702880859375,
120
+ "learning_rate": 5.992500000000001e-06,
121
+ "loss": 90.8788,
122
+ "step": 800
123
+ },
124
+ {
125
+ "epoch": 18.88888888888889,
126
+ "grad_norm": 269.91827392578125,
127
+ "learning_rate": 6.3675e-06,
128
+ "loss": 78.4644,
129
+ "step": 850
130
+ },
131
+ {
132
+ "epoch": 20.0,
133
+ "grad_norm": 1744.54052734375,
134
+ "learning_rate": 6.7425e-06,
135
+ "loss": 70.3526,
136
+ "step": 900
137
+ },
138
+ {
139
+ "epoch": 21.11111111111111,
140
+ "grad_norm": 369.39837646484375,
141
+ "learning_rate": 7.1175e-06,
142
+ "loss": 63.9417,
143
+ "step": 950
144
+ },
145
+ {
146
+ "epoch": 22.22222222222222,
147
+ "grad_norm": 303.95977783203125,
148
+ "learning_rate": 7.4925e-06,
149
+ "loss": 63.4575,
150
+ "step": 1000
151
+ },
152
+ {
153
+ "epoch": 23.333333333333332,
154
+ "grad_norm": 187.462890625,
155
+ "learning_rate": 7.8675e-06,
156
+ "loss": 54.7417,
157
+ "step": 1050
158
+ },
159
+ {
160
+ "epoch": 24.444444444444443,
161
+ "grad_norm": 165.56666564941406,
162
+ "learning_rate": 8.2425e-06,
163
+ "loss": 49.6842,
164
+ "step": 1100
165
+ },
166
+ {
167
+ "epoch": 25.555555555555557,
168
+ "grad_norm": 147.3148193359375,
169
+ "learning_rate": 8.6175e-06,
170
+ "loss": 43.027,
171
+ "step": 1150
172
+ },
173
+ {
174
+ "epoch": 26.666666666666668,
175
+ "grad_norm": 125.53775024414062,
176
+ "learning_rate": 8.9925e-06,
177
+ "loss": 38.2579,
178
+ "step": 1200
179
+ },
180
+ {
181
+ "epoch": 27.77777777777778,
182
+ "grad_norm": 109.85810089111328,
183
+ "learning_rate": 9.367500000000001e-06,
184
+ "loss": 34.3957,
185
+ "step": 1250
186
+ },
187
+ {
188
+ "epoch": 28.88888888888889,
189
+ "grad_norm": 98.328369140625,
190
+ "learning_rate": 9.7425e-06,
191
+ "loss": 31.4378,
192
+ "step": 1300
193
+ },
194
+ {
195
+ "epoch": 30.0,
196
+ "grad_norm": 109.77750396728516,
197
+ "learning_rate": 1.01175e-05,
198
+ "loss": 28.5084,
199
+ "step": 1350
200
+ },
201
+ {
202
+ "epoch": 31.11111111111111,
203
+ "grad_norm": 112.47483825683594,
204
+ "learning_rate": 1.04925e-05,
205
+ "loss": 26.1671,
206
+ "step": 1400
207
+ },
208
+ {
209
+ "epoch": 32.22222222222222,
210
+ "grad_norm": 85.60242462158203,
211
+ "learning_rate": 1.08675e-05,
212
+ "loss": 24.2309,
213
+ "step": 1450
214
+ },
215
+ {
216
+ "epoch": 33.333333333333336,
217
+ "grad_norm": 73.19799041748047,
218
+ "learning_rate": 1.1242500000000001e-05,
219
+ "loss": 22.6248,
220
+ "step": 1500
221
+ },
222
+ {
223
+ "epoch": 34.44444444444444,
224
+ "grad_norm": 80.70884704589844,
225
+ "learning_rate": 1.16175e-05,
226
+ "loss": 21.1187,
227
+ "step": 1550
228
+ },
229
+ {
230
+ "epoch": 35.55555555555556,
231
+ "grad_norm": 110.98326110839844,
232
+ "learning_rate": 1.19925e-05,
233
+ "loss": 20.4828,
234
+ "step": 1600
235
+ },
236
+ {
237
+ "epoch": 36.666666666666664,
238
+ "grad_norm": 113.65286254882812,
239
+ "learning_rate": 1.23675e-05,
240
+ "loss": 19.7366,
241
+ "step": 1650
242
+ },
243
+ {
244
+ "epoch": 37.77777777777778,
245
+ "grad_norm": 77.65855407714844,
246
+ "learning_rate": 1.27425e-05,
247
+ "loss": 18.6632,
248
+ "step": 1700
249
+ },
250
+ {
251
+ "epoch": 38.888888888888886,
252
+ "grad_norm": 88.96723175048828,
253
+ "learning_rate": 1.3117500000000001e-05,
254
+ "loss": 18.0793,
255
+ "step": 1750
256
+ },
257
+ {
258
+ "epoch": 40.0,
259
+ "grad_norm": 79.1690902709961,
260
+ "learning_rate": 1.34925e-05,
261
+ "loss": 17.0667,
262
+ "step": 1800
263
+ },
264
+ {
265
+ "epoch": 41.111111111111114,
266
+ "grad_norm": 93.60108184814453,
267
+ "learning_rate": 1.38675e-05,
268
+ "loss": 16.6106,
269
+ "step": 1850
270
+ },
271
+ {
272
+ "epoch": 42.22222222222222,
273
+ "grad_norm": 66.16129302978516,
274
+ "learning_rate": 1.4242500000000001e-05,
275
+ "loss": 16.4905,
276
+ "step": 1900
277
+ },
278
+ {
279
+ "epoch": 43.333333333333336,
280
+ "grad_norm": 62.41362380981445,
281
+ "learning_rate": 1.46175e-05,
282
+ "loss": 15.6427,
283
+ "step": 1950
284
+ },
285
+ {
286
+ "epoch": 44.44444444444444,
287
+ "grad_norm": 107.30168151855469,
288
+ "learning_rate": 1.4992500000000001e-05,
289
+ "loss": 15.0731,
290
+ "step": 2000
291
+ },
292
+ {
293
+ "epoch": 45.55555555555556,
294
+ "grad_norm": 100.18666076660156,
295
+ "learning_rate": 1.2060000000000001e-05,
296
+ "loss": 14.6973,
297
+ "step": 2050
298
+ },
299
+ {
300
+ "epoch": 46.666666666666664,
301
+ "grad_norm": 68.78209686279297,
302
+ "learning_rate": 9.06e-06,
303
+ "loss": 13.8928,
304
+ "step": 2100
305
+ },
306
+ {
307
+ "epoch": 47.77777777777778,
308
+ "grad_norm": 65.85958099365234,
309
+ "learning_rate": 6.0600000000000004e-06,
310
+ "loss": 13.4318,
311
+ "step": 2150
312
+ },
313
+ {
314
+ "epoch": 48.888888888888886,
315
+ "grad_norm": 72.31432342529297,
316
+ "learning_rate": 3.06e-06,
317
+ "loss": 12.6856,
318
+ "step": 2200
319
+ },
320
+ {
321
+ "epoch": 50.0,
322
+ "grad_norm": 50.271549224853516,
323
+ "learning_rate": 6.000000000000001e-08,
324
+ "loss": 12.0923,
325
+ "step": 2250
326
+ }
327
+ ],
328
+ "logging_steps": 50,
329
+ "max_steps": 2250,
330
+ "num_input_tokens_seen": 0,
331
+ "num_train_epochs": 50,
332
+ "save_steps": 200,
333
+ "stateful_callbacks": {
334
+ "TrainerControl": {
335
+ "args": {
336
+ "should_epoch_stop": false,
337
+ "should_evaluate": false,
338
+ "should_log": false,
339
+ "should_save": true,
340
+ "should_training_stop": true
341
+ },
342
+ "attributes": {}
343
+ }
344
+ },
345
+ "total_flos": 6.060676179197952e+19,
346
+ "train_batch_size": 24,
347
+ "trial_name": null,
348
+ "trial_params": null
349
+ }
checkpoint-2250/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7b84367094b7487f77de50fba614a6c6667e9cf018b77ee5bfc158268fc5eaf
3
+ size 5368
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c86f420c1f46d6f0b69c1628aa2bc3d17e96bd9bbaa480bfb7f3ac4ba2c3c4e
3
  size 306699044
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3a52d4cd4386295eedbfb267bc679eca4b27864d745fff06694c0f9dbf823a6
3
  size 306699044
runs/Aug14_17-42-57_2676026c4495/events.out.tfevents.1755193378.2676026c4495.6591.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6333d2a8e567a7e0ba6820d48e67bd616d919e804c37bbbe27e4f49f18a3f884
3
+ size 16890
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e1ce90134704b5a6d94e1dd2b2ac60499368f272f8eda658e4e1ca0663c44cd
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7b84367094b7487f77de50fba614a6c6667e9cf018b77ee5bfc158268fc5eaf
3
  size 5368