chansung commited on
Commit
df4ab9f
·
verified ·
1 Parent(s): df688cc

Model save

Browse files
README.md CHANGED
@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
- - Loss: 9.9389
24
 
25
  ## Model description
26
 
@@ -48,19 +48,22 @@ The following hyperparameters were used during training:
48
  - gradient_accumulation_steps: 2
49
  - total_train_batch_size: 128
50
  - total_eval_batch_size: 64
51
- - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
52
  - lr_scheduler_type: cosine
53
  - lr_scheduler_warmup_ratio: 0.1
54
  - num_epochs: 1
55
 
56
  ### Training results
57
 
 
 
 
58
 
59
 
60
  ### Framework versions
61
 
62
  - PEFT 0.13.1.dev0
63
- - Transformers 4.46.2
64
- - Pytorch 2.5.1+cu124
65
  - Datasets 3.1.0
66
  - Tokenizers 0.20.3
 
20
 
21
  This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 1.7454
24
 
25
  ## Model description
26
 
 
48
  - gradient_accumulation_steps: 2
49
  - total_train_batch_size: 128
50
  - total_eval_batch_size: 64
51
+ - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
52
  - lr_scheduler_type: cosine
53
  - lr_scheduler_warmup_ratio: 0.1
54
  - num_epochs: 1
55
 
56
  ### Training results
57
 
58
+ | Training Loss | Epoch | Step | Validation Loss |
59
+ |:-------------:|:-----:|:----:|:---------------:|
60
+ | 1.0272 | 1.0 | 301 | 1.7454 |
61
 
62
 
63
  ### Framework versions
64
 
65
  - PEFT 0.13.1.dev0
66
+ - Transformers 4.46.3
67
+ - Pytorch 2.3.1+cu121
68
  - Datasets 3.1.0
69
  - Tokenizers 0.20.3
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 9.938864707946777,
4
- "eval_runtime": 1.9434,
5
- "eval_samples": 518,
6
- "eval_samples_per_second": 91.592,
7
- "eval_steps_per_second": 1.544,
8
- "total_flos": 4.268850850782249e+17,
9
- "train_loss": 0.0,
10
- "train_runtime": 0.0104,
11
- "train_samples": 51241,
12
- "train_samples_per_second": 1725286.468,
13
- "train_steps_per_second": 13490.846
14
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "total_flos": 9.178029324081562e+17,
4
+ "train_loss": 3.2307936707604368,
5
+ "train_runtime": 823.5769,
6
+ "train_samples": 116368,
7
+ "train_samples_per_second": 46.729,
8
+ "train_steps_per_second": 0.365
 
 
 
 
 
9
  }
runs/Nov20_23-09-14_main-kasa-gemma7b-coding-0-0/events.out.tfevents.1732163862.main-kasa-gemma7b-coding-0-0.471.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a6ea13034732fa8c5ade12e0e09b20114bc9a99b59430324a2446ae868e35e3c
3
- size 18490
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a44d0f0bb9e8589bfacf302fd482cb0d22adb7ad93c8f56441b95b36aa3a08fb
3
+ size 19115
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
- "total_flos": 4.268850850782249e+17,
4
- "train_loss": 0.0,
5
- "train_runtime": 0.0104,
6
- "train_samples": 51241,
7
- "train_samples_per_second": 1725286.468,
8
- "train_steps_per_second": 13490.846
9
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "total_flos": 9.178029324081562e+17,
4
+ "train_loss": 3.2307936707604368,
5
+ "train_runtime": 823.5769,
6
+ "train_samples": 116368,
7
+ "train_samples_per_second": 46.729,
8
+ "train_steps_per_second": 0.365
9
  }
trainer_state.json CHANGED
@@ -3,226 +3,458 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 140,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.007142857142857143,
13
- "grad_norm": 1138.7742919921875,
14
- "learning_rate": 1.4285714285714285e-05,
15
- "loss": 48.0816,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.03571428571428571,
20
- "grad_norm": 242.92247009277344,
21
- "learning_rate": 7.142857142857143e-05,
22
- "loss": 38.2264,
23
  "step": 5
24
  },
25
  {
26
- "epoch": 0.07142857142857142,
27
- "grad_norm": 99.10001373291016,
28
- "learning_rate": 0.00014285714285714287,
29
- "loss": 26.8907,
30
  "step": 10
31
  },
32
  {
33
- "epoch": 0.10714285714285714,
34
- "grad_norm": 51.8281364440918,
35
- "learning_rate": 0.00019996891820008164,
36
- "loss": 23.0483,
37
  "step": 15
38
  },
39
  {
40
- "epoch": 0.14285714285714285,
41
- "grad_norm": 16.425758361816406,
42
- "learning_rate": 0.00019888308262251285,
43
- "loss": 20.2011,
44
  "step": 20
45
  },
46
  {
47
- "epoch": 0.17857142857142858,
48
- "grad_norm": 8.041358947753906,
49
- "learning_rate": 0.0001962624246950012,
50
- "loss": 18.1936,
51
  "step": 25
52
  },
53
  {
54
- "epoch": 0.21428571428571427,
55
- "grad_norm": 5.986516952514648,
56
- "learning_rate": 0.00019214762118704076,
57
- "loss": 16.4036,
58
  "step": 30
59
  },
60
  {
61
- "epoch": 0.25,
62
- "grad_norm": 3.8124237060546875,
63
- "learning_rate": 0.00018660254037844388,
64
- "loss": 15.1102,
65
  "step": 35
66
  },
67
  {
68
- "epoch": 0.2857142857142857,
69
- "grad_norm": 3.229938268661499,
70
- "learning_rate": 0.00017971325072229226,
71
- "loss": 14.0912,
72
  "step": 40
73
  },
74
  {
75
- "epoch": 0.32142857142857145,
76
- "grad_norm": 3.9065780639648438,
77
- "learning_rate": 0.00017158668492597186,
78
- "loss": 13.3336,
79
  "step": 45
80
  },
81
  {
82
- "epoch": 0.35714285714285715,
83
- "grad_norm": 4.85410213470459,
84
- "learning_rate": 0.00016234898018587337,
85
- "loss": 12.878,
86
  "step": 50
87
  },
88
  {
89
- "epoch": 0.39285714285714285,
90
- "grad_norm": 4.169612884521484,
91
- "learning_rate": 0.0001521435203379498,
92
- "loss": 12.3397,
93
  "step": 55
94
  },
95
  {
96
- "epoch": 0.42857142857142855,
97
- "grad_norm": 2.173008441925049,
98
- "learning_rate": 0.00014112871031306119,
99
- "loss": 11.9716,
100
  "step": 60
101
  },
102
  {
103
- "epoch": 0.4642857142857143,
104
- "grad_norm": 2.008362054824829,
105
- "learning_rate": 0.00012947551744109043,
106
- "loss": 11.6934,
107
  "step": 65
108
  },
109
  {
110
- "epoch": 0.5,
111
- "grad_norm": 3.3055272102355957,
112
- "learning_rate": 0.00011736481776669306,
113
- "loss": 11.4663,
114
  "step": 70
115
  },
116
  {
117
- "epoch": 0.5357142857142857,
118
- "grad_norm": 1.9420216083526611,
119
- "learning_rate": 0.00010498458856606972,
120
- "loss": 11.2446,
121
  "step": 75
122
  },
123
  {
124
- "epoch": 0.5714285714285714,
125
- "grad_norm": 1.257276177406311,
126
- "learning_rate": 9.252699064135758e-05,
127
- "loss": 11.0546,
128
  "step": 80
129
  },
130
  {
131
- "epoch": 0.6071428571428571,
132
- "grad_norm": 1.5491865873336792,
133
- "learning_rate": 8.018538568006027e-05,
134
- "loss": 10.8896,
135
  "step": 85
136
  },
137
  {
138
- "epoch": 0.6428571428571429,
139
- "grad_norm": 21.172082901000977,
140
- "learning_rate": 6.815133497483157e-05,
141
- "loss": 10.7927,
142
  "step": 90
143
  },
144
  {
145
- "epoch": 0.6785714285714286,
146
- "grad_norm": 1.501454472541809,
147
- "learning_rate": 5.6611626088244194e-05,
148
- "loss": 10.7241,
149
  "step": 95
150
  },
151
  {
152
- "epoch": 0.7142857142857143,
153
- "grad_norm": 1.7851982116699219,
154
- "learning_rate": 4.574537361342407e-05,
155
- "loss": 10.5994,
156
  "step": 100
157
  },
158
  {
159
- "epoch": 0.75,
160
- "grad_norm": 2.065108299255371,
161
- "learning_rate": 3.5721239031346066e-05,
162
- "loss": 10.5151,
163
  "step": 105
164
  },
165
  {
166
- "epoch": 0.7857142857142857,
167
- "grad_norm": 1.3877239227294922,
168
- "learning_rate": 2.669481281701739e-05,
169
- "loss": 10.5179,
170
  "step": 110
171
  },
172
  {
173
- "epoch": 0.8214285714285714,
174
- "grad_norm": 1.3329989910125732,
175
- "learning_rate": 1.880619942841435e-05,
176
- "loss": 10.4364,
177
  "step": 115
178
  },
179
  {
180
- "epoch": 0.8571428571428571,
181
- "grad_norm": 1.767424464225769,
182
- "learning_rate": 1.2177842662977135e-05,
183
- "loss": 10.4225,
184
  "step": 120
185
  },
186
  {
187
- "epoch": 0.8928571428571429,
188
- "grad_norm": 1.3661062717437744,
189
- "learning_rate": 6.9126251355795864e-06,
190
- "loss": 10.4211,
191
  "step": 125
192
  },
193
  {
194
- "epoch": 0.9285714285714286,
195
- "grad_norm": 1.1075615882873535,
196
- "learning_rate": 3.092271377092215e-06,
197
- "loss": 10.3691,
198
  "step": 130
199
  },
200
  {
201
- "epoch": 0.9642857142857143,
202
- "grad_norm": 1.3276323080062866,
203
- "learning_rate": 7.760793399827937e-07,
204
- "loss": 10.3975,
205
  "step": 135
206
  },
207
  {
208
- "epoch": 1.0,
209
- "grad_norm": 1.1502283811569214,
210
- "learning_rate": 0.0,
211
- "loss": 10.3696,
212
  "step": 140
213
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  {
215
  "epoch": 1.0,
216
- "step": 140,
217
- "total_flos": 4.268850850782249e+17,
218
- "train_loss": 0.0,
219
- "train_runtime": 0.0104,
220
- "train_samples_per_second": 1725286.468,
221
- "train_steps_per_second": 13490.846
222
  }
223
  ],
224
  "logging_steps": 5,
225
- "max_steps": 140,
226
  "num_input_tokens_seen": 0,
227
  "num_train_epochs": 1,
228
  "save_steps": 100,
@@ -238,7 +470,7 @@
238
  "attributes": {}
239
  }
240
  },
241
- "total_flos": 4.268850850782249e+17,
242
  "train_batch_size": 8,
243
  "trial_name": null,
244
  "trial_params": null
 
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 301,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0033222591362126247,
13
+ "grad_norm": 50.78891372680664,
14
+ "learning_rate": 6.451612903225806e-06,
15
+ "loss": 20.4635,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.016611295681063124,
20
+ "grad_norm": 50.44277572631836,
21
+ "learning_rate": 3.2258064516129034e-05,
22
+ "loss": 20.8825,
23
  "step": 5
24
  },
25
  {
26
+ "epoch": 0.03322259136212625,
27
+ "grad_norm": 21.746339797973633,
28
+ "learning_rate": 6.451612903225807e-05,
29
+ "loss": 18.4425,
30
  "step": 10
31
  },
32
  {
33
+ "epoch": 0.04983388704318937,
34
+ "grad_norm": 8.766548156738281,
35
+ "learning_rate": 9.677419354838711e-05,
36
+ "loss": 16.2833,
37
  "step": 15
38
  },
39
  {
40
+ "epoch": 0.0664451827242525,
41
+ "grad_norm": 6.890467643737793,
42
+ "learning_rate": 0.00012903225806451613,
43
+ "loss": 14.6695,
44
  "step": 20
45
  },
46
  {
47
+ "epoch": 0.08305647840531562,
48
+ "grad_norm": 3.6492936611175537,
49
+ "learning_rate": 0.00016129032258064516,
50
+ "loss": 13.4213,
51
  "step": 25
52
  },
53
  {
54
+ "epoch": 0.09966777408637874,
55
+ "grad_norm": 3.308103561401367,
56
+ "learning_rate": 0.00019354838709677422,
57
+ "loss": 12.7739,
58
  "step": 30
59
  },
60
  {
61
+ "epoch": 0.11627906976744186,
62
+ "grad_norm": 6.093023777008057,
63
+ "learning_rate": 0.0001998917111338525,
64
+ "loss": 11.916,
65
  "step": 35
66
  },
67
  {
68
+ "epoch": 0.132890365448505,
69
+ "grad_norm": 11.974358558654785,
70
+ "learning_rate": 0.00019945218953682734,
71
+ "loss": 11.0787,
72
  "step": 40
73
  },
74
  {
75
+ "epoch": 0.14950166112956811,
76
+ "grad_norm": 20.57221221923828,
77
+ "learning_rate": 0.00019867615321125795,
78
+ "loss": 9.0421,
79
  "step": 45
80
  },
81
  {
82
+ "epoch": 0.16611295681063123,
83
+ "grad_norm": 25.553680419921875,
84
+ "learning_rate": 0.00019756622801842143,
85
+ "loss": 6.0272,
86
  "step": 50
87
  },
88
  {
89
+ "epoch": 0.18272425249169436,
90
+ "grad_norm": 7.413240432739258,
91
+ "learning_rate": 0.0001961261695938319,
92
+ "loss": 3.1666,
93
  "step": 55
94
  },
95
  {
96
+ "epoch": 0.19933554817275748,
97
+ "grad_norm": 6.828798294067383,
98
+ "learning_rate": 0.00019436085063935835,
99
+ "loss": 2.2143,
100
  "step": 60
101
  },
102
  {
103
+ "epoch": 0.2159468438538206,
104
+ "grad_norm": 4.01786470413208,
105
+ "learning_rate": 0.00019227624443554425,
106
+ "loss": 1.9151,
107
  "step": 65
108
  },
109
  {
110
+ "epoch": 0.23255813953488372,
111
+ "grad_norm": 2.935255765914917,
112
+ "learning_rate": 0.0001898794046299167,
113
+ "loss": 1.6848,
114
  "step": 70
115
  },
116
  {
117
+ "epoch": 0.24916943521594684,
118
+ "grad_norm": 1.8950200080871582,
119
+ "learning_rate": 0.00018717844136967624,
120
+ "loss": 1.5136,
121
  "step": 75
122
  },
123
  {
124
+ "epoch": 0.26578073089701,
125
+ "grad_norm": 0.5441950559616089,
126
+ "learning_rate": 0.00018418249385952575,
127
+ "loss": 1.4275,
128
  "step": 80
129
  },
130
  {
131
+ "epoch": 0.2823920265780731,
132
+ "grad_norm": 0.7461830377578735,
133
+ "learning_rate": 0.00018090169943749476,
134
+ "loss": 1.3593,
135
  "step": 85
136
  },
137
  {
138
+ "epoch": 0.29900332225913623,
139
+ "grad_norm": 0.5518223643302917,
140
+ "learning_rate": 0.0001773471592733964,
141
+ "loss": 1.2975,
142
  "step": 90
143
  },
144
  {
145
+ "epoch": 0.31561461794019935,
146
+ "grad_norm": 0.4475560486316681,
147
+ "learning_rate": 0.0001735309008059829,
148
+ "loss": 1.2742,
149
  "step": 95
150
  },
151
  {
152
+ "epoch": 0.33222591362126247,
153
+ "grad_norm": 0.6053428053855896,
154
+ "learning_rate": 0.00016946583704589973,
155
+ "loss": 1.2478,
156
  "step": 100
157
  },
158
  {
159
+ "epoch": 0.3488372093023256,
160
+ "grad_norm": 0.7606108784675598,
161
+ "learning_rate": 0.00016516572288214552,
162
+ "loss": 1.2128,
163
  "step": 105
164
  },
165
  {
166
+ "epoch": 0.3654485049833887,
167
+ "grad_norm": 0.4854182004928589,
168
+ "learning_rate": 0.00016064510853988138,
169
+ "loss": 1.2048,
170
  "step": 110
171
  },
172
  {
173
+ "epoch": 0.38205980066445183,
174
+ "grad_norm": 1.0294697284698486,
175
+ "learning_rate": 0.0001559192903470747,
176
+ "loss": 1.1671,
177
  "step": 115
178
  },
179
  {
180
+ "epoch": 0.39867109634551495,
181
+ "grad_norm": 0.45866039395332336,
182
+ "learning_rate": 0.00015100425897656753,
183
+ "loss": 1.1478,
184
  "step": 120
185
  },
186
  {
187
+ "epoch": 0.4152823920265781,
188
+ "grad_norm": 0.8675262331962585,
189
+ "learning_rate": 0.00014591664533870118,
190
+ "loss": 1.13,
191
  "step": 125
192
  },
193
  {
194
+ "epoch": 0.4318936877076412,
195
+ "grad_norm": 0.4108262062072754,
196
+ "learning_rate": 0.00014067366430758004,
197
+ "loss": 1.1204,
198
  "step": 130
199
  },
200
  {
201
+ "epoch": 0.4485049833887043,
202
+ "grad_norm": 0.5368378758430481,
203
+ "learning_rate": 0.00013529305647138687,
204
+ "loss": 1.115,
205
  "step": 135
206
  },
207
  {
208
+ "epoch": 0.46511627906976744,
209
+ "grad_norm": 0.7188141942024231,
210
+ "learning_rate": 0.0001297930281038482,
211
+ "loss": 1.1142,
212
  "step": 140
213
  },
214
+ {
215
+ "epoch": 0.48172757475083056,
216
+ "grad_norm": 0.922228217124939,
217
+ "learning_rate": 0.00012419218955996676,
218
+ "loss": 1.098,
219
+ "step": 145
220
+ },
221
+ {
222
+ "epoch": 0.4983388704318937,
223
+ "grad_norm": 0.645747721195221,
224
+ "learning_rate": 0.00011850949230447145,
225
+ "loss": 1.0962,
226
+ "step": 150
227
+ },
228
+ {
229
+ "epoch": 0.5149501661129569,
230
+ "grad_norm": 0.8197333812713623,
231
+ "learning_rate": 0.00011276416478605949,
232
+ "loss": 1.0953,
233
+ "step": 155
234
+ },
235
+ {
236
+ "epoch": 0.53156146179402,
237
+ "grad_norm": 0.6454654932022095,
238
+ "learning_rate": 0.00010697564737441252,
239
+ "loss": 1.0993,
240
+ "step": 160
241
+ },
242
+ {
243
+ "epoch": 0.5481727574750831,
244
+ "grad_norm": 1.0970648527145386,
245
+ "learning_rate": 0.00010116352658013973,
246
+ "loss": 1.0815,
247
+ "step": 165
248
+ },
249
+ {
250
+ "epoch": 0.5647840531561462,
251
+ "grad_norm": 0.427190899848938,
252
+ "learning_rate": 9.534746878022534e-05,
253
+ "loss": 1.0915,
254
+ "step": 170
255
+ },
256
+ {
257
+ "epoch": 0.5813953488372093,
258
+ "grad_norm": 0.40045276284217834,
259
+ "learning_rate": 8.954715367323468e-05,
260
+ "loss": 1.0804,
261
+ "step": 175
262
+ },
263
+ {
264
+ "epoch": 0.5980066445182725,
265
+ "grad_norm": 0.8688796758651733,
266
+ "learning_rate": 8.378220768944327e-05,
267
+ "loss": 1.0561,
268
+ "step": 180
269
+ },
270
+ {
271
+ "epoch": 0.6146179401993356,
272
+ "grad_norm": 0.6383819580078125,
273
+ "learning_rate": 7.807213758120966e-05,
274
+ "loss": 1.0619,
275
+ "step": 185
276
+ },
277
+ {
278
+ "epoch": 0.6312292358803987,
279
+ "grad_norm": 0.624631404876709,
280
+ "learning_rate": 7.243626441830009e-05,
281
+ "loss": 1.0538,
282
+ "step": 190
283
+ },
284
+ {
285
+ "epoch": 0.6478405315614618,
286
+ "grad_norm": 0.43649202585220337,
287
+ "learning_rate": 6.68936582115042e-05,
288
+ "loss": 1.0518,
289
+ "step": 195
290
+ },
291
+ {
292
+ "epoch": 0.6644518272425249,
293
+ "grad_norm": 0.7624740600585938,
294
+ "learning_rate": 6.146307338575519e-05,
295
+ "loss": 1.0323,
296
+ "step": 200
297
+ },
298
+ {
299
+ "epoch": 0.6810631229235881,
300
+ "grad_norm": 0.5153250098228455,
301
+ "learning_rate": 5.616288532109225e-05,
302
+ "loss": 1.0534,
303
+ "step": 205
304
+ },
305
+ {
306
+ "epoch": 0.6976744186046512,
307
+ "grad_norm": 0.5389082431793213,
308
+ "learning_rate": 5.101102817619131e-05,
309
+ "loss": 1.0411,
310
+ "step": 210
311
+ },
312
+ {
313
+ "epoch": 0.7142857142857143,
314
+ "grad_norm": 0.539851725101471,
315
+ "learning_rate": 4.6024934204848745e-05,
316
+ "loss": 1.0291,
317
+ "step": 215
318
+ },
319
+ {
320
+ "epoch": 0.7308970099667774,
321
+ "grad_norm": 0.5766161680221558,
322
+ "learning_rate": 4.12214747707527e-05,
323
+ "loss": 1.0463,
324
+ "step": 220
325
+ },
326
+ {
327
+ "epoch": 0.7475083056478405,
328
+ "grad_norm": 0.4088208079338074,
329
+ "learning_rate": 3.661690326012897e-05,
330
+ "loss": 1.0435,
331
+ "step": 225
332
+ },
333
+ {
334
+ "epoch": 0.7641196013289037,
335
+ "grad_norm": 0.5515505075454712,
336
+ "learning_rate": 3.222680008542678e-05,
337
+ "loss": 1.0276,
338
+ "step": 230
339
+ },
340
+ {
341
+ "epoch": 0.7807308970099668,
342
+ "grad_norm": 0.3717176914215088,
343
+ "learning_rate": 2.8066019966134904e-05,
344
+ "loss": 1.0297,
345
+ "step": 235
346
+ },
347
+ {
348
+ "epoch": 0.7973421926910299,
349
+ "grad_norm": 0.42352986335754395,
350
+ "learning_rate": 2.4148641665113113e-05,
351
+ "loss": 1.0233,
352
+ "step": 240
353
+ },
354
+ {
355
+ "epoch": 0.813953488372093,
356
+ "grad_norm": 0.5951583385467529,
357
+ "learning_rate": 2.0487920350515212e-05,
358
+ "loss": 1.0306,
359
+ "step": 245
360
+ },
361
+ {
362
+ "epoch": 0.8305647840531561,
363
+ "grad_norm": 0.4466446042060852,
364
+ "learning_rate": 1.7096242744495837e-05,
365
+ "loss": 1.031,
366
+ "step": 250
367
+ },
368
+ {
369
+ "epoch": 0.8471760797342193,
370
+ "grad_norm": 0.4255996644496918,
371
+ "learning_rate": 1.3985085210463477e-05,
372
+ "loss": 1.0327,
373
+ "step": 255
374
+ },
375
+ {
376
+ "epoch": 0.8637873754152824,
377
+ "grad_norm": 0.4321277141571045,
378
+ "learning_rate": 1.116497492069961e-05,
379
+ "loss": 1.0231,
380
+ "step": 260
381
+ },
382
+ {
383
+ "epoch": 0.8803986710963455,
384
+ "grad_norm": 0.47173938155174255,
385
+ "learning_rate": 8.645454235739903e-06,
386
+ "loss": 1.0402,
387
+ "step": 265
388
+ },
389
+ {
390
+ "epoch": 0.8970099667774086,
391
+ "grad_norm": 0.46528318524360657,
392
+ "learning_rate": 6.435048416046863e-06,
393
+ "loss": 1.03,
394
+ "step": 270
395
+ },
396
+ {
397
+ "epoch": 0.9136212624584718,
398
+ "grad_norm": 0.5179603099822998,
399
+ "learning_rate": 4.541236775226809e-06,
400
+ "loss": 1.025,
401
+ "step": 275
402
+ },
403
+ {
404
+ "epoch": 0.9302325581395349,
405
+ "grad_norm": 0.4209299385547638,
406
+ "learning_rate": 2.970427372400353e-06,
407
+ "loss": 1.0186,
408
+ "step": 280
409
+ },
410
+ {
411
+ "epoch": 0.946843853820598,
412
+ "grad_norm": 0.37040743231773376,
413
+ "learning_rate": 1.7279353293586765e-06,
414
+ "loss": 1.024,
415
+ "step": 285
416
+ },
417
+ {
418
+ "epoch": 0.9634551495016611,
419
+ "grad_norm": 0.37352651357650757,
420
+ "learning_rate": 8.17964845873831e-07,
421
+ "loss": 1.0293,
422
+ "step": 290
423
+ },
424
+ {
425
+ "epoch": 0.9800664451827242,
426
+ "grad_norm": 0.41717350482940674,
427
+ "learning_rate": 2.4359497401758024e-07,
428
+ "loss": 1.0289,
429
+ "step": 295
430
+ },
431
+ {
432
+ "epoch": 0.9966777408637874,
433
+ "grad_norm": 0.497938334941864,
434
+ "learning_rate": 6.769199623779532e-09,
435
+ "loss": 1.0272,
436
+ "step": 300
437
+ },
438
+ {
439
+ "epoch": 1.0,
440
+ "eval_loss": 1.7453839778900146,
441
+ "eval_runtime": 0.4824,
442
+ "eval_samples_per_second": 20.73,
443
+ "eval_steps_per_second": 2.073,
444
+ "step": 301
445
+ },
446
  {
447
  "epoch": 1.0,
448
+ "step": 301,
449
+ "total_flos": 9.178029324081562e+17,
450
+ "train_loss": 3.2307936707604368,
451
+ "train_runtime": 823.5769,
452
+ "train_samples_per_second": 46.729,
453
+ "train_steps_per_second": 0.365
454
  }
455
  ],
456
  "logging_steps": 5,
457
+ "max_steps": 301,
458
  "num_input_tokens_seen": 0,
459
  "num_train_epochs": 1,
460
  "save_steps": 100,
 
470
  "attributes": {}
471
  }
472
  },
473
+ "total_flos": 9.178029324081562e+17,
474
  "train_batch_size": 8,
475
  "trial_name": null,
476
  "trial_params": null