chansung committed
Commit 0ca1461 · verified · 1 Parent(s): 5cee720

Model save

Files changed (4)
  1. README.md +2 -2
  2. all_results.json +4 -9
  3. train_results.json +4 -4
  4. trainer_state.json +60 -60
README.md CHANGED
@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the generator dataset.
 It achieves the following results on the evaluation set:
-- Loss: 2.2849
+- Loss: 2.2714
 
 ## Model description
 
@@ -57,7 +57,7 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:------:|:----:|:---------------:|
-| 1.4631 | 0.9961 | 129 | 2.2849 |
+| 1.4571 | 0.9961 | 129 | 2.2714 |
 
 
 ### Framework versions
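The README hunks above only swap in the new evaluation numbers (loss 2.2714, final training loss 1.4571). As a purely illustrative sketch of how such a fine-tune of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) could be loaded with transformers (the repo id below is a placeholder, and the checkpoint may actually be saved as a PEFT adapter rather than full weights):

```python
# Hypothetical usage sketch; "your-username/llama3-8b-generator-ft" is a placeholder repo id.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "your-username/llama3-8b-generator-ft"  # placeholder, not this repository's real name
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, device_map="auto")  # device_map="auto" requires accelerate

inputs = tokenizer("Write a one-line summary of gradient descent.", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```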
all_results.json CHANGED
@@ -1,14 +1,9 @@
 {
     "epoch": 0.9961389961389961,
-    "eval_loss": 2.2848541736602783,
-    "eval_runtime": 1.1208,
-    "eval_samples": 15,
-    "eval_samples_per_second": 10.707,
-    "eval_steps_per_second": 0.892,
     "total_flos": 7.61719099628716e+17,
-    "train_loss": 1.5859730927519096,
-    "train_runtime": 2226.6138,
+    "train_loss": 1.5814437108446462,
+    "train_runtime": 642.9915,
     "train_samples": 111440,
-    "train_samples_per_second": 14.886,
-    "train_steps_per_second": 0.058
+    "train_samples_per_second": 51.55,
+    "train_steps_per_second": 0.201
 }
train_results.json CHANGED
@@ -1,9 +1,9 @@
 {
     "epoch": 0.9961389961389961,
     "total_flos": 7.61719099628716e+17,
-    "train_loss": 1.5859730927519096,
-    "train_runtime": 2226.6138,
+    "train_loss": 1.5814437108446462,
+    "train_runtime": 642.9915,
     "train_samples": 111440,
-    "train_samples_per_second": 14.886,
-    "train_steps_per_second": 0.058
+    "train_samples_per_second": 51.55,
+    "train_steps_per_second": 0.201
 }
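As a quick consistency check on the updated throughput figures (a sketch using only values visible in this commit: the 129 training steps from trainer_state.json and the fields of train_results.json), `train_steps_per_second` should be approximately the step count divided by `train_runtime`:

```python
import json

# Recompute steps-per-second from the values recorded in train_results.json.
with open("train_results.json") as f:
    results = json.load(f)

total_steps = 129  # final "step" value in trainer_state.json
recomputed = total_steps / results["train_runtime"]  # 129 / 642.9915 ≈ 0.2006
print(f"reported={results['train_steps_per_second']}  recomputed={recomputed:.3f}")
```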
trainer_state.json CHANGED
@@ -10,202 +10,202 @@
   "log_history": [
     {
       "epoch": 0.007722007722007722,
-      "grad_norm": 0.7566112279891968,
+      "grad_norm": 3.172396421432495,
       "learning_rate": 1.5384615384615387e-05,
-      "loss": 2.1393,
+      "loss": 2.1653,
       "step": 1
     },
     {
       "epoch": 0.03861003861003861,
-      "grad_norm": 1.1990445852279663,
+      "grad_norm": 2.1693484783172607,
       "learning_rate": 7.692307692307693e-05,
-      "loss": 2.1362,
+      "loss": 2.1599,
       "step": 5
     },
     {
       "epoch": 0.07722007722007722,
-      "grad_norm": 0.5208460092544556,
+      "grad_norm": 0.5863114595413208,
       "learning_rate": 0.00015384615384615385,
-      "loss": 2.0814,
+      "loss": 2.0828,
       "step": 10
     },
     {
       "epoch": 0.11583011583011583,
-      "grad_norm": 0.6390016078948975,
+      "grad_norm": 0.6443110108375549,
       "learning_rate": 0.00019985334138511237,
-      "loss": 1.9687,
+      "loss": 1.9643,
       "step": 15
     },
     {
       "epoch": 0.15444015444015444,
-      "grad_norm": 0.6168708801269531,
+      "grad_norm": 0.628454864025116,
       "learning_rate": 0.0001982083682742156,
-      "loss": 1.8165,
+      "loss": 1.813,
       "step": 20
     },
     {
       "epoch": 0.19305019305019305,
-      "grad_norm": 0.5843796133995056,
+      "grad_norm": 0.5933235883712769,
       "learning_rate": 0.00019476531711828027,
-      "loss": 1.6979,
+      "loss": 1.6936,
       "step": 25
     },
     {
       "epoch": 0.23166023166023167,
-      "grad_norm": 0.329062283039093,
+      "grad_norm": 0.31888535618782043,
       "learning_rate": 0.0001895872260758688,
-      "loss": 1.6433,
+      "loss": 1.6382,
       "step": 30
     },
     {
       "epoch": 0.2702702702702703,
-      "grad_norm": 0.35946840047836304,
+      "grad_norm": 0.33554232120513916,
       "learning_rate": 0.00018276889981568906,
-      "loss": 1.5942,
+      "loss": 1.5888,
       "step": 35
     },
     {
       "epoch": 0.3088803088803089,
-      "grad_norm": 0.24955272674560547,
+      "grad_norm": 0.24805951118469238,
       "learning_rate": 0.00017443517375622704,
-      "loss": 1.5573,
+      "loss": 1.5513,
       "step": 40
     },
     {
       "epoch": 0.3474903474903475,
-      "grad_norm": 0.2598620355129242,
+      "grad_norm": 0.2511727213859558,
       "learning_rate": 0.00016473862847818277,
-      "loss": 1.5501,
+      "loss": 1.5439,
       "step": 45
     },
     {
       "epoch": 0.3861003861003861,
-      "grad_norm": 0.21895311772823334,
+      "grad_norm": 0.2083854228258133,
       "learning_rate": 0.00015385679615609042,
-      "loss": 1.5251,
+      "loss": 1.5181,
       "step": 50
     },
     {
       "epoch": 0.4247104247104247,
-      "grad_norm": 0.2305791676044464,
+      "grad_norm": 0.21075661480426788,
       "learning_rate": 0.00014198891015602646,
-      "loss": 1.5191,
+      "loss": 1.5124,
       "step": 55
     },
     {
       "epoch": 0.46332046332046334,
-      "grad_norm": 0.21794967353343964,
+      "grad_norm": 0.20275065302848816,
       "learning_rate": 0.00012935225731039348,
-      "loss": 1.5042,
+      "loss": 1.4972,
       "step": 60
     },
     {
       "epoch": 0.5019305019305019,
-      "grad_norm": 0.22447626292705536,
+      "grad_norm": 0.2260345220565796,
       "learning_rate": 0.0001161781996552765,
-      "loss": 1.4992,
+      "loss": 1.4923,
       "step": 65
     },
     {
       "epoch": 0.5405405405405406,
-      "grad_norm": 0.2490142434835434,
+      "grad_norm": 0.2425995022058487,
       "learning_rate": 0.00010270793846761347,
-      "loss": 1.4967,
+      "loss": 1.4897,
       "step": 70
     },
     {
       "epoch": 0.5791505791505791,
-      "grad_norm": 0.2434530258178711,
+      "grad_norm": 0.23694849014282227,
       "learning_rate": 8.918809815760585e-05,
-      "loss": 1.4842,
+      "loss": 1.4775,
       "step": 75
     },
     {
       "epoch": 0.6177606177606177,
-      "grad_norm": 0.29860490560531616,
+      "grad_norm": 0.26492005586624146,
       "learning_rate": 7.586621087002945e-05,
-      "loss": 1.4808,
+      "loss": 1.474,
       "step": 80
     },
     {
       "epoch": 0.6563706563706564,
-      "grad_norm": 0.2233101725578308,
+      "grad_norm": 0.21545687317848206,
       "learning_rate": 6.298618446600856e-05,
-      "loss": 1.4863,
+      "loss": 1.4795,
       "step": 85
     },
     {
       "epoch": 0.694980694980695,
-      "grad_norm": 0.24076718091964722,
+      "grad_norm": 0.2343147248029709,
       "learning_rate": 5.078383686109926e-05,
-      "loss": 1.4625,
+      "loss": 1.4562,
       "step": 90
     },
     {
       "epoch": 0.7335907335907336,
-      "grad_norm": 0.2131790667772293,
+      "grad_norm": 0.2037033885717392,
       "learning_rate": 3.948257848062351e-05,
-      "loss": 1.4676,
+      "loss": 1.4611,
       "step": 95
     },
     {
       "epoch": 0.7722007722007722,
-      "grad_norm": 0.2535589635372162,
+      "grad_norm": 0.24056674540042877,
       "learning_rate": 2.9289321881345254e-05,
-      "loss": 1.4689,
+      "loss": 1.4629,
       "step": 100
     },
     {
       "epoch": 0.8108108108108109,
-      "grad_norm": 0.2187783420085907,
+      "grad_norm": 0.22205579280853271,
       "learning_rate": 2.0390693429435627e-05,
-      "loss": 1.4613,
+      "loss": 1.4556,
       "step": 105
     },
     {
       "epoch": 0.8494208494208494,
-      "grad_norm": 0.22383730113506317,
+      "grad_norm": 0.2215537130832672,
       "learning_rate": 1.2949616394382802e-05,
-      "loss": 1.4626,
+      "loss": 1.4569,
       "step": 110
     },
     {
       "epoch": 0.888030888030888,
-      "grad_norm": 0.24992400407791138,
+      "grad_norm": 0.2468106597661972,
       "learning_rate": 7.102328018320858e-06,
-      "loss": 1.4698,
+      "loss": 1.4641,
       "step": 115
     },
     {
       "epoch": 0.9266409266409267,
-      "grad_norm": 0.24492764472961426,
+      "grad_norm": 0.23411813378334045,
       "learning_rate": 2.9558851746788517e-06,
-      "loss": 1.4742,
+      "loss": 1.4681,
       "step": 120
     },
     {
       "epoch": 0.9652509652509652,
-      "grad_norm": 0.27442848682403564,
+      "grad_norm": 0.2601025402545929,
       "learning_rate": 5.862042845640403e-07,
-      "loss": 1.4631,
+      "loss": 1.4571,
       "step": 125
     },
     {
       "epoch": 0.9961389961389961,
-      "eval_loss": 2.2848541736602783,
-      "eval_runtime": 1.132,
-      "eval_samples_per_second": 10.601,
-      "eval_steps_per_second": 0.883,
+      "eval_loss": 2.2714059352874756,
+      "eval_runtime": 0.7834,
+      "eval_samples_per_second": 15.317,
+      "eval_steps_per_second": 1.276,
       "step": 129
     },
     {
       "epoch": 0.9961389961389961,
       "step": 129,
       "total_flos": 7.61719099628716e+17,
-      "train_loss": 1.5859730927519096,
-      "train_runtime": 2226.6138,
-      "train_samples_per_second": 14.886,
-      "train_steps_per_second": 0.058
+      "train_loss": 1.5814437108446462,
+      "train_runtime": 642.9915,
+      "train_samples_per_second": 51.55,
+      "train_steps_per_second": 0.201
     }
   ],
   "logging_steps": 5,