Training in progress, step 10500, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 2384234968
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b85521102a01aa9ca0cac30f77dc681cafb77e29acfb0cfb308a2655c5df66d7
|
3 |
size 2384234968
|
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4768663315
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:540ea13aa0494c204cfb6c6b5f87988f9d4d15f8ed6f18e14b57f97e602a0555
|
3 |
size 4768663315
|
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1465
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4dd1ea77e8b79a8e0c06815d69eb9b02aa74cbe131a4af6f145c955f8944e41f
|
3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
@@ -2,9 +2,9 @@
|
|
2 |
"best_global_step": null,
|
3 |
"best_metric": null,
|
4 |
"best_model_checkpoint": null,
|
5 |
-
"epoch": 2.
|
6 |
"eval_steps": 100,
|
7 |
-
"global_step":
|
8 |
"is_hyper_param_search": false,
|
9 |
"is_local_process_zero": true,
|
10 |
"is_world_process_zero": true,
|
@@ -9008,6 +9008,456 @@
|
|
9008 |
"mean_token_accuracy": 0.7658023487776517,
|
9009 |
"num_tokens": 81917952.0,
|
9010 |
"step": 10000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9011 |
}
|
9012 |
],
|
9013 |
"logging_steps": 10,
|
@@ -9027,7 +9477,7 @@
|
|
9027 |
"attributes": {}
|
9028 |
}
|
9029 |
},
|
9030 |
-
"total_flos": 2.
|
9031 |
"train_batch_size": 2,
|
9032 |
"trial_name": null,
|
9033 |
"trial_params": null
|
|
|
2 |
"best_global_step": null,
|
3 |
"best_metric": null,
|
4 |
"best_model_checkpoint": null,
|
5 |
+
"epoch": 2.9736963217332812,
|
6 |
"eval_steps": 100,
|
7 |
+
"global_step": 10500,
|
8 |
"is_hyper_param_search": false,
|
9 |
"is_local_process_zero": true,
|
10 |
"is_world_process_zero": true,
|
|
|
9008 |
"mean_token_accuracy": 0.7658023487776517,
|
9009 |
"num_tokens": 81917952.0,
|
9010 |
"step": 10000
|
9011 |
+
},
|
9012 |
+
{
|
9013 |
+
"epoch": 2.8349205225333662,
|
9014 |
+
"grad_norm": 2.1276166439056396,
|
9015 |
+
"learning_rate": 6.126088324766601e-07,
|
9016 |
+
"loss": 0.1176,
|
9017 |
+
"mean_token_accuracy": 0.7733365941792727,
|
9018 |
+
"num_tokens": 81999872.0,
|
9019 |
+
"step": 10010
|
9020 |
+
},
|
9021 |
+
{
|
9022 |
+
"epoch": 2.8377526817007115,
|
9023 |
+
"grad_norm": 1.4682493209838867,
|
9024 |
+
"learning_rate": 6.021189552082242e-07,
|
9025 |
+
"loss": 0.1185,
|
9026 |
+
"mean_token_accuracy": 0.7733732853084803,
|
9027 |
+
"num_tokens": 82081792.0,
|
9028 |
+
"step": 10020
|
9029 |
+
},
|
9030 |
+
{
|
9031 |
+
"epoch": 2.8405848408680567,
|
9032 |
+
"grad_norm": 1.2164413928985596,
|
9033 |
+
"learning_rate": 5.916290779397881e-07,
|
9034 |
+
"loss": 0.1097,
|
9035 |
+
"mean_token_accuracy": 0.7680772997438907,
|
9036 |
+
"num_tokens": 82163712.0,
|
9037 |
+
"step": 10030
|
9038 |
+
},
|
9039 |
+
{
|
9040 |
+
"epoch": 2.843417000035402,
|
9041 |
+
"grad_norm": 1.2230916023254395,
|
9042 |
+
"learning_rate": 5.811392006713522e-07,
|
9043 |
+
"loss": 0.1209,
|
9044 |
+
"mean_token_accuracy": 0.770596868917346,
|
9045 |
+
"num_tokens": 82245632.0,
|
9046 |
+
"step": 10040
|
9047 |
+
},
|
9048 |
+
{
|
9049 |
+
"epoch": 2.8462491592027472,
|
9050 |
+
"grad_norm": 1.3805590867996216,
|
9051 |
+
"learning_rate": 5.706493234029163e-07,
|
9052 |
+
"loss": 0.1109,
|
9053 |
+
"mean_token_accuracy": 0.7664261229336262,
|
9054 |
+
"num_tokens": 82327552.0,
|
9055 |
+
"step": 10050
|
9056 |
+
},
|
9057 |
+
{
|
9058 |
+
"epoch": 2.8490813183700925,
|
9059 |
+
"grad_norm": 1.2335084676742554,
|
9060 |
+
"learning_rate": 5.601594461344803e-07,
|
9061 |
+
"loss": 0.1087,
|
9062 |
+
"mean_token_accuracy": 0.7922333665192127,
|
9063 |
+
"num_tokens": 82409472.0,
|
9064 |
+
"step": 10060
|
9065 |
+
},
|
9066 |
+
{
|
9067 |
+
"epoch": 2.8519134775374377,
|
9068 |
+
"grad_norm": 1.4766696691513062,
|
9069 |
+
"learning_rate": 5.496695688660443e-07,
|
9070 |
+
"loss": 0.0949,
|
9071 |
+
"mean_token_accuracy": 0.7757460854947567,
|
9072 |
+
"num_tokens": 82491392.0,
|
9073 |
+
"step": 10070
|
9074 |
+
},
|
9075 |
+
{
|
9076 |
+
"epoch": 2.8547456367047825,
|
9077 |
+
"grad_norm": 1.2470474243164062,
|
9078 |
+
"learning_rate": 5.391796915976084e-07,
|
9079 |
+
"loss": 0.112,
|
9080 |
+
"mean_token_accuracy": 0.7856042079627514,
|
9081 |
+
"num_tokens": 82573312.0,
|
9082 |
+
"step": 10080
|
9083 |
+
},
|
9084 |
+
{
|
9085 |
+
"epoch": 2.8575777958721282,
|
9086 |
+
"grad_norm": 1.7810742855072021,
|
9087 |
+
"learning_rate": 5.286898143291724e-07,
|
9088 |
+
"loss": 0.121,
|
9089 |
+
"mean_token_accuracy": 0.7693003930151463,
|
9090 |
+
"num_tokens": 82655232.0,
|
9091 |
+
"step": 10090
|
9092 |
+
},
|
9093 |
+
{
|
9094 |
+
"epoch": 2.860409955039473,
|
9095 |
+
"grad_norm": 1.3474197387695312,
|
9096 |
+
"learning_rate": 5.181999370607364e-07,
|
9097 |
+
"loss": 0.1182,
|
9098 |
+
"mean_token_accuracy": 0.7601394318044186,
|
9099 |
+
"num_tokens": 82737152.0,
|
9100 |
+
"step": 10100
|
9101 |
+
},
|
9102 |
+
{
|
9103 |
+
"epoch": 2.8632421142068183,
|
9104 |
+
"grad_norm": 1.096218466758728,
|
9105 |
+
"learning_rate": 5.077100597923005e-07,
|
9106 |
+
"loss": 0.13,
|
9107 |
+
"mean_token_accuracy": 0.7537181980907917,
|
9108 |
+
"num_tokens": 82819072.0,
|
9109 |
+
"step": 10110
|
9110 |
+
},
|
9111 |
+
{
|
9112 |
+
"epoch": 2.8660742733741635,
|
9113 |
+
"grad_norm": 1.064784049987793,
|
9114 |
+
"learning_rate": 4.972201825238645e-07,
|
9115 |
+
"loss": 0.1348,
|
9116 |
+
"mean_token_accuracy": 0.7513820916414261,
|
9117 |
+
"num_tokens": 82900992.0,
|
9118 |
+
"step": 10120
|
9119 |
+
},
|
9120 |
+
{
|
9121 |
+
"epoch": 2.868906432541509,
|
9122 |
+
"grad_norm": 1.5605591535568237,
|
9123 |
+
"learning_rate": 4.867303052554286e-07,
|
9124 |
+
"loss": 0.141,
|
9125 |
+
"mean_token_accuracy": 0.7740704540163279,
|
9126 |
+
"num_tokens": 82982912.0,
|
9127 |
+
"step": 10130
|
9128 |
+
},
|
9129 |
+
{
|
9130 |
+
"epoch": 2.871738591708854,
|
9131 |
+
"grad_norm": 1.420284390449524,
|
9132 |
+
"learning_rate": 4.7624042798699264e-07,
|
9133 |
+
"loss": 0.11,
|
9134 |
+
"mean_token_accuracy": 0.7743272982537747,
|
9135 |
+
"num_tokens": 83064832.0,
|
9136 |
+
"step": 10140
|
9137 |
+
},
|
9138 |
+
{
|
9139 |
+
"epoch": 2.8745707508761993,
|
9140 |
+
"grad_norm": 1.2748111486434937,
|
9141 |
+
"learning_rate": 4.657505507185566e-07,
|
9142 |
+
"loss": 0.1273,
|
9143 |
+
"mean_token_accuracy": 0.7646037183701992,
|
9144 |
+
"num_tokens": 83146752.0,
|
9145 |
+
"step": 10150
|
9146 |
+
},
|
9147 |
+
{
|
9148 |
+
"epoch": 2.8774029100435445,
|
9149 |
+
"grad_norm": 1.1738097667694092,
|
9150 |
+
"learning_rate": 4.552606734501207e-07,
|
9151 |
+
"loss": 0.1224,
|
9152 |
+
"mean_token_accuracy": 0.7754525426775217,
|
9153 |
+
"num_tokens": 83228672.0,
|
9154 |
+
"step": 10160
|
9155 |
+
},
|
9156 |
+
{
|
9157 |
+
"epoch": 2.88023506921089,
|
9158 |
+
"grad_norm": 1.5003738403320312,
|
9159 |
+
"learning_rate": 4.4477079618168476e-07,
|
9160 |
+
"loss": 0.1128,
|
9161 |
+
"mean_token_accuracy": 0.7775073368102312,
|
9162 |
+
"num_tokens": 83310592.0,
|
9163 |
+
"step": 10170
|
9164 |
+
},
|
9165 |
+
{
|
9166 |
+
"epoch": 2.883067228378235,
|
9167 |
+
"grad_norm": 1.2533864974975586,
|
9168 |
+
"learning_rate": 4.3428091891324873e-07,
|
9169 |
+
"loss": 0.1311,
|
9170 |
+
"mean_token_accuracy": 0.7413649678230285,
|
9171 |
+
"num_tokens": 83392512.0,
|
9172 |
+
"step": 10180
|
9173 |
+
},
|
9174 |
+
{
|
9175 |
+
"epoch": 2.88589938754558,
|
9176 |
+
"grad_norm": 1.5065313577651978,
|
9177 |
+
"learning_rate": 4.237910416448128e-07,
|
9178 |
+
"loss": 0.1546,
|
9179 |
+
"mean_token_accuracy": 0.77030332647264,
|
9180 |
+
"num_tokens": 83474432.0,
|
9181 |
+
"step": 10190
|
9182 |
+
},
|
9183 |
+
{
|
9184 |
+
"epoch": 2.8887315467129255,
|
9185 |
+
"grad_norm": 1.491937518119812,
|
9186 |
+
"learning_rate": 4.133011643763768e-07,
|
9187 |
+
"loss": 0.1268,
|
9188 |
+
"mean_token_accuracy": 0.7824853226542473,
|
9189 |
+
"num_tokens": 83556352.0,
|
9190 |
+
"step": 10200
|
9191 |
+
},
|
9192 |
+
{
|
9193 |
+
"epoch": 2.8915637058802703,
|
9194 |
+
"grad_norm": 1.166266918182373,
|
9195 |
+
"learning_rate": 4.0281128710794085e-07,
|
9196 |
+
"loss": 0.1116,
|
9197 |
+
"mean_token_accuracy": 0.782081701233983,
|
9198 |
+
"num_tokens": 83638272.0,
|
9199 |
+
"step": 10210
|
9200 |
+
},
|
9201 |
+
{
|
9202 |
+
"epoch": 2.8943958650476156,
|
9203 |
+
"grad_norm": 1.42288076877594,
|
9204 |
+
"learning_rate": 3.923214098395049e-07,
|
9205 |
+
"loss": 0.1282,
|
9206 |
+
"mean_token_accuracy": 0.7608121354132891,
|
9207 |
+
"num_tokens": 83720192.0,
|
9208 |
+
"step": 10220
|
9209 |
+
},
|
9210 |
+
{
|
9211 |
+
"epoch": 2.897228024214961,
|
9212 |
+
"grad_norm": 1.6304948329925537,
|
9213 |
+
"learning_rate": 3.818315325710689e-07,
|
9214 |
+
"loss": 0.1231,
|
9215 |
+
"mean_token_accuracy": 0.7633317038416862,
|
9216 |
+
"num_tokens": 83802112.0,
|
9217 |
+
"step": 10230
|
9218 |
+
},
|
9219 |
+
{
|
9220 |
+
"epoch": 2.900060183382306,
|
9221 |
+
"grad_norm": 1.4208807945251465,
|
9222 |
+
"learning_rate": 3.7134165530263297e-07,
|
9223 |
+
"loss": 0.0992,
|
9224 |
+
"mean_token_accuracy": 0.7627568505704403,
|
9225 |
+
"num_tokens": 83884032.0,
|
9226 |
+
"step": 10240
|
9227 |
+
},
|
9228 |
+
{
|
9229 |
+
"epoch": 2.9028923425496513,
|
9230 |
+
"grad_norm": 1.291266679763794,
|
9231 |
+
"learning_rate": 3.6085177803419705e-07,
|
9232 |
+
"loss": 0.11,
|
9233 |
+
"mean_token_accuracy": 0.7762964777648449,
|
9234 |
+
"num_tokens": 83965952.0,
|
9235 |
+
"step": 10250
|
9236 |
+
},
|
9237 |
+
{
|
9238 |
+
"epoch": 2.9057245017169966,
|
9239 |
+
"grad_norm": 1.5174055099487305,
|
9240 |
+
"learning_rate": 3.5036190076576107e-07,
|
9241 |
+
"loss": 0.1259,
|
9242 |
+
"mean_token_accuracy": 0.7930772956460714,
|
9243 |
+
"num_tokens": 84047872.0,
|
9244 |
+
"step": 10260
|
9245 |
+
},
|
9246 |
+
{
|
9247 |
+
"epoch": 2.908556660884342,
|
9248 |
+
"grad_norm": 1.2579764127731323,
|
9249 |
+
"learning_rate": 3.398720234973251e-07,
|
9250 |
+
"loss": 0.1173,
|
9251 |
+
"mean_token_accuracy": 0.7738992158323527,
|
9252 |
+
"num_tokens": 84129792.0,
|
9253 |
+
"step": 10270
|
9254 |
+
},
|
9255 |
+
{
|
9256 |
+
"epoch": 2.9113888200516866,
|
9257 |
+
"grad_norm": 1.7533577680587769,
|
9258 |
+
"learning_rate": 3.2938214622888917e-07,
|
9259 |
+
"loss": 0.1219,
|
9260 |
+
"mean_token_accuracy": 0.7630137003958225,
|
9261 |
+
"num_tokens": 84211712.0,
|
9262 |
+
"step": 10280
|
9263 |
+
},
|
9264 |
+
{
|
9265 |
+
"epoch": 2.9142209792190323,
|
9266 |
+
"grad_norm": 1.3265914916992188,
|
9267 |
+
"learning_rate": 3.188922689604532e-07,
|
9268 |
+
"loss": 0.139,
|
9269 |
+
"mean_token_accuracy": 0.7553082194179297,
|
9270 |
+
"num_tokens": 84293632.0,
|
9271 |
+
"step": 10290
|
9272 |
+
},
|
9273 |
+
{
|
9274 |
+
"epoch": 2.917053138386377,
|
9275 |
+
"grad_norm": 1.803127408027649,
|
9276 |
+
"learning_rate": 3.084023916920172e-07,
|
9277 |
+
"loss": 0.1207,
|
9278 |
+
"mean_token_accuracy": 0.7525073390454053,
|
9279 |
+
"num_tokens": 84375552.0,
|
9280 |
+
"step": 10300
|
9281 |
+
},
|
9282 |
+
{
|
9283 |
+
"epoch": 2.9198852975537224,
|
9284 |
+
"grad_norm": 1.6787763833999634,
|
9285 |
+
"learning_rate": 2.979125144235813e-07,
|
9286 |
+
"loss": 0.1139,
|
9287 |
+
"mean_token_accuracy": 0.7773361060768366,
|
9288 |
+
"num_tokens": 84457472.0,
|
9289 |
+
"step": 10310
|
9290 |
+
},
|
9291 |
+
{
|
9292 |
+
"epoch": 2.9227174567210676,
|
9293 |
+
"grad_norm": 1.486560344696045,
|
9294 |
+
"learning_rate": 2.874226371551453e-07,
|
9295 |
+
"loss": 0.1424,
|
9296 |
+
"mean_token_accuracy": 0.739921722188592,
|
9297 |
+
"num_tokens": 84539392.0,
|
9298 |
+
"step": 10320
|
9299 |
+
},
|
9300 |
+
{
|
9301 |
+
"epoch": 2.925549615888413,
|
9302 |
+
"grad_norm": 1.3302429914474487,
|
9303 |
+
"learning_rate": 2.7693275988670933e-07,
|
9304 |
+
"loss": 0.0954,
|
9305 |
+
"mean_token_accuracy": 0.7770058684051037,
|
9306 |
+
"num_tokens": 84621312.0,
|
9307 |
+
"step": 10330
|
9308 |
+
},
|
9309 |
+
{
|
9310 |
+
"epoch": 2.928381775055758,
|
9311 |
+
"grad_norm": 1.5905101299285889,
|
9312 |
+
"learning_rate": 2.664428826182734e-07,
|
9313 |
+
"loss": 0.1068,
|
9314 |
+
"mean_token_accuracy": 0.7657045032829046,
|
9315 |
+
"num_tokens": 84703232.0,
|
9316 |
+
"step": 10340
|
9317 |
+
},
|
9318 |
+
{
|
9319 |
+
"epoch": 2.9312139342231034,
|
9320 |
+
"grad_norm": 1.2340965270996094,
|
9321 |
+
"learning_rate": 2.559530053498374e-07,
|
9322 |
+
"loss": 0.121,
|
9323 |
+
"mean_token_accuracy": 0.7530577316880226,
|
9324 |
+
"num_tokens": 84785152.0,
|
9325 |
+
"step": 10350
|
9326 |
+
},
|
9327 |
+
{
|
9328 |
+
"epoch": 2.9340460933904486,
|
9329 |
+
"grad_norm": 1.4800512790679932,
|
9330 |
+
"learning_rate": 2.454631280814015e-07,
|
9331 |
+
"loss": 0.1025,
|
9332 |
+
"mean_token_accuracy": 0.783512718975544,
|
9333 |
+
"num_tokens": 84867072.0,
|
9334 |
+
"step": 10360
|
9335 |
+
},
|
9336 |
+
{
|
9337 |
+
"epoch": 2.936878252557794,
|
9338 |
+
"grad_norm": 1.4509563446044922,
|
9339 |
+
"learning_rate": 2.349732508129655e-07,
|
9340 |
+
"loss": 0.1136,
|
9341 |
+
"mean_token_accuracy": 0.7643346376717091,
|
9342 |
+
"num_tokens": 84948992.0,
|
9343 |
+
"step": 10370
|
9344 |
+
},
|
9345 |
+
{
|
9346 |
+
"epoch": 2.939710411725139,
|
9347 |
+
"grad_norm": 1.5300997495651245,
|
9348 |
+
"learning_rate": 2.2448337354452955e-07,
|
9349 |
+
"loss": 0.1394,
|
9350 |
+
"mean_token_accuracy": 0.7647504940629005,
|
9351 |
+
"num_tokens": 85030912.0,
|
9352 |
+
"step": 10380
|
9353 |
+
},
|
9354 |
+
{
|
9355 |
+
"epoch": 2.942542570892484,
|
9356 |
+
"grad_norm": 1.0120151042938232,
|
9357 |
+
"learning_rate": 2.139934962760936e-07,
|
9358 |
+
"loss": 0.1119,
|
9359 |
+
"mean_token_accuracy": 0.7749999992549419,
|
9360 |
+
"num_tokens": 85112832.0,
|
9361 |
+
"step": 10390
|
9362 |
+
},
|
9363 |
+
{
|
9364 |
+
"epoch": 2.9453747300598296,
|
9365 |
+
"grad_norm": 1.1445319652557373,
|
9366 |
+
"learning_rate": 2.0350361900765764e-07,
|
9367 |
+
"loss": 0.1343,
|
9368 |
+
"mean_token_accuracy": 0.7669031299650669,
|
9369 |
+
"num_tokens": 85194752.0,
|
9370 |
+
"step": 10400
|
9371 |
+
},
|
9372 |
+
{
|
9373 |
+
"epoch": 2.9482068892271744,
|
9374 |
+
"grad_norm": 1.1299060583114624,
|
9375 |
+
"learning_rate": 1.9301374173922166e-07,
|
9376 |
+
"loss": 0.1373,
|
9377 |
+
"mean_token_accuracy": 0.7592465754598379,
|
9378 |
+
"num_tokens": 85276672.0,
|
9379 |
+
"step": 10410
|
9380 |
+
},
|
9381 |
+
{
|
9382 |
+
"epoch": 2.9510390483945197,
|
9383 |
+
"grad_norm": 1.0287593603134155,
|
9384 |
+
"learning_rate": 1.8252386447078569e-07,
|
9385 |
+
"loss": 0.1243,
|
9386 |
+
"mean_token_accuracy": 0.7871330726891757,
|
9387 |
+
"num_tokens": 85358592.0,
|
9388 |
+
"step": 10420
|
9389 |
+
},
|
9390 |
+
{
|
9391 |
+
"epoch": 2.953871207561865,
|
9392 |
+
"grad_norm": 1.2568093538284302,
|
9393 |
+
"learning_rate": 1.7203398720234976e-07,
|
9394 |
+
"loss": 0.0979,
|
9395 |
+
"mean_token_accuracy": 0.77096379250288,
|
9396 |
+
"num_tokens": 85440512.0,
|
9397 |
+
"step": 10430
|
9398 |
+
},
|
9399 |
+
{
|
9400 |
+
"epoch": 2.95670336672921,
|
9401 |
+
"grad_norm": 2.05387020111084,
|
9402 |
+
"learning_rate": 1.6154410993391378e-07,
|
9403 |
+
"loss": 0.1146,
|
9404 |
+
"mean_token_accuracy": 0.7754647746682167,
|
9405 |
+
"num_tokens": 85522432.0,
|
9406 |
+
"step": 10440
|
9407 |
+
},
|
9408 |
+
{
|
9409 |
+
"epoch": 2.9595355258965554,
|
9410 |
+
"grad_norm": 1.3246551752090454,
|
9411 |
+
"learning_rate": 1.5105423266547783e-07,
|
9412 |
+
"loss": 0.1108,
|
9413 |
+
"mean_token_accuracy": 0.779549902677536,
|
9414 |
+
"num_tokens": 85604352.0,
|
9415 |
+
"step": 10450
|
9416 |
+
},
|
9417 |
+
{
|
9418 |
+
"epoch": 2.9623676850639007,
|
9419 |
+
"grad_norm": 1.5421769618988037,
|
9420 |
+
"learning_rate": 1.4056435539704185e-07,
|
9421 |
+
"loss": 0.1215,
|
9422 |
+
"mean_token_accuracy": 0.7485934443771839,
|
9423 |
+
"num_tokens": 85686272.0,
|
9424 |
+
"step": 10460
|
9425 |
+
},
|
9426 |
+
{
|
9427 |
+
"epoch": 2.965199844231246,
|
9428 |
+
"grad_norm": 1.457680583000183,
|
9429 |
+
"learning_rate": 1.300744781286059e-07,
|
9430 |
+
"loss": 0.1266,
|
9431 |
+
"mean_token_accuracy": 0.7566780854016543,
|
9432 |
+
"num_tokens": 85768192.0,
|
9433 |
+
"step": 10470
|
9434 |
+
},
|
9435 |
+
{
|
9436 |
+
"epoch": 2.9680320033985907,
|
9437 |
+
"grad_norm": 1.1517871618270874,
|
9438 |
+
"learning_rate": 1.1958460086016993e-07,
|
9439 |
+
"loss": 0.1209,
|
9440 |
+
"mean_token_accuracy": 0.7584148690104484,
|
9441 |
+
"num_tokens": 85850112.0,
|
9442 |
+
"step": 10480
|
9443 |
+
},
|
9444 |
+
{
|
9445 |
+
"epoch": 2.9708641625659364,
|
9446 |
+
"grad_norm": 1.3935081958770752,
|
9447 |
+
"learning_rate": 1.0909472359173399e-07,
|
9448 |
+
"loss": 0.1103,
|
9449 |
+
"mean_token_accuracy": 0.782497552037239,
|
9450 |
+
"num_tokens": 85932032.0,
|
9451 |
+
"step": 10490
|
9452 |
+
},
|
9453 |
+
{
|
9454 |
+
"epoch": 2.9736963217332812,
|
9455 |
+
"grad_norm": 1.209938883781433,
|
9456 |
+
"learning_rate": 9.860484632329804e-08,
|
9457 |
+
"loss": 0.1417,
|
9458 |
+
"mean_token_accuracy": 0.7534368880093097,
|
9459 |
+
"num_tokens": 86013952.0,
|
9460 |
+
"step": 10500
|
9461 |
}
|
9462 |
],
|
9463 |
"logging_steps": 10,
|
|
|
9477 |
"attributes": {}
|
9478 |
}
|
9479 |
},
|
9480 |
+
"total_flos": 2.2731807970767667e+17,
|
9481 |
"train_batch_size": 2,
|
9482 |
"trial_name": null,
|
9483 |
"trial_params": null
|