|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5.0, |
|
"eval_steps": 500, |
|
"global_step": 21460, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.11649580615097857, |
|
"grad_norm": 2.5263092517852783, |
|
"learning_rate": 4.883504193849022e-05, |
|
"loss": 0.612, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.23299161230195714, |
|
"grad_norm": 2.820680856704712, |
|
"learning_rate": 4.767008387698043e-05, |
|
"loss": 0.4835, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.3494874184529357, |
|
"grad_norm": 2.4191653728485107, |
|
"learning_rate": 4.650512581547065e-05, |
|
"loss": 0.4403, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.4659832246039143, |
|
"grad_norm": 2.0218257904052734, |
|
"learning_rate": 4.534016775396086e-05, |
|
"loss": 0.4163, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5824790307548928, |
|
"grad_norm": 2.193092107772827, |
|
"learning_rate": 4.4175209692451076e-05, |
|
"loss": 0.4056, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.6989748369058714, |
|
"grad_norm": 2.5563910007476807, |
|
"learning_rate": 4.3010251630941286e-05, |
|
"loss": 0.3903, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.8154706430568499, |
|
"grad_norm": 2.4142184257507324, |
|
"learning_rate": 4.1845293569431504e-05, |
|
"loss": 0.3818, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.9319664492078286, |
|
"grad_norm": 2.6781787872314453, |
|
"learning_rate": 4.068033550792172e-05, |
|
"loss": 0.3752, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.048462255358807, |
|
"grad_norm": 3.981492757797241, |
|
"learning_rate": 3.951537744641193e-05, |
|
"loss": 0.3407, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.1649580615097856, |
|
"grad_norm": 2.2366013526916504, |
|
"learning_rate": 3.835041938490215e-05, |
|
"loss": 0.3079, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.281453867660764, |
|
"grad_norm": 2.7982800006866455, |
|
"learning_rate": 3.718546132339236e-05, |
|
"loss": 0.3003, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.3979496738117427, |
|
"grad_norm": 2.147766590118408, |
|
"learning_rate": 3.602050326188258e-05, |
|
"loss": 0.3042, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.5144454799627214, |
|
"grad_norm": 3.3207345008850098, |
|
"learning_rate": 3.485554520037279e-05, |
|
"loss": 0.2994, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.6309412861136998, |
|
"grad_norm": 2.8516502380371094, |
|
"learning_rate": 3.3690587138863e-05, |
|
"loss": 0.3019, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.7474370922646785, |
|
"grad_norm": 2.4767708778381348, |
|
"learning_rate": 3.2525629077353216e-05, |
|
"loss": 0.3029, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.8639328984156571, |
|
"grad_norm": 2.9029157161712646, |
|
"learning_rate": 3.1360671015843426e-05, |
|
"loss": 0.2971, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.9804287045666356, |
|
"grad_norm": 3.0262856483459473, |
|
"learning_rate": 3.0195712954333644e-05, |
|
"loss": 0.2926, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 2.096924510717614, |
|
"grad_norm": 2.862046957015991, |
|
"learning_rate": 2.9030754892823857e-05, |
|
"loss": 0.2322, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 2.213420316868593, |
|
"grad_norm": 2.759274482727051, |
|
"learning_rate": 2.786579683131407e-05, |
|
"loss": 0.2295, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 2.3299161230195713, |
|
"grad_norm": 3.3560988903045654, |
|
"learning_rate": 2.670083876980429e-05, |
|
"loss": 0.2288, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 2.4464119291705497, |
|
"grad_norm": 2.7938876152038574, |
|
"learning_rate": 2.5535880708294503e-05, |
|
"loss": 0.228, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 2.562907735321528, |
|
"grad_norm": 3.100569486618042, |
|
"learning_rate": 2.4370922646784717e-05, |
|
"loss": 0.2288, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 2.679403541472507, |
|
"grad_norm": 2.6765120029449463, |
|
"learning_rate": 2.320596458527493e-05, |
|
"loss": 0.2307, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 2.7958993476234855, |
|
"grad_norm": 2.7146239280700684, |
|
"learning_rate": 2.2041006523765145e-05, |
|
"loss": 0.2275, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 2.9123951537744643, |
|
"grad_norm": 3.6457717418670654, |
|
"learning_rate": 2.087604846225536e-05, |
|
"loss": 0.2328, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 3.0288909599254428, |
|
"grad_norm": 3.6852147579193115, |
|
"learning_rate": 1.9711090400745573e-05, |
|
"loss": 0.2172, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 3.145386766076421, |
|
"grad_norm": 3.0440313816070557, |
|
"learning_rate": 1.854613233923579e-05, |
|
"loss": 0.1749, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 3.2618825722273996, |
|
"grad_norm": 2.364924192428589, |
|
"learning_rate": 1.7381174277726004e-05, |
|
"loss": 0.1767, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 3.3783783783783785, |
|
"grad_norm": 2.6927711963653564, |
|
"learning_rate": 1.6216216216216218e-05, |
|
"loss": 0.172, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 3.494874184529357, |
|
"grad_norm": 3.143772602081299, |
|
"learning_rate": 1.5051258154706432e-05, |
|
"loss": 0.1787, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 3.6113699906803354, |
|
"grad_norm": 2.1713764667510986, |
|
"learning_rate": 1.3886300093196648e-05, |
|
"loss": 0.1724, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 3.7278657968313142, |
|
"grad_norm": 4.549355506896973, |
|
"learning_rate": 1.2721342031686858e-05, |
|
"loss": 0.1744, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 3.8443616029822927, |
|
"grad_norm": 1.8248041868209839, |
|
"learning_rate": 1.1556383970177074e-05, |
|
"loss": 0.1781, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 3.960857409133271, |
|
"grad_norm": 3.255676507949829, |
|
"learning_rate": 1.039142590866729e-05, |
|
"loss": 0.1752, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 4.0773532152842495, |
|
"grad_norm": 4.369929790496826, |
|
"learning_rate": 9.226467847157502e-06, |
|
"loss": 0.1491, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 4.193849021435228, |
|
"grad_norm": 3.5270001888275146, |
|
"learning_rate": 8.061509785647716e-06, |
|
"loss": 0.1355, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 4.310344827586207, |
|
"grad_norm": 3.747748613357544, |
|
"learning_rate": 6.896551724137932e-06, |
|
"loss": 0.1402, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 4.426840633737186, |
|
"grad_norm": 2.4927761554718018, |
|
"learning_rate": 5.731593662628146e-06, |
|
"loss": 0.1341, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 4.543336439888164, |
|
"grad_norm": 3.129467010498047, |
|
"learning_rate": 4.56663560111836e-06, |
|
"loss": 0.1349, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 4.659832246039143, |
|
"grad_norm": 5.417072772979736, |
|
"learning_rate": 3.401677539608574e-06, |
|
"loss": 0.1322, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 4.776328052190121, |
|
"grad_norm": 3.4678492546081543, |
|
"learning_rate": 2.2367194780987884e-06, |
|
"loss": 0.1402, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 4.8928238583410995, |
|
"grad_norm": 3.649914026260376, |
|
"learning_rate": 1.0717614165890028e-06, |
|
"loss": 0.1375, |
|
"step": 21000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 21460, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.096814400558976e+16, |
|
"train_batch_size": 128, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|