|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 18, |
|
"global_step": 213, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.014084507042253521, |
|
"eval_loss": 11.91716480255127, |
|
"eval_runtime": 0.6125, |
|
"eval_samples_per_second": 195.905, |
|
"eval_steps_per_second": 24.488, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.04225352112676056, |
|
"grad_norm": 0.2159075140953064, |
|
"learning_rate": 3e-05, |
|
"loss": 11.9107, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.08450704225352113, |
|
"grad_norm": 0.2119741290807724, |
|
"learning_rate": 6e-05, |
|
"loss": 11.9164, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.1267605633802817, |
|
"grad_norm": 0.21554657816886902, |
|
"learning_rate": 9e-05, |
|
"loss": 11.9129, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.16901408450704225, |
|
"grad_norm": 0.19983112812042236, |
|
"learning_rate": 9.997605179330019e-05, |
|
"loss": 11.9137, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.2112676056338028, |
|
"grad_norm": 0.23735706508159637, |
|
"learning_rate": 9.98503864319978e-05, |
|
"loss": 11.9074, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.2535211267605634, |
|
"grad_norm": 0.28685566782951355, |
|
"learning_rate": 9.961728733030318e-05, |
|
"loss": 11.9116, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.2535211267605634, |
|
"eval_loss": 11.904885292053223, |
|
"eval_runtime": 0.6128, |
|
"eval_samples_per_second": 195.813, |
|
"eval_steps_per_second": 24.477, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.29577464788732394, |
|
"grad_norm": 0.32204487919807434, |
|
"learning_rate": 9.927725684557338e-05, |
|
"loss": 11.9111, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.3380281690140845, |
|
"grad_norm": 0.36651644110679626, |
|
"learning_rate": 9.883102778550434e-05, |
|
"loss": 11.8957, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.38028169014084506, |
|
"grad_norm": 0.5040445923805237, |
|
"learning_rate": 9.82795618288397e-05, |
|
"loss": 11.888, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.4225352112676056, |
|
"grad_norm": 0.5162233114242554, |
|
"learning_rate": 9.762404745283439e-05, |
|
"loss": 11.8833, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.4647887323943662, |
|
"grad_norm": 0.5750740170478821, |
|
"learning_rate": 9.686589737193929e-05, |
|
"loss": 11.879, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.5070422535211268, |
|
"grad_norm": 0.5393357276916504, |
|
"learning_rate": 9.600674549322717e-05, |
|
"loss": 11.8624, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.5070422535211268, |
|
"eval_loss": 11.857889175415039, |
|
"eval_runtime": 0.612, |
|
"eval_samples_per_second": 196.066, |
|
"eval_steps_per_second": 24.508, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.5492957746478874, |
|
"grad_norm": 0.5249887108802795, |
|
"learning_rate": 9.504844339512095e-05, |
|
"loss": 11.8531, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.5915492957746479, |
|
"grad_norm": 0.33441707491874695, |
|
"learning_rate": 9.399305633701373e-05, |
|
"loss": 11.8462, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.6338028169014085, |
|
"grad_norm": 0.2839614152908325, |
|
"learning_rate": 9.284285880837946e-05, |
|
"loss": 11.8347, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.676056338028169, |
|
"grad_norm": 0.2555694878101349, |
|
"learning_rate": 9.160032962696734e-05, |
|
"loss": 11.834, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.7183098591549296, |
|
"grad_norm": 0.23353426158428192, |
|
"learning_rate": 9.026814659664331e-05, |
|
"loss": 11.8343, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.7605633802816901, |
|
"grad_norm": 0.23047791421413422, |
|
"learning_rate": 8.88491807363919e-05, |
|
"loss": 11.8276, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.7605633802816901, |
|
"eval_loss": 11.828319549560547, |
|
"eval_runtime": 0.6154, |
|
"eval_samples_per_second": 194.994, |
|
"eval_steps_per_second": 24.374, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.8028169014084507, |
|
"grad_norm": 0.333280086517334, |
|
"learning_rate": 8.734649009291585e-05, |
|
"loss": 11.8283, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.8450704225352113, |
|
"grad_norm": 0.29108697175979614, |
|
"learning_rate": 8.576331315016753e-05, |
|
"loss": 11.8236, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.8873239436619719, |
|
"grad_norm": 0.2189083993434906, |
|
"learning_rate": 8.410306185001611e-05, |
|
"loss": 11.8169, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.9295774647887324, |
|
"grad_norm": 0.2126510888338089, |
|
"learning_rate": 8.236931423909138e-05, |
|
"loss": 11.819, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.971830985915493, |
|
"grad_norm": 0.22723552584648132, |
|
"learning_rate": 8.05658067576513e-05, |
|
"loss": 11.8203, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 1.0140845070422535, |
|
"grad_norm": 0.20461063086986542, |
|
"learning_rate": 7.86964261870916e-05, |
|
"loss": 11.8133, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 1.0140845070422535, |
|
"eval_loss": 11.81312370300293, |
|
"eval_runtime": 0.6161, |
|
"eval_samples_per_second": 194.759, |
|
"eval_steps_per_second": 24.345, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 1.056338028169014, |
|
"grad_norm": 0.24878963828086853, |
|
"learning_rate": 7.676520127345197e-05, |
|
"loss": 11.8165, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.0985915492957747, |
|
"grad_norm": 0.2186020314693451, |
|
"learning_rate": 7.477629404497048e-05, |
|
"loss": 11.8114, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 1.1408450704225352, |
|
"grad_norm": 0.28986242413520813, |
|
"learning_rate": 7.273399084239878e-05, |
|
"loss": 11.8118, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 1.1830985915492958, |
|
"grad_norm": 0.1976727992296219, |
|
"learning_rate": 7.06426930814083e-05, |
|
"loss": 11.8103, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.2253521126760563, |
|
"grad_norm": 0.17177629470825195, |
|
"learning_rate": 6.850690776699573e-05, |
|
"loss": 11.8052, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 1.267605633802817, |
|
"grad_norm": 0.2314409464597702, |
|
"learning_rate": 6.633123778033061e-05, |
|
"loss": 11.8072, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.267605633802817, |
|
"eval_loss": 11.802838325500488, |
|
"eval_runtime": 0.615, |
|
"eval_samples_per_second": 195.129, |
|
"eval_steps_per_second": 24.391, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.3098591549295775, |
|
"grad_norm": 0.17084774374961853, |
|
"learning_rate": 6.412037195897785e-05, |
|
"loss": 11.807, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 1.352112676056338, |
|
"grad_norm": 0.1774490773677826, |
|
"learning_rate": 6.187907499187356e-05, |
|
"loss": 11.8047, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.3943661971830985, |
|
"grad_norm": 0.1694556325674057, |
|
"learning_rate": 5.961217715083185e-05, |
|
"loss": 11.8025, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.436619718309859, |
|
"grad_norm": 0.32927870750427246, |
|
"learning_rate": 5.732456388071247e-05, |
|
"loss": 11.8011, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.4788732394366197, |
|
"grad_norm": 0.22614863514900208, |
|
"learning_rate": 5.502116527068363e-05, |
|
"loss": 11.8016, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.5211267605633803, |
|
"grad_norm": 0.4875750243663788, |
|
"learning_rate": 5.270694542927088e-05, |
|
"loss": 11.8004, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.5211267605633803, |
|
"eval_loss": 11.798242568969727, |
|
"eval_runtime": 0.6119, |
|
"eval_samples_per_second": 196.101, |
|
"eval_steps_per_second": 24.513, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.563380281690141, |
|
"grad_norm": 0.16188736259937286, |
|
"learning_rate": 5.0386891786090105e-05, |
|
"loss": 11.7992, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.6056338028169015, |
|
"grad_norm": 0.2233981341123581, |
|
"learning_rate": 4.806600434332056e-05, |
|
"loss": 11.7992, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.647887323943662, |
|
"grad_norm": 0.16819994151592255, |
|
"learning_rate": 4.574928490008264e-05, |
|
"loss": 11.798, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.6901408450704225, |
|
"grad_norm": 0.12405195087194443, |
|
"learning_rate": 4.344172627294289e-05, |
|
"loss": 11.8012, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.732394366197183, |
|
"grad_norm": 0.14492444694042206, |
|
"learning_rate": 4.114830153577759e-05, |
|
"loss": 11.7989, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.7746478873239435, |
|
"grad_norm": 0.3022187054157257, |
|
"learning_rate": 3.887395330218429e-05, |
|
"loss": 11.7966, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.7746478873239435, |
|
"eval_loss": 11.796996116638184, |
|
"eval_runtime": 0.6204, |
|
"eval_samples_per_second": 193.41, |
|
"eval_steps_per_second": 24.176, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.8169014084507042, |
|
"grad_norm": 0.34615883231163025, |
|
"learning_rate": 3.6623583073538966e-05, |
|
"loss": 11.7966, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.8591549295774648, |
|
"grad_norm": 0.1670173704624176, |
|
"learning_rate": 3.440204067565511e-05, |
|
"loss": 11.798, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.9014084507042255, |
|
"grad_norm": 0.25132665038108826, |
|
"learning_rate": 3.221411380681007e-05, |
|
"loss": 11.7989, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.943661971830986, |
|
"grad_norm": 0.3825485408306122, |
|
"learning_rate": 3.006451771966383e-05, |
|
"loss": 11.8008, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.9859154929577465, |
|
"grad_norm": 0.14148132503032684, |
|
"learning_rate": 2.79578850593071e-05, |
|
"loss": 11.7995, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 2.028169014084507, |
|
"grad_norm": 0.1258476823568344, |
|
"learning_rate": 2.589875587933892e-05, |
|
"loss": 11.7966, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 2.028169014084507, |
|
"eval_loss": 11.796266555786133, |
|
"eval_runtime": 0.6121, |
|
"eval_samples_per_second": 196.045, |
|
"eval_steps_per_second": 24.506, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 2.0704225352112675, |
|
"grad_norm": 0.29805803298950195, |
|
"learning_rate": 2.3891567857490372e-05, |
|
"loss": 11.7973, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 2.112676056338028, |
|
"grad_norm": 0.13624520599842072, |
|
"learning_rate": 2.194064673188089e-05, |
|
"loss": 11.7978, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.1549295774647885, |
|
"grad_norm": 0.10893828421831131, |
|
"learning_rate": 2.005019697851832e-05, |
|
"loss": 11.7972, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 2.1971830985915495, |
|
"grad_norm": 0.13635849952697754, |
|
"learning_rate": 1.8224292750133743e-05, |
|
"loss": 11.7988, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 2.23943661971831, |
|
"grad_norm": 0.13502788543701172, |
|
"learning_rate": 1.646686909587908e-05, |
|
"loss": 11.7981, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 2.2816901408450705, |
|
"grad_norm": 0.1165008544921875, |
|
"learning_rate": 1.4781713480810184e-05, |
|
"loss": 11.7969, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 2.2816901408450705, |
|
"eval_loss": 11.796056747436523, |
|
"eval_runtime": 0.6117, |
|
"eval_samples_per_second": 196.169, |
|
"eval_steps_per_second": 24.521, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 2.323943661971831, |
|
"grad_norm": 0.19734472036361694, |
|
"learning_rate": 1.3172457623431706e-05, |
|
"loss": 11.7956, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 2.3661971830985915, |
|
"grad_norm": 0.3292880356311798, |
|
"learning_rate": 1.164256966889517e-05, |
|
"loss": 11.7975, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 2.408450704225352, |
|
"grad_norm": 0.24362030625343323, |
|
"learning_rate": 1.0195346714717813e-05, |
|
"loss": 11.7978, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 2.4507042253521125, |
|
"grad_norm": 0.3114156723022461, |
|
"learning_rate": 8.83390770513009e-06, |
|
"loss": 11.7975, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 2.492957746478873, |
|
"grad_norm": 0.26279327273368835, |
|
"learning_rate": 7.561186709365653e-06, |
|
"loss": 11.8017, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 2.535211267605634, |
|
"grad_norm": 0.1340506225824356, |
|
"learning_rate": 6.379926598379726e-06, |
|
"loss": 11.7968, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.535211267605634, |
|
"eval_loss": 11.796035766601562, |
|
"eval_runtime": 0.6227, |
|
"eval_samples_per_second": 192.708, |
|
"eval_steps_per_second": 24.089, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.5774647887323945, |
|
"grad_norm": 0.36095118522644043, |
|
"learning_rate": 5.292673133623371e-06, |
|
"loss": 11.7982, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 2.619718309859155, |
|
"grad_norm": 0.33758965134620667, |
|
"learning_rate": 4.301769480613116e-06, |
|
"loss": 11.7951, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 2.6619718309859155, |
|
"grad_norm": 0.1514846533536911, |
|
"learning_rate": 3.4093511591198445e-06, |
|
"loss": 11.7959, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 2.704225352112676, |
|
"grad_norm": 0.15260806679725647, |
|
"learning_rate": 2.6173414408598827e-06, |
|
"loss": 11.7991, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 2.7464788732394365, |
|
"grad_norm": 0.16173598170280457, |
|
"learning_rate": 1.92744720460688e-06, |
|
"loss": 11.7977, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 2.788732394366197, |
|
"grad_norm": 0.21083351969718933, |
|
"learning_rate": 1.341155257657256e-06, |
|
"loss": 11.8001, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 2.788732394366197, |
|
"eval_loss": 11.795927047729492, |
|
"eval_runtime": 0.6119, |
|
"eval_samples_per_second": 196.107, |
|
"eval_steps_per_second": 24.513, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 2.830985915492958, |
|
"grad_norm": 0.1594826728105545, |
|
"learning_rate": 8.597291315767808e-07, |
|
"loss": 11.8005, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 2.873239436619718, |
|
"grad_norm": 0.20315298438072205, |
|
"learning_rate": 4.842063591339763e-07, |
|
"loss": 11.7946, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 2.915492957746479, |
|
"grad_norm": 0.15598557889461517, |
|
"learning_rate": 2.153962382888841e-07, |
|
"loss": 11.7981, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 2.9577464788732395, |
|
"grad_norm": 0.38635027408599854, |
|
"learning_rate": 5.3878088055947515e-08, |
|
"loss": 11.7958, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.2931283414363861, |
|
"learning_rate": 0.0, |
|
"loss": 11.7986, |
|
"step": 213 |
|
} |
|
], |
|
"logging_steps": 3, |
|
"max_steps": 213, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 18, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1025932492800.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|