|
{ |
|
"best_metric": 1.1142354011535645, |
|
"best_model_checkpoint": "output-definitions/checkpoint-3000", |
|
"epoch": 0.158798575349924, |
|
"eval_steps": 100, |
|
"global_step": 3500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004537102152854971, |
|
"grad_norm": 0.5996940732002258, |
|
"learning_rate": 9.90495247623812e-06, |
|
"loss": 2.0003, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.004537102152854971, |
|
"eval_loss": 1.3947523832321167, |
|
"eval_runtime": 3495.2941, |
|
"eval_samples_per_second": 33.15, |
|
"eval_steps_per_second": 4.144, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.009074204305709942, |
|
"grad_norm": 1.2082221508026123, |
|
"learning_rate": 9.804902451225614e-06, |
|
"loss": 1.1884, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.009074204305709942, |
|
"eval_loss": 1.2319905757904053, |
|
"eval_runtime": 3493.2433, |
|
"eval_samples_per_second": 33.17, |
|
"eval_steps_per_second": 4.146, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.013611306458564915, |
|
"grad_norm": 0.5810414552688599, |
|
"learning_rate": 9.704852426213108e-06, |
|
"loss": 1.0592, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.013611306458564915, |
|
"eval_loss": 1.1850800514221191, |
|
"eval_runtime": 3493.3223, |
|
"eval_samples_per_second": 33.169, |
|
"eval_steps_per_second": 4.146, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.018148408611419885, |
|
"grad_norm": 0.6719651818275452, |
|
"learning_rate": 9.604802401200602e-06, |
|
"loss": 1.036, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.018148408611419885, |
|
"eval_loss": 1.168699860572815, |
|
"eval_runtime": 3494.1614, |
|
"eval_samples_per_second": 33.161, |
|
"eval_steps_per_second": 4.145, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.022685510764274858, |
|
"grad_norm": 0.7384809851646423, |
|
"learning_rate": 9.504752376188094e-06, |
|
"loss": 1.0105, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.022685510764274858, |
|
"eval_loss": 1.1580127477645874, |
|
"eval_runtime": 3518.2589, |
|
"eval_samples_per_second": 32.934, |
|
"eval_steps_per_second": 4.117, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.02722261291712983, |
|
"grad_norm": 0.8317062258720398, |
|
"learning_rate": 9.404702351175588e-06, |
|
"loss": 0.9879, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.02722261291712983, |
|
"eval_loss": 1.148302674293518, |
|
"eval_runtime": 3494.3971, |
|
"eval_samples_per_second": 33.159, |
|
"eval_steps_per_second": 4.145, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.031759715069984804, |
|
"grad_norm": 0.6463008522987366, |
|
"learning_rate": 9.304652326163084e-06, |
|
"loss": 0.9698, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.031759715069984804, |
|
"eval_loss": 1.1414070129394531, |
|
"eval_runtime": 3494.7514, |
|
"eval_samples_per_second": 33.155, |
|
"eval_steps_per_second": 4.145, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.03629681722283977, |
|
"grad_norm": 0.705141544342041, |
|
"learning_rate": 9.204602301150576e-06, |
|
"loss": 0.9458, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.03629681722283977, |
|
"eval_loss": 1.1373766660690308, |
|
"eval_runtime": 3493.8551, |
|
"eval_samples_per_second": 33.164, |
|
"eval_steps_per_second": 4.146, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.04083391937569474, |
|
"grad_norm": 0.710448145866394, |
|
"learning_rate": 9.10455227613807e-06, |
|
"loss": 0.9262, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.04083391937569474, |
|
"eval_loss": 1.1317179203033447, |
|
"eval_runtime": 3494.4581, |
|
"eval_samples_per_second": 33.158, |
|
"eval_steps_per_second": 4.145, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.045371021528549715, |
|
"grad_norm": 0.8912506103515625, |
|
"learning_rate": 9.004502251125564e-06, |
|
"loss": 0.9281, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.045371021528549715, |
|
"eval_loss": 1.133247971534729, |
|
"eval_runtime": 3494.4137, |
|
"eval_samples_per_second": 33.159, |
|
"eval_steps_per_second": 4.145, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.04990812368140469, |
|
"grad_norm": 0.8533815145492554, |
|
"learning_rate": 8.904452226113058e-06, |
|
"loss": 0.9204, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.04990812368140469, |
|
"eval_loss": 1.1253483295440674, |
|
"eval_runtime": 3493.2383, |
|
"eval_samples_per_second": 33.17, |
|
"eval_steps_per_second": 4.146, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.05444522583425966, |
|
"grad_norm": 0.7760438323020935, |
|
"learning_rate": 8.804402201100552e-06, |
|
"loss": 0.8959, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.05444522583425966, |
|
"eval_loss": 1.1242249011993408, |
|
"eval_runtime": 3493.0538, |
|
"eval_samples_per_second": 33.172, |
|
"eval_steps_per_second": 4.147, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.05898232798711463, |
|
"grad_norm": 0.8464282751083374, |
|
"learning_rate": 8.704352176088044e-06, |
|
"loss": 0.8889, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.05898232798711463, |
|
"eval_loss": 1.1236740350723267, |
|
"eval_runtime": 3492.632, |
|
"eval_samples_per_second": 33.176, |
|
"eval_steps_per_second": 4.147, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.06351943013996961, |
|
"grad_norm": 0.7117558121681213, |
|
"learning_rate": 8.604302151075538e-06, |
|
"loss": 0.8898, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.06351943013996961, |
|
"eval_loss": 1.1239374876022339, |
|
"eval_runtime": 3492.8289, |
|
"eval_samples_per_second": 33.174, |
|
"eval_steps_per_second": 4.147, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.06805653229282457, |
|
"grad_norm": 0.8221879005432129, |
|
"learning_rate": 8.504252126063032e-06, |
|
"loss": 0.8862, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.06805653229282457, |
|
"eval_loss": 1.1258162260055542, |
|
"eval_runtime": 3492.1621, |
|
"eval_samples_per_second": 33.18, |
|
"eval_steps_per_second": 4.148, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.07259363444567954, |
|
"grad_norm": 0.6950238347053528, |
|
"learning_rate": 8.404202101050526e-06, |
|
"loss": 0.8745, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.07259363444567954, |
|
"eval_loss": 1.115903615951538, |
|
"eval_runtime": 3492.7622, |
|
"eval_samples_per_second": 33.174, |
|
"eval_steps_per_second": 4.147, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.07713073659853452, |
|
"grad_norm": 0.8201397061347961, |
|
"learning_rate": 8.30415207603802e-06, |
|
"loss": 0.8764, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.07713073659853452, |
|
"eval_loss": 1.1180561780929565, |
|
"eval_runtime": 3492.8242, |
|
"eval_samples_per_second": 33.174, |
|
"eval_steps_per_second": 4.147, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.08166783875138948, |
|
"grad_norm": 0.9200800061225891, |
|
"learning_rate": 8.204102051025512e-06, |
|
"loss": 0.8512, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.08166783875138948, |
|
"eval_loss": 1.1254093647003174, |
|
"eval_runtime": 3493.1426, |
|
"eval_samples_per_second": 33.171, |
|
"eval_steps_per_second": 4.146, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.08620494090424446, |
|
"grad_norm": 0.8169254064559937, |
|
"learning_rate": 8.104052026013006e-06, |
|
"loss": 0.8587, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.08620494090424446, |
|
"eval_loss": 1.121141791343689, |
|
"eval_runtime": 3493.1336, |
|
"eval_samples_per_second": 33.171, |
|
"eval_steps_per_second": 4.146, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.09074204305709943, |
|
"grad_norm": 0.9403719305992126, |
|
"learning_rate": 8.0040020010005e-06, |
|
"loss": 0.8577, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.09074204305709943, |
|
"eval_loss": 1.1173222064971924, |
|
"eval_runtime": 3494.3505, |
|
"eval_samples_per_second": 33.159, |
|
"eval_steps_per_second": 4.145, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.0952791452099544, |
|
"grad_norm": 0.8594589233398438, |
|
"learning_rate": 7.903951975987994e-06, |
|
"loss": 0.85, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.0952791452099544, |
|
"eval_loss": 1.1184200048446655, |
|
"eval_runtime": 3493.8891, |
|
"eval_samples_per_second": 33.164, |
|
"eval_steps_per_second": 4.146, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.09981624736280938, |
|
"grad_norm": 0.7728345990180969, |
|
"learning_rate": 7.803901950975488e-06, |
|
"loss": 0.8418, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.09981624736280938, |
|
"eval_loss": 1.1176252365112305, |
|
"eval_runtime": 3496.1473, |
|
"eval_samples_per_second": 33.142, |
|
"eval_steps_per_second": 4.143, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.10435334951566434, |
|
"grad_norm": 0.826181948184967, |
|
"learning_rate": 7.703851925962982e-06, |
|
"loss": 0.8407, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.10435334951566434, |
|
"eval_loss": 1.117732048034668, |
|
"eval_runtime": 3505.8694, |
|
"eval_samples_per_second": 33.05, |
|
"eval_steps_per_second": 4.131, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.10889045166851932, |
|
"grad_norm": 0.8817082047462463, |
|
"learning_rate": 7.603801900950476e-06, |
|
"loss": 0.8391, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.10889045166851932, |
|
"eval_loss": 1.1185327768325806, |
|
"eval_runtime": 3511.4803, |
|
"eval_samples_per_second": 32.997, |
|
"eval_steps_per_second": 4.125, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.11342755382137429, |
|
"grad_norm": 1.1382585763931274, |
|
"learning_rate": 7.503751875937969e-06, |
|
"loss": 0.8439, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.11342755382137429, |
|
"eval_loss": 1.118537187576294, |
|
"eval_runtime": 3501.1131, |
|
"eval_samples_per_second": 33.095, |
|
"eval_steps_per_second": 4.137, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.11796465597422925, |
|
"grad_norm": 0.9408033490180969, |
|
"learning_rate": 7.403701850925464e-06, |
|
"loss": 0.8309, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.11796465597422925, |
|
"eval_loss": 1.112176775932312, |
|
"eval_runtime": 3495.6002, |
|
"eval_samples_per_second": 33.147, |
|
"eval_steps_per_second": 4.143, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.12250175812708423, |
|
"grad_norm": 0.9824351072311401, |
|
"learning_rate": 7.303651825912958e-06, |
|
"loss": 0.8282, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.12250175812708423, |
|
"eval_loss": 1.1151224374771118, |
|
"eval_runtime": 3494.7487, |
|
"eval_samples_per_second": 33.155, |
|
"eval_steps_per_second": 4.145, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.12703886027993921, |
|
"grad_norm": 0.832771897315979, |
|
"learning_rate": 7.203601800900451e-06, |
|
"loss": 0.8253, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.12703886027993921, |
|
"eval_loss": 1.1167434453964233, |
|
"eval_runtime": 3496.4871, |
|
"eval_samples_per_second": 33.139, |
|
"eval_steps_per_second": 4.142, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.13157596243279418, |
|
"grad_norm": 0.7724223732948303, |
|
"learning_rate": 7.103551775887945e-06, |
|
"loss": 0.8317, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.13157596243279418, |
|
"eval_loss": 1.1156556606292725, |
|
"eval_runtime": 3495.6568, |
|
"eval_samples_per_second": 33.147, |
|
"eval_steps_per_second": 4.143, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.13611306458564915, |
|
"grad_norm": 0.84904944896698, |
|
"learning_rate": 7.003501750875439e-06, |
|
"loss": 0.8296, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.13611306458564915, |
|
"eval_loss": 1.1142354011535645, |
|
"eval_runtime": 3495.4619, |
|
"eval_samples_per_second": 33.149, |
|
"eval_steps_per_second": 4.144, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.1406501667385041, |
|
"grad_norm": 1.1638028621673584, |
|
"learning_rate": 6.903451725862932e-06, |
|
"loss": 0.8194, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.1406501667385041, |
|
"eval_loss": 1.1132752895355225, |
|
"eval_runtime": 3496.3654, |
|
"eval_samples_per_second": 33.14, |
|
"eval_steps_per_second": 4.143, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.14518726889135908, |
|
"grad_norm": 0.9577890634536743, |
|
"learning_rate": 6.803401700850426e-06, |
|
"loss": 0.8197, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.14518726889135908, |
|
"eval_loss": 1.1138397455215454, |
|
"eval_runtime": 3495.7388, |
|
"eval_samples_per_second": 33.146, |
|
"eval_steps_per_second": 4.143, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.14972437104421407, |
|
"grad_norm": 0.9326839447021484, |
|
"learning_rate": 6.703351675837919e-06, |
|
"loss": 0.7983, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.14972437104421407, |
|
"eval_loss": 1.1186929941177368, |
|
"eval_runtime": 3495.9382, |
|
"eval_samples_per_second": 33.144, |
|
"eval_steps_per_second": 4.143, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.15426147319706904, |
|
"grad_norm": 0.806262195110321, |
|
"learning_rate": 6.603301650825413e-06, |
|
"loss": 0.8024, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.15426147319706904, |
|
"eval_loss": 1.1130287647247314, |
|
"eval_runtime": 3496.2696, |
|
"eval_samples_per_second": 33.141, |
|
"eval_steps_per_second": 4.143, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.158798575349924, |
|
"grad_norm": 0.9623129963874817, |
|
"learning_rate": 6.503251625812907e-06, |
|
"loss": 0.8114, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.158798575349924, |
|
"eval_loss": 1.1146483421325684, |
|
"eval_runtime": 3493.8871, |
|
"eval_samples_per_second": 33.164, |
|
"eval_steps_per_second": 4.146, |
|
"step": 3500 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 10000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 100, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 1 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.069128403044139e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|