{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 902, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06, "grad_norm": 2.5633599758148193, "learning_rate": 5.494505494505495e-05, "loss": 3.1831, "step": 25 }, { "epoch": 0.11, "grad_norm": 2.523214340209961, "learning_rate": 0.0001098901098901099, "loss": 2.9698, "step": 50 }, { "epoch": 0.17, "grad_norm": 2.6384470462799072, "learning_rate": 0.00016483516483516484, "loss": 2.7894, "step": 75 }, { "epoch": 0.22, "grad_norm": 2.2789688110351562, "learning_rate": 0.00019778051787916153, "loss": 2.7214, "step": 100 }, { "epoch": 0.28, "grad_norm": 1.6615029573440552, "learning_rate": 0.00019161528976572133, "loss": 2.7224, "step": 125 }, { "epoch": 0.33, "grad_norm": 1.7445863485336304, "learning_rate": 0.00018545006165228113, "loss": 2.7081, "step": 150 }, { "epoch": 0.39, "grad_norm": 1.5059261322021484, "learning_rate": 0.00017928483353884094, "loss": 2.668, "step": 175 }, { "epoch": 0.44, "grad_norm": 1.2297817468643188, "learning_rate": 0.00017311960542540076, "loss": 2.6149, "step": 200 }, { "epoch": 0.5, "grad_norm": 1.3275938034057617, "learning_rate": 0.00016695437731196054, "loss": 2.591, "step": 225 }, { "epoch": 0.55, "grad_norm": 1.3491621017456055, "learning_rate": 0.00016078914919852034, "loss": 2.583, "step": 250 }, { "epoch": 0.61, "grad_norm": 0.9846614003181458, "learning_rate": 0.00015462392108508014, "loss": 2.5189, "step": 275 }, { "epoch": 0.67, "grad_norm": 0.9622399806976318, "learning_rate": 0.00014845869297163997, "loss": 2.549, "step": 300 }, { "epoch": 0.72, "grad_norm": 0.9879335761070251, "learning_rate": 0.00014229346485819977, "loss": 2.4825, "step": 325 }, { "epoch": 0.78, "grad_norm": 0.9367666840553284, "learning_rate": 0.00013612823674475957, "loss": 2.4431, "step": 350 }, { "epoch": 0.83, "grad_norm": 0.8422412276268005, "learning_rate": 0.00012996300863131935, "loss": 2.399, "step": 375 }, { "epoch": 0.89, "grad_norm": 1.3478554487228394, "learning_rate": 0.00012379778051787915, "loss": 2.4595, "step": 400 }, { "epoch": 0.94, "grad_norm": 0.7269034385681152, "learning_rate": 0.00011763255240443898, "loss": 2.4464, "step": 425 }, { "epoch": 1.0, "grad_norm": 0.9614003896713257, "learning_rate": 0.00011146732429099878, "loss": 2.474, "step": 450 }, { "epoch": 1.05, "grad_norm": 0.8424810767173767, "learning_rate": 0.00010530209617755857, "loss": 2.3071, "step": 475 }, { "epoch": 1.11, "grad_norm": 0.7542237639427185, "learning_rate": 9.913686806411838e-05, "loss": 2.2797, "step": 500 }, { "epoch": 1.16, "grad_norm": 1.0208224058151245, "learning_rate": 9.297163995067819e-05, "loss": 2.2255, "step": 525 }, { "epoch": 1.22, "grad_norm": 0.8640455007553101, "learning_rate": 8.680641183723797e-05, "loss": 2.3139, "step": 550 }, { "epoch": 1.27, "grad_norm": 0.8734548687934875, "learning_rate": 8.064118372379779e-05, "loss": 2.321, "step": 575 }, { "epoch": 1.33, "grad_norm": 0.7769907116889954, "learning_rate": 7.447595561035759e-05, "loss": 2.2748, "step": 600 }, { "epoch": 1.39, "grad_norm": 0.7717509865760803, "learning_rate": 6.831072749691739e-05, "loss": 2.2641, "step": 625 }, { "epoch": 1.44, "grad_norm": 0.8536700010299683, "learning_rate": 6.214549938347719e-05, "loss": 2.2658, "step": 650 }, { "epoch": 1.5, "grad_norm": 0.8191765546798706, "learning_rate": 5.5980271270037e-05, "loss": 2.2729, "step": 675 }, { "epoch": 1.55, "grad_norm": 1.0077537298202515, "learning_rate": 
4.9815043156596796e-05, "loss": 2.2726, "step": 700 }, { "epoch": 1.61, "grad_norm": 0.8226682543754578, "learning_rate": 4.36498150431566e-05, "loss": 2.2624, "step": 725 }, { "epoch": 1.66, "grad_norm": 0.8827477693557739, "learning_rate": 3.7484586929716406e-05, "loss": 2.1979, "step": 750 }, { "epoch": 1.72, "grad_norm": 0.8012568354606628, "learning_rate": 3.131935881627621e-05, "loss": 2.2563, "step": 775 }, { "epoch": 1.77, "grad_norm": 0.8210929036140442, "learning_rate": 2.5154130702836005e-05, "loss": 2.2923, "step": 800 }, { "epoch": 1.83, "grad_norm": 0.8134270906448364, "learning_rate": 1.8988902589395807e-05, "loss": 2.2719, "step": 825 }, { "epoch": 1.88, "grad_norm": 0.8563103079795837, "learning_rate": 1.282367447595561e-05, "loss": 2.2821, "step": 850 }, { "epoch": 1.94, "grad_norm": 0.8670592308044434, "learning_rate": 6.6584463625154135e-06, "loss": 2.2923, "step": 875 }, { "epoch": 2.0, "grad_norm": 0.7817544341087341, "learning_rate": 4.932182490752158e-07, "loss": 2.2142, "step": 900 } ], "logging_steps": 25, "max_steps": 902, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 5650178899968000.0, "train_batch_size": 12, "trial_name": null, "trial_params": null }