|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.8, |
|
"eval_steps": 500, |
|
"global_step": 1000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 46.553125, |
|
"epoch": 0.008, |
|
"grad_norm": 0.05134107172489166, |
|
"kl": 0.012939453125, |
|
"learning_rate": 9.95e-07, |
|
"loss": 0.0001, |
|
"reward": 2.703125, |
|
"reward_std": 0.11205126643180847, |
|
"rewards/accuracy_reward": 1.7125, |
|
"rewards/format_reward": 0.990625, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 49.34375, |
|
"epoch": 0.016, |
|
"grad_norm": 0.06966069340705872, |
|
"kl": 0.01898193359375, |
|
"learning_rate": 9.9e-07, |
|
"loss": 0.0002, |
|
"reward": 2.775, |
|
"reward_std": 0.05, |
|
"rewards/accuracy_reward": 1.78125, |
|
"rewards/format_reward": 0.99375, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 44.63125, |
|
"epoch": 0.024, |
|
"grad_norm": 5.11226749420166, |
|
"kl": 0.0212158203125, |
|
"learning_rate": 9.849999999999999e-07, |
|
"loss": 0.0002, |
|
"reward": 2.546875, |
|
"reward_std": 0.09568375647068024, |
|
"rewards/accuracy_reward": 1.55625, |
|
"rewards/format_reward": 0.990625, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 42.775, |
|
"epoch": 0.032, |
|
"grad_norm": 0.0820818841457367, |
|
"kl": 0.042626953125, |
|
"learning_rate": 9.8e-07, |
|
"loss": 0.0004, |
|
"reward": 2.775, |
|
"reward_std": 0.03943375647068024, |
|
"rewards/accuracy_reward": 1.775, |
|
"rewards/format_reward": 1.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 44.275, |
|
"epoch": 0.04, |
|
"grad_norm": 0.06030546873807907, |
|
"kl": 0.03828125, |
|
"learning_rate": 9.75e-07, |
|
"loss": 0.0004, |
|
"reward": 2.74375, |
|
"reward_std": 0.026933756470680238, |
|
"rewards/accuracy_reward": 1.74375, |
|
"rewards/format_reward": 1.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 50.990625, |
|
"epoch": 0.048, |
|
"grad_norm": 0.10005596280097961, |
|
"kl": 0.03060302734375, |
|
"learning_rate": 9.7e-07, |
|
"loss": 0.0003, |
|
"reward": 2.60625, |
|
"reward_std": 0.10193375647068023, |
|
"rewards/accuracy_reward": 1.60625, |
|
"rewards/format_reward": 1.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 54.375, |
|
"epoch": 0.056, |
|
"grad_norm": 4.453707695007324, |
|
"kl": 0.0556640625, |
|
"learning_rate": 9.649999999999999e-07, |
|
"loss": 0.0006, |
|
"reward": 2.590625, |
|
"reward_std": 0.08318375647068024, |
|
"rewards/accuracy_reward": 1.59375, |
|
"rewards/format_reward": 0.996875, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 48.815625, |
|
"epoch": 0.064, |
|
"grad_norm": 2.5629329681396484, |
|
"kl": 0.040283203125, |
|
"learning_rate": 9.6e-07, |
|
"loss": 0.0004, |
|
"reward": 2.765625, |
|
"reward_std": 0.058183756470680234, |
|
"rewards/accuracy_reward": 1.76875, |
|
"rewards/format_reward": 0.996875, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 47.53125, |
|
"epoch": 0.072, |
|
"grad_norm": 0.08292120695114136, |
|
"kl": 0.0712646484375, |
|
"learning_rate": 9.55e-07, |
|
"loss": 0.0007, |
|
"reward": 2.825, |
|
"reward_std": 0.05, |
|
"rewards/accuracy_reward": 1.825, |
|
"rewards/format_reward": 1.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 46.703125, |
|
"epoch": 0.08, |
|
"grad_norm": 2.7465286254882812, |
|
"kl": 0.05367431640625, |
|
"learning_rate": 9.499999999999999e-07, |
|
"loss": 0.0005, |
|
"reward": 2.71875, |
|
"reward_std": 0.07693375647068024, |
|
"rewards/accuracy_reward": 1.71875, |
|
"rewards/format_reward": 1.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 46.225, |
|
"epoch": 0.088, |
|
"grad_norm": 2.1839213371276855, |
|
"kl": 0.0655517578125, |
|
"learning_rate": 9.45e-07, |
|
"loss": 0.0007, |
|
"reward": 2.609375, |
|
"reward_std": 0.03125, |
|
"rewards/accuracy_reward": 1.61875, |
|
"rewards/format_reward": 0.990625, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 44.096875, |
|
"epoch": 0.096, |
|
"grad_norm": 0.07181887328624725, |
|
"kl": 0.06865234375, |
|
"learning_rate": 9.399999999999999e-07, |
|
"loss": 0.0007, |
|
"reward": 2.71875, |
|
"reward_std": 0.0125, |
|
"rewards/accuracy_reward": 1.71875, |
|
"rewards/format_reward": 1.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 44.590625, |
|
"epoch": 0.104, |
|
"grad_norm": 0.09902142733335495, |
|
"kl": 0.0936767578125, |
|
"learning_rate": 9.35e-07, |
|
"loss": 0.0009, |
|
"reward": 2.56875, |
|
"reward_std": 0.0625, |
|
"rewards/accuracy_reward": 1.575, |
|
"rewards/format_reward": 0.99375, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 43.81875, |
|
"epoch": 0.112, |
|
"grad_norm": 2.340815305709839, |
|
"kl": 0.066015625, |
|
"learning_rate": 9.3e-07, |
|
"loss": 0.0007, |
|
"reward": 2.75, |
|
"reward_std": 0.025, |
|
"rewards/accuracy_reward": 1.75, |
|
"rewards/format_reward": 1.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 49.06875, |
|
"epoch": 0.12, |
|
"grad_norm": 2.58245849609375, |
|
"kl": 0.0600341796875, |
|
"learning_rate": 9.25e-07, |
|
"loss": 0.0006, |
|
"reward": 2.7125, |
|
"reward_std": 0.125, |
|
"rewards/accuracy_reward": 1.7125, |
|
"rewards/format_reward": 1.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 52.84375, |
|
"epoch": 0.128, |
|
"grad_norm": 0.06839890778064728, |
|
"kl": 0.0785400390625, |
|
"learning_rate": 9.2e-07, |
|
"loss": 0.0008, |
|
"reward": 2.7, |
|
"reward_std": 0.03943375647068024, |
|
"rewards/accuracy_reward": 1.7, |
|
"rewards/format_reward": 1.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 47.7, |
|
"epoch": 0.136, |
|
"grad_norm": 0.11428700387477875, |
|
"kl": 0.06865234375, |
|
"learning_rate": 9.15e-07, |
|
"loss": 0.0007, |
|
"reward": 2.75, |
|
"reward_std": 0.025, |
|
"rewards/accuracy_reward": 1.75, |
|
"rewards/format_reward": 1.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 43.478125, |
|
"epoch": 0.144, |
|
"grad_norm": 2.188392400741577, |
|
"kl": 0.062451171875, |
|
"learning_rate": 9.1e-07, |
|
"loss": 0.0006, |
|
"reward": 2.615625, |
|
"reward_std": 0.06875, |
|
"rewards/accuracy_reward": 1.61875, |
|
"rewards/format_reward": 0.996875, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 42.540625, |
|
"epoch": 0.152, |
|
"grad_norm": 3.399991512298584, |
|
"kl": 0.076953125, |
|
"learning_rate": 9.05e-07, |
|
"loss": 0.0008, |
|
"reward": 2.64375, |
|
"reward_std": 0.09136751294136047, |
|
"rewards/accuracy_reward": 1.64375, |
|
"rewards/format_reward": 1.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 50.21875, |
|
"epoch": 0.16, |
|
"grad_norm": 0.10214658826589584, |
|
"kl": 0.09365234375, |
|
"learning_rate": 9e-07, |
|
"loss": 0.0009, |
|
"reward": 2.784375, |
|
"reward_std": 0.05625, |
|
"rewards/accuracy_reward": 1.7875, |
|
"rewards/format_reward": 0.996875, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 54.35, |
|
"epoch": 0.168, |
|
"grad_norm": 0.08639144152402878, |
|
"kl": 0.1749267578125, |
|
"learning_rate": 8.95e-07, |
|
"loss": 0.0017, |
|
"reward": 2.7875, |
|
"reward_std": 0.014433756470680237, |
|
"rewards/accuracy_reward": 1.7875, |
|
"rewards/format_reward": 1.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 56.29375, |
|
"epoch": 0.176, |
|
"grad_norm": 0.06954076141119003, |
|
"kl": 0.119091796875, |
|
"learning_rate": 8.9e-07, |
|
"loss": 0.0012, |
|
"reward": 2.75, |
|
"reward_std": 0.025, |
|
"rewards/accuracy_reward": 1.75625, |
|
"rewards/format_reward": 0.99375, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 48.8875, |
|
"epoch": 0.184, |
|
"grad_norm": 0.06490013003349304, |
|
"kl": 0.12080078125, |
|
"learning_rate": 8.85e-07, |
|
"loss": 0.0012, |
|
"reward": 2.65, |
|
"reward_std": 0.07886751294136048, |
|
"rewards/accuracy_reward": 1.65, |
|
"rewards/format_reward": 1.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 41.115625, |
|
"epoch": 0.192, |
|
"grad_norm": 0.12679292261600494, |
|
"kl": 0.12470703125, |
|
"learning_rate": 8.799999999999999e-07, |
|
"loss": 0.0012, |
|
"reward": 2.8125, |
|
"reward_std": 0.07886751294136048, |
|
"rewards/accuracy_reward": 1.81875, |
|
"rewards/format_reward": 0.99375, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 40.4125, |
|
"epoch": 0.2, |
|
"grad_norm": 0.11438746750354767, |
|
"kl": 30.11142578125, |
|
"learning_rate": 8.75e-07, |
|
"loss": 0.3012, |
|
"reward": 2.725, |
|
"reward_std": 0.05, |
|
"rewards/accuracy_reward": 1.73125, |
|
"rewards/format_reward": 0.99375, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 46.790625, |
|
"epoch": 0.208, |
|
"grad_norm": 2.282456159591675, |
|
"kl": 0.10205078125, |
|
"learning_rate": 8.699999999999999e-07, |
|
"loss": 0.001, |
|
"reward": 2.578125, |
|
"reward_std": 0.06875, |
|
"rewards/accuracy_reward": 1.58125, |
|
"rewards/format_reward": 0.996875, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 52.253125, |
|
"epoch": 0.216, |
|
"grad_norm": 1.9098315238952637, |
|
"kl": 0.105810546875, |
|
"learning_rate": 8.65e-07, |
|
"loss": 0.0011, |
|
"reward": 2.8, |
|
"reward_std": 0.07886751294136048, |
|
"rewards/accuracy_reward": 1.8, |
|
"rewards/format_reward": 1.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 49.828125, |
|
"epoch": 0.224, |
|
"grad_norm": 0.058336157351732254, |
|
"kl": 0.0714599609375, |
|
"learning_rate": 8.599999999999999e-07, |
|
"loss": 0.0007, |
|
"reward": 2.73125, |
|
"reward_std": 0.0375, |
|
"rewards/accuracy_reward": 1.73125, |
|
"rewards/format_reward": 1.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 47.14375, |
|
"epoch": 0.232, |
|
"grad_norm": 0.07711385935544968, |
|
"kl": 0.08037109375, |
|
"learning_rate": 8.55e-07, |
|
"loss": 0.0008, |
|
"reward": 2.875, |
|
"reward_std": 0.03943375647068024, |
|
"rewards/accuracy_reward": 1.875, |
|
"rewards/format_reward": 1.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 46.759375, |
|
"epoch": 0.24, |
|
"grad_norm": 0.059466563165187836, |
|
"kl": 0.079248046875, |
|
"learning_rate": 8.499999999999999e-07, |
|
"loss": 0.0008, |
|
"reward": 2.70625, |
|
"reward_std": 0.051933756470680235, |
|
"rewards/accuracy_reward": 1.70625, |
|
"rewards/format_reward": 1.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 48.540625, |
|
"epoch": 0.248, |
|
"grad_norm": 3.2264294624328613, |
|
"kl": 0.0768310546875, |
|
"learning_rate": 8.45e-07, |
|
"loss": 0.0008, |
|
"reward": 2.7375, |
|
"reward_std": 0.075, |
|
"rewards/accuracy_reward": 1.7375, |
|
"rewards/format_reward": 1.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 46.85, |
|
"epoch": 0.256, |
|
"grad_norm": 0.08373435586690903, |
|
"kl": 0.088037109375, |
|
"learning_rate": 8.399999999999999e-07, |
|
"loss": 0.0009, |
|
"reward": 2.728125, |
|
"reward_std": 0.08318375647068024, |
|
"rewards/accuracy_reward": 1.73125, |
|
"rewards/format_reward": 0.996875, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 45.0125, |
|
"epoch": 0.264, |
|
"grad_norm": 0.08248328417539597, |
|
"kl": 0.084375, |
|
"learning_rate": 8.349999999999999e-07, |
|
"loss": 0.0008, |
|
"reward": 2.684375, |
|
"reward_std": 0.04568375647068024, |
|
"rewards/accuracy_reward": 1.6875, |
|
"rewards/format_reward": 0.996875, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 47.1, |
|
"epoch": 0.272, |
|
"grad_norm": 0.08357389271259308, |
|
"kl": 0.07880859375, |
|
"learning_rate": 8.299999999999999e-07, |
|
"loss": 0.0008, |
|
"reward": 2.628125, |
|
"reward_std": 0.03318375647068024, |
|
"rewards/accuracy_reward": 1.63125, |
|
"rewards/format_reward": 0.996875, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 48.95625, |
|
"epoch": 0.28, |
|
"grad_norm": 1.7901896238327026, |
|
"kl": 0.084033203125, |
|
"learning_rate": 8.249999999999999e-07, |
|
"loss": 0.0008, |
|
"reward": 2.609375, |
|
"reward_std": 0.03125, |
|
"rewards/accuracy_reward": 1.6125, |
|
"rewards/format_reward": 0.996875, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 46.15, |
|
"epoch": 0.288, |
|
"grad_norm": 0.07559721171855927, |
|
"kl": 0.14404296875, |
|
"learning_rate": 8.199999999999999e-07, |
|
"loss": 0.0014, |
|
"reward": 2.778125, |
|
"reward_std": 0.04375, |
|
"rewards/accuracy_reward": 1.78125, |
|
"rewards/format_reward": 0.996875, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 43.8375, |
|
"epoch": 0.296, |
|
"grad_norm": 3.8727450370788574, |
|
"kl": 0.109521484375, |
|
"learning_rate": 8.149999999999999e-07, |
|
"loss": 0.0011, |
|
"reward": 2.83125, |
|
"reward_std": 0.0375, |
|
"rewards/accuracy_reward": 1.83125, |
|
"rewards/format_reward": 1.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 45.9625, |
|
"epoch": 0.304, |
|
"grad_norm": 0.05233932286500931, |
|
"kl": 0.0930908203125, |
|
"learning_rate": 8.1e-07, |
|
"loss": 0.0009, |
|
"reward": 2.796875, |
|
"reward_std": 0.00625, |
|
"rewards/accuracy_reward": 1.8, |
|
"rewards/format_reward": 0.996875, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 49.55, |
|
"epoch": 0.312, |
|
"grad_norm": 4.457919120788574, |
|
"kl": 0.0723876953125, |
|
"learning_rate": 8.05e-07, |
|
"loss": 0.0007, |
|
"reward": 2.75, |
|
"reward_std": 0.053867512941360475, |
|
"rewards/accuracy_reward": 1.75, |
|
"rewards/format_reward": 1.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"completion_length": 50.909375, |
|
"epoch": 0.32, |
|
"grad_norm": 0.050397127866744995, |
|
"kl": 0.08388671875, |
|
"learning_rate": 8e-07, |
|
"loss": 0.0008, |
|
"reward": 2.7625, |
|
"reward_std": 0.025, |
|
"rewards/accuracy_reward": 1.7625, |
|
"rewards/format_reward": 1.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"completion_length": 49.165625, |
|
"epoch": 0.328, |
|
"grad_norm": 0.1388678401708603, |
|
"kl": 0.084033203125, |
|
"learning_rate": 7.95e-07, |
|
"loss": 0.0008, |
|
"reward": 2.6875, |
|
"reward_std": 0.014433756470680237, |
|
"rewards/accuracy_reward": 1.6875, |
|
"rewards/format_reward": 1.0, |
|
"step": 410 |
|
}, |
|
{ |
|
"completion_length": 48.190625, |
|
"epoch": 0.336, |
|
"grad_norm": 2.034395933151245, |
|
"kl": 0.078125, |
|
"learning_rate": 7.9e-07, |
|
"loss": 0.0008, |
|
"reward": 2.76875, |
|
"reward_std": 0.0375, |
|
"rewards/accuracy_reward": 1.76875, |
|
"rewards/format_reward": 1.0, |
|
"step": 420 |
|
}, |
|
{ |
|
"completion_length": 49.45, |
|
"epoch": 0.344, |
|
"grad_norm": 2.2621846199035645, |
|
"kl": 0.075048828125, |
|
"learning_rate": 7.85e-07, |
|
"loss": 0.0008, |
|
"reward": 2.634375, |
|
"reward_std": 0.03125, |
|
"rewards/accuracy_reward": 1.6375, |
|
"rewards/format_reward": 0.996875, |
|
"step": 430 |
|
}, |
|
{ |
|
"completion_length": 52.03125, |
|
"epoch": 0.352, |
|
"grad_norm": 2.9660024642944336, |
|
"kl": 0.0776123046875, |
|
"learning_rate": 7.799999999999999e-07, |
|
"loss": 0.0008, |
|
"reward": 2.7625, |
|
"reward_std": 0.03943375647068024, |
|
"rewards/accuracy_reward": 1.7625, |
|
"rewards/format_reward": 1.0, |
|
"step": 440 |
|
}, |
|
{ |
|
"completion_length": 52.496875, |
|
"epoch": 0.36, |
|
"grad_norm": 0.040182050317525864, |
|
"kl": 0.0726806640625, |
|
"learning_rate": 7.75e-07, |
|
"loss": 0.0007, |
|
"reward": 2.6875, |
|
"reward_std": 0.025, |
|
"rewards/accuracy_reward": 1.6875, |
|
"rewards/format_reward": 1.0, |
|
"step": 450 |
|
}, |
|
{ |
|
"completion_length": 51.725, |
|
"epoch": 0.368, |
|
"grad_norm": 0.06841447949409485, |
|
"kl": 0.0802001953125, |
|
"learning_rate": 7.699999999999999e-07, |
|
"loss": 0.0008, |
|
"reward": 2.8, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 1.8, |
|
"rewards/format_reward": 1.0, |
|
"step": 460 |
|
}, |
|
{ |
|
"completion_length": 48.14375, |
|
"epoch": 0.376, |
|
"grad_norm": 0.04733005911111832, |
|
"kl": 0.0659912109375, |
|
"learning_rate": 7.65e-07, |
|
"loss": 0.0007, |
|
"reward": 2.61875, |
|
"reward_std": 0.0125, |
|
"rewards/accuracy_reward": 1.61875, |
|
"rewards/format_reward": 1.0, |
|
"step": 470 |
|
}, |
|
{ |
|
"completion_length": 46.89375, |
|
"epoch": 0.384, |
|
"grad_norm": 2.7484917640686035, |
|
"kl": 0.0697998046875, |
|
"learning_rate": 7.599999999999999e-07, |
|
"loss": 0.0007, |
|
"reward": 2.74375, |
|
"reward_std": 0.09136751294136047, |
|
"rewards/accuracy_reward": 1.74375, |
|
"rewards/format_reward": 1.0, |
|
"step": 480 |
|
}, |
|
{ |
|
"completion_length": 48.48125, |
|
"epoch": 0.392, |
|
"grad_norm": 1.7968782186508179, |
|
"kl": 0.0580078125, |
|
"learning_rate": 7.55e-07, |
|
"loss": 0.0006, |
|
"reward": 2.7125, |
|
"reward_std": 0.025, |
|
"rewards/accuracy_reward": 1.7125, |
|
"rewards/format_reward": 1.0, |
|
"step": 490 |
|
}, |
|
{ |
|
"completion_length": 50.034375, |
|
"epoch": 0.4, |
|
"grad_norm": 0.08426347374916077, |
|
"kl": 0.077099609375, |
|
"learning_rate": 7.5e-07, |
|
"loss": 0.0008, |
|
"reward": 2.68125, |
|
"reward_std": 0.04136751294136047, |
|
"rewards/accuracy_reward": 1.68125, |
|
"rewards/format_reward": 1.0, |
|
"step": 500 |
|
}, |
|
{ |
|
"completion_length": 51.378125, |
|
"epoch": 0.408, |
|
"grad_norm": 0.040815118700265884, |
|
"kl": 0.06416015625, |
|
"learning_rate": 7.45e-07, |
|
"loss": 0.0006, |
|
"reward": 2.73125, |
|
"reward_std": 0.026933756470680238, |
|
"rewards/accuracy_reward": 1.73125, |
|
"rewards/format_reward": 1.0, |
|
"step": 510 |
|
}, |
|
{ |
|
"completion_length": 49.878125, |
|
"epoch": 0.416, |
|
"grad_norm": 0.06027600169181824, |
|
"kl": 0.0675537109375, |
|
"learning_rate": 7.4e-07, |
|
"loss": 0.0007, |
|
"reward": 2.671875, |
|
"reward_std": 0.00625, |
|
"rewards/accuracy_reward": 1.675, |
|
"rewards/format_reward": 0.996875, |
|
"step": 520 |
|
}, |
|
{ |
|
"completion_length": 47.921875, |
|
"epoch": 0.424, |
|
"grad_norm": 0.06604389101266861, |
|
"kl": 0.07177734375, |
|
"learning_rate": 7.35e-07, |
|
"loss": 0.0007, |
|
"reward": 2.675, |
|
"reward_std": 0.08943375647068023, |
|
"rewards/accuracy_reward": 1.68125, |
|
"rewards/format_reward": 0.99375, |
|
"step": 530 |
|
}, |
|
{ |
|
"completion_length": 41.890625, |
|
"epoch": 0.432, |
|
"grad_norm": 2.579275608062744, |
|
"kl": 0.080859375, |
|
"learning_rate": 7.3e-07, |
|
"loss": 0.0008, |
|
"reward": 2.75, |
|
"reward_std": 0.03943375647068024, |
|
"rewards/accuracy_reward": 1.75, |
|
"rewards/format_reward": 1.0, |
|
"step": 540 |
|
}, |
|
{ |
|
"completion_length": 44.046875, |
|
"epoch": 0.44, |
|
"grad_norm": 0.04179125651717186, |
|
"kl": 0.076025390625, |
|
"learning_rate": 7.249999999999999e-07, |
|
"loss": 0.0008, |
|
"reward": 2.5375, |
|
"reward_std": 0.025, |
|
"rewards/accuracy_reward": 1.5375, |
|
"rewards/format_reward": 1.0, |
|
"step": 550 |
|
}, |
|
{ |
|
"completion_length": 46.725, |
|
"epoch": 0.448, |
|
"grad_norm": 0.04865502566099167, |
|
"kl": 0.075830078125, |
|
"learning_rate": 7.2e-07, |
|
"loss": 0.0008, |
|
"reward": 2.66875, |
|
"reward_std": 0.0125, |
|
"rewards/accuracy_reward": 1.66875, |
|
"rewards/format_reward": 1.0, |
|
"step": 560 |
|
}, |
|
{ |
|
"completion_length": 48.0875, |
|
"epoch": 0.456, |
|
"grad_norm": 0.1781499981880188, |
|
"kl": 92.47451171875, |
|
"learning_rate": 7.149999999999999e-07, |
|
"loss": 0.9243, |
|
"reward": 2.8, |
|
"reward_std": 0.025, |
|
"rewards/accuracy_reward": 1.8, |
|
"rewards/format_reward": 1.0, |
|
"step": 570 |
|
}, |
|
{ |
|
"completion_length": 49.703125, |
|
"epoch": 0.464, |
|
"grad_norm": 0.05255131423473358, |
|
"kl": 0.0656982421875, |
|
"learning_rate": 7.1e-07, |
|
"loss": 0.0007, |
|
"reward": 2.6625, |
|
"reward_std": 0.025, |
|
"rewards/accuracy_reward": 1.6625, |
|
"rewards/format_reward": 1.0, |
|
"step": 580 |
|
}, |
|
{ |
|
"completion_length": 52.06875, |
|
"epoch": 0.472, |
|
"grad_norm": 0.1266418695449829, |
|
"kl": 0.0781005859375, |
|
"learning_rate": 7.049999999999999e-07, |
|
"loss": 0.0008, |
|
"reward": 2.75, |
|
"reward_std": 0.025, |
|
"rewards/accuracy_reward": 1.75, |
|
"rewards/format_reward": 1.0, |
|
"step": 590 |
|
}, |
|
{ |
|
"completion_length": 53.475, |
|
"epoch": 0.48, |
|
"grad_norm": 0.07561592757701874, |
|
"kl": 0.0699951171875, |
|
"learning_rate": 7e-07, |
|
"loss": 0.0007, |
|
"reward": 2.6875, |
|
"reward_std": 0.053867512941360475, |
|
"rewards/accuracy_reward": 1.6875, |
|
"rewards/format_reward": 1.0, |
|
"step": 600 |
|
}, |
|
{ |
|
"completion_length": 52.0625, |
|
"epoch": 0.488, |
|
"grad_norm": 0.04883831739425659, |
|
"kl": 0.0799560546875, |
|
"learning_rate": 6.949999999999999e-07, |
|
"loss": 0.0008, |
|
"reward": 2.65625, |
|
"reward_std": 0.0125, |
|
"rewards/accuracy_reward": 1.65625, |
|
"rewards/format_reward": 1.0, |
|
"step": 610 |
|
}, |
|
{ |
|
"completion_length": 49.54375, |
|
"epoch": 0.496, |
|
"grad_norm": 2.3243064880371094, |
|
"kl": 0.0752685546875, |
|
"learning_rate": 6.9e-07, |
|
"loss": 0.0008, |
|
"reward": 2.815625, |
|
"reward_std": 0.058183756470680234, |
|
"rewards/accuracy_reward": 1.81875, |
|
"rewards/format_reward": 0.996875, |
|
"step": 620 |
|
}, |
|
{ |
|
"completion_length": 48.690625, |
|
"epoch": 0.504, |
|
"grad_norm": 0.06750122457742691, |
|
"kl": 0.06513671875, |
|
"learning_rate": 6.85e-07, |
|
"loss": 0.0007, |
|
"reward": 2.84375, |
|
"reward_std": 0.0375, |
|
"rewards/accuracy_reward": 1.84375, |
|
"rewards/format_reward": 1.0, |
|
"step": 630 |
|
}, |
|
{ |
|
"completion_length": 49.271875, |
|
"epoch": 0.512, |
|
"grad_norm": 0.056099992245435715, |
|
"kl": 0.0666259765625, |
|
"learning_rate": 6.800000000000001e-07, |
|
"loss": 0.0007, |
|
"reward": 2.69375, |
|
"reward_std": 0.0125, |
|
"rewards/accuracy_reward": 1.69375, |
|
"rewards/format_reward": 1.0, |
|
"step": 640 |
|
}, |
|
{ |
|
"completion_length": 46.4375, |
|
"epoch": 0.52, |
|
"grad_norm": 0.0455087348818779, |
|
"kl": 0.0546630859375, |
|
"learning_rate": 6.75e-07, |
|
"loss": 0.0005, |
|
"reward": 2.75625, |
|
"reward_std": 0.0125, |
|
"rewards/accuracy_reward": 1.75625, |
|
"rewards/format_reward": 1.0, |
|
"step": 650 |
|
}, |
|
{ |
|
"completion_length": 46.496875, |
|
"epoch": 0.528, |
|
"grad_norm": 0.05418640747666359, |
|
"kl": 0.0645263671875, |
|
"learning_rate": 6.7e-07, |
|
"loss": 0.0006, |
|
"reward": 2.6875, |
|
"reward_std": 0.014433756470680237, |
|
"rewards/accuracy_reward": 1.6875, |
|
"rewards/format_reward": 1.0, |
|
"step": 660 |
|
}, |
|
{ |
|
"completion_length": 46.328125, |
|
"epoch": 0.536, |
|
"grad_norm": 4.0458455085754395, |
|
"kl": 0.081103515625, |
|
"learning_rate": 6.65e-07, |
|
"loss": 0.0008, |
|
"reward": 2.65625, |
|
"reward_std": 0.08080126941204072, |
|
"rewards/accuracy_reward": 1.65625, |
|
"rewards/format_reward": 1.0, |
|
"step": 670 |
|
}, |
|
{ |
|
"completion_length": 48.91875, |
|
"epoch": 0.544, |
|
"grad_norm": 0.04970540851354599, |
|
"kl": 0.0717041015625, |
|
"learning_rate": 6.6e-07, |
|
"loss": 0.0007, |
|
"reward": 2.7625, |
|
"reward_std": 0.04330126941204071, |
|
"rewards/accuracy_reward": 1.7625, |
|
"rewards/format_reward": 1.0, |
|
"step": 680 |
|
}, |
|
{ |
|
"completion_length": 49.01875, |
|
"epoch": 0.552, |
|
"grad_norm": 0.1746923178434372, |
|
"kl": 0.073779296875, |
|
"learning_rate": 6.55e-07, |
|
"loss": 0.0007, |
|
"reward": 2.75, |
|
"reward_std": 0.025, |
|
"rewards/accuracy_reward": 1.75, |
|
"rewards/format_reward": 1.0, |
|
"step": 690 |
|
}, |
|
{ |
|
"completion_length": 48.3125, |
|
"epoch": 0.56, |
|
"grad_norm": 0.051023293286561966, |
|
"kl": 0.06783447265625, |
|
"learning_rate": 6.5e-07, |
|
"loss": 0.0007, |
|
"reward": 2.7625, |
|
"reward_std": 0.025, |
|
"rewards/accuracy_reward": 1.7625, |
|
"rewards/format_reward": 1.0, |
|
"step": 700 |
|
}, |
|
{ |
|
"completion_length": 49.11875, |
|
"epoch": 0.568, |
|
"grad_norm": 0.07166194915771484, |
|
"kl": 0.0619384765625, |
|
"learning_rate": 6.45e-07, |
|
"loss": 0.0006, |
|
"reward": 2.7375, |
|
"reward_std": 0.014433756470680237, |
|
"rewards/accuracy_reward": 1.7375, |
|
"rewards/format_reward": 1.0, |
|
"step": 710 |
|
}, |
|
{ |
|
"completion_length": 51.103125, |
|
"epoch": 0.576, |
|
"grad_norm": 0.08520376682281494, |
|
"kl": 0.0830078125, |
|
"learning_rate": 6.4e-07, |
|
"loss": 0.0008, |
|
"reward": 2.7375, |
|
"reward_std": 0.014433756470680237, |
|
"rewards/accuracy_reward": 1.7375, |
|
"rewards/format_reward": 1.0, |
|
"step": 720 |
|
}, |
|
{ |
|
"completion_length": 49.615625, |
|
"epoch": 0.584, |
|
"grad_norm": 0.10399647802114487, |
|
"kl": 0.0688232421875, |
|
"learning_rate": 6.35e-07, |
|
"loss": 0.0007, |
|
"reward": 2.69375, |
|
"reward_std": 0.0125, |
|
"rewards/accuracy_reward": 1.69375, |
|
"rewards/format_reward": 1.0, |
|
"step": 730 |
|
}, |
|
{ |
|
"completion_length": 50.596875, |
|
"epoch": 0.592, |
|
"grad_norm": 0.06369677186012268, |
|
"kl": 0.087890625, |
|
"learning_rate": 6.3e-07, |
|
"loss": 0.0009, |
|
"reward": 2.61875, |
|
"reward_std": 0.0375, |
|
"rewards/accuracy_reward": 1.61875, |
|
"rewards/format_reward": 1.0, |
|
"step": 740 |
|
}, |
|
{ |
|
"completion_length": 50.56875, |
|
"epoch": 0.6, |
|
"grad_norm": 0.07198835164308548, |
|
"kl": 0.10087890625, |
|
"learning_rate": 6.249999999999999e-07, |
|
"loss": 0.001, |
|
"reward": 2.625, |
|
"reward_std": 0.06443375647068024, |
|
"rewards/accuracy_reward": 1.625, |
|
"rewards/format_reward": 1.0, |
|
"step": 750 |
|
}, |
|
{ |
|
"completion_length": 50.953125, |
|
"epoch": 0.608, |
|
"grad_norm": 0.04980659857392311, |
|
"kl": 0.101806640625, |
|
"learning_rate": 6.2e-07, |
|
"loss": 0.001, |
|
"reward": 2.721875, |
|
"reward_std": 0.03125, |
|
"rewards/accuracy_reward": 1.725, |
|
"rewards/format_reward": 0.996875, |
|
"step": 760 |
|
}, |
|
{ |
|
"completion_length": 46.609375, |
|
"epoch": 0.616, |
|
"grad_norm": 2.673631191253662, |
|
"kl": 0.0730224609375, |
|
"learning_rate": 6.149999999999999e-07, |
|
"loss": 0.0007, |
|
"reward": 2.6875, |
|
"reward_std": 0.03943375647068024, |
|
"rewards/accuracy_reward": 1.6875, |
|
"rewards/format_reward": 1.0, |
|
"step": 770 |
|
}, |
|
{ |
|
"completion_length": 46.16875, |
|
"epoch": 0.624, |
|
"grad_norm": 0.07191024720668793, |
|
"kl": 0.07197265625, |
|
"learning_rate": 6.1e-07, |
|
"loss": 0.0007, |
|
"reward": 2.6125, |
|
"reward_std": 0.03943375647068024, |
|
"rewards/accuracy_reward": 1.6125, |
|
"rewards/format_reward": 1.0, |
|
"step": 780 |
|
}, |
|
{ |
|
"completion_length": 47.346875, |
|
"epoch": 0.632, |
|
"grad_norm": 0.31487828493118286, |
|
"kl": 0.0890625, |
|
"learning_rate": 6.049999999999999e-07, |
|
"loss": 0.0009, |
|
"reward": 2.7, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 1.7, |
|
"rewards/format_reward": 1.0, |
|
"step": 790 |
|
}, |
|
{ |
|
"completion_length": 48.5125, |
|
"epoch": 0.64, |
|
"grad_norm": 0.04281134530901909, |
|
"kl": 0.0651611328125, |
|
"learning_rate": 6e-07, |
|
"loss": 0.0007, |
|
"reward": 2.65, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 1.65, |
|
"rewards/format_reward": 1.0, |
|
"step": 800 |
|
}, |
|
{ |
|
"completion_length": 47.9625, |
|
"epoch": 0.648, |
|
"grad_norm": 1.7782899141311646, |
|
"kl": 0.0711669921875, |
|
"learning_rate": 5.949999999999999e-07, |
|
"loss": 0.0007, |
|
"reward": 2.634375, |
|
"reward_std": 0.04568375647068024, |
|
"rewards/accuracy_reward": 1.6375, |
|
"rewards/format_reward": 0.996875, |
|
"step": 810 |
|
}, |
|
{ |
|
"completion_length": 47.7, |
|
"epoch": 0.656, |
|
"grad_norm": 0.9939271211624146, |
|
"kl": 0.07099609375, |
|
"learning_rate": 5.9e-07, |
|
"loss": 0.0007, |
|
"reward": 2.6625, |
|
"reward_std": 0.025, |
|
"rewards/accuracy_reward": 1.6625, |
|
"rewards/format_reward": 1.0, |
|
"step": 820 |
|
}, |
|
{ |
|
"completion_length": 45.790625, |
|
"epoch": 0.664, |
|
"grad_norm": 0.05890406668186188, |
|
"kl": 0.0596923828125, |
|
"learning_rate": 5.849999999999999e-07, |
|
"loss": 0.0006, |
|
"reward": 2.696875, |
|
"reward_std": 0.00625, |
|
"rewards/accuracy_reward": 1.7, |
|
"rewards/format_reward": 0.996875, |
|
"step": 830 |
|
}, |
|
{ |
|
"completion_length": 46.675, |
|
"epoch": 0.672, |
|
"grad_norm": 0.062360286712646484, |
|
"kl": 0.071240234375, |
|
"learning_rate": 5.8e-07, |
|
"loss": 0.0007, |
|
"reward": 2.5875, |
|
"reward_std": 0.025, |
|
"rewards/accuracy_reward": 1.5875, |
|
"rewards/format_reward": 1.0, |
|
"step": 840 |
|
}, |
|
{ |
|
"completion_length": 47.621875, |
|
"epoch": 0.68, |
|
"grad_norm": 2.2732224464416504, |
|
"kl": 1.6703369140625, |
|
"learning_rate": 5.749999999999999e-07, |
|
"loss": 0.0167, |
|
"reward": 2.64375, |
|
"reward_std": 0.04136751294136047, |
|
"rewards/accuracy_reward": 1.64375, |
|
"rewards/format_reward": 1.0, |
|
"step": 850 |
|
}, |
|
{ |
|
"completion_length": 50.35, |
|
"epoch": 0.688, |
|
"grad_norm": 2.1026487350463867, |
|
"kl": 0.09111328125, |
|
"learning_rate": 5.699999999999999e-07, |
|
"loss": 0.0009, |
|
"reward": 2.6625, |
|
"reward_std": 0.053867512941360475, |
|
"rewards/accuracy_reward": 1.6625, |
|
"rewards/format_reward": 1.0, |
|
"step": 860 |
|
}, |
|
{ |
|
"completion_length": 52.321875, |
|
"epoch": 0.696, |
|
"grad_norm": 3.0173561573028564, |
|
"kl": 321.521728515625, |
|
"learning_rate": 5.649999999999999e-07, |
|
"loss": 3.2171, |
|
"reward": 2.46875, |
|
"reward_std": 0.04136751294136047, |
|
"rewards/accuracy_reward": 1.46875, |
|
"rewards/format_reward": 1.0, |
|
"step": 870 |
|
}, |
|
{ |
|
"completion_length": 49.7625, |
|
"epoch": 0.704, |
|
"grad_norm": 0.06468257308006287, |
|
"kl": 0.39306640625, |
|
"learning_rate": 5.6e-07, |
|
"loss": 0.0039, |
|
"reward": 2.7125, |
|
"reward_std": 0.06443375647068024, |
|
"rewards/accuracy_reward": 1.7125, |
|
"rewards/format_reward": 1.0, |
|
"step": 880 |
|
}, |
|
{ |
|
"completion_length": 48.240625, |
|
"epoch": 0.712, |
|
"grad_norm": 0.07906866073608398, |
|
"kl": 0.12939453125, |
|
"learning_rate": 5.55e-07, |
|
"loss": 0.0013, |
|
"reward": 2.68125, |
|
"reward_std": 0.026933756470680238, |
|
"rewards/accuracy_reward": 1.68125, |
|
"rewards/format_reward": 1.0, |
|
"step": 890 |
|
}, |
|
{ |
|
"completion_length": 49.03125, |
|
"epoch": 0.72, |
|
"grad_norm": 0.07313551008701324, |
|
"kl": 0.18505859375, |
|
"learning_rate": 5.5e-07, |
|
"loss": 0.0019, |
|
"reward": 2.69375, |
|
"reward_std": 0.026933756470680238, |
|
"rewards/accuracy_reward": 1.69375, |
|
"rewards/format_reward": 1.0, |
|
"step": 900 |
|
}, |
|
{ |
|
"completion_length": 49.60625, |
|
"epoch": 0.728, |
|
"grad_norm": 4.0763630867004395, |
|
"kl": 2.13671875, |
|
"learning_rate": 5.45e-07, |
|
"loss": 0.0213, |
|
"reward": 2.753125, |
|
"reward_std": 0.058183756470680234, |
|
"rewards/accuracy_reward": 1.75625, |
|
"rewards/format_reward": 0.996875, |
|
"step": 910 |
|
}, |
|
{ |
|
"completion_length": 49.83125, |
|
"epoch": 0.736, |
|
"grad_norm": 4.245804786682129, |
|
"kl": 0.094384765625, |
|
"learning_rate": 5.4e-07, |
|
"loss": 0.0009, |
|
"reward": 2.75625, |
|
"reward_std": 0.04136751294136047, |
|
"rewards/accuracy_reward": 1.75625, |
|
"rewards/format_reward": 1.0, |
|
"step": 920 |
|
}, |
|
{ |
|
"completion_length": 50.996875, |
|
"epoch": 0.744, |
|
"grad_norm": 39.86748504638672, |
|
"kl": 3.122021484375, |
|
"learning_rate": 5.35e-07, |
|
"loss": 0.0313, |
|
"reward": 2.8125, |
|
"reward_std": 0.03943375647068024, |
|
"rewards/accuracy_reward": 1.8125, |
|
"rewards/format_reward": 1.0, |
|
"step": 930 |
|
}, |
|
{ |
|
"completion_length": 49.59375, |
|
"epoch": 0.752, |
|
"grad_norm": 0.05779128894209862, |
|
"kl": 0.0998046875, |
|
"learning_rate": 5.3e-07, |
|
"loss": 0.001, |
|
"reward": 2.79375, |
|
"reward_std": 0.051933756470680235, |
|
"rewards/accuracy_reward": 1.79375, |
|
"rewards/format_reward": 1.0, |
|
"step": 940 |
|
}, |
|
{ |
|
"completion_length": 47.74375, |
|
"epoch": 0.76, |
|
"grad_norm": 0.056427907198667526, |
|
"kl": 0.445361328125, |
|
"learning_rate": 5.25e-07, |
|
"loss": 0.0045, |
|
"reward": 2.76875, |
|
"reward_std": 0.0125, |
|
"rewards/accuracy_reward": 1.76875, |
|
"rewards/format_reward": 1.0, |
|
"step": 950 |
|
}, |
|
{ |
|
"completion_length": 48.50625, |
|
"epoch": 0.768, |
|
"grad_norm": 66.90420532226562, |
|
"kl": 2.232958984375, |
|
"learning_rate": 5.2e-07, |
|
"loss": 0.0223, |
|
"reward": 2.80625, |
|
"reward_std": 0.0375, |
|
"rewards/accuracy_reward": 1.80625, |
|
"rewards/format_reward": 1.0, |
|
"step": 960 |
|
}, |
|
{ |
|
"completion_length": 51.01875, |
|
"epoch": 0.776, |
|
"grad_norm": 0.09945037215948105, |
|
"kl": 0.09228515625, |
|
"learning_rate": 5.149999999999999e-07, |
|
"loss": 0.0009, |
|
"reward": 2.775, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 1.775, |
|
"rewards/format_reward": 1.0, |
|
"step": 970 |
|
}, |
|
{ |
|
"completion_length": 50.090625, |
|
"epoch": 0.784, |
|
"grad_norm": 0.07448896020650864, |
|
"kl": 0.15419921875, |
|
"learning_rate": 5.1e-07, |
|
"loss": 0.0015, |
|
"reward": 2.76875, |
|
"reward_std": 0.0375, |
|
"rewards/accuracy_reward": 1.76875, |
|
"rewards/format_reward": 1.0, |
|
"step": 980 |
|
}, |
|
{ |
|
"completion_length": 49.23125, |
|
"epoch": 0.792, |
|
"grad_norm": 2.0038902759552, |
|
"kl": 0.098828125, |
|
"learning_rate": 5.049999999999999e-07, |
|
"loss": 0.001, |
|
"reward": 2.64375, |
|
"reward_std": 0.0125, |
|
"rewards/accuracy_reward": 1.64375, |
|
"rewards/format_reward": 1.0, |
|
"step": 990 |
|
}, |
|
{ |
|
"completion_length": 49.790625, |
|
"epoch": 0.8, |
|
"grad_norm": 0.03871207684278488, |
|
"kl": 0.31083984375, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0031, |
|
"reward": 2.7625, |
|
"reward_std": 0.025, |
|
"rewards/accuracy_reward": 1.7625, |
|
"rewards/format_reward": 1.0, |
|
"step": 1000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|