{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.8780487804878048, "eval_steps": 100, "global_step": 30, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 19.635416507720947, "epoch": 0.0975609756097561, "grad_norm": 0.5056069493293762, "kl": 0.0, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.5598958358168602, "reward_std": 0.09434993285685778, "rewards/semantic_entropy": 0.5598958358168602, "step": 1 }, { "completion_length": 17.5711807012558, "epoch": 0.1951219512195122, "grad_norm": 1.0753142833709717, "kl": 0.0, "learning_rate": 1.9396926207859082e-06, "loss": 0.0, "reward": 0.5069444291293621, "reward_std": 0.130544594489038, "rewards/semantic_entropy": 0.5069444291293621, "step": 2 }, { "completion_length": 17.8680557012558, "epoch": 0.2926829268292683, "grad_norm": 1.0308548212051392, "kl": 0.0010528564453125, "learning_rate": 1.766044443118978e-06, "loss": 0.0, "reward": 0.5251736156642437, "reward_std": 0.13977272436022758, "rewards/semantic_entropy": 0.5251736156642437, "step": 3 }, { "completion_length": 20.52256941795349, "epoch": 0.3902439024390244, "grad_norm": 0.6616184711456299, "kl": 0.00194549560546875, "learning_rate": 1.5e-06, "loss": 0.0001, "reward": 0.5691550932824612, "reward_std": 0.13807372376322746, "rewards/semantic_entropy": 0.5691550932824612, "step": 4 }, { "completion_length": 19.373263835906982, "epoch": 0.4878048780487805, "grad_norm": 0.7476760149002075, "kl": 0.003894805908203125, "learning_rate": 1.1736481776669305e-06, "loss": 0.0002, "reward": 0.5060763992369175, "reward_std": 0.10967991594225168, "rewards/semantic_entropy": 0.5060763992369175, "step": 5 }, { "completion_length": 19.572916984558105, "epoch": 0.5853658536585366, "grad_norm": 0.5244656205177307, "kl": 0.0069637298583984375, "learning_rate": 8.263518223330696e-07, "loss": 0.0003, "reward": 0.6232638955116272, "reward_std": 0.11568193091079593, "rewards/semantic_entropy": 0.6232638955116272, "step": 6 }, { "completion_length": 18.697916507720947, "epoch": 0.6829268292682927, "grad_norm": 0.5207864046096802, "kl": 0.013153076171875, "learning_rate": 5.000000000000002e-07, "loss": 0.0005, "reward": 0.621527798473835, "reward_std": 0.1189112700521946, "rewards/semantic_entropy": 0.621527798473835, "step": 7 }, { "completion_length": 19.454861402511597, "epoch": 0.7804878048780488, "grad_norm": 0.6792996525764465, "kl": 0.0097503662109375, "learning_rate": 2.339555568810221e-07, "loss": 0.0004, "reward": 0.5682870484888554, "reward_std": 0.13645811565220356, "rewards/semantic_entropy": 0.5682870484888554, "step": 8 }, { "completion_length": 20.125000476837158, "epoch": 0.8780487804878049, "grad_norm": 0.5297054052352905, "kl": 0.010364532470703125, "learning_rate": 6.030737921409168e-08, "loss": 0.0004, "reward": 0.5454282537102699, "reward_std": 0.10149804083630443, "rewards/semantic_entropy": 0.5454282537102699, "step": 9 }, { "completion_length": 19.699653148651123, "epoch": 0.975609756097561, "grad_norm": 0.4804926812648773, "kl": 0.01203155517578125, "learning_rate": 0.0, "loss": 0.0005, "reward": 0.6212384253740311, "reward_std": 0.09729464584961534, "rewards/semantic_entropy": 0.6212384253740311, "step": 10 }, { "completion_length": 17.564236164093018, "epoch": 1.0975609756097562, "grad_norm": 0.6446972489356995, "kl": 0.009983062744140625, "learning_rate": 1.5971585917027862e-06, "loss": 0.0004, "reward": 0.5920138917863369, "reward_std": 0.11490329634398222, "rewards/semantic_entropy": 0.5920138917863369, "step": 11 }, { "completion_length": 20.96701431274414, "epoch": 1.1951219512195121, "grad_norm": 0.4366309642791748, "kl": 0.012844085693359375, "learning_rate": 1.5e-06, "loss": 0.0005, "reward": 0.5960648246109486, "reward_std": 0.10349765885621309, "rewards/semantic_entropy": 0.5960648246109486, "step": 12 }, { "completion_length": 20.28819489479065, "epoch": 1.2926829268292683, "grad_norm": 0.7926385998725891, "kl": 0.017467498779296875, "learning_rate": 1.3960797660391568e-06, "loss": 0.0007, "reward": 0.5014467723667622, "reward_std": 0.12278107088059187, "rewards/semantic_entropy": 0.5014467723667622, "step": 13 }, { "completion_length": 20.972223043441772, "epoch": 1.3902439024390243, "grad_norm": 0.67302006483078, "kl": 0.017255783081054688, "learning_rate": 1.2868032327110903e-06, "loss": 0.0007, "reward": 0.5095486305654049, "reward_std": 0.09976449748501182, "rewards/semantic_entropy": 0.5095486305654049, "step": 14 }, { "completion_length": 18.17013907432556, "epoch": 1.4878048780487805, "grad_norm": 0.43078961968421936, "kl": 0.0076904296875, "learning_rate": 1.1736481776669305e-06, "loss": 0.0003, "reward": 0.588252317160368, "reward_std": 0.1104576913639903, "rewards/semantic_entropy": 0.588252317160368, "step": 15 }, { "completion_length": 19.697917222976685, "epoch": 1.5853658536585367, "grad_norm": 0.4722602665424347, "kl": 0.010288238525390625, "learning_rate": 1.0581448289104758e-06, "loss": 0.0004, "reward": 0.559606496244669, "reward_std": 0.11904297955334187, "rewards/semantic_entropy": 0.559606496244669, "step": 16 }, { "completion_length": 19.661458730697632, "epoch": 1.6829268292682928, "grad_norm": 0.6996189951896667, "kl": 0.012149810791015625, "learning_rate": 9.418551710895241e-07, "loss": 0.0005, "reward": 0.591435182839632, "reward_std": 0.10133868269622326, "rewards/semantic_entropy": 0.591435182839632, "step": 17 }, { "completion_length": 21.19444465637207, "epoch": 1.7804878048780488, "grad_norm": 0.5424029231071472, "kl": 0.023311614990234375, "learning_rate": 8.263518223330696e-07, "loss": 0.0009, "reward": 0.5104166567325592, "reward_std": 0.1390146454796195, "rewards/semantic_entropy": 0.5104166567325592, "step": 18 }, { "completion_length": 21.192708253860474, "epoch": 1.8780487804878048, "grad_norm": 0.5004396438598633, "kl": 0.007457733154296875, "learning_rate": 7.1319676728891e-07, "loss": 0.0003, "reward": 0.5214120373129845, "reward_std": 0.12240998912602663, "rewards/semantic_entropy": 0.5214120373129845, "step": 19 }, { "completion_length": 19.510417342185974, "epoch": 1.975609756097561, "grad_norm": 0.5037679076194763, "kl": 0.012096405029296875, "learning_rate": 6.039202339608431e-07, "loss": 0.0005, "reward": 0.6221064738929272, "reward_std": 0.11695278249680996, "rewards/semantic_entropy": 0.6221064738929272, "step": 20 }, { "completion_length": 17.875, "epoch": 2.0, "grad_norm": 0.5037679076194763, "kl": 0.03363037109375, "learning_rate": 5.000000000000002e-07, "loss": 0.0003, "reward": 1.0, "reward_std": 0.0, "rewards/semantic_entropy": 1.0, "step": 21 }, { "completion_length": 18.83506989479065, "epoch": 2.097560975609756, "grad_norm": 0.727460503578186, "kl": 0.01171875, "learning_rate": 4.02841408297214e-07, "loss": 0.0005, "reward": 0.6105324216187, "reward_std": 0.0801440766081214, "rewards/semantic_entropy": 0.6105324216187, "step": 22 }, { "completion_length": 20.21701443195343, "epoch": 2.1951219512195124, "grad_norm": 0.6406362056732178, "kl": 0.01398468017578125, "learning_rate": 3.137583621312665e-07, "loss": 0.0006, "reward": 0.5815972313284874, "reward_std": 0.12349289190024137, "rewards/semantic_entropy": 0.5815972313284874, "step": 23 }, { "completion_length": 21.807291984558105, "epoch": 2.292682926829268, "grad_norm": 0.45703092217445374, "kl": 0.012033462524414062, "learning_rate": 2.339555568810221e-07, "loss": 0.0005, "reward": 0.5804398320615292, "reward_std": 0.1076621082611382, "rewards/semantic_entropy": 0.5804398320615292, "step": 24 }, { "completion_length": 20.090278387069702, "epoch": 2.3902439024390243, "grad_norm": 1.17950439453125, "kl": 0.023633956909179688, "learning_rate": 1.6451218858706372e-07, "loss": 0.0009, "reward": 0.4774305485188961, "reward_std": 0.11873876117169857, "rewards/semantic_entropy": 0.4774305485188961, "step": 25 }, { "completion_length": 19.114583492279053, "epoch": 2.4878048780487805, "grad_norm": 1.0232934951782227, "kl": 0.021167755126953125, "learning_rate": 1.0636735967658784e-07, "loss": 0.0008, "reward": 0.5917245410382748, "reward_std": 0.10972362849861383, "rewards/semantic_entropy": 0.5917245410382748, "step": 26 }, { "completion_length": 17.560763835906982, "epoch": 2.5853658536585367, "grad_norm": 0.45686405897140503, "kl": 0.0112457275390625, "learning_rate": 6.030737921409168e-08, "loss": 0.0004, "reward": 0.5879629701375961, "reward_std": 0.10896958655212075, "rewards/semantic_entropy": 0.5879629701375961, "step": 27 }, { "completion_length": 18.482638955116272, "epoch": 2.682926829268293, "grad_norm": 2.8627212047576904, "kl": 0.028564453125, "learning_rate": 2.6955129420176193e-08, "loss": 0.0011, "reward": 0.5341435223817825, "reward_std": 0.11482170736417174, "rewards/semantic_entropy": 0.5341435223817825, "step": 28 }, { "completion_length": 20.75868058204651, "epoch": 2.7804878048780486, "grad_norm": 0.60300213098526, "kl": 0.014234542846679688, "learning_rate": 6.761642258056976e-09, "loss": 0.0006, "reward": 0.47453703358769417, "reward_std": 0.12850847654044628, "rewards/semantic_entropy": 0.47453703358769417, "step": 29 }, { "completion_length": 19.67881965637207, "epoch": 2.8780487804878048, "grad_norm": 0.4153330624103546, "kl": 0.01200103759765625, "learning_rate": 0.0, "loss": 0.0005, "reward": 0.6588541865348816, "reward_std": 0.10488813614938408, "rewards/semantic_entropy": 0.6588541865348816, "step": 30 }, { "epoch": 2.8780487804878048, "step": 30, "total_flos": 0.0, "train_loss": 0.00038097099556277193, "train_runtime": 1888.0541, "train_samples_per_second": 0.779, "train_steps_per_second": 0.016 } ], "logging_steps": 1, "max_steps": 30, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }