{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9932459276916965, "eval_steps": 5000, "global_step": 40000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012415574096146206, "grad_norm": 2.5686280727386475, "learning_rate": 0.00025, "loss": 7.7629, "step": 500 }, { "epoch": 0.024831148192292412, "grad_norm": 2.2654430866241455, "learning_rate": 0.0005, "loss": 6.0856, "step": 1000 }, { "epoch": 0.037246722288438616, "grad_norm": 2.7833986282348633, "learning_rate": 0.0004987522459572769, "loss": 5.6864, "step": 1500 }, { "epoch": 0.049662296384584824, "grad_norm": 2.7468013763427734, "learning_rate": 0.0004975044919145539, "loss": 5.3895, "step": 2000 }, { "epoch": 0.06207787048073103, "grad_norm": 2.8749406337738037, "learning_rate": 0.0004962567378718307, "loss": 5.186, "step": 2500 }, { "epoch": 0.07449344457687723, "grad_norm": 2.867882490158081, "learning_rate": 0.0004950089838291076, "loss": 4.977, "step": 3000 }, { "epoch": 0.08690901867302345, "grad_norm": 3.050034523010254, "learning_rate": 0.0004937612297863845, "loss": 4.7687, "step": 3500 }, { "epoch": 0.09932459276916965, "grad_norm": 4.819639682769775, "learning_rate": 0.0004925134757436613, "loss": 4.4337, "step": 4000 }, { "epoch": 0.11174016686531585, "grad_norm": 2.481872081756592, "learning_rate": 0.0004912657217009383, "loss": 4.0712, "step": 4500 }, { "epoch": 0.12415574096146206, "grad_norm": 2.837972402572632, "learning_rate": 0.0004900179676582152, "loss": 3.915, "step": 5000 }, { "epoch": 0.12415574096146206, "eval_loss": 2.701279878616333, "eval_runtime": 3666.7471, "eval_samples_per_second": 351.453, "eval_steps_per_second": 10.983, "step": 5000 }, { "epoch": 0.13657131505760825, "grad_norm": 3.349369525909424, "learning_rate": 0.0004887702136154921, "loss": 3.7757, "step": 5500 }, { "epoch": 0.14898688915375446, "grad_norm": 3.129056930541992, "learning_rate": 0.00048752245957276905, "loss": 3.7123, "step": 6000 }, { "epoch": 0.16140246324990068, "grad_norm": 3.715869426727295, "learning_rate": 0.00048627969654621686, "loss": 3.6079, "step": 6500 }, { "epoch": 0.1738180373460469, "grad_norm": 2.6967413425445557, "learning_rate": 0.0004850319425034937, "loss": 3.5496, "step": 7000 }, { "epoch": 0.18623361144219308, "grad_norm": 2.7264204025268555, "learning_rate": 0.0004837841884607706, "loss": 3.431, "step": 7500 }, { "epoch": 0.1986491855383393, "grad_norm": 2.5806965827941895, "learning_rate": 0.00048253643441804753, "loss": 3.384, "step": 8000 }, { "epoch": 0.2110647596344855, "grad_norm": 3.236356019973755, "learning_rate": 0.0004812886803753244, "loss": 3.3063, "step": 8500 }, { "epoch": 0.2234803337306317, "grad_norm": 3.126569986343384, "learning_rate": 0.00048004342184068677, "loss": 3.2365, "step": 9000 }, { "epoch": 0.2358959078267779, "grad_norm": 2.903918981552124, "learning_rate": 0.0004787956677979637, "loss": 3.1995, "step": 9500 }, { "epoch": 0.24831148192292413, "grad_norm": 2.595036745071411, "learning_rate": 0.00047754791375524057, "loss": 3.1525, "step": 10000 }, { "epoch": 0.24831148192292413, "eval_loss": 2.1151881217956543, "eval_runtime": 3661.5842, "eval_samples_per_second": 351.949, "eval_steps_per_second": 10.999, "step": 10000 }, { "epoch": 0.2607270560190703, "grad_norm": 2.7406485080718994, "learning_rate": 0.0004763001597125175, "loss": 3.1375, "step": 10500 }, { "epoch": 0.2731426301152165, "grad_norm": 3.198718309402466, "learning_rate": 0.0004750549011778798, "loss": 3.0598, "step": 11000 }, { "epoch": 0.28555820421136274, "grad_norm": 2.6438426971435547, "learning_rate": 0.00047380714713515673, "loss": 3.0657, "step": 11500 }, { "epoch": 0.29797377830750893, "grad_norm": 3.145714282989502, "learning_rate": 0.0004725593930924336, "loss": 3.003, "step": 12000 }, { "epoch": 0.31038935240365517, "grad_norm": 3.4619131088256836, "learning_rate": 0.00047131413455779597, "loss": 2.9664, "step": 12500 }, { "epoch": 0.32280492649980136, "grad_norm": 2.8198635578155518, "learning_rate": 0.0004700663805150729, "loss": 2.9595, "step": 13000 }, { "epoch": 0.33522050059594755, "grad_norm": 3.9205424785614014, "learning_rate": 0.0004688186264723498, "loss": 2.9262, "step": 13500 }, { "epoch": 0.3476360746920938, "grad_norm": 3.130042791366577, "learning_rate": 0.0004675708724296267, "loss": 2.8829, "step": 14000 }, { "epoch": 0.36005164878824, "grad_norm": 3.2414395809173584, "learning_rate": 0.0004663231183869036, "loss": 2.8598, "step": 14500 }, { "epoch": 0.37246722288438616, "grad_norm": 3.659555196762085, "learning_rate": 0.00046507536434418044, "loss": 2.8592, "step": 15000 }, { "epoch": 0.37246722288438616, "eval_loss": 1.8211588859558105, "eval_runtime": 3647.4211, "eval_samples_per_second": 353.316, "eval_steps_per_second": 11.041, "step": 15000 }, { "epoch": 0.3848827969805324, "grad_norm": 3.733859062194824, "learning_rate": 0.0004638301058095428, "loss": 2.8209, "step": 15500 }, { "epoch": 0.3972983710766786, "grad_norm": 2.9978246688842773, "learning_rate": 0.00046258235176681973, "loss": 2.7854, "step": 16000 }, { "epoch": 0.4097139451728248, "grad_norm": 2.694765567779541, "learning_rate": 0.00046133459772409666, "loss": 2.789, "step": 16500 }, { "epoch": 0.422129519268971, "grad_norm": 3.022148370742798, "learning_rate": 0.00046008684368137353, "loss": 2.7819, "step": 17000 }, { "epoch": 0.4345450933651172, "grad_norm": 2.756038188934326, "learning_rate": 0.00045883908963865045, "loss": 2.7496, "step": 17500 }, { "epoch": 0.4469606674612634, "grad_norm": 3.0430989265441895, "learning_rate": 0.0004575913355959274, "loss": 2.7558, "step": 18000 }, { "epoch": 0.45937624155740964, "grad_norm": 2.710583209991455, "learning_rate": 0.0004563460770612897, "loss": 2.7214, "step": 18500 }, { "epoch": 0.4717918156535558, "grad_norm": 4.814529895782471, "learning_rate": 0.00045509832301856656, "loss": 2.6977, "step": 19000 }, { "epoch": 0.484207389749702, "grad_norm": 2.782024621963501, "learning_rate": 0.0004538505689758435, "loss": 2.7066, "step": 19500 }, { "epoch": 0.49662296384584825, "grad_norm": 2.9942479133605957, "learning_rate": 0.00045260281493312036, "loss": 2.6744, "step": 20000 }, { "epoch": 0.49662296384584825, "eval_loss": 1.681386947631836, "eval_runtime": 3650.0754, "eval_samples_per_second": 353.059, "eval_steps_per_second": 11.033, "step": 20000 }, { "epoch": 0.5090385379419944, "grad_norm": 2.6107559204101562, "learning_rate": 0.00045135755639848273, "loss": 2.6627, "step": 20500 }, { "epoch": 0.5214541120381406, "grad_norm": 3.603623390197754, "learning_rate": 0.00045010980235575966, "loss": 2.6374, "step": 21000 }, { "epoch": 0.5338696861342869, "grad_norm": 2.804776668548584, "learning_rate": 0.0004488620483130366, "loss": 2.6477, "step": 21500 }, { "epoch": 0.546285260230433, "grad_norm": 3.2368860244750977, "learning_rate": 0.00044761429427031345, "loss": 2.6416, "step": 22000 }, { "epoch": 0.5587008343265792, "grad_norm": 2.6095378398895264, "learning_rate": 0.0004463665402275903, "loss": 2.6248, "step": 22500 }, { "epoch": 0.5711164084227255, "grad_norm": 2.9860754013061523, "learning_rate": 0.0004451212816929527, "loss": 2.6254, "step": 23000 }, { "epoch": 0.5835319825188717, "grad_norm": 3.114459276199341, "learning_rate": 0.0004438735276502296, "loss": 2.6257, "step": 23500 }, { "epoch": 0.5959475566150179, "grad_norm": 2.812556028366089, "learning_rate": 0.0004426257736075065, "loss": 2.5746, "step": 24000 }, { "epoch": 0.6083631307111641, "grad_norm": 3.2355823516845703, "learning_rate": 0.0004413780195647834, "loss": 2.5919, "step": 24500 }, { "epoch": 0.6207787048073103, "grad_norm": 2.5201354026794434, "learning_rate": 0.0004401302655220603, "loss": 2.5754, "step": 25000 }, { "epoch": 0.6207787048073103, "eval_loss": 1.5900601148605347, "eval_runtime": 4174.4112, "eval_samples_per_second": 308.712, "eval_steps_per_second": 9.647, "step": 25000 }, { "epoch": 0.6331942789034565, "grad_norm": 2.8540596961975098, "learning_rate": 0.0004388825114793372, "loss": 2.5705, "step": 25500 }, { "epoch": 0.6456098529996027, "grad_norm": 2.603358507156372, "learning_rate": 0.00043763475743661414, "loss": 2.5342, "step": 26000 }, { "epoch": 0.658025427095749, "grad_norm": 2.7852208614349365, "learning_rate": 0.00043638700339389096, "loss": 2.5463, "step": 26500 }, { "epoch": 0.6704410011918951, "grad_norm": 2.7578940391540527, "learning_rate": 0.0004351392493511679, "loss": 2.5372, "step": 27000 }, { "epoch": 0.6828565752880413, "grad_norm": 2.941049337387085, "learning_rate": 0.00043389399081653025, "loss": 2.5207, "step": 27500 }, { "epoch": 0.6952721493841876, "grad_norm": 2.7455787658691406, "learning_rate": 0.0004326462367738072, "loss": 2.5233, "step": 28000 }, { "epoch": 0.7076877234803337, "grad_norm": 2.4482600688934326, "learning_rate": 0.00043139848273108405, "loss": 2.5105, "step": 28500 }, { "epoch": 0.72010329757648, "grad_norm": 2.8398752212524414, "learning_rate": 0.000430150728688361, "loss": 2.531, "step": 29000 }, { "epoch": 0.7325188716726262, "grad_norm": 2.608999013900757, "learning_rate": 0.00042890547015372334, "loss": 2.4864, "step": 29500 }, { "epoch": 0.7449344457687723, "grad_norm": 2.071620225906372, "learning_rate": 0.00042765771611100016, "loss": 2.4574, "step": 30000 }, { "epoch": 0.7449344457687723, "eval_loss": 1.5331941843032837, "eval_runtime": 4179.1286, "eval_samples_per_second": 308.364, "eval_steps_per_second": 9.636, "step": 30000 }, { "epoch": 0.7573500198649186, "grad_norm": 3.0172479152679443, "learning_rate": 0.0004264099620682771, "loss": 2.4733, "step": 30500 }, { "epoch": 0.7697655939610648, "grad_norm": 2.6325442790985107, "learning_rate": 0.000425162208025554, "loss": 2.4721, "step": 31000 }, { "epoch": 0.7821811680572109, "grad_norm": 2.826345682144165, "learning_rate": 0.0004239144539828309, "loss": 2.4692, "step": 31500 }, { "epoch": 0.7945967421533572, "grad_norm": 2.456289291381836, "learning_rate": 0.00042266919544819325, "loss": 2.4385, "step": 32000 }, { "epoch": 0.8070123162495034, "grad_norm": 2.4803292751312256, "learning_rate": 0.0004214214414054702, "loss": 2.439, "step": 32500 }, { "epoch": 0.8194278903456496, "grad_norm": 2.6469247341156006, "learning_rate": 0.00042017618287083254, "loss": 2.4729, "step": 33000 }, { "epoch": 0.8318434644417958, "grad_norm": 2.7024786472320557, "learning_rate": 0.0004189284288281094, "loss": 2.4244, "step": 33500 }, { "epoch": 0.844259038537942, "grad_norm": 2.847285270690918, "learning_rate": 0.0004176806747853863, "loss": 2.4636, "step": 34000 }, { "epoch": 0.8566746126340882, "grad_norm": 2.453200340270996, "learning_rate": 0.0004164329207426632, "loss": 2.4524, "step": 34500 }, { "epoch": 0.8690901867302344, "grad_norm": 2.49642276763916, "learning_rate": 0.0004151876622080256, "loss": 2.4457, "step": 35000 }, { "epoch": 0.8690901867302344, "eval_loss": 1.4776599407196045, "eval_runtime": 4154.52, "eval_samples_per_second": 310.19, "eval_steps_per_second": 9.694, "step": 35000 }, { "epoch": 0.8815057608263807, "grad_norm": 2.576984405517578, "learning_rate": 0.00041393990816530245, "loss": 2.4149, "step": 35500 }, { "epoch": 0.8939213349225268, "grad_norm": 3.0729165077209473, "learning_rate": 0.0004126921541225794, "loss": 2.4067, "step": 36000 }, { "epoch": 0.906336909018673, "grad_norm": 2.7619829177856445, "learning_rate": 0.0004114444000798563, "loss": 2.4121, "step": 36500 }, { "epoch": 0.9187524831148193, "grad_norm": 3.5316452980041504, "learning_rate": 0.0004101991415452186, "loss": 2.3781, "step": 37000 }, { "epoch": 0.9311680572109654, "grad_norm": 2.7174599170684814, "learning_rate": 0.0004089538830105809, "loss": 2.4013, "step": 37500 }, { "epoch": 0.9435836313071116, "grad_norm": 13.372625350952148, "learning_rate": 0.00040771611100019967, "loss": 2.5449, "step": 38000 }, { "epoch": 0.9559992054032579, "grad_norm": 11.173745155334473, "learning_rate": 0.0004064883210221601, "loss": 6.7867, "step": 38500 }, { "epoch": 0.968414779499404, "grad_norm": 0.9608703255653381, "learning_rate": 0.000405240566979437, "loss": 7.7097, "step": 39000 }, { "epoch": 0.9808303535955503, "grad_norm": 8286.201171875, "learning_rate": 0.0004039928129367139, "loss": 7.7291, "step": 39500 }, { "epoch": 0.9932459276916965, "grad_norm": 2.1965973377227783, "learning_rate": 0.00040274505889399085, "loss": 7.7205, "step": 40000 }, { "epoch": 0.9932459276916965, "eval_loss": 7.322892665863037, "eval_runtime": 4170.5156, "eval_samples_per_second": 309.0, "eval_steps_per_second": 9.656, "step": 40000 } ], "logging_steps": 500, "max_steps": 201360, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.6807169819837645e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }