{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.180281690140845, "eval_steps": 200, "global_step": 2200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02682763246143528, "grad_norm": 7.5027008056640625, "learning_rate": 0.00019991475902800314, "loss": 18.1731, "step": 50 }, { "epoch": 0.05365526492287056, "grad_norm": 5.68405294418335, "learning_rate": 0.0001996521945196495, "loss": 16.1431, "step": 100 }, { "epoch": 0.08048289738430583, "grad_norm": 5.743220806121826, "learning_rate": 0.00019921273704680872, "loss": 16.2968, "step": 150 }, { "epoch": 0.10731052984574112, "grad_norm": 6.115276336669922, "learning_rate": 0.0001985971666920019, "loss": 15.9962, "step": 200 }, { "epoch": 0.10731052984574112, "eval_loss": 1.7225275039672852, "eval_runtime": 27.5115, "eval_samples_per_second": 3.635, "eval_steps_per_second": 1.817, "step": 200 }, { "epoch": 0.1341381623071764, "grad_norm": 8.04736328125, "learning_rate": 0.00019780657615635263, "loss": 15.9676, "step": 250 }, { "epoch": 0.16096579476861167, "grad_norm": 6.840994358062744, "learning_rate": 0.0001968423688199291, "loss": 15.7539, "step": 300 }, { "epoch": 0.18779342723004694, "grad_norm": 8.343657493591309, "learning_rate": 0.00019570625625059935, "loss": 15.5103, "step": 350 }, { "epoch": 0.21462105969148224, "grad_norm": 6.539078712463379, "learning_rate": 0.00019440025516582107, "loss": 15.4896, "step": 400 }, { "epoch": 0.21462105969148224, "eval_loss": 1.6813037395477295, "eval_runtime": 27.4264, "eval_samples_per_second": 3.646, "eval_steps_per_second": 1.823, "step": 400 }, { "epoch": 0.2414486921529175, "grad_norm": 5.919412612915039, "learning_rate": 0.00019292668385275914, "loss": 15.7171, "step": 450 }, { "epoch": 0.2682763246143528, "grad_norm": 8.476408958435059, "learning_rate": 0.00019128815805308596, "loss": 15.6568, "step": 500 }, { "epoch": 0.29510395707578807, "grad_norm": 6.3846330642700195, "learning_rate": 0.00018948758631976906, "loss": 15.5006, "step": 550 }, { "epoch": 0.32193158953722334, "grad_norm": 9.207799911499023, "learning_rate": 0.00018752816485408843, "loss": 15.3622, "step": 600 }, { "epoch": 0.32193158953722334, "eval_loss": 1.657413363456726, "eval_runtime": 27.4485, "eval_samples_per_second": 3.643, "eval_steps_per_second": 1.822, "step": 600 }, { "epoch": 0.3487592219986586, "grad_norm": 8.288525581359863, "learning_rate": 0.00018541337183204833, "loss": 15.1776, "step": 650 }, { "epoch": 0.3755868544600939, "grad_norm": 6.99819278717041, "learning_rate": 0.00018314696123025454, "loss": 15.2676, "step": 700 }, { "epoch": 0.4024144869215292, "grad_norm": 6.276946544647217, "learning_rate": 0.0001807329561622173, "loss": 15.4822, "step": 750 }, { "epoch": 0.42924211938296447, "grad_norm": 7.4382123947143555, "learning_rate": 0.00017817564173690811, "loss": 15.17, "step": 800 }, { "epoch": 0.42924211938296447, "eval_loss": 1.6430630683898926, "eval_runtime": 27.4278, "eval_samples_per_second": 3.646, "eval_steps_per_second": 1.823, "step": 800 }, { "epoch": 0.45606975184439974, "grad_norm": 6.525240898132324, "learning_rate": 0.00017547955745224783, "loss": 15.2127, "step": 850 }, { "epoch": 0.482897384305835, "grad_norm": 6.991734027862549, "learning_rate": 0.00017264948913702733, "loss": 14.9923, "step": 900 }, { "epoch": 0.5097250167672703, "grad_norm": 7.74118709564209, "learning_rate": 0.0001696904604555664, "loss": 14.9959, "step": 950 }, { "epoch": 0.5365526492287056, "grad_norm": 8.33617115020752, "learning_rate": 0.00016660772399018878, "loss": 14.8148, "step": 1000 }, { "epoch": 0.5365526492287056, "eval_loss": 1.5783880949020386, "eval_runtime": 27.4606, "eval_samples_per_second": 3.642, "eval_steps_per_second": 1.821, "step": 1000 }, { "epoch": 0.5633802816901409, "grad_norm": 7.486099720001221, "learning_rate": 0.000163406751917345, "loss": 14.7974, "step": 1050 }, { "epoch": 0.5902079141515761, "grad_norm": 8.10119342803955, "learning_rate": 0.0001600932262939324, "loss": 14.5883, "step": 1100 }, { "epoch": 0.6170355466130114, "grad_norm": 10.781996726989746, "learning_rate": 0.0001566730289710558, "loss": 14.2, "step": 1150 }, { "epoch": 0.6438631790744467, "grad_norm": 14.64252758026123, "learning_rate": 0.00015315223115313262, "loss": 11.8162, "step": 1200 }, { "epoch": 0.6438631790744467, "eval_loss": 1.005837082862854, "eval_runtime": 27.4338, "eval_samples_per_second": 3.645, "eval_steps_per_second": 1.823, "step": 1200 }, { "epoch": 0.670690811535882, "grad_norm": 13.796377182006836, "learning_rate": 0.0001495370826208764, "loss": 6.8475, "step": 1250 }, { "epoch": 0.6975184439973172, "grad_norm": 11.29936408996582, "learning_rate": 0.0001458340006372889, "loss": 2.4798, "step": 1300 }, { "epoch": 0.7243460764587525, "grad_norm": 6.6834492683410645, "learning_rate": 0.0001420495585563536, "loss": 1.6183, "step": 1350 }, { "epoch": 0.7511737089201878, "grad_norm": 5.344634532928467, "learning_rate": 0.00013819047415465196, "loss": 1.4575, "step": 1400 }, { "epoch": 0.7511737089201878, "eval_loss": 0.19598053395748138, "eval_runtime": 27.4448, "eval_samples_per_second": 3.644, "eval_steps_per_second": 1.822, "step": 1400 }, { "epoch": 0.778001341381623, "grad_norm": 8.362627029418945, "learning_rate": 0.00013426359770661405, "loss": 1.2922, "step": 1450 }, { "epoch": 0.8048289738430584, "grad_norm": 4.9234418869018555, "learning_rate": 0.00013027589982457228, "loss": 1.2766, "step": 1500 }, { "epoch": 0.8316566063044937, "grad_norm": 5.768574237823486, "learning_rate": 0.00012623445908520258, "loss": 1.1293, "step": 1550 }, { "epoch": 0.8584842387659289, "grad_norm": 5.284246444702148, "learning_rate": 0.00012214644946431785, "loss": 1.1295, "step": 1600 }, { "epoch": 0.8584842387659289, "eval_loss": 0.16764384508132935, "eval_runtime": 27.4111, "eval_samples_per_second": 3.648, "eval_steps_per_second": 1.824, "step": 1600 }, { "epoch": 0.8853118712273642, "grad_norm": 5.7961225509643555, "learning_rate": 0.00011801912760231802, "loss": 1.0936, "step": 1650 }, { "epoch": 0.9121395036887995, "grad_norm": 4.642612934112549, "learning_rate": 0.0001138598199229018, "loss": 1.0898, "step": 1700 }, { "epoch": 0.9389671361502347, "grad_norm": 5.459797382354736, "learning_rate": 0.0001096759096279061, "loss": 1.0331, "step": 1750 }, { "epoch": 0.96579476861167, "grad_norm": 4.336981773376465, "learning_rate": 0.00010547482359135805, "loss": 1.0427, "step": 1800 }, { "epoch": 0.96579476861167, "eval_loss": 0.15734320878982544, "eval_runtime": 27.4537, "eval_samples_per_second": 3.642, "eval_steps_per_second": 1.821, "step": 1800 }, { "epoch": 0.9926224010731053, "grad_norm": 4.62598991394043, "learning_rate": 0.00010126401917600476, "loss": 1.0167, "step": 1850 }, { "epoch": 1.0193158953722334, "grad_norm": 3.739978551864624, "learning_rate": 9.705097099572197e-05, "loss": 0.818, "step": 1900 }, { "epoch": 1.0461435278336686, "grad_norm": 3.3912863731384277, "learning_rate": 9.284315764730047e-05, "loss": 0.9189, "step": 1950 }, { "epoch": 1.072971160295104, "grad_norm": 3.9238812923431396, "learning_rate": 8.864804843516263e-05, "loss": 0.8213, "step": 2000 }, { "epoch": 1.072971160295104, "eval_loss": 0.15182910859584808, "eval_runtime": 27.4216, "eval_samples_per_second": 3.647, "eval_steps_per_second": 1.823, "step": 2000 }, { "epoch": 1.0997987927565391, "grad_norm": 3.6631221771240234, "learning_rate": 8.447309011257313e-05, "loss": 0.8213, "step": 2050 }, { "epoch": 1.1266264252179745, "grad_norm": 3.994750499725342, "learning_rate": 8.032569366288115e-05, "loss": 0.9157, "step": 2100 }, { "epoch": 1.1534540576794097, "grad_norm": 4.084840297698975, "learning_rate": 7.621322114425742e-05, "loss": 0.827, "step": 2150 }, { "epoch": 1.180281690140845, "grad_norm": 3.7172036170959473, "learning_rate": 7.214297262127847e-05, "loss": 0.8447, "step": 2200 }, { "epoch": 1.180281690140845, "eval_loss": 0.14608320593833923, "eval_runtime": 27.4428, "eval_samples_per_second": 3.644, "eval_steps_per_second": 1.822, "step": 2200 } ], "logging_steps": 50, "max_steps": 3728, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.5950289413921587e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }