{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7511737089201878, "eval_steps": 200, "global_step": 1400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02682763246143528, "grad_norm": 7.5027008056640625, "learning_rate": 0.00019991475902800314, "loss": 18.1731, "step": 50 }, { "epoch": 0.05365526492287056, "grad_norm": 5.68405294418335, "learning_rate": 0.0001996521945196495, "loss": 16.1431, "step": 100 }, { "epoch": 0.08048289738430583, "grad_norm": 5.743220806121826, "learning_rate": 0.00019921273704680872, "loss": 16.2968, "step": 150 }, { "epoch": 0.10731052984574112, "grad_norm": 6.115276336669922, "learning_rate": 0.0001985971666920019, "loss": 15.9962, "step": 200 }, { "epoch": 0.10731052984574112, "eval_loss": 1.7225275039672852, "eval_runtime": 27.5115, "eval_samples_per_second": 3.635, "eval_steps_per_second": 1.817, "step": 200 }, { "epoch": 0.1341381623071764, "grad_norm": 8.04736328125, "learning_rate": 0.00019780657615635263, "loss": 15.9676, "step": 250 }, { "epoch": 0.16096579476861167, "grad_norm": 6.840994358062744, "learning_rate": 0.0001968423688199291, "loss": 15.7539, "step": 300 }, { "epoch": 0.18779342723004694, "grad_norm": 8.343657493591309, "learning_rate": 0.00019570625625059935, "loss": 15.5103, "step": 350 }, { "epoch": 0.21462105969148224, "grad_norm": 6.539078712463379, "learning_rate": 0.00019440025516582107, "loss": 15.4896, "step": 400 }, { "epoch": 0.21462105969148224, "eval_loss": 1.6813037395477295, "eval_runtime": 27.4264, "eval_samples_per_second": 3.646, "eval_steps_per_second": 1.823, "step": 400 }, { "epoch": 0.2414486921529175, "grad_norm": 5.919412612915039, "learning_rate": 0.00019292668385275914, "loss": 15.7171, "step": 450 }, { "epoch": 0.2682763246143528, "grad_norm": 8.476408958435059, "learning_rate": 0.00019128815805308596, "loss": 15.6568, "step": 500 }, { "epoch": 0.29510395707578807, "grad_norm": 6.3846330642700195, "learning_rate": 0.00018948758631976906, "loss": 15.5006, "step": 550 }, { "epoch": 0.32193158953722334, "grad_norm": 9.207799911499023, "learning_rate": 0.00018752816485408843, "loss": 15.3622, "step": 600 }, { "epoch": 0.32193158953722334, "eval_loss": 1.657413363456726, "eval_runtime": 27.4485, "eval_samples_per_second": 3.643, "eval_steps_per_second": 1.822, "step": 600 }, { "epoch": 0.3487592219986586, "grad_norm": 8.288525581359863, "learning_rate": 0.00018541337183204833, "loss": 15.1776, "step": 650 }, { "epoch": 0.3755868544600939, "grad_norm": 6.99819278717041, "learning_rate": 0.00018314696123025454, "loss": 15.2676, "step": 700 }, { "epoch": 0.4024144869215292, "grad_norm": 6.276946544647217, "learning_rate": 0.0001807329561622173, "loss": 15.4822, "step": 750 }, { "epoch": 0.42924211938296447, "grad_norm": 7.4382123947143555, "learning_rate": 0.00017817564173690811, "loss": 15.17, "step": 800 }, { "epoch": 0.42924211938296447, "eval_loss": 1.6430630683898926, "eval_runtime": 27.4278, "eval_samples_per_second": 3.646, "eval_steps_per_second": 1.823, "step": 800 }, { "epoch": 0.45606975184439974, "grad_norm": 6.525240898132324, "learning_rate": 0.00017547955745224783, "loss": 15.2127, "step": 850 }, { "epoch": 0.482897384305835, "grad_norm": 6.991734027862549, "learning_rate": 0.00017264948913702733, "loss": 14.9923, "step": 900 }, { "epoch": 0.5097250167672703, "grad_norm": 7.74118709564209, "learning_rate": 0.0001696904604555664, "loss": 14.9959, "step": 950 }, { "epoch": 0.5365526492287056, "grad_norm": 8.33617115020752, "learning_rate": 0.00016660772399018878, "loss": 14.8148, "step": 1000 }, { "epoch": 0.5365526492287056, "eval_loss": 1.5783880949020386, "eval_runtime": 27.4606, "eval_samples_per_second": 3.642, "eval_steps_per_second": 1.821, "step": 1000 }, { "epoch": 0.5633802816901409, "grad_norm": 7.486099720001221, "learning_rate": 0.000163406751917345, "loss": 14.7974, "step": 1050 }, { "epoch": 0.5902079141515761, "grad_norm": 8.10119342803955, "learning_rate": 0.0001600932262939324, "loss": 14.5883, "step": 1100 }, { "epoch": 0.6170355466130114, "grad_norm": 10.781996726989746, "learning_rate": 0.0001566730289710558, "loss": 14.2, "step": 1150 }, { "epoch": 0.6438631790744467, "grad_norm": 14.64252758026123, "learning_rate": 0.00015315223115313262, "loss": 11.8162, "step": 1200 }, { "epoch": 0.6438631790744467, "eval_loss": 1.005837082862854, "eval_runtime": 27.4338, "eval_samples_per_second": 3.645, "eval_steps_per_second": 1.823, "step": 1200 }, { "epoch": 0.670690811535882, "grad_norm": 13.796377182006836, "learning_rate": 0.0001495370826208764, "loss": 6.8475, "step": 1250 }, { "epoch": 0.6975184439973172, "grad_norm": 11.29936408996582, "learning_rate": 0.0001458340006372889, "loss": 2.4798, "step": 1300 }, { "epoch": 0.7243460764587525, "grad_norm": 6.6834492683410645, "learning_rate": 0.0001420495585563536, "loss": 1.6183, "step": 1350 }, { "epoch": 0.7511737089201878, "grad_norm": 5.344634532928467, "learning_rate": 0.00013819047415465196, "loss": 1.4575, "step": 1400 }, { "epoch": 0.7511737089201878, "eval_loss": 0.19598053395748138, "eval_runtime": 27.4448, "eval_samples_per_second": 3.644, "eval_steps_per_second": 1.822, "step": 1400 } ], "logging_steps": 50, "max_steps": 3728, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0146684194441626e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }