{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 2.7287635803222656, "learning_rate": 2.3936170212765957e-06, "loss": 1.8321, "step": 10 }, { "epoch": 0.032, "grad_norm": 2.5078585147857666, "learning_rate": 5.053191489361702e-06, "loss": 1.7408, "step": 20 }, { "epoch": 0.048, "grad_norm": 2.1067042350769043, "learning_rate": 7.712765957446808e-06, "loss": 1.5053, "step": 30 }, { "epoch": 0.064, "grad_norm": 1.6803735494613647, "learning_rate": 1.0372340425531916e-05, "loss": 1.176, "step": 40 }, { "epoch": 0.08, "grad_norm": 1.358530044555664, "learning_rate": 1.3031914893617023e-05, "loss": 0.7815, "step": 50 }, { "epoch": 0.096, "grad_norm": 0.7375353574752808, "learning_rate": 1.5691489361702127e-05, "loss": 0.4287, "step": 60 }, { "epoch": 0.112, "grad_norm": 0.5832469463348389, "learning_rate": 1.8351063829787234e-05, "loss": 0.3052, "step": 70 }, { "epoch": 0.128, "grad_norm": 0.41397199034690857, "learning_rate": 2.1010638297872342e-05, "loss": 0.1874, "step": 80 }, { "epoch": 0.144, "grad_norm": 0.4094902575016022, "learning_rate": 2.3670212765957446e-05, "loss": 0.1135, "step": 90 }, { "epoch": 0.16, "grad_norm": 0.2328539937734604, "learning_rate": 2.6329787234042553e-05, "loss": 0.047, "step": 100 }, { "epoch": 0.176, "grad_norm": 0.1451612263917923, "learning_rate": 2.898936170212766e-05, "loss": 0.0175, "step": 110 }, { "epoch": 0.192, "grad_norm": 0.02662522904574871, "learning_rate": 3.164893617021277e-05, "loss": 0.0036, "step": 120 }, { "epoch": 0.208, "grad_norm": 0.01232597604393959, "learning_rate": 3.430851063829787e-05, "loss": 0.0014, "step": 130 }, { "epoch": 0.224, "grad_norm": 0.0077546993270516396, "learning_rate": 3.696808510638298e-05, "loss": 0.0007, "step": 140 }, { "epoch": 0.24, "grad_norm": 0.005459086503833532, "learning_rate": 3.962765957446809e-05, "loss": 0.0006, "step": 150 }, { "epoch": 0.256, "grad_norm": 0.005705484189093113, "learning_rate": 4.228723404255319e-05, "loss": 0.0005, "step": 160 }, { "epoch": 0.272, "grad_norm": 0.0035389093682169914, "learning_rate": 4.49468085106383e-05, "loss": 0.0004, "step": 170 }, { "epoch": 0.288, "grad_norm": 0.003010712331160903, "learning_rate": 4.7606382978723405e-05, "loss": 0.0003, "step": 180 }, { "epoch": 0.304, "grad_norm": 0.002946693217381835, "learning_rate": 4.999995665096164e-05, "loss": 0.0003, "step": 190 }, { "epoch": 0.32, "grad_norm": 0.002301169792190194, "learning_rate": 4.9994754948256304e-05, "loss": 0.0002, "step": 200 }, { "epoch": 0.336, "grad_norm": 0.002140331780537963, "learning_rate": 4.998088550481357e-05, "loss": 0.0002, "step": 210 }, { "epoch": 0.352, "grad_norm": 0.0022257098462432623, "learning_rate": 4.9958353130312106e-05, "loss": 0.0002, "step": 220 }, { "epoch": 0.368, "grad_norm": 0.001662266324274242, "learning_rate": 4.99271656385825e-05, "loss": 0.0002, "step": 230 }, { "epoch": 0.384, "grad_norm": 0.002042123582214117, "learning_rate": 4.9887333844897506e-05, "loss": 0.0002, "step": 240 }, { "epoch": 0.4, "grad_norm": 0.0013473546132445335, "learning_rate": 4.983887156222155e-05, "loss": 0.0002, "step": 250 }, { "epoch": 0.416, "grad_norm": 0.0018928167410194874, "learning_rate": 4.978179559642061e-05, "loss": 0.0002, "step": 260 }, { "epoch": 0.432, "grad_norm": 0.0016440442996099591, "learning_rate": 4.9716125740434235e-05, "loss": 0.0002, "step": 270 }, { "epoch": 0.448, "grad_norm": 0.001509015099145472, "learning_rate": 4.9641884767411714e-05, "loss": 0.0002, "step": 280 }, { "epoch": 0.464, "grad_norm": 0.002279238309711218, "learning_rate": 4.955909842281477e-05, "loss": 0.0001, "step": 290 }, { "epoch": 0.48, "grad_norm": 0.0011063262354582548, "learning_rate": 4.946779541548942e-05, "loss": 0.0001, "step": 300 }, { "epoch": 0.496, "grad_norm": 0.0013478458859026432, "learning_rate": 4.936800740771033e-05, "loss": 0.0001, "step": 310 }, { "epoch": 0.512, "grad_norm": 0.0010506451362743974, "learning_rate": 4.925976900420083e-05, "loss": 0.0001, "step": 320 }, { "epoch": 0.528, "grad_norm": 0.0013516925973817706, "learning_rate": 4.9143117740132667e-05, "loss": 0.0001, "step": 330 }, { "epoch": 0.544, "grad_norm": 0.001162629690952599, "learning_rate": 4.901809406810942e-05, "loss": 0.0001, "step": 340 }, { "epoch": 0.56, "grad_norm": 0.0008260276517830789, "learning_rate": 4.8884741344138294e-05, "loss": 0.0001, "step": 350 }, { "epoch": 0.576, "grad_norm": 0.0009220085921697319, "learning_rate": 4.8743105812594944e-05, "loss": 0.0001, "step": 360 }, { "epoch": 0.592, "grad_norm": 0.0009574664290994406, "learning_rate": 4.8593236590186855e-05, "loss": 0.0001, "step": 370 }, { "epoch": 0.608, "grad_norm": 0.0009008324705064297, "learning_rate": 4.8435185648920403e-05, "loss": 0.0001, "step": 380 }, { "epoch": 0.624, "grad_norm": 0.0007590301102027297, "learning_rate": 4.8269007798077994e-05, "loss": 0.0001, "step": 390 }, { "epoch": 0.64, "grad_norm": 0.0007175425416789949, "learning_rate": 4.809476066521111e-05, "loss": 0.0001, "step": 400 }, { "epoch": 0.656, "grad_norm": 0.0008309377590194345, "learning_rate": 4.791250467615608e-05, "loss": 0.0001, "step": 410 }, { "epoch": 0.672, "grad_norm": 0.0006082377512939274, "learning_rate": 4.77223030340795e-05, "loss": 0.0001, "step": 420 }, { "epoch": 0.688, "grad_norm": 0.0007051264983601868, "learning_rate": 4.752422169756048e-05, "loss": 0.0001, "step": 430 }, { "epoch": 0.704, "grad_norm": 0.000669407716486603, "learning_rate": 4.7318329357717345e-05, "loss": 0.0001, "step": 440 }, { "epoch": 0.72, "grad_norm": 0.0006657831836491823, "learning_rate": 4.710469741438679e-05, "loss": 0.0001, "step": 450 }, { "epoch": 0.736, "grad_norm": 0.0005902393022552133, "learning_rate": 4.688339995136368e-05, "loss": 0.0001, "step": 460 }, { "epoch": 0.752, "grad_norm": 0.0006702489918097854, "learning_rate": 4.6654513710710056e-05, "loss": 0.0001, "step": 470 }, { "epoch": 0.768, "grad_norm": 0.0005900064134038985, "learning_rate": 4.6418118066142395e-05, "loss": 0.0001, "step": 480 }, { "epoch": 0.784, "grad_norm": 0.0005075543303973973, "learning_rate": 4.6174294995506154e-05, "loss": 0.0001, "step": 490 }, { "epoch": 0.8, "grad_norm": 0.0005521568236872554, "learning_rate": 4.5923129052347334e-05, "loss": 0.0001, "step": 500 } ], "logging_steps": 10, "max_steps": 1875, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.9952307416334336e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }