{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.8903591682419658, "eval_steps": 100, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01890359168241966, "grad_norm": 1.484338641166687, "learning_rate": 1.69811320754717e-05, "loss": 0.7493, "step": 10 }, { "epoch": 0.03780718336483932, "grad_norm": 0.4748876690864563, "learning_rate": 3.5849056603773584e-05, "loss": 0.4004, "step": 20 }, { "epoch": 0.05671077504725898, "grad_norm": 0.25469017028808594, "learning_rate": 5.4716981132075475e-05, "loss": 0.1629, "step": 30 }, { "epoch": 0.07561436672967864, "grad_norm": 0.20669250190258026, "learning_rate": 7.358490566037736e-05, "loss": 0.0913, "step": 40 }, { "epoch": 0.0945179584120983, "grad_norm": 0.15886972844600677, "learning_rate": 9.245283018867925e-05, "loss": 0.0597, "step": 50 }, { "epoch": 0.11342155009451796, "grad_norm": 0.1460949182510376, "learning_rate": 0.00011132075471698113, "loss": 0.0432, "step": 60 }, { "epoch": 0.1323251417769376, "grad_norm": 0.1386682391166687, "learning_rate": 0.000130188679245283, "loss": 0.0363, "step": 70 }, { "epoch": 0.15122873345935728, "grad_norm": 0.11621111631393433, "learning_rate": 0.0001490566037735849, "loss": 0.0561, "step": 80 }, { "epoch": 0.17013232514177692, "grad_norm": 0.0876479223370552, "learning_rate": 0.00016792452830188682, "loss": 0.0386, "step": 90 }, { "epoch": 0.1890359168241966, "grad_norm": 0.14801622927188873, "learning_rate": 0.00018679245283018868, "loss": 0.0319, "step": 100 }, { "epoch": 0.1890359168241966, "eval_loss": 0.03663806989789009, "eval_runtime": 3533.1544, "eval_samples_per_second": 0.845, "eval_steps_per_second": 0.211, "step": 100 }, { "epoch": 0.20793950850661624, "grad_norm": 0.1149626076221466, "learning_rate": 0.00019999509956294324, "loss": 0.0354, "step": 110 }, { "epoch": 0.22684310018903592, "grad_norm": 0.09332916885614395, "learning_rate": 0.0001999079940423227, "loss": 0.0536, "step": 120 }, { "epoch": 0.24574669187145556, "grad_norm": 0.12284527719020844, "learning_rate": 0.0001997120990981624, "loss": 0.0494, "step": 130 }, { "epoch": 0.2646502835538752, "grad_norm": 0.13534563779830933, "learning_rate": 0.00019940762803970005, "loss": 0.0392, "step": 140 }, { "epoch": 0.2835538752362949, "grad_norm": 0.08950494974851608, "learning_rate": 0.00019899491240427916, "loss": 0.0449, "step": 150 }, { "epoch": 0.30245746691871456, "grad_norm": 0.07946058362722397, "learning_rate": 0.0001984744015963392, "loss": 0.0305, "step": 160 }, { "epoch": 0.32136105860113423, "grad_norm": 0.09646843373775482, "learning_rate": 0.0001978466623980609, "loss": 0.0324, "step": 170 }, { "epoch": 0.34026465028355385, "grad_norm": 0.09262252599000931, "learning_rate": 0.00019711237835219952, "loss": 0.0318, "step": 180 }, { "epoch": 0.3591682419659735, "grad_norm": 0.09316161274909973, "learning_rate": 0.00019627234901777767, "loss": 0.0326, "step": 190 }, { "epoch": 0.3780718336483932, "grad_norm": 0.09103809297084808, "learning_rate": 0.0001953274890994485, "loss": 0.0298, "step": 200 }, { "epoch": 0.3780718336483932, "eval_loss": 0.03245590627193451, "eval_runtime": 3537.1525, "eval_samples_per_second": 0.844, "eval_steps_per_second": 0.211, "step": 200 }, { "epoch": 0.39697542533081287, "grad_norm": 0.08285395801067352, "learning_rate": 0.0001942788274514777, "loss": 0.0315, "step": 210 }, { "epoch": 0.4158790170132325, "grad_norm": 0.09007158130407333, "learning_rate": 0.00019312750595742791, "loss": 0.027, "step": 220 }, { "epoch": 0.43478260869565216, "grad_norm": 0.0819140374660492, "learning_rate": 0.00019187477828676617, "loss": 0.0433, "step": 230 }, { "epoch": 0.45368620037807184, "grad_norm": 0.09432337433099747, "learning_rate": 0.00019052200852974819, "loss": 0.0362, "step": 240 }, { "epoch": 0.4725897920604915, "grad_norm": 0.08810629695653915, "learning_rate": 0.00018907066971206592, "loss": 0.0378, "step": 250 }, { "epoch": 0.4914933837429111, "grad_norm": 0.07055918127298355, "learning_rate": 0.00018752234219087538, "loss": 0.033, "step": 260 }, { "epoch": 0.5103969754253308, "grad_norm": 0.08121134340763092, "learning_rate": 0.00018587871193395222, "loss": 0.0242, "step": 270 }, { "epoch": 0.5293005671077504, "grad_norm": 0.0509517528116703, "learning_rate": 0.0001841415686838478, "loss": 0.0296, "step": 280 }, { "epoch": 0.5482041587901701, "grad_norm": 0.1037987470626831, "learning_rate": 0.00018231280400904577, "loss": 0.0398, "step": 290 }, { "epoch": 0.5671077504725898, "grad_norm": 0.06649813801050186, "learning_rate": 0.0001803944092442402, "loss": 0.0247, "step": 300 }, { "epoch": 0.5671077504725898, "eval_loss": 0.031097637489438057, "eval_runtime": 3528.8232, "eval_samples_per_second": 0.846, "eval_steps_per_second": 0.212, "step": 300 }, { "epoch": 0.5860113421550095, "grad_norm": 0.06433621048927307, "learning_rate": 0.00017838847332197938, "loss": 0.0325, "step": 310 }, { "epoch": 0.6049149338374291, "grad_norm": 0.07686641812324524, "learning_rate": 0.00017629718049803512, "loss": 0.0218, "step": 320 }, { "epoch": 0.6238185255198487, "grad_norm": 0.0717320442199707, "learning_rate": 0.00017412280797297536, "loss": 0.0257, "step": 330 }, { "epoch": 0.6427221172022685, "grad_norm": 0.09712419658899307, "learning_rate": 0.0001718677234125292, "loss": 0.032, "step": 340 }, { "epoch": 0.6616257088846881, "grad_norm": 0.07800764590501785, "learning_rate": 0.00016953438236944515, "loss": 0.0274, "step": 350 }, { "epoch": 0.6805293005671077, "grad_norm": 0.13370630145072937, "learning_rate": 0.00016712532560964917, "loss": 0.0462, "step": 360 }, { "epoch": 0.6994328922495274, "grad_norm": 0.06654191017150879, "learning_rate": 0.0001646431763456148, "loss": 0.0316, "step": 370 }, { "epoch": 0.718336483931947, "grad_norm": 0.06407943367958069, "learning_rate": 0.00016209063737995715, "loss": 0.0312, "step": 380 }, { "epoch": 0.7372400756143668, "grad_norm": 0.08451473712921143, "learning_rate": 0.00015947048816236192, "loss": 0.0371, "step": 390 }, { "epoch": 0.7561436672967864, "grad_norm": 0.06291916221380234, "learning_rate": 0.0001567855817630534, "loss": 0.0379, "step": 400 }, { "epoch": 0.7561436672967864, "eval_loss": 0.029843054711818695, "eval_runtime": 3529.8971, "eval_samples_per_second": 0.846, "eval_steps_per_second": 0.212, "step": 400 }, { "epoch": 0.775047258979206, "grad_norm": 0.0544995553791523, "learning_rate": 0.00015403884176609748, "loss": 0.0304, "step": 410 }, { "epoch": 0.7939508506616257, "grad_norm": 0.05621599778532982, "learning_rate": 0.00015123325908592244, "loss": 0.0395, "step": 420 }, { "epoch": 0.8128544423440454, "grad_norm": 0.057970017194747925, "learning_rate": 0.000148371888710524, "loss": 0.027, "step": 430 }, { "epoch": 0.831758034026465, "grad_norm": 0.09150703996419907, "learning_rate": 0.00014545784637490065, "loss": 0.0305, "step": 440 }, { "epoch": 0.8506616257088847, "grad_norm": 0.05133228376507759, "learning_rate": 0.0001424943051683422, "loss": 0.0286, "step": 450 }, { "epoch": 0.8695652173913043, "grad_norm": 0.0378057062625885, "learning_rate": 0.00013948449207926527, "loss": 0.027, "step": 460 }, { "epoch": 0.888468809073724, "grad_norm": 0.07247433066368103, "learning_rate": 0.0001364316844813581, "loss": 0.0416, "step": 470 }, { "epoch": 0.9073724007561437, "grad_norm": 0.0373314768075943, "learning_rate": 0.00013333920656486133, "loss": 0.0304, "step": 480 }, { "epoch": 0.9262759924385633, "grad_norm": 0.06260782480239868, "learning_rate": 0.00013021042571687014, "loss": 0.0323, "step": 490 }, { "epoch": 0.945179584120983, "grad_norm": 0.043123096227645874, "learning_rate": 0.0001270487488545997, "loss": 0.0382, "step": 500 }, { "epoch": 0.945179584120983, "eval_loss": 0.029131991788744926, "eval_runtime": 3532.7479, "eval_samples_per_second": 0.845, "eval_steps_per_second": 0.211, "step": 500 }, { "epoch": 0.9640831758034026, "grad_norm": 0.04353553429245949, "learning_rate": 0.0001238576187156063, "loss": 0.0286, "step": 510 }, { "epoch": 0.9829867674858223, "grad_norm": 0.0442516915500164, "learning_rate": 0.00012064051010900397, "loss": 0.0344, "step": 520 }, { "epoch": 1.001890359168242, "grad_norm": 0.031339362263679504, "learning_rate": 0.00011740092613175842, "loss": 0.0263, "step": 530 }, { "epoch": 1.0207939508506616, "grad_norm": 0.04457208141684532, "learning_rate": 0.00011414239435417837, "loss": 0.0227, "step": 540 }, { "epoch": 1.0396975425330812, "grad_norm": 0.05170178413391113, "learning_rate": 0.00011086846297875823, "loss": 0.0259, "step": 550 }, { "epoch": 1.0586011342155008, "grad_norm": 0.04105263203382492, "learning_rate": 0.00010758269697655416, "loss": 0.0285, "step": 560 }, { "epoch": 1.0775047258979207, "grad_norm": 0.06378618627786636, "learning_rate": 0.0001042886742053011, "loss": 0.0241, "step": 570 }, { "epoch": 1.0964083175803403, "grad_norm": 0.049605123698711395, "learning_rate": 0.00010098998151349745, "loss": 0.0211, "step": 580 }, { "epoch": 1.11531190926276, "grad_norm": 0.0684175044298172, "learning_rate": 9.76902108346999e-05, "loss": 0.0216, "step": 590 }, { "epoch": 1.1342155009451795, "grad_norm": 0.05630769208073616, "learning_rate": 9.439295527628081e-05, "loss": 0.0232, "step": 600 }, { "epoch": 1.1342155009451795, "eval_loss": 0.02984483726322651, "eval_runtime": 3536.1992, "eval_samples_per_second": 0.844, "eval_steps_per_second": 0.211, "step": 600 }, { "epoch": 1.1531190926275992, "grad_norm": 0.058041468262672424, "learning_rate": 9.110180520690798e-05, "loss": 0.0236, "step": 610 }, { "epoch": 1.172022684310019, "grad_norm": 0.046324472874403, "learning_rate": 8.782034434700594e-05, "loss": 0.0214, "step": 620 }, { "epoch": 1.1909262759924386, "grad_norm": 0.05600952357053757, "learning_rate": 8.455214586645703e-05, "loss": 0.0201, "step": 630 }, { "epoch": 1.2098298676748582, "grad_norm": 0.043209731578826904, "learning_rate": 8.13007684937907e-05, "loss": 0.0219, "step": 640 }, { "epoch": 1.2287334593572778, "grad_norm": 0.06478435546159744, "learning_rate": 7.806975264109779e-05, "loss": 0.0287, "step": 650 }, { "epoch": 1.2476370510396975, "grad_norm": 0.04555751755833626, "learning_rate": 7.486261654888973e-05, "loss": 0.0238, "step": 660 }, { "epoch": 1.2665406427221173, "grad_norm": 0.0735645741224289, "learning_rate": 7.168285245510044e-05, "loss": 0.0217, "step": 670 }, { "epoch": 1.285444234404537, "grad_norm": 0.05558573454618454, "learning_rate": 6.853392279240175e-05, "loss": 0.0201, "step": 680 }, { "epoch": 1.3043478260869565, "grad_norm": 0.08889973908662796, "learning_rate": 6.541925641797423e-05, "loss": 0.0296, "step": 690 }, { "epoch": 1.3232514177693762, "grad_norm": 0.05940109118819237, "learning_rate": 6.234224487983816e-05, "loss": 0.0189, "step": 700 }, { "epoch": 1.3232514177693762, "eval_loss": 0.029020262882113457, "eval_runtime": 3528.406, "eval_samples_per_second": 0.846, "eval_steps_per_second": 0.212, "step": 700 }, { "epoch": 1.3421550094517958, "grad_norm": 0.09710100293159485, "learning_rate": 5.9306238723809815e-05, "loss": 0.0251, "step": 710 }, { "epoch": 1.3610586011342156, "grad_norm": 0.04113279655575752, "learning_rate": 5.6314543845105475e-05, "loss": 0.0233, "step": 720 }, { "epoch": 1.3799621928166352, "grad_norm": 0.06698830425739288, "learning_rate": 5.337041788856518e-05, "loss": 0.0225, "step": 730 }, { "epoch": 1.3988657844990549, "grad_norm": 0.12462351471185684, "learning_rate": 5.0477066701415765e-05, "loss": 0.0236, "step": 740 }, { "epoch": 1.4177693761814745, "grad_norm": 0.057194944471120834, "learning_rate": 4.7637640842436407e-05, "loss": 0.0237, "step": 750 }, { "epoch": 1.436672967863894, "grad_norm": 0.0688399150967598, "learning_rate": 4.485523215132774e-05, "loss": 0.0227, "step": 760 }, { "epoch": 1.455576559546314, "grad_norm": 0.0633525550365448, "learning_rate": 4.213287038201943e-05, "loss": 0.0211, "step": 770 }, { "epoch": 1.4744801512287333, "grad_norm": 0.09471452981233597, "learning_rate": 3.947351990358309e-05, "loss": 0.0233, "step": 780 }, { "epoch": 1.4933837429111532, "grad_norm": 0.05234042555093765, "learning_rate": 3.6880076472342516e-05, "loss": 0.0198, "step": 790 }, { "epoch": 1.5122873345935728, "grad_norm": 0.038761626929044724, "learning_rate": 3.435536407869575e-05, "loss": 0.0214, "step": 800 }, { "epoch": 1.5122873345935728, "eval_loss": 0.02837200090289116, "eval_runtime": 3534.8861, "eval_samples_per_second": 0.845, "eval_steps_per_second": 0.211, "step": 800 }, { "epoch": 1.5311909262759924, "grad_norm": 0.04780678078532219, "learning_rate": 3.190213187208313e-05, "loss": 0.0218, "step": 810 }, { "epoch": 1.5500945179584122, "grad_norm": 0.039974238723516464, "learning_rate": 2.9523051167449277e-05, "loss": 0.0201, "step": 820 }, { "epoch": 1.5689981096408316, "grad_norm": 0.04368181154131889, "learning_rate": 2.7220712536458547e-05, "loss": 0.0192, "step": 830 }, { "epoch": 1.5879017013232515, "grad_norm": 0.08702870458364487, "learning_rate": 2.4997622986631776e-05, "loss": 0.025, "step": 840 }, { "epoch": 1.606805293005671, "grad_norm": 0.050207559019327164, "learning_rate": 2.285620323147569e-05, "loss": 0.0176, "step": 850 }, { "epoch": 1.6257088846880907, "grad_norm": 0.1366623491048813, "learning_rate": 2.0798785054577284e-05, "loss": 0.0193, "step": 860 }, { "epoch": 1.6446124763705106, "grad_norm": 0.04110514745116234, "learning_rate": 1.882760877053388e-05, "loss": 0.0179, "step": 870 }, { "epoch": 1.66351606805293, "grad_norm": 0.1032591164112091, "learning_rate": 1.6944820785483405e-05, "loss": 0.0225, "step": 880 }, { "epoch": 1.6824196597353498, "grad_norm": 0.07322273403406143, "learning_rate": 1.5152471259891043e-05, "loss": 0.0208, "step": 890 }, { "epoch": 1.7013232514177694, "grad_norm": 0.05240940675139427, "learning_rate": 1.345251187613752e-05, "loss": 0.0213, "step": 900 }, { "epoch": 1.7013232514177694, "eval_loss": 0.0286864060908556, "eval_runtime": 3527.0806, "eval_samples_per_second": 0.847, "eval_steps_per_second": 0.212, "step": 900 }, { "epoch": 1.720226843100189, "grad_norm": 0.05167185142636299, "learning_rate": 1.1846793713339966e-05, "loss": 0.0216, "step": 910 }, { "epoch": 1.7391304347826086, "grad_norm": 0.09499327838420868, "learning_rate": 1.0337065231718889e-05, "loss": 0.0265, "step": 920 }, { "epoch": 1.7580340264650283, "grad_norm": 0.06521856039762497, "learning_rate": 8.92497036870682e-06, "loss": 0.0257, "step": 930 }, { "epoch": 1.776937618147448, "grad_norm": 0.062058351933956146, "learning_rate": 7.612046748871327e-06, "loss": 0.0221, "step": 940 }, { "epoch": 1.7958412098298677, "grad_norm": 0.057433273643255234, "learning_rate": 6.399724009601493e-06, "loss": 0.0206, "step": 950 }, { "epoch": 1.8147448015122873, "grad_norm": 0.045936308801174164, "learning_rate": 5.2893222443814335e-06, "loss": 0.0293, "step": 960 }, { "epoch": 1.833648393194707, "grad_norm": 0.053229495882987976, "learning_rate": 4.282050565345752e-06, "loss": 0.0273, "step": 970 }, { "epoch": 1.8525519848771266, "grad_norm": 0.04462295025587082, "learning_rate": 3.3790057866819324e-06, "loss": 0.0168, "step": 980 }, { "epoch": 1.8714555765595464, "grad_norm": 0.12018444389104843, "learning_rate": 2.581171230313717e-06, "loss": 0.0184, "step": 990 }, { "epoch": 1.8903591682419658, "grad_norm": 0.13049866259098053, "learning_rate": 1.8894156551657828e-06, "loss": 0.0289, "step": 1000 }, { "epoch": 1.8903591682419658, "eval_loss": 0.02830129861831665, "eval_runtime": 3535.4807, "eval_samples_per_second": 0.845, "eval_steps_per_second": 0.211, "step": 1000 } ], "logging_steps": 10, "max_steps": 1058, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.782036432921559e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }