{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 6723, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004462293618920125, "grad_norm": 3.7051006087750524, "learning_rate": 1.337295690936107e-07, "loss": 0.5444, "step": 10 }, { "epoch": 0.00892458723784025, "grad_norm": 3.048784847442942, "learning_rate": 2.823179791976226e-07, "loss": 0.5572, "step": 20 }, { "epoch": 0.013386880856760375, "grad_norm": 2.048957705438653, "learning_rate": 4.309063893016345e-07, "loss": 0.5423, "step": 30 }, { "epoch": 0.0178491744756805, "grad_norm": 1.461567031084094, "learning_rate": 5.794947994056464e-07, "loss": 0.5134, "step": 40 }, { "epoch": 0.022311468094600623, "grad_norm": 1.0103720435475612, "learning_rate": 7.280832095096584e-07, "loss": 0.4907, "step": 50 }, { "epoch": 0.02677376171352075, "grad_norm": 0.6889844403965782, "learning_rate": 8.766716196136702e-07, "loss": 0.4818, "step": 60 }, { "epoch": 0.031236055332440876, "grad_norm": 0.5268842063919165, "learning_rate": 1.025260029717682e-06, "loss": 0.4483, "step": 70 }, { "epoch": 0.035698348951361, "grad_norm": 0.46396807974468, "learning_rate": 1.173848439821694e-06, "loss": 0.4457, "step": 80 }, { "epoch": 0.040160642570281124, "grad_norm": 0.45432241367642123, "learning_rate": 1.322436849925706e-06, "loss": 0.4398, "step": 90 }, { "epoch": 0.04462293618920125, "grad_norm": 0.3566878144571345, "learning_rate": 1.4710252600297179e-06, "loss": 0.4295, "step": 100 }, { "epoch": 0.049085229808121376, "grad_norm": 0.3076647669838012, "learning_rate": 1.6196136701337297e-06, "loss": 0.4126, "step": 110 }, { "epoch": 0.0535475234270415, "grad_norm": 0.30260053228577904, "learning_rate": 1.7682020802377416e-06, "loss": 0.4021, "step": 120 }, { "epoch": 0.05800981704596162, "grad_norm": 0.3122317267903143, "learning_rate": 1.9167904903417537e-06, "loss": 0.3946, "step": 130 }, { "epoch": 0.06247211066488175, "grad_norm": 0.3476751821982303, "learning_rate": 2.0653789004457653e-06, "loss": 0.3965, "step": 140 }, { "epoch": 0.06693440428380187, "grad_norm": 0.33812027755011803, "learning_rate": 2.213967310549777e-06, "loss": 0.4031, "step": 150 }, { "epoch": 0.071396697902722, "grad_norm": 0.3120124573653305, "learning_rate": 2.362555720653789e-06, "loss": 0.3984, "step": 160 }, { "epoch": 0.07585899152164212, "grad_norm": 0.32219654623219857, "learning_rate": 2.511144130757801e-06, "loss": 0.382, "step": 170 }, { "epoch": 0.08032128514056225, "grad_norm": 0.37321844110875607, "learning_rate": 2.659732540861813e-06, "loss": 0.397, "step": 180 }, { "epoch": 0.08478357875948238, "grad_norm": 0.34121006704676865, "learning_rate": 2.808320950965825e-06, "loss": 0.3877, "step": 190 }, { "epoch": 0.0892458723784025, "grad_norm": 0.34259431719098377, "learning_rate": 2.956909361069837e-06, "loss": 0.3778, "step": 200 }, { "epoch": 0.09370816599732262, "grad_norm": 0.4634153760914143, "learning_rate": 3.1054977711738487e-06, "loss": 0.3838, "step": 210 }, { "epoch": 0.09817045961624275, "grad_norm": 0.34508537944318013, "learning_rate": 3.2540861812778607e-06, "loss": 0.3823, "step": 220 }, { "epoch": 0.10263275323516287, "grad_norm": 0.38159809656131194, "learning_rate": 3.4026745913818724e-06, "loss": 0.3641, "step": 230 }, { "epoch": 0.107095046854083, "grad_norm": 0.3204989890164747, "learning_rate": 3.5512630014858845e-06, "loss": 0.3819, "step": 240 }, { "epoch": 0.11155734047300313, "grad_norm": 0.3576572957193155, "learning_rate": 3.6998514115898966e-06, "loss": 0.3727, "step": 250 }, { "epoch": 0.11601963409192324, "grad_norm": 0.35700198129793526, "learning_rate": 3.848439821693909e-06, "loss": 0.3563, "step": 260 }, { "epoch": 0.12048192771084337, "grad_norm": 0.3192301931360082, "learning_rate": 3.99702823179792e-06, "loss": 0.3738, "step": 270 }, { "epoch": 0.1249442213297635, "grad_norm": 0.3196766826358136, "learning_rate": 4.145616641901932e-06, "loss": 0.3587, "step": 280 }, { "epoch": 0.12940651494868363, "grad_norm": 0.610281080707191, "learning_rate": 4.294205052005943e-06, "loss": 0.3664, "step": 290 }, { "epoch": 0.13386880856760375, "grad_norm": 0.35583167134748195, "learning_rate": 4.442793462109955e-06, "loss": 0.3631, "step": 300 }, { "epoch": 0.13833110218652386, "grad_norm": 0.3595334153498753, "learning_rate": 4.591381872213967e-06, "loss": 0.3764, "step": 310 }, { "epoch": 0.142793395805444, "grad_norm": 0.3524084490333156, "learning_rate": 4.7399702823179795e-06, "loss": 0.3644, "step": 320 }, { "epoch": 0.14725568942436412, "grad_norm": 0.3288515185706153, "learning_rate": 4.8885586924219916e-06, "loss": 0.3661, "step": 330 }, { "epoch": 0.15171798304328424, "grad_norm": 0.3351573861983467, "learning_rate": 5.037147102526004e-06, "loss": 0.3532, "step": 340 }, { "epoch": 0.15618027666220438, "grad_norm": 0.33700658750684853, "learning_rate": 5.185735512630016e-06, "loss": 0.3531, "step": 350 }, { "epoch": 0.1606425702811245, "grad_norm": 0.32516141156935247, "learning_rate": 5.334323922734027e-06, "loss": 0.3473, "step": 360 }, { "epoch": 0.1651048639000446, "grad_norm": 0.3337986135592338, "learning_rate": 5.48291233283804e-06, "loss": 0.3511, "step": 370 }, { "epoch": 0.16956715751896476, "grad_norm": 0.3323774922346478, "learning_rate": 5.631500742942051e-06, "loss": 0.3586, "step": 380 }, { "epoch": 0.17402945113788487, "grad_norm": 0.4792457547540971, "learning_rate": 5.780089153046062e-06, "loss": 0.3624, "step": 390 }, { "epoch": 0.178491744756805, "grad_norm": 0.42063892972841144, "learning_rate": 5.9286775631500745e-06, "loss": 0.3608, "step": 400 }, { "epoch": 0.18295403837572513, "grad_norm": 1.0840249935976123, "learning_rate": 6.0772659732540865e-06, "loss": 0.3602, "step": 410 }, { "epoch": 0.18741633199464525, "grad_norm": 0.3558589627850871, "learning_rate": 6.225854383358099e-06, "loss": 0.3514, "step": 420 }, { "epoch": 0.19187862561356536, "grad_norm": 0.40948391676162177, "learning_rate": 6.37444279346211e-06, "loss": 0.3568, "step": 430 }, { "epoch": 0.1963409192324855, "grad_norm": 0.33870866081421813, "learning_rate": 6.523031203566123e-06, "loss": 0.3475, "step": 440 }, { "epoch": 0.20080321285140562, "grad_norm": 0.3491791728647271, "learning_rate": 6.671619613670134e-06, "loss": 0.3455, "step": 450 }, { "epoch": 0.20526550647032574, "grad_norm": 0.35294486681555637, "learning_rate": 6.820208023774146e-06, "loss": 0.358, "step": 460 }, { "epoch": 0.20972780008924588, "grad_norm": 0.3561095555301197, "learning_rate": 6.968796433878157e-06, "loss": 0.3484, "step": 470 }, { "epoch": 0.214190093708166, "grad_norm": 0.37400205186538277, "learning_rate": 7.11738484398217e-06, "loss": 0.3512, "step": 480 }, { "epoch": 0.2186523873270861, "grad_norm": 0.368948524072989, "learning_rate": 7.2659732540861815e-06, "loss": 0.3564, "step": 490 }, { "epoch": 0.22311468094600626, "grad_norm": 0.31992893707470843, "learning_rate": 7.4145616641901944e-06, "loss": 0.3491, "step": 500 }, { "epoch": 0.22757697456492637, "grad_norm": 0.3196176367911485, "learning_rate": 7.563150074294206e-06, "loss": 0.3465, "step": 510 }, { "epoch": 0.2320392681838465, "grad_norm": 0.36129211410497347, "learning_rate": 7.711738484398219e-06, "loss": 0.3406, "step": 520 }, { "epoch": 0.23650156180276663, "grad_norm": 0.3545213437105017, "learning_rate": 7.86032689450223e-06, "loss": 0.3445, "step": 530 }, { "epoch": 0.24096385542168675, "grad_norm": 0.35389453665592974, "learning_rate": 8.008915304606241e-06, "loss": 0.3459, "step": 540 }, { "epoch": 0.24542614904060686, "grad_norm": 0.3543572102436423, "learning_rate": 8.157503714710252e-06, "loss": 0.3412, "step": 550 }, { "epoch": 0.249888442659527, "grad_norm": 0.36271347062518616, "learning_rate": 8.306092124814265e-06, "loss": 0.3387, "step": 560 }, { "epoch": 0.2543507362784471, "grad_norm": 0.4426754614509785, "learning_rate": 8.454680534918276e-06, "loss": 0.3491, "step": 570 }, { "epoch": 0.25881302989736726, "grad_norm": 0.39369474546630295, "learning_rate": 8.60326894502229e-06, "loss": 0.3384, "step": 580 }, { "epoch": 0.26327532351628735, "grad_norm": 0.3582656486286681, "learning_rate": 8.7518573551263e-06, "loss": 0.3544, "step": 590 }, { "epoch": 0.2677376171352075, "grad_norm": 0.34957569136942046, "learning_rate": 8.900445765230312e-06, "loss": 0.3429, "step": 600 }, { "epoch": 0.27219991075412764, "grad_norm": 0.3945306585524228, "learning_rate": 9.049034175334325e-06, "loss": 0.3478, "step": 610 }, { "epoch": 0.2766622043730477, "grad_norm": 0.3424763037614921, "learning_rate": 9.197622585438336e-06, "loss": 0.3312, "step": 620 }, { "epoch": 0.28112449799196787, "grad_norm": 0.3459851560845357, "learning_rate": 9.346210995542349e-06, "loss": 0.3495, "step": 630 }, { "epoch": 0.285586791610888, "grad_norm": 0.3744360789694989, "learning_rate": 9.49479940564636e-06, "loss": 0.3483, "step": 640 }, { "epoch": 0.2900490852298081, "grad_norm": 0.35814716631290894, "learning_rate": 9.643387815750373e-06, "loss": 0.341, "step": 650 }, { "epoch": 0.29451137884872824, "grad_norm": 0.5299368509807092, "learning_rate": 9.791976225854384e-06, "loss": 0.3365, "step": 660 }, { "epoch": 0.2989736724676484, "grad_norm": 0.38875418783791804, "learning_rate": 9.940564635958396e-06, "loss": 0.3424, "step": 670 }, { "epoch": 0.3034359660865685, "grad_norm": 0.45411665098963555, "learning_rate": 9.999975732158253e-06, "loss": 0.3368, "step": 680 }, { "epoch": 0.3078982597054886, "grad_norm": 0.3825559069839744, "learning_rate": 9.999827429534007e-06, "loss": 0.3389, "step": 690 }, { "epoch": 0.31236055332440876, "grad_norm": 0.36176618602239424, "learning_rate": 9.999544310413833e-06, "loss": 0.3395, "step": 700 }, { "epoch": 0.31682284694332885, "grad_norm": 0.40796102054124517, "learning_rate": 9.999126382431823e-06, "loss": 0.3435, "step": 710 }, { "epoch": 0.321285140562249, "grad_norm": 0.3829565760724817, "learning_rate": 9.998573656857085e-06, "loss": 0.3339, "step": 720 }, { "epoch": 0.32574743418116914, "grad_norm": 0.40664177051788203, "learning_rate": 9.997886148593436e-06, "loss": 0.3317, "step": 730 }, { "epoch": 0.3302097278000892, "grad_norm": 0.3640080288216767, "learning_rate": 9.997063876179007e-06, "loss": 0.3452, "step": 740 }, { "epoch": 0.33467202141900937, "grad_norm": 0.41068702259937845, "learning_rate": 9.996106861785741e-06, "loss": 0.3388, "step": 750 }, { "epoch": 0.3391343150379295, "grad_norm": 0.3410374798720669, "learning_rate": 9.995015131218794e-06, "loss": 0.349, "step": 760 }, { "epoch": 0.3435966086568496, "grad_norm": 0.37561698032089047, "learning_rate": 9.99378871391584e-06, "loss": 0.3375, "step": 770 }, { "epoch": 0.34805890227576974, "grad_norm": 0.3844118776195893, "learning_rate": 9.992427642946278e-06, "loss": 0.3339, "step": 780 }, { "epoch": 0.3525211958946899, "grad_norm": 0.35418994101086143, "learning_rate": 9.990931955010335e-06, "loss": 0.3373, "step": 790 }, { "epoch": 0.35698348951361, "grad_norm": 0.3109086430490103, "learning_rate": 9.989301690438087e-06, "loss": 0.3255, "step": 800 }, { "epoch": 0.3614457831325301, "grad_norm": 0.35125455281231016, "learning_rate": 9.987536893188363e-06, "loss": 0.3385, "step": 810 }, { "epoch": 0.36590807675145026, "grad_norm": 0.35539785853231953, "learning_rate": 9.98563761084756e-06, "loss": 0.3334, "step": 820 }, { "epoch": 0.37037037037037035, "grad_norm": 0.8732726071617729, "learning_rate": 9.983603894628366e-06, "loss": 0.3216, "step": 830 }, { "epoch": 0.3748326639892905, "grad_norm": 0.3239156286291644, "learning_rate": 9.98143579936837e-06, "loss": 0.3395, "step": 840 }, { "epoch": 0.37929495760821064, "grad_norm": 0.35416799423504325, "learning_rate": 9.979133383528591e-06, "loss": 0.3419, "step": 850 }, { "epoch": 0.3837572512271307, "grad_norm": 0.3697582867922802, "learning_rate": 9.976696709191899e-06, "loss": 0.3385, "step": 860 }, { "epoch": 0.38821954484605087, "grad_norm": 0.31652698452061007, "learning_rate": 9.974125842061343e-06, "loss": 0.3415, "step": 870 }, { "epoch": 0.392681838464971, "grad_norm": 0.3806737592473093, "learning_rate": 9.971420851458373e-06, "loss": 0.34, "step": 880 }, { "epoch": 0.3971441320838911, "grad_norm": 0.3493066568615837, "learning_rate": 9.968581810320979e-06, "loss": 0.3435, "step": 890 }, { "epoch": 0.40160642570281124, "grad_norm": 0.3441435892323655, "learning_rate": 9.965608795201717e-06, "loss": 0.327, "step": 900 }, { "epoch": 0.4060687193217314, "grad_norm": 0.3347599296485585, "learning_rate": 9.96250188626565e-06, "loss": 0.3356, "step": 910 }, { "epoch": 0.4105310129406515, "grad_norm": 0.30962222868754863, "learning_rate": 9.959261167288185e-06, "loss": 0.3279, "step": 920 }, { "epoch": 0.4149933065595716, "grad_norm": 0.48278711145118325, "learning_rate": 9.955886725652815e-06, "loss": 0.3358, "step": 930 }, { "epoch": 0.41945560017849176, "grad_norm": 0.33458745369628223, "learning_rate": 9.952378652348758e-06, "loss": 0.3359, "step": 940 }, { "epoch": 0.42391789379741185, "grad_norm": 0.36375248941574195, "learning_rate": 9.948737041968509e-06, "loss": 0.333, "step": 950 }, { "epoch": 0.428380187416332, "grad_norm": 0.38000071048511164, "learning_rate": 9.944961992705288e-06, "loss": 0.3393, "step": 960 }, { "epoch": 0.43284248103525214, "grad_norm": 0.3322040446425915, "learning_rate": 9.941053606350389e-06, "loss": 0.3289, "step": 970 }, { "epoch": 0.4373047746541722, "grad_norm": 0.31368274498497106, "learning_rate": 9.937011988290443e-06, "loss": 0.323, "step": 980 }, { "epoch": 0.44176706827309237, "grad_norm": 0.33379459029750647, "learning_rate": 9.932837247504566e-06, "loss": 0.3332, "step": 990 }, { "epoch": 0.4462293618920125, "grad_norm": 0.3481057681661188, "learning_rate": 9.928529496561428e-06, "loss": 0.3249, "step": 1000 }, { "epoch": 0.4506916555109326, "grad_norm": 0.3805120805185267, "learning_rate": 9.924088851616216e-06, "loss": 0.3332, "step": 1010 }, { "epoch": 0.45515394912985274, "grad_norm": 0.3514086594689373, "learning_rate": 9.919515432407502e-06, "loss": 0.3329, "step": 1020 }, { "epoch": 0.4596162427487729, "grad_norm": 0.41618025357077787, "learning_rate": 9.914809362254013e-06, "loss": 0.3387, "step": 1030 }, { "epoch": 0.464078536367693, "grad_norm": 0.3777422639135785, "learning_rate": 9.909970768051302e-06, "loss": 0.3359, "step": 1040 }, { "epoch": 0.4685408299866131, "grad_norm": 0.32565404467366765, "learning_rate": 9.904999780268341e-06, "loss": 0.3239, "step": 1050 }, { "epoch": 0.47300312360553326, "grad_norm": 0.33049786269193765, "learning_rate": 9.899896532943983e-06, "loss": 0.3347, "step": 1060 }, { "epoch": 0.47746541722445335, "grad_norm": 0.2971213231433049, "learning_rate": 9.894661163683361e-06, "loss": 0.3256, "step": 1070 }, { "epoch": 0.4819277108433735, "grad_norm": 0.3265643495101442, "learning_rate": 9.889293813654179e-06, "loss": 0.3325, "step": 1080 }, { "epoch": 0.48639000446229363, "grad_norm": 0.3291514794248415, "learning_rate": 9.883794627582893e-06, "loss": 0.327, "step": 1090 }, { "epoch": 0.4908522980812137, "grad_norm": 0.3243181563194639, "learning_rate": 9.878163753750823e-06, "loss": 0.3311, "step": 1100 }, { "epoch": 0.49531459170013387, "grad_norm": 0.31360700475820674, "learning_rate": 9.872401343990143e-06, "loss": 0.3349, "step": 1110 }, { "epoch": 0.499776885319054, "grad_norm": 0.3427769144056513, "learning_rate": 9.866507553679797e-06, "loss": 0.3302, "step": 1120 }, { "epoch": 0.5042391789379741, "grad_norm": 0.3268750022359849, "learning_rate": 9.860482541741298e-06, "loss": 0.3387, "step": 1130 }, { "epoch": 0.5087014725568942, "grad_norm": 0.36779760794001126, "learning_rate": 9.854326470634454e-06, "loss": 0.3204, "step": 1140 }, { "epoch": 0.5131637661758144, "grad_norm": 0.3437763178921102, "learning_rate": 9.848039506352982e-06, "loss": 0.314, "step": 1150 }, { "epoch": 0.5176260597947345, "grad_norm": 0.3146556406530087, "learning_rate": 9.841621818420032e-06, "loss": 0.3291, "step": 1160 }, { "epoch": 0.5220883534136547, "grad_norm": 0.3372062278564171, "learning_rate": 9.835073579883614e-06, "loss": 0.3184, "step": 1170 }, { "epoch": 0.5265506470325747, "grad_norm": 0.3279517665081858, "learning_rate": 9.82839496731194e-06, "loss": 0.3301, "step": 1180 }, { "epoch": 0.5310129406514948, "grad_norm": 0.35435668550704597, "learning_rate": 9.821586160788652e-06, "loss": 0.3192, "step": 1190 }, { "epoch": 0.535475234270415, "grad_norm": 0.39621154055386365, "learning_rate": 9.814647343907975e-06, "loss": 0.3237, "step": 1200 }, { "epoch": 0.5399375278893351, "grad_norm": 0.314845881198322, "learning_rate": 9.807578703769763e-06, "loss": 0.317, "step": 1210 }, { "epoch": 0.5443998215082553, "grad_norm": 0.30404528418981586, "learning_rate": 9.80038043097445e-06, "loss": 0.327, "step": 1220 }, { "epoch": 0.5488621151271754, "grad_norm": 0.34277234804316314, "learning_rate": 9.793052719617921e-06, "loss": 0.3305, "step": 1230 }, { "epoch": 0.5533244087460955, "grad_norm": 0.315701512000068, "learning_rate": 9.78559576728627e-06, "loss": 0.3159, "step": 1240 }, { "epoch": 0.5577867023650156, "grad_norm": 0.4064220753807774, "learning_rate": 9.77800977505047e-06, "loss": 0.3222, "step": 1250 }, { "epoch": 0.5622489959839357, "grad_norm": 0.38345533251016317, "learning_rate": 9.770294947460964e-06, "loss": 0.3155, "step": 1260 }, { "epoch": 0.5667112896028559, "grad_norm": 0.3666235687309694, "learning_rate": 9.76245149254213e-06, "loss": 0.3223, "step": 1270 }, { "epoch": 0.571173583221776, "grad_norm": 0.3254173561944929, "learning_rate": 9.754479621786694e-06, "loss": 0.3253, "step": 1280 }, { "epoch": 0.5756358768406962, "grad_norm": 0.36867900317972835, "learning_rate": 9.74637955015001e-06, "loss": 0.3234, "step": 1290 }, { "epoch": 0.5800981704596162, "grad_norm": 0.3205667116149489, "learning_rate": 9.738151496044268e-06, "loss": 0.3238, "step": 1300 }, { "epoch": 0.5845604640785363, "grad_norm": 0.35181980934522605, "learning_rate": 9.729795681332616e-06, "loss": 0.3212, "step": 1310 }, { "epoch": 0.5890227576974565, "grad_norm": 0.3157343074157218, "learning_rate": 9.721312331323159e-06, "loss": 0.324, "step": 1320 }, { "epoch": 0.5934850513163766, "grad_norm": 0.3267527618647797, "learning_rate": 9.712701674762894e-06, "loss": 0.3293, "step": 1330 }, { "epoch": 0.5979473449352968, "grad_norm": 0.3709092454966587, "learning_rate": 9.703963943831548e-06, "loss": 0.3195, "step": 1340 }, { "epoch": 0.6024096385542169, "grad_norm": 0.3404725913270717, "learning_rate": 9.695099374135304e-06, "loss": 0.3261, "step": 1350 }, { "epoch": 0.606871932173137, "grad_norm": 0.33688175782559104, "learning_rate": 9.686108204700456e-06, "loss": 0.322, "step": 1360 }, { "epoch": 0.6113342257920571, "grad_norm": 0.3264040693280103, "learning_rate": 9.676990677966965e-06, "loss": 0.3262, "step": 1370 }, { "epoch": 0.6157965194109772, "grad_norm": 0.2946497602423895, "learning_rate": 9.667747039781915e-06, "loss": 0.3204, "step": 1380 }, { "epoch": 0.6202588130298974, "grad_norm": 0.3247321923979366, "learning_rate": 9.65837753939289e-06, "loss": 0.3115, "step": 1390 }, { "epoch": 0.6247211066488175, "grad_norm": 0.3609340508110271, "learning_rate": 9.648882429441258e-06, "loss": 0.3299, "step": 1400 }, { "epoch": 0.6291834002677377, "grad_norm": 0.3140886682745277, "learning_rate": 9.639261965955339e-06, "loss": 0.3204, "step": 1410 }, { "epoch": 0.6336456938866577, "grad_norm": 0.30604068166540555, "learning_rate": 9.629516408343524e-06, "loss": 0.3296, "step": 1420 }, { "epoch": 0.6381079875055778, "grad_norm": 0.34254988617650395, "learning_rate": 9.619646019387269e-06, "loss": 0.3291, "step": 1430 }, { "epoch": 0.642570281124498, "grad_norm": 0.34269558101387265, "learning_rate": 9.609651065234008e-06, "loss": 0.3308, "step": 1440 }, { "epoch": 0.6470325747434181, "grad_norm": 0.3110380628077838, "learning_rate": 9.599531815389987e-06, "loss": 0.3231, "step": 1450 }, { "epoch": 0.6514948683623383, "grad_norm": 0.3543086604772253, "learning_rate": 9.589288542712978e-06, "loss": 0.315, "step": 1460 }, { "epoch": 0.6559571619812584, "grad_norm": 0.29688329030679655, "learning_rate": 9.578921523404943e-06, "loss": 0.3212, "step": 1470 }, { "epoch": 0.6604194556001785, "grad_norm": 0.6477411665067504, "learning_rate": 9.568431037004574e-06, "loss": 0.3184, "step": 1480 }, { "epoch": 0.6648817492190986, "grad_norm": 0.4105291056831329, "learning_rate": 9.557817366379756e-06, "loss": 0.3316, "step": 1490 }, { "epoch": 0.6693440428380187, "grad_norm": 0.3108965775539251, "learning_rate": 9.547080797719943e-06, "loss": 0.3292, "step": 1500 }, { "epoch": 0.6738063364569389, "grad_norm": 0.2925176192434652, "learning_rate": 9.536221620528442e-06, "loss": 0.3188, "step": 1510 }, { "epoch": 0.678268630075859, "grad_norm": 0.32680827418487335, "learning_rate": 9.5252401276146e-06, "loss": 0.3189, "step": 1520 }, { "epoch": 0.6827309236947792, "grad_norm": 0.31001306605003415, "learning_rate": 9.514136615085917e-06, "loss": 0.3246, "step": 1530 }, { "epoch": 0.6871932173136992, "grad_norm": 0.3232212027929326, "learning_rate": 9.502911382340056e-06, "loss": 0.3223, "step": 1540 }, { "epoch": 0.6916555109326193, "grad_norm": 0.280224760234745, "learning_rate": 9.491564732056775e-06, "loss": 0.3144, "step": 1550 }, { "epoch": 0.6961178045515395, "grad_norm": 0.3237056713223196, "learning_rate": 9.480096970189756e-06, "loss": 0.3182, "step": 1560 }, { "epoch": 0.7005800981704596, "grad_norm": 0.31882269101613014, "learning_rate": 9.46850840595837e-06, "loss": 0.3223, "step": 1570 }, { "epoch": 0.7050423917893798, "grad_norm": 0.32578619260595243, "learning_rate": 9.456799351839324e-06, "loss": 0.3176, "step": 1580 }, { "epoch": 0.7095046854082999, "grad_norm": 0.3385848752838862, "learning_rate": 9.44497012355824e-06, "loss": 0.3284, "step": 1590 }, { "epoch": 0.71396697902722, "grad_norm": 0.3130273460464875, "learning_rate": 9.433021040081159e-06, "loss": 0.3195, "step": 1600 }, { "epoch": 0.7184292726461401, "grad_norm": 0.3516738754933172, "learning_rate": 9.420952423605904e-06, "loss": 0.3267, "step": 1610 }, { "epoch": 0.7228915662650602, "grad_norm": 0.3251716278941906, "learning_rate": 9.408764599553429e-06, "loss": 0.318, "step": 1620 }, { "epoch": 0.7273538598839804, "grad_norm": 0.32226390759863277, "learning_rate": 9.396457896559021e-06, "loss": 0.3201, "step": 1630 }, { "epoch": 0.7318161535029005, "grad_norm": 0.3035427600136032, "learning_rate": 9.384032646463448e-06, "loss": 0.3176, "step": 1640 }, { "epoch": 0.7362784471218207, "grad_norm": 0.35729461477248853, "learning_rate": 9.37148918430401e-06, "loss": 0.3191, "step": 1650 }, { "epoch": 0.7407407407407407, "grad_norm": 0.3103181941339719, "learning_rate": 9.358827848305502e-06, "loss": 0.3218, "step": 1660 }, { "epoch": 0.7452030343596608, "grad_norm": 0.34994784215316105, "learning_rate": 9.346048979871098e-06, "loss": 0.3212, "step": 1670 }, { "epoch": 0.749665327978581, "grad_norm": 0.3270663588481017, "learning_rate": 9.333152923573146e-06, "loss": 0.3194, "step": 1680 }, { "epoch": 0.7541276215975011, "grad_norm": 0.3017394614877857, "learning_rate": 9.320140027143869e-06, "loss": 0.3244, "step": 1690 }, { "epoch": 0.7585899152164213, "grad_norm": 0.3371418446207733, "learning_rate": 9.307010641466e-06, "loss": 0.3223, "step": 1700 }, { "epoch": 0.7630522088353414, "grad_norm": 0.36102711717253444, "learning_rate": 9.293765120563309e-06, "loss": 0.3235, "step": 1710 }, { "epoch": 0.7675145024542614, "grad_norm": 0.3007012064707952, "learning_rate": 9.280403821591066e-06, "loss": 0.3208, "step": 1720 }, { "epoch": 0.7719767960731816, "grad_norm": 0.2993075622208931, "learning_rate": 9.26692710482641e-06, "loss": 0.3242, "step": 1730 }, { "epoch": 0.7764390896921017, "grad_norm": 0.31517853620369296, "learning_rate": 9.253335333658627e-06, "loss": 0.3209, "step": 1740 }, { "epoch": 0.7809013833110219, "grad_norm": 0.30592785176098447, "learning_rate": 9.239628874579359e-06, "loss": 0.3211, "step": 1750 }, { "epoch": 0.785363676929942, "grad_norm": 0.3511159488999007, "learning_rate": 9.22580809717272e-06, "loss": 0.3159, "step": 1760 }, { "epoch": 0.7898259705488622, "grad_norm": 0.3207835963226278, "learning_rate": 9.211873374105325e-06, "loss": 0.323, "step": 1770 }, { "epoch": 0.7942882641677822, "grad_norm": 0.3169720632937591, "learning_rate": 9.197825081116255e-06, "loss": 0.3165, "step": 1780 }, { "epoch": 0.7987505577867023, "grad_norm": 0.32507773728896977, "learning_rate": 9.183663597006904e-06, "loss": 0.3154, "step": 1790 }, { "epoch": 0.8032128514056225, "grad_norm": 0.29822880733968704, "learning_rate": 9.169389303630792e-06, "loss": 0.3202, "step": 1800 }, { "epoch": 0.8076751450245426, "grad_norm": 0.27944537447466095, "learning_rate": 9.155002585883238e-06, "loss": 0.3194, "step": 1810 }, { "epoch": 0.8121374386434628, "grad_norm": 0.3257811138318699, "learning_rate": 9.140503831691014e-06, "loss": 0.319, "step": 1820 }, { "epoch": 0.8165997322623829, "grad_norm": 0.38751538240318106, "learning_rate": 9.125893432001856e-06, "loss": 0.324, "step": 1830 }, { "epoch": 0.821062025881303, "grad_norm": 0.35676044959951964, "learning_rate": 9.111171780773938e-06, "loss": 0.3207, "step": 1840 }, { "epoch": 0.8255243195002231, "grad_norm": 0.33066533272987186, "learning_rate": 9.096339274965248e-06, "loss": 0.3225, "step": 1850 }, { "epoch": 0.8299866131191432, "grad_norm": 0.30077549406173915, "learning_rate": 9.081396314522883e-06, "loss": 0.3156, "step": 1860 }, { "epoch": 0.8344489067380634, "grad_norm": 0.4455200746218136, "learning_rate": 9.066343302372262e-06, "loss": 0.3208, "step": 1870 }, { "epoch": 0.8389112003569835, "grad_norm": 0.29588329863262136, "learning_rate": 9.051180644406265e-06, "loss": 0.3158, "step": 1880 }, { "epoch": 0.8433734939759037, "grad_norm": 0.34195626405047574, "learning_rate": 9.035908749474286e-06, "loss": 0.3048, "step": 1890 }, { "epoch": 0.8478357875948237, "grad_norm": 0.6420035384465244, "learning_rate": 9.020528029371209e-06, "loss": 0.3072, "step": 1900 }, { "epoch": 0.8522980812137438, "grad_norm": 0.3087614249615553, "learning_rate": 9.005038898826307e-06, "loss": 0.3178, "step": 1910 }, { "epoch": 0.856760374832664, "grad_norm": 0.31279375725653125, "learning_rate": 8.989441775492054e-06, "loss": 0.3187, "step": 1920 }, { "epoch": 0.8612226684515841, "grad_norm": 0.31745786814951604, "learning_rate": 8.97373707993287e-06, "loss": 0.3138, "step": 1930 }, { "epoch": 0.8656849620705043, "grad_norm": 0.32729085562805416, "learning_rate": 8.957925235613774e-06, "loss": 0.3186, "step": 1940 }, { "epoch": 0.8701472556894244, "grad_norm": 0.29401213857632397, "learning_rate": 8.942006668888972e-06, "loss": 0.3209, "step": 1950 }, { "epoch": 0.8746095493083444, "grad_norm": 0.35071321892553325, "learning_rate": 8.925981808990353e-06, "loss": 0.3067, "step": 1960 }, { "epoch": 0.8790718429272646, "grad_norm": 0.3254090010272678, "learning_rate": 8.909851088015929e-06, "loss": 0.319, "step": 1970 }, { "epoch": 0.8835341365461847, "grad_norm": 0.31143570006446, "learning_rate": 8.89361494091816e-06, "loss": 0.3173, "step": 1980 }, { "epoch": 0.8879964301651049, "grad_norm": 0.3368629213736036, "learning_rate": 8.877273805492251e-06, "loss": 0.3142, "step": 1990 }, { "epoch": 0.892458723784025, "grad_norm": 0.32123265502775333, "learning_rate": 8.860828122364333e-06, "loss": 0.3114, "step": 2000 }, { "epoch": 0.8969210174029452, "grad_norm": 0.32040081950305627, "learning_rate": 8.844278334979587e-06, "loss": 0.3173, "step": 2010 }, { "epoch": 0.9013833110218652, "grad_norm": 0.31575327422413973, "learning_rate": 8.82762488959028e-06, "loss": 0.3161, "step": 2020 }, { "epoch": 0.9058456046407853, "grad_norm": 0.2756449995981234, "learning_rate": 8.810868235243746e-06, "loss": 0.3127, "step": 2030 }, { "epoch": 0.9103078982597055, "grad_norm": 0.3264003117051716, "learning_rate": 8.79400882377026e-06, "loss": 0.3122, "step": 2040 }, { "epoch": 0.9147701918786256, "grad_norm": 0.32023283877896064, "learning_rate": 8.777047109770872e-06, "loss": 0.3161, "step": 2050 }, { "epoch": 0.9192324854975458, "grad_norm": 0.32023324438450435, "learning_rate": 8.759983550605132e-06, "loss": 0.3075, "step": 2060 }, { "epoch": 0.9236947791164659, "grad_norm": 0.3495626563053784, "learning_rate": 8.74281860637877e-06, "loss": 0.321, "step": 2070 }, { "epoch": 0.928157072735386, "grad_norm": 0.365734666051995, "learning_rate": 8.725552739931295e-06, "loss": 0.3172, "step": 2080 }, { "epoch": 0.9326193663543061, "grad_norm": 0.5195262244868563, "learning_rate": 8.70818641682349e-06, "loss": 0.3114, "step": 2090 }, { "epoch": 0.9370816599732262, "grad_norm": 0.31318241394583657, "learning_rate": 8.690720105324887e-06, "loss": 0.3145, "step": 2100 }, { "epoch": 0.9415439535921464, "grad_norm": 0.30511794190937597, "learning_rate": 8.673154276401123e-06, "loss": 0.3107, "step": 2110 }, { "epoch": 0.9460062472110665, "grad_norm": 0.33299848484970584, "learning_rate": 8.655489403701244e-06, "loss": 0.3199, "step": 2120 }, { "epoch": 0.9504685408299867, "grad_norm": 0.2918060481120866, "learning_rate": 8.63772596354494e-06, "loss": 0.3241, "step": 2130 }, { "epoch": 0.9549308344489067, "grad_norm": 0.3267067284562192, "learning_rate": 8.619864434909692e-06, "loss": 0.3144, "step": 2140 }, { "epoch": 0.9593931280678268, "grad_norm": 0.3620884501681288, "learning_rate": 8.601905299417865e-06, "loss": 0.3064, "step": 2150 }, { "epoch": 0.963855421686747, "grad_norm": 0.3385652298929547, "learning_rate": 8.583849041323717e-06, "loss": 0.3167, "step": 2160 }, { "epoch": 0.9683177153056671, "grad_norm": 0.31358881851824566, "learning_rate": 8.565696147500338e-06, "loss": 0.3173, "step": 2170 }, { "epoch": 0.9727800089245873, "grad_norm": 0.26819324932064814, "learning_rate": 8.54744710742653e-06, "loss": 0.3098, "step": 2180 }, { "epoch": 0.9772423025435074, "grad_norm": 0.3193733001824146, "learning_rate": 8.529102413173605e-06, "loss": 0.3019, "step": 2190 }, { "epoch": 0.9817045961624274, "grad_norm": 0.28919148613320567, "learning_rate": 8.510662559392113e-06, "loss": 0.313, "step": 2200 }, { "epoch": 0.9861668897813476, "grad_norm": 0.30310308101898775, "learning_rate": 8.492128043298511e-06, "loss": 0.3229, "step": 2210 }, { "epoch": 0.9906291834002677, "grad_norm": 0.31282347332567895, "learning_rate": 8.47349936466175e-06, "loss": 0.3159, "step": 2220 }, { "epoch": 0.9950914770191879, "grad_norm": 0.28804399192631563, "learning_rate": 8.454777025789805e-06, "loss": 0.313, "step": 2230 }, { "epoch": 0.999553770638108, "grad_norm": 0.2868177500862966, "learning_rate": 8.435961531516119e-06, "loss": 0.2984, "step": 2240 }, { "epoch": 1.0040160642570282, "grad_norm": 0.3322502574109939, "learning_rate": 8.417053389186009e-06, "loss": 0.2982, "step": 2250 }, { "epoch": 1.0084783578759482, "grad_norm": 0.31601076209209755, "learning_rate": 8.398053108642966e-06, "loss": 0.2938, "step": 2260 }, { "epoch": 1.0129406514948684, "grad_norm": 0.3085084676907704, "learning_rate": 8.378961202214927e-06, "loss": 0.3022, "step": 2270 }, { "epoch": 1.0174029451137885, "grad_norm": 0.3030562804942466, "learning_rate": 8.35977818470044e-06, "loss": 0.2854, "step": 2280 }, { "epoch": 1.0218652387327085, "grad_norm": 0.30673490305127743, "learning_rate": 8.3405045733548e-06, "loss": 0.2937, "step": 2290 }, { "epoch": 1.0263275323516288, "grad_norm": 0.3079905769883883, "learning_rate": 8.321140887876093e-06, "loss": 0.2937, "step": 2300 }, { "epoch": 1.0307898259705488, "grad_norm": 0.2833185510901884, "learning_rate": 8.301687650391184e-06, "loss": 0.2911, "step": 2310 }, { "epoch": 1.035252119589469, "grad_norm": 0.2928224557074968, "learning_rate": 8.28214538544164e-06, "loss": 0.2925, "step": 2320 }, { "epoch": 1.039714413208389, "grad_norm": 0.3119622799871382, "learning_rate": 8.262514619969583e-06, "loss": 0.2928, "step": 2330 }, { "epoch": 1.0441767068273093, "grad_norm": 0.2909193051250449, "learning_rate": 8.242795883303489e-06, "loss": 0.2913, "step": 2340 }, { "epoch": 1.0486390004462294, "grad_norm": 0.2878712035948744, "learning_rate": 8.222989707143903e-06, "loss": 0.2963, "step": 2350 }, { "epoch": 1.0531012940651494, "grad_norm": 0.31820465000033393, "learning_rate": 8.203096625549109e-06, "loss": 0.293, "step": 2360 }, { "epoch": 1.0575635876840697, "grad_norm": 0.3374605509762875, "learning_rate": 8.183117174920733e-06, "loss": 0.2946, "step": 2370 }, { "epoch": 1.0620258813029897, "grad_norm": 0.3020068367212028, "learning_rate": 8.163051893989273e-06, "loss": 0.2925, "step": 2380 }, { "epoch": 1.06648817492191, "grad_norm": 0.31068593851856324, "learning_rate": 8.142901323799578e-06, "loss": 0.2981, "step": 2390 }, { "epoch": 1.07095046854083, "grad_norm": 0.3066541366441537, "learning_rate": 8.122666007696251e-06, "loss": 0.2916, "step": 2400 }, { "epoch": 1.07541276215975, "grad_norm": 0.2898134349503606, "learning_rate": 8.102346491309007e-06, "loss": 0.2887, "step": 2410 }, { "epoch": 1.0798750557786703, "grad_norm": 0.3017984948209697, "learning_rate": 8.081943322537958e-06, "loss": 0.2975, "step": 2420 }, { "epoch": 1.0843373493975903, "grad_norm": 0.3106616772867366, "learning_rate": 8.061457051538832e-06, "loss": 0.2934, "step": 2430 }, { "epoch": 1.0887996430165106, "grad_norm": 0.3117195701092512, "learning_rate": 8.040888230708153e-06, "loss": 0.2887, "step": 2440 }, { "epoch": 1.0932619366354306, "grad_norm": 0.324577741353284, "learning_rate": 8.02023741466833e-06, "loss": 0.2886, "step": 2450 }, { "epoch": 1.0977242302543506, "grad_norm": 0.31435192682062346, "learning_rate": 7.999505160252712e-06, "loss": 0.2913, "step": 2460 }, { "epoch": 1.1021865238732709, "grad_norm": 0.30511195151083603, "learning_rate": 7.978692026490576e-06, "loss": 0.2963, "step": 2470 }, { "epoch": 1.106648817492191, "grad_norm": 0.30143500498716036, "learning_rate": 7.957798574592042e-06, "loss": 0.2904, "step": 2480 }, { "epoch": 1.1111111111111112, "grad_norm": 0.3029934237549778, "learning_rate": 7.936825367932947e-06, "loss": 0.2874, "step": 2490 }, { "epoch": 1.1155734047300312, "grad_norm": 0.310379596751207, "learning_rate": 7.91577297203966e-06, "loss": 0.2864, "step": 2500 }, { "epoch": 1.1200356983489514, "grad_norm": 0.2756683563500154, "learning_rate": 7.89464195457382e-06, "loss": 0.2838, "step": 2510 }, { "epoch": 1.1244979919678715, "grad_norm": 0.2886788009666996, "learning_rate": 7.873432885317036e-06, "loss": 0.2892, "step": 2520 }, { "epoch": 1.1289602855867917, "grad_norm": 0.6175230364141268, "learning_rate": 7.852146336155531e-06, "loss": 0.2961, "step": 2530 }, { "epoch": 1.1334225792057118, "grad_norm": 0.33735487775658257, "learning_rate": 7.830782881064707e-06, "loss": 0.2989, "step": 2540 }, { "epoch": 1.1378848728246318, "grad_norm": 0.2815200857691379, "learning_rate": 7.809343096093676e-06, "loss": 0.2918, "step": 2550 }, { "epoch": 1.142347166443552, "grad_norm": 0.3190460322083763, "learning_rate": 7.787827559349727e-06, "loss": 0.2992, "step": 2560 }, { "epoch": 1.146809460062472, "grad_norm": 0.316056300209666, "learning_rate": 7.766236850982739e-06, "loss": 0.2882, "step": 2570 }, { "epoch": 1.1512717536813923, "grad_norm": 0.30178501428282056, "learning_rate": 7.744571553169534e-06, "loss": 0.2925, "step": 2580 }, { "epoch": 1.1557340473003124, "grad_norm": 0.3001665670771815, "learning_rate": 7.722832250098183e-06, "loss": 0.2901, "step": 2590 }, { "epoch": 1.1601963409192324, "grad_norm": 0.29583953397424706, "learning_rate": 7.701019527952248e-06, "loss": 0.2971, "step": 2600 }, { "epoch": 1.1646586345381527, "grad_norm": 0.3241701984511909, "learning_rate": 7.679133974894984e-06, "loss": 0.2951, "step": 2610 }, { "epoch": 1.1691209281570727, "grad_norm": 0.31677851739186413, "learning_rate": 7.657176181053472e-06, "loss": 0.3, "step": 2620 }, { "epoch": 1.173583221775993, "grad_norm": 0.3740238542554803, "learning_rate": 7.635146738502714e-06, "loss": 0.2934, "step": 2630 }, { "epoch": 1.178045515394913, "grad_norm": 0.31202486519983047, "learning_rate": 7.6130462412496605e-06, "loss": 0.2999, "step": 2640 }, { "epoch": 1.182507809013833, "grad_norm": 0.3488189130672866, "learning_rate": 7.590875285217201e-06, "loss": 0.2913, "step": 2650 }, { "epoch": 1.1869701026327533, "grad_norm": 0.31070266182233724, "learning_rate": 7.568634468228085e-06, "loss": 0.2847, "step": 2660 }, { "epoch": 1.1914323962516733, "grad_norm": 0.29933114524992155, "learning_rate": 7.546324389988817e-06, "loss": 0.3017, "step": 2670 }, { "epoch": 1.1958946898705936, "grad_norm": 0.29798742777776227, "learning_rate": 7.5239456520734726e-06, "loss": 0.2972, "step": 2680 }, { "epoch": 1.2003569834895136, "grad_norm": 0.3062412855044856, "learning_rate": 7.501498857907485e-06, "loss": 0.2845, "step": 2690 }, { "epoch": 1.2048192771084336, "grad_norm": 0.27470746072655944, "learning_rate": 7.478984612751371e-06, "loss": 0.2863, "step": 2700 }, { "epoch": 1.2092815707273539, "grad_norm": 0.27866953124323884, "learning_rate": 7.456403523684412e-06, "loss": 0.2863, "step": 2710 }, { "epoch": 1.213743864346274, "grad_norm": 0.2909116045188707, "learning_rate": 7.433756199588282e-06, "loss": 0.2926, "step": 2720 }, { "epoch": 1.2182061579651942, "grad_norm": 0.29453973806838746, "learning_rate": 7.411043251130634e-06, "loss": 0.3008, "step": 2730 }, { "epoch": 1.2226684515841142, "grad_norm": 0.31106123745886194, "learning_rate": 7.388265290748631e-06, "loss": 0.2966, "step": 2740 }, { "epoch": 1.2271307452030344, "grad_norm": 0.3011866488239261, "learning_rate": 7.36542293263243e-06, "loss": 0.2937, "step": 2750 }, { "epoch": 1.2315930388219545, "grad_norm": 0.2940575796812627, "learning_rate": 7.342516792708627e-06, "loss": 0.2852, "step": 2760 }, { "epoch": 1.2360553324408747, "grad_norm": 0.26936074456201886, "learning_rate": 7.319547488623642e-06, "loss": 0.2921, "step": 2770 }, { "epoch": 1.2405176260597948, "grad_norm": 0.290558981897068, "learning_rate": 7.296515639727071e-06, "loss": 0.2951, "step": 2780 }, { "epoch": 1.2449799196787148, "grad_norm": 0.2852457393680859, "learning_rate": 7.273421867054979e-06, "loss": 0.2883, "step": 2790 }, { "epoch": 1.249442213297635, "grad_norm": 0.3014603513538629, "learning_rate": 7.250266793313161e-06, "loss": 0.2865, "step": 2800 }, { "epoch": 1.253904506916555, "grad_norm": 0.31555167954226493, "learning_rate": 7.2270510428603465e-06, "loss": 0.2872, "step": 2810 }, { "epoch": 1.2583668005354753, "grad_norm": 0.30225060098084444, "learning_rate": 7.2037752416913684e-06, "loss": 0.2917, "step": 2820 }, { "epoch": 1.2628290941543954, "grad_norm": 0.3044780265386901, "learning_rate": 7.180440017420277e-06, "loss": 0.2928, "step": 2830 }, { "epoch": 1.2672913877733154, "grad_norm": 0.28549429454817354, "learning_rate": 7.157045999263423e-06, "loss": 0.2966, "step": 2840 }, { "epoch": 1.2717536813922357, "grad_norm": 0.3366282530568157, "learning_rate": 7.13359381802249e-06, "loss": 0.2934, "step": 2850 }, { "epoch": 1.2762159750111557, "grad_norm": 0.37361922324450697, "learning_rate": 7.110084106067483e-06, "loss": 0.2838, "step": 2860 }, { "epoch": 1.280678268630076, "grad_norm": 0.3167267835607769, "learning_rate": 7.0865174973196746e-06, "loss": 0.303, "step": 2870 }, { "epoch": 1.285140562248996, "grad_norm": 0.3151077338868855, "learning_rate": 7.062894627234525e-06, "loss": 0.286, "step": 2880 }, { "epoch": 1.289602855867916, "grad_norm": 0.2948165918607916, "learning_rate": 7.039216132784528e-06, "loss": 0.2823, "step": 2890 }, { "epoch": 1.2940651494868363, "grad_norm": 0.34833417956591767, "learning_rate": 7.0154826524420506e-06, "loss": 0.282, "step": 2900 }, { "epoch": 1.2985274431057563, "grad_norm": 0.29607288645591945, "learning_rate": 6.9916948261621145e-06, "loss": 0.2903, "step": 2910 }, { "epoch": 1.3029897367246766, "grad_norm": 0.3141375337783596, "learning_rate": 6.96785329536513e-06, "loss": 0.3022, "step": 2920 }, { "epoch": 1.3074520303435966, "grad_norm": 0.3094386182548052, "learning_rate": 6.943958702919618e-06, "loss": 0.2963, "step": 2930 }, { "epoch": 1.3119143239625166, "grad_norm": 0.29702645469864675, "learning_rate": 6.9200116931248575e-06, "loss": 0.2925, "step": 2940 }, { "epoch": 1.3163766175814369, "grad_norm": 0.3131467834081039, "learning_rate": 6.896012911693527e-06, "loss": 0.2926, "step": 2950 }, { "epoch": 1.320838911200357, "grad_norm": 0.30420307046401684, "learning_rate": 6.871963005734283e-06, "loss": 0.2914, "step": 2960 }, { "epoch": 1.3253012048192772, "grad_norm": 0.2874880602952481, "learning_rate": 6.847862623734316e-06, "loss": 0.2907, "step": 2970 }, { "epoch": 1.3297634984381972, "grad_norm": 0.31117439728389396, "learning_rate": 6.823712415541867e-06, "loss": 0.2906, "step": 2980 }, { "epoch": 1.3342257920571172, "grad_norm": 0.3154785866126041, "learning_rate": 6.7995130323486995e-06, "loss": 0.2905, "step": 2990 }, { "epoch": 1.3386880856760375, "grad_norm": 0.3162007988121743, "learning_rate": 6.775265126672544e-06, "loss": 0.2944, "step": 3000 }, { "epoch": 1.3431503792949577, "grad_norm": 0.2996244368117481, "learning_rate": 6.750969352339503e-06, "loss": 0.2791, "step": 3010 }, { "epoch": 1.3476126729138778, "grad_norm": 0.31285203943822604, "learning_rate": 6.726626364466418e-06, "loss": 0.2978, "step": 3020 }, { "epoch": 1.3520749665327978, "grad_norm": 0.3069565443579846, "learning_rate": 6.70223681944321e-06, "loss": 0.2887, "step": 3030 }, { "epoch": 1.356537260151718, "grad_norm": 0.3555134690217621, "learning_rate": 6.677801374915175e-06, "loss": 0.2938, "step": 3040 }, { "epoch": 1.360999553770638, "grad_norm": 0.32323874683363596, "learning_rate": 6.653320689765257e-06, "loss": 0.2878, "step": 3050 }, { "epoch": 1.3654618473895583, "grad_norm": 0.2958320258483459, "learning_rate": 6.628795424096276e-06, "loss": 0.2846, "step": 3060 }, { "epoch": 1.3699241410084784, "grad_norm": 0.29591479942269067, "learning_rate": 6.604226239213131e-06, "loss": 0.2912, "step": 3070 }, { "epoch": 1.3743864346273984, "grad_norm": 0.275955956029984, "learning_rate": 6.579613797604971e-06, "loss": 0.2841, "step": 3080 }, { "epoch": 1.3788487282463187, "grad_norm": 0.3169860752144058, "learning_rate": 6.554958762927328e-06, "loss": 0.2911, "step": 3090 }, { "epoch": 1.3833110218652387, "grad_norm": 0.2965804636519538, "learning_rate": 6.530261799984225e-06, "loss": 0.289, "step": 3100 }, { "epoch": 1.387773315484159, "grad_norm": 0.3240128684238396, "learning_rate": 6.5055235747102456e-06, "loss": 0.2914, "step": 3110 }, { "epoch": 1.392235609103079, "grad_norm": 0.27893092037168543, "learning_rate": 6.480744754152581e-06, "loss": 0.2904, "step": 3120 }, { "epoch": 1.396697902721999, "grad_norm": 0.30817444273855016, "learning_rate": 6.455926006453045e-06, "loss": 0.2888, "step": 3130 }, { "epoch": 1.4011601963409193, "grad_norm": 0.29700021328473286, "learning_rate": 6.431068000830054e-06, "loss": 0.2892, "step": 3140 }, { "epoch": 1.4056224899598393, "grad_norm": 0.31253083958098826, "learning_rate": 6.406171407560587e-06, "loss": 0.2939, "step": 3150 }, { "epoch": 1.4100847835787595, "grad_norm": 0.2949725175395465, "learning_rate": 6.381236897962102e-06, "loss": 0.2918, "step": 3160 }, { "epoch": 1.4145470771976796, "grad_norm": 0.2835765420987079, "learning_rate": 6.356265144374451e-06, "loss": 0.2912, "step": 3170 }, { "epoch": 1.4190093708165996, "grad_norm": 0.2911266952336509, "learning_rate": 6.3312568201417335e-06, "loss": 0.2863, "step": 3180 }, { "epoch": 1.4234716644355199, "grad_norm": 0.2975023838090645, "learning_rate": 6.306212599594155e-06, "loss": 0.287, "step": 3190 }, { "epoch": 1.42793395805444, "grad_norm": 0.33944728741572633, "learning_rate": 6.281133158029833e-06, "loss": 0.28, "step": 3200 }, { "epoch": 1.4323962516733602, "grad_norm": 0.29846101119632584, "learning_rate": 6.256019171696595e-06, "loss": 0.2804, "step": 3210 }, { "epoch": 1.4368585452922802, "grad_norm": 0.28248180975947623, "learning_rate": 6.230871317773737e-06, "loss": 0.2926, "step": 3220 }, { "epoch": 1.4413208389112002, "grad_norm": 0.2851276821074005, "learning_rate": 6.205690274353775e-06, "loss": 0.2922, "step": 3230 }, { "epoch": 1.4457831325301205, "grad_norm": 0.39429209152368455, "learning_rate": 6.1804767204241515e-06, "loss": 0.2836, "step": 3240 }, { "epoch": 1.4502454261490407, "grad_norm": 0.28036841762852593, "learning_rate": 6.155231335848927e-06, "loss": 0.297, "step": 3250 }, { "epoch": 1.4547077197679608, "grad_norm": 0.31725786994860594, "learning_rate": 6.129954801350455e-06, "loss": 0.2912, "step": 3260 }, { "epoch": 1.4591700133868808, "grad_norm": 0.3367577417882933, "learning_rate": 6.1046477984910215e-06, "loss": 0.2941, "step": 3270 }, { "epoch": 1.463632307005801, "grad_norm": 0.3029711035622468, "learning_rate": 6.079311009654466e-06, "loss": 0.2714, "step": 3280 }, { "epoch": 1.468094600624721, "grad_norm": 0.36320361509650795, "learning_rate": 6.053945118027789e-06, "loss": 0.2998, "step": 3290 }, { "epoch": 1.4725568942436413, "grad_norm": 0.3004963431679098, "learning_rate": 6.028550807582718e-06, "loss": 0.2841, "step": 3300 }, { "epoch": 1.4770191878625614, "grad_norm": 0.28832303322372754, "learning_rate": 6.00312876305728e-06, "loss": 0.2907, "step": 3310 }, { "epoch": 1.4814814814814814, "grad_norm": 0.2738988460065302, "learning_rate": 5.977679669937325e-06, "loss": 0.2865, "step": 3320 }, { "epoch": 1.4859437751004017, "grad_norm": 0.3055752634708775, "learning_rate": 5.952204214438049e-06, "loss": 0.2886, "step": 3330 }, { "epoch": 1.4904060687193217, "grad_norm": 0.3716780982022642, "learning_rate": 5.926703083485488e-06, "loss": 0.2906, "step": 3340 }, { "epoch": 1.494868362338242, "grad_norm": 0.324100431102628, "learning_rate": 5.901176964698e-06, "loss": 0.2857, "step": 3350 }, { "epoch": 1.499330655957162, "grad_norm": 0.27889777787988906, "learning_rate": 5.875626546367716e-06, "loss": 0.292, "step": 3360 }, { "epoch": 1.503792949576082, "grad_norm": 0.30807078683693484, "learning_rate": 5.850052517441991e-06, "loss": 0.2963, "step": 3370 }, { "epoch": 1.5082552431950023, "grad_norm": 0.2762209648675528, "learning_rate": 5.824455567504817e-06, "loss": 0.286, "step": 3380 }, { "epoch": 1.5127175368139225, "grad_norm": 0.33369974271939384, "learning_rate": 5.798836386758235e-06, "loss": 0.2954, "step": 3390 }, { "epoch": 1.5171798304328425, "grad_norm": 0.2860459702457261, "learning_rate": 5.773195666003724e-06, "loss": 0.2938, "step": 3400 }, { "epoch": 1.5216421240517626, "grad_norm": 0.2779676376622302, "learning_rate": 5.747534096623569e-06, "loss": 0.2876, "step": 3410 }, { "epoch": 1.5261044176706826, "grad_norm": 0.2955504033746034, "learning_rate": 5.7218523705622275e-06, "loss": 0.2929, "step": 3420 }, { "epoch": 1.5305667112896029, "grad_norm": 0.2980806596476541, "learning_rate": 5.696151180307661e-06, "loss": 0.2925, "step": 3430 }, { "epoch": 1.5350290049085231, "grad_norm": 0.2794624988961113, "learning_rate": 5.670431218872672e-06, "loss": 0.2931, "step": 3440 }, { "epoch": 1.5394912985274432, "grad_norm": 0.2786208372363059, "learning_rate": 5.644693179776213e-06, "loss": 0.2974, "step": 3450 }, { "epoch": 1.5439535921463632, "grad_norm": 0.27275146705308617, "learning_rate": 5.618937757024683e-06, "loss": 0.2912, "step": 3460 }, { "epoch": 1.5484158857652832, "grad_norm": 0.27976748800298595, "learning_rate": 5.593165645093222e-06, "loss": 0.2951, "step": 3470 }, { "epoch": 1.5528781793842035, "grad_norm": 0.2818438825086793, "learning_rate": 5.567377538906977e-06, "loss": 0.2923, "step": 3480 }, { "epoch": 1.5573404730031237, "grad_norm": 0.27434503383015063, "learning_rate": 5.541574133822374e-06, "loss": 0.2903, "step": 3490 }, { "epoch": 1.5618027666220438, "grad_norm": 0.28657591757615636, "learning_rate": 5.515756125608355e-06, "loss": 0.288, "step": 3500 }, { "epoch": 1.5662650602409638, "grad_norm": 0.27282164230782224, "learning_rate": 5.489924210427628e-06, "loss": 0.2896, "step": 3510 }, { "epoch": 1.5707273538598838, "grad_norm": 0.417801968862151, "learning_rate": 5.464079084817892e-06, "loss": 0.284, "step": 3520 }, { "epoch": 1.575189647478804, "grad_norm": 0.29465703843658075, "learning_rate": 5.4382214456730546e-06, "loss": 0.2918, "step": 3530 }, { "epoch": 1.5796519410977243, "grad_norm": 0.28542382527132004, "learning_rate": 5.412351990224438e-06, "loss": 0.2857, "step": 3540 }, { "epoch": 1.5841142347166444, "grad_norm": 0.2693415075013077, "learning_rate": 5.386471416021987e-06, "loss": 0.2833, "step": 3550 }, { "epoch": 1.5885765283355644, "grad_norm": 0.28674507691911566, "learning_rate": 5.36058042091545e-06, "loss": 0.2788, "step": 3560 }, { "epoch": 1.5930388219544847, "grad_norm": 0.28366032186224716, "learning_rate": 5.33467970303557e-06, "loss": 0.2867, "step": 3570 }, { "epoch": 1.5975011155734047, "grad_norm": 0.2953521127402904, "learning_rate": 5.308769960775257e-06, "loss": 0.2939, "step": 3580 }, { "epoch": 1.601963409192325, "grad_norm": 0.2888287000335896, "learning_rate": 5.28285189277076e-06, "loss": 0.2905, "step": 3590 }, { "epoch": 1.606425702811245, "grad_norm": 0.27937721122117365, "learning_rate": 5.2569261978828155e-06, "loss": 0.2982, "step": 3600 }, { "epoch": 1.610887996430165, "grad_norm": 0.28062178431057444, "learning_rate": 5.230993575177823e-06, "loss": 0.2925, "step": 3610 }, { "epoch": 1.6153502900490853, "grad_norm": 0.2959051465736045, "learning_rate": 5.2050547239089796e-06, "loss": 0.2766, "step": 3620 }, { "epoch": 1.6198125836680055, "grad_norm": 0.3167636433656721, "learning_rate": 5.179110343497432e-06, "loss": 0.2921, "step": 3630 }, { "epoch": 1.6242748772869255, "grad_norm": 0.27553361706720186, "learning_rate": 5.15316113351342e-06, "loss": 0.2866, "step": 3640 }, { "epoch": 1.6287371709058456, "grad_norm": 0.281198216094774, "learning_rate": 5.1272077936574005e-06, "loss": 0.2869, "step": 3650 }, { "epoch": 1.6331994645247656, "grad_norm": 0.28566411066751285, "learning_rate": 5.1012510237411975e-06, "loss": 0.2874, "step": 3660 }, { "epoch": 1.6376617581436859, "grad_norm": 0.27128449582175607, "learning_rate": 5.075291523669118e-06, "loss": 0.2771, "step": 3670 }, { "epoch": 1.6421240517626061, "grad_norm": 0.25597160345393283, "learning_rate": 5.049329993419092e-06, "loss": 0.2882, "step": 3680 }, { "epoch": 1.6465863453815262, "grad_norm": 0.27857161981462486, "learning_rate": 5.023367133023784e-06, "loss": 0.2942, "step": 3690 }, { "epoch": 1.6510486390004462, "grad_norm": 0.2862312874917413, "learning_rate": 4.997403642551733e-06, "loss": 0.2949, "step": 3700 }, { "epoch": 1.6555109326193662, "grad_norm": 0.2553590681517932, "learning_rate": 4.971440222088459e-06, "loss": 0.2823, "step": 3710 }, { "epoch": 1.6599732262382865, "grad_norm": 0.2806614787948605, "learning_rate": 4.945477571717602e-06, "loss": 0.2946, "step": 3720 }, { "epoch": 1.6644355198572067, "grad_norm": 0.3072907017218825, "learning_rate": 4.91951639150203e-06, "loss": 0.2814, "step": 3730 }, { "epoch": 1.6688978134761268, "grad_norm": 0.2874398898980506, "learning_rate": 4.8935573814649765e-06, "loss": 0.2915, "step": 3740 }, { "epoch": 1.6733601070950468, "grad_norm": 0.28754909498663717, "learning_rate": 4.867601241571153e-06, "loss": 0.2883, "step": 3750 }, { "epoch": 1.6778224007139668, "grad_norm": 0.273658543396969, "learning_rate": 4.841648671707881e-06, "loss": 0.2829, "step": 3760 }, { "epoch": 1.682284694332887, "grad_norm": 0.288492021314724, "learning_rate": 4.815700371666219e-06, "loss": 0.2897, "step": 3770 }, { "epoch": 1.6867469879518073, "grad_norm": 0.302492958266678, "learning_rate": 4.789757041122093e-06, "loss": 0.2806, "step": 3780 }, { "epoch": 1.6912092815707274, "grad_norm": 0.2995952463789012, "learning_rate": 4.763819379617432e-06, "loss": 0.2896, "step": 3790 }, { "epoch": 1.6956715751896474, "grad_norm": 0.30875546003288895, "learning_rate": 4.737888086541298e-06, "loss": 0.2859, "step": 3800 }, { "epoch": 1.7001338688085676, "grad_norm": 0.2944845035908801, "learning_rate": 4.711963861111043e-06, "loss": 0.3009, "step": 3810 }, { "epoch": 1.7045961624274877, "grad_norm": 0.3619439737256338, "learning_rate": 4.686047402353433e-06, "loss": 0.2841, "step": 3820 }, { "epoch": 1.709058456046408, "grad_norm": 0.29449858921532607, "learning_rate": 4.660139409085825e-06, "loss": 0.2935, "step": 3830 }, { "epoch": 1.713520749665328, "grad_norm": 0.2829699337998857, "learning_rate": 4.634240579897299e-06, "loss": 0.2921, "step": 3840 }, { "epoch": 1.717983043284248, "grad_norm": 0.2600468326834046, "learning_rate": 4.608351613129841e-06, "loss": 0.2835, "step": 3850 }, { "epoch": 1.7224453369031683, "grad_norm": 0.2793462803832517, "learning_rate": 4.582473206859498e-06, "loss": 0.2882, "step": 3860 }, { "epoch": 1.7269076305220885, "grad_norm": 0.2868869922030311, "learning_rate": 4.556606058877567e-06, "loss": 0.2883, "step": 3870 }, { "epoch": 1.7313699241410085, "grad_norm": 0.3862045749220281, "learning_rate": 4.530750866671769e-06, "loss": 0.2924, "step": 3880 }, { "epoch": 1.7358322177599286, "grad_norm": 0.2561819626783043, "learning_rate": 4.504908327407452e-06, "loss": 0.2902, "step": 3890 }, { "epoch": 1.7402945113788486, "grad_norm": 0.26603791907774466, "learning_rate": 4.479079137908781e-06, "loss": 0.2753, "step": 3900 }, { "epoch": 1.7447568049977689, "grad_norm": 0.30516815757249277, "learning_rate": 4.453263994639959e-06, "loss": 0.293, "step": 3910 }, { "epoch": 1.7492190986166891, "grad_norm": 0.2689881363760943, "learning_rate": 4.427463593686442e-06, "loss": 0.2947, "step": 3920 }, { "epoch": 1.7536813922356091, "grad_norm": 0.26851951669717095, "learning_rate": 4.401678630736172e-06, "loss": 0.2835, "step": 3930 }, { "epoch": 1.7581436858545292, "grad_norm": 0.26883125019727905, "learning_rate": 4.3759098010608155e-06, "loss": 0.2782, "step": 3940 }, { "epoch": 1.7626059794734492, "grad_norm": 0.25218594985078024, "learning_rate": 4.350157799497017e-06, "loss": 0.2893, "step": 3950 }, { "epoch": 1.7670682730923695, "grad_norm": 0.26946851688893303, "learning_rate": 4.324423320427669e-06, "loss": 0.279, "step": 3960 }, { "epoch": 1.7715305667112897, "grad_norm": 0.2974783320098152, "learning_rate": 4.298707057763175e-06, "loss": 0.2855, "step": 3970 }, { "epoch": 1.7759928603302098, "grad_norm": 0.2966897122885392, "learning_rate": 4.273009704922757e-06, "loss": 0.2884, "step": 3980 }, { "epoch": 1.7804551539491298, "grad_norm": 0.2902971297459419, "learning_rate": 4.24733195481574e-06, "loss": 0.2796, "step": 3990 }, { "epoch": 1.7849174475680498, "grad_norm": 0.296879555221403, "learning_rate": 4.221674499822878e-06, "loss": 0.2941, "step": 4000 }, { "epoch": 1.78937974118697, "grad_norm": 0.2772609554006714, "learning_rate": 4.196038031777688e-06, "loss": 0.2784, "step": 4010 }, { "epoch": 1.7938420348058903, "grad_norm": 0.3728763097589947, "learning_rate": 4.170423241947782e-06, "loss": 0.2825, "step": 4020 }, { "epoch": 1.7983043284248104, "grad_norm": 0.2690864141407203, "learning_rate": 4.144830821016245e-06, "loss": 0.2848, "step": 4030 }, { "epoch": 1.8027666220437304, "grad_norm": 0.2945504927368404, "learning_rate": 4.119261459062992e-06, "loss": 0.2886, "step": 4040 }, { "epoch": 1.8072289156626506, "grad_norm": 0.2817141780614377, "learning_rate": 4.0937158455461805e-06, "loss": 0.2861, "step": 4050 }, { "epoch": 1.8116912092815707, "grad_norm": 0.2908042660570454, "learning_rate": 4.068194669283599e-06, "loss": 0.2855, "step": 4060 }, { "epoch": 1.816153502900491, "grad_norm": 0.26095488586703197, "learning_rate": 4.042698618434115e-06, "loss": 0.2775, "step": 4070 }, { "epoch": 1.820615796519411, "grad_norm": 0.26782163829467626, "learning_rate": 4.017228380479099e-06, "loss": 0.2902, "step": 4080 }, { "epoch": 1.825078090138331, "grad_norm": 0.2893359193551187, "learning_rate": 3.991784642203904e-06, "loss": 0.286, "step": 4090 }, { "epoch": 1.8295403837572513, "grad_norm": 0.28187305106972493, "learning_rate": 3.966368089679337e-06, "loss": 0.2951, "step": 4100 }, { "epoch": 1.8340026773761715, "grad_norm": 0.7149499820103131, "learning_rate": 3.9409794082431585e-06, "loss": 0.278, "step": 4110 }, { "epoch": 1.8384649709950915, "grad_norm": 0.2781228028139923, "learning_rate": 3.915619282481613e-06, "loss": 0.2804, "step": 4120 }, { "epoch": 1.8429272646140116, "grad_norm": 0.27787506538029344, "learning_rate": 3.890288396210958e-06, "loss": 0.2773, "step": 4130 }, { "epoch": 1.8473895582329316, "grad_norm": 0.2775837112621051, "learning_rate": 3.8649874324590355e-06, "loss": 0.2891, "step": 4140 }, { "epoch": 1.8518518518518519, "grad_norm": 0.27038074332915685, "learning_rate": 3.839717073446842e-06, "loss": 0.2819, "step": 4150 }, { "epoch": 1.8563141454707721, "grad_norm": 0.27153282107037896, "learning_rate": 3.8144780005701526e-06, "loss": 0.2808, "step": 4160 }, { "epoch": 1.8607764390896921, "grad_norm": 0.27119597948809526, "learning_rate": 3.7892708943811224e-06, "loss": 0.2828, "step": 4170 }, { "epoch": 1.8652387327086122, "grad_norm": 0.26751701443601517, "learning_rate": 3.7640964345699613e-06, "loss": 0.2872, "step": 4180 }, { "epoch": 1.8697010263275322, "grad_norm": 0.2668805357678453, "learning_rate": 3.738955299946588e-06, "loss": 0.2808, "step": 4190 }, { "epoch": 1.8741633199464525, "grad_norm": 0.26385620413020505, "learning_rate": 3.7138481684223316e-06, "loss": 0.284, "step": 4200 }, { "epoch": 1.8786256135653727, "grad_norm": 0.27804749998996886, "learning_rate": 3.688775716991661e-06, "loss": 0.2901, "step": 4210 }, { "epoch": 1.8830879071842928, "grad_norm": 0.26632920843385016, "learning_rate": 3.6637386217139158e-06, "loss": 0.2817, "step": 4220 }, { "epoch": 1.8875502008032128, "grad_norm": 0.28107011263915094, "learning_rate": 3.6387375576950902e-06, "loss": 0.2875, "step": 4230 }, { "epoch": 1.8920124944221328, "grad_norm": 0.27319254801532444, "learning_rate": 3.613773199069618e-06, "loss": 0.2893, "step": 4240 }, { "epoch": 1.896474788041053, "grad_norm": 0.2895639115697261, "learning_rate": 3.588846218982204e-06, "loss": 0.2869, "step": 4250 }, { "epoch": 1.9009370816599733, "grad_norm": 0.27271032068079937, "learning_rate": 3.563957289569669e-06, "loss": 0.2869, "step": 4260 }, { "epoch": 1.9053993752788934, "grad_norm": 0.26855796153698186, "learning_rate": 3.5391070819428246e-06, "loss": 0.2857, "step": 4270 }, { "epoch": 1.9098616688978134, "grad_norm": 0.28223224193709756, "learning_rate": 3.514296266168381e-06, "loss": 0.2915, "step": 4280 }, { "epoch": 1.9143239625167336, "grad_norm": 0.2641850196296661, "learning_rate": 3.4895255112508773e-06, "loss": 0.2762, "step": 4290 }, { "epoch": 1.9187862561356537, "grad_norm": 0.3420858629489308, "learning_rate": 3.4647954851146437e-06, "loss": 0.2925, "step": 4300 }, { "epoch": 1.923248549754574, "grad_norm": 0.2604588316374171, "learning_rate": 3.4401068545857843e-06, "loss": 0.2822, "step": 4310 }, { "epoch": 1.927710843373494, "grad_norm": 0.2715732112484208, "learning_rate": 3.4154602853742115e-06, "loss": 0.2764, "step": 4320 }, { "epoch": 1.932173136992414, "grad_norm": 0.29731471914640445, "learning_rate": 3.3908564420556778e-06, "loss": 0.284, "step": 4330 }, { "epoch": 1.9366354306113343, "grad_norm": 0.299436802757538, "learning_rate": 3.3662959880538744e-06, "loss": 0.2785, "step": 4340 }, { "epoch": 1.9410977242302545, "grad_norm": 0.25848952586710894, "learning_rate": 3.341779585622522e-06, "loss": 0.2782, "step": 4350 }, { "epoch": 1.9455600178491745, "grad_norm": 0.26318821809460174, "learning_rate": 3.3173078958275355e-06, "loss": 0.2772, "step": 4360 }, { "epoch": 1.9500223114680946, "grad_norm": 0.2849012354934153, "learning_rate": 3.292881578529179e-06, "loss": 0.2878, "step": 4370 }, { "epoch": 1.9544846050870146, "grad_norm": 0.28212180104540435, "learning_rate": 3.268501292364289e-06, "loss": 0.2765, "step": 4380 }, { "epoch": 1.9589468987059349, "grad_norm": 0.28217071136679106, "learning_rate": 3.2441676947285035e-06, "loss": 0.2841, "step": 4390 }, { "epoch": 1.9634091923248551, "grad_norm": 0.4014867685340475, "learning_rate": 3.219881441758541e-06, "loss": 0.2842, "step": 4400 }, { "epoch": 1.9678714859437751, "grad_norm": 0.26510923757329297, "learning_rate": 3.19564318831451e-06, "loss": 0.282, "step": 4410 }, { "epoch": 1.9723337795626952, "grad_norm": 0.26724901583056526, "learning_rate": 3.171453587962246e-06, "loss": 0.2829, "step": 4420 }, { "epoch": 1.9767960731816152, "grad_norm": 0.25923632674916225, "learning_rate": 3.1473132929556927e-06, "loss": 0.2879, "step": 4430 }, { "epoch": 1.9812583668005355, "grad_norm": 0.2819718374318917, "learning_rate": 3.1232229542193126e-06, "loss": 0.2887, "step": 4440 }, { "epoch": 1.9857206604194557, "grad_norm": 0.27554261636665484, "learning_rate": 3.0991832213305367e-06, "loss": 0.2868, "step": 4450 }, { "epoch": 1.9901829540383758, "grad_norm": 0.3200504664553153, "learning_rate": 3.0751947425022465e-06, "loss": 0.2796, "step": 4460 }, { "epoch": 1.9946452476572958, "grad_norm": 0.2610653468140633, "learning_rate": 3.0512581645653007e-06, "loss": 0.2911, "step": 4470 }, { "epoch": 1.9991075412762158, "grad_norm": 0.3126154839381511, "learning_rate": 3.0273741329510852e-06, "loss": 0.287, "step": 4480 }, { "epoch": 2.0035698348951363, "grad_norm": 0.29397339441332226, "learning_rate": 3.0035432916741215e-06, "loss": 0.2665, "step": 4490 }, { "epoch": 2.0080321285140563, "grad_norm": 0.275630796936842, "learning_rate": 2.979766283314688e-06, "loss": 0.2643, "step": 4500 }, { "epoch": 2.0124944221329764, "grad_norm": 0.26340335226827566, "learning_rate": 2.9560437490015013e-06, "loss": 0.2727, "step": 4510 }, { "epoch": 2.0169567157518964, "grad_norm": 0.30161717213091244, "learning_rate": 2.9323763283944338e-06, "loss": 0.2625, "step": 4520 }, { "epoch": 2.0214190093708164, "grad_norm": 0.3171900997207839, "learning_rate": 2.9087646596672487e-06, "loss": 0.2598, "step": 4530 }, { "epoch": 2.025881302989737, "grad_norm": 0.3222582665257153, "learning_rate": 2.8852093794904136e-06, "loss": 0.2652, "step": 4540 }, { "epoch": 2.030343596608657, "grad_norm": 0.28771230139561266, "learning_rate": 2.861711123013911e-06, "loss": 0.2708, "step": 4550 }, { "epoch": 2.034805890227577, "grad_norm": 0.26951340411915475, "learning_rate": 2.838270523850135e-06, "loss": 0.271, "step": 4560 }, { "epoch": 2.039268183846497, "grad_norm": 0.26621140063523485, "learning_rate": 2.8148882140567844e-06, "loss": 0.2675, "step": 4570 }, { "epoch": 2.043730477465417, "grad_norm": 0.30555915513232423, "learning_rate": 2.7915648241198386e-06, "loss": 0.263, "step": 4580 }, { "epoch": 2.0481927710843375, "grad_norm": 0.2975295129021857, "learning_rate": 2.7683009829365417e-06, "loss": 0.2598, "step": 4590 }, { "epoch": 2.0526550647032575, "grad_norm": 0.2734727711740158, "learning_rate": 2.745097317798452e-06, "loss": 0.268, "step": 4600 }, { "epoch": 2.0571173583221776, "grad_norm": 0.29391413176160824, "learning_rate": 2.7219544543745335e-06, "loss": 0.2661, "step": 4610 }, { "epoch": 2.0615796519410976, "grad_norm": 0.2840209475626469, "learning_rate": 2.698873016694271e-06, "loss": 0.2655, "step": 4620 }, { "epoch": 2.0660419455600176, "grad_norm": 0.2600456780446525, "learning_rate": 2.6758536271308582e-06, "loss": 0.2588, "step": 4630 }, { "epoch": 2.070504239178938, "grad_norm": 0.28816278556575226, "learning_rate": 2.6528969063844022e-06, "loss": 0.2708, "step": 4640 }, { "epoch": 2.074966532797858, "grad_norm": 0.27537400614220536, "learning_rate": 2.630003473465202e-06, "loss": 0.2603, "step": 4650 }, { "epoch": 2.079428826416778, "grad_norm": 0.2799988616872821, "learning_rate": 2.6071739456770394e-06, "loss": 0.265, "step": 4660 }, { "epoch": 2.083891120035698, "grad_norm": 0.27519216667774027, "learning_rate": 2.5844089386005512e-06, "loss": 0.2615, "step": 4670 }, { "epoch": 2.0883534136546187, "grad_norm": 0.2817941036734103, "learning_rate": 2.5617090660766218e-06, "loss": 0.2747, "step": 4680 }, { "epoch": 2.0928157072735387, "grad_norm": 0.2538075181305876, "learning_rate": 2.5390749401898274e-06, "loss": 0.2705, "step": 4690 }, { "epoch": 2.0972780008924587, "grad_norm": 0.30549102286988794, "learning_rate": 2.5165071712519447e-06, "loss": 0.2751, "step": 4700 }, { "epoch": 2.101740294511379, "grad_norm": 0.2794818884211519, "learning_rate": 2.4940063677854775e-06, "loss": 0.2668, "step": 4710 }, { "epoch": 2.106202588130299, "grad_norm": 0.27303724559190734, "learning_rate": 2.4715731365072666e-06, "loss": 0.2628, "step": 4720 }, { "epoch": 2.1106648817492193, "grad_norm": 0.29272010242374324, "learning_rate": 2.449208082312111e-06, "loss": 0.2647, "step": 4730 }, { "epoch": 2.1151271753681393, "grad_norm": 0.270738388759199, "learning_rate": 2.4269118082564774e-06, "loss": 0.2617, "step": 4740 }, { "epoch": 2.1195894689870594, "grad_norm": 0.2840129241196674, "learning_rate": 2.4046849155422193e-06, "loss": 0.274, "step": 4750 }, { "epoch": 2.1240517626059794, "grad_norm": 0.2645544390527427, "learning_rate": 2.382528003500384e-06, "loss": 0.2686, "step": 4760 }, { "epoch": 2.1285140562248994, "grad_norm": 0.2865191645329176, "learning_rate": 2.3604416695750364e-06, "loss": 0.2601, "step": 4770 }, { "epoch": 2.13297634984382, "grad_norm": 0.2656950110555082, "learning_rate": 2.3384265093071645e-06, "loss": 0.2652, "step": 4780 }, { "epoch": 2.13743864346274, "grad_norm": 0.2912542173901888, "learning_rate": 2.316483116318608e-06, "loss": 0.2569, "step": 4790 }, { "epoch": 2.14190093708166, "grad_norm": 0.2747221058539636, "learning_rate": 2.2946120822960562e-06, "loss": 0.2662, "step": 4800 }, { "epoch": 2.14636323070058, "grad_norm": 0.27225697835899576, "learning_rate": 2.2728139969751005e-06, "loss": 0.267, "step": 4810 }, { "epoch": 2.1508255243195, "grad_norm": 0.2526714399887326, "learning_rate": 2.2510894481243205e-06, "loss": 0.2659, "step": 4820 }, { "epoch": 2.1552878179384205, "grad_norm": 0.2787198573344132, "learning_rate": 2.2294390215294483e-06, "loss": 0.2612, "step": 4830 }, { "epoch": 2.1597501115573405, "grad_norm": 0.2881621459035689, "learning_rate": 2.207863300977558e-06, "loss": 0.2658, "step": 4840 }, { "epoch": 2.1642124051762606, "grad_norm": 0.2651959620965174, "learning_rate": 2.186362868241341e-06, "loss": 0.2656, "step": 4850 }, { "epoch": 2.1686746987951806, "grad_norm": 0.27079635036661953, "learning_rate": 2.164938303063404e-06, "loss": 0.2651, "step": 4860 }, { "epoch": 2.1731369924141006, "grad_norm": 0.2627129405327371, "learning_rate": 2.1435901831406504e-06, "loss": 0.259, "step": 4870 }, { "epoch": 2.177599286033021, "grad_norm": 0.28039269813095946, "learning_rate": 2.1223190841086893e-06, "loss": 0.2672, "step": 4880 }, { "epoch": 2.182061579651941, "grad_norm": 0.2641978867337932, "learning_rate": 2.1011255795263232e-06, "loss": 0.2658, "step": 4890 }, { "epoch": 2.186523873270861, "grad_norm": 0.26996809053622206, "learning_rate": 2.080010240860083e-06, "loss": 0.2714, "step": 4900 }, { "epoch": 2.190986166889781, "grad_norm": 0.28824583701799006, "learning_rate": 2.058973637468811e-06, "loss": 0.2676, "step": 4910 }, { "epoch": 2.1954484605087012, "grad_norm": 0.27005127612792346, "learning_rate": 2.0380163365883188e-06, "loss": 0.2738, "step": 4920 }, { "epoch": 2.1999107541276217, "grad_norm": 0.2740103878032033, "learning_rate": 2.01713890331608e-06, "loss": 0.2579, "step": 4930 }, { "epoch": 2.2043730477465417, "grad_norm": 0.2908767212364278, "learning_rate": 1.996341900596008e-06, "loss": 0.2696, "step": 4940 }, { "epoch": 2.208835341365462, "grad_norm": 0.2950349079099773, "learning_rate": 1.9756258892032604e-06, "loss": 0.2645, "step": 4950 }, { "epoch": 2.213297634984382, "grad_norm": 0.26050135319115303, "learning_rate": 1.9549914277291326e-06, "loss": 0.2642, "step": 4960 }, { "epoch": 2.2177599286033023, "grad_norm": 0.26630446740688435, "learning_rate": 1.9344390725659827e-06, "loss": 0.2684, "step": 4970 }, { "epoch": 2.2222222222222223, "grad_norm": 0.2743425147318156, "learning_rate": 1.9139693778922437e-06, "loss": 0.2667, "step": 4980 }, { "epoch": 2.2266845158411424, "grad_norm": 0.2736976794496928, "learning_rate": 1.8935828956574615e-06, "loss": 0.2696, "step": 4990 }, { "epoch": 2.2311468094600624, "grad_norm": 0.26998232015281187, "learning_rate": 1.873280175567434e-06, "loss": 0.2685, "step": 5000 }, { "epoch": 2.2356091030789824, "grad_norm": 0.2668005078365468, "learning_rate": 1.8530617650693671e-06, "loss": 0.2658, "step": 5010 }, { "epoch": 2.240071396697903, "grad_norm": 0.27297408370818904, "learning_rate": 1.832928209337133e-06, "loss": 0.2711, "step": 5020 }, { "epoch": 2.244533690316823, "grad_norm": 0.27120763446153706, "learning_rate": 1.8128800512565514e-06, "loss": 0.2632, "step": 5030 }, { "epoch": 2.248995983935743, "grad_norm": 0.3043844117742173, "learning_rate": 1.792917831410767e-06, "loss": 0.2646, "step": 5040 }, { "epoch": 2.253458277554663, "grad_norm": 0.27555460116170827, "learning_rate": 1.7730420880656641e-06, "loss": 0.2627, "step": 5050 }, { "epoch": 2.2579205711735835, "grad_norm": 0.25867849621197275, "learning_rate": 1.7532533571553523e-06, "loss": 0.2723, "step": 5060 }, { "epoch": 2.2623828647925035, "grad_norm": 0.2696220165863831, "learning_rate": 1.7335521722677223e-06, "loss": 0.2567, "step": 5070 }, { "epoch": 2.2668451584114235, "grad_norm": 0.2844615695492899, "learning_rate": 1.7139390646300503e-06, "loss": 0.2636, "step": 5080 }, { "epoch": 2.2713074520303436, "grad_norm": 0.26100844869286605, "learning_rate": 1.6944145630946757e-06, "loss": 0.2547, "step": 5090 }, { "epoch": 2.2757697456492636, "grad_norm": 0.2655209152406865, "learning_rate": 1.6749791941247501e-06, "loss": 0.2667, "step": 5100 }, { "epoch": 2.280232039268184, "grad_norm": 0.2670441776783262, "learning_rate": 1.6556334817800247e-06, "loss": 0.2593, "step": 5110 }, { "epoch": 2.284694332887104, "grad_norm": 0.2702953527750314, "learning_rate": 1.636377947702737e-06, "loss": 0.2668, "step": 5120 }, { "epoch": 2.289156626506024, "grad_norm": 0.2801526427682019, "learning_rate": 1.6172131111035305e-06, "loss": 0.2593, "step": 5130 }, { "epoch": 2.293618920124944, "grad_norm": 0.2664506709616671, "learning_rate": 1.598139488747467e-06, "loss": 0.2679, "step": 5140 }, { "epoch": 2.298081213743864, "grad_norm": 0.2842866153483872, "learning_rate": 1.5791575949400801e-06, "loss": 0.2683, "step": 5150 }, { "epoch": 2.3025435073627847, "grad_norm": 0.2713206331551484, "learning_rate": 1.5602679415135203e-06, "loss": 0.2672, "step": 5160 }, { "epoch": 2.3070058009817047, "grad_norm": 0.2680243773548362, "learning_rate": 1.5414710378127407e-06, "loss": 0.2668, "step": 5170 }, { "epoch": 2.3114680946006247, "grad_norm": 0.2937301531788135, "learning_rate": 1.522767390681776e-06, "loss": 0.2621, "step": 5180 }, { "epoch": 2.3159303882195448, "grad_norm": 0.359404031790861, "learning_rate": 1.5041575044500645e-06, "loss": 0.2667, "step": 5190 }, { "epoch": 2.320392681838465, "grad_norm": 0.26130760139495907, "learning_rate": 1.4856418809188538e-06, "loss": 0.2544, "step": 5200 }, { "epoch": 2.3248549754573853, "grad_norm": 0.25686940370213246, "learning_rate": 1.4672210193476766e-06, "loss": 0.274, "step": 5210 }, { "epoch": 2.3293172690763053, "grad_norm": 0.2939624963929075, "learning_rate": 1.4488954164408736e-06, "loss": 0.2701, "step": 5220 }, { "epoch": 2.3337795626952254, "grad_norm": 0.2783635407103021, "learning_rate": 1.4306655663342173e-06, "loss": 0.2563, "step": 5230 }, { "epoch": 2.3382418563141454, "grad_norm": 0.29226415842720005, "learning_rate": 1.412531960581572e-06, "loss": 0.2541, "step": 5240 }, { "epoch": 2.3427041499330654, "grad_norm": 0.3552878374922974, "learning_rate": 1.3944950881416541e-06, "loss": 0.2645, "step": 5250 }, { "epoch": 2.347166443551986, "grad_norm": 0.25303935909037156, "learning_rate": 1.3765554353648348e-06, "loss": 0.26, "step": 5260 }, { "epoch": 2.351628737170906, "grad_norm": 0.26959464702039115, "learning_rate": 1.3587134859800378e-06, "loss": 0.2622, "step": 5270 }, { "epoch": 2.356091030789826, "grad_norm": 0.2767449988752219, "learning_rate": 1.3409697210816846e-06, "loss": 0.2631, "step": 5280 }, { "epoch": 2.360553324408746, "grad_norm": 0.2948939949561596, "learning_rate": 1.3233246191167293e-06, "loss": 0.2721, "step": 5290 }, { "epoch": 2.365015618027666, "grad_norm": 0.2806355856045073, "learning_rate": 1.3057786558717593e-06, "loss": 0.2674, "step": 5300 }, { "epoch": 2.3694779116465865, "grad_norm": 0.2738252090208611, "learning_rate": 1.2883323044601575e-06, "loss": 0.274, "step": 5310 }, { "epoch": 2.3739402052655065, "grad_norm": 0.2577368049277014, "learning_rate": 1.2709860353093555e-06, "loss": 0.2668, "step": 5320 }, { "epoch": 2.3784024988844266, "grad_norm": 0.29099029260946785, "learning_rate": 1.2537403161481387e-06, "loss": 0.2669, "step": 5330 }, { "epoch": 2.3828647925033466, "grad_norm": 0.264039286727906, "learning_rate": 1.2365956119940436e-06, "loss": 0.2768, "step": 5340 }, { "epoch": 2.3873270861222666, "grad_norm": 0.2686625114381172, "learning_rate": 1.2195523851408153e-06, "loss": 0.2735, "step": 5350 }, { "epoch": 2.391789379741187, "grad_norm": 0.27917896436759787, "learning_rate": 1.2026110951459364e-06, "loss": 0.2709, "step": 5360 }, { "epoch": 2.396251673360107, "grad_norm": 0.2759607663737845, "learning_rate": 1.1857721988182468e-06, "loss": 0.264, "step": 5370 }, { "epoch": 2.400713966979027, "grad_norm": 0.2744901640260912, "learning_rate": 1.169036150205614e-06, "loss": 0.2638, "step": 5380 }, { "epoch": 2.405176260597947, "grad_norm": 0.2639450101124726, "learning_rate": 1.1524034005827028e-06, "loss": 0.2609, "step": 5390 }, { "epoch": 2.4096385542168672, "grad_norm": 0.26462733650289344, "learning_rate": 1.1358743984387939e-06, "loss": 0.2571, "step": 5400 }, { "epoch": 2.4141008478357877, "grad_norm": 0.26467540440445797, "learning_rate": 1.1194495894657021e-06, "loss": 0.264, "step": 5410 }, { "epoch": 2.4185631414547077, "grad_norm": 0.2541839967727397, "learning_rate": 1.103129416545749e-06, "loss": 0.2734, "step": 5420 }, { "epoch": 2.4230254350736278, "grad_norm": 0.25980858502680365, "learning_rate": 1.0869143197398313e-06, "loss": 0.2711, "step": 5430 }, { "epoch": 2.427487728692548, "grad_norm": 0.2749178435606242, "learning_rate": 1.070804736275543e-06, "loss": 0.2638, "step": 5440 }, { "epoch": 2.4319500223114683, "grad_norm": 0.2507799683207913, "learning_rate": 1.0548011005353975e-06, "loss": 0.2639, "step": 5450 }, { "epoch": 2.4364123159303883, "grad_norm": 0.27822037022845514, "learning_rate": 1.0389038440451048e-06, "loss": 0.2687, "step": 5460 }, { "epoch": 2.4408746095493083, "grad_norm": 0.25912298984876, "learning_rate": 1.0231133954619449e-06, "loss": 0.2517, "step": 5470 }, { "epoch": 2.4453369031682284, "grad_norm": 0.26989363458508225, "learning_rate": 1.0074301805632014e-06, "loss": 0.261, "step": 5480 }, { "epoch": 2.4497991967871484, "grad_norm": 0.2611164113008286, "learning_rate": 9.918546222346837e-07, "loss": 0.2732, "step": 5490 }, { "epoch": 2.454261490406069, "grad_norm": 0.26664227118678985, "learning_rate": 9.763871404593295e-07, "loss": 0.2635, "step": 5500 }, { "epoch": 2.458723784024989, "grad_norm": 0.2845594958886921, "learning_rate": 9.610281523058696e-07, "loss": 0.2724, "step": 5510 }, { "epoch": 2.463186077643909, "grad_norm": 0.26533071138557196, "learning_rate": 9.457780719175924e-07, "loss": 0.2594, "step": 5520 }, { "epoch": 2.467648371262829, "grad_norm": 0.2931575613556212, "learning_rate": 9.306373105011685e-07, "loss": 0.2642, "step": 5530 }, { "epoch": 2.4721106648817495, "grad_norm": 0.2579593656604341, "learning_rate": 9.15606276315571e-07, "loss": 0.2686, "step": 5540 }, { "epoch": 2.4765729585006695, "grad_norm": 0.25308387491964407, "learning_rate": 9.006853746610578e-07, "loss": 0.2748, "step": 5550 }, { "epoch": 2.4810352521195895, "grad_norm": 0.26649803073733885, "learning_rate": 8.858750078682526e-07, "loss": 0.2702, "step": 5560 }, { "epoch": 2.4854975457385096, "grad_norm": 0.26710930733839655, "learning_rate": 8.711755752872875e-07, "loss": 0.2741, "step": 5570 }, { "epoch": 2.4899598393574296, "grad_norm": 0.28129936297807595, "learning_rate": 8.565874732770429e-07, "loss": 0.2711, "step": 5580 }, { "epoch": 2.49442213297635, "grad_norm": 0.2562852808371683, "learning_rate": 8.421110951944533e-07, "loss": 0.2729, "step": 5590 }, { "epoch": 2.49888442659527, "grad_norm": 0.2705997291745019, "learning_rate": 8.277468313839033e-07, "loss": 0.266, "step": 5600 }, { "epoch": 2.50334672021419, "grad_norm": 0.27415617746696014, "learning_rate": 8.13495069166706e-07, "loss": 0.2635, "step": 5610 }, { "epoch": 2.50780901383311, "grad_norm": 0.268499538957006, "learning_rate": 7.993561928306503e-07, "loss": 0.2626, "step": 5620 }, { "epoch": 2.51227130745203, "grad_norm": 0.2605445838763644, "learning_rate": 7.853305836196507e-07, "loss": 0.2684, "step": 5630 }, { "epoch": 2.5167336010709507, "grad_norm": 0.259462323778387, "learning_rate": 7.714186197234547e-07, "loss": 0.2669, "step": 5640 }, { "epoch": 2.5211958946898707, "grad_norm": 0.2737812098281487, "learning_rate": 7.576206762674565e-07, "loss": 0.2677, "step": 5650 }, { "epoch": 2.5256581883087907, "grad_norm": 0.24457747335646154, "learning_rate": 7.439371253025718e-07, "loss": 0.2441, "step": 5660 }, { "epoch": 2.5301204819277108, "grad_norm": 0.25318796192153853, "learning_rate": 7.303683357952168e-07, "loss": 0.2692, "step": 5670 }, { "epoch": 2.534582775546631, "grad_norm": 0.25584827852861586, "learning_rate": 7.169146736173477e-07, "loss": 0.2696, "step": 5680 }, { "epoch": 2.5390450691655513, "grad_norm": 0.2602088423877808, "learning_rate": 7.035765015366047e-07, "loss": 0.2668, "step": 5690 }, { "epoch": 2.5435073627844713, "grad_norm": 0.2820757957669124, "learning_rate": 6.903541792065265e-07, "loss": 0.2771, "step": 5700 }, { "epoch": 2.5479696564033913, "grad_norm": 0.289619323277333, "learning_rate": 6.772480631568496e-07, "loss": 0.2677, "step": 5710 }, { "epoch": 2.5524319500223114, "grad_norm": 0.24262073349803384, "learning_rate": 6.642585067839003e-07, "loss": 0.2632, "step": 5720 }, { "epoch": 2.5568942436412314, "grad_norm": 0.3063970631496104, "learning_rate": 6.513858603410605e-07, "loss": 0.2645, "step": 5730 }, { "epoch": 2.561356537260152, "grad_norm": 0.24469985287690035, "learning_rate": 6.386304709293295e-07, "loss": 0.2674, "step": 5740 }, { "epoch": 2.565818830879072, "grad_norm": 0.25778009113854466, "learning_rate": 6.259926824879575e-07, "loss": 0.2686, "step": 5750 }, { "epoch": 2.570281124497992, "grad_norm": 0.2519725071505496, "learning_rate": 6.134728357851777e-07, "loss": 0.2614, "step": 5760 }, { "epoch": 2.574743418116912, "grad_norm": 0.24559660905732697, "learning_rate": 6.010712684090125e-07, "loss": 0.2538, "step": 5770 }, { "epoch": 2.579205711735832, "grad_norm": 0.2549259950940346, "learning_rate": 5.887883147581769e-07, "loss": 0.2669, "step": 5780 }, { "epoch": 2.5836680053547525, "grad_norm": 0.26474210608850174, "learning_rate": 5.766243060330551e-07, "loss": 0.2645, "step": 5790 }, { "epoch": 2.5881302989736725, "grad_norm": 0.2611149545019317, "learning_rate": 5.645795702267731e-07, "loss": 0.2713, "step": 5800 }, { "epoch": 2.5925925925925926, "grad_norm": 0.27028998633235257, "learning_rate": 5.526544321163573e-07, "loss": 0.2765, "step": 5810 }, { "epoch": 2.5970548862115126, "grad_norm": 0.26131292774366577, "learning_rate": 5.408492132539705e-07, "loss": 0.2601, "step": 5820 }, { "epoch": 2.6015171798304326, "grad_norm": 0.25036551401682255, "learning_rate": 5.29164231958249e-07, "loss": 0.2667, "step": 5830 }, { "epoch": 2.605979473449353, "grad_norm": 0.32252379358555433, "learning_rate": 5.175998033057128e-07, "loss": 0.2598, "step": 5840 }, { "epoch": 2.610441767068273, "grad_norm": 0.26111392130818567, "learning_rate": 5.061562391222752e-07, "loss": 0.2708, "step": 5850 }, { "epoch": 2.614904060687193, "grad_norm": 0.2610753618549455, "learning_rate": 4.948338479748293e-07, "loss": 0.264, "step": 5860 }, { "epoch": 2.619366354306113, "grad_norm": 0.24433197279025629, "learning_rate": 4.836329351629343e-07, "loss": 0.2591, "step": 5870 }, { "epoch": 2.6238286479250332, "grad_norm": 0.25566187611860886, "learning_rate": 4.7255380271057637e-07, "loss": 0.2709, "step": 5880 }, { "epoch": 2.6282909415439537, "grad_norm": 0.25059259535846407, "learning_rate": 4.6159674935802867e-07, "loss": 0.2623, "step": 5890 }, { "epoch": 2.6327532351628737, "grad_norm": 0.2623672779635096, "learning_rate": 4.507620705537974e-07, "loss": 0.259, "step": 5900 }, { "epoch": 2.6372155287817938, "grad_norm": 0.2786335146735297, "learning_rate": 4.400500584466505e-07, "loss": 0.2676, "step": 5910 }, { "epoch": 2.641677822400714, "grad_norm": 0.24900048144785913, "learning_rate": 4.294610018777462e-07, "loss": 0.263, "step": 5920 }, { "epoch": 2.646140116019634, "grad_norm": 0.25847202609000797, "learning_rate": 4.1899518637283753e-07, "loss": 0.2677, "step": 5930 }, { "epoch": 2.6506024096385543, "grad_norm": 0.2782722288055866, "learning_rate": 4.0865289413458074e-07, "loss": 0.2617, "step": 5940 }, { "epoch": 2.6550647032574743, "grad_norm": 0.28648904043807316, "learning_rate": 3.984344040349197e-07, "loss": 0.2572, "step": 5950 }, { "epoch": 2.6595269968763944, "grad_norm": 0.2636564981421794, "learning_rate": 3.883399916075714e-07, "loss": 0.2623, "step": 5960 }, { "epoch": 2.663989290495315, "grad_norm": 0.2493631339678549, "learning_rate": 3.783699290405901e-07, "loss": 0.2649, "step": 5970 }, { "epoch": 2.6684515841142344, "grad_norm": 0.2528637606092601, "learning_rate": 3.6852448516903727e-07, "loss": 0.2764, "step": 5980 }, { "epoch": 2.672913877733155, "grad_norm": 0.2729817482325442, "learning_rate": 3.588039254677211e-07, "loss": 0.2622, "step": 5990 }, { "epoch": 2.677376171352075, "grad_norm": 0.3108379178962195, "learning_rate": 3.4920851204405026e-07, "loss": 0.2614, "step": 6000 }, { "epoch": 2.681838464970995, "grad_norm": 0.25516673857932876, "learning_rate": 3.397385036309558e-07, "loss": 0.2545, "step": 6010 }, { "epoch": 2.6863007585899155, "grad_norm": 0.26499244030577884, "learning_rate": 3.303941555799223e-07, "loss": 0.269, "step": 6020 }, { "epoch": 2.6907630522088355, "grad_norm": 0.2644403491762858, "learning_rate": 3.211757198540971e-07, "loss": 0.261, "step": 6030 }, { "epoch": 2.6952253458277555, "grad_norm": 0.2512005184606446, "learning_rate": 3.12083445021501e-07, "loss": 0.2608, "step": 6040 }, { "epoch": 2.6996876394466756, "grad_norm": 0.24284759524855465, "learning_rate": 3.031175762483207e-07, "loss": 0.2573, "step": 6050 }, { "epoch": 2.7041499330655956, "grad_norm": 0.2588341633636346, "learning_rate": 2.942783552923034e-07, "loss": 0.2721, "step": 6060 }, { "epoch": 2.708612226684516, "grad_norm": 0.27162185956855733, "learning_rate": 2.8556602049623515e-07, "loss": 0.2635, "step": 6070 }, { "epoch": 2.713074520303436, "grad_norm": 0.24785721701569988, "learning_rate": 2.769808067815127e-07, "loss": 0.2654, "step": 6080 }, { "epoch": 2.717536813922356, "grad_norm": 0.26379289564503067, "learning_rate": 2.68522945641812e-07, "loss": 0.2703, "step": 6090 }, { "epoch": 2.721999107541276, "grad_norm": 0.25223927852747313, "learning_rate": 2.6019266513684525e-07, "loss": 0.2633, "step": 6100 }, { "epoch": 2.726461401160196, "grad_norm": 0.26022587215739407, "learning_rate": 2.5199018988620925e-07, "loss": 0.2628, "step": 6110 }, { "epoch": 2.7309236947791167, "grad_norm": 0.25618186440473806, "learning_rate": 2.439157410633336e-07, "loss": 0.2549, "step": 6120 }, { "epoch": 2.7353859883980367, "grad_norm": 0.2709291288264617, "learning_rate": 2.3596953638951093e-07, "loss": 0.2673, "step": 6130 }, { "epoch": 2.7398482820169567, "grad_norm": 0.3097723757526494, "learning_rate": 2.2815179012803056e-07, "loss": 0.2667, "step": 6140 }, { "epoch": 2.7443105756358768, "grad_norm": 0.26215128271049865, "learning_rate": 2.2046271307839928e-07, "loss": 0.2659, "step": 6150 }, { "epoch": 2.748772869254797, "grad_norm": 0.26937507169769476, "learning_rate": 2.1290251257065852e-07, "loss": 0.2647, "step": 6160 }, { "epoch": 2.7532351628737173, "grad_norm": 0.28086965259228747, "learning_rate": 2.054713924597923e-07, "loss": 0.2596, "step": 6170 }, { "epoch": 2.7576974564926373, "grad_norm": 0.25164611443705137, "learning_rate": 1.981695531202299e-07, "loss": 0.2613, "step": 6180 }, { "epoch": 2.7621597501115573, "grad_norm": 0.26328055433222686, "learning_rate": 1.9099719144044737e-07, "loss": 0.2585, "step": 6190 }, { "epoch": 2.7666220437304774, "grad_norm": 0.24268837305344704, "learning_rate": 1.8395450081765133e-07, "loss": 0.2594, "step": 6200 }, { "epoch": 2.7710843373493974, "grad_norm": 0.27208555089507047, "learning_rate": 1.7704167115257242e-07, "loss": 0.2701, "step": 6210 }, { "epoch": 2.775546630968318, "grad_norm": 0.2592924605339396, "learning_rate": 1.7025888884433682e-07, "loss": 0.258, "step": 6220 }, { "epoch": 2.780008924587238, "grad_norm": 0.25268564338580296, "learning_rate": 1.636063367854468e-07, "loss": 0.2643, "step": 6230 }, { "epoch": 2.784471218206158, "grad_norm": 0.25499560352534534, "learning_rate": 1.5708419435684463e-07, "loss": 0.2592, "step": 6240 }, { "epoch": 2.788933511825078, "grad_norm": 0.2754254434988614, "learning_rate": 1.506926374230777e-07, "loss": 0.2685, "step": 6250 }, { "epoch": 2.793395805443998, "grad_norm": 0.24677760129233578, "learning_rate": 1.4443183832755558e-07, "loss": 0.2668, "step": 6260 }, { "epoch": 2.7978580990629185, "grad_norm": 0.2419766232491537, "learning_rate": 1.3830196588790535e-07, "loss": 0.2649, "step": 6270 }, { "epoch": 2.8023203926818385, "grad_norm": 0.25179145421288907, "learning_rate": 1.3230318539141586e-07, "loss": 0.2613, "step": 6280 }, { "epoch": 2.8067826863007586, "grad_norm": 0.24418794826639928, "learning_rate": 1.2643565859058182e-07, "loss": 0.2735, "step": 6290 }, { "epoch": 2.8112449799196786, "grad_norm": 0.2484899106044706, "learning_rate": 1.206995436987457e-07, "loss": 0.2676, "step": 6300 }, { "epoch": 2.8157072735385986, "grad_norm": 0.2542428658067045, "learning_rate": 1.1509499538582768e-07, "loss": 0.2634, "step": 6310 }, { "epoch": 2.820169567157519, "grad_norm": 0.24625678831935105, "learning_rate": 1.0962216477415632e-07, "loss": 0.2644, "step": 6320 }, { "epoch": 2.824631860776439, "grad_norm": 0.2557578526542899, "learning_rate": 1.0428119943439396e-07, "loss": 0.2697, "step": 6330 }, { "epoch": 2.829094154395359, "grad_norm": 0.2626526421729072, "learning_rate": 9.907224338155774e-08, "loss": 0.2641, "step": 6340 }, { "epoch": 2.833556448014279, "grad_norm": 0.24742251678799534, "learning_rate": 9.399543707113601e-08, "loss": 0.2672, "step": 6350 }, { "epoch": 2.8380187416331992, "grad_norm": 0.24976844344830124, "learning_rate": 8.905091739530026e-08, "loss": 0.2642, "step": 6360 }, { "epoch": 2.8424810352521197, "grad_norm": 0.2434658676513343, "learning_rate": 8.423881767921637e-08, "loss": 0.2666, "step": 6370 }, { "epoch": 2.8469433288710397, "grad_norm": 0.2709418898144275, "learning_rate": 7.955926767744649e-08, "loss": 0.2678, "step": 6380 }, { "epoch": 2.8514056224899598, "grad_norm": 0.2682012558379756, "learning_rate": 7.501239357045275e-08, "loss": 0.2599, "step": 6390 }, { "epoch": 2.85586791610888, "grad_norm": 0.2541699897689904, "learning_rate": 7.059831796119243e-08, "loss": 0.2637, "step": 6400 }, { "epoch": 2.8603302097278, "grad_norm": 0.2786116189525862, "learning_rate": 6.631715987181653e-08, "loss": 0.2633, "step": 6410 }, { "epoch": 2.8647925033467203, "grad_norm": 0.2549013358787648, "learning_rate": 6.216903474045411e-08, "loss": 0.2675, "step": 6420 }, { "epoch": 2.8692547969656403, "grad_norm": 0.24035403326033933, "learning_rate": 5.815405441810584e-08, "loss": 0.2704, "step": 6430 }, { "epoch": 2.8737170905845604, "grad_norm": 0.24485345786981255, "learning_rate": 5.427232716562314e-08, "loss": 0.2654, "step": 6440 }, { "epoch": 2.878179384203481, "grad_norm": 0.2509308995806849, "learning_rate": 5.05239576507921e-08, "loss": 0.2676, "step": 6450 }, { "epoch": 2.8826416778224004, "grad_norm": 0.25125126609422216, "learning_rate": 4.690904694550913e-08, "loss": 0.2659, "step": 6460 }, { "epoch": 2.887103971441321, "grad_norm": 0.29884930114175684, "learning_rate": 4.342769252305867e-08, "loss": 0.2659, "step": 6470 }, { "epoch": 2.891566265060241, "grad_norm": 0.2621295720462137, "learning_rate": 4.007998825548032e-08, "loss": 0.2635, "step": 6480 }, { "epoch": 2.896028558679161, "grad_norm": 0.2559890093701122, "learning_rate": 3.686602441104137e-08, "loss": 0.2579, "step": 6490 }, { "epoch": 2.9004908522980815, "grad_norm": 0.294711369331024, "learning_rate": 3.378588765180268e-08, "loss": 0.2596, "step": 6500 }, { "epoch": 2.9049531459170015, "grad_norm": 0.2406776827039642, "learning_rate": 3.083966103127833e-08, "loss": 0.2628, "step": 6510 }, { "epoch": 2.9094154395359215, "grad_norm": 0.2592651922892189, "learning_rate": 2.8027423992201265e-08, "loss": 0.2586, "step": 6520 }, { "epoch": 2.9138777331548416, "grad_norm": 0.2544628675769128, "learning_rate": 2.5349252364376132e-08, "loss": 0.2636, "step": 6530 }, { "epoch": 2.9183400267737616, "grad_norm": 0.24915354540620344, "learning_rate": 2.280521836263927e-08, "loss": 0.2736, "step": 6540 }, { "epoch": 2.922802320392682, "grad_norm": 0.25636383494451315, "learning_rate": 2.0395390584908027e-08, "loss": 0.2604, "step": 6550 }, { "epoch": 2.927264614011602, "grad_norm": 0.23743990674304524, "learning_rate": 1.8119834010332236e-08, "loss": 0.2644, "step": 6560 }, { "epoch": 2.931726907630522, "grad_norm": 0.2421382237738656, "learning_rate": 1.5978609997542306e-08, "loss": 0.2695, "step": 6570 }, { "epoch": 2.936189201249442, "grad_norm": 0.25987130839419936, "learning_rate": 1.3971776282994398e-08, "loss": 0.2612, "step": 6580 }, { "epoch": 2.940651494868362, "grad_norm": 0.28118171114124185, "learning_rate": 1.2099386979414484e-08, "loss": 0.2727, "step": 6590 }, { "epoch": 2.9451137884872827, "grad_norm": 0.25875766974667813, "learning_rate": 1.0361492574337827e-08, "loss": 0.2599, "step": 6600 }, { "epoch": 2.9495760821062027, "grad_norm": 0.24856208937215735, "learning_rate": 8.758139928748966e-09, "loss": 0.2585, "step": 6610 }, { "epoch": 2.9540383757251227, "grad_norm": 0.2605005110493228, "learning_rate": 7.289372275816608e-09, "loss": 0.263, "step": 6620 }, { "epoch": 2.9585006693440428, "grad_norm": 0.27730135009271706, "learning_rate": 5.95522921973013e-09, "loss": 0.267, "step": 6630 }, { "epoch": 2.962962962962963, "grad_norm": 0.25897469150617725, "learning_rate": 4.7557467346281975e-09, "loss": 0.2682, "step": 6640 }, { "epoch": 2.9674252565818833, "grad_norm": 0.240962238635422, "learning_rate": 3.690957163633435e-09, "loss": 0.2604, "step": 6650 }, { "epoch": 2.9718875502008033, "grad_norm": 0.29524556685558273, "learning_rate": 2.760889217976459e-09, "loss": 0.2694, "step": 6660 }, { "epoch": 2.9763498438197233, "grad_norm": 0.2399731201737577, "learning_rate": 1.9655679762220494e-09, "loss": 0.2615, "step": 6670 }, { "epoch": 2.9808121374386434, "grad_norm": 0.26022562581894665, "learning_rate": 1.305014883595801e-09, "loss": 0.2624, "step": 6680 }, { "epoch": 2.9852744310575634, "grad_norm": 0.25348317394477593, "learning_rate": 7.792477514034779e-10, "loss": 0.2586, "step": 6690 }, { "epoch": 2.989736724676484, "grad_norm": 0.2596395509558711, "learning_rate": 3.882807565502855e-10, "loss": 0.2687, "step": 6700 }, { "epoch": 2.994199018295404, "grad_norm": 0.25130314991574476, "learning_rate": 1.3212444115950907e-10, "loss": 0.2606, "step": 6710 }, { "epoch": 2.998661311914324, "grad_norm": 0.27465854203632456, "learning_rate": 1.0785712290517503e-11, "loss": 0.26, "step": 6720 } ], "logging_steps": 10, "max_steps": 6723, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6290612428931072e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }