{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 13698, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002190100744634253, "grad_norm": 41.51729202270508, "learning_rate": 8.759124087591242e-08, "loss": 0.5201, "step": 10 }, { "epoch": 0.004380201489268506, "grad_norm": 75.11473846435547, "learning_rate": 3.795620437956205e-07, "loss": 0.426, "step": 20 }, { "epoch": 0.006570302233902759, "grad_norm": 46.39939880371094, "learning_rate": 6.423357664233578e-07, "loss": 0.2653, "step": 30 }, { "epoch": 0.008760402978537012, "grad_norm": 15.723746299743652, "learning_rate": 9.343065693430657e-07, "loss": 0.3463, "step": 40 }, { "epoch": 0.010950503723171266, "grad_norm": 27.370849609375, "learning_rate": 1.2262773722627739e-06, "loss": 0.2095, "step": 50 }, { "epoch": 0.013140604467805518, "grad_norm": 75.13024139404297, "learning_rate": 1.518248175182482e-06, "loss": 0.2387, "step": 60 }, { "epoch": 0.015330705212439772, "grad_norm": 37.722660064697266, "learning_rate": 1.8102189781021901e-06, "loss": 0.3115, "step": 70 }, { "epoch": 0.017520805957074025, "grad_norm": 21.071760177612305, "learning_rate": 2.102189781021898e-06, "loss": 0.192, "step": 80 }, { "epoch": 0.01971090670170828, "grad_norm": 28.379289627075195, "learning_rate": 2.394160583941606e-06, "loss": 0.1876, "step": 90 }, { "epoch": 0.021901007446342532, "grad_norm": 36.52322769165039, "learning_rate": 2.686131386861314e-06, "loss": 0.1944, "step": 100 }, { "epoch": 0.024091108190976786, "grad_norm": 20.63442039489746, "learning_rate": 2.978102189781022e-06, "loss": 0.2654, "step": 110 }, { "epoch": 0.026281208935611037, "grad_norm": 6.119592666625977, "learning_rate": 3.27007299270073e-06, "loss": 0.1559, "step": 120 }, { "epoch": 0.02847130968024529, "grad_norm": 4.615947246551514, "learning_rate": 3.5620437956204386e-06, "loss": 0.1513, "step": 130 }, { "epoch": 0.030661410424879545, "grad_norm": 18.560989379882812, "learning_rate": 3.854014598540146e-06, "loss": 0.164, "step": 140 }, { "epoch": 0.0328515111695138, "grad_norm": 30.79267692565918, "learning_rate": 4.145985401459855e-06, "loss": 0.1709, "step": 150 }, { "epoch": 0.03504161191414805, "grad_norm": 16.77621841430664, "learning_rate": 4.437956204379563e-06, "loss": 0.2884, "step": 160 }, { "epoch": 0.03723171265878231, "grad_norm": 12.920726776123047, "learning_rate": 4.729927007299271e-06, "loss": 0.2282, "step": 170 }, { "epoch": 0.03942181340341656, "grad_norm": 19.979284286499023, "learning_rate": 5.021897810218979e-06, "loss": 0.2393, "step": 180 }, { "epoch": 0.04161191414805081, "grad_norm": 39.93806457519531, "learning_rate": 5.313868613138686e-06, "loss": 0.2868, "step": 190 }, { "epoch": 0.043802014892685065, "grad_norm": 18.0382080078125, "learning_rate": 5.605839416058395e-06, "loss": 0.2218, "step": 200 }, { "epoch": 0.045992115637319315, "grad_norm": 0.10194139182567596, "learning_rate": 5.897810218978103e-06, "loss": 0.1169, "step": 210 }, { "epoch": 0.04818221638195357, "grad_norm": 14.437867164611816, "learning_rate": 6.18978102189781e-06, "loss": 0.0644, "step": 220 }, { "epoch": 0.05037231712658782, "grad_norm": 36.17061996459961, "learning_rate": 6.481751824817519e-06, "loss": 0.3643, "step": 230 }, { "epoch": 0.052562417871222074, "grad_norm": 13.471918106079102, "learning_rate": 6.773722627737227e-06, "loss": 0.3501, "step": 240 }, { "epoch": 0.05475251861585633, "grad_norm": 15.893131256103516, "learning_rate": 7.065693430656935e-06, "loss": 0.1194, "step": 250 }, { "epoch": 0.05694261936049058, "grad_norm": 13.697599411010742, "learning_rate": 7.357664233576642e-06, "loss": 0.1697, "step": 260 }, { "epoch": 0.05913272010512484, "grad_norm": 17.737754821777344, "learning_rate": 7.649635036496352e-06, "loss": 0.148, "step": 270 }, { "epoch": 0.06132282084975909, "grad_norm": 32.19956970214844, "learning_rate": 7.941605839416058e-06, "loss": 0.2896, "step": 280 }, { "epoch": 0.06351292159439334, "grad_norm": 6.542981147766113, "learning_rate": 8.233576642335766e-06, "loss": 0.1782, "step": 290 }, { "epoch": 0.0657030223390276, "grad_norm": 19.069080352783203, "learning_rate": 8.525547445255476e-06, "loss": 0.1742, "step": 300 }, { "epoch": 0.06789312308366185, "grad_norm": 11.393919944763184, "learning_rate": 8.817518248175182e-06, "loss": 0.2146, "step": 310 }, { "epoch": 0.0700832238282961, "grad_norm": 22.753799438476562, "learning_rate": 9.10948905109489e-06, "loss": 0.1672, "step": 320 }, { "epoch": 0.07227332457293036, "grad_norm": 4.803249359130859, "learning_rate": 9.4014598540146e-06, "loss": 0.3702, "step": 330 }, { "epoch": 0.07446342531756461, "grad_norm": 13.460676193237305, "learning_rate": 9.693430656934307e-06, "loss": 0.1675, "step": 340 }, { "epoch": 0.07665352606219886, "grad_norm": 12.076494216918945, "learning_rate": 9.985401459854015e-06, "loss": 0.1515, "step": 350 }, { "epoch": 0.07884362680683311, "grad_norm": 18.581897735595703, "learning_rate": 1.0277372262773724e-05, "loss": 0.1828, "step": 360 }, { "epoch": 0.08103372755146737, "grad_norm": 3.7014622688293457, "learning_rate": 1.056934306569343e-05, "loss": 0.3149, "step": 370 }, { "epoch": 0.08322382829610162, "grad_norm": 11.063940048217773, "learning_rate": 1.0861313868613139e-05, "loss": 0.2228, "step": 380 }, { "epoch": 0.08541392904073587, "grad_norm": 15.643813133239746, "learning_rate": 1.1153284671532847e-05, "loss": 0.2059, "step": 390 }, { "epoch": 0.08760402978537013, "grad_norm": 15.401198387145996, "learning_rate": 1.1445255474452555e-05, "loss": 0.2277, "step": 400 }, { "epoch": 0.08979413053000437, "grad_norm": 13.633001327514648, "learning_rate": 1.1737226277372265e-05, "loss": 0.2043, "step": 410 }, { "epoch": 0.09198423127463863, "grad_norm": 21.23131561279297, "learning_rate": 1.2029197080291973e-05, "loss": 0.1305, "step": 420 }, { "epoch": 0.09417433201927289, "grad_norm": 9.310542106628418, "learning_rate": 1.232116788321168e-05, "loss": 0.3295, "step": 430 }, { "epoch": 0.09636443276390715, "grad_norm": 0.36292779445648193, "learning_rate": 1.2613138686131387e-05, "loss": 0.1891, "step": 440 }, { "epoch": 0.09855453350854139, "grad_norm": 14.237196922302246, "learning_rate": 1.2905109489051095e-05, "loss": 0.2526, "step": 450 }, { "epoch": 0.10074463425317565, "grad_norm": 14.935922622680664, "learning_rate": 1.3197080291970803e-05, "loss": 0.2147, "step": 460 }, { "epoch": 0.1029347349978099, "grad_norm": 8.13823413848877, "learning_rate": 1.3489051094890513e-05, "loss": 0.1688, "step": 470 }, { "epoch": 0.10512483574244415, "grad_norm": 10.621323585510254, "learning_rate": 1.3781021897810221e-05, "loss": 0.1444, "step": 480 }, { "epoch": 0.1073149364870784, "grad_norm": 12.949259757995605, "learning_rate": 1.4072992700729929e-05, "loss": 0.1692, "step": 490 }, { "epoch": 0.10950503723171266, "grad_norm": 17.187467575073242, "learning_rate": 1.4364963503649635e-05, "loss": 0.2065, "step": 500 }, { "epoch": 0.1116951379763469, "grad_norm": 10.795698165893555, "learning_rate": 1.4656934306569343e-05, "loss": 0.2468, "step": 510 }, { "epoch": 0.11388523872098116, "grad_norm": 10.788908958435059, "learning_rate": 1.4948905109489051e-05, "loss": 0.1375, "step": 520 }, { "epoch": 0.11607533946561542, "grad_norm": 5.051388263702393, "learning_rate": 1.5240875912408761e-05, "loss": 0.1166, "step": 530 }, { "epoch": 0.11826544021024968, "grad_norm": 18.808759689331055, "learning_rate": 1.553284671532847e-05, "loss": 0.361, "step": 540 }, { "epoch": 0.12045554095488392, "grad_norm": 0.08614397048950195, "learning_rate": 1.5824817518248175e-05, "loss": 0.1286, "step": 550 }, { "epoch": 0.12264564169951818, "grad_norm": 16.375041961669922, "learning_rate": 1.6116788321167885e-05, "loss": 0.3507, "step": 560 }, { "epoch": 0.12483574244415244, "grad_norm": 17.1914119720459, "learning_rate": 1.640875912408759e-05, "loss": 0.21, "step": 570 }, { "epoch": 0.12702584318878668, "grad_norm": 15.989970207214355, "learning_rate": 1.67007299270073e-05, "loss": 0.2695, "step": 580 }, { "epoch": 0.12921594393342092, "grad_norm": 3.329091787338257, "learning_rate": 1.699270072992701e-05, "loss": 0.1385, "step": 590 }, { "epoch": 0.1314060446780552, "grad_norm": 0.28428786993026733, "learning_rate": 1.7284671532846717e-05, "loss": 0.1611, "step": 600 }, { "epoch": 0.13359614542268944, "grad_norm": 14.721022605895996, "learning_rate": 1.7576642335766424e-05, "loss": 0.1673, "step": 610 }, { "epoch": 0.1357862461673237, "grad_norm": 0.41195258498191833, "learning_rate": 1.7868613138686133e-05, "loss": 0.2175, "step": 620 }, { "epoch": 0.13797634691195795, "grad_norm": 465.7552795410156, "learning_rate": 1.816058394160584e-05, "loss": 0.471, "step": 630 }, { "epoch": 0.1401664476565922, "grad_norm": 13.02332592010498, "learning_rate": 1.845255474452555e-05, "loss": 0.1859, "step": 640 }, { "epoch": 0.14235654840122647, "grad_norm": 2.651271104812622, "learning_rate": 1.874452554744526e-05, "loss": 0.1304, "step": 650 }, { "epoch": 0.1445466491458607, "grad_norm": 16.789899826049805, "learning_rate": 1.9036496350364966e-05, "loss": 0.2003, "step": 660 }, { "epoch": 0.14673674989049496, "grad_norm": 15.49885082244873, "learning_rate": 1.9328467153284672e-05, "loss": 0.1423, "step": 670 }, { "epoch": 0.14892685063512923, "grad_norm": 1.058356761932373, "learning_rate": 1.962043795620438e-05, "loss": 0.1385, "step": 680 }, { "epoch": 0.15111695137976347, "grad_norm": 20.688344955444336, "learning_rate": 1.9912408759124088e-05, "loss": 0.2144, "step": 690 }, { "epoch": 0.1533070521243977, "grad_norm": 40.343135833740234, "learning_rate": 1.9999985720570166e-05, "loss": 0.3484, "step": 700 }, { "epoch": 0.15549715286903198, "grad_norm": 35.53125762939453, "learning_rate": 1.9999915780603835e-05, "loss": 0.2255, "step": 710 }, { "epoch": 0.15768725361366623, "grad_norm": 11.476936340332031, "learning_rate": 1.9999787557755714e-05, "loss": 0.2389, "step": 720 }, { "epoch": 0.15987735435830047, "grad_norm": 15.674410820007324, "learning_rate": 1.999960105277313e-05, "loss": 0.2851, "step": 730 }, { "epoch": 0.16206745510293474, "grad_norm": 1.6132586002349854, "learning_rate": 1.9999356266743096e-05, "loss": 0.2565, "step": 740 }, { "epoch": 0.164257555847569, "grad_norm": 17.173669815063477, "learning_rate": 1.9999053201092308e-05, "loss": 0.2432, "step": 750 }, { "epoch": 0.16644765659220323, "grad_norm": 12.896418571472168, "learning_rate": 1.9998691857587134e-05, "loss": 0.3666, "step": 760 }, { "epoch": 0.1686377573368375, "grad_norm": 2.9710299968719482, "learning_rate": 1.9998272238333606e-05, "loss": 0.2806, "step": 770 }, { "epoch": 0.17082785808147175, "grad_norm": 10.651814460754395, "learning_rate": 1.999779434577741e-05, "loss": 0.1904, "step": 780 }, { "epoch": 0.173017958826106, "grad_norm": 8.84263801574707, "learning_rate": 1.999725818270386e-05, "loss": 0.153, "step": 790 }, { "epoch": 0.17520805957074026, "grad_norm": 7.657432556152344, "learning_rate": 1.99966637522379e-05, "loss": 0.2019, "step": 800 }, { "epoch": 0.1773981603153745, "grad_norm": 6.842016220092773, "learning_rate": 1.9996011057844063e-05, "loss": 0.148, "step": 810 }, { "epoch": 0.17958826106000875, "grad_norm": 5.838778972625732, "learning_rate": 1.999530010332648e-05, "loss": 0.1086, "step": 820 }, { "epoch": 0.18177836180464302, "grad_norm": 4.016898155212402, "learning_rate": 1.9994530892828832e-05, "loss": 0.1749, "step": 830 }, { "epoch": 0.18396846254927726, "grad_norm": 0.11960650235414505, "learning_rate": 1.9993703430834336e-05, "loss": 0.0834, "step": 840 }, { "epoch": 0.18615856329391153, "grad_norm": 11.673768997192383, "learning_rate": 1.999281772216572e-05, "loss": 0.1763, "step": 850 }, { "epoch": 0.18834866403854578, "grad_norm": 10.15770149230957, "learning_rate": 1.999187377198519e-05, "loss": 0.1304, "step": 860 }, { "epoch": 0.19053876478318002, "grad_norm": 21.572341918945312, "learning_rate": 1.9990871585794416e-05, "loss": 0.1716, "step": 870 }, { "epoch": 0.1927288655278143, "grad_norm": 0.3874415457248688, "learning_rate": 1.9989811169434463e-05, "loss": 0.0892, "step": 880 }, { "epoch": 0.19491896627244854, "grad_norm": 0.9225696921348572, "learning_rate": 1.9988692529085807e-05, "loss": 0.2901, "step": 890 }, { "epoch": 0.19710906701708278, "grad_norm": 34.34111785888672, "learning_rate": 1.9987755699904926e-05, "loss": 0.7586, "step": 900 }, { "epoch": 0.19929916776171705, "grad_norm": 24.94373893737793, "learning_rate": 1.998653227303454e-05, "loss": 0.2473, "step": 910 }, { "epoch": 0.2014892685063513, "grad_norm": 10.81885814666748, "learning_rate": 1.998538142347011e-05, "loss": 0.239, "step": 920 }, { "epoch": 0.20367936925098554, "grad_norm": 25.226037979125977, "learning_rate": 1.9984047413708153e-05, "loss": 0.2572, "step": 930 }, { "epoch": 0.2058694699956198, "grad_norm": 21.34076499938965, "learning_rate": 1.99826552135506e-05, "loss": 0.1144, "step": 940 }, { "epoch": 0.20805957074025405, "grad_norm": 5.700305938720703, "learning_rate": 1.9981204831111664e-05, "loss": 0.2097, "step": 950 }, { "epoch": 0.2102496714848883, "grad_norm": 11.903735160827637, "learning_rate": 1.9979696274844665e-05, "loss": 0.093, "step": 960 }, { "epoch": 0.21243977222952257, "grad_norm": 19.8076171875, "learning_rate": 1.9978129553541974e-05, "loss": 0.1385, "step": 970 }, { "epoch": 0.2146298729741568, "grad_norm": 11.645663261413574, "learning_rate": 1.997650467633497e-05, "loss": 0.1678, "step": 980 }, { "epoch": 0.21681997371879105, "grad_norm": 16.10714340209961, "learning_rate": 1.997482165269399e-05, "loss": 0.1675, "step": 990 }, { "epoch": 0.21901007446342532, "grad_norm": 0.906162440776825, "learning_rate": 1.9973080492428264e-05, "loss": 0.1431, "step": 1000 }, { "epoch": 0.22120017520805957, "grad_norm": 5.618923187255859, "learning_rate": 1.9971281205685852e-05, "loss": 0.2302, "step": 1010 }, { "epoch": 0.2233902759526938, "grad_norm": 6.884607315063477, "learning_rate": 1.9969423802953612e-05, "loss": 0.2095, "step": 1020 }, { "epoch": 0.22558037669732808, "grad_norm": 4.537491798400879, "learning_rate": 1.9967508295057117e-05, "loss": 0.1274, "step": 1030 }, { "epoch": 0.22777047744196233, "grad_norm": 4.935811996459961, "learning_rate": 1.9965534693160586e-05, "loss": 0.0911, "step": 1040 }, { "epoch": 0.22996057818659657, "grad_norm": 7.54110050201416, "learning_rate": 1.9963503008766836e-05, "loss": 0.192, "step": 1050 }, { "epoch": 0.23215067893123084, "grad_norm": 23.602153778076172, "learning_rate": 1.9961413253717214e-05, "loss": 0.2447, "step": 1060 }, { "epoch": 0.23434077967586509, "grad_norm": 22.559946060180664, "learning_rate": 1.995926544019151e-05, "loss": 0.1912, "step": 1070 }, { "epoch": 0.23653088042049936, "grad_norm": 6.328823089599609, "learning_rate": 1.9957059580707916e-05, "loss": 0.1215, "step": 1080 }, { "epoch": 0.2387209811651336, "grad_norm": 9.642956733703613, "learning_rate": 1.9954795688122916e-05, "loss": 0.1361, "step": 1090 }, { "epoch": 0.24091108190976784, "grad_norm": 5.040389060974121, "learning_rate": 1.9952473775631246e-05, "loss": 0.0867, "step": 1100 }, { "epoch": 0.24310118265440211, "grad_norm": 18.554792404174805, "learning_rate": 1.9950093856765788e-05, "loss": 0.1347, "step": 1110 }, { "epoch": 0.24529128339903636, "grad_norm": 0.9616299271583557, "learning_rate": 1.9947655945397517e-05, "loss": 0.1889, "step": 1120 }, { "epoch": 0.2474813841436706, "grad_norm": 22.05847930908203, "learning_rate": 1.9945160055735402e-05, "loss": 0.0474, "step": 1130 }, { "epoch": 0.24967148488830487, "grad_norm": 11.809725761413574, "learning_rate": 1.9942606202326326e-05, "loss": 0.1253, "step": 1140 }, { "epoch": 0.2518615856329391, "grad_norm": 4.009741306304932, "learning_rate": 1.9939994400055015e-05, "loss": 0.1757, "step": 1150 }, { "epoch": 0.25405168637757336, "grad_norm": 7.047018527984619, "learning_rate": 1.993732466414393e-05, "loss": 0.1865, "step": 1160 }, { "epoch": 0.25624178712220763, "grad_norm": 18.014875411987305, "learning_rate": 1.993459701015319e-05, "loss": 0.1133, "step": 1170 }, { "epoch": 0.25843188786684185, "grad_norm": 0.3730584383010864, "learning_rate": 1.9931811453980482e-05, "loss": 0.1301, "step": 1180 }, { "epoch": 0.2606219886114761, "grad_norm": 20.2712345123291, "learning_rate": 1.9928968011860973e-05, "loss": 0.1874, "step": 1190 }, { "epoch": 0.2628120893561104, "grad_norm": 3.2324841022491455, "learning_rate": 1.9926066700367196e-05, "loss": 0.1431, "step": 1200 }, { "epoch": 0.26500219010074466, "grad_norm": 6.957901477813721, "learning_rate": 1.992310753640898e-05, "loss": 0.1765, "step": 1210 }, { "epoch": 0.2671922908453789, "grad_norm": 12.450915336608887, "learning_rate": 1.9920090537233322e-05, "loss": 0.17, "step": 1220 }, { "epoch": 0.26938239159001315, "grad_norm": 14.78199577331543, "learning_rate": 1.9917015720424317e-05, "loss": 0.1006, "step": 1230 }, { "epoch": 0.2715724923346474, "grad_norm": 17.788541793823242, "learning_rate": 1.991388310390303e-05, "loss": 0.1526, "step": 1240 }, { "epoch": 0.27376259307928164, "grad_norm": 16.428804397583008, "learning_rate": 1.9910692705927413e-05, "loss": 0.147, "step": 1250 }, { "epoch": 0.2759526938239159, "grad_norm": 1.2698190212249756, "learning_rate": 1.9907444545092175e-05, "loss": 0.0905, "step": 1260 }, { "epoch": 0.2781427945685502, "grad_norm": 5.782829284667969, "learning_rate": 1.990413864032869e-05, "loss": 0.2471, "step": 1270 }, { "epoch": 0.2803328953131844, "grad_norm": 0.30539631843566895, "learning_rate": 1.99007750109049e-05, "loss": 0.3544, "step": 1280 }, { "epoch": 0.28252299605781866, "grad_norm": 13.326297760009766, "learning_rate": 1.989735367642516e-05, "loss": 0.1699, "step": 1290 }, { "epoch": 0.28471309680245294, "grad_norm": 12.966431617736816, "learning_rate": 1.9893874656830162e-05, "loss": 0.181, "step": 1300 }, { "epoch": 0.28690319754708715, "grad_norm": 7.286040782928467, "learning_rate": 1.9890337972396808e-05, "loss": 0.1532, "step": 1310 }, { "epoch": 0.2890932982917214, "grad_norm": 9.810023307800293, "learning_rate": 1.988674364373809e-05, "loss": 0.1667, "step": 1320 }, { "epoch": 0.2912833990363557, "grad_norm": 3.064018726348877, "learning_rate": 1.9883091691802963e-05, "loss": 0.0846, "step": 1330 }, { "epoch": 0.2934734997809899, "grad_norm": 10.534605026245117, "learning_rate": 1.9879382137876234e-05, "loss": 0.1707, "step": 1340 }, { "epoch": 0.2956636005256242, "grad_norm": 10.480161666870117, "learning_rate": 1.987561500357844e-05, "loss": 0.1549, "step": 1350 }, { "epoch": 0.29785370127025845, "grad_norm": 10.970039367675781, "learning_rate": 1.9871790310865707e-05, "loss": 0.1252, "step": 1360 }, { "epoch": 0.30004380201489267, "grad_norm": 8.283377647399902, "learning_rate": 1.9867908082029634e-05, "loss": 0.1159, "step": 1370 }, { "epoch": 0.30223390275952694, "grad_norm": 0.3829292356967926, "learning_rate": 1.986396833969716e-05, "loss": 0.1105, "step": 1380 }, { "epoch": 0.3044240035041612, "grad_norm": 5.1020002365112305, "learning_rate": 1.9859971106830435e-05, "loss": 0.1562, "step": 1390 }, { "epoch": 0.3066141042487954, "grad_norm": 9.880990028381348, "learning_rate": 1.9855916406726672e-05, "loss": 0.1278, "step": 1400 }, { "epoch": 0.3088042049934297, "grad_norm": 9.246286392211914, "learning_rate": 1.9851804263018044e-05, "loss": 0.1479, "step": 1410 }, { "epoch": 0.31099430573806397, "grad_norm": 11.83699893951416, "learning_rate": 1.9847634699671497e-05, "loss": 0.2778, "step": 1420 }, { "epoch": 0.3131844064826982, "grad_norm": 9.11435317993164, "learning_rate": 1.984340774098866e-05, "loss": 0.1264, "step": 1430 }, { "epoch": 0.31537450722733246, "grad_norm": 20.10544204711914, "learning_rate": 1.9839123411605677e-05, "loss": 0.256, "step": 1440 }, { "epoch": 0.31756460797196673, "grad_norm": 3.6529557704925537, "learning_rate": 1.9834781736493057e-05, "loss": 0.1673, "step": 1450 }, { "epoch": 0.31975470871660094, "grad_norm": 12.054941177368164, "learning_rate": 1.983038274095555e-05, "loss": 0.1754, "step": 1460 }, { "epoch": 0.3219448094612352, "grad_norm": 20.188112258911133, "learning_rate": 1.9825926450631988e-05, "loss": 0.088, "step": 1470 }, { "epoch": 0.3241349102058695, "grad_norm": 6.536449432373047, "learning_rate": 1.982141289149513e-05, "loss": 0.1305, "step": 1480 }, { "epoch": 0.3263250109505037, "grad_norm": 5.705020427703857, "learning_rate": 1.9816842089851525e-05, "loss": 0.1246, "step": 1490 }, { "epoch": 0.328515111695138, "grad_norm": 2.411208152770996, "learning_rate": 1.981221407234134e-05, "loss": 0.1837, "step": 1500 }, { "epoch": 0.33070521243977224, "grad_norm": 36.575347900390625, "learning_rate": 1.980752886593823e-05, "loss": 0.2393, "step": 1510 }, { "epoch": 0.33289531318440646, "grad_norm": 2.5054516792297363, "learning_rate": 1.9802786497949156e-05, "loss": 0.1751, "step": 1520 }, { "epoch": 0.33508541392904073, "grad_norm": 2.9033756256103516, "learning_rate": 1.9797986996014233e-05, "loss": 0.0972, "step": 1530 }, { "epoch": 0.337275514673675, "grad_norm": 4.304263114929199, "learning_rate": 1.9793130388106584e-05, "loss": 0.1424, "step": 1540 }, { "epoch": 0.3394656154183092, "grad_norm": 6.944413185119629, "learning_rate": 1.9788216702532147e-05, "loss": 0.098, "step": 1550 }, { "epoch": 0.3416557161629435, "grad_norm": 5.566818714141846, "learning_rate": 1.9783245967929554e-05, "loss": 0.0857, "step": 1560 }, { "epoch": 0.34384581690757776, "grad_norm": 3.921610116958618, "learning_rate": 1.9778218213269912e-05, "loss": 0.2772, "step": 1570 }, { "epoch": 0.346035917652212, "grad_norm": 4.925644874572754, "learning_rate": 1.9773133467856672e-05, "loss": 0.1331, "step": 1580 }, { "epoch": 0.34822601839684625, "grad_norm": 4.444881916046143, "learning_rate": 1.9767991761325447e-05, "loss": 0.1806, "step": 1590 }, { "epoch": 0.3504161191414805, "grad_norm": 1.2111713886260986, "learning_rate": 1.9762793123643836e-05, "loss": 0.0859, "step": 1600 }, { "epoch": 0.35260621988611474, "grad_norm": 1.4036651849746704, "learning_rate": 1.9757537585111257e-05, "loss": 0.1833, "step": 1610 }, { "epoch": 0.354796320630749, "grad_norm": 23.02633285522461, "learning_rate": 1.9752225176358757e-05, "loss": 0.1043, "step": 1620 }, { "epoch": 0.3569864213753833, "grad_norm": 2.471585988998413, "learning_rate": 1.974685592834884e-05, "loss": 0.1683, "step": 1630 }, { "epoch": 0.3591765221200175, "grad_norm": 5.386203765869141, "learning_rate": 1.974142987237531e-05, "loss": 0.1138, "step": 1640 }, { "epoch": 0.36136662286465177, "grad_norm": 12.106959342956543, "learning_rate": 1.9735947040063036e-05, "loss": 0.1969, "step": 1650 }, { "epoch": 0.36355672360928604, "grad_norm": 14.18044662475586, "learning_rate": 1.973040746336782e-05, "loss": 0.13, "step": 1660 }, { "epoch": 0.3657468243539203, "grad_norm": 1.5135153532028198, "learning_rate": 1.9724811174576182e-05, "loss": 0.1402, "step": 1670 }, { "epoch": 0.3679369250985545, "grad_norm": 4.524163722991943, "learning_rate": 1.9719158206305182e-05, "loss": 0.1106, "step": 1680 }, { "epoch": 0.3701270258431888, "grad_norm": 15.274569511413574, "learning_rate": 1.9713448591502228e-05, "loss": 0.2612, "step": 1690 }, { "epoch": 0.37231712658782307, "grad_norm": 3.9119160175323486, "learning_rate": 1.9707682363444872e-05, "loss": 0.1259, "step": 1700 }, { "epoch": 0.3745072273324573, "grad_norm": 7.143598556518555, "learning_rate": 1.9701859555740647e-05, "loss": 0.0751, "step": 1710 }, { "epoch": 0.37669732807709155, "grad_norm": 23.19759178161621, "learning_rate": 1.969598020232683e-05, "loss": 0.1546, "step": 1720 }, { "epoch": 0.3788874288217258, "grad_norm": 18.014686584472656, "learning_rate": 1.9690044337470286e-05, "loss": 0.1511, "step": 1730 }, { "epoch": 0.38107752956636004, "grad_norm": 6.412353038787842, "learning_rate": 1.968405199576723e-05, "loss": 0.0873, "step": 1740 }, { "epoch": 0.3832676303109943, "grad_norm": 15.597745895385742, "learning_rate": 1.967800321214305e-05, "loss": 0.1212, "step": 1750 }, { "epoch": 0.3854577310556286, "grad_norm": 4.399725437164307, "learning_rate": 1.96718980218521e-05, "loss": 0.1323, "step": 1760 }, { "epoch": 0.3876478318002628, "grad_norm": 12.94128131866455, "learning_rate": 1.9665736460477487e-05, "loss": 0.2107, "step": 1770 }, { "epoch": 0.38983793254489707, "grad_norm": 17.424596786499023, "learning_rate": 1.965951856393087e-05, "loss": 0.0823, "step": 1780 }, { "epoch": 0.39202803328953134, "grad_norm": 4.065990447998047, "learning_rate": 1.9653244368452245e-05, "loss": 0.1223, "step": 1790 }, { "epoch": 0.39421813403416556, "grad_norm": 15.39638900756836, "learning_rate": 1.964691391060974e-05, "loss": 0.0741, "step": 1800 }, { "epoch": 0.39640823477879983, "grad_norm": 2.373101234436035, "learning_rate": 1.96405272272994e-05, "loss": 0.1479, "step": 1810 }, { "epoch": 0.3985983355234341, "grad_norm": 11.513437271118164, "learning_rate": 1.963408435574497e-05, "loss": 0.1999, "step": 1820 }, { "epoch": 0.4007884362680683, "grad_norm": 11.093876838684082, "learning_rate": 1.962758533349767e-05, "loss": 0.0611, "step": 1830 }, { "epoch": 0.4029785370127026, "grad_norm": 0.8569280505180359, "learning_rate": 1.9621030198436007e-05, "loss": 0.1153, "step": 1840 }, { "epoch": 0.40516863775733686, "grad_norm": 9.452397346496582, "learning_rate": 1.961441898876551e-05, "loss": 0.1847, "step": 1850 }, { "epoch": 0.4073587385019711, "grad_norm": 11.130989074707031, "learning_rate": 1.960775174301854e-05, "loss": 0.1588, "step": 1860 }, { "epoch": 0.40954883924660535, "grad_norm": 1.6233234405517578, "learning_rate": 1.9601028500054053e-05, "loss": 0.1262, "step": 1870 }, { "epoch": 0.4117389399912396, "grad_norm": 15.054749488830566, "learning_rate": 1.9594249299057383e-05, "loss": 0.2018, "step": 1880 }, { "epoch": 0.41392904073587383, "grad_norm": 11.786283493041992, "learning_rate": 1.958741417953999e-05, "loss": 0.1371, "step": 1890 }, { "epoch": 0.4161191414805081, "grad_norm": 13.437487602233887, "learning_rate": 1.9580523181339264e-05, "loss": 0.1171, "step": 1900 }, { "epoch": 0.4183092422251424, "grad_norm": 8.45596981048584, "learning_rate": 1.9573576344618258e-05, "loss": 0.1023, "step": 1910 }, { "epoch": 0.4204993429697766, "grad_norm": 3.717137336730957, "learning_rate": 1.9566573709865492e-05, "loss": 0.0869, "step": 1920 }, { "epoch": 0.42268944371441086, "grad_norm": 12.962523460388184, "learning_rate": 1.955951531789467e-05, "loss": 0.2222, "step": 1930 }, { "epoch": 0.42487954445904513, "grad_norm": 24.712831497192383, "learning_rate": 1.955240120984449e-05, "loss": 0.1482, "step": 1940 }, { "epoch": 0.42706964520367935, "grad_norm": 1.6323201656341553, "learning_rate": 1.954523142717837e-05, "loss": 0.1218, "step": 1950 }, { "epoch": 0.4292597459483136, "grad_norm": 22.809532165527344, "learning_rate": 1.9538006011684218e-05, "loss": 0.1223, "step": 1960 }, { "epoch": 0.4314498466929479, "grad_norm": 1.0428169965744019, "learning_rate": 1.9530725005474195e-05, "loss": 0.1138, "step": 1970 }, { "epoch": 0.4336399474375821, "grad_norm": 5.233839511871338, "learning_rate": 1.9523388450984465e-05, "loss": 0.2357, "step": 1980 }, { "epoch": 0.4358300481822164, "grad_norm": 8.51112174987793, "learning_rate": 1.9515996390974935e-05, "loss": 0.1585, "step": 1990 }, { "epoch": 0.43802014892685065, "grad_norm": 2.8058927059173584, "learning_rate": 1.950854886852903e-05, "loss": 0.0931, "step": 2000 }, { "epoch": 0.44021024967148487, "grad_norm": 3.7863986492156982, "learning_rate": 1.950104592705342e-05, "loss": 0.0875, "step": 2010 }, { "epoch": 0.44240035041611914, "grad_norm": 4.907229900360107, "learning_rate": 1.9493487610277776e-05, "loss": 0.1334, "step": 2020 }, { "epoch": 0.4445904511607534, "grad_norm": 10.500575065612793, "learning_rate": 1.948587396225452e-05, "loss": 0.1936, "step": 2030 }, { "epoch": 0.4467805519053876, "grad_norm": 15.607919692993164, "learning_rate": 1.9478205027358557e-05, "loss": 0.1528, "step": 2040 }, { "epoch": 0.4489706526500219, "grad_norm": 12.086151123046875, "learning_rate": 1.9470480850287036e-05, "loss": 0.0577, "step": 2050 }, { "epoch": 0.45116075339465617, "grad_norm": 2.9599990844726562, "learning_rate": 1.9462701476059054e-05, "loss": 0.087, "step": 2060 }, { "epoch": 0.4533508541392904, "grad_norm": 8.690507888793945, "learning_rate": 1.9454866950015433e-05, "loss": 0.1368, "step": 2070 }, { "epoch": 0.45554095488392465, "grad_norm": 7.167850017547607, "learning_rate": 1.9446977317818433e-05, "loss": 0.1244, "step": 2080 }, { "epoch": 0.4577310556285589, "grad_norm": 4.184047222137451, "learning_rate": 1.9439032625451495e-05, "loss": 0.0673, "step": 2090 }, { "epoch": 0.45992115637319314, "grad_norm": 0.35424381494522095, "learning_rate": 1.9431032919218957e-05, "loss": 0.1199, "step": 2100 }, { "epoch": 0.4621112571178274, "grad_norm": 11.39419937133789, "learning_rate": 1.942297824574581e-05, "loss": 0.1626, "step": 2110 }, { "epoch": 0.4643013578624617, "grad_norm": 3.6221561431884766, "learning_rate": 1.941486865197741e-05, "loss": 0.1127, "step": 2120 }, { "epoch": 0.4664914586070959, "grad_norm": 8.206335067749023, "learning_rate": 1.9406704185179203e-05, "loss": 0.2068, "step": 2130 }, { "epoch": 0.46868155935173017, "grad_norm": 11.432991981506348, "learning_rate": 1.9398484892936448e-05, "loss": 0.0654, "step": 2140 }, { "epoch": 0.47087166009636444, "grad_norm": 0.40915176272392273, "learning_rate": 1.9390210823153964e-05, "loss": 0.0816, "step": 2150 }, { "epoch": 0.4730617608409987, "grad_norm": 11.091645240783691, "learning_rate": 1.9381882024055813e-05, "loss": 0.1029, "step": 2160 }, { "epoch": 0.47525186158563293, "grad_norm": 2.5653114318847656, "learning_rate": 1.9373498544185046e-05, "loss": 0.1128, "step": 2170 }, { "epoch": 0.4774419623302672, "grad_norm": 7.6043701171875, "learning_rate": 1.936506043240341e-05, "loss": 0.1958, "step": 2180 }, { "epoch": 0.47963206307490147, "grad_norm": 25.004302978515625, "learning_rate": 1.935656773789107e-05, "loss": 0.2273, "step": 2190 }, { "epoch": 0.4818221638195357, "grad_norm": 11.13805866241455, "learning_rate": 1.9348020510146314e-05, "loss": 0.1587, "step": 2200 }, { "epoch": 0.48401226456416996, "grad_norm": 19.71245002746582, "learning_rate": 1.933941879898526e-05, "loss": 0.2022, "step": 2210 }, { "epoch": 0.48620236530880423, "grad_norm": 6.077335357666016, "learning_rate": 1.933076265454159e-05, "loss": 0.1756, "step": 2220 }, { "epoch": 0.48839246605343845, "grad_norm": 1.910692811012268, "learning_rate": 1.9322052127266234e-05, "loss": 0.1236, "step": 2230 }, { "epoch": 0.4905825667980727, "grad_norm": 6.62217903137207, "learning_rate": 1.9313287267927077e-05, "loss": 0.159, "step": 2240 }, { "epoch": 0.492772667542707, "grad_norm": 1.770951509475708, "learning_rate": 1.930446812760868e-05, "loss": 0.1087, "step": 2250 }, { "epoch": 0.4949627682873412, "grad_norm": 23.67471694946289, "learning_rate": 1.9295594757711963e-05, "loss": 0.1274, "step": 2260 }, { "epoch": 0.4971528690319755, "grad_norm": 12.301440238952637, "learning_rate": 1.9286667209953924e-05, "loss": 0.1076, "step": 2270 }, { "epoch": 0.49934296977660975, "grad_norm": 7.678272724151611, "learning_rate": 1.9277685536367313e-05, "loss": 0.0928, "step": 2280 }, { "epoch": 0.501533070521244, "grad_norm": 3.765711545944214, "learning_rate": 1.9268649789300362e-05, "loss": 0.0789, "step": 2290 }, { "epoch": 0.5037231712658782, "grad_norm": 9.533684730529785, "learning_rate": 1.925956002141645e-05, "loss": 0.0817, "step": 2300 }, { "epoch": 0.5059132720105125, "grad_norm": 0.35622021555900574, "learning_rate": 1.9250416285693806e-05, "loss": 0.0848, "step": 2310 }, { "epoch": 0.5081033727551467, "grad_norm": 0.31084880232810974, "learning_rate": 1.9241218635425212e-05, "loss": 0.2882, "step": 2320 }, { "epoch": 0.5102934734997809, "grad_norm": 21.496559143066406, "learning_rate": 1.923196712421767e-05, "loss": 0.118, "step": 2330 }, { "epoch": 0.5124835742444153, "grad_norm": 20.486251831054688, "learning_rate": 1.922266180599211e-05, "loss": 0.131, "step": 2340 }, { "epoch": 0.5146736749890495, "grad_norm": 6.7747111320495605, "learning_rate": 1.9213302734983065e-05, "loss": 0.1314, "step": 2350 }, { "epoch": 0.5168637757336837, "grad_norm": 4.821852207183838, "learning_rate": 1.9203889965738354e-05, "loss": 0.0871, "step": 2360 }, { "epoch": 0.519053876478318, "grad_norm": 10.27305793762207, "learning_rate": 1.919442355311878e-05, "loss": 0.1191, "step": 2370 }, { "epoch": 0.5212439772229522, "grad_norm": 0.5141002535820007, "learning_rate": 1.918490355229778e-05, "loss": 0.0803, "step": 2380 }, { "epoch": 0.5234340779675866, "grad_norm": 2.369067907333374, "learning_rate": 1.917533001876113e-05, "loss": 0.1538, "step": 2390 }, { "epoch": 0.5256241787122208, "grad_norm": 8.9265775680542, "learning_rate": 1.9165703008306615e-05, "loss": 0.123, "step": 2400 }, { "epoch": 0.527814279456855, "grad_norm": 0.5889332294464111, "learning_rate": 1.91560225770437e-05, "loss": 0.2033, "step": 2410 }, { "epoch": 0.5300043802014893, "grad_norm": 4.266817092895508, "learning_rate": 1.91462887813932e-05, "loss": 0.0813, "step": 2420 }, { "epoch": 0.5321944809461235, "grad_norm": 15.837199211120605, "learning_rate": 1.913650167808696e-05, "loss": 0.193, "step": 2430 }, { "epoch": 0.5343845816907578, "grad_norm": 4.093066215515137, "learning_rate": 1.9126661324167523e-05, "loss": 0.2352, "step": 2440 }, { "epoch": 0.5365746824353921, "grad_norm": 11.454073905944824, "learning_rate": 1.911676777698778e-05, "loss": 0.2219, "step": 2450 }, { "epoch": 0.5387647831800263, "grad_norm": 9.941503524780273, "learning_rate": 1.910682109421067e-05, "loss": 0.1744, "step": 2460 }, { "epoch": 0.5409548839246605, "grad_norm": 24.301671981811523, "learning_rate": 1.9096821333808814e-05, "loss": 0.0684, "step": 2470 }, { "epoch": 0.5431449846692948, "grad_norm": 10.73775577545166, "learning_rate": 1.908676855406418e-05, "loss": 0.0971, "step": 2480 }, { "epoch": 0.545335085413929, "grad_norm": 6.897889614105225, "learning_rate": 1.9076662813567772e-05, "loss": 0.1516, "step": 2490 }, { "epoch": 0.5475251861585633, "grad_norm": 10.118902206420898, "learning_rate": 1.9066504171219245e-05, "loss": 0.1161, "step": 2500 }, { "epoch": 0.5497152869031976, "grad_norm": 13.134893417358398, "learning_rate": 1.9056292686226596e-05, "loss": 0.2371, "step": 2510 }, { "epoch": 0.5519053876478318, "grad_norm": 14.983389854431152, "learning_rate": 1.9046028418105797e-05, "loss": 0.0873, "step": 2520 }, { "epoch": 0.554095488392466, "grad_norm": 15.583308219909668, "learning_rate": 1.9035711426680475e-05, "loss": 0.1192, "step": 2530 }, { "epoch": 0.5562855891371004, "grad_norm": 0.6449429392814636, "learning_rate": 1.902534177208153e-05, "loss": 0.0601, "step": 2540 }, { "epoch": 0.5584756898817346, "grad_norm": 3.2007322311401367, "learning_rate": 1.9014919514746806e-05, "loss": 0.1739, "step": 2550 }, { "epoch": 0.5606657906263688, "grad_norm": 25.884014129638672, "learning_rate": 1.900444471542073e-05, "loss": 0.1836, "step": 2560 }, { "epoch": 0.5628558913710031, "grad_norm": 2.932692527770996, "learning_rate": 1.8993917435153976e-05, "loss": 0.1031, "step": 2570 }, { "epoch": 0.5650459921156373, "grad_norm": 18.582134246826172, "learning_rate": 1.8983337735303074e-05, "loss": 0.1365, "step": 2580 }, { "epoch": 0.5672360928602715, "grad_norm": 11.569768905639648, "learning_rate": 1.8972705677530087e-05, "loss": 0.0881, "step": 2590 }, { "epoch": 0.5694261936049059, "grad_norm": 1.2210649251937866, "learning_rate": 1.8962021323802234e-05, "loss": 0.0914, "step": 2600 }, { "epoch": 0.5716162943495401, "grad_norm": 10.461421966552734, "learning_rate": 1.895128473639153e-05, "loss": 0.1618, "step": 2610 }, { "epoch": 0.5738063950941743, "grad_norm": 1.465786099433899, "learning_rate": 1.894049597787443e-05, "loss": 0.1215, "step": 2620 }, { "epoch": 0.5759964958388086, "grad_norm": 6.987858295440674, "learning_rate": 1.8929655111131454e-05, "loss": 0.0792, "step": 2630 }, { "epoch": 0.5781865965834428, "grad_norm": 9.723291397094727, "learning_rate": 1.891876219934683e-05, "loss": 0.1361, "step": 2640 }, { "epoch": 0.5803766973280771, "grad_norm": 10.994476318359375, "learning_rate": 1.8907817306008123e-05, "loss": 0.1, "step": 2650 }, { "epoch": 0.5825667980727114, "grad_norm": 13.831147193908691, "learning_rate": 1.8896820494905867e-05, "loss": 0.2563, "step": 2660 }, { "epoch": 0.5847568988173456, "grad_norm": 9.939831733703613, "learning_rate": 1.888577183013318e-05, "loss": 0.1356, "step": 2670 }, { "epoch": 0.5869469995619798, "grad_norm": 0.6647922992706299, "learning_rate": 1.8874671376085408e-05, "loss": 0.0686, "step": 2680 }, { "epoch": 0.5891371003066141, "grad_norm": 16.246030807495117, "learning_rate": 1.886351919745974e-05, "loss": 0.1756, "step": 2690 }, { "epoch": 0.5913272010512484, "grad_norm": 11.162679672241211, "learning_rate": 1.885231535925484e-05, "loss": 0.1422, "step": 2700 }, { "epoch": 0.5935173017958826, "grad_norm": 11.898168563842773, "learning_rate": 1.8841059926770443e-05, "loss": 0.2806, "step": 2710 }, { "epoch": 0.5957074025405169, "grad_norm": 27.269563674926758, "learning_rate": 1.882975296560702e-05, "loss": 0.2201, "step": 2720 }, { "epoch": 0.5978975032851511, "grad_norm": 6.5583176612854, "learning_rate": 1.8818394541665345e-05, "loss": 0.1142, "step": 2730 }, { "epoch": 0.6000876040297853, "grad_norm": 10.405722618103027, "learning_rate": 1.880698472114615e-05, "loss": 0.1257, "step": 2740 }, { "epoch": 0.6022777047744197, "grad_norm": 1.1365935802459717, "learning_rate": 1.879552357054971e-05, "loss": 0.0706, "step": 2750 }, { "epoch": 0.6044678055190539, "grad_norm": 0.08854208886623383, "learning_rate": 1.8784011156675483e-05, "loss": 0.1092, "step": 2760 }, { "epoch": 0.6066579062636881, "grad_norm": 0.2354384958744049, "learning_rate": 1.8772447546621696e-05, "loss": 0.1224, "step": 2770 }, { "epoch": 0.6088480070083224, "grad_norm": 29.68575096130371, "learning_rate": 1.8760832807784967e-05, "loss": 0.2276, "step": 2780 }, { "epoch": 0.6110381077529566, "grad_norm": 9.590601921081543, "learning_rate": 1.8749167007859913e-05, "loss": 0.0895, "step": 2790 }, { "epoch": 0.6132282084975909, "grad_norm": 11.035505294799805, "learning_rate": 1.873745021483875e-05, "loss": 0.1542, "step": 2800 }, { "epoch": 0.6154183092422252, "grad_norm": 23.79170036315918, "learning_rate": 1.8725682497010903e-05, "loss": 0.113, "step": 2810 }, { "epoch": 0.6176084099868594, "grad_norm": 1.6446751356124878, "learning_rate": 1.8713863922962596e-05, "loss": 0.1211, "step": 2820 }, { "epoch": 0.6197985107314936, "grad_norm": 8.34101676940918, "learning_rate": 1.8701994561576463e-05, "loss": 0.1055, "step": 2830 }, { "epoch": 0.6219886114761279, "grad_norm": 9.67935848236084, "learning_rate": 1.8690074482031154e-05, "loss": 0.1089, "step": 2840 }, { "epoch": 0.6241787122207622, "grad_norm": 5.100308895111084, "learning_rate": 1.8678103753800903e-05, "loss": 0.1087, "step": 2850 }, { "epoch": 0.6263688129653964, "grad_norm": 11.457816123962402, "learning_rate": 1.8666082446655155e-05, "loss": 0.0972, "step": 2860 }, { "epoch": 0.6285589137100307, "grad_norm": 2.142587900161743, "learning_rate": 1.8654010630658142e-05, "loss": 0.1903, "step": 2870 }, { "epoch": 0.6307490144546649, "grad_norm": 13.43486213684082, "learning_rate": 1.8641888376168483e-05, "loss": 0.0861, "step": 2880 }, { "epoch": 0.6329391151992991, "grad_norm": 7.9858856201171875, "learning_rate": 1.862971575383876e-05, "loss": 0.143, "step": 2890 }, { "epoch": 0.6351292159439335, "grad_norm": 1.6484079360961914, "learning_rate": 1.8617492834615126e-05, "loss": 0.1627, "step": 2900 }, { "epoch": 0.6373193166885677, "grad_norm": 11.097623825073242, "learning_rate": 1.860521968973687e-05, "loss": 0.098, "step": 2910 }, { "epoch": 0.6395094174332019, "grad_norm": 1.8917572498321533, "learning_rate": 1.859289639073603e-05, "loss": 0.1712, "step": 2920 }, { "epoch": 0.6416995181778362, "grad_norm": 11.026522636413574, "learning_rate": 1.858052300943694e-05, "loss": 0.119, "step": 2930 }, { "epoch": 0.6438896189224704, "grad_norm": 0.3990042209625244, "learning_rate": 1.8568099617955838e-05, "loss": 0.1079, "step": 2940 }, { "epoch": 0.6460797196671046, "grad_norm": 2.540764331817627, "learning_rate": 1.855562628870045e-05, "loss": 0.076, "step": 2950 }, { "epoch": 0.648269820411739, "grad_norm": 0.049650486558675766, "learning_rate": 1.8543103094369533e-05, "loss": 0.1164, "step": 2960 }, { "epoch": 0.6504599211563732, "grad_norm": 3.4120094776153564, "learning_rate": 1.85305301079525e-05, "loss": 0.2003, "step": 2970 }, { "epoch": 0.6526500219010074, "grad_norm": 9.17834758758545, "learning_rate": 1.8517907402728944e-05, "loss": 0.1166, "step": 2980 }, { "epoch": 0.6548401226456417, "grad_norm": 11.271235466003418, "learning_rate": 1.8505235052268258e-05, "loss": 0.1484, "step": 2990 }, { "epoch": 0.657030223390276, "grad_norm": 3.9622983932495117, "learning_rate": 1.8492513130429167e-05, "loss": 0.114, "step": 3000 }, { "epoch": 0.6592203241349102, "grad_norm": 4.854787349700928, "learning_rate": 1.847974171135933e-05, "loss": 0.0637, "step": 3010 }, { "epoch": 0.6614104248795445, "grad_norm": 12.61373233795166, "learning_rate": 1.8466920869494874e-05, "loss": 0.1122, "step": 3020 }, { "epoch": 0.6636005256241787, "grad_norm": 22.26468849182129, "learning_rate": 1.8454050679560002e-05, "loss": 0.1328, "step": 3030 }, { "epoch": 0.6657906263688129, "grad_norm": 21.44456672668457, "learning_rate": 1.844113121656651e-05, "loss": 0.1287, "step": 3040 }, { "epoch": 0.6679807271134472, "grad_norm": 4.976673126220703, "learning_rate": 1.8428162555813395e-05, "loss": 0.1031, "step": 3050 }, { "epoch": 0.6701708278580815, "grad_norm": 10.150763511657715, "learning_rate": 1.841514477288638e-05, "loss": 0.0898, "step": 3060 }, { "epoch": 0.6723609286027157, "grad_norm": 12.601266860961914, "learning_rate": 1.8402077943657494e-05, "loss": 0.1886, "step": 3070 }, { "epoch": 0.67455102934735, "grad_norm": 19.39771842956543, "learning_rate": 1.838896214428463e-05, "loss": 0.1278, "step": 3080 }, { "epoch": 0.6767411300919842, "grad_norm": 18.862014770507812, "learning_rate": 1.8375797451211083e-05, "loss": 0.1184, "step": 3090 }, { "epoch": 0.6789312308366184, "grad_norm": 7.8055219650268555, "learning_rate": 1.8362583941165134e-05, "loss": 0.1044, "step": 3100 }, { "epoch": 0.6811213315812528, "grad_norm": 0.13505922257900238, "learning_rate": 1.8349321691159564e-05, "loss": 0.0857, "step": 3110 }, { "epoch": 0.683311432325887, "grad_norm": 4.918792724609375, "learning_rate": 1.833601077849125e-05, "loss": 0.1152, "step": 3120 }, { "epoch": 0.6855015330705212, "grad_norm": 8.951217651367188, "learning_rate": 1.832265128074067e-05, "loss": 0.1231, "step": 3130 }, { "epoch": 0.6876916338151555, "grad_norm": 0.33724814653396606, "learning_rate": 1.830924327577149e-05, "loss": 0.2228, "step": 3140 }, { "epoch": 0.6898817345597897, "grad_norm": 6.348689556121826, "learning_rate": 1.8295786841730088e-05, "loss": 0.1218, "step": 3150 }, { "epoch": 0.692071835304424, "grad_norm": 7.206477165222168, "learning_rate": 1.8282282057045087e-05, "loss": 0.0918, "step": 3160 }, { "epoch": 0.6942619360490583, "grad_norm": 2.205155849456787, "learning_rate": 1.826872900042694e-05, "loss": 0.1203, "step": 3170 }, { "epoch": 0.6964520367936925, "grad_norm": 10.938039779663086, "learning_rate": 1.8255127750867428e-05, "loss": 0.07, "step": 3180 }, { "epoch": 0.6986421375383267, "grad_norm": 1.1665598154067993, "learning_rate": 1.8241478387639215e-05, "loss": 0.2094, "step": 3190 }, { "epoch": 0.700832238282961, "grad_norm": 0.12293468415737152, "learning_rate": 1.8227780990295398e-05, "loss": 0.1, "step": 3200 }, { "epoch": 0.7030223390275953, "grad_norm": 2.817885160446167, "learning_rate": 1.8214035638669034e-05, "loss": 0.1019, "step": 3210 }, { "epoch": 0.7052124397722295, "grad_norm": 5.96680212020874, "learning_rate": 1.8200242412872655e-05, "loss": 0.1325, "step": 3220 }, { "epoch": 0.7074025405168638, "grad_norm": 2.034914016723633, "learning_rate": 1.818640139329784e-05, "loss": 0.154, "step": 3230 }, { "epoch": 0.709592641261498, "grad_norm": 13.945418357849121, "learning_rate": 1.8172512660614723e-05, "loss": 0.1602, "step": 3240 }, { "epoch": 0.7117827420061322, "grad_norm": 4.050970554351807, "learning_rate": 1.815857629577152e-05, "loss": 0.0732, "step": 3250 }, { "epoch": 0.7139728427507666, "grad_norm": 2.184335947036743, "learning_rate": 1.8144592379994058e-05, "loss": 0.0676, "step": 3260 }, { "epoch": 0.7161629434954008, "grad_norm": 1.0158073902130127, "learning_rate": 1.8130560994785325e-05, "loss": 0.1556, "step": 3270 }, { "epoch": 0.718353044240035, "grad_norm": 6.063960075378418, "learning_rate": 1.8116482221924962e-05, "loss": 0.0833, "step": 3280 }, { "epoch": 0.7205431449846693, "grad_norm": 6.114946365356445, "learning_rate": 1.8102356143468805e-05, "loss": 0.0813, "step": 3290 }, { "epoch": 0.7227332457293035, "grad_norm": 15.890973091125488, "learning_rate": 1.80881828417484e-05, "loss": 0.1563, "step": 3300 }, { "epoch": 0.7249233464739377, "grad_norm": 4.60066556930542, "learning_rate": 1.807396239937054e-05, "loss": 0.2162, "step": 3310 }, { "epoch": 0.7271134472185721, "grad_norm": 24.105998992919922, "learning_rate": 1.805969489921675e-05, "loss": 0.153, "step": 3320 }, { "epoch": 0.7293035479632063, "grad_norm": 8.074901580810547, "learning_rate": 1.8045380424442833e-05, "loss": 0.0807, "step": 3330 }, { "epoch": 0.7314936487078406, "grad_norm": 12.312774658203125, "learning_rate": 1.803101905847838e-05, "loss": 0.0882, "step": 3340 }, { "epoch": 0.7336837494524748, "grad_norm": 14.724958419799805, "learning_rate": 1.8016610885026275e-05, "loss": 0.161, "step": 3350 }, { "epoch": 0.735873850197109, "grad_norm": 8.059611320495605, "learning_rate": 1.8002155988062207e-05, "loss": 0.1095, "step": 3360 }, { "epoch": 0.7380639509417434, "grad_norm": 11.035114288330078, "learning_rate": 1.7987654451834196e-05, "loss": 0.1027, "step": 3370 }, { "epoch": 0.7402540516863776, "grad_norm": 4.394042491912842, "learning_rate": 1.7973106360862086e-05, "loss": 0.0619, "step": 3380 }, { "epoch": 0.7424441524310118, "grad_norm": 6.704696178436279, "learning_rate": 1.7958511799937056e-05, "loss": 0.0582, "step": 3390 }, { "epoch": 0.7446342531756461, "grad_norm": 9.613452911376953, "learning_rate": 1.7943870854121126e-05, "loss": 0.0617, "step": 3400 }, { "epoch": 0.7468243539202803, "grad_norm": 6.212586402893066, "learning_rate": 1.7929183608746673e-05, "loss": 0.0679, "step": 3410 }, { "epoch": 0.7490144546649146, "grad_norm": 14.913146018981934, "learning_rate": 1.7914450149415913e-05, "loss": 0.0861, "step": 3420 }, { "epoch": 0.7512045554095489, "grad_norm": 14.158679962158203, "learning_rate": 1.7899670562000416e-05, "loss": 0.1495, "step": 3430 }, { "epoch": 0.7533946561541831, "grad_norm": 1.1554079055786133, "learning_rate": 1.7884844932640604e-05, "loss": 0.3271, "step": 3440 }, { "epoch": 0.7555847568988173, "grad_norm": 0.08195368200540543, "learning_rate": 1.786997334774524e-05, "loss": 0.0605, "step": 3450 }, { "epoch": 0.7577748576434516, "grad_norm": 2.456305980682373, "learning_rate": 1.785505589399094e-05, "loss": 0.065, "step": 3460 }, { "epoch": 0.7599649583880859, "grad_norm": 12.771835327148438, "learning_rate": 1.784009265832166e-05, "loss": 0.1081, "step": 3470 }, { "epoch": 0.7621550591327201, "grad_norm": 12.2388277053833, "learning_rate": 1.7825083727948175e-05, "loss": 0.1641, "step": 3480 }, { "epoch": 0.7643451598773544, "grad_norm": 12.077229499816895, "learning_rate": 1.78100291903476e-05, "loss": 0.1492, "step": 3490 }, { "epoch": 0.7665352606219886, "grad_norm": 6.636394500732422, "learning_rate": 1.7794929133262854e-05, "loss": 0.1824, "step": 3500 }, { "epoch": 0.7687253613666228, "grad_norm": 9.448235511779785, "learning_rate": 1.7779783644702162e-05, "loss": 0.1735, "step": 3510 }, { "epoch": 0.7709154621112572, "grad_norm": 10.851614952087402, "learning_rate": 1.7764592812938544e-05, "loss": 0.1346, "step": 3520 }, { "epoch": 0.7731055628558914, "grad_norm": 3.880979299545288, "learning_rate": 1.7749356726509286e-05, "loss": 0.0763, "step": 3530 }, { "epoch": 0.7752956636005256, "grad_norm": 2.5566213130950928, "learning_rate": 1.773407547421544e-05, "loss": 0.1523, "step": 3540 }, { "epoch": 0.7774857643451599, "grad_norm": 10.146451950073242, "learning_rate": 1.7718749145121302e-05, "loss": 0.0783, "step": 3550 }, { "epoch": 0.7796758650897941, "grad_norm": 11.654008865356445, "learning_rate": 1.7703377828553878e-05, "loss": 0.0959, "step": 3560 }, { "epoch": 0.7818659658344284, "grad_norm": 2.5637104511260986, "learning_rate": 1.768796161410239e-05, "loss": 0.0742, "step": 3570 }, { "epoch": 0.7840560665790627, "grad_norm": 10.506292343139648, "learning_rate": 1.7672500591617742e-05, "loss": 0.222, "step": 3580 }, { "epoch": 0.7862461673236969, "grad_norm": 12.882474899291992, "learning_rate": 1.7656994851211978e-05, "loss": 0.1473, "step": 3590 }, { "epoch": 0.7884362680683311, "grad_norm": 5.517815589904785, "learning_rate": 1.7641444483257782e-05, "loss": 0.1305, "step": 3600 }, { "epoch": 0.7906263688129654, "grad_norm": 4.1038126945495605, "learning_rate": 1.762584957838795e-05, "loss": 0.1046, "step": 3610 }, { "epoch": 0.7928164695575997, "grad_norm": 1.3273707628250122, "learning_rate": 1.7610210227494845e-05, "loss": 0.0505, "step": 3620 }, { "epoch": 0.7950065703022339, "grad_norm": 0.24146081507205963, "learning_rate": 1.7594526521729876e-05, "loss": 0.135, "step": 3630 }, { "epoch": 0.7971966710468682, "grad_norm": 2.4364922046661377, "learning_rate": 1.7578798552502972e-05, "loss": 0.0738, "step": 3640 }, { "epoch": 0.7993867717915024, "grad_norm": 0.5693516135215759, "learning_rate": 1.7563026411482042e-05, "loss": 0.1252, "step": 3650 }, { "epoch": 0.8015768725361366, "grad_norm": 0.2617485821247101, "learning_rate": 1.7547210190592446e-05, "loss": 0.1202, "step": 3660 }, { "epoch": 0.803766973280771, "grad_norm": 6.397878170013428, "learning_rate": 1.7531349982016443e-05, "loss": 0.1114, "step": 3670 }, { "epoch": 0.8059570740254052, "grad_norm": 2.322713851928711, "learning_rate": 1.751544587819269e-05, "loss": 0.0579, "step": 3680 }, { "epoch": 0.8081471747700394, "grad_norm": 10.28016185760498, "learning_rate": 1.749949797181566e-05, "loss": 0.1021, "step": 3690 }, { "epoch": 0.8103372755146737, "grad_norm": 2.2540297508239746, "learning_rate": 1.748350635583514e-05, "loss": 0.0876, "step": 3700 }, { "epoch": 0.8125273762593079, "grad_norm": 5.814713954925537, "learning_rate": 1.7467471123455647e-05, "loss": 0.074, "step": 3710 }, { "epoch": 0.8147174770039421, "grad_norm": 1.555265188217163, "learning_rate": 1.7451392368135933e-05, "loss": 0.0599, "step": 3720 }, { "epoch": 0.8169075777485765, "grad_norm": 11.439265251159668, "learning_rate": 1.7435270183588408e-05, "loss": 0.1083, "step": 3730 }, { "epoch": 0.8190976784932107, "grad_norm": 6.168491840362549, "learning_rate": 1.741910466377859e-05, "loss": 0.1311, "step": 3740 }, { "epoch": 0.8212877792378449, "grad_norm": 4.275791645050049, "learning_rate": 1.740289590292459e-05, "loss": 0.0672, "step": 3750 }, { "epoch": 0.8234778799824792, "grad_norm": 1.4524646997451782, "learning_rate": 1.738664399549654e-05, "loss": 0.137, "step": 3760 }, { "epoch": 0.8256679807271134, "grad_norm": 10.094250679016113, "learning_rate": 1.737034903621602e-05, "loss": 0.1799, "step": 3770 }, { "epoch": 0.8278580814717477, "grad_norm": 11.459970474243164, "learning_rate": 1.735401112005556e-05, "loss": 0.1367, "step": 3780 }, { "epoch": 0.830048182216382, "grad_norm": 1.6316137313842773, "learning_rate": 1.733763034223804e-05, "loss": 0.0724, "step": 3790 }, { "epoch": 0.8322382829610162, "grad_norm": 1.730171799659729, "learning_rate": 1.7321206798236164e-05, "loss": 0.0342, "step": 3800 }, { "epoch": 0.8344283837056504, "grad_norm": 9.077439308166504, "learning_rate": 1.730474058377188e-05, "loss": 0.0942, "step": 3810 }, { "epoch": 0.8366184844502847, "grad_norm": 9.810347557067871, "learning_rate": 1.7288231794815842e-05, "loss": 0.0872, "step": 3820 }, { "epoch": 0.838808585194919, "grad_norm": 5.215786933898926, "learning_rate": 1.727168052758684e-05, "loss": 0.0811, "step": 3830 }, { "epoch": 0.8409986859395532, "grad_norm": 5.15041971206665, "learning_rate": 1.725508687855124e-05, "loss": 0.0815, "step": 3840 }, { "epoch": 0.8431887866841875, "grad_norm": 0.7137413620948792, "learning_rate": 1.7238450944422428e-05, "loss": 0.0714, "step": 3850 }, { "epoch": 0.8453788874288217, "grad_norm": 13.712797164916992, "learning_rate": 1.7221772822160233e-05, "loss": 0.0947, "step": 3860 }, { "epoch": 0.8475689881734559, "grad_norm": 22.908184051513672, "learning_rate": 1.7205052608970385e-05, "loss": 0.1898, "step": 3870 }, { "epoch": 0.8497590889180903, "grad_norm": 7.313767433166504, "learning_rate": 1.7188290402303915e-05, "loss": 0.1609, "step": 3880 }, { "epoch": 0.8519491896627245, "grad_norm": 4.333178520202637, "learning_rate": 1.7171486299856628e-05, "loss": 0.1075, "step": 3890 }, { "epoch": 0.8541392904073587, "grad_norm": 11.811752319335938, "learning_rate": 1.715464039956849e-05, "loss": 0.1272, "step": 3900 }, { "epoch": 0.856329391151993, "grad_norm": 8.436993598937988, "learning_rate": 1.71377527996231e-05, "loss": 0.0695, "step": 3910 }, { "epoch": 0.8585194918966272, "grad_norm": 7.5556111335754395, "learning_rate": 1.7120823598447077e-05, "loss": 0.0701, "step": 3920 }, { "epoch": 0.8607095926412615, "grad_norm": 1.8812776803970337, "learning_rate": 1.7103852894709517e-05, "loss": 0.0501, "step": 3930 }, { "epoch": 0.8628996933858958, "grad_norm": 3.4604718685150146, "learning_rate": 1.708684078732141e-05, "loss": 0.1466, "step": 3940 }, { "epoch": 0.86508979413053, "grad_norm": 0.1827421337366104, "learning_rate": 1.706978737543505e-05, "loss": 0.0928, "step": 3950 }, { "epoch": 0.8672798948751642, "grad_norm": 6.3987135887146, "learning_rate": 1.7052692758443477e-05, "loss": 0.0381, "step": 3960 }, { "epoch": 0.8694699956197985, "grad_norm": 10.518330574035645, "learning_rate": 1.7035557035979882e-05, "loss": 0.1477, "step": 3970 }, { "epoch": 0.8716600963644328, "grad_norm": 8.481457710266113, "learning_rate": 1.7018380307917034e-05, "loss": 0.058, "step": 3980 }, { "epoch": 0.873850197109067, "grad_norm": 13.368366241455078, "learning_rate": 1.70011626743667e-05, "loss": 0.0938, "step": 3990 }, { "epoch": 0.8760402978537013, "grad_norm": 24.52857208251953, "learning_rate": 1.698390423567905e-05, "loss": 0.169, "step": 4000 }, { "epoch": 0.8782303985983355, "grad_norm": 6.6533098220825195, "learning_rate": 1.696660509244209e-05, "loss": 0.1158, "step": 4010 }, { "epoch": 0.8804204993429697, "grad_norm": 10.287108421325684, "learning_rate": 1.694926534548106e-05, "loss": 0.0684, "step": 4020 }, { "epoch": 0.8826106000876041, "grad_norm": 2.5053839683532715, "learning_rate": 1.6931885095857857e-05, "loss": 0.0947, "step": 4030 }, { "epoch": 0.8848007008322383, "grad_norm": 1.1731847524642944, "learning_rate": 1.6914464444870432e-05, "loss": 0.0537, "step": 4040 }, { "epoch": 0.8869908015768725, "grad_norm": 11.618463516235352, "learning_rate": 1.6897003494052217e-05, "loss": 0.1216, "step": 4050 }, { "epoch": 0.8891809023215068, "grad_norm": 6.041053771972656, "learning_rate": 1.687950234517152e-05, "loss": 0.0684, "step": 4060 }, { "epoch": 0.891371003066141, "grad_norm": 1.0377060174942017, "learning_rate": 1.6861961100230942e-05, "loss": 0.079, "step": 4070 }, { "epoch": 0.8935611038107752, "grad_norm": 9.177011489868164, "learning_rate": 1.684437986146677e-05, "loss": 0.0642, "step": 4080 }, { "epoch": 0.8957512045554096, "grad_norm": 7.936344623565674, "learning_rate": 1.6826758731348404e-05, "loss": 0.039, "step": 4090 }, { "epoch": 0.8979413053000438, "grad_norm": 0.2865859866142273, "learning_rate": 1.680909781257772e-05, "loss": 0.1731, "step": 4100 }, { "epoch": 0.900131406044678, "grad_norm": 12.702764511108398, "learning_rate": 1.6791397208088516e-05, "loss": 0.2261, "step": 4110 }, { "epoch": 0.9023215067893123, "grad_norm": 15.344644546508789, "learning_rate": 1.6773657021045883e-05, "loss": 0.1423, "step": 4120 }, { "epoch": 0.9045116075339465, "grad_norm": 0.9401340484619141, "learning_rate": 1.6755877354845616e-05, "loss": 0.1292, "step": 4130 }, { "epoch": 0.9067017082785808, "grad_norm": 11.093612670898438, "learning_rate": 1.6738058313113603e-05, "loss": 0.1255, "step": 4140 }, { "epoch": 0.9088918090232151, "grad_norm": 10.675973892211914, "learning_rate": 1.6720199999705232e-05, "loss": 0.1244, "step": 4150 }, { "epoch": 0.9110819097678493, "grad_norm": 14.477042198181152, "learning_rate": 1.6702302518704776e-05, "loss": 0.0635, "step": 4160 }, { "epoch": 0.9132720105124835, "grad_norm": 8.27789306640625, "learning_rate": 1.668436597442479e-05, "loss": 0.2111, "step": 4170 }, { "epoch": 0.9154621112571178, "grad_norm": 9.59162425994873, "learning_rate": 1.6666390471405504e-05, "loss": 0.0844, "step": 4180 }, { "epoch": 0.9176522120017521, "grad_norm": 10.787023544311523, "learning_rate": 1.6648376114414208e-05, "loss": 0.125, "step": 4190 }, { "epoch": 0.9198423127463863, "grad_norm": 5.271928787231445, "learning_rate": 1.6630323008444656e-05, "loss": 0.2461, "step": 4200 }, { "epoch": 0.9220324134910206, "grad_norm": 1.8084982633590698, "learning_rate": 1.6612231258716437e-05, "loss": 0.0434, "step": 4210 }, { "epoch": 0.9242225142356548, "grad_norm": 1.2426623106002808, "learning_rate": 1.6594100970674368e-05, "loss": 0.0851, "step": 4220 }, { "epoch": 0.926412614980289, "grad_norm": 5.976527214050293, "learning_rate": 1.6575932249987882e-05, "loss": 0.1057, "step": 4230 }, { "epoch": 0.9286027157249234, "grad_norm": 12.50660514831543, "learning_rate": 1.6557725202550413e-05, "loss": 0.0282, "step": 4240 }, { "epoch": 0.9307928164695576, "grad_norm": 6.819702625274658, "learning_rate": 1.653947993447877e-05, "loss": 0.089, "step": 4250 }, { "epoch": 0.9329829172141918, "grad_norm": 10.757926940917969, "learning_rate": 1.6521196552112537e-05, "loss": 0.1208, "step": 4260 }, { "epoch": 0.9351730179588261, "grad_norm": 2.3442206382751465, "learning_rate": 1.6502875162013424e-05, "loss": 0.1034, "step": 4270 }, { "epoch": 0.9373631187034603, "grad_norm": 1.6122549772262573, "learning_rate": 1.6484515870964676e-05, "loss": 0.0646, "step": 4280 }, { "epoch": 0.9395532194480947, "grad_norm": 16.476318359375, "learning_rate": 1.6466118785970438e-05, "loss": 0.1016, "step": 4290 }, { "epoch": 0.9417433201927289, "grad_norm": 7.72664737701416, "learning_rate": 1.6447684014255115e-05, "loss": 0.0763, "step": 4300 }, { "epoch": 0.9439334209373631, "grad_norm": 17.780689239501953, "learning_rate": 1.642921166326278e-05, "loss": 0.1399, "step": 4310 }, { "epoch": 0.9461235216819974, "grad_norm": 2.459083318710327, "learning_rate": 1.6410701840656525e-05, "loss": 0.0632, "step": 4320 }, { "epoch": 0.9483136224266316, "grad_norm": 2.4532432556152344, "learning_rate": 1.639215465431784e-05, "loss": 0.1067, "step": 4330 }, { "epoch": 0.9505037231712659, "grad_norm": 12.71472454071045, "learning_rate": 1.6373570212345976e-05, "loss": 0.0818, "step": 4340 }, { "epoch": 0.9526938239159002, "grad_norm": 13.893026351928711, "learning_rate": 1.6354948623057337e-05, "loss": 0.1465, "step": 4350 }, { "epoch": 0.9548839246605344, "grad_norm": 0.9928886294364929, "learning_rate": 1.633628999498482e-05, "loss": 0.0662, "step": 4360 }, { "epoch": 0.9570740254051686, "grad_norm": 10.89462661743164, "learning_rate": 1.6317594436877207e-05, "loss": 0.0745, "step": 4370 }, { "epoch": 0.9592641261498029, "grad_norm": 9.313480377197266, "learning_rate": 1.629886205769851e-05, "loss": 0.0719, "step": 4380 }, { "epoch": 0.9614542268944372, "grad_norm": 1.694072961807251, "learning_rate": 1.628009296662736e-05, "loss": 0.1872, "step": 4390 }, { "epoch": 0.9636443276390714, "grad_norm": 27.52518081665039, "learning_rate": 1.626128727305634e-05, "loss": 0.1581, "step": 4400 }, { "epoch": 0.9658344283837057, "grad_norm": 0.9912569522857666, "learning_rate": 1.6242445086591384e-05, "loss": 0.1169, "step": 4410 }, { "epoch": 0.9680245291283399, "grad_norm": 4.777322769165039, "learning_rate": 1.6223566517051105e-05, "loss": 0.0793, "step": 4420 }, { "epoch": 0.9702146298729741, "grad_norm": 7.789610385894775, "learning_rate": 1.6204651674466173e-05, "loss": 0.0743, "step": 4430 }, { "epoch": 0.9724047306176085, "grad_norm": 22.649566650390625, "learning_rate": 1.6185700669078674e-05, "loss": 0.1862, "step": 4440 }, { "epoch": 0.9745948313622427, "grad_norm": 8.29400634765625, "learning_rate": 1.6166713611341457e-05, "loss": 0.1299, "step": 4450 }, { "epoch": 0.9767849321068769, "grad_norm": 0.33978042006492615, "learning_rate": 1.6147690611917496e-05, "loss": 0.0529, "step": 4460 }, { "epoch": 0.9789750328515112, "grad_norm": 0.7050483822822571, "learning_rate": 1.6128631781679244e-05, "loss": 0.1042, "step": 4470 }, { "epoch": 0.9811651335961454, "grad_norm": 10.278597831726074, "learning_rate": 1.6109537231708e-05, "loss": 0.0756, "step": 4480 }, { "epoch": 0.9833552343407796, "grad_norm": 3.643583059310913, "learning_rate": 1.609040707329324e-05, "loss": 0.0817, "step": 4490 }, { "epoch": 0.985545335085414, "grad_norm": 9.892813682556152, "learning_rate": 1.607124141793197e-05, "loss": 0.0827, "step": 4500 }, { "epoch": 0.9877354358300482, "grad_norm": 16.842300415039062, "learning_rate": 1.6052040377328103e-05, "loss": 0.1062, "step": 4510 }, { "epoch": 0.9899255365746824, "grad_norm": 8.603888511657715, "learning_rate": 1.6032804063391777e-05, "loss": 0.1059, "step": 4520 }, { "epoch": 0.9921156373193167, "grad_norm": 24.068283081054688, "learning_rate": 1.601353258823872e-05, "loss": 0.1205, "step": 4530 }, { "epoch": 0.994305738063951, "grad_norm": 3.3719892501831055, "learning_rate": 1.599422606418958e-05, "loss": 0.1111, "step": 4540 }, { "epoch": 0.9964958388085852, "grad_norm": 9.853516578674316, "learning_rate": 1.5974884603769298e-05, "loss": 0.1089, "step": 4550 }, { "epoch": 0.9986859395532195, "grad_norm": 7.868588924407959, "learning_rate": 1.5955508319706428e-05, "loss": 0.0777, "step": 4560 }, { "epoch": 1.0008760402978536, "grad_norm": 1.703294038772583, "learning_rate": 1.5936097324932487e-05, "loss": 0.055, "step": 4570 }, { "epoch": 1.003066141042488, "grad_norm": 0.994607150554657, "learning_rate": 1.5916651732581295e-05, "loss": 0.1185, "step": 4580 }, { "epoch": 1.0052562417871223, "grad_norm": 7.936860084533691, "learning_rate": 1.5897171655988334e-05, "loss": 0.0588, "step": 4590 }, { "epoch": 1.0074463425317564, "grad_norm": 5.055603981018066, "learning_rate": 1.5877657208690047e-05, "loss": 0.064, "step": 4600 }, { "epoch": 1.0096364432763907, "grad_norm": 0.8618549704551697, "learning_rate": 1.5858108504423223e-05, "loss": 0.1134, "step": 4610 }, { "epoch": 1.011826544021025, "grad_norm": 8.43459701538086, "learning_rate": 1.58385256571243e-05, "loss": 0.0431, "step": 4620 }, { "epoch": 1.0140166447656591, "grad_norm": 12.006895065307617, "learning_rate": 1.5818908780928718e-05, "loss": 0.0525, "step": 4630 }, { "epoch": 1.0162067455102934, "grad_norm": 8.87588882446289, "learning_rate": 1.579925799017025e-05, "loss": 0.1468, "step": 4640 }, { "epoch": 1.0183968462549278, "grad_norm": 11.340096473693848, "learning_rate": 1.5779573399380324e-05, "loss": 0.1324, "step": 4650 }, { "epoch": 1.0205869469995619, "grad_norm": 9.714674949645996, "learning_rate": 1.5759855123287382e-05, "loss": 0.0987, "step": 4660 }, { "epoch": 1.0227770477441962, "grad_norm": 1.903538465499878, "learning_rate": 1.5740103276816185e-05, "loss": 0.0814, "step": 4670 }, { "epoch": 1.0249671484888305, "grad_norm": 4.019851207733154, "learning_rate": 1.5720317975087153e-05, "loss": 0.0692, "step": 4680 }, { "epoch": 1.0271572492334646, "grad_norm": 3.91469407081604, "learning_rate": 1.5700499333415702e-05, "loss": 0.0422, "step": 4690 }, { "epoch": 1.029347349978099, "grad_norm": 10.391172409057617, "learning_rate": 1.568064746731156e-05, "loss": 0.041, "step": 4700 }, { "epoch": 1.0315374507227333, "grad_norm": 11.046605110168457, "learning_rate": 1.566076249247809e-05, "loss": 0.0804, "step": 4710 }, { "epoch": 1.0337275514673676, "grad_norm": 16.249835968017578, "learning_rate": 1.5640844524811644e-05, "loss": 0.1705, "step": 4720 }, { "epoch": 1.0359176522120017, "grad_norm": 7.519586086273193, "learning_rate": 1.5620893680400847e-05, "loss": 0.1173, "step": 4730 }, { "epoch": 1.038107752956636, "grad_norm": 12.720919609069824, "learning_rate": 1.560091007552595e-05, "loss": 0.1856, "step": 4740 }, { "epoch": 1.0402978537012704, "grad_norm": 12.122481346130371, "learning_rate": 1.558089382665814e-05, "loss": 0.0486, "step": 4750 }, { "epoch": 1.0424879544459045, "grad_norm": 1.294742465019226, "learning_rate": 1.556084505045887e-05, "loss": 0.0372, "step": 4760 }, { "epoch": 1.0446780551905388, "grad_norm": 10.044445991516113, "learning_rate": 1.554076386377917e-05, "loss": 0.1406, "step": 4770 }, { "epoch": 1.0468681559351731, "grad_norm": 0.24504348635673523, "learning_rate": 1.5520650383658968e-05, "loss": 0.0774, "step": 4780 }, { "epoch": 1.0490582566798072, "grad_norm": 3.3398680686950684, "learning_rate": 1.5500504727326405e-05, "loss": 0.0789, "step": 4790 }, { "epoch": 1.0512483574244416, "grad_norm": 12.48293399810791, "learning_rate": 1.5480327012197165e-05, "loss": 0.1098, "step": 4800 }, { "epoch": 1.0534384581690759, "grad_norm": 13.119927406311035, "learning_rate": 1.546011735587378e-05, "loss": 0.0616, "step": 4810 }, { "epoch": 1.05562855891371, "grad_norm": 19.549123764038086, "learning_rate": 1.543987587614493e-05, "loss": 0.1139, "step": 4820 }, { "epoch": 1.0578186596583443, "grad_norm": 12.230664253234863, "learning_rate": 1.5419602690984805e-05, "loss": 0.081, "step": 4830 }, { "epoch": 1.0600087604029786, "grad_norm": 4.5526299476623535, "learning_rate": 1.539929791855235e-05, "loss": 0.1248, "step": 4840 }, { "epoch": 1.0621988611476127, "grad_norm": 6.89523458480835, "learning_rate": 1.537896167719063e-05, "loss": 0.0352, "step": 4850 }, { "epoch": 1.064388961892247, "grad_norm": 0.267836332321167, "learning_rate": 1.5358594085426125e-05, "loss": 0.0269, "step": 4860 }, { "epoch": 1.0665790626368814, "grad_norm": 0.8087835311889648, "learning_rate": 1.5338195261968024e-05, "loss": 0.0607, "step": 4870 }, { "epoch": 1.0687691633815155, "grad_norm": 0.992652177810669, "learning_rate": 1.531776532570755e-05, "loss": 0.0616, "step": 4880 }, { "epoch": 1.0709592641261498, "grad_norm": 5.440254211425781, "learning_rate": 1.5297304395717267e-05, "loss": 0.0678, "step": 4890 }, { "epoch": 1.0731493648707842, "grad_norm": 1.4581581354141235, "learning_rate": 1.5276812591250365e-05, "loss": 0.0505, "step": 4900 }, { "epoch": 1.0753394656154183, "grad_norm": 4.058531761169434, "learning_rate": 1.5256290031739998e-05, "loss": 0.1273, "step": 4910 }, { "epoch": 1.0775295663600526, "grad_norm": 13.37671184539795, "learning_rate": 1.5239849920873937e-05, "loss": 0.0883, "step": 4920 }, { "epoch": 1.079719667104687, "grad_norm": 9.649218559265137, "learning_rate": 1.5219272303828589e-05, "loss": 0.0548, "step": 4930 }, { "epoch": 1.081909767849321, "grad_norm": 0.20135408639907837, "learning_rate": 1.5198664267103978e-05, "loss": 0.0923, "step": 4940 }, { "epoch": 1.0840998685939554, "grad_norm": 17.547317504882812, "learning_rate": 1.5178025930810688e-05, "loss": 0.1545, "step": 4950 }, { "epoch": 1.0862899693385897, "grad_norm": 0.7877618074417114, "learning_rate": 1.5157357415235914e-05, "loss": 0.0452, "step": 4960 }, { "epoch": 1.0884800700832238, "grad_norm": 2.6584513187408447, "learning_rate": 1.5136658840842722e-05, "loss": 0.0741, "step": 4970 }, { "epoch": 1.090670170827858, "grad_norm": 0.07592464983463287, "learning_rate": 1.5115930328269393e-05, "loss": 0.0865, "step": 4980 }, { "epoch": 1.0928602715724924, "grad_norm": 18.665529251098633, "learning_rate": 1.5095171998328688e-05, "loss": 0.2498, "step": 4990 }, { "epoch": 1.0950503723171265, "grad_norm": 14.311424255371094, "learning_rate": 1.5074383972007152e-05, "loss": 0.1229, "step": 5000 }, { "epoch": 1.0972404730617609, "grad_norm": 2.9915549755096436, "learning_rate": 1.5053566370464416e-05, "loss": 0.0632, "step": 5010 }, { "epoch": 1.0994305738063952, "grad_norm": 11.12956428527832, "learning_rate": 1.5032719315032483e-05, "loss": 0.0533, "step": 5020 }, { "epoch": 1.1016206745510293, "grad_norm": 0.5755620002746582, "learning_rate": 1.5011842927215026e-05, "loss": 0.139, "step": 5030 }, { "epoch": 1.1038107752956636, "grad_norm": 9.451416969299316, "learning_rate": 1.4990937328686665e-05, "loss": 0.0422, "step": 5040 }, { "epoch": 1.106000876040298, "grad_norm": 8.770936012268066, "learning_rate": 1.497000264129229e-05, "loss": 0.0331, "step": 5050 }, { "epoch": 1.108190976784932, "grad_norm": 9.560881614685059, "learning_rate": 1.4949038987046312e-05, "loss": 0.0736, "step": 5060 }, { "epoch": 1.1103810775295664, "grad_norm": 1.8579931259155273, "learning_rate": 1.492804648813198e-05, "loss": 0.0398, "step": 5070 }, { "epoch": 1.1125711782742007, "grad_norm": 7.129519939422607, "learning_rate": 1.490702526690066e-05, "loss": 0.1488, "step": 5080 }, { "epoch": 1.1147612790188348, "grad_norm": 1.793743371963501, "learning_rate": 1.4885975445871114e-05, "loss": 0.0727, "step": 5090 }, { "epoch": 1.1169513797634691, "grad_norm": 0.06678088009357452, "learning_rate": 1.4864897147728805e-05, "loss": 0.0548, "step": 5100 }, { "epoch": 1.1191414805081035, "grad_norm": 0.03883006423711777, "learning_rate": 1.4843790495325158e-05, "loss": 0.0738, "step": 5110 }, { "epoch": 1.1213315812527376, "grad_norm": 21.09810447692871, "learning_rate": 1.4822655611676865e-05, "loss": 0.1989, "step": 5120 }, { "epoch": 1.123521681997372, "grad_norm": 0.009001745842397213, "learning_rate": 1.4801492619965154e-05, "loss": 0.0693, "step": 5130 }, { "epoch": 1.1257117827420062, "grad_norm": 2.2340657711029053, "learning_rate": 1.478030164353508e-05, "loss": 0.1479, "step": 5140 }, { "epoch": 1.1279018834866403, "grad_norm": 6.962562561035156, "learning_rate": 1.47590828058948e-05, "loss": 0.0654, "step": 5150 }, { "epoch": 1.1300919842312747, "grad_norm": 5.998634338378906, "learning_rate": 1.4737836230714854e-05, "loss": 0.0367, "step": 5160 }, { "epoch": 1.132282084975909, "grad_norm": 2.287637710571289, "learning_rate": 1.4716562041827451e-05, "loss": 0.0937, "step": 5170 }, { "epoch": 1.134472185720543, "grad_norm": 0.010309090837836266, "learning_rate": 1.4695260363225736e-05, "loss": 0.0832, "step": 5180 }, { "epoch": 1.1366622864651774, "grad_norm": 0.01483870204538107, "learning_rate": 1.467393131906308e-05, "loss": 0.0686, "step": 5190 }, { "epoch": 1.1388523872098117, "grad_norm": 0.6539881825447083, "learning_rate": 1.465257503365234e-05, "loss": 0.082, "step": 5200 }, { "epoch": 1.1410424879544458, "grad_norm": 0.5702245831489563, "learning_rate": 1.4631191631465158e-05, "loss": 0.0712, "step": 5210 }, { "epoch": 1.1432325886990802, "grad_norm": 1.0854642391204834, "learning_rate": 1.4609781237131209e-05, "loss": 0.0204, "step": 5220 }, { "epoch": 1.1454226894437145, "grad_norm": 0.320843368768692, "learning_rate": 1.458834397543749e-05, "loss": 0.1176, "step": 5230 }, { "epoch": 1.1476127901883486, "grad_norm": 5.536646842956543, "learning_rate": 1.4566879971327597e-05, "loss": 0.1105, "step": 5240 }, { "epoch": 1.149802890932983, "grad_norm": 7.964202404022217, "learning_rate": 1.4545389349900982e-05, "loss": 0.1155, "step": 5250 }, { "epoch": 1.1519929916776173, "grad_norm": 13.571941375732422, "learning_rate": 1.4523872236412237e-05, "loss": 0.0565, "step": 5260 }, { "epoch": 1.1541830924222514, "grad_norm": 4.726943492889404, "learning_rate": 1.4502328756270355e-05, "loss": 0.0794, "step": 5270 }, { "epoch": 1.1563731931668857, "grad_norm": 6.58601713180542, "learning_rate": 1.4480759035038004e-05, "loss": 0.0338, "step": 5280 }, { "epoch": 1.15856329391152, "grad_norm": 0.6495316624641418, "learning_rate": 1.4459163198430793e-05, "loss": 0.0649, "step": 5290 }, { "epoch": 1.1607533946561541, "grad_norm": 0.003626406192779541, "learning_rate": 1.4437541372316543e-05, "loss": 0.0623, "step": 5300 }, { "epoch": 1.1629434954007885, "grad_norm": 6.244755744934082, "learning_rate": 1.4415893682714549e-05, "loss": 0.0928, "step": 5310 }, { "epoch": 1.1651335961454228, "grad_norm": 0.1181970164179802, "learning_rate": 1.4394220255794844e-05, "loss": 0.05, "step": 5320 }, { "epoch": 1.1673236968900569, "grad_norm": 2.4534833431243896, "learning_rate": 1.4372521217877475e-05, "loss": 0.061, "step": 5330 }, { "epoch": 1.1695137976346912, "grad_norm": 10.695149421691895, "learning_rate": 1.4350796695431748e-05, "loss": 0.1294, "step": 5340 }, { "epoch": 1.1717038983793255, "grad_norm": 7.147988319396973, "learning_rate": 1.4329046815075511e-05, "loss": 0.0924, "step": 5350 }, { "epoch": 1.1738939991239596, "grad_norm": 6.20286750793457, "learning_rate": 1.4307271703574399e-05, "loss": 0.0528, "step": 5360 }, { "epoch": 1.176084099868594, "grad_norm": 1.7355530261993408, "learning_rate": 1.4285471487841107e-05, "loss": 0.0823, "step": 5370 }, { "epoch": 1.1782742006132283, "grad_norm": 10.747481346130371, "learning_rate": 1.4263646294934645e-05, "loss": 0.1265, "step": 5380 }, { "epoch": 1.1804643013578624, "grad_norm": 0.07942114770412445, "learning_rate": 1.42417962520596e-05, "loss": 0.1404, "step": 5390 }, { "epoch": 1.1826544021024967, "grad_norm": 1.2766234874725342, "learning_rate": 1.421992148656539e-05, "loss": 0.0348, "step": 5400 }, { "epoch": 1.184844502847131, "grad_norm": 23.60717010498047, "learning_rate": 1.419802212594552e-05, "loss": 0.0643, "step": 5410 }, { "epoch": 1.1870346035917652, "grad_norm": 2.484929323196411, "learning_rate": 1.417609829783686e-05, "loss": 0.0928, "step": 5420 }, { "epoch": 1.1892247043363995, "grad_norm": 0.5586147308349609, "learning_rate": 1.4154150130018867e-05, "loss": 0.0862, "step": 5430 }, { "epoch": 1.1914148050810338, "grad_norm": 10.657268524169922, "learning_rate": 1.4132177750412865e-05, "loss": 0.0792, "step": 5440 }, { "epoch": 1.193604905825668, "grad_norm": 3.373382091522217, "learning_rate": 1.4110181287081298e-05, "loss": 0.1178, "step": 5450 }, { "epoch": 1.1957950065703022, "grad_norm": 4.232138633728027, "learning_rate": 1.408816086822697e-05, "loss": 0.0405, "step": 5460 }, { "epoch": 1.1979851073149366, "grad_norm": 0.0016181356040760875, "learning_rate": 1.4066116622192308e-05, "loss": 0.1079, "step": 5470 }, { "epoch": 1.2001752080595707, "grad_norm": 1.9689974784851074, "learning_rate": 1.4044048677458612e-05, "loss": 0.0512, "step": 5480 }, { "epoch": 1.202365308804205, "grad_norm": 19.73598289489746, "learning_rate": 1.4021957162645313e-05, "loss": 0.0802, "step": 5490 }, { "epoch": 1.2045554095488393, "grad_norm": 18.060136795043945, "learning_rate": 1.39998422065092e-05, "loss": 0.1097, "step": 5500 }, { "epoch": 1.2067455102934734, "grad_norm": 10.406551361083984, "learning_rate": 1.39777039379437e-05, "loss": 0.1031, "step": 5510 }, { "epoch": 1.2089356110381078, "grad_norm": 10.111130714416504, "learning_rate": 1.3955542485978115e-05, "loss": 0.1586, "step": 5520 }, { "epoch": 1.211125711782742, "grad_norm": 5.91014289855957, "learning_rate": 1.3933357979776854e-05, "loss": 0.1664, "step": 5530 }, { "epoch": 1.2133158125273762, "grad_norm": 10.260980606079102, "learning_rate": 1.3911150548638705e-05, "loss": 0.1338, "step": 5540 }, { "epoch": 1.2155059132720105, "grad_norm": 2.3469550609588623, "learning_rate": 1.3888920321996065e-05, "loss": 0.1175, "step": 5550 }, { "epoch": 1.2176960140166448, "grad_norm": 12.977310180664062, "learning_rate": 1.3866667429414188e-05, "loss": 0.0476, "step": 5560 }, { "epoch": 1.219886114761279, "grad_norm": 0.11630243062973022, "learning_rate": 1.3844392000590445e-05, "loss": 0.0398, "step": 5570 }, { "epoch": 1.2220762155059133, "grad_norm": 13.153382301330566, "learning_rate": 1.3822094165353543e-05, "loss": 0.1526, "step": 5580 }, { "epoch": 1.2242663162505476, "grad_norm": 8.721677780151367, "learning_rate": 1.3799774053662788e-05, "loss": 0.0866, "step": 5590 }, { "epoch": 1.2264564169951817, "grad_norm": 5.472198963165283, "learning_rate": 1.3777431795607318e-05, "loss": 0.0785, "step": 5600 }, { "epoch": 1.228646517739816, "grad_norm": 2.5994927883148193, "learning_rate": 1.375506752140535e-05, "loss": 0.0553, "step": 5610 }, { "epoch": 1.2308366184844504, "grad_norm": 3.8210840225219727, "learning_rate": 1.3732681361403413e-05, "loss": 0.0295, "step": 5620 }, { "epoch": 1.2330267192290845, "grad_norm": 0.3484339118003845, "learning_rate": 1.3710273446075596e-05, "loss": 0.1297, "step": 5630 }, { "epoch": 1.2352168199737188, "grad_norm": 0.5113843083381653, "learning_rate": 1.3687843906022796e-05, "loss": 0.0346, "step": 5640 }, { "epoch": 1.2374069207183531, "grad_norm": 0.9716311693191528, "learning_rate": 1.3665392871971928e-05, "loss": 0.0153, "step": 5650 }, { "epoch": 1.2395970214629872, "grad_norm": 12.424820899963379, "learning_rate": 1.3642920474775197e-05, "loss": 0.1213, "step": 5660 }, { "epoch": 1.2417871222076216, "grad_norm": 9.678114891052246, "learning_rate": 1.3620426845409304e-05, "loss": 0.0752, "step": 5670 }, { "epoch": 1.2439772229522559, "grad_norm": 5.078012466430664, "learning_rate": 1.3597912114974718e-05, "loss": 0.0964, "step": 5680 }, { "epoch": 1.24616732369689, "grad_norm": 5.309476375579834, "learning_rate": 1.3575376414694876e-05, "loss": 0.0501, "step": 5690 }, { "epoch": 1.2483574244415243, "grad_norm": 7.304897308349609, "learning_rate": 1.355281987591544e-05, "loss": 0.0577, "step": 5700 }, { "epoch": 1.2505475251861586, "grad_norm": 7.255605697631836, "learning_rate": 1.3530242630103525e-05, "loss": 0.0616, "step": 5710 }, { "epoch": 1.2527376259307927, "grad_norm": 10.257238388061523, "learning_rate": 1.3507644808846936e-05, "loss": 0.0535, "step": 5720 }, { "epoch": 1.254927726675427, "grad_norm": 0.9835705161094666, "learning_rate": 1.3485026543853397e-05, "loss": 0.1193, "step": 5730 }, { "epoch": 1.2571178274200614, "grad_norm": 2.9460463523864746, "learning_rate": 1.3462387966949781e-05, "loss": 0.1459, "step": 5740 }, { "epoch": 1.2593079281646955, "grad_norm": 14.589485168457031, "learning_rate": 1.3439729210081351e-05, "loss": 0.0712, "step": 5750 }, { "epoch": 1.2614980289093298, "grad_norm": 7.7696027755737305, "learning_rate": 1.3417050405310987e-05, "loss": 0.0713, "step": 5760 }, { "epoch": 1.2636881296539642, "grad_norm": 2.291095733642578, "learning_rate": 1.3394351684818409e-05, "loss": 0.1249, "step": 5770 }, { "epoch": 1.2658782303985983, "grad_norm": 5.799035549163818, "learning_rate": 1.3371633180899417e-05, "loss": 0.0533, "step": 5780 }, { "epoch": 1.2680683311432326, "grad_norm": 12.889134407043457, "learning_rate": 1.3348895025965112e-05, "loss": 0.041, "step": 5790 }, { "epoch": 1.270258431887867, "grad_norm": 7.134917736053467, "learning_rate": 1.3326137352541132e-05, "loss": 0.1477, "step": 5800 }, { "epoch": 1.272448532632501, "grad_norm": 10.878463745117188, "learning_rate": 1.3303360293266873e-05, "loss": 0.0761, "step": 5810 }, { "epoch": 1.2746386333771353, "grad_norm": 1.4681440591812134, "learning_rate": 1.3280563980894716e-05, "loss": 0.0554, "step": 5820 }, { "epoch": 1.2768287341217697, "grad_norm": 0.06866442412137985, "learning_rate": 1.325774854828926e-05, "loss": 0.0924, "step": 5830 }, { "epoch": 1.2790188348664038, "grad_norm": 14.703214645385742, "learning_rate": 1.3234914128426538e-05, "loss": 0.0691, "step": 5840 }, { "epoch": 1.281208935611038, "grad_norm": 9.91274356842041, "learning_rate": 1.3212060854393256e-05, "loss": 0.0727, "step": 5850 }, { "epoch": 1.2833990363556724, "grad_norm": 0.14547300338745117, "learning_rate": 1.3189188859385997e-05, "loss": 0.0173, "step": 5860 }, { "epoch": 1.2855891371003065, "grad_norm": 12.242005348205566, "learning_rate": 1.3166298276710466e-05, "loss": 0.03, "step": 5870 }, { "epoch": 1.2877792378449409, "grad_norm": 0.008704107254743576, "learning_rate": 1.3143389239780697e-05, "loss": 0.1059, "step": 5880 }, { "epoch": 1.2899693385895752, "grad_norm": 3.2513678073883057, "learning_rate": 1.3120461882118283e-05, "loss": 0.0664, "step": 5890 }, { "epoch": 1.2921594393342093, "grad_norm": 0.010061273351311684, "learning_rate": 1.30975163373516e-05, "loss": 0.0362, "step": 5900 }, { "epoch": 1.2943495400788436, "grad_norm": 8.55904483795166, "learning_rate": 1.3074552739215017e-05, "loss": 0.13, "step": 5910 }, { "epoch": 1.296539640823478, "grad_norm": 0.7941040992736816, "learning_rate": 1.3053870175877093e-05, "loss": 0.0743, "step": 5920 }, { "epoch": 1.298729741568112, "grad_norm": 9.431867599487305, "learning_rate": 1.3033173195359705e-05, "loss": 0.0634, "step": 5930 }, { "epoch": 1.3009198423127464, "grad_norm": 4.754759788513184, "learning_rate": 1.3010159760146932e-05, "loss": 0.0834, "step": 5940 }, { "epoch": 1.3031099430573807, "grad_norm": 0.22233600914478302, "learning_rate": 1.2987128780707856e-05, "loss": 0.0907, "step": 5950 }, { "epoch": 1.3053000438020148, "grad_norm": 9.772262573242188, "learning_rate": 1.2964080391274792e-05, "loss": 0.161, "step": 5960 }, { "epoch": 1.3074901445466491, "grad_norm": 1.0325156450271606, "learning_rate": 1.2941014726181527e-05, "loss": 0.0369, "step": 5970 }, { "epoch": 1.3096802452912835, "grad_norm": 0.26427239179611206, "learning_rate": 1.2917931919862534e-05, "loss": 0.0422, "step": 5980 }, { "epoch": 1.3118703460359176, "grad_norm": 10.44315242767334, "learning_rate": 1.28948321068522e-05, "loss": 0.0478, "step": 5990 }, { "epoch": 1.314060446780552, "grad_norm": 9.725469589233398, "learning_rate": 1.2871715421784022e-05, "loss": 0.1069, "step": 6000 }, { "epoch": 1.3162505475251862, "grad_norm": 9.662418365478516, "learning_rate": 1.2848581999389835e-05, "loss": 0.1069, "step": 6010 }, { "epoch": 1.3184406482698203, "grad_norm": 8.185514450073242, "learning_rate": 1.282543197449903e-05, "loss": 0.0737, "step": 6020 }, { "epoch": 1.3206307490144547, "grad_norm": 11.130629539489746, "learning_rate": 1.2802265482037758e-05, "loss": 0.1668, "step": 6030 }, { "epoch": 1.322820849759089, "grad_norm": 5.510688304901123, "learning_rate": 1.2779082657028153e-05, "loss": 0.0406, "step": 6040 }, { "epoch": 1.325010950503723, "grad_norm": 0.8124195337295532, "learning_rate": 1.275588363458753e-05, "loss": 0.0846, "step": 6050 }, { "epoch": 1.3272010512483574, "grad_norm": 0.06429574638605118, "learning_rate": 1.273266854992763e-05, "loss": 0.0511, "step": 6060 }, { "epoch": 1.3293911519929917, "grad_norm": 8.23216724395752, "learning_rate": 1.2709437538353782e-05, "loss": 0.0985, "step": 6070 }, { "epoch": 1.3315812527376258, "grad_norm": 0.21902796626091003, "learning_rate": 1.2686190735264165e-05, "loss": 0.0557, "step": 6080 }, { "epoch": 1.3337713534822602, "grad_norm": 11.643610954284668, "learning_rate": 1.2662928276148985e-05, "loss": 0.1563, "step": 6090 }, { "epoch": 1.3359614542268945, "grad_norm": 5.946606159210205, "learning_rate": 1.2639650296589698e-05, "loss": 0.0747, "step": 6100 }, { "epoch": 1.3381515549715286, "grad_norm": 7.202443599700928, "learning_rate": 1.2616356932258222e-05, "loss": 0.1359, "step": 6110 }, { "epoch": 1.340341655716163, "grad_norm": 1.4773874282836914, "learning_rate": 1.2593048318916142e-05, "loss": 0.0504, "step": 6120 }, { "epoch": 1.3425317564607973, "grad_norm": 5.321267604827881, "learning_rate": 1.2569724592413912e-05, "loss": 0.1133, "step": 6130 }, { "epoch": 1.3447218572054314, "grad_norm": 7.59853458404541, "learning_rate": 1.2546385888690083e-05, "loss": 0.0721, "step": 6140 }, { "epoch": 1.3469119579500657, "grad_norm": 9.034475326538086, "learning_rate": 1.2523032343770484e-05, "loss": 0.0753, "step": 6150 }, { "epoch": 1.3491020586947, "grad_norm": 19.85488510131836, "learning_rate": 1.2499664093767458e-05, "loss": 0.0573, "step": 6160 }, { "epoch": 1.3512921594393341, "grad_norm": 13.950851440429688, "learning_rate": 1.2476281274879045e-05, "loss": 0.0636, "step": 6170 }, { "epoch": 1.3534822601839684, "grad_norm": 0.1767183393239975, "learning_rate": 1.2452884023388196e-05, "loss": 0.0655, "step": 6180 }, { "epoch": 1.3556723609286028, "grad_norm": 10.384352684020996, "learning_rate": 1.242947247566199e-05, "loss": 0.075, "step": 6190 }, { "epoch": 1.3578624616732369, "grad_norm": 6.867451190948486, "learning_rate": 1.240604676815082e-05, "loss": 0.1206, "step": 6200 }, { "epoch": 1.3600525624178712, "grad_norm": 0.6172086000442505, "learning_rate": 1.238260703738761e-05, "loss": 0.1004, "step": 6210 }, { "epoch": 1.3622426631625055, "grad_norm": 11.253986358642578, "learning_rate": 1.2359153419987013e-05, "loss": 0.0763, "step": 6220 }, { "epoch": 1.3644327639071396, "grad_norm": 3.181884765625, "learning_rate": 1.2335686052644629e-05, "loss": 0.1107, "step": 6230 }, { "epoch": 1.366622864651774, "grad_norm": 9.615938186645508, "learning_rate": 1.2312205072136184e-05, "loss": 0.0437, "step": 6240 }, { "epoch": 1.3688129653964083, "grad_norm": 0.24824906885623932, "learning_rate": 1.2288710615316755e-05, "loss": 0.1091, "step": 6250 }, { "epoch": 1.3710030661410424, "grad_norm": 2.3654232025146484, "learning_rate": 1.2265202819119955e-05, "loss": 0.084, "step": 6260 }, { "epoch": 1.3731931668856767, "grad_norm": 0.7973864674568176, "learning_rate": 1.2241681820557153e-05, "loss": 0.0568, "step": 6270 }, { "epoch": 1.375383267630311, "grad_norm": 7.021385192871094, "learning_rate": 1.2218147756716661e-05, "loss": 0.1211, "step": 6280 }, { "epoch": 1.3775733683749452, "grad_norm": 1.0289604663848877, "learning_rate": 1.2194600764762941e-05, "loss": 0.0619, "step": 6290 }, { "epoch": 1.3797634691195795, "grad_norm": 0.3207782804965973, "learning_rate": 1.21710409819358e-05, "loss": 0.0425, "step": 6300 }, { "epoch": 1.3819535698642138, "grad_norm": 10.054275512695312, "learning_rate": 1.21474685455496e-05, "loss": 0.092, "step": 6310 }, { "epoch": 1.384143670608848, "grad_norm": 3.0174367427825928, "learning_rate": 1.2123883592992451e-05, "loss": 0.0428, "step": 6320 }, { "epoch": 1.3863337713534822, "grad_norm": 10.506322860717773, "learning_rate": 1.2100286261725412e-05, "loss": 0.0738, "step": 6330 }, { "epoch": 1.3885238720981166, "grad_norm": 11.201553344726562, "learning_rate": 1.2076676689281685e-05, "loss": 0.1255, "step": 6340 }, { "epoch": 1.3907139728427507, "grad_norm": 0.4540427625179291, "learning_rate": 1.2053055013265827e-05, "loss": 0.0376, "step": 6350 }, { "epoch": 1.392904073587385, "grad_norm": 0.5703373551368713, "learning_rate": 1.2029421371352923e-05, "loss": 0.1042, "step": 6360 }, { "epoch": 1.3950941743320193, "grad_norm": 0.08392885327339172, "learning_rate": 1.2005775901287818e-05, "loss": 0.0944, "step": 6370 }, { "epoch": 1.3972842750766534, "grad_norm": 5.717464447021484, "learning_rate": 1.1982118740884278e-05, "loss": 0.0591, "step": 6380 }, { "epoch": 1.3994743758212878, "grad_norm": 0.48545020818710327, "learning_rate": 1.1958450028024223e-05, "loss": 0.0909, "step": 6390 }, { "epoch": 1.401664476565922, "grad_norm": 10.520610809326172, "learning_rate": 1.1934769900656884e-05, "loss": 0.1327, "step": 6400 }, { "epoch": 1.4038545773105562, "grad_norm": 4.346755027770996, "learning_rate": 1.1911078496798036e-05, "loss": 0.063, "step": 6410 }, { "epoch": 1.4060446780551905, "grad_norm": 10.559310913085938, "learning_rate": 1.1887375954529167e-05, "loss": 0.036, "step": 6420 }, { "epoch": 1.4082347787998248, "grad_norm": 8.93211555480957, "learning_rate": 1.1863662411996692e-05, "loss": 0.0585, "step": 6430 }, { "epoch": 1.410424879544459, "grad_norm": 8.109956741333008, "learning_rate": 1.1839938007411132e-05, "loss": 0.1599, "step": 6440 }, { "epoch": 1.4126149802890933, "grad_norm": 8.202881813049316, "learning_rate": 1.1816202879046316e-05, "loss": 0.0696, "step": 6450 }, { "epoch": 1.4148050810337276, "grad_norm": 8.988107681274414, "learning_rate": 1.1792457165238583e-05, "loss": 0.0625, "step": 6460 }, { "epoch": 1.4169951817783617, "grad_norm": 16.47657012939453, "learning_rate": 1.1768701004385954e-05, "loss": 0.0925, "step": 6470 }, { "epoch": 1.419185282522996, "grad_norm": 10.974760055541992, "learning_rate": 1.1744934534947354e-05, "loss": 0.0365, "step": 6480 }, { "epoch": 1.4213753832676304, "grad_norm": 17.555282592773438, "learning_rate": 1.1721157895441778e-05, "loss": 0.0968, "step": 6490 }, { "epoch": 1.4235654840122645, "grad_norm": 0.2904272675514221, "learning_rate": 1.16973712244475e-05, "loss": 0.0697, "step": 6500 }, { "epoch": 1.4257555847568988, "grad_norm": 0.36571285128593445, "learning_rate": 1.1673574660601265e-05, "loss": 0.1098, "step": 6510 }, { "epoch": 1.4279456855015331, "grad_norm": 8.44431209564209, "learning_rate": 1.1649768342597469e-05, "loss": 0.0442, "step": 6520 }, { "epoch": 1.4301357862461672, "grad_norm": 0.23281489312648773, "learning_rate": 1.1625952409187362e-05, "loss": 0.0606, "step": 6530 }, { "epoch": 1.4323258869908015, "grad_norm": 0.8732020854949951, "learning_rate": 1.1602126999178238e-05, "loss": 0.0789, "step": 6540 }, { "epoch": 1.4345159877354359, "grad_norm": 1.8265255689620972, "learning_rate": 1.1578292251432622e-05, "loss": 0.0203, "step": 6550 }, { "epoch": 1.43670608848007, "grad_norm": 9.862552642822266, "learning_rate": 1.1554448304867463e-05, "loss": 0.0908, "step": 6560 }, { "epoch": 1.4388961892247043, "grad_norm": 1.6325030326843262, "learning_rate": 1.153059529845332e-05, "loss": 0.0389, "step": 6570 }, { "epoch": 1.4410862899693386, "grad_norm": 2.9630517959594727, "learning_rate": 1.1506733371213566e-05, "loss": 0.0496, "step": 6580 }, { "epoch": 1.4432763907139727, "grad_norm": 11.324121475219727, "learning_rate": 1.1482862662223552e-05, "loss": 0.0672, "step": 6590 }, { "epoch": 1.445466491458607, "grad_norm": 8.368335723876953, "learning_rate": 1.1458983310609828e-05, "loss": 0.1125, "step": 6600 }, { "epoch": 1.4476565922032414, "grad_norm": 0.2716255784034729, "learning_rate": 1.1435095455549303e-05, "loss": 0.0754, "step": 6610 }, { "epoch": 1.4498466929478755, "grad_norm": 9.818949699401855, "learning_rate": 1.141119923626846e-05, "loss": 0.1538, "step": 6620 }, { "epoch": 1.4520367936925098, "grad_norm": 8.167736053466797, "learning_rate": 1.1387294792042515e-05, "loss": 0.0326, "step": 6630 }, { "epoch": 1.4542268944371441, "grad_norm": 0.1539860963821411, "learning_rate": 1.136338226219464e-05, "loss": 0.0468, "step": 6640 }, { "epoch": 1.4564169951817783, "grad_norm": 0.13053065538406372, "learning_rate": 1.133946178609512e-05, "loss": 0.0349, "step": 6650 }, { "epoch": 1.4586070959264126, "grad_norm": 0.11723353713750839, "learning_rate": 1.1315533503160556e-05, "loss": 0.0561, "step": 6660 }, { "epoch": 1.460797196671047, "grad_norm": 3.0153372287750244, "learning_rate": 1.1291597552853054e-05, "loss": 0.0396, "step": 6670 }, { "epoch": 1.462987297415681, "grad_norm": 0.018486522138118744, "learning_rate": 1.12676540746794e-05, "loss": 0.0284, "step": 6680 }, { "epoch": 1.4651773981603153, "grad_norm": 16.320417404174805, "learning_rate": 1.1243703208190267e-05, "loss": 0.0579, "step": 6690 }, { "epoch": 1.4673674989049497, "grad_norm": 3.616212844848633, "learning_rate": 1.1219745092979378e-05, "loss": 0.1045, "step": 6700 }, { "epoch": 1.4695575996495838, "grad_norm": 6.374945640563965, "learning_rate": 1.1195779868682708e-05, "loss": 0.0905, "step": 6710 }, { "epoch": 1.471747700394218, "grad_norm": 2.141786575317383, "learning_rate": 1.1171807674977668e-05, "loss": 0.0548, "step": 6720 }, { "epoch": 1.4739378011388524, "grad_norm": 3.0836243629455566, "learning_rate": 1.1147828651582294e-05, "loss": 0.0781, "step": 6730 }, { "epoch": 1.4761279018834865, "grad_norm": 1.3645882606506348, "learning_rate": 1.1123842938254413e-05, "loss": 0.0765, "step": 6740 }, { "epoch": 1.4783180026281209, "grad_norm": 7.577390670776367, "learning_rate": 1.1099850674790856e-05, "loss": 0.031, "step": 6750 }, { "epoch": 1.4805081033727552, "grad_norm": 5.7059431076049805, "learning_rate": 1.1075852001026623e-05, "loss": 0.1, "step": 6760 }, { "epoch": 1.4826982041173893, "grad_norm": 8.23361873626709, "learning_rate": 1.1051847056834085e-05, "loss": 0.0577, "step": 6770 }, { "epoch": 1.4848883048620236, "grad_norm": 0.1769704818725586, "learning_rate": 1.1027835982122146e-05, "loss": 0.1202, "step": 6780 }, { "epoch": 1.487078405606658, "grad_norm": 15.7285737991333, "learning_rate": 1.1003818916835449e-05, "loss": 0.0917, "step": 6790 }, { "epoch": 1.489268506351292, "grad_norm": 0.0496988408267498, "learning_rate": 1.0979796000953556e-05, "loss": 0.054, "step": 6800 }, { "epoch": 1.4914586070959264, "grad_norm": 17.40839385986328, "learning_rate": 1.0955767374490116e-05, "loss": 0.0851, "step": 6810 }, { "epoch": 1.4936487078405607, "grad_norm": 13.793992042541504, "learning_rate": 1.093173317749207e-05, "loss": 0.1074, "step": 6820 }, { "epoch": 1.4958388085851948, "grad_norm": 2.2975621223449707, "learning_rate": 1.0907693550038825e-05, "loss": 0.0976, "step": 6830 }, { "epoch": 1.4980289093298291, "grad_norm": 5.519827365875244, "learning_rate": 1.088364863224144e-05, "loss": 0.072, "step": 6840 }, { "epoch": 1.5002190100744635, "grad_norm": 9.0175142288208, "learning_rate": 1.08595985642418e-05, "loss": 0.1089, "step": 6850 }, { "epoch": 1.5024091108190976, "grad_norm": 14.090483665466309, "learning_rate": 1.0835543486211815e-05, "loss": 0.0497, "step": 6860 }, { "epoch": 1.5045992115637319, "grad_norm": 0.2074485868215561, "learning_rate": 1.0811483538352594e-05, "loss": 0.0732, "step": 6870 }, { "epoch": 1.5067893123083662, "grad_norm": 11.074170112609863, "learning_rate": 1.0787418860893624e-05, "loss": 0.1342, "step": 6880 }, { "epoch": 1.5089794130530003, "grad_norm": 0.490258127450943, "learning_rate": 1.0763349594091967e-05, "loss": 0.1154, "step": 6890 }, { "epoch": 1.5111695137976346, "grad_norm": 5.131935119628906, "learning_rate": 1.0739275878231423e-05, "loss": 0.0562, "step": 6900 }, { "epoch": 1.513359614542269, "grad_norm": 10.276161193847656, "learning_rate": 1.071519785362173e-05, "loss": 0.0754, "step": 6910 }, { "epoch": 1.515549715286903, "grad_norm": 8.828187942504883, "learning_rate": 1.0691115660597736e-05, "loss": 0.0261, "step": 6920 }, { "epoch": 1.5177398160315374, "grad_norm": 1.5734862089157104, "learning_rate": 1.0667029439518578e-05, "loss": 0.1003, "step": 6930 }, { "epoch": 1.5199299167761717, "grad_norm": 6.691889762878418, "learning_rate": 1.0645348512585945e-05, "loss": 0.057, "step": 6940 }, { "epoch": 1.5221200175208058, "grad_norm": 6.366805076599121, "learning_rate": 1.062366453899291e-05, "loss": 0.1382, "step": 6950 }, { "epoch": 1.5243101182654402, "grad_norm": 10.088261604309082, "learning_rate": 1.0599567786266654e-05, "loss": 0.0522, "step": 6960 }, { "epoch": 1.5265002190100745, "grad_norm": 7.632128715515137, "learning_rate": 1.0575467539057125e-05, "loss": 0.1452, "step": 6970 }, { "epoch": 1.5286903197547086, "grad_norm": 0.03694286569952965, "learning_rate": 1.0551363937828693e-05, "loss": 0.0607, "step": 6980 }, { "epoch": 1.530880420499343, "grad_norm": 0.9921841621398926, "learning_rate": 1.0527257123065275e-05, "loss": 0.0289, "step": 6990 }, { "epoch": 1.5330705212439772, "grad_norm": 0.3752399682998657, "learning_rate": 1.050314723526952e-05, "loss": 0.0659, "step": 7000 }, { "epoch": 1.5352606219886114, "grad_norm": 8.542110443115234, "learning_rate": 1.0479034414961983e-05, "loss": 0.134, "step": 7010 }, { "epoch": 1.5374507227332457, "grad_norm": 13.881056785583496, "learning_rate": 1.0454918802680316e-05, "loss": 0.1176, "step": 7020 }, { "epoch": 1.53964082347788, "grad_norm": 0.03836442530155182, "learning_rate": 1.0430800538978437e-05, "loss": 0.0576, "step": 7030 }, { "epoch": 1.541830924222514, "grad_norm": 4.362994194030762, "learning_rate": 1.0406679764425723e-05, "loss": 0.023, "step": 7040 }, { "epoch": 1.5440210249671484, "grad_norm": 5.992679119110107, "learning_rate": 1.0382556619606184e-05, "loss": 0.0918, "step": 7050 }, { "epoch": 1.5462111257117828, "grad_norm": 22.82040786743164, "learning_rate": 1.0358431245117642e-05, "loss": 0.101, "step": 7060 }, { "epoch": 1.5484012264564169, "grad_norm": 9.083914756774902, "learning_rate": 1.0334303781570918e-05, "loss": 0.0693, "step": 7070 }, { "epoch": 1.5505913272010512, "grad_norm": 1.3297282457351685, "learning_rate": 1.0310174369589005e-05, "loss": 0.1178, "step": 7080 }, { "epoch": 1.5527814279456855, "grad_norm": 7.009533405303955, "learning_rate": 1.0286043149806258e-05, "loss": 0.0511, "step": 7090 }, { "epoch": 1.5549715286903196, "grad_norm": 0.3763047754764557, "learning_rate": 1.0261910262867556e-05, "loss": 0.1001, "step": 7100 }, { "epoch": 1.557161629434954, "grad_norm": 5.370127201080322, "learning_rate": 1.0237775849427512e-05, "loss": 0.0573, "step": 7110 }, { "epoch": 1.5593517301795883, "grad_norm": 4.698090076446533, "learning_rate": 1.0213640050149625e-05, "loss": 0.117, "step": 7120 }, { "epoch": 1.5615418309242224, "grad_norm": 0.7446310520172119, "learning_rate": 1.0189503005705473e-05, "loss": 0.1194, "step": 7130 }, { "epoch": 1.563731931668857, "grad_norm": 11.536616325378418, "learning_rate": 1.016536485677389e-05, "loss": 0.031, "step": 7140 }, { "epoch": 1.565922032413491, "grad_norm": 4.547265529632568, "learning_rate": 1.0141225744040148e-05, "loss": 0.0476, "step": 7150 }, { "epoch": 1.5681121331581251, "grad_norm": 4.5376973152160645, "learning_rate": 1.0117085808195142e-05, "loss": 0.0612, "step": 7160 }, { "epoch": 1.5703022339027597, "grad_norm": 2.737612724304199, "learning_rate": 1.0092945189934558e-05, "loss": 0.1175, "step": 7170 }, { "epoch": 1.5724923346473938, "grad_norm": 8.227771759033203, "learning_rate": 1.0068804029958056e-05, "loss": 0.0536, "step": 7180 }, { "epoch": 1.574682435392028, "grad_norm": 5.110659599304199, "learning_rate": 1.0044662468968465e-05, "loss": 0.0252, "step": 7190 }, { "epoch": 1.5768725361366625, "grad_norm": 4.364129543304443, "learning_rate": 1.002052064767094e-05, "loss": 0.0917, "step": 7200 }, { "epoch": 1.5790626368812966, "grad_norm": 6.832279682159424, "learning_rate": 9.996378706772161e-06, "loss": 0.1093, "step": 7210 }, { "epoch": 1.5812527376259307, "grad_norm": 4.476935863494873, "learning_rate": 9.9722367869795e-06, "loss": 0.0149, "step": 7220 }, { "epoch": 1.5834428383705652, "grad_norm": 0.2782779633998871, "learning_rate": 9.948095029000208e-06, "loss": 0.0745, "step": 7230 }, { "epoch": 1.5856329391151993, "grad_norm": 1.4107962846755981, "learning_rate": 9.923953573540593e-06, "loss": 0.0486, "step": 7240 }, { "epoch": 1.5878230398598334, "grad_norm": 0.8028163313865662, "learning_rate": 9.8998125613052e-06, "loss": 0.0867, "step": 7250 }, { "epoch": 1.590013140604468, "grad_norm": 0.3211402893066406, "learning_rate": 9.875672132995985e-06, "loss": 0.041, "step": 7260 }, { "epoch": 1.592203241349102, "grad_norm": 1.3429604768753052, "learning_rate": 9.851532429311514e-06, "loss": 0.1443, "step": 7270 }, { "epoch": 1.5943933420937362, "grad_norm": 0.03590783476829529, "learning_rate": 9.827393590946116e-06, "loss": 0.0849, "step": 7280 }, { "epoch": 1.5965834428383707, "grad_norm": 0.2529953420162201, "learning_rate": 9.803255758589086e-06, "loss": 0.1087, "step": 7290 }, { "epoch": 1.5987735435830048, "grad_norm": 12.62936019897461, "learning_rate": 9.77911907292385e-06, "loss": 0.0933, "step": 7300 }, { "epoch": 1.600963644327639, "grad_norm": 2.5823099613189697, "learning_rate": 9.75498367462715e-06, "loss": 0.077, "step": 7310 }, { "epoch": 1.6031537450722735, "grad_norm": 3.3571252822875977, "learning_rate": 9.730849704368232e-06, "loss": 0.0779, "step": 7320 }, { "epoch": 1.6053438458169076, "grad_norm": 14.581657409667969, "learning_rate": 9.70671730280801e-06, "loss": 0.1422, "step": 7330 }, { "epoch": 1.6075339465615417, "grad_norm": 3.475109577178955, "learning_rate": 9.682586610598263e-06, "loss": 0.0277, "step": 7340 }, { "epoch": 1.6097240473061762, "grad_norm": 1.535677194595337, "learning_rate": 9.658457768380802e-06, "loss": 0.0428, "step": 7350 }, { "epoch": 1.6119141480508103, "grad_norm": 11.305051803588867, "learning_rate": 9.634330916786657e-06, "loss": 0.1094, "step": 7360 }, { "epoch": 1.6141042487954445, "grad_norm": 5.779110431671143, "learning_rate": 9.610206196435255e-06, "loss": 0.0901, "step": 7370 }, { "epoch": 1.616294349540079, "grad_norm": 0.34296348690986633, "learning_rate": 9.586083747933604e-06, "loss": 0.0624, "step": 7380 }, { "epoch": 1.618484450284713, "grad_norm": 5.306641578674316, "learning_rate": 9.56196371187547e-06, "loss": 0.0287, "step": 7390 }, { "epoch": 1.6206745510293472, "grad_norm": 9.704771995544434, "learning_rate": 9.537846228840556e-06, "loss": 0.0956, "step": 7400 }, { "epoch": 1.6228646517739818, "grad_norm": 3.5721728801727295, "learning_rate": 9.513731439393691e-06, "loss": 0.0415, "step": 7410 }, { "epoch": 1.6250547525186159, "grad_norm": 1.2056152820587158, "learning_rate": 9.489619484083998e-06, "loss": 0.0784, "step": 7420 }, { "epoch": 1.62724485326325, "grad_norm": 9.83515739440918, "learning_rate": 9.465510503444082e-06, "loss": 0.0552, "step": 7430 }, { "epoch": 1.6294349540078845, "grad_norm": 0.06283877044916153, "learning_rate": 9.441404637989219e-06, "loss": 0.0643, "step": 7440 }, { "epoch": 1.6316250547525186, "grad_norm": 0.9084874391555786, "learning_rate": 9.41730202821652e-06, "loss": 0.0309, "step": 7450 }, { "epoch": 1.6338151554971527, "grad_norm": 7.613016605377197, "learning_rate": 9.393202814604125e-06, "loss": 0.0391, "step": 7460 }, { "epoch": 1.6360052562417873, "grad_norm": 9.826533317565918, "learning_rate": 9.369107137610378e-06, "loss": 0.049, "step": 7470 }, { "epoch": 1.6381953569864214, "grad_norm": 0.9197579622268677, "learning_rate": 9.34501513767301e-06, "loss": 0.0876, "step": 7480 }, { "epoch": 1.6403854577310555, "grad_norm": 9.174433708190918, "learning_rate": 9.320926955208323e-06, "loss": 0.0683, "step": 7490 }, { "epoch": 1.64257555847569, "grad_norm": 6.790747165679932, "learning_rate": 9.29684273061037e-06, "loss": 0.1566, "step": 7500 }, { "epoch": 1.6447656592203241, "grad_norm": 0.16844797134399414, "learning_rate": 9.272762604250131e-06, "loss": 0.0208, "step": 7510 }, { "epoch": 1.6469557599649582, "grad_norm": 0.897287130355835, "learning_rate": 9.24868671647471e-06, "loss": 0.0712, "step": 7520 }, { "epoch": 1.6491458607095928, "grad_norm": 0.37365949153900146, "learning_rate": 9.224615207606494e-06, "loss": 0.0645, "step": 7530 }, { "epoch": 1.651335961454227, "grad_norm": 6.627313613891602, "learning_rate": 9.200548217942357e-06, "loss": 0.0646, "step": 7540 }, { "epoch": 1.653526062198861, "grad_norm": 0.3961053490638733, "learning_rate": 9.176485887752835e-06, "loss": 0.0689, "step": 7550 }, { "epoch": 1.6557161629434956, "grad_norm": 2.6070027351379395, "learning_rate": 9.1524283572813e-06, "loss": 0.0908, "step": 7560 }, { "epoch": 1.6579062636881297, "grad_norm": 0.02399611845612526, "learning_rate": 9.128375766743155e-06, "loss": 0.0759, "step": 7570 }, { "epoch": 1.6600963644327638, "grad_norm": 4.420040130615234, "learning_rate": 9.104328256325011e-06, "loss": 0.0804, "step": 7580 }, { "epoch": 1.6622864651773983, "grad_norm": 0.48654696345329285, "learning_rate": 9.080285966183865e-06, "loss": 0.0774, "step": 7590 }, { "epoch": 1.6644765659220324, "grad_norm": 16.966053009033203, "learning_rate": 9.056249036446297e-06, "loss": 0.1406, "step": 7600 }, { "epoch": 1.6666666666666665, "grad_norm": 5.185943603515625, "learning_rate": 9.03221760720764e-06, "loss": 0.0595, "step": 7610 }, { "epoch": 1.668856767411301, "grad_norm": 0.5483208894729614, "learning_rate": 9.008191818531166e-06, "loss": 0.0376, "step": 7620 }, { "epoch": 1.6710468681559352, "grad_norm": 9.714131355285645, "learning_rate": 8.984171810447275e-06, "loss": 0.086, "step": 7630 }, { "epoch": 1.6732369689005693, "grad_norm": 7.819004058837891, "learning_rate": 8.960157722952677e-06, "loss": 0.058, "step": 7640 }, { "epoch": 1.6754270696452038, "grad_norm": 12.197850227355957, "learning_rate": 8.936149696009571e-06, "loss": 0.1003, "step": 7650 }, { "epoch": 1.677617170389838, "grad_norm": 9.457324028015137, "learning_rate": 8.912147869544838e-06, "loss": 0.046, "step": 7660 }, { "epoch": 1.679807271134472, "grad_norm": 4.650213241577148, "learning_rate": 8.888152383449214e-06, "loss": 0.0771, "step": 7670 }, { "epoch": 1.6819973718791066, "grad_norm": 0.0636896938085556, "learning_rate": 8.864163377576487e-06, "loss": 0.0613, "step": 7680 }, { "epoch": 1.6841874726237407, "grad_norm": 3.5382239818573, "learning_rate": 8.840180991742672e-06, "loss": 0.0642, "step": 7690 }, { "epoch": 1.6863775733683748, "grad_norm": 13.423056602478027, "learning_rate": 8.816205365725204e-06, "loss": 0.0507, "step": 7700 }, { "epoch": 1.6885676741130093, "grad_norm": 0.07884453982114792, "learning_rate": 8.792236639262117e-06, "loss": 0.0616, "step": 7710 }, { "epoch": 1.6907577748576434, "grad_norm": 12.510711669921875, "learning_rate": 8.768274952051234e-06, "loss": 0.1113, "step": 7720 }, { "epoch": 1.6929478756022776, "grad_norm": 1.9943735599517822, "learning_rate": 8.744320443749346e-06, "loss": 0.0266, "step": 7730 }, { "epoch": 1.695137976346912, "grad_norm": 2.2527310848236084, "learning_rate": 8.72037325397141e-06, "loss": 0.0725, "step": 7740 }, { "epoch": 1.6973280770915462, "grad_norm": 10.473712921142578, "learning_rate": 8.696433522289721e-06, "loss": 0.123, "step": 7750 }, { "epoch": 1.6995181778361803, "grad_norm": 0.6873182654380798, "learning_rate": 8.67250138823311e-06, "loss": 0.0276, "step": 7760 }, { "epoch": 1.7017082785808149, "grad_norm": 8.880640983581543, "learning_rate": 8.648576991286131e-06, "loss": 0.0906, "step": 7770 }, { "epoch": 1.703898379325449, "grad_norm": 2.360377311706543, "learning_rate": 8.624660470888232e-06, "loss": 0.0373, "step": 7780 }, { "epoch": 1.706088480070083, "grad_norm": 9.835999488830566, "learning_rate": 8.600751966432963e-06, "loss": 0.0499, "step": 7790 }, { "epoch": 1.7082785808147176, "grad_norm": 4.022478103637695, "learning_rate": 8.576851617267151e-06, "loss": 0.1111, "step": 7800 }, { "epoch": 1.7104686815593517, "grad_norm": 6.333062648773193, "learning_rate": 8.552959562690091e-06, "loss": 0.0577, "step": 7810 }, { "epoch": 1.7126587823039858, "grad_norm": 18.277101516723633, "learning_rate": 8.529075941952736e-06, "loss": 0.126, "step": 7820 }, { "epoch": 1.7148488830486204, "grad_norm": 0.35233399271965027, "learning_rate": 8.505200894256882e-06, "loss": 0.0358, "step": 7830 }, { "epoch": 1.7170389837932545, "grad_norm": 0.5816495418548584, "learning_rate": 8.48133455875436e-06, "loss": 0.0707, "step": 7840 }, { "epoch": 1.7192290845378886, "grad_norm": 0.2614850103855133, "learning_rate": 8.457477074546221e-06, "loss": 0.0491, "step": 7850 }, { "epoch": 1.7214191852825231, "grad_norm": 10.311013221740723, "learning_rate": 8.433628580681933e-06, "loss": 0.0891, "step": 7860 }, { "epoch": 1.7236092860271572, "grad_norm": 0.02627030946314335, "learning_rate": 8.409789216158556e-06, "loss": 0.0965, "step": 7870 }, { "epoch": 1.7257993867717913, "grad_norm": 0.6106966137886047, "learning_rate": 8.38595911991995e-06, "loss": 0.0701, "step": 7880 }, { "epoch": 1.727989487516426, "grad_norm": 6.417677402496338, "learning_rate": 8.362138430855955e-06, "loss": 0.0396, "step": 7890 }, { "epoch": 1.73017958826106, "grad_norm": 2.7876412868499756, "learning_rate": 8.338327287801575e-06, "loss": 0.099, "step": 7900 }, { "epoch": 1.732369689005694, "grad_norm": 5.569888114929199, "learning_rate": 8.314525829536186e-06, "loss": 0.0707, "step": 7910 }, { "epoch": 1.7345597897503287, "grad_norm": 3.60909366607666, "learning_rate": 8.290734194782712e-06, "loss": 0.071, "step": 7920 }, { "epoch": 1.7367498904949628, "grad_norm": 9.758267402648926, "learning_rate": 8.266952522206828e-06, "loss": 0.1356, "step": 7930 }, { "epoch": 1.7389399912395969, "grad_norm": 0.2731553614139557, "learning_rate": 8.243180950416142e-06, "loss": 0.029, "step": 7940 }, { "epoch": 1.7411300919842314, "grad_norm": 3.2966954708099365, "learning_rate": 8.224171058655684e-06, "loss": 0.0732, "step": 7950 }, { "epoch": 1.7433201927288655, "grad_norm": 1.3530393838882446, "learning_rate": 8.200418017381094e-06, "loss": 0.0182, "step": 7960 }, { "epoch": 1.7455102934734996, "grad_norm": 0.46221843361854553, "learning_rate": 8.176675464677232e-06, "loss": 0.0958, "step": 7970 }, { "epoch": 1.7477003942181342, "grad_norm": 0.017537014558911324, "learning_rate": 8.152943538923707e-06, "loss": 0.0657, "step": 7980 }, { "epoch": 1.7498904949627683, "grad_norm": 0.5254737734794617, "learning_rate": 8.129222378438183e-06, "loss": 0.0902, "step": 7990 }, { "epoch": 1.7520805957074024, "grad_norm": 9.099427223205566, "learning_rate": 8.105512121475587e-06, "loss": 0.0708, "step": 8000 }, { "epoch": 1.754270696452037, "grad_norm": 0.20789888501167297, "learning_rate": 8.081812906227291e-06, "loss": 0.1043, "step": 8010 }, { "epoch": 1.756460797196671, "grad_norm": 0.08837087452411652, "learning_rate": 8.058124870820315e-06, "loss": 0.0523, "step": 8020 }, { "epoch": 1.7586508979413051, "grad_norm": 3.2547051906585693, "learning_rate": 8.034448153316516e-06, "loss": 0.0373, "step": 8030 }, { "epoch": 1.7608409986859397, "grad_norm": 0.04665054753422737, "learning_rate": 8.010782891711793e-06, "loss": 0.0429, "step": 8040 }, { "epoch": 1.7630310994305738, "grad_norm": 7.060179710388184, "learning_rate": 7.987129223935268e-06, "loss": 0.0724, "step": 8050 }, { "epoch": 1.765221200175208, "grad_norm": 6.897003173828125, "learning_rate": 7.9634872878485e-06, "loss": 0.0597, "step": 8060 }, { "epoch": 1.7674113009198424, "grad_norm": 0.8591756820678711, "learning_rate": 7.93985722124466e-06, "loss": 0.073, "step": 8070 }, { "epoch": 1.7696014016644765, "grad_norm": 9.962007522583008, "learning_rate": 7.91623916184775e-06, "loss": 0.0838, "step": 8080 }, { "epoch": 1.7717915024091109, "grad_norm": 0.681604266166687, "learning_rate": 7.89263324731178e-06, "loss": 0.0649, "step": 8090 }, { "epoch": 1.7739816031537452, "grad_norm": 9.29695987701416, "learning_rate": 7.86903961521999e-06, "loss": 0.1114, "step": 8100 }, { "epoch": 1.7761717038983793, "grad_norm": 5.584187984466553, "learning_rate": 7.84545840308402e-06, "loss": 0.0826, "step": 8110 }, { "epoch": 1.7783618046430136, "grad_norm": 0.07980848848819733, "learning_rate": 7.821889748343126e-06, "loss": 0.0174, "step": 8120 }, { "epoch": 1.780551905387648, "grad_norm": 0.051824361085891724, "learning_rate": 7.79833378836338e-06, "loss": 0.0249, "step": 8130 }, { "epoch": 1.782742006132282, "grad_norm": 3.716097354888916, "learning_rate": 7.774790660436857e-06, "loss": 0.0373, "step": 8140 }, { "epoch": 1.7849321068769164, "grad_norm": 1.7128194570541382, "learning_rate": 7.751260501780853e-06, "loss": 0.0602, "step": 8150 }, { "epoch": 1.7871222076215507, "grad_norm": 0.30199313163757324, "learning_rate": 7.727743449537065e-06, "loss": 0.0555, "step": 8160 }, { "epoch": 1.7893123083661848, "grad_norm": 13.441886901855469, "learning_rate": 7.704239640770806e-06, "loss": 0.1028, "step": 8170 }, { "epoch": 1.7915024091108191, "grad_norm": 6.350060939788818, "learning_rate": 7.6807492124702e-06, "loss": 0.0288, "step": 8180 }, { "epoch": 1.7936925098554535, "grad_norm": 0.061654288321733475, "learning_rate": 7.657272301545385e-06, "loss": 0.045, "step": 8190 }, { "epoch": 1.7958826106000876, "grad_norm": 6.74562406539917, "learning_rate": 7.633809044827715e-06, "loss": 0.0659, "step": 8200 }, { "epoch": 1.798072711344722, "grad_norm": 10.576141357421875, "learning_rate": 7.610359579068966e-06, "loss": 0.0483, "step": 8210 }, { "epoch": 1.8002628120893562, "grad_norm": 0.0030790793243795633, "learning_rate": 7.586924040940533e-06, "loss": 0.0549, "step": 8220 }, { "epoch": 1.8024529128339903, "grad_norm": 8.747251510620117, "learning_rate": 7.563502567032635e-06, "loss": 0.0558, "step": 8230 }, { "epoch": 1.8046430135786247, "grad_norm": 9.28692626953125, "learning_rate": 7.54009529385352e-06, "loss": 0.0253, "step": 8240 }, { "epoch": 1.806833114323259, "grad_norm": 3.1650545597076416, "learning_rate": 7.516702357828672e-06, "loss": 0.0696, "step": 8250 }, { "epoch": 1.809023215067893, "grad_norm": 0.004813139792531729, "learning_rate": 7.493323895300013e-06, "loss": 0.0523, "step": 8260 }, { "epoch": 1.8112133158125274, "grad_norm": 0.01844978705048561, "learning_rate": 7.469960042525105e-06, "loss": 0.0926, "step": 8270 }, { "epoch": 1.8134034165571618, "grad_norm": 8.335030555725098, "learning_rate": 7.446610935676363e-06, "loss": 0.0762, "step": 8280 }, { "epoch": 1.8155935173017959, "grad_norm": 5.293061256408691, "learning_rate": 7.423276710840256e-06, "loss": 0.0324, "step": 8290 }, { "epoch": 1.8177836180464302, "grad_norm": 0.3841320276260376, "learning_rate": 7.399957504016515e-06, "loss": 0.06, "step": 8300 }, { "epoch": 1.8199737187910645, "grad_norm": 9.635478973388672, "learning_rate": 7.376653451117345e-06, "loss": 0.1288, "step": 8310 }, { "epoch": 1.8221638195356986, "grad_norm": 3.6589043140411377, "learning_rate": 7.353364687966624e-06, "loss": 0.052, "step": 8320 }, { "epoch": 1.824353920280333, "grad_norm": 7.381948947906494, "learning_rate": 7.33009135029912e-06, "loss": 0.0285, "step": 8330 }, { "epoch": 1.8265440210249673, "grad_norm": 0.6322674751281738, "learning_rate": 7.306833573759693e-06, "loss": 0.0506, "step": 8340 }, { "epoch": 1.8287341217696014, "grad_norm": 9.502379417419434, "learning_rate": 7.283591493902506e-06, "loss": 0.0975, "step": 8350 }, { "epoch": 1.8309242225142357, "grad_norm": 5.4688639640808105, "learning_rate": 7.260365246190244e-06, "loss": 0.0433, "step": 8360 }, { "epoch": 1.83311432325887, "grad_norm": 9.433446884155273, "learning_rate": 7.2371549659933095e-06, "loss": 0.022, "step": 8370 }, { "epoch": 1.8353044240035041, "grad_norm": 4.5495381355285645, "learning_rate": 7.213960788589043e-06, "loss": 0.0459, "step": 8380 }, { "epoch": 1.8374945247481385, "grad_norm": 0.19871436059474945, "learning_rate": 7.190782849160935e-06, "loss": 0.0529, "step": 8390 }, { "epoch": 1.8396846254927728, "grad_norm": 0.21969793736934662, "learning_rate": 7.167621282797832e-06, "loss": 0.0449, "step": 8400 }, { "epoch": 1.841874726237407, "grad_norm": 1.909063696861267, "learning_rate": 7.144476224493153e-06, "loss": 0.0331, "step": 8410 }, { "epoch": 1.8440648269820412, "grad_norm": 8.0787935256958, "learning_rate": 7.121347809144108e-06, "loss": 0.0404, "step": 8420 }, { "epoch": 1.8462549277266755, "grad_norm": 1.3858003616333008, "learning_rate": 7.098236171550898e-06, "loss": 0.083, "step": 8430 }, { "epoch": 1.8484450284713096, "grad_norm": 0.24444814026355743, "learning_rate": 7.075141446415943e-06, "loss": 0.0294, "step": 8440 }, { "epoch": 1.850635129215944, "grad_norm": 9.009507179260254, "learning_rate": 7.052063768343091e-06, "loss": 0.0718, "step": 8450 }, { "epoch": 1.8528252299605783, "grad_norm": 0.02800663374364376, "learning_rate": 7.029003271836832e-06, "loss": 0.0601, "step": 8460 }, { "epoch": 1.8550153307052124, "grad_norm": 0.8958655595779419, "learning_rate": 7.0059600913015155e-06, "loss": 0.1083, "step": 8470 }, { "epoch": 1.8572054314498467, "grad_norm": 0.048825327306985855, "learning_rate": 6.982934361040574e-06, "loss": 0.0684, "step": 8480 }, { "epoch": 1.859395532194481, "grad_norm": 4.292994976043701, "learning_rate": 6.959926215255726e-06, "loss": 0.093, "step": 8490 }, { "epoch": 1.8615856329391152, "grad_norm": 2.043471574783325, "learning_rate": 6.936935788046206e-06, "loss": 0.0701, "step": 8500 }, { "epoch": 1.8637757336837495, "grad_norm": 2.9659621715545654, "learning_rate": 6.913963213407977e-06, "loss": 0.0265, "step": 8510 }, { "epoch": 1.8659658344283838, "grad_norm": 0.5019598007202148, "learning_rate": 6.8910086252329536e-06, "loss": 0.0572, "step": 8520 }, { "epoch": 1.868155935173018, "grad_norm": 14.020390510559082, "learning_rate": 6.868072157308213e-06, "loss": 0.0683, "step": 8530 }, { "epoch": 1.8703460359176522, "grad_norm": 6.672788143157959, "learning_rate": 6.8451539433152305e-06, "loss": 0.0618, "step": 8540 }, { "epoch": 1.8725361366622866, "grad_norm": 0.06283014267683029, "learning_rate": 6.822254116829086e-06, "loss": 0.0675, "step": 8550 }, { "epoch": 1.8747262374069207, "grad_norm": 14.961589813232422, "learning_rate": 6.799372811317688e-06, "loss": 0.0737, "step": 8560 }, { "epoch": 1.876916338151555, "grad_norm": 0.38605207204818726, "learning_rate": 6.776510160141006e-06, "loss": 0.0098, "step": 8570 }, { "epoch": 1.8791064388961893, "grad_norm": 1.8146640062332153, "learning_rate": 6.753666296550282e-06, "loss": 0.0874, "step": 8580 }, { "epoch": 1.8812965396408234, "grad_norm": 9.246001243591309, "learning_rate": 6.730841353687254e-06, "loss": 0.0247, "step": 8590 }, { "epoch": 1.8834866403854578, "grad_norm": 8.938494682312012, "learning_rate": 6.708035464583391e-06, "loss": 0.0703, "step": 8600 }, { "epoch": 1.885676741130092, "grad_norm": 7.806215763092041, "learning_rate": 6.685248762159102e-06, "loss": 0.1064, "step": 8610 }, { "epoch": 1.8878668418747262, "grad_norm": 0.12758301198482513, "learning_rate": 6.6624813792229785e-06, "loss": 0.0162, "step": 8620 }, { "epoch": 1.8900569426193605, "grad_norm": 13.940812110900879, "learning_rate": 6.6397334484710015e-06, "loss": 0.1047, "step": 8630 }, { "epoch": 1.8922470433639949, "grad_norm": 0.16418476402759552, "learning_rate": 6.61700510248579e-06, "loss": 0.0469, "step": 8640 }, { "epoch": 1.894437144108629, "grad_norm": 0.10566751658916473, "learning_rate": 6.594296473735804e-06, "loss": 0.0412, "step": 8650 }, { "epoch": 1.8966272448532633, "grad_norm": 0.8708646893501282, "learning_rate": 6.571607694574596e-06, "loss": 0.012, "step": 8660 }, { "epoch": 1.8988173455978976, "grad_norm": 4.691579341888428, "learning_rate": 6.548938897240018e-06, "loss": 0.1106, "step": 8670 }, { "epoch": 1.9010074463425317, "grad_norm": 0.6086575388908386, "learning_rate": 6.52629021385347e-06, "loss": 0.076, "step": 8680 }, { "epoch": 1.903197547087166, "grad_norm": 0.05114951729774475, "learning_rate": 6.503661776419116e-06, "loss": 0.0882, "step": 8690 }, { "epoch": 1.9053876478318004, "grad_norm": 0.5560314059257507, "learning_rate": 6.48105371682312e-06, "loss": 0.035, "step": 8700 }, { "epoch": 1.9075777485764345, "grad_norm": 16.29448890686035, "learning_rate": 6.458466166832879e-06, "loss": 0.1012, "step": 8710 }, { "epoch": 1.9097678493210688, "grad_norm": 2.1588475704193115, "learning_rate": 6.435899258096254e-06, "loss": 0.1108, "step": 8720 }, { "epoch": 1.9119579500657031, "grad_norm": 0.5798647999763489, "learning_rate": 6.413353122140798e-06, "loss": 0.0414, "step": 8730 }, { "epoch": 1.9141480508103372, "grad_norm": 19.341800689697266, "learning_rate": 6.390827890372993e-06, "loss": 0.1258, "step": 8740 }, { "epoch": 1.9163381515549716, "grad_norm": 0.31613656878471375, "learning_rate": 6.368323694077494e-06, "loss": 0.0974, "step": 8750 }, { "epoch": 1.9185282522996059, "grad_norm": 0.6673941612243652, "learning_rate": 6.345840664416341e-06, "loss": 0.021, "step": 8760 }, { "epoch": 1.92071835304424, "grad_norm": 4.35990571975708, "learning_rate": 6.323378932428217e-06, "loss": 0.0621, "step": 8770 }, { "epoch": 1.9229084537888743, "grad_norm": 7.6394877433776855, "learning_rate": 6.300938629027669e-06, "loss": 0.0286, "step": 8780 }, { "epoch": 1.9250985545335086, "grad_norm": 7.6478095054626465, "learning_rate": 6.278519885004355e-06, "loss": 0.0311, "step": 8790 }, { "epoch": 1.9272886552781427, "grad_norm": 5.093833923339844, "learning_rate": 6.256122831022276e-06, "loss": 0.1063, "step": 8800 }, { "epoch": 1.929478756022777, "grad_norm": 7.615957260131836, "learning_rate": 6.233747597619015e-06, "loss": 0.0573, "step": 8810 }, { "epoch": 1.9316688567674114, "grad_norm": 2.268455743789673, "learning_rate": 6.2113943152049805e-06, "loss": 0.0962, "step": 8820 }, { "epoch": 1.9338589575120455, "grad_norm": 15.710822105407715, "learning_rate": 6.18906311406264e-06, "loss": 0.1455, "step": 8830 }, { "epoch": 1.9360490582566798, "grad_norm": 4.179837703704834, "learning_rate": 6.1667541243457665e-06, "loss": 0.0706, "step": 8840 }, { "epoch": 1.9382391590013142, "grad_norm": 8.442146301269531, "learning_rate": 6.1444674760786745e-06, "loss": 0.0777, "step": 8850 }, { "epoch": 1.9404292597459483, "grad_norm": 3.3746235370635986, "learning_rate": 6.122203299155469e-06, "loss": 0.0957, "step": 8860 }, { "epoch": 1.9426193604905826, "grad_norm": 1.084843397140503, "learning_rate": 6.099961723339278e-06, "loss": 0.1015, "step": 8870 }, { "epoch": 1.944809461235217, "grad_norm": 19.260208129882812, "learning_rate": 6.0777428782615114e-06, "loss": 0.0911, "step": 8880 }, { "epoch": 1.946999561979851, "grad_norm": 2.777545213699341, "learning_rate": 6.055546893421089e-06, "loss": 0.0498, "step": 8890 }, { "epoch": 1.9491896627244854, "grad_norm": 1.3504369258880615, "learning_rate": 6.033373898183697e-06, "loss": 0.0726, "step": 8900 }, { "epoch": 1.9513797634691197, "grad_norm": 9.752876281738281, "learning_rate": 6.011224021781031e-06, "loss": 0.1386, "step": 8910 }, { "epoch": 1.9535698642137538, "grad_norm": 0.3136870563030243, "learning_rate": 5.9890973933100405e-06, "loss": 0.0387, "step": 8920 }, { "epoch": 1.955759964958388, "grad_norm": 10.092342376708984, "learning_rate": 5.9669941417321774e-06, "loss": 0.0719, "step": 8930 }, { "epoch": 1.9579500657030224, "grad_norm": 0.420943945646286, "learning_rate": 5.9449143958726475e-06, "loss": 0.0538, "step": 8940 }, { "epoch": 1.9601401664476565, "grad_norm": 3.7645761966705322, "learning_rate": 5.922858284419657e-06, "loss": 0.0476, "step": 8950 }, { "epoch": 1.9623302671922909, "grad_norm": 6.8745222091674805, "learning_rate": 5.905230498420529e-06, "loss": 0.0928, "step": 8960 }, { "epoch": 1.9645203679369252, "grad_norm": 9.23366641998291, "learning_rate": 5.883217252753412e-06, "loss": 0.0924, "step": 8970 }, { "epoch": 1.9667104686815593, "grad_norm": 5.903098106384277, "learning_rate": 5.8612280010845925e-06, "loss": 0.0828, "step": 8980 }, { "epoch": 1.9689005694261936, "grad_norm": 6.861005783081055, "learning_rate": 5.839262871574845e-06, "loss": 0.0603, "step": 8990 }, { "epoch": 1.971090670170828, "grad_norm": 10.727412223815918, "learning_rate": 5.8173219922443516e-06, "loss": 0.0743, "step": 9000 }, { "epoch": 1.973280770915462, "grad_norm": 12.319658279418945, "learning_rate": 5.795405490971956e-06, "loss": 0.0593, "step": 9010 }, { "epoch": 1.9754708716600964, "grad_norm": 0.45042884349823, "learning_rate": 5.773513495494425e-06, "loss": 0.0521, "step": 9020 }, { "epoch": 1.9776609724047307, "grad_norm": 1.839205265045166, "learning_rate": 5.7516461334056835e-06, "loss": 0.0167, "step": 9030 }, { "epoch": 1.9798510731493648, "grad_norm": 1.0183196067810059, "learning_rate": 5.729803532156102e-06, "loss": 0.0365, "step": 9040 }, { "epoch": 1.9820411738939991, "grad_norm": 8.869464874267578, "learning_rate": 5.707985819051718e-06, "loss": 0.0733, "step": 9050 }, { "epoch": 1.9842312746386335, "grad_norm": 0.8049930334091187, "learning_rate": 5.6861931212535295e-06, "loss": 0.055, "step": 9060 }, { "epoch": 1.9864213753832676, "grad_norm": 6.1276535987854, "learning_rate": 5.664425565776723e-06, "loss": 0.0595, "step": 9070 }, { "epoch": 1.988611476127902, "grad_norm": 0.8358739018440247, "learning_rate": 5.642683279489959e-06, "loss": 0.0944, "step": 9080 }, { "epoch": 1.9908015768725362, "grad_norm": 16.989784240722656, "learning_rate": 5.620966389114609e-06, "loss": 0.0807, "step": 9090 }, { "epoch": 1.9929916776171703, "grad_norm": 11.551353454589844, "learning_rate": 5.599275021224037e-06, "loss": 0.0804, "step": 9100 }, { "epoch": 1.9951817783618047, "grad_norm": 10.96682071685791, "learning_rate": 5.577609302242854e-06, "loss": 0.0887, "step": 9110 }, { "epoch": 1.997371879106439, "grad_norm": 7.266369819641113, "learning_rate": 5.555969358446172e-06, "loss": 0.1059, "step": 9120 }, { "epoch": 1.999561979851073, "grad_norm": 9.569356918334961, "learning_rate": 5.5343553159588884e-06, "loss": 0.0849, "step": 9130 }, { "epoch": 2.001752080595707, "grad_norm": 0.840337872505188, "learning_rate": 5.512767300754924e-06, "loss": 0.0328, "step": 9140 }, { "epoch": 2.0039421813403417, "grad_norm": 6.5176777839660645, "learning_rate": 5.49120543865652e-06, "loss": 0.0194, "step": 9150 }, { "epoch": 2.006132282084976, "grad_norm": 11.299787521362305, "learning_rate": 5.469669855333475e-06, "loss": 0.0579, "step": 9160 }, { "epoch": 2.00832238282961, "grad_norm": 0.03983398526906967, "learning_rate": 5.448160676302436e-06, "loss": 0.0639, "step": 9170 }, { "epoch": 2.0105124835742445, "grad_norm": 0.3649865686893463, "learning_rate": 5.42667802692615e-06, "loss": 0.0639, "step": 9180 }, { "epoch": 2.0127025843188786, "grad_norm": 0.1840098351240158, "learning_rate": 5.405222032412747e-06, "loss": 0.0148, "step": 9190 }, { "epoch": 2.0148926850635127, "grad_norm": 0.38783952593803406, "learning_rate": 5.383792817814991e-06, "loss": 0.0238, "step": 9200 }, { "epoch": 2.0170827858081473, "grad_norm": 0.7510598301887512, "learning_rate": 5.362390508029582e-06, "loss": 0.0335, "step": 9210 }, { "epoch": 2.0192728865527814, "grad_norm": 7.381070137023926, "learning_rate": 5.341015227796391e-06, "loss": 0.0262, "step": 9220 }, { "epoch": 2.0214629872974155, "grad_norm": 2.9963531494140625, "learning_rate": 5.319667101697769e-06, "loss": 0.0166, "step": 9230 }, { "epoch": 2.02365308804205, "grad_norm": 0.01576431281864643, "learning_rate": 5.2983462541577845e-06, "loss": 0.0306, "step": 9240 }, { "epoch": 2.025843188786684, "grad_norm": 1.2588666677474976, "learning_rate": 5.277052809441539e-06, "loss": 0.0199, "step": 9250 }, { "epoch": 2.0280332895313182, "grad_norm": 0.0006827327306382358, "learning_rate": 5.2557868916543996e-06, "loss": 0.0042, "step": 9260 }, { "epoch": 2.0302233902759528, "grad_norm": 15.037066459655762, "learning_rate": 5.234548624741311e-06, "loss": 0.084, "step": 9270 }, { "epoch": 2.032413491020587, "grad_norm": 2.749844551086426, "learning_rate": 5.2133381324860635e-06, "loss": 0.0342, "step": 9280 }, { "epoch": 2.034603591765221, "grad_norm": 0.002695726230740547, "learning_rate": 5.192155538510546e-06, "loss": 0.0181, "step": 9290 }, { "epoch": 2.0367936925098555, "grad_norm": 2.543112277984619, "learning_rate": 5.171000966274069e-06, "loss": 0.0044, "step": 9300 }, { "epoch": 2.0389837932544896, "grad_norm": 0.06324001401662827, "learning_rate": 5.149874539072615e-06, "loss": 0.0107, "step": 9310 }, { "epoch": 2.0411738939991237, "grad_norm": 2.3665401935577393, "learning_rate": 5.128776380038121e-06, "loss": 0.0845, "step": 9320 }, { "epoch": 2.0433639947437583, "grad_norm": 2.152229070663452, "learning_rate": 5.107706612137776e-06, "loss": 0.0108, "step": 9330 }, { "epoch": 2.0455540954883924, "grad_norm": 0.7792295813560486, "learning_rate": 5.0866653581732985e-06, "loss": 0.0834, "step": 9340 }, { "epoch": 2.0477441962330265, "grad_norm": 1.357008457183838, "learning_rate": 5.065652740780205e-06, "loss": 0.0318, "step": 9350 }, { "epoch": 2.049934296977661, "grad_norm": 0.16741587221622467, "learning_rate": 5.044668882427124e-06, "loss": 0.0047, "step": 9360 }, { "epoch": 2.052124397722295, "grad_norm": 8.084095001220703, "learning_rate": 5.023713905415051e-06, "loss": 0.0718, "step": 9370 }, { "epoch": 2.0543144984669293, "grad_norm": 7.928176403045654, "learning_rate": 5.002787931876667e-06, "loss": 0.0653, "step": 9380 }, { "epoch": 2.056504599211564, "grad_norm": 0.05364307388663292, "learning_rate": 4.981891083775597e-06, "loss": 0.0217, "step": 9390 }, { "epoch": 2.058694699956198, "grad_norm": 0.10703098773956299, "learning_rate": 4.961023482905728e-06, "loss": 0.0148, "step": 9400 }, { "epoch": 2.060884800700832, "grad_norm": 9.668065071105957, "learning_rate": 4.940185250890465e-06, "loss": 0.0551, "step": 9410 }, { "epoch": 2.0630749014454666, "grad_norm": 0.1026332676410675, "learning_rate": 4.919376509182061e-06, "loss": 0.0794, "step": 9420 }, { "epoch": 2.0652650021901007, "grad_norm": 2.7725613117218018, "learning_rate": 4.8985973790608745e-06, "loss": 0.1044, "step": 9430 }, { "epoch": 2.0674551029347352, "grad_norm": 0.1350996047258377, "learning_rate": 4.87784798163469e-06, "loss": 0.0024, "step": 9440 }, { "epoch": 2.0696452036793693, "grad_norm": 0.2855980396270752, "learning_rate": 4.857128437837987e-06, "loss": 0.0292, "step": 9450 }, { "epoch": 2.0718353044240034, "grad_norm": 1.4524846076965332, "learning_rate": 4.836438868431262e-06, "loss": 0.0305, "step": 9460 }, { "epoch": 2.074025405168638, "grad_norm": 0.011096697300672531, "learning_rate": 4.815779394000294e-06, "loss": 0.0141, "step": 9470 }, { "epoch": 2.076215505913272, "grad_norm": 0.009723620489239693, "learning_rate": 4.795150134955479e-06, "loss": 0.0483, "step": 9480 }, { "epoch": 2.078405606657906, "grad_norm": 0.1697608232498169, "learning_rate": 4.774551211531086e-06, "loss": 0.0251, "step": 9490 }, { "epoch": 2.0805957074025407, "grad_norm": 11.922332763671875, "learning_rate": 4.753982743784593e-06, "loss": 0.0884, "step": 9500 }, { "epoch": 2.082785808147175, "grad_norm": 0.017650220543146133, "learning_rate": 4.7334448515959695e-06, "loss": 0.0106, "step": 9510 }, { "epoch": 2.084975908891809, "grad_norm": 0.2010495513677597, "learning_rate": 4.712937654666971e-06, "loss": 0.0826, "step": 9520 }, { "epoch": 2.0871660096364435, "grad_norm": 2.174421787261963, "learning_rate": 4.692461272520467e-06, "loss": 0.0446, "step": 9530 }, { "epoch": 2.0893561103810776, "grad_norm": 0.0064881290309131145, "learning_rate": 4.672015824499708e-06, "loss": 0.061, "step": 9540 }, { "epoch": 2.0915462111257117, "grad_norm": 0.40953329205513, "learning_rate": 4.651601429767669e-06, "loss": 0.0771, "step": 9550 }, { "epoch": 2.0937363118703463, "grad_norm": 3.0396525859832764, "learning_rate": 4.631218207306321e-06, "loss": 0.0624, "step": 9560 }, { "epoch": 2.0959264126149804, "grad_norm": 1.7303316593170166, "learning_rate": 4.610866275915967e-06, "loss": 0.0593, "step": 9570 }, { "epoch": 2.0981165133596145, "grad_norm": 0.06534786522388458, "learning_rate": 4.590545754214518e-06, "loss": 0.0023, "step": 9580 }, { "epoch": 2.100306614104249, "grad_norm": 5.701974391937256, "learning_rate": 4.570256760636836e-06, "loss": 0.0261, "step": 9590 }, { "epoch": 2.102496714848883, "grad_norm": 0.06714854389429092, "learning_rate": 4.549999413434011e-06, "loss": 0.0097, "step": 9600 }, { "epoch": 2.1046868155935172, "grad_norm": 0.6208577752113342, "learning_rate": 4.529773830672701e-06, "loss": 0.0928, "step": 9610 }, { "epoch": 2.1068769163381518, "grad_norm": 11.083459854125977, "learning_rate": 4.509580130234419e-06, "loss": 0.0433, "step": 9620 }, { "epoch": 2.109067017082786, "grad_norm": 0.1622035950422287, "learning_rate": 4.489418429814866e-06, "loss": 0.0672, "step": 9630 }, { "epoch": 2.11125711782742, "grad_norm": 10.26950740814209, "learning_rate": 4.469288846923226e-06, "loss": 0.1401, "step": 9640 }, { "epoch": 2.1134472185720545, "grad_norm": 1.411583423614502, "learning_rate": 4.4491914988815055e-06, "loss": 0.0277, "step": 9650 }, { "epoch": 2.1156373193166886, "grad_norm": 0.0005271229310892522, "learning_rate": 4.429126502823817e-06, "loss": 0.006, "step": 9660 }, { "epoch": 2.1178274200613227, "grad_norm": 0.05095307156443596, "learning_rate": 4.409093975695732e-06, "loss": 0.0516, "step": 9670 }, { "epoch": 2.1200175208059573, "grad_norm": 0.41955050826072693, "learning_rate": 4.389094034253575e-06, "loss": 0.0521, "step": 9680 }, { "epoch": 2.1222076215505914, "grad_norm": 0.061787817627191544, "learning_rate": 4.369126795063744e-06, "loss": 0.0591, "step": 9690 }, { "epoch": 2.1243977222952255, "grad_norm": 0.22877174615859985, "learning_rate": 4.349192374502051e-06, "loss": 0.0042, "step": 9700 }, { "epoch": 2.12658782303986, "grad_norm": 6.8114776611328125, "learning_rate": 4.329290888753015e-06, "loss": 0.0391, "step": 9710 }, { "epoch": 2.128777923784494, "grad_norm": 1.54594886302948, "learning_rate": 4.309422453809214e-06, "loss": 0.011, "step": 9720 }, { "epoch": 2.1309680245291283, "grad_norm": 0.29878583550453186, "learning_rate": 4.289587185470581e-06, "loss": 0.0139, "step": 9730 }, { "epoch": 2.133158125273763, "grad_norm": 7.086662769317627, "learning_rate": 4.269785199343754e-06, "loss": 0.1086, "step": 9740 }, { "epoch": 2.135348226018397, "grad_norm": 0.03798053041100502, "learning_rate": 4.250016610841384e-06, "loss": 0.0299, "step": 9750 }, { "epoch": 2.137538326763031, "grad_norm": 0.010158129036426544, "learning_rate": 4.230281535181476e-06, "loss": 0.0336, "step": 9760 }, { "epoch": 2.1397284275076656, "grad_norm": 2.089038372039795, "learning_rate": 4.210580087386702e-06, "loss": 0.0322, "step": 9770 }, { "epoch": 2.1419185282522997, "grad_norm": 0.2857571542263031, "learning_rate": 4.190912382283749e-06, "loss": 0.0362, "step": 9780 }, { "epoch": 2.1441086289969338, "grad_norm": 0.12719859182834625, "learning_rate": 4.17127853450263e-06, "loss": 0.1107, "step": 9790 }, { "epoch": 2.1462987297415683, "grad_norm": 6.822968134656549e-05, "learning_rate": 4.151678658476038e-06, "loss": 0.0452, "step": 9800 }, { "epoch": 2.1484888304862024, "grad_norm": 2.7943925857543945, "learning_rate": 4.132112868438655e-06, "loss": 0.068, "step": 9810 }, { "epoch": 2.1506789312308365, "grad_norm": 6.338434219360352, "learning_rate": 4.112581278426509e-06, "loss": 0.0523, "step": 9820 }, { "epoch": 2.152869031975471, "grad_norm": 1.6564533710479736, "learning_rate": 4.0930840022762895e-06, "loss": 0.0042, "step": 9830 }, { "epoch": 2.155059132720105, "grad_norm": 0.0676155835390091, "learning_rate": 4.073621153624701e-06, "loss": 0.0684, "step": 9840 }, { "epoch": 2.1572492334647393, "grad_norm": 0.06697005778551102, "learning_rate": 4.054192845907791e-06, "loss": 0.0083, "step": 9850 }, { "epoch": 2.159439334209374, "grad_norm": 0.011783111840486526, "learning_rate": 4.0347991923602824e-06, "loss": 0.0313, "step": 9860 }, { "epoch": 2.161629434954008, "grad_norm": 12.520441055297852, "learning_rate": 4.015440306014938e-06, "loss": 0.0452, "step": 9870 }, { "epoch": 2.163819535698642, "grad_norm": 0.017418911680579185, "learning_rate": 3.996116299701867e-06, "loss": 0.0274, "step": 9880 }, { "epoch": 2.1660096364432766, "grad_norm": 0.03571068495512009, "learning_rate": 3.976827286047906e-06, "loss": 0.01, "step": 9890 }, { "epoch": 2.1681997371879107, "grad_norm": 8.288607597351074, "learning_rate": 3.957573377475922e-06, "loss": 0.1248, "step": 9900 }, { "epoch": 2.170389837932545, "grad_norm": 5.886268138885498, "learning_rate": 3.9383546862041955e-06, "loss": 0.0162, "step": 9910 }, { "epoch": 2.1725799386771794, "grad_norm": 3.7146174907684326, "learning_rate": 3.9191713242457355e-06, "loss": 0.056, "step": 9920 }, { "epoch": 2.1747700394218135, "grad_norm": 0.31446176767349243, "learning_rate": 3.90002340340765e-06, "loss": 0.0493, "step": 9930 }, { "epoch": 2.1769601401664476, "grad_norm": 0.10302296280860901, "learning_rate": 3.880911035290473e-06, "loss": 0.0128, "step": 9940 }, { "epoch": 2.179150240911082, "grad_norm": 0.0007031428976915777, "learning_rate": 3.861834331287542e-06, "loss": 0.0619, "step": 9950 }, { "epoch": 2.181340341655716, "grad_norm": 2.1073789596557617, "learning_rate": 3.8446958824016655e-06, "loss": 0.0535, "step": 9960 }, { "epoch": 2.1835304424003503, "grad_norm": 0.1305907815694809, "learning_rate": 3.82758649242105e-06, "loss": 0.041, "step": 9970 }, { "epoch": 2.185720543144985, "grad_norm": 0.018617836758494377, "learning_rate": 3.8086102387786815e-06, "loss": 0.0777, "step": 9980 }, { "epoch": 2.187910643889619, "grad_norm": 0.016018524765968323, "learning_rate": 3.7896700706439826e-06, "loss": 0.0492, "step": 9990 }, { "epoch": 2.190100744634253, "grad_norm": 0.1632128804922104, "learning_rate": 3.7707660984066343e-06, "loss": 0.0795, "step": 10000 }, { "epoch": 2.1922908453788876, "grad_norm": 7.29586935043335, "learning_rate": 3.75189843224537e-06, "loss": 0.0418, "step": 10010 }, { "epoch": 2.1944809461235217, "grad_norm": 0.1849966049194336, "learning_rate": 3.7330671821273036e-06, "loss": 0.063, "step": 10020 }, { "epoch": 2.196671046868156, "grad_norm": 4.528728485107422, "learning_rate": 3.71427245780732e-06, "loss": 0.0318, "step": 10030 }, { "epoch": 2.1988611476127904, "grad_norm": 0.08763193339109421, "learning_rate": 3.695514368827403e-06, "loss": 0.0017, "step": 10040 }, { "epoch": 2.2010512483574245, "grad_norm": 6.2515411376953125, "learning_rate": 3.6767930245160275e-06, "loss": 0.0194, "step": 10050 }, { "epoch": 2.2032413491020586, "grad_norm": 0.01605885475873947, "learning_rate": 3.6581085339875033e-06, "loss": 0.0377, "step": 10060 }, { "epoch": 2.205431449846693, "grad_norm": 8.09802532196045, "learning_rate": 3.639461006141337e-06, "loss": 0.0318, "step": 10070 }, { "epoch": 2.2076215505913273, "grad_norm": 0.0010597211075946689, "learning_rate": 3.6208505496616166e-06, "loss": 0.0375, "step": 10080 }, { "epoch": 2.2098116513359614, "grad_norm": 0.05859110876917839, "learning_rate": 3.6022772730163522e-06, "loss": 0.0227, "step": 10090 }, { "epoch": 2.212001752080596, "grad_norm": 0.01456915121525526, "learning_rate": 3.583741284456871e-06, "loss": 0.0451, "step": 10100 }, { "epoch": 2.21419185282523, "grad_norm": 0.09151438623666763, "learning_rate": 3.56524269201716e-06, "loss": 0.0311, "step": 10110 }, { "epoch": 2.216381953569864, "grad_norm": 9.699755668640137, "learning_rate": 3.5467816035132596e-06, "loss": 0.0401, "step": 10120 }, { "epoch": 2.2185720543144987, "grad_norm": 1.6159425973892212, "learning_rate": 3.528358126542616e-06, "loss": 0.05, "step": 10130 }, { "epoch": 2.2207621550591328, "grad_norm": 0.05873635411262512, "learning_rate": 3.5099723684834707e-06, "loss": 0.0796, "step": 10140 }, { "epoch": 2.222952255803767, "grad_norm": 9.678986549377441, "learning_rate": 3.4916244364942178e-06, "loss": 0.0364, "step": 10150 }, { "epoch": 2.2251423565484014, "grad_norm": 0.0009396448149345815, "learning_rate": 3.473314437512797e-06, "loss": 0.0379, "step": 10160 }, { "epoch": 2.2273324572930355, "grad_norm": 10.297322273254395, "learning_rate": 3.4550424782560543e-06, "loss": 0.0786, "step": 10170 }, { "epoch": 2.2295225580376696, "grad_norm": 0.5186501145362854, "learning_rate": 3.4368086652191357e-06, "loss": 0.0926, "step": 10180 }, { "epoch": 2.231712658782304, "grad_norm": 5.774144649505615, "learning_rate": 3.4186131046748463e-06, "loss": 0.0434, "step": 10190 }, { "epoch": 2.2339027595269383, "grad_norm": 0.15737713873386383, "learning_rate": 3.4004559026730564e-06, "loss": 0.028, "step": 10200 }, { "epoch": 2.2360928602715724, "grad_norm": 0.33650192618370056, "learning_rate": 3.382337165040058e-06, "loss": 0.0741, "step": 10210 }, { "epoch": 2.238282961016207, "grad_norm": 0.03550396487116814, "learning_rate": 3.364256997377966e-06, "loss": 0.0595, "step": 10220 }, { "epoch": 2.240473061760841, "grad_norm": 9.937628746032715, "learning_rate": 3.3462155050640997e-06, "loss": 0.0304, "step": 10230 }, { "epoch": 2.242663162505475, "grad_norm": 0.07369370013475418, "learning_rate": 3.328212793250355e-06, "loss": 0.0281, "step": 10240 }, { "epoch": 2.2448532632501097, "grad_norm": 2.7196874618530273, "learning_rate": 3.3102489668626125e-06, "loss": 0.0252, "step": 10250 }, { "epoch": 2.247043363994744, "grad_norm": 8.375349044799805, "learning_rate": 3.2923241306001075e-06, "loss": 0.027, "step": 10260 }, { "epoch": 2.249233464739378, "grad_norm": 0.42318907380104065, "learning_rate": 3.2744383889348354e-06, "loss": 0.016, "step": 10270 }, { "epoch": 2.2514235654840125, "grad_norm": 5.941165924072266, "learning_rate": 3.2565918461109246e-06, "loss": 0.0163, "step": 10280 }, { "epoch": 2.2536136662286466, "grad_norm": 4.5944695472717285, "learning_rate": 3.2387846061440533e-06, "loss": 0.0284, "step": 10290 }, { "epoch": 2.2558037669732807, "grad_norm": 10.353431701660156, "learning_rate": 3.2210167728208143e-06, "loss": 0.0493, "step": 10300 }, { "epoch": 2.257993867717915, "grad_norm": 5.879848480224609, "learning_rate": 3.2032884496981388e-06, "loss": 0.0611, "step": 10310 }, { "epoch": 2.2601839684625493, "grad_norm": 0.06764402240514755, "learning_rate": 3.185599740102667e-06, "loss": 0.077, "step": 10320 }, { "epoch": 2.2623740692071834, "grad_norm": 0.20550240576267242, "learning_rate": 3.1679507471301718e-06, "loss": 0.0431, "step": 10330 }, { "epoch": 2.264564169951818, "grad_norm": 10.046934127807617, "learning_rate": 3.1503415736449293e-06, "loss": 0.0799, "step": 10340 }, { "epoch": 2.266754270696452, "grad_norm": 0.20419423282146454, "learning_rate": 3.1327723222791483e-06, "loss": 0.03, "step": 10350 }, { "epoch": 2.268944371441086, "grad_norm": 7.060661315917969, "learning_rate": 3.1152430954323442e-06, "loss": 0.0102, "step": 10360 }, { "epoch": 2.2711344721857207, "grad_norm": 0.013744733296334743, "learning_rate": 3.097753995270766e-06, "loss": 0.0634, "step": 10370 }, { "epoch": 2.273324572930355, "grad_norm": 0.018780864775180817, "learning_rate": 3.0803051237267924e-06, "loss": 0.0856, "step": 10380 }, { "epoch": 2.275514673674989, "grad_norm": 0.1843547523021698, "learning_rate": 3.0628965824983235e-06, "loss": 0.019, "step": 10390 }, { "epoch": 2.2777047744196235, "grad_norm": 0.10724405944347382, "learning_rate": 3.045528473048217e-06, "loss": 0.0311, "step": 10400 }, { "epoch": 2.2798948751642576, "grad_norm": 0.9219202995300293, "learning_rate": 3.0282008966036647e-06, "loss": 0.0689, "step": 10410 }, { "epoch": 2.2820849759088917, "grad_norm": 4.23945951461792, "learning_rate": 3.010913954155634e-06, "loss": 0.0133, "step": 10420 }, { "epoch": 2.2842750766535262, "grad_norm": 0.09342587739229202, "learning_rate": 2.9936677464582477e-06, "loss": 0.0144, "step": 10430 }, { "epoch": 2.2864651773981604, "grad_norm": 13.743094444274902, "learning_rate": 2.9764623740282285e-06, "loss": 0.0628, "step": 10440 }, { "epoch": 2.2886552781427945, "grad_norm": 2.5184903144836426, "learning_rate": 2.959297937144283e-06, "loss": 0.0422, "step": 10450 }, { "epoch": 2.290845378887429, "grad_norm": 10.027556419372559, "learning_rate": 2.942174535846544e-06, "loss": 0.0562, "step": 10460 }, { "epoch": 2.293035479632063, "grad_norm": 0.08877971768379211, "learning_rate": 2.925092269935963e-06, "loss": 0.1066, "step": 10470 }, { "epoch": 2.295225580376697, "grad_norm": 0.11047007888555527, "learning_rate": 2.9080512389737547e-06, "loss": 0.0753, "step": 10480 }, { "epoch": 2.2974156811213318, "grad_norm": 0.9941737055778503, "learning_rate": 2.891051542280787e-06, "loss": 0.0133, "step": 10490 }, { "epoch": 2.299605781865966, "grad_norm": 0.529569685459137, "learning_rate": 2.8740932789370324e-06, "loss": 0.0069, "step": 10500 }, { "epoch": 2.3017958826106, "grad_norm": 0.020843710750341415, "learning_rate": 2.8571765477809645e-06, "loss": 0.0508, "step": 10510 }, { "epoch": 2.3039859833552345, "grad_norm": 0.0033396142534911633, "learning_rate": 2.840301447409001e-06, "loss": 0.0092, "step": 10520 }, { "epoch": 2.3061760840998686, "grad_norm": 9.580755233764648, "learning_rate": 2.8234680761749144e-06, "loss": 0.0468, "step": 10530 }, { "epoch": 2.3083661848445027, "grad_norm": 0.0421302430331707, "learning_rate": 2.806676532189272e-06, "loss": 0.0839, "step": 10540 }, { "epoch": 2.3105562855891373, "grad_norm": 0.7296952605247498, "learning_rate": 2.789926913318858e-06, "loss": 0.0093, "step": 10550 }, { "epoch": 2.3127463863337714, "grad_norm": 1.1732386350631714, "learning_rate": 2.7732193171860945e-06, "loss": 0.0886, "step": 10560 }, { "epoch": 2.3149364870784055, "grad_norm": 4.182519912719727, "learning_rate": 2.7565538411684932e-06, "loss": 0.0206, "step": 10570 }, { "epoch": 2.31712658782304, "grad_norm": 11.064467430114746, "learning_rate": 2.7399305823980627e-06, "loss": 0.0582, "step": 10580 }, { "epoch": 2.319316688567674, "grad_norm": 0.01977277360856533, "learning_rate": 2.723349637760766e-06, "loss": 0.0254, "step": 10590 }, { "epoch": 2.3215067893123082, "grad_norm": 0.4228614866733551, "learning_rate": 2.706811103895938e-06, "loss": 0.0168, "step": 10600 }, { "epoch": 2.323696890056943, "grad_norm": 0.001154767582193017, "learning_rate": 2.6903150771957343e-06, "loss": 0.0106, "step": 10610 }, { "epoch": 2.325886990801577, "grad_norm": 8.669196128845215, "learning_rate": 2.673861653804558e-06, "loss": 0.0375, "step": 10620 }, { "epoch": 2.328077091546211, "grad_norm": 0.023739797994494438, "learning_rate": 2.6574509296185146e-06, "loss": 0.0093, "step": 10630 }, { "epoch": 2.3302671922908456, "grad_norm": 0.6126247048377991, "learning_rate": 2.6410830002848353e-06, "loss": 0.0489, "step": 10640 }, { "epoch": 2.3324572930354797, "grad_norm": 0.04919245466589928, "learning_rate": 2.6247579612013373e-06, "loss": 0.0087, "step": 10650 }, { "epoch": 2.3346473937801138, "grad_norm": 11.036314964294434, "learning_rate": 2.608475907515848e-06, "loss": 0.0878, "step": 10660 }, { "epoch": 2.3368374945247483, "grad_norm": 7.343563556671143, "learning_rate": 2.592236934125675e-06, "loss": 0.0248, "step": 10670 }, { "epoch": 2.3390275952693824, "grad_norm": 0.03621721267700195, "learning_rate": 2.5760411356770256e-06, "loss": 0.0333, "step": 10680 }, { "epoch": 2.3412176960140165, "grad_norm": 9.314860343933105, "learning_rate": 2.559888606564479e-06, "loss": 0.0372, "step": 10690 }, { "epoch": 2.343407796758651, "grad_norm": 0.2493022084236145, "learning_rate": 2.5437794409304196e-06, "loss": 0.0272, "step": 10700 }, { "epoch": 2.345597897503285, "grad_norm": 0.03339756280183792, "learning_rate": 2.5277137326644997e-06, "loss": 0.075, "step": 10710 }, { "epoch": 2.3477879982479193, "grad_norm": 1.6665163040161133, "learning_rate": 2.511691575403089e-06, "loss": 0.0468, "step": 10720 }, { "epoch": 2.349978098992554, "grad_norm": 1.8215875625610352, "learning_rate": 2.495713062528716e-06, "loss": 0.0521, "step": 10730 }, { "epoch": 2.352168199737188, "grad_norm": 0.10033299773931503, "learning_rate": 2.4797782871695507e-06, "loss": 0.0053, "step": 10740 }, { "epoch": 2.354358300481822, "grad_norm": 6.526181221008301, "learning_rate": 2.463887342198832e-06, "loss": 0.0269, "step": 10750 }, { "epoch": 2.3565484012264566, "grad_norm": 1.4751542806625366, "learning_rate": 2.4480403202343506e-06, "loss": 0.031, "step": 10760 }, { "epoch": 2.3587385019710907, "grad_norm": 0.38678300380706787, "learning_rate": 2.432237313637892e-06, "loss": 0.0526, "step": 10770 }, { "epoch": 2.360928602715725, "grad_norm": 0.27941739559173584, "learning_rate": 2.4164784145147113e-06, "loss": 0.023, "step": 10780 }, { "epoch": 2.3631187034603593, "grad_norm": 7.6076130867004395, "learning_rate": 2.4007637147129847e-06, "loss": 0.0106, "step": 10790 }, { "epoch": 2.3653088042049935, "grad_norm": 0.26686909794807434, "learning_rate": 2.385093305823286e-06, "loss": 0.0325, "step": 10800 }, { "epoch": 2.3674989049496276, "grad_norm": 0.012969826348125935, "learning_rate": 2.3694672791780395e-06, "loss": 0.0063, "step": 10810 }, { "epoch": 2.369689005694262, "grad_norm": 0.07745005190372467, "learning_rate": 2.353885725851003e-06, "loss": 0.0292, "step": 10820 }, { "epoch": 2.371879106438896, "grad_norm": 0.15122023224830627, "learning_rate": 2.3383487366567194e-06, "loss": 0.0432, "step": 10830 }, { "epoch": 2.3740692071835303, "grad_norm": 0.0649198442697525, "learning_rate": 2.3228564021500064e-06, "loss": 0.0608, "step": 10840 }, { "epoch": 2.376259307928165, "grad_norm": 0.5270485281944275, "learning_rate": 2.3074088126254113e-06, "loss": 0.021, "step": 10850 }, { "epoch": 2.378449408672799, "grad_norm": 8.585930824279785, "learning_rate": 2.2920060581166983e-06, "loss": 0.0866, "step": 10860 }, { "epoch": 2.380639509417433, "grad_norm": 0.02257196232676506, "learning_rate": 2.276648228396312e-06, "loss": 0.0428, "step": 10870 }, { "epoch": 2.3828296101620676, "grad_norm": 0.5084486603736877, "learning_rate": 2.2613354129748654e-06, "loss": 0.0247, "step": 10880 }, { "epoch": 2.3850197109067017, "grad_norm": 0.0628821849822998, "learning_rate": 2.246067701100615e-06, "loss": 0.011, "step": 10890 }, { "epoch": 2.387209811651336, "grad_norm": 0.09191448241472244, "learning_rate": 2.230845181758928e-06, "loss": 0.0449, "step": 10900 }, { "epoch": 2.3893999123959704, "grad_norm": 0.1834626942873001, "learning_rate": 2.2156679436717874e-06, "loss": 0.0497, "step": 10910 }, { "epoch": 2.3915900131406045, "grad_norm": 0.0005779406055808067, "learning_rate": 2.2005360752972514e-06, "loss": 0.0191, "step": 10920 }, { "epoch": 2.3937801138852386, "grad_norm": 0.03631008788943291, "learning_rate": 2.185449664828956e-06, "loss": 0.0377, "step": 10930 }, { "epoch": 2.395970214629873, "grad_norm": 1.633986234664917, "learning_rate": 2.1704088001955835e-06, "loss": 0.0179, "step": 10940 }, { "epoch": 2.3981603153745072, "grad_norm": 7.511910915374756, "learning_rate": 2.155413569060373e-06, "loss": 0.0895, "step": 10950 }, { "epoch": 2.4003504161191414, "grad_norm": 5.717944145202637, "learning_rate": 2.1404640588205826e-06, "loss": 0.0643, "step": 10960 }, { "epoch": 2.402540516863776, "grad_norm": 0.06082858890295029, "learning_rate": 2.1270486629894703e-06, "loss": 0.0371, "step": 10970 }, { "epoch": 2.40473061760841, "grad_norm": 0.016977889463305473, "learning_rate": 2.113670434996472e-06, "loss": 0.024, "step": 10980 }, { "epoch": 2.406920718353044, "grad_norm": 0.022806450724601746, "learning_rate": 2.0988494059471197e-06, "loss": 0.0425, "step": 10990 }, { "epoch": 2.4091108190976787, "grad_norm": 2.847071886062622, "learning_rate": 2.084074427468209e-06, "loss": 0.0192, "step": 11000 }, { "epoch": 2.4113009198423128, "grad_norm": 7.661363124847412, "learning_rate": 2.0693455856733e-06, "loss": 0.072, "step": 11010 }, { "epoch": 2.413491020586947, "grad_norm": 4.15516996383667, "learning_rate": 2.054662966407046e-06, "loss": 0.064, "step": 11020 }, { "epoch": 2.4156811213315814, "grad_norm": 0.03475746139883995, "learning_rate": 2.04002665524471e-06, "loss": 0.0115, "step": 11030 }, { "epoch": 2.4178712220762155, "grad_norm": 0.009050685912370682, "learning_rate": 2.025436737491645e-06, "loss": 0.0263, "step": 11040 }, { "epoch": 2.4200613228208496, "grad_norm": 3.4575037956237793, "learning_rate": 2.0108932981828157e-06, "loss": 0.0563, "step": 11050 }, { "epoch": 2.422251423565484, "grad_norm": 9.348785400390625, "learning_rate": 1.9963964220822863e-06, "loss": 0.0271, "step": 11060 }, { "epoch": 2.4244415243101183, "grad_norm": 17.701356887817383, "learning_rate": 1.9819461936827465e-06, "loss": 0.0811, "step": 11070 }, { "epoch": 2.4266316250547524, "grad_norm": 0.0485520176589489, "learning_rate": 1.967542697204995e-06, "loss": 0.056, "step": 11080 }, { "epoch": 2.428821725799387, "grad_norm": 1.1157996654510498, "learning_rate": 1.9531860165974702e-06, "loss": 0.0078, "step": 11090 }, { "epoch": 2.431011826544021, "grad_norm": 0.02702515199780464, "learning_rate": 1.9388762355357505e-06, "loss": 0.0589, "step": 11100 }, { "epoch": 2.433201927288655, "grad_norm": 18.266599655151367, "learning_rate": 1.9246134374220614e-06, "loss": 0.061, "step": 11110 }, { "epoch": 2.4353920280332897, "grad_norm": 6.464306354522705, "learning_rate": 1.9103977053848066e-06, "loss": 0.0629, "step": 11120 }, { "epoch": 2.437582128777924, "grad_norm": 0.19361186027526855, "learning_rate": 1.8962291222780605e-06, "loss": 0.0178, "step": 11130 }, { "epoch": 2.439772229522558, "grad_norm": 7.641140937805176, "learning_rate": 1.8821077706811087e-06, "loss": 0.0665, "step": 11140 }, { "epoch": 2.4419623302671924, "grad_norm": 0.0493302084505558, "learning_rate": 1.868033732897948e-06, "loss": 0.0445, "step": 11150 }, { "epoch": 2.4441524310118266, "grad_norm": 1.850546956062317, "learning_rate": 1.8540070909568197e-06, "loss": 0.0286, "step": 11160 }, { "epoch": 2.4463425317564607, "grad_norm": 9.651247024536133, "learning_rate": 1.840027926609721e-06, "loss": 0.0168, "step": 11170 }, { "epoch": 2.448532632501095, "grad_norm": 0.29811570048332214, "learning_rate": 1.8260963213319404e-06, "loss": 0.0143, "step": 11180 }, { "epoch": 2.4507227332457293, "grad_norm": 0.1148192286491394, "learning_rate": 1.8122123563215667e-06, "loss": 0.0079, "step": 11190 }, { "epoch": 2.4529128339903634, "grad_norm": 7.610918998718262, "learning_rate": 1.7983761124990362e-06, "loss": 0.0173, "step": 11200 }, { "epoch": 2.455102934734998, "grad_norm": 0.4004572033882141, "learning_rate": 1.784587670506639e-06, "loss": 0.0267, "step": 11210 }, { "epoch": 2.457293035479632, "grad_norm": 1.9361090660095215, "learning_rate": 1.7708471107080705e-06, "loss": 0.0649, "step": 11220 }, { "epoch": 2.459483136224266, "grad_norm": 7.427115440368652, "learning_rate": 1.757154513187942e-06, "loss": 0.0376, "step": 11230 }, { "epoch": 2.4616732369689007, "grad_norm": 0.4180677831172943, "learning_rate": 1.743509957751336e-06, "loss": 0.0433, "step": 11240 }, { "epoch": 2.463863337713535, "grad_norm": 0.1115199476480484, "learning_rate": 1.7299135239233178e-06, "loss": 0.0573, "step": 11250 }, { "epoch": 2.466053438458169, "grad_norm": 0.6109313368797302, "learning_rate": 1.716365290948492e-06, "loss": 0.0111, "step": 11260 }, { "epoch": 2.4682435392028035, "grad_norm": 0.3206368684768677, "learning_rate": 1.7028653377905314e-06, "loss": 0.014, "step": 11270 }, { "epoch": 2.4704336399474376, "grad_norm": 0.013824772089719772, "learning_rate": 1.6894137431317116e-06, "loss": 0.0342, "step": 11280 }, { "epoch": 2.4726237406920717, "grad_norm": 0.03883196413516998, "learning_rate": 1.6760105853724684e-06, "loss": 0.0097, "step": 11290 }, { "epoch": 2.4748138414367062, "grad_norm": 0.029989417642354965, "learning_rate": 1.6626559426309208e-06, "loss": 0.022, "step": 11300 }, { "epoch": 2.4770039421813403, "grad_norm": 10.211779594421387, "learning_rate": 1.6493498927424345e-06, "loss": 0.0679, "step": 11310 }, { "epoch": 2.4791940429259745, "grad_norm": 0.029188064858317375, "learning_rate": 1.636092513259152e-06, "loss": 0.0643, "step": 11320 }, { "epoch": 2.481384143670609, "grad_norm": 0.3984737992286682, "learning_rate": 1.6228838814495573e-06, "loss": 0.007, "step": 11330 }, { "epoch": 2.483574244415243, "grad_norm": 0.2774839997291565, "learning_rate": 1.6097240742980058e-06, "loss": 0.0418, "step": 11340 }, { "epoch": 2.485764345159877, "grad_norm": 0.02474031411111355, "learning_rate": 1.5966131685042963e-06, "loss": 0.0312, "step": 11350 }, { "epoch": 2.4879544459045118, "grad_norm": 1.464008092880249, "learning_rate": 1.5835512404832066e-06, "loss": 0.0189, "step": 11360 }, { "epoch": 2.490144546649146, "grad_norm": 0.09371896088123322, "learning_rate": 1.5705383663640616e-06, "loss": 0.0608, "step": 11370 }, { "epoch": 2.49233464739378, "grad_norm": 9.74116039276123, "learning_rate": 1.5575746219902765e-06, "loss": 0.0199, "step": 11380 }, { "epoch": 2.4945247481384145, "grad_norm": 0.8118864893913269, "learning_rate": 1.5446600829189296e-06, "loss": 0.0362, "step": 11390 }, { "epoch": 2.4967148488830486, "grad_norm": 1.14534592628479, "learning_rate": 1.5317948244203074e-06, "loss": 0.0532, "step": 11400 }, { "epoch": 2.4989049496276827, "grad_norm": 3.389566421508789, "learning_rate": 1.5189789214774753e-06, "loss": 0.03, "step": 11410 }, { "epoch": 2.5010950503723173, "grad_norm": 6.305450439453125, "learning_rate": 1.5062124487858432e-06, "loss": 0.0324, "step": 11420 }, { "epoch": 2.5032851511169514, "grad_norm": 0.056873735040426254, "learning_rate": 1.4934954807527125e-06, "loss": 0.0194, "step": 11430 }, { "epoch": 2.5054752518615855, "grad_norm": 14.082108497619629, "learning_rate": 1.4808280914968676e-06, "loss": 0.0285, "step": 11440 }, { "epoch": 2.50766535260622, "grad_norm": 2.4273581504821777, "learning_rate": 1.4682103548481197e-06, "loss": 0.0478, "step": 11450 }, { "epoch": 2.509855453350854, "grad_norm": 7.351220607757568, "learning_rate": 1.4556423443469004e-06, "loss": 0.0378, "step": 11460 }, { "epoch": 2.5120455540954882, "grad_norm": 5.655972480773926, "learning_rate": 1.4431241332438061e-06, "loss": 0.0306, "step": 11470 }, { "epoch": 2.514235654840123, "grad_norm": 7.364433765411377, "learning_rate": 1.4306557944992006e-06, "loss": 0.026, "step": 11480 }, { "epoch": 2.516425755584757, "grad_norm": 0.4095740020275116, "learning_rate": 1.4182374007827605e-06, "loss": 0.0053, "step": 11490 }, { "epoch": 2.518615856329391, "grad_norm": 3.114041328430176, "learning_rate": 1.4058690244730777e-06, "loss": 0.1052, "step": 11500 }, { "epoch": 2.5208059570740255, "grad_norm": 5.198855876922607, "learning_rate": 1.3935507376572167e-06, "loss": 0.0717, "step": 11510 }, { "epoch": 2.5229960578186597, "grad_norm": 0.056828804314136505, "learning_rate": 1.3812826121303114e-06, "loss": 0.0156, "step": 11520 }, { "epoch": 2.5251861585632938, "grad_norm": 0.009340592660009861, "learning_rate": 1.369064719395131e-06, "loss": 0.0229, "step": 11530 }, { "epoch": 2.5273762593079283, "grad_norm": 10.321012496948242, "learning_rate": 1.3568971306616808e-06, "loss": 0.025, "step": 11540 }, { "epoch": 2.5295663600525624, "grad_norm": 0.01185495313256979, "learning_rate": 1.3447799168467647e-06, "loss": 0.0836, "step": 11550 }, { "epoch": 2.5317564607971965, "grad_norm": 0.2772132456302643, "learning_rate": 1.3327131485735978e-06, "loss": 0.0992, "step": 11560 }, { "epoch": 2.533946561541831, "grad_norm": 0.24496282637119293, "learning_rate": 1.3206968961713695e-06, "loss": 0.0381, "step": 11570 }, { "epoch": 2.536136662286465, "grad_norm": 1.9238848686218262, "learning_rate": 1.3087312296748544e-06, "loss": 0.0423, "step": 11580 }, { "epoch": 2.5383267630310993, "grad_norm": 4.519456386566162, "learning_rate": 1.296816218823994e-06, "loss": 0.0153, "step": 11590 }, { "epoch": 2.540516863775734, "grad_norm": 0.0005254403222352266, "learning_rate": 1.2849519330634874e-06, "loss": 0.037, "step": 11600 }, { "epoch": 2.542706964520368, "grad_norm": 0.4219284951686859, "learning_rate": 1.2731384415423964e-06, "loss": 0.0162, "step": 11610 }, { "epoch": 2.544897065265002, "grad_norm": 0.04044210910797119, "learning_rate": 1.2613758131137287e-06, "loss": 0.0262, "step": 11620 }, { "epoch": 2.5470871660096366, "grad_norm": 0.09268059581518173, "learning_rate": 1.2496641163340562e-06, "loss": 0.0432, "step": 11630 }, { "epoch": 2.5492772667542707, "grad_norm": 7.252458572387695, "learning_rate": 1.238003419463093e-06, "loss": 0.0124, "step": 11640 }, { "epoch": 2.551467367498905, "grad_norm": 0.9315186142921448, "learning_rate": 1.2263937904633162e-06, "loss": 0.0747, "step": 11650 }, { "epoch": 2.5536574682435393, "grad_norm": 0.023703956976532936, "learning_rate": 1.2148352969995558e-06, "loss": 0.0134, "step": 11660 }, { "epoch": 2.5558475689881734, "grad_norm": 9.646428108215332, "learning_rate": 1.2033280064386133e-06, "loss": 0.0317, "step": 11670 }, { "epoch": 2.5580376697328076, "grad_norm": 0.5800561904907227, "learning_rate": 1.1918719858488559e-06, "loss": 0.0019, "step": 11680 }, { "epoch": 2.560227770477442, "grad_norm": 0.021618977189064026, "learning_rate": 1.1804673019998403e-06, "loss": 0.0552, "step": 11690 }, { "epoch": 2.562417871222076, "grad_norm": 0.02040577121078968, "learning_rate": 1.1691140213619034e-06, "loss": 0.0333, "step": 11700 }, { "epoch": 2.5646079719667103, "grad_norm": 1.2705762386322021, "learning_rate": 1.157812210105801e-06, "loss": 0.0441, "step": 11710 }, { "epoch": 2.566798072711345, "grad_norm": 9.663126945495605, "learning_rate": 1.1465619341022948e-06, "loss": 0.1121, "step": 11720 }, { "epoch": 2.568988173455979, "grad_norm": 10.121922492980957, "learning_rate": 1.1353632589217922e-06, "loss": 0.0416, "step": 11730 }, { "epoch": 2.571178274200613, "grad_norm": 0.08818479627370834, "learning_rate": 1.1242162498339447e-06, "loss": 0.0272, "step": 11740 }, { "epoch": 2.5733683749452476, "grad_norm": 0.3730843663215637, "learning_rate": 1.1131209718072844e-06, "loss": 0.0744, "step": 11750 }, { "epoch": 2.5755584756898817, "grad_norm": 0.022303223609924316, "learning_rate": 1.1020774895088338e-06, "loss": 0.0376, "step": 11760 }, { "epoch": 2.577748576434516, "grad_norm": 3.5585010051727295, "learning_rate": 1.0910858673037294e-06, "loss": 0.0385, "step": 11770 }, { "epoch": 2.5799386771791504, "grad_norm": 9.670849800109863, "learning_rate": 1.0801461692548554e-06, "loss": 0.0173, "step": 11780 }, { "epoch": 2.5821287779237845, "grad_norm": 0.2038198709487915, "learning_rate": 1.0692584591224586e-06, "loss": 0.0285, "step": 11790 }, { "epoch": 2.5843188786684186, "grad_norm": 0.11282247304916382, "learning_rate": 1.0584228003637897e-06, "loss": 0.0527, "step": 11800 }, { "epoch": 2.586508979413053, "grad_norm": 13.732769966125488, "learning_rate": 1.0476392561327176e-06, "loss": 0.0248, "step": 11810 }, { "epoch": 2.5886990801576872, "grad_norm": 7.433139324188232, "learning_rate": 1.0369078892793804e-06, "loss": 0.0968, "step": 11820 }, { "epoch": 2.5908891809023213, "grad_norm": 4.339325904846191, "learning_rate": 1.0262287623497979e-06, "loss": 0.0545, "step": 11830 }, { "epoch": 2.593079281646956, "grad_norm": 3.86357045173645, "learning_rate": 1.0156019375855287e-06, "loss": 0.0681, "step": 11840 }, { "epoch": 2.59526938239159, "grad_norm": 0.2354222983121872, "learning_rate": 1.0050274769232859e-06, "loss": 0.0213, "step": 11850 }, { "epoch": 2.597459483136224, "grad_norm": 1.6461262702941895, "learning_rate": 9.945054419945987e-07, "loss": 0.0269, "step": 11860 }, { "epoch": 2.5996495838808586, "grad_norm": 0.020808998495340347, "learning_rate": 9.840358941254301e-07, "loss": 0.0347, "step": 11870 }, { "epoch": 2.6018396846254928, "grad_norm": 0.1988832652568817, "learning_rate": 9.736188943358416e-07, "loss": 0.0104, "step": 11880 }, { "epoch": 2.604029785370127, "grad_norm": 10.150317192077637, "learning_rate": 9.632545033396147e-07, "loss": 0.0684, "step": 11890 }, { "epoch": 2.6062198861147614, "grad_norm": 0.017003584653139114, "learning_rate": 9.529427815439219e-07, "loss": 0.0313, "step": 11900 }, { "epoch": 2.6084099868593955, "grad_norm": 2.123278856277466, "learning_rate": 9.426837890489504e-07, "loss": 0.0485, "step": 11910 }, { "epoch": 2.6106000876040296, "grad_norm": 0.07788466662168503, "learning_rate": 9.324775856475721e-07, "loss": 0.0288, "step": 11920 }, { "epoch": 2.612790188348664, "grad_norm": 0.030165301635861397, "learning_rate": 9.223242308249824e-07, "loss": 0.0182, "step": 11930 }, { "epoch": 2.6149802890932983, "grad_norm": 0.01696665585041046, "learning_rate": 9.122237837583547e-07, "loss": 0.0521, "step": 11940 }, { "epoch": 2.6171703898379324, "grad_norm": 0.17959335446357727, "learning_rate": 9.021763033165043e-07, "loss": 0.0552, "step": 11950 }, { "epoch": 2.619360490582567, "grad_norm": 0.39405763149261475, "learning_rate": 8.921818480595334e-07, "loss": 0.0406, "step": 11960 }, { "epoch": 2.621550591327201, "grad_norm": 0.03113142028450966, "learning_rate": 8.822404762385006e-07, "loss": 0.0464, "step": 11970 }, { "epoch": 2.623740692071835, "grad_norm": 8.113153457641602, "learning_rate": 8.733386758324813e-07, "loss": 0.0681, "step": 11980 }, { "epoch": 2.6259307928164697, "grad_norm": 9.766229629516602, "learning_rate": 8.644799619712829e-07, "loss": 0.0711, "step": 11990 }, { "epoch": 2.628120893561104, "grad_norm": 0.4738243818283081, "learning_rate": 8.546875310316838e-07, "loss": 0.0535, "step": 12000 }, { "epoch": 2.630310994305738, "grad_norm": 0.1726207435131073, "learning_rate": 8.449484020575493e-07, "loss": 0.0414, "step": 12010 }, { "epoch": 2.6325010950503724, "grad_norm": 7.792304992675781, "learning_rate": 8.352626318118029e-07, "loss": 0.0494, "step": 12020 }, { "epoch": 2.6346911957950065, "grad_norm": 8.798876762390137, "learning_rate": 8.256302767463831e-07, "loss": 0.0354, "step": 12030 }, { "epoch": 2.6368812965396407, "grad_norm": 0.28635525703430176, "learning_rate": 8.160513930019021e-07, "loss": 0.0591, "step": 12040 }, { "epoch": 2.639071397284275, "grad_norm": 0.12087702006101608, "learning_rate": 8.065260364073268e-07, "loss": 0.0265, "step": 12050 }, { "epoch": 2.6412614980289093, "grad_norm": 0.2926778793334961, "learning_rate": 7.970542624796462e-07, "loss": 0.1056, "step": 12060 }, { "epoch": 2.6434515987735434, "grad_norm": 10.764483451843262, "learning_rate": 7.876361264235555e-07, "loss": 0.0448, "step": 12070 }, { "epoch": 2.645641699518178, "grad_norm": 9.699063301086426, "learning_rate": 7.782716831311255e-07, "loss": 0.0404, "step": 12080 }, { "epoch": 2.647831800262812, "grad_norm": 14.730912208557129, "learning_rate": 7.689609871814918e-07, "loss": 0.0443, "step": 12090 }, { "epoch": 2.650021901007446, "grad_norm": 10.732462882995605, "learning_rate": 7.597040928405297e-07, "loss": 0.0345, "step": 12100 }, { "epoch": 2.6522120017520807, "grad_norm": 3.841738700866699, "learning_rate": 7.505010540605417e-07, "loss": 0.013, "step": 12110 }, { "epoch": 2.654402102496715, "grad_norm": 4.816169261932373, "learning_rate": 7.413519244799417e-07, "loss": 0.0613, "step": 12120 }, { "epoch": 2.656592203241349, "grad_norm": 0.2099931389093399, "learning_rate": 7.322567574229411e-07, "loss": 0.0266, "step": 12130 }, { "epoch": 2.6587823039859835, "grad_norm": 0.03638794273138046, "learning_rate": 7.232156058992456e-07, "loss": 0.0358, "step": 12140 }, { "epoch": 2.6609724047306176, "grad_norm": 0.26869332790374756, "learning_rate": 7.142285226037293e-07, "loss": 0.0679, "step": 12150 }, { "epoch": 2.6631625054752517, "grad_norm": 0.011009445413947105, "learning_rate": 7.052955599161504e-07, "loss": 0.0079, "step": 12160 }, { "epoch": 2.6653526062198862, "grad_norm": 0.11883076280355453, "learning_rate": 6.96416769900824e-07, "loss": 0.0163, "step": 12170 }, { "epoch": 2.6675427069645203, "grad_norm": 5.6963324546813965, "learning_rate": 6.875922043063355e-07, "loss": 0.0189, "step": 12180 }, { "epoch": 2.6697328077091544, "grad_norm": 1.6469123363494873, "learning_rate": 6.788219145652264e-07, "loss": 0.021, "step": 12190 }, { "epoch": 2.671922908453789, "grad_norm": 6.454479217529297, "learning_rate": 6.70105951793707e-07, "loss": 0.0294, "step": 12200 }, { "epoch": 2.674113009198423, "grad_norm": 7.654608249664307, "learning_rate": 6.61444366791345e-07, "loss": 0.0929, "step": 12210 }, { "epoch": 2.676303109943057, "grad_norm": 14.82779598236084, "learning_rate": 6.528372100407809e-07, "loss": 0.0648, "step": 12220 }, { "epoch": 2.6784932106876918, "grad_norm": 0.32993870973587036, "learning_rate": 6.442845317074253e-07, "loss": 0.0433, "step": 12230 }, { "epoch": 2.680683311432326, "grad_norm": 7.137444972991943, "learning_rate": 6.357863816391741e-07, "loss": 0.0192, "step": 12240 }, { "epoch": 2.68287341217696, "grad_norm": 0.01482816319912672, "learning_rate": 6.273428093661093e-07, "loss": 0.0429, "step": 12250 }, { "epoch": 2.6850635129215945, "grad_norm": 5.47510290145874, "learning_rate": 6.189538641002213e-07, "loss": 0.0259, "step": 12260 }, { "epoch": 2.6872536136662286, "grad_norm": 0.17006351053714752, "learning_rate": 6.106195947351101e-07, "loss": 0.0043, "step": 12270 }, { "epoch": 2.6894437144108627, "grad_norm": 0.4496266543865204, "learning_rate": 6.023400498457088e-07, "loss": 0.0219, "step": 12280 }, { "epoch": 2.6916338151554973, "grad_norm": 0.34782305359840393, "learning_rate": 5.941152776879966e-07, "loss": 0.0198, "step": 12290 }, { "epoch": 2.6938239159001314, "grad_norm": 0.020884022116661072, "learning_rate": 5.859453261987213e-07, "loss": 0.0366, "step": 12300 }, { "epoch": 2.6960140166447655, "grad_norm": 8.046713829040527, "learning_rate": 5.778302429951155e-07, "loss": 0.0141, "step": 12310 }, { "epoch": 2.6982041173894, "grad_norm": 0.01791343279182911, "learning_rate": 5.697700753746194e-07, "loss": 0.0126, "step": 12320 }, { "epoch": 2.700394218134034, "grad_norm": 0.02134949527680874, "learning_rate": 5.617648703146106e-07, "loss": 0.0269, "step": 12330 }, { "epoch": 2.7025843188786682, "grad_norm": 0.13765187561511993, "learning_rate": 5.538146744721207e-07, "loss": 0.0042, "step": 12340 }, { "epoch": 2.704774419623303, "grad_norm": 0.6140391230583191, "learning_rate": 5.459195341835766e-07, "loss": 0.0098, "step": 12350 }, { "epoch": 2.706964520367937, "grad_norm": 10.319211959838867, "learning_rate": 5.380794954645141e-07, "loss": 0.0306, "step": 12360 }, { "epoch": 2.709154621112571, "grad_norm": 2.40873384475708, "learning_rate": 5.302946040093282e-07, "loss": 0.002, "step": 12370 }, { "epoch": 2.7113447218572055, "grad_norm": 7.512635231018066, "learning_rate": 5.225649051909865e-07, "loss": 0.0466, "step": 12380 }, { "epoch": 2.7135348226018396, "grad_norm": 0.04386173188686371, "learning_rate": 5.148904440607849e-07, "loss": 0.0147, "step": 12390 }, { "epoch": 2.7157249233464738, "grad_norm": 10.708556175231934, "learning_rate": 5.072712653480682e-07, "loss": 0.0541, "step": 12400 }, { "epoch": 2.7179150240911083, "grad_norm": 0.022989969700574875, "learning_rate": 4.997074134599821e-07, "loss": 0.0293, "step": 12410 }, { "epoch": 2.7201051248357424, "grad_norm": 9.40102481842041, "learning_rate": 4.921989324812049e-07, "loss": 0.0669, "step": 12420 }, { "epoch": 2.7222952255803765, "grad_norm": 8.323657035827637, "learning_rate": 4.847458661736971e-07, "loss": 0.0235, "step": 12430 }, { "epoch": 2.724485326325011, "grad_norm": 0.15721505880355835, "learning_rate": 4.77348257976441e-07, "loss": 0.0245, "step": 12440 }, { "epoch": 2.726675427069645, "grad_norm": 5.829390048980713, "learning_rate": 4.7000615100519345e-07, "loss": 0.0204, "step": 12450 }, { "epoch": 2.7288655278142793, "grad_norm": 0.5402848124504089, "learning_rate": 4.627195880522306e-07, "loss": 0.0282, "step": 12460 }, { "epoch": 2.731055628558914, "grad_norm": 0.20608052611351013, "learning_rate": 4.5548861158609794e-07, "loss": 0.0244, "step": 12470 }, { "epoch": 2.733245729303548, "grad_norm": 0.13509126007556915, "learning_rate": 4.483132637513654e-07, "loss": 0.051, "step": 12480 }, { "epoch": 2.735435830048182, "grad_norm": 0.017962230369448662, "learning_rate": 4.411935863683803e-07, "loss": 0.0279, "step": 12490 }, { "epoch": 2.7376259307928166, "grad_norm": 0.08614633977413177, "learning_rate": 4.341296209330237e-07, "loss": 0.0441, "step": 12500 }, { "epoch": 2.7398160315374507, "grad_norm": 0.8985525965690613, "learning_rate": 4.2712140861646656e-07, "loss": 0.0243, "step": 12510 }, { "epoch": 2.742006132282085, "grad_norm": 8.991988182067871, "learning_rate": 4.2016899026493753e-07, "loss": 0.0622, "step": 12520 }, { "epoch": 2.7441962330267193, "grad_norm": 0.039479002356529236, "learning_rate": 4.1327240639947306e-07, "loss": 0.0118, "step": 12530 }, { "epoch": 2.7463863337713534, "grad_norm": 0.47407758235931396, "learning_rate": 4.064316972156912e-07, "loss": 0.0116, "step": 12540 }, { "epoch": 2.7485764345159875, "grad_norm": 9.767305374145508, "learning_rate": 3.996469025835503e-07, "loss": 0.0348, "step": 12550 }, { "epoch": 2.750766535260622, "grad_norm": 0.35034656524658203, "learning_rate": 3.92918062047124e-07, "loss": 0.01, "step": 12560 }, { "epoch": 2.752956636005256, "grad_norm": 7.979565620422363, "learning_rate": 3.862452148243623e-07, "loss": 0.0378, "step": 12570 }, { "epoch": 2.7551467367498903, "grad_norm": 0.44257134199142456, "learning_rate": 3.796283998068706e-07, "loss": 0.0213, "step": 12580 }, { "epoch": 2.757336837494525, "grad_norm": 4.723199844360352, "learning_rate": 3.7306765555967793e-07, "loss": 0.037, "step": 12590 }, { "epoch": 2.759526938239159, "grad_norm": 0.14705337584018707, "learning_rate": 3.6656302032101576e-07, "loss": 0.053, "step": 12600 }, { "epoch": 2.761717038983793, "grad_norm": 0.1678166538476944, "learning_rate": 3.6011453200209047e-07, "loss": 0.0339, "step": 12610 }, { "epoch": 2.7639071397284276, "grad_norm": 0.02788754366338253, "learning_rate": 3.5372222818686685e-07, "loss": 0.0238, "step": 12620 }, { "epoch": 2.7660972404730617, "grad_norm": 0.9186058640480042, "learning_rate": 3.473861461318495e-07, "loss": 0.0651, "step": 12630 }, { "epoch": 2.768287341217696, "grad_norm": 0.22539930045604706, "learning_rate": 3.411063227658584e-07, "loss": 0.0575, "step": 12640 }, { "epoch": 2.7704774419623304, "grad_norm": 0.02629416435956955, "learning_rate": 3.3488279468982476e-07, "loss": 0.063, "step": 12650 }, { "epoch": 2.7726675427069645, "grad_norm": 9.930575370788574, "learning_rate": 3.287155981765666e-07, "loss": 0.0907, "step": 12660 }, { "epoch": 2.7748576434515986, "grad_norm": 0.021218854933977127, "learning_rate": 3.2260476917058913e-07, "loss": 0.039, "step": 12670 }, { "epoch": 2.777047744196233, "grad_norm": 0.03904233127832413, "learning_rate": 3.1655034328786027e-07, "loss": 0.0059, "step": 12680 }, { "epoch": 2.7792378449408672, "grad_norm": 0.059481941163539886, "learning_rate": 3.1055235581561983e-07, "loss": 0.0443, "step": 12690 }, { "epoch": 2.7814279456855013, "grad_norm": 1.7187482118606567, "learning_rate": 3.0461084171215847e-07, "loss": 0.0542, "step": 12700 }, { "epoch": 2.783618046430136, "grad_norm": 6.203575611114502, "learning_rate": 2.9872583560662695e-07, "loss": 0.0284, "step": 12710 }, { "epoch": 2.78580814717477, "grad_norm": 8.129768371582031, "learning_rate": 2.928973717988226e-07, "loss": 0.0398, "step": 12720 }, { "epoch": 2.787998247919404, "grad_norm": 0.38582733273506165, "learning_rate": 2.871254842590021e-07, "loss": 0.0437, "step": 12730 }, { "epoch": 2.7901883486640386, "grad_norm": 0.00038919298094697297, "learning_rate": 2.8141020662767024e-07, "loss": 0.0079, "step": 12740 }, { "epoch": 2.7923784494086727, "grad_norm": 7.471248626708984, "learning_rate": 2.757515722153925e-07, "loss": 0.0479, "step": 12750 }, { "epoch": 2.794568550153307, "grad_norm": 1.6210508346557617, "learning_rate": 2.7014961400259943e-07, "loss": 0.0298, "step": 12760 }, { "epoch": 2.7967586508979414, "grad_norm": 0.28909286856651306, "learning_rate": 2.6460436463939365e-07, "loss": 0.0026, "step": 12770 }, { "epoch": 2.7989487516425755, "grad_norm": 0.023134293034672737, "learning_rate": 2.5911585644535553e-07, "loss": 0.0562, "step": 12780 }, { "epoch": 2.8011388523872096, "grad_norm": 0.32009032368659973, "learning_rate": 2.536841214093655e-07, "loss": 0.0017, "step": 12790 }, { "epoch": 2.803328953131844, "grad_norm": 1.581437110900879, "learning_rate": 2.483091911894064e-07, "loss": 0.0478, "step": 12800 }, { "epoch": 2.8055190538764783, "grad_norm": 0.0420171394944191, "learning_rate": 2.429910971123817e-07, "loss": 0.0307, "step": 12810 }, { "epoch": 2.8077091546211124, "grad_norm": 9.534303665161133, "learning_rate": 2.3772987017394166e-07, "loss": 0.0723, "step": 12820 }, { "epoch": 2.809899255365747, "grad_norm": 9.096306800842285, "learning_rate": 2.325255410382876e-07, "loss": 0.022, "step": 12830 }, { "epoch": 2.812089356110381, "grad_norm": 0.0023589637130498886, "learning_rate": 2.2737814003800707e-07, "loss": 0.0102, "step": 12840 }, { "epoch": 2.814279456855015, "grad_norm": 0.14500868320465088, "learning_rate": 2.2228769717388742e-07, "loss": 0.0283, "step": 12850 }, { "epoch": 2.8164695575996497, "grad_norm": 0.042636580765247345, "learning_rate": 2.1725424211474943e-07, "loss": 0.0477, "step": 12860 }, { "epoch": 2.818659658344284, "grad_norm": 0.09764314442873001, "learning_rate": 2.1227780419726285e-07, "loss": 0.0512, "step": 12870 }, { "epoch": 2.820849759088918, "grad_norm": 0.15343749523162842, "learning_rate": 2.0735841242578992e-07, "loss": 0.0035, "step": 12880 }, { "epoch": 2.8230398598335524, "grad_norm": 0.28310132026672363, "learning_rate": 2.0249609547220106e-07, "loss": 0.003, "step": 12890 }, { "epoch": 2.8252299605781865, "grad_norm": 1.0581351518630981, "learning_rate": 1.9769088167572393e-07, "loss": 0.0543, "step": 12900 }, { "epoch": 2.8274200613228206, "grad_norm": 3.390519618988037, "learning_rate": 1.9294279904276126e-07, "loss": 0.0302, "step": 12910 }, { "epoch": 2.829610162067455, "grad_norm": 3.8201913833618164, "learning_rate": 1.8825187524674327e-07, "loss": 0.0496, "step": 12920 }, { "epoch": 2.8318002628120893, "grad_norm": 0.041408833116292953, "learning_rate": 1.8361813762795332e-07, "loss": 0.0195, "step": 12930 }, { "epoch": 2.8339903635567234, "grad_norm": 0.03204504773020744, "learning_rate": 1.790416131933781e-07, "loss": 0.0214, "step": 12940 }, { "epoch": 2.836180464301358, "grad_norm": 7.1436381340026855, "learning_rate": 1.745223286165454e-07, "loss": 0.0906, "step": 12950 }, { "epoch": 2.838370565045992, "grad_norm": 0.2234649509191513, "learning_rate": 1.700603102373688e-07, "loss": 0.0106, "step": 12960 }, { "epoch": 2.840560665790626, "grad_norm": 0.07009760290384293, "learning_rate": 1.6565558406199888e-07, "loss": 0.0341, "step": 12970 }, { "epoch": 2.8427507665352607, "grad_norm": 9.495372772216797, "learning_rate": 1.6130817576266333e-07, "loss": 0.0223, "step": 12980 }, { "epoch": 2.844940867279895, "grad_norm": 0.437217116355896, "learning_rate": 1.5744453602616781e-07, "loss": 0.0404, "step": 12990 }, { "epoch": 2.847130968024529, "grad_norm": 9.335092544555664, "learning_rate": 1.5362736253752064e-07, "loss": 0.0454, "step": 13000 }, { "epoch": 2.8493210687691635, "grad_norm": 3.509570837020874, "learning_rate": 1.4944057802319555e-07, "loss": 0.035, "step": 13010 }, { "epoch": 2.8515111695137976, "grad_norm": 0.19957946240901947, "learning_rate": 1.4531120589142856e-07, "loss": 0.0275, "step": 13020 }, { "epoch": 2.8537012702584317, "grad_norm": 0.35728198289871216, "learning_rate": 1.4123927020959437e-07, "loss": 0.0307, "step": 13030 }, { "epoch": 2.8558913710030662, "grad_norm": 9.612245559692383, "learning_rate": 1.3722479471030558e-07, "loss": 0.0173, "step": 13040 }, { "epoch": 2.8580814717477003, "grad_norm": 4.515768527984619, "learning_rate": 1.3326780279128258e-07, "loss": 0.0293, "step": 13050 }, { "epoch": 2.8602715724923344, "grad_norm": 0.08466468006372452, "learning_rate": 1.2936831751520605e-07, "loss": 0.0171, "step": 13060 }, { "epoch": 2.862461673236969, "grad_norm": 10.645101547241211, "learning_rate": 1.2552636160959365e-07, "loss": 0.0284, "step": 13070 }, { "epoch": 2.864651773981603, "grad_norm": 0.0040908209048211575, "learning_rate": 1.2174195746665674e-07, "loss": 0.0215, "step": 13080 }, { "epoch": 2.866841874726237, "grad_norm": 4.853795528411865, "learning_rate": 1.1801512714318286e-07, "loss": 0.0059, "step": 13090 }, { "epoch": 2.8690319754708717, "grad_norm": 10.70047378540039, "learning_rate": 1.143458923603935e-07, "loss": 0.1065, "step": 13100 }, { "epoch": 2.871222076215506, "grad_norm": 0.042464204132556915, "learning_rate": 1.1073427450382756e-07, "loss": 0.0391, "step": 13110 }, { "epoch": 2.87341217696014, "grad_norm": 7.492866516113281, "learning_rate": 1.0718029462321144e-07, "loss": 0.047, "step": 13120 }, { "epoch": 2.8756022777047745, "grad_norm": 0.1017516478896141, "learning_rate": 1.0368397343233915e-07, "loss": 0.0371, "step": 13130 }, { "epoch": 2.8777923784494086, "grad_norm": 0.10130762308835983, "learning_rate": 1.0024533130894909e-07, "loss": 0.0283, "step": 13140 }, { "epoch": 2.8799824791940427, "grad_norm": 0.07096447795629501, "learning_rate": 9.686438829460742e-08, "loss": 0.0208, "step": 13150 }, { "epoch": 2.8821725799386773, "grad_norm": 8.342753410339355, "learning_rate": 9.354116409459157e-08, "loss": 0.0305, "step": 13160 }, { "epoch": 2.8843626806833114, "grad_norm": 0.1246112510561943, "learning_rate": 9.027567807777027e-08, "loss": 0.04, "step": 13170 }, { "epoch": 2.8865527814279455, "grad_norm": 9.109840393066406, "learning_rate": 8.706794927650031e-08, "loss": 0.0876, "step": 13180 }, { "epoch": 2.88874288217258, "grad_norm": 8.936704635620117, "learning_rate": 8.391799638650555e-08, "loss": 0.0517, "step": 13190 }, { "epoch": 2.890932982917214, "grad_norm": 9.722352027893066, "learning_rate": 8.082583776677367e-08, "loss": 0.0413, "step": 13200 }, { "epoch": 2.8931230836618482, "grad_norm": 0.9101546406745911, "learning_rate": 7.77914914394473e-08, "loss": 0.0264, "step": 13210 }, { "epoch": 2.8953131844064828, "grad_norm": 10.80632209777832, "learning_rate": 7.481497508972313e-08, "loss": 0.0744, "step": 13220 }, { "epoch": 2.897503285151117, "grad_norm": 7.437931537628174, "learning_rate": 7.18963060657385e-08, "loss": 0.0425, "step": 13230 }, { "epoch": 2.899693385895751, "grad_norm": 0.002111058682203293, "learning_rate": 6.903550137848381e-08, "loss": 0.0697, "step": 13240 }, { "epoch": 2.9018834866403855, "grad_norm": 0.09892608970403671, "learning_rate": 6.623257770169256e-08, "loss": 0.0349, "step": 13250 }, { "epoch": 2.9040735873850196, "grad_norm": 3.277552843093872, "learning_rate": 6.348755137174923e-08, "loss": 0.0818, "step": 13260 }, { "epoch": 2.9062636881296537, "grad_norm": 4.254228115081787, "learning_rate": 6.080043838759264e-08, "loss": 0.0456, "step": 13270 }, { "epoch": 2.9084537888742883, "grad_norm": 0.038634199649095535, "learning_rate": 5.817125441062388e-08, "loss": 0.0022, "step": 13280 }, { "epoch": 2.9106438896189224, "grad_norm": 6.89719295501709, "learning_rate": 5.5600014764614073e-08, "loss": 0.0451, "step": 13290 }, { "epoch": 2.9128339903635565, "grad_norm": 0.8880706429481506, "learning_rate": 5.308673443561563e-08, "loss": 0.0416, "step": 13300 }, { "epoch": 2.915024091108191, "grad_norm": 0.5370209217071533, "learning_rate": 5.063142807187227e-08, "loss": 0.0405, "step": 13310 }, { "epoch": 2.917214191852825, "grad_norm": 0.047260869294404984, "learning_rate": 4.823410998373912e-08, "loss": 0.0277, "step": 13320 }, { "epoch": 2.9194042925974593, "grad_norm": 0.24953371286392212, "learning_rate": 4.589479414359388e-08, "loss": 0.0082, "step": 13330 }, { "epoch": 2.921594393342094, "grad_norm": 0.4939959943294525, "learning_rate": 4.3613494185758e-08, "loss": 0.0548, "step": 13340 }, { "epoch": 2.923784494086728, "grad_norm": 0.07067824900150299, "learning_rate": 4.139022340641785e-08, "loss": 0.0107, "step": 13350 }, { "epoch": 2.925974594831362, "grad_norm": 0.036173462867736816, "learning_rate": 3.92249947635448e-08, "loss": 0.0051, "step": 13360 }, { "epoch": 2.9281646955759966, "grad_norm": 4.334299087524414, "learning_rate": 3.711782087682192e-08, "loss": 0.0179, "step": 13370 }, { "epoch": 2.9303547963206307, "grad_norm": 2.331427574157715, "learning_rate": 3.5068714027569615e-08, "loss": 0.0484, "step": 13380 }, { "epoch": 2.9325448970652648, "grad_norm": 6.559699535369873, "learning_rate": 3.307768615867235e-08, "loss": 0.0412, "step": 13390 }, { "epoch": 2.9347349978098993, "grad_norm": 0.0944531112909317, "learning_rate": 3.114474887451313e-08, "loss": 0.0352, "step": 13400 }, { "epoch": 2.9369250985545334, "grad_norm": 2.088341236114502, "learning_rate": 2.9269913440902466e-08, "loss": 0.0769, "step": 13410 }, { "epoch": 2.9391151992991675, "grad_norm": 4.424472808837891, "learning_rate": 2.745319078501174e-08, "loss": 0.0359, "step": 13420 }, { "epoch": 2.941305300043802, "grad_norm": 7.268121719360352, "learning_rate": 2.5694591495316613e-08, "loss": 0.0784, "step": 13430 }, { "epoch": 2.943495400788436, "grad_norm": 9.986968040466309, "learning_rate": 2.3994125821523716e-08, "loss": 0.0339, "step": 13440 }, { "epoch": 2.9456855015330703, "grad_norm": 0.6594504117965698, "learning_rate": 2.2351803674522942e-08, "loss": 0.0155, "step": 13450 }, { "epoch": 2.947875602277705, "grad_norm": 9.950922966003418, "learning_rate": 2.07676346263197e-08, "loss": 0.0496, "step": 13460 }, { "epoch": 2.950065703022339, "grad_norm": 9.533273696899414, "learning_rate": 1.924162790998718e-08, "loss": 0.0557, "step": 13470 }, { "epoch": 2.952255803766973, "grad_norm": 0.10136058181524277, "learning_rate": 1.7773792419606418e-08, "loss": 0.0399, "step": 13480 }, { "epoch": 2.9544459045116076, "grad_norm": 9.685738563537598, "learning_rate": 1.6364136710216305e-08, "loss": 0.042, "step": 13490 }, { "epoch": 2.9566360052562417, "grad_norm": 10.604461669921875, "learning_rate": 1.50126689977681e-08, "loss": 0.0714, "step": 13500 }, { "epoch": 2.958826106000876, "grad_norm": 10.06485366821289, "learning_rate": 1.3719397159069892e-08, "loss": 0.0725, "step": 13510 }, { "epoch": 2.9610162067455104, "grad_norm": 5.8250274658203125, "learning_rate": 1.2484328731747763e-08, "loss": 0.0207, "step": 13520 }, { "epoch": 2.9632063074901445, "grad_norm": 0.022053489461541176, "learning_rate": 1.1307470914194707e-08, "loss": 0.0781, "step": 13530 }, { "epoch": 2.9653964082347786, "grad_norm": 0.11585117876529694, "learning_rate": 1.0188830565537321e-08, "loss": 0.0158, "step": 13540 }, { "epoch": 2.967586508979413, "grad_norm": 1.3162285089492798, "learning_rate": 9.128414205586967e-09, "loss": 0.0206, "step": 13550 }, { "epoch": 2.9697766097240472, "grad_norm": 0.0020387242548167706, "learning_rate": 8.126228014808668e-09, "loss": 0.0414, "step": 13560 }, { "epoch": 2.9719667104686813, "grad_norm": 5.92432165145874, "learning_rate": 7.182277834281159e-09, "loss": 0.0093, "step": 13570 }, { "epoch": 2.974156811213316, "grad_norm": 10.113669395446777, "learning_rate": 6.296569165665789e-09, "loss": 0.1149, "step": 13580 }, { "epoch": 2.97634691195795, "grad_norm": 7.431230545043945, "learning_rate": 5.469107171169885e-09, "loss": 0.0381, "step": 13590 }, { "epoch": 2.978537012702584, "grad_norm": 1.3582772016525269, "learning_rate": 4.699896673521221e-09, "loss": 0.0261, "step": 13600 }, { "epoch": 2.9807271134472186, "grad_norm": 0.16403117775917053, "learning_rate": 3.988942155939146e-09, "loss": 0.0329, "step": 13610 }, { "epoch": 2.9829172141918527, "grad_norm": 0.10906914621591568, "learning_rate": 3.336247762104616e-09, "loss": 0.0456, "step": 13620 }, { "epoch": 2.985107314936487, "grad_norm": 0.013551436364650726, "learning_rate": 2.7418172961424237e-09, "loss": 0.0093, "step": 13630 }, { "epoch": 2.9872974156811214, "grad_norm": 1.618685007095337, "learning_rate": 2.205654222592335e-09, "loss": 0.0419, "step": 13640 }, { "epoch": 2.9894875164257555, "grad_norm": 0.11479564011096954, "learning_rate": 1.7277616663946562e-09, "loss": 0.0238, "step": 13650 }, { "epoch": 2.9916776171703896, "grad_norm": 0.09242575615644455, "learning_rate": 1.3081424128669195e-09, "loss": 0.0265, "step": 13660 }, { "epoch": 2.993867717915024, "grad_norm": 0.19523653388023376, "learning_rate": 9.46798907693891e-10, "loss": 0.0024, "step": 13670 }, { "epoch": 2.9960578186596583, "grad_norm": 8.324796676635742, "learning_rate": 6.437332569053656e-10, "loss": 0.0214, "step": 13680 }, { "epoch": 2.9982479194042924, "grad_norm": 0.01888529397547245, "learning_rate": 3.989472268717265e-10, "loss": 0.0335, "step": 13690 }, { "epoch": 3.0, "step": 13698, "total_flos": 1.2962727349043855e+18, "train_loss": 0.08785007219527181, "train_runtime": 59814.4051, "train_samples_per_second": 1.832, "train_steps_per_second": 0.229 } ], "logging_steps": 10, "max_steps": 13698, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2962727349043855e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }