{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.023999808001535988, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 7.999936000511996e-05, "grad_norm": 4.414881453815827, "learning_rate": 7.199424046076314e-09, "loss": 0.5441, "step": 10 }, { "epoch": 0.00015999872001023991, "grad_norm": 4.004957247533114, "learning_rate": 1.519878409727222e-08, "loss": 0.5353, "step": 20 }, { "epoch": 0.00023999808001535987, "grad_norm": 4.889790729834478, "learning_rate": 2.3198144148468124e-08, "loss": 0.537, "step": 30 }, { "epoch": 0.00031999744002047983, "grad_norm": 3.7138301639437508, "learning_rate": 3.119750419966403e-08, "loss": 0.5488, "step": 40 }, { "epoch": 0.0003999968000255998, "grad_norm": 4.208028943214464, "learning_rate": 3.919686425085993e-08, "loss": 0.5557, "step": 50 }, { "epoch": 0.00047999616003071974, "grad_norm": 4.326896533998126, "learning_rate": 4.719622430205584e-08, "loss": 0.5061, "step": 60 }, { "epoch": 0.0005599955200358397, "grad_norm": 7.1723877700496415, "learning_rate": 5.519558435325175e-08, "loss": 0.5193, "step": 70 }, { "epoch": 0.0006399948800409597, "grad_norm": 3.892238785520058, "learning_rate": 6.319494440444764e-08, "loss": 0.5311, "step": 80 }, { "epoch": 0.0007199942400460796, "grad_norm": 3.4517472540795513, "learning_rate": 7.119430445564356e-08, "loss": 0.5232, "step": 90 }, { "epoch": 0.0007999936000511996, "grad_norm": 4.094057558667446, "learning_rate": 7.919366450683946e-08, "loss": 0.5225, "step": 100 }, { "epoch": 0.0008799929600563195, "grad_norm": 3.4395682865399144, "learning_rate": 8.719302455803536e-08, "loss": 0.5371, "step": 110 }, { "epoch": 0.0009599923200614395, "grad_norm": 3.7029712275218682, "learning_rate": 9.519238460923127e-08, "loss": 0.5305, "step": 120 }, { "epoch": 0.0010399916800665594, "grad_norm": 3.493027269180126, "learning_rate": 1.0319174466042718e-07, "loss": 0.5167, "step": 130 }, { "epoch": 0.0011199910400716794, "grad_norm": 3.480790725115363, "learning_rate": 1.1119110471162308e-07, "loss": 0.5543, "step": 140 }, { "epoch": 0.0011999904000767993, "grad_norm": 3.0350778792458564, "learning_rate": 1.1919046476281897e-07, "loss": 0.5188, "step": 150 }, { "epoch": 0.0012799897600819193, "grad_norm": 2.7703962024693216, "learning_rate": 1.271898248140149e-07, "loss": 0.5176, "step": 160 }, { "epoch": 0.0013599891200870393, "grad_norm": 2.7312812150288277, "learning_rate": 1.351891848652108e-07, "loss": 0.5206, "step": 170 }, { "epoch": 0.0014399884800921593, "grad_norm": 2.6772442085163837, "learning_rate": 1.431885449164067e-07, "loss": 0.5187, "step": 180 }, { "epoch": 0.0015199878400972793, "grad_norm": 2.6017371604025503, "learning_rate": 1.5118790496760262e-07, "loss": 0.4858, "step": 190 }, { "epoch": 0.0015999872001023993, "grad_norm": 2.361921882047865, "learning_rate": 1.5918726501879854e-07, "loss": 0.4965, "step": 200 }, { "epoch": 0.0016799865601075192, "grad_norm": 1.9949603945902301, "learning_rate": 1.6718662506999443e-07, "loss": 0.49, "step": 210 }, { "epoch": 0.001759985920112639, "grad_norm": 1.689017850332906, "learning_rate": 1.7518598512119031e-07, "loss": 0.4474, "step": 220 }, { "epoch": 0.001839985280117759, "grad_norm": 1.7526108484730667, "learning_rate": 1.8318534517238623e-07, "loss": 0.4758, "step": 230 }, { "epoch": 0.001919984640122879, "grad_norm": 1.6619250196852287, "learning_rate": 1.9118470522358212e-07, "loss": 0.4551, "step": 240 }, { "epoch": 0.001999984000127999, "grad_norm": 1.4244411983222107, "learning_rate": 1.9918406527477803e-07, "loss": 0.4666, "step": 250 }, { "epoch": 0.0020799833601331187, "grad_norm": 1.4091371713945773, "learning_rate": 2.0718342532597392e-07, "loss": 0.4744, "step": 260 }, { "epoch": 0.002159982720138239, "grad_norm": 1.4037698231377602, "learning_rate": 2.1518278537716986e-07, "loss": 0.4577, "step": 270 }, { "epoch": 0.0022399820801433587, "grad_norm": 1.1233973407058262, "learning_rate": 2.2318214542836575e-07, "loss": 0.4442, "step": 280 }, { "epoch": 0.002319981440148479, "grad_norm": 1.0934034827825632, "learning_rate": 2.3118150547956164e-07, "loss": 0.4746, "step": 290 }, { "epoch": 0.0023999808001535987, "grad_norm": 0.9547032856231036, "learning_rate": 2.3918086553075753e-07, "loss": 0.4631, "step": 300 }, { "epoch": 0.002479980160158719, "grad_norm": 1.011447249716743, "learning_rate": 2.4718022558195345e-07, "loss": 0.4359, "step": 310 }, { "epoch": 0.0025599795201638386, "grad_norm": 0.7506687829382205, "learning_rate": 2.5517958563314936e-07, "loss": 0.4554, "step": 320 }, { "epoch": 0.002639978880168959, "grad_norm": 0.8268141903301145, "learning_rate": 2.631789456843453e-07, "loss": 0.4444, "step": 330 }, { "epoch": 0.0027199782401740786, "grad_norm": 0.9120933866995852, "learning_rate": 2.711783057355412e-07, "loss": 0.4662, "step": 340 }, { "epoch": 0.0027999776001791984, "grad_norm": 0.9481570868182263, "learning_rate": 2.791776657867371e-07, "loss": 0.4677, "step": 350 }, { "epoch": 0.0028799769601843186, "grad_norm": 0.8010445040150771, "learning_rate": 2.8717702583793297e-07, "loss": 0.4196, "step": 360 }, { "epoch": 0.0029599763201894383, "grad_norm": 0.8670839473253033, "learning_rate": 2.951763858891289e-07, "loss": 0.4492, "step": 370 }, { "epoch": 0.0030399756801945585, "grad_norm": 0.8536232676044045, "learning_rate": 3.031757459403248e-07, "loss": 0.4375, "step": 380 }, { "epoch": 0.0031199750401996783, "grad_norm": 0.8135677817452803, "learning_rate": 3.111751059915207e-07, "loss": 0.438, "step": 390 }, { "epoch": 0.0031999744002047985, "grad_norm": 0.9667962048008838, "learning_rate": 3.191744660427166e-07, "loss": 0.4348, "step": 400 }, { "epoch": 0.0032799737602099183, "grad_norm": 0.885059264680228, "learning_rate": 3.271738260939125e-07, "loss": 0.4295, "step": 410 }, { "epoch": 0.0033599731202150385, "grad_norm": 0.7112007856245484, "learning_rate": 3.3517318614510846e-07, "loss": 0.4412, "step": 420 }, { "epoch": 0.0034399724802201583, "grad_norm": 0.9486242885147141, "learning_rate": 3.431725461963043e-07, "loss": 0.4357, "step": 430 }, { "epoch": 0.003519971840225278, "grad_norm": 0.7792187180449996, "learning_rate": 3.5117190624750024e-07, "loss": 0.4053, "step": 440 }, { "epoch": 0.0035999712002303982, "grad_norm": 0.8013118911013118, "learning_rate": 3.591712662986961e-07, "loss": 0.4526, "step": 450 }, { "epoch": 0.003679970560235518, "grad_norm": 0.9290353407648725, "learning_rate": 3.6717062634989207e-07, "loss": 0.4394, "step": 460 }, { "epoch": 0.003759969920240638, "grad_norm": 0.7440275824040646, "learning_rate": 3.7516998640108793e-07, "loss": 0.4354, "step": 470 }, { "epoch": 0.003839969280245758, "grad_norm": 0.6661735557442742, "learning_rate": 3.8316934645228385e-07, "loss": 0.4132, "step": 480 }, { "epoch": 0.003919968640250878, "grad_norm": 0.80714967397643, "learning_rate": 3.9116870650347976e-07, "loss": 0.4247, "step": 490 }, { "epoch": 0.003999968000255998, "grad_norm": 0.7269691108846945, "learning_rate": 3.9916806655467563e-07, "loss": 0.4308, "step": 500 }, { "epoch": 0.004079967360261118, "grad_norm": 0.8112092339496537, "learning_rate": 4.071674266058716e-07, "loss": 0.4215, "step": 510 }, { "epoch": 0.0041599667202662375, "grad_norm": 0.8465738727092779, "learning_rate": 4.1516678665706746e-07, "loss": 0.4438, "step": 520 }, { "epoch": 0.004239966080271358, "grad_norm": 0.7947114246478345, "learning_rate": 4.2316614670826337e-07, "loss": 0.402, "step": 530 }, { "epoch": 0.004319965440276478, "grad_norm": 0.771445835897062, "learning_rate": 4.311655067594593e-07, "loss": 0.4434, "step": 540 }, { "epoch": 0.004399964800281598, "grad_norm": 0.7143580000434352, "learning_rate": 4.3916486681065515e-07, "loss": 0.4227, "step": 550 }, { "epoch": 0.004479964160286717, "grad_norm": 0.7487773075552359, "learning_rate": 4.471642268618511e-07, "loss": 0.438, "step": 560 }, { "epoch": 0.004559963520291838, "grad_norm": 0.7373810653849879, "learning_rate": 4.55163586913047e-07, "loss": 0.4288, "step": 570 }, { "epoch": 0.004639962880296958, "grad_norm": 0.7104505393064982, "learning_rate": 4.631629469642429e-07, "loss": 0.4462, "step": 580 }, { "epoch": 0.004719962240302078, "grad_norm": 0.6996674322090365, "learning_rate": 4.7116230701543876e-07, "loss": 0.4263, "step": 590 }, { "epoch": 0.004799961600307197, "grad_norm": 0.779468888606517, "learning_rate": 4.791616670666347e-07, "loss": 0.4378, "step": 600 }, { "epoch": 0.0048799609603123175, "grad_norm": 0.6348456379271163, "learning_rate": 4.871610271178306e-07, "loss": 0.4098, "step": 610 }, { "epoch": 0.004959960320317438, "grad_norm": 0.6384341588537313, "learning_rate": 4.951603871690265e-07, "loss": 0.4489, "step": 620 }, { "epoch": 0.005039959680322557, "grad_norm": 0.6779051066808497, "learning_rate": 5.031597472202224e-07, "loss": 0.4522, "step": 630 }, { "epoch": 0.005119959040327677, "grad_norm": 0.7405337420145796, "learning_rate": 5.111591072714183e-07, "loss": 0.4248, "step": 640 }, { "epoch": 0.0051999584003327975, "grad_norm": 0.6915688533513078, "learning_rate": 5.191584673226143e-07, "loss": 0.4129, "step": 650 }, { "epoch": 0.005279957760337918, "grad_norm": 0.6044444399034763, "learning_rate": 5.271578273738101e-07, "loss": 0.4237, "step": 660 }, { "epoch": 0.005359957120343037, "grad_norm": 0.7401624173362278, "learning_rate": 5.351571874250061e-07, "loss": 0.4216, "step": 670 }, { "epoch": 0.005439956480348157, "grad_norm": 0.7496141168534413, "learning_rate": 5.43156547476202e-07, "loss": 0.4268, "step": 680 }, { "epoch": 0.005519955840353277, "grad_norm": 0.730614366570662, "learning_rate": 5.511559075273978e-07, "loss": 0.4589, "step": 690 }, { "epoch": 0.005599955200358397, "grad_norm": 0.7836211351094068, "learning_rate": 5.591552675785937e-07, "loss": 0.4176, "step": 700 }, { "epoch": 0.005679954560363517, "grad_norm": 0.6648189022606215, "learning_rate": 5.671546276297896e-07, "loss": 0.4274, "step": 710 }, { "epoch": 0.005759953920368637, "grad_norm": 0.6607901995290275, "learning_rate": 5.751539876809856e-07, "loss": 0.428, "step": 720 }, { "epoch": 0.005839953280373757, "grad_norm": 0.8422142881250906, "learning_rate": 5.831533477321815e-07, "loss": 0.4206, "step": 730 }, { "epoch": 0.005919952640378877, "grad_norm": 0.8636103630308042, "learning_rate": 5.911527077833774e-07, "loss": 0.4384, "step": 740 }, { "epoch": 0.005999952000383997, "grad_norm": 0.7020283376881055, "learning_rate": 5.991520678345733e-07, "loss": 0.4336, "step": 750 }, { "epoch": 0.006079951360389117, "grad_norm": 0.6637768636661737, "learning_rate": 6.071514278857692e-07, "loss": 0.421, "step": 760 }, { "epoch": 0.006159950720394236, "grad_norm": 0.6720228370970702, "learning_rate": 6.151507879369651e-07, "loss": 0.4425, "step": 770 }, { "epoch": 0.006239950080399357, "grad_norm": 0.7915308024009126, "learning_rate": 6.231501479881609e-07, "loss": 0.4665, "step": 780 }, { "epoch": 0.006319949440404477, "grad_norm": 0.7005483141808596, "learning_rate": 6.311495080393569e-07, "loss": 0.4371, "step": 790 }, { "epoch": 0.006399948800409597, "grad_norm": 0.7085842808188052, "learning_rate": 6.391488680905528e-07, "loss": 0.4481, "step": 800 }, { "epoch": 0.006479948160414716, "grad_norm": 0.797364113205394, "learning_rate": 6.471482281417488e-07, "loss": 0.43, "step": 810 }, { "epoch": 0.0065599475204198366, "grad_norm": 0.7769920944189029, "learning_rate": 6.551475881929446e-07, "loss": 0.4421, "step": 820 }, { "epoch": 0.006639946880424957, "grad_norm": 0.7190098449382926, "learning_rate": 6.631469482441405e-07, "loss": 0.4524, "step": 830 }, { "epoch": 0.006719946240430077, "grad_norm": 0.8999089940382301, "learning_rate": 6.711463082953363e-07, "loss": 0.4327, "step": 840 }, { "epoch": 0.006799945600435196, "grad_norm": 0.8071258672281636, "learning_rate": 6.791456683465323e-07, "loss": 0.3999, "step": 850 }, { "epoch": 0.0068799449604403165, "grad_norm": 0.9718244104757909, "learning_rate": 6.871450283977283e-07, "loss": 0.4109, "step": 860 }, { "epoch": 0.006959944320445437, "grad_norm": 0.7260805899057289, "learning_rate": 6.951443884489241e-07, "loss": 0.4254, "step": 870 }, { "epoch": 0.007039943680450556, "grad_norm": 0.6793848246836304, "learning_rate": 7.031437485001201e-07, "loss": 0.4168, "step": 880 }, { "epoch": 0.007119943040455676, "grad_norm": 0.8220405330990818, "learning_rate": 7.11143108551316e-07, "loss": 0.4368, "step": 890 }, { "epoch": 0.0071999424004607964, "grad_norm": 0.8650021092848164, "learning_rate": 7.191424686025118e-07, "loss": 0.4336, "step": 900 }, { "epoch": 0.007279941760465917, "grad_norm": 0.6667689577948444, "learning_rate": 7.271418286537078e-07, "loss": 0.4325, "step": 910 }, { "epoch": 0.007359941120471036, "grad_norm": 0.6443211014509481, "learning_rate": 7.351411887049036e-07, "loss": 0.443, "step": 920 }, { "epoch": 0.007439940480476156, "grad_norm": 0.6067913066251965, "learning_rate": 7.431405487560996e-07, "loss": 0.4095, "step": 930 }, { "epoch": 0.007519939840481276, "grad_norm": 0.6935512372810877, "learning_rate": 7.511399088072954e-07, "loss": 0.4272, "step": 940 }, { "epoch": 0.007599939200486396, "grad_norm": 0.6599113657719949, "learning_rate": 7.591392688584914e-07, "loss": 0.3942, "step": 950 }, { "epoch": 0.007679938560491516, "grad_norm": 0.611225655896161, "learning_rate": 7.671386289096873e-07, "loss": 0.4451, "step": 960 }, { "epoch": 0.007759937920496636, "grad_norm": 0.757015652343119, "learning_rate": 7.751379889608831e-07, "loss": 0.4059, "step": 970 }, { "epoch": 0.007839937280501755, "grad_norm": 0.8456384517410107, "learning_rate": 7.831373490120791e-07, "loss": 0.4278, "step": 980 }, { "epoch": 0.007919936640506876, "grad_norm": 0.7996016208371923, "learning_rate": 7.911367090632751e-07, "loss": 0.4536, "step": 990 }, { "epoch": 0.007999936000511996, "grad_norm": 0.6228736248515305, "learning_rate": 7.991360691144709e-07, "loss": 0.3969, "step": 1000 }, { "epoch": 0.008079935360517116, "grad_norm": 0.6440105424946402, "learning_rate": 8.071354291656668e-07, "loss": 0.4307, "step": 1010 }, { "epoch": 0.008159934720522236, "grad_norm": 0.6636370756757088, "learning_rate": 8.151347892168628e-07, "loss": 0.4019, "step": 1020 }, { "epoch": 0.008239934080527356, "grad_norm": 0.6534728964626215, "learning_rate": 8.231341492680586e-07, "loss": 0.4357, "step": 1030 }, { "epoch": 0.008319933440532475, "grad_norm": 0.6854238349847609, "learning_rate": 8.311335093192545e-07, "loss": 0.4156, "step": 1040 }, { "epoch": 0.008399932800537595, "grad_norm": 0.7692797686595098, "learning_rate": 8.391328693704504e-07, "loss": 0.4364, "step": 1050 }, { "epoch": 0.008479932160542715, "grad_norm": 0.6892139807862941, "learning_rate": 8.471322294216464e-07, "loss": 0.4595, "step": 1060 }, { "epoch": 0.008559931520547836, "grad_norm": 0.6786670330601587, "learning_rate": 8.551315894728423e-07, "loss": 0.4207, "step": 1070 }, { "epoch": 0.008639930880552956, "grad_norm": 0.6118366771253826, "learning_rate": 8.631309495240381e-07, "loss": 0.4186, "step": 1080 }, { "epoch": 0.008719930240558076, "grad_norm": 0.7157718863255984, "learning_rate": 8.711303095752341e-07, "loss": 0.4412, "step": 1090 }, { "epoch": 0.008799929600563196, "grad_norm": 0.7102096103999658, "learning_rate": 8.791296696264299e-07, "loss": 0.4205, "step": 1100 }, { "epoch": 0.008879928960568315, "grad_norm": 0.6326552697725677, "learning_rate": 8.871290296776258e-07, "loss": 0.4161, "step": 1110 }, { "epoch": 0.008959928320573435, "grad_norm": 0.7455469536196851, "learning_rate": 8.951283897288219e-07, "loss": 0.4259, "step": 1120 }, { "epoch": 0.009039927680578555, "grad_norm": 0.8219540514047248, "learning_rate": 9.031277497800177e-07, "loss": 0.4423, "step": 1130 }, { "epoch": 0.009119927040583675, "grad_norm": 0.6428846698552027, "learning_rate": 9.111271098312136e-07, "loss": 0.432, "step": 1140 }, { "epoch": 0.009199926400588795, "grad_norm": 0.7959015617833209, "learning_rate": 9.191264698824094e-07, "loss": 0.4374, "step": 1150 }, { "epoch": 0.009279925760593916, "grad_norm": 0.722690696817927, "learning_rate": 9.271258299336054e-07, "loss": 0.4177, "step": 1160 }, { "epoch": 0.009359925120599036, "grad_norm": 0.7583146312532495, "learning_rate": 9.351251899848013e-07, "loss": 0.4349, "step": 1170 }, { "epoch": 0.009439924480604156, "grad_norm": 0.8841341873240007, "learning_rate": 9.431245500359971e-07, "loss": 0.4323, "step": 1180 }, { "epoch": 0.009519923840609274, "grad_norm": 0.6844990736142492, "learning_rate": 9.511239100871932e-07, "loss": 0.4195, "step": 1190 }, { "epoch": 0.009599923200614395, "grad_norm": 0.7129122863481299, "learning_rate": 9.59123270138389e-07, "loss": 0.4088, "step": 1200 }, { "epoch": 0.009679922560619515, "grad_norm": 0.6532760834208395, "learning_rate": 9.67122630189585e-07, "loss": 0.3969, "step": 1210 }, { "epoch": 0.009759921920624635, "grad_norm": 0.7563260927255472, "learning_rate": 9.75121990240781e-07, "loss": 0.4075, "step": 1220 }, { "epoch": 0.009839921280629755, "grad_norm": 0.7704505024635662, "learning_rate": 9.831213502919768e-07, "loss": 0.3981, "step": 1230 }, { "epoch": 0.009919920640634875, "grad_norm": 0.7056792723024738, "learning_rate": 9.911207103431725e-07, "loss": 0.4223, "step": 1240 }, { "epoch": 0.009999920000639996, "grad_norm": 0.750390399783068, "learning_rate": 9.991200703943684e-07, "loss": 0.4102, "step": 1250 }, { "epoch": 0.010079919360645114, "grad_norm": 0.7529460057363149, "learning_rate": 1.0071194304455646e-06, "loss": 0.4278, "step": 1260 }, { "epoch": 0.010159918720650234, "grad_norm": 1.041695148553965, "learning_rate": 1.0151187904967603e-06, "loss": 0.4261, "step": 1270 }, { "epoch": 0.010239918080655355, "grad_norm": 0.8098838670880232, "learning_rate": 1.0231181505479562e-06, "loss": 0.4163, "step": 1280 }, { "epoch": 0.010319917440660475, "grad_norm": 0.7139180273227849, "learning_rate": 1.0311175105991523e-06, "loss": 0.4139, "step": 1290 }, { "epoch": 0.010399916800665595, "grad_norm": 0.7293628900497775, "learning_rate": 1.039116870650348e-06, "loss": 0.4139, "step": 1300 }, { "epoch": 0.010479916160670715, "grad_norm": 3.9899125178456276, "learning_rate": 1.047116230701544e-06, "loss": 0.4352, "step": 1310 }, { "epoch": 0.010559915520675835, "grad_norm": 0.741428608083807, "learning_rate": 1.0551155907527398e-06, "loss": 0.4487, "step": 1320 }, { "epoch": 0.010639914880680954, "grad_norm": 0.7645959836257802, "learning_rate": 1.0631149508039358e-06, "loss": 0.4509, "step": 1330 }, { "epoch": 0.010719914240686074, "grad_norm": 0.6630163901596575, "learning_rate": 1.0711143108551317e-06, "loss": 0.422, "step": 1340 }, { "epoch": 0.010799913600691194, "grad_norm": 0.7653242364061862, "learning_rate": 1.0791136709063276e-06, "loss": 0.4047, "step": 1350 }, { "epoch": 0.010879912960696314, "grad_norm": 0.6952138151449674, "learning_rate": 1.0871130309575235e-06, "loss": 0.424, "step": 1360 }, { "epoch": 0.010959912320701435, "grad_norm": 0.8400358998787617, "learning_rate": 1.0951123910087194e-06, "loss": 0.4329, "step": 1370 }, { "epoch": 0.011039911680706555, "grad_norm": 0.7959368745719518, "learning_rate": 1.1031117510599153e-06, "loss": 0.4296, "step": 1380 }, { "epoch": 0.011119911040711675, "grad_norm": 0.7182831330712102, "learning_rate": 1.111111111111111e-06, "loss": 0.4053, "step": 1390 }, { "epoch": 0.011199910400716794, "grad_norm": 0.7631459457678664, "learning_rate": 1.1191104711623072e-06, "loss": 0.4174, "step": 1400 }, { "epoch": 0.011279909760721914, "grad_norm": 0.6904060292491346, "learning_rate": 1.127109831213503e-06, "loss": 0.4262, "step": 1410 }, { "epoch": 0.011359909120727034, "grad_norm": 0.7154968430224656, "learning_rate": 1.1351091912646988e-06, "loss": 0.4116, "step": 1420 }, { "epoch": 0.011439908480732154, "grad_norm": 0.6430506977204669, "learning_rate": 1.143108551315895e-06, "loss": 0.3985, "step": 1430 }, { "epoch": 0.011519907840737274, "grad_norm": 0.6060536937789834, "learning_rate": 1.1511079113670908e-06, "loss": 0.4366, "step": 1440 }, { "epoch": 0.011599907200742395, "grad_norm": 0.7070045694797042, "learning_rate": 1.1591072714182865e-06, "loss": 0.3951, "step": 1450 }, { "epoch": 0.011679906560747515, "grad_norm": 0.6799481270205386, "learning_rate": 1.1671066314694824e-06, "loss": 0.4277, "step": 1460 }, { "epoch": 0.011759905920752633, "grad_norm": 0.7706688131579046, "learning_rate": 1.1751059915206786e-06, "loss": 0.4106, "step": 1470 }, { "epoch": 0.011839905280757753, "grad_norm": 0.7659401655636618, "learning_rate": 1.1831053515718743e-06, "loss": 0.4147, "step": 1480 }, { "epoch": 0.011919904640762874, "grad_norm": 0.7880351649260696, "learning_rate": 1.1911047116230702e-06, "loss": 0.4018, "step": 1490 }, { "epoch": 0.011999904000767994, "grad_norm": 0.6643952514129879, "learning_rate": 1.1991040716742661e-06, "loss": 0.4225, "step": 1500 }, { "epoch": 0.012079903360773114, "grad_norm": 0.7410891062680932, "learning_rate": 1.207103431725462e-06, "loss": 0.4235, "step": 1510 }, { "epoch": 0.012159902720778234, "grad_norm": 0.6702465025094532, "learning_rate": 1.215102791776658e-06, "loss": 0.4485, "step": 1520 }, { "epoch": 0.012239902080783354, "grad_norm": 0.72511805103151, "learning_rate": 1.2231021518278539e-06, "loss": 0.4075, "step": 1530 }, { "epoch": 0.012319901440788473, "grad_norm": 0.7364457263640842, "learning_rate": 1.2311015118790498e-06, "loss": 0.4209, "step": 1540 }, { "epoch": 0.012399900800793593, "grad_norm": 0.6760801533876919, "learning_rate": 1.2391008719302457e-06, "loss": 0.4153, "step": 1550 }, { "epoch": 0.012479900160798713, "grad_norm": 1.329240774622819, "learning_rate": 1.2471002319814416e-06, "loss": 0.4569, "step": 1560 }, { "epoch": 0.012559899520803833, "grad_norm": 0.7197695590369367, "learning_rate": 1.2550995920326375e-06, "loss": 0.4148, "step": 1570 }, { "epoch": 0.012639898880808954, "grad_norm": 0.7197146321929337, "learning_rate": 1.2630989520838332e-06, "loss": 0.4309, "step": 1580 }, { "epoch": 0.012719898240814074, "grad_norm": 0.6941046794760515, "learning_rate": 1.2710983121350293e-06, "loss": 0.4266, "step": 1590 }, { "epoch": 0.012799897600819194, "grad_norm": 0.6685733935987979, "learning_rate": 1.2790976721862253e-06, "loss": 0.4148, "step": 1600 }, { "epoch": 0.012879896960824314, "grad_norm": 0.829963069106683, "learning_rate": 1.287097032237421e-06, "loss": 0.438, "step": 1610 }, { "epoch": 0.012959896320829433, "grad_norm": 0.8772001383565942, "learning_rate": 1.295096392288617e-06, "loss": 0.4167, "step": 1620 }, { "epoch": 0.013039895680834553, "grad_norm": 0.7693529714105686, "learning_rate": 1.303095752339813e-06, "loss": 0.4465, "step": 1630 }, { "epoch": 0.013119895040839673, "grad_norm": 0.6969819161826472, "learning_rate": 1.3110951123910087e-06, "loss": 0.4374, "step": 1640 }, { "epoch": 0.013199894400844793, "grad_norm": 0.7245418409893126, "learning_rate": 1.3190944724422048e-06, "loss": 0.4375, "step": 1650 }, { "epoch": 0.013279893760849914, "grad_norm": 0.6704658854506884, "learning_rate": 1.3270938324934008e-06, "loss": 0.4144, "step": 1660 }, { "epoch": 0.013359893120855034, "grad_norm": 0.7596780468310279, "learning_rate": 1.3350931925445965e-06, "loss": 0.4142, "step": 1670 }, { "epoch": 0.013439892480860154, "grad_norm": 0.799033496950235, "learning_rate": 1.3430925525957924e-06, "loss": 0.424, "step": 1680 }, { "epoch": 0.013519891840865272, "grad_norm": 0.8360146268303155, "learning_rate": 1.3510919126469885e-06, "loss": 0.4104, "step": 1690 }, { "epoch": 0.013599891200870393, "grad_norm": 0.7830381641790578, "learning_rate": 1.3590912726981842e-06, "loss": 0.4067, "step": 1700 }, { "epoch": 0.013679890560875513, "grad_norm": 0.7376045815830132, "learning_rate": 1.3670906327493801e-06, "loss": 0.4232, "step": 1710 }, { "epoch": 0.013759889920880633, "grad_norm": 0.7780257393234633, "learning_rate": 1.3750899928005762e-06, "loss": 0.4001, "step": 1720 }, { "epoch": 0.013839889280885753, "grad_norm": 0.7400853428241427, "learning_rate": 1.383089352851772e-06, "loss": 0.4283, "step": 1730 }, { "epoch": 0.013919888640890873, "grad_norm": 0.6301335389858812, "learning_rate": 1.3910887129029679e-06, "loss": 0.4315, "step": 1740 }, { "epoch": 0.013999888000895994, "grad_norm": 0.6873058205506691, "learning_rate": 1.3990880729541636e-06, "loss": 0.4294, "step": 1750 }, { "epoch": 0.014079887360901112, "grad_norm": 0.6118849450571252, "learning_rate": 1.4070874330053597e-06, "loss": 0.4116, "step": 1760 }, { "epoch": 0.014159886720906232, "grad_norm": 0.8643963519063279, "learning_rate": 1.4150867930565556e-06, "loss": 0.4189, "step": 1770 }, { "epoch": 0.014239886080911352, "grad_norm": 0.6986286668684888, "learning_rate": 1.4230861531077513e-06, "loss": 0.4393, "step": 1780 }, { "epoch": 0.014319885440916473, "grad_norm": 0.7361554237470329, "learning_rate": 1.4310855131589474e-06, "loss": 0.4048, "step": 1790 }, { "epoch": 0.014399884800921593, "grad_norm": 0.7526595512221772, "learning_rate": 1.4390848732101434e-06, "loss": 0.4434, "step": 1800 }, { "epoch": 0.014479884160926713, "grad_norm": 0.6793121490590498, "learning_rate": 1.447084233261339e-06, "loss": 0.4313, "step": 1810 }, { "epoch": 0.014559883520931833, "grad_norm": 0.7503593630371851, "learning_rate": 1.4550835933125352e-06, "loss": 0.4363, "step": 1820 }, { "epoch": 0.014639882880936952, "grad_norm": 0.6631126747097913, "learning_rate": 1.463082953363731e-06, "loss": 0.4172, "step": 1830 }, { "epoch": 0.014719882240942072, "grad_norm": 0.6201041671912539, "learning_rate": 1.4710823134149268e-06, "loss": 0.426, "step": 1840 }, { "epoch": 0.014799881600947192, "grad_norm": 0.7986575840999489, "learning_rate": 1.479081673466123e-06, "loss": 0.4418, "step": 1850 }, { "epoch": 0.014879880960952312, "grad_norm": 0.7485644191714067, "learning_rate": 1.4870810335173188e-06, "loss": 0.4315, "step": 1860 }, { "epoch": 0.014959880320957433, "grad_norm": 0.767180980787683, "learning_rate": 1.4950803935685146e-06, "loss": 0.44, "step": 1870 }, { "epoch": 0.015039879680962553, "grad_norm": 0.7085244935047633, "learning_rate": 1.5030797536197107e-06, "loss": 0.4197, "step": 1880 }, { "epoch": 0.015119879040967673, "grad_norm": 0.9096558305501717, "learning_rate": 1.5110791136709064e-06, "loss": 0.4276, "step": 1890 }, { "epoch": 0.015199878400972791, "grad_norm": 0.7782449144127891, "learning_rate": 1.5190784737221023e-06, "loss": 0.4433, "step": 1900 }, { "epoch": 0.015279877760977912, "grad_norm": 0.6784288997728832, "learning_rate": 1.5270778337732984e-06, "loss": 0.408, "step": 1910 }, { "epoch": 0.015359877120983032, "grad_norm": 0.7394160138728095, "learning_rate": 1.5350771938244941e-06, "loss": 0.427, "step": 1920 }, { "epoch": 0.015439876480988152, "grad_norm": 0.7020184796096351, "learning_rate": 1.54307655387569e-06, "loss": 0.4585, "step": 1930 }, { "epoch": 0.015519875840993272, "grad_norm": 0.6570216742745165, "learning_rate": 1.551075913926886e-06, "loss": 0.4203, "step": 1940 }, { "epoch": 0.015599875200998392, "grad_norm": 0.6492915506893296, "learning_rate": 1.5590752739780819e-06, "loss": 0.4225, "step": 1950 }, { "epoch": 0.01567987456100351, "grad_norm": 0.6404207960330748, "learning_rate": 1.5670746340292778e-06, "loss": 0.4155, "step": 1960 }, { "epoch": 0.01575987392100863, "grad_norm": 0.7069937527425317, "learning_rate": 1.5750739940804737e-06, "loss": 0.4017, "step": 1970 }, { "epoch": 0.01583987328101375, "grad_norm": 0.7497814910327999, "learning_rate": 1.5830733541316694e-06, "loss": 0.4336, "step": 1980 }, { "epoch": 0.01591987264101887, "grad_norm": 0.997502334828043, "learning_rate": 1.5910727141828655e-06, "loss": 0.4252, "step": 1990 }, { "epoch": 0.01599987200102399, "grad_norm": 0.7051737520195863, "learning_rate": 1.5990720742340615e-06, "loss": 0.4136, "step": 2000 }, { "epoch": 0.016079871361029112, "grad_norm": 0.7358677233078202, "learning_rate": 1.6070714342852572e-06, "loss": 0.4041, "step": 2010 }, { "epoch": 0.016159870721034232, "grad_norm": 0.7232160368639224, "learning_rate": 1.6150707943364533e-06, "loss": 0.3925, "step": 2020 }, { "epoch": 0.016239870081039352, "grad_norm": 0.7492396218793221, "learning_rate": 1.6230701543876492e-06, "loss": 0.4453, "step": 2030 }, { "epoch": 0.016319869441044473, "grad_norm": 0.7879468186487532, "learning_rate": 1.631069514438845e-06, "loss": 0.4143, "step": 2040 }, { "epoch": 0.016399868801049593, "grad_norm": 0.7027247029038095, "learning_rate": 1.639068874490041e-06, "loss": 0.4487, "step": 2050 }, { "epoch": 0.016479868161054713, "grad_norm": 0.6646702988688921, "learning_rate": 1.6470682345412367e-06, "loss": 0.4189, "step": 2060 }, { "epoch": 0.016559867521059833, "grad_norm": 0.7214592394412016, "learning_rate": 1.6550675945924326e-06, "loss": 0.4298, "step": 2070 }, { "epoch": 0.01663986688106495, "grad_norm": 0.7009780752105863, "learning_rate": 1.6630669546436288e-06, "loss": 0.43, "step": 2080 }, { "epoch": 0.01671986624107007, "grad_norm": 0.6802031501540443, "learning_rate": 1.6710663146948245e-06, "loss": 0.4064, "step": 2090 }, { "epoch": 0.01679986560107519, "grad_norm": 0.7764568933093239, "learning_rate": 1.6790656747460204e-06, "loss": 0.4192, "step": 2100 }, { "epoch": 0.01687986496108031, "grad_norm": 0.7257831241745193, "learning_rate": 1.6870650347972165e-06, "loss": 0.4137, "step": 2110 }, { "epoch": 0.01695986432108543, "grad_norm": 0.7231093177815116, "learning_rate": 1.6950643948484122e-06, "loss": 0.4321, "step": 2120 }, { "epoch": 0.01703986368109055, "grad_norm": 0.7604848904942334, "learning_rate": 1.7030637548996081e-06, "loss": 0.4173, "step": 2130 }, { "epoch": 0.01711986304109567, "grad_norm": 0.8671202797505491, "learning_rate": 1.7110631149508043e-06, "loss": 0.4172, "step": 2140 }, { "epoch": 0.01719986240110079, "grad_norm": 0.8818905733108134, "learning_rate": 1.719062475002e-06, "loss": 0.4096, "step": 2150 }, { "epoch": 0.01727986176110591, "grad_norm": 0.7073377083366502, "learning_rate": 1.7270618350531959e-06, "loss": 0.4327, "step": 2160 }, { "epoch": 0.01735986112111103, "grad_norm": 0.9637856611728648, "learning_rate": 1.735061195104392e-06, "loss": 0.457, "step": 2170 }, { "epoch": 0.017439860481116152, "grad_norm": 0.7763253238443666, "learning_rate": 1.7430605551555877e-06, "loss": 0.4444, "step": 2180 }, { "epoch": 0.017519859841121272, "grad_norm": 0.7532387544984117, "learning_rate": 1.7510599152067836e-06, "loss": 0.4193, "step": 2190 }, { "epoch": 0.017599859201126392, "grad_norm": 0.7001703245870058, "learning_rate": 1.7590592752579793e-06, "loss": 0.4334, "step": 2200 }, { "epoch": 0.017679858561131512, "grad_norm": 0.653320909525973, "learning_rate": 1.7670586353091755e-06, "loss": 0.4137, "step": 2210 }, { "epoch": 0.01775985792113663, "grad_norm": 0.6431905950303065, "learning_rate": 1.7750579953603714e-06, "loss": 0.4056, "step": 2220 }, { "epoch": 0.01783985728114175, "grad_norm": 0.6355510699319944, "learning_rate": 1.783057355411567e-06, "loss": 0.3967, "step": 2230 }, { "epoch": 0.01791985664114687, "grad_norm": 1.2693108108319435, "learning_rate": 1.7910567154627632e-06, "loss": 0.3958, "step": 2240 }, { "epoch": 0.01799985600115199, "grad_norm": 0.7084484301635297, "learning_rate": 1.7990560755139591e-06, "loss": 0.4341, "step": 2250 }, { "epoch": 0.01807985536115711, "grad_norm": 0.7305495029250423, "learning_rate": 1.8070554355651548e-06, "loss": 0.4343, "step": 2260 }, { "epoch": 0.01815985472116223, "grad_norm": 0.7855576477357182, "learning_rate": 1.8150547956163507e-06, "loss": 0.4158, "step": 2270 }, { "epoch": 0.01823985408116735, "grad_norm": 0.8272843321029247, "learning_rate": 1.8230541556675469e-06, "loss": 0.4355, "step": 2280 }, { "epoch": 0.01831985344117247, "grad_norm": 0.753702291149743, "learning_rate": 1.8310535157187426e-06, "loss": 0.4025, "step": 2290 }, { "epoch": 0.01839985280117759, "grad_norm": 0.7598104262546688, "learning_rate": 1.8390528757699385e-06, "loss": 0.413, "step": 2300 }, { "epoch": 0.01847985216118271, "grad_norm": 0.7611395927729616, "learning_rate": 1.8470522358211346e-06, "loss": 0.4373, "step": 2310 }, { "epoch": 0.01855985152118783, "grad_norm": 0.760722694363519, "learning_rate": 1.8550515958723303e-06, "loss": 0.4179, "step": 2320 }, { "epoch": 0.01863985088119295, "grad_norm": 0.5957047498546761, "learning_rate": 1.8630509559235262e-06, "loss": 0.4231, "step": 2330 }, { "epoch": 0.01871985024119807, "grad_norm": 0.7186940138009597, "learning_rate": 1.8710503159747224e-06, "loss": 0.4171, "step": 2340 }, { "epoch": 0.018799849601203192, "grad_norm": 0.7760364408450707, "learning_rate": 1.879049676025918e-06, "loss": 0.4003, "step": 2350 }, { "epoch": 0.018879848961208312, "grad_norm": 0.7481302705420546, "learning_rate": 1.887049036077114e-06, "loss": 0.4182, "step": 2360 }, { "epoch": 0.01895984832121343, "grad_norm": 0.7358536669321117, "learning_rate": 1.8950483961283097e-06, "loss": 0.4272, "step": 2370 }, { "epoch": 0.01903984768121855, "grad_norm": 0.7602582219791679, "learning_rate": 1.9030477561795058e-06, "loss": 0.4157, "step": 2380 }, { "epoch": 0.01911984704122367, "grad_norm": 0.7426387012574098, "learning_rate": 1.911047116230702e-06, "loss": 0.4136, "step": 2390 }, { "epoch": 0.01919984640122879, "grad_norm": 0.6954613448087369, "learning_rate": 1.9190464762818974e-06, "loss": 0.4081, "step": 2400 }, { "epoch": 0.01927984576123391, "grad_norm": 0.7604294663875293, "learning_rate": 1.9270458363330933e-06, "loss": 0.4116, "step": 2410 }, { "epoch": 0.01935984512123903, "grad_norm": 0.7196806209080392, "learning_rate": 1.9350451963842897e-06, "loss": 0.4253, "step": 2420 }, { "epoch": 0.01943984448124415, "grad_norm": 0.7619473447002568, "learning_rate": 1.943044556435485e-06, "loss": 0.4048, "step": 2430 }, { "epoch": 0.01951984384124927, "grad_norm": 0.7052222380767645, "learning_rate": 1.951043916486681e-06, "loss": 0.4092, "step": 2440 }, { "epoch": 0.01959984320125439, "grad_norm": 0.6311702076235657, "learning_rate": 1.9590432765378774e-06, "loss": 0.409, "step": 2450 }, { "epoch": 0.01967984256125951, "grad_norm": 0.7989650814927569, "learning_rate": 1.967042636589073e-06, "loss": 0.4031, "step": 2460 }, { "epoch": 0.01975984192126463, "grad_norm": 0.7243725001067509, "learning_rate": 1.975041996640269e-06, "loss": 0.4139, "step": 2470 }, { "epoch": 0.01983984128126975, "grad_norm": 0.7689695050248523, "learning_rate": 1.983041356691465e-06, "loss": 0.419, "step": 2480 }, { "epoch": 0.01991984064127487, "grad_norm": 0.6784111247678405, "learning_rate": 1.9910407167426607e-06, "loss": 0.4255, "step": 2490 }, { "epoch": 0.01999984000127999, "grad_norm": 0.696241896489532, "learning_rate": 1.9990400767938566e-06, "loss": 0.4387, "step": 2500 }, { "epoch": 0.020079839361285108, "grad_norm": 0.7337662500397681, "learning_rate": 2.0070394368450525e-06, "loss": 0.4107, "step": 2510 }, { "epoch": 0.02015983872129023, "grad_norm": 0.7522121421950386, "learning_rate": 2.0150387968962484e-06, "loss": 0.4341, "step": 2520 }, { "epoch": 0.02023983808129535, "grad_norm": 0.7635117961228429, "learning_rate": 2.0230381569474443e-06, "loss": 0.444, "step": 2530 }, { "epoch": 0.02031983744130047, "grad_norm": 0.7258340461733414, "learning_rate": 2.0310375169986402e-06, "loss": 0.4532, "step": 2540 }, { "epoch": 0.02039983680130559, "grad_norm": 0.8227531741667117, "learning_rate": 2.039036877049836e-06, "loss": 0.4211, "step": 2550 }, { "epoch": 0.02047983616131071, "grad_norm": 0.8046278992558573, "learning_rate": 2.047036237101032e-06, "loss": 0.4304, "step": 2560 }, { "epoch": 0.02055983552131583, "grad_norm": 0.8583586766230429, "learning_rate": 2.055035597152228e-06, "loss": 0.389, "step": 2570 }, { "epoch": 0.02063983488132095, "grad_norm": 0.7059091393994599, "learning_rate": 2.063034957203424e-06, "loss": 0.4143, "step": 2580 }, { "epoch": 0.02071983424132607, "grad_norm": 0.7169812723780176, "learning_rate": 2.07103431725462e-06, "loss": 0.4231, "step": 2590 }, { "epoch": 0.02079983360133119, "grad_norm": 0.744498117665369, "learning_rate": 2.0790336773058157e-06, "loss": 0.3931, "step": 2600 }, { "epoch": 0.02087983296133631, "grad_norm": 0.8714679551154206, "learning_rate": 2.0870330373570117e-06, "loss": 0.4227, "step": 2610 }, { "epoch": 0.02095983232134143, "grad_norm": 0.6310793333444085, "learning_rate": 2.0950323974082076e-06, "loss": 0.4092, "step": 2620 }, { "epoch": 0.02103983168134655, "grad_norm": 0.7793866057944923, "learning_rate": 2.1030317574594035e-06, "loss": 0.4139, "step": 2630 }, { "epoch": 0.02111983104135167, "grad_norm": 0.8117526334063988, "learning_rate": 2.1110311175105994e-06, "loss": 0.4154, "step": 2640 }, { "epoch": 0.021199830401356787, "grad_norm": 0.7466901509429923, "learning_rate": 2.1190304775617953e-06, "loss": 0.4139, "step": 2650 }, { "epoch": 0.021279829761361908, "grad_norm": 0.9396797684739518, "learning_rate": 2.1270298376129912e-06, "loss": 0.4166, "step": 2660 }, { "epoch": 0.021359829121367028, "grad_norm": 0.7212101339600039, "learning_rate": 2.135029197664187e-06, "loss": 0.4299, "step": 2670 }, { "epoch": 0.021439828481372148, "grad_norm": 0.9051397916977733, "learning_rate": 2.1430285577153826e-06, "loss": 0.4203, "step": 2680 }, { "epoch": 0.021519827841377268, "grad_norm": 0.6759148258397492, "learning_rate": 2.151027917766579e-06, "loss": 0.4065, "step": 2690 }, { "epoch": 0.02159982720138239, "grad_norm": 0.7191776566727898, "learning_rate": 2.159027277817775e-06, "loss": 0.3957, "step": 2700 }, { "epoch": 0.02167982656138751, "grad_norm": 0.7752464373073066, "learning_rate": 2.1670266378689704e-06, "loss": 0.4198, "step": 2710 }, { "epoch": 0.02175982592139263, "grad_norm": 0.7096879372714776, "learning_rate": 2.1750259979201667e-06, "loss": 0.4337, "step": 2720 }, { "epoch": 0.02183982528139775, "grad_norm": 0.7577341998111574, "learning_rate": 2.1830253579713626e-06, "loss": 0.4484, "step": 2730 }, { "epoch": 0.02191982464140287, "grad_norm": 0.8303467101337862, "learning_rate": 2.191024718022558e-06, "loss": 0.436, "step": 2740 }, { "epoch": 0.02199982400140799, "grad_norm": 0.740434972574641, "learning_rate": 2.1990240780737545e-06, "loss": 0.4026, "step": 2750 }, { "epoch": 0.02207982336141311, "grad_norm": 0.7385703007931491, "learning_rate": 2.2070234381249504e-06, "loss": 0.3994, "step": 2760 }, { "epoch": 0.02215982272141823, "grad_norm": 0.804160370262748, "learning_rate": 2.215022798176146e-06, "loss": 0.4247, "step": 2770 }, { "epoch": 0.02223982208142335, "grad_norm": 0.9699580286471233, "learning_rate": 2.2230221582273422e-06, "loss": 0.4158, "step": 2780 }, { "epoch": 0.02231982144142847, "grad_norm": 0.854317456477185, "learning_rate": 2.231021518278538e-06, "loss": 0.4224, "step": 2790 }, { "epoch": 0.022399820801433587, "grad_norm": 0.7261443031804087, "learning_rate": 2.2390208783297336e-06, "loss": 0.4153, "step": 2800 }, { "epoch": 0.022479820161438707, "grad_norm": 0.7221332661288903, "learning_rate": 2.24702023838093e-06, "loss": 0.4287, "step": 2810 }, { "epoch": 0.022559819521443827, "grad_norm": 0.8879662037891538, "learning_rate": 2.2550195984321255e-06, "loss": 0.4221, "step": 2820 }, { "epoch": 0.022639818881448948, "grad_norm": 0.7885980884870208, "learning_rate": 2.2630189584833214e-06, "loss": 0.4394, "step": 2830 }, { "epoch": 0.022719818241454068, "grad_norm": 4.841458093614334, "learning_rate": 2.2710183185345173e-06, "loss": 0.4306, "step": 2840 }, { "epoch": 0.022799817601459188, "grad_norm": 0.762762416297712, "learning_rate": 2.279017678585713e-06, "loss": 0.4248, "step": 2850 }, { "epoch": 0.022879816961464308, "grad_norm": 0.8448845187481899, "learning_rate": 2.287017038636909e-06, "loss": 0.4016, "step": 2860 }, { "epoch": 0.02295981632146943, "grad_norm": 0.7119692504285716, "learning_rate": 2.295016398688105e-06, "loss": 0.4187, "step": 2870 }, { "epoch": 0.02303981568147455, "grad_norm": 0.7322860524669678, "learning_rate": 2.303015758739301e-06, "loss": 0.3992, "step": 2880 }, { "epoch": 0.02311981504147967, "grad_norm": 0.7809553100217358, "learning_rate": 2.311015118790497e-06, "loss": 0.4323, "step": 2890 }, { "epoch": 0.02319981440148479, "grad_norm": 0.7497042609672665, "learning_rate": 2.3190144788416928e-06, "loss": 0.4216, "step": 2900 }, { "epoch": 0.02327981376148991, "grad_norm": 0.77908486703842, "learning_rate": 2.3270138388928887e-06, "loss": 0.4139, "step": 2910 }, { "epoch": 0.02335981312149503, "grad_norm": 0.8259941131428074, "learning_rate": 2.3350131989440846e-06, "loss": 0.4298, "step": 2920 }, { "epoch": 0.02343981248150015, "grad_norm": 0.7315958297934407, "learning_rate": 2.3430125589952805e-06, "loss": 0.4148, "step": 2930 }, { "epoch": 0.023519811841505266, "grad_norm": 0.7127501348213877, "learning_rate": 2.3510119190464764e-06, "loss": 0.4226, "step": 2940 }, { "epoch": 0.023599811201510387, "grad_norm": 0.7004822200975431, "learning_rate": 2.3590112790976724e-06, "loss": 0.4037, "step": 2950 }, { "epoch": 0.023679810561515507, "grad_norm": 0.7640873416448367, "learning_rate": 2.3670106391488683e-06, "loss": 0.4196, "step": 2960 }, { "epoch": 0.023759809921520627, "grad_norm": 0.7562584220469137, "learning_rate": 2.375009999200064e-06, "loss": 0.4226, "step": 2970 }, { "epoch": 0.023839809281525747, "grad_norm": 0.700408619087647, "learning_rate": 2.38300935925126e-06, "loss": 0.4194, "step": 2980 }, { "epoch": 0.023919808641530867, "grad_norm": 0.6926553567290514, "learning_rate": 2.391008719302456e-06, "loss": 0.4377, "step": 2990 }, { "epoch": 0.023999808001535988, "grad_norm": 0.8080569171783707, "learning_rate": 2.399008079353652e-06, "loss": 0.425, "step": 3000 } ], "logging_steps": 10, "max_steps": 125001, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 173052491300864.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }