Qwen3-4B-openmath-1m-ckpt-3000 / trainer_state.json
Seongyun's picture
Upload folder using huggingface_hub
279b68c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.023999808001535988,
"eval_steps": 500,
"global_step": 3000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 7.999936000511996e-05,
"grad_norm": 4.414881453815827,
"learning_rate": 7.199424046076314e-09,
"loss": 0.5441,
"step": 10
},
{
"epoch": 0.00015999872001023991,
"grad_norm": 4.004957247533114,
"learning_rate": 1.519878409727222e-08,
"loss": 0.5353,
"step": 20
},
{
"epoch": 0.00023999808001535987,
"grad_norm": 4.889790729834478,
"learning_rate": 2.3198144148468124e-08,
"loss": 0.537,
"step": 30
},
{
"epoch": 0.00031999744002047983,
"grad_norm": 3.7138301639437508,
"learning_rate": 3.119750419966403e-08,
"loss": 0.5488,
"step": 40
},
{
"epoch": 0.0003999968000255998,
"grad_norm": 4.208028943214464,
"learning_rate": 3.919686425085993e-08,
"loss": 0.5557,
"step": 50
},
{
"epoch": 0.00047999616003071974,
"grad_norm": 4.326896533998126,
"learning_rate": 4.719622430205584e-08,
"loss": 0.5061,
"step": 60
},
{
"epoch": 0.0005599955200358397,
"grad_norm": 7.1723877700496415,
"learning_rate": 5.519558435325175e-08,
"loss": 0.5193,
"step": 70
},
{
"epoch": 0.0006399948800409597,
"grad_norm": 3.892238785520058,
"learning_rate": 6.319494440444764e-08,
"loss": 0.5311,
"step": 80
},
{
"epoch": 0.0007199942400460796,
"grad_norm": 3.4517472540795513,
"learning_rate": 7.119430445564356e-08,
"loss": 0.5232,
"step": 90
},
{
"epoch": 0.0007999936000511996,
"grad_norm": 4.094057558667446,
"learning_rate": 7.919366450683946e-08,
"loss": 0.5225,
"step": 100
},
{
"epoch": 0.0008799929600563195,
"grad_norm": 3.4395682865399144,
"learning_rate": 8.719302455803536e-08,
"loss": 0.5371,
"step": 110
},
{
"epoch": 0.0009599923200614395,
"grad_norm": 3.7029712275218682,
"learning_rate": 9.519238460923127e-08,
"loss": 0.5305,
"step": 120
},
{
"epoch": 0.0010399916800665594,
"grad_norm": 3.493027269180126,
"learning_rate": 1.0319174466042718e-07,
"loss": 0.5167,
"step": 130
},
{
"epoch": 0.0011199910400716794,
"grad_norm": 3.480790725115363,
"learning_rate": 1.1119110471162308e-07,
"loss": 0.5543,
"step": 140
},
{
"epoch": 0.0011999904000767993,
"grad_norm": 3.0350778792458564,
"learning_rate": 1.1919046476281897e-07,
"loss": 0.5188,
"step": 150
},
{
"epoch": 0.0012799897600819193,
"grad_norm": 2.7703962024693216,
"learning_rate": 1.271898248140149e-07,
"loss": 0.5176,
"step": 160
},
{
"epoch": 0.0013599891200870393,
"grad_norm": 2.7312812150288277,
"learning_rate": 1.351891848652108e-07,
"loss": 0.5206,
"step": 170
},
{
"epoch": 0.0014399884800921593,
"grad_norm": 2.6772442085163837,
"learning_rate": 1.431885449164067e-07,
"loss": 0.5187,
"step": 180
},
{
"epoch": 0.0015199878400972793,
"grad_norm": 2.6017371604025503,
"learning_rate": 1.5118790496760262e-07,
"loss": 0.4858,
"step": 190
},
{
"epoch": 0.0015999872001023993,
"grad_norm": 2.361921882047865,
"learning_rate": 1.5918726501879854e-07,
"loss": 0.4965,
"step": 200
},
{
"epoch": 0.0016799865601075192,
"grad_norm": 1.9949603945902301,
"learning_rate": 1.6718662506999443e-07,
"loss": 0.49,
"step": 210
},
{
"epoch": 0.001759985920112639,
"grad_norm": 1.689017850332906,
"learning_rate": 1.7518598512119031e-07,
"loss": 0.4474,
"step": 220
},
{
"epoch": 0.001839985280117759,
"grad_norm": 1.7526108484730667,
"learning_rate": 1.8318534517238623e-07,
"loss": 0.4758,
"step": 230
},
{
"epoch": 0.001919984640122879,
"grad_norm": 1.6619250196852287,
"learning_rate": 1.9118470522358212e-07,
"loss": 0.4551,
"step": 240
},
{
"epoch": 0.001999984000127999,
"grad_norm": 1.4244411983222107,
"learning_rate": 1.9918406527477803e-07,
"loss": 0.4666,
"step": 250
},
{
"epoch": 0.0020799833601331187,
"grad_norm": 1.4091371713945773,
"learning_rate": 2.0718342532597392e-07,
"loss": 0.4744,
"step": 260
},
{
"epoch": 0.002159982720138239,
"grad_norm": 1.4037698231377602,
"learning_rate": 2.1518278537716986e-07,
"loss": 0.4577,
"step": 270
},
{
"epoch": 0.0022399820801433587,
"grad_norm": 1.1233973407058262,
"learning_rate": 2.2318214542836575e-07,
"loss": 0.4442,
"step": 280
},
{
"epoch": 0.002319981440148479,
"grad_norm": 1.0934034827825632,
"learning_rate": 2.3118150547956164e-07,
"loss": 0.4746,
"step": 290
},
{
"epoch": 0.0023999808001535987,
"grad_norm": 0.9547032856231036,
"learning_rate": 2.3918086553075753e-07,
"loss": 0.4631,
"step": 300
},
{
"epoch": 0.002479980160158719,
"grad_norm": 1.011447249716743,
"learning_rate": 2.4718022558195345e-07,
"loss": 0.4359,
"step": 310
},
{
"epoch": 0.0025599795201638386,
"grad_norm": 0.7506687829382205,
"learning_rate": 2.5517958563314936e-07,
"loss": 0.4554,
"step": 320
},
{
"epoch": 0.002639978880168959,
"grad_norm": 0.8268141903301145,
"learning_rate": 2.631789456843453e-07,
"loss": 0.4444,
"step": 330
},
{
"epoch": 0.0027199782401740786,
"grad_norm": 0.9120933866995852,
"learning_rate": 2.711783057355412e-07,
"loss": 0.4662,
"step": 340
},
{
"epoch": 0.0027999776001791984,
"grad_norm": 0.9481570868182263,
"learning_rate": 2.791776657867371e-07,
"loss": 0.4677,
"step": 350
},
{
"epoch": 0.0028799769601843186,
"grad_norm": 0.8010445040150771,
"learning_rate": 2.8717702583793297e-07,
"loss": 0.4196,
"step": 360
},
{
"epoch": 0.0029599763201894383,
"grad_norm": 0.8670839473253033,
"learning_rate": 2.951763858891289e-07,
"loss": 0.4492,
"step": 370
},
{
"epoch": 0.0030399756801945585,
"grad_norm": 0.8536232676044045,
"learning_rate": 3.031757459403248e-07,
"loss": 0.4375,
"step": 380
},
{
"epoch": 0.0031199750401996783,
"grad_norm": 0.8135677817452803,
"learning_rate": 3.111751059915207e-07,
"loss": 0.438,
"step": 390
},
{
"epoch": 0.0031999744002047985,
"grad_norm": 0.9667962048008838,
"learning_rate": 3.191744660427166e-07,
"loss": 0.4348,
"step": 400
},
{
"epoch": 0.0032799737602099183,
"grad_norm": 0.885059264680228,
"learning_rate": 3.271738260939125e-07,
"loss": 0.4295,
"step": 410
},
{
"epoch": 0.0033599731202150385,
"grad_norm": 0.7112007856245484,
"learning_rate": 3.3517318614510846e-07,
"loss": 0.4412,
"step": 420
},
{
"epoch": 0.0034399724802201583,
"grad_norm": 0.9486242885147141,
"learning_rate": 3.431725461963043e-07,
"loss": 0.4357,
"step": 430
},
{
"epoch": 0.003519971840225278,
"grad_norm": 0.7792187180449996,
"learning_rate": 3.5117190624750024e-07,
"loss": 0.4053,
"step": 440
},
{
"epoch": 0.0035999712002303982,
"grad_norm": 0.8013118911013118,
"learning_rate": 3.591712662986961e-07,
"loss": 0.4526,
"step": 450
},
{
"epoch": 0.003679970560235518,
"grad_norm": 0.9290353407648725,
"learning_rate": 3.6717062634989207e-07,
"loss": 0.4394,
"step": 460
},
{
"epoch": 0.003759969920240638,
"grad_norm": 0.7440275824040646,
"learning_rate": 3.7516998640108793e-07,
"loss": 0.4354,
"step": 470
},
{
"epoch": 0.003839969280245758,
"grad_norm": 0.6661735557442742,
"learning_rate": 3.8316934645228385e-07,
"loss": 0.4132,
"step": 480
},
{
"epoch": 0.003919968640250878,
"grad_norm": 0.80714967397643,
"learning_rate": 3.9116870650347976e-07,
"loss": 0.4247,
"step": 490
},
{
"epoch": 0.003999968000255998,
"grad_norm": 0.7269691108846945,
"learning_rate": 3.9916806655467563e-07,
"loss": 0.4308,
"step": 500
},
{
"epoch": 0.004079967360261118,
"grad_norm": 0.8112092339496537,
"learning_rate": 4.071674266058716e-07,
"loss": 0.4215,
"step": 510
},
{
"epoch": 0.0041599667202662375,
"grad_norm": 0.8465738727092779,
"learning_rate": 4.1516678665706746e-07,
"loss": 0.4438,
"step": 520
},
{
"epoch": 0.004239966080271358,
"grad_norm": 0.7947114246478345,
"learning_rate": 4.2316614670826337e-07,
"loss": 0.402,
"step": 530
},
{
"epoch": 0.004319965440276478,
"grad_norm": 0.771445835897062,
"learning_rate": 4.311655067594593e-07,
"loss": 0.4434,
"step": 540
},
{
"epoch": 0.004399964800281598,
"grad_norm": 0.7143580000434352,
"learning_rate": 4.3916486681065515e-07,
"loss": 0.4227,
"step": 550
},
{
"epoch": 0.004479964160286717,
"grad_norm": 0.7487773075552359,
"learning_rate": 4.471642268618511e-07,
"loss": 0.438,
"step": 560
},
{
"epoch": 0.004559963520291838,
"grad_norm": 0.7373810653849879,
"learning_rate": 4.55163586913047e-07,
"loss": 0.4288,
"step": 570
},
{
"epoch": 0.004639962880296958,
"grad_norm": 0.7104505393064982,
"learning_rate": 4.631629469642429e-07,
"loss": 0.4462,
"step": 580
},
{
"epoch": 0.004719962240302078,
"grad_norm": 0.6996674322090365,
"learning_rate": 4.7116230701543876e-07,
"loss": 0.4263,
"step": 590
},
{
"epoch": 0.004799961600307197,
"grad_norm": 0.779468888606517,
"learning_rate": 4.791616670666347e-07,
"loss": 0.4378,
"step": 600
},
{
"epoch": 0.0048799609603123175,
"grad_norm": 0.6348456379271163,
"learning_rate": 4.871610271178306e-07,
"loss": 0.4098,
"step": 610
},
{
"epoch": 0.004959960320317438,
"grad_norm": 0.6384341588537313,
"learning_rate": 4.951603871690265e-07,
"loss": 0.4489,
"step": 620
},
{
"epoch": 0.005039959680322557,
"grad_norm": 0.6779051066808497,
"learning_rate": 5.031597472202224e-07,
"loss": 0.4522,
"step": 630
},
{
"epoch": 0.005119959040327677,
"grad_norm": 0.7405337420145796,
"learning_rate": 5.111591072714183e-07,
"loss": 0.4248,
"step": 640
},
{
"epoch": 0.0051999584003327975,
"grad_norm": 0.6915688533513078,
"learning_rate": 5.191584673226143e-07,
"loss": 0.4129,
"step": 650
},
{
"epoch": 0.005279957760337918,
"grad_norm": 0.6044444399034763,
"learning_rate": 5.271578273738101e-07,
"loss": 0.4237,
"step": 660
},
{
"epoch": 0.005359957120343037,
"grad_norm": 0.7401624173362278,
"learning_rate": 5.351571874250061e-07,
"loss": 0.4216,
"step": 670
},
{
"epoch": 0.005439956480348157,
"grad_norm": 0.7496141168534413,
"learning_rate": 5.43156547476202e-07,
"loss": 0.4268,
"step": 680
},
{
"epoch": 0.005519955840353277,
"grad_norm": 0.730614366570662,
"learning_rate": 5.511559075273978e-07,
"loss": 0.4589,
"step": 690
},
{
"epoch": 0.005599955200358397,
"grad_norm": 0.7836211351094068,
"learning_rate": 5.591552675785937e-07,
"loss": 0.4176,
"step": 700
},
{
"epoch": 0.005679954560363517,
"grad_norm": 0.6648189022606215,
"learning_rate": 5.671546276297896e-07,
"loss": 0.4274,
"step": 710
},
{
"epoch": 0.005759953920368637,
"grad_norm": 0.6607901995290275,
"learning_rate": 5.751539876809856e-07,
"loss": 0.428,
"step": 720
},
{
"epoch": 0.005839953280373757,
"grad_norm": 0.8422142881250906,
"learning_rate": 5.831533477321815e-07,
"loss": 0.4206,
"step": 730
},
{
"epoch": 0.005919952640378877,
"grad_norm": 0.8636103630308042,
"learning_rate": 5.911527077833774e-07,
"loss": 0.4384,
"step": 740
},
{
"epoch": 0.005999952000383997,
"grad_norm": 0.7020283376881055,
"learning_rate": 5.991520678345733e-07,
"loss": 0.4336,
"step": 750
},
{
"epoch": 0.006079951360389117,
"grad_norm": 0.6637768636661737,
"learning_rate": 6.071514278857692e-07,
"loss": 0.421,
"step": 760
},
{
"epoch": 0.006159950720394236,
"grad_norm": 0.6720228370970702,
"learning_rate": 6.151507879369651e-07,
"loss": 0.4425,
"step": 770
},
{
"epoch": 0.006239950080399357,
"grad_norm": 0.7915308024009126,
"learning_rate": 6.231501479881609e-07,
"loss": 0.4665,
"step": 780
},
{
"epoch": 0.006319949440404477,
"grad_norm": 0.7005483141808596,
"learning_rate": 6.311495080393569e-07,
"loss": 0.4371,
"step": 790
},
{
"epoch": 0.006399948800409597,
"grad_norm": 0.7085842808188052,
"learning_rate": 6.391488680905528e-07,
"loss": 0.4481,
"step": 800
},
{
"epoch": 0.006479948160414716,
"grad_norm": 0.797364113205394,
"learning_rate": 6.471482281417488e-07,
"loss": 0.43,
"step": 810
},
{
"epoch": 0.0065599475204198366,
"grad_norm": 0.7769920944189029,
"learning_rate": 6.551475881929446e-07,
"loss": 0.4421,
"step": 820
},
{
"epoch": 0.006639946880424957,
"grad_norm": 0.7190098449382926,
"learning_rate": 6.631469482441405e-07,
"loss": 0.4524,
"step": 830
},
{
"epoch": 0.006719946240430077,
"grad_norm": 0.8999089940382301,
"learning_rate": 6.711463082953363e-07,
"loss": 0.4327,
"step": 840
},
{
"epoch": 0.006799945600435196,
"grad_norm": 0.8071258672281636,
"learning_rate": 6.791456683465323e-07,
"loss": 0.3999,
"step": 850
},
{
"epoch": 0.0068799449604403165,
"grad_norm": 0.9718244104757909,
"learning_rate": 6.871450283977283e-07,
"loss": 0.4109,
"step": 860
},
{
"epoch": 0.006959944320445437,
"grad_norm": 0.7260805899057289,
"learning_rate": 6.951443884489241e-07,
"loss": 0.4254,
"step": 870
},
{
"epoch": 0.007039943680450556,
"grad_norm": 0.6793848246836304,
"learning_rate": 7.031437485001201e-07,
"loss": 0.4168,
"step": 880
},
{
"epoch": 0.007119943040455676,
"grad_norm": 0.8220405330990818,
"learning_rate": 7.11143108551316e-07,
"loss": 0.4368,
"step": 890
},
{
"epoch": 0.0071999424004607964,
"grad_norm": 0.8650021092848164,
"learning_rate": 7.191424686025118e-07,
"loss": 0.4336,
"step": 900
},
{
"epoch": 0.007279941760465917,
"grad_norm": 0.6667689577948444,
"learning_rate": 7.271418286537078e-07,
"loss": 0.4325,
"step": 910
},
{
"epoch": 0.007359941120471036,
"grad_norm": 0.6443211014509481,
"learning_rate": 7.351411887049036e-07,
"loss": 0.443,
"step": 920
},
{
"epoch": 0.007439940480476156,
"grad_norm": 0.6067913066251965,
"learning_rate": 7.431405487560996e-07,
"loss": 0.4095,
"step": 930
},
{
"epoch": 0.007519939840481276,
"grad_norm": 0.6935512372810877,
"learning_rate": 7.511399088072954e-07,
"loss": 0.4272,
"step": 940
},
{
"epoch": 0.007599939200486396,
"grad_norm": 0.6599113657719949,
"learning_rate": 7.591392688584914e-07,
"loss": 0.3942,
"step": 950
},
{
"epoch": 0.007679938560491516,
"grad_norm": 0.611225655896161,
"learning_rate": 7.671386289096873e-07,
"loss": 0.4451,
"step": 960
},
{
"epoch": 0.007759937920496636,
"grad_norm": 0.757015652343119,
"learning_rate": 7.751379889608831e-07,
"loss": 0.4059,
"step": 970
},
{
"epoch": 0.007839937280501755,
"grad_norm": 0.8456384517410107,
"learning_rate": 7.831373490120791e-07,
"loss": 0.4278,
"step": 980
},
{
"epoch": 0.007919936640506876,
"grad_norm": 0.7996016208371923,
"learning_rate": 7.911367090632751e-07,
"loss": 0.4536,
"step": 990
},
{
"epoch": 0.007999936000511996,
"grad_norm": 0.6228736248515305,
"learning_rate": 7.991360691144709e-07,
"loss": 0.3969,
"step": 1000
},
{
"epoch": 0.008079935360517116,
"grad_norm": 0.6440105424946402,
"learning_rate": 8.071354291656668e-07,
"loss": 0.4307,
"step": 1010
},
{
"epoch": 0.008159934720522236,
"grad_norm": 0.6636370756757088,
"learning_rate": 8.151347892168628e-07,
"loss": 0.4019,
"step": 1020
},
{
"epoch": 0.008239934080527356,
"grad_norm": 0.6534728964626215,
"learning_rate": 8.231341492680586e-07,
"loss": 0.4357,
"step": 1030
},
{
"epoch": 0.008319933440532475,
"grad_norm": 0.6854238349847609,
"learning_rate": 8.311335093192545e-07,
"loss": 0.4156,
"step": 1040
},
{
"epoch": 0.008399932800537595,
"grad_norm": 0.7692797686595098,
"learning_rate": 8.391328693704504e-07,
"loss": 0.4364,
"step": 1050
},
{
"epoch": 0.008479932160542715,
"grad_norm": 0.6892139807862941,
"learning_rate": 8.471322294216464e-07,
"loss": 0.4595,
"step": 1060
},
{
"epoch": 0.008559931520547836,
"grad_norm": 0.6786670330601587,
"learning_rate": 8.551315894728423e-07,
"loss": 0.4207,
"step": 1070
},
{
"epoch": 0.008639930880552956,
"grad_norm": 0.6118366771253826,
"learning_rate": 8.631309495240381e-07,
"loss": 0.4186,
"step": 1080
},
{
"epoch": 0.008719930240558076,
"grad_norm": 0.7157718863255984,
"learning_rate": 8.711303095752341e-07,
"loss": 0.4412,
"step": 1090
},
{
"epoch": 0.008799929600563196,
"grad_norm": 0.7102096103999658,
"learning_rate": 8.791296696264299e-07,
"loss": 0.4205,
"step": 1100
},
{
"epoch": 0.008879928960568315,
"grad_norm": 0.6326552697725677,
"learning_rate": 8.871290296776258e-07,
"loss": 0.4161,
"step": 1110
},
{
"epoch": 0.008959928320573435,
"grad_norm": 0.7455469536196851,
"learning_rate": 8.951283897288219e-07,
"loss": 0.4259,
"step": 1120
},
{
"epoch": 0.009039927680578555,
"grad_norm": 0.8219540514047248,
"learning_rate": 9.031277497800177e-07,
"loss": 0.4423,
"step": 1130
},
{
"epoch": 0.009119927040583675,
"grad_norm": 0.6428846698552027,
"learning_rate": 9.111271098312136e-07,
"loss": 0.432,
"step": 1140
},
{
"epoch": 0.009199926400588795,
"grad_norm": 0.7959015617833209,
"learning_rate": 9.191264698824094e-07,
"loss": 0.4374,
"step": 1150
},
{
"epoch": 0.009279925760593916,
"grad_norm": 0.722690696817927,
"learning_rate": 9.271258299336054e-07,
"loss": 0.4177,
"step": 1160
},
{
"epoch": 0.009359925120599036,
"grad_norm": 0.7583146312532495,
"learning_rate": 9.351251899848013e-07,
"loss": 0.4349,
"step": 1170
},
{
"epoch": 0.009439924480604156,
"grad_norm": 0.8841341873240007,
"learning_rate": 9.431245500359971e-07,
"loss": 0.4323,
"step": 1180
},
{
"epoch": 0.009519923840609274,
"grad_norm": 0.6844990736142492,
"learning_rate": 9.511239100871932e-07,
"loss": 0.4195,
"step": 1190
},
{
"epoch": 0.009599923200614395,
"grad_norm": 0.7129122863481299,
"learning_rate": 9.59123270138389e-07,
"loss": 0.4088,
"step": 1200
},
{
"epoch": 0.009679922560619515,
"grad_norm": 0.6532760834208395,
"learning_rate": 9.67122630189585e-07,
"loss": 0.3969,
"step": 1210
},
{
"epoch": 0.009759921920624635,
"grad_norm": 0.7563260927255472,
"learning_rate": 9.75121990240781e-07,
"loss": 0.4075,
"step": 1220
},
{
"epoch": 0.009839921280629755,
"grad_norm": 0.7704505024635662,
"learning_rate": 9.831213502919768e-07,
"loss": 0.3981,
"step": 1230
},
{
"epoch": 0.009919920640634875,
"grad_norm": 0.7056792723024738,
"learning_rate": 9.911207103431725e-07,
"loss": 0.4223,
"step": 1240
},
{
"epoch": 0.009999920000639996,
"grad_norm": 0.750390399783068,
"learning_rate": 9.991200703943684e-07,
"loss": 0.4102,
"step": 1250
},
{
"epoch": 0.010079919360645114,
"grad_norm": 0.7529460057363149,
"learning_rate": 1.0071194304455646e-06,
"loss": 0.4278,
"step": 1260
},
{
"epoch": 0.010159918720650234,
"grad_norm": 1.041695148553965,
"learning_rate": 1.0151187904967603e-06,
"loss": 0.4261,
"step": 1270
},
{
"epoch": 0.010239918080655355,
"grad_norm": 0.8098838670880232,
"learning_rate": 1.0231181505479562e-06,
"loss": 0.4163,
"step": 1280
},
{
"epoch": 0.010319917440660475,
"grad_norm": 0.7139180273227849,
"learning_rate": 1.0311175105991523e-06,
"loss": 0.4139,
"step": 1290
},
{
"epoch": 0.010399916800665595,
"grad_norm": 0.7293628900497775,
"learning_rate": 1.039116870650348e-06,
"loss": 0.4139,
"step": 1300
},
{
"epoch": 0.010479916160670715,
"grad_norm": 3.9899125178456276,
"learning_rate": 1.047116230701544e-06,
"loss": 0.4352,
"step": 1310
},
{
"epoch": 0.010559915520675835,
"grad_norm": 0.741428608083807,
"learning_rate": 1.0551155907527398e-06,
"loss": 0.4487,
"step": 1320
},
{
"epoch": 0.010639914880680954,
"grad_norm": 0.7645959836257802,
"learning_rate": 1.0631149508039358e-06,
"loss": 0.4509,
"step": 1330
},
{
"epoch": 0.010719914240686074,
"grad_norm": 0.6630163901596575,
"learning_rate": 1.0711143108551317e-06,
"loss": 0.422,
"step": 1340
},
{
"epoch": 0.010799913600691194,
"grad_norm": 0.7653242364061862,
"learning_rate": 1.0791136709063276e-06,
"loss": 0.4047,
"step": 1350
},
{
"epoch": 0.010879912960696314,
"grad_norm": 0.6952138151449674,
"learning_rate": 1.0871130309575235e-06,
"loss": 0.424,
"step": 1360
},
{
"epoch": 0.010959912320701435,
"grad_norm": 0.8400358998787617,
"learning_rate": 1.0951123910087194e-06,
"loss": 0.4329,
"step": 1370
},
{
"epoch": 0.011039911680706555,
"grad_norm": 0.7959368745719518,
"learning_rate": 1.1031117510599153e-06,
"loss": 0.4296,
"step": 1380
},
{
"epoch": 0.011119911040711675,
"grad_norm": 0.7182831330712102,
"learning_rate": 1.111111111111111e-06,
"loss": 0.4053,
"step": 1390
},
{
"epoch": 0.011199910400716794,
"grad_norm": 0.7631459457678664,
"learning_rate": 1.1191104711623072e-06,
"loss": 0.4174,
"step": 1400
},
{
"epoch": 0.011279909760721914,
"grad_norm": 0.6904060292491346,
"learning_rate": 1.127109831213503e-06,
"loss": 0.4262,
"step": 1410
},
{
"epoch": 0.011359909120727034,
"grad_norm": 0.7154968430224656,
"learning_rate": 1.1351091912646988e-06,
"loss": 0.4116,
"step": 1420
},
{
"epoch": 0.011439908480732154,
"grad_norm": 0.6430506977204669,
"learning_rate": 1.143108551315895e-06,
"loss": 0.3985,
"step": 1430
},
{
"epoch": 0.011519907840737274,
"grad_norm": 0.6060536937789834,
"learning_rate": 1.1511079113670908e-06,
"loss": 0.4366,
"step": 1440
},
{
"epoch": 0.011599907200742395,
"grad_norm": 0.7070045694797042,
"learning_rate": 1.1591072714182865e-06,
"loss": 0.3951,
"step": 1450
},
{
"epoch": 0.011679906560747515,
"grad_norm": 0.6799481270205386,
"learning_rate": 1.1671066314694824e-06,
"loss": 0.4277,
"step": 1460
},
{
"epoch": 0.011759905920752633,
"grad_norm": 0.7706688131579046,
"learning_rate": 1.1751059915206786e-06,
"loss": 0.4106,
"step": 1470
},
{
"epoch": 0.011839905280757753,
"grad_norm": 0.7659401655636618,
"learning_rate": 1.1831053515718743e-06,
"loss": 0.4147,
"step": 1480
},
{
"epoch": 0.011919904640762874,
"grad_norm": 0.7880351649260696,
"learning_rate": 1.1911047116230702e-06,
"loss": 0.4018,
"step": 1490
},
{
"epoch": 0.011999904000767994,
"grad_norm": 0.6643952514129879,
"learning_rate": 1.1991040716742661e-06,
"loss": 0.4225,
"step": 1500
},
{
"epoch": 0.012079903360773114,
"grad_norm": 0.7410891062680932,
"learning_rate": 1.207103431725462e-06,
"loss": 0.4235,
"step": 1510
},
{
"epoch": 0.012159902720778234,
"grad_norm": 0.6702465025094532,
"learning_rate": 1.215102791776658e-06,
"loss": 0.4485,
"step": 1520
},
{
"epoch": 0.012239902080783354,
"grad_norm": 0.72511805103151,
"learning_rate": 1.2231021518278539e-06,
"loss": 0.4075,
"step": 1530
},
{
"epoch": 0.012319901440788473,
"grad_norm": 0.7364457263640842,
"learning_rate": 1.2311015118790498e-06,
"loss": 0.4209,
"step": 1540
},
{
"epoch": 0.012399900800793593,
"grad_norm": 0.6760801533876919,
"learning_rate": 1.2391008719302457e-06,
"loss": 0.4153,
"step": 1550
},
{
"epoch": 0.012479900160798713,
"grad_norm": 1.329240774622819,
"learning_rate": 1.2471002319814416e-06,
"loss": 0.4569,
"step": 1560
},
{
"epoch": 0.012559899520803833,
"grad_norm": 0.7197695590369367,
"learning_rate": 1.2550995920326375e-06,
"loss": 0.4148,
"step": 1570
},
{
"epoch": 0.012639898880808954,
"grad_norm": 0.7197146321929337,
"learning_rate": 1.2630989520838332e-06,
"loss": 0.4309,
"step": 1580
},
{
"epoch": 0.012719898240814074,
"grad_norm": 0.6941046794760515,
"learning_rate": 1.2710983121350293e-06,
"loss": 0.4266,
"step": 1590
},
{
"epoch": 0.012799897600819194,
"grad_norm": 0.6685733935987979,
"learning_rate": 1.2790976721862253e-06,
"loss": 0.4148,
"step": 1600
},
{
"epoch": 0.012879896960824314,
"grad_norm": 0.829963069106683,
"learning_rate": 1.287097032237421e-06,
"loss": 0.438,
"step": 1610
},
{
"epoch": 0.012959896320829433,
"grad_norm": 0.8772001383565942,
"learning_rate": 1.295096392288617e-06,
"loss": 0.4167,
"step": 1620
},
{
"epoch": 0.013039895680834553,
"grad_norm": 0.7693529714105686,
"learning_rate": 1.303095752339813e-06,
"loss": 0.4465,
"step": 1630
},
{
"epoch": 0.013119895040839673,
"grad_norm": 0.6969819161826472,
"learning_rate": 1.3110951123910087e-06,
"loss": 0.4374,
"step": 1640
},
{
"epoch": 0.013199894400844793,
"grad_norm": 0.7245418409893126,
"learning_rate": 1.3190944724422048e-06,
"loss": 0.4375,
"step": 1650
},
{
"epoch": 0.013279893760849914,
"grad_norm": 0.6704658854506884,
"learning_rate": 1.3270938324934008e-06,
"loss": 0.4144,
"step": 1660
},
{
"epoch": 0.013359893120855034,
"grad_norm": 0.7596780468310279,
"learning_rate": 1.3350931925445965e-06,
"loss": 0.4142,
"step": 1670
},
{
"epoch": 0.013439892480860154,
"grad_norm": 0.799033496950235,
"learning_rate": 1.3430925525957924e-06,
"loss": 0.424,
"step": 1680
},
{
"epoch": 0.013519891840865272,
"grad_norm": 0.8360146268303155,
"learning_rate": 1.3510919126469885e-06,
"loss": 0.4104,
"step": 1690
},
{
"epoch": 0.013599891200870393,
"grad_norm": 0.7830381641790578,
"learning_rate": 1.3590912726981842e-06,
"loss": 0.4067,
"step": 1700
},
{
"epoch": 0.013679890560875513,
"grad_norm": 0.7376045815830132,
"learning_rate": 1.3670906327493801e-06,
"loss": 0.4232,
"step": 1710
},
{
"epoch": 0.013759889920880633,
"grad_norm": 0.7780257393234633,
"learning_rate": 1.3750899928005762e-06,
"loss": 0.4001,
"step": 1720
},
{
"epoch": 0.013839889280885753,
"grad_norm": 0.7400853428241427,
"learning_rate": 1.383089352851772e-06,
"loss": 0.4283,
"step": 1730
},
{
"epoch": 0.013919888640890873,
"grad_norm": 0.6301335389858812,
"learning_rate": 1.3910887129029679e-06,
"loss": 0.4315,
"step": 1740
},
{
"epoch": 0.013999888000895994,
"grad_norm": 0.6873058205506691,
"learning_rate": 1.3990880729541636e-06,
"loss": 0.4294,
"step": 1750
},
{
"epoch": 0.014079887360901112,
"grad_norm": 0.6118849450571252,
"learning_rate": 1.4070874330053597e-06,
"loss": 0.4116,
"step": 1760
},
{
"epoch": 0.014159886720906232,
"grad_norm": 0.8643963519063279,
"learning_rate": 1.4150867930565556e-06,
"loss": 0.4189,
"step": 1770
},
{
"epoch": 0.014239886080911352,
"grad_norm": 0.6986286668684888,
"learning_rate": 1.4230861531077513e-06,
"loss": 0.4393,
"step": 1780
},
{
"epoch": 0.014319885440916473,
"grad_norm": 0.7361554237470329,
"learning_rate": 1.4310855131589474e-06,
"loss": 0.4048,
"step": 1790
},
{
"epoch": 0.014399884800921593,
"grad_norm": 0.7526595512221772,
"learning_rate": 1.4390848732101434e-06,
"loss": 0.4434,
"step": 1800
},
{
"epoch": 0.014479884160926713,
"grad_norm": 0.6793121490590498,
"learning_rate": 1.447084233261339e-06,
"loss": 0.4313,
"step": 1810
},
{
"epoch": 0.014559883520931833,
"grad_norm": 0.7503593630371851,
"learning_rate": 1.4550835933125352e-06,
"loss": 0.4363,
"step": 1820
},
{
"epoch": 0.014639882880936952,
"grad_norm": 0.6631126747097913,
"learning_rate": 1.463082953363731e-06,
"loss": 0.4172,
"step": 1830
},
{
"epoch": 0.014719882240942072,
"grad_norm": 0.6201041671912539,
"learning_rate": 1.4710823134149268e-06,
"loss": 0.426,
"step": 1840
},
{
"epoch": 0.014799881600947192,
"grad_norm": 0.7986575840999489,
"learning_rate": 1.479081673466123e-06,
"loss": 0.4418,
"step": 1850
},
{
"epoch": 0.014879880960952312,
"grad_norm": 0.7485644191714067,
"learning_rate": 1.4870810335173188e-06,
"loss": 0.4315,
"step": 1860
},
{
"epoch": 0.014959880320957433,
"grad_norm": 0.767180980787683,
"learning_rate": 1.4950803935685146e-06,
"loss": 0.44,
"step": 1870
},
{
"epoch": 0.015039879680962553,
"grad_norm": 0.7085244935047633,
"learning_rate": 1.5030797536197107e-06,
"loss": 0.4197,
"step": 1880
},
{
"epoch": 0.015119879040967673,
"grad_norm": 0.9096558305501717,
"learning_rate": 1.5110791136709064e-06,
"loss": 0.4276,
"step": 1890
},
{
"epoch": 0.015199878400972791,
"grad_norm": 0.7782449144127891,
"learning_rate": 1.5190784737221023e-06,
"loss": 0.4433,
"step": 1900
},
{
"epoch": 0.015279877760977912,
"grad_norm": 0.6784288997728832,
"learning_rate": 1.5270778337732984e-06,
"loss": 0.408,
"step": 1910
},
{
"epoch": 0.015359877120983032,
"grad_norm": 0.7394160138728095,
"learning_rate": 1.5350771938244941e-06,
"loss": 0.427,
"step": 1920
},
{
"epoch": 0.015439876480988152,
"grad_norm": 0.7020184796096351,
"learning_rate": 1.54307655387569e-06,
"loss": 0.4585,
"step": 1930
},
{
"epoch": 0.015519875840993272,
"grad_norm": 0.6570216742745165,
"learning_rate": 1.551075913926886e-06,
"loss": 0.4203,
"step": 1940
},
{
"epoch": 0.015599875200998392,
"grad_norm": 0.6492915506893296,
"learning_rate": 1.5590752739780819e-06,
"loss": 0.4225,
"step": 1950
},
{
"epoch": 0.01567987456100351,
"grad_norm": 0.6404207960330748,
"learning_rate": 1.5670746340292778e-06,
"loss": 0.4155,
"step": 1960
},
{
"epoch": 0.01575987392100863,
"grad_norm": 0.7069937527425317,
"learning_rate": 1.5750739940804737e-06,
"loss": 0.4017,
"step": 1970
},
{
"epoch": 0.01583987328101375,
"grad_norm": 0.7497814910327999,
"learning_rate": 1.5830733541316694e-06,
"loss": 0.4336,
"step": 1980
},
{
"epoch": 0.01591987264101887,
"grad_norm": 0.997502334828043,
"learning_rate": 1.5910727141828655e-06,
"loss": 0.4252,
"step": 1990
},
{
"epoch": 0.01599987200102399,
"grad_norm": 0.7051737520195863,
"learning_rate": 1.5990720742340615e-06,
"loss": 0.4136,
"step": 2000
},
{
"epoch": 0.016079871361029112,
"grad_norm": 0.7358677233078202,
"learning_rate": 1.6070714342852572e-06,
"loss": 0.4041,
"step": 2010
},
{
"epoch": 0.016159870721034232,
"grad_norm": 0.7232160368639224,
"learning_rate": 1.6150707943364533e-06,
"loss": 0.3925,
"step": 2020
},
{
"epoch": 0.016239870081039352,
"grad_norm": 0.7492396218793221,
"learning_rate": 1.6230701543876492e-06,
"loss": 0.4453,
"step": 2030
},
{
"epoch": 0.016319869441044473,
"grad_norm": 0.7879468186487532,
"learning_rate": 1.631069514438845e-06,
"loss": 0.4143,
"step": 2040
},
{
"epoch": 0.016399868801049593,
"grad_norm": 0.7027247029038095,
"learning_rate": 1.639068874490041e-06,
"loss": 0.4487,
"step": 2050
},
{
"epoch": 0.016479868161054713,
"grad_norm": 0.6646702988688921,
"learning_rate": 1.6470682345412367e-06,
"loss": 0.4189,
"step": 2060
},
{
"epoch": 0.016559867521059833,
"grad_norm": 0.7214592394412016,
"learning_rate": 1.6550675945924326e-06,
"loss": 0.4298,
"step": 2070
},
{
"epoch": 0.01663986688106495,
"grad_norm": 0.7009780752105863,
"learning_rate": 1.6630669546436288e-06,
"loss": 0.43,
"step": 2080
},
{
"epoch": 0.01671986624107007,
"grad_norm": 0.6802031501540443,
"learning_rate": 1.6710663146948245e-06,
"loss": 0.4064,
"step": 2090
},
{
"epoch": 0.01679986560107519,
"grad_norm": 0.7764568933093239,
"learning_rate": 1.6790656747460204e-06,
"loss": 0.4192,
"step": 2100
},
{
"epoch": 0.01687986496108031,
"grad_norm": 0.7257831241745193,
"learning_rate": 1.6870650347972165e-06,
"loss": 0.4137,
"step": 2110
},
{
"epoch": 0.01695986432108543,
"grad_norm": 0.7231093177815116,
"learning_rate": 1.6950643948484122e-06,
"loss": 0.4321,
"step": 2120
},
{
"epoch": 0.01703986368109055,
"grad_norm": 0.7604848904942334,
"learning_rate": 1.7030637548996081e-06,
"loss": 0.4173,
"step": 2130
},
{
"epoch": 0.01711986304109567,
"grad_norm": 0.8671202797505491,
"learning_rate": 1.7110631149508043e-06,
"loss": 0.4172,
"step": 2140
},
{
"epoch": 0.01719986240110079,
"grad_norm": 0.8818905733108134,
"learning_rate": 1.719062475002e-06,
"loss": 0.4096,
"step": 2150
},
{
"epoch": 0.01727986176110591,
"grad_norm": 0.7073377083366502,
"learning_rate": 1.7270618350531959e-06,
"loss": 0.4327,
"step": 2160
},
{
"epoch": 0.01735986112111103,
"grad_norm": 0.9637856611728648,
"learning_rate": 1.735061195104392e-06,
"loss": 0.457,
"step": 2170
},
{
"epoch": 0.017439860481116152,
"grad_norm": 0.7763253238443666,
"learning_rate": 1.7430605551555877e-06,
"loss": 0.4444,
"step": 2180
},
{
"epoch": 0.017519859841121272,
"grad_norm": 0.7532387544984117,
"learning_rate": 1.7510599152067836e-06,
"loss": 0.4193,
"step": 2190
},
{
"epoch": 0.017599859201126392,
"grad_norm": 0.7001703245870058,
"learning_rate": 1.7590592752579793e-06,
"loss": 0.4334,
"step": 2200
},
{
"epoch": 0.017679858561131512,
"grad_norm": 0.653320909525973,
"learning_rate": 1.7670586353091755e-06,
"loss": 0.4137,
"step": 2210
},
{
"epoch": 0.01775985792113663,
"grad_norm": 0.6431905950303065,
"learning_rate": 1.7750579953603714e-06,
"loss": 0.4056,
"step": 2220
},
{
"epoch": 0.01783985728114175,
"grad_norm": 0.6355510699319944,
"learning_rate": 1.783057355411567e-06,
"loss": 0.3967,
"step": 2230
},
{
"epoch": 0.01791985664114687,
"grad_norm": 1.2693108108319435,
"learning_rate": 1.7910567154627632e-06,
"loss": 0.3958,
"step": 2240
},
{
"epoch": 0.01799985600115199,
"grad_norm": 0.7084484301635297,
"learning_rate": 1.7990560755139591e-06,
"loss": 0.4341,
"step": 2250
},
{
"epoch": 0.01807985536115711,
"grad_norm": 0.7305495029250423,
"learning_rate": 1.8070554355651548e-06,
"loss": 0.4343,
"step": 2260
},
{
"epoch": 0.01815985472116223,
"grad_norm": 0.7855576477357182,
"learning_rate": 1.8150547956163507e-06,
"loss": 0.4158,
"step": 2270
},
{
"epoch": 0.01823985408116735,
"grad_norm": 0.8272843321029247,
"learning_rate": 1.8230541556675469e-06,
"loss": 0.4355,
"step": 2280
},
{
"epoch": 0.01831985344117247,
"grad_norm": 0.753702291149743,
"learning_rate": 1.8310535157187426e-06,
"loss": 0.4025,
"step": 2290
},
{
"epoch": 0.01839985280117759,
"grad_norm": 0.7598104262546688,
"learning_rate": 1.8390528757699385e-06,
"loss": 0.413,
"step": 2300
},
{
"epoch": 0.01847985216118271,
"grad_norm": 0.7611395927729616,
"learning_rate": 1.8470522358211346e-06,
"loss": 0.4373,
"step": 2310
},
{
"epoch": 0.01855985152118783,
"grad_norm": 0.760722694363519,
"learning_rate": 1.8550515958723303e-06,
"loss": 0.4179,
"step": 2320
},
{
"epoch": 0.01863985088119295,
"grad_norm": 0.5957047498546761,
"learning_rate": 1.8630509559235262e-06,
"loss": 0.4231,
"step": 2330
},
{
"epoch": 0.01871985024119807,
"grad_norm": 0.7186940138009597,
"learning_rate": 1.8710503159747224e-06,
"loss": 0.4171,
"step": 2340
},
{
"epoch": 0.018799849601203192,
"grad_norm": 0.7760364408450707,
"learning_rate": 1.879049676025918e-06,
"loss": 0.4003,
"step": 2350
},
{
"epoch": 0.018879848961208312,
"grad_norm": 0.7481302705420546,
"learning_rate": 1.887049036077114e-06,
"loss": 0.4182,
"step": 2360
},
{
"epoch": 0.01895984832121343,
"grad_norm": 0.7358536669321117,
"learning_rate": 1.8950483961283097e-06,
"loss": 0.4272,
"step": 2370
},
{
"epoch": 0.01903984768121855,
"grad_norm": 0.7602582219791679,
"learning_rate": 1.9030477561795058e-06,
"loss": 0.4157,
"step": 2380
},
{
"epoch": 0.01911984704122367,
"grad_norm": 0.7426387012574098,
"learning_rate": 1.911047116230702e-06,
"loss": 0.4136,
"step": 2390
},
{
"epoch": 0.01919984640122879,
"grad_norm": 0.6954613448087369,
"learning_rate": 1.9190464762818974e-06,
"loss": 0.4081,
"step": 2400
},
{
"epoch": 0.01927984576123391,
"grad_norm": 0.7604294663875293,
"learning_rate": 1.9270458363330933e-06,
"loss": 0.4116,
"step": 2410
},
{
"epoch": 0.01935984512123903,
"grad_norm": 0.7196806209080392,
"learning_rate": 1.9350451963842897e-06,
"loss": 0.4253,
"step": 2420
},
{
"epoch": 0.01943984448124415,
"grad_norm": 0.7619473447002568,
"learning_rate": 1.943044556435485e-06,
"loss": 0.4048,
"step": 2430
},
{
"epoch": 0.01951984384124927,
"grad_norm": 0.7052222380767645,
"learning_rate": 1.951043916486681e-06,
"loss": 0.4092,
"step": 2440
},
{
"epoch": 0.01959984320125439,
"grad_norm": 0.6311702076235657,
"learning_rate": 1.9590432765378774e-06,
"loss": 0.409,
"step": 2450
},
{
"epoch": 0.01967984256125951,
"grad_norm": 0.7989650814927569,
"learning_rate": 1.967042636589073e-06,
"loss": 0.4031,
"step": 2460
},
{
"epoch": 0.01975984192126463,
"grad_norm": 0.7243725001067509,
"learning_rate": 1.975041996640269e-06,
"loss": 0.4139,
"step": 2470
},
{
"epoch": 0.01983984128126975,
"grad_norm": 0.7689695050248523,
"learning_rate": 1.983041356691465e-06,
"loss": 0.419,
"step": 2480
},
{
"epoch": 0.01991984064127487,
"grad_norm": 0.6784111247678405,
"learning_rate": 1.9910407167426607e-06,
"loss": 0.4255,
"step": 2490
},
{
"epoch": 0.01999984000127999,
"grad_norm": 0.696241896489532,
"learning_rate": 1.9990400767938566e-06,
"loss": 0.4387,
"step": 2500
},
{
"epoch": 0.020079839361285108,
"grad_norm": 0.7337662500397681,
"learning_rate": 2.0070394368450525e-06,
"loss": 0.4107,
"step": 2510
},
{
"epoch": 0.02015983872129023,
"grad_norm": 0.7522121421950386,
"learning_rate": 2.0150387968962484e-06,
"loss": 0.4341,
"step": 2520
},
{
"epoch": 0.02023983808129535,
"grad_norm": 0.7635117961228429,
"learning_rate": 2.0230381569474443e-06,
"loss": 0.444,
"step": 2530
},
{
"epoch": 0.02031983744130047,
"grad_norm": 0.7258340461733414,
"learning_rate": 2.0310375169986402e-06,
"loss": 0.4532,
"step": 2540
},
{
"epoch": 0.02039983680130559,
"grad_norm": 0.8227531741667117,
"learning_rate": 2.039036877049836e-06,
"loss": 0.4211,
"step": 2550
},
{
"epoch": 0.02047983616131071,
"grad_norm": 0.8046278992558573,
"learning_rate": 2.047036237101032e-06,
"loss": 0.4304,
"step": 2560
},
{
"epoch": 0.02055983552131583,
"grad_norm": 0.8583586766230429,
"learning_rate": 2.055035597152228e-06,
"loss": 0.389,
"step": 2570
},
{
"epoch": 0.02063983488132095,
"grad_norm": 0.7059091393994599,
"learning_rate": 2.063034957203424e-06,
"loss": 0.4143,
"step": 2580
},
{
"epoch": 0.02071983424132607,
"grad_norm": 0.7169812723780176,
"learning_rate": 2.07103431725462e-06,
"loss": 0.4231,
"step": 2590
},
{
"epoch": 0.02079983360133119,
"grad_norm": 0.744498117665369,
"learning_rate": 2.0790336773058157e-06,
"loss": 0.3931,
"step": 2600
},
{
"epoch": 0.02087983296133631,
"grad_norm": 0.8714679551154206,
"learning_rate": 2.0870330373570117e-06,
"loss": 0.4227,
"step": 2610
},
{
"epoch": 0.02095983232134143,
"grad_norm": 0.6310793333444085,
"learning_rate": 2.0950323974082076e-06,
"loss": 0.4092,
"step": 2620
},
{
"epoch": 0.02103983168134655,
"grad_norm": 0.7793866057944923,
"learning_rate": 2.1030317574594035e-06,
"loss": 0.4139,
"step": 2630
},
{
"epoch": 0.02111983104135167,
"grad_norm": 0.8117526334063988,
"learning_rate": 2.1110311175105994e-06,
"loss": 0.4154,
"step": 2640
},
{
"epoch": 0.021199830401356787,
"grad_norm": 0.7466901509429923,
"learning_rate": 2.1190304775617953e-06,
"loss": 0.4139,
"step": 2650
},
{
"epoch": 0.021279829761361908,
"grad_norm": 0.9396797684739518,
"learning_rate": 2.1270298376129912e-06,
"loss": 0.4166,
"step": 2660
},
{
"epoch": 0.021359829121367028,
"grad_norm": 0.7212101339600039,
"learning_rate": 2.135029197664187e-06,
"loss": 0.4299,
"step": 2670
},
{
"epoch": 0.021439828481372148,
"grad_norm": 0.9051397916977733,
"learning_rate": 2.1430285577153826e-06,
"loss": 0.4203,
"step": 2680
},
{
"epoch": 0.021519827841377268,
"grad_norm": 0.6759148258397492,
"learning_rate": 2.151027917766579e-06,
"loss": 0.4065,
"step": 2690
},
{
"epoch": 0.02159982720138239,
"grad_norm": 0.7191776566727898,
"learning_rate": 2.159027277817775e-06,
"loss": 0.3957,
"step": 2700
},
{
"epoch": 0.02167982656138751,
"grad_norm": 0.7752464373073066,
"learning_rate": 2.1670266378689704e-06,
"loss": 0.4198,
"step": 2710
},
{
"epoch": 0.02175982592139263,
"grad_norm": 0.7096879372714776,
"learning_rate": 2.1750259979201667e-06,
"loss": 0.4337,
"step": 2720
},
{
"epoch": 0.02183982528139775,
"grad_norm": 0.7577341998111574,
"learning_rate": 2.1830253579713626e-06,
"loss": 0.4484,
"step": 2730
},
{
"epoch": 0.02191982464140287,
"grad_norm": 0.8303467101337862,
"learning_rate": 2.191024718022558e-06,
"loss": 0.436,
"step": 2740
},
{
"epoch": 0.02199982400140799,
"grad_norm": 0.740434972574641,
"learning_rate": 2.1990240780737545e-06,
"loss": 0.4026,
"step": 2750
},
{
"epoch": 0.02207982336141311,
"grad_norm": 0.7385703007931491,
"learning_rate": 2.2070234381249504e-06,
"loss": 0.3994,
"step": 2760
},
{
"epoch": 0.02215982272141823,
"grad_norm": 0.804160370262748,
"learning_rate": 2.215022798176146e-06,
"loss": 0.4247,
"step": 2770
},
{
"epoch": 0.02223982208142335,
"grad_norm": 0.9699580286471233,
"learning_rate": 2.2230221582273422e-06,
"loss": 0.4158,
"step": 2780
},
{
"epoch": 0.02231982144142847,
"grad_norm": 0.854317456477185,
"learning_rate": 2.231021518278538e-06,
"loss": 0.4224,
"step": 2790
},
{
"epoch": 0.022399820801433587,
"grad_norm": 0.7261443031804087,
"learning_rate": 2.2390208783297336e-06,
"loss": 0.4153,
"step": 2800
},
{
"epoch": 0.022479820161438707,
"grad_norm": 0.7221332661288903,
"learning_rate": 2.24702023838093e-06,
"loss": 0.4287,
"step": 2810
},
{
"epoch": 0.022559819521443827,
"grad_norm": 0.8879662037891538,
"learning_rate": 2.2550195984321255e-06,
"loss": 0.4221,
"step": 2820
},
{
"epoch": 0.022639818881448948,
"grad_norm": 0.7885980884870208,
"learning_rate": 2.2630189584833214e-06,
"loss": 0.4394,
"step": 2830
},
{
"epoch": 0.022719818241454068,
"grad_norm": 4.841458093614334,
"learning_rate": 2.2710183185345173e-06,
"loss": 0.4306,
"step": 2840
},
{
"epoch": 0.022799817601459188,
"grad_norm": 0.762762416297712,
"learning_rate": 2.279017678585713e-06,
"loss": 0.4248,
"step": 2850
},
{
"epoch": 0.022879816961464308,
"grad_norm": 0.8448845187481899,
"learning_rate": 2.287017038636909e-06,
"loss": 0.4016,
"step": 2860
},
{
"epoch": 0.02295981632146943,
"grad_norm": 0.7119692504285716,
"learning_rate": 2.295016398688105e-06,
"loss": 0.4187,
"step": 2870
},
{
"epoch": 0.02303981568147455,
"grad_norm": 0.7322860524669678,
"learning_rate": 2.303015758739301e-06,
"loss": 0.3992,
"step": 2880
},
{
"epoch": 0.02311981504147967,
"grad_norm": 0.7809553100217358,
"learning_rate": 2.311015118790497e-06,
"loss": 0.4323,
"step": 2890
},
{
"epoch": 0.02319981440148479,
"grad_norm": 0.7497042609672665,
"learning_rate": 2.3190144788416928e-06,
"loss": 0.4216,
"step": 2900
},
{
"epoch": 0.02327981376148991,
"grad_norm": 0.77908486703842,
"learning_rate": 2.3270138388928887e-06,
"loss": 0.4139,
"step": 2910
},
{
"epoch": 0.02335981312149503,
"grad_norm": 0.8259941131428074,
"learning_rate": 2.3350131989440846e-06,
"loss": 0.4298,
"step": 2920
},
{
"epoch": 0.02343981248150015,
"grad_norm": 0.7315958297934407,
"learning_rate": 2.3430125589952805e-06,
"loss": 0.4148,
"step": 2930
},
{
"epoch": 0.023519811841505266,
"grad_norm": 0.7127501348213877,
"learning_rate": 2.3510119190464764e-06,
"loss": 0.4226,
"step": 2940
},
{
"epoch": 0.023599811201510387,
"grad_norm": 0.7004822200975431,
"learning_rate": 2.3590112790976724e-06,
"loss": 0.4037,
"step": 2950
},
{
"epoch": 0.023679810561515507,
"grad_norm": 0.7640873416448367,
"learning_rate": 2.3670106391488683e-06,
"loss": 0.4196,
"step": 2960
},
{
"epoch": 0.023759809921520627,
"grad_norm": 0.7562584220469137,
"learning_rate": 2.375009999200064e-06,
"loss": 0.4226,
"step": 2970
},
{
"epoch": 0.023839809281525747,
"grad_norm": 0.700408619087647,
"learning_rate": 2.38300935925126e-06,
"loss": 0.4194,
"step": 2980
},
{
"epoch": 0.023919808641530867,
"grad_norm": 0.6926553567290514,
"learning_rate": 2.391008719302456e-06,
"loss": 0.4377,
"step": 2990
},
{
"epoch": 0.023999808001535988,
"grad_norm": 0.8080569171783707,
"learning_rate": 2.399008079353652e-06,
"loss": 0.425,
"step": 3000
}
],
"logging_steps": 10,
"max_steps": 125001,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 173052491300864.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}