andstor's picture
Upload folder using huggingface_hub
6c82a5f verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.999179655455291,
"eval_steps": 500,
"global_step": 1371,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0021875854525567405,
"grad_norm": 0.017111310735344887,
"learning_rate": 2.173913043478261e-05,
"loss": 1.059,
"step": 1
},
{
"epoch": 0.004375170905113481,
"grad_norm": 0.018623707816004753,
"learning_rate": 4.347826086956522e-05,
"loss": 1.315,
"step": 2
},
{
"epoch": 0.006562756357670222,
"grad_norm": 0.018533790484070778,
"learning_rate": 6.521739130434782e-05,
"loss": 0.9224,
"step": 3
},
{
"epoch": 0.008750341810226962,
"grad_norm": 0.015920396894216537,
"learning_rate": 8.695652173913044e-05,
"loss": 0.9201,
"step": 4
},
{
"epoch": 0.010937927262783703,
"grad_norm": 0.01558469608426094,
"learning_rate": 0.00010869565217391305,
"loss": 0.8457,
"step": 5
},
{
"epoch": 0.013125512715340444,
"grad_norm": 0.023962153121829033,
"learning_rate": 0.00013043478260869564,
"loss": 0.9347,
"step": 6
},
{
"epoch": 0.015313098167897183,
"grad_norm": 0.029316680505871773,
"learning_rate": 0.00015217391304347827,
"loss": 0.8043,
"step": 7
},
{
"epoch": 0.017500683620453924,
"grad_norm": 0.028927722945809364,
"learning_rate": 0.00017391304347826088,
"loss": 0.9963,
"step": 8
},
{
"epoch": 0.019688269073010665,
"grad_norm": 0.025050047785043716,
"learning_rate": 0.0001956521739130435,
"loss": 0.7861,
"step": 9
},
{
"epoch": 0.021875854525567406,
"grad_norm": 0.04486666992306709,
"learning_rate": 0.0002173913043478261,
"loss": 1.0884,
"step": 10
},
{
"epoch": 0.024063439978124147,
"grad_norm": 0.035062652081251144,
"learning_rate": 0.00023913043478260867,
"loss": 0.9876,
"step": 11
},
{
"epoch": 0.026251025430680888,
"grad_norm": 0.033111147582530975,
"learning_rate": 0.0002608695652173913,
"loss": 0.8838,
"step": 12
},
{
"epoch": 0.028438610883237625,
"grad_norm": 0.04362301528453827,
"learning_rate": 0.00028260869565217394,
"loss": 0.8189,
"step": 13
},
{
"epoch": 0.030626196335794366,
"grad_norm": 0.04369740933179855,
"learning_rate": 0.00030434782608695655,
"loss": 0.9065,
"step": 14
},
{
"epoch": 0.03281378178835111,
"grad_norm": 0.04280918091535568,
"learning_rate": 0.0003260869565217391,
"loss": 0.8706,
"step": 15
},
{
"epoch": 0.03500136724090785,
"grad_norm": 0.06369622051715851,
"learning_rate": 0.00034782608695652176,
"loss": 1.0769,
"step": 16
},
{
"epoch": 0.03718895269346459,
"grad_norm": 0.04528605565428734,
"learning_rate": 0.00036956521739130437,
"loss": 0.9662,
"step": 17
},
{
"epoch": 0.03937653814602133,
"grad_norm": 0.045731619000434875,
"learning_rate": 0.000391304347826087,
"loss": 0.7727,
"step": 18
},
{
"epoch": 0.04156412359857807,
"grad_norm": 0.03585459291934967,
"learning_rate": 0.0004130434782608696,
"loss": 0.7813,
"step": 19
},
{
"epoch": 0.04375170905113481,
"grad_norm": 0.07666835933923721,
"learning_rate": 0.0004347826086956522,
"loss": 1.1307,
"step": 20
},
{
"epoch": 0.04593929450369155,
"grad_norm": 0.02985576167702675,
"learning_rate": 0.0004565217391304348,
"loss": 0.785,
"step": 21
},
{
"epoch": 0.04812687995624829,
"grad_norm": 0.02983052283525467,
"learning_rate": 0.00047826086956521735,
"loss": 0.8323,
"step": 22
},
{
"epoch": 0.050314465408805034,
"grad_norm": 0.09363115578889847,
"learning_rate": 0.0005,
"loss": 0.6885,
"step": 23
},
{
"epoch": 0.052502050861361775,
"grad_norm": 0.47768905758857727,
"learning_rate": 0.0005217391304347826,
"loss": 1.0143,
"step": 24
},
{
"epoch": 0.05468963631391851,
"grad_norm": 0.1065075695514679,
"learning_rate": 0.0005434782608695652,
"loss": 0.8331,
"step": 25
},
{
"epoch": 0.05687722176647525,
"grad_norm": 0.023991186171770096,
"learning_rate": 0.0005652173913043479,
"loss": 0.837,
"step": 26
},
{
"epoch": 0.05906480721903199,
"grad_norm": 0.020501986145973206,
"learning_rate": 0.0005869565217391304,
"loss": 0.7117,
"step": 27
},
{
"epoch": 0.06125239267158873,
"grad_norm": 0.052776217460632324,
"learning_rate": 0.0006086956521739131,
"loss": 0.9429,
"step": 28
},
{
"epoch": 0.06343997812414548,
"grad_norm": 0.0542248971760273,
"learning_rate": 0.0006304347826086957,
"loss": 0.7759,
"step": 29
},
{
"epoch": 0.06562756357670221,
"grad_norm": 0.061998218297958374,
"learning_rate": 0.0006521739130434782,
"loss": 0.7904,
"step": 30
},
{
"epoch": 0.06781514902925896,
"grad_norm": 0.03401396796107292,
"learning_rate": 0.0006739130434782609,
"loss": 0.7386,
"step": 31
},
{
"epoch": 0.0700027344818157,
"grad_norm": 0.027073826640844345,
"learning_rate": 0.0006956521739130435,
"loss": 0.6702,
"step": 32
},
{
"epoch": 0.07219031993437244,
"grad_norm": 0.0468217171728611,
"learning_rate": 0.0007173913043478261,
"loss": 0.9944,
"step": 33
},
{
"epoch": 0.07437790538692918,
"grad_norm": 0.1130312904715538,
"learning_rate": 0.0007391304347826087,
"loss": 0.8396,
"step": 34
},
{
"epoch": 0.07656549083948591,
"grad_norm": 0.05662137269973755,
"learning_rate": 0.0007608695652173914,
"loss": 0.918,
"step": 35
},
{
"epoch": 0.07875307629204266,
"grad_norm": 0.030792295932769775,
"learning_rate": 0.000782608695652174,
"loss": 1.0882,
"step": 36
},
{
"epoch": 0.0809406617445994,
"grad_norm": 0.02346004731953144,
"learning_rate": 0.0008043478260869566,
"loss": 0.9276,
"step": 37
},
{
"epoch": 0.08312824719715614,
"grad_norm": 0.0640161782503128,
"learning_rate": 0.0008260869565217392,
"loss": 0.9607,
"step": 38
},
{
"epoch": 0.08531583264971287,
"grad_norm": 0.01127663068473339,
"learning_rate": 0.0008478260869565217,
"loss": 0.7476,
"step": 39
},
{
"epoch": 0.08750341810226962,
"grad_norm": 0.020388390868902206,
"learning_rate": 0.0008695652173913044,
"loss": 0.9294,
"step": 40
},
{
"epoch": 0.08969100355482636,
"grad_norm": 0.011159627698361874,
"learning_rate": 0.0008913043478260869,
"loss": 0.7403,
"step": 41
},
{
"epoch": 0.0918785890073831,
"grad_norm": 0.01922360621392727,
"learning_rate": 0.0009130434782608696,
"loss": 0.9901,
"step": 42
},
{
"epoch": 0.09406617445993984,
"grad_norm": 0.022499792277812958,
"learning_rate": 0.0009347826086956521,
"loss": 0.8491,
"step": 43
},
{
"epoch": 0.09625375991249659,
"grad_norm": 0.019557347521185875,
"learning_rate": 0.0009565217391304347,
"loss": 0.8855,
"step": 44
},
{
"epoch": 0.09844134536505332,
"grad_norm": 0.023644007742404938,
"learning_rate": 0.0009782608695652175,
"loss": 0.6828,
"step": 45
},
{
"epoch": 0.10062893081761007,
"grad_norm": 0.0180030707269907,
"learning_rate": 0.001,
"loss": 1.0325,
"step": 46
},
{
"epoch": 0.1028165162701668,
"grad_norm": 0.026917221024632454,
"learning_rate": 0.0010217391304347826,
"loss": 1.0596,
"step": 47
},
{
"epoch": 0.10500410172272355,
"grad_norm": 0.016843197867274284,
"learning_rate": 0.0010434782608695651,
"loss": 0.6183,
"step": 48
},
{
"epoch": 0.10719168717528028,
"grad_norm": 0.0393221415579319,
"learning_rate": 0.001065217391304348,
"loss": 0.9009,
"step": 49
},
{
"epoch": 0.10937927262783702,
"grad_norm": 0.025003232061862946,
"learning_rate": 0.0010869565217391304,
"loss": 0.9701,
"step": 50
},
{
"epoch": 0.11156685808039377,
"grad_norm": 0.029358206316828728,
"learning_rate": 0.001108695652173913,
"loss": 0.8351,
"step": 51
},
{
"epoch": 0.1137544435329505,
"grad_norm": 0.02484138496220112,
"learning_rate": 0.0011304347826086958,
"loss": 0.906,
"step": 52
},
{
"epoch": 0.11594202898550725,
"grad_norm": 0.012963383458554745,
"learning_rate": 0.0011521739130434783,
"loss": 1.098,
"step": 53
},
{
"epoch": 0.11812961443806398,
"grad_norm": 0.022173907607793808,
"learning_rate": 0.0011739130434782609,
"loss": 0.9086,
"step": 54
},
{
"epoch": 0.12031719989062073,
"grad_norm": 0.018844394013285637,
"learning_rate": 0.0011956521739130434,
"loss": 0.8087,
"step": 55
},
{
"epoch": 0.12250478534317746,
"grad_norm": 0.01081649400293827,
"learning_rate": 0.0012173913043478262,
"loss": 0.6714,
"step": 56
},
{
"epoch": 0.12469237079573421,
"grad_norm": 0.012590788304805756,
"learning_rate": 0.0012391304347826087,
"loss": 0.8224,
"step": 57
},
{
"epoch": 0.12687995624829096,
"grad_norm": 0.007173096761107445,
"learning_rate": 0.0012608695652173913,
"loss": 0.9208,
"step": 58
},
{
"epoch": 0.1290675417008477,
"grad_norm": 0.023659205064177513,
"learning_rate": 0.001282608695652174,
"loss": 0.8385,
"step": 59
},
{
"epoch": 0.13125512715340443,
"grad_norm": 0.016100220382213593,
"learning_rate": 0.0013043478260869564,
"loss": 0.6562,
"step": 60
},
{
"epoch": 0.13344271260596116,
"grad_norm": 0.01680757850408554,
"learning_rate": 0.0013260869565217392,
"loss": 0.8996,
"step": 61
},
{
"epoch": 0.13563029805851792,
"grad_norm": 0.02230382151901722,
"learning_rate": 0.0013478260869565217,
"loss": 0.7927,
"step": 62
},
{
"epoch": 0.13781788351107466,
"grad_norm": 0.03682897984981537,
"learning_rate": 0.0013695652173913043,
"loss": 0.8507,
"step": 63
},
{
"epoch": 0.1400054689636314,
"grad_norm": 0.02873164229094982,
"learning_rate": 0.001391304347826087,
"loss": 0.7379,
"step": 64
},
{
"epoch": 0.14219305441618812,
"grad_norm": 0.08291471749544144,
"learning_rate": 0.0014130434782608696,
"loss": 0.9686,
"step": 65
},
{
"epoch": 0.1443806398687449,
"grad_norm": 0.005648148711770773,
"learning_rate": 0.0014347826086956522,
"loss": 0.9365,
"step": 66
},
{
"epoch": 0.14656822532130162,
"grad_norm": 0.010281619615852833,
"learning_rate": 0.0014565217391304347,
"loss": 0.7989,
"step": 67
},
{
"epoch": 0.14875581077385835,
"grad_norm": 0.010221214964985847,
"learning_rate": 0.0014782608695652175,
"loss": 0.7567,
"step": 68
},
{
"epoch": 0.1509433962264151,
"grad_norm": 0.008735002018511295,
"learning_rate": 0.0015,
"loss": 0.9644,
"step": 69
},
{
"epoch": 0.15313098167897182,
"grad_norm": 0.021798064932227135,
"learning_rate": 0.0015217391304347828,
"loss": 0.8451,
"step": 70
},
{
"epoch": 0.15531856713152858,
"grad_norm": 0.011285695247352123,
"learning_rate": 0.0015434782608695651,
"loss": 0.9334,
"step": 71
},
{
"epoch": 0.15750615258408532,
"grad_norm": 0.005321874748915434,
"learning_rate": 0.001565217391304348,
"loss": 0.9424,
"step": 72
},
{
"epoch": 0.15969373803664205,
"grad_norm": 0.010020822286605835,
"learning_rate": 0.0015869565217391305,
"loss": 0.7219,
"step": 73
},
{
"epoch": 0.1618813234891988,
"grad_norm": 0.008384345099329948,
"learning_rate": 0.0016086956521739132,
"loss": 0.8169,
"step": 74
},
{
"epoch": 0.16406890894175555,
"grad_norm": 0.010866906493902206,
"learning_rate": 0.0016304347826086956,
"loss": 0.7556,
"step": 75
},
{
"epoch": 0.16625649439431228,
"grad_norm": 0.01588907279074192,
"learning_rate": 0.0016521739130434783,
"loss": 0.766,
"step": 76
},
{
"epoch": 0.16844407984686902,
"grad_norm": 0.018410617485642433,
"learning_rate": 0.001673913043478261,
"loss": 0.8201,
"step": 77
},
{
"epoch": 0.17063166529942575,
"grad_norm": 0.01711914874613285,
"learning_rate": 0.0016956521739130434,
"loss": 1.0454,
"step": 78
},
{
"epoch": 0.1728192507519825,
"grad_norm": 0.040495071560144424,
"learning_rate": 0.001717391304347826,
"loss": 0.9048,
"step": 79
},
{
"epoch": 0.17500683620453925,
"grad_norm": 0.008844586089253426,
"learning_rate": 0.0017391304347826088,
"loss": 0.9716,
"step": 80
},
{
"epoch": 0.17719442165709598,
"grad_norm": 0.020504243671894073,
"learning_rate": 0.0017608695652173915,
"loss": 0.706,
"step": 81
},
{
"epoch": 0.1793820071096527,
"grad_norm": 0.005656155291944742,
"learning_rate": 0.0017826086956521739,
"loss": 0.8948,
"step": 82
},
{
"epoch": 0.18156959256220945,
"grad_norm": 0.011604691855609417,
"learning_rate": 0.0018043478260869566,
"loss": 0.9465,
"step": 83
},
{
"epoch": 0.1837571780147662,
"grad_norm": 0.004078141879290342,
"learning_rate": 0.0018260869565217392,
"loss": 0.9462,
"step": 84
},
{
"epoch": 0.18594476346732294,
"grad_norm": 0.008594767190515995,
"learning_rate": 0.0018478260869565217,
"loss": 0.9287,
"step": 85
},
{
"epoch": 0.18813234891987968,
"grad_norm": 0.008353278040885925,
"learning_rate": 0.0018695652173913043,
"loss": 0.8743,
"step": 86
},
{
"epoch": 0.1903199343724364,
"grad_norm": 0.010675789788365364,
"learning_rate": 0.001891304347826087,
"loss": 0.7836,
"step": 87
},
{
"epoch": 0.19250751982499317,
"grad_norm": 0.004695142153650522,
"learning_rate": 0.0019130434782608694,
"loss": 0.7587,
"step": 88
},
{
"epoch": 0.1946951052775499,
"grad_norm": 0.005462712608277798,
"learning_rate": 0.0019347826086956522,
"loss": 0.9685,
"step": 89
},
{
"epoch": 0.19688269073010664,
"grad_norm": 0.005652555730193853,
"learning_rate": 0.001956521739130435,
"loss": 0.7821,
"step": 90
},
{
"epoch": 0.19907027618266337,
"grad_norm": 0.0058873966336250305,
"learning_rate": 0.0019782608695652175,
"loss": 0.8027,
"step": 91
},
{
"epoch": 0.20125786163522014,
"grad_norm": 0.004672915209084749,
"learning_rate": 0.002,
"loss": 0.8367,
"step": 92
},
{
"epoch": 0.20344544708777687,
"grad_norm": 0.004535092506557703,
"learning_rate": 0.0020217391304347826,
"loss": 0.7124,
"step": 93
},
{
"epoch": 0.2056330325403336,
"grad_norm": 0.003576159942895174,
"learning_rate": 0.002043478260869565,
"loss": 0.8596,
"step": 94
},
{
"epoch": 0.20782061799289034,
"grad_norm": 0.005423200782388449,
"learning_rate": 0.0020652173913043477,
"loss": 0.8582,
"step": 95
},
{
"epoch": 0.2100082034454471,
"grad_norm": 0.00573402363806963,
"learning_rate": 0.0020869565217391303,
"loss": 0.6699,
"step": 96
},
{
"epoch": 0.21219578889800383,
"grad_norm": 0.004158989991992712,
"learning_rate": 0.0021086956521739132,
"loss": 0.9819,
"step": 97
},
{
"epoch": 0.21438337435056057,
"grad_norm": 0.009630310349166393,
"learning_rate": 0.002130434782608696,
"loss": 1.0568,
"step": 98
},
{
"epoch": 0.2165709598031173,
"grad_norm": 0.006703687831759453,
"learning_rate": 0.0021521739130434783,
"loss": 0.7342,
"step": 99
},
{
"epoch": 0.21875854525567404,
"grad_norm": 0.008040892891585827,
"learning_rate": 0.002173913043478261,
"loss": 0.8339,
"step": 100
},
{
"epoch": 0.2209461307082308,
"grad_norm": 0.00757247768342495,
"learning_rate": 0.0021956521739130434,
"loss": 0.934,
"step": 101
},
{
"epoch": 0.22313371616078753,
"grad_norm": 0.009981849230825901,
"learning_rate": 0.002217391304347826,
"loss": 0.9303,
"step": 102
},
{
"epoch": 0.22532130161334427,
"grad_norm": 0.007599060423672199,
"learning_rate": 0.0022391304347826086,
"loss": 0.7928,
"step": 103
},
{
"epoch": 0.227508887065901,
"grad_norm": 0.010048196651041508,
"learning_rate": 0.0022608695652173915,
"loss": 0.8811,
"step": 104
},
{
"epoch": 0.22969647251845776,
"grad_norm": 0.008185365237295628,
"learning_rate": 0.002282608695652174,
"loss": 0.8586,
"step": 105
},
{
"epoch": 0.2318840579710145,
"grad_norm": 0.02917146123945713,
"learning_rate": 0.0023043478260869566,
"loss": 0.8296,
"step": 106
},
{
"epoch": 0.23407164342357123,
"grad_norm": 0.03497055917978287,
"learning_rate": 0.002326086956521739,
"loss": 0.9071,
"step": 107
},
{
"epoch": 0.23625922887612796,
"grad_norm": 0.0127785699442029,
"learning_rate": 0.0023478260869565218,
"loss": 0.8497,
"step": 108
},
{
"epoch": 0.23844681432868473,
"grad_norm": 0.007704317104071379,
"learning_rate": 0.0023695652173913043,
"loss": 0.836,
"step": 109
},
{
"epoch": 0.24063439978124146,
"grad_norm": 0.007215225137770176,
"learning_rate": 0.002391304347826087,
"loss": 0.9434,
"step": 110
},
{
"epoch": 0.2428219852337982,
"grad_norm": 0.008061757311224937,
"learning_rate": 0.00241304347826087,
"loss": 0.9648,
"step": 111
},
{
"epoch": 0.24500957068635493,
"grad_norm": 0.007565490435808897,
"learning_rate": 0.0024347826086956524,
"loss": 0.9409,
"step": 112
},
{
"epoch": 0.2471971561389117,
"grad_norm": 0.00492995698004961,
"learning_rate": 0.002456521739130435,
"loss": 0.8753,
"step": 113
},
{
"epoch": 0.24938474159146842,
"grad_norm": 0.005053890403360128,
"learning_rate": 0.0024782608695652175,
"loss": 0.8239,
"step": 114
},
{
"epoch": 0.25157232704402516,
"grad_norm": 0.009602397680282593,
"learning_rate": 0.0025,
"loss": 0.7825,
"step": 115
},
{
"epoch": 0.2537599124965819,
"grad_norm": 0.004248041659593582,
"learning_rate": 0.0025217391304347826,
"loss": 0.7295,
"step": 116
},
{
"epoch": 0.2559474979491386,
"grad_norm": 0.009284190833568573,
"learning_rate": 0.002543478260869565,
"loss": 0.9205,
"step": 117
},
{
"epoch": 0.2581350834016954,
"grad_norm": 0.00780320493504405,
"learning_rate": 0.002565217391304348,
"loss": 0.8928,
"step": 118
},
{
"epoch": 0.2603226688542521,
"grad_norm": 0.014100235886871815,
"learning_rate": 0.0025869565217391307,
"loss": 1.0,
"step": 119
},
{
"epoch": 0.26251025430680885,
"grad_norm": 0.0031979018822312355,
"learning_rate": 0.002608695652173913,
"loss": 0.8661,
"step": 120
},
{
"epoch": 0.2646978397593656,
"grad_norm": 0.010853100568056107,
"learning_rate": 0.002630434782608696,
"loss": 0.7197,
"step": 121
},
{
"epoch": 0.2668854252119223,
"grad_norm": 0.00902815256267786,
"learning_rate": 0.0026521739130434784,
"loss": 0.8987,
"step": 122
},
{
"epoch": 0.2690730106644791,
"grad_norm": 0.006882428657263517,
"learning_rate": 0.002673913043478261,
"loss": 0.7676,
"step": 123
},
{
"epoch": 0.27126059611703585,
"grad_norm": 0.014947020448744297,
"learning_rate": 0.0026956521739130435,
"loss": 0.8857,
"step": 124
},
{
"epoch": 0.27344818156959255,
"grad_norm": 0.005454353056848049,
"learning_rate": 0.002717391304347826,
"loss": 0.7277,
"step": 125
},
{
"epoch": 0.2756357670221493,
"grad_norm": 0.0050047156400978565,
"learning_rate": 0.0027391304347826086,
"loss": 0.9257,
"step": 126
},
{
"epoch": 0.277823352474706,
"grad_norm": 0.008737878873944283,
"learning_rate": 0.002760869565217391,
"loss": 0.8598,
"step": 127
},
{
"epoch": 0.2800109379272628,
"grad_norm": 0.008086539804935455,
"learning_rate": 0.002782608695652174,
"loss": 0.9844,
"step": 128
},
{
"epoch": 0.28219852337981954,
"grad_norm": 0.01979847252368927,
"learning_rate": 0.0028043478260869567,
"loss": 0.9718,
"step": 129
},
{
"epoch": 0.28438610883237625,
"grad_norm": 0.016869032755494118,
"learning_rate": 0.002826086956521739,
"loss": 0.8311,
"step": 130
},
{
"epoch": 0.286573694284933,
"grad_norm": 0.008929664269089699,
"learning_rate": 0.0028478260869565218,
"loss": 0.6482,
"step": 131
},
{
"epoch": 0.2887612797374898,
"grad_norm": 0.013361562974750996,
"learning_rate": 0.0028695652173913043,
"loss": 0.991,
"step": 132
},
{
"epoch": 0.2909488651900465,
"grad_norm": 0.0223986953496933,
"learning_rate": 0.002891304347826087,
"loss": 0.8566,
"step": 133
},
{
"epoch": 0.29313645064260324,
"grad_norm": 0.00690645445138216,
"learning_rate": 0.0029130434782608694,
"loss": 0.7706,
"step": 134
},
{
"epoch": 0.29532403609515995,
"grad_norm": 0.007177585270255804,
"learning_rate": 0.0029347826086956524,
"loss": 0.7896,
"step": 135
},
{
"epoch": 0.2975116215477167,
"grad_norm": 0.024162376299500465,
"learning_rate": 0.002956521739130435,
"loss": 0.8316,
"step": 136
},
{
"epoch": 0.29969920700027347,
"grad_norm": 0.009236878715455532,
"learning_rate": 0.0029782608695652175,
"loss": 0.7563,
"step": 137
},
{
"epoch": 0.3018867924528302,
"grad_norm": 0.008098084479570389,
"learning_rate": 0.003,
"loss": 0.9313,
"step": 138
},
{
"epoch": 0.30407437790538694,
"grad_norm": 0.01629616692662239,
"learning_rate": 0.002997566909975669,
"loss": 0.7942,
"step": 139
},
{
"epoch": 0.30626196335794365,
"grad_norm": 0.013256334699690342,
"learning_rate": 0.0029951338199513382,
"loss": 0.819,
"step": 140
},
{
"epoch": 0.3084495488105004,
"grad_norm": 0.016614550724625587,
"learning_rate": 0.0029927007299270073,
"loss": 0.823,
"step": 141
},
{
"epoch": 0.31063713426305717,
"grad_norm": 0.015185157768428326,
"learning_rate": 0.0029902676399026764,
"loss": 0.8534,
"step": 142
},
{
"epoch": 0.3128247197156139,
"grad_norm": 0.012511268258094788,
"learning_rate": 0.0029878345498783455,
"loss": 1.0021,
"step": 143
},
{
"epoch": 0.31501230516817064,
"grad_norm": 0.03368072584271431,
"learning_rate": 0.0029854014598540146,
"loss": 0.9333,
"step": 144
},
{
"epoch": 0.3171998906207274,
"grad_norm": 0.014194028452038765,
"learning_rate": 0.0029829683698296837,
"loss": 0.7353,
"step": 145
},
{
"epoch": 0.3193874760732841,
"grad_norm": 0.022817425429821014,
"learning_rate": 0.002980535279805353,
"loss": 0.8653,
"step": 146
},
{
"epoch": 0.32157506152584087,
"grad_norm": 0.034395311027765274,
"learning_rate": 0.002978102189781022,
"loss": 0.7872,
"step": 147
},
{
"epoch": 0.3237626469783976,
"grad_norm": 0.04415661096572876,
"learning_rate": 0.002975669099756691,
"loss": 0.8668,
"step": 148
},
{
"epoch": 0.32595023243095433,
"grad_norm": 0.013315894640982151,
"learning_rate": 0.0029732360097323605,
"loss": 0.7012,
"step": 149
},
{
"epoch": 0.3281378178835111,
"grad_norm": 0.01931261457502842,
"learning_rate": 0.002970802919708029,
"loss": 0.6949,
"step": 150
},
{
"epoch": 0.3303254033360678,
"grad_norm": 0.01766936294734478,
"learning_rate": 0.0029683698296836983,
"loss": 1.0637,
"step": 151
},
{
"epoch": 0.33251298878862456,
"grad_norm": 0.04097762331366539,
"learning_rate": 0.002965936739659368,
"loss": 0.6945,
"step": 152
},
{
"epoch": 0.33470057424118127,
"grad_norm": 0.019335204735398293,
"learning_rate": 0.0029635036496350364,
"loss": 0.9677,
"step": 153
},
{
"epoch": 0.33688815969373803,
"grad_norm": 0.02775772474706173,
"learning_rate": 0.0029610705596107055,
"loss": 0.9445,
"step": 154
},
{
"epoch": 0.3390757451462948,
"grad_norm": 0.012738276273012161,
"learning_rate": 0.002958637469586375,
"loss": 0.7254,
"step": 155
},
{
"epoch": 0.3412633305988515,
"grad_norm": 0.025990145280957222,
"learning_rate": 0.0029562043795620437,
"loss": 0.7296,
"step": 156
},
{
"epoch": 0.34345091605140826,
"grad_norm": 0.08288227766752243,
"learning_rate": 0.002953771289537713,
"loss": 0.8595,
"step": 157
},
{
"epoch": 0.345638501503965,
"grad_norm": 0.05340643599629402,
"learning_rate": 0.002951338199513382,
"loss": 0.7425,
"step": 158
},
{
"epoch": 0.34782608695652173,
"grad_norm": 0.030417539179325104,
"learning_rate": 0.0029489051094890514,
"loss": 0.7976,
"step": 159
},
{
"epoch": 0.3500136724090785,
"grad_norm": 0.04232973977923393,
"learning_rate": 0.00294647201946472,
"loss": 0.8764,
"step": 160
},
{
"epoch": 0.3522012578616352,
"grad_norm": 0.025519737973809242,
"learning_rate": 0.002944038929440389,
"loss": 0.8247,
"step": 161
},
{
"epoch": 0.35438884331419196,
"grad_norm": 0.046103380620479584,
"learning_rate": 0.0029416058394160587,
"loss": 0.8432,
"step": 162
},
{
"epoch": 0.3565764287667487,
"grad_norm": 0.01843344047665596,
"learning_rate": 0.0029391727493917274,
"loss": 0.8721,
"step": 163
},
{
"epoch": 0.3587640142193054,
"grad_norm": 0.029839089140295982,
"learning_rate": 0.0029367396593673965,
"loss": 0.7955,
"step": 164
},
{
"epoch": 0.3609515996718622,
"grad_norm": 0.023799125105142593,
"learning_rate": 0.002934306569343066,
"loss": 0.8929,
"step": 165
},
{
"epoch": 0.3631391851244189,
"grad_norm": 0.01695132628083229,
"learning_rate": 0.0029318734793187346,
"loss": 0.8149,
"step": 166
},
{
"epoch": 0.36532677057697566,
"grad_norm": 0.01710570976138115,
"learning_rate": 0.0029294403892944037,
"loss": 0.9953,
"step": 167
},
{
"epoch": 0.3675143560295324,
"grad_norm": 0.008958813734352589,
"learning_rate": 0.0029270072992700733,
"loss": 0.6486,
"step": 168
},
{
"epoch": 0.3697019414820891,
"grad_norm": 0.02033080905675888,
"learning_rate": 0.002924574209245742,
"loss": 0.7759,
"step": 169
},
{
"epoch": 0.3718895269346459,
"grad_norm": 0.01737876608967781,
"learning_rate": 0.002922141119221411,
"loss": 0.7998,
"step": 170
},
{
"epoch": 0.37407711238720265,
"grad_norm": 0.011925026774406433,
"learning_rate": 0.0029197080291970805,
"loss": 0.6405,
"step": 171
},
{
"epoch": 0.37626469783975935,
"grad_norm": 0.010621492750942707,
"learning_rate": 0.0029172749391727496,
"loss": 0.735,
"step": 172
},
{
"epoch": 0.3784522832923161,
"grad_norm": 0.02744341269135475,
"learning_rate": 0.0029148418491484183,
"loss": 0.9386,
"step": 173
},
{
"epoch": 0.3806398687448728,
"grad_norm": 0.010641987435519695,
"learning_rate": 0.002912408759124088,
"loss": 0.6368,
"step": 174
},
{
"epoch": 0.3828274541974296,
"grad_norm": 0.016506191343069077,
"learning_rate": 0.002909975669099757,
"loss": 0.7212,
"step": 175
},
{
"epoch": 0.38501503964998635,
"grad_norm": 0.029457593336701393,
"learning_rate": 0.0029075425790754256,
"loss": 0.8386,
"step": 176
},
{
"epoch": 0.38720262510254305,
"grad_norm": 0.008680049329996109,
"learning_rate": 0.002905109489051095,
"loss": 0.8827,
"step": 177
},
{
"epoch": 0.3893902105550998,
"grad_norm": 0.029479682445526123,
"learning_rate": 0.002902676399026764,
"loss": 0.797,
"step": 178
},
{
"epoch": 0.3915777960076566,
"grad_norm": 0.01670117862522602,
"learning_rate": 0.002900243309002433,
"loss": 0.7164,
"step": 179
},
{
"epoch": 0.3937653814602133,
"grad_norm": 0.019070839509367943,
"learning_rate": 0.0028978102189781024,
"loss": 0.7425,
"step": 180
},
{
"epoch": 0.39595296691277004,
"grad_norm": 0.010363463312387466,
"learning_rate": 0.0028953771289537715,
"loss": 0.9502,
"step": 181
},
{
"epoch": 0.39814055236532675,
"grad_norm": 0.02518656477332115,
"learning_rate": 0.0028929440389294406,
"loss": 0.7689,
"step": 182
},
{
"epoch": 0.4003281378178835,
"grad_norm": 0.014663388952612877,
"learning_rate": 0.0028905109489051097,
"loss": 0.8839,
"step": 183
},
{
"epoch": 0.4025157232704403,
"grad_norm": 0.009784224443137646,
"learning_rate": 0.0028880778588807787,
"loss": 0.82,
"step": 184
},
{
"epoch": 0.404703308722997,
"grad_norm": 0.02763255313038826,
"learning_rate": 0.002885644768856448,
"loss": 0.8051,
"step": 185
},
{
"epoch": 0.40689089417555374,
"grad_norm": 0.023367729038000107,
"learning_rate": 0.002883211678832117,
"loss": 0.7191,
"step": 186
},
{
"epoch": 0.40907847962811045,
"grad_norm": 0.025467796251177788,
"learning_rate": 0.002880778588807786,
"loss": 0.7385,
"step": 187
},
{
"epoch": 0.4112660650806672,
"grad_norm": 0.03302817419171333,
"learning_rate": 0.002878345498783455,
"loss": 0.6099,
"step": 188
},
{
"epoch": 0.41345365053322397,
"grad_norm": 0.016808858141303062,
"learning_rate": 0.002875912408759124,
"loss": 0.687,
"step": 189
},
{
"epoch": 0.4156412359857807,
"grad_norm": 0.030584512278437614,
"learning_rate": 0.0028734793187347933,
"loss": 0.7922,
"step": 190
},
{
"epoch": 0.41782882143833744,
"grad_norm": 0.05187975615262985,
"learning_rate": 0.0028710462287104624,
"loss": 0.8282,
"step": 191
},
{
"epoch": 0.4200164068908942,
"grad_norm": 0.03264329209923744,
"learning_rate": 0.0028686131386861315,
"loss": 0.542,
"step": 192
},
{
"epoch": 0.4222039923434509,
"grad_norm": 0.08889129012823105,
"learning_rate": 0.0028661800486618006,
"loss": 0.7642,
"step": 193
},
{
"epoch": 0.42439157779600767,
"grad_norm": 0.017528299242258072,
"learning_rate": 0.0028637469586374697,
"loss": 1.0019,
"step": 194
},
{
"epoch": 0.4265791632485644,
"grad_norm": 0.042831018567085266,
"learning_rate": 0.0028613138686131388,
"loss": 0.7849,
"step": 195
},
{
"epoch": 0.42876674870112114,
"grad_norm": 0.06844168901443481,
"learning_rate": 0.002858880778588808,
"loss": 0.8431,
"step": 196
},
{
"epoch": 0.4309543341536779,
"grad_norm": 0.056285906583070755,
"learning_rate": 0.002856447688564477,
"loss": 0.766,
"step": 197
},
{
"epoch": 0.4331419196062346,
"grad_norm": 0.03165756165981293,
"learning_rate": 0.002854014598540146,
"loss": 0.526,
"step": 198
},
{
"epoch": 0.43532950505879137,
"grad_norm": 0.01906641758978367,
"learning_rate": 0.002851581508515815,
"loss": 0.634,
"step": 199
},
{
"epoch": 0.4375170905113481,
"grad_norm": 0.03528127446770668,
"learning_rate": 0.0028491484184914842,
"loss": 0.7152,
"step": 200
},
{
"epoch": 0.43970467596390483,
"grad_norm": 0.03441726043820381,
"learning_rate": 0.0028467153284671533,
"loss": 0.807,
"step": 201
},
{
"epoch": 0.4418922614164616,
"grad_norm": 0.07585262507200241,
"learning_rate": 0.0028442822384428224,
"loss": 0.8068,
"step": 202
},
{
"epoch": 0.4440798468690183,
"grad_norm": 0.04637427628040314,
"learning_rate": 0.0028418491484184915,
"loss": 0.6726,
"step": 203
},
{
"epoch": 0.44626743232157506,
"grad_norm": 0.014708532020449638,
"learning_rate": 0.0028394160583941606,
"loss": 0.7633,
"step": 204
},
{
"epoch": 0.4484550177741318,
"grad_norm": 0.06609700620174408,
"learning_rate": 0.0028369829683698297,
"loss": 0.9395,
"step": 205
},
{
"epoch": 0.45064260322668853,
"grad_norm": 0.014884551987051964,
"learning_rate": 0.0028345498783454988,
"loss": 0.7629,
"step": 206
},
{
"epoch": 0.4528301886792453,
"grad_norm": 0.02310200408101082,
"learning_rate": 0.002832116788321168,
"loss": 0.6696,
"step": 207
},
{
"epoch": 0.455017774131802,
"grad_norm": 0.020516803488135338,
"learning_rate": 0.002829683698296837,
"loss": 0.6966,
"step": 208
},
{
"epoch": 0.45720535958435876,
"grad_norm": 0.018198775127530098,
"learning_rate": 0.002827250608272506,
"loss": 0.936,
"step": 209
},
{
"epoch": 0.4593929450369155,
"grad_norm": 0.032083529978990555,
"learning_rate": 0.002824817518248175,
"loss": 0.853,
"step": 210
},
{
"epoch": 0.46158053048947223,
"grad_norm": 0.01605304516851902,
"learning_rate": 0.0028223844282238442,
"loss": 0.8602,
"step": 211
},
{
"epoch": 0.463768115942029,
"grad_norm": 0.024932844564318657,
"learning_rate": 0.0028199513381995133,
"loss": 0.9888,
"step": 212
},
{
"epoch": 0.46595570139458575,
"grad_norm": 0.04917526990175247,
"learning_rate": 0.0028175182481751824,
"loss": 0.7155,
"step": 213
},
{
"epoch": 0.46814328684714246,
"grad_norm": 0.017666855826973915,
"learning_rate": 0.002815085158150852,
"loss": 0.7597,
"step": 214
},
{
"epoch": 0.4703308722996992,
"grad_norm": 0.06158105283975601,
"learning_rate": 0.0028126520681265206,
"loss": 0.808,
"step": 215
},
{
"epoch": 0.4725184577522559,
"grad_norm": 0.028100378811359406,
"learning_rate": 0.0028102189781021897,
"loss": 0.8217,
"step": 216
},
{
"epoch": 0.4747060432048127,
"grad_norm": 0.02049509435892105,
"learning_rate": 0.0028077858880778592,
"loss": 0.8441,
"step": 217
},
{
"epoch": 0.47689362865736945,
"grad_norm": 0.018524937331676483,
"learning_rate": 0.002805352798053528,
"loss": 0.7565,
"step": 218
},
{
"epoch": 0.47908121410992616,
"grad_norm": 0.017941996455192566,
"learning_rate": 0.002802919708029197,
"loss": 0.7098,
"step": 219
},
{
"epoch": 0.4812687995624829,
"grad_norm": 0.042154472321271896,
"learning_rate": 0.0028004866180048665,
"loss": 0.8742,
"step": 220
},
{
"epoch": 0.4834563850150396,
"grad_norm": 0.026872573420405388,
"learning_rate": 0.002798053527980535,
"loss": 0.7273,
"step": 221
},
{
"epoch": 0.4856439704675964,
"grad_norm": 0.02051514759659767,
"learning_rate": 0.0027956204379562043,
"loss": 0.795,
"step": 222
},
{
"epoch": 0.48783155592015315,
"grad_norm": 0.02145540714263916,
"learning_rate": 0.0027931873479318738,
"loss": 0.8192,
"step": 223
},
{
"epoch": 0.49001914137270985,
"grad_norm": 0.04769520461559296,
"learning_rate": 0.002790754257907543,
"loss": 0.8592,
"step": 224
},
{
"epoch": 0.4922067268252666,
"grad_norm": 0.01415792852640152,
"learning_rate": 0.0027883211678832115,
"loss": 0.7553,
"step": 225
},
{
"epoch": 0.4943943122778234,
"grad_norm": 0.012172535061836243,
"learning_rate": 0.002785888077858881,
"loss": 0.739,
"step": 226
},
{
"epoch": 0.4965818977303801,
"grad_norm": 0.055700596421957016,
"learning_rate": 0.00278345498783455,
"loss": 0.9405,
"step": 227
},
{
"epoch": 0.49876948318293685,
"grad_norm": 0.025790488347411156,
"learning_rate": 0.002781021897810219,
"loss": 0.6641,
"step": 228
},
{
"epoch": 0.5009570686354936,
"grad_norm": 0.013937574811279774,
"learning_rate": 0.0027785888077858883,
"loss": 0.7727,
"step": 229
},
{
"epoch": 0.5031446540880503,
"grad_norm": 0.03238683566451073,
"learning_rate": 0.0027761557177615574,
"loss": 0.8109,
"step": 230
},
{
"epoch": 0.505332239540607,
"grad_norm": 0.06841892749071121,
"learning_rate": 0.002773722627737226,
"loss": 0.7827,
"step": 231
},
{
"epoch": 0.5075198249931638,
"grad_norm": 0.05782823637127876,
"learning_rate": 0.002771289537712895,
"loss": 0.9616,
"step": 232
},
{
"epoch": 0.5097074104457205,
"grad_norm": 0.1389644742012024,
"learning_rate": 0.0027688564476885647,
"loss": 0.7447,
"step": 233
},
{
"epoch": 0.5118949958982772,
"grad_norm": 0.07213829457759857,
"learning_rate": 0.002766423357664234,
"loss": 0.8738,
"step": 234
},
{
"epoch": 0.5140825813508341,
"grad_norm": 0.03161882609128952,
"learning_rate": 0.0027639902676399025,
"loss": 0.5307,
"step": 235
},
{
"epoch": 0.5162701668033908,
"grad_norm": 0.03051130659878254,
"learning_rate": 0.002761557177615572,
"loss": 0.7123,
"step": 236
},
{
"epoch": 0.5184577522559475,
"grad_norm": 0.02562803030014038,
"learning_rate": 0.002759124087591241,
"loss": 0.8167,
"step": 237
},
{
"epoch": 0.5206453377085042,
"grad_norm": 0.03016614355146885,
"learning_rate": 0.0027566909975669097,
"loss": 0.6904,
"step": 238
},
{
"epoch": 0.522832923161061,
"grad_norm": 0.01147315464913845,
"learning_rate": 0.0027542579075425793,
"loss": 0.7007,
"step": 239
},
{
"epoch": 0.5250205086136177,
"grad_norm": 0.017779918387532234,
"learning_rate": 0.0027518248175182483,
"loss": 0.9054,
"step": 240
},
{
"epoch": 0.5272080940661744,
"grad_norm": 0.03238027170300484,
"learning_rate": 0.002749391727493917,
"loss": 0.7599,
"step": 241
},
{
"epoch": 0.5293956795187312,
"grad_norm": 0.007716326508671045,
"learning_rate": 0.0027469586374695865,
"loss": 0.7561,
"step": 242
},
{
"epoch": 0.5315832649712879,
"grad_norm": 0.028708985075354576,
"learning_rate": 0.0027445255474452556,
"loss": 0.6842,
"step": 243
},
{
"epoch": 0.5337708504238446,
"grad_norm": 0.021554840728640556,
"learning_rate": 0.0027420924574209247,
"loss": 0.9046,
"step": 244
},
{
"epoch": 0.5359584358764015,
"grad_norm": 0.010056296363472939,
"learning_rate": 0.002739659367396594,
"loss": 0.7747,
"step": 245
},
{
"epoch": 0.5381460213289582,
"grad_norm": 0.014583374373614788,
"learning_rate": 0.002737226277372263,
"loss": 0.8104,
"step": 246
},
{
"epoch": 0.5403336067815149,
"grad_norm": 0.10760743170976639,
"learning_rate": 0.002734793187347932,
"loss": 1.0181,
"step": 247
},
{
"epoch": 0.5425211922340717,
"grad_norm": 0.030982421711087227,
"learning_rate": 0.002732360097323601,
"loss": 0.7125,
"step": 248
},
{
"epoch": 0.5447087776866284,
"grad_norm": 0.017710238695144653,
"learning_rate": 0.00272992700729927,
"loss": 0.9256,
"step": 249
},
{
"epoch": 0.5468963631391851,
"grad_norm": 0.027831239625811577,
"learning_rate": 0.0027274939172749393,
"loss": 0.7537,
"step": 250
},
{
"epoch": 0.5490839485917418,
"grad_norm": 0.019798962399363518,
"learning_rate": 0.0027250608272506084,
"loss": 0.6165,
"step": 251
},
{
"epoch": 0.5512715340442986,
"grad_norm": 0.00836907234042883,
"learning_rate": 0.0027226277372262775,
"loss": 0.7968,
"step": 252
},
{
"epoch": 0.5534591194968553,
"grad_norm": 0.018117599189281464,
"learning_rate": 0.0027201946472019465,
"loss": 0.6087,
"step": 253
},
{
"epoch": 0.555646704949412,
"grad_norm": 0.017056763172149658,
"learning_rate": 0.0027177615571776156,
"loss": 0.7837,
"step": 254
},
{
"epoch": 0.5578342904019689,
"grad_norm": 0.009035620838403702,
"learning_rate": 0.0027153284671532847,
"loss": 0.6376,
"step": 255
},
{
"epoch": 0.5600218758545256,
"grad_norm": 0.015250611118972301,
"learning_rate": 0.002712895377128954,
"loss": 0.7869,
"step": 256
},
{
"epoch": 0.5622094613070823,
"grad_norm": 0.014554915949702263,
"learning_rate": 0.002710462287104623,
"loss": 0.9046,
"step": 257
},
{
"epoch": 0.5643970467596391,
"grad_norm": 0.011779931373894215,
"learning_rate": 0.002708029197080292,
"loss": 0.8662,
"step": 258
},
{
"epoch": 0.5665846322121958,
"grad_norm": 0.012663912028074265,
"learning_rate": 0.002705596107055961,
"loss": 1.3081,
"step": 259
},
{
"epoch": 0.5687722176647525,
"grad_norm": 0.0059722489677369595,
"learning_rate": 0.00270316301703163,
"loss": 0.6796,
"step": 260
},
{
"epoch": 0.5709598031173093,
"grad_norm": 0.03664208948612213,
"learning_rate": 0.0027007299270072993,
"loss": 0.9093,
"step": 261
},
{
"epoch": 0.573147388569866,
"grad_norm": 0.042986199259757996,
"learning_rate": 0.0026982968369829684,
"loss": 0.9444,
"step": 262
},
{
"epoch": 0.5753349740224227,
"grad_norm": 0.012048511765897274,
"learning_rate": 0.0026958637469586375,
"loss": 0.8134,
"step": 263
},
{
"epoch": 0.5775225594749795,
"grad_norm": 0.012062503024935722,
"learning_rate": 0.0026934306569343066,
"loss": 0.7274,
"step": 264
},
{
"epoch": 0.5797101449275363,
"grad_norm": 0.02607789821922779,
"learning_rate": 0.0026909975669099757,
"loss": 0.6531,
"step": 265
},
{
"epoch": 0.581897730380093,
"grad_norm": 0.014329343102872372,
"learning_rate": 0.002688564476885645,
"loss": 0.6966,
"step": 266
},
{
"epoch": 0.5840853158326497,
"grad_norm": 0.013629244640469551,
"learning_rate": 0.002686131386861314,
"loss": 0.7831,
"step": 267
},
{
"epoch": 0.5862729012852065,
"grad_norm": 0.009315542876720428,
"learning_rate": 0.002683698296836983,
"loss": 0.6297,
"step": 268
},
{
"epoch": 0.5884604867377632,
"grad_norm": 0.051916949450969696,
"learning_rate": 0.002681265206812652,
"loss": 0.7651,
"step": 269
},
{
"epoch": 0.5906480721903199,
"grad_norm": 0.012272450141608715,
"learning_rate": 0.002678832116788321,
"loss": 0.6713,
"step": 270
},
{
"epoch": 0.5928356576428767,
"grad_norm": 0.011517216451466084,
"learning_rate": 0.00267639902676399,
"loss": 0.6117,
"step": 271
},
{
"epoch": 0.5950232430954334,
"grad_norm": 0.010973330587148666,
"learning_rate": 0.0026739659367396593,
"loss": 0.7631,
"step": 272
},
{
"epoch": 0.5972108285479901,
"grad_norm": 0.06580788642168045,
"learning_rate": 0.0026715328467153284,
"loss": 0.9153,
"step": 273
},
{
"epoch": 0.5993984140005469,
"grad_norm": 0.011350773274898529,
"learning_rate": 0.0026690997566909975,
"loss": 0.8094,
"step": 274
},
{
"epoch": 0.6015859994531036,
"grad_norm": 0.019090717658400536,
"learning_rate": 0.0026666666666666666,
"loss": 0.9304,
"step": 275
},
{
"epoch": 0.6037735849056604,
"grad_norm": 0.015177314169704914,
"learning_rate": 0.002664233576642336,
"loss": 0.6859,
"step": 276
},
{
"epoch": 0.6059611703582172,
"grad_norm": 0.020254317671060562,
"learning_rate": 0.0026618004866180048,
"loss": 0.8386,
"step": 277
},
{
"epoch": 0.6081487558107739,
"grad_norm": 0.014171348884701729,
"learning_rate": 0.002659367396593674,
"loss": 0.8112,
"step": 278
},
{
"epoch": 0.6103363412633306,
"grad_norm": 0.00894536729902029,
"learning_rate": 0.0026569343065693434,
"loss": 0.6877,
"step": 279
},
{
"epoch": 0.6125239267158873,
"grad_norm": 0.011850811541080475,
"learning_rate": 0.002654501216545012,
"loss": 0.8639,
"step": 280
},
{
"epoch": 0.6147115121684441,
"grad_norm": 0.012202342972159386,
"learning_rate": 0.002652068126520681,
"loss": 0.7851,
"step": 281
},
{
"epoch": 0.6168990976210008,
"grad_norm": 0.014019378460943699,
"learning_rate": 0.0026496350364963507,
"loss": 0.945,
"step": 282
},
{
"epoch": 0.6190866830735575,
"grad_norm": 0.013264323584735394,
"learning_rate": 0.0026472019464720193,
"loss": 0.6363,
"step": 283
},
{
"epoch": 0.6212742685261143,
"grad_norm": 0.010803530924022198,
"learning_rate": 0.0026447688564476884,
"loss": 0.7855,
"step": 284
},
{
"epoch": 0.623461853978671,
"grad_norm": 0.015852496027946472,
"learning_rate": 0.002642335766423358,
"loss": 0.6334,
"step": 285
},
{
"epoch": 0.6256494394312277,
"grad_norm": 0.023904947564005852,
"learning_rate": 0.002639902676399027,
"loss": 0.5551,
"step": 286
},
{
"epoch": 0.6278370248837846,
"grad_norm": 0.00868566520512104,
"learning_rate": 0.0026374695863746957,
"loss": 0.9256,
"step": 287
},
{
"epoch": 0.6300246103363413,
"grad_norm": 0.011297028511762619,
"learning_rate": 0.002635036496350365,
"loss": 0.7896,
"step": 288
},
{
"epoch": 0.632212195788898,
"grad_norm": 0.01018528826534748,
"learning_rate": 0.0026326034063260343,
"loss": 0.8198,
"step": 289
},
{
"epoch": 0.6343997812414548,
"grad_norm": 0.015003956854343414,
"learning_rate": 0.002630170316301703,
"loss": 0.7424,
"step": 290
},
{
"epoch": 0.6365873666940115,
"grad_norm": 0.007440235000103712,
"learning_rate": 0.0026277372262773725,
"loss": 0.6904,
"step": 291
},
{
"epoch": 0.6387749521465682,
"grad_norm": 0.014310602098703384,
"learning_rate": 0.0026253041362530416,
"loss": 0.7179,
"step": 292
},
{
"epoch": 0.6409625375991249,
"grad_norm": 0.008294426836073399,
"learning_rate": 0.0026228710462287102,
"loss": 0.827,
"step": 293
},
{
"epoch": 0.6431501230516817,
"grad_norm": 0.006840107962489128,
"learning_rate": 0.0026204379562043798,
"loss": 0.6749,
"step": 294
},
{
"epoch": 0.6453377085042384,
"grad_norm": 0.008538591675460339,
"learning_rate": 0.002618004866180049,
"loss": 0.7467,
"step": 295
},
{
"epoch": 0.6475252939567951,
"grad_norm": 0.007157974410802126,
"learning_rate": 0.0026155717761557175,
"loss": 0.7233,
"step": 296
},
{
"epoch": 0.649712879409352,
"grad_norm": 0.030327659100294113,
"learning_rate": 0.002613138686131387,
"loss": 0.6642,
"step": 297
},
{
"epoch": 0.6519004648619087,
"grad_norm": 0.012880248948931694,
"learning_rate": 0.002610705596107056,
"loss": 0.9694,
"step": 298
},
{
"epoch": 0.6540880503144654,
"grad_norm": 0.014233557507395744,
"learning_rate": 0.0026082725060827252,
"loss": 0.7686,
"step": 299
},
{
"epoch": 0.6562756357670222,
"grad_norm": 0.008432603441178799,
"learning_rate": 0.0026058394160583943,
"loss": 0.9355,
"step": 300
},
{
"epoch": 0.6584632212195789,
"grad_norm": 0.009492720477283001,
"learning_rate": 0.0026034063260340634,
"loss": 0.7637,
"step": 301
},
{
"epoch": 0.6606508066721356,
"grad_norm": 0.008224152028560638,
"learning_rate": 0.0026009732360097325,
"loss": 0.7609,
"step": 302
},
{
"epoch": 0.6628383921246924,
"grad_norm": 0.011647099629044533,
"learning_rate": 0.0025985401459854016,
"loss": 0.6565,
"step": 303
},
{
"epoch": 0.6650259775772491,
"grad_norm": 0.0120640117675066,
"learning_rate": 0.0025961070559610707,
"loss": 0.6751,
"step": 304
},
{
"epoch": 0.6672135630298058,
"grad_norm": 0.014007077552378178,
"learning_rate": 0.0025936739659367398,
"loss": 0.8132,
"step": 305
},
{
"epoch": 0.6694011484823625,
"grad_norm": 0.014167044311761856,
"learning_rate": 0.002591240875912409,
"loss": 0.8102,
"step": 306
},
{
"epoch": 0.6715887339349194,
"grad_norm": 0.016142327338457108,
"learning_rate": 0.002588807785888078,
"loss": 0.8004,
"step": 307
},
{
"epoch": 0.6737763193874761,
"grad_norm": 0.007279639132320881,
"learning_rate": 0.002586374695863747,
"loss": 0.732,
"step": 308
},
{
"epoch": 0.6759639048400328,
"grad_norm": 0.011619196273386478,
"learning_rate": 0.002583941605839416,
"loss": 0.603,
"step": 309
},
{
"epoch": 0.6781514902925896,
"grad_norm": 0.011564897373318672,
"learning_rate": 0.0025815085158150852,
"loss": 0.9163,
"step": 310
},
{
"epoch": 0.6803390757451463,
"grad_norm": 0.010117938742041588,
"learning_rate": 0.0025790754257907543,
"loss": 0.8683,
"step": 311
},
{
"epoch": 0.682526661197703,
"grad_norm": 0.017769185826182365,
"learning_rate": 0.0025766423357664234,
"loss": 0.6244,
"step": 312
},
{
"epoch": 0.6847142466502598,
"grad_norm": 0.012199788354337215,
"learning_rate": 0.0025742092457420925,
"loss": 0.7076,
"step": 313
},
{
"epoch": 0.6869018321028165,
"grad_norm": 0.008083075284957886,
"learning_rate": 0.0025717761557177616,
"loss": 0.8658,
"step": 314
},
{
"epoch": 0.6890894175553732,
"grad_norm": 0.01086794026196003,
"learning_rate": 0.0025693430656934307,
"loss": 0.6941,
"step": 315
},
{
"epoch": 0.69127700300793,
"grad_norm": 0.010161925107240677,
"learning_rate": 0.0025669099756691,
"loss": 0.6715,
"step": 316
},
{
"epoch": 0.6934645884604868,
"grad_norm": 0.008891239762306213,
"learning_rate": 0.002564476885644769,
"loss": 0.8093,
"step": 317
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.018787039443850517,
"learning_rate": 0.002562043795620438,
"loss": 0.8482,
"step": 318
},
{
"epoch": 0.6978397593656002,
"grad_norm": 0.02541973814368248,
"learning_rate": 0.002559610705596107,
"loss": 0.76,
"step": 319
},
{
"epoch": 0.700027344818157,
"grad_norm": 0.011948470957577229,
"learning_rate": 0.002557177615571776,
"loss": 0.7625,
"step": 320
},
{
"epoch": 0.7022149302707137,
"grad_norm": 0.009559310041368008,
"learning_rate": 0.0025547445255474453,
"loss": 0.7354,
"step": 321
},
{
"epoch": 0.7044025157232704,
"grad_norm": 0.008267502300441265,
"learning_rate": 0.0025523114355231144,
"loss": 0.7065,
"step": 322
},
{
"epoch": 0.7065901011758272,
"grad_norm": 0.010692731477320194,
"learning_rate": 0.0025498783454987834,
"loss": 0.983,
"step": 323
},
{
"epoch": 0.7087776866283839,
"grad_norm": 0.0124723045155406,
"learning_rate": 0.0025474452554744525,
"loss": 0.6154,
"step": 324
},
{
"epoch": 0.7109652720809406,
"grad_norm": 0.015448692254722118,
"learning_rate": 0.0025450121654501216,
"loss": 0.6129,
"step": 325
},
{
"epoch": 0.7131528575334974,
"grad_norm": 0.013601388782262802,
"learning_rate": 0.0025425790754257907,
"loss": 0.7214,
"step": 326
},
{
"epoch": 0.7153404429860541,
"grad_norm": 0.012070258148014545,
"learning_rate": 0.00254014598540146,
"loss": 0.7077,
"step": 327
},
{
"epoch": 0.7175280284386109,
"grad_norm": 0.05267300084233284,
"learning_rate": 0.0025377128953771293,
"loss": 0.7714,
"step": 328
},
{
"epoch": 0.7197156138911677,
"grad_norm": 0.012087949551641941,
"learning_rate": 0.002535279805352798,
"loss": 0.9047,
"step": 329
},
{
"epoch": 0.7219031993437244,
"grad_norm": 0.01940520666539669,
"learning_rate": 0.002532846715328467,
"loss": 0.7804,
"step": 330
},
{
"epoch": 0.7240907847962811,
"grad_norm": 0.011884646490216255,
"learning_rate": 0.0025304136253041366,
"loss": 0.6859,
"step": 331
},
{
"epoch": 0.7262783702488378,
"grad_norm": 0.02514353021979332,
"learning_rate": 0.0025279805352798053,
"loss": 0.7764,
"step": 332
},
{
"epoch": 0.7284659557013946,
"grad_norm": 0.015074629336595535,
"learning_rate": 0.0025255474452554744,
"loss": 0.6756,
"step": 333
},
{
"epoch": 0.7306535411539513,
"grad_norm": 0.036420077085494995,
"learning_rate": 0.002523114355231144,
"loss": 0.7407,
"step": 334
},
{
"epoch": 0.732841126606508,
"grad_norm": 0.015621097758412361,
"learning_rate": 0.0025206812652068126,
"loss": 0.8072,
"step": 335
},
{
"epoch": 0.7350287120590648,
"grad_norm": 0.010994632728397846,
"learning_rate": 0.0025182481751824816,
"loss": 0.9436,
"step": 336
},
{
"epoch": 0.7372162975116215,
"grad_norm": 0.017064619809389114,
"learning_rate": 0.002515815085158151,
"loss": 0.9386,
"step": 337
},
{
"epoch": 0.7394038829641782,
"grad_norm": 0.023198846727609634,
"learning_rate": 0.00251338199513382,
"loss": 0.7892,
"step": 338
},
{
"epoch": 0.7415914684167351,
"grad_norm": 0.005636582616716623,
"learning_rate": 0.002510948905109489,
"loss": 0.8005,
"step": 339
},
{
"epoch": 0.7437790538692918,
"grad_norm": 0.008022590540349483,
"learning_rate": 0.0025085158150851584,
"loss": 0.9142,
"step": 340
},
{
"epoch": 0.7459666393218485,
"grad_norm": 0.013106726109981537,
"learning_rate": 0.0025060827250608275,
"loss": 0.6845,
"step": 341
},
{
"epoch": 0.7481542247744053,
"grad_norm": 0.015878600999712944,
"learning_rate": 0.002503649635036496,
"loss": 0.8528,
"step": 342
},
{
"epoch": 0.750341810226962,
"grad_norm": 0.013783195056021214,
"learning_rate": 0.0025012165450121657,
"loss": 0.8487,
"step": 343
},
{
"epoch": 0.7525293956795187,
"grad_norm": 0.05050954222679138,
"learning_rate": 0.002498783454987835,
"loss": 0.9014,
"step": 344
},
{
"epoch": 0.7547169811320755,
"grad_norm": 0.009747706353664398,
"learning_rate": 0.0024963503649635035,
"loss": 0.8331,
"step": 345
},
{
"epoch": 0.7569045665846322,
"grad_norm": 0.27641791105270386,
"learning_rate": 0.0024939172749391726,
"loss": 0.8328,
"step": 346
},
{
"epoch": 0.7590921520371889,
"grad_norm": 0.022615063935518265,
"learning_rate": 0.002491484184914842,
"loss": 1.025,
"step": 347
},
{
"epoch": 0.7612797374897456,
"grad_norm": 0.018037477508187294,
"learning_rate": 0.0024890510948905108,
"loss": 0.8058,
"step": 348
},
{
"epoch": 0.7634673229423025,
"grad_norm": 0.03229966387152672,
"learning_rate": 0.00248661800486618,
"loss": 0.8224,
"step": 349
},
{
"epoch": 0.7656549083948592,
"grad_norm": 0.03468572720885277,
"learning_rate": 0.0024841849148418494,
"loss": 0.6558,
"step": 350
},
{
"epoch": 0.7678424938474159,
"grad_norm": 0.04352645203471184,
"learning_rate": 0.0024817518248175185,
"loss": 0.7869,
"step": 351
},
{
"epoch": 0.7700300792999727,
"grad_norm": 0.0520501509308815,
"learning_rate": 0.002479318734793187,
"loss": 0.8318,
"step": 352
},
{
"epoch": 0.7722176647525294,
"grad_norm": 0.025180073454976082,
"learning_rate": 0.0024768856447688566,
"loss": 0.8454,
"step": 353
},
{
"epoch": 0.7744052502050861,
"grad_norm": 0.013843162916600704,
"learning_rate": 0.0024744525547445257,
"loss": 0.979,
"step": 354
},
{
"epoch": 0.7765928356576429,
"grad_norm": 0.026960408315062523,
"learning_rate": 0.0024720194647201944,
"loss": 0.7692,
"step": 355
},
{
"epoch": 0.7787804211101996,
"grad_norm": 0.02509387582540512,
"learning_rate": 0.002469586374695864,
"loss": 0.7471,
"step": 356
},
{
"epoch": 0.7809680065627563,
"grad_norm": 0.014011479914188385,
"learning_rate": 0.002467153284671533,
"loss": 0.7752,
"step": 357
},
{
"epoch": 0.7831555920153132,
"grad_norm": 0.01862008310854435,
"learning_rate": 0.0024647201946472017,
"loss": 0.9891,
"step": 358
},
{
"epoch": 0.7853431774678699,
"grad_norm": 0.01249686349183321,
"learning_rate": 0.002462287104622871,
"loss": 0.9046,
"step": 359
},
{
"epoch": 0.7875307629204266,
"grad_norm": 0.018710242584347725,
"learning_rate": 0.0024598540145985403,
"loss": 0.7926,
"step": 360
},
{
"epoch": 0.7897183483729833,
"grad_norm": 0.015550883486866951,
"learning_rate": 0.0024574209245742094,
"loss": 0.9209,
"step": 361
},
{
"epoch": 0.7919059338255401,
"grad_norm": 0.011178571730852127,
"learning_rate": 0.0024549878345498785,
"loss": 0.7962,
"step": 362
},
{
"epoch": 0.7940935192780968,
"grad_norm": 0.017678866162896156,
"learning_rate": 0.0024525547445255476,
"loss": 0.9532,
"step": 363
},
{
"epoch": 0.7962811047306535,
"grad_norm": 0.021445617079734802,
"learning_rate": 0.0024501216545012167,
"loss": 0.8302,
"step": 364
},
{
"epoch": 0.7984686901832103,
"grad_norm": 0.015537573955953121,
"learning_rate": 0.0024476885644768858,
"loss": 0.7665,
"step": 365
},
{
"epoch": 0.800656275635767,
"grad_norm": 0.015302474610507488,
"learning_rate": 0.002445255474452555,
"loss": 0.7161,
"step": 366
},
{
"epoch": 0.8028438610883237,
"grad_norm": 0.013649791479110718,
"learning_rate": 0.002442822384428224,
"loss": 0.6766,
"step": 367
},
{
"epoch": 0.8050314465408805,
"grad_norm": 0.01138269528746605,
"learning_rate": 0.002440389294403893,
"loss": 0.7797,
"step": 368
},
{
"epoch": 0.8072190319934373,
"grad_norm": 0.014025691896677017,
"learning_rate": 0.002437956204379562,
"loss": 0.779,
"step": 369
},
{
"epoch": 0.809406617445994,
"grad_norm": 0.011000445112586021,
"learning_rate": 0.002435523114355231,
"loss": 0.8064,
"step": 370
},
{
"epoch": 0.8115942028985508,
"grad_norm": 0.010309292934834957,
"learning_rate": 0.0024330900243309003,
"loss": 0.7252,
"step": 371
},
{
"epoch": 0.8137817883511075,
"grad_norm": 0.007664249278604984,
"learning_rate": 0.0024306569343065694,
"loss": 0.7081,
"step": 372
},
{
"epoch": 0.8159693738036642,
"grad_norm": 0.015154222957789898,
"learning_rate": 0.0024282238442822385,
"loss": 0.7869,
"step": 373
},
{
"epoch": 0.8181569592562209,
"grad_norm": 0.01371028833091259,
"learning_rate": 0.0024257907542579076,
"loss": 0.7423,
"step": 374
},
{
"epoch": 0.8203445447087777,
"grad_norm": 0.012794865295290947,
"learning_rate": 0.0024233576642335767,
"loss": 0.9341,
"step": 375
},
{
"epoch": 0.8225321301613344,
"grad_norm": 0.011340939439833164,
"learning_rate": 0.0024209245742092458,
"loss": 1.0406,
"step": 376
},
{
"epoch": 0.8247197156138911,
"grad_norm": 0.013491635210812092,
"learning_rate": 0.002418491484184915,
"loss": 0.763,
"step": 377
},
{
"epoch": 0.8269073010664479,
"grad_norm": 0.008016029372811317,
"learning_rate": 0.002416058394160584,
"loss": 0.7132,
"step": 378
},
{
"epoch": 0.8290948865190046,
"grad_norm": 0.011460046283900738,
"learning_rate": 0.002413625304136253,
"loss": 0.6306,
"step": 379
},
{
"epoch": 0.8312824719715614,
"grad_norm": 0.0110190873965621,
"learning_rate": 0.002411192214111922,
"loss": 0.6944,
"step": 380
},
{
"epoch": 0.8334700574241182,
"grad_norm": 0.008347691036760807,
"learning_rate": 0.0024087591240875912,
"loss": 0.8926,
"step": 381
},
{
"epoch": 0.8356576428766749,
"grad_norm": 0.007940311916172504,
"learning_rate": 0.0024063260340632603,
"loss": 0.8666,
"step": 382
},
{
"epoch": 0.8378452283292316,
"grad_norm": 0.011534546501934528,
"learning_rate": 0.0024038929440389294,
"loss": 0.9077,
"step": 383
},
{
"epoch": 0.8400328137817884,
"grad_norm": 0.010218126699328423,
"learning_rate": 0.0024014598540145985,
"loss": 0.8393,
"step": 384
},
{
"epoch": 0.8422203992343451,
"grad_norm": 0.01117737777531147,
"learning_rate": 0.0023990267639902676,
"loss": 0.8401,
"step": 385
},
{
"epoch": 0.8444079846869018,
"grad_norm": 0.01495604682713747,
"learning_rate": 0.0023965936739659367,
"loss": 0.6524,
"step": 386
},
{
"epoch": 0.8465955701394585,
"grad_norm": 0.01132154744118452,
"learning_rate": 0.002394160583941606,
"loss": 0.6973,
"step": 387
},
{
"epoch": 0.8487831555920153,
"grad_norm": 0.016704557463526726,
"learning_rate": 0.002391727493917275,
"loss": 0.8638,
"step": 388
},
{
"epoch": 0.850970741044572,
"grad_norm": 0.03163198381662369,
"learning_rate": 0.002389294403892944,
"loss": 0.6569,
"step": 389
},
{
"epoch": 0.8531583264971287,
"grad_norm": 0.009892611764371395,
"learning_rate": 0.002386861313868613,
"loss": 0.8507,
"step": 390
},
{
"epoch": 0.8553459119496856,
"grad_norm": 0.009704566560685635,
"learning_rate": 0.002384428223844282,
"loss": 0.7567,
"step": 391
},
{
"epoch": 0.8575334974022423,
"grad_norm": 0.011233623139560223,
"learning_rate": 0.0023819951338199512,
"loss": 0.9072,
"step": 392
},
{
"epoch": 0.859721082854799,
"grad_norm": 0.017818894237279892,
"learning_rate": 0.0023795620437956208,
"loss": 0.6716,
"step": 393
},
{
"epoch": 0.8619086683073558,
"grad_norm": 0.009800358675420284,
"learning_rate": 0.0023771289537712894,
"loss": 0.6331,
"step": 394
},
{
"epoch": 0.8640962537599125,
"grad_norm": 0.00855625793337822,
"learning_rate": 0.0023746958637469585,
"loss": 0.8208,
"step": 395
},
{
"epoch": 0.8662838392124692,
"grad_norm": 0.007912772707641125,
"learning_rate": 0.002372262773722628,
"loss": 0.6897,
"step": 396
},
{
"epoch": 0.868471424665026,
"grad_norm": 0.015991948544979095,
"learning_rate": 0.0023698296836982967,
"loss": 0.5838,
"step": 397
},
{
"epoch": 0.8706590101175827,
"grad_norm": 0.013330014422535896,
"learning_rate": 0.002367396593673966,
"loss": 0.7765,
"step": 398
},
{
"epoch": 0.8728465955701394,
"grad_norm": 0.0108262337744236,
"learning_rate": 0.0023649635036496353,
"loss": 0.8259,
"step": 399
},
{
"epoch": 0.8750341810226961,
"grad_norm": 0.01277016382664442,
"learning_rate": 0.002362530413625304,
"loss": 0.5084,
"step": 400
},
{
"epoch": 0.877221766475253,
"grad_norm": 0.00825558416545391,
"learning_rate": 0.002360097323600973,
"loss": 0.8388,
"step": 401
},
{
"epoch": 0.8794093519278097,
"grad_norm": 0.008703862316906452,
"learning_rate": 0.0023576642335766426,
"loss": 0.889,
"step": 402
},
{
"epoch": 0.8815969373803664,
"grad_norm": 0.009978721849620342,
"learning_rate": 0.0023552311435523117,
"loss": 0.7724,
"step": 403
},
{
"epoch": 0.8837845228329232,
"grad_norm": 0.009193633683025837,
"learning_rate": 0.0023527980535279804,
"loss": 0.9257,
"step": 404
},
{
"epoch": 0.8859721082854799,
"grad_norm": 0.009905806742608547,
"learning_rate": 0.00235036496350365,
"loss": 0.9046,
"step": 405
},
{
"epoch": 0.8881596937380366,
"grad_norm": 0.0108295027166605,
"learning_rate": 0.002347931873479319,
"loss": 0.6427,
"step": 406
},
{
"epoch": 0.8903472791905934,
"grad_norm": 0.010898306965827942,
"learning_rate": 0.0023454987834549876,
"loss": 0.6888,
"step": 407
},
{
"epoch": 0.8925348646431501,
"grad_norm": 0.013794617727398872,
"learning_rate": 0.002343065693430657,
"loss": 0.8544,
"step": 408
},
{
"epoch": 0.8947224500957068,
"grad_norm": 0.014423336833715439,
"learning_rate": 0.0023406326034063262,
"loss": 0.7525,
"step": 409
},
{
"epoch": 0.8969100355482637,
"grad_norm": 0.010249799117445946,
"learning_rate": 0.002338199513381995,
"loss": 0.7588,
"step": 410
},
{
"epoch": 0.8990976210008204,
"grad_norm": 0.014359788969159126,
"learning_rate": 0.0023357664233576644,
"loss": 0.8303,
"step": 411
},
{
"epoch": 0.9012852064533771,
"grad_norm": 0.007848945446312428,
"learning_rate": 0.0023333333333333335,
"loss": 0.7478,
"step": 412
},
{
"epoch": 0.9034727919059339,
"grad_norm": 0.010217231698334217,
"learning_rate": 0.0023309002433090026,
"loss": 0.8758,
"step": 413
},
{
"epoch": 0.9056603773584906,
"grad_norm": 0.008166585117578506,
"learning_rate": 0.0023284671532846717,
"loss": 0.8669,
"step": 414
},
{
"epoch": 0.9078479628110473,
"grad_norm": 0.08122234046459198,
"learning_rate": 0.002326034063260341,
"loss": 0.8672,
"step": 415
},
{
"epoch": 0.910035548263604,
"grad_norm": 0.026630746200680733,
"learning_rate": 0.00232360097323601,
"loss": 0.8429,
"step": 416
},
{
"epoch": 0.9122231337161608,
"grad_norm": 0.011199391447007656,
"learning_rate": 0.002321167883211679,
"loss": 0.7394,
"step": 417
},
{
"epoch": 0.9144107191687175,
"grad_norm": 0.034359946846961975,
"learning_rate": 0.002318734793187348,
"loss": 0.757,
"step": 418
},
{
"epoch": 0.9165983046212742,
"grad_norm": 0.007310883607715368,
"learning_rate": 0.002316301703163017,
"loss": 0.8614,
"step": 419
},
{
"epoch": 0.918785890073831,
"grad_norm": 0.017180046066641808,
"learning_rate": 0.002313868613138686,
"loss": 0.7018,
"step": 420
},
{
"epoch": 0.9209734755263878,
"grad_norm": 0.010772480629384518,
"learning_rate": 0.0023114355231143554,
"loss": 1.0247,
"step": 421
},
{
"epoch": 0.9231610609789445,
"grad_norm": 0.013757293112576008,
"learning_rate": 0.0023090024330900244,
"loss": 0.7243,
"step": 422
},
{
"epoch": 0.9253486464315013,
"grad_norm": 0.010658146813511848,
"learning_rate": 0.0023065693430656935,
"loss": 0.8289,
"step": 423
},
{
"epoch": 0.927536231884058,
"grad_norm": 0.013902239501476288,
"learning_rate": 0.0023041362530413626,
"loss": 0.7706,
"step": 424
},
{
"epoch": 0.9297238173366147,
"grad_norm": 0.011173736304044724,
"learning_rate": 0.0023017031630170317,
"loss": 0.8055,
"step": 425
},
{
"epoch": 0.9319114027891715,
"grad_norm": 0.011386138387024403,
"learning_rate": 0.002299270072992701,
"loss": 0.6273,
"step": 426
},
{
"epoch": 0.9340989882417282,
"grad_norm": 0.008862471207976341,
"learning_rate": 0.00229683698296837,
"loss": 0.7032,
"step": 427
},
{
"epoch": 0.9362865736942849,
"grad_norm": 0.02106628008186817,
"learning_rate": 0.002294403892944039,
"loss": 0.7835,
"step": 428
},
{
"epoch": 0.9384741591468416,
"grad_norm": 0.010091581381857395,
"learning_rate": 0.002291970802919708,
"loss": 0.6805,
"step": 429
},
{
"epoch": 0.9406617445993984,
"grad_norm": 0.012447184883058071,
"learning_rate": 0.002289537712895377,
"loss": 0.7323,
"step": 430
},
{
"epoch": 0.9428493300519551,
"grad_norm": 0.015980314463377,
"learning_rate": 0.0022871046228710463,
"loss": 0.8842,
"step": 431
},
{
"epoch": 0.9450369155045119,
"grad_norm": 0.007705094758421183,
"learning_rate": 0.0022846715328467154,
"loss": 0.8907,
"step": 432
},
{
"epoch": 0.9472245009570687,
"grad_norm": 0.00878717191517353,
"learning_rate": 0.0022822384428223845,
"loss": 0.7455,
"step": 433
},
{
"epoch": 0.9494120864096254,
"grad_norm": 0.026101326569914818,
"learning_rate": 0.0022798053527980536,
"loss": 0.6827,
"step": 434
},
{
"epoch": 0.9515996718621821,
"grad_norm": 0.008718657307326794,
"learning_rate": 0.0022773722627737226,
"loss": 0.9253,
"step": 435
},
{
"epoch": 0.9537872573147389,
"grad_norm": 0.009151890873908997,
"learning_rate": 0.0022749391727493917,
"loss": 0.8735,
"step": 436
},
{
"epoch": 0.9559748427672956,
"grad_norm": 0.012189007364213467,
"learning_rate": 0.002272506082725061,
"loss": 0.94,
"step": 437
},
{
"epoch": 0.9581624282198523,
"grad_norm": 0.00890439935028553,
"learning_rate": 0.00227007299270073,
"loss": 0.7572,
"step": 438
},
{
"epoch": 0.9603500136724091,
"grad_norm": 0.013200386427342892,
"learning_rate": 0.002267639902676399,
"loss": 0.7361,
"step": 439
},
{
"epoch": 0.9625375991249658,
"grad_norm": 0.011736634187400341,
"learning_rate": 0.002265206812652068,
"loss": 0.6326,
"step": 440
},
{
"epoch": 0.9647251845775225,
"grad_norm": 0.006781425327062607,
"learning_rate": 0.002262773722627737,
"loss": 0.7254,
"step": 441
},
{
"epoch": 0.9669127700300792,
"grad_norm": 0.008296315558254719,
"learning_rate": 0.0022603406326034063,
"loss": 0.6898,
"step": 442
},
{
"epoch": 0.9691003554826361,
"grad_norm": 0.008293522521853447,
"learning_rate": 0.0022579075425790754,
"loss": 0.7953,
"step": 443
},
{
"epoch": 0.9712879409351928,
"grad_norm": 0.00848364643752575,
"learning_rate": 0.0022554744525547445,
"loss": 0.8203,
"step": 444
},
{
"epoch": 0.9734755263877495,
"grad_norm": 0.012193895876407623,
"learning_rate": 0.002253041362530414,
"loss": 0.6794,
"step": 445
},
{
"epoch": 0.9756631118403063,
"grad_norm": 0.018784867599606514,
"learning_rate": 0.0022506082725060827,
"loss": 0.5793,
"step": 446
},
{
"epoch": 0.977850697292863,
"grad_norm": 0.008517356589436531,
"learning_rate": 0.0022481751824817518,
"loss": 0.4866,
"step": 447
},
{
"epoch": 0.9800382827454197,
"grad_norm": 0.017300793901085854,
"learning_rate": 0.0022457420924574213,
"loss": 0.8304,
"step": 448
},
{
"epoch": 0.9822258681979765,
"grad_norm": 0.010441828519105911,
"learning_rate": 0.00224330900243309,
"loss": 0.9823,
"step": 449
},
{
"epoch": 0.9844134536505332,
"grad_norm": 0.013992452062666416,
"learning_rate": 0.002240875912408759,
"loss": 0.7828,
"step": 450
},
{
"epoch": 0.9866010391030899,
"grad_norm": 0.006943755783140659,
"learning_rate": 0.0022384428223844286,
"loss": 0.6205,
"step": 451
},
{
"epoch": 0.9887886245556468,
"grad_norm": 0.0063702561892569065,
"learning_rate": 0.0022360097323600972,
"loss": 1.0355,
"step": 452
},
{
"epoch": 0.9909762100082035,
"grad_norm": 0.007510766386985779,
"learning_rate": 0.0022335766423357663,
"loss": 0.7581,
"step": 453
},
{
"epoch": 0.9931637954607602,
"grad_norm": 0.010165141895413399,
"learning_rate": 0.002231143552311436,
"loss": 0.8831,
"step": 454
},
{
"epoch": 0.9953513809133169,
"grad_norm": 0.012972669675946236,
"learning_rate": 0.002228710462287105,
"loss": 0.6523,
"step": 455
},
{
"epoch": 0.9975389663658737,
"grad_norm": 0.007454239297658205,
"learning_rate": 0.0022262773722627736,
"loss": 0.8721,
"step": 456
},
{
"epoch": 0.9997265518184304,
"grad_norm": 0.007078221533447504,
"learning_rate": 0.0022238442822384427,
"loss": 0.6737,
"step": 457
},
{
"epoch": 1.0019141372709872,
"grad_norm": 0.021942665800452232,
"learning_rate": 0.002221411192214112,
"loss": 0.8231,
"step": 458
},
{
"epoch": 1.0041017227235438,
"grad_norm": 0.019108066335320473,
"learning_rate": 0.002218978102189781,
"loss": 0.6809,
"step": 459
},
{
"epoch": 1.0062893081761006,
"grad_norm": 0.013495873659849167,
"learning_rate": 0.00221654501216545,
"loss": 0.7663,
"step": 460
},
{
"epoch": 1.0084768936286574,
"grad_norm": 0.009844646789133549,
"learning_rate": 0.0022141119221411195,
"loss": 0.8189,
"step": 461
},
{
"epoch": 1.010664479081214,
"grad_norm": 0.008135687559843063,
"learning_rate": 0.002211678832116788,
"loss": 0.7935,
"step": 462
},
{
"epoch": 1.0128520645337709,
"grad_norm": 0.01022945623844862,
"learning_rate": 0.0022092457420924572,
"loss": 0.7855,
"step": 463
},
{
"epoch": 1.0150396499863277,
"grad_norm": 0.011145783588290215,
"learning_rate": 0.0022068126520681268,
"loss": 0.9334,
"step": 464
},
{
"epoch": 1.0172272354388843,
"grad_norm": 0.014914394356310368,
"learning_rate": 0.002204379562043796,
"loss": 0.8769,
"step": 465
},
{
"epoch": 1.019414820891441,
"grad_norm": 0.010317330248653889,
"learning_rate": 0.0022019464720194645,
"loss": 0.9083,
"step": 466
},
{
"epoch": 1.021602406343998,
"grad_norm": 0.012516210786998272,
"learning_rate": 0.002199513381995134,
"loss": 0.7169,
"step": 467
},
{
"epoch": 1.0237899917965545,
"grad_norm": 0.015528671443462372,
"learning_rate": 0.002197080291970803,
"loss": 0.6738,
"step": 468
},
{
"epoch": 1.0259775772491113,
"grad_norm": 0.007066753227263689,
"learning_rate": 0.002194647201946472,
"loss": 0.5918,
"step": 469
},
{
"epoch": 1.0281651627016681,
"grad_norm": 0.007939637638628483,
"learning_rate": 0.0021922141119221413,
"loss": 0.6588,
"step": 470
},
{
"epoch": 1.0303527481542247,
"grad_norm": 0.007144363131374121,
"learning_rate": 0.0021897810218978104,
"loss": 0.4427,
"step": 471
},
{
"epoch": 1.0325403336067815,
"grad_norm": 0.007886086590588093,
"learning_rate": 0.002187347931873479,
"loss": 0.7392,
"step": 472
},
{
"epoch": 1.0347279190593381,
"grad_norm": 0.007826312445104122,
"learning_rate": 0.0021849148418491486,
"loss": 0.743,
"step": 473
},
{
"epoch": 1.036915504511895,
"grad_norm": 0.007945370860397816,
"learning_rate": 0.0021824817518248177,
"loss": 0.6567,
"step": 474
},
{
"epoch": 1.0391030899644518,
"grad_norm": 0.009234143421053886,
"learning_rate": 0.0021800486618004863,
"loss": 0.8079,
"step": 475
},
{
"epoch": 1.0412906754170084,
"grad_norm": 0.011828969232738018,
"learning_rate": 0.002177615571776156,
"loss": 0.7132,
"step": 476
},
{
"epoch": 1.0434782608695652,
"grad_norm": 0.008681892417371273,
"learning_rate": 0.002175182481751825,
"loss": 0.8417,
"step": 477
},
{
"epoch": 1.045665846322122,
"grad_norm": 0.008761374279856682,
"learning_rate": 0.002172749391727494,
"loss": 0.7446,
"step": 478
},
{
"epoch": 1.0478534317746786,
"grad_norm": 0.014171335846185684,
"learning_rate": 0.002170316301703163,
"loss": 0.739,
"step": 479
},
{
"epoch": 1.0500410172272354,
"grad_norm": 0.011624401435256004,
"learning_rate": 0.0021678832116788322,
"loss": 0.8935,
"step": 480
},
{
"epoch": 1.0522286026797922,
"grad_norm": 0.019760416820645332,
"learning_rate": 0.0021654501216545013,
"loss": 0.9159,
"step": 481
},
{
"epoch": 1.0544161881323488,
"grad_norm": 0.0076353419572114944,
"learning_rate": 0.0021630170316301704,
"loss": 0.8153,
"step": 482
},
{
"epoch": 1.0566037735849056,
"grad_norm": 0.009698878973722458,
"learning_rate": 0.0021605839416058395,
"loss": 0.8043,
"step": 483
},
{
"epoch": 1.0587913590374625,
"grad_norm": 0.007674135267734528,
"learning_rate": 0.0021581508515815086,
"loss": 0.6816,
"step": 484
},
{
"epoch": 1.060978944490019,
"grad_norm": 0.01642732322216034,
"learning_rate": 0.0021557177615571777,
"loss": 0.9525,
"step": 485
},
{
"epoch": 1.0631665299425759,
"grad_norm": 0.016669275239109993,
"learning_rate": 0.002153284671532847,
"loss": 0.5482,
"step": 486
},
{
"epoch": 1.0653541153951327,
"grad_norm": 0.012565388344228268,
"learning_rate": 0.002150851581508516,
"loss": 0.6211,
"step": 487
},
{
"epoch": 1.0675417008476893,
"grad_norm": 0.01363010797649622,
"learning_rate": 0.002148418491484185,
"loss": 0.5152,
"step": 488
},
{
"epoch": 1.069729286300246,
"grad_norm": 0.020599598065018654,
"learning_rate": 0.002145985401459854,
"loss": 0.8035,
"step": 489
},
{
"epoch": 1.071916871752803,
"grad_norm": 0.013294585980474949,
"learning_rate": 0.002143552311435523,
"loss": 0.8999,
"step": 490
},
{
"epoch": 1.0741044572053595,
"grad_norm": 0.038667161017656326,
"learning_rate": 0.0021411192214111923,
"loss": 0.7514,
"step": 491
},
{
"epoch": 1.0762920426579163,
"grad_norm": 0.010547326877713203,
"learning_rate": 0.0021386861313868613,
"loss": 0.6819,
"step": 492
},
{
"epoch": 1.0784796281104732,
"grad_norm": 0.009484006091952324,
"learning_rate": 0.0021362530413625304,
"loss": 0.6253,
"step": 493
},
{
"epoch": 1.0806672135630297,
"grad_norm": 0.009657086804509163,
"learning_rate": 0.0021338199513381995,
"loss": 0.7112,
"step": 494
},
{
"epoch": 1.0828547990155866,
"grad_norm": 0.01714419014751911,
"learning_rate": 0.0021313868613138686,
"loss": 0.9098,
"step": 495
},
{
"epoch": 1.0850423844681434,
"grad_norm": 0.01343261357396841,
"learning_rate": 0.0021289537712895377,
"loss": 0.7902,
"step": 496
},
{
"epoch": 1.0872299699207,
"grad_norm": 0.00883649941533804,
"learning_rate": 0.002126520681265207,
"loss": 0.9971,
"step": 497
},
{
"epoch": 1.0894175553732568,
"grad_norm": 0.00613701157271862,
"learning_rate": 0.002124087591240876,
"loss": 0.7527,
"step": 498
},
{
"epoch": 1.0916051408258136,
"grad_norm": 0.009846502915024757,
"learning_rate": 0.002121654501216545,
"loss": 0.7402,
"step": 499
},
{
"epoch": 1.0937927262783702,
"grad_norm": 0.010731893591582775,
"learning_rate": 0.002119221411192214,
"loss": 0.7848,
"step": 500
},
{
"epoch": 1.095980311730927,
"grad_norm": 0.011895066127181053,
"learning_rate": 0.002116788321167883,
"loss": 0.7164,
"step": 501
},
{
"epoch": 1.0981678971834836,
"grad_norm": 0.007519803941249847,
"learning_rate": 0.0021143552311435523,
"loss": 0.9606,
"step": 502
},
{
"epoch": 1.1003554826360404,
"grad_norm": 0.009692378342151642,
"learning_rate": 0.0021119221411192214,
"loss": 0.7633,
"step": 503
},
{
"epoch": 1.1025430680885973,
"grad_norm": 0.011364142410457134,
"learning_rate": 0.0021094890510948905,
"loss": 0.6945,
"step": 504
},
{
"epoch": 1.1047306535411539,
"grad_norm": 0.007994066923856735,
"learning_rate": 0.0021070559610705595,
"loss": 0.6423,
"step": 505
},
{
"epoch": 1.1069182389937107,
"grad_norm": 0.02612650953233242,
"learning_rate": 0.0021046228710462286,
"loss": 0.8676,
"step": 506
},
{
"epoch": 1.1091058244462675,
"grad_norm": 0.007825646549463272,
"learning_rate": 0.002102189781021898,
"loss": 0.5687,
"step": 507
},
{
"epoch": 1.111293409898824,
"grad_norm": 0.008077848702669144,
"learning_rate": 0.002099756690997567,
"loss": 0.7509,
"step": 508
},
{
"epoch": 1.113480995351381,
"grad_norm": 0.009620738215744495,
"learning_rate": 0.002097323600973236,
"loss": 0.5996,
"step": 509
},
{
"epoch": 1.1156685808039377,
"grad_norm": 0.0255615022033453,
"learning_rate": 0.0020948905109489054,
"loss": 0.6696,
"step": 510
},
{
"epoch": 1.1178561662564943,
"grad_norm": 0.010550931096076965,
"learning_rate": 0.002092457420924574,
"loss": 0.7019,
"step": 511
},
{
"epoch": 1.1200437517090511,
"grad_norm": 0.028004566207528114,
"learning_rate": 0.002090024330900243,
"loss": 0.8809,
"step": 512
},
{
"epoch": 1.122231337161608,
"grad_norm": 0.013075259514153004,
"learning_rate": 0.0020875912408759127,
"loss": 0.6108,
"step": 513
},
{
"epoch": 1.1244189226141645,
"grad_norm": 0.015426448546350002,
"learning_rate": 0.0020851581508515814,
"loss": 0.7146,
"step": 514
},
{
"epoch": 1.1266065080667214,
"grad_norm": 0.007735779043287039,
"learning_rate": 0.0020827250608272505,
"loss": 0.8517,
"step": 515
},
{
"epoch": 1.1287940935192782,
"grad_norm": 0.012412245385348797,
"learning_rate": 0.00208029197080292,
"loss": 0.6694,
"step": 516
},
{
"epoch": 1.1309816789718348,
"grad_norm": 0.009669258259236813,
"learning_rate": 0.0020778588807785887,
"loss": 0.612,
"step": 517
},
{
"epoch": 1.1331692644243916,
"grad_norm": 0.010346516966819763,
"learning_rate": 0.0020754257907542577,
"loss": 0.7956,
"step": 518
},
{
"epoch": 1.1353568498769484,
"grad_norm": 0.008683484978973866,
"learning_rate": 0.0020729927007299273,
"loss": 0.7012,
"step": 519
},
{
"epoch": 1.137544435329505,
"grad_norm": 0.009093291126191616,
"learning_rate": 0.0020705596107055964,
"loss": 0.6406,
"step": 520
},
{
"epoch": 1.1397320207820618,
"grad_norm": 0.019143717363476753,
"learning_rate": 0.002068126520681265,
"loss": 0.6632,
"step": 521
},
{
"epoch": 1.1419196062346186,
"grad_norm": 0.008810199797153473,
"learning_rate": 0.0020656934306569345,
"loss": 0.6248,
"step": 522
},
{
"epoch": 1.1441071916871752,
"grad_norm": 0.009826627559959888,
"learning_rate": 0.0020632603406326036,
"loss": 0.7367,
"step": 523
},
{
"epoch": 1.146294777139732,
"grad_norm": 0.007178613916039467,
"learning_rate": 0.0020608272506082723,
"loss": 0.6688,
"step": 524
},
{
"epoch": 1.1484823625922886,
"grad_norm": 0.00853504054248333,
"learning_rate": 0.002058394160583942,
"loss": 0.6802,
"step": 525
},
{
"epoch": 1.1506699480448455,
"grad_norm": 0.011418921872973442,
"learning_rate": 0.002055961070559611,
"loss": 0.5832,
"step": 526
},
{
"epoch": 1.1528575334974023,
"grad_norm": 0.015032613649964333,
"learning_rate": 0.0020535279805352796,
"loss": 0.6841,
"step": 527
},
{
"epoch": 1.155045118949959,
"grad_norm": 0.008302520960569382,
"learning_rate": 0.002051094890510949,
"loss": 0.7869,
"step": 528
},
{
"epoch": 1.1572327044025157,
"grad_norm": 0.006403745152056217,
"learning_rate": 0.002048661800486618,
"loss": 0.7054,
"step": 529
},
{
"epoch": 1.1594202898550725,
"grad_norm": 0.00577664515003562,
"learning_rate": 0.0020462287104622873,
"loss": 0.8063,
"step": 530
},
{
"epoch": 1.161607875307629,
"grad_norm": 0.011647713370621204,
"learning_rate": 0.002043795620437956,
"loss": 0.7921,
"step": 531
},
{
"epoch": 1.163795460760186,
"grad_norm": 0.011479120701551437,
"learning_rate": 0.0020413625304136255,
"loss": 0.9256,
"step": 532
},
{
"epoch": 1.1659830462127427,
"grad_norm": 0.007622700184583664,
"learning_rate": 0.0020389294403892946,
"loss": 0.722,
"step": 533
},
{
"epoch": 1.1681706316652993,
"grad_norm": 0.0064216419123113155,
"learning_rate": 0.0020364963503649632,
"loss": 0.6979,
"step": 534
},
{
"epoch": 1.1703582171178561,
"grad_norm": 0.007917587645351887,
"learning_rate": 0.0020340632603406327,
"loss": 0.8049,
"step": 535
},
{
"epoch": 1.172545802570413,
"grad_norm": 0.0061738938093185425,
"learning_rate": 0.002031630170316302,
"loss": 0.7057,
"step": 536
},
{
"epoch": 1.1747333880229696,
"grad_norm": 0.0060928682796657085,
"learning_rate": 0.0020291970802919705,
"loss": 0.8,
"step": 537
},
{
"epoch": 1.1769209734755264,
"grad_norm": 0.00664818799123168,
"learning_rate": 0.00202676399026764,
"loss": 0.7944,
"step": 538
},
{
"epoch": 1.1791085589280832,
"grad_norm": 0.027486886829137802,
"learning_rate": 0.002024330900243309,
"loss": 0.8446,
"step": 539
},
{
"epoch": 1.1812961443806398,
"grad_norm": 0.01736626587808132,
"learning_rate": 0.002021897810218978,
"loss": 0.8303,
"step": 540
},
{
"epoch": 1.1834837298331966,
"grad_norm": 0.0084115294739604,
"learning_rate": 0.0020194647201946473,
"loss": 0.7323,
"step": 541
},
{
"epoch": 1.1856713152857534,
"grad_norm": 0.01464123371988535,
"learning_rate": 0.0020170316301703164,
"loss": 0.8395,
"step": 542
},
{
"epoch": 1.18785890073831,
"grad_norm": 0.007480619940906763,
"learning_rate": 0.0020145985401459855,
"loss": 0.7309,
"step": 543
},
{
"epoch": 1.1900464861908668,
"grad_norm": 0.014315255917608738,
"learning_rate": 0.0020121654501216546,
"loss": 0.6468,
"step": 544
},
{
"epoch": 1.1922340716434237,
"grad_norm": 0.009927434846758842,
"learning_rate": 0.0020097323600973237,
"loss": 0.7544,
"step": 545
},
{
"epoch": 1.1944216570959802,
"grad_norm": 0.019481701776385307,
"learning_rate": 0.0020072992700729928,
"loss": 0.8124,
"step": 546
},
{
"epoch": 1.196609242548537,
"grad_norm": 0.007046518847346306,
"learning_rate": 0.002004866180048662,
"loss": 0.6582,
"step": 547
},
{
"epoch": 1.1987968280010939,
"grad_norm": 0.012643888592720032,
"learning_rate": 0.002002433090024331,
"loss": 0.8098,
"step": 548
},
{
"epoch": 1.2009844134536505,
"grad_norm": 0.008585029281675816,
"learning_rate": 0.002,
"loss": 0.7206,
"step": 549
},
{
"epoch": 1.2031719989062073,
"grad_norm": 0.014269394800066948,
"learning_rate": 0.001997566909975669,
"loss": 0.8426,
"step": 550
},
{
"epoch": 1.2053595843587641,
"grad_norm": 0.006986747495830059,
"learning_rate": 0.0019951338199513382,
"loss": 0.7793,
"step": 551
},
{
"epoch": 1.2075471698113207,
"grad_norm": 0.014269756153225899,
"learning_rate": 0.0019927007299270073,
"loss": 0.7668,
"step": 552
},
{
"epoch": 1.2097347552638775,
"grad_norm": 0.009506807662546635,
"learning_rate": 0.0019902676399026764,
"loss": 0.771,
"step": 553
},
{
"epoch": 1.2119223407164341,
"grad_norm": 0.008203186094760895,
"learning_rate": 0.0019878345498783455,
"loss": 0.8037,
"step": 554
},
{
"epoch": 1.214109926168991,
"grad_norm": 0.01714324578642845,
"learning_rate": 0.0019854014598540146,
"loss": 0.66,
"step": 555
},
{
"epoch": 1.2162975116215478,
"grad_norm": 0.01466370839625597,
"learning_rate": 0.0019829683698296837,
"loss": 0.8761,
"step": 556
},
{
"epoch": 1.2184850970741046,
"grad_norm": 0.049504704773426056,
"learning_rate": 0.0019805352798053528,
"loss": 0.7717,
"step": 557
},
{
"epoch": 1.2206726825266612,
"grad_norm": 0.010891391895711422,
"learning_rate": 0.001978102189781022,
"loss": 0.7754,
"step": 558
},
{
"epoch": 1.222860267979218,
"grad_norm": 0.007297700271010399,
"learning_rate": 0.001975669099756691,
"loss": 0.882,
"step": 559
},
{
"epoch": 1.2250478534317746,
"grad_norm": 0.010113504715263844,
"learning_rate": 0.00197323600973236,
"loss": 0.7514,
"step": 560
},
{
"epoch": 1.2272354388843314,
"grad_norm": 0.0076246317476034164,
"learning_rate": 0.001970802919708029,
"loss": 0.9311,
"step": 561
},
{
"epoch": 1.2294230243368882,
"grad_norm": 0.010274101980030537,
"learning_rate": 0.0019683698296836987,
"loss": 0.9348,
"step": 562
},
{
"epoch": 1.2316106097894448,
"grad_norm": 0.007466154173016548,
"learning_rate": 0.0019659367396593673,
"loss": 0.6847,
"step": 563
},
{
"epoch": 1.2337981952420016,
"grad_norm": 0.012906615622341633,
"learning_rate": 0.0019635036496350364,
"loss": 0.9068,
"step": 564
},
{
"epoch": 1.2359857806945584,
"grad_norm": 0.008850296027958393,
"learning_rate": 0.001961070559610706,
"loss": 0.9032,
"step": 565
},
{
"epoch": 1.238173366147115,
"grad_norm": 0.009153778664767742,
"learning_rate": 0.0019586374695863746,
"loss": 0.7872,
"step": 566
},
{
"epoch": 1.2403609515996719,
"grad_norm": 0.014177209697663784,
"learning_rate": 0.0019562043795620437,
"loss": 0.7902,
"step": 567
},
{
"epoch": 1.2425485370522287,
"grad_norm": 0.008819716051220894,
"learning_rate": 0.001953771289537713,
"loss": 0.5116,
"step": 568
},
{
"epoch": 1.2447361225047853,
"grad_norm": 0.012600511312484741,
"learning_rate": 0.001951338199513382,
"loss": 0.8224,
"step": 569
},
{
"epoch": 1.246923707957342,
"grad_norm": 0.012330558151006699,
"learning_rate": 0.001948905109489051,
"loss": 0.6959,
"step": 570
},
{
"epoch": 1.249111293409899,
"grad_norm": 0.013719186186790466,
"learning_rate": 0.00194647201946472,
"loss": 0.8555,
"step": 571
},
{
"epoch": 1.2512988788624555,
"grad_norm": 0.019239958375692368,
"learning_rate": 0.0019440389294403894,
"loss": 0.8459,
"step": 572
},
{
"epoch": 1.2534864643150123,
"grad_norm": 0.00825503934174776,
"learning_rate": 0.0019416058394160585,
"loss": 0.6807,
"step": 573
},
{
"epoch": 1.2556740497675691,
"grad_norm": 0.00811754260212183,
"learning_rate": 0.0019391727493917273,
"loss": 0.661,
"step": 574
},
{
"epoch": 1.2578616352201257,
"grad_norm": 0.009656975045800209,
"learning_rate": 0.0019367396593673967,
"loss": 0.693,
"step": 575
},
{
"epoch": 1.2600492206726825,
"grad_norm": 0.01010841503739357,
"learning_rate": 0.0019343065693430658,
"loss": 0.7331,
"step": 576
},
{
"epoch": 1.2622368061252391,
"grad_norm": 0.01344444788992405,
"learning_rate": 0.0019318734793187346,
"loss": 0.89,
"step": 577
},
{
"epoch": 1.264424391577796,
"grad_norm": 0.009256028570234776,
"learning_rate": 0.001929440389294404,
"loss": 0.7227,
"step": 578
},
{
"epoch": 1.2666119770303528,
"grad_norm": 0.009699441492557526,
"learning_rate": 0.001927007299270073,
"loss": 0.6758,
"step": 579
},
{
"epoch": 1.2687995624829096,
"grad_norm": 0.013547690585255623,
"learning_rate": 0.001924574209245742,
"loss": 0.8159,
"step": 580
},
{
"epoch": 1.2709871479354662,
"grad_norm": 0.011569716967642307,
"learning_rate": 0.0019221411192214114,
"loss": 0.7126,
"step": 581
},
{
"epoch": 1.273174733388023,
"grad_norm": 0.009194127283990383,
"learning_rate": 0.0019197080291970803,
"loss": 0.8327,
"step": 582
},
{
"epoch": 1.2753623188405796,
"grad_norm": 0.01622292585670948,
"learning_rate": 0.0019172749391727494,
"loss": 0.8118,
"step": 583
},
{
"epoch": 1.2775499042931364,
"grad_norm": 0.016841020435094833,
"learning_rate": 0.0019148418491484187,
"loss": 0.8746,
"step": 584
},
{
"epoch": 1.2797374897456932,
"grad_norm": 0.011160912923514843,
"learning_rate": 0.0019124087591240876,
"loss": 0.7846,
"step": 585
},
{
"epoch": 1.28192507519825,
"grad_norm": 0.013098710216581821,
"learning_rate": 0.0019099756690997567,
"loss": 0.666,
"step": 586
},
{
"epoch": 1.2841126606508066,
"grad_norm": 0.008245709352195263,
"learning_rate": 0.001907542579075426,
"loss": 0.7799,
"step": 587
},
{
"epoch": 1.2863002461033635,
"grad_norm": 0.005503001157194376,
"learning_rate": 0.0019051094890510949,
"loss": 0.605,
"step": 588
},
{
"epoch": 1.28848783155592,
"grad_norm": 0.014160554856061935,
"learning_rate": 0.001902676399026764,
"loss": 0.7715,
"step": 589
},
{
"epoch": 1.2906754170084769,
"grad_norm": 0.06220156326889992,
"learning_rate": 0.0019002433090024333,
"loss": 1.0173,
"step": 590
},
{
"epoch": 1.2928630024610337,
"grad_norm": 0.023459481075406075,
"learning_rate": 0.0018978102189781021,
"loss": 0.7195,
"step": 591
},
{
"epoch": 1.2950505879135905,
"grad_norm": 0.02028430998325348,
"learning_rate": 0.0018953771289537712,
"loss": 0.8889,
"step": 592
},
{
"epoch": 1.297238173366147,
"grad_norm": 0.007861199788749218,
"learning_rate": 0.0018929440389294405,
"loss": 0.8249,
"step": 593
},
{
"epoch": 1.299425758818704,
"grad_norm": 0.008794757537543774,
"learning_rate": 0.0018905109489051096,
"loss": 0.8978,
"step": 594
},
{
"epoch": 1.3016133442712605,
"grad_norm": 0.027899743989109993,
"learning_rate": 0.0018880778588807785,
"loss": 0.8259,
"step": 595
},
{
"epoch": 1.3038009297238173,
"grad_norm": 0.006755333859473467,
"learning_rate": 0.0018856447688564478,
"loss": 0.8913,
"step": 596
},
{
"epoch": 1.3059885151763742,
"grad_norm": 0.016409730538725853,
"learning_rate": 0.001883211678832117,
"loss": 0.7902,
"step": 597
},
{
"epoch": 1.3081761006289307,
"grad_norm": 0.012431084178388119,
"learning_rate": 0.0018807785888077858,
"loss": 0.5474,
"step": 598
},
{
"epoch": 1.3103636860814876,
"grad_norm": 0.0099630793556571,
"learning_rate": 0.001878345498783455,
"loss": 0.7595,
"step": 599
},
{
"epoch": 1.3125512715340442,
"grad_norm": 0.027248527854681015,
"learning_rate": 0.0018759124087591242,
"loss": 1.0273,
"step": 600
},
{
"epoch": 1.314738856986601,
"grad_norm": 0.008029641583561897,
"learning_rate": 0.001873479318734793,
"loss": 0.6951,
"step": 601
},
{
"epoch": 1.3169264424391578,
"grad_norm": 0.011218305677175522,
"learning_rate": 0.0018710462287104626,
"loss": 0.9217,
"step": 602
},
{
"epoch": 1.3191140278917146,
"grad_norm": 0.024159464985132217,
"learning_rate": 0.0018686131386861315,
"loss": 0.7839,
"step": 603
},
{
"epoch": 1.3213016133442712,
"grad_norm": 0.01127669122070074,
"learning_rate": 0.0018661800486618006,
"loss": 0.6711,
"step": 604
},
{
"epoch": 1.323489198796828,
"grad_norm": 0.014322164468467236,
"learning_rate": 0.0018637469586374699,
"loss": 0.8935,
"step": 605
},
{
"epoch": 1.3256767842493846,
"grad_norm": 0.010018724948167801,
"learning_rate": 0.0018613138686131387,
"loss": 0.7622,
"step": 606
},
{
"epoch": 1.3278643697019414,
"grad_norm": 0.02816806361079216,
"learning_rate": 0.0018588807785888078,
"loss": 0.8948,
"step": 607
},
{
"epoch": 1.3300519551544983,
"grad_norm": 0.011105911806225777,
"learning_rate": 0.0018564476885644767,
"loss": 0.754,
"step": 608
},
{
"epoch": 1.332239540607055,
"grad_norm": 0.007195697631686926,
"learning_rate": 0.001854014598540146,
"loss": 0.6923,
"step": 609
},
{
"epoch": 1.3344271260596117,
"grad_norm": 0.010149553418159485,
"learning_rate": 0.001851581508515815,
"loss": 0.8129,
"step": 610
},
{
"epoch": 1.3366147115121685,
"grad_norm": 0.006798075046390295,
"learning_rate": 0.001849148418491484,
"loss": 0.5858,
"step": 611
},
{
"epoch": 1.338802296964725,
"grad_norm": 0.006904991343617439,
"learning_rate": 0.0018467153284671533,
"loss": 0.7058,
"step": 612
},
{
"epoch": 1.340989882417282,
"grad_norm": 0.019244657829403877,
"learning_rate": 0.0018442822384428224,
"loss": 0.7452,
"step": 613
},
{
"epoch": 1.3431774678698387,
"grad_norm": 0.10027986764907837,
"learning_rate": 0.0018418491484184915,
"loss": 0.7935,
"step": 614
},
{
"epoch": 1.3453650533223955,
"grad_norm": 0.028616629540920258,
"learning_rate": 0.0018394160583941608,
"loss": 0.7798,
"step": 615
},
{
"epoch": 1.3475526387749521,
"grad_norm": 0.02287200279533863,
"learning_rate": 0.0018369829683698297,
"loss": 0.7231,
"step": 616
},
{
"epoch": 1.349740224227509,
"grad_norm": 0.029162835329771042,
"learning_rate": 0.0018345498783454988,
"loss": 0.7196,
"step": 617
},
{
"epoch": 1.3519278096800655,
"grad_norm": 0.00748335849493742,
"learning_rate": 0.001832116788321168,
"loss": 0.6841,
"step": 618
},
{
"epoch": 1.3541153951326224,
"grad_norm": 0.012842601165175438,
"learning_rate": 0.001829683698296837,
"loss": 0.8114,
"step": 619
},
{
"epoch": 1.3563029805851792,
"grad_norm": 0.01425047405064106,
"learning_rate": 0.001827250608272506,
"loss": 0.713,
"step": 620
},
{
"epoch": 1.3584905660377358,
"grad_norm": 0.011411231942474842,
"learning_rate": 0.0018248175182481753,
"loss": 0.8576,
"step": 621
},
{
"epoch": 1.3606781514902926,
"grad_norm": 0.02541513741016388,
"learning_rate": 0.0018223844282238442,
"loss": 0.7529,
"step": 622
},
{
"epoch": 1.3628657369428494,
"grad_norm": 0.009776429273188114,
"learning_rate": 0.0018199513381995133,
"loss": 0.6062,
"step": 623
},
{
"epoch": 1.365053322395406,
"grad_norm": 0.01603938452899456,
"learning_rate": 0.0018175182481751826,
"loss": 1.3558,
"step": 624
},
{
"epoch": 1.3672409078479628,
"grad_norm": 0.01858574151992798,
"learning_rate": 0.0018150851581508517,
"loss": 0.7067,
"step": 625
},
{
"epoch": 1.3694284933005196,
"grad_norm": 0.014604609459638596,
"learning_rate": 0.0018126520681265206,
"loss": 0.65,
"step": 626
},
{
"epoch": 1.3716160787530762,
"grad_norm": 0.01383352093398571,
"learning_rate": 0.00181021897810219,
"loss": 0.724,
"step": 627
},
{
"epoch": 1.373803664205633,
"grad_norm": 0.007166001014411449,
"learning_rate": 0.001807785888077859,
"loss": 0.7063,
"step": 628
},
{
"epoch": 1.3759912496581896,
"grad_norm": 0.01364620216190815,
"learning_rate": 0.0018053527980535279,
"loss": 0.942,
"step": 629
},
{
"epoch": 1.3781788351107465,
"grad_norm": 0.013178148306906223,
"learning_rate": 0.0018029197080291972,
"loss": 0.7134,
"step": 630
},
{
"epoch": 1.3803664205633033,
"grad_norm": 0.016469091176986694,
"learning_rate": 0.0018004866180048663,
"loss": 0.8652,
"step": 631
},
{
"epoch": 1.38255400601586,
"grad_norm": 0.008818808011710644,
"learning_rate": 0.0017980535279805351,
"loss": 0.7157,
"step": 632
},
{
"epoch": 1.3847415914684167,
"grad_norm": 0.006165484432131052,
"learning_rate": 0.0017956204379562047,
"loss": 0.8267,
"step": 633
},
{
"epoch": 1.3869291769209735,
"grad_norm": 0.017317302525043488,
"learning_rate": 0.0017931873479318735,
"loss": 0.7661,
"step": 634
},
{
"epoch": 1.38911676237353,
"grad_norm": 0.01045684702694416,
"learning_rate": 0.0017907542579075426,
"loss": 0.797,
"step": 635
},
{
"epoch": 1.391304347826087,
"grad_norm": 0.004696684889495373,
"learning_rate": 0.001788321167883212,
"loss": 0.793,
"step": 636
},
{
"epoch": 1.3934919332786437,
"grad_norm": 0.01570739410817623,
"learning_rate": 0.0017858880778588808,
"loss": 0.9052,
"step": 637
},
{
"epoch": 1.3956795187312006,
"grad_norm": 0.006558465771377087,
"learning_rate": 0.00178345498783455,
"loss": 0.7475,
"step": 638
},
{
"epoch": 1.3978671041837571,
"grad_norm": 0.008167284540832043,
"learning_rate": 0.0017810218978102192,
"loss": 0.7801,
"step": 639
},
{
"epoch": 1.400054689636314,
"grad_norm": 0.007898733019828796,
"learning_rate": 0.001778588807785888,
"loss": 0.7694,
"step": 640
},
{
"epoch": 1.4022422750888706,
"grad_norm": 0.011702708899974823,
"learning_rate": 0.0017761557177615572,
"loss": 0.7104,
"step": 641
},
{
"epoch": 1.4044298605414274,
"grad_norm": 0.01823602244257927,
"learning_rate": 0.0017737226277372265,
"loss": 0.837,
"step": 642
},
{
"epoch": 1.4066174459939842,
"grad_norm": 0.019088082015514374,
"learning_rate": 0.0017712895377128954,
"loss": 0.8105,
"step": 643
},
{
"epoch": 1.408805031446541,
"grad_norm": 0.008738362230360508,
"learning_rate": 0.0017688564476885645,
"loss": 0.9423,
"step": 644
},
{
"epoch": 1.4109926168990976,
"grad_norm": 0.010799618437886238,
"learning_rate": 0.0017664233576642336,
"loss": 0.7173,
"step": 645
},
{
"epoch": 1.4131802023516544,
"grad_norm": 0.007114489562809467,
"learning_rate": 0.0017639902676399029,
"loss": 0.7322,
"step": 646
},
{
"epoch": 1.415367787804211,
"grad_norm": 0.021334782242774963,
"learning_rate": 0.0017615571776155717,
"loss": 0.7808,
"step": 647
},
{
"epoch": 1.4175553732567678,
"grad_norm": 0.06464671343564987,
"learning_rate": 0.0017591240875912408,
"loss": 0.8948,
"step": 648
},
{
"epoch": 1.4197429587093247,
"grad_norm": 0.016822345554828644,
"learning_rate": 0.0017566909975669101,
"loss": 0.7481,
"step": 649
},
{
"epoch": 1.4219305441618812,
"grad_norm": 0.01005722675472498,
"learning_rate": 0.001754257907542579,
"loss": 0.8027,
"step": 650
},
{
"epoch": 1.424118129614438,
"grad_norm": 0.01469690166413784,
"learning_rate": 0.0017518248175182481,
"loss": 0.7487,
"step": 651
},
{
"epoch": 1.4263057150669949,
"grad_norm": 0.013352830894291401,
"learning_rate": 0.0017493917274939174,
"loss": 0.9439,
"step": 652
},
{
"epoch": 1.4284933005195515,
"grad_norm": 0.01574932225048542,
"learning_rate": 0.0017469586374695863,
"loss": 0.7926,
"step": 653
},
{
"epoch": 1.4306808859721083,
"grad_norm": 0.012712597846984863,
"learning_rate": 0.0017445255474452554,
"loss": 0.8369,
"step": 654
},
{
"epoch": 1.4328684714246651,
"grad_norm": 0.018248263746500015,
"learning_rate": 0.0017420924574209247,
"loss": 0.6585,
"step": 655
},
{
"epoch": 1.4350560568772217,
"grad_norm": 0.0181551706045866,
"learning_rate": 0.0017396593673965938,
"loss": 0.8487,
"step": 656
},
{
"epoch": 1.4372436423297785,
"grad_norm": 0.009059487842023373,
"learning_rate": 0.0017372262773722627,
"loss": 0.8897,
"step": 657
},
{
"epoch": 1.4394312277823351,
"grad_norm": 0.007483980618417263,
"learning_rate": 0.001734793187347932,
"loss": 0.6673,
"step": 658
},
{
"epoch": 1.441618813234892,
"grad_norm": 0.007589507382363081,
"learning_rate": 0.001732360097323601,
"loss": 0.7013,
"step": 659
},
{
"epoch": 1.4438063986874488,
"grad_norm": 0.011493782512843609,
"learning_rate": 0.00172992700729927,
"loss": 0.5457,
"step": 660
},
{
"epoch": 1.4459939841400056,
"grad_norm": 0.027656735852360725,
"learning_rate": 0.0017274939172749392,
"loss": 0.7251,
"step": 661
},
{
"epoch": 1.4481815695925622,
"grad_norm": 0.022569406777620316,
"learning_rate": 0.0017250608272506083,
"loss": 0.7104,
"step": 662
},
{
"epoch": 1.450369155045119,
"grad_norm": 0.028735000640153885,
"learning_rate": 0.0017226277372262772,
"loss": 0.8682,
"step": 663
},
{
"epoch": 1.4525567404976756,
"grad_norm": 0.012052370235323906,
"learning_rate": 0.0017201946472019465,
"loss": 0.7508,
"step": 664
},
{
"epoch": 1.4547443259502324,
"grad_norm": 0.008707467466592789,
"learning_rate": 0.0017177615571776156,
"loss": 0.83,
"step": 665
},
{
"epoch": 1.4569319114027892,
"grad_norm": 0.01061397884041071,
"learning_rate": 0.0017153284671532847,
"loss": 0.9431,
"step": 666
},
{
"epoch": 1.459119496855346,
"grad_norm": 0.011903772130608559,
"learning_rate": 0.001712895377128954,
"loss": 0.723,
"step": 667
},
{
"epoch": 1.4613070823079026,
"grad_norm": 0.03922785073518753,
"learning_rate": 0.001710462287104623,
"loss": 0.6581,
"step": 668
},
{
"epoch": 1.4634946677604594,
"grad_norm": 0.014414667151868343,
"learning_rate": 0.001708029197080292,
"loss": 0.8511,
"step": 669
},
{
"epoch": 1.465682253213016,
"grad_norm": 0.010338617488741875,
"learning_rate": 0.0017055961070559613,
"loss": 0.7162,
"step": 670
},
{
"epoch": 1.4678698386655729,
"grad_norm": 0.011176107451319695,
"learning_rate": 0.0017031630170316302,
"loss": 0.8674,
"step": 671
},
{
"epoch": 1.4700574241181297,
"grad_norm": 0.014365148730576038,
"learning_rate": 0.0017007299270072993,
"loss": 0.7739,
"step": 672
},
{
"epoch": 1.4722450095706865,
"grad_norm": 0.019749363884329796,
"learning_rate": 0.0016982968369829686,
"loss": 0.7571,
"step": 673
},
{
"epoch": 1.474432595023243,
"grad_norm": 0.011761876754462719,
"learning_rate": 0.0016958637469586374,
"loss": 0.7208,
"step": 674
},
{
"epoch": 1.4766201804758,
"grad_norm": 0.025715123862028122,
"learning_rate": 0.0016934306569343065,
"loss": 0.7554,
"step": 675
},
{
"epoch": 1.4788077659283565,
"grad_norm": 0.028069710358977318,
"learning_rate": 0.0016909975669099759,
"loss": 0.6652,
"step": 676
},
{
"epoch": 1.4809953513809133,
"grad_norm": 0.02627987042069435,
"learning_rate": 0.001688564476885645,
"loss": 0.7924,
"step": 677
},
{
"epoch": 1.4831829368334701,
"grad_norm": 0.005099075846374035,
"learning_rate": 0.0016861313868613138,
"loss": 0.75,
"step": 678
},
{
"epoch": 1.4853705222860267,
"grad_norm": 0.007156622130423784,
"learning_rate": 0.0016836982968369831,
"loss": 0.8034,
"step": 679
},
{
"epoch": 1.4875581077385835,
"grad_norm": 0.008162274025380611,
"learning_rate": 0.0016812652068126522,
"loss": 0.6174,
"step": 680
},
{
"epoch": 1.4897456931911401,
"grad_norm": 0.01390012539923191,
"learning_rate": 0.001678832116788321,
"loss": 0.7813,
"step": 681
},
{
"epoch": 1.491933278643697,
"grad_norm": 0.03663848340511322,
"learning_rate": 0.0016763990267639902,
"loss": 0.6028,
"step": 682
},
{
"epoch": 1.4941208640962538,
"grad_norm": 0.01389587577432394,
"learning_rate": 0.0016739659367396595,
"loss": 0.9186,
"step": 683
},
{
"epoch": 1.4963084495488106,
"grad_norm": 0.007214284967631102,
"learning_rate": 0.0016715328467153284,
"loss": 1.0112,
"step": 684
},
{
"epoch": 1.4984960350013672,
"grad_norm": 0.01086746621876955,
"learning_rate": 0.0016690997566909975,
"loss": 0.7628,
"step": 685
},
{
"epoch": 1.500683620453924,
"grad_norm": 0.006750196684151888,
"learning_rate": 0.0016666666666666668,
"loss": 0.8025,
"step": 686
},
{
"epoch": 1.5028712059064806,
"grad_norm": 0.012172271497547626,
"learning_rate": 0.0016642335766423359,
"loss": 0.7559,
"step": 687
},
{
"epoch": 1.5050587913590374,
"grad_norm": 0.03923722356557846,
"learning_rate": 0.0016618004866180047,
"loss": 0.8227,
"step": 688
},
{
"epoch": 1.5072463768115942,
"grad_norm": 0.020949123427271843,
"learning_rate": 0.001659367396593674,
"loss": 0.7272,
"step": 689
},
{
"epoch": 1.509433962264151,
"grad_norm": 0.012365633621811867,
"learning_rate": 0.0016569343065693431,
"loss": 0.9127,
"step": 690
},
{
"epoch": 1.5116215477167076,
"grad_norm": 0.012725708074867725,
"learning_rate": 0.001654501216545012,
"loss": 0.7576,
"step": 691
},
{
"epoch": 1.5138091331692645,
"grad_norm": 0.014691759832203388,
"learning_rate": 0.0016520681265206813,
"loss": 0.6951,
"step": 692
},
{
"epoch": 1.515996718621821,
"grad_norm": 0.009719770401716232,
"learning_rate": 0.0016496350364963504,
"loss": 0.6947,
"step": 693
},
{
"epoch": 1.5181843040743779,
"grad_norm": 0.0074682896956801414,
"learning_rate": 0.0016472019464720193,
"loss": 0.8467,
"step": 694
},
{
"epoch": 1.5203718895269347,
"grad_norm": 0.011303418315947056,
"learning_rate": 0.0016447688564476886,
"loss": 0.7453,
"step": 695
},
{
"epoch": 1.5225594749794915,
"grad_norm": 0.009616104885935783,
"learning_rate": 0.0016423357664233577,
"loss": 0.8284,
"step": 696
},
{
"epoch": 1.524747060432048,
"grad_norm": 0.004562855698168278,
"learning_rate": 0.0016399026763990268,
"loss": 0.776,
"step": 697
},
{
"epoch": 1.5269346458846047,
"grad_norm": 0.0057913740165531635,
"learning_rate": 0.001637469586374696,
"loss": 0.5635,
"step": 698
},
{
"epoch": 1.5291222313371615,
"grad_norm": 0.011465840972959995,
"learning_rate": 0.001635036496350365,
"loss": 0.7466,
"step": 699
},
{
"epoch": 1.5313098167897183,
"grad_norm": 0.009356693364679813,
"learning_rate": 0.001632603406326034,
"loss": 0.7555,
"step": 700
},
{
"epoch": 1.5334974022422752,
"grad_norm": 0.01132314745336771,
"learning_rate": 0.0016301703163017034,
"loss": 0.6987,
"step": 701
},
{
"epoch": 1.535684987694832,
"grad_norm": 0.011162355542182922,
"learning_rate": 0.0016277372262773723,
"loss": 0.7895,
"step": 702
},
{
"epoch": 1.5378725731473886,
"grad_norm": 0.008752882480621338,
"learning_rate": 0.0016253041362530413,
"loss": 0.7829,
"step": 703
},
{
"epoch": 1.5400601585999452,
"grad_norm": 0.0067902375012636185,
"learning_rate": 0.0016228710462287107,
"loss": 0.7541,
"step": 704
},
{
"epoch": 1.542247744052502,
"grad_norm": 0.010398069396615028,
"learning_rate": 0.0016204379562043795,
"loss": 0.84,
"step": 705
},
{
"epoch": 1.5444353295050588,
"grad_norm": 0.006489087361842394,
"learning_rate": 0.0016180048661800486,
"loss": 0.7745,
"step": 706
},
{
"epoch": 1.5466229149576156,
"grad_norm": 0.00789352972060442,
"learning_rate": 0.001615571776155718,
"loss": 0.7006,
"step": 707
},
{
"epoch": 1.5488105004101724,
"grad_norm": 0.005906807258725166,
"learning_rate": 0.001613138686131387,
"loss": 0.826,
"step": 708
},
{
"epoch": 1.550998085862729,
"grad_norm": 0.006026630289852619,
"learning_rate": 0.001610705596107056,
"loss": 0.6783,
"step": 709
},
{
"epoch": 1.5531856713152856,
"grad_norm": 0.010388746857643127,
"learning_rate": 0.0016082725060827252,
"loss": 0.8531,
"step": 710
},
{
"epoch": 1.5553732567678424,
"grad_norm": 0.01053705345839262,
"learning_rate": 0.0016058394160583943,
"loss": 0.7257,
"step": 711
},
{
"epoch": 1.5575608422203993,
"grad_norm": 0.006276300642639399,
"learning_rate": 0.0016034063260340632,
"loss": 0.7996,
"step": 712
},
{
"epoch": 1.559748427672956,
"grad_norm": 0.006276302970945835,
"learning_rate": 0.0016009732360097325,
"loss": 0.8443,
"step": 713
},
{
"epoch": 1.5619360131255127,
"grad_norm": 0.008509790524840355,
"learning_rate": 0.0015985401459854016,
"loss": 0.7289,
"step": 714
},
{
"epoch": 1.5641235985780695,
"grad_norm": 0.01978105679154396,
"learning_rate": 0.0015961070559610705,
"loss": 0.846,
"step": 715
},
{
"epoch": 1.566311184030626,
"grad_norm": 0.012076129205524921,
"learning_rate": 0.0015936739659367398,
"loss": 0.7292,
"step": 716
},
{
"epoch": 1.568498769483183,
"grad_norm": 0.01716456562280655,
"learning_rate": 0.0015912408759124089,
"loss": 0.7655,
"step": 717
},
{
"epoch": 1.5706863549357397,
"grad_norm": 0.016601664945483208,
"learning_rate": 0.001588807785888078,
"loss": 0.7277,
"step": 718
},
{
"epoch": 1.5728739403882965,
"grad_norm": 0.010958652012050152,
"learning_rate": 0.0015863746958637468,
"loss": 0.7392,
"step": 719
},
{
"epoch": 1.5750615258408531,
"grad_norm": 0.007287964224815369,
"learning_rate": 0.0015839416058394161,
"loss": 0.822,
"step": 720
},
{
"epoch": 1.57724911129341,
"grad_norm": 0.010577067732810974,
"learning_rate": 0.0015815085158150852,
"loss": 0.732,
"step": 721
},
{
"epoch": 1.5794366967459665,
"grad_norm": 0.007742591667920351,
"learning_rate": 0.001579075425790754,
"loss": 0.8312,
"step": 722
},
{
"epoch": 1.5816242821985234,
"grad_norm": 0.009659879840910435,
"learning_rate": 0.0015766423357664234,
"loss": 0.8213,
"step": 723
},
{
"epoch": 1.5838118676510802,
"grad_norm": 0.015149835497140884,
"learning_rate": 0.0015742092457420925,
"loss": 0.6992,
"step": 724
},
{
"epoch": 1.585999453103637,
"grad_norm": 0.007888193242251873,
"learning_rate": 0.0015717761557177614,
"loss": 0.8853,
"step": 725
},
{
"epoch": 1.5881870385561936,
"grad_norm": 0.011876450851559639,
"learning_rate": 0.0015693430656934307,
"loss": 0.7645,
"step": 726
},
{
"epoch": 1.5903746240087502,
"grad_norm": 0.015837261453270912,
"learning_rate": 0.0015669099756690998,
"loss": 0.8061,
"step": 727
},
{
"epoch": 1.592562209461307,
"grad_norm": 0.006944081746041775,
"learning_rate": 0.0015644768856447687,
"loss": 0.6043,
"step": 728
},
{
"epoch": 1.5947497949138638,
"grad_norm": 0.01456182450056076,
"learning_rate": 0.0015620437956204382,
"loss": 0.9343,
"step": 729
},
{
"epoch": 1.5969373803664206,
"grad_norm": 0.007655070163309574,
"learning_rate": 0.001559610705596107,
"loss": 0.727,
"step": 730
},
{
"epoch": 1.5991249658189775,
"grad_norm": 0.014365557581186295,
"learning_rate": 0.0015571776155717761,
"loss": 0.6884,
"step": 731
},
{
"epoch": 1.601312551271534,
"grad_norm": 0.013196627609431744,
"learning_rate": 0.0015547445255474455,
"loss": 0.6522,
"step": 732
},
{
"epoch": 1.6035001367240906,
"grad_norm": 0.0069740209728479385,
"learning_rate": 0.0015523114355231143,
"loss": 0.812,
"step": 733
},
{
"epoch": 1.6056877221766475,
"grad_norm": 0.018947165459394455,
"learning_rate": 0.0015498783454987834,
"loss": 0.7464,
"step": 734
},
{
"epoch": 1.6078753076292043,
"grad_norm": 0.02975570783019066,
"learning_rate": 0.0015474452554744527,
"loss": 1.0338,
"step": 735
},
{
"epoch": 1.610062893081761,
"grad_norm": 0.01144670695066452,
"learning_rate": 0.0015450121654501216,
"loss": 0.9582,
"step": 736
},
{
"epoch": 1.612250478534318,
"grad_norm": 0.08359838277101517,
"learning_rate": 0.0015425790754257907,
"loss": 0.6188,
"step": 737
},
{
"epoch": 1.6144380639868745,
"grad_norm": 0.005582269746810198,
"learning_rate": 0.00154014598540146,
"loss": 0.6557,
"step": 738
},
{
"epoch": 1.616625649439431,
"grad_norm": 0.008966202847659588,
"learning_rate": 0.001537712895377129,
"loss": 0.6564,
"step": 739
},
{
"epoch": 1.618813234891988,
"grad_norm": 0.011794374324381351,
"learning_rate": 0.001535279805352798,
"loss": 0.8051,
"step": 740
},
{
"epoch": 1.6210008203445447,
"grad_norm": 0.00766439875587821,
"learning_rate": 0.0015328467153284673,
"loss": 0.8145,
"step": 741
},
{
"epoch": 1.6231884057971016,
"grad_norm": 0.014379739761352539,
"learning_rate": 0.0015304136253041364,
"loss": 0.8658,
"step": 742
},
{
"epoch": 1.6253759912496581,
"grad_norm": 0.01025471929460764,
"learning_rate": 0.0015279805352798053,
"loss": 0.6969,
"step": 743
},
{
"epoch": 1.627563576702215,
"grad_norm": 0.012737879529595375,
"learning_rate": 0.0015255474452554746,
"loss": 0.9006,
"step": 744
},
{
"epoch": 1.6297511621547716,
"grad_norm": 0.0110158147290349,
"learning_rate": 0.0015231143552311437,
"loss": 0.7326,
"step": 745
},
{
"epoch": 1.6319387476073284,
"grad_norm": 0.011220619082450867,
"learning_rate": 0.0015206812652068125,
"loss": 0.8275,
"step": 746
},
{
"epoch": 1.6341263330598852,
"grad_norm": 0.00941223930567503,
"learning_rate": 0.0015182481751824818,
"loss": 0.8187,
"step": 747
},
{
"epoch": 1.636313918512442,
"grad_norm": 0.004144694656133652,
"learning_rate": 0.001515815085158151,
"loss": 0.7248,
"step": 748
},
{
"epoch": 1.6385015039649986,
"grad_norm": 0.013639383018016815,
"learning_rate": 0.0015133819951338198,
"loss": 0.7966,
"step": 749
},
{
"epoch": 1.6406890894175554,
"grad_norm": 0.006385320797562599,
"learning_rate": 0.0015109489051094893,
"loss": 0.6772,
"step": 750
},
{
"epoch": 1.642876674870112,
"grad_norm": 0.011585132218897343,
"learning_rate": 0.0015085158150851582,
"loss": 0.6696,
"step": 751
},
{
"epoch": 1.6450642603226688,
"grad_norm": 0.023672277107834816,
"learning_rate": 0.0015060827250608273,
"loss": 0.6978,
"step": 752
},
{
"epoch": 1.6472518457752257,
"grad_norm": 0.014683379791676998,
"learning_rate": 0.0015036496350364966,
"loss": 0.5735,
"step": 753
},
{
"epoch": 1.6494394312277825,
"grad_norm": 0.010881925001740456,
"learning_rate": 0.0015012165450121655,
"loss": 0.6773,
"step": 754
},
{
"epoch": 1.651627016680339,
"grad_norm": 0.009006233885884285,
"learning_rate": 0.0014987834549878346,
"loss": 0.7773,
"step": 755
},
{
"epoch": 1.6538146021328957,
"grad_norm": 0.01426916103810072,
"learning_rate": 0.0014963503649635037,
"loss": 0.7436,
"step": 756
},
{
"epoch": 1.6560021875854525,
"grad_norm": 0.005649265833199024,
"learning_rate": 0.0014939172749391728,
"loss": 0.7041,
"step": 757
},
{
"epoch": 1.6581897730380093,
"grad_norm": 0.008767529390752316,
"learning_rate": 0.0014914841849148419,
"loss": 0.6701,
"step": 758
},
{
"epoch": 1.6603773584905661,
"grad_norm": 0.007580756675451994,
"learning_rate": 0.001489051094890511,
"loss": 0.6593,
"step": 759
},
{
"epoch": 1.662564943943123,
"grad_norm": 0.010842681862413883,
"learning_rate": 0.0014866180048661803,
"loss": 0.9414,
"step": 760
},
{
"epoch": 1.6647525293956795,
"grad_norm": 0.008890979923307896,
"learning_rate": 0.0014841849148418491,
"loss": 0.8333,
"step": 761
},
{
"epoch": 1.6669401148482361,
"grad_norm": 0.00815370213240385,
"learning_rate": 0.0014817518248175182,
"loss": 0.8596,
"step": 762
},
{
"epoch": 1.669127700300793,
"grad_norm": 0.007434117142111063,
"learning_rate": 0.0014793187347931875,
"loss": 0.631,
"step": 763
},
{
"epoch": 1.6713152857533498,
"grad_norm": 0.007965626195073128,
"learning_rate": 0.0014768856447688564,
"loss": 0.7377,
"step": 764
},
{
"epoch": 1.6735028712059066,
"grad_norm": 0.014369670301675797,
"learning_rate": 0.0014744525547445257,
"loss": 0.6907,
"step": 765
},
{
"epoch": 1.6756904566584632,
"grad_norm": 0.013002739287912846,
"learning_rate": 0.0014720194647201946,
"loss": 0.8491,
"step": 766
},
{
"epoch": 1.67787804211102,
"grad_norm": 0.008742110803723335,
"learning_rate": 0.0014695863746958637,
"loss": 1.0319,
"step": 767
},
{
"epoch": 1.6800656275635766,
"grad_norm": 0.01362073328346014,
"learning_rate": 0.001467153284671533,
"loss": 0.596,
"step": 768
},
{
"epoch": 1.6822532130161334,
"grad_norm": 0.007842877879738808,
"learning_rate": 0.0014647201946472019,
"loss": 0.848,
"step": 769
},
{
"epoch": 1.6844407984686902,
"grad_norm": 0.007685767021030188,
"learning_rate": 0.001462287104622871,
"loss": 0.6811,
"step": 770
},
{
"epoch": 1.686628383921247,
"grad_norm": 0.07299596816301346,
"learning_rate": 0.0014598540145985403,
"loss": 0.7739,
"step": 771
},
{
"epoch": 1.6888159693738036,
"grad_norm": 0.02475287765264511,
"learning_rate": 0.0014574209245742091,
"loss": 0.8412,
"step": 772
},
{
"epoch": 1.6910035548263604,
"grad_norm": 0.02310485951602459,
"learning_rate": 0.0014549878345498785,
"loss": 0.7707,
"step": 773
},
{
"epoch": 1.693191140278917,
"grad_norm": 0.006614830810576677,
"learning_rate": 0.0014525547445255475,
"loss": 0.9116,
"step": 774
},
{
"epoch": 1.6953787257314739,
"grad_norm": 0.017114151269197464,
"learning_rate": 0.0014501216545012164,
"loss": 0.7767,
"step": 775
},
{
"epoch": 1.6975663111840307,
"grad_norm": 0.007972135208547115,
"learning_rate": 0.0014476885644768857,
"loss": 0.8053,
"step": 776
},
{
"epoch": 1.6997538966365875,
"grad_norm": 0.013452711515128613,
"learning_rate": 0.0014452554744525548,
"loss": 0.633,
"step": 777
},
{
"epoch": 1.701941482089144,
"grad_norm": 0.01562053058296442,
"learning_rate": 0.001442822384428224,
"loss": 0.8312,
"step": 778
},
{
"epoch": 1.7041290675417007,
"grad_norm": 0.006510770879685879,
"learning_rate": 0.001440389294403893,
"loss": 0.7721,
"step": 779
},
{
"epoch": 1.7063166529942575,
"grad_norm": 0.011892448179423809,
"learning_rate": 0.001437956204379562,
"loss": 0.6629,
"step": 780
},
{
"epoch": 1.7085042384468143,
"grad_norm": 0.005237538833171129,
"learning_rate": 0.0014355231143552312,
"loss": 0.5767,
"step": 781
},
{
"epoch": 1.7106918238993711,
"grad_norm": 0.020627424120903015,
"learning_rate": 0.0014330900243309003,
"loss": 0.8974,
"step": 782
},
{
"epoch": 1.712879409351928,
"grad_norm": 0.012742357328534126,
"learning_rate": 0.0014306569343065694,
"loss": 0.5843,
"step": 783
},
{
"epoch": 1.7150669948044845,
"grad_norm": 0.011114447377622128,
"learning_rate": 0.0014282238442822385,
"loss": 0.9336,
"step": 784
},
{
"epoch": 1.7172545802570411,
"grad_norm": 0.01212508138269186,
"learning_rate": 0.0014257907542579076,
"loss": 0.853,
"step": 785
},
{
"epoch": 1.719442165709598,
"grad_norm": 0.006842518225312233,
"learning_rate": 0.0014233576642335767,
"loss": 0.8329,
"step": 786
},
{
"epoch": 1.7216297511621548,
"grad_norm": 0.008684076368808746,
"learning_rate": 0.0014209245742092457,
"loss": 0.8503,
"step": 787
},
{
"epoch": 1.7238173366147116,
"grad_norm": 0.009845465421676636,
"learning_rate": 0.0014184914841849148,
"loss": 0.9911,
"step": 788
},
{
"epoch": 1.7260049220672684,
"grad_norm": 0.007301978301256895,
"learning_rate": 0.001416058394160584,
"loss": 0.6684,
"step": 789
},
{
"epoch": 1.728192507519825,
"grad_norm": 0.010263817384839058,
"learning_rate": 0.001413625304136253,
"loss": 0.6852,
"step": 790
},
{
"epoch": 1.7303800929723816,
"grad_norm": 0.012078475207090378,
"learning_rate": 0.0014111922141119221,
"loss": 0.6509,
"step": 791
},
{
"epoch": 1.7325676784249384,
"grad_norm": 0.012108572758734226,
"learning_rate": 0.0014087591240875912,
"loss": 0.7183,
"step": 792
},
{
"epoch": 1.7347552638774952,
"grad_norm": 0.011477826163172722,
"learning_rate": 0.0014063260340632603,
"loss": 0.8856,
"step": 793
},
{
"epoch": 1.736942849330052,
"grad_norm": 0.007066864520311356,
"learning_rate": 0.0014038929440389296,
"loss": 0.6114,
"step": 794
},
{
"epoch": 1.7391304347826086,
"grad_norm": 0.011538154445588589,
"learning_rate": 0.0014014598540145985,
"loss": 0.6716,
"step": 795
},
{
"epoch": 1.7413180202351655,
"grad_norm": 0.008611057884991169,
"learning_rate": 0.0013990267639902676,
"loss": 0.9979,
"step": 796
},
{
"epoch": 1.743505605687722,
"grad_norm": 0.013740317896008492,
"learning_rate": 0.0013965936739659369,
"loss": 0.8166,
"step": 797
},
{
"epoch": 1.7456931911402789,
"grad_norm": 0.008636080659925938,
"learning_rate": 0.0013941605839416058,
"loss": 0.8138,
"step": 798
},
{
"epoch": 1.7478807765928357,
"grad_norm": 0.008637238293886185,
"learning_rate": 0.001391727493917275,
"loss": 0.9225,
"step": 799
},
{
"epoch": 1.7500683620453925,
"grad_norm": 0.022517461329698563,
"learning_rate": 0.0013892944038929442,
"loss": 0.735,
"step": 800
},
{
"epoch": 1.752255947497949,
"grad_norm": 0.005302282981574535,
"learning_rate": 0.001386861313868613,
"loss": 0.657,
"step": 801
},
{
"epoch": 1.754443532950506,
"grad_norm": 0.04943990707397461,
"learning_rate": 0.0013844282238442824,
"loss": 0.623,
"step": 802
},
{
"epoch": 1.7566311184030625,
"grad_norm": 0.011758695356547832,
"learning_rate": 0.0013819951338199512,
"loss": 0.8038,
"step": 803
},
{
"epoch": 1.7588187038556193,
"grad_norm": 0.009712104685604572,
"learning_rate": 0.0013795620437956205,
"loss": 0.7268,
"step": 804
},
{
"epoch": 1.7610062893081762,
"grad_norm": 0.007741864304989576,
"learning_rate": 0.0013771289537712896,
"loss": 0.7049,
"step": 805
},
{
"epoch": 1.763193874760733,
"grad_norm": 0.010713865980505943,
"learning_rate": 0.0013746958637469585,
"loss": 0.6425,
"step": 806
},
{
"epoch": 1.7653814602132896,
"grad_norm": 0.006576141808182001,
"learning_rate": 0.0013722627737226278,
"loss": 0.7601,
"step": 807
},
{
"epoch": 1.7675690456658462,
"grad_norm": 0.007796050515025854,
"learning_rate": 0.001369829683698297,
"loss": 0.659,
"step": 808
},
{
"epoch": 1.769756631118403,
"grad_norm": 0.01460753008723259,
"learning_rate": 0.001367396593673966,
"loss": 0.769,
"step": 809
},
{
"epoch": 1.7719442165709598,
"grad_norm": 0.010747969150543213,
"learning_rate": 0.001364963503649635,
"loss": 0.8531,
"step": 810
},
{
"epoch": 1.7741318020235166,
"grad_norm": 0.011500733904540539,
"learning_rate": 0.0013625304136253042,
"loss": 0.7294,
"step": 811
},
{
"epoch": 1.7763193874760734,
"grad_norm": 0.013433235697448254,
"learning_rate": 0.0013600973236009733,
"loss": 0.6442,
"step": 812
},
{
"epoch": 1.77850697292863,
"grad_norm": 0.019317343831062317,
"learning_rate": 0.0013576642335766424,
"loss": 0.6254,
"step": 813
},
{
"epoch": 1.7806945583811866,
"grad_norm": 0.020062780007719994,
"learning_rate": 0.0013552311435523115,
"loss": 0.6957,
"step": 814
},
{
"epoch": 1.7828821438337434,
"grad_norm": 0.00756926229223609,
"learning_rate": 0.0013527980535279806,
"loss": 0.7532,
"step": 815
},
{
"epoch": 1.7850697292863003,
"grad_norm": 0.0089380769059062,
"learning_rate": 0.0013503649635036496,
"loss": 0.6534,
"step": 816
},
{
"epoch": 1.787257314738857,
"grad_norm": 0.006980338133871555,
"learning_rate": 0.0013479318734793187,
"loss": 0.7314,
"step": 817
},
{
"epoch": 1.789444900191414,
"grad_norm": 0.0074529629200696945,
"learning_rate": 0.0013454987834549878,
"loss": 0.8291,
"step": 818
},
{
"epoch": 1.7916324856439705,
"grad_norm": 0.02699979580938816,
"learning_rate": 0.001343065693430657,
"loss": 0.7249,
"step": 819
},
{
"epoch": 1.793820071096527,
"grad_norm": 0.008204830810427666,
"learning_rate": 0.001340632603406326,
"loss": 0.7446,
"step": 820
},
{
"epoch": 1.796007656549084,
"grad_norm": 0.006959575694054365,
"learning_rate": 0.001338199513381995,
"loss": 0.6694,
"step": 821
},
{
"epoch": 1.7981952420016407,
"grad_norm": 0.006019539665430784,
"learning_rate": 0.0013357664233576642,
"loss": 0.7947,
"step": 822
},
{
"epoch": 1.8003828274541975,
"grad_norm": 0.007515772711485624,
"learning_rate": 0.0013333333333333333,
"loss": 0.6259,
"step": 823
},
{
"epoch": 1.8025704129067541,
"grad_norm": 0.0231679268181324,
"learning_rate": 0.0013309002433090024,
"loss": 0.5702,
"step": 824
},
{
"epoch": 1.804757998359311,
"grad_norm": 0.009831500239670277,
"learning_rate": 0.0013284671532846717,
"loss": 0.7197,
"step": 825
},
{
"epoch": 1.8069455838118675,
"grad_norm": 0.011389415711164474,
"learning_rate": 0.0013260340632603406,
"loss": 0.8466,
"step": 826
},
{
"epoch": 1.8091331692644244,
"grad_norm": 0.010654733516275883,
"learning_rate": 0.0013236009732360097,
"loss": 0.7456,
"step": 827
},
{
"epoch": 1.8113207547169812,
"grad_norm": 0.010770871303975582,
"learning_rate": 0.001321167883211679,
"loss": 0.6827,
"step": 828
},
{
"epoch": 1.813508340169538,
"grad_norm": 0.00828484632074833,
"learning_rate": 0.0013187347931873478,
"loss": 0.6794,
"step": 829
},
{
"epoch": 1.8156959256220946,
"grad_norm": 0.00973398145288229,
"learning_rate": 0.0013163017031630172,
"loss": 0.7354,
"step": 830
},
{
"epoch": 1.8178835110746514,
"grad_norm": 0.00983220711350441,
"learning_rate": 0.0013138686131386862,
"loss": 0.8531,
"step": 831
},
{
"epoch": 1.820071096527208,
"grad_norm": 0.02620159089565277,
"learning_rate": 0.0013114355231143551,
"loss": 0.7631,
"step": 832
},
{
"epoch": 1.8222586819797648,
"grad_norm": 0.057880647480487823,
"learning_rate": 0.0013090024330900244,
"loss": 0.9336,
"step": 833
},
{
"epoch": 1.8244462674323216,
"grad_norm": 0.011240589432418346,
"learning_rate": 0.0013065693430656935,
"loss": 0.5887,
"step": 834
},
{
"epoch": 1.8266338528848785,
"grad_norm": 0.012356660328805447,
"learning_rate": 0.0013041362530413626,
"loss": 0.702,
"step": 835
},
{
"epoch": 1.828821438337435,
"grad_norm": 0.006840168032795191,
"learning_rate": 0.0013017031630170317,
"loss": 0.756,
"step": 836
},
{
"epoch": 1.8310090237899916,
"grad_norm": 0.005550102796405554,
"learning_rate": 0.0012992700729927008,
"loss": 0.7161,
"step": 837
},
{
"epoch": 1.8331966092425485,
"grad_norm": 0.0120685501024127,
"learning_rate": 0.0012968369829683699,
"loss": 0.9234,
"step": 838
},
{
"epoch": 1.8353841946951053,
"grad_norm": 0.008514792658388615,
"learning_rate": 0.001294403892944039,
"loss": 0.5988,
"step": 839
},
{
"epoch": 1.837571780147662,
"grad_norm": 0.019344119355082512,
"learning_rate": 0.001291970802919708,
"loss": 0.8419,
"step": 840
},
{
"epoch": 1.839759365600219,
"grad_norm": 0.01257373858243227,
"learning_rate": 0.0012895377128953772,
"loss": 0.6785,
"step": 841
},
{
"epoch": 1.8419469510527755,
"grad_norm": 0.022899962961673737,
"learning_rate": 0.0012871046228710463,
"loss": 0.6617,
"step": 842
},
{
"epoch": 1.844134536505332,
"grad_norm": 0.012275392189621925,
"learning_rate": 0.0012846715328467154,
"loss": 0.8096,
"step": 843
},
{
"epoch": 1.846322121957889,
"grad_norm": 0.01191315334290266,
"learning_rate": 0.0012822384428223844,
"loss": 0.7757,
"step": 844
},
{
"epoch": 1.8485097074104457,
"grad_norm": 0.012164206244051456,
"learning_rate": 0.0012798053527980535,
"loss": 0.7284,
"step": 845
},
{
"epoch": 1.8506972928630026,
"grad_norm": 0.007747825235128403,
"learning_rate": 0.0012773722627737226,
"loss": 0.673,
"step": 846
},
{
"epoch": 1.8528848783155591,
"grad_norm": 0.01633123680949211,
"learning_rate": 0.0012749391727493917,
"loss": 0.6006,
"step": 847
},
{
"epoch": 1.855072463768116,
"grad_norm": 0.008600953966379166,
"learning_rate": 0.0012725060827250608,
"loss": 0.7354,
"step": 848
},
{
"epoch": 1.8572600492206726,
"grad_norm": 0.008487503044307232,
"learning_rate": 0.00127007299270073,
"loss": 0.689,
"step": 849
},
{
"epoch": 1.8594476346732294,
"grad_norm": 0.01615467295050621,
"learning_rate": 0.001267639902676399,
"loss": 0.7461,
"step": 850
},
{
"epoch": 1.8616352201257862,
"grad_norm": 0.008541187271475792,
"learning_rate": 0.0012652068126520683,
"loss": 0.6958,
"step": 851
},
{
"epoch": 1.863822805578343,
"grad_norm": 0.01053849421441555,
"learning_rate": 0.0012627737226277372,
"loss": 0.6786,
"step": 852
},
{
"epoch": 1.8660103910308996,
"grad_norm": 0.008857163600623608,
"learning_rate": 0.0012603406326034063,
"loss": 0.6645,
"step": 853
},
{
"epoch": 1.8681979764834564,
"grad_norm": 0.006793574895709753,
"learning_rate": 0.0012579075425790756,
"loss": 0.6311,
"step": 854
},
{
"epoch": 1.870385561936013,
"grad_norm": 0.01936703361570835,
"learning_rate": 0.0012554744525547445,
"loss": 0.9318,
"step": 855
},
{
"epoch": 1.8725731473885698,
"grad_norm": 0.009839971549808979,
"learning_rate": 0.0012530413625304138,
"loss": 0.7309,
"step": 856
},
{
"epoch": 1.8747607328411267,
"grad_norm": 0.010399356484413147,
"learning_rate": 0.0012506082725060829,
"loss": 0.8351,
"step": 857
},
{
"epoch": 1.8769483182936835,
"grad_norm": 0.014294488355517387,
"learning_rate": 0.0012481751824817517,
"loss": 0.6187,
"step": 858
},
{
"epoch": 1.87913590374624,
"grad_norm": 0.011614672839641571,
"learning_rate": 0.001245742092457421,
"loss": 0.9295,
"step": 859
},
{
"epoch": 1.8813234891987969,
"grad_norm": 0.015355818904936314,
"learning_rate": 0.00124330900243309,
"loss": 0.5266,
"step": 860
},
{
"epoch": 1.8835110746513535,
"grad_norm": 0.011674858629703522,
"learning_rate": 0.0012408759124087592,
"loss": 0.6467,
"step": 861
},
{
"epoch": 1.8856986601039103,
"grad_norm": 0.013345809653401375,
"learning_rate": 0.0012384428223844283,
"loss": 0.8166,
"step": 862
},
{
"epoch": 1.8878862455564671,
"grad_norm": 0.009595265612006187,
"learning_rate": 0.0012360097323600972,
"loss": 0.7704,
"step": 863
},
{
"epoch": 1.890073831009024,
"grad_norm": 0.01896647922694683,
"learning_rate": 0.0012335766423357665,
"loss": 0.7815,
"step": 864
},
{
"epoch": 1.8922614164615805,
"grad_norm": 0.017639558762311935,
"learning_rate": 0.0012311435523114356,
"loss": 0.7979,
"step": 865
},
{
"epoch": 1.8944490019141371,
"grad_norm": 0.022902049124240875,
"learning_rate": 0.0012287104622871047,
"loss": 0.8904,
"step": 866
},
{
"epoch": 1.896636587366694,
"grad_norm": 0.0124649154022336,
"learning_rate": 0.0012262773722627738,
"loss": 0.7693,
"step": 867
},
{
"epoch": 1.8988241728192508,
"grad_norm": 0.007474742829799652,
"learning_rate": 0.0012238442822384429,
"loss": 0.6641,
"step": 868
},
{
"epoch": 1.9010117582718076,
"grad_norm": 0.008987569250166416,
"learning_rate": 0.001221411192214112,
"loss": 0.6378,
"step": 869
},
{
"epoch": 1.9031993437243644,
"grad_norm": 0.009300309233367443,
"learning_rate": 0.001218978102189781,
"loss": 0.7426,
"step": 870
},
{
"epoch": 1.905386929176921,
"grad_norm": 0.01408142875880003,
"learning_rate": 0.0012165450121654502,
"loss": 0.7824,
"step": 871
},
{
"epoch": 1.9075745146294776,
"grad_norm": 0.00678917346522212,
"learning_rate": 0.0012141119221411192,
"loss": 0.7978,
"step": 872
},
{
"epoch": 1.9097621000820344,
"grad_norm": 0.010661943815648556,
"learning_rate": 0.0012116788321167883,
"loss": 0.6591,
"step": 873
},
{
"epoch": 1.9119496855345912,
"grad_norm": 0.009882554411888123,
"learning_rate": 0.0012092457420924574,
"loss": 0.7443,
"step": 874
},
{
"epoch": 1.914137270987148,
"grad_norm": 0.12100229412317276,
"learning_rate": 0.0012068126520681265,
"loss": 0.8035,
"step": 875
},
{
"epoch": 1.9163248564397046,
"grad_norm": 0.01500593964010477,
"learning_rate": 0.0012043795620437956,
"loss": 0.8671,
"step": 876
},
{
"epoch": 1.9185124418922614,
"grad_norm": 0.01351536437869072,
"learning_rate": 0.0012019464720194647,
"loss": 0.824,
"step": 877
},
{
"epoch": 1.920700027344818,
"grad_norm": 0.02334493212401867,
"learning_rate": 0.0011995133819951338,
"loss": 0.7728,
"step": 878
},
{
"epoch": 1.9228876127973749,
"grad_norm": 0.04414600878953934,
"learning_rate": 0.001197080291970803,
"loss": 0.7811,
"step": 879
},
{
"epoch": 1.9250751982499317,
"grad_norm": 0.03064621239900589,
"learning_rate": 0.001194647201946472,
"loss": 0.8812,
"step": 880
},
{
"epoch": 1.9272627837024885,
"grad_norm": 0.010438323952257633,
"learning_rate": 0.001192214111922141,
"loss": 0.8027,
"step": 881
},
{
"epoch": 1.929450369155045,
"grad_norm": 0.016364533454179764,
"learning_rate": 0.0011897810218978104,
"loss": 0.6239,
"step": 882
},
{
"epoch": 1.931637954607602,
"grad_norm": 0.02069861628115177,
"learning_rate": 0.0011873479318734793,
"loss": 0.8137,
"step": 883
},
{
"epoch": 1.9338255400601585,
"grad_norm": 0.017191501334309578,
"learning_rate": 0.0011849148418491484,
"loss": 0.8052,
"step": 884
},
{
"epoch": 1.9360131255127153,
"grad_norm": 0.014077574014663696,
"learning_rate": 0.0011824817518248177,
"loss": 0.8584,
"step": 885
},
{
"epoch": 1.9382007109652721,
"grad_norm": 0.009209788404405117,
"learning_rate": 0.0011800486618004865,
"loss": 0.6426,
"step": 886
},
{
"epoch": 1.940388296417829,
"grad_norm": 0.026021014899015427,
"learning_rate": 0.0011776155717761558,
"loss": 0.7457,
"step": 887
},
{
"epoch": 1.9425758818703855,
"grad_norm": 0.024019265547394753,
"learning_rate": 0.001175182481751825,
"loss": 0.869,
"step": 888
},
{
"epoch": 1.9447634673229421,
"grad_norm": 0.020230406895279884,
"learning_rate": 0.0011727493917274938,
"loss": 0.8532,
"step": 889
},
{
"epoch": 1.946951052775499,
"grad_norm": 0.018076736479997635,
"learning_rate": 0.0011703163017031631,
"loss": 0.7276,
"step": 890
},
{
"epoch": 1.9491386382280558,
"grad_norm": 0.019679049029946327,
"learning_rate": 0.0011678832116788322,
"loss": 0.7214,
"step": 891
},
{
"epoch": 1.9513262236806126,
"grad_norm": 0.010772393085062504,
"learning_rate": 0.0011654501216545013,
"loss": 0.6786,
"step": 892
},
{
"epoch": 1.9535138091331694,
"grad_norm": 0.010874917730689049,
"learning_rate": 0.0011630170316301704,
"loss": 0.7272,
"step": 893
},
{
"epoch": 1.955701394585726,
"grad_norm": 0.00815314520150423,
"learning_rate": 0.0011605839416058395,
"loss": 0.8908,
"step": 894
},
{
"epoch": 1.9578889800382826,
"grad_norm": 0.008539310656487942,
"learning_rate": 0.0011581508515815086,
"loss": 0.6394,
"step": 895
},
{
"epoch": 1.9600765654908394,
"grad_norm": 0.039017412811517715,
"learning_rate": 0.0011557177615571777,
"loss": 0.6505,
"step": 896
},
{
"epoch": 1.9622641509433962,
"grad_norm": 0.009175320155918598,
"learning_rate": 0.0011532846715328468,
"loss": 0.975,
"step": 897
},
{
"epoch": 1.964451736395953,
"grad_norm": 0.014542749151587486,
"learning_rate": 0.0011508515815085159,
"loss": 0.7222,
"step": 898
},
{
"epoch": 1.9666393218485099,
"grad_norm": 0.01856316812336445,
"learning_rate": 0.001148418491484185,
"loss": 0.7575,
"step": 899
},
{
"epoch": 1.9688269073010665,
"grad_norm": 0.007601718418300152,
"learning_rate": 0.001145985401459854,
"loss": 0.7233,
"step": 900
},
{
"epoch": 1.971014492753623,
"grad_norm": 0.034239862114191055,
"learning_rate": 0.0011435523114355231,
"loss": 0.6989,
"step": 901
},
{
"epoch": 1.9732020782061799,
"grad_norm": 0.00851233210414648,
"learning_rate": 0.0011411192214111922,
"loss": 0.8321,
"step": 902
},
{
"epoch": 1.9753896636587367,
"grad_norm": 0.009412054903805256,
"learning_rate": 0.0011386861313868613,
"loss": 0.7139,
"step": 903
},
{
"epoch": 1.9775772491112935,
"grad_norm": 0.012049161829054356,
"learning_rate": 0.0011362530413625304,
"loss": 0.6989,
"step": 904
},
{
"epoch": 1.97976483456385,
"grad_norm": 0.010931652970612049,
"learning_rate": 0.0011338199513381995,
"loss": 0.8747,
"step": 905
},
{
"epoch": 1.981952420016407,
"grad_norm": 0.015494965016841888,
"learning_rate": 0.0011313868613138686,
"loss": 0.8644,
"step": 906
},
{
"epoch": 1.9841400054689635,
"grad_norm": 0.012480970472097397,
"learning_rate": 0.0011289537712895377,
"loss": 0.907,
"step": 907
},
{
"epoch": 1.9863275909215203,
"grad_norm": 0.01492912694811821,
"learning_rate": 0.001126520681265207,
"loss": 0.7421,
"step": 908
},
{
"epoch": 1.9885151763740772,
"grad_norm": 0.012027468532323837,
"learning_rate": 0.0011240875912408759,
"loss": 0.9274,
"step": 909
},
{
"epoch": 1.990702761826634,
"grad_norm": 0.014835814945399761,
"learning_rate": 0.001121654501216545,
"loss": 0.8337,
"step": 910
},
{
"epoch": 1.9928903472791906,
"grad_norm": 0.008667545393109322,
"learning_rate": 0.0011192214111922143,
"loss": 0.6117,
"step": 911
},
{
"epoch": 1.9950779327317474,
"grad_norm": 0.01624200865626335,
"learning_rate": 0.0011167883211678832,
"loss": 0.8712,
"step": 912
},
{
"epoch": 1.997265518184304,
"grad_norm": 0.008188914507627487,
"learning_rate": 0.0011143552311435525,
"loss": 0.8495,
"step": 913
},
{
"epoch": 1.9994531036368608,
"grad_norm": 0.013718970119953156,
"learning_rate": 0.0011119221411192213,
"loss": 0.8417,
"step": 914
},
{
"epoch": 2.0016406890894176,
"grad_norm": 0.00691073015332222,
"learning_rate": 0.0011094890510948904,
"loss": 0.7033,
"step": 915
},
{
"epoch": 2.0038282745419744,
"grad_norm": 0.017321942374110222,
"learning_rate": 0.0011070559610705597,
"loss": 0.7979,
"step": 916
},
{
"epoch": 2.0060158599945312,
"grad_norm": 0.007781198713928461,
"learning_rate": 0.0011046228710462286,
"loss": 0.6795,
"step": 917
},
{
"epoch": 2.0082034454470876,
"grad_norm": 0.007755633443593979,
"learning_rate": 0.001102189781021898,
"loss": 0.6363,
"step": 918
},
{
"epoch": 2.0103910308996444,
"grad_norm": 0.015355097129940987,
"learning_rate": 0.001099756690997567,
"loss": 0.7684,
"step": 919
},
{
"epoch": 2.0125786163522013,
"grad_norm": 0.009972341358661652,
"learning_rate": 0.001097323600973236,
"loss": 0.7659,
"step": 920
},
{
"epoch": 2.014766201804758,
"grad_norm": 0.00998846534639597,
"learning_rate": 0.0010948905109489052,
"loss": 0.918,
"step": 921
},
{
"epoch": 2.016953787257315,
"grad_norm": 0.007050537038594484,
"learning_rate": 0.0010924574209245743,
"loss": 0.7083,
"step": 922
},
{
"epoch": 2.0191413727098713,
"grad_norm": 0.008426625281572342,
"learning_rate": 0.0010900243309002432,
"loss": 0.7962,
"step": 923
},
{
"epoch": 2.021328958162428,
"grad_norm": 0.009424027986824512,
"learning_rate": 0.0010875912408759125,
"loss": 0.7369,
"step": 924
},
{
"epoch": 2.023516543614985,
"grad_norm": 0.012517026625573635,
"learning_rate": 0.0010851581508515816,
"loss": 0.8281,
"step": 925
},
{
"epoch": 2.0257041290675417,
"grad_norm": 0.016427017748355865,
"learning_rate": 0.0010827250608272507,
"loss": 0.7808,
"step": 926
},
{
"epoch": 2.0278917145200985,
"grad_norm": 0.011162400245666504,
"learning_rate": 0.0010802919708029198,
"loss": 0.8512,
"step": 927
},
{
"epoch": 2.0300792999726553,
"grad_norm": 0.025822371244430542,
"learning_rate": 0.0010778588807785888,
"loss": 0.6347,
"step": 928
},
{
"epoch": 2.0322668854252117,
"grad_norm": 0.008243129588663578,
"learning_rate": 0.001075425790754258,
"loss": 0.7126,
"step": 929
},
{
"epoch": 2.0344544708777685,
"grad_norm": 0.01245404314249754,
"learning_rate": 0.001072992700729927,
"loss": 0.6111,
"step": 930
},
{
"epoch": 2.0366420563303254,
"grad_norm": 0.006443020887672901,
"learning_rate": 0.0010705596107055961,
"loss": 0.6287,
"step": 931
},
{
"epoch": 2.038829641782882,
"grad_norm": 0.01358412578701973,
"learning_rate": 0.0010681265206812652,
"loss": 1.0563,
"step": 932
},
{
"epoch": 2.041017227235439,
"grad_norm": 0.010836120694875717,
"learning_rate": 0.0010656934306569343,
"loss": 0.7046,
"step": 933
},
{
"epoch": 2.043204812687996,
"grad_norm": 0.012488581240177155,
"learning_rate": 0.0010632603406326034,
"loss": 0.8661,
"step": 934
},
{
"epoch": 2.045392398140552,
"grad_norm": 0.009522946551442146,
"learning_rate": 0.0010608272506082725,
"loss": 0.6687,
"step": 935
},
{
"epoch": 2.047579983593109,
"grad_norm": 0.03695467486977577,
"learning_rate": 0.0010583941605839416,
"loss": 0.7727,
"step": 936
},
{
"epoch": 2.049767569045666,
"grad_norm": 0.04616512730717659,
"learning_rate": 0.0010559610705596107,
"loss": 0.7193,
"step": 937
},
{
"epoch": 2.0519551544982226,
"grad_norm": 0.010503578931093216,
"learning_rate": 0.0010535279805352798,
"loss": 0.6701,
"step": 938
},
{
"epoch": 2.0541427399507795,
"grad_norm": 0.008623762056231499,
"learning_rate": 0.001051094890510949,
"loss": 0.7161,
"step": 939
},
{
"epoch": 2.0563303254033363,
"grad_norm": 0.007583661004900932,
"learning_rate": 0.001048661800486618,
"loss": 0.7402,
"step": 940
},
{
"epoch": 2.0585179108558926,
"grad_norm": 0.008966002613306046,
"learning_rate": 0.001046228710462287,
"loss": 0.7016,
"step": 941
},
{
"epoch": 2.0607054963084495,
"grad_norm": 0.0104443971067667,
"learning_rate": 0.0010437956204379564,
"loss": 0.7877,
"step": 942
},
{
"epoch": 2.0628930817610063,
"grad_norm": 0.011073727160692215,
"learning_rate": 0.0010413625304136252,
"loss": 0.8216,
"step": 943
},
{
"epoch": 2.065080667213563,
"grad_norm": 0.006104661151766777,
"learning_rate": 0.0010389294403892943,
"loss": 0.7218,
"step": 944
},
{
"epoch": 2.06726825266612,
"grad_norm": 0.006152690388262272,
"learning_rate": 0.0010364963503649636,
"loss": 0.6807,
"step": 945
},
{
"epoch": 2.0694558381186763,
"grad_norm": 0.01146136224269867,
"learning_rate": 0.0010340632603406325,
"loss": 0.8706,
"step": 946
},
{
"epoch": 2.071643423571233,
"grad_norm": 0.008924251422286034,
"learning_rate": 0.0010316301703163018,
"loss": 0.7596,
"step": 947
},
{
"epoch": 2.07383100902379,
"grad_norm": 0.01587800122797489,
"learning_rate": 0.001029197080291971,
"loss": 0.8315,
"step": 948
},
{
"epoch": 2.0760185944763467,
"grad_norm": 0.007868033833801746,
"learning_rate": 0.0010267639902676398,
"loss": 0.8498,
"step": 949
},
{
"epoch": 2.0782061799289036,
"grad_norm": 0.009655119851231575,
"learning_rate": 0.001024330900243309,
"loss": 0.909,
"step": 950
},
{
"epoch": 2.0803937653814604,
"grad_norm": 0.014302834868431091,
"learning_rate": 0.001021897810218978,
"loss": 0.8934,
"step": 951
},
{
"epoch": 2.0825813508340167,
"grad_norm": 0.008887048810720444,
"learning_rate": 0.0010194647201946473,
"loss": 0.62,
"step": 952
},
{
"epoch": 2.0847689362865736,
"grad_norm": 0.016339240595698357,
"learning_rate": 0.0010170316301703164,
"loss": 0.7503,
"step": 953
},
{
"epoch": 2.0869565217391304,
"grad_norm": 0.013042870908975601,
"learning_rate": 0.0010145985401459853,
"loss": 0.7425,
"step": 954
},
{
"epoch": 2.089144107191687,
"grad_norm": 0.009357294999063015,
"learning_rate": 0.0010121654501216546,
"loss": 0.7565,
"step": 955
},
{
"epoch": 2.091331692644244,
"grad_norm": 0.008100231178104877,
"learning_rate": 0.0010097323600973237,
"loss": 0.659,
"step": 956
},
{
"epoch": 2.093519278096801,
"grad_norm": 0.008745480328798294,
"learning_rate": 0.0010072992700729927,
"loss": 0.6722,
"step": 957
},
{
"epoch": 2.095706863549357,
"grad_norm": 0.02181909792125225,
"learning_rate": 0.0010048661800486618,
"loss": 0.7497,
"step": 958
},
{
"epoch": 2.097894449001914,
"grad_norm": 0.005593753885477781,
"learning_rate": 0.001002433090024331,
"loss": 0.6413,
"step": 959
},
{
"epoch": 2.100082034454471,
"grad_norm": 0.0110318623483181,
"learning_rate": 0.001,
"loss": 0.7437,
"step": 960
},
{
"epoch": 2.1022696199070277,
"grad_norm": 0.07487611472606659,
"learning_rate": 0.0009975669099756691,
"loss": 0.8967,
"step": 961
},
{
"epoch": 2.1044572053595845,
"grad_norm": 0.011572844348847866,
"learning_rate": 0.0009951338199513382,
"loss": 0.7016,
"step": 962
},
{
"epoch": 2.1066447908121413,
"grad_norm": 0.0219709649682045,
"learning_rate": 0.0009927007299270073,
"loss": 0.7582,
"step": 963
},
{
"epoch": 2.1088323762646977,
"grad_norm": 0.014250703155994415,
"learning_rate": 0.0009902676399026764,
"loss": 0.6485,
"step": 964
},
{
"epoch": 2.1110199617172545,
"grad_norm": 0.010836089961230755,
"learning_rate": 0.0009878345498783455,
"loss": 0.7457,
"step": 965
},
{
"epoch": 2.1132075471698113,
"grad_norm": 0.010538347065448761,
"learning_rate": 0.0009854014598540146,
"loss": 0.7283,
"step": 966
},
{
"epoch": 2.115395132622368,
"grad_norm": 0.011399851180613041,
"learning_rate": 0.0009829683698296837,
"loss": 0.6896,
"step": 967
},
{
"epoch": 2.117582718074925,
"grad_norm": 0.027435095980763435,
"learning_rate": 0.000980535279805353,
"loss": 0.9376,
"step": 968
},
{
"epoch": 2.1197703035274817,
"grad_norm": 0.00705757224932313,
"learning_rate": 0.0009781021897810219,
"loss": 0.7243,
"step": 969
},
{
"epoch": 2.121957888980038,
"grad_norm": 0.0098995016887784,
"learning_rate": 0.000975669099756691,
"loss": 0.7931,
"step": 970
},
{
"epoch": 2.124145474432595,
"grad_norm": 0.011125714518129826,
"learning_rate": 0.00097323600973236,
"loss": 0.6044,
"step": 971
},
{
"epoch": 2.1263330598851518,
"grad_norm": 0.009387229569256306,
"learning_rate": 0.0009708029197080292,
"loss": 0.7187,
"step": 972
},
{
"epoch": 2.1285206453377086,
"grad_norm": 0.01129234954714775,
"learning_rate": 0.0009683698296836983,
"loss": 0.8324,
"step": 973
},
{
"epoch": 2.1307082307902654,
"grad_norm": 0.011272157542407513,
"learning_rate": 0.0009659367396593673,
"loss": 0.7128,
"step": 974
},
{
"epoch": 2.132895816242822,
"grad_norm": 0.010409243404865265,
"learning_rate": 0.0009635036496350365,
"loss": 0.7535,
"step": 975
},
{
"epoch": 2.1350834016953786,
"grad_norm": 0.00857408158481121,
"learning_rate": 0.0009610705596107057,
"loss": 0.8129,
"step": 976
},
{
"epoch": 2.1372709871479354,
"grad_norm": 0.014548208564519882,
"learning_rate": 0.0009586374695863747,
"loss": 0.676,
"step": 977
},
{
"epoch": 2.139458572600492,
"grad_norm": 0.016449380666017532,
"learning_rate": 0.0009562043795620438,
"loss": 0.7384,
"step": 978
},
{
"epoch": 2.141646158053049,
"grad_norm": 0.007109857629984617,
"learning_rate": 0.000953771289537713,
"loss": 0.6808,
"step": 979
},
{
"epoch": 2.143833743505606,
"grad_norm": 0.009979904629290104,
"learning_rate": 0.000951338199513382,
"loss": 0.6907,
"step": 980
},
{
"epoch": 2.146021328958162,
"grad_norm": 0.008424636907875538,
"learning_rate": 0.0009489051094890511,
"loss": 0.7423,
"step": 981
},
{
"epoch": 2.148208914410719,
"grad_norm": 0.01054910384118557,
"learning_rate": 0.0009464720194647203,
"loss": 0.6611,
"step": 982
},
{
"epoch": 2.150396499863276,
"grad_norm": 0.0084614809602499,
"learning_rate": 0.0009440389294403893,
"loss": 0.7548,
"step": 983
},
{
"epoch": 2.1525840853158327,
"grad_norm": 0.008796039037406445,
"learning_rate": 0.0009416058394160585,
"loss": 0.9042,
"step": 984
},
{
"epoch": 2.1547716707683895,
"grad_norm": 0.011639994569122791,
"learning_rate": 0.0009391727493917275,
"loss": 0.6474,
"step": 985
},
{
"epoch": 2.1569592562209463,
"grad_norm": 0.011916186660528183,
"learning_rate": 0.0009367396593673965,
"loss": 0.7848,
"step": 986
},
{
"epoch": 2.1591468416735027,
"grad_norm": 0.01620625890791416,
"learning_rate": 0.0009343065693430657,
"loss": 0.7924,
"step": 987
},
{
"epoch": 2.1613344271260595,
"grad_norm": 0.008310189470648766,
"learning_rate": 0.0009318734793187349,
"loss": 0.8015,
"step": 988
},
{
"epoch": 2.1635220125786163,
"grad_norm": 0.008162159472703934,
"learning_rate": 0.0009294403892944039,
"loss": 0.8261,
"step": 989
},
{
"epoch": 2.165709598031173,
"grad_norm": 0.009289762936532497,
"learning_rate": 0.000927007299270073,
"loss": 0.8676,
"step": 990
},
{
"epoch": 2.16789718348373,
"grad_norm": 0.007392804138362408,
"learning_rate": 0.000924574209245742,
"loss": 0.6025,
"step": 991
},
{
"epoch": 2.1700847689362868,
"grad_norm": 0.008378117345273495,
"learning_rate": 0.0009221411192214112,
"loss": 0.5951,
"step": 992
},
{
"epoch": 2.172272354388843,
"grad_norm": 0.037044674158096313,
"learning_rate": 0.0009197080291970804,
"loss": 0.7454,
"step": 993
},
{
"epoch": 2.1744599398414,
"grad_norm": 0.01427681464701891,
"learning_rate": 0.0009172749391727494,
"loss": 0.5663,
"step": 994
},
{
"epoch": 2.1766475252939568,
"grad_norm": 0.010998294688761234,
"learning_rate": 0.0009148418491484185,
"loss": 0.9058,
"step": 995
},
{
"epoch": 2.1788351107465136,
"grad_norm": 0.007977189496159554,
"learning_rate": 0.0009124087591240877,
"loss": 0.664,
"step": 996
},
{
"epoch": 2.1810226961990704,
"grad_norm": 0.008938194252550602,
"learning_rate": 0.0009099756690997567,
"loss": 0.7787,
"step": 997
},
{
"epoch": 2.1832102816516272,
"grad_norm": 0.014179794117808342,
"learning_rate": 0.0009075425790754259,
"loss": 0.6453,
"step": 998
},
{
"epoch": 2.1853978671041836,
"grad_norm": 0.01838630810379982,
"learning_rate": 0.000905109489051095,
"loss": 0.7138,
"step": 999
},
{
"epoch": 2.1875854525567404,
"grad_norm": 0.027501361444592476,
"learning_rate": 0.0009026763990267639,
"loss": 0.7204,
"step": 1000
},
{
"epoch": 2.1897730380092972,
"grad_norm": 0.007381811738014221,
"learning_rate": 0.0009002433090024331,
"loss": 0.8955,
"step": 1001
},
{
"epoch": 2.191960623461854,
"grad_norm": 0.07506415992975235,
"learning_rate": 0.0008978102189781023,
"loss": 0.802,
"step": 1002
},
{
"epoch": 2.194148208914411,
"grad_norm": 0.028858385980129242,
"learning_rate": 0.0008953771289537713,
"loss": 0.7682,
"step": 1003
},
{
"epoch": 2.1963357943669672,
"grad_norm": 0.013214879669249058,
"learning_rate": 0.0008929440389294404,
"loss": 0.7162,
"step": 1004
},
{
"epoch": 2.198523379819524,
"grad_norm": 0.007629261817783117,
"learning_rate": 0.0008905109489051096,
"loss": 0.7283,
"step": 1005
},
{
"epoch": 2.200710965272081,
"grad_norm": 0.007726036943495274,
"learning_rate": 0.0008880778588807786,
"loss": 0.8558,
"step": 1006
},
{
"epoch": 2.2028985507246377,
"grad_norm": 0.008436914533376694,
"learning_rate": 0.0008856447688564477,
"loss": 0.7377,
"step": 1007
},
{
"epoch": 2.2050861361771945,
"grad_norm": 0.02465754747390747,
"learning_rate": 0.0008832116788321168,
"loss": 0.5909,
"step": 1008
},
{
"epoch": 2.2072737216297513,
"grad_norm": 0.007964403368532658,
"learning_rate": 0.0008807785888077859,
"loss": 0.9931,
"step": 1009
},
{
"epoch": 2.2094613070823077,
"grad_norm": 0.008428809233009815,
"learning_rate": 0.0008783454987834551,
"loss": 0.8308,
"step": 1010
},
{
"epoch": 2.2116488925348645,
"grad_norm": 0.005988140590488911,
"learning_rate": 0.0008759124087591241,
"loss": 0.6528,
"step": 1011
},
{
"epoch": 2.2138364779874213,
"grad_norm": 0.009502807632088661,
"learning_rate": 0.0008734793187347931,
"loss": 0.7241,
"step": 1012
},
{
"epoch": 2.216024063439978,
"grad_norm": 0.01181811187416315,
"learning_rate": 0.0008710462287104623,
"loss": 0.5897,
"step": 1013
},
{
"epoch": 2.218211648892535,
"grad_norm": 0.013522054068744183,
"learning_rate": 0.0008686131386861313,
"loss": 0.7664,
"step": 1014
},
{
"epoch": 2.220399234345092,
"grad_norm": 0.008381453342735767,
"learning_rate": 0.0008661800486618005,
"loss": 0.7758,
"step": 1015
},
{
"epoch": 2.222586819797648,
"grad_norm": 0.011634815484285355,
"learning_rate": 0.0008637469586374696,
"loss": 0.7362,
"step": 1016
},
{
"epoch": 2.224774405250205,
"grad_norm": 0.008570423349738121,
"learning_rate": 0.0008613138686131386,
"loss": 0.8869,
"step": 1017
},
{
"epoch": 2.226961990702762,
"grad_norm": 0.01613277941942215,
"learning_rate": 0.0008588807785888078,
"loss": 0.8074,
"step": 1018
},
{
"epoch": 2.2291495761553186,
"grad_norm": 0.0062742773443460464,
"learning_rate": 0.000856447688564477,
"loss": 0.7695,
"step": 1019
},
{
"epoch": 2.2313371616078754,
"grad_norm": 0.011958430521190166,
"learning_rate": 0.000854014598540146,
"loss": 0.9689,
"step": 1020
},
{
"epoch": 2.2335247470604322,
"grad_norm": 0.010232674889266491,
"learning_rate": 0.0008515815085158151,
"loss": 0.7289,
"step": 1021
},
{
"epoch": 2.2357123325129886,
"grad_norm": 0.010546423494815826,
"learning_rate": 0.0008491484184914843,
"loss": 0.7882,
"step": 1022
},
{
"epoch": 2.2378999179655454,
"grad_norm": 0.006704252678900957,
"learning_rate": 0.0008467153284671533,
"loss": 0.7245,
"step": 1023
},
{
"epoch": 2.2400875034181023,
"grad_norm": 0.00856088288128376,
"learning_rate": 0.0008442822384428225,
"loss": 0.8478,
"step": 1024
},
{
"epoch": 2.242275088870659,
"grad_norm": 0.011011838912963867,
"learning_rate": 0.0008418491484184916,
"loss": 0.8878,
"step": 1025
},
{
"epoch": 2.244462674323216,
"grad_norm": 0.008859807625412941,
"learning_rate": 0.0008394160583941605,
"loss": 1.0637,
"step": 1026
},
{
"epoch": 2.2466502597757723,
"grad_norm": 0.019353823736310005,
"learning_rate": 0.0008369829683698297,
"loss": 0.6664,
"step": 1027
},
{
"epoch": 2.248837845228329,
"grad_norm": 0.007266916800290346,
"learning_rate": 0.0008345498783454987,
"loss": 0.7924,
"step": 1028
},
{
"epoch": 2.251025430680886,
"grad_norm": 0.00936873722821474,
"learning_rate": 0.0008321167883211679,
"loss": 0.7045,
"step": 1029
},
{
"epoch": 2.2532130161334427,
"grad_norm": 0.007908246479928493,
"learning_rate": 0.000829683698296837,
"loss": 0.9256,
"step": 1030
},
{
"epoch": 2.2554006015859995,
"grad_norm": 0.024966659024357796,
"learning_rate": 0.000827250608272506,
"loss": 0.7243,
"step": 1031
},
{
"epoch": 2.2575881870385563,
"grad_norm": 0.009444604627788067,
"learning_rate": 0.0008248175182481752,
"loss": 0.7369,
"step": 1032
},
{
"epoch": 2.259775772491113,
"grad_norm": 0.009447803720831871,
"learning_rate": 0.0008223844282238443,
"loss": 0.7721,
"step": 1033
},
{
"epoch": 2.2619633579436695,
"grad_norm": 0.008546645753085613,
"learning_rate": 0.0008199513381995134,
"loss": 0.8094,
"step": 1034
},
{
"epoch": 2.2641509433962264,
"grad_norm": 0.006809299346059561,
"learning_rate": 0.0008175182481751825,
"loss": 0.7907,
"step": 1035
},
{
"epoch": 2.266338528848783,
"grad_norm": 0.013527573086321354,
"learning_rate": 0.0008150851581508517,
"loss": 0.6692,
"step": 1036
},
{
"epoch": 2.26852611430134,
"grad_norm": 0.007041016593575478,
"learning_rate": 0.0008126520681265207,
"loss": 0.7474,
"step": 1037
},
{
"epoch": 2.270713699753897,
"grad_norm": 0.006707175634801388,
"learning_rate": 0.0008102189781021898,
"loss": 0.8134,
"step": 1038
},
{
"epoch": 2.272901285206453,
"grad_norm": 0.030407702550292015,
"learning_rate": 0.000807785888077859,
"loss": 0.7734,
"step": 1039
},
{
"epoch": 2.27508887065901,
"grad_norm": 0.011364832520484924,
"learning_rate": 0.000805352798053528,
"loss": 0.6188,
"step": 1040
},
{
"epoch": 2.277276456111567,
"grad_norm": 0.009676680900156498,
"learning_rate": 0.0008029197080291971,
"loss": 0.8262,
"step": 1041
},
{
"epoch": 2.2794640415641236,
"grad_norm": 0.012146366760134697,
"learning_rate": 0.0008004866180048662,
"loss": 0.7543,
"step": 1042
},
{
"epoch": 2.2816516270166805,
"grad_norm": 0.021344035863876343,
"learning_rate": 0.0007980535279805352,
"loss": 0.8434,
"step": 1043
},
{
"epoch": 2.2838392124692373,
"grad_norm": 0.019379200413823128,
"learning_rate": 0.0007956204379562044,
"loss": 0.6678,
"step": 1044
},
{
"epoch": 2.2860267979217936,
"grad_norm": 0.012972463853657246,
"learning_rate": 0.0007931873479318734,
"loss": 0.7363,
"step": 1045
},
{
"epoch": 2.2882143833743505,
"grad_norm": 0.005540755111724138,
"learning_rate": 0.0007907542579075426,
"loss": 0.7702,
"step": 1046
},
{
"epoch": 2.2904019688269073,
"grad_norm": 0.01054232195019722,
"learning_rate": 0.0007883211678832117,
"loss": 0.8086,
"step": 1047
},
{
"epoch": 2.292589554279464,
"grad_norm": 0.006333992816507816,
"learning_rate": 0.0007858880778588807,
"loss": 0.8547,
"step": 1048
},
{
"epoch": 2.294777139732021,
"grad_norm": 0.007503498811274767,
"learning_rate": 0.0007834549878345499,
"loss": 0.9384,
"step": 1049
},
{
"epoch": 2.2969647251845773,
"grad_norm": 0.009519786573946476,
"learning_rate": 0.0007810218978102191,
"loss": 0.7457,
"step": 1050
},
{
"epoch": 2.299152310637134,
"grad_norm": 0.009697610512375832,
"learning_rate": 0.0007785888077858881,
"loss": 0.6572,
"step": 1051
},
{
"epoch": 2.301339896089691,
"grad_norm": 0.01142230723053217,
"learning_rate": 0.0007761557177615572,
"loss": 0.7003,
"step": 1052
},
{
"epoch": 2.3035274815422477,
"grad_norm": 0.014880196191370487,
"learning_rate": 0.0007737226277372264,
"loss": 0.9522,
"step": 1053
},
{
"epoch": 2.3057150669948046,
"grad_norm": 0.03530775010585785,
"learning_rate": 0.0007712895377128953,
"loss": 0.8303,
"step": 1054
},
{
"epoch": 2.3079026524473614,
"grad_norm": 0.008375970646739006,
"learning_rate": 0.0007688564476885646,
"loss": 0.9399,
"step": 1055
},
{
"epoch": 2.310090237899918,
"grad_norm": 0.011312820017337799,
"learning_rate": 0.0007664233576642336,
"loss": 0.6918,
"step": 1056
},
{
"epoch": 2.3122778233524746,
"grad_norm": 0.00965717900544405,
"learning_rate": 0.0007639902676399026,
"loss": 0.6898,
"step": 1057
},
{
"epoch": 2.3144654088050314,
"grad_norm": 0.046056658029556274,
"learning_rate": 0.0007615571776155718,
"loss": 0.7655,
"step": 1058
},
{
"epoch": 2.316652994257588,
"grad_norm": 0.006473752204328775,
"learning_rate": 0.0007591240875912409,
"loss": 0.7825,
"step": 1059
},
{
"epoch": 2.318840579710145,
"grad_norm": 0.012731518596410751,
"learning_rate": 0.0007566909975669099,
"loss": 0.7138,
"step": 1060
},
{
"epoch": 2.321028165162702,
"grad_norm": 0.01815684884786606,
"learning_rate": 0.0007542579075425791,
"loss": 0.7992,
"step": 1061
},
{
"epoch": 2.323215750615258,
"grad_norm": 0.012457008473575115,
"learning_rate": 0.0007518248175182483,
"loss": 0.7565,
"step": 1062
},
{
"epoch": 2.325403336067815,
"grad_norm": 0.011130121536552906,
"learning_rate": 0.0007493917274939173,
"loss": 0.6585,
"step": 1063
},
{
"epoch": 2.327590921520372,
"grad_norm": 0.009390764869749546,
"learning_rate": 0.0007469586374695864,
"loss": 0.5921,
"step": 1064
},
{
"epoch": 2.3297785069729287,
"grad_norm": 0.006265114061534405,
"learning_rate": 0.0007445255474452555,
"loss": 0.862,
"step": 1065
},
{
"epoch": 2.3319660924254855,
"grad_norm": 0.014493511989712715,
"learning_rate": 0.0007420924574209246,
"loss": 0.6529,
"step": 1066
},
{
"epoch": 2.3341536778780423,
"grad_norm": 0.01009755115956068,
"learning_rate": 0.0007396593673965938,
"loss": 1.0077,
"step": 1067
},
{
"epoch": 2.3363412633305987,
"grad_norm": 0.022166702896356583,
"learning_rate": 0.0007372262773722629,
"loss": 0.9121,
"step": 1068
},
{
"epoch": 2.3385288487831555,
"grad_norm": 0.028010999783873558,
"learning_rate": 0.0007347931873479318,
"loss": 0.6663,
"step": 1069
},
{
"epoch": 2.3407164342357123,
"grad_norm": 0.012431381270289421,
"learning_rate": 0.0007323600973236009,
"loss": 0.7579,
"step": 1070
},
{
"epoch": 2.342904019688269,
"grad_norm": 0.0932813212275505,
"learning_rate": 0.0007299270072992701,
"loss": 0.5542,
"step": 1071
},
{
"epoch": 2.345091605140826,
"grad_norm": 0.011022589169442654,
"learning_rate": 0.0007274939172749392,
"loss": 0.7093,
"step": 1072
},
{
"epoch": 2.3472791905933827,
"grad_norm": 0.008994583040475845,
"learning_rate": 0.0007250608272506082,
"loss": 0.7466,
"step": 1073
},
{
"epoch": 2.349466776045939,
"grad_norm": 0.01782486028969288,
"learning_rate": 0.0007226277372262774,
"loss": 0.6847,
"step": 1074
},
{
"epoch": 2.351654361498496,
"grad_norm": 0.011398195289075375,
"learning_rate": 0.0007201946472019465,
"loss": 0.687,
"step": 1075
},
{
"epoch": 2.3538419469510528,
"grad_norm": 0.023858705535531044,
"learning_rate": 0.0007177615571776156,
"loss": 0.6984,
"step": 1076
},
{
"epoch": 2.3560295324036096,
"grad_norm": 0.008185802958905697,
"learning_rate": 0.0007153284671532847,
"loss": 0.8747,
"step": 1077
},
{
"epoch": 2.3582171178561664,
"grad_norm": 0.018106609582901,
"learning_rate": 0.0007128953771289538,
"loss": 0.6591,
"step": 1078
},
{
"epoch": 2.360404703308723,
"grad_norm": 0.013991002924740314,
"learning_rate": 0.0007104622871046229,
"loss": 0.818,
"step": 1079
},
{
"epoch": 2.3625922887612796,
"grad_norm": 0.007820016704499722,
"learning_rate": 0.000708029197080292,
"loss": 0.9661,
"step": 1080
},
{
"epoch": 2.3647798742138364,
"grad_norm": 0.020563364028930664,
"learning_rate": 0.0007055961070559611,
"loss": 0.896,
"step": 1081
},
{
"epoch": 2.366967459666393,
"grad_norm": 0.01632773131132126,
"learning_rate": 0.0007031630170316302,
"loss": 0.8516,
"step": 1082
},
{
"epoch": 2.36915504511895,
"grad_norm": 0.012202097102999687,
"learning_rate": 0.0007007299270072992,
"loss": 0.921,
"step": 1083
},
{
"epoch": 2.371342630571507,
"grad_norm": 0.009598075412213802,
"learning_rate": 0.0006982968369829684,
"loss": 0.677,
"step": 1084
},
{
"epoch": 2.373530216024063,
"grad_norm": 0.010769539512693882,
"learning_rate": 0.0006958637469586375,
"loss": 0.7964,
"step": 1085
},
{
"epoch": 2.37571780147662,
"grad_norm": 0.011242173612117767,
"learning_rate": 0.0006934306569343065,
"loss": 0.6444,
"step": 1086
},
{
"epoch": 2.377905386929177,
"grad_norm": 0.009250817820429802,
"learning_rate": 0.0006909975669099756,
"loss": 0.7456,
"step": 1087
},
{
"epoch": 2.3800929723817337,
"grad_norm": 0.008871940895915031,
"learning_rate": 0.0006885644768856448,
"loss": 0.7497,
"step": 1088
},
{
"epoch": 2.3822805578342905,
"grad_norm": 0.014774895273149014,
"learning_rate": 0.0006861313868613139,
"loss": 0.8508,
"step": 1089
},
{
"epoch": 2.3844681432868473,
"grad_norm": 0.008470469154417515,
"learning_rate": 0.000683698296836983,
"loss": 0.6278,
"step": 1090
},
{
"epoch": 2.386655728739404,
"grad_norm": 0.02862645871937275,
"learning_rate": 0.0006812652068126521,
"loss": 0.7235,
"step": 1091
},
{
"epoch": 2.3888433141919605,
"grad_norm": 0.010565055534243584,
"learning_rate": 0.0006788321167883212,
"loss": 0.7064,
"step": 1092
},
{
"epoch": 2.3910308996445173,
"grad_norm": 0.00996407214552164,
"learning_rate": 0.0006763990267639903,
"loss": 0.747,
"step": 1093
},
{
"epoch": 2.393218485097074,
"grad_norm": 0.008201108314096928,
"learning_rate": 0.0006739659367396594,
"loss": 0.8917,
"step": 1094
},
{
"epoch": 2.395406070549631,
"grad_norm": 0.007856379263103008,
"learning_rate": 0.0006715328467153285,
"loss": 0.8106,
"step": 1095
},
{
"epoch": 2.3975936560021878,
"grad_norm": 0.01899876445531845,
"learning_rate": 0.0006690997566909976,
"loss": 0.9151,
"step": 1096
},
{
"epoch": 2.399781241454744,
"grad_norm": 0.0086012938991189,
"learning_rate": 0.0006666666666666666,
"loss": 0.872,
"step": 1097
},
{
"epoch": 2.401968826907301,
"grad_norm": 0.007030507083982229,
"learning_rate": 0.0006642335766423358,
"loss": 0.6529,
"step": 1098
},
{
"epoch": 2.4041564123598578,
"grad_norm": 0.01876233145594597,
"learning_rate": 0.0006618004866180048,
"loss": 0.8421,
"step": 1099
},
{
"epoch": 2.4063439978124146,
"grad_norm": 0.033474959433078766,
"learning_rate": 0.0006593673965936739,
"loss": 0.6956,
"step": 1100
},
{
"epoch": 2.4085315832649714,
"grad_norm": 0.018535858020186424,
"learning_rate": 0.0006569343065693431,
"loss": 0.7232,
"step": 1101
},
{
"epoch": 2.4107191687175282,
"grad_norm": 0.010383503511548042,
"learning_rate": 0.0006545012165450122,
"loss": 0.5804,
"step": 1102
},
{
"epoch": 2.4129067541700846,
"grad_norm": 0.0077387490309774876,
"learning_rate": 0.0006520681265206813,
"loss": 0.828,
"step": 1103
},
{
"epoch": 2.4150943396226414,
"grad_norm": 0.011656009592115879,
"learning_rate": 0.0006496350364963504,
"loss": 0.9106,
"step": 1104
},
{
"epoch": 2.4172819250751982,
"grad_norm": 0.005996339488774538,
"learning_rate": 0.0006472019464720195,
"loss": 0.6921,
"step": 1105
},
{
"epoch": 2.419469510527755,
"grad_norm": 0.022230584174394608,
"learning_rate": 0.0006447688564476886,
"loss": 0.9711,
"step": 1106
},
{
"epoch": 2.421657095980312,
"grad_norm": 0.031066155061125755,
"learning_rate": 0.0006423357664233577,
"loss": 0.8718,
"step": 1107
},
{
"epoch": 2.4238446814328682,
"grad_norm": 0.011762702837586403,
"learning_rate": 0.0006399026763990268,
"loss": 0.818,
"step": 1108
},
{
"epoch": 2.426032266885425,
"grad_norm": 0.009383924305438995,
"learning_rate": 0.0006374695863746959,
"loss": 0.5913,
"step": 1109
},
{
"epoch": 2.428219852337982,
"grad_norm": 0.012824693694710732,
"learning_rate": 0.000635036496350365,
"loss": 0.7115,
"step": 1110
},
{
"epoch": 2.4304074377905387,
"grad_norm": 0.007453750818967819,
"learning_rate": 0.0006326034063260342,
"loss": 0.7374,
"step": 1111
},
{
"epoch": 2.4325950232430955,
"grad_norm": 0.007933787070214748,
"learning_rate": 0.0006301703163017031,
"loss": 0.7921,
"step": 1112
},
{
"epoch": 2.4347826086956523,
"grad_norm": 0.01717616245150566,
"learning_rate": 0.0006277372262773722,
"loss": 0.9326,
"step": 1113
},
{
"epoch": 2.436970194148209,
"grad_norm": 0.009397076442837715,
"learning_rate": 0.0006253041362530414,
"loss": 0.6388,
"step": 1114
},
{
"epoch": 2.4391577796007655,
"grad_norm": 0.008330175653100014,
"learning_rate": 0.0006228710462287105,
"loss": 0.5517,
"step": 1115
},
{
"epoch": 2.4413453650533223,
"grad_norm": 0.013194689527153969,
"learning_rate": 0.0006204379562043796,
"loss": 0.8779,
"step": 1116
},
{
"epoch": 2.443532950505879,
"grad_norm": 0.012824257835745811,
"learning_rate": 0.0006180048661800486,
"loss": 0.7731,
"step": 1117
},
{
"epoch": 2.445720535958436,
"grad_norm": 0.011488651856780052,
"learning_rate": 0.0006155717761557178,
"loss": 0.7806,
"step": 1118
},
{
"epoch": 2.447908121410993,
"grad_norm": 0.006684242747724056,
"learning_rate": 0.0006131386861313869,
"loss": 1.0212,
"step": 1119
},
{
"epoch": 2.450095706863549,
"grad_norm": 0.010995331220328808,
"learning_rate": 0.000610705596107056,
"loss": 0.8499,
"step": 1120
},
{
"epoch": 2.452283292316106,
"grad_norm": 0.016977710649371147,
"learning_rate": 0.0006082725060827251,
"loss": 0.7029,
"step": 1121
},
{
"epoch": 2.454470877768663,
"grad_norm": 0.008742439560592175,
"learning_rate": 0.0006058394160583942,
"loss": 0.6834,
"step": 1122
},
{
"epoch": 2.4566584632212196,
"grad_norm": 0.006410808768123388,
"learning_rate": 0.0006034063260340633,
"loss": 0.8371,
"step": 1123
},
{
"epoch": 2.4588460486737764,
"grad_norm": 0.008776198141276836,
"learning_rate": 0.0006009732360097324,
"loss": 0.7001,
"step": 1124
},
{
"epoch": 2.4610336341263332,
"grad_norm": 0.007712388876825571,
"learning_rate": 0.0005985401459854014,
"loss": 0.5664,
"step": 1125
},
{
"epoch": 2.4632212195788896,
"grad_norm": 0.011250052601099014,
"learning_rate": 0.0005961070559610705,
"loss": 0.8572,
"step": 1126
},
{
"epoch": 2.4654088050314464,
"grad_norm": 0.010831180959939957,
"learning_rate": 0.0005936739659367396,
"loss": 0.6984,
"step": 1127
},
{
"epoch": 2.4675963904840033,
"grad_norm": 0.025114471092820168,
"learning_rate": 0.0005912408759124088,
"loss": 0.7401,
"step": 1128
},
{
"epoch": 2.46978397593656,
"grad_norm": 0.006640868727117777,
"learning_rate": 0.0005888077858880779,
"loss": 0.5887,
"step": 1129
},
{
"epoch": 2.471971561389117,
"grad_norm": 0.0060841697268188,
"learning_rate": 0.0005863746958637469,
"loss": 0.7121,
"step": 1130
},
{
"epoch": 2.4741591468416733,
"grad_norm": 0.012216274626553059,
"learning_rate": 0.0005839416058394161,
"loss": 0.8174,
"step": 1131
},
{
"epoch": 2.47634673229423,
"grad_norm": 0.009857951663434505,
"learning_rate": 0.0005815085158150852,
"loss": 0.7229,
"step": 1132
},
{
"epoch": 2.478534317746787,
"grad_norm": 0.010938407853245735,
"learning_rate": 0.0005790754257907543,
"loss": 0.5738,
"step": 1133
},
{
"epoch": 2.4807219031993437,
"grad_norm": 0.026813512668013573,
"learning_rate": 0.0005766423357664234,
"loss": 0.8543,
"step": 1134
},
{
"epoch": 2.4829094886519005,
"grad_norm": 0.01071678102016449,
"learning_rate": 0.0005742092457420925,
"loss": 0.9774,
"step": 1135
},
{
"epoch": 2.4850970741044573,
"grad_norm": 0.009592295624315739,
"learning_rate": 0.0005717761557177616,
"loss": 0.9619,
"step": 1136
},
{
"epoch": 2.487284659557014,
"grad_norm": 0.005114677362143993,
"learning_rate": 0.0005693430656934307,
"loss": 0.8033,
"step": 1137
},
{
"epoch": 2.4894722450095705,
"grad_norm": 0.012539639137685299,
"learning_rate": 0.0005669099756690998,
"loss": 0.8993,
"step": 1138
},
{
"epoch": 2.4916598304621274,
"grad_norm": 0.026053965091705322,
"learning_rate": 0.0005644768856447688,
"loss": 0.6817,
"step": 1139
},
{
"epoch": 2.493847415914684,
"grad_norm": 0.007609077729284763,
"learning_rate": 0.0005620437956204379,
"loss": 0.8549,
"step": 1140
},
{
"epoch": 2.496035001367241,
"grad_norm": 0.010698397643864155,
"learning_rate": 0.0005596107055961071,
"loss": 0.7068,
"step": 1141
},
{
"epoch": 2.498222586819798,
"grad_norm": 0.008611828088760376,
"learning_rate": 0.0005571776155717762,
"loss": 0.7465,
"step": 1142
},
{
"epoch": 2.500410172272354,
"grad_norm": 0.01089494489133358,
"learning_rate": 0.0005547445255474452,
"loss": 0.6224,
"step": 1143
},
{
"epoch": 2.502597757724911,
"grad_norm": 0.024782098829746246,
"learning_rate": 0.0005523114355231143,
"loss": 0.8328,
"step": 1144
},
{
"epoch": 2.504785343177468,
"grad_norm": 0.006382483057677746,
"learning_rate": 0.0005498783454987835,
"loss": 0.7787,
"step": 1145
},
{
"epoch": 2.5069729286300246,
"grad_norm": 0.016949672251939774,
"learning_rate": 0.0005474452554744526,
"loss": 0.7046,
"step": 1146
},
{
"epoch": 2.5091605140825815,
"grad_norm": 0.027401480823755264,
"learning_rate": 0.0005450121654501216,
"loss": 0.6702,
"step": 1147
},
{
"epoch": 2.5113480995351383,
"grad_norm": 0.01999586448073387,
"learning_rate": 0.0005425790754257908,
"loss": 0.8054,
"step": 1148
},
{
"epoch": 2.513535684987695,
"grad_norm": 0.010145720094442368,
"learning_rate": 0.0005401459854014599,
"loss": 0.6592,
"step": 1149
},
{
"epoch": 2.5157232704402515,
"grad_norm": 0.018535887822508812,
"learning_rate": 0.000537712895377129,
"loss": 0.7254,
"step": 1150
},
{
"epoch": 2.5179108558928083,
"grad_norm": 0.009648307226598263,
"learning_rate": 0.0005352798053527981,
"loss": 0.6838,
"step": 1151
},
{
"epoch": 2.520098441345365,
"grad_norm": 0.016310011968016624,
"learning_rate": 0.0005328467153284672,
"loss": 0.8777,
"step": 1152
},
{
"epoch": 2.522286026797922,
"grad_norm": 0.010320610366761684,
"learning_rate": 0.0005304136253041362,
"loss": 0.7651,
"step": 1153
},
{
"epoch": 2.5244736122504783,
"grad_norm": 0.012834092602133751,
"learning_rate": 0.0005279805352798053,
"loss": 0.7847,
"step": 1154
},
{
"epoch": 2.526661197703035,
"grad_norm": 0.011668582446873188,
"learning_rate": 0.0005255474452554745,
"loss": 0.7225,
"step": 1155
},
{
"epoch": 2.528848783155592,
"grad_norm": 0.009817942976951599,
"learning_rate": 0.0005231143552311435,
"loss": 0.6983,
"step": 1156
},
{
"epoch": 2.5310363686081487,
"grad_norm": 0.009282633662223816,
"learning_rate": 0.0005206812652068126,
"loss": 0.7688,
"step": 1157
},
{
"epoch": 2.5332239540607056,
"grad_norm": 0.007419208530336618,
"learning_rate": 0.0005182481751824818,
"loss": 0.728,
"step": 1158
},
{
"epoch": 2.5354115395132624,
"grad_norm": 0.029275061562657356,
"learning_rate": 0.0005158150851581509,
"loss": 0.8293,
"step": 1159
},
{
"epoch": 2.537599124965819,
"grad_norm": 0.01723194308578968,
"learning_rate": 0.0005133819951338199,
"loss": 0.6128,
"step": 1160
},
{
"epoch": 2.5397867104183756,
"grad_norm": 0.009285934269428253,
"learning_rate": 0.000510948905109489,
"loss": 0.6788,
"step": 1161
},
{
"epoch": 2.5419742958709324,
"grad_norm": 0.008555158041417599,
"learning_rate": 0.0005085158150851582,
"loss": 0.6507,
"step": 1162
},
{
"epoch": 2.544161881323489,
"grad_norm": 0.0168358962982893,
"learning_rate": 0.0005060827250608273,
"loss": 0.942,
"step": 1163
},
{
"epoch": 2.546349466776046,
"grad_norm": 0.0068771797232329845,
"learning_rate": 0.0005036496350364964,
"loss": 0.7844,
"step": 1164
},
{
"epoch": 2.548537052228603,
"grad_norm": 0.04532065615057945,
"learning_rate": 0.0005012165450121655,
"loss": 0.8095,
"step": 1165
},
{
"epoch": 2.550724637681159,
"grad_norm": 0.00933657493442297,
"learning_rate": 0.0004987834549878346,
"loss": 0.8072,
"step": 1166
},
{
"epoch": 2.552912223133716,
"grad_norm": 0.009804673492908478,
"learning_rate": 0.0004963503649635036,
"loss": 0.8715,
"step": 1167
},
{
"epoch": 2.555099808586273,
"grad_norm": 0.010783910751342773,
"learning_rate": 0.0004939172749391727,
"loss": 0.7891,
"step": 1168
},
{
"epoch": 2.5572873940388297,
"grad_norm": 0.011784784495830536,
"learning_rate": 0.0004914841849148418,
"loss": 0.7262,
"step": 1169
},
{
"epoch": 2.5594749794913865,
"grad_norm": 0.007322199642658234,
"learning_rate": 0.0004890510948905109,
"loss": 0.7809,
"step": 1170
},
{
"epoch": 2.5616625649439433,
"grad_norm": 0.011777276173233986,
"learning_rate": 0.00048661800486618,
"loss": 0.7791,
"step": 1171
},
{
"epoch": 2.5638501503965,
"grad_norm": 0.015589660964906216,
"learning_rate": 0.00048418491484184916,
"loss": 0.921,
"step": 1172
},
{
"epoch": 2.5660377358490565,
"grad_norm": 0.010277018882334232,
"learning_rate": 0.00048175182481751826,
"loss": 0.9368,
"step": 1173
},
{
"epoch": 2.5682253213016133,
"grad_norm": 0.02483278699219227,
"learning_rate": 0.00047931873479318735,
"loss": 0.7714,
"step": 1174
},
{
"epoch": 2.57041290675417,
"grad_norm": 0.013863074593245983,
"learning_rate": 0.0004768856447688565,
"loss": 0.6637,
"step": 1175
},
{
"epoch": 2.572600492206727,
"grad_norm": 0.015338894911110401,
"learning_rate": 0.00047445255474452553,
"loss": 0.7678,
"step": 1176
},
{
"epoch": 2.5747880776592833,
"grad_norm": 0.007364062592387199,
"learning_rate": 0.0004720194647201946,
"loss": 0.995,
"step": 1177
},
{
"epoch": 2.57697566311184,
"grad_norm": 0.1765730232000351,
"learning_rate": 0.00046958637469586377,
"loss": 0.7865,
"step": 1178
},
{
"epoch": 2.579163248564397,
"grad_norm": 0.010664415545761585,
"learning_rate": 0.00046715328467153287,
"loss": 0.5741,
"step": 1179
},
{
"epoch": 2.5813508340169538,
"grad_norm": 0.012521582655608654,
"learning_rate": 0.00046472019464720196,
"loss": 0.6621,
"step": 1180
},
{
"epoch": 2.5835384194695106,
"grad_norm": 0.03732423484325409,
"learning_rate": 0.000462287104622871,
"loss": 0.7453,
"step": 1181
},
{
"epoch": 2.5857260049220674,
"grad_norm": 0.013986853882670403,
"learning_rate": 0.0004598540145985402,
"loss": 0.7057,
"step": 1182
},
{
"epoch": 2.587913590374624,
"grad_norm": 0.013078927993774414,
"learning_rate": 0.00045742092457420923,
"loss": 0.7167,
"step": 1183
},
{
"epoch": 2.590101175827181,
"grad_norm": 0.006835412234067917,
"learning_rate": 0.0004549878345498783,
"loss": 0.8064,
"step": 1184
},
{
"epoch": 2.5922887612797374,
"grad_norm": 0.020057901740074158,
"learning_rate": 0.0004525547445255475,
"loss": 0.7096,
"step": 1185
},
{
"epoch": 2.594476346732294,
"grad_norm": 0.026187503710389137,
"learning_rate": 0.00045012165450121657,
"loss": 0.9496,
"step": 1186
},
{
"epoch": 2.596663932184851,
"grad_norm": 0.012171875685453415,
"learning_rate": 0.00044768856447688566,
"loss": 0.7529,
"step": 1187
},
{
"epoch": 2.598851517637408,
"grad_norm": 0.012145042419433594,
"learning_rate": 0.0004452554744525548,
"loss": 0.8654,
"step": 1188
},
{
"epoch": 2.601039103089964,
"grad_norm": 0.013504109345376492,
"learning_rate": 0.00044282238442822384,
"loss": 0.6347,
"step": 1189
},
{
"epoch": 2.603226688542521,
"grad_norm": 0.01362569723278284,
"learning_rate": 0.00044038929440389293,
"loss": 0.661,
"step": 1190
},
{
"epoch": 2.605414273995078,
"grad_norm": 0.013327688910067081,
"learning_rate": 0.00043795620437956203,
"loss": 0.6851,
"step": 1191
},
{
"epoch": 2.6076018594476347,
"grad_norm": 0.008194427005946636,
"learning_rate": 0.0004355231143552312,
"loss": 0.8226,
"step": 1192
},
{
"epoch": 2.6097894449001915,
"grad_norm": 0.017937535420060158,
"learning_rate": 0.00043309002433090027,
"loss": 0.7033,
"step": 1193
},
{
"epoch": 2.6119770303527483,
"grad_norm": 0.005625641439110041,
"learning_rate": 0.0004306569343065693,
"loss": 0.7106,
"step": 1194
},
{
"epoch": 2.614164615805305,
"grad_norm": 0.01812170445919037,
"learning_rate": 0.0004282238442822385,
"loss": 0.7344,
"step": 1195
},
{
"epoch": 2.6163522012578615,
"grad_norm": 0.007461361587047577,
"learning_rate": 0.00042579075425790754,
"loss": 0.835,
"step": 1196
},
{
"epoch": 2.6185397867104183,
"grad_norm": 0.014407969079911709,
"learning_rate": 0.00042335766423357664,
"loss": 0.7829,
"step": 1197
},
{
"epoch": 2.620727372162975,
"grad_norm": 0.008925898931920528,
"learning_rate": 0.0004209245742092458,
"loss": 0.6425,
"step": 1198
},
{
"epoch": 2.622914957615532,
"grad_norm": 0.010357217863202095,
"learning_rate": 0.0004184914841849149,
"loss": 0.894,
"step": 1199
},
{
"epoch": 2.6251025430680883,
"grad_norm": 0.01632748544216156,
"learning_rate": 0.00041605839416058397,
"loss": 0.6886,
"step": 1200
},
{
"epoch": 2.627290128520645,
"grad_norm": 0.021274514496326447,
"learning_rate": 0.000413625304136253,
"loss": 0.7503,
"step": 1201
},
{
"epoch": 2.629477713973202,
"grad_norm": 0.021467119455337524,
"learning_rate": 0.00041119221411192215,
"loss": 0.9202,
"step": 1202
},
{
"epoch": 2.6316652994257588,
"grad_norm": 0.011900427751243114,
"learning_rate": 0.00040875912408759124,
"loss": 0.7084,
"step": 1203
},
{
"epoch": 2.6338528848783156,
"grad_norm": 0.010819557122886181,
"learning_rate": 0.00040632603406326034,
"loss": 1.0455,
"step": 1204
},
{
"epoch": 2.6360404703308724,
"grad_norm": 0.012575685046613216,
"learning_rate": 0.0004038929440389295,
"loss": 0.6894,
"step": 1205
},
{
"epoch": 2.6382280557834292,
"grad_norm": 0.011274064891040325,
"learning_rate": 0.0004014598540145986,
"loss": 0.8449,
"step": 1206
},
{
"epoch": 2.640415641235986,
"grad_norm": 0.013194631785154343,
"learning_rate": 0.0003990267639902676,
"loss": 0.8192,
"step": 1207
},
{
"epoch": 2.6426032266885424,
"grad_norm": 0.009542672894895077,
"learning_rate": 0.0003965936739659367,
"loss": 0.8768,
"step": 1208
},
{
"epoch": 2.6447908121410992,
"grad_norm": 0.016639290377497673,
"learning_rate": 0.00039416058394160585,
"loss": 0.7371,
"step": 1209
},
{
"epoch": 2.646978397593656,
"grad_norm": 0.02203970216214657,
"learning_rate": 0.00039172749391727494,
"loss": 0.6598,
"step": 1210
},
{
"epoch": 2.649165983046213,
"grad_norm": 0.027763044461607933,
"learning_rate": 0.00038929440389294404,
"loss": 0.6819,
"step": 1211
},
{
"epoch": 2.6513535684987692,
"grad_norm": 0.01537309866398573,
"learning_rate": 0.0003868613138686132,
"loss": 0.8249,
"step": 1212
},
{
"epoch": 2.653541153951326,
"grad_norm": 0.01565646007657051,
"learning_rate": 0.0003844282238442823,
"loss": 0.569,
"step": 1213
},
{
"epoch": 2.655728739403883,
"grad_norm": 0.01048749778419733,
"learning_rate": 0.0003819951338199513,
"loss": 0.6359,
"step": 1214
},
{
"epoch": 2.6579163248564397,
"grad_norm": 0.061209116131067276,
"learning_rate": 0.00037956204379562046,
"loss": 0.7011,
"step": 1215
},
{
"epoch": 2.6601039103089965,
"grad_norm": 0.016036316752433777,
"learning_rate": 0.00037712895377128955,
"loss": 0.5889,
"step": 1216
},
{
"epoch": 2.6622914957615533,
"grad_norm": 0.014299210160970688,
"learning_rate": 0.00037469586374695864,
"loss": 0.7685,
"step": 1217
},
{
"epoch": 2.66447908121411,
"grad_norm": 0.010716800577938557,
"learning_rate": 0.00037226277372262774,
"loss": 0.7795,
"step": 1218
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.007198740262538195,
"learning_rate": 0.0003698296836982969,
"loss": 0.8868,
"step": 1219
},
{
"epoch": 2.6688542521192233,
"grad_norm": 0.018458040431141853,
"learning_rate": 0.0003673965936739659,
"loss": 0.6935,
"step": 1220
},
{
"epoch": 2.67104183757178,
"grad_norm": 0.011869457550346851,
"learning_rate": 0.00036496350364963507,
"loss": 0.7638,
"step": 1221
},
{
"epoch": 2.673229423024337,
"grad_norm": 0.00896628387272358,
"learning_rate": 0.0003625304136253041,
"loss": 0.7615,
"step": 1222
},
{
"epoch": 2.675417008476894,
"grad_norm": 0.008536278270184994,
"learning_rate": 0.00036009732360097325,
"loss": 0.6647,
"step": 1223
},
{
"epoch": 2.67760459392945,
"grad_norm": 0.02423817664384842,
"learning_rate": 0.00035766423357664234,
"loss": 0.6876,
"step": 1224
},
{
"epoch": 2.679792179382007,
"grad_norm": 0.011117582209408283,
"learning_rate": 0.00035523114355231144,
"loss": 0.665,
"step": 1225
},
{
"epoch": 2.681979764834564,
"grad_norm": 0.009505179710686207,
"learning_rate": 0.00035279805352798053,
"loss": 0.6284,
"step": 1226
},
{
"epoch": 2.6841673502871206,
"grad_norm": 0.0063440497033298016,
"learning_rate": 0.0003503649635036496,
"loss": 0.8279,
"step": 1227
},
{
"epoch": 2.6863549357396774,
"grad_norm": 0.0201023630797863,
"learning_rate": 0.00034793187347931877,
"loss": 0.8996,
"step": 1228
},
{
"epoch": 2.6885425211922342,
"grad_norm": 0.006452304311096668,
"learning_rate": 0.0003454987834549878,
"loss": 0.8563,
"step": 1229
},
{
"epoch": 2.690730106644791,
"grad_norm": 0.00840191449970007,
"learning_rate": 0.00034306569343065695,
"loss": 0.6543,
"step": 1230
},
{
"epoch": 2.6929176920973474,
"grad_norm": 0.011340702883899212,
"learning_rate": 0.00034063260340632605,
"loss": 0.733,
"step": 1231
},
{
"epoch": 2.6951052775499043,
"grad_norm": 0.01761777698993683,
"learning_rate": 0.00033819951338199514,
"loss": 0.9136,
"step": 1232
},
{
"epoch": 2.697292863002461,
"grad_norm": 0.012587963603436947,
"learning_rate": 0.00033576642335766423,
"loss": 0.801,
"step": 1233
},
{
"epoch": 2.699480448455018,
"grad_norm": 0.006971995811909437,
"learning_rate": 0.0003333333333333333,
"loss": 0.8079,
"step": 1234
},
{
"epoch": 2.7016680339075743,
"grad_norm": 0.00921553373336792,
"learning_rate": 0.0003309002433090024,
"loss": 0.6801,
"step": 1235
},
{
"epoch": 2.703855619360131,
"grad_norm": 0.012788954190909863,
"learning_rate": 0.00032846715328467156,
"loss": 0.8119,
"step": 1236
},
{
"epoch": 2.706043204812688,
"grad_norm": 0.01745203509926796,
"learning_rate": 0.00032603406326034065,
"loss": 0.808,
"step": 1237
},
{
"epoch": 2.7082307902652447,
"grad_norm": 0.010819566436111927,
"learning_rate": 0.00032360097323600975,
"loss": 0.6882,
"step": 1238
},
{
"epoch": 2.7104183757178015,
"grad_norm": 0.013807238079607487,
"learning_rate": 0.00032116788321167884,
"loss": 0.5872,
"step": 1239
},
{
"epoch": 2.7126059611703583,
"grad_norm": 0.015879668295383453,
"learning_rate": 0.00031873479318734793,
"loss": 0.7541,
"step": 1240
},
{
"epoch": 2.714793546622915,
"grad_norm": 0.008229264058172703,
"learning_rate": 0.0003163017031630171,
"loss": 0.8002,
"step": 1241
},
{
"epoch": 2.7169811320754715,
"grad_norm": 0.011732214130461216,
"learning_rate": 0.0003138686131386861,
"loss": 0.7049,
"step": 1242
},
{
"epoch": 2.7191687175280284,
"grad_norm": 0.008688759990036488,
"learning_rate": 0.00031143552311435526,
"loss": 0.9007,
"step": 1243
},
{
"epoch": 2.721356302980585,
"grad_norm": 0.014027293771505356,
"learning_rate": 0.0003090024330900243,
"loss": 0.6098,
"step": 1244
},
{
"epoch": 2.723543888433142,
"grad_norm": 0.00831068679690361,
"learning_rate": 0.00030656934306569345,
"loss": 0.7435,
"step": 1245
},
{
"epoch": 2.725731473885699,
"grad_norm": 0.017324576154351234,
"learning_rate": 0.00030413625304136254,
"loss": 0.7317,
"step": 1246
},
{
"epoch": 2.727919059338255,
"grad_norm": 0.01490398496389389,
"learning_rate": 0.00030170316301703163,
"loss": 0.7434,
"step": 1247
},
{
"epoch": 2.730106644790812,
"grad_norm": 0.02181348390877247,
"learning_rate": 0.0002992700729927007,
"loss": 0.7395,
"step": 1248
},
{
"epoch": 2.732294230243369,
"grad_norm": 0.017193686217069626,
"learning_rate": 0.0002968369829683698,
"loss": 1.0303,
"step": 1249
},
{
"epoch": 2.7344818156959256,
"grad_norm": 0.011623183265328407,
"learning_rate": 0.00029440389294403896,
"loss": 0.5918,
"step": 1250
},
{
"epoch": 2.7366694011484825,
"grad_norm": 0.007596330717206001,
"learning_rate": 0.00029197080291970805,
"loss": 0.6441,
"step": 1251
},
{
"epoch": 2.7388569866010393,
"grad_norm": 0.022759029641747475,
"learning_rate": 0.00028953771289537715,
"loss": 0.6192,
"step": 1252
},
{
"epoch": 2.741044572053596,
"grad_norm": 0.0065732188522815704,
"learning_rate": 0.00028710462287104624,
"loss": 0.73,
"step": 1253
},
{
"epoch": 2.7432321575061525,
"grad_norm": 0.009496266953647137,
"learning_rate": 0.00028467153284671533,
"loss": 0.839,
"step": 1254
},
{
"epoch": 2.7454197429587093,
"grad_norm": 0.007220600266009569,
"learning_rate": 0.0002822384428223844,
"loss": 0.6448,
"step": 1255
},
{
"epoch": 2.747607328411266,
"grad_norm": 0.015215203166007996,
"learning_rate": 0.00027980535279805357,
"loss": 0.7697,
"step": 1256
},
{
"epoch": 2.749794913863823,
"grad_norm": 0.015471878461539745,
"learning_rate": 0.0002773722627737226,
"loss": 0.7398,
"step": 1257
},
{
"epoch": 2.7519824993163793,
"grad_norm": 0.009130065329372883,
"learning_rate": 0.00027493917274939175,
"loss": 0.6993,
"step": 1258
},
{
"epoch": 2.754170084768936,
"grad_norm": 0.007493583485484123,
"learning_rate": 0.0002725060827250608,
"loss": 0.6525,
"step": 1259
},
{
"epoch": 2.756357670221493,
"grad_norm": 0.018882576376199722,
"learning_rate": 0.00027007299270072994,
"loss": 0.785,
"step": 1260
},
{
"epoch": 2.7585452556740497,
"grad_norm": 0.010290750302374363,
"learning_rate": 0.00026763990267639903,
"loss": 0.6355,
"step": 1261
},
{
"epoch": 2.7607328411266066,
"grad_norm": 0.020789271220564842,
"learning_rate": 0.0002652068126520681,
"loss": 0.6681,
"step": 1262
},
{
"epoch": 2.7629204265791634,
"grad_norm": 0.010807972401380539,
"learning_rate": 0.00026277372262773727,
"loss": 0.8581,
"step": 1263
},
{
"epoch": 2.76510801203172,
"grad_norm": 0.006756063550710678,
"learning_rate": 0.0002603406326034063,
"loss": 0.7499,
"step": 1264
},
{
"epoch": 2.767295597484277,
"grad_norm": 0.013115596026182175,
"learning_rate": 0.00025790754257907546,
"loss": 0.6298,
"step": 1265
},
{
"epoch": 2.7694831829368334,
"grad_norm": 0.010143927298486233,
"learning_rate": 0.0002554744525547445,
"loss": 0.7911,
"step": 1266
},
{
"epoch": 2.77167076838939,
"grad_norm": 0.011593978852033615,
"learning_rate": 0.00025304136253041364,
"loss": 0.6558,
"step": 1267
},
{
"epoch": 2.773858353841947,
"grad_norm": 0.011897698044776917,
"learning_rate": 0.00025060827250608273,
"loss": 0.7177,
"step": 1268
},
{
"epoch": 2.776045939294504,
"grad_norm": 0.011287844739854336,
"learning_rate": 0.0002481751824817518,
"loss": 0.8625,
"step": 1269
},
{
"epoch": 2.77823352474706,
"grad_norm": 0.017498012632131577,
"learning_rate": 0.0002457420924574209,
"loss": 0.896,
"step": 1270
},
{
"epoch": 2.780421110199617,
"grad_norm": 0.011069230735301971,
"learning_rate": 0.00024330900243309,
"loss": 0.6567,
"step": 1271
},
{
"epoch": 2.782608695652174,
"grad_norm": 0.005669731646776199,
"learning_rate": 0.00024087591240875913,
"loss": 0.7313,
"step": 1272
},
{
"epoch": 2.7847962811047307,
"grad_norm": 0.02650737576186657,
"learning_rate": 0.00023844282238442825,
"loss": 0.8647,
"step": 1273
},
{
"epoch": 2.7869838665572875,
"grad_norm": 0.010408868081867695,
"learning_rate": 0.0002360097323600973,
"loss": 0.8034,
"step": 1274
},
{
"epoch": 2.7891714520098443,
"grad_norm": 0.013187460601329803,
"learning_rate": 0.00023357664233576643,
"loss": 0.8,
"step": 1275
},
{
"epoch": 2.791359037462401,
"grad_norm": 0.009964399971067905,
"learning_rate": 0.0002311435523114355,
"loss": 0.8949,
"step": 1276
},
{
"epoch": 2.7935466229149575,
"grad_norm": 0.01696036383509636,
"learning_rate": 0.00022871046228710462,
"loss": 0.678,
"step": 1277
},
{
"epoch": 2.7957342083675143,
"grad_norm": 0.07283343374729156,
"learning_rate": 0.00022627737226277374,
"loss": 0.7264,
"step": 1278
},
{
"epoch": 2.797921793820071,
"grad_norm": 0.007607647217810154,
"learning_rate": 0.00022384428223844283,
"loss": 0.8112,
"step": 1279
},
{
"epoch": 2.800109379272628,
"grad_norm": 0.015119451098144054,
"learning_rate": 0.00022141119221411192,
"loss": 0.6995,
"step": 1280
},
{
"epoch": 2.8022969647251843,
"grad_norm": 0.013507510535418987,
"learning_rate": 0.00021897810218978101,
"loss": 0.8193,
"step": 1281
},
{
"epoch": 2.804484550177741,
"grad_norm": 0.007651912048459053,
"learning_rate": 0.00021654501216545013,
"loss": 0.5999,
"step": 1282
},
{
"epoch": 2.806672135630298,
"grad_norm": 0.010115343146026134,
"learning_rate": 0.00021411192214111925,
"loss": 0.7694,
"step": 1283
},
{
"epoch": 2.8088597210828548,
"grad_norm": 0.011188814416527748,
"learning_rate": 0.00021167883211678832,
"loss": 0.8099,
"step": 1284
},
{
"epoch": 2.8110473065354116,
"grad_norm": 0.007763843517750502,
"learning_rate": 0.00020924574209245744,
"loss": 0.7182,
"step": 1285
},
{
"epoch": 2.8132348919879684,
"grad_norm": 0.00900893472135067,
"learning_rate": 0.0002068126520681265,
"loss": 0.6297,
"step": 1286
},
{
"epoch": 2.815422477440525,
"grad_norm": 0.006093029864132404,
"learning_rate": 0.00020437956204379562,
"loss": 1.0166,
"step": 1287
},
{
"epoch": 2.817610062893082,
"grad_norm": 0.008186981081962585,
"learning_rate": 0.00020194647201946474,
"loss": 0.6606,
"step": 1288
},
{
"epoch": 2.8197976483456384,
"grad_norm": 0.011285791173577309,
"learning_rate": 0.0001995133819951338,
"loss": 0.672,
"step": 1289
},
{
"epoch": 2.821985233798195,
"grad_norm": 0.011607305146753788,
"learning_rate": 0.00019708029197080293,
"loss": 0.6903,
"step": 1290
},
{
"epoch": 2.824172819250752,
"grad_norm": 0.008523947559297085,
"learning_rate": 0.00019464720194647202,
"loss": 0.8383,
"step": 1291
},
{
"epoch": 2.826360404703309,
"grad_norm": 0.010200290009379387,
"learning_rate": 0.00019221411192214114,
"loss": 0.7475,
"step": 1292
},
{
"epoch": 2.828547990155865,
"grad_norm": 0.01312936469912529,
"learning_rate": 0.00018978102189781023,
"loss": 0.7571,
"step": 1293
},
{
"epoch": 2.830735575608422,
"grad_norm": 0.021754464134573936,
"learning_rate": 0.00018734793187347932,
"loss": 0.7915,
"step": 1294
},
{
"epoch": 2.832923161060979,
"grad_norm": 0.022569775581359863,
"learning_rate": 0.00018491484184914844,
"loss": 0.7305,
"step": 1295
},
{
"epoch": 2.8351107465135357,
"grad_norm": 0.009172527119517326,
"learning_rate": 0.00018248175182481753,
"loss": 0.8616,
"step": 1296
},
{
"epoch": 2.8372983319660925,
"grad_norm": 0.00900851096957922,
"learning_rate": 0.00018004866180048663,
"loss": 0.8411,
"step": 1297
},
{
"epoch": 2.8394859174186493,
"grad_norm": 0.033786166459321976,
"learning_rate": 0.00017761557177615572,
"loss": 0.6755,
"step": 1298
},
{
"epoch": 2.841673502871206,
"grad_norm": 0.006091755349189043,
"learning_rate": 0.0001751824817518248,
"loss": 0.7822,
"step": 1299
},
{
"epoch": 2.8438610883237625,
"grad_norm": 0.011280403472483158,
"learning_rate": 0.0001727493917274939,
"loss": 0.8669,
"step": 1300
},
{
"epoch": 2.8460486737763193,
"grad_norm": 0.007846282795071602,
"learning_rate": 0.00017031630170316302,
"loss": 0.752,
"step": 1301
},
{
"epoch": 2.848236259228876,
"grad_norm": 0.008928561583161354,
"learning_rate": 0.00016788321167883211,
"loss": 0.7062,
"step": 1302
},
{
"epoch": 2.850423844681433,
"grad_norm": 0.0234297476708889,
"learning_rate": 0.0001654501216545012,
"loss": 0.7319,
"step": 1303
},
{
"epoch": 2.8526114301339898,
"grad_norm": 0.07628759741783142,
"learning_rate": 0.00016301703163017033,
"loss": 0.8256,
"step": 1304
},
{
"epoch": 2.854799015586546,
"grad_norm": 0.00962966587394476,
"learning_rate": 0.00016058394160583942,
"loss": 0.825,
"step": 1305
},
{
"epoch": 2.856986601039103,
"grad_norm": 0.008182559162378311,
"learning_rate": 0.00015815085158150854,
"loss": 0.7628,
"step": 1306
},
{
"epoch": 2.8591741864916598,
"grad_norm": 0.0483902171254158,
"learning_rate": 0.00015571776155717763,
"loss": 0.8631,
"step": 1307
},
{
"epoch": 2.8613617719442166,
"grad_norm": 0.01323285885155201,
"learning_rate": 0.00015328467153284672,
"loss": 0.7958,
"step": 1308
},
{
"epoch": 2.8635493573967734,
"grad_norm": 0.009712522849440575,
"learning_rate": 0.00015085158150851582,
"loss": 0.6506,
"step": 1309
},
{
"epoch": 2.8657369428493302,
"grad_norm": 0.0073866224847733974,
"learning_rate": 0.0001484184914841849,
"loss": 0.5997,
"step": 1310
},
{
"epoch": 2.867924528301887,
"grad_norm": 0.009534020908176899,
"learning_rate": 0.00014598540145985403,
"loss": 0.7732,
"step": 1311
},
{
"epoch": 2.8701121137544434,
"grad_norm": 0.008029601536691189,
"learning_rate": 0.00014355231143552312,
"loss": 0.7837,
"step": 1312
},
{
"epoch": 2.8722996992070002,
"grad_norm": 0.01388575229793787,
"learning_rate": 0.0001411192214111922,
"loss": 0.6959,
"step": 1313
},
{
"epoch": 2.874487284659557,
"grad_norm": 0.011830773204565048,
"learning_rate": 0.0001386861313868613,
"loss": 0.7597,
"step": 1314
},
{
"epoch": 2.876674870112114,
"grad_norm": 0.013655097223818302,
"learning_rate": 0.0001362530413625304,
"loss": 0.6103,
"step": 1315
},
{
"epoch": 2.8788624555646702,
"grad_norm": 0.009793232195079327,
"learning_rate": 0.00013381995133819952,
"loss": 0.7327,
"step": 1316
},
{
"epoch": 2.881050041017227,
"grad_norm": 0.009699089452624321,
"learning_rate": 0.00013138686131386864,
"loss": 0.7882,
"step": 1317
},
{
"epoch": 2.883237626469784,
"grad_norm": 0.01353220921009779,
"learning_rate": 0.00012895377128953773,
"loss": 0.7567,
"step": 1318
},
{
"epoch": 2.8854252119223407,
"grad_norm": 0.012468249537050724,
"learning_rate": 0.00012652068126520682,
"loss": 0.6502,
"step": 1319
},
{
"epoch": 2.8876127973748975,
"grad_norm": 0.010982934385538101,
"learning_rate": 0.0001240875912408759,
"loss": 0.6542,
"step": 1320
},
{
"epoch": 2.8898003828274543,
"grad_norm": 0.008489643223583698,
"learning_rate": 0.000121654501216545,
"loss": 0.7122,
"step": 1321
},
{
"epoch": 2.891987968280011,
"grad_norm": 0.009710462763905525,
"learning_rate": 0.00011922141119221412,
"loss": 0.8059,
"step": 1322
},
{
"epoch": 2.8941755537325675,
"grad_norm": 0.008519637398421764,
"learning_rate": 0.00011678832116788322,
"loss": 0.668,
"step": 1323
},
{
"epoch": 2.8963631391851243,
"grad_norm": 0.012375866994261742,
"learning_rate": 0.00011435523114355231,
"loss": 0.8298,
"step": 1324
},
{
"epoch": 2.898550724637681,
"grad_norm": 0.011852890253067017,
"learning_rate": 0.00011192214111922141,
"loss": 1.0037,
"step": 1325
},
{
"epoch": 2.900738310090238,
"grad_norm": 0.01731940545141697,
"learning_rate": 0.00010948905109489051,
"loss": 0.7002,
"step": 1326
},
{
"epoch": 2.902925895542795,
"grad_norm": 0.026805153116583824,
"learning_rate": 0.00010705596107055963,
"loss": 0.9983,
"step": 1327
},
{
"epoch": 2.905113480995351,
"grad_norm": 0.011630130000412464,
"learning_rate": 0.00010462287104622872,
"loss": 0.575,
"step": 1328
},
{
"epoch": 2.907301066447908,
"grad_norm": 0.012041180394589901,
"learning_rate": 0.00010218978102189781,
"loss": 0.6631,
"step": 1329
},
{
"epoch": 2.909488651900465,
"grad_norm": 0.009331166744232178,
"learning_rate": 9.97566909975669e-05,
"loss": 0.7661,
"step": 1330
},
{
"epoch": 2.9116762373530216,
"grad_norm": 0.010035173036158085,
"learning_rate": 9.732360097323601e-05,
"loss": 0.7367,
"step": 1331
},
{
"epoch": 2.9138638228055784,
"grad_norm": 0.0184579249471426,
"learning_rate": 9.489051094890511e-05,
"loss": 0.7267,
"step": 1332
},
{
"epoch": 2.9160514082581352,
"grad_norm": 0.019723238423466682,
"learning_rate": 9.245742092457422e-05,
"loss": 0.9285,
"step": 1333
},
{
"epoch": 2.918238993710692,
"grad_norm": 0.01119768712669611,
"learning_rate": 9.002433090024331e-05,
"loss": 0.8886,
"step": 1334
},
{
"epoch": 2.9204265791632484,
"grad_norm": 0.010187883861362934,
"learning_rate": 8.75912408759124e-05,
"loss": 0.6872,
"step": 1335
},
{
"epoch": 2.9226141646158053,
"grad_norm": 0.006695912219583988,
"learning_rate": 8.515815085158151e-05,
"loss": 0.6093,
"step": 1336
},
{
"epoch": 2.924801750068362,
"grad_norm": 0.009726252406835556,
"learning_rate": 8.27250608272506e-05,
"loss": 0.735,
"step": 1337
},
{
"epoch": 2.926989335520919,
"grad_norm": 0.006968527100980282,
"learning_rate": 8.029197080291971e-05,
"loss": 0.9525,
"step": 1338
},
{
"epoch": 2.9291769209734753,
"grad_norm": 0.019444549456238747,
"learning_rate": 7.785888077858882e-05,
"loss": 0.7423,
"step": 1339
},
{
"epoch": 2.931364506426032,
"grad_norm": 0.014326276257634163,
"learning_rate": 7.542579075425791e-05,
"loss": 0.7437,
"step": 1340
},
{
"epoch": 2.933552091878589,
"grad_norm": 0.008168605156242847,
"learning_rate": 7.299270072992701e-05,
"loss": 0.7014,
"step": 1341
},
{
"epoch": 2.9357396773311457,
"grad_norm": 0.010011604055762291,
"learning_rate": 7.05596107055961e-05,
"loss": 0.6541,
"step": 1342
},
{
"epoch": 2.9379272627837025,
"grad_norm": 0.013739430345594883,
"learning_rate": 6.81265206812652e-05,
"loss": 0.7885,
"step": 1343
},
{
"epoch": 2.9401148482362593,
"grad_norm": 0.01414500456303358,
"learning_rate": 6.569343065693432e-05,
"loss": 0.9111,
"step": 1344
},
{
"epoch": 2.942302433688816,
"grad_norm": 0.010208160616457462,
"learning_rate": 6.326034063260341e-05,
"loss": 0.6641,
"step": 1345
},
{
"epoch": 2.944490019141373,
"grad_norm": 0.012237477116286755,
"learning_rate": 6.08272506082725e-05,
"loss": 0.6199,
"step": 1346
},
{
"epoch": 2.9466776045939294,
"grad_norm": 0.008850525133311749,
"learning_rate": 5.839416058394161e-05,
"loss": 0.8436,
"step": 1347
},
{
"epoch": 2.948865190046486,
"grad_norm": 0.01408157218247652,
"learning_rate": 5.596107055961071e-05,
"loss": 0.667,
"step": 1348
},
{
"epoch": 2.951052775499043,
"grad_norm": 0.017354557290673256,
"learning_rate": 5.352798053527981e-05,
"loss": 0.7591,
"step": 1349
},
{
"epoch": 2.9532403609516,
"grad_norm": 0.013411460444331169,
"learning_rate": 5.1094890510948905e-05,
"loss": 0.8248,
"step": 1350
},
{
"epoch": 2.955427946404156,
"grad_norm": 0.018828334286808968,
"learning_rate": 4.8661800486618005e-05,
"loss": 0.8297,
"step": 1351
},
{
"epoch": 2.957615531856713,
"grad_norm": 0.012131531722843647,
"learning_rate": 4.622871046228711e-05,
"loss": 0.8469,
"step": 1352
},
{
"epoch": 2.95980311730927,
"grad_norm": 0.017933214083313942,
"learning_rate": 4.37956204379562e-05,
"loss": 0.886,
"step": 1353
},
{
"epoch": 2.9619907027618266,
"grad_norm": 0.007120661437511444,
"learning_rate": 4.13625304136253e-05,
"loss": 0.7975,
"step": 1354
},
{
"epoch": 2.9641782882143834,
"grad_norm": 0.008959448896348476,
"learning_rate": 3.892944038929441e-05,
"loss": 0.7624,
"step": 1355
},
{
"epoch": 2.9663658736669403,
"grad_norm": 0.00703001581132412,
"learning_rate": 3.649635036496351e-05,
"loss": 0.9414,
"step": 1356
},
{
"epoch": 2.968553459119497,
"grad_norm": 0.009628667496144772,
"learning_rate": 3.40632603406326e-05,
"loss": 0.7348,
"step": 1357
},
{
"epoch": 2.9707410445720535,
"grad_norm": 0.010123343206942081,
"learning_rate": 3.1630170316301705e-05,
"loss": 0.5589,
"step": 1358
},
{
"epoch": 2.9729286300246103,
"grad_norm": 0.012991656549274921,
"learning_rate": 2.9197080291970804e-05,
"loss": 0.7015,
"step": 1359
},
{
"epoch": 2.975116215477167,
"grad_norm": 0.008844063617289066,
"learning_rate": 2.6763990267639907e-05,
"loss": 0.7395,
"step": 1360
},
{
"epoch": 2.977303800929724,
"grad_norm": 0.010974117554724216,
"learning_rate": 2.4330900243309002e-05,
"loss": 0.815,
"step": 1361
},
{
"epoch": 2.9794913863822803,
"grad_norm": 0.011202923953533173,
"learning_rate": 2.18978102189781e-05,
"loss": 0.7593,
"step": 1362
},
{
"epoch": 2.981678971834837,
"grad_norm": 0.011004596017301083,
"learning_rate": 1.9464720194647204e-05,
"loss": 0.6727,
"step": 1363
},
{
"epoch": 2.983866557287394,
"grad_norm": 0.009554206393659115,
"learning_rate": 1.70316301703163e-05,
"loss": 0.8229,
"step": 1364
},
{
"epoch": 2.9860541427399507,
"grad_norm": 0.013814912177622318,
"learning_rate": 1.4598540145985402e-05,
"loss": 1.0031,
"step": 1365
},
{
"epoch": 2.9882417281925076,
"grad_norm": 0.006289259064942598,
"learning_rate": 1.2165450121654501e-05,
"loss": 0.6995,
"step": 1366
},
{
"epoch": 2.9904293136450644,
"grad_norm": 0.008405916392803192,
"learning_rate": 9.732360097323602e-06,
"loss": 0.7135,
"step": 1367
},
{
"epoch": 2.992616899097621,
"grad_norm": 0.012755095958709717,
"learning_rate": 7.299270072992701e-06,
"loss": 0.8523,
"step": 1368
},
{
"epoch": 2.994804484550178,
"grad_norm": 0.011079053394496441,
"learning_rate": 4.866180048661801e-06,
"loss": 0.6673,
"step": 1369
},
{
"epoch": 2.9969920700027344,
"grad_norm": 0.011697685346007347,
"learning_rate": 2.4330900243309005e-06,
"loss": 0.7831,
"step": 1370
},
{
"epoch": 2.999179655455291,
"grad_norm": 0.0072103943675756454,
"learning_rate": 0.0,
"loss": 0.8479,
"step": 1371
},
{
"epoch": 2.999179655455291,
"step": 1371,
"total_flos": 4.3134948379459584e+17,
"train_loss": 0.7785058324133541,
"train_runtime": 1561.6761,
"train_samples_per_second": 14.048,
"train_steps_per_second": 0.878
}
],
"logging_steps": 1.0,
"max_steps": 1371,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.3134948379459584e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}