{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.043402401020824474, "eval_steps": 0, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 8.680480204164894e-06, "grad_norm": 15.5625, "learning_rate": 0.0, "loss": 1.2891, "step": 1 }, { "epoch": 1.736096040832979e-05, "grad_norm": 22.625, "learning_rate": 2e-06, "loss": 1.6406, "step": 2 }, { "epoch": 2.6041440612494685e-05, "grad_norm": 10.75, "learning_rate": 4e-06, "loss": 1.1016, "step": 3 }, { "epoch": 3.472192081665958e-05, "grad_norm": 10.6875, "learning_rate": 6e-06, "loss": 0.9922, "step": 4 }, { "epoch": 4.3402401020824474e-05, "grad_norm": 18.125, "learning_rate": 8e-06, "loss": 1.4375, "step": 5 }, { "epoch": 5.208288122498937e-05, "grad_norm": 10.1875, "learning_rate": 1e-05, "loss": 1.0, "step": 6 }, { "epoch": 6.076336142915426e-05, "grad_norm": 12.9375, "learning_rate": 1.2e-05, "loss": 1.2344, "step": 7 }, { "epoch": 6.944384163331915e-05, "grad_norm": 15.8125, "learning_rate": 1.4e-05, "loss": 1.1875, "step": 8 }, { "epoch": 7.812432183748404e-05, "grad_norm": 13.3125, "learning_rate": 1.6e-05, "loss": 1.0469, "step": 9 }, { "epoch": 8.680480204164895e-05, "grad_norm": 11.375, "learning_rate": 1.8e-05, "loss": 0.9844, "step": 10 }, { "epoch": 9.548528224581384e-05, "grad_norm": 11.125, "learning_rate": 2e-05, "loss": 1.0391, "step": 11 }, { "epoch": 0.00010416576244997874, "grad_norm": 12.0625, "learning_rate": 2.2e-05, "loss": 1.0469, "step": 12 }, { "epoch": 0.00011284624265414363, "grad_norm": 7.25, "learning_rate": 2.4e-05, "loss": 0.8203, "step": 13 }, { "epoch": 0.00012152672285830852, "grad_norm": 18.5, "learning_rate": 2.6e-05, "loss": 1.0312, "step": 14 }, { "epoch": 0.0001302072030624734, "grad_norm": 7.4375, "learning_rate": 2.8e-05, "loss": 0.9141, "step": 15 }, { "epoch": 0.0001388876832666383, "grad_norm": 4.5625, "learning_rate": 3e-05, "loss": 0.7305, "step": 16 }, { "epoch": 0.0001475681634708032, "grad_norm": 4.15625, "learning_rate": 3.2e-05, "loss": 0.7031, "step": 17 }, { "epoch": 0.0001562486436749681, "grad_norm": 4.6875, "learning_rate": 3.4000000000000007e-05, "loss": 0.5977, "step": 18 }, { "epoch": 0.000164929123879133, "grad_norm": 3.75, "learning_rate": 3.6e-05, "loss": 0.625, "step": 19 }, { "epoch": 0.0001736096040832979, "grad_norm": 2.65625, "learning_rate": 3.8e-05, "loss": 0.5898, "step": 20 }, { "epoch": 0.00018229008428746277, "grad_norm": 2.25, "learning_rate": 4e-05, "loss": 0.582, "step": 21 }, { "epoch": 0.00019097056449162767, "grad_norm": 2.65625, "learning_rate": 4.2000000000000004e-05, "loss": 0.5977, "step": 22 }, { "epoch": 0.00019965104469579258, "grad_norm": 2.09375, "learning_rate": 4.4e-05, "loss": 0.4551, "step": 23 }, { "epoch": 0.00020833152489995748, "grad_norm": 1.625, "learning_rate": 4.6e-05, "loss": 0.4844, "step": 24 }, { "epoch": 0.00021701200510412235, "grad_norm": 1.7421875, "learning_rate": 4.8e-05, "loss": 0.5156, "step": 25 }, { "epoch": 0.00022569248530828726, "grad_norm": 3.703125, "learning_rate": 5e-05, "loss": 0.4688, "step": 26 }, { "epoch": 0.00023437296551245216, "grad_norm": 1.296875, "learning_rate": 5.2e-05, "loss": 0.457, "step": 27 }, { "epoch": 0.00024305344571661704, "grad_norm": 2.0625, "learning_rate": 5.4e-05, "loss": 0.5938, "step": 28 }, { "epoch": 0.0002517339259207819, "grad_norm": 1.2890625, "learning_rate": 5.6e-05, "loss": 0.4648, "step": 29 }, { "epoch": 0.0002604144061249468, "grad_norm": 1.125, "learning_rate": 5.800000000000001e-05, "loss": 0.4805, "step": 30 }, { "epoch": 0.0002690948863291117, "grad_norm": 1.265625, "learning_rate": 6e-05, "loss": 0.5234, "step": 31 }, { "epoch": 0.0002777753665332766, "grad_norm": 1.0078125, "learning_rate": 6.2e-05, "loss": 0.4297, "step": 32 }, { "epoch": 0.0002864558467374415, "grad_norm": 1.0078125, "learning_rate": 6.4e-05, "loss": 0.3555, "step": 33 }, { "epoch": 0.0002951363269416064, "grad_norm": 0.984375, "learning_rate": 6.6e-05, "loss": 0.3633, "step": 34 }, { "epoch": 0.00030381680714577133, "grad_norm": 0.78125, "learning_rate": 6.800000000000001e-05, "loss": 0.3398, "step": 35 }, { "epoch": 0.0003124972873499362, "grad_norm": 0.87890625, "learning_rate": 7.000000000000001e-05, "loss": 0.3574, "step": 36 }, { "epoch": 0.0003211777675541011, "grad_norm": 0.6015625, "learning_rate": 7.2e-05, "loss": 0.3145, "step": 37 }, { "epoch": 0.000329858247758266, "grad_norm": 0.6875, "learning_rate": 7.4e-05, "loss": 0.3906, "step": 38 }, { "epoch": 0.0003385387279624309, "grad_norm": 0.73046875, "learning_rate": 7.6e-05, "loss": 0.3477, "step": 39 }, { "epoch": 0.0003472192081665958, "grad_norm": 0.71484375, "learning_rate": 7.8e-05, "loss": 0.3477, "step": 40 }, { "epoch": 0.0003558996883707607, "grad_norm": 0.59765625, "learning_rate": 8e-05, "loss": 0.3711, "step": 41 }, { "epoch": 0.00036458016857492554, "grad_norm": 0.5546875, "learning_rate": 8.2e-05, "loss": 0.3398, "step": 42 }, { "epoch": 0.00037326064877909044, "grad_norm": 0.5703125, "learning_rate": 8.400000000000001e-05, "loss": 0.3145, "step": 43 }, { "epoch": 0.00038194112898325535, "grad_norm": 0.55078125, "learning_rate": 8.599999999999999e-05, "loss": 0.332, "step": 44 }, { "epoch": 0.00039062160918742025, "grad_norm": 0.671875, "learning_rate": 8.8e-05, "loss": 0.4297, "step": 45 }, { "epoch": 0.00039930208939158515, "grad_norm": 0.68359375, "learning_rate": 8.999999999999999e-05, "loss": 0.3945, "step": 46 }, { "epoch": 0.00040798256959575005, "grad_norm": 0.482421875, "learning_rate": 9.2e-05, "loss": 0.3066, "step": 47 }, { "epoch": 0.00041666304979991496, "grad_norm": 0.5703125, "learning_rate": 9.400000000000001e-05, "loss": 0.2988, "step": 48 }, { "epoch": 0.0004253435300040798, "grad_norm": 0.4609375, "learning_rate": 9.6e-05, "loss": 0.3203, "step": 49 }, { "epoch": 0.0004340240102082447, "grad_norm": 0.6015625, "learning_rate": 9.800000000000001e-05, "loss": 0.3828, "step": 50 }, { "epoch": 0.0004427044904124096, "grad_norm": 0.48828125, "learning_rate": 0.0001, "loss": 0.2773, "step": 51 }, { "epoch": 0.0004513849706165745, "grad_norm": 0.392578125, "learning_rate": 0.000102, "loss": 0.3672, "step": 52 }, { "epoch": 0.0004600654508207394, "grad_norm": 0.5859375, "learning_rate": 0.000104, "loss": 0.3965, "step": 53 }, { "epoch": 0.0004687459310249043, "grad_norm": 0.70703125, "learning_rate": 0.000106, "loss": 0.6133, "step": 54 }, { "epoch": 0.00047742641122906917, "grad_norm": 0.357421875, "learning_rate": 0.000108, "loss": 0.3184, "step": 55 }, { "epoch": 0.00048610689143323407, "grad_norm": 0.5703125, "learning_rate": 0.00011, "loss": 0.4648, "step": 56 }, { "epoch": 0.000494787371637399, "grad_norm": 0.408203125, "learning_rate": 0.000112, "loss": 0.3496, "step": 57 }, { "epoch": 0.0005034678518415638, "grad_norm": 0.4375, "learning_rate": 0.000114, "loss": 0.3008, "step": 58 }, { "epoch": 0.0005121483320457287, "grad_norm": 0.86328125, "learning_rate": 0.00011600000000000001, "loss": 0.3867, "step": 59 }, { "epoch": 0.0005208288122498936, "grad_norm": 0.455078125, "learning_rate": 0.000118, "loss": 0.3359, "step": 60 }, { "epoch": 0.0005295092924540585, "grad_norm": 0.412109375, "learning_rate": 0.00012, "loss": 0.3926, "step": 61 }, { "epoch": 0.0005381897726582234, "grad_norm": 0.361328125, "learning_rate": 0.000122, "loss": 0.375, "step": 62 }, { "epoch": 0.0005468702528623883, "grad_norm": 0.51171875, "learning_rate": 0.000124, "loss": 0.3574, "step": 63 }, { "epoch": 0.0005555507330665532, "grad_norm": 0.3125, "learning_rate": 0.000126, "loss": 0.2695, "step": 64 }, { "epoch": 0.0005642312132707181, "grad_norm": 0.408203125, "learning_rate": 0.000128, "loss": 0.3594, "step": 65 }, { "epoch": 0.000572911693474883, "grad_norm": 0.3984375, "learning_rate": 0.00013000000000000002, "loss": 0.3438, "step": 66 }, { "epoch": 0.000581592173679048, "grad_norm": 0.369140625, "learning_rate": 0.000132, "loss": 0.3047, "step": 67 }, { "epoch": 0.0005902726538832129, "grad_norm": 0.306640625, "learning_rate": 0.000134, "loss": 0.2773, "step": 68 }, { "epoch": 0.0005989531340873778, "grad_norm": 0.443359375, "learning_rate": 0.00013600000000000003, "loss": 0.3242, "step": 69 }, { "epoch": 0.0006076336142915427, "grad_norm": 0.380859375, "learning_rate": 0.00013800000000000002, "loss": 0.3203, "step": 70 }, { "epoch": 0.0006163140944957075, "grad_norm": 0.314453125, "learning_rate": 0.00014000000000000001, "loss": 0.3047, "step": 71 }, { "epoch": 0.0006249945746998724, "grad_norm": 0.5703125, "learning_rate": 0.00014199999999999998, "loss": 0.3887, "step": 72 }, { "epoch": 0.0006336750549040373, "grad_norm": 0.3828125, "learning_rate": 0.000144, "loss": 0.332, "step": 73 }, { "epoch": 0.0006423555351082022, "grad_norm": 0.35546875, "learning_rate": 0.000146, "loss": 0.3203, "step": 74 }, { "epoch": 0.0006510360153123671, "grad_norm": 0.369140625, "learning_rate": 0.000148, "loss": 0.3555, "step": 75 }, { "epoch": 0.000659716495516532, "grad_norm": 0.4765625, "learning_rate": 0.00015, "loss": 0.377, "step": 76 }, { "epoch": 0.0006683969757206969, "grad_norm": 0.376953125, "learning_rate": 0.000152, "loss": 0.2656, "step": 77 }, { "epoch": 0.0006770774559248618, "grad_norm": 0.484375, "learning_rate": 0.000154, "loss": 0.4102, "step": 78 }, { "epoch": 0.0006857579361290267, "grad_norm": 0.46875, "learning_rate": 0.000156, "loss": 0.2949, "step": 79 }, { "epoch": 0.0006944384163331916, "grad_norm": 0.306640625, "learning_rate": 0.000158, "loss": 0.3906, "step": 80 }, { "epoch": 0.0007031188965373565, "grad_norm": 0.294921875, "learning_rate": 0.00016, "loss": 0.332, "step": 81 }, { "epoch": 0.0007117993767415214, "grad_norm": 0.65234375, "learning_rate": 0.000162, "loss": 0.3164, "step": 82 }, { "epoch": 0.0007204798569456863, "grad_norm": 0.296875, "learning_rate": 0.000164, "loss": 0.3535, "step": 83 }, { "epoch": 0.0007291603371498511, "grad_norm": 0.400390625, "learning_rate": 0.00016600000000000002, "loss": 0.3438, "step": 84 }, { "epoch": 0.000737840817354016, "grad_norm": 0.314453125, "learning_rate": 0.00016800000000000002, "loss": 0.3066, "step": 85 }, { "epoch": 0.0007465212975581809, "grad_norm": 0.318359375, "learning_rate": 0.00017, "loss": 0.3262, "step": 86 }, { "epoch": 0.0007552017777623458, "grad_norm": 0.404296875, "learning_rate": 0.00017199999999999998, "loss": 0.334, "step": 87 }, { "epoch": 0.0007638822579665107, "grad_norm": 0.4296875, "learning_rate": 0.000174, "loss": 0.3242, "step": 88 }, { "epoch": 0.0007725627381706756, "grad_norm": 0.310546875, "learning_rate": 0.000176, "loss": 0.2539, "step": 89 }, { "epoch": 0.0007812432183748405, "grad_norm": 0.271484375, "learning_rate": 0.000178, "loss": 0.3184, "step": 90 }, { "epoch": 0.0007899236985790054, "grad_norm": 0.34375, "learning_rate": 0.00017999999999999998, "loss": 0.3398, "step": 91 }, { "epoch": 0.0007986041787831703, "grad_norm": 0.287109375, "learning_rate": 0.000182, "loss": 0.3516, "step": 92 }, { "epoch": 0.0008072846589873352, "grad_norm": 0.328125, "learning_rate": 0.000184, "loss": 0.2852, "step": 93 }, { "epoch": 0.0008159651391915001, "grad_norm": 0.357421875, "learning_rate": 0.000186, "loss": 0.3184, "step": 94 }, { "epoch": 0.000824645619395665, "grad_norm": 0.404296875, "learning_rate": 0.00018800000000000002, "loss": 0.2871, "step": 95 }, { "epoch": 0.0008333260995998299, "grad_norm": 0.267578125, "learning_rate": 0.00019, "loss": 0.2637, "step": 96 }, { "epoch": 0.0008420065798039947, "grad_norm": 0.318359375, "learning_rate": 0.000192, "loss": 0.2559, "step": 97 }, { "epoch": 0.0008506870600081596, "grad_norm": 0.36328125, "learning_rate": 0.000194, "loss": 0.4199, "step": 98 }, { "epoch": 0.0008593675402123245, "grad_norm": 0.5703125, "learning_rate": 0.00019600000000000002, "loss": 0.3438, "step": 99 }, { "epoch": 0.0008680480204164894, "grad_norm": 0.38671875, "learning_rate": 0.00019800000000000002, "loss": 0.3398, "step": 100 }, { "epoch": 0.0008767285006206543, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.3555, "step": 101 }, { "epoch": 0.0008854089808248192, "grad_norm": 0.3359375, "learning_rate": 0.000202, "loss": 0.249, "step": 102 }, { "epoch": 0.0008940894610289841, "grad_norm": 0.6015625, "learning_rate": 0.000204, "loss": 0.3535, "step": 103 }, { "epoch": 0.000902769941233149, "grad_norm": 0.314453125, "learning_rate": 0.000206, "loss": 0.3887, "step": 104 }, { "epoch": 0.0009114504214373139, "grad_norm": 0.255859375, "learning_rate": 0.000208, "loss": 0.2383, "step": 105 }, { "epoch": 0.0009201309016414788, "grad_norm": 0.314453125, "learning_rate": 0.00021, "loss": 0.3086, "step": 106 }, { "epoch": 0.0009288113818456437, "grad_norm": 0.375, "learning_rate": 0.000212, "loss": 0.3809, "step": 107 }, { "epoch": 0.0009374918620498086, "grad_norm": 0.333984375, "learning_rate": 0.000214, "loss": 0.2969, "step": 108 }, { "epoch": 0.0009461723422539735, "grad_norm": 0.291015625, "learning_rate": 0.000216, "loss": 0.3691, "step": 109 }, { "epoch": 0.0009548528224581383, "grad_norm": 0.271484375, "learning_rate": 0.000218, "loss": 0.332, "step": 110 }, { "epoch": 0.0009635333026623032, "grad_norm": 0.400390625, "learning_rate": 0.00022, "loss": 0.3828, "step": 111 }, { "epoch": 0.0009722137828664681, "grad_norm": 0.263671875, "learning_rate": 0.000222, "loss": 0.4023, "step": 112 }, { "epoch": 0.0009808942630706332, "grad_norm": 0.333984375, "learning_rate": 0.000224, "loss": 0.3145, "step": 113 }, { "epoch": 0.000989574743274798, "grad_norm": 0.3203125, "learning_rate": 0.00022600000000000002, "loss": 0.3945, "step": 114 }, { "epoch": 0.000998255223478963, "grad_norm": 0.23046875, "learning_rate": 0.000228, "loss": 0.2969, "step": 115 }, { "epoch": 0.0010069357036831276, "grad_norm": 0.32421875, "learning_rate": 0.00023, "loss": 0.3438, "step": 116 }, { "epoch": 0.0010156161838872925, "grad_norm": 0.59375, "learning_rate": 0.00023200000000000003, "loss": 0.6211, "step": 117 }, { "epoch": 0.0010242966640914575, "grad_norm": 0.35546875, "learning_rate": 0.00023400000000000002, "loss": 0.3145, "step": 118 }, { "epoch": 0.0010329771442956224, "grad_norm": 0.30859375, "learning_rate": 0.000236, "loss": 0.3477, "step": 119 }, { "epoch": 0.0010416576244997873, "grad_norm": 0.265625, "learning_rate": 0.00023799999999999998, "loss": 0.3789, "step": 120 }, { "epoch": 0.0010503381047039522, "grad_norm": 0.21875, "learning_rate": 0.00024, "loss": 0.2422, "step": 121 }, { "epoch": 0.001059018584908117, "grad_norm": 0.345703125, "learning_rate": 0.000242, "loss": 0.2988, "step": 122 }, { "epoch": 0.001067699065112282, "grad_norm": 0.318359375, "learning_rate": 0.000244, "loss": 0.2949, "step": 123 }, { "epoch": 0.0010763795453164469, "grad_norm": 0.2392578125, "learning_rate": 0.000246, "loss": 0.2451, "step": 124 }, { "epoch": 0.0010850600255206118, "grad_norm": 0.296875, "learning_rate": 0.000248, "loss": 0.291, "step": 125 }, { "epoch": 0.0010937405057247767, "grad_norm": 0.30078125, "learning_rate": 0.00025, "loss": 0.3516, "step": 126 }, { "epoch": 0.0011024209859289416, "grad_norm": 0.296875, "learning_rate": 0.000252, "loss": 0.3301, "step": 127 }, { "epoch": 0.0011111014661331065, "grad_norm": 0.251953125, "learning_rate": 0.000254, "loss": 0.252, "step": 128 }, { "epoch": 0.0011197819463372714, "grad_norm": 0.228515625, "learning_rate": 0.000256, "loss": 0.2539, "step": 129 }, { "epoch": 0.0011284624265414363, "grad_norm": 0.76953125, "learning_rate": 0.00025800000000000004, "loss": 0.3535, "step": 130 }, { "epoch": 0.0011371429067456012, "grad_norm": 0.296875, "learning_rate": 0.00026000000000000003, "loss": 0.3242, "step": 131 }, { "epoch": 0.001145823386949766, "grad_norm": 0.380859375, "learning_rate": 0.000262, "loss": 0.3184, "step": 132 }, { "epoch": 0.001154503867153931, "grad_norm": 0.427734375, "learning_rate": 0.000264, "loss": 0.5039, "step": 133 }, { "epoch": 0.001163184347358096, "grad_norm": 0.27734375, "learning_rate": 0.000266, "loss": 0.2656, "step": 134 }, { "epoch": 0.0011718648275622608, "grad_norm": 0.314453125, "learning_rate": 0.000268, "loss": 0.3652, "step": 135 }, { "epoch": 0.0011805453077664257, "grad_norm": 0.255859375, "learning_rate": 0.00027, "loss": 0.3086, "step": 136 }, { "epoch": 0.0011892257879705906, "grad_norm": 0.275390625, "learning_rate": 0.00027200000000000005, "loss": 0.3145, "step": 137 }, { "epoch": 0.0011979062681747555, "grad_norm": 0.353515625, "learning_rate": 0.00027400000000000005, "loss": 0.4375, "step": 138 }, { "epoch": 0.0012065867483789204, "grad_norm": 0.2578125, "learning_rate": 0.00027600000000000004, "loss": 0.3203, "step": 139 }, { "epoch": 0.0012152672285830853, "grad_norm": 0.2275390625, "learning_rate": 0.00027800000000000004, "loss": 0.2949, "step": 140 }, { "epoch": 0.0012239477087872502, "grad_norm": 0.322265625, "learning_rate": 0.00028000000000000003, "loss": 0.3613, "step": 141 }, { "epoch": 0.001232628188991415, "grad_norm": 0.337890625, "learning_rate": 0.00028199999999999997, "loss": 0.3828, "step": 142 }, { "epoch": 0.0012413086691955798, "grad_norm": 0.21484375, "learning_rate": 0.00028399999999999996, "loss": 0.3125, "step": 143 }, { "epoch": 0.0012499891493997447, "grad_norm": 0.248046875, "learning_rate": 0.00028599999999999996, "loss": 0.2773, "step": 144 }, { "epoch": 0.0012586696296039096, "grad_norm": 0.296875, "learning_rate": 0.000288, "loss": 0.293, "step": 145 }, { "epoch": 0.0012673501098080745, "grad_norm": 0.2333984375, "learning_rate": 0.00029, "loss": 0.2461, "step": 146 }, { "epoch": 0.0012760305900122394, "grad_norm": 0.2255859375, "learning_rate": 0.000292, "loss": 0.2461, "step": 147 }, { "epoch": 0.0012847110702164043, "grad_norm": 0.27734375, "learning_rate": 0.000294, "loss": 0.3887, "step": 148 }, { "epoch": 0.0012933915504205692, "grad_norm": 0.2373046875, "learning_rate": 0.000296, "loss": 0.3105, "step": 149 }, { "epoch": 0.0013020720306247341, "grad_norm": 0.1806640625, "learning_rate": 0.000298, "loss": 0.2402, "step": 150 }, { "epoch": 0.001310752510828899, "grad_norm": 0.255859375, "learning_rate": 0.0003, "loss": 0.2734, "step": 151 }, { "epoch": 0.001319432991033064, "grad_norm": 0.259765625, "learning_rate": 0.000302, "loss": 0.3125, "step": 152 }, { "epoch": 0.0013281134712372288, "grad_norm": 0.251953125, "learning_rate": 0.000304, "loss": 0.3223, "step": 153 }, { "epoch": 0.0013367939514413937, "grad_norm": 0.23828125, "learning_rate": 0.000306, "loss": 0.2695, "step": 154 }, { "epoch": 0.0013454744316455586, "grad_norm": 0.19921875, "learning_rate": 0.000308, "loss": 0.2441, "step": 155 }, { "epoch": 0.0013541549118497235, "grad_norm": 0.244140625, "learning_rate": 0.00031, "loss": 0.375, "step": 156 }, { "epoch": 0.0013628353920538884, "grad_norm": 0.3125, "learning_rate": 0.000312, "loss": 0.3242, "step": 157 }, { "epoch": 0.0013715158722580533, "grad_norm": 0.24609375, "learning_rate": 0.000314, "loss": 0.293, "step": 158 }, { "epoch": 0.0013801963524622182, "grad_norm": 0.2421875, "learning_rate": 0.000316, "loss": 0.3027, "step": 159 }, { "epoch": 0.0013888768326663832, "grad_norm": 0.21484375, "learning_rate": 0.00031800000000000003, "loss": 0.2217, "step": 160 }, { "epoch": 0.001397557312870548, "grad_norm": 0.265625, "learning_rate": 0.00032, "loss": 0.3691, "step": 161 }, { "epoch": 0.001406237793074713, "grad_norm": 0.29296875, "learning_rate": 0.000322, "loss": 0.2891, "step": 162 }, { "epoch": 0.0014149182732788779, "grad_norm": 0.2353515625, "learning_rate": 0.000324, "loss": 0.3301, "step": 163 }, { "epoch": 0.0014235987534830428, "grad_norm": 0.2421875, "learning_rate": 0.000326, "loss": 0.334, "step": 164 }, { "epoch": 0.0014322792336872077, "grad_norm": 0.23828125, "learning_rate": 0.000328, "loss": 0.2988, "step": 165 }, { "epoch": 0.0014409597138913726, "grad_norm": 0.2177734375, "learning_rate": 0.00033, "loss": 0.2578, "step": 166 }, { "epoch": 0.0014496401940955375, "grad_norm": 0.2734375, "learning_rate": 0.00033200000000000005, "loss": 0.3008, "step": 167 }, { "epoch": 0.0014583206742997022, "grad_norm": 0.25, "learning_rate": 0.00033400000000000004, "loss": 0.293, "step": 168 }, { "epoch": 0.001467001154503867, "grad_norm": 0.255859375, "learning_rate": 0.00033600000000000004, "loss": 0.2832, "step": 169 }, { "epoch": 0.001475681634708032, "grad_norm": 0.25390625, "learning_rate": 0.00033800000000000003, "loss": 0.2676, "step": 170 }, { "epoch": 0.0014843621149121969, "grad_norm": 0.2578125, "learning_rate": 0.00034, "loss": 0.2734, "step": 171 }, { "epoch": 0.0014930425951163618, "grad_norm": 0.251953125, "learning_rate": 0.000342, "loss": 0.3125, "step": 172 }, { "epoch": 0.0015017230753205267, "grad_norm": 0.333984375, "learning_rate": 0.00034399999999999996, "loss": 0.3008, "step": 173 }, { "epoch": 0.0015104035555246916, "grad_norm": 0.2890625, "learning_rate": 0.000346, "loss": 0.332, "step": 174 }, { "epoch": 0.0015190840357288565, "grad_norm": 0.26953125, "learning_rate": 0.000348, "loss": 0.2539, "step": 175 }, { "epoch": 0.0015277645159330214, "grad_norm": 0.478515625, "learning_rate": 0.00035, "loss": 0.3203, "step": 176 }, { "epoch": 0.0015364449961371863, "grad_norm": 0.302734375, "learning_rate": 0.000352, "loss": 0.3066, "step": 177 }, { "epoch": 0.0015451254763413512, "grad_norm": 0.330078125, "learning_rate": 0.000354, "loss": 0.3047, "step": 178 }, { "epoch": 0.001553805956545516, "grad_norm": 0.265625, "learning_rate": 0.000356, "loss": 0.3184, "step": 179 }, { "epoch": 0.001562486436749681, "grad_norm": 0.251953125, "learning_rate": 0.000358, "loss": 0.3105, "step": 180 }, { "epoch": 0.001571166916953846, "grad_norm": 0.3671875, "learning_rate": 0.00035999999999999997, "loss": 0.3125, "step": 181 }, { "epoch": 0.0015798473971580108, "grad_norm": 0.201171875, "learning_rate": 0.000362, "loss": 0.3223, "step": 182 }, { "epoch": 0.0015885278773621757, "grad_norm": 0.3671875, "learning_rate": 0.000364, "loss": 0.4199, "step": 183 }, { "epoch": 0.0015972083575663406, "grad_norm": 0.248046875, "learning_rate": 0.000366, "loss": 0.3281, "step": 184 }, { "epoch": 0.0016058888377705055, "grad_norm": 0.228515625, "learning_rate": 0.000368, "loss": 0.2891, "step": 185 }, { "epoch": 0.0016145693179746704, "grad_norm": 0.23828125, "learning_rate": 0.00037, "loss": 0.3555, "step": 186 }, { "epoch": 0.0016232497981788353, "grad_norm": 0.294921875, "learning_rate": 0.000372, "loss": 0.3457, "step": 187 }, { "epoch": 0.0016319302783830002, "grad_norm": 0.2578125, "learning_rate": 0.000374, "loss": 0.2988, "step": 188 }, { "epoch": 0.0016406107585871651, "grad_norm": 0.25390625, "learning_rate": 0.00037600000000000003, "loss": 0.3242, "step": 189 }, { "epoch": 0.00164929123879133, "grad_norm": 0.27734375, "learning_rate": 0.000378, "loss": 0.2949, "step": 190 }, { "epoch": 0.001657971718995495, "grad_norm": 0.2578125, "learning_rate": 0.00038, "loss": 0.3672, "step": 191 }, { "epoch": 0.0016666521991996598, "grad_norm": 0.19921875, "learning_rate": 0.000382, "loss": 0.3047, "step": 192 }, { "epoch": 0.0016753326794038245, "grad_norm": 0.16796875, "learning_rate": 0.000384, "loss": 0.2715, "step": 193 }, { "epoch": 0.0016840131596079894, "grad_norm": 0.2734375, "learning_rate": 0.000386, "loss": 0.3086, "step": 194 }, { "epoch": 0.0016926936398121543, "grad_norm": 0.28515625, "learning_rate": 0.000388, "loss": 0.2617, "step": 195 }, { "epoch": 0.0017013741200163192, "grad_norm": 0.2412109375, "learning_rate": 0.00039000000000000005, "loss": 0.3262, "step": 196 }, { "epoch": 0.0017100546002204841, "grad_norm": 0.2373046875, "learning_rate": 0.00039200000000000004, "loss": 0.252, "step": 197 }, { "epoch": 0.001718735080424649, "grad_norm": 0.228515625, "learning_rate": 0.00039400000000000004, "loss": 0.3047, "step": 198 }, { "epoch": 0.001727415560628814, "grad_norm": 0.201171875, "learning_rate": 0.00039600000000000003, "loss": 0.332, "step": 199 }, { "epoch": 0.0017360960408329788, "grad_norm": 0.2421875, "learning_rate": 0.000398, "loss": 0.2695, "step": 200 }, { "epoch": 0.0017447765210371437, "grad_norm": 0.287109375, "learning_rate": 0.0004, "loss": 0.3867, "step": 201 }, { "epoch": 0.0017534570012413086, "grad_norm": 0.203125, "learning_rate": 0.000402, "loss": 0.2598, "step": 202 }, { "epoch": 0.0017621374814454735, "grad_norm": 0.2431640625, "learning_rate": 0.000404, "loss": 0.3203, "step": 203 }, { "epoch": 0.0017708179616496384, "grad_norm": 0.251953125, "learning_rate": 0.00040600000000000006, "loss": 0.3438, "step": 204 }, { "epoch": 0.0017794984418538033, "grad_norm": 0.1923828125, "learning_rate": 0.000408, "loss": 0.248, "step": 205 }, { "epoch": 0.0017881789220579682, "grad_norm": 0.216796875, "learning_rate": 0.00041, "loss": 0.2852, "step": 206 }, { "epoch": 0.0017968594022621332, "grad_norm": 0.287109375, "learning_rate": 0.000412, "loss": 0.3652, "step": 207 }, { "epoch": 0.001805539882466298, "grad_norm": 0.265625, "learning_rate": 0.000414, "loss": 0.3359, "step": 208 }, { "epoch": 0.001814220362670463, "grad_norm": 0.29296875, "learning_rate": 0.000416, "loss": 0.2773, "step": 209 }, { "epoch": 0.0018229008428746279, "grad_norm": 0.2392578125, "learning_rate": 0.00041799999999999997, "loss": 0.3047, "step": 210 }, { "epoch": 0.0018315813230787928, "grad_norm": 0.24609375, "learning_rate": 0.00042, "loss": 0.3613, "step": 211 }, { "epoch": 0.0018402618032829577, "grad_norm": 0.2470703125, "learning_rate": 0.000422, "loss": 0.2676, "step": 212 }, { "epoch": 0.0018489422834871226, "grad_norm": 0.2431640625, "learning_rate": 0.000424, "loss": 0.3633, "step": 213 }, { "epoch": 0.0018576227636912875, "grad_norm": 0.5234375, "learning_rate": 0.000426, "loss": 0.4062, "step": 214 }, { "epoch": 0.0018663032438954524, "grad_norm": 0.306640625, "learning_rate": 0.000428, "loss": 0.3223, "step": 215 }, { "epoch": 0.0018749837240996173, "grad_norm": 0.2490234375, "learning_rate": 0.00043, "loss": 0.293, "step": 216 }, { "epoch": 0.0018836642043037822, "grad_norm": 0.22265625, "learning_rate": 0.000432, "loss": 0.3086, "step": 217 }, { "epoch": 0.001892344684507947, "grad_norm": 0.1904296875, "learning_rate": 0.00043400000000000003, "loss": 0.2656, "step": 218 }, { "epoch": 0.0019010251647121118, "grad_norm": 0.1982421875, "learning_rate": 0.000436, "loss": 0.248, "step": 219 }, { "epoch": 0.0019097056449162767, "grad_norm": 0.1728515625, "learning_rate": 0.000438, "loss": 0.3379, "step": 220 }, { "epoch": 0.0019183861251204416, "grad_norm": 0.216796875, "learning_rate": 0.00044, "loss": 0.3301, "step": 221 }, { "epoch": 0.0019270666053246065, "grad_norm": 0.1474609375, "learning_rate": 0.000442, "loss": 0.2812, "step": 222 }, { "epoch": 0.0019357470855287714, "grad_norm": 0.1240234375, "learning_rate": 0.000444, "loss": 0.2188, "step": 223 }, { "epoch": 0.0019444275657329363, "grad_norm": 0.21484375, "learning_rate": 0.000446, "loss": 0.3242, "step": 224 }, { "epoch": 0.0019531080459371012, "grad_norm": 0.193359375, "learning_rate": 0.000448, "loss": 0.2773, "step": 225 }, { "epoch": 0.0019617885261412663, "grad_norm": 0.224609375, "learning_rate": 0.00045000000000000004, "loss": 0.293, "step": 226 }, { "epoch": 0.001970469006345431, "grad_norm": 0.259765625, "learning_rate": 0.00045200000000000004, "loss": 0.3281, "step": 227 }, { "epoch": 0.001979149486549596, "grad_norm": 0.2412109375, "learning_rate": 0.00045400000000000003, "loss": 0.3184, "step": 228 }, { "epoch": 0.001987829966753761, "grad_norm": 0.2333984375, "learning_rate": 0.000456, "loss": 0.3164, "step": 229 }, { "epoch": 0.001996510446957926, "grad_norm": 0.1796875, "learning_rate": 0.000458, "loss": 0.2734, "step": 230 }, { "epoch": 0.0020051909271620906, "grad_norm": 0.265625, "learning_rate": 0.00046, "loss": 0.4395, "step": 231 }, { "epoch": 0.0020138714073662553, "grad_norm": 0.25, "learning_rate": 0.000462, "loss": 0.291, "step": 232 }, { "epoch": 0.0020225518875704204, "grad_norm": 0.2119140625, "learning_rate": 0.00046400000000000006, "loss": 0.332, "step": 233 }, { "epoch": 0.002031232367774585, "grad_norm": 0.275390625, "learning_rate": 0.00046600000000000005, "loss": 0.3457, "step": 234 }, { "epoch": 0.00203991284797875, "grad_norm": 0.197265625, "learning_rate": 0.00046800000000000005, "loss": 0.3086, "step": 235 }, { "epoch": 0.002048593328182915, "grad_norm": 0.283203125, "learning_rate": 0.00047, "loss": 0.4199, "step": 236 }, { "epoch": 0.00205727380838708, "grad_norm": 0.26171875, "learning_rate": 0.000472, "loss": 0.3477, "step": 237 }, { "epoch": 0.0020659542885912447, "grad_norm": 0.23046875, "learning_rate": 0.000474, "loss": 0.3145, "step": 238 }, { "epoch": 0.00207463476879541, "grad_norm": 0.2197265625, "learning_rate": 0.00047599999999999997, "loss": 0.2617, "step": 239 }, { "epoch": 0.0020833152489995745, "grad_norm": 0.1845703125, "learning_rate": 0.00047799999999999996, "loss": 0.3809, "step": 240 }, { "epoch": 0.0020919957292037396, "grad_norm": 0.1748046875, "learning_rate": 0.00048, "loss": 0.2715, "step": 241 }, { "epoch": 0.0021006762094079043, "grad_norm": 0.1923828125, "learning_rate": 0.000482, "loss": 0.2578, "step": 242 }, { "epoch": 0.0021093566896120694, "grad_norm": 0.1943359375, "learning_rate": 0.000484, "loss": 0.2578, "step": 243 }, { "epoch": 0.002118037169816234, "grad_norm": 0.33203125, "learning_rate": 0.000486, "loss": 0.2793, "step": 244 }, { "epoch": 0.0021267176500203992, "grad_norm": 0.255859375, "learning_rate": 0.000488, "loss": 0.3828, "step": 245 }, { "epoch": 0.002135398130224564, "grad_norm": 0.177734375, "learning_rate": 0.00049, "loss": 0.2695, "step": 246 }, { "epoch": 0.002144078610428729, "grad_norm": 0.1748046875, "learning_rate": 0.000492, "loss": 0.2656, "step": 247 }, { "epoch": 0.0021527590906328937, "grad_norm": 0.138671875, "learning_rate": 0.000494, "loss": 0.2432, "step": 248 }, { "epoch": 0.002161439570837059, "grad_norm": 0.1650390625, "learning_rate": 0.000496, "loss": 0.3125, "step": 249 }, { "epoch": 0.0021701200510412235, "grad_norm": 0.15234375, "learning_rate": 0.000498, "loss": 0.2695, "step": 250 }, { "epoch": 0.0021788005312453887, "grad_norm": 0.1845703125, "learning_rate": 0.0005, "loss": 0.3184, "step": 251 }, { "epoch": 0.0021874810114495533, "grad_norm": 0.2412109375, "learning_rate": 0.0005020000000000001, "loss": 0.3027, "step": 252 }, { "epoch": 0.0021961614916537185, "grad_norm": 0.2158203125, "learning_rate": 0.000504, "loss": 0.2988, "step": 253 }, { "epoch": 0.002204841971857883, "grad_norm": 0.193359375, "learning_rate": 0.000506, "loss": 0.293, "step": 254 }, { "epoch": 0.0022135224520620483, "grad_norm": 0.1826171875, "learning_rate": 0.000508, "loss": 0.2656, "step": 255 }, { "epoch": 0.002222202932266213, "grad_norm": 0.21875, "learning_rate": 0.00051, "loss": 0.3066, "step": 256 }, { "epoch": 0.002230883412470378, "grad_norm": 0.185546875, "learning_rate": 0.000512, "loss": 0.3027, "step": 257 }, { "epoch": 0.0022395638926745428, "grad_norm": 0.2275390625, "learning_rate": 0.000514, "loss": 0.2969, "step": 258 }, { "epoch": 0.0022482443728787074, "grad_norm": 0.451171875, "learning_rate": 0.0005160000000000001, "loss": 0.3887, "step": 259 }, { "epoch": 0.0022569248530828726, "grad_norm": 0.1689453125, "learning_rate": 0.000518, "loss": 0.2695, "step": 260 }, { "epoch": 0.0022656053332870373, "grad_norm": 0.294921875, "learning_rate": 0.0005200000000000001, "loss": 0.2812, "step": 261 }, { "epoch": 0.0022742858134912024, "grad_norm": 0.158203125, "learning_rate": 0.000522, "loss": 0.3105, "step": 262 }, { "epoch": 0.002282966293695367, "grad_norm": 0.1845703125, "learning_rate": 0.000524, "loss": 0.3516, "step": 263 }, { "epoch": 0.002291646773899532, "grad_norm": 0.236328125, "learning_rate": 0.000526, "loss": 0.2578, "step": 264 }, { "epoch": 0.002300327254103697, "grad_norm": 0.23828125, "learning_rate": 0.000528, "loss": 0.252, "step": 265 }, { "epoch": 0.002309007734307862, "grad_norm": 0.2080078125, "learning_rate": 0.0005300000000000001, "loss": 0.3516, "step": 266 }, { "epoch": 0.0023176882145120267, "grad_norm": 0.224609375, "learning_rate": 0.000532, "loss": 0.2969, "step": 267 }, { "epoch": 0.002326368694716192, "grad_norm": 0.1796875, "learning_rate": 0.0005340000000000001, "loss": 0.3203, "step": 268 }, { "epoch": 0.0023350491749203565, "grad_norm": 0.68359375, "learning_rate": 0.000536, "loss": 0.3242, "step": 269 }, { "epoch": 0.0023437296551245216, "grad_norm": 0.2138671875, "learning_rate": 0.0005380000000000001, "loss": 0.2832, "step": 270 }, { "epoch": 0.0023524101353286863, "grad_norm": 0.1748046875, "learning_rate": 0.00054, "loss": 0.3145, "step": 271 }, { "epoch": 0.0023610906155328514, "grad_norm": 0.2197265625, "learning_rate": 0.0005420000000000001, "loss": 0.2637, "step": 272 }, { "epoch": 0.002369771095737016, "grad_norm": 0.34765625, "learning_rate": 0.0005440000000000001, "loss": 0.4414, "step": 273 }, { "epoch": 0.002378451575941181, "grad_norm": 0.232421875, "learning_rate": 0.000546, "loss": 0.2695, "step": 274 }, { "epoch": 0.002387132056145346, "grad_norm": 0.302734375, "learning_rate": 0.0005480000000000001, "loss": 0.3809, "step": 275 }, { "epoch": 0.002395812536349511, "grad_norm": 0.1904296875, "learning_rate": 0.00055, "loss": 0.3242, "step": 276 }, { "epoch": 0.0024044930165536757, "grad_norm": 0.291015625, "learning_rate": 0.0005520000000000001, "loss": 0.3281, "step": 277 }, { "epoch": 0.002413173496757841, "grad_norm": 0.224609375, "learning_rate": 0.000554, "loss": 0.2695, "step": 278 }, { "epoch": 0.0024218539769620055, "grad_norm": 0.2451171875, "learning_rate": 0.0005560000000000001, "loss": 0.3359, "step": 279 }, { "epoch": 0.0024305344571661706, "grad_norm": 0.21484375, "learning_rate": 0.000558, "loss": 0.2969, "step": 280 }, { "epoch": 0.0024392149373703353, "grad_norm": 0.1767578125, "learning_rate": 0.0005600000000000001, "loss": 0.2363, "step": 281 }, { "epoch": 0.0024478954175745004, "grad_norm": 0.166015625, "learning_rate": 0.0005620000000000001, "loss": 0.3203, "step": 282 }, { "epoch": 0.002456575897778665, "grad_norm": 0.1376953125, "learning_rate": 0.0005639999999999999, "loss": 0.2402, "step": 283 }, { "epoch": 0.00246525637798283, "grad_norm": 0.173828125, "learning_rate": 0.000566, "loss": 0.2949, "step": 284 }, { "epoch": 0.002473936858186995, "grad_norm": 0.171875, "learning_rate": 0.0005679999999999999, "loss": 0.3457, "step": 285 }, { "epoch": 0.0024826173383911596, "grad_norm": 0.2021484375, "learning_rate": 0.00057, "loss": 0.3047, "step": 286 }, { "epoch": 0.0024912978185953247, "grad_norm": 0.181640625, "learning_rate": 0.0005719999999999999, "loss": 0.3555, "step": 287 }, { "epoch": 0.0024999782987994894, "grad_norm": 0.1640625, "learning_rate": 0.000574, "loss": 0.2539, "step": 288 }, { "epoch": 0.0025086587790036545, "grad_norm": 0.1728515625, "learning_rate": 0.000576, "loss": 0.3066, "step": 289 }, { "epoch": 0.0025173392592078192, "grad_norm": 0.2138671875, "learning_rate": 0.000578, "loss": 0.3379, "step": 290 }, { "epoch": 0.0025260197394119843, "grad_norm": 0.2275390625, "learning_rate": 0.00058, "loss": 0.2832, "step": 291 }, { "epoch": 0.002534700219616149, "grad_norm": 0.259765625, "learning_rate": 0.0005819999999999999, "loss": 0.3867, "step": 292 }, { "epoch": 0.002543380699820314, "grad_norm": 0.283203125, "learning_rate": 0.000584, "loss": 0.3398, "step": 293 }, { "epoch": 0.002552061180024479, "grad_norm": 0.201171875, "learning_rate": 0.0005859999999999999, "loss": 0.3203, "step": 294 }, { "epoch": 0.002560741660228644, "grad_norm": 0.134765625, "learning_rate": 0.000588, "loss": 0.25, "step": 295 }, { "epoch": 0.0025694221404328086, "grad_norm": 0.201171875, "learning_rate": 0.00059, "loss": 0.3105, "step": 296 }, { "epoch": 0.0025781026206369738, "grad_norm": 0.1708984375, "learning_rate": 0.000592, "loss": 0.3027, "step": 297 }, { "epoch": 0.0025867831008411384, "grad_norm": 0.16796875, "learning_rate": 0.000594, "loss": 0.3047, "step": 298 }, { "epoch": 0.0025954635810453036, "grad_norm": 0.203125, "learning_rate": 0.000596, "loss": 0.457, "step": 299 }, { "epoch": 0.0026041440612494682, "grad_norm": 0.1513671875, "learning_rate": 0.000598, "loss": 0.2793, "step": 300 }, { "epoch": 0.0026128245414536334, "grad_norm": 0.177734375, "learning_rate": 0.0006, "loss": 0.3379, "step": 301 }, { "epoch": 0.002621505021657798, "grad_norm": 0.1591796875, "learning_rate": 0.000602, "loss": 0.2676, "step": 302 }, { "epoch": 0.002630185501861963, "grad_norm": 0.232421875, "learning_rate": 0.000604, "loss": 0.3223, "step": 303 }, { "epoch": 0.002638865982066128, "grad_norm": 0.1630859375, "learning_rate": 0.000606, "loss": 0.2617, "step": 304 }, { "epoch": 0.002647546462270293, "grad_norm": 0.189453125, "learning_rate": 0.000608, "loss": 0.3066, "step": 305 }, { "epoch": 0.0026562269424744577, "grad_norm": 0.1767578125, "learning_rate": 0.00061, "loss": 0.2637, "step": 306 }, { "epoch": 0.002664907422678623, "grad_norm": 0.15625, "learning_rate": 0.000612, "loss": 0.3105, "step": 307 }, { "epoch": 0.0026735879028827875, "grad_norm": 0.1962890625, "learning_rate": 0.000614, "loss": 0.3164, "step": 308 }, { "epoch": 0.002682268383086952, "grad_norm": 0.1953125, "learning_rate": 0.000616, "loss": 0.2539, "step": 309 }, { "epoch": 0.0026909488632911173, "grad_norm": 0.19140625, "learning_rate": 0.0006180000000000001, "loss": 0.2969, "step": 310 }, { "epoch": 0.002699629343495282, "grad_norm": 0.1845703125, "learning_rate": 0.00062, "loss": 0.2598, "step": 311 }, { "epoch": 0.002708309823699447, "grad_norm": 0.21484375, "learning_rate": 0.000622, "loss": 0.2734, "step": 312 }, { "epoch": 0.0027169903039036118, "grad_norm": 0.1435546875, "learning_rate": 0.000624, "loss": 0.3359, "step": 313 }, { "epoch": 0.002725670784107777, "grad_norm": 0.2333984375, "learning_rate": 0.000626, "loss": 0.3359, "step": 314 }, { "epoch": 0.0027343512643119416, "grad_norm": 0.2109375, "learning_rate": 0.000628, "loss": 0.3008, "step": 315 }, { "epoch": 0.0027430317445161067, "grad_norm": 0.1865234375, "learning_rate": 0.00063, "loss": 0.3086, "step": 316 }, { "epoch": 0.0027517122247202714, "grad_norm": 0.27734375, "learning_rate": 0.000632, "loss": 0.3105, "step": 317 }, { "epoch": 0.0027603927049244365, "grad_norm": 0.1884765625, "learning_rate": 0.000634, "loss": 0.248, "step": 318 }, { "epoch": 0.002769073185128601, "grad_norm": 0.2294921875, "learning_rate": 0.0006360000000000001, "loss": 0.3008, "step": 319 }, { "epoch": 0.0027777536653327663, "grad_norm": 0.1806640625, "learning_rate": 0.000638, "loss": 0.2617, "step": 320 }, { "epoch": 0.002786434145536931, "grad_norm": 0.16796875, "learning_rate": 0.00064, "loss": 0.248, "step": 321 }, { "epoch": 0.002795114625741096, "grad_norm": 0.32421875, "learning_rate": 0.000642, "loss": 0.3457, "step": 322 }, { "epoch": 0.002803795105945261, "grad_norm": 0.2001953125, "learning_rate": 0.000644, "loss": 0.3047, "step": 323 }, { "epoch": 0.002812475586149426, "grad_norm": 0.1845703125, "learning_rate": 0.000646, "loss": 0.2969, "step": 324 }, { "epoch": 0.0028211560663535906, "grad_norm": 0.1787109375, "learning_rate": 0.000648, "loss": 0.248, "step": 325 }, { "epoch": 0.0028298365465577557, "grad_norm": 0.220703125, "learning_rate": 0.0006500000000000001, "loss": 0.3164, "step": 326 }, { "epoch": 0.0028385170267619204, "grad_norm": 0.17578125, "learning_rate": 0.000652, "loss": 0.25, "step": 327 }, { "epoch": 0.0028471975069660855, "grad_norm": 0.357421875, "learning_rate": 0.0006540000000000001, "loss": 0.3594, "step": 328 }, { "epoch": 0.00285587798717025, "grad_norm": 0.130859375, "learning_rate": 0.000656, "loss": 0.2471, "step": 329 }, { "epoch": 0.0028645584673744153, "grad_norm": 0.2470703125, "learning_rate": 0.0006580000000000001, "loss": 0.3359, "step": 330 }, { "epoch": 0.00287323894757858, "grad_norm": 0.26171875, "learning_rate": 0.00066, "loss": 0.3184, "step": 331 }, { "epoch": 0.002881919427782745, "grad_norm": 0.2734375, "learning_rate": 0.000662, "loss": 0.3438, "step": 332 }, { "epoch": 0.00289059990798691, "grad_norm": 0.166015625, "learning_rate": 0.0006640000000000001, "loss": 0.2617, "step": 333 }, { "epoch": 0.002899280388191075, "grad_norm": 0.173828125, "learning_rate": 0.000666, "loss": 0.3105, "step": 334 }, { "epoch": 0.0029079608683952396, "grad_norm": 0.2099609375, "learning_rate": 0.0006680000000000001, "loss": 0.3359, "step": 335 }, { "epoch": 0.0029166413485994043, "grad_norm": 0.2099609375, "learning_rate": 0.00067, "loss": 0.3301, "step": 336 }, { "epoch": 0.0029253218288035694, "grad_norm": 0.158203125, "learning_rate": 0.0006720000000000001, "loss": 0.3008, "step": 337 }, { "epoch": 0.002934002309007734, "grad_norm": 0.6015625, "learning_rate": 0.000674, "loss": 0.4102, "step": 338 }, { "epoch": 0.0029426827892118992, "grad_norm": 0.2255859375, "learning_rate": 0.0006760000000000001, "loss": 0.2773, "step": 339 }, { "epoch": 0.002951363269416064, "grad_norm": 0.2060546875, "learning_rate": 0.0006780000000000001, "loss": 0.2969, "step": 340 }, { "epoch": 0.002960043749620229, "grad_norm": 0.19921875, "learning_rate": 0.00068, "loss": 0.3203, "step": 341 }, { "epoch": 0.0029687242298243937, "grad_norm": 0.166015625, "learning_rate": 0.0006820000000000001, "loss": 0.3066, "step": 342 }, { "epoch": 0.002977404710028559, "grad_norm": 0.1796875, "learning_rate": 0.000684, "loss": 0.3789, "step": 343 }, { "epoch": 0.0029860851902327235, "grad_norm": 0.1455078125, "learning_rate": 0.0006860000000000001, "loss": 0.3242, "step": 344 }, { "epoch": 0.0029947656704368887, "grad_norm": 0.1826171875, "learning_rate": 0.0006879999999999999, "loss": 0.2988, "step": 345 }, { "epoch": 0.0030034461506410533, "grad_norm": 0.1552734375, "learning_rate": 0.00069, "loss": 0.2695, "step": 346 }, { "epoch": 0.0030121266308452185, "grad_norm": 0.25, "learning_rate": 0.000692, "loss": 0.2871, "step": 347 }, { "epoch": 0.003020807111049383, "grad_norm": 0.13671875, "learning_rate": 0.000694, "loss": 0.2812, "step": 348 }, { "epoch": 0.0030294875912535483, "grad_norm": 0.357421875, "learning_rate": 0.000696, "loss": 0.4082, "step": 349 }, { "epoch": 0.003038168071457713, "grad_norm": 0.1650390625, "learning_rate": 0.0006979999999999999, "loss": 0.2793, "step": 350 }, { "epoch": 0.003046848551661878, "grad_norm": 0.19921875, "learning_rate": 0.0007, "loss": 0.2891, "step": 351 }, { "epoch": 0.0030555290318660428, "grad_norm": 0.181640625, "learning_rate": 0.0007019999999999999, "loss": 0.3535, "step": 352 }, { "epoch": 0.003064209512070208, "grad_norm": 0.1689453125, "learning_rate": 0.000704, "loss": 0.2656, "step": 353 }, { "epoch": 0.0030728899922743726, "grad_norm": 0.27734375, "learning_rate": 0.0007059999999999999, "loss": 0.2383, "step": 354 }, { "epoch": 0.0030815704724785377, "grad_norm": 0.2265625, "learning_rate": 0.000708, "loss": 0.3086, "step": 355 }, { "epoch": 0.0030902509526827024, "grad_norm": 0.1904296875, "learning_rate": 0.00071, "loss": 0.3086, "step": 356 }, { "epoch": 0.0030989314328868675, "grad_norm": 0.1611328125, "learning_rate": 0.000712, "loss": 0.291, "step": 357 }, { "epoch": 0.003107611913091032, "grad_norm": 0.16796875, "learning_rate": 0.000714, "loss": 0.291, "step": 358 }, { "epoch": 0.0031162923932951973, "grad_norm": 0.1875, "learning_rate": 0.000716, "loss": 0.2988, "step": 359 }, { "epoch": 0.003124972873499362, "grad_norm": 0.16015625, "learning_rate": 0.000718, "loss": 0.3398, "step": 360 }, { "epoch": 0.0031336533537035267, "grad_norm": 0.1611328125, "learning_rate": 0.0007199999999999999, "loss": 0.2793, "step": 361 }, { "epoch": 0.003142333833907692, "grad_norm": 0.1396484375, "learning_rate": 0.000722, "loss": 0.3047, "step": 362 }, { "epoch": 0.0031510143141118565, "grad_norm": 0.185546875, "learning_rate": 0.000724, "loss": 0.3281, "step": 363 }, { "epoch": 0.0031596947943160216, "grad_norm": 0.154296875, "learning_rate": 0.000726, "loss": 0.2969, "step": 364 }, { "epoch": 0.0031683752745201863, "grad_norm": 0.181640625, "learning_rate": 0.000728, "loss": 0.3652, "step": 365 }, { "epoch": 0.0031770557547243514, "grad_norm": 0.1279296875, "learning_rate": 0.00073, "loss": 0.2344, "step": 366 }, { "epoch": 0.003185736234928516, "grad_norm": 0.1455078125, "learning_rate": 0.000732, "loss": 0.2891, "step": 367 }, { "epoch": 0.003194416715132681, "grad_norm": 0.1884765625, "learning_rate": 0.000734, "loss": 0.3438, "step": 368 }, { "epoch": 0.003203097195336846, "grad_norm": 0.181640625, "learning_rate": 0.000736, "loss": 0.3164, "step": 369 }, { "epoch": 0.003211777675541011, "grad_norm": 0.1669921875, "learning_rate": 0.000738, "loss": 0.2832, "step": 370 }, { "epoch": 0.0032204581557451757, "grad_norm": 0.18359375, "learning_rate": 0.00074, "loss": 0.2695, "step": 371 }, { "epoch": 0.003229138635949341, "grad_norm": 0.22265625, "learning_rate": 0.000742, "loss": 0.2559, "step": 372 }, { "epoch": 0.0032378191161535055, "grad_norm": 0.2060546875, "learning_rate": 0.000744, "loss": 0.2949, "step": 373 }, { "epoch": 0.0032464995963576706, "grad_norm": 0.1494140625, "learning_rate": 0.000746, "loss": 0.2832, "step": 374 }, { "epoch": 0.0032551800765618353, "grad_norm": 0.1669921875, "learning_rate": 0.000748, "loss": 0.2832, "step": 375 }, { "epoch": 0.0032638605567660004, "grad_norm": 0.279296875, "learning_rate": 0.00075, "loss": 0.2812, "step": 376 }, { "epoch": 0.003272541036970165, "grad_norm": 0.48828125, "learning_rate": 0.0007520000000000001, "loss": 0.4453, "step": 377 }, { "epoch": 0.0032812215171743302, "grad_norm": 0.1630859375, "learning_rate": 0.000754, "loss": 0.3223, "step": 378 }, { "epoch": 0.003289901997378495, "grad_norm": 0.150390625, "learning_rate": 0.000756, "loss": 0.2695, "step": 379 }, { "epoch": 0.00329858247758266, "grad_norm": 0.236328125, "learning_rate": 0.000758, "loss": 0.3164, "step": 380 }, { "epoch": 0.0033072629577868247, "grad_norm": 0.25390625, "learning_rate": 0.00076, "loss": 0.291, "step": 381 }, { "epoch": 0.00331594343799099, "grad_norm": 0.220703125, "learning_rate": 0.000762, "loss": 0.3047, "step": 382 }, { "epoch": 0.0033246239181951545, "grad_norm": 0.1923828125, "learning_rate": 0.000764, "loss": 0.2832, "step": 383 }, { "epoch": 0.0033333043983993197, "grad_norm": 0.2021484375, "learning_rate": 0.0007660000000000001, "loss": 0.291, "step": 384 }, { "epoch": 0.0033419848786034843, "grad_norm": 0.173828125, "learning_rate": 0.000768, "loss": 0.2305, "step": 385 }, { "epoch": 0.003350665358807649, "grad_norm": 0.25390625, "learning_rate": 0.0007700000000000001, "loss": 0.3105, "step": 386 }, { "epoch": 0.003359345839011814, "grad_norm": 0.1611328125, "learning_rate": 0.000772, "loss": 0.3145, "step": 387 }, { "epoch": 0.003368026319215979, "grad_norm": 0.171875, "learning_rate": 0.0007740000000000001, "loss": 0.373, "step": 388 }, { "epoch": 0.003376706799420144, "grad_norm": 0.1533203125, "learning_rate": 0.000776, "loss": 0.2158, "step": 389 }, { "epoch": 0.0033853872796243086, "grad_norm": 0.2060546875, "learning_rate": 0.000778, "loss": 0.2676, "step": 390 }, { "epoch": 0.0033940677598284738, "grad_norm": 0.21484375, "learning_rate": 0.0007800000000000001, "loss": 0.3438, "step": 391 }, { "epoch": 0.0034027482400326384, "grad_norm": 0.1962890625, "learning_rate": 0.000782, "loss": 0.25, "step": 392 }, { "epoch": 0.0034114287202368036, "grad_norm": 0.1865234375, "learning_rate": 0.0007840000000000001, "loss": 0.2832, "step": 393 }, { "epoch": 0.0034201092004409682, "grad_norm": 0.1484375, "learning_rate": 0.000786, "loss": 0.293, "step": 394 }, { "epoch": 0.0034287896806451334, "grad_norm": 0.1943359375, "learning_rate": 0.0007880000000000001, "loss": 0.3242, "step": 395 }, { "epoch": 0.003437470160849298, "grad_norm": 0.1376953125, "learning_rate": 0.00079, "loss": 0.293, "step": 396 }, { "epoch": 0.003446150641053463, "grad_norm": 0.146484375, "learning_rate": 0.0007920000000000001, "loss": 0.2266, "step": 397 }, { "epoch": 0.003454831121257628, "grad_norm": 0.1640625, "learning_rate": 0.0007940000000000001, "loss": 0.3711, "step": 398 }, { "epoch": 0.003463511601461793, "grad_norm": 0.21484375, "learning_rate": 0.000796, "loss": 0.3184, "step": 399 }, { "epoch": 0.0034721920816659577, "grad_norm": 0.2021484375, "learning_rate": 0.0007980000000000001, "loss": 0.334, "step": 400 }, { "epoch": 0.003480872561870123, "grad_norm": 0.1953125, "learning_rate": 0.0008, "loss": 0.3398, "step": 401 }, { "epoch": 0.0034895530420742875, "grad_norm": 0.1689453125, "learning_rate": 0.0008020000000000001, "loss": 0.2578, "step": 402 }, { "epoch": 0.0034982335222784526, "grad_norm": 0.12890625, "learning_rate": 0.000804, "loss": 0.2734, "step": 403 }, { "epoch": 0.0035069140024826173, "grad_norm": 0.2197265625, "learning_rate": 0.0008060000000000001, "loss": 0.332, "step": 404 }, { "epoch": 0.0035155944826867824, "grad_norm": 0.1806640625, "learning_rate": 0.000808, "loss": 0.2969, "step": 405 }, { "epoch": 0.003524274962890947, "grad_norm": 0.205078125, "learning_rate": 0.0008100000000000001, "loss": 0.249, "step": 406 }, { "epoch": 0.003532955443095112, "grad_norm": 0.244140625, "learning_rate": 0.0008120000000000001, "loss": 0.3672, "step": 407 }, { "epoch": 0.003541635923299277, "grad_norm": 0.1484375, "learning_rate": 0.0008139999999999999, "loss": 0.2617, "step": 408 }, { "epoch": 0.003550316403503442, "grad_norm": 0.1806640625, "learning_rate": 0.000816, "loss": 0.3086, "step": 409 }, { "epoch": 0.0035589968837076067, "grad_norm": 0.13671875, "learning_rate": 0.0008179999999999999, "loss": 0.2363, "step": 410 }, { "epoch": 0.003567677363911772, "grad_norm": 0.12158203125, "learning_rate": 0.00082, "loss": 0.3008, "step": 411 }, { "epoch": 0.0035763578441159365, "grad_norm": 0.138671875, "learning_rate": 0.0008219999999999999, "loss": 0.2852, "step": 412 }, { "epoch": 0.003585038324320101, "grad_norm": 0.1708984375, "learning_rate": 0.000824, "loss": 0.3027, "step": 413 }, { "epoch": 0.0035937188045242663, "grad_norm": 0.1455078125, "learning_rate": 0.000826, "loss": 0.2451, "step": 414 }, { "epoch": 0.003602399284728431, "grad_norm": 0.1416015625, "learning_rate": 0.000828, "loss": 0.3164, "step": 415 }, { "epoch": 0.003611079764932596, "grad_norm": 0.1201171875, "learning_rate": 0.00083, "loss": 0.2734, "step": 416 }, { "epoch": 0.003619760245136761, "grad_norm": 0.12060546875, "learning_rate": 0.000832, "loss": 0.2734, "step": 417 }, { "epoch": 0.003628440725340926, "grad_norm": 0.1904296875, "learning_rate": 0.000834, "loss": 0.3262, "step": 418 }, { "epoch": 0.0036371212055450906, "grad_norm": 0.11669921875, "learning_rate": 0.0008359999999999999, "loss": 0.2969, "step": 419 }, { "epoch": 0.0036458016857492557, "grad_norm": 0.1591796875, "learning_rate": 0.000838, "loss": 0.2539, "step": 420 }, { "epoch": 0.0036544821659534204, "grad_norm": 0.1083984375, "learning_rate": 0.00084, "loss": 0.2363, "step": 421 }, { "epoch": 0.0036631626461575855, "grad_norm": 0.142578125, "learning_rate": 0.000842, "loss": 0.3066, "step": 422 }, { "epoch": 0.00367184312636175, "grad_norm": 0.1328125, "learning_rate": 0.000844, "loss": 0.2969, "step": 423 }, { "epoch": 0.0036805236065659153, "grad_norm": 0.1923828125, "learning_rate": 0.000846, "loss": 0.2988, "step": 424 }, { "epoch": 0.00368920408677008, "grad_norm": 0.12890625, "learning_rate": 0.000848, "loss": 0.2393, "step": 425 }, { "epoch": 0.003697884566974245, "grad_norm": 0.134765625, "learning_rate": 0.00085, "loss": 0.2812, "step": 426 }, { "epoch": 0.00370656504717841, "grad_norm": 0.1474609375, "learning_rate": 0.000852, "loss": 0.2422, "step": 427 }, { "epoch": 0.003715245527382575, "grad_norm": 0.26953125, "learning_rate": 0.000854, "loss": 0.334, "step": 428 }, { "epoch": 0.0037239260075867396, "grad_norm": 0.1328125, "learning_rate": 0.000856, "loss": 0.3047, "step": 429 }, { "epoch": 0.0037326064877909047, "grad_norm": 0.228515625, "learning_rate": 0.000858, "loss": 0.3066, "step": 430 }, { "epoch": 0.0037412869679950694, "grad_norm": 0.19140625, "learning_rate": 0.00086, "loss": 0.2852, "step": 431 }, { "epoch": 0.0037499674481992346, "grad_norm": 0.17578125, "learning_rate": 0.000862, "loss": 0.2754, "step": 432 }, { "epoch": 0.0037586479284033992, "grad_norm": 0.1796875, "learning_rate": 0.000864, "loss": 0.2402, "step": 433 }, { "epoch": 0.0037673284086075644, "grad_norm": 0.12109375, "learning_rate": 0.000866, "loss": 0.2734, "step": 434 }, { "epoch": 0.003776008888811729, "grad_norm": 0.193359375, "learning_rate": 0.0008680000000000001, "loss": 0.2871, "step": 435 }, { "epoch": 0.003784689369015894, "grad_norm": 0.142578125, "learning_rate": 0.00087, "loss": 0.3008, "step": 436 }, { "epoch": 0.003793369849220059, "grad_norm": 0.1669921875, "learning_rate": 0.000872, "loss": 0.2715, "step": 437 }, { "epoch": 0.0038020503294242235, "grad_norm": 0.16015625, "learning_rate": 0.000874, "loss": 0.3086, "step": 438 }, { "epoch": 0.0038107308096283887, "grad_norm": 0.11767578125, "learning_rate": 0.000876, "loss": 0.2275, "step": 439 }, { "epoch": 0.0038194112898325533, "grad_norm": 0.189453125, "learning_rate": 0.000878, "loss": 0.375, "step": 440 }, { "epoch": 0.0038280917700367185, "grad_norm": 0.1572265625, "learning_rate": 0.00088, "loss": 0.3242, "step": 441 }, { "epoch": 0.003836772250240883, "grad_norm": 0.1123046875, "learning_rate": 0.000882, "loss": 0.2852, "step": 442 }, { "epoch": 0.0038454527304450483, "grad_norm": 0.11962890625, "learning_rate": 0.000884, "loss": 0.3105, "step": 443 }, { "epoch": 0.003854133210649213, "grad_norm": 0.1376953125, "learning_rate": 0.0008860000000000001, "loss": 0.2891, "step": 444 }, { "epoch": 0.003862813690853378, "grad_norm": 0.1669921875, "learning_rate": 0.000888, "loss": 0.3516, "step": 445 }, { "epoch": 0.0038714941710575428, "grad_norm": 0.1689453125, "learning_rate": 0.0008900000000000001, "loss": 0.3945, "step": 446 }, { "epoch": 0.003880174651261708, "grad_norm": 0.181640625, "learning_rate": 0.000892, "loss": 0.209, "step": 447 }, { "epoch": 0.0038888551314658726, "grad_norm": 0.1376953125, "learning_rate": 0.000894, "loss": 0.2715, "step": 448 }, { "epoch": 0.0038975356116700377, "grad_norm": 0.1396484375, "learning_rate": 0.000896, "loss": 0.3066, "step": 449 }, { "epoch": 0.0039062160918742024, "grad_norm": 0.12060546875, "learning_rate": 0.000898, "loss": 0.2559, "step": 450 }, { "epoch": 0.0039148965720783675, "grad_norm": 0.1416015625, "learning_rate": 0.0009000000000000001, "loss": 0.2617, "step": 451 }, { "epoch": 0.003923577052282533, "grad_norm": 0.1640625, "learning_rate": 0.000902, "loss": 0.2988, "step": 452 }, { "epoch": 0.003932257532486697, "grad_norm": 0.1494140625, "learning_rate": 0.0009040000000000001, "loss": 0.3164, "step": 453 }, { "epoch": 0.003940938012690862, "grad_norm": 0.267578125, "learning_rate": 0.000906, "loss": 0.3789, "step": 454 }, { "epoch": 0.003949618492895027, "grad_norm": 0.15234375, "learning_rate": 0.0009080000000000001, "loss": 0.2773, "step": 455 }, { "epoch": 0.003958298973099192, "grad_norm": 0.1123046875, "learning_rate": 0.00091, "loss": 0.2051, "step": 456 }, { "epoch": 0.0039669794533033565, "grad_norm": 0.142578125, "learning_rate": 0.000912, "loss": 0.2422, "step": 457 }, { "epoch": 0.003975659933507522, "grad_norm": 0.1513671875, "learning_rate": 0.0009140000000000001, "loss": 0.2441, "step": 458 }, { "epoch": 0.003984340413711687, "grad_norm": 0.1162109375, "learning_rate": 0.000916, "loss": 0.2676, "step": 459 }, { "epoch": 0.003993020893915852, "grad_norm": 0.15625, "learning_rate": 0.0009180000000000001, "loss": 0.3477, "step": 460 }, { "epoch": 0.004001701374120016, "grad_norm": 0.173828125, "learning_rate": 0.00092, "loss": 0.2578, "step": 461 }, { "epoch": 0.004010381854324181, "grad_norm": 0.201171875, "learning_rate": 0.0009220000000000001, "loss": 0.3086, "step": 462 }, { "epoch": 0.004019062334528346, "grad_norm": 0.1611328125, "learning_rate": 0.000924, "loss": 0.2227, "step": 463 }, { "epoch": 0.004027742814732511, "grad_norm": 0.2138671875, "learning_rate": 0.0009260000000000001, "loss": 0.4023, "step": 464 }, { "epoch": 0.004036423294936676, "grad_norm": 0.1416015625, "learning_rate": 0.0009280000000000001, "loss": 0.2969, "step": 465 }, { "epoch": 0.004045103775140841, "grad_norm": 0.177734375, "learning_rate": 0.00093, "loss": 0.2773, "step": 466 }, { "epoch": 0.004053784255345006, "grad_norm": 0.2138671875, "learning_rate": 0.0009320000000000001, "loss": 0.3027, "step": 467 }, { "epoch": 0.00406246473554917, "grad_norm": 0.1328125, "learning_rate": 0.000934, "loss": 0.2617, "step": 468 }, { "epoch": 0.004071145215753335, "grad_norm": 0.173828125, "learning_rate": 0.0009360000000000001, "loss": 0.2539, "step": 469 }, { "epoch": 0.0040798256959575, "grad_norm": 0.306640625, "learning_rate": 0.0009379999999999999, "loss": 0.3242, "step": 470 }, { "epoch": 0.0040885061761616655, "grad_norm": 0.12158203125, "learning_rate": 0.00094, "loss": 0.2334, "step": 471 }, { "epoch": 0.00409718665636583, "grad_norm": 0.1298828125, "learning_rate": 0.000942, "loss": 0.2891, "step": 472 }, { "epoch": 0.004105867136569995, "grad_norm": 0.19921875, "learning_rate": 0.000944, "loss": 0.4492, "step": 473 }, { "epoch": 0.00411454761677416, "grad_norm": 0.1474609375, "learning_rate": 0.000946, "loss": 0.3164, "step": 474 }, { "epoch": 0.004123228096978325, "grad_norm": 0.173828125, "learning_rate": 0.000948, "loss": 0.3027, "step": 475 }, { "epoch": 0.004131908577182489, "grad_norm": 0.1728515625, "learning_rate": 0.00095, "loss": 0.2754, "step": 476 }, { "epoch": 0.0041405890573866545, "grad_norm": 0.1376953125, "learning_rate": 0.0009519999999999999, "loss": 0.2275, "step": 477 }, { "epoch": 0.00414926953759082, "grad_norm": 0.2373046875, "learning_rate": 0.000954, "loss": 0.2891, "step": 478 }, { "epoch": 0.004157950017794985, "grad_norm": 0.1943359375, "learning_rate": 0.0009559999999999999, "loss": 0.332, "step": 479 }, { "epoch": 0.004166630497999149, "grad_norm": 0.16796875, "learning_rate": 0.000958, "loss": 0.3027, "step": 480 }, { "epoch": 0.004175310978203314, "grad_norm": 0.1318359375, "learning_rate": 0.00096, "loss": 0.2295, "step": 481 }, { "epoch": 0.004183991458407479, "grad_norm": 0.1181640625, "learning_rate": 0.000962, "loss": 0.2285, "step": 482 }, { "epoch": 0.004192671938611644, "grad_norm": 0.1494140625, "learning_rate": 0.000964, "loss": 0.2969, "step": 483 }, { "epoch": 0.004201352418815809, "grad_norm": 0.2099609375, "learning_rate": 0.000966, "loss": 0.3027, "step": 484 }, { "epoch": 0.004210032899019974, "grad_norm": 0.1640625, "learning_rate": 0.000968, "loss": 0.3477, "step": 485 }, { "epoch": 0.004218713379224139, "grad_norm": 0.17578125, "learning_rate": 0.0009699999999999999, "loss": 0.2617, "step": 486 }, { "epoch": 0.004227393859428304, "grad_norm": 0.208984375, "learning_rate": 0.000972, "loss": 0.4824, "step": 487 }, { "epoch": 0.004236074339632468, "grad_norm": 0.1806640625, "learning_rate": 0.000974, "loss": 0.293, "step": 488 }, { "epoch": 0.004244754819836633, "grad_norm": 0.326171875, "learning_rate": 0.000976, "loss": 0.4102, "step": 489 }, { "epoch": 0.0042534353000407985, "grad_norm": 0.138671875, "learning_rate": 0.000978, "loss": 0.2637, "step": 490 }, { "epoch": 0.004262115780244963, "grad_norm": 0.11279296875, "learning_rate": 0.00098, "loss": 0.2578, "step": 491 }, { "epoch": 0.004270796260449128, "grad_norm": 0.11962890625, "learning_rate": 0.000982, "loss": 0.1992, "step": 492 }, { "epoch": 0.004279476740653293, "grad_norm": 0.1533203125, "learning_rate": 0.000984, "loss": 0.252, "step": 493 }, { "epoch": 0.004288157220857458, "grad_norm": 0.177734375, "learning_rate": 0.0009860000000000001, "loss": 0.2598, "step": 494 }, { "epoch": 0.004296837701061622, "grad_norm": 0.142578125, "learning_rate": 0.000988, "loss": 0.2422, "step": 495 }, { "epoch": 0.0043055181812657875, "grad_norm": 0.1279296875, "learning_rate": 0.00099, "loss": 0.2578, "step": 496 }, { "epoch": 0.004314198661469953, "grad_norm": 0.1455078125, "learning_rate": 0.000992, "loss": 0.2676, "step": 497 }, { "epoch": 0.004322879141674118, "grad_norm": 0.169921875, "learning_rate": 0.000994, "loss": 0.2852, "step": 498 }, { "epoch": 0.004331559621878282, "grad_norm": 0.1357421875, "learning_rate": 0.000996, "loss": 0.2754, "step": 499 }, { "epoch": 0.004340240102082447, "grad_norm": 0.1962890625, "learning_rate": 0.000998, "loss": 0.2754, "step": 500 }, { "epoch": 0.004348920582286612, "grad_norm": 0.1669921875, "learning_rate": 0.001, "loss": 0.3984, "step": 501 }, { "epoch": 0.004357601062490777, "grad_norm": 0.11865234375, "learning_rate": 0.001002, "loss": 0.2656, "step": 502 }, { "epoch": 0.004366281542694942, "grad_norm": 0.2451171875, "learning_rate": 0.0010040000000000001, "loss": 0.3008, "step": 503 }, { "epoch": 0.004374962022899107, "grad_norm": 0.1630859375, "learning_rate": 0.001006, "loss": 0.2676, "step": 504 }, { "epoch": 0.004383642503103272, "grad_norm": 0.123046875, "learning_rate": 0.001008, "loss": 0.2461, "step": 505 }, { "epoch": 0.004392322983307437, "grad_norm": 0.126953125, "learning_rate": 0.00101, "loss": 0.2793, "step": 506 }, { "epoch": 0.004401003463511601, "grad_norm": 0.11669921875, "learning_rate": 0.001012, "loss": 0.3027, "step": 507 }, { "epoch": 0.004409683943715766, "grad_norm": 0.1630859375, "learning_rate": 0.001014, "loss": 0.2793, "step": 508 }, { "epoch": 0.004418364423919931, "grad_norm": 0.142578125, "learning_rate": 0.001016, "loss": 0.2559, "step": 509 }, { "epoch": 0.0044270449041240965, "grad_norm": 0.1298828125, "learning_rate": 0.001018, "loss": 0.2812, "step": 510 }, { "epoch": 0.004435725384328261, "grad_norm": 0.158203125, "learning_rate": 0.00102, "loss": 0.2793, "step": 511 }, { "epoch": 0.004444405864532426, "grad_norm": 0.142578125, "learning_rate": 0.0010220000000000001, "loss": 0.2891, "step": 512 }, { "epoch": 0.004453086344736591, "grad_norm": 0.146484375, "learning_rate": 0.001024, "loss": 0.334, "step": 513 }, { "epoch": 0.004461766824940756, "grad_norm": 0.0986328125, "learning_rate": 0.001026, "loss": 0.252, "step": 514 }, { "epoch": 0.00447044730514492, "grad_norm": 0.150390625, "learning_rate": 0.001028, "loss": 0.2432, "step": 515 }, { "epoch": 0.0044791277853490855, "grad_norm": 0.1650390625, "learning_rate": 0.00103, "loss": 0.3223, "step": 516 }, { "epoch": 0.004487808265553251, "grad_norm": 0.15234375, "learning_rate": 0.0010320000000000001, "loss": 0.2148, "step": 517 }, { "epoch": 0.004496488745757415, "grad_norm": 0.1357421875, "learning_rate": 0.001034, "loss": 0.3164, "step": 518 }, { "epoch": 0.00450516922596158, "grad_norm": 0.2734375, "learning_rate": 0.001036, "loss": 0.3379, "step": 519 }, { "epoch": 0.004513849706165745, "grad_norm": 0.12890625, "learning_rate": 0.001038, "loss": 0.2656, "step": 520 }, { "epoch": 0.00452253018636991, "grad_norm": 0.142578125, "learning_rate": 0.0010400000000000001, "loss": 0.293, "step": 521 }, { "epoch": 0.0045312106665740745, "grad_norm": 0.1318359375, "learning_rate": 0.001042, "loss": 0.2656, "step": 522 }, { "epoch": 0.00453989114677824, "grad_norm": 0.1376953125, "learning_rate": 0.001044, "loss": 0.3262, "step": 523 }, { "epoch": 0.004548571626982405, "grad_norm": 0.15625, "learning_rate": 0.001046, "loss": 0.293, "step": 524 }, { "epoch": 0.00455725210718657, "grad_norm": 0.12890625, "learning_rate": 0.001048, "loss": 0.2207, "step": 525 }, { "epoch": 0.004565932587390734, "grad_norm": 0.1474609375, "learning_rate": 0.0010500000000000002, "loss": 0.3086, "step": 526 }, { "epoch": 0.004574613067594899, "grad_norm": 0.2236328125, "learning_rate": 0.001052, "loss": 0.2676, "step": 527 }, { "epoch": 0.004583293547799064, "grad_norm": 0.2236328125, "learning_rate": 0.001054, "loss": 0.332, "step": 528 }, { "epoch": 0.0045919740280032295, "grad_norm": 0.171875, "learning_rate": 0.001056, "loss": 0.2715, "step": 529 }, { "epoch": 0.004600654508207394, "grad_norm": 0.1513671875, "learning_rate": 0.0010580000000000001, "loss": 0.3008, "step": 530 }, { "epoch": 0.004609334988411559, "grad_norm": 0.2080078125, "learning_rate": 0.0010600000000000002, "loss": 0.2969, "step": 531 }, { "epoch": 0.004618015468615724, "grad_norm": 0.1572265625, "learning_rate": 0.001062, "loss": 0.2246, "step": 532 }, { "epoch": 0.004626695948819889, "grad_norm": 0.146484375, "learning_rate": 0.001064, "loss": 0.249, "step": 533 }, { "epoch": 0.004635376429024053, "grad_norm": 0.1552734375, "learning_rate": 0.001066, "loss": 0.2832, "step": 534 }, { "epoch": 0.0046440569092282185, "grad_norm": 0.17578125, "learning_rate": 0.0010680000000000002, "loss": 0.2578, "step": 535 }, { "epoch": 0.004652737389432384, "grad_norm": 0.0888671875, "learning_rate": 0.00107, "loss": 0.2383, "step": 536 }, { "epoch": 0.004661417869636549, "grad_norm": 0.138671875, "learning_rate": 0.001072, "loss": 0.2734, "step": 537 }, { "epoch": 0.004670098349840713, "grad_norm": 0.1650390625, "learning_rate": 0.001074, "loss": 0.291, "step": 538 }, { "epoch": 0.004678778830044878, "grad_norm": 0.1591796875, "learning_rate": 0.0010760000000000001, "loss": 0.2676, "step": 539 }, { "epoch": 0.004687459310249043, "grad_norm": 0.10205078125, "learning_rate": 0.0010780000000000002, "loss": 0.2285, "step": 540 }, { "epoch": 0.0046961397904532074, "grad_norm": 0.15234375, "learning_rate": 0.00108, "loss": 0.2988, "step": 541 }, { "epoch": 0.0047048202706573726, "grad_norm": 0.1630859375, "learning_rate": 0.001082, "loss": 0.3203, "step": 542 }, { "epoch": 0.004713500750861538, "grad_norm": 0.138671875, "learning_rate": 0.0010840000000000001, "loss": 0.2793, "step": 543 }, { "epoch": 0.004722181231065703, "grad_norm": 0.189453125, "learning_rate": 0.0010860000000000002, "loss": 0.293, "step": 544 }, { "epoch": 0.004730861711269867, "grad_norm": 0.1875, "learning_rate": 0.0010880000000000002, "loss": 0.25, "step": 545 }, { "epoch": 0.004739542191474032, "grad_norm": 0.134765625, "learning_rate": 0.00109, "loss": 0.1982, "step": 546 }, { "epoch": 0.004748222671678197, "grad_norm": 0.1591796875, "learning_rate": 0.001092, "loss": 0.2773, "step": 547 }, { "epoch": 0.004756903151882362, "grad_norm": 0.1435546875, "learning_rate": 0.0010940000000000001, "loss": 0.2891, "step": 548 }, { "epoch": 0.004765583632086527, "grad_norm": 0.1484375, "learning_rate": 0.0010960000000000002, "loss": 0.2285, "step": 549 }, { "epoch": 0.004774264112290692, "grad_norm": 0.1611328125, "learning_rate": 0.001098, "loss": 0.2949, "step": 550 }, { "epoch": 0.004782944592494857, "grad_norm": 0.220703125, "learning_rate": 0.0011, "loss": 0.3047, "step": 551 }, { "epoch": 0.004791625072699022, "grad_norm": 0.1103515625, "learning_rate": 0.0011020000000000001, "loss": 0.2793, "step": 552 }, { "epoch": 0.004800305552903186, "grad_norm": 0.181640625, "learning_rate": 0.0011040000000000002, "loss": 0.3809, "step": 553 }, { "epoch": 0.004808986033107351, "grad_norm": 0.1455078125, "learning_rate": 0.0011060000000000002, "loss": 0.2539, "step": 554 }, { "epoch": 0.0048176665133115165, "grad_norm": 0.17578125, "learning_rate": 0.001108, "loss": 0.2559, "step": 555 }, { "epoch": 0.004826346993515682, "grad_norm": 0.1416015625, "learning_rate": 0.00111, "loss": 0.3125, "step": 556 }, { "epoch": 0.004835027473719846, "grad_norm": 0.12353515625, "learning_rate": 0.0011120000000000001, "loss": 0.2441, "step": 557 }, { "epoch": 0.004843707953924011, "grad_norm": 0.130859375, "learning_rate": 0.0011140000000000002, "loss": 0.2598, "step": 558 }, { "epoch": 0.004852388434128176, "grad_norm": 0.1279296875, "learning_rate": 0.001116, "loss": 0.2852, "step": 559 }, { "epoch": 0.004861068914332341, "grad_norm": 0.1064453125, "learning_rate": 0.001118, "loss": 0.2754, "step": 560 }, { "epoch": 0.0048697493945365055, "grad_norm": 0.2314453125, "learning_rate": 0.0011200000000000001, "loss": 0.248, "step": 561 }, { "epoch": 0.004878429874740671, "grad_norm": 0.1279296875, "learning_rate": 0.0011220000000000002, "loss": 0.2832, "step": 562 }, { "epoch": 0.004887110354944836, "grad_norm": 0.11376953125, "learning_rate": 0.0011240000000000002, "loss": 0.2275, "step": 563 }, { "epoch": 0.004895790835149001, "grad_norm": 0.177734375, "learning_rate": 0.0011259999999999998, "loss": 0.457, "step": 564 }, { "epoch": 0.004904471315353165, "grad_norm": 0.169921875, "learning_rate": 0.0011279999999999999, "loss": 0.3164, "step": 565 }, { "epoch": 0.00491315179555733, "grad_norm": 0.1552734375, "learning_rate": 0.00113, "loss": 0.2754, "step": 566 }, { "epoch": 0.004921832275761495, "grad_norm": 0.1591796875, "learning_rate": 0.001132, "loss": 0.2559, "step": 567 }, { "epoch": 0.00493051275596566, "grad_norm": 0.1591796875, "learning_rate": 0.001134, "loss": 0.25, "step": 568 }, { "epoch": 0.004939193236169825, "grad_norm": 0.1630859375, "learning_rate": 0.0011359999999999999, "loss": 0.2363, "step": 569 }, { "epoch": 0.00494787371637399, "grad_norm": 0.435546875, "learning_rate": 0.001138, "loss": 0.8203, "step": 570 }, { "epoch": 0.004956554196578155, "grad_norm": 0.1455078125, "learning_rate": 0.00114, "loss": 0.2598, "step": 571 }, { "epoch": 0.004965234676782319, "grad_norm": 0.1455078125, "learning_rate": 0.001142, "loss": 0.2969, "step": 572 }, { "epoch": 0.004973915156986484, "grad_norm": 0.205078125, "learning_rate": 0.0011439999999999998, "loss": 0.293, "step": 573 }, { "epoch": 0.0049825956371906495, "grad_norm": 0.1748046875, "learning_rate": 0.0011459999999999999, "loss": 0.291, "step": 574 }, { "epoch": 0.004991276117394815, "grad_norm": 0.1689453125, "learning_rate": 0.001148, "loss": 0.252, "step": 575 }, { "epoch": 0.004999956597598979, "grad_norm": 0.126953125, "learning_rate": 0.00115, "loss": 0.2773, "step": 576 }, { "epoch": 0.005008637077803144, "grad_norm": 0.1611328125, "learning_rate": 0.001152, "loss": 0.2578, "step": 577 }, { "epoch": 0.005017317558007309, "grad_norm": 0.205078125, "learning_rate": 0.0011539999999999999, "loss": 0.25, "step": 578 }, { "epoch": 0.005025998038211474, "grad_norm": 0.111328125, "learning_rate": 0.001156, "loss": 0.2461, "step": 579 }, { "epoch": 0.0050346785184156384, "grad_norm": 0.1162109375, "learning_rate": 0.001158, "loss": 0.2168, "step": 580 }, { "epoch": 0.0050433589986198036, "grad_norm": 0.08544921875, "learning_rate": 0.00116, "loss": 0.2559, "step": 581 }, { "epoch": 0.005052039478823969, "grad_norm": 0.412109375, "learning_rate": 0.0011619999999999998, "loss": 0.4023, "step": 582 }, { "epoch": 0.005060719959028134, "grad_norm": 0.177734375, "learning_rate": 0.0011639999999999999, "loss": 0.3223, "step": 583 }, { "epoch": 0.005069400439232298, "grad_norm": 0.130859375, "learning_rate": 0.001166, "loss": 0.2754, "step": 584 }, { "epoch": 0.005078080919436463, "grad_norm": 0.09716796875, "learning_rate": 0.001168, "loss": 0.2891, "step": 585 }, { "epoch": 0.005086761399640628, "grad_norm": 0.11328125, "learning_rate": 0.00117, "loss": 0.2578, "step": 586 }, { "epoch": 0.005095441879844793, "grad_norm": 0.1455078125, "learning_rate": 0.0011719999999999999, "loss": 0.2871, "step": 587 }, { "epoch": 0.005104122360048958, "grad_norm": 0.111328125, "learning_rate": 0.001174, "loss": 0.2617, "step": 588 }, { "epoch": 0.005112802840253123, "grad_norm": 0.1171875, "learning_rate": 0.001176, "loss": 0.3145, "step": 589 }, { "epoch": 0.005121483320457288, "grad_norm": 0.10595703125, "learning_rate": 0.001178, "loss": 0.2734, "step": 590 }, { "epoch": 0.005130163800661453, "grad_norm": 0.09716796875, "learning_rate": 0.00118, "loss": 0.2578, "step": 591 }, { "epoch": 0.005138844280865617, "grad_norm": 0.1279296875, "learning_rate": 0.0011819999999999999, "loss": 0.3203, "step": 592 }, { "epoch": 0.005147524761069782, "grad_norm": 0.12109375, "learning_rate": 0.001184, "loss": 0.2637, "step": 593 }, { "epoch": 0.0051562052412739475, "grad_norm": 0.158203125, "learning_rate": 0.001186, "loss": 0.2695, "step": 594 }, { "epoch": 0.005164885721478112, "grad_norm": 0.380859375, "learning_rate": 0.001188, "loss": 0.3398, "step": 595 }, { "epoch": 0.005173566201682277, "grad_norm": 0.154296875, "learning_rate": 0.0011899999999999999, "loss": 0.2734, "step": 596 }, { "epoch": 0.005182246681886442, "grad_norm": 0.1396484375, "learning_rate": 0.001192, "loss": 0.3418, "step": 597 }, { "epoch": 0.005190927162090607, "grad_norm": 0.162109375, "learning_rate": 0.001194, "loss": 0.2695, "step": 598 }, { "epoch": 0.005199607642294771, "grad_norm": 0.1552734375, "learning_rate": 0.001196, "loss": 0.2656, "step": 599 }, { "epoch": 0.0052082881224989365, "grad_norm": 0.16015625, "learning_rate": 0.001198, "loss": 0.3262, "step": 600 }, { "epoch": 0.005216968602703102, "grad_norm": 0.158203125, "learning_rate": 0.0012, "loss": 0.2891, "step": 601 }, { "epoch": 0.005225649082907267, "grad_norm": 0.228515625, "learning_rate": 0.001202, "loss": 0.3535, "step": 602 }, { "epoch": 0.005234329563111431, "grad_norm": 0.1796875, "learning_rate": 0.001204, "loss": 0.2656, "step": 603 }, { "epoch": 0.005243010043315596, "grad_norm": 0.1318359375, "learning_rate": 0.001206, "loss": 0.2539, "step": 604 }, { "epoch": 0.005251690523519761, "grad_norm": 0.1572265625, "learning_rate": 0.001208, "loss": 0.2812, "step": 605 }, { "epoch": 0.005260371003723926, "grad_norm": 0.12109375, "learning_rate": 0.00121, "loss": 0.2598, "step": 606 }, { "epoch": 0.005269051483928091, "grad_norm": 0.248046875, "learning_rate": 0.001212, "loss": 0.293, "step": 607 }, { "epoch": 0.005277731964132256, "grad_norm": 1.828125, "learning_rate": 0.001214, "loss": 0.7344, "step": 608 }, { "epoch": 0.005286412444336421, "grad_norm": 0.12158203125, "learning_rate": 0.001216, "loss": 0.2539, "step": 609 }, { "epoch": 0.005295092924540586, "grad_norm": 0.1396484375, "learning_rate": 0.001218, "loss": 0.3047, "step": 610 }, { "epoch": 0.00530377340474475, "grad_norm": 0.1220703125, "learning_rate": 0.00122, "loss": 0.2188, "step": 611 }, { "epoch": 0.005312453884948915, "grad_norm": 0.1484375, "learning_rate": 0.001222, "loss": 0.2852, "step": 612 }, { "epoch": 0.0053211343651530804, "grad_norm": 0.23828125, "learning_rate": 0.001224, "loss": 0.2969, "step": 613 }, { "epoch": 0.005329814845357246, "grad_norm": 0.302734375, "learning_rate": 0.001226, "loss": 0.3613, "step": 614 }, { "epoch": 0.00533849532556141, "grad_norm": 0.265625, "learning_rate": 0.001228, "loss": 0.2617, "step": 615 }, { "epoch": 0.005347175805765575, "grad_norm": 0.08935546875, "learning_rate": 0.00123, "loss": 0.2539, "step": 616 }, { "epoch": 0.00535585628596974, "grad_norm": 0.134765625, "learning_rate": 0.001232, "loss": 0.2988, "step": 617 }, { "epoch": 0.005364536766173904, "grad_norm": 0.1396484375, "learning_rate": 0.001234, "loss": 0.3164, "step": 618 }, { "epoch": 0.005373217246378069, "grad_norm": 0.16796875, "learning_rate": 0.0012360000000000001, "loss": 0.2598, "step": 619 }, { "epoch": 0.0053818977265822346, "grad_norm": 0.12451171875, "learning_rate": 0.001238, "loss": 0.3203, "step": 620 }, { "epoch": 0.0053905782067864, "grad_norm": 0.11474609375, "learning_rate": 0.00124, "loss": 0.2891, "step": 621 }, { "epoch": 0.005399258686990564, "grad_norm": 0.1162109375, "learning_rate": 0.001242, "loss": 0.3086, "step": 622 }, { "epoch": 0.005407939167194729, "grad_norm": 0.1005859375, "learning_rate": 0.001244, "loss": 0.3008, "step": 623 }, { "epoch": 0.005416619647398894, "grad_norm": 0.07958984375, "learning_rate": 0.001246, "loss": 0.2578, "step": 624 }, { "epoch": 0.005425300127603059, "grad_norm": 0.12890625, "learning_rate": 0.001248, "loss": 0.2832, "step": 625 }, { "epoch": 0.0054339806078072235, "grad_norm": 0.1318359375, "learning_rate": 0.00125, "loss": 0.3555, "step": 626 }, { "epoch": 0.005442661088011389, "grad_norm": 0.07373046875, "learning_rate": 0.001252, "loss": 0.2354, "step": 627 }, { "epoch": 0.005451341568215554, "grad_norm": 0.142578125, "learning_rate": 0.0012540000000000001, "loss": 0.2852, "step": 628 }, { "epoch": 0.005460022048419719, "grad_norm": 0.11181640625, "learning_rate": 0.001256, "loss": 0.3789, "step": 629 }, { "epoch": 0.005468702528623883, "grad_norm": 0.11376953125, "learning_rate": 0.001258, "loss": 0.2852, "step": 630 }, { "epoch": 0.005477383008828048, "grad_norm": 0.146484375, "learning_rate": 0.00126, "loss": 0.3027, "step": 631 }, { "epoch": 0.005486063489032213, "grad_norm": 0.1298828125, "learning_rate": 0.001262, "loss": 0.2871, "step": 632 }, { "epoch": 0.0054947439692363785, "grad_norm": 0.11474609375, "learning_rate": 0.001264, "loss": 0.2988, "step": 633 }, { "epoch": 0.005503424449440543, "grad_norm": 0.39453125, "learning_rate": 0.001266, "loss": 0.4531, "step": 634 }, { "epoch": 0.005512104929644708, "grad_norm": 0.09716796875, "learning_rate": 0.001268, "loss": 0.252, "step": 635 }, { "epoch": 0.005520785409848873, "grad_norm": 0.1474609375, "learning_rate": 0.00127, "loss": 0.291, "step": 636 }, { "epoch": 0.005529465890053038, "grad_norm": 0.1875, "learning_rate": 0.0012720000000000001, "loss": 0.293, "step": 637 }, { "epoch": 0.005538146370257202, "grad_norm": 0.2060546875, "learning_rate": 0.001274, "loss": 0.3555, "step": 638 }, { "epoch": 0.0055468268504613675, "grad_norm": 0.34765625, "learning_rate": 0.001276, "loss": 0.375, "step": 639 }, { "epoch": 0.005555507330665533, "grad_norm": 0.1416015625, "learning_rate": 0.001278, "loss": 0.2236, "step": 640 }, { "epoch": 0.005564187810869698, "grad_norm": 0.10400390625, "learning_rate": 0.00128, "loss": 0.3066, "step": 641 }, { "epoch": 0.005572868291073862, "grad_norm": 0.08740234375, "learning_rate": 0.0012820000000000002, "loss": 0.2891, "step": 642 }, { "epoch": 0.005581548771278027, "grad_norm": 0.1279296875, "learning_rate": 0.001284, "loss": 0.3242, "step": 643 }, { "epoch": 0.005590229251482192, "grad_norm": 0.146484375, "learning_rate": 0.001286, "loss": 0.3398, "step": 644 }, { "epoch": 0.0055989097316863565, "grad_norm": 0.07763671875, "learning_rate": 0.001288, "loss": 0.2148, "step": 645 }, { "epoch": 0.005607590211890522, "grad_norm": 0.1025390625, "learning_rate": 0.0012900000000000001, "loss": 0.293, "step": 646 }, { "epoch": 0.005616270692094687, "grad_norm": 0.11279296875, "learning_rate": 0.001292, "loss": 0.3281, "step": 647 }, { "epoch": 0.005624951172298852, "grad_norm": 0.091796875, "learning_rate": 0.001294, "loss": 0.3105, "step": 648 }, { "epoch": 0.005633631652503016, "grad_norm": 0.12109375, "learning_rate": 0.001296, "loss": 0.3555, "step": 649 }, { "epoch": 0.005642312132707181, "grad_norm": 0.14453125, "learning_rate": 0.0012980000000000001, "loss": 0.3477, "step": 650 }, { "epoch": 0.005650992612911346, "grad_norm": 0.10205078125, "learning_rate": 0.0013000000000000002, "loss": 0.2988, "step": 651 }, { "epoch": 0.0056596730931155114, "grad_norm": 0.0908203125, "learning_rate": 0.001302, "loss": 0.2539, "step": 652 }, { "epoch": 0.005668353573319676, "grad_norm": 0.11572265625, "learning_rate": 0.001304, "loss": 0.3086, "step": 653 }, { "epoch": 0.005677034053523841, "grad_norm": 0.11083984375, "learning_rate": 0.001306, "loss": 0.3906, "step": 654 }, { "epoch": 0.005685714533728006, "grad_norm": 0.1376953125, "learning_rate": 0.0013080000000000001, "loss": 0.4121, "step": 655 }, { "epoch": 0.005694395013932171, "grad_norm": 0.11572265625, "learning_rate": 0.0013100000000000002, "loss": 0.3281, "step": 656 }, { "epoch": 0.005703075494136335, "grad_norm": 0.1396484375, "learning_rate": 0.001312, "loss": 0.3574, "step": 657 }, { "epoch": 0.0057117559743405, "grad_norm": 0.0849609375, "learning_rate": 0.001314, "loss": 0.2422, "step": 658 }, { "epoch": 0.0057204364545446655, "grad_norm": 0.1376953125, "learning_rate": 0.0013160000000000001, "loss": 0.2676, "step": 659 }, { "epoch": 0.005729116934748831, "grad_norm": 0.1328125, "learning_rate": 0.0013180000000000002, "loss": 0.3477, "step": 660 }, { "epoch": 0.005737797414952995, "grad_norm": 0.11572265625, "learning_rate": 0.00132, "loss": 0.3008, "step": 661 }, { "epoch": 0.00574647789515716, "grad_norm": 0.134765625, "learning_rate": 0.001322, "loss": 0.2852, "step": 662 }, { "epoch": 0.005755158375361325, "grad_norm": 0.11279296875, "learning_rate": 0.001324, "loss": 0.2812, "step": 663 }, { "epoch": 0.00576383885556549, "grad_norm": 0.0986328125, "learning_rate": 0.0013260000000000001, "loss": 0.3086, "step": 664 }, { "epoch": 0.0057725193357696545, "grad_norm": 0.08154296875, "learning_rate": 0.0013280000000000002, "loss": 0.2188, "step": 665 }, { "epoch": 0.00578119981597382, "grad_norm": 0.12158203125, "learning_rate": 0.00133, "loss": 0.2891, "step": 666 }, { "epoch": 0.005789880296177985, "grad_norm": 0.1181640625, "learning_rate": 0.001332, "loss": 0.332, "step": 667 }, { "epoch": 0.00579856077638215, "grad_norm": 0.1953125, "learning_rate": 0.0013340000000000001, "loss": 0.3223, "step": 668 }, { "epoch": 0.005807241256586314, "grad_norm": 0.119140625, "learning_rate": 0.0013360000000000002, "loss": 0.3359, "step": 669 }, { "epoch": 0.005815921736790479, "grad_norm": 0.1103515625, "learning_rate": 0.0013380000000000002, "loss": 0.2461, "step": 670 }, { "epoch": 0.005824602216994644, "grad_norm": 0.103515625, "learning_rate": 0.00134, "loss": 0.2871, "step": 671 }, { "epoch": 0.005833282697198809, "grad_norm": 0.10546875, "learning_rate": 0.001342, "loss": 0.2656, "step": 672 }, { "epoch": 0.005841963177402974, "grad_norm": 0.10595703125, "learning_rate": 0.0013440000000000001, "loss": 0.2559, "step": 673 }, { "epoch": 0.005850643657607139, "grad_norm": 0.10400390625, "learning_rate": 0.0013460000000000002, "loss": 0.2656, "step": 674 }, { "epoch": 0.005859324137811304, "grad_norm": 0.13671875, "learning_rate": 0.001348, "loss": 0.2793, "step": 675 }, { "epoch": 0.005868004618015468, "grad_norm": 0.111328125, "learning_rate": 0.00135, "loss": 0.3008, "step": 676 }, { "epoch": 0.005876685098219633, "grad_norm": 0.10595703125, "learning_rate": 0.0013520000000000001, "loss": 0.2451, "step": 677 }, { "epoch": 0.0058853655784237985, "grad_norm": 0.1142578125, "learning_rate": 0.0013540000000000002, "loss": 0.2812, "step": 678 }, { "epoch": 0.005894046058627964, "grad_norm": 0.1044921875, "learning_rate": 0.0013560000000000002, "loss": 0.373, "step": 679 }, { "epoch": 0.005902726538832128, "grad_norm": 0.11083984375, "learning_rate": 0.001358, "loss": 0.2168, "step": 680 }, { "epoch": 0.005911407019036293, "grad_norm": 0.0966796875, "learning_rate": 0.00136, "loss": 0.2793, "step": 681 }, { "epoch": 0.005920087499240458, "grad_norm": 0.12890625, "learning_rate": 0.0013620000000000001, "loss": 0.293, "step": 682 }, { "epoch": 0.005928767979444623, "grad_norm": 0.0927734375, "learning_rate": 0.0013640000000000002, "loss": 0.249, "step": 683 }, { "epoch": 0.0059374484596487875, "grad_norm": 0.10205078125, "learning_rate": 0.001366, "loss": 0.2461, "step": 684 }, { "epoch": 0.005946128939852953, "grad_norm": 0.1357421875, "learning_rate": 0.001368, "loss": 0.3066, "step": 685 }, { "epoch": 0.005954809420057118, "grad_norm": 0.095703125, "learning_rate": 0.0013700000000000001, "loss": 0.2598, "step": 686 }, { "epoch": 0.005963489900261283, "grad_norm": 0.14453125, "learning_rate": 0.0013720000000000002, "loss": 0.2314, "step": 687 }, { "epoch": 0.005972170380465447, "grad_norm": 0.1201171875, "learning_rate": 0.0013740000000000002, "loss": 0.3086, "step": 688 }, { "epoch": 0.005980850860669612, "grad_norm": 0.185546875, "learning_rate": 0.0013759999999999998, "loss": 0.2734, "step": 689 }, { "epoch": 0.005989531340873777, "grad_norm": 0.1142578125, "learning_rate": 0.0013779999999999999, "loss": 0.1924, "step": 690 }, { "epoch": 0.0059982118210779424, "grad_norm": 0.1787109375, "learning_rate": 0.00138, "loss": 0.2891, "step": 691 }, { "epoch": 0.006006892301282107, "grad_norm": 0.1904296875, "learning_rate": 0.001382, "loss": 0.3047, "step": 692 }, { "epoch": 0.006015572781486272, "grad_norm": 0.1357421875, "learning_rate": 0.001384, "loss": 0.2773, "step": 693 }, { "epoch": 0.006024253261690437, "grad_norm": 0.10693359375, "learning_rate": 0.0013859999999999999, "loss": 0.21, "step": 694 }, { "epoch": 0.006032933741894601, "grad_norm": 0.1279296875, "learning_rate": 0.001388, "loss": 0.2793, "step": 695 }, { "epoch": 0.006041614222098766, "grad_norm": 0.134765625, "learning_rate": 0.00139, "loss": 0.3359, "step": 696 }, { "epoch": 0.006050294702302931, "grad_norm": 0.12890625, "learning_rate": 0.001392, "loss": 0.2432, "step": 697 }, { "epoch": 0.0060589751825070965, "grad_norm": 0.1259765625, "learning_rate": 0.0013939999999999998, "loss": 0.3105, "step": 698 }, { "epoch": 0.006067655662711261, "grad_norm": 0.1259765625, "learning_rate": 0.0013959999999999999, "loss": 0.3457, "step": 699 }, { "epoch": 0.006076336142915426, "grad_norm": 0.12109375, "learning_rate": 0.001398, "loss": 0.3086, "step": 700 }, { "epoch": 0.006085016623119591, "grad_norm": 0.162109375, "learning_rate": 0.0014, "loss": 0.249, "step": 701 }, { "epoch": 0.006093697103323756, "grad_norm": 0.1337890625, "learning_rate": 0.001402, "loss": 0.2812, "step": 702 }, { "epoch": 0.00610237758352792, "grad_norm": 0.12890625, "learning_rate": 0.0014039999999999999, "loss": 0.3145, "step": 703 }, { "epoch": 0.0061110580637320855, "grad_norm": 0.10302734375, "learning_rate": 0.001406, "loss": 0.2617, "step": 704 }, { "epoch": 0.006119738543936251, "grad_norm": 0.126953125, "learning_rate": 0.001408, "loss": 0.2539, "step": 705 }, { "epoch": 0.006128419024140416, "grad_norm": 0.0908203125, "learning_rate": 0.00141, "loss": 0.2432, "step": 706 }, { "epoch": 0.00613709950434458, "grad_norm": 0.12158203125, "learning_rate": 0.0014119999999999998, "loss": 0.3047, "step": 707 }, { "epoch": 0.006145779984548745, "grad_norm": 0.1318359375, "learning_rate": 0.001414, "loss": 0.4062, "step": 708 }, { "epoch": 0.00615446046475291, "grad_norm": 0.107421875, "learning_rate": 0.001416, "loss": 0.2832, "step": 709 }, { "epoch": 0.006163140944957075, "grad_norm": 0.0830078125, "learning_rate": 0.001418, "loss": 0.2578, "step": 710 }, { "epoch": 0.00617182142516124, "grad_norm": 0.099609375, "learning_rate": 0.00142, "loss": 0.2949, "step": 711 }, { "epoch": 0.006180501905365405, "grad_norm": 0.11279296875, "learning_rate": 0.0014219999999999999, "loss": 0.2949, "step": 712 }, { "epoch": 0.00618918238556957, "grad_norm": 0.09228515625, "learning_rate": 0.001424, "loss": 0.252, "step": 713 }, { "epoch": 0.006197862865773735, "grad_norm": 0.1416015625, "learning_rate": 0.001426, "loss": 0.3047, "step": 714 }, { "epoch": 0.006206543345977899, "grad_norm": 0.1328125, "learning_rate": 0.001428, "loss": 0.25, "step": 715 }, { "epoch": 0.006215223826182064, "grad_norm": 0.1513671875, "learning_rate": 0.00143, "loss": 0.3164, "step": 716 }, { "epoch": 0.0062239043063862295, "grad_norm": 0.10009765625, "learning_rate": 0.001432, "loss": 0.2812, "step": 717 }, { "epoch": 0.006232584786590395, "grad_norm": 0.115234375, "learning_rate": 0.001434, "loss": 0.2852, "step": 718 }, { "epoch": 0.006241265266794559, "grad_norm": 0.10888671875, "learning_rate": 0.001436, "loss": 0.3008, "step": 719 }, { "epoch": 0.006249945746998724, "grad_norm": 0.1376953125, "learning_rate": 0.001438, "loss": 0.2246, "step": 720 }, { "epoch": 0.006258626227202889, "grad_norm": 0.12890625, "learning_rate": 0.0014399999999999999, "loss": 0.2949, "step": 721 }, { "epoch": 0.006267306707407053, "grad_norm": 0.09716796875, "learning_rate": 0.001442, "loss": 0.2383, "step": 722 }, { "epoch": 0.0062759871876112185, "grad_norm": 0.1044921875, "learning_rate": 0.001444, "loss": 0.2734, "step": 723 }, { "epoch": 0.006284667667815384, "grad_norm": 0.0947265625, "learning_rate": 0.001446, "loss": 0.207, "step": 724 }, { "epoch": 0.006293348148019549, "grad_norm": 0.11328125, "learning_rate": 0.001448, "loss": 0.3086, "step": 725 }, { "epoch": 0.006302028628223713, "grad_norm": 0.11083984375, "learning_rate": 0.00145, "loss": 0.2188, "step": 726 }, { "epoch": 0.006310709108427878, "grad_norm": 0.1396484375, "learning_rate": 0.001452, "loss": 0.3105, "step": 727 }, { "epoch": 0.006319389588632043, "grad_norm": 0.10546875, "learning_rate": 0.001454, "loss": 0.3066, "step": 728 }, { "epoch": 0.006328070068836208, "grad_norm": 0.12158203125, "learning_rate": 0.001456, "loss": 0.293, "step": 729 }, { "epoch": 0.0063367505490403726, "grad_norm": 0.10302734375, "learning_rate": 0.001458, "loss": 0.2402, "step": 730 }, { "epoch": 0.006345431029244538, "grad_norm": 0.11669921875, "learning_rate": 0.00146, "loss": 0.2773, "step": 731 }, { "epoch": 0.006354111509448703, "grad_norm": 0.11865234375, "learning_rate": 0.001462, "loss": 0.2559, "step": 732 }, { "epoch": 0.006362791989652868, "grad_norm": 0.126953125, "learning_rate": 0.001464, "loss": 0.2988, "step": 733 }, { "epoch": 0.006371472469857032, "grad_norm": 0.09423828125, "learning_rate": 0.001466, "loss": 0.2246, "step": 734 }, { "epoch": 0.006380152950061197, "grad_norm": 0.1318359375, "learning_rate": 0.001468, "loss": 0.3301, "step": 735 }, { "epoch": 0.006388833430265362, "grad_norm": 0.115234375, "learning_rate": 0.00147, "loss": 0.2422, "step": 736 }, { "epoch": 0.0063975139104695275, "grad_norm": 0.1591796875, "learning_rate": 0.001472, "loss": 0.2539, "step": 737 }, { "epoch": 0.006406194390673692, "grad_norm": 0.08984375, "learning_rate": 0.001474, "loss": 0.2773, "step": 738 }, { "epoch": 0.006414874870877857, "grad_norm": 0.07568359375, "learning_rate": 0.001476, "loss": 0.2539, "step": 739 }, { "epoch": 0.006423555351082022, "grad_norm": 0.1474609375, "learning_rate": 0.001478, "loss": 0.3477, "step": 740 }, { "epoch": 0.006432235831286187, "grad_norm": 0.12890625, "learning_rate": 0.00148, "loss": 0.3477, "step": 741 }, { "epoch": 0.006440916311490351, "grad_norm": 0.16796875, "learning_rate": 0.001482, "loss": 0.2773, "step": 742 }, { "epoch": 0.0064495967916945165, "grad_norm": 0.12060546875, "learning_rate": 0.001484, "loss": 0.2754, "step": 743 }, { "epoch": 0.006458277271898682, "grad_norm": 0.115234375, "learning_rate": 0.0014860000000000001, "loss": 0.3145, "step": 744 }, { "epoch": 0.006466957752102847, "grad_norm": 0.1259765625, "learning_rate": 0.001488, "loss": 0.3066, "step": 745 }, { "epoch": 0.006475638232307011, "grad_norm": 0.0908203125, "learning_rate": 0.00149, "loss": 0.2393, "step": 746 }, { "epoch": 0.006484318712511176, "grad_norm": 0.10791015625, "learning_rate": 0.001492, "loss": 0.2461, "step": 747 }, { "epoch": 0.006492999192715341, "grad_norm": 0.10546875, "learning_rate": 0.001494, "loss": 0.2617, "step": 748 }, { "epoch": 0.0065016796729195055, "grad_norm": 0.08935546875, "learning_rate": 0.001496, "loss": 0.249, "step": 749 }, { "epoch": 0.006510360153123671, "grad_norm": 0.11865234375, "learning_rate": 0.001498, "loss": 0.2656, "step": 750 }, { "epoch": 0.006519040633327836, "grad_norm": 0.12109375, "learning_rate": 0.0015, "loss": 0.2578, "step": 751 }, { "epoch": 0.006527721113532001, "grad_norm": 0.1162109375, "learning_rate": 0.001502, "loss": 0.2559, "step": 752 }, { "epoch": 0.006536401593736165, "grad_norm": 0.0888671875, "learning_rate": 0.0015040000000000001, "loss": 0.2285, "step": 753 }, { "epoch": 0.00654508207394033, "grad_norm": 0.1591796875, "learning_rate": 0.001506, "loss": 0.3125, "step": 754 }, { "epoch": 0.006553762554144495, "grad_norm": 0.12353515625, "learning_rate": 0.001508, "loss": 0.2451, "step": 755 }, { "epoch": 0.0065624430343486605, "grad_norm": 0.11181640625, "learning_rate": 0.00151, "loss": 0.3027, "step": 756 }, { "epoch": 0.006571123514552825, "grad_norm": 0.1015625, "learning_rate": 0.001512, "loss": 0.2734, "step": 757 }, { "epoch": 0.00657980399475699, "grad_norm": 0.12451171875, "learning_rate": 0.001514, "loss": 0.3281, "step": 758 }, { "epoch": 0.006588484474961155, "grad_norm": 0.1201171875, "learning_rate": 0.001516, "loss": 0.2305, "step": 759 }, { "epoch": 0.00659716495516532, "grad_norm": 0.1025390625, "learning_rate": 0.001518, "loss": 0.2314, "step": 760 }, { "epoch": 0.006605845435369484, "grad_norm": 0.140625, "learning_rate": 0.00152, "loss": 0.2773, "step": 761 }, { "epoch": 0.0066145259155736495, "grad_norm": 0.12255859375, "learning_rate": 0.0015220000000000001, "loss": 0.2227, "step": 762 }, { "epoch": 0.006623206395777815, "grad_norm": 0.1123046875, "learning_rate": 0.001524, "loss": 0.2617, "step": 763 }, { "epoch": 0.00663188687598198, "grad_norm": 0.1005859375, "learning_rate": 0.001526, "loss": 0.25, "step": 764 }, { "epoch": 0.006640567356186144, "grad_norm": 0.08740234375, "learning_rate": 0.001528, "loss": 0.207, "step": 765 }, { "epoch": 0.006649247836390309, "grad_norm": 0.09814453125, "learning_rate": 0.0015300000000000001, "loss": 0.2598, "step": 766 }, { "epoch": 0.006657928316594474, "grad_norm": 0.09130859375, "learning_rate": 0.0015320000000000002, "loss": 0.2578, "step": 767 }, { "epoch": 0.006666608796798639, "grad_norm": 0.10009765625, "learning_rate": 0.001534, "loss": 0.2617, "step": 768 }, { "epoch": 0.0066752892770028036, "grad_norm": 0.10693359375, "learning_rate": 0.001536, "loss": 0.3008, "step": 769 }, { "epoch": 0.006683969757206969, "grad_norm": 0.1279296875, "learning_rate": 0.001538, "loss": 0.3613, "step": 770 }, { "epoch": 0.006692650237411134, "grad_norm": 0.10205078125, "learning_rate": 0.0015400000000000001, "loss": 0.2422, "step": 771 }, { "epoch": 0.006701330717615298, "grad_norm": 0.1982421875, "learning_rate": 0.001542, "loss": 0.3379, "step": 772 }, { "epoch": 0.006710011197819463, "grad_norm": 0.1357421875, "learning_rate": 0.001544, "loss": 0.2695, "step": 773 }, { "epoch": 0.006718691678023628, "grad_norm": 0.1318359375, "learning_rate": 0.001546, "loss": 0.3477, "step": 774 }, { "epoch": 0.006727372158227793, "grad_norm": 0.1162109375, "learning_rate": 0.0015480000000000001, "loss": 0.252, "step": 775 }, { "epoch": 0.006736052638431958, "grad_norm": 0.09130859375, "learning_rate": 0.0015500000000000002, "loss": 0.2637, "step": 776 }, { "epoch": 0.006744733118636123, "grad_norm": 0.099609375, "learning_rate": 0.001552, "loss": 0.2559, "step": 777 }, { "epoch": 0.006753413598840288, "grad_norm": 0.44921875, "learning_rate": 0.001554, "loss": 0.4492, "step": 778 }, { "epoch": 0.006762094079044453, "grad_norm": 0.1044921875, "learning_rate": 0.001556, "loss": 0.2188, "step": 779 }, { "epoch": 0.006770774559248617, "grad_norm": 0.244140625, "learning_rate": 0.0015580000000000001, "loss": 0.2656, "step": 780 }, { "epoch": 0.006779455039452782, "grad_norm": 0.1533203125, "learning_rate": 0.0015600000000000002, "loss": 0.2793, "step": 781 }, { "epoch": 0.0067881355196569475, "grad_norm": 0.130859375, "learning_rate": 0.001562, "loss": 0.25, "step": 782 }, { "epoch": 0.006796815999861113, "grad_norm": 0.146484375, "learning_rate": 0.001564, "loss": 0.2217, "step": 783 }, { "epoch": 0.006805496480065277, "grad_norm": 0.146484375, "learning_rate": 0.0015660000000000001, "loss": 0.2422, "step": 784 }, { "epoch": 0.006814176960269442, "grad_norm": 0.1201171875, "learning_rate": 0.0015680000000000002, "loss": 0.2812, "step": 785 }, { "epoch": 0.006822857440473607, "grad_norm": 0.1259765625, "learning_rate": 0.00157, "loss": 0.2188, "step": 786 }, { "epoch": 0.006831537920677772, "grad_norm": 0.134765625, "learning_rate": 0.001572, "loss": 0.2305, "step": 787 }, { "epoch": 0.0068402184008819365, "grad_norm": 0.1162109375, "learning_rate": 0.001574, "loss": 0.2812, "step": 788 }, { "epoch": 0.006848898881086102, "grad_norm": 0.091796875, "learning_rate": 0.0015760000000000001, "loss": 0.2168, "step": 789 }, { "epoch": 0.006857579361290267, "grad_norm": 0.10498046875, "learning_rate": 0.0015780000000000002, "loss": 0.252, "step": 790 }, { "epoch": 0.006866259841494432, "grad_norm": 0.126953125, "learning_rate": 0.00158, "loss": 0.3359, "step": 791 }, { "epoch": 0.006874940321698596, "grad_norm": 0.1416015625, "learning_rate": 0.001582, "loss": 0.3125, "step": 792 }, { "epoch": 0.006883620801902761, "grad_norm": 0.09765625, "learning_rate": 0.0015840000000000001, "loss": 0.293, "step": 793 }, { "epoch": 0.006892301282106926, "grad_norm": 0.0693359375, "learning_rate": 0.0015860000000000002, "loss": 0.2617, "step": 794 }, { "epoch": 0.0069009817623110915, "grad_norm": 0.119140625, "learning_rate": 0.0015880000000000002, "loss": 0.3008, "step": 795 }, { "epoch": 0.006909662242515256, "grad_norm": 0.1025390625, "learning_rate": 0.00159, "loss": 0.2949, "step": 796 }, { "epoch": 0.006918342722719421, "grad_norm": 0.1455078125, "learning_rate": 0.001592, "loss": 0.3262, "step": 797 }, { "epoch": 0.006927023202923586, "grad_norm": 0.1376953125, "learning_rate": 0.0015940000000000001, "loss": 0.2637, "step": 798 }, { "epoch": 0.00693570368312775, "grad_norm": 0.10205078125, "learning_rate": 0.0015960000000000002, "loss": 0.25, "step": 799 }, { "epoch": 0.006944384163331915, "grad_norm": 0.1455078125, "learning_rate": 0.001598, "loss": 0.2148, "step": 800 }, { "epoch": 0.0069530646435360804, "grad_norm": 0.10107421875, "learning_rate": 0.0016, "loss": 0.3086, "step": 801 }, { "epoch": 0.006961745123740246, "grad_norm": 0.10302734375, "learning_rate": 0.0016020000000000001, "loss": 0.2891, "step": 802 }, { "epoch": 0.00697042560394441, "grad_norm": 0.09326171875, "learning_rate": 0.0016040000000000002, "loss": 0.2207, "step": 803 }, { "epoch": 0.006979106084148575, "grad_norm": 0.10986328125, "learning_rate": 0.0016060000000000002, "loss": 0.2852, "step": 804 }, { "epoch": 0.00698778656435274, "grad_norm": 0.10546875, "learning_rate": 0.001608, "loss": 0.2793, "step": 805 }, { "epoch": 0.006996467044556905, "grad_norm": 0.14453125, "learning_rate": 0.00161, "loss": 0.291, "step": 806 }, { "epoch": 0.007005147524761069, "grad_norm": 0.115234375, "learning_rate": 0.0016120000000000002, "loss": 0.3086, "step": 807 }, { "epoch": 0.0070138280049652345, "grad_norm": 0.103515625, "learning_rate": 0.0016140000000000002, "loss": 0.2373, "step": 808 }, { "epoch": 0.0070225084851694, "grad_norm": 0.0966796875, "learning_rate": 0.001616, "loss": 0.209, "step": 809 }, { "epoch": 0.007031188965373565, "grad_norm": 0.1611328125, "learning_rate": 0.001618, "loss": 0.3398, "step": 810 }, { "epoch": 0.007039869445577729, "grad_norm": 0.080078125, "learning_rate": 0.0016200000000000001, "loss": 0.1973, "step": 811 }, { "epoch": 0.007048549925781894, "grad_norm": 0.162109375, "learning_rate": 0.0016220000000000002, "loss": 0.3281, "step": 812 }, { "epoch": 0.007057230405986059, "grad_norm": 0.1845703125, "learning_rate": 0.0016240000000000002, "loss": 0.2832, "step": 813 }, { "epoch": 0.007065910886190224, "grad_norm": 0.1435546875, "learning_rate": 0.0016259999999999998, "loss": 0.2793, "step": 814 }, { "epoch": 0.007074591366394389, "grad_norm": 0.08154296875, "learning_rate": 0.0016279999999999999, "loss": 0.2275, "step": 815 }, { "epoch": 0.007083271846598554, "grad_norm": 0.0771484375, "learning_rate": 0.00163, "loss": 0.2295, "step": 816 }, { "epoch": 0.007091952326802719, "grad_norm": 0.11474609375, "learning_rate": 0.001632, "loss": 0.2988, "step": 817 }, { "epoch": 0.007100632807006884, "grad_norm": 0.1884765625, "learning_rate": 0.001634, "loss": 0.4414, "step": 818 }, { "epoch": 0.007109313287211048, "grad_norm": 0.134765625, "learning_rate": 0.0016359999999999999, "loss": 0.3105, "step": 819 }, { "epoch": 0.007117993767415213, "grad_norm": 0.10205078125, "learning_rate": 0.001638, "loss": 0.2227, "step": 820 }, { "epoch": 0.0071266742476193785, "grad_norm": 0.10205078125, "learning_rate": 0.00164, "loss": 0.2852, "step": 821 }, { "epoch": 0.007135354727823544, "grad_norm": 0.10498046875, "learning_rate": 0.001642, "loss": 0.208, "step": 822 }, { "epoch": 0.007144035208027708, "grad_norm": 0.1181640625, "learning_rate": 0.0016439999999999998, "loss": 0.3652, "step": 823 }, { "epoch": 0.007152715688231873, "grad_norm": 0.11572265625, "learning_rate": 0.001646, "loss": 0.2832, "step": 824 }, { "epoch": 0.007161396168436038, "grad_norm": 0.099609375, "learning_rate": 0.001648, "loss": 0.2715, "step": 825 }, { "epoch": 0.007170076648640202, "grad_norm": 0.10791015625, "learning_rate": 0.00165, "loss": 0.2305, "step": 826 }, { "epoch": 0.0071787571288443675, "grad_norm": 0.111328125, "learning_rate": 0.001652, "loss": 0.252, "step": 827 }, { "epoch": 0.007187437609048533, "grad_norm": 0.09423828125, "learning_rate": 0.0016539999999999999, "loss": 0.25, "step": 828 }, { "epoch": 0.007196118089252698, "grad_norm": 0.107421875, "learning_rate": 0.001656, "loss": 0.2734, "step": 829 }, { "epoch": 0.007204798569456862, "grad_norm": 0.10302734375, "learning_rate": 0.001658, "loss": 0.21, "step": 830 }, { "epoch": 0.007213479049661027, "grad_norm": 0.10400390625, "learning_rate": 0.00166, "loss": 0.2412, "step": 831 }, { "epoch": 0.007222159529865192, "grad_norm": 0.11572265625, "learning_rate": 0.0016619999999999998, "loss": 0.252, "step": 832 }, { "epoch": 0.007230840010069357, "grad_norm": 0.087890625, "learning_rate": 0.001664, "loss": 0.2734, "step": 833 }, { "epoch": 0.007239520490273522, "grad_norm": 0.0966796875, "learning_rate": 0.001666, "loss": 0.208, "step": 834 }, { "epoch": 0.007248200970477687, "grad_norm": 0.0888671875, "learning_rate": 0.001668, "loss": 0.2422, "step": 835 }, { "epoch": 0.007256881450681852, "grad_norm": 0.07373046875, "learning_rate": 0.00167, "loss": 0.2539, "step": 836 }, { "epoch": 0.007265561930886017, "grad_norm": 0.09765625, "learning_rate": 0.0016719999999999999, "loss": 0.3086, "step": 837 }, { "epoch": 0.007274242411090181, "grad_norm": 0.0859375, "learning_rate": 0.001674, "loss": 0.2324, "step": 838 }, { "epoch": 0.007282922891294346, "grad_norm": 0.0927734375, "learning_rate": 0.001676, "loss": 0.2734, "step": 839 }, { "epoch": 0.0072916033714985114, "grad_norm": 0.14453125, "learning_rate": 0.001678, "loss": 0.293, "step": 840 }, { "epoch": 0.0073002838517026766, "grad_norm": 0.0908203125, "learning_rate": 0.00168, "loss": 0.2246, "step": 841 }, { "epoch": 0.007308964331906841, "grad_norm": 0.1298828125, "learning_rate": 0.001682, "loss": 0.2734, "step": 842 }, { "epoch": 0.007317644812111006, "grad_norm": 0.1279296875, "learning_rate": 0.001684, "loss": 0.3281, "step": 843 }, { "epoch": 0.007326325292315171, "grad_norm": 0.10693359375, "learning_rate": 0.001686, "loss": 0.3008, "step": 844 }, { "epoch": 0.007335005772519336, "grad_norm": 0.115234375, "learning_rate": 0.001688, "loss": 0.25, "step": 845 }, { "epoch": 0.0073436862527235, "grad_norm": 0.12890625, "learning_rate": 0.0016899999999999999, "loss": 0.3027, "step": 846 }, { "epoch": 0.0073523667329276655, "grad_norm": 0.0927734375, "learning_rate": 0.001692, "loss": 0.2207, "step": 847 }, { "epoch": 0.007361047213131831, "grad_norm": 0.1259765625, "learning_rate": 0.001694, "loss": 0.2793, "step": 848 }, { "epoch": 0.007369727693335995, "grad_norm": 0.11376953125, "learning_rate": 0.001696, "loss": 0.2031, "step": 849 }, { "epoch": 0.00737840817354016, "grad_norm": 0.130859375, "learning_rate": 0.001698, "loss": 0.2871, "step": 850 }, { "epoch": 0.007387088653744325, "grad_norm": 0.1455078125, "learning_rate": 0.0017, "loss": 0.2949, "step": 851 }, { "epoch": 0.00739576913394849, "grad_norm": 0.11572265625, "learning_rate": 0.001702, "loss": 0.2285, "step": 852 }, { "epoch": 0.0074044496141526545, "grad_norm": 0.1044921875, "learning_rate": 0.001704, "loss": 0.2275, "step": 853 }, { "epoch": 0.00741313009435682, "grad_norm": 0.12060546875, "learning_rate": 0.001706, "loss": 0.3008, "step": 854 }, { "epoch": 0.007421810574560985, "grad_norm": 0.1171875, "learning_rate": 0.001708, "loss": 0.3184, "step": 855 }, { "epoch": 0.00743049105476515, "grad_norm": 0.10546875, "learning_rate": 0.00171, "loss": 0.2695, "step": 856 }, { "epoch": 0.007439171534969314, "grad_norm": 0.09130859375, "learning_rate": 0.001712, "loss": 0.2637, "step": 857 }, { "epoch": 0.007447852015173479, "grad_norm": 0.12890625, "learning_rate": 0.001714, "loss": 0.3008, "step": 858 }, { "epoch": 0.007456532495377644, "grad_norm": 3.140625, "learning_rate": 0.001716, "loss": 0.6523, "step": 859 }, { "epoch": 0.0074652129755818095, "grad_norm": 0.1474609375, "learning_rate": 0.001718, "loss": 0.3145, "step": 860 }, { "epoch": 0.007473893455785974, "grad_norm": 0.10791015625, "learning_rate": 0.00172, "loss": 0.291, "step": 861 }, { "epoch": 0.007482573935990139, "grad_norm": 0.11279296875, "learning_rate": 0.001722, "loss": 0.3359, "step": 862 }, { "epoch": 0.007491254416194304, "grad_norm": 0.09423828125, "learning_rate": 0.001724, "loss": 0.2363, "step": 863 }, { "epoch": 0.007499934896398469, "grad_norm": 0.099609375, "learning_rate": 0.001726, "loss": 0.2734, "step": 864 }, { "epoch": 0.007508615376602633, "grad_norm": 0.11962890625, "learning_rate": 0.001728, "loss": 0.2812, "step": 865 }, { "epoch": 0.0075172958568067985, "grad_norm": 0.0888671875, "learning_rate": 0.00173, "loss": 0.2695, "step": 866 }, { "epoch": 0.007525976337010964, "grad_norm": 0.09521484375, "learning_rate": 0.001732, "loss": 0.2734, "step": 867 }, { "epoch": 0.007534656817215129, "grad_norm": 0.271484375, "learning_rate": 0.001734, "loss": 0.2832, "step": 868 }, { "epoch": 0.007543337297419293, "grad_norm": 0.1044921875, "learning_rate": 0.0017360000000000001, "loss": 0.2988, "step": 869 }, { "epoch": 0.007552017777623458, "grad_norm": 0.0966796875, "learning_rate": 0.001738, "loss": 0.2539, "step": 870 }, { "epoch": 0.007560698257827623, "grad_norm": 0.126953125, "learning_rate": 0.00174, "loss": 0.2578, "step": 871 }, { "epoch": 0.007569378738031788, "grad_norm": 0.1416015625, "learning_rate": 0.001742, "loss": 0.248, "step": 872 }, { "epoch": 0.007578059218235953, "grad_norm": 0.12353515625, "learning_rate": 0.001744, "loss": 0.3066, "step": 873 }, { "epoch": 0.007586739698440118, "grad_norm": 0.09423828125, "learning_rate": 0.001746, "loss": 0.2148, "step": 874 }, { "epoch": 0.007595420178644283, "grad_norm": 0.125, "learning_rate": 0.001748, "loss": 0.3125, "step": 875 }, { "epoch": 0.007604100658848447, "grad_norm": 0.11669921875, "learning_rate": 0.00175, "loss": 0.2441, "step": 876 }, { "epoch": 0.007612781139052612, "grad_norm": 0.1474609375, "learning_rate": 0.001752, "loss": 0.2617, "step": 877 }, { "epoch": 0.007621461619256777, "grad_norm": 0.103515625, "learning_rate": 0.0017540000000000001, "loss": 0.2715, "step": 878 }, { "epoch": 0.0076301420994609424, "grad_norm": 0.78515625, "learning_rate": 0.001756, "loss": 0.6016, "step": 879 }, { "epoch": 0.007638822579665107, "grad_norm": 0.08251953125, "learning_rate": 0.001758, "loss": 0.1846, "step": 880 }, { "epoch": 0.007647503059869272, "grad_norm": 0.1376953125, "learning_rate": 0.00176, "loss": 0.3008, "step": 881 }, { "epoch": 0.007656183540073437, "grad_norm": 0.12060546875, "learning_rate": 0.0017620000000000001, "loss": 0.2715, "step": 882 }, { "epoch": 0.007664864020277602, "grad_norm": 0.1318359375, "learning_rate": 0.001764, "loss": 0.293, "step": 883 }, { "epoch": 0.007673544500481766, "grad_norm": 0.130859375, "learning_rate": 0.001766, "loss": 0.2695, "step": 884 }, { "epoch": 0.007682224980685931, "grad_norm": 0.1259765625, "learning_rate": 0.001768, "loss": 0.2812, "step": 885 }, { "epoch": 0.0076909054608900965, "grad_norm": 0.150390625, "learning_rate": 0.00177, "loss": 0.375, "step": 886 }, { "epoch": 0.007699585941094262, "grad_norm": 0.0830078125, "learning_rate": 0.0017720000000000001, "loss": 0.2578, "step": 887 }, { "epoch": 0.007708266421298426, "grad_norm": 0.095703125, "learning_rate": 0.001774, "loss": 0.2637, "step": 888 }, { "epoch": 0.007716946901502591, "grad_norm": 0.1376953125, "learning_rate": 0.001776, "loss": 0.2656, "step": 889 }, { "epoch": 0.007725627381706756, "grad_norm": 0.07421875, "learning_rate": 0.001778, "loss": 0.2393, "step": 890 }, { "epoch": 0.007734307861910921, "grad_norm": 0.15234375, "learning_rate": 0.0017800000000000001, "loss": 0.3184, "step": 891 }, { "epoch": 0.0077429883421150855, "grad_norm": 0.0859375, "learning_rate": 0.0017820000000000002, "loss": 0.2676, "step": 892 }, { "epoch": 0.007751668822319251, "grad_norm": 0.1328125, "learning_rate": 0.001784, "loss": 0.3086, "step": 893 }, { "epoch": 0.007760349302523416, "grad_norm": 0.09814453125, "learning_rate": 0.001786, "loss": 0.293, "step": 894 }, { "epoch": 0.007769029782727581, "grad_norm": 0.072265625, "learning_rate": 0.001788, "loss": 0.1904, "step": 895 }, { "epoch": 0.007777710262931745, "grad_norm": 0.11474609375, "learning_rate": 0.0017900000000000001, "loss": 0.3086, "step": 896 }, { "epoch": 0.00778639074313591, "grad_norm": 0.08935546875, "learning_rate": 0.001792, "loss": 0.2305, "step": 897 }, { "epoch": 0.007795071223340075, "grad_norm": 0.11083984375, "learning_rate": 0.001794, "loss": 0.3105, "step": 898 }, { "epoch": 0.0078037517035442405, "grad_norm": 0.11865234375, "learning_rate": 0.001796, "loss": 0.3223, "step": 899 }, { "epoch": 0.007812432183748405, "grad_norm": 0.19140625, "learning_rate": 0.0017980000000000001, "loss": 0.3691, "step": 900 }, { "epoch": 0.00782111266395257, "grad_norm": 0.16796875, "learning_rate": 0.0018000000000000002, "loss": 0.3047, "step": 901 }, { "epoch": 0.007829793144156735, "grad_norm": 0.1318359375, "learning_rate": 0.001802, "loss": 0.2031, "step": 902 }, { "epoch": 0.0078384736243609, "grad_norm": 0.10986328125, "learning_rate": 0.001804, "loss": 0.2422, "step": 903 }, { "epoch": 0.007847154104565065, "grad_norm": 0.51953125, "learning_rate": 0.001806, "loss": 0.3965, "step": 904 }, { "epoch": 0.007855834584769229, "grad_norm": 0.138671875, "learning_rate": 0.0018080000000000001, "loss": 0.2441, "step": 905 }, { "epoch": 0.007864515064973394, "grad_norm": 0.11083984375, "learning_rate": 0.0018100000000000002, "loss": 0.2656, "step": 906 }, { "epoch": 0.007873195545177559, "grad_norm": 0.1435546875, "learning_rate": 0.001812, "loss": 0.2188, "step": 907 }, { "epoch": 0.007881876025381724, "grad_norm": 0.10498046875, "learning_rate": 0.001814, "loss": 0.3027, "step": 908 }, { "epoch": 0.007890556505585889, "grad_norm": 0.51171875, "learning_rate": 0.0018160000000000001, "loss": 0.4023, "step": 909 }, { "epoch": 0.007899236985790054, "grad_norm": 0.08251953125, "learning_rate": 0.0018180000000000002, "loss": 0.2598, "step": 910 }, { "epoch": 0.00790791746599422, "grad_norm": 0.09375, "learning_rate": 0.00182, "loss": 0.2715, "step": 911 }, { "epoch": 0.007916597946198384, "grad_norm": 0.0791015625, "learning_rate": 0.001822, "loss": 0.2275, "step": 912 }, { "epoch": 0.007925278426402548, "grad_norm": 0.0908203125, "learning_rate": 0.001824, "loss": 0.2891, "step": 913 }, { "epoch": 0.007933958906606713, "grad_norm": 0.09619140625, "learning_rate": 0.0018260000000000001, "loss": 0.293, "step": 914 }, { "epoch": 0.007942639386810878, "grad_norm": 0.083984375, "learning_rate": 0.0018280000000000002, "loss": 0.2402, "step": 915 }, { "epoch": 0.007951319867015043, "grad_norm": 0.08984375, "learning_rate": 0.00183, "loss": 0.2598, "step": 916 }, { "epoch": 0.007960000347219208, "grad_norm": 0.080078125, "learning_rate": 0.001832, "loss": 0.3125, "step": 917 }, { "epoch": 0.007968680827423373, "grad_norm": 0.11865234375, "learning_rate": 0.0018340000000000001, "loss": 0.2578, "step": 918 }, { "epoch": 0.007977361307627539, "grad_norm": 0.333984375, "learning_rate": 0.0018360000000000002, "loss": 0.4102, "step": 919 }, { "epoch": 0.007986041787831704, "grad_norm": 0.28515625, "learning_rate": 0.0018380000000000002, "loss": 0.7422, "step": 920 }, { "epoch": 0.007994722268035867, "grad_norm": 0.10009765625, "learning_rate": 0.00184, "loss": 0.2617, "step": 921 }, { "epoch": 0.008003402748240032, "grad_norm": 0.0966796875, "learning_rate": 0.001842, "loss": 0.2441, "step": 922 }, { "epoch": 0.008012083228444197, "grad_norm": 0.0927734375, "learning_rate": 0.0018440000000000002, "loss": 0.2227, "step": 923 }, { "epoch": 0.008020763708648362, "grad_norm": 0.0693359375, "learning_rate": 0.0018460000000000002, "loss": 0.2061, "step": 924 }, { "epoch": 0.008029444188852528, "grad_norm": 0.087890625, "learning_rate": 0.001848, "loss": 0.252, "step": 925 }, { "epoch": 0.008038124669056693, "grad_norm": 0.09814453125, "learning_rate": 0.00185, "loss": 0.2559, "step": 926 }, { "epoch": 0.008046805149260858, "grad_norm": 0.07666015625, "learning_rate": 0.0018520000000000001, "loss": 0.2197, "step": 927 }, { "epoch": 0.008055485629465021, "grad_norm": 0.0859375, "learning_rate": 0.0018540000000000002, "loss": 0.2617, "step": 928 }, { "epoch": 0.008064166109669186, "grad_norm": 0.099609375, "learning_rate": 0.0018560000000000002, "loss": 0.2773, "step": 929 }, { "epoch": 0.008072846589873351, "grad_norm": 0.119140625, "learning_rate": 0.001858, "loss": 0.2676, "step": 930 }, { "epoch": 0.008081527070077517, "grad_norm": 0.09033203125, "learning_rate": 0.00186, "loss": 0.252, "step": 931 }, { "epoch": 0.008090207550281682, "grad_norm": 0.0859375, "learning_rate": 0.0018620000000000002, "loss": 0.2363, "step": 932 }, { "epoch": 0.008098888030485847, "grad_norm": 0.07275390625, "learning_rate": 0.0018640000000000002, "loss": 0.2188, "step": 933 }, { "epoch": 0.008107568510690012, "grad_norm": 0.1845703125, "learning_rate": 0.001866, "loss": 0.2637, "step": 934 }, { "epoch": 0.008116248990894177, "grad_norm": 0.080078125, "learning_rate": 0.001868, "loss": 0.2314, "step": 935 }, { "epoch": 0.00812492947109834, "grad_norm": 0.0947265625, "learning_rate": 0.0018700000000000001, "loss": 0.2969, "step": 936 }, { "epoch": 0.008133609951302505, "grad_norm": 0.10595703125, "learning_rate": 0.0018720000000000002, "loss": 0.3047, "step": 937 }, { "epoch": 0.00814229043150667, "grad_norm": 0.111328125, "learning_rate": 0.0018740000000000002, "loss": 0.2773, "step": 938 }, { "epoch": 0.008150970911710836, "grad_norm": 0.078125, "learning_rate": 0.0018759999999999998, "loss": 0.2422, "step": 939 }, { "epoch": 0.008159651391915, "grad_norm": 0.10107421875, "learning_rate": 0.001878, "loss": 0.3203, "step": 940 }, { "epoch": 0.008168331872119166, "grad_norm": 0.083984375, "learning_rate": 0.00188, "loss": 0.2773, "step": 941 }, { "epoch": 0.008177012352323331, "grad_norm": 0.07958984375, "learning_rate": 0.001882, "loss": 0.2637, "step": 942 }, { "epoch": 0.008185692832527496, "grad_norm": 0.09228515625, "learning_rate": 0.001884, "loss": 0.2383, "step": 943 }, { "epoch": 0.00819437331273166, "grad_norm": 0.08935546875, "learning_rate": 0.0018859999999999999, "loss": 0.3105, "step": 944 }, { "epoch": 0.008203053792935825, "grad_norm": 0.09375, "learning_rate": 0.001888, "loss": 0.252, "step": 945 }, { "epoch": 0.00821173427313999, "grad_norm": 0.09814453125, "learning_rate": 0.00189, "loss": 0.293, "step": 946 }, { "epoch": 0.008220414753344155, "grad_norm": 0.107421875, "learning_rate": 0.001892, "loss": 0.332, "step": 947 }, { "epoch": 0.00822909523354832, "grad_norm": 0.078125, "learning_rate": 0.0018939999999999999, "loss": 0.2168, "step": 948 }, { "epoch": 0.008237775713752485, "grad_norm": 0.0947265625, "learning_rate": 0.001896, "loss": 0.2676, "step": 949 }, { "epoch": 0.00824645619395665, "grad_norm": 0.08642578125, "learning_rate": 0.001898, "loss": 0.2266, "step": 950 }, { "epoch": 0.008255136674160815, "grad_norm": 0.0966796875, "learning_rate": 0.0019, "loss": 0.2578, "step": 951 }, { "epoch": 0.008263817154364979, "grad_norm": 0.08203125, "learning_rate": 0.001902, "loss": 0.2324, "step": 952 }, { "epoch": 0.008272497634569144, "grad_norm": 0.10302734375, "learning_rate": 0.0019039999999999999, "loss": 0.3008, "step": 953 }, { "epoch": 0.008281178114773309, "grad_norm": 0.11474609375, "learning_rate": 0.001906, "loss": 0.3066, "step": 954 }, { "epoch": 0.008289858594977474, "grad_norm": 0.1142578125, "learning_rate": 0.001908, "loss": 0.3516, "step": 955 }, { "epoch": 0.00829853907518164, "grad_norm": 0.068359375, "learning_rate": 0.00191, "loss": 0.2148, "step": 956 }, { "epoch": 0.008307219555385804, "grad_norm": 0.11328125, "learning_rate": 0.0019119999999999999, "loss": 0.291, "step": 957 }, { "epoch": 0.00831590003558997, "grad_norm": 0.14453125, "learning_rate": 0.001914, "loss": 0.5, "step": 958 }, { "epoch": 0.008324580515794133, "grad_norm": 0.107421875, "learning_rate": 0.001916, "loss": 0.2656, "step": 959 }, { "epoch": 0.008333260995998298, "grad_norm": 0.10693359375, "learning_rate": 0.001918, "loss": 0.3086, "step": 960 }, { "epoch": 0.008341941476202463, "grad_norm": 0.0830078125, "learning_rate": 0.00192, "loss": 0.2246, "step": 961 }, { "epoch": 0.008350621956406628, "grad_norm": 0.07666015625, "learning_rate": 0.0019219999999999999, "loss": 0.2422, "step": 962 }, { "epoch": 0.008359302436610793, "grad_norm": 0.09814453125, "learning_rate": 0.001924, "loss": 0.2637, "step": 963 }, { "epoch": 0.008367982916814959, "grad_norm": 0.10205078125, "learning_rate": 0.001926, "loss": 0.2578, "step": 964 }, { "epoch": 0.008376663397019124, "grad_norm": 0.11376953125, "learning_rate": 0.001928, "loss": 0.2402, "step": 965 }, { "epoch": 0.008385343877223289, "grad_norm": 0.11962890625, "learning_rate": 0.00193, "loss": 0.2617, "step": 966 }, { "epoch": 0.008394024357427452, "grad_norm": 0.091796875, "learning_rate": 0.001932, "loss": 0.2148, "step": 967 }, { "epoch": 0.008402704837631617, "grad_norm": 0.12109375, "learning_rate": 0.001934, "loss": 0.332, "step": 968 }, { "epoch": 0.008411385317835782, "grad_norm": 0.10986328125, "learning_rate": 0.001936, "loss": 0.2139, "step": 969 }, { "epoch": 0.008420065798039948, "grad_norm": 0.1044921875, "learning_rate": 0.001938, "loss": 0.2812, "step": 970 }, { "epoch": 0.008428746278244113, "grad_norm": 0.07177734375, "learning_rate": 0.0019399999999999999, "loss": 0.25, "step": 971 }, { "epoch": 0.008437426758448278, "grad_norm": 0.1025390625, "learning_rate": 0.001942, "loss": 0.2656, "step": 972 }, { "epoch": 0.008446107238652443, "grad_norm": 0.083984375, "learning_rate": 0.001944, "loss": 0.2422, "step": 973 }, { "epoch": 0.008454787718856608, "grad_norm": 0.0966796875, "learning_rate": 0.001946, "loss": 0.2617, "step": 974 }, { "epoch": 0.008463468199060771, "grad_norm": 0.1044921875, "learning_rate": 0.001948, "loss": 0.3086, "step": 975 }, { "epoch": 0.008472148679264936, "grad_norm": 0.0810546875, "learning_rate": 0.00195, "loss": 0.2832, "step": 976 }, { "epoch": 0.008480829159469102, "grad_norm": 0.095703125, "learning_rate": 0.001952, "loss": 0.2617, "step": 977 }, { "epoch": 0.008489509639673267, "grad_norm": 0.078125, "learning_rate": 0.001954, "loss": 0.2461, "step": 978 }, { "epoch": 0.008498190119877432, "grad_norm": 0.080078125, "learning_rate": 0.001956, "loss": 0.2168, "step": 979 }, { "epoch": 0.008506870600081597, "grad_norm": 0.07958984375, "learning_rate": 0.001958, "loss": 0.2266, "step": 980 }, { "epoch": 0.008515551080285762, "grad_norm": 0.10107421875, "learning_rate": 0.00196, "loss": 0.2373, "step": 981 }, { "epoch": 0.008524231560489925, "grad_norm": 0.08984375, "learning_rate": 0.001962, "loss": 0.2734, "step": 982 }, { "epoch": 0.00853291204069409, "grad_norm": 0.23046875, "learning_rate": 0.001964, "loss": 0.3574, "step": 983 }, { "epoch": 0.008541592520898256, "grad_norm": 0.08935546875, "learning_rate": 0.001966, "loss": 0.2324, "step": 984 }, { "epoch": 0.00855027300110242, "grad_norm": 0.1484375, "learning_rate": 0.001968, "loss": 0.2676, "step": 985 }, { "epoch": 0.008558953481306586, "grad_norm": 0.083984375, "learning_rate": 0.00197, "loss": 0.2139, "step": 986 }, { "epoch": 0.008567633961510751, "grad_norm": 0.1572265625, "learning_rate": 0.0019720000000000002, "loss": 0.2793, "step": 987 }, { "epoch": 0.008576314441714916, "grad_norm": 0.12353515625, "learning_rate": 0.001974, "loss": 0.3164, "step": 988 }, { "epoch": 0.008584994921919081, "grad_norm": 0.07666015625, "learning_rate": 0.001976, "loss": 0.2207, "step": 989 }, { "epoch": 0.008593675402123245, "grad_norm": 0.07861328125, "learning_rate": 0.001978, "loss": 0.2324, "step": 990 }, { "epoch": 0.00860235588232741, "grad_norm": 0.08544921875, "learning_rate": 0.00198, "loss": 0.2295, "step": 991 }, { "epoch": 0.008611036362531575, "grad_norm": 0.1259765625, "learning_rate": 0.001982, "loss": 0.3203, "step": 992 }, { "epoch": 0.00861971684273574, "grad_norm": 0.123046875, "learning_rate": 0.001984, "loss": 0.2656, "step": 993 }, { "epoch": 0.008628397322939905, "grad_norm": 0.10302734375, "learning_rate": 0.001986, "loss": 0.2637, "step": 994 }, { "epoch": 0.00863707780314407, "grad_norm": 0.08935546875, "learning_rate": 0.001988, "loss": 0.2383, "step": 995 }, { "epoch": 0.008645758283348235, "grad_norm": 0.07666015625, "learning_rate": 0.00199, "loss": 0.2178, "step": 996 }, { "epoch": 0.0086544387635524, "grad_norm": 0.1083984375, "learning_rate": 0.001992, "loss": 0.2832, "step": 997 }, { "epoch": 0.008663119243756564, "grad_norm": 0.0908203125, "learning_rate": 0.001994, "loss": 0.252, "step": 998 }, { "epoch": 0.008671799723960729, "grad_norm": 0.10791015625, "learning_rate": 0.001996, "loss": 0.2734, "step": 999 }, { "epoch": 0.008680480204164894, "grad_norm": 0.2236328125, "learning_rate": 0.001998, "loss": 0.5977, "step": 1000 }, { "epoch": 0.00868916068436906, "grad_norm": 0.10546875, "learning_rate": 0.002, "loss": 0.2871, "step": 1001 }, { "epoch": 0.008697841164573224, "grad_norm": 0.08154296875, "learning_rate": 0.0019999999995078867, "loss": 0.2734, "step": 1002 }, { "epoch": 0.00870652164477739, "grad_norm": 0.10595703125, "learning_rate": 0.0019999999980315473, "loss": 0.2891, "step": 1003 }, { "epoch": 0.008715202124981555, "grad_norm": 0.076171875, "learning_rate": 0.001999999995570981, "loss": 0.2578, "step": 1004 }, { "epoch": 0.008723882605185718, "grad_norm": 0.07861328125, "learning_rate": 0.001999999992126188, "loss": 0.2578, "step": 1005 }, { "epoch": 0.008732563085389883, "grad_norm": 0.1923828125, "learning_rate": 0.001999999987697169, "loss": 0.2949, "step": 1006 }, { "epoch": 0.008741243565594048, "grad_norm": 0.1435546875, "learning_rate": 0.0019999999822839236, "loss": 0.3164, "step": 1007 }, { "epoch": 0.008749924045798213, "grad_norm": 0.1279296875, "learning_rate": 0.0019999999758864516, "loss": 0.2676, "step": 1008 }, { "epoch": 0.008758604526002379, "grad_norm": 0.07861328125, "learning_rate": 0.0019999999685047527, "loss": 0.248, "step": 1009 }, { "epoch": 0.008767285006206544, "grad_norm": 0.0849609375, "learning_rate": 0.001999999960138828, "loss": 0.2754, "step": 1010 }, { "epoch": 0.008775965486410709, "grad_norm": 0.10302734375, "learning_rate": 0.0019999999507886767, "loss": 0.3027, "step": 1011 }, { "epoch": 0.008784645966614874, "grad_norm": 0.08984375, "learning_rate": 0.0019999999404542988, "loss": 0.2285, "step": 1012 }, { "epoch": 0.008793326446819037, "grad_norm": 0.06884765625, "learning_rate": 0.0019999999291356943, "loss": 0.2617, "step": 1013 }, { "epoch": 0.008802006927023202, "grad_norm": 0.111328125, "learning_rate": 0.001999999916832864, "loss": 0.2617, "step": 1014 }, { "epoch": 0.008810687407227367, "grad_norm": 0.080078125, "learning_rate": 0.001999999903545807, "loss": 0.3086, "step": 1015 }, { "epoch": 0.008819367887431533, "grad_norm": 0.08984375, "learning_rate": 0.0019999998892745235, "loss": 0.2383, "step": 1016 }, { "epoch": 0.008828048367635698, "grad_norm": 0.1005859375, "learning_rate": 0.0019999998740190135, "loss": 0.3086, "step": 1017 }, { "epoch": 0.008836728847839863, "grad_norm": 0.10205078125, "learning_rate": 0.001999999857779278, "loss": 0.2539, "step": 1018 }, { "epoch": 0.008845409328044028, "grad_norm": 0.08447265625, "learning_rate": 0.0019999998405553156, "loss": 0.2002, "step": 1019 }, { "epoch": 0.008854089808248193, "grad_norm": 0.08837890625, "learning_rate": 0.0019999998223471267, "loss": 0.332, "step": 1020 }, { "epoch": 0.008862770288452356, "grad_norm": 0.091796875, "learning_rate": 0.0019999998031547117, "loss": 0.2363, "step": 1021 }, { "epoch": 0.008871450768656522, "grad_norm": 0.091796875, "learning_rate": 0.0019999997829780705, "loss": 0.25, "step": 1022 }, { "epoch": 0.008880131248860687, "grad_norm": 0.0849609375, "learning_rate": 0.001999999761817203, "loss": 0.2412, "step": 1023 }, { "epoch": 0.008888811729064852, "grad_norm": 0.068359375, "learning_rate": 0.0019999997396721093, "loss": 0.2246, "step": 1024 }, { "epoch": 0.008897492209269017, "grad_norm": 0.0693359375, "learning_rate": 0.001999999716542789, "loss": 0.2324, "step": 1025 }, { "epoch": 0.008906172689473182, "grad_norm": 0.08837890625, "learning_rate": 0.001999999692429243, "loss": 0.3105, "step": 1026 }, { "epoch": 0.008914853169677347, "grad_norm": 0.10009765625, "learning_rate": 0.001999999667331471, "loss": 0.2422, "step": 1027 }, { "epoch": 0.008923533649881512, "grad_norm": 0.09716796875, "learning_rate": 0.001999999641249473, "loss": 0.2539, "step": 1028 }, { "epoch": 0.008932214130085676, "grad_norm": 0.103515625, "learning_rate": 0.0019999996141832482, "loss": 0.2812, "step": 1029 }, { "epoch": 0.00894089461028984, "grad_norm": 0.076171875, "learning_rate": 0.001999999586132798, "loss": 0.2539, "step": 1030 }, { "epoch": 0.008949575090494006, "grad_norm": 0.1298828125, "learning_rate": 0.0019999995570981212, "loss": 0.2891, "step": 1031 }, { "epoch": 0.008958255570698171, "grad_norm": 0.10693359375, "learning_rate": 0.001999999527079219, "loss": 0.25, "step": 1032 }, { "epoch": 0.008966936050902336, "grad_norm": 0.06689453125, "learning_rate": 0.0019999994960760905, "loss": 0.2266, "step": 1033 }, { "epoch": 0.008975616531106501, "grad_norm": 0.09228515625, "learning_rate": 0.0019999994640887363, "loss": 0.2754, "step": 1034 }, { "epoch": 0.008984297011310666, "grad_norm": 0.076171875, "learning_rate": 0.001999999431117156, "loss": 0.209, "step": 1035 }, { "epoch": 0.00899297749151483, "grad_norm": 0.091796875, "learning_rate": 0.00199999939716135, "loss": 0.249, "step": 1036 }, { "epoch": 0.009001657971718995, "grad_norm": 0.12451171875, "learning_rate": 0.0019999993622213177, "loss": 0.2422, "step": 1037 }, { "epoch": 0.00901033845192316, "grad_norm": 0.09033203125, "learning_rate": 0.0019999993262970603, "loss": 0.2363, "step": 1038 }, { "epoch": 0.009019018932127325, "grad_norm": 0.0908203125, "learning_rate": 0.0019999992893885766, "loss": 0.2871, "step": 1039 }, { "epoch": 0.00902769941233149, "grad_norm": 0.08740234375, "learning_rate": 0.0019999992514958677, "loss": 0.2734, "step": 1040 }, { "epoch": 0.009036379892535655, "grad_norm": 0.10791015625, "learning_rate": 0.0019999992126189326, "loss": 0.3496, "step": 1041 }, { "epoch": 0.00904506037273982, "grad_norm": 0.09716796875, "learning_rate": 0.001999999172757772, "loss": 0.2598, "step": 1042 }, { "epoch": 0.009053740852943986, "grad_norm": 0.1337890625, "learning_rate": 0.001999999131912386, "loss": 0.2422, "step": 1043 }, { "epoch": 0.009062421333148149, "grad_norm": 0.0927734375, "learning_rate": 0.0019999990900827747, "loss": 0.2305, "step": 1044 }, { "epoch": 0.009071101813352314, "grad_norm": 0.08740234375, "learning_rate": 0.0019999990472689376, "loss": 0.2324, "step": 1045 }, { "epoch": 0.00907978229355648, "grad_norm": 0.0732421875, "learning_rate": 0.001999999003470875, "loss": 0.2129, "step": 1046 }, { "epoch": 0.009088462773760644, "grad_norm": 0.103515625, "learning_rate": 0.0019999989586885875, "loss": 0.2891, "step": 1047 }, { "epoch": 0.00909714325396481, "grad_norm": 0.09033203125, "learning_rate": 0.0019999989129220745, "loss": 0.2891, "step": 1048 }, { "epoch": 0.009105823734168975, "grad_norm": 0.08154296875, "learning_rate": 0.001999998866171336, "loss": 0.2285, "step": 1049 }, { "epoch": 0.00911450421437314, "grad_norm": 0.10498046875, "learning_rate": 0.001999998818436372, "loss": 0.2402, "step": 1050 }, { "epoch": 0.009123184694577305, "grad_norm": 0.08935546875, "learning_rate": 0.0019999987697171834, "loss": 0.2363, "step": 1051 }, { "epoch": 0.009131865174781468, "grad_norm": 0.09521484375, "learning_rate": 0.0019999987200137693, "loss": 0.2012, "step": 1052 }, { "epoch": 0.009140545654985633, "grad_norm": 0.14453125, "learning_rate": 0.00199999866932613, "loss": 0.252, "step": 1053 }, { "epoch": 0.009149226135189798, "grad_norm": 0.08740234375, "learning_rate": 0.001999998617654266, "loss": 0.3047, "step": 1054 }, { "epoch": 0.009157906615393964, "grad_norm": 0.09912109375, "learning_rate": 0.001999998564998177, "loss": 0.2266, "step": 1055 }, { "epoch": 0.009166587095598129, "grad_norm": 0.08349609375, "learning_rate": 0.001999998511357863, "loss": 0.2539, "step": 1056 }, { "epoch": 0.009175267575802294, "grad_norm": 0.11279296875, "learning_rate": 0.001999998456733324, "loss": 0.2422, "step": 1057 }, { "epoch": 0.009183948056006459, "grad_norm": 0.09521484375, "learning_rate": 0.0019999984011245604, "loss": 0.2266, "step": 1058 }, { "epoch": 0.009192628536210622, "grad_norm": 0.091796875, "learning_rate": 0.0019999983445315723, "loss": 0.3203, "step": 1059 }, { "epoch": 0.009201309016414787, "grad_norm": 0.10400390625, "learning_rate": 0.001999998286954359, "loss": 0.3066, "step": 1060 }, { "epoch": 0.009209989496618953, "grad_norm": 0.11083984375, "learning_rate": 0.0019999982283929216, "loss": 0.3086, "step": 1061 }, { "epoch": 0.009218669976823118, "grad_norm": 0.068359375, "learning_rate": 0.0019999981688472593, "loss": 0.2227, "step": 1062 }, { "epoch": 0.009227350457027283, "grad_norm": 0.11474609375, "learning_rate": 0.0019999981083173727, "loss": 0.248, "step": 1063 }, { "epoch": 0.009236030937231448, "grad_norm": 0.0703125, "learning_rate": 0.0019999980468032616, "loss": 0.2451, "step": 1064 }, { "epoch": 0.009244711417435613, "grad_norm": 0.1083984375, "learning_rate": 0.001999997984304926, "loss": 0.2676, "step": 1065 }, { "epoch": 0.009253391897639778, "grad_norm": 0.0859375, "learning_rate": 0.0019999979208223666, "loss": 0.2402, "step": 1066 }, { "epoch": 0.009262072377843942, "grad_norm": 0.09716796875, "learning_rate": 0.0019999978563555827, "loss": 0.2871, "step": 1067 }, { "epoch": 0.009270752858048107, "grad_norm": 0.1318359375, "learning_rate": 0.0019999977909045744, "loss": 0.2773, "step": 1068 }, { "epoch": 0.009279433338252272, "grad_norm": 0.103515625, "learning_rate": 0.0019999977244693425, "loss": 0.2285, "step": 1069 }, { "epoch": 0.009288113818456437, "grad_norm": 0.08837890625, "learning_rate": 0.0019999976570498866, "loss": 0.248, "step": 1070 }, { "epoch": 0.009296794298660602, "grad_norm": 0.0751953125, "learning_rate": 0.0019999975886462063, "loss": 0.2852, "step": 1071 }, { "epoch": 0.009305474778864767, "grad_norm": 0.154296875, "learning_rate": 0.001999997519258303, "loss": 0.3711, "step": 1072 }, { "epoch": 0.009314155259068932, "grad_norm": 0.07373046875, "learning_rate": 0.001999997448886175, "loss": 0.2314, "step": 1073 }, { "epoch": 0.009322835739273097, "grad_norm": 0.08837890625, "learning_rate": 0.001999997377529824, "loss": 0.3281, "step": 1074 }, { "epoch": 0.00933151621947726, "grad_norm": 0.09814453125, "learning_rate": 0.001999997305189249, "loss": 0.2773, "step": 1075 }, { "epoch": 0.009340196699681426, "grad_norm": 0.09912109375, "learning_rate": 0.0019999972318644503, "loss": 0.2266, "step": 1076 }, { "epoch": 0.009348877179885591, "grad_norm": 0.1435546875, "learning_rate": 0.0019999971575554287, "loss": 0.2715, "step": 1077 }, { "epoch": 0.009357557660089756, "grad_norm": 0.0947265625, "learning_rate": 0.0019999970822621835, "loss": 0.332, "step": 1078 }, { "epoch": 0.009366238140293921, "grad_norm": 0.0869140625, "learning_rate": 0.001999997005984715, "loss": 0.2432, "step": 1079 }, { "epoch": 0.009374918620498086, "grad_norm": 0.11962890625, "learning_rate": 0.0019999969287230234, "loss": 0.2949, "step": 1080 }, { "epoch": 0.009383599100702252, "grad_norm": 0.076171875, "learning_rate": 0.0019999968504771084, "loss": 0.2461, "step": 1081 }, { "epoch": 0.009392279580906415, "grad_norm": 0.087890625, "learning_rate": 0.0019999967712469708, "loss": 0.1953, "step": 1082 }, { "epoch": 0.00940096006111058, "grad_norm": 0.1083984375, "learning_rate": 0.0019999966910326096, "loss": 0.3008, "step": 1083 }, { "epoch": 0.009409640541314745, "grad_norm": 0.1025390625, "learning_rate": 0.001999996609834026, "loss": 0.2852, "step": 1084 }, { "epoch": 0.00941832102151891, "grad_norm": 0.0810546875, "learning_rate": 0.0019999965276512196, "loss": 0.2441, "step": 1085 }, { "epoch": 0.009427001501723075, "grad_norm": 0.1171875, "learning_rate": 0.0019999964444841903, "loss": 0.252, "step": 1086 }, { "epoch": 0.00943568198192724, "grad_norm": 0.08935546875, "learning_rate": 0.001999996360332939, "loss": 0.2578, "step": 1087 }, { "epoch": 0.009444362462131406, "grad_norm": 0.12890625, "learning_rate": 0.0019999962751974646, "loss": 0.3242, "step": 1088 }, { "epoch": 0.00945304294233557, "grad_norm": 0.08349609375, "learning_rate": 0.0019999961890777677, "loss": 0.248, "step": 1089 }, { "epoch": 0.009461723422539734, "grad_norm": 0.1064453125, "learning_rate": 0.001999996101973849, "loss": 0.2324, "step": 1090 }, { "epoch": 0.0094704039027439, "grad_norm": 0.08203125, "learning_rate": 0.0019999960138857077, "loss": 0.2402, "step": 1091 }, { "epoch": 0.009479084382948064, "grad_norm": 0.1083984375, "learning_rate": 0.001999995924813345, "loss": 0.3281, "step": 1092 }, { "epoch": 0.00948776486315223, "grad_norm": 0.134765625, "learning_rate": 0.0019999958347567594, "loss": 0.2031, "step": 1093 }, { "epoch": 0.009496445343356395, "grad_norm": 0.076171875, "learning_rate": 0.0019999957437159517, "loss": 0.1943, "step": 1094 }, { "epoch": 0.00950512582356056, "grad_norm": 0.0751953125, "learning_rate": 0.001999995651690923, "loss": 0.2305, "step": 1095 }, { "epoch": 0.009513806303764725, "grad_norm": 0.11572265625, "learning_rate": 0.0019999955586816726, "loss": 0.2969, "step": 1096 }, { "epoch": 0.00952248678396889, "grad_norm": 0.0771484375, "learning_rate": 0.0019999954646882, "loss": 0.2031, "step": 1097 }, { "epoch": 0.009531167264173053, "grad_norm": 0.1279296875, "learning_rate": 0.001999995369710506, "loss": 0.2988, "step": 1098 }, { "epoch": 0.009539847744377218, "grad_norm": 0.09130859375, "learning_rate": 0.0019999952737485907, "loss": 0.2891, "step": 1099 }, { "epoch": 0.009548528224581384, "grad_norm": 0.08642578125, "learning_rate": 0.0019999951768024543, "loss": 0.2773, "step": 1100 }, { "epoch": 0.009557208704785549, "grad_norm": 0.2431640625, "learning_rate": 0.0019999950788720964, "loss": 0.1904, "step": 1101 }, { "epoch": 0.009565889184989714, "grad_norm": 0.0859375, "learning_rate": 0.0019999949799575176, "loss": 0.1973, "step": 1102 }, { "epoch": 0.009574569665193879, "grad_norm": 0.09033203125, "learning_rate": 0.0019999948800587175, "loss": 0.2344, "step": 1103 }, { "epoch": 0.009583250145398044, "grad_norm": 0.083984375, "learning_rate": 0.001999994779175697, "loss": 0.2412, "step": 1104 }, { "epoch": 0.00959193062560221, "grad_norm": 0.08154296875, "learning_rate": 0.0019999946773084556, "loss": 0.2578, "step": 1105 }, { "epoch": 0.009600611105806373, "grad_norm": 0.1357421875, "learning_rate": 0.001999994574456993, "loss": 0.248, "step": 1106 }, { "epoch": 0.009609291586010538, "grad_norm": 0.0810546875, "learning_rate": 0.0019999944706213103, "loss": 0.209, "step": 1107 }, { "epoch": 0.009617972066214703, "grad_norm": 0.06884765625, "learning_rate": 0.001999994365801407, "loss": 0.2578, "step": 1108 }, { "epoch": 0.009626652546418868, "grad_norm": 0.10009765625, "learning_rate": 0.001999994259997284, "loss": 0.3359, "step": 1109 }, { "epoch": 0.009635333026623033, "grad_norm": 0.08349609375, "learning_rate": 0.00199999415320894, "loss": 0.2891, "step": 1110 }, { "epoch": 0.009644013506827198, "grad_norm": 0.08447265625, "learning_rate": 0.001999994045436376, "loss": 0.2441, "step": 1111 }, { "epoch": 0.009652693987031363, "grad_norm": 0.0927734375, "learning_rate": 0.0019999939366795927, "loss": 0.252, "step": 1112 }, { "epoch": 0.009661374467235527, "grad_norm": 0.0830078125, "learning_rate": 0.001999993826938589, "loss": 0.2305, "step": 1113 }, { "epoch": 0.009670054947439692, "grad_norm": 0.09130859375, "learning_rate": 0.0019999937162133658, "loss": 0.2402, "step": 1114 }, { "epoch": 0.009678735427643857, "grad_norm": 0.083984375, "learning_rate": 0.0019999936045039224, "loss": 0.252, "step": 1115 }, { "epoch": 0.009687415907848022, "grad_norm": 0.09423828125, "learning_rate": 0.00199999349181026, "loss": 0.2451, "step": 1116 }, { "epoch": 0.009696096388052187, "grad_norm": 0.1201171875, "learning_rate": 0.0019999933781323785, "loss": 0.3516, "step": 1117 }, { "epoch": 0.009704776868256352, "grad_norm": 0.08740234375, "learning_rate": 0.0019999932634702775, "loss": 0.2422, "step": 1118 }, { "epoch": 0.009713457348460517, "grad_norm": 0.126953125, "learning_rate": 0.0019999931478239573, "loss": 0.2305, "step": 1119 }, { "epoch": 0.009722137828664682, "grad_norm": 0.21875, "learning_rate": 0.0019999930311934183, "loss": 0.7617, "step": 1120 }, { "epoch": 0.009730818308868846, "grad_norm": 0.0751953125, "learning_rate": 0.00199999291357866, "loss": 0.2266, "step": 1121 }, { "epoch": 0.009739498789073011, "grad_norm": 0.09326171875, "learning_rate": 0.0019999927949796836, "loss": 0.2773, "step": 1122 }, { "epoch": 0.009748179269277176, "grad_norm": 0.11083984375, "learning_rate": 0.0019999926753964882, "loss": 0.2578, "step": 1123 }, { "epoch": 0.009756859749481341, "grad_norm": 0.09130859375, "learning_rate": 0.0019999925548290745, "loss": 0.2734, "step": 1124 }, { "epoch": 0.009765540229685506, "grad_norm": 0.09619140625, "learning_rate": 0.0019999924332774425, "loss": 0.2617, "step": 1125 }, { "epoch": 0.009774220709889671, "grad_norm": 0.07568359375, "learning_rate": 0.001999992310741592, "loss": 0.2158, "step": 1126 }, { "epoch": 0.009782901190093837, "grad_norm": 0.0810546875, "learning_rate": 0.001999992187221524, "loss": 0.2188, "step": 1127 }, { "epoch": 0.009791581670298002, "grad_norm": 0.08251953125, "learning_rate": 0.001999992062717238, "loss": 0.2412, "step": 1128 }, { "epoch": 0.009800262150502165, "grad_norm": 0.08056640625, "learning_rate": 0.0019999919372287334, "loss": 0.248, "step": 1129 }, { "epoch": 0.00980894263070633, "grad_norm": 0.06494140625, "learning_rate": 0.001999991810756012, "loss": 0.1885, "step": 1130 }, { "epoch": 0.009817623110910495, "grad_norm": 0.0673828125, "learning_rate": 0.0019999916832990727, "loss": 0.2188, "step": 1131 }, { "epoch": 0.00982630359111466, "grad_norm": 0.07421875, "learning_rate": 0.001999991554857916, "loss": 0.2637, "step": 1132 }, { "epoch": 0.009834984071318826, "grad_norm": 0.10302734375, "learning_rate": 0.001999991425432542, "loss": 0.2695, "step": 1133 }, { "epoch": 0.00984366455152299, "grad_norm": 0.0830078125, "learning_rate": 0.001999991295022951, "loss": 0.2158, "step": 1134 }, { "epoch": 0.009852345031727156, "grad_norm": 0.07568359375, "learning_rate": 0.0019999911636291432, "loss": 0.2637, "step": 1135 }, { "epoch": 0.00986102551193132, "grad_norm": 0.197265625, "learning_rate": 0.0019999910312511185, "loss": 0.2559, "step": 1136 }, { "epoch": 0.009869705992135484, "grad_norm": 0.08740234375, "learning_rate": 0.0019999908978888766, "loss": 0.2344, "step": 1137 }, { "epoch": 0.00987838647233965, "grad_norm": 0.10400390625, "learning_rate": 0.0019999907635424186, "loss": 0.2256, "step": 1138 }, { "epoch": 0.009887066952543815, "grad_norm": 0.08984375, "learning_rate": 0.0019999906282117444, "loss": 0.2363, "step": 1139 }, { "epoch": 0.00989574743274798, "grad_norm": 0.07568359375, "learning_rate": 0.001999990491896854, "loss": 0.2324, "step": 1140 }, { "epoch": 0.009904427912952145, "grad_norm": 0.09521484375, "learning_rate": 0.0019999903545977475, "loss": 0.3125, "step": 1141 }, { "epoch": 0.00991310839315631, "grad_norm": 0.0673828125, "learning_rate": 0.001999990216314425, "loss": 0.2109, "step": 1142 }, { "epoch": 0.009921788873360475, "grad_norm": 0.08203125, "learning_rate": 0.0019999900770468863, "loss": 0.2109, "step": 1143 }, { "epoch": 0.009930469353564638, "grad_norm": 0.0791015625, "learning_rate": 0.0019999899367951325, "loss": 0.25, "step": 1144 }, { "epoch": 0.009939149833768804, "grad_norm": 0.08154296875, "learning_rate": 0.001999989795559163, "loss": 0.2119, "step": 1145 }, { "epoch": 0.009947830313972969, "grad_norm": 0.1279296875, "learning_rate": 0.001999989653338978, "loss": 0.3125, "step": 1146 }, { "epoch": 0.009956510794177134, "grad_norm": 0.087890625, "learning_rate": 0.0019999895101345784, "loss": 0.2715, "step": 1147 }, { "epoch": 0.009965191274381299, "grad_norm": 0.09130859375, "learning_rate": 0.0019999893659459634, "loss": 0.2129, "step": 1148 }, { "epoch": 0.009973871754585464, "grad_norm": 0.11181640625, "learning_rate": 0.001999989220773134, "loss": 0.2539, "step": 1149 }, { "epoch": 0.00998255223478963, "grad_norm": 0.09033203125, "learning_rate": 0.0019999890746160895, "loss": 0.2363, "step": 1150 }, { "epoch": 0.009991232714993794, "grad_norm": 0.11083984375, "learning_rate": 0.0019999889274748303, "loss": 0.3008, "step": 1151 }, { "epoch": 0.009999913195197958, "grad_norm": 0.0869140625, "learning_rate": 0.0019999887793493566, "loss": 0.2637, "step": 1152 }, { "epoch": 0.010008593675402123, "grad_norm": 0.10009765625, "learning_rate": 0.0019999886302396693, "loss": 0.2695, "step": 1153 }, { "epoch": 0.010017274155606288, "grad_norm": 0.0810546875, "learning_rate": 0.0019999884801457676, "loss": 0.2539, "step": 1154 }, { "epoch": 0.010025954635810453, "grad_norm": 0.078125, "learning_rate": 0.0019999883290676523, "loss": 0.2578, "step": 1155 }, { "epoch": 0.010034635116014618, "grad_norm": 0.0859375, "learning_rate": 0.0019999881770053226, "loss": 0.2334, "step": 1156 }, { "epoch": 0.010043315596218783, "grad_norm": 0.09912109375, "learning_rate": 0.00199998802395878, "loss": 0.2949, "step": 1157 }, { "epoch": 0.010051996076422948, "grad_norm": 0.0947265625, "learning_rate": 0.001999987869928024, "loss": 0.2422, "step": 1158 }, { "epoch": 0.010060676556627112, "grad_norm": 0.10205078125, "learning_rate": 0.001999987714913055, "loss": 0.2832, "step": 1159 }, { "epoch": 0.010069357036831277, "grad_norm": 0.08349609375, "learning_rate": 0.001999987558913872, "loss": 0.2422, "step": 1160 }, { "epoch": 0.010078037517035442, "grad_norm": 0.07861328125, "learning_rate": 0.0019999874019304767, "loss": 0.2432, "step": 1161 }, { "epoch": 0.010086717997239607, "grad_norm": 0.130859375, "learning_rate": 0.001999987243962869, "loss": 0.2969, "step": 1162 }, { "epoch": 0.010095398477443772, "grad_norm": 0.09228515625, "learning_rate": 0.0019999870850110485, "loss": 0.2812, "step": 1163 }, { "epoch": 0.010104078957647937, "grad_norm": 0.07470703125, "learning_rate": 0.0019999869250750158, "loss": 0.2109, "step": 1164 }, { "epoch": 0.010112759437852102, "grad_norm": 0.08447265625, "learning_rate": 0.0019999867641547707, "loss": 0.2559, "step": 1165 }, { "epoch": 0.010121439918056268, "grad_norm": 0.087890625, "learning_rate": 0.0019999866022503135, "loss": 0.2363, "step": 1166 }, { "epoch": 0.010130120398260431, "grad_norm": 0.123046875, "learning_rate": 0.0019999864393616448, "loss": 0.3125, "step": 1167 }, { "epoch": 0.010138800878464596, "grad_norm": 0.10791015625, "learning_rate": 0.0019999862754887642, "loss": 0.2441, "step": 1168 }, { "epoch": 0.010147481358668761, "grad_norm": 0.08984375, "learning_rate": 0.0019999861106316723, "loss": 0.2236, "step": 1169 }, { "epoch": 0.010156161838872926, "grad_norm": 0.076171875, "learning_rate": 0.001999985944790369, "loss": 0.2812, "step": 1170 }, { "epoch": 0.010164842319077091, "grad_norm": 0.10595703125, "learning_rate": 0.0019999857779648546, "loss": 0.3379, "step": 1171 }, { "epoch": 0.010173522799281257, "grad_norm": 0.091796875, "learning_rate": 0.0019999856101551292, "loss": 0.2324, "step": 1172 }, { "epoch": 0.010182203279485422, "grad_norm": 0.1337890625, "learning_rate": 0.001999985441361193, "loss": 0.4375, "step": 1173 }, { "epoch": 0.010190883759689587, "grad_norm": 0.0927734375, "learning_rate": 0.0019999852715830465, "loss": 0.2617, "step": 1174 }, { "epoch": 0.01019956423989375, "grad_norm": 0.078125, "learning_rate": 0.0019999851008206896, "loss": 0.1914, "step": 1175 }, { "epoch": 0.010208244720097915, "grad_norm": 0.0908203125, "learning_rate": 0.0019999849290741225, "loss": 0.252, "step": 1176 }, { "epoch": 0.01021692520030208, "grad_norm": 0.083984375, "learning_rate": 0.0019999847563433454, "loss": 0.2031, "step": 1177 }, { "epoch": 0.010225605680506246, "grad_norm": 0.08251953125, "learning_rate": 0.001999984582628358, "loss": 0.1982, "step": 1178 }, { "epoch": 0.01023428616071041, "grad_norm": 0.0869140625, "learning_rate": 0.001999984407929162, "loss": 0.2598, "step": 1179 }, { "epoch": 0.010242966640914576, "grad_norm": 0.1279296875, "learning_rate": 0.001999984232245756, "loss": 0.2656, "step": 1180 }, { "epoch": 0.010251647121118741, "grad_norm": 0.08740234375, "learning_rate": 0.0019999840555781404, "loss": 0.2246, "step": 1181 }, { "epoch": 0.010260327601322906, "grad_norm": 0.0791015625, "learning_rate": 0.0019999838779263166, "loss": 0.1943, "step": 1182 }, { "epoch": 0.01026900808152707, "grad_norm": 0.11767578125, "learning_rate": 0.001999983699290283, "loss": 0.2344, "step": 1183 }, { "epoch": 0.010277688561731235, "grad_norm": 0.111328125, "learning_rate": 0.0019999835196700413, "loss": 0.332, "step": 1184 }, { "epoch": 0.0102863690419354, "grad_norm": 0.154296875, "learning_rate": 0.001999983339065591, "loss": 0.3359, "step": 1185 }, { "epoch": 0.010295049522139565, "grad_norm": 0.095703125, "learning_rate": 0.001999983157476933, "loss": 0.332, "step": 1186 }, { "epoch": 0.01030373000234373, "grad_norm": 0.09765625, "learning_rate": 0.0019999829749040663, "loss": 0.293, "step": 1187 }, { "epoch": 0.010312410482547895, "grad_norm": 0.09228515625, "learning_rate": 0.001999982791346992, "loss": 0.2217, "step": 1188 }, { "epoch": 0.01032109096275206, "grad_norm": 0.1494140625, "learning_rate": 0.00199998260680571, "loss": 0.2539, "step": 1189 }, { "epoch": 0.010329771442956224, "grad_norm": 0.0869140625, "learning_rate": 0.0019999824212802203, "loss": 0.252, "step": 1190 }, { "epoch": 0.010338451923160389, "grad_norm": 0.1279296875, "learning_rate": 0.0019999822347705233, "loss": 0.2422, "step": 1191 }, { "epoch": 0.010347132403364554, "grad_norm": 0.1259765625, "learning_rate": 0.0019999820472766197, "loss": 0.2949, "step": 1192 }, { "epoch": 0.010355812883568719, "grad_norm": 0.1181640625, "learning_rate": 0.001999981858798509, "loss": 0.2852, "step": 1193 }, { "epoch": 0.010364493363772884, "grad_norm": 0.07666015625, "learning_rate": 0.0019999816693361916, "loss": 0.1719, "step": 1194 }, { "epoch": 0.010373173843977049, "grad_norm": 0.08203125, "learning_rate": 0.0019999814788896676, "loss": 0.2383, "step": 1195 }, { "epoch": 0.010381854324181214, "grad_norm": 0.109375, "learning_rate": 0.0019999812874589382, "loss": 0.2754, "step": 1196 }, { "epoch": 0.01039053480438538, "grad_norm": 0.1474609375, "learning_rate": 0.0019999810950440023, "loss": 0.2734, "step": 1197 }, { "epoch": 0.010399215284589543, "grad_norm": 0.10595703125, "learning_rate": 0.00199998090164486, "loss": 0.2422, "step": 1198 }, { "epoch": 0.010407895764793708, "grad_norm": 0.0859375, "learning_rate": 0.001999980707261513, "loss": 0.2539, "step": 1199 }, { "epoch": 0.010416576244997873, "grad_norm": 0.08251953125, "learning_rate": 0.00199998051189396, "loss": 0.2715, "step": 1200 }, { "epoch": 0.010425256725202038, "grad_norm": 0.11767578125, "learning_rate": 0.0019999803155422023, "loss": 0.3281, "step": 1201 }, { "epoch": 0.010433937205406203, "grad_norm": 0.11962890625, "learning_rate": 0.0019999801182062392, "loss": 0.4082, "step": 1202 }, { "epoch": 0.010442617685610368, "grad_norm": 0.10302734375, "learning_rate": 0.0019999799198860716, "loss": 0.2559, "step": 1203 }, { "epoch": 0.010451298165814533, "grad_norm": 0.08544921875, "learning_rate": 0.0019999797205816996, "loss": 0.252, "step": 1204 }, { "epoch": 0.010459978646018699, "grad_norm": 0.0791015625, "learning_rate": 0.001999979520293123, "loss": 0.2266, "step": 1205 }, { "epoch": 0.010468659126222862, "grad_norm": 0.10205078125, "learning_rate": 0.001999979319020343, "loss": 0.2344, "step": 1206 }, { "epoch": 0.010477339606427027, "grad_norm": 0.10498046875, "learning_rate": 0.0019999791167633583, "loss": 0.2695, "step": 1207 }, { "epoch": 0.010486020086631192, "grad_norm": 0.09814453125, "learning_rate": 0.00199997891352217, "loss": 0.2246, "step": 1208 }, { "epoch": 0.010494700566835357, "grad_norm": 0.08154296875, "learning_rate": 0.0019999787092967788, "loss": 0.2188, "step": 1209 }, { "epoch": 0.010503381047039522, "grad_norm": 0.07373046875, "learning_rate": 0.0019999785040871842, "loss": 0.2344, "step": 1210 }, { "epoch": 0.010512061527243688, "grad_norm": 0.11767578125, "learning_rate": 0.0019999782978933865, "loss": 0.2695, "step": 1211 }, { "epoch": 0.010520742007447853, "grad_norm": 0.10302734375, "learning_rate": 0.0019999780907153865, "loss": 0.2852, "step": 1212 }, { "epoch": 0.010529422487652016, "grad_norm": 0.1005859375, "learning_rate": 0.0019999778825531838, "loss": 0.2598, "step": 1213 }, { "epoch": 0.010538102967856181, "grad_norm": 0.0810546875, "learning_rate": 0.0019999776734067787, "loss": 0.2891, "step": 1214 }, { "epoch": 0.010546783448060346, "grad_norm": 0.080078125, "learning_rate": 0.0019999774632761713, "loss": 0.3008, "step": 1215 }, { "epoch": 0.010555463928264511, "grad_norm": 0.07958984375, "learning_rate": 0.001999977252161362, "loss": 0.2324, "step": 1216 }, { "epoch": 0.010564144408468677, "grad_norm": 0.09033203125, "learning_rate": 0.001999977040062352, "loss": 0.2734, "step": 1217 }, { "epoch": 0.010572824888672842, "grad_norm": 0.08349609375, "learning_rate": 0.0019999768269791393, "loss": 0.2539, "step": 1218 }, { "epoch": 0.010581505368877007, "grad_norm": 0.06396484375, "learning_rate": 0.0019999766129117267, "loss": 0.1826, "step": 1219 }, { "epoch": 0.010590185849081172, "grad_norm": 0.07470703125, "learning_rate": 0.0019999763978601126, "loss": 0.2695, "step": 1220 }, { "epoch": 0.010598866329285335, "grad_norm": 0.08056640625, "learning_rate": 0.0019999761818242975, "loss": 0.2637, "step": 1221 }, { "epoch": 0.0106075468094895, "grad_norm": 0.0986328125, "learning_rate": 0.0019999759648042827, "loss": 0.2471, "step": 1222 }, { "epoch": 0.010616227289693666, "grad_norm": 0.09814453125, "learning_rate": 0.0019999757468000673, "loss": 0.3125, "step": 1223 }, { "epoch": 0.01062490776989783, "grad_norm": 0.09521484375, "learning_rate": 0.0019999755278116518, "loss": 0.2871, "step": 1224 }, { "epoch": 0.010633588250101996, "grad_norm": 0.0869140625, "learning_rate": 0.001999975307839037, "loss": 0.2949, "step": 1225 }, { "epoch": 0.010642268730306161, "grad_norm": 0.07421875, "learning_rate": 0.0019999750868822225, "loss": 0.2178, "step": 1226 }, { "epoch": 0.010650949210510326, "grad_norm": 0.1513671875, "learning_rate": 0.0019999748649412088, "loss": 0.4844, "step": 1227 }, { "epoch": 0.010659629690714491, "grad_norm": 0.10205078125, "learning_rate": 0.001999974642015996, "loss": 0.2461, "step": 1228 }, { "epoch": 0.010668310170918655, "grad_norm": 0.07373046875, "learning_rate": 0.001999974418106585, "loss": 0.2158, "step": 1229 }, { "epoch": 0.01067699065112282, "grad_norm": 0.09228515625, "learning_rate": 0.0019999741932129745, "loss": 0.2246, "step": 1230 }, { "epoch": 0.010685671131326985, "grad_norm": 0.08349609375, "learning_rate": 0.0019999739673351668, "loss": 0.2393, "step": 1231 }, { "epoch": 0.01069435161153115, "grad_norm": 0.078125, "learning_rate": 0.00199997374047316, "loss": 0.2773, "step": 1232 }, { "epoch": 0.010703032091735315, "grad_norm": 0.08447265625, "learning_rate": 0.0019999735126269565, "loss": 0.2539, "step": 1233 }, { "epoch": 0.01071171257193948, "grad_norm": 0.083984375, "learning_rate": 0.0019999732837965548, "loss": 0.1895, "step": 1234 }, { "epoch": 0.010720393052143645, "grad_norm": 0.068359375, "learning_rate": 0.001999973053981956, "loss": 0.2236, "step": 1235 }, { "epoch": 0.010729073532347809, "grad_norm": 0.10205078125, "learning_rate": 0.00199997282318316, "loss": 0.2451, "step": 1236 }, { "epoch": 0.010737754012551974, "grad_norm": 0.10498046875, "learning_rate": 0.001999972591400168, "loss": 0.2598, "step": 1237 }, { "epoch": 0.010746434492756139, "grad_norm": 0.126953125, "learning_rate": 0.001999972358632979, "loss": 0.25, "step": 1238 }, { "epoch": 0.010755114972960304, "grad_norm": 0.11083984375, "learning_rate": 0.001999972124881594, "loss": 0.2441, "step": 1239 }, { "epoch": 0.010763795453164469, "grad_norm": 0.08349609375, "learning_rate": 0.0019999718901460126, "loss": 0.2988, "step": 1240 }, { "epoch": 0.010772475933368634, "grad_norm": 0.08154296875, "learning_rate": 0.0019999716544262356, "loss": 0.1875, "step": 1241 }, { "epoch": 0.0107811564135728, "grad_norm": 0.06787109375, "learning_rate": 0.001999971417722263, "loss": 0.2295, "step": 1242 }, { "epoch": 0.010789836893776964, "grad_norm": 0.08056640625, "learning_rate": 0.001999971180034095, "loss": 0.248, "step": 1243 }, { "epoch": 0.010798517373981128, "grad_norm": 0.0927734375, "learning_rate": 0.0019999709413617327, "loss": 0.2734, "step": 1244 }, { "epoch": 0.010807197854185293, "grad_norm": 0.0869140625, "learning_rate": 0.001999970701705175, "loss": 0.2246, "step": 1245 }, { "epoch": 0.010815878334389458, "grad_norm": 0.07080078125, "learning_rate": 0.0019999704610644234, "loss": 0.2051, "step": 1246 }, { "epoch": 0.010824558814593623, "grad_norm": 0.07958984375, "learning_rate": 0.0019999702194394777, "loss": 0.3223, "step": 1247 }, { "epoch": 0.010833239294797788, "grad_norm": 0.11181640625, "learning_rate": 0.001999969976830338, "loss": 0.2441, "step": 1248 }, { "epoch": 0.010841919775001953, "grad_norm": 0.076171875, "learning_rate": 0.001999969733237004, "loss": 0.209, "step": 1249 }, { "epoch": 0.010850600255206119, "grad_norm": 0.07666015625, "learning_rate": 0.001999969488659477, "loss": 0.209, "step": 1250 }, { "epoch": 0.010859280735410284, "grad_norm": 0.08544921875, "learning_rate": 0.001999969243097757, "loss": 0.2207, "step": 1251 }, { "epoch": 0.010867961215614447, "grad_norm": 0.0791015625, "learning_rate": 0.0019999689965518445, "loss": 0.2871, "step": 1252 }, { "epoch": 0.010876641695818612, "grad_norm": 0.0966796875, "learning_rate": 0.0019999687490217387, "loss": 0.2441, "step": 1253 }, { "epoch": 0.010885322176022777, "grad_norm": 0.076171875, "learning_rate": 0.0019999685005074415, "loss": 0.2539, "step": 1254 }, { "epoch": 0.010894002656226942, "grad_norm": 0.08642578125, "learning_rate": 0.0019999682510089514, "loss": 0.2324, "step": 1255 }, { "epoch": 0.010902683136431108, "grad_norm": 0.0791015625, "learning_rate": 0.00199996800052627, "loss": 0.2266, "step": 1256 }, { "epoch": 0.010911363616635273, "grad_norm": 0.07763671875, "learning_rate": 0.001999967749059397, "loss": 0.2285, "step": 1257 }, { "epoch": 0.010920044096839438, "grad_norm": 0.10986328125, "learning_rate": 0.0019999674966083326, "loss": 0.2734, "step": 1258 }, { "epoch": 0.010928724577043603, "grad_norm": 0.0869140625, "learning_rate": 0.0019999672431730777, "loss": 0.2109, "step": 1259 }, { "epoch": 0.010937405057247766, "grad_norm": 0.1123046875, "learning_rate": 0.0019999669887536316, "loss": 0.2773, "step": 1260 }, { "epoch": 0.010946085537451931, "grad_norm": 0.1572265625, "learning_rate": 0.001999966733349996, "loss": 0.2598, "step": 1261 }, { "epoch": 0.010954766017656097, "grad_norm": 0.0693359375, "learning_rate": 0.001999966476962169, "loss": 0.2188, "step": 1262 }, { "epoch": 0.010963446497860262, "grad_norm": 0.072265625, "learning_rate": 0.0019999662195901535, "loss": 0.2324, "step": 1263 }, { "epoch": 0.010972126978064427, "grad_norm": 0.11083984375, "learning_rate": 0.0019999659612339477, "loss": 0.3516, "step": 1264 }, { "epoch": 0.010980807458268592, "grad_norm": 0.09130859375, "learning_rate": 0.001999965701893553, "loss": 0.2305, "step": 1265 }, { "epoch": 0.010989487938472757, "grad_norm": 0.087890625, "learning_rate": 0.001999965441568969, "loss": 0.2617, "step": 1266 }, { "epoch": 0.01099816841867692, "grad_norm": 0.1005859375, "learning_rate": 0.0019999651802601968, "loss": 0.2578, "step": 1267 }, { "epoch": 0.011006848898881086, "grad_norm": 0.3671875, "learning_rate": 0.0019999649179672356, "loss": 0.3828, "step": 1268 }, { "epoch": 0.01101552937908525, "grad_norm": 0.1103515625, "learning_rate": 0.0019999646546900863, "loss": 0.3105, "step": 1269 }, { "epoch": 0.011024209859289416, "grad_norm": 0.111328125, "learning_rate": 0.0019999643904287496, "loss": 0.2461, "step": 1270 }, { "epoch": 0.011032890339493581, "grad_norm": 0.0888671875, "learning_rate": 0.0019999641251832252, "loss": 0.2539, "step": 1271 }, { "epoch": 0.011041570819697746, "grad_norm": 0.09814453125, "learning_rate": 0.001999963858953514, "loss": 0.2891, "step": 1272 }, { "epoch": 0.011050251299901911, "grad_norm": 0.11376953125, "learning_rate": 0.001999963591739615, "loss": 0.2793, "step": 1273 }, { "epoch": 0.011058931780106076, "grad_norm": 0.111328125, "learning_rate": 0.00199996332354153, "loss": 0.293, "step": 1274 }, { "epoch": 0.01106761226031024, "grad_norm": 0.1484375, "learning_rate": 0.001999963054359258, "loss": 0.25, "step": 1275 }, { "epoch": 0.011076292740514405, "grad_norm": 0.0869140625, "learning_rate": 0.0019999627841928006, "loss": 0.2266, "step": 1276 }, { "epoch": 0.01108497322071857, "grad_norm": 0.09130859375, "learning_rate": 0.001999962513042157, "loss": 0.2637, "step": 1277 }, { "epoch": 0.011093653700922735, "grad_norm": 0.0732421875, "learning_rate": 0.001999962240907328, "loss": 0.2383, "step": 1278 }, { "epoch": 0.0111023341811269, "grad_norm": 0.150390625, "learning_rate": 0.0019999619677883137, "loss": 0.3633, "step": 1279 }, { "epoch": 0.011111014661331065, "grad_norm": 0.07763671875, "learning_rate": 0.0019999616936851147, "loss": 0.2578, "step": 1280 }, { "epoch": 0.01111969514153523, "grad_norm": 0.1005859375, "learning_rate": 0.001999961418597731, "loss": 0.2598, "step": 1281 }, { "epoch": 0.011128375621739395, "grad_norm": 0.09423828125, "learning_rate": 0.0019999611425261634, "loss": 0.2832, "step": 1282 }, { "epoch": 0.011137056101943559, "grad_norm": 0.099609375, "learning_rate": 0.0019999608654704118, "loss": 0.2578, "step": 1283 }, { "epoch": 0.011145736582147724, "grad_norm": 0.08154296875, "learning_rate": 0.0019999605874304756, "loss": 0.3125, "step": 1284 }, { "epoch": 0.011154417062351889, "grad_norm": 0.416015625, "learning_rate": 0.001999960308406357, "loss": 0.6797, "step": 1285 }, { "epoch": 0.011163097542556054, "grad_norm": 0.07470703125, "learning_rate": 0.0019999600283980546, "loss": 0.1934, "step": 1286 }, { "epoch": 0.01117177802276022, "grad_norm": 0.09130859375, "learning_rate": 0.00199995974740557, "loss": 0.291, "step": 1287 }, { "epoch": 0.011180458502964384, "grad_norm": 0.09326171875, "learning_rate": 0.001999959465428903, "loss": 0.2852, "step": 1288 }, { "epoch": 0.01118913898316855, "grad_norm": 0.0673828125, "learning_rate": 0.0019999591824680536, "loss": 0.1953, "step": 1289 }, { "epoch": 0.011197819463372713, "grad_norm": 0.09423828125, "learning_rate": 0.001999958898523022, "loss": 0.2324, "step": 1290 }, { "epoch": 0.011206499943576878, "grad_norm": 0.095703125, "learning_rate": 0.0019999586135938095, "loss": 0.2422, "step": 1291 }, { "epoch": 0.011215180423781043, "grad_norm": 0.0966796875, "learning_rate": 0.0019999583276804154, "loss": 0.2656, "step": 1292 }, { "epoch": 0.011223860903985208, "grad_norm": 0.08251953125, "learning_rate": 0.0019999580407828402, "loss": 0.252, "step": 1293 }, { "epoch": 0.011232541384189373, "grad_norm": 0.07275390625, "learning_rate": 0.0019999577529010854, "loss": 0.209, "step": 1294 }, { "epoch": 0.011241221864393539, "grad_norm": 0.08154296875, "learning_rate": 0.0019999574640351494, "loss": 0.2393, "step": 1295 }, { "epoch": 0.011249902344597704, "grad_norm": 0.10009765625, "learning_rate": 0.0019999571741850337, "loss": 0.2217, "step": 1296 }, { "epoch": 0.011258582824801869, "grad_norm": 0.1484375, "learning_rate": 0.0019999568833507383, "loss": 0.2539, "step": 1297 }, { "epoch": 0.011267263305006032, "grad_norm": 0.09619140625, "learning_rate": 0.001999956591532264, "loss": 0.2617, "step": 1298 }, { "epoch": 0.011275943785210197, "grad_norm": 0.08203125, "learning_rate": 0.00199995629872961, "loss": 0.1953, "step": 1299 }, { "epoch": 0.011284624265414362, "grad_norm": 0.08935546875, "learning_rate": 0.0019999560049427777, "loss": 0.2246, "step": 1300 }, { "epoch": 0.011293304745618528, "grad_norm": 0.11181640625, "learning_rate": 0.0019999557101717668, "loss": 0.248, "step": 1301 }, { "epoch": 0.011301985225822693, "grad_norm": 0.0673828125, "learning_rate": 0.0019999554144165778, "loss": 0.1865, "step": 1302 }, { "epoch": 0.011310665706026858, "grad_norm": 0.142578125, "learning_rate": 0.0019999551176772116, "loss": 0.3086, "step": 1303 }, { "epoch": 0.011319346186231023, "grad_norm": 0.1123046875, "learning_rate": 0.0019999548199536674, "loss": 0.25, "step": 1304 }, { "epoch": 0.011328026666435188, "grad_norm": 0.07373046875, "learning_rate": 0.0019999545212459465, "loss": 0.1973, "step": 1305 }, { "epoch": 0.011336707146639351, "grad_norm": 0.09716796875, "learning_rate": 0.001999954221554049, "loss": 0.3066, "step": 1306 }, { "epoch": 0.011345387626843517, "grad_norm": 0.09521484375, "learning_rate": 0.001999953920877975, "loss": 0.2637, "step": 1307 }, { "epoch": 0.011354068107047682, "grad_norm": 0.1025390625, "learning_rate": 0.0019999536192177245, "loss": 0.2031, "step": 1308 }, { "epoch": 0.011362748587251847, "grad_norm": 0.1083984375, "learning_rate": 0.0019999533165732985, "loss": 0.3164, "step": 1309 }, { "epoch": 0.011371429067456012, "grad_norm": 0.08984375, "learning_rate": 0.001999953012944697, "loss": 0.2266, "step": 1310 }, { "epoch": 0.011380109547660177, "grad_norm": 0.09033203125, "learning_rate": 0.001999952708331921, "loss": 0.252, "step": 1311 }, { "epoch": 0.011388790027864342, "grad_norm": 0.078125, "learning_rate": 0.0019999524027349697, "loss": 0.2246, "step": 1312 }, { "epoch": 0.011397470508068505, "grad_norm": 0.09423828125, "learning_rate": 0.0019999520961538437, "loss": 0.2617, "step": 1313 }, { "epoch": 0.01140615098827267, "grad_norm": 0.10302734375, "learning_rate": 0.001999951788588544, "loss": 0.3242, "step": 1314 }, { "epoch": 0.011414831468476836, "grad_norm": 0.0908203125, "learning_rate": 0.0019999514800390704, "loss": 0.248, "step": 1315 }, { "epoch": 0.011423511948681, "grad_norm": 0.07470703125, "learning_rate": 0.0019999511705054234, "loss": 0.2227, "step": 1316 }, { "epoch": 0.011432192428885166, "grad_norm": 0.111328125, "learning_rate": 0.0019999508599876036, "loss": 0.2812, "step": 1317 }, { "epoch": 0.011440872909089331, "grad_norm": 0.08154296875, "learning_rate": 0.001999950548485611, "loss": 0.2598, "step": 1318 }, { "epoch": 0.011449553389293496, "grad_norm": 0.11767578125, "learning_rate": 0.0019999502359994456, "loss": 0.2617, "step": 1319 }, { "epoch": 0.011458233869497661, "grad_norm": 0.10595703125, "learning_rate": 0.0019999499225291087, "loss": 0.2988, "step": 1320 }, { "epoch": 0.011466914349701825, "grad_norm": 0.07861328125, "learning_rate": 0.0019999496080746, "loss": 0.2539, "step": 1321 }, { "epoch": 0.01147559482990599, "grad_norm": 0.06884765625, "learning_rate": 0.0019999492926359194, "loss": 0.2148, "step": 1322 }, { "epoch": 0.011484275310110155, "grad_norm": 0.06689453125, "learning_rate": 0.001999948976213068, "loss": 0.2422, "step": 1323 }, { "epoch": 0.01149295579031432, "grad_norm": 0.07177734375, "learning_rate": 0.001999948658806046, "loss": 0.2109, "step": 1324 }, { "epoch": 0.011501636270518485, "grad_norm": 0.09326171875, "learning_rate": 0.0019999483404148535, "loss": 0.332, "step": 1325 }, { "epoch": 0.01151031675072265, "grad_norm": 0.1298828125, "learning_rate": 0.0019999480210394914, "loss": 0.2793, "step": 1326 }, { "epoch": 0.011518997230926815, "grad_norm": 0.302734375, "learning_rate": 0.0019999477006799595, "loss": 0.457, "step": 1327 }, { "epoch": 0.01152767771113098, "grad_norm": 0.06201171875, "learning_rate": 0.0019999473793362583, "loss": 0.2266, "step": 1328 }, { "epoch": 0.011536358191335144, "grad_norm": 0.1005859375, "learning_rate": 0.001999947057008388, "loss": 0.3301, "step": 1329 }, { "epoch": 0.011545038671539309, "grad_norm": 0.1015625, "learning_rate": 0.001999946733696349, "loss": 0.2354, "step": 1330 }, { "epoch": 0.011553719151743474, "grad_norm": 0.0888671875, "learning_rate": 0.001999946409400142, "loss": 0.2246, "step": 1331 }, { "epoch": 0.01156239963194764, "grad_norm": 0.09619140625, "learning_rate": 0.001999946084119768, "loss": 0.2383, "step": 1332 }, { "epoch": 0.011571080112151804, "grad_norm": 0.11669921875, "learning_rate": 0.001999945757855225, "loss": 0.2656, "step": 1333 }, { "epoch": 0.01157976059235597, "grad_norm": 0.07861328125, "learning_rate": 0.0019999454306065157, "loss": 0.2422, "step": 1334 }, { "epoch": 0.011588441072560135, "grad_norm": 0.07958984375, "learning_rate": 0.0019999451023736394, "loss": 0.2559, "step": 1335 }, { "epoch": 0.0115971215527643, "grad_norm": 0.107421875, "learning_rate": 0.0019999447731565965, "loss": 0.2754, "step": 1336 }, { "epoch": 0.011605802032968463, "grad_norm": 0.0810546875, "learning_rate": 0.0019999444429553877, "loss": 0.2129, "step": 1337 }, { "epoch": 0.011614482513172628, "grad_norm": 0.08984375, "learning_rate": 0.0019999441117700134, "loss": 0.2324, "step": 1338 }, { "epoch": 0.011623162993376793, "grad_norm": 0.0751953125, "learning_rate": 0.0019999437796004733, "loss": 0.2217, "step": 1339 }, { "epoch": 0.011631843473580959, "grad_norm": 0.10595703125, "learning_rate": 0.0019999434464467686, "loss": 0.2598, "step": 1340 }, { "epoch": 0.011640523953785124, "grad_norm": 0.1357421875, "learning_rate": 0.001999943112308899, "loss": 0.2539, "step": 1341 }, { "epoch": 0.011649204433989289, "grad_norm": 0.091796875, "learning_rate": 0.001999942777186865, "loss": 0.3066, "step": 1342 }, { "epoch": 0.011657884914193454, "grad_norm": 0.1015625, "learning_rate": 0.0019999424410806674, "loss": 0.2969, "step": 1343 }, { "epoch": 0.011666565394397617, "grad_norm": 0.076171875, "learning_rate": 0.001999942103990306, "loss": 0.2051, "step": 1344 }, { "epoch": 0.011675245874601782, "grad_norm": 0.09619140625, "learning_rate": 0.0019999417659157816, "loss": 0.2539, "step": 1345 }, { "epoch": 0.011683926354805948, "grad_norm": 0.09375, "learning_rate": 0.0019999414268570947, "loss": 0.2354, "step": 1346 }, { "epoch": 0.011692606835010113, "grad_norm": 0.08740234375, "learning_rate": 0.001999941086814245, "loss": 0.3164, "step": 1347 }, { "epoch": 0.011701287315214278, "grad_norm": 0.1015625, "learning_rate": 0.001999940745787233, "loss": 0.2832, "step": 1348 }, { "epoch": 0.011709967795418443, "grad_norm": 0.0654296875, "learning_rate": 0.00199994040377606, "loss": 0.2266, "step": 1349 }, { "epoch": 0.011718648275622608, "grad_norm": 0.07861328125, "learning_rate": 0.001999940060780725, "loss": 0.248, "step": 1350 }, { "epoch": 0.011727328755826773, "grad_norm": 0.0986328125, "learning_rate": 0.0019999397168012295, "loss": 0.2188, "step": 1351 }, { "epoch": 0.011736009236030936, "grad_norm": 0.07275390625, "learning_rate": 0.0019999393718375734, "loss": 0.2236, "step": 1352 }, { "epoch": 0.011744689716235102, "grad_norm": 0.08740234375, "learning_rate": 0.001999939025889757, "loss": 0.2539, "step": 1353 }, { "epoch": 0.011753370196439267, "grad_norm": 0.08203125, "learning_rate": 0.0019999386789577808, "loss": 0.207, "step": 1354 }, { "epoch": 0.011762050676643432, "grad_norm": 0.10498046875, "learning_rate": 0.001999938331041645, "loss": 0.2363, "step": 1355 }, { "epoch": 0.011770731156847597, "grad_norm": 0.09033203125, "learning_rate": 0.0019999379821413507, "loss": 0.2461, "step": 1356 }, { "epoch": 0.011779411637051762, "grad_norm": 0.08642578125, "learning_rate": 0.0019999376322568977, "loss": 0.25, "step": 1357 }, { "epoch": 0.011788092117255927, "grad_norm": 0.1015625, "learning_rate": 0.001999937281388286, "loss": 0.2598, "step": 1358 }, { "epoch": 0.011796772597460092, "grad_norm": 0.08984375, "learning_rate": 0.0019999369295355166, "loss": 0.2539, "step": 1359 }, { "epoch": 0.011805453077664256, "grad_norm": 0.13671875, "learning_rate": 0.00199993657669859, "loss": 0.2285, "step": 1360 }, { "epoch": 0.01181413355786842, "grad_norm": 0.08056640625, "learning_rate": 0.001999936222877506, "loss": 0.2812, "step": 1361 }, { "epoch": 0.011822814038072586, "grad_norm": 0.0966796875, "learning_rate": 0.001999935868072265, "loss": 0.2637, "step": 1362 }, { "epoch": 0.011831494518276751, "grad_norm": 0.08740234375, "learning_rate": 0.001999935512282868, "loss": 0.2695, "step": 1363 }, { "epoch": 0.011840174998480916, "grad_norm": 0.07568359375, "learning_rate": 0.001999935155509315, "loss": 0.2422, "step": 1364 }, { "epoch": 0.011848855478685081, "grad_norm": 0.0791015625, "learning_rate": 0.0019999347977516066, "loss": 0.2559, "step": 1365 }, { "epoch": 0.011857535958889246, "grad_norm": 0.06982421875, "learning_rate": 0.0019999344390097423, "loss": 0.2158, "step": 1366 }, { "epoch": 0.01186621643909341, "grad_norm": 0.095703125, "learning_rate": 0.0019999340792837238, "loss": 0.3008, "step": 1367 }, { "epoch": 0.011874896919297575, "grad_norm": 0.068359375, "learning_rate": 0.001999933718573551, "loss": 0.2207, "step": 1368 }, { "epoch": 0.01188357739950174, "grad_norm": 0.08447265625, "learning_rate": 0.0019999333568792243, "loss": 0.1973, "step": 1369 }, { "epoch": 0.011892257879705905, "grad_norm": 0.08544921875, "learning_rate": 0.0019999329942007437, "loss": 0.2295, "step": 1370 }, { "epoch": 0.01190093835991007, "grad_norm": 0.1142578125, "learning_rate": 0.00199993263053811, "loss": 0.2734, "step": 1371 }, { "epoch": 0.011909618840114235, "grad_norm": 0.09521484375, "learning_rate": 0.001999932265891323, "loss": 0.252, "step": 1372 }, { "epoch": 0.0119182993203184, "grad_norm": 0.1328125, "learning_rate": 0.0019999319002603844, "loss": 0.2402, "step": 1373 }, { "epoch": 0.011926979800522566, "grad_norm": 0.1240234375, "learning_rate": 0.0019999315336452933, "loss": 0.2812, "step": 1374 }, { "epoch": 0.011935660280726729, "grad_norm": 0.08447265625, "learning_rate": 0.001999931166046051, "loss": 0.2432, "step": 1375 }, { "epoch": 0.011944340760930894, "grad_norm": 0.10498046875, "learning_rate": 0.001999930797462657, "loss": 0.2148, "step": 1376 }, { "epoch": 0.01195302124113506, "grad_norm": 0.0859375, "learning_rate": 0.0019999304278951126, "loss": 0.2324, "step": 1377 }, { "epoch": 0.011961701721339224, "grad_norm": 0.0927734375, "learning_rate": 0.001999930057343418, "loss": 0.2402, "step": 1378 }, { "epoch": 0.01197038220154339, "grad_norm": 0.0859375, "learning_rate": 0.0019999296858075727, "loss": 0.2852, "step": 1379 }, { "epoch": 0.011979062681747555, "grad_norm": 0.09814453125, "learning_rate": 0.0019999293132875783, "loss": 0.2344, "step": 1380 }, { "epoch": 0.01198774316195172, "grad_norm": 0.142578125, "learning_rate": 0.0019999289397834344, "loss": 0.3809, "step": 1381 }, { "epoch": 0.011996423642155885, "grad_norm": 0.0869140625, "learning_rate": 0.001999928565295142, "loss": 0.2051, "step": 1382 }, { "epoch": 0.012005104122360048, "grad_norm": 0.0908203125, "learning_rate": 0.001999928189822701, "loss": 0.2402, "step": 1383 }, { "epoch": 0.012013784602564213, "grad_norm": 0.0771484375, "learning_rate": 0.0019999278133661126, "loss": 0.2012, "step": 1384 }, { "epoch": 0.012022465082768378, "grad_norm": 0.0888671875, "learning_rate": 0.001999927435925376, "loss": 0.2363, "step": 1385 }, { "epoch": 0.012031145562972544, "grad_norm": 0.07861328125, "learning_rate": 0.0019999270575004925, "loss": 0.25, "step": 1386 }, { "epoch": 0.012039826043176709, "grad_norm": 0.34375, "learning_rate": 0.0019999266780914623, "loss": 0.3281, "step": 1387 }, { "epoch": 0.012048506523380874, "grad_norm": 0.08251953125, "learning_rate": 0.001999926297698286, "loss": 0.252, "step": 1388 }, { "epoch": 0.012057187003585039, "grad_norm": 0.08349609375, "learning_rate": 0.0019999259163209636, "loss": 0.2715, "step": 1389 }, { "epoch": 0.012065867483789202, "grad_norm": 0.07568359375, "learning_rate": 0.001999925533959496, "loss": 0.2148, "step": 1390 }, { "epoch": 0.012074547963993367, "grad_norm": 0.07373046875, "learning_rate": 0.0019999251506138834, "loss": 0.2402, "step": 1391 }, { "epoch": 0.012083228444197533, "grad_norm": 0.06396484375, "learning_rate": 0.0019999247662841257, "loss": 0.2324, "step": 1392 }, { "epoch": 0.012091908924401698, "grad_norm": 0.13671875, "learning_rate": 0.0019999243809702243, "loss": 0.3066, "step": 1393 }, { "epoch": 0.012100589404605863, "grad_norm": 0.09326171875, "learning_rate": 0.001999923994672179, "loss": 0.3828, "step": 1394 }, { "epoch": 0.012109269884810028, "grad_norm": 0.0673828125, "learning_rate": 0.00199992360738999, "loss": 0.2207, "step": 1395 }, { "epoch": 0.012117950365014193, "grad_norm": 0.07958984375, "learning_rate": 0.0019999232191236583, "loss": 0.252, "step": 1396 }, { "epoch": 0.012126630845218358, "grad_norm": 0.06591796875, "learning_rate": 0.0019999228298731844, "loss": 0.1924, "step": 1397 }, { "epoch": 0.012135311325422522, "grad_norm": 0.0654296875, "learning_rate": 0.0019999224396385676, "loss": 0.2207, "step": 1398 }, { "epoch": 0.012143991805626687, "grad_norm": 0.0693359375, "learning_rate": 0.00199992204841981, "loss": 0.2363, "step": 1399 }, { "epoch": 0.012152672285830852, "grad_norm": 0.09912109375, "learning_rate": 0.0019999216562169107, "loss": 0.3672, "step": 1400 }, { "epoch": 0.012161352766035017, "grad_norm": 0.10986328125, "learning_rate": 0.0019999212630298704, "loss": 0.3086, "step": 1401 }, { "epoch": 0.012170033246239182, "grad_norm": 0.08935546875, "learning_rate": 0.0019999208688586904, "loss": 0.25, "step": 1402 }, { "epoch": 0.012178713726443347, "grad_norm": 0.09033203125, "learning_rate": 0.00199992047370337, "loss": 0.2695, "step": 1403 }, { "epoch": 0.012187394206647512, "grad_norm": 0.08544921875, "learning_rate": 0.00199992007756391, "loss": 0.2324, "step": 1404 }, { "epoch": 0.012196074686851677, "grad_norm": 0.07177734375, "learning_rate": 0.001999919680440311, "loss": 0.2598, "step": 1405 }, { "epoch": 0.01220475516705584, "grad_norm": 0.06640625, "learning_rate": 0.0019999192823325737, "loss": 0.2637, "step": 1406 }, { "epoch": 0.012213435647260006, "grad_norm": 0.0810546875, "learning_rate": 0.001999918883240698, "loss": 0.1895, "step": 1407 }, { "epoch": 0.012222116127464171, "grad_norm": 0.07568359375, "learning_rate": 0.0019999184831646847, "loss": 0.2578, "step": 1408 }, { "epoch": 0.012230796607668336, "grad_norm": 0.0986328125, "learning_rate": 0.0019999180821045335, "loss": 0.2598, "step": 1409 }, { "epoch": 0.012239477087872501, "grad_norm": 0.06494140625, "learning_rate": 0.001999917680060246, "loss": 0.1953, "step": 1410 }, { "epoch": 0.012248157568076666, "grad_norm": 0.1044921875, "learning_rate": 0.001999917277031822, "loss": 0.2676, "step": 1411 }, { "epoch": 0.012256838048280832, "grad_norm": 0.1025390625, "learning_rate": 0.0019999168730192615, "loss": 0.334, "step": 1412 }, { "epoch": 0.012265518528484997, "grad_norm": 0.07666015625, "learning_rate": 0.001999916468022566, "loss": 0.2871, "step": 1413 }, { "epoch": 0.01227419900868916, "grad_norm": 0.099609375, "learning_rate": 0.001999916062041735, "loss": 0.3027, "step": 1414 }, { "epoch": 0.012282879488893325, "grad_norm": 0.10009765625, "learning_rate": 0.0019999156550767694, "loss": 0.2539, "step": 1415 }, { "epoch": 0.01229155996909749, "grad_norm": 0.08935546875, "learning_rate": 0.0019999152471276696, "loss": 0.2441, "step": 1416 }, { "epoch": 0.012300240449301655, "grad_norm": 0.07421875, "learning_rate": 0.0019999148381944355, "loss": 0.2598, "step": 1417 }, { "epoch": 0.01230892092950582, "grad_norm": 0.0888671875, "learning_rate": 0.001999914428277069, "loss": 0.2734, "step": 1418 }, { "epoch": 0.012317601409709986, "grad_norm": 0.09716796875, "learning_rate": 0.0019999140173755686, "loss": 0.2578, "step": 1419 }, { "epoch": 0.01232628188991415, "grad_norm": 0.080078125, "learning_rate": 0.0019999136054899367, "loss": 0.2266, "step": 1420 }, { "epoch": 0.012334962370118314, "grad_norm": 0.062255859375, "learning_rate": 0.0019999131926201723, "loss": 0.2217, "step": 1421 }, { "epoch": 0.01234364285032248, "grad_norm": 0.078125, "learning_rate": 0.0019999127787662767, "loss": 0.2832, "step": 1422 }, { "epoch": 0.012352323330526644, "grad_norm": 0.08837890625, "learning_rate": 0.0019999123639282495, "loss": 0.2715, "step": 1423 }, { "epoch": 0.01236100381073081, "grad_norm": 0.091796875, "learning_rate": 0.001999911948106092, "loss": 0.2617, "step": 1424 }, { "epoch": 0.012369684290934975, "grad_norm": 0.07177734375, "learning_rate": 0.001999911531299804, "loss": 0.2109, "step": 1425 }, { "epoch": 0.01237836477113914, "grad_norm": 0.056396484375, "learning_rate": 0.0019999111135093864, "loss": 0.1924, "step": 1426 }, { "epoch": 0.012387045251343305, "grad_norm": 0.06787109375, "learning_rate": 0.00199991069473484, "loss": 0.1992, "step": 1427 }, { "epoch": 0.01239572573154747, "grad_norm": 0.09423828125, "learning_rate": 0.0019999102749761644, "loss": 0.2578, "step": 1428 }, { "epoch": 0.012404406211751633, "grad_norm": 0.10009765625, "learning_rate": 0.00199990985423336, "loss": 0.25, "step": 1429 }, { "epoch": 0.012413086691955798, "grad_norm": 0.08984375, "learning_rate": 0.0019999094325064285, "loss": 0.2891, "step": 1430 }, { "epoch": 0.012421767172159964, "grad_norm": 0.10986328125, "learning_rate": 0.001999909009795369, "loss": 0.2812, "step": 1431 }, { "epoch": 0.012430447652364129, "grad_norm": 0.14453125, "learning_rate": 0.001999908586100183, "loss": 0.3008, "step": 1432 }, { "epoch": 0.012439128132568294, "grad_norm": 0.08349609375, "learning_rate": 0.00199990816142087, "loss": 0.2559, "step": 1433 }, { "epoch": 0.012447808612772459, "grad_norm": 0.08056640625, "learning_rate": 0.001999907735757431, "loss": 0.2217, "step": 1434 }, { "epoch": 0.012456489092976624, "grad_norm": 0.0869140625, "learning_rate": 0.001999907309109867, "loss": 0.2676, "step": 1435 }, { "epoch": 0.01246516957318079, "grad_norm": 0.115234375, "learning_rate": 0.0019999068814781774, "loss": 0.3359, "step": 1436 }, { "epoch": 0.012473850053384953, "grad_norm": 0.09912109375, "learning_rate": 0.001999906452862363, "loss": 0.2041, "step": 1437 }, { "epoch": 0.012482530533589118, "grad_norm": 0.1298828125, "learning_rate": 0.001999906023262425, "loss": 0.25, "step": 1438 }, { "epoch": 0.012491211013793283, "grad_norm": 0.08447265625, "learning_rate": 0.0019999055926783627, "loss": 0.2021, "step": 1439 }, { "epoch": 0.012499891493997448, "grad_norm": 0.0810546875, "learning_rate": 0.0019999051611101775, "loss": 0.2031, "step": 1440 }, { "epoch": 0.012508571974201613, "grad_norm": 0.068359375, "learning_rate": 0.0019999047285578697, "loss": 0.2324, "step": 1441 }, { "epoch": 0.012517252454405778, "grad_norm": 0.08154296875, "learning_rate": 0.0019999042950214394, "loss": 0.2539, "step": 1442 }, { "epoch": 0.012525932934609943, "grad_norm": 0.0859375, "learning_rate": 0.001999903860500887, "loss": 0.2715, "step": 1443 }, { "epoch": 0.012534613414814107, "grad_norm": 0.0625, "learning_rate": 0.0019999034249962135, "loss": 0.1484, "step": 1444 }, { "epoch": 0.012543293895018272, "grad_norm": 0.06494140625, "learning_rate": 0.001999902988507419, "loss": 0.2256, "step": 1445 }, { "epoch": 0.012551974375222437, "grad_norm": 0.1103515625, "learning_rate": 0.0019999025510345045, "loss": 0.2207, "step": 1446 }, { "epoch": 0.012560654855426602, "grad_norm": 0.08837890625, "learning_rate": 0.00199990211257747, "loss": 0.2422, "step": 1447 }, { "epoch": 0.012569335335630767, "grad_norm": 0.058837890625, "learning_rate": 0.001999901673136316, "loss": 0.2314, "step": 1448 }, { "epoch": 0.012578015815834932, "grad_norm": 0.08203125, "learning_rate": 0.0019999012327110433, "loss": 0.208, "step": 1449 }, { "epoch": 0.012586696296039097, "grad_norm": 0.0908203125, "learning_rate": 0.0019999007913016516, "loss": 0.2715, "step": 1450 }, { "epoch": 0.012595376776243263, "grad_norm": 0.28125, "learning_rate": 0.001999900348908142, "loss": 0.3086, "step": 1451 }, { "epoch": 0.012604057256447426, "grad_norm": 0.0703125, "learning_rate": 0.001999899905530515, "loss": 0.2344, "step": 1452 }, { "epoch": 0.012612737736651591, "grad_norm": 0.11279296875, "learning_rate": 0.001999899461168771, "loss": 0.332, "step": 1453 }, { "epoch": 0.012621418216855756, "grad_norm": 0.10400390625, "learning_rate": 0.0019998990158229106, "loss": 0.2891, "step": 1454 }, { "epoch": 0.012630098697059921, "grad_norm": 0.08154296875, "learning_rate": 0.001999898569492934, "loss": 0.2969, "step": 1455 }, { "epoch": 0.012638779177264086, "grad_norm": 0.08154296875, "learning_rate": 0.001999898122178842, "loss": 0.2051, "step": 1456 }, { "epoch": 0.012647459657468252, "grad_norm": 0.09521484375, "learning_rate": 0.0019998976738806345, "loss": 0.2539, "step": 1457 }, { "epoch": 0.012656140137672417, "grad_norm": 0.09033203125, "learning_rate": 0.001999897224598313, "loss": 0.2344, "step": 1458 }, { "epoch": 0.012664820617876582, "grad_norm": 0.06787109375, "learning_rate": 0.0019998967743318774, "loss": 0.252, "step": 1459 }, { "epoch": 0.012673501098080745, "grad_norm": 0.08740234375, "learning_rate": 0.001999896323081328, "loss": 0.2324, "step": 1460 }, { "epoch": 0.01268218157828491, "grad_norm": 0.09814453125, "learning_rate": 0.0019998958708466654, "loss": 0.3281, "step": 1461 }, { "epoch": 0.012690862058489075, "grad_norm": 0.08935546875, "learning_rate": 0.00199989541762789, "loss": 0.2461, "step": 1462 }, { "epoch": 0.01269954253869324, "grad_norm": 0.10205078125, "learning_rate": 0.001999894963425003, "loss": 0.3086, "step": 1463 }, { "epoch": 0.012708223018897406, "grad_norm": 0.06640625, "learning_rate": 0.001999894508238004, "loss": 0.1689, "step": 1464 }, { "epoch": 0.01271690349910157, "grad_norm": 0.08251953125, "learning_rate": 0.001999894052066894, "loss": 0.2852, "step": 1465 }, { "epoch": 0.012725583979305736, "grad_norm": 0.0625, "learning_rate": 0.0019998935949116737, "loss": 0.2129, "step": 1466 }, { "epoch": 0.0127342644595099, "grad_norm": 0.10107421875, "learning_rate": 0.0019998931367723426, "loss": 0.3242, "step": 1467 }, { "epoch": 0.012742944939714064, "grad_norm": 0.06787109375, "learning_rate": 0.0019998926776489028, "loss": 0.2441, "step": 1468 }, { "epoch": 0.01275162541991823, "grad_norm": 0.08447265625, "learning_rate": 0.001999892217541353, "loss": 0.2256, "step": 1469 }, { "epoch": 0.012760305900122395, "grad_norm": 0.09765625, "learning_rate": 0.001999891756449695, "loss": 0.2539, "step": 1470 }, { "epoch": 0.01276898638032656, "grad_norm": 0.08740234375, "learning_rate": 0.001999891294373929, "loss": 0.2578, "step": 1471 }, { "epoch": 0.012777666860530725, "grad_norm": 0.078125, "learning_rate": 0.001999890831314055, "loss": 0.2383, "step": 1472 }, { "epoch": 0.01278634734073489, "grad_norm": 0.09033203125, "learning_rate": 0.0019998903672700744, "loss": 0.1992, "step": 1473 }, { "epoch": 0.012795027820939055, "grad_norm": 0.08935546875, "learning_rate": 0.001999889902241987, "loss": 0.2656, "step": 1474 }, { "epoch": 0.012803708301143218, "grad_norm": 0.095703125, "learning_rate": 0.0019998894362297935, "loss": 0.252, "step": 1475 }, { "epoch": 0.012812388781347384, "grad_norm": 0.119140625, "learning_rate": 0.0019998889692334947, "loss": 0.3223, "step": 1476 }, { "epoch": 0.012821069261551549, "grad_norm": 0.0849609375, "learning_rate": 0.0019998885012530903, "loss": 0.25, "step": 1477 }, { "epoch": 0.012829749741755714, "grad_norm": 0.08984375, "learning_rate": 0.001999888032288582, "loss": 0.1875, "step": 1478 }, { "epoch": 0.012838430221959879, "grad_norm": 0.0693359375, "learning_rate": 0.001999887562339969, "loss": 0.1943, "step": 1479 }, { "epoch": 0.012847110702164044, "grad_norm": 0.13671875, "learning_rate": 0.001999887091407253, "loss": 0.2422, "step": 1480 }, { "epoch": 0.01285579118236821, "grad_norm": 0.0732421875, "learning_rate": 0.001999886619490434, "loss": 0.2129, "step": 1481 }, { "epoch": 0.012864471662572374, "grad_norm": 0.10888671875, "learning_rate": 0.001999886146589512, "loss": 0.2676, "step": 1482 }, { "epoch": 0.012873152142776538, "grad_norm": 0.08447265625, "learning_rate": 0.0019998856727044883, "loss": 0.2246, "step": 1483 }, { "epoch": 0.012881832622980703, "grad_norm": 0.0810546875, "learning_rate": 0.0019998851978353634, "loss": 0.2988, "step": 1484 }, { "epoch": 0.012890513103184868, "grad_norm": 0.09814453125, "learning_rate": 0.0019998847219821377, "loss": 0.2344, "step": 1485 }, { "epoch": 0.012899193583389033, "grad_norm": 0.140625, "learning_rate": 0.001999884245144811, "loss": 0.2539, "step": 1486 }, { "epoch": 0.012907874063593198, "grad_norm": 0.10107421875, "learning_rate": 0.0019998837673233846, "loss": 0.2324, "step": 1487 }, { "epoch": 0.012916554543797363, "grad_norm": 0.0693359375, "learning_rate": 0.001999883288517859, "loss": 0.2461, "step": 1488 }, { "epoch": 0.012925235024001528, "grad_norm": 0.08203125, "learning_rate": 0.0019998828087282343, "loss": 0.2432, "step": 1489 }, { "epoch": 0.012933915504205694, "grad_norm": 0.06689453125, "learning_rate": 0.0019998823279545113, "loss": 0.25, "step": 1490 }, { "epoch": 0.012942595984409857, "grad_norm": 0.076171875, "learning_rate": 0.0019998818461966906, "loss": 0.2148, "step": 1491 }, { "epoch": 0.012951276464614022, "grad_norm": 0.1083984375, "learning_rate": 0.001999881363454773, "loss": 0.3281, "step": 1492 }, { "epoch": 0.012959956944818187, "grad_norm": 0.10986328125, "learning_rate": 0.001999880879728758, "loss": 0.2969, "step": 1493 }, { "epoch": 0.012968637425022352, "grad_norm": 0.0791015625, "learning_rate": 0.0019998803950186475, "loss": 0.2285, "step": 1494 }, { "epoch": 0.012977317905226517, "grad_norm": 0.058349609375, "learning_rate": 0.001999879909324441, "loss": 0.1855, "step": 1495 }, { "epoch": 0.012985998385430682, "grad_norm": 0.5703125, "learning_rate": 0.001999879422646139, "loss": 0.5859, "step": 1496 }, { "epoch": 0.012994678865634848, "grad_norm": 0.103515625, "learning_rate": 0.001999878934983743, "loss": 0.2129, "step": 1497 }, { "epoch": 0.013003359345839011, "grad_norm": 0.06787109375, "learning_rate": 0.0019998784463372524, "loss": 0.1992, "step": 1498 }, { "epoch": 0.013012039826043176, "grad_norm": 0.08203125, "learning_rate": 0.0019998779567066685, "loss": 0.1768, "step": 1499 }, { "epoch": 0.013020720306247341, "grad_norm": 0.10546875, "learning_rate": 0.0019998774660919916, "loss": 0.248, "step": 1500 }, { "epoch": 0.013029400786451506, "grad_norm": 0.220703125, "learning_rate": 0.001999876974493222, "loss": 0.332, "step": 1501 }, { "epoch": 0.013038081266655671, "grad_norm": 0.09716796875, "learning_rate": 0.001999876481910361, "loss": 0.2617, "step": 1502 }, { "epoch": 0.013046761746859837, "grad_norm": 0.1044921875, "learning_rate": 0.001999875988343408, "loss": 0.252, "step": 1503 }, { "epoch": 0.013055442227064002, "grad_norm": 0.08349609375, "learning_rate": 0.001999875493792364, "loss": 0.2129, "step": 1504 }, { "epoch": 0.013064122707268167, "grad_norm": 0.07958984375, "learning_rate": 0.0019998749982572304, "loss": 0.2002, "step": 1505 }, { "epoch": 0.01307280318747233, "grad_norm": 0.09423828125, "learning_rate": 0.0019998745017380066, "loss": 0.2734, "step": 1506 }, { "epoch": 0.013081483667676495, "grad_norm": 0.08544921875, "learning_rate": 0.0019998740042346938, "loss": 0.2461, "step": 1507 }, { "epoch": 0.01309016414788066, "grad_norm": 0.07373046875, "learning_rate": 0.0019998735057472922, "loss": 0.2148, "step": 1508 }, { "epoch": 0.013098844628084826, "grad_norm": 0.07421875, "learning_rate": 0.0019998730062758025, "loss": 0.21, "step": 1509 }, { "epoch": 0.01310752510828899, "grad_norm": 0.076171875, "learning_rate": 0.001999872505820225, "loss": 0.2461, "step": 1510 }, { "epoch": 0.013116205588493156, "grad_norm": 0.0732421875, "learning_rate": 0.0019998720043805603, "loss": 0.2539, "step": 1511 }, { "epoch": 0.013124886068697321, "grad_norm": 0.0810546875, "learning_rate": 0.0019998715019568093, "loss": 0.2656, "step": 1512 }, { "epoch": 0.013133566548901486, "grad_norm": 0.0712890625, "learning_rate": 0.0019998709985489726, "loss": 0.2236, "step": 1513 }, { "epoch": 0.01314224702910565, "grad_norm": 0.07373046875, "learning_rate": 0.0019998704941570503, "loss": 0.1973, "step": 1514 }, { "epoch": 0.013150927509309815, "grad_norm": 0.1181640625, "learning_rate": 0.001999869988781043, "loss": 0.2598, "step": 1515 }, { "epoch": 0.01315960798951398, "grad_norm": 0.134765625, "learning_rate": 0.0019998694824209517, "loss": 0.2695, "step": 1516 }, { "epoch": 0.013168288469718145, "grad_norm": 0.11767578125, "learning_rate": 0.0019998689750767764, "loss": 0.2578, "step": 1517 }, { "epoch": 0.01317696894992231, "grad_norm": 0.08447265625, "learning_rate": 0.001999868466748518, "loss": 0.2363, "step": 1518 }, { "epoch": 0.013185649430126475, "grad_norm": 0.09228515625, "learning_rate": 0.001999867957436177, "loss": 0.293, "step": 1519 }, { "epoch": 0.01319432991033064, "grad_norm": 0.06787109375, "learning_rate": 0.001999867447139754, "loss": 0.1914, "step": 1520 }, { "epoch": 0.013203010390534804, "grad_norm": 0.09228515625, "learning_rate": 0.0019998669358592494, "loss": 0.2451, "step": 1521 }, { "epoch": 0.013211690870738969, "grad_norm": 0.09326171875, "learning_rate": 0.0019998664235946636, "loss": 0.2334, "step": 1522 }, { "epoch": 0.013220371350943134, "grad_norm": 0.09716796875, "learning_rate": 0.0019998659103459978, "loss": 0.2617, "step": 1523 }, { "epoch": 0.013229051831147299, "grad_norm": 0.06591796875, "learning_rate": 0.001999865396113252, "loss": 0.168, "step": 1524 }, { "epoch": 0.013237732311351464, "grad_norm": 0.08349609375, "learning_rate": 0.0019998648808964266, "loss": 0.248, "step": 1525 }, { "epoch": 0.01324641279155563, "grad_norm": 0.0771484375, "learning_rate": 0.001999864364695523, "loss": 0.2334, "step": 1526 }, { "epoch": 0.013255093271759794, "grad_norm": 0.05859375, "learning_rate": 0.0019998638475105406, "loss": 0.1562, "step": 1527 }, { "epoch": 0.01326377375196396, "grad_norm": 0.07275390625, "learning_rate": 0.0019998633293414813, "loss": 0.2109, "step": 1528 }, { "epoch": 0.013272454232168123, "grad_norm": 0.08056640625, "learning_rate": 0.0019998628101883446, "loss": 0.1924, "step": 1529 }, { "epoch": 0.013281134712372288, "grad_norm": 0.10595703125, "learning_rate": 0.001999862290051132, "loss": 0.2324, "step": 1530 }, { "epoch": 0.013289815192576453, "grad_norm": 0.0771484375, "learning_rate": 0.001999861768929843, "loss": 0.2256, "step": 1531 }, { "epoch": 0.013298495672780618, "grad_norm": 0.07373046875, "learning_rate": 0.0019998612468244787, "loss": 0.2188, "step": 1532 }, { "epoch": 0.013307176152984783, "grad_norm": 0.0693359375, "learning_rate": 0.00199986072373504, "loss": 0.1934, "step": 1533 }, { "epoch": 0.013315856633188948, "grad_norm": 0.08251953125, "learning_rate": 0.0019998601996615265, "loss": 0.2559, "step": 1534 }, { "epoch": 0.013324537113393113, "grad_norm": 0.07080078125, "learning_rate": 0.00199985967460394, "loss": 0.209, "step": 1535 }, { "epoch": 0.013333217593597279, "grad_norm": 0.09375, "learning_rate": 0.00199985914856228, "loss": 0.2441, "step": 1536 }, { "epoch": 0.013341898073801442, "grad_norm": 0.0849609375, "learning_rate": 0.001999858621536548, "loss": 0.2598, "step": 1537 }, { "epoch": 0.013350578554005607, "grad_norm": 0.080078125, "learning_rate": 0.001999858093526744, "loss": 0.2539, "step": 1538 }, { "epoch": 0.013359259034209772, "grad_norm": 0.08740234375, "learning_rate": 0.0019998575645328687, "loss": 0.1963, "step": 1539 }, { "epoch": 0.013367939514413937, "grad_norm": 0.0888671875, "learning_rate": 0.0019998570345549226, "loss": 0.2266, "step": 1540 }, { "epoch": 0.013376619994618102, "grad_norm": 0.10009765625, "learning_rate": 0.0019998565035929065, "loss": 0.3203, "step": 1541 }, { "epoch": 0.013385300474822268, "grad_norm": 0.07275390625, "learning_rate": 0.0019998559716468208, "loss": 0.2109, "step": 1542 }, { "epoch": 0.013393980955026433, "grad_norm": 0.0888671875, "learning_rate": 0.001999855438716666, "loss": 0.2598, "step": 1543 }, { "epoch": 0.013402661435230596, "grad_norm": 0.3671875, "learning_rate": 0.0019998549048024427, "loss": 0.3496, "step": 1544 }, { "epoch": 0.013411341915434761, "grad_norm": 0.08203125, "learning_rate": 0.001999854369904152, "loss": 0.2373, "step": 1545 }, { "epoch": 0.013420022395638926, "grad_norm": 0.07275390625, "learning_rate": 0.001999853834021794, "loss": 0.209, "step": 1546 }, { "epoch": 0.013428702875843091, "grad_norm": 0.1298828125, "learning_rate": 0.001999853297155369, "loss": 0.2148, "step": 1547 }, { "epoch": 0.013437383356047257, "grad_norm": 0.08349609375, "learning_rate": 0.0019998527593048784, "loss": 0.1836, "step": 1548 }, { "epoch": 0.013446063836251422, "grad_norm": 0.10498046875, "learning_rate": 0.0019998522204703224, "loss": 0.3203, "step": 1549 }, { "epoch": 0.013454744316455587, "grad_norm": 0.07861328125, "learning_rate": 0.001999851680651701, "loss": 0.1914, "step": 1550 }, { "epoch": 0.013463424796659752, "grad_norm": 0.1025390625, "learning_rate": 0.0019998511398490156, "loss": 0.2812, "step": 1551 }, { "epoch": 0.013472105276863915, "grad_norm": 0.244140625, "learning_rate": 0.0019998505980622664, "loss": 0.3457, "step": 1552 }, { "epoch": 0.01348078575706808, "grad_norm": 0.08740234375, "learning_rate": 0.0019998500552914546, "loss": 0.2598, "step": 1553 }, { "epoch": 0.013489466237272246, "grad_norm": 0.1044921875, "learning_rate": 0.0019998495115365797, "loss": 0.2246, "step": 1554 }, { "epoch": 0.01349814671747641, "grad_norm": 0.12255859375, "learning_rate": 0.001999848966797643, "loss": 0.293, "step": 1555 }, { "epoch": 0.013506827197680576, "grad_norm": 0.080078125, "learning_rate": 0.0019998484210746455, "loss": 0.2305, "step": 1556 }, { "epoch": 0.013515507677884741, "grad_norm": 0.091796875, "learning_rate": 0.0019998478743675865, "loss": 0.2129, "step": 1557 }, { "epoch": 0.013524188158088906, "grad_norm": 0.21875, "learning_rate": 0.001999847326676468, "loss": 0.3477, "step": 1558 }, { "epoch": 0.013532868638293071, "grad_norm": 0.0693359375, "learning_rate": 0.0019998467780012897, "loss": 0.1992, "step": 1559 }, { "epoch": 0.013541549118497235, "grad_norm": 0.0927734375, "learning_rate": 0.0019998462283420527, "loss": 0.2812, "step": 1560 }, { "epoch": 0.0135502295987014, "grad_norm": 0.0732421875, "learning_rate": 0.001999845677698757, "loss": 0.2383, "step": 1561 }, { "epoch": 0.013558910078905565, "grad_norm": 0.271484375, "learning_rate": 0.001999845126071404, "loss": 0.543, "step": 1562 }, { "epoch": 0.01356759055910973, "grad_norm": 0.07177734375, "learning_rate": 0.0019998445734599937, "loss": 0.2324, "step": 1563 }, { "epoch": 0.013576271039313895, "grad_norm": 0.0830078125, "learning_rate": 0.001999844019864527, "loss": 0.2422, "step": 1564 }, { "epoch": 0.01358495151951806, "grad_norm": 0.1455078125, "learning_rate": 0.001999843465285004, "loss": 0.2891, "step": 1565 }, { "epoch": 0.013593631999722225, "grad_norm": 0.0673828125, "learning_rate": 0.001999842909721426, "loss": 0.209, "step": 1566 }, { "epoch": 0.01360231247992639, "grad_norm": 0.109375, "learning_rate": 0.0019998423531737935, "loss": 0.3027, "step": 1567 }, { "epoch": 0.013610992960130554, "grad_norm": 0.08447265625, "learning_rate": 0.001999841795642107, "loss": 0.2891, "step": 1568 }, { "epoch": 0.013619673440334719, "grad_norm": 0.06787109375, "learning_rate": 0.0019998412371263668, "loss": 0.2422, "step": 1569 }, { "epoch": 0.013628353920538884, "grad_norm": 0.12353515625, "learning_rate": 0.0019998406776265735, "loss": 0.3398, "step": 1570 }, { "epoch": 0.013637034400743049, "grad_norm": 0.0966796875, "learning_rate": 0.001999840117142728, "loss": 0.2324, "step": 1571 }, { "epoch": 0.013645714880947214, "grad_norm": 0.169921875, "learning_rate": 0.001999839555674831, "loss": 0.3418, "step": 1572 }, { "epoch": 0.01365439536115138, "grad_norm": 0.09375, "learning_rate": 0.001999838993222883, "loss": 0.3398, "step": 1573 }, { "epoch": 0.013663075841355544, "grad_norm": 0.072265625, "learning_rate": 0.0019998384297868848, "loss": 0.2129, "step": 1574 }, { "epoch": 0.013671756321559708, "grad_norm": 0.07666015625, "learning_rate": 0.0019998378653668368, "loss": 0.2637, "step": 1575 }, { "epoch": 0.013680436801763873, "grad_norm": 0.08740234375, "learning_rate": 0.0019998372999627395, "loss": 0.2422, "step": 1576 }, { "epoch": 0.013689117281968038, "grad_norm": 0.0859375, "learning_rate": 0.001999836733574594, "loss": 0.2285, "step": 1577 }, { "epoch": 0.013697797762172203, "grad_norm": 0.0927734375, "learning_rate": 0.0019998361662023996, "loss": 0.2246, "step": 1578 }, { "epoch": 0.013706478242376368, "grad_norm": 0.10205078125, "learning_rate": 0.0019998355978461586, "loss": 0.2578, "step": 1579 }, { "epoch": 0.013715158722580533, "grad_norm": 0.0810546875, "learning_rate": 0.0019998350285058706, "loss": 0.2188, "step": 1580 }, { "epoch": 0.013723839202784699, "grad_norm": 0.0927734375, "learning_rate": 0.001999834458181537, "loss": 0.2598, "step": 1581 }, { "epoch": 0.013732519682988864, "grad_norm": 0.08740234375, "learning_rate": 0.0019998338868731573, "loss": 0.3125, "step": 1582 }, { "epoch": 0.013741200163193027, "grad_norm": 0.1455078125, "learning_rate": 0.0019998333145807333, "loss": 0.3594, "step": 1583 }, { "epoch": 0.013749880643397192, "grad_norm": 0.09033203125, "learning_rate": 0.0019998327413042654, "loss": 0.2178, "step": 1584 }, { "epoch": 0.013758561123601357, "grad_norm": 0.07373046875, "learning_rate": 0.001999832167043753, "loss": 0.1699, "step": 1585 }, { "epoch": 0.013767241603805522, "grad_norm": 0.08349609375, "learning_rate": 0.0019998315917991983, "loss": 0.2402, "step": 1586 }, { "epoch": 0.013775922084009688, "grad_norm": 0.07177734375, "learning_rate": 0.0019998310155706013, "loss": 0.1895, "step": 1587 }, { "epoch": 0.013784602564213853, "grad_norm": 0.07373046875, "learning_rate": 0.0019998304383579625, "loss": 0.2227, "step": 1588 }, { "epoch": 0.013793283044418018, "grad_norm": 0.06640625, "learning_rate": 0.0019998298601612828, "loss": 0.2012, "step": 1589 }, { "epoch": 0.013801963524622183, "grad_norm": 0.078125, "learning_rate": 0.001999829280980562, "loss": 0.2109, "step": 1590 }, { "epoch": 0.013810644004826346, "grad_norm": 0.072265625, "learning_rate": 0.001999828700815802, "loss": 0.2383, "step": 1591 }, { "epoch": 0.013819324485030511, "grad_norm": 0.08740234375, "learning_rate": 0.001999828119667003, "loss": 0.2539, "step": 1592 }, { "epoch": 0.013828004965234677, "grad_norm": 0.08251953125, "learning_rate": 0.001999827537534165, "loss": 0.2617, "step": 1593 }, { "epoch": 0.013836685445438842, "grad_norm": 0.06298828125, "learning_rate": 0.0019998269544172897, "loss": 0.2422, "step": 1594 }, { "epoch": 0.013845365925643007, "grad_norm": 0.06591796875, "learning_rate": 0.0019998263703163766, "loss": 0.2656, "step": 1595 }, { "epoch": 0.013854046405847172, "grad_norm": 0.061767578125, "learning_rate": 0.0019998257852314274, "loss": 0.2168, "step": 1596 }, { "epoch": 0.013862726886051337, "grad_norm": 0.076171875, "learning_rate": 0.001999825199162442, "loss": 0.2598, "step": 1597 }, { "epoch": 0.0138714073662555, "grad_norm": 0.1044921875, "learning_rate": 0.001999824612109421, "loss": 0.248, "step": 1598 }, { "epoch": 0.013880087846459666, "grad_norm": 0.0927734375, "learning_rate": 0.001999824024072366, "loss": 0.2871, "step": 1599 }, { "epoch": 0.01388876832666383, "grad_norm": 0.08935546875, "learning_rate": 0.0019998234350512762, "loss": 0.2871, "step": 1600 }, { "epoch": 0.013897448806867996, "grad_norm": 0.0654296875, "learning_rate": 0.0019998228450461533, "loss": 0.25, "step": 1601 }, { "epoch": 0.013906129287072161, "grad_norm": 0.07861328125, "learning_rate": 0.0019998222540569977, "loss": 0.2168, "step": 1602 }, { "epoch": 0.013914809767276326, "grad_norm": 0.10791015625, "learning_rate": 0.0019998216620838102, "loss": 0.2793, "step": 1603 }, { "epoch": 0.013923490247480491, "grad_norm": 0.0888671875, "learning_rate": 0.0019998210691265913, "loss": 0.2207, "step": 1604 }, { "epoch": 0.013932170727684656, "grad_norm": 0.0947265625, "learning_rate": 0.0019998204751853414, "loss": 0.291, "step": 1605 }, { "epoch": 0.01394085120788882, "grad_norm": 0.0869140625, "learning_rate": 0.0019998198802600614, "loss": 0.2441, "step": 1606 }, { "epoch": 0.013949531688092985, "grad_norm": 0.07861328125, "learning_rate": 0.001999819284350752, "loss": 0.2559, "step": 1607 }, { "epoch": 0.01395821216829715, "grad_norm": 0.08935546875, "learning_rate": 0.001999818687457413, "loss": 0.252, "step": 1608 }, { "epoch": 0.013966892648501315, "grad_norm": 0.06640625, "learning_rate": 0.0019998180895800465, "loss": 0.1865, "step": 1609 }, { "epoch": 0.01397557312870548, "grad_norm": 0.115234375, "learning_rate": 0.0019998174907186524, "loss": 0.2539, "step": 1610 }, { "epoch": 0.013984253608909645, "grad_norm": 0.07568359375, "learning_rate": 0.0019998168908732317, "loss": 0.2324, "step": 1611 }, { "epoch": 0.01399293408911381, "grad_norm": 0.08984375, "learning_rate": 0.0019998162900437843, "loss": 0.2402, "step": 1612 }, { "epoch": 0.014001614569317975, "grad_norm": 0.10791015625, "learning_rate": 0.0019998156882303116, "loss": 0.2812, "step": 1613 }, { "epoch": 0.014010295049522139, "grad_norm": 0.08544921875, "learning_rate": 0.0019998150854328134, "loss": 0.2246, "step": 1614 }, { "epoch": 0.014018975529726304, "grad_norm": 0.09375, "learning_rate": 0.0019998144816512917, "loss": 0.2637, "step": 1615 }, { "epoch": 0.014027656009930469, "grad_norm": 0.08642578125, "learning_rate": 0.0019998138768857463, "loss": 0.2656, "step": 1616 }, { "epoch": 0.014036336490134634, "grad_norm": 0.07275390625, "learning_rate": 0.0019998132711361778, "loss": 0.25, "step": 1617 }, { "epoch": 0.0140450169703388, "grad_norm": 0.10986328125, "learning_rate": 0.0019998126644025865, "loss": 0.2578, "step": 1618 }, { "epoch": 0.014053697450542964, "grad_norm": 0.0810546875, "learning_rate": 0.001999812056684974, "loss": 0.2812, "step": 1619 }, { "epoch": 0.01406237793074713, "grad_norm": 0.07666015625, "learning_rate": 0.0019998114479833407, "loss": 0.1621, "step": 1620 }, { "epoch": 0.014071058410951293, "grad_norm": 0.08984375, "learning_rate": 0.0019998108382976868, "loss": 0.2383, "step": 1621 }, { "epoch": 0.014079738891155458, "grad_norm": 0.06591796875, "learning_rate": 0.001999810227628014, "loss": 0.2246, "step": 1622 }, { "epoch": 0.014088419371359623, "grad_norm": 0.08642578125, "learning_rate": 0.0019998096159743214, "loss": 0.2314, "step": 1623 }, { "epoch": 0.014097099851563788, "grad_norm": 0.08984375, "learning_rate": 0.001999809003336611, "loss": 0.2637, "step": 1624 }, { "epoch": 0.014105780331767953, "grad_norm": 0.07958984375, "learning_rate": 0.0019998083897148832, "loss": 0.2246, "step": 1625 }, { "epoch": 0.014114460811972119, "grad_norm": 0.08984375, "learning_rate": 0.001999807775109138, "loss": 0.2324, "step": 1626 }, { "epoch": 0.014123141292176284, "grad_norm": 0.07275390625, "learning_rate": 0.0019998071595193766, "loss": 0.2129, "step": 1627 }, { "epoch": 0.014131821772380449, "grad_norm": 0.0810546875, "learning_rate": 0.0019998065429455997, "loss": 0.2207, "step": 1628 }, { "epoch": 0.014140502252584612, "grad_norm": 0.07373046875, "learning_rate": 0.001999805925387808, "loss": 0.2344, "step": 1629 }, { "epoch": 0.014149182732788777, "grad_norm": 0.099609375, "learning_rate": 0.0019998053068460016, "loss": 0.25, "step": 1630 }, { "epoch": 0.014157863212992942, "grad_norm": 0.1171875, "learning_rate": 0.001999804687320182, "loss": 0.2207, "step": 1631 }, { "epoch": 0.014166543693197108, "grad_norm": 0.078125, "learning_rate": 0.0019998040668103498, "loss": 0.2695, "step": 1632 }, { "epoch": 0.014175224173401273, "grad_norm": 0.14453125, "learning_rate": 0.001999803445316505, "loss": 0.4668, "step": 1633 }, { "epoch": 0.014183904653605438, "grad_norm": 0.07568359375, "learning_rate": 0.0019998028228386485, "loss": 0.2412, "step": 1634 }, { "epoch": 0.014192585133809603, "grad_norm": 0.083984375, "learning_rate": 0.0019998021993767818, "loss": 0.2314, "step": 1635 }, { "epoch": 0.014201265614013768, "grad_norm": 0.10498046875, "learning_rate": 0.0019998015749309044, "loss": 0.2754, "step": 1636 }, { "epoch": 0.014209946094217931, "grad_norm": 0.1298828125, "learning_rate": 0.0019998009495010177, "loss": 0.293, "step": 1637 }, { "epoch": 0.014218626574422097, "grad_norm": 0.1005859375, "learning_rate": 0.001999800323087122, "loss": 0.3281, "step": 1638 }, { "epoch": 0.014227307054626262, "grad_norm": 0.0654296875, "learning_rate": 0.0019997996956892185, "loss": 0.2246, "step": 1639 }, { "epoch": 0.014235987534830427, "grad_norm": 0.1123046875, "learning_rate": 0.0019997990673073078, "loss": 0.2578, "step": 1640 }, { "epoch": 0.014244668015034592, "grad_norm": 0.06396484375, "learning_rate": 0.0019997984379413894, "loss": 0.1748, "step": 1641 }, { "epoch": 0.014253348495238757, "grad_norm": 0.17578125, "learning_rate": 0.0019997978075914657, "loss": 0.875, "step": 1642 }, { "epoch": 0.014262028975442922, "grad_norm": 0.07763671875, "learning_rate": 0.0019997971762575366, "loss": 0.2168, "step": 1643 }, { "epoch": 0.014270709455647087, "grad_norm": 0.111328125, "learning_rate": 0.0019997965439396024, "loss": 0.2344, "step": 1644 }, { "epoch": 0.01427938993585125, "grad_norm": 0.08837890625, "learning_rate": 0.001999795910637665, "loss": 0.2314, "step": 1645 }, { "epoch": 0.014288070416055416, "grad_norm": 0.2294921875, "learning_rate": 0.0019997952763517236, "loss": 0.3125, "step": 1646 }, { "epoch": 0.014296750896259581, "grad_norm": 0.09765625, "learning_rate": 0.00199979464108178, "loss": 0.2949, "step": 1647 }, { "epoch": 0.014305431376463746, "grad_norm": 0.0712890625, "learning_rate": 0.0019997940048278344, "loss": 0.2158, "step": 1648 }, { "epoch": 0.014314111856667911, "grad_norm": 0.064453125, "learning_rate": 0.001999793367589887, "loss": 0.1777, "step": 1649 }, { "epoch": 0.014322792336872076, "grad_norm": 0.08251953125, "learning_rate": 0.00199979272936794, "loss": 0.2109, "step": 1650 }, { "epoch": 0.014331472817076241, "grad_norm": 0.09765625, "learning_rate": 0.0019997920901619927, "loss": 0.2637, "step": 1651 }, { "epoch": 0.014340153297280405, "grad_norm": 0.0810546875, "learning_rate": 0.0019997914499720465, "loss": 0.2773, "step": 1652 }, { "epoch": 0.01434883377748457, "grad_norm": 0.08154296875, "learning_rate": 0.001999790808798102, "loss": 0.1914, "step": 1653 }, { "epoch": 0.014357514257688735, "grad_norm": 0.126953125, "learning_rate": 0.001999790166640159, "loss": 0.2402, "step": 1654 }, { "epoch": 0.0143661947378929, "grad_norm": 0.091796875, "learning_rate": 0.0019997895234982197, "loss": 0.2461, "step": 1655 }, { "epoch": 0.014374875218097065, "grad_norm": 0.0791015625, "learning_rate": 0.001999788879372284, "loss": 0.248, "step": 1656 }, { "epoch": 0.01438355569830123, "grad_norm": 0.11279296875, "learning_rate": 0.001999788234262353, "loss": 0.2812, "step": 1657 }, { "epoch": 0.014392236178505395, "grad_norm": 0.08154296875, "learning_rate": 0.0019997875881684266, "loss": 0.2793, "step": 1658 }, { "epoch": 0.01440091665870956, "grad_norm": 0.091796875, "learning_rate": 0.0019997869410905062, "loss": 0.2295, "step": 1659 }, { "epoch": 0.014409597138913724, "grad_norm": 0.091796875, "learning_rate": 0.001999786293028592, "loss": 0.2402, "step": 1660 }, { "epoch": 0.014418277619117889, "grad_norm": 0.059814453125, "learning_rate": 0.0019997856439826858, "loss": 0.1855, "step": 1661 }, { "epoch": 0.014426958099322054, "grad_norm": 0.1005859375, "learning_rate": 0.001999784993952787, "loss": 0.2949, "step": 1662 }, { "epoch": 0.01443563857952622, "grad_norm": 0.09814453125, "learning_rate": 0.001999784342938897, "loss": 0.209, "step": 1663 }, { "epoch": 0.014444319059730384, "grad_norm": 0.0615234375, "learning_rate": 0.0019997836909410164, "loss": 0.25, "step": 1664 }, { "epoch": 0.01445299953993455, "grad_norm": 0.10986328125, "learning_rate": 0.0019997830379591456, "loss": 0.2656, "step": 1665 }, { "epoch": 0.014461680020138715, "grad_norm": 0.072265625, "learning_rate": 0.001999782383993286, "loss": 0.2041, "step": 1666 }, { "epoch": 0.01447036050034288, "grad_norm": 0.08544921875, "learning_rate": 0.0019997817290434376, "loss": 0.2422, "step": 1667 }, { "epoch": 0.014479040980547043, "grad_norm": 0.0615234375, "learning_rate": 0.0019997810731096012, "loss": 0.1904, "step": 1668 }, { "epoch": 0.014487721460751208, "grad_norm": 0.08447265625, "learning_rate": 0.001999780416191778, "loss": 0.2539, "step": 1669 }, { "epoch": 0.014496401940955373, "grad_norm": 0.08056640625, "learning_rate": 0.0019997797582899687, "loss": 0.2754, "step": 1670 }, { "epoch": 0.014505082421159539, "grad_norm": 0.08203125, "learning_rate": 0.0019997790994041734, "loss": 0.2207, "step": 1671 }, { "epoch": 0.014513762901363704, "grad_norm": 0.0712890625, "learning_rate": 0.001999778439534393, "loss": 0.2285, "step": 1672 }, { "epoch": 0.014522443381567869, "grad_norm": 0.1025390625, "learning_rate": 0.001999777778680629, "loss": 0.3398, "step": 1673 }, { "epoch": 0.014531123861772034, "grad_norm": 0.08837890625, "learning_rate": 0.001999777116842881, "loss": 0.1904, "step": 1674 }, { "epoch": 0.014539804341976197, "grad_norm": 0.10986328125, "learning_rate": 0.001999776454021151, "loss": 0.2734, "step": 1675 }, { "epoch": 0.014548484822180362, "grad_norm": 0.091796875, "learning_rate": 0.0019997757902154387, "loss": 0.2617, "step": 1676 }, { "epoch": 0.014557165302384528, "grad_norm": 0.09375, "learning_rate": 0.001999775125425745, "loss": 0.3008, "step": 1677 }, { "epoch": 0.014565845782588693, "grad_norm": 0.57421875, "learning_rate": 0.0019997744596520705, "loss": 0.6094, "step": 1678 }, { "epoch": 0.014574526262792858, "grad_norm": 0.06201171875, "learning_rate": 0.001999773792894416, "loss": 0.207, "step": 1679 }, { "epoch": 0.014583206742997023, "grad_norm": 0.091796875, "learning_rate": 0.001999773125152783, "loss": 0.2314, "step": 1680 }, { "epoch": 0.014591887223201188, "grad_norm": 0.076171875, "learning_rate": 0.001999772456427171, "loss": 0.1758, "step": 1681 }, { "epoch": 0.014600567703405353, "grad_norm": 0.08154296875, "learning_rate": 0.0019997717867175826, "loss": 0.2109, "step": 1682 }, { "epoch": 0.014609248183609517, "grad_norm": 0.1728515625, "learning_rate": 0.0019997711160240164, "loss": 0.2617, "step": 1683 }, { "epoch": 0.014617928663813682, "grad_norm": 0.0673828125, "learning_rate": 0.001999770444346474, "loss": 0.2305, "step": 1684 }, { "epoch": 0.014626609144017847, "grad_norm": 0.0732421875, "learning_rate": 0.001999769771684956, "loss": 0.1992, "step": 1685 }, { "epoch": 0.014635289624222012, "grad_norm": 0.09326171875, "learning_rate": 0.0019997690980394635, "loss": 0.2969, "step": 1686 }, { "epoch": 0.014643970104426177, "grad_norm": 0.09326171875, "learning_rate": 0.001999768423409997, "loss": 0.2988, "step": 1687 }, { "epoch": 0.014652650584630342, "grad_norm": 0.0673828125, "learning_rate": 0.0019997677477965577, "loss": 0.1943, "step": 1688 }, { "epoch": 0.014661331064834507, "grad_norm": 0.07568359375, "learning_rate": 0.0019997670711991455, "loss": 0.2129, "step": 1689 }, { "epoch": 0.014670011545038672, "grad_norm": 0.07958984375, "learning_rate": 0.0019997663936177613, "loss": 0.1992, "step": 1690 }, { "epoch": 0.014678692025242836, "grad_norm": 0.10888671875, "learning_rate": 0.0019997657150524067, "loss": 0.2539, "step": 1691 }, { "epoch": 0.014687372505447, "grad_norm": 0.0771484375, "learning_rate": 0.0019997650355030815, "loss": 0.2246, "step": 1692 }, { "epoch": 0.014696052985651166, "grad_norm": 0.06787109375, "learning_rate": 0.001999764354969787, "loss": 0.2275, "step": 1693 }, { "epoch": 0.014704733465855331, "grad_norm": 0.07958984375, "learning_rate": 0.0019997636734525237, "loss": 0.2148, "step": 1694 }, { "epoch": 0.014713413946059496, "grad_norm": 0.12060546875, "learning_rate": 0.001999762990951292, "loss": 0.3242, "step": 1695 }, { "epoch": 0.014722094426263661, "grad_norm": 0.064453125, "learning_rate": 0.001999762307466093, "loss": 0.2383, "step": 1696 }, { "epoch": 0.014730774906467826, "grad_norm": 0.07666015625, "learning_rate": 0.0019997616229969276, "loss": 0.248, "step": 1697 }, { "epoch": 0.01473945538667199, "grad_norm": 0.0947265625, "learning_rate": 0.0019997609375437967, "loss": 0.2988, "step": 1698 }, { "epoch": 0.014748135866876155, "grad_norm": 0.0712890625, "learning_rate": 0.0019997602511067003, "loss": 0.2617, "step": 1699 }, { "epoch": 0.01475681634708032, "grad_norm": 0.078125, "learning_rate": 0.0019997595636856397, "loss": 0.2891, "step": 1700 }, { "epoch": 0.014765496827284485, "grad_norm": 0.07763671875, "learning_rate": 0.001999758875280616, "loss": 0.2002, "step": 1701 }, { "epoch": 0.01477417730748865, "grad_norm": 0.09423828125, "learning_rate": 0.0019997581858916293, "loss": 0.2656, "step": 1702 }, { "epoch": 0.014782857787692815, "grad_norm": 0.09228515625, "learning_rate": 0.0019997574955186804, "loss": 0.3457, "step": 1703 }, { "epoch": 0.01479153826789698, "grad_norm": 0.11572265625, "learning_rate": 0.0019997568041617703, "loss": 0.2344, "step": 1704 }, { "epoch": 0.014800218748101146, "grad_norm": 0.07666015625, "learning_rate": 0.0019997561118208994, "loss": 0.1973, "step": 1705 }, { "epoch": 0.014808899228305309, "grad_norm": 0.0810546875, "learning_rate": 0.001999755418496069, "loss": 0.2695, "step": 1706 }, { "epoch": 0.014817579708509474, "grad_norm": 0.07666015625, "learning_rate": 0.00199975472418728, "loss": 0.2354, "step": 1707 }, { "epoch": 0.01482626018871364, "grad_norm": 0.10986328125, "learning_rate": 0.001999754028894532, "loss": 0.1934, "step": 1708 }, { "epoch": 0.014834940668917804, "grad_norm": 0.07080078125, "learning_rate": 0.001999753332617827, "loss": 0.208, "step": 1709 }, { "epoch": 0.01484362114912197, "grad_norm": 0.09033203125, "learning_rate": 0.0019997526353571654, "loss": 0.2617, "step": 1710 }, { "epoch": 0.014852301629326135, "grad_norm": 0.0703125, "learning_rate": 0.0019997519371125474, "loss": 0.2393, "step": 1711 }, { "epoch": 0.0148609821095303, "grad_norm": 0.10888671875, "learning_rate": 0.001999751237883975, "loss": 0.2715, "step": 1712 }, { "epoch": 0.014869662589734465, "grad_norm": 0.1455078125, "learning_rate": 0.001999750537671447, "loss": 0.2852, "step": 1713 }, { "epoch": 0.014878343069938628, "grad_norm": 0.083984375, "learning_rate": 0.0019997498364749656, "loss": 0.2266, "step": 1714 }, { "epoch": 0.014887023550142793, "grad_norm": 0.07421875, "learning_rate": 0.001999749134294532, "loss": 0.2285, "step": 1715 }, { "epoch": 0.014895704030346959, "grad_norm": 0.1025390625, "learning_rate": 0.0019997484311301454, "loss": 0.252, "step": 1716 }, { "epoch": 0.014904384510551124, "grad_norm": 0.0771484375, "learning_rate": 0.001999747726981808, "loss": 0.2578, "step": 1717 }, { "epoch": 0.014913064990755289, "grad_norm": 0.07080078125, "learning_rate": 0.00199974702184952, "loss": 0.2168, "step": 1718 }, { "epoch": 0.014921745470959454, "grad_norm": 0.1083984375, "learning_rate": 0.001999746315733282, "loss": 0.2305, "step": 1719 }, { "epoch": 0.014930425951163619, "grad_norm": 0.08056640625, "learning_rate": 0.001999745608633095, "loss": 0.2852, "step": 1720 }, { "epoch": 0.014939106431367784, "grad_norm": 0.0693359375, "learning_rate": 0.00199974490054896, "loss": 0.2275, "step": 1721 }, { "epoch": 0.014947786911571947, "grad_norm": 0.09130859375, "learning_rate": 0.001999744191480877, "loss": 0.2969, "step": 1722 }, { "epoch": 0.014956467391776113, "grad_norm": 0.171875, "learning_rate": 0.0019997434814288477, "loss": 0.3086, "step": 1723 }, { "epoch": 0.014965147871980278, "grad_norm": 0.08349609375, "learning_rate": 0.001999742770392872, "loss": 0.2559, "step": 1724 }, { "epoch": 0.014973828352184443, "grad_norm": 0.10791015625, "learning_rate": 0.001999742058372951, "loss": 0.3418, "step": 1725 }, { "epoch": 0.014982508832388608, "grad_norm": 0.09326171875, "learning_rate": 0.0019997413453690864, "loss": 0.2539, "step": 1726 }, { "epoch": 0.014991189312592773, "grad_norm": 0.087890625, "learning_rate": 0.0019997406313812774, "loss": 0.3047, "step": 1727 }, { "epoch": 0.014999869792796938, "grad_norm": 0.07958984375, "learning_rate": 0.0019997399164095263, "loss": 0.2402, "step": 1728 }, { "epoch": 0.015008550273001102, "grad_norm": 0.0771484375, "learning_rate": 0.0019997392004538327, "loss": 0.2021, "step": 1729 }, { "epoch": 0.015017230753205267, "grad_norm": 0.1015625, "learning_rate": 0.001999738483514198, "loss": 0.2539, "step": 1730 }, { "epoch": 0.015025911233409432, "grad_norm": 0.0712890625, "learning_rate": 0.0019997377655906227, "loss": 0.2344, "step": 1731 }, { "epoch": 0.015034591713613597, "grad_norm": 0.103515625, "learning_rate": 0.0019997370466831076, "loss": 0.3164, "step": 1732 }, { "epoch": 0.015043272193817762, "grad_norm": 0.337890625, "learning_rate": 0.001999736326791654, "loss": 0.4219, "step": 1733 }, { "epoch": 0.015051952674021927, "grad_norm": 0.09619140625, "learning_rate": 0.0019997356059162615, "loss": 0.2715, "step": 1734 }, { "epoch": 0.015060633154226092, "grad_norm": 0.1083984375, "learning_rate": 0.0019997348840569322, "loss": 0.2334, "step": 1735 }, { "epoch": 0.015069313634430257, "grad_norm": 0.080078125, "learning_rate": 0.0019997341612136665, "loss": 0.1836, "step": 1736 }, { "epoch": 0.01507799411463442, "grad_norm": 0.11181640625, "learning_rate": 0.001999733437386465, "loss": 0.2773, "step": 1737 }, { "epoch": 0.015086674594838586, "grad_norm": 0.138671875, "learning_rate": 0.001999732712575328, "loss": 0.2461, "step": 1738 }, { "epoch": 0.015095355075042751, "grad_norm": 0.07568359375, "learning_rate": 0.001999731986780257, "loss": 0.2461, "step": 1739 }, { "epoch": 0.015104035555246916, "grad_norm": 0.0703125, "learning_rate": 0.0019997312600012526, "loss": 0.1787, "step": 1740 }, { "epoch": 0.015112716035451081, "grad_norm": 0.08203125, "learning_rate": 0.0019997305322383163, "loss": 0.2168, "step": 1741 }, { "epoch": 0.015121396515655246, "grad_norm": 0.1357421875, "learning_rate": 0.0019997298034914474, "loss": 0.2324, "step": 1742 }, { "epoch": 0.015130076995859412, "grad_norm": 0.08251953125, "learning_rate": 0.0019997290737606478, "loss": 0.2422, "step": 1743 }, { "epoch": 0.015138757476063577, "grad_norm": 0.0751953125, "learning_rate": 0.001999728343045918, "loss": 0.2354, "step": 1744 }, { "epoch": 0.01514743795626774, "grad_norm": 0.10009765625, "learning_rate": 0.001999727611347259, "loss": 0.2451, "step": 1745 }, { "epoch": 0.015156118436471905, "grad_norm": 0.12158203125, "learning_rate": 0.0019997268786646713, "loss": 0.3359, "step": 1746 }, { "epoch": 0.01516479891667607, "grad_norm": 0.103515625, "learning_rate": 0.0019997261449981553, "loss": 0.2656, "step": 1747 }, { "epoch": 0.015173479396880235, "grad_norm": 0.09326171875, "learning_rate": 0.0019997254103477128, "loss": 0.2559, "step": 1748 }, { "epoch": 0.0151821598770844, "grad_norm": 0.1162109375, "learning_rate": 0.0019997246747133443, "loss": 0.2168, "step": 1749 }, { "epoch": 0.015190840357288566, "grad_norm": 0.06884765625, "learning_rate": 0.00199972393809505, "loss": 0.208, "step": 1750 }, { "epoch": 0.01519952083749273, "grad_norm": 0.076171875, "learning_rate": 0.0019997232004928312, "loss": 0.2227, "step": 1751 }, { "epoch": 0.015208201317696894, "grad_norm": 0.111328125, "learning_rate": 0.001999722461906689, "loss": 0.3418, "step": 1752 }, { "epoch": 0.01521688179790106, "grad_norm": 0.07275390625, "learning_rate": 0.001999721722336623, "loss": 0.2227, "step": 1753 }, { "epoch": 0.015225562278105224, "grad_norm": 0.080078125, "learning_rate": 0.0019997209817826356, "loss": 0.2188, "step": 1754 }, { "epoch": 0.01523424275830939, "grad_norm": 0.10595703125, "learning_rate": 0.001999720240244727, "loss": 0.2363, "step": 1755 }, { "epoch": 0.015242923238513555, "grad_norm": 0.07080078125, "learning_rate": 0.0019997194977228972, "loss": 0.2207, "step": 1756 }, { "epoch": 0.01525160371871772, "grad_norm": 0.142578125, "learning_rate": 0.001999718754217148, "loss": 0.21, "step": 1757 }, { "epoch": 0.015260284198921885, "grad_norm": 0.12255859375, "learning_rate": 0.00199971800972748, "loss": 0.2637, "step": 1758 }, { "epoch": 0.01526896467912605, "grad_norm": 0.0791015625, "learning_rate": 0.0019997172642538933, "loss": 0.2266, "step": 1759 }, { "epoch": 0.015277645159330213, "grad_norm": 0.0771484375, "learning_rate": 0.00199971651779639, "loss": 0.1797, "step": 1760 }, { "epoch": 0.015286325639534378, "grad_norm": 0.083984375, "learning_rate": 0.00199971577035497, "loss": 0.2559, "step": 1761 }, { "epoch": 0.015295006119738544, "grad_norm": 0.068359375, "learning_rate": 0.0019997150219296343, "loss": 0.2344, "step": 1762 }, { "epoch": 0.015303686599942709, "grad_norm": 0.0869140625, "learning_rate": 0.001999714272520384, "loss": 0.2578, "step": 1763 }, { "epoch": 0.015312367080146874, "grad_norm": 0.06396484375, "learning_rate": 0.0019997135221272192, "loss": 0.1914, "step": 1764 }, { "epoch": 0.015321047560351039, "grad_norm": 0.0693359375, "learning_rate": 0.001999712770750142, "loss": 0.2695, "step": 1765 }, { "epoch": 0.015329728040555204, "grad_norm": 0.08203125, "learning_rate": 0.0019997120183891516, "loss": 0.2197, "step": 1766 }, { "epoch": 0.01533840852075937, "grad_norm": 0.07568359375, "learning_rate": 0.0019997112650442504, "loss": 0.2773, "step": 1767 }, { "epoch": 0.015347089000963533, "grad_norm": 0.0888671875, "learning_rate": 0.001999710510715438, "loss": 0.3086, "step": 1768 }, { "epoch": 0.015355769481167698, "grad_norm": 0.0869140625, "learning_rate": 0.0019997097554027158, "loss": 0.2812, "step": 1769 }, { "epoch": 0.015364449961371863, "grad_norm": 0.06298828125, "learning_rate": 0.0019997089991060845, "loss": 0.1846, "step": 1770 }, { "epoch": 0.015373130441576028, "grad_norm": 0.07080078125, "learning_rate": 0.001999708241825545, "loss": 0.2354, "step": 1771 }, { "epoch": 0.015381810921780193, "grad_norm": 0.0849609375, "learning_rate": 0.0019997074835610977, "loss": 0.2305, "step": 1772 }, { "epoch": 0.015390491401984358, "grad_norm": 0.08056640625, "learning_rate": 0.0019997067243127443, "loss": 0.2148, "step": 1773 }, { "epoch": 0.015399171882188523, "grad_norm": 0.1591796875, "learning_rate": 0.001999705964080485, "loss": 0.334, "step": 1774 }, { "epoch": 0.015407852362392687, "grad_norm": 0.0947265625, "learning_rate": 0.001999705202864321, "loss": 0.2178, "step": 1775 }, { "epoch": 0.015416532842596852, "grad_norm": 0.06298828125, "learning_rate": 0.0019997044406642526, "loss": 0.1758, "step": 1776 }, { "epoch": 0.015425213322801017, "grad_norm": 0.08154296875, "learning_rate": 0.0019997036774802813, "loss": 0.1992, "step": 1777 }, { "epoch": 0.015433893803005182, "grad_norm": 0.1689453125, "learning_rate": 0.001999702913312407, "loss": 0.2871, "step": 1778 }, { "epoch": 0.015442574283209347, "grad_norm": 0.095703125, "learning_rate": 0.0019997021481606312, "loss": 0.2256, "step": 1779 }, { "epoch": 0.015451254763413512, "grad_norm": 0.09765625, "learning_rate": 0.001999701382024955, "loss": 0.2598, "step": 1780 }, { "epoch": 0.015459935243617677, "grad_norm": 0.1513671875, "learning_rate": 0.0019997006149053793, "loss": 0.2578, "step": 1781 }, { "epoch": 0.015468615723821843, "grad_norm": 0.06982421875, "learning_rate": 0.0019996998468019035, "loss": 0.2188, "step": 1782 }, { "epoch": 0.015477296204026006, "grad_norm": 0.1337890625, "learning_rate": 0.00199969907771453, "loss": 0.2969, "step": 1783 }, { "epoch": 0.015485976684230171, "grad_norm": 0.09521484375, "learning_rate": 0.0019996983076432592, "loss": 0.2266, "step": 1784 }, { "epoch": 0.015494657164434336, "grad_norm": 0.0712890625, "learning_rate": 0.0019996975365880916, "loss": 0.2168, "step": 1785 }, { "epoch": 0.015503337644638501, "grad_norm": 0.06396484375, "learning_rate": 0.0019996967645490283, "loss": 0.2324, "step": 1786 }, { "epoch": 0.015512018124842666, "grad_norm": 0.11279296875, "learning_rate": 0.0019996959915260706, "loss": 0.2695, "step": 1787 }, { "epoch": 0.015520698605046832, "grad_norm": 0.072265625, "learning_rate": 0.001999695217519218, "loss": 0.1855, "step": 1788 }, { "epoch": 0.015529379085250997, "grad_norm": 0.0849609375, "learning_rate": 0.0019996944425284733, "loss": 0.1904, "step": 1789 }, { "epoch": 0.015538059565455162, "grad_norm": 0.0693359375, "learning_rate": 0.0019996936665538354, "loss": 0.2578, "step": 1790 }, { "epoch": 0.015546740045659325, "grad_norm": 0.08447265625, "learning_rate": 0.0019996928895953067, "loss": 0.2324, "step": 1791 }, { "epoch": 0.01555542052586349, "grad_norm": 0.07470703125, "learning_rate": 0.001999692111652887, "loss": 0.2695, "step": 1792 }, { "epoch": 0.015564101006067655, "grad_norm": 0.166015625, "learning_rate": 0.0019996913327265777, "loss": 0.2695, "step": 1793 }, { "epoch": 0.01557278148627182, "grad_norm": 0.0830078125, "learning_rate": 0.001999690552816379, "loss": 0.2773, "step": 1794 }, { "epoch": 0.015581461966475986, "grad_norm": 0.0693359375, "learning_rate": 0.0019996897719222924, "loss": 0.1982, "step": 1795 }, { "epoch": 0.01559014244668015, "grad_norm": 0.1005859375, "learning_rate": 0.001999688990044319, "loss": 0.2656, "step": 1796 }, { "epoch": 0.015598822926884316, "grad_norm": 0.07861328125, "learning_rate": 0.0019996882071824586, "loss": 0.2305, "step": 1797 }, { "epoch": 0.015607503407088481, "grad_norm": 0.07275390625, "learning_rate": 0.0019996874233367133, "loss": 0.1904, "step": 1798 }, { "epoch": 0.015616183887292644, "grad_norm": 0.06201171875, "learning_rate": 0.0019996866385070832, "loss": 0.2578, "step": 1799 }, { "epoch": 0.01562486436749681, "grad_norm": 0.072265625, "learning_rate": 0.001999685852693569, "loss": 0.2266, "step": 1800 }, { "epoch": 0.015633544847700975, "grad_norm": 0.07958984375, "learning_rate": 0.0019996850658961724, "loss": 0.2285, "step": 1801 }, { "epoch": 0.01564222532790514, "grad_norm": 0.0693359375, "learning_rate": 0.0019996842781148934, "loss": 0.2041, "step": 1802 }, { "epoch": 0.015650905808109305, "grad_norm": 0.0791015625, "learning_rate": 0.001999683489349733, "loss": 0.2832, "step": 1803 }, { "epoch": 0.01565958628831347, "grad_norm": 0.0859375, "learning_rate": 0.0019996826996006925, "loss": 0.248, "step": 1804 }, { "epoch": 0.015668266768517635, "grad_norm": 0.064453125, "learning_rate": 0.001999681908867773, "loss": 0.1719, "step": 1805 }, { "epoch": 0.0156769472487218, "grad_norm": 0.08447265625, "learning_rate": 0.001999681117150974, "loss": 0.2578, "step": 1806 }, { "epoch": 0.015685627728925965, "grad_norm": 0.056884765625, "learning_rate": 0.0019996803244502976, "loss": 0.207, "step": 1807 }, { "epoch": 0.01569430820913013, "grad_norm": 0.0751953125, "learning_rate": 0.0019996795307657446, "loss": 0.2363, "step": 1808 }, { "epoch": 0.015702988689334296, "grad_norm": 0.09130859375, "learning_rate": 0.001999678736097315, "loss": 0.2617, "step": 1809 }, { "epoch": 0.015711669169538457, "grad_norm": 0.103515625, "learning_rate": 0.0019996779404450105, "loss": 0.2354, "step": 1810 }, { "epoch": 0.015720349649742622, "grad_norm": 0.08251953125, "learning_rate": 0.001999677143808832, "loss": 0.3027, "step": 1811 }, { "epoch": 0.015729030129946787, "grad_norm": 0.083984375, "learning_rate": 0.00199967634618878, "loss": 0.2637, "step": 1812 }, { "epoch": 0.015737710610150953, "grad_norm": 0.06982421875, "learning_rate": 0.0019996755475848553, "loss": 0.293, "step": 1813 }, { "epoch": 0.015746391090355118, "grad_norm": 0.07666015625, "learning_rate": 0.001999674747997059, "loss": 0.2422, "step": 1814 }, { "epoch": 0.015755071570559283, "grad_norm": 0.0888671875, "learning_rate": 0.001999673947425392, "loss": 0.3125, "step": 1815 }, { "epoch": 0.015763752050763448, "grad_norm": 0.056884765625, "learning_rate": 0.001999673145869855, "loss": 0.1943, "step": 1816 }, { "epoch": 0.015772432530967613, "grad_norm": 0.072265625, "learning_rate": 0.001999672343330449, "loss": 0.2061, "step": 1817 }, { "epoch": 0.015781113011171778, "grad_norm": 0.07177734375, "learning_rate": 0.0019996715398071744, "loss": 0.1641, "step": 1818 }, { "epoch": 0.015789793491375943, "grad_norm": 0.0947265625, "learning_rate": 0.0019996707353000334, "loss": 0.248, "step": 1819 }, { "epoch": 0.01579847397158011, "grad_norm": 0.080078125, "learning_rate": 0.0019996699298090253, "loss": 0.2539, "step": 1820 }, { "epoch": 0.015807154451784274, "grad_norm": 0.064453125, "learning_rate": 0.001999669123334152, "loss": 0.2168, "step": 1821 }, { "epoch": 0.01581583493198844, "grad_norm": 0.0693359375, "learning_rate": 0.001999668315875414, "loss": 0.2168, "step": 1822 }, { "epoch": 0.015824515412192604, "grad_norm": 0.0751953125, "learning_rate": 0.001999667507432812, "loss": 0.209, "step": 1823 }, { "epoch": 0.01583319589239677, "grad_norm": 0.09228515625, "learning_rate": 0.0019996666980063474, "loss": 0.2578, "step": 1824 }, { "epoch": 0.015841876372600934, "grad_norm": 0.0869140625, "learning_rate": 0.001999665887596021, "loss": 0.2207, "step": 1825 }, { "epoch": 0.015850556852805096, "grad_norm": 0.0810546875, "learning_rate": 0.0019996650762018333, "loss": 0.2656, "step": 1826 }, { "epoch": 0.01585923733300926, "grad_norm": 0.08154296875, "learning_rate": 0.001999664263823785, "loss": 0.2383, "step": 1827 }, { "epoch": 0.015867917813213426, "grad_norm": 0.095703125, "learning_rate": 0.001999663450461878, "loss": 0.2324, "step": 1828 }, { "epoch": 0.01587659829341759, "grad_norm": 0.0654296875, "learning_rate": 0.001999662636116112, "loss": 0.2578, "step": 1829 }, { "epoch": 0.015885278773621756, "grad_norm": 0.099609375, "learning_rate": 0.001999661820786489, "loss": 0.2324, "step": 1830 }, { "epoch": 0.01589395925382592, "grad_norm": 0.11865234375, "learning_rate": 0.0019996610044730094, "loss": 0.332, "step": 1831 }, { "epoch": 0.015902639734030086, "grad_norm": 0.06298828125, "learning_rate": 0.0019996601871756737, "loss": 0.1855, "step": 1832 }, { "epoch": 0.01591132021423425, "grad_norm": 0.095703125, "learning_rate": 0.0019996593688944827, "loss": 0.2676, "step": 1833 }, { "epoch": 0.015920000694438417, "grad_norm": 0.09130859375, "learning_rate": 0.0019996585496294384, "loss": 0.2598, "step": 1834 }, { "epoch": 0.015928681174642582, "grad_norm": 0.087890625, "learning_rate": 0.001999657729380541, "loss": 0.2256, "step": 1835 }, { "epoch": 0.015937361654846747, "grad_norm": 0.06494140625, "learning_rate": 0.001999656908147791, "loss": 0.1865, "step": 1836 }, { "epoch": 0.015946042135050912, "grad_norm": 0.07568359375, "learning_rate": 0.0019996560859311904, "loss": 0.2246, "step": 1837 }, { "epoch": 0.015954722615255077, "grad_norm": 0.08935546875, "learning_rate": 0.001999655262730739, "loss": 0.248, "step": 1838 }, { "epoch": 0.015963403095459242, "grad_norm": 0.08154296875, "learning_rate": 0.0019996544385464383, "loss": 0.2168, "step": 1839 }, { "epoch": 0.015972083575663407, "grad_norm": 0.09423828125, "learning_rate": 0.001999653613378289, "loss": 0.2129, "step": 1840 }, { "epoch": 0.01598076405586757, "grad_norm": 0.06982421875, "learning_rate": 0.0019996527872262913, "loss": 0.1846, "step": 1841 }, { "epoch": 0.015989444536071734, "grad_norm": 0.05810546875, "learning_rate": 0.0019996519600904475, "loss": 0.1777, "step": 1842 }, { "epoch": 0.0159981250162759, "grad_norm": 0.09033203125, "learning_rate": 0.001999651131970758, "loss": 0.2715, "step": 1843 }, { "epoch": 0.016006805496480064, "grad_norm": 0.1357421875, "learning_rate": 0.001999650302867223, "loss": 0.3555, "step": 1844 }, { "epoch": 0.01601548597668423, "grad_norm": 0.09130859375, "learning_rate": 0.0019996494727798444, "loss": 0.1914, "step": 1845 }, { "epoch": 0.016024166456888395, "grad_norm": 0.0908203125, "learning_rate": 0.0019996486417086226, "loss": 0.291, "step": 1846 }, { "epoch": 0.01603284693709256, "grad_norm": 0.0869140625, "learning_rate": 0.0019996478096535584, "loss": 0.2422, "step": 1847 }, { "epoch": 0.016041527417296725, "grad_norm": 0.201171875, "learning_rate": 0.0019996469766146528, "loss": 0.2559, "step": 1848 }, { "epoch": 0.01605020789750089, "grad_norm": 0.0712890625, "learning_rate": 0.001999646142591907, "loss": 0.1982, "step": 1849 }, { "epoch": 0.016058888377705055, "grad_norm": 0.07861328125, "learning_rate": 0.0019996453075853218, "loss": 0.2227, "step": 1850 }, { "epoch": 0.01606756885790922, "grad_norm": 0.11572265625, "learning_rate": 0.0019996444715948978, "loss": 0.2715, "step": 1851 }, { "epoch": 0.016076249338113385, "grad_norm": 0.07373046875, "learning_rate": 0.001999643634620636, "loss": 0.2227, "step": 1852 }, { "epoch": 0.01608492981831755, "grad_norm": 0.059326171875, "learning_rate": 0.001999642796662538, "loss": 0.2012, "step": 1853 }, { "epoch": 0.016093610298521716, "grad_norm": 0.10009765625, "learning_rate": 0.0019996419577206033, "loss": 0.3027, "step": 1854 }, { "epoch": 0.01610229077872588, "grad_norm": 0.07568359375, "learning_rate": 0.001999641117794834, "loss": 0.2227, "step": 1855 }, { "epoch": 0.016110971258930042, "grad_norm": 0.076171875, "learning_rate": 0.001999640276885231, "loss": 0.2344, "step": 1856 }, { "epoch": 0.016119651739134207, "grad_norm": 0.10205078125, "learning_rate": 0.0019996394349917944, "loss": 0.2539, "step": 1857 }, { "epoch": 0.016128332219338373, "grad_norm": 0.07568359375, "learning_rate": 0.0019996385921145264, "loss": 0.1982, "step": 1858 }, { "epoch": 0.016137012699542538, "grad_norm": 0.08251953125, "learning_rate": 0.0019996377482534265, "loss": 0.2344, "step": 1859 }, { "epoch": 0.016145693179746703, "grad_norm": 0.09716796875, "learning_rate": 0.0019996369034084964, "loss": 0.2676, "step": 1860 }, { "epoch": 0.016154373659950868, "grad_norm": 0.07080078125, "learning_rate": 0.001999636057579737, "loss": 0.2129, "step": 1861 }, { "epoch": 0.016163054140155033, "grad_norm": 0.0869140625, "learning_rate": 0.001999635210767149, "loss": 0.2422, "step": 1862 }, { "epoch": 0.016171734620359198, "grad_norm": 0.07568359375, "learning_rate": 0.0019996343629707335, "loss": 0.2334, "step": 1863 }, { "epoch": 0.016180415100563363, "grad_norm": 0.0849609375, "learning_rate": 0.001999633514190491, "loss": 0.2051, "step": 1864 }, { "epoch": 0.01618909558076753, "grad_norm": 0.062255859375, "learning_rate": 0.001999632664426424, "loss": 0.1699, "step": 1865 }, { "epoch": 0.016197776060971694, "grad_norm": 0.07421875, "learning_rate": 0.001999631813678531, "loss": 0.2266, "step": 1866 }, { "epoch": 0.01620645654117586, "grad_norm": 0.0830078125, "learning_rate": 0.001999630961946815, "loss": 0.2539, "step": 1867 }, { "epoch": 0.016215137021380024, "grad_norm": 0.07470703125, "learning_rate": 0.0019996301092312756, "loss": 0.2031, "step": 1868 }, { "epoch": 0.01622381750158419, "grad_norm": 0.0751953125, "learning_rate": 0.0019996292555319144, "loss": 0.2168, "step": 1869 }, { "epoch": 0.016232497981788354, "grad_norm": 0.087890625, "learning_rate": 0.001999628400848732, "loss": 0.2305, "step": 1870 }, { "epoch": 0.01624117846199252, "grad_norm": 0.0771484375, "learning_rate": 0.00199962754518173, "loss": 0.2432, "step": 1871 }, { "epoch": 0.01624985894219668, "grad_norm": 0.09033203125, "learning_rate": 0.001999626688530908, "loss": 0.2275, "step": 1872 }, { "epoch": 0.016258539422400846, "grad_norm": 0.06591796875, "learning_rate": 0.001999625830896268, "loss": 0.209, "step": 1873 }, { "epoch": 0.01626721990260501, "grad_norm": 0.08154296875, "learning_rate": 0.0019996249722778114, "loss": 0.2285, "step": 1874 }, { "epoch": 0.016275900382809176, "grad_norm": 0.080078125, "learning_rate": 0.001999624112675538, "loss": 0.1973, "step": 1875 }, { "epoch": 0.01628458086301334, "grad_norm": 0.0791015625, "learning_rate": 0.001999623252089449, "loss": 0.252, "step": 1876 }, { "epoch": 0.016293261343217506, "grad_norm": 0.0712890625, "learning_rate": 0.0019996223905195455, "loss": 0.2354, "step": 1877 }, { "epoch": 0.01630194182342167, "grad_norm": 0.053955078125, "learning_rate": 0.0019996215279658286, "loss": 0.1934, "step": 1878 }, { "epoch": 0.016310622303625837, "grad_norm": 0.10595703125, "learning_rate": 0.0019996206644282994, "loss": 0.2812, "step": 1879 }, { "epoch": 0.01631930278383, "grad_norm": 0.07861328125, "learning_rate": 0.001999619799906958, "loss": 0.2012, "step": 1880 }, { "epoch": 0.016327983264034167, "grad_norm": 0.0810546875, "learning_rate": 0.0019996189344018067, "loss": 0.25, "step": 1881 }, { "epoch": 0.016336663744238332, "grad_norm": 0.06787109375, "learning_rate": 0.001999618067912845, "loss": 0.1758, "step": 1882 }, { "epoch": 0.016345344224442497, "grad_norm": 0.0703125, "learning_rate": 0.0019996172004400747, "loss": 0.2031, "step": 1883 }, { "epoch": 0.016354024704646662, "grad_norm": 0.083984375, "learning_rate": 0.0019996163319834967, "loss": 0.2334, "step": 1884 }, { "epoch": 0.016362705184850827, "grad_norm": 0.080078125, "learning_rate": 0.001999615462543111, "loss": 0.2412, "step": 1885 }, { "epoch": 0.016371385665054992, "grad_norm": 0.09619140625, "learning_rate": 0.0019996145921189206, "loss": 0.2285, "step": 1886 }, { "epoch": 0.016380066145259154, "grad_norm": 0.10791015625, "learning_rate": 0.0019996137207109244, "loss": 0.2773, "step": 1887 }, { "epoch": 0.01638874662546332, "grad_norm": 0.08251953125, "learning_rate": 0.0019996128483191246, "loss": 0.2129, "step": 1888 }, { "epoch": 0.016397427105667484, "grad_norm": 0.0830078125, "learning_rate": 0.001999611974943521, "loss": 0.2402, "step": 1889 }, { "epoch": 0.01640610758587165, "grad_norm": 0.1298828125, "learning_rate": 0.001999611100584116, "loss": 0.2852, "step": 1890 }, { "epoch": 0.016414788066075815, "grad_norm": 0.103515625, "learning_rate": 0.0019996102252409094, "loss": 0.2324, "step": 1891 }, { "epoch": 0.01642346854627998, "grad_norm": 0.08544921875, "learning_rate": 0.001999609348913903, "loss": 0.2158, "step": 1892 }, { "epoch": 0.016432149026484145, "grad_norm": 0.07421875, "learning_rate": 0.001999608471603097, "loss": 0.2236, "step": 1893 }, { "epoch": 0.01644082950668831, "grad_norm": 0.08935546875, "learning_rate": 0.001999607593308492, "loss": 0.2227, "step": 1894 }, { "epoch": 0.016449509986892475, "grad_norm": 0.07763671875, "learning_rate": 0.001999606714030091, "loss": 0.1816, "step": 1895 }, { "epoch": 0.01645819046709664, "grad_norm": 0.0771484375, "learning_rate": 0.0019996058337678926, "loss": 0.2051, "step": 1896 }, { "epoch": 0.016466870947300805, "grad_norm": 0.08203125, "learning_rate": 0.001999604952521899, "loss": 0.2207, "step": 1897 }, { "epoch": 0.01647555142750497, "grad_norm": 0.0869140625, "learning_rate": 0.0019996040702921114, "loss": 0.2617, "step": 1898 }, { "epoch": 0.016484231907709136, "grad_norm": 0.076171875, "learning_rate": 0.0019996031870785297, "loss": 0.2168, "step": 1899 }, { "epoch": 0.0164929123879133, "grad_norm": 0.07568359375, "learning_rate": 0.001999602302881156, "loss": 0.2246, "step": 1900 }, { "epoch": 0.016501592868117466, "grad_norm": 0.08251953125, "learning_rate": 0.00199960141769999, "loss": 0.2656, "step": 1901 }, { "epoch": 0.01651027334832163, "grad_norm": 0.1494140625, "learning_rate": 0.001999600531535034, "loss": 0.2715, "step": 1902 }, { "epoch": 0.016518953828525793, "grad_norm": 0.0703125, "learning_rate": 0.0019995996443862886, "loss": 0.1953, "step": 1903 }, { "epoch": 0.016527634308729958, "grad_norm": 0.07373046875, "learning_rate": 0.0019995987562537545, "loss": 0.2168, "step": 1904 }, { "epoch": 0.016536314788934123, "grad_norm": 0.08740234375, "learning_rate": 0.001999597867137432, "loss": 0.2246, "step": 1905 }, { "epoch": 0.016544995269138288, "grad_norm": 0.07177734375, "learning_rate": 0.0019995969770373236, "loss": 0.1924, "step": 1906 }, { "epoch": 0.016553675749342453, "grad_norm": 0.0830078125, "learning_rate": 0.001999596085953429, "loss": 0.2119, "step": 1907 }, { "epoch": 0.016562356229546618, "grad_norm": 0.07275390625, "learning_rate": 0.0019995951938857497, "loss": 0.2227, "step": 1908 }, { "epoch": 0.016571036709750783, "grad_norm": 0.07568359375, "learning_rate": 0.0019995943008342867, "loss": 0.2129, "step": 1909 }, { "epoch": 0.01657971718995495, "grad_norm": 0.07958984375, "learning_rate": 0.0019995934067990407, "loss": 0.2734, "step": 1910 }, { "epoch": 0.016588397670159113, "grad_norm": 0.11181640625, "learning_rate": 0.001999592511780013, "loss": 0.3398, "step": 1911 }, { "epoch": 0.01659707815036328, "grad_norm": 0.0888671875, "learning_rate": 0.0019995916157772046, "loss": 0.2266, "step": 1912 }, { "epoch": 0.016605758630567444, "grad_norm": 0.07666015625, "learning_rate": 0.001999590718790616, "loss": 0.248, "step": 1913 }, { "epoch": 0.01661443911077161, "grad_norm": 0.0732421875, "learning_rate": 0.001999589820820249, "loss": 0.1787, "step": 1914 }, { "epoch": 0.016623119590975774, "grad_norm": 0.08837890625, "learning_rate": 0.0019995889218661035, "loss": 0.2422, "step": 1915 }, { "epoch": 0.01663180007117994, "grad_norm": 0.0927734375, "learning_rate": 0.0019995880219281816, "loss": 0.2324, "step": 1916 }, { "epoch": 0.016640480551384104, "grad_norm": 0.10107421875, "learning_rate": 0.001999587121006483, "loss": 0.252, "step": 1917 }, { "epoch": 0.016649161031588266, "grad_norm": 0.08349609375, "learning_rate": 0.00199958621910101, "loss": 0.1973, "step": 1918 }, { "epoch": 0.01665784151179243, "grad_norm": 0.099609375, "learning_rate": 0.001999585316211763, "loss": 0.2617, "step": 1919 }, { "epoch": 0.016666521991996596, "grad_norm": 0.064453125, "learning_rate": 0.001999584412338743, "loss": 0.2158, "step": 1920 }, { "epoch": 0.01667520247220076, "grad_norm": 0.09765625, "learning_rate": 0.001999583507481951, "loss": 0.2715, "step": 1921 }, { "epoch": 0.016683882952404926, "grad_norm": 0.0693359375, "learning_rate": 0.001999582601641388, "loss": 0.2285, "step": 1922 }, { "epoch": 0.01669256343260909, "grad_norm": 0.07568359375, "learning_rate": 0.001999581694817055, "loss": 0.1934, "step": 1923 }, { "epoch": 0.016701243912813257, "grad_norm": 0.115234375, "learning_rate": 0.0019995807870089527, "loss": 0.2656, "step": 1924 }, { "epoch": 0.01670992439301742, "grad_norm": 0.08251953125, "learning_rate": 0.001999579878217083, "loss": 0.1914, "step": 1925 }, { "epoch": 0.016718604873221587, "grad_norm": 0.08935546875, "learning_rate": 0.0019995789684414456, "loss": 0.1963, "step": 1926 }, { "epoch": 0.016727285353425752, "grad_norm": 0.08984375, "learning_rate": 0.0019995780576820424, "loss": 0.2246, "step": 1927 }, { "epoch": 0.016735965833629917, "grad_norm": 0.11669921875, "learning_rate": 0.0019995771459388745, "loss": 0.2246, "step": 1928 }, { "epoch": 0.016744646313834082, "grad_norm": 0.083984375, "learning_rate": 0.001999576233211942, "loss": 0.2422, "step": 1929 }, { "epoch": 0.016753326794038247, "grad_norm": 0.068359375, "learning_rate": 0.0019995753195012466, "loss": 0.25, "step": 1930 }, { "epoch": 0.016762007274242412, "grad_norm": 0.087890625, "learning_rate": 0.0019995744048067893, "loss": 0.2559, "step": 1931 }, { "epoch": 0.016770687754446578, "grad_norm": 0.078125, "learning_rate": 0.0019995734891285707, "loss": 0.2441, "step": 1932 }, { "epoch": 0.01677936823465074, "grad_norm": 0.08154296875, "learning_rate": 0.001999572572466592, "loss": 0.2295, "step": 1933 }, { "epoch": 0.016788048714854904, "grad_norm": 0.09130859375, "learning_rate": 0.0019995716548208546, "loss": 0.2031, "step": 1934 }, { "epoch": 0.01679672919505907, "grad_norm": 0.095703125, "learning_rate": 0.001999570736191359, "loss": 0.3203, "step": 1935 }, { "epoch": 0.016805409675263235, "grad_norm": 0.0625, "learning_rate": 0.0019995698165781064, "loss": 0.168, "step": 1936 }, { "epoch": 0.0168140901554674, "grad_norm": 0.09326171875, "learning_rate": 0.0019995688959810977, "loss": 0.2812, "step": 1937 }, { "epoch": 0.016822770635671565, "grad_norm": 0.1162109375, "learning_rate": 0.001999567974400334, "loss": 0.2539, "step": 1938 }, { "epoch": 0.01683145111587573, "grad_norm": 0.10205078125, "learning_rate": 0.0019995670518358163, "loss": 0.3047, "step": 1939 }, { "epoch": 0.016840131596079895, "grad_norm": 0.08447265625, "learning_rate": 0.001999566128287546, "loss": 0.2295, "step": 1940 }, { "epoch": 0.01684881207628406, "grad_norm": 0.138671875, "learning_rate": 0.001999565203755523, "loss": 0.2695, "step": 1941 }, { "epoch": 0.016857492556488225, "grad_norm": 0.0810546875, "learning_rate": 0.001999564278239749, "loss": 0.1846, "step": 1942 }, { "epoch": 0.01686617303669239, "grad_norm": 0.095703125, "learning_rate": 0.0019995633517402253, "loss": 0.2812, "step": 1943 }, { "epoch": 0.016874853516896555, "grad_norm": 0.0859375, "learning_rate": 0.001999562424256953, "loss": 0.2246, "step": 1944 }, { "epoch": 0.01688353399710072, "grad_norm": 0.087890625, "learning_rate": 0.001999561495789932, "loss": 0.2197, "step": 1945 }, { "epoch": 0.016892214477304886, "grad_norm": 0.10693359375, "learning_rate": 0.001999560566339165, "loss": 0.2656, "step": 1946 }, { "epoch": 0.01690089495750905, "grad_norm": 0.09912109375, "learning_rate": 0.001999559635904651, "loss": 0.2168, "step": 1947 }, { "epoch": 0.016909575437713216, "grad_norm": 0.1142578125, "learning_rate": 0.0019995587044863926, "loss": 0.2422, "step": 1948 }, { "epoch": 0.016918255917917378, "grad_norm": 0.1181640625, "learning_rate": 0.0019995577720843907, "loss": 0.2363, "step": 1949 }, { "epoch": 0.016926936398121543, "grad_norm": 0.09619140625, "learning_rate": 0.0019995568386986452, "loss": 0.1846, "step": 1950 }, { "epoch": 0.016935616878325708, "grad_norm": 0.080078125, "learning_rate": 0.0019995559043291585, "loss": 0.2422, "step": 1951 }, { "epoch": 0.016944297358529873, "grad_norm": 0.103515625, "learning_rate": 0.0019995549689759305, "loss": 0.21, "step": 1952 }, { "epoch": 0.016952977838734038, "grad_norm": 0.07666015625, "learning_rate": 0.001999554032638963, "loss": 0.25, "step": 1953 }, { "epoch": 0.016961658318938203, "grad_norm": 0.11083984375, "learning_rate": 0.0019995530953182566, "loss": 0.2461, "step": 1954 }, { "epoch": 0.01697033879914237, "grad_norm": 0.10302734375, "learning_rate": 0.0019995521570138125, "loss": 0.2227, "step": 1955 }, { "epoch": 0.016979019279346533, "grad_norm": 0.0849609375, "learning_rate": 0.001999551217725632, "loss": 0.2344, "step": 1956 }, { "epoch": 0.0169876997595507, "grad_norm": 0.07666015625, "learning_rate": 0.001999550277453715, "loss": 0.1963, "step": 1957 }, { "epoch": 0.016996380239754864, "grad_norm": 0.0849609375, "learning_rate": 0.001999549336198064, "loss": 0.2832, "step": 1958 }, { "epoch": 0.01700506071995903, "grad_norm": 0.10107421875, "learning_rate": 0.0019995483939586793, "loss": 0.2441, "step": 1959 }, { "epoch": 0.017013741200163194, "grad_norm": 0.6796875, "learning_rate": 0.0019995474507355617, "loss": 0.3984, "step": 1960 }, { "epoch": 0.01702242168036736, "grad_norm": 0.07568359375, "learning_rate": 0.0019995465065287127, "loss": 0.21, "step": 1961 }, { "epoch": 0.017031102160571524, "grad_norm": 0.07275390625, "learning_rate": 0.0019995455613381332, "loss": 0.207, "step": 1962 }, { "epoch": 0.01703978264077569, "grad_norm": 0.0634765625, "learning_rate": 0.001999544615163824, "loss": 0.1768, "step": 1963 }, { "epoch": 0.01704846312097985, "grad_norm": 0.07421875, "learning_rate": 0.0019995436680057864, "loss": 0.2637, "step": 1964 }, { "epoch": 0.017057143601184016, "grad_norm": 0.08740234375, "learning_rate": 0.0019995427198640217, "loss": 0.2441, "step": 1965 }, { "epoch": 0.01706582408138818, "grad_norm": 0.10546875, "learning_rate": 0.00199954177073853, "loss": 0.2275, "step": 1966 }, { "epoch": 0.017074504561592346, "grad_norm": 0.0849609375, "learning_rate": 0.0019995408206293134, "loss": 0.2285, "step": 1967 }, { "epoch": 0.01708318504179651, "grad_norm": 0.20703125, "learning_rate": 0.0019995398695363724, "loss": 0.3086, "step": 1968 }, { "epoch": 0.017091865522000677, "grad_norm": 0.11376953125, "learning_rate": 0.001999538917459708, "loss": 0.2314, "step": 1969 }, { "epoch": 0.01710054600220484, "grad_norm": 0.1181640625, "learning_rate": 0.0019995379643993213, "loss": 0.2314, "step": 1970 }, { "epoch": 0.017109226482409007, "grad_norm": 0.058837890625, "learning_rate": 0.0019995370103552137, "loss": 0.1719, "step": 1971 }, { "epoch": 0.017117906962613172, "grad_norm": 0.09619140625, "learning_rate": 0.0019995360553273856, "loss": 0.2227, "step": 1972 }, { "epoch": 0.017126587442817337, "grad_norm": 0.0810546875, "learning_rate": 0.0019995350993158387, "loss": 0.2617, "step": 1973 }, { "epoch": 0.017135267923021502, "grad_norm": 0.08935546875, "learning_rate": 0.001999534142320573, "loss": 0.293, "step": 1974 }, { "epoch": 0.017143948403225667, "grad_norm": 0.1044921875, "learning_rate": 0.001999533184341591, "loss": 0.2559, "step": 1975 }, { "epoch": 0.017152628883429832, "grad_norm": 0.0810546875, "learning_rate": 0.001999532225378893, "loss": 0.2109, "step": 1976 }, { "epoch": 0.017161309363633998, "grad_norm": 0.1142578125, "learning_rate": 0.00199953126543248, "loss": 0.3164, "step": 1977 }, { "epoch": 0.017169989843838163, "grad_norm": 0.07177734375, "learning_rate": 0.001999530304502353, "loss": 0.2129, "step": 1978 }, { "epoch": 0.017178670324042328, "grad_norm": 0.09130859375, "learning_rate": 0.001999529342588513, "loss": 0.2314, "step": 1979 }, { "epoch": 0.01718735080424649, "grad_norm": 0.142578125, "learning_rate": 0.0019995283796909614, "loss": 0.3008, "step": 1980 }, { "epoch": 0.017196031284450655, "grad_norm": 0.10595703125, "learning_rate": 0.0019995274158096992, "loss": 0.2109, "step": 1981 }, { "epoch": 0.01720471176465482, "grad_norm": 0.083984375, "learning_rate": 0.0019995264509447275, "loss": 0.2441, "step": 1982 }, { "epoch": 0.017213392244858985, "grad_norm": 0.076171875, "learning_rate": 0.0019995254850960465, "loss": 0.2041, "step": 1983 }, { "epoch": 0.01722207272506315, "grad_norm": 0.1357421875, "learning_rate": 0.0019995245182636585, "loss": 0.2578, "step": 1984 }, { "epoch": 0.017230753205267315, "grad_norm": 0.08544921875, "learning_rate": 0.001999523550447564, "loss": 0.2354, "step": 1985 }, { "epoch": 0.01723943368547148, "grad_norm": 0.09423828125, "learning_rate": 0.001999522581647764, "loss": 0.2441, "step": 1986 }, { "epoch": 0.017248114165675645, "grad_norm": 0.0703125, "learning_rate": 0.0019995216118642595, "loss": 0.2324, "step": 1987 }, { "epoch": 0.01725679464587981, "grad_norm": 0.0966796875, "learning_rate": 0.001999520641097052, "loss": 0.2656, "step": 1988 }, { "epoch": 0.017265475126083975, "grad_norm": 0.0947265625, "learning_rate": 0.001999519669346142, "loss": 0.1973, "step": 1989 }, { "epoch": 0.01727415560628814, "grad_norm": 0.11376953125, "learning_rate": 0.0019995186966115307, "loss": 0.2383, "step": 1990 }, { "epoch": 0.017282836086492306, "grad_norm": 0.08935546875, "learning_rate": 0.0019995177228932194, "loss": 0.2295, "step": 1991 }, { "epoch": 0.01729151656669647, "grad_norm": 0.09130859375, "learning_rate": 0.001999516748191209, "loss": 0.2676, "step": 1992 }, { "epoch": 0.017300197046900636, "grad_norm": 0.06591796875, "learning_rate": 0.0019995157725055004, "loss": 0.2188, "step": 1993 }, { "epoch": 0.0173088775271048, "grad_norm": 0.07470703125, "learning_rate": 0.001999514795836095, "loss": 0.2324, "step": 1994 }, { "epoch": 0.017317558007308963, "grad_norm": 0.53515625, "learning_rate": 0.0019995138181829936, "loss": 0.4336, "step": 1995 }, { "epoch": 0.017326238487513128, "grad_norm": 0.0908203125, "learning_rate": 0.001999512839546198, "loss": 0.293, "step": 1996 }, { "epoch": 0.017334918967717293, "grad_norm": 0.078125, "learning_rate": 0.001999511859925708, "loss": 0.1982, "step": 1997 }, { "epoch": 0.017343599447921458, "grad_norm": 0.09716796875, "learning_rate": 0.0019995108793215257, "loss": 0.2363, "step": 1998 }, { "epoch": 0.017352279928125623, "grad_norm": 0.109375, "learning_rate": 0.0019995098977336517, "loss": 0.2559, "step": 1999 }, { "epoch": 0.01736096040832979, "grad_norm": 0.0869140625, "learning_rate": 0.0019995089151620873, "loss": 0.2012, "step": 2000 }, { "epoch": 0.017369640888533953, "grad_norm": 0.1787109375, "learning_rate": 0.0019995079316068335, "loss": 0.2598, "step": 2001 }, { "epoch": 0.01737832136873812, "grad_norm": 0.10888671875, "learning_rate": 0.0019995069470678914, "loss": 0.3105, "step": 2002 }, { "epoch": 0.017387001848942284, "grad_norm": 0.0693359375, "learning_rate": 0.0019995059615452613, "loss": 0.2422, "step": 2003 }, { "epoch": 0.01739568232914645, "grad_norm": 0.08056640625, "learning_rate": 0.001999504975038946, "loss": 0.2021, "step": 2004 }, { "epoch": 0.017404362809350614, "grad_norm": 0.07080078125, "learning_rate": 0.001999503987548945, "loss": 0.248, "step": 2005 }, { "epoch": 0.01741304328955478, "grad_norm": 0.1044921875, "learning_rate": 0.00199950299907526, "loss": 0.2773, "step": 2006 }, { "epoch": 0.017421723769758944, "grad_norm": 0.08642578125, "learning_rate": 0.0019995020096178918, "loss": 0.1973, "step": 2007 }, { "epoch": 0.01743040424996311, "grad_norm": 0.09765625, "learning_rate": 0.0019995010191768427, "loss": 0.248, "step": 2008 }, { "epoch": 0.017439084730167274, "grad_norm": 0.078125, "learning_rate": 0.0019995000277521118, "loss": 0.2002, "step": 2009 }, { "epoch": 0.017447765210371436, "grad_norm": 0.078125, "learning_rate": 0.0019994990353437016, "loss": 0.2676, "step": 2010 }, { "epoch": 0.0174564456905756, "grad_norm": 0.07177734375, "learning_rate": 0.0019994980419516125, "loss": 0.1768, "step": 2011 }, { "epoch": 0.017465126170779766, "grad_norm": 0.0673828125, "learning_rate": 0.0019994970475758463, "loss": 0.2109, "step": 2012 }, { "epoch": 0.01747380665098393, "grad_norm": 0.068359375, "learning_rate": 0.001999496052216403, "loss": 0.2539, "step": 2013 }, { "epoch": 0.017482487131188097, "grad_norm": 0.06884765625, "learning_rate": 0.001999495055873285, "loss": 0.2285, "step": 2014 }, { "epoch": 0.01749116761139226, "grad_norm": 0.064453125, "learning_rate": 0.0019994940585464924, "loss": 0.2168, "step": 2015 }, { "epoch": 0.017499848091596427, "grad_norm": 0.0693359375, "learning_rate": 0.0019994930602360264, "loss": 0.2188, "step": 2016 }, { "epoch": 0.017508528571800592, "grad_norm": 0.087890625, "learning_rate": 0.0019994920609418885, "loss": 0.2695, "step": 2017 }, { "epoch": 0.017517209052004757, "grad_norm": 0.09423828125, "learning_rate": 0.00199949106066408, "loss": 0.2812, "step": 2018 }, { "epoch": 0.017525889532208922, "grad_norm": 0.08447265625, "learning_rate": 0.001999490059402601, "loss": 0.2139, "step": 2019 }, { "epoch": 0.017534570012413087, "grad_norm": 0.0703125, "learning_rate": 0.0019994890571574534, "loss": 0.25, "step": 2020 }, { "epoch": 0.017543250492617252, "grad_norm": 0.0751953125, "learning_rate": 0.0019994880539286383, "loss": 0.2383, "step": 2021 }, { "epoch": 0.017551930972821417, "grad_norm": 0.10400390625, "learning_rate": 0.0019994870497161564, "loss": 0.2393, "step": 2022 }, { "epoch": 0.017560611453025583, "grad_norm": 0.10302734375, "learning_rate": 0.001999486044520009, "loss": 0.2422, "step": 2023 }, { "epoch": 0.017569291933229748, "grad_norm": 0.07421875, "learning_rate": 0.0019994850383401974, "loss": 0.2109, "step": 2024 }, { "epoch": 0.017577972413433913, "grad_norm": 0.080078125, "learning_rate": 0.001999484031176722, "loss": 0.2363, "step": 2025 }, { "epoch": 0.017586652893638074, "grad_norm": 0.1259765625, "learning_rate": 0.0019994830230295846, "loss": 0.7656, "step": 2026 }, { "epoch": 0.01759533337384224, "grad_norm": 0.1005859375, "learning_rate": 0.001999482013898786, "loss": 0.2129, "step": 2027 }, { "epoch": 0.017604013854046405, "grad_norm": 0.07080078125, "learning_rate": 0.0019994810037843275, "loss": 0.2246, "step": 2028 }, { "epoch": 0.01761269433425057, "grad_norm": 0.0810546875, "learning_rate": 0.00199947999268621, "loss": 0.2461, "step": 2029 }, { "epoch": 0.017621374814454735, "grad_norm": 0.057861328125, "learning_rate": 0.0019994789806044347, "loss": 0.1875, "step": 2030 }, { "epoch": 0.0176300552946589, "grad_norm": 0.10107421875, "learning_rate": 0.001999477967539003, "loss": 0.2891, "step": 2031 }, { "epoch": 0.017638735774863065, "grad_norm": 0.17578125, "learning_rate": 0.0019994769534899155, "loss": 0.293, "step": 2032 }, { "epoch": 0.01764741625506723, "grad_norm": 0.06494140625, "learning_rate": 0.0019994759384571736, "loss": 0.2148, "step": 2033 }, { "epoch": 0.017656096735271395, "grad_norm": 0.08740234375, "learning_rate": 0.001999474922440778, "loss": 0.3027, "step": 2034 }, { "epoch": 0.01766477721547556, "grad_norm": 0.06396484375, "learning_rate": 0.00199947390544073, "loss": 0.2227, "step": 2035 }, { "epoch": 0.017673457695679726, "grad_norm": 0.09033203125, "learning_rate": 0.0019994728874570314, "loss": 0.2617, "step": 2036 }, { "epoch": 0.01768213817588389, "grad_norm": 0.09326171875, "learning_rate": 0.0019994718684896826, "loss": 0.2051, "step": 2037 }, { "epoch": 0.017690818656088056, "grad_norm": 0.072265625, "learning_rate": 0.0019994708485386848, "loss": 0.1777, "step": 2038 }, { "epoch": 0.01769949913629222, "grad_norm": 0.087890625, "learning_rate": 0.0019994698276040394, "loss": 0.25, "step": 2039 }, { "epoch": 0.017708179616496386, "grad_norm": 0.08447265625, "learning_rate": 0.001999468805685747, "loss": 0.2305, "step": 2040 }, { "epoch": 0.017716860096700548, "grad_norm": 0.0673828125, "learning_rate": 0.001999467782783809, "loss": 0.2383, "step": 2041 }, { "epoch": 0.017725540576904713, "grad_norm": 0.154296875, "learning_rate": 0.001999466758898227, "loss": 0.2041, "step": 2042 }, { "epoch": 0.017734221057108878, "grad_norm": 0.0927734375, "learning_rate": 0.0019994657340290014, "loss": 0.25, "step": 2043 }, { "epoch": 0.017742901537313043, "grad_norm": 0.41015625, "learning_rate": 0.0019994647081761335, "loss": 0.4082, "step": 2044 }, { "epoch": 0.01775158201751721, "grad_norm": 0.107421875, "learning_rate": 0.0019994636813396244, "loss": 0.2031, "step": 2045 }, { "epoch": 0.017760262497721373, "grad_norm": 0.07421875, "learning_rate": 0.0019994626535194755, "loss": 0.2734, "step": 2046 }, { "epoch": 0.01776894297792554, "grad_norm": 0.0634765625, "learning_rate": 0.0019994616247156877, "loss": 0.2168, "step": 2047 }, { "epoch": 0.017777623458129704, "grad_norm": 0.07080078125, "learning_rate": 0.0019994605949282622, "loss": 0.1797, "step": 2048 }, { "epoch": 0.01778630393833387, "grad_norm": 0.08251953125, "learning_rate": 0.0019994595641572004, "loss": 0.2754, "step": 2049 }, { "epoch": 0.017794984418538034, "grad_norm": 0.08740234375, "learning_rate": 0.001999458532402503, "loss": 0.2695, "step": 2050 }, { "epoch": 0.0178036648987422, "grad_norm": 0.0654296875, "learning_rate": 0.001999457499664171, "loss": 0.2207, "step": 2051 }, { "epoch": 0.017812345378946364, "grad_norm": 0.07666015625, "learning_rate": 0.001999456465942206, "loss": 0.2188, "step": 2052 }, { "epoch": 0.01782102585915053, "grad_norm": 0.0732421875, "learning_rate": 0.0019994554312366087, "loss": 0.1982, "step": 2053 }, { "epoch": 0.017829706339354694, "grad_norm": 0.08447265625, "learning_rate": 0.0019994543955473808, "loss": 0.2334, "step": 2054 }, { "epoch": 0.01783838681955886, "grad_norm": 0.07421875, "learning_rate": 0.001999453358874523, "loss": 0.2383, "step": 2055 }, { "epoch": 0.017847067299763025, "grad_norm": 0.1064453125, "learning_rate": 0.0019994523212180366, "loss": 0.2559, "step": 2056 }, { "epoch": 0.017855747779967186, "grad_norm": 0.0888671875, "learning_rate": 0.001999451282577922, "loss": 0.2158, "step": 2057 }, { "epoch": 0.01786442826017135, "grad_norm": 0.0712890625, "learning_rate": 0.001999450242954182, "loss": 0.209, "step": 2058 }, { "epoch": 0.017873108740375517, "grad_norm": 0.08837890625, "learning_rate": 0.001999449202346816, "loss": 0.2275, "step": 2059 }, { "epoch": 0.01788178922057968, "grad_norm": 0.06201171875, "learning_rate": 0.001999448160755826, "loss": 0.2188, "step": 2060 }, { "epoch": 0.017890469700783847, "grad_norm": 0.07080078125, "learning_rate": 0.001999447118181213, "loss": 0.2129, "step": 2061 }, { "epoch": 0.017899150180988012, "grad_norm": 0.1875, "learning_rate": 0.001999446074622978, "loss": 0.168, "step": 2062 }, { "epoch": 0.017907830661192177, "grad_norm": 0.08642578125, "learning_rate": 0.001999445030081123, "loss": 0.2471, "step": 2063 }, { "epoch": 0.017916511141396342, "grad_norm": 0.06396484375, "learning_rate": 0.001999443984555648, "loss": 0.2217, "step": 2064 }, { "epoch": 0.017925191621600507, "grad_norm": 0.054931640625, "learning_rate": 0.001999442938046554, "loss": 0.1797, "step": 2065 }, { "epoch": 0.017933872101804672, "grad_norm": 0.0771484375, "learning_rate": 0.0019994418905538436, "loss": 0.2695, "step": 2066 }, { "epoch": 0.017942552582008837, "grad_norm": 0.07275390625, "learning_rate": 0.0019994408420775166, "loss": 0.1992, "step": 2067 }, { "epoch": 0.017951233062213003, "grad_norm": 0.099609375, "learning_rate": 0.001999439792617575, "loss": 0.2334, "step": 2068 }, { "epoch": 0.017959913542417168, "grad_norm": 0.072265625, "learning_rate": 0.001999438742174019, "loss": 0.1738, "step": 2069 }, { "epoch": 0.017968594022621333, "grad_norm": 0.0654296875, "learning_rate": 0.0019994376907468505, "loss": 0.1914, "step": 2070 }, { "epoch": 0.017977274502825498, "grad_norm": 0.1171875, "learning_rate": 0.001999436638336071, "loss": 0.2852, "step": 2071 }, { "epoch": 0.01798595498302966, "grad_norm": 0.0732421875, "learning_rate": 0.0019994355849416805, "loss": 0.2539, "step": 2072 }, { "epoch": 0.017994635463233825, "grad_norm": 0.08203125, "learning_rate": 0.0019994345305636807, "loss": 0.2461, "step": 2073 }, { "epoch": 0.01800331594343799, "grad_norm": 0.08349609375, "learning_rate": 0.001999433475202073, "loss": 0.2129, "step": 2074 }, { "epoch": 0.018011996423642155, "grad_norm": 0.06494140625, "learning_rate": 0.0019994324188568583, "loss": 0.1943, "step": 2075 }, { "epoch": 0.01802067690384632, "grad_norm": 0.06591796875, "learning_rate": 0.001999431361528038, "loss": 0.2256, "step": 2076 }, { "epoch": 0.018029357384050485, "grad_norm": 0.10888671875, "learning_rate": 0.0019994303032156127, "loss": 0.2656, "step": 2077 }, { "epoch": 0.01803803786425465, "grad_norm": 0.07958984375, "learning_rate": 0.0019994292439195847, "loss": 0.2754, "step": 2078 }, { "epoch": 0.018046718344458815, "grad_norm": 0.09716796875, "learning_rate": 0.0019994281836399536, "loss": 0.1924, "step": 2079 }, { "epoch": 0.01805539882466298, "grad_norm": 0.078125, "learning_rate": 0.0019994271223767214, "loss": 0.2393, "step": 2080 }, { "epoch": 0.018064079304867146, "grad_norm": 0.06494140625, "learning_rate": 0.0019994260601298897, "loss": 0.1875, "step": 2081 }, { "epoch": 0.01807275978507131, "grad_norm": 0.06201171875, "learning_rate": 0.001999424996899459, "loss": 0.208, "step": 2082 }, { "epoch": 0.018081440265275476, "grad_norm": 0.0966796875, "learning_rate": 0.0019994239326854304, "loss": 0.2539, "step": 2083 }, { "epoch": 0.01809012074547964, "grad_norm": 0.06640625, "learning_rate": 0.0019994228674878054, "loss": 0.1719, "step": 2084 }, { "epoch": 0.018098801225683806, "grad_norm": 0.0791015625, "learning_rate": 0.0019994218013065852, "loss": 0.1914, "step": 2085 }, { "epoch": 0.01810748170588797, "grad_norm": 0.08349609375, "learning_rate": 0.001999420734141771, "loss": 0.2676, "step": 2086 }, { "epoch": 0.018116162186092133, "grad_norm": 0.197265625, "learning_rate": 0.0019994196659933634, "loss": 0.2852, "step": 2087 }, { "epoch": 0.018124842666296298, "grad_norm": 0.0830078125, "learning_rate": 0.001999418596861364, "loss": 0.2266, "step": 2088 }, { "epoch": 0.018133523146500463, "grad_norm": 0.11962890625, "learning_rate": 0.001999417526745774, "loss": 0.2812, "step": 2089 }, { "epoch": 0.01814220362670463, "grad_norm": 0.0830078125, "learning_rate": 0.0019994164556465946, "loss": 0.2207, "step": 2090 }, { "epoch": 0.018150884106908793, "grad_norm": 0.062255859375, "learning_rate": 0.001999415383563827, "loss": 0.2158, "step": 2091 }, { "epoch": 0.01815956458711296, "grad_norm": 0.083984375, "learning_rate": 0.001999414310497472, "loss": 0.2061, "step": 2092 }, { "epoch": 0.018168245067317124, "grad_norm": 0.06787109375, "learning_rate": 0.001999413236447531, "loss": 0.2012, "step": 2093 }, { "epoch": 0.01817692554752129, "grad_norm": 0.0947265625, "learning_rate": 0.0019994121614140053, "loss": 0.2363, "step": 2094 }, { "epoch": 0.018185606027725454, "grad_norm": 0.0947265625, "learning_rate": 0.001999411085396896, "loss": 0.1875, "step": 2095 }, { "epoch": 0.01819428650792962, "grad_norm": 0.060546875, "learning_rate": 0.0019994100083962044, "loss": 0.1924, "step": 2096 }, { "epoch": 0.018202966988133784, "grad_norm": 0.055908203125, "learning_rate": 0.001999408930411932, "loss": 0.1816, "step": 2097 }, { "epoch": 0.01821164746833795, "grad_norm": 0.0654296875, "learning_rate": 0.0019994078514440784, "loss": 0.1846, "step": 2098 }, { "epoch": 0.018220327948542114, "grad_norm": 0.0751953125, "learning_rate": 0.0019994067714926463, "loss": 0.2422, "step": 2099 }, { "epoch": 0.01822900842874628, "grad_norm": 0.0849609375, "learning_rate": 0.0019994056905576364, "loss": 0.2461, "step": 2100 }, { "epoch": 0.018237688908950445, "grad_norm": 0.10888671875, "learning_rate": 0.0019994046086390504, "loss": 0.2158, "step": 2101 }, { "epoch": 0.01824636938915461, "grad_norm": 0.0576171875, "learning_rate": 0.0019994035257368883, "loss": 0.2168, "step": 2102 }, { "epoch": 0.01825504986935877, "grad_norm": 0.0859375, "learning_rate": 0.001999402441851153, "loss": 0.2402, "step": 2103 }, { "epoch": 0.018263730349562936, "grad_norm": 0.06640625, "learning_rate": 0.001999401356981844, "loss": 0.209, "step": 2104 }, { "epoch": 0.0182724108297671, "grad_norm": 0.059326171875, "learning_rate": 0.0019994002711289636, "loss": 0.1914, "step": 2105 }, { "epoch": 0.018281091309971267, "grad_norm": 0.07666015625, "learning_rate": 0.001999399184292512, "loss": 0.2285, "step": 2106 }, { "epoch": 0.018289771790175432, "grad_norm": 0.07763671875, "learning_rate": 0.0019993980964724913, "loss": 0.1816, "step": 2107 }, { "epoch": 0.018298452270379597, "grad_norm": 0.078125, "learning_rate": 0.0019993970076689024, "loss": 0.2227, "step": 2108 }, { "epoch": 0.018307132750583762, "grad_norm": 0.0947265625, "learning_rate": 0.001999395917881746, "loss": 0.1885, "step": 2109 }, { "epoch": 0.018315813230787927, "grad_norm": 0.07763671875, "learning_rate": 0.0019993948271110245, "loss": 0.2402, "step": 2110 }, { "epoch": 0.018324493710992092, "grad_norm": 0.07421875, "learning_rate": 0.001999393735356738, "loss": 0.248, "step": 2111 }, { "epoch": 0.018333174191196257, "grad_norm": 0.107421875, "learning_rate": 0.001999392642618888, "loss": 0.2305, "step": 2112 }, { "epoch": 0.018341854671400423, "grad_norm": 0.0966796875, "learning_rate": 0.0019993915488974753, "loss": 0.2266, "step": 2113 }, { "epoch": 0.018350535151604588, "grad_norm": 0.1044921875, "learning_rate": 0.001999390454192502, "loss": 0.2402, "step": 2114 }, { "epoch": 0.018359215631808753, "grad_norm": 0.07177734375, "learning_rate": 0.001999389358503969, "loss": 0.1973, "step": 2115 }, { "epoch": 0.018367896112012918, "grad_norm": 0.0771484375, "learning_rate": 0.001999388261831877, "loss": 0.1836, "step": 2116 }, { "epoch": 0.018376576592217083, "grad_norm": 0.08056640625, "learning_rate": 0.0019993871641762273, "loss": 0.2305, "step": 2117 }, { "epoch": 0.018385257072421245, "grad_norm": 0.07373046875, "learning_rate": 0.0019993860655370217, "loss": 0.2441, "step": 2118 }, { "epoch": 0.01839393755262541, "grad_norm": 0.09716796875, "learning_rate": 0.001999384965914261, "loss": 0.2344, "step": 2119 }, { "epoch": 0.018402618032829575, "grad_norm": 0.08154296875, "learning_rate": 0.001999383865307946, "loss": 0.2275, "step": 2120 }, { "epoch": 0.01841129851303374, "grad_norm": 0.09228515625, "learning_rate": 0.001999382763718079, "loss": 0.208, "step": 2121 }, { "epoch": 0.018419978993237905, "grad_norm": 0.0830078125, "learning_rate": 0.00199938166114466, "loss": 0.2256, "step": 2122 }, { "epoch": 0.01842865947344207, "grad_norm": 0.072265625, "learning_rate": 0.0019993805575876907, "loss": 0.1816, "step": 2123 }, { "epoch": 0.018437339953646235, "grad_norm": 0.0869140625, "learning_rate": 0.001999379453047172, "loss": 0.25, "step": 2124 }, { "epoch": 0.0184460204338504, "grad_norm": 0.07666015625, "learning_rate": 0.0019993783475231062, "loss": 0.2266, "step": 2125 }, { "epoch": 0.018454700914054566, "grad_norm": 0.09521484375, "learning_rate": 0.0019993772410154937, "loss": 0.2285, "step": 2126 }, { "epoch": 0.01846338139425873, "grad_norm": 0.06591796875, "learning_rate": 0.0019993761335243354, "loss": 0.2344, "step": 2127 }, { "epoch": 0.018472061874462896, "grad_norm": 0.130859375, "learning_rate": 0.0019993750250496332, "loss": 0.2539, "step": 2128 }, { "epoch": 0.01848074235466706, "grad_norm": 0.08251953125, "learning_rate": 0.001999373915591388, "loss": 0.2344, "step": 2129 }, { "epoch": 0.018489422834871226, "grad_norm": 0.08837890625, "learning_rate": 0.0019993728051496003, "loss": 0.2559, "step": 2130 }, { "epoch": 0.01849810331507539, "grad_norm": 0.0810546875, "learning_rate": 0.0019993716937242727, "loss": 0.2109, "step": 2131 }, { "epoch": 0.018506783795279556, "grad_norm": 0.06494140625, "learning_rate": 0.0019993705813154054, "loss": 0.1807, "step": 2132 }, { "epoch": 0.01851546427548372, "grad_norm": 0.09375, "learning_rate": 0.0019993694679230002, "loss": 0.2383, "step": 2133 }, { "epoch": 0.018524144755687883, "grad_norm": 0.10400390625, "learning_rate": 0.001999368353547058, "loss": 0.2441, "step": 2134 }, { "epoch": 0.018532825235892048, "grad_norm": 0.080078125, "learning_rate": 0.00199936723818758, "loss": 0.1758, "step": 2135 }, { "epoch": 0.018541505716096213, "grad_norm": 0.1474609375, "learning_rate": 0.0019993661218445677, "loss": 0.207, "step": 2136 }, { "epoch": 0.01855018619630038, "grad_norm": 0.138671875, "learning_rate": 0.0019993650045180217, "loss": 0.2871, "step": 2137 }, { "epoch": 0.018558866676504544, "grad_norm": 0.07568359375, "learning_rate": 0.001999363886207944, "loss": 0.2305, "step": 2138 }, { "epoch": 0.01856754715670871, "grad_norm": 0.1943359375, "learning_rate": 0.001999362766914335, "loss": 0.3477, "step": 2139 }, { "epoch": 0.018576227636912874, "grad_norm": 0.078125, "learning_rate": 0.001999361646637197, "loss": 0.1924, "step": 2140 }, { "epoch": 0.01858490811711704, "grad_norm": 0.07958984375, "learning_rate": 0.0019993605253765304, "loss": 0.2129, "step": 2141 }, { "epoch": 0.018593588597321204, "grad_norm": 0.07421875, "learning_rate": 0.0019993594031323366, "loss": 0.2168, "step": 2142 }, { "epoch": 0.01860226907752537, "grad_norm": 0.06640625, "learning_rate": 0.001999358279904617, "loss": 0.1826, "step": 2143 }, { "epoch": 0.018610949557729534, "grad_norm": 0.0693359375, "learning_rate": 0.0019993571556933725, "loss": 0.2129, "step": 2144 }, { "epoch": 0.0186196300379337, "grad_norm": 0.08935546875, "learning_rate": 0.001999356030498605, "loss": 0.1885, "step": 2145 }, { "epoch": 0.018628310518137865, "grad_norm": 0.07958984375, "learning_rate": 0.0019993549043203144, "loss": 0.2441, "step": 2146 }, { "epoch": 0.01863699099834203, "grad_norm": 0.0703125, "learning_rate": 0.0019993537771585035, "loss": 0.2402, "step": 2147 }, { "epoch": 0.018645671478546195, "grad_norm": 0.072265625, "learning_rate": 0.001999352649013172, "loss": 0.1748, "step": 2148 }, { "epoch": 0.018654351958750356, "grad_norm": 0.1064453125, "learning_rate": 0.001999351519884323, "loss": 0.2637, "step": 2149 }, { "epoch": 0.01866303243895452, "grad_norm": 0.06298828125, "learning_rate": 0.001999350389771956, "loss": 0.1865, "step": 2150 }, { "epoch": 0.018671712919158687, "grad_norm": 0.0673828125, "learning_rate": 0.0019993492586760733, "loss": 0.2188, "step": 2151 }, { "epoch": 0.018680393399362852, "grad_norm": 0.0625, "learning_rate": 0.0019993481265966756, "loss": 0.1885, "step": 2152 }, { "epoch": 0.018689073879567017, "grad_norm": 0.34375, "learning_rate": 0.0019993469935337643, "loss": 0.3555, "step": 2153 }, { "epoch": 0.018697754359771182, "grad_norm": 0.08251953125, "learning_rate": 0.0019993458594873407, "loss": 0.2539, "step": 2154 }, { "epoch": 0.018706434839975347, "grad_norm": 0.2099609375, "learning_rate": 0.001999344724457406, "loss": 0.2129, "step": 2155 }, { "epoch": 0.018715115320179512, "grad_norm": 0.0830078125, "learning_rate": 0.0019993435884439613, "loss": 0.2109, "step": 2156 }, { "epoch": 0.018723795800383677, "grad_norm": 0.08984375, "learning_rate": 0.001999342451447008, "loss": 0.2285, "step": 2157 }, { "epoch": 0.018732476280587843, "grad_norm": 0.12109375, "learning_rate": 0.0019993413134665474, "loss": 0.2695, "step": 2158 }, { "epoch": 0.018741156760792008, "grad_norm": 0.130859375, "learning_rate": 0.0019993401745025804, "loss": 0.2168, "step": 2159 }, { "epoch": 0.018749837240996173, "grad_norm": 0.0869140625, "learning_rate": 0.0019993390345551085, "loss": 0.2383, "step": 2160 }, { "epoch": 0.018758517721200338, "grad_norm": 0.091796875, "learning_rate": 0.001999337893624133, "loss": 0.2021, "step": 2161 }, { "epoch": 0.018767198201404503, "grad_norm": 0.0888671875, "learning_rate": 0.0019993367517096555, "loss": 0.2227, "step": 2162 }, { "epoch": 0.018775878681608668, "grad_norm": 0.111328125, "learning_rate": 0.0019993356088116765, "loss": 0.2852, "step": 2163 }, { "epoch": 0.01878455916181283, "grad_norm": 0.3359375, "learning_rate": 0.001999334464930198, "loss": 0.2988, "step": 2164 }, { "epoch": 0.018793239642016995, "grad_norm": 0.07958984375, "learning_rate": 0.0019993333200652203, "loss": 0.2207, "step": 2165 }, { "epoch": 0.01880192012222116, "grad_norm": 0.11376953125, "learning_rate": 0.001999332174216745, "loss": 0.2773, "step": 2166 }, { "epoch": 0.018810600602425325, "grad_norm": 0.068359375, "learning_rate": 0.001999331027384774, "loss": 0.2002, "step": 2167 }, { "epoch": 0.01881928108262949, "grad_norm": 0.08251953125, "learning_rate": 0.0019993298795693082, "loss": 0.2441, "step": 2168 }, { "epoch": 0.018827961562833655, "grad_norm": 0.0859375, "learning_rate": 0.0019993287307703486, "loss": 0.2852, "step": 2169 }, { "epoch": 0.01883664204303782, "grad_norm": 0.08642578125, "learning_rate": 0.0019993275809878966, "loss": 0.2236, "step": 2170 }, { "epoch": 0.018845322523241986, "grad_norm": 0.07177734375, "learning_rate": 0.001999326430221953, "loss": 0.1709, "step": 2171 }, { "epoch": 0.01885400300344615, "grad_norm": 0.06982421875, "learning_rate": 0.0019993252784725204, "loss": 0.2197, "step": 2172 }, { "epoch": 0.018862683483650316, "grad_norm": 0.078125, "learning_rate": 0.0019993241257395987, "loss": 0.1836, "step": 2173 }, { "epoch": 0.01887136396385448, "grad_norm": 0.0771484375, "learning_rate": 0.0019993229720231902, "loss": 0.2344, "step": 2174 }, { "epoch": 0.018880044444058646, "grad_norm": 0.12890625, "learning_rate": 0.0019993218173232946, "loss": 0.3477, "step": 2175 }, { "epoch": 0.01888872492426281, "grad_norm": 0.058349609375, "learning_rate": 0.001999320661639915, "loss": 0.1797, "step": 2176 }, { "epoch": 0.018897405404466976, "grad_norm": 0.0693359375, "learning_rate": 0.001999319504973051, "loss": 0.2363, "step": 2177 }, { "epoch": 0.01890608588467114, "grad_norm": 0.107421875, "learning_rate": 0.0019993183473227057, "loss": 0.2275, "step": 2178 }, { "epoch": 0.018914766364875307, "grad_norm": 0.0908203125, "learning_rate": 0.0019993171886888788, "loss": 0.2344, "step": 2179 }, { "epoch": 0.018923446845079468, "grad_norm": 0.07568359375, "learning_rate": 0.0019993160290715725, "loss": 0.2656, "step": 2180 }, { "epoch": 0.018932127325283633, "grad_norm": 0.10205078125, "learning_rate": 0.0019993148684707873, "loss": 0.2197, "step": 2181 }, { "epoch": 0.0189408078054878, "grad_norm": 0.10107421875, "learning_rate": 0.001999313706886525, "loss": 0.1885, "step": 2182 }, { "epoch": 0.018949488285691964, "grad_norm": 0.07763671875, "learning_rate": 0.0019993125443187867, "loss": 0.2617, "step": 2183 }, { "epoch": 0.01895816876589613, "grad_norm": 0.07958984375, "learning_rate": 0.001999311380767574, "loss": 0.2344, "step": 2184 }, { "epoch": 0.018966849246100294, "grad_norm": 0.08154296875, "learning_rate": 0.0019993102162328877, "loss": 0.2148, "step": 2185 }, { "epoch": 0.01897552972630446, "grad_norm": 0.11474609375, "learning_rate": 0.001999309050714729, "loss": 0.2734, "step": 2186 }, { "epoch": 0.018984210206508624, "grad_norm": 0.080078125, "learning_rate": 0.0019993078842131, "loss": 0.1836, "step": 2187 }, { "epoch": 0.01899289068671279, "grad_norm": 0.08642578125, "learning_rate": 0.001999306716728001, "loss": 0.209, "step": 2188 }, { "epoch": 0.019001571166916954, "grad_norm": 0.09521484375, "learning_rate": 0.0019993055482594338, "loss": 0.249, "step": 2189 }, { "epoch": 0.01901025164712112, "grad_norm": 0.07177734375, "learning_rate": 0.0019993043788073994, "loss": 0.1807, "step": 2190 }, { "epoch": 0.019018932127325285, "grad_norm": 0.06201171875, "learning_rate": 0.001999303208371899, "loss": 0.1836, "step": 2191 }, { "epoch": 0.01902761260752945, "grad_norm": 0.0947265625, "learning_rate": 0.0019993020369529347, "loss": 0.2734, "step": 2192 }, { "epoch": 0.019036293087733615, "grad_norm": 0.06494140625, "learning_rate": 0.001999300864550507, "loss": 0.2031, "step": 2193 }, { "epoch": 0.01904497356793778, "grad_norm": 0.07666015625, "learning_rate": 0.0019992996911646172, "loss": 0.2305, "step": 2194 }, { "epoch": 0.01905365404814194, "grad_norm": 0.0830078125, "learning_rate": 0.0019992985167952672, "loss": 0.2266, "step": 2195 }, { "epoch": 0.019062334528346107, "grad_norm": 0.08203125, "learning_rate": 0.0019992973414424578, "loss": 0.2227, "step": 2196 }, { "epoch": 0.019071015008550272, "grad_norm": 0.0732421875, "learning_rate": 0.00199929616510619, "loss": 0.2363, "step": 2197 }, { "epoch": 0.019079695488754437, "grad_norm": 0.103515625, "learning_rate": 0.001999294987786466, "loss": 0.291, "step": 2198 }, { "epoch": 0.019088375968958602, "grad_norm": 0.0732421875, "learning_rate": 0.0019992938094832856, "loss": 0.1934, "step": 2199 }, { "epoch": 0.019097056449162767, "grad_norm": 0.08447265625, "learning_rate": 0.0019992926301966515, "loss": 0.3066, "step": 2200 }, { "epoch": 0.019105736929366932, "grad_norm": 0.1064453125, "learning_rate": 0.0019992914499265646, "loss": 0.2695, "step": 2201 }, { "epoch": 0.019114417409571097, "grad_norm": 0.0625, "learning_rate": 0.001999290268673026, "loss": 0.1787, "step": 2202 }, { "epoch": 0.019123097889775263, "grad_norm": 0.07470703125, "learning_rate": 0.0019992890864360367, "loss": 0.2461, "step": 2203 }, { "epoch": 0.019131778369979428, "grad_norm": 0.1005859375, "learning_rate": 0.001999287903215599, "loss": 0.2578, "step": 2204 }, { "epoch": 0.019140458850183593, "grad_norm": 0.08837890625, "learning_rate": 0.0019992867190117133, "loss": 0.2695, "step": 2205 }, { "epoch": 0.019149139330387758, "grad_norm": 0.0869140625, "learning_rate": 0.001999285533824381, "loss": 0.2363, "step": 2206 }, { "epoch": 0.019157819810591923, "grad_norm": 0.07470703125, "learning_rate": 0.0019992843476536034, "loss": 0.1836, "step": 2207 }, { "epoch": 0.019166500290796088, "grad_norm": 0.06494140625, "learning_rate": 0.0019992831604993826, "loss": 0.1533, "step": 2208 }, { "epoch": 0.019175180771000253, "grad_norm": 0.06494140625, "learning_rate": 0.001999281972361719, "loss": 0.1768, "step": 2209 }, { "epoch": 0.01918386125120442, "grad_norm": 0.06787109375, "learning_rate": 0.001999280783240614, "loss": 0.2354, "step": 2210 }, { "epoch": 0.01919254173140858, "grad_norm": 0.0693359375, "learning_rate": 0.001999279593136069, "loss": 0.2168, "step": 2211 }, { "epoch": 0.019201222211612745, "grad_norm": 0.1376953125, "learning_rate": 0.001999278402048085, "loss": 0.25, "step": 2212 }, { "epoch": 0.01920990269181691, "grad_norm": 0.07275390625, "learning_rate": 0.001999277209976664, "loss": 0.207, "step": 2213 }, { "epoch": 0.019218583172021075, "grad_norm": 0.07275390625, "learning_rate": 0.001999276016921807, "loss": 0.2305, "step": 2214 }, { "epoch": 0.01922726365222524, "grad_norm": 0.07861328125, "learning_rate": 0.0019992748228835153, "loss": 0.2383, "step": 2215 }, { "epoch": 0.019235944132429406, "grad_norm": 0.10546875, "learning_rate": 0.00199927362786179, "loss": 0.3027, "step": 2216 }, { "epoch": 0.01924462461263357, "grad_norm": 0.103515625, "learning_rate": 0.0019992724318566328, "loss": 0.2266, "step": 2217 }, { "epoch": 0.019253305092837736, "grad_norm": 0.05859375, "learning_rate": 0.0019992712348680447, "loss": 0.1816, "step": 2218 }, { "epoch": 0.0192619855730419, "grad_norm": 0.06787109375, "learning_rate": 0.001999270036896027, "loss": 0.2158, "step": 2219 }, { "epoch": 0.019270666053246066, "grad_norm": 0.10205078125, "learning_rate": 0.0019992688379405813, "loss": 0.2734, "step": 2220 }, { "epoch": 0.01927934653345023, "grad_norm": 0.0693359375, "learning_rate": 0.0019992676380017086, "loss": 0.1787, "step": 2221 }, { "epoch": 0.019288027013654396, "grad_norm": 0.0966796875, "learning_rate": 0.0019992664370794104, "loss": 0.2559, "step": 2222 }, { "epoch": 0.01929670749385856, "grad_norm": 0.08447265625, "learning_rate": 0.001999265235173688, "loss": 0.2441, "step": 2223 }, { "epoch": 0.019305387974062727, "grad_norm": 0.10205078125, "learning_rate": 0.0019992640322845424, "loss": 0.2432, "step": 2224 }, { "epoch": 0.01931406845426689, "grad_norm": 0.076171875, "learning_rate": 0.001999262828411975, "loss": 0.1992, "step": 2225 }, { "epoch": 0.019322748934471053, "grad_norm": 0.1005859375, "learning_rate": 0.0019992616235559877, "loss": 0.2344, "step": 2226 }, { "epoch": 0.01933142941467522, "grad_norm": 0.09326171875, "learning_rate": 0.0019992604177165815, "loss": 0.2402, "step": 2227 }, { "epoch": 0.019340109894879384, "grad_norm": 0.1455078125, "learning_rate": 0.001999259210893757, "loss": 0.2617, "step": 2228 }, { "epoch": 0.01934879037508355, "grad_norm": 0.06298828125, "learning_rate": 0.0019992580030875166, "loss": 0.1934, "step": 2229 }, { "epoch": 0.019357470855287714, "grad_norm": 0.099609375, "learning_rate": 0.001999256794297861, "loss": 0.1885, "step": 2230 }, { "epoch": 0.01936615133549188, "grad_norm": 0.07568359375, "learning_rate": 0.001999255584524792, "loss": 0.2188, "step": 2231 }, { "epoch": 0.019374831815696044, "grad_norm": 0.125, "learning_rate": 0.0019992543737683104, "loss": 0.2617, "step": 2232 }, { "epoch": 0.01938351229590021, "grad_norm": 0.07958984375, "learning_rate": 0.0019992531620284175, "loss": 0.1914, "step": 2233 }, { "epoch": 0.019392192776104374, "grad_norm": 0.07666015625, "learning_rate": 0.001999251949305115, "loss": 0.209, "step": 2234 }, { "epoch": 0.01940087325630854, "grad_norm": 0.10693359375, "learning_rate": 0.0019992507355984044, "loss": 0.209, "step": 2235 }, { "epoch": 0.019409553736512705, "grad_norm": 0.11181640625, "learning_rate": 0.001999249520908286, "loss": 0.2656, "step": 2236 }, { "epoch": 0.01941823421671687, "grad_norm": 0.515625, "learning_rate": 0.0019992483052347623, "loss": 0.625, "step": 2237 }, { "epoch": 0.019426914696921035, "grad_norm": 0.10498046875, "learning_rate": 0.0019992470885778343, "loss": 0.2559, "step": 2238 }, { "epoch": 0.0194355951771252, "grad_norm": 0.0927734375, "learning_rate": 0.001999245870937503, "loss": 0.248, "step": 2239 }, { "epoch": 0.019444275657329365, "grad_norm": 0.11083984375, "learning_rate": 0.00199924465231377, "loss": 0.2148, "step": 2240 }, { "epoch": 0.019452956137533527, "grad_norm": 0.08984375, "learning_rate": 0.0019992434327066365, "loss": 0.207, "step": 2241 }, { "epoch": 0.019461636617737692, "grad_norm": 0.1318359375, "learning_rate": 0.001999242212116104, "loss": 0.3105, "step": 2242 }, { "epoch": 0.019470317097941857, "grad_norm": 0.080078125, "learning_rate": 0.0019992409905421732, "loss": 0.2178, "step": 2243 }, { "epoch": 0.019478997578146022, "grad_norm": 0.083984375, "learning_rate": 0.0019992397679848466, "loss": 0.2598, "step": 2244 }, { "epoch": 0.019487678058350187, "grad_norm": 0.083984375, "learning_rate": 0.0019992385444441247, "loss": 0.2109, "step": 2245 }, { "epoch": 0.019496358538554352, "grad_norm": 0.1044921875, "learning_rate": 0.001999237319920009, "loss": 0.2383, "step": 2246 }, { "epoch": 0.019505039018758517, "grad_norm": 0.07275390625, "learning_rate": 0.001999236094412501, "loss": 0.2383, "step": 2247 }, { "epoch": 0.019513719498962682, "grad_norm": 0.06640625, "learning_rate": 0.0019992348679216017, "loss": 0.207, "step": 2248 }, { "epoch": 0.019522399979166848, "grad_norm": 0.125, "learning_rate": 0.001999233640447313, "loss": 0.2656, "step": 2249 }, { "epoch": 0.019531080459371013, "grad_norm": 0.07958984375, "learning_rate": 0.0019992324119896356, "loss": 0.207, "step": 2250 }, { "epoch": 0.019539760939575178, "grad_norm": 0.07666015625, "learning_rate": 0.001999231182548571, "loss": 0.2188, "step": 2251 }, { "epoch": 0.019548441419779343, "grad_norm": 0.08447265625, "learning_rate": 0.001999229952124121, "loss": 0.1982, "step": 2252 }, { "epoch": 0.019557121899983508, "grad_norm": 0.1015625, "learning_rate": 0.0019992287207162866, "loss": 0.2812, "step": 2253 }, { "epoch": 0.019565802380187673, "grad_norm": 0.1064453125, "learning_rate": 0.001999227488325069, "loss": 0.2598, "step": 2254 }, { "epoch": 0.01957448286039184, "grad_norm": 0.1767578125, "learning_rate": 0.00199922625495047, "loss": 0.2266, "step": 2255 }, { "epoch": 0.019583163340596003, "grad_norm": 0.08935546875, "learning_rate": 0.0019992250205924903, "loss": 0.248, "step": 2256 }, { "epoch": 0.019591843820800165, "grad_norm": 0.07568359375, "learning_rate": 0.001999223785251132, "loss": 0.1768, "step": 2257 }, { "epoch": 0.01960052430100433, "grad_norm": 0.21875, "learning_rate": 0.001999222548926396, "loss": 0.3125, "step": 2258 }, { "epoch": 0.019609204781208495, "grad_norm": 0.083984375, "learning_rate": 0.0019992213116182835, "loss": 0.2324, "step": 2259 }, { "epoch": 0.01961788526141266, "grad_norm": 0.09033203125, "learning_rate": 0.0019992200733267964, "loss": 0.248, "step": 2260 }, { "epoch": 0.019626565741616826, "grad_norm": 0.0791015625, "learning_rate": 0.0019992188340519357, "loss": 0.2402, "step": 2261 }, { "epoch": 0.01963524622182099, "grad_norm": 0.061767578125, "learning_rate": 0.0019992175937937023, "loss": 0.1953, "step": 2262 }, { "epoch": 0.019643926702025156, "grad_norm": 0.09228515625, "learning_rate": 0.0019992163525520985, "loss": 0.2158, "step": 2263 }, { "epoch": 0.01965260718222932, "grad_norm": 0.10205078125, "learning_rate": 0.0019992151103271254, "loss": 0.2285, "step": 2264 }, { "epoch": 0.019661287662433486, "grad_norm": 0.091796875, "learning_rate": 0.0019992138671187836, "loss": 0.249, "step": 2265 }, { "epoch": 0.01966996814263765, "grad_norm": 0.7109375, "learning_rate": 0.0019992126229270756, "loss": 0.5078, "step": 2266 }, { "epoch": 0.019678648622841816, "grad_norm": 0.08203125, "learning_rate": 0.001999211377752002, "loss": 0.2207, "step": 2267 }, { "epoch": 0.01968732910304598, "grad_norm": 0.10205078125, "learning_rate": 0.001999210131593564, "loss": 0.2002, "step": 2268 }, { "epoch": 0.019696009583250147, "grad_norm": 0.150390625, "learning_rate": 0.0019992088844517637, "loss": 0.2061, "step": 2269 }, { "epoch": 0.01970469006345431, "grad_norm": 0.142578125, "learning_rate": 0.001999207636326602, "loss": 0.2207, "step": 2270 }, { "epoch": 0.019713370543658477, "grad_norm": 0.0927734375, "learning_rate": 0.00199920638721808, "loss": 0.2227, "step": 2271 }, { "epoch": 0.01972205102386264, "grad_norm": 0.12451171875, "learning_rate": 0.0019992051371262, "loss": 0.2539, "step": 2272 }, { "epoch": 0.019730731504066804, "grad_norm": 0.125, "learning_rate": 0.0019992038860509624, "loss": 0.2539, "step": 2273 }, { "epoch": 0.01973941198427097, "grad_norm": 0.051025390625, "learning_rate": 0.001999202633992369, "loss": 0.1719, "step": 2274 }, { "epoch": 0.019748092464475134, "grad_norm": 0.09814453125, "learning_rate": 0.0019992013809504213, "loss": 0.2051, "step": 2275 }, { "epoch": 0.0197567729446793, "grad_norm": 0.0859375, "learning_rate": 0.00199920012692512, "loss": 0.2393, "step": 2276 }, { "epoch": 0.019765453424883464, "grad_norm": 0.0732421875, "learning_rate": 0.001999198871916467, "loss": 0.2148, "step": 2277 }, { "epoch": 0.01977413390508763, "grad_norm": 0.080078125, "learning_rate": 0.001999197615924464, "loss": 0.2236, "step": 2278 }, { "epoch": 0.019782814385291794, "grad_norm": 0.083984375, "learning_rate": 0.001999196358949112, "loss": 0.2012, "step": 2279 }, { "epoch": 0.01979149486549596, "grad_norm": 0.10595703125, "learning_rate": 0.0019991951009904123, "loss": 0.2383, "step": 2280 }, { "epoch": 0.019800175345700124, "grad_norm": 0.08544921875, "learning_rate": 0.0019991938420483666, "loss": 0.2207, "step": 2281 }, { "epoch": 0.01980885582590429, "grad_norm": 0.0966796875, "learning_rate": 0.001999192582122975, "loss": 0.2461, "step": 2282 }, { "epoch": 0.019817536306108455, "grad_norm": 0.056640625, "learning_rate": 0.0019991913212142414, "loss": 0.1719, "step": 2283 }, { "epoch": 0.01982621678631262, "grad_norm": 0.080078125, "learning_rate": 0.001999190059322165, "loss": 0.2109, "step": 2284 }, { "epoch": 0.019834897266516785, "grad_norm": 0.1357421875, "learning_rate": 0.0019991887964467474, "loss": 0.6211, "step": 2285 }, { "epoch": 0.01984357774672095, "grad_norm": 0.08984375, "learning_rate": 0.001999187532587991, "loss": 0.2656, "step": 2286 }, { "epoch": 0.019852258226925115, "grad_norm": 0.06494140625, "learning_rate": 0.0019991862677458962, "loss": 0.2461, "step": 2287 }, { "epoch": 0.019860938707129277, "grad_norm": 0.06640625, "learning_rate": 0.001999185001920465, "loss": 0.1777, "step": 2288 }, { "epoch": 0.019869619187333442, "grad_norm": 0.09228515625, "learning_rate": 0.001999183735111699, "loss": 0.3047, "step": 2289 }, { "epoch": 0.019878299667537607, "grad_norm": 0.087890625, "learning_rate": 0.0019991824673195983, "loss": 0.2637, "step": 2290 }, { "epoch": 0.019886980147741772, "grad_norm": 0.1015625, "learning_rate": 0.0019991811985441655, "loss": 0.2539, "step": 2291 }, { "epoch": 0.019895660627945937, "grad_norm": 0.054931640625, "learning_rate": 0.001999179928785402, "loss": 0.1865, "step": 2292 }, { "epoch": 0.019904341108150102, "grad_norm": 0.07373046875, "learning_rate": 0.001999178658043309, "loss": 0.2363, "step": 2293 }, { "epoch": 0.019913021588354268, "grad_norm": 0.1552734375, "learning_rate": 0.0019991773863178874, "loss": 0.2871, "step": 2294 }, { "epoch": 0.019921702068558433, "grad_norm": 0.0712890625, "learning_rate": 0.0019991761136091387, "loss": 0.2344, "step": 2295 }, { "epoch": 0.019930382548762598, "grad_norm": 0.08642578125, "learning_rate": 0.001999174839917065, "loss": 0.2422, "step": 2296 }, { "epoch": 0.019939063028966763, "grad_norm": 0.08740234375, "learning_rate": 0.0019991735652416668, "loss": 0.2324, "step": 2297 }, { "epoch": 0.019947743509170928, "grad_norm": 0.0693359375, "learning_rate": 0.001999172289582946, "loss": 0.25, "step": 2298 }, { "epoch": 0.019956423989375093, "grad_norm": 0.078125, "learning_rate": 0.001999171012940904, "loss": 0.2422, "step": 2299 }, { "epoch": 0.01996510446957926, "grad_norm": 0.080078125, "learning_rate": 0.001999169735315542, "loss": 0.1914, "step": 2300 }, { "epoch": 0.019973784949783423, "grad_norm": 0.06494140625, "learning_rate": 0.0019991684567068615, "loss": 0.252, "step": 2301 }, { "epoch": 0.01998246542998759, "grad_norm": 0.0712890625, "learning_rate": 0.001999167177114864, "loss": 0.1748, "step": 2302 }, { "epoch": 0.01999114591019175, "grad_norm": 0.07568359375, "learning_rate": 0.0019991658965395503, "loss": 0.2441, "step": 2303 }, { "epoch": 0.019999826390395915, "grad_norm": 0.083984375, "learning_rate": 0.001999164614980923, "loss": 0.25, "step": 2304 }, { "epoch": 0.02000850687060008, "grad_norm": 0.080078125, "learning_rate": 0.001999163332438982, "loss": 0.1992, "step": 2305 }, { "epoch": 0.020017187350804246, "grad_norm": 0.08544921875, "learning_rate": 0.00199916204891373, "loss": 0.2949, "step": 2306 }, { "epoch": 0.02002586783100841, "grad_norm": 0.0654296875, "learning_rate": 0.0019991607644051683, "loss": 0.1953, "step": 2307 }, { "epoch": 0.020034548311212576, "grad_norm": 0.189453125, "learning_rate": 0.0019991594789132974, "loss": 0.3125, "step": 2308 }, { "epoch": 0.02004322879141674, "grad_norm": 0.080078125, "learning_rate": 0.0019991581924381193, "loss": 0.1992, "step": 2309 }, { "epoch": 0.020051909271620906, "grad_norm": 0.08349609375, "learning_rate": 0.001999156904979635, "loss": 0.2217, "step": 2310 }, { "epoch": 0.02006058975182507, "grad_norm": 0.07080078125, "learning_rate": 0.0019991556165378462, "loss": 0.2041, "step": 2311 }, { "epoch": 0.020069270232029236, "grad_norm": 0.0634765625, "learning_rate": 0.0019991543271127552, "loss": 0.2021, "step": 2312 }, { "epoch": 0.0200779507122334, "grad_norm": 0.0849609375, "learning_rate": 0.0019991530367043614, "loss": 0.2109, "step": 2313 }, { "epoch": 0.020086631192437567, "grad_norm": 0.08251953125, "learning_rate": 0.001999151745312668, "loss": 0.2422, "step": 2314 }, { "epoch": 0.02009531167264173, "grad_norm": 0.068359375, "learning_rate": 0.0019991504529376755, "loss": 0.1924, "step": 2315 }, { "epoch": 0.020103992152845897, "grad_norm": 0.076171875, "learning_rate": 0.0019991491595793857, "loss": 0.2559, "step": 2316 }, { "epoch": 0.020112672633050062, "grad_norm": 0.07568359375, "learning_rate": 0.0019991478652378, "loss": 0.2119, "step": 2317 }, { "epoch": 0.020121353113254224, "grad_norm": 0.0869140625, "learning_rate": 0.0019991465699129196, "loss": 0.2344, "step": 2318 }, { "epoch": 0.02013003359345839, "grad_norm": 0.1083984375, "learning_rate": 0.001999145273604746, "loss": 0.2109, "step": 2319 }, { "epoch": 0.020138714073662554, "grad_norm": 0.06689453125, "learning_rate": 0.0019991439763132808, "loss": 0.1826, "step": 2320 }, { "epoch": 0.02014739455386672, "grad_norm": 0.0810546875, "learning_rate": 0.001999142678038525, "loss": 0.2305, "step": 2321 }, { "epoch": 0.020156075034070884, "grad_norm": 0.08984375, "learning_rate": 0.001999141378780481, "loss": 0.209, "step": 2322 }, { "epoch": 0.02016475551427505, "grad_norm": 0.0810546875, "learning_rate": 0.0019991400785391487, "loss": 0.2598, "step": 2323 }, { "epoch": 0.020173435994479214, "grad_norm": 0.09619140625, "learning_rate": 0.0019991387773145306, "loss": 0.2148, "step": 2324 }, { "epoch": 0.02018211647468338, "grad_norm": 0.07763671875, "learning_rate": 0.001999137475106627, "loss": 0.2188, "step": 2325 }, { "epoch": 0.020190796954887544, "grad_norm": 0.08203125, "learning_rate": 0.0019991361719154414, "loss": 0.2461, "step": 2326 }, { "epoch": 0.02019947743509171, "grad_norm": 0.06298828125, "learning_rate": 0.0019991348677409733, "loss": 0.1836, "step": 2327 }, { "epoch": 0.020208157915295875, "grad_norm": 0.12109375, "learning_rate": 0.0019991335625832254, "loss": 0.252, "step": 2328 }, { "epoch": 0.02021683839550004, "grad_norm": 0.052490234375, "learning_rate": 0.001999132256442198, "loss": 0.1572, "step": 2329 }, { "epoch": 0.020225518875704205, "grad_norm": 0.09228515625, "learning_rate": 0.001999130949317893, "loss": 0.2188, "step": 2330 }, { "epoch": 0.02023419935590837, "grad_norm": 0.09423828125, "learning_rate": 0.0019991296412103124, "loss": 0.2363, "step": 2331 }, { "epoch": 0.020242879836112535, "grad_norm": 0.0703125, "learning_rate": 0.0019991283321194567, "loss": 0.2119, "step": 2332 }, { "epoch": 0.0202515603163167, "grad_norm": 0.0751953125, "learning_rate": 0.001999127022045328, "loss": 0.1855, "step": 2333 }, { "epoch": 0.020260240796520862, "grad_norm": 0.0693359375, "learning_rate": 0.001999125710987928, "loss": 0.2188, "step": 2334 }, { "epoch": 0.020268921276725027, "grad_norm": 0.099609375, "learning_rate": 0.0019991243989472564, "loss": 0.2559, "step": 2335 }, { "epoch": 0.020277601756929192, "grad_norm": 0.068359375, "learning_rate": 0.001999123085923317, "loss": 0.1797, "step": 2336 }, { "epoch": 0.020286282237133357, "grad_norm": 0.09814453125, "learning_rate": 0.001999121771916109, "loss": 0.2402, "step": 2337 }, { "epoch": 0.020294962717337522, "grad_norm": 0.1005859375, "learning_rate": 0.001999120456925636, "loss": 0.2617, "step": 2338 }, { "epoch": 0.020303643197541688, "grad_norm": 0.09130859375, "learning_rate": 0.001999119140951898, "loss": 0.252, "step": 2339 }, { "epoch": 0.020312323677745853, "grad_norm": 0.09521484375, "learning_rate": 0.001999117823994897, "loss": 0.2207, "step": 2340 }, { "epoch": 0.020321004157950018, "grad_norm": 0.0859375, "learning_rate": 0.0019991165060546335, "loss": 0.2539, "step": 2341 }, { "epoch": 0.020329684638154183, "grad_norm": 0.0771484375, "learning_rate": 0.0019991151871311106, "loss": 0.1953, "step": 2342 }, { "epoch": 0.020338365118358348, "grad_norm": 0.060546875, "learning_rate": 0.0019991138672243282, "loss": 0.165, "step": 2343 }, { "epoch": 0.020347045598562513, "grad_norm": 0.08447265625, "learning_rate": 0.001999112546334289, "loss": 0.2002, "step": 2344 }, { "epoch": 0.02035572607876668, "grad_norm": 0.0849609375, "learning_rate": 0.0019991112244609932, "loss": 0.252, "step": 2345 }, { "epoch": 0.020364406558970843, "grad_norm": 0.07861328125, "learning_rate": 0.0019991099016044432, "loss": 0.2441, "step": 2346 }, { "epoch": 0.02037308703917501, "grad_norm": 0.0869140625, "learning_rate": 0.0019991085777646403, "loss": 0.2188, "step": 2347 }, { "epoch": 0.020381767519379174, "grad_norm": 0.09130859375, "learning_rate": 0.0019991072529415856, "loss": 0.2236, "step": 2348 }, { "epoch": 0.020390447999583335, "grad_norm": 0.09765625, "learning_rate": 0.0019991059271352806, "loss": 0.3047, "step": 2349 }, { "epoch": 0.0203991284797875, "grad_norm": 0.0810546875, "learning_rate": 0.001999104600345727, "loss": 0.248, "step": 2350 }, { "epoch": 0.020407808959991666, "grad_norm": 0.09130859375, "learning_rate": 0.001999103272572926, "loss": 0.2344, "step": 2351 }, { "epoch": 0.02041648944019583, "grad_norm": 0.0634765625, "learning_rate": 0.0019991019438168793, "loss": 0.1533, "step": 2352 }, { "epoch": 0.020425169920399996, "grad_norm": 0.1015625, "learning_rate": 0.001999100614077588, "loss": 0.252, "step": 2353 }, { "epoch": 0.02043385040060416, "grad_norm": 0.09521484375, "learning_rate": 0.0019990992833550537, "loss": 0.2246, "step": 2354 }, { "epoch": 0.020442530880808326, "grad_norm": 0.1416015625, "learning_rate": 0.001999097951649278, "loss": 0.2539, "step": 2355 }, { "epoch": 0.02045121136101249, "grad_norm": 0.06689453125, "learning_rate": 0.001999096618960263, "loss": 0.1904, "step": 2356 }, { "epoch": 0.020459891841216656, "grad_norm": 0.09130859375, "learning_rate": 0.0019990952852880087, "loss": 0.2373, "step": 2357 }, { "epoch": 0.02046857232142082, "grad_norm": 0.08740234375, "learning_rate": 0.0019990939506325175, "loss": 0.2617, "step": 2358 }, { "epoch": 0.020477252801624986, "grad_norm": 0.06787109375, "learning_rate": 0.0019990926149937908, "loss": 0.1689, "step": 2359 }, { "epoch": 0.02048593328182915, "grad_norm": 0.1044921875, "learning_rate": 0.0019990912783718296, "loss": 0.2246, "step": 2360 }, { "epoch": 0.020494613762033317, "grad_norm": 0.07568359375, "learning_rate": 0.001999089940766636, "loss": 0.2129, "step": 2361 }, { "epoch": 0.020503294242237482, "grad_norm": 0.06298828125, "learning_rate": 0.001999088602178211, "loss": 0.1787, "step": 2362 }, { "epoch": 0.020511974722441647, "grad_norm": 0.095703125, "learning_rate": 0.001999087262606556, "loss": 0.2598, "step": 2363 }, { "epoch": 0.020520655202645812, "grad_norm": 0.057373046875, "learning_rate": 0.001999085922051673, "loss": 0.1963, "step": 2364 }, { "epoch": 0.020529335682849974, "grad_norm": 0.06103515625, "learning_rate": 0.0019990845805135634, "loss": 0.2051, "step": 2365 }, { "epoch": 0.02053801616305414, "grad_norm": 0.08984375, "learning_rate": 0.001999083237992228, "loss": 0.25, "step": 2366 }, { "epoch": 0.020546696643258304, "grad_norm": 0.07275390625, "learning_rate": 0.0019990818944876686, "loss": 0.2402, "step": 2367 }, { "epoch": 0.02055537712346247, "grad_norm": 0.09619140625, "learning_rate": 0.0019990805499998866, "loss": 0.2227, "step": 2368 }, { "epoch": 0.020564057603666634, "grad_norm": 0.09228515625, "learning_rate": 0.0019990792045288842, "loss": 0.2383, "step": 2369 }, { "epoch": 0.0205727380838708, "grad_norm": 0.072265625, "learning_rate": 0.001999077858074662, "loss": 0.2148, "step": 2370 }, { "epoch": 0.020581418564074964, "grad_norm": 0.078125, "learning_rate": 0.001999076510637222, "loss": 0.2266, "step": 2371 }, { "epoch": 0.02059009904427913, "grad_norm": 0.1005859375, "learning_rate": 0.001999075162216565, "loss": 0.2598, "step": 2372 }, { "epoch": 0.020598779524483295, "grad_norm": 0.083984375, "learning_rate": 0.001999073812812693, "loss": 0.2148, "step": 2373 }, { "epoch": 0.02060746000468746, "grad_norm": 0.09033203125, "learning_rate": 0.001999072462425608, "loss": 0.2422, "step": 2374 }, { "epoch": 0.020616140484891625, "grad_norm": 0.11083984375, "learning_rate": 0.00199907111105531, "loss": 0.2402, "step": 2375 }, { "epoch": 0.02062482096509579, "grad_norm": 0.0849609375, "learning_rate": 0.0019990697587018015, "loss": 0.2285, "step": 2376 }, { "epoch": 0.020633501445299955, "grad_norm": 0.080078125, "learning_rate": 0.001999068405365084, "loss": 0.2041, "step": 2377 }, { "epoch": 0.02064218192550412, "grad_norm": 0.09619140625, "learning_rate": 0.001999067051045159, "loss": 0.2227, "step": 2378 }, { "epoch": 0.020650862405708285, "grad_norm": 0.06689453125, "learning_rate": 0.0019990656957420277, "loss": 0.1934, "step": 2379 }, { "epoch": 0.020659542885912447, "grad_norm": 0.08056640625, "learning_rate": 0.001999064339455692, "loss": 0.2324, "step": 2380 }, { "epoch": 0.020668223366116612, "grad_norm": 0.08056640625, "learning_rate": 0.0019990629821861525, "loss": 0.2061, "step": 2381 }, { "epoch": 0.020676903846320777, "grad_norm": 0.0576171875, "learning_rate": 0.0019990616239334113, "loss": 0.1826, "step": 2382 }, { "epoch": 0.020685584326524942, "grad_norm": 0.083984375, "learning_rate": 0.0019990602646974697, "loss": 0.2363, "step": 2383 }, { "epoch": 0.020694264806729108, "grad_norm": 0.08056640625, "learning_rate": 0.0019990589044783296, "loss": 0.2305, "step": 2384 }, { "epoch": 0.020702945286933273, "grad_norm": 0.099609375, "learning_rate": 0.001999057543275992, "loss": 0.2617, "step": 2385 }, { "epoch": 0.020711625767137438, "grad_norm": 0.0703125, "learning_rate": 0.0019990561810904585, "loss": 0.21, "step": 2386 }, { "epoch": 0.020720306247341603, "grad_norm": 0.08642578125, "learning_rate": 0.001999054817921731, "loss": 0.2354, "step": 2387 }, { "epoch": 0.020728986727545768, "grad_norm": 0.10205078125, "learning_rate": 0.0019990534537698106, "loss": 0.2393, "step": 2388 }, { "epoch": 0.020737667207749933, "grad_norm": 0.0791015625, "learning_rate": 0.0019990520886346984, "loss": 0.2344, "step": 2389 }, { "epoch": 0.020746347687954098, "grad_norm": 0.09130859375, "learning_rate": 0.001999050722516397, "loss": 0.2266, "step": 2390 }, { "epoch": 0.020755028168158263, "grad_norm": 0.08642578125, "learning_rate": 0.001999049355414907, "loss": 0.2617, "step": 2391 }, { "epoch": 0.02076370864836243, "grad_norm": 0.09375, "learning_rate": 0.00199904798733023, "loss": 0.2197, "step": 2392 }, { "epoch": 0.020772389128566594, "grad_norm": 0.08056640625, "learning_rate": 0.0019990466182623675, "loss": 0.2539, "step": 2393 }, { "epoch": 0.02078106960877076, "grad_norm": 0.142578125, "learning_rate": 0.0019990452482113214, "loss": 0.3516, "step": 2394 }, { "epoch": 0.02078975008897492, "grad_norm": 0.0751953125, "learning_rate": 0.0019990438771770925, "loss": 0.2188, "step": 2395 }, { "epoch": 0.020798430569179086, "grad_norm": 0.10302734375, "learning_rate": 0.001999042505159683, "loss": 0.2656, "step": 2396 }, { "epoch": 0.02080711104938325, "grad_norm": 0.12158203125, "learning_rate": 0.0019990411321590944, "loss": 0.25, "step": 2397 }, { "epoch": 0.020815791529587416, "grad_norm": 0.0791015625, "learning_rate": 0.001999039758175328, "loss": 0.1895, "step": 2398 }, { "epoch": 0.02082447200979158, "grad_norm": 0.07373046875, "learning_rate": 0.0019990383832083846, "loss": 0.2148, "step": 2399 }, { "epoch": 0.020833152489995746, "grad_norm": 0.455078125, "learning_rate": 0.001999037007258267, "loss": 0.3008, "step": 2400 }, { "epoch": 0.02084183297019991, "grad_norm": 0.06982421875, "learning_rate": 0.0019990356303249755, "loss": 0.1992, "step": 2401 }, { "epoch": 0.020850513450404076, "grad_norm": 0.08740234375, "learning_rate": 0.0019990342524085123, "loss": 0.2246, "step": 2402 }, { "epoch": 0.02085919393060824, "grad_norm": 0.08154296875, "learning_rate": 0.001999032873508879, "loss": 0.1934, "step": 2403 }, { "epoch": 0.020867874410812406, "grad_norm": 0.1611328125, "learning_rate": 0.0019990314936260767, "loss": 0.5469, "step": 2404 }, { "epoch": 0.02087655489101657, "grad_norm": 0.09521484375, "learning_rate": 0.0019990301127601065, "loss": 0.2227, "step": 2405 }, { "epoch": 0.020885235371220737, "grad_norm": 0.0810546875, "learning_rate": 0.001999028730910971, "loss": 0.208, "step": 2406 }, { "epoch": 0.020893915851424902, "grad_norm": 0.095703125, "learning_rate": 0.0019990273480786714, "loss": 0.2344, "step": 2407 }, { "epoch": 0.020902596331629067, "grad_norm": 0.072265625, "learning_rate": 0.0019990259642632085, "loss": 0.1797, "step": 2408 }, { "epoch": 0.020911276811833232, "grad_norm": 0.07958984375, "learning_rate": 0.0019990245794645847, "loss": 0.2227, "step": 2409 }, { "epoch": 0.020919957292037397, "grad_norm": 0.058349609375, "learning_rate": 0.001999023193682801, "loss": 0.1953, "step": 2410 }, { "epoch": 0.02092863777224156, "grad_norm": 0.09326171875, "learning_rate": 0.001999021806917859, "loss": 0.1855, "step": 2411 }, { "epoch": 0.020937318252445724, "grad_norm": 0.064453125, "learning_rate": 0.0019990204191697603, "loss": 0.209, "step": 2412 }, { "epoch": 0.02094599873264989, "grad_norm": 0.10595703125, "learning_rate": 0.0019990190304385066, "loss": 0.2246, "step": 2413 }, { "epoch": 0.020954679212854054, "grad_norm": 0.0908203125, "learning_rate": 0.001999017640724099, "loss": 0.2695, "step": 2414 }, { "epoch": 0.02096335969305822, "grad_norm": 0.0751953125, "learning_rate": 0.001999016250026539, "loss": 0.2148, "step": 2415 }, { "epoch": 0.020972040173262384, "grad_norm": 0.09716796875, "learning_rate": 0.001999014858345829, "loss": 0.291, "step": 2416 }, { "epoch": 0.02098072065346655, "grad_norm": 0.10009765625, "learning_rate": 0.001999013465681969, "loss": 0.248, "step": 2417 }, { "epoch": 0.020989401133670715, "grad_norm": 0.0849609375, "learning_rate": 0.001999012072034962, "loss": 0.2344, "step": 2418 }, { "epoch": 0.02099808161387488, "grad_norm": 0.0732421875, "learning_rate": 0.0019990106774048087, "loss": 0.1865, "step": 2419 }, { "epoch": 0.021006762094079045, "grad_norm": 0.064453125, "learning_rate": 0.0019990092817915106, "loss": 0.2285, "step": 2420 }, { "epoch": 0.02101544257428321, "grad_norm": 0.08349609375, "learning_rate": 0.0019990078851950697, "loss": 0.207, "step": 2421 }, { "epoch": 0.021024123054487375, "grad_norm": 0.0732421875, "learning_rate": 0.0019990064876154873, "loss": 0.2168, "step": 2422 }, { "epoch": 0.02103280353469154, "grad_norm": 0.08740234375, "learning_rate": 0.001999005089052765, "loss": 0.2773, "step": 2423 }, { "epoch": 0.021041484014895705, "grad_norm": 0.07177734375, "learning_rate": 0.0019990036895069043, "loss": 0.2305, "step": 2424 }, { "epoch": 0.02105016449509987, "grad_norm": 0.056640625, "learning_rate": 0.0019990022889779064, "loss": 0.1992, "step": 2425 }, { "epoch": 0.021058844975304032, "grad_norm": 0.078125, "learning_rate": 0.001999000887465773, "loss": 0.2676, "step": 2426 }, { "epoch": 0.021067525455508197, "grad_norm": 0.08544921875, "learning_rate": 0.001998999484970506, "loss": 0.252, "step": 2427 }, { "epoch": 0.021076205935712362, "grad_norm": 0.06982421875, "learning_rate": 0.0019989980814921066, "loss": 0.1914, "step": 2428 }, { "epoch": 0.021084886415916528, "grad_norm": 0.0478515625, "learning_rate": 0.0019989966770305764, "loss": 0.168, "step": 2429 }, { "epoch": 0.021093566896120693, "grad_norm": 0.2080078125, "learning_rate": 0.001998995271585917, "loss": 0.3555, "step": 2430 }, { "epoch": 0.021102247376324858, "grad_norm": 0.0791015625, "learning_rate": 0.0019989938651581297, "loss": 0.2266, "step": 2431 }, { "epoch": 0.021110927856529023, "grad_norm": 0.06787109375, "learning_rate": 0.001998992457747216, "loss": 0.2041, "step": 2432 }, { "epoch": 0.021119608336733188, "grad_norm": 0.0693359375, "learning_rate": 0.0019989910493531785, "loss": 0.1973, "step": 2433 }, { "epoch": 0.021128288816937353, "grad_norm": 0.171875, "learning_rate": 0.001998989639976017, "loss": 0.3125, "step": 2434 }, { "epoch": 0.021136969297141518, "grad_norm": 0.083984375, "learning_rate": 0.0019989882296157346, "loss": 0.2207, "step": 2435 }, { "epoch": 0.021145649777345683, "grad_norm": 0.06201171875, "learning_rate": 0.0019989868182723317, "loss": 0.1973, "step": 2436 }, { "epoch": 0.02115433025754985, "grad_norm": 0.1591796875, "learning_rate": 0.001998985405945811, "loss": 0.2695, "step": 2437 }, { "epoch": 0.021163010737754014, "grad_norm": 0.060546875, "learning_rate": 0.0019989839926361727, "loss": 0.1904, "step": 2438 }, { "epoch": 0.02117169121795818, "grad_norm": 0.08740234375, "learning_rate": 0.001998982578343419, "loss": 0.2012, "step": 2439 }, { "epoch": 0.021180371698162344, "grad_norm": 0.0751953125, "learning_rate": 0.001998981163067552, "loss": 0.2188, "step": 2440 }, { "epoch": 0.02118905217836651, "grad_norm": 0.07568359375, "learning_rate": 0.0019989797468085726, "loss": 0.2441, "step": 2441 }, { "epoch": 0.02119773265857067, "grad_norm": 0.0791015625, "learning_rate": 0.001998978329566482, "loss": 0.2324, "step": 2442 }, { "epoch": 0.021206413138774836, "grad_norm": 0.0732421875, "learning_rate": 0.001998976911341282, "loss": 0.2285, "step": 2443 }, { "epoch": 0.021215093618979, "grad_norm": 0.061279296875, "learning_rate": 0.001998975492132975, "loss": 0.208, "step": 2444 }, { "epoch": 0.021223774099183166, "grad_norm": 0.0693359375, "learning_rate": 0.001998974071941562, "loss": 0.1875, "step": 2445 }, { "epoch": 0.02123245457938733, "grad_norm": 0.078125, "learning_rate": 0.001998972650767044, "loss": 0.2266, "step": 2446 }, { "epoch": 0.021241135059591496, "grad_norm": 0.09716796875, "learning_rate": 0.001998971228609423, "loss": 0.2617, "step": 2447 }, { "epoch": 0.02124981553979566, "grad_norm": 0.07958984375, "learning_rate": 0.001998969805468701, "loss": 0.2012, "step": 2448 }, { "epoch": 0.021258496019999826, "grad_norm": 0.0703125, "learning_rate": 0.0019989683813448787, "loss": 0.2373, "step": 2449 }, { "epoch": 0.02126717650020399, "grad_norm": 0.08056640625, "learning_rate": 0.001998966956237958, "loss": 0.2637, "step": 2450 }, { "epoch": 0.021275856980408157, "grad_norm": 0.125, "learning_rate": 0.0019989655301479413, "loss": 0.3086, "step": 2451 }, { "epoch": 0.021284537460612322, "grad_norm": 0.07861328125, "learning_rate": 0.001998964103074829, "loss": 0.2002, "step": 2452 }, { "epoch": 0.021293217940816487, "grad_norm": 0.10205078125, "learning_rate": 0.0019989626750186226, "loss": 0.2178, "step": 2453 }, { "epoch": 0.021301898421020652, "grad_norm": 0.09912109375, "learning_rate": 0.0019989612459793246, "loss": 0.2441, "step": 2454 }, { "epoch": 0.021310578901224817, "grad_norm": 0.0751953125, "learning_rate": 0.001998959815956936, "loss": 0.209, "step": 2455 }, { "epoch": 0.021319259381428982, "grad_norm": 0.07421875, "learning_rate": 0.001998958384951458, "loss": 0.2334, "step": 2456 }, { "epoch": 0.021327939861633144, "grad_norm": 0.072265625, "learning_rate": 0.0019989569529628936, "loss": 0.2012, "step": 2457 }, { "epoch": 0.02133662034183731, "grad_norm": 0.09521484375, "learning_rate": 0.001998955519991243, "loss": 0.2227, "step": 2458 }, { "epoch": 0.021345300822041474, "grad_norm": 0.058837890625, "learning_rate": 0.001998954086036508, "loss": 0.1768, "step": 2459 }, { "epoch": 0.02135398130224564, "grad_norm": 0.099609375, "learning_rate": 0.00199895265109869, "loss": 0.2168, "step": 2460 }, { "epoch": 0.021362661782449804, "grad_norm": 0.0966796875, "learning_rate": 0.001998951215177791, "loss": 0.2227, "step": 2461 }, { "epoch": 0.02137134226265397, "grad_norm": 0.10986328125, "learning_rate": 0.001998949778273813, "loss": 0.2363, "step": 2462 }, { "epoch": 0.021380022742858135, "grad_norm": 0.09814453125, "learning_rate": 0.001998948340386757, "loss": 0.2285, "step": 2463 }, { "epoch": 0.0213887032230623, "grad_norm": 0.0654296875, "learning_rate": 0.001998946901516624, "loss": 0.1904, "step": 2464 }, { "epoch": 0.021397383703266465, "grad_norm": 0.09423828125, "learning_rate": 0.001998945461663416, "loss": 0.2402, "step": 2465 }, { "epoch": 0.02140606418347063, "grad_norm": 0.08056640625, "learning_rate": 0.0019989440208271355, "loss": 0.2021, "step": 2466 }, { "epoch": 0.021414744663674795, "grad_norm": 0.08056640625, "learning_rate": 0.001998942579007783, "loss": 0.1895, "step": 2467 }, { "epoch": 0.02142342514387896, "grad_norm": 0.0771484375, "learning_rate": 0.0019989411362053605, "loss": 0.1699, "step": 2468 }, { "epoch": 0.021432105624083125, "grad_norm": 0.09716796875, "learning_rate": 0.0019989396924198696, "loss": 0.208, "step": 2469 }, { "epoch": 0.02144078610428729, "grad_norm": 0.0751953125, "learning_rate": 0.0019989382476513114, "loss": 0.2051, "step": 2470 }, { "epoch": 0.021449466584491456, "grad_norm": 0.1181640625, "learning_rate": 0.001998936801899688, "loss": 0.248, "step": 2471 }, { "epoch": 0.021458147064695617, "grad_norm": 0.07861328125, "learning_rate": 0.001998935355165001, "loss": 0.1963, "step": 2472 }, { "epoch": 0.021466827544899782, "grad_norm": 0.05859375, "learning_rate": 0.0019989339074472515, "loss": 0.1738, "step": 2473 }, { "epoch": 0.021475508025103947, "grad_norm": 0.06005859375, "learning_rate": 0.0019989324587464416, "loss": 0.1709, "step": 2474 }, { "epoch": 0.021484188505308113, "grad_norm": 0.057373046875, "learning_rate": 0.0019989310090625725, "loss": 0.1768, "step": 2475 }, { "epoch": 0.021492868985512278, "grad_norm": 0.1171875, "learning_rate": 0.001998929558395646, "loss": 0.252, "step": 2476 }, { "epoch": 0.021501549465716443, "grad_norm": 0.07763671875, "learning_rate": 0.0019989281067456636, "loss": 0.2109, "step": 2477 }, { "epoch": 0.021510229945920608, "grad_norm": 0.109375, "learning_rate": 0.001998926654112627, "loss": 0.2852, "step": 2478 }, { "epoch": 0.021518910426124773, "grad_norm": 0.0947265625, "learning_rate": 0.001998925200496538, "loss": 0.2227, "step": 2479 }, { "epoch": 0.021527590906328938, "grad_norm": 0.1142578125, "learning_rate": 0.0019989237458973978, "loss": 0.2383, "step": 2480 }, { "epoch": 0.021536271386533103, "grad_norm": 0.06884765625, "learning_rate": 0.0019989222903152074, "loss": 0.2051, "step": 2481 }, { "epoch": 0.02154495186673727, "grad_norm": 0.0908203125, "learning_rate": 0.00199892083374997, "loss": 0.2266, "step": 2482 }, { "epoch": 0.021553632346941434, "grad_norm": 0.06884765625, "learning_rate": 0.001998919376201686, "loss": 0.2275, "step": 2483 }, { "epoch": 0.0215623128271456, "grad_norm": 0.09814453125, "learning_rate": 0.001998917917670357, "loss": 0.2188, "step": 2484 }, { "epoch": 0.021570993307349764, "grad_norm": 0.07568359375, "learning_rate": 0.0019989164581559853, "loss": 0.2793, "step": 2485 }, { "epoch": 0.02157967378755393, "grad_norm": 0.06884765625, "learning_rate": 0.0019989149976585717, "loss": 0.2168, "step": 2486 }, { "epoch": 0.021588354267758094, "grad_norm": 0.06396484375, "learning_rate": 0.001998913536178118, "loss": 0.1738, "step": 2487 }, { "epoch": 0.021597034747962256, "grad_norm": 0.08837890625, "learning_rate": 0.0019989120737146267, "loss": 0.2461, "step": 2488 }, { "epoch": 0.02160571522816642, "grad_norm": 0.0888671875, "learning_rate": 0.0019989106102680982, "loss": 0.2715, "step": 2489 }, { "epoch": 0.021614395708370586, "grad_norm": 0.1015625, "learning_rate": 0.0019989091458385345, "loss": 0.2617, "step": 2490 }, { "epoch": 0.02162307618857475, "grad_norm": 0.060546875, "learning_rate": 0.0019989076804259372, "loss": 0.1895, "step": 2491 }, { "epoch": 0.021631756668778916, "grad_norm": 0.08544921875, "learning_rate": 0.001998906214030308, "loss": 0.2344, "step": 2492 }, { "epoch": 0.02164043714898308, "grad_norm": 0.06884765625, "learning_rate": 0.0019989047466516486, "loss": 0.1641, "step": 2493 }, { "epoch": 0.021649117629187246, "grad_norm": 0.0673828125, "learning_rate": 0.0019989032782899607, "loss": 0.2266, "step": 2494 }, { "epoch": 0.02165779810939141, "grad_norm": 0.08544921875, "learning_rate": 0.0019989018089452454, "loss": 0.2441, "step": 2495 }, { "epoch": 0.021666478589595577, "grad_norm": 0.080078125, "learning_rate": 0.0019989003386175043, "loss": 0.1973, "step": 2496 }, { "epoch": 0.021675159069799742, "grad_norm": 0.07421875, "learning_rate": 0.00199889886730674, "loss": 0.2148, "step": 2497 }, { "epoch": 0.021683839550003907, "grad_norm": 0.0908203125, "learning_rate": 0.001998897395012953, "loss": 0.248, "step": 2498 }, { "epoch": 0.021692520030208072, "grad_norm": 0.08203125, "learning_rate": 0.0019988959217361454, "loss": 0.2578, "step": 2499 }, { "epoch": 0.021701200510412237, "grad_norm": 0.08984375, "learning_rate": 0.0019988944474763188, "loss": 0.2422, "step": 2500 }, { "epoch": 0.021709880990616402, "grad_norm": 0.060546875, "learning_rate": 0.0019988929722334742, "loss": 0.1768, "step": 2501 }, { "epoch": 0.021718561470820567, "grad_norm": 0.06982421875, "learning_rate": 0.0019988914960076144, "loss": 0.2207, "step": 2502 }, { "epoch": 0.02172724195102473, "grad_norm": 0.080078125, "learning_rate": 0.00199889001879874, "loss": 0.2021, "step": 2503 }, { "epoch": 0.021735922431228894, "grad_norm": 0.083984375, "learning_rate": 0.0019988885406068534, "loss": 0.2197, "step": 2504 }, { "epoch": 0.02174460291143306, "grad_norm": 0.08349609375, "learning_rate": 0.0019988870614319554, "loss": 0.2148, "step": 2505 }, { "epoch": 0.021753283391637224, "grad_norm": 0.076171875, "learning_rate": 0.001998885581274048, "loss": 0.2227, "step": 2506 }, { "epoch": 0.02176196387184139, "grad_norm": 0.1572265625, "learning_rate": 0.001998884100133133, "loss": 0.2471, "step": 2507 }, { "epoch": 0.021770644352045555, "grad_norm": 0.08935546875, "learning_rate": 0.0019988826180092115, "loss": 0.2031, "step": 2508 }, { "epoch": 0.02177932483224972, "grad_norm": 0.0849609375, "learning_rate": 0.001998881134902286, "loss": 0.2197, "step": 2509 }, { "epoch": 0.021788005312453885, "grad_norm": 0.09375, "learning_rate": 0.0019988796508123574, "loss": 0.1982, "step": 2510 }, { "epoch": 0.02179668579265805, "grad_norm": 0.0908203125, "learning_rate": 0.0019988781657394273, "loss": 0.2402, "step": 2511 }, { "epoch": 0.021805366272862215, "grad_norm": 0.09619140625, "learning_rate": 0.001998876679683498, "loss": 0.2246, "step": 2512 }, { "epoch": 0.02181404675306638, "grad_norm": 0.07666015625, "learning_rate": 0.0019988751926445702, "loss": 0.2207, "step": 2513 }, { "epoch": 0.021822727233270545, "grad_norm": 0.091796875, "learning_rate": 0.0019988737046226462, "loss": 0.2383, "step": 2514 }, { "epoch": 0.02183140771347471, "grad_norm": 0.08154296875, "learning_rate": 0.0019988722156177273, "loss": 0.2305, "step": 2515 }, { "epoch": 0.021840088193678876, "grad_norm": 0.08740234375, "learning_rate": 0.0019988707256298156, "loss": 0.2383, "step": 2516 }, { "epoch": 0.02184876867388304, "grad_norm": 0.07421875, "learning_rate": 0.001998869234658912, "loss": 0.207, "step": 2517 }, { "epoch": 0.021857449154087206, "grad_norm": 0.07421875, "learning_rate": 0.0019988677427050187, "loss": 0.1738, "step": 2518 }, { "epoch": 0.021866129634291367, "grad_norm": 0.0869140625, "learning_rate": 0.0019988662497681373, "loss": 0.2002, "step": 2519 }, { "epoch": 0.021874810114495533, "grad_norm": 0.07568359375, "learning_rate": 0.001998864755848269, "loss": 0.1963, "step": 2520 }, { "epoch": 0.021883490594699698, "grad_norm": 0.068359375, "learning_rate": 0.001998863260945416, "loss": 0.2061, "step": 2521 }, { "epoch": 0.021892171074903863, "grad_norm": 0.07958984375, "learning_rate": 0.0019988617650595793, "loss": 0.208, "step": 2522 }, { "epoch": 0.021900851555108028, "grad_norm": 0.09423828125, "learning_rate": 0.001998860268190761, "loss": 0.2334, "step": 2523 }, { "epoch": 0.021909532035312193, "grad_norm": 0.08837890625, "learning_rate": 0.0019988587703389624, "loss": 0.1836, "step": 2524 }, { "epoch": 0.021918212515516358, "grad_norm": 0.0712890625, "learning_rate": 0.0019988572715041856, "loss": 0.1738, "step": 2525 }, { "epoch": 0.021926892995720523, "grad_norm": 0.06884765625, "learning_rate": 0.001998855771686432, "loss": 0.1689, "step": 2526 }, { "epoch": 0.02193557347592469, "grad_norm": 0.07421875, "learning_rate": 0.001998854270885703, "loss": 0.2305, "step": 2527 }, { "epoch": 0.021944253956128854, "grad_norm": 0.07763671875, "learning_rate": 0.001998852769102001, "loss": 0.2363, "step": 2528 }, { "epoch": 0.02195293443633302, "grad_norm": 0.08935546875, "learning_rate": 0.0019988512663353265, "loss": 0.2188, "step": 2529 }, { "epoch": 0.021961614916537184, "grad_norm": 0.08203125, "learning_rate": 0.001998849762585682, "loss": 0.1953, "step": 2530 }, { "epoch": 0.02197029539674135, "grad_norm": 0.08984375, "learning_rate": 0.0019988482578530693, "loss": 0.2236, "step": 2531 }, { "epoch": 0.021978975876945514, "grad_norm": 0.09228515625, "learning_rate": 0.0019988467521374893, "loss": 0.2285, "step": 2532 }, { "epoch": 0.02198765635714968, "grad_norm": 0.0791015625, "learning_rate": 0.001998845245438944, "loss": 0.1895, "step": 2533 }, { "epoch": 0.02199633683735384, "grad_norm": 0.087890625, "learning_rate": 0.001998843737757435, "loss": 0.2246, "step": 2534 }, { "epoch": 0.022005017317558006, "grad_norm": 0.07421875, "learning_rate": 0.001998842229092964, "loss": 0.2158, "step": 2535 }, { "epoch": 0.02201369779776217, "grad_norm": 0.061767578125, "learning_rate": 0.0019988407194455327, "loss": 0.2119, "step": 2536 }, { "epoch": 0.022022378277966336, "grad_norm": 0.06884765625, "learning_rate": 0.0019988392088151428, "loss": 0.2051, "step": 2537 }, { "epoch": 0.0220310587581705, "grad_norm": 0.06689453125, "learning_rate": 0.0019988376972017957, "loss": 0.1777, "step": 2538 }, { "epoch": 0.022039739238374666, "grad_norm": 0.0986328125, "learning_rate": 0.001998836184605493, "loss": 0.248, "step": 2539 }, { "epoch": 0.02204841971857883, "grad_norm": 0.10302734375, "learning_rate": 0.001998834671026237, "loss": 0.2188, "step": 2540 }, { "epoch": 0.022057100198782997, "grad_norm": 0.06494140625, "learning_rate": 0.001998833156464029, "loss": 0.1963, "step": 2541 }, { "epoch": 0.022065780678987162, "grad_norm": 0.078125, "learning_rate": 0.00199883164091887, "loss": 0.2041, "step": 2542 }, { "epoch": 0.022074461159191327, "grad_norm": 0.08984375, "learning_rate": 0.0019988301243907625, "loss": 0.2246, "step": 2543 }, { "epoch": 0.022083141639395492, "grad_norm": 0.08447265625, "learning_rate": 0.001998828606879708, "loss": 0.2109, "step": 2544 }, { "epoch": 0.022091822119599657, "grad_norm": 0.57421875, "learning_rate": 0.001998827088385708, "loss": 0.2988, "step": 2545 }, { "epoch": 0.022100502599803822, "grad_norm": 0.30078125, "learning_rate": 0.001998825568908764, "loss": 0.2324, "step": 2546 }, { "epoch": 0.022109183080007987, "grad_norm": 0.09130859375, "learning_rate": 0.001998824048448878, "loss": 0.1836, "step": 2547 }, { "epoch": 0.022117863560212152, "grad_norm": 0.08935546875, "learning_rate": 0.0019988225270060516, "loss": 0.2402, "step": 2548 }, { "epoch": 0.022126544040416314, "grad_norm": 0.06787109375, "learning_rate": 0.0019988210045802863, "loss": 0.1641, "step": 2549 }, { "epoch": 0.02213522452062048, "grad_norm": 0.07861328125, "learning_rate": 0.0019988194811715837, "loss": 0.2129, "step": 2550 }, { "epoch": 0.022143905000824644, "grad_norm": 0.0751953125, "learning_rate": 0.0019988179567799456, "loss": 0.2041, "step": 2551 }, { "epoch": 0.02215258548102881, "grad_norm": 0.0888671875, "learning_rate": 0.0019988164314053737, "loss": 0.2637, "step": 2552 }, { "epoch": 0.022161265961232975, "grad_norm": 0.1015625, "learning_rate": 0.00199881490504787, "loss": 0.2656, "step": 2553 }, { "epoch": 0.02216994644143714, "grad_norm": 0.09423828125, "learning_rate": 0.0019988133777074355, "loss": 0.2402, "step": 2554 }, { "epoch": 0.022178626921641305, "grad_norm": 0.07275390625, "learning_rate": 0.0019988118493840727, "loss": 0.2227, "step": 2555 }, { "epoch": 0.02218730740184547, "grad_norm": 0.087890625, "learning_rate": 0.001998810320077782, "loss": 0.2148, "step": 2556 }, { "epoch": 0.022195987882049635, "grad_norm": 0.06982421875, "learning_rate": 0.0019988087897885665, "loss": 0.1885, "step": 2557 }, { "epoch": 0.0222046683622538, "grad_norm": 0.11083984375, "learning_rate": 0.001998807258516427, "loss": 0.2285, "step": 2558 }, { "epoch": 0.022213348842457965, "grad_norm": 0.078125, "learning_rate": 0.001998805726261365, "loss": 0.1758, "step": 2559 }, { "epoch": 0.02222202932266213, "grad_norm": 0.42578125, "learning_rate": 0.001998804193023383, "loss": 0.2637, "step": 2560 }, { "epoch": 0.022230709802866296, "grad_norm": 0.1083984375, "learning_rate": 0.001998802658802482, "loss": 0.2324, "step": 2561 }, { "epoch": 0.02223939028307046, "grad_norm": 0.068359375, "learning_rate": 0.001998801123598664, "loss": 0.2256, "step": 2562 }, { "epoch": 0.022248070763274626, "grad_norm": 0.1318359375, "learning_rate": 0.001998799587411931, "loss": 0.2344, "step": 2563 }, { "epoch": 0.02225675124347879, "grad_norm": 0.087890625, "learning_rate": 0.001998798050242284, "loss": 0.2246, "step": 2564 }, { "epoch": 0.022265431723682953, "grad_norm": 0.07666015625, "learning_rate": 0.0019987965120897245, "loss": 0.209, "step": 2565 }, { "epoch": 0.022274112203887118, "grad_norm": 0.0625, "learning_rate": 0.001998794972954255, "loss": 0.1699, "step": 2566 }, { "epoch": 0.022282792684091283, "grad_norm": 0.0615234375, "learning_rate": 0.001998793432835877, "loss": 0.1943, "step": 2567 }, { "epoch": 0.022291473164295448, "grad_norm": 0.259765625, "learning_rate": 0.0019987918917345917, "loss": 0.2676, "step": 2568 }, { "epoch": 0.022300153644499613, "grad_norm": 0.08447265625, "learning_rate": 0.0019987903496504014, "loss": 0.2266, "step": 2569 }, { "epoch": 0.022308834124703778, "grad_norm": 0.0625, "learning_rate": 0.001998788806583307, "loss": 0.1748, "step": 2570 }, { "epoch": 0.022317514604907943, "grad_norm": 0.0986328125, "learning_rate": 0.001998787262533311, "loss": 0.2275, "step": 2571 }, { "epoch": 0.02232619508511211, "grad_norm": 0.08984375, "learning_rate": 0.001998785717500415, "loss": 0.2656, "step": 2572 }, { "epoch": 0.022334875565316274, "grad_norm": 0.06640625, "learning_rate": 0.00199878417148462, "loss": 0.2295, "step": 2573 }, { "epoch": 0.02234355604552044, "grad_norm": 0.08447265625, "learning_rate": 0.001998782624485928, "loss": 0.2412, "step": 2574 }, { "epoch": 0.022352236525724604, "grad_norm": 0.0673828125, "learning_rate": 0.0019987810765043413, "loss": 0.2061, "step": 2575 }, { "epoch": 0.02236091700592877, "grad_norm": 0.06982421875, "learning_rate": 0.001998779527539861, "loss": 0.2061, "step": 2576 }, { "epoch": 0.022369597486132934, "grad_norm": 0.1005859375, "learning_rate": 0.001998777977592489, "loss": 0.2188, "step": 2577 }, { "epoch": 0.0223782779663371, "grad_norm": 0.0703125, "learning_rate": 0.0019987764266622267, "loss": 0.2217, "step": 2578 }, { "epoch": 0.022386958446541264, "grad_norm": 0.0947265625, "learning_rate": 0.0019987748747490757, "loss": 0.2256, "step": 2579 }, { "epoch": 0.022395638926745426, "grad_norm": 0.1171875, "learning_rate": 0.0019987733218530387, "loss": 0.2891, "step": 2580 }, { "epoch": 0.02240431940694959, "grad_norm": 0.083984375, "learning_rate": 0.001998771767974116, "loss": 0.2246, "step": 2581 }, { "epoch": 0.022412999887153756, "grad_norm": 0.1162109375, "learning_rate": 0.0019987702131123108, "loss": 0.3809, "step": 2582 }, { "epoch": 0.02242168036735792, "grad_norm": 0.10693359375, "learning_rate": 0.0019987686572676237, "loss": 0.1729, "step": 2583 }, { "epoch": 0.022430360847562086, "grad_norm": 0.0927734375, "learning_rate": 0.0019987671004400563, "loss": 0.2168, "step": 2584 }, { "epoch": 0.02243904132776625, "grad_norm": 0.10107421875, "learning_rate": 0.001998765542629611, "loss": 0.2207, "step": 2585 }, { "epoch": 0.022447721807970417, "grad_norm": 0.0869140625, "learning_rate": 0.001998763983836289, "loss": 0.2314, "step": 2586 }, { "epoch": 0.02245640228817458, "grad_norm": 0.087890625, "learning_rate": 0.0019987624240600924, "loss": 0.2715, "step": 2587 }, { "epoch": 0.022465082768378747, "grad_norm": 0.10400390625, "learning_rate": 0.001998760863301023, "loss": 0.2617, "step": 2588 }, { "epoch": 0.022473763248582912, "grad_norm": 0.0810546875, "learning_rate": 0.0019987593015590817, "loss": 0.1777, "step": 2589 }, { "epoch": 0.022482443728787077, "grad_norm": 0.072265625, "learning_rate": 0.001998757738834271, "loss": 0.1631, "step": 2590 }, { "epoch": 0.022491124208991242, "grad_norm": 0.08349609375, "learning_rate": 0.0019987561751265925, "loss": 0.1875, "step": 2591 }, { "epoch": 0.022499804689195407, "grad_norm": 0.0751953125, "learning_rate": 0.001998754610436048, "loss": 0.2021, "step": 2592 }, { "epoch": 0.022508485169399572, "grad_norm": 0.05712890625, "learning_rate": 0.001998753044762638, "loss": 0.1973, "step": 2593 }, { "epoch": 0.022517165649603738, "grad_norm": 0.10107421875, "learning_rate": 0.0019987514781063657, "loss": 0.2188, "step": 2594 }, { "epoch": 0.022525846129807903, "grad_norm": 0.11767578125, "learning_rate": 0.001998749910467232, "loss": 0.2969, "step": 2595 }, { "epoch": 0.022534526610012064, "grad_norm": 0.0771484375, "learning_rate": 0.001998748341845239, "loss": 0.209, "step": 2596 }, { "epoch": 0.02254320709021623, "grad_norm": 0.0859375, "learning_rate": 0.001998746772240389, "loss": 0.1836, "step": 2597 }, { "epoch": 0.022551887570420395, "grad_norm": 0.07861328125, "learning_rate": 0.0019987452016526825, "loss": 0.2061, "step": 2598 }, { "epoch": 0.02256056805062456, "grad_norm": 0.06396484375, "learning_rate": 0.0019987436300821218, "loss": 0.1992, "step": 2599 }, { "epoch": 0.022569248530828725, "grad_norm": 0.0849609375, "learning_rate": 0.0019987420575287083, "loss": 0.2207, "step": 2600 }, { "epoch": 0.02257792901103289, "grad_norm": 0.0986328125, "learning_rate": 0.0019987404839924444, "loss": 0.1787, "step": 2601 }, { "epoch": 0.022586609491237055, "grad_norm": 0.1396484375, "learning_rate": 0.001998738909473331, "loss": 0.2402, "step": 2602 }, { "epoch": 0.02259528997144122, "grad_norm": 0.0859375, "learning_rate": 0.001998737333971371, "loss": 0.1992, "step": 2603 }, { "epoch": 0.022603970451645385, "grad_norm": 0.10888671875, "learning_rate": 0.0019987357574865645, "loss": 0.1963, "step": 2604 }, { "epoch": 0.02261265093184955, "grad_norm": 0.0986328125, "learning_rate": 0.001998734180018914, "loss": 0.2012, "step": 2605 }, { "epoch": 0.022621331412053716, "grad_norm": 0.083984375, "learning_rate": 0.001998732601568422, "loss": 0.1973, "step": 2606 }, { "epoch": 0.02263001189225788, "grad_norm": 0.08935546875, "learning_rate": 0.0019987310221350895, "loss": 0.2178, "step": 2607 }, { "epoch": 0.022638692372462046, "grad_norm": 0.0888671875, "learning_rate": 0.001998729441718918, "loss": 0.1875, "step": 2608 }, { "epoch": 0.02264737285266621, "grad_norm": 0.11669921875, "learning_rate": 0.0019987278603199096, "loss": 0.3105, "step": 2609 }, { "epoch": 0.022656053332870376, "grad_norm": 0.275390625, "learning_rate": 0.0019987262779380655, "loss": 0.3203, "step": 2610 }, { "epoch": 0.022664733813074538, "grad_norm": 0.0908203125, "learning_rate": 0.0019987246945733883, "loss": 0.2656, "step": 2611 }, { "epoch": 0.022673414293278703, "grad_norm": 0.09521484375, "learning_rate": 0.001998723110225879, "loss": 0.2188, "step": 2612 }, { "epoch": 0.022682094773482868, "grad_norm": 0.08740234375, "learning_rate": 0.00199872152489554, "loss": 0.207, "step": 2613 }, { "epoch": 0.022690775253687033, "grad_norm": 0.095703125, "learning_rate": 0.0019987199385823723, "loss": 0.2266, "step": 2614 }, { "epoch": 0.022699455733891198, "grad_norm": 0.0859375, "learning_rate": 0.001998718351286378, "loss": 0.2168, "step": 2615 }, { "epoch": 0.022708136214095363, "grad_norm": 0.103515625, "learning_rate": 0.001998716763007559, "loss": 0.2891, "step": 2616 }, { "epoch": 0.02271681669429953, "grad_norm": 0.06396484375, "learning_rate": 0.0019987151737459164, "loss": 0.1689, "step": 2617 }, { "epoch": 0.022725497174503693, "grad_norm": 0.1875, "learning_rate": 0.0019987135835014525, "loss": 0.2578, "step": 2618 }, { "epoch": 0.02273417765470786, "grad_norm": 0.10791015625, "learning_rate": 0.0019987119922741698, "loss": 0.2334, "step": 2619 }, { "epoch": 0.022742858134912024, "grad_norm": 0.1015625, "learning_rate": 0.001998710400064068, "loss": 0.2734, "step": 2620 }, { "epoch": 0.02275153861511619, "grad_norm": 0.09375, "learning_rate": 0.0019987088068711507, "loss": 0.2949, "step": 2621 }, { "epoch": 0.022760219095320354, "grad_norm": 0.08544921875, "learning_rate": 0.0019987072126954187, "loss": 0.1973, "step": 2622 }, { "epoch": 0.02276889957552452, "grad_norm": 0.0791015625, "learning_rate": 0.001998705617536874, "loss": 0.1865, "step": 2623 }, { "epoch": 0.022777580055728684, "grad_norm": 0.1015625, "learning_rate": 0.0019987040213955186, "loss": 0.2891, "step": 2624 }, { "epoch": 0.02278626053593285, "grad_norm": 0.07763671875, "learning_rate": 0.001998702424271354, "loss": 0.2266, "step": 2625 }, { "epoch": 0.02279494101613701, "grad_norm": 0.0791015625, "learning_rate": 0.0019987008261643817, "loss": 0.2227, "step": 2626 }, { "epoch": 0.022803621496341176, "grad_norm": 0.08984375, "learning_rate": 0.0019986992270746035, "loss": 0.25, "step": 2627 }, { "epoch": 0.02281230197654534, "grad_norm": 0.13671875, "learning_rate": 0.0019986976270020213, "loss": 0.25, "step": 2628 }, { "epoch": 0.022820982456749506, "grad_norm": 0.07666015625, "learning_rate": 0.0019986960259466375, "loss": 0.1924, "step": 2629 }, { "epoch": 0.02282966293695367, "grad_norm": 0.060546875, "learning_rate": 0.0019986944239084527, "loss": 0.1797, "step": 2630 }, { "epoch": 0.022838343417157837, "grad_norm": 0.10009765625, "learning_rate": 0.0019986928208874694, "loss": 0.2559, "step": 2631 }, { "epoch": 0.022847023897362, "grad_norm": 0.06591796875, "learning_rate": 0.001998691216883689, "loss": 0.1719, "step": 2632 }, { "epoch": 0.022855704377566167, "grad_norm": 0.083984375, "learning_rate": 0.0019986896118971134, "loss": 0.209, "step": 2633 }, { "epoch": 0.022864384857770332, "grad_norm": 0.5546875, "learning_rate": 0.001998688005927744, "loss": 0.5664, "step": 2634 }, { "epoch": 0.022873065337974497, "grad_norm": 0.06787109375, "learning_rate": 0.001998686398975584, "loss": 0.2236, "step": 2635 }, { "epoch": 0.022881745818178662, "grad_norm": 0.0908203125, "learning_rate": 0.001998684791040633, "loss": 0.2129, "step": 2636 }, { "epoch": 0.022890426298382827, "grad_norm": 0.087890625, "learning_rate": 0.0019986831821228943, "loss": 0.2383, "step": 2637 }, { "epoch": 0.022899106778586992, "grad_norm": 0.1748046875, "learning_rate": 0.001998681572222369, "loss": 0.2266, "step": 2638 }, { "epoch": 0.022907787258791158, "grad_norm": 0.078125, "learning_rate": 0.001998679961339059, "loss": 0.2344, "step": 2639 }, { "epoch": 0.022916467738995323, "grad_norm": 0.11669921875, "learning_rate": 0.001998678349472966, "loss": 0.252, "step": 2640 }, { "epoch": 0.022925148219199488, "grad_norm": 0.0703125, "learning_rate": 0.001998676736624092, "loss": 0.1807, "step": 2641 }, { "epoch": 0.02293382869940365, "grad_norm": 0.09716796875, "learning_rate": 0.0019986751227924386, "loss": 0.2344, "step": 2642 }, { "epoch": 0.022942509179607815, "grad_norm": 0.062255859375, "learning_rate": 0.001998673507978008, "loss": 0.1982, "step": 2643 }, { "epoch": 0.02295118965981198, "grad_norm": 0.1005859375, "learning_rate": 0.001998671892180801, "loss": 0.2734, "step": 2644 }, { "epoch": 0.022959870140016145, "grad_norm": 0.06494140625, "learning_rate": 0.0019986702754008203, "loss": 0.2051, "step": 2645 }, { "epoch": 0.02296855062022031, "grad_norm": 0.07470703125, "learning_rate": 0.0019986686576380667, "loss": 0.2305, "step": 2646 }, { "epoch": 0.022977231100424475, "grad_norm": 0.1357421875, "learning_rate": 0.001998667038892543, "loss": 0.2363, "step": 2647 }, { "epoch": 0.02298591158062864, "grad_norm": 0.06982421875, "learning_rate": 0.0019986654191642508, "loss": 0.1924, "step": 2648 }, { "epoch": 0.022994592060832805, "grad_norm": 0.0859375, "learning_rate": 0.001998663798453191, "loss": 0.1934, "step": 2649 }, { "epoch": 0.02300327254103697, "grad_norm": 0.07177734375, "learning_rate": 0.0019986621767593663, "loss": 0.2188, "step": 2650 }, { "epoch": 0.023011953021241136, "grad_norm": 0.11572265625, "learning_rate": 0.001998660554082778, "loss": 0.2539, "step": 2651 }, { "epoch": 0.0230206335014453, "grad_norm": 0.158203125, "learning_rate": 0.0019986589304234284, "loss": 0.2852, "step": 2652 }, { "epoch": 0.023029313981649466, "grad_norm": 0.95703125, "learning_rate": 0.0019986573057813183, "loss": 0.4863, "step": 2653 }, { "epoch": 0.02303799446185363, "grad_norm": 0.15625, "learning_rate": 0.0019986556801564505, "loss": 0.2031, "step": 2654 }, { "epoch": 0.023046674942057796, "grad_norm": 0.1259765625, "learning_rate": 0.0019986540535488263, "loss": 0.1895, "step": 2655 }, { "epoch": 0.02305535542226196, "grad_norm": 0.0927734375, "learning_rate": 0.0019986524259584474, "loss": 0.2363, "step": 2656 }, { "epoch": 0.023064035902466123, "grad_norm": 0.6484375, "learning_rate": 0.001998650797385316, "loss": 0.5977, "step": 2657 }, { "epoch": 0.023072716382670288, "grad_norm": 0.11669921875, "learning_rate": 0.001998649167829433, "loss": 0.293, "step": 2658 }, { "epoch": 0.023081396862874453, "grad_norm": 0.072265625, "learning_rate": 0.0019986475372908014, "loss": 0.2168, "step": 2659 }, { "epoch": 0.023090077343078618, "grad_norm": 0.09765625, "learning_rate": 0.001998645905769422, "loss": 0.2051, "step": 2660 }, { "epoch": 0.023098757823282783, "grad_norm": 0.062255859375, "learning_rate": 0.001998644273265297, "loss": 0.1738, "step": 2661 }, { "epoch": 0.02310743830348695, "grad_norm": 0.11767578125, "learning_rate": 0.0019986426397784283, "loss": 0.209, "step": 2662 }, { "epoch": 0.023116118783691113, "grad_norm": 0.060546875, "learning_rate": 0.0019986410053088174, "loss": 0.2031, "step": 2663 }, { "epoch": 0.02312479926389528, "grad_norm": 0.953125, "learning_rate": 0.001998639369856466, "loss": 0.7188, "step": 2664 }, { "epoch": 0.023133479744099444, "grad_norm": 0.09375, "learning_rate": 0.0019986377334213763, "loss": 0.1875, "step": 2665 }, { "epoch": 0.02314216022430361, "grad_norm": 0.1455078125, "learning_rate": 0.00199863609600355, "loss": 0.3125, "step": 2666 }, { "epoch": 0.023150840704507774, "grad_norm": 0.08642578125, "learning_rate": 0.0019986344576029885, "loss": 0.2158, "step": 2667 }, { "epoch": 0.02315952118471194, "grad_norm": 0.06640625, "learning_rate": 0.0019986328182196936, "loss": 0.168, "step": 2668 }, { "epoch": 0.023168201664916104, "grad_norm": 0.06494140625, "learning_rate": 0.0019986311778536683, "loss": 0.2891, "step": 2669 }, { "epoch": 0.02317688214512027, "grad_norm": 0.0625, "learning_rate": 0.0019986295365049126, "loss": 0.1934, "step": 2670 }, { "epoch": 0.023185562625324434, "grad_norm": 0.08984375, "learning_rate": 0.0019986278941734295, "loss": 0.2559, "step": 2671 }, { "epoch": 0.0231942431055286, "grad_norm": 0.06640625, "learning_rate": 0.0019986262508592204, "loss": 0.2383, "step": 2672 }, { "epoch": 0.02320292358573276, "grad_norm": 0.083984375, "learning_rate": 0.001998624606562287, "loss": 0.2402, "step": 2673 }, { "epoch": 0.023211604065936926, "grad_norm": 0.0947265625, "learning_rate": 0.001998622961282631, "loss": 0.2178, "step": 2674 }, { "epoch": 0.02322028454614109, "grad_norm": 0.08447265625, "learning_rate": 0.0019986213150202546, "loss": 0.2578, "step": 2675 }, { "epoch": 0.023228965026345257, "grad_norm": 0.0751953125, "learning_rate": 0.00199861966777516, "loss": 0.2363, "step": 2676 }, { "epoch": 0.02323764550654942, "grad_norm": 0.0810546875, "learning_rate": 0.001998618019547348, "loss": 0.2617, "step": 2677 }, { "epoch": 0.023246325986753587, "grad_norm": 0.0673828125, "learning_rate": 0.0019986163703368206, "loss": 0.2246, "step": 2678 }, { "epoch": 0.023255006466957752, "grad_norm": 0.236328125, "learning_rate": 0.0019986147201435803, "loss": 0.2207, "step": 2679 }, { "epoch": 0.023263686947161917, "grad_norm": 0.1181640625, "learning_rate": 0.0019986130689676283, "loss": 0.2227, "step": 2680 }, { "epoch": 0.023272367427366082, "grad_norm": 0.0888671875, "learning_rate": 0.001998611416808966, "loss": 0.248, "step": 2681 }, { "epoch": 0.023281047907570247, "grad_norm": 0.10791015625, "learning_rate": 0.0019986097636675963, "loss": 0.2002, "step": 2682 }, { "epoch": 0.023289728387774412, "grad_norm": 0.080078125, "learning_rate": 0.0019986081095435208, "loss": 0.2402, "step": 2683 }, { "epoch": 0.023298408867978578, "grad_norm": 0.1396484375, "learning_rate": 0.0019986064544367404, "loss": 0.2168, "step": 2684 }, { "epoch": 0.023307089348182743, "grad_norm": 0.07763671875, "learning_rate": 0.0019986047983472574, "loss": 0.1953, "step": 2685 }, { "epoch": 0.023315769828386908, "grad_norm": 0.095703125, "learning_rate": 0.001998603141275074, "loss": 0.2168, "step": 2686 }, { "epoch": 0.023324450308591073, "grad_norm": 0.1298828125, "learning_rate": 0.001998601483220192, "loss": 0.2236, "step": 2687 }, { "epoch": 0.023333130788795235, "grad_norm": 0.0810546875, "learning_rate": 0.0019985998241826126, "loss": 0.2109, "step": 2688 }, { "epoch": 0.0233418112689994, "grad_norm": 0.107421875, "learning_rate": 0.0019985981641623377, "loss": 0.248, "step": 2689 }, { "epoch": 0.023350491749203565, "grad_norm": 0.08154296875, "learning_rate": 0.0019985965031593697, "loss": 0.1982, "step": 2690 }, { "epoch": 0.02335917222940773, "grad_norm": 0.08740234375, "learning_rate": 0.00199859484117371, "loss": 0.2285, "step": 2691 }, { "epoch": 0.023367852709611895, "grad_norm": 0.0966796875, "learning_rate": 0.00199859317820536, "loss": 0.2021, "step": 2692 }, { "epoch": 0.02337653318981606, "grad_norm": 0.19140625, "learning_rate": 0.0019985915142543224, "loss": 0.3047, "step": 2693 }, { "epoch": 0.023385213670020225, "grad_norm": 0.08154296875, "learning_rate": 0.001998589849320599, "loss": 0.1816, "step": 2694 }, { "epoch": 0.02339389415022439, "grad_norm": 0.0859375, "learning_rate": 0.0019985881834041906, "loss": 0.2598, "step": 2695 }, { "epoch": 0.023402574630428555, "grad_norm": 0.0654296875, "learning_rate": 0.0019985865165050997, "loss": 0.2002, "step": 2696 }, { "epoch": 0.02341125511063272, "grad_norm": 0.07763671875, "learning_rate": 0.0019985848486233286, "loss": 0.2188, "step": 2697 }, { "epoch": 0.023419935590836886, "grad_norm": 0.09765625, "learning_rate": 0.0019985831797588783, "loss": 0.2617, "step": 2698 }, { "epoch": 0.02342861607104105, "grad_norm": 0.1142578125, "learning_rate": 0.001998581509911751, "loss": 0.2314, "step": 2699 }, { "epoch": 0.023437296551245216, "grad_norm": 0.0751953125, "learning_rate": 0.0019985798390819483, "loss": 0.252, "step": 2700 }, { "epoch": 0.02344597703144938, "grad_norm": 0.0751953125, "learning_rate": 0.001998578167269472, "loss": 0.2246, "step": 2701 }, { "epoch": 0.023454657511653546, "grad_norm": 0.07568359375, "learning_rate": 0.001998576494474325, "loss": 0.248, "step": 2702 }, { "epoch": 0.023463337991857708, "grad_norm": 0.0869140625, "learning_rate": 0.0019985748206965076, "loss": 0.2754, "step": 2703 }, { "epoch": 0.023472018472061873, "grad_norm": 0.056640625, "learning_rate": 0.0019985731459360224, "loss": 0.2021, "step": 2704 }, { "epoch": 0.023480698952266038, "grad_norm": 0.1083984375, "learning_rate": 0.001998571470192871, "loss": 0.332, "step": 2705 }, { "epoch": 0.023489379432470203, "grad_norm": 0.09130859375, "learning_rate": 0.001998569793467055, "loss": 0.2695, "step": 2706 }, { "epoch": 0.02349805991267437, "grad_norm": 0.09765625, "learning_rate": 0.0019985681157585772, "loss": 0.248, "step": 2707 }, { "epoch": 0.023506740392878533, "grad_norm": 0.10986328125, "learning_rate": 0.0019985664370674385, "loss": 0.1719, "step": 2708 }, { "epoch": 0.0235154208730827, "grad_norm": 0.1962890625, "learning_rate": 0.0019985647573936413, "loss": 0.2168, "step": 2709 }, { "epoch": 0.023524101353286864, "grad_norm": 0.73828125, "learning_rate": 0.0019985630767371866, "loss": 0.498, "step": 2710 }, { "epoch": 0.02353278183349103, "grad_norm": 0.111328125, "learning_rate": 0.0019985613950980778, "loss": 0.165, "step": 2711 }, { "epoch": 0.023541462313695194, "grad_norm": 0.07958984375, "learning_rate": 0.001998559712476315, "loss": 0.208, "step": 2712 }, { "epoch": 0.02355014279389936, "grad_norm": 0.0791015625, "learning_rate": 0.001998558028871901, "loss": 0.2246, "step": 2713 }, { "epoch": 0.023558823274103524, "grad_norm": 0.07568359375, "learning_rate": 0.0019985563442848375, "loss": 0.252, "step": 2714 }, { "epoch": 0.02356750375430769, "grad_norm": 0.1328125, "learning_rate": 0.0019985546587151263, "loss": 0.1953, "step": 2715 }, { "epoch": 0.023576184234511854, "grad_norm": 0.16796875, "learning_rate": 0.001998552972162769, "loss": 0.2656, "step": 2716 }, { "epoch": 0.02358486471471602, "grad_norm": 0.1376953125, "learning_rate": 0.0019985512846277674, "loss": 0.293, "step": 2717 }, { "epoch": 0.023593545194920185, "grad_norm": 0.076171875, "learning_rate": 0.0019985495961101244, "loss": 0.2227, "step": 2718 }, { "epoch": 0.023602225675124346, "grad_norm": 0.10302734375, "learning_rate": 0.0019985479066098404, "loss": 0.25, "step": 2719 }, { "epoch": 0.02361090615532851, "grad_norm": 0.0888671875, "learning_rate": 0.0019985462161269184, "loss": 0.2168, "step": 2720 }, { "epoch": 0.023619586635532677, "grad_norm": 0.1083984375, "learning_rate": 0.0019985445246613596, "loss": 0.209, "step": 2721 }, { "epoch": 0.02362826711573684, "grad_norm": 0.09375, "learning_rate": 0.001998542832213166, "loss": 0.2246, "step": 2722 }, { "epoch": 0.023636947595941007, "grad_norm": 0.1142578125, "learning_rate": 0.0019985411387823393, "loss": 0.2246, "step": 2723 }, { "epoch": 0.023645628076145172, "grad_norm": 0.08837890625, "learning_rate": 0.0019985394443688817, "loss": 0.2393, "step": 2724 }, { "epoch": 0.023654308556349337, "grad_norm": 0.158203125, "learning_rate": 0.0019985377489727947, "loss": 0.25, "step": 2725 }, { "epoch": 0.023662989036553502, "grad_norm": 0.09033203125, "learning_rate": 0.0019985360525940805, "loss": 0.2246, "step": 2726 }, { "epoch": 0.023671669516757667, "grad_norm": 0.1025390625, "learning_rate": 0.0019985343552327405, "loss": 0.2402, "step": 2727 }, { "epoch": 0.023680349996961832, "grad_norm": 0.07470703125, "learning_rate": 0.0019985326568887772, "loss": 0.1738, "step": 2728 }, { "epoch": 0.023689030477165997, "grad_norm": 0.1044921875, "learning_rate": 0.001998530957562192, "loss": 0.2344, "step": 2729 }, { "epoch": 0.023697710957370163, "grad_norm": 0.08837890625, "learning_rate": 0.001998529257252987, "loss": 0.2461, "step": 2730 }, { "epoch": 0.023706391437574328, "grad_norm": 0.14453125, "learning_rate": 0.0019985275559611633, "loss": 0.2168, "step": 2731 }, { "epoch": 0.023715071917778493, "grad_norm": 0.0849609375, "learning_rate": 0.001998525853686724, "loss": 0.2656, "step": 2732 }, { "epoch": 0.023723752397982658, "grad_norm": 0.0859375, "learning_rate": 0.00199852415042967, "loss": 0.2109, "step": 2733 }, { "epoch": 0.02373243287818682, "grad_norm": 0.158203125, "learning_rate": 0.0019985224461900033, "loss": 0.2871, "step": 2734 }, { "epoch": 0.023741113358390985, "grad_norm": 0.10498046875, "learning_rate": 0.0019985207409677266, "loss": 0.2949, "step": 2735 }, { "epoch": 0.02374979383859515, "grad_norm": 0.068359375, "learning_rate": 0.0019985190347628404, "loss": 0.1895, "step": 2736 }, { "epoch": 0.023758474318799315, "grad_norm": 0.1689453125, "learning_rate": 0.0019985173275753475, "loss": 0.2598, "step": 2737 }, { "epoch": 0.02376715479900348, "grad_norm": 0.059814453125, "learning_rate": 0.0019985156194052495, "loss": 0.2012, "step": 2738 }, { "epoch": 0.023775835279207645, "grad_norm": 0.154296875, "learning_rate": 0.0019985139102525486, "loss": 0.2246, "step": 2739 }, { "epoch": 0.02378451575941181, "grad_norm": 0.0732421875, "learning_rate": 0.001998512200117246, "loss": 0.1914, "step": 2740 }, { "epoch": 0.023793196239615975, "grad_norm": 0.1025390625, "learning_rate": 0.0019985104889993443, "loss": 0.2207, "step": 2741 }, { "epoch": 0.02380187671982014, "grad_norm": 0.12890625, "learning_rate": 0.001998508776898845, "loss": 0.2969, "step": 2742 }, { "epoch": 0.023810557200024306, "grad_norm": 0.130859375, "learning_rate": 0.00199850706381575, "loss": 0.2275, "step": 2743 }, { "epoch": 0.02381923768022847, "grad_norm": 0.06884765625, "learning_rate": 0.001998505349750061, "loss": 0.2275, "step": 2744 }, { "epoch": 0.023827918160432636, "grad_norm": 0.12109375, "learning_rate": 0.0019985036347017803, "loss": 0.1523, "step": 2745 }, { "epoch": 0.0238365986406368, "grad_norm": 0.181640625, "learning_rate": 0.001998501918670909, "loss": 0.2197, "step": 2746 }, { "epoch": 0.023845279120840966, "grad_norm": 0.0859375, "learning_rate": 0.00199850020165745, "loss": 0.209, "step": 2747 }, { "epoch": 0.02385395960104513, "grad_norm": 0.05859375, "learning_rate": 0.001998498483661405, "loss": 0.1914, "step": 2748 }, { "epoch": 0.023862640081249296, "grad_norm": 0.2451171875, "learning_rate": 0.0019984967646827748, "loss": 0.2754, "step": 2749 }, { "epoch": 0.023871320561453458, "grad_norm": 0.0927734375, "learning_rate": 0.0019984950447215627, "loss": 0.1992, "step": 2750 }, { "epoch": 0.023880001041657623, "grad_norm": 0.0966796875, "learning_rate": 0.0019984933237777694, "loss": 0.2051, "step": 2751 }, { "epoch": 0.02388868152186179, "grad_norm": 0.1708984375, "learning_rate": 0.0019984916018513975, "loss": 0.2773, "step": 2752 }, { "epoch": 0.023897362002065953, "grad_norm": 0.0634765625, "learning_rate": 0.0019984898789424488, "loss": 0.1494, "step": 2753 }, { "epoch": 0.02390604248227012, "grad_norm": 0.08154296875, "learning_rate": 0.0019984881550509244, "loss": 0.1758, "step": 2754 }, { "epoch": 0.023914722962474284, "grad_norm": 0.08203125, "learning_rate": 0.001998486430176828, "loss": 0.167, "step": 2755 }, { "epoch": 0.02392340344267845, "grad_norm": 0.1103515625, "learning_rate": 0.0019984847043201595, "loss": 0.2217, "step": 2756 }, { "epoch": 0.023932083922882614, "grad_norm": 0.173828125, "learning_rate": 0.001998482977480922, "loss": 0.2119, "step": 2757 }, { "epoch": 0.02394076440308678, "grad_norm": 0.0751953125, "learning_rate": 0.0019984812496591166, "loss": 0.2021, "step": 2758 }, { "epoch": 0.023949444883290944, "grad_norm": 0.1171875, "learning_rate": 0.001998479520854746, "loss": 0.2363, "step": 2759 }, { "epoch": 0.02395812536349511, "grad_norm": 0.068359375, "learning_rate": 0.0019984777910678118, "loss": 0.2432, "step": 2760 }, { "epoch": 0.023966805843699274, "grad_norm": 0.2451171875, "learning_rate": 0.0019984760602983153, "loss": 0.1758, "step": 2761 }, { "epoch": 0.02397548632390344, "grad_norm": 0.0751953125, "learning_rate": 0.001998474328546259, "loss": 0.21, "step": 2762 }, { "epoch": 0.023984166804107605, "grad_norm": 0.0732421875, "learning_rate": 0.0019984725958116446, "loss": 0.2148, "step": 2763 }, { "epoch": 0.02399284728431177, "grad_norm": 0.06103515625, "learning_rate": 0.001998470862094475, "loss": 0.2246, "step": 2764 }, { "epoch": 0.02400152776451593, "grad_norm": 0.1142578125, "learning_rate": 0.00199846912739475, "loss": 0.2031, "step": 2765 }, { "epoch": 0.024010208244720097, "grad_norm": 0.07470703125, "learning_rate": 0.001998467391712473, "loss": 0.1953, "step": 2766 }, { "epoch": 0.02401888872492426, "grad_norm": 0.1357421875, "learning_rate": 0.0019984656550476455, "loss": 0.2559, "step": 2767 }, { "epoch": 0.024027569205128427, "grad_norm": 0.1787109375, "learning_rate": 0.00199846391740027, "loss": 0.3066, "step": 2768 }, { "epoch": 0.024036249685332592, "grad_norm": 0.083984375, "learning_rate": 0.0019984621787703474, "loss": 0.2041, "step": 2769 }, { "epoch": 0.024044930165536757, "grad_norm": 0.099609375, "learning_rate": 0.0019984604391578803, "loss": 0.2324, "step": 2770 }, { "epoch": 0.024053610645740922, "grad_norm": 0.10302734375, "learning_rate": 0.00199845869856287, "loss": 0.2559, "step": 2771 }, { "epoch": 0.024062291125945087, "grad_norm": 0.1279296875, "learning_rate": 0.001998456956985319, "loss": 0.2109, "step": 2772 }, { "epoch": 0.024070971606149252, "grad_norm": 0.177734375, "learning_rate": 0.001998455214425229, "loss": 0.2656, "step": 2773 }, { "epoch": 0.024079652086353417, "grad_norm": 0.173828125, "learning_rate": 0.001998453470882602, "loss": 0.3633, "step": 2774 }, { "epoch": 0.024088332566557583, "grad_norm": 0.234375, "learning_rate": 0.0019984517263574395, "loss": 0.2559, "step": 2775 }, { "epoch": 0.024097013046761748, "grad_norm": 0.111328125, "learning_rate": 0.0019984499808497437, "loss": 0.2363, "step": 2776 }, { "epoch": 0.024105693526965913, "grad_norm": 0.1142578125, "learning_rate": 0.001998448234359517, "loss": 0.2871, "step": 2777 }, { "epoch": 0.024114374007170078, "grad_norm": 0.146484375, "learning_rate": 0.00199844648688676, "loss": 0.2188, "step": 2778 }, { "epoch": 0.024123054487374243, "grad_norm": 0.0712890625, "learning_rate": 0.001998444738431476, "loss": 0.2158, "step": 2779 }, { "epoch": 0.024131734967578405, "grad_norm": 0.10009765625, "learning_rate": 0.001998442988993666, "loss": 0.248, "step": 2780 }, { "epoch": 0.02414041544778257, "grad_norm": 0.1328125, "learning_rate": 0.0019984412385733326, "loss": 0.2324, "step": 2781 }, { "epoch": 0.024149095927986735, "grad_norm": 0.08642578125, "learning_rate": 0.001998439487170477, "loss": 0.1807, "step": 2782 }, { "epoch": 0.0241577764081909, "grad_norm": 0.271484375, "learning_rate": 0.0019984377347851017, "loss": 0.2051, "step": 2783 }, { "epoch": 0.024166456888395065, "grad_norm": 0.068359375, "learning_rate": 0.001998435981417208, "loss": 0.1943, "step": 2784 }, { "epoch": 0.02417513736859923, "grad_norm": 0.1044921875, "learning_rate": 0.001998434227066799, "loss": 0.3086, "step": 2785 }, { "epoch": 0.024183817848803395, "grad_norm": 0.1982421875, "learning_rate": 0.001998432471733875, "loss": 0.2559, "step": 2786 }, { "epoch": 0.02419249832900756, "grad_norm": 0.0849609375, "learning_rate": 0.001998430715418439, "loss": 0.2383, "step": 2787 }, { "epoch": 0.024201178809211726, "grad_norm": 0.1083984375, "learning_rate": 0.001998428958120493, "loss": 0.2207, "step": 2788 }, { "epoch": 0.02420985928941589, "grad_norm": 0.08349609375, "learning_rate": 0.0019984271998400387, "loss": 0.1689, "step": 2789 }, { "epoch": 0.024218539769620056, "grad_norm": 0.07861328125, "learning_rate": 0.0019984254405770773, "loss": 0.2246, "step": 2790 }, { "epoch": 0.02422722024982422, "grad_norm": 0.158203125, "learning_rate": 0.001998423680331612, "loss": 0.1982, "step": 2791 }, { "epoch": 0.024235900730028386, "grad_norm": 0.10546875, "learning_rate": 0.0019984219191036432, "loss": 0.2324, "step": 2792 }, { "epoch": 0.02424458121023255, "grad_norm": 0.0693359375, "learning_rate": 0.0019984201568931746, "loss": 0.1924, "step": 2793 }, { "epoch": 0.024253261690436716, "grad_norm": 0.078125, "learning_rate": 0.0019984183937002066, "loss": 0.2246, "step": 2794 }, { "epoch": 0.02426194217064088, "grad_norm": 0.08056640625, "learning_rate": 0.001998416629524742, "loss": 0.1826, "step": 2795 }, { "epoch": 0.024270622650845043, "grad_norm": 0.068359375, "learning_rate": 0.001998414864366782, "loss": 0.2188, "step": 2796 }, { "epoch": 0.02427930313104921, "grad_norm": 0.07666015625, "learning_rate": 0.00199841309822633, "loss": 0.2021, "step": 2797 }, { "epoch": 0.024287983611253373, "grad_norm": 0.08447265625, "learning_rate": 0.001998411331103386, "loss": 0.2637, "step": 2798 }, { "epoch": 0.02429666409145754, "grad_norm": 0.1787109375, "learning_rate": 0.0019984095629979534, "loss": 0.1973, "step": 2799 }, { "epoch": 0.024305344571661704, "grad_norm": 0.1064453125, "learning_rate": 0.0019984077939100334, "loss": 0.1992, "step": 2800 }, { "epoch": 0.02431402505186587, "grad_norm": 0.1435546875, "learning_rate": 0.001998406023839628, "loss": 0.1934, "step": 2801 }, { "epoch": 0.024322705532070034, "grad_norm": 0.0947265625, "learning_rate": 0.0019984042527867395, "loss": 0.1943, "step": 2802 }, { "epoch": 0.0243313860122742, "grad_norm": 0.09228515625, "learning_rate": 0.0019984024807513695, "loss": 0.2363, "step": 2803 }, { "epoch": 0.024340066492478364, "grad_norm": 0.10302734375, "learning_rate": 0.0019984007077335202, "loss": 0.2383, "step": 2804 }, { "epoch": 0.02434874697268253, "grad_norm": 0.07470703125, "learning_rate": 0.001998398933733193, "loss": 0.2236, "step": 2805 }, { "epoch": 0.024357427452886694, "grad_norm": 0.0634765625, "learning_rate": 0.0019983971587503907, "loss": 0.1865, "step": 2806 }, { "epoch": 0.02436610793309086, "grad_norm": 0.08154296875, "learning_rate": 0.0019983953827851144, "loss": 0.2402, "step": 2807 }, { "epoch": 0.024374788413295025, "grad_norm": 0.1083984375, "learning_rate": 0.0019983936058373666, "loss": 0.2832, "step": 2808 }, { "epoch": 0.02438346889349919, "grad_norm": 0.1357421875, "learning_rate": 0.001998391827907149, "loss": 0.2578, "step": 2809 }, { "epoch": 0.024392149373703355, "grad_norm": 0.07568359375, "learning_rate": 0.0019983900489944635, "loss": 0.2139, "step": 2810 }, { "epoch": 0.024400829853907516, "grad_norm": 0.07666015625, "learning_rate": 0.001998388269099312, "loss": 0.2383, "step": 2811 }, { "epoch": 0.02440951033411168, "grad_norm": 0.146484375, "learning_rate": 0.0019983864882216974, "loss": 0.2412, "step": 2812 }, { "epoch": 0.024418190814315847, "grad_norm": 0.10205078125, "learning_rate": 0.0019983847063616204, "loss": 0.1748, "step": 2813 }, { "epoch": 0.024426871294520012, "grad_norm": 0.10888671875, "learning_rate": 0.001998382923519083, "loss": 0.2266, "step": 2814 }, { "epoch": 0.024435551774724177, "grad_norm": 0.10107421875, "learning_rate": 0.0019983811396940875, "loss": 0.1875, "step": 2815 }, { "epoch": 0.024444232254928342, "grad_norm": 0.1669921875, "learning_rate": 0.0019983793548866364, "loss": 0.1807, "step": 2816 }, { "epoch": 0.024452912735132507, "grad_norm": 0.115234375, "learning_rate": 0.0019983775690967306, "loss": 0.2129, "step": 2817 }, { "epoch": 0.024461593215336672, "grad_norm": 0.0810546875, "learning_rate": 0.001998375782324373, "loss": 0.2031, "step": 2818 }, { "epoch": 0.024470273695540837, "grad_norm": 0.0927734375, "learning_rate": 0.0019983739945695655, "loss": 0.2148, "step": 2819 }, { "epoch": 0.024478954175745003, "grad_norm": 0.08544921875, "learning_rate": 0.001998372205832309, "loss": 0.291, "step": 2820 }, { "epoch": 0.024487634655949168, "grad_norm": 0.173828125, "learning_rate": 0.0019983704161126064, "loss": 0.25, "step": 2821 }, { "epoch": 0.024496315136153333, "grad_norm": 0.08447265625, "learning_rate": 0.0019983686254104595, "loss": 0.25, "step": 2822 }, { "epoch": 0.024504995616357498, "grad_norm": 0.10888671875, "learning_rate": 0.0019983668337258697, "loss": 0.2148, "step": 2823 }, { "epoch": 0.024513676096561663, "grad_norm": 0.078125, "learning_rate": 0.00199836504105884, "loss": 0.2383, "step": 2824 }, { "epoch": 0.024522356576765828, "grad_norm": 0.07763671875, "learning_rate": 0.0019983632474093716, "loss": 0.1787, "step": 2825 }, { "epoch": 0.024531037056969993, "grad_norm": 0.10986328125, "learning_rate": 0.0019983614527774667, "loss": 0.1934, "step": 2826 }, { "epoch": 0.024539717537174155, "grad_norm": 0.1884765625, "learning_rate": 0.001998359657163127, "loss": 0.2148, "step": 2827 }, { "epoch": 0.02454839801737832, "grad_norm": 0.123046875, "learning_rate": 0.001998357860566355, "loss": 0.2637, "step": 2828 }, { "epoch": 0.024557078497582485, "grad_norm": 0.11669921875, "learning_rate": 0.001998356062987152, "loss": 0.2793, "step": 2829 }, { "epoch": 0.02456575897778665, "grad_norm": 0.072265625, "learning_rate": 0.0019983542644255205, "loss": 0.2344, "step": 2830 }, { "epoch": 0.024574439457990815, "grad_norm": 0.10107421875, "learning_rate": 0.001998352464881462, "loss": 0.2227, "step": 2831 }, { "epoch": 0.02458311993819498, "grad_norm": 0.1806640625, "learning_rate": 0.0019983506643549793, "loss": 0.2041, "step": 2832 }, { "epoch": 0.024591800418399146, "grad_norm": 0.0810546875, "learning_rate": 0.0019983488628460733, "loss": 0.208, "step": 2833 }, { "epoch": 0.02460048089860331, "grad_norm": 0.1328125, "learning_rate": 0.001998347060354747, "loss": 0.249, "step": 2834 }, { "epoch": 0.024609161378807476, "grad_norm": 0.0859375, "learning_rate": 0.001998345256881001, "loss": 0.1973, "step": 2835 }, { "epoch": 0.02461784185901164, "grad_norm": 0.0732421875, "learning_rate": 0.001998343452424839, "loss": 0.2285, "step": 2836 }, { "epoch": 0.024626522339215806, "grad_norm": 0.10009765625, "learning_rate": 0.0019983416469862617, "loss": 0.1758, "step": 2837 }, { "epoch": 0.02463520281941997, "grad_norm": 0.10009765625, "learning_rate": 0.0019983398405652715, "loss": 0.2266, "step": 2838 }, { "epoch": 0.024643883299624136, "grad_norm": 0.12451171875, "learning_rate": 0.0019983380331618705, "loss": 0.252, "step": 2839 }, { "epoch": 0.0246525637798283, "grad_norm": 0.07177734375, "learning_rate": 0.0019983362247760605, "loss": 0.1973, "step": 2840 }, { "epoch": 0.024661244260032467, "grad_norm": 0.083984375, "learning_rate": 0.001998334415407843, "loss": 0.1562, "step": 2841 }, { "epoch": 0.024669924740236628, "grad_norm": 0.07275390625, "learning_rate": 0.001998332605057221, "loss": 0.1953, "step": 2842 }, { "epoch": 0.024678605220440793, "grad_norm": 0.11328125, "learning_rate": 0.0019983307937241957, "loss": 0.2617, "step": 2843 }, { "epoch": 0.02468728570064496, "grad_norm": 0.05908203125, "learning_rate": 0.0019983289814087695, "loss": 0.1523, "step": 2844 }, { "epoch": 0.024695966180849124, "grad_norm": 0.263671875, "learning_rate": 0.0019983271681109447, "loss": 0.1855, "step": 2845 }, { "epoch": 0.02470464666105329, "grad_norm": 0.0791015625, "learning_rate": 0.001998325353830722, "loss": 0.1553, "step": 2846 }, { "epoch": 0.024713327141257454, "grad_norm": 0.205078125, "learning_rate": 0.0019983235385681044, "loss": 0.2148, "step": 2847 }, { "epoch": 0.02472200762146162, "grad_norm": 0.3671875, "learning_rate": 0.001998321722323094, "loss": 0.2148, "step": 2848 }, { "epoch": 0.024730688101665784, "grad_norm": 0.0908203125, "learning_rate": 0.001998319905095692, "loss": 0.2344, "step": 2849 }, { "epoch": 0.02473936858186995, "grad_norm": 0.3125, "learning_rate": 0.001998318086885901, "loss": 0.2598, "step": 2850 }, { "epoch": 0.024748049062074114, "grad_norm": 0.1845703125, "learning_rate": 0.001998316267693723, "loss": 0.25, "step": 2851 }, { "epoch": 0.02475672954227828, "grad_norm": 0.0791015625, "learning_rate": 0.00199831444751916, "loss": 0.1719, "step": 2852 }, { "epoch": 0.024765410022482445, "grad_norm": 0.11865234375, "learning_rate": 0.0019983126263622138, "loss": 0.3223, "step": 2853 }, { "epoch": 0.02477409050268661, "grad_norm": 0.12060546875, "learning_rate": 0.001998310804222886, "loss": 0.1758, "step": 2854 }, { "epoch": 0.024782770982890775, "grad_norm": 0.197265625, "learning_rate": 0.001998308981101179, "loss": 0.2363, "step": 2855 }, { "epoch": 0.02479145146309494, "grad_norm": 0.21875, "learning_rate": 0.001998307156997095, "loss": 0.2148, "step": 2856 }, { "epoch": 0.0248001319432991, "grad_norm": 0.1435546875, "learning_rate": 0.001998305331910636, "loss": 0.1699, "step": 2857 }, { "epoch": 0.024808812423503267, "grad_norm": 0.1376953125, "learning_rate": 0.0019983035058418032, "loss": 0.1963, "step": 2858 }, { "epoch": 0.024817492903707432, "grad_norm": 0.1064453125, "learning_rate": 0.0019983016787905998, "loss": 0.2734, "step": 2859 }, { "epoch": 0.024826173383911597, "grad_norm": 0.1162109375, "learning_rate": 0.0019982998507570267, "loss": 0.1289, "step": 2860 }, { "epoch": 0.024834853864115762, "grad_norm": 0.279296875, "learning_rate": 0.0019982980217410867, "loss": 0.2139, "step": 2861 }, { "epoch": 0.024843534344319927, "grad_norm": 0.1005859375, "learning_rate": 0.0019982961917427815, "loss": 0.209, "step": 2862 }, { "epoch": 0.024852214824524092, "grad_norm": 0.126953125, "learning_rate": 0.0019982943607621127, "loss": 0.1885, "step": 2863 }, { "epoch": 0.024860895304728257, "grad_norm": 0.177734375, "learning_rate": 0.001998292528799083, "loss": 0.2246, "step": 2864 }, { "epoch": 0.024869575784932423, "grad_norm": 0.1767578125, "learning_rate": 0.001998290695853694, "loss": 0.2031, "step": 2865 }, { "epoch": 0.024878256265136588, "grad_norm": 0.419921875, "learning_rate": 0.0019982888619259477, "loss": 0.1895, "step": 2866 }, { "epoch": 0.024886936745340753, "grad_norm": 0.1962890625, "learning_rate": 0.0019982870270158462, "loss": 0.2227, "step": 2867 }, { "epoch": 0.024895617225544918, "grad_norm": 0.0712890625, "learning_rate": 0.001998285191123392, "loss": 0.2129, "step": 2868 }, { "epoch": 0.024904297705749083, "grad_norm": 0.2265625, "learning_rate": 0.001998283354248586, "loss": 0.2354, "step": 2869 }, { "epoch": 0.024912978185953248, "grad_norm": 0.1015625, "learning_rate": 0.001998281516391431, "loss": 0.207, "step": 2870 }, { "epoch": 0.024921658666157413, "grad_norm": 0.087890625, "learning_rate": 0.001998279677551929, "loss": 0.2637, "step": 2871 }, { "epoch": 0.02493033914636158, "grad_norm": 0.11376953125, "learning_rate": 0.0019982778377300816, "loss": 0.2656, "step": 2872 }, { "epoch": 0.02493901962656574, "grad_norm": 0.07763671875, "learning_rate": 0.0019982759969258915, "loss": 0.2266, "step": 2873 }, { "epoch": 0.024947700106769905, "grad_norm": 0.0703125, "learning_rate": 0.0019982741551393597, "loss": 0.1846, "step": 2874 }, { "epoch": 0.02495638058697407, "grad_norm": 0.1689453125, "learning_rate": 0.001998272312370489, "loss": 0.2266, "step": 2875 }, { "epoch": 0.024965061067178235, "grad_norm": 0.2138671875, "learning_rate": 0.0019982704686192813, "loss": 0.1787, "step": 2876 }, { "epoch": 0.0249737415473824, "grad_norm": 0.08203125, "learning_rate": 0.0019982686238857387, "loss": 0.2227, "step": 2877 }, { "epoch": 0.024982422027586566, "grad_norm": 0.185546875, "learning_rate": 0.0019982667781698626, "loss": 0.2197, "step": 2878 }, { "epoch": 0.02499110250779073, "grad_norm": 0.08837890625, "learning_rate": 0.0019982649314716555, "loss": 0.2227, "step": 2879 }, { "epoch": 0.024999782987994896, "grad_norm": 0.2392578125, "learning_rate": 0.0019982630837911196, "loss": 0.3633, "step": 2880 }, { "epoch": 0.02500846346819906, "grad_norm": 0.1337890625, "learning_rate": 0.0019982612351282566, "loss": 0.2285, "step": 2881 }, { "epoch": 0.025017143948403226, "grad_norm": 0.08642578125, "learning_rate": 0.0019982593854830683, "loss": 0.2168, "step": 2882 }, { "epoch": 0.02502582442860739, "grad_norm": 0.1015625, "learning_rate": 0.0019982575348555572, "loss": 0.2832, "step": 2883 }, { "epoch": 0.025034504908811556, "grad_norm": 0.1416015625, "learning_rate": 0.0019982556832457256, "loss": 0.2178, "step": 2884 }, { "epoch": 0.02504318538901572, "grad_norm": 0.0732421875, "learning_rate": 0.0019982538306535748, "loss": 0.1826, "step": 2885 }, { "epoch": 0.025051865869219887, "grad_norm": 0.0849609375, "learning_rate": 0.001998251977079107, "loss": 0.2012, "step": 2886 }, { "epoch": 0.02506054634942405, "grad_norm": 0.2353515625, "learning_rate": 0.0019982501225223243, "loss": 0.2422, "step": 2887 }, { "epoch": 0.025069226829628213, "grad_norm": 0.08154296875, "learning_rate": 0.001998248266983229, "loss": 0.2578, "step": 2888 }, { "epoch": 0.02507790730983238, "grad_norm": 0.08984375, "learning_rate": 0.001998246410461823, "loss": 0.1514, "step": 2889 }, { "epoch": 0.025086587790036544, "grad_norm": 0.1953125, "learning_rate": 0.0019982445529581074, "loss": 0.2266, "step": 2890 }, { "epoch": 0.02509526827024071, "grad_norm": 0.1669921875, "learning_rate": 0.0019982426944720856, "loss": 0.2227, "step": 2891 }, { "epoch": 0.025103948750444874, "grad_norm": 0.06884765625, "learning_rate": 0.0019982408350037594, "loss": 0.1748, "step": 2892 }, { "epoch": 0.02511262923064904, "grad_norm": 0.1875, "learning_rate": 0.00199823897455313, "loss": 0.1689, "step": 2893 }, { "epoch": 0.025121309710853204, "grad_norm": 0.1123046875, "learning_rate": 0.0019982371131202, "loss": 0.1709, "step": 2894 }, { "epoch": 0.02512999019105737, "grad_norm": 0.08447265625, "learning_rate": 0.0019982352507049717, "loss": 0.1875, "step": 2895 }, { "epoch": 0.025138670671261534, "grad_norm": 0.11767578125, "learning_rate": 0.001998233387307447, "loss": 0.2207, "step": 2896 }, { "epoch": 0.0251473511514657, "grad_norm": 0.080078125, "learning_rate": 0.0019982315229276275, "loss": 0.2324, "step": 2897 }, { "epoch": 0.025156031631669865, "grad_norm": 0.11767578125, "learning_rate": 0.0019982296575655153, "loss": 0.2344, "step": 2898 }, { "epoch": 0.02516471211187403, "grad_norm": 0.1484375, "learning_rate": 0.0019982277912211125, "loss": 0.1992, "step": 2899 }, { "epoch": 0.025173392592078195, "grad_norm": 0.10009765625, "learning_rate": 0.0019982259238944216, "loss": 0.2266, "step": 2900 }, { "epoch": 0.02518207307228236, "grad_norm": 0.1259765625, "learning_rate": 0.0019982240555854445, "loss": 0.1924, "step": 2901 }, { "epoch": 0.025190753552486525, "grad_norm": 0.150390625, "learning_rate": 0.001998222186294183, "loss": 0.2119, "step": 2902 }, { "epoch": 0.02519943403269069, "grad_norm": 0.0732421875, "learning_rate": 0.001998220316020639, "loss": 0.2324, "step": 2903 }, { "epoch": 0.025208114512894852, "grad_norm": 0.11669921875, "learning_rate": 0.0019982184447648148, "loss": 0.1973, "step": 2904 }, { "epoch": 0.025216794993099017, "grad_norm": 0.1455078125, "learning_rate": 0.0019982165725267124, "loss": 0.1895, "step": 2905 }, { "epoch": 0.025225475473303182, "grad_norm": 0.220703125, "learning_rate": 0.001998214699306334, "loss": 0.1426, "step": 2906 }, { "epoch": 0.025234155953507347, "grad_norm": 0.228515625, "learning_rate": 0.001998212825103681, "loss": 0.1875, "step": 2907 }, { "epoch": 0.025242836433711512, "grad_norm": 0.11865234375, "learning_rate": 0.0019982109499187568, "loss": 0.2119, "step": 2908 }, { "epoch": 0.025251516913915677, "grad_norm": 0.08251953125, "learning_rate": 0.001998209073751562, "loss": 0.1943, "step": 2909 }, { "epoch": 0.025260197394119843, "grad_norm": 0.10009765625, "learning_rate": 0.0019982071966020993, "loss": 0.2383, "step": 2910 }, { "epoch": 0.025268877874324008, "grad_norm": 0.2294921875, "learning_rate": 0.0019982053184703706, "loss": 0.25, "step": 2911 }, { "epoch": 0.025277558354528173, "grad_norm": 0.13671875, "learning_rate": 0.0019982034393563786, "loss": 0.2402, "step": 2912 }, { "epoch": 0.025286238834732338, "grad_norm": 0.1376953125, "learning_rate": 0.0019982015592601246, "loss": 0.2539, "step": 2913 }, { "epoch": 0.025294919314936503, "grad_norm": 0.123046875, "learning_rate": 0.0019981996781816107, "loss": 0.2266, "step": 2914 }, { "epoch": 0.025303599795140668, "grad_norm": 0.06494140625, "learning_rate": 0.001998197796120839, "loss": 0.2266, "step": 2915 }, { "epoch": 0.025312280275344833, "grad_norm": 0.111328125, "learning_rate": 0.001998195913077812, "loss": 0.2676, "step": 2916 }, { "epoch": 0.025320960755549, "grad_norm": 0.0986328125, "learning_rate": 0.0019981940290525312, "loss": 0.1758, "step": 2917 }, { "epoch": 0.025329641235753163, "grad_norm": 0.1484375, "learning_rate": 0.0019981921440449988, "loss": 0.2148, "step": 2918 }, { "epoch": 0.025338321715957325, "grad_norm": 0.1669921875, "learning_rate": 0.0019981902580552173, "loss": 0.2363, "step": 2919 }, { "epoch": 0.02534700219616149, "grad_norm": 0.10546875, "learning_rate": 0.001998188371083188, "loss": 0.1895, "step": 2920 }, { "epoch": 0.025355682676365655, "grad_norm": 0.1298828125, "learning_rate": 0.0019981864831289144, "loss": 0.1787, "step": 2921 }, { "epoch": 0.02536436315656982, "grad_norm": 0.158203125, "learning_rate": 0.0019981845941923967, "loss": 0.2471, "step": 2922 }, { "epoch": 0.025373043636773986, "grad_norm": 0.08447265625, "learning_rate": 0.001998182704273638, "loss": 0.2109, "step": 2923 }, { "epoch": 0.02538172411697815, "grad_norm": 0.078125, "learning_rate": 0.00199818081337264, "loss": 0.1895, "step": 2924 }, { "epoch": 0.025390404597182316, "grad_norm": 0.2353515625, "learning_rate": 0.0019981789214894054, "loss": 0.1602, "step": 2925 }, { "epoch": 0.02539908507738648, "grad_norm": 0.2216796875, "learning_rate": 0.0019981770286239355, "loss": 0.2637, "step": 2926 }, { "epoch": 0.025407765557590646, "grad_norm": 0.185546875, "learning_rate": 0.0019981751347762327, "loss": 0.2266, "step": 2927 }, { "epoch": 0.02541644603779481, "grad_norm": 0.2021484375, "learning_rate": 0.0019981732399462987, "loss": 0.2119, "step": 2928 }, { "epoch": 0.025425126517998976, "grad_norm": 0.2099609375, "learning_rate": 0.0019981713441341365, "loss": 0.2578, "step": 2929 }, { "epoch": 0.02543380699820314, "grad_norm": 0.1787109375, "learning_rate": 0.0019981694473397474, "loss": 0.1826, "step": 2930 }, { "epoch": 0.025442487478407307, "grad_norm": 0.068359375, "learning_rate": 0.0019981675495631336, "loss": 0.1553, "step": 2931 }, { "epoch": 0.02545116795861147, "grad_norm": 0.10498046875, "learning_rate": 0.0019981656508042977, "loss": 0.2324, "step": 2932 }, { "epoch": 0.025459848438815637, "grad_norm": 0.08740234375, "learning_rate": 0.001998163751063241, "loss": 0.21, "step": 2933 }, { "epoch": 0.0254685289190198, "grad_norm": 0.083984375, "learning_rate": 0.001998161850339966, "loss": 0.209, "step": 2934 }, { "epoch": 0.025477209399223964, "grad_norm": 0.11767578125, "learning_rate": 0.001998159948634475, "loss": 0.1865, "step": 2935 }, { "epoch": 0.02548588987942813, "grad_norm": 0.0771484375, "learning_rate": 0.0019981580459467693, "loss": 0.1689, "step": 2936 }, { "epoch": 0.025494570359632294, "grad_norm": 0.1728515625, "learning_rate": 0.0019981561422768514, "loss": 0.2227, "step": 2937 }, { "epoch": 0.02550325083983646, "grad_norm": 0.09326171875, "learning_rate": 0.001998154237624724, "loss": 0.2363, "step": 2938 }, { "epoch": 0.025511931320040624, "grad_norm": 0.3359375, "learning_rate": 0.001998152331990388, "loss": 0.1934, "step": 2939 }, { "epoch": 0.02552061180024479, "grad_norm": 0.302734375, "learning_rate": 0.0019981504253738466, "loss": 0.2188, "step": 2940 }, { "epoch": 0.025529292280448954, "grad_norm": 0.46484375, "learning_rate": 0.001998148517775101, "loss": 0.1982, "step": 2941 }, { "epoch": 0.02553797276065312, "grad_norm": 0.4765625, "learning_rate": 0.001998146609194154, "loss": 0.2383, "step": 2942 }, { "epoch": 0.025546653240857285, "grad_norm": 0.1474609375, "learning_rate": 0.001998144699631007, "loss": 0.1973, "step": 2943 }, { "epoch": 0.02555533372106145, "grad_norm": 0.0966796875, "learning_rate": 0.0019981427890856628, "loss": 0.1992, "step": 2944 }, { "epoch": 0.025564014201265615, "grad_norm": 0.25, "learning_rate": 0.0019981408775581228, "loss": 0.1396, "step": 2945 }, { "epoch": 0.02557269468146978, "grad_norm": 0.35546875, "learning_rate": 0.0019981389650483897, "loss": 0.207, "step": 2946 }, { "epoch": 0.025581375161673945, "grad_norm": 0.5703125, "learning_rate": 0.0019981370515564654, "loss": 0.2988, "step": 2947 }, { "epoch": 0.02559005564187811, "grad_norm": 0.283203125, "learning_rate": 0.0019981351370823514, "loss": 0.1934, "step": 2948 }, { "epoch": 0.025598736122082275, "grad_norm": 0.337890625, "learning_rate": 0.0019981332216260504, "loss": 0.2061, "step": 2949 }, { "epoch": 0.025607416602286437, "grad_norm": 0.09765625, "learning_rate": 0.001998131305187565, "loss": 0.1904, "step": 2950 }, { "epoch": 0.025616097082490602, "grad_norm": 0.271484375, "learning_rate": 0.0019981293877668962, "loss": 0.1719, "step": 2951 }, { "epoch": 0.025624777562694767, "grad_norm": 0.251953125, "learning_rate": 0.001998127469364047, "loss": 0.2266, "step": 2952 }, { "epoch": 0.025633458042898932, "grad_norm": 0.0908203125, "learning_rate": 0.001998125549979019, "loss": 0.2129, "step": 2953 }, { "epoch": 0.025642138523103097, "grad_norm": 0.17578125, "learning_rate": 0.0019981236296118137, "loss": 0.3711, "step": 2954 }, { "epoch": 0.025650819003307263, "grad_norm": 0.0654296875, "learning_rate": 0.0019981217082624347, "loss": 0.2148, "step": 2955 }, { "epoch": 0.025659499483511428, "grad_norm": 0.2734375, "learning_rate": 0.001998119785930883, "loss": 0.1963, "step": 2956 }, { "epoch": 0.025668179963715593, "grad_norm": 0.1337890625, "learning_rate": 0.001998117862617161, "loss": 0.1826, "step": 2957 }, { "epoch": 0.025676860443919758, "grad_norm": 0.1376953125, "learning_rate": 0.001998115938321271, "loss": 0.2451, "step": 2958 }, { "epoch": 0.025685540924123923, "grad_norm": 0.296875, "learning_rate": 0.0019981140130432146, "loss": 0.2471, "step": 2959 }, { "epoch": 0.025694221404328088, "grad_norm": 0.1513671875, "learning_rate": 0.0019981120867829947, "loss": 0.1826, "step": 2960 }, { "epoch": 0.025702901884532253, "grad_norm": 0.296875, "learning_rate": 0.0019981101595406125, "loss": 0.2139, "step": 2961 }, { "epoch": 0.02571158236473642, "grad_norm": 0.08544921875, "learning_rate": 0.00199810823131607, "loss": 0.2061, "step": 2962 }, { "epoch": 0.025720262844940583, "grad_norm": 0.0830078125, "learning_rate": 0.001998106302109371, "loss": 0.2119, "step": 2963 }, { "epoch": 0.02572894332514475, "grad_norm": 0.185546875, "learning_rate": 0.0019981043719205158, "loss": 0.1816, "step": 2964 }, { "epoch": 0.02573762380534891, "grad_norm": 0.15625, "learning_rate": 0.001998102440749507, "loss": 0.2363, "step": 2965 }, { "epoch": 0.025746304285553075, "grad_norm": 0.0791015625, "learning_rate": 0.001998100508596347, "loss": 0.2324, "step": 2966 }, { "epoch": 0.02575498476575724, "grad_norm": 0.1884765625, "learning_rate": 0.001998098575461038, "loss": 0.2363, "step": 2967 }, { "epoch": 0.025763665245961406, "grad_norm": 0.255859375, "learning_rate": 0.0019980966413435815, "loss": 0.2188, "step": 2968 }, { "epoch": 0.02577234572616557, "grad_norm": 0.10791015625, "learning_rate": 0.0019980947062439806, "loss": 0.2266, "step": 2969 }, { "epoch": 0.025781026206369736, "grad_norm": 0.134765625, "learning_rate": 0.0019980927701622364, "loss": 0.248, "step": 2970 }, { "epoch": 0.0257897066865739, "grad_norm": 0.2734375, "learning_rate": 0.0019980908330983513, "loss": 0.2041, "step": 2971 }, { "epoch": 0.025798387166778066, "grad_norm": 0.083984375, "learning_rate": 0.001998088895052328, "loss": 0.2305, "step": 2972 }, { "epoch": 0.02580706764698223, "grad_norm": 0.1044921875, "learning_rate": 0.001998086956024168, "loss": 0.2422, "step": 2973 }, { "epoch": 0.025815748127186396, "grad_norm": 0.15234375, "learning_rate": 0.001998085016013873, "loss": 0.2539, "step": 2974 }, { "epoch": 0.02582442860739056, "grad_norm": 0.19140625, "learning_rate": 0.0019980830750214464, "loss": 0.209, "step": 2975 }, { "epoch": 0.025833109087594727, "grad_norm": 0.07861328125, "learning_rate": 0.0019980811330468896, "loss": 0.1992, "step": 2976 }, { "epoch": 0.02584178956779889, "grad_norm": 0.09326171875, "learning_rate": 0.0019980791900902047, "loss": 0.2188, "step": 2977 }, { "epoch": 0.025850470048003057, "grad_norm": 0.1376953125, "learning_rate": 0.001998077246151394, "loss": 0.1963, "step": 2978 }, { "epoch": 0.025859150528207222, "grad_norm": 0.091796875, "learning_rate": 0.001998075301230459, "loss": 0.2383, "step": 2979 }, { "epoch": 0.025867831008411387, "grad_norm": 0.07275390625, "learning_rate": 0.0019980733553274033, "loss": 0.1953, "step": 2980 }, { "epoch": 0.02587651148861555, "grad_norm": 0.2353515625, "learning_rate": 0.0019980714084422272, "loss": 0.2363, "step": 2981 }, { "epoch": 0.025885191968819714, "grad_norm": 0.2255859375, "learning_rate": 0.001998069460574934, "loss": 0.1855, "step": 2982 }, { "epoch": 0.02589387244902388, "grad_norm": 0.2236328125, "learning_rate": 0.0019980675117255257, "loss": 0.248, "step": 2983 }, { "epoch": 0.025902552929228044, "grad_norm": 0.08984375, "learning_rate": 0.001998065561894004, "loss": 0.208, "step": 2984 }, { "epoch": 0.02591123340943221, "grad_norm": 0.236328125, "learning_rate": 0.001998063611080371, "loss": 0.2031, "step": 2985 }, { "epoch": 0.025919913889636374, "grad_norm": 0.05908203125, "learning_rate": 0.00199806165928463, "loss": 0.168, "step": 2986 }, { "epoch": 0.02592859436984054, "grad_norm": 0.0859375, "learning_rate": 0.001998059706506782, "loss": 0.2246, "step": 2987 }, { "epoch": 0.025937274850044705, "grad_norm": 0.0927734375, "learning_rate": 0.001998057752746829, "loss": 0.1826, "step": 2988 }, { "epoch": 0.02594595533024887, "grad_norm": 0.11376953125, "learning_rate": 0.0019980557980047737, "loss": 0.1592, "step": 2989 }, { "epoch": 0.025954635810453035, "grad_norm": 0.083984375, "learning_rate": 0.001998053842280618, "loss": 0.1836, "step": 2990 }, { "epoch": 0.0259633162906572, "grad_norm": 0.2197265625, "learning_rate": 0.0019980518855743645, "loss": 0.209, "step": 2991 }, { "epoch": 0.025971996770861365, "grad_norm": 0.1689453125, "learning_rate": 0.0019980499278860146, "loss": 0.2168, "step": 2992 }, { "epoch": 0.02598067725106553, "grad_norm": 1.046875, "learning_rate": 0.001998047969215571, "loss": 0.2324, "step": 2993 }, { "epoch": 0.025989357731269695, "grad_norm": 0.11376953125, "learning_rate": 0.001998046009563035, "loss": 0.2197, "step": 2994 }, { "epoch": 0.02599803821147386, "grad_norm": 0.330078125, "learning_rate": 0.00199804404892841, "loss": 0.2109, "step": 2995 }, { "epoch": 0.026006718691678022, "grad_norm": 0.2578125, "learning_rate": 0.0019980420873116976, "loss": 0.1963, "step": 2996 }, { "epoch": 0.026015399171882187, "grad_norm": 0.482421875, "learning_rate": 0.0019980401247128993, "loss": 0.2305, "step": 2997 }, { "epoch": 0.026024079652086352, "grad_norm": 0.185546875, "learning_rate": 0.001998038161132018, "loss": 0.1777, "step": 2998 }, { "epoch": 0.026032760132290517, "grad_norm": 0.2216796875, "learning_rate": 0.001998036196569056, "loss": 0.2021, "step": 2999 }, { "epoch": 0.026041440612494682, "grad_norm": 0.08935546875, "learning_rate": 0.001998034231024015, "loss": 0.2012, "step": 3000 }, { "epoch": 0.026050121092698848, "grad_norm": 0.4765625, "learning_rate": 0.0019980322644968973, "loss": 0.2773, "step": 3001 }, { "epoch": 0.026058801572903013, "grad_norm": 0.59765625, "learning_rate": 0.001998030296987705, "loss": 0.2471, "step": 3002 }, { "epoch": 0.026067482053107178, "grad_norm": 0.119140625, "learning_rate": 0.00199802832849644, "loss": 0.2334, "step": 3003 }, { "epoch": 0.026076162533311343, "grad_norm": 0.2236328125, "learning_rate": 0.0019980263590231046, "loss": 0.1885, "step": 3004 }, { "epoch": 0.026084843013515508, "grad_norm": 0.1240234375, "learning_rate": 0.0019980243885677016, "loss": 0.2188, "step": 3005 }, { "epoch": 0.026093523493719673, "grad_norm": 0.087890625, "learning_rate": 0.001998022417130232, "loss": 0.2051, "step": 3006 }, { "epoch": 0.02610220397392384, "grad_norm": 0.2060546875, "learning_rate": 0.001998020444710699, "loss": 0.2539, "step": 3007 }, { "epoch": 0.026110884454128003, "grad_norm": 0.07568359375, "learning_rate": 0.0019980184713091044, "loss": 0.1709, "step": 3008 }, { "epoch": 0.02611956493433217, "grad_norm": 0.44140625, "learning_rate": 0.00199801649692545, "loss": 0.2354, "step": 3009 }, { "epoch": 0.026128245414536334, "grad_norm": 0.11083984375, "learning_rate": 0.0019980145215597383, "loss": 0.2109, "step": 3010 }, { "epoch": 0.026136925894740495, "grad_norm": 0.8203125, "learning_rate": 0.001998012545211972, "loss": 0.2617, "step": 3011 }, { "epoch": 0.02614560637494466, "grad_norm": 0.1025390625, "learning_rate": 0.001998010567882152, "loss": 0.1914, "step": 3012 }, { "epoch": 0.026154286855148826, "grad_norm": 0.1484375, "learning_rate": 0.001998008589570281, "loss": 0.2363, "step": 3013 }, { "epoch": 0.02616296733535299, "grad_norm": 0.1845703125, "learning_rate": 0.001998006610276362, "loss": 0.1699, "step": 3014 }, { "epoch": 0.026171647815557156, "grad_norm": 0.1669921875, "learning_rate": 0.001998004630000396, "loss": 0.1719, "step": 3015 }, { "epoch": 0.02618032829576132, "grad_norm": 0.453125, "learning_rate": 0.001998002648742386, "loss": 0.2461, "step": 3016 }, { "epoch": 0.026189008775965486, "grad_norm": 0.1708984375, "learning_rate": 0.001998000666502333, "loss": 0.2539, "step": 3017 }, { "epoch": 0.02619768925616965, "grad_norm": 0.07861328125, "learning_rate": 0.0019979986832802405, "loss": 0.2109, "step": 3018 }, { "epoch": 0.026206369736373816, "grad_norm": 0.1005859375, "learning_rate": 0.00199799669907611, "loss": 0.252, "step": 3019 }, { "epoch": 0.02621505021657798, "grad_norm": 0.302734375, "learning_rate": 0.001997994713889944, "loss": 0.2461, "step": 3020 }, { "epoch": 0.026223730696782147, "grad_norm": 0.216796875, "learning_rate": 0.0019979927277217445, "loss": 0.1895, "step": 3021 }, { "epoch": 0.02623241117698631, "grad_norm": 0.1318359375, "learning_rate": 0.0019979907405715137, "loss": 0.2188, "step": 3022 }, { "epoch": 0.026241091657190477, "grad_norm": 0.14453125, "learning_rate": 0.0019979887524392533, "loss": 0.1807, "step": 3023 }, { "epoch": 0.026249772137394642, "grad_norm": 0.07470703125, "learning_rate": 0.0019979867633249664, "loss": 0.1992, "step": 3024 }, { "epoch": 0.026258452617598807, "grad_norm": 0.2001953125, "learning_rate": 0.0019979847732286542, "loss": 0.2207, "step": 3025 }, { "epoch": 0.026267133097802972, "grad_norm": 0.296875, "learning_rate": 0.00199798278215032, "loss": 0.1924, "step": 3026 }, { "epoch": 0.026275813578007134, "grad_norm": 0.0888671875, "learning_rate": 0.0019979807900899647, "loss": 0.1953, "step": 3027 }, { "epoch": 0.0262844940582113, "grad_norm": 0.09912109375, "learning_rate": 0.001997978797047591, "loss": 0.1982, "step": 3028 }, { "epoch": 0.026293174538415464, "grad_norm": 0.197265625, "learning_rate": 0.0019979768030232016, "loss": 0.2441, "step": 3029 }, { "epoch": 0.02630185501861963, "grad_norm": 0.1962890625, "learning_rate": 0.001997974808016798, "loss": 0.1455, "step": 3030 }, { "epoch": 0.026310535498823794, "grad_norm": 0.10498046875, "learning_rate": 0.0019979728120283827, "loss": 0.2168, "step": 3031 }, { "epoch": 0.02631921597902796, "grad_norm": 0.1640625, "learning_rate": 0.0019979708150579577, "loss": 0.1914, "step": 3032 }, { "epoch": 0.026327896459232124, "grad_norm": 0.53125, "learning_rate": 0.0019979688171055257, "loss": 0.2119, "step": 3033 }, { "epoch": 0.02633657693943629, "grad_norm": 0.1748046875, "learning_rate": 0.0019979668181710885, "loss": 0.2148, "step": 3034 }, { "epoch": 0.026345257419640455, "grad_norm": 0.10546875, "learning_rate": 0.001997964818254648, "loss": 0.2852, "step": 3035 }, { "epoch": 0.02635393789984462, "grad_norm": 0.0810546875, "learning_rate": 0.0019979628173562064, "loss": 0.2383, "step": 3036 }, { "epoch": 0.026362618380048785, "grad_norm": 0.1865234375, "learning_rate": 0.001997960815475767, "loss": 0.2051, "step": 3037 }, { "epoch": 0.02637129886025295, "grad_norm": 0.0947265625, "learning_rate": 0.0019979588126133306, "loss": 0.2109, "step": 3038 }, { "epoch": 0.026379979340457115, "grad_norm": 0.296875, "learning_rate": 0.0019979568087689, "loss": 0.1953, "step": 3039 }, { "epoch": 0.02638865982066128, "grad_norm": 0.498046875, "learning_rate": 0.001997954803942477, "loss": 0.2246, "step": 3040 }, { "epoch": 0.026397340300865445, "grad_norm": 0.1953125, "learning_rate": 0.0019979527981340644, "loss": 0.1875, "step": 3041 }, { "epoch": 0.026406020781069607, "grad_norm": 0.373046875, "learning_rate": 0.0019979507913436643, "loss": 0.2051, "step": 3042 }, { "epoch": 0.026414701261273772, "grad_norm": 0.08056640625, "learning_rate": 0.0019979487835712784, "loss": 0.1816, "step": 3043 }, { "epoch": 0.026423381741477937, "grad_norm": 0.12158203125, "learning_rate": 0.001997946774816909, "loss": 0.2197, "step": 3044 }, { "epoch": 0.026432062221682102, "grad_norm": 0.392578125, "learning_rate": 0.001997944765080559, "loss": 0.2539, "step": 3045 }, { "epoch": 0.026440742701886268, "grad_norm": 0.287109375, "learning_rate": 0.0019979427543622297, "loss": 0.1699, "step": 3046 }, { "epoch": 0.026449423182090433, "grad_norm": 0.166015625, "learning_rate": 0.001997940742661924, "loss": 0.3574, "step": 3047 }, { "epoch": 0.026458103662294598, "grad_norm": 0.185546875, "learning_rate": 0.001997938729979644, "loss": 0.2422, "step": 3048 }, { "epoch": 0.026466784142498763, "grad_norm": 0.2119140625, "learning_rate": 0.001997936716315391, "loss": 0.2422, "step": 3049 }, { "epoch": 0.026475464622702928, "grad_norm": 0.296875, "learning_rate": 0.0019979347016691688, "loss": 0.2119, "step": 3050 }, { "epoch": 0.026484145102907093, "grad_norm": 0.0703125, "learning_rate": 0.001997932686040978, "loss": 0.168, "step": 3051 }, { "epoch": 0.02649282558311126, "grad_norm": 0.1044921875, "learning_rate": 0.0019979306694308217, "loss": 0.2578, "step": 3052 }, { "epoch": 0.026501506063315423, "grad_norm": 0.13671875, "learning_rate": 0.001997928651838702, "loss": 0.1875, "step": 3053 }, { "epoch": 0.02651018654351959, "grad_norm": 0.384765625, "learning_rate": 0.001997926633264621, "loss": 0.1797, "step": 3054 }, { "epoch": 0.026518867023723754, "grad_norm": 0.17578125, "learning_rate": 0.001997924613708581, "loss": 0.1797, "step": 3055 }, { "epoch": 0.02652754750392792, "grad_norm": 0.369140625, "learning_rate": 0.001997922593170584, "loss": 0.1846, "step": 3056 }, { "epoch": 0.026536227984132084, "grad_norm": 0.29296875, "learning_rate": 0.001997920571650632, "loss": 0.2051, "step": 3057 }, { "epoch": 0.026544908464336246, "grad_norm": 0.08544921875, "learning_rate": 0.0019979185491487282, "loss": 0.2207, "step": 3058 }, { "epoch": 0.02655358894454041, "grad_norm": 0.17578125, "learning_rate": 0.001997916525664874, "loss": 0.2031, "step": 3059 }, { "epoch": 0.026562269424744576, "grad_norm": 0.10595703125, "learning_rate": 0.0019979145011990713, "loss": 0.1768, "step": 3060 }, { "epoch": 0.02657094990494874, "grad_norm": 0.265625, "learning_rate": 0.001997912475751323, "loss": 0.252, "step": 3061 }, { "epoch": 0.026579630385152906, "grad_norm": 0.10595703125, "learning_rate": 0.0019979104493216314, "loss": 0.1807, "step": 3062 }, { "epoch": 0.02658831086535707, "grad_norm": 0.0888671875, "learning_rate": 0.0019979084219099983, "loss": 0.209, "step": 3063 }, { "epoch": 0.026596991345561236, "grad_norm": 0.1162109375, "learning_rate": 0.001997906393516426, "loss": 0.2266, "step": 3064 }, { "epoch": 0.0266056718257654, "grad_norm": 0.11572265625, "learning_rate": 0.0019979043641409166, "loss": 0.2246, "step": 3065 }, { "epoch": 0.026614352305969566, "grad_norm": 0.41015625, "learning_rate": 0.001997902333783473, "loss": 0.2324, "step": 3066 }, { "epoch": 0.02662303278617373, "grad_norm": 0.2373046875, "learning_rate": 0.0019979003024440966, "loss": 0.2129, "step": 3067 }, { "epoch": 0.026631713266377897, "grad_norm": 0.09423828125, "learning_rate": 0.0019978982701227897, "loss": 0.208, "step": 3068 }, { "epoch": 0.026640393746582062, "grad_norm": 0.81640625, "learning_rate": 0.0019978962368195547, "loss": 0.8203, "step": 3069 }, { "epoch": 0.026649074226786227, "grad_norm": 0.16796875, "learning_rate": 0.0019978942025343943, "loss": 0.2061, "step": 3070 }, { "epoch": 0.026657754706990392, "grad_norm": 0.41796875, "learning_rate": 0.0019978921672673096, "loss": 0.167, "step": 3071 }, { "epoch": 0.026666435187194557, "grad_norm": 0.408203125, "learning_rate": 0.0019978901310183043, "loss": 0.1836, "step": 3072 }, { "epoch": 0.02667511566739872, "grad_norm": 0.1904296875, "learning_rate": 0.0019978880937873795, "loss": 0.2285, "step": 3073 }, { "epoch": 0.026683796147602884, "grad_norm": 0.60546875, "learning_rate": 0.001997886055574538, "loss": 0.2734, "step": 3074 }, { "epoch": 0.02669247662780705, "grad_norm": 0.27734375, "learning_rate": 0.0019978840163797813, "loss": 0.2891, "step": 3075 }, { "epoch": 0.026701157108011214, "grad_norm": 0.1953125, "learning_rate": 0.0019978819762031122, "loss": 0.2373, "step": 3076 }, { "epoch": 0.02670983758821538, "grad_norm": 0.09375, "learning_rate": 0.0019978799350445337, "loss": 0.2188, "step": 3077 }, { "epoch": 0.026718518068419544, "grad_norm": 0.345703125, "learning_rate": 0.0019978778929040466, "loss": 0.2734, "step": 3078 }, { "epoch": 0.02672719854862371, "grad_norm": 0.337890625, "learning_rate": 0.0019978758497816535, "loss": 0.2324, "step": 3079 }, { "epoch": 0.026735879028827875, "grad_norm": 0.2294921875, "learning_rate": 0.0019978738056773567, "loss": 0.2422, "step": 3080 }, { "epoch": 0.02674455950903204, "grad_norm": 0.1708984375, "learning_rate": 0.0019978717605911595, "loss": 0.1973, "step": 3081 }, { "epoch": 0.026753239989236205, "grad_norm": 0.15625, "learning_rate": 0.0019978697145230625, "loss": 0.3438, "step": 3082 }, { "epoch": 0.02676192046944037, "grad_norm": 0.1259765625, "learning_rate": 0.001997867667473069, "loss": 0.1914, "step": 3083 }, { "epoch": 0.026770600949644535, "grad_norm": 0.185546875, "learning_rate": 0.001997865619441181, "loss": 0.2412, "step": 3084 }, { "epoch": 0.0267792814298487, "grad_norm": 0.279296875, "learning_rate": 0.0019978635704274, "loss": 0.2197, "step": 3085 }, { "epoch": 0.026787961910052865, "grad_norm": 0.2294921875, "learning_rate": 0.0019978615204317295, "loss": 0.2266, "step": 3086 }, { "epoch": 0.02679664239025703, "grad_norm": 0.0712890625, "learning_rate": 0.0019978594694541707, "loss": 0.1797, "step": 3087 }, { "epoch": 0.026805322870461192, "grad_norm": 0.11669921875, "learning_rate": 0.0019978574174947267, "loss": 0.2383, "step": 3088 }, { "epoch": 0.026814003350665357, "grad_norm": 0.2255859375, "learning_rate": 0.001997855364553399, "loss": 0.2227, "step": 3089 }, { "epoch": 0.026822683830869522, "grad_norm": 0.10986328125, "learning_rate": 0.0019978533106301904, "loss": 0.2285, "step": 3090 }, { "epoch": 0.026831364311073688, "grad_norm": 0.1376953125, "learning_rate": 0.0019978512557251027, "loss": 0.2578, "step": 3091 }, { "epoch": 0.026840044791277853, "grad_norm": 0.20703125, "learning_rate": 0.001997849199838138, "loss": 0.2695, "step": 3092 }, { "epoch": 0.026848725271482018, "grad_norm": 0.1083984375, "learning_rate": 0.0019978471429693, "loss": 0.168, "step": 3093 }, { "epoch": 0.026857405751686183, "grad_norm": 0.1962890625, "learning_rate": 0.001997845085118589, "loss": 0.1953, "step": 3094 }, { "epoch": 0.026866086231890348, "grad_norm": 0.134765625, "learning_rate": 0.0019978430262860085, "loss": 0.3535, "step": 3095 }, { "epoch": 0.026874766712094513, "grad_norm": 0.11962890625, "learning_rate": 0.00199784096647156, "loss": 0.2109, "step": 3096 }, { "epoch": 0.02688344719229868, "grad_norm": 0.1015625, "learning_rate": 0.001997838905675246, "loss": 0.1875, "step": 3097 }, { "epoch": 0.026892127672502843, "grad_norm": 0.142578125, "learning_rate": 0.0019978368438970696, "loss": 0.1914, "step": 3098 }, { "epoch": 0.02690080815270701, "grad_norm": 0.119140625, "learning_rate": 0.0019978347811370318, "loss": 0.1973, "step": 3099 }, { "epoch": 0.026909488632911174, "grad_norm": 0.30078125, "learning_rate": 0.0019978327173951357, "loss": 0.25, "step": 3100 }, { "epoch": 0.02691816911311534, "grad_norm": 0.1708984375, "learning_rate": 0.0019978306526713826, "loss": 0.2129, "step": 3101 }, { "epoch": 0.026926849593319504, "grad_norm": 0.1083984375, "learning_rate": 0.001997828586965776, "loss": 0.2061, "step": 3102 }, { "epoch": 0.02693553007352367, "grad_norm": 0.09814453125, "learning_rate": 0.0019978265202783172, "loss": 0.1846, "step": 3103 }, { "epoch": 0.02694421055372783, "grad_norm": 0.2314453125, "learning_rate": 0.001997824452609009, "loss": 0.2891, "step": 3104 }, { "epoch": 0.026952891033931996, "grad_norm": 0.1318359375, "learning_rate": 0.001997822383957853, "loss": 0.1865, "step": 3105 }, { "epoch": 0.02696157151413616, "grad_norm": 0.06396484375, "learning_rate": 0.0019978203143248526, "loss": 0.1494, "step": 3106 }, { "epoch": 0.026970251994340326, "grad_norm": 0.1171875, "learning_rate": 0.0019978182437100094, "loss": 0.1797, "step": 3107 }, { "epoch": 0.02697893247454449, "grad_norm": 0.369140625, "learning_rate": 0.0019978161721133252, "loss": 0.1855, "step": 3108 }, { "epoch": 0.026987612954748656, "grad_norm": 0.1572265625, "learning_rate": 0.001997814099534803, "loss": 0.1543, "step": 3109 }, { "epoch": 0.02699629343495282, "grad_norm": 0.10888671875, "learning_rate": 0.0019978120259744447, "loss": 0.2539, "step": 3110 }, { "epoch": 0.027004973915156986, "grad_norm": 0.50390625, "learning_rate": 0.0019978099514322526, "loss": 0.1816, "step": 3111 }, { "epoch": 0.02701365439536115, "grad_norm": 0.1640625, "learning_rate": 0.001997807875908229, "loss": 0.1924, "step": 3112 }, { "epoch": 0.027022334875565317, "grad_norm": 0.33203125, "learning_rate": 0.0019978057994023764, "loss": 0.249, "step": 3113 }, { "epoch": 0.027031015355769482, "grad_norm": 0.400390625, "learning_rate": 0.0019978037219146967, "loss": 0.2275, "step": 3114 }, { "epoch": 0.027039695835973647, "grad_norm": 0.283203125, "learning_rate": 0.0019978016434451925, "loss": 0.1699, "step": 3115 }, { "epoch": 0.027048376316177812, "grad_norm": 0.185546875, "learning_rate": 0.0019977995639938657, "loss": 0.1953, "step": 3116 }, { "epoch": 0.027057056796381977, "grad_norm": 0.1572265625, "learning_rate": 0.0019977974835607188, "loss": 0.2236, "step": 3117 }, { "epoch": 0.027065737276586142, "grad_norm": 0.0947265625, "learning_rate": 0.001997795402145754, "loss": 0.1846, "step": 3118 }, { "epoch": 0.027074417756790304, "grad_norm": 0.1142578125, "learning_rate": 0.001997793319748974, "loss": 0.2363, "step": 3119 }, { "epoch": 0.02708309823699447, "grad_norm": 0.1728515625, "learning_rate": 0.0019977912363703804, "loss": 0.1865, "step": 3120 }, { "epoch": 0.027091778717198634, "grad_norm": 0.13671875, "learning_rate": 0.0019977891520099756, "loss": 0.2031, "step": 3121 }, { "epoch": 0.0271004591974028, "grad_norm": 0.2470703125, "learning_rate": 0.0019977870666677625, "loss": 0.1836, "step": 3122 }, { "epoch": 0.027109139677606964, "grad_norm": 0.119140625, "learning_rate": 0.001997784980343743, "loss": 0.1865, "step": 3123 }, { "epoch": 0.02711782015781113, "grad_norm": 0.11474609375, "learning_rate": 0.0019977828930379193, "loss": 0.1719, "step": 3124 }, { "epoch": 0.027126500638015295, "grad_norm": 0.0732421875, "learning_rate": 0.0019977808047502935, "loss": 0.1934, "step": 3125 }, { "epoch": 0.02713518111821946, "grad_norm": 0.14453125, "learning_rate": 0.001997778715480868, "loss": 0.1611, "step": 3126 }, { "epoch": 0.027143861598423625, "grad_norm": 0.11279296875, "learning_rate": 0.001997776625229645, "loss": 0.1836, "step": 3127 }, { "epoch": 0.02715254207862779, "grad_norm": 0.1552734375, "learning_rate": 0.0019977745339966276, "loss": 0.2148, "step": 3128 }, { "epoch": 0.027161222558831955, "grad_norm": 0.255859375, "learning_rate": 0.0019977724417818174, "loss": 0.2207, "step": 3129 }, { "epoch": 0.02716990303903612, "grad_norm": 0.212890625, "learning_rate": 0.0019977703485852165, "loss": 0.1709, "step": 3130 }, { "epoch": 0.027178583519240285, "grad_norm": 0.1474609375, "learning_rate": 0.0019977682544068272, "loss": 0.2012, "step": 3131 }, { "epoch": 0.02718726399944445, "grad_norm": 0.099609375, "learning_rate": 0.0019977661592466525, "loss": 0.209, "step": 3132 }, { "epoch": 0.027195944479648616, "grad_norm": 0.271484375, "learning_rate": 0.001997764063104694, "loss": 0.3164, "step": 3133 }, { "epoch": 0.02720462495985278, "grad_norm": 0.09814453125, "learning_rate": 0.0019977619659809543, "loss": 0.2012, "step": 3134 }, { "epoch": 0.027213305440056942, "grad_norm": 0.166015625, "learning_rate": 0.001997759867875435, "loss": 0.2305, "step": 3135 }, { "epoch": 0.027221985920261108, "grad_norm": 0.3046875, "learning_rate": 0.0019977577687881397, "loss": 0.1934, "step": 3136 }, { "epoch": 0.027230666400465273, "grad_norm": 0.07080078125, "learning_rate": 0.00199775566871907, "loss": 0.2041, "step": 3137 }, { "epoch": 0.027239346880669438, "grad_norm": 0.19140625, "learning_rate": 0.001997753567668228, "loss": 0.2246, "step": 3138 }, { "epoch": 0.027248027360873603, "grad_norm": 0.1591796875, "learning_rate": 0.0019977514656356163, "loss": 0.2178, "step": 3139 }, { "epoch": 0.027256707841077768, "grad_norm": 0.123046875, "learning_rate": 0.0019977493626212365, "loss": 0.1943, "step": 3140 }, { "epoch": 0.027265388321281933, "grad_norm": 0.88671875, "learning_rate": 0.001997747258625092, "loss": 0.2129, "step": 3141 }, { "epoch": 0.027274068801486098, "grad_norm": 0.51953125, "learning_rate": 0.001997745153647185, "loss": 0.1885, "step": 3142 }, { "epoch": 0.027282749281690263, "grad_norm": 0.099609375, "learning_rate": 0.001997743047687517, "loss": 0.2021, "step": 3143 }, { "epoch": 0.02729142976189443, "grad_norm": 0.08056640625, "learning_rate": 0.0019977409407460904, "loss": 0.2266, "step": 3144 }, { "epoch": 0.027300110242098594, "grad_norm": 0.1640625, "learning_rate": 0.0019977388328229076, "loss": 0.2432, "step": 3145 }, { "epoch": 0.02730879072230276, "grad_norm": 0.0859375, "learning_rate": 0.001997736723917972, "loss": 0.1826, "step": 3146 }, { "epoch": 0.027317471202506924, "grad_norm": 0.166015625, "learning_rate": 0.0019977346140312843, "loss": 0.2207, "step": 3147 }, { "epoch": 0.02732615168271109, "grad_norm": 1.4921875, "learning_rate": 0.0019977325031628477, "loss": 0.375, "step": 3148 }, { "epoch": 0.027334832162915254, "grad_norm": 0.19921875, "learning_rate": 0.0019977303913126646, "loss": 0.2539, "step": 3149 }, { "epoch": 0.027343512643119416, "grad_norm": 0.58984375, "learning_rate": 0.001997728278480737, "loss": 0.1807, "step": 3150 }, { "epoch": 0.02735219312332358, "grad_norm": 0.359375, "learning_rate": 0.001997726164667067, "loss": 0.2012, "step": 3151 }, { "epoch": 0.027360873603527746, "grad_norm": 0.349609375, "learning_rate": 0.001997724049871657, "loss": 0.1797, "step": 3152 }, { "epoch": 0.02736955408373191, "grad_norm": 0.337890625, "learning_rate": 0.0019977219340945097, "loss": 0.207, "step": 3153 }, { "epoch": 0.027378234563936076, "grad_norm": 0.50390625, "learning_rate": 0.001997719817335627, "loss": 0.1836, "step": 3154 }, { "epoch": 0.02738691504414024, "grad_norm": 0.486328125, "learning_rate": 0.0019977176995950117, "loss": 0.1953, "step": 3155 }, { "epoch": 0.027395595524344406, "grad_norm": 0.09130859375, "learning_rate": 0.0019977155808726657, "loss": 0.1982, "step": 3156 }, { "epoch": 0.02740427600454857, "grad_norm": 0.0947265625, "learning_rate": 0.001997713461168591, "loss": 0.2246, "step": 3157 }, { "epoch": 0.027412956484752737, "grad_norm": 0.255859375, "learning_rate": 0.001997711340482791, "loss": 0.207, "step": 3158 }, { "epoch": 0.027421636964956902, "grad_norm": 0.08935546875, "learning_rate": 0.001997709218815267, "loss": 0.2109, "step": 3159 }, { "epoch": 0.027430317445161067, "grad_norm": 0.2734375, "learning_rate": 0.001997707096166022, "loss": 0.1387, "step": 3160 }, { "epoch": 0.027438997925365232, "grad_norm": 0.0986328125, "learning_rate": 0.0019977049725350576, "loss": 0.2559, "step": 3161 }, { "epoch": 0.027447678405569397, "grad_norm": 0.64453125, "learning_rate": 0.0019977028479223765, "loss": 0.207, "step": 3162 }, { "epoch": 0.027456358885773562, "grad_norm": 0.5078125, "learning_rate": 0.001997700722327981, "loss": 0.1836, "step": 3163 }, { "epoch": 0.027465039365977727, "grad_norm": 0.11865234375, "learning_rate": 0.001997698595751874, "loss": 0.1729, "step": 3164 }, { "epoch": 0.02747371984618189, "grad_norm": 0.0791015625, "learning_rate": 0.001997696468194057, "loss": 0.2021, "step": 3165 }, { "epoch": 0.027482400326386054, "grad_norm": 0.109375, "learning_rate": 0.001997694339654533, "loss": 0.1885, "step": 3166 }, { "epoch": 0.02749108080659022, "grad_norm": 0.1005859375, "learning_rate": 0.0019976922101333037, "loss": 0.1895, "step": 3167 }, { "epoch": 0.027499761286794384, "grad_norm": 0.13671875, "learning_rate": 0.0019976900796303716, "loss": 0.2031, "step": 3168 }, { "epoch": 0.02750844176699855, "grad_norm": 0.11376953125, "learning_rate": 0.001997687948145739, "loss": 0.2051, "step": 3169 }, { "epoch": 0.027517122247202715, "grad_norm": 0.19140625, "learning_rate": 0.0019976858156794085, "loss": 0.1621, "step": 3170 }, { "epoch": 0.02752580272740688, "grad_norm": 0.1708984375, "learning_rate": 0.001997683682231382, "loss": 0.207, "step": 3171 }, { "epoch": 0.027534483207611045, "grad_norm": 0.26953125, "learning_rate": 0.0019976815478016624, "loss": 0.252, "step": 3172 }, { "epoch": 0.02754316368781521, "grad_norm": 0.1328125, "learning_rate": 0.0019976794123902518, "loss": 0.2754, "step": 3173 }, { "epoch": 0.027551844168019375, "grad_norm": 0.1396484375, "learning_rate": 0.0019976772759971524, "loss": 0.1816, "step": 3174 }, { "epoch": 0.02756052464822354, "grad_norm": 0.125, "learning_rate": 0.001997675138622367, "loss": 0.249, "step": 3175 }, { "epoch": 0.027569205128427705, "grad_norm": 0.125, "learning_rate": 0.001997673000265897, "loss": 0.209, "step": 3176 }, { "epoch": 0.02757788560863187, "grad_norm": 0.291015625, "learning_rate": 0.0019976708609277453, "loss": 0.1602, "step": 3177 }, { "epoch": 0.027586566088836036, "grad_norm": 0.390625, "learning_rate": 0.0019976687206079147, "loss": 0.1875, "step": 3178 }, { "epoch": 0.0275952465690402, "grad_norm": 0.232421875, "learning_rate": 0.0019976665793064066, "loss": 0.1885, "step": 3179 }, { "epoch": 0.027603927049244366, "grad_norm": 0.380859375, "learning_rate": 0.001997664437023224, "loss": 0.2207, "step": 3180 }, { "epoch": 0.027612607529448528, "grad_norm": 0.13671875, "learning_rate": 0.001997662293758369, "loss": 0.2051, "step": 3181 }, { "epoch": 0.027621288009652693, "grad_norm": 0.1728515625, "learning_rate": 0.001997660149511844, "loss": 0.2246, "step": 3182 }, { "epoch": 0.027629968489856858, "grad_norm": 0.10693359375, "learning_rate": 0.0019976580042836514, "loss": 0.2207, "step": 3183 }, { "epoch": 0.027638648970061023, "grad_norm": 0.1328125, "learning_rate": 0.0019976558580737935, "loss": 0.2168, "step": 3184 }, { "epoch": 0.027647329450265188, "grad_norm": 0.267578125, "learning_rate": 0.001997653710882273, "loss": 0.2334, "step": 3185 }, { "epoch": 0.027656009930469353, "grad_norm": 0.46484375, "learning_rate": 0.0019976515627090916, "loss": 0.1797, "step": 3186 }, { "epoch": 0.027664690410673518, "grad_norm": 0.2734375, "learning_rate": 0.0019976494135542514, "loss": 0.252, "step": 3187 }, { "epoch": 0.027673370890877683, "grad_norm": 0.357421875, "learning_rate": 0.001997647263417756, "loss": 0.2656, "step": 3188 }, { "epoch": 0.02768205137108185, "grad_norm": 0.10546875, "learning_rate": 0.001997645112299607, "loss": 0.1729, "step": 3189 }, { "epoch": 0.027690731851286014, "grad_norm": 0.7421875, "learning_rate": 0.0019976429601998064, "loss": 0.2598, "step": 3190 }, { "epoch": 0.02769941233149018, "grad_norm": 0.181640625, "learning_rate": 0.001997640807118357, "loss": 0.1943, "step": 3191 }, { "epoch": 0.027708092811694344, "grad_norm": 0.1953125, "learning_rate": 0.0019976386530552613, "loss": 0.2305, "step": 3192 }, { "epoch": 0.02771677329189851, "grad_norm": 0.1494140625, "learning_rate": 0.0019976364980105214, "loss": 0.1855, "step": 3193 }, { "epoch": 0.027725453772102674, "grad_norm": 0.1064453125, "learning_rate": 0.0019976343419841397, "loss": 0.1699, "step": 3194 }, { "epoch": 0.02773413425230684, "grad_norm": 0.1689453125, "learning_rate": 0.0019976321849761187, "loss": 0.1875, "step": 3195 }, { "epoch": 0.027742814732511, "grad_norm": 0.15625, "learning_rate": 0.0019976300269864606, "loss": 0.2305, "step": 3196 }, { "epoch": 0.027751495212715166, "grad_norm": 0.1201171875, "learning_rate": 0.001997627868015167, "loss": 0.2168, "step": 3197 }, { "epoch": 0.02776017569291933, "grad_norm": 0.376953125, "learning_rate": 0.0019976257080622424, "loss": 0.1689, "step": 3198 }, { "epoch": 0.027768856173123496, "grad_norm": 0.2138671875, "learning_rate": 0.001997623547127687, "loss": 0.1768, "step": 3199 }, { "epoch": 0.02777753665332766, "grad_norm": 0.1318359375, "learning_rate": 0.001997621385211504, "loss": 0.207, "step": 3200 }, { "epoch": 0.027786217133531826, "grad_norm": 0.134765625, "learning_rate": 0.001997619222313696, "loss": 0.2363, "step": 3201 }, { "epoch": 0.02779489761373599, "grad_norm": 0.08251953125, "learning_rate": 0.001997617058434265, "loss": 0.1689, "step": 3202 }, { "epoch": 0.027803578093940157, "grad_norm": 0.08984375, "learning_rate": 0.0019976148935732137, "loss": 0.1895, "step": 3203 }, { "epoch": 0.027812258574144322, "grad_norm": 0.1533203125, "learning_rate": 0.001997612727730544, "loss": 0.1924, "step": 3204 }, { "epoch": 0.027820939054348487, "grad_norm": 0.142578125, "learning_rate": 0.0019976105609062584, "loss": 0.1875, "step": 3205 }, { "epoch": 0.027829619534552652, "grad_norm": 0.328125, "learning_rate": 0.0019976083931003597, "loss": 0.2236, "step": 3206 }, { "epoch": 0.027838300014756817, "grad_norm": 0.10107421875, "learning_rate": 0.0019976062243128494, "loss": 0.2637, "step": 3207 }, { "epoch": 0.027846980494960982, "grad_norm": 0.0986328125, "learning_rate": 0.0019976040545437307, "loss": 0.1719, "step": 3208 }, { "epoch": 0.027855660975165147, "grad_norm": 0.412109375, "learning_rate": 0.0019976018837930057, "loss": 0.1992, "step": 3209 }, { "epoch": 0.027864341455369313, "grad_norm": 0.28515625, "learning_rate": 0.001997599712060677, "loss": 0.248, "step": 3210 }, { "epoch": 0.027873021935573478, "grad_norm": 0.1748046875, "learning_rate": 0.0019975975393467463, "loss": 0.209, "step": 3211 }, { "epoch": 0.02788170241577764, "grad_norm": 0.0751953125, "learning_rate": 0.001997595365651217, "loss": 0.1797, "step": 3212 }, { "epoch": 0.027890382895981804, "grad_norm": 0.416015625, "learning_rate": 0.0019975931909740905, "loss": 0.2158, "step": 3213 }, { "epoch": 0.02789906337618597, "grad_norm": 0.1806640625, "learning_rate": 0.001997591015315369, "loss": 0.2148, "step": 3214 }, { "epoch": 0.027907743856390135, "grad_norm": 0.099609375, "learning_rate": 0.0019975888386750563, "loss": 0.207, "step": 3215 }, { "epoch": 0.0279164243365943, "grad_norm": 0.26171875, "learning_rate": 0.001997586661053154, "loss": 0.2217, "step": 3216 }, { "epoch": 0.027925104816798465, "grad_norm": 0.1494140625, "learning_rate": 0.001997584482449664, "loss": 0.2031, "step": 3217 }, { "epoch": 0.02793378529700263, "grad_norm": 0.70703125, "learning_rate": 0.0019975823028645892, "loss": 0.2041, "step": 3218 }, { "epoch": 0.027942465777206795, "grad_norm": 0.1357421875, "learning_rate": 0.001997580122297932, "loss": 0.167, "step": 3219 }, { "epoch": 0.02795114625741096, "grad_norm": 0.1640625, "learning_rate": 0.0019975779407496947, "loss": 0.2129, "step": 3220 }, { "epoch": 0.027959826737615125, "grad_norm": 0.10888671875, "learning_rate": 0.0019975757582198794, "loss": 0.1836, "step": 3221 }, { "epoch": 0.02796850721781929, "grad_norm": 0.25, "learning_rate": 0.0019975735747084886, "loss": 0.1992, "step": 3222 }, { "epoch": 0.027977187698023456, "grad_norm": 0.2001953125, "learning_rate": 0.0019975713902155253, "loss": 0.1768, "step": 3223 }, { "epoch": 0.02798586817822762, "grad_norm": 0.177734375, "learning_rate": 0.001997569204740991, "loss": 0.2471, "step": 3224 }, { "epoch": 0.027994548658431786, "grad_norm": 0.25390625, "learning_rate": 0.001997567018284889, "loss": 0.1914, "step": 3225 }, { "epoch": 0.02800322913863595, "grad_norm": 0.2216796875, "learning_rate": 0.0019975648308472207, "loss": 0.1719, "step": 3226 }, { "epoch": 0.028011909618840113, "grad_norm": 0.2412109375, "learning_rate": 0.0019975626424279893, "loss": 0.1963, "step": 3227 }, { "epoch": 0.028020590099044278, "grad_norm": 0.3515625, "learning_rate": 0.0019975604530271967, "loss": 0.2031, "step": 3228 }, { "epoch": 0.028029270579248443, "grad_norm": 0.2138671875, "learning_rate": 0.0019975582626448455, "loss": 0.1992, "step": 3229 }, { "epoch": 0.028037951059452608, "grad_norm": 0.26953125, "learning_rate": 0.001997556071280938, "loss": 0.2031, "step": 3230 }, { "epoch": 0.028046631539656773, "grad_norm": 0.0908203125, "learning_rate": 0.001997553878935477, "loss": 0.2227, "step": 3231 }, { "epoch": 0.028055312019860938, "grad_norm": 0.1474609375, "learning_rate": 0.0019975516856084643, "loss": 0.1826, "step": 3232 }, { "epoch": 0.028063992500065103, "grad_norm": 0.07958984375, "learning_rate": 0.001997549491299902, "loss": 0.1973, "step": 3233 }, { "epoch": 0.02807267298026927, "grad_norm": 0.1044921875, "learning_rate": 0.001997547296009794, "loss": 0.1758, "step": 3234 }, { "epoch": 0.028081353460473434, "grad_norm": 0.205078125, "learning_rate": 0.0019975450997381417, "loss": 0.2812, "step": 3235 }, { "epoch": 0.0280900339406776, "grad_norm": 0.1328125, "learning_rate": 0.001997542902484947, "loss": 0.1875, "step": 3236 }, { "epoch": 0.028098714420881764, "grad_norm": 0.1298828125, "learning_rate": 0.001997540704250213, "loss": 0.165, "step": 3237 }, { "epoch": 0.02810739490108593, "grad_norm": 0.1201171875, "learning_rate": 0.0019975385050339423, "loss": 0.2041, "step": 3238 }, { "epoch": 0.028116075381290094, "grad_norm": 0.396484375, "learning_rate": 0.0019975363048361366, "loss": 0.2168, "step": 3239 }, { "epoch": 0.02812475586149426, "grad_norm": 0.140625, "learning_rate": 0.0019975341036567993, "loss": 0.1777, "step": 3240 }, { "epoch": 0.028133436341698424, "grad_norm": 0.1943359375, "learning_rate": 0.0019975319014959316, "loss": 0.25, "step": 3241 }, { "epoch": 0.028142116821902586, "grad_norm": 0.6953125, "learning_rate": 0.0019975296983535365, "loss": 0.2344, "step": 3242 }, { "epoch": 0.02815079730210675, "grad_norm": 0.21875, "learning_rate": 0.0019975274942296167, "loss": 0.1836, "step": 3243 }, { "epoch": 0.028159477782310916, "grad_norm": 0.08544921875, "learning_rate": 0.0019975252891241742, "loss": 0.1631, "step": 3244 }, { "epoch": 0.02816815826251508, "grad_norm": 0.10009765625, "learning_rate": 0.0019975230830372114, "loss": 0.1992, "step": 3245 }, { "epoch": 0.028176838742719246, "grad_norm": 0.259765625, "learning_rate": 0.001997520875968731, "loss": 0.1719, "step": 3246 }, { "epoch": 0.02818551922292341, "grad_norm": 0.169921875, "learning_rate": 0.0019975186679187353, "loss": 0.1641, "step": 3247 }, { "epoch": 0.028194199703127577, "grad_norm": 0.267578125, "learning_rate": 0.0019975164588872264, "loss": 0.2168, "step": 3248 }, { "epoch": 0.028202880183331742, "grad_norm": 0.09765625, "learning_rate": 0.0019975142488742074, "loss": 0.209, "step": 3249 }, { "epoch": 0.028211560663535907, "grad_norm": 0.16796875, "learning_rate": 0.0019975120378796798, "loss": 0.2168, "step": 3250 }, { "epoch": 0.028220241143740072, "grad_norm": 0.5234375, "learning_rate": 0.001997509825903647, "loss": 0.1943, "step": 3251 }, { "epoch": 0.028228921623944237, "grad_norm": 0.138671875, "learning_rate": 0.001997507612946111, "loss": 0.1807, "step": 3252 }, { "epoch": 0.028237602104148402, "grad_norm": 0.2060546875, "learning_rate": 0.0019975053990070736, "loss": 0.166, "step": 3253 }, { "epoch": 0.028246282584352567, "grad_norm": 0.10888671875, "learning_rate": 0.001997503184086538, "loss": 0.1924, "step": 3254 }, { "epoch": 0.028254963064556732, "grad_norm": 0.6640625, "learning_rate": 0.0019975009681845067, "loss": 0.2021, "step": 3255 }, { "epoch": 0.028263643544760898, "grad_norm": 0.5546875, "learning_rate": 0.0019974987513009814, "loss": 0.1738, "step": 3256 }, { "epoch": 0.028272324024965063, "grad_norm": 0.287109375, "learning_rate": 0.001997496533435965, "loss": 0.207, "step": 3257 }, { "epoch": 0.028281004505169224, "grad_norm": 0.11376953125, "learning_rate": 0.00199749431458946, "loss": 0.1924, "step": 3258 }, { "epoch": 0.02828968498537339, "grad_norm": 0.4609375, "learning_rate": 0.001997492094761469, "loss": 0.1689, "step": 3259 }, { "epoch": 0.028298365465577555, "grad_norm": 0.11328125, "learning_rate": 0.001997489873951994, "loss": 0.1875, "step": 3260 }, { "epoch": 0.02830704594578172, "grad_norm": 0.08935546875, "learning_rate": 0.001997487652161037, "loss": 0.1953, "step": 3261 }, { "epoch": 0.028315726425985885, "grad_norm": 0.279296875, "learning_rate": 0.0019974854293886017, "loss": 0.2129, "step": 3262 }, { "epoch": 0.02832440690619005, "grad_norm": 0.1357421875, "learning_rate": 0.00199748320563469, "loss": 0.2539, "step": 3263 }, { "epoch": 0.028333087386394215, "grad_norm": 0.134765625, "learning_rate": 0.0019974809808993035, "loss": 0.1582, "step": 3264 }, { "epoch": 0.02834176786659838, "grad_norm": 0.255859375, "learning_rate": 0.0019974787551824452, "loss": 0.1719, "step": 3265 }, { "epoch": 0.028350448346802545, "grad_norm": 1.1171875, "learning_rate": 0.0019974765284841178, "loss": 0.7461, "step": 3266 }, { "epoch": 0.02835912882700671, "grad_norm": 0.439453125, "learning_rate": 0.0019974743008043237, "loss": 0.2002, "step": 3267 }, { "epoch": 0.028367809307210876, "grad_norm": 0.78125, "learning_rate": 0.001997472072143065, "loss": 0.2227, "step": 3268 }, { "epoch": 0.02837648978741504, "grad_norm": 0.341796875, "learning_rate": 0.0019974698425003446, "loss": 0.2246, "step": 3269 }, { "epoch": 0.028385170267619206, "grad_norm": 0.259765625, "learning_rate": 0.0019974676118761645, "loss": 0.1758, "step": 3270 }, { "epoch": 0.02839385074782337, "grad_norm": 0.259765625, "learning_rate": 0.0019974653802705272, "loss": 0.1641, "step": 3271 }, { "epoch": 0.028402531228027536, "grad_norm": 0.2294921875, "learning_rate": 0.0019974631476834355, "loss": 0.166, "step": 3272 }, { "epoch": 0.028411211708231698, "grad_norm": 0.2255859375, "learning_rate": 0.0019974609141148914, "loss": 0.1543, "step": 3273 }, { "epoch": 0.028419892188435863, "grad_norm": 0.0771484375, "learning_rate": 0.0019974586795648975, "loss": 0.1553, "step": 3274 }, { "epoch": 0.028428572668640028, "grad_norm": 0.267578125, "learning_rate": 0.0019974564440334566, "loss": 0.1904, "step": 3275 }, { "epoch": 0.028437253148844193, "grad_norm": 0.1552734375, "learning_rate": 0.0019974542075205702, "loss": 0.1621, "step": 3276 }, { "epoch": 0.028445933629048358, "grad_norm": 0.0859375, "learning_rate": 0.001997451970026242, "loss": 0.1553, "step": 3277 }, { "epoch": 0.028454614109252523, "grad_norm": 0.15625, "learning_rate": 0.0019974497315504735, "loss": 0.1973, "step": 3278 }, { "epoch": 0.02846329458945669, "grad_norm": 0.07763671875, "learning_rate": 0.0019974474920932675, "loss": 0.1729, "step": 3279 }, { "epoch": 0.028471975069660854, "grad_norm": 0.0947265625, "learning_rate": 0.0019974452516546264, "loss": 0.2031, "step": 3280 }, { "epoch": 0.02848065554986502, "grad_norm": 0.126953125, "learning_rate": 0.0019974430102345526, "loss": 0.2656, "step": 3281 }, { "epoch": 0.028489336030069184, "grad_norm": 0.2314453125, "learning_rate": 0.0019974407678330485, "loss": 0.1855, "step": 3282 }, { "epoch": 0.02849801651027335, "grad_norm": 0.140625, "learning_rate": 0.001997438524450117, "loss": 0.2324, "step": 3283 }, { "epoch": 0.028506696990477514, "grad_norm": 0.09619140625, "learning_rate": 0.00199743628008576, "loss": 0.2031, "step": 3284 }, { "epoch": 0.02851537747068168, "grad_norm": 0.1337890625, "learning_rate": 0.00199743403473998, "loss": 0.1875, "step": 3285 }, { "epoch": 0.028524057950885844, "grad_norm": 0.30078125, "learning_rate": 0.00199743178841278, "loss": 0.2246, "step": 3286 }, { "epoch": 0.02853273843109001, "grad_norm": 0.09423828125, "learning_rate": 0.0019974295411041617, "loss": 0.1992, "step": 3287 }, { "epoch": 0.028541418911294174, "grad_norm": 0.193359375, "learning_rate": 0.001997427292814129, "loss": 0.1855, "step": 3288 }, { "epoch": 0.028550099391498336, "grad_norm": 0.10302734375, "learning_rate": 0.0019974250435426818, "loss": 0.1855, "step": 3289 }, { "epoch": 0.0285587798717025, "grad_norm": 0.2451171875, "learning_rate": 0.001997422793289825, "loss": 0.1562, "step": 3290 }, { "epoch": 0.028567460351906666, "grad_norm": 0.470703125, "learning_rate": 0.0019974205420555595, "loss": 0.1777, "step": 3291 }, { "epoch": 0.02857614083211083, "grad_norm": 0.298828125, "learning_rate": 0.0019974182898398886, "loss": 0.1777, "step": 3292 }, { "epoch": 0.028584821312314997, "grad_norm": 0.18359375, "learning_rate": 0.0019974160366428148, "loss": 0.2129, "step": 3293 }, { "epoch": 0.028593501792519162, "grad_norm": 0.1552734375, "learning_rate": 0.0019974137824643402, "loss": 0.2051, "step": 3294 }, { "epoch": 0.028602182272723327, "grad_norm": 0.158203125, "learning_rate": 0.001997411527304467, "loss": 0.1406, "step": 3295 }, { "epoch": 0.028610862752927492, "grad_norm": 0.2099609375, "learning_rate": 0.0019974092711631986, "loss": 0.209, "step": 3296 }, { "epoch": 0.028619543233131657, "grad_norm": 0.22265625, "learning_rate": 0.0019974070140405366, "loss": 0.207, "step": 3297 }, { "epoch": 0.028628223713335822, "grad_norm": 0.08447265625, "learning_rate": 0.001997404755936484, "loss": 0.1797, "step": 3298 }, { "epoch": 0.028636904193539987, "grad_norm": 0.13671875, "learning_rate": 0.001997402496851043, "loss": 0.1992, "step": 3299 }, { "epoch": 0.028645584673744152, "grad_norm": 0.1904296875, "learning_rate": 0.001997400236784216, "loss": 0.1895, "step": 3300 }, { "epoch": 0.028654265153948318, "grad_norm": 0.12109375, "learning_rate": 0.0019973979757360056, "loss": 0.1953, "step": 3301 }, { "epoch": 0.028662945634152483, "grad_norm": 0.56640625, "learning_rate": 0.0019973957137064137, "loss": 0.1992, "step": 3302 }, { "epoch": 0.028671626114356648, "grad_norm": 0.10791015625, "learning_rate": 0.001997393450695444, "loss": 0.2539, "step": 3303 }, { "epoch": 0.02868030659456081, "grad_norm": 0.1015625, "learning_rate": 0.001997391186703098, "loss": 0.2256, "step": 3304 }, { "epoch": 0.028688987074764975, "grad_norm": 0.11083984375, "learning_rate": 0.001997388921729379, "loss": 0.1426, "step": 3305 }, { "epoch": 0.02869766755496914, "grad_norm": 0.11474609375, "learning_rate": 0.0019973866557742885, "loss": 0.165, "step": 3306 }, { "epoch": 0.028706348035173305, "grad_norm": 0.1025390625, "learning_rate": 0.0019973843888378296, "loss": 0.168, "step": 3307 }, { "epoch": 0.02871502851537747, "grad_norm": 0.0703125, "learning_rate": 0.0019973821209200043, "loss": 0.1992, "step": 3308 }, { "epoch": 0.028723708995581635, "grad_norm": 0.359375, "learning_rate": 0.001997379852020816, "loss": 0.1328, "step": 3309 }, { "epoch": 0.0287323894757858, "grad_norm": 0.08642578125, "learning_rate": 0.001997377582140266, "loss": 0.21, "step": 3310 }, { "epoch": 0.028741069955989965, "grad_norm": 0.10791015625, "learning_rate": 0.0019973753112783577, "loss": 0.1689, "step": 3311 }, { "epoch": 0.02874975043619413, "grad_norm": 0.31640625, "learning_rate": 0.0019973730394350934, "loss": 0.2129, "step": 3312 }, { "epoch": 0.028758430916398296, "grad_norm": 0.66796875, "learning_rate": 0.001997370766610475, "loss": 0.4082, "step": 3313 }, { "epoch": 0.02876711139660246, "grad_norm": 0.08642578125, "learning_rate": 0.0019973684928045053, "loss": 0.1865, "step": 3314 }, { "epoch": 0.028775791876806626, "grad_norm": 0.296875, "learning_rate": 0.0019973662180171876, "loss": 0.166, "step": 3315 }, { "epoch": 0.02878447235701079, "grad_norm": 0.232421875, "learning_rate": 0.001997363942248523, "loss": 0.2363, "step": 3316 }, { "epoch": 0.028793152837214956, "grad_norm": 0.16796875, "learning_rate": 0.001997361665498515, "loss": 0.1729, "step": 3317 }, { "epoch": 0.02880183331741912, "grad_norm": 0.26171875, "learning_rate": 0.0019973593877671654, "loss": 0.2148, "step": 3318 }, { "epoch": 0.028810513797623283, "grad_norm": 0.490234375, "learning_rate": 0.0019973571090544776, "loss": 0.2129, "step": 3319 }, { "epoch": 0.028819194277827448, "grad_norm": 0.1298828125, "learning_rate": 0.0019973548293604534, "loss": 0.1973, "step": 3320 }, { "epoch": 0.028827874758031613, "grad_norm": 0.52734375, "learning_rate": 0.001997352548685095, "loss": 0.2168, "step": 3321 }, { "epoch": 0.028836555238235778, "grad_norm": 0.140625, "learning_rate": 0.0019973502670284056, "loss": 0.2021, "step": 3322 }, { "epoch": 0.028845235718439943, "grad_norm": 0.265625, "learning_rate": 0.001997347984390388, "loss": 0.2461, "step": 3323 }, { "epoch": 0.02885391619864411, "grad_norm": 0.57421875, "learning_rate": 0.0019973457007710434, "loss": 0.2314, "step": 3324 }, { "epoch": 0.028862596678848274, "grad_norm": 0.228515625, "learning_rate": 0.001997343416170375, "loss": 0.2305, "step": 3325 }, { "epoch": 0.02887127715905244, "grad_norm": 0.07421875, "learning_rate": 0.001997341130588386, "loss": 0.2012, "step": 3326 }, { "epoch": 0.028879957639256604, "grad_norm": 0.1279296875, "learning_rate": 0.0019973388440250777, "loss": 0.25, "step": 3327 }, { "epoch": 0.02888863811946077, "grad_norm": 0.080078125, "learning_rate": 0.001997336556480453, "loss": 0.1807, "step": 3328 }, { "epoch": 0.028897318599664934, "grad_norm": 0.087890625, "learning_rate": 0.001997334267954515, "loss": 0.2002, "step": 3329 }, { "epoch": 0.0289059990798691, "grad_norm": 0.392578125, "learning_rate": 0.0019973319784472652, "loss": 0.2441, "step": 3330 }, { "epoch": 0.028914679560073264, "grad_norm": 0.224609375, "learning_rate": 0.001997329687958707, "loss": 0.2617, "step": 3331 }, { "epoch": 0.02892336004027743, "grad_norm": 0.62109375, "learning_rate": 0.0019973273964888428, "loss": 0.2754, "step": 3332 }, { "epoch": 0.028932040520481594, "grad_norm": 0.2109375, "learning_rate": 0.001997325104037674, "loss": 0.208, "step": 3333 }, { "epoch": 0.02894072100068576, "grad_norm": 0.107421875, "learning_rate": 0.0019973228106052046, "loss": 0.1709, "step": 3334 }, { "epoch": 0.02894940148088992, "grad_norm": 0.1640625, "learning_rate": 0.0019973205161914363, "loss": 0.1992, "step": 3335 }, { "epoch": 0.028958081961094086, "grad_norm": 0.0810546875, "learning_rate": 0.0019973182207963717, "loss": 0.1836, "step": 3336 }, { "epoch": 0.02896676244129825, "grad_norm": 0.0947265625, "learning_rate": 0.0019973159244200136, "loss": 0.1836, "step": 3337 }, { "epoch": 0.028975442921502417, "grad_norm": 0.46484375, "learning_rate": 0.001997313627062364, "loss": 0.1807, "step": 3338 }, { "epoch": 0.02898412340170658, "grad_norm": 0.07666015625, "learning_rate": 0.001997311328723426, "loss": 0.1924, "step": 3339 }, { "epoch": 0.028992803881910747, "grad_norm": 0.1064453125, "learning_rate": 0.0019973090294032013, "loss": 0.2119, "step": 3340 }, { "epoch": 0.029001484362114912, "grad_norm": 0.236328125, "learning_rate": 0.0019973067291016934, "loss": 0.1768, "step": 3341 }, { "epoch": 0.029010164842319077, "grad_norm": 0.58984375, "learning_rate": 0.0019973044278189045, "loss": 0.2188, "step": 3342 }, { "epoch": 0.029018845322523242, "grad_norm": 0.12060546875, "learning_rate": 0.0019973021255548363, "loss": 0.2344, "step": 3343 }, { "epoch": 0.029027525802727407, "grad_norm": 0.244140625, "learning_rate": 0.0019972998223094923, "loss": 0.2207, "step": 3344 }, { "epoch": 0.029036206282931572, "grad_norm": 0.12353515625, "learning_rate": 0.001997297518082875, "loss": 0.1855, "step": 3345 }, { "epoch": 0.029044886763135738, "grad_norm": 0.166015625, "learning_rate": 0.0019972952128749864, "loss": 0.1406, "step": 3346 }, { "epoch": 0.029053567243339903, "grad_norm": 0.41796875, "learning_rate": 0.001997292906685829, "loss": 0.3105, "step": 3347 }, { "epoch": 0.029062247723544068, "grad_norm": 0.11669921875, "learning_rate": 0.0019972905995154057, "loss": 0.207, "step": 3348 }, { "epoch": 0.029070928203748233, "grad_norm": 0.2392578125, "learning_rate": 0.001997288291363719, "loss": 0.1797, "step": 3349 }, { "epoch": 0.029079608683952395, "grad_norm": 0.30859375, "learning_rate": 0.0019972859822307712, "loss": 0.2246, "step": 3350 }, { "epoch": 0.02908828916415656, "grad_norm": 0.263671875, "learning_rate": 0.001997283672116565, "loss": 0.2344, "step": 3351 }, { "epoch": 0.029096969644360725, "grad_norm": 0.50390625, "learning_rate": 0.001997281361021103, "loss": 0.2266, "step": 3352 }, { "epoch": 0.02910565012456489, "grad_norm": 0.2314453125, "learning_rate": 0.001997279048944387, "loss": 0.2021, "step": 3353 }, { "epoch": 0.029114330604769055, "grad_norm": 0.197265625, "learning_rate": 0.001997276735886421, "loss": 0.1895, "step": 3354 }, { "epoch": 0.02912301108497322, "grad_norm": 0.1748046875, "learning_rate": 0.001997274421847206, "loss": 0.1729, "step": 3355 }, { "epoch": 0.029131691565177385, "grad_norm": 0.1142578125, "learning_rate": 0.0019972721068267454, "loss": 0.207, "step": 3356 }, { "epoch": 0.02914037204538155, "grad_norm": 0.0927734375, "learning_rate": 0.0019972697908250416, "loss": 0.1797, "step": 3357 }, { "epoch": 0.029149052525585716, "grad_norm": 0.103515625, "learning_rate": 0.0019972674738420967, "loss": 0.1758, "step": 3358 }, { "epoch": 0.02915773300578988, "grad_norm": 0.53125, "learning_rate": 0.0019972651558779137, "loss": 0.2266, "step": 3359 }, { "epoch": 0.029166413485994046, "grad_norm": 0.31640625, "learning_rate": 0.001997262836932495, "loss": 0.2363, "step": 3360 }, { "epoch": 0.02917509396619821, "grad_norm": 0.076171875, "learning_rate": 0.0019972605170058434, "loss": 0.1934, "step": 3361 }, { "epoch": 0.029183774446402376, "grad_norm": 0.216796875, "learning_rate": 0.001997258196097961, "loss": 0.2734, "step": 3362 }, { "epoch": 0.02919245492660654, "grad_norm": 0.14453125, "learning_rate": 0.00199725587420885, "loss": 0.2051, "step": 3363 }, { "epoch": 0.029201135406810706, "grad_norm": 0.3359375, "learning_rate": 0.001997253551338514, "loss": 0.1816, "step": 3364 }, { "epoch": 0.02920981588701487, "grad_norm": 0.12451171875, "learning_rate": 0.0019972512274869553, "loss": 0.2051, "step": 3365 }, { "epoch": 0.029218496367219033, "grad_norm": 0.10009765625, "learning_rate": 0.001997248902654176, "loss": 0.2021, "step": 3366 }, { "epoch": 0.029227176847423198, "grad_norm": 0.494140625, "learning_rate": 0.0019972465768401783, "loss": 0.1865, "step": 3367 }, { "epoch": 0.029235857327627363, "grad_norm": 2.046875, "learning_rate": 0.0019972442500449657, "loss": 0.4219, "step": 3368 }, { "epoch": 0.02924453780783153, "grad_norm": 0.314453125, "learning_rate": 0.00199724192226854, "loss": 0.2539, "step": 3369 }, { "epoch": 0.029253218288035693, "grad_norm": 0.0859375, "learning_rate": 0.001997239593510904, "loss": 0.1973, "step": 3370 }, { "epoch": 0.02926189876823986, "grad_norm": 0.08740234375, "learning_rate": 0.0019972372637720604, "loss": 0.1738, "step": 3371 }, { "epoch": 0.029270579248444024, "grad_norm": 0.13671875, "learning_rate": 0.0019972349330520116, "loss": 0.2578, "step": 3372 }, { "epoch": 0.02927925972864819, "grad_norm": 0.1318359375, "learning_rate": 0.00199723260135076, "loss": 0.2188, "step": 3373 }, { "epoch": 0.029287940208852354, "grad_norm": 0.134765625, "learning_rate": 0.0019972302686683085, "loss": 0.2295, "step": 3374 }, { "epoch": 0.02929662068905652, "grad_norm": 0.0771484375, "learning_rate": 0.0019972279350046595, "loss": 0.167, "step": 3375 }, { "epoch": 0.029305301169260684, "grad_norm": 0.08935546875, "learning_rate": 0.0019972256003598153, "loss": 0.1836, "step": 3376 }, { "epoch": 0.02931398164946485, "grad_norm": 0.44140625, "learning_rate": 0.0019972232647337785, "loss": 0.2246, "step": 3377 }, { "epoch": 0.029322662129669014, "grad_norm": 0.2294921875, "learning_rate": 0.0019972209281265522, "loss": 0.1699, "step": 3378 }, { "epoch": 0.02933134260987318, "grad_norm": 0.353515625, "learning_rate": 0.0019972185905381386, "loss": 0.1992, "step": 3379 }, { "epoch": 0.029340023090077345, "grad_norm": 0.310546875, "learning_rate": 0.00199721625196854, "loss": 0.2041, "step": 3380 }, { "epoch": 0.029348703570281506, "grad_norm": 0.0654296875, "learning_rate": 0.0019972139124177593, "loss": 0.168, "step": 3381 }, { "epoch": 0.02935738405048567, "grad_norm": 0.455078125, "learning_rate": 0.0019972115718857987, "loss": 0.2197, "step": 3382 }, { "epoch": 0.029366064530689837, "grad_norm": 0.1220703125, "learning_rate": 0.0019972092303726613, "loss": 0.208, "step": 3383 }, { "epoch": 0.029374745010894, "grad_norm": 0.09130859375, "learning_rate": 0.0019972068878783495, "loss": 0.1689, "step": 3384 }, { "epoch": 0.029383425491098167, "grad_norm": 0.09716796875, "learning_rate": 0.001997204544402865, "loss": 0.165, "step": 3385 }, { "epoch": 0.029392105971302332, "grad_norm": 0.11767578125, "learning_rate": 0.001997202199946212, "loss": 0.2012, "step": 3386 }, { "epoch": 0.029400786451506497, "grad_norm": 0.1337890625, "learning_rate": 0.001997199854508392, "loss": 0.1953, "step": 3387 }, { "epoch": 0.029409466931710662, "grad_norm": 0.083984375, "learning_rate": 0.001997197508089407, "loss": 0.1641, "step": 3388 }, { "epoch": 0.029418147411914827, "grad_norm": 0.1103515625, "learning_rate": 0.0019971951606892607, "loss": 0.2031, "step": 3389 }, { "epoch": 0.029426827892118992, "grad_norm": 0.11865234375, "learning_rate": 0.0019971928123079558, "loss": 0.1689, "step": 3390 }, { "epoch": 0.029435508372323158, "grad_norm": 0.1435546875, "learning_rate": 0.0019971904629454934, "loss": 0.1855, "step": 3391 }, { "epoch": 0.029444188852527323, "grad_norm": 0.1123046875, "learning_rate": 0.0019971881126018775, "loss": 0.1504, "step": 3392 }, { "epoch": 0.029452869332731488, "grad_norm": 0.48828125, "learning_rate": 0.00199718576127711, "loss": 0.1611, "step": 3393 }, { "epoch": 0.029461549812935653, "grad_norm": 0.353515625, "learning_rate": 0.001997183408971194, "loss": 0.2012, "step": 3394 }, { "epoch": 0.029470230293139818, "grad_norm": 0.12451171875, "learning_rate": 0.001997181055684131, "loss": 0.1934, "step": 3395 }, { "epoch": 0.02947891077334398, "grad_norm": 0.2578125, "learning_rate": 0.001997178701415925, "loss": 0.2188, "step": 3396 }, { "epoch": 0.029487591253548145, "grad_norm": 0.19140625, "learning_rate": 0.001997176346166578, "loss": 0.1992, "step": 3397 }, { "epoch": 0.02949627173375231, "grad_norm": 0.333984375, "learning_rate": 0.0019971739899360915, "loss": 0.1592, "step": 3398 }, { "epoch": 0.029504952213956475, "grad_norm": 0.279296875, "learning_rate": 0.0019971716327244694, "loss": 0.168, "step": 3399 }, { "epoch": 0.02951363269416064, "grad_norm": 0.30859375, "learning_rate": 0.0019971692745317142, "loss": 0.2266, "step": 3400 }, { "epoch": 0.029522313174364805, "grad_norm": 0.09814453125, "learning_rate": 0.0019971669153578276, "loss": 0.1807, "step": 3401 }, { "epoch": 0.02953099365456897, "grad_norm": 0.134765625, "learning_rate": 0.0019971645552028135, "loss": 0.1768, "step": 3402 }, { "epoch": 0.029539674134773135, "grad_norm": 0.17578125, "learning_rate": 0.001997162194066673, "loss": 0.2227, "step": 3403 }, { "epoch": 0.0295483546149773, "grad_norm": 0.22265625, "learning_rate": 0.00199715983194941, "loss": 0.1982, "step": 3404 }, { "epoch": 0.029557035095181466, "grad_norm": 0.16796875, "learning_rate": 0.001997157468851026, "loss": 0.1699, "step": 3405 }, { "epoch": 0.02956571557538563, "grad_norm": 0.0869140625, "learning_rate": 0.0019971551047715244, "loss": 0.1895, "step": 3406 }, { "epoch": 0.029574396055589796, "grad_norm": 0.1767578125, "learning_rate": 0.001997152739710907, "loss": 0.2324, "step": 3407 }, { "epoch": 0.02958307653579396, "grad_norm": 0.10302734375, "learning_rate": 0.0019971503736691773, "loss": 0.2207, "step": 3408 }, { "epoch": 0.029591757015998126, "grad_norm": 0.259765625, "learning_rate": 0.0019971480066463374, "loss": 0.1768, "step": 3409 }, { "epoch": 0.02960043749620229, "grad_norm": 0.310546875, "learning_rate": 0.0019971456386423895, "loss": 0.2051, "step": 3410 }, { "epoch": 0.029609117976406456, "grad_norm": 0.2021484375, "learning_rate": 0.001997143269657337, "loss": 0.1719, "step": 3411 }, { "epoch": 0.029617798456610618, "grad_norm": 0.25, "learning_rate": 0.001997140899691182, "loss": 0.1973, "step": 3412 }, { "epoch": 0.029626478936814783, "grad_norm": 0.1376953125, "learning_rate": 0.0019971385287439274, "loss": 0.207, "step": 3413 }, { "epoch": 0.02963515941701895, "grad_norm": 0.1103515625, "learning_rate": 0.0019971361568155753, "loss": 0.1973, "step": 3414 }, { "epoch": 0.029643839897223113, "grad_norm": 0.4765625, "learning_rate": 0.0019971337839061287, "loss": 0.207, "step": 3415 }, { "epoch": 0.02965252037742728, "grad_norm": 0.400390625, "learning_rate": 0.00199713141001559, "loss": 0.2236, "step": 3416 }, { "epoch": 0.029661200857631444, "grad_norm": 0.20703125, "learning_rate": 0.001997129035143962, "loss": 0.1689, "step": 3417 }, { "epoch": 0.02966988133783561, "grad_norm": 0.35546875, "learning_rate": 0.001997126659291247, "loss": 0.1836, "step": 3418 }, { "epoch": 0.029678561818039774, "grad_norm": 0.080078125, "learning_rate": 0.001997124282457448, "loss": 0.1562, "step": 3419 }, { "epoch": 0.02968724229824394, "grad_norm": 0.1484375, "learning_rate": 0.001997121904642567, "loss": 0.1777, "step": 3420 }, { "epoch": 0.029695922778448104, "grad_norm": 0.15234375, "learning_rate": 0.0019971195258466075, "loss": 0.1611, "step": 3421 }, { "epoch": 0.02970460325865227, "grad_norm": 0.4609375, "learning_rate": 0.001997117146069571, "loss": 0.168, "step": 3422 }, { "epoch": 0.029713283738856434, "grad_norm": 0.10693359375, "learning_rate": 0.001997114765311461, "loss": 0.2031, "step": 3423 }, { "epoch": 0.0297219642190606, "grad_norm": 0.11572265625, "learning_rate": 0.0019971123835722795, "loss": 0.2275, "step": 3424 }, { "epoch": 0.029730644699264765, "grad_norm": 0.18359375, "learning_rate": 0.0019971100008520297, "loss": 0.1895, "step": 3425 }, { "epoch": 0.02973932517946893, "grad_norm": 0.306640625, "learning_rate": 0.0019971076171507135, "loss": 0.1895, "step": 3426 }, { "epoch": 0.02974800565967309, "grad_norm": 0.08642578125, "learning_rate": 0.0019971052324683344, "loss": 0.1777, "step": 3427 }, { "epoch": 0.029756686139877257, "grad_norm": 0.12109375, "learning_rate": 0.001997102846804894, "loss": 0.1748, "step": 3428 }, { "epoch": 0.02976536662008142, "grad_norm": 0.41015625, "learning_rate": 0.001997100460160396, "loss": 0.1562, "step": 3429 }, { "epoch": 0.029774047100285587, "grad_norm": 0.140625, "learning_rate": 0.001997098072534842, "loss": 0.207, "step": 3430 }, { "epoch": 0.029782727580489752, "grad_norm": 0.33203125, "learning_rate": 0.0019970956839282347, "loss": 0.1699, "step": 3431 }, { "epoch": 0.029791408060693917, "grad_norm": 0.09228515625, "learning_rate": 0.0019970932943405777, "loss": 0.124, "step": 3432 }, { "epoch": 0.029800088540898082, "grad_norm": 0.1181640625, "learning_rate": 0.0019970909037718724, "loss": 0.1934, "step": 3433 }, { "epoch": 0.029808769021102247, "grad_norm": 0.15234375, "learning_rate": 0.0019970885122221225, "loss": 0.1826, "step": 3434 }, { "epoch": 0.029817449501306412, "grad_norm": 0.07275390625, "learning_rate": 0.0019970861196913297, "loss": 0.1494, "step": 3435 }, { "epoch": 0.029826129981510578, "grad_norm": 0.1015625, "learning_rate": 0.001997083726179497, "loss": 0.168, "step": 3436 }, { "epoch": 0.029834810461714743, "grad_norm": 0.1298828125, "learning_rate": 0.001997081331686627, "loss": 0.1631, "step": 3437 }, { "epoch": 0.029843490941918908, "grad_norm": 0.125, "learning_rate": 0.0019970789362127226, "loss": 0.1895, "step": 3438 }, { "epoch": 0.029852171422123073, "grad_norm": 0.396484375, "learning_rate": 0.001997076539757786, "loss": 0.1777, "step": 3439 }, { "epoch": 0.029860851902327238, "grad_norm": 0.19921875, "learning_rate": 0.00199707414232182, "loss": 0.2119, "step": 3440 }, { "epoch": 0.029869532382531403, "grad_norm": 0.3984375, "learning_rate": 0.001997071743904827, "loss": 0.2021, "step": 3441 }, { "epoch": 0.029878212862735568, "grad_norm": 0.2734375, "learning_rate": 0.0019970693445068104, "loss": 0.1816, "step": 3442 }, { "epoch": 0.02988689334293973, "grad_norm": 0.11083984375, "learning_rate": 0.0019970669441277717, "loss": 0.2217, "step": 3443 }, { "epoch": 0.029895573823143895, "grad_norm": 0.14453125, "learning_rate": 0.0019970645427677142, "loss": 0.1885, "step": 3444 }, { "epoch": 0.02990425430334806, "grad_norm": 0.173828125, "learning_rate": 0.0019970621404266403, "loss": 0.2178, "step": 3445 }, { "epoch": 0.029912934783552225, "grad_norm": 0.1484375, "learning_rate": 0.001997059737104553, "loss": 0.1826, "step": 3446 }, { "epoch": 0.02992161526375639, "grad_norm": 0.2021484375, "learning_rate": 0.0019970573328014544, "loss": 0.1709, "step": 3447 }, { "epoch": 0.029930295743960555, "grad_norm": 0.1923828125, "learning_rate": 0.0019970549275173475, "loss": 0.209, "step": 3448 }, { "epoch": 0.02993897622416472, "grad_norm": 0.08447265625, "learning_rate": 0.0019970525212522345, "loss": 0.1787, "step": 3449 }, { "epoch": 0.029947656704368886, "grad_norm": 0.0830078125, "learning_rate": 0.001997050114006119, "loss": 0.1699, "step": 3450 }, { "epoch": 0.02995633718457305, "grad_norm": 0.6796875, "learning_rate": 0.0019970477057790026, "loss": 0.2461, "step": 3451 }, { "epoch": 0.029965017664777216, "grad_norm": 0.083984375, "learning_rate": 0.001997045296570888, "loss": 0.1787, "step": 3452 }, { "epoch": 0.02997369814498138, "grad_norm": 0.1015625, "learning_rate": 0.0019970428863817784, "loss": 0.1836, "step": 3453 }, { "epoch": 0.029982378625185546, "grad_norm": 0.279296875, "learning_rate": 0.0019970404752116763, "loss": 0.1855, "step": 3454 }, { "epoch": 0.02999105910538971, "grad_norm": 0.12060546875, "learning_rate": 0.001997038063060584, "loss": 0.2334, "step": 3455 }, { "epoch": 0.029999739585593876, "grad_norm": 0.73828125, "learning_rate": 0.0019970356499285045, "loss": 0.2207, "step": 3456 }, { "epoch": 0.03000842006579804, "grad_norm": 0.09716796875, "learning_rate": 0.0019970332358154406, "loss": 0.1885, "step": 3457 }, { "epoch": 0.030017100546002203, "grad_norm": 0.07958984375, "learning_rate": 0.001997030820721394, "loss": 0.168, "step": 3458 }, { "epoch": 0.03002578102620637, "grad_norm": 0.173828125, "learning_rate": 0.001997028404646368, "loss": 0.1777, "step": 3459 }, { "epoch": 0.030034461506410533, "grad_norm": 0.1435546875, "learning_rate": 0.0019970259875903658, "loss": 0.2402, "step": 3460 }, { "epoch": 0.0300431419866147, "grad_norm": 0.12890625, "learning_rate": 0.001997023569553389, "loss": 0.2598, "step": 3461 }, { "epoch": 0.030051822466818864, "grad_norm": 0.0830078125, "learning_rate": 0.001997021150535441, "loss": 0.1855, "step": 3462 }, { "epoch": 0.03006050294702303, "grad_norm": 0.3984375, "learning_rate": 0.0019970187305365238, "loss": 0.1934, "step": 3463 }, { "epoch": 0.030069183427227194, "grad_norm": 0.349609375, "learning_rate": 0.0019970163095566406, "loss": 0.25, "step": 3464 }, { "epoch": 0.03007786390743136, "grad_norm": 0.205078125, "learning_rate": 0.001997013887595794, "loss": 0.1855, "step": 3465 }, { "epoch": 0.030086544387635524, "grad_norm": 0.0947265625, "learning_rate": 0.0019970114646539862, "loss": 0.2324, "step": 3466 }, { "epoch": 0.03009522486783969, "grad_norm": 0.384765625, "learning_rate": 0.0019970090407312206, "loss": 0.1943, "step": 3467 }, { "epoch": 0.030103905348043854, "grad_norm": 0.380859375, "learning_rate": 0.0019970066158274988, "loss": 0.1699, "step": 3468 }, { "epoch": 0.03011258582824802, "grad_norm": 0.1669921875, "learning_rate": 0.001997004189942824, "loss": 0.2207, "step": 3469 }, { "epoch": 0.030121266308452185, "grad_norm": 0.09716796875, "learning_rate": 0.0019970017630771995, "loss": 0.2422, "step": 3470 }, { "epoch": 0.03012994678865635, "grad_norm": 0.12890625, "learning_rate": 0.001996999335230627, "loss": 0.207, "step": 3471 }, { "epoch": 0.030138627268860515, "grad_norm": 0.71875, "learning_rate": 0.0019969969064031097, "loss": 0.1875, "step": 3472 }, { "epoch": 0.030147307749064677, "grad_norm": 0.24609375, "learning_rate": 0.00199699447659465, "loss": 0.1758, "step": 3473 }, { "epoch": 0.03015598822926884, "grad_norm": 0.388671875, "learning_rate": 0.0019969920458052506, "loss": 0.1807, "step": 3474 }, { "epoch": 0.030164668709473007, "grad_norm": 0.279296875, "learning_rate": 0.0019969896140349134, "loss": 0.2051, "step": 3475 }, { "epoch": 0.030173349189677172, "grad_norm": 0.212890625, "learning_rate": 0.001996987181283643, "loss": 0.1553, "step": 3476 }, { "epoch": 0.030182029669881337, "grad_norm": 0.07275390625, "learning_rate": 0.0019969847475514403, "loss": 0.166, "step": 3477 }, { "epoch": 0.030190710150085502, "grad_norm": 0.322265625, "learning_rate": 0.0019969823128383087, "loss": 0.1494, "step": 3478 }, { "epoch": 0.030199390630289667, "grad_norm": 0.31640625, "learning_rate": 0.0019969798771442508, "loss": 0.1631, "step": 3479 }, { "epoch": 0.030208071110493832, "grad_norm": 0.24609375, "learning_rate": 0.0019969774404692687, "loss": 0.1455, "step": 3480 }, { "epoch": 0.030216751590697997, "grad_norm": 0.2333984375, "learning_rate": 0.001996975002813366, "loss": 0.1963, "step": 3481 }, { "epoch": 0.030225432070902163, "grad_norm": 0.10400390625, "learning_rate": 0.001996972564176545, "loss": 0.1855, "step": 3482 }, { "epoch": 0.030234112551106328, "grad_norm": 0.171875, "learning_rate": 0.001996970124558808, "loss": 0.1631, "step": 3483 }, { "epoch": 0.030242793031310493, "grad_norm": 0.09619140625, "learning_rate": 0.001996967683960158, "loss": 0.2031, "step": 3484 }, { "epoch": 0.030251473511514658, "grad_norm": 0.21875, "learning_rate": 0.0019969652423805976, "loss": 0.1914, "step": 3485 }, { "epoch": 0.030260153991718823, "grad_norm": 0.330078125, "learning_rate": 0.00199696279982013, "loss": 0.2051, "step": 3486 }, { "epoch": 0.030268834471922988, "grad_norm": 0.107421875, "learning_rate": 0.001996960356278757, "loss": 0.1826, "step": 3487 }, { "epoch": 0.030277514952127153, "grad_norm": 0.08056640625, "learning_rate": 0.0019969579117564812, "loss": 0.1846, "step": 3488 }, { "epoch": 0.030286195432331315, "grad_norm": 0.4140625, "learning_rate": 0.001996955466253306, "loss": 0.2422, "step": 3489 }, { "epoch": 0.03029487591253548, "grad_norm": 0.1845703125, "learning_rate": 0.0019969530197692337, "loss": 0.1562, "step": 3490 }, { "epoch": 0.030303556392739645, "grad_norm": 0.185546875, "learning_rate": 0.0019969505723042677, "loss": 0.1689, "step": 3491 }, { "epoch": 0.03031223687294381, "grad_norm": 0.33984375, "learning_rate": 0.0019969481238584093, "loss": 0.1826, "step": 3492 }, { "epoch": 0.030320917353147975, "grad_norm": 0.0859375, "learning_rate": 0.001996945674431662, "loss": 0.1338, "step": 3493 }, { "epoch": 0.03032959783335214, "grad_norm": 0.353515625, "learning_rate": 0.0019969432240240286, "loss": 0.1885, "step": 3494 }, { "epoch": 0.030338278313556306, "grad_norm": 0.1044921875, "learning_rate": 0.0019969407726355116, "loss": 0.207, "step": 3495 }, { "epoch": 0.03034695879376047, "grad_norm": 0.26953125, "learning_rate": 0.0019969383202661137, "loss": 0.1826, "step": 3496 }, { "epoch": 0.030355639273964636, "grad_norm": 0.86328125, "learning_rate": 0.0019969358669158373, "loss": 0.2031, "step": 3497 }, { "epoch": 0.0303643197541688, "grad_norm": 0.80078125, "learning_rate": 0.0019969334125846854, "loss": 0.2344, "step": 3498 }, { "epoch": 0.030373000234372966, "grad_norm": 0.2890625, "learning_rate": 0.0019969309572726605, "loss": 0.1699, "step": 3499 }, { "epoch": 0.03038168071457713, "grad_norm": 0.08935546875, "learning_rate": 0.0019969285009797657, "loss": 0.1934, "step": 3500 }, { "epoch": 0.030390361194781296, "grad_norm": 0.1435546875, "learning_rate": 0.001996926043706003, "loss": 0.1855, "step": 3501 }, { "epoch": 0.03039904167498546, "grad_norm": 0.375, "learning_rate": 0.0019969235854513756, "loss": 0.1592, "step": 3502 }, { "epoch": 0.030407722155189627, "grad_norm": 0.1494140625, "learning_rate": 0.001996921126215886, "loss": 0.2051, "step": 3503 }, { "epoch": 0.03041640263539379, "grad_norm": 0.59765625, "learning_rate": 0.001996918665999537, "loss": 0.1855, "step": 3504 }, { "epoch": 0.030425083115597953, "grad_norm": 0.29296875, "learning_rate": 0.001996916204802331, "loss": 0.1992, "step": 3505 }, { "epoch": 0.03043376359580212, "grad_norm": 0.1884765625, "learning_rate": 0.001996913742624271, "loss": 0.1787, "step": 3506 }, { "epoch": 0.030442444076006284, "grad_norm": 0.0859375, "learning_rate": 0.00199691127946536, "loss": 0.1973, "step": 3507 }, { "epoch": 0.03045112455621045, "grad_norm": 0.419921875, "learning_rate": 0.0019969088153256, "loss": 0.1865, "step": 3508 }, { "epoch": 0.030459805036414614, "grad_norm": 0.337890625, "learning_rate": 0.001996906350204994, "loss": 0.1582, "step": 3509 }, { "epoch": 0.03046848551661878, "grad_norm": 0.375, "learning_rate": 0.0019969038841035447, "loss": 0.1602, "step": 3510 }, { "epoch": 0.030477165996822944, "grad_norm": 0.1806640625, "learning_rate": 0.001996901417021255, "loss": 0.1543, "step": 3511 }, { "epoch": 0.03048584647702711, "grad_norm": 0.1259765625, "learning_rate": 0.001996898948958127, "loss": 0.1992, "step": 3512 }, { "epoch": 0.030494526957231274, "grad_norm": 0.640625, "learning_rate": 0.0019968964799141637, "loss": 0.3164, "step": 3513 }, { "epoch": 0.03050320743743544, "grad_norm": 0.2109375, "learning_rate": 0.0019968940098893683, "loss": 0.1504, "step": 3514 }, { "epoch": 0.030511887917639605, "grad_norm": 0.09375, "learning_rate": 0.001996891538883743, "loss": 0.1455, "step": 3515 }, { "epoch": 0.03052056839784377, "grad_norm": 0.26171875, "learning_rate": 0.00199688906689729, "loss": 0.1846, "step": 3516 }, { "epoch": 0.030529248878047935, "grad_norm": 0.09228515625, "learning_rate": 0.0019968865939300135, "loss": 0.168, "step": 3517 }, { "epoch": 0.0305379293582521, "grad_norm": 0.86328125, "learning_rate": 0.0019968841199819146, "loss": 0.1885, "step": 3518 }, { "epoch": 0.030546609838456265, "grad_norm": 0.173828125, "learning_rate": 0.0019968816450529974, "loss": 0.1484, "step": 3519 }, { "epoch": 0.030555290318660427, "grad_norm": 0.2431640625, "learning_rate": 0.001996879169143263, "loss": 0.1621, "step": 3520 }, { "epoch": 0.030563970798864592, "grad_norm": 0.1005859375, "learning_rate": 0.001996876692252716, "loss": 0.1973, "step": 3521 }, { "epoch": 0.030572651279068757, "grad_norm": 0.45703125, "learning_rate": 0.0019968742143813578, "loss": 0.2539, "step": 3522 }, { "epoch": 0.030581331759272922, "grad_norm": 0.158203125, "learning_rate": 0.001996871735529191, "loss": 0.1641, "step": 3523 }, { "epoch": 0.030590012239477087, "grad_norm": 0.07275390625, "learning_rate": 0.001996869255696219, "loss": 0.1953, "step": 3524 }, { "epoch": 0.030598692719681252, "grad_norm": 0.125, "learning_rate": 0.0019968667748824446, "loss": 0.1484, "step": 3525 }, { "epoch": 0.030607373199885417, "grad_norm": 0.130859375, "learning_rate": 0.0019968642930878696, "loss": 0.1914, "step": 3526 }, { "epoch": 0.030616053680089583, "grad_norm": 0.3359375, "learning_rate": 0.0019968618103124976, "loss": 0.1719, "step": 3527 }, { "epoch": 0.030624734160293748, "grad_norm": 0.0927734375, "learning_rate": 0.001996859326556331, "loss": 0.1973, "step": 3528 }, { "epoch": 0.030633414640497913, "grad_norm": 0.06982421875, "learning_rate": 0.0019968568418193724, "loss": 0.1738, "step": 3529 }, { "epoch": 0.030642095120702078, "grad_norm": 0.30078125, "learning_rate": 0.001996854356101625, "loss": 0.2949, "step": 3530 }, { "epoch": 0.030650775600906243, "grad_norm": 0.84765625, "learning_rate": 0.0019968518694030908, "loss": 0.2598, "step": 3531 }, { "epoch": 0.030659456081110408, "grad_norm": 0.30859375, "learning_rate": 0.0019968493817237726, "loss": 0.1826, "step": 3532 }, { "epoch": 0.030668136561314573, "grad_norm": 0.27734375, "learning_rate": 0.001996846893063674, "loss": 0.1279, "step": 3533 }, { "epoch": 0.03067681704151874, "grad_norm": 0.474609375, "learning_rate": 0.0019968444034227967, "loss": 0.208, "step": 3534 }, { "epoch": 0.0306854975217229, "grad_norm": 0.208984375, "learning_rate": 0.0019968419128011438, "loss": 0.1816, "step": 3535 }, { "epoch": 0.030694178001927065, "grad_norm": 0.205078125, "learning_rate": 0.0019968394211987185, "loss": 0.1748, "step": 3536 }, { "epoch": 0.03070285848213123, "grad_norm": 0.63671875, "learning_rate": 0.0019968369286155227, "loss": 0.2031, "step": 3537 }, { "epoch": 0.030711538962335395, "grad_norm": 0.1630859375, "learning_rate": 0.0019968344350515593, "loss": 0.1787, "step": 3538 }, { "epoch": 0.03072021944253956, "grad_norm": 0.072265625, "learning_rate": 0.0019968319405068314, "loss": 0.1729, "step": 3539 }, { "epoch": 0.030728899922743726, "grad_norm": 0.1845703125, "learning_rate": 0.001996829444981342, "loss": 0.1553, "step": 3540 }, { "epoch": 0.03073758040294789, "grad_norm": 0.58203125, "learning_rate": 0.0019968269484750933, "loss": 0.2041, "step": 3541 }, { "epoch": 0.030746260883152056, "grad_norm": 0.1728515625, "learning_rate": 0.001996824450988088, "loss": 0.1426, "step": 3542 }, { "epoch": 0.03075494136335622, "grad_norm": 0.283203125, "learning_rate": 0.0019968219525203284, "loss": 0.2051, "step": 3543 }, { "epoch": 0.030763621843560386, "grad_norm": 0.158203125, "learning_rate": 0.0019968194530718183, "loss": 0.248, "step": 3544 }, { "epoch": 0.03077230232376455, "grad_norm": 0.10009765625, "learning_rate": 0.00199681695264256, "loss": 0.1748, "step": 3545 }, { "epoch": 0.030780982803968716, "grad_norm": 0.353515625, "learning_rate": 0.001996814451232556, "loss": 0.1689, "step": 3546 }, { "epoch": 0.03078966328417288, "grad_norm": 0.28125, "learning_rate": 0.0019968119488418096, "loss": 0.2305, "step": 3547 }, { "epoch": 0.030798343764377047, "grad_norm": 0.28125, "learning_rate": 0.0019968094454703225, "loss": 0.2812, "step": 3548 }, { "epoch": 0.030807024244581212, "grad_norm": 0.310546875, "learning_rate": 0.0019968069411180982, "loss": 0.2266, "step": 3549 }, { "epoch": 0.030815704724785373, "grad_norm": 0.08251953125, "learning_rate": 0.0019968044357851398, "loss": 0.1562, "step": 3550 }, { "epoch": 0.03082438520498954, "grad_norm": 1.21875, "learning_rate": 0.001996801929471449, "loss": 0.2539, "step": 3551 }, { "epoch": 0.030833065685193704, "grad_norm": 0.059326171875, "learning_rate": 0.001996799422177029, "loss": 0.1523, "step": 3552 }, { "epoch": 0.03084174616539787, "grad_norm": 0.70703125, "learning_rate": 0.0019967969139018833, "loss": 0.2695, "step": 3553 }, { "epoch": 0.030850426645602034, "grad_norm": 0.31640625, "learning_rate": 0.0019967944046460134, "loss": 0.1602, "step": 3554 }, { "epoch": 0.0308591071258062, "grad_norm": 0.55078125, "learning_rate": 0.0019967918944094228, "loss": 0.1963, "step": 3555 }, { "epoch": 0.030867787606010364, "grad_norm": 0.35546875, "learning_rate": 0.001996789383192114, "loss": 0.1777, "step": 3556 }, { "epoch": 0.03087646808621453, "grad_norm": 0.1611328125, "learning_rate": 0.0019967868709940897, "loss": 0.1543, "step": 3557 }, { "epoch": 0.030885148566418694, "grad_norm": 0.77734375, "learning_rate": 0.001996784357815353, "loss": 0.2324, "step": 3558 }, { "epoch": 0.03089382904662286, "grad_norm": 0.189453125, "learning_rate": 0.0019967818436559064, "loss": 0.168, "step": 3559 }, { "epoch": 0.030902509526827025, "grad_norm": 0.2353515625, "learning_rate": 0.001996779328515752, "loss": 0.1719, "step": 3560 }, { "epoch": 0.03091119000703119, "grad_norm": 0.216796875, "learning_rate": 0.001996776812394894, "loss": 0.1777, "step": 3561 }, { "epoch": 0.030919870487235355, "grad_norm": 0.10595703125, "learning_rate": 0.0019967742952933343, "loss": 0.2041, "step": 3562 }, { "epoch": 0.03092855096743952, "grad_norm": 0.07861328125, "learning_rate": 0.0019967717772110755, "loss": 0.1787, "step": 3563 }, { "epoch": 0.030937231447643685, "grad_norm": 0.423828125, "learning_rate": 0.0019967692581481203, "loss": 0.2461, "step": 3564 }, { "epoch": 0.03094591192784785, "grad_norm": 0.06982421875, "learning_rate": 0.001996766738104472, "loss": 0.2129, "step": 3565 }, { "epoch": 0.030954592408052012, "grad_norm": 0.265625, "learning_rate": 0.001996764217080133, "loss": 0.1729, "step": 3566 }, { "epoch": 0.030963272888256177, "grad_norm": 0.1494140625, "learning_rate": 0.001996761695075106, "loss": 0.1797, "step": 3567 }, { "epoch": 0.030971953368460342, "grad_norm": 0.2333984375, "learning_rate": 0.0019967591720893942, "loss": 0.1738, "step": 3568 }, { "epoch": 0.030980633848664507, "grad_norm": 0.49609375, "learning_rate": 0.001996756648123, "loss": 0.1982, "step": 3569 }, { "epoch": 0.030989314328868672, "grad_norm": 0.1494140625, "learning_rate": 0.0019967541231759264, "loss": 0.2227, "step": 3570 }, { "epoch": 0.030997994809072837, "grad_norm": 0.28515625, "learning_rate": 0.001996751597248175, "loss": 0.166, "step": 3571 }, { "epoch": 0.031006675289277003, "grad_norm": 1.796875, "learning_rate": 0.001996749070339751, "loss": 0.3945, "step": 3572 }, { "epoch": 0.031015355769481168, "grad_norm": 0.30859375, "learning_rate": 0.0019967465424506545, "loss": 0.1719, "step": 3573 }, { "epoch": 0.031024036249685333, "grad_norm": 0.2578125, "learning_rate": 0.00199674401358089, "loss": 0.207, "step": 3574 }, { "epoch": 0.031032716729889498, "grad_norm": 0.328125, "learning_rate": 0.0019967414837304596, "loss": 0.2188, "step": 3575 }, { "epoch": 0.031041397210093663, "grad_norm": 0.359375, "learning_rate": 0.001996738952899366, "loss": 0.2002, "step": 3576 }, { "epoch": 0.031050077690297828, "grad_norm": 1.1484375, "learning_rate": 0.001996736421087612, "loss": 0.2207, "step": 3577 }, { "epoch": 0.031058758170501993, "grad_norm": 0.08935546875, "learning_rate": 0.001996733888295201, "loss": 0.1924, "step": 3578 }, { "epoch": 0.03106743865070616, "grad_norm": 0.27734375, "learning_rate": 0.001996731354522135, "loss": 0.2363, "step": 3579 }, { "epoch": 0.031076119130910324, "grad_norm": 0.578125, "learning_rate": 0.0019967288197684173, "loss": 0.1953, "step": 3580 }, { "epoch": 0.031084799611114485, "grad_norm": 0.1259765625, "learning_rate": 0.0019967262840340505, "loss": 0.2422, "step": 3581 }, { "epoch": 0.03109348009131865, "grad_norm": 0.16015625, "learning_rate": 0.0019967237473190367, "loss": 0.1914, "step": 3582 }, { "epoch": 0.031102160571522815, "grad_norm": 0.1083984375, "learning_rate": 0.00199672120962338, "loss": 0.2402, "step": 3583 }, { "epoch": 0.03111084105172698, "grad_norm": 0.359375, "learning_rate": 0.001996718670947082, "loss": 0.3438, "step": 3584 }, { "epoch": 0.031119521531931146, "grad_norm": 0.119140625, "learning_rate": 0.0019967161312901462, "loss": 0.1533, "step": 3585 }, { "epoch": 0.03112820201213531, "grad_norm": 0.400390625, "learning_rate": 0.001996713590652575, "loss": 0.2617, "step": 3586 }, { "epoch": 0.031136882492339476, "grad_norm": 0.328125, "learning_rate": 0.001996711049034371, "loss": 0.248, "step": 3587 }, { "epoch": 0.03114556297254364, "grad_norm": 0.30078125, "learning_rate": 0.001996708506435538, "loss": 0.1787, "step": 3588 }, { "epoch": 0.031154243452747806, "grad_norm": 0.09228515625, "learning_rate": 0.0019967059628560775, "loss": 0.1416, "step": 3589 }, { "epoch": 0.03116292393295197, "grad_norm": 0.31640625, "learning_rate": 0.0019967034182959927, "loss": 0.167, "step": 3590 }, { "epoch": 0.031171604413156136, "grad_norm": 0.29296875, "learning_rate": 0.001996700872755287, "loss": 0.167, "step": 3591 }, { "epoch": 0.0311802848933603, "grad_norm": 0.162109375, "learning_rate": 0.001996698326233962, "loss": 0.1758, "step": 3592 }, { "epoch": 0.031188965373564467, "grad_norm": 0.1728515625, "learning_rate": 0.001996695778732022, "loss": 0.1631, "step": 3593 }, { "epoch": 0.03119764585376863, "grad_norm": 0.1748046875, "learning_rate": 0.0019966932302494687, "loss": 0.1914, "step": 3594 }, { "epoch": 0.031206326333972797, "grad_norm": 0.59375, "learning_rate": 0.0019966906807863047, "loss": 0.1729, "step": 3595 }, { "epoch": 0.031215006814176962, "grad_norm": 0.09130859375, "learning_rate": 0.001996688130342534, "loss": 0.1963, "step": 3596 }, { "epoch": 0.031223687294381124, "grad_norm": 0.166015625, "learning_rate": 0.001996685578918158, "loss": 0.1699, "step": 3597 }, { "epoch": 0.03123236777458529, "grad_norm": 0.296875, "learning_rate": 0.0019966830265131805, "loss": 0.209, "step": 3598 }, { "epoch": 0.031241048254789454, "grad_norm": 0.12158203125, "learning_rate": 0.0019966804731276037, "loss": 0.1836, "step": 3599 }, { "epoch": 0.03124972873499362, "grad_norm": 0.337890625, "learning_rate": 0.0019966779187614307, "loss": 0.2793, "step": 3600 }, { "epoch": 0.031258409215197784, "grad_norm": 0.142578125, "learning_rate": 0.001996675363414664, "loss": 0.1797, "step": 3601 }, { "epoch": 0.03126708969540195, "grad_norm": 0.267578125, "learning_rate": 0.001996672807087307, "loss": 0.1768, "step": 3602 }, { "epoch": 0.031275770175606114, "grad_norm": 0.09716796875, "learning_rate": 0.001996670249779362, "loss": 0.1963, "step": 3603 }, { "epoch": 0.03128445065581028, "grad_norm": 0.283203125, "learning_rate": 0.0019966676914908313, "loss": 0.1758, "step": 3604 }, { "epoch": 0.031293131136014445, "grad_norm": 0.365234375, "learning_rate": 0.0019966651322217187, "loss": 0.2559, "step": 3605 }, { "epoch": 0.03130181161621861, "grad_norm": 0.326171875, "learning_rate": 0.001996662571972027, "loss": 0.2207, "step": 3606 }, { "epoch": 0.031310492096422775, "grad_norm": 0.2265625, "learning_rate": 0.001996660010741758, "loss": 0.1504, "step": 3607 }, { "epoch": 0.03131917257662694, "grad_norm": 0.39453125, "learning_rate": 0.0019966574485309153, "loss": 0.1641, "step": 3608 }, { "epoch": 0.031327853056831105, "grad_norm": 0.1279296875, "learning_rate": 0.001996654885339502, "loss": 0.1914, "step": 3609 }, { "epoch": 0.03133653353703527, "grad_norm": 0.1796875, "learning_rate": 0.0019966523211675195, "loss": 0.1523, "step": 3610 }, { "epoch": 0.031345214017239435, "grad_norm": 0.486328125, "learning_rate": 0.001996649756014972, "loss": 0.2031, "step": 3611 }, { "epoch": 0.0313538944974436, "grad_norm": 0.11865234375, "learning_rate": 0.0019966471898818614, "loss": 0.1377, "step": 3612 }, { "epoch": 0.031362574977647766, "grad_norm": 0.3359375, "learning_rate": 0.0019966446227681913, "loss": 0.2031, "step": 3613 }, { "epoch": 0.03137125545785193, "grad_norm": 0.4453125, "learning_rate": 0.001996642054673964, "loss": 0.1777, "step": 3614 }, { "epoch": 0.031379935938056096, "grad_norm": 0.380859375, "learning_rate": 0.0019966394855991825, "loss": 0.1885, "step": 3615 }, { "epoch": 0.03138861641826026, "grad_norm": 0.427734375, "learning_rate": 0.0019966369155438495, "loss": 0.248, "step": 3616 }, { "epoch": 0.031397296898464426, "grad_norm": 0.08203125, "learning_rate": 0.001996634344507968, "loss": 0.1543, "step": 3617 }, { "epoch": 0.03140597737866859, "grad_norm": 0.4609375, "learning_rate": 0.0019966317724915404, "loss": 0.1797, "step": 3618 }, { "epoch": 0.031414657858872756, "grad_norm": 0.11669921875, "learning_rate": 0.0019966291994945695, "loss": 0.1729, "step": 3619 }, { "epoch": 0.031423338339076914, "grad_norm": 0.126953125, "learning_rate": 0.0019966266255170588, "loss": 0.1709, "step": 3620 }, { "epoch": 0.03143201881928108, "grad_norm": 0.119140625, "learning_rate": 0.0019966240505590103, "loss": 0.1738, "step": 3621 }, { "epoch": 0.031440699299485245, "grad_norm": 0.111328125, "learning_rate": 0.0019966214746204277, "loss": 0.1855, "step": 3622 }, { "epoch": 0.03144937977968941, "grad_norm": 0.08837890625, "learning_rate": 0.001996618897701313, "loss": 0.2324, "step": 3623 }, { "epoch": 0.031458060259893575, "grad_norm": 0.07958984375, "learning_rate": 0.001996616319801669, "loss": 0.1641, "step": 3624 }, { "epoch": 0.03146674074009774, "grad_norm": 0.5078125, "learning_rate": 0.0019966137409215, "loss": 0.1797, "step": 3625 }, { "epoch": 0.031475421220301905, "grad_norm": 0.546875, "learning_rate": 0.0019966111610608068, "loss": 0.2168, "step": 3626 }, { "epoch": 0.03148410170050607, "grad_norm": 0.4609375, "learning_rate": 0.0019966085802195933, "loss": 0.2773, "step": 3627 }, { "epoch": 0.031492782180710235, "grad_norm": 0.46875, "learning_rate": 0.001996605998397862, "loss": 0.2148, "step": 3628 }, { "epoch": 0.0315014626609144, "grad_norm": 0.2431640625, "learning_rate": 0.001996603415595616, "loss": 0.1816, "step": 3629 }, { "epoch": 0.031510143141118566, "grad_norm": 0.232421875, "learning_rate": 0.001996600831812858, "loss": 0.1826, "step": 3630 }, { "epoch": 0.03151882362132273, "grad_norm": 0.259765625, "learning_rate": 0.001996598247049591, "loss": 0.1465, "step": 3631 }, { "epoch": 0.031527504101526896, "grad_norm": 0.25, "learning_rate": 0.0019965956613058173, "loss": 0.1719, "step": 3632 }, { "epoch": 0.03153618458173106, "grad_norm": 0.404296875, "learning_rate": 0.00199659307458154, "loss": 0.1953, "step": 3633 }, { "epoch": 0.031544865061935226, "grad_norm": 0.640625, "learning_rate": 0.0019965904868767623, "loss": 0.2324, "step": 3634 }, { "epoch": 0.03155354554213939, "grad_norm": 0.068359375, "learning_rate": 0.0019965878981914867, "loss": 0.1484, "step": 3635 }, { "epoch": 0.031562226022343556, "grad_norm": 0.16796875, "learning_rate": 0.0019965853085257153, "loss": 0.2002, "step": 3636 }, { "epoch": 0.03157090650254772, "grad_norm": 0.5234375, "learning_rate": 0.0019965827178794527, "loss": 0.21, "step": 3637 }, { "epoch": 0.03157958698275189, "grad_norm": 0.314453125, "learning_rate": 0.0019965801262527005, "loss": 0.2295, "step": 3638 }, { "epoch": 0.03158826746295605, "grad_norm": 0.1142578125, "learning_rate": 0.0019965775336454614, "loss": 0.166, "step": 3639 }, { "epoch": 0.03159694794316022, "grad_norm": 0.0673828125, "learning_rate": 0.0019965749400577383, "loss": 0.1445, "step": 3640 }, { "epoch": 0.03160562842336438, "grad_norm": 0.197265625, "learning_rate": 0.001996572345489535, "loss": 0.1699, "step": 3641 }, { "epoch": 0.03161430890356855, "grad_norm": 0.1494140625, "learning_rate": 0.0019965697499408535, "loss": 0.1846, "step": 3642 }, { "epoch": 0.03162298938377271, "grad_norm": 0.44140625, "learning_rate": 0.0019965671534116964, "loss": 0.418, "step": 3643 }, { "epoch": 0.03163166986397688, "grad_norm": 0.1669921875, "learning_rate": 0.001996564555902067, "loss": 0.1787, "step": 3644 }, { "epoch": 0.03164035034418104, "grad_norm": 0.296875, "learning_rate": 0.0019965619574119687, "loss": 0.1924, "step": 3645 }, { "epoch": 0.03164903082438521, "grad_norm": 0.1611328125, "learning_rate": 0.0019965593579414033, "loss": 0.1826, "step": 3646 }, { "epoch": 0.03165771130458937, "grad_norm": 0.09814453125, "learning_rate": 0.001996556757490374, "loss": 0.1855, "step": 3647 }, { "epoch": 0.03166639178479354, "grad_norm": 0.10400390625, "learning_rate": 0.0019965541560588835, "loss": 0.1729, "step": 3648 }, { "epoch": 0.0316750722649977, "grad_norm": 0.13671875, "learning_rate": 0.001996551553646935, "loss": 0.1387, "step": 3649 }, { "epoch": 0.03168375274520187, "grad_norm": 0.251953125, "learning_rate": 0.0019965489502545316, "loss": 0.1758, "step": 3650 }, { "epoch": 0.031692433225406026, "grad_norm": 0.26953125, "learning_rate": 0.0019965463458816754, "loss": 0.1885, "step": 3651 }, { "epoch": 0.03170111370561019, "grad_norm": 0.330078125, "learning_rate": 0.0019965437405283695, "loss": 0.1885, "step": 3652 }, { "epoch": 0.031709794185814356, "grad_norm": 0.443359375, "learning_rate": 0.001996541134194617, "loss": 0.2031, "step": 3653 }, { "epoch": 0.03171847466601852, "grad_norm": 0.162109375, "learning_rate": 0.0019965385268804208, "loss": 0.166, "step": 3654 }, { "epoch": 0.03172715514622269, "grad_norm": 0.408203125, "learning_rate": 0.001996535918585783, "loss": 0.2207, "step": 3655 }, { "epoch": 0.03173583562642685, "grad_norm": 0.376953125, "learning_rate": 0.0019965333093107072, "loss": 0.2461, "step": 3656 }, { "epoch": 0.03174451610663102, "grad_norm": 0.1875, "learning_rate": 0.001996530699055196, "loss": 0.1719, "step": 3657 }, { "epoch": 0.03175319658683518, "grad_norm": 0.1708984375, "learning_rate": 0.0019965280878192523, "loss": 0.1621, "step": 3658 }, { "epoch": 0.03176187706703935, "grad_norm": 0.1923828125, "learning_rate": 0.0019965254756028794, "loss": 0.1689, "step": 3659 }, { "epoch": 0.03177055754724351, "grad_norm": 0.1826171875, "learning_rate": 0.001996522862406079, "loss": 0.1924, "step": 3660 }, { "epoch": 0.03177923802744768, "grad_norm": 0.1494140625, "learning_rate": 0.0019965202482288553, "loss": 0.1826, "step": 3661 }, { "epoch": 0.03178791850765184, "grad_norm": 0.6015625, "learning_rate": 0.00199651763307121, "loss": 0.2217, "step": 3662 }, { "epoch": 0.03179659898785601, "grad_norm": 0.1357421875, "learning_rate": 0.001996515016933147, "loss": 0.1689, "step": 3663 }, { "epoch": 0.03180527946806017, "grad_norm": 0.1591796875, "learning_rate": 0.0019965123998146686, "loss": 0.1621, "step": 3664 }, { "epoch": 0.03181395994826434, "grad_norm": 0.171875, "learning_rate": 0.0019965097817157772, "loss": 0.2051, "step": 3665 }, { "epoch": 0.0318226404284685, "grad_norm": 0.08544921875, "learning_rate": 0.0019965071626364766, "loss": 0.1777, "step": 3666 }, { "epoch": 0.03183132090867267, "grad_norm": 0.5546875, "learning_rate": 0.0019965045425767687, "loss": 0.1777, "step": 3667 }, { "epoch": 0.03184000138887683, "grad_norm": 0.291015625, "learning_rate": 0.0019965019215366577, "loss": 0.2051, "step": 3668 }, { "epoch": 0.031848681869081, "grad_norm": 0.2294921875, "learning_rate": 0.001996499299516145, "loss": 0.1895, "step": 3669 }, { "epoch": 0.031857362349285163, "grad_norm": 0.1865234375, "learning_rate": 0.0019964966765152344, "loss": 0.1689, "step": 3670 }, { "epoch": 0.03186604282948933, "grad_norm": 0.0849609375, "learning_rate": 0.0019964940525339287, "loss": 0.1582, "step": 3671 }, { "epoch": 0.031874723309693494, "grad_norm": 0.34375, "learning_rate": 0.00199649142757223, "loss": 0.1689, "step": 3672 }, { "epoch": 0.03188340378989766, "grad_norm": 0.1533203125, "learning_rate": 0.0019964888016301427, "loss": 0.1934, "step": 3673 }, { "epoch": 0.031892084270101824, "grad_norm": 0.48046875, "learning_rate": 0.001996486174707668, "loss": 0.1855, "step": 3674 }, { "epoch": 0.03190076475030599, "grad_norm": 0.361328125, "learning_rate": 0.0019964835468048096, "loss": 0.3066, "step": 3675 }, { "epoch": 0.031909445230510154, "grad_norm": 0.1337890625, "learning_rate": 0.0019964809179215705, "loss": 0.2217, "step": 3676 }, { "epoch": 0.03191812571071432, "grad_norm": 1.703125, "learning_rate": 0.0019964782880579532, "loss": 0.2539, "step": 3677 }, { "epoch": 0.031926806190918484, "grad_norm": 0.13671875, "learning_rate": 0.0019964756572139605, "loss": 0.2383, "step": 3678 }, { "epoch": 0.03193548667112265, "grad_norm": 0.50390625, "learning_rate": 0.0019964730253895957, "loss": 0.1895, "step": 3679 }, { "epoch": 0.031944167151326815, "grad_norm": 0.65234375, "learning_rate": 0.0019964703925848615, "loss": 0.2227, "step": 3680 }, { "epoch": 0.03195284763153097, "grad_norm": 0.41015625, "learning_rate": 0.0019964677587997605, "loss": 0.168, "step": 3681 }, { "epoch": 0.03196152811173514, "grad_norm": 0.296875, "learning_rate": 0.001996465124034296, "loss": 0.1514, "step": 3682 }, { "epoch": 0.0319702085919393, "grad_norm": 0.1806640625, "learning_rate": 0.001996462488288471, "loss": 0.1914, "step": 3683 }, { "epoch": 0.03197888907214347, "grad_norm": 0.3203125, "learning_rate": 0.0019964598515622876, "loss": 0.1543, "step": 3684 }, { "epoch": 0.03198756955234763, "grad_norm": 0.134765625, "learning_rate": 0.001996457213855749, "loss": 0.2041, "step": 3685 }, { "epoch": 0.0319962500325518, "grad_norm": 0.154296875, "learning_rate": 0.001996454575168859, "loss": 0.209, "step": 3686 }, { "epoch": 0.032004930512755964, "grad_norm": 0.095703125, "learning_rate": 0.001996451935501619, "loss": 0.1387, "step": 3687 }, { "epoch": 0.03201361099296013, "grad_norm": 0.1572265625, "learning_rate": 0.001996449294854033, "loss": 0.1904, "step": 3688 }, { "epoch": 0.032022291473164294, "grad_norm": 0.09033203125, "learning_rate": 0.0019964466532261037, "loss": 0.2012, "step": 3689 }, { "epoch": 0.03203097195336846, "grad_norm": 0.2294921875, "learning_rate": 0.001996444010617834, "loss": 0.1885, "step": 3690 }, { "epoch": 0.032039652433572624, "grad_norm": 0.64453125, "learning_rate": 0.001996441367029226, "loss": 0.1963, "step": 3691 }, { "epoch": 0.03204833291377679, "grad_norm": 0.337890625, "learning_rate": 0.0019964387224602836, "loss": 0.1602, "step": 3692 }, { "epoch": 0.032057013393980954, "grad_norm": 0.2236328125, "learning_rate": 0.001996436076911009, "loss": 0.2383, "step": 3693 }, { "epoch": 0.03206569387418512, "grad_norm": 0.310546875, "learning_rate": 0.001996433430381405, "loss": 0.1797, "step": 3694 }, { "epoch": 0.032074374354389285, "grad_norm": 0.0908203125, "learning_rate": 0.001996430782871476, "loss": 0.1758, "step": 3695 }, { "epoch": 0.03208305483459345, "grad_norm": 0.2001953125, "learning_rate": 0.001996428134381223, "loss": 0.1807, "step": 3696 }, { "epoch": 0.032091735314797615, "grad_norm": 0.16796875, "learning_rate": 0.0019964254849106494, "loss": 0.1572, "step": 3697 }, { "epoch": 0.03210041579500178, "grad_norm": 0.341796875, "learning_rate": 0.001996422834459759, "loss": 0.1855, "step": 3698 }, { "epoch": 0.032109096275205945, "grad_norm": 0.1748046875, "learning_rate": 0.0019964201830285537, "loss": 0.1758, "step": 3699 }, { "epoch": 0.03211777675541011, "grad_norm": 0.126953125, "learning_rate": 0.0019964175306170368, "loss": 0.2002, "step": 3700 }, { "epoch": 0.032126457235614275, "grad_norm": 0.32421875, "learning_rate": 0.0019964148772252115, "loss": 0.1738, "step": 3701 }, { "epoch": 0.03213513771581844, "grad_norm": 0.5, "learning_rate": 0.00199641222285308, "loss": 0.1807, "step": 3702 }, { "epoch": 0.032143818196022605, "grad_norm": 0.66015625, "learning_rate": 0.0019964095675006456, "loss": 0.1807, "step": 3703 }, { "epoch": 0.03215249867622677, "grad_norm": 0.150390625, "learning_rate": 0.001996406911167911, "loss": 0.1963, "step": 3704 }, { "epoch": 0.032161179156430936, "grad_norm": 0.1376953125, "learning_rate": 0.0019964042538548796, "loss": 0.2031, "step": 3705 }, { "epoch": 0.0321698596366351, "grad_norm": 0.5, "learning_rate": 0.0019964015955615537, "loss": 0.2119, "step": 3706 }, { "epoch": 0.032178540116839266, "grad_norm": 0.2080078125, "learning_rate": 0.001996398936287937, "loss": 0.1572, "step": 3707 }, { "epoch": 0.03218722059704343, "grad_norm": 0.09033203125, "learning_rate": 0.0019963962760340312, "loss": 0.1572, "step": 3708 }, { "epoch": 0.032195901077247596, "grad_norm": 0.095703125, "learning_rate": 0.0019963936147998403, "loss": 0.1699, "step": 3709 }, { "epoch": 0.03220458155745176, "grad_norm": 0.07568359375, "learning_rate": 0.0019963909525853667, "loss": 0.1455, "step": 3710 }, { "epoch": 0.032213262037655926, "grad_norm": 0.25390625, "learning_rate": 0.0019963882893906135, "loss": 0.2227, "step": 3711 }, { "epoch": 0.032221942517860085, "grad_norm": 0.3359375, "learning_rate": 0.0019963856252155836, "loss": 0.1611, "step": 3712 }, { "epoch": 0.03223062299806425, "grad_norm": 0.09326171875, "learning_rate": 0.0019963829600602796, "loss": 0.1797, "step": 3713 }, { "epoch": 0.032239303478268415, "grad_norm": 0.55078125, "learning_rate": 0.0019963802939247047, "loss": 0.2012, "step": 3714 }, { "epoch": 0.03224798395847258, "grad_norm": 0.34375, "learning_rate": 0.001996377626808862, "loss": 0.1611, "step": 3715 }, { "epoch": 0.032256664438676745, "grad_norm": 0.1513671875, "learning_rate": 0.001996374958712754, "loss": 0.1816, "step": 3716 }, { "epoch": 0.03226534491888091, "grad_norm": 0.337890625, "learning_rate": 0.001996372289636384, "loss": 0.209, "step": 3717 }, { "epoch": 0.032274025399085075, "grad_norm": 0.2119140625, "learning_rate": 0.0019963696195797547, "loss": 0.2246, "step": 3718 }, { "epoch": 0.03228270587928924, "grad_norm": 0.1279296875, "learning_rate": 0.0019963669485428687, "loss": 0.1533, "step": 3719 }, { "epoch": 0.032291386359493406, "grad_norm": 0.5546875, "learning_rate": 0.00199636427652573, "loss": 0.2441, "step": 3720 }, { "epoch": 0.03230006683969757, "grad_norm": 0.388671875, "learning_rate": 0.00199636160352834, "loss": 0.1914, "step": 3721 }, { "epoch": 0.032308747319901736, "grad_norm": 0.203125, "learning_rate": 0.0019963589295507026, "loss": 0.1934, "step": 3722 }, { "epoch": 0.0323174278001059, "grad_norm": 0.1728515625, "learning_rate": 0.0019963562545928207, "loss": 0.1641, "step": 3723 }, { "epoch": 0.032326108280310066, "grad_norm": 0.11328125, "learning_rate": 0.0019963535786546973, "loss": 0.1973, "step": 3724 }, { "epoch": 0.03233478876051423, "grad_norm": 0.22265625, "learning_rate": 0.0019963509017363345, "loss": 0.1562, "step": 3725 }, { "epoch": 0.032343469240718396, "grad_norm": 0.205078125, "learning_rate": 0.0019963482238377363, "loss": 0.2031, "step": 3726 }, { "epoch": 0.03235214972092256, "grad_norm": 0.318359375, "learning_rate": 0.001996345544958905, "loss": 0.1582, "step": 3727 }, { "epoch": 0.03236083020112673, "grad_norm": 0.2392578125, "learning_rate": 0.0019963428650998437, "loss": 0.1738, "step": 3728 }, { "epoch": 0.03236951068133089, "grad_norm": 0.22265625, "learning_rate": 0.0019963401842605557, "loss": 0.1719, "step": 3729 }, { "epoch": 0.03237819116153506, "grad_norm": 0.78125, "learning_rate": 0.0019963375024410427, "loss": 0.2773, "step": 3730 }, { "epoch": 0.03238687164173922, "grad_norm": 0.18359375, "learning_rate": 0.001996334819641309, "loss": 0.1885, "step": 3731 }, { "epoch": 0.03239555212194339, "grad_norm": 0.42578125, "learning_rate": 0.001996332135861357, "loss": 0.1719, "step": 3732 }, { "epoch": 0.03240423260214755, "grad_norm": 0.12060546875, "learning_rate": 0.0019963294511011896, "loss": 0.1934, "step": 3733 }, { "epoch": 0.03241291308235172, "grad_norm": 0.1357421875, "learning_rate": 0.00199632676536081, "loss": 0.2051, "step": 3734 }, { "epoch": 0.03242159356255588, "grad_norm": 0.3359375, "learning_rate": 0.0019963240786402205, "loss": 0.2207, "step": 3735 }, { "epoch": 0.03243027404276005, "grad_norm": 0.08203125, "learning_rate": 0.0019963213909394246, "loss": 0.1807, "step": 3736 }, { "epoch": 0.03243895452296421, "grad_norm": 0.193359375, "learning_rate": 0.0019963187022584254, "loss": 0.2227, "step": 3737 }, { "epoch": 0.03244763500316838, "grad_norm": 0.08154296875, "learning_rate": 0.001996316012597225, "loss": 0.1689, "step": 3738 }, { "epoch": 0.03245631548337254, "grad_norm": 0.1591796875, "learning_rate": 0.001996313321955827, "loss": 0.1738, "step": 3739 }, { "epoch": 0.03246499596357671, "grad_norm": 0.275390625, "learning_rate": 0.0019963106303342347, "loss": 0.2119, "step": 3740 }, { "epoch": 0.03247367644378087, "grad_norm": 0.13671875, "learning_rate": 0.00199630793773245, "loss": 0.2285, "step": 3741 }, { "epoch": 0.03248235692398504, "grad_norm": 0.65234375, "learning_rate": 0.0019963052441504766, "loss": 0.2363, "step": 3742 }, { "epoch": 0.032491037404189196, "grad_norm": 0.146484375, "learning_rate": 0.0019963025495883173, "loss": 0.1895, "step": 3743 }, { "epoch": 0.03249971788439336, "grad_norm": 1.0546875, "learning_rate": 0.001996299854045975, "loss": 0.4746, "step": 3744 }, { "epoch": 0.03250839836459753, "grad_norm": 0.185546875, "learning_rate": 0.0019962971575234524, "loss": 0.2285, "step": 3745 }, { "epoch": 0.03251707884480169, "grad_norm": 0.130859375, "learning_rate": 0.0019962944600207533, "loss": 0.1973, "step": 3746 }, { "epoch": 0.03252575932500586, "grad_norm": 0.166015625, "learning_rate": 0.00199629176153788, "loss": 0.1836, "step": 3747 }, { "epoch": 0.03253443980521002, "grad_norm": 0.10009765625, "learning_rate": 0.0019962890620748345, "loss": 0.2002, "step": 3748 }, { "epoch": 0.03254312028541419, "grad_norm": 0.345703125, "learning_rate": 0.0019962863616316217, "loss": 0.1533, "step": 3749 }, { "epoch": 0.03255180076561835, "grad_norm": 0.1318359375, "learning_rate": 0.0019962836602082433, "loss": 0.1934, "step": 3750 }, { "epoch": 0.03256048124582252, "grad_norm": 0.359375, "learning_rate": 0.0019962809578047023, "loss": 0.2285, "step": 3751 }, { "epoch": 0.03256916172602668, "grad_norm": 0.11865234375, "learning_rate": 0.0019962782544210023, "loss": 0.1953, "step": 3752 }, { "epoch": 0.03257784220623085, "grad_norm": 0.12890625, "learning_rate": 0.0019962755500571457, "loss": 0.1924, "step": 3753 }, { "epoch": 0.03258652268643501, "grad_norm": 0.185546875, "learning_rate": 0.0019962728447131357, "loss": 0.166, "step": 3754 }, { "epoch": 0.03259520316663918, "grad_norm": 0.1552734375, "learning_rate": 0.0019962701383889753, "loss": 0.1562, "step": 3755 }, { "epoch": 0.03260388364684334, "grad_norm": 0.1015625, "learning_rate": 0.001996267431084667, "loss": 0.2207, "step": 3756 }, { "epoch": 0.03261256412704751, "grad_norm": 0.9609375, "learning_rate": 0.0019962647228002144, "loss": 0.2324, "step": 3757 }, { "epoch": 0.03262124460725167, "grad_norm": 0.09765625, "learning_rate": 0.00199626201353562, "loss": 0.1924, "step": 3758 }, { "epoch": 0.03262992508745584, "grad_norm": 0.55859375, "learning_rate": 0.001996259303290887, "loss": 0.1797, "step": 3759 }, { "epoch": 0.03263860556766, "grad_norm": 0.306640625, "learning_rate": 0.0019962565920660187, "loss": 0.1348, "step": 3760 }, { "epoch": 0.03264728604786417, "grad_norm": 0.357421875, "learning_rate": 0.001996253879861017, "loss": 0.2344, "step": 3761 }, { "epoch": 0.032655966528068334, "grad_norm": 0.095703125, "learning_rate": 0.001996251166675886, "loss": 0.2148, "step": 3762 }, { "epoch": 0.0326646470082725, "grad_norm": 0.11328125, "learning_rate": 0.001996248452510628, "loss": 0.166, "step": 3763 }, { "epoch": 0.032673327488476664, "grad_norm": 0.103515625, "learning_rate": 0.001996245737365246, "loss": 0.1943, "step": 3764 }, { "epoch": 0.03268200796868083, "grad_norm": 0.361328125, "learning_rate": 0.001996243021239743, "loss": 0.1592, "step": 3765 }, { "epoch": 0.032690688448884994, "grad_norm": 0.06640625, "learning_rate": 0.0019962403041341222, "loss": 0.1895, "step": 3766 }, { "epoch": 0.03269936892908916, "grad_norm": 0.470703125, "learning_rate": 0.001996237586048387, "loss": 0.1836, "step": 3767 }, { "epoch": 0.032708049409293324, "grad_norm": 0.2080078125, "learning_rate": 0.001996234866982539, "loss": 0.2002, "step": 3768 }, { "epoch": 0.03271672988949749, "grad_norm": 0.166015625, "learning_rate": 0.001996232146936583, "loss": 0.209, "step": 3769 }, { "epoch": 0.032725410369701655, "grad_norm": 0.22265625, "learning_rate": 0.0019962294259105204, "loss": 0.1855, "step": 3770 }, { "epoch": 0.03273409084990582, "grad_norm": 0.328125, "learning_rate": 0.0019962267039043543, "loss": 0.1729, "step": 3771 }, { "epoch": 0.032742771330109985, "grad_norm": 0.11181640625, "learning_rate": 0.001996223980918089, "loss": 0.1934, "step": 3772 }, { "epoch": 0.03275145181031415, "grad_norm": 0.146484375, "learning_rate": 0.0019962212569517262, "loss": 0.2129, "step": 3773 }, { "epoch": 0.03276013229051831, "grad_norm": 0.0751953125, "learning_rate": 0.0019962185320052694, "loss": 0.1602, "step": 3774 }, { "epoch": 0.03276881277072247, "grad_norm": 0.3515625, "learning_rate": 0.0019962158060787215, "loss": 0.2285, "step": 3775 }, { "epoch": 0.03277749325092664, "grad_norm": 0.111328125, "learning_rate": 0.0019962130791720853, "loss": 0.2227, "step": 3776 }, { "epoch": 0.032786173731130804, "grad_norm": 0.0908203125, "learning_rate": 0.001996210351285364, "loss": 0.1699, "step": 3777 }, { "epoch": 0.03279485421133497, "grad_norm": 0.08984375, "learning_rate": 0.0019962076224185605, "loss": 0.2119, "step": 3778 }, { "epoch": 0.032803534691539134, "grad_norm": 0.2353515625, "learning_rate": 0.001996204892571678, "loss": 0.1738, "step": 3779 }, { "epoch": 0.0328122151717433, "grad_norm": 0.126953125, "learning_rate": 0.001996202161744719, "loss": 0.1504, "step": 3780 }, { "epoch": 0.032820895651947464, "grad_norm": 0.5234375, "learning_rate": 0.001996199429937687, "loss": 0.1445, "step": 3781 }, { "epoch": 0.03282957613215163, "grad_norm": 0.60546875, "learning_rate": 0.001996196697150585, "loss": 0.207, "step": 3782 }, { "epoch": 0.032838256612355794, "grad_norm": 0.408203125, "learning_rate": 0.001996193963383416, "loss": 0.209, "step": 3783 }, { "epoch": 0.03284693709255996, "grad_norm": 0.0791015625, "learning_rate": 0.001996191228636182, "loss": 0.2227, "step": 3784 }, { "epoch": 0.032855617572764124, "grad_norm": 0.396484375, "learning_rate": 0.0019961884929088868, "loss": 0.2324, "step": 3785 }, { "epoch": 0.03286429805296829, "grad_norm": 0.283203125, "learning_rate": 0.0019961857562015334, "loss": 0.1846, "step": 3786 }, { "epoch": 0.032872978533172455, "grad_norm": 0.33984375, "learning_rate": 0.0019961830185141246, "loss": 0.2246, "step": 3787 }, { "epoch": 0.03288165901337662, "grad_norm": 0.0927734375, "learning_rate": 0.001996180279846664, "loss": 0.2031, "step": 3788 }, { "epoch": 0.032890339493580785, "grad_norm": 0.375, "learning_rate": 0.001996177540199154, "loss": 0.1816, "step": 3789 }, { "epoch": 0.03289901997378495, "grad_norm": 0.1650390625, "learning_rate": 0.0019961747995715976, "loss": 0.1953, "step": 3790 }, { "epoch": 0.032907700453989115, "grad_norm": 0.29296875, "learning_rate": 0.001996172057963998, "loss": 0.2051, "step": 3791 }, { "epoch": 0.03291638093419328, "grad_norm": 0.5234375, "learning_rate": 0.001996169315376358, "loss": 0.2051, "step": 3792 }, { "epoch": 0.032925061414397445, "grad_norm": 0.375, "learning_rate": 0.0019961665718086804, "loss": 0.1973, "step": 3793 }, { "epoch": 0.03293374189460161, "grad_norm": 0.0986328125, "learning_rate": 0.001996163827260969, "loss": 0.1963, "step": 3794 }, { "epoch": 0.032942422374805776, "grad_norm": 0.30078125, "learning_rate": 0.001996161081733226, "loss": 0.168, "step": 3795 }, { "epoch": 0.03295110285500994, "grad_norm": 0.08642578125, "learning_rate": 0.0019961583352254546, "loss": 0.1777, "step": 3796 }, { "epoch": 0.032959783335214106, "grad_norm": 0.421875, "learning_rate": 0.0019961555877376583, "loss": 0.1807, "step": 3797 }, { "epoch": 0.03296846381541827, "grad_norm": 0.111328125, "learning_rate": 0.0019961528392698396, "loss": 0.1533, "step": 3798 }, { "epoch": 0.032977144295622436, "grad_norm": 0.296875, "learning_rate": 0.0019961500898220013, "loss": 0.2061, "step": 3799 }, { "epoch": 0.0329858247758266, "grad_norm": 0.41015625, "learning_rate": 0.001996147339394147, "loss": 0.1885, "step": 3800 }, { "epoch": 0.032994505256030766, "grad_norm": 0.62109375, "learning_rate": 0.0019961445879862795, "loss": 0.2285, "step": 3801 }, { "epoch": 0.03300318573623493, "grad_norm": 0.625, "learning_rate": 0.0019961418355984016, "loss": 0.1973, "step": 3802 }, { "epoch": 0.0330118662164391, "grad_norm": 0.259765625, "learning_rate": 0.0019961390822305167, "loss": 0.1611, "step": 3803 }, { "epoch": 0.03302054669664326, "grad_norm": 0.10498046875, "learning_rate": 0.001996136327882627, "loss": 0.1777, "step": 3804 }, { "epoch": 0.03302922717684742, "grad_norm": 0.51171875, "learning_rate": 0.001996133572554737, "loss": 0.1602, "step": 3805 }, { "epoch": 0.033037907657051585, "grad_norm": 0.5078125, "learning_rate": 0.0019961308162468475, "loss": 0.3379, "step": 3806 }, { "epoch": 0.03304658813725575, "grad_norm": 0.32421875, "learning_rate": 0.001996128058958964, "loss": 0.2559, "step": 3807 }, { "epoch": 0.033055268617459915, "grad_norm": 0.79296875, "learning_rate": 0.0019961253006910876, "loss": 0.1621, "step": 3808 }, { "epoch": 0.03306394909766408, "grad_norm": 0.205078125, "learning_rate": 0.0019961225414432226, "loss": 0.1621, "step": 3809 }, { "epoch": 0.033072629577868246, "grad_norm": 0.119140625, "learning_rate": 0.0019961197812153712, "loss": 0.1758, "step": 3810 }, { "epoch": 0.03308131005807241, "grad_norm": 0.48046875, "learning_rate": 0.0019961170200075364, "loss": 0.1729, "step": 3811 }, { "epoch": 0.033089990538276576, "grad_norm": 0.65625, "learning_rate": 0.0019961142578197215, "loss": 0.2246, "step": 3812 }, { "epoch": 0.03309867101848074, "grad_norm": 0.2373046875, "learning_rate": 0.0019961114946519297, "loss": 0.1543, "step": 3813 }, { "epoch": 0.033107351498684906, "grad_norm": 0.2294921875, "learning_rate": 0.001996108730504164, "loss": 0.1641, "step": 3814 }, { "epoch": 0.03311603197888907, "grad_norm": 0.275390625, "learning_rate": 0.001996105965376427, "loss": 0.1953, "step": 3815 }, { "epoch": 0.033124712459093236, "grad_norm": 0.08935546875, "learning_rate": 0.001996103199268722, "loss": 0.1582, "step": 3816 }, { "epoch": 0.0331333929392974, "grad_norm": 0.77734375, "learning_rate": 0.0019961004321810524, "loss": 0.1631, "step": 3817 }, { "epoch": 0.033142073419501566, "grad_norm": 0.12353515625, "learning_rate": 0.00199609766411342, "loss": 0.1572, "step": 3818 }, { "epoch": 0.03315075389970573, "grad_norm": 0.267578125, "learning_rate": 0.0019960948950658295, "loss": 0.1777, "step": 3819 }, { "epoch": 0.0331594343799099, "grad_norm": 0.2578125, "learning_rate": 0.0019960921250382828, "loss": 0.168, "step": 3820 }, { "epoch": 0.03316811486011406, "grad_norm": 0.216796875, "learning_rate": 0.0019960893540307834, "loss": 0.168, "step": 3821 }, { "epoch": 0.03317679534031823, "grad_norm": 0.08740234375, "learning_rate": 0.0019960865820433334, "loss": 0.2129, "step": 3822 }, { "epoch": 0.03318547582052239, "grad_norm": 0.25390625, "learning_rate": 0.0019960838090759374, "loss": 0.168, "step": 3823 }, { "epoch": 0.03319415630072656, "grad_norm": 0.126953125, "learning_rate": 0.0019960810351285973, "loss": 0.2012, "step": 3824 }, { "epoch": 0.03320283678093072, "grad_norm": 0.392578125, "learning_rate": 0.0019960782602013163, "loss": 0.1572, "step": 3825 }, { "epoch": 0.03321151726113489, "grad_norm": 0.2001953125, "learning_rate": 0.001996075484294098, "loss": 0.1396, "step": 3826 }, { "epoch": 0.03322019774133905, "grad_norm": 0.248046875, "learning_rate": 0.0019960727074069444, "loss": 0.1426, "step": 3827 }, { "epoch": 0.03322887822154322, "grad_norm": 0.1669921875, "learning_rate": 0.0019960699295398596, "loss": 0.1768, "step": 3828 }, { "epoch": 0.03323755870174738, "grad_norm": 0.341796875, "learning_rate": 0.001996067150692846, "loss": 0.2148, "step": 3829 }, { "epoch": 0.03324623918195155, "grad_norm": 0.259765625, "learning_rate": 0.001996064370865907, "loss": 0.2539, "step": 3830 }, { "epoch": 0.03325491966215571, "grad_norm": 0.12109375, "learning_rate": 0.0019960615900590454, "loss": 0.1699, "step": 3831 }, { "epoch": 0.03326360014235988, "grad_norm": 0.255859375, "learning_rate": 0.001996058808272264, "loss": 0.1328, "step": 3832 }, { "epoch": 0.03327228062256404, "grad_norm": 0.400390625, "learning_rate": 0.001996056025505567, "loss": 0.1406, "step": 3833 }, { "epoch": 0.03328096110276821, "grad_norm": 0.29296875, "learning_rate": 0.001996053241758956, "loss": 0.1855, "step": 3834 }, { "epoch": 0.03328964158297237, "grad_norm": 0.56640625, "learning_rate": 0.0019960504570324345, "loss": 0.1719, "step": 3835 }, { "epoch": 0.03329832206317653, "grad_norm": 0.4609375, "learning_rate": 0.0019960476713260056, "loss": 0.1963, "step": 3836 }, { "epoch": 0.0333070025433807, "grad_norm": 0.314453125, "learning_rate": 0.001996044884639673, "loss": 0.2305, "step": 3837 }, { "epoch": 0.03331568302358486, "grad_norm": 0.1220703125, "learning_rate": 0.0019960420969734383, "loss": 0.1543, "step": 3838 }, { "epoch": 0.03332436350378903, "grad_norm": 0.12890625, "learning_rate": 0.001996039308327306, "loss": 0.1816, "step": 3839 }, { "epoch": 0.03333304398399319, "grad_norm": 0.24609375, "learning_rate": 0.0019960365187012786, "loss": 0.1738, "step": 3840 }, { "epoch": 0.03334172446419736, "grad_norm": 0.12255859375, "learning_rate": 0.0019960337280953595, "loss": 0.1885, "step": 3841 }, { "epoch": 0.03335040494440152, "grad_norm": 0.12060546875, "learning_rate": 0.0019960309365095507, "loss": 0.166, "step": 3842 }, { "epoch": 0.03335908542460569, "grad_norm": 0.08203125, "learning_rate": 0.0019960281439438564, "loss": 0.1641, "step": 3843 }, { "epoch": 0.03336776590480985, "grad_norm": 0.1396484375, "learning_rate": 0.0019960253503982788, "loss": 0.2324, "step": 3844 }, { "epoch": 0.03337644638501402, "grad_norm": 0.58984375, "learning_rate": 0.001996022555872822, "loss": 0.2344, "step": 3845 }, { "epoch": 0.03338512686521818, "grad_norm": 0.30859375, "learning_rate": 0.0019960197603674876, "loss": 0.1846, "step": 3846 }, { "epoch": 0.03339380734542235, "grad_norm": 0.48828125, "learning_rate": 0.0019960169638822797, "loss": 0.1816, "step": 3847 }, { "epoch": 0.03340248782562651, "grad_norm": 0.056640625, "learning_rate": 0.001996014166417201, "loss": 0.127, "step": 3848 }, { "epoch": 0.03341116830583068, "grad_norm": 0.2119140625, "learning_rate": 0.001996011367972255, "loss": 0.1553, "step": 3849 }, { "epoch": 0.03341984878603484, "grad_norm": 0.453125, "learning_rate": 0.0019960085685474444, "loss": 0.1592, "step": 3850 }, { "epoch": 0.03342852926623901, "grad_norm": 0.1884765625, "learning_rate": 0.001996005768142772, "loss": 0.1729, "step": 3851 }, { "epoch": 0.033437209746443174, "grad_norm": 0.16015625, "learning_rate": 0.0019960029667582414, "loss": 0.1572, "step": 3852 }, { "epoch": 0.03344589022664734, "grad_norm": 0.291015625, "learning_rate": 0.0019960001643938552, "loss": 0.3203, "step": 3853 }, { "epoch": 0.033454570706851504, "grad_norm": 0.263671875, "learning_rate": 0.0019959973610496166, "loss": 0.1572, "step": 3854 }, { "epoch": 0.03346325118705567, "grad_norm": 0.1748046875, "learning_rate": 0.0019959945567255295, "loss": 0.25, "step": 3855 }, { "epoch": 0.033471931667259834, "grad_norm": 0.28125, "learning_rate": 0.0019959917514215954, "loss": 0.1523, "step": 3856 }, { "epoch": 0.033480612147464, "grad_norm": 0.234375, "learning_rate": 0.0019959889451378185, "loss": 0.1377, "step": 3857 }, { "epoch": 0.033489292627668164, "grad_norm": 0.109375, "learning_rate": 0.0019959861378742016, "loss": 0.1836, "step": 3858 }, { "epoch": 0.03349797310787233, "grad_norm": 0.30859375, "learning_rate": 0.001995983329630748, "loss": 0.1914, "step": 3859 }, { "epoch": 0.033506653588076495, "grad_norm": 0.216796875, "learning_rate": 0.00199598052040746, "loss": 0.1699, "step": 3860 }, { "epoch": 0.03351533406828066, "grad_norm": 0.11083984375, "learning_rate": 0.001995977710204341, "loss": 0.1904, "step": 3861 }, { "epoch": 0.033524014548484825, "grad_norm": 0.60546875, "learning_rate": 0.0019959748990213944, "loss": 0.1963, "step": 3862 }, { "epoch": 0.03353269502868899, "grad_norm": 0.138671875, "learning_rate": 0.0019959720868586235, "loss": 0.1562, "step": 3863 }, { "epoch": 0.033541375508893155, "grad_norm": 0.2236328125, "learning_rate": 0.0019959692737160305, "loss": 0.165, "step": 3864 }, { "epoch": 0.03355005598909732, "grad_norm": 0.4765625, "learning_rate": 0.0019959664595936193, "loss": 0.1875, "step": 3865 }, { "epoch": 0.03355873646930148, "grad_norm": 0.458984375, "learning_rate": 0.001995963644491393, "loss": 0.2402, "step": 3866 }, { "epoch": 0.033567416949505643, "grad_norm": 0.1484375, "learning_rate": 0.0019959608284093534, "loss": 0.168, "step": 3867 }, { "epoch": 0.03357609742970981, "grad_norm": 0.6484375, "learning_rate": 0.001995958011347505, "loss": 0.1797, "step": 3868 }, { "epoch": 0.033584777909913974, "grad_norm": 0.24609375, "learning_rate": 0.0019959551933058503, "loss": 0.2168, "step": 3869 }, { "epoch": 0.03359345839011814, "grad_norm": 0.08642578125, "learning_rate": 0.0019959523742843926, "loss": 0.1318, "step": 3870 }, { "epoch": 0.033602138870322304, "grad_norm": 0.07080078125, "learning_rate": 0.001995949554283135, "loss": 0.1504, "step": 3871 }, { "epoch": 0.03361081935052647, "grad_norm": 0.3203125, "learning_rate": 0.00199594673330208, "loss": 0.1719, "step": 3872 }, { "epoch": 0.033619499830730634, "grad_norm": 0.349609375, "learning_rate": 0.0019959439113412312, "loss": 0.21, "step": 3873 }, { "epoch": 0.0336281803109348, "grad_norm": 0.29296875, "learning_rate": 0.0019959410884005917, "loss": 0.1973, "step": 3874 }, { "epoch": 0.033636860791138964, "grad_norm": 0.2119140625, "learning_rate": 0.0019959382644801644, "loss": 0.1562, "step": 3875 }, { "epoch": 0.03364554127134313, "grad_norm": 0.10693359375, "learning_rate": 0.0019959354395799526, "loss": 0.168, "step": 3876 }, { "epoch": 0.033654221751547295, "grad_norm": 0.232421875, "learning_rate": 0.001995932613699959, "loss": 0.1855, "step": 3877 }, { "epoch": 0.03366290223175146, "grad_norm": 0.3359375, "learning_rate": 0.0019959297868401876, "loss": 0.1875, "step": 3878 }, { "epoch": 0.033671582711955625, "grad_norm": 0.35546875, "learning_rate": 0.00199592695900064, "loss": 0.2031, "step": 3879 }, { "epoch": 0.03368026319215979, "grad_norm": 0.10546875, "learning_rate": 0.001995924130181321, "loss": 0.1836, "step": 3880 }, { "epoch": 0.033688943672363955, "grad_norm": 0.30078125, "learning_rate": 0.001995921300382232, "loss": 0.167, "step": 3881 }, { "epoch": 0.03369762415256812, "grad_norm": 0.60546875, "learning_rate": 0.0019959184696033776, "loss": 0.2539, "step": 3882 }, { "epoch": 0.033706304632772285, "grad_norm": 0.37890625, "learning_rate": 0.0019959156378447597, "loss": 0.1719, "step": 3883 }, { "epoch": 0.03371498511297645, "grad_norm": 0.1728515625, "learning_rate": 0.0019959128051063825, "loss": 0.1807, "step": 3884 }, { "epoch": 0.033723665593180616, "grad_norm": 0.10888671875, "learning_rate": 0.001995909971388248, "loss": 0.1719, "step": 3885 }, { "epoch": 0.03373234607338478, "grad_norm": 0.58203125, "learning_rate": 0.00199590713669036, "loss": 0.1367, "step": 3886 }, { "epoch": 0.033741026553588946, "grad_norm": 0.396484375, "learning_rate": 0.0019959043010127214, "loss": 0.1836, "step": 3887 }, { "epoch": 0.03374970703379311, "grad_norm": 0.408203125, "learning_rate": 0.0019959014643553354, "loss": 0.1602, "step": 3888 }, { "epoch": 0.033758387513997276, "grad_norm": 0.171875, "learning_rate": 0.001995898626718205, "loss": 0.1641, "step": 3889 }, { "epoch": 0.03376706799420144, "grad_norm": 0.1455078125, "learning_rate": 0.001995895788101333, "loss": 0.1855, "step": 3890 }, { "epoch": 0.033775748474405606, "grad_norm": 0.57421875, "learning_rate": 0.001995892948504723, "loss": 0.1426, "step": 3891 }, { "epoch": 0.03378442895460977, "grad_norm": 0.6328125, "learning_rate": 0.001995890107928378, "loss": 0.1934, "step": 3892 }, { "epoch": 0.03379310943481394, "grad_norm": 0.53515625, "learning_rate": 0.0019958872663723014, "loss": 0.2031, "step": 3893 }, { "epoch": 0.0338017899150181, "grad_norm": 0.1513671875, "learning_rate": 0.0019958844238364957, "loss": 0.1992, "step": 3894 }, { "epoch": 0.03381047039522227, "grad_norm": 0.4296875, "learning_rate": 0.001995881580320964, "loss": 0.1738, "step": 3895 }, { "epoch": 0.03381915087542643, "grad_norm": 0.09912109375, "learning_rate": 0.00199587873582571, "loss": 0.1904, "step": 3896 }, { "epoch": 0.03382783135563059, "grad_norm": 0.5859375, "learning_rate": 0.001995875890350736, "loss": 0.1777, "step": 3897 }, { "epoch": 0.033836511835834755, "grad_norm": 0.12451171875, "learning_rate": 0.001995873043896046, "loss": 0.1611, "step": 3898 }, { "epoch": 0.03384519231603892, "grad_norm": 0.55078125, "learning_rate": 0.001995870196461643, "loss": 0.168, "step": 3899 }, { "epoch": 0.033853872796243085, "grad_norm": 0.462890625, "learning_rate": 0.0019958673480475293, "loss": 0.1846, "step": 3900 }, { "epoch": 0.03386255327644725, "grad_norm": 0.12255859375, "learning_rate": 0.0019958644986537086, "loss": 0.1611, "step": 3901 }, { "epoch": 0.033871233756651416, "grad_norm": 0.21875, "learning_rate": 0.0019958616482801837, "loss": 0.2266, "step": 3902 }, { "epoch": 0.03387991423685558, "grad_norm": 0.1044921875, "learning_rate": 0.0019958587969269585, "loss": 0.1387, "step": 3903 }, { "epoch": 0.033888594717059746, "grad_norm": 2.484375, "learning_rate": 0.001995855944594035, "loss": 0.457, "step": 3904 }, { "epoch": 0.03389727519726391, "grad_norm": 0.07275390625, "learning_rate": 0.0019958530912814174, "loss": 0.1553, "step": 3905 }, { "epoch": 0.033905955677468076, "grad_norm": 0.197265625, "learning_rate": 0.001995850236989108, "loss": 0.1504, "step": 3906 }, { "epoch": 0.03391463615767224, "grad_norm": 0.09814453125, "learning_rate": 0.0019958473817171104, "loss": 0.1797, "step": 3907 }, { "epoch": 0.033923316637876406, "grad_norm": 1.1484375, "learning_rate": 0.0019958445254654278, "loss": 0.5625, "step": 3908 }, { "epoch": 0.03393199711808057, "grad_norm": 0.50390625, "learning_rate": 0.0019958416682340626, "loss": 0.1885, "step": 3909 }, { "epoch": 0.03394067759828474, "grad_norm": 0.1962890625, "learning_rate": 0.0019958388100230185, "loss": 0.1641, "step": 3910 }, { "epoch": 0.0339493580784889, "grad_norm": 0.25390625, "learning_rate": 0.001995835950832299, "loss": 0.1738, "step": 3911 }, { "epoch": 0.03395803855869307, "grad_norm": 0.1611328125, "learning_rate": 0.001995833090661906, "loss": 0.1318, "step": 3912 }, { "epoch": 0.03396671903889723, "grad_norm": 0.4375, "learning_rate": 0.001995830229511844, "loss": 0.2266, "step": 3913 }, { "epoch": 0.0339753995191014, "grad_norm": 0.091796875, "learning_rate": 0.0019958273673821155, "loss": 0.2119, "step": 3914 }, { "epoch": 0.03398407999930556, "grad_norm": 0.75390625, "learning_rate": 0.0019958245042727236, "loss": 0.2578, "step": 3915 }, { "epoch": 0.03399276047950973, "grad_norm": 0.384765625, "learning_rate": 0.0019958216401836713, "loss": 0.3477, "step": 3916 }, { "epoch": 0.03400144095971389, "grad_norm": 0.12060546875, "learning_rate": 0.001995818775114962, "loss": 0.166, "step": 3917 }, { "epoch": 0.03401012143991806, "grad_norm": 0.359375, "learning_rate": 0.0019958159090665986, "loss": 0.2031, "step": 3918 }, { "epoch": 0.03401880192012222, "grad_norm": 0.1083984375, "learning_rate": 0.0019958130420385848, "loss": 0.2461, "step": 3919 }, { "epoch": 0.03402748240032639, "grad_norm": 0.10107421875, "learning_rate": 0.001995810174030923, "loss": 0.1689, "step": 3920 }, { "epoch": 0.03403616288053055, "grad_norm": 0.7578125, "learning_rate": 0.0019958073050436167, "loss": 0.2402, "step": 3921 }, { "epoch": 0.03404484336073472, "grad_norm": 0.2431640625, "learning_rate": 0.001995804435076669, "loss": 0.165, "step": 3922 }, { "epoch": 0.03405352384093888, "grad_norm": 0.1904296875, "learning_rate": 0.001995801564130083, "loss": 0.1797, "step": 3923 }, { "epoch": 0.03406220432114305, "grad_norm": 0.671875, "learning_rate": 0.001995798692203862, "loss": 0.2305, "step": 3924 }, { "epoch": 0.034070884801347213, "grad_norm": 0.373046875, "learning_rate": 0.0019957958192980086, "loss": 0.1992, "step": 3925 }, { "epoch": 0.03407956528155138, "grad_norm": 0.3203125, "learning_rate": 0.001995792945412527, "loss": 0.2129, "step": 3926 }, { "epoch": 0.034088245761755544, "grad_norm": 0.50390625, "learning_rate": 0.0019957900705474194, "loss": 0.21, "step": 3927 }, { "epoch": 0.0340969262419597, "grad_norm": 0.2734375, "learning_rate": 0.001995787194702689, "loss": 0.1553, "step": 3928 }, { "epoch": 0.03410560672216387, "grad_norm": 0.92578125, "learning_rate": 0.0019957843178783395, "loss": 0.4922, "step": 3929 }, { "epoch": 0.03411428720236803, "grad_norm": 0.439453125, "learning_rate": 0.0019957814400743735, "loss": 0.1816, "step": 3930 }, { "epoch": 0.0341229676825722, "grad_norm": 0.2412109375, "learning_rate": 0.001995778561290795, "loss": 0.2246, "step": 3931 }, { "epoch": 0.03413164816277636, "grad_norm": 0.1904296875, "learning_rate": 0.0019957756815276056, "loss": 0.1738, "step": 3932 }, { "epoch": 0.03414032864298053, "grad_norm": 0.11083984375, "learning_rate": 0.0019957728007848094, "loss": 0.2012, "step": 3933 }, { "epoch": 0.03414900912318469, "grad_norm": 0.16796875, "learning_rate": 0.00199576991906241, "loss": 0.1631, "step": 3934 }, { "epoch": 0.03415768960338886, "grad_norm": 0.1435546875, "learning_rate": 0.00199576703636041, "loss": 0.1572, "step": 3935 }, { "epoch": 0.03416637008359302, "grad_norm": 0.1875, "learning_rate": 0.0019957641526788124, "loss": 0.2207, "step": 3936 }, { "epoch": 0.03417505056379719, "grad_norm": 0.08251953125, "learning_rate": 0.001995761268017621, "loss": 0.1953, "step": 3937 }, { "epoch": 0.03418373104400135, "grad_norm": 0.140625, "learning_rate": 0.0019957583823768383, "loss": 0.25, "step": 3938 }, { "epoch": 0.03419241152420552, "grad_norm": 0.431640625, "learning_rate": 0.0019957554957564677, "loss": 0.1641, "step": 3939 }, { "epoch": 0.03420109200440968, "grad_norm": 0.49609375, "learning_rate": 0.0019957526081565125, "loss": 0.2168, "step": 3940 }, { "epoch": 0.03420977248461385, "grad_norm": 0.1376953125, "learning_rate": 0.0019957497195769755, "loss": 0.1777, "step": 3941 }, { "epoch": 0.034218452964818014, "grad_norm": 0.482421875, "learning_rate": 0.0019957468300178605, "loss": 0.2383, "step": 3942 }, { "epoch": 0.03422713344502218, "grad_norm": 0.279296875, "learning_rate": 0.0019957439394791693, "loss": 0.1719, "step": 3943 }, { "epoch": 0.034235813925226344, "grad_norm": 0.359375, "learning_rate": 0.001995741047960907, "loss": 0.1484, "step": 3944 }, { "epoch": 0.03424449440543051, "grad_norm": 0.326171875, "learning_rate": 0.001995738155463075, "loss": 0.1729, "step": 3945 }, { "epoch": 0.034253174885634674, "grad_norm": 0.232421875, "learning_rate": 0.0019957352619856777, "loss": 0.2422, "step": 3946 }, { "epoch": 0.03426185536583884, "grad_norm": 0.08203125, "learning_rate": 0.0019957323675287176, "loss": 0.1504, "step": 3947 }, { "epoch": 0.034270535846043004, "grad_norm": 0.150390625, "learning_rate": 0.0019957294720921977, "loss": 0.1621, "step": 3948 }, { "epoch": 0.03427921632624717, "grad_norm": 0.66015625, "learning_rate": 0.001995726575676122, "loss": 0.167, "step": 3949 }, { "epoch": 0.034287896806451335, "grad_norm": 0.6875, "learning_rate": 0.001995723678280493, "loss": 0.1992, "step": 3950 }, { "epoch": 0.0342965772866555, "grad_norm": 0.1875, "learning_rate": 0.001995720779905314, "loss": 0.1963, "step": 3951 }, { "epoch": 0.034305257766859665, "grad_norm": 1.6171875, "learning_rate": 0.0019957178805505883, "loss": 0.2852, "step": 3952 }, { "epoch": 0.03431393824706383, "grad_norm": 0.5234375, "learning_rate": 0.0019957149802163187, "loss": 0.2402, "step": 3953 }, { "epoch": 0.034322618727267995, "grad_norm": 0.30078125, "learning_rate": 0.001995712078902509, "loss": 0.1709, "step": 3954 }, { "epoch": 0.03433129920747216, "grad_norm": 0.1376953125, "learning_rate": 0.0019957091766091618, "loss": 0.1445, "step": 3955 }, { "epoch": 0.034339979687676325, "grad_norm": 0.7421875, "learning_rate": 0.0019957062733362806, "loss": 0.1748, "step": 3956 }, { "epoch": 0.03434866016788049, "grad_norm": 0.796875, "learning_rate": 0.0019957033690838686, "loss": 0.209, "step": 3957 }, { "epoch": 0.034357340648084655, "grad_norm": 0.443359375, "learning_rate": 0.001995700463851929, "loss": 0.1914, "step": 3958 }, { "epoch": 0.034366021128288814, "grad_norm": 0.404296875, "learning_rate": 0.0019956975576404645, "loss": 0.1816, "step": 3959 }, { "epoch": 0.03437470160849298, "grad_norm": 0.09619140625, "learning_rate": 0.0019956946504494787, "loss": 0.1992, "step": 3960 }, { "epoch": 0.034383382088697144, "grad_norm": 0.240234375, "learning_rate": 0.001995691742278975, "loss": 0.1963, "step": 3961 }, { "epoch": 0.03439206256890131, "grad_norm": 0.10986328125, "learning_rate": 0.0019956888331289554, "loss": 0.1621, "step": 3962 }, { "epoch": 0.034400743049105474, "grad_norm": 0.1533203125, "learning_rate": 0.001995685922999425, "loss": 0.165, "step": 3963 }, { "epoch": 0.03440942352930964, "grad_norm": 0.1103515625, "learning_rate": 0.001995683011890385, "loss": 0.1699, "step": 3964 }, { "epoch": 0.034418104009513804, "grad_norm": 0.38671875, "learning_rate": 0.00199568009980184, "loss": 0.207, "step": 3965 }, { "epoch": 0.03442678448971797, "grad_norm": 0.197265625, "learning_rate": 0.0019956771867337924, "loss": 0.2012, "step": 3966 }, { "epoch": 0.034435464969922135, "grad_norm": 0.1474609375, "learning_rate": 0.001995674272686246, "loss": 0.2246, "step": 3967 }, { "epoch": 0.0344441454501263, "grad_norm": 0.5546875, "learning_rate": 0.001995671357659204, "loss": 0.1455, "step": 3968 }, { "epoch": 0.034452825930330465, "grad_norm": 0.37890625, "learning_rate": 0.001995668441652669, "loss": 0.1797, "step": 3969 }, { "epoch": 0.03446150641053463, "grad_norm": 0.337890625, "learning_rate": 0.001995665524666644, "loss": 0.1738, "step": 3970 }, { "epoch": 0.034470186890738795, "grad_norm": 0.6171875, "learning_rate": 0.001995662606701133, "loss": 0.1982, "step": 3971 }, { "epoch": 0.03447886737094296, "grad_norm": 0.64453125, "learning_rate": 0.001995659687756139, "loss": 0.248, "step": 3972 }, { "epoch": 0.034487547851147125, "grad_norm": 0.267578125, "learning_rate": 0.0019956567678316647, "loss": 0.1992, "step": 3973 }, { "epoch": 0.03449622833135129, "grad_norm": 0.08935546875, "learning_rate": 0.0019956538469277137, "loss": 0.1475, "step": 3974 }, { "epoch": 0.034504908811555456, "grad_norm": 0.4921875, "learning_rate": 0.0019956509250442893, "loss": 0.1543, "step": 3975 }, { "epoch": 0.03451358929175962, "grad_norm": 0.16796875, "learning_rate": 0.001995648002181394, "loss": 0.1533, "step": 3976 }, { "epoch": 0.034522269771963786, "grad_norm": 0.3359375, "learning_rate": 0.0019956450783390318, "loss": 0.1963, "step": 3977 }, { "epoch": 0.03453095025216795, "grad_norm": 0.396484375, "learning_rate": 0.0019956421535172056, "loss": 0.1797, "step": 3978 }, { "epoch": 0.034539630732372116, "grad_norm": 0.6875, "learning_rate": 0.0019956392277159186, "loss": 0.207, "step": 3979 }, { "epoch": 0.03454831121257628, "grad_norm": 0.55859375, "learning_rate": 0.001995636300935174, "loss": 0.2109, "step": 3980 }, { "epoch": 0.034556991692780446, "grad_norm": 0.12109375, "learning_rate": 0.0019956333731749754, "loss": 0.1621, "step": 3981 }, { "epoch": 0.03456567217298461, "grad_norm": 0.09521484375, "learning_rate": 0.001995630444435325, "loss": 0.1484, "step": 3982 }, { "epoch": 0.03457435265318878, "grad_norm": 0.234375, "learning_rate": 0.001995627514716227, "loss": 0.2227, "step": 3983 }, { "epoch": 0.03458303313339294, "grad_norm": 0.1162109375, "learning_rate": 0.001995624584017684, "loss": 0.2197, "step": 3984 }, { "epoch": 0.03459171361359711, "grad_norm": 0.287109375, "learning_rate": 0.0019956216523396996, "loss": 0.2012, "step": 3985 }, { "epoch": 0.03460039409380127, "grad_norm": 0.07421875, "learning_rate": 0.0019956187196822767, "loss": 0.1562, "step": 3986 }, { "epoch": 0.03460907457400544, "grad_norm": 0.1044921875, "learning_rate": 0.0019956157860454185, "loss": 0.1729, "step": 3987 }, { "epoch": 0.0346177550542096, "grad_norm": 0.408203125, "learning_rate": 0.0019956128514291285, "loss": 0.1543, "step": 3988 }, { "epoch": 0.03462643553441376, "grad_norm": 0.1630859375, "learning_rate": 0.0019956099158334097, "loss": 0.1387, "step": 3989 }, { "epoch": 0.034635116014617925, "grad_norm": 0.1708984375, "learning_rate": 0.0019956069792582652, "loss": 0.1621, "step": 3990 }, { "epoch": 0.03464379649482209, "grad_norm": 0.11865234375, "learning_rate": 0.0019956040417036984, "loss": 0.1455, "step": 3991 }, { "epoch": 0.034652476975026256, "grad_norm": 0.33984375, "learning_rate": 0.001995601103169713, "loss": 0.2129, "step": 3992 }, { "epoch": 0.03466115745523042, "grad_norm": 0.33984375, "learning_rate": 0.001995598163656311, "loss": 0.1885, "step": 3993 }, { "epoch": 0.034669837935434586, "grad_norm": 1.7109375, "learning_rate": 0.0019955952231634965, "loss": 0.3242, "step": 3994 }, { "epoch": 0.03467851841563875, "grad_norm": 0.37109375, "learning_rate": 0.0019955922816912728, "loss": 0.2441, "step": 3995 }, { "epoch": 0.034687198895842916, "grad_norm": 0.1708984375, "learning_rate": 0.0019955893392396423, "loss": 0.2012, "step": 3996 }, { "epoch": 0.03469587937604708, "grad_norm": 0.1611328125, "learning_rate": 0.0019955863958086096, "loss": 0.1641, "step": 3997 }, { "epoch": 0.034704559856251246, "grad_norm": 0.13671875, "learning_rate": 0.001995583451398176, "loss": 0.1719, "step": 3998 }, { "epoch": 0.03471324033645541, "grad_norm": 0.38671875, "learning_rate": 0.0019955805060083466, "loss": 0.1748, "step": 3999 }, { "epoch": 0.03472192081665958, "grad_norm": 0.73828125, "learning_rate": 0.0019955775596391234, "loss": 0.1484, "step": 4000 }, { "epoch": 0.03473060129686374, "grad_norm": 1.1953125, "learning_rate": 0.0019955746122905104, "loss": 0.4453, "step": 4001 }, { "epoch": 0.03473928177706791, "grad_norm": 0.255859375, "learning_rate": 0.00199557166396251, "loss": 0.2109, "step": 4002 }, { "epoch": 0.03474796225727207, "grad_norm": 0.1923828125, "learning_rate": 0.0019955687146551264, "loss": 0.2441, "step": 4003 }, { "epoch": 0.03475664273747624, "grad_norm": 0.62109375, "learning_rate": 0.001995565764368362, "loss": 0.2324, "step": 4004 }, { "epoch": 0.0347653232176804, "grad_norm": 0.2265625, "learning_rate": 0.0019955628131022203, "loss": 0.1318, "step": 4005 }, { "epoch": 0.03477400369788457, "grad_norm": 0.11962890625, "learning_rate": 0.0019955598608567045, "loss": 0.1758, "step": 4006 }, { "epoch": 0.03478268417808873, "grad_norm": 0.08203125, "learning_rate": 0.001995556907631818, "loss": 0.1572, "step": 4007 }, { "epoch": 0.0347913646582929, "grad_norm": 0.357421875, "learning_rate": 0.001995553953427564, "loss": 0.2021, "step": 4008 }, { "epoch": 0.03480004513849706, "grad_norm": 0.142578125, "learning_rate": 0.001995550998243946, "loss": 0.2031, "step": 4009 }, { "epoch": 0.03480872561870123, "grad_norm": 0.12060546875, "learning_rate": 0.0019955480420809665, "loss": 0.1729, "step": 4010 }, { "epoch": 0.03481740609890539, "grad_norm": 0.294921875, "learning_rate": 0.001995545084938629, "loss": 0.1357, "step": 4011 }, { "epoch": 0.03482608657910956, "grad_norm": 0.291015625, "learning_rate": 0.001995542126816937, "loss": 0.1582, "step": 4012 }, { "epoch": 0.03483476705931372, "grad_norm": 0.4765625, "learning_rate": 0.0019955391677158934, "loss": 0.1553, "step": 4013 }, { "epoch": 0.03484344753951789, "grad_norm": 0.1259765625, "learning_rate": 0.0019955362076355017, "loss": 0.1523, "step": 4014 }, { "epoch": 0.03485212801972205, "grad_norm": 0.236328125, "learning_rate": 0.0019955332465757653, "loss": 0.1396, "step": 4015 }, { "epoch": 0.03486080849992622, "grad_norm": 0.3125, "learning_rate": 0.001995530284536687, "loss": 0.1855, "step": 4016 }, { "epoch": 0.034869488980130384, "grad_norm": 0.259765625, "learning_rate": 0.0019955273215182703, "loss": 0.1543, "step": 4017 }, { "epoch": 0.03487816946033455, "grad_norm": 0.1044921875, "learning_rate": 0.0019955243575205182, "loss": 0.1436, "step": 4018 }, { "epoch": 0.034886849940538714, "grad_norm": 0.1640625, "learning_rate": 0.0019955213925434345, "loss": 0.1553, "step": 4019 }, { "epoch": 0.03489553042074287, "grad_norm": 0.58203125, "learning_rate": 0.0019955184265870217, "loss": 0.1895, "step": 4020 }, { "epoch": 0.03490421090094704, "grad_norm": 0.31640625, "learning_rate": 0.0019955154596512834, "loss": 0.165, "step": 4021 }, { "epoch": 0.0349128913811512, "grad_norm": 0.376953125, "learning_rate": 0.001995512491736223, "loss": 0.1533, "step": 4022 }, { "epoch": 0.03492157186135537, "grad_norm": 0.330078125, "learning_rate": 0.0019955095228418435, "loss": 0.2246, "step": 4023 }, { "epoch": 0.03493025234155953, "grad_norm": 0.23828125, "learning_rate": 0.001995506552968148, "loss": 0.1299, "step": 4024 }, { "epoch": 0.0349389328217637, "grad_norm": 0.09912109375, "learning_rate": 0.0019955035821151404, "loss": 0.1357, "step": 4025 }, { "epoch": 0.03494761330196786, "grad_norm": 0.62890625, "learning_rate": 0.0019955006102828237, "loss": 0.1699, "step": 4026 }, { "epoch": 0.03495629378217203, "grad_norm": 0.20703125, "learning_rate": 0.001995497637471201, "loss": 0.1846, "step": 4027 }, { "epoch": 0.03496497426237619, "grad_norm": 0.2734375, "learning_rate": 0.0019954946636802752, "loss": 0.1494, "step": 4028 }, { "epoch": 0.03497365474258036, "grad_norm": 0.4296875, "learning_rate": 0.0019954916889100495, "loss": 0.2129, "step": 4029 }, { "epoch": 0.03498233522278452, "grad_norm": 0.1552734375, "learning_rate": 0.0019954887131605286, "loss": 0.1719, "step": 4030 }, { "epoch": 0.03499101570298869, "grad_norm": 0.68359375, "learning_rate": 0.0019954857364317138, "loss": 0.1289, "step": 4031 }, { "epoch": 0.034999696183192854, "grad_norm": 0.1083984375, "learning_rate": 0.0019954827587236094, "loss": 0.1953, "step": 4032 }, { "epoch": 0.03500837666339702, "grad_norm": 0.59375, "learning_rate": 0.001995479780036219, "loss": 0.1641, "step": 4033 }, { "epoch": 0.035017057143601184, "grad_norm": 0.458984375, "learning_rate": 0.001995476800369545, "loss": 0.1719, "step": 4034 }, { "epoch": 0.03502573762380535, "grad_norm": 0.2255859375, "learning_rate": 0.001995473819723591, "loss": 0.1797, "step": 4035 }, { "epoch": 0.035034418104009514, "grad_norm": 0.234375, "learning_rate": 0.00199547083809836, "loss": 0.1787, "step": 4036 }, { "epoch": 0.03504309858421368, "grad_norm": 0.11083984375, "learning_rate": 0.001995467855493856, "loss": 0.2148, "step": 4037 }, { "epoch": 0.035051779064417844, "grad_norm": 0.5625, "learning_rate": 0.0019954648719100816, "loss": 0.1582, "step": 4038 }, { "epoch": 0.03506045954462201, "grad_norm": 0.177734375, "learning_rate": 0.0019954618873470405, "loss": 0.1689, "step": 4039 }, { "epoch": 0.035069140024826174, "grad_norm": 0.365234375, "learning_rate": 0.0019954589018047354, "loss": 0.1602, "step": 4040 }, { "epoch": 0.03507782050503034, "grad_norm": 0.5390625, "learning_rate": 0.0019954559152831706, "loss": 0.1895, "step": 4041 }, { "epoch": 0.035086500985234505, "grad_norm": 0.5703125, "learning_rate": 0.0019954529277823475, "loss": 0.2148, "step": 4042 }, { "epoch": 0.03509518146543867, "grad_norm": 0.400390625, "learning_rate": 0.001995449939302271, "loss": 0.2021, "step": 4043 }, { "epoch": 0.035103861945642835, "grad_norm": 0.2333984375, "learning_rate": 0.0019954469498429444, "loss": 0.1719, "step": 4044 }, { "epoch": 0.035112542425847, "grad_norm": 0.375, "learning_rate": 0.00199544395940437, "loss": 0.1924, "step": 4045 }, { "epoch": 0.035121222906051165, "grad_norm": 0.515625, "learning_rate": 0.001995440967986552, "loss": 0.1895, "step": 4046 }, { "epoch": 0.03512990338625533, "grad_norm": 0.11376953125, "learning_rate": 0.001995437975589493, "loss": 0.1494, "step": 4047 }, { "epoch": 0.035138583866459495, "grad_norm": 0.1513671875, "learning_rate": 0.001995434982213196, "loss": 0.1689, "step": 4048 }, { "epoch": 0.03514726434666366, "grad_norm": 0.07470703125, "learning_rate": 0.0019954319878576652, "loss": 0.1875, "step": 4049 }, { "epoch": 0.035155944826867826, "grad_norm": 0.0634765625, "learning_rate": 0.0019954289925229033, "loss": 0.1719, "step": 4050 }, { "epoch": 0.035164625307071984, "grad_norm": 0.072265625, "learning_rate": 0.001995425996208914, "loss": 0.123, "step": 4051 }, { "epoch": 0.03517330578727615, "grad_norm": 0.416015625, "learning_rate": 0.0019954229989157003, "loss": 0.1348, "step": 4052 }, { "epoch": 0.035181986267480314, "grad_norm": 0.24609375, "learning_rate": 0.001995420000643265, "loss": 0.2061, "step": 4053 }, { "epoch": 0.03519066674768448, "grad_norm": 0.1796875, "learning_rate": 0.001995417001391612, "loss": 0.1699, "step": 4054 }, { "epoch": 0.035199347227888644, "grad_norm": 0.1396484375, "learning_rate": 0.0019954140011607447, "loss": 0.1816, "step": 4055 }, { "epoch": 0.03520802770809281, "grad_norm": 0.275390625, "learning_rate": 0.0019954109999506657, "loss": 0.1934, "step": 4056 }, { "epoch": 0.035216708188296975, "grad_norm": 0.166015625, "learning_rate": 0.001995407997761379, "loss": 0.1777, "step": 4057 }, { "epoch": 0.03522538866850114, "grad_norm": 0.376953125, "learning_rate": 0.0019954049945928878, "loss": 0.2109, "step": 4058 }, { "epoch": 0.035234069148705305, "grad_norm": 0.22265625, "learning_rate": 0.0019954019904451946, "loss": 0.2207, "step": 4059 }, { "epoch": 0.03524274962890947, "grad_norm": 0.83984375, "learning_rate": 0.0019953989853183033, "loss": 0.1689, "step": 4060 }, { "epoch": 0.035251430109113635, "grad_norm": 0.1630859375, "learning_rate": 0.001995395979212217, "loss": 0.208, "step": 4061 }, { "epoch": 0.0352601105893178, "grad_norm": 0.26953125, "learning_rate": 0.0019953929721269396, "loss": 0.1641, "step": 4062 }, { "epoch": 0.035268791069521965, "grad_norm": 0.58203125, "learning_rate": 0.0019953899640624737, "loss": 0.1602, "step": 4063 }, { "epoch": 0.03527747154972613, "grad_norm": 0.365234375, "learning_rate": 0.0019953869550188227, "loss": 0.1836, "step": 4064 }, { "epoch": 0.035286152029930296, "grad_norm": 0.333984375, "learning_rate": 0.00199538394499599, "loss": 0.1562, "step": 4065 }, { "epoch": 0.03529483251013446, "grad_norm": 0.54296875, "learning_rate": 0.001995380933993979, "loss": 0.6406, "step": 4066 }, { "epoch": 0.035303512990338626, "grad_norm": 0.55859375, "learning_rate": 0.0019953779220127925, "loss": 0.2227, "step": 4067 }, { "epoch": 0.03531219347054279, "grad_norm": 0.0888671875, "learning_rate": 0.0019953749090524347, "loss": 0.1816, "step": 4068 }, { "epoch": 0.035320873950746956, "grad_norm": 0.427734375, "learning_rate": 0.001995371895112908, "loss": 0.1719, "step": 4069 }, { "epoch": 0.03532955443095112, "grad_norm": 0.1103515625, "learning_rate": 0.001995368880194216, "loss": 0.2061, "step": 4070 }, { "epoch": 0.035338234911155286, "grad_norm": 0.7578125, "learning_rate": 0.001995365864296362, "loss": 0.1855, "step": 4071 }, { "epoch": 0.03534691539135945, "grad_norm": 0.10693359375, "learning_rate": 0.00199536284741935, "loss": 0.166, "step": 4072 }, { "epoch": 0.035355595871563616, "grad_norm": 0.15234375, "learning_rate": 0.001995359829563182, "loss": 0.1719, "step": 4073 }, { "epoch": 0.03536427635176778, "grad_norm": 0.34765625, "learning_rate": 0.0019953568107278622, "loss": 0.1504, "step": 4074 }, { "epoch": 0.03537295683197195, "grad_norm": 0.26171875, "learning_rate": 0.0019953537909133934, "loss": 0.1426, "step": 4075 }, { "epoch": 0.03538163731217611, "grad_norm": 1.0234375, "learning_rate": 0.0019953507701197795, "loss": 0.2988, "step": 4076 }, { "epoch": 0.03539031779238028, "grad_norm": 0.154296875, "learning_rate": 0.001995347748347023, "loss": 0.1846, "step": 4077 }, { "epoch": 0.03539899827258444, "grad_norm": 0.1259765625, "learning_rate": 0.001995344725595128, "loss": 0.1758, "step": 4078 }, { "epoch": 0.03540767875278861, "grad_norm": 0.357421875, "learning_rate": 0.0019953417018640975, "loss": 0.1846, "step": 4079 }, { "epoch": 0.03541635923299277, "grad_norm": 0.275390625, "learning_rate": 0.001995338677153934, "loss": 0.2314, "step": 4080 }, { "epoch": 0.03542503971319694, "grad_norm": 0.330078125, "learning_rate": 0.0019953356514646426, "loss": 0.1514, "step": 4081 }, { "epoch": 0.035433720193401096, "grad_norm": 0.64453125, "learning_rate": 0.0019953326247962254, "loss": 0.1465, "step": 4082 }, { "epoch": 0.03544240067360526, "grad_norm": 0.302734375, "learning_rate": 0.0019953295971486856, "loss": 0.168, "step": 4083 }, { "epoch": 0.035451081153809426, "grad_norm": 0.373046875, "learning_rate": 0.0019953265685220267, "loss": 0.1553, "step": 4084 }, { "epoch": 0.03545976163401359, "grad_norm": 0.3046875, "learning_rate": 0.001995323538916252, "loss": 0.168, "step": 4085 }, { "epoch": 0.035468442114217756, "grad_norm": 0.1181640625, "learning_rate": 0.0019953205083313654, "loss": 0.1826, "step": 4086 }, { "epoch": 0.03547712259442192, "grad_norm": 0.1953125, "learning_rate": 0.0019953174767673694, "loss": 0.1777, "step": 4087 }, { "epoch": 0.035485803074626086, "grad_norm": 0.302734375, "learning_rate": 0.0019953144442242682, "loss": 0.1855, "step": 4088 }, { "epoch": 0.03549448355483025, "grad_norm": 0.1181640625, "learning_rate": 0.0019953114107020644, "loss": 0.1797, "step": 4089 }, { "epoch": 0.03550316403503442, "grad_norm": 0.1279296875, "learning_rate": 0.001995308376200761, "loss": 0.1787, "step": 4090 }, { "epoch": 0.03551184451523858, "grad_norm": 0.16015625, "learning_rate": 0.0019953053407203622, "loss": 0.1836, "step": 4091 }, { "epoch": 0.03552052499544275, "grad_norm": 0.275390625, "learning_rate": 0.001995302304260871, "loss": 0.2676, "step": 4092 }, { "epoch": 0.03552920547564691, "grad_norm": 0.1005859375, "learning_rate": 0.0019952992668222902, "loss": 0.1797, "step": 4093 }, { "epoch": 0.03553788595585108, "grad_norm": 0.2470703125, "learning_rate": 0.001995296228404624, "loss": 0.1504, "step": 4094 }, { "epoch": 0.03554656643605524, "grad_norm": 0.45703125, "learning_rate": 0.0019952931890078753, "loss": 0.1582, "step": 4095 }, { "epoch": 0.03555524691625941, "grad_norm": 0.494140625, "learning_rate": 0.001995290148632047, "loss": 0.2324, "step": 4096 }, { "epoch": 0.03556392739646357, "grad_norm": 0.11181640625, "learning_rate": 0.0019952871072771434, "loss": 0.1807, "step": 4097 }, { "epoch": 0.03557260787666774, "grad_norm": 0.12353515625, "learning_rate": 0.001995284064943167, "loss": 0.2344, "step": 4098 }, { "epoch": 0.0355812883568719, "grad_norm": 0.123046875, "learning_rate": 0.0019952810216301215, "loss": 0.25, "step": 4099 }, { "epoch": 0.03558996883707607, "grad_norm": 0.39453125, "learning_rate": 0.00199527797733801, "loss": 0.249, "step": 4100 }, { "epoch": 0.03559864931728023, "grad_norm": 0.1435546875, "learning_rate": 0.001995274932066836, "loss": 0.1865, "step": 4101 }, { "epoch": 0.0356073297974844, "grad_norm": 0.13671875, "learning_rate": 0.0019952718858166026, "loss": 0.2012, "step": 4102 }, { "epoch": 0.03561601027768856, "grad_norm": 0.2109375, "learning_rate": 0.001995268838587314, "loss": 0.1523, "step": 4103 }, { "epoch": 0.03562469075789273, "grad_norm": 0.515625, "learning_rate": 0.001995265790378972, "loss": 0.1484, "step": 4104 }, { "epoch": 0.03563337123809689, "grad_norm": 0.1279296875, "learning_rate": 0.001995262741191581, "loss": 0.1514, "step": 4105 }, { "epoch": 0.03564205171830106, "grad_norm": 0.11865234375, "learning_rate": 0.0019952596910251443, "loss": 0.1641, "step": 4106 }, { "epoch": 0.035650732198505224, "grad_norm": 0.8125, "learning_rate": 0.001995256639879665, "loss": 0.457, "step": 4107 }, { "epoch": 0.03565941267870939, "grad_norm": 0.26171875, "learning_rate": 0.0019952535877551464, "loss": 0.1543, "step": 4108 }, { "epoch": 0.035668093158913554, "grad_norm": 0.1826171875, "learning_rate": 0.0019952505346515917, "loss": 0.1953, "step": 4109 }, { "epoch": 0.03567677363911772, "grad_norm": 0.39453125, "learning_rate": 0.001995247480569005, "loss": 0.2002, "step": 4110 }, { "epoch": 0.035685454119321884, "grad_norm": 0.4140625, "learning_rate": 0.001995244425507389, "loss": 0.1826, "step": 4111 }, { "epoch": 0.03569413459952605, "grad_norm": 0.28515625, "learning_rate": 0.001995241369466747, "loss": 0.2188, "step": 4112 }, { "epoch": 0.03570281507973021, "grad_norm": 0.2265625, "learning_rate": 0.0019952383124470824, "loss": 0.1992, "step": 4113 }, { "epoch": 0.03571149555993437, "grad_norm": 0.07568359375, "learning_rate": 0.0019952352544483987, "loss": 0.1807, "step": 4114 }, { "epoch": 0.03572017604013854, "grad_norm": 0.5078125, "learning_rate": 0.0019952321954706995, "loss": 0.1875, "step": 4115 }, { "epoch": 0.0357288565203427, "grad_norm": 0.359375, "learning_rate": 0.0019952291355139875, "loss": 0.1836, "step": 4116 }, { "epoch": 0.03573753700054687, "grad_norm": 0.43359375, "learning_rate": 0.0019952260745782664, "loss": 0.2168, "step": 4117 }, { "epoch": 0.03574621748075103, "grad_norm": 0.169921875, "learning_rate": 0.0019952230126635394, "loss": 0.1602, "step": 4118 }, { "epoch": 0.0357548979609552, "grad_norm": 0.21484375, "learning_rate": 0.0019952199497698104, "loss": 0.2314, "step": 4119 }, { "epoch": 0.03576357844115936, "grad_norm": 0.2734375, "learning_rate": 0.001995216885897082, "loss": 0.1738, "step": 4120 }, { "epoch": 0.03577225892136353, "grad_norm": 0.4609375, "learning_rate": 0.0019952138210453576, "loss": 0.2227, "step": 4121 }, { "epoch": 0.035780939401567693, "grad_norm": 0.134765625, "learning_rate": 0.001995210755214641, "loss": 0.1719, "step": 4122 }, { "epoch": 0.03578961988177186, "grad_norm": 0.12158203125, "learning_rate": 0.0019952076884049356, "loss": 0.2148, "step": 4123 }, { "epoch": 0.035798300361976024, "grad_norm": 0.1904296875, "learning_rate": 0.0019952046206162446, "loss": 0.1953, "step": 4124 }, { "epoch": 0.03580698084218019, "grad_norm": 0.1455078125, "learning_rate": 0.001995201551848571, "loss": 0.1699, "step": 4125 }, { "epoch": 0.035815661322384354, "grad_norm": 0.12890625, "learning_rate": 0.0019951984821019184, "loss": 0.1758, "step": 4126 }, { "epoch": 0.03582434180258852, "grad_norm": 0.1669921875, "learning_rate": 0.0019951954113762906, "loss": 0.1445, "step": 4127 }, { "epoch": 0.035833022282792684, "grad_norm": 0.62109375, "learning_rate": 0.00199519233967169, "loss": 0.1699, "step": 4128 }, { "epoch": 0.03584170276299685, "grad_norm": 0.19921875, "learning_rate": 0.0019951892669881205, "loss": 0.1777, "step": 4129 }, { "epoch": 0.035850383243201014, "grad_norm": 0.1123046875, "learning_rate": 0.0019951861933255855, "loss": 0.1709, "step": 4130 }, { "epoch": 0.03585906372340518, "grad_norm": 0.443359375, "learning_rate": 0.001995183118684089, "loss": 0.2188, "step": 4131 }, { "epoch": 0.035867744203609345, "grad_norm": 0.2041015625, "learning_rate": 0.001995180043063633, "loss": 0.1582, "step": 4132 }, { "epoch": 0.03587642468381351, "grad_norm": 0.1201171875, "learning_rate": 0.001995176966464222, "loss": 0.2578, "step": 4133 }, { "epoch": 0.035885105164017675, "grad_norm": 0.1611328125, "learning_rate": 0.0019951738888858584, "loss": 0.1953, "step": 4134 }, { "epoch": 0.03589378564422184, "grad_norm": 0.376953125, "learning_rate": 0.0019951708103285466, "loss": 0.1973, "step": 4135 }, { "epoch": 0.035902466124426005, "grad_norm": 0.0712890625, "learning_rate": 0.001995167730792289, "loss": 0.1631, "step": 4136 }, { "epoch": 0.03591114660463017, "grad_norm": 0.37890625, "learning_rate": 0.0019951646502770898, "loss": 0.1953, "step": 4137 }, { "epoch": 0.035919827084834335, "grad_norm": 0.197265625, "learning_rate": 0.001995161568782952, "loss": 0.1748, "step": 4138 }, { "epoch": 0.0359285075650385, "grad_norm": 0.138671875, "learning_rate": 0.0019951584863098786, "loss": 0.1826, "step": 4139 }, { "epoch": 0.035937188045242666, "grad_norm": 0.283203125, "learning_rate": 0.001995155402857874, "loss": 0.1338, "step": 4140 }, { "epoch": 0.03594586852544683, "grad_norm": 0.310546875, "learning_rate": 0.00199515231842694, "loss": 0.1504, "step": 4141 }, { "epoch": 0.035954549005650996, "grad_norm": 0.1171875, "learning_rate": 0.0019951492330170816, "loss": 0.1641, "step": 4142 }, { "epoch": 0.035963229485855154, "grad_norm": 0.080078125, "learning_rate": 0.001995146146628301, "loss": 0.1699, "step": 4143 }, { "epoch": 0.03597190996605932, "grad_norm": 0.7578125, "learning_rate": 0.001995143059260602, "loss": 0.1855, "step": 4144 }, { "epoch": 0.035980590446263484, "grad_norm": 0.1416015625, "learning_rate": 0.001995139970913988, "loss": 0.1592, "step": 4145 }, { "epoch": 0.03598927092646765, "grad_norm": 0.09521484375, "learning_rate": 0.0019951368815884626, "loss": 0.1328, "step": 4146 }, { "epoch": 0.035997951406671815, "grad_norm": 0.1982421875, "learning_rate": 0.0019951337912840292, "loss": 0.1699, "step": 4147 }, { "epoch": 0.03600663188687598, "grad_norm": 0.1806640625, "learning_rate": 0.0019951307000006906, "loss": 0.1367, "step": 4148 }, { "epoch": 0.036015312367080145, "grad_norm": 0.099609375, "learning_rate": 0.0019951276077384505, "loss": 0.1699, "step": 4149 }, { "epoch": 0.03602399284728431, "grad_norm": 0.10302734375, "learning_rate": 0.0019951245144973124, "loss": 0.1621, "step": 4150 }, { "epoch": 0.036032673327488475, "grad_norm": 0.287109375, "learning_rate": 0.0019951214202772794, "loss": 0.1689, "step": 4151 }, { "epoch": 0.03604135380769264, "grad_norm": 0.55859375, "learning_rate": 0.0019951183250783554, "loss": 0.1807, "step": 4152 }, { "epoch": 0.036050034287896805, "grad_norm": 0.2080078125, "learning_rate": 0.001995115228900543, "loss": 0.1904, "step": 4153 }, { "epoch": 0.03605871476810097, "grad_norm": 0.2138671875, "learning_rate": 0.001995112131743846, "loss": 0.1064, "step": 4154 }, { "epoch": 0.036067395248305135, "grad_norm": 0.09619140625, "learning_rate": 0.0019951090336082686, "loss": 0.2031, "step": 4155 }, { "epoch": 0.0360760757285093, "grad_norm": 0.09912109375, "learning_rate": 0.001995105934493813, "loss": 0.2012, "step": 4156 }, { "epoch": 0.036084756208713466, "grad_norm": 0.302734375, "learning_rate": 0.0019951028344004826, "loss": 0.2188, "step": 4157 }, { "epoch": 0.03609343668891763, "grad_norm": 0.421875, "learning_rate": 0.0019950997333282815, "loss": 0.2031, "step": 4158 }, { "epoch": 0.036102117169121796, "grad_norm": 0.173828125, "learning_rate": 0.001995096631277213, "loss": 0.1582, "step": 4159 }, { "epoch": 0.03611079764932596, "grad_norm": 0.1435546875, "learning_rate": 0.00199509352824728, "loss": 0.2031, "step": 4160 }, { "epoch": 0.036119478129530126, "grad_norm": 0.53125, "learning_rate": 0.001995090424238486, "loss": 0.5156, "step": 4161 }, { "epoch": 0.03612815860973429, "grad_norm": 0.53125, "learning_rate": 0.001995087319250835, "loss": 0.1797, "step": 4162 }, { "epoch": 0.036136839089938456, "grad_norm": 0.302734375, "learning_rate": 0.0019950842132843297, "loss": 0.1797, "step": 4163 }, { "epoch": 0.03614551957014262, "grad_norm": 0.40234375, "learning_rate": 0.0019950811063389738, "loss": 0.1582, "step": 4164 }, { "epoch": 0.03615420005034679, "grad_norm": 0.447265625, "learning_rate": 0.0019950779984147706, "loss": 0.1709, "step": 4165 }, { "epoch": 0.03616288053055095, "grad_norm": 0.107421875, "learning_rate": 0.0019950748895117237, "loss": 0.1797, "step": 4166 }, { "epoch": 0.03617156101075512, "grad_norm": 0.6796875, "learning_rate": 0.001995071779629836, "loss": 0.1973, "step": 4167 }, { "epoch": 0.03618024149095928, "grad_norm": 0.53125, "learning_rate": 0.0019950686687691117, "loss": 0.2012, "step": 4168 }, { "epoch": 0.03618892197116345, "grad_norm": 0.2421875, "learning_rate": 0.0019950655569295535, "loss": 0.1377, "step": 4169 }, { "epoch": 0.03619760245136761, "grad_norm": 0.287109375, "learning_rate": 0.001995062444111165, "loss": 0.1914, "step": 4170 }, { "epoch": 0.03620628293157178, "grad_norm": 0.373046875, "learning_rate": 0.00199505933031395, "loss": 0.1436, "step": 4171 }, { "epoch": 0.03621496341177594, "grad_norm": 0.1787109375, "learning_rate": 0.001995056215537911, "loss": 0.1836, "step": 4172 }, { "epoch": 0.03622364389198011, "grad_norm": 0.162109375, "learning_rate": 0.0019950530997830526, "loss": 0.126, "step": 4173 }, { "epoch": 0.036232324372184266, "grad_norm": 0.1689453125, "learning_rate": 0.001995049983049377, "loss": 0.1582, "step": 4174 }, { "epoch": 0.03624100485238843, "grad_norm": 0.1435546875, "learning_rate": 0.001995046865336889, "loss": 0.1328, "step": 4175 }, { "epoch": 0.036249685332592596, "grad_norm": 0.2265625, "learning_rate": 0.0019950437466455906, "loss": 0.1445, "step": 4176 }, { "epoch": 0.03625836581279676, "grad_norm": 0.1015625, "learning_rate": 0.001995040626975486, "loss": 0.21, "step": 4177 }, { "epoch": 0.036267046293000926, "grad_norm": 0.12890625, "learning_rate": 0.0019950375063265777, "loss": 0.2031, "step": 4178 }, { "epoch": 0.03627572677320509, "grad_norm": 0.12451171875, "learning_rate": 0.0019950343846988705, "loss": 0.1787, "step": 4179 }, { "epoch": 0.03628440725340926, "grad_norm": 0.2294921875, "learning_rate": 0.0019950312620923673, "loss": 0.1514, "step": 4180 }, { "epoch": 0.03629308773361342, "grad_norm": 0.2578125, "learning_rate": 0.0019950281385070706, "loss": 0.1484, "step": 4181 }, { "epoch": 0.03630176821381759, "grad_norm": 0.29296875, "learning_rate": 0.0019950250139429853, "loss": 0.209, "step": 4182 }, { "epoch": 0.03631044869402175, "grad_norm": 0.57421875, "learning_rate": 0.001995021888400114, "loss": 0.2188, "step": 4183 }, { "epoch": 0.03631912917422592, "grad_norm": 0.0859375, "learning_rate": 0.00199501876187846, "loss": 0.1748, "step": 4184 }, { "epoch": 0.03632780965443008, "grad_norm": 0.28515625, "learning_rate": 0.001995015634378027, "loss": 0.1934, "step": 4185 }, { "epoch": 0.03633649013463425, "grad_norm": 0.130859375, "learning_rate": 0.0019950125058988185, "loss": 0.1787, "step": 4186 }, { "epoch": 0.03634517061483841, "grad_norm": 0.322265625, "learning_rate": 0.001995009376440838, "loss": 0.2461, "step": 4187 }, { "epoch": 0.03635385109504258, "grad_norm": 0.1005859375, "learning_rate": 0.001995006246004088, "loss": 0.1709, "step": 4188 }, { "epoch": 0.03636253157524674, "grad_norm": 0.271484375, "learning_rate": 0.0019950031145885734, "loss": 0.1895, "step": 4189 }, { "epoch": 0.03637121205545091, "grad_norm": 0.271484375, "learning_rate": 0.001994999982194296, "loss": 0.1865, "step": 4190 }, { "epoch": 0.03637989253565507, "grad_norm": 0.55078125, "learning_rate": 0.0019949968488212604, "loss": 0.1758, "step": 4191 }, { "epoch": 0.03638857301585924, "grad_norm": 0.32421875, "learning_rate": 0.0019949937144694697, "loss": 0.1777, "step": 4192 }, { "epoch": 0.0363972534960634, "grad_norm": 0.13671875, "learning_rate": 0.0019949905791389277, "loss": 0.1758, "step": 4193 }, { "epoch": 0.03640593397626757, "grad_norm": 0.2080078125, "learning_rate": 0.001994987442829637, "loss": 0.1338, "step": 4194 }, { "epoch": 0.03641461445647173, "grad_norm": 0.1484375, "learning_rate": 0.0019949843055416016, "loss": 0.2188, "step": 4195 }, { "epoch": 0.0364232949366759, "grad_norm": 0.2109375, "learning_rate": 0.001994981167274825, "loss": 0.123, "step": 4196 }, { "epoch": 0.036431975416880064, "grad_norm": 0.54296875, "learning_rate": 0.00199497802802931, "loss": 0.1826, "step": 4197 }, { "epoch": 0.03644065589708423, "grad_norm": 0.126953125, "learning_rate": 0.001994974887805061, "loss": 0.1572, "step": 4198 }, { "epoch": 0.036449336377288394, "grad_norm": 0.58203125, "learning_rate": 0.0019949717466020804, "loss": 0.1973, "step": 4199 }, { "epoch": 0.03645801685749256, "grad_norm": 0.3671875, "learning_rate": 0.0019949686044203724, "loss": 0.166, "step": 4200 }, { "epoch": 0.036466697337696724, "grad_norm": 0.47265625, "learning_rate": 0.00199496546125994, "loss": 0.1934, "step": 4201 }, { "epoch": 0.03647537781790089, "grad_norm": 0.25390625, "learning_rate": 0.001994962317120787, "loss": 0.1641, "step": 4202 }, { "epoch": 0.036484058298105054, "grad_norm": 0.66796875, "learning_rate": 0.001994959172002917, "loss": 0.2314, "step": 4203 }, { "epoch": 0.03649273877830922, "grad_norm": 0.17578125, "learning_rate": 0.0019949560259063326, "loss": 0.1895, "step": 4204 }, { "epoch": 0.03650141925851338, "grad_norm": 0.142578125, "learning_rate": 0.001994952878831038, "loss": 0.1455, "step": 4205 }, { "epoch": 0.03651009973871754, "grad_norm": 0.498046875, "learning_rate": 0.0019949497307770363, "loss": 0.2031, "step": 4206 }, { "epoch": 0.03651878021892171, "grad_norm": 0.1962890625, "learning_rate": 0.0019949465817443307, "loss": 0.1484, "step": 4207 }, { "epoch": 0.03652746069912587, "grad_norm": 0.2158203125, "learning_rate": 0.0019949434317329253, "loss": 0.1318, "step": 4208 }, { "epoch": 0.03653614117933004, "grad_norm": 0.1591796875, "learning_rate": 0.001994940280742823, "loss": 0.1445, "step": 4209 }, { "epoch": 0.0365448216595342, "grad_norm": 0.0927734375, "learning_rate": 0.001994937128774028, "loss": 0.1494, "step": 4210 }, { "epoch": 0.03655350213973837, "grad_norm": 0.1669921875, "learning_rate": 0.001994933975826543, "loss": 0.1875, "step": 4211 }, { "epoch": 0.03656218261994253, "grad_norm": 0.294921875, "learning_rate": 0.001994930821900371, "loss": 0.1572, "step": 4212 }, { "epoch": 0.0365708631001467, "grad_norm": 0.462890625, "learning_rate": 0.0019949276669955166, "loss": 0.2061, "step": 4213 }, { "epoch": 0.036579543580350864, "grad_norm": 0.3984375, "learning_rate": 0.0019949245111119825, "loss": 0.1436, "step": 4214 }, { "epoch": 0.03658822406055503, "grad_norm": 0.392578125, "learning_rate": 0.001994921354249773, "loss": 0.1611, "step": 4215 }, { "epoch": 0.036596904540759194, "grad_norm": 0.1455078125, "learning_rate": 0.00199491819640889, "loss": 0.1777, "step": 4216 }, { "epoch": 0.03660558502096336, "grad_norm": 0.130859375, "learning_rate": 0.0019949150375893386, "loss": 0.1943, "step": 4217 }, { "epoch": 0.036614265501167524, "grad_norm": 0.3203125, "learning_rate": 0.0019949118777911217, "loss": 0.1484, "step": 4218 }, { "epoch": 0.03662294598137169, "grad_norm": 0.134765625, "learning_rate": 0.0019949087170142423, "loss": 0.1709, "step": 4219 }, { "epoch": 0.036631626461575854, "grad_norm": 0.1123046875, "learning_rate": 0.0019949055552587044, "loss": 0.1943, "step": 4220 }, { "epoch": 0.03664030694178002, "grad_norm": 0.298828125, "learning_rate": 0.001994902392524511, "loss": 0.1318, "step": 4221 }, { "epoch": 0.036648987421984185, "grad_norm": 0.181640625, "learning_rate": 0.001994899228811666, "loss": 0.125, "step": 4222 }, { "epoch": 0.03665766790218835, "grad_norm": 0.5703125, "learning_rate": 0.001994896064120172, "loss": 0.1953, "step": 4223 }, { "epoch": 0.036666348382392515, "grad_norm": 0.16015625, "learning_rate": 0.001994892898450034, "loss": 0.1855, "step": 4224 }, { "epoch": 0.03667502886259668, "grad_norm": 0.5546875, "learning_rate": 0.0019948897318012543, "loss": 0.1816, "step": 4225 }, { "epoch": 0.036683709342800845, "grad_norm": 0.09326171875, "learning_rate": 0.001994886564173836, "loss": 0.248, "step": 4226 }, { "epoch": 0.03669238982300501, "grad_norm": 0.216796875, "learning_rate": 0.001994883395567784, "loss": 0.1543, "step": 4227 }, { "epoch": 0.036701070303209175, "grad_norm": 0.404296875, "learning_rate": 0.001994880225983101, "loss": 0.1953, "step": 4228 }, { "epoch": 0.03670975078341334, "grad_norm": 0.41015625, "learning_rate": 0.00199487705541979, "loss": 0.2285, "step": 4229 }, { "epoch": 0.036718431263617506, "grad_norm": 0.10302734375, "learning_rate": 0.0019948738838778545, "loss": 0.1206, "step": 4230 }, { "epoch": 0.03672711174382167, "grad_norm": 0.12255859375, "learning_rate": 0.001994870711357299, "loss": 0.1934, "step": 4231 }, { "epoch": 0.036735792224025836, "grad_norm": 0.66796875, "learning_rate": 0.001994867537858126, "loss": 0.1777, "step": 4232 }, { "epoch": 0.03674447270423, "grad_norm": 0.70703125, "learning_rate": 0.00199486436338034, "loss": 0.168, "step": 4233 }, { "epoch": 0.036753153184434166, "grad_norm": 0.1474609375, "learning_rate": 0.001994861187923943, "loss": 0.1699, "step": 4234 }, { "epoch": 0.03676183366463833, "grad_norm": 0.2333984375, "learning_rate": 0.0019948580114889395, "loss": 0.1309, "step": 4235 }, { "epoch": 0.03677051414484249, "grad_norm": 0.11962890625, "learning_rate": 0.0019948548340753326, "loss": 0.2109, "step": 4236 }, { "epoch": 0.036779194625046654, "grad_norm": 0.29296875, "learning_rate": 0.001994851655683126, "loss": 0.1484, "step": 4237 }, { "epoch": 0.03678787510525082, "grad_norm": 0.5, "learning_rate": 0.0019948484763123234, "loss": 0.1934, "step": 4238 }, { "epoch": 0.036796555585454985, "grad_norm": 0.263671875, "learning_rate": 0.0019948452959629276, "loss": 0.1787, "step": 4239 }, { "epoch": 0.03680523606565915, "grad_norm": 0.1689453125, "learning_rate": 0.0019948421146349423, "loss": 0.2041, "step": 4240 }, { "epoch": 0.036813916545863315, "grad_norm": 0.416015625, "learning_rate": 0.0019948389323283717, "loss": 0.2334, "step": 4241 }, { "epoch": 0.03682259702606748, "grad_norm": 0.10791015625, "learning_rate": 0.0019948357490432184, "loss": 0.252, "step": 4242 }, { "epoch": 0.036831277506271645, "grad_norm": 0.287109375, "learning_rate": 0.001994832564779486, "loss": 0.1885, "step": 4243 }, { "epoch": 0.03683995798647581, "grad_norm": 0.13671875, "learning_rate": 0.001994829379537178, "loss": 0.1426, "step": 4244 }, { "epoch": 0.036848638466679975, "grad_norm": 0.2353515625, "learning_rate": 0.001994826193316298, "loss": 0.1328, "step": 4245 }, { "epoch": 0.03685731894688414, "grad_norm": 0.365234375, "learning_rate": 0.0019948230061168503, "loss": 0.1738, "step": 4246 }, { "epoch": 0.036865999427088306, "grad_norm": 0.12890625, "learning_rate": 0.001994819817938837, "loss": 0.1504, "step": 4247 }, { "epoch": 0.03687467990729247, "grad_norm": 0.326171875, "learning_rate": 0.0019948166287822623, "loss": 0.1465, "step": 4248 }, { "epoch": 0.036883360387496636, "grad_norm": 0.103515625, "learning_rate": 0.0019948134386471297, "loss": 0.1338, "step": 4249 }, { "epoch": 0.0368920408677008, "grad_norm": 0.4375, "learning_rate": 0.001994810247533442, "loss": 0.2188, "step": 4250 }, { "epoch": 0.036900721347904966, "grad_norm": 0.34765625, "learning_rate": 0.001994807055441204, "loss": 0.1738, "step": 4251 }, { "epoch": 0.03690940182810913, "grad_norm": 0.08740234375, "learning_rate": 0.001994803862370418, "loss": 0.167, "step": 4252 }, { "epoch": 0.036918082308313296, "grad_norm": 0.240234375, "learning_rate": 0.001994800668321088, "loss": 0.1758, "step": 4253 }, { "epoch": 0.03692676278851746, "grad_norm": 0.251953125, "learning_rate": 0.0019947974732932175, "loss": 0.1494, "step": 4254 }, { "epoch": 0.03693544326872163, "grad_norm": 0.220703125, "learning_rate": 0.00199479427728681, "loss": 0.2266, "step": 4255 }, { "epoch": 0.03694412374892579, "grad_norm": 0.1328125, "learning_rate": 0.001994791080301869, "loss": 0.2119, "step": 4256 }, { "epoch": 0.03695280422912996, "grad_norm": 0.5390625, "learning_rate": 0.0019947878823383977, "loss": 0.1826, "step": 4257 }, { "epoch": 0.03696148470933412, "grad_norm": 0.11376953125, "learning_rate": 0.0019947846833963997, "loss": 0.1602, "step": 4258 }, { "epoch": 0.03697016518953829, "grad_norm": 0.2216796875, "learning_rate": 0.001994781483475879, "loss": 0.1133, "step": 4259 }, { "epoch": 0.03697884566974245, "grad_norm": 0.4765625, "learning_rate": 0.0019947782825768387, "loss": 0.1426, "step": 4260 }, { "epoch": 0.03698752614994662, "grad_norm": 0.443359375, "learning_rate": 0.0019947750806992817, "loss": 0.1641, "step": 4261 }, { "epoch": 0.03699620663015078, "grad_norm": 0.279296875, "learning_rate": 0.001994771877843213, "loss": 0.166, "step": 4262 }, { "epoch": 0.03700488711035495, "grad_norm": 0.283203125, "learning_rate": 0.0019947686740086346, "loss": 0.126, "step": 4263 }, { "epoch": 0.03701356759055911, "grad_norm": 0.39453125, "learning_rate": 0.001994765469195551, "loss": 0.1611, "step": 4264 }, { "epoch": 0.03702224807076328, "grad_norm": 0.5546875, "learning_rate": 0.0019947622634039646, "loss": 0.1934, "step": 4265 }, { "epoch": 0.03703092855096744, "grad_norm": 0.09130859375, "learning_rate": 0.0019947590566338803, "loss": 0.1621, "step": 4266 }, { "epoch": 0.0370396090311716, "grad_norm": 0.10009765625, "learning_rate": 0.001994755848885301, "loss": 0.1914, "step": 4267 }, { "epoch": 0.037048289511375766, "grad_norm": 0.150390625, "learning_rate": 0.00199475264015823, "loss": 0.1289, "step": 4268 }, { "epoch": 0.03705696999157993, "grad_norm": 0.126953125, "learning_rate": 0.0019947494304526706, "loss": 0.1992, "step": 4269 }, { "epoch": 0.037065650471784096, "grad_norm": 0.2275390625, "learning_rate": 0.001994746219768627, "loss": 0.1914, "step": 4270 }, { "epoch": 0.03707433095198826, "grad_norm": 0.142578125, "learning_rate": 0.001994743008106103, "loss": 0.2012, "step": 4271 }, { "epoch": 0.03708301143219243, "grad_norm": 0.2216796875, "learning_rate": 0.0019947397954651, "loss": 0.1758, "step": 4272 }, { "epoch": 0.03709169191239659, "grad_norm": 0.11083984375, "learning_rate": 0.001994736581845624, "loss": 0.1758, "step": 4273 }, { "epoch": 0.03710037239260076, "grad_norm": 0.3828125, "learning_rate": 0.0019947333672476774, "loss": 0.1494, "step": 4274 }, { "epoch": 0.03710905287280492, "grad_norm": 0.0810546875, "learning_rate": 0.0019947301516712638, "loss": 0.1426, "step": 4275 }, { "epoch": 0.03711773335300909, "grad_norm": 0.138671875, "learning_rate": 0.0019947269351163867, "loss": 0.1436, "step": 4276 }, { "epoch": 0.03712641383321325, "grad_norm": 0.3359375, "learning_rate": 0.0019947237175830497, "loss": 0.1504, "step": 4277 }, { "epoch": 0.03713509431341742, "grad_norm": 0.380859375, "learning_rate": 0.0019947204990712563, "loss": 0.208, "step": 4278 }, { "epoch": 0.03714377479362158, "grad_norm": 0.3203125, "learning_rate": 0.0019947172795810103, "loss": 0.2412, "step": 4279 }, { "epoch": 0.03715245527382575, "grad_norm": 0.1826171875, "learning_rate": 0.0019947140591123147, "loss": 0.1582, "step": 4280 }, { "epoch": 0.03716113575402991, "grad_norm": 0.1298828125, "learning_rate": 0.0019947108376651727, "loss": 0.1865, "step": 4281 }, { "epoch": 0.03716981623423408, "grad_norm": 0.12060546875, "learning_rate": 0.001994707615239589, "loss": 0.1582, "step": 4282 }, { "epoch": 0.03717849671443824, "grad_norm": 0.2333984375, "learning_rate": 0.001994704391835566, "loss": 0.1318, "step": 4283 }, { "epoch": 0.03718717719464241, "grad_norm": 0.330078125, "learning_rate": 0.0019947011674531084, "loss": 0.2539, "step": 4284 }, { "epoch": 0.03719585767484657, "grad_norm": 0.1982421875, "learning_rate": 0.0019946979420922186, "loss": 0.207, "step": 4285 }, { "epoch": 0.03720453815505074, "grad_norm": 0.07177734375, "learning_rate": 0.0019946947157529005, "loss": 0.1157, "step": 4286 }, { "epoch": 0.037213218635254904, "grad_norm": 0.271484375, "learning_rate": 0.001994691488435158, "loss": 0.1904, "step": 4287 }, { "epoch": 0.03722189911545907, "grad_norm": 0.56640625, "learning_rate": 0.001994688260138994, "loss": 0.1934, "step": 4288 }, { "epoch": 0.037230579595663234, "grad_norm": 0.162109375, "learning_rate": 0.001994685030864413, "loss": 0.1855, "step": 4289 }, { "epoch": 0.0372392600758674, "grad_norm": 0.1328125, "learning_rate": 0.0019946818006114173, "loss": 0.1572, "step": 4290 }, { "epoch": 0.037247940556071564, "grad_norm": 0.181640625, "learning_rate": 0.001994678569380011, "loss": 0.1631, "step": 4291 }, { "epoch": 0.03725662103627573, "grad_norm": 0.15234375, "learning_rate": 0.001994675337170198, "loss": 0.2012, "step": 4292 }, { "epoch": 0.037265301516479894, "grad_norm": 0.10546875, "learning_rate": 0.0019946721039819813, "loss": 0.1836, "step": 4293 }, { "epoch": 0.03727398199668406, "grad_norm": 0.248046875, "learning_rate": 0.0019946688698153645, "loss": 0.1523, "step": 4294 }, { "epoch": 0.037282662476888224, "grad_norm": 0.25, "learning_rate": 0.0019946656346703515, "loss": 0.166, "step": 4295 }, { "epoch": 0.03729134295709239, "grad_norm": 0.2236328125, "learning_rate": 0.001994662398546946, "loss": 0.2188, "step": 4296 }, { "epoch": 0.03730002343729655, "grad_norm": 0.52734375, "learning_rate": 0.0019946591614451504, "loss": 0.1621, "step": 4297 }, { "epoch": 0.03730870391750071, "grad_norm": 1.125, "learning_rate": 0.0019946559233649692, "loss": 0.2129, "step": 4298 }, { "epoch": 0.03731738439770488, "grad_norm": 0.486328125, "learning_rate": 0.0019946526843064058, "loss": 0.2148, "step": 4299 }, { "epoch": 0.03732606487790904, "grad_norm": 0.65625, "learning_rate": 0.0019946494442694635, "loss": 0.1885, "step": 4300 }, { "epoch": 0.03733474535811321, "grad_norm": 0.2470703125, "learning_rate": 0.0019946462032541462, "loss": 0.1992, "step": 4301 }, { "epoch": 0.03734342583831737, "grad_norm": 0.1884765625, "learning_rate": 0.001994642961260457, "loss": 0.1572, "step": 4302 }, { "epoch": 0.03735210631852154, "grad_norm": 0.2451171875, "learning_rate": 0.0019946397182884, "loss": 0.2578, "step": 4303 }, { "epoch": 0.037360786798725704, "grad_norm": 0.095703125, "learning_rate": 0.0019946364743379788, "loss": 0.1387, "step": 4304 }, { "epoch": 0.03736946727892987, "grad_norm": 0.51953125, "learning_rate": 0.0019946332294091956, "loss": 0.1455, "step": 4305 }, { "epoch": 0.037378147759134034, "grad_norm": 0.14453125, "learning_rate": 0.0019946299835020558, "loss": 0.1445, "step": 4306 }, { "epoch": 0.0373868282393382, "grad_norm": 0.130859375, "learning_rate": 0.001994626736616562, "loss": 0.1348, "step": 4307 }, { "epoch": 0.037395508719542364, "grad_norm": 0.11767578125, "learning_rate": 0.0019946234887527176, "loss": 0.1553, "step": 4308 }, { "epoch": 0.03740418919974653, "grad_norm": 0.0966796875, "learning_rate": 0.0019946202399105262, "loss": 0.1602, "step": 4309 }, { "epoch": 0.037412869679950694, "grad_norm": 0.220703125, "learning_rate": 0.0019946169900899916, "loss": 0.1494, "step": 4310 }, { "epoch": 0.03742155016015486, "grad_norm": 0.1845703125, "learning_rate": 0.0019946137392911176, "loss": 0.1338, "step": 4311 }, { "epoch": 0.037430230640359025, "grad_norm": 0.41796875, "learning_rate": 0.0019946104875139072, "loss": 0.1602, "step": 4312 }, { "epoch": 0.03743891112056319, "grad_norm": 0.0849609375, "learning_rate": 0.0019946072347583645, "loss": 0.1719, "step": 4313 }, { "epoch": 0.037447591600767355, "grad_norm": 0.490234375, "learning_rate": 0.0019946039810244927, "loss": 0.1475, "step": 4314 }, { "epoch": 0.03745627208097152, "grad_norm": 0.2333984375, "learning_rate": 0.0019946007263122955, "loss": 0.1533, "step": 4315 }, { "epoch": 0.037464952561175685, "grad_norm": 0.4921875, "learning_rate": 0.001994597470621776, "loss": 0.2031, "step": 4316 }, { "epoch": 0.03747363304137985, "grad_norm": 0.2255859375, "learning_rate": 0.0019945942139529384, "loss": 0.2305, "step": 4317 }, { "epoch": 0.037482313521584015, "grad_norm": 0.107421875, "learning_rate": 0.001994590956305786, "loss": 0.1562, "step": 4318 }, { "epoch": 0.03749099400178818, "grad_norm": 0.21875, "learning_rate": 0.0019945876976803223, "loss": 0.1455, "step": 4319 }, { "epoch": 0.037499674481992346, "grad_norm": 0.40625, "learning_rate": 0.001994584438076551, "loss": 0.2207, "step": 4320 }, { "epoch": 0.03750835496219651, "grad_norm": 0.2099609375, "learning_rate": 0.0019945811774944753, "loss": 0.1533, "step": 4321 }, { "epoch": 0.037517035442400676, "grad_norm": 0.205078125, "learning_rate": 0.0019945779159340994, "loss": 0.1602, "step": 4322 }, { "epoch": 0.03752571592260484, "grad_norm": 0.126953125, "learning_rate": 0.0019945746533954266, "loss": 0.1533, "step": 4323 }, { "epoch": 0.037534396402809006, "grad_norm": 0.2890625, "learning_rate": 0.0019945713898784603, "loss": 0.1245, "step": 4324 }, { "epoch": 0.03754307688301317, "grad_norm": 0.5, "learning_rate": 0.001994568125383204, "loss": 0.1758, "step": 4325 }, { "epoch": 0.037551757363217336, "grad_norm": 0.2001953125, "learning_rate": 0.001994564859909662, "loss": 0.166, "step": 4326 }, { "epoch": 0.0375604378434215, "grad_norm": 0.1533203125, "learning_rate": 0.001994561593457837, "loss": 0.2109, "step": 4327 }, { "epoch": 0.03756911832362566, "grad_norm": 0.39453125, "learning_rate": 0.0019945583260277326, "loss": 0.1621, "step": 4328 }, { "epoch": 0.037577798803829825, "grad_norm": 0.2373046875, "learning_rate": 0.0019945550576193527, "loss": 0.123, "step": 4329 }, { "epoch": 0.03758647928403399, "grad_norm": 0.08056640625, "learning_rate": 0.001994551788232701, "loss": 0.1826, "step": 4330 }, { "epoch": 0.037595159764238155, "grad_norm": 0.29296875, "learning_rate": 0.001994548517867781, "loss": 0.1797, "step": 4331 }, { "epoch": 0.03760384024444232, "grad_norm": 0.2001953125, "learning_rate": 0.001994545246524596, "loss": 0.1904, "step": 4332 }, { "epoch": 0.037612520724646485, "grad_norm": 0.1875, "learning_rate": 0.00199454197420315, "loss": 0.168, "step": 4333 }, { "epoch": 0.03762120120485065, "grad_norm": 0.28515625, "learning_rate": 0.001994538700903446, "loss": 0.1836, "step": 4334 }, { "epoch": 0.037629881685054815, "grad_norm": 0.396484375, "learning_rate": 0.001994535426625488, "loss": 0.1689, "step": 4335 }, { "epoch": 0.03763856216525898, "grad_norm": 0.189453125, "learning_rate": 0.0019945321513692798, "loss": 0.1465, "step": 4336 }, { "epoch": 0.037647242645463146, "grad_norm": 0.1396484375, "learning_rate": 0.001994528875134824, "loss": 0.1758, "step": 4337 }, { "epoch": 0.03765592312566731, "grad_norm": 0.97265625, "learning_rate": 0.0019945255979221255, "loss": 0.3945, "step": 4338 }, { "epoch": 0.037664603605871476, "grad_norm": 0.5390625, "learning_rate": 0.0019945223197311867, "loss": 0.1953, "step": 4339 }, { "epoch": 0.03767328408607564, "grad_norm": 1.03125, "learning_rate": 0.0019945190405620122, "loss": 0.168, "step": 4340 }, { "epoch": 0.037681964566279806, "grad_norm": 0.3046875, "learning_rate": 0.001994515760414605, "loss": 0.1914, "step": 4341 }, { "epoch": 0.03769064504648397, "grad_norm": 0.3515625, "learning_rate": 0.001994512479288969, "loss": 0.1865, "step": 4342 }, { "epoch": 0.037699325526688136, "grad_norm": 0.1201171875, "learning_rate": 0.0019945091971851074, "loss": 0.1602, "step": 4343 }, { "epoch": 0.0377080060068923, "grad_norm": 0.115234375, "learning_rate": 0.001994505914103024, "loss": 0.1475, "step": 4344 }, { "epoch": 0.03771668648709647, "grad_norm": 0.375, "learning_rate": 0.0019945026300427224, "loss": 0.124, "step": 4345 }, { "epoch": 0.03772536696730063, "grad_norm": 0.2392578125, "learning_rate": 0.001994499345004206, "loss": 0.2217, "step": 4346 }, { "epoch": 0.0377340474475048, "grad_norm": 0.337890625, "learning_rate": 0.0019944960589874786, "loss": 0.1865, "step": 4347 }, { "epoch": 0.03774272792770896, "grad_norm": 0.458984375, "learning_rate": 0.0019944927719925437, "loss": 0.1699, "step": 4348 }, { "epoch": 0.03775140840791313, "grad_norm": 0.1630859375, "learning_rate": 0.001994489484019405, "loss": 0.1582, "step": 4349 }, { "epoch": 0.03776008888811729, "grad_norm": 0.232421875, "learning_rate": 0.0019944861950680665, "loss": 0.1523, "step": 4350 }, { "epoch": 0.03776876936832146, "grad_norm": 0.80078125, "learning_rate": 0.0019944829051385306, "loss": 0.1758, "step": 4351 }, { "epoch": 0.03777744984852562, "grad_norm": 0.9921875, "learning_rate": 0.001994479614230802, "loss": 0.2305, "step": 4352 }, { "epoch": 0.03778613032872979, "grad_norm": 0.470703125, "learning_rate": 0.001994476322344884, "loss": 0.2266, "step": 4353 }, { "epoch": 0.03779481080893395, "grad_norm": 0.1396484375, "learning_rate": 0.00199447302948078, "loss": 0.1201, "step": 4354 }, { "epoch": 0.03780349128913812, "grad_norm": 0.10400390625, "learning_rate": 0.001994469735638494, "loss": 0.2129, "step": 4355 }, { "epoch": 0.03781217176934228, "grad_norm": 0.0830078125, "learning_rate": 0.001994466440818029, "loss": 0.1807, "step": 4356 }, { "epoch": 0.03782085224954645, "grad_norm": 0.11474609375, "learning_rate": 0.001994463145019389, "loss": 0.1582, "step": 4357 }, { "epoch": 0.03782953272975061, "grad_norm": 0.66796875, "learning_rate": 0.001994459848242578, "loss": 0.1924, "step": 4358 }, { "epoch": 0.03783821320995477, "grad_norm": 0.1259765625, "learning_rate": 0.0019944565504875986, "loss": 0.2041, "step": 4359 }, { "epoch": 0.037846893690158936, "grad_norm": 0.46484375, "learning_rate": 0.0019944532517544547, "loss": 0.1719, "step": 4360 }, { "epoch": 0.0378555741703631, "grad_norm": 0.08642578125, "learning_rate": 0.0019944499520431506, "loss": 0.1514, "step": 4361 }, { "epoch": 0.03786425465056727, "grad_norm": 0.2255859375, "learning_rate": 0.0019944466513536897, "loss": 0.1943, "step": 4362 }, { "epoch": 0.03787293513077143, "grad_norm": 0.671875, "learning_rate": 0.001994443349686075, "loss": 0.1611, "step": 4363 }, { "epoch": 0.0378816156109756, "grad_norm": 0.1572265625, "learning_rate": 0.0019944400470403106, "loss": 0.166, "step": 4364 }, { "epoch": 0.03789029609117976, "grad_norm": 0.98828125, "learning_rate": 0.0019944367434164, "loss": 0.1445, "step": 4365 }, { "epoch": 0.03789897657138393, "grad_norm": 0.79296875, "learning_rate": 0.0019944334388143467, "loss": 0.1387, "step": 4366 }, { "epoch": 0.03790765705158809, "grad_norm": 0.7890625, "learning_rate": 0.0019944301332341548, "loss": 0.1768, "step": 4367 }, { "epoch": 0.03791633753179226, "grad_norm": 0.1552734375, "learning_rate": 0.0019944268266758273, "loss": 0.1855, "step": 4368 }, { "epoch": 0.03792501801199642, "grad_norm": 0.640625, "learning_rate": 0.001994423519139368, "loss": 0.1904, "step": 4369 }, { "epoch": 0.03793369849220059, "grad_norm": 0.318359375, "learning_rate": 0.0019944202106247804, "loss": 0.165, "step": 4370 }, { "epoch": 0.03794237897240475, "grad_norm": 0.12451171875, "learning_rate": 0.001994416901132069, "loss": 0.1582, "step": 4371 }, { "epoch": 0.03795105945260892, "grad_norm": 1.2265625, "learning_rate": 0.001994413590661236, "loss": 0.1943, "step": 4372 }, { "epoch": 0.03795973993281308, "grad_norm": 0.61328125, "learning_rate": 0.001994410279212286, "loss": 0.1719, "step": 4373 }, { "epoch": 0.03796842041301725, "grad_norm": 0.8671875, "learning_rate": 0.0019944069667852224, "loss": 0.1602, "step": 4374 }, { "epoch": 0.03797710089322141, "grad_norm": 0.9921875, "learning_rate": 0.001994403653380049, "loss": 0.1953, "step": 4375 }, { "epoch": 0.03798578137342558, "grad_norm": 0.5078125, "learning_rate": 0.001994400338996769, "loss": 0.1621, "step": 4376 }, { "epoch": 0.037994461853629743, "grad_norm": 0.09716796875, "learning_rate": 0.0019943970236353855, "loss": 0.2031, "step": 4377 }, { "epoch": 0.03800314233383391, "grad_norm": 0.96484375, "learning_rate": 0.001994393707295904, "loss": 0.1514, "step": 4378 }, { "epoch": 0.038011822814038074, "grad_norm": 0.66015625, "learning_rate": 0.001994390389978326, "loss": 0.1631, "step": 4379 }, { "epoch": 0.03802050329424224, "grad_norm": 0.7734375, "learning_rate": 0.001994387071682657, "loss": 0.1641, "step": 4380 }, { "epoch": 0.038029183774446404, "grad_norm": 0.263671875, "learning_rate": 0.0019943837524088993, "loss": 0.1035, "step": 4381 }, { "epoch": 0.03803786425465057, "grad_norm": 0.4453125, "learning_rate": 0.0019943804321570567, "loss": 0.1709, "step": 4382 }, { "epoch": 0.038046544734854734, "grad_norm": 1.015625, "learning_rate": 0.0019943771109271337, "loss": 0.1934, "step": 4383 }, { "epoch": 0.0380552252150589, "grad_norm": 0.2041015625, "learning_rate": 0.0019943737887191328, "loss": 0.1631, "step": 4384 }, { "epoch": 0.038063905695263064, "grad_norm": 0.478515625, "learning_rate": 0.0019943704655330584, "loss": 0.1709, "step": 4385 }, { "epoch": 0.03807258617546723, "grad_norm": 0.384765625, "learning_rate": 0.0019943671413689143, "loss": 0.1738, "step": 4386 }, { "epoch": 0.038081266655671395, "grad_norm": 0.2490234375, "learning_rate": 0.001994363816226703, "loss": 0.1836, "step": 4387 }, { "epoch": 0.03808994713587556, "grad_norm": 0.78515625, "learning_rate": 0.0019943604901064295, "loss": 0.2129, "step": 4388 }, { "epoch": 0.038098627616079725, "grad_norm": 0.69921875, "learning_rate": 0.001994357163008096, "loss": 0.1738, "step": 4389 }, { "epoch": 0.03810730809628388, "grad_norm": 0.3359375, "learning_rate": 0.0019943538349317075, "loss": 0.1895, "step": 4390 }, { "epoch": 0.03811598857648805, "grad_norm": 0.58203125, "learning_rate": 0.001994350505877267, "loss": 0.1445, "step": 4391 }, { "epoch": 0.03812466905669221, "grad_norm": 0.11865234375, "learning_rate": 0.0019943471758447785, "loss": 0.1758, "step": 4392 }, { "epoch": 0.03813334953689638, "grad_norm": 0.267578125, "learning_rate": 0.001994343844834245, "loss": 0.166, "step": 4393 }, { "epoch": 0.038142030017100544, "grad_norm": 0.142578125, "learning_rate": 0.0019943405128456707, "loss": 0.2129, "step": 4394 }, { "epoch": 0.03815071049730471, "grad_norm": 0.392578125, "learning_rate": 0.0019943371798790592, "loss": 0.2031, "step": 4395 }, { "epoch": 0.038159390977508874, "grad_norm": 0.09033203125, "learning_rate": 0.0019943338459344133, "loss": 0.1465, "step": 4396 }, { "epoch": 0.03816807145771304, "grad_norm": 0.87109375, "learning_rate": 0.001994330511011738, "loss": 0.1631, "step": 4397 }, { "epoch": 0.038176751937917204, "grad_norm": 0.53515625, "learning_rate": 0.001994327175111036, "loss": 0.1504, "step": 4398 }, { "epoch": 0.03818543241812137, "grad_norm": 0.5078125, "learning_rate": 0.001994323838232312, "loss": 0.1318, "step": 4399 }, { "epoch": 0.038194112898325534, "grad_norm": 0.65234375, "learning_rate": 0.0019943205003755677, "loss": 0.2207, "step": 4400 }, { "epoch": 0.0382027933785297, "grad_norm": 0.73046875, "learning_rate": 0.0019943171615408083, "loss": 0.1748, "step": 4401 }, { "epoch": 0.038211473858733865, "grad_norm": 0.1982421875, "learning_rate": 0.0019943138217280378, "loss": 0.1279, "step": 4402 }, { "epoch": 0.03822015433893803, "grad_norm": 0.50390625, "learning_rate": 0.0019943104809372583, "loss": 0.2139, "step": 4403 }, { "epoch": 0.038228834819142195, "grad_norm": 0.5, "learning_rate": 0.0019943071391684748, "loss": 0.1533, "step": 4404 }, { "epoch": 0.03823751529934636, "grad_norm": 0.4609375, "learning_rate": 0.00199430379642169, "loss": 0.2236, "step": 4405 }, { "epoch": 0.038246195779550525, "grad_norm": 0.09521484375, "learning_rate": 0.0019943004526969082, "loss": 0.1709, "step": 4406 }, { "epoch": 0.03825487625975469, "grad_norm": 1.46875, "learning_rate": 0.0019942971079941326, "loss": 0.2227, "step": 4407 }, { "epoch": 0.038263556739958855, "grad_norm": 0.11669921875, "learning_rate": 0.001994293762313368, "loss": 0.1738, "step": 4408 }, { "epoch": 0.03827223722016302, "grad_norm": 0.1484375, "learning_rate": 0.0019942904156546163, "loss": 0.1621, "step": 4409 }, { "epoch": 0.038280917700367185, "grad_norm": 0.6015625, "learning_rate": 0.0019942870680178825, "loss": 0.2031, "step": 4410 }, { "epoch": 0.03828959818057135, "grad_norm": 0.08447265625, "learning_rate": 0.0019942837194031698, "loss": 0.1504, "step": 4411 }, { "epoch": 0.038298278660775516, "grad_norm": 0.09716796875, "learning_rate": 0.0019942803698104815, "loss": 0.1562, "step": 4412 }, { "epoch": 0.03830695914097968, "grad_norm": 0.28515625, "learning_rate": 0.001994277019239822, "loss": 0.1826, "step": 4413 }, { "epoch": 0.038315639621183846, "grad_norm": 0.1962890625, "learning_rate": 0.001994273667691194, "loss": 0.2129, "step": 4414 }, { "epoch": 0.03832432010138801, "grad_norm": 0.1708984375, "learning_rate": 0.0019942703151646026, "loss": 0.1641, "step": 4415 }, { "epoch": 0.038333000581592176, "grad_norm": 0.17578125, "learning_rate": 0.00199426696166005, "loss": 0.1631, "step": 4416 }, { "epoch": 0.03834168106179634, "grad_norm": 0.392578125, "learning_rate": 0.0019942636071775405, "loss": 0.1836, "step": 4417 }, { "epoch": 0.038350361542000506, "grad_norm": 0.283203125, "learning_rate": 0.001994260251717078, "loss": 0.1475, "step": 4418 }, { "epoch": 0.03835904202220467, "grad_norm": 0.294921875, "learning_rate": 0.001994256895278666, "loss": 0.1699, "step": 4419 }, { "epoch": 0.03836772250240884, "grad_norm": 0.1611328125, "learning_rate": 0.0019942535378623077, "loss": 0.2207, "step": 4420 }, { "epoch": 0.038376402982612995, "grad_norm": 0.19140625, "learning_rate": 0.0019942501794680077, "loss": 0.1465, "step": 4421 }, { "epoch": 0.03838508346281716, "grad_norm": 0.21484375, "learning_rate": 0.001994246820095769, "loss": 0.1621, "step": 4422 }, { "epoch": 0.038393763943021325, "grad_norm": 0.126953125, "learning_rate": 0.001994243459745595, "loss": 0.1699, "step": 4423 }, { "epoch": 0.03840244442322549, "grad_norm": 0.76953125, "learning_rate": 0.00199424009841749, "loss": 0.1719, "step": 4424 }, { "epoch": 0.038411124903429655, "grad_norm": 0.12109375, "learning_rate": 0.0019942367361114577, "loss": 0.1758, "step": 4425 }, { "epoch": 0.03841980538363382, "grad_norm": 0.1103515625, "learning_rate": 0.0019942333728275013, "loss": 0.1562, "step": 4426 }, { "epoch": 0.038428485863837986, "grad_norm": 0.0791015625, "learning_rate": 0.001994230008565625, "loss": 0.2012, "step": 4427 }, { "epoch": 0.03843716634404215, "grad_norm": 0.173828125, "learning_rate": 0.001994226643325832, "loss": 0.1406, "step": 4428 }, { "epoch": 0.038445846824246316, "grad_norm": 0.2470703125, "learning_rate": 0.0019942232771081258, "loss": 0.1738, "step": 4429 }, { "epoch": 0.03845452730445048, "grad_norm": 0.2333984375, "learning_rate": 0.001994219909912511, "loss": 0.1553, "step": 4430 }, { "epoch": 0.038463207784654646, "grad_norm": 0.326171875, "learning_rate": 0.0019942165417389905, "loss": 0.2227, "step": 4431 }, { "epoch": 0.03847188826485881, "grad_norm": 0.376953125, "learning_rate": 0.0019942131725875683, "loss": 0.1924, "step": 4432 }, { "epoch": 0.038480568745062976, "grad_norm": 0.365234375, "learning_rate": 0.001994209802458248, "loss": 0.1807, "step": 4433 }, { "epoch": 0.03848924922526714, "grad_norm": 0.1806640625, "learning_rate": 0.001994206431351033, "loss": 0.1592, "step": 4434 }, { "epoch": 0.03849792970547131, "grad_norm": 0.1044921875, "learning_rate": 0.0019942030592659276, "loss": 0.1738, "step": 4435 }, { "epoch": 0.03850661018567547, "grad_norm": 0.185546875, "learning_rate": 0.0019941996862029355, "loss": 0.165, "step": 4436 }, { "epoch": 0.03851529066587964, "grad_norm": 0.09228515625, "learning_rate": 0.00199419631216206, "loss": 0.1514, "step": 4437 }, { "epoch": 0.0385239711460838, "grad_norm": 0.41796875, "learning_rate": 0.001994192937143304, "loss": 0.1533, "step": 4438 }, { "epoch": 0.03853265162628797, "grad_norm": 0.296875, "learning_rate": 0.001994189561146673, "loss": 0.1367, "step": 4439 }, { "epoch": 0.03854133210649213, "grad_norm": 0.318359375, "learning_rate": 0.001994186184172169, "loss": 0.1426, "step": 4440 }, { "epoch": 0.0385500125866963, "grad_norm": 0.12451171875, "learning_rate": 0.001994182806219797, "loss": 0.1455, "step": 4441 }, { "epoch": 0.03855869306690046, "grad_norm": 0.38671875, "learning_rate": 0.0019941794272895596, "loss": 0.1484, "step": 4442 }, { "epoch": 0.03856737354710463, "grad_norm": 0.111328125, "learning_rate": 0.0019941760473814614, "loss": 0.1562, "step": 4443 }, { "epoch": 0.03857605402730879, "grad_norm": 0.2265625, "learning_rate": 0.0019941726664955057, "loss": 0.1719, "step": 4444 }, { "epoch": 0.03858473450751296, "grad_norm": 0.10546875, "learning_rate": 0.0019941692846316963, "loss": 0.168, "step": 4445 }, { "epoch": 0.03859341498771712, "grad_norm": 0.5703125, "learning_rate": 0.0019941659017900363, "loss": 0.1504, "step": 4446 }, { "epoch": 0.03860209546792129, "grad_norm": 0.267578125, "learning_rate": 0.0019941625179705305, "loss": 0.1562, "step": 4447 }, { "epoch": 0.03861077594812545, "grad_norm": 0.33203125, "learning_rate": 0.0019941591331731814, "loss": 0.1973, "step": 4448 }, { "epoch": 0.03861945642832962, "grad_norm": 0.11328125, "learning_rate": 0.001994155747397994, "loss": 0.1621, "step": 4449 }, { "epoch": 0.03862813690853378, "grad_norm": 0.2470703125, "learning_rate": 0.0019941523606449704, "loss": 0.166, "step": 4450 }, { "epoch": 0.03863681738873794, "grad_norm": 0.146484375, "learning_rate": 0.001994148972914116, "loss": 0.1348, "step": 4451 }, { "epoch": 0.03864549786894211, "grad_norm": 0.10302734375, "learning_rate": 0.0019941455842054333, "loss": 0.2051, "step": 4452 }, { "epoch": 0.03865417834914627, "grad_norm": 0.212890625, "learning_rate": 0.0019941421945189265, "loss": 0.1504, "step": 4453 }, { "epoch": 0.03866285882935044, "grad_norm": 0.43359375, "learning_rate": 0.0019941388038545995, "loss": 0.1729, "step": 4454 }, { "epoch": 0.0386715393095546, "grad_norm": 0.10400390625, "learning_rate": 0.0019941354122124553, "loss": 0.1465, "step": 4455 }, { "epoch": 0.03868021978975877, "grad_norm": 0.447265625, "learning_rate": 0.0019941320195924982, "loss": 0.1699, "step": 4456 }, { "epoch": 0.03868890026996293, "grad_norm": 0.1962890625, "learning_rate": 0.001994128625994732, "loss": 0.1533, "step": 4457 }, { "epoch": 0.0386975807501671, "grad_norm": 0.12255859375, "learning_rate": 0.00199412523141916, "loss": 0.1357, "step": 4458 }, { "epoch": 0.03870626123037126, "grad_norm": 0.71484375, "learning_rate": 0.001994121835865786, "loss": 0.1738, "step": 4459 }, { "epoch": 0.03871494171057543, "grad_norm": 0.0947265625, "learning_rate": 0.0019941184393346143, "loss": 0.1416, "step": 4460 }, { "epoch": 0.03872362219077959, "grad_norm": 0.455078125, "learning_rate": 0.0019941150418256474, "loss": 0.1543, "step": 4461 }, { "epoch": 0.03873230267098376, "grad_norm": 0.359375, "learning_rate": 0.00199411164333889, "loss": 0.1797, "step": 4462 }, { "epoch": 0.03874098315118792, "grad_norm": 0.1494140625, "learning_rate": 0.0019941082438743457, "loss": 0.1748, "step": 4463 }, { "epoch": 0.03874966363139209, "grad_norm": 0.353515625, "learning_rate": 0.001994104843432018, "loss": 0.1934, "step": 4464 }, { "epoch": 0.03875834411159625, "grad_norm": 0.34765625, "learning_rate": 0.0019941014420119104, "loss": 0.2021, "step": 4465 }, { "epoch": 0.03876702459180042, "grad_norm": 0.8046875, "learning_rate": 0.0019940980396140275, "loss": 0.1367, "step": 4466 }, { "epoch": 0.03877570507200458, "grad_norm": 0.330078125, "learning_rate": 0.001994094636238372, "loss": 0.208, "step": 4467 }, { "epoch": 0.03878438555220875, "grad_norm": 0.10009765625, "learning_rate": 0.001994091231884948, "loss": 0.1055, "step": 4468 }, { "epoch": 0.038793066032412914, "grad_norm": 0.31640625, "learning_rate": 0.0019940878265537593, "loss": 0.1719, "step": 4469 }, { "epoch": 0.03880174651261708, "grad_norm": 0.216796875, "learning_rate": 0.0019940844202448096, "loss": 0.1934, "step": 4470 }, { "epoch": 0.038810426992821244, "grad_norm": 0.1904296875, "learning_rate": 0.0019940810129581025, "loss": 0.1797, "step": 4471 }, { "epoch": 0.03881910747302541, "grad_norm": 0.08447265625, "learning_rate": 0.001994077604693642, "loss": 0.1377, "step": 4472 }, { "epoch": 0.038827787953229574, "grad_norm": 0.2216796875, "learning_rate": 0.0019940741954514317, "loss": 0.1875, "step": 4473 }, { "epoch": 0.03883646843343374, "grad_norm": 0.09033203125, "learning_rate": 0.001994070785231475, "loss": 0.1309, "step": 4474 }, { "epoch": 0.038845148913637904, "grad_norm": 0.193359375, "learning_rate": 0.001994067374033776, "loss": 0.1396, "step": 4475 }, { "epoch": 0.03885382939384207, "grad_norm": 0.5078125, "learning_rate": 0.001994063961858339, "loss": 0.1602, "step": 4476 }, { "epoch": 0.038862509874046235, "grad_norm": 0.162109375, "learning_rate": 0.0019940605487051666, "loss": 0.1904, "step": 4477 }, { "epoch": 0.0388711903542504, "grad_norm": 0.08154296875, "learning_rate": 0.001994057134574263, "loss": 0.1406, "step": 4478 }, { "epoch": 0.038879870834454565, "grad_norm": 0.20703125, "learning_rate": 0.0019940537194656316, "loss": 0.1514, "step": 4479 }, { "epoch": 0.03888855131465873, "grad_norm": 0.5, "learning_rate": 0.0019940503033792772, "loss": 0.2051, "step": 4480 }, { "epoch": 0.038897231794862895, "grad_norm": 0.263671875, "learning_rate": 0.001994046886315202, "loss": 0.2266, "step": 4481 }, { "epoch": 0.03890591227506705, "grad_norm": 0.111328125, "learning_rate": 0.0019940434682734114, "loss": 0.1465, "step": 4482 }, { "epoch": 0.03891459275527122, "grad_norm": 0.25, "learning_rate": 0.001994040049253908, "loss": 0.1572, "step": 4483 }, { "epoch": 0.038923273235475384, "grad_norm": 0.333984375, "learning_rate": 0.0019940366292566956, "loss": 0.2012, "step": 4484 }, { "epoch": 0.03893195371567955, "grad_norm": 0.470703125, "learning_rate": 0.001994033208281778, "loss": 0.1953, "step": 4485 }, { "epoch": 0.038940634195883714, "grad_norm": 0.24609375, "learning_rate": 0.00199402978632916, "loss": 0.165, "step": 4486 }, { "epoch": 0.03894931467608788, "grad_norm": 0.3359375, "learning_rate": 0.0019940263633988434, "loss": 0.1934, "step": 4487 }, { "epoch": 0.038957995156292044, "grad_norm": 0.66015625, "learning_rate": 0.0019940229394908335, "loss": 0.1787, "step": 4488 }, { "epoch": 0.03896667563649621, "grad_norm": 0.11279296875, "learning_rate": 0.0019940195146051338, "loss": 0.1553, "step": 4489 }, { "epoch": 0.038975356116700374, "grad_norm": 0.091796875, "learning_rate": 0.0019940160887417474, "loss": 0.1455, "step": 4490 }, { "epoch": 0.03898403659690454, "grad_norm": 0.07080078125, "learning_rate": 0.0019940126619006787, "loss": 0.1094, "step": 4491 }, { "epoch": 0.038992717077108704, "grad_norm": 0.515625, "learning_rate": 0.001994009234081931, "loss": 0.1172, "step": 4492 }, { "epoch": 0.03900139755731287, "grad_norm": 0.376953125, "learning_rate": 0.001994005805285508, "loss": 0.1768, "step": 4493 }, { "epoch": 0.039010078037517035, "grad_norm": 0.287109375, "learning_rate": 0.0019940023755114144, "loss": 0.1436, "step": 4494 }, { "epoch": 0.0390187585177212, "grad_norm": 0.3125, "learning_rate": 0.0019939989447596528, "loss": 0.1553, "step": 4495 }, { "epoch": 0.039027438997925365, "grad_norm": 0.6484375, "learning_rate": 0.0019939955130302274, "loss": 0.1484, "step": 4496 }, { "epoch": 0.03903611947812953, "grad_norm": 0.49609375, "learning_rate": 0.001993992080323142, "loss": 0.1807, "step": 4497 }, { "epoch": 0.039044799958333695, "grad_norm": 0.375, "learning_rate": 0.0019939886466384, "loss": 0.1221, "step": 4498 }, { "epoch": 0.03905348043853786, "grad_norm": 0.443359375, "learning_rate": 0.001993985211976006, "loss": 0.1543, "step": 4499 }, { "epoch": 0.039062160918742025, "grad_norm": 0.130859375, "learning_rate": 0.0019939817763359622, "loss": 0.1533, "step": 4500 }, { "epoch": 0.03907084139894619, "grad_norm": 0.1884765625, "learning_rate": 0.0019939783397182743, "loss": 0.1328, "step": 4501 }, { "epoch": 0.039079521879150356, "grad_norm": 0.326171875, "learning_rate": 0.001993974902122945, "loss": 0.1855, "step": 4502 }, { "epoch": 0.03908820235935452, "grad_norm": 0.1591796875, "learning_rate": 0.001993971463549978, "loss": 0.1377, "step": 4503 }, { "epoch": 0.039096882839558686, "grad_norm": 0.10205078125, "learning_rate": 0.001993968023999377, "loss": 0.1514, "step": 4504 }, { "epoch": 0.03910556331976285, "grad_norm": 0.2001953125, "learning_rate": 0.001993964583471147, "loss": 0.1523, "step": 4505 }, { "epoch": 0.039114243799967016, "grad_norm": 0.09375, "learning_rate": 0.0019939611419652896, "loss": 0.1602, "step": 4506 }, { "epoch": 0.03912292428017118, "grad_norm": 0.1923828125, "learning_rate": 0.00199395769948181, "loss": 0.1494, "step": 4507 }, { "epoch": 0.039131604760375346, "grad_norm": 0.0771484375, "learning_rate": 0.001993954256020712, "loss": 0.1406, "step": 4508 }, { "epoch": 0.03914028524057951, "grad_norm": 0.30078125, "learning_rate": 0.001993950811581999, "loss": 0.1562, "step": 4509 }, { "epoch": 0.03914896572078368, "grad_norm": 0.162109375, "learning_rate": 0.0019939473661656744, "loss": 0.2012, "step": 4510 }, { "epoch": 0.03915764620098784, "grad_norm": 0.357421875, "learning_rate": 0.001993943919771743, "loss": 0.165, "step": 4511 }, { "epoch": 0.03916632668119201, "grad_norm": 0.3515625, "learning_rate": 0.0019939404724002075, "loss": 0.1836, "step": 4512 }, { "epoch": 0.039175007161396165, "grad_norm": 0.07421875, "learning_rate": 0.0019939370240510726, "loss": 0.168, "step": 4513 }, { "epoch": 0.03918368764160033, "grad_norm": 0.10546875, "learning_rate": 0.0019939335747243413, "loss": 0.1895, "step": 4514 }, { "epoch": 0.039192368121804495, "grad_norm": 0.0859375, "learning_rate": 0.001993930124420018, "loss": 0.1758, "step": 4515 }, { "epoch": 0.03920104860200866, "grad_norm": 0.11767578125, "learning_rate": 0.001993926673138105, "loss": 0.1826, "step": 4516 }, { "epoch": 0.039209729082212826, "grad_norm": 0.0849609375, "learning_rate": 0.0019939232208786087, "loss": 0.1172, "step": 4517 }, { "epoch": 0.03921840956241699, "grad_norm": 0.26171875, "learning_rate": 0.0019939197676415304, "loss": 0.1562, "step": 4518 }, { "epoch": 0.039227090042621156, "grad_norm": 0.1337890625, "learning_rate": 0.0019939163134268753, "loss": 0.1592, "step": 4519 }, { "epoch": 0.03923577052282532, "grad_norm": 0.09375, "learning_rate": 0.001993912858234647, "loss": 0.1758, "step": 4520 }, { "epoch": 0.039244451003029486, "grad_norm": 0.296875, "learning_rate": 0.0019939094020648487, "loss": 0.1748, "step": 4521 }, { "epoch": 0.03925313148323365, "grad_norm": 0.0908203125, "learning_rate": 0.0019939059449174843, "loss": 0.1377, "step": 4522 }, { "epoch": 0.039261811963437816, "grad_norm": 0.23828125, "learning_rate": 0.001993902486792558, "loss": 0.2363, "step": 4523 }, { "epoch": 0.03927049244364198, "grad_norm": 0.0947265625, "learning_rate": 0.0019938990276900737, "loss": 0.1797, "step": 4524 }, { "epoch": 0.039279172923846147, "grad_norm": 0.640625, "learning_rate": 0.0019938955676100344, "loss": 0.1406, "step": 4525 }, { "epoch": 0.03928785340405031, "grad_norm": 0.298828125, "learning_rate": 0.0019938921065524445, "loss": 0.2266, "step": 4526 }, { "epoch": 0.03929653388425448, "grad_norm": 0.201171875, "learning_rate": 0.0019938886445173077, "loss": 0.1543, "step": 4527 }, { "epoch": 0.03930521436445864, "grad_norm": 0.130859375, "learning_rate": 0.0019938851815046277, "loss": 0.1582, "step": 4528 }, { "epoch": 0.03931389484466281, "grad_norm": 0.443359375, "learning_rate": 0.001993881717514408, "loss": 0.1514, "step": 4529 }, { "epoch": 0.03932257532486697, "grad_norm": 0.49609375, "learning_rate": 0.0019938782525466526, "loss": 0.1904, "step": 4530 }, { "epoch": 0.03933125580507114, "grad_norm": 0.375, "learning_rate": 0.001993874786601366, "loss": 0.1855, "step": 4531 }, { "epoch": 0.0393399362852753, "grad_norm": 0.447265625, "learning_rate": 0.0019938713196785514, "loss": 0.1475, "step": 4532 }, { "epoch": 0.03934861676547947, "grad_norm": 0.3359375, "learning_rate": 0.0019938678517782116, "loss": 0.2539, "step": 4533 }, { "epoch": 0.03935729724568363, "grad_norm": 0.6171875, "learning_rate": 0.001993864382900352, "loss": 0.1318, "step": 4534 }, { "epoch": 0.0393659777258878, "grad_norm": 0.390625, "learning_rate": 0.0019938609130449756, "loss": 0.1992, "step": 4535 }, { "epoch": 0.03937465820609196, "grad_norm": 0.1806640625, "learning_rate": 0.0019938574422120867, "loss": 0.2148, "step": 4536 }, { "epoch": 0.03938333868629613, "grad_norm": 0.294921875, "learning_rate": 0.001993853970401688, "loss": 0.105, "step": 4537 }, { "epoch": 0.03939201916650029, "grad_norm": 0.1416015625, "learning_rate": 0.001993850497613785, "loss": 0.2227, "step": 4538 }, { "epoch": 0.03940069964670446, "grad_norm": 0.13671875, "learning_rate": 0.00199384702384838, "loss": 0.1533, "step": 4539 }, { "epoch": 0.03940938012690862, "grad_norm": 0.0732421875, "learning_rate": 0.001993843549105477, "loss": 0.1387, "step": 4540 }, { "epoch": 0.03941806060711279, "grad_norm": 0.06396484375, "learning_rate": 0.0019938400733850808, "loss": 0.1348, "step": 4541 }, { "epoch": 0.039426741087316954, "grad_norm": 0.404296875, "learning_rate": 0.0019938365966871937, "loss": 0.1738, "step": 4542 }, { "epoch": 0.03943542156752112, "grad_norm": 0.19921875, "learning_rate": 0.001993833119011821, "loss": 0.1426, "step": 4543 }, { "epoch": 0.03944410204772528, "grad_norm": 0.263671875, "learning_rate": 0.0019938296403589654, "loss": 0.1865, "step": 4544 }, { "epoch": 0.03945278252792944, "grad_norm": 0.490234375, "learning_rate": 0.0019938261607286316, "loss": 0.2148, "step": 4545 }, { "epoch": 0.03946146300813361, "grad_norm": 1.09375, "learning_rate": 0.0019938226801208226, "loss": 0.1582, "step": 4546 }, { "epoch": 0.03947014348833777, "grad_norm": 0.09326171875, "learning_rate": 0.0019938191985355426, "loss": 0.1406, "step": 4547 }, { "epoch": 0.03947882396854194, "grad_norm": 0.169921875, "learning_rate": 0.0019938157159727953, "loss": 0.168, "step": 4548 }, { "epoch": 0.0394875044487461, "grad_norm": 0.392578125, "learning_rate": 0.001993812232432585, "loss": 0.1484, "step": 4549 }, { "epoch": 0.03949618492895027, "grad_norm": 0.4921875, "learning_rate": 0.0019938087479149146, "loss": 0.1436, "step": 4550 }, { "epoch": 0.03950486540915443, "grad_norm": 0.578125, "learning_rate": 0.0019938052624197886, "loss": 0.1221, "step": 4551 }, { "epoch": 0.0395135458893586, "grad_norm": 0.79296875, "learning_rate": 0.0019938017759472105, "loss": 0.1826, "step": 4552 }, { "epoch": 0.03952222636956276, "grad_norm": 0.09130859375, "learning_rate": 0.0019937982884971842, "loss": 0.1621, "step": 4553 }, { "epoch": 0.03953090684976693, "grad_norm": 0.10205078125, "learning_rate": 0.0019937948000697133, "loss": 0.1992, "step": 4554 }, { "epoch": 0.03953958732997109, "grad_norm": 0.1533203125, "learning_rate": 0.0019937913106648024, "loss": 0.1338, "step": 4555 }, { "epoch": 0.03954826781017526, "grad_norm": 0.53125, "learning_rate": 0.0019937878202824543, "loss": 0.1699, "step": 4556 }, { "epoch": 0.03955694829037942, "grad_norm": 0.5625, "learning_rate": 0.0019937843289226736, "loss": 0.2188, "step": 4557 }, { "epoch": 0.03956562877058359, "grad_norm": 0.208984375, "learning_rate": 0.0019937808365854633, "loss": 0.167, "step": 4558 }, { "epoch": 0.039574309250787754, "grad_norm": 0.068359375, "learning_rate": 0.001993777343270828, "loss": 0.1426, "step": 4559 }, { "epoch": 0.03958298973099192, "grad_norm": 0.2734375, "learning_rate": 0.0019937738489787713, "loss": 0.168, "step": 4560 }, { "epoch": 0.039591670211196084, "grad_norm": 0.251953125, "learning_rate": 0.001993770353709297, "loss": 0.1758, "step": 4561 }, { "epoch": 0.03960035069140025, "grad_norm": 0.14453125, "learning_rate": 0.0019937668574624085, "loss": 0.1895, "step": 4562 }, { "epoch": 0.039609031171604414, "grad_norm": 1.1953125, "learning_rate": 0.0019937633602381106, "loss": 0.3926, "step": 4563 }, { "epoch": 0.03961771165180858, "grad_norm": 0.89453125, "learning_rate": 0.001993759862036406, "loss": 0.1543, "step": 4564 }, { "epoch": 0.039626392132012744, "grad_norm": 0.765625, "learning_rate": 0.001993756362857299, "loss": 0.1494, "step": 4565 }, { "epoch": 0.03963507261221691, "grad_norm": 0.2197265625, "learning_rate": 0.0019937528627007937, "loss": 0.1943, "step": 4566 }, { "epoch": 0.039643753092421075, "grad_norm": 0.28125, "learning_rate": 0.001993749361566894, "loss": 0.1777, "step": 4567 }, { "epoch": 0.03965243357262524, "grad_norm": 0.220703125, "learning_rate": 0.001993745859455603, "loss": 0.1523, "step": 4568 }, { "epoch": 0.039661114052829405, "grad_norm": 0.35546875, "learning_rate": 0.001993742356366925, "loss": 0.1533, "step": 4569 }, { "epoch": 0.03966979453303357, "grad_norm": 0.65234375, "learning_rate": 0.0019937388523008637, "loss": 0.1895, "step": 4570 }, { "epoch": 0.039678475013237735, "grad_norm": 0.259765625, "learning_rate": 0.0019937353472574233, "loss": 0.1328, "step": 4571 }, { "epoch": 0.0396871554934419, "grad_norm": 0.3984375, "learning_rate": 0.001993731841236607, "loss": 0.1631, "step": 4572 }, { "epoch": 0.039695835973646065, "grad_norm": 0.220703125, "learning_rate": 0.0019937283342384192, "loss": 0.1348, "step": 4573 }, { "epoch": 0.03970451645385023, "grad_norm": 0.123046875, "learning_rate": 0.0019937248262628634, "loss": 0.1484, "step": 4574 }, { "epoch": 0.03971319693405439, "grad_norm": 0.0849609375, "learning_rate": 0.001993721317309944, "loss": 0.1621, "step": 4575 }, { "epoch": 0.039721877414258554, "grad_norm": 0.2890625, "learning_rate": 0.001993717807379664, "loss": 0.1973, "step": 4576 }, { "epoch": 0.03973055789446272, "grad_norm": 0.23046875, "learning_rate": 0.0019937142964720276, "loss": 0.1357, "step": 4577 }, { "epoch": 0.039739238374666884, "grad_norm": 0.126953125, "learning_rate": 0.0019937107845870387, "loss": 0.1309, "step": 4578 }, { "epoch": 0.03974791885487105, "grad_norm": 0.1904296875, "learning_rate": 0.0019937072717247013, "loss": 0.1777, "step": 4579 }, { "epoch": 0.039756599335075214, "grad_norm": 0.462890625, "learning_rate": 0.001993703757885019, "loss": 0.2012, "step": 4580 }, { "epoch": 0.03976527981527938, "grad_norm": 0.337890625, "learning_rate": 0.0019937002430679956, "loss": 0.1953, "step": 4581 }, { "epoch": 0.039773960295483544, "grad_norm": 0.275390625, "learning_rate": 0.001993696727273635, "loss": 0.1533, "step": 4582 }, { "epoch": 0.03978264077568771, "grad_norm": 0.099609375, "learning_rate": 0.0019936932105019413, "loss": 0.1758, "step": 4583 }, { "epoch": 0.039791321255891875, "grad_norm": 0.15234375, "learning_rate": 0.001993689692752918, "loss": 0.1924, "step": 4584 }, { "epoch": 0.03980000173609604, "grad_norm": 0.1943359375, "learning_rate": 0.001993686174026569, "loss": 0.1426, "step": 4585 }, { "epoch": 0.039808682216300205, "grad_norm": 0.90234375, "learning_rate": 0.0019936826543228985, "loss": 0.1963, "step": 4586 }, { "epoch": 0.03981736269650437, "grad_norm": 0.494140625, "learning_rate": 0.00199367913364191, "loss": 0.1475, "step": 4587 }, { "epoch": 0.039826043176708535, "grad_norm": 0.361328125, "learning_rate": 0.001993675611983607, "loss": 0.1602, "step": 4588 }, { "epoch": 0.0398347236569127, "grad_norm": 0.09130859375, "learning_rate": 0.001993672089347994, "loss": 0.1738, "step": 4589 }, { "epoch": 0.039843404137116865, "grad_norm": 0.8359375, "learning_rate": 0.0019936685657350748, "loss": 0.209, "step": 4590 }, { "epoch": 0.03985208461732103, "grad_norm": 0.51171875, "learning_rate": 0.001993665041144853, "loss": 0.1621, "step": 4591 }, { "epoch": 0.039860765097525196, "grad_norm": 0.09033203125, "learning_rate": 0.0019936615155773324, "loss": 0.1582, "step": 4592 }, { "epoch": 0.03986944557772936, "grad_norm": 0.33984375, "learning_rate": 0.001993657989032517, "loss": 0.1641, "step": 4593 }, { "epoch": 0.039878126057933526, "grad_norm": 0.11181640625, "learning_rate": 0.001993654461510411, "loss": 0.1416, "step": 4594 }, { "epoch": 0.03988680653813769, "grad_norm": 0.1689453125, "learning_rate": 0.001993650933011018, "loss": 0.1562, "step": 4595 }, { "epoch": 0.039895487018341856, "grad_norm": 0.283203125, "learning_rate": 0.0019936474035343412, "loss": 0.1514, "step": 4596 }, { "epoch": 0.03990416749854602, "grad_norm": 0.095703125, "learning_rate": 0.0019936438730803856, "loss": 0.1826, "step": 4597 }, { "epoch": 0.039912847978750186, "grad_norm": 0.69921875, "learning_rate": 0.001993640341649154, "loss": 0.1719, "step": 4598 }, { "epoch": 0.03992152845895435, "grad_norm": 0.1318359375, "learning_rate": 0.0019936368092406506, "loss": 0.1895, "step": 4599 }, { "epoch": 0.03993020893915852, "grad_norm": 0.1845703125, "learning_rate": 0.00199363327585488, "loss": 0.1699, "step": 4600 }, { "epoch": 0.03993888941936268, "grad_norm": 0.09033203125, "learning_rate": 0.001993629741491845, "loss": 0.1641, "step": 4601 }, { "epoch": 0.03994756989956685, "grad_norm": 0.283203125, "learning_rate": 0.0019936262061515503, "loss": 0.1543, "step": 4602 }, { "epoch": 0.03995625037977101, "grad_norm": 0.77734375, "learning_rate": 0.001993622669833999, "loss": 0.1709, "step": 4603 }, { "epoch": 0.03996493085997518, "grad_norm": 0.365234375, "learning_rate": 0.001993619132539196, "loss": 0.1602, "step": 4604 }, { "epoch": 0.039973611340179335, "grad_norm": 0.5, "learning_rate": 0.001993615594267144, "loss": 0.1592, "step": 4605 }, { "epoch": 0.0399822918203835, "grad_norm": 0.427734375, "learning_rate": 0.0019936120550178476, "loss": 0.1426, "step": 4606 }, { "epoch": 0.039990972300587665, "grad_norm": 0.1962890625, "learning_rate": 0.0019936085147913107, "loss": 0.1895, "step": 4607 }, { "epoch": 0.03999965278079183, "grad_norm": 0.08349609375, "learning_rate": 0.001993604973587537, "loss": 0.1387, "step": 4608 }, { "epoch": 0.040008333260995996, "grad_norm": 0.31640625, "learning_rate": 0.0019936014314065297, "loss": 0.2656, "step": 4609 }, { "epoch": 0.04001701374120016, "grad_norm": 0.197265625, "learning_rate": 0.0019935978882482937, "loss": 0.1562, "step": 4610 }, { "epoch": 0.040025694221404326, "grad_norm": 0.87890625, "learning_rate": 0.0019935943441128324, "loss": 0.1748, "step": 4611 }, { "epoch": 0.04003437470160849, "grad_norm": 0.11962890625, "learning_rate": 0.0019935907990001494, "loss": 0.1387, "step": 4612 }, { "epoch": 0.040043055181812656, "grad_norm": 0.26171875, "learning_rate": 0.0019935872529102494, "loss": 0.166, "step": 4613 }, { "epoch": 0.04005173566201682, "grad_norm": 0.416015625, "learning_rate": 0.001993583705843136, "loss": 0.1406, "step": 4614 }, { "epoch": 0.040060416142220986, "grad_norm": 0.201171875, "learning_rate": 0.0019935801577988126, "loss": 0.125, "step": 4615 }, { "epoch": 0.04006909662242515, "grad_norm": 0.373046875, "learning_rate": 0.0019935766087772833, "loss": 0.1904, "step": 4616 }, { "epoch": 0.04007777710262932, "grad_norm": 0.29296875, "learning_rate": 0.001993573058778552, "loss": 0.1797, "step": 4617 }, { "epoch": 0.04008645758283348, "grad_norm": 0.5859375, "learning_rate": 0.001993569507802623, "loss": 0.1641, "step": 4618 }, { "epoch": 0.04009513806303765, "grad_norm": 0.9609375, "learning_rate": 0.0019935659558494995, "loss": 0.1602, "step": 4619 }, { "epoch": 0.04010381854324181, "grad_norm": 0.099609375, "learning_rate": 0.001993562402919186, "loss": 0.1875, "step": 4620 }, { "epoch": 0.04011249902344598, "grad_norm": 0.69921875, "learning_rate": 0.0019935588490116855, "loss": 0.168, "step": 4621 }, { "epoch": 0.04012117950365014, "grad_norm": 0.248046875, "learning_rate": 0.001993555294127003, "loss": 0.1826, "step": 4622 }, { "epoch": 0.04012985998385431, "grad_norm": 0.380859375, "learning_rate": 0.0019935517382651414, "loss": 0.1729, "step": 4623 }, { "epoch": 0.04013854046405847, "grad_norm": 0.19140625, "learning_rate": 0.0019935481814261054, "loss": 0.1543, "step": 4624 }, { "epoch": 0.04014722094426264, "grad_norm": 0.15234375, "learning_rate": 0.0019935446236098984, "loss": 0.1934, "step": 4625 }, { "epoch": 0.0401559014244668, "grad_norm": 0.5, "learning_rate": 0.0019935410648165247, "loss": 0.1523, "step": 4626 }, { "epoch": 0.04016458190467097, "grad_norm": 0.2353515625, "learning_rate": 0.0019935375050459873, "loss": 0.1572, "step": 4627 }, { "epoch": 0.04017326238487513, "grad_norm": 0.66796875, "learning_rate": 0.001993533944298291, "loss": 0.1699, "step": 4628 }, { "epoch": 0.0401819428650793, "grad_norm": 0.28125, "learning_rate": 0.00199353038257344, "loss": 0.1738, "step": 4629 }, { "epoch": 0.04019062334528346, "grad_norm": 0.4453125, "learning_rate": 0.0019935268198714366, "loss": 0.2031, "step": 4630 }, { "epoch": 0.04019930382548763, "grad_norm": 0.34375, "learning_rate": 0.0019935232561922867, "loss": 0.123, "step": 4631 }, { "epoch": 0.040207984305691793, "grad_norm": 0.458984375, "learning_rate": 0.001993519691535992, "loss": 0.1406, "step": 4632 }, { "epoch": 0.04021666478589596, "grad_norm": 0.10888671875, "learning_rate": 0.0019935161259025586, "loss": 0.1797, "step": 4633 }, { "epoch": 0.040225345266100124, "grad_norm": 0.10498046875, "learning_rate": 0.0019935125592919893, "loss": 0.1367, "step": 4634 }, { "epoch": 0.04023402574630429, "grad_norm": 0.09033203125, "learning_rate": 0.001993508991704288, "loss": 0.165, "step": 4635 }, { "epoch": 0.04024270622650845, "grad_norm": 0.09521484375, "learning_rate": 0.0019935054231394584, "loss": 0.1797, "step": 4636 }, { "epoch": 0.04025138670671261, "grad_norm": 0.43359375, "learning_rate": 0.0019935018535975047, "loss": 0.1475, "step": 4637 }, { "epoch": 0.04026006718691678, "grad_norm": 0.1650390625, "learning_rate": 0.001993498283078431, "loss": 0.1934, "step": 4638 }, { "epoch": 0.04026874766712094, "grad_norm": 0.11376953125, "learning_rate": 0.0019934947115822408, "loss": 0.1621, "step": 4639 }, { "epoch": 0.04027742814732511, "grad_norm": 0.2060546875, "learning_rate": 0.0019934911391089384, "loss": 0.168, "step": 4640 }, { "epoch": 0.04028610862752927, "grad_norm": 0.75390625, "learning_rate": 0.0019934875656585273, "loss": 0.2168, "step": 4641 }, { "epoch": 0.04029478910773344, "grad_norm": 0.30078125, "learning_rate": 0.001993483991231012, "loss": 0.2188, "step": 4642 }, { "epoch": 0.0403034695879376, "grad_norm": 0.921875, "learning_rate": 0.0019934804158263956, "loss": 0.1855, "step": 4643 }, { "epoch": 0.04031215006814177, "grad_norm": 0.42578125, "learning_rate": 0.0019934768394446827, "loss": 0.2334, "step": 4644 }, { "epoch": 0.04032083054834593, "grad_norm": 0.83984375, "learning_rate": 0.0019934732620858773, "loss": 0.1328, "step": 4645 }, { "epoch": 0.0403295110285501, "grad_norm": 0.29296875, "learning_rate": 0.0019934696837499823, "loss": 0.1699, "step": 4646 }, { "epoch": 0.04033819150875426, "grad_norm": 0.283203125, "learning_rate": 0.0019934661044370026, "loss": 0.1982, "step": 4647 }, { "epoch": 0.04034687198895843, "grad_norm": 0.212890625, "learning_rate": 0.0019934625241469417, "loss": 0.1602, "step": 4648 }, { "epoch": 0.040355552469162594, "grad_norm": 0.265625, "learning_rate": 0.0019934589428798038, "loss": 0.1328, "step": 4649 }, { "epoch": 0.04036423294936676, "grad_norm": 0.265625, "learning_rate": 0.0019934553606355924, "loss": 0.1523, "step": 4650 }, { "epoch": 0.040372913429570924, "grad_norm": 0.232421875, "learning_rate": 0.0019934517774143116, "loss": 0.1475, "step": 4651 }, { "epoch": 0.04038159390977509, "grad_norm": 0.10693359375, "learning_rate": 0.0019934481932159655, "loss": 0.1416, "step": 4652 }, { "epoch": 0.040390274389979254, "grad_norm": 0.2294921875, "learning_rate": 0.0019934446080405576, "loss": 0.1914, "step": 4653 }, { "epoch": 0.04039895487018342, "grad_norm": 0.353515625, "learning_rate": 0.0019934410218880928, "loss": 0.1602, "step": 4654 }, { "epoch": 0.040407635350387584, "grad_norm": 0.451171875, "learning_rate": 0.0019934374347585736, "loss": 0.1699, "step": 4655 }, { "epoch": 0.04041631583059175, "grad_norm": 0.5546875, "learning_rate": 0.001993433846652005, "loss": 0.1592, "step": 4656 }, { "epoch": 0.040424996310795915, "grad_norm": 0.77734375, "learning_rate": 0.0019934302575683903, "loss": 0.1816, "step": 4657 }, { "epoch": 0.04043367679100008, "grad_norm": 0.279296875, "learning_rate": 0.001993426667507734, "loss": 0.1504, "step": 4658 }, { "epoch": 0.040442357271204245, "grad_norm": 0.359375, "learning_rate": 0.0019934230764700397, "loss": 0.1182, "step": 4659 }, { "epoch": 0.04045103775140841, "grad_norm": 0.10498046875, "learning_rate": 0.001993419484455311, "loss": 0.1445, "step": 4660 }, { "epoch": 0.040459718231612575, "grad_norm": 0.39453125, "learning_rate": 0.001993415891463553, "loss": 0.1484, "step": 4661 }, { "epoch": 0.04046839871181674, "grad_norm": 0.236328125, "learning_rate": 0.0019934122974947684, "loss": 0.1396, "step": 4662 }, { "epoch": 0.040477079192020905, "grad_norm": 0.271484375, "learning_rate": 0.001993408702548961, "loss": 0.1875, "step": 4663 }, { "epoch": 0.04048575967222507, "grad_norm": 0.5703125, "learning_rate": 0.0019934051066261356, "loss": 0.2188, "step": 4664 }, { "epoch": 0.040494440152429236, "grad_norm": 0.353515625, "learning_rate": 0.001993401509726296, "loss": 0.1592, "step": 4665 }, { "epoch": 0.0405031206326334, "grad_norm": 0.74609375, "learning_rate": 0.0019933979118494454, "loss": 0.1426, "step": 4666 }, { "epoch": 0.04051180111283756, "grad_norm": 0.314453125, "learning_rate": 0.001993394312995589, "loss": 0.1367, "step": 4667 }, { "epoch": 0.040520481593041724, "grad_norm": 0.4140625, "learning_rate": 0.0019933907131647294, "loss": 0.1885, "step": 4668 }, { "epoch": 0.04052916207324589, "grad_norm": 0.130859375, "learning_rate": 0.0019933871123568713, "loss": 0.1455, "step": 4669 }, { "epoch": 0.040537842553450054, "grad_norm": 0.19140625, "learning_rate": 0.0019933835105720187, "loss": 0.1611, "step": 4670 }, { "epoch": 0.04054652303365422, "grad_norm": 0.3671875, "learning_rate": 0.001993379907810175, "loss": 0.2148, "step": 4671 }, { "epoch": 0.040555203513858384, "grad_norm": 0.271484375, "learning_rate": 0.0019933763040713448, "loss": 0.1768, "step": 4672 }, { "epoch": 0.04056388399406255, "grad_norm": 0.423828125, "learning_rate": 0.0019933726993555316, "loss": 0.1069, "step": 4673 }, { "epoch": 0.040572564474266715, "grad_norm": 0.12255859375, "learning_rate": 0.0019933690936627395, "loss": 0.2051, "step": 4674 }, { "epoch": 0.04058124495447088, "grad_norm": 0.478515625, "learning_rate": 0.001993365486992972, "loss": 0.1787, "step": 4675 }, { "epoch": 0.040589925434675045, "grad_norm": 0.490234375, "learning_rate": 0.0019933618793462338, "loss": 0.1758, "step": 4676 }, { "epoch": 0.04059860591487921, "grad_norm": 0.193359375, "learning_rate": 0.0019933582707225284, "loss": 0.1436, "step": 4677 }, { "epoch": 0.040607286395083375, "grad_norm": 0.4765625, "learning_rate": 0.0019933546611218596, "loss": 0.1426, "step": 4678 }, { "epoch": 0.04061596687528754, "grad_norm": 0.19140625, "learning_rate": 0.0019933510505442315, "loss": 0.2168, "step": 4679 }, { "epoch": 0.040624647355491705, "grad_norm": 0.095703125, "learning_rate": 0.001993347438989649, "loss": 0.165, "step": 4680 }, { "epoch": 0.04063332783569587, "grad_norm": 0.22265625, "learning_rate": 0.001993343826458114, "loss": 0.1104, "step": 4681 }, { "epoch": 0.040642008315900036, "grad_norm": 0.76953125, "learning_rate": 0.0019933402129496324, "loss": 0.1963, "step": 4682 }, { "epoch": 0.0406506887961042, "grad_norm": 0.404296875, "learning_rate": 0.001993336598464207, "loss": 0.1777, "step": 4683 }, { "epoch": 0.040659369276308366, "grad_norm": 0.30859375, "learning_rate": 0.0019933329830018423, "loss": 0.1992, "step": 4684 }, { "epoch": 0.04066804975651253, "grad_norm": 0.44921875, "learning_rate": 0.001993329366562542, "loss": 0.1553, "step": 4685 }, { "epoch": 0.040676730236716696, "grad_norm": 0.09814453125, "learning_rate": 0.00199332574914631, "loss": 0.165, "step": 4686 }, { "epoch": 0.04068541071692086, "grad_norm": 0.82421875, "learning_rate": 0.00199332213075315, "loss": 0.2129, "step": 4687 }, { "epoch": 0.040694091197125026, "grad_norm": 0.10205078125, "learning_rate": 0.0019933185113830674, "loss": 0.1543, "step": 4688 }, { "epoch": 0.04070277167732919, "grad_norm": 0.63671875, "learning_rate": 0.0019933148910360643, "loss": 0.1465, "step": 4689 }, { "epoch": 0.04071145215753336, "grad_norm": 0.0830078125, "learning_rate": 0.0019933112697121456, "loss": 0.1445, "step": 4690 }, { "epoch": 0.04072013263773752, "grad_norm": 0.6171875, "learning_rate": 0.0019933076474113152, "loss": 0.1973, "step": 4691 }, { "epoch": 0.04072881311794169, "grad_norm": 1.1640625, "learning_rate": 0.001993304024133577, "loss": 0.1992, "step": 4692 }, { "epoch": 0.04073749359814585, "grad_norm": 0.79296875, "learning_rate": 0.001993300399878935, "loss": 0.1553, "step": 4693 }, { "epoch": 0.04074617407835002, "grad_norm": 0.59375, "learning_rate": 0.001993296774647393, "loss": 0.1699, "step": 4694 }, { "epoch": 0.04075485455855418, "grad_norm": 0.578125, "learning_rate": 0.001993293148438955, "loss": 0.1602, "step": 4695 }, { "epoch": 0.04076353503875835, "grad_norm": 0.0849609375, "learning_rate": 0.001993289521253625, "loss": 0.124, "step": 4696 }, { "epoch": 0.04077221551896251, "grad_norm": 0.76171875, "learning_rate": 0.0019932858930914073, "loss": 0.1895, "step": 4697 }, { "epoch": 0.04078089599916667, "grad_norm": 0.65234375, "learning_rate": 0.001993282263952305, "loss": 0.1572, "step": 4698 }, { "epoch": 0.040789576479370836, "grad_norm": 1.109375, "learning_rate": 0.0019932786338363235, "loss": 0.1357, "step": 4699 }, { "epoch": 0.040798256959575, "grad_norm": 1.1796875, "learning_rate": 0.0019932750027434653, "loss": 0.1758, "step": 4700 }, { "epoch": 0.040806937439779166, "grad_norm": 0.578125, "learning_rate": 0.001993271370673735, "loss": 0.1445, "step": 4701 }, { "epoch": 0.04081561791998333, "grad_norm": 0.62109375, "learning_rate": 0.001993267737627137, "loss": 0.1426, "step": 4702 }, { "epoch": 0.040824298400187496, "grad_norm": 0.1962890625, "learning_rate": 0.0019932641036036745, "loss": 0.1494, "step": 4703 }, { "epoch": 0.04083297888039166, "grad_norm": 0.197265625, "learning_rate": 0.0019932604686033516, "loss": 0.1777, "step": 4704 }, { "epoch": 0.040841659360595826, "grad_norm": 0.474609375, "learning_rate": 0.0019932568326261725, "loss": 0.1582, "step": 4705 }, { "epoch": 0.04085033984079999, "grad_norm": 0.345703125, "learning_rate": 0.0019932531956721416, "loss": 0.1768, "step": 4706 }, { "epoch": 0.04085902032100416, "grad_norm": 0.1015625, "learning_rate": 0.001993249557741262, "loss": 0.2207, "step": 4707 }, { "epoch": 0.04086770080120832, "grad_norm": 0.50390625, "learning_rate": 0.001993245918833538, "loss": 0.1387, "step": 4708 }, { "epoch": 0.04087638128141249, "grad_norm": 0.11669921875, "learning_rate": 0.0019932422789489743, "loss": 0.1738, "step": 4709 }, { "epoch": 0.04088506176161665, "grad_norm": 0.1123046875, "learning_rate": 0.0019932386380875737, "loss": 0.1562, "step": 4710 }, { "epoch": 0.04089374224182082, "grad_norm": 0.1455078125, "learning_rate": 0.001993234996249341, "loss": 0.166, "step": 4711 }, { "epoch": 0.04090242272202498, "grad_norm": 0.318359375, "learning_rate": 0.00199323135343428, "loss": 0.1484, "step": 4712 }, { "epoch": 0.04091110320222915, "grad_norm": 0.099609375, "learning_rate": 0.0019932277096423945, "loss": 0.1641, "step": 4713 }, { "epoch": 0.04091978368243331, "grad_norm": 0.3046875, "learning_rate": 0.0019932240648736885, "loss": 0.1641, "step": 4714 }, { "epoch": 0.04092846416263748, "grad_norm": 0.111328125, "learning_rate": 0.001993220419128166, "loss": 0.1543, "step": 4715 }, { "epoch": 0.04093714464284164, "grad_norm": 0.1845703125, "learning_rate": 0.001993216772405831, "loss": 0.1689, "step": 4716 }, { "epoch": 0.04094582512304581, "grad_norm": 1.703125, "learning_rate": 0.001993213124706688, "loss": 0.1895, "step": 4717 }, { "epoch": 0.04095450560324997, "grad_norm": 0.71484375, "learning_rate": 0.0019932094760307406, "loss": 0.1748, "step": 4718 }, { "epoch": 0.04096318608345414, "grad_norm": 0.59765625, "learning_rate": 0.0019932058263779926, "loss": 0.1367, "step": 4719 }, { "epoch": 0.0409718665636583, "grad_norm": 0.12060546875, "learning_rate": 0.001993202175748448, "loss": 0.167, "step": 4720 }, { "epoch": 0.04098054704386247, "grad_norm": 0.337890625, "learning_rate": 0.0019931985241421106, "loss": 0.1562, "step": 4721 }, { "epoch": 0.04098922752406663, "grad_norm": 0.310546875, "learning_rate": 0.001993194871558985, "loss": 0.1855, "step": 4722 }, { "epoch": 0.0409979080042708, "grad_norm": 0.9921875, "learning_rate": 0.001993191217999075, "loss": 0.1562, "step": 4723 }, { "epoch": 0.041006588484474964, "grad_norm": 0.6015625, "learning_rate": 0.0019931875634623844, "loss": 0.1475, "step": 4724 }, { "epoch": 0.04101526896467913, "grad_norm": 0.447265625, "learning_rate": 0.0019931839079489174, "loss": 0.1504, "step": 4725 }, { "epoch": 0.041023949444883294, "grad_norm": 0.06884765625, "learning_rate": 0.0019931802514586774, "loss": 0.1196, "step": 4726 }, { "epoch": 0.04103262992508746, "grad_norm": 0.07470703125, "learning_rate": 0.0019931765939916694, "loss": 0.1475, "step": 4727 }, { "epoch": 0.041041310405291624, "grad_norm": 0.404296875, "learning_rate": 0.001993172935547897, "loss": 0.1777, "step": 4728 }, { "epoch": 0.04104999088549578, "grad_norm": 0.09130859375, "learning_rate": 0.001993169276127364, "loss": 0.1387, "step": 4729 }, { "epoch": 0.04105867136569995, "grad_norm": 0.10205078125, "learning_rate": 0.0019931656157300744, "loss": 0.1455, "step": 4730 }, { "epoch": 0.04106735184590411, "grad_norm": 0.3515625, "learning_rate": 0.0019931619543560324, "loss": 0.1475, "step": 4731 }, { "epoch": 0.04107603232610828, "grad_norm": 0.35546875, "learning_rate": 0.0019931582920052417, "loss": 0.1436, "step": 4732 }, { "epoch": 0.04108471280631244, "grad_norm": 0.3203125, "learning_rate": 0.0019931546286777067, "loss": 0.1475, "step": 4733 }, { "epoch": 0.04109339328651661, "grad_norm": 0.40625, "learning_rate": 0.0019931509643734313, "loss": 0.1641, "step": 4734 }, { "epoch": 0.04110207376672077, "grad_norm": 0.0830078125, "learning_rate": 0.001993147299092419, "loss": 0.1367, "step": 4735 }, { "epoch": 0.04111075424692494, "grad_norm": 0.09326171875, "learning_rate": 0.001993143632834675, "loss": 0.1729, "step": 4736 }, { "epoch": 0.0411194347271291, "grad_norm": 0.2265625, "learning_rate": 0.001993139965600202, "loss": 0.1875, "step": 4737 }, { "epoch": 0.04112811520733327, "grad_norm": 0.34375, "learning_rate": 0.0019931362973890044, "loss": 0.1758, "step": 4738 }, { "epoch": 0.041136795687537434, "grad_norm": 0.28125, "learning_rate": 0.0019931326282010865, "loss": 0.2266, "step": 4739 }, { "epoch": 0.0411454761677416, "grad_norm": 0.58984375, "learning_rate": 0.0019931289580364525, "loss": 0.1875, "step": 4740 }, { "epoch": 0.041154156647945764, "grad_norm": 0.197265625, "learning_rate": 0.001993125286895106, "loss": 0.1699, "step": 4741 }, { "epoch": 0.04116283712814993, "grad_norm": 0.1728515625, "learning_rate": 0.0019931216147770505, "loss": 0.1475, "step": 4742 }, { "epoch": 0.041171517608354094, "grad_norm": 0.57421875, "learning_rate": 0.0019931179416822916, "loss": 0.1631, "step": 4743 }, { "epoch": 0.04118019808855826, "grad_norm": 0.3515625, "learning_rate": 0.0019931142676108318, "loss": 0.1689, "step": 4744 }, { "epoch": 0.041188878568762424, "grad_norm": 0.09423828125, "learning_rate": 0.0019931105925626753, "loss": 0.1543, "step": 4745 }, { "epoch": 0.04119755904896659, "grad_norm": 0.244140625, "learning_rate": 0.0019931069165378267, "loss": 0.1309, "step": 4746 }, { "epoch": 0.041206239529170754, "grad_norm": 1.46875, "learning_rate": 0.00199310323953629, "loss": 0.3652, "step": 4747 }, { "epoch": 0.04121492000937492, "grad_norm": 0.72265625, "learning_rate": 0.001993099561558069, "loss": 0.1533, "step": 4748 }, { "epoch": 0.041223600489579085, "grad_norm": 0.46875, "learning_rate": 0.001993095882603168, "loss": 0.1562, "step": 4749 }, { "epoch": 0.04123228096978325, "grad_norm": 0.0888671875, "learning_rate": 0.00199309220267159, "loss": 0.1572, "step": 4750 }, { "epoch": 0.041240961449987415, "grad_norm": 0.380859375, "learning_rate": 0.0019930885217633405, "loss": 0.1826, "step": 4751 }, { "epoch": 0.04124964193019158, "grad_norm": 0.515625, "learning_rate": 0.001993084839878423, "loss": 0.1426, "step": 4752 }, { "epoch": 0.041258322410395745, "grad_norm": 0.828125, "learning_rate": 0.0019930811570168403, "loss": 0.1582, "step": 4753 }, { "epoch": 0.04126700289059991, "grad_norm": 0.14453125, "learning_rate": 0.0019930774731785985, "loss": 0.1729, "step": 4754 }, { "epoch": 0.041275683370804075, "grad_norm": 0.34765625, "learning_rate": 0.0019930737883637, "loss": 0.1387, "step": 4755 }, { "epoch": 0.04128436385100824, "grad_norm": 1.015625, "learning_rate": 0.0019930701025721496, "loss": 0.1572, "step": 4756 }, { "epoch": 0.041293044331212406, "grad_norm": 0.0888671875, "learning_rate": 0.001993066415803951, "loss": 0.1533, "step": 4757 }, { "epoch": 0.04130172481141657, "grad_norm": 0.12353515625, "learning_rate": 0.0019930627280591085, "loss": 0.1934, "step": 4758 }, { "epoch": 0.04131040529162073, "grad_norm": 0.09423828125, "learning_rate": 0.001993059039337626, "loss": 0.1826, "step": 4759 }, { "epoch": 0.041319085771824894, "grad_norm": 0.173828125, "learning_rate": 0.001993055349639508, "loss": 0.1602, "step": 4760 }, { "epoch": 0.04132776625202906, "grad_norm": 0.490234375, "learning_rate": 0.0019930516589647574, "loss": 0.1719, "step": 4761 }, { "epoch": 0.041336446732233224, "grad_norm": 0.162109375, "learning_rate": 0.001993047967313379, "loss": 0.127, "step": 4762 }, { "epoch": 0.04134512721243739, "grad_norm": 0.1943359375, "learning_rate": 0.001993044274685377, "loss": 0.1396, "step": 4763 }, { "epoch": 0.041353807692641555, "grad_norm": 0.310546875, "learning_rate": 0.0019930405810807553, "loss": 0.1504, "step": 4764 }, { "epoch": 0.04136248817284572, "grad_norm": 0.58984375, "learning_rate": 0.001993036886499518, "loss": 0.1348, "step": 4765 }, { "epoch": 0.041371168653049885, "grad_norm": 0.259765625, "learning_rate": 0.0019930331909416682, "loss": 0.1611, "step": 4766 }, { "epoch": 0.04137984913325405, "grad_norm": 0.453125, "learning_rate": 0.0019930294944072117, "loss": 0.1572, "step": 4767 }, { "epoch": 0.041388529613458215, "grad_norm": 0.26171875, "learning_rate": 0.001993025796896151, "loss": 0.1611, "step": 4768 }, { "epoch": 0.04139721009366238, "grad_norm": 0.3828125, "learning_rate": 0.0019930220984084902, "loss": 0.1465, "step": 4769 }, { "epoch": 0.041405890573866545, "grad_norm": 0.7421875, "learning_rate": 0.001993018398944235, "loss": 0.1602, "step": 4770 }, { "epoch": 0.04141457105407071, "grad_norm": 0.6953125, "learning_rate": 0.0019930146985033875, "loss": 0.1484, "step": 4771 }, { "epoch": 0.041423251534274876, "grad_norm": 0.244140625, "learning_rate": 0.001993010997085953, "loss": 0.1436, "step": 4772 }, { "epoch": 0.04143193201447904, "grad_norm": 0.3359375, "learning_rate": 0.0019930072946919347, "loss": 0.1377, "step": 4773 }, { "epoch": 0.041440612494683206, "grad_norm": 0.3671875, "learning_rate": 0.001993003591321337, "loss": 0.1553, "step": 4774 }, { "epoch": 0.04144929297488737, "grad_norm": 0.484375, "learning_rate": 0.0019929998869741643, "loss": 0.1572, "step": 4775 }, { "epoch": 0.041457973455091536, "grad_norm": 0.09033203125, "learning_rate": 0.0019929961816504203, "loss": 0.1719, "step": 4776 }, { "epoch": 0.0414666539352957, "grad_norm": 0.46875, "learning_rate": 0.001992992475350109, "loss": 0.1367, "step": 4777 }, { "epoch": 0.041475334415499866, "grad_norm": 0.26953125, "learning_rate": 0.0019929887680732346, "loss": 0.1426, "step": 4778 }, { "epoch": 0.04148401489570403, "grad_norm": 0.294921875, "learning_rate": 0.001992985059819801, "loss": 0.1338, "step": 4779 }, { "epoch": 0.041492695375908197, "grad_norm": 0.23828125, "learning_rate": 0.0019929813505898124, "loss": 0.1475, "step": 4780 }, { "epoch": 0.04150137585611236, "grad_norm": 0.376953125, "learning_rate": 0.001992977640383273, "loss": 0.1465, "step": 4781 }, { "epoch": 0.04151005633631653, "grad_norm": 0.17578125, "learning_rate": 0.0019929739292001863, "loss": 0.1777, "step": 4782 }, { "epoch": 0.04151873681652069, "grad_norm": 0.09912109375, "learning_rate": 0.0019929702170405567, "loss": 0.1318, "step": 4783 }, { "epoch": 0.04152741729672486, "grad_norm": 0.08837890625, "learning_rate": 0.001992966503904389, "loss": 0.1504, "step": 4784 }, { "epoch": 0.04153609777692902, "grad_norm": 0.2265625, "learning_rate": 0.0019929627897916856, "loss": 0.1855, "step": 4785 }, { "epoch": 0.04154477825713319, "grad_norm": 0.244140625, "learning_rate": 0.001992959074702452, "loss": 0.1211, "step": 4786 }, { "epoch": 0.04155345873733735, "grad_norm": 0.435546875, "learning_rate": 0.0019929553586366918, "loss": 0.1807, "step": 4787 }, { "epoch": 0.04156213921754152, "grad_norm": 0.12890625, "learning_rate": 0.0019929516415944093, "loss": 0.209, "step": 4788 }, { "epoch": 0.04157081969774568, "grad_norm": 0.29296875, "learning_rate": 0.001992947923575608, "loss": 0.1523, "step": 4789 }, { "epoch": 0.04157950017794984, "grad_norm": 0.36328125, "learning_rate": 0.0019929442045802923, "loss": 0.1504, "step": 4790 }, { "epoch": 0.041588180658154006, "grad_norm": 0.1845703125, "learning_rate": 0.001992940484608466, "loss": 0.1182, "step": 4791 }, { "epoch": 0.04159686113835817, "grad_norm": 0.435546875, "learning_rate": 0.0019929367636601332, "loss": 0.2051, "step": 4792 }, { "epoch": 0.041605541618562336, "grad_norm": 0.169921875, "learning_rate": 0.001992933041735299, "loss": 0.1167, "step": 4793 }, { "epoch": 0.0416142220987665, "grad_norm": 0.158203125, "learning_rate": 0.0019929293188339662, "loss": 0.1338, "step": 4794 }, { "epoch": 0.041622902578970666, "grad_norm": 0.125, "learning_rate": 0.001992925594956139, "loss": 0.1445, "step": 4795 }, { "epoch": 0.04163158305917483, "grad_norm": 0.1884765625, "learning_rate": 0.001992921870101822, "loss": 0.1221, "step": 4796 }, { "epoch": 0.041640263539379, "grad_norm": 0.52734375, "learning_rate": 0.0019929181442710194, "loss": 0.1758, "step": 4797 }, { "epoch": 0.04164894401958316, "grad_norm": 0.39453125, "learning_rate": 0.0019929144174637347, "loss": 0.1758, "step": 4798 }, { "epoch": 0.04165762449978733, "grad_norm": 0.1298828125, "learning_rate": 0.001992910689679972, "loss": 0.1436, "step": 4799 }, { "epoch": 0.04166630497999149, "grad_norm": 0.330078125, "learning_rate": 0.0019929069609197357, "loss": 0.1855, "step": 4800 }, { "epoch": 0.04167498546019566, "grad_norm": 0.12890625, "learning_rate": 0.00199290323118303, "loss": 0.1196, "step": 4801 }, { "epoch": 0.04168366594039982, "grad_norm": 0.375, "learning_rate": 0.0019928995004698585, "loss": 0.1826, "step": 4802 }, { "epoch": 0.04169234642060399, "grad_norm": 0.138671875, "learning_rate": 0.001992895768780226, "loss": 0.1387, "step": 4803 }, { "epoch": 0.04170102690080815, "grad_norm": 0.453125, "learning_rate": 0.0019928920361141356, "loss": 0.1514, "step": 4804 }, { "epoch": 0.04170970738101232, "grad_norm": 0.1123046875, "learning_rate": 0.001992888302471592, "loss": 0.1055, "step": 4805 }, { "epoch": 0.04171838786121648, "grad_norm": 0.59765625, "learning_rate": 0.001992884567852599, "loss": 0.1543, "step": 4806 }, { "epoch": 0.04172706834142065, "grad_norm": 0.1875, "learning_rate": 0.001992880832257161, "loss": 0.1826, "step": 4807 }, { "epoch": 0.04173574882162481, "grad_norm": 0.32421875, "learning_rate": 0.001992877095685282, "loss": 0.1699, "step": 4808 }, { "epoch": 0.04174442930182898, "grad_norm": 0.1123046875, "learning_rate": 0.001992873358136966, "loss": 0.126, "step": 4809 }, { "epoch": 0.04175310978203314, "grad_norm": 0.96875, "learning_rate": 0.001992869619612217, "loss": 0.1582, "step": 4810 }, { "epoch": 0.04176179026223731, "grad_norm": 0.1787109375, "learning_rate": 0.0019928658801110395, "loss": 0.1855, "step": 4811 }, { "epoch": 0.04177047074244147, "grad_norm": 0.4140625, "learning_rate": 0.001992862139633437, "loss": 0.1465, "step": 4812 }, { "epoch": 0.04177915122264564, "grad_norm": 0.33984375, "learning_rate": 0.001992858398179414, "loss": 0.1719, "step": 4813 }, { "epoch": 0.041787831702849804, "grad_norm": 0.2177734375, "learning_rate": 0.0019928546557489743, "loss": 0.1484, "step": 4814 }, { "epoch": 0.04179651218305397, "grad_norm": 2.34375, "learning_rate": 0.0019928509123421224, "loss": 0.3672, "step": 4815 }, { "epoch": 0.041805192663258134, "grad_norm": 0.09326171875, "learning_rate": 0.001992847167958862, "loss": 0.1436, "step": 4816 }, { "epoch": 0.0418138731434623, "grad_norm": 0.51171875, "learning_rate": 0.0019928434225991976, "loss": 0.166, "step": 4817 }, { "epoch": 0.041822553623666464, "grad_norm": 0.1708984375, "learning_rate": 0.001992839676263133, "loss": 0.2002, "step": 4818 }, { "epoch": 0.04183123410387063, "grad_norm": 0.2177734375, "learning_rate": 0.0019928359289506717, "loss": 0.126, "step": 4819 }, { "epoch": 0.041839914584074794, "grad_norm": 0.404296875, "learning_rate": 0.001992832180661819, "loss": 0.1523, "step": 4820 }, { "epoch": 0.04184859506427895, "grad_norm": 0.83984375, "learning_rate": 0.0019928284313965785, "loss": 0.1895, "step": 4821 }, { "epoch": 0.04185727554448312, "grad_norm": 0.2314453125, "learning_rate": 0.0019928246811549543, "loss": 0.1084, "step": 4822 }, { "epoch": 0.04186595602468728, "grad_norm": 0.43359375, "learning_rate": 0.00199282092993695, "loss": 0.1992, "step": 4823 }, { "epoch": 0.04187463650489145, "grad_norm": 0.0751953125, "learning_rate": 0.0019928171777425703, "loss": 0.1973, "step": 4824 }, { "epoch": 0.04188331698509561, "grad_norm": 0.2060546875, "learning_rate": 0.0019928134245718195, "loss": 0.1582, "step": 4825 }, { "epoch": 0.04189199746529978, "grad_norm": 0.251953125, "learning_rate": 0.001992809670424701, "loss": 0.1377, "step": 4826 }, { "epoch": 0.04190067794550394, "grad_norm": 0.361328125, "learning_rate": 0.001992805915301219, "loss": 0.1367, "step": 4827 }, { "epoch": 0.04190935842570811, "grad_norm": 0.54296875, "learning_rate": 0.0019928021592013783, "loss": 0.2891, "step": 4828 }, { "epoch": 0.041918038905912273, "grad_norm": 0.40625, "learning_rate": 0.001992798402125183, "loss": 0.1641, "step": 4829 }, { "epoch": 0.04192671938611644, "grad_norm": 0.45703125, "learning_rate": 0.001992794644072636, "loss": 0.1738, "step": 4830 }, { "epoch": 0.041935399866320604, "grad_norm": 0.1787109375, "learning_rate": 0.0019927908850437426, "loss": 0.21, "step": 4831 }, { "epoch": 0.04194408034652477, "grad_norm": 0.318359375, "learning_rate": 0.0019927871250385062, "loss": 0.1602, "step": 4832 }, { "epoch": 0.041952760826728934, "grad_norm": 0.189453125, "learning_rate": 0.001992783364056932, "loss": 0.2168, "step": 4833 }, { "epoch": 0.0419614413069331, "grad_norm": 0.205078125, "learning_rate": 0.001992779602099022, "loss": 0.168, "step": 4834 }, { "epoch": 0.041970121787137264, "grad_norm": 0.1484375, "learning_rate": 0.001992775839164783, "loss": 0.1172, "step": 4835 }, { "epoch": 0.04197880226734143, "grad_norm": 0.201171875, "learning_rate": 0.0019927720752542168, "loss": 0.1289, "step": 4836 }, { "epoch": 0.041987482747545594, "grad_norm": 0.61328125, "learning_rate": 0.001992768310367329, "loss": 0.168, "step": 4837 }, { "epoch": 0.04199616322774976, "grad_norm": 0.375, "learning_rate": 0.0019927645445041225, "loss": 0.2285, "step": 4838 }, { "epoch": 0.042004843707953925, "grad_norm": 0.337890625, "learning_rate": 0.001992760777664603, "loss": 0.2041, "step": 4839 }, { "epoch": 0.04201352418815809, "grad_norm": 0.146484375, "learning_rate": 0.0019927570098487732, "loss": 0.1572, "step": 4840 }, { "epoch": 0.042022204668362255, "grad_norm": 0.1455078125, "learning_rate": 0.001992753241056638, "loss": 0.1309, "step": 4841 }, { "epoch": 0.04203088514856642, "grad_norm": 0.1318359375, "learning_rate": 0.001992749471288201, "loss": 0.1543, "step": 4842 }, { "epoch": 0.042039565628770585, "grad_norm": 0.119140625, "learning_rate": 0.0019927457005434667, "loss": 0.1445, "step": 4843 }, { "epoch": 0.04204824610897475, "grad_norm": 0.63671875, "learning_rate": 0.0019927419288224392, "loss": 0.1855, "step": 4844 }, { "epoch": 0.042056926589178915, "grad_norm": 0.318359375, "learning_rate": 0.0019927381561251224, "loss": 0.2227, "step": 4845 }, { "epoch": 0.04206560706938308, "grad_norm": 0.337890625, "learning_rate": 0.001992734382451521, "loss": 0.1406, "step": 4846 }, { "epoch": 0.042074287549587246, "grad_norm": 0.56640625, "learning_rate": 0.0019927306078016383, "loss": 0.2188, "step": 4847 }, { "epoch": 0.04208296802979141, "grad_norm": 0.416015625, "learning_rate": 0.0019927268321754785, "loss": 0.1523, "step": 4848 }, { "epoch": 0.042091648509995576, "grad_norm": 0.341796875, "learning_rate": 0.0019927230555730467, "loss": 0.1338, "step": 4849 }, { "epoch": 0.04210032899019974, "grad_norm": 0.07470703125, "learning_rate": 0.001992719277994346, "loss": 0.1455, "step": 4850 }, { "epoch": 0.042109009470403906, "grad_norm": 0.45703125, "learning_rate": 0.001992715499439381, "loss": 0.1826, "step": 4851 }, { "epoch": 0.042117689950608064, "grad_norm": 0.455078125, "learning_rate": 0.0019927117199081555, "loss": 0.2012, "step": 4852 }, { "epoch": 0.04212637043081223, "grad_norm": 0.251953125, "learning_rate": 0.0019927079394006742, "loss": 0.1602, "step": 4853 }, { "epoch": 0.042135050911016395, "grad_norm": 0.13671875, "learning_rate": 0.001992704157916941, "loss": 0.1943, "step": 4854 }, { "epoch": 0.04214373139122056, "grad_norm": 0.671875, "learning_rate": 0.00199270037545696, "loss": 0.1875, "step": 4855 }, { "epoch": 0.042152411871424725, "grad_norm": 0.076171875, "learning_rate": 0.001992696592020735, "loss": 0.1592, "step": 4856 }, { "epoch": 0.04216109235162889, "grad_norm": 0.11767578125, "learning_rate": 0.0019926928076082705, "loss": 0.1855, "step": 4857 }, { "epoch": 0.042169772831833055, "grad_norm": 0.11376953125, "learning_rate": 0.0019926890222195705, "loss": 0.1709, "step": 4858 }, { "epoch": 0.04217845331203722, "grad_norm": 0.11962890625, "learning_rate": 0.001992685235854639, "loss": 0.168, "step": 4859 }, { "epoch": 0.042187133792241385, "grad_norm": 0.25, "learning_rate": 0.0019926814485134807, "loss": 0.1396, "step": 4860 }, { "epoch": 0.04219581427244555, "grad_norm": 0.53515625, "learning_rate": 0.001992677660196099, "loss": 0.1719, "step": 4861 }, { "epoch": 0.042204494752649716, "grad_norm": 0.251953125, "learning_rate": 0.001992673870902499, "loss": 0.1592, "step": 4862 }, { "epoch": 0.04221317523285388, "grad_norm": 0.158203125, "learning_rate": 0.0019926700806326835, "loss": 0.168, "step": 4863 }, { "epoch": 0.042221855713058046, "grad_norm": 0.1240234375, "learning_rate": 0.0019926662893866584, "loss": 0.1328, "step": 4864 }, { "epoch": 0.04223053619326221, "grad_norm": 0.1982421875, "learning_rate": 0.001992662497164426, "loss": 0.1406, "step": 4865 }, { "epoch": 0.042239216673466376, "grad_norm": 0.4140625, "learning_rate": 0.001992658703965992, "loss": 0.1914, "step": 4866 }, { "epoch": 0.04224789715367054, "grad_norm": 0.1455078125, "learning_rate": 0.001992654909791359, "loss": 0.1543, "step": 4867 }, { "epoch": 0.042256577633874706, "grad_norm": 0.2431640625, "learning_rate": 0.0019926511146405325, "loss": 0.1709, "step": 4868 }, { "epoch": 0.04226525811407887, "grad_norm": 0.498046875, "learning_rate": 0.001992647318513516, "loss": 0.1621, "step": 4869 }, { "epoch": 0.042273938594283036, "grad_norm": 0.12109375, "learning_rate": 0.0019926435214103143, "loss": 0.127, "step": 4870 }, { "epoch": 0.0422826190744872, "grad_norm": 0.328125, "learning_rate": 0.001992639723330931, "loss": 0.1299, "step": 4871 }, { "epoch": 0.04229129955469137, "grad_norm": 0.546875, "learning_rate": 0.0019926359242753698, "loss": 0.1934, "step": 4872 }, { "epoch": 0.04229998003489553, "grad_norm": 0.126953125, "learning_rate": 0.001992632124243635, "loss": 0.168, "step": 4873 }, { "epoch": 0.0423086605150997, "grad_norm": 0.3984375, "learning_rate": 0.0019926283232357318, "loss": 0.1807, "step": 4874 }, { "epoch": 0.04231734099530386, "grad_norm": 0.1640625, "learning_rate": 0.001992624521251664, "loss": 0.1504, "step": 4875 }, { "epoch": 0.04232602147550803, "grad_norm": 0.17578125, "learning_rate": 0.001992620718291435, "loss": 0.1875, "step": 4876 }, { "epoch": 0.04233470195571219, "grad_norm": 0.5546875, "learning_rate": 0.001992616914355049, "loss": 0.1245, "step": 4877 }, { "epoch": 0.04234338243591636, "grad_norm": 0.212890625, "learning_rate": 0.001992613109442511, "loss": 0.168, "step": 4878 }, { "epoch": 0.04235206291612052, "grad_norm": 0.46875, "learning_rate": 0.0019926093035538247, "loss": 0.2168, "step": 4879 }, { "epoch": 0.04236074339632469, "grad_norm": 0.318359375, "learning_rate": 0.0019926054966889943, "loss": 0.166, "step": 4880 }, { "epoch": 0.04236942387652885, "grad_norm": 0.1845703125, "learning_rate": 0.001992601688848024, "loss": 0.1934, "step": 4881 }, { "epoch": 0.04237810435673302, "grad_norm": 0.08837890625, "learning_rate": 0.0019925978800309175, "loss": 0.1582, "step": 4882 }, { "epoch": 0.042386784836937176, "grad_norm": 0.52734375, "learning_rate": 0.0019925940702376797, "loss": 0.168, "step": 4883 }, { "epoch": 0.04239546531714134, "grad_norm": 0.16015625, "learning_rate": 0.0019925902594683147, "loss": 0.166, "step": 4884 }, { "epoch": 0.042404145797345506, "grad_norm": 0.5390625, "learning_rate": 0.001992586447722826, "loss": 0.1338, "step": 4885 }, { "epoch": 0.04241282627754967, "grad_norm": 0.478515625, "learning_rate": 0.001992582635001218, "loss": 0.2695, "step": 4886 }, { "epoch": 0.04242150675775384, "grad_norm": 0.32421875, "learning_rate": 0.0019925788213034953, "loss": 0.1543, "step": 4887 }, { "epoch": 0.042430187237958, "grad_norm": 0.232421875, "learning_rate": 0.001992575006629662, "loss": 0.1348, "step": 4888 }, { "epoch": 0.04243886771816217, "grad_norm": 0.158203125, "learning_rate": 0.001992571190979722, "loss": 0.1689, "step": 4889 }, { "epoch": 0.04244754819836633, "grad_norm": 2.359375, "learning_rate": 0.0019925673743536793, "loss": 0.3301, "step": 4890 }, { "epoch": 0.0424562286785705, "grad_norm": 0.1689453125, "learning_rate": 0.0019925635567515387, "loss": 0.165, "step": 4891 }, { "epoch": 0.04246490915877466, "grad_norm": 0.302734375, "learning_rate": 0.001992559738173304, "loss": 0.1348, "step": 4892 }, { "epoch": 0.04247358963897883, "grad_norm": 0.474609375, "learning_rate": 0.001992555918618979, "loss": 0.124, "step": 4893 }, { "epoch": 0.04248227011918299, "grad_norm": 0.53125, "learning_rate": 0.0019925520980885684, "loss": 0.1787, "step": 4894 }, { "epoch": 0.04249095059938716, "grad_norm": 0.380859375, "learning_rate": 0.001992548276582076, "loss": 0.1387, "step": 4895 }, { "epoch": 0.04249963107959132, "grad_norm": 0.11767578125, "learning_rate": 0.001992544454099507, "loss": 0.1631, "step": 4896 }, { "epoch": 0.04250831155979549, "grad_norm": 0.291015625, "learning_rate": 0.001992540630640864, "loss": 0.1807, "step": 4897 }, { "epoch": 0.04251699203999965, "grad_norm": 0.2265625, "learning_rate": 0.0019925368062061522, "loss": 0.1318, "step": 4898 }, { "epoch": 0.04252567252020382, "grad_norm": 0.1650390625, "learning_rate": 0.0019925329807953755, "loss": 0.1523, "step": 4899 }, { "epoch": 0.04253435300040798, "grad_norm": 1.03125, "learning_rate": 0.0019925291544085383, "loss": 0.1748, "step": 4900 }, { "epoch": 0.04254303348061215, "grad_norm": 0.189453125, "learning_rate": 0.0019925253270456447, "loss": 0.1465, "step": 4901 }, { "epoch": 0.04255171396081631, "grad_norm": 0.1396484375, "learning_rate": 0.0019925214987066985, "loss": 0.2324, "step": 4902 }, { "epoch": 0.04256039444102048, "grad_norm": 0.73828125, "learning_rate": 0.0019925176693917045, "loss": 0.2012, "step": 4903 }, { "epoch": 0.042569074921224644, "grad_norm": 0.205078125, "learning_rate": 0.0019925138391006666, "loss": 0.1914, "step": 4904 }, { "epoch": 0.04257775540142881, "grad_norm": 0.08984375, "learning_rate": 0.0019925100078335887, "loss": 0.1641, "step": 4905 }, { "epoch": 0.042586435881632974, "grad_norm": 0.205078125, "learning_rate": 0.0019925061755904755, "loss": 0.1309, "step": 4906 }, { "epoch": 0.04259511636183714, "grad_norm": 0.39453125, "learning_rate": 0.0019925023423713307, "loss": 0.1328, "step": 4907 }, { "epoch": 0.042603796842041304, "grad_norm": 1.140625, "learning_rate": 0.0019924985081761592, "loss": 0.165, "step": 4908 }, { "epoch": 0.04261247732224547, "grad_norm": 0.07470703125, "learning_rate": 0.0019924946730049643, "loss": 0.1221, "step": 4909 }, { "epoch": 0.042621157802449634, "grad_norm": 0.201171875, "learning_rate": 0.0019924908368577506, "loss": 0.1768, "step": 4910 }, { "epoch": 0.0426298382826538, "grad_norm": 0.61328125, "learning_rate": 0.001992486999734523, "loss": 0.1895, "step": 4911 }, { "epoch": 0.042638518762857965, "grad_norm": 0.328125, "learning_rate": 0.0019924831616352843, "loss": 0.1436, "step": 4912 }, { "epoch": 0.04264719924306212, "grad_norm": 0.130859375, "learning_rate": 0.0019924793225600396, "loss": 0.1475, "step": 4913 }, { "epoch": 0.04265587972326629, "grad_norm": 0.328125, "learning_rate": 0.001992475482508793, "loss": 0.1641, "step": 4914 }, { "epoch": 0.04266456020347045, "grad_norm": 0.421875, "learning_rate": 0.001992471641481549, "loss": 0.1699, "step": 4915 }, { "epoch": 0.04267324068367462, "grad_norm": 0.7421875, "learning_rate": 0.0019924677994783107, "loss": 0.1523, "step": 4916 }, { "epoch": 0.04268192116387878, "grad_norm": 0.1845703125, "learning_rate": 0.0019924639564990834, "loss": 0.1533, "step": 4917 }, { "epoch": 0.04269060164408295, "grad_norm": 0.1416015625, "learning_rate": 0.0019924601125438706, "loss": 0.1709, "step": 4918 }, { "epoch": 0.04269928212428711, "grad_norm": 0.138671875, "learning_rate": 0.0019924562676126773, "loss": 0.1348, "step": 4919 }, { "epoch": 0.04270796260449128, "grad_norm": 0.1923828125, "learning_rate": 0.0019924524217055073, "loss": 0.1719, "step": 4920 }, { "epoch": 0.042716643084695444, "grad_norm": 0.74609375, "learning_rate": 0.0019924485748223646, "loss": 0.1504, "step": 4921 }, { "epoch": 0.04272532356489961, "grad_norm": 0.451171875, "learning_rate": 0.0019924447269632534, "loss": 0.1089, "step": 4922 }, { "epoch": 0.042734004045103774, "grad_norm": 0.0703125, "learning_rate": 0.001992440878128178, "loss": 0.1157, "step": 4923 }, { "epoch": 0.04274268452530794, "grad_norm": 0.462890625, "learning_rate": 0.0019924370283171426, "loss": 0.1309, "step": 4924 }, { "epoch": 0.042751365005512104, "grad_norm": 0.1416015625, "learning_rate": 0.0019924331775301517, "loss": 0.1191, "step": 4925 }, { "epoch": 0.04276004548571627, "grad_norm": 0.1318359375, "learning_rate": 0.0019924293257672096, "loss": 0.2002, "step": 4926 }, { "epoch": 0.042768725965920434, "grad_norm": 0.1357421875, "learning_rate": 0.0019924254730283196, "loss": 0.1689, "step": 4927 }, { "epoch": 0.0427774064461246, "grad_norm": 0.296875, "learning_rate": 0.0019924216193134866, "loss": 0.2051, "step": 4928 }, { "epoch": 0.042786086926328765, "grad_norm": 0.10205078125, "learning_rate": 0.001992417764622715, "loss": 0.1465, "step": 4929 }, { "epoch": 0.04279476740653293, "grad_norm": 0.2294921875, "learning_rate": 0.0019924139089560087, "loss": 0.1201, "step": 4930 }, { "epoch": 0.042803447886737095, "grad_norm": 0.205078125, "learning_rate": 0.001992410052313372, "loss": 0.1797, "step": 4931 }, { "epoch": 0.04281212836694126, "grad_norm": 0.5, "learning_rate": 0.001992406194694809, "loss": 0.1045, "step": 4932 }, { "epoch": 0.042820808847145425, "grad_norm": 0.388671875, "learning_rate": 0.0019924023361003237, "loss": 0.1206, "step": 4933 }, { "epoch": 0.04282948932734959, "grad_norm": 0.10595703125, "learning_rate": 0.001992398476529921, "loss": 0.1357, "step": 4934 }, { "epoch": 0.042838169807553755, "grad_norm": 0.35546875, "learning_rate": 0.0019923946159836046, "loss": 0.1416, "step": 4935 }, { "epoch": 0.04284685028775792, "grad_norm": 0.09423828125, "learning_rate": 0.0019923907544613785, "loss": 0.168, "step": 4936 }, { "epoch": 0.042855530767962086, "grad_norm": 0.5390625, "learning_rate": 0.0019923868919632477, "loss": 0.1885, "step": 4937 }, { "epoch": 0.04286421124816625, "grad_norm": 0.703125, "learning_rate": 0.001992383028489216, "loss": 0.1709, "step": 4938 }, { "epoch": 0.042872891728370416, "grad_norm": 0.2216796875, "learning_rate": 0.001992379164039287, "loss": 0.1348, "step": 4939 }, { "epoch": 0.04288157220857458, "grad_norm": 0.1650390625, "learning_rate": 0.0019923752986134666, "loss": 0.1621, "step": 4940 }, { "epoch": 0.042890252688778746, "grad_norm": 0.099609375, "learning_rate": 0.001992371432211758, "loss": 0.1787, "step": 4941 }, { "epoch": 0.04289893316898291, "grad_norm": 0.3984375, "learning_rate": 0.0019923675648341643, "loss": 0.1396, "step": 4942 }, { "epoch": 0.042907613649187076, "grad_norm": 0.4375, "learning_rate": 0.0019923636964806913, "loss": 0.1416, "step": 4943 }, { "epoch": 0.042916294129391235, "grad_norm": 0.220703125, "learning_rate": 0.001992359827151343, "loss": 0.1797, "step": 4944 }, { "epoch": 0.0429249746095954, "grad_norm": 0.1767578125, "learning_rate": 0.0019923559568461235, "loss": 0.1621, "step": 4945 }, { "epoch": 0.042933655089799565, "grad_norm": 0.515625, "learning_rate": 0.0019923520855650366, "loss": 0.1484, "step": 4946 }, { "epoch": 0.04294233557000373, "grad_norm": 0.75, "learning_rate": 0.001992348213308087, "loss": 0.1816, "step": 4947 }, { "epoch": 0.042951016050207895, "grad_norm": 0.294921875, "learning_rate": 0.001992344340075279, "loss": 0.1328, "step": 4948 }, { "epoch": 0.04295969653041206, "grad_norm": 0.123046875, "learning_rate": 0.001992340465866616, "loss": 0.1357, "step": 4949 }, { "epoch": 0.042968377010616225, "grad_norm": 0.095703125, "learning_rate": 0.001992336590682104, "loss": 0.1641, "step": 4950 }, { "epoch": 0.04297705749082039, "grad_norm": 0.333984375, "learning_rate": 0.001992332714521745, "loss": 0.1348, "step": 4951 }, { "epoch": 0.042985737971024555, "grad_norm": 0.25390625, "learning_rate": 0.001992328837385545, "loss": 0.1719, "step": 4952 }, { "epoch": 0.04299441845122872, "grad_norm": 0.30859375, "learning_rate": 0.001992324959273507, "loss": 0.1914, "step": 4953 }, { "epoch": 0.043003098931432886, "grad_norm": 0.19921875, "learning_rate": 0.0019923210801856364, "loss": 0.1553, "step": 4954 }, { "epoch": 0.04301177941163705, "grad_norm": 0.609375, "learning_rate": 0.0019923172001219368, "loss": 0.1768, "step": 4955 }, { "epoch": 0.043020459891841216, "grad_norm": 0.3828125, "learning_rate": 0.0019923133190824123, "loss": 0.1953, "step": 4956 }, { "epoch": 0.04302914037204538, "grad_norm": 0.40234375, "learning_rate": 0.0019923094370670673, "loss": 0.1074, "step": 4957 }, { "epoch": 0.043037820852249546, "grad_norm": 0.115234375, "learning_rate": 0.0019923055540759065, "loss": 0.1689, "step": 4958 }, { "epoch": 0.04304650133245371, "grad_norm": 0.30078125, "learning_rate": 0.0019923016701089338, "loss": 0.1758, "step": 4959 }, { "epoch": 0.043055181812657876, "grad_norm": 0.10595703125, "learning_rate": 0.0019922977851661527, "loss": 0.1514, "step": 4960 }, { "epoch": 0.04306386229286204, "grad_norm": 1.203125, "learning_rate": 0.001992293899247569, "loss": 0.1445, "step": 4961 }, { "epoch": 0.04307254277306621, "grad_norm": 0.8359375, "learning_rate": 0.001992290012353185, "loss": 0.2559, "step": 4962 }, { "epoch": 0.04308122325327037, "grad_norm": 0.2578125, "learning_rate": 0.001992286124483007, "loss": 0.166, "step": 4963 }, { "epoch": 0.04308990373347454, "grad_norm": 0.236328125, "learning_rate": 0.001992282235637038, "loss": 0.1562, "step": 4964 }, { "epoch": 0.0430985842136787, "grad_norm": 0.0908203125, "learning_rate": 0.0019922783458152828, "loss": 0.1641, "step": 4965 }, { "epoch": 0.04310726469388287, "grad_norm": 0.1953125, "learning_rate": 0.001992274455017745, "loss": 0.2217, "step": 4966 }, { "epoch": 0.04311594517408703, "grad_norm": 0.28125, "learning_rate": 0.0019922705632444294, "loss": 0.1177, "step": 4967 }, { "epoch": 0.0431246256542912, "grad_norm": 0.48828125, "learning_rate": 0.00199226667049534, "loss": 0.168, "step": 4968 }, { "epoch": 0.04313330613449536, "grad_norm": 0.484375, "learning_rate": 0.0019922627767704816, "loss": 0.1543, "step": 4969 }, { "epoch": 0.04314198661469953, "grad_norm": 0.228515625, "learning_rate": 0.001992258882069858, "loss": 0.124, "step": 4970 }, { "epoch": 0.04315066709490369, "grad_norm": 0.703125, "learning_rate": 0.0019922549863934727, "loss": 0.1348, "step": 4971 }, { "epoch": 0.04315934757510786, "grad_norm": 0.640625, "learning_rate": 0.0019922510897413316, "loss": 0.166, "step": 4972 }, { "epoch": 0.04316802805531202, "grad_norm": 0.41796875, "learning_rate": 0.0019922471921134375, "loss": 0.1953, "step": 4973 }, { "epoch": 0.04317670853551619, "grad_norm": 0.3671875, "learning_rate": 0.0019922432935097958, "loss": 0.1348, "step": 4974 }, { "epoch": 0.043185389015720346, "grad_norm": 0.404296875, "learning_rate": 0.00199223939393041, "loss": 0.1611, "step": 4975 }, { "epoch": 0.04319406949592451, "grad_norm": 1.0625, "learning_rate": 0.001992235493375285, "loss": 0.1572, "step": 4976 }, { "epoch": 0.043202749976128677, "grad_norm": 0.69921875, "learning_rate": 0.001992231591844424, "loss": 0.1445, "step": 4977 }, { "epoch": 0.04321143045633284, "grad_norm": 0.494140625, "learning_rate": 0.0019922276893378325, "loss": 0.1523, "step": 4978 }, { "epoch": 0.04322011093653701, "grad_norm": 0.69140625, "learning_rate": 0.0019922237858555135, "loss": 0.1377, "step": 4979 }, { "epoch": 0.04322879141674117, "grad_norm": 0.251953125, "learning_rate": 0.0019922198813974723, "loss": 0.1611, "step": 4980 }, { "epoch": 0.04323747189694534, "grad_norm": 0.71484375, "learning_rate": 0.0019922159759637134, "loss": 0.1953, "step": 4981 }, { "epoch": 0.0432461523771495, "grad_norm": 0.33203125, "learning_rate": 0.0019922120695542397, "loss": 0.1465, "step": 4982 }, { "epoch": 0.04325483285735367, "grad_norm": 0.546875, "learning_rate": 0.001992208162169057, "loss": 0.1602, "step": 4983 }, { "epoch": 0.04326351333755783, "grad_norm": 0.84765625, "learning_rate": 0.0019922042538081685, "loss": 0.1914, "step": 4984 }, { "epoch": 0.043272193817762, "grad_norm": 0.10888671875, "learning_rate": 0.001992200344471579, "loss": 0.1816, "step": 4985 }, { "epoch": 0.04328087429796616, "grad_norm": 0.2294921875, "learning_rate": 0.001992196434159292, "loss": 0.168, "step": 4986 }, { "epoch": 0.04328955477817033, "grad_norm": 0.51953125, "learning_rate": 0.0019921925228713134, "loss": 0.168, "step": 4987 }, { "epoch": 0.04329823525837449, "grad_norm": 0.25, "learning_rate": 0.001992188610607646, "loss": 0.1523, "step": 4988 }, { "epoch": 0.04330691573857866, "grad_norm": 0.322265625, "learning_rate": 0.001992184697368294, "loss": 0.1299, "step": 4989 }, { "epoch": 0.04331559621878282, "grad_norm": 0.376953125, "learning_rate": 0.001992180783153263, "loss": 0.1895, "step": 4990 }, { "epoch": 0.04332427669898699, "grad_norm": 0.609375, "learning_rate": 0.001992176867962556, "loss": 0.1514, "step": 4991 }, { "epoch": 0.04333295717919115, "grad_norm": 0.087890625, "learning_rate": 0.001992172951796178, "loss": 0.1416, "step": 4992 }, { "epoch": 0.04334163765939532, "grad_norm": 0.40234375, "learning_rate": 0.001992169034654133, "loss": 0.1582, "step": 4993 }, { "epoch": 0.043350318139599484, "grad_norm": 0.2373046875, "learning_rate": 0.001992165116536425, "loss": 0.2031, "step": 4994 }, { "epoch": 0.04335899861980365, "grad_norm": 0.267578125, "learning_rate": 0.0019921611974430594, "loss": 0.127, "step": 4995 }, { "epoch": 0.043367679100007814, "grad_norm": 0.53125, "learning_rate": 0.001992157277374039, "loss": 0.1475, "step": 4996 }, { "epoch": 0.04337635958021198, "grad_norm": 0.8828125, "learning_rate": 0.0019921533563293696, "loss": 0.1484, "step": 4997 }, { "epoch": 0.043385040060416144, "grad_norm": 0.3359375, "learning_rate": 0.001992149434309054, "loss": 0.1797, "step": 4998 }, { "epoch": 0.04339372054062031, "grad_norm": 0.345703125, "learning_rate": 0.001992145511313097, "loss": 0.1514, "step": 4999 }, { "epoch": 0.043402401020824474, "grad_norm": 0.09619140625, "learning_rate": 0.0019921415873415038, "loss": 0.1387, "step": 5000 } ], "logging_steps": 1, "max_steps": 96000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1234185050968064e+17, "train_batch_size": 12, "trial_name": null, "trial_params": null }