{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9995355318160706, "eval_steps": 500, "global_step": 269, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0037157454714352067, "grad_norm": 20.0954532623291, "learning_rate": 1.111111111111111e-06, "loss": 1.0396, "step": 1 }, { "epoch": 0.0074314909428704135, "grad_norm": 18.111774444580078, "learning_rate": 2.222222222222222e-06, "loss": 1.0631, "step": 2 }, { "epoch": 0.01114723641430562, "grad_norm": 11.729887008666992, "learning_rate": 3.3333333333333333e-06, "loss": 0.9979, "step": 3 }, { "epoch": 0.014862981885740827, "grad_norm": 8.470993041992188, "learning_rate": 4.444444444444444e-06, "loss": 0.9424, "step": 4 }, { "epoch": 0.018578727357176035, "grad_norm": 36.664833068847656, "learning_rate": 5.555555555555557e-06, "loss": 1.0295, "step": 5 }, { "epoch": 0.02229447282861124, "grad_norm": 14.391983985900879, "learning_rate": 6.666666666666667e-06, "loss": 0.988, "step": 6 }, { "epoch": 0.026010218300046448, "grad_norm": 13.340031623840332, "learning_rate": 7.77777777777778e-06, "loss": 0.973, "step": 7 }, { "epoch": 0.029725963771481654, "grad_norm": 11.491482734680176, "learning_rate": 8.888888888888888e-06, "loss": 0.9431, "step": 8 }, { "epoch": 0.03344170924291686, "grad_norm": 8.707291603088379, "learning_rate": 1e-05, "loss": 0.969, "step": 9 }, { "epoch": 0.03715745471435207, "grad_norm": 13.96293830871582, "learning_rate": 9.999635004278054e-06, "loss": 0.9283, "step": 10 }, { "epoch": 0.04087320018578727, "grad_norm": 7.823294639587402, "learning_rate": 9.998540070400966e-06, "loss": 0.9423, "step": 11 }, { "epoch": 0.04458894565722248, "grad_norm": 8.87206745147705, "learning_rate": 9.996715358227208e-06, "loss": 0.9248, "step": 12 }, { "epoch": 0.048304691128657685, "grad_norm": 5.740382671356201, "learning_rate": 9.994161134161635e-06, "loss": 0.8843, "step": 13 }, { "epoch": 0.052020436600092895, "grad_norm": 5.519284725189209, "learning_rate": 9.990877771116588e-06, "loss": 0.8882, "step": 14 }, { "epoch": 0.0557361820715281, "grad_norm": 4.628316402435303, "learning_rate": 9.986865748457457e-06, "loss": 0.8786, "step": 15 }, { "epoch": 0.05945192754296331, "grad_norm": 5.834085941314697, "learning_rate": 9.982125651932681e-06, "loss": 0.8852, "step": 16 }, { "epoch": 0.06316767301439852, "grad_norm": 5.242204189300537, "learning_rate": 9.976658173588244e-06, "loss": 0.881, "step": 17 }, { "epoch": 0.06688341848583372, "grad_norm": 4.722640514373779, "learning_rate": 9.970464111666627e-06, "loss": 0.8612, "step": 18 }, { "epoch": 0.07059916395726892, "grad_norm": 4.319486618041992, "learning_rate": 9.96354437049027e-06, "loss": 0.872, "step": 19 }, { "epoch": 0.07431490942870414, "grad_norm": 4.070723533630371, "learning_rate": 9.955899960329546e-06, "loss": 0.8392, "step": 20 }, { "epoch": 0.07803065490013934, "grad_norm": 4.365532875061035, "learning_rate": 9.947531997255256e-06, "loss": 0.8826, "step": 21 }, { "epoch": 0.08174640037157455, "grad_norm": 3.6431972980499268, "learning_rate": 9.938441702975689e-06, "loss": 0.8618, "step": 22 }, { "epoch": 0.08546214584300975, "grad_norm": 4.284820556640625, "learning_rate": 9.928630404658255e-06, "loss": 0.891, "step": 23 }, { "epoch": 0.08917789131444497, "grad_norm": 3.621809482574463, "learning_rate": 9.91809953473572e-06, "loss": 0.8527, "step": 24 }, { "epoch": 0.09289363678588017, "grad_norm": 4.9281134605407715, "learning_rate": 9.906850630697068e-06, "loss": 0.8621, "step": 25 }, { "epoch": 0.09660938225731537, "grad_norm": 4.055799961090088, "learning_rate": 9.894885334863044e-06, "loss": 0.8795, "step": 26 }, { "epoch": 0.10032512772875057, "grad_norm": 3.8977296352386475, "learning_rate": 9.882205394146362e-06, "loss": 0.8515, "step": 27 }, { "epoch": 0.10404087320018579, "grad_norm": 3.988004207611084, "learning_rate": 9.868812659796669e-06, "loss": 0.8834, "step": 28 }, { "epoch": 0.107756618671621, "grad_norm": 3.673720359802246, "learning_rate": 9.854709087130261e-06, "loss": 0.8663, "step": 29 }, { "epoch": 0.1114723641430562, "grad_norm": 3.393507242202759, "learning_rate": 9.839896735244615e-06, "loss": 0.8949, "step": 30 }, { "epoch": 0.11518810961449141, "grad_norm": 4.008629322052002, "learning_rate": 9.824377766717758e-06, "loss": 0.878, "step": 31 }, { "epoch": 0.11890385508592662, "grad_norm": 3.543630361557007, "learning_rate": 9.808154447292539e-06, "loss": 0.8487, "step": 32 }, { "epoch": 0.12261960055736182, "grad_norm": 3.6210310459136963, "learning_rate": 9.791229145545832e-06, "loss": 0.8601, "step": 33 }, { "epoch": 0.12633534602879704, "grad_norm": 3.313815116882324, "learning_rate": 9.77360433254273e-06, "loss": 0.8907, "step": 34 }, { "epoch": 0.13005109150023222, "grad_norm": 3.567662239074707, "learning_rate": 9.755282581475769e-06, "loss": 0.8696, "step": 35 }, { "epoch": 0.13376683697166744, "grad_norm": 4.121949195861816, "learning_rate": 9.736266567289255e-06, "loss": 0.8777, "step": 36 }, { "epoch": 0.13748258244310266, "grad_norm": 3.3143727779388428, "learning_rate": 9.716559066288716e-06, "loss": 0.845, "step": 37 }, { "epoch": 0.14119832791453785, "grad_norm": 3.454806327819824, "learning_rate": 9.696162955735577e-06, "loss": 0.8641, "step": 38 }, { "epoch": 0.14491407338597306, "grad_norm": 3.4413001537323, "learning_rate": 9.675081213427076e-06, "loss": 0.844, "step": 39 }, { "epoch": 0.14862981885740828, "grad_norm": 3.246232748031616, "learning_rate": 9.653316917261511e-06, "loss": 0.8483, "step": 40 }, { "epoch": 0.15234556432884347, "grad_norm": 2.963979959487915, "learning_rate": 9.630873244788884e-06, "loss": 0.8911, "step": 41 }, { "epoch": 0.15606130980027869, "grad_norm": 3.023815631866455, "learning_rate": 9.607753472746967e-06, "loss": 0.8724, "step": 42 }, { "epoch": 0.15977705527171387, "grad_norm": 3.17449951171875, "learning_rate": 9.583960976582914e-06, "loss": 0.8763, "step": 43 }, { "epoch": 0.1634928007431491, "grad_norm": 3.4051811695098877, "learning_rate": 9.55949922996045e-06, "loss": 0.8285, "step": 44 }, { "epoch": 0.1672085462145843, "grad_norm": 3.4274067878723145, "learning_rate": 9.534371804252727e-06, "loss": 0.847, "step": 45 }, { "epoch": 0.1709242916860195, "grad_norm": 3.438002586364746, "learning_rate": 9.508582368020897e-06, "loss": 0.86, "step": 46 }, { "epoch": 0.17464003715745471, "grad_norm": 3.0596799850463867, "learning_rate": 9.48213468647852e-06, "loss": 0.8458, "step": 47 }, { "epoch": 0.17835578262888993, "grad_norm": 3.6326303482055664, "learning_rate": 9.45503262094184e-06, "loss": 0.8584, "step": 48 }, { "epoch": 0.18207152810032512, "grad_norm": 3.186222553253174, "learning_rate": 9.427280128266049e-06, "loss": 0.8621, "step": 49 }, { "epoch": 0.18578727357176034, "grad_norm": 3.188178300857544, "learning_rate": 9.398881260267589e-06, "loss": 0.8709, "step": 50 }, { "epoch": 0.18950301904319555, "grad_norm": 2.9039716720581055, "learning_rate": 9.36984016313259e-06, "loss": 0.8464, "step": 51 }, { "epoch": 0.19321876451463074, "grad_norm": 3.451613187789917, "learning_rate": 9.340161076811539e-06, "loss": 0.8805, "step": 52 }, { "epoch": 0.19693450998606596, "grad_norm": 2.9744162559509277, "learning_rate": 9.309848334400247e-06, "loss": 0.8234, "step": 53 }, { "epoch": 0.20065025545750115, "grad_norm": 3.27294659614563, "learning_rate": 9.278906361507238e-06, "loss": 0.8696, "step": 54 }, { "epoch": 0.20436600092893636, "grad_norm": 3.1406092643737793, "learning_rate": 9.247339675607606e-06, "loss": 0.8367, "step": 55 }, { "epoch": 0.20808174640037158, "grad_norm": 2.9955215454101562, "learning_rate": 9.215152885383473e-06, "loss": 0.8566, "step": 56 }, { "epoch": 0.21179749187180677, "grad_norm": 2.9043989181518555, "learning_rate": 9.182350690051134e-06, "loss": 0.8277, "step": 57 }, { "epoch": 0.215513237343242, "grad_norm": 2.9203925132751465, "learning_rate": 9.148937878674975e-06, "loss": 0.8237, "step": 58 }, { "epoch": 0.2192289828146772, "grad_norm": 3.3251900672912598, "learning_rate": 9.114919329468283e-06, "loss": 0.873, "step": 59 }, { "epoch": 0.2229447282861124, "grad_norm": 3.443455219268799, "learning_rate": 9.080300009081025e-06, "loss": 0.8798, "step": 60 }, { "epoch": 0.2266604737575476, "grad_norm": 2.9733846187591553, "learning_rate": 9.045084971874738e-06, "loss": 0.8335, "step": 61 }, { "epoch": 0.23037621922898283, "grad_norm": 3.0066165924072266, "learning_rate": 9.009279359184594e-06, "loss": 0.8556, "step": 62 }, { "epoch": 0.23409196470041801, "grad_norm": 3.133880615234375, "learning_rate": 8.972888398568772e-06, "loss": 0.8352, "step": 63 }, { "epoch": 0.23780771017185323, "grad_norm": 3.2234981060028076, "learning_rate": 8.935917403045251e-06, "loss": 0.8324, "step": 64 }, { "epoch": 0.24152345564328845, "grad_norm": 3.028475046157837, "learning_rate": 8.898371770316113e-06, "loss": 0.822, "step": 65 }, { "epoch": 0.24523920111472364, "grad_norm": 2.924118757247925, "learning_rate": 8.860256981979485e-06, "loss": 0.8401, "step": 66 }, { "epoch": 0.24895494658615885, "grad_norm": 3.151426076889038, "learning_rate": 8.821578602729242e-06, "loss": 0.8455, "step": 67 }, { "epoch": 0.25267069205759407, "grad_norm": 3.0176525115966797, "learning_rate": 8.782342279542569e-06, "loss": 0.8674, "step": 68 }, { "epoch": 0.2563864375290293, "grad_norm": 2.982210874557495, "learning_rate": 8.742553740855507e-06, "loss": 0.8804, "step": 69 }, { "epoch": 0.26010218300046445, "grad_norm": 2.870077133178711, "learning_rate": 8.702218795726619e-06, "loss": 0.8658, "step": 70 }, { "epoch": 0.26381792847189967, "grad_norm": 3.198518753051758, "learning_rate": 8.661343332988869e-06, "loss": 0.8476, "step": 71 }, { "epoch": 0.2675336739433349, "grad_norm": 2.90989089012146, "learning_rate": 8.619933320389872e-06, "loss": 0.8501, "step": 72 }, { "epoch": 0.2712494194147701, "grad_norm": 3.0661652088165283, "learning_rate": 8.577994803720605e-06, "loss": 0.8526, "step": 73 }, { "epoch": 0.2749651648862053, "grad_norm": 2.7846736907958984, "learning_rate": 8.535533905932739e-06, "loss": 0.8658, "step": 74 }, { "epoch": 0.2786809103576405, "grad_norm": 3.0816805362701416, "learning_rate": 8.492556826244687e-06, "loss": 0.8762, "step": 75 }, { "epoch": 0.2823966558290757, "grad_norm": 2.841280937194824, "learning_rate": 8.44906983923654e-06, "loss": 0.8689, "step": 76 }, { "epoch": 0.2861124013005109, "grad_norm": 3.150681734085083, "learning_rate": 8.405079293933986e-06, "loss": 0.8327, "step": 77 }, { "epoch": 0.2898281467719461, "grad_norm": 2.7397427558898926, "learning_rate": 8.360591612881363e-06, "loss": 0.8298, "step": 78 }, { "epoch": 0.29354389224338134, "grad_norm": 3.215749502182007, "learning_rate": 8.315613291203977e-06, "loss": 0.8214, "step": 79 }, { "epoch": 0.29725963771481656, "grad_norm": 3.1648685932159424, "learning_rate": 8.270150895659824e-06, "loss": 0.8383, "step": 80 }, { "epoch": 0.3009753831862517, "grad_norm": 2.72301983833313, "learning_rate": 8.224211063680854e-06, "loss": 0.8428, "step": 81 }, { "epoch": 0.30469112865768694, "grad_norm": 2.839674472808838, "learning_rate": 8.177800502403928e-06, "loss": 0.8355, "step": 82 }, { "epoch": 0.30840687412912215, "grad_norm": 2.820302963256836, "learning_rate": 8.13092598769157e-06, "loss": 0.8482, "step": 83 }, { "epoch": 0.31212261960055737, "grad_norm": 2.7327470779418945, "learning_rate": 8.083594363142717e-06, "loss": 0.841, "step": 84 }, { "epoch": 0.3158383650719926, "grad_norm": 2.756406545639038, "learning_rate": 8.035812539093557e-06, "loss": 0.8258, "step": 85 }, { "epoch": 0.31955411054342775, "grad_norm": 2.8273730278015137, "learning_rate": 7.987587491608636e-06, "loss": 0.834, "step": 86 }, { "epoch": 0.32326985601486297, "grad_norm": 2.8114731311798096, "learning_rate": 7.938926261462366e-06, "loss": 0.8379, "step": 87 }, { "epoch": 0.3269856014862982, "grad_norm": 2.866734266281128, "learning_rate": 7.889835953111075e-06, "loss": 0.8811, "step": 88 }, { "epoch": 0.3307013469577334, "grad_norm": 2.8514320850372314, "learning_rate": 7.84032373365578e-06, "loss": 0.8743, "step": 89 }, { "epoch": 0.3344170924291686, "grad_norm": 3.2439370155334473, "learning_rate": 7.790396831795792e-06, "loss": 0.8264, "step": 90 }, { "epoch": 0.33813283790060383, "grad_norm": 2.96039080619812, "learning_rate": 7.740062536773352e-06, "loss": 0.8174, "step": 91 }, { "epoch": 0.341848583372039, "grad_norm": 3.002448320388794, "learning_rate": 7.689328197309394e-06, "loss": 0.841, "step": 92 }, { "epoch": 0.3455643288434742, "grad_norm": 2.891981840133667, "learning_rate": 7.638201220530664e-06, "loss": 0.8463, "step": 93 }, { "epoch": 0.34928007431490943, "grad_norm": 2.832390546798706, "learning_rate": 7.586689070888284e-06, "loss": 0.8194, "step": 94 }, { "epoch": 0.35299581978634464, "grad_norm": 2.9103174209594727, "learning_rate": 7.534799269067952e-06, "loss": 0.8083, "step": 95 }, { "epoch": 0.35671156525777986, "grad_norm": 2.6890742778778076, "learning_rate": 7.482539390891941e-06, "loss": 0.853, "step": 96 }, { "epoch": 0.360427310729215, "grad_norm": 2.86749529838562, "learning_rate": 7.42991706621303e-06, "loss": 0.8479, "step": 97 }, { "epoch": 0.36414305620065024, "grad_norm": 3.1002535820007324, "learning_rate": 7.376939977800581e-06, "loss": 0.8259, "step": 98 }, { "epoch": 0.36785880167208546, "grad_norm": 2.6948513984680176, "learning_rate": 7.323615860218844e-06, "loss": 0.8264, "step": 99 }, { "epoch": 0.3715745471435207, "grad_norm": 2.566476583480835, "learning_rate": 7.269952498697734e-06, "loss": 0.8327, "step": 100 }, { "epoch": 0.3752902926149559, "grad_norm": 2.6251931190490723, "learning_rate": 7.215957727996208e-06, "loss": 0.828, "step": 101 }, { "epoch": 0.3790060380863911, "grad_norm": 2.7968850135803223, "learning_rate": 7.161639431258387e-06, "loss": 0.841, "step": 102 }, { "epoch": 0.38272178355782627, "grad_norm": 2.7724828720092773, "learning_rate": 7.107005538862647e-06, "loss": 0.7908, "step": 103 }, { "epoch": 0.3864375290292615, "grad_norm": 2.8875327110290527, "learning_rate": 7.052064027263785e-06, "loss": 0.7763, "step": 104 }, { "epoch": 0.3901532745006967, "grad_norm": 3.0686705112457275, "learning_rate": 6.9968229178284775e-06, "loss": 0.8339, "step": 105 }, { "epoch": 0.3938690199721319, "grad_norm": 2.588174819946289, "learning_rate": 6.941290275664175e-06, "loss": 0.8246, "step": 106 }, { "epoch": 0.39758476544356713, "grad_norm": 2.9062623977661133, "learning_rate": 6.885474208441602e-06, "loss": 0.8011, "step": 107 }, { "epoch": 0.4013005109150023, "grad_norm": 3.0397439002990723, "learning_rate": 6.829382865211063e-06, "loss": 0.8069, "step": 108 }, { "epoch": 0.4050162563864375, "grad_norm": 2.7592780590057373, "learning_rate": 6.773024435212678e-06, "loss": 0.8211, "step": 109 }, { "epoch": 0.40873200185787273, "grad_norm": 2.770179033279419, "learning_rate": 6.716407146680793e-06, "loss": 0.8196, "step": 110 }, { "epoch": 0.41244774732930795, "grad_norm": 3.039098024368286, "learning_rate": 6.659539265642643e-06, "loss": 0.8033, "step": 111 }, { "epoch": 0.41616349280074316, "grad_norm": 2.896697998046875, "learning_rate": 6.602429094711549e-06, "loss": 0.7993, "step": 112 }, { "epoch": 0.4198792382721784, "grad_norm": 2.830800771713257, "learning_rate": 6.545084971874738e-06, "loss": 0.8513, "step": 113 }, { "epoch": 0.42359498374361354, "grad_norm": 2.8710081577301025, "learning_rate": 6.487515269276015e-06, "loss": 0.8161, "step": 114 }, { "epoch": 0.42731072921504876, "grad_norm": 2.7982544898986816, "learning_rate": 6.429728391993446e-06, "loss": 0.8194, "step": 115 }, { "epoch": 0.431026474686484, "grad_norm": 2.819709062576294, "learning_rate": 6.37173277681223e-06, "loss": 0.8115, "step": 116 }, { "epoch": 0.4347422201579192, "grad_norm": 2.832136631011963, "learning_rate": 6.313536890992935e-06, "loss": 0.8274, "step": 117 }, { "epoch": 0.4384579656293544, "grad_norm": 2.9193575382232666, "learning_rate": 6.2551492310353094e-06, "loss": 0.8187, "step": 118 }, { "epoch": 0.44217371110078957, "grad_norm": 2.8140170574188232, "learning_rate": 6.1965783214377895e-06, "loss": 0.8231, "step": 119 }, { "epoch": 0.4458894565722248, "grad_norm": 2.649017810821533, "learning_rate": 6.13783271345295e-06, "loss": 0.7933, "step": 120 }, { "epoch": 0.44960520204366, "grad_norm": 2.98210072517395, "learning_rate": 6.078920983839032e-06, "loss": 0.8251, "step": 121 }, { "epoch": 0.4533209475150952, "grad_norm": 3.0255842208862305, "learning_rate": 6.019851733607744e-06, "loss": 0.8237, "step": 122 }, { "epoch": 0.45703669298653044, "grad_norm": 2.5767600536346436, "learning_rate": 5.9606335867685424e-06, "loss": 0.8131, "step": 123 }, { "epoch": 0.46075243845796565, "grad_norm": 2.823826789855957, "learning_rate": 5.90127518906953e-06, "loss": 0.8222, "step": 124 }, { "epoch": 0.4644681839294008, "grad_norm": 2.703465223312378, "learning_rate": 5.841785206735192e-06, "loss": 0.8108, "step": 125 }, { "epoch": 0.46818392940083603, "grad_norm": 2.6367640495300293, "learning_rate": 5.782172325201155e-06, "loss": 0.8329, "step": 126 }, { "epoch": 0.47189967487227125, "grad_norm": 3.0382368564605713, "learning_rate": 5.722445247846107e-06, "loss": 0.8341, "step": 127 }, { "epoch": 0.47561542034370646, "grad_norm": 3.049957752227783, "learning_rate": 5.662612694721139e-06, "loss": 0.8277, "step": 128 }, { "epoch": 0.4793311658151417, "grad_norm": 2.802960157394409, "learning_rate": 5.6026834012766155e-06, "loss": 0.8032, "step": 129 }, { "epoch": 0.4830469112865769, "grad_norm": 2.779818058013916, "learning_rate": 5.542666117086832e-06, "loss": 0.7753, "step": 130 }, { "epoch": 0.48676265675801206, "grad_norm": 2.7739217281341553, "learning_rate": 5.482569604572577e-06, "loss": 0.81, "step": 131 }, { "epoch": 0.4904784022294473, "grad_norm": 2.5975005626678467, "learning_rate": 5.4224026377218365e-06, "loss": 0.808, "step": 132 }, { "epoch": 0.4941941477008825, "grad_norm": 2.766979932785034, "learning_rate": 5.362174000808813e-06, "loss": 0.7686, "step": 133 }, { "epoch": 0.4979098931723177, "grad_norm": 2.670982599258423, "learning_rate": 5.301892487111431e-06, "loss": 0.8308, "step": 134 }, { "epoch": 0.5016256386437529, "grad_norm": 2.434584617614746, "learning_rate": 5.241566897627536e-06, "loss": 0.8116, "step": 135 }, { "epoch": 0.5053413841151881, "grad_norm": 2.5681159496307373, "learning_rate": 5.1812060397899624e-06, "loss": 0.779, "step": 136 }, { "epoch": 0.5090571295866233, "grad_norm": 2.5115795135498047, "learning_rate": 5.120818726180662e-06, "loss": 0.817, "step": 137 }, { "epoch": 0.5127728750580586, "grad_norm": 2.578521251678467, "learning_rate": 5.0604137732440875e-06, "loss": 0.7924, "step": 138 }, { "epoch": 0.5164886205294937, "grad_norm": 2.7697277069091797, "learning_rate": 5e-06, "loss": 0.816, "step": 139 }, { "epoch": 0.5202043660009289, "grad_norm": 2.730084180831909, "learning_rate": 4.939586226755913e-06, "loss": 0.7916, "step": 140 }, { "epoch": 0.5239201114723642, "grad_norm": 2.382354259490967, "learning_rate": 4.87918127381934e-06, "loss": 0.8236, "step": 141 }, { "epoch": 0.5276358569437993, "grad_norm": 2.714177370071411, "learning_rate": 4.81879396021004e-06, "loss": 0.7959, "step": 142 }, { "epoch": 0.5313516024152346, "grad_norm": 2.76314377784729, "learning_rate": 4.758433102372466e-06, "loss": 0.8, "step": 143 }, { "epoch": 0.5350673478866698, "grad_norm": 2.615633726119995, "learning_rate": 4.69810751288857e-06, "loss": 0.7958, "step": 144 }, { "epoch": 0.5387830933581049, "grad_norm": 2.5601184368133545, "learning_rate": 4.637825999191189e-06, "loss": 0.8078, "step": 145 }, { "epoch": 0.5424988388295402, "grad_norm": 2.5851452350616455, "learning_rate": 4.577597362278165e-06, "loss": 0.7882, "step": 146 }, { "epoch": 0.5462145843009754, "grad_norm": 2.749624490737915, "learning_rate": 4.517430395427424e-06, "loss": 0.802, "step": 147 }, { "epoch": 0.5499303297724106, "grad_norm": 2.867662191390991, "learning_rate": 4.45733388291317e-06, "loss": 0.805, "step": 148 }, { "epoch": 0.5536460752438458, "grad_norm": 2.711284637451172, "learning_rate": 4.397316598723385e-06, "loss": 0.7897, "step": 149 }, { "epoch": 0.557361820715281, "grad_norm": 2.705087184906006, "learning_rate": 4.337387305278864e-06, "loss": 0.8275, "step": 150 }, { "epoch": 0.5610775661867162, "grad_norm": 2.7347421646118164, "learning_rate": 4.277554752153895e-06, "loss": 0.802, "step": 151 }, { "epoch": 0.5647933116581514, "grad_norm": 2.939225196838379, "learning_rate": 4.217827674798845e-06, "loss": 0.7979, "step": 152 }, { "epoch": 0.5685090571295867, "grad_norm": 2.636723041534424, "learning_rate": 4.158214793264808e-06, "loss": 0.8197, "step": 153 }, { "epoch": 0.5722248026010218, "grad_norm": 2.544318199157715, "learning_rate": 4.098724810930472e-06, "loss": 0.8087, "step": 154 }, { "epoch": 0.575940548072457, "grad_norm": 2.7283096313476562, "learning_rate": 4.039366413231458e-06, "loss": 0.8202, "step": 155 }, { "epoch": 0.5796562935438923, "grad_norm": 2.621203899383545, "learning_rate": 3.980148266392257e-06, "loss": 0.8049, "step": 156 }, { "epoch": 0.5833720390153274, "grad_norm": 2.5380094051361084, "learning_rate": 3.92107901616097e-06, "loss": 0.8022, "step": 157 }, { "epoch": 0.5870877844867627, "grad_norm": 2.4992268085479736, "learning_rate": 3.8621672865470505e-06, "loss": 0.7745, "step": 158 }, { "epoch": 0.5908035299581978, "grad_norm": 2.629713296890259, "learning_rate": 3.803421678562213e-06, "loss": 0.7869, "step": 159 }, { "epoch": 0.5945192754296331, "grad_norm": 2.6328611373901367, "learning_rate": 3.744850768964692e-06, "loss": 0.8146, "step": 160 }, { "epoch": 0.5982350209010683, "grad_norm": 2.801602602005005, "learning_rate": 3.6864631090070656e-06, "loss": 0.8152, "step": 161 }, { "epoch": 0.6019507663725034, "grad_norm": 2.5954527854919434, "learning_rate": 3.6282672231877714e-06, "loss": 0.8035, "step": 162 }, { "epoch": 0.6056665118439387, "grad_norm": 2.4939026832580566, "learning_rate": 3.5702716080065546e-06, "loss": 0.8149, "step": 163 }, { "epoch": 0.6093822573153739, "grad_norm": 2.49111008644104, "learning_rate": 3.5124847307239863e-06, "loss": 0.8058, "step": 164 }, { "epoch": 0.6130980027868091, "grad_norm": 2.5582475662231445, "learning_rate": 3.4549150281252635e-06, "loss": 0.805, "step": 165 }, { "epoch": 0.6168137482582443, "grad_norm": 2.641791343688965, "learning_rate": 3.397570905288453e-06, "loss": 0.7845, "step": 166 }, { "epoch": 0.6205294937296795, "grad_norm": 2.466529369354248, "learning_rate": 3.340460734357359e-06, "loss": 0.7826, "step": 167 }, { "epoch": 0.6242452392011147, "grad_norm": 2.4412331581115723, "learning_rate": 3.2835928533192086e-06, "loss": 0.7976, "step": 168 }, { "epoch": 0.6279609846725499, "grad_norm": 2.4601681232452393, "learning_rate": 3.226975564787322e-06, "loss": 0.7791, "step": 169 }, { "epoch": 0.6316767301439852, "grad_norm": 2.7120959758758545, "learning_rate": 3.170617134788939e-06, "loss": 0.8263, "step": 170 }, { "epoch": 0.6353924756154203, "grad_norm": 2.687011957168579, "learning_rate": 3.114525791558398e-06, "loss": 0.7951, "step": 171 }, { "epoch": 0.6391082210868555, "grad_norm": 2.469672441482544, "learning_rate": 3.0587097243358254e-06, "loss": 0.7772, "step": 172 }, { "epoch": 0.6428239665582908, "grad_norm": 2.569567918777466, "learning_rate": 3.0031770821715233e-06, "loss": 0.8143, "step": 173 }, { "epoch": 0.6465397120297259, "grad_norm": 2.5790843963623047, "learning_rate": 2.947935972736217e-06, "loss": 0.7719, "step": 174 }, { "epoch": 0.6502554575011612, "grad_norm": 2.4998183250427246, "learning_rate": 2.8929944611373555e-06, "loss": 0.806, "step": 175 }, { "epoch": 0.6539712029725964, "grad_norm": 2.5869035720825195, "learning_rate": 2.838360568741613e-06, "loss": 0.7869, "step": 176 }, { "epoch": 0.6576869484440316, "grad_norm": 2.5482499599456787, "learning_rate": 2.7840422720037943e-06, "loss": 0.8089, "step": 177 }, { "epoch": 0.6614026939154668, "grad_norm": 2.5611495971679688, "learning_rate": 2.7300475013022666e-06, "loss": 0.7524, "step": 178 }, { "epoch": 0.665118439386902, "grad_norm": 2.424504518508911, "learning_rate": 2.6763841397811576e-06, "loss": 0.7762, "step": 179 }, { "epoch": 0.6688341848583372, "grad_norm": 2.445756196975708, "learning_rate": 2.6230600221994195e-06, "loss": 0.8205, "step": 180 }, { "epoch": 0.6725499303297724, "grad_norm": 2.4726805686950684, "learning_rate": 2.57008293378697e-06, "loss": 0.8175, "step": 181 }, { "epoch": 0.6762656758012077, "grad_norm": 2.622877359390259, "learning_rate": 2.517460609108063e-06, "loss": 0.7896, "step": 182 }, { "epoch": 0.6799814212726428, "grad_norm": 2.3721368312835693, "learning_rate": 2.4652007309320497e-06, "loss": 0.8076, "step": 183 }, { "epoch": 0.683697166744078, "grad_norm": 2.472506523132324, "learning_rate": 2.4133109291117156e-06, "loss": 0.7756, "step": 184 }, { "epoch": 0.6874129122155133, "grad_norm": 2.777744770050049, "learning_rate": 2.3617987794693358e-06, "loss": 0.7495, "step": 185 }, { "epoch": 0.6911286576869484, "grad_norm": 2.833951234817505, "learning_rate": 2.3106718026906073e-06, "loss": 0.8168, "step": 186 }, { "epoch": 0.6948444031583837, "grad_norm": 2.6643426418304443, "learning_rate": 2.2599374632266514e-06, "loss": 0.7811, "step": 187 }, { "epoch": 0.6985601486298189, "grad_norm": 2.62332820892334, "learning_rate": 2.209603168204209e-06, "loss": 0.7808, "step": 188 }, { "epoch": 0.702275894101254, "grad_norm": 2.7696139812469482, "learning_rate": 2.159676266344222e-06, "loss": 0.8023, "step": 189 }, { "epoch": 0.7059916395726893, "grad_norm": 2.6566128730773926, "learning_rate": 2.1101640468889255e-06, "loss": 0.7867, "step": 190 }, { "epoch": 0.7097073850441245, "grad_norm": 2.4867961406707764, "learning_rate": 2.061073738537635e-06, "loss": 0.7488, "step": 191 }, { "epoch": 0.7134231305155597, "grad_norm": 2.360543966293335, "learning_rate": 2.0124125083913636e-06, "loss": 0.7925, "step": 192 }, { "epoch": 0.7171388759869949, "grad_norm": 2.5860140323638916, "learning_rate": 1.9641874609064443e-06, "loss": 0.8052, "step": 193 }, { "epoch": 0.72085462145843, "grad_norm": 2.4666905403137207, "learning_rate": 1.9164056368572847e-06, "loss": 0.7766, "step": 194 }, { "epoch": 0.7245703669298653, "grad_norm": 2.5768556594848633, "learning_rate": 1.8690740123084316e-06, "loss": 0.8001, "step": 195 }, { "epoch": 0.7282861124013005, "grad_norm": 2.5521411895751953, "learning_rate": 1.8221994975960739e-06, "loss": 0.7923, "step": 196 }, { "epoch": 0.7320018578727358, "grad_norm": 2.377108097076416, "learning_rate": 1.7757889363191484e-06, "loss": 0.7488, "step": 197 }, { "epoch": 0.7357176033441709, "grad_norm": 2.30350399017334, "learning_rate": 1.7298491043401794e-06, "loss": 0.7659, "step": 198 }, { "epoch": 0.7394333488156062, "grad_norm": 2.613494396209717, "learning_rate": 1.6843867087960252e-06, "loss": 0.8003, "step": 199 }, { "epoch": 0.7431490942870413, "grad_norm": 2.3713924884796143, "learning_rate": 1.6394083871186362e-06, "loss": 0.7719, "step": 200 }, { "epoch": 0.7468648397584765, "grad_norm": 2.6182761192321777, "learning_rate": 1.5949207060660138e-06, "loss": 0.7922, "step": 201 }, { "epoch": 0.7505805852299118, "grad_norm": 2.520794630050659, "learning_rate": 1.550930160763462e-06, "loss": 0.7721, "step": 202 }, { "epoch": 0.7542963307013469, "grad_norm": 2.509988784790039, "learning_rate": 1.5074431737553158e-06, "loss": 0.7721, "step": 203 }, { "epoch": 0.7580120761727822, "grad_norm": 2.4135901927948, "learning_rate": 1.4644660940672628e-06, "loss": 0.7601, "step": 204 }, { "epoch": 0.7617278216442174, "grad_norm": 2.3835434913635254, "learning_rate": 1.4220051962793952e-06, "loss": 0.768, "step": 205 }, { "epoch": 0.7654435671156525, "grad_norm": 2.455984115600586, "learning_rate": 1.3800666796101291e-06, "loss": 0.7598, "step": 206 }, { "epoch": 0.7691593125870878, "grad_norm": 2.523127317428589, "learning_rate": 1.3386566670111339e-06, "loss": 0.7973, "step": 207 }, { "epoch": 0.772875058058523, "grad_norm": 2.4497673511505127, "learning_rate": 1.297781204273385e-06, "loss": 0.8143, "step": 208 }, { "epoch": 0.7765908035299582, "grad_norm": 2.722075939178467, "learning_rate": 1.257446259144494e-06, "loss": 0.779, "step": 209 }, { "epoch": 0.7803065490013934, "grad_norm": 2.407463550567627, "learning_rate": 1.2176577204574318e-06, "loss": 0.7884, "step": 210 }, { "epoch": 0.7840222944728286, "grad_norm": 2.408154249191284, "learning_rate": 1.1784213972707581e-06, "loss": 0.7751, "step": 211 }, { "epoch": 0.7877380399442638, "grad_norm": 2.544851064682007, "learning_rate": 1.139743018020517e-06, "loss": 0.7583, "step": 212 }, { "epoch": 0.791453785415699, "grad_norm": 2.4154045581817627, "learning_rate": 1.1016282296838887e-06, "loss": 0.7908, "step": 213 }, { "epoch": 0.7951695308871343, "grad_norm": 2.6085379123687744, "learning_rate": 1.0640825969547498e-06, "loss": 0.7354, "step": 214 }, { "epoch": 0.7988852763585694, "grad_norm": 2.358438491821289, "learning_rate": 1.0271116014312293e-06, "loss": 0.76, "step": 215 }, { "epoch": 0.8026010218300046, "grad_norm": 2.5363781452178955, "learning_rate": 9.90720640815408e-07, "loss": 0.7652, "step": 216 }, { "epoch": 0.8063167673014399, "grad_norm": 2.402181386947632, "learning_rate": 9.549150281252633e-07, "loss": 0.769, "step": 217 }, { "epoch": 0.810032512772875, "grad_norm": 2.413909912109375, "learning_rate": 9.196999909189764e-07, "loss": 0.7688, "step": 218 }, { "epoch": 0.8137482582443103, "grad_norm": 2.438744306564331, "learning_rate": 8.850806705317183e-07, "loss": 0.79, "step": 219 }, { "epoch": 0.8174640037157455, "grad_norm": 2.4197418689727783, "learning_rate": 8.510621213250248e-07, "loss": 0.7462, "step": 220 }, { "epoch": 0.8211797491871807, "grad_norm": 2.4547455310821533, "learning_rate": 8.176493099488664e-07, "loss": 0.7815, "step": 221 }, { "epoch": 0.8248954946586159, "grad_norm": 2.5494790077209473, "learning_rate": 7.848471146165287e-07, "loss": 0.7185, "step": 222 }, { "epoch": 0.828611240130051, "grad_norm": 2.4248874187469482, "learning_rate": 7.526603243923958e-07, "loss": 0.7427, "step": 223 }, { "epoch": 0.8323269856014863, "grad_norm": 2.490534543991089, "learning_rate": 7.210936384927631e-07, "loss": 0.7818, "step": 224 }, { "epoch": 0.8360427310729215, "grad_norm": 2.4028074741363525, "learning_rate": 6.901516655997536e-07, "loss": 0.7668, "step": 225 }, { "epoch": 0.8397584765443568, "grad_norm": 2.2774248123168945, "learning_rate": 6.598389231884628e-07, "loss": 0.7283, "step": 226 }, { "epoch": 0.8434742220157919, "grad_norm": 2.5930631160736084, "learning_rate": 6.301598368674106e-07, "loss": 0.7875, "step": 227 }, { "epoch": 0.8471899674872271, "grad_norm": 2.6912310123443604, "learning_rate": 6.011187397324114e-07, "loss": 0.7723, "step": 228 }, { "epoch": 0.8509057129586624, "grad_norm": 2.518649101257324, "learning_rate": 5.727198717339511e-07, "loss": 0.7559, "step": 229 }, { "epoch": 0.8546214584300975, "grad_norm": 2.4604172706604004, "learning_rate": 5.449673790581611e-07, "loss": 0.792, "step": 230 }, { "epoch": 0.8583372039015328, "grad_norm": 2.408017635345459, "learning_rate": 5.178653135214811e-07, "loss": 0.7653, "step": 231 }, { "epoch": 0.862052949372968, "grad_norm": 2.4024996757507324, "learning_rate": 4.914176319791037e-07, "loss": 0.7243, "step": 232 }, { "epoch": 0.8657686948444031, "grad_norm": 2.557715892791748, "learning_rate": 4.6562819574727304e-07, "loss": 0.7345, "step": 233 }, { "epoch": 0.8694844403158384, "grad_norm": 2.4792795181274414, "learning_rate": 4.405007700395497e-07, "loss": 0.7745, "step": 234 }, { "epoch": 0.8732001857872735, "grad_norm": 2.453551769256592, "learning_rate": 4.1603902341708804e-07, "loss": 0.8035, "step": 235 }, { "epoch": 0.8769159312587088, "grad_norm": 2.4629323482513428, "learning_rate": 3.9224652725303514e-07, "loss": 0.8033, "step": 236 }, { "epoch": 0.880631676730144, "grad_norm": 2.312333345413208, "learning_rate": 3.691267552111183e-07, "loss": 0.7421, "step": 237 }, { "epoch": 0.8843474222015791, "grad_norm": 2.5578577518463135, "learning_rate": 3.4668308273848985e-07, "loss": 0.7725, "step": 238 }, { "epoch": 0.8880631676730144, "grad_norm": 2.574186325073242, "learning_rate": 3.2491878657292643e-07, "loss": 0.7921, "step": 239 }, { "epoch": 0.8917789131444496, "grad_norm": 2.2268948554992676, "learning_rate": 3.0383704426442396e-07, "loss": 0.7685, "step": 240 }, { "epoch": 0.8954946586158848, "grad_norm": 2.536660671234131, "learning_rate": 2.834409337112842e-07, "loss": 0.7791, "step": 241 }, { "epoch": 0.89921040408732, "grad_norm": 2.341561794281006, "learning_rate": 2.6373343271074657e-07, "loss": 0.765, "step": 242 }, { "epoch": 0.9029261495587553, "grad_norm": 2.37235951423645, "learning_rate": 2.447174185242324e-07, "loss": 0.7778, "step": 243 }, { "epoch": 0.9066418950301904, "grad_norm": 2.2644269466400146, "learning_rate": 2.2639566745727203e-07, "loss": 0.7755, "step": 244 }, { "epoch": 0.9103576405016256, "grad_norm": 2.224494695663452, "learning_rate": 2.0877085445416889e-07, "loss": 0.7517, "step": 245 }, { "epoch": 0.9140733859730609, "grad_norm": 2.5631179809570312, "learning_rate": 1.9184555270746198e-07, "loss": 0.7703, "step": 246 }, { "epoch": 0.917789131444496, "grad_norm": 2.2541210651397705, "learning_rate": 1.7562223328224327e-07, "loss": 0.7354, "step": 247 }, { "epoch": 0.9215048769159313, "grad_norm": 2.4160380363464355, "learning_rate": 1.6010326475538628e-07, "loss": 0.7399, "step": 248 }, { "epoch": 0.9252206223873665, "grad_norm": 2.519101142883301, "learning_rate": 1.4529091286973994e-07, "loss": 0.766, "step": 249 }, { "epoch": 0.9289363678588016, "grad_norm": 2.368459463119507, "learning_rate": 1.3118734020333257e-07, "loss": 0.7833, "step": 250 }, { "epoch": 0.9326521133302369, "grad_norm": 2.297985792160034, "learning_rate": 1.1779460585363945e-07, "loss": 0.7833, "step": 251 }, { "epoch": 0.9363678588016721, "grad_norm": 2.2768945693969727, "learning_rate": 1.0511466513695778e-07, "loss": 0.7511, "step": 252 }, { "epoch": 0.9400836042731073, "grad_norm": 2.397083282470703, "learning_rate": 9.314936930293283e-08, "loss": 0.7798, "step": 253 }, { "epoch": 0.9437993497445425, "grad_norm": 2.590940237045288, "learning_rate": 8.190046526428241e-08, "loss": 0.8034, "step": 254 }, { "epoch": 0.9475150952159777, "grad_norm": 2.3255016803741455, "learning_rate": 7.136959534174592e-08, "loss": 0.7783, "step": 255 }, { "epoch": 0.9512308406874129, "grad_norm": 2.646681308746338, "learning_rate": 6.15582970243117e-08, "loss": 0.7669, "step": 256 }, { "epoch": 0.9549465861588481, "grad_norm": 2.5319759845733643, "learning_rate": 5.246800274474439e-08, "loss": 0.743, "step": 257 }, { "epoch": 0.9586623316302834, "grad_norm": 2.170186758041382, "learning_rate": 4.41000396704544e-08, "loss": 0.7573, "step": 258 }, { "epoch": 0.9623780771017185, "grad_norm": 2.349637746810913, "learning_rate": 3.645562950973014e-08, "loss": 0.7807, "step": 259 }, { "epoch": 0.9660938225731538, "grad_norm": 2.6679821014404297, "learning_rate": 2.9535888333374064e-08, "loss": 0.7522, "step": 260 }, { "epoch": 0.969809568044589, "grad_norm": 2.230440855026245, "learning_rate": 2.3341826411756863e-08, "loss": 0.7723, "step": 261 }, { "epoch": 0.9735253135160241, "grad_norm": 2.3344039916992188, "learning_rate": 1.7874348067319912e-08, "loss": 0.7704, "step": 262 }, { "epoch": 0.9772410589874594, "grad_norm": 2.456907272338867, "learning_rate": 1.3134251542544774e-08, "loss": 0.7498, "step": 263 }, { "epoch": 0.9809568044588945, "grad_norm": 2.447007417678833, "learning_rate": 9.12222888341252e-09, "loss": 0.787, "step": 264 }, { "epoch": 0.9846725499303298, "grad_norm": 2.3086886405944824, "learning_rate": 5.838865838366792e-09, "loss": 0.8057, "step": 265 }, { "epoch": 0.988388295401765, "grad_norm": 2.687796115875244, "learning_rate": 3.284641772793862e-09, "loss": 0.7676, "step": 266 }, { "epoch": 0.9921040408732001, "grad_norm": 2.431873321533203, "learning_rate": 1.4599295990352924e-09, "loss": 0.7706, "step": 267 }, { "epoch": 0.9958197863446354, "grad_norm": 2.4270150661468506, "learning_rate": 3.649957219464817e-10, "loss": 0.75, "step": 268 }, { "epoch": 0.9995355318160706, "grad_norm": 2.4313602447509766, "learning_rate": 0.0, "loss": 0.7913, "step": 269 }, { "epoch": 0.9995355318160706, "step": 269, "total_flos": 2.6430782143515853e+18, "train_loss": 0.8186407917937382, "train_runtime": 3697.6094, "train_samples_per_second": 18.627, "train_steps_per_second": 0.073 } ], "logging_steps": 1.0, "max_steps": 269, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.6430782143515853e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }