{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9982507288629737, "eval_steps": 500, "global_step": 428, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011661807580174927, "grad_norm": 42.984135524813716, "learning_rate": 2.3255813953488376e-06, "loss": 0.7996, "step": 5 }, { "epoch": 0.023323615160349854, "grad_norm": 34.540550987887656, "learning_rate": 4.651162790697675e-06, "loss": 0.9118, "step": 10 }, { "epoch": 0.03498542274052478, "grad_norm": 13.382857138229893, "learning_rate": 6.976744186046513e-06, "loss": 0.809, "step": 15 }, { "epoch": 0.04664723032069971, "grad_norm": 16.118929797627043, "learning_rate": 9.30232558139535e-06, "loss": 0.6494, "step": 20 }, { "epoch": 0.05830903790087463, "grad_norm": 12.84126148340563, "learning_rate": 1.1627906976744187e-05, "loss": 0.6804, "step": 25 }, { "epoch": 0.06997084548104957, "grad_norm": 11.615382887994867, "learning_rate": 1.3953488372093025e-05, "loss": 0.742, "step": 30 }, { "epoch": 0.08163265306122448, "grad_norm": 9.64223874983612, "learning_rate": 1.6279069767441862e-05, "loss": 0.7071, "step": 35 }, { "epoch": 0.09329446064139942, "grad_norm": 15.158090621941907, "learning_rate": 1.86046511627907e-05, "loss": 0.7067, "step": 40 }, { "epoch": 0.10495626822157435, "grad_norm": 11.684269090645673, "learning_rate": 1.9998668323785298e-05, "loss": 0.773, "step": 45 }, { "epoch": 0.11661807580174927, "grad_norm": 10.880832162601031, "learning_rate": 1.9983691039261358e-05, "loss": 0.8095, "step": 50 }, { "epoch": 0.1282798833819242, "grad_norm": 9.366717268177585, "learning_rate": 1.995209688628471e-05, "loss": 0.7483, "step": 55 }, { "epoch": 0.13994169096209913, "grad_norm": 8.249786072407542, "learning_rate": 1.9903938450204972e-05, "loss": 0.76, "step": 60 }, { "epoch": 0.15160349854227406, "grad_norm": 10.168494788619096, "learning_rate": 1.98392958859863e-05, "loss": 0.7775, "step": 65 }, { "epoch": 0.16326530612244897, "grad_norm": 9.913819683231004, "learning_rate": 1.97582767847973e-05, "loss": 0.8216, "step": 70 }, { "epoch": 0.1749271137026239, "grad_norm": 10.032392176068132, "learning_rate": 1.9661015994936204e-05, "loss": 0.817, "step": 75 }, { "epoch": 0.18658892128279883, "grad_norm": 8.410150731378423, "learning_rate": 1.9547675397389144e-05, "loss": 0.808, "step": 80 }, { "epoch": 0.19825072886297376, "grad_norm": 9.863100877355185, "learning_rate": 1.941844363639525e-05, "loss": 0.8507, "step": 85 }, { "epoch": 0.2099125364431487, "grad_norm": 8.89188615237443, "learning_rate": 1.927353580546692e-05, "loss": 0.8161, "step": 90 }, { "epoch": 0.22157434402332363, "grad_norm": 8.531706916482218, "learning_rate": 1.91131930893879e-05, "loss": 0.8066, "step": 95 }, { "epoch": 0.23323615160349853, "grad_norm": 7.458499940704951, "learning_rate": 1.8937682362785025e-05, "loss": 0.7976, "step": 100 }, { "epoch": 0.24489795918367346, "grad_norm": 9.821230792634003, "learning_rate": 1.8747295745941705e-05, "loss": 0.745, "step": 105 }, { "epoch": 0.2565597667638484, "grad_norm": 7.927784309119138, "learning_rate": 1.8542350118592585e-05, "loss": 0.7847, "step": 110 }, { "epoch": 0.26822157434402333, "grad_norm": 7.469324880697332, "learning_rate": 1.8323186592508474e-05, "loss": 0.732, "step": 115 }, { "epoch": 0.27988338192419826, "grad_norm": 7.78062857430588, "learning_rate": 1.8090169943749477e-05, "loss": 0.8037, "step": 120 }, { "epoch": 0.2915451895043732, "grad_norm": 7.9449659800223245, "learning_rate": 1.7843688005531227e-05, "loss": 0.8004, "step": 125 }, { "epoch": 0.3032069970845481, "grad_norm": 6.980347848053137, "learning_rate": 1.75841510227148e-05, "loss": 0.8104, "step": 130 }, { "epoch": 0.31486880466472306, "grad_norm": 6.932872697470111, "learning_rate": 1.7311990968994598e-05, "loss": 0.784, "step": 135 }, { "epoch": 0.32653061224489793, "grad_norm": 7.03014955634306, "learning_rate": 1.7027660827920798e-05, "loss": 0.7162, "step": 140 }, { "epoch": 0.33819241982507287, "grad_norm": 9.027538747767453, "learning_rate": 1.6731633838952905e-05, "loss": 0.8222, "step": 145 }, { "epoch": 0.3498542274052478, "grad_norm": 7.2129295786721555, "learning_rate": 1.6424402709799404e-05, "loss": 0.7633, "step": 150 }, { "epoch": 0.36151603498542273, "grad_norm": 9.545693558216767, "learning_rate": 1.6106478796354382e-05, "loss": 0.8089, "step": 155 }, { "epoch": 0.37317784256559766, "grad_norm": 6.988871661431575, "learning_rate": 1.577839125159613e-05, "loss": 0.7499, "step": 160 }, { "epoch": 0.3848396501457726, "grad_norm": 6.859494877590503, "learning_rate": 1.5440686144864207e-05, "loss": 0.7838, "step": 165 }, { "epoch": 0.3965014577259475, "grad_norm": 10.280934791815703, "learning_rate": 1.5093925552980934e-05, "loss": 0.7613, "step": 170 }, { "epoch": 0.40816326530612246, "grad_norm": 6.569790871142721, "learning_rate": 1.4738686624729987e-05, "loss": 0.7123, "step": 175 }, { "epoch": 0.4198250728862974, "grad_norm": 6.872979375161206, "learning_rate": 1.4375560620249209e-05, "loss": 0.7469, "step": 180 }, { "epoch": 0.4314868804664723, "grad_norm": 7.636079144384279, "learning_rate": 1.400515192693645e-05, "loss": 0.7756, "step": 185 }, { "epoch": 0.44314868804664725, "grad_norm": 6.7063579238681115, "learning_rate": 1.362807705350641e-05, "loss": 0.761, "step": 190 }, { "epoch": 0.45481049562682213, "grad_norm": 10.606488159624234, "learning_rate": 1.3244963603872707e-05, "loss": 0.7783, "step": 195 }, { "epoch": 0.46647230320699706, "grad_norm": 10.490595832658897, "learning_rate": 1.285644923256311e-05, "loss": 0.752, "step": 200 }, { "epoch": 0.478134110787172, "grad_norm": 6.474060545032819, "learning_rate": 1.24631805834065e-05, "loss": 0.7329, "step": 205 }, { "epoch": 0.4897959183673469, "grad_norm": 6.195922083451403, "learning_rate": 1.2065812213258051e-05, "loss": 0.736, "step": 210 }, { "epoch": 0.5014577259475219, "grad_norm": 6.590048221376104, "learning_rate": 1.1665005502553912e-05, "loss": 0.7913, "step": 215 }, { "epoch": 0.5131195335276968, "grad_norm": 8.125319637789433, "learning_rate": 1.126142755450878e-05, "loss": 0.706, "step": 220 }, { "epoch": 0.5247813411078717, "grad_norm": 5.990671270414111, "learning_rate": 1.08557500847884e-05, "loss": 0.6663, "step": 225 }, { "epoch": 0.5364431486880467, "grad_norm": 5.457424439670184, "learning_rate": 1.044864830350515e-05, "loss": 0.6863, "step": 230 }, { "epoch": 0.5481049562682215, "grad_norm": 5.029069081649806, "learning_rate": 1.0040799791397444e-05, "loss": 0.7419, "step": 235 }, { "epoch": 0.5597667638483965, "grad_norm": 5.97976812053812, "learning_rate": 9.632883372063458e-06, "loss": 0.7409, "step": 240 }, { "epoch": 0.5714285714285714, "grad_norm": 5.73404097091946, "learning_rate": 9.225577982126234e-06, "loss": 0.721, "step": 245 }, { "epoch": 0.5830903790087464, "grad_norm": 7.281992128147145, "learning_rate": 8.819561541210698e-06, "loss": 0.7259, "step": 250 }, { "epoch": 0.5947521865889213, "grad_norm": 6.711399802745363, "learning_rate": 8.415509823613332e-06, "loss": 0.7184, "step": 255 }, { "epoch": 0.6064139941690962, "grad_norm": 5.973201176677851, "learning_rate": 8.014095333542548e-06, "loss": 0.6783, "step": 260 }, { "epoch": 0.6180758017492711, "grad_norm": 5.883172455604572, "learning_rate": 7.615986185801807e-06, "loss": 0.6978, "step": 265 }, { "epoch": 0.6297376093294461, "grad_norm": 7.527367196282041, "learning_rate": 7.221844993778464e-06, "loss": 0.7021, "step": 270 }, { "epoch": 0.641399416909621, "grad_norm": 6.248190397413816, "learning_rate": 6.832327766589177e-06, "loss": 0.645, "step": 275 }, { "epoch": 0.6530612244897959, "grad_norm": 8.28544581437624, "learning_rate": 6.4480828172174714e-06, "loss": 0.7004, "step": 280 }, { "epoch": 0.6647230320699709, "grad_norm": 6.406350729534571, "learning_rate": 6.069749683460765e-06, "loss": 0.6751, "step": 285 }, { "epoch": 0.6763848396501457, "grad_norm": 5.604361979041009, "learning_rate": 5.6979580634828125e-06, "loss": 0.6728, "step": 290 }, { "epoch": 0.6880466472303207, "grad_norm": 6.500294469087635, "learning_rate": 5.333326767743263e-06, "loss": 0.7096, "step": 295 }, { "epoch": 0.6997084548104956, "grad_norm": 6.173069337697736, "learning_rate": 4.976462689048718e-06, "loss": 0.6718, "step": 300 }, { "epoch": 0.7113702623906706, "grad_norm": 5.7326380834003094, "learning_rate": 4.6279597924395434e-06, "loss": 0.6785, "step": 305 }, { "epoch": 0.7230320699708455, "grad_norm": 6.5217701514071935, "learning_rate": 4.2883981265936884e-06, "loss": 0.661, "step": 310 }, { "epoch": 0.7346938775510204, "grad_norm": 5.886240007868726, "learning_rate": 3.958342858392893e-06, "loss": 0.6106, "step": 315 }, { "epoch": 0.7463556851311953, "grad_norm": 5.8543609692801315, "learning_rate": 3.6383433322582028e-06, "loss": 0.6282, "step": 320 }, { "epoch": 0.7580174927113703, "grad_norm": 5.540207550170581, "learning_rate": 3.3289321558203767e-06, "loss": 0.6551, "step": 325 }, { "epoch": 0.7696793002915452, "grad_norm": 6.364763436662215, "learning_rate": 3.0306243134470668e-06, "loss": 0.6425, "step": 330 }, { "epoch": 0.7813411078717201, "grad_norm": 6.546697364781114, "learning_rate": 2.7439163091021525e-06, "loss": 0.6829, "step": 335 }, { "epoch": 0.793002915451895, "grad_norm": 5.54305433545965, "learning_rate": 2.469285339963892e-06, "loss": 0.608, "step": 340 }, { "epoch": 0.8046647230320699, "grad_norm": 5.2130427650497335, "learning_rate": 2.207188502177313e-06, "loss": 0.6752, "step": 345 }, { "epoch": 0.8163265306122449, "grad_norm": 5.966791369980658, "learning_rate": 1.958062030062795e-06, "loss": 0.6451, "step": 350 }, { "epoch": 0.8279883381924198, "grad_norm": 5.8319616356981685, "learning_rate": 1.722320570047089e-06, "loss": 0.6544, "step": 355 }, { "epoch": 0.8396501457725948, "grad_norm": 5.587246353476323, "learning_rate": 1.500356490525261e-06, "loss": 0.6203, "step": 360 }, { "epoch": 0.8513119533527697, "grad_norm": 4.792540178325021, "learning_rate": 1.2925392288022299e-06, "loss": 0.613, "step": 365 }, { "epoch": 0.8629737609329446, "grad_norm": 5.761492562329574, "learning_rate": 1.099214676200816e-06, "loss": 0.5613, "step": 370 }, { "epoch": 0.8746355685131195, "grad_norm": 6.12518244267695, "learning_rate": 9.207046023597866e-07, "loss": 0.644, "step": 375 }, { "epoch": 0.8862973760932945, "grad_norm": 6.335150291563363, "learning_rate": 7.573061196800414e-07, "loss": 0.6359, "step": 380 }, { "epoch": 0.8979591836734694, "grad_norm": 5.644774834626501, "learning_rate": 6.092911888103404e-07, "loss": 0.6445, "step": 385 }, { "epoch": 0.9096209912536443, "grad_norm": 6.208859110734176, "learning_rate": 4.769061659956464e-07, "loss": 0.6238, "step": 390 }, { "epoch": 0.9212827988338192, "grad_norm": 6.180815291205464, "learning_rate": 3.603713930414676e-07, "loss": 0.6209, "step": 395 }, { "epoch": 0.9329446064139941, "grad_norm": 5.580445503476568, "learning_rate": 2.5988083057666534e-07, "loss": 0.6283, "step": 400 }, { "epoch": 0.9446064139941691, "grad_norm": 6.358276656532779, "learning_rate": 1.7560173522513268e-07, "loss": 0.6104, "step": 405 }, { "epoch": 0.956268221574344, "grad_norm": 5.486425722613415, "learning_rate": 1.0767438122364914e-07, "loss": 0.6326, "step": 410 }, { "epoch": 0.967930029154519, "grad_norm": 6.212119018151375, "learning_rate": 5.621182694925731e-08, "loss": 0.5795, "step": 415 }, { "epoch": 0.9795918367346939, "grad_norm": 5.964963969186111, "learning_rate": 2.1299726744747896e-08, "loss": 0.6607, "step": 420 }, { "epoch": 0.9912536443148688, "grad_norm": 6.566929080347384, "learning_rate": 2.9961883554674443e-09, "loss": 0.6411, "step": 425 }, { "epoch": 0.9982507288629737, "eval_loss": 0.5953396558761597, "eval_runtime": 106.3241, "eval_samples_per_second": 2.633, "eval_steps_per_second": 0.329, "step": 428 }, { "epoch": 0.9982507288629737, "step": 428, "total_flos": 11492314030080.0, "train_loss": 0.7172290003188303, "train_runtime": 14976.0603, "train_samples_per_second": 0.916, "train_steps_per_second": 0.029 } ], "logging_steps": 5, "max_steps": 428, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 11492314030080.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }