{ "best_metric": 1.7585577964782715, "best_model_checkpoint": "outputs/checkpoint-50", "epoch": 0.14967259120673526, "eval_steps": 25, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014967259120673526, "grad_norm": 2.434373378753662, "learning_rate": 4e-05, "loss": 2.0037, "step": 1 }, { "epoch": 0.002993451824134705, "grad_norm": 2.795464038848877, "learning_rate": 8e-05, "loss": 1.9814, "step": 2 }, { "epoch": 0.004490177736202058, "grad_norm": 2.0998575687408447, "learning_rate": 0.00012, "loss": 2.0044, "step": 3 }, { "epoch": 0.00598690364826941, "grad_norm": 4.470895290374756, "learning_rate": 0.00016, "loss": 1.8696, "step": 4 }, { "epoch": 0.007483629560336763, "grad_norm": 1.6047176122665405, "learning_rate": 0.0002, "loss": 1.8021, "step": 5 }, { "epoch": 0.008980355472404116, "grad_norm": 1.9450230598449707, "learning_rate": 0.0001996638655462185, "loss": 1.8558, "step": 6 }, { "epoch": 0.01047708138447147, "grad_norm": 1.4420437812805176, "learning_rate": 0.00019932773109243698, "loss": 1.9079, "step": 7 }, { "epoch": 0.01197380729653882, "grad_norm": 2.428009033203125, "learning_rate": 0.00019899159663865548, "loss": 1.7173, "step": 8 }, { "epoch": 0.013470533208606174, "grad_norm": 1.6478683948516846, "learning_rate": 0.00019865546218487395, "loss": 1.7631, "step": 9 }, { "epoch": 0.014967259120673527, "grad_norm": 1.6283013820648193, "learning_rate": 0.00019831932773109245, "loss": 1.9375, "step": 10 }, { "epoch": 0.01646398503274088, "grad_norm": 1.7736356258392334, "learning_rate": 0.00019798319327731095, "loss": 1.7921, "step": 11 }, { "epoch": 0.017960710944808233, "grad_norm": 1.3353335857391357, "learning_rate": 0.00019764705882352942, "loss": 1.8699, "step": 12 }, { "epoch": 0.019457436856875586, "grad_norm": 1.5582761764526367, "learning_rate": 0.00019731092436974792, "loss": 1.8015, "step": 13 }, { "epoch": 0.02095416276894294, "grad_norm": 1.3032814264297485, "learning_rate": 0.00019697478991596642, "loss": 1.8513, "step": 14 }, { "epoch": 0.02245088868101029, "grad_norm": 1.3724833726882935, "learning_rate": 0.00019663865546218486, "loss": 1.8694, "step": 15 }, { "epoch": 0.02394761459307764, "grad_norm": 1.715156078338623, "learning_rate": 0.00019630252100840336, "loss": 1.7997, "step": 16 }, { "epoch": 0.025444340505144995, "grad_norm": 1.9989070892333984, "learning_rate": 0.00019596638655462186, "loss": 1.7037, "step": 17 }, { "epoch": 0.026941066417212348, "grad_norm": 1.6255011558532715, "learning_rate": 0.00019563025210084033, "loss": 1.8382, "step": 18 }, { "epoch": 0.0284377923292797, "grad_norm": 1.305870532989502, "learning_rate": 0.00019529411764705883, "loss": 1.8095, "step": 19 }, { "epoch": 0.029934518241347054, "grad_norm": 3.40390944480896, "learning_rate": 0.0001949579831932773, "loss": 1.6112, "step": 20 }, { "epoch": 0.0314312441534144, "grad_norm": 1.1514052152633667, "learning_rate": 0.0001946218487394958, "loss": 1.9725, "step": 21 }, { "epoch": 0.03292797006548176, "grad_norm": 1.9419797658920288, "learning_rate": 0.0001942857142857143, "loss": 1.8492, "step": 22 }, { "epoch": 0.03442469597754911, "grad_norm": 1.830913782119751, "learning_rate": 0.00019394957983193278, "loss": 1.7852, "step": 23 }, { "epoch": 0.035921421889616466, "grad_norm": 1.345436453819275, "learning_rate": 0.00019361344537815127, "loss": 1.8929, "step": 24 }, { "epoch": 0.037418147801683815, "grad_norm": 1.1833657026290894, "learning_rate": 0.00019327731092436975, "loss": 1.8822, "step": 25 }, { "epoch": 0.037418147801683815, "eval_loss": 1.7638475894927979, "eval_runtime": 13.3387, "eval_samples_per_second": 4.798, "eval_steps_per_second": 2.399, "step": 25 }, { "epoch": 0.03891487371375117, "grad_norm": 1.4092592000961304, "learning_rate": 0.00019294117647058825, "loss": 1.954, "step": 26 }, { "epoch": 0.04041159962581852, "grad_norm": 1.201281189918518, "learning_rate": 0.00019260504201680674, "loss": 1.9364, "step": 27 }, { "epoch": 0.04190832553788588, "grad_norm": 1.407148838043213, "learning_rate": 0.00019226890756302522, "loss": 1.7673, "step": 28 }, { "epoch": 0.04340505144995323, "grad_norm": 1.3781392574310303, "learning_rate": 0.00019193277310924372, "loss": 1.81, "step": 29 }, { "epoch": 0.04490177736202058, "grad_norm": 1.4952391386032104, "learning_rate": 0.00019159663865546221, "loss": 1.8216, "step": 30 }, { "epoch": 0.046398503274087934, "grad_norm": 1.5127140283584595, "learning_rate": 0.0001912605042016807, "loss": 1.8316, "step": 31 }, { "epoch": 0.04789522918615528, "grad_norm": 1.3208520412445068, "learning_rate": 0.00019092436974789919, "loss": 1.7302, "step": 32 }, { "epoch": 0.04939195509822264, "grad_norm": 1.3473477363586426, "learning_rate": 0.00019058823529411766, "loss": 1.9661, "step": 33 }, { "epoch": 0.05088868101028999, "grad_norm": 1.201379418373108, "learning_rate": 0.00019025210084033613, "loss": 1.8531, "step": 34 }, { "epoch": 0.052385406922357346, "grad_norm": 1.32240891456604, "learning_rate": 0.00018991596638655463, "loss": 1.7485, "step": 35 }, { "epoch": 0.053882132834424695, "grad_norm": 1.3222694396972656, "learning_rate": 0.0001895798319327731, "loss": 1.7939, "step": 36 }, { "epoch": 0.05537885874649205, "grad_norm": 1.1342493295669556, "learning_rate": 0.0001892436974789916, "loss": 1.855, "step": 37 }, { "epoch": 0.0568755846585594, "grad_norm": 1.4912521839141846, "learning_rate": 0.0001889075630252101, "loss": 1.8394, "step": 38 }, { "epoch": 0.05837231057062675, "grad_norm": 1.4635943174362183, "learning_rate": 0.00018857142857142857, "loss": 1.7669, "step": 39 }, { "epoch": 0.05986903648269411, "grad_norm": 1.4757208824157715, "learning_rate": 0.00018823529411764707, "loss": 1.7842, "step": 40 }, { "epoch": 0.06136576239476146, "grad_norm": 1.5162277221679688, "learning_rate": 0.00018789915966386554, "loss": 1.8873, "step": 41 }, { "epoch": 0.0628624883068288, "grad_norm": 1.3085792064666748, "learning_rate": 0.00018756302521008404, "loss": 1.9716, "step": 42 }, { "epoch": 0.06435921421889616, "grad_norm": 1.1214959621429443, "learning_rate": 0.00018722689075630254, "loss": 1.8991, "step": 43 }, { "epoch": 0.06585594013096352, "grad_norm": 1.1944588422775269, "learning_rate": 0.000186890756302521, "loss": 1.8894, "step": 44 }, { "epoch": 0.06735266604303088, "grad_norm": 1.509717345237732, "learning_rate": 0.0001865546218487395, "loss": 1.8035, "step": 45 }, { "epoch": 0.06884939195509822, "grad_norm": 1.3220465183258057, "learning_rate": 0.000186218487394958, "loss": 1.8673, "step": 46 }, { "epoch": 0.07034611786716558, "grad_norm": 1.3592686653137207, "learning_rate": 0.00018588235294117648, "loss": 1.8396, "step": 47 }, { "epoch": 0.07184284377923293, "grad_norm": 1.3568888902664185, "learning_rate": 0.00018554621848739498, "loss": 1.6562, "step": 48 }, { "epoch": 0.07333956969130027, "grad_norm": 1.1371209621429443, "learning_rate": 0.00018521008403361345, "loss": 1.9563, "step": 49 }, { "epoch": 0.07483629560336763, "grad_norm": 1.234221339225769, "learning_rate": 0.00018487394957983195, "loss": 1.7702, "step": 50 }, { "epoch": 0.07483629560336763, "eval_loss": 1.7585577964782715, "eval_runtime": 9.906, "eval_samples_per_second": 6.461, "eval_steps_per_second": 3.23, "step": 50 }, { "epoch": 0.07633302151543499, "grad_norm": 1.4679025411605835, "learning_rate": 0.00018453781512605045, "loss": 1.7903, "step": 51 }, { "epoch": 0.07782974742750234, "grad_norm": 1.6469783782958984, "learning_rate": 0.0001842016806722689, "loss": 1.9121, "step": 52 }, { "epoch": 0.07932647333956969, "grad_norm": 1.0950040817260742, "learning_rate": 0.0001838655462184874, "loss": 1.8015, "step": 53 }, { "epoch": 0.08082319925163704, "grad_norm": 1.4614354372024536, "learning_rate": 0.0001835294117647059, "loss": 1.8627, "step": 54 }, { "epoch": 0.0823199251637044, "grad_norm": 1.0772849321365356, "learning_rate": 0.00018319327731092437, "loss": 1.8474, "step": 55 }, { "epoch": 0.08381665107577176, "grad_norm": 0.8980317115783691, "learning_rate": 0.00018285714285714286, "loss": 1.9381, "step": 56 }, { "epoch": 0.0853133769878391, "grad_norm": 1.028698205947876, "learning_rate": 0.00018252100840336134, "loss": 1.8726, "step": 57 }, { "epoch": 0.08681010289990646, "grad_norm": 1.2643156051635742, "learning_rate": 0.00018218487394957984, "loss": 1.937, "step": 58 }, { "epoch": 0.08830682881197381, "grad_norm": 1.0845692157745361, "learning_rate": 0.00018184873949579833, "loss": 1.9708, "step": 59 }, { "epoch": 0.08980355472404115, "grad_norm": 1.2025495767593384, "learning_rate": 0.0001815126050420168, "loss": 1.8674, "step": 60 }, { "epoch": 0.09130028063610851, "grad_norm": 1.2060717344284058, "learning_rate": 0.0001811764705882353, "loss": 1.8268, "step": 61 }, { "epoch": 0.09279700654817587, "grad_norm": 1.3296293020248413, "learning_rate": 0.0001808403361344538, "loss": 1.6956, "step": 62 }, { "epoch": 0.09429373246024322, "grad_norm": 1.2353034019470215, "learning_rate": 0.00018050420168067228, "loss": 1.9816, "step": 63 }, { "epoch": 0.09579045837231057, "grad_norm": 1.5975768566131592, "learning_rate": 0.00018016806722689078, "loss": 1.7846, "step": 64 }, { "epoch": 0.09728718428437792, "grad_norm": 1.2220622301101685, "learning_rate": 0.00017983193277310925, "loss": 1.7895, "step": 65 }, { "epoch": 0.09878391019644528, "grad_norm": 1.2025718688964844, "learning_rate": 0.00017949579831932775, "loss": 1.9242, "step": 66 }, { "epoch": 0.10028063610851262, "grad_norm": 3.2830123901367188, "learning_rate": 0.00017915966386554625, "loss": 1.7076, "step": 67 }, { "epoch": 0.10177736202057998, "grad_norm": 1.5499017238616943, "learning_rate": 0.00017882352941176472, "loss": 1.7964, "step": 68 }, { "epoch": 0.10327408793264733, "grad_norm": 1.4630420207977295, "learning_rate": 0.00017848739495798322, "loss": 1.8724, "step": 69 }, { "epoch": 0.10477081384471469, "grad_norm": 1.4005722999572754, "learning_rate": 0.0001781512605042017, "loss": 1.6889, "step": 70 }, { "epoch": 0.10626753975678203, "grad_norm": 1.114207148551941, "learning_rate": 0.00017781512605042016, "loss": 1.8272, "step": 71 }, { "epoch": 0.10776426566884939, "grad_norm": 1.4557619094848633, "learning_rate": 0.00017747899159663866, "loss": 1.6877, "step": 72 }, { "epoch": 0.10926099158091675, "grad_norm": 1.4767951965332031, "learning_rate": 0.00017714285714285713, "loss": 1.8667, "step": 73 }, { "epoch": 0.1107577174929841, "grad_norm": 1.3078974485397339, "learning_rate": 0.00017680672268907563, "loss": 1.9319, "step": 74 }, { "epoch": 0.11225444340505145, "grad_norm": 1.1861608028411865, "learning_rate": 0.00017647058823529413, "loss": 1.9233, "step": 75 }, { "epoch": 0.11225444340505145, "eval_loss": 1.7611756324768066, "eval_runtime": 9.9015, "eval_samples_per_second": 6.464, "eval_steps_per_second": 3.232, "step": 75 }, { "epoch": 0.1137511693171188, "grad_norm": 1.1504981517791748, "learning_rate": 0.0001761344537815126, "loss": 1.8044, "step": 76 }, { "epoch": 0.11524789522918616, "grad_norm": 1.3776837587356567, "learning_rate": 0.0001757983193277311, "loss": 1.725, "step": 77 }, { "epoch": 0.1167446211412535, "grad_norm": 1.3975869417190552, "learning_rate": 0.0001754621848739496, "loss": 1.7935, "step": 78 }, { "epoch": 0.11824134705332086, "grad_norm": 1.3506461381912231, "learning_rate": 0.00017512605042016807, "loss": 1.7342, "step": 79 }, { "epoch": 0.11973807296538821, "grad_norm": 1.1317209005355835, "learning_rate": 0.00017478991596638657, "loss": 1.8149, "step": 80 }, { "epoch": 0.12123479887745557, "grad_norm": 1.2540264129638672, "learning_rate": 0.00017445378151260504, "loss": 1.84, "step": 81 }, { "epoch": 0.12273152478952291, "grad_norm": 1.23360276222229, "learning_rate": 0.00017411764705882354, "loss": 1.7623, "step": 82 }, { "epoch": 0.12422825070159027, "grad_norm": 1.0347758531570435, "learning_rate": 0.00017378151260504204, "loss": 1.7381, "step": 83 }, { "epoch": 0.1257249766136576, "grad_norm": 1.4501961469650269, "learning_rate": 0.0001734453781512605, "loss": 1.7075, "step": 84 }, { "epoch": 0.12722170252572498, "grad_norm": 1.0509997606277466, "learning_rate": 0.000173109243697479, "loss": 1.7295, "step": 85 }, { "epoch": 0.12871842843779233, "grad_norm": 1.2986621856689453, "learning_rate": 0.00017277310924369748, "loss": 1.7988, "step": 86 }, { "epoch": 0.13021515434985967, "grad_norm": 1.1701687574386597, "learning_rate": 0.00017243697478991598, "loss": 1.6763, "step": 87 }, { "epoch": 0.13171188026192704, "grad_norm": 1.2512173652648926, "learning_rate": 0.00017210084033613448, "loss": 1.6641, "step": 88 }, { "epoch": 0.13320860617399438, "grad_norm": 1.658525824546814, "learning_rate": 0.00017176470588235293, "loss": 1.6849, "step": 89 }, { "epoch": 0.13470533208606175, "grad_norm": 1.5465582609176636, "learning_rate": 0.00017142857142857143, "loss": 1.6439, "step": 90 }, { "epoch": 0.1362020579981291, "grad_norm": 1.3289684057235718, "learning_rate": 0.00017109243697478992, "loss": 1.7429, "step": 91 }, { "epoch": 0.13769878391019644, "grad_norm": 1.3123184442520142, "learning_rate": 0.0001707563025210084, "loss": 1.6429, "step": 92 }, { "epoch": 0.1391955098222638, "grad_norm": 1.385330319404602, "learning_rate": 0.0001704201680672269, "loss": 1.8257, "step": 93 }, { "epoch": 0.14069223573433115, "grad_norm": 1.3719394207000732, "learning_rate": 0.0001700840336134454, "loss": 1.7493, "step": 94 }, { "epoch": 0.1421889616463985, "grad_norm": 1.468948245048523, "learning_rate": 0.00016974789915966387, "loss": 1.8626, "step": 95 }, { "epoch": 0.14368568755846586, "grad_norm": 1.2705055475234985, "learning_rate": 0.00016941176470588237, "loss": 1.79, "step": 96 }, { "epoch": 0.1451824134705332, "grad_norm": 1.0876643657684326, "learning_rate": 0.00016907563025210084, "loss": 1.9631, "step": 97 }, { "epoch": 0.14667913938260055, "grad_norm": 1.1760327816009521, "learning_rate": 0.00016873949579831934, "loss": 1.91, "step": 98 }, { "epoch": 0.14817586529466792, "grad_norm": 1.0915436744689941, "learning_rate": 0.00016840336134453784, "loss": 1.8369, "step": 99 }, { "epoch": 0.14967259120673526, "grad_norm": 1.4619494676589966, "learning_rate": 0.0001680672268907563, "loss": 1.7074, "step": 100 }, { "epoch": 0.14967259120673526, "eval_loss": 1.7622296810150146, "eval_runtime": 9.9165, "eval_samples_per_second": 6.454, "eval_steps_per_second": 3.227, "step": 100 } ], "logging_steps": 1, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0866208664612045e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }