{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2479, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016138793625176519, "grad_norm": 9.077141761779785, "learning_rate": 4e-08, "logits/chosen": -1.8235963582992554, "logits/rejected": -1.7823007106781006, "logps/chosen": -697.4658203125, "logps/rejected": -162.27154541015625, "loss": 0.6877, "rewards/accuracies": 0.25, "rewards/chosen": -0.002143668942153454, "rewards/margins": 0.011764192022383213, "rewards/rejected": -0.013907860964536667, "step": 4 }, { "epoch": 0.0032277587250353038, "grad_norm": 9.186835289001465, "learning_rate": 9.333333333333334e-08, "logits/chosen": -1.819604754447937, "logits/rejected": -1.8089861869812012, "logps/chosen": -632.4598999023438, "logps/rejected": -179.32366943359375, "loss": 0.7103, "rewards/accuracies": 0.5, "rewards/chosen": -0.012548781931400299, "rewards/margins": -0.031342748552560806, "rewards/rejected": 0.018793966621160507, "step": 8 }, { "epoch": 0.004841638087552955, "grad_norm": 8.93552303314209, "learning_rate": 1.4666666666666666e-07, "logits/chosen": -1.7898244857788086, "logits/rejected": -1.774796962738037, "logps/chosen": -609.4016723632812, "logps/rejected": -188.09117126464844, "loss": 0.6696, "rewards/accuracies": 0.625, "rewards/chosen": 0.05078878626227379, "rewards/margins": 0.05015573650598526, "rewards/rejected": 0.0006330488249659538, "step": 12 }, { "epoch": 0.0064555174500706075, "grad_norm": 9.988286972045898, "learning_rate": 2e-07, "logits/chosen": -1.8582216501235962, "logits/rejected": -1.8168505430221558, "logps/chosen": -657.343017578125, "logps/rejected": -171.9979248046875, "loss": 0.6892, "rewards/accuracies": 0.5625, "rewards/chosen": 0.013883782550692558, "rewards/margins": 0.010537289083003998, "rewards/rejected": 0.003346489742398262, "step": 16 }, { "epoch": 0.008069396812588258, "grad_norm": 10.504063606262207, "learning_rate": 2.533333333333333e-07, "logits/chosen": -1.9168463945388794, "logits/rejected": -1.8164907693862915, "logps/chosen": -621.0966796875, "logps/rejected": -174.78024291992188, "loss": 0.6869, "rewards/accuracies": 0.6875, "rewards/chosen": 0.004906081594526768, "rewards/margins": 0.013704253360629082, "rewards/rejected": -0.00879816897213459, "step": 20 }, { "epoch": 0.00968327617510591, "grad_norm": 10.774713516235352, "learning_rate": 3.066666666666666e-07, "logits/chosen": -1.8839260339736938, "logits/rejected": -1.8332624435424805, "logps/chosen": -641.7215576171875, "logps/rejected": -180.1142120361328, "loss": 0.7061, "rewards/accuracies": 0.5, "rewards/chosen": -0.00733261089771986, "rewards/margins": -0.023157311603426933, "rewards/rejected": 0.01582469791173935, "step": 24 }, { "epoch": 0.011297155537623563, "grad_norm": 10.0447359085083, "learning_rate": 3.6e-07, "logits/chosen": -1.8547735214233398, "logits/rejected": -1.8206897974014282, "logps/chosen": -636.6668701171875, "logps/rejected": -166.59011840820312, "loss": 0.7058, "rewards/accuracies": 0.5, "rewards/chosen": -0.006894588936120272, "rewards/margins": -0.023304034024477005, "rewards/rejected": 0.01640944369137287, "step": 28 }, { "epoch": 0.012911034900141215, "grad_norm": 10.692578315734863, "learning_rate": 4.1333333333333333e-07, "logits/chosen": -1.8482537269592285, "logits/rejected": -1.8186211585998535, "logps/chosen": -620.1280517578125, "logps/rejected": -187.54550170898438, "loss": 0.6731, "rewards/accuracies": 0.75, "rewards/chosen": 0.01992187649011612, "rewards/margins": 0.04289202392101288, "rewards/rejected": -0.022970151156187057, "step": 32 }, { "epoch": 0.014524914262658867, "grad_norm": 11.262819290161133, "learning_rate": 4.6666666666666666e-07, "logits/chosen": -1.8519309759140015, "logits/rejected": -1.806135892868042, "logps/chosen": -619.154541015625, "logps/rejected": -186.5546875, "loss": 0.6918, "rewards/accuracies": 0.625, "rewards/chosen": 0.0045795440673828125, "rewards/margins": 0.005698011722415686, "rewards/rejected": -0.0011184688191860914, "step": 36 }, { "epoch": 0.016138793625176517, "grad_norm": 9.49013614654541, "learning_rate": 5.2e-07, "logits/chosen": -1.8408842086791992, "logits/rejected": -1.8239388465881348, "logps/chosen": -582.840576171875, "logps/rejected": -166.47251892089844, "loss": 0.6954, "rewards/accuracies": 0.5, "rewards/chosen": -0.02891865000128746, "rewards/margins": -0.0021320355590432882, "rewards/rejected": -0.02678661234676838, "step": 40 }, { "epoch": 0.01775267298769417, "grad_norm": 10.238863945007324, "learning_rate": 5.733333333333334e-07, "logits/chosen": -1.8676425218582153, "logits/rejected": -1.829979419708252, "logps/chosen": -715.9750366210938, "logps/rejected": -182.09170532226562, "loss": 0.6906, "rewards/accuracies": 0.4375, "rewards/chosen": 0.02002429962158203, "rewards/margins": 0.008623981848359108, "rewards/rejected": 0.011400317773222923, "step": 44 }, { "epoch": 0.01936655235021182, "grad_norm": 10.513614654541016, "learning_rate": 6.266666666666667e-07, "logits/chosen": -1.9073965549468994, "logits/rejected": -1.8164485692977905, "logps/chosen": -682.51220703125, "logps/rejected": -176.90841674804688, "loss": 0.7003, "rewards/accuracies": 0.3125, "rewards/chosen": 0.020811652764678, "rewards/margins": -0.01142406277358532, "rewards/rejected": 0.03223571926355362, "step": 48 }, { "epoch": 0.020980431712729473, "grad_norm": 10.410325050354004, "learning_rate": 6.800000000000001e-07, "logits/chosen": -1.841076374053955, "logits/rejected": -1.874393105506897, "logps/chosen": -718.8765869140625, "logps/rejected": -176.82833862304688, "loss": 0.7067, "rewards/accuracies": 0.375, "rewards/chosen": -0.012143135070800781, "rewards/margins": -0.024462033063173294, "rewards/rejected": 0.012318897992372513, "step": 52 }, { "epoch": 0.022594311075247127, "grad_norm": 10.754411697387695, "learning_rate": 7.333333333333332e-07, "logits/chosen": -1.882813572883606, "logits/rejected": -1.8344117403030396, "logps/chosen": -707.3211669921875, "logps/rejected": -161.5318145751953, "loss": 0.6793, "rewards/accuracies": 0.6875, "rewards/chosen": 0.023887254297733307, "rewards/margins": 0.029755450785160065, "rewards/rejected": -0.005868196487426758, "step": 56 }, { "epoch": 0.024208190437764777, "grad_norm": 10.008980751037598, "learning_rate": 7.866666666666666e-07, "logits/chosen": -1.8649877309799194, "logits/rejected": -1.8097047805786133, "logps/chosen": -571.0418701171875, "logps/rejected": -191.5659942626953, "loss": 0.6944, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0006765371654182673, "rewards/margins": -0.00047426018863916397, "rewards/rejected": -0.0002022748813033104, "step": 60 }, { "epoch": 0.02582206980028243, "grad_norm": 9.264538764953613, "learning_rate": 8.399999999999999e-07, "logits/chosen": -1.8578335046768188, "logits/rejected": -1.8112698793411255, "logps/chosen": -573.8051147460938, "logps/rejected": -170.08560180664062, "loss": 0.6786, "rewards/accuracies": 0.625, "rewards/chosen": 0.03522224351763725, "rewards/margins": 0.03165812790393829, "rewards/rejected": 0.003564119804650545, "step": 64 }, { "epoch": 0.02743594916280008, "grad_norm": 9.287182807922363, "learning_rate": 8.933333333333333e-07, "logits/chosen": -1.8697422742843628, "logits/rejected": -1.805645227432251, "logps/chosen": -504.7207946777344, "logps/rejected": -180.1349639892578, "loss": 0.6875, "rewards/accuracies": 0.4375, "rewards/chosen": 0.024673843756318092, "rewards/margins": 0.014757443219423294, "rewards/rejected": 0.009916400536894798, "step": 68 }, { "epoch": 0.029049828525317734, "grad_norm": 11.626725196838379, "learning_rate": 9.466666666666666e-07, "logits/chosen": -1.828177571296692, "logits/rejected": -1.7643202543258667, "logps/chosen": -689.0641479492188, "logps/rejected": -191.6230926513672, "loss": 0.6849, "rewards/accuracies": 0.75, "rewards/chosen": 0.01744556427001953, "rewards/margins": 0.02034006081521511, "rewards/rejected": -0.002894497010856867, "step": 72 }, { "epoch": 0.030663707887835383, "grad_norm": 9.36368179321289, "learning_rate": 1e-06, "logits/chosen": -1.8573331832885742, "logits/rejected": -1.8401379585266113, "logps/chosen": -651.4483642578125, "logps/rejected": -184.19436645507812, "loss": 0.6669, "rewards/accuracies": 0.75, "rewards/chosen": 0.04055405035614967, "rewards/margins": 0.05503583699464798, "rewards/rejected": -0.014481783844530582, "step": 76 }, { "epoch": 0.03227758725035303, "grad_norm": 9.259771347045898, "learning_rate": 9.983361064891845e-07, "logits/chosen": -1.8858129978179932, "logits/rejected": -1.8323912620544434, "logps/chosen": -609.9225463867188, "logps/rejected": -188.63671875, "loss": 0.6546, "rewards/accuracies": 0.875, "rewards/chosen": 0.07321024686098099, "rewards/margins": 0.08051343262195587, "rewards/rejected": -0.007303190883249044, "step": 80 }, { "epoch": 0.03389146661287069, "grad_norm": 10.036303520202637, "learning_rate": 9.966722129783693e-07, "logits/chosen": -1.7906129360198975, "logits/rejected": -1.779247760772705, "logps/chosen": -625.0037841796875, "logps/rejected": -171.78475952148438, "loss": 0.6508, "rewards/accuracies": 0.75, "rewards/chosen": 0.06164121627807617, "rewards/margins": 0.09076390415430069, "rewards/rejected": -0.029122687876224518, "step": 84 }, { "epoch": 0.03550534597538834, "grad_norm": 10.572989463806152, "learning_rate": 9.95008319467554e-07, "logits/chosen": -1.8626290559768677, "logits/rejected": -1.8228033781051636, "logps/chosen": -712.7799682617188, "logps/rejected": -193.0518798828125, "loss": 0.636, "rewards/accuracies": 0.875, "rewards/chosen": 0.09401846677064896, "rewards/margins": 0.1203092634677887, "rewards/rejected": -0.02629080042243004, "step": 88 }, { "epoch": 0.037119225337905994, "grad_norm": 8.901836395263672, "learning_rate": 9.933444259567387e-07, "logits/chosen": -1.8151315450668335, "logits/rejected": -1.7728750705718994, "logps/chosen": -708.342041015625, "logps/rejected": -155.09866333007812, "loss": 0.6499, "rewards/accuracies": 0.6875, "rewards/chosen": 0.08686323463916779, "rewards/margins": 0.0922490730881691, "rewards/rejected": -0.005385827273130417, "step": 92 }, { "epoch": 0.03873310470042364, "grad_norm": 9.04437255859375, "learning_rate": 9.916805324459233e-07, "logits/chosen": -1.8634467124938965, "logits/rejected": -1.7756600379943848, "logps/chosen": -525.9450073242188, "logps/rejected": -180.24559020996094, "loss": 0.6298, "rewards/accuracies": 1.0, "rewards/chosen": 0.11129942536354065, "rewards/margins": 0.132988840341568, "rewards/rejected": -0.021689414978027344, "step": 96 }, { "epoch": 0.040346984062941293, "grad_norm": 9.580463409423828, "learning_rate": 9.900166389351081e-07, "logits/chosen": -1.8459299802780151, "logits/rejected": -1.8302876949310303, "logps/chosen": -761.770751953125, "logps/rejected": -177.57623291015625, "loss": 0.6122, "rewards/accuracies": 0.875, "rewards/chosen": 0.1674884855747223, "rewards/margins": 0.17264166474342346, "rewards/rejected": -0.005153179168701172, "step": 100 }, { "epoch": 0.04196086342545895, "grad_norm": 11.016587257385254, "learning_rate": 9.883527454242927e-07, "logits/chosen": -1.8891446590423584, "logits/rejected": -1.8224517107009888, "logps/chosen": -630.7316284179688, "logps/rejected": -206.77084350585938, "loss": 0.6109, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1402856856584549, "rewards/margins": 0.1757189929485321, "rewards/rejected": -0.035433292388916016, "step": 104 }, { "epoch": 0.0435747427879766, "grad_norm": 9.553730010986328, "learning_rate": 9.866888519134775e-07, "logits/chosen": -1.8555597066879272, "logits/rejected": -1.7626439332962036, "logps/chosen": -633.5787353515625, "logps/rejected": -162.94183349609375, "loss": 0.6351, "rewards/accuracies": 0.8125, "rewards/chosen": 0.13311901688575745, "rewards/margins": 0.12320089340209961, "rewards/rejected": 0.00991811789572239, "step": 108 }, { "epoch": 0.045188622150494254, "grad_norm": 7.1617512702941895, "learning_rate": 9.85024958402662e-07, "logits/chosen": -1.9020514488220215, "logits/rejected": -1.8328626155853271, "logps/chosen": -510.8736572265625, "logps/rejected": -167.32327270507812, "loss": 0.5996, "rewards/accuracies": 1.0, "rewards/chosen": 0.17981749773025513, "rewards/margins": 0.19826579093933105, "rewards/rejected": -0.018448304384946823, "step": 112 }, { "epoch": 0.0468025015130119, "grad_norm": 11.529860496520996, "learning_rate": 9.83361064891847e-07, "logits/chosen": -1.8347463607788086, "logits/rejected": -1.8149213790893555, "logps/chosen": -681.3889770507812, "logps/rejected": -155.6142578125, "loss": 0.5692, "rewards/accuracies": 1.0, "rewards/chosen": 0.22275543212890625, "rewards/margins": 0.26891106367111206, "rewards/rejected": -0.046155646443367004, "step": 116 }, { "epoch": 0.048416380875529554, "grad_norm": 8.369261741638184, "learning_rate": 9.816971713810315e-07, "logits/chosen": -1.851243495941162, "logits/rejected": -1.8446911573410034, "logps/chosen": -680.895263671875, "logps/rejected": -171.30349731445312, "loss": 0.5742, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1928505003452301, "rewards/margins": 0.2605333626270294, "rewards/rejected": -0.06768283993005753, "step": 120 }, { "epoch": 0.05003026023804721, "grad_norm": 10.11263656616211, "learning_rate": 9.800332778702163e-07, "logits/chosen": -1.8450337648391724, "logits/rejected": -1.840282917022705, "logps/chosen": -599.8352661132812, "logps/rejected": -174.2366180419922, "loss": 0.5428, "rewards/accuracies": 1.0, "rewards/chosen": 0.2672262191772461, "rewards/margins": 0.33165156841278076, "rewards/rejected": -0.06442537158727646, "step": 124 }, { "epoch": 0.05164413960056486, "grad_norm": 10.036280632019043, "learning_rate": 9.783693843594009e-07, "logits/chosen": -1.848441243171692, "logits/rejected": -1.7691915035247803, "logps/chosen": -627.5263671875, "logps/rejected": -169.86163330078125, "loss": 0.5532, "rewards/accuracies": 1.0, "rewards/chosen": 0.28240805864334106, "rewards/margins": 0.3094378709793091, "rewards/rejected": -0.027029801160097122, "step": 128 }, { "epoch": 0.05325801896308251, "grad_norm": 10.277750968933105, "learning_rate": 9.767054908485857e-07, "logits/chosen": -1.869267463684082, "logits/rejected": -1.8065874576568604, "logps/chosen": -640.40283203125, "logps/rejected": -176.6936798095703, "loss": 0.5478, "rewards/accuracies": 1.0, "rewards/chosen": 0.28104326128959656, "rewards/margins": 0.32055872678756714, "rewards/rejected": -0.03951544687151909, "step": 132 }, { "epoch": 0.05487189832560016, "grad_norm": 9.711833000183105, "learning_rate": 9.750415973377703e-07, "logits/chosen": -1.8828225135803223, "logits/rejected": -1.8163931369781494, "logps/chosen": -746.4608764648438, "logps/rejected": -152.89015197753906, "loss": 0.4901, "rewards/accuracies": 1.0, "rewards/chosen": 0.3804107904434204, "rewards/margins": 0.4651769697666168, "rewards/rejected": -0.0847662016749382, "step": 136 }, { "epoch": 0.056485777688117814, "grad_norm": 9.27696418762207, "learning_rate": 9.73377703826955e-07, "logits/chosen": -1.8605766296386719, "logits/rejected": -1.8011505603790283, "logps/chosen": -635.22412109375, "logps/rejected": -169.20855712890625, "loss": 0.5061, "rewards/accuracies": 1.0, "rewards/chosen": 0.3754282295703888, "rewards/margins": 0.42136767506599426, "rewards/rejected": -0.04593944177031517, "step": 140 }, { "epoch": 0.05809965705063547, "grad_norm": 9.277071952819824, "learning_rate": 9.717138103161397e-07, "logits/chosen": -1.8763195276260376, "logits/rejected": -1.820366621017456, "logps/chosen": -595.7280883789062, "logps/rejected": -173.5071258544922, "loss": 0.4979, "rewards/accuracies": 1.0, "rewards/chosen": 0.40021756291389465, "rewards/margins": 0.4509323835372925, "rewards/rejected": -0.050714824348688126, "step": 144 }, { "epoch": 0.059713536413153114, "grad_norm": 8.457143783569336, "learning_rate": 9.700499168053245e-07, "logits/chosen": -1.8310701847076416, "logits/rejected": -1.7955127954483032, "logps/chosen": -629.056640625, "logps/rejected": -171.4107208251953, "loss": 0.4636, "rewards/accuracies": 1.0, "rewards/chosen": 0.49705734848976135, "rewards/margins": 0.532558798789978, "rewards/rejected": -0.03550143167376518, "step": 148 }, { "epoch": 0.06132741577567077, "grad_norm": 8.559431076049805, "learning_rate": 9.68386023294509e-07, "logits/chosen": -1.8524607419967651, "logits/rejected": -1.800382137298584, "logps/chosen": -712.405517578125, "logps/rejected": -173.54351806640625, "loss": 0.4695, "rewards/accuracies": 1.0, "rewards/chosen": 0.48576146364212036, "rewards/margins": 0.5224766135215759, "rewards/rejected": -0.03671512380242348, "step": 152 }, { "epoch": 0.06294129513818843, "grad_norm": 8.089521408081055, "learning_rate": 9.667221297836938e-07, "logits/chosen": -1.7879118919372559, "logits/rejected": -1.7554512023925781, "logps/chosen": -627.6424560546875, "logps/rejected": -158.81387329101562, "loss": 0.4253, "rewards/accuracies": 1.0, "rewards/chosen": 0.60613614320755, "rewards/margins": 0.6517617106437683, "rewards/rejected": -0.04562554508447647, "step": 156 }, { "epoch": 0.06455517450070607, "grad_norm": 9.429780006408691, "learning_rate": 9.650582362728784e-07, "logits/chosen": -1.803401231765747, "logits/rejected": -1.7865647077560425, "logps/chosen": -663.918701171875, "logps/rejected": -178.0006103515625, "loss": 0.4186, "rewards/accuracies": 1.0, "rewards/chosen": 0.5826452374458313, "rewards/margins": 0.6699901819229126, "rewards/rejected": -0.0873449370265007, "step": 160 }, { "epoch": 0.06616905386322372, "grad_norm": 7.796865463256836, "learning_rate": 9.633943427620632e-07, "logits/chosen": -1.836142897605896, "logits/rejected": -1.8172032833099365, "logps/chosen": -647.5742797851562, "logps/rejected": -166.88780212402344, "loss": 0.4117, "rewards/accuracies": 1.0, "rewards/chosen": 0.6031762957572937, "rewards/margins": 0.685782253742218, "rewards/rejected": -0.08260588347911835, "step": 164 }, { "epoch": 0.06778293322574137, "grad_norm": 7.6634745597839355, "learning_rate": 9.617304492512478e-07, "logits/chosen": -1.8437217473983765, "logits/rejected": -1.8233957290649414, "logps/chosen": -585.651611328125, "logps/rejected": -167.30047607421875, "loss": 0.3793, "rewards/accuracies": 1.0, "rewards/chosen": 0.6858773827552795, "rewards/margins": 0.7880266904830933, "rewards/rejected": -0.10214924812316895, "step": 168 }, { "epoch": 0.06939681258825903, "grad_norm": 6.964993476867676, "learning_rate": 9.600665557404326e-07, "logits/chosen": -1.8836581707000732, "logits/rejected": -1.837230920791626, "logps/chosen": -605.7140502929688, "logps/rejected": -185.67628479003906, "loss": 0.3676, "rewards/accuracies": 1.0, "rewards/chosen": 0.7289239764213562, "rewards/margins": 0.8237885236740112, "rewards/rejected": -0.09486451745033264, "step": 172 }, { "epoch": 0.07101069195077668, "grad_norm": 6.798006534576416, "learning_rate": 9.584026622296172e-07, "logits/chosen": -1.840924620628357, "logits/rejected": -1.8057985305786133, "logps/chosen": -613.834716796875, "logps/rejected": -162.14759826660156, "loss": 0.3291, "rewards/accuracies": 1.0, "rewards/chosen": 0.8210676312446594, "rewards/margins": 0.962322473526001, "rewards/rejected": -0.14125481247901917, "step": 176 }, { "epoch": 0.07262457131329433, "grad_norm": 6.633759021759033, "learning_rate": 9.56738768718802e-07, "logits/chosen": -1.8859058618545532, "logits/rejected": -1.8302955627441406, "logps/chosen": -604.2639770507812, "logps/rejected": -182.3938751220703, "loss": 0.3122, "rewards/accuracies": 1.0, "rewards/chosen": 0.9180707335472107, "rewards/margins": 1.0523040294647217, "rewards/rejected": -0.13423334062099457, "step": 180 }, { "epoch": 0.07423845067581199, "grad_norm": 6.229453086853027, "learning_rate": 9.550748752079866e-07, "logits/chosen": -1.8906497955322266, "logits/rejected": -1.8026591539382935, "logps/chosen": -596.3997192382812, "logps/rejected": -195.84957885742188, "loss": 0.3019, "rewards/accuracies": 1.0, "rewards/chosen": 0.9736951589584351, "rewards/margins": 1.079097867012024, "rewards/rejected": -0.10540261119604111, "step": 184 }, { "epoch": 0.07585233003832964, "grad_norm": 5.826483249664307, "learning_rate": 9.534109816971714e-07, "logits/chosen": -1.897779107093811, "logits/rejected": -1.7993252277374268, "logps/chosen": -696.0611572265625, "logps/rejected": -182.28521728515625, "loss": 0.2563, "rewards/accuracies": 1.0, "rewards/chosen": 1.1640598773956299, "rewards/margins": 1.2536303997039795, "rewards/rejected": -0.08957052230834961, "step": 188 }, { "epoch": 0.07746620940084728, "grad_norm": 5.956470966339111, "learning_rate": 9.517470881863561e-07, "logits/chosen": -1.8985055685043335, "logits/rejected": -1.8336517810821533, "logps/chosen": -639.809814453125, "logps/rejected": -187.35842895507812, "loss": 0.2587, "rewards/accuracies": 1.0, "rewards/chosen": 1.0798907279968262, "rewards/margins": 1.2375035285949707, "rewards/rejected": -0.1576129049062729, "step": 192 }, { "epoch": 0.07908008876336493, "grad_norm": 5.296114921569824, "learning_rate": 9.500831946755408e-07, "logits/chosen": -1.877612590789795, "logits/rejected": -1.787947416305542, "logps/chosen": -562.8646850585938, "logps/rejected": -171.16598510742188, "loss": 0.2519, "rewards/accuracies": 1.0, "rewards/chosen": 1.175575852394104, "rewards/margins": 1.2945899963378906, "rewards/rejected": -0.11901402473449707, "step": 196 }, { "epoch": 0.08069396812588259, "grad_norm": 4.823089599609375, "learning_rate": 9.484193011647255e-07, "logits/chosen": -1.8627557754516602, "logits/rejected": -1.8524413108825684, "logps/chosen": -564.0380249023438, "logps/rejected": -169.01828002929688, "loss": 0.2396, "rewards/accuracies": 1.0, "rewards/chosen": 1.1915247440338135, "rewards/margins": 1.3673070669174194, "rewards/rejected": -0.17578236758708954, "step": 200 }, { "epoch": 0.08230784748840024, "grad_norm": 4.708045482635498, "learning_rate": 9.467554076539102e-07, "logits/chosen": -1.8740744590759277, "logits/rejected": -1.825553297996521, "logps/chosen": -600.1619873046875, "logps/rejected": -170.49176025390625, "loss": 0.1914, "rewards/accuracies": 1.0, "rewards/chosen": 1.4285141229629517, "rewards/margins": 1.6031705141067505, "rewards/rejected": -0.17465639114379883, "step": 204 }, { "epoch": 0.0839217268509179, "grad_norm": 3.46640944480896, "learning_rate": 9.450915141430949e-07, "logits/chosen": -1.8692810535430908, "logits/rejected": -1.8688883781433105, "logps/chosen": -735.9435424804688, "logps/rejected": -209.58779907226562, "loss": 0.164, "rewards/accuracies": 1.0, "rewards/chosen": 1.5681211948394775, "rewards/margins": 1.8015025854110718, "rewards/rejected": -0.2333814799785614, "step": 208 }, { "epoch": 0.08553560621343555, "grad_norm": 3.262192726135254, "learning_rate": 9.434276206322796e-07, "logits/chosen": -1.836745023727417, "logits/rejected": -1.782804250717163, "logps/chosen": -639.732177734375, "logps/rejected": -182.7959442138672, "loss": 0.1668, "rewards/accuracies": 1.0, "rewards/chosen": 1.5732530355453491, "rewards/margins": 1.758592128753662, "rewards/rejected": -0.18533901870250702, "step": 212 }, { "epoch": 0.0871494855759532, "grad_norm": 2.9248430728912354, "learning_rate": 9.417637271214643e-07, "logits/chosen": -1.8648481369018555, "logits/rejected": -1.8195734024047852, "logps/chosen": -701.0437622070312, "logps/rejected": -190.5961151123047, "loss": 0.1551, "rewards/accuracies": 1.0, "rewards/chosen": 1.7235438823699951, "rewards/margins": 1.9199185371398926, "rewards/rejected": -0.19637469947338104, "step": 216 }, { "epoch": 0.08876336493847085, "grad_norm": 3.6723570823669434, "learning_rate": 9.40099833610649e-07, "logits/chosen": -1.881626844406128, "logits/rejected": -1.8475261926651, "logps/chosen": -639.2783203125, "logps/rejected": -179.34564208984375, "loss": 0.1345, "rewards/accuracies": 1.0, "rewards/chosen": 1.7791390419006348, "rewards/margins": 2.046505928039551, "rewards/rejected": -0.2673667073249817, "step": 220 }, { "epoch": 0.09037724430098851, "grad_norm": 2.784539222717285, "learning_rate": 9.384359400998337e-07, "logits/chosen": -1.8553237915039062, "logits/rejected": -1.812622308731079, "logps/chosen": -670.880859375, "logps/rejected": -166.1134796142578, "loss": 0.0965, "rewards/accuracies": 1.0, "rewards/chosen": 2.03114652633667, "rewards/margins": 2.3463332653045654, "rewards/rejected": -0.315187007188797, "step": 224 }, { "epoch": 0.09199112366350615, "grad_norm": 2.734344244003296, "learning_rate": 9.367720465890182e-07, "logits/chosen": -1.919129729270935, "logits/rejected": -1.8518396615982056, "logps/chosen": -606.1495361328125, "logps/rejected": -186.41476440429688, "loss": 0.1182, "rewards/accuracies": 1.0, "rewards/chosen": 1.8963017463684082, "rewards/margins": 2.242725133895874, "rewards/rejected": -0.3464234471321106, "step": 228 }, { "epoch": 0.0936050030260238, "grad_norm": 2.642324924468994, "learning_rate": 9.351081530782029e-07, "logits/chosen": -1.9058830738067627, "logits/rejected": -1.8435331583023071, "logps/chosen": -620.9544067382812, "logps/rejected": -170.3316650390625, "loss": 0.0915, "rewards/accuracies": 1.0, "rewards/chosen": 2.1819722652435303, "rewards/margins": 2.5006728172302246, "rewards/rejected": -0.31870052218437195, "step": 232 }, { "epoch": 0.09521888238854145, "grad_norm": 2.5412051677703857, "learning_rate": 9.334442595673876e-07, "logits/chosen": -1.9024293422698975, "logits/rejected": -1.8309537172317505, "logps/chosen": -492.47174072265625, "logps/rejected": -174.3946075439453, "loss": 0.1114, "rewards/accuracies": 1.0, "rewards/chosen": 1.8983821868896484, "rewards/margins": 2.2451133728027344, "rewards/rejected": -0.3467312753200531, "step": 236 }, { "epoch": 0.09683276175105911, "grad_norm": 2.115011215209961, "learning_rate": 9.317803660565723e-07, "logits/chosen": -1.8626179695129395, "logits/rejected": -1.8234471082687378, "logps/chosen": -634.0989379882812, "logps/rejected": -192.6904754638672, "loss": 0.0914, "rewards/accuracies": 1.0, "rewards/chosen": 2.2067978382110596, "rewards/margins": 2.6560347080230713, "rewards/rejected": -0.4492369294166565, "step": 240 }, { "epoch": 0.09844664111357676, "grad_norm": 1.4107238054275513, "learning_rate": 9.30116472545757e-07, "logits/chosen": -1.8785319328308105, "logits/rejected": -1.8042782545089722, "logps/chosen": -559.055908203125, "logps/rejected": -162.2275848388672, "loss": 0.0787, "rewards/accuracies": 1.0, "rewards/chosen": 2.3704121112823486, "rewards/margins": 2.7206814289093018, "rewards/rejected": -0.35026925802230835, "step": 244 }, { "epoch": 0.10006052047609441, "grad_norm": 1.0951142311096191, "learning_rate": 9.284525790349417e-07, "logits/chosen": -1.9208076000213623, "logits/rejected": -1.8807799816131592, "logps/chosen": -617.2267456054688, "logps/rejected": -184.17849731445312, "loss": 0.0618, "rewards/accuracies": 1.0, "rewards/chosen": 2.632871150970459, "rewards/margins": 3.0540738105773926, "rewards/rejected": -0.4212028384208679, "step": 248 }, { "epoch": 0.10167439983861207, "grad_norm": 1.7120704650878906, "learning_rate": 9.267886855241264e-07, "logits/chosen": -1.8855781555175781, "logits/rejected": -1.849815011024475, "logps/chosen": -694.651611328125, "logps/rejected": -186.77923583984375, "loss": 0.0476, "rewards/accuracies": 1.0, "rewards/chosen": 2.81239652633667, "rewards/margins": 3.2596933841705322, "rewards/rejected": -0.44729721546173096, "step": 252 }, { "epoch": 0.10328827920112972, "grad_norm": 1.123317837715149, "learning_rate": 9.251247920133111e-07, "logits/chosen": -1.8879116773605347, "logits/rejected": -1.8704044818878174, "logps/chosen": -625.6282958984375, "logps/rejected": -182.09402465820312, "loss": 0.0471, "rewards/accuracies": 1.0, "rewards/chosen": 2.90934681892395, "rewards/margins": 3.330071210861206, "rewards/rejected": -0.42072463035583496, "step": 256 }, { "epoch": 0.10490215856364737, "grad_norm": 0.9969452619552612, "learning_rate": 9.234608985024958e-07, "logits/chosen": -1.957910180091858, "logits/rejected": -1.872218370437622, "logps/chosen": -591.1821899414062, "logps/rejected": -185.39501953125, "loss": 0.0451, "rewards/accuracies": 1.0, "rewards/chosen": 2.797227144241333, "rewards/margins": 3.4245991706848145, "rewards/rejected": -0.6273719668388367, "step": 260 }, { "epoch": 0.10651603792616501, "grad_norm": 0.9096449613571167, "learning_rate": 9.217970049916805e-07, "logits/chosen": -1.826223373413086, "logits/rejected": -1.798948884010315, "logps/chosen": -697.7139282226562, "logps/rejected": -170.2051544189453, "loss": 0.0309, "rewards/accuracies": 1.0, "rewards/chosen": 3.347254753112793, "rewards/margins": 3.7956621646881104, "rewards/rejected": -0.44840753078460693, "step": 264 }, { "epoch": 0.10812991728868267, "grad_norm": 0.7581660151481628, "learning_rate": 9.201331114808652e-07, "logits/chosen": -1.8509432077407837, "logits/rejected": -1.8351494073867798, "logps/chosen": -600.7438354492188, "logps/rejected": -170.31341552734375, "loss": 0.0336, "rewards/accuracies": 1.0, "rewards/chosen": 3.1054396629333496, "rewards/margins": 3.6346957683563232, "rewards/rejected": -0.529256284236908, "step": 268 }, { "epoch": 0.10974379665120032, "grad_norm": 0.7356258630752563, "learning_rate": 9.184692179700499e-07, "logits/chosen": -1.8772118091583252, "logits/rejected": -1.8435137271881104, "logps/chosen": -639.7233276367188, "logps/rejected": -176.55210876464844, "loss": 0.034, "rewards/accuracies": 1.0, "rewards/chosen": 3.0785694122314453, "rewards/margins": 3.6540303230285645, "rewards/rejected": -0.5754609107971191, "step": 272 }, { "epoch": 0.11135767601371797, "grad_norm": 1.016947627067566, "learning_rate": 9.168053244592346e-07, "logits/chosen": -1.9088422060012817, "logits/rejected": -1.8874526023864746, "logps/chosen": -601.9957275390625, "logps/rejected": -203.20567321777344, "loss": 0.0332, "rewards/accuracies": 1.0, "rewards/chosen": 2.9845173358917236, "rewards/margins": 3.608407974243164, "rewards/rejected": -0.6238910555839539, "step": 276 }, { "epoch": 0.11297155537623563, "grad_norm": 0.2594591975212097, "learning_rate": 9.151414309484193e-07, "logits/chosen": -1.9489314556121826, "logits/rejected": -1.8656785488128662, "logps/chosen": -660.7191772460938, "logps/rejected": -212.38247680664062, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": 3.82008695602417, "rewards/margins": 4.5188188552856445, "rewards/rejected": -0.6987316608428955, "step": 280 }, { "epoch": 0.11458543473875328, "grad_norm": 0.5364330410957336, "learning_rate": 9.13477537437604e-07, "logits/chosen": -1.9456171989440918, "logits/rejected": -1.9060781002044678, "logps/chosen": -546.7889404296875, "logps/rejected": -181.5035400390625, "loss": 0.031, "rewards/accuracies": 1.0, "rewards/chosen": 3.2177364826202393, "rewards/margins": 3.8707311153411865, "rewards/rejected": -0.6529947519302368, "step": 284 }, { "epoch": 0.11619931410127093, "grad_norm": 0.4348556101322174, "learning_rate": 9.118136439267887e-07, "logits/chosen": -1.9463292360305786, "logits/rejected": -1.8484570980072021, "logps/chosen": -604.8233642578125, "logps/rejected": -174.00482177734375, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": 3.929997682571411, "rewards/margins": 4.570127487182617, "rewards/rejected": -0.6401302814483643, "step": 288 }, { "epoch": 0.11781319346378859, "grad_norm": 0.6889119148254395, "learning_rate": 9.101497504159734e-07, "logits/chosen": -1.8386610746383667, "logits/rejected": -1.8709819316864014, "logps/chosen": -754.1253051757812, "logps/rejected": -164.28073120117188, "loss": 0.0202, "rewards/accuracies": 1.0, "rewards/chosen": 4.1909685134887695, "rewards/margins": 4.821207046508789, "rewards/rejected": -0.6302385330200195, "step": 292 }, { "epoch": 0.11942707282630623, "grad_norm": 0.9312723875045776, "learning_rate": 9.08485856905158e-07, "logits/chosen": -1.9304311275482178, "logits/rejected": -1.877934455871582, "logps/chosen": -663.476318359375, "logps/rejected": -178.3296661376953, "loss": 0.022, "rewards/accuracies": 1.0, "rewards/chosen": 3.8921289443969727, "rewards/margins": 4.508294105529785, "rewards/rejected": -0.6161651611328125, "step": 296 }, { "epoch": 0.12104095218882388, "grad_norm": 0.45145049691200256, "learning_rate": 9.068219633943427e-07, "logits/chosen": -1.9461368322372437, "logits/rejected": -1.9137922525405884, "logps/chosen": -519.5482788085938, "logps/rejected": -171.5550994873047, "loss": 0.0243, "rewards/accuracies": 1.0, "rewards/chosen": 3.5039620399475098, "rewards/margins": 4.2493672370910645, "rewards/rejected": -0.7454049587249756, "step": 300 }, { "epoch": 0.12265483155134153, "grad_norm": 0.34256625175476074, "learning_rate": 9.051580698835274e-07, "logits/chosen": -1.9334783554077148, "logits/rejected": -1.866568684577942, "logps/chosen": -516.4837646484375, "logps/rejected": -166.8575439453125, "loss": 0.0229, "rewards/accuracies": 1.0, "rewards/chosen": 3.3952345848083496, "rewards/margins": 4.189650058746338, "rewards/rejected": -0.7944154739379883, "step": 304 }, { "epoch": 0.12426871091385919, "grad_norm": 0.19535161554813385, "learning_rate": 9.034941763727121e-07, "logits/chosen": -1.890864372253418, "logits/rejected": -1.9076076745986938, "logps/chosen": -604.8763427734375, "logps/rejected": -186.29266357421875, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": 4.159204006195068, "rewards/margins": 5.199936389923096, "rewards/rejected": -1.0407326221466064, "step": 308 }, { "epoch": 0.12588259027637685, "grad_norm": 0.46754637360572815, "learning_rate": 9.018302828618967e-07, "logits/chosen": -1.8786312341690063, "logits/rejected": -1.894760012626648, "logps/chosen": -679.2012939453125, "logps/rejected": -193.29876708984375, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": 4.544890403747559, "rewards/margins": 5.467522621154785, "rewards/rejected": -0.9226320385932922, "step": 312 }, { "epoch": 0.1274964696388945, "grad_norm": 0.6174956560134888, "learning_rate": 9.001663893510814e-07, "logits/chosen": -1.886858582496643, "logits/rejected": -1.8386237621307373, "logps/chosen": -570.9767456054688, "logps/rejected": -189.11257934570312, "loss": 0.0199, "rewards/accuracies": 1.0, "rewards/chosen": 4.197934150695801, "rewards/margins": 4.918734073638916, "rewards/rejected": -0.7208000421524048, "step": 316 }, { "epoch": 0.12911034900141213, "grad_norm": 0.21690328419208527, "learning_rate": 8.985024958402661e-07, "logits/chosen": -1.8807096481323242, "logits/rejected": -1.8404361009597778, "logps/chosen": -631.512939453125, "logps/rejected": -202.00491333007812, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": 4.185035705566406, "rewards/margins": 5.164657115936279, "rewards/rejected": -0.9796210527420044, "step": 320 }, { "epoch": 0.1307242283639298, "grad_norm": 0.27453818917274475, "learning_rate": 8.968386023294508e-07, "logits/chosen": -1.9047508239746094, "logits/rejected": -1.8355705738067627, "logps/chosen": -634.4239501953125, "logps/rejected": -171.60614013671875, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": 4.566464424133301, "rewards/margins": 5.428445339202881, "rewards/rejected": -0.8619803786277771, "step": 324 }, { "epoch": 0.13233810772644744, "grad_norm": 0.12078557163476944, "learning_rate": 8.951747088186355e-07, "logits/chosen": -1.9390285015106201, "logits/rejected": -1.9311224222183228, "logps/chosen": -535.9077758789062, "logps/rejected": -177.8657684326172, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": 4.048047065734863, "rewards/margins": 5.055552959442139, "rewards/rejected": -1.0075058937072754, "step": 328 }, { "epoch": 0.1339519870889651, "grad_norm": 0.2596322298049927, "learning_rate": 8.935108153078202e-07, "logits/chosen": -1.926051378250122, "logits/rejected": -1.886147379875183, "logps/chosen": -552.9322509765625, "logps/rejected": -185.4685821533203, "loss": 0.0179, "rewards/accuracies": 1.0, "rewards/chosen": 3.8754191398620605, "rewards/margins": 4.825630187988281, "rewards/rejected": -0.9502105116844177, "step": 332 }, { "epoch": 0.13556586645148275, "grad_norm": 0.3235815763473511, "learning_rate": 8.918469217970049e-07, "logits/chosen": -1.904013991355896, "logits/rejected": -1.8691765069961548, "logps/chosen": -660.4971313476562, "logps/rejected": -188.06716918945312, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": 4.380438327789307, "rewards/margins": 5.319826602935791, "rewards/rejected": -0.9393880367279053, "step": 336 }, { "epoch": 0.13717974581400041, "grad_norm": 0.20612092316150665, "learning_rate": 8.901830282861896e-07, "logits/chosen": -1.9383623600006104, "logits/rejected": -1.8907164335250854, "logps/chosen": -613.3983764648438, "logps/rejected": -190.37286376953125, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": 4.4248199462890625, "rewards/margins": 5.254948616027832, "rewards/rejected": -0.83012855052948, "step": 340 }, { "epoch": 0.13879362517651805, "grad_norm": 0.07328493893146515, "learning_rate": 8.885191347753743e-07, "logits/chosen": -1.8945516347885132, "logits/rejected": -1.938434362411499, "logps/chosen": -520.166748046875, "logps/rejected": -183.89401245117188, "loss": 0.0193, "rewards/accuracies": 1.0, "rewards/chosen": 4.050896167755127, "rewards/margins": 5.034641265869141, "rewards/rejected": -0.9837454557418823, "step": 344 }, { "epoch": 0.1404075045390357, "grad_norm": 0.22947318851947784, "learning_rate": 8.86855241264559e-07, "logits/chosen": -1.9118045568466187, "logits/rejected": -1.886781096458435, "logps/chosen": -617.0621337890625, "logps/rejected": -182.9320526123047, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": 4.453612327575684, "rewards/margins": 5.405158996582031, "rewards/rejected": -0.9515475630760193, "step": 348 }, { "epoch": 0.14202138390155336, "grad_norm": 0.2297365665435791, "learning_rate": 8.851913477537437e-07, "logits/chosen": -1.9014477729797363, "logits/rejected": -1.9050116539001465, "logps/chosen": -710.4368896484375, "logps/rejected": -184.95883178710938, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": 5.195467948913574, "rewards/margins": 6.1372294425964355, "rewards/rejected": -0.9417620897293091, "step": 352 }, { "epoch": 0.143635263264071, "grad_norm": 0.12087662518024445, "learning_rate": 8.835274542429284e-07, "logits/chosen": -1.9141509532928467, "logits/rejected": -1.8687645196914673, "logps/chosen": -583.0523681640625, "logps/rejected": -192.8323974609375, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": 4.609492301940918, "rewards/margins": 5.784029483795166, "rewards/rejected": -1.174537181854248, "step": 356 }, { "epoch": 0.14524914262658867, "grad_norm": 0.40845686197280884, "learning_rate": 8.818635607321131e-07, "logits/chosen": -1.9192259311676025, "logits/rejected": -1.8843393325805664, "logps/chosen": -647.46826171875, "logps/rejected": -187.15878295898438, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 4.887372970581055, "rewards/margins": 5.861534118652344, "rewards/rejected": -0.9741606116294861, "step": 360 }, { "epoch": 0.1468630219891063, "grad_norm": 0.21726560592651367, "learning_rate": 8.801996672212978e-07, "logits/chosen": -1.940704107284546, "logits/rejected": -1.8792184591293335, "logps/chosen": -541.5570068359375, "logps/rejected": -175.240478515625, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": 4.699819087982178, "rewards/margins": 5.520890235900879, "rewards/rejected": -0.821071207523346, "step": 364 }, { "epoch": 0.14847690135162397, "grad_norm": 0.18219402432441711, "learning_rate": 8.785357737104824e-07, "logits/chosen": -1.9364285469055176, "logits/rejected": -1.8981354236602783, "logps/chosen": -709.27392578125, "logps/rejected": -177.76229858398438, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 5.27650785446167, "rewards/margins": 6.369976043701172, "rewards/rejected": -1.0934683084487915, "step": 368 }, { "epoch": 0.15009078071414161, "grad_norm": 0.08504246175289154, "learning_rate": 8.768718801996671e-07, "logits/chosen": -1.890986442565918, "logits/rejected": -1.8894208669662476, "logps/chosen": -561.3111572265625, "logps/rejected": -172.63150024414062, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": 4.322070121765137, "rewards/margins": 5.302407264709473, "rewards/rejected": -0.9803376197814941, "step": 372 }, { "epoch": 0.15170466007665928, "grad_norm": 0.07193358242511749, "learning_rate": 8.752079866888518e-07, "logits/chosen": -1.9256517887115479, "logits/rejected": -1.9270548820495605, "logps/chosen": -539.645263671875, "logps/rejected": -182.17649841308594, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 4.666395664215088, "rewards/margins": 5.78600549697876, "rewards/rejected": -1.119610071182251, "step": 376 }, { "epoch": 0.15331853943917692, "grad_norm": 0.1710950881242752, "learning_rate": 8.735440931780365e-07, "logits/chosen": -1.8373366594314575, "logits/rejected": -1.8480095863342285, "logps/chosen": -739.8154907226562, "logps/rejected": -183.90869140625, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 5.701541423797607, "rewards/margins": 6.78715181350708, "rewards/rejected": -1.0856106281280518, "step": 380 }, { "epoch": 0.15493241880169456, "grad_norm": 0.24807879328727722, "learning_rate": 8.718801996672212e-07, "logits/chosen": -1.9742302894592285, "logits/rejected": -1.9238582849502563, "logps/chosen": -561.7692260742188, "logps/rejected": -176.57542419433594, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": 4.566779613494873, "rewards/margins": 5.618612289428711, "rewards/rejected": -1.051832914352417, "step": 384 }, { "epoch": 0.15654629816421223, "grad_norm": 0.17327402532100677, "learning_rate": 8.702163061564059e-07, "logits/chosen": -1.9250305891036987, "logits/rejected": -1.9304999113082886, "logps/chosen": -651.58349609375, "logps/rejected": -185.38400268554688, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 5.240414619445801, "rewards/margins": 6.244239807128906, "rewards/rejected": -1.0038256645202637, "step": 388 }, { "epoch": 0.15816017752672987, "grad_norm": 0.04345053434371948, "learning_rate": 8.685524126455906e-07, "logits/chosen": -1.9195263385772705, "logits/rejected": -1.9161914587020874, "logps/chosen": -614.3543090820312, "logps/rejected": -193.79336547851562, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": 4.829865455627441, "rewards/margins": 6.060251235961914, "rewards/rejected": -1.2303862571716309, "step": 392 }, { "epoch": 0.15977405688924753, "grad_norm": 0.07986317574977875, "learning_rate": 8.668885191347753e-07, "logits/chosen": -1.9372272491455078, "logits/rejected": -1.9221218824386597, "logps/chosen": -657.3023071289062, "logps/rejected": -201.88780212402344, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 5.119308948516846, "rewards/margins": 6.300180435180664, "rewards/rejected": -1.1808708906173706, "step": 396 }, { "epoch": 0.16138793625176517, "grad_norm": 0.5749224424362183, "learning_rate": 8.6522462562396e-07, "logits/chosen": -1.9121482372283936, "logits/rejected": -1.8626840114593506, "logps/chosen": -607.6536254882812, "logps/rejected": -181.15809631347656, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": 5.051713466644287, "rewards/margins": 5.930785655975342, "rewards/rejected": -0.8790733218193054, "step": 400 }, { "epoch": 0.16300181561428284, "grad_norm": 0.16225050389766693, "learning_rate": 8.635607321131447e-07, "logits/chosen": -1.922587275505066, "logits/rejected": -1.8392772674560547, "logps/chosen": -594.6026611328125, "logps/rejected": -190.15066528320312, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 5.030785083770752, "rewards/margins": 6.235131740570068, "rewards/rejected": -1.2043468952178955, "step": 404 }, { "epoch": 0.16461569497680048, "grad_norm": 0.31736910343170166, "learning_rate": 8.618968386023294e-07, "logits/chosen": -1.938856840133667, "logits/rejected": -1.874018669128418, "logps/chosen": -629.7726440429688, "logps/rejected": -191.27476501464844, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 5.161418437957764, "rewards/margins": 6.157814025878906, "rewards/rejected": -0.9963955283164978, "step": 408 }, { "epoch": 0.16622957433931815, "grad_norm": 0.078969806432724, "learning_rate": 8.602329450915141e-07, "logits/chosen": -1.9578258991241455, "logits/rejected": -1.904362678527832, "logps/chosen": -557.5865478515625, "logps/rejected": -165.2068328857422, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 5.066812992095947, "rewards/margins": 6.078699588775635, "rewards/rejected": -1.011886715888977, "step": 412 }, { "epoch": 0.1678434537018358, "grad_norm": 0.10809255391359329, "learning_rate": 8.585690515806988e-07, "logits/chosen": -1.9391454458236694, "logits/rejected": -1.8803722858428955, "logps/chosen": -552.7350463867188, "logps/rejected": -194.1470184326172, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 5.143176078796387, "rewards/margins": 6.324812889099121, "rewards/rejected": -1.1816368103027344, "step": 416 }, { "epoch": 0.16945733306435343, "grad_norm": 0.14840175211429596, "learning_rate": 8.569051580698835e-07, "logits/chosen": -1.944985032081604, "logits/rejected": -1.850041389465332, "logps/chosen": -537.119140625, "logps/rejected": -186.03939819335938, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 5.215565204620361, "rewards/margins": 6.318350315093994, "rewards/rejected": -1.102785587310791, "step": 420 }, { "epoch": 0.1710712124268711, "grad_norm": 0.16172438859939575, "learning_rate": 8.552412645590682e-07, "logits/chosen": -1.9151146411895752, "logits/rejected": -1.8890461921691895, "logps/chosen": -577.1220703125, "logps/rejected": -182.85760498046875, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 5.291979789733887, "rewards/margins": 6.4571123123168945, "rewards/rejected": -1.1651322841644287, "step": 424 }, { "epoch": 0.17268509178938873, "grad_norm": 0.16055883467197418, "learning_rate": 8.535773710482529e-07, "logits/chosen": -1.8585450649261475, "logits/rejected": -1.8200111389160156, "logps/chosen": -647.0691528320312, "logps/rejected": -165.2606964111328, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 5.728975772857666, "rewards/margins": 6.900187969207764, "rewards/rejected": -1.1712125539779663, "step": 428 }, { "epoch": 0.1742989711519064, "grad_norm": 0.008601455017924309, "learning_rate": 8.519134775374376e-07, "logits/chosen": -1.8973114490509033, "logits/rejected": -1.8693630695343018, "logps/chosen": -694.0446166992188, "logps/rejected": -203.99891662597656, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 6.087047576904297, "rewards/margins": 7.384302139282227, "rewards/rejected": -1.2972549200057983, "step": 432 }, { "epoch": 0.17591285051442404, "grad_norm": 0.06516802310943604, "learning_rate": 8.502495840266223e-07, "logits/chosen": -1.9302247762680054, "logits/rejected": -1.9261226654052734, "logps/chosen": -723.577880859375, "logps/rejected": -203.6065216064453, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 5.906829357147217, "rewards/margins": 7.456884860992432, "rewards/rejected": -1.550054907798767, "step": 436 }, { "epoch": 0.1775267298769417, "grad_norm": 0.061921585351228714, "learning_rate": 8.485856905158069e-07, "logits/chosen": -1.8932920694351196, "logits/rejected": -1.8569614887237549, "logps/chosen": -820.576416015625, "logps/rejected": -197.39053344726562, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 5.813792705535889, "rewards/margins": 6.803739070892334, "rewards/rejected": -0.9899464249610901, "step": 440 }, { "epoch": 0.17914060923945935, "grad_norm": 0.13375072181224823, "learning_rate": 8.469217970049916e-07, "logits/chosen": -1.923279881477356, "logits/rejected": -1.9171723127365112, "logps/chosen": -590.0340576171875, "logps/rejected": -199.8806915283203, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 5.576312065124512, "rewards/margins": 6.61418342590332, "rewards/rejected": -1.0378706455230713, "step": 444 }, { "epoch": 0.18075448860197701, "grad_norm": 0.11292973905801773, "learning_rate": 8.452579034941763e-07, "logits/chosen": -1.9549925327301025, "logits/rejected": -1.9018619060516357, "logps/chosen": -513.802978515625, "logps/rejected": -187.83737182617188, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": 4.515269756317139, "rewards/margins": 5.593171119689941, "rewards/rejected": -1.0779012441635132, "step": 448 }, { "epoch": 0.18236836796449465, "grad_norm": 0.15013451874256134, "learning_rate": 8.43594009983361e-07, "logits/chosen": -1.9650650024414062, "logits/rejected": -1.9657684564590454, "logps/chosen": -570.8408203125, "logps/rejected": -181.2580108642578, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 5.164285659790039, "rewards/margins": 6.4370317459106445, "rewards/rejected": -1.2727466821670532, "step": 452 }, { "epoch": 0.1839822473270123, "grad_norm": 0.31928369402885437, "learning_rate": 8.419301164725457e-07, "logits/chosen": -1.9018311500549316, "logits/rejected": -1.8793752193450928, "logps/chosen": -619.4147338867188, "logps/rejected": -198.4810791015625, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": 4.999479293823242, "rewards/margins": 6.064955711364746, "rewards/rejected": -1.065476655960083, "step": 456 }, { "epoch": 0.18559612668952996, "grad_norm": 0.09624543786048889, "learning_rate": 8.402662229617304e-07, "logits/chosen": -1.9278525114059448, "logits/rejected": -1.8871760368347168, "logps/chosen": -598.8556518554688, "logps/rejected": -194.28054809570312, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 5.6376776695251465, "rewards/margins": 6.7787766456604, "rewards/rejected": -1.141099214553833, "step": 460 }, { "epoch": 0.1872100060520476, "grad_norm": 0.05709528550505638, "learning_rate": 8.386023294509151e-07, "logits/chosen": -1.9166765213012695, "logits/rejected": -1.895799994468689, "logps/chosen": -507.5955810546875, "logps/rejected": -217.8443145751953, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 5.538414001464844, "rewards/margins": 6.912988662719727, "rewards/rejected": -1.3745735883712769, "step": 464 }, { "epoch": 0.18882388541456527, "grad_norm": 0.026953477412462234, "learning_rate": 8.369384359400998e-07, "logits/chosen": -1.9014549255371094, "logits/rejected": -1.9116346836090088, "logps/chosen": -724.1201171875, "logps/rejected": -187.603515625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 6.304912567138672, "rewards/margins": 7.6815996170043945, "rewards/rejected": -1.3766865730285645, "step": 468 }, { "epoch": 0.1904377647770829, "grad_norm": 0.07903099805116653, "learning_rate": 8.352745424292845e-07, "logits/chosen": -1.8818055391311646, "logits/rejected": -1.8560270071029663, "logps/chosen": -577.0653076171875, "logps/rejected": -178.42153930664062, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 5.342180252075195, "rewards/margins": 6.541779518127441, "rewards/rejected": -1.1995996236801147, "step": 472 }, { "epoch": 0.19205164413960057, "grad_norm": 0.21611925959587097, "learning_rate": 8.336106489184692e-07, "logits/chosen": -1.9659340381622314, "logits/rejected": -1.8743841648101807, "logps/chosen": -497.7950744628906, "logps/rejected": -183.23516845703125, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": 4.80767822265625, "rewards/margins": 6.104327201843262, "rewards/rejected": -1.2966488599777222, "step": 476 }, { "epoch": 0.19366552350211821, "grad_norm": 0.24866189062595367, "learning_rate": 8.319467554076539e-07, "logits/chosen": -1.908347249031067, "logits/rejected": -1.910021424293518, "logps/chosen": -607.9837646484375, "logps/rejected": -185.052001953125, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": 5.337136745452881, "rewards/margins": 6.493431091308594, "rewards/rejected": -1.1562941074371338, "step": 480 }, { "epoch": 0.19527940286463588, "grad_norm": 0.06139581650495529, "learning_rate": 8.302828618968386e-07, "logits/chosen": -1.9786298274993896, "logits/rejected": -1.9032433032989502, "logps/chosen": -587.313232421875, "logps/rejected": -190.5347137451172, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 5.532355308532715, "rewards/margins": 7.032785415649414, "rewards/rejected": -1.5004299879074097, "step": 484 }, { "epoch": 0.19689328222715352, "grad_norm": 0.09836946427822113, "learning_rate": 8.286189683860233e-07, "logits/chosen": -1.920711874961853, "logits/rejected": -1.8608055114746094, "logps/chosen": -597.4845581054688, "logps/rejected": -191.59690856933594, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 5.381219863891602, "rewards/margins": 6.63675594329834, "rewards/rejected": -1.2555367946624756, "step": 488 }, { "epoch": 0.19850716158967116, "grad_norm": 0.02149817906320095, "learning_rate": 8.26955074875208e-07, "logits/chosen": -1.9471436738967896, "logits/rejected": -1.907917857170105, "logps/chosen": -589.512451171875, "logps/rejected": -175.90396118164062, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 5.453587532043457, "rewards/margins": 6.756851673126221, "rewards/rejected": -1.3032641410827637, "step": 492 }, { "epoch": 0.20012104095218883, "grad_norm": 0.01611105166375637, "learning_rate": 8.252911813643927e-07, "logits/chosen": -1.9373788833618164, "logits/rejected": -1.8780406713485718, "logps/chosen": -593.4351196289062, "logps/rejected": -174.8452911376953, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 5.505245208740234, "rewards/margins": 6.7562456130981445, "rewards/rejected": -1.251001000404358, "step": 496 }, { "epoch": 0.20173492031470647, "grad_norm": 0.07755707204341888, "learning_rate": 8.236272878535774e-07, "logits/chosen": -1.9053893089294434, "logits/rejected": -1.8700766563415527, "logps/chosen": -639.397705078125, "logps/rejected": -181.3414764404297, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 5.76812744140625, "rewards/margins": 7.175937652587891, "rewards/rejected": -1.4078105688095093, "step": 500 }, { "epoch": 0.20334879967722413, "grad_norm": 0.13203193247318268, "learning_rate": 8.219633943427621e-07, "logits/chosen": -1.9028880596160889, "logits/rejected": -1.9365698099136353, "logps/chosen": -634.5409545898438, "logps/rejected": -192.31919860839844, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 6.13217306137085, "rewards/margins": 7.2719268798828125, "rewards/rejected": -1.139754056930542, "step": 504 }, { "epoch": 0.20496267903974177, "grad_norm": 0.08185280114412308, "learning_rate": 8.202995008319468e-07, "logits/chosen": -1.9227089881896973, "logits/rejected": -1.8932616710662842, "logps/chosen": -568.471923828125, "logps/rejected": -177.1082305908203, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 5.302500247955322, "rewards/margins": 6.548261642456055, "rewards/rejected": -1.2457613945007324, "step": 508 }, { "epoch": 0.20657655840225944, "grad_norm": 0.042652033269405365, "learning_rate": 8.186356073211314e-07, "logits/chosen": -1.9280370473861694, "logits/rejected": -1.9254779815673828, "logps/chosen": -659.2647094726562, "logps/rejected": -196.09413146972656, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 6.2763671875, "rewards/margins": 7.652406215667725, "rewards/rejected": -1.3760390281677246, "step": 512 }, { "epoch": 0.20819043776477708, "grad_norm": 0.10625182092189789, "learning_rate": 8.169717138103161e-07, "logits/chosen": -2.0046403408050537, "logits/rejected": -1.9275269508361816, "logps/chosen": -592.036376953125, "logps/rejected": -176.73626708984375, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 5.397172927856445, "rewards/margins": 6.658295631408691, "rewards/rejected": -1.2611230611801147, "step": 516 }, { "epoch": 0.20980431712729475, "grad_norm": 0.026842469349503517, "learning_rate": 8.153078202995008e-07, "logits/chosen": -1.916214942932129, "logits/rejected": -1.858412742614746, "logps/chosen": -614.1990356445312, "logps/rejected": -188.16221618652344, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 5.529000759124756, "rewards/margins": 6.793569564819336, "rewards/rejected": -1.2645690441131592, "step": 520 }, { "epoch": 0.2114181964898124, "grad_norm": 0.2691098153591156, "learning_rate": 8.136439267886855e-07, "logits/chosen": -1.9061782360076904, "logits/rejected": -1.9004026651382446, "logps/chosen": -668.024169921875, "logps/rejected": -188.3176727294922, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 5.824760913848877, "rewards/margins": 7.1862616539001465, "rewards/rejected": -1.3615007400512695, "step": 524 }, { "epoch": 0.21303207585233003, "grad_norm": 0.04816541448235512, "learning_rate": 8.119800332778701e-07, "logits/chosen": -1.9195411205291748, "logits/rejected": -1.8879984617233276, "logps/chosen": -520.4843139648438, "logps/rejected": -196.556396484375, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 5.542881965637207, "rewards/margins": 6.9178924560546875, "rewards/rejected": -1.3750112056732178, "step": 528 }, { "epoch": 0.2146459552148477, "grad_norm": 0.013207744807004929, "learning_rate": 8.103161397670548e-07, "logits/chosen": -1.8960598707199097, "logits/rejected": -1.8628833293914795, "logps/chosen": -575.0956420898438, "logps/rejected": -180.6763153076172, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 5.80772590637207, "rewards/margins": 7.034721374511719, "rewards/rejected": -1.2269948720932007, "step": 532 }, { "epoch": 0.21625983457736533, "grad_norm": 0.04492935910820961, "learning_rate": 8.086522462562395e-07, "logits/chosen": -1.9279592037200928, "logits/rejected": -1.924379587173462, "logps/chosen": -650.9527587890625, "logps/rejected": -187.57369995117188, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 5.90622615814209, "rewards/margins": 7.342747211456299, "rewards/rejected": -1.4365205764770508, "step": 536 }, { "epoch": 0.217873713939883, "grad_norm": 0.040289487689733505, "learning_rate": 8.069883527454242e-07, "logits/chosen": -1.9787566661834717, "logits/rejected": -1.936347484588623, "logps/chosen": -522.09619140625, "logps/rejected": -196.83053588867188, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 5.282732009887695, "rewards/margins": 6.8043389320373535, "rewards/rejected": -1.5216064453125, "step": 540 }, { "epoch": 0.21948759330240064, "grad_norm": 0.4773135781288147, "learning_rate": 8.053244592346089e-07, "logits/chosen": -1.9225316047668457, "logits/rejected": -1.8842490911483765, "logps/chosen": -567.26513671875, "logps/rejected": -179.46780395507812, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": 5.123950004577637, "rewards/margins": 6.719346523284912, "rewards/rejected": -1.5953962802886963, "step": 544 }, { "epoch": 0.2211014726649183, "grad_norm": 0.04165881499648094, "learning_rate": 8.036605657237936e-07, "logits/chosen": -1.9288203716278076, "logits/rejected": -1.9164494276046753, "logps/chosen": -631.2778930664062, "logps/rejected": -190.44622802734375, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 6.2517852783203125, "rewards/margins": 7.54589319229126, "rewards/rejected": -1.2941081523895264, "step": 548 }, { "epoch": 0.22271535202743595, "grad_norm": 0.044236715883016586, "learning_rate": 8.019966722129783e-07, "logits/chosen": -1.9591870307922363, "logits/rejected": -1.9170758724212646, "logps/chosen": -589.001953125, "logps/rejected": -194.56546020507812, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 5.632788181304932, "rewards/margins": 7.208251953125, "rewards/rejected": -1.5754637718200684, "step": 552 }, { "epoch": 0.2243292313899536, "grad_norm": 0.025635378435254097, "learning_rate": 8.00332778702163e-07, "logits/chosen": -1.9273452758789062, "logits/rejected": -1.9201503992080688, "logps/chosen": -559.1937866210938, "logps/rejected": -197.9849853515625, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 5.903825283050537, "rewards/margins": 7.564258098602295, "rewards/rejected": -1.6604321002960205, "step": 556 }, { "epoch": 0.22594311075247125, "grad_norm": 0.04669136926531792, "learning_rate": 7.986688851913477e-07, "logits/chosen": -1.9187164306640625, "logits/rejected": -1.883826732635498, "logps/chosen": -583.4932861328125, "logps/rejected": -171.30076599121094, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 5.956411361694336, "rewards/margins": 7.320271015167236, "rewards/rejected": -1.3638601303100586, "step": 560 }, { "epoch": 0.2275569901149889, "grad_norm": 0.011015149764716625, "learning_rate": 7.970049916805324e-07, "logits/chosen": -1.9214916229248047, "logits/rejected": -1.9365153312683105, "logps/chosen": -651.9737548828125, "logps/rejected": -194.82444763183594, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 6.266908645629883, "rewards/margins": 7.754495143890381, "rewards/rejected": -1.4875867366790771, "step": 564 }, { "epoch": 0.22917086947750656, "grad_norm": 0.044071681797504425, "learning_rate": 7.953410981697171e-07, "logits/chosen": -1.9330661296844482, "logits/rejected": -1.8891563415527344, "logps/chosen": -628.9828491210938, "logps/rejected": -170.7098388671875, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 6.38037633895874, "rewards/margins": 7.5281572341918945, "rewards/rejected": -1.1477806568145752, "step": 568 }, { "epoch": 0.2307847488400242, "grad_norm": 0.05808074027299881, "learning_rate": 7.936772046589018e-07, "logits/chosen": -1.9544706344604492, "logits/rejected": -1.9229828119277954, "logps/chosen": -596.4066772460938, "logps/rejected": -173.1801300048828, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 5.976884365081787, "rewards/margins": 7.6263604164123535, "rewards/rejected": -1.6494756937026978, "step": 572 }, { "epoch": 0.23239862820254187, "grad_norm": 0.0295318104326725, "learning_rate": 7.920133111480865e-07, "logits/chosen": -1.9220975637435913, "logits/rejected": -1.8671207427978516, "logps/chosen": -600.0115966796875, "logps/rejected": -198.57977294921875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 5.927572250366211, "rewards/margins": 7.321686267852783, "rewards/rejected": -1.3941144943237305, "step": 576 }, { "epoch": 0.2340125075650595, "grad_norm": 0.01197292935103178, "learning_rate": 7.903494176372711e-07, "logits/chosen": -1.9246530532836914, "logits/rejected": -1.8978956937789917, "logps/chosen": -631.7338256835938, "logps/rejected": -179.79612731933594, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 6.381148815155029, "rewards/margins": 7.694772720336914, "rewards/rejected": -1.313624620437622, "step": 580 }, { "epoch": 0.23562638692757717, "grad_norm": 0.0910879448056221, "learning_rate": 7.886855241264558e-07, "logits/chosen": -1.889186143875122, "logits/rejected": -1.882289171218872, "logps/chosen": -663.4276123046875, "logps/rejected": -177.9473419189453, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 6.188752174377441, "rewards/margins": 7.665579795837402, "rewards/rejected": -1.4768280982971191, "step": 584 }, { "epoch": 0.23724026629009481, "grad_norm": 0.014490382745862007, "learning_rate": 7.870216306156405e-07, "logits/chosen": -1.9420596361160278, "logits/rejected": -1.8983983993530273, "logps/chosen": -637.1312255859375, "logps/rejected": -192.50244140625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 6.303738117218018, "rewards/margins": 7.906347274780273, "rewards/rejected": -1.602608561515808, "step": 588 }, { "epoch": 0.23885414565261245, "grad_norm": 0.01364422868937254, "learning_rate": 7.853577371048252e-07, "logits/chosen": -1.882974624633789, "logits/rejected": -1.9001988172531128, "logps/chosen": -702.8563232421875, "logps/rejected": -172.81153869628906, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 6.6591997146606445, "rewards/margins": 8.047064781188965, "rewards/rejected": -1.387864589691162, "step": 592 }, { "epoch": 0.24046802501513012, "grad_norm": 0.3288860023021698, "learning_rate": 7.836938435940099e-07, "logits/chosen": -1.91611647605896, "logits/rejected": -1.9146919250488281, "logps/chosen": -530.7276000976562, "logps/rejected": -184.98712158203125, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": 5.885509490966797, "rewards/margins": 7.114204406738281, "rewards/rejected": -1.2286943197250366, "step": 596 }, { "epoch": 0.24208190437764776, "grad_norm": 0.061846181750297546, "learning_rate": 7.820299500831946e-07, "logits/chosen": -1.9260382652282715, "logits/rejected": -1.8482518196105957, "logps/chosen": -621.698486328125, "logps/rejected": -186.7850799560547, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 5.785602569580078, "rewards/margins": 7.0305609703063965, "rewards/rejected": -1.2449591159820557, "step": 600 }, { "epoch": 0.24369578374016543, "grad_norm": 0.033548664301633835, "learning_rate": 7.803660565723793e-07, "logits/chosen": -1.8988384008407593, "logits/rejected": -1.9235966205596924, "logps/chosen": -654.9081420898438, "logps/rejected": -178.92999267578125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 5.987019062042236, "rewards/margins": 7.520371913909912, "rewards/rejected": -1.5333529710769653, "step": 604 }, { "epoch": 0.24530966310268307, "grad_norm": 0.012734122574329376, "learning_rate": 7.78702163061564e-07, "logits/chosen": -1.953587532043457, "logits/rejected": -1.8957901000976562, "logps/chosen": -603.5957641601562, "logps/rejected": -180.30213928222656, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 6.374105930328369, "rewards/margins": 7.748699188232422, "rewards/rejected": -1.3745930194854736, "step": 608 }, { "epoch": 0.24692354246520073, "grad_norm": 0.12893182039260864, "learning_rate": 7.770382695507487e-07, "logits/chosen": -1.9323984384536743, "logits/rejected": -1.848400354385376, "logps/chosen": -572.578369140625, "logps/rejected": -188.12603759765625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 5.810790061950684, "rewards/margins": 7.325648784637451, "rewards/rejected": -1.5148584842681885, "step": 612 }, { "epoch": 0.24853742182771837, "grad_norm": 0.020289942622184753, "learning_rate": 7.753743760399334e-07, "logits/chosen": -1.8870551586151123, "logits/rejected": -1.927607536315918, "logps/chosen": -599.76220703125, "logps/rejected": -182.688720703125, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 5.919105052947998, "rewards/margins": 7.412703514099121, "rewards/rejected": -1.493598461151123, "step": 616 }, { "epoch": 0.25015130119023604, "grad_norm": 0.06981196999549866, "learning_rate": 7.737104825291181e-07, "logits/chosen": -1.9487760066986084, "logits/rejected": -1.923244595527649, "logps/chosen": -540.86767578125, "logps/rejected": -213.55990600585938, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 5.83527946472168, "rewards/margins": 7.412863731384277, "rewards/rejected": -1.5775840282440186, "step": 620 }, { "epoch": 0.2517651805527537, "grad_norm": 0.09039198607206345, "learning_rate": 7.720465890183028e-07, "logits/chosen": -1.9652656316757202, "logits/rejected": -1.9097367525100708, "logps/chosen": -505.0038146972656, "logps/rejected": -191.85302734375, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 5.208578586578369, "rewards/margins": 6.656178951263428, "rewards/rejected": -1.4476011991500854, "step": 624 }, { "epoch": 0.2533790599152713, "grad_norm": 0.0780641958117485, "learning_rate": 7.703826955074875e-07, "logits/chosen": -1.9327268600463867, "logits/rejected": -1.9126496315002441, "logps/chosen": -599.8588256835938, "logps/rejected": -205.3011474609375, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 6.28946590423584, "rewards/margins": 7.736734867095947, "rewards/rejected": -1.4472699165344238, "step": 628 }, { "epoch": 0.254992939277789, "grad_norm": 0.006758814677596092, "learning_rate": 7.687188019966722e-07, "logits/chosen": -1.9569164514541626, "logits/rejected": -1.9039442539215088, "logps/chosen": -523.7020263671875, "logps/rejected": -184.41009521484375, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 6.088662147521973, "rewards/margins": 7.364947319030762, "rewards/rejected": -1.276285171508789, "step": 632 }, { "epoch": 0.25660681864030666, "grad_norm": 0.14163704216480255, "learning_rate": 7.670549084858569e-07, "logits/chosen": -1.9547746181488037, "logits/rejected": -1.932267665863037, "logps/chosen": -611.2115478515625, "logps/rejected": -194.95074462890625, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 5.757643699645996, "rewards/margins": 7.174437522888184, "rewards/rejected": -1.4167944192886353, "step": 636 }, { "epoch": 0.25822069800282427, "grad_norm": 0.10928457230329514, "learning_rate": 7.653910149750416e-07, "logits/chosen": -1.970252275466919, "logits/rejected": -1.9532129764556885, "logps/chosen": -550.6874389648438, "logps/rejected": -180.5746307373047, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 5.962819576263428, "rewards/margins": 7.896392345428467, "rewards/rejected": -1.9335724115371704, "step": 640 }, { "epoch": 0.25983457736534193, "grad_norm": 0.2113751918077469, "learning_rate": 7.637271214642263e-07, "logits/chosen": -1.923952341079712, "logits/rejected": -1.8994972705841064, "logps/chosen": -655.6493530273438, "logps/rejected": -190.41004943847656, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 6.771247863769531, "rewards/margins": 8.07024097442627, "rewards/rejected": -1.2989928722381592, "step": 644 }, { "epoch": 0.2614484567278596, "grad_norm": 0.027803877368569374, "learning_rate": 7.62063227953411e-07, "logits/chosen": -1.9471144676208496, "logits/rejected": -1.9292399883270264, "logps/chosen": -519.6847534179688, "logps/rejected": -196.80715942382812, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 5.72599983215332, "rewards/margins": 7.127876281738281, "rewards/rejected": -1.4018771648406982, "step": 648 }, { "epoch": 0.26306233609037727, "grad_norm": 0.06113160029053688, "learning_rate": 7.603993344425957e-07, "logits/chosen": -1.9746885299682617, "logits/rejected": -1.9125902652740479, "logps/chosen": -706.5521240234375, "logps/rejected": -203.7216796875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 6.655674934387207, "rewards/margins": 8.105162620544434, "rewards/rejected": -1.4494876861572266, "step": 652 }, { "epoch": 0.2646762154528949, "grad_norm": 0.029089657589793205, "learning_rate": 7.587354409317803e-07, "logits/chosen": -1.9136505126953125, "logits/rejected": -1.897823691368103, "logps/chosen": -601.5096435546875, "logps/rejected": -195.9197540283203, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 6.479061603546143, "rewards/margins": 8.176251411437988, "rewards/rejected": -1.697190761566162, "step": 656 }, { "epoch": 0.26629009481541255, "grad_norm": 0.024450503289699554, "learning_rate": 7.57071547420965e-07, "logits/chosen": -1.9478548765182495, "logits/rejected": -1.9389933347702026, "logps/chosen": -549.5435791015625, "logps/rejected": -189.12681579589844, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 5.890069007873535, "rewards/margins": 7.569022178649902, "rewards/rejected": -1.6789536476135254, "step": 660 }, { "epoch": 0.2679039741779302, "grad_norm": 0.03129173070192337, "learning_rate": 7.554076539101497e-07, "logits/chosen": -1.979838490486145, "logits/rejected": -1.9305124282836914, "logps/chosen": -690.2908935546875, "logps/rejected": -201.1938934326172, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 6.037391662597656, "rewards/margins": 7.645337104797363, "rewards/rejected": -1.6079461574554443, "step": 664 }, { "epoch": 0.2695178535404478, "grad_norm": 0.01753445900976658, "learning_rate": 7.537437603993344e-07, "logits/chosen": -1.9023274183273315, "logits/rejected": -1.923431158065796, "logps/chosen": -660.8574829101562, "logps/rejected": -189.8162841796875, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 6.363310813903809, "rewards/margins": 7.736293315887451, "rewards/rejected": -1.3729828596115112, "step": 668 }, { "epoch": 0.2711317329029655, "grad_norm": 0.09512970596551895, "learning_rate": 7.520798668885191e-07, "logits/chosen": -1.9779924154281616, "logits/rejected": -1.9420959949493408, "logps/chosen": -551.363525390625, "logps/rejected": -204.19281005859375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 5.722014904022217, "rewards/margins": 7.390229225158691, "rewards/rejected": -1.668214201927185, "step": 672 }, { "epoch": 0.27274561226548316, "grad_norm": 0.07702352851629257, "learning_rate": 7.504159733777038e-07, "logits/chosen": -1.8914786577224731, "logits/rejected": -1.8705425262451172, "logps/chosen": -588.5633544921875, "logps/rejected": -186.31301879882812, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 5.902681827545166, "rewards/margins": 7.44985294342041, "rewards/rejected": -1.5471714735031128, "step": 676 }, { "epoch": 0.27435949162800083, "grad_norm": 0.1086677610874176, "learning_rate": 7.487520798668885e-07, "logits/chosen": -1.976760983467102, "logits/rejected": -1.9541994333267212, "logps/chosen": -523.1632690429688, "logps/rejected": -210.79196166992188, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 6.086148738861084, "rewards/margins": 7.732944965362549, "rewards/rejected": -1.6467962265014648, "step": 680 }, { "epoch": 0.27597337099051844, "grad_norm": 0.024171333760023117, "learning_rate": 7.470881863560732e-07, "logits/chosen": -1.9454660415649414, "logits/rejected": -1.9335111379623413, "logps/chosen": -586.666748046875, "logps/rejected": -191.52574157714844, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 6.287234783172607, "rewards/margins": 7.813823699951172, "rewards/rejected": -1.5265893936157227, "step": 684 }, { "epoch": 0.2775872503530361, "grad_norm": 0.040645014494657516, "learning_rate": 7.454242928452579e-07, "logits/chosen": -1.915562391281128, "logits/rejected": -1.8797332048416138, "logps/chosen": -583.9312744140625, "logps/rejected": -201.8753662109375, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 6.975578308105469, "rewards/margins": 8.586902618408203, "rewards/rejected": -1.611324667930603, "step": 688 }, { "epoch": 0.2792011297155538, "grad_norm": 0.05405467003583908, "learning_rate": 7.437603993344426e-07, "logits/chosen": -1.979967713356018, "logits/rejected": -1.8889647722244263, "logps/chosen": -536.1749877929688, "logps/rejected": -189.13348388671875, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": 6.234835624694824, "rewards/margins": 7.638007640838623, "rewards/rejected": -1.4031715393066406, "step": 692 }, { "epoch": 0.2808150090780714, "grad_norm": 0.02726822718977928, "learning_rate": 7.420965058236273e-07, "logits/chosen": -1.9711570739746094, "logits/rejected": -1.900430679321289, "logps/chosen": -535.8851318359375, "logps/rejected": -177.5406951904297, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 6.2670392990112305, "rewards/margins": 7.55366325378418, "rewards/rejected": -1.286623477935791, "step": 696 }, { "epoch": 0.28242888844058905, "grad_norm": 0.00478845601901412, "learning_rate": 7.40432612312812e-07, "logits/chosen": -1.9295339584350586, "logits/rejected": -1.9002552032470703, "logps/chosen": -675.7437744140625, "logps/rejected": -190.54122924804688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 6.650766372680664, "rewards/margins": 8.343780517578125, "rewards/rejected": -1.6930148601531982, "step": 700 }, { "epoch": 0.2840427678031067, "grad_norm": 0.035793520510196686, "learning_rate": 7.387687188019967e-07, "logits/chosen": -1.8846255540847778, "logits/rejected": -1.8337141275405884, "logps/chosen": -478.6111755371094, "logps/rejected": -180.29185485839844, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 5.600649833679199, "rewards/margins": 7.018938064575195, "rewards/rejected": -1.4182881116867065, "step": 704 }, { "epoch": 0.2856566471656244, "grad_norm": 0.013141079805791378, "learning_rate": 7.371048252911814e-07, "logits/chosen": -1.885514259338379, "logits/rejected": -1.9080872535705566, "logps/chosen": -564.4568481445312, "logps/rejected": -194.3576202392578, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 6.38169527053833, "rewards/margins": 8.011942863464355, "rewards/rejected": -1.630247950553894, "step": 708 }, { "epoch": 0.287270526528142, "grad_norm": 0.10597706586122513, "learning_rate": 7.354409317803661e-07, "logits/chosen": -1.9374375343322754, "logits/rejected": -1.8915388584136963, "logps/chosen": -592.4541625976562, "logps/rejected": -210.9936065673828, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 6.405781269073486, "rewards/margins": 7.960553169250488, "rewards/rejected": -1.5547715425491333, "step": 712 }, { "epoch": 0.28888440589065967, "grad_norm": 0.034928254783153534, "learning_rate": 7.337770382695508e-07, "logits/chosen": -1.9575929641723633, "logits/rejected": -1.8892122507095337, "logps/chosen": -570.9534912109375, "logps/rejected": -194.24636840820312, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 5.875996112823486, "rewards/margins": 7.565704822540283, "rewards/rejected": -1.6897085905075073, "step": 716 }, { "epoch": 0.29049828525317734, "grad_norm": 0.06385759264230728, "learning_rate": 7.321131447587355e-07, "logits/chosen": -1.9510563611984253, "logits/rejected": -1.979369878768921, "logps/chosen": -529.5039672851562, "logps/rejected": -181.50665283203125, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 5.493731498718262, "rewards/margins": 7.313755989074707, "rewards/rejected": -1.8200244903564453, "step": 720 }, { "epoch": 0.292112164615695, "grad_norm": 0.03217162936925888, "learning_rate": 7.304492512479202e-07, "logits/chosen": -1.9162757396697998, "logits/rejected": -1.9250762462615967, "logps/chosen": -660.95703125, "logps/rejected": -196.26463317871094, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 6.477663993835449, "rewards/margins": 7.805630683898926, "rewards/rejected": -1.3279672861099243, "step": 724 }, { "epoch": 0.2937260439782126, "grad_norm": 0.018973903730511665, "learning_rate": 7.287853577371048e-07, "logits/chosen": -1.9460331201553345, "logits/rejected": -1.9091904163360596, "logps/chosen": -517.0439453125, "logps/rejected": -176.53091430664062, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": 5.817058086395264, "rewards/margins": 7.488324165344238, "rewards/rejected": -1.6712661981582642, "step": 728 }, { "epoch": 0.2953399233407303, "grad_norm": 0.07541202753782272, "learning_rate": 7.271214642262895e-07, "logits/chosen": -1.9080225229263306, "logits/rejected": -1.8540575504302979, "logps/chosen": -587.1410522460938, "logps/rejected": -178.22805786132812, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 6.332078456878662, "rewards/margins": 7.975255966186523, "rewards/rejected": -1.6431777477264404, "step": 732 }, { "epoch": 0.29695380270324795, "grad_norm": 0.048339713364839554, "learning_rate": 7.254575707154742e-07, "logits/chosen": -1.9083560705184937, "logits/rejected": -1.8943049907684326, "logps/chosen": -595.3967895507812, "logps/rejected": -176.25213623046875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 6.387312889099121, "rewards/margins": 7.922239303588867, "rewards/rejected": -1.5349266529083252, "step": 736 }, { "epoch": 0.29856768206576556, "grad_norm": 0.21796870231628418, "learning_rate": 7.237936772046589e-07, "logits/chosen": -1.9192619323730469, "logits/rejected": -1.9225327968597412, "logps/chosen": -580.9888916015625, "logps/rejected": -208.0860595703125, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 6.340082168579102, "rewards/margins": 8.130502700805664, "rewards/rejected": -1.7904205322265625, "step": 740 }, { "epoch": 0.30018156142828323, "grad_norm": 0.045838337391614914, "learning_rate": 7.221297836938436e-07, "logits/chosen": -1.9274702072143555, "logits/rejected": -1.9504469633102417, "logps/chosen": -650.2282104492188, "logps/rejected": -198.4795684814453, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 6.717188358306885, "rewards/margins": 8.751901626586914, "rewards/rejected": -2.0347135066986084, "step": 744 }, { "epoch": 0.3017954407908009, "grad_norm": 0.007083447650074959, "learning_rate": 7.204658901830283e-07, "logits/chosen": -1.9178129434585571, "logits/rejected": -1.9547377824783325, "logps/chosen": -567.985107421875, "logps/rejected": -211.17483520507812, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 6.095490455627441, "rewards/margins": 8.118324279785156, "rewards/rejected": -2.022834300994873, "step": 748 }, { "epoch": 0.30340932015331856, "grad_norm": 0.09265980124473572, "learning_rate": 7.18801996672213e-07, "logits/chosen": -1.9538295269012451, "logits/rejected": -1.923251748085022, "logps/chosen": -508.88409423828125, "logps/rejected": -174.88082885742188, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 5.715723037719727, "rewards/margins": 7.368039608001709, "rewards/rejected": -1.6523168087005615, "step": 752 }, { "epoch": 0.3050231995158362, "grad_norm": 0.006732017267495394, "learning_rate": 7.171381031613977e-07, "logits/chosen": -1.9270073175430298, "logits/rejected": -1.9271186590194702, "logps/chosen": -633.9362182617188, "logps/rejected": -196.2408905029297, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 6.630672454833984, "rewards/margins": 8.22664737701416, "rewards/rejected": -1.5959752798080444, "step": 756 }, { "epoch": 0.30663707887835384, "grad_norm": 0.06100720539689064, "learning_rate": 7.154742096505824e-07, "logits/chosen": -1.954552173614502, "logits/rejected": -1.9248647689819336, "logps/chosen": -517.2080688476562, "logps/rejected": -194.17111206054688, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 6.339962959289551, "rewards/margins": 8.037684440612793, "rewards/rejected": -1.6977219581604004, "step": 760 }, { "epoch": 0.3082509582408715, "grad_norm": 0.07550331205129623, "learning_rate": 7.138103161397671e-07, "logits/chosen": -1.9494125843048096, "logits/rejected": -1.9443860054016113, "logps/chosen": -590.50439453125, "logps/rejected": -190.44839477539062, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 5.791325569152832, "rewards/margins": 7.566539764404297, "rewards/rejected": -1.7752138376235962, "step": 764 }, { "epoch": 0.3098648376033891, "grad_norm": 0.02517550066113472, "learning_rate": 7.121464226289518e-07, "logits/chosen": -1.956843614578247, "logits/rejected": -1.9373818635940552, "logps/chosen": -576.1766357421875, "logps/rejected": -186.9102783203125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 6.374565601348877, "rewards/margins": 8.311368942260742, "rewards/rejected": -1.9368047714233398, "step": 768 }, { "epoch": 0.3114787169659068, "grad_norm": 0.02659507840871811, "learning_rate": 7.104825291181365e-07, "logits/chosen": -1.944745421409607, "logits/rejected": -1.9182993173599243, "logps/chosen": -543.2911987304688, "logps/rejected": -210.95730590820312, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 6.439809799194336, "rewards/margins": 8.275406837463379, "rewards/rejected": -1.8355971574783325, "step": 772 }, { "epoch": 0.31309259632842446, "grad_norm": 0.046082593500614166, "learning_rate": 7.088186356073212e-07, "logits/chosen": -1.9052424430847168, "logits/rejected": -1.8364704847335815, "logps/chosen": -525.1375122070312, "logps/rejected": -167.77635192871094, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 5.76659631729126, "rewards/margins": 7.023763179779053, "rewards/rejected": -1.257167100906372, "step": 776 }, { "epoch": 0.3147064756909421, "grad_norm": 0.0750167965888977, "learning_rate": 7.071547420965059e-07, "logits/chosen": -1.9156404733657837, "logits/rejected": -1.9509721994400024, "logps/chosen": -598.7197875976562, "logps/rejected": -193.7984619140625, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 6.9247846603393555, "rewards/margins": 8.811848640441895, "rewards/rejected": -1.88706374168396, "step": 780 }, { "epoch": 0.31632035505345973, "grad_norm": 0.12552636861801147, "learning_rate": 7.054908485856906e-07, "logits/chosen": -1.9466532468795776, "logits/rejected": -1.8765583038330078, "logps/chosen": -492.2370910644531, "logps/rejected": -174.9585723876953, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 5.8003997802734375, "rewards/margins": 7.350320816040039, "rewards/rejected": -1.549920916557312, "step": 784 }, { "epoch": 0.3179342344159774, "grad_norm": 0.012276340276002884, "learning_rate": 7.038269550748753e-07, "logits/chosen": -1.9458484649658203, "logits/rejected": -1.9364690780639648, "logps/chosen": -580.38232421875, "logps/rejected": -187.4744415283203, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 6.455801010131836, "rewards/margins": 8.191352844238281, "rewards/rejected": -1.7355525493621826, "step": 788 }, { "epoch": 0.31954811377849507, "grad_norm": 0.015637286007404327, "learning_rate": 7.0216306156406e-07, "logits/chosen": -1.8707361221313477, "logits/rejected": -1.8636021614074707, "logps/chosen": -650.94580078125, "logps/rejected": -186.5550079345703, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.265646457672119, "rewards/margins": 8.823201179504395, "rewards/rejected": -1.5575547218322754, "step": 792 }, { "epoch": 0.32116199314101274, "grad_norm": 0.07168936729431152, "learning_rate": 7.004991680532447e-07, "logits/chosen": -1.9374241828918457, "logits/rejected": -1.9645403623580933, "logps/chosen": -573.480224609375, "logps/rejected": -187.64114379882812, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 6.135420799255371, "rewards/margins": 8.029410362243652, "rewards/rejected": -1.8939902782440186, "step": 796 }, { "epoch": 0.32277587250353035, "grad_norm": 0.02777090296149254, "learning_rate": 6.988352745424293e-07, "logits/chosen": -1.9059945344924927, "logits/rejected": -1.9286472797393799, "logps/chosen": -517.528076171875, "logps/rejected": -163.12109375, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 5.8436737060546875, "rewards/margins": 7.512122631072998, "rewards/rejected": -1.6684489250183105, "step": 800 }, { "epoch": 0.324389751866048, "grad_norm": 0.04046224430203438, "learning_rate": 6.97171381031614e-07, "logits/chosen": -1.9411041736602783, "logits/rejected": -1.9234236478805542, "logps/chosen": -545.490234375, "logps/rejected": -176.35218811035156, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 6.62785530090332, "rewards/margins": 8.282234191894531, "rewards/rejected": -1.654378890991211, "step": 804 }, { "epoch": 0.3260036312285657, "grad_norm": 0.10420721769332886, "learning_rate": 6.955074875207986e-07, "logits/chosen": -1.9753177165985107, "logits/rejected": -1.9376976490020752, "logps/chosen": -557.6132202148438, "logps/rejected": -186.17910766601562, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 6.732089042663574, "rewards/margins": 8.254746437072754, "rewards/rejected": -1.5226565599441528, "step": 808 }, { "epoch": 0.3276175105910833, "grad_norm": 0.00415422860532999, "learning_rate": 6.938435940099833e-07, "logits/chosen": -1.9505969285964966, "logits/rejected": -1.9200564622879028, "logps/chosen": -551.82373046875, "logps/rejected": -198.29263305664062, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 6.517372131347656, "rewards/margins": 8.294368743896484, "rewards/rejected": -1.7769966125488281, "step": 812 }, { "epoch": 0.32923138995360096, "grad_norm": 0.030412232503294945, "learning_rate": 6.92179700499168e-07, "logits/chosen": -1.924582600593567, "logits/rejected": -1.8792307376861572, "logps/chosen": -628.7982788085938, "logps/rejected": -175.77822875976562, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 6.815149784088135, "rewards/margins": 8.477224349975586, "rewards/rejected": -1.662074327468872, "step": 816 }, { "epoch": 0.33084526931611863, "grad_norm": 0.025508305057883263, "learning_rate": 6.905158069883527e-07, "logits/chosen": -1.9181338548660278, "logits/rejected": -1.9261795282363892, "logps/chosen": -669.955810546875, "logps/rejected": -212.17181396484375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 6.617237567901611, "rewards/margins": 8.640678405761719, "rewards/rejected": -2.023440361022949, "step": 820 }, { "epoch": 0.3324591486786363, "grad_norm": 0.008759116753935814, "learning_rate": 6.888519134775374e-07, "logits/chosen": -1.9712437391281128, "logits/rejected": -1.9301774501800537, "logps/chosen": -595.25146484375, "logps/rejected": -194.1730499267578, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 6.257084846496582, "rewards/margins": 8.095664978027344, "rewards/rejected": -1.8385794162750244, "step": 824 }, { "epoch": 0.3340730280411539, "grad_norm": 0.04397183284163475, "learning_rate": 6.87188019966722e-07, "logits/chosen": -1.9374957084655762, "logits/rejected": -1.884482502937317, "logps/chosen": -608.088623046875, "logps/rejected": -183.62701416015625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 6.814413547515869, "rewards/margins": 8.50701904296875, "rewards/rejected": -1.692604422569275, "step": 828 }, { "epoch": 0.3356869074036716, "grad_norm": 0.08009158819913864, "learning_rate": 6.855241264559067e-07, "logits/chosen": -1.956276297569275, "logits/rejected": -1.9358737468719482, "logps/chosen": -616.6802368164062, "logps/rejected": -198.32801818847656, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 6.785815238952637, "rewards/margins": 8.837486267089844, "rewards/rejected": -2.051671028137207, "step": 832 }, { "epoch": 0.33730078676618924, "grad_norm": 0.040808044373989105, "learning_rate": 6.838602329450914e-07, "logits/chosen": -1.9929709434509277, "logits/rejected": -1.9244838953018188, "logps/chosen": -572.5156860351562, "logps/rejected": -217.1726531982422, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 5.9257307052612305, "rewards/margins": 7.60105562210083, "rewards/rejected": -1.6753250360488892, "step": 836 }, { "epoch": 0.33891466612870685, "grad_norm": 0.023983286693692207, "learning_rate": 6.821963394342761e-07, "logits/chosen": -1.9130549430847168, "logits/rejected": -1.9502537250518799, "logps/chosen": -514.3883056640625, "logps/rejected": -178.52569580078125, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 5.9270806312561035, "rewards/margins": 7.686283588409424, "rewards/rejected": -1.7592031955718994, "step": 840 }, { "epoch": 0.3405285454912245, "grad_norm": 0.010019694454967976, "learning_rate": 6.805324459234608e-07, "logits/chosen": -1.93647301197052, "logits/rejected": -1.942413330078125, "logps/chosen": -676.4921264648438, "logps/rejected": -196.15756225585938, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.3996262550354, "rewards/margins": 9.223345756530762, "rewards/rejected": -1.8237197399139404, "step": 844 }, { "epoch": 0.3421424248537422, "grad_norm": 0.05626523122191429, "learning_rate": 6.788685524126455e-07, "logits/chosen": -1.9050676822662354, "logits/rejected": -1.9132516384124756, "logps/chosen": -606.2343139648438, "logps/rejected": -191.0836181640625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 7.125362396240234, "rewards/margins": 8.8479585647583, "rewards/rejected": -1.7225968837738037, "step": 848 }, { "epoch": 0.34375630421625986, "grad_norm": 0.007355029229074717, "learning_rate": 6.772046589018302e-07, "logits/chosen": -1.9791120290756226, "logits/rejected": -1.9788198471069336, "logps/chosen": -664.267822265625, "logps/rejected": -208.94259643554688, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.077528953552246, "rewards/margins": 8.66952133178711, "rewards/rejected": -1.5919928550720215, "step": 852 }, { "epoch": 0.34537018357877747, "grad_norm": 0.02963581308722496, "learning_rate": 6.755407653910149e-07, "logits/chosen": -1.918620228767395, "logits/rejected": -1.8734290599822998, "logps/chosen": -653.130126953125, "logps/rejected": -208.23121643066406, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 6.961310863494873, "rewards/margins": 8.766632080078125, "rewards/rejected": -1.8053210973739624, "step": 856 }, { "epoch": 0.34698406294129513, "grad_norm": 0.050631873309612274, "learning_rate": 6.738768718801996e-07, "logits/chosen": -1.8936010599136353, "logits/rejected": -1.9293054342269897, "logps/chosen": -617.9654541015625, "logps/rejected": -182.90545654296875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 6.524641513824463, "rewards/margins": 8.351251602172852, "rewards/rejected": -1.8266092538833618, "step": 860 }, { "epoch": 0.3485979423038128, "grad_norm": 0.005959841422736645, "learning_rate": 6.722129783693843e-07, "logits/chosen": -1.9607820510864258, "logits/rejected": -1.9200009107589722, "logps/chosen": -635.8944702148438, "logps/rejected": -194.32452392578125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 6.628870010375977, "rewards/margins": 8.417109489440918, "rewards/rejected": -1.7882393598556519, "step": 864 }, { "epoch": 0.35021182166633047, "grad_norm": 0.01879369653761387, "learning_rate": 6.705490848585689e-07, "logits/chosen": -1.987038254737854, "logits/rejected": -1.9422657489776611, "logps/chosen": -623.9428100585938, "logps/rejected": -193.27476501464844, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 6.551445007324219, "rewards/margins": 8.49710750579834, "rewards/rejected": -1.9456627368927002, "step": 868 }, { "epoch": 0.3518257010288481, "grad_norm": 0.011375589296221733, "learning_rate": 6.688851913477536e-07, "logits/chosen": -1.9569958448410034, "logits/rejected": -1.9479682445526123, "logps/chosen": -591.7084350585938, "logps/rejected": -182.8351287841797, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 6.6322479248046875, "rewards/margins": 8.334357261657715, "rewards/rejected": -1.7021089792251587, "step": 872 }, { "epoch": 0.35343958039136575, "grad_norm": 0.0170380137860775, "learning_rate": 6.672212978369383e-07, "logits/chosen": -1.951852560043335, "logits/rejected": -1.9124541282653809, "logps/chosen": -592.2799072265625, "logps/rejected": -177.9345245361328, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 6.4514641761779785, "rewards/margins": 8.219091415405273, "rewards/rejected": -1.7676277160644531, "step": 876 }, { "epoch": 0.3550534597538834, "grad_norm": 0.05186901241540909, "learning_rate": 6.65557404326123e-07, "logits/chosen": -1.8778126239776611, "logits/rejected": -1.8868016004562378, "logps/chosen": -693.2175903320312, "logps/rejected": -199.1944580078125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 7.389717102050781, "rewards/margins": 9.224071502685547, "rewards/rejected": -1.8343538045883179, "step": 880 }, { "epoch": 0.356667339116401, "grad_norm": 0.01641934923827648, "learning_rate": 6.638935108153077e-07, "logits/chosen": -1.9367520809173584, "logits/rejected": -1.919547438621521, "logps/chosen": -650.5889892578125, "logps/rejected": -191.75633239746094, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 7.277103900909424, "rewards/margins": 9.073773384094238, "rewards/rejected": -1.7966694831848145, "step": 884 }, { "epoch": 0.3582812184789187, "grad_norm": 0.05502952262759209, "learning_rate": 6.622296173044924e-07, "logits/chosen": -1.9445141553878784, "logits/rejected": -1.909563660621643, "logps/chosen": -603.1407470703125, "logps/rejected": -178.66571044921875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 6.947625160217285, "rewards/margins": 8.504727363586426, "rewards/rejected": -1.557102084159851, "step": 888 }, { "epoch": 0.35989509784143636, "grad_norm": 0.02586834691464901, "learning_rate": 6.605657237936771e-07, "logits/chosen": -1.9757575988769531, "logits/rejected": -1.9597227573394775, "logps/chosen": -555.0670776367188, "logps/rejected": -192.54647827148438, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 6.224790096282959, "rewards/margins": 8.240776062011719, "rewards/rejected": -2.015986204147339, "step": 892 }, { "epoch": 0.36150897720395403, "grad_norm": 0.024232439696788788, "learning_rate": 6.589018302828618e-07, "logits/chosen": -1.9654958248138428, "logits/rejected": -1.8687621355056763, "logps/chosen": -483.0306091308594, "logps/rejected": -198.94049072265625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 6.327938079833984, "rewards/margins": 8.172499656677246, "rewards/rejected": -1.8445615768432617, "step": 896 }, { "epoch": 0.36312285656647164, "grad_norm": 0.01601296104490757, "learning_rate": 6.572379367720465e-07, "logits/chosen": -1.9415886402130127, "logits/rejected": -1.9930737018585205, "logps/chosen": -712.5814208984375, "logps/rejected": -198.70521545410156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.330328464508057, "rewards/margins": 9.318495750427246, "rewards/rejected": -1.9881669282913208, "step": 900 }, { "epoch": 0.3647367359289893, "grad_norm": 0.05817662179470062, "learning_rate": 6.555740432612312e-07, "logits/chosen": -1.9273086786270142, "logits/rejected": -1.902387022972107, "logps/chosen": -517.6026000976562, "logps/rejected": -204.0680389404297, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 6.50600528717041, "rewards/margins": 8.153182983398438, "rewards/rejected": -1.6471774578094482, "step": 904 }, { "epoch": 0.366350615291507, "grad_norm": 0.0028873577248305082, "learning_rate": 6.539101497504159e-07, "logits/chosen": -1.9398447275161743, "logits/rejected": -1.9420585632324219, "logps/chosen": -706.7715454101562, "logps/rejected": -212.1455078125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.649332046508789, "rewards/margins": 9.84432601928711, "rewards/rejected": -2.1949946880340576, "step": 908 }, { "epoch": 0.3679644946540246, "grad_norm": 0.02474329061806202, "learning_rate": 6.522462562396006e-07, "logits/chosen": -1.9440048933029175, "logits/rejected": -1.9706608057022095, "logps/chosen": -646.8698120117188, "logps/rejected": -209.96519470214844, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 6.770456314086914, "rewards/margins": 8.624558448791504, "rewards/rejected": -1.8541024923324585, "step": 912 }, { "epoch": 0.36957837401654225, "grad_norm": 0.03593067079782486, "learning_rate": 6.505823627287853e-07, "logits/chosen": -1.8650156259536743, "logits/rejected": -1.8405320644378662, "logps/chosen": -572.8207397460938, "logps/rejected": -183.48989868164062, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 6.493901252746582, "rewards/margins": 8.206184387207031, "rewards/rejected": -1.7122828960418701, "step": 916 }, { "epoch": 0.3711922533790599, "grad_norm": 0.08778494596481323, "learning_rate": 6.4891846921797e-07, "logits/chosen": -1.9268856048583984, "logits/rejected": -1.942408561706543, "logps/chosen": -562.637939453125, "logps/rejected": -215.46884155273438, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 6.910264492034912, "rewards/margins": 8.543943405151367, "rewards/rejected": -1.6336784362792969, "step": 920 }, { "epoch": 0.3728061327415776, "grad_norm": 0.010769553482532501, "learning_rate": 6.472545757071547e-07, "logits/chosen": -1.9581338167190552, "logits/rejected": -1.9255847930908203, "logps/chosen": -574.40185546875, "logps/rejected": -188.9175262451172, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 6.676064491271973, "rewards/margins": 8.941415786743164, "rewards/rejected": -2.265350818634033, "step": 924 }, { "epoch": 0.3744200121040952, "grad_norm": 0.023630479350686073, "learning_rate": 6.455906821963394e-07, "logits/chosen": -1.964378833770752, "logits/rejected": -1.9087932109832764, "logps/chosen": -485.9841003417969, "logps/rejected": -193.84156799316406, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 5.7110795974731445, "rewards/margins": 7.7595109939575195, "rewards/rejected": -2.0484304428100586, "step": 928 }, { "epoch": 0.37603389146661287, "grad_norm": 0.0487801656126976, "learning_rate": 6.439267886855241e-07, "logits/chosen": -1.9305438995361328, "logits/rejected": -1.9351520538330078, "logps/chosen": -559.9175415039062, "logps/rejected": -192.95530700683594, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 6.656792163848877, "rewards/margins": 8.230308532714844, "rewards/rejected": -1.5735161304473877, "step": 932 }, { "epoch": 0.37764777082913054, "grad_norm": 0.005238568875938654, "learning_rate": 6.422628951747088e-07, "logits/chosen": -1.9196078777313232, "logits/rejected": -1.9165228605270386, "logps/chosen": -654.2920532226562, "logps/rejected": -194.13067626953125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.968348503112793, "rewards/margins": 9.833093643188477, "rewards/rejected": -1.8647446632385254, "step": 936 }, { "epoch": 0.37926165019164815, "grad_norm": 0.07812482863664627, "learning_rate": 6.405990016638934e-07, "logits/chosen": -1.9091360569000244, "logits/rejected": -1.9239163398742676, "logps/chosen": -630.5042724609375, "logps/rejected": -198.91207885742188, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 7.144596576690674, "rewards/margins": 8.954609870910645, "rewards/rejected": -1.8100132942199707, "step": 940 }, { "epoch": 0.3808755295541658, "grad_norm": 0.009957020170986652, "learning_rate": 6.389351081530781e-07, "logits/chosen": -1.9125887155532837, "logits/rejected": -1.8880256414413452, "logps/chosen": -452.0191650390625, "logps/rejected": -182.02027893066406, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 6.232110977172852, "rewards/margins": 8.04847240447998, "rewards/rejected": -1.816361427307129, "step": 944 }, { "epoch": 0.3824894089166835, "grad_norm": 0.015520499087870121, "learning_rate": 6.372712146422628e-07, "logits/chosen": -1.9057499170303345, "logits/rejected": -1.9060219526290894, "logps/chosen": -666.0677490234375, "logps/rejected": -198.77867126464844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.528844833374023, "rewards/margins": 9.659141540527344, "rewards/rejected": -2.130296230316162, "step": 948 }, { "epoch": 0.38410328827920115, "grad_norm": 0.044486261904239655, "learning_rate": 6.356073211314475e-07, "logits/chosen": -1.9310541152954102, "logits/rejected": -1.900843858718872, "logps/chosen": -534.695068359375, "logps/rejected": -179.0294952392578, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 6.620725154876709, "rewards/margins": 8.464913368225098, "rewards/rejected": -1.84418785572052, "step": 952 }, { "epoch": 0.38571716764171876, "grad_norm": 0.02794666215777397, "learning_rate": 6.339434276206322e-07, "logits/chosen": -1.959954023361206, "logits/rejected": -1.9634037017822266, "logps/chosen": -515.0735473632812, "logps/rejected": -192.10833740234375, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": 6.0295634269714355, "rewards/margins": 7.957436561584473, "rewards/rejected": -1.927872896194458, "step": 956 }, { "epoch": 0.38733104700423643, "grad_norm": 0.034617602825164795, "learning_rate": 6.322795341098169e-07, "logits/chosen": -1.9335181713104248, "logits/rejected": -1.8976356983184814, "logps/chosen": -558.5010375976562, "logps/rejected": -189.98008728027344, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 6.949714183807373, "rewards/margins": 8.775166511535645, "rewards/rejected": -1.8254531621932983, "step": 960 }, { "epoch": 0.3889449263667541, "grad_norm": 0.023845413699746132, "learning_rate": 6.306156405990016e-07, "logits/chosen": -1.9889020919799805, "logits/rejected": -2.0035862922668457, "logps/chosen": -583.6094970703125, "logps/rejected": -207.7798614501953, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 6.893913745880127, "rewards/margins": 8.897465705871582, "rewards/rejected": -2.0035510063171387, "step": 964 }, { "epoch": 0.39055880572927176, "grad_norm": 0.022340890020132065, "learning_rate": 6.289517470881863e-07, "logits/chosen": -1.8911035060882568, "logits/rejected": -1.9091596603393555, "logps/chosen": -613.7164916992188, "logps/rejected": -190.47537231445312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.276937484741211, "rewards/margins": 9.093494415283203, "rewards/rejected": -1.816556453704834, "step": 968 }, { "epoch": 0.3921726850917894, "grad_norm": 0.08413345366716385, "learning_rate": 6.27287853577371e-07, "logits/chosen": -1.9483861923217773, "logits/rejected": -1.9210608005523682, "logps/chosen": -665.8541259765625, "logps/rejected": -203.126953125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 7.308953285217285, "rewards/margins": 9.405411720275879, "rewards/rejected": -2.096458673477173, "step": 972 }, { "epoch": 0.39378656445430704, "grad_norm": 0.04811552166938782, "learning_rate": 6.256239600665557e-07, "logits/chosen": -1.9949203729629517, "logits/rejected": -1.967159628868103, "logps/chosen": -519.6574096679688, "logps/rejected": -189.46444702148438, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 6.4700541496276855, "rewards/margins": 8.442337989807129, "rewards/rejected": -1.9722844362258911, "step": 976 }, { "epoch": 0.3954004438168247, "grad_norm": 0.02094968408346176, "learning_rate": 6.239600665557404e-07, "logits/chosen": -1.9816782474517822, "logits/rejected": -1.9247698783874512, "logps/chosen": -499.572509765625, "logps/rejected": -187.3372344970703, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 6.340493679046631, "rewards/margins": 8.323511123657227, "rewards/rejected": -1.983017921447754, "step": 980 }, { "epoch": 0.3970143231793423, "grad_norm": 0.0013992745662108064, "learning_rate": 6.222961730449251e-07, "logits/chosen": -1.9215202331542969, "logits/rejected": -1.9319528341293335, "logps/chosen": -685.5482177734375, "logps/rejected": -176.41531372070312, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 7.9256744384765625, "rewards/margins": 9.851841926574707, "rewards/rejected": -1.9261674880981445, "step": 984 }, { "epoch": 0.39862820254186, "grad_norm": 0.046316325664520264, "learning_rate": 6.206322795341098e-07, "logits/chosen": -1.9628549814224243, "logits/rejected": -1.9154279232025146, "logps/chosen": -582.0474853515625, "logps/rejected": -195.32550048828125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 6.659042835235596, "rewards/margins": 8.339271545410156, "rewards/rejected": -1.6802279949188232, "step": 988 }, { "epoch": 0.40024208190437766, "grad_norm": 0.012888907454907894, "learning_rate": 6.189683860232945e-07, "logits/chosen": -1.9659626483917236, "logits/rejected": -1.920491337776184, "logps/chosen": -533.0806274414062, "logps/rejected": -194.4873504638672, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 6.30048942565918, "rewards/margins": 8.225896835327148, "rewards/rejected": -1.9254074096679688, "step": 992 }, { "epoch": 0.4018559612668953, "grad_norm": 0.0009010803187265992, "learning_rate": 6.173044925124792e-07, "logits/chosen": -1.9162322282791138, "logits/rejected": -1.8789595365524292, "logps/chosen": -639.1532592773438, "logps/rejected": -203.8126983642578, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 7.143144130706787, "rewards/margins": 9.149251937866211, "rewards/rejected": -2.006108283996582, "step": 996 }, { "epoch": 0.40346984062941293, "grad_norm": 0.0030536106787621975, "learning_rate": 6.156405990016639e-07, "logits/chosen": -1.9057064056396484, "logits/rejected": -1.9273066520690918, "logps/chosen": -657.9024658203125, "logps/rejected": -185.53558349609375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 7.753528118133545, "rewards/margins": 9.42508316040039, "rewards/rejected": -1.6715552806854248, "step": 1000 }, { "epoch": 0.4050837199919306, "grad_norm": 0.027086157351732254, "learning_rate": 6.139767054908486e-07, "logits/chosen": -1.9536768198013306, "logits/rejected": -1.9666200876235962, "logps/chosen": -603.0996704101562, "logps/rejected": -213.695556640625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 7.0279927253723145, "rewards/margins": 9.299970626831055, "rewards/rejected": -2.2719779014587402, "step": 1004 }, { "epoch": 0.40669759935444827, "grad_norm": 0.025801124051213264, "learning_rate": 6.123128119800333e-07, "logits/chosen": -2.003777503967285, "logits/rejected": -1.9656994342803955, "logps/chosen": -488.47467041015625, "logps/rejected": -175.87261962890625, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 5.942250728607178, "rewards/margins": 7.706140518188477, "rewards/rejected": -1.7638887166976929, "step": 1008 }, { "epoch": 0.4083114787169659, "grad_norm": 0.005446454975754023, "learning_rate": 6.10648918469218e-07, "logits/chosen": -1.938076138496399, "logits/rejected": -1.9385313987731934, "logps/chosen": -621.1357421875, "logps/rejected": -201.28701782226562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.5549516677856445, "rewards/margins": 9.675981521606445, "rewards/rejected": -2.12103009223938, "step": 1012 }, { "epoch": 0.40992535807948355, "grad_norm": 0.013515722937881947, "learning_rate": 6.089850249584026e-07, "logits/chosen": -2.0177056789398193, "logits/rejected": -1.9850513935089111, "logps/chosen": -507.079345703125, "logps/rejected": -200.44349670410156, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 6.593064308166504, "rewards/margins": 8.6827974319458, "rewards/rejected": -2.089733123779297, "step": 1016 }, { "epoch": 0.4115392374420012, "grad_norm": 0.007047024555504322, "learning_rate": 6.073211314475873e-07, "logits/chosen": -1.9892404079437256, "logits/rejected": -1.9619371891021729, "logps/chosen": -533.0233154296875, "logps/rejected": -213.83572387695312, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 6.522357940673828, "rewards/margins": 8.622007369995117, "rewards/rejected": -2.099648952484131, "step": 1020 }, { "epoch": 0.4131531168045189, "grad_norm": 0.0341547392308712, "learning_rate": 6.05657237936772e-07, "logits/chosen": -1.930118441581726, "logits/rejected": -1.933486819267273, "logps/chosen": -546.5753173828125, "logps/rejected": -188.8538055419922, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 6.718876361846924, "rewards/margins": 8.651607513427734, "rewards/rejected": -1.9327303171157837, "step": 1024 }, { "epoch": 0.4147669961670365, "grad_norm": 0.008423049934208393, "learning_rate": 6.039933444259567e-07, "logits/chosen": -1.9038822650909424, "logits/rejected": -1.8975721597671509, "logps/chosen": -655.1964721679688, "logps/rejected": -215.96218872070312, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 7.321654319763184, "rewards/margins": 8.874247550964355, "rewards/rejected": -1.5525931119918823, "step": 1028 }, { "epoch": 0.41638087552955416, "grad_norm": 0.005719688721001148, "learning_rate": 6.023294509151414e-07, "logits/chosen": -1.9657106399536133, "logits/rejected": -1.9172520637512207, "logps/chosen": -584.5862426757812, "logps/rejected": -207.45962524414062, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.162179470062256, "rewards/margins": 9.158191680908203, "rewards/rejected": -1.9960130453109741, "step": 1032 }, { "epoch": 0.41799475489207183, "grad_norm": 0.17047683894634247, "learning_rate": 6.006655574043261e-07, "logits/chosen": -1.9568921327590942, "logits/rejected": -1.9694132804870605, "logps/chosen": -560.7867431640625, "logps/rejected": -203.9275665283203, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 6.736236095428467, "rewards/margins": 8.553138732910156, "rewards/rejected": -1.816902756690979, "step": 1036 }, { "epoch": 0.4196086342545895, "grad_norm": 0.17048490047454834, "learning_rate": 5.990016638935108e-07, "logits/chosen": -1.8997257947921753, "logits/rejected": -1.9218554496765137, "logps/chosen": -585.1653442382812, "logps/rejected": -186.489990234375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 7.026707649230957, "rewards/margins": 9.061711311340332, "rewards/rejected": -2.035003423690796, "step": 1040 }, { "epoch": 0.4212225136171071, "grad_norm": 0.047775860875844955, "learning_rate": 5.973377703826955e-07, "logits/chosen": -1.9785871505737305, "logits/rejected": -1.9373071193695068, "logps/chosen": -616.56982421875, "logps/rejected": -188.8338623046875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 6.972675323486328, "rewards/margins": 8.729272842407227, "rewards/rejected": -1.7565964460372925, "step": 1044 }, { "epoch": 0.4228363929796248, "grad_norm": 0.009417595341801643, "learning_rate": 5.956738768718802e-07, "logits/chosen": -1.9110404253005981, "logits/rejected": -1.9015251398086548, "logps/chosen": -655.7503051757812, "logps/rejected": -207.07757568359375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 7.2995781898498535, "rewards/margins": 9.154539108276367, "rewards/rejected": -1.8549606800079346, "step": 1048 }, { "epoch": 0.42445027234214244, "grad_norm": 0.014614434912800789, "learning_rate": 5.940099833610649e-07, "logits/chosen": -1.9587862491607666, "logits/rejected": -1.8709607124328613, "logps/chosen": -470.2625732421875, "logps/rejected": -182.9962615966797, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 6.055819511413574, "rewards/margins": 7.880839824676514, "rewards/rejected": -1.8250205516815186, "step": 1052 }, { "epoch": 0.42606415170466005, "grad_norm": 0.033913563936948776, "learning_rate": 5.923460898502496e-07, "logits/chosen": -1.9502745866775513, "logits/rejected": -1.8831697702407837, "logps/chosen": -545.484375, "logps/rejected": -189.96499633789062, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 6.555781364440918, "rewards/margins": 8.20710277557373, "rewards/rejected": -1.6513222455978394, "step": 1056 }, { "epoch": 0.4276780310671777, "grad_norm": 0.008011486381292343, "learning_rate": 5.906821963394343e-07, "logits/chosen": -2.0003178119659424, "logits/rejected": -1.9627704620361328, "logps/chosen": -492.762939453125, "logps/rejected": -194.58078002929688, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 6.340209484100342, "rewards/margins": 8.556674003601074, "rewards/rejected": -2.2164647579193115, "step": 1060 }, { "epoch": 0.4292919104296954, "grad_norm": 0.05212472006678581, "learning_rate": 5.89018302828619e-07, "logits/chosen": -1.9067308902740479, "logits/rejected": -1.8866784572601318, "logps/chosen": -613.60498046875, "logps/rejected": -217.97525024414062, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 6.830389022827148, "rewards/margins": 8.930854797363281, "rewards/rejected": -2.1004655361175537, "step": 1064 }, { "epoch": 0.43090578979221306, "grad_norm": 0.024569297209382057, "learning_rate": 5.873544093178037e-07, "logits/chosen": -2.017029047012329, "logits/rejected": -1.994143009185791, "logps/chosen": -506.3706359863281, "logps/rejected": -191.1002197265625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 6.869837284088135, "rewards/margins": 8.986981391906738, "rewards/rejected": -2.1171441078186035, "step": 1068 }, { "epoch": 0.43251966915473067, "grad_norm": 0.007659610826522112, "learning_rate": 5.856905158069884e-07, "logits/chosen": -1.9706467390060425, "logits/rejected": -1.9155362844467163, "logps/chosen": -585.3357543945312, "logps/rejected": -197.29086303710938, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 7.008872032165527, "rewards/margins": 9.004765510559082, "rewards/rejected": -1.995894432067871, "step": 1072 }, { "epoch": 0.43413354851724834, "grad_norm": 0.0220956988632679, "learning_rate": 5.840266222961731e-07, "logits/chosen": -1.9931610822677612, "logits/rejected": -1.945159912109375, "logps/chosen": -459.5959777832031, "logps/rejected": -186.7078399658203, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 5.198247909545898, "rewards/margins": 7.064477920532227, "rewards/rejected": -1.866229772567749, "step": 1076 }, { "epoch": 0.435747427879766, "grad_norm": 0.019349757581949234, "learning_rate": 5.823627287853578e-07, "logits/chosen": -1.9494805335998535, "logits/rejected": -1.9418641328811646, "logps/chosen": -691.3120727539062, "logps/rejected": -212.51333618164062, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 6.843912601470947, "rewards/margins": 8.665177345275879, "rewards/rejected": -1.8212645053863525, "step": 1080 }, { "epoch": 0.4373613072422836, "grad_norm": 0.0008654408156871796, "learning_rate": 5.806988352745425e-07, "logits/chosen": -1.9530394077301025, "logits/rejected": -1.9719445705413818, "logps/chosen": -665.39794921875, "logps/rejected": -203.12371826171875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.795338153839111, "rewards/margins": 9.99510383605957, "rewards/rejected": -2.199766159057617, "step": 1084 }, { "epoch": 0.4389751866048013, "grad_norm": 0.0215070229023695, "learning_rate": 5.790349417637271e-07, "logits/chosen": -2.0045347213745117, "logits/rejected": -1.9445502758026123, "logps/chosen": -482.85174560546875, "logps/rejected": -186.9453125, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 5.892232894897461, "rewards/margins": 8.089595794677734, "rewards/rejected": -2.19736385345459, "step": 1088 }, { "epoch": 0.44058906596731895, "grad_norm": 0.02451961673796177, "learning_rate": 5.773710482529118e-07, "logits/chosen": -1.8992165327072144, "logits/rejected": -1.9131587743759155, "logps/chosen": -582.26220703125, "logps/rejected": -195.69297790527344, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 7.331181526184082, "rewards/margins": 9.310480117797852, "rewards/rejected": -1.9792969226837158, "step": 1092 }, { "epoch": 0.4422029453298366, "grad_norm": 0.0057367547415196896, "learning_rate": 5.757071547420965e-07, "logits/chosen": -1.9488577842712402, "logits/rejected": -1.9355590343475342, "logps/chosen": -693.2987060546875, "logps/rejected": -202.767578125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.19438362121582, "rewards/margins": 10.433257102966309, "rewards/rejected": -2.2388744354248047, "step": 1096 }, { "epoch": 0.44381682469235423, "grad_norm": 0.00894203782081604, "learning_rate": 5.740432612312812e-07, "logits/chosen": -1.9438257217407227, "logits/rejected": -1.9603071212768555, "logps/chosen": -645.8823852539062, "logps/rejected": -222.20513916015625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.536291122436523, "rewards/margins": 9.915311813354492, "rewards/rejected": -2.3790206909179688, "step": 1100 }, { "epoch": 0.4454307040548719, "grad_norm": 0.024341586977243423, "learning_rate": 5.723793677204659e-07, "logits/chosen": -1.9573278427124023, "logits/rejected": -1.9429997205734253, "logps/chosen": -636.8718872070312, "logps/rejected": -234.06295776367188, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.31941032409668, "rewards/margins": 9.365674018859863, "rewards/rejected": -2.046264171600342, "step": 1104 }, { "epoch": 0.44704458341738956, "grad_norm": 0.009238296188414097, "learning_rate": 5.707154742096506e-07, "logits/chosen": -1.9285259246826172, "logits/rejected": -1.8919577598571777, "logps/chosen": -540.9061889648438, "logps/rejected": -194.9390411376953, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 6.474114418029785, "rewards/margins": 8.43596076965332, "rewards/rejected": -1.9618459939956665, "step": 1108 }, { "epoch": 0.4486584627799072, "grad_norm": 0.0053011151030659676, "learning_rate": 5.690515806988353e-07, "logits/chosen": -1.9154037237167358, "logits/rejected": -1.9216842651367188, "logps/chosen": -556.5299682617188, "logps/rejected": -192.5624542236328, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 6.570614814758301, "rewards/margins": 8.47523307800293, "rewards/rejected": -1.9046186208724976, "step": 1112 }, { "epoch": 0.45027234214242484, "grad_norm": 0.030345546081662178, "learning_rate": 5.6738768718802e-07, "logits/chosen": -1.9388477802276611, "logits/rejected": -1.9337499141693115, "logps/chosen": -606.7693481445312, "logps/rejected": -186.21011352539062, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 7.61848258972168, "rewards/margins": 9.68882942199707, "rewards/rejected": -2.0703465938568115, "step": 1116 }, { "epoch": 0.4518862215049425, "grad_norm": 0.027264012023806572, "learning_rate": 5.657237936772047e-07, "logits/chosen": -1.948606252670288, "logits/rejected": -1.9525359869003296, "logps/chosen": -556.0435791015625, "logps/rejected": -197.65274047851562, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 6.766746520996094, "rewards/margins": 8.742637634277344, "rewards/rejected": -1.9758923053741455, "step": 1120 }, { "epoch": 0.4535001008674602, "grad_norm": 0.007660525385290384, "learning_rate": 5.640599001663894e-07, "logits/chosen": -1.8876006603240967, "logits/rejected": -1.8439995050430298, "logps/chosen": -591.87158203125, "logps/rejected": -176.9176483154297, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 7.190242767333984, "rewards/margins": 9.178365707397461, "rewards/rejected": -1.9881219863891602, "step": 1124 }, { "epoch": 0.4551139802299778, "grad_norm": 0.005620306357741356, "learning_rate": 5.62396006655574e-07, "logits/chosen": -1.9527791738510132, "logits/rejected": -1.887128233909607, "logps/chosen": -581.17431640625, "logps/rejected": -192.23291015625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.1905107498168945, "rewards/margins": 9.00893783569336, "rewards/rejected": -1.8184278011322021, "step": 1128 }, { "epoch": 0.45672785959249546, "grad_norm": 0.11267098784446716, "learning_rate": 5.607321131447587e-07, "logits/chosen": -1.949232816696167, "logits/rejected": -1.8992102146148682, "logps/chosen": -569.1506958007812, "logps/rejected": -191.79150390625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 6.846667289733887, "rewards/margins": 8.733184814453125, "rewards/rejected": -1.8865180015563965, "step": 1132 }, { "epoch": 0.4583417389550131, "grad_norm": 0.021589597687125206, "learning_rate": 5.590682196339434e-07, "logits/chosen": -2.0102248191833496, "logits/rejected": -1.9592649936676025, "logps/chosen": -575.1451416015625, "logps/rejected": -216.13241577148438, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 6.570772171020508, "rewards/margins": 8.77194595336914, "rewards/rejected": -2.2011733055114746, "step": 1136 }, { "epoch": 0.4599556183175308, "grad_norm": 0.0262326467782259, "learning_rate": 5.574043261231281e-07, "logits/chosen": -1.975756049156189, "logits/rejected": -1.9554497003555298, "logps/chosen": -518.0825805664062, "logps/rejected": -192.63621520996094, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 6.55406379699707, "rewards/margins": 8.494430541992188, "rewards/rejected": -1.9403672218322754, "step": 1140 }, { "epoch": 0.4615694976800484, "grad_norm": 0.006861544214189053, "learning_rate": 5.557404326123128e-07, "logits/chosen": -1.931782603263855, "logits/rejected": -1.92754328250885, "logps/chosen": -568.1206665039062, "logps/rejected": -202.23983764648438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.387733459472656, "rewards/margins": 9.287446022033691, "rewards/rejected": -1.8997132778167725, "step": 1144 }, { "epoch": 0.46318337704256607, "grad_norm": 0.016442686319351196, "learning_rate": 5.540765391014975e-07, "logits/chosen": -1.9435499906539917, "logits/rejected": -1.9456310272216797, "logps/chosen": -614.3421020507812, "logps/rejected": -178.60182189941406, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.475590705871582, "rewards/margins": 9.353108406066895, "rewards/rejected": -1.877517580986023, "step": 1148 }, { "epoch": 0.46479725640508374, "grad_norm": 0.0059676505625247955, "learning_rate": 5.524126455906822e-07, "logits/chosen": -1.9193265438079834, "logits/rejected": -1.9343922138214111, "logps/chosen": -608.6721801757812, "logps/rejected": -177.2775115966797, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 7.17625093460083, "rewards/margins": 9.041749954223633, "rewards/rejected": -1.865498661994934, "step": 1152 }, { "epoch": 0.46641113576760135, "grad_norm": 0.016520952805876732, "learning_rate": 5.507487520798668e-07, "logits/chosen": -1.9456629753112793, "logits/rejected": -1.9573960304260254, "logps/chosen": -505.503662109375, "logps/rejected": -206.6979522705078, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 6.717719078063965, "rewards/margins": 8.831727981567383, "rewards/rejected": -2.114008665084839, "step": 1156 }, { "epoch": 0.468025015130119, "grad_norm": 0.013905966654419899, "learning_rate": 5.490848585690515e-07, "logits/chosen": -1.9200551509857178, "logits/rejected": -1.9134129285812378, "logps/chosen": -561.5953369140625, "logps/rejected": -202.3114013671875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 6.563882350921631, "rewards/margins": 8.577544212341309, "rewards/rejected": -2.0136613845825195, "step": 1160 }, { "epoch": 0.4696388944926367, "grad_norm": 0.013133925385773182, "learning_rate": 5.474209650582362e-07, "logits/chosen": -1.8991508483886719, "logits/rejected": -1.9046592712402344, "logps/chosen": -579.7406616210938, "logps/rejected": -195.89450073242188, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.6132097244262695, "rewards/margins": 9.597818374633789, "rewards/rejected": -1.9846088886260986, "step": 1164 }, { "epoch": 0.47125277385515435, "grad_norm": 0.011910934932529926, "learning_rate": 5.457570715474209e-07, "logits/chosen": -1.9509276151657104, "logits/rejected": -1.9066928625106812, "logps/chosen": -498.31964111328125, "logps/rejected": -194.478271484375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 6.793839931488037, "rewards/margins": 8.857150077819824, "rewards/rejected": -2.063309907913208, "step": 1168 }, { "epoch": 0.47286665321767196, "grad_norm": 0.009367916733026505, "learning_rate": 5.440931780366056e-07, "logits/chosen": -1.9685680866241455, "logits/rejected": -1.9284205436706543, "logps/chosen": -599.3546142578125, "logps/rejected": -209.75140380859375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.749727249145508, "rewards/margins": 9.881489753723145, "rewards/rejected": -2.1317622661590576, "step": 1172 }, { "epoch": 0.47448053258018963, "grad_norm": 0.13349802792072296, "learning_rate": 5.424292845257903e-07, "logits/chosen": -1.9523228406906128, "logits/rejected": -1.941222906112671, "logps/chosen": -574.6190795898438, "logps/rejected": -189.04368591308594, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 6.953378677368164, "rewards/margins": 8.752217292785645, "rewards/rejected": -1.7988389730453491, "step": 1176 }, { "epoch": 0.4760944119427073, "grad_norm": 0.017886407673358917, "learning_rate": 5.40765391014975e-07, "logits/chosen": -1.9703028202056885, "logits/rejected": -1.9845049381256104, "logps/chosen": -594.593994140625, "logps/rejected": -192.51324462890625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 7.194549560546875, "rewards/margins": 9.022284507751465, "rewards/rejected": -1.8277348279953003, "step": 1180 }, { "epoch": 0.4777082913052249, "grad_norm": 0.012999208644032478, "learning_rate": 5.391014975041597e-07, "logits/chosen": -1.9452145099639893, "logits/rejected": -1.9440656900405884, "logps/chosen": -703.0396728515625, "logps/rejected": -207.8959503173828, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 7.7818145751953125, "rewards/margins": 9.946669578552246, "rewards/rejected": -2.16485595703125, "step": 1184 }, { "epoch": 0.4793221706677426, "grad_norm": 0.006976998411118984, "learning_rate": 5.374376039933444e-07, "logits/chosen": -1.903515100479126, "logits/rejected": -1.9211552143096924, "logps/chosen": -669.50048828125, "logps/rejected": -202.2239532470703, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 7.772323131561279, "rewards/margins": 9.801573753356934, "rewards/rejected": -2.0292513370513916, "step": 1188 }, { "epoch": 0.48093605003026024, "grad_norm": 0.014352737925946712, "learning_rate": 5.357737104825291e-07, "logits/chosen": -1.9408529996871948, "logits/rejected": -1.9513499736785889, "logps/chosen": -649.4365234375, "logps/rejected": -212.72569274902344, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 7.601327419281006, "rewards/margins": 9.825721740722656, "rewards/rejected": -2.2243947982788086, "step": 1192 }, { "epoch": 0.4825499293927779, "grad_norm": 0.0030000568367540836, "learning_rate": 5.341098169717138e-07, "logits/chosen": -1.950404167175293, "logits/rejected": -1.8623133897781372, "logps/chosen": -495.0749206542969, "logps/rejected": -189.71804809570312, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 6.996458530426025, "rewards/margins": 9.031914710998535, "rewards/rejected": -2.0354559421539307, "step": 1196 }, { "epoch": 0.4841638087552955, "grad_norm": 0.03536267206072807, "learning_rate": 5.324459234608985e-07, "logits/chosen": -1.9427766799926758, "logits/rejected": -1.9847636222839355, "logps/chosen": -602.2952880859375, "logps/rejected": -209.5355224609375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 7.168055534362793, "rewards/margins": 9.363693237304688, "rewards/rejected": -2.1956372261047363, "step": 1200 }, { "epoch": 0.4857776881178132, "grad_norm": 0.024555416777729988, "learning_rate": 5.307820299500832e-07, "logits/chosen": -1.907313346862793, "logits/rejected": -1.893943428993225, "logps/chosen": -631.8547973632812, "logps/rejected": -213.51243591308594, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 7.502682209014893, "rewards/margins": 9.8486967086792, "rewards/rejected": -2.346014976501465, "step": 1204 }, { "epoch": 0.48739156748033086, "grad_norm": 0.06726301461458206, "learning_rate": 5.291181364392679e-07, "logits/chosen": -2.0102055072784424, "logits/rejected": -1.9635992050170898, "logps/chosen": -485.38018798828125, "logps/rejected": -177.56785583496094, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 6.212246894836426, "rewards/margins": 8.255290985107422, "rewards/rejected": -2.043044090270996, "step": 1208 }, { "epoch": 0.4890054468428485, "grad_norm": 0.04332156106829643, "learning_rate": 5.274542429284526e-07, "logits/chosen": -2.005838394165039, "logits/rejected": -1.9877393245697021, "logps/chosen": -540.1764526367188, "logps/rejected": -194.23422241210938, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 7.063762187957764, "rewards/margins": 8.88789176940918, "rewards/rejected": -1.8241301774978638, "step": 1212 }, { "epoch": 0.49061932620536614, "grad_norm": 0.011897094547748566, "learning_rate": 5.257903494176373e-07, "logits/chosen": -1.9672061204910278, "logits/rejected": -1.9581787586212158, "logps/chosen": -488.4551086425781, "logps/rejected": -204.0894775390625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 6.825464725494385, "rewards/margins": 9.18692684173584, "rewards/rejected": -2.3614625930786133, "step": 1216 }, { "epoch": 0.4922332055678838, "grad_norm": 0.007602924481034279, "learning_rate": 5.24126455906822e-07, "logits/chosen": -1.9372005462646484, "logits/rejected": -1.8893638849258423, "logps/chosen": -539.2034912109375, "logps/rejected": -188.10464477539062, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 6.523911476135254, "rewards/margins": 8.760072708129883, "rewards/rejected": -2.236161947250366, "step": 1220 }, { "epoch": 0.49384708493040147, "grad_norm": 0.02956073172390461, "learning_rate": 5.224625623960067e-07, "logits/chosen": -1.947015643119812, "logits/rejected": -1.9377124309539795, "logps/chosen": -574.1597290039062, "logps/rejected": -166.86964416503906, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 6.552640438079834, "rewards/margins": 8.370871543884277, "rewards/rejected": -1.8182317018508911, "step": 1224 }, { "epoch": 0.4954609642929191, "grad_norm": 0.014369814656674862, "learning_rate": 5.207986688851913e-07, "logits/chosen": -1.9783180952072144, "logits/rejected": -1.9444252252578735, "logps/chosen": -595.8175659179688, "logps/rejected": -191.223388671875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.262261867523193, "rewards/margins": 9.054800033569336, "rewards/rejected": -1.7925386428833008, "step": 1228 }, { "epoch": 0.49707484365543675, "grad_norm": 0.008710119873285294, "learning_rate": 5.19134775374376e-07, "logits/chosen": -1.917776346206665, "logits/rejected": -1.9099576473236084, "logps/chosen": -585.882568359375, "logps/rejected": -197.810302734375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 6.9187774658203125, "rewards/margins": 9.28205680847168, "rewards/rejected": -2.3632798194885254, "step": 1232 }, { "epoch": 0.4986887230179544, "grad_norm": 0.007549123838543892, "learning_rate": 5.174708818635607e-07, "logits/chosen": -1.9388244152069092, "logits/rejected": -1.9360047578811646, "logps/chosen": -606.5652465820312, "logps/rejected": -200.30340576171875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.336550235748291, "rewards/margins": 9.367645263671875, "rewards/rejected": -2.031094789505005, "step": 1236 }, { "epoch": 0.5003026023804721, "grad_norm": 0.0019197618821635842, "learning_rate": 5.158069883527454e-07, "logits/chosen": -2.013106107711792, "logits/rejected": -1.995920181274414, "logps/chosen": -604.6414184570312, "logps/rejected": -202.77175903320312, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 7.018340110778809, "rewards/margins": 9.097182273864746, "rewards/rejected": -2.0788426399230957, "step": 1240 }, { "epoch": 0.5019164817429898, "grad_norm": 0.022173115983605385, "learning_rate": 5.141430948419301e-07, "logits/chosen": -1.9935420751571655, "logits/rejected": -1.9317840337753296, "logps/chosen": -522.0113525390625, "logps/rejected": -189.1124267578125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 6.5004072189331055, "rewards/margins": 8.311385154724121, "rewards/rejected": -1.8109785318374634, "step": 1244 }, { "epoch": 0.5035303611055074, "grad_norm": 0.03218907117843628, "learning_rate": 5.124792013311148e-07, "logits/chosen": -1.949345588684082, "logits/rejected": -1.9307537078857422, "logps/chosen": -570.9005126953125, "logps/rejected": -169.94290161132812, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 6.907129287719727, "rewards/margins": 8.909610748291016, "rewards/rejected": -2.0024807453155518, "step": 1248 }, { "epoch": 0.505144240468025, "grad_norm": 0.0337279848754406, "learning_rate": 5.108153078202995e-07, "logits/chosen": -1.9562889337539673, "logits/rejected": -1.9268057346343994, "logps/chosen": -647.940185546875, "logps/rejected": -205.54660034179688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 8.02614974975586, "rewards/margins": 10.17542552947998, "rewards/rejected": -2.149275541305542, "step": 1252 }, { "epoch": 0.5067581198305426, "grad_norm": 0.002843159018084407, "learning_rate": 5.091514143094842e-07, "logits/chosen": -1.9529671669006348, "logits/rejected": -1.9340062141418457, "logps/chosen": -604.7838745117188, "logps/rejected": -189.8416290283203, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.30690860748291, "rewards/margins": 10.414636611938477, "rewards/rejected": -2.1077284812927246, "step": 1256 }, { "epoch": 0.5083719991930603, "grad_norm": 0.0013281854335218668, "learning_rate": 5.074875207986689e-07, "logits/chosen": -1.9667465686798096, "logits/rejected": -1.9656091928482056, "logps/chosen": -562.5265502929688, "logps/rejected": -187.34619140625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 6.890307426452637, "rewards/margins": 8.996766090393066, "rewards/rejected": -2.106459140777588, "step": 1260 }, { "epoch": 0.509985878555578, "grad_norm": 0.027299338951706886, "learning_rate": 5.058236272878536e-07, "logits/chosen": -1.9580891132354736, "logits/rejected": -1.9661346673965454, "logps/chosen": -517.7352294921875, "logps/rejected": -193.4201202392578, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 6.969622611999512, "rewards/margins": 8.976411819458008, "rewards/rejected": -2.0067896842956543, "step": 1264 }, { "epoch": 0.5115997579180956, "grad_norm": 0.012945602647960186, "learning_rate": 5.041597337770383e-07, "logits/chosen": -1.9527442455291748, "logits/rejected": -1.9350943565368652, "logps/chosen": -659.712890625, "logps/rejected": -205.51382446289062, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 8.301533699035645, "rewards/margins": 10.780890464782715, "rewards/rejected": -2.4793572425842285, "step": 1268 }, { "epoch": 0.5132136372806133, "grad_norm": 0.007230910938233137, "learning_rate": 5.02495840266223e-07, "logits/chosen": -1.8929927349090576, "logits/rejected": -1.8715476989746094, "logps/chosen": -590.7606201171875, "logps/rejected": -193.90675354003906, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.290658473968506, "rewards/margins": 9.526777267456055, "rewards/rejected": -2.236118793487549, "step": 1272 }, { "epoch": 0.514827516643131, "grad_norm": 0.003982809372246265, "learning_rate": 5.008319467554077e-07, "logits/chosen": -1.8469290733337402, "logits/rejected": -1.9148478507995605, "logps/chosen": -682.81005859375, "logps/rejected": -211.75543212890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.45554256439209, "rewards/margins": 10.918524742126465, "rewards/rejected": -2.4629814624786377, "step": 1276 }, { "epoch": 0.5164413960056485, "grad_norm": 0.03049899823963642, "learning_rate": 4.991680532445923e-07, "logits/chosen": -1.969294786453247, "logits/rejected": -1.9704253673553467, "logps/chosen": -585.3409423828125, "logps/rejected": -196.185546875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 6.862716197967529, "rewards/margins": 8.889430046081543, "rewards/rejected": -2.026712656021118, "step": 1280 }, { "epoch": 0.5180552753681662, "grad_norm": 0.013213871978223324, "learning_rate": 4.97504159733777e-07, "logits/chosen": -1.9419517517089844, "logits/rejected": -1.9093108177185059, "logps/chosen": -607.7613525390625, "logps/rejected": -198.76829528808594, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.276205539703369, "rewards/margins": 9.434825897216797, "rewards/rejected": -2.158620834350586, "step": 1284 }, { "epoch": 0.5196691547306839, "grad_norm": 0.0642031654715538, "learning_rate": 4.958402662229617e-07, "logits/chosen": -1.9744126796722412, "logits/rejected": -1.9536405801773071, "logps/chosen": -585.2025146484375, "logps/rejected": -192.10379028320312, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 6.624017238616943, "rewards/margins": 8.701056480407715, "rewards/rejected": -2.077038526535034, "step": 1288 }, { "epoch": 0.5212830340932015, "grad_norm": 0.08774548023939133, "learning_rate": 4.941763727121464e-07, "logits/chosen": -1.951315999031067, "logits/rejected": -1.9772398471832275, "logps/chosen": -577.1619262695312, "logps/rejected": -196.32496643066406, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 6.471190929412842, "rewards/margins": 8.650944709777832, "rewards/rejected": -2.1797540187835693, "step": 1292 }, { "epoch": 0.5228969134557192, "grad_norm": 0.0057691666297614574, "learning_rate": 4.92512479201331e-07, "logits/chosen": -1.9537123441696167, "logits/rejected": -1.919345736503601, "logps/chosen": -590.9545288085938, "logps/rejected": -202.12318420410156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.454626083374023, "rewards/margins": 9.645831108093262, "rewards/rejected": -2.191206455230713, "step": 1296 }, { "epoch": 0.5245107928182369, "grad_norm": 0.02596125565469265, "learning_rate": 4.908485856905157e-07, "logits/chosen": -1.9519639015197754, "logits/rejected": -1.968557596206665, "logps/chosen": -599.7173461914062, "logps/rejected": -205.1290283203125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 6.965020179748535, "rewards/margins": 8.9757080078125, "rewards/rejected": -2.0106875896453857, "step": 1300 }, { "epoch": 0.5261246721807545, "grad_norm": 0.001736780279316008, "learning_rate": 4.891846921797004e-07, "logits/chosen": -1.948601484298706, "logits/rejected": -1.9828418493270874, "logps/chosen": -645.2648315429688, "logps/rejected": -202.14662170410156, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 7.519781112670898, "rewards/margins": 9.652931213378906, "rewards/rejected": -2.133150100708008, "step": 1304 }, { "epoch": 0.5277385515432721, "grad_norm": 0.0014411529991775751, "learning_rate": 4.875207986688851e-07, "logits/chosen": -1.9495891332626343, "logits/rejected": -1.9074541330337524, "logps/chosen": -565.1793212890625, "logps/rejected": -198.68701171875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 6.8723320960998535, "rewards/margins": 9.201797485351562, "rewards/rejected": -2.329465866088867, "step": 1308 }, { "epoch": 0.5293524309057898, "grad_norm": 0.01626935787498951, "learning_rate": 4.858569051580698e-07, "logits/chosen": -1.898332953453064, "logits/rejected": -1.8943743705749512, "logps/chosen": -667.0679931640625, "logps/rejected": -195.36012268066406, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 8.250835418701172, "rewards/margins": 10.28007984161377, "rewards/rejected": -2.0292422771453857, "step": 1312 }, { "epoch": 0.5309663102683074, "grad_norm": 0.3726865351200104, "learning_rate": 4.841930116472545e-07, "logits/chosen": -1.9537198543548584, "logits/rejected": -1.9023139476776123, "logps/chosen": -516.434326171875, "logps/rejected": -192.5177764892578, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 6.238185405731201, "rewards/margins": 8.341540336608887, "rewards/rejected": -2.1033544540405273, "step": 1316 }, { "epoch": 0.5325801896308251, "grad_norm": 0.0039161499589681625, "learning_rate": 4.825291181364392e-07, "logits/chosen": -1.954939842224121, "logits/rejected": -1.9342536926269531, "logps/chosen": -565.8009033203125, "logps/rejected": -197.71983337402344, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.286872386932373, "rewards/margins": 9.254889488220215, "rewards/rejected": -1.9680171012878418, "step": 1320 }, { "epoch": 0.5341940689933428, "grad_norm": 0.008202419616281986, "learning_rate": 4.808652246256239e-07, "logits/chosen": -1.8977875709533691, "logits/rejected": -1.89401376247406, "logps/chosen": -609.9589233398438, "logps/rejected": -187.70892333984375, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": 7.061158657073975, "rewards/margins": 9.111042022705078, "rewards/rejected": -2.0498833656311035, "step": 1324 }, { "epoch": 0.5358079483558604, "grad_norm": 0.0027778134681284428, "learning_rate": 4.792013311148086e-07, "logits/chosen": -1.958451509475708, "logits/rejected": -1.957456350326538, "logps/chosen": -584.3950805664062, "logps/rejected": -185.71624755859375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 7.320656776428223, "rewards/margins": 9.567909240722656, "rewards/rejected": -2.2472527027130127, "step": 1328 }, { "epoch": 0.5374218277183781, "grad_norm": 0.009780088439583778, "learning_rate": 4.775374376039933e-07, "logits/chosen": -1.9599438905715942, "logits/rejected": -1.932640552520752, "logps/chosen": -526.3729248046875, "logps/rejected": -186.10385131835938, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 6.932224273681641, "rewards/margins": 9.079632759094238, "rewards/rejected": -2.1474077701568604, "step": 1332 }, { "epoch": 0.5390357070808957, "grad_norm": 0.06792566925287247, "learning_rate": 4.7587354409317805e-07, "logits/chosen": -1.940143346786499, "logits/rejected": -1.9098728895187378, "logps/chosen": -599.3258056640625, "logps/rejected": -191.55740356445312, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 7.507044792175293, "rewards/margins": 9.76462459564209, "rewards/rejected": -2.257579803466797, "step": 1336 }, { "epoch": 0.5406495864434133, "grad_norm": 0.014578678645193577, "learning_rate": 4.7420965058236274e-07, "logits/chosen": -1.9539031982421875, "logits/rejected": -1.9509024620056152, "logps/chosen": -561.5401000976562, "logps/rejected": -201.3792724609375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 6.959167003631592, "rewards/margins": 9.324223518371582, "rewards/rejected": -2.365056037902832, "step": 1340 }, { "epoch": 0.542263465805931, "grad_norm": 0.0022002209443598986, "learning_rate": 4.7254575707154744e-07, "logits/chosen": -1.9445898532867432, "logits/rejected": -1.937038540840149, "logps/chosen": -658.0079345703125, "logps/rejected": -181.23806762695312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.357365608215332, "rewards/margins": 10.637049674987793, "rewards/rejected": -2.279684066772461, "step": 1344 }, { "epoch": 0.5438773451684487, "grad_norm": 0.008864504285156727, "learning_rate": 4.7088186356073213e-07, "logits/chosen": -1.9023969173431396, "logits/rejected": -1.8977296352386475, "logps/chosen": -564.1538696289062, "logps/rejected": -190.22662353515625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.088417053222656, "rewards/margins": 9.225637435913086, "rewards/rejected": -2.1372203826904297, "step": 1348 }, { "epoch": 0.5454912245309663, "grad_norm": 0.0720176175236702, "learning_rate": 4.692179700499168e-07, "logits/chosen": -1.9447240829467773, "logits/rejected": -1.9336180686950684, "logps/chosen": -579.311279296875, "logps/rejected": -185.41639709472656, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 7.473271369934082, "rewards/margins": 9.275671005249023, "rewards/rejected": -1.802398920059204, "step": 1352 }, { "epoch": 0.547105103893484, "grad_norm": 0.010311376303434372, "learning_rate": 4.6755407653910147e-07, "logits/chosen": -1.9250704050064087, "logits/rejected": -1.893057107925415, "logps/chosen": -588.669189453125, "logps/rejected": -189.32940673828125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.335265159606934, "rewards/margins": 9.583401679992676, "rewards/rejected": -2.248136043548584, "step": 1356 }, { "epoch": 0.5487189832560017, "grad_norm": 0.0013230983167886734, "learning_rate": 4.6589018302828616e-07, "logits/chosen": -1.9466015100479126, "logits/rejected": -1.9566789865493774, "logps/chosen": -707.5850219726562, "logps/rejected": -195.3712158203125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.986310958862305, "rewards/margins": 10.939123153686523, "rewards/rejected": -1.952812910079956, "step": 1360 }, { "epoch": 0.5503328626185192, "grad_norm": 0.015718746930360794, "learning_rate": 4.6422628951747086e-07, "logits/chosen": -2.005387306213379, "logits/rejected": -1.989922046661377, "logps/chosen": -573.9415283203125, "logps/rejected": -208.6775360107422, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 6.706757545471191, "rewards/margins": 9.11020278930664, "rewards/rejected": -2.403444766998291, "step": 1364 }, { "epoch": 0.5519467419810369, "grad_norm": 0.017633378505706787, "learning_rate": 4.6256239600665555e-07, "logits/chosen": -1.8991397619247437, "logits/rejected": -1.9143304824829102, "logps/chosen": -597.2449951171875, "logps/rejected": -189.9311981201172, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 7.187883377075195, "rewards/margins": 9.203907012939453, "rewards/rejected": -2.016024351119995, "step": 1368 }, { "epoch": 0.5535606213435545, "grad_norm": 0.002281520515680313, "learning_rate": 4.6089850249584025e-07, "logits/chosen": -1.997175931930542, "logits/rejected": -1.9650994539260864, "logps/chosen": -579.8212890625, "logps/rejected": -204.63150024414062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.481656074523926, "rewards/margins": 9.678969383239746, "rewards/rejected": -2.1973142623901367, "step": 1372 }, { "epoch": 0.5551745007060722, "grad_norm": 0.017003647983074188, "learning_rate": 4.5923460898502494e-07, "logits/chosen": -1.9840794801712036, "logits/rejected": -1.9582351446151733, "logps/chosen": -570.366455078125, "logps/rejected": -197.6856231689453, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.212687015533447, "rewards/margins": 9.282402038574219, "rewards/rejected": -2.0697150230407715, "step": 1376 }, { "epoch": 0.5567883800685899, "grad_norm": 0.007630456704646349, "learning_rate": 4.5757071547420964e-07, "logits/chosen": -1.9052666425704956, "logits/rejected": -1.9010013341903687, "logps/chosen": -555.7613525390625, "logps/rejected": -175.1800079345703, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 6.709856986999512, "rewards/margins": 8.832000732421875, "rewards/rejected": -2.122143507003784, "step": 1380 }, { "epoch": 0.5584022594311076, "grad_norm": 0.0013823037734255195, "learning_rate": 4.5590682196339433e-07, "logits/chosen": -1.9302215576171875, "logits/rejected": -1.9297187328338623, "logps/chosen": -637.6644897460938, "logps/rejected": -196.21319580078125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.841482162475586, "rewards/margins": 10.22208309173584, "rewards/rejected": -2.3806025981903076, "step": 1384 }, { "epoch": 0.5600161387936252, "grad_norm": 0.005816981662064791, "learning_rate": 4.54242928452579e-07, "logits/chosen": -1.9178646802902222, "logits/rejected": -1.9598970413208008, "logps/chosen": -561.2465209960938, "logps/rejected": -192.75848388671875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.201777458190918, "rewards/margins": 9.479962348937988, "rewards/rejected": -2.278184413909912, "step": 1388 }, { "epoch": 0.5616300181561428, "grad_norm": 0.06176711246371269, "learning_rate": 4.525790349417637e-07, "logits/chosen": -1.939990520477295, "logits/rejected": -1.9339171648025513, "logps/chosen": -578.4144897460938, "logps/rejected": -196.91139221191406, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 7.606768608093262, "rewards/margins": 9.641931533813477, "rewards/rejected": -2.035162925720215, "step": 1392 }, { "epoch": 0.5632438975186604, "grad_norm": 0.0007918255869299173, "learning_rate": 4.5091514143094836e-07, "logits/chosen": -1.9547683000564575, "logits/rejected": -1.9275414943695068, "logps/chosen": -580.8418579101562, "logps/rejected": -212.38345336914062, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 7.099363327026367, "rewards/margins": 9.581192970275879, "rewards/rejected": -2.4818294048309326, "step": 1396 }, { "epoch": 0.5648577768811781, "grad_norm": 0.014726952649652958, "learning_rate": 4.4925124792013306e-07, "logits/chosen": -1.8796240091323853, "logits/rejected": -1.9181627035140991, "logps/chosen": -593.0827026367188, "logps/rejected": -199.6938934326172, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.630122184753418, "rewards/margins": 9.959226608276367, "rewards/rejected": -2.329104423522949, "step": 1400 }, { "epoch": 0.5664716562436958, "grad_norm": 0.013997410424053669, "learning_rate": 4.4758735440931775e-07, "logits/chosen": -1.9592630863189697, "logits/rejected": -1.9418971538543701, "logps/chosen": -674.487060546875, "logps/rejected": -208.6606903076172, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 7.441813945770264, "rewards/margins": 9.733752250671387, "rewards/rejected": -2.291938543319702, "step": 1404 }, { "epoch": 0.5680855356062134, "grad_norm": 0.010746434330940247, "learning_rate": 4.4592346089850244e-07, "logits/chosen": -1.9371082782745361, "logits/rejected": -1.94197416305542, "logps/chosen": -537.156005859375, "logps/rejected": -191.6694793701172, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 6.150659561157227, "rewards/margins": 8.224425315856934, "rewards/rejected": -2.073765516281128, "step": 1408 }, { "epoch": 0.5696994149687311, "grad_norm": 0.006391435395926237, "learning_rate": 4.4425956738768714e-07, "logits/chosen": -1.909582495689392, "logits/rejected": -1.8921709060668945, "logps/chosen": -582.93310546875, "logps/rejected": -197.23635864257812, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 8.006169319152832, "rewards/margins": 10.002711296081543, "rewards/rejected": -1.9965437650680542, "step": 1412 }, { "epoch": 0.5713132943312488, "grad_norm": 0.003690955461934209, "learning_rate": 4.4259567387687183e-07, "logits/chosen": -1.9849746227264404, "logits/rejected": -1.9632229804992676, "logps/chosen": -561.9823608398438, "logps/rejected": -214.3079376220703, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.777618408203125, "rewards/margins": 9.886162757873535, "rewards/rejected": -2.1085445880889893, "step": 1416 }, { "epoch": 0.5729271736937664, "grad_norm": 0.013788290321826935, "learning_rate": 4.4093178036605653e-07, "logits/chosen": -2.013640880584717, "logits/rejected": -1.9603034257888794, "logps/chosen": -527.65380859375, "logps/rejected": -204.26686096191406, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.099742889404297, "rewards/margins": 9.149202346801758, "rewards/rejected": -2.049459218978882, "step": 1420 }, { "epoch": 0.574541053056284, "grad_norm": 0.009683886542916298, "learning_rate": 4.392678868552412e-07, "logits/chosen": -1.9260016679763794, "logits/rejected": -1.950756549835205, "logps/chosen": -676.360107421875, "logps/rejected": -181.06939697265625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 8.306970596313477, "rewards/margins": 10.35153865814209, "rewards/rejected": -2.044567584991455, "step": 1424 }, { "epoch": 0.5761549324188017, "grad_norm": 0.002083543920889497, "learning_rate": 4.376039933444259e-07, "logits/chosen": -1.9570767879486084, "logits/rejected": -1.9151118993759155, "logps/chosen": -609.766357421875, "logps/rejected": -200.43434143066406, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.604677200317383, "rewards/margins": 9.766128540039062, "rewards/rejected": -2.161451578140259, "step": 1428 }, { "epoch": 0.5777688117813193, "grad_norm": 0.03687971830368042, "learning_rate": 4.359400998336106e-07, "logits/chosen": -1.9646745920181274, "logits/rejected": -1.9536234140396118, "logps/chosen": -588.1460571289062, "logps/rejected": -195.87747192382812, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 6.973261833190918, "rewards/margins": 9.005895614624023, "rewards/rejected": -2.032633066177368, "step": 1432 }, { "epoch": 0.579382691143837, "grad_norm": 0.32595857977867126, "learning_rate": 4.342762063227953e-07, "logits/chosen": -1.9556647539138794, "logits/rejected": -1.908352255821228, "logps/chosen": -541.5323486328125, "logps/rejected": -196.13999938964844, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 7.001401901245117, "rewards/margins": 9.085317611694336, "rewards/rejected": -2.0839149951934814, "step": 1436 }, { "epoch": 0.5809965705063547, "grad_norm": 0.02745467610657215, "learning_rate": 4.3261231281198e-07, "logits/chosen": -1.963482141494751, "logits/rejected": -1.9529272317886353, "logps/chosen": -564.8900756835938, "logps/rejected": -199.538330078125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 7.123093605041504, "rewards/margins": 9.352452278137207, "rewards/rejected": -2.2293577194213867, "step": 1440 }, { "epoch": 0.5826104498688723, "grad_norm": 0.015403863042593002, "learning_rate": 4.309484193011647e-07, "logits/chosen": -1.9797723293304443, "logits/rejected": -1.9551329612731934, "logps/chosen": -651.4147338867188, "logps/rejected": -212.31304931640625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 8.472796440124512, "rewards/margins": 10.581093788146973, "rewards/rejected": -2.1082961559295654, "step": 1444 }, { "epoch": 0.58422432923139, "grad_norm": 0.010284919291734695, "learning_rate": 4.292845257903494e-07, "logits/chosen": -1.9182127714157104, "logits/rejected": -1.905826210975647, "logps/chosen": -605.23388671875, "logps/rejected": -212.91529846191406, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 7.595898628234863, "rewards/margins": 9.507662773132324, "rewards/rejected": -1.911763310432434, "step": 1448 }, { "epoch": 0.5858382085939076, "grad_norm": 0.002170214196667075, "learning_rate": 4.276206322795341e-07, "logits/chosen": -1.9839518070220947, "logits/rejected": -1.9673688411712646, "logps/chosen": -611.668212890625, "logps/rejected": -202.5149383544922, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 7.847504615783691, "rewards/margins": 10.275556564331055, "rewards/rejected": -2.428051233291626, "step": 1452 }, { "epoch": 0.5874520879564252, "grad_norm": 0.00975659117102623, "learning_rate": 4.259567387687188e-07, "logits/chosen": -1.9566844701766968, "logits/rejected": -1.9752178192138672, "logps/chosen": -633.8223876953125, "logps/rejected": -208.3706512451172, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.733653545379639, "rewards/margins": 10.087392807006836, "rewards/rejected": -2.3537395000457764, "step": 1456 }, { "epoch": 0.5890659673189429, "grad_norm": 0.012274743989109993, "learning_rate": 4.2429284525790347e-07, "logits/chosen": -2.017028570175171, "logits/rejected": -1.9762194156646729, "logps/chosen": -533.4309692382812, "logps/rejected": -212.39828491210938, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.138069152832031, "rewards/margins": 9.602746963500977, "rewards/rejected": -2.464677333831787, "step": 1460 }, { "epoch": 0.5906798466814606, "grad_norm": 0.004770489409565926, "learning_rate": 4.2262895174708817e-07, "logits/chosen": -1.9961007833480835, "logits/rejected": -1.9720114469528198, "logps/chosen": -588.2650146484375, "logps/rejected": -196.7570343017578, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 6.852327346801758, "rewards/margins": 8.862354278564453, "rewards/rejected": -2.010026216506958, "step": 1464 }, { "epoch": 0.5922937260439782, "grad_norm": 0.010928248055279255, "learning_rate": 4.2096505823627286e-07, "logits/chosen": -1.9310392141342163, "logits/rejected": -1.8981680870056152, "logps/chosen": -597.4927978515625, "logps/rejected": -205.697021484375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.805450439453125, "rewards/margins": 9.980157852172852, "rewards/rejected": -2.174708127975464, "step": 1468 }, { "epoch": 0.5939076054064959, "grad_norm": 0.008750226348638535, "learning_rate": 4.1930116472545756e-07, "logits/chosen": -1.9012269973754883, "logits/rejected": -1.914914608001709, "logps/chosen": -532.727294921875, "logps/rejected": -189.30914306640625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.676774501800537, "rewards/margins": 9.967604637145996, "rewards/rejected": -2.29082989692688, "step": 1472 }, { "epoch": 0.5955214847690136, "grad_norm": 0.005578696262091398, "learning_rate": 4.1763727121464225e-07, "logits/chosen": -2.0219674110412598, "logits/rejected": -1.9487874507904053, "logps/chosen": -546.4888916015625, "logps/rejected": -198.8837432861328, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 6.794992923736572, "rewards/margins": 8.897412300109863, "rewards/rejected": -2.10241961479187, "step": 1476 }, { "epoch": 0.5971353641315311, "grad_norm": 0.01587824709713459, "learning_rate": 4.1597337770382695e-07, "logits/chosen": -1.9940316677093506, "logits/rejected": -1.9584314823150635, "logps/chosen": -524.4888916015625, "logps/rejected": -190.53671264648438, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 6.969544887542725, "rewards/margins": 9.219322204589844, "rewards/rejected": -2.2497763633728027, "step": 1480 }, { "epoch": 0.5987492434940488, "grad_norm": 0.0014844723045825958, "learning_rate": 4.1430948419301164e-07, "logits/chosen": -1.9650883674621582, "logits/rejected": -1.9727470874786377, "logps/chosen": -656.821044921875, "logps/rejected": -199.7067413330078, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.882815361022949, "rewards/margins": 10.454669952392578, "rewards/rejected": -2.57185435295105, "step": 1484 }, { "epoch": 0.6003631228565665, "grad_norm": 0.041306063532829285, "learning_rate": 4.1264559068219634e-07, "logits/chosen": -1.9464051723480225, "logits/rejected": -1.9081701040267944, "logps/chosen": -582.2799072265625, "logps/rejected": -196.649658203125, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 6.973422050476074, "rewards/margins": 8.837532043457031, "rewards/rejected": -1.864109992980957, "step": 1488 }, { "epoch": 0.6019770022190841, "grad_norm": 0.03607243672013283, "learning_rate": 4.1098169717138103e-07, "logits/chosen": -1.9643349647521973, "logits/rejected": -1.940718650817871, "logps/chosen": -541.8094482421875, "logps/rejected": -191.43667602539062, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 7.26632833480835, "rewards/margins": 9.287495613098145, "rewards/rejected": -2.0211679935455322, "step": 1492 }, { "epoch": 0.6035908815816018, "grad_norm": 0.03503239154815674, "learning_rate": 4.093178036605657e-07, "logits/chosen": -1.9543697834014893, "logits/rejected": -1.935934066772461, "logps/chosen": -596.6967163085938, "logps/rejected": -196.03746032714844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 7.7832841873168945, "rewards/margins": 9.942380905151367, "rewards/rejected": -2.159095287322998, "step": 1496 }, { "epoch": 0.6052047609441195, "grad_norm": 0.017019683495163918, "learning_rate": 4.076539101497504e-07, "logits/chosen": -1.9621638059616089, "logits/rejected": -1.9566978216171265, "logps/chosen": -571.2904663085938, "logps/rejected": -231.3235321044922, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.2054548263549805, "rewards/margins": 9.691427230834961, "rewards/rejected": -2.4859731197357178, "step": 1500 }, { "epoch": 0.6068186403066371, "grad_norm": 0.0007580218371003866, "learning_rate": 4.0599001663893506e-07, "logits/chosen": -1.8777357339859009, "logits/rejected": -1.920845627784729, "logps/chosen": -628.2002563476562, "logps/rejected": -192.279296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.857298851013184, "rewards/margins": 10.144111633300781, "rewards/rejected": -2.2868118286132812, "step": 1504 }, { "epoch": 0.6084325196691547, "grad_norm": 0.001156120328232646, "learning_rate": 4.0432612312811975e-07, "logits/chosen": -1.9376574754714966, "logits/rejected": -1.9204272031784058, "logps/chosen": -616.7096557617188, "logps/rejected": -194.49697875976562, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.572098255157471, "rewards/margins": 9.778945922851562, "rewards/rejected": -2.206847667694092, "step": 1508 }, { "epoch": 0.6100463990316723, "grad_norm": 0.0019476283341646194, "learning_rate": 4.0266222961730445e-07, "logits/chosen": -2.026930809020996, "logits/rejected": -2.0135767459869385, "logps/chosen": -519.3638916015625, "logps/rejected": -211.5479278564453, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.494919776916504, "rewards/margins": 9.908876419067383, "rewards/rejected": -2.413957118988037, "step": 1512 }, { "epoch": 0.61166027839419, "grad_norm": 0.0066610719077289104, "learning_rate": 4.0099833610648914e-07, "logits/chosen": -1.9373143911361694, "logits/rejected": -1.975534200668335, "logps/chosen": -535.5565185546875, "logps/rejected": -182.87664794921875, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": 6.977367877960205, "rewards/margins": 9.10945987701416, "rewards/rejected": -2.132091999053955, "step": 1516 }, { "epoch": 0.6132741577567077, "grad_norm": 0.0035142686683684587, "learning_rate": 3.9933444259567384e-07, "logits/chosen": -1.9264549016952515, "logits/rejected": -2.0035085678100586, "logps/chosen": -644.848388671875, "logps/rejected": -216.68772888183594, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.642140865325928, "rewards/margins": 10.253950119018555, "rewards/rejected": -2.611809253692627, "step": 1520 }, { "epoch": 0.6148880371192254, "grad_norm": 0.0021825111471116543, "learning_rate": 3.9767054908485853e-07, "logits/chosen": -1.9681100845336914, "logits/rejected": -1.9591021537780762, "logps/chosen": -542.2494506835938, "logps/rejected": -221.89599609375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 6.998046875, "rewards/margins": 9.243230819702148, "rewards/rejected": -2.245184898376465, "step": 1524 }, { "epoch": 0.616501916481743, "grad_norm": 0.02958015538752079, "learning_rate": 3.9600665557404323e-07, "logits/chosen": -1.95374596118927, "logits/rejected": -1.928580403327942, "logps/chosen": -496.39886474609375, "logps/rejected": -204.19044494628906, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 6.386816024780273, "rewards/margins": 8.557648658752441, "rewards/rejected": -2.170832395553589, "step": 1528 }, { "epoch": 0.6181157958442607, "grad_norm": 0.0018118283478543162, "learning_rate": 3.943427620632279e-07, "logits/chosen": -2.020399570465088, "logits/rejected": -1.9504778385162354, "logps/chosen": -592.3741455078125, "logps/rejected": -191.3798065185547, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 8.050278663635254, "rewards/margins": 10.20608901977539, "rewards/rejected": -2.155810594558716, "step": 1532 }, { "epoch": 0.6197296752067782, "grad_norm": 0.0067185163497924805, "learning_rate": 3.926788685524126e-07, "logits/chosen": -1.970051646232605, "logits/rejected": -1.987046480178833, "logps/chosen": -498.1391296386719, "logps/rejected": -181.3747100830078, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 6.909139633178711, "rewards/margins": 8.9336576461792, "rewards/rejected": -2.02451753616333, "step": 1536 }, { "epoch": 0.6213435545692959, "grad_norm": 0.01679498516023159, "learning_rate": 3.910149750415973e-07, "logits/chosen": -1.9632422924041748, "logits/rejected": -1.922234296798706, "logps/chosen": -542.36376953125, "logps/rejected": -190.15467834472656, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 7.440287113189697, "rewards/margins": 9.610267639160156, "rewards/rejected": -2.169980764389038, "step": 1540 }, { "epoch": 0.6229574339318136, "grad_norm": 0.0231950581073761, "learning_rate": 3.89351081530782e-07, "logits/chosen": -1.9207338094711304, "logits/rejected": -1.9201709032058716, "logps/chosen": -642.7880859375, "logps/rejected": -202.8985137939453, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.9381794929504395, "rewards/margins": 10.163654327392578, "rewards/rejected": -2.2254762649536133, "step": 1544 }, { "epoch": 0.6245713132943312, "grad_norm": 0.009600454941391945, "learning_rate": 3.876871880199667e-07, "logits/chosen": -2.0107736587524414, "logits/rejected": -1.944657802581787, "logps/chosen": -527.9483032226562, "logps/rejected": -188.3282928466797, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.560681343078613, "rewards/margins": 9.833681106567383, "rewards/rejected": -2.2729992866516113, "step": 1548 }, { "epoch": 0.6261851926568489, "grad_norm": 0.0019395567942410707, "learning_rate": 3.860232945091514e-07, "logits/chosen": -1.9669685363769531, "logits/rejected": -1.9756242036819458, "logps/chosen": -641.6556396484375, "logps/rejected": -213.41371154785156, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 7.262602806091309, "rewards/margins": 10.090603828430176, "rewards/rejected": -2.828001022338867, "step": 1552 }, { "epoch": 0.6277990720193666, "grad_norm": 0.0342312827706337, "learning_rate": 3.843594009983361e-07, "logits/chosen": -1.9875154495239258, "logits/rejected": -1.9113175868988037, "logps/chosen": -572.3963012695312, "logps/rejected": -189.4143829345703, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": 7.0729169845581055, "rewards/margins": 9.09128475189209, "rewards/rejected": -2.018367052078247, "step": 1556 }, { "epoch": 0.6294129513818842, "grad_norm": 0.0024008837062865496, "learning_rate": 3.826955074875208e-07, "logits/chosen": -1.9619370698928833, "logits/rejected": -1.9519468545913696, "logps/chosen": -563.45947265625, "logps/rejected": -203.7288360595703, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.382933616638184, "rewards/margins": 9.858662605285645, "rewards/rejected": -2.475729465484619, "step": 1560 }, { "epoch": 0.6310268307444018, "grad_norm": 0.013503005728125572, "learning_rate": 3.810316139767055e-07, "logits/chosen": -1.9741334915161133, "logits/rejected": -1.9447619915008545, "logps/chosen": -584.9071044921875, "logps/rejected": -195.02061462402344, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 7.567935943603516, "rewards/margins": 10.171335220336914, "rewards/rejected": -2.6033995151519775, "step": 1564 }, { "epoch": 0.6326407101069195, "grad_norm": 0.0008718158933334053, "learning_rate": 3.7936772046589017e-07, "logits/chosen": -1.9355616569519043, "logits/rejected": -1.918622612953186, "logps/chosen": -598.0664672851562, "logps/rejected": -214.01235961914062, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.685476779937744, "rewards/margins": 9.79330825805664, "rewards/rejected": -2.1078310012817383, "step": 1568 }, { "epoch": 0.6342545894694371, "grad_norm": 0.002572049153968692, "learning_rate": 3.7770382695507487e-07, "logits/chosen": -1.9781978130340576, "logits/rejected": -1.9130200147628784, "logps/chosen": -515.694580078125, "logps/rejected": -174.42098999023438, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.580434799194336, "rewards/margins": 9.735032081604004, "rewards/rejected": -2.154597520828247, "step": 1572 }, { "epoch": 0.6358684688319548, "grad_norm": 0.001208022003993392, "learning_rate": 3.7603993344425956e-07, "logits/chosen": -1.9538249969482422, "logits/rejected": -1.9515752792358398, "logps/chosen": -653.6048583984375, "logps/rejected": -213.12400817871094, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 8.256149291992188, "rewards/margins": 10.79405403137207, "rewards/rejected": -2.5379040241241455, "step": 1576 }, { "epoch": 0.6374823481944725, "grad_norm": 0.0009493555407971144, "learning_rate": 3.7437603993344426e-07, "logits/chosen": -1.929785966873169, "logits/rejected": -1.9340308904647827, "logps/chosen": -635.4281616210938, "logps/rejected": -216.3395233154297, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.42003059387207, "rewards/margins": 9.774547576904297, "rewards/rejected": -2.3545167446136475, "step": 1580 }, { "epoch": 0.6390962275569901, "grad_norm": 0.011036033742129803, "learning_rate": 3.7271214642262895e-07, "logits/chosen": -1.9026939868927002, "logits/rejected": -1.896729588508606, "logps/chosen": -533.8150024414062, "logps/rejected": -156.8038330078125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 6.986824035644531, "rewards/margins": 8.91995620727539, "rewards/rejected": -1.9331326484680176, "step": 1584 }, { "epoch": 0.6407101069195078, "grad_norm": 0.0015801065601408482, "learning_rate": 3.7104825291181365e-07, "logits/chosen": -1.9935064315795898, "logits/rejected": -1.9518134593963623, "logps/chosen": -528.8705444335938, "logps/rejected": -181.23451232910156, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 6.829871654510498, "rewards/margins": 8.989469528198242, "rewards/rejected": -2.159597873687744, "step": 1588 }, { "epoch": 0.6423239862820255, "grad_norm": 0.00404201028868556, "learning_rate": 3.6938435940099834e-07, "logits/chosen": -1.9462358951568604, "logits/rejected": -1.9842804670333862, "logps/chosen": -630.5953369140625, "logps/rejected": -200.19039916992188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.027186393737793, "rewards/margins": 10.81678295135498, "rewards/rejected": -2.7895967960357666, "step": 1592 }, { "epoch": 0.643937865644543, "grad_norm": 0.0060095698572695255, "learning_rate": 3.6772046589018303e-07, "logits/chosen": -1.9546937942504883, "logits/rejected": -1.9599016904830933, "logps/chosen": -596.5791625976562, "logps/rejected": -206.882568359375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.5905656814575195, "rewards/margins": 10.005411148071289, "rewards/rejected": -2.414844512939453, "step": 1596 }, { "epoch": 0.6455517450070607, "grad_norm": 0.0010497464099898934, "learning_rate": 3.6605657237936773e-07, "logits/chosen": -1.9726002216339111, "logits/rejected": -1.964287281036377, "logps/chosen": -608.6497192382812, "logps/rejected": -214.85690307617188, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 7.151241302490234, "rewards/margins": 9.343860626220703, "rewards/rejected": -2.192619800567627, "step": 1600 }, { "epoch": 0.6471656243695784, "grad_norm": 0.020462125539779663, "learning_rate": 3.643926788685524e-07, "logits/chosen": -1.9625056982040405, "logits/rejected": -1.9276878833770752, "logps/chosen": -552.4097290039062, "logps/rejected": -193.4833221435547, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 7.6651811599731445, "rewards/margins": 9.72356128692627, "rewards/rejected": -2.058380365371704, "step": 1604 }, { "epoch": 0.648779503732096, "grad_norm": 0.0050386665388941765, "learning_rate": 3.627287853577371e-07, "logits/chosen": -1.9494271278381348, "logits/rejected": -1.9246349334716797, "logps/chosen": -575.5982666015625, "logps/rejected": -195.3052978515625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.672126770019531, "rewards/margins": 9.967022895812988, "rewards/rejected": -2.2948968410491943, "step": 1608 }, { "epoch": 0.6503933830946137, "grad_norm": 0.004401859361678362, "learning_rate": 3.610648918469218e-07, "logits/chosen": -1.9659841060638428, "logits/rejected": -1.9822514057159424, "logps/chosen": -620.5738525390625, "logps/rejected": -202.32373046875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.967751502990723, "rewards/margins": 10.716408729553223, "rewards/rejected": -2.748657703399658, "step": 1612 }, { "epoch": 0.6520072624571314, "grad_norm": 0.0028456689324229956, "learning_rate": 3.594009983361065e-07, "logits/chosen": -1.9387974739074707, "logits/rejected": -1.9488590955734253, "logps/chosen": -704.8046264648438, "logps/rejected": -208.69131469726562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 8.355522155761719, "rewards/margins": 10.569496154785156, "rewards/rejected": -2.2139744758605957, "step": 1616 }, { "epoch": 0.653621141819649, "grad_norm": 0.0035628993064165115, "learning_rate": 3.577371048252912e-07, "logits/chosen": -1.9496828317642212, "logits/rejected": -1.932965636253357, "logps/chosen": -567.5556030273438, "logps/rejected": -184.00502014160156, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 7.177309989929199, "rewards/margins": 9.428170204162598, "rewards/rejected": -2.2508604526519775, "step": 1620 }, { "epoch": 0.6552350211821666, "grad_norm": 0.003707815194502473, "learning_rate": 3.560732113144759e-07, "logits/chosen": -1.9521242380142212, "logits/rejected": -1.9335246086120605, "logps/chosen": -498.704345703125, "logps/rejected": -197.5185546875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 6.413857460021973, "rewards/margins": 8.703349113464355, "rewards/rejected": -2.289492130279541, "step": 1624 }, { "epoch": 0.6568489005446843, "grad_norm": 0.0791642889380455, "learning_rate": 3.544093178036606e-07, "logits/chosen": -1.9859884977340698, "logits/rejected": -1.9800949096679688, "logps/chosen": -567.1405029296875, "logps/rejected": -221.3816680908203, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 7.653709411621094, "rewards/margins": 9.99533462524414, "rewards/rejected": -2.341625452041626, "step": 1628 }, { "epoch": 0.6584627799072019, "grad_norm": 0.0006625878158956766, "learning_rate": 3.527454242928453e-07, "logits/chosen": -1.9628126621246338, "logits/rejected": -1.9207648038864136, "logps/chosen": -634.904296875, "logps/rejected": -192.42601013183594, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.76247501373291, "rewards/margins": 10.40119457244873, "rewards/rejected": -2.638719320297241, "step": 1632 }, { "epoch": 0.6600766592697196, "grad_norm": 0.002643912099301815, "learning_rate": 3.5108153078203e-07, "logits/chosen": -1.8859001398086548, "logits/rejected": -1.8844395875930786, "logps/chosen": -579.4344482421875, "logps/rejected": -194.39186096191406, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.873961448669434, "rewards/margins": 10.145095825195312, "rewards/rejected": -2.2711334228515625, "step": 1636 }, { "epoch": 0.6616905386322373, "grad_norm": 0.011379554867744446, "learning_rate": 3.494176372712147e-07, "logits/chosen": -1.9468789100646973, "logits/rejected": -1.9643763303756714, "logps/chosen": -630.5573120117188, "logps/rejected": -190.74020385742188, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 7.418044090270996, "rewards/margins": 9.336996078491211, "rewards/rejected": -1.9189517498016357, "step": 1640 }, { "epoch": 0.6633044179947549, "grad_norm": 0.0029683669563382864, "learning_rate": 3.477537437603993e-07, "logits/chosen": -1.935317873954773, "logits/rejected": -1.9375979900360107, "logps/chosen": -682.7294311523438, "logps/rejected": -202.67788696289062, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.617460250854492, "rewards/margins": 10.90317440032959, "rewards/rejected": -2.2857141494750977, "step": 1644 }, { "epoch": 0.6649182973572726, "grad_norm": 0.014755886979401112, "learning_rate": 3.46089850249584e-07, "logits/chosen": -1.959398627281189, "logits/rejected": -1.9113410711288452, "logps/chosen": -539.9741821289062, "logps/rejected": -207.39755249023438, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 7.369673252105713, "rewards/margins": 9.914566040039062, "rewards/rejected": -2.544893264770508, "step": 1648 }, { "epoch": 0.6665321767197901, "grad_norm": 0.08310380578041077, "learning_rate": 3.444259567387687e-07, "logits/chosen": -1.9524904489517212, "logits/rejected": -1.9081974029541016, "logps/chosen": -524.7522583007812, "logps/rejected": -220.77105712890625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 6.809507369995117, "rewards/margins": 9.352087020874023, "rewards/rejected": -2.542579174041748, "step": 1652 }, { "epoch": 0.6681460560823078, "grad_norm": 0.0013109538704156876, "learning_rate": 3.4276206322795335e-07, "logits/chosen": -1.9413963556289673, "logits/rejected": -1.9410138130187988, "logps/chosen": -651.90771484375, "logps/rejected": -196.4068603515625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.532693862915039, "rewards/margins": 10.874225616455078, "rewards/rejected": -2.3415327072143555, "step": 1656 }, { "epoch": 0.6697599354448255, "grad_norm": 0.007726287003606558, "learning_rate": 3.4109816971713804e-07, "logits/chosen": -1.9465084075927734, "logits/rejected": -1.8881582021713257, "logps/chosen": -699.4803466796875, "logps/rejected": -194.74969482421875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 8.033940315246582, "rewards/margins": 10.222884178161621, "rewards/rejected": -2.1889455318450928, "step": 1660 }, { "epoch": 0.6713738148073431, "grad_norm": 0.006224425975233316, "learning_rate": 3.3943427620632274e-07, "logits/chosen": -1.95878267288208, "logits/rejected": -1.9226868152618408, "logps/chosen": -502.3583068847656, "logps/rejected": -201.32818603515625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.302216529846191, "rewards/margins": 9.724200248718262, "rewards/rejected": -2.421983003616333, "step": 1664 }, { "epoch": 0.6729876941698608, "grad_norm": 0.001217696233652532, "learning_rate": 3.3777038269550743e-07, "logits/chosen": -1.9176533222198486, "logits/rejected": -1.913426160812378, "logps/chosen": -583.19873046875, "logps/rejected": -194.38922119140625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 8.088021278381348, "rewards/margins": 10.58553409576416, "rewards/rejected": -2.4975130558013916, "step": 1668 }, { "epoch": 0.6746015735323785, "grad_norm": 0.01546825747936964, "learning_rate": 3.361064891846921e-07, "logits/chosen": -1.9677255153656006, "logits/rejected": -1.9447780847549438, "logps/chosen": -508.8048095703125, "logps/rejected": -207.24224853515625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 6.82826042175293, "rewards/margins": 8.934151649475098, "rewards/rejected": -2.105891466140747, "step": 1672 }, { "epoch": 0.6762154528948962, "grad_norm": 0.008401715196669102, "learning_rate": 3.344425956738768e-07, "logits/chosen": -1.9411019086837769, "logits/rejected": -1.9517321586608887, "logps/chosen": -598.6673583984375, "logps/rejected": -210.42568969726562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.929134368896484, "rewards/margins": 10.005777359008789, "rewards/rejected": -2.076643943786621, "step": 1676 }, { "epoch": 0.6778293322574137, "grad_norm": 0.0025178398936986923, "learning_rate": 3.327787021630615e-07, "logits/chosen": -2.0071685314178467, "logits/rejected": -1.99196457862854, "logps/chosen": -481.8070983886719, "logps/rejected": -220.9036102294922, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.214915752410889, "rewards/margins": 9.648110389709473, "rewards/rejected": -2.433194160461426, "step": 1680 }, { "epoch": 0.6794432116199314, "grad_norm": 0.006364191882312298, "learning_rate": 3.311148086522462e-07, "logits/chosen": -1.9390041828155518, "logits/rejected": -1.9267264604568481, "logps/chosen": -534.7975463867188, "logps/rejected": -205.01864624023438, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.877176761627197, "rewards/margins": 10.11354923248291, "rewards/rejected": -2.236372709274292, "step": 1684 }, { "epoch": 0.681057090982449, "grad_norm": 0.0023895467165857553, "learning_rate": 3.294509151414309e-07, "logits/chosen": -1.9313979148864746, "logits/rejected": -1.9259307384490967, "logps/chosen": -646.6519165039062, "logps/rejected": -208.57736206054688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.203030586242676, "rewards/margins": 10.81086540222168, "rewards/rejected": -2.607834577560425, "step": 1688 }, { "epoch": 0.6826709703449667, "grad_norm": 0.10200002044439316, "learning_rate": 3.277870216306156e-07, "logits/chosen": -1.9804837703704834, "logits/rejected": -1.9398927688598633, "logps/chosen": -530.6928100585938, "logps/rejected": -211.68405151367188, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 7.5438385009765625, "rewards/margins": 10.058014869689941, "rewards/rejected": -2.514176368713379, "step": 1692 }, { "epoch": 0.6842848497074844, "grad_norm": 0.004740898963063955, "learning_rate": 3.261231281198003e-07, "logits/chosen": -1.915860891342163, "logits/rejected": -1.9039323329925537, "logps/chosen": -641.7454223632812, "logps/rejected": -180.6741180419922, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 8.100789070129395, "rewards/margins": 9.99792194366455, "rewards/rejected": -1.8971335887908936, "step": 1696 }, { "epoch": 0.685898729070002, "grad_norm": 0.006186998449265957, "learning_rate": 3.24459234608985e-07, "logits/chosen": -1.8984365463256836, "logits/rejected": -1.9369758367538452, "logps/chosen": -706.9638671875, "logps/rejected": -210.36465454101562, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 8.311698913574219, "rewards/margins": 10.707715034484863, "rewards/rejected": -2.3960154056549072, "step": 1700 }, { "epoch": 0.6875126084325197, "grad_norm": 0.001470147049985826, "learning_rate": 3.227953410981697e-07, "logits/chosen": -1.9611146450042725, "logits/rejected": -1.9390037059783936, "logps/chosen": -666.0673828125, "logps/rejected": -202.5988006591797, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.139660835266113, "rewards/margins": 10.558382987976074, "rewards/rejected": -2.418721914291382, "step": 1704 }, { "epoch": 0.6891264877950373, "grad_norm": 0.022144466638565063, "learning_rate": 3.211314475873544e-07, "logits/chosen": -1.945811152458191, "logits/rejected": -1.9745402336120605, "logps/chosen": -552.5719604492188, "logps/rejected": -194.96533203125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.344338893890381, "rewards/margins": 9.44023609161377, "rewards/rejected": -2.0958964824676514, "step": 1708 }, { "epoch": 0.6907403671575549, "grad_norm": 0.0026976538356393576, "learning_rate": 3.1946755407653907e-07, "logits/chosen": -1.9441814422607422, "logits/rejected": -1.9283123016357422, "logps/chosen": -564.6856689453125, "logps/rejected": -207.04730224609375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 8.014627456665039, "rewards/margins": 10.0791597366333, "rewards/rejected": -2.06453275680542, "step": 1712 }, { "epoch": 0.6923542465200726, "grad_norm": 0.011888330802321434, "learning_rate": 3.1780366056572377e-07, "logits/chosen": -1.9020260572433472, "logits/rejected": -1.9635099172592163, "logps/chosen": -614.4972534179688, "logps/rejected": -174.44496154785156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.643667221069336, "rewards/margins": 9.785414695739746, "rewards/rejected": -2.141747236251831, "step": 1716 }, { "epoch": 0.6939681258825903, "grad_norm": 0.0029058153741061687, "learning_rate": 3.1613976705490846e-07, "logits/chosen": -1.9533792734146118, "logits/rejected": -1.9598702192306519, "logps/chosen": -623.0563354492188, "logps/rejected": -193.99876403808594, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 7.845127105712891, "rewards/margins": 10.065107345581055, "rewards/rejected": -2.2199807167053223, "step": 1720 }, { "epoch": 0.6955820052451079, "grad_norm": 0.0037783635780215263, "learning_rate": 3.1447587354409315e-07, "logits/chosen": -1.924607515335083, "logits/rejected": -1.9107089042663574, "logps/chosen": -647.537109375, "logps/rejected": -216.02587890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.867968559265137, "rewards/margins": 11.136098861694336, "rewards/rejected": -2.2681307792663574, "step": 1724 }, { "epoch": 0.6971958846076256, "grad_norm": 0.002819715067744255, "learning_rate": 3.1281198003327785e-07, "logits/chosen": -1.9454185962677002, "logits/rejected": -1.9374178647994995, "logps/chosen": -576.3846435546875, "logps/rejected": -207.88119506835938, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.983126640319824, "rewards/margins": 10.73914623260498, "rewards/rejected": -2.7560200691223145, "step": 1728 }, { "epoch": 0.6988097639701433, "grad_norm": 0.006461744662374258, "learning_rate": 3.1114808652246254e-07, "logits/chosen": -2.033625841140747, "logits/rejected": -1.9815280437469482, "logps/chosen": -535.7724609375, "logps/rejected": -194.44107055664062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.804723262786865, "rewards/margins": 10.383468627929688, "rewards/rejected": -2.5787456035614014, "step": 1732 }, { "epoch": 0.7004236433326609, "grad_norm": 0.0010916158789768815, "learning_rate": 3.0948419301164724e-07, "logits/chosen": -1.9362220764160156, "logits/rejected": -1.938491940498352, "logps/chosen": -577.9946899414062, "logps/rejected": -186.90333557128906, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.620790004730225, "rewards/margins": 9.923973083496094, "rewards/rejected": -2.3031840324401855, "step": 1736 }, { "epoch": 0.7020375226951785, "grad_norm": 0.011661114171147346, "learning_rate": 3.0782029950083193e-07, "logits/chosen": -1.9333622455596924, "logits/rejected": -1.951903223991394, "logps/chosen": -573.6441040039062, "logps/rejected": -199.88247680664062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.738377571105957, "rewards/margins": 10.477256774902344, "rewards/rejected": -2.7388787269592285, "step": 1740 }, { "epoch": 0.7036514020576962, "grad_norm": 0.029399100691080093, "learning_rate": 3.0615640599001663e-07, "logits/chosen": -1.9808193445205688, "logits/rejected": -1.9718278646469116, "logps/chosen": -578.7116088867188, "logps/rejected": -207.44117736816406, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.944240093231201, "rewards/margins": 10.276739120483398, "rewards/rejected": -2.3324997425079346, "step": 1744 }, { "epoch": 0.7052652814202138, "grad_norm": 0.0011915041832253337, "learning_rate": 3.044925124792013e-07, "logits/chosen": -1.9130468368530273, "logits/rejected": -1.9594553709030151, "logps/chosen": -669.271240234375, "logps/rejected": -211.50535583496094, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.530938148498535, "rewards/margins": 10.752389907836914, "rewards/rejected": -2.2214508056640625, "step": 1748 }, { "epoch": 0.7068791607827315, "grad_norm": 0.0014519531978294253, "learning_rate": 3.02828618968386e-07, "logits/chosen": -1.9531059265136719, "logits/rejected": -1.9544100761413574, "logps/chosen": -646.322021484375, "logps/rejected": -180.9946746826172, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 7.754940032958984, "rewards/margins": 10.082914352416992, "rewards/rejected": -2.327974319458008, "step": 1752 }, { "epoch": 0.7084930401452492, "grad_norm": 0.0016000784235075116, "learning_rate": 3.011647254575707e-07, "logits/chosen": -1.9709672927856445, "logits/rejected": -1.9582560062408447, "logps/chosen": -552.463134765625, "logps/rejected": -196.21328735351562, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 7.442378520965576, "rewards/margins": 10.009683609008789, "rewards/rejected": -2.5673046112060547, "step": 1756 }, { "epoch": 0.7101069195077668, "grad_norm": 0.03597128763794899, "learning_rate": 2.995008319467554e-07, "logits/chosen": -1.9734995365142822, "logits/rejected": -1.95762038230896, "logps/chosen": -547.9842529296875, "logps/rejected": -184.13526916503906, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 6.974520683288574, "rewards/margins": 9.279068946838379, "rewards/rejected": -2.3045482635498047, "step": 1760 }, { "epoch": 0.7117207988702845, "grad_norm": 0.03645109012722969, "learning_rate": 2.978369384359401e-07, "logits/chosen": -1.9531971216201782, "logits/rejected": -1.946610450744629, "logps/chosen": -561.1610717773438, "logps/rejected": -193.81549072265625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 7.020261287689209, "rewards/margins": 9.026603698730469, "rewards/rejected": -2.006342649459839, "step": 1764 }, { "epoch": 0.713334678232802, "grad_norm": 0.005250926595181227, "learning_rate": 2.961730449251248e-07, "logits/chosen": -1.947913408279419, "logits/rejected": -1.9582308530807495, "logps/chosen": -655.6384887695312, "logps/rejected": -196.451904296875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.911996841430664, "rewards/margins": 10.476532936096191, "rewards/rejected": -2.5645363330841064, "step": 1768 }, { "epoch": 0.7149485575953197, "grad_norm": 0.016746988520026207, "learning_rate": 2.945091514143095e-07, "logits/chosen": -1.9272820949554443, "logits/rejected": -1.9658348560333252, "logps/chosen": -577.8687744140625, "logps/rejected": -207.48843383789062, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 7.5092854499816895, "rewards/margins": 9.925705909729004, "rewards/rejected": -2.4164206981658936, "step": 1772 }, { "epoch": 0.7165624369578374, "grad_norm": 0.00030140025774016976, "learning_rate": 2.928452579034942e-07, "logits/chosen": -1.9625329971313477, "logits/rejected": -1.9629342555999756, "logps/chosen": -618.767822265625, "logps/rejected": -182.43191528320312, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 7.857381820678711, "rewards/margins": 9.953717231750488, "rewards/rejected": -2.0963354110717773, "step": 1776 }, { "epoch": 0.7181763163203551, "grad_norm": 0.1553896963596344, "learning_rate": 2.911813643926789e-07, "logits/chosen": -1.993658423423767, "logits/rejected": -1.9459822177886963, "logps/chosen": -559.052978515625, "logps/rejected": -194.2206573486328, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 7.16761589050293, "rewards/margins": 9.193378448486328, "rewards/rejected": -2.0257620811462402, "step": 1780 }, { "epoch": 0.7197901956828727, "grad_norm": 0.03358777239918709, "learning_rate": 2.8951747088186357e-07, "logits/chosen": -1.9307273626327515, "logits/rejected": -1.9385383129119873, "logps/chosen": -607.5662841796875, "logps/rejected": -200.5782928466797, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.571842193603516, "rewards/margins": 10.243568420410156, "rewards/rejected": -2.6717264652252197, "step": 1784 }, { "epoch": 0.7214040750453904, "grad_norm": 0.029381267726421356, "learning_rate": 2.8785357737104827e-07, "logits/chosen": -1.9809290170669556, "logits/rejected": -1.9917272329330444, "logps/chosen": -517.4364624023438, "logps/rejected": -199.72918701171875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 7.461383819580078, "rewards/margins": 9.460489273071289, "rewards/rejected": -1.9991047382354736, "step": 1788 }, { "epoch": 0.7230179544079081, "grad_norm": 0.006375699304044247, "learning_rate": 2.8618968386023296e-07, "logits/chosen": -1.9565463066101074, "logits/rejected": -1.9467923641204834, "logps/chosen": -532.6605834960938, "logps/rejected": -181.5381622314453, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 7.028401851654053, "rewards/margins": 9.241705894470215, "rewards/rejected": -2.213303804397583, "step": 1792 }, { "epoch": 0.7246318337704256, "grad_norm": 0.005874668247997761, "learning_rate": 2.8452579034941766e-07, "logits/chosen": -1.9886891841888428, "logits/rejected": -1.9497467279434204, "logps/chosen": -505.4976806640625, "logps/rejected": -203.02169799804688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.455031871795654, "rewards/margins": 9.924569129943848, "rewards/rejected": -2.4695370197296143, "step": 1796 }, { "epoch": 0.7262457131329433, "grad_norm": 0.2523076832294464, "learning_rate": 2.8286189683860235e-07, "logits/chosen": -1.890436053276062, "logits/rejected": -1.9285961389541626, "logps/chosen": -509.0183410644531, "logps/rejected": -185.11463928222656, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 6.608155727386475, "rewards/margins": 8.719098091125488, "rewards/rejected": -2.110943078994751, "step": 1800 }, { "epoch": 0.727859592495461, "grad_norm": 0.014366067945957184, "learning_rate": 2.81198003327787e-07, "logits/chosen": -1.949224591255188, "logits/rejected": -1.9283411502838135, "logps/chosen": -605.2280883789062, "logps/rejected": -208.445068359375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.57573127746582, "rewards/margins": 9.860421180725098, "rewards/rejected": -2.2846908569335938, "step": 1804 }, { "epoch": 0.7294734718579786, "grad_norm": 0.03493184596300125, "learning_rate": 2.795341098169717e-07, "logits/chosen": -1.975151777267456, "logits/rejected": -1.9074333906173706, "logps/chosen": -546.2334594726562, "logps/rejected": -200.51519775390625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 6.984680652618408, "rewards/margins": 9.178604125976562, "rewards/rejected": -2.1939239501953125, "step": 1808 }, { "epoch": 0.7310873512204963, "grad_norm": 0.011917552910745144, "learning_rate": 2.778702163061564e-07, "logits/chosen": -1.9226553440093994, "logits/rejected": -1.953282117843628, "logps/chosen": -613.467529296875, "logps/rejected": -212.33799743652344, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.350959300994873, "rewards/margins": 10.210865020751953, "rewards/rejected": -2.8599061965942383, "step": 1812 }, { "epoch": 0.732701230583014, "grad_norm": 0.013110501691699028, "learning_rate": 2.762063227953411e-07, "logits/chosen": -1.9351096153259277, "logits/rejected": -1.9801685810089111, "logps/chosen": -570.0748291015625, "logps/rejected": -198.3055419921875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.619271278381348, "rewards/margins": 9.963164329528809, "rewards/rejected": -2.343893527984619, "step": 1816 }, { "epoch": 0.7343151099455316, "grad_norm": 0.005511410068720579, "learning_rate": 2.7454242928452577e-07, "logits/chosen": -1.9391082525253296, "logits/rejected": -1.983085036277771, "logps/chosen": -638.9471435546875, "logps/rejected": -197.3665313720703, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.388506889343262, "rewards/margins": 10.682618141174316, "rewards/rejected": -2.294111490249634, "step": 1820 }, { "epoch": 0.7359289893080492, "grad_norm": 0.013145902194082737, "learning_rate": 2.7287853577371047e-07, "logits/chosen": -1.9763342142105103, "logits/rejected": -1.976247787475586, "logps/chosen": -595.8575439453125, "logps/rejected": -193.9748077392578, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.187828063964844, "rewards/margins": 10.763101577758789, "rewards/rejected": -2.5752742290496826, "step": 1824 }, { "epoch": 0.7375428686705668, "grad_norm": 0.005567939020693302, "learning_rate": 2.7121464226289516e-07, "logits/chosen": -1.9504501819610596, "logits/rejected": -1.9483922719955444, "logps/chosen": -551.9839477539062, "logps/rejected": -187.47604370117188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.2904462814331055, "rewards/margins": 9.831924438476562, "rewards/rejected": -2.541478395462036, "step": 1828 }, { "epoch": 0.7391567480330845, "grad_norm": 0.0016265955055132508, "learning_rate": 2.6955074875207985e-07, "logits/chosen": -1.8757381439208984, "logits/rejected": -1.911813735961914, "logps/chosen": -641.2689208984375, "logps/rejected": -209.26988220214844, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.1260347366333, "rewards/margins": 10.647101402282715, "rewards/rejected": -2.5210678577423096, "step": 1832 }, { "epoch": 0.7407706273956022, "grad_norm": 0.0068307374604046345, "learning_rate": 2.6788685524126455e-07, "logits/chosen": -1.9564610719680786, "logits/rejected": -1.9656716585159302, "logps/chosen": -600.02734375, "logps/rejected": -195.36538696289062, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 7.348214149475098, "rewards/margins": 9.680442810058594, "rewards/rejected": -2.3322296142578125, "step": 1836 }, { "epoch": 0.7423845067581198, "grad_norm": 0.0024278033524751663, "learning_rate": 2.6622296173044924e-07, "logits/chosen": -1.926835298538208, "logits/rejected": -1.9318180084228516, "logps/chosen": -606.0884399414062, "logps/rejected": -195.33731079101562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.639714241027832, "rewards/margins": 10.031450271606445, "rewards/rejected": -2.391735553741455, "step": 1840 }, { "epoch": 0.7439983861206375, "grad_norm": 0.013463087379932404, "learning_rate": 2.6455906821963394e-07, "logits/chosen": -1.9510084390640259, "logits/rejected": -1.9014865159988403, "logps/chosen": -488.94549560546875, "logps/rejected": -204.88230895996094, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 6.051938533782959, "rewards/margins": 8.295595169067383, "rewards/rejected": -2.2436563968658447, "step": 1844 }, { "epoch": 0.7456122654831552, "grad_norm": 0.018994154408574104, "learning_rate": 2.6289517470881863e-07, "logits/chosen": -1.966989517211914, "logits/rejected": -1.9596519470214844, "logps/chosen": -508.9800109863281, "logps/rejected": -183.09739685058594, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.205238342285156, "rewards/margins": 9.550969123840332, "rewards/rejected": -2.345731019973755, "step": 1848 }, { "epoch": 0.7472261448456727, "grad_norm": 0.0015017056139186025, "learning_rate": 2.6123128119800333e-07, "logits/chosen": -1.9697265625, "logits/rejected": -1.9608089923858643, "logps/chosen": -600.2505493164062, "logps/rejected": -188.92990112304688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 8.40201473236084, "rewards/margins": 10.999792098999023, "rewards/rejected": -2.5977766513824463, "step": 1852 }, { "epoch": 0.7488400242081904, "grad_norm": 0.025756552815437317, "learning_rate": 2.59567387687188e-07, "logits/chosen": -1.9434473514556885, "logits/rejected": -1.9223166704177856, "logps/chosen": -527.55419921875, "logps/rejected": -192.0076904296875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 7.036865711212158, "rewards/margins": 9.277807235717773, "rewards/rejected": -2.240941047668457, "step": 1856 }, { "epoch": 0.7504539035707081, "grad_norm": 0.0053466921672225, "learning_rate": 2.579034941763727e-07, "logits/chosen": -1.9091438055038452, "logits/rejected": -1.8749792575836182, "logps/chosen": -707.8280639648438, "logps/rejected": -209.68296813964844, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.606313705444336, "rewards/margins": 11.093856811523438, "rewards/rejected": -2.4875423908233643, "step": 1860 }, { "epoch": 0.7520677829332257, "grad_norm": 0.035423461347818375, "learning_rate": 2.562396006655574e-07, "logits/chosen": -1.9863337278366089, "logits/rejected": -1.9855589866638184, "logps/chosen": -490.1580810546875, "logps/rejected": -213.1865997314453, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.163394927978516, "rewards/margins": 9.447924613952637, "rewards/rejected": -2.284529209136963, "step": 1864 }, { "epoch": 0.7536816622957434, "grad_norm": 0.006776235532015562, "learning_rate": 2.545757071547421e-07, "logits/chosen": -1.9644336700439453, "logits/rejected": -1.936968207359314, "logps/chosen": -593.2294311523438, "logps/rejected": -189.1863250732422, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.9965691566467285, "rewards/margins": 10.208352088928223, "rewards/rejected": -2.2117819786071777, "step": 1868 }, { "epoch": 0.7552955416582611, "grad_norm": 0.031014198437333107, "learning_rate": 2.529118136439268e-07, "logits/chosen": -1.9405014514923096, "logits/rejected": -1.9240939617156982, "logps/chosen": -528.7400512695312, "logps/rejected": -195.2759552001953, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 6.619192123413086, "rewards/margins": 8.869572639465332, "rewards/rejected": -2.250379800796509, "step": 1872 }, { "epoch": 0.7569094210207787, "grad_norm": 0.0015364313730970025, "learning_rate": 2.512479201331115e-07, "logits/chosen": -1.9636163711547852, "logits/rejected": -1.913517951965332, "logps/chosen": -576.7318115234375, "logps/rejected": -218.53689575195312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.12057113647461, "rewards/margins": 10.925397872924805, "rewards/rejected": -2.8048253059387207, "step": 1876 }, { "epoch": 0.7585233003832963, "grad_norm": 0.0027513715904206038, "learning_rate": 2.4958402662229614e-07, "logits/chosen": -1.953940987586975, "logits/rejected": -1.9102489948272705, "logps/chosen": -668.67529296875, "logps/rejected": -201.53762817382812, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.9400153160095215, "rewards/margins": 10.254171371459961, "rewards/rejected": -2.3141555786132812, "step": 1880 }, { "epoch": 0.760137179745814, "grad_norm": 0.11782655864953995, "learning_rate": 2.4792013311148083e-07, "logits/chosen": -2.0011777877807617, "logits/rejected": -1.9810110330581665, "logps/chosen": -545.54736328125, "logps/rejected": -195.3970947265625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 6.600249767303467, "rewards/margins": 8.999195098876953, "rewards/rejected": -2.3989455699920654, "step": 1884 }, { "epoch": 0.7617510591083316, "grad_norm": 0.053934432566165924, "learning_rate": 2.462562396006655e-07, "logits/chosen": -1.948798656463623, "logits/rejected": -1.9471601247787476, "logps/chosen": -518.640625, "logps/rejected": -208.00588989257812, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 7.031424522399902, "rewards/margins": 9.666607856750488, "rewards/rejected": -2.6351828575134277, "step": 1888 }, { "epoch": 0.7633649384708493, "grad_norm": 0.018933264538645744, "learning_rate": 2.445923460898502e-07, "logits/chosen": -1.9453850984573364, "logits/rejected": -1.9066615104675293, "logps/chosen": -555.6270141601562, "logps/rejected": -206.28472900390625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.320058822631836, "rewards/margins": 9.790708541870117, "rewards/rejected": -2.4706506729125977, "step": 1892 }, { "epoch": 0.764978817833367, "grad_norm": 0.009022870101034641, "learning_rate": 2.429284525790349e-07, "logits/chosen": -1.9207005500793457, "logits/rejected": -1.914454698562622, "logps/chosen": -565.1117553710938, "logps/rejected": -213.18441772460938, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 7.464051246643066, "rewards/margins": 10.378478050231934, "rewards/rejected": -2.914426803588867, "step": 1896 }, { "epoch": 0.7665926971958846, "grad_norm": 0.06343423575162888, "learning_rate": 2.412645590682196e-07, "logits/chosen": -1.9563437700271606, "logits/rejected": -1.9729256629943848, "logps/chosen": -585.6924438476562, "logps/rejected": -199.35057067871094, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 7.630346775054932, "rewards/margins": 10.187692642211914, "rewards/rejected": -2.557345390319824, "step": 1900 }, { "epoch": 0.7682065765584023, "grad_norm": 0.0028299777768552303, "learning_rate": 2.396006655574043e-07, "logits/chosen": -1.9621015787124634, "logits/rejected": -1.9212448596954346, "logps/chosen": -618.6233520507812, "logps/rejected": -207.92929077148438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 8.249571800231934, "rewards/margins": 10.443973541259766, "rewards/rejected": -2.194401264190674, "step": 1904 }, { "epoch": 0.76982045592092, "grad_norm": 0.0014983549481257796, "learning_rate": 2.3793677204658902e-07, "logits/chosen": -1.9652888774871826, "logits/rejected": -1.9092050790786743, "logps/chosen": -542.7032470703125, "logps/rejected": -209.64608764648438, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 6.979635238647461, "rewards/margins": 9.180910110473633, "rewards/rejected": -2.201274871826172, "step": 1908 }, { "epoch": 0.7714343352834375, "grad_norm": 0.003133055055513978, "learning_rate": 2.3627287853577372e-07, "logits/chosen": -1.9702494144439697, "logits/rejected": -1.9469213485717773, "logps/chosen": -545.337890625, "logps/rejected": -209.71682739257812, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 7.249837875366211, "rewards/margins": 9.743078231811523, "rewards/rejected": -2.4932401180267334, "step": 1912 }, { "epoch": 0.7730482146459552, "grad_norm": 0.11457090079784393, "learning_rate": 2.346089850249584e-07, "logits/chosen": -1.9801099300384521, "logits/rejected": -1.9334946870803833, "logps/chosen": -444.1725769042969, "logps/rejected": -198.5611572265625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 7.029510498046875, "rewards/margins": 9.361377716064453, "rewards/rejected": -2.33186674118042, "step": 1916 }, { "epoch": 0.7746620940084729, "grad_norm": 0.002445167861878872, "learning_rate": 2.3294509151414308e-07, "logits/chosen": -1.9645049571990967, "logits/rejected": -1.9583840370178223, "logps/chosen": -535.0956420898438, "logps/rejected": -203.62191772460938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.702035427093506, "rewards/margins": 10.314143180847168, "rewards/rejected": -2.612107753753662, "step": 1920 }, { "epoch": 0.7762759733709905, "grad_norm": 0.007241574581712484, "learning_rate": 2.3128119800332778e-07, "logits/chosen": -1.9355313777923584, "logits/rejected": -1.9828592538833618, "logps/chosen": -536.7160034179688, "logps/rejected": -207.3948974609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.619925022125244, "rewards/margins": 10.320430755615234, "rewards/rejected": -2.7005057334899902, "step": 1924 }, { "epoch": 0.7778898527335082, "grad_norm": 0.0032174973748624325, "learning_rate": 2.2961730449251247e-07, "logits/chosen": -2.0111074447631836, "logits/rejected": -1.9810136556625366, "logps/chosen": -526.0936279296875, "logps/rejected": -209.0682830810547, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 7.557690620422363, "rewards/margins": 10.266834259033203, "rewards/rejected": -2.7091424465179443, "step": 1928 }, { "epoch": 0.7795037320960259, "grad_norm": 0.05227642133831978, "learning_rate": 2.2795341098169716e-07, "logits/chosen": -1.9407751560211182, "logits/rejected": -1.9610675573349, "logps/chosen": -648.305908203125, "logps/rejected": -206.0720977783203, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 7.7418904304504395, "rewards/margins": 9.726991653442383, "rewards/rejected": -1.9850995540618896, "step": 1932 }, { "epoch": 0.7811176114585435, "grad_norm": 0.002464055782184005, "learning_rate": 2.2628951747088186e-07, "logits/chosen": -2.0140557289123535, "logits/rejected": -1.9942609071731567, "logps/chosen": -635.303466796875, "logps/rejected": -205.43392944335938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 8.04512882232666, "rewards/margins": 10.92768669128418, "rewards/rejected": -2.882556915283203, "step": 1936 }, { "epoch": 0.7827314908210611, "grad_norm": 0.009504868648946285, "learning_rate": 2.2462562396006653e-07, "logits/chosen": -1.9455581903457642, "logits/rejected": -1.959303379058838, "logps/chosen": -546.1251831054688, "logps/rejected": -216.9942169189453, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.429033279418945, "rewards/margins": 9.839219093322754, "rewards/rejected": -2.4101858139038086, "step": 1940 }, { "epoch": 0.7843453701835787, "grad_norm": 0.0087587870657444, "learning_rate": 2.2296173044925122e-07, "logits/chosen": -1.9632997512817383, "logits/rejected": -1.9460232257843018, "logps/chosen": -588.8814086914062, "logps/rejected": -197.31082153320312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.485593318939209, "rewards/margins": 9.966062545776367, "rewards/rejected": -2.4804694652557373, "step": 1944 }, { "epoch": 0.7859592495460964, "grad_norm": 0.024040473625063896, "learning_rate": 2.2129783693843592e-07, "logits/chosen": -1.9922959804534912, "logits/rejected": -1.9338749647140503, "logps/chosen": -573.9337768554688, "logps/rejected": -198.42327880859375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 8.005708694458008, "rewards/margins": 10.429488182067871, "rewards/rejected": -2.4237794876098633, "step": 1948 }, { "epoch": 0.7875731289086141, "grad_norm": 0.022323206067085266, "learning_rate": 2.196339434276206e-07, "logits/chosen": -2.0031442642211914, "logits/rejected": -1.9420980215072632, "logps/chosen": -519.3341674804688, "logps/rejected": -183.11935424804688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.268186569213867, "rewards/margins": 9.77748966217041, "rewards/rejected": -2.5093038082122803, "step": 1952 }, { "epoch": 0.7891870082711318, "grad_norm": 0.002867747563868761, "learning_rate": 2.179700499168053e-07, "logits/chosen": -1.9586167335510254, "logits/rejected": -1.9409246444702148, "logps/chosen": -620.181396484375, "logps/rejected": -215.18853759765625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 7.615111351013184, "rewards/margins": 10.157381057739258, "rewards/rejected": -2.542269706726074, "step": 1956 }, { "epoch": 0.7908008876336494, "grad_norm": 0.004164470359683037, "learning_rate": 2.1630615640599e-07, "logits/chosen": -2.001955032348633, "logits/rejected": -1.9889142513275146, "logps/chosen": -496.8046569824219, "logps/rejected": -218.08035278320312, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 7.240952014923096, "rewards/margins": 10.178262710571289, "rewards/rejected": -2.937311887741089, "step": 1960 }, { "epoch": 0.7924147669961671, "grad_norm": 0.0019759840797632933, "learning_rate": 2.146422628951747e-07, "logits/chosen": -1.9873361587524414, "logits/rejected": -2.0003232955932617, "logps/chosen": -625.7353515625, "logps/rejected": -193.87255859375, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 7.579718589782715, "rewards/margins": 9.803082466125488, "rewards/rejected": -2.2233643531799316, "step": 1964 }, { "epoch": 0.7940286463586846, "grad_norm": 0.01195582002401352, "learning_rate": 2.129783693843594e-07, "logits/chosen": -1.9543282985687256, "logits/rejected": -1.9298789501190186, "logps/chosen": -570.7440185546875, "logps/rejected": -186.3670654296875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.313053607940674, "rewards/margins": 9.657898902893066, "rewards/rejected": -2.3448445796966553, "step": 1968 }, { "epoch": 0.7956425257212023, "grad_norm": 0.003642721800133586, "learning_rate": 2.1131447587354408e-07, "logits/chosen": -1.8858375549316406, "logits/rejected": -1.8799386024475098, "logps/chosen": -636.21826171875, "logps/rejected": -197.64332580566406, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.57403564453125, "rewards/margins": 11.170844078063965, "rewards/rejected": -2.5968079566955566, "step": 1972 }, { "epoch": 0.79725640508372, "grad_norm": 0.03533411771059036, "learning_rate": 2.0965058236272878e-07, "logits/chosen": -1.9955341815948486, "logits/rejected": -1.9740753173828125, "logps/chosen": -542.4580078125, "logps/rejected": -192.6697998046875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 7.302762031555176, "rewards/margins": 9.781347274780273, "rewards/rejected": -2.4785845279693604, "step": 1976 }, { "epoch": 0.7988702844462376, "grad_norm": 0.006252410355955362, "learning_rate": 2.0798668885191347e-07, "logits/chosen": -1.9404999017715454, "logits/rejected": -1.9455125331878662, "logps/chosen": -551.4974975585938, "logps/rejected": -217.16610717773438, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 8.010839462280273, "rewards/margins": 10.337525367736816, "rewards/rejected": -2.3266868591308594, "step": 1980 }, { "epoch": 0.8004841638087553, "grad_norm": 0.016545388847589493, "learning_rate": 2.0632279534109817e-07, "logits/chosen": -1.95366632938385, "logits/rejected": -1.9709562063217163, "logps/chosen": -517.9901123046875, "logps/rejected": -190.70379638671875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.0026960372924805, "rewards/margins": 9.563921928405762, "rewards/rejected": -2.5612263679504395, "step": 1984 }, { "epoch": 0.802098043171273, "grad_norm": 0.00023572196369059384, "learning_rate": 2.0465890183028286e-07, "logits/chosen": -1.9703092575073242, "logits/rejected": -1.9809660911560059, "logps/chosen": -621.3912353515625, "logps/rejected": -213.7008819580078, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.920166015625, "rewards/margins": 11.512845039367676, "rewards/rejected": -2.592679023742676, "step": 1988 }, { "epoch": 0.8037119225337906, "grad_norm": 0.0008282049675472081, "learning_rate": 2.0299500831946753e-07, "logits/chosen": -1.9545385837554932, "logits/rejected": -1.9401062726974487, "logps/chosen": -656.6465454101562, "logps/rejected": -218.62916564941406, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.902204513549805, "rewards/margins": 11.592126846313477, "rewards/rejected": -2.6899220943450928, "step": 1992 }, { "epoch": 0.8053258018963082, "grad_norm": 0.008635669946670532, "learning_rate": 2.0133111480865222e-07, "logits/chosen": -1.9920576810836792, "logits/rejected": -1.9762872457504272, "logps/chosen": -589.9715576171875, "logps/rejected": -188.923583984375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 8.307101249694824, "rewards/margins": 10.62148380279541, "rewards/rejected": -2.314383029937744, "step": 1996 }, { "epoch": 0.8069396812588259, "grad_norm": 0.0027046434115618467, "learning_rate": 1.9966722129783692e-07, "logits/chosen": -1.9346226453781128, "logits/rejected": -1.961793065071106, "logps/chosen": -595.6929321289062, "logps/rejected": -178.39828491210938, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 7.673994541168213, "rewards/margins": 9.999893188476562, "rewards/rejected": -2.3258984088897705, "step": 2000 }, { "epoch": 0.8085535606213435, "grad_norm": 0.017349133267998695, "learning_rate": 1.9800332778702161e-07, "logits/chosen": -1.9548275470733643, "logits/rejected": -1.9495638608932495, "logps/chosen": -573.5785522460938, "logps/rejected": -193.32241821289062, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.809907913208008, "rewards/margins": 10.160087585449219, "rewards/rejected": -2.3501791954040527, "step": 2004 }, { "epoch": 0.8101674399838612, "grad_norm": 0.007145206443965435, "learning_rate": 1.963394342762063e-07, "logits/chosen": -1.9010958671569824, "logits/rejected": -1.9414005279541016, "logps/chosen": -534.0484008789062, "logps/rejected": -186.56204223632812, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 7.285098075866699, "rewards/margins": 9.373290061950684, "rewards/rejected": -2.088190793991089, "step": 2008 }, { "epoch": 0.8117813193463789, "grad_norm": 0.006121462676674128, "learning_rate": 1.94675540765391e-07, "logits/chosen": -2.0103018283843994, "logits/rejected": -1.9858275651931763, "logps/chosen": -569.8963623046875, "logps/rejected": -189.00450134277344, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.353415489196777, "rewards/margins": 10.560012817382812, "rewards/rejected": -2.2065975666046143, "step": 2012 }, { "epoch": 0.8133951987088965, "grad_norm": 0.0014833149034529924, "learning_rate": 1.930116472545757e-07, "logits/chosen": -1.9774360656738281, "logits/rejected": -1.9670798778533936, "logps/chosen": -533.6181640625, "logps/rejected": -207.80685424804688, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 7.424815654754639, "rewards/margins": 9.869003295898438, "rewards/rejected": -2.444188117980957, "step": 2016 }, { "epoch": 0.8150090780714142, "grad_norm": 0.014652105048298836, "learning_rate": 1.913477537437604e-07, "logits/chosen": -1.945785403251648, "logits/rejected": -1.9302518367767334, "logps/chosen": -617.7239990234375, "logps/rejected": -195.8604736328125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 8.680196762084961, "rewards/margins": 11.229948997497559, "rewards/rejected": -2.5497524738311768, "step": 2020 }, { "epoch": 0.8166229574339318, "grad_norm": 0.0005591234657913446, "learning_rate": 1.8968386023294509e-07, "logits/chosen": -1.9698848724365234, "logits/rejected": -1.9268821477890015, "logps/chosen": -609.9744873046875, "logps/rejected": -213.9993896484375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 7.975358963012695, "rewards/margins": 10.388092041015625, "rewards/rejected": -2.4127326011657715, "step": 2024 }, { "epoch": 0.8182368367964494, "grad_norm": 0.0014562932774424553, "learning_rate": 1.8801996672212978e-07, "logits/chosen": -1.9310849905014038, "logits/rejected": -1.9672877788543701, "logps/chosen": -667.3247680664062, "logps/rejected": -187.92440795898438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.801055908203125, "rewards/margins": 11.215276718139648, "rewards/rejected": -2.4142205715179443, "step": 2028 }, { "epoch": 0.8198507161589671, "grad_norm": 0.007202428299933672, "learning_rate": 1.8635607321131448e-07, "logits/chosen": -1.9838364124298096, "logits/rejected": -1.9776476621627808, "logps/chosen": -537.66455078125, "logps/rejected": -209.4038543701172, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 6.926480770111084, "rewards/margins": 9.648280143737793, "rewards/rejected": -2.721799373626709, "step": 2032 }, { "epoch": 0.8214645955214848, "grad_norm": 0.010928391478955746, "learning_rate": 1.8469217970049917e-07, "logits/chosen": -1.988613247871399, "logits/rejected": -1.9678013324737549, "logps/chosen": -545.6563720703125, "logps/rejected": -219.39450073242188, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 7.540011405944824, "rewards/margins": 10.028406143188477, "rewards/rejected": -2.4883947372436523, "step": 2036 }, { "epoch": 0.8230784748840024, "grad_norm": 0.00970606692135334, "learning_rate": 1.8302828618968386e-07, "logits/chosen": -1.945348858833313, "logits/rejected": -1.9284907579421997, "logps/chosen": -621.3656616210938, "logps/rejected": -203.38320922851562, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.735448837280273, "rewards/margins": 10.233053207397461, "rewards/rejected": -2.4976043701171875, "step": 2040 }, { "epoch": 0.8246923542465201, "grad_norm": 0.0023108399473130703, "learning_rate": 1.8136439267886856e-07, "logits/chosen": -1.9376139640808105, "logits/rejected": -1.9286924600601196, "logps/chosen": -633.24609375, "logps/rejected": -217.78997802734375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 8.716198921203613, "rewards/margins": 11.40963077545166, "rewards/rejected": -2.6934309005737305, "step": 2044 }, { "epoch": 0.8263062336090378, "grad_norm": 0.00025937031023204327, "learning_rate": 1.7970049916805325e-07, "logits/chosen": -1.985826015472412, "logits/rejected": -1.9743854999542236, "logps/chosen": -549.8696899414062, "logps/rejected": -187.05908203125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.104022026062012, "rewards/margins": 10.61567211151123, "rewards/rejected": -2.511650323867798, "step": 2048 }, { "epoch": 0.8279201129715553, "grad_norm": 0.000746559293475002, "learning_rate": 1.7803660565723795e-07, "logits/chosen": -1.9360309839248657, "logits/rejected": -1.9413515329360962, "logps/chosen": -646.6172485351562, "logps/rejected": -194.09669494628906, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 8.389915466308594, "rewards/margins": 11.157281875610352, "rewards/rejected": -2.7673656940460205, "step": 2052 }, { "epoch": 0.829533992334073, "grad_norm": 0.014698189683258533, "learning_rate": 1.7637271214642264e-07, "logits/chosen": -1.9239870309829712, "logits/rejected": -1.9567885398864746, "logps/chosen": -537.6549072265625, "logps/rejected": -208.8583221435547, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 6.8745622634887695, "rewards/margins": 9.590612411499023, "rewards/rejected": -2.7160496711730957, "step": 2056 }, { "epoch": 0.8311478716965907, "grad_norm": 0.0006518694572150707, "learning_rate": 1.7470881863560734e-07, "logits/chosen": -1.9858269691467285, "logits/rejected": -1.958023190498352, "logps/chosen": -666.1148071289062, "logps/rejected": -208.99021911621094, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 8.670503616333008, "rewards/margins": 11.216609001159668, "rewards/rejected": -2.5461058616638184, "step": 2060 }, { "epoch": 0.8327617510591083, "grad_norm": 0.0025312250945717096, "learning_rate": 1.73044925124792e-07, "logits/chosen": -1.9700847864151, "logits/rejected": -1.9759966135025024, "logps/chosen": -593.8184204101562, "logps/rejected": -217.93463134765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.334147453308105, "rewards/margins": 11.190747261047363, "rewards/rejected": -2.856599807739258, "step": 2064 }, { "epoch": 0.834375630421626, "grad_norm": 0.015584556385874748, "learning_rate": 1.7138103161397667e-07, "logits/chosen": -1.9414355754852295, "logits/rejected": -1.9426562786102295, "logps/chosen": -601.4395751953125, "logps/rejected": -180.1129913330078, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.885752201080322, "rewards/margins": 10.253803253173828, "rewards/rejected": -2.368051528930664, "step": 2068 }, { "epoch": 0.8359895097841437, "grad_norm": 0.01497507281601429, "learning_rate": 1.6971713810316137e-07, "logits/chosen": -1.984220027923584, "logits/rejected": -1.9599052667617798, "logps/chosen": -609.5572509765625, "logps/rejected": -193.11981201171875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.910809516906738, "rewards/margins": 10.4951171875, "rewards/rejected": -2.5843076705932617, "step": 2072 }, { "epoch": 0.8376033891466613, "grad_norm": 0.0005057248054072261, "learning_rate": 1.6805324459234606e-07, "logits/chosen": -1.9362083673477173, "logits/rejected": -1.9358729124069214, "logps/chosen": -665.6556396484375, "logps/rejected": -203.93296813964844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 8.062862396240234, "rewards/margins": 10.489458084106445, "rewards/rejected": -2.426595687866211, "step": 2076 }, { "epoch": 0.839217268509179, "grad_norm": 0.004167359322309494, "learning_rate": 1.6638935108153076e-07, "logits/chosen": -1.9411249160766602, "logits/rejected": -1.9376909732818604, "logps/chosen": -608.2251586914062, "logps/rejected": -198.4532012939453, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.744987487792969, "rewards/margins": 9.987955093383789, "rewards/rejected": -2.242967367172241, "step": 2080 }, { "epoch": 0.8408311478716965, "grad_norm": 0.0035164530854672194, "learning_rate": 1.6472545757071545e-07, "logits/chosen": -2.022171974182129, "logits/rejected": -1.9883896112442017, "logps/chosen": -492.74407958984375, "logps/rejected": -213.33773803710938, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 7.355404376983643, "rewards/margins": 9.91620922088623, "rewards/rejected": -2.560805320739746, "step": 2084 }, { "epoch": 0.8424450272342142, "grad_norm": 0.0052895089611411095, "learning_rate": 1.6306156405990015e-07, "logits/chosen": -1.9119035005569458, "logits/rejected": -1.9103432893753052, "logps/chosen": -554.06298828125, "logps/rejected": -213.87965393066406, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 7.613840103149414, "rewards/margins": 9.995025634765625, "rewards/rejected": -2.381186008453369, "step": 2088 }, { "epoch": 0.8440589065967319, "grad_norm": 0.0022597569040954113, "learning_rate": 1.6139767054908484e-07, "logits/chosen": -2.0156607627868652, "logits/rejected": -1.989465355873108, "logps/chosen": -548.3692016601562, "logps/rejected": -185.21104431152344, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 8.062376022338867, "rewards/margins": 10.42442512512207, "rewards/rejected": -2.3620495796203613, "step": 2092 }, { "epoch": 0.8456727859592496, "grad_norm": 0.0017370128771290183, "learning_rate": 1.5973377703826954e-07, "logits/chosen": -1.9228050708770752, "logits/rejected": -1.9412509202957153, "logps/chosen": -698.7863159179688, "logps/rejected": -187.14370727539062, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.458852767944336, "rewards/margins": 10.95190143585205, "rewards/rejected": -2.4930496215820312, "step": 2096 }, { "epoch": 0.8472866653217672, "grad_norm": 0.002115788869559765, "learning_rate": 1.5806988352745423e-07, "logits/chosen": -1.944624662399292, "logits/rejected": -1.9676543474197388, "logps/chosen": -633.0092163085938, "logps/rejected": -203.33547973632812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 8.35001277923584, "rewards/margins": 10.931876182556152, "rewards/rejected": -2.581862211227417, "step": 2100 }, { "epoch": 0.8489005446842849, "grad_norm": 0.029535211622714996, "learning_rate": 1.5640599001663892e-07, "logits/chosen": -1.9474557638168335, "logits/rejected": -1.9326255321502686, "logps/chosen": -544.145751953125, "logps/rejected": -186.17193603515625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 7.5760979652404785, "rewards/margins": 10.02546501159668, "rewards/rejected": -2.4493680000305176, "step": 2104 }, { "epoch": 0.8505144240468026, "grad_norm": 0.0007578348158858716, "learning_rate": 1.5474209650582362e-07, "logits/chosen": -1.9812449216842651, "logits/rejected": -1.9562485218048096, "logps/chosen": -542.3788452148438, "logps/rejected": -197.67562866210938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.896705627441406, "rewards/margins": 10.268312454223633, "rewards/rejected": -2.3716065883636475, "step": 2108 }, { "epoch": 0.8521283034093201, "grad_norm": 0.007515254896134138, "learning_rate": 1.5307820299500831e-07, "logits/chosen": -1.9324719905853271, "logits/rejected": -1.9537785053253174, "logps/chosen": -715.0415649414062, "logps/rejected": -195.10467529296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 8.462176322937012, "rewards/margins": 11.030756950378418, "rewards/rejected": -2.568580150604248, "step": 2112 }, { "epoch": 0.8537421827718378, "grad_norm": 0.08117762953042984, "learning_rate": 1.51414309484193e-07, "logits/chosen": -2.0193305015563965, "logits/rejected": -1.9613970518112183, "logps/chosen": -499.6443786621094, "logps/rejected": -203.05816650390625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 7.203658580780029, "rewards/margins": 9.778995513916016, "rewards/rejected": -2.5753366947174072, "step": 2116 }, { "epoch": 0.8553560621343554, "grad_norm": 0.0010395144345238805, "learning_rate": 1.497504159733777e-07, "logits/chosen": -2.0034074783325195, "logits/rejected": -2.013056516647339, "logps/chosen": -573.281005859375, "logps/rejected": -207.90740966796875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.81705904006958, "rewards/margins": 10.419827461242676, "rewards/rejected": -2.6027679443359375, "step": 2120 }, { "epoch": 0.8569699414968731, "grad_norm": 0.004917428828775883, "learning_rate": 1.480865224625624e-07, "logits/chosen": -1.9966566562652588, "logits/rejected": -1.9195388555526733, "logps/chosen": -445.9723815917969, "logps/rejected": -207.1592559814453, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 6.816241264343262, "rewards/margins": 9.271907806396484, "rewards/rejected": -2.455667495727539, "step": 2124 }, { "epoch": 0.8585838208593908, "grad_norm": 0.004593564663082361, "learning_rate": 1.464226289517471e-07, "logits/chosen": -2.01322078704834, "logits/rejected": -1.9992918968200684, "logps/chosen": -580.567626953125, "logps/rejected": -227.24209594726562, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 8.017784118652344, "rewards/margins": 10.46375846862793, "rewards/rejected": -2.445974826812744, "step": 2128 }, { "epoch": 0.8601977002219084, "grad_norm": 0.0012116642901673913, "learning_rate": 1.4475873544093179e-07, "logits/chosen": -1.9585033655166626, "logits/rejected": -1.9624497890472412, "logps/chosen": -611.6246337890625, "logps/rejected": -210.64479064941406, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 8.276490211486816, "rewards/margins": 10.975924491882324, "rewards/rejected": -2.699434280395508, "step": 2132 }, { "epoch": 0.8618115795844261, "grad_norm": 0.00015725182311143726, "learning_rate": 1.4309484193011648e-07, "logits/chosen": -1.8926582336425781, "logits/rejected": -1.9003466367721558, "logps/chosen": -642.2838745117188, "logps/rejected": -204.37393188476562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.203202247619629, "rewards/margins": 12.068556785583496, "rewards/rejected": -2.86535382270813, "step": 2136 }, { "epoch": 0.8634254589469437, "grad_norm": 0.0010620049433782697, "learning_rate": 1.4143094841930118e-07, "logits/chosen": -1.9373283386230469, "logits/rejected": -1.9443448781967163, "logps/chosen": -627.5159301757812, "logps/rejected": -206.3082275390625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 8.399989128112793, "rewards/margins": 10.848422050476074, "rewards/rejected": -2.4484333992004395, "step": 2140 }, { "epoch": 0.8650393383094613, "grad_norm": 0.0008209888474084437, "learning_rate": 1.3976705490848584e-07, "logits/chosen": -1.9250413179397583, "logits/rejected": -1.9535772800445557, "logps/chosen": -599.38330078125, "logps/rejected": -179.1233367919922, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.511911392211914, "rewards/margins": 10.934341430664062, "rewards/rejected": -2.4224307537078857, "step": 2144 }, { "epoch": 0.866653217671979, "grad_norm": 0.028431354090571404, "learning_rate": 1.3810316139767054e-07, "logits/chosen": -1.9765689373016357, "logits/rejected": -1.9660075902938843, "logps/chosen": -505.04754638671875, "logps/rejected": -199.93418884277344, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 7.377435684204102, "rewards/margins": 9.878969192504883, "rewards/rejected": -2.501533031463623, "step": 2148 }, { "epoch": 0.8682670970344967, "grad_norm": 0.0026963185518980026, "learning_rate": 1.3643926788685523e-07, "logits/chosen": -1.9363377094268799, "logits/rejected": -1.8948676586151123, "logps/chosen": -536.2967529296875, "logps/rejected": -188.4244842529297, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 7.370992183685303, "rewards/margins": 9.52306079864502, "rewards/rejected": -2.1520698070526123, "step": 2152 }, { "epoch": 0.8698809763970143, "grad_norm": 0.004297483712434769, "learning_rate": 1.3477537437603993e-07, "logits/chosen": -1.9601538181304932, "logits/rejected": -1.9360593557357788, "logps/chosen": -616.6476440429688, "logps/rejected": -193.1970977783203, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.798155307769775, "rewards/margins": 10.224129676818848, "rewards/rejected": -2.4259743690490723, "step": 2156 }, { "epoch": 0.871494855759532, "grad_norm": 0.0010488844709470868, "learning_rate": 1.3311148086522462e-07, "logits/chosen": -1.9151992797851562, "logits/rejected": -1.9167428016662598, "logps/chosen": -593.6932373046875, "logps/rejected": -197.4088134765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.08647632598877, "rewards/margins": 10.677909851074219, "rewards/rejected": -2.591433525085449, "step": 2160 }, { "epoch": 0.8731087351220497, "grad_norm": 0.22330044209957123, "learning_rate": 1.3144758735440932e-07, "logits/chosen": -1.9273473024368286, "logits/rejected": -1.929797649383545, "logps/chosen": -507.3981018066406, "logps/rejected": -185.3137969970703, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 6.846053123474121, "rewards/margins": 8.969970703125, "rewards/rejected": -2.123918294906616, "step": 2164 }, { "epoch": 0.8747226144845672, "grad_norm": 0.0018654640298336744, "learning_rate": 1.29783693843594e-07, "logits/chosen": -1.962241768836975, "logits/rejected": -1.949162483215332, "logps/chosen": -598.574462890625, "logps/rejected": -198.6273956298828, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 8.375568389892578, "rewards/margins": 10.904097557067871, "rewards/rejected": -2.528529167175293, "step": 2168 }, { "epoch": 0.8763364938470849, "grad_norm": 0.0018578276503831148, "learning_rate": 1.281198003327787e-07, "logits/chosen": -2.020348072052002, "logits/rejected": -1.96686851978302, "logps/chosen": -486.45953369140625, "logps/rejected": -212.23516845703125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.232388496398926, "rewards/margins": 10.197342872619629, "rewards/rejected": -2.9649546146392822, "step": 2172 }, { "epoch": 0.8779503732096026, "grad_norm": 0.003160596825182438, "learning_rate": 1.264559068219634e-07, "logits/chosen": -1.9596773386001587, "logits/rejected": -1.9485933780670166, "logps/chosen": -621.4105224609375, "logps/rejected": -181.83033752441406, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.648035526275635, "rewards/margins": 10.13938045501709, "rewards/rejected": -2.491344928741455, "step": 2176 }, { "epoch": 0.8795642525721202, "grad_norm": 0.020167183130979538, "learning_rate": 1.2479201331114807e-07, "logits/chosen": -1.9590054750442505, "logits/rejected": -1.9317580461502075, "logps/chosen": -594.5398559570312, "logps/rejected": -187.8367919921875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 8.010028839111328, "rewards/margins": 10.27939510345459, "rewards/rejected": -2.2693662643432617, "step": 2180 }, { "epoch": 0.8811781319346379, "grad_norm": 0.004604825284332037, "learning_rate": 1.2312811980033276e-07, "logits/chosen": -1.955763578414917, "logits/rejected": -1.9365050792694092, "logps/chosen": -528.75390625, "logps/rejected": -196.27395629882812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.237925052642822, "rewards/margins": 9.875265121459961, "rewards/rejected": -2.637341022491455, "step": 2184 }, { "epoch": 0.8827920112971556, "grad_norm": 0.00423140125349164, "learning_rate": 1.2146422628951746e-07, "logits/chosen": -1.9635376930236816, "logits/rejected": -1.9189456701278687, "logps/chosen": -586.3699951171875, "logps/rejected": -193.4700164794922, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 8.45522403717041, "rewards/margins": 10.88260555267334, "rewards/rejected": -2.427382469177246, "step": 2188 }, { "epoch": 0.8844058906596732, "grad_norm": 0.0044121877290308475, "learning_rate": 1.1980033277870215e-07, "logits/chosen": -1.970357894897461, "logits/rejected": -1.9514366388320923, "logps/chosen": -532.273681640625, "logps/rejected": -189.6227569580078, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.86663818359375, "rewards/margins": 10.273088455200195, "rewards/rejected": -2.4064502716064453, "step": 2192 }, { "epoch": 0.8860197700221908, "grad_norm": 0.007188714575022459, "learning_rate": 1.1813643926788686e-07, "logits/chosen": -1.9745264053344727, "logits/rejected": -1.9271416664123535, "logps/chosen": -508.4303283691406, "logps/rejected": -207.35946655273438, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 6.9446120262146, "rewards/margins": 9.810519218444824, "rewards/rejected": -2.86590838432312, "step": 2196 }, { "epoch": 0.8876336493847085, "grad_norm": 0.008952920325100422, "learning_rate": 1.1647254575707154e-07, "logits/chosen": -1.9555314779281616, "logits/rejected": -1.947456955909729, "logps/chosen": -598.7382202148438, "logps/rejected": -200.23086547851562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.788602352142334, "rewards/margins": 10.301923751831055, "rewards/rejected": -2.513322353363037, "step": 2200 }, { "epoch": 0.8892475287472261, "grad_norm": 0.003547424916177988, "learning_rate": 1.1480865224625624e-07, "logits/chosen": -1.9553638696670532, "logits/rejected": -1.971273422241211, "logps/chosen": -579.3164672851562, "logps/rejected": -215.98043823242188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.572243690490723, "rewards/margins": 10.107776641845703, "rewards/rejected": -2.5355336666107178, "step": 2204 }, { "epoch": 0.8908614081097438, "grad_norm": 0.0032165558077394962, "learning_rate": 1.1314475873544093e-07, "logits/chosen": -1.9641681909561157, "logits/rejected": -1.9081870317459106, "logps/chosen": -555.7399291992188, "logps/rejected": -196.6915283203125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.6991071701049805, "rewards/margins": 10.211429595947266, "rewards/rejected": -2.5123229026794434, "step": 2208 }, { "epoch": 0.8924752874722615, "grad_norm": 0.004298418760299683, "learning_rate": 1.1148086522462561e-07, "logits/chosen": -1.9499428272247314, "logits/rejected": -1.967283010482788, "logps/chosen": -595.0771484375, "logps/rejected": -213.1467742919922, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.005367279052734, "rewards/margins": 10.955217361450195, "rewards/rejected": -2.949850559234619, "step": 2212 }, { "epoch": 0.8940891668347791, "grad_norm": 0.0009413140942342579, "learning_rate": 1.098169717138103e-07, "logits/chosen": -1.9561331272125244, "logits/rejected": -1.9299981594085693, "logps/chosen": -591.7747802734375, "logps/rejected": -183.7716522216797, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 8.21725082397461, "rewards/margins": 11.019977569580078, "rewards/rejected": -2.802727222442627, "step": 2216 }, { "epoch": 0.8957030461972968, "grad_norm": 0.010237318463623524, "learning_rate": 1.08153078202995e-07, "logits/chosen": -1.9584877490997314, "logits/rejected": -1.9808075428009033, "logps/chosen": -653.6846923828125, "logps/rejected": -217.6410675048828, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.736632347106934, "rewards/margins": 10.575833320617676, "rewards/rejected": -2.8392016887664795, "step": 2220 }, { "epoch": 0.8973169255598143, "grad_norm": 0.004208914469927549, "learning_rate": 1.064891846921797e-07, "logits/chosen": -1.9464210271835327, "logits/rejected": -1.933936357498169, "logps/chosen": -571.8275146484375, "logps/rejected": -213.26498413085938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.987466335296631, "rewards/margins": 10.736618041992188, "rewards/rejected": -2.7491512298583984, "step": 2224 }, { "epoch": 0.898930804922332, "grad_norm": 0.0016071452992036939, "learning_rate": 1.0482529118136439e-07, "logits/chosen": -1.964534878730774, "logits/rejected": -1.9834566116333008, "logps/chosen": -574.8720092773438, "logps/rejected": -211.44967651367188, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.467386245727539, "rewards/margins": 9.992053031921387, "rewards/rejected": -2.5246667861938477, "step": 2228 }, { "epoch": 0.9005446842848497, "grad_norm": 0.0038677973207086325, "learning_rate": 1.0316139767054908e-07, "logits/chosen": -1.938987135887146, "logits/rejected": -1.9347413778305054, "logps/chosen": -525.45654296875, "logps/rejected": -203.27377319335938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.5667243003845215, "rewards/margins": 10.359435081481934, "rewards/rejected": -2.792710542678833, "step": 2232 }, { "epoch": 0.9021585636473674, "grad_norm": 0.12820056080818176, "learning_rate": 1.0149750415973377e-07, "logits/chosen": -1.9710922241210938, "logits/rejected": -1.958834171295166, "logps/chosen": -522.5095825195312, "logps/rejected": -191.97755432128906, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 6.945425987243652, "rewards/margins": 9.433082580566406, "rewards/rejected": -2.487657308578491, "step": 2236 }, { "epoch": 0.903772443009885, "grad_norm": 0.00513134989887476, "learning_rate": 9.983361064891846e-08, "logits/chosen": -1.9725544452667236, "logits/rejected": -1.9489004611968994, "logps/chosen": -599.30322265625, "logps/rejected": -197.52764892578125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 7.779577255249023, "rewards/margins": 10.245851516723633, "rewards/rejected": -2.466273546218872, "step": 2240 }, { "epoch": 0.9053863223724027, "grad_norm": 0.0021222520153969526, "learning_rate": 9.816971713810315e-08, "logits/chosen": -2.0002827644348145, "logits/rejected": -1.944633960723877, "logps/chosen": -495.18157958984375, "logps/rejected": -191.78712463378906, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 6.710754871368408, "rewards/margins": 9.230676651000977, "rewards/rejected": -2.5199217796325684, "step": 2244 }, { "epoch": 0.9070002017349204, "grad_norm": 0.01909686252474785, "learning_rate": 9.650582362728785e-08, "logits/chosen": -1.956575632095337, "logits/rejected": -1.9566620588302612, "logps/chosen": -516.43212890625, "logps/rejected": -198.45358276367188, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 6.952939987182617, "rewards/margins": 9.337143898010254, "rewards/rejected": -2.384204387664795, "step": 2248 }, { "epoch": 0.908614081097438, "grad_norm": 0.14136604964733124, "learning_rate": 9.484193011647254e-08, "logits/chosen": -1.9613327980041504, "logits/rejected": -1.9484248161315918, "logps/chosen": -560.4368896484375, "logps/rejected": -196.08523559570312, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 7.943796634674072, "rewards/margins": 10.4188232421875, "rewards/rejected": -2.475027084350586, "step": 2252 }, { "epoch": 0.9102279604599556, "grad_norm": 0.00795560609549284, "learning_rate": 9.317803660565724e-08, "logits/chosen": -1.9661191701889038, "logits/rejected": -1.9149067401885986, "logps/chosen": -494.5516052246094, "logps/rejected": -215.0869140625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 6.746519088745117, "rewards/margins": 9.091409683227539, "rewards/rejected": -2.3448901176452637, "step": 2256 }, { "epoch": 0.9118418398224732, "grad_norm": 0.05181167274713516, "learning_rate": 9.151414309484193e-08, "logits/chosen": -1.9633406400680542, "logits/rejected": -1.935342788696289, "logps/chosen": -541.651611328125, "logps/rejected": -207.64158630371094, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 7.643606185913086, "rewards/margins": 10.124765396118164, "rewards/rejected": -2.4811580181121826, "step": 2260 }, { "epoch": 0.9134557191849909, "grad_norm": 0.0087064728140831, "learning_rate": 8.985024958402663e-08, "logits/chosen": -1.9386643171310425, "logits/rejected": -1.9598429203033447, "logps/chosen": -597.7614135742188, "logps/rejected": -205.498779296875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.419035911560059, "rewards/margins": 10.23982048034668, "rewards/rejected": -2.8207850456237793, "step": 2264 }, { "epoch": 0.9150695985475086, "grad_norm": 0.0013066570973023772, "learning_rate": 8.818635607321132e-08, "logits/chosen": -1.982245683670044, "logits/rejected": -1.9904437065124512, "logps/chosen": -578.0158081054688, "logps/rejected": -203.42462158203125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.798466205596924, "rewards/margins": 10.537128448486328, "rewards/rejected": -2.7386622428894043, "step": 2268 }, { "epoch": 0.9166834779100262, "grad_norm": 0.013224029913544655, "learning_rate": 8.6522462562396e-08, "logits/chosen": -1.9562819004058838, "logits/rejected": -1.9647612571716309, "logps/chosen": -477.7041015625, "logps/rejected": -187.61553955078125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 6.595162391662598, "rewards/margins": 8.922281265258789, "rewards/rejected": -2.327119827270508, "step": 2272 }, { "epoch": 0.9182973572725439, "grad_norm": 0.007978118024766445, "learning_rate": 8.485856905158068e-08, "logits/chosen": -1.987709641456604, "logits/rejected": -1.9791945219039917, "logps/chosen": -622.2003784179688, "logps/rejected": -202.357177734375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 8.074975967407227, "rewards/margins": 10.497710227966309, "rewards/rejected": -2.4227349758148193, "step": 2276 }, { "epoch": 0.9199112366350616, "grad_norm": 0.016553970053792, "learning_rate": 8.319467554076538e-08, "logits/chosen": -1.971767544746399, "logits/rejected": -1.9454960823059082, "logps/chosen": -573.9810791015625, "logps/rejected": -200.93917846679688, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 8.2744140625, "rewards/margins": 10.502092361450195, "rewards/rejected": -2.2276771068573, "step": 2280 }, { "epoch": 0.9215251159975791, "grad_norm": 0.007110063452273607, "learning_rate": 8.153078202995007e-08, "logits/chosen": -1.9909430742263794, "logits/rejected": -1.9815030097961426, "logps/chosen": -477.7131652832031, "logps/rejected": -185.46432495117188, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.122833251953125, "rewards/margins": 9.898612976074219, "rewards/rejected": -2.775780439376831, "step": 2284 }, { "epoch": 0.9231389953600968, "grad_norm": 0.0068983291275799274, "learning_rate": 7.986688851913477e-08, "logits/chosen": -1.9706134796142578, "logits/rejected": -1.9677231311798096, "logps/chosen": -597.0238037109375, "logps/rejected": -197.17713928222656, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.828166961669922, "rewards/margins": 10.163195610046387, "rewards/rejected": -2.3350281715393066, "step": 2288 }, { "epoch": 0.9247528747226145, "grad_norm": 0.01427419576793909, "learning_rate": 7.820299500831946e-08, "logits/chosen": -1.91902756690979, "logits/rejected": -1.9341588020324707, "logps/chosen": -597.708251953125, "logps/rejected": -199.2935791015625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.429521083831787, "rewards/margins": 10.085774421691895, "rewards/rejected": -2.6562540531158447, "step": 2292 }, { "epoch": 0.9263667540851321, "grad_norm": 0.004831741098314524, "learning_rate": 7.653910149750416e-08, "logits/chosen": -1.9768664836883545, "logits/rejected": -1.941735029220581, "logps/chosen": -547.8282470703125, "logps/rejected": -192.31971740722656, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.425360679626465, "rewards/margins": 10.214485168457031, "rewards/rejected": -2.78912353515625, "step": 2296 }, { "epoch": 0.9279806334476498, "grad_norm": 0.0056070322170853615, "learning_rate": 7.487520798668885e-08, "logits/chosen": -1.9096853733062744, "logits/rejected": -1.9182517528533936, "logps/chosen": -631.1071166992188, "logps/rejected": -201.56658935546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.871713638305664, "rewards/margins": 10.599752426147461, "rewards/rejected": -2.7280385494232178, "step": 2300 }, { "epoch": 0.9295945128101675, "grad_norm": 0.06097284331917763, "learning_rate": 7.321131447587355e-08, "logits/chosen": -2.0158724784851074, "logits/rejected": -1.9851518869400024, "logps/chosen": -554.22802734375, "logps/rejected": -215.06243896484375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 7.922417640686035, "rewards/margins": 10.706696510314941, "rewards/rejected": -2.784278392791748, "step": 2304 }, { "epoch": 0.9312083921726851, "grad_norm": 0.0006991773843765259, "learning_rate": 7.154742096505824e-08, "logits/chosen": -1.947166919708252, "logits/rejected": -1.9528326988220215, "logps/chosen": -617.4672241210938, "logps/rejected": -195.3257293701172, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 8.576740264892578, "rewards/margins": 11.056242942810059, "rewards/rejected": -2.479501485824585, "step": 2308 }, { "epoch": 0.9328222715352027, "grad_norm": 0.010540401563048363, "learning_rate": 6.988352745424292e-08, "logits/chosen": -2.017611503601074, "logits/rejected": -2.0188376903533936, "logps/chosen": -460.2166748046875, "logps/rejected": -219.23953247070312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.08725118637085, "rewards/margins": 9.927509307861328, "rewards/rejected": -2.8402578830718994, "step": 2312 }, { "epoch": 0.9344361508977204, "grad_norm": 0.0031016087159514427, "learning_rate": 6.821963394342762e-08, "logits/chosen": -1.996953010559082, "logits/rejected": -1.9164704084396362, "logps/chosen": -539.83447265625, "logps/rejected": -188.55484008789062, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.279851913452148, "rewards/margins": 10.792409896850586, "rewards/rejected": -2.512558937072754, "step": 2316 }, { "epoch": 0.936050030260238, "grad_norm": 0.008572681806981564, "learning_rate": 6.655574043261231e-08, "logits/chosen": -1.9294888973236084, "logits/rejected": -1.9513804912567139, "logps/chosen": -609.9036865234375, "logps/rejected": -207.98013305664062, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.653085708618164, "rewards/margins": 10.358214378356934, "rewards/rejected": -2.7051289081573486, "step": 2320 }, { "epoch": 0.9376639096227557, "grad_norm": 0.0015376312658190727, "learning_rate": 6.4891846921797e-08, "logits/chosen": -1.9529556035995483, "logits/rejected": -1.9568126201629639, "logps/chosen": -688.2650146484375, "logps/rejected": -190.69107055664062, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 8.111884117126465, "rewards/margins": 10.620707511901855, "rewards/rejected": -2.5088236331939697, "step": 2324 }, { "epoch": 0.9392777889852734, "grad_norm": 0.001975918421521783, "learning_rate": 6.32279534109817e-08, "logits/chosen": -1.9823153018951416, "logits/rejected": -1.9648116827011108, "logps/chosen": -662.1198120117188, "logps/rejected": -210.68069458007812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.674493789672852, "rewards/margins": 11.579400062561035, "rewards/rejected": -2.904906749725342, "step": 2328 }, { "epoch": 0.940891668347791, "grad_norm": 0.000869189330842346, "learning_rate": 6.156405990016638e-08, "logits/chosen": -1.975644588470459, "logits/rejected": -1.9598273038864136, "logps/chosen": -634.6510009765625, "logps/rejected": -195.50811767578125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.673144340515137, "rewards/margins": 11.08203411102295, "rewards/rejected": -2.408890724182129, "step": 2332 }, { "epoch": 0.9425055477103087, "grad_norm": 0.002743723802268505, "learning_rate": 5.990016638935108e-08, "logits/chosen": -2.040480136871338, "logits/rejected": -1.98549485206604, "logps/chosen": -494.8893737792969, "logps/rejected": -209.12008666992188, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 6.97926664352417, "rewards/margins": 9.55869197845459, "rewards/rejected": -2.579425573348999, "step": 2336 }, { "epoch": 0.9441194270728263, "grad_norm": 0.00355963665060699, "learning_rate": 5.823627287853577e-08, "logits/chosen": -1.9926104545593262, "logits/rejected": -1.938683032989502, "logps/chosen": -546.7926025390625, "logps/rejected": -195.13108825683594, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.222939968109131, "rewards/margins": 9.695463180541992, "rewards/rejected": -2.4725236892700195, "step": 2340 }, { "epoch": 0.9457333064353439, "grad_norm": 0.005733412690460682, "learning_rate": 5.6572379367720465e-08, "logits/chosen": -1.9779168367385864, "logits/rejected": -1.9558717012405396, "logps/chosen": -535.4456176757812, "logps/rejected": -187.14553833007812, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.41929817199707, "rewards/margins": 9.951004028320312, "rewards/rejected": -2.531705617904663, "step": 2344 }, { "epoch": 0.9473471857978616, "grad_norm": 0.0017202591989189386, "learning_rate": 5.490848585690515e-08, "logits/chosen": -1.9331518411636353, "logits/rejected": -1.8871835470199585, "logps/chosen": -602.348876953125, "logps/rejected": -190.06964111328125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 8.601446151733398, "rewards/margins": 10.905570983886719, "rewards/rejected": -2.304124116897583, "step": 2348 }, { "epoch": 0.9489610651603793, "grad_norm": 0.0006953923148103058, "learning_rate": 5.324459234608985e-08, "logits/chosen": -1.9603687524795532, "logits/rejected": -1.950567364692688, "logps/chosen": -636.3812255859375, "logps/rejected": -203.18707275390625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 8.09544849395752, "rewards/margins": 10.711150169372559, "rewards/rejected": -2.6157007217407227, "step": 2352 }, { "epoch": 0.9505749445228969, "grad_norm": 0.007147460710257292, "learning_rate": 5.158069883527454e-08, "logits/chosen": -2.02960205078125, "logits/rejected": -1.9851789474487305, "logps/chosen": -529.8826904296875, "logps/rejected": -168.01148986816406, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.425908088684082, "rewards/margins": 9.748892784118652, "rewards/rejected": -2.322983980178833, "step": 2356 }, { "epoch": 0.9521888238854146, "grad_norm": 0.011745446361601353, "learning_rate": 4.991680532445923e-08, "logits/chosen": -1.977084994316101, "logits/rejected": -1.9777553081512451, "logps/chosen": -519.7965698242188, "logps/rejected": -199.56114196777344, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 7.054460048675537, "rewards/margins": 9.65392017364502, "rewards/rejected": -2.5994603633880615, "step": 2360 }, { "epoch": 0.9538027032479323, "grad_norm": 0.00036666003870777786, "learning_rate": 4.8252911813643924e-08, "logits/chosen": -1.9125556945800781, "logits/rejected": -1.90922212600708, "logps/chosen": -598.0546875, "logps/rejected": -195.66705322265625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 8.157264709472656, "rewards/margins": 10.675834655761719, "rewards/rejected": -2.518569231033325, "step": 2364 }, { "epoch": 0.9554165826104498, "grad_norm": 0.010000882670283318, "learning_rate": 4.658901830282862e-08, "logits/chosen": -1.9666526317596436, "logits/rejected": -1.9572985172271729, "logps/chosen": -563.4773559570312, "logps/rejected": -194.95193481445312, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 7.395757675170898, "rewards/margins": 9.645254135131836, "rewards/rejected": -2.2494964599609375, "step": 2368 }, { "epoch": 0.9570304619729675, "grad_norm": 0.009719014167785645, "learning_rate": 4.4925124792013313e-08, "logits/chosen": -1.9401307106018066, "logits/rejected": -1.9157006740570068, "logps/chosen": -633.8336181640625, "logps/rejected": -209.88848876953125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.536530494689941, "rewards/margins": 11.327733039855957, "rewards/rejected": -2.7912025451660156, "step": 2372 }, { "epoch": 0.9586443413354852, "grad_norm": 0.01856349967420101, "learning_rate": 4.3261231281198e-08, "logits/chosen": -1.9416378736495972, "logits/rejected": -1.888044834136963, "logps/chosen": -546.4309692382812, "logps/rejected": -205.0238037109375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.712806701660156, "rewards/margins": 10.036308288574219, "rewards/rejected": -2.3235023021698, "step": 2376 }, { "epoch": 0.9602582206980028, "grad_norm": 0.0003416066465433687, "learning_rate": 4.159733777038269e-08, "logits/chosen": -1.9221917390823364, "logits/rejected": -1.92808198928833, "logps/chosen": -590.5059204101562, "logps/rejected": -197.27894592285156, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.490342140197754, "rewards/margins": 10.271543502807617, "rewards/rejected": -2.781200885772705, "step": 2380 }, { "epoch": 0.9618721000605205, "grad_norm": 0.013899177312850952, "learning_rate": 3.9933444259567384e-08, "logits/chosen": -1.9602147340774536, "logits/rejected": -1.9277094602584839, "logps/chosen": -647.98291015625, "logps/rejected": -193.43701171875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 8.359036445617676, "rewards/margins": 10.919185638427734, "rewards/rejected": -2.560149908065796, "step": 2384 }, { "epoch": 0.9634859794230382, "grad_norm": 0.00849966797977686, "learning_rate": 3.826955074875208e-08, "logits/chosen": -1.9331220388412476, "logits/rejected": -1.9138157367706299, "logps/chosen": -571.346923828125, "logps/rejected": -212.62950134277344, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.980587482452393, "rewards/margins": 11.004870414733887, "rewards/rejected": -3.0242838859558105, "step": 2388 }, { "epoch": 0.9650998587855558, "grad_norm": 0.016639478504657745, "learning_rate": 3.660565723793677e-08, "logits/chosen": -1.9589921236038208, "logits/rejected": -1.9496128559112549, "logps/chosen": -476.03497314453125, "logps/rejected": -197.25283813476562, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 6.759002208709717, "rewards/margins": 9.32736587524414, "rewards/rejected": -2.568363904953003, "step": 2392 }, { "epoch": 0.9667137381480735, "grad_norm": 0.0067138378508389, "learning_rate": 3.494176372712146e-08, "logits/chosen": -1.956956386566162, "logits/rejected": -2.018627405166626, "logps/chosen": -601.5103759765625, "logps/rejected": -210.40301513671875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.5934929847717285, "rewards/margins": 10.111224174499512, "rewards/rejected": -2.5177316665649414, "step": 2396 }, { "epoch": 0.968327617510591, "grad_norm": 0.02846740186214447, "learning_rate": 3.3277870216306155e-08, "logits/chosen": -1.9700076580047607, "logits/rejected": -1.9194566011428833, "logps/chosen": -586.5913696289062, "logps/rejected": -208.490966796875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.915290355682373, "rewards/margins": 10.217430114746094, "rewards/rejected": -2.3021392822265625, "step": 2400 }, { "epoch": 0.9699414968731087, "grad_norm": 0.001431932090781629, "learning_rate": 3.161397670549085e-08, "logits/chosen": -1.9559874534606934, "logits/rejected": -1.9398815631866455, "logps/chosen": -622.4332885742188, "logps/rejected": -206.44927978515625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.709281921386719, "rewards/margins": 11.34769058227539, "rewards/rejected": -2.6384096145629883, "step": 2404 }, { "epoch": 0.9715553762356264, "grad_norm": 0.0001495666801929474, "learning_rate": 2.995008319467554e-08, "logits/chosen": -1.9673175811767578, "logits/rejected": -1.9780542850494385, "logps/chosen": -639.9940795898438, "logps/rejected": -195.94439697265625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 8.2504243850708, "rewards/margins": 10.863600730895996, "rewards/rejected": -2.6131770610809326, "step": 2408 }, { "epoch": 0.973169255598144, "grad_norm": 0.005092945881187916, "learning_rate": 2.8286189683860232e-08, "logits/chosen": -1.9754958152770996, "logits/rejected": -1.9487643241882324, "logps/chosen": -556.5565795898438, "logps/rejected": -200.08705139160156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.746192455291748, "rewards/margins": 10.31916332244873, "rewards/rejected": -2.572970390319824, "step": 2412 }, { "epoch": 0.9747831349606617, "grad_norm": 0.00019496263121254742, "learning_rate": 2.6622296173044924e-08, "logits/chosen": -2.0144405364990234, "logits/rejected": -2.0065693855285645, "logps/chosen": -617.623779296875, "logps/rejected": -208.27392578125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 8.421403884887695, "rewards/margins": 11.250767707824707, "rewards/rejected": -2.8293638229370117, "step": 2416 }, { "epoch": 0.9763970143231794, "grad_norm": 0.003437003353610635, "learning_rate": 2.4958402662229615e-08, "logits/chosen": -1.9273126125335693, "logits/rejected": -1.9262257814407349, "logps/chosen": -667.33203125, "logps/rejected": -187.8031005859375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.709585189819336, "rewards/margins": 11.024407386779785, "rewards/rejected": -2.3148210048675537, "step": 2420 }, { "epoch": 0.978010893685697, "grad_norm": 0.010736319236457348, "learning_rate": 2.329450915141431e-08, "logits/chosen": -1.9403722286224365, "logits/rejected": -1.9475584030151367, "logps/chosen": -669.9805908203125, "logps/rejected": -225.76246643066406, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 8.675244331359863, "rewards/margins": 11.404251098632812, "rewards/rejected": -2.7290053367614746, "step": 2424 }, { "epoch": 0.9796247730482146, "grad_norm": 0.0031744521111249924, "learning_rate": 2.1630615640599e-08, "logits/chosen": -1.9672961235046387, "logits/rejected": -1.9568848609924316, "logps/chosen": -687.1852416992188, "logps/rejected": -216.0614471435547, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.620283126831055, "rewards/margins": 11.227103233337402, "rewards/rejected": -2.606820821762085, "step": 2428 }, { "epoch": 0.9812386524107323, "grad_norm": 0.0020818235352635384, "learning_rate": 1.9966722129783692e-08, "logits/chosen": -1.9933454990386963, "logits/rejected": -1.9836554527282715, "logps/chosen": -593.0174560546875, "logps/rejected": -205.3551788330078, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.770843505859375, "rewards/margins": 10.844222068786621, "rewards/rejected": -2.073377847671509, "step": 2432 }, { "epoch": 0.9828525317732499, "grad_norm": 0.002028702525421977, "learning_rate": 1.8302828618968386e-08, "logits/chosen": -1.9364641904830933, "logits/rejected": -1.928995132446289, "logps/chosen": -516.173828125, "logps/rejected": -193.2967529296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.466804504394531, "rewards/margins": 10.009012222290039, "rewards/rejected": -2.542207956314087, "step": 2436 }, { "epoch": 0.9844664111357676, "grad_norm": 0.007925420068204403, "learning_rate": 1.6638935108153078e-08, "logits/chosen": -1.9584999084472656, "logits/rejected": -1.9115580320358276, "logps/chosen": -598.3190307617188, "logps/rejected": -207.4603271484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.29245376586914, "rewards/margins": 11.041905403137207, "rewards/rejected": -2.7494521141052246, "step": 2440 }, { "epoch": 0.9860802904982853, "grad_norm": 0.028999343514442444, "learning_rate": 1.497504159733777e-08, "logits/chosen": -1.9876980781555176, "logits/rejected": -1.951137661933899, "logps/chosen": -570.0836791992188, "logps/rejected": -216.26235961914062, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 8.029938697814941, "rewards/margins": 10.860445976257324, "rewards/rejected": -2.8305068016052246, "step": 2444 }, { "epoch": 0.9876941698608029, "grad_norm": 0.02188098058104515, "learning_rate": 1.3311148086522462e-08, "logits/chosen": -1.9865086078643799, "logits/rejected": -1.9465644359588623, "logps/chosen": -505.2719421386719, "logps/rejected": -194.8648681640625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 6.617441177368164, "rewards/margins": 8.97486400604248, "rewards/rejected": -2.3574230670928955, "step": 2448 }, { "epoch": 0.9893080492233206, "grad_norm": 0.01449115015566349, "learning_rate": 1.1647254575707155e-08, "logits/chosen": -1.974801778793335, "logits/rejected": -1.9598311185836792, "logps/chosen": -588.7413940429688, "logps/rejected": -210.99884033203125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 8.60077953338623, "rewards/margins": 11.112259864807129, "rewards/rejected": -2.511479616165161, "step": 2452 }, { "epoch": 0.9909219285858382, "grad_norm": 0.014199871569871902, "learning_rate": 9.983361064891846e-09, "logits/chosen": -1.9874229431152344, "logits/rejected": -1.989527940750122, "logps/chosen": -486.08502197265625, "logps/rejected": -217.39166259765625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 6.547196865081787, "rewards/margins": 9.216949462890625, "rewards/rejected": -2.669752836227417, "step": 2456 }, { "epoch": 0.9925358079483558, "grad_norm": 0.0064820642583072186, "learning_rate": 8.319467554076539e-09, "logits/chosen": -1.9209837913513184, "logits/rejected": -1.9539835453033447, "logps/chosen": -603.2359008789062, "logps/rejected": -197.79428100585938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.523219108581543, "rewards/margins": 10.991716384887695, "rewards/rejected": -2.4684975147247314, "step": 2460 }, { "epoch": 0.9941496873108735, "grad_norm": 0.002752237720414996, "learning_rate": 6.655574043261231e-09, "logits/chosen": -1.9195678234100342, "logits/rejected": -1.9238159656524658, "logps/chosen": -576.064453125, "logps/rejected": -184.51617431640625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.8465399742126465, "rewards/margins": 10.33941650390625, "rewards/rejected": -2.4928760528564453, "step": 2464 }, { "epoch": 0.9957635666733912, "grad_norm": 0.036954715847969055, "learning_rate": 4.991680532445923e-09, "logits/chosen": -1.9584980010986328, "logits/rejected": -1.9793095588684082, "logps/chosen": -654.3820190429688, "logps/rejected": -190.0885467529297, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 8.392622947692871, "rewards/margins": 10.481419563293457, "rewards/rejected": -2.088796854019165, "step": 2468 }, { "epoch": 0.9973774460359088, "grad_norm": 0.0012481111334636807, "learning_rate": 3.3277870216306155e-09, "logits/chosen": -1.8978352546691895, "logits/rejected": -1.9425159692764282, "logps/chosen": -673.3843994140625, "logps/rejected": -207.91168212890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.854183197021484, "rewards/margins": 11.358531951904297, "rewards/rejected": -2.5043489933013916, "step": 2472 }, { "epoch": 0.9989913253984265, "grad_norm": 0.00045745307579636574, "learning_rate": 1.6638935108153077e-09, "logits/chosen": -1.9979405403137207, "logits/rejected": -2.002479076385498, "logps/chosen": -547.2967529296875, "logps/rejected": -203.73068237304688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 7.467577934265137, "rewards/margins": 9.99644947052002, "rewards/rejected": -2.5288708209991455, "step": 2476 } ], "logging_steps": 4, "max_steps": 2479, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }