{ "best_global_step": 3375, "best_metric": 0.9395306859205776, "best_model_checkpoint": "/workspace/code_langid/CodeLanguage-Encoder-v1/checkpoint-3375", "epoch": 3.0, "eval_steps": 500, "global_step": 3375, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017777777777777778, "grad_norm": 91.99722290039062, "learning_rate": 2.2485207100591717e-06, "loss": 1.3883, "step": 20 }, { "epoch": 0.035555555555555556, "grad_norm": 22.086933135986328, "learning_rate": 4.615384615384616e-06, "loss": 0.57, "step": 40 }, { "epoch": 0.05333333333333334, "grad_norm": 21.929166793823242, "learning_rate": 6.98224852071006e-06, "loss": 0.394, "step": 60 }, { "epoch": 0.07111111111111111, "grad_norm": 11.914956092834473, "learning_rate": 9.349112426035503e-06, "loss": 0.3654, "step": 80 }, { "epoch": 0.08888888888888889, "grad_norm": 9.060860633850098, "learning_rate": 1.1715976331360948e-05, "loss": 0.3761, "step": 100 }, { "epoch": 0.10666666666666667, "grad_norm": 26.926528930664062, "learning_rate": 1.4082840236686392e-05, "loss": 0.3866, "step": 120 }, { "epoch": 0.12444444444444444, "grad_norm": 5.112385272979736, "learning_rate": 1.6449704142011837e-05, "loss": 0.3834, "step": 140 }, { "epoch": 0.14222222222222222, "grad_norm": 4.676296234130859, "learning_rate": 1.881656804733728e-05, "loss": 0.36, "step": 160 }, { "epoch": 0.16, "grad_norm": 3.6760458946228027, "learning_rate": 1.9999519891672918e-05, "loss": 0.3531, "step": 180 }, { "epoch": 0.17777777777777778, "grad_norm": 4.675507068634033, "learning_rate": 1.99956793016566e-05, "loss": 0.3881, "step": 200 }, { "epoch": 0.19555555555555557, "grad_norm": 6.470630645751953, "learning_rate": 1.998799959670796e-05, "loss": 0.3449, "step": 220 }, { "epoch": 0.21333333333333335, "grad_norm": 4.329215049743652, "learning_rate": 1.9976483726428423e-05, "loss": 0.3406, "step": 240 }, { "epoch": 0.2311111111111111, "grad_norm": 6.4355058670043945, "learning_rate": 1.9961136113803982e-05, "loss": 0.3488, "step": 260 }, { "epoch": 0.24888888888888888, "grad_norm": 25.0487117767334, "learning_rate": 1.9941962653506426e-05, "loss": 0.3448, "step": 280 }, { "epoch": 0.26666666666666666, "grad_norm": 6.033944129943848, "learning_rate": 1.991897070962933e-05, "loss": 0.3207, "step": 300 }, { "epoch": 0.28444444444444444, "grad_norm": 4.8149919509887695, "learning_rate": 1.9892169112859677e-05, "loss": 0.3065, "step": 320 }, { "epoch": 0.3022222222222222, "grad_norm": 16.133445739746094, "learning_rate": 1.9861568157086182e-05, "loss": 0.2826, "step": 340 }, { "epoch": 0.32, "grad_norm": 4.365119457244873, "learning_rate": 1.9827179595445644e-05, "loss": 0.261, "step": 360 }, { "epoch": 0.3377777777777778, "grad_norm": 5.095998764038086, "learning_rate": 1.9789016635808836e-05, "loss": 0.2536, "step": 380 }, { "epoch": 0.35555555555555557, "grad_norm": 6.346428871154785, "learning_rate": 1.9747093935707658e-05, "loss": 0.2196, "step": 400 }, { "epoch": 0.37333333333333335, "grad_norm": 4.671675205230713, "learning_rate": 1.9701427596705504e-05, "loss": 0.2519, "step": 420 }, { "epoch": 0.39111111111111113, "grad_norm": 8.537422180175781, "learning_rate": 1.9652035158213015e-05, "loss": 0.2311, "step": 440 }, { "epoch": 0.4088888888888889, "grad_norm": 4.477969169616699, "learning_rate": 1.959893559075161e-05, "loss": 0.2234, "step": 460 }, { "epoch": 0.4266666666666667, "grad_norm": 26.87811279296875, "learning_rate": 1.9542149288667295e-05, "loss": 0.1985, "step": 480 }, { "epoch": 0.4444444444444444, "grad_norm": 7.016199111938477, "learning_rate": 1.9481698062297692e-05, "loss": 0.2214, "step": 500 }, { "epoch": 0.4622222222222222, "grad_norm": 9.351459503173828, "learning_rate": 1.941760512959516e-05, "loss": 0.2035, "step": 520 }, { "epoch": 0.48, "grad_norm": 10.939650535583496, "learning_rate": 1.9349895107209308e-05, "loss": 0.1576, "step": 540 }, { "epoch": 0.49777777777777776, "grad_norm": 6.640588283538818, "learning_rate": 1.9278594001032302e-05, "loss": 0.1763, "step": 560 }, { "epoch": 0.5155555555555555, "grad_norm": 8.542006492614746, "learning_rate": 1.920372919621057e-05, "loss": 0.1562, "step": 580 }, { "epoch": 0.5333333333333333, "grad_norm": 6.431937217712402, "learning_rate": 1.9125329446626823e-05, "loss": 0.1478, "step": 600 }, { "epoch": 0.5511111111111111, "grad_norm": 5.852420330047607, "learning_rate": 1.9043424863856286e-05, "loss": 0.1169, "step": 620 }, { "epoch": 0.5688888888888889, "grad_norm": 9.895511627197266, "learning_rate": 1.895804690560156e-05, "loss": 0.1085, "step": 640 }, { "epoch": 0.5866666666666667, "grad_norm": 9.346630096435547, "learning_rate": 1.8869228363610406e-05, "loss": 0.1389, "step": 660 }, { "epoch": 0.6044444444444445, "grad_norm": 2.5748658180236816, "learning_rate": 1.87770033510812e-05, "loss": 0.1102, "step": 680 }, { "epoch": 0.6222222222222222, "grad_norm": 3.143669843673706, "learning_rate": 1.868140728956079e-05, "loss": 0.1177, "step": 700 }, { "epoch": 0.64, "grad_norm": 2.2248847484588623, "learning_rate": 1.8582476895339912e-05, "loss": 0.1281, "step": 720 }, { "epoch": 0.6577777777777778, "grad_norm": 6.335216999053955, "learning_rate": 1.8480250165351256e-05, "loss": 0.1279, "step": 740 }, { "epoch": 0.6755555555555556, "grad_norm": 7.8944091796875, "learning_rate": 1.837476636257574e-05, "loss": 0.1102, "step": 760 }, { "epoch": 0.6933333333333334, "grad_norm": 4.683923244476318, "learning_rate": 1.8266066000962444e-05, "loss": 0.072, "step": 780 }, { "epoch": 0.7111111111111111, "grad_norm": 6.723703861236572, "learning_rate": 1.8154190829868152e-05, "loss": 0.1092, "step": 800 }, { "epoch": 0.7288888888888889, "grad_norm": 6.945916175842285, "learning_rate": 1.803918381802235e-05, "loss": 0.1264, "step": 820 }, { "epoch": 0.7466666666666667, "grad_norm": 7.265810012817383, "learning_rate": 1.7921089137023897e-05, "loss": 0.0858, "step": 840 }, { "epoch": 0.7644444444444445, "grad_norm": 3.5829412937164307, "learning_rate": 1.779995214437573e-05, "loss": 0.1233, "step": 860 }, { "epoch": 0.7822222222222223, "grad_norm": 5.7208123207092285, "learning_rate": 1.767581936606406e-05, "loss": 0.0956, "step": 880 }, { "epoch": 0.8, "grad_norm": 5.497292995452881, "learning_rate": 1.7548738478688785e-05, "loss": 0.0926, "step": 900 }, { "epoch": 0.8177777777777778, "grad_norm": 3.0958940982818604, "learning_rate": 1.7418758291151995e-05, "loss": 0.1092, "step": 920 }, { "epoch": 0.8355555555555556, "grad_norm": 2.6405749320983887, "learning_rate": 1.7285928725911562e-05, "loss": 0.0991, "step": 940 }, { "epoch": 0.8533333333333334, "grad_norm": 4.4936723709106445, "learning_rate": 1.7150300799807067e-05, "loss": 0.0767, "step": 960 }, { "epoch": 0.8711111111111111, "grad_norm": 4.851830005645752, "learning_rate": 1.7011926604465357e-05, "loss": 0.0932, "step": 980 }, { "epoch": 0.8888888888888888, "grad_norm": 4.944611549377441, "learning_rate": 1.6870859286293354e-05, "loss": 0.0814, "step": 1000 }, { "epoch": 0.9066666666666666, "grad_norm": 4.881468296051025, "learning_rate": 1.6727153026065707e-05, "loss": 0.0932, "step": 1020 }, { "epoch": 0.9244444444444444, "grad_norm": 8.76336669921875, "learning_rate": 1.6580863018115163e-05, "loss": 0.1098, "step": 1040 }, { "epoch": 0.9422222222222222, "grad_norm": 2.69486141204834, "learning_rate": 1.64320454491337e-05, "loss": 0.0757, "step": 1060 }, { "epoch": 0.96, "grad_norm": 2.511228322982788, "learning_rate": 1.6280757476592467e-05, "loss": 0.0683, "step": 1080 }, { "epoch": 0.9777777777777777, "grad_norm": 8.538342475891113, "learning_rate": 1.612705720678888e-05, "loss": 0.0711, "step": 1100 }, { "epoch": 0.9955555555555555, "grad_norm": 7.436290264129639, "learning_rate": 1.5971003672529332e-05, "loss": 0.0936, "step": 1120 }, { "epoch": 1.0, "eval_category_set_accuracy": 0.77, "eval_is_valid_accuracy": 0.929, "eval_loss": 0.03886782005429268, "eval_macro_f1": 0.8593521556125059, "eval_micro_f1": 0.8701359587435537, "eval_runtime": 17.3499, "eval_samples_per_second": 57.637, "eval_steps_per_second": 14.409, "step": 1125 }, { "epoch": 1.0133333333333334, "grad_norm": 6.135354995727539, "learning_rate": 1.5812656810455996e-05, "loss": 0.0731, "step": 1140 }, { "epoch": 1.031111111111111, "grad_norm": 7.639137268066406, "learning_rate": 1.565207743802653e-05, "loss": 0.0579, "step": 1160 }, { "epoch": 1.048888888888889, "grad_norm": 9.30708122253418, "learning_rate": 1.5489327230155455e-05, "loss": 0.0575, "step": 1180 }, { "epoch": 1.0666666666666667, "grad_norm": 0.26070043444633484, "learning_rate": 1.5324468695526215e-05, "loss": 0.0413, "step": 1200 }, { "epoch": 1.0844444444444445, "grad_norm": 7.059542179107666, "learning_rate": 1.5157565152583002e-05, "loss": 0.0779, "step": 1220 }, { "epoch": 1.1022222222222222, "grad_norm": 12.312825202941895, "learning_rate": 1.4988680705211568e-05, "loss": 0.0732, "step": 1240 }, { "epoch": 1.12, "grad_norm": 0.6326771378517151, "learning_rate": 1.481788021811837e-05, "loss": 0.0522, "step": 1260 }, { "epoch": 1.1377777777777778, "grad_norm": 6.887764930725098, "learning_rate": 1.46452292919175e-05, "loss": 0.0667, "step": 1280 }, { "epoch": 1.1555555555555554, "grad_norm": 6.309450626373291, "learning_rate": 1.4470794237934966e-05, "loss": 0.0623, "step": 1300 }, { "epoch": 1.1733333333333333, "grad_norm": 2.185997247695923, "learning_rate": 1.4294642052740015e-05, "loss": 0.0538, "step": 1320 }, { "epoch": 1.1911111111111112, "grad_norm": 1.244521975517273, "learning_rate": 1.4116840392413247e-05, "loss": 0.0392, "step": 1340 }, { "epoch": 1.208888888888889, "grad_norm": 7.956927299499512, "learning_rate": 1.393745754656146e-05, "loss": 0.0572, "step": 1360 }, { "epoch": 1.2266666666666666, "grad_norm": 7.431176662445068, "learning_rate": 1.3756562412089141e-05, "loss": 0.059, "step": 1380 }, { "epoch": 1.2444444444444445, "grad_norm": 2.3178868293762207, "learning_rate": 1.3574224466736716e-05, "loss": 0.0605, "step": 1400 }, { "epoch": 1.2622222222222224, "grad_norm": 0.8065705895423889, "learning_rate": 1.3390513742395725e-05, "loss": 0.0377, "step": 1420 }, { "epoch": 1.28, "grad_norm": 2.56118106842041, "learning_rate": 1.3205500798211155e-05, "loss": 0.0382, "step": 1440 }, { "epoch": 1.2977777777777777, "grad_norm": 5.321646213531494, "learning_rate": 1.3019256693481253e-05, "loss": 0.0346, "step": 1460 }, { "epoch": 1.3155555555555556, "grad_norm": 6.427070617675781, "learning_rate": 1.2831852960365256e-05, "loss": 0.0598, "step": 1480 }, { "epoch": 1.3333333333333333, "grad_norm": 4.72144889831543, "learning_rate": 1.2643361576409517e-05, "loss": 0.0683, "step": 1500 }, { "epoch": 1.3511111111111112, "grad_norm": 8.388670921325684, "learning_rate": 1.2453854936902525e-05, "loss": 0.0521, "step": 1520 }, { "epoch": 1.3688888888888888, "grad_norm": 6.774533748626709, "learning_rate": 1.2263405827069531e-05, "loss": 0.0677, "step": 1540 }, { "epoch": 1.3866666666666667, "grad_norm": 3.9661309719085693, "learning_rate": 1.2072087394117382e-05, "loss": 0.045, "step": 1560 }, { "epoch": 1.4044444444444444, "grad_norm": 0.7026374340057373, "learning_rate": 1.1879973119140316e-05, "loss": 0.0363, "step": 1580 }, { "epoch": 1.4222222222222223, "grad_norm": 8.110880851745605, "learning_rate": 1.1687136788897544e-05, "loss": 0.0474, "step": 1600 }, { "epoch": 1.44, "grad_norm": 6.098938465118408, "learning_rate": 1.1493652467473418e-05, "loss": 0.063, "step": 1620 }, { "epoch": 1.4577777777777778, "grad_norm": 6.203182697296143, "learning_rate": 1.1299594467831079e-05, "loss": 0.0445, "step": 1640 }, { "epoch": 1.4755555555555555, "grad_norm": 4.827872276306152, "learning_rate": 1.1105037323270538e-05, "loss": 0.0467, "step": 1660 }, { "epoch": 1.4933333333333334, "grad_norm": 1.4762051105499268, "learning_rate": 1.09100557588021e-05, "loss": 0.0466, "step": 1680 }, { "epoch": 1.511111111111111, "grad_norm": 3.032404899597168, "learning_rate": 1.0714724662446194e-05, "loss": 0.0482, "step": 1700 }, { "epoch": 1.528888888888889, "grad_norm": 2.016261100769043, "learning_rate": 1.051911905647055e-05, "loss": 0.0458, "step": 1720 }, { "epoch": 1.5466666666666666, "grad_norm": 1.3372159004211426, "learning_rate": 1.0323314068575858e-05, "loss": 0.0473, "step": 1740 }, { "epoch": 1.5644444444444443, "grad_norm": 9.508127212524414, "learning_rate": 1.0127384903040907e-05, "loss": 0.0547, "step": 1760 }, { "epoch": 1.5822222222222222, "grad_norm": 7.226083278656006, "learning_rate": 9.931406811838307e-06, "loss": 0.0403, "step": 1780 }, { "epoch": 1.6, "grad_norm": 6.973893165588379, "learning_rate": 9.735455065731922e-06, "loss": 0.0404, "step": 1800 }, { "epoch": 1.6177777777777778, "grad_norm": 5.251108646392822, "learning_rate": 9.539604925367052e-06, "loss": 0.0358, "step": 1820 }, { "epoch": 1.6355555555555554, "grad_norm": 3.708037853240967, "learning_rate": 9.343931612364533e-06, "loss": 0.0319, "step": 1840 }, { "epoch": 1.6533333333333333, "grad_norm": 1.9106441736221313, "learning_rate": 9.148510280429786e-06, "loss": 0.03, "step": 1860 }, { "epoch": 1.6711111111111112, "grad_norm": 5.1486592292785645, "learning_rate": 8.95341598648801e-06, "loss": 0.0474, "step": 1880 }, { "epoch": 1.6888888888888889, "grad_norm": 0.31516969203948975, "learning_rate": 8.758723661856465e-06, "loss": 0.0239, "step": 1900 }, { "epoch": 1.7066666666666666, "grad_norm": 2.8174779415130615, "learning_rate": 8.56450808346508e-06, "loss": 0.0405, "step": 1920 }, { "epoch": 1.7244444444444444, "grad_norm": 2.6797842979431152, "learning_rate": 8.370843845136307e-06, "loss": 0.0338, "step": 1940 }, { "epoch": 1.7422222222222223, "grad_norm": 7.398653984069824, "learning_rate": 8.177805328935311e-06, "loss": 0.0367, "step": 1960 }, { "epoch": 1.76, "grad_norm": 0.9176409244537354, "learning_rate": 7.98546667660151e-06, "loss": 0.0514, "step": 1980 }, { "epoch": 1.7777777777777777, "grad_norm": 6.95626974105835, "learning_rate": 7.793901761072396e-06, "loss": 0.0462, "step": 2000 }, { "epoch": 1.7955555555555556, "grad_norm": 7.002866268157959, "learning_rate": 7.6031841581106045e-06, "loss": 0.0431, "step": 2020 }, { "epoch": 1.8133333333333335, "grad_norm": 3.7514915466308594, "learning_rate": 7.413387118045136e-06, "loss": 0.0275, "step": 2040 }, { "epoch": 1.8311111111111111, "grad_norm": 4.360569477081299, "learning_rate": 7.224583537637544e-06, "loss": 0.0545, "step": 2060 }, { "epoch": 1.8488888888888888, "grad_norm": 4.490522861480713, "learning_rate": 7.036845932083938e-06, "loss": 0.0364, "step": 2080 }, { "epoch": 1.8666666666666667, "grad_norm": 0.8960697650909424, "learning_rate": 6.850246407163532e-06, "loss": 0.025, "step": 2100 }, { "epoch": 1.8844444444444446, "grad_norm": 8.298011779785156, "learning_rate": 6.664856631544449e-06, "loss": 0.0488, "step": 2120 }, { "epoch": 1.9022222222222223, "grad_norm": 0.39041435718536377, "learning_rate": 6.48074780925739e-06, "loss": 0.0499, "step": 2140 }, { "epoch": 1.92, "grad_norm": 4.386420249938965, "learning_rate": 6.2979906523477765e-06, "loss": 0.0357, "step": 2160 }, { "epoch": 1.9377777777777778, "grad_norm": 0.0772632360458374, "learning_rate": 6.1166553537168494e-06, "loss": 0.0434, "step": 2180 }, { "epoch": 1.9555555555555557, "grad_norm": 3.595348358154297, "learning_rate": 5.936811560162169e-06, "loss": 0.0356, "step": 2200 }, { "epoch": 1.9733333333333334, "grad_norm": 2.2714860439300537, "learning_rate": 5.758528345627828e-06, "loss": 0.0268, "step": 2220 }, { "epoch": 1.991111111111111, "grad_norm": 0.9474923610687256, "learning_rate": 5.581874184674734e-06, "loss": 0.0421, "step": 2240 }, { "epoch": 2.0, "eval_category_set_accuracy": 0.874, "eval_is_valid_accuracy": 0.965, "eval_loss": 0.022557925432920456, "eval_macro_f1": 0.9249589403924913, "eval_micro_f1": 0.9310653536257834, "eval_runtime": 17.6046, "eval_samples_per_second": 56.803, "eval_steps_per_second": 14.201, "step": 2250 }, { "epoch": 2.008888888888889, "grad_norm": 0.81345134973526, "learning_rate": 5.406916926181052e-06, "loss": 0.0178, "step": 2260 }, { "epoch": 2.026666666666667, "grad_norm": 0.6435673832893372, "learning_rate": 5.2337237672830055e-06, "loss": 0.0153, "step": 2280 }, { "epoch": 2.0444444444444443, "grad_norm": 2.50960636138916, "learning_rate": 5.062361227565946e-06, "loss": 0.0147, "step": 2300 }, { "epoch": 2.062222222222222, "grad_norm": 0.061879731714725494, "learning_rate": 4.892895123515696e-06, "loss": 0.011, "step": 2320 }, { "epoch": 2.08, "grad_norm": 0.04990602657198906, "learning_rate": 4.7253905432399295e-06, "loss": 0.0127, "step": 2340 }, { "epoch": 2.097777777777778, "grad_norm": 5.34898567199707, "learning_rate": 4.559911821469275e-06, "loss": 0.0157, "step": 2360 }, { "epoch": 2.1155555555555554, "grad_norm": 0.2640538513660431, "learning_rate": 4.396522514847811e-06, "loss": 0.0113, "step": 2380 }, { "epoch": 2.1333333333333333, "grad_norm": 0.38076090812683105, "learning_rate": 4.235285377522401e-06, "loss": 0.0069, "step": 2400 }, { "epoch": 2.151111111111111, "grad_norm": 0.957832932472229, "learning_rate": 4.076262337040223e-06, "loss": 0.0136, "step": 2420 }, { "epoch": 2.168888888888889, "grad_norm": 4.484457015991211, "learning_rate": 3.9195144705638034e-06, "loss": 0.0085, "step": 2440 }, { "epoch": 2.1866666666666665, "grad_norm": 5.153985023498535, "learning_rate": 3.7651019814126656e-06, "loss": 0.0144, "step": 2460 }, { "epoch": 2.2044444444444444, "grad_norm": 0.1879798024892807, "learning_rate": 3.6130841759405776e-06, "loss": 0.0041, "step": 2480 }, { "epoch": 2.2222222222222223, "grad_norm": 5.104600429534912, "learning_rate": 3.4635194407573247e-06, "loss": 0.0154, "step": 2500 }, { "epoch": 2.24, "grad_norm": 0.09507002681493759, "learning_rate": 3.316465220303744e-06, "loss": 0.0152, "step": 2520 }, { "epoch": 2.2577777777777777, "grad_norm": 1.6253083944320679, "learning_rate": 3.1719779947885863e-06, "loss": 0.0063, "step": 2540 }, { "epoch": 2.2755555555555556, "grad_norm": 4.733920574188232, "learning_rate": 3.030113258495756e-06, "loss": 0.0083, "step": 2560 }, { "epoch": 2.2933333333333334, "grad_norm": 4.784224033355713, "learning_rate": 2.890925498470213e-06, "loss": 0.012, "step": 2580 }, { "epoch": 2.311111111111111, "grad_norm": 0.4101342260837555, "learning_rate": 2.754468173590713e-06, "loss": 0.0034, "step": 2600 }, { "epoch": 2.328888888888889, "grad_norm": 0.1665552854537964, "learning_rate": 2.6207936940374767e-06, "loss": 0.0199, "step": 2620 }, { "epoch": 2.3466666666666667, "grad_norm": 0.27030205726623535, "learning_rate": 2.4899534011626012e-06, "loss": 0.0162, "step": 2640 }, { "epoch": 2.3644444444444446, "grad_norm": 2.187560796737671, "learning_rate": 2.36199754777102e-06, "loss": 0.0035, "step": 2660 }, { "epoch": 2.3822222222222225, "grad_norm": 0.5252636671066284, "learning_rate": 2.2369752788195343e-06, "loss": 0.012, "step": 2680 }, { "epoch": 2.4, "grad_norm": 2.124925136566162, "learning_rate": 2.1149346125413316e-06, "loss": 0.0144, "step": 2700 }, { "epoch": 2.417777777777778, "grad_norm": 2.135836601257324, "learning_rate": 1.9959224220032747e-06, "loss": 0.0187, "step": 2720 }, { "epoch": 2.4355555555555557, "grad_norm": 0.13177770376205444, "learning_rate": 1.879984417103017e-06, "loss": 0.01, "step": 2740 }, { "epoch": 2.453333333333333, "grad_norm": 0.11943187564611435, "learning_rate": 1.7671651270128531e-06, "loss": 0.0124, "step": 2760 }, { "epoch": 2.471111111111111, "grad_norm": 2.2270348072052, "learning_rate": 1.6575078830770708e-06, "loss": 0.0109, "step": 2780 }, { "epoch": 2.488888888888889, "grad_norm": 0.23311637341976166, "learning_rate": 1.5510548021693693e-06, "loss": 0.015, "step": 2800 }, { "epoch": 2.506666666666667, "grad_norm": 0.0807090774178505, "learning_rate": 1.447846770516701e-06, "loss": 0.0089, "step": 2820 }, { "epoch": 2.5244444444444447, "grad_norm": 2.7864654064178467, "learning_rate": 1.3479234279958041e-06, "loss": 0.013, "step": 2840 }, { "epoch": 2.542222222222222, "grad_norm": 0.3439638018608093, "learning_rate": 1.2513231529084269e-06, "loss": 0.0096, "step": 2860 }, { "epoch": 2.56, "grad_norm": 0.3769543766975403, "learning_rate": 1.1580830472410709e-06, "loss": 0.0065, "step": 2880 }, { "epoch": 2.5777777777777775, "grad_norm": 4.0474677085876465, "learning_rate": 1.0682389224149648e-06, "loss": 0.0116, "step": 2900 }, { "epoch": 2.5955555555555554, "grad_norm": 2.4342126846313477, "learning_rate": 9.818252855317112e-07, "loss": 0.0063, "step": 2920 }, { "epoch": 2.6133333333333333, "grad_norm": 0.04185617342591286, "learning_rate": 8.988753261198724e-07, "loss": 0.0027, "step": 2940 }, { "epoch": 2.631111111111111, "grad_norm": 1.2005906105041504, "learning_rate": 8.19420903387631e-07, "loss": 0.0084, "step": 2960 }, { "epoch": 2.648888888888889, "grad_norm": 1.7181979417800903, "learning_rate": 7.434925339863908e-07, "loss": 0.0047, "step": 2980 }, { "epoch": 2.6666666666666665, "grad_norm": 0.12996995449066162, "learning_rate": 6.711193802900074e-07, "loss": 0.0128, "step": 3000 }, { "epoch": 2.6844444444444444, "grad_norm": 3.3893086910247803, "learning_rate": 6.023292391941859e-07, "loss": 0.0139, "step": 3020 }, { "epoch": 2.7022222222222223, "grad_norm": 0.1592389941215515, "learning_rate": 5.371485314403202e-07, "loss": 0.0101, "step": 3040 }, { "epoch": 2.7199999999999998, "grad_norm": 0.9441222548484802, "learning_rate": 4.756022914678804e-07, "loss": 0.0082, "step": 3060 }, { "epoch": 2.7377777777777776, "grad_norm": 0.5016261339187622, "learning_rate": 4.1771415779924826e-07, "loss": 0.0082, "step": 3080 }, { "epoch": 2.7555555555555555, "grad_norm": 0.2186020165681839, "learning_rate": 3.6350636396069947e-07, "loss": 0.0127, "step": 3100 }, { "epoch": 2.7733333333333334, "grad_norm": 0.1453126072883606, "learning_rate": 3.1299972994299874e-07, "loss": 0.0058, "step": 3120 }, { "epoch": 2.7911111111111113, "grad_norm": 1.516239881515503, "learning_rate": 2.6621365420491984e-07, "loss": 0.0151, "step": 3140 }, { "epoch": 2.8088888888888888, "grad_norm": 5.567173480987549, "learning_rate": 2.2316610622273082e-07, "loss": 0.0108, "step": 3160 }, { "epoch": 2.8266666666666667, "grad_norm": 0.06365057826042175, "learning_rate": 1.8387361958852378e-07, "loss": 0.0141, "step": 3180 }, { "epoch": 2.8444444444444446, "grad_norm": 0.7341110706329346, "learning_rate": 1.4835128566003553e-07, "loss": 0.0132, "step": 3200 }, { "epoch": 2.862222222222222, "grad_norm": 0.07594721764326096, "learning_rate": 1.1661274776439857e-07, "loss": 0.0067, "step": 3220 }, { "epoch": 2.88, "grad_norm": 1.0531156063079834, "learning_rate": 8.867019595804272e-08, "loss": 0.0089, "step": 3240 }, { "epoch": 2.897777777777778, "grad_norm": 0.6338114738464355, "learning_rate": 6.453436234477805e-08, "loss": 0.0084, "step": 3260 }, { "epoch": 2.9155555555555557, "grad_norm": 0.8637073040008545, "learning_rate": 4.4214516953825505e-08, "loss": 0.0037, "step": 3280 }, { "epoch": 2.9333333333333336, "grad_norm": 2.1685738563537598, "learning_rate": 2.7718464179415928e-08, "loss": 0.0062, "step": 3300 }, { "epoch": 2.951111111111111, "grad_norm": 1.4732089042663574, "learning_rate": 1.5052539783292353e-08, "loss": 0.0061, "step": 3320 }, { "epoch": 2.968888888888889, "grad_norm": 1.7118995189666748, "learning_rate": 6.2216084612931606e-09, "loss": 0.0166, "step": 3340 }, { "epoch": 2.986666666666667, "grad_norm": 0.22905214130878448, "learning_rate": 1.2290619749244504e-09, "loss": 0.0174, "step": 3360 }, { "epoch": 3.0, "eval_category_set_accuracy": 0.883, "eval_is_valid_accuracy": 0.966, "eval_loss": 0.021968627348542213, "eval_macro_f1": 0.9343878654799134, "eval_micro_f1": 0.9395306859205776, "eval_runtime": 17.5098, "eval_samples_per_second": 57.111, "eval_steps_per_second": 14.278, "step": 3375 } ], "logging_steps": 20, "max_steps": 3375, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.318107799420652e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }