AjayP13's picture
Super-squash branch 'main' using huggingface_hub
24a464d verified
{"created_at": "2025-08-31T00:55:25.070507", "global_step": 30000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2203112203112203, "acc_stderr,none": 0.011865854943402445}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.42113124875522806, "acc_stderr,none": 0.004927314729433553, "acc_norm,none": 0.5525791674965146, "acc_norm_stderr,none": 0.0049621155260142855}, "mmlu": {"acc,none": 0.2697621421449936, "acc_stderr,none": 0.003735674913871737, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23868225292242295, "acc_stderr,none": 0.0062171894827945376, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.30952380952380953, "acc_stderr,none": 0.04134913018303316}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23030303030303031, "acc_stderr,none": 0.0328766675860349}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.23039215686274508, "acc_stderr,none": 0.029554292605695066}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.19831223628691982, "acc_stderr,none": 0.025955020841621112}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.19834710743801653, "acc_stderr,none": 0.036401182719909456}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.043300437496507416}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.033519538795212696}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23410404624277456, "acc_stderr,none": 0.02279711027807115}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2508038585209003, "acc_stderr,none": 0.024619771956697154}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2345679012345679, "acc_stderr,none": 0.023576881744005712}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24445893089960888, "acc_stderr,none": 0.010976425013113907}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.03126781714663179}, "mmlu_other": {"acc,none": 0.2841969745735436, "acc_stderr,none": 0.00807538045922164, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.30943396226415093, "acc_stderr,none": 0.028450154794118627}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3179190751445087, "acc_stderr,none": 0.0355068398916558}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.21973094170403587, "acc_stderr,none": 0.02779017706438361}, "mmlu_management": {"alias": " - management", "acc,none": 0.2815533980582524, "acc_stderr,none": 0.044532548363264673}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.23076923076923078, "acc_stderr,none": 0.027601921381417604}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.28991060025542786, "acc_stderr,none": 0.016225017944770964}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3104575163398693, "acc_stderr,none": 0.02649303322514589}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24113475177304963, "acc_stderr,none": 0.025518731049537762}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3639705882352941, "acc_stderr,none": 0.02922719246003203}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2891566265060241, "acc_stderr,none": 0.03529486801511115}, "mmlu_social_sciences": {"acc,none": 0.2999675008124797, "acc_stderr,none": 0.008240239772998847, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.20175438596491227, "acc_stderr,none": 0.037752050135836386}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.03173071239071724}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.34196891191709844, "acc_stderr,none": 0.03423465100104283}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2948717948717949, "acc_stderr,none": 0.023119362758232277}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.31512605042016806, "acc_stderr,none": 0.030176808288974337}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3211009174311927, "acc_stderr,none": 0.020018149772733744}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.32061068702290074, "acc_stderr,none": 0.04093329229834278}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2434640522875817, "acc_stderr,none": 0.017362473762146644}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.32727272727272727, "acc_stderr,none": 0.0449429086625209}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3836734693877551, "acc_stderr,none": 0.031130880396235933}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.31343283582089554, "acc_stderr,none": 0.032801882053486435}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_stem": {"acc,none": 0.2724389470345703, "acc_stderr,none": 0.007920607935592936, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.037125378336148665}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.29605263157894735, "acc_stderr,none": 0.03715062154998905}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.03551446610810826}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366234}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.04336432707993176}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2680851063829787, "acc_stderr,none": 0.028957342788342343}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.33793103448275863, "acc_stderr,none": 0.03941707632064891}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2830687830687831, "acc_stderr,none": 0.023201392938194974}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.27419354838709675, "acc_stderr,none": 0.0253781399708852}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2660098522167488, "acc_stderr,none": 0.03108982600293752}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165044}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.026202766534652148}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.26490066225165565, "acc_stderr,none": 0.036030385453603826}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.37962962962962965, "acc_stderr,none": 0.03309682581119035}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.29464285714285715, "acc_stderr,none": 0.043270409325787296}, "sciq": {"alias": "sciq", "acc,none": 0.897, "acc_stderr,none": 0.009616833339695789, "acc_norm,none": 0.846, "acc_norm_stderr,none": 0.011419913065098698}}
{"created_at": "2025-08-31T08:08:55.474615", "global_step": 60000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.22194922194922195, "acc_stderr,none": 0.011897367280936745}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.43975303724357695, "acc_stderr,none": 0.004953426186069839, "acc_norm,none": 0.5746863174666401, "acc_norm_stderr,none": 0.0049338009275605435}, "mmlu": {"acc,none": 0.2488961686369463, "acc_stderr,none": 0.0036422852784231126, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24888416578108397, "acc_stderr,none": 0.006306232794353303, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.19047619047619047, "acc_stderr,none": 0.035122074123020534}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24848484848484848, "acc_stderr,none": 0.03374402644139406}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.02977177522814565}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.26582278481012656, "acc_stderr,none": 0.028756799629658332}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04065578140908706}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.04414343666854933}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.26380368098159507, "acc_stderr,none": 0.03462419931615623}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.25722543352601157, "acc_stderr,none": 0.02353292543104429}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2829581993569132, "acc_stderr,none": 0.025583062489984827}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.024659685185967287}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.23859191655801826, "acc_stderr,none": 0.010885929742002228}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23391812865497075, "acc_stderr,none": 0.03246721765117827}, "mmlu_other": {"acc,none": 0.272288381074992, "acc_stderr,none": 0.007966659817341723, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.26037735849056604, "acc_stderr,none": 0.027008766090708087}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.21965317919075145, "acc_stderr,none": 0.031568093627031744}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3721973094170404, "acc_stderr,none": 0.03244305283008732}, "mmlu_management": {"alias": " - management", "acc,none": 0.2621359223300971, "acc_stderr,none": 0.043546310772605956}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2606837606837607, "acc_stderr,none": 0.028760348956523418}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2937420178799489, "acc_stderr,none": 0.016287759388491672}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.23202614379084968, "acc_stderr,none": 0.024170840879341016}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.26595744680851063, "acc_stderr,none": 0.02635806569888059}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.024562204314142317}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3253012048192771, "acc_stderr,none": 0.036471685236832266}, "mmlu_social_sciences": {"acc,none": 0.2349691257718557, "acc_stderr,none": 0.007634457724332419, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.04227054451232199}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.029620227874790486}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21243523316062177, "acc_stderr,none": 0.02951928261681725}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2205128205128205, "acc_stderr,none": 0.021020672680827912}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23109243697478993, "acc_stderr,none": 0.027381406927868973}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23669724770642203, "acc_stderr,none": 0.018224078117299085}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.03727673575596918}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25326797385620914, "acc_stderr,none": 0.01759348689536683}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.35454545454545455, "acc_stderr,none": 0.04582004841505415}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.1673469387755102, "acc_stderr,none": 0.02389714476891452}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401466}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_stem": {"acc,none": 0.23945448778940692, "acc_stderr,none": 0.007578029014293256, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.03820169914517904}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.17763157894736842, "acc_stderr,none": 0.03110318238312338}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.22916666666666666, "acc_stderr,none": 0.035146974678623884}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.12, "acc_stderr,none": 0.03265986323710906}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.04093601807403326}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617746}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.33191489361702126, "acc_stderr,none": 0.030783736757745633}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.036001056927277716}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.022418042891113946}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25161290322580643, "acc_stderr,none": 0.024685979286239956}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2561576354679803, "acc_stderr,none": 0.0307127300709826}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.032578473844367774}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.16203703703703703, "acc_stderr,none": 0.025130453652268455}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841044}, "sciq": {"alias": "sciq", "acc,none": 0.907, "acc_stderr,none": 0.00918887563499665, "acc_norm,none": 0.852, "acc_norm_stderr,none": 0.01123486636423525}}
{"created_at": "2025-08-31T16:32:11.872001", "global_step": 90000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19328419328419327, "acc_stderr,none": 0.01130520748682768}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4473212507468632, "acc_stderr,none": 0.0049620103382263464, "acc_norm,none": 0.5896235809599681, "acc_norm_stderr,none": 0.004908967278222474}, "mmlu": {"acc,none": 0.2627118644067797, "acc_stderr,none": 0.0037097189158895663, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2724760892667375, "acc_stderr,none": 0.006483815662280377, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.19047619047619047, "acc_stderr,none": 0.03512207412302054}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.296969696969697, "acc_stderr,none": 0.03567969772268049}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.02998373305591362}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.24472573839662448, "acc_stderr,none": 0.027985699387036413}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.3884297520661157, "acc_stderr,none": 0.044492703500683815}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3067484662576687, "acc_stderr,none": 0.036230899157241474}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2947976878612717, "acc_stderr,none": 0.024547617794803838}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2435754189944134, "acc_stderr,none": 0.014355911964767864}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.31189710610932475, "acc_stderr,none": 0.026311858071854155}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2839506172839506, "acc_stderr,none": 0.025089478523765127}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.273142112125163, "acc_stderr,none": 0.01138015056783041}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.034462962170884265}, "mmlu_other": {"acc,none": 0.2590923720630834, "acc_stderr,none": 0.007845488431512249, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23773584905660378, "acc_stderr,none": 0.02619980880756192}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.19653179190751446, "acc_stderr,none": 0.03029957466478814}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.27802690582959644, "acc_stderr,none": 0.030069584874494047}, "mmlu_management": {"alias": " - management", "acc,none": 0.23300970873786409, "acc_stderr,none": 0.04185832598928313}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2948717948717949, "acc_stderr,none": 0.02987257770889117}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2720306513409962, "acc_stderr,none": 0.015913367447500517}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2973856209150327, "acc_stderr,none": 0.02617390850671858}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2872340425531915, "acc_stderr,none": 0.026992199173064356}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.17279411764705882, "acc_stderr,none": 0.022966067585581753}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.26506024096385544, "acc_stderr,none": 0.03436024037944967}, "mmlu_social_sciences": {"acc,none": 0.2560935976600585, "acc_stderr,none": 0.007859235365861035, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2719298245614035, "acc_stderr,none": 0.04185774424022056}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.24242424242424243, "acc_stderr,none": 0.030532892233932036}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.24352331606217617, "acc_stderr,none": 0.030975436386845436}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.23076923076923078, "acc_stderr,none": 0.021362027725222724}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.24369747899159663, "acc_stderr,none": 0.02788682807838058}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.21834862385321102, "acc_stderr,none": 0.01771260052872272}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.037683359597287434}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.30392156862745096, "acc_stderr,none": 0.01860755213127983}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3, "acc_stderr,none": 0.04389311454644286}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.22040816326530613, "acc_stderr,none": 0.026537045312145284}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.3034825870646766, "acc_stderr,none": 0.03251006816458618}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_stem": {"acc,none": 0.25816682524579765, "acc_stderr,none": 0.007793906974136423, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2740740740740741, "acc_stderr,none": 0.03853254836552003}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.03583496176361061}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.03551446610810826}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036622}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653696}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.04220773659171453}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.28085106382978725, "acc_stderr,none": 0.029379170464124818}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.27586206896551724, "acc_stderr,none": 0.037245636197746325}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2804232804232804, "acc_stderr,none": 0.02313528797432561}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.22258064516129034, "acc_stderr,none": 0.023664216671642514}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.30049261083743845, "acc_stderr,none": 0.03225799476233484}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02671924078371217}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.26490066225165565, "acc_stderr,none": 0.03603038545360384}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.22685185185185186, "acc_stderr,none": 0.028561650102422263}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03894641120044792}, "sciq": {"alias": "sciq", "acc,none": 0.918, "acc_stderr,none": 0.008680515615523687, "acc_norm,none": 0.879, "acc_norm_stderr,none": 0.01031821038094609}}
{"created_at": "2025-08-31T22:40:13.424437", "global_step": 120000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2285012285012285, "acc_stderr,none": 0.012020761312005525}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4592710615415256, "acc_stderr,none": 0.004973199296339976, "acc_norm,none": 0.6035650268870743, "acc_norm_stderr,none": 0.004881570100014373}, "mmlu": {"acc,none": 0.27161373023785784, "acc_stderr,none": 0.003748814312993974, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2726886291179596, "acc_stderr,none": 0.006486015850112414, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.037184890068181146}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.033175059300091805}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.030190282453501947}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.26582278481012656, "acc_stderr,none": 0.028756799629658335}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2892561983471074, "acc_stderr,none": 0.041391127276354626}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.042365112580946336}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.31901840490797545, "acc_stderr,none": 0.03661997551073836}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.30346820809248554, "acc_stderr,none": 0.02475241196091721}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3215434083601286, "acc_stderr,none": 0.026527724079528872}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2932098765432099, "acc_stderr,none": 0.025329888171900926}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.27053455019556716, "acc_stderr,none": 0.011345996743539265}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3508771929824561, "acc_stderr,none": 0.03660298834049163}, "mmlu_other": {"acc,none": 0.27615062761506276, "acc_stderr,none": 0.008019676507671311, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421296}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.26037735849056604, "acc_stderr,none": 0.02700876609070809}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0326926380614177}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3004484304932735, "acc_stderr,none": 0.030769352008229132}, "mmlu_management": {"alias": " - management", "acc,none": 0.21359223300970873, "acc_stderr,none": 0.040580420156460344}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.32051282051282054, "acc_stderr,none": 0.03057281131029961}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.280970625798212, "acc_stderr,none": 0.016073127851221225}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.31699346405228757, "acc_stderr,none": 0.02664327847450875}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2730496453900709, "acc_stderr,none": 0.026577860943307854}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.21691176470588236, "acc_stderr,none": 0.025035845227711254}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2891566265060241, "acc_stderr,none": 0.03529486801511115}, "mmlu_social_sciences": {"acc,none": 0.26974325641858954, "acc_stderr,none": 0.007994307266974964, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.041424397194893624}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25757575757575757, "acc_stderr,none": 0.031156269519646857}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.2538860103626943, "acc_stderr,none": 0.03141024780565319}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2692307692307692, "acc_stderr,none": 0.022489389793654824}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23949579831932774, "acc_stderr,none": 0.027722065493361266}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22018348623853212, "acc_stderr,none": 0.017765978652327572}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.33587786259541985, "acc_stderr,none": 0.04142313771996665}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.28921568627450983, "acc_stderr,none": 0.018342529845275908}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3, "acc_stderr,none": 0.04389311454644286}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.27346938775510204, "acc_stderr,none": 0.028535560337128448}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.31840796019900497, "acc_stderr,none": 0.03294118479054095}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_stem": {"acc,none": 0.26736441484300666, "acc_stderr,none": 0.007857732796857413, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.35, "acc_stderr,none": 0.04793724854411019}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.28888888888888886, "acc_stderr,none": 0.0391545063041425}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.27631578947368424, "acc_stderr,none": 0.03639057569952925}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.03551446610810826}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653696}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.38, "acc_stderr,none": 0.04878317312145632}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237654}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.34, "acc_stderr,none": 0.047609522856952365}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.32340425531914896, "acc_stderr,none": 0.030579442773610334}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.038061426873099935}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.02218203720294836}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.24193548387096775, "acc_stderr,none": 0.024362599693031103}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.22167487684729065, "acc_stderr,none": 0.029225575892489596}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2740740740740741, "acc_stderr,none": 0.027195934804085626}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2251655629139073, "acc_stderr,none": 0.03410435282008936}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.19907407407407407, "acc_stderr,none": 0.02723229846269023}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3392857142857143, "acc_stderr,none": 0.044939490686135404}, "sciq": {"alias": "sciq", "acc,none": 0.918, "acc_stderr,none": 0.008680515615523703, "acc_norm,none": 0.903, "acc_norm_stderr,none": 0.009363689373248076}}
{"created_at": "2025-09-01T06:00:45.750919", "global_step": 150000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2031122031122031, "acc_stderr,none": 0.011518254793634098}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4684325831507668, "acc_stderr,none": 0.004979826829400762, "acc_norm,none": 0.6212905795658236, "acc_norm_stderr,none": 0.004840742206718105}, "mmlu": {"acc,none": 0.2802307363623415, "acc_stderr,none": 0.003780715760669275, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2669500531349628, "acc_stderr,none": 0.006444830250317966, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.3412698412698413, "acc_stderr,none": 0.04240799327574926}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23030303030303031, "acc_stderr,none": 0.03287666758603489}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.030778554678693254}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.25738396624472576, "acc_stderr,none": 0.028458820991460274}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.32231404958677684, "acc_stderr,none": 0.04266416363352167}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.043733130409147614}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.25766871165644173, "acc_stderr,none": 0.03436150827846917}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.27167630057803466, "acc_stderr,none": 0.023948512905468348}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.25139664804469275, "acc_stderr,none": 0.014508979453553988}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2861736334405145, "acc_stderr,none": 0.025670259242188947}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.27469135802469136, "acc_stderr,none": 0.024836057868294677}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25358539765319427, "acc_stderr,none": 0.011111715336101138}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3684210526315789, "acc_stderr,none": 0.036996580176568775}, "mmlu_other": {"acc,none": 0.28838107499195365, "acc_stderr,none": 0.008117322491050178, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252606}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2830188679245283, "acc_stderr,none": 0.027724236492700907}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.32947976878612717, "acc_stderr,none": 0.03583901754736411}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.25112107623318386, "acc_stderr,none": 0.029105220833224633}, "mmlu_management": {"alias": " - management", "acc,none": 0.24271844660194175, "acc_stderr,none": 0.042450224863844935}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2948717948717949, "acc_stderr,none": 0.029872577708891165}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2886334610472541, "acc_stderr,none": 0.016203792703197793}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.33986928104575165, "acc_stderr,none": 0.027121956071388852}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.29432624113475175, "acc_stderr,none": 0.0271871270115038}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20955882352941177, "acc_stderr,none": 0.024723110407677062}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3373493975903614, "acc_stderr,none": 0.0368078369072758}, "mmlu_social_sciences": {"acc,none": 0.2918427039324017, "acc_stderr,none": 0.008155156972654442, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.03173071239071724}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.31088082901554404, "acc_stderr,none": 0.03340361906276586}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.32051282051282054, "acc_stderr,none": 0.023661296393964283}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.29831932773109243, "acc_stderr,none": 0.02971914287634286}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23486238532110093, "acc_stderr,none": 0.018175110510343574}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.46564885496183206, "acc_stderr,none": 0.043749285605997376}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.017848089574913226}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.04069306319721377}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.34285714285714286, "acc_stderr,none": 0.030387262919547735}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.34328358208955223, "acc_stderr,none": 0.03357379665433431}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621505}, "mmlu_stem": {"acc,none": 0.28068506184586106, "acc_stderr,none": 0.007980929532120578, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.32592592592592595, "acc_stderr,none": 0.040491220417025055}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3355263157894737, "acc_stderr,none": 0.038424985593952694}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.15, "acc_stderr,none": 0.035887028128263686}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.28431372549019607, "acc_stderr,none": 0.04488482852329017}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.30638297872340425, "acc_stderr,none": 0.03013590647851756}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.3931034482758621, "acc_stderr,none": 0.040703290137070705}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.023068188848261114}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.26129032258064516, "acc_stderr,none": 0.024993053397764815}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.23645320197044334, "acc_stderr,none": 0.02989611429173355}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.025928876132766107}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.3509933774834437, "acc_stderr,none": 0.03896981964257374}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.030225226160012404}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.33035714285714285, "acc_stderr,none": 0.04464285714285714}, "sciq": {"alias": "sciq", "acc,none": 0.912, "acc_stderr,none": 0.008963053962592072, "acc_norm,none": 0.882, "acc_norm_stderr,none": 0.010206869264381786}}
{"created_at": "2025-09-01T13:26:56.389659", "global_step": 180000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.22932022932022933, "acc_stderr,none": 0.012035891058050911}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4803823939454292, "acc_stderr,none": 0.004985939292819593, "acc_norm,none": 0.6394144592710616, "acc_norm_stderr,none": 0.004791890625834196}, "mmlu": {"acc,none": 0.2711864406779661, "acc_stderr,none": 0.0037382018183056555, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.27013815090329435, "acc_stderr,none": 0.006458788474034644, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.15079365079365079, "acc_stderr,none": 0.03200686497287396}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.3151515151515151, "acc_stderr,none": 0.0362773057502241}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.02998373305591361}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.02904133351059802}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.35537190082644626, "acc_stderr,none": 0.04369236326573981}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.044531975073749834}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.27607361963190186, "acc_stderr,none": 0.03512385283705051}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3063583815028902, "acc_stderr,none": 0.024818350129436593}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24804469273743016, "acc_stderr,none": 0.014444157808261457}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3408360128617363, "acc_stderr,none": 0.026920841260776155}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.025630824975621355}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25554106910039115, "acc_stderr,none": 0.011139857833598514}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23391812865497075, "acc_stderr,none": 0.032467217651178264}, "mmlu_other": {"acc,none": 0.2925651754103637, "acc_stderr,none": 0.008128549063669001, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036845}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2641509433962264, "acc_stderr,none": 0.027134291628741702}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.03295304696818318}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.336322869955157, "acc_stderr,none": 0.031708824268455}, "mmlu_management": {"alias": " - management", "acc,none": 0.2524271844660194, "acc_stderr,none": 0.04301250399690878}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.029343114798094448}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.334610472541507, "acc_stderr,none": 0.016873468641592157}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.28104575163398693, "acc_stderr,none": 0.025738854797818726}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2624113475177305, "acc_stderr,none": 0.02624492034984301}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20955882352941177, "acc_stderr,none": 0.02472311040767705}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.39759036144578314, "acc_stderr,none": 0.038099730845402184}, "mmlu_social_sciences": {"acc,none": 0.26876828079298015, "acc_stderr,none": 0.007961998587447238, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.038351539543994194}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.02912652283458684}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.23316062176165803, "acc_stderr,none": 0.030516111371476005}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2205128205128205, "acc_stderr,none": 0.021020672680827916}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23109243697478993, "acc_stderr,none": 0.027381406927868966}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.26238532110091745, "acc_stderr,none": 0.01886188502153473}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3511450381679389, "acc_stderr,none": 0.04186445163013751}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3104575163398693, "acc_stderr,none": 0.01871806705262322}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.34545454545454546, "acc_stderr,none": 0.04554619617541054}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.23673469387755103, "acc_stderr,none": 0.027212835884073132}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.35323383084577115, "acc_stderr,none": 0.03379790611796776}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_stem": {"acc,none": 0.25404376784015226, "acc_stderr,none": 0.007728777824332866, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3037037037037037, "acc_stderr,none": 0.039725528847851375}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.035834961763610645}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2152777777777778, "acc_stderr,none": 0.03437079344106133}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.17, "acc_stderr,none": 0.03775251680686371}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.13, "acc_stderr,none": 0.0337997668989631}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617746}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.28085106382978725, "acc_stderr,none": 0.02937917046412483}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.31724137931034485, "acc_stderr,none": 0.038783523721386215}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2830687830687831, "acc_stderr,none": 0.023201392938194978}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.27419354838709675, "acc_stderr,none": 0.0253781399708852}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.3054187192118227, "acc_stderr,none": 0.03240661565868408}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.23333333333333334, "acc_stderr,none": 0.025787874220959316}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23178807947019867, "acc_stderr,none": 0.03445406271987054}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.02541642838876747}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2767857142857143, "acc_stderr,none": 0.042466243366976235}, "sciq": {"alias": "sciq", "acc,none": 0.933, "acc_stderr,none": 0.007910345983177547, "acc_norm,none": 0.906, "acc_norm_stderr,none": 0.009233052000787726}}
{"created_at": "2025-09-03T05:39:57.367897", "global_step": 210000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2710892710892711, "acc_stderr,none": 0.012726630083024076}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.49044015136427005, "acc_stderr,none": 0.004988869288786873, "acc_norm,none": 0.654052977494523, "acc_norm_stderr,none": 0.004747038768172532}, "mmlu": {"acc,none": 0.2918387694060675, "acc_stderr,none": 0.0038138419961127263, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2809776833156217, "acc_stderr,none": 0.0065202093015426935, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1984126984126984, "acc_stderr,none": 0.035670166752768614}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.3696969696969697, "acc_stderr,none": 0.03769430314512568}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.031145570659486782}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.3206751054852321, "acc_stderr,none": 0.030381931949990407}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.38016528925619836, "acc_stderr,none": 0.04431324501968432}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.32407407407407407, "acc_stderr,none": 0.045245960070300496}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.26380368098159507, "acc_stderr,none": 0.03462419931615624}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.30346820809248554, "acc_stderr,none": 0.024752411960917202}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2346368715083799, "acc_stderr,none": 0.014173044098303663}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3762057877813505, "acc_stderr,none": 0.027513925683549427}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.35185185185185186, "acc_stderr,none": 0.02657148348071997}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2503259452411995, "acc_stderr,none": 0.01106415102716543}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.035087719298245654}, "mmlu_other": {"acc,none": 0.3279691020276794, "acc_stderr,none": 0.00837065569800705, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3283018867924528, "acc_stderr,none": 0.028901593612411784}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.27167630057803466, "acc_stderr,none": 0.0339175032232166}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4484304932735426, "acc_stderr,none": 0.03337883736255099}, "mmlu_management": {"alias": " - management", "acc,none": 0.27184466019417475, "acc_stderr,none": 0.044052680241409216}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.34615384615384615, "acc_stderr,none": 0.0311669573672359}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.34610472541507026, "acc_stderr,none": 0.017011965266412073}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.35947712418300654, "acc_stderr,none": 0.027475969910660952}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2553191489361702, "acc_stderr,none": 0.026011992930902006}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.23161764705882354, "acc_stderr,none": 0.025626533803777562}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.42771084337349397, "acc_stderr,none": 0.038515976837185335}, "mmlu_social_sciences": {"acc,none": 0.2967175820604485, "acc_stderr,none": 0.008194287519628352, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2719298245614035, "acc_stderr,none": 0.041857744240220575}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25757575757575757, "acc_stderr,none": 0.03115626951964684}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.27979274611398963, "acc_stderr,none": 0.03239637046735703}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.23846153846153847, "acc_stderr,none": 0.021606294494647727}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.24789915966386555, "acc_stderr,none": 0.028047967224176892}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.29908256880733947, "acc_stderr,none": 0.01963041728541518}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.42748091603053434, "acc_stderr,none": 0.04338920305792401}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2973856209150327, "acc_stderr,none": 0.018492596536396955}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.35454545454545455, "acc_stderr,none": 0.04582004841505415}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2612244897959184, "acc_stderr,none": 0.028123429335142787}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.4228855721393035, "acc_stderr,none": 0.03493231777421281}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_stem": {"acc,none": 0.26768157310497936, "acc_stderr,none": 0.00786368367511772, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.362962962962963, "acc_stderr,none": 0.04153948404742398}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.29605263157894735, "acc_stderr,none": 0.03715062154998905}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.0358687928008034}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617746}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.33191489361702126, "acc_stderr,none": 0.030783736757745643}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.3103448275862069, "acc_stderr,none": 0.03855289616378949}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2566137566137566, "acc_stderr,none": 0.022494510767503154}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3032258064516129, "acc_stderr,none": 0.026148685930671742}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.29064039408866993, "acc_stderr,none": 0.0319474007226554}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816508}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25555555555555554, "acc_stderr,none": 0.026593939101844072}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2052980132450331, "acc_stderr,none": 0.03297986648473837}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1712962962962963, "acc_stderr,none": 0.02569534164382469}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841043}, "sciq": {"alias": "sciq", "acc,none": 0.94, "acc_stderr,none": 0.007513751157474925, "acc_norm,none": 0.914, "acc_norm_stderr,none": 0.008870325962594766}}
{"created_at": "2025-09-03T07:27:30.171170", "global_step": 240000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.31203931203931207, "acc_stderr,none": 0.013264978535922402}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.5009958175662219, "acc_stderr,none": 0.004989771515176689, "acc_norm,none": 0.6686914957179845, "acc_norm_stderr,none": 0.004697217912462986}, "mmlu": {"acc,none": 0.37587238285144564, "acc_stderr,none": 0.00403967256822256, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3538788522848034, "acc_stderr,none": 0.006881927287636883, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.03852273364924318}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.45454545454545453, "acc_stderr,none": 0.03888176921674099}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.4166666666666667, "acc_stderr,none": 0.03460228327239171}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.3881856540084388, "acc_stderr,none": 0.0317229500433233}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.5537190082644629, "acc_stderr,none": 0.0453793517794788}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.4166666666666667, "acc_stderr,none": 0.04766075165356462}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.37423312883435583, "acc_stderr,none": 0.03802068102899615}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.4046242774566474, "acc_stderr,none": 0.026424816594009852}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.26927374301675977, "acc_stderr,none": 0.014835616582882601}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.4694533762057878, "acc_stderr,none": 0.028345045864840678}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.42901234567901236, "acc_stderr,none": 0.027538925613470863}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.29986962190352023, "acc_stderr,none": 0.011702660860193984}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.4853801169590643, "acc_stderr,none": 0.038331852752130205}, "mmlu_other": {"acc,none": 0.4209848728677181, "acc_stderr,none": 0.008804318450477664, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.4377358490566038, "acc_stderr,none": 0.030533338430467516}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3988439306358382, "acc_stderr,none": 0.037336266553835096}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3991031390134529, "acc_stderr,none": 0.03286745312567961}, "mmlu_management": {"alias": " - management", "acc,none": 0.3786407766990291, "acc_stderr,none": 0.04802694698258974}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.5470085470085471, "acc_stderr,none": 0.0326109987309862}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.42, "acc_stderr,none": 0.049604496374885836}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.4495530012771392, "acc_stderr,none": 0.017788725283507337}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.4738562091503268, "acc_stderr,none": 0.028590752958852394}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.29432624113475175, "acc_stderr,none": 0.027187127011503793}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3897058823529412, "acc_stderr,none": 0.02962466358115969}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.4397590361445783, "acc_stderr,none": 0.03864139923699121}, "mmlu_social_sciences": {"acc,none": 0.398440038999025, "acc_stderr,none": 0.008746080882531293, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.04266339443159394}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.37373737373737376, "acc_stderr,none": 0.03446897738659333}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.45077720207253885, "acc_stderr,none": 0.03590910952235523}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.30256410256410254, "acc_stderr,none": 0.02329088805377272}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.31932773109243695, "acc_stderr,none": 0.0302839955258844}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.45321100917431195, "acc_stderr,none": 0.021343255165546037}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.5038167938931297, "acc_stderr,none": 0.043851623256015534}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3562091503267974, "acc_stderr,none": 0.019373332420724493}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.39090909090909093, "acc_stderr,none": 0.04673752333670239}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.44081632653061226, "acc_stderr,none": 0.03178419114175363}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.5074626865671642, "acc_stderr,none": 0.03535140084276719}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.54, "acc_stderr,none": 0.05009082659620332}, "mmlu_stem": {"acc,none": 0.3422137646685696, "acc_stderr,none": 0.008371766131487296, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.34, "acc_stderr,none": 0.047609522856952365}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.4444444444444444, "acc_stderr,none": 0.04292596718256981}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.4342105263157895, "acc_stderr,none": 0.04033565667848319}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3611111111111111, "acc_stderr,none": 0.04016660030451233}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252605}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.28431372549019607, "acc_stderr,none": 0.04488482852329017}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.49, "acc_stderr,none": 0.05024183937956911}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3148936170212766, "acc_stderr,none": 0.030363582197238167}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.4689655172413793, "acc_stderr,none": 0.04158632762097828}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.29365079365079366, "acc_stderr,none": 0.023456037383982026}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.4483870967741935, "acc_stderr,none": 0.02829205683011273}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.32019704433497537, "acc_stderr,none": 0.032826493853041504}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.026202766534652148}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.31788079470198677, "acc_stderr,none": 0.03802039760107903}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.27314814814814814, "acc_stderr,none": 0.03038805130167812}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.25, "acc_stderr,none": 0.04109974682633932}, "sciq": {"alias": "sciq", "acc,none": 0.937, "acc_stderr,none": 0.007687007876286406, "acc_norm,none": 0.923, "acc_norm_stderr,none": 0.00843458014024067}}
{"created_at": "2025-09-03T07:39:21.523909", "global_step": 270000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.33988533988533987, "acc_stderr,none": 0.013561133458127721}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.5148376817367059, "acc_stderr,none": 0.004987583858923224, "acc_norm,none": 0.6868153754232225, "acc_norm_stderr,none": 0.004628409084218786}, "mmlu": {"acc,none": 0.38605611736219914, "acc_stderr,none": 0.004035113780916413, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.36769394261424015, "acc_stderr,none": 0.006884040317849944, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23809523809523808, "acc_stderr,none": 0.0380952380952381}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.5212121212121212, "acc_stderr,none": 0.03900828913737302}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.5, "acc_stderr,none": 0.03509312031717982}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.4810126582278481, "acc_stderr,none": 0.03252375148090448}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.5206611570247934, "acc_stderr,none": 0.04560456086387235}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.46296296296296297, "acc_stderr,none": 0.04820403072760628}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3987730061349693, "acc_stderr,none": 0.038470214204560246}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.43352601156069365, "acc_stderr,none": 0.026680134761679217}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24916201117318434, "acc_stderr,none": 0.014465893829859926}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.4405144694533762, "acc_stderr,none": 0.028196400574197426}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.44135802469135804, "acc_stderr,none": 0.027628737155668777}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.3050847457627119, "acc_stderr,none": 0.011759939618085453}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.5789473684210527, "acc_stderr,none": 0.037867207062342145}, "mmlu_other": {"acc,none": 0.4345027357579659, "acc_stderr,none": 0.008753602580848462, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.38, "acc_stderr,none": 0.048783173121456316}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3849056603773585, "acc_stderr,none": 0.029946498567699948}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3236994219653179, "acc_stderr,none": 0.0356760379963917}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.5112107623318386, "acc_stderr,none": 0.033549366530984746}, "mmlu_management": {"alias": " - management", "acc,none": 0.3300970873786408, "acc_stderr,none": 0.046561471100123514}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.5982905982905983, "acc_stderr,none": 0.03211693751051621}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.46, "acc_stderr,none": 0.05009082659620332}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.5134099616858238, "acc_stderr,none": 0.017873531736510403}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.45751633986928103, "acc_stderr,none": 0.028526383452142635}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.31560283687943264, "acc_stderr,none": 0.027724989449509317}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.34558823529411764, "acc_stderr,none": 0.02888819310398864}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.42771084337349397, "acc_stderr,none": 0.038515976837185335}, "mmlu_social_sciences": {"acc,none": 0.41826454338641533, "acc_stderr,none": 0.008803567935028352, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.04266339443159394}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3939393939393939, "acc_stderr,none": 0.03481285338232963}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.42487046632124353, "acc_stderr,none": 0.035674713352125395}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3230769230769231, "acc_stderr,none": 0.023710888501970565}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.33613445378151263, "acc_stderr,none": 0.030684737115135367}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.46605504587155966, "acc_stderr,none": 0.02138786335035399}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.5419847328244275, "acc_stderr,none": 0.04369802690578757}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.4084967320261438, "acc_stderr,none": 0.019886221037501865}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.41818181818181815, "acc_stderr,none": 0.047245774057315726}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3877551020408163, "acc_stderr,none": 0.031192230726795656}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.5671641791044776, "acc_stderr,none": 0.0350349092367328}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.58, "acc_stderr,none": 0.049604496374885836}, "mmlu_stem": {"acc,none": 0.3342848081192515, "acc_stderr,none": 0.008318252074239184, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.34, "acc_stderr,none": 0.047609522856952344}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.4444444444444444, "acc_stderr,none": 0.04292596718256981}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3881578947368421, "acc_stderr,none": 0.03965842097512744}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3611111111111111, "acc_stderr,none": 0.04016660030451233}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.04336432707993177}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.49, "acc_stderr,none": 0.05024183937956911}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.4085106382978723, "acc_stderr,none": 0.03213418026701576}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.42758620689655175, "acc_stderr,none": 0.041227371113703316}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.28835978835978837, "acc_stderr,none": 0.023330654054535886}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.432258064516129, "acc_stderr,none": 0.02818173972001941}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.3103448275862069, "acc_stderr,none": 0.03255086769970103}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.02646611753895992}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.24503311258278146, "acc_stderr,none": 0.035118075718047245}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.25462962962962965, "acc_stderr,none": 0.02971127586000534}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.33035714285714285, "acc_stderr,none": 0.044642857142857116}, "sciq": {"alias": "sciq", "acc,none": 0.945, "acc_stderr,none": 0.007212976294639234, "acc_norm,none": 0.925, "acc_norm_stderr,none": 0.008333333333333368}}
{"created_at": "2025-09-03T07:47:47.816895", "global_step": 300000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.3546273546273546, "acc_stderr,none": 0.013696559157990467}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.5187213702449711, "acc_stderr,none": 0.004986282450647318, "acc_norm,none": 0.6926906990639314, "acc_norm_stderr,none": 0.004604357610190324}, "mmlu": {"acc,none": 0.4040734938043014, "acc_stderr,none": 0.004064100528499119, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.39086078639744953, "acc_stderr,none": 0.006969291428438996, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.03893259610604673}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.5636363636363636, "acc_stderr,none": 0.03872592983524754}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.5245098039215687, "acc_stderr,none": 0.035050931943487976}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.5316455696202531, "acc_stderr,none": 0.032481974005110756}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.5371900826446281, "acc_stderr,none": 0.04551711196104218}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.4537037037037037, "acc_stderr,none": 0.04812917324536823}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.4539877300613497, "acc_stderr,none": 0.0391170190467718}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.4393063583815029, "acc_stderr,none": 0.026720034380514995}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2748603351955307, "acc_stderr,none": 0.014931316703220504}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.4533762057877814, "acc_stderr,none": 0.02827435985489425}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.4567901234567901, "acc_stderr,none": 0.027716661650194038}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.3285528031290743, "acc_stderr,none": 0.011996027247502917}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.5964912280701754, "acc_stderr,none": 0.037627386999170565}, "mmlu_other": {"acc,none": 0.4496298680399099, "acc_stderr,none": 0.008777619184681194, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.37358490566037733, "acc_stderr,none": 0.029773082713319878}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.34104046242774566, "acc_stderr,none": 0.036146654241808254}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.48878923766816146, "acc_stderr,none": 0.033549366530984746}, "mmlu_management": {"alias": " - management", "acc,none": 0.34951456310679613, "acc_stderr,none": 0.047211885060971716}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.6153846153846154, "acc_stderr,none": 0.03187195347942466}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.51, "acc_stderr,none": 0.05024183937956912}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.5389527458492975, "acc_stderr,none": 0.017825621793239016}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.4803921568627451, "acc_stderr,none": 0.028607893699576063}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.30851063829787234, "acc_stderr,none": 0.02755336616510137}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.4007352941176471, "acc_stderr,none": 0.029768263528933105}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.42168674698795183, "acc_stderr,none": 0.03844453181770917}, "mmlu_social_sciences": {"acc,none": 0.4354891127721807, "acc_stderr,none": 0.00882657417845094, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3838383838383838, "acc_stderr,none": 0.03464881675016338}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.48704663212435234, "acc_stderr,none": 0.036072280610477486}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.34102564102564104, "acc_stderr,none": 0.02403548967633508}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.37815126050420167, "acc_stderr,none": 0.03149930577784906}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.48807339449541287, "acc_stderr,none": 0.021431223617362233}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.5725190839694656, "acc_stderr,none": 0.043389203057924}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.41830065359477125, "acc_stderr,none": 0.019955975145835553}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.44545454545454544, "acc_stderr,none": 0.047605488214603246}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.4, "acc_stderr,none": 0.03136250240935893}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.5920398009950248, "acc_stderr,none": 0.03475116365194092}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.58, "acc_stderr,none": 0.049604496374885836}, "mmlu_stem": {"acc,none": 0.34823977164605135, "acc_stderr,none": 0.008391888280453908, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.4444444444444444, "acc_stderr,none": 0.04292596718256981}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.40789473684210525, "acc_stderr,none": 0.03999309712777471}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3611111111111111, "acc_stderr,none": 0.04016660030451233}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.045338381959297736}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.49, "acc_stderr,none": 0.05024183937956911}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.4127659574468085, "acc_stderr,none": 0.03218471141400351}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.4413793103448276, "acc_stderr,none": 0.04137931034482758}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2830687830687831, "acc_stderr,none": 0.023201392938194978}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.47096774193548385, "acc_stderr,none": 0.028396016402760998}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2955665024630542, "acc_stderr,none": 0.032104944337514575}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.42, "acc_stderr,none": 0.049604496374885836}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.27037037037037037, "acc_stderr,none": 0.02708037281514566}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23841059602649006, "acc_stderr,none": 0.03479185572599661}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2824074074074074, "acc_stderr,none": 0.03070137211151092}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3392857142857143, "acc_stderr,none": 0.04493949068613539}, "sciq": {"alias": "sciq", "acc,none": 0.946, "acc_stderr,none": 0.007150883521295438, "acc_norm,none": 0.923, "acc_norm_stderr,none": 0.008434580140240672}}