diff --git "a/src/detail_math_score.json" "b/src/detail_math_score.json" --- "a/src/detail_math_score.json" +++ "b/src/detail_math_score.json" @@ -1,5 +1,5 @@ { - "time": "2025-01-24 15:10:27", + "time": "2025-02-11 13:22:59", "results": { "IO": { "gpt-3.5-turbo": { @@ -13,7 +13,7 @@ "Pass rate": 0.9992, "Cost($)": 0.3328, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 586553, "Total input tokens": 546990, @@ -26,13 +26,26 @@ "Pass rate": 1.0, "Cost($)": 0.038, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 42471, "Total input tokens": 25701, "Average input tokens": 101, "Total output tokens": 16770, "Average output tokens": 66 + }, + "MATH-500": { + "Score": 17.2, + "Pass rate": 1.0, + "Cost($)": 0.2436, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 265625, + "Total input tokens": 154881, + "Average input tokens": 310, + "Total output tokens": 110744, + "Average output tokens": 221 } }, "Doubao-lite-32k": { @@ -46,7 +59,7 @@ "Pass rate": 0.9992, "Cost($)": 0.0354, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 740483, "Total input tokens": 617377, @@ -59,13 +72,26 @@ "Pass rate": 1.0, "Cost($)": 0.0058, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 87742, "Total input tokens": 33058, "Average input tokens": 130, "Total output tokens": 54684, "Average output tokens": 215 + }, + "MATH-500": { + "Score": 37.4, + "Pass rate": 1.0, + "Cost($)": 0.0187, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 311730, + "Total input tokens": 166870, + "Average input tokens": 334, + "Total output tokens": 144860, + "Average output tokens": 290 } }, "gpt-4o": { @@ -79,7 +105,7 @@ "Pass rate": 1.0, "Cost($)": 3.3463, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 741446, "Total input tokens": 542416, @@ -92,13 +118,26 @@ "Pass rate": 0.9724, "Cost($)": 1.1453, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 133752, "Total input tokens": 25631, "Average input tokens": 101, "Total output tokens": 108121, "Average output tokens": 426 + }, + "MATH-500": { + "Score": 41.8, + "Pass rate": 1.0, + "Cost($)": 2.7907, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 394447, + "Total input tokens": 153832, + "Average input tokens": 308, + "Total output tokens": 240615, + "Average output tokens": 481 } }, "Qwen2.5-72B-Instruct": { @@ -112,7 +151,7 @@ "Pass rate": 1.0, "Cost($)": 0.4899, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 869060, "Total input tokens": 555340, @@ -125,13 +164,26 @@ "Pass rate": 0.9961, "Cost($)": 0.0742, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 131604, "Total input tokens": 25397, "Average input tokens": 100, "Total output tokens": 106207, "Average output tokens": 418 + }, + "MATH-500": { + "Score": 70.2, + "Pass rate": 1.0, + "Cost($)": 0.2506, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 444591, + "Total input tokens": 169549, + "Average input tokens": 339, + "Total output tokens": 275042, + "Average output tokens": 550 } }, "Llama-3.3-70B-Instruct": { @@ -145,7 +197,7 @@ "Pass rate": 1.0, "Cost($)": 0.4709, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 835275, "Total input tokens": 583916, @@ -158,13 +210,26 @@ "Pass rate": 0.9921, "Cost($)": 0.0798, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 141567, "Total input tokens": 32809, "Average input tokens": 129, "Total output tokens": 108758, "Average output tokens": 428 + }, + "MATH-500": { + "Score": 69.4, + "Pass rate": 1.0, + "Cost($)": 0.2386, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 423216, + "Total input tokens": 155879, + "Average input tokens": 312, + "Total output tokens": 267337, + "Average output tokens": 535 } }, "Qwen2.5-7B-Instruct": { @@ -178,7 +243,7 @@ "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 887913, "Total input tokens": 596229, @@ -191,13 +256,26 @@ "Pass rate": 0.9843, "Cost($)": 0.0, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 137771, "Total input tokens": 33271, "Average input tokens": 131, "Total output tokens": 104500, "Average output tokens": 411 + }, + "MATH-500": { + "Score": 59.4, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 411362, + "Total input tokens": 169549, + "Average input tokens": 339, + "Total output tokens": 241813, + "Average output tokens": 484 } }, "Llama-3.1-8B-Instruct": { @@ -211,7 +289,7 @@ "Pass rate": 0.9955, "Cost($)": 0.0, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 1745429, "Total input tokens": 550941, @@ -224,13 +302,26 @@ "Pass rate": 0.9882, "Cost($)": 0.0, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 133106, "Total input tokens": 26459, "Average input tokens": 104, "Total output tokens": 106647, "Average output tokens": 420 + }, + "MATH-500": { + "Score": 38.6, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 503934, + "Total input tokens": 155563, + "Average input tokens": 311, + "Total output tokens": 348371, + "Average output tokens": 697 } }, "Internllm2_5-7B": { @@ -244,7 +335,7 @@ "Pass rate": 0.9795, "Cost($)": 0.0, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 1113728, "Total input tokens": 679302, @@ -257,13 +348,26 @@ "Pass rate": 0.9094, "Cost($)": 0.0, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 185041, "Total input tokens": 50232, "Average input tokens": 198, "Total output tokens": 134809, "Average output tokens": 531 + }, + "MATH-500": { + "Score": 22.8, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 467888, + "Total input tokens": 201883, + "Average input tokens": 404, + "Total output tokens": 266005, + "Average output tokens": 532 } }, "Qwen2-1.5B-Instruct": { @@ -277,7 +381,7 @@ "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 736996, "Total input tokens": 568530, @@ -290,13 +394,26 @@ "Pass rate": 0.9764, "Cost($)": 0.0, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 71047, "Total input tokens": 27937, "Average input tokens": 110, "Total output tokens": 43110, "Average output tokens": 170 + }, + "MATH-500": { + "Score": 7.0, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 413878, + "Total input tokens": 158777, + "Average input tokens": 318, + "Total output tokens": 255101, + "Average output tokens": 510 } }, "Qwen2-0.5B-Instruct": { @@ -310,7 +427,7 @@ "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 834897, "Total input tokens": 568116, @@ -323,13 +440,72 @@ "Pass rate": 0.9882, "Cost($)": 0.0, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 110415, "Total input tokens": 27937, "Average input tokens": 110, "Total output tokens": 82478, "Average output tokens": 325 + }, + "MATH-500": { + "Score": 2.6, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 429330, + "Total input tokens": 159049, + "Average input tokens": 318, + "Total output tokens": 270281, + "Average output tokens": 541 + } + }, + "deepseek-r1:1.5b": { + "META": { + "Algorithm": "IO", + "LLM": "deepseek-r1:1.5b", + "Eval Date": "2025/1/22" + }, + "gsm8k": { + "Score": 64.14, + "Pass rate": 0.9962, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "8", + "Samples": 1319, + "All tokens": 1483051, + "Total input tokens": 561935, + "Average input tokens": 426, + "Total output tokens": 921116, + "Average output tokens": 698 + }, + "AQuA": { + "Score": 68.9, + "Pass rate": 0.9488, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "0", + "Samples": 254, + "All tokens": 351767, + "Total input tokens": 26667, + "Average input tokens": 105, + "Total output tokens": 325100, + "Average output tokens": 1280 + }, + "MATH-500": { + "Score": 43.8, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 1022548, + "Total input tokens": 157049, + "Average input tokens": 314, + "Total output tokens": 865499, + "Average output tokens": 1731 } } }, @@ -345,7 +521,7 @@ "Pass rate": 0.9939, "Cost($)": 3.4633, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 6646286, "Total input tokens": 6506164, @@ -358,13 +534,26 @@ "Pass rate": 0.9803, "Cost($)": 0.4928, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 903587, "Total input tokens": 862614, "Average input tokens": 3396, "Total output tokens": 40973, "Average output tokens": 161 + }, + "MATH-500": { + "Score": 23.8, + "Pass rate": 1.0, + "Cost($)": 2.0406, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 3832714, + "Total input tokens": 3708461, + "Average input tokens": 7417, + "Total output tokens": 124253, + "Average output tokens": 249 } }, "Doubao-lite-32k": { @@ -378,7 +567,7 @@ "Pass rate": 0.9962, "Cost($)": 0.2512, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 5998639, "Total input tokens": 5862016, @@ -391,13 +580,26 @@ "Pass rate": 0.9606, "Cost($)": 0.0445, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 1032841, "Total input tokens": 977890, "Average input tokens": 3850, "Total output tokens": 54951, "Average output tokens": 216 + }, + "MATH-500": { + "Score": 47.2, + "Pass rate": 1.0, + "Cost($)": 0.186, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 4388666, + "Total input tokens": 4234620, + "Average input tokens": 8469, + "Total output tokens": 154046, + "Average output tokens": 308 } }, "gpt-4o": { @@ -411,7 +613,7 @@ "Pass rate": 0.9955, "Cost($)": 39.0751, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 14715887, "Total input tokens": 14411173, @@ -424,13 +626,26 @@ "Pass rate": 0.9724, "Cost($)": 2.304, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 692096, "Total input tokens": 615589, "Average input tokens": 2424, "Total output tokens": 76507, "Average output tokens": 301 + }, + "MATH-500": { + "Score": 54.0, + "Pass rate": 1.0, + "Cost($)": 17.7735, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 6153255, + "Total input tokens": 5834537, + "Average input tokens": 11669, + "Total output tokens": 318718, + "Average output tokens": 637 } }, "Qwen2.5-72B-Instruct": { @@ -444,7 +659,7 @@ "Pass rate": 1.0, "Cost($)": 10.5479, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 18710437, "Total input tokens": 18160983, @@ -457,13 +672,26 @@ "Pass rate": 1.0, "Cost($)": 0.3177, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 563603, "Total input tokens": 441765, "Average input tokens": 1739, "Total output tokens": 121838, "Average output tokens": 480 + }, + "MATH-500": { + "Score": 62.8, + "Pass rate": 1.0, + "Cost($)": 3.4541, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 6127117, + "Total input tokens": 5747268, + "Average input tokens": 11495, + "Total output tokens": 379849, + "Average output tokens": 760 } }, "Llama-3.3-70B-Instruct": { @@ -477,7 +705,7 @@ "Pass rate": 0.9992, "Cost($)": 10.1124, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 17937864, "Total input tokens": 17038928, @@ -490,13 +718,26 @@ "Pass rate": 0.9961, "Cost($)": 0.768, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 1362379, "Total input tokens": 1119143, "Average input tokens": 4406, "Total output tokens": 243236, "Average output tokens": 958 + }, + "MATH-500": { + "Score": 64.6, + "Pass rate": 1.0, + "Cost($)": 3.1806, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 5641879, + "Total input tokens": 5223611, + "Average input tokens": 10447, + "Total output tokens": 418268, + "Average output tokens": 837 } }, "Qwen2.5-7B-Instruct": { @@ -510,7 +751,7 @@ "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 14850914, "Total input tokens": 14355752, @@ -523,13 +764,26 @@ "Pass rate": 0.9921, "Cost($)": 0.0, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 695844, "Total input tokens": 564165, "Average input tokens": 2221, "Total output tokens": 131679, "Average output tokens": 518 + }, + "MATH-500": { + "Score": 48.8, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 4990240, + "Total input tokens": 4646708, + "Average input tokens": 9293, + "Total output tokens": 343532, + "Average output tokens": 687 } }, "Llama-3.1-8B-Instruct": { @@ -543,7 +797,7 @@ "Pass rate": 0.9856, "Cost($)": 0.0, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 22835767, "Total input tokens": 21044978, @@ -556,13 +810,26 @@ "Pass rate": 0.9685, "Cost($)": 0.0, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 4340821, "Total input tokens": 3764723, "Average input tokens": 14822, "Total output tokens": 576098, "Average output tokens": 2268 + }, + "MATH-500": { + "Score": 28.8, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 8763629, + "Total input tokens": 7486706, + "Average input tokens": 14973, + "Total output tokens": 1276923, + "Average output tokens": 2554 } }, "Internllm2_5-7B": { @@ -576,7 +843,7 @@ "Pass rate": 0.9795, "Cost($)": 0.0, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 35669989, "Total input tokens": 30120070, @@ -589,13 +856,26 @@ "Pass rate": 0.9685, "Cost($)": 0.0, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 4428801, "Total input tokens": 3592039, "Average input tokens": 14142, "Total output tokens": 836762, "Average output tokens": 3294 + }, + "MATH-500": { + "Score": 14.8, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 14186105, + "Total input tokens": 11831496, + "Average input tokens": 23663, + "Total output tokens": 2354609, + "Average output tokens": 4709 } }, "Qwen2-1.5B-Instruct": { @@ -609,7 +889,7 @@ "Pass rate": 0.8021, "Cost($)": 0.0, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 9828001, "Total input tokens": 9133603, @@ -622,13 +902,26 @@ "Pass rate": 0.9606, "Cost($)": 0.0, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 5072004, "Total input tokens": 4555858, "Average input tokens": 17936, "Total output tokens": 516146, "Average output tokens": 2032 + }, + "MATH-500": { + "Score": 8.2, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 8987061, + "Total input tokens": 8430774, + "Average input tokens": 16862, + "Total output tokens": 556287, + "Average output tokens": 1113 } }, "Qwen2-0.5B-Instruct": { @@ -642,7 +935,7 @@ "Pass rate": 0.9522, "Cost($)": 0.0, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 55392611, "Total input tokens": 52431343, @@ -655,13 +948,72 @@ "Pass rate": 0.9685, "Cost($)": 0.0, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 7170087, "Total input tokens": 6344167, "Average input tokens": 24977, "Total output tokens": 825920, "Average output tokens": 3252 + }, + "MATH-500": { + "Score": 0.6, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 19442440, + "Total input tokens": 18137392, + "Average input tokens": 36275, + "Total output tokens": 1305048, + "Average output tokens": 2610 + } + }, + "deepseek-r1:1.5b": { + "META": { + "Algorithm": "ReAct-Pro*", + "LLM": "deepseek-r1:1.5b", + "Eval Date": "2025/2/10" + }, + "gsm8k": { + "Score": 35.94, + "Pass rate": 0.9962, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "8", + "Samples": 1319, + "All tokens": 24219077, + "Total input tokens": 19299381, + "Average input tokens": 14632, + "Total output tokens": 4919696, + "Average output tokens": 3730 + }, + "AQuA": { + "Score": 54.33, + "Pass rate": 0.9646, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "0", + "Samples": 254, + "All tokens": 14445041, + "Total input tokens": 10578715, + "Average input tokens": 41648, + "Total output tokens": 3866326, + "Average output tokens": 15222 + }, + "MATH-500": { + "Score": 24.4, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 30177348, + "Total input tokens": 20729970, + "Average input tokens": 41460, + "Total output tokens": 9447378, + "Average output tokens": 18895 } } }, @@ -677,7 +1029,7 @@ "Pass rate": 0.9924, "Cost($)": 0.6902, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 1187080, "Total input tokens": 1090418, @@ -690,13 +1042,26 @@ "Pass rate": 1.0, "Cost($)": 0.1748, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 266654, "Total input tokens": 225162, "Average input tokens": 886, "Total output tokens": 41492, "Average output tokens": 163 + }, + "MATH-500": { + "Score": 28.8, + "Pass rate": 0.838, + "Cost($)": 0.168, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 271916, + "Total input tokens": 239902, + "Average input tokens": 480, + "Total output tokens": 32014, + "Average output tokens": 64 } }, "Doubao-lite-32k": { @@ -710,7 +1075,7 @@ "Pass rate": 0.9257, "Cost($)": 0.0576, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 1288055, "Total input tokens": 1170038, @@ -723,13 +1088,26 @@ "Pass rate": 0.9685, "Cost($)": 0.0147, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 309436, "Total input tokens": 259863, "Average input tokens": 1023, "Total output tokens": 49573, "Average output tokens": 195 + }, + "MATH-500": { + "Score": 32.6, + "Pass rate": 0.68, + "Cost($)": 0.0144, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 303148, + "Total input tokens": 254377, + "Average input tokens": 509, + "Total output tokens": 48771, + "Average output tokens": 98 } }, "gpt-4o": { @@ -743,7 +1121,7 @@ "Pass rate": 0.9977, "Cost($)": 4.2166, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 1247912, "Total input tokens": 1101672, @@ -756,13 +1134,26 @@ "Pass rate": 1.0, "Cost($)": 1.6087, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 327908, "Total input tokens": 222717, "Average input tokens": 877, "Total output tokens": 105191, "Average output tokens": 414 + }, + "MATH-500": { + "Score": 46.2, + "Pass rate": 0.864, + "Cost($)": 1.5994, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 340960, + "Total input tokens": 241357, + "Average input tokens": 483, + "Total output tokens": 99603, + "Average output tokens": 199 } }, "Qwen2.5-72B-Instruct": { @@ -776,7 +1167,7 @@ "Pass rate": 0.9939, "Cost($)": 0.7054, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 1251210, "Total input tokens": 1106682, @@ -789,13 +1180,26 @@ "Pass rate": 1.0, "Cost($)": 0.1645, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 291764, "Total input tokens": 249215, "Average input tokens": 981, "Total output tokens": 42549, "Average output tokens": 168 + }, + "MATH-500": { + "Score": 47.2, + "Pass rate": 0.822, + "Cost($)": 0.233, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 413372, + "Total input tokens": 242549, + "Average input tokens": 485, + "Total output tokens": 170823, + "Average output tokens": 342 } }, "Llama-3.3-70B-Instruct": { @@ -809,7 +1213,7 @@ "Pass rate": 0.7961, "Cost($)": 0.9736, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 1727044, "Total input tokens": 1126025, @@ -822,13 +1226,26 @@ "Pass rate": 0.9921, "Cost($)": 0.1746, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 309799, "Total input tokens": 240735, "Average input tokens": 948, "Total output tokens": 69064, "Average output tokens": 272 + }, + "MATH-500": { + "Score": 42.6, + "Pass rate": 0.802, + "Cost($)": 0.2839, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 503596, + "Total input tokens": 253879, + "Average input tokens": 508, + "Total output tokens": 249717, + "Average output tokens": 499 } }, "Qwen2.5-7B-Instruct": { @@ -842,7 +1259,7 @@ "Pass rate": 0.7051, "Cost($)": 0.0, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 1362822, "Total input tokens": 1145390, @@ -855,13 +1272,26 @@ "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 313728, "Total input tokens": 264517, "Average input tokens": 1041, "Total output tokens": 49211, "Average output tokens": 194 + }, + "MATH-500": { + "Score": 39.6, + "Pass rate": 0.744, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 408812, + "Total input tokens": 258549, + "Average input tokens": 517, + "Total output tokens": 150263, + "Average output tokens": 301 } }, "Llama-3.1-8B-Instruct": { @@ -875,7 +1305,7 @@ "Pass rate": 0.5542, "Cost($)": 0.0, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 1391111, "Total input tokens": 1147538, @@ -888,13 +1318,26 @@ "Pass rate": 0.9685, "Cost($)": 0.0, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 290914, "Total input tokens": 240613, "Average input tokens": 947, "Total output tokens": 50301, "Average output tokens": 198 + }, + "MATH-500": { + "Score": 25.4, + "Pass rate": 0.684, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 462271, + "Total input tokens": 253879, + "Average input tokens": 508, + "Total output tokens": 208392, + "Average output tokens": 417 } }, "Internllm2_5-7B": { @@ -908,7 +1351,7 @@ "Pass rate": 0.489, "Cost($)": 0.0, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 1324949, "Total input tokens": 1136843, @@ -921,13 +1364,26 @@ "Pass rate": 0.9882, "Cost($)": 0.0, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 301962, "Total input tokens": 233505, "Average input tokens": 919, "Total output tokens": 68457, "Average output tokens": 270 + }, + "MATH-500": { + "Score": 15.0, + "Pass rate": 0.324, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 368709, + "Total input tokens": 247883, + "Average input tokens": 496, + "Total output tokens": 120826, + "Average output tokens": 242 } }, "Qwen2-1.5B-Instruct": { @@ -941,7 +1397,7 @@ "Pass rate": 0.3101, "Cost($)": 0.0, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 1327522, "Total input tokens": 1151528, @@ -954,13 +1410,26 @@ "Pass rate": 0.9646, "Cost($)": 0.0, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 298475, "Total input tokens": 246560, "Average input tokens": 971, "Total output tokens": 51915, "Average output tokens": 204 + }, + "MATH-500": { + "Score": 0.8, + "Pass rate": 0.022, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 786870, + "Total input tokens": 248509, + "Average input tokens": 497, + "Total output tokens": 538361, + "Average output tokens": 1077 } }, "Qwen2-0.5B-Instruct": { @@ -970,11 +1439,11 @@ "Eval Date": "2025/1/22" }, "gsm8k": { - "Score": 9.62, + "Score": 9.63, "Pass rate": 0.1691, "Cost($)": 0.0, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 1389135, "Total input tokens": 1151528, @@ -987,13 +1456,72 @@ "Pass rate": 0.9213, "Cost($)": 0.0, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 322281, "Total input tokens": 258867, "Average input tokens": 1019, "Total output tokens": 63414, "Average output tokens": 250 + }, + "MATH-500": { + "Score": 0.0, + "Pass rate": 0.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 437202, + "Total input tokens": 253549, + "Average input tokens": 507, + "Total output tokens": 183653, + "Average output tokens": 367 + } + }, + "deepseek-r1:1.5b": { + "META": { + "Algorithm": "PoT", + "LLM": "deepseek-r1:1.5b", + "Eval Date": "2025/2/10" + }, + "gsm8k": { + "Score": 11.9, + "Pass rate": 0.1744, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "8", + "Samples": 1319, + "All tokens": 1954509, + "Total input tokens": 1138872, + "Average input tokens": 863, + "Total output tokens": 815637, + "Average output tokens": 618 + }, + "AQuA": { + "Score": 54.72, + "Pass rate": 0.9724, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "0", + "Samples": 254, + "All tokens": 1016647, + "Total input tokens": 250690, + "Average input tokens": 987, + "Total output tokens": 765957, + "Average output tokens": 3016 + }, + "MATH-500": { + "Score": 1.0, + "Pass rate": 0.016, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 1031067, + "Total input tokens": 245549, + "Average input tokens": 491, + "Total output tokens": 785518, + "Average output tokens": 1571 } } }, @@ -1009,7 +1537,7 @@ "Pass rate": 1.0, "Cost($)": 0.6788, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 1088041, "Total input tokens": 953242, @@ -1022,13 +1550,26 @@ "Pass rate": 0.937, "Cost($)": 0.0957, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 80793, "Total input tokens": 25447, "Average input tokens": 100, "Total output tokens": 55346, "Average output tokens": 218 + }, + "MATH-500": { + "Score": 39.8, + "Pass rate": 1.0, + "Cost($)": 0.3189, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 432196, + "Total input tokens": 329381, + "Average input tokens": 659, + "Total output tokens": 102815, + "Average output tokens": 206 } }, "Doubao-lite-32k": { @@ -1042,7 +1583,7 @@ "Pass rate": 1.0, "Cost($)": 0.0558, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 1201820, "Total input tokens": 1042095, @@ -1055,13 +1596,26 @@ "Pass rate": 0.9724, "Cost($)": 0.0066, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 94577, "Total input tokens": 27978, "Average input tokens": 110, "Total output tokens": 66599, "Average output tokens": 262 + }, + "MATH-500": { + "Score": 59.0, + "Pass rate": 1.0, + "Cost($)": 0.0255, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 479941, + "Total input tokens": 336370, + "Average input tokens": 673, + "Total output tokens": 143571, + "Average output tokens": 287 } }, "gpt-4o": { @@ -1075,7 +1629,7 @@ "Pass rate": 1.0, "Cost($)": 4.5367, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 1165166, "Total input tokens": 948668, @@ -1088,13 +1642,26 @@ "Pass rate": 0.9803, "Cost($)": 1.0417, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 123017, "Total input tokens": 25123, "Average input tokens": 99, "Total output tokens": 97894, "Average output tokens": 385 + }, + "MATH-500": { + "Score": 68.0, + "Pass rate": 1.0, + "Cost($)": 3.0569, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 552688, + "Total input tokens": 329332, + "Average input tokens": 659, + "Total output tokens": 223356, + "Average output tokens": 447 } }, "Qwen2.5-72B-Instruct": { @@ -1108,7 +1675,7 @@ "Pass rate": 1.0, "Cost($)": 0.7195, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 1276252, "Total input tokens": 1005119, @@ -1121,13 +1688,26 @@ "Pass rate": 0.9921, "Cost($)": 0.0808, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 143289, "Total input tokens": 25143, "Average input tokens": 99, "Total output tokens": 118146, "Average output tokens": 465 + }, + "MATH-500": { + "Score": 80.2, + "Pass rate": 1.0, + "Cost($)": 0.349, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 619015, + "Total input tokens": 338549, + "Average input tokens": 677, + "Total output tokens": 280466, + "Average output tokens": 561 } }, "Llama-3.3-70B-Instruct": { @@ -1141,7 +1721,7 @@ "Pass rate": 1.0, "Cost($)": 0.687, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 1218665, "Total input tokens": 990168, @@ -1154,13 +1734,26 @@ "Pass rate": 0.9843, "Cost($)": 0.0927, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 164389, "Total input tokens": 32555, "Average input tokens": 128, "Total output tokens": 131834, "Average output tokens": 519 + }, + "MATH-500": { + "Score": 71.2, + "Pass rate": 1.0, + "Cost($)": 0.3463, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 614221, + "Total input tokens": 342879, + "Average input tokens": 686, + "Total output tokens": 271342, + "Average output tokens": 543 } }, "Qwen2.5-7B-Instruct": { @@ -1174,7 +1767,7 @@ "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 1290805, "Total input tokens": 1046008, @@ -1187,13 +1780,26 @@ "Pass rate": 0.9961, "Cost($)": 0.0, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 149736, "Total input tokens": 33017, "Average input tokens": 130, "Total output tokens": 116719, "Average output tokens": 460 + }, + "MATH-500": { + "Score": 69.8, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 617204, + "Total input tokens": 354049, + "Average input tokens": 708, + "Total output tokens": 263155, + "Average output tokens": 526 } }, "Llama-3.1-8B-Instruct": { @@ -1207,7 +1813,7 @@ "Pass rate": 0.9992, "Cost($)": 0.0, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 1248329, "Total input tokens": 990168, @@ -1220,13 +1826,26 @@ "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 144435, "Total input tokens": 32555, "Average input tokens": 128, "Total output tokens": 111880, "Average output tokens": 440 + }, + "MATH-500": { + "Score": 25.8, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 625568, + "Total input tokens": 342879, + "Average input tokens": 686, + "Total output tokens": 282689, + "Average output tokens": 565 } }, "Internllm2_5-7B": { @@ -1240,7 +1859,7 @@ "Pass rate": 0.997, "Cost($)": 0.0, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 1202163, "Total input tokens": 968163, @@ -1253,13 +1872,26 @@ "Pass rate": 0.8937, "Cost($)": 0.0, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 127520, "Total input tokens": 26610, "Average input tokens": 105, "Total output tokens": 100910, "Average output tokens": 397 + }, + "MATH-500": { + "Score": 46.6, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 546774, + "Total input tokens": 332883, + "Average input tokens": 666, + "Total output tokens": 213891, + "Average output tokens": 428 } }, "Qwen2-1.5B-Instruct": { @@ -1273,7 +1905,7 @@ "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 1218525, "Total input tokens": 1032818, @@ -1286,13 +1918,26 @@ "Pass rate": 0.9882, "Cost($)": 0.0, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 110040, "Total input tokens": 30477, "Average input tokens": 120, "Total output tokens": 79563, "Average output tokens": 313 + }, + "MATH-500": { + "Score": 15.2, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 536377, + "Total input tokens": 349049, + "Average input tokens": 698, + "Total output tokens": 187328, + "Average output tokens": 375 } }, "Qwen2-0.5B-Instruct": { @@ -1306,7 +1951,7 @@ "Pass rate": 0.9992, "Cost($)": 0.0, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 1223459, "Total input tokens": 1032818, @@ -1319,13 +1964,72 @@ "Pass rate": 0.9882, "Cost($)": 0.0, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 117339, "Total input tokens": 30477, "Average input tokens": 120, "Total output tokens": 86862, "Average output tokens": 342 + }, + "MATH-500": { + "Score": 6.2, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 549188, + "Total input tokens": 349049, + "Average input tokens": 698, + "Total output tokens": 200139, + "Average output tokens": 400 + } + }, + "deepseek-r1:1.5b": { + "META": { + "Algorithm": "CoT", + "LLM": "deepseek-r1:1.5b", + "Eval Date": "2025/1/23" + }, + "gsm8k": { + "Score": 70.66, + "Pass rate": 0.9977, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "8", + "Samples": 1319, + "All tokens": 2090625, + "Total input tokens": 1011714, + "Average input tokens": 767, + "Total output tokens": 1078911, + "Average output tokens": 818 + }, + "AQuA": { + "Score": 71.65, + "Pass rate": 0.9685, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "0", + "Samples": 254, + "All tokens": 333072, + "Total input tokens": 26413, + "Average input tokens": 104, + "Total output tokens": 306659, + "Average output tokens": 1207 + }, + "MATH-500": { + "Score": 49.4, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 1199129, + "Total input tokens": 341549, + "Average input tokens": 683, + "Total output tokens": 857580, + "Average output tokens": 1715 } } }, @@ -1341,7 +2045,7 @@ "Pass rate": 0.9992, "Cost($)": 3.3938, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 4089612, "Total input tokens": 2740652, @@ -1354,13 +2058,26 @@ "Pass rate": 0.9921, "Cost($)": 0.7888, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 847335, "Total input tokens": 482192, "Average input tokens": 1898, "Total output tokens": 365143, "Average output tokens": 1438 + }, + "MATH-500": { + "Score": 28.8, + "Pass rate": 1.0, + "Cost($)": 1.9764, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 2238812, + "Total input tokens": 1381818, + "Average input tokens": 2764, + "Total output tokens": 856994, + "Average output tokens": 1714 } }, "Doubao-lite-32k": { @@ -1374,7 +2091,7 @@ "Pass rate": 0.9992, "Cost($)": 0.2083, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 3888813, "Total input tokens": 2691714, @@ -1387,13 +2104,26 @@ "Pass rate": 0.9724, "Cost($)": 0.0519, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 885986, "Total input tokens": 503751, "Average input tokens": 1983, "Total output tokens": 382235, "Average output tokens": 1505 + }, + "MATH-500": { + "Score": 49.2, + "Pass rate": 1.0, + "Cost($)": 0.1406, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 2470810, + "Total input tokens": 1507651, + "Average input tokens": 3015, + "Total output tokens": 963159, + "Average output tokens": 1926 } }, "gpt-4o": { @@ -1407,7 +2137,7 @@ "Pass rate": 0.9992, "Cost($)": 31.0542, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 5798173, "Total input tokens": 3590336, @@ -1420,13 +2150,26 @@ "Pass rate": 0.9882, "Cost($)": 8.1485, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 1373206, "Total input tokens": 744478, "Average input tokens": 2931, "Total output tokens": 628728, "Average output tokens": 2475 + }, + "MATH-500": { + "Score": 34.4, + "Pass rate": 1.0, + "Cost($)": 19.6538, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 3455323, + "Total input tokens": 1986584, + "Average input tokens": 3973, + "Total output tokens": 1468739, + "Average output tokens": 2937 } }, "Qwen2.5-72B-Instruct": { @@ -1440,7 +2183,7 @@ "Pass rate": 1.0, "Cost($)": 5.9858, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 10618008, "Total input tokens": 8136223, @@ -1453,13 +2196,26 @@ "Pass rate": 0.9921, "Cost($)": 1.0348, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 1835669, "Total input tokens": 1051218, "Average input tokens": 4139, "Total output tokens": 784451, "Average output tokens": 3088 + }, + "MATH-500": { + "Score": 74.0, + "Pass rate": 1.0, + "Cost($)": 3.1556, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 5597513, + "Total input tokens": 3823997, + "Average input tokens": 7648, + "Total output tokens": 1773516, + "Average output tokens": 3547 } }, "Llama-3.3-70B-Instruct": { @@ -1473,7 +2229,7 @@ "Pass rate": 1.0, "Cost($)": 6.2005, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 10998794, "Total input tokens": 8413717, @@ -1486,13 +2242,26 @@ "Pass rate": 0.9921, "Cost($)": 1.0756, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 1907924, "Total input tokens": 1135251, "Average input tokens": 4469, "Total output tokens": 772673, "Average output tokens": 3042 + }, + "MATH-500": { + "Score": 74.2, + "Pass rate": 1.0, + "Cost($)": 3.2239, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 5718739, + "Total input tokens": 3959492, + "Average input tokens": 7919, + "Total output tokens": 1759247, + "Average output tokens": 3518 } }, "Qwen2.5-7B-Instruct": { @@ -1506,7 +2275,7 @@ "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 11140985, "Total input tokens": 8586888, @@ -1519,13 +2288,26 @@ "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 1845332, "Total input tokens": 1098280, "Average input tokens": 4324, "Total output tokens": 747052, "Average output tokens": 2941 + }, + "MATH-500": { + "Score": 67.0, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 5451484, + "Total input tokens": 3833751, + "Average input tokens": 7668, + "Total output tokens": 1617733, + "Average output tokens": 3235 } }, "Llama-3.1-8B-Instruct": { @@ -1539,7 +2321,7 @@ "Pass rate": 0.9955, "Cost($)": 0.0, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 11778716, "Total input tokens": 8630514, @@ -1552,13 +2334,26 @@ "Pass rate": 0.9724, "Cost($)": 0.0, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 1651333, "Total input tokens": 971003, "Average input tokens": 3823, "Total output tokens": 680330, "Average output tokens": 2678 + }, + "MATH-500": { + "Score": 30.2, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 5034937, + "Total input tokens": 3546673, + "Average input tokens": 7093, + "Total output tokens": 1488264, + "Average output tokens": 2977 } }, "Internllm2_5-7B": { @@ -1572,7 +2367,7 @@ "Pass rate": 0.9841, "Cost($)": 0.0, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 14526431, "Total input tokens": 10678792, @@ -1585,13 +2380,26 @@ "Pass rate": 0.9803, "Cost($)": 0.0, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 2296222, "Total input tokens": 1420494, "Average input tokens": 5592, "Total output tokens": 875728, "Average output tokens": 3448 + }, + "MATH-500": { + "Score": 9.8, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 5838466, + "Total input tokens": 4193296, + "Average input tokens": 8387, + "Total output tokens": 1645170, + "Average output tokens": 3290 } }, "Qwen2-1.5B-Instruct": { @@ -1605,7 +2413,7 @@ "Pass rate": 0.9189, "Cost($)": 0.0, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 12411942, "Total input tokens": 9066115, @@ -1618,13 +2426,26 @@ "Pass rate": 0.9646, "Cost($)": 0.0, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 1775335, "Total input tokens": 1034362, "Average input tokens": 4072, "Total output tokens": 740973, "Average output tokens": 2917 + }, + "MATH-500": { + "Score": 3.8, + "Pass rate": 0.99, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 5569442, + "Total input tokens": 3832429, + "Average input tokens": 7665, + "Total output tokens": 1737013, + "Average output tokens": 3474 } }, "Qwen2-0.5B-Instruct": { @@ -1638,7 +2459,7 @@ "Pass rate": 0.9469, "Cost($)": 0.0, "Framework": "", - "X-shot": "8.0", + "X-shot": "8", "Samples": 1319, "All tokens": 16465720, "Total input tokens": 11019864, @@ -1651,13 +2472,580 @@ "Pass rate": 0.9724, "Cost($)": 0.0, "Framework": "", - "X-shot": "0.0", + "X-shot": "0", "Samples": 254, "All tokens": 2215091, "Total input tokens": 1246929, "Average input tokens": 4909, "Total output tokens": 968162, "Average output tokens": 3812 + }, + "MATH-500": { + "Score": 0.8, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 6862056, + "Total input tokens": 4448663, + "Average input tokens": 8897, + "Total output tokens": 2413393, + "Average output tokens": 4827 + } + }, + "deepseek-r1:1.5b": { + "META": { + "Algorithm": "SC-CoT", + "LLM": "deepseek-r1:1.5b", + "Eval Date": "2025/2/10" + }, + "gsm8k": { + "Score": 55.34, + "Pass rate": 0.997, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "8", + "Samples": 1319, + "All tokens": 25785865, + "Total input tokens": 14540096, + "Average input tokens": 11024, + "Total output tokens": 11245769, + "Average output tokens": 8526 + }, + "AQuA": { + "Score": 59.06, + "Pass rate": 0.9685, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "0", + "Samples": 254, + "All tokens": 5802711, + "Total input tokens": 2547772, + "Average input tokens": 10031, + "Total output tokens": 3254939, + "Average output tokens": 12815 + }, + "MATH-500": { + "Score": 38.0, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 14742109, + "Total input tokens": 7080559, + "Average input tokens": 14161, + "Total output tokens": 7661550, + "Average output tokens": 15323 + } + } + }, + "ToT": { + "gpt-3.5-turbo": { + "META": { + "Algorithm": "ToT", + "LLM": "gpt-3.5-turbo", + "Eval Date": "2025/1/7" + }, + "gsm8k": { + "Score": 67.93, + "Pass rate": 0.997, + "Cost($)": 9.1707, + "Framework": "", + "X-shot": "8", + "Samples": 1319, + "All tokens": 16727175, + "Total input tokens": 15920037, + "Average input tokens": 12070, + "Total output tokens": 807138, + "Average output tokens": 612 + }, + "AQuA": { + "Score": 57.09, + "Pass rate": 0.9961, + "Cost($)": 1.1513, + "Framework": "", + "X-shot": "0", + "Samples": 254, + "All tokens": 2001396, + "Total input tokens": 1850767, + "Average input tokens": 7286, + "Total output tokens": 150629, + "Average output tokens": 593 + }, + "MATH-500": { + "Score": 9.8, + "Pass rate": 1.0, + "Cost($)": 5.2914, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 10001767, + "Total input tokens": 9711244, + "Average input tokens": 19422, + "Total output tokens": 290523, + "Average output tokens": 581 + } + }, + "Doubao-lite-32k": { + "META": { + "Algorithm": "ToT", + "LLM": "Doubao-lite-32k", + "Eval Date": "2025/1/7" + }, + "gsm8k": { + "Score": 37.83, + "Pass rate": 0.8734, + "Cost($)": 0.8739, + "Framework": "", + "X-shot": "8", + "Samples": 1319, + "All tokens": 20274349, + "Total input tokens": 19208597, + "Average input tokens": 14563, + "Total output tokens": 1065752, + "Average output tokens": 808 + }, + "AQuA": { + "Score": 45.28, + "Pass rate": 0.7402, + "Cost($)": 0.0881, + "Framework": "", + "X-shot": "0", + "Samples": 254, + "All tokens": 2000550, + "Total input tokens": 1850249, + "Average input tokens": 7284, + "Total output tokens": 150301, + "Average output tokens": 592 + }, + "MATH-500": { + "Score": 1.2, + "Pass rate": 0.942, + "Cost($)": 0.2371, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 5564500, + "Total input tokens": 5338500, + "Average input tokens": 10677, + "Total output tokens": 226000, + "Average output tokens": 452 + } + }, + "gpt-4o": { + "META": { + "Algorithm": "ToT", + "LLM": "gpt-4o", + "Eval Date": "2025/1/22" + }, + "gsm8k": { + "Score": 91.13, + "Pass rate": 1.0, + "Cost($)": 86.8581, + "Framework": "", + "X-shot": "8", + "Samples": 1319, + "All tokens": 30769735, + "Total input tokens": 29445237, + "Average input tokens": 22324, + "Total output tokens": 1324498, + "Average output tokens": 1004 + }, + "AQuA": { + "Score": 81.5, + "Pass rate": 0.9921, + "Cost($)": 8.5295, + "Framework": "", + "X-shot": "0", + "Samples": 254, + "All tokens": 2613607, + "Total input tokens": 2347538, + "Average input tokens": 9242, + "Total output tokens": 266069, + "Average output tokens": 1048 + }, + "MATH-500": { + "Score": 3.2, + "Pass rate": 1.0, + "Cost($)": 40.8094, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 15242432, + "Total input tokens": 14881985, + "Average input tokens": 29764, + "Total output tokens": 360447, + "Average output tokens": 721 + } + }, + "Qwen2.5-72B-Instruct": { + "META": { + "Algorithm": "ToT", + "LLM": "Qwen2.5-72B-Instruct", + "Eval Date": "2025/1/22" + }, + "gsm8k": { + "Score": 88.88, + "Pass rate": 1.0, + "Cost($)": 23.5911, + "Framework": "", + "X-shot": "8", + "Samples": 1319, + "All tokens": 41847148, + "Total input tokens": 40435361, + "Average input tokens": 30656, + "Total output tokens": 1411787, + "Average output tokens": 1070 + }, + "AQuA": { + "Score": 81.1, + "Pass rate": 0.9921, + "Cost($)": 3.7389, + "Framework": "", + "X-shot": "0", + "Samples": 254, + "All tokens": 6632255, + "Total input tokens": 6371642, + "Average input tokens": 25085, + "Total output tokens": 260613, + "Average output tokens": 1026 + }, + "MATH-500": { + "Score": 10.8, + "Pass rate": 1.0, + "Cost($)": 9.0421, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 16039361, + "Total input tokens": 15657730, + "Average input tokens": 31315, + "Total output tokens": 381631, + "Average output tokens": 763 + } + }, + "Llama-3.3-70B-Instruct": { + "META": { + "Algorithm": "ToT", + "LLM": "Llama-3.3-70B-Instruct", + "Eval Date": "2025/1/22" + }, + "gsm8k": { + "Score": 91.89, + "Pass rate": 1.0, + "Cost($)": 20.8753, + "Framework": "", + "X-shot": "8", + "Samples": 1319, + "All tokens": 37029687, + "Total input tokens": 35096810, + "Average input tokens": 26609, + "Total output tokens": 1932877, + "Average output tokens": 1465 + }, + "AQuA": { + "Score": 83.07, + "Pass rate": 1.0, + "Cost($)": 2.9404, + "Framework": "", + "X-shot": "0", + "Samples": 254, + "All tokens": 5215848, + "Total input tokens": 4735188, + "Average input tokens": 18642, + "Total output tokens": 480660, + "Average output tokens": 1892 + }, + "MATH-500": { + "Score": 1.4, + "Pass rate": 0.698, + "Cost($)": 8.2699, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 14669500, + "Total input tokens": 14099500, + "Average input tokens": 28199, + "Total output tokens": 570000, + "Average output tokens": 1140 + } + }, + "Qwen2.5-7B-Instruct": { + "META": { + "Algorithm": "ToT", + "LLM": "Qwen2.5-7B-Instruct", + "Eval Date": "2025/1/22" + }, + "gsm8k": { + "Score": 72.21, + "Pass rate": 0.9901, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "8", + "Samples": 1319, + "All tokens": 31657319, + "Total input tokens": 20196528, + "Average input tokens": 15312, + "Total output tokens": 11460791, + "Average output tokens": 8689 + }, + "AQuA": { + "Score": 53.94, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "0", + "Samples": 254, + "All tokens": 8602682, + "Total input tokens": 8224468, + "Average input tokens": 32380, + "Total output tokens": 378214, + "Average output tokens": 1489 + }, + "MATH-500": { + "Score": 1.4, + "Pass rate": 0.916, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 10167500, + "Total input tokens": 9749000, + "Average input tokens": 19498, + "Total output tokens": 418500, + "Average output tokens": 837 + } + }, + "Llama-3.1-8B-Instruct": { + "META": { + "Algorithm": "ToT", + "LLM": "Llama-3.1-8B-Instruct", + "Eval Date": "2025/1/22" + }, + "gsm8k": { + "Score": 65.05, + "Pass rate": 0.9196, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "8", + "Samples": 1319, + "All tokens": 16432102, + "Total input tokens": 15554967, + "Average input tokens": 11793, + "Total output tokens": 877135, + "Average output tokens": 665 + }, + "AQuA": { + "Score": 59.06, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "0", + "Samples": 254, + "All tokens": 5739684, + "Total input tokens": 4896222, + "Average input tokens": 19276, + "Total output tokens": 843462, + "Average output tokens": 3321 + }, + "MATH-500": { + "Score": 1.8, + "Pass rate": 0.908, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 9035000, + "Total input tokens": 7729000, + "Average input tokens": 15458, + "Total output tokens": 1306000, + "Average output tokens": 2612 + } + }, + "Internllm2_5-7B": { + "META": { + "Algorithm": "ToT", + "LLM": "Internllm2_5-7B", + "Eval Date": "2025/1/22" + }, + "gsm8k": { + "Score": 20.85, + "Pass rate": 0.7013, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "8", + "Samples": 1319, + "All tokens": 13178129, + "Total input tokens": 11768118, + "Average input tokens": 8922, + "Total output tokens": 1410011, + "Average output tokens": 1069 + }, + "AQuA": { + "Score": 35.83, + "Pass rate": 0.9961, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "0", + "Samples": 254, + "All tokens": 4734560, + "Total input tokens": 4263136, + "Average input tokens": 16784, + "Total output tokens": 471424, + "Average output tokens": 1856 + }, + "MATH-500": { + "Score": 0.2, + "Pass rate": 0.99, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 8350500, + "Total input tokens": 7515000, + "Average input tokens": 15030, + "Total output tokens": 835500, + "Average output tokens": 1671 + } + }, + "Qwen2-1.5B-Instruct": { + "META": { + "Algorithm": "ToT", + "LLM": "Qwen2-1.5B-Instruct", + "Eval Date": "2025/1/22" + }, + "gsm8k": { + "Score": 19.64, + "Pass rate": 0.7726, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "8", + "Samples": 1319, + "All tokens": 12758687, + "Total input tokens": 12124248, + "Average input tokens": 9192, + "Total output tokens": 634439, + "Average output tokens": 481 + }, + "AQuA": { + "Score": 31.5, + "Pass rate": 0.9882, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "0", + "Samples": 254, + "All tokens": 6250702, + "Total input tokens": 6058022, + "Average input tokens": 23850, + "Total output tokens": 192680, + "Average output tokens": 759 + }, + "MATH-500": { + "Score": 0.8, + "Pass rate": 0.972, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 4535000, + "Total input tokens": 4408000, + "Average input tokens": 8816, + "Total output tokens": 127000, + "Average output tokens": 254 + } + }, + "Qwen2-0.5B-Instruct": { + "META": { + "Algorithm": "ToT", + "LLM": "Qwen2-0.5B-Instruct", + "Eval Date": "2025/1/22" + }, + "gsm8k": { + "Score": 0, + "Pass rate": 0.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "8", + "Samples": 1319, + "All tokens": 0, + "Total input tokens": 0, + "Average input tokens": 0, + "Total output tokens": 0, + "Average output tokens": 0 + }, + "AQuA": { + "Score": 29.92, + "Pass rate": 1.0, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "0", + "Samples": 254, + "All tokens": 8700281, + "Total input tokens": 8100085, + "Average input tokens": 31890, + "Total output tokens": 600196, + "Average output tokens": 2363 + }, + "MATH-500": { + "Score": 0.0, + "Pass rate": 0.962, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 5996500, + "Total input tokens": 5590500, + "Average input tokens": 11181, + "Total output tokens": 406000, + "Average output tokens": 812 + } + }, + "deepseek-r1:1.5b": { + "META": { + "Algorithm": "ToT", + "LLM": "deepseek-r1:1.5b", + "Eval Date": "2025/2/10" + }, + "gsm8k": { + "Score": 23.12, + "Pass rate": 0.7248, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "8", + "Samples": 1319, + "All tokens": 3421486, + "Total input tokens": 2738244, + "Average input tokens": 2076, + "Total output tokens": 683242, + "Average output tokens": 518 + }, + "AQuA": { + "Score": 24.8, + "Pass rate": 0.5551, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "0", + "Samples": 254, + "All tokens": 794512, + "Total input tokens": 605028, + "Average input tokens": 2382, + "Total output tokens": 189484, + "Average output tokens": 746 + }, + "MATH-500": { + "Score": 0.4, + "Pass rate": 0.716, + "Cost($)": 0.0, + "Framework": "", + "X-shot": "4", + "Samples": 500, + "All tokens": 1941500, + "Total input tokens": 1831000, + "Average input tokens": 3662, + "Total output tokens": 110500, + "Average output tokens": 221 } } }