Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Commit
·
f435f54
1
Parent(s):
cfb5604
new models
Browse files- external_models_results.json +269 -3
external_models_results.json
CHANGED
@@ -266,8 +266,8 @@
|
|
266 |
"result_metrics_npm": 0.7286932366792048
|
267 |
},
|
268 |
{
|
269 |
-
"model": "sabia-3",
|
270 |
-
"name": "Sabiá-3",
|
271 |
"link": "https://www.maritaca.ai/",
|
272 |
"date": "2024-08-20",
|
273 |
"status": "full",
|
@@ -423,7 +423,7 @@
|
|
423 |
},
|
424 |
{
|
425 |
"model": "gemini-2.5-pro-exp-03-25",
|
426 |
-
"name": "Gemini 2.5 Pro Experimental (0325)",
|
427 |
"link": "https://aistudio.google.com",
|
428 |
"date": "2025-04-03",
|
429 |
"status": "full",
|
@@ -669,5 +669,271 @@
|
|
669 |
},
|
670 |
"result_metrics_average": 0.7870599821710969,
|
671 |
"result_metrics_npm": 0.6795192293708728
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
672 |
}
|
673 |
]
|
|
|
266 |
"result_metrics_npm": 0.7286932366792048
|
267 |
},
|
268 |
{
|
269 |
+
"model": "sabia-3-2024-07-15",
|
270 |
+
"name": "Sabiá-3 (2024-07-15)",
|
271 |
"link": "https://www.maritaca.ai/",
|
272 |
"date": "2024-08-20",
|
273 |
"status": "full",
|
|
|
423 |
},
|
424 |
{
|
425 |
"model": "gemini-2.5-pro-exp-03-25",
|
426 |
+
"name": "Gemini 2.5 Pro Experimental [reasoning] (0325)",
|
427 |
"link": "https://aistudio.google.com",
|
428 |
"date": "2025-04-03",
|
429 |
"status": "full",
|
|
|
669 |
},
|
670 |
"result_metrics_average": 0.7870599821710969,
|
671 |
"result_metrics_npm": 0.6795192293708728
|
672 |
+
},
|
673 |
+
{
|
674 |
+
"model": "deepseek-v3_1",
|
675 |
+
"name": "deepseek-ai/DeepSeek-V3.1 (API)",
|
676 |
+
"link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
|
677 |
+
"date": "2025-09-01",
|
678 |
+
"status": "full",
|
679 |
+
"main_language": "English",
|
680 |
+
"model_type": "chat",
|
681 |
+
"params": 685.0,
|
682 |
+
"result_metrics": {
|
683 |
+
"enem_challenge": 0.8887333799860042,
|
684 |
+
"bluex": 0.8178025034770514,
|
685 |
+
"oab_exams": 0.7038724373576309,
|
686 |
+
"assin2_sts": 0.8082104938836681,
|
687 |
+
"assin2_rte": 0.949346100935343,
|
688 |
+
"faquad_nli": 0.8406862745098038,
|
689 |
+
"hatebr_offensive": 0.9211711711711712,
|
690 |
+
"portuguese_hate_speech": 0.7423067698027224,
|
691 |
+
"tweetsentbr": 0.7584190029617157
|
692 |
+
},
|
693 |
+
"result_metrics_average": 0.8256164593427902,
|
694 |
+
"result_metrics_npm": 0.7370296776379883
|
695 |
+
},
|
696 |
+
{
|
697 |
+
"model": "kimi-k2",
|
698 |
+
"name": "moonshotai/Kimi-K2-Instruct (API)",
|
699 |
+
"link": "https://huggingface.co/moonshotai/Kimi-K2-Instruct",
|
700 |
+
"date": "2025-09-01",
|
701 |
+
"status": "full",
|
702 |
+
"main_language": "English",
|
703 |
+
"model_type": "chat",
|
704 |
+
"params": 1000.0,
|
705 |
+
"result_metrics": {
|
706 |
+
"enem_challenge": 0.8789363191042687,
|
707 |
+
"bluex": 0.827538247566064,
|
708 |
+
"oab_exams": 0.6970387243735763,
|
709 |
+
"assin2_sts": 0.7760142475181766,
|
710 |
+
"assin2_rte": 0.9436236879837872,
|
711 |
+
"faquad_nli": 0.8531466083708024,
|
712 |
+
"hatebr_offensive": 0.8941562198649953,
|
713 |
+
"portuguese_hate_speech": 0.7535500455551216,
|
714 |
+
"tweetsentbr": 0.7428370464802363
|
715 |
+
},
|
716 |
+
"result_metrics_average": 0.8185379052018921,
|
717 |
+
"result_metrics_npm": 0.7275664672121565
|
718 |
+
},
|
719 |
+
{
|
720 |
+
"model": "sabia-3-1-2025-05-08",
|
721 |
+
"name": "Sabiá-3.1 (2025-05-08)",
|
722 |
+
"link": "https://www.maritaca.ai/",
|
723 |
+
"date": "2025-09-01",
|
724 |
+
"status": "full",
|
725 |
+
"main_language": "Portuguese",
|
726 |
+
"model_type": "proprietary",
|
727 |
+
"result_metrics": {
|
728 |
+
"enem_challenge": 0.8894331700489853,
|
729 |
+
"bluex": 0.8178025034770514,
|
730 |
+
"oab_exams": 0.9202733485193622,
|
731 |
+
"assin2_sts": 0.8340482244079774,
|
732 |
+
"assin2_rte": 0.9423587830430271,
|
733 |
+
"faquad_nli": 0.7585644282172838,
|
734 |
+
"hatebr_offensive": 0.8308611905928697,
|
735 |
+
"portuguese_hate_speech": 0.7543648446960096,
|
736 |
+
"tweetsentbr": 0.7398273232644036
|
737 |
+
},
|
738 |
+
"result_metrics_average": 0.8319482018074411,
|
739 |
+
"result_metrics_npm": 0.7331597943893793
|
740 |
+
},
|
741 |
+
{
|
742 |
+
"model": "sabia-3-2024-12-11",
|
743 |
+
"name": "Sabiá-3 (2024-12-11)",
|
744 |
+
"link": "https://www.maritaca.ai/",
|
745 |
+
"date": "2025-09-01",
|
746 |
+
"status": "full",
|
747 |
+
"main_language": "Portuguese",
|
748 |
+
"model_type": "proprietary",
|
749 |
+
"result_metrics": {
|
750 |
+
"enem_challenge": 0.8691392582225332,
|
751 |
+
"bluex": 0.7872044506258693,
|
752 |
+
"oab_exams": 0.8009111617312072,
|
753 |
+
"assin2_sts": 0.7850131735268517,
|
754 |
+
"assin2_rte": 0.9390382723900459,
|
755 |
+
"faquad_nli": 0.7968815254182839,
|
756 |
+
"hatebr_offensive": 0.8608047226969084,
|
757 |
+
"portuguese_hate_speech": 0.7474723628059027,
|
758 |
+
"tweetsentbr": 0.7360466511491278
|
759 |
+
},
|
760 |
+
"result_metrics_average": 0.8136123976185256,
|
761 |
+
"result_metrics_npm": 0.7144701465854594
|
762 |
+
},
|
763 |
+
{
|
764 |
+
"model": "sabiazinho-3",
|
765 |
+
"name": "Sabiázinho-3 (2025-02-06)",
|
766 |
+
"link": "https://www.maritaca.ai/",
|
767 |
+
"date": "2025-09-01",
|
768 |
+
"status": "full",
|
769 |
+
"main_language": "Portuguese",
|
770 |
+
"model_type": "proprietary",
|
771 |
+
"result_metrics": {
|
772 |
+
"enem_challenge": 0.8439468159552135,
|
773 |
+
"bluex": 0.7343532684283728,
|
774 |
+
"oab_exams": 0.8159453302961276,
|
775 |
+
"assin2_sts": 0.8091208202474276,
|
776 |
+
"assin2_rte": 0.9370511249219384,
|
777 |
+
"faquad_nli": 0.7715445403113343,
|
778 |
+
"hatebr_offensive": 0.8604320820258526,
|
779 |
+
"portuguese_hate_speech": 0.7129508077161507,
|
780 |
+
"tweetsentbr": 0.6798994954276046
|
781 |
+
},
|
782 |
+
"result_metrics_average": 0.7961382539255579,
|
783 |
+
"result_metrics_npm": 0.685954609257193
|
784 |
+
},
|
785 |
+
{
|
786 |
+
"model": "grok-3-mini",
|
787 |
+
"name": "Grok 3 Mini [reasoning] (API)",
|
788 |
+
"link": "https://x.ai/",
|
789 |
+
"date": "2025-09-01",
|
790 |
+
"status": "full",
|
791 |
+
"main_language": "English",
|
792 |
+
"model_type": "proprietary",
|
793 |
+
"result_metrics": {
|
794 |
+
"enem_challenge": 0.9412176347095871,
|
795 |
+
"bluex": 0.8984700973574409,
|
796 |
+
"oab_exams": 0.7075170842824602,
|
797 |
+
"assin2_sts": 0.7846153023166811,
|
798 |
+
"assin2_rte": 0.9369863526592658,
|
799 |
+
"faquad_nli": 0.8974457100080231,
|
800 |
+
"hatebr_offensive": 0.9264201247592199,
|
801 |
+
"portuguese_hate_speech": 0.6868265194640906,
|
802 |
+
"tweetsentbr": 0.7496188889954271
|
803 |
+
},
|
804 |
+
"result_metrics_average": 0.836568634950244,
|
805 |
+
"result_metrics_npm": 0.7505284631974409
|
806 |
+
},
|
807 |
+
{
|
808 |
+
"model": "gpt-5-nano-2025-08-07",
|
809 |
+
"name": "GPT 5 Nano [reasoning] (2025-08-07)",
|
810 |
+
"link": "https://www.openai.com/",
|
811 |
+
"date": "2025-09-01",
|
812 |
+
"status": "full",
|
813 |
+
"main_language": "English",
|
814 |
+
"model_type": "proprietary",
|
815 |
+
"result_metrics": {
|
816 |
+
"enem_challenge": 0.9013296011196641,
|
817 |
+
"bluex": 0.8525730180806675,
|
818 |
+
"oab_exams": 0.5913439635535308,
|
819 |
+
"assin2_sts": 0.7157982790377855,
|
820 |
+
"assin2_rte": 0.9493397775671237,
|
821 |
+
"faquad_nli": 0.802473455931782,
|
822 |
+
"hatebr_offensive": 0.9169693400085076,
|
823 |
+
"portuguese_hate_speech": 0.7166590126291619,
|
824 |
+
"tweetsentbr": 0.7385573150818597
|
825 |
+
},
|
826 |
+
"result_metrics_average": 0.7983381958900091,
|
827 |
+
"result_metrics_npm": 0.699331432280926
|
828 |
+
},
|
829 |
+
{
|
830 |
+
"model": "gpt-5-mini-2025-08-07",
|
831 |
+
"name": "GPT 5 Mini [reasoning] (2025-08-07)",
|
832 |
+
"link": "https://www.openai.com/",
|
833 |
+
"date": "2025-09-01",
|
834 |
+
"status": "full",
|
835 |
+
"main_language": "English",
|
836 |
+
"model_type": "proprietary",
|
837 |
+
"result_metrics": {
|
838 |
+
"enem_challenge": 0.9566130160951715,
|
839 |
+
"bluex": 0.913769123783032,
|
840 |
+
"oab_exams": 0.7184510250569476,
|
841 |
+
"assin2_sts": 0.8151992531421179,
|
842 |
+
"assin2_rte": 0.9486789502727531,
|
843 |
+
"faquad_nli": 0.7959895379250218,
|
844 |
+
"hatebr_offensive": 0.9306148454596409,
|
845 |
+
"portuguese_hate_speech": 0.7476857189919288,
|
846 |
+
"tweetsentbr": 0.7208063363431595
|
847 |
+
},
|
848 |
+
"result_metrics_average": 0.8386453118966414,
|
849 |
+
"result_metrics_npm": 0.7509015993727701
|
850 |
+
},
|
851 |
+
{
|
852 |
+
"model": "gpt-5_reasoning_minimal-2025-08-07",
|
853 |
+
"name": "GPT 5 [reasoning: minimal] (2025-08-07)",
|
854 |
+
"link": "https://www.openai.com/",
|
855 |
+
"date": "2025-09-01",
|
856 |
+
"status": "full",
|
857 |
+
"main_language": "English",
|
858 |
+
"model_type": "proprietary",
|
859 |
+
"result_metrics": {
|
860 |
+
"enem_challenge": 0.8432470258922323,
|
861 |
+
"bluex": 0.7885952712100139,
|
862 |
+
"oab_exams": 0.8104783599088838,
|
863 |
+
"assin2_sts": 0.7497712012355019,
|
864 |
+
"assin2_rte": 0.9497544911228829,
|
865 |
+
"faquad_nli": 0.9049032312001003,
|
866 |
+
"hatebr_offensive": 0.9233018502276624,
|
867 |
+
"portuguese_hate_speech": 0.7502183789864052,
|
868 |
+
"tweetsentbr": 0.7877925879277
|
869 |
+
},
|
870 |
+
"result_metrics_average": 0.8342291553012646,
|
871 |
+
"result_metrics_npm": 0.7560493865775754
|
872 |
+
},
|
873 |
+
{
|
874 |
+
"model": "gemini-2_5_flash_lite",
|
875 |
+
"name": "Gemini 2.5 Flash Lite",
|
876 |
+
"link": "https://aistudio.google.com",
|
877 |
+
"date": "2025-09-01",
|
878 |
+
"status": "full",
|
879 |
+
"main_language": "English",
|
880 |
+
"model_type": "proprietary",
|
881 |
+
"result_metrics": {
|
882 |
+
"enem_challenge": 0.8257522743177047,
|
883 |
+
"bluex": 0.7329624478442281,
|
884 |
+
"oab_exams": 0.6783599088838269,
|
885 |
+
"assin2_sts": 0.8399704980607736,
|
886 |
+
"assin2_rte": 0.9095975398498664,
|
887 |
+
"faquad_nli": 0.8289944389172974,
|
888 |
+
"hatebr_offensive": 0.8733247194142535,
|
889 |
+
"portuguese_hate_speech": 0.7511757826108595,
|
890 |
+
"tweetsentbr": 0.7696375203962748
|
891 |
+
},
|
892 |
+
"result_metrics_average": 0.8010861255883428,
|
893 |
+
"result_metrics_npm": 0.6977608761930978
|
894 |
+
},
|
895 |
+
{
|
896 |
+
"model": "gemini-2_5_flash_lite_reasoning_low",
|
897 |
+
"name": "Gemini 2.5 Flash Lite [reasoning: low]",
|
898 |
+
"link": "https://aistudio.google.com",
|
899 |
+
"date": "2025-09-01",
|
900 |
+
"status": "full",
|
901 |
+
"main_language": "English",
|
902 |
+
"model_type": "proprietary",
|
903 |
+
"result_metrics": {
|
904 |
+
"enem_challenge": 0.9013296011196641,
|
905 |
+
"bluex": 0.8400556328233658,
|
906 |
+
"oab_exams": 0.6943052391799545,
|
907 |
+
"assin2_sts": 0.755562697236674,
|
908 |
+
"assin2_rte": 0.9464858475885941,
|
909 |
+
"faquad_nli": 0.8703946691365647,
|
910 |
+
"hatebr_offensive": 0.9080576836597871,
|
911 |
+
"portuguese_hate_speech": 0.7416269940699909,
|
912 |
+
"tweetsentbr": 0.7520493635069894
|
913 |
+
},
|
914 |
+
"result_metrics_average": 0.8233186364801761,
|
915 |
+
"result_metrics_npm": 0.7360224650390731
|
916 |
+
},
|
917 |
+
{
|
918 |
+
"model": "gemini-2_5_flash",
|
919 |
+
"name": "Gemini 2.5 Flash",
|
920 |
+
"link": "https://aistudio.google.com",
|
921 |
+
"date": "2025-09-01",
|
922 |
+
"status": "full",
|
923 |
+
"main_language": "English",
|
924 |
+
"model_type": "proprietary",
|
925 |
+
"result_metrics": {
|
926 |
+
"enem_challenge": 0.9097270818754374,
|
927 |
+
"bluex": 0.8650904033379694,
|
928 |
+
"oab_exams": 0.8355353075170843,
|
929 |
+
"assin2_sts": 0.8714666962450285,
|
930 |
+
"assin2_rte": 0.9386350099968783,
|
931 |
+
"faquad_nli": 0.8578569197125898,
|
932 |
+
"hatebr_offensive": 0.8933375064862327,
|
933 |
+
"portuguese_hate_speech": 0.7502527990365506,
|
934 |
+
"tweetsentbr": 0.7801286503914011
|
935 |
+
},
|
936 |
+
"result_metrics_average": 0.8557811527332413,
|
937 |
+
"result_metrics_npm": 0.7734849178213028
|
938 |
}
|
939 |
]
|