eduagarcia committed on
Commit
f435f54
·
1 Parent(s): cfb5604

new models

Browse files
Files changed (1) hide show
  1. external_models_results.json +269 -3
external_models_results.json CHANGED
@@ -266,8 +266,8 @@
266
  "result_metrics_npm": 0.7286932366792048
267
  },
268
  {
269
- "model": "sabia-3",
270
- "name": "Sabiá-3",
271
  "link": "https://www.maritaca.ai/",
272
  "date": "2024-08-20",
273
  "status": "full",
@@ -423,7 +423,7 @@
423
  },
424
  {
425
  "model": "gemini-2.5-pro-exp-03-25",
426
- "name": "Gemini 2.5 Pro Experimental (0325)",
427
  "link": "https://aistudio.google.com",
428
  "date": "2025-04-03",
429
  "status": "full",
@@ -669,5 +669,271 @@
669
  },
670
  "result_metrics_average": 0.7870599821710969,
671
  "result_metrics_npm": 0.6795192293708728
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
672
  }
673
  ]
 
266
  "result_metrics_npm": 0.7286932366792048
267
  },
268
  {
269
+ "model": "sabia-3-2024-07-15",
270
+ "name": "Sabiá-3 (2024-07-15)",
271
  "link": "https://www.maritaca.ai/",
272
  "date": "2024-08-20",
273
  "status": "full",
 
423
  },
424
  {
425
  "model": "gemini-2.5-pro-exp-03-25",
426
+ "name": "Gemini 2.5 Pro Experimental [reasoning] (0325)",
427
  "link": "https://aistudio.google.com",
428
  "date": "2025-04-03",
429
  "status": "full",
 
669
  },
670
  "result_metrics_average": 0.7870599821710969,
671
  "result_metrics_npm": 0.6795192293708728
672
+ },
673
+ {
674
+ "model": "deepseek-v3_1",
675
+ "name": "deepseek-ai/DeepSeek-V3.1 (API)",
676
+ "link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
677
+ "date": "2025-09-01",
678
+ "status": "full",
679
+ "main_language": "English",
680
+ "model_type": "chat",
681
+ "params": 685.0,
682
+ "result_metrics": {
683
+ "enem_challenge": 0.8887333799860042,
684
+ "bluex": 0.8178025034770514,
685
+ "oab_exams": 0.7038724373576309,
686
+ "assin2_sts": 0.8082104938836681,
687
+ "assin2_rte": 0.949346100935343,
688
+ "faquad_nli": 0.8406862745098038,
689
+ "hatebr_offensive": 0.9211711711711712,
690
+ "portuguese_hate_speech": 0.7423067698027224,
691
+ "tweetsentbr": 0.7584190029617157
692
+ },
693
+ "result_metrics_average": 0.8256164593427902,
694
+ "result_metrics_npm": 0.7370296776379883
695
+ },
696
+ {
697
+ "model": "kimi-k2",
698
+ "name": "moonshotai/Kimi-K2-Instruct (API)",
699
+ "link": "https://huggingface.co/moonshotai/Kimi-K2-Instruct",
700
+ "date": "2025-09-01",
701
+ "status": "full",
702
+ "main_language": "English",
703
+ "model_type": "chat",
704
+ "params": 1000.0,
705
+ "result_metrics": {
706
+ "enem_challenge": 0.8789363191042687,
707
+ "bluex": 0.827538247566064,
708
+ "oab_exams": 0.6970387243735763,
709
+ "assin2_sts": 0.7760142475181766,
710
+ "assin2_rte": 0.9436236879837872,
711
+ "faquad_nli": 0.8531466083708024,
712
+ "hatebr_offensive": 0.8941562198649953,
713
+ "portuguese_hate_speech": 0.7535500455551216,
714
+ "tweetsentbr": 0.7428370464802363
715
+ },
716
+ "result_metrics_average": 0.8185379052018921,
717
+ "result_metrics_npm": 0.7275664672121565
718
+ },
719
+ {
720
+ "model": "sabia-3-1-2025-05-08",
721
+ "name": "Sabiá-3.1 (2025-05-08)",
722
+ "link": "https://www.maritaca.ai/",
723
+ "date": "2025-09-01",
724
+ "status": "full",
725
+ "main_language": "Portuguese",
726
+ "model_type": "proprietary",
727
+ "result_metrics": {
728
+ "enem_challenge": 0.8894331700489853,
729
+ "bluex": 0.8178025034770514,
730
+ "oab_exams": 0.9202733485193622,
731
+ "assin2_sts": 0.8340482244079774,
732
+ "assin2_rte": 0.9423587830430271,
733
+ "faquad_nli": 0.7585644282172838,
734
+ "hatebr_offensive": 0.8308611905928697,
735
+ "portuguese_hate_speech": 0.7543648446960096,
736
+ "tweetsentbr": 0.7398273232644036
737
+ },
738
+ "result_metrics_average": 0.8319482018074411,
739
+ "result_metrics_npm": 0.7331597943893793
740
+ },
741
+ {
742
+ "model": "sabia-3-2024-12-11",
743
+ "name": "Sabiá-3 (2024-12-11)",
744
+ "link": "https://www.maritaca.ai/",
745
+ "date": "2025-09-01",
746
+ "status": "full",
747
+ "main_language": "Portuguese",
748
+ "model_type": "proprietary",
749
+ "result_metrics": {
750
+ "enem_challenge": 0.8691392582225332,
751
+ "bluex": 0.7872044506258693,
752
+ "oab_exams": 0.8009111617312072,
753
+ "assin2_sts": 0.7850131735268517,
754
+ "assin2_rte": 0.9390382723900459,
755
+ "faquad_nli": 0.7968815254182839,
756
+ "hatebr_offensive": 0.8608047226969084,
757
+ "portuguese_hate_speech": 0.7474723628059027,
758
+ "tweetsentbr": 0.7360466511491278
759
+ },
760
+ "result_metrics_average": 0.8136123976185256,
761
+ "result_metrics_npm": 0.7144701465854594
762
+ },
763
+ {
764
+ "model": "sabiazinho-3",
765
+ "name": "Sabiázinho-3 (2025-02-06)",
766
+ "link": "https://www.maritaca.ai/",
767
+ "date": "2025-09-01",
768
+ "status": "full",
769
+ "main_language": "Portuguese",
770
+ "model_type": "proprietary",
771
+ "result_metrics": {
772
+ "enem_challenge": 0.8439468159552135,
773
+ "bluex": 0.7343532684283728,
774
+ "oab_exams": 0.8159453302961276,
775
+ "assin2_sts": 0.8091208202474276,
776
+ "assin2_rte": 0.9370511249219384,
777
+ "faquad_nli": 0.7715445403113343,
778
+ "hatebr_offensive": 0.8604320820258526,
779
+ "portuguese_hate_speech": 0.7129508077161507,
780
+ "tweetsentbr": 0.6798994954276046
781
+ },
782
+ "result_metrics_average": 0.7961382539255579,
783
+ "result_metrics_npm": 0.685954609257193
784
+ },
785
+ {
786
+ "model": "grok-3-mini",
787
+ "name": "Grok 3 Mini [reasoning] (API)",
788
+ "link": "https://x.ai/",
789
+ "date": "2025-09-01",
790
+ "status": "full",
791
+ "main_language": "English",
792
+ "model_type": "proprietary",
793
+ "result_metrics": {
794
+ "enem_challenge": 0.9412176347095871,
795
+ "bluex": 0.8984700973574409,
796
+ "oab_exams": 0.7075170842824602,
797
+ "assin2_sts": 0.7846153023166811,
798
+ "assin2_rte": 0.9369863526592658,
799
+ "faquad_nli": 0.8974457100080231,
800
+ "hatebr_offensive": 0.9264201247592199,
801
+ "portuguese_hate_speech": 0.6868265194640906,
802
+ "tweetsentbr": 0.7496188889954271
803
+ },
804
+ "result_metrics_average": 0.836568634950244,
805
+ "result_metrics_npm": 0.7505284631974409
806
+ },
807
+ {
808
+ "model": "gpt-5-nano-2025-08-07",
809
+ "name": "GPT 5 Nano [reasoning] (2025-08-07)",
810
+ "link": "https://www.openai.com/",
811
+ "date": "2025-09-01",
812
+ "status": "full",
813
+ "main_language": "English",
814
+ "model_type": "proprietary",
815
+ "result_metrics": {
816
+ "enem_challenge": 0.9013296011196641,
817
+ "bluex": 0.8525730180806675,
818
+ "oab_exams": 0.5913439635535308,
819
+ "assin2_sts": 0.7157982790377855,
820
+ "assin2_rte": 0.9493397775671237,
821
+ "faquad_nli": 0.802473455931782,
822
+ "hatebr_offensive": 0.9169693400085076,
823
+ "portuguese_hate_speech": 0.7166590126291619,
824
+ "tweetsentbr": 0.7385573150818597
825
+ },
826
+ "result_metrics_average": 0.7983381958900091,
827
+ "result_metrics_npm": 0.699331432280926
828
+ },
829
+ {
830
+ "model": "gpt-5-mini-2025-08-07",
831
+ "name": "GPT 5 Mini [reasoning] (2025-08-07)",
832
+ "link": "https://www.openai.com/",
833
+ "date": "2025-09-01",
834
+ "status": "full",
835
+ "main_language": "English",
836
+ "model_type": "proprietary",
837
+ "result_metrics": {
838
+ "enem_challenge": 0.9566130160951715,
839
+ "bluex": 0.913769123783032,
840
+ "oab_exams": 0.7184510250569476,
841
+ "assin2_sts": 0.8151992531421179,
842
+ "assin2_rte": 0.9486789502727531,
843
+ "faquad_nli": 0.7959895379250218,
844
+ "hatebr_offensive": 0.9306148454596409,
845
+ "portuguese_hate_speech": 0.7476857189919288,
846
+ "tweetsentbr": 0.7208063363431595
847
+ },
848
+ "result_metrics_average": 0.8386453118966414,
849
+ "result_metrics_npm": 0.7509015993727701
850
+ },
851
+ {
852
+ "model": "gpt-5_reasoning_minimal-2025-08-07",
853
+ "name": "GPT 5 [reasoning: minimal] (2025-08-07)",
854
+ "link": "https://www.openai.com/",
855
+ "date": "2025-09-01",
856
+ "status": "full",
857
+ "main_language": "English",
858
+ "model_type": "proprietary",
859
+ "result_metrics": {
860
+ "enem_challenge": 0.8432470258922323,
861
+ "bluex": 0.7885952712100139,
862
+ "oab_exams": 0.8104783599088838,
863
+ "assin2_sts": 0.7497712012355019,
864
+ "assin2_rte": 0.9497544911228829,
865
+ "faquad_nli": 0.9049032312001003,
866
+ "hatebr_offensive": 0.9233018502276624,
867
+ "portuguese_hate_speech": 0.7502183789864052,
868
+ "tweetsentbr": 0.7877925879277
869
+ },
870
+ "result_metrics_average": 0.8342291553012646,
871
+ "result_metrics_npm": 0.7560493865775754
872
+ },
873
+ {
874
+ "model": "gemini-2_5_flash_lite",
875
+ "name": "Gemini 2.5 Flash Lite",
876
+ "link": "https://aistudio.google.com",
877
+ "date": "2025-09-01",
878
+ "status": "full",
879
+ "main_language": "English",
880
+ "model_type": "proprietary",
881
+ "result_metrics": {
882
+ "enem_challenge": 0.8257522743177047,
883
+ "bluex": 0.7329624478442281,
884
+ "oab_exams": 0.6783599088838269,
885
+ "assin2_sts": 0.8399704980607736,
886
+ "assin2_rte": 0.9095975398498664,
887
+ "faquad_nli": 0.8289944389172974,
888
+ "hatebr_offensive": 0.8733247194142535,
889
+ "portuguese_hate_speech": 0.7511757826108595,
890
+ "tweetsentbr": 0.7696375203962748
891
+ },
892
+ "result_metrics_average": 0.8010861255883428,
893
+ "result_metrics_npm": 0.6977608761930978
894
+ },
895
+ {
896
+ "model": "gemini-2_5_flash_lite_reasoning_low",
897
+ "name": "Gemini 2.5 Flash Lite [reasoning: low]",
898
+ "link": "https://aistudio.google.com",
899
+ "date": "2025-09-01",
900
+ "status": "full",
901
+ "main_language": "English",
902
+ "model_type": "proprietary",
903
+ "result_metrics": {
904
+ "enem_challenge": 0.9013296011196641,
905
+ "bluex": 0.8400556328233658,
906
+ "oab_exams": 0.6943052391799545,
907
+ "assin2_sts": 0.755562697236674,
908
+ "assin2_rte": 0.9464858475885941,
909
+ "faquad_nli": 0.8703946691365647,
910
+ "hatebr_offensive": 0.9080576836597871,
911
+ "portuguese_hate_speech": 0.7416269940699909,
912
+ "tweetsentbr": 0.7520493635069894
913
+ },
914
+ "result_metrics_average": 0.8233186364801761,
915
+ "result_metrics_npm": 0.7360224650390731
916
+ },
917
+ {
918
+ "model": "gemini-2_5_flash",
919
+ "name": "Gemini 2.5 Flash",
920
+ "link": "https://aistudio.google.com",
921
+ "date": "2025-09-01",
922
+ "status": "full",
923
+ "main_language": "English",
924
+ "model_type": "proprietary",
925
+ "result_metrics": {
926
+ "enem_challenge": 0.9097270818754374,
927
+ "bluex": 0.8650904033379694,
928
+ "oab_exams": 0.8355353075170843,
929
+ "assin2_sts": 0.8714666962450285,
930
+ "assin2_rte": 0.9386350099968783,
931
+ "faquad_nli": 0.8578569197125898,
932
+ "hatebr_offensive": 0.8933375064862327,
933
+ "portuguese_hate_speech": 0.7502527990365506,
934
+ "tweetsentbr": 0.7801286503914011
935
+ },
936
+ "result_metrics_average": 0.8557811527332413,
937
+ "result_metrics_npm": 0.7734849178213028
938
  }
939
  ]