nielsr (HF Staff) committed
Commit 71be1a4 · verified · 1 Parent(s): 0a9329f

Add pipeline tag, link to paper


This PR ensures the model can be found at https://huggingface.co/models?pipeline_tag=any-to-any&sort=trending and adds a link to the corresponding paper.
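For context: the Hub's task filter is driven by the `pipeline_tag` field in the README's YAML front matter, which is exactly what this commit adds. A minimal sketch of how the effect can be checked, or the same kind of change made programmatically, assuming a recent `huggingface_hub` client (the repo id is taken from the model card below):

```python
from huggingface_hub import HfApi, metadata_update

api = HfApi()

# After this commit, the card metadata should carry the new tag.
info = api.model_info("baichuan-inc/Baichuan-Omni-1d5")
print(info.pipeline_tag)  # expected: "any-to-any"

# The tag is what filtered listings such as
# https://huggingface.co/models?pipeline_tag=any-to-any query against.
for model in api.list_models(pipeline_tag="any-to-any", limit=5):
    print(model.id)

# The equivalent metadata edit, shown for illustration only (requires write access):
# metadata_update("baichuan-inc/Baichuan-Omni-1d5", {"pipeline_tag": "any-to-any"})
```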

Files changed (1): README.md (+31 −205)
README.md CHANGED
@@ -1,6 +1,8 @@
  ---
  license: apache-2.0
+ pipeline_tag: any-to-any
  ---
+
  <div align="center">

  <img src="https://github.com/baichuan-inc/Baichuan-Omni-1.5/raw/main/assets/logo.png" width="300em" ></img>
@@ -13,7 +15,7 @@ license: apache-2.0


  <p align="center">
- Baichuan-Omni-1.5 <a href="https://huggingface.co/baichuan-inc/Baichuan-Omni-1d5">🤗</a> | Baichuan-Omni-1.5-Base <a href="https://huggingface.co/baichuan-inc/Baichuan-Omni-1d5-Base">🤗</a> |Github <a href="https://github.com/baichuan-inc/Baichuan-Omni-1.5/">📖 </a> | Report <a href="https://github.com/baichuan-inc/Baichuan-Omni-1.5/raw/main/baichuan_omni_1_5.pdf">📖</a>
+ Baichuan-Omni-1.5 <a href="https://huggingface.co/baichuan-inc/Baichuan-Omni-1d5">🤗</a> | Baichuan-Omni-1.5-Base <a href="https://huggingface.co/baichuan-inc/Baichuan-Omni-1d5-Base">🤗</a> |Github <a href="https://github.com/baichuan-inc/Baichuan-Omni-1.5/">📖 </a> | Report <a href="https://huggingface.co/papers/2501.15368">📖</a>
  </p>
  </p>
  <p align="center">
@@ -232,9 +234,9 @@ We sugguest readers to refer to our [**Github**](https://github.com/baichuan-inc

  <details>

- <summary>click to view</summary>
+ <summary>Click here to view detailed evaluation results of image understanding ability.</summary>

- #### Image Understanding
+ #### Image understanding ability

  <div align="center">
  <table style="margin: 0 auto; text-align: center;">
@@ -247,11 +249,11 @@ We sugguest readers to refer to our [**Github**](https://github.com/baichuan-inc
  <tr>
  <td>Model</td>
  <td>Size</td>
- <td>MMBench-EN (Acc.)</td>
- <td>MMbench-CN (Acc.)</td>
- <td>SEED-IMG (Acc.)</td>
- <td>MMMU-val (Acc.)</td>
- <td>HallusionBench (Acc.)</td>
+ <td>MMBench-EN <br>(Acc.)</td>
+ <td>MMbench-CN <br>(Acc.)</td>
+ <td>SEED-IMG <br>(Acc.)</td>
+ <td>MMMU-val <br>(Acc.)</td>
+ <td>HallusionBench <br>(Acc.)</td>
  </tr>
  <tr>
  <td colspan="9">Proprietary Models</td>
@@ -361,11 +363,11 @@ We sugguest readers to refer to our [**Github**](https://github.com/baichuan-inc
  <tr>
  <td>Model</td>
  <td>Size</td>
- <td>RealWorldQA (Acc.)</td>
- <td>MathVista-mini (Acc.)</td>
- <td>TextVQA-val (Acc.)</td>
- <td>ChartQA (Acc.)</td>
- <td>OCRBench (Acc.)</td>
+ <td>RealWorldQA <br>(Acc.)</td>
+ <td>MathVista-mini <br>(Acc.)</td>
+ <td>TextVQA-val <br>(Acc.)</td>
+ <td>ChartQA <br>(Acc.)</td>
+ <td>OCRBench <br>(Acc.)</td>
  </tr>
  <tr>
  <td colspan="8">Proprietary Models</td>
@@ -466,9 +468,9 @@ We sugguest readers to refer to our [**Github**](https://github.com/baichuan-inc

  <details>

- <summary>click to view</summary>
+ <summary>Click here to view detailed evaluation results of video understanding ability.</summary>

- #### Video Understanding
+ #### Video understanding ability
  <div align="center">
  <table style="margin: 0 auto; text-align: center;">
  <thead>
@@ -481,10 +483,10 @@ We sugguest readers to refer to our [**Github**](https://github.com/baichuan-inc
  <td>Model</td>
  <td>Size</td>
  <td># Frames</td>
- <td>MVBench (Acc.)</td>
- <td>Egoschema (Acc.)</td>
- <td>VideoMME (Acc.)</td>
- <td>Perception-Test (Acc.)</td>
+ <td>MVBench <br>(Acc.)</td>
+ <td>Egoschema <br>(Acc.)</td>
+ <td>VideoMME <br>(Acc.)</td>
+ <td>Perception-Test <br>(Acc.)</td>
  </tr>
  <tr>
  <td colspan="7">Proprietary Models</td>
@@ -550,7 +552,7 @@ We sugguest readers to refer to our [**Github**](https://github.com/baichuan-inc
  <td>VideoLLaMA 2</td>
  <td>7B</td>
  <td>16</td>
- <td>54.6*</td>
+ <td>50.2*</td>
  <td>51.7*</td>
  <td>46.6*</td>
  <td>51.4*</td>
@@ -606,7 +608,7 @@ We sugguest readers to refer to our [**Github**](https://github.com/baichuan-inc
  <tr>
  <td>Baichuan-Omni</td>
  <td>7B</td>
- <td>1 fps (max 32)</td>
+ <td>1 fps (max 48)</td>
  <td>60.9</td>
  <td>58.8</td>
  <td>58.2</td>
@@ -634,6 +636,7 @@ We sugguest readers to refer to our [**Github**](https://github.com/baichuan-inc
  </table>
  </div>

+
  <br>

  <div align="center">
@@ -798,12 +801,11 @@ We sugguest readers to refer to our [**Github**](https://github.com/baichuan-inc

  </details>

-
  <details>

- <summary>click to view</summary>
+ <summary>Click here to view detailed evaluation results of audio understanding and generation ability.</summary>

- #### Audio Comprehensive and Speech Generation
+ #### Audio understanding and generation ability
  <div align="center">
  <table style="margin: 0 auto; text-align: center;">
  <thead>
@@ -914,17 +916,13 @@ We sugguest readers to refer to our [**Github**](https://github.com/baichuan-inc
  </tbody>
  </table>
  </div>
-
-
  </details>

-
-
  <details>

- <summary>click to view</summary>
+ <summary>Click here to view the detailed evaluation results of omni-modal understanding ability.</summary>

- #### Omni-modal Understanding
+ #### Omni-modal understanding ability

  <div align="center">
  <table style="margin: 0 auto; text-align: center;">
@@ -937,178 +935,6 @@ We sugguest readers to refer to our [**Github**](https://github.com/baichuan-inc
  <tr>
  <td>Model</td>
  <td>Size</td>
- <td>Image & Audio</td>
- <td>Image Caption & Audio</td>
- <td>Image & Audio Transcript</td>
- <td>Image Caption & Audio Transcript</td>
- </tr>
- </thead>
- <tr>
- <td colspan="6">Proprietary Models</td>
- </tr>
- <tr>
- <td>GPT4o-mini</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>37.0</td>
- <td>37.7</td>
- </tr>
- <tr>
- <td colspan="6">Open-source Models (Omni-modal)</td>
- </tr>
- <tr>
- <td>VITA</td>
- <td>8x7B</td>
- <td>33.1</td>
- <td>31.8</td>
- <td>42.0</td>
- <td>44.2</td>
- </tr>
- <tr>
- <td>VITA-1.5</td>
- <td>7B</td>
- <td>33.4</td>
- <td>29.6</td>
- <td>48.5</td>
- <td><b>47.2<br></td>
- </tr>
- <tr>
- <td>Baichuan-Omni</td>
- <td>7B</td>
- <td>32.2</td>
- <td>26.5</td>
- <td>42.6</td>
- <td>44.2</td>
- </tr>
- <tr>
- <td>MiniCPM-o 2.6</td>
- <td>7B</td>
- <td>40.5</td>
- <td>30.8</td>
- <td><b>53.2<br></td>
- <td>46.3</td>
- </tr>
- <tr>
- <td><b>Baichuan-Omni-1.5<br></td>
- <td>7B</td>
- <td><b>42.9<br></td>
- <td><b>37.7<br></td>
- <td>47.9</td>
- <td>46.9</td>
- </tr>
- </tbody>
- </table>
- </div>
-
- </details>
-
- <details>
-
- <summary>click to view</summary>
-
- #### Medical Image Understanding Capabilities
-
- <div align="center">
- <table style="margin: 0 auto; text-align: center;">
- <thead>
- <tr>
- <th colspan="7">Medical Understanding&nbsp;&nbsp;&nbsp;</th>
- </tr>
- </thead>
- <tbody>
- <tr>
- <td>Model</td>
- <td>Size</td>
- <td>GMAI-MMB-VAL (Acc.)</td>
- <td>OpenMM-Medical (Acc.)</td>
- </tr>
- </thead>
- <tr>
- <td colspan="4">Proprietary Models</td>
- </tr>
- <tr>
- <td>GPT4o-mini</td>
- <td>-</td>
- <td>46.4</td>
- <td>74.3</td>
- </tr>
- <tr>
- <td colspan="4">Open-source Models (Vision-Language)</td>
- </tr>
- <tr>
- <td>Qwen2 VL</td>
- <td>7B</td>
- <td>46.3</td>
- <td>76.9</td>
- </tr>
- <tr>
- <td>Qwen2 VL</td>
- <td>72B</td>
- <td><b>50.7<br></td>
- <td>80.7</td>
- </tr>
- <tr>
- <td colspan="4">Open-source Models (Omni-modal)</td>
- </tr>
- <tr>
- <td>VITA-1.5</td>
- <td>7B</td>
- <td>36.7</td>
- <td>67.1</td>
- </tr>
- <tr>
- <td>MiniCPM-o 2.6</td>
- <td>7B</td>
- <td>41.5</td>
- <td>73.6</td>
- </tr>
- <tr>
- <td><b>Baichuan-Omni-1.5<br></td>
- <td>7B</td>
- <td>49.9</td>
- <td><b>83.8<br></td>
- </tr>
- </tbody>
- </table>
- </div>
-
- </details>
-
- ## Examples
- <br>
-
- <div style="display: flex; flex-direction: column; align-items: center;">
- <img src="https://github.com/baichuan-inc/Baichuan-Omni-1.5/raw/main/assets/pipeline.png" alt="pipeline" style="margin-bottom: 5px;">
- <img src="https://github.com/baichuan-inc/Baichuan-Omni-1.5/raw/main/assets/math.png" alt="math" style="margin-bottom: 5px;">
- <img src="https://github.com/baichuan-inc/Baichuan-Omni-1.5/raw/main/assets/fly_bill.png" alt="fly_bill" style="margin-bottom: 5px;">
- </div>
-
-
- ## 🚀 Quick Start
- We recommend interested scholars to visit our github repo for more details. [**Github**](https://github.com/baichuan-inc/Baichuan-Omni-1.5/)
-
-
- ### Statement
- - We hereby declare that our team has not developed any applications based on Baichuan-Omni-1.5/Baichuan-Omni-1.5-base models, not on iOS, Android, the web, or any other platform. We strongly call on all users not to use Baichuan-Omni-1.5/Baichuan-Omni-1.5-base models for any activities that harm national / social security or violate the law. Also, we ask users not to use Baichuan-Omni-1.5/Baichuan-Omni-1.5-base models for Internet services that have not undergone appropriate security reviews and filings. We hope that all users can abide by this principle and ensure that the development of technology proceeds in a regulated and legal environment.
-
- - We have done our best to ensure the compliance of the data used in the model training process. However, despite our considerable efforts, there may still be some unforeseeable issues due to the complexity of the model and data. Therefore, if any problems arise due to the use of Baichuan-Omni-1.5/Baichuan-Omni-1.5-base open-source models, including but not limited to data security issues, public opinion risks, or any risks and problems brought about by the model being misled, abused, spread or improperly exploited, we will not assume any responsibility.
-
-
-
- ### License
- The community usage of Baichuan-Omni-1.5/Baichuan-Omni-1.5-base requires adherence to [Apache 2.0](https://github.com/baichuan-inc/Baichuan-Omni-1.5/blob/main/LICENSE) and [Community License for Baichuan-Omni-1.5 Models](https://github.com/baichuan-inc/Baichuan-Omni-1.5/blob/main/LICENSE). The Baichuan-Omni-1.5/Baichuan-Omni-1.5-base models supports commercial use. If you plan to use the Baichuan-Omni-1.5/Baichuan-Omni-1.5-base models or its derivatives for commercial purposes, please ensure that your entity meets the following conditions:
-
- 1. The Daily Active Users (DAU) of your or your affiliate's service or product is less than 1 million.
- 2. Neither you nor your affiliates are software service providers or cloud service providers.
- 3. There is no possibility for you or your affiliates to grant the commercial license given to you, to reauthorize it to other third parties without Baichuan's permission.
-
- Upon meeting the above conditions, you need to submit the application materials required by the Baichuan-Omni-1.5 Model Community License Agreement via the following contact email: [email protected]. Once approved, Baichuan will hereby grant you a non-exclusive, global, non-transferable, non-sublicensable, revocable commercial copyright license.
-
- <!-- ### Citation
-
- If you find our work helpful, please consider citing our papers 📝 and liking this project ❤️!
- ```bib
- @article{
- } -->
- ```
+ <td>Image & <br> Audio (Acc.)</td>
+ <td>Image Caption & <br> Audio (Acc.)</td>
+ <td>Image & Audio