Upload folder using huggingface_hub
Browse files
    	
        README.md
    CHANGED
    
    | @@ -3,9 +3,13 @@ license: mit | |
| 3 | 
             
            pipeline_tag: image-text-to-text
         | 
| 4 | 
             
            library_name: transformers
         | 
| 5 | 
             
            base_model:
         | 
|  | |
| 6 | 
             
              - OpenGVLab/InternVL2_5-8B
         | 
|  | |
| 7 | 
             
              - OpenGVLab/InternViT-300M-448px-V2_5
         | 
| 8 | 
             
              - internlm/internlm2_5-7b-chat
         | 
|  | |
|  | |
| 9 | 
             
            base_model_relation: merge
         | 
| 10 | 
             
            language:
         | 
| 11 | 
             
              - multilingual
         | 
| @@ -16,8 +20,8 @@ tags: | |
| 16 |  | 
| 17 | 
             
            # Sa2VA: Marrying SAM2 with LLaVA for Dense Grounded Understanding of Images and Videos
         | 
| 18 |  | 
| 19 | 
            -
            [\[π GitHub\]](https://github.com/ | 
| 20 | 
            -
            [\[π Sa2VA paper\]]()
         | 
| 21 | 
             
            [\[🚀 Quick Start\]](#quick-start) 
         | 
| 22 |  | 
| 23 |  | 
| @@ -30,11 +34,11 @@ Sa2VA is an MLLM capable of question answering, visual prompt understanding, and | |
| 30 |  | 
| 31 | 
             
            We built the Sa2VA series based on Qwen2-VL and InternVL2/2.5. In the following table, we provide some Sa2VA models built on InternVL2 and InternVL2.5. Other Sa2VA models will be open-sourced soon.
         | 
| 32 |  | 
| 33 | 
            -
            | Model Name |                             Base MLLM                             | | 
| 34 | 
            -
             | 
| 35 | 
            -
            |  Sa2VA-1B  | [InternVL2.0-1B](https://huggingface.co/OpenGVLab/InternVL2-1B) |   [Qwen2 | 
| 36 | 
            -
            |  Sa2VA-4B  | [InternVL2.5-4B](https://huggingface.co/OpenGVLab/InternVL2_5-4B) | | 
| 37 | 
            -
            |  Sa2VA-8B  | [InternVL2.5-8B](https://huggingface.co/OpenGVLab/InternVL2_5-8B) | | 
| 38 |  | 
| 39 | 
             
            ## Sa2VA Performance
         | 
| 40 | 
             
            | Model Name |                             MMBench                             |                                    MME                                     |                       RefCOCO                        | RefCOCO+ | RefCOCOg | MeVIS | DAVIS | ReVOS |
         | 
| @@ -67,7 +71,7 @@ tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast | |
| 67 |  | 
| 68 | 
             
            # for image chat
         | 
| 69 | 
             
            image_path = "/PATH/TO/IMAGE"
         | 
| 70 | 
            -
            text_prompts = "<image | 
| 71 | 
             
            image = Image.open(image_path).convert('RGB')
         | 
| 72 | 
             
            input_dict = {
         | 
| 73 | 
             
                'image': image,
         | 
| @@ -81,7 +85,7 @@ answer = return_dict["prediction"] # the text format answer | |
| 81 |  | 
| 82 | 
             
            # for image chat with segmentation output
         | 
| 83 | 
             
            image_path = "/PATH/TO/IMAGE"
         | 
| 84 | 
            -
            text_prompts = "<image | 
| 85 | 
             
            image = Image.open(image_path).convert('RGB')
         | 
| 86 | 
             
            input_dict = {
         | 
| 87 | 
             
                'image': image,
         | 
| @@ -97,7 +101,7 @@ masks = return_dict['prediction_masks']  # segmentation masks, list(np.array(1, | |
| 97 | 
             
            # for chat with visual prompt (mask format) input
         | 
| 98 | 
             
            mask_prompts = np.load('/PATH/TO/pred_masks.npy') # np.array(n_prompts, h, w)
         | 
| 99 | 
             
            image_path = "/PATH/TO/IMAGE"
         | 
| 100 | 
            -
            text_prompts = "<image | 
| 101 | 
             
            image = Image.open(image_path).convert('RGB')
         | 
| 102 | 
             
            input_dict = {
         | 
| 103 | 
             
                'image': image,
         | 
| @@ -116,7 +120,7 @@ images_paths = [os.path.join(video_folder, image_path) for image_name in images_ | |
| 116 | 
             
            if len(images_paths) > 5:  # uniformly sample 5 frames
         | 
| 117 | 
             
                step = (len(images_paths) - 1) // (5 - 1)
         | 
| 118 | 
             
                images_paths = [images_paths[0]] + images_paths[1:-1][::step][1:] + [images_paths[-1]]
         | 
| 119 | 
            -
            text_prompts = "<image | 
| 120 | 
             
            input_dict = {
         | 
| 121 | 
             
                'video': images_paths,
         | 
| 122 | 
             
                'text': text_prompts,
         | 
| @@ -132,7 +136,7 @@ answer = return_dict["prediction"] # the text format answer | |
| 132 | 
             
            video_folder = "/PATH/TO/VIDEO_FOLDER"
         | 
| 133 | 
             
            images_paths = os.listdir(video_folder)
         | 
| 134 | 
             
            images_paths = [os.path.join(video_folder, image_name) for image_name in images_paths]
         | 
| 135 | 
            -
            text_prompts = "<image | 
| 136 | 
             
            input_dict = {
         | 
| 137 | 
             
                'video': images_paths,
         | 
| 138 | 
             
                'text': text_prompts,
         | 
|  | |
| 3 | 
             
            pipeline_tag: image-text-to-text
         | 
| 4 | 
             
            library_name: transformers
         | 
| 5 | 
             
            base_model:
         | 
| 6 | 
            +
              - OpenGVLab/InternVL2-1B
         | 
| 7 | 
             
              - OpenGVLab/InternVL2_5-8B
         | 
| 8 | 
            +
              - OpenGVLab/InternVL2_5-4B
         | 
| 9 | 
             
              - OpenGVLab/InternViT-300M-448px-V2_5
         | 
| 10 | 
             
              - internlm/internlm2_5-7b-chat
         | 
| 11 | 
            +
              - Qwen/Qwen2-0.5B-Instruct
         | 
| 12 | 
            +
              - Qwen/Qwen2.5-3B-Instruct
         | 
| 13 | 
             
            base_model_relation: merge
         | 
| 14 | 
             
            language:
         | 
| 15 | 
             
              - multilingual
         | 
|  | |
| 20 |  | 
| 21 | 
             
            # Sa2VA: Marrying SAM2 with LLaVA for Dense Grounded Understanding of Images and Videos
         | 
| 22 |  | 
| 23 | 
            +
            [\[📂 GitHub\]](https://github.com/magic-research/Sa2VA)
         | 
| 24 | 
            +
            [\[📜 Sa2VA paper\]](https://arxiv.org/abs/2501.04001)
         | 
| 25 | 
             
            [\[🚀 Quick Start\]](#quick-start) 
         | 
| 26 |  | 
| 27 |  | 
|  | |
| 34 |  | 
| 35 | 
             
            We built the Sa2VA series based on Qwen2-VL and InternVL2/2.5. In the following table, we provide some Sa2VA models built on InternVL2 and InternVL2.5. Other Sa2VA models will be open-sourced soon.
         | 
| 36 |  | 
| 37 | 
            +
            | Model Name |                             Base MLLM                             |                                Language Part                                |                       HF Link                        |
         | 
| 38 | 
            +
            |:----------:|:-----------------------------------------------------------------:|:---------------------------------------------------------------------------:|:----------------------------------------------------:|
         | 
| 39 | 
            +
            |  Sa2VA-1B  | [InternVL2.0-1B](https://huggingface.co/OpenGVLab/InternVL2-1B) |   [Qwen2-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct)    | [🤗 link](https://huggingface.co/ByteDance/Sa2VA-1B) |
         | 
| 40 | 
            +
            |  Sa2VA-4B  | [InternVL2.5-4B](https://huggingface.co/OpenGVLab/InternVL2_5-4B) |   [Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct)    | [🤗 link](https://huggingface.co/ByteDance/Sa2VA-4B) |
         | 
| 41 | 
            +
            |  Sa2VA-8B  | [InternVL2.5-8B](https://huggingface.co/OpenGVLab/InternVL2_5-8B) | [internlm2_5-7b-chat](https://huggingface.co/internlm/internlm2_5-7b-chat)  | [🤗 link](https://huggingface.co/ByteDance/Sa2VA-8B) |
         | 
| 42 |  | 
| 43 | 
             
            ## Sa2VA Performance
         | 
| 44 | 
             
            | Model Name |                             MMBench                             |                                    MME                                     |                       RefCOCO                        | RefCOCO+ | RefCOCOg | MeVIS | DAVIS | ReVOS |
         | 
|  | |
| 71 |  | 
| 72 | 
             
            # for image chat
         | 
| 73 | 
             
            image_path = "/PATH/TO/IMAGE"
         | 
| 74 | 
            +
            text_prompts = "<image>Please describe the image."
         | 
| 75 | 
             
            image = Image.open(image_path).convert('RGB')
         | 
| 76 | 
             
            input_dict = {
         | 
| 77 | 
             
                'image': image,
         | 
|  | |
| 85 |  | 
| 86 | 
             
            # for image chat with segmentation output
         | 
| 87 | 
             
            image_path = "/PATH/TO/IMAGE"
         | 
| 88 | 
            +
            text_prompts = "<image>Could you please give me a brief description of the image? Please respond with interleaved segmentation masks for the corresponding parts of the answer."
         | 
| 89 | 
             
            image = Image.open(image_path).convert('RGB')
         | 
| 90 | 
             
            input_dict = {
         | 
| 91 | 
             
                'image': image,
         | 
|  | |
| 101 | 
             
            # for chat with visual prompt (mask format) input
         | 
| 102 | 
             
            mask_prompts = np.load('/PATH/TO/pred_masks.npy') # np.array(n_prompts, h, w)
         | 
| 103 | 
             
            image_path = "/PATH/TO/IMAGE"
         | 
| 104 | 
            +
            text_prompts = "<image>Can you provide me with a detailed description of the region in the picture marked by region1."
         | 
| 105 | 
             
            image = Image.open(image_path).convert('RGB')
         | 
| 106 | 
             
            input_dict = {
         | 
| 107 | 
             
                'image': image,
         | 
|  | |
| 120 | 
             
            if len(images_paths) > 5:  # uniformly sample 5 frames
         | 
| 121 | 
             
                step = (len(images_paths) - 1) // (5 - 1)
         | 
| 122 | 
             
                images_paths = [images_paths[0]] + images_paths[1:-1][::step][1:] + [images_paths[-1]]
         | 
| 123 | 
            +
            text_prompts = "<image>Please describe the video."
         | 
| 124 | 
             
            input_dict = {
         | 
| 125 | 
             
                'video': images_paths,
         | 
| 126 | 
             
                'text': text_prompts,
         | 
|  | |
| 136 | 
             
            video_folder = "/PATH/TO/VIDEO_FOLDER"
         | 
| 137 | 
             
            images_paths = os.listdir(video_folder)
         | 
| 138 | 
             
            images_paths = [os.path.join(video_folder, image_name) for image_name in images_paths]
         | 
| 139 | 
            +
            text_prompts = "<image>Please segment the person."
         | 
| 140 | 
             
            input_dict = {
         | 
| 141 | 
             
                'video': images_paths,
         | 
| 142 | 
             
                'text': text_prompts,
         | 
