---
library_name: transformers
license: apache-2.0
base_model:
- allenai/Molmo-7B-D-0924
base_model_relation: quantized
tags:
- bitsandbytes
- Molmo
- chat
- multimodal
---

A ```bitsandbytes``` quantization of the model located at: https://huggingface.co/allenai/Molmo-7B-D-0924

NOTE: The sample script below requires an Nvidia GPU and the following libraries installed into your virtual environment. If you rely on a system-wide installation of CUDA instead, simply remove the ```set_cuda_paths``` function.

- ```nvidia-cublas-cu12==12.1.3.1```
- ```nvidia-cuda-runtime-cu12==12.1.105```
- ```nvidia-cuda-nvrtc-cu12==12.1.105```
- ```nvidia-cufft-cu12==11.0.2.54```

Also, I've only tested this with ```torch==2.2.2```, although I plan on testing higher versions.

For Windows users, install torch with these commands:

```
pip install https://download.pytorch.org/whl/cu121/torch-2.2.2%2Bcu121-cp311-cp311-win_amd64.whl#sha256=efbcfdd4399197d06b32f7c0e1711c615188cdd65427b933648c7478fb880b3f
```
```
pip install https://download.pytorch.org/whl/cu121/torchvision-0.17.2%2Bcu121-cp311-cp311-win_amd64.whl#sha256=10ad542aab6b47dbe73c441381986d50a7ed5021cbe01d593a14477ec1f067a0
```
```
pip install https://download.pytorch.org/whl/cu121/torchaudio-2.2.2%2Bcu121-cp311-cp311-win_amd64.whl#sha256=c7dee68cd3d2b889bab71d4a0c345bdc3ea2fe79a62b921a6b49292c605b6071
```
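After installing, a quick sanity check (a suggestion, not part of the original instructions) confirms the CUDA wheels are being picked up:

```Python
import torch

print(torch.__version__)              # expect 2.2.2+cu121
print(torch.cuda.is_available())      # expect True
print(torch.cuda.get_device_name(0))  # your GPU model
```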
Example script (process single image):

```Python
import sys
import os
from pathlib import Path

def set_cuda_paths():
    # Point the CUDA environment variables at the pip-installed Nvidia
    # libraries inside the virtual environment (Windows venv layout).
    # Remove this function if you rely on a system-wide CUDA installation.
    venv_base = Path(sys.executable).parent.parent
    nvidia_base_path = venv_base / 'Lib' / 'site-packages' / 'nvidia'
    cuda_path = nvidia_base_path / 'cuda_runtime' / 'bin'
    cublas_path = nvidia_base_path / 'cublas' / 'bin'
    cudnn_path = nvidia_base_path / 'cudnn' / 'bin'
    nvrtc_path = nvidia_base_path / 'cuda_nvrtc' / 'bin'
    paths_to_add = [
        str(cuda_path),
        str(cublas_path),
        str(cudnn_path),
        str(nvrtc_path),
    ]
    env_vars = ['CUDA_PATH', 'CUDA_PATH_V12_1', 'PATH']
    for env_var in env_vars:
        current_value = os.environ.get(env_var, '')
        new_value = os.pathsep.join(paths_to_add + [current_value] if current_value else paths_to_add)
        os.environ[env_var] = new_value

set_cuda_paths()

import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

model_path = r"[INSERT THE PATH TO THE FOLDER HOLDING THE MODEL FILES HERE]"

class VisionModel:
    def __init__(self):
        self.model = None
        self.processor = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def initialize_model_and_processor(self):
        self.processor = AutoProcessor.from_pretrained(
            model_path,
            trust_remote_code=True,
            torch_dtype='auto',
            device_map='auto'
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            torch_dtype='auto',
            device_map='auto'
        )

    def process_single_image(self, image_path):
        image = Image.open(image_path)
        if image.mode != "RGB":
            image = image.convert("RGB")
        text = "Describe this image in as much detail as possible, but be succinct and don't repeat yourself."
        # Preprocess, move the tensors to the GPU, and add a batch dimension
        inputs = self.processor.process(images=[image], text=text)
        inputs = {k: v.to(self.device).unsqueeze(0) for k, v in inputs.items()}
        output = self.model.generate_from_batch(
            inputs,
            GenerationConfig(max_new_tokens=500, stop_strings=["<|endoftext|>"]),
            tokenizer=self.processor.tokenizer
        )
        # Decode only the newly generated tokens, skipping the prompt
        generated_text = self.processor.tokenizer.decode(
            output[0, inputs['input_ids'].size(1):],
            skip_special_tokens=True
        )
        print(f"\nGenerated Text:\n{generated_text}\n")

if __name__ == "__main__":
    image_path = r"[INSERT THE PATH TO THE IMAGE YOU WANT TO PROCESS HERE]"
    vision_model = VisionModel()
    vision_model.initialize_model_and_processor()
    vision_model.process_single_image(image_path)
```
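For reference, a quantized copy like this one can be produced with ```transformers``` and ```bitsandbytes```. The sketch below assumes 4-bit NF4 settings, which may differ from the exact configuration used for this repo, and saving quantized weights requires a reasonably recent ```transformers```/```bitsandbytes```:

```Python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Hypothetical settings; the actual configuration used here may differ.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # 4-bit weights via bitsandbytes
    bnb_4bit_quant_type="nf4",              # NormalFloat4 quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for matmuls
)

model = AutoModelForCausalLM.from_pretrained(
    "allenai/Molmo-7B-D-0924",
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map="auto",
)

model.save_pretrained("Molmo-7B-D-0924-bnb-4bit")  # write quantized weights
```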