Summary

A 4-bit quantization of scb10x/typhoon-7b that requires less than 8 GB of VRAM.

Steps to reproduce

# init parameters
# Identifier handed to every `from_pretrained` call in this script.
model_name: str = 'scb10x/typhoon-7b'
# Selects how the model is loaded. Recognized values:
#   'q4-bnb_cuda', 'q8-bnb_cuda', 'q4-torch_ptdq', 'q8-torch_ptdq'
# Any other value falls through to the unquantized "default model" branch.
quantization_mode: str = 'q4-bnb_cuda'

# load tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token id as the padding token id.
tokenizer.pad_token_id = tokenizer.eos_token_id
print(tokenizer)  # LlamaTokenizerFast

# load model
# Builds `model` according to `quantization_mode`:
#   'q4-bnb_cuda'   -> bitsandbytes 4-bit weights, placed by device_map='auto'
#   'q8-bnb_cuda'   -> bitsandbytes 8-bit weights, placed by device_map='auto'
#   'q4-torch_ptdq' -> float32 load, then torch dynamic quantization (quint4x2)
#   'q8-torch_ptdq' -> float32 load, then torch dynamic quantization (qint8)
#   anything else   -> plain, unquantized load
import torch
from transformers import AutoModelForCausalLM

if quantization_mode == 'q4-bnb_cuda':  # ampere architecture with 8gb vram + cpu with 20gb is recommended
    print('4-bits bitsandbytes quantization with cuda')
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_4bit=True,
        device_map='auto',
        torch_dtype=torch.bfloat16)
elif quantization_mode == 'q8-bnb_cuda':  # ampere architecture with 12gb vram + cpu with 20gb is recommended
    print('8-bits bitsandbytes quantization with cuda')
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_8bit=True,
        device_map='auto',
        torch_dtype=torch.bfloat16)
elif quantization_mode == 'q4-torch_ptdq':  # cpu with 64gb++ ram is recommended
    print('4-bits x2 post training dynamic quantization')
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32)
    # NOTE(review): without an explicit qconfig_spec, quantize_dynamic's
    # default for torch.quint4x2 appears to target only nn.EmbeddingBag, so
    # this call may leave the model's layers unquantized -- verify against the
    # installed torch version before relying on this mode.
    model = torch.quantization.quantize_dynamic(base_model, dtype=torch.quint4x2)
elif quantization_mode == 'q8-torch_ptdq':  # cpu with 64gb++ ram is recommended
    print('8-bits post training dynamic quantization')
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32)
    # torch.qint8 is the dtype whose default qconfig_spec dynamically
    # quantizes nn.Linear layers. The previous torch.quint8 only selected the
    # weight-only embedding spec and left every linear layer in float32.
    model = torch.quantization.quantize_dynamic(base_model, dtype=torch.qint8)
else:
    # Unrecognized modes deliberately fall back to the unquantized model.
    print('default model')
    model = AutoModelForCausalLM.from_pretrained(model_name)
print(model)  # MistralForCausalLM

# text generator
from transformers import GenerationConfig, TextGenerationPipeline

# Start from the generation settings published with the checkpoint, then
# override the sampling-related fields for this demo.
config = GenerationConfig.from_pretrained(model_name)
config.num_return_sequences = 1
config.do_sample = True
config.max_new_tokens = 128
config.temperature = 0.7
config.top_p = 0.95
config.repetition_penalty = 1.3

# Wrap the loaded model and tokenizer in a text-generation pipeline that uses
# the configuration assembled above.
generator = TextGenerationPipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    generation_config=config,
)

# sample
# Thai prompt; in English: "What is the meaning of life?"
sample: str = 'ความหมายของชีวิตคืออะไร?\n'
# pad_token_id is passed explicitly, using the tokenizer's EOS id.
output = generator(sample, pad_token_id=tokenizer.eos_token_id)
# The pipeline returns a list of results; print the text of the first one.
print(output[0]['generated_text'])

requirements.txt

torch==2.1.2
accelerate==0.25.0
bitsandbytes==0.41.3
#transformers==4.37.0.dev0
transformers @ git+https://github.com/huggingface/transformers
Downloads last month
18
Safetensors
Model size
3.89B params
Tensor type
F32
·
BF16
·
U8
·
Inference Examples
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social visibility and check back later, or deploy to Inference Endpoints (dedicated) instead.

Space using bandhit/typhoon-7b-q4-bnb_cuda-ts-1703352224 1