# NOTE: web-page status text ("Spaces: / Sleeping / Sleeping") from the
# Hugging Face Spaces UI was captured with this file; removed as it is not code.
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

import torch
class ModelLoader:
    """Load a causal language model in 4-bit NF4 quantization plus its tokenizer.

    The model is sharded automatically across GPU and CPU (``device_map="auto"``)
    with explicit per-device memory caps, so large checkpoints fit on small GPUs
    by offloading overflow layers to CPU RAM.
    """

    def __init__(self, model_name, hugging_face_token,
                 max_cpu_memory="12GiB", max_gpu_memory="4GiB"):
        """Download and initialize the tokenizer and the quantized model.

        Args:
            model_name: Hugging Face Hub model identifier (e.g. "meta-llama/...").
            hugging_face_token: HF access token for gated/private repositories.
            max_cpu_memory: Cap on CPU RAM used for offloaded weights.
            max_gpu_memory: Cap on GPU 0 memory used for resident weights.
        """
        self.model_name = model_name

        # 4-bit NF4 quantization; bfloat16 compute for the dequantized matmuls.
        # fp32 CPU offload lets layers that don't fit on the GPU stay unquantized
        # on CPU instead of failing to load.
        self.bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            llm_int8_enable_fp32_cpu_offload=True,
        )

        # Tokenizer (token needed for gated repos).
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            token=hugging_face_token,
        )

        # Load model with memory optimizations. NOTE: accelerate's max_memory
        # dict keys GPUs by integer index (0), not by "cuda:0" — the string
        # form is rejected by the device-map dispatcher.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=self.bnb_config,
            device_map="auto",
            low_cpu_mem_usage=True,
            max_memory={
                "cpu": max_cpu_memory,
                0: max_gpu_memory,
            },
            token=hugging_face_token,
        )