from transformers import AutoModelForCausalLM, AutoTokenizer
model_name_or_path = ""  # local path or Hugging Face Hub ID of the model
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path, device_map="auto", load_in_4bit=True
)  # 4-bit quantization requires bitsandbytes; you may prefer bfloat16 instead (see the sketch below)
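# A sketch of the bfloat16 alternative mentioned in the comment above; it assumes
# torch is installed and skips 4-bit quantization entirely (left commented out so
# the main example runs unchanged):
#
#     import torch
#     model = AutoModelForCausalLM.from_pretrained(
#         model_name_or_path, torch_dtype=torch.bfloat16, device_map="auto"
#     )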
messages = [
    {"role": "user", "content": "How to make pasta?"},
]
tokenized_chat = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    thinking_budget=0,  # control the thinking budget (model-specific kwarg forwarded to the chat template)
)
outputs = model.generate(tokenized_chat.to(model.device), max_new_tokens=60)
output_text = tokenizer.decode(outputs[0])
print(output_text)
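
# A minimal follow-up sketch: re-run the same prompt with a non-zero budget so the
# model can "think" before answering. Assumptions: the thinking_budget kwarg behaves
# as in the call above, and 512 is an arbitrary illustrative value, not a recommended
# setting.
tokenized_chat = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    thinking_budget=512,
)
outputs = model.generate(tokenized_chat.to(model.device), max_new_tokens=512)
print(tokenizer.decode(outputs[0]))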