from transformers import AutoModelForCausalLM, AutoTokenizer

model_name_or_path = ""

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# device_map="auto" spreads the model across available devices; load_in_4bit=True
# loads 4-bit quantized weights (requires bitsandbytes).
# You may want to use bfloat16 and/or move to GPU here
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map="auto",
    load_in_4bit=True,
)

messages = [
    {"role": "user", "content": "How to make pasta?"},
]

# Build the prompt with the model's chat template.
tokenized_chat = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    thinking_budget=0,  # passed through to the chat template to control the thinking budget
)

outputs = model.generate(tokenized_chat.to(model.device), max_new_tokens=60)
output_text = tokenizer.decode(outputs[0])
print(output_text)
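
# Optional follow-up (a sketch, not part of the original snippet): decode only the
# newly generated tokens by slicing off the prompt, so the printed text does not
# repeat the chat prompt. Reuses `tokenized_chat`, `outputs`, and `tokenizer` from above.
new_tokens = outputs[0][tokenized_chat.shape[1]:]
print(tokenizer.decode(new_tokens, skip_special_tokens=True))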