less-rambling inference
#1
by LolaRoseHB
```python
from mlx_lm import load, generate

model_name = "barretts/Seed-Coder-8B-Reasoning-mlx-8Bit"
model, tokenizer = load(model_name)

# Instructions that steer the model toward short, focused answers.
prefill = """You are a helpful, concise, and precise assistant.
When answering questions:
- Be direct and clear
- Avoid unnecessary elaboration
- Provide accurate information
- Use simple language
- Focus on the core of the question
- Do not overthink or generate multiple redundant explanations"""

while True:
    user_input = input("Enter your prompt (or type 'exit' to quit): ")
    if user_input.lower() == 'exit':
        break

    if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template is not None:
        # Pass the instructions as a system message so the chat template
        # renders them ahead of the user turn. (If this model's template
        # rejects a system role, prepend them to the user message instead.)
        messages = [
            {"role": "system", "content": prefill},
            {"role": "user", "content": user_input},
        ]
        prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
    else:
        # Fallback for tokenizers without a chat template.
        prompt = f"<|user|>\n{prefill}\n\n{user_input}<|end|>\n<|assistant|>"

    response = generate(
        model=model,
        tokenizer=tokenizer,
        prompt=prompt,
        max_tokens=2048,
        verbose=True,
    )
    print("\nResponse:\n", response)
```