less-rambling inference

#1 opened by LolaRoseHB
from mlx_lm import load, generate

model_name = "barretts/Seed-Coder-8B-Reasoning-mlx-8Bit"
model, tokenizer = load(model_name)

prefill = """You are a helpful, concise, and precise assistant.
When answering questions:

  • Be direct and clear
  • Avoid unnecessary elaboration
  • Provide accurate information
  • Use simple language
  • Focus on the core of the question
  • Do not overthink or generate multiple redundant explanations"""

while True:
    user_input = input("Enter your prompt (or type 'exit' to quit): ")
    if user_input.lower() == 'exit':
        break

    if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template is not None:
        # Build the prompt with the model's chat template; the prefill is passed
        # as an assistant turn to steer the reply toward short, direct answers.
        messages = [
            {"role": "user", "content": user_input},
            {"role": "assistant", "content": prefill},
        ]
        prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
    else:
        # No chat template available: build the prompt by hand from the same pieces.
        prompt = f"{prefill}\n<|user|>\n{user_input}<|end|>\n<|assistant|>"

    response = generate(
        model=model,
        tokenizer=tokenizer,
        prompt=prompt,
        max_tokens=2048,
        verbose=True,
    )

    print("\nResponse:\n", response)
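If the chat template accepts a system role, the same brevity instructions may work better as a system message than as an assistant prefill, since add_generation_prompt=True opens a fresh assistant turn after the prefill anyway. A minimal sketch of that variant, assuming (not verified) that this model's template supports "system":

# Variant: pass the brevity instructions as a system message instead of an
# assistant prefill. Assumes the chat template accepts a "system" role; if it
# does not, apply_chat_template may raise or ignore the message.
messages = [
    {"role": "system", "content": prefill},
    {"role": "user", "content": user_input},
]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
response = generate(model=model, tokenizer=tokenizer, prompt=prompt,
                    max_tokens=2048, verbose=True)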
