---
license: apache-2.0
datasets:
- fka/awesome-chatgpt-prompts
language:
- en
metrics:
- accuracy
base_model:
- moonshotai/Kimi-K2-Instruct
new_version: moonshotai/Kimi-K2-Instruct
pipeline_tag: summarization
library_name: adapter-transformers
tags:
- finance
---

## Accuracy and quick response balance

Model files for Qwen/Qwen3-4B can be downloaded from:

```cmd
https://huggingface.co/Qwen/Qwen3-4B/tree/main
```

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name = "./"  # Path to the downloaded Qwen3-4B files

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",   # Picks float16/bfloat16 depending on the GPU
    device_map="auto",    # Automatically maps layers to GPU/CPU
    trust_remote_code=True
)
model.eval()

# Inference function
def ask_qwen(prompt: str, max_new_tokens=128):
    # "/no_think" is Qwen3's soft switch for disabling step-by-step thinking;
    # enable_thinking=False below enforces the same at the template level.
    messages = [{"role": "user", "content": prompt + " /no_think"}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False  # Fast replies, no step-by-step thinking
    )
    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,  # Qwen3's recommended sampling settings
            top_p=0.8,        # for non-thinking mode
            top_k=20,
            min_p=0.0,
            do_sample=True
        )

    # Strip the prompt tokens so only the newly generated reply is decoded
    generated = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(generated, skip_special_tokens=True).strip()

# Continuous loop for user prompts
if __name__ == "__main__":
    print("🔁 Qwen3-4B Chat Running... Type 'exit' to quit.")
    while True:
        prompt = input("\nYou: ")
        if prompt.lower().strip() in ['exit', 'quit']:
            print("👋 Exiting Qwen chat.")
            break
        try:
            response = ask_qwen(prompt)
            print(f"Qwen: {response}")
        except Exception as e:
            print(f"⚠️ Error: {e}")
```
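
The script above loads the model from the current directory (`model_name = "./"`). A minimal sketch for fetching the Qwen3-4B files into that directory with `huggingface_hub`; the choice of `"./"` as the target is an assumption made to match the script above:

```python
from huggingface_hub import snapshot_download

# Download all repository files from the Hub into the current directory,
# so the script above can load them with model_name = "./".
snapshot_download(repo_id="Qwen/Qwen3-4B", local_dir="./")
```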
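
Since the goal here is quick responses, printing tokens as they are generated often feels faster than waiting for the complete reply. A minimal sketch using the `TextStreamer` utility from `transformers`, reusing the `model` and `tokenizer` loaded above; the example prompt is purely illustrative:

```python
from transformers import TextStreamer

# Prints each decoded chunk to stdout as soon as it is generated,
# skipping the prompt and special tokens.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

text = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Summarize today's market news. /no_think"}],
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False
)
inputs = tokenizer([text], return_tensors="pt").to(model.device)

model.generate(
    **inputs,
    max_new_tokens=128,
    streamer=streamer,  # Stream output instead of returning it all at once
    temperature=0.7,
    top_p=0.8,
    top_k=20,
    do_sample=True
)
```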