import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import torch
import os

MODEL_NAMES = {
    "DeepSeek-R1-Distill-Qwen-7B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
    "DeepSeek-R1-Distill-Llama-8B": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
}

HF_TOKEN = os.getenv("HF_TOKEN")


def load_model(model_path):
    """Load a tokenizer and model from the Hugging Face Hub."""
    tokenizer = AutoTokenizer.from_pretrained(
        model_path, trust_remote_code=True, token=HF_TOKEN
    )
    config = AutoConfig.from_pretrained(
        model_path, trust_remote_code=True, token=HF_TOKEN
    )
    if hasattr(config, "quantization_config"):
        del config.quantization_config  # Drop the quantization config to avoid FP8 loading

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        config=config,
        trust_remote_code=True,
        token=HF_TOKEN,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    return model, tokenizer


# Initialize the default model
current_model_name = "DeepSeek-R1-Distill-Llama-8B"
current_model, current_tokenizer = load_model(MODEL_NAMES[current_model_name])


def chat(message, history, model_name):
    global current_model, current_tokenizer, current_model_name

    # Switch models if the user selected a different one
    if model_name != current_model_name:
        current_model_name = model_name
        current_model, current_tokenizer = load_model(MODEL_NAMES[model_name])

    device = "cuda" if torch.cuda.is_available() else "cpu"
    inputs = current_tokenizer(message, return_tensors="pt").to(device)
    outputs = current_model.generate(**inputs, max_new_tokens=1024)
    # Decode only the newly generated tokens, not the echoed prompt
    response = current_tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )
    return response


with gr.Blocks() as app:
    gr.Markdown("## Chatbot with DeepSeek Models")

    model_selector = gr.Dropdown(
        choices=list(MODEL_NAMES.keys()),
        value=current_model_name,
        label="Select Model",
    )

    # Pass the dropdown as an additional input so chat() receives the
    # currently selected model name with every message (reading
    # model_selector.value inside a lambda would only return the initial value).
    chat_interface = gr.ChatInterface(
        fn=chat,
        additional_inputs=[model_selector],
        type="messages",
        flagging_mode="manual",
        save_history=True,
    )

app.launch()