import gradio as gr from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer import torch import os from threading import Thread # Nonaktifkan cache Hugging Face untuk hemat penyimpanan os.environ["HF_HUB_DISABLE_CACHE"] = "1" # Muat model dan tokenizer model_name = "Qwen/Qwen2-0.5B-Instruct" tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=torch.bfloat16, # bfloat16 untuk efisiensi CPU device_map="cpu", # Paksa ke CPU untuk Space gratis trust_remote_code=True, low_cpu_mem_usage=True # Optimasi memori ) # Fungsi untuk menghasilkan respons def generate_response(user_input, chat_history): if not user_input.strip(): return [("Error", "Masukkan teks tidak boleh kosong!")], chat_history if not chat_history: chat_history = [] # Format riwayat percakapan (batasi 5 interaksi terakhir untuk efisiensi) messages = [] for user_msg, bot_msg in chat_history[-5:]: messages.append({"role": "user", "content": user_msg}) messages.append({"role": "assistant", "content": bot_msg}) # Tambahkan input pengguna saat ini messages.append({"role": "user", "content": user_input}) # Buat prompt menggunakan format chat Qwen prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) # Tokenisasi input inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to("cpu") # Gunakan TextStreamer untuk streaming respons (meningkatkan UX) streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) # Generate respons di thread terpisah untuk responsivitas def generate(): outputs = model.generate( **inputs, max_new_tokens=200, # Batasi token untuk kecepatan do_sample=True, temperature=0.75, top_p=0.85, eos_token_id=tokenizer.eos_token_id, use_cache=True, # Cache untuk inferensi lebih cepat streamer=streamer ) return outputs # Jalankan generasi di thread thread = Thread(target=generate) thread.start() thread.join() # Ambil respons dari output streamer (decode manual) bot_response = tokenizer.decode( model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.75, top_p=0.85)[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True ) # Perbarui riwayat percakapan chat_history.append((user_input, bot_response)) # Format output untuk Gradio Chatbot return [(user_msg, bot_msg) for user_msg, bot_msg in chat_history], chat_history # Fungsi untuk menghapus riwayat def clear_history(): return [], [] # Antarmuka Gradio with gr.Blocks( theme=gr.themes.Monochrome(), # Tema modern dan bersih css=""" #chatbot {border-radius: 10px; border: 1px solid #e0e0e0;} .gradio-container {max-width: 800px; margin: auto;} #input-box {border-radius: 8px;} #submit-btn, #clear-btn {border-radius: 8px; background: #007bff; color: white;} #submit-btn:hover, #clear-btn:hover {background: #0056b3;} """ ) as demo: gr.Markdown( """ # 💬 Chatbot Qwen (Alibaba) Ajukan pertanyaan dan dapatkan respons cerdas dari model Qwen2-0.5B-Instruct! """ ) # Komponen UI chatbot = gr.Chatbot( label="Percakapan", height=450, show_label=False, elem_id="chatbot", bubble_full_width=False ) with gr.Row(): user_input = gr.Textbox( placeholder="Ketik pertanyaanmu di sini...", show_label=False, elem_id="input-box", scale=4 ) submit_button = gr.Button("Kirim", elem_id="submit-btn", scale=1) clear_button = gr.Button("Hapus Riwayat", elem_id="clear-btn") # State untuk menyimpan riwayat percakapan chat_history = gr.State([]) # Aksi tombol submit_button.click( fn=generate_response, inputs=[user_input, chat_history], outputs=[chatbot, chat_history], _js="() => {document.querySelector('input').value = '';}" # Kosongkan input ) clear_button.click( fn=clear_history, inputs=None, outputs=[chatbot, chat_history] ) # Luncurkan aplikasi demo.launch()