gr0010 committed
Commit 7735e46 · verified · 1 Parent(s): 7307bfa

Create app.py

Files changed (1): app.py +456 -0
app.py ADDED
@@ -0,0 +1,456 @@
+ import os
+ import torch
+ import gradio as gr
+ import spaces
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ # -------------------------------------------------
+ # Model setup (loaded once at startup)
+ # -------------------------------------------------
+ model_name = "gr0010/Art-0-8B-development"
+
+ # Load model and tokenizer globally
+ print("Loading model and tokenizer...")
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+ # Load the model weights directly onto CUDA (ZeroGPU attaches the device on demand)
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     torch_dtype=torch.bfloat16,
+     device_map="cuda",  # Direct CUDA loading for ZeroGPU
+     trust_remote_code=True,
+ )
+ print("Model loaded successfully!")
+
+ # -------------------------------------------------
+ # Core generation and parsing logic with Zero GPU
+ # -------------------------------------------------
+ @spaces.GPU(duration=120)  # Request GPU for up to 120 seconds
+ def generate_and_parse(messages: list, temperature: float = 0.6,
+                        top_p: float = 0.95, top_k: int = 20,
+                        min_p: float = 0.0, max_new_tokens: int = 32768):
+     """
+     Takes a clean list of messages, generates a response,
+     and parses it into thinking and answer parts.
+     Decorated with @spaces.GPU for Zero GPU allocation.
+     """
+     # Apply chat template with enable_thinking=True for Qwen3
+     prompt_text = tokenizer.apply_chat_template(
+         messages,
+         tokenize=False,
+         add_generation_prompt=True,
+         enable_thinking=True  # Explicitly enable thinking mode
+     )
+
+     # --- CONSOLE DEBUG OUTPUT ---
+     print("\n" + "="*50)
+     print("--- RAW PROMPT SENT TO MODEL ---")
+     print(prompt_text[:500] + "..." if len(prompt_text) > 500 else prompt_text)
+     print("="*50 + "\n")
+
+     model_inputs = tokenizer([prompt_text], return_tensors="pt").to("cuda")
+
+     with torch.no_grad():
+         generated_ids = model.generate(
+             **model_inputs,
+             max_new_tokens=max_new_tokens,
+             do_sample=True,
+             temperature=temperature,
+             top_p=top_p,
+             top_k=top_k,
+             min_p=min_p,
+             pad_token_id=tokenizer.eos_token_id,
+         )
+
+     output_token_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
+
+     thinking = ""
+     answer = ""
+     try:
+         # Find the </think> token to separate thinking from answer
+         end_think_token_id = 151668  # </think>
+         if end_think_token_id in output_token_ids:
+             end_think_idx = output_token_ids.index(end_think_token_id) + 1
+             thinking_tokens = output_token_ids[:end_think_idx]
+             answer_tokens = output_token_ids[end_think_idx:]
+
+             thinking = tokenizer.decode(thinking_tokens, skip_special_tokens=True).strip()
+             # Remove <think> and </think> tags from thinking
+             thinking = thinking.replace("<think>", "").replace("</think>", "").strip()
+
+             answer = tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()
+         else:
+             # If no </think> token found, treat everything as answer
+             answer = tokenizer.decode(output_token_ids, skip_special_tokens=True).strip()
+             # Remove any stray <think> tags
+             answer = answer.replace("<think>", "").replace("</think>", "")
+     except (ValueError, IndexError):
+         answer = tokenizer.decode(output_token_ids, skip_special_tokens=True).strip()
+         answer = answer.replace("<think>", "").replace("</think>", "")
+
+     return thinking, answer
+
+ # -------------------------------------------------
+ # Gradio UI Logic
+ # -------------------------------------------------
+
+ # Custom CSS for better styling
+ custom_css = """
+ .model-info {
+     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+     padding: 1rem;
+     border-radius: 10px;
+     margin-bottom: 1rem;
+     color: white;
+ }
+ .model-info a {
+     color: #fff;
+     text-decoration: underline;
+     font-weight: bold;
+ }
+ .cta-section {
+     background: #f0f0f0;
+     padding: 1rem;
+     border-radius: 10px;
+     margin-bottom: 1rem;
+     text-align: center;
+ }
+ .cta-section a {
+     display: inline-block;
+     margin: 0 0.5rem;
+     padding: 0.5rem 1rem;
+     background: #ff6b6b;
+     color: white;
+     text-decoration: none;
+     border-radius: 5px;
+     transition: background 0.3s;
+ }
+ .cta-section a:hover {
+     background: #ff5252;
+ }
+ """
+
+ with gr.Blocks(theme=gr.themes.Soft(), fill_height=True, css=custom_css) as demo:
+     # Separate states for display and model context
+     display_history_state = gr.State([])  # For Gradio chatbot display
+     model_history_state = gr.State([])  # Clean history for model
+     is_generating_state = gr.State(False)  # To prevent multiple submissions
+
+     # Model info and CTA section
+     gr.HTML("""
+     <div class="model-info">
+         <h1 style="margin: 0; font-size: 2em;">🎨 Art-0 8B Thinking Chatbot</h1>
+         <p style="margin: 0.5rem 0;">
+             Powered by <a href="https://huggingface.co/gr0010/Art-0-8B-development" target="_blank">Art-0-8B-development</a>
+             - A fine-tuned Qwen3-8B model with advanced reasoning capabilities
+         </p>
+     </div>
+
+     <div class="cta-section">
+         <strong>💡 Enjoying this model?</strong>
+         <a href="https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME" target="_blank">⭐ Like this Space</a>
+         <a href="https://huggingface.co/gr0010/Art-0-8B-development/discussions" target="_blank">💬 Leave Feedback</a>
+         <a href="https://huggingface.co/gr0010" target="_blank">👤 Follow AGI-0</a>
+     </div>
+     """)
+
+     gr.Markdown(
+         """
+         Chat with Art-0-8B, featuring transparent reasoning display and custom personality instructions.
+         The model shows its internal thought process when solving problems.
+         """
+     )
+
+     # System prompt at the top (main feature)
+     with gr.Group():
+         gr.Markdown("### 🎭 System Prompt (Personality & Behavior)")
+         system_prompt = gr.Textbox(
+             value="""Personality Instructions:
+ You are an AI assistant named Art developed by AGI-0.
+ Reasoning Instructions:
+ Think using bullet points and short sentences to simulate thoughts and emoticons to simulate emotions""",
+             label="System Prompt",
+             info="Define the model's personality and reasoning style",
+             lines=5,
+             interactive=True
+         )
+
+     # Main chat interface
+     chatbot = gr.Chatbot(
+         label="Conversation",
+         elem_id="chatbot",
+         bubble_full_width=False,
+         height=500,
+         show_copy_button=True,
+         type="tuples"  # history below is kept as [user, assistant] pairs
+     )
+
+     with gr.Row():
+         user_input = gr.Textbox(
+             show_label=False,
+             placeholder="Type your message here...",
+             scale=4,
+             container=False,
+             interactive=True
+         )
+         submit_btn = gr.Button(
+             "Send",
+             variant="primary",
+             scale=1,
+             interactive=True
+         )
+
+     with gr.Row():
+         clear_btn = gr.Button("🗑️ Clear History", variant="secondary")
+         retry_btn = gr.Button("🔄 Retry Last", variant="secondary")
+
+     # Example prompts
+     gr.Examples(
+         examples=[
+             ["Give me a short introduction to large language models."],
+             ["What are the benefits of using transformers in AI?"],
+             ["There are 5 birds on a branch. A hunter shoots one. How many birds are left?"],
+             ["Explain quantum computing step by step."],
+             ["Write a Python function to calculate the factorial of a number."],
+             ["What makes Art-0 different from other AI models?"],
+         ],
+         inputs=user_input,
+         label="💡 Example Prompts"
+     )
+
+     # Advanced settings at the bottom
+     with gr.Accordion("⚙️ Advanced Generation Settings", open=False):
+         with gr.Row():
+             temperature = gr.Slider(
+                 minimum=0.1,
+                 maximum=2.0,
+                 value=0.6,
+                 step=0.1,
+                 label="Temperature",
+                 info="Controls randomness (higher = more creative)"
+             )
+             top_p = gr.Slider(
+                 minimum=0.1,
+                 maximum=1.0,
+                 value=0.95,
+                 step=0.05,
+                 label="Top-p",
+                 info="Nucleus sampling threshold"
+             )
+         with gr.Row():
+             top_k = gr.Slider(
+                 minimum=1,
+                 maximum=100,
+                 value=20,
+                 step=1,
+                 label="Top-k",
+                 info="Number of top tokens to consider"
+             )
+             min_p = gr.Slider(
+                 minimum=0.0,
+                 maximum=1.0,
+                 value=0.0,
+                 step=0.01,
+                 label="Min-p",
+                 info="Minimum probability threshold for token sampling"
+             )
+         with gr.Row():
+             max_new_tokens = gr.Slider(
+                 minimum=128,
+                 maximum=32768,
+                 value=32768,
+                 step=128,
+                 label="Max New Tokens",
+                 info="Maximum response length"
+             )
+
+     def handle_user_message(user_message: str, display_history: list, model_history: list,
+                             system_prompt_text: str, is_generating: bool,
+                             temp: float, top_p_val: float, top_k_val: int,
+                             min_p_val: float, max_tokens: int):
+         """
+         Handles user input, updates histories, and generates the model's response.
+         """
+         # Prevent multiple submissions (this is a generator, so yield the unchanged state)
+         if is_generating or not user_message.strip():
+             yield {
+                 chatbot: display_history,
+                 display_history_state: display_history,
+                 model_history_state: model_history,
+                 is_generating_state: is_generating,
+                 user_input: user_message,
+                 submit_btn: gr.update(interactive=not is_generating)
+             }
+             return
+
+         # Set generating state
+         is_generating = True
+
+         # Update model history (clean format for model)
+         model_history.append({"role": "user", "content": user_message.strip()})
+
+         # Update display history (for Gradio chatbot)
+         display_history.append([user_message.strip(), None])
+
+         # Yield intermediate state to show user message and disable input
+         yield {
+             chatbot: display_history,
+             display_history_state: display_history,
+             model_history_state: model_history,
+             is_generating_state: is_generating,
+             user_input: "",
+             submit_btn: gr.update(interactive=False, value="🔄 Generating...")
+         }
+
+         # Prepare messages for model (include system prompt)
+         messages_for_model = []
+         if system_prompt_text.strip():
+             messages_for_model.append({"role": "system", "content": system_prompt_text.strip()})
+         messages_for_model.extend(model_history)
+
+         try:
+             # Generate response with hyperparameters
+             thinking, answer = generate_and_parse(
+                 messages_for_model,
+                 temperature=temp,
+                 top_p=top_p_val,
+                 top_k=top_k_val,
+                 min_p=min_p_val,
+                 max_new_tokens=max_tokens
+             )
+
+             # Format response for display
+             if thinking and thinking.strip():
+                 formatted_response = f"""<details>
+ <summary><b>🤔 Show Reasoning Process</b></summary>
+
+ ```
+ {thinking}
+ ```
+
+ </details>
+
+ {answer}"""
+             else:
+                 formatted_response = answer
+
+             # Update model history with clean answer (no HTML formatting)
+             model_history.append({"role": "assistant", "content": answer})
+
+             # Update display history with formatted response
+             display_history[-1][1] = formatted_response
+
+         except Exception as e:
+             error_msg = f"❌ Error generating response: {str(e)}"
+             display_history[-1][1] = error_msg
+             # Don't add error to model history to avoid confusing the model
+
+         # Reset generating state
+         is_generating = False
+
+         # Final yield with complete response
+         yield {
+             chatbot: display_history,
+             display_history_state: display_history,
+             model_history_state: model_history,
+             is_generating_state: is_generating,
+             user_input: "",
+             submit_btn: gr.update(interactive=True, value="Send")
+         }
+
+     def clear_history():
+         """Clear both display and model histories"""
+         return {
+             chatbot: [],
+             display_history_state: [],
+             model_history_state: [],
+             is_generating_state: False,
+             user_input: "",
+             submit_btn: gr.update(interactive=True, value="Send")
+         }
+
+     def retry_last(display_history: list, model_history: list, system_prompt_text: str,
+                    temp: float, top_p_val: float, top_k_val: int,
+                    min_p_val: float, max_tokens: int):
+         """Retry the last user message"""
+         if not model_history or len(model_history) < 2:
+             yield {
+                 chatbot: display_history,
+                 display_history_state: display_history,
+                 model_history_state: model_history,
+                 is_generating_state: False
+             }
+             return
+
+         # Remove the last assistant message from the model history, if present
+         if model_history[-1]["role"] == "assistant":
+             model_history = model_history[:-1]
+         # Drop the last display pair (the user turn being retried and its reply)
+         display_history = display_history[:-1]
+
+         # Get last user message
+         last_user_msg = model_history[-1]["content"]
+         model_history = model_history[:-1]
+
+         # Regenerate by delegating to the streaming handler above
+         yield from handle_user_message(
+             last_user_msg, display_history, model_history,
+             system_prompt_text, False, temp, top_p_val, top_k_val, min_p_val, max_tokens
+         )
+
+     def on_input_change(text, is_generating):
+         """Handle input text changes"""
+         return gr.update(interactive=not is_generating and bool(text.strip()))
+
+     # Event listeners
+     submit_event = submit_btn.click(
+         handle_user_message,
+         inputs=[user_input, display_history_state, model_history_state, system_prompt,
+                 is_generating_state, temperature, top_p, top_k, min_p, max_new_tokens],
+         outputs=[chatbot, display_history_state, model_history_state, is_generating_state,
+                  user_input, submit_btn],
+         show_progress=True
+     )
+
+     submit_event_enter = user_input.submit(
+         handle_user_message,
+         inputs=[user_input, display_history_state, model_history_state, system_prompt,
+                 is_generating_state, temperature, top_p, top_k, min_p, max_new_tokens],
+         outputs=[chatbot, display_history_state, model_history_state, is_generating_state,
+                  user_input, submit_btn],
+         show_progress=True
+     )
+
+     # Clear button event
+     clear_btn.click(
+         clear_history,
+         outputs=[chatbot, display_history_state, model_history_state, is_generating_state,
+                  user_input, submit_btn]
+     )
+
+     # Retry button event (also restores the input box and Send button when done)
+     retry_btn.click(
+         retry_last,
+         inputs=[display_history_state, model_history_state, system_prompt,
+                 temperature, top_p, top_k, min_p, max_new_tokens],
+         outputs=[chatbot, display_history_state, model_history_state, is_generating_state,
+                  user_input, submit_btn],
+         show_progress=True
+     )
+
+     # Update submit button based on input and generation state
+     user_input.change(
+         on_input_change,
+         inputs=[user_input, is_generating_state],
+         outputs=[submit_btn]
+     )
+
+     # Footer
+     gr.HTML("""
+     <div style="text-align: center; margin-top: 2rem; padding: 1rem; background: #f8f9fa; border-radius: 10px;">
+         <p style="margin: 0; color: #666;">
+             🚀 Powered by <strong>Zero GPU</strong> on Hugging Face Spaces |
+             Built with ❤️ using Gradio |
+             Model by <a href="https://huggingface.co/gr0010" target="_blank">AGI-0</a>
+         </p>
+     </div>
+     """)
+
+ if __name__ == "__main__":
+     demo.launch(debug=True, share=False)
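
For readers who want to see how the `</think>` split in `generate_and_parse` behaves without downloading the 8B model, here is a minimal standalone sketch of the same technique. The `split_thinking` helper, the fake vocabulary, and its token ids are illustrative assumptions and are not part of this commit; only the `151668` constant comes from app.py.

```python
# Standalone sketch of the </think>-splitting used in generate_and_parse above.
# The decode step is faked so this runs without the model or tokenizer.

END_THINK_TOKEN_ID = 151668  # Qwen3 </think> token id, as hard-coded in app.py


def split_thinking(output_token_ids, decode):
    """Split generated token ids into (thinking, answer) strings, mirroring app.py."""
    if END_THINK_TOKEN_ID in output_token_ids:
        cut = output_token_ids.index(END_THINK_TOKEN_ID) + 1
        thinking = decode(output_token_ids[:cut])
        thinking = thinking.replace("<think>", "").replace("</think>", "").strip()
        answer = decode(output_token_ids[cut:]).strip()
        return thinking, answer
    return "", decode(output_token_ids).strip()


# Toy check with a fake vocabulary (ids chosen arbitrarily for the demo)
fake_vocab = {1: "<think>", 2: "- user asks 2+2", 3: "- answer is 4", 151668: "</think>", 5: "2 + 2 = 4"}
fake_decode = lambda ids: " ".join(fake_vocab[i] for i in ids)

print(split_thinking([1, 2, 3, 151668, 5], fake_decode))
# -> ('- user asks 2+2 - answer is 4', '2 + 2 = 4')
```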