Update app.py
app.py CHANGED

@@ -1,6 +1,6 @@
 import gradio as gr
 import spaces
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
 import torch
 from threading import Thread
 import re
@@ -9,8 +9,28 @@ phi4_model_path = "Daemontatox/Manticore-32B"
 
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
-[removed line not shown in the extracted diff]
-[removed line not shown in the extracted diff]
+# Configure 4-bit quantization
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4"
+)
+
+# Load model with 4-bit quantization
+phi4_model = AutoModelForCausalLM.from_pretrained(
+    phi4_model_path,
+    quantization_config=quantization_config,
+    device_map="auto",
+    torch_dtype=torch.float16,
+    trust_remote_code=True
+)
+
+phi4_tokenizer = AutoTokenizer.from_pretrained(phi4_model_path, trust_remote_code=True)
+
+# Ensure pad token is set
+if phi4_tokenizer.pad_token is None:
+    phi4_tokenizer.pad_token = phi4_tokenizer.eos_token
 
 @spaces.GPU(duration=120)
 def generate_response(user_message, max_tokens, temperature, top_k, top_p, repetition_penalty, history_state):
@@ -98,9 +118,13 @@ Ensure the final answer is in LATEX format.
         prompt += f"{start_tag}assistant{sep_tag}{message['content']}{end_tag}"
     prompt += f"{start_tag}user{sep_tag}{user_message}{end_tag}{start_tag}assistant{sep_tag}"
 
-    inputs = tokenizer(prompt, return_tensors="pt")
+    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=4096)
+
+    # Move inputs to the same device as the model
+    if torch.cuda.is_available():
+        inputs = {k: v.to(device) for k, v in inputs.items()}
 
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
     # sampling techniques
     generation_kwargs = {
@@ -113,6 +137,8 @@ Ensure the final answer is in LATEX format.
         "top_p": float(top_p),
         "repetition_penalty": float(repetition_penalty),
         "streamer": streamer,
+        "pad_token_id": tokenizer.eos_token_id,
+        "eos_token_id": tokenizer.eos_token_id,
     }
 
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
@@ -131,6 +157,7 @@ Ensure the final answer is in LATEX format.
         new_history[-1]["content"] = assistant_response.strip()
         yield new_history, new_history
 
+    thread.join()  # Ensure thread completion
    yield new_history, new_history
 
 # Add an example that explicitly shows LaTeX formatting
@@ -156,8 +183,9 @@ css = """
 with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
     gr.Markdown(
         """
-        # Problem Solving with LaTeX Math Support
+        # Problem Solving with LaTeX Math Support (4-bit Quantized)
         This application uses advanced reasoning to solve complex problems with LaTeX formatting for mathematical expressions.
+        The model is loaded with 4-bit quantization to reduce memory usage.
         """
     )
 
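For reference, the pattern this commit moves to (4-bit NF4 loading through BitsAndBytesConfig, plus streaming generation on a background thread via TextIteratorStreamer) looks roughly like the standalone sketch below. The model id, prompt, and sampling values are placeholders, not this Space's actual settings.

    from threading import Thread
    import torch
    from transformers import (AutoModelForCausalLM, AutoTokenizer,
                              BitsAndBytesConfig, TextIteratorStreamer)

    model_name = "some-org/some-causal-lm"  # hypothetical model id

    # NF4 weights with double quantization; matmuls run in fp16
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # reuse EOS when no pad token exists

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
    )

    prompt = "Explain the quadratic formula."
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # The streamer yields decoded text pieces as generate() produces tokens
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        streamer=streamer,
        pad_token_id=tokenizer.eos_token_id,
    )

    # generate() blocks, so it runs on a worker thread while this thread consumes the stream
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    for piece in streamer:
        print(piece, end="", flush=True)
    thread.join()  # make sure generation has finished before moving on

Joining the worker thread before the final yield, as the commit does, keeps the handler from returning while generate() is still writing to the streamer.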