Daemontatox committed · verified
Commit afd3591 · 1 Parent(s): ddf8845

Update app.py

Files changed (1):
  1. app.py (+34 -6)
app.py CHANGED
@@ -1,6 +1,6 @@
 import gradio as gr
 import spaces
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
 import torch
 from threading import Thread
 import re
@@ -9,8 +9,28 @@ phi4_model_path = "Daemontatox/Manticore-32B"
 
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
-phi4_model = AutoModelForCausalLM.from_pretrained(phi4_model_path, device_map="auto", torch_dtype="auto")
-phi4_tokenizer = AutoTokenizer.from_pretrained(phi4_model_path)
+# Configure 4-bit quantization
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4"
+)
+
+# Load model with 4-bit quantization
+phi4_model = AutoModelForCausalLM.from_pretrained(
+    phi4_model_path,
+    quantization_config=quantization_config,
+    device_map="auto",
+    torch_dtype=torch.float16,
+    trust_remote_code=True
+)
+
+phi4_tokenizer = AutoTokenizer.from_pretrained(phi4_model_path, trust_remote_code=True)
+
+# Ensure pad token is set
+if phi4_tokenizer.pad_token is None:
+    phi4_tokenizer.pad_token = phi4_tokenizer.eos_token
 
 @spaces.GPU(duration=120)
 def generate_response(user_message, max_tokens, temperature, top_k, top_p, repetition_penalty, history_state):
@@ -98,9 +118,13 @@ Ensure the final answer is in LATEX format.
         prompt += f"{start_tag}assistant{sep_tag}{message['content']}{end_tag}"
     prompt += f"{start_tag}user{sep_tag}{user_message}{end_tag}{start_tag}assistant{sep_tag}"
 
-    inputs = tokenizer(prompt, return_tensors="pt").to(device)
+    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=4096)
+
+    # Move inputs to the same device as the model
+    if torch.cuda.is_available():
+        inputs = {k: v.to(device) for k, v in inputs.items()}
 
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
     # sampling techniques
     generation_kwargs = {
@@ -113,6 +137,8 @@ Ensure the final answer is in LATEX format.
         "top_p": float(top_p),
         "repetition_penalty": float(repetition_penalty),
         "streamer": streamer,
+        "pad_token_id": tokenizer.eos_token_id,
+        "eos_token_id": tokenizer.eos_token_id,
     }
 
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
@@ -131,6 +157,7 @@ Ensure the final answer is in LATEX format.
         new_history[-1]["content"] = assistant_response.strip()
         yield new_history, new_history
 
+    thread.join()  # Ensure thread completion
     yield new_history, new_history
 
 # Add an example that explicitly shows LaTeX formatting
@@ -156,8 +183,9 @@ css = """
 with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
     gr.Markdown(
         """
-        # Problem Solving with LaTeX Math Support
+        # Problem Solving with LaTeX Math Support (4-bit Quantized)
         This application uses advanced reasoning to solve complex problems with LaTeX formatting for mathematical expressions.
+        The model is loaded with 4-bit quantization to reduce memory usage.
         """
     )
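
For reference, the pattern this commit converges on (a 4-bit NF4 load through BitsAndBytesConfig, plus model.generate running on a worker thread that feeds a TextIteratorStreamer) looks roughly like the sketch below when pulled out of the Gradio app. This is a minimal sketch, not the Space's actual app.py: the prompt string is an arbitrary example, and a 32B-parameter checkpoint still needs on the order of 16-20 GB of VRAM even in 4-bit (32B parameters at about half a byte each, plus quantization overhead and activations).

    import torch
    from threading import Thread
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        TextIteratorStreamer,
        BitsAndBytesConfig,
    )

    model_id = "Daemontatox/Manticore-32B"

    # 4-bit NF4 quantization, mirroring the config this commit introduces
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # fall back to EOS as pad

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )

    inputs = tokenizer("Prove that sqrt(2) is irrational.", return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # skip_prompt drops the echoed prompt; skip_special_tokens strips EOS etc.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # generate() blocks until done, so it runs on a worker thread while the
    # main thread drains the streamer chunk by chunk.
    thread = Thread(target=model.generate, kwargs={
        **inputs,
        "max_new_tokens": 256,
        "streamer": streamer,
        "pad_token_id": tokenizer.eos_token_id,
    })
    thread.start()

    for piece in streamer:
        print(piece, end="", flush=True)

    thread.join()  # same join the commit adds: generation is fully finished here

Joining the worker thread after the read loop, as the commit adds in generate_response, guarantees generate() has finished before the final history is yielded.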