Daemontatox committed · verified
Commit afd3591 · 1 Parent(s): ddf8845

Update app.py

Files changed (1):
  1. app.py (+34 -6)
app.py CHANGED
@@ -1,6 +1,6 @@
 import gradio as gr
 import spaces
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
 import torch
 from threading import Thread
 import re
@@ -9,8 +9,28 @@ phi4_model_path = "Daemontatox/Manticore-32B"
 
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
-phi4_model = AutoModelForCausalLM.from_pretrained(phi4_model_path, device_map="auto", torch_dtype="auto")
-phi4_tokenizer = AutoTokenizer.from_pretrained(phi4_model_path)
+# Configure 4-bit quantization
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4"
+)
+
+# Load model with 4-bit quantization
+phi4_model = AutoModelForCausalLM.from_pretrained(
+    phi4_model_path,
+    quantization_config=quantization_config,
+    device_map="auto",
+    torch_dtype=torch.float16,
+    trust_remote_code=True
+)
+
+phi4_tokenizer = AutoTokenizer.from_pretrained(phi4_model_path, trust_remote_code=True)
+
+# Ensure pad token is set
+if phi4_tokenizer.pad_token is None:
+    phi4_tokenizer.pad_token = phi4_tokenizer.eos_token
 
 @spaces.GPU(duration=120)
 def generate_response(user_message, max_tokens, temperature, top_k, top_p, repetition_penalty, history_state):
@@ -98,9 +118,13 @@ Ensure the final answer is in LATEX format.
         prompt += f"{start_tag}assistant{sep_tag}{message['content']}{end_tag}"
     prompt += f"{start_tag}user{sep_tag}{user_message}{end_tag}{start_tag}assistant{sep_tag}"
 
-    inputs = tokenizer(prompt, return_tensors="pt").to(device)
+    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=4096)
+
+    # Move inputs to the same device as the model
+    if torch.cuda.is_available():
+        inputs = {k: v.to(device) for k, v in inputs.items()}
 
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
     # sampling techniques
     generation_kwargs = {
@@ -113,6 +137,8 @@ Ensure the final answer is in LATEX format.
         "top_p": float(top_p),
         "repetition_penalty": float(repetition_penalty),
         "streamer": streamer,
+        "pad_token_id": tokenizer.eos_token_id,
+        "eos_token_id": tokenizer.eos_token_id,
     }
 
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
@@ -131,6 +157,7 @@ Ensure the final answer is in LATEX format.
         new_history[-1]["content"] = assistant_response.strip()
         yield new_history, new_history
 
+    thread.join()  # Ensure thread completion
     yield new_history, new_history
 
 # Add an example that explicitly shows LaTeX formatting
@@ -156,8 +183,9 @@ css = """
 with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
     gr.Markdown(
         """
-        # Problem Solving with LaTeX Math Support
+        # Problem Solving with LaTeX Math Support (4-bit Quantized)
         This application uses advanced reasoning to solve complex problems with LaTeX formatting for mathematical expressions.
+        The model is loaded with 4-bit quantization to reduce memory usage.
         """
     )
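
For reference, the pattern this commit converges on (a 4-bit NF4 load through BitsAndBytesConfig, plus model.generate running on a worker thread that feeds a TextIteratorStreamer) looks roughly like the sketch below when pulled out of the Gradio app. This is a minimal sketch, not the Space's actual app.py: the prompt string is an arbitrary example, and a 32B-parameter checkpoint still needs on the order of 16-20 GB of VRAM even in 4-bit (32B parameters at about half a byte each, plus quantization overhead and activations).

    import torch
    from threading import Thread
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        TextIteratorStreamer,
        BitsAndBytesConfig,
    )

    model_id = "Daemontatox/Manticore-32B"

    # 4-bit NF4 quantization, mirroring the config this commit introduces
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # fall back to EOS as pad

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )

    inputs = tokenizer("Prove that sqrt(2) is irrational.", return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # skip_prompt drops the echoed prompt; skip_special_tokens strips EOS etc.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # generate() blocks until done, so it runs on a worker thread while the
    # main thread drains the streamer chunk by chunk.
    thread = Thread(target=model.generate, kwargs={
        **inputs,
        "max_new_tokens": 256,
        "streamer": streamer,
        "pad_token_id": tokenizer.eos_token_id,
    })
    thread.start()

    for piece in streamer:
        print(piece, end="", flush=True)

    thread.join()  # same join the commit adds: generation is fully finished here

Joining the worker thread after the read loop, as the commit adds in generate_response, guarantees generate() has finished before the final history is yielded.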