qwen2.5-32b-milkdrop

Running on Zero

App Files Files Community

1inkusFace committed 5 days ago

Commit

64ca47b

verified ·

1 Parent(s): c9a47f1

Update app.py

Browse files

Files changed (1) hide show

app.py +89 -88

app.py CHANGED Viewed

@@ -1,92 +1,94 @@
 import spaces # If using Hugging Face Spaces
 import os
-os.putenv('PYTORCH_NVML_BASED_CUDA_CHECK','1')
-os.putenv('TORCH_LINALG_PREFER_CUSOLVER','1')
-alloc_conf_parts = [
-    'expandable_segments:True',
-    'pinned_use_background_threads:True'  # Specific to pinned memory.
-]
-os.environ['PYTORCH_CUDA_ALLOC_CONF'] = ','.join(alloc_conf_parts)
-os.environ["SAFETENSORS_FAST_GPU"] = "1"
-os.putenv('HF_HUB_ENABLE_HF_TRANSFER','1')
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig # Import BitsAndBytesConfig
 import torch
 import gradio as gr
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True
-torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
-torch.backends.cudnn.allow_tf32 = True
-torch.backends.cudnn.deterministic = True
-torch.backends.cudnn.benchmark = True
-torch.set_float32_matmul_precision("high")
-# --- Model and Tokenizer Configuration ---
-model_name = "InferenceIllusionist/MilkDropLM-32b-v0.3"
-# --- Quantization Configuration (Example: 4-bit) ---
-# This section is included based on our previous discussion.
-# Remove or comment out if you are not using quantization.
-print("Setting up 4-bit quantization config...")
-quantization_config_4bit = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.float16
-)
-print(f"Loading model: {model_name} with quantization")
 model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    quantization_config=quantization_config_4bit, # Comment out if not using quantization
-    device_map="auto",
 )
-print(f"Loading tokenizer: {model_name}")
 tokenizer = AutoTokenizer.from_pretrained(
-    model_name,
     use_fast=True
 )
-# ** MODIFICATION: Define and set the Vicuna chat template **
-# ** DOCUMENTATION: Chat Template **
-# Vicuna models expect a specific chat format. If the tokenizer doesn't have one
-# built-in, we need to set it manually.
-# This template handles a system prompt, user messages, and assistant responses.
-# It will also add the "ASSISTANT:" prompt for generation if needed.
 VICUNA_CHAT_TEMPLATE = (
-    "{% if messages[0]['role'] == 'system' %}"  # Check if the first message is a system prompt
-        "{{ messages[0]['content'] + '\\n\\n' }}"  # Add system prompt with two newlines
-        "{% set loop_messages = messages[1:] %}"  # Slice to loop over remaining messages
     "{% else %}"
-        "{% set loop_messages = messages %}"  # No system prompt, loop over all messages
     "{% endif %}"
-    "{% for message in loop_messages %}"  # Loop through user and assistant messages
         "{% if message['role'] == 'user' %}"
             "{{ 'USER: ' + message['content'].strip() + '\\n' }}"
         "{% elif message['role'] == 'assistant' %}"
             "{{ 'ASSISTANT: ' + message['content'].strip() + eos_token + '\\n' }}"
         "{% endif %}"
     "{% endfor %}"
-    "{% if add_generation_prompt %}"  # If we need to prompt the model for a response
-        "{% if messages[-1]['role'] != 'assistant' %}" # And the last message wasn't from the assistant
-            "{{ 'ASSISTANT:' }}"  # Add the assistant prompt
         "{% endif %}"
     "{% endif %}"
 )
 tokenizer.chat_template = VICUNA_CHAT_TEMPLATE
 print("Manually set Vicuna chat template on the tokenizer.")
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
-    # Also update the model config's pad_token_id if you are setting tokenizer.pad_token
-    # This is crucial if the model's config doesn't get updated automatically.
-    if model.config.pad_token_id is None:
-         model.config.pad_token_id = tokenizer.pad_token_id
     print(f"Tokenizer `pad_token` was None, set to `eos_token`: {tokenizer.eos_token}")
@@ -97,46 +99,45 @@ def generate_code(prompt: str) -> str:
         {"role": "user", "content": prompt}
     ]
     try:
-        # ** DOCUMENTATION: Applying Chat Template **
-        # Now that tokenizer.chat_template is set, this should work.
         text = tokenizer.apply_chat_template(
             messages,
             tokenize=False,
-            add_generation_prompt=True # Important to append "ASSISTANT:"
         )
-        print(f"Formatted prompt using chat template:\n{text}") # For debugging
     except Exception as e:
         print(f"Error applying chat template: {e}")
-        # Provide a more informative error or fallback if needed
-        return f"Error: Could not apply chat template. Details: {e}. Ensure the tokenizer has a valid `chat_template` attribute."
-    # Determine device for inputs if model is on multiple devices
-    # For device_map="auto", input tensors should go to the device of the first model block.
-    input_device = model.hf_device_map.get("", next(iter(model.hf_device_map.values()))) if hasattr(model, "hf_device_map") else model.device
-    model_inputs = tokenizer([text], return_tensors="pt").to(input_device)
-    with torch.no_grad():
-        generated_ids = model.generate(
-            **model_inputs, # Pass tokenized inputs
-            max_new_tokens=2048,
-            min_new_tokens=768,
-            do_sample=True,
-            temperature=0.7,
-            top_p=0.9,
-            pad_token_id=tokenizer.eos_token_id # Use EOS token for padding
-        )
-    response_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
-    response = tokenizer.decode(response_ids, skip_special_tokens=True)
     return response.strip()
-# --- Gradio Interface ---
-with gr.Blocks(title="Vicuna 32B Milkdrop") as demo:
     with gr.Tab("Code Chat"):
-        gr.Markdown("# Vicuna 32B Milkdrop\nProvide a prompt to generate HLSL.")
         with gr.Row():
-            prompt_input = gr.Textbox( # Renamed to avoid conflict with 'prompt' variable in function scope
                 label="Prompt",
                 show_label=True,
                 lines=3,
@@ -144,10 +145,10 @@ with gr.Blocks(title="Vicuna 32B Milkdrop") as demo:
             )
         run_button = gr.Button("Generate Code", variant="primary")
         with gr.Row():
-            result_output = gr.Code( # Renamed
                 label="Generated Code",
                 show_label=True,
-                language="python",
                 lines=20,
             )
         gr.on(

 import spaces # If using Hugging Face Spaces
 import os
 import torch
 import gradio as gr
+# ## GGUF MOD: Unused environment variables for PyTorch have been removed.
+# ## GGUF MOD: ctransformers handles its own memory and GPU management.
+# os.putenv('PYTORCH_NVML_BASED_CUDA_CHECK','1')
+# os.putenv('TORCH_LINALG_PREFER_CUSOLVER','1')
+# alloc_conf_parts = [
+#     'expandable_segments:True',
+#     'pinned_use_background_threads:True'
+# ]
+# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = ','.join(alloc_conf_parts)
+# os.environ["SAFETENSORS_FAST_GPU"] = "1"
+os.putenv('HF_HUB_ENABLE_HF_TRANSFER','1')
+# ## GGUF MOD: Import AutoModelForCausalLM from ctransformers instead of transformers.
+# ## GGUF MOD: BitsAndBytesConfig is no longer needed.
+from ctransformers import AutoModelForCausalLM
+from transformers import AutoTokenizer
+# ## GGUF MOD: PyTorch backend settings are not used by ctransformers.
+# torch.backends.cuda.matmul.allow_tf32 = True
+# ... (rest of torch settings removed for clarity)
+# --- Model and Tokenizer Configuration ---
+# ## GGUF MOD: The model name now points to the GGUF repository.
+model_repo_id = "Quant-Cartel/MilkDropLM-32b-v0.3-GGUF"
+# ## GGUF MOD: Specify the exact GGUF file to download.
+# It's good practice to pick a specific quantization level.
+# q4_K_M is a good balance of quality and performance.
+model_file = "milkdroplm-32b-v0.3.q4_K_M.gguf"
+# ## GGUF MOD: The GGUF file selected above is already quantized (q4_K_M), so no
+# ## quantization config is passed at load time. The BitsAndBytesConfig is removed.
+print("Loading GGUF model...")
+# Documentation: Loading GGUF Model with ctransformers
+# We use AutoModelForCausalLM from the ctransformers library.
+# - model_repo_id: The Hugging Face repository containing the GGUF files.
+# - model_file: The specific .gguf file to download and load.
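+# - model_type: Selects which architecture implementation ctransformers uses to load the file.
+#   NOTE(review): this Space is titled qwen2.5-32b, so confirm that 'llama' is the correct type for this GGUF.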
+# - gpu_layers: The number of model layers offloaded to the GPU. This is the most
+#   important parameter for performance. 50 is a fixed count, not "all layers";
+#   whether it fits depends on the available VRAM, so adjust it for your GPU.
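+# - hf=True: Makes ctransformers return a Hugging Face transformers-compatible model wrapper.
+#   It does not control downloading; a repo id is fetched from the Hub either way.
+#   NOTE(review): confirm this wrapper can be called with a raw prompt string, as generate_code does.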
 model = AutoModelForCausalLM.from_pretrained(
+    model_repo_id,
+    model_file=model_file,
+    model_type='llama',
+    gpu_layers=50, # Offload all possible layers to GPU
+    hf=True
 )
+print("GGUF Model loaded successfully.")
+# The tokenizer can still be loaded from the original repository.
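+# (GGUF files embed their own tokenizer metadata, but a transformers tokenizer is
+# needed here to apply the chat template and to supply the EOS stop string.)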
+tokenizer_repo_id = "InferenceIllusionist/MilkDropLM-32b-v0.3"
+print(f"Loading tokenizer from: {tokenizer_repo_id}")
 tokenizer = AutoTokenizer.from_pretrained(
+    tokenizer_repo_id,
     use_fast=True
 )
+# ** This part remains the same. The chat template is independent of the model format. **
 VICUNA_CHAT_TEMPLATE = (
+    "{% if messages[0]['role'] == 'system' %}"
+        "{{ messages[0]['content'] + '\\n\\n' }}"
+        "{% set loop_messages = messages[1:] %}"
     "{% else %}"
+        "{% set loop_messages = messages %}"
     "{% endif %}"
+    "{% for message in loop_messages %}"
         "{% if message['role'] == 'user' %}"
             "{{ 'USER: ' + message['content'].strip() + '\\n' }}"
         "{% elif message['role'] == 'assistant' %}"
             "{{ 'ASSISTANT: ' + message['content'].strip() + eos_token + '\\n' }}"
         "{% endif %}"
     "{% endfor %}"
+    "{% if add_generation_prompt %}"
+        "{% if messages[-1]['role'] != 'assistant' %}"
+            "{{ 'ASSISTANT:' }}"
         "{% endif %}"
     "{% endif %}"
 )
 tokenizer.chat_template = VICUNA_CHAT_TEMPLATE
 print("Manually set Vicuna chat template on the tokenizer.")
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
     print(f"Tokenizer `pad_token` was None, set to `eos_token`: {tokenizer.eos_token}")
         {"role": "user", "content": prompt}
     ]
     try:
+        # The chat template application is the same.
         text = tokenizer.apply_chat_template(
             messages,
             tokenize=False,
+            add_generation_prompt=True
         )
+        print(f"Formatted prompt using chat template:\n{text}")
     except Exception as e:
         print(f"Error applying chat template: {e}")
+        return f"Error: Could not apply chat template. Details: {e}."
+    # ## GGUF MOD: The generation call is now simpler.
+    # The `ctransformers` model object takes the prompt text directly.
+    # No need for tokenization or sending tensors to a device manually.
+    # Documentation: Generating Text with ctransformers
+    # The model object has a built-in generator that you call like a function.
+    # - prompt (text): The formatted string prompt for the model.
+    # - max_new_tokens, temperature, top_p: These parameters function identically
+    #   to their Hugging Face counterparts.
+    # - stop: We can provide the EOS token to ensure the model stops generating
+    #   cleanly once it thinks it's finished.
+    # The output is a simple string.
+    response = model(
+        text,
+        max_new_tokens=2048,
+        temperature=0.7,
+        top_p=0.9,
+        stop=[tokenizer.eos_token],
+    )
     return response.strip()
+# --- Gradio Interface (only the titles and the code-highlighting language changed) ---
+with gr.Blocks(title="Vicuna 32B Milkdrop GGUF") as demo:
     with gr.Tab("Code Chat"):
+        gr.Markdown("# Vicuna 32B Milkdrop (GGUF)\nProvide a prompt to generate HLSL.")
         with gr.Row():
+            prompt_input = gr.Textbox(
                 label="Prompt",
                 show_label=True,
                 lines=3,
             )
         run_button = gr.Button("Generate Code", variant="primary")
         with gr.Row():
+            result_output = gr.Code(
                 label="Generated Code",
                 show_label=True,
+                language="hlsl", # Changed to hlsl for better syntax highlighting
                 lines=20,
             )
         gr.on(