1inkusFace committed
Commit 7a6bf95 · verified · 1 Parent(s): df7b4d1

Update app.py

Files changed (1): app.py +89 -104
app.py CHANGED
@@ -1,108 +1,92 @@
 import spaces # If using Hugging Face Spaces
+
 import os
-# ## GGUF MOD: Unused environment variables for PyTorch have been removed.
-# ## GGUF MOD: ctransformers handles its own memory and GPU management.
+
 os.putenv('PYTORCH_NVML_BASED_CUDA_CHECK','1')
-# os.putenv('TORCH_LINALG_PREFER_CUSOLVER','1')
-# alloc_conf_parts = [
-# 'expandable_segments:True',
-# 'pinned_use_background_threads:True'
-# ]
-# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = ','.join(alloc_conf_parts)
-# os.environ["SAFETENSORS_FAST_GPU"] = "1"
+os.putenv('TORCH_LINALG_PREFER_CUSOLVER','1')
+alloc_conf_parts = [
+    'expandable_segments:True',
+    'pinned_use_background_threads:True' # Specific to pinned memory.
+]
+os.environ['PYTORCH_CUDA_ALLOC_CONF'] = ','.join(alloc_conf_parts)
+os.environ["SAFETENSORS_FAST_GPU"] = "1"
 os.putenv('HF_HUB_ENABLE_HF_TRANSFER','1')
+
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig # Import BitsAndBytesConfig
 import torch
 import gradio as gr
 
-
-
-# ## GGUF MOD: Import AutoModelForCausalLM from ctransformers instead of transformers.
-# ## GGUF MOD: BitsAndBytesConfig is no longer needed.
-from ctransformers import AutoModelForCausalLM
-from transformers import AutoTokenizer
-from image_gen_aux import UpscaleWithModel
-
-# ## GGUF MOD: PyTorch backend settings are not used by ctransformers.
 torch.backends.cuda.matmul.allow_tf32 = True
-# ... (rest of torch settings removed for clarity)
+torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True
+torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
+torch.backends.cudnn.allow_tf32 = True
+torch.backends.cudnn.deterministic = True
+torch.backends.cudnn.benchmark = True
+torch.set_float32_matmul_precision("high")
 
 # --- Model and Tokenizer Configuration ---
-# ## GGUF MOD: The model name now points to the GGUF repository.
-model_repo_id = "Quant-Cartel/MilkDropLM-32b-v0.3-GGUF"
-# ## GGUF MOD: Specify the exact GGUF file to download.
-# It's good practice to pick a specific quantization level.
-# q4_K_M is a good balance of quality and performance.
-model_file = "MilkDropLM-32b-v0.3-Q4_K_M.gguf"
-
-# ## GGUF MOD: The quantization is handled by ctransformers when loading the model.
-# ## The BitsAndBytesConfig is removed.
-print("Loading GGUF model...")
-# Documentation: Loading GGUF Model with ctransformers
-# We use AutoModelForCausalLM from the ctransformers library.
-# - model_repo_id: The Hugging Face repository containing the GGUF files.
-# - model_file: The specific .gguf file to download and load.
-# - model_type: 'llama' is specified as it's a Llama-based model, which helps ctransformers optimize.
-# - gpu_layers: This is the most important parameter for performance. It determines
-#   how many layers of the model are offloaded to the GPU. 50 is a high value
-#   that should fill most of the VRAM on modern GPUs for a 32B model,
-#   leading to much faster inference. Adjust this number based on your VRAM.
-# - hf=True: This tells ctransformers to download from the Hugging Face Hub.
-
-upscaler = UpscaleWithModel.from_pretrained("Kim2091/ClearRealityV1").to(torch.device('cuda'))
-upscaler.to(torch.device('cpu'))
-
-def loadModel():
-    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-    model = AutoModelForCausalLM.from_pretrained(
-        model_repo_id,
-        model_file=model_file,
-        model_type='llama',
-        threads=16,
-        gpu_layers=50, # Offload all possible layers to GPU
-        hf=True
-    )
-    return model
-
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
-model = loadModel()
-print("GGUF Model loaded successfully.")
-
-# The tokenizer can still be loaded from the original repository.
-# GGUF files do not contain tokenizer data.
-tokenizer_repo_id = "InferenceIllusionist/MilkDropLM-32b-v0.3"
-print(f"Loading tokenizer from: {tokenizer_repo_id}")
+model_name = "InferenceIllusionist/MilkDropLM-7b-v0.3"
+
+# --- Quantization Configuration (Example: 4-bit) ---
+# This section is included based on our previous discussion.
+# Remove or comment out if you are not using quantization.
+print("Setting up 4-bit quantization config...")
+quantization_config_4bit = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.float16
+)
+
+print(f"Loading model: {model_name} with quantization")
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    quantization_config=quantization_config_4bit, # Comment out if not using quantization
+    device_map="auto",
+)
+
+print(f"Loading tokenizer: {model_name}")
 tokenizer = AutoTokenizer.from_pretrained(
-    tokenizer_repo_id,
+    model_name,
     use_fast=True
 )
 
-# ** This part remains the same. The chat template is independent of the model format. **
+# ** MODIFICATION: Define and set the Vicuna chat template **
+# ** DOCUMENTATION: Chat Template **
+# Vicuna models expect a specific chat format. If the tokenizer doesn't have one
+# built-in, we need to set it manually.
+# This template handles a system prompt, user messages, and assistant responses.
+# It will also add the "ASSISTANT:" prompt for generation if needed.
 VICUNA_CHAT_TEMPLATE = (
-    "{% if messages[0]['role'] == 'system' %}"
-    "{{ messages[0]['content'] + '\\n\\n' }}"
-    "{% set loop_messages = messages[1:] %}"
+    "{% if messages[0]['role'] == 'system' %}" # Check if the first message is a system prompt
+    "{{ messages[0]['content'] + '\\n\\n' }}" # Add system prompt with two newlines
+    "{% set loop_messages = messages[1:] %}" # Slice to loop over remaining messages
     "{% else %}"
-    "{% set loop_messages = messages %}"
+    "{% set loop_messages = messages %}" # No system prompt, loop over all messages
    "{% endif %}"
-    "{% for message in loop_messages %}"
+    "{% for message in loop_messages %}" # Loop through user and assistant messages
     "{% if message['role'] == 'user' %}"
     "{{ 'USER: ' + message['content'].strip() + '\\n' }}"
     "{% elif message['role'] == 'assistant' %}"
     "{{ 'ASSISTANT: ' + message['content'].strip() + eos_token + '\\n' }}"
     "{% endif %}"
     "{% endfor %}"
-    "{% if add_generation_prompt %}"
-    "{% if messages[-1]['role'] != 'assistant' %}"
-    "{{ 'ASSISTANT:' }}"
+    "{% if add_generation_prompt %}" # If we need to prompt the model for a response
+    "{% if messages[-1]['role'] != 'assistant' %}" # And the last message wasn't from the assistant
+    "{{ 'ASSISTANT:' }}" # Add the assistant prompt
     "{% endif %}"
     "{% endif %}"
 )
 tokenizer.chat_template = VICUNA_CHAT_TEMPLATE
 print("Manually set Vicuna chat template on the tokenizer.")
 
+
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
+    # Also update the model config's pad_token_id if you are setting tokenizer.pad_token
+    # This is crucial if the model's config doesn't get updated automatically.
+    if model.config.pad_token_id is None:
+        model.config.pad_token_id = tokenizer.pad_token_id
     print(f"Tokenizer `pad_token` was None, set to `eos_token`: {tokenizer.eos_token}")
 
 
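The hunk above drops the GGUF/ctransformers loader and goes back to a plain transformers load with a bitsandbytes 4-bit (NF4) config and device_map="auto" (which needs bitsandbytes and accelerate installed). A minimal standalone sketch of that loading pattern; the repo id matches the commit, but the memory-footprint check at the end is an extra illustration and is not part of app.py:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

    repo_id = "InferenceIllusionist/MilkDropLM-7b-v0.3"  # repo the commit switches to

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,                     # store weights in 4 bits
        bnb_4bit_quant_type="nf4",             # NormalFloat4 quantization
        bnb_4bit_use_double_quant=True,        # also quantize the quantization constants
        bnb_4bit_compute_dtype=torch.float16,  # run the matmuls in fp16
    )

    model = AutoModelForCausalLM.from_pretrained(
        repo_id,
        quantization_config=bnb_config,
        device_map="auto",  # let accelerate place layers on GPU/CPU
    )
    tokenizer = AutoTokenizer.from_pretrained(repo_id, use_fast=True)

    # Rough sanity check of how much memory the quantized weights occupy.
    print(f"{model.get_memory_footprint() / 1e9:.2f} GB")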
 
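The Vicuna template in the hunk is easier to follow once rendered. A small sketch (not part of the commit) that feeds the same Jinja template a made-up conversation and prints the prompt it produces; jinja2 is used directly so nothing has to be downloaded:

    from jinja2 import Template

    VICUNA_CHAT_TEMPLATE = (
        "{% if messages[0]['role'] == 'system' %}"
        "{{ messages[0]['content'] + '\\n\\n' }}"
        "{% set loop_messages = messages[1:] %}"
        "{% else %}"
        "{% set loop_messages = messages %}"
        "{% endif %}"
        "{% for message in loop_messages %}"
        "{% if message['role'] == 'user' %}"
        "{{ 'USER: ' + message['content'].strip() + '\\n' }}"
        "{% elif message['role'] == 'assistant' %}"
        "{{ 'ASSISTANT: ' + message['content'].strip() + eos_token + '\\n' }}"
        "{% endif %}"
        "{% endfor %}"
        "{% if add_generation_prompt %}"
        "{% if messages[-1]['role'] != 'assistant' %}"
        "{{ 'ASSISTANT:' }}"
        "{% endif %}"
        "{% endif %}"
    )

    messages = [
        {"role": "system", "content": "You write MilkDrop/HLSL preset code."},
        {"role": "user", "content": "Write a simple warp shader."},
    ]
    print(Template(VICUNA_CHAT_TEMPLATE).render(
        messages=messages, eos_token="</s>", add_generation_prompt=True
    ))
    # You write MilkDrop/HLSL preset code.
    #
    # USER: Write a simple warp shader.
    # ASSISTANT: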
@@ -113,45 +97,46 @@ def generate_code(prompt: str) -> str:
         {"role": "user", "content": prompt}
     ]
     try:
-        # The chat template application is the same.
+        # ** DOCUMENTATION: Applying Chat Template **
+        # Now that tokenizer.chat_template is set, this should work.
         text = tokenizer.apply_chat_template(
             messages,
             tokenize=False,
-            add_generation_prompt=True
+            add_generation_prompt=True # Important to append "ASSISTANT:"
         )
-        print(f"Formatted prompt using chat template:\n{text}")
+        print(f"Formatted prompt using chat template:\n{text}") # For debugging
     except Exception as e:
         print(f"Error applying chat template: {e}")
-        return f"Error: Could not apply chat template. Details: {e}."
-
-    # ## GGUF MOD: The generation call is now simpler.
-    # The `ctransformers` model object takes the prompt text directly.
-    # No need for tokenization or sending tensors to a device manually.
-
-    # Documentation: Generating Text with ctransformers
-    # The model object has a built-in generator that you call like a function.
-    # - prompt (text): The formatted string prompt for the model.
-    # - max_new_tokens, temperature, top_p: These parameters function identically
-    #   to their Hugging Face counterparts.
-    # - stop: We can provide the EOS token to ensure the model stops generating
-    #   cleanly once it thinks it's finished.
-    # The output is a simple string.
-    response = model(
-        text,
-        max_new_tokens=2048,
-        temperature=0.7,
-        top_p=0.9,
-        stop=[tokenizer.eos_token],
-    )
+        # Provide a more informative error or fallback if needed
+        return f"Error: Could not apply chat template. Details: {e}. Ensure the tokenizer has a valid `chat_template` attribute."
+
+    # Determine device for inputs if model is on multiple devices
+    # For device_map="auto", input tensors should go to the device of the first model block.
+    input_device = model.hf_device_map.get("", next(iter(model.hf_device_map.values()))) if hasattr(model, "hf_device_map") else model.device
+
+    model_inputs = tokenizer([text], return_tensors="pt").to(input_device)
+
+    with torch.no_grad():
+        generated_ids = model.generate(
+            **model_inputs, # Pass tokenized inputs
+            max_new_tokens=2048,
+            min_new_tokens=768,
+            do_sample=True,
+            temperature=0.7,
+            top_p=0.9,
+            pad_token_id=tokenizer.eos_token_id # Use EOS token for padding
+        )
 
+    response_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
+    response = tokenizer.decode(response_ids, skip_special_tokens=True)
     return response.strip()
 
-# --- Gradio Interface (No changes needed here) ---
-with gr.Blocks(title="Vicuna 32B Milkdrop GGUF") as demo:
+# --- Gradio Interface ---
+with gr.Blocks(title="Vicuna 32B Milkdrop") as demo:
     with gr.Tab("Code Chat"):
-        gr.Markdown("# Vicuna 32B Milkdrop (GGUF)\nProvide a prompt to generate HLSL.")
+        gr.Markdown("# Vicuna 32B Milkdrop\nProvide a prompt to generate HLSL.")
         with gr.Row():
-            prompt_input = gr.Textbox(
+            prompt_input = gr.Textbox( # Renamed to avoid conflict with 'prompt' variable in function scope
                 label="Prompt",
                 show_label=True,
                 lines=3,
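The generation path in this hunk replaces the single ctransformers call with tokenize → generate → decode. Two details carry the logic: with device_map="auto", accelerate records each layer's placement in model.hf_device_map, which is why the inputs are moved to the first block's device; and generate() returns the prompt tokens plus the continuation, so the code slices the prompt off before decoding. A tiny sketch of that slicing with made-up token ids (no model needed):

    import torch

    input_ids = torch.tensor([[101, 2023, 2003, 1996, 25732]])   # pretend prompt tokens
    new_tokens = torch.tensor([[7592, 2088, 102]])                # pretend continuation
    generated_ids = torch.cat([input_ids, new_tokens], dim=1)     # shape of what generate() hands back

    response_ids = generated_ids[0][len(input_ids[0]):]           # drop the echoed prompt
    print(response_ids.tolist())                                  # [7592, 2088, 102]

Decoding only response_ids with skip_special_tokens=True then keeps the trailing EOS token out of the text that reaches the Gradio output.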
@@ -159,10 +144,10 @@ with gr.Blocks(title="Vicuna 32B Milkdrop GGUF") as demo:
             )
         run_button = gr.Button("Generate Code", variant="primary")
         with gr.Row():
-            result_output = gr.Code(
+            result_output = gr.Code( # Renamed
                 label="Generated Code",
                 show_label=True,
-                language="hlsl", # Changed to hlsl for better syntax highlighting
+                language="python",
                 lines=20,
             )
         gr.on(
 
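The diff stops at the opening of gr.on(, so the event wiring and launch call are not shown in this commit. Purely as an illustration of how such a Blocks app is typically wired up (the names mirror the ones above, but none of this is taken from app.py):

    import gradio as gr

    def generate_code(prompt: str) -> str:   # stand-in for the real function above
        return f"// HLSL for: {prompt}"

    with gr.Blocks(title="Vicuna 32B Milkdrop") as demo:
        prompt_input = gr.Textbox(label="Prompt", lines=3)
        run_button = gr.Button("Generate Code", variant="primary")
        result_output = gr.Code(label="Generated Code", language="python", lines=20)

        gr.on(
            triggers=[run_button.click, prompt_input.submit],  # run on click or Enter
            fn=generate_code,
            inputs=prompt_input,
            outputs=result_output,
        )

    demo.launch()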