Spaces:

bobotheparrot
/

Bobo_1st_space

Running on Zero

App Files Community

bobotheparrot committed on May 13

Commit

5c79723

verified ·

1 Parent(s): d56100b

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -34

app.py CHANGED Viewed

@@ -1,24 +1,20 @@
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
-import os # For Hugging Face token
-# Import spaces for ZeroGPU if you need to decorate specific functions
-# For models loaded via transformers and run on a device managed by ZeroGPU,
-# explicit @spaces.GPU might not always be needed directly on the inference function
-# if the entire space is on ZeroGPU hardware. However, for clarity or complex setups:
-# import spaces # Uncomment if using @spaces.GPU decorator
 # --- Configuration ---
-HF_TOKEN = os.getenv("HF_TOKEN") # Recommended to store your Hugging Face token as a Space secret
 MODEL_OPTIONS = {
     "Qwen1.5-1.8B-Chat": "Qwen/Qwen1.5-1.8B-Chat",
-    "Qwen2.5-Coder-3B": "Qwen/Qwen2.5-Coder-3B", # Example for a Qwen code model around 3B params
 }
 # --- Model Loading Cache ---
-# This dictionary will cache loaded models and tokenizers to avoid reloading on every call
 loaded_models = {}
 def get_model_and_tokenizer(model_name_key):
@@ -26,27 +22,24 @@ def get_model_and_tokenizer(model_name_key):
         model_id = MODEL_OPTIONS[model_name_key]
         print(f"Loading model: {model_id}...")
         try:
-            # Ensure you have accepted the terms of use for these models on Hugging Face Hub
             model = AutoModelForCausalLM.from_pretrained(
                 model_id,
-                torch_dtype="auto", # Let transformers decide the best dtype
-                device_map="auto",  # Automatically maps model to available device (GPU on ZeroGPU)
-                token=HF_TOKEN # Use token if model is private or requires it
             )
             tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
             loaded_models[model_name_key] = (model, tokenizer)
             print(f"Model {model_id} loaded successfully.")
         except Exception as e:
             print(f"Error loading model {model_id}: {e}")
-            # Fallback or error handling
-            if model_name_key in loaded_models: # Remove if partially loaded
                 del loaded_models[model_name_key]
             raise gr.Error(f"Failed to load model {model_name_key}. Please check the model ID and your Hugging Face token permissions. Error: {e}")
     return loaded_models[model_name_key]
 # --- Inference Function ---
-# If you need finer-grained control over GPU allocation for specific parts:
-# @spaces.GPU(duration=120) # Example: Request GPU for 120 seconds for this function
 def generate_response(prompt_text, model_choice, max_new_tokens=512, temperature=0.7, top_p=0.9):
     if not prompt_text:
         return "Please enter a prompt."
@@ -56,11 +49,11 @@ def generate_response(prompt_text, model_choice, max_new_tokens=512, temperature
     try:
         model, tokenizer = get_model_and_tokenizer(model_choice)
     except Exception as e:
-        return str(e) # Display loading error to user
-    device = model.device # Get the device the model is on
-    if "Chat" in model_choice: # Apply chat template for chat models
         messages = [
             {"role": "system", "content": "You are a helpful assistant."},
             {"role": "user", "content": prompt_text}
@@ -71,11 +64,10 @@ def generate_response(prompt_text, model_choice, max_new_tokens=512, temperature
                 tokenize=False,
                 add_generation_prompt=True
             )
-        except Exception as e: # Fallback if apply_chat_template has issues or is not applicable
             print(f"Warning: Could not apply chat template for {model_choice}: {e}. Using prompt as is.")
             input_text = prompt_text
-    else: # For code or non-chat models, use the prompt directly or adjust as needed
         input_text = prompt_text
     model_inputs = tokenizer([input_text], return_tensors="pt").to(device)
@@ -86,15 +78,10 @@ def generate_response(prompt_text, model_choice, max_new_tokens=512, temperature
             max_new_tokens=max_new_tokens,
             temperature=temperature,
             top_p=top_p,
-            do_sample=True # Necessary for temperature and top_p to have an effect
         )
-        # For some models, the input prompt is included in the generated_ids.
-        # We need to decode only the newly generated tokens.
-        # This slicing can vary based on the model and tokenizer.
-        # A common approach is to slice based on the input_ids length:
         response_ids = generated_ids[0][model_inputs.input_ids.shape[-1]:]
         response_text = tokenizer.decode(response_ids, skip_special_tokens=True)
     except Exception as e:
         print(f"Error during generation with {model_choice}: {e}")
         return f"Error generating response: {e}"
@@ -102,6 +89,7 @@ def generate_response(prompt_text, model_choice, max_new_tokens=512, temperature
     return response_text
 # --- Gradio Interface ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# LLM Coding & Math Experiment")
     gr.Markdown("Query Qwen1.5-1.8B-Chat or Qwen Code models using ZeroGPU.")
@@ -110,7 +98,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         model_dropdown = gr.Dropdown(
             label="Select Model",
             choices=list(MODEL_OPTIONS.keys()),
-            value=list(MODEL_OPTIONS.keys())[0] # Default to the first model
         )
     with gr.Row():
         prompt_input = gr.Textbox(label="Enter your prompt:", lines=4, placeholder="e.g., Write a Python function to calculate factorial, or What is the capital of France?")
@@ -125,13 +113,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         temperature_slider = gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.05, label="Temperature")
         top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-P")
-    # Event listener for the button
     submit_button.click(
         fn=generate_response,
         inputs=[prompt_input, model_dropdown, max_new_tokens_slider, temperature_slider, top_p_slider],
         outputs=output_text,
-        api_name="generate" # Exposes an API endpoint
     )
     gr.Markdown("## Notes:")
@@ -142,4 +128,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     )
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
+import os
+# Make sure to import the 'spaces' library
+import spaces # <--- ADD THIS OR ENSURE IT'S UNCOMMENTED
 # --- Configuration ---
+HF_TOKEN = os.getenv("HF_TOKEN")
 MODEL_OPTIONS = {
     "Qwen1.5-1.8B-Chat": "Qwen/Qwen1.5-1.8B-Chat",
+    "Qwen2.5-Coder-3B": "Qwen/Qwen2.5-Coder-3B",
 }
 # --- Model Loading Cache ---
 loaded_models = {}
 def get_model_and_tokenizer(model_name_key):
         model_id = MODEL_OPTIONS[model_name_key]
         print(f"Loading model: {model_id}...")
         try:
             model = AutoModelForCausalLM.from_pretrained(
                 model_id,
+                torch_dtype="auto",
+                device_map="auto",
+                token=HF_TOKEN
             )
             tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
             loaded_models[model_name_key] = (model, tokenizer)
             print(f"Model {model_id} loaded successfully.")
         except Exception as e:
             print(f"Error loading model {model_id}: {e}")
+            if model_name_key in loaded_models:
                 del loaded_models[model_name_key]
             raise gr.Error(f"Failed to load model {model_name_key}. Please check the model ID and your Hugging Face token permissions. Error: {e}")
     return loaded_models[model_name_key]
 # --- Inference Function ---
+@spaces.GPU(duration=120) # <--- ADD THIS DECORATOR (adjust duration if needed)
 def generate_response(prompt_text, model_choice, max_new_tokens=512, temperature=0.7, top_p=0.9):
     if not prompt_text:
         return "Please enter a prompt."
     try:
         model, tokenizer = get_model_and_tokenizer(model_choice)
     except Exception as e:
+        return str(e)
+    device = model.device
+    if "Chat" in model_choice:
         messages = [
             {"role": "system", "content": "You are a helpful assistant."},
             {"role": "user", "content": prompt_text}
                 tokenize=False,
                 add_generation_prompt=True
             )
+        except Exception as e:
             print(f"Warning: Could not apply chat template for {model_choice}: {e}. Using prompt as is.")
             input_text = prompt_text
+    else:
         input_text = prompt_text
     model_inputs = tokenizer([input_text], return_tensors="pt").to(device)
             max_new_tokens=max_new_tokens,
             temperature=temperature,
             top_p=top_p,
+            do_sample=True
         )
         response_ids = generated_ids[0][model_inputs.input_ids.shape[-1]:]
         response_text = tokenizer.decode(response_ids, skip_special_tokens=True)
     except Exception as e:
         print(f"Error during generation with {model_choice}: {e}")
         return f"Error generating response: {e}"
     return response_text
 # --- Gradio Interface ---
+# (Rest of your Gradio code remains the same)
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# LLM Coding & Math Experiment")
     gr.Markdown("Query Qwen1.5-1.8B-Chat or Qwen Code models using ZeroGPU.")
         model_dropdown = gr.Dropdown(
             label="Select Model",
             choices=list(MODEL_OPTIONS.keys()),
+            value=list(MODEL_OPTIONS.keys())[0]
         )
     with gr.Row():
         prompt_input = gr.Textbox(label="Enter your prompt:", lines=4, placeholder="e.g., Write a Python function to calculate factorial, or What is the capital of France?")
         temperature_slider = gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.05, label="Temperature")
         top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-P")
     submit_button.click(
         fn=generate_response,
         inputs=[prompt_input, model_dropdown, max_new_tokens_slider, temperature_slider, top_p_slider],
         outputs=output_text,
+        api_name="generate"
     )
     gr.Markdown("## Notes:")
     )
 if __name__ == "__main__":
+    # The Space logs show "Running on local URL: http://0.0.0.0:7860", which suggests the default Gradio launch settings are in use.
+    # No change is needed here; share=True is only useful for getting a public link when testing locally, not when deploying on Spaces.
+    demo.launch()