bobotheparrot committed
Commit 5c79723 · verified · 1 parent: d56100b

Update app.py

Files changed (1)
app.py  +22 -34
app.py CHANGED
@@ -1,24 +1,20 @@
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
-import os # For Hugging Face token
+import os
 
-# Import spaces for ZeroGPU if you need to decorate specific functions
-# For models loaded via transformers and run on a device managed by ZeroGPU,
-# explicit @spaces.GPU might not always be needed directly on the inference function
-# if the entire space is on ZeroGPU hardware. However, for clarity or complex setups:
-# import spaces # Uncomment if using @spaces.GPU decorator
+# Make sure to import the 'spaces' library
+import spaces  # <--- ADD THIS OR ENSURE IT'S UNCOMMENTED
 
 # --- Configuration ---
-HF_TOKEN = os.getenv("HF_TOKEN") # Recommended to store your Hugging Face token as a Space secret
+HF_TOKEN = os.getenv("HF_TOKEN")
 
 MODEL_OPTIONS = {
     "Qwen1.5-1.8B-Chat": "Qwen/Qwen1.5-1.8B-Chat",
-    "Qwen2.5-Coder-3B": "Qwen/Qwen2.5-Coder-3B", # Example for a Qwen code model around 3B params
+    "Qwen2.5-Coder-3B": "Qwen/Qwen2.5-Coder-3B",
 }
 
 # --- Model Loading Cache ---
-# This dictionary will cache loaded models and tokenizers to avoid reloading on every call
 loaded_models = {}
 
 def get_model_and_tokenizer(model_name_key):
@@ -26,27 +22,24 @@ def get_model_and_tokenizer(model_name_key):
         model_id = MODEL_OPTIONS[model_name_key]
         print(f"Loading model: {model_id}...")
         try:
-            # Ensure you have accepted the terms of use for these models on Hugging Face Hub
             model = AutoModelForCausalLM.from_pretrained(
                 model_id,
-                torch_dtype="auto", # Let transformers decide the best dtype
-                device_map="auto", # Automatically maps model to available device (GPU on ZeroGPU)
-                token=HF_TOKEN # Use token if model is private or requires it
+                torch_dtype="auto",
+                device_map="auto",
+                token=HF_TOKEN
             )
             tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
             loaded_models[model_name_key] = (model, tokenizer)
             print(f"Model {model_id} loaded successfully.")
         except Exception as e:
            print(f"Error loading model {model_id}: {e}")
-            # Fallback or error handling
-            if model_name_key in loaded_models: # Remove if partially loaded
+            if model_name_key in loaded_models:
                del loaded_models[model_name_key]
            raise gr.Error(f"Failed to load model {model_name_key}. Please check the model ID and your Hugging Face token permissions. Error: {e}")
     return loaded_models[model_name_key]
 
 # --- Inference Function ---
-# If you need finer-grained control over GPU allocation for specific parts:
-# @spaces.GPU(duration=120) # Example: Request GPU for 120 seconds for this function
+@spaces.GPU(duration=120)  # <--- ADD THIS DECORATOR (adjust duration if needed)
 def generate_response(prompt_text, model_choice, max_new_tokens=512, temperature=0.7, top_p=0.9):
     if not prompt_text:
         return "Please enter a prompt."
@@ -56,11 +49,11 @@ def generate_response(prompt_text, model_choice, max_new_tokens=512, temperature=0.7, top_p=0.9):
     try:
         model, tokenizer = get_model_and_tokenizer(model_choice)
     except Exception as e:
-        return str(e) # Display loading error to user
+        return str(e)
 
-    device = model.device # Get the device the model is on
+    device = model.device
 
-    if "Chat" in model_choice: # Apply chat template for chat models
+    if "Chat" in model_choice:
         messages = [
             {"role": "system", "content": "You are a helpful assistant."},
             {"role": "user", "content": prompt_text}
@@ -71,11 +64,10 @@ def generate_response(prompt_text, model_choice, max_new_tokens=512, temperature=0.7, top_p=0.9):
                 tokenize=False,
                 add_generation_prompt=True
             )
-        except Exception as e: # Fallback if apply_chat_template has issues or is not applicable
+        except Exception as e:
             print(f"Warning: Could not apply chat template for {model_choice}: {e}. Using prompt as is.")
             input_text = prompt_text
-
-    else: # For code or non-chat models, use the prompt directly or adjust as needed
+    else:
         input_text = prompt_text
 
     model_inputs = tokenizer([input_text], return_tensors="pt").to(device)
@@ -86,15 +78,10 @@ def generate_response(prompt_text, model_choice, max_new_tokens=512, temperature=0.7, top_p=0.9):
             max_new_tokens=max_new_tokens,
             temperature=temperature,
             top_p=top_p,
-            do_sample=True # Necessary for temperature and top_p to have an effect
+            do_sample=True
         )
-        # For some models, the input prompt is included in the generated_ids.
-        # We need to decode only the newly generated tokens.
-        # This slicing can vary based on the model and tokenizer.
-        # A common approach is to slice based on the input_ids length:
         response_ids = generated_ids[0][model_inputs.input_ids.shape[-1]:]
         response_text = tokenizer.decode(response_ids, skip_special_tokens=True)
-
     except Exception as e:
         print(f"Error during generation with {model_choice}: {e}")
         return f"Error generating response: {e}"
@@ -102,6 +89,7 @@ def generate_response(prompt_text, model_choice, max_new_tokens=512, temperature=0.7, top_p=0.9):
     return response_text
 
 # --- Gradio Interface ---
+# (Rest of your Gradio code remains the same)
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# LLM Coding & Math Experiment")
     gr.Markdown("Query Qwen1.5-1.8B-Chat or Qwen Code models using ZeroGPU.")
@@ -110,7 +98,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         model_dropdown = gr.Dropdown(
             label="Select Model",
             choices=list(MODEL_OPTIONS.keys()),
-            value=list(MODEL_OPTIONS.keys())[0] # Default to the first model
+            value=list(MODEL_OPTIONS.keys())[0]
         )
     with gr.Row():
         prompt_input = gr.Textbox(label="Enter your prompt:", lines=4, placeholder="e.g., Write a Python function to calculate factorial, or What is the capital of France?")
@@ -125,13 +113,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         temperature_slider = gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.05, label="Temperature")
         top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-P")
 
-
-    # Event listener for the button
     submit_button.click(
         fn=generate_response,
         inputs=[prompt_input, model_dropdown, max_new_tokens_slider, temperature_slider, top_p_slider],
        outputs=output_text,
-        api_name="generate" # Exposes an API endpoint
+        api_name="generate"
     )
 
     gr.Markdown("## Notes:")
@@ -142,4 +128,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     )
 
 if __name__ == "__main__":
-    demo.launch()
+    # The logs show "Running on local URL: http://0.0.0.0:7860" which implies it's likely using the default Gradio launch.
+    # No changes needed here unless you want to explicitly set share=True for a public link when testing locally (not for Spaces deployment itself).
+    demo.launch()
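
A note on the main change above: on ZeroGPU hardware, importing the spaces package and decorating the GPU-bound entry point with @spaces.GPU is what attaches a GPU for the duration of each call. A minimal standalone sketch of the decorator pattern, separate from this Space's code (the function name and tensor sizes are illustrative only):

import spaces
import torch

@spaces.GPU(duration=60)  # request a GPU for up to roughly 60 seconds per call
def matmul_on_gpu() -> float:
    # On ZeroGPU, CUDA work should happen inside the decorated function
    a = torch.randn(1024, 1024, device="cuda")
    b = torch.randn(1024, 1024, device="cuda")
    return (a @ b).sum().item()

In app.py the decorator wraps generate_response, so inference (and, on the first call for each model, the from_pretrained load) runs while the GPU is attached.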
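
The chat branch of generate_response relies on tokenizer.apply_chat_template to wrap the system and user messages in the model's chat format before tokenization. A small self-contained sketch of that call; the printed markup is roughly ChatML, which the Qwen1.5 chat models use, and the exact tokens come from the model's bundled chat template:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-1.8B-Chat")
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a Python function to calculate factorial."},
]
# add_generation_prompt=True appends the assistant header so generation starts with the reply
text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text)  # e.g. <|im_start|>system ... <|im_end|> <|im_start|>user ... <|im_end|> <|im_start|>assistant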
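
Because the click handler keeps api_name="generate", the Space also exposes that handler as a named API endpoint. A hedged sketch of calling it with gradio_client; the Space id below is a placeholder, and the positional arguments follow the handler's inputs list (prompt, model choice, max new tokens, temperature, top-p):

from gradio_client import Client

client = Client("your-username/your-space-name")  # placeholder Space id, replace with the real one
result = client.predict(
    "Write a Python function to calculate factorial.",  # prompt_input
    "Qwen1.5-1.8B-Chat",                                 # model_dropdown
    512,                                                 # max_new_tokens_slider
    0.7,                                                 # temperature_slider
    0.9,                                                 # top_p_slider
    api_name="/generate",
)
print(result)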