Spaces:

kimhyunwoo
/

gemma-3-1b-it-space

Sleeping

App Files Files Community

kimhyunwoo committed on Mar 12

Commit

a41650d

verified ·

1 Parent(s): b5f6ba9

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -53

app.py CHANGED Viewed

@@ -2,26 +2,20 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, logging
 from huggingface_hub import login
 import torch
 import os
-# --- 1. Authentication (Choose ONE method and follow the instructions) ---
-# Method 1: Environment Variable (RECOMMENDED for security and Hugging Face Spaces)
-#   - Set the HUGGING_FACE_HUB_TOKEN environment variable *before* running.
-#   - Linux/macOS:  `export HUGGING_FACE_HUB_TOKEN=your_token` (in terminal)
-#   - Windows (PowerShell):  `$env:HUGGING_FACE_HUB_TOKEN = "your_token"`
-#   - Hugging Face Spaces:  Add `HUGGING_FACE_HUB_TOKEN` as a secret in your Space's settings.
-#   - Then, uncomment the following line:
-login()
-# Method 2: Direct Token (ONLY for local testing, NOT for deployment)
-#   - Replace "YOUR_HUGGING_FACE_TOKEN" with your actual token.
-#   - WARNING:  Do NOT commit your token to a public repository!
-# login(token="YOUR_HUGGING_FACE_TOKEN")
-# Method 3: huggingface-cli (Interactive, one-time setup, good for local development)
-#   - Run `huggingface-cli login` in your terminal.
-#   - Paste your token when prompted.
-#   - No code changes are needed after this; the token is stored.
 # --- 2. Model and Tokenizer Setup (with comprehensive error handling) ---
@@ -46,9 +40,10 @@ def load_model_and_tokenizer(model_name="google/gemma-3-1b-it"):
         print("1. Ensure you have a Hugging Face account and have accepted the model's terms.")
         print("2. Verify your internet connection.")
         print("3. Double-check the model name: 'google/gemma-3-1b-it'")
-        print("4. Ensure you are properly authenticated (see authentication section above).")
         print("5. If using a GPU, ensure your CUDA drivers and PyTorch are correctly installed.")
-        exit(1)  # Exit with an error code
 model, tokenizer = load_model_and_tokenizer()
@@ -56,24 +51,13 @@ model, tokenizer = load_model_and_tokenizer()
 # --- 3. Chat Template Function (CRITICAL for conversational models) ---
 def apply_chat_template(messages, tokenizer):
-    """Applies the appropriate chat template to the message history.
-    Args:
-        messages: A list of dictionaries, where each dictionary has 'role' (user/model)
-            and 'content' keys.
-        tokenizer: The tokenizer object.
-    Returns:
-        A formatted prompt string ready for the model.
-    """
     try:
         if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
-            # Use the tokenizer's built-in chat template if available
             return tokenizer.apply_chat_template(
                 messages, tokenize=False, add_generation_prompt=True
             )
         else:
-            # Fallback to a standard chat template if no specific one is found
             print("WARNING: Tokenizer does not have a defined chat_template. Using a fallback.")
             chat_template = "{% for message in messages %}" \
                             "{{ '<start_of_turn>' + message['role'] + '\n' + message['content'] + '<end_of_turn>\n' }}" \
@@ -83,14 +67,13 @@ def apply_chat_template(messages, tokenizer):
     except Exception as e:
         print(f"ERROR: Failed to apply chat template: {e}")
-        exit(1)
 # --- 4. Text Generation Function ---
 def generate_response(messages, model, tokenizer, max_new_tokens=256, temperature=0.7, top_k=50, top_p=0.95, repetition_penalty=1.2):
-    """Generates a response using the model and tokenizer."""
     prompt = apply_chat_template(messages, tokenizer)
     try:
@@ -98,8 +81,8 @@ def generate_response(messages, model, tokenizer, max_new_tokens=256, temperatur
             "text-generation",
             model=model,
             tokenizer=tokenizer,
-            torch_dtype=torch.bfloat16, # Make sure pipeline also uses correct dtype
-            device_map="auto", # and device mapping
             model_kwargs={"attn_implementation": "flash_attention_2"}
             )
@@ -111,33 +94,43 @@ def generate_response(messages, model, tokenizer, max_new_tokens=256, temperatur
             top_k=top_k,
             top_p=top_p,
             repetition_penalty=repetition_penalty,
-            pad_token_id=tokenizer.eos_token_id,  # Important for proper padding
         )
-        # Extract *only* the generated text (remove the prompt)
         generated_text = outputs[0]["generated_text"][len(prompt):].strip()
         return generated_text
     except Exception as e:
         print(f"ERROR: Failed to generate response: {e}")
-        return "Sorry, I encountered an error while generating a response."
-# --- 5. Main Interaction Loop (for command-line interaction) ---
-def main():
-    """Main function for interactive command-line chat."""
-    messages = []  # Initialize the conversation history
-    while True:
-        user_input = input("You: ")
-        if user_input.lower() in ("exit", "quit", "bye"):
-            break
-        messages.append({"role": "user", "content": user_input})
-        response = generate_response(messages, model, tokenizer)
-        print(f"Model: {response}")
-        messages.append({"role": "model", "content": response})
-if __name__ == "__main__":
-    main()

 from huggingface_hub import login
 import torch
 import os
+import gradio as gr
+# --- 1. Authentication (Using Environment Variable - the ONLY correct way for Spaces) ---
+# Hugging Face Spaces CANNOT use interactive login. You MUST use an environment variable.
+# 1. Go to your Space's settings.
+# 2. Click on "Repository Secrets".
+# 3. Click "New Secret".
+# 4. Name the secret: HF_TOKEN (the older name HUGGING_FACE_HUB_TOKEN is also read below).
+# 5. Paste your Hugging Face API token (with read access) as the value.
+# 6. Save the secret.
+# login() called with no token prompts for one, which cannot work without a terminal,
+# so the token is read from the environment and passed explicitly.
+# If no token is set, login is skipped rather than blocking on a prompt.
+_hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
+if _hf_token:
+    login(token=_hf_token)
 # --- 2. Model and Tokenizer Setup (with comprehensive error handling) ---
         print("1. Ensure you have a Hugging Face account and have accepted the model's terms.")
         print("2. Verify your internet connection.")
         print("3. Double-check the model name: 'google/gemma-3-1b-it'")
+        print("4. Ensure you are properly authenticated using a Repository Secret (see above).")
         print("5. If using a GPU, ensure your CUDA drivers and PyTorch are correctly installed.")
+        # Instead of exiting, re-raise so the original traceback is kept (this runs at import time, before the Gradio UI exists)
+        raise
 model, tokenizer = load_model_and_tokenizer()
 # --- 3. Chat Template Function (CRITICAL for conversational models) ---
 def apply_chat_template(messages, tokenizer):
+    """Applies the appropriate chat template."""
     try:
         if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
             return tokenizer.apply_chat_template(
                 messages, tokenize=False, add_generation_prompt=True
             )
         else:
             print("WARNING: Tokenizer does not have a defined chat_template. Using a fallback.")
             chat_template = "{% for message in messages %}" \
                             "{{ '<start_of_turn>' + message['role'] + '\n' + message['content'] + '<end_of_turn>\n' }}" \
     except Exception as e:
         print(f"ERROR: Failed to apply chat template: {e}")
+        raise # Re-raise to be caught by Gradio
 # --- 4. Text Generation Function ---
 def generate_response(messages, model, tokenizer, max_new_tokens=256, temperature=0.7, top_k=50, top_p=0.95, repetition_penalty=1.2):
+    """Generates a response."""
     prompt = apply_chat_template(messages, tokenizer)
     try:
             "text-generation",
             model=model,
             tokenizer=tokenizer,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
             model_kwargs={"attn_implementation": "flash_attention_2"}
             )
             top_k=top_k,
             top_p=top_p,
             repetition_penalty=repetition_penalty,
+            pad_token_id=tokenizer.eos_token_id,
         )
         generated_text = outputs[0]["generated_text"][len(prompt):].strip()
         return generated_text
     except Exception as e:
         print(f"ERROR: Failed to generate response: {e}")
+        raise # Re-raise the exception
+# --- 5. Gradio Interface ---
+def predict(message, history):
+    """Run one chat turn for the Gradio UI.
+
+    Rebuilds a role/content message list from the (user, bot) pairs in
+    ``history``, appends the new user ``message``, and calls
+    ``generate_response``. Returns ``("", history)`` with the new pair
+    appended on success, or ``(f"Error: {e}", history)`` if anything raises.
+    """
+    history = history or []
+    conversation = []
+    for past_user, past_bot in history:
+        conversation.append({"role": "user", "content": past_user})
+        if not past_bot:
+            # No reply stored for this turn (None or empty): nothing to add.
+            continue
+        conversation.append({"role": "model", "content": past_bot})
+    conversation.append({"role": "user", "content": message})
+    try:
+        reply = generate_response(conversation, model, tokenizer)
+        history.append((message, reply))
+    except Exception as e:
+        # Report the failure through the return value instead of raising.
+        return f"Error: {e}", history
+    return "", history
+# Build the UI: a chat history panel, a single-line text input, and a clear button.
+with gr.Blocks() as demo:
+    chatbot = gr.Chatbot(label="Gemma Chatbot", height=500)
+    msg = gr.Textbox(placeholder="Ask me anything!", container=False, scale=7)
+    # ClearButton is handed both components; presumably it resets them on click.
+    clear = gr.ClearButton([msg, chatbot])
+    # On submit, predict receives (msg, chatbot) and its two return values
+    # are written back to (msg, chatbot) in that order.
+    msg.submit(predict, [msg, chatbot], [msg, chatbot])
+# Runs at import time: there is no __main__ guard around the launch call.
+demo.launch()