kimhyunwoo committed (verified)
Commit da1470a · 1 Parent(s): 9ec0a3a

Update app.py

Files changed (1)
  1. app.py +120 -98
app.py CHANGED
@@ -1,121 +1,143 @@
- import gradio as gr
  import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
-
- # Model and tokenizer loading (with error handling)
- try:
-     model_name = "google/gemma-3-1b-it"  # Correct model name
-     tokenizer = AutoTokenizer.from_pretrained(model_name)
-     model = AutoModelForCausalLM.from_pretrained(
-         model_name,
-         torch_dtype=torch.bfloat16,  # Use bfloat16 for efficiency, if supported
-         device_map="auto",  # Automatically use GPU if available, otherwise CPU
-     )
-     # Create the pipeline
-     pipe = pipeline(
-         "text-generation",
-         model=model,
-         tokenizer=tokenizer,
-         torch_dtype=torch.bfloat16,  # Make sure pipeline also uses correct dtype
-         device_map="auto",  # and device mapping
-         model_kwargs={"attn_implementation": "flash_attention_2"}  # Enable Flash Attention 2 if supported by your hardware and transformers version
-     )
-
- except Exception as e:
-     error_message = f"Error loading model or tokenizer: {e}"
-     print(error_message)  # Log the error to the console
-     # Provide a fallback, even if it's just displaying the error.
-     def error_response(message, history):
-         return f"Model loading failed. Error: {error_message}"
-
-     # Minimal Gradio interface to show the error
-     with gr.Blocks() as demo:
-         gr.ChatInterface(error_response)
-     demo.launch()
-     exit()  # Important: exit to prevent running the rest of the (broken) code
-
-
- # Chat template handling (important for correct prompting)
- def apply_chat_template(messages, chat_template=None):
-     """Applies the chat template to the message history.

      Args:
-         messages: A list of dictionaries, where each dictionary has a "role"
-             ("user" or "assistant") and "content" key.
-         chat_template: The chat template string (optional). If None,
-             try to get from tokenizer.

      Returns:
-         A single string representing the formatted conversation.
      """
-     if chat_template is None:
          if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
-             chat_template = tokenizer.chat_template
          else:
-             # Fallback to a simple template if no chat template is found. This is
-             # *critical* to prevent the model from generating nonsensical output.
              chat_template = "{% for message in messages %}" \
                  "{{ '<start_of_turn>' + message['role'] + '\n' + message['content'] + '<end_of_turn>\n' }}" \
                  "{% endfor %}" \
                  "{% if add_generation_prompt %}{{ '<start_of_turn>model\n' }}{% endif %}"

-     return tokenizer.apply_chat_template(
-         messages, tokenize=False, add_generation_prompt=True, chat_template=chat_template
-     )

- # Prediction function (modified for chat)
- def predict(message, history):
-     """Generates a response to the user's message.

-     Args:
-         message: The user's input message (string).
-         history: A list of (user_message, bot_response) tuples representing
-             the conversation history.

-     Returns:
-         The generated bot response (string).
-     """
-     # Build the conversation history in the required format.
-     messages = []
-     for user_msg, bot_response in history:
-         messages.append({"role": "user", "content": user_msg})
-         messages.append({"role": "model", "content": bot_response})
-     messages.append({"role": "user", "content": message})

-     # Apply the chat template.
-     prompt = apply_chat_template(messages)

-     # Generate the response using the pipeline (much cleaner).
      try:
-         sequences = pipe(
              prompt,
-             max_new_tokens=512,  # Limit response length
-             do_sample=True,  # Use sampling for more diverse responses
-             temperature=0.7,  # Control randomness (higher = more random)
-             top_k=50,  # Top-k sampling
-             top_p=0.95,  # Nucleus sampling
-             repetition_penalty=1.2,  # Reduce repetition
-             pad_token_id=tokenizer.eos_token_id,  # Ensure padding is correct.
-
          )
-         response = sequences[0]['generated_text'][len(prompt):].strip()  # Extract *only* generated text
-         return response

      except Exception as e:
-         return f"An error occurred during generation: {e}"
-
-
- # Gradio interface (using gr.ChatInterface for a chatbot UI)
- with gr.Blocks() as demo:
-     gr.ChatInterface(
-         predict,
-         chatbot=gr.Chatbot(height=500),  # Set a reasonable height
-         textbox=gr.Textbox(placeholder="Ask me anything!", container=False, scale=7),
-         title="Gemma-3-1b-it Chatbot",
-         description="Chat with the Gemma-3-1b-it model.",
-         retry_btn=None,  # Remove redundant buttons
-         undo_btn=None,
-         clear_btn=None,
-     )
-
- demo.launch(share=False)  # Set share=True to create a publicly shareable link
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, logging
+ from huggingface_hub import login
  import torch
+ import os
+
+ # --- 1. Authentication (Choose ONE method and follow the instructions) ---
+
+ # Method 1: Environment Variable (RECOMMENDED for security and Hugging Face Spaces)
+ # - Set the HUGGING_FACE_HUB_TOKEN environment variable *before* running.
+ # - Linux/macOS: `export HUGGING_FACE_HUB_TOKEN=your_token` (in terminal)
+ # - Windows (PowerShell): `$env:HUGGING_FACE_HUB_TOKEN = "your_token"`
+ # - Hugging Face Spaces: Add `HUGGING_FACE_HUB_TOKEN` as a secret in your Space's settings.
+ # - Then, uncomment the following line:
+ # login()
+
+ # Method 2: Direct Token (ONLY for local testing, NOT for deployment)
+ # - Replace "YOUR_HUGGING_FACE_TOKEN" with your actual token.
+ # - WARNING: Do NOT commit your token to a public repository!
+ # login(token="YOUR_HUGGING_FACE_TOKEN")
+
+ # Method 3: huggingface-cli (Interactive, one-time setup, good for local development)
+ # - Run `huggingface-cli login` in your terminal.
+ # - Paste your token when prompted.
+ # - No code changes are needed after this; the token is stored.
+
+ # --- 2. Model and Tokenizer Setup (with comprehensive error handling) ---
+
+ def load_model_and_tokenizer(model_name="google/gemma-3-1b-it"):
+     """Loads the model and tokenizer, handling potential errors."""
+     try:
+         # Suppress unnecessary warning messages from transformers
+         logging.set_verbosity_error()
+
+         tokenizer = AutoTokenizer.from_pretrained(model_name)
+         model = AutoModelForCausalLM.from_pretrained(
+             model_name,
+             device_map="auto",  # Automatically use GPU if available, else CPU
+             torch_dtype=torch.bfloat16,  # Use bfloat16 for speed/memory if supported
+             attn_implementation="flash_attention_2"  # Use Flash Attention 2 if supported
+         )
+         return model, tokenizer
+
+     except Exception as e:
+         print(f"ERROR: Failed to load model or tokenizer: {e}")
+         print("\nTroubleshooting Steps:")
+         print("1. Ensure you have a Hugging Face account and have accepted the model's terms.")
+         print("2. Verify your internet connection.")
+         print("3. Double-check the model name: 'google/gemma-3-1b-it'")
+         print("4. Ensure you are properly authenticated (see authentication section above).")
+         print("5. If using a GPU, ensure your CUDA drivers and PyTorch are correctly installed.")
+         exit(1)  # Exit with an error code
+
+ model, tokenizer = load_model_and_tokenizer()
+
+
+ # --- 3. Chat Template Function (CRITICAL for conversational models) ---
+
+ def apply_chat_template(messages, tokenizer):
+     """Applies the appropriate chat template to the message history.

      Args:
+         messages: A list of dictionaries, where each dictionary has 'role' (user/model)
+             and 'content' keys.
+         tokenizer: The tokenizer object.

      Returns:
+         A formatted prompt string ready for the model.
      """
+     try:
          if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
+             # Use the tokenizer's built-in chat template if available
+             return tokenizer.apply_chat_template(
+                 messages, tokenize=False, add_generation_prompt=True
+             )
          else:
+             # Fallback to a standard chat template if no specific one is found
+             print("WARNING: Tokenizer does not have a defined chat_template. Using a fallback.")
              chat_template = "{% for message in messages %}" \
                  "{{ '<start_of_turn>' + message['role'] + '\n' + message['content'] + '<end_of_turn>\n' }}" \
                  "{% endfor %}" \
                  "{% if add_generation_prompt %}{{ '<start_of_turn>model\n' }}{% endif %}"
+             return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, chat_template=chat_template)

+     except Exception as e:
+         print(f"ERROR: Failed to apply chat template: {e}")
+         exit(1)


+ # --- 4. Text Generation Function ---

+ def generate_response(messages, model, tokenizer, max_new_tokens=256, temperature=0.7, top_k=50, top_p=0.95, repetition_penalty=1.2):
+     """Generates a response using the model and tokenizer."""

+     prompt = apply_chat_template(messages, tokenizer)

      try:
+         pipeline_instance = pipeline(
+             "text-generation",
+             model=model,
+             tokenizer=tokenizer,
+             torch_dtype=torch.bfloat16,  # Make sure pipeline also uses correct dtype
+             device_map="auto",  # and device mapping
+             model_kwargs={"attn_implementation": "flash_attention_2"}
+         )
+
+         outputs = pipeline_instance(
              prompt,
+             max_new_tokens=max_new_tokens,
+             do_sample=True,
+             temperature=temperature,
+             top_k=top_k,
+             top_p=top_p,
+             repetition_penalty=repetition_penalty,
+             pad_token_id=tokenizer.eos_token_id,  # Important for proper padding
          )
+
+         # Extract *only* the generated text (remove the prompt)
+         generated_text = outputs[0]["generated_text"][len(prompt):].strip()
+         return generated_text

      except Exception as e:
+         print(f"ERROR: Failed to generate response: {e}")
+         return "Sorry, I encountered an error while generating a response."
+
+
+ # --- 5. Main Interaction Loop (for command-line interaction) ---
+ def main():
+     """Main function for interactive command-line chat."""
+
+     messages = []  # Initialize the conversation history
+
+     while True:
+         user_input = input("You: ")
+         if user_input.lower() in ("exit", "quit", "bye"):
+             break
+
+         messages.append({"role": "user", "content": user_input})
+         response = generate_response(messages, model, tokenizer)
+         print(f"Model: {response}")
+         messages.append({"role": "model", "content": response})
+
+ if __name__ == "__main__":
+     main()
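
For reference, a minimal sketch of the prompt string that apply_chat_template() builds with the fallback template above, using an illustrative two-turn history (Gemma's built-in chat template may format turns slightly differently, e.g. by adding a leading <bos> token):

# Hypothetical history, shaped like the one main() accumulates:
messages = [
    {"role": "user", "content": "Hello"},
    {"role": "model", "content": "Hi there!"},
    {"role": "user", "content": "What is Gemma?"},
]

# With the fallback template, apply_chat_template(messages, tokenizer) returns roughly:
#
#   <start_of_turn>user
#   Hello<end_of_turn>
#   <start_of_turn>model
#   Hi there!<end_of_turn>
#   <start_of_turn>user
#   What is Gemma?<end_of_turn>
#   <start_of_turn>model
#
# Because the prompt is a plain string, generate_response() can slice
# outputs[0]["generated_text"][len(prompt):] to keep only the model's new turn.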