from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, logging
from huggingface_hub import login
import torch
import os
import gradio as gr
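# Dependency note: these imports assume the Space installs packages from a
# requirements.txt. That file is not shown here, but a minimal sketch (inferred
# from the imports and from device_map="auto" below, which needs accelerate)
# would be:
#
#     transformers
#     torch
#     accelerate
#     gradio
#     huggingface_hub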
# --- 1. Authentication (Using Environment Variable - the ONLY correct way for Spaces) ---
# Hugging Face Spaces CANNOT use interactive login. You MUST use an environment variable.
# 1. Go to your Space's settings.
# 2. Click on "Repository Secrets".
# 3. Click "New Secret".
# 4. Name the secret: HUGGING_FACE_HUB_TOKEN
# 5. Paste your Hugging Face API token (with read access) as the value.
# 6. Save the secret.
# The login() call below reads the token from that environment variable.
login(token=os.environ.get("HUGGING_FACE_HUB_TOKEN"))
# --- 2. Model and Tokenizer Setup (with comprehensive error handling) ---
def load_model_and_tokenizer(model_name="google/gemma-3-1b-it"):
    """Loads the model and tokenizer, handling potential errors."""
    try:
        # Suppress unnecessary warning messages from transformers
        logging.set_verbosity_error()
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",  # Automatically use GPU if available, else CPU
            torch_dtype=torch.bfloat16,  # Use bfloat16 for speed/memory if supported
            attn_implementation="flash_attention_2"  # Use Flash Attention 2 if supported
        )
        return model, tokenizer
    except Exception as e:
        print(f"ERROR: Failed to load model or tokenizer: {e}")
        print("\nTroubleshooting Steps:")
        print("1. Ensure you have a Hugging Face account and have accepted the model's terms.")
        print("2. Verify your internet connection.")
        print("3. Double-check the model name: 'google/gemma-3-1b-it'")
        print("4. Ensure you are properly authenticated using a Repository Secret (see above).")
        print("5. If using a GPU, ensure your CUDA drivers and PyTorch are correctly installed.")
        # Instead of exiting, raise the exception to be caught by Gradio
        raise
model, tokenizer = load_model_and_tokenizer()
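# Hardware note: "flash_attention_2" above requires the flash-attn package and a
# compatible CUDA GPU; without them, from_pretrained raises an error. A hedged
# fallback sketch (illustrative, not part of the original code):
#
#     try:
#         model = AutoModelForCausalLM.from_pretrained(
#             model_name, device_map="auto", torch_dtype=torch.bfloat16,
#             attn_implementation="flash_attention_2",
#         )
#     except (ImportError, ValueError) as err:
#         print(f"Flash Attention 2 unavailable ({err}); using the default attention.")
#         model = AutoModelForCausalLM.from_pretrained(
#             model_name, device_map="auto", torch_dtype=torch.bfloat16,
#         )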
# --- 3. Chat Template Function (CRITICAL for conversational models) ---
def apply_chat_template(messages, tokenizer):
    """Applies the appropriate chat template."""
    try:
        if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
            return tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
        else:
            print("WARNING: Tokenizer does not have a defined chat_template. Using a fallback.")
            chat_template = (
                "{% for message in messages %}"
                "{{ '<start_of_turn>' + message['role'] + '\n' + message['content'] + '<end_of_turn>\n' }}"
                "{% endfor %}"
                "{% if add_generation_prompt %}{{ '<start_of_turn>model\n' }}{% endif %}"
            )
            return tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True, chat_template=chat_template
            )
    except Exception as e:
        print(f"ERROR: Failed to apply chat template: {e}")
        raise  # Re-raise to be caught by Gradio
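# For illustration: with messages = [{"role": "user", "content": "Hello"}], the
# fallback template above renders a prompt of the form
#
#     <start_of_turn>user
#     Hello<end_of_turn>
#     <start_of_turn>model
#
# i.e. the turn markers Gemma-style chat models expect, with the final "model"
# turn left open for the reply (because add_generation_prompt=True).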
# --- 4. Text Generation Function ---
def generate_response(messages, model, tokenizer, max_new_tokens=256, temperature=0.7,
                      top_k=50, top_p=0.95, repetition_penalty=1.2):
    """Generates a response."""
    prompt = apply_chat_template(messages, tokenizer)
    try:
        # The model was already loaded with its device map, dtype, and attention
        # settings above, so only the model and tokenizer are passed here.
        pipeline_instance = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
        )
        outputs = pipeline_instance(
            prompt,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            pad_token_id=tokenizer.eos_token_id,
        )
        # The pipeline returns the prompt plus the completion; strip the prompt off.
        generated_text = outputs[0]["generated_text"][len(prompt):].strip()
        return generated_text
    except Exception as e:
        print(f"ERROR: Failed to generate response: {e}")
        raise  # Re-raise the exception
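# Example call (illustrative only), using the function defined above:
#
#     reply = generate_response(
#         [{"role": "user", "content": "Give me one fun fact about space."}],
#         model, tokenizer, max_new_tokens=64,
#     )
#
# Design note: the pipeline is rebuilt on every call; building it once at module
# level (right after the model loads) would avoid that per-request overhead.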
# --- 5. Gradio Interface ---
def predict(message, history):
    if not history:
        history = []
    messages = []
    for user_msg, bot_response in history:
        messages.append({"role": "user", "content": user_msg})
        if bot_response:  # Skip empty/None bot turns
            messages.append({"role": "model", "content": bot_response})
    messages.append({"role": "user", "content": message})
    try:
        response = generate_response(messages, model, tokenizer)
        history.append((message, response))
        return "", history
    except Exception as e:
        # Surface generation errors in the chat window rather than the input box
        history.append((message, f"Error: {e}"))
        return "", history
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(label="Gemma Chatbot", height=500)
    msg = gr.Textbox(placeholder="Ask me anything!", container=False, scale=7)
    clear = gr.ClearButton([msg, chatbot])
    msg.submit(predict, [msg, chatbot], [msg, chatbot])

demo.launch()