Spaces:

daniel-dona
/

gemma-3-270m

Running

File size: 2,473 Bytes

d965a19
73f52cd
2705c5c
73f52cd
fd78eab
73f52cd
fd78eab
bec8f6d
9327797
73f52cd
 
 
 
 
 
 
 
bec8f6d
73f52cd
 
2067c10
 
73f52cd
 
 
 
 
 
 
 
fd78eab
73f52cd
b2d905e
73f52cd
 
b2d905e
 
 
 
73f52cd
fd78eab
 
 
 
 
 
 
 
 
 
 
 
a2806bf
fd78eab
 
 
 
 
 
6e3133c
fd78eab
 
 
 
 
 
 
 
6e3133c
fd78eab
 
 
 
 
 
73f52cd

import spaces
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint identifier handed to `from_pretrained` when the tokenizer and the
# causal language model are loaded.
model_name = "daniel-dona/gemma-3-270m-it"

# Disabled alternative: a transformers text-generation pipeline. Kept for
# reference only; `pipeline` and `model` are not defined in this file.
#pipe = pipeline("text-generation", model=model, device="cuda")

# Per-process cache of loaded (tokenizer, model) pairs, keyed by checkpoint id,
# so the weights are not re-read from disk / re-downloaded on every message.
_LOADED_MODELS = {}


def _load_tokenizer_and_model(checkpoint):
    """Return the ``(tokenizer, model)`` pair for *checkpoint*.

    The pair is loaded on first use and then reused for later calls made in
    the same process.
    """
    if checkpoint not in _LOADED_MODELS:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForCausalLM.from_pretrained(
            checkpoint,
            torch_dtype="auto",
            device_map="auto",
        )
        _LOADED_MODELS[checkpoint] = (tokenizer, model)
    return _LOADED_MODELS[checkpoint]


def _build_messages(message, history, system_message):
    """Build the role/content message list fed to the chat template.

    The list starts with the system message, followed by every non-empty
    user/assistant entry of *history* (pairs indexed as ``[0]`` user and
    ``[1]`` assistant), and ends with the new user *message*.
    """
    messages = [{"role": "system", "content": system_message}]
    for turn in history:
        # Either side of a past turn may be empty/None; skip those halves.
        if turn[0]:
            messages.append({"role": "user", "content": turn[0]})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})
    messages.append({"role": "user", "content": message})
    return messages


@spaces.GPU
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Generate the assistant reply for one chat turn.

    Args:
        message: The new user message.
        history: Previous turns as ``(user, assistant)`` pairs.
        system_message: Text placed in the leading ``system`` message.
        max_tokens: Upper bound on newly generated tokens (coerced to int).
        temperature: Sampling temperature. A value of 0 (or below) selects
            greedy decoding instead of sampling.
        top_p: Nucleus-sampling threshold; only used when sampling.

    Returns:
        The decoded completion with special tokens removed and leading and
        trailing newlines stripped.
    """
    print("Got:", message)

    messages = _build_messages(message, history, system_message)

    # Loaded inside the GPU-decorated call so `device_map="auto"` is resolved
    # here, exactly as before; the cache avoids repeating the load.
    tokenizer, model = _load_tokenizer_and_model(model_name)

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    generation_kwargs = {"max_new_tokens": int(max_tokens)}
    if temperature > 0:
        generation_kwargs.update(
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
        )
    else:
        # Greedy decoding: sampling-only knobs are deliberately not passed,
        # since they have no effect (and trigger warnings) without sampling.
        generation_kwargs["do_sample"] = False

    generated_ids = model.generate(**model_inputs, **generation_kwargs)

    # `generate` returns prompt + completion; keep only the completion tokens.
    prompt_length = len(model_inputs.input_ids[0])
    output_ids = generated_ids[0][prompt_length:].tolist()

    return tokenizer.decode(output_ids, skip_special_tokens=True).strip("\n")


"""
For information on how to customize the ChatInterface, see the Gradio docs: https://www.gradio.app/docs/chatinterface
"""
def _build_demo():
    """Create the chat UI wired to `respond` with its four extra controls.

    The controls are listed in the same order as the trailing parameters of
    `respond`: system message, max new tokens, temperature, top-p.
    """
    system_box = gr.Textbox(
        value="You are a friendly Chatbot.",
        label="System message",
    )
    max_tokens_slider = gr.Slider(
        minimum=1,
        maximum=2048,
        value=512,
        step=1,
        label="Max new tokens",
    )
    temperature_slider = gr.Slider(
        minimum=0.1,
        maximum=4.0,
        value=0.7,
        step=0.1,
        label="Temperature",
    )
    top_p_slider = gr.Slider(
        minimum=0.1,
        maximum=1.0,
        value=0.95,
        step=0.05,
        label="Top-p (nucleus sampling)",
    )
    return gr.ChatInterface(
        respond,
        additional_inputs=[
            system_box,
            max_tokens_slider,
            temperature_slider,
            top_p_slider,
        ],
    )


demo = _build_demo()


# Start the Gradio app only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    demo.launch()