# app.py - Corrected for Hugging Face ZeroGPU Spaces
# ---------------------------------------------------------------
# This version is adapted for the ZeroGPU environment by using
# the @spaces.GPU decorator.
# ---------------------------------------------------------------
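# Note: the `spaces` package ships preinstalled in the Spaces runtime; for
# local runs it can be installed with `pip install spaces`, and its GPU
# decorator is effectively a pass-through outside of Spaces.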
import os
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import spaces # 1. Import the spaces library

IS_CUDA = torch.cuda.is_available()
IS_ZEROGPU = bool(os.getenv("SPACES_ZERO_GPU"))
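# ZeroGPU hands out short-lived GPU slices, so torch.compile warm-up rarely
# pays off: the "force_eager" stance makes any compiled code run eagerly.
# The matmul settings enable TF32, trading a little float32 precision for speed.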
if IS_ZEROGPU:
    torch.compiler.set_stance("force_eager")
    torch.set_float32_matmul_precision("high")
    torch.backends.cuda.matmul.allow_tf32 = True

# ── Configuration ────────────────────────────────────────────────────────────
MODEL_ID = "Reubencf/gemma3-konkani"
HF_TOKEN = os.getenv("HF_TOKEN", None)
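# HF_TOKEN is only required if the model repository is gated or private;
# public checkpoints load without it.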

TITLE = "Konkani LLM Fine-Tuned on Gemma 3"
DESCRIPTION = (
    "Version 1 of the Konkani LLM.\n"
    "This release may contain inconsistencies, but improvements will follow in future updates."
)

# ── Loading ──────────────────────────────────────────────────────────────────
print(f"[Init] Loading model pipeline for the first time: {MODEL_ID}...")
def load_model():
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
        kwargs = {"attn_implementation": "sdpa"} if IS_CUDA else {}
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID, token=HF_TOKEN, device_map="auto",
            torch_dtype=torch.bfloat16 if IS_CUDA else torch.float32, **kwargs,
        )
        print("[Init] Model loaded successfully.")
        return model, tokenizer
    except Exception as e:
        # If model loading fails, we can't proceed.
        print(f"[Fatal] Could not load model: {e}")
        raise Exception(f"❌ Model failed to load: {e}")

model, tokenizer = load_model()

DEF_TOKENS = 256
DEF_TEMPERATURE = 0.7
DEF_TOPK = 50
DEF_TOPP = 0.95
DEF_DURATION = 10

def get_duration(message, history=None, system_message="", max_tokens=DEF_TOKENS, temperature=DEF_TEMPERATURE, top_k=DEF_TOPK, top_p=DEF_TOPP, duration=DEF_DURATION):
    return int(duration if duration is not None else DEF_DURATION)

# ── Generation Function ──────────────────────────────────────────────────────
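# `spaces.GPU` accepts either a fixed number of seconds or a callable for
# `duration`; a callable is invoked with the same arguments as the decorated
# function, so the requested GPU time can follow the "Duration" slider below.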
@spaces.GPU(duration=get_duration) # 2. Decorate the function that needs the GPU
@torch.inference_mode()
def generate_response(message, history=None, system_message="", max_tokens=DEF_TOKENS, temperature=DEF_TEMPERATURE, top_k=DEF_TOPK, top_p=DEF_TOPP, duration=DEF_DURATION):
    """
    This function is called for each user message.
    The @spaces.GPU decorator ensures a GPU is allocated when this runs.
    """
    try:
        # Format the conversation history
        conversation = []
        if system_message: conversation.append({"role": "system", "content": system_message})
        for msg in history: # https://www.gradio.app/docs/gradio/chatbot
            if not isinstance(msg, dict) or not {"role", "content"}.issubset(msg.keys()): continue
            conversation.append({"role": msg["role"], "content": msg["content"]})

        # Add the current user's message
        conversation.append({"role": "user", "content": message})

        # Apply the chat template
        inputs = tokenizer.apply_chat_template(
            conversation,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to(model.device)
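        # With return_dict=True the tokenizer returns both input_ids and
        # attention_mask, which generate() consumes below.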

        # Generate the response
        gen_kwargs = dict(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            #eos_token_id=tokenizer.eos_token_id,
            #num_beams=1,
            output_scores=False,
            cache_implementation="static", # https://github.com/huggingface/transformers/issues/38501
        )
        outputs = model.generate(**gen_kwargs)

        # Extract only the newly generated text
        gen_ids = outputs[0][inputs["input_ids"].shape[-1]:]
        new_response = tokenizer.decode(gen_ids, skip_special_tokens=True)

        return new_response
    except Exception as e:
        print(f"Error: {e}")
        gr.Warning(f"Error: {e}")
        return ""

# ── UI ────────────────────────────────────────────────────────────────────────
examples = [
    ["Translate From English to Devnagri Konkani: what is color?"],
    ["ΰ€˜ΰ€°ΰ€Ύΰ€‚ΰ€€ ΰ€΅ΰ€Ώΰ€œΰ₯‡ΰ€šΰ₯‹ ΰ€΅ΰ€Ύΰ€ͺΰ€° ΰ€‰ΰ€£ΰ₯‹ ΰ€•ΰ€°ΰ€ͺΰ€Ύΰ€šΰ₯€ ΰ€―ΰ₯‡ΰ€΅ΰ€œΰ€£ ΰ€€ΰ€―ΰ€Ύΰ€° ΰ€•ΰ€°ΰ€ͺ."],
]

demo = gr.ChatInterface(
    fn=generate_response,
    type="messages",
    title=TITLE,
    description=DESCRIPTION,
    examples=examples,
    cache_examples=True,
    theme="soft",
    additional_inputs=[
        gr.Textbox(value="", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=DEF_TOKENS, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=DEF_TEMPERATURE, step=0.1, label="Temperature"),
        gr.Slider(minimum=0, maximum=360, value=DEF_TOPK, step=1, label="Top-k"),
        gr.Slider(minimum=0.1, maximum=1.0, value=DEF_TOPP, step=0.05, label="Top-p (nucleus sampling)"),
        gr.Slider(minimum=1, maximum=360, value=DEF_DURATION, step=1, label="GPU duration (seconds)"),
    ],
)

# ── Launch ────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    print("πŸš€ Starting Gradio app for ZeroGPU...")
    demo.queue().launch()