# app.py - Corrected for Hugging Face ZeroGPU Spaces
# ---------------------------------------------------------------
# This version is adapted for the ZeroGPU environment by using
# the @spaces.GPU decorator.
# ---------------------------------------------------------------
import os
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import spaces # 1. Import the spaces library
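# `spaces` provides the @spaces.GPU decorator, which requests a GPU on demand while a
# decorated function runs on a ZeroGPU Space.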
IS_CUDA = torch.cuda.is_available()
IS_ZEROGPU = bool(os.getenv("SPACES_ZERO_GPU"))
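# On ZeroGPU a GPU is attached only while a @spaces.GPU call runs, so torch.compile
# warm-up rarely pays off; "force_eager" keeps execution eager. The TF32 settings trade
# a little float32 matmul precision for faster matmuls on Ampere-class GPUs.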
if IS_ZEROGPU:
    torch.compiler.set_stance("force_eager")
    torch.set_float32_matmul_precision("high")
    torch.backends.cuda.matmul.allow_tf32 = True
# ── Configuration ────────────────────────────────────────────────────────────
MODEL_ID = "Reubencf/gemma3-konkani"
HF_TOKEN = os.getenv("HF_TOKEN", None)
TITLE = "Konkani LLM Fine Tuned on Gemma 3"
DESCRIPTION = (
    "Version 1 of the Konkani LLM.\n"
    "This release may contain inconsistencies, but improvements will follow in future updates."
)
# ── Loading ──────────────────────────────────────────────────────────────────
print(f"[Init] Loading model pipeline for the first time: {MODEL_ID}...")
def load_model():
try:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
kwargs = {"attn_implementation": "sdpa"} if IS_CUDA else {}
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16 if IS_CUDA else torch.float32,
device_map="auto", token=HF_TOKEN, **kwargs)
print("[Init] Model loaded successfully.")
return model, tokenizer
except Exception as e:
# If model loading fails, we can't proceed.
print(f"[Fatal] Could not load model: {e}")
raise Exception(f"❌ Model failed to load: {e}")
model, tokenizer = load_model()
DEF_TOKENS = 256
DEF_TEMPERATURE = 0.7
DEF_TOPK = 50
DEF_TOPP = 0.95
DEF_DURATION = 10
def get_duration(message, history=None, system_message="", max_tokens=DEF_TOKENS,
                 temperature=DEF_TEMPERATURE, top_k=DEF_TOPK, top_p=DEF_TOPP, duration=DEF_DURATION):
    return int(duration if duration is not None else DEF_DURATION)
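# spaces.GPU also accepts a callable for `duration`: ZeroGPU calls it with the same
# arguments as the decorated function and reserves the GPU for the returned number of
# seconds, so the "Duration" slider below controls the per-request GPU budget.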
# ── Generation Function ──────────────────────────────────────────────────────
@spaces.GPU(duration=get_duration) # 2. Decorate the function that needs the GPU
@torch.inference_mode()
def generate_response(message, history=None, system_message="", max_tokens=DEF_TOKENS,
                      temperature=DEF_TEMPERATURE, top_k=DEF_TOPK, top_p=DEF_TOPP, duration=DEF_DURATION):
    """
    This function is called for each user message.
    The @spaces.GPU decorator ensures a GPU is allocated when this runs.
    """
    try:
        # Format the conversation history
        conversation = []
        if system_message:
            conversation.append({"role": "system", "content": system_message})
        for msg in history or []:  # https://www.gradio.app/docs/gradio/chatbot
            if not isinstance(msg, dict) or not {"role", "content"}.issubset(msg.keys()):
                continue
            conversation.append({"role": msg["role"], "content": msg["content"]})
        # Add the current user's message
        conversation.append({"role": "user", "content": message})

        # Apply the chat template
        inputs = tokenizer.apply_chat_template(
            conversation,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to(model.device)

        # Generate the response
        gen_kwargs = dict(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            # eos_token_id=tokenizer.eos_token_id,
            # num_beams=1,
            output_scores=False,
            cache_implementation="static",  # https://github.com/huggingface/transformers/issues/38501
        )
        outputs = model.generate(**gen_kwargs)

        # Extract only the newly generated text (tokens after the prompt)
        gen_ids = outputs[0][inputs["input_ids"].shape[-1]:]
        new_response = tokenizer.decode(gen_ids, skip_special_tokens=True)
        return new_response
    except Exception as e:
        print(f"Error: {e}")
        gr.Warning(f"Error: {e}")
        return ""
# ── UI ───────────────────────────────────────────────────────────────────────
examples = [
    ["Translate From English to Devnagri Konkani: what is color?"],
    ["घरांत विजेचो वापर उणो करपाची येवजण तयार करप."],  # "Prepare a plan to reduce electricity use at home."
]
demo = gr.ChatInterface(
    fn=generate_response,
    type="messages",
    title=TITLE,
    description=DESCRIPTION,
    examples=examples,
    cache_examples=True,
    theme="soft",
    additional_inputs=[
        gr.Textbox(value="", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=DEF_TOKENS, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=DEF_TEMPERATURE, step=0.1, label="Temperature"),
        gr.Slider(minimum=0, maximum=360, value=DEF_TOPK, step=1, label="Top-k"),
        gr.Slider(minimum=0.1, maximum=1.0, value=DEF_TOPP, step=0.05, label="Top-p (nucleus sampling)"),
        gr.Slider(minimum=1, maximum=360, value=DEF_DURATION, step=1, label="Duration (s)"),
    ],
)
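# Note: cache_examples=True makes Gradio run generate_response on the example prompts
# once and serve the cached outputs when an example is clicked.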
# ── Launch ───────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    print("🚀 Starting Gradio app for ZeroGPU...")
    demo.queue().launch()