# app.py - Corrected for Hugging Face ZeroGPU Spaces
# ---------------------------------------------------------------
# This version is adapted for the ZeroGPU environment by using
# the @spaces.GPU decorator.
# ---------------------------------------------------------------
import os

import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

import spaces  # 1. Import the spaces library
IS_CUDA = torch.cuda.is_available()
IS_ZEROGPU = bool(os.getenv("SPACES_ZERO_GPU"))

if IS_ZEROGPU:
    torch.compiler.set_stance("force_eager")
    torch.set_float32_matmul_precision("high")
    torch.backends.cuda.matmul.allow_tf32 = True
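
# Note: forcing eager mode and enabling TF32 above is a ZeroGPU-specific tweak
# (assumption: eager mode avoids torch.compile warm-up inside short-lived GPU
# allocations, and TF32 speeds up fp32 matmuls on the Ampere-or-newer GPUs
# ZeroGPU provides, at a small precision cost).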
# ── Configuration ─────────────────────────────────────────────────────────────
MODEL_ID = "Reubencf/gemma3-konkani"
HF_TOKEN = os.getenv("HF_TOKEN", None)
TITLE = "Konkani LLM Fine-Tuned on Gemma 3"
DESCRIPTION = (
    "Version 1 of the Konkani LLM.\n"
    "This release may contain inconsistencies, but improvements will follow in future updates."
)
# ── Loading ───────────────────────────────────────────────────────────────────
print(f"[Init] Loading model pipeline for the first time: {MODEL_ID}...")
def load_model():
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
        kwargs = {"attn_implementation": "sdpa"} if IS_CUDA else {}
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.bfloat16 if IS_CUDA else torch.float32,
            device_map="auto",
            token=HF_TOKEN,
            **kwargs,
        )
        print("[Init] Model loaded successfully.")
        return model, tokenizer
    except Exception as e:
        # If model loading fails, we can't proceed.
        print(f"[Fatal] Could not load model: {e}")
        raise Exception(f"Model failed to load: {e}")


model, tokenizer = load_model()
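
# Note: loading the model once at import time is the recommended pattern on
# ZeroGPU (assumption: the `spaces` package intercepts CUDA initialization so
# weights placed via device_map="auto" only occupy a real GPU while a
# @spaces.GPU-decorated call is running).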
DEF_TOKENS = 256
DEF_TEMPERATURE = 0.7
DEF_TOPK = 50
DEF_TOPP = 0.95
DEF_DURATION = 10


def get_duration(message, history=[], system_message="", max_tokens=DEF_TOKENS,
                 temperature=DEF_TEMPERATURE, top_k=DEF_TOPK, top_p=DEF_TOPP,
                 duration=DEF_DURATION):
    return int(duration if duration is not None else DEF_DURATION)
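
# Note: get_duration mirrors generate_response's signature so it can be passed
# as the duration argument of @spaces.GPU below (spaces accepts a callable that
# receives the same arguments as the decorated function); the value from the
# "Duration" slider then bounds, in seconds, how long the GPU is held per request.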
# ── Generation Function ───────────────────────────────────────────────────────
# 2. Decorate the function that needs the GPU
@spaces.GPU(duration=get_duration)
def generate_response(message, history=[], system_message="", max_tokens=DEF_TOKENS,
                      temperature=DEF_TEMPERATURE, top_k=DEF_TOPK, top_p=DEF_TOPP,
                      duration=DEF_DURATION):
    """
    This function is called for each user message.
    The @spaces.GPU decorator ensures a GPU is allocated when this runs.
    """
    try:
        # Format the conversation history
        conversation = []
        if system_message:
            conversation.append({"role": "system", "content": system_message})
        for msg in history:  # https://www.gradio.app/docs/gradio/chatbot
            if not isinstance(msg, dict) or not {"role", "content"}.issubset(msg.keys()):
                continue
            conversation.append({"role": msg["role"], "content": msg["content"]})
        # Add the current user's message
        conversation.append({"role": "user", "content": message})
        # Apply the chat template
        inputs = tokenizer.apply_chat_template(
            conversation,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to(model.device)
        # Generate the response
        gen_kwargs = dict(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            # eos_token_id=tokenizer.eos_token_id,
            # num_beams=1,
            output_scores=False,
            cache_implementation="static",  # https://github.com/huggingface/transformers/issues/38501
        )
        outputs = model.generate(**gen_kwargs)
        # Extract only the newly generated text
        gen_ids = outputs[0][inputs["input_ids"].shape[-1]:]
        new_response = tokenizer.decode(gen_ids, skip_special_tokens=True)
        return new_response
    except Exception as e:
        print(f"Error: {e}")
        gr.Warning(f"Error: {e}")
        return ""
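
# Note: for token-by-token streaming, generate_response could instead run
# model.generate in a background thread with transformers.TextIteratorStreamer
# and `yield` partial text; gr.ChatInterface accepts generator functions. It is
# kept as a single blocking call here to match the original behavior.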
# ── UI ─────────────────────────────────────────────────────────────────────────
examples = [
    ["Translate From English to Devnagri Konkani: what is color?"],
    ["ΰ€ΰ€°ΰ€Ύΰ€ΰ€€ ΰ€΅ΰ€Ώΰ€ΰ₯ΰ€ΰ₯ ΰ€΅ΰ€Ύΰ€ͺΰ€° ΰ€ΰ€£ΰ₯ ΰ€ΰ€°ΰ€ͺΰ€Ύΰ€ΰ₯ ΰ€―ΰ₯ΰ€΅ΰ€ΰ€£ ΰ€€ΰ€―ΰ€Ύΰ€° ΰ€ΰ€°ΰ€ͺ."],
]
demo = gr.ChatInterface(
    fn=generate_response,
    type="messages",
    title=TITLE,
    description=DESCRIPTION,
    examples=examples,
    cache_examples=True,
    theme="soft",
    additional_inputs=[
        gr.Textbox(value="", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=DEF_TOKENS, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=DEF_TEMPERATURE, step=0.1, label="Temperature"),
        gr.Slider(minimum=0, maximum=360, value=DEF_TOPK, step=1, label="Top-k"),
        gr.Slider(minimum=0.1, maximum=1.0, value=DEF_TOPP, step=0.05, label="Top-p (nucleus sampling)"),
        gr.Slider(minimum=1, maximum=360, value=DEF_DURATION, step=1, label="Duration (seconds)"),
    ],
)
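
# Note: demo.queue() (called at launch below) keeps incoming requests in a
# queue so that calls to the @spaces.GPU-decorated generate_response are
# processed one at a time per worker rather than competing for a single
# allocation (assumption based on Gradio's default concurrency limits).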
# ── Launch ─────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    print("Starting Gradio app for ZeroGPU...")
    demo.queue().launch()