# app.py - Corrected for Hugging Face ZeroGPU Spaces
# ---------------------------------------------------------------
# This version is adapted for the ZeroGPU environment by using
# the @spaces.GPU decorator.
# ---------------------------------------------------------------
import os
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import spaces # 1. Import the spaces library
IS_CUDA = torch.cuda.is_available()
IS_ZEROGPU = True if os.getenv("SPACES_ZERO_GPU", None) else False
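# SPACES_ZERO_GPU is set by the ZeroGPU runtime. When it is present, skip
# torch.compile (force eager execution) and allow TF32 matmuls, which speeds
# up float32 matrix multiplies on recent NVIDIA GPUs.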
if IS_ZEROGPU:
    torch.compiler.set_stance("force_eager")
    torch.set_float32_matmul_precision("high")
    torch.backends.cuda.matmul.allow_tf32 = True
# ── Configuration ─────────────────────────────────────────────────────────────
MODEL_ID = "Reubencf/gemma3-konkani"
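# HF_TOKEN is only required if the model repository is gated or private.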
HF_TOKEN = os.getenv("HF_TOKEN", None)
TITLE = "Konkani LLM Fine-Tuned on Gemma 3"
DESCRIPTION = (
    "Version 1 of the Konkani LLM.\n"
    "This release may contain inconsistencies, but improvements will follow in future updates."
)
# ── Loading ───────────────────────────────────────────────────────────────────
print(f"[Init] Loading model pipeline for the first time: {MODEL_ID}...")
def load_model():
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
        kwargs = {"attn_implementation": "sdpa"} if IS_CUDA else {}
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.bfloat16 if IS_CUDA else torch.float32,
            device_map="auto",
            token=HF_TOKEN,
            **kwargs,
        )
        print("[Init] Model loaded successfully.")
        return model, tokenizer
    except Exception as e:
        # If model loading fails, we can't proceed.
        print(f"[Fatal] Could not load model: {e}")
        raise RuntimeError(f"Model failed to load: {e}") from e
model, tokenizer = load_model()
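# Default generation settings shared by the UI sliders, the duration
# estimator, and the generation callback below.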
DEF_TOKENS = 256
DEF_TEMPERATURE = 0.7
DEF_TOPK = 50
DEF_TOPP = 0.95
DEF_DURATION = 10
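# spaces.GPU accepts a callable for `duration`: it is called with the same
# arguments as the decorated function and must return the number of GPU
# seconds to request, so the "Duration (seconds)" slider feeds straight in.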
def get_duration(message, history=[], system_message="", max_tokens=DEF_TOKENS, temperature=DEF_TEMPERATURE, top_k=DEF_TOPK, top_p=DEF_TOPP, duration=DEF_DURATION):
    return int(duration if duration is not None else DEF_DURATION)
# ── Generation Function ───────────────────────────────────────────────────────
@spaces.GPU(duration=get_duration) # 2. Decorate the function that needs the GPU
@torch.inference_mode()
def generate_response(message, history=[], system_message="", max_tokens=DEF_TOKENS, temperature=DEF_TEMPERATURE, top_k=DEF_TOPK, top_p=DEF_TOPP, duration=DEF_DURATION):
    """
    This function is called for each user message.
    The @spaces.GPU decorator ensures a GPU is allocated when this runs.
    """
    try:
        # Format the conversation history
        conversation = []
        if system_message:
            conversation.append({"role": "system", "content": system_message})
        for msg in history:  # https://www.gradio.app/docs/gradio/chatbot
            if not isinstance(msg, dict) or not {"role", "content"}.issubset(msg.keys()):
                continue
            conversation.append({"role": msg["role"], "content": msg["content"]})
        # Add the current user's message
        conversation.append({"role": "user", "content": message})
        # Apply the chat template
        inputs = tokenizer.apply_chat_template(
            conversation,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to(model.device)
        # Generate the response
        gen_kwargs = dict(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            # eos_token_id=tokenizer.eos_token_id,
            # num_beams=1,
            output_scores=False,
            cache_implementation="static",  # https://github.com/huggingface/transformers/issues/38501
        )
        outputs = model.generate(**gen_kwargs)
        # Extract only the newly generated text
        gen_ids = outputs[0][inputs["input_ids"].shape[-1]:]
        new_response = tokenizer.decode(gen_ids, skip_special_tokens=True)
        return new_response
    except Exception as e:
        print(f"Error: {e}")
        gr.Warning(f"Error: {e}")
        return ""
# ── UI ────────────────────────────────────────────────────────────────────────
examples = [
    ["Translate From English to Devnagri Konkani: what is color?"],
    ["ΰ€ΰ€°ΰ€Ύΰ€ΰ€€ ΰ€΅ΰ€Ώΰ€ΰ₯ΰ€ΰ₯ ΰ€΅ΰ€Ύΰ€ͺΰ€° ΰ€ΰ€£ΰ₯ ΰ€ΰ€°ΰ€ͺΰ€Ύΰ€ΰ₯ ΰ€―ΰ₯ΰ€΅ΰ€ΰ€£ ΰ€€ΰ€―ΰ€Ύΰ€° ΰ€ΰ€°ΰ€ͺ."],
]
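# Note: with cache_examples=True, Gradio runs generate_response once per
# example at startup to build the cache, which itself consumes GPU time.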
demo = gr.ChatInterface(
    fn=generate_response,
    type="messages",
    title=TITLE,
    description=DESCRIPTION,
    examples=examples,
    cache_examples=True,
    theme="soft",
    additional_inputs=[
        gr.Textbox(value="", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=DEF_TOKENS, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=DEF_TEMPERATURE, step=0.1, label="Temperature"),
        gr.Slider(minimum=0, maximum=360, value=DEF_TOPK, step=1, label="Top-k"),
        gr.Slider(minimum=0.1, maximum=1.0, value=DEF_TOPP, step=0.05, label="Top-p (nucleus sampling)"),
        gr.Slider(minimum=1, maximum=360, value=DEF_DURATION, step=1, label="Duration (seconds)"),
    ],
)
# ── Launch ────────────────────────────────────────────────────────────────────
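# Each chat request goes through the Gradio queue and triggers its own
# short-lived ZeroGPU allocation via the @spaces.GPU-decorated callback.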
if __name__ == "__main__":
    print("Starting Gradio app for ZeroGPU...")
    demo.queue().launch()