# app.py - Corrected for Hugging Face ZeroGPU Spaces
# ---------------------------------------------------------------
# This version is adapted for the ZeroGPU environment by using
# the @spaces.GPU decorator.
# ---------------------------------------------------------------
import os
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import spaces # 1. Import the spaces library
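# `spaces` provides the @spaces.GPU decorator, which requests a GPU on demand while a
# decorated function runs on a ZeroGPU Space.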
IS_CUDA = torch.cuda.is_available()
IS_ZEROGPU = bool(os.getenv("SPACES_ZERO_GPU"))
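# On ZeroGPU a GPU is attached only while a @spaces.GPU call runs, so torch.compile
# warm-up rarely pays off; "force_eager" keeps execution eager. The TF32 settings trade
# a little float32 matmul precision for faster matmuls on Ampere-class GPUs.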
if IS_ZEROGPU:
    torch.compiler.set_stance("force_eager")
    torch.set_float32_matmul_precision("high")
    torch.backends.cuda.matmul.allow_tf32 = True
# ── Configuration ────────────────────────────────────────────────────────────
MODEL_ID = "Reubencf/gemma3-konkani"
HF_TOKEN = os.getenv("HF_TOKEN", None)
TITLE = "Konkani LLM Fine Tuned on Gemma 3"
DESCRIPTION = (
    "Version 1 of the Konkani LLM.\n"
    "This release may contain inconsistencies, but improvements will follow in future updates."
)
# ── Loading ──────────────────────────────────────────────────────────────────
print(f"[Init] Loading model pipeline for the first time: {MODEL_ID}...")
def load_model():
try:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
kwargs = {"attn_implementation": "sdpa"} if IS_CUDA else {}
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16 if IS_CUDA else torch.float32,
device_map="auto", token=HF_TOKEN, **kwargs)
print("[Init] Model loaded successfully.")
return model, tokenizer
except Exception as e:
# If model loading fails, we can't proceed.
print(f"[Fatal] Could not load model: {e}")
raise Exception(f"❌ Model failed to load: {e}")
model, tokenizer = load_model()
DEF_TOKENS = 256
DEF_TEMPERATURE = 0.7
DEF_TOPK = 50
DEF_TOPP = 0.95
DEF_DURATION = 10
def get_duration(message, history=None, system_message="", max_tokens=DEF_TOKENS,
                 temperature=DEF_TEMPERATURE, top_k=DEF_TOPK, top_p=DEF_TOPP, duration=DEF_DURATION):
    return int(duration if duration is not None else DEF_DURATION)
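# spaces.GPU also accepts a callable for `duration`: ZeroGPU calls it with the same
# arguments as the decorated function and reserves the GPU for the returned number of
# seconds, so the "Duration" slider below controls the per-request GPU budget.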
# ── Generation Function ──────────────────────────────────────────────────────
@spaces.GPU(duration=get_duration) # 2. Decorate the function that needs the GPU
@torch.inference_mode()
def generate_response(message, history=None, system_message="", max_tokens=DEF_TOKENS,
                      temperature=DEF_TEMPERATURE, top_k=DEF_TOPK, top_p=DEF_TOPP, duration=DEF_DURATION):
    """
    This function is called for each user message.
    The @spaces.GPU decorator ensures a GPU is allocated when this runs.
    """
    try:
        # Format the conversation history
        conversation = []
        if system_message:
            conversation.append({"role": "system", "content": system_message})
        for msg in history or []:  # https://www.gradio.app/docs/gradio/chatbot
            if not isinstance(msg, dict) or not {"role", "content"}.issubset(msg.keys()):
                continue
            conversation.append({"role": msg["role"], "content": msg["content"]})
        # Add the current user's message
        conversation.append({"role": "user", "content": message})

        # Apply the chat template
        inputs = tokenizer.apply_chat_template(
            conversation,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to(model.device)

        # Generate the response
        gen_kwargs = dict(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            # eos_token_id=tokenizer.eos_token_id,
            # num_beams=1,
            output_scores=False,
            cache_implementation="static",  # https://github.com/huggingface/transformers/issues/38501
        )
        outputs = model.generate(**gen_kwargs)

        # Extract only the newly generated text (tokens after the prompt)
        gen_ids = outputs[0][inputs["input_ids"].shape[-1]:]
        new_response = tokenizer.decode(gen_ids, skip_special_tokens=True)
        return new_response
    except Exception as e:
        print(f"Error: {e}")
        gr.Warning(f"Error: {e}")
        return ""
# ── UI ───────────────────────────────────────────────────────────────────────
examples = [
    ["Translate From English to Devnagri Konkani: what is color?"],
    ["घरांत विजेचो वापर उणो करपाची येवजण तयार करप."],  # "Prepare a plan to reduce electricity use at home."
]
demo = gr.ChatInterface(
    fn=generate_response,
    type="messages",
    title=TITLE,
    description=DESCRIPTION,
    examples=examples,
    cache_examples=True,
    theme="soft",
    additional_inputs=[
        gr.Textbox(value="", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=DEF_TOKENS, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=DEF_TEMPERATURE, step=0.1, label="Temperature"),
        gr.Slider(minimum=0, maximum=360, value=DEF_TOPK, step=1, label="Top-k"),
        gr.Slider(minimum=0.1, maximum=1.0, value=DEF_TOPP, step=0.05, label="Top-p (nucleus sampling)"),
        gr.Slider(minimum=1, maximum=360, value=DEF_DURATION, step=1, label="Duration (s)"),
    ],
)
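# Note: cache_examples=True makes Gradio run generate_response on the example prompts
# once and serve the cached outputs when an example is clicked.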
# ── Launch ───────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    print("🚀 Starting Gradio app for ZeroGPU...")
    demo.queue().launch()