# app.py - Corrected for Hugging Face ZeroGPU Spaces
# ---------------------------------------------------------------
# This version is adapted for the ZeroGPU environment by using
# the @spaces.GPU decorator.
# ---------------------------------------------------------------
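# Note: the `spaces` package ships preinstalled in the Spaces runtime; for
# local runs it can be installed with `pip install spaces`, and its GPU
# decorator is effectively a pass-through outside of Spaces.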
import os
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import spaces # 1. Import the spaces library

IS_CUDA = torch.cuda.is_available()
IS_ZEROGPU = bool(os.getenv("SPACES_ZERO_GPU"))
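# ZeroGPU hands out short-lived GPU slices, so torch.compile warm-up rarely
# pays off: the "force_eager" stance makes any compiled code run eagerly.
# The matmul settings enable TF32, trading a little float32 precision for speed.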
if IS_ZEROGPU:
    torch.compiler.set_stance("force_eager")
    torch.set_float32_matmul_precision("high")
    torch.backends.cuda.matmul.allow_tf32 = True

# ── Configuration ────────────────────────────────────────────────────────────
MODEL_ID = "Reubencf/gemma3-konkani"
HF_TOKEN = os.getenv("HF_TOKEN", None)
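# HF_TOKEN is only required if the model repository is gated or private;
# public checkpoints load without it.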

TITLE = "Konkani LLM Fine-Tuned on Gemma 3"
DESCRIPTION = (
    "Version 1 of the Konkani LLM.\n"
    "This release may contain inconsistencies, but improvements will follow in future updates."
)

# ── Loading ──────────────────────────────────────────────────────────────────
print(f"[Init] Loading model pipeline for the first time: {MODEL_ID}...")
def load_model():
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
        kwargs = {"attn_implementation": "sdpa"} if IS_CUDA else {}
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID, token=HF_TOKEN, device_map="auto",
            torch_dtype=torch.bfloat16 if IS_CUDA else torch.float32, **kwargs,
        )
        print("[Init] Model loaded successfully.")
        return model, tokenizer
    except Exception as e:
        # If model loading fails, we can't proceed.
        print(f"[Fatal] Could not load model: {e}")
        raise Exception(f"❌ Model failed to load: {e}")

model, tokenizer = load_model()

DEF_TOKENS = 256
DEF_TEMPERATURE = 0.7
DEF_TOPK = 50
DEF_TOPP = 0.95
DEF_DURATION = 10

def get_duration(message, history=None, system_message="", max_tokens=DEF_TOKENS, temperature=DEF_TEMPERATURE, top_k=DEF_TOPK, top_p=DEF_TOPP, duration=DEF_DURATION):
    return int(duration if duration is not None else DEF_DURATION)

# ── Generation Function ──────────────────────────────────────────────────────
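# `spaces.GPU` accepts either a fixed number of seconds or a callable for
# `duration`; a callable is invoked with the same arguments as the decorated
# function, so the requested GPU time can follow the "Duration" slider below.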
@spaces.GPU(duration=get_duration) # 2. Decorate the function that needs the GPU
@torch.inference_mode()
def generate_response(message, history=None, system_message="", max_tokens=DEF_TOKENS, temperature=DEF_TEMPERATURE, top_k=DEF_TOPK, top_p=DEF_TOPP, duration=DEF_DURATION):
    """
    This function is called for each user message.
    The @spaces.GPU decorator ensures a GPU is allocated when this runs.
    """
    try:
        # Format the conversation history
        conversation = []
        if system_message: conversation.append({"role": "system", "content": system_message})
        for msg in history: # https://www.gradio.app/docs/gradio/chatbot
            if not isinstance(msg, dict) or not {"role", "content"}.issubset(msg.keys()): continue
            conversation.append({"role": msg["role"], "content": msg["content"]})

        # Add the current user's message
        conversation.append({"role": "user", "content": message})

        # Apply the chat template
        inputs = tokenizer.apply_chat_template(
            conversation,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to(model.device)
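        # With return_dict=True the tokenizer returns both input_ids and
        # attention_mask, which generate() consumes below.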

        # Generate the response
        gen_kwargs = dict(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            #eos_token_id=tokenizer.eos_token_id,
            #num_beams=1,
            output_scores=False,
            cache_implementation="static", # https://github.com/huggingface/transformers/issues/38501
        )
        outputs = model.generate(**gen_kwargs)

        # Extract only the newly generated text
        gen_ids = outputs[0][inputs["input_ids"].shape[-1]:]
        new_response = tokenizer.decode(gen_ids, skip_special_tokens=True)

        return new_response
    except Exception as e:
        print(f"Error: {e}")
        gr.Warning(f"Error: {e}")
        return ""

# ── UI ────────────────────────────────────────────────────────────────────────
examples = [
    ["Translate From English to Devnagri Konkani: what is color?"],
    ["ΰ€˜ΰ€°ΰ€Ύΰ€‚ΰ€€ ΰ€΅ΰ€Ώΰ€œΰ₯‡ΰ€šΰ₯‹ ΰ€΅ΰ€Ύΰ€ͺΰ€° ΰ€‰ΰ€£ΰ₯‹ ΰ€•ΰ€°ΰ€ͺΰ€Ύΰ€šΰ₯€ ΰ€―ΰ₯‡ΰ€΅ΰ€œΰ€£ ΰ€€ΰ€―ΰ€Ύΰ€° ΰ€•ΰ€°ΰ€ͺ."],
]

demo = gr.ChatInterface(
    fn=generate_response,
    type="messages",
    title=TITLE,
    description=DESCRIPTION,
    examples=examples,
    cache_examples=True,
    theme="soft",
    additional_inputs=[
        gr.Textbox(value="", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=DEF_TOKENS, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=DEF_TEMPERATURE, step=0.1, label="Temperature"),
        gr.Slider(minimum=0, maximum=360, value=DEF_TOPK, step=1, label="Top-k"),
        gr.Slider(minimum=0.1, maximum=1.0, value=DEF_TOPP, step=0.05, label="Top-p (nucleus sampling)"),
        gr.Slider(minimum=1, maximum=360, value=DEF_DURATION, step=1, label="GPU duration (seconds)"),
    ],
)

# ── Launch ────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    print("πŸš€ Starting Gradio app for ZeroGPU...")
    demo.queue().launch()