Reubencf committed · verified
Commit af73ec8 · 1 Parent(s): e79aff2

Update app.py

Files changed (1):
  1. app.py +13 -4
app.py CHANGED

@@ -7,11 +7,14 @@ import os
 import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import TorchAoConfig # not for Zero GPU
+from torchao.quantization import Int8DynamicActivationInt8WeightConfig, Float8DynamicActivationFloat8WeightConfig # not for Zero GPU
 import spaces  # 1. Import the spaces library
 
 IS_CUDA = torch.cuda.is_available()
 IS_ZEROGPU = True if os.getenv("SPACES_ZERO_GPU", None) else False
 if IS_ZEROGPU: torch.set_float32_matmul_precision("high")
+IS_QUANT = True
 
 # ── Configuration ────────────────────────────────────────────────────────────
 MODEL_ID = "Reubencf/gemma3-konkani"
@@ -28,8 +31,14 @@ print(f"[Init] Loading model pipeline for the first time: {MODEL_ID}...")
 def load_model():
     try:
         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
-        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16 if IS_CUDA else torch.float32,
-                                                     device_map="auto", token=HF_TOKEN)
+        if IS_QUANT: # not for Zero GPU
+            quant_config = Float8DynamicActivationFloat8WeightConfig() if IS_CUDA else Int8DynamicActivationInt8WeightConfig()
+            quantization_config = TorchAoConfig(quant_type=quant_config)
+            model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16 if IS_CUDA else torch.float32,
+                                                         device_map="auto", quantization_config=quantization_config, token=HF_TOKEN)
+        else:
+            model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16 if IS_CUDA else torch.float32,
+                                                         device_map="auto", token=HF_TOKEN)
         print("[Init] Model loaded successfully.")
         return model, tokenizer
     except Exception as e:
@@ -79,7 +88,7 @@ def generate_response(message, history=[], system_message="", max_tokens=DEF_TOK
     # Generate the response
     gen_kwargs = dict(
         input_ids=inputs["input_ids"],
-        attention_mask=inputs["attention_mask"],
+        #attention_mask=inputs["attention_mask"],
         max_new_tokens=max_tokens,
         do_sample=True,
         temperature=temperature,
@@ -129,4 +138,4 @@ demo = gr.ChatInterface(
 # ── Launch ────────────────────────────────────────────────────────────────────
 if __name__ == "__main__":
     print("🚀 Starting Gradio app for ZeroGPU...")
-    demo.queue().launch()
+    demo.queue().launch()
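For reference, the quantized load path this commit adds can be exercised standalone. The following is a minimal sketch, assuming transformers and torchao versions recent enough that TorchAoConfig accepts a torchao config object via quant_type (as the commit's code does); the fp8 config generally requires a recent CUDA GPU, with the int8 dynamic config as the CPU fallback. The prompt and generation settings here are illustrative, not taken from app.py.

# Standalone sketch of the torchao quantization path added in this commit.
# Assumes transformers (with TorchAoConfig) and torchao are installed;
# as the in-diff comments note, this path is not used on ZeroGPU Spaces.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
from torchao.quantization import (
    Int8DynamicActivationInt8WeightConfig,      # int8 dynamic quant (CPU fallback)
    Float8DynamicActivationFloat8WeightConfig,  # fp8 dynamic quant (recent CUDA GPUs)
)

MODEL_ID = "Reubencf/gemma3-konkani"
IS_CUDA = torch.cuda.is_available()

# Same selection as load_model(): fp8 on CUDA, int8 dynamic on CPU.
quant_config = (
    Float8DynamicActivationFloat8WeightConfig()
    if IS_CUDA
    else Int8DynamicActivationInt8WeightConfig()
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16 if IS_CUDA else torch.float32,
    device_map="auto",
    quantization_config=TorchAoConfig(quant_type=quant_config),
)

inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0]))

Note that the commit also comments out the explicit attention_mask in gen_kwargs; when it is omitted, generate() infers a mask itself, which can emit a warning if the pad and EOS token ids coincide.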