Spaces: Running on Zero
Darius Morawiec committed · Commit bfa4ccc · 1 Parent(s): a84c724
Implement AutoModel class for dynamic model loading and refactor model initialization
app.py CHANGED
@@ -46,6 +46,27 @@ MODEL_IDS = [
     "Qwen/Qwen3-VL-32B-Instruct", # https://huggingface.co/Qwen/Qwen3-VL-32B-Instruct
 ]
 
+# Global variables to track loaded model
+current_model = None
+current_processor = None
+current_model_id = None
+
+
+class AutoModel:
+    @staticmethod
+    def from_pretrained(model_id, dtype="auto", device_map="cpu"):
+        if model_id.startswith("Qwen/Qwen2-VL"):
+            model_loader = Qwen2VLForConditionalGeneration
+        elif model_id.startswith("Qwen/Qwen2.5-VL"):
+            model_loader = Qwen2_5_VLForConditionalGeneration
+        elif model_id.startswith("Qwen/Qwen3-VL"):
+            model_loader = Qwen3VLForConditionalGeneration
+        else:
+            raise ValueError(f"Unsupported model ID: {model_id}")
+        return model_loader.from_pretrained(
+            model_id, dtype=dtype, device_map=device_map
+        )
+
 
 def resize_image(image, target_size=1000):
     width, height = image.size
@@ -149,11 +170,6 @@ with gr.Blocks() as demo:
     with gr.Row():
         run_button = gr.Button("Run")
 
-    # Global variables to track loaded model
-    current_model = None
-    current_processor = None
-    current_model_id = None
-
    def load_model(model_id: str):
        global current_model, current_processor, current_model_id
 
@@ -170,23 +186,12 @@ with gr.Blocks() as demo:
 
        # Force garbage collection and clear CUDA cache
        gc.collect()
-       torch.cuda.empty_cache()
-
        if torch.cuda.is_available():
+           torch.cuda.empty_cache()
            torch.cuda.synchronize()
 
-
-
-       if model_id.startswith("Qwen/Qwen2-VL"):
-           model_loader = Qwen2VLForConditionalGeneration
-       elif model_id.startswith("Qwen/Qwen2.5-VL"):
-           model_loader = Qwen2_5_VLForConditionalGeneration
-       elif model_id.startswith("Qwen/Qwen3-VL"):
-           model_loader = Qwen3VLForConditionalGeneration
-       assert model_loader is not None, f"Unsupported model ID: {model_id}"
-       # Load model on CPU to avoid using CUDA resources during download
-       current_model = model_loader.from_pretrained(
-           model_id, torch_dtype=torch.bfloat16, device_map="cpu"
+       current_model = AutoModel.from_pretrained(
+           model_id, dtype="auto", device_map="cpu"
        )
        current_processor = AutoProcessor.from_pretrained(model_id)
        current_model_id = model_id
@@ -290,8 +295,10 @@ with gr.Blocks() as demo:
        image_resize: str = "Yes",
        image_target_size: int | None = None,
    ):
+       # Load the model and processor on CPU
        model, processor = load_model(model_id)
 
+       # Run inference on GPU (if available)
        return generate(
            model,
            processor,
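Note on the first hunk: this AutoModel is a Space-local dispatcher, not transformers.AutoModel; it only routes a Hub model ID prefix to the matching Qwen loader class. A minimal, hypothetical call site, assuming the imports already present in app.py (the checkpoint ID below is just an example):

    # Illustrative use of the dispatcher added above; any supported
    # Qwen VL checkpoint ID resolves the same way.
    model = AutoModel.from_pretrained(
        "Qwen/Qwen2.5-VL-7B-Instruct",  # example ID -> Qwen2_5_VLForConditionalGeneration
        dtype="auto",
        device_map="cpu",
    )
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")

Because the signature mirrors the transformers from_pretrained call, load_model only had to swap the branching loader lookup for a single call.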
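The refactor also replaces the old assert model_loader is not None guard with an explicit raise ValueError. The assert was doubly fragile: it is stripped when Python runs with -O, and when no branch matched, model_loader was never bound, so the check would surface as a NameError rather than the intended message. A small standalone sketch of the difference (LOADER is a hypothetical stand-in for the Qwen classes, not part of app.py):

    LOADER = object()

    def old_guard(model_id):
        if model_id.startswith("Qwen/Qwen2-VL"):
            model_loader = LOADER
        assert model_loader is not None  # NameError if no branch ran; removed under -O
        return model_loader

    def new_guard(model_id):
        if model_id.startswith("Qwen/Qwen2-VL"):
            return LOADER
        raise ValueError(f"Unsupported model ID: {model_id}")  # always fails loudly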
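The comments added in the last hunk (load on CPU, run on GPU) match the ZeroGPU pattern this Space runs on: weights are materialized with device_map="cpu", and a GPU exists only while the decorated inference function executes. A minimal sketch of that split, assuming the spaces package available in ZeroGPU Spaces; this generate is a simplified text-only stand-in for the Space's real function, not its actual implementation:

    import spaces
    import torch

    @spaces.GPU  # ZeroGPU attaches a GPU only for the duration of this call
    def generate(model, processor, prompt: str) -> str:
        model = model.to("cuda")  # move the CPU-loaded weights onto the leased GPU
        inputs = processor(text=[prompt], return_tensors="pt").to("cuda")
        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=256)
        return processor.batch_decode(output_ids, skip_special_tokens=True)[0]

Loading on CPU first keeps the metered GPU lease free during the slow download-and-initialize phase; only the forward pass pays for GPU time.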