""" |
|
Qwen3-Omni 智能GPU/CPU Offloading系統 |
|
功能: 使用Transformers accelerate的自動offloading,避免手動設備分配問題 |
|
策略: 讓accelerate庫自動處理設備間的數據傳輸 |
|
""" |

import gc
import os
import shutil
import time
import traceback
import warnings

import psutil
import torch
from transformers import (
    Qwen3OmniMoeForConditionalGeneration,
    Qwen3OmniMoeProcessor,
)

warnings.filterwarnings("ignore")


class SmartOffloadingRunner:
    """Smart offloading inference runner."""

    def __init__(self, model_path: str = "/var/www/qwen_model_quantized"):
        self.model_path = model_path
        self.model = None
        self.processor = None
        self.device = None
        self.gpu_available = torch.cuda.is_available()

        if self.gpu_available:
            self.gpu_props = torch.cuda.get_device_properties(0)
            self.total_gpu_memory = self.gpu_props.total_memory / 1024**3
            # Leave headroom: use at most 85% of VRAM, capped at 24 GB.
            self.max_gpu_memory = min(self.total_gpu_memory * 0.85, 24.0)
        else:
            self.max_gpu_memory = 0

    def get_optimal_device_map(self):
        """Choose the best device map for the available hardware."""
        if not self.gpu_available:
            print("🖥️ No GPU available, using CPU mode")
            return "cpu"

        print(f"🔍 GPU: {self.gpu_props.name} ({self.total_gpu_memory:.1f}GB)")
        print(f"📊 GPU memory budget: {self.max_gpu_memory:.1f}GB")

        # "auto" lets accelerate assign each module to GPU, CPU, or disk.
        return "auto"

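    # Note: with device_map="auto", from_pretrained hands module placement to
    # accelerate, which fits submodules onto GPU, CPU, or disk within the
    # max_memory budget; at inference time accelerate's hooks move offloaded
    # weights and activations between devices as needed.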
    def load_model_with_smart_offloading(self):
        """Load the model with smart GPU/CPU offloading."""
        print("🚀 Qwen3-Omni smart GPU/CPU offloading system")
        print("=" * 60)

        cpu_memory = psutil.virtual_memory().available / 1024**3
        print(f"💾 Available memory: CPU {cpu_memory:.1f}GB", end="")
        if self.gpu_available:
            print(f", GPU {self.total_gpu_memory:.1f}GB")
        else:
            print()

        print("\n📦 Loading processor...")
        self.processor = Qwen3OmniMoeProcessor.from_pretrained(
            self.model_path,
            trust_remote_code=True,
        )

        # Some checkpoints ship without a pad token; reuse EOS so generate()
        # can pad without complaints.
        if self.processor.tokenizer.pad_token is None:
            self.processor.tokenizer.pad_token = self.processor.tokenizer.eos_token

        print("🧠 Loading model with smart offloading...")
        start_time = time.time()

        device_map = self.get_optimal_device_map()

        try:
            if device_map == "cpu":
                # CPU-only path: fp32 weights, capped thread count.
                self.device = "cpu"
                torch.set_num_threads(min(8, psutil.cpu_count()))

                self.model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
                    self.model_path,
                    torch_dtype=torch.float32,
                    device_map="cpu",
                    trust_remote_code=True,
                    low_cpu_mem_usage=True,
                )

                # Materialize any weights still on the meta device.
                has_meta = any(p.device.type == "meta" for p in self.model.parameters())
                if has_meta:
                    print("⚠️ Handling meta-device weights...")
                    self.model = self.model.to_empty(device="cpu")
                    print("✅ Meta-device weights materialized on CPU")

            else:
                # GPU + CPU path: accelerate splits modules across devices.
                self.device = "cuda:0"

                # Use whole-number sizes; fractional strings such as "20.4GB"
                # may not be accepted by accelerate's memory-string parsing.
                max_memory = {0: f"{int(self.max_gpu_memory)}GB", "cpu": "60GB"}

                self.model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
                    self.model_path,
                    torch_dtype=torch.float16,
                    device_map=device_map,
                    max_memory=max_memory,
                    trust_remote_code=True,
                    low_cpu_mem_usage=True,
                    offload_folder="./offload_cache",
                    offload_state_dict=True,
                )

            self.model.eval()
            load_time = time.time() - start_time

            print(f"✅ Model loaded! Took {load_time:.1f}s")

            print("📊 Memory usage:")
            print(f"   CPU: {psutil.virtual_memory().used / 1024**3:.1f}GB")
            if self.gpu_available:
                gpu_allocated = torch.cuda.memory_allocated() / 1024**3
                print(f"   GPU: {gpu_allocated:.1f}GB")

            # hf_device_map values may be GPU indices (e.g. 0) or "cpu"/"disk".
            if hasattr(self.model, "hf_device_map"):
                devices = [str(dev) for dev in self.model.hf_device_map.values()]
                gpu_layers = sum(1 for dev in devices if dev not in ("cpu", "disk"))
                cpu_layers = sum(1 for dev in devices if dev == "cpu")
                print(f"🎯 Device assignment: GPU layers={gpu_layers}, CPU layers={cpu_layers}")

            return True

        except Exception as e:
            print(f"❌ Loading failed: {e}")
            print("🔄 Falling back to CPU mode...")
            return self.fallback_to_cpu()

    def fallback_to_cpu(self):
        """Fall back to CPU-only inference."""
        try:
            self.device = "cpu"
            torch.set_num_threads(6)

            self.model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
                self.model_path,
                torch_dtype=torch.float32,
                trust_remote_code=True,
                low_cpu_mem_usage=True,
            )

            # Materialize any weights still on the meta device.
            has_meta = any(p.device.type == "meta" for p in self.model.parameters())
            if has_meta:
                print("⚠️ CPU mode: handling meta-device weights...")
                self.model = self.model.to_empty(device="cpu")
            else:
                self.model = self.model.to("cpu")
            print("✅ CPU-mode load complete")

            self.model.eval()
            return True

        except Exception as e:
            print(f"❌ CPU mode failed as well: {e}")
            traceback.print_exc()
            return False

    def generate_response(self, prompt: str, max_tokens: int = 128) -> tuple:
        """Generate a response and return (text, stats)."""
        start_time = time.time()

        inputs = self.processor.tokenizer(
            prompt,
            return_tensors="pt",
            max_length=2048,
            truncation=True,
        )

        # With accelerate offloading, inputs go to the first GPU and hooks move
        # activations between devices; otherwise everything stays on the CPU.
        main_device = "cuda:0" if (self.gpu_available and hasattr(self.model, "hf_device_map")) else "cpu"

        if main_device == "cuda:0":
            inputs = {k: v.to(main_device) for k, v in inputs.items()}

        print(f"💭 Generating... (main device: {main_device})")

        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs.get("attention_mask"),
                max_new_tokens=max_tokens,
                do_sample=False,
                num_beams=1,
                pad_token_id=self.processor.tokenizer.eos_token_id,
                eos_token_id=self.processor.tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens.
        response = self.processor.tokenizer.decode(
            outputs[0][inputs["input_ids"].shape[1]:],
            skip_special_tokens=True,
        ).strip()

        gen_time = time.time() - start_time
        new_tokens = outputs.shape[1] - inputs["input_ids"].shape[1]
        tokens_per_sec = new_tokens / gen_time if gen_time > 0 else 0

        # Release intermediate tensors between requests.
        del inputs, outputs
        if self.gpu_available:
            torch.cuda.empty_cache()
        gc.collect()

        stats = {
            "generation_time": gen_time,
            "new_tokens": new_tokens,
            "tokens_per_second": tokens_per_sec,
            "main_device": main_device,
        }

        return response, stats

    def run_tests(self):
        """Run a small set of smoke tests."""
        test_prompts = [
            "你好,請用一句話介紹你自己。",  # "Hi, introduce yourself in one sentence."
            "什麼是人工智能?",  # "What is artificial intelligence?"
        ]

        print("\n🧪 Smart offloading tests...")
        print("-" * 50)

        total_tokens = 0
        total_time = 0

        for i, prompt in enumerate(test_prompts, 1):
            print(f"\n📝 Test {i}/{len(test_prompts)}: {prompt}")

            try:
                response, stats = self.generate_response(prompt, max_tokens=80)

                print(f"⚡ Speed: {stats['tokens_per_second']:.2f} tokens/s")
                print(f"📤 Response: {response}")

                total_tokens += stats["new_tokens"]
                total_time += stats["generation_time"]

            except Exception as e:
                print(f"❌ Test failed: {e}")
                print("🔍 Traceback:")
                traceback.print_exc()

        if total_time > 0:
            avg_speed = total_tokens / total_time
            print("\n📈 Offloading performance summary:")
            print(f"   Average speed: {avg_speed:.2f} tokens/s")
            print(f"   Total tokens: {total_tokens}")
            print(f"   Total time: {total_time:.2f}s")

            print(f"   Final CPU memory: {psutil.virtual_memory().used / 1024**3:.1f}GB")
            if self.gpu_available:
                print(f"   Final GPU memory: {torch.cuda.memory_allocated() / 1024**3:.1f}GB")

    def cleanup(self):
        """Release model resources and temporary offload files."""
        if self.model is not None:
            del self.model
        if self.processor is not None:
            del self.processor

        if self.gpu_available:
            torch.cuda.empty_cache()
        gc.collect()

        # Remove the disk offload cache written by accelerate, if present.
        if os.path.exists("./offload_cache"):
            shutil.rmtree("./offload_cache")

        print("🧹 Cleanup complete")


def main():
    runner = SmartOffloadingRunner()

    try:
        success = runner.load_model_with_smart_offloading()

        if success:
            runner.run_tests()

            print("\n🎉 Smart offloading tests finished!")
            print("💡 Tip: accelerate handles offloading automatically; GPU and CPU work together")
        else:
            print("💥 Model loading failed")

    except Exception as e:
        print(f"❌ Run failed: {e}")
        traceback.print_exc()

    finally:
        runner.cleanup()


if __name__ == "__main__":
    main()