Update app.py
app.py CHANGED
@@ -8,6 +8,8 @@ from huggingface_hub import login, create_repo, HfApi
 import gradio as gr
 import queue
 import time
+import shutil
+
 
 # Global log
 log_queue = queue.Queue()
@@ -46,9 +48,7 @@ def check_system_resources(model_name):
     log("Checking system resources...")
     system_memory = psutil.virtual_memory()
     total_memory_gb = system_memory.total / (1024 ** 3)
-    available_memory_gb = system_memory.available / (1024 ** 3)
     log(f"Total system memory: {total_memory_gb:.1f}GB")
-    log(f"Available memory: {available_memory_gb:.1f}GB")
 
     model_size_gb = get_model_size_in_gb(model_name)
     required_memory_gb = model_size_gb * 2.5  # reserve extra memory
@@ -66,9 +66,9 @@ def check_system_resources(model_name):
     else:
         log("❌ No GPU detected.")
 
-        if available_memory_gb >= required_memory_gb:
+        if total_memory_gb >= required_memory_gb:
             log("✅ Sufficient CPU memory available; using CPU.")
-            return "cpu", available_memory_gb
+            return "cpu", total_memory_gb
         else:
             raise MemoryError(f"❌ Insufficient system memory (requires {required_memory_gb:.1f}GB, available {available_memory_gb:.1f}GB).")
 
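Note on this hunk: after the previous hunk, available_memory_gb is no longer computed, but the unchanged raise MemoryError(...) context line still interpolates it, so the insufficient-memory branch would now fail with a NameError rather than the intended message. A minimal sketch of a self-consistent CPU branch, keeping the names used in the diff (log is assumed from the rest of app.py; the helper name below is ours, not from the commit):

def cpu_fallback(total_memory_gb, required_memory_gb):
    # Sketch only: CPU branch consistent with the total-memory check
    # introduced in this commit.
    if total_memory_gb >= required_memory_gb:
        log("✅ Sufficient CPU memory available; using CPU.")
        return "cpu", total_memory_gb
    raise MemoryError(
        f"❌ Insufficient system memory "
        f"(requires {required_memory_gb:.1f}GB, total {total_memory_gb:.1f}GB)."
    )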
@@ -117,10 +117,9 @@ def download_and_merge_model(base_model_name, lora_model_name, output_dir, devic
     model = AutoModelForCausalLM.from_pretrained(base_model_name, low_cpu_mem_usage=True)
     log("Loading adapter tokenizer...")
     adapter_tokenizer = AutoTokenizer.from_pretrained(lora_model_name)
-
-
-
-    model.resize_token_embeddings(adapter_tokenizer.vocab_size + len(added_tokens_decoder))
+    log("Resizing token embeddings...")
+    added_tokens_decoder = adapter_tokenizer.added_tokens_decoder
+    model.resize_token_embeddings(adapter_tokenizer.vocab_size + len(added_tokens_decoder))
     log("Loading LoRA adapter...")
     peft_model = PeftModel.from_pretrained(model, lora_model_name, low_cpu_mem_usage=True)
     log("Merging and unloading model...")
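Aside on the resize above: tokenizer.vocab_size excludes added tokens, which is why the commit adds len(added_tokens_decoder) back in. With a standard transformers tokenizer, len(adapter_tokenizer) already reports the full vocabulary including added tokens, so an equivalent sketch (assuming the usual AutoTokenizer behaviour; this is not what the commit does) would be:

# Sketch: resize to the tokenizer's full vocabulary size;
# len(tokenizer) counts the base vocab plus tokens in added_tokens_decoder.
model.resize_token_embeddings(len(adapter_tokenizer))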
@@ -157,7 +156,9 @@ def clone_llamacpp_and_download_build():
     log("llama.cpp build completed.")
     # Return to the original directory
     os.chdir(os.path.dirname(llamacpp_dir))
-
+
+def remove_illegal_chars_in_path(text):
+    return text.replace(".", "_").replace(":", "_").replace("/", "_")
 
 @timeit
 def quantize(model_path, repo_id, quant_method=None):
@@ -180,12 +181,13 @@ def quantize(model_path, repo_id, quant_method=None):
     os.makedirs(model_output_dir, exist_ok=True)
 
     # The intermediate file is kept in the model_output directory
-
+    guff_16 = os.path.join(model_output_dir, f"{repo_id}-f16.gguf")
 
-    if not os.path.exists(
+    if not os.path.exists(guff_16):
         log(f"Converting model to GGML format")
         convert_script = os.path.join(llamacpp_dir, "convert_hf_to_gguf.py")
-        convert_cmd = f"python {convert_script} {model_path} --outfile {
+        convert_cmd = f"python {convert_script} {model_path} --outfile {guff_16}"
+        print(f"syscall:[{convert_cmd}]")
         os.system(convert_cmd)
     else:
         log(f"GGML intermediate file already exists, skipping conversion")
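The return value of os.system(convert_cmd) is ignored here, so a failed conversion only surfaces later when the f16 file is missing. A minimal alternative sketch using subprocess.run with the variables defined in this hunk (the error handling is our addition, not part of app.py):

import subprocess

# Sketch: run convert_hf_to_gguf.py and fail loudly on a non-zero exit
# (convert_script, model_path and guff_16 as defined in the hunk above).
result = subprocess.run(
    ["python", convert_script, str(model_path), "--outfile", guff_16],
    capture_output=True,
    text=True,
)
if result.returncode != 0:
    raise RuntimeError(f"GGUF conversion failed:\n{result.stderr}")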
@@ -194,7 +196,8 @@ def quantize(model_path, repo_id, quant_method=None):
     final_path = os.path.join(model_output_dir, f"{repo_id}-{quant_method}.gguf")
     log(f"Running {quant_method} quantization")
     quantize_bin = os.path.join(llamacpp_dir, "build", "bin", "llama-quantize")
-    quant_cmd = f"{quantize_bin} {
+    quant_cmd = f"{quantize_bin} {guff_16} {final_path} {quant_method}"
+    print(f"syscall:[{quant_cmd}]")
 
     if not os.path.exists(final_path):
         os.system(quant_cmd)
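To make the command concrete, with hypothetical values repo_id = "my-model" and quant_method = "Q4_K_M", the f-strings in this and the previous hunk expand roughly as sketched below:

# Illustration only, hypothetical values:
#   guff_16    -> os.path.join(model_output_dir, "my-model-f16.gguf")
#   final_path -> os.path.join(model_output_dir, "my-model-Q4_K_M.gguf")
#   quant_cmd  -> "<llamacpp_dir>/build/bin/llama-quantize <guff_16> <final_path> Q4_K_M"
# i.e. llama-quantize takes the f16 GGUF input, the output path, and the quant type.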
@@ -226,7 +229,8 @@ def process_model(base_model_name, lora_model_name, repo_name, quant_methods, hf
         adapter_config = PeftConfig.from_pretrained(lora_model_name)
         base_model_name = adapter_config.base_model_name_or_path
         if repo_name.strip().lower() == "auto":
-            repo_name = f"{
+            repo_name = f"{base_model_name.split('/')[-1]}_{lora_model_name.split('/')[-1]}"
+            repo_name = remove_illegal_chars_in_path(repo_name)
 
         device = setup_environment(base_model_name)
         repo_name = create_hf_repo(repo_name)
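For illustration, with hypothetical model names the "auto" branch above derives a repository name as follows (the inputs are made up; the two assignments mirror the hunk):

# Hypothetical inputs:
base_model_name = "Qwen/Qwen2.5-0.5B"
lora_model_name = "someuser/my-lora-adapter"

repo_name = f"{base_model_name.split('/')[-1]}_{lora_model_name.split('/')[-1]}"
# -> "Qwen2.5-0.5B_my-lora-adapter"
repo_name = remove_illegal_chars_in_path(repo_name)
# -> "Qwen2_5-0_5B_my-lora-adapter"  (the helper replaces ".", ":" and "/")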
@@ -238,15 +242,20 @@ def process_model(base_model_name, lora_model_name, repo_name, quant_methods, hf
 
         # Quantize the model
         for quant_method in quant_methods:
-            quantize(
+            quantize(output_dir, repo_name, quant_method=quant_method)
 
         # Upload the merged model and the quantized models
         api.upload_large_folder(
             folder_path=model_path,
             repo_id=repo_name,
             repo_type="model",
-            num_workers=4,
+            num_workers=os.cpu_count() if os.cpu_count() > 4 else 4,
+            print_report_every=10,
         )
+
+        # rm -rf model_path
+        shutil.rmtree(model_path)
+
         return "\n".join(current_logs)
     except Exception as e:
         error_message = f"Error during processing: {e}"
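Two notes on the tail of this hunk. The worker count expression picks at least four workers, and shutil.rmtree(model_path) only runs when the upload succeeds. If the intent is to always reclaim the merged-model directory, a try/finally variant such as the sketch below would do that; this is an alternative, not what the commit does, and keeping the files on failure may be preferable for debugging:

# Sketch only: same upload, but the local folder is removed even on failure.
try:
    api.upload_large_folder(
        folder_path=model_path,
        repo_id=repo_name,
        repo_type="model",
        num_workers=max(os.cpu_count() or 4, 4),  # at least 4 workers
        print_report_every=10,
    )
finally:
    shutil.rmtree(model_path, ignore_errors=True)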