Steven10429 committed
Commit 4b4a015 · verified · 1 Parent(s): ae59a60

Update app.py

Files changed (1): app.py (+218 −219)
app.py CHANGED
@@ -1,330 +1,329 @@
  import os
  import torch
  import psutil
- from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
- from peft import PeftModel, PeftConfig
  from pathlib import Path
- from tqdm import tqdm
  from huggingface_hub import login, create_repo, HfApi
- import subprocess
- import math
  import gradio as gr
- import threading
  import queue
  import time

- # Queue used to store log messages
  log_queue = queue.Queue()
  current_logs = []

-
  def log(msg):
-     """Unified logging helper."""
      print(msg)
      current_logs.append(msg)
      return "\n".join(current_logs)

  def get_model_size_in_gb(model_name):
-     """Estimate the model size in GB."""
      try:
-         # get the model size from Hugging Face
          api = HfApi()
          model_info = api.model_info(model_name)
          return model_info.safetensors.total / (1024 ** 3)
-
      except Exception as e:
-         log(f"Unable to estimate the model size: {str(e)}")
-         return 1  # bypass the memory check

  def check_system_resources(model_name):
-     """Check system resources and decide which device to use."""
-     log("Checking system resources...")
-
-     # Get system memory information
      system_memory = psutil.virtual_memory()
      total_memory_gb = system_memory.total / (1024 ** 3)
      available_memory_gb = system_memory.available / (1024 ** 3)

-     log(f"Total system memory: {total_memory_gb:.1f}GB")
-     log(f"Available memory: {available_memory_gb:.1f}GB")
-
-     # Estimate the memory required by the model
      model_size_gb = get_model_size_in_gb(model_name)
-     required_memory_gb = model_size_gb * 2.5  # extra memory needed for computation
-     log(f"Estimated memory required by the model: {required_memory_gb:.1f}GB")

-     # Check whether CUDA is available
      if torch.cuda.is_available():
          gpu_name = torch.cuda.get_device_name(0)
          gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
-         log(f"Detected GPU: {gpu_name}")
-         log(f"GPU memory: {gpu_memory_gb:.1f}GB")
-
          if gpu_memory_gb >= required_memory_gb:
-             log("✅ Sufficient GPU memory; the conversion will run on the GPU")
              return "cuda", gpu_memory_gb
          else:
-             log(f"⚠️ Insufficient GPU memory (requires {required_memory_gb:.1f}GB, found {gpu_memory_gb:.1f}GB)")
      else:
-         log("❌ No usable GPU detected")
-
-     # Check whether CPU memory is sufficient
      if available_memory_gb >= required_memory_gb:
-         log("✅ Sufficient CPU memory; the conversion will run on the CPU")
          return "cpu", available_memory_gb
      else:
-         raise MemoryError(f"❌ Insufficient system memory (requires {required_memory_gb:.1f}GB, available {available_memory_gb:.1f}GB)")

  def setup_environment(model_name):
-     # # Check system resources and decide which device to use
-     # device, available_memory = check_system_resources(model_name)
-     device = "cpu"
      return device

  def create_hf_repo(repo_name, private=True):
-     """Create a HuggingFace repository."""
      try:
-         # check whether the repository already exists
          api = HfApi()
          if api.repo_exists(repo_name):
-             log(f"Repository already exists: {repo_name}")
-             return ValueError(f"Repository already exists: {repo_name}; please use a different name or delete the existing repository")
          repo_url = create_repo(repo_name, private=private)
-         log(f"Repository created successfully: {repo_url}")
-         return repo_url
      except Exception as e:
-         log(f"Failed to create the repository: {str(e)}")
          raise

  def download_and_merge_model(base_model_name, lora_model_name, output_dir, device):
-     log(f"Loading the base model: {base_model_name}")

-     try:
-         # Load the original model first
-         base_model = AutoModelForCausalLM.from_pretrained(
-             base_model_name,
-             torch_dtype=torch.float16,
-             device_map={"": device}
-         )
-
-         old_vocab_size = base_model.get_input_embeddings().weight.shape[0]
-         print(f"Original vocabulary size: {old_vocab_size}")
-         # Load the tokenizer
-         tokenizer = AutoTokenizer.from_pretrained(base_model_name)
-
-         new_vocab_size = tokenizer.vocab_size
-         print(f"Resizing vocabulary: {old_vocab_size} -> {new_vocab_size}")
-
-         # Keep a copy of the original weights
-         old_embeddings = base_model.get_input_embeddings().weight.data.clone()
-         old_lm_head = base_model.lm_head.weight.data.clone()
-
-         # Resize the vocabulary
-         base_model.resize_token_embeddings(new_vocab_size)
-
-         # Copy the original weights into the new tensors
-         with torch.no_grad():
-             base_model.get_input_embeddings().weight.data[:new_vocab_size] = old_embeddings[:new_vocab_size]
-             base_model.lm_head.weight.data[:new_vocab_size] = old_lm_head[:new_vocab_size]
-
-         log(f"Loading the LoRA model: {lora_model_name}")
-         log("Base model config: " + str(base_model.config))
-
-         # Load the adapter config
-         adapter_config = PeftConfig.from_pretrained(lora_model_name)
-         log("Adapter config: " + str(adapter_config))
-
-         model = PeftModel.from_pretrained(base_model, lora_model_name)
-         log("Merging the LoRA weights")
-         model = model.merge_and_unload()

-         # Create the output directory
-         output_path = Path(output_dir)
-         output_path.mkdir(parents=True, exist_ok=True)

-         # Save the merged model
-         log(f"Saving the merged model to: {output_dir}")
-         model.save_pretrained(output_dir)
-         tokenizer.save_pretrained(output_dir)
-
-         return output_dir
-
-     except Exception as e:
-         log(f"Error: {str(e)}")
-         log(f"Error type: {type(e)}")
-         import traceback
-         log("Detailed traceback:")
-         log(traceback.format_exc())
-         raise

- def quantize_and_push_model(model_path, repo_id, bits=8):
-     """Quantize the model and push it to HuggingFace."""
-     try:
-         from optimum.bettertransformer import BetterTransformer
-         from transformers import AutoModelForCausalLM
-
-         log(f"Loading the model for {bits}-bit quantization...")
-         model = AutoModelForCausalLM.from_pretrained(model_path)
-         tokenizer = AutoTokenizer.from_pretrained(model_path)
-
-         # Convert to the BetterTransformer format
-         model = BetterTransformer.transform(model)
-
-         # Quantize
-         if bits == 8:
-             from transformers import BitsAndBytesConfig
-             quantization_config = BitsAndBytesConfig(
-                 load_in_8bit=True,
-                 llm_int8_threshold=6.0
-             )
-         elif bits == 4:
-             from transformers import BitsAndBytesConfig
-             quantization_config = BitsAndBytesConfig(
-                 load_in_4bit=True,
-                 bnb_4bit_compute_dtype=torch.float16,
-                 bnb_4bit_quant_type="nf4"
-             )
-         else:
-             raise ValueError(f"Unsupported quantization bit width: {bits}")
-
-         # Save the quantized model
-         quantized_model_path = f"{model_path}_q{bits}"
-         model.save_pretrained(
-             quantized_model_path,
-             quantization_config=quantization_config
-         )
-         tokenizer.save_pretrained(quantized_model_path)
-
-         # Push to HuggingFace
-         log(f"Pushing the {bits}-bit quantized model to HuggingFace...")
-         api = HfApi()
-         api.upload_folder(
-             folder_path=quantized_model_path,
-             repo_id=repo_id,
-             repo_type="model"
-         )
-         log(f"{bits}-bit quantized model upload complete")
-
-     except Exception as e:
-         log(f"Error during quantization or upload: {str(e)}")
-         raise

- def process_model(base_model, lora_model, repo_name, hf_token, progress=gr.Progress()):
-     """Main processing function for the Gradio interface."""
      try:
-         login(hf_token)  # I don't understand why logging in once isn't enough and the token also has to go into an environment variable
          os.environ["HF_TOKEN"] = hf_token
          api = HfApi(token=hf_token)
          username = api.whoami()["name"]
-         if repo_name == "Auto":
-             repo_name = username + "/" + base_model.split("/")[-1] + "_" + lora_model.split("/")[-1]
-
-         # Clear the previous logs
-         current_logs.clear()

-         # Set up the environment and check resources
-         device = setup_environment(base_model)

-         # Create the HuggingFace repository
-         repo_url = create_hf_repo(repo_name)
-
-         # Set the output directory
          output_dir = os.path.join(".", "output", repo_name)

-         progress(0.1, desc="Starting the model conversion pipeline...")
-         # Download and merge the model
-         model_path = download_and_merge_model(base_model, lora_model, output_dir, device)
-
-         # Push to HuggingFace
-         log(f"Pushing the model to HuggingFace...")
-
-         api.upload_folder(
              folder_path=model_path,
              repo_id=repo_name,
-             repo_type="model"
          )

-         progress(0.4, desc="Starting 8-bit quantization...")
-         # Quantize and upload the model
-         quantize_and_push_model(model_path, repo_name, bits=8)

-         progress(0.7, desc="Starting 4-bit quantization...")
-         quantize_and_push_model(model_path, repo_name, bits=4)

-         final_message = f"All done! The model has been uploaded to: https://huggingface.co/{repo_name}"
          log(final_message)
-         progress(1.0, desc="Processing complete")
-
-         # remove hf_token from env
-         os.environ.pop("HF_TOKEN")
-         log("HF_TOKEN has been removed from the environment variables")
-
-         # remove model_path
-         os.remove(model_path)
-         log(f"Model path deleted: {model_path}")
-
          return "\n".join(current_logs)
      except Exception as e:
-         error_message = f"Error during processing: {str(e)}"
          log(error_message)
          return "\n".join(current_logs)

  def create_ui():
-     """Create the Gradio interface."""
-     with gr.Blocks(title="Model Conversion Tool") as app:
          gr.Markdown("""
-         # 🤗 Model Conversion & Quantization Tool

-         This tool helps you:
-         1. Merge a base model with a LoRA adapter
-         2. Create 4-bit and 8-bit quantized versions
-         3. Automatically upload them to the HuggingFace Hub
          """)
-
          with gr.Row():
              with gr.Column():
                  base_model = gr.Textbox(
-                     label="Base model path",
-                     placeholder="e.g., Qwen/Qwen2.5-14B-Instruct",
                      value="Qwen/Qwen2.5-7B-Instruct"
                  )
                  lora_model = gr.Textbox(
-                     label="LoRA model path",
-                     placeholder="Enter the path to your LoRA model"
                  )
                  repo_name = gr.Textbox(
-                     label="HuggingFace repository name",
-                     placeholder="Enter the name of the repository to create",
                      value="Auto"
                  )
                  hf_token = gr.Textbox(
-                     label="HuggingFace Token",
-                     placeholder="Enter your HuggingFace Token",
                      value=os.getenv("HF_TOKEN")
                  )
-                 convert_btn = gr.Button("Start Conversion", variant="primary")
-
              with gr.Column():
                  output = gr.TextArea(
-                     label="Processing logs",
-                     placeholder="Processing logs will appear here...",
                      interactive=False,
                      autoscroll=True,
                      lines=20
                  )
-
-         # Set up event handlers
          convert_btn.click(
              fn=process_model,
-             inputs=[base_model, lora_model, repo_name, hf_token],
              outputs=output
          )
-
      return app

  if __name__ == "__main__":
-     # Create and launch the Gradio interface
      app = create_ui()
      app.queue()
-     app.launch()
 
  import os
  import torch
  import psutil
  from pathlib import Path
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from peft import PeftModel, PeftConfig
  from huggingface_hub import login, create_repo, HfApi
  import gradio as gr
  import queue
  import time

+ # Global log state
  log_queue = queue.Queue()
  current_logs = []

  def log(msg):
+     """Append a log message and print it."""
      print(msg)
      current_logs.append(msg)
      return "\n".join(current_logs)
 
+ def timeit(func):
+     def wrapper(*args, **kwargs):
+         start_time = time.time()
+         result = func(*args, **kwargs)
+         end_time = time.time()
+         log(f"{func.__name__}: {end_time - start_time:.2f} s")
+         return result
+     return wrapper
+
+ @timeit
  def get_model_size_in_gb(model_name):
+     """Estimate the model size in GB from Hugging Face Hub metadata."""
      try:
          api = HfApi()
          model_info = api.model_info(model_name)
+         # Use the safetensors size (does not assume a particular file extension)
          return model_info.safetensors.total / (1024 ** 3)
      except Exception as e:
+         log(f"Unable to estimate model size: {e}")
+         return 1  # default value
43
+ @timeit
44
  def check_system_resources(model_name):
45
+ """检查系统资源,决定使用 CPU 或 GPU"""
46
+ log("Checking system resources...")
 
 
47
  system_memory = psutil.virtual_memory()
48
  total_memory_gb = system_memory.total / (1024 ** 3)
49
  available_memory_gb = system_memory.available / (1024 ** 3)
50
+ log(f"Total system memory: {total_memory_gb:.1f}GB")
51
+ log(f"Available memory: {available_memory_gb:.1f}GB")
52
 
 
 
 
 
53
  model_size_gb = get_model_size_in_gb(model_name)
54
+ required_memory_gb = model_size_gb * 2.5 # 预留额外内存
55
+ log(f"Estimated required memory for model: {required_memory_gb:.1f}GB")
56
 
 
57
  if torch.cuda.is_available():
58
  gpu_name = torch.cuda.get_device_name(0)
59
  gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
60
+ log(f"Detected GPU: {gpu_name} with {gpu_memory_gb:.1f}GB memory")
 
 
61
  if gpu_memory_gb >= required_memory_gb:
62
+ log("✅ Sufficient GPU memory available; using GPU.")
63
  return "cuda", gpu_memory_gb
64
  else:
65
+ log(f"⚠️ Insufficient GPU memory (requires {required_memory_gb:.1f}GB, found {gpu_memory_gb:.1f}GB).")
66
  else:
67
+ log("❌ No GPU detected.")
68
+
 
69
  if available_memory_gb >= required_memory_gb:
70
+ log("✅ Sufficient CPU memory available; using CPU.")
71
  return "cpu", available_memory_gb
72
  else:
73
+ raise MemoryError(f"❌ Insufficient system memory (requires {required_memory_gb:.1f}GB, available {available_memory_gb:.1f}GB).")
74
 
+ @timeit
  def setup_environment(model_name):
+     """Select the device to use for the model conversion."""
+     try:
+         device, _ = check_system_resources(model_name)
+     except Exception as e:
+         log(f"Resource check failed: {e}. Defaulting to CPU.")
+         device = "cpu"
      return device
 
+ @timeit
  def create_hf_repo(repo_name, private=True):
+     """Create the Hugging Face repository, picking a free name if needed."""
      try:
          api = HfApi()
+         # If the repository already exists, append an index until the name is free
          if api.repo_exists(repo_name):
+             retry_index = 0
+             repo_name_with_index = repo_name
+             while api.repo_exists(repo_name_with_index):
+                 retry_index += 1
+                 log(f"Repository {repo_name_with_index} exists; trying {repo_name}_{retry_index}")
+                 repo_name_with_index = f"{repo_name}_{retry_index}"
+             repo_name = repo_name_with_index
          repo_url = create_repo(repo_name, private=private)
+         log(f"Repository created successfully: {repo_url}")
+         return repo_name
      except Exception as e:
+         log(f"Failed to create repository: {e}")
          raise
 
+ @timeit
  def download_and_merge_model(base_model_name, lora_model_name, output_dir, device):
+     """
+     Merge a LoRA adapter into its base model:
+     1. Load the base model.
+     2. Load the adapter's tokenizer and read its added tokens.
+     3. Resize the base model's embeddings to the adapter's vocabulary size.
+     4. Load the LoRA adapter and merge its weights into the base model.
+     5. Save the merged model and the adapter tokenizer to output_dir.
+     """
+     model = AutoModelForCausalLM.from_pretrained(base_model_name, low_cpu_mem_usage=True)
+     adapter_tokenizer = AutoTokenizer.from_pretrained(lora_model_name)
+     added_tokens_decoder = adapter_tokenizer.added_tokens_decoder
+     model.resize_token_embeddings(adapter_tokenizer.vocab_size + len(added_tokens_decoder))
+     model.load_adapter(lora_model_name, low_cpu_mem_usage=True)
+     model = model.merge_and_unload()
+     model.save_pretrained(output_dir)
+     adapter_tokenizer.save_pretrained(output_dir)
+     return output_dir
+
+ @timeit
+ def clone_llamacpp_and_download_build():
+     """Clone llama.cpp and build it from source."""
+     llamacpp_repo = "https://github.com/ggerganov/llama.cpp.git"
+     llamacpp_dir = os.path.join(os.getcwd(), "llama.cpp")
+
+     if not os.path.exists(llamacpp_dir):
+         log(f"Cloning llama.cpp from {llamacpp_repo}...")
+         os.system(f"git clone {llamacpp_repo} {llamacpp_dir}")
+
+     log("Building llama.cpp...")
+     build_dir = os.path.join(llamacpp_dir, "build")
+     os.makedirs(build_dir, exist_ok=True)
+
+     # Equivalent to running, from the llama.cpp checkout:
+     #   cmake -B build
+     #   cmake --build build --config Release
+     original_dir = os.getcwd()
+     os.chdir(llamacpp_dir)
+     os.system("cmake -B build")
+     os.system("cmake --build build --config Release")
+
+     log("llama.cpp build completed.")
+     # Return to the original directory
+     os.chdir(original_dir)
 
+ @timeit
+ def quantize_and_push_model(model_path, repo_id, quant_method=None):
+     """
+     Quantize the merged model with llama.cpp and upload the result to the Hugging Face Hub.
+
+     model_path is the directory containing the merged full-precision model. It is first
+     converted to an fp16 GGUF file (if one is not already present) and then quantized with
+     the given quant_method (e.g. Q2_K, Q4_K, Q8_0). Output files are written to
+     <model_path>/quantized/ and uploaded asynchronously.
+     """
+     # Use llama.cpp's conversion and quantization tools
+     llamacpp_dir = os.path.join(os.getcwd(), "llama.cpp")
+     if not os.path.exists(llamacpp_dir):
+         clone_llamacpp_and_download_build()

+     # Make sure the output directory exists
+     model_output_dir = f"{model_path}/quantized/"
+     os.makedirs(model_output_dir, exist_ok=True)

+     # The fp16 intermediate file is kept in the output directory
+     temp_gguf_path = os.path.join(model_output_dir, f"{repo_id.split('/')[-1]}-f16.gguf")
+
+     if not os.path.exists(temp_gguf_path):
+         log("Converting the model to GGUF format")
+         convert_script = os.path.join(llamacpp_dir, "convert_hf_to_gguf.py")
+         convert_cmd = f"python {convert_script} {model_path} --outfile {temp_gguf_path}"
+         os.system(convert_cmd)
+     else:
+         log("The GGUF intermediate file already exists; skipping conversion")

+     # The final quantized file is also kept in the output directory
+     final_path = os.path.join(model_output_dir, f"{repo_id.split('/')[-1]}-{quant_method}.gguf")
+     log(f"Running {quant_method} quantization")
+     quantize_bin = os.path.join(llamacpp_dir, "build", "bin", "llama-quantize")
+     quant_cmd = f"{quantize_bin} {temp_gguf_path} {final_path} {quant_method}"
+
+     if not os.path.exists(final_path):
+         os.system(quant_cmd)
+     else:
+         log(f"The {quant_method} quantized file already exists; skipping quantization")
+         return None
+
+     # Upload the quantized model to the Hugging Face Hub asynchronously
+     api = HfApi()
+     future = api.upload_file(
+         path_or_fileobj=final_path,
+         path_in_repo=os.path.basename(final_path),
+         repo_id=repo_id,
+         repo_type="model",
+         commit_message=f"Quantized {quant_method}",
+         commit_description=f"Quantized {model_path} with {quant_method}, using llama.cpp -> {quant_cmd}",
+         run_as_future=True
+     )
+     log(f"Upload of the {quant_method} quantized model has been scheduled (Future returned).")
+     return future
+
+ @timeit
+ def process_model(base_model_name, lora_model_name, repo_name, quant_methods, hf_token):
+     """
+     Main processing function:
+     1. Log in and create the Hugging Face repository (if needed).
+     2. Select the device.
+     3. Download the base model and merge the LoRA adapter into it.
+     4. Upload the merged model asynchronously.
+     5. Launch one quantization task per selected method.
+     6. Wait for all pending Futures, then return the logs.
+     """
      try:
+         current_logs.clear()
+         login(hf_token)
          os.environ["HF_TOKEN"] = hf_token
          api = HfApi(token=hf_token)
          username = api.whoami()["name"]
+         if repo_name.strip().lower() == "auto":
+             repo_name = f"{username}/{base_model_name.split('/')[-1]}_{lora_model_name.split('/')[-1]}"

+         device = setup_environment(base_model_name)
+         repo_name = create_hf_repo(repo_name)

          output_dir = os.path.join(".", "output", repo_name)
+         log("Starting model merge process...")
+         model_path = download_and_merge_model(base_model_name, lora_model_name, output_dir, device)

+         # Upload the merged model asynchronously
+         # (upload_folder supports run_as_future; the returned Future is awaited below)
+         log("Scheduling merged model upload...")
+         future_merge = api.upload_folder(
              folder_path=model_path,
              repo_id=repo_name,
+             repo_type="model",
+             run_as_future=True
          )

+         # Launch one quantization task per selected method
+         futures = []
+         for quant_method in quant_methods:
+             future = quantize_and_push_model(model_path, repo_name, quant_method=quant_method)
+             if future is not None:
+                 futures.append(future)
+         log("Background uploads are in progress; performing other tasks if needed...")

+         log("Waiting for merged model upload to complete...")
+         future_merge.result()
+         log("Merged model upload completed.")
+
+         for future in futures:
+             future.result()
+             log("Quantized model upload completed.")

+         final_message = f"All done! Model uploaded to: https://huggingface.co/{repo_name}"
          log(final_message)
+         os.environ.pop("HF_TOKEN", None)
+         log("HF_TOKEN removed from environment variables.")
          return "\n".join(current_logs)
      except Exception as e:
+         error_message = f"Error during processing: {e}"
          log(error_message)
          return "\n".join(current_logs)
 
+ @timeit
  def create_ui():
+     """Create the Gradio interface (log display only)."""
+     with gr.Blocks(title="Model Merge & Quantization Tool") as app:
          gr.Markdown("""
+         # 🤗 Model Merge and Quantization Tool

+         This tool merges a base model with a LoRA adapter, creates quantized GGUF versions
+         for the selected methods (Q2_K, Q4_K, IQ4_NL, Q5_K_M, Q6_K, Q8_0), and uploads them to the Hugging Face Hub.
          """)
          with gr.Row():
              with gr.Column():
                  base_model = gr.Textbox(
+                     label="Base Model Path",
+                     placeholder="e.g., Qwen/Qwen2.5-14B-Instruct",
                      value="Qwen/Qwen2.5-7B-Instruct"
                  )
                  lora_model = gr.Textbox(
+                     label="LoRA Model Path",
+                     placeholder="Enter the path to your LoRA model"
                  )
                  repo_name = gr.Textbox(
+                     label="Hugging Face Repository Name",
+                     placeholder="Enter the repository name to create",
                      value="Auto"
                  )
+                 quant_method = gr.CheckboxGroup(
+                     choices=["Q2_K", "Q4_K", "IQ4_NL", "Q5_K_M", "Q6_K", "Q8_0"],
+                     value=["Q4_K", "Q8_0"],
+                     label="Quantization Method"
+                 )
                  hf_token = gr.Textbox(
+                     label="Hugging Face Token",
+                     placeholder="Enter your Hugging Face Token",
                      value=os.getenv("HF_TOKEN")
                  )
+                 convert_btn = gr.Button("Start Conversion", variant="primary")
              with gr.Column():
                  output = gr.TextArea(
+                     label="Logs",
+                     placeholder="Processing logs will appear here...",
                      interactive=False,
                      autoscroll=True,
                      lines=20
                  )
          convert_btn.click(
              fn=process_model,
+             inputs=[base_model, lora_model, repo_name, quant_method, hf_token],
              outputs=output
          )
      return app

+
  if __name__ == "__main__":
      app = create_ui()
      app.queue()
+     app.launch()
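
For reference, the GGUF conversion and quantization that quantize_and_push_model drives through os.system can also be written with explicit subprocess calls that fail loudly. The following is a minimal sketch, not the app's code: it assumes llama.cpp has already been cloned and built into ./llama.cpp/build as in clone_llamacpp_and_download_build(), and the helper name convert_and_quantize, its model_dir parameter, the output file names, and the default Q4_K_M preset are illustrative.

import os
import subprocess

def convert_and_quantize(model_dir: str, quant_method: str = "Q4_K_M") -> str:
    """Convert a merged HF model directory to GGUF and quantize it with llama.cpp."""
    llamacpp_dir = os.path.join(os.getcwd(), "llama.cpp")
    convert_script = os.path.join(llamacpp_dir, "convert_hf_to_gguf.py")
    quantize_bin = os.path.join(llamacpp_dir, "build", "bin", "llama-quantize")

    out_dir = os.path.join(model_dir, "quantized")
    os.makedirs(out_dir, exist_ok=True)
    f16_gguf = os.path.join(out_dir, "model-f16.gguf")
    quant_gguf = os.path.join(out_dir, f"model-{quant_method}.gguf")

    # Produce the full-precision GGUF first, then apply the requested quantization preset.
    if not os.path.exists(f16_gguf):
        subprocess.run(["python", convert_script, model_dir, "--outfile", f16_gguf], check=True)
    subprocess.run([quantize_bin, f16_gguf, quant_gguf, quant_method], check=True)
    return quant_gguf

With check=True, a failed conversion or quantization raises immediately instead of letting a later upload step run against a missing file, which plain os.system calls do not guarantee.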