|
#include "llama-quant.h" |
|
|
|
#include "llama-impl.h" |
|
#include "llama-model.h" |
|
#include "llama-model-loader.h" |
|
|
|
#include <algorithm> |
|
#include <cmath> |
|
#include <cstring> |
|
#include <cinttypes> |
|
#include <fstream> |
|
#include <mutex> |
|
#include <thread> |
|
#include <unordered_map> |
|
|
|
// write n zero bytes to the file - used to reserve space (e.g. for GGUF metadata that is written later)
static void zeros(std::ofstream & file, size_t n) {
|
char zero = 0; |
|
for (size_t i = 0; i < n; ++i) { |
|
file.write(&zero, 1); |
|
} |
|
} |
|
|
|
// bookkeeping state shared across the quantization of a single model
struct quantize_state_impl {
|
const llama_model & model; |
|
const llama_model_quantize_params * params; |
|
|
|
int n_attention_wv = 0; |
|
int n_ffn_down = 0; |
|
int n_ffn_gate = 0; |
|
int n_ffn_up = 0; |
|
int i_attention_wv = 0; |
|
int i_ffn_down = 0; |
|
int i_ffn_gate = 0; |
|
int i_ffn_up = 0; |
|
|
|
int n_k_quantized = 0; |
|
int n_fallback = 0; |
|
|
|
bool has_imatrix = false; |
|
|
|
|
|
    // used to figure out if a model shares tok_embd with the output weight
    bool has_output = false;
|
|
|
quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params) |
|
: model(model) |
|
, params(params) |
|
{} |
|
}; |
|
|
|
// dequantize/convert a tensor to F32, splitting the work across up to nthread threads
static void llama_tensor_dequantize_impl(
|
struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers, |
|
const size_t nelements, const int nthread |
|
) { |
|
if (output.size() < nelements) { |
|
output.resize(nelements); |
|
} |
|
float * f32_output = (float *) output.data(); |
|
|
|
const ggml_type_traits * qtype = ggml_get_type_traits(tensor->type); |
|
if (ggml_is_quantized(tensor->type)) { |
|
if (qtype->to_float == NULL) { |
|
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type))); |
|
} |
|
} else if (tensor->type != GGML_TYPE_F16 && |
|
tensor->type != GGML_TYPE_BF16) { |
|
throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type))); |
|
} |
|
|
|
if (nthread < 2) { |
|
if (tensor->type == GGML_TYPE_F16) { |
|
ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements); |
|
} else if (tensor->type == GGML_TYPE_BF16) { |
|
ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements); |
|
} else if (ggml_is_quantized(tensor->type)) { |
|
qtype->to_float(tensor->data, f32_output, nelements); |
|
} else { |
|
GGML_ABORT("fatal error"); |
|
} |
|
return; |
|
} |
|
|
|
size_t block_size; |
|
if (tensor->type == GGML_TYPE_F16 || |
|
tensor->type == GGML_TYPE_BF16) { |
|
block_size = 1; |
|
} else { |
|
block_size = (size_t)ggml_blck_size(tensor->type); |
|
} |
|
|
|
size_t block_size_bytes = ggml_type_size(tensor->type); |
|
|
|
GGML_ASSERT(nelements % block_size == 0); |
|
size_t nblocks = nelements / block_size; |
|
size_t blocks_per_thread = nblocks / nthread; |
|
size_t spare_blocks = nblocks - (blocks_per_thread * nthread); |
|
|
|
size_t in_buff_offs = 0; |
|
size_t out_buff_offs = 0; |
|
|
|
for (int tnum = 0; tnum < nthread; tnum++) { |
|
size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); |
|
size_t thr_elems = thr_blocks * block_size; |
|
size_t thr_block_bytes = thr_blocks * block_size_bytes; |
|
|
|
auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) { |
|
if (typ == GGML_TYPE_F16) { |
|
ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels); |
|
} else if (typ == GGML_TYPE_BF16) { |
|
ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels); |
|
} else { |
|
qtype->to_float(inbuf, outbuf, nels); |
|
} |
|
}; |
|
workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems); |
|
in_buff_offs += thr_block_bytes; |
|
out_buff_offs += thr_elems; |
|
} |
|
for (auto & w : workers) { w.join(); } |
|
workers.clear(); |
|
} |
|
|
|
// choose the quantization type for a single tensor based on its name, shape, the target ftype and the model architecture
static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
|
const std::string name = ggml_get_name(tensor); |
|
|
|
|
|
const llm_arch arch = qs.model.arch; |
|
const auto tn = LLM_TN(arch); |
|
|
|
auto use_more_bits = [](int i_layer, int n_layers) -> bool { |
|
return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2; |
|
}; |
|
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert); |
|
auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) { |
|
if (n_expert > 1) { |
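            // for MoE models the expert tensors are not laid out strictly consecutively, so the layer index
            // cannot be derived from a running counter - parse it directly from the tensor name ("blk.<i>.")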
|
|
|
|
|
|
|
|
|
if (sscanf(name, "blk.%d.", &i_layer) != 1) { |
|
throw std::runtime_error(format("Failed to determine layer for tensor %s", name)); |
|
} |
|
if (i_layer < 0 || i_layer >= n_layer) { |
|
throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer)); |
|
} |
|
} |
|
return std::make_pair(i_layer, n_layer); |
|
}; |
|
|
|
|
|
|
|
    // for arches that share the same tensor between the token embeddings and the output, quantize
    // the token embeddings with the same type as the output tensor
    if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
|
if (qs.params->output_tensor_type < GGML_TYPE_COUNT) { |
|
new_type = qs.params->output_tensor_type; |
|
} else { |
|
const int64_t nx = tensor->ne[0]; |
|
const int64_t qk_k = ggml_blck_size(new_type); |
|
|
|
if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) { |
|
new_type = GGML_TYPE_Q8_0; |
|
} |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || |
|
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || |
|
ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { |
|
new_type = GGML_TYPE_Q5_K; |
|
} |
|
else if (new_type != GGML_TYPE_Q8_0) { |
|
new_type = GGML_TYPE_Q6_K; |
|
} |
|
} |
|
} else if (name == "token_embd.weight") { |
|
if (qs.params->token_embedding_type < GGML_TYPE_COUNT) { |
|
new_type = qs.params->token_embedding_type; |
|
} else { |
|
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || |
|
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { |
|
new_type = GGML_TYPE_Q2_K; |
|
} |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { |
|
new_type = GGML_TYPE_IQ3_S; |
|
} |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { |
|
new_type = GGML_TYPE_IQ3_S; |
|
} |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) { |
|
new_type = GGML_TYPE_Q4_K; |
|
} |
|
} |
|
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || |
|
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { |
|
if (name.find("attn_v.weight") != std::string::npos) { |
|
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; |
|
else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; |
|
++qs.i_attention_wv; |
|
} |
|
else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) { |
|
new_type = GGML_TYPE_Q4_K; |
|
} |
|
else if (name.find("ffn_down") != std::string::npos) { |
|
if (qs.i_ffn_down < qs.n_ffn_down/8) { |
|
new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; |
|
} |
|
++qs.i_ffn_down; |
|
} |
|
else if (name.find("attn_output.weight") != std::string::npos) { |
|
if (qs.model.hparams.n_expert == 8) { |
|
new_type = GGML_TYPE_Q5_K; |
|
} else { |
|
if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS; |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S; |
|
} |
|
} |
|
} else if (name.find("attn_v.weight") != std::string::npos) { |
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { |
|
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; |
|
} |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) { |
|
new_type = GGML_TYPE_Q4_K; |
|
} |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { |
|
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; |
|
} |
|
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) { |
|
new_type = GGML_TYPE_Q4_K; |
|
} |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { |
|
new_type = GGML_TYPE_Q4_K; |
|
} |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { |
|
new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; |
|
} |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; |
|
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) { |
|
new_type = GGML_TYPE_Q5_K; |
|
} |
|
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && |
|
use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K; |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; |
|
if (qs.model.type == LLM_TYPE_70B) { |
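            // with grouped-query attention the attn_v tensor is much smaller than attn_q,
            // so spending more bits on it is cheap and noticeably improves quality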
|
|
|
|
|
|
|
if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K; |
|
} |
|
if (qs.model.hparams.n_expert == 8) { |
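            // for 8-expert (Mixtral-style) models, using Q8_0 for attn_v adds relatively little size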
|
|
|
|
|
new_type = GGML_TYPE_Q8_0; |
|
} |
|
++qs.i_attention_wv; |
|
} else if (name.find("attn_k.weight") != std::string::npos) { |
|
if (qs.model.hparams.n_expert == 8) { |
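            // same reasoning as for attn_v above: Q8_0 for attn_k is cheap in 8-expert models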
|
|
|
|
|
new_type = GGML_TYPE_Q8_0; |
|
} |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { |
|
new_type = GGML_TYPE_IQ3_XXS; |
|
} |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { |
|
new_type = GGML_TYPE_IQ2_S; |
|
} |
|
} else if (name.find("attn_q.weight") != std::string::npos) { |
|
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { |
|
new_type = GGML_TYPE_IQ3_XXS; |
|
} |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { |
|
new_type = GGML_TYPE_IQ2_S; |
|
} |
|
} else if (name.find("ffn_down") != std::string::npos) { |
|
auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); |
|
int i_layer = info.first, n_layer = info.second; |
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { |
|
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K; |
|
} |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) { |
|
new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; |
|
} |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { |
|
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K |
|
: arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K |
|
: GGML_TYPE_Q3_K; |
|
} |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 || |
|
(qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) { |
|
new_type = GGML_TYPE_Q4_K; |
|
} |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { |
|
new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; |
|
} |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { |
|
if (arch == LLM_ARCH_FALCON) { |
|
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K : |
|
use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; |
|
} else { |
|
if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; |
|
} |
|
} |
|
else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) { |
|
new_type = GGML_TYPE_Q5_K; |
|
} |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) { |
|
new_type = GGML_TYPE_Q5_K; |
|
} |
|
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0) |
|
&& qs.has_imatrix && i_layer < n_layer/8) { |
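            // guard the first ffn_down layers of Q4_0/Q5_0 quantizations: when an imatrix is
            // available, use the Q4_1/Q5_1 variants there to avoid quality blow-ups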
|
|
|
|
|
|
|
new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1; |
|
} |
|
++qs.i_ffn_down; |
|
} else if (name.find("attn_output.weight") != std::string::npos) { |
|
if (arch != LLM_ARCH_FALCON) { |
|
if (qs.model.hparams.n_expert == 8) { |
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || |
|
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || |
|
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || |
|
ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) { |
|
new_type = GGML_TYPE_Q5_K; |
|
} |
|
} else { |
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S; |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K; |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K; |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K; |
|
} |
|
} else { |
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; |
|
} |
|
} |
|
else if (name.find("attn_qkv.weight") != std::string::npos) { |
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { |
|
new_type = GGML_TYPE_Q4_K; |
|
} |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; |
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; |
|
} |
|
else if (name.find("ffn_gate") != std::string::npos) { |
|
auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str()); |
|
int i_layer = info.first, n_layer = info.second; |
|
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { |
|
new_type = GGML_TYPE_IQ3_XXS; |
|
} |
|
++qs.i_ffn_gate; |
|
} |
|
else if (name.find("ffn_up") != std::string::npos) { |
|
auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str()); |
|
int i_layer = info.first, n_layer = info.second; |
|
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { |
|
new_type = GGML_TYPE_IQ3_XXS; |
|
} |
|
++qs.i_ffn_up; |
|
    }

bool convert_incompatible_tensor = false; |
|
{ |
|
const int64_t nx = tensor->ne[0]; |
|
const int64_t ny = tensor->ne[1]; |
|
const int64_t qk_k = ggml_blck_size(new_type); |
|
|
|
if (nx % qk_k != 0) { |
|
LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type)); |
|
convert_incompatible_tensor = true; |
|
} else { |
|
++qs.n_k_quantized; |
|
} |
|
} |
|
|
|
if (convert_incompatible_tensor) { |
|
switch (new_type) { |
|
case GGML_TYPE_TQ1_0: |
|
case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; |
|
case GGML_TYPE_IQ2_XXS: |
|
case GGML_TYPE_IQ2_XS: |
|
case GGML_TYPE_IQ2_S: |
|
case GGML_TYPE_IQ3_XXS: |
|
case GGML_TYPE_IQ3_S: |
|
case GGML_TYPE_IQ1_S: |
|
case GGML_TYPE_IQ1_M: |
|
case GGML_TYPE_Q2_K: |
|
case GGML_TYPE_Q3_K: |
|
case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break; |
|
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break; |
|
case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break; |
|
case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break; |
|
default: throw std::runtime_error("\nUnsupported tensor size encountered\n"); |
|
} |
|
if (tensor->ne[0] % ggml_blck_size(new_type) != 0) { |
|
new_type = GGML_TYPE_F16; |
|
} |
|
LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type)); |
|
++qs.n_fallback; |
|
} |
|
|
|
return new_type; |
|
} |
|
|
|
// quantize nrows rows of F32 data into new_data, processed in chunks of chunk_size elements across nthread threads;
// validates every produced row and returns the number of bytes written
static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
|
if (nthread < 2) { |
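        // single-threaded path: quantize the whole tensor in one call and validate the result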
|
|
|
size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix); |
|
if (!ggml_validate_row_data(new_type, new_data, new_size)) { |
|
throw std::runtime_error("quantized data validation failed"); |
|
} |
|
return new_size; |
|
} |
|
|
|
std::mutex mutex; |
|
int64_t counter = 0; |
|
size_t new_size = 0; |
|
bool valid = true; |
|
auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size, |
|
nrows, n_per_row, imatrix]() { |
|
const int64_t nrows_per_chunk = chunk_size / n_per_row; |
|
size_t local_size = 0; |
|
while (true) { |
|
std::unique_lock<std::mutex> lock(mutex); |
|
int64_t first_row = counter; counter += nrows_per_chunk; |
|
if (first_row >= nrows) { |
|
if (local_size > 0) { |
|
new_size += local_size; |
|
} |
|
break; |
|
} |
|
lock.unlock(); |
|
const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk); |
|
size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix); |
|
local_size += this_size; |
|
|
|
|
|
const size_t row_size = ggml_row_size(new_type, n_per_row); |
|
void * this_data = (char *) new_data + first_row * row_size; |
|
if (!ggml_validate_row_data(new_type, this_data, this_size)) { |
|
std::unique_lock<std::mutex> lock(mutex); |
|
valid = false; |
|
break; |
|
} |
|
} |
|
}; |
|
for (int it = 0; it < nthread - 1; ++it) { |
|
workers.emplace_back(compute); |
|
} |
|
compute(); |
|
for (auto & w : workers) { w.join(); } |
|
workers.clear(); |
|
if (!valid) { |
|
throw std::runtime_error("quantized data validation failed"); |
|
} |
|
return new_size; |
|
} |
|
|
|
// main entry point for model quantization: reads fname_inp, quantizes the tensors and writes fname_out
static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
|
ggml_type default_type; |
|
llama_ftype ftype = params->ftype; |
|
|
|
switch (params->ftype) { |
|
case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break; |
|
case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break; |
|
case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break; |
|
case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break; |
|
case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break; |
|
case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break; |
|
case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break; |
|
case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break; |
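        // K-quants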
|
|
|
|
|
case LLAMA_FTYPE_MOSTLY_Q2_K_S: |
|
case LLAMA_FTYPE_MOSTLY_Q2_K: default_type = GGML_TYPE_Q2_K; break; |
|
case LLAMA_FTYPE_MOSTLY_IQ3_XS: default_type = GGML_TYPE_IQ3_S; break; |
|
case LLAMA_FTYPE_MOSTLY_Q3_K_S: |
|
case LLAMA_FTYPE_MOSTLY_Q3_K_M: |
|
case LLAMA_FTYPE_MOSTLY_Q3_K_L: default_type = GGML_TYPE_Q3_K; break; |
|
case LLAMA_FTYPE_MOSTLY_Q4_K_S: |
|
case LLAMA_FTYPE_MOSTLY_Q4_K_M: default_type = GGML_TYPE_Q4_K; break; |
|
case LLAMA_FTYPE_MOSTLY_Q5_K_S: |
|
case LLAMA_FTYPE_MOSTLY_Q5_K_M: default_type = GGML_TYPE_Q5_K; break; |
|
case LLAMA_FTYPE_MOSTLY_Q6_K: default_type = GGML_TYPE_Q6_K; break; |
|
case LLAMA_FTYPE_MOSTLY_TQ1_0: default_type = GGML_TYPE_TQ1_0; break; |
|
case LLAMA_FTYPE_MOSTLY_TQ2_0: default_type = GGML_TYPE_TQ2_0; break; |
|
case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break; |
|
case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = GGML_TYPE_IQ2_XS; break; |
|
case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = GGML_TYPE_IQ2_XS; break; |
|
case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break; |
|
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break; |
|
case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break; |
|
case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break; |
|
case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break; |
|
case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; |
|
case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; |
|
case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break; |
|
|
|
default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); |
|
} |
|
|
|
int nthread = params->nthread; |
|
|
|
if (nthread <= 0) { |
|
nthread = std::thread::hardware_concurrency(); |
|
} |
|
|
|
|
|
|
|
    // mmap consistently increases speed on Linux, and also on Windows with a hot cache;
    // it is not used on other platforms, where it may cause slowdowns
#if defined(__linux__) || defined(_WIN32)
|
constexpr bool use_mmap = true; |
|
#else |
|
constexpr bool use_mmap = false; |
|
#endif |
|
|
|
llama_model_kv_override * kv_overrides = nullptr; |
|
if (params->kv_overrides) { |
|
auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides; |
|
kv_overrides = v->data(); |
|
} |
|
|
|
std::vector<std::string> splits = {}; |
|
llama_model_loader ml(fname_inp, splits, use_mmap, true, kv_overrides); |
|
ml.init_mappings(false); |
|
|
|
llama_model model(llama_model_default_params()); |
|
|
|
model.load_arch (ml); |
|
model.load_hparams(ml); |
|
model.load_stats (ml); |
|
|
|
struct quantize_state_impl qs(model, params); |
|
|
|
if (params->only_copy) { |
|
ftype = ml.ftype; |
|
} |
|
const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr; |
|
if (params->imatrix) { |
|
imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix); |
|
if (imatrix_data) { |
|
LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size())); |
|
qs.has_imatrix = true; |
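            // reject imatrix data that contains NaN or Inf values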
|
|
|
for (const auto & kv : *imatrix_data) { |
|
for (float f : kv.second) { |
|
if (!std::isfinite(f)) { |
|
throw std::runtime_error(format("imatrix contains non-finite value %f\n", f)); |
|
} |
|
} |
|
} |
|
} |
|
} |
|
|
|
const size_t align = GGUF_DEFAULT_ALIGNMENT; |
|
gguf_context_ptr ctx_out { gguf_init_empty() }; |
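    // copy the KV pairs from the input file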
|
|
|
|
|
gguf_set_kv (ctx_out.get(), ml.meta.get()); |
|
gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); |
|
gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); |
|
|
|
|
|
gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str()); |
|
gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str()); |
|
gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str()); |
|
|
|
if (params->kv_overrides) { |
|
const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides; |
|
for (const auto & o : overrides) { |
|
if (o.key[0] == 0) break; |
|
if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) { |
|
gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64); |
|
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) { |
|
gguf_set_val_i32(ctx_out.get(), o.key, o.val_i64); |
|
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) { |
|
gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool); |
|
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) { |
|
gguf_set_val_str(ctx_out.get(), o.key, o.val_str); |
|
} else { |
|
LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key); |
|
} |
|
} |
|
} |
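    // make a list of all the tensors in the model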
|
|
|
|
|
std::vector<const llama_model_loader::llama_tensor_weight *> tensors; |
|
tensors.reserve(ml.weights_map.size()); |
|
for (const auto & it : ml.weights_map) { |
|
tensors.push_back(&it.second); |
|
} |
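    // keep_split requires that the weights are sorted by split index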
|
|
|
|
|
if (params->keep_split) { |
|
std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) { |
|
if (a->idx == b->idx) { |
|
return a->offs < b->offs; |
|
} |
|
return a->idx < b->idx; |
|
}); |
|
} |
|
|
|
for (const auto * it : tensors) { |
|
const struct ggml_tensor * tensor = it->tensor; |
|
|
|
const std::string name = ggml_get_name(tensor); |
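        // count the attention tensors and check whether the model has a separate output tensor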
|
|
|
|
|
if (name.find("attn_v.weight") != std::string::npos || |
|
name.find("attn_qkv.weight") != std::string::npos || |
|
name.find("attn_kv_b.weight")!= std::string::npos) { |
|
++qs.n_attention_wv; |
|
} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) { |
|
qs.has_output = true; |
|
} |
|
} |
|
|
|
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer; |
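    // sanity check: the number of attention tensors should match the number of attention layers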
|
|
|
|
|
if (qs.n_attention_wv != 0) |
|
{ |
|
const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin(); |
|
|
|
int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0); |
|
if (llama_model_has_encoder(&model)) { |
|
n_attn_layer *= 3; |
|
} |
|
GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected"); |
|
} |
|
|
|
size_t total_size_org = 0; |
|
size_t total_size_new = 0; |
|
|
|
std::vector<std::thread> workers; |
|
workers.reserve(nthread); |
|
|
|
int idx = 0; |
|
|
|
std::vector<no_init<uint8_t>> read_data; |
|
std::vector<no_init<uint8_t>> work; |
|
std::vector<no_init<float>> f32_conv_buf; |
|
|
|
uint16_t n_split = 1; |
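    // when keeping splits, the number of output shards is the largest input split index + 1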
|
|
|
|
|
if (params->keep_split) { |
|
for (const auto * it : tensors) { |
|
n_split = std::max(uint16_t(it->idx + 1), n_split); |
|
} |
|
} |
|
std::vector<gguf_context_ptr> ctx_outs(n_split); |
|
ctx_outs[0] = std::move(ctx_out); |
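    // populate the original tensors so we get an initial metadata layout for each shard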
|
|
|
|
|
for (const auto * it : tensors) { |
|
uint16_t i_split = params->keep_split ? it->idx : 0; |
|
struct ggml_tensor * tensor = it->tensor; |
|
if (!ctx_outs[i_split]) { |
|
ctx_outs[i_split].reset(gguf_init_empty()); |
|
} |
|
gguf_add_tensor(ctx_outs[i_split].get(), tensor); |
|
} |
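    // set the split metadata on every shard when producing a split output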
|
|
|
|
|
if (n_split > 1) { |
|
for (size_t i = 0; i < ctx_outs.size(); ++i) { |
|
gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i); |
|
gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split); |
|
gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors); |
|
} |
|
} |
|
|
|
int cur_split = -1; |
|
std::ofstream fout; |
|
auto close_ofstream = [&]() { |
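        // write the (now final) metadata back to the beginning of the file, then close it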
|
|
|
if (fout.is_open()) { |
|
fout.seekp(0); |
|
std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split].get())); |
|
gguf_get_meta_data(ctx_outs[cur_split].get(), data.data()); |
|
fout.write((const char *) data.data(), data.size()); |
|
fout.close(); |
|
} |
|
}; |
|
auto new_ofstream = [&](int index) { |
|
cur_split = index; |
|
        GGML_ASSERT(ctx_outs[cur_split] && "Found uninitialized gguf_context");
|
std::string fname = fname_out; |
|
if (params->keep_split) { |
|
std::vector<char> split_path(llama_path_max(), 0); |
|
llama_split_path(split_path.data(), split_path.size(), fname_out.c_str(), cur_split, n_split); |
|
fname = std::string(split_path.data()); |
|
} |
|
|
|
fout = std::ofstream(fname, std::ios::binary); |
|
fout.exceptions(std::ofstream::failbit); |
|
const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split].get()); |
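        // write zeros as a placeholder for the metadata - the real data is written by close_ofstream()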
|
|
|
::zeros(fout, meta_size); |
|
}; |
|
|
|
const auto tn = LLM_TN(model.arch); |
|
new_ofstream(0); |
|
for (const auto * it : tensors) { |
|
const auto & weight = *it; |
|
struct ggml_tensor * tensor = weight.tensor; |
|
if (weight.idx != cur_split && params->keep_split) { |
|
close_ofstream(); |
|
new_ofstream(weight.idx); |
|
} |
|
|
|
const std::string name = ggml_get_name(tensor); |
|
|
|
if (!ml.use_mmap) { |
|
if (read_data.size() < ggml_nbytes(tensor)) { |
|
read_data.resize(ggml_nbytes(tensor)); |
|
} |
|
tensor->data = read_data.data(); |
|
} |
|
ml.load_data_for(tensor); |
|
|
|
LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ", |
|
++idx, ml.n_tensors, |
|
ggml_get_name(tensor), |
|
llama_format_tensor_shape(tensor).c_str(), |
|
ggml_type_name(tensor->type)); |
|
|
|
|
|
        // only tensor names ending in "weight" are candidates for quantization
        bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'
|
|
|
|
|
        // quantize only 2D and 3D tensors (experts)
        quantize &= (ggml_n_dims(tensor) >= 2);
|
|
|
|
|
quantize &= name.find("_norm.weight") == std::string::npos; |
|
|
|
quantize &= params->quantize_output_tensor || name != "output.weight"; |
|
quantize &= !params->only_copy; |
|
|
|
|
|
|
|
quantize &= name.find("ffn_gate_inp.weight") == std::string::npos; |
|
|
|
|
|
        // do not quantize positional embeddings and token types (BERT)
        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
|
|
|
|
|
|
|
quantize &= name.find("ssm_conv1d.weight") == std::string::npos; |
|
|
|
|
|
quantize &= name.find("time_mix_first.weight") == std::string::npos; |
|
quantize &= name.find("time_mix_w1.weight") == std::string::npos; |
|
quantize &= name.find("time_mix_w2.weight") == std::string::npos; |
|
quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos; |
|
quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos; |
|
quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos; |
|
|
|
|
|
quantize &= name.find("attn_rel_b.weight") == std::string::npos; |
|
|
|
enum ggml_type new_type; |
|
void * new_data; |
|
size_t new_size; |
|
|
|
if (quantize) { |
|
new_type = default_type; |
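            // pick a (possibly) better quantization type based on the tensor name, shape and architecture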
|
|
|
|
|
if (!params->pure && ggml_is_quantized(default_type)) { |
|
new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); |
|
} |
|
if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { |
|
new_type = params->token_embedding_type; |
|
} |
|
if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { |
|
new_type = params->output_tensor_type; |
|
} |
|
|
|
|
|
|
|
            // if we've decided to quantize to the same type the tensor already has, there is nothing to do
            quantize = tensor->type != new_type;
|
} |
|
|
|
if (!quantize) { |
|
new_type = tensor->type; |
|
new_data = tensor->data; |
|
new_size = ggml_nbytes(tensor); |
|
LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0); |
|
} else { |
|
const int64_t nelements = ggml_nelements(tensor); |
|
|
|
const float * imatrix = nullptr; |
|
if (imatrix_data) { |
|
auto it = imatrix_data->find(tensor->name); |
|
if (it == imatrix_data->end()) { |
|
LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name); |
|
} else { |
|
if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) { |
|
imatrix = it->second.data(); |
|
} else { |
|
LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__, |
|
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name); |
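                        // a size mismatch between the imatrix and the tensor is a hard error,
                        // except for token_embd which commonly triggers this warning and is allowed to proceed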
|
|
|
|
|
|
|
|
|
|
|
if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) { |
|
throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s", |
|
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name)); |
|
} |
|
} |
|
} |
|
} |
|
if ((new_type == GGML_TYPE_IQ2_XXS || |
|
new_type == GGML_TYPE_IQ2_XS || |
|
new_type == GGML_TYPE_IQ2_S || |
|
new_type == GGML_TYPE_IQ1_S || |
|
(new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) || |
|
(new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) { |
|
LLAMA_LOG_ERROR("\n\n============================================================\n"); |
|
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); |
|
LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n"); |
|
LLAMA_LOG_ERROR("============================================================\n\n"); |
|
throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name)); |
|
} |
|
|
|
float * f32_data; |
|
|
|
if (tensor->type == GGML_TYPE_F32) { |
|
f32_data = (float *) tensor->data; |
|
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) { |
|
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type))); |
|
} else { |
|
llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread); |
|
f32_data = (float *) f32_conv_buf.data(); |
|
} |
|
|
|
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type)); |
|
fflush(stdout); |
|
|
|
if (work.size() < (size_t)nelements * 4) { |
|
work.resize(nelements * 4); |
|
} |
|
new_data = work.data(); |
|
|
|
const int64_t n_per_row = tensor->ne[0]; |
|
const int64_t nrows = tensor->ne[1]; |
|
|
|
static const int64_t min_chunk_size = 32 * 512; |
|
const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)); |
|
|
|
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1]; |
|
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size; |
|
const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1; |
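            // quantize each expert slice (ne[2]) separately, since each can have its own importance matrix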
|
|
|
|
|
new_size = 0; |
|
for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) { |
|
const float * f32_data_03 = f32_data + i03 * nelements_matrix; |
|
void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows; |
|
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr; |
|
|
|
new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use); |
|
} |
|
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); |
|
} |
|
total_size_org += ggml_nbytes(tensor); |
|
total_size_new += new_size; |
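        // update the gguf metadata as we go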
|
|
|
|
|
gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type); |
|
GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size); |
|
gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data); |
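        // write tensor data + padding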
|
|
|
|
|
fout.write((const char *) new_data, new_size); |
|
zeros(fout, GGML_PAD(new_size, align) - new_size); |
|
} |
|
close_ofstream(); |
|
|
|
LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); |
|
LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); |
|
|
|
if (qs.n_fallback > 0) { |
|
LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n", |
|
__func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback); |
|
} |
|
}

//
// interface implementation
//

struct llama_model_quantize_params llama_model_quantize_default_params() { |
|
    struct llama_model_quantize_params result = {
        /*.nthread                 =*/ 0,
        /*.ftype                   =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
        /*.output_tensor_type      =*/ GGML_TYPE_COUNT,
        /*.token_embedding_type    =*/ GGML_TYPE_COUNT,
        /*.allow_requantize        =*/ false,
        /*.quantize_output_tensor  =*/ true,
        /*.only_copy               =*/ false,
        /*.pure                    =*/ false,
        /*.keep_split              =*/ false,
        /*.imatrix                 =*/ nullptr,
        /*.kv_overrides            =*/ nullptr,
    };
|
|
|
return result; |
|
} |
|
|
|
uint32_t llama_model_quantize( |
|
const char * fname_inp, |
|
const char * fname_out, |
|
const llama_model_quantize_params * params) { |
|
try { |
|
llama_model_quantize_impl(fname_inp, fname_out, params); |
|
} catch (const std::exception & err) { |
|
LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what()); |
|
return 1; |
|
} |
|
|
|
return 0; |
|
} |
|
|