|
#ifndef __T5_HPP__ |
|
#define __T5_HPP__ |
|
|
|
#include <float.h>

#include <algorithm>
#include <cmath>
#include <limits>
#include <map>
#include <memory>
#include <regex>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

#include "darts.h"
#include "ggml_extend.hpp"
#include "json.hpp"
#include "model.h"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// SentencePiece-style "metaspace" pre-tokenizer: splits the input on ASCII
// spaces and rejoins the pieces with `replacement` (e.g. U+2581 for T5),
// optionally prepending one `replacement` to the whole string.
class MetaspacePreTokenizer {
private:
    std::string replacement;  // string substituted for every space separator
    bool add_prefix_space;    // if true, prepend `replacement` to the output

public:
    // Note: `replacement` is taken by const reference (the original copy-by-
    // value const parameter forced an extra std::string copy).
    MetaspacePreTokenizer(const std::string& replacement = " ", bool add_prefix_space = true)
        : replacement(replacement), add_prefix_space(add_prefix_space) {}

    // Returns `input` with every ' ' separator replaced by `replacement`,
    // optionally prefixed with one leading `replacement`. Empty fields
    // (e.g. from leading/consecutive spaces) are preserved as separators.
    std::string tokenize(const std::string& input) const {
        std::string tokens;
        std::stringstream ss(input);

        if (add_prefix_space) {
            tokens += replacement;
        }

        std::string token;
        bool first_token = true;
        while (std::getline(ss, token, ' ')) {
            if (first_token) {
                tokens += token;
                first_token = false;
            } else {
                tokens += replacement + token;
            }
        }

        return tokens;
    }
};
|
|
|
using EncodeResult = std::vector<std::pair<std::string, int>>; |
|
class T5UniGramTokenizer { |
|
public: |
|
enum Status { |
|
OK, |
|
NO_PIECES_LOADED, |
|
NO_ENTRY_FOUND, |
|
BUILD_DOUBLE_ARRAY_FAILED, |
|
PIECE_ALREADY_DEFINED, |
|
INVLIAD_JSON |
|
}; |
|
|
|
protected: |
|
MetaspacePreTokenizer pre_tokenizer; |
|
|
|
|
|
std::vector<std::pair<std::string, float>> piece_score_pairs; |
|
|
|
float min_score_ = 0.0; |
|
float max_score_ = 0.0; |
|
std::unique_ptr<Darts::DoubleArray> trie_; |
|
|
|
|
|
|
|
int trie_results_size_; |
|
|
|
int unk_id_ = 2; |
|
std::string eos_token_ = "</s>"; |
|
int eos_id_ = 1; |
|
int pad_id_ = 0; |
|
|
|
Status status_ = OK; |
|
|
|
float kUnkPenalty = 10.0; |
|
|
|
std::string replacement; |
|
bool add_prefix_space = true; |
|
|
|
void InitializePieces(const std::string& json_str) { |
|
nlohmann::json data; |
|
|
|
try { |
|
data = nlohmann::json::parse(json_str); |
|
} catch (const nlohmann::json::parse_error& e) { |
|
status_ = INVLIAD_JSON; |
|
return; |
|
} |
|
if (!data.contains("model")) { |
|
status_ = INVLIAD_JSON; |
|
return; |
|
} |
|
nlohmann::json model = data["model"]; |
|
if (!model.contains("vocab")) { |
|
status_ = INVLIAD_JSON; |
|
return; |
|
} |
|
if (model.contains("unk_id")) { |
|
unk_id_ = model["unk_id"]; |
|
} |
|
|
|
replacement = data["pre_tokenizer"]["replacement"]; |
|
add_prefix_space = data["pre_tokenizer"]["add_prefix_space"]; |
|
|
|
pre_tokenizer = MetaspacePreTokenizer(replacement, add_prefix_space); |
|
|
|
for (const auto& item : model["vocab"]) { |
|
if (item.size() != 2 || !item[0].is_string() || !item[1].is_number_float()) { |
|
status_ = INVLIAD_JSON; |
|
return; |
|
} |
|
std::string piece = item[0]; |
|
float score = item[1]; |
|
piece_score_pairs.emplace_back(piece, score); |
|
} |
|
} |
|
|
|
|
|
void BuildTrie(std::vector<std::pair<std::string, int>>* pieces) { |
|
if (status_ != OK) |
|
return; |
|
|
|
if (pieces->empty()) { |
|
status_ = NO_PIECES_LOADED; |
|
return; |
|
} |
|
|
|
|
|
|
|
sort(pieces->begin(), pieces->end()); |
|
|
|
|
|
std::vector<const char*> key(pieces->size()); |
|
std::vector<int> value(pieces->size()); |
|
for (size_t i = 0; i < pieces->size(); ++i) { |
|
key[i] = (*pieces)[i].first.data(); |
|
value[i] = (*pieces)[i].second; |
|
} |
|
|
|
trie_ = std::unique_ptr<Darts::DoubleArray>(new Darts::DoubleArray()); |
|
if (trie_->build(key.size(), const_cast<char**>(&key[0]), nullptr, |
|
&value[0]) != 0) { |
|
status_ = BUILD_DOUBLE_ARRAY_FAILED; |
|
return; |
|
} |
|
|
|
|
|
const int kMaxTrieResultsSize = 1024; |
|
std::vector<Darts::DoubleArray::result_pair_type> results( |
|
kMaxTrieResultsSize); |
|
trie_results_size_ = 0; |
|
for (const auto& p : *pieces) { |
|
const int num_nodes = trie_->commonPrefixSearch( |
|
p.first.data(), results.data(), results.size(), p.first.size()); |
|
trie_results_size_ = std::max(trie_results_size_, num_nodes); |
|
} |
|
|
|
if (trie_results_size_ == 0) |
|
status_ = NO_ENTRY_FOUND; |
|
} |
|
|
|
|
|
inline float GetScoreInlined(int id) const { |
|
return piece_score_pairs[id].second; |
|
} |
|
|
|
inline bool IsUnusedInlined(int id) const { |
|
return false; |
|
} |
|
|
|
inline bool IsUserDefinedInlined(int id) const { |
|
return false; |
|
} |
|
|
|
inline size_t OneCharLen(const char* src) const { |
|
return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*src & 0xFF) >> 4]; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
EncodeResult EncodeOptimized(const std::string& normalized) const { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (status() != OK || normalized.empty()) { |
|
return {}; |
|
} |
|
|
|
struct BestPathNode { |
|
int id = -1; |
|
float best_path_score = |
|
0; |
|
int starts_at = |
|
-1; |
|
|
|
}; |
|
const int size = normalized.size(); |
|
const float unk_score = min_score() - kUnkPenalty; |
|
|
|
std::vector<BestPathNode> best_path_ends_at(size + 1); |
|
|
|
int starts_at = 0; |
|
while (starts_at < size) { |
|
std::size_t node_pos = 0; |
|
std::size_t key_pos = starts_at; |
|
const auto best_path_score_till_here = |
|
best_path_ends_at[starts_at].best_path_score; |
|
bool has_single_node = false; |
|
const int mblen = |
|
std::min<int>(OneCharLen(normalized.data() + starts_at), |
|
size - starts_at); |
|
while (key_pos < size) { |
|
const int ret = |
|
trie_->traverse(normalized.data(), node_pos, key_pos, key_pos + 1); |
|
if (ret == -2) |
|
break; |
|
if (ret >= 0) { |
|
if (IsUnusedInlined(ret)) |
|
continue; |
|
|
|
auto& target_node = best_path_ends_at[key_pos]; |
|
const auto length = (key_pos - starts_at); |
|
|
|
const auto score = IsUserDefinedInlined(ret) |
|
? (length * max_score_ - 0.1) |
|
: GetScoreInlined(ret); |
|
const auto candidate_best_path_score = |
|
score + best_path_score_till_here; |
|
if (target_node.starts_at == -1 || |
|
candidate_best_path_score > target_node.best_path_score) { |
|
target_node.best_path_score = candidate_best_path_score; |
|
target_node.starts_at = starts_at; |
|
target_node.id = ret; |
|
} |
|
if (!has_single_node && length == mblen) { |
|
has_single_node = true; |
|
} |
|
} |
|
} |
|
if (!has_single_node) { |
|
auto& target_node = best_path_ends_at[starts_at + mblen]; |
|
const auto candidate_best_path_score = |
|
unk_score + best_path_score_till_here; |
|
if (target_node.starts_at == -1 || |
|
candidate_best_path_score > target_node.best_path_score) { |
|
target_node.best_path_score = candidate_best_path_score; |
|
target_node.starts_at = starts_at; |
|
target_node.id = unk_id_; |
|
} |
|
} |
|
|
|
starts_at += mblen; |
|
} |
|
|
|
EncodeResult results; |
|
int ends_at = size; |
|
while (ends_at > 0) { |
|
const auto& node = best_path_ends_at[ends_at]; |
|
results.emplace_back( |
|
normalized.substr(node.starts_at, ends_at - node.starts_at), node.id); |
|
ends_at = node.starts_at; |
|
} |
|
std::reverse(results.begin(), results.end()); |
|
return results; |
|
} |
|
|
|
public: |
|
explicit T5UniGramTokenizer(const std::string& json_str = "") { |
|
if (json_str.size() != 0) { |
|
InitializePieces(json_str); |
|
} else { |
|
InitializePieces(ModelLoader::load_t5_tokenizer_json()); |
|
} |
|
|
|
min_score_ = FLT_MAX; |
|
max_score_ = FLT_MIN; |
|
|
|
std::vector<std::pair<std::string, int>> pieces; |
|
for (int i = 0; i < piece_score_pairs.size(); i++) { |
|
const auto& sp = piece_score_pairs[i]; |
|
|
|
min_score_ = std::min(min_score_, sp.second); |
|
max_score_ = std::max(max_score_, sp.second); |
|
|
|
pieces.emplace_back(sp.first, i); |
|
} |
|
|
|
BuildTrie(&pieces); |
|
} |
|
~T5UniGramTokenizer(){}; |
|
|
|
std::string Normalize(const std::string& input) const { |
|
|
|
|
|
std::string normalized = std::regex_replace(input, std::regex(" {2,}"), " "); |
|
return normalized; |
|
} |
|
|
|
std::vector<int> Encode(const std::string& input, bool append_eos_if_not_present = true) const { |
|
std::string normalized = Normalize(input); |
|
normalized = pre_tokenizer.tokenize(normalized); |
|
EncodeResult result = EncodeOptimized(normalized); |
|
if (result.size() > 0 && append_eos_if_not_present) { |
|
auto item = result[result.size() - 1]; |
|
if (item.first != eos_token_) { |
|
result.emplace_back(eos_token_, eos_id_); |
|
} |
|
} |
|
std::vector<int> tokens; |
|
for (auto item : result) { |
|
tokens.push_back(item.second); |
|
} |
|
return tokens; |
|
} |
|
|
|
void pad_tokens(std::vector<int>& tokens, |
|
std::vector<float>& weights, |
|
size_t max_length = 0, |
|
bool padding = false) { |
|
if (max_length > 0 && padding) { |
|
size_t orig_token_num = tokens.size() - 1; |
|
size_t n = std::ceil(orig_token_num * 1.0 / (max_length - 1)); |
|
if (n == 0) { |
|
n = 1; |
|
} |
|
size_t length = max_length * n; |
|
LOG_DEBUG("token length: %llu", length); |
|
std::vector<int> new_tokens; |
|
std::vector<float> new_weights; |
|
int token_idx = 0; |
|
for (int i = 0; i < length; i++) { |
|
if (token_idx >= orig_token_num) { |
|
break; |
|
} |
|
if (i % max_length == max_length - 1) { |
|
new_tokens.push_back(eos_id_); |
|
new_weights.push_back(1.0); |
|
} else { |
|
new_tokens.push_back(tokens[token_idx]); |
|
new_weights.push_back(weights[token_idx]); |
|
token_idx++; |
|
} |
|
} |
|
|
|
new_tokens.push_back(eos_id_); |
|
new_weights.push_back(1.0); |
|
tokens = new_tokens; |
|
weights = new_weights; |
|
|
|
if (padding) { |
|
int pad_token_id = pad_id_; |
|
tokens.insert(tokens.end(), length - tokens.size(), pad_token_id); |
|
weights.insert(weights.end(), length - weights.size(), 1.0); |
|
} |
|
} |
|
} |
|
|
|
|
|
|
|
float min_score() const { return min_score_; } |
|
|
|
|
|
|
|
float max_score() const { return max_score_; } |
|
|
|
Status status() const { return status_; } |
|
}; |
|
|
|
class T5LayerNorm : public UnaryBlock { |
|
protected: |
|
int64_t hidden_size; |
|
float eps; |
|
|
|
void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") { |
|
enum ggml_type wtype = GGML_TYPE_F32; |
|
params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); |
|
} |
|
|
|
public: |
|
T5LayerNorm(int64_t hidden_size, |
|
float eps = 1e-06f) |
|
: hidden_size(hidden_size), |
|
eps(eps) {} |
|
|
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { |
|
struct ggml_tensor* w = params["weight"]; |
|
x = ggml_rms_norm(ctx, x, eps); |
|
x = ggml_mul(ctx, x, w); |
|
return x; |
|
} |
|
}; |
|
|
|
struct T5DenseActDense : public UnaryBlock { |
|
public: |
|
T5DenseActDense(int64_t model_dim, int64_t ff_dim) { |
|
blocks["wi"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false)); |
|
blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false)); |
|
} |
|
|
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { |
|
|
|
auto wi = std::dynamic_pointer_cast<Linear>(blocks["wi"]); |
|
auto wo = std::dynamic_pointer_cast<Linear>(blocks["wo"]); |
|
|
|
x = wi->forward(ctx, x); |
|
x = ggml_relu_inplace(ctx, x); |
|
x = wo->forward(ctx, x); |
|
return x; |
|
} |
|
}; |
|
|
|
struct T5DenseGatedActDense : public UnaryBlock { |
|
public: |
|
T5DenseGatedActDense(int64_t model_dim, int64_t ff_dim) { |
|
blocks["wi_0"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false)); |
|
blocks["wi_1"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false)); |
|
blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false)); |
|
} |
|
|
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { |
|
|
|
auto wi_0 = std::dynamic_pointer_cast<Linear>(blocks["wi_0"]); |
|
auto wi_1 = std::dynamic_pointer_cast<Linear>(blocks["wi_1"]); |
|
auto wo = std::dynamic_pointer_cast<Linear>(blocks["wo"]); |
|
|
|
auto hidden_gelu = ggml_gelu_inplace(ctx, wi_0->forward(ctx, x)); |
|
auto hidden_linear = wi_1->forward(ctx, x); |
|
x = ggml_mul_inplace(ctx, hidden_gelu, hidden_linear); |
|
x = wo->forward(ctx, x); |
|
return x; |
|
} |
|
}; |
|
|
|
struct T5LayerFF : public UnaryBlock { |
|
public: |
|
T5LayerFF(int64_t model_dim, int64_t ff_dim) { |
|
blocks["DenseReluDense"] = std::shared_ptr<GGMLBlock>(new T5DenseGatedActDense(model_dim, ff_dim)); |
|
blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim)); |
|
} |
|
|
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { |
|
|
|
auto DenseReluDense = std::dynamic_pointer_cast<T5DenseGatedActDense>(blocks["DenseReluDense"]); |
|
auto layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]); |
|
|
|
auto forwarded_states = layer_norm->forward(ctx, x); |
|
forwarded_states = DenseReluDense->forward(ctx, forwarded_states); |
|
x = ggml_add_inplace(ctx, forwarded_states, x); |
|
return x; |
|
} |
|
}; |
|
|
|
class T5Attention : public GGMLBlock { |
|
protected: |
|
int64_t model_dim; |
|
int64_t inner_dim; |
|
int64_t num_heads; |
|
bool using_relative_attention_bias; |
|
int64_t relative_attention_num_buckets = 32; |
|
int64_t relative_attention_max_distance = 128; |
|
|
|
public: |
|
T5Attention(int64_t model_dim, |
|
int64_t inner_dim, |
|
int64_t num_heads, |
|
bool using_relative_attention_bias = false) |
|
: model_dim(model_dim), |
|
inner_dim(inner_dim), |
|
num_heads(num_heads), |
|
using_relative_attention_bias(using_relative_attention_bias) { |
|
blocks["q"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false)); |
|
blocks["k"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false)); |
|
blocks["v"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false)); |
|
blocks["o"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, model_dim, false)); |
|
if (using_relative_attention_bias) { |
|
blocks["relative_attention_bias"] = std::shared_ptr<GGMLBlock>(new Embedding(relative_attention_num_buckets, num_heads)); |
|
} |
|
} |
|
|
|
struct ggml_tensor* compute_bias(struct ggml_context* ctx, |
|
struct ggml_tensor* relative_position_bucket) { |
|
auto relative_attention_bias = std::dynamic_pointer_cast<Embedding>(blocks["relative_attention_bias"]); |
|
|
|
auto values = relative_attention_bias->forward(ctx, relative_position_bucket); |
|
values = ggml_cont(ctx, ggml_permute(ctx, values, 2, 0, 1, 3)); |
|
return values; |
|
} |
|
|
|
|
|
std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx, |
|
struct ggml_tensor* x, |
|
struct ggml_tensor* past_bias = NULL, |
|
struct ggml_tensor* mask = NULL, |
|
struct ggml_tensor* relative_position_bucket = NULL) { |
|
auto q_proj = std::dynamic_pointer_cast<Linear>(blocks["q"]); |
|
auto k_proj = std::dynamic_pointer_cast<Linear>(blocks["k"]); |
|
auto v_proj = std::dynamic_pointer_cast<Linear>(blocks["v"]); |
|
auto out_proj = std::dynamic_pointer_cast<Linear>(blocks["o"]); |
|
|
|
int64_t n_head = num_heads; |
|
int64_t d_head = inner_dim / n_head; |
|
|
|
auto q = q_proj->forward(ctx, x); |
|
auto k = k_proj->forward(ctx, x); |
|
auto v = v_proj->forward(ctx, x); |
|
|
|
if (using_relative_attention_bias && relative_position_bucket != NULL) { |
|
past_bias = compute_bias(ctx, relative_position_bucket); |
|
} |
|
if (past_bias != NULL) { |
|
if (mask != NULL) { |
|
mask = ggml_add(ctx, mask, past_bias); |
|
} else { |
|
mask = past_bias; |
|
} |
|
} |
|
|
|
k = ggml_scale_inplace(ctx, k, sqrt(d_head)); |
|
|
|
x = ggml_nn_attention_ext(ctx, q, k, v, num_heads, mask); |
|
|
|
x = out_proj->forward(ctx, x); |
|
return {x, past_bias}; |
|
} |
|
}; |
|
|
|
struct T5LayerSelfAttention : public GGMLBlock { |
|
public: |
|
T5LayerSelfAttention(int64_t model_dim, |
|
int64_t inner_dim, |
|
int64_t ff_dim, |
|
int64_t num_heads, |
|
bool using_relative_attention_bias) { |
|
blocks["SelfAttention"] = std::shared_ptr<GGMLBlock>(new T5Attention(model_dim, inner_dim, num_heads, using_relative_attention_bias)); |
|
blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim)); |
|
} |
|
|
|
std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx, |
|
struct ggml_tensor* x, |
|
struct ggml_tensor* past_bias = NULL, |
|
struct ggml_tensor* mask = NULL, |
|
struct ggml_tensor* relative_position_bucket = NULL) { |
|
|
|
auto SelfAttention = std::dynamic_pointer_cast<T5Attention>(blocks["SelfAttention"]); |
|
auto layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]); |
|
|
|
auto normed_hidden_state = layer_norm->forward(ctx, x); |
|
auto ret = SelfAttention->forward(ctx, normed_hidden_state, past_bias, mask, relative_position_bucket); |
|
auto output = ret.first; |
|
past_bias = ret.second; |
|
|
|
x = ggml_add_inplace(ctx, output, x); |
|
return {x, past_bias}; |
|
} |
|
}; |
|
|
|
struct T5Block : public GGMLBlock { |
|
public: |
|
T5Block(int64_t model_dim, int64_t inner_dim, int64_t ff_dim, int64_t num_heads, bool using_relative_attention_bias) { |
|
blocks["layer.0"] = std::shared_ptr<GGMLBlock>(new T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, using_relative_attention_bias)); |
|
blocks["layer.1"] = std::shared_ptr<GGMLBlock>(new T5LayerFF(model_dim, ff_dim)); |
|
} |
|
|
|
std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx, |
|
struct ggml_tensor* x, |
|
struct ggml_tensor* past_bias = NULL, |
|
struct ggml_tensor* mask = NULL, |
|
struct ggml_tensor* relative_position_bucket = NULL) { |
|
|
|
auto layer_0 = std::dynamic_pointer_cast<T5LayerSelfAttention>(blocks["layer.0"]); |
|
auto layer_1 = std::dynamic_pointer_cast<T5LayerFF>(blocks["layer.1"]); |
|
|
|
auto ret = layer_0->forward(ctx, x, past_bias, mask, relative_position_bucket); |
|
x = ret.first; |
|
past_bias = ret.second; |
|
x = layer_1->forward(ctx, x); |
|
return {x, past_bias}; |
|
} |
|
}; |
|
|
|
struct T5Stack : public GGMLBlock { |
|
int64_t num_layers; |
|
|
|
public: |
|
T5Stack(int64_t num_layers, |
|
int64_t model_dim, |
|
int64_t inner_dim, |
|
int64_t ff_dim, |
|
int64_t num_heads) |
|
: num_layers(num_layers) { |
|
for (int i = 0; i < num_layers; i++) { |
|
blocks["block." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new T5Block(model_dim, inner_dim, ff_dim, num_heads, i == 0)); |
|
} |
|
|
|
blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim)); |
|
} |
|
|
|
struct ggml_tensor* forward(struct ggml_context* ctx, |
|
struct ggml_tensor* x, |
|
struct ggml_tensor* past_bias = NULL, |
|
struct ggml_tensor* attention_mask = NULL, |
|
struct ggml_tensor* relative_position_bucket = NULL) { |
|
|
|
for (int i = 0; i < num_layers; i++) { |
|
auto block = std::dynamic_pointer_cast<T5Block>(blocks["block." + std::to_string(i)]); |
|
|
|
auto ret = block->forward(ctx, x, past_bias, attention_mask, relative_position_bucket); |
|
x = ret.first; |
|
past_bias = ret.second; |
|
} |
|
|
|
auto final_layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["final_layer_norm"]); |
|
|
|
x = final_layer_norm->forward(ctx, x); |
|
return x; |
|
} |
|
}; |
|
|
|
struct T5 : public GGMLBlock { |
|
public: |
|
T5(int64_t num_layers, |
|
int64_t model_dim, |
|
int64_t ff_dim, |
|
int64_t num_heads, |
|
int64_t vocab_size) { |
|
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new T5Stack(num_layers, model_dim, model_dim, ff_dim, num_heads)); |
|
blocks["shared"] = std::shared_ptr<GGMLBlock>(new Embedding(vocab_size, model_dim)); |
|
} |
|
|
|
struct ggml_tensor* forward(struct ggml_context* ctx, |
|
struct ggml_tensor* input_ids, |
|
struct ggml_tensor* past_bias = NULL, |
|
struct ggml_tensor* attention_mask = NULL, |
|
struct ggml_tensor* relative_position_bucket = NULL) { |
|
|
|
|
|
auto shared = std::dynamic_pointer_cast<Embedding>(blocks["shared"]); |
|
auto encoder = std::dynamic_pointer_cast<T5Stack>(blocks["encoder"]); |
|
|
|
auto x = shared->forward(ctx, input_ids); |
|
x = encoder->forward(ctx, x, past_bias, attention_mask, relative_position_bucket); |
|
return x; |
|
} |
|
}; |
|
|
|
struct T5Runner : public GGMLRunner { |
|
T5 model; |
|
std::vector<int> relative_position_bucket_vec; |
|
|
|
T5Runner(ggml_backend_t backend, |
|
std::map<std::string, enum ggml_type>& tensor_types, |
|
const std::string prefix, |
|
int64_t num_layers = 24, |
|
int64_t model_dim = 4096, |
|
int64_t ff_dim = 10240, |
|
int64_t num_heads = 64, |
|
int64_t vocab_size = 32128) |
|
: GGMLRunner(backend), model(num_layers, model_dim, ff_dim, num_heads, vocab_size) { |
|
model.init(params_ctx, tensor_types, prefix); |
|
} |
|
|
|
std::string get_desc() { |
|
return "t5"; |
|
} |
|
|
|
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) { |
|
model.get_param_tensors(tensors, prefix); |
|
} |
|
|
|
struct ggml_tensor* forward(struct ggml_context* ctx, |
|
struct ggml_tensor* input_ids, |
|
struct ggml_tensor* relative_position_bucket) { |
|
size_t N = input_ids->ne[1]; |
|
size_t n_token = input_ids->ne[0]; |
|
|
|
auto hidden_states = model.forward(ctx, input_ids, NULL, NULL, relative_position_bucket); |
|
return hidden_states; |
|
} |
|
|
|
struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids) { |
|
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); |
|
|
|
input_ids = to_backend(input_ids); |
|
|
|
relative_position_bucket_vec = compute_relative_position_bucket(input_ids->ne[0], input_ids->ne[0]); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
auto relative_position_bucket = ggml_new_tensor_2d(compute_ctx, |
|
GGML_TYPE_I32, |
|
input_ids->ne[0], |
|
input_ids->ne[0]); |
|
set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data()); |
|
|
|
struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, relative_position_bucket); |
|
|
|
ggml_build_forward_expand(gf, hidden_states); |
|
|
|
return gf; |
|
} |
|
|
|
void compute(const int n_threads, |
|
struct ggml_tensor* input_ids, |
|
ggml_tensor** output, |
|
ggml_context* output_ctx = NULL) { |
|
auto get_graph = [&]() -> struct ggml_cgraph* { |
|
return build_graph(input_ids); |
|
}; |
|
GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); |
|
} |
|
|
|
static std::vector<int> _relative_position_bucket(const std::vector<int>& relative_position, |
|
bool bidirectional = true, |
|
int num_buckets = 32, |
|
int max_distance = 128) { |
|
std::vector<int> relative_buckets(relative_position.size(), 0); |
|
std::vector<int> abs_relative_position = relative_position; |
|
|
|
if (bidirectional) { |
|
num_buckets = num_buckets / 2; |
|
for (size_t i = 0; i < relative_position.size(); ++i) { |
|
if (relative_position[i] > 0) { |
|
relative_buckets[i] += num_buckets; |
|
} |
|
abs_relative_position[i] = std::abs(relative_position[i]); |
|
} |
|
} else { |
|
for (size_t i = 0; i < relative_position.size(); ++i) { |
|
abs_relative_position[i] = std::max(-relative_position[i], 0); |
|
} |
|
} |
|
|
|
int max_exact = num_buckets / 2; |
|
std::vector<int> relative_position_if_large(relative_position.size(), 0); |
|
|
|
for (size_t i = 0; i < relative_position.size(); ++i) { |
|
if (abs_relative_position[i] < max_exact) { |
|
relative_buckets[i] += abs_relative_position[i]; |
|
} else { |
|
float log_pos = std::log(static_cast<float>(abs_relative_position[i]) / max_exact); |
|
float log_base = std::log(static_cast<float>(max_distance) / max_exact); |
|
relative_position_if_large[i] = max_exact + static_cast<int>((log_pos / log_base) * (num_buckets - max_exact)); |
|
relative_position_if_large[i] = std::min(relative_position_if_large[i], num_buckets - 1); |
|
relative_buckets[i] += relative_position_if_large[i]; |
|
} |
|
} |
|
|
|
return relative_buckets; |
|
} |
|
|
|
std::vector<int> compute_relative_position_bucket(int query_length, |
|
int key_length) { |
|
std::vector<int> context_position(query_length); |
|
std::vector<int> memory_position(key_length); |
|
|
|
for (int i = 0; i < query_length; ++i) { |
|
context_position[i] = i; |
|
} |
|
for (int i = 0; i < key_length; ++i) { |
|
memory_position[i] = i; |
|
} |
|
|
|
std::vector<std::vector<int>> relative_position(query_length, std::vector<int>(key_length, 0)); |
|
for (int i = 0; i < query_length; ++i) { |
|
for (int j = 0; j < key_length; ++j) { |
|
relative_position[i][j] = memory_position[j] - context_position[i]; |
|
} |
|
} |
|
|
|
std::vector<int> relative_position_bucket; |
|
for (int i = 0; i < query_length; ++i) { |
|
std::vector<int> result = _relative_position_bucket(relative_position[i], true); |
|
relative_position_bucket.insert(relative_position_bucket.end(), result.begin(), result.end()); |
|
} |
|
|
|
return relative_position_bucket; |
|
} |
|
}; |
|
|
|
struct T5Embedder { |
|
T5UniGramTokenizer tokenizer; |
|
T5Runner model; |
|
|
|
static std::map<std::string, enum ggml_type> empty_tensor_types; |
|
|
|
T5Embedder(ggml_backend_t backend, |
|
std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types, |
|
const std::string prefix = "", |
|
int64_t num_layers = 24, |
|
int64_t model_dim = 4096, |
|
int64_t ff_dim = 10240, |
|
int64_t num_heads = 64, |
|
int64_t vocab_size = 32128) |
|
: model(backend, tensor_types, prefix, num_layers, model_dim, ff_dim, num_heads, vocab_size) { |
|
} |
|
|
|
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) { |
|
model.get_param_tensors(tensors, prefix); |
|
} |
|
|
|
void alloc_params_buffer() { |
|
model.alloc_params_buffer(); |
|
} |
|
|
|
std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text, |
|
size_t max_length = 0, |
|
bool padding = false) { |
|
auto parsed_attention = parse_prompt_attention(text); |
|
|
|
{ |
|
std::stringstream ss; |
|
ss << "["; |
|
for (const auto& item : parsed_attention) { |
|
ss << "['" << item.first << "', " << item.second << "], "; |
|
} |
|
ss << "]"; |
|
LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str()); |
|
} |
|
|
|
std::vector<int> tokens; |
|
std::vector<float> weights; |
|
for (const auto& item : parsed_attention) { |
|
const std::string& curr_text = item.first; |
|
float curr_weight = item.second; |
|
std::vector<int> curr_tokens = tokenizer.Encode(curr_text, false); |
|
tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end()); |
|
weights.insert(weights.end(), curr_tokens.size(), curr_weight); |
|
} |
|
|
|
int EOS_TOKEN_ID = 1; |
|
tokens.push_back(EOS_TOKEN_ID); |
|
weights.push_back(1.0); |
|
|
|
tokenizer.pad_tokens(tokens, weights, max_length, padding); |
|
|
|
|
|
|
|
|
|
|
|
|
|
return {tokens, weights}; |
|
} |
|
|
|
void test() { |
|
struct ggml_init_params params; |
|
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); |
|
params.mem_buffer = NULL; |
|
params.no_alloc = false; |
|
|
|
struct ggml_context* work_ctx = ggml_init(params); |
|
GGML_ASSERT(work_ctx != NULL); |
|
|
|
{ |
|
|
|
|
|
|
|
|
|
|
|
|
|
std::string text("a lovely cat"); |
|
auto tokens_and_weights = tokenize(text, 77, true); |
|
std::vector<int>& tokens = tokens_and_weights.first; |
|
std::vector<float>& weights = tokens_and_weights.second; |
|
for (auto token : tokens) { |
|
printf("%d ", token); |
|
} |
|
printf("\n"); |
|
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); |
|
struct ggml_tensor* out = NULL; |
|
|
|
int t0 = ggml_time_ms(); |
|
model.compute(8, input_ids, &out, work_ctx); |
|
int t1 = ggml_time_ms(); |
|
|
|
print_ggml_tensor(out); |
|
LOG_DEBUG("t5 test done in %dms", t1 - t0); |
|
} |
|
} |
|
|
|
static void load_from_file_and_test(const std::string& file_path) { |
|
|
|
ggml_backend_t backend = ggml_backend_cpu_init(); |
|
ggml_type model_data_type = GGML_TYPE_F32; |
|
std::shared_ptr<T5Embedder> t5 = std::shared_ptr<T5Embedder>(new T5Embedder(backend)); |
|
{ |
|
LOG_INFO("loading from '%s'", file_path.c_str()); |
|
|
|
t5->alloc_params_buffer(); |
|
std::map<std::string, ggml_tensor*> tensors; |
|
t5->get_param_tensors(tensors, ""); |
|
|
|
ModelLoader model_loader; |
|
if (!model_loader.init_from_file(file_path)) { |
|
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str()); |
|
return; |
|
} |
|
|
|
bool success = model_loader.load_tensors(tensors, backend); |
|
|
|
if (!success) { |
|
LOG_ERROR("load tensors from model loader failed"); |
|
return; |
|
} |
|
|
|
LOG_INFO("t5 model loaded"); |
|
} |
|
t5->test(); |
|
} |
|
}; |
|
|
|
#endif |