Darkknight535
/

kcpp-compiled-cuda-linux

Model card Files Files and versions Community

kcpp-compiled-cuda-linux / otherarch /sdcpp /t5.hpp

Darkknight535

Upload folder using huggingface_hub

1d30d42 verified 21 days ago

raw

history blame contribute delete

40.3 kB

	#ifndef __T5_HPP__
	#define __T5_HPP__

	#include <float.h>
	#include <limits>
	#include <map>
	#include <memory>
	#include <regex>
	#include <sstream>
	#include <string>
	#include <unordered_map>

	#include "darts.h"
	#include "ggml_extend.hpp"
	#include "json.hpp"
	#include "model.h"

	// Port from: https://github.com/google/sentencepiece/blob/master/src/unigram_model.h
	// and https://github.com/google/sentencepiece/blob/master/src/unigram_model.h.
	// Original License: https://github.com/google/sentencepiece/blob/master/LICENSE
	//
	// Since tokenization is not the bottleneck in SD, performance was not a major consideration
	// during the migration.
	class MetaspacePreTokenizer {
	private:
	std::string replacement;
	bool add_prefix_space;

	public:
	MetaspacePreTokenizer(const std::string replacement = " ", bool add_prefix_space = true)
	: replacement(replacement), add_prefix_space(add_prefix_space) {}

	std::string tokenize(const std::string& input) const {
	std::string tokens;
	std::stringstream ss(input);

	if (add_prefix_space) {
	tokens += replacement;
	}

	std::string token;
	bool firstToken = true;
	while (std::getline(ss, token, ' ')) {
	if (!firstToken)
	tokens += replacement + token;
	else
	tokens += token;

	firstToken = false;
	}

	return tokens;
	}
	};

	using EncodeResult = std::vector<std::pair<std::string, int>>;
	class T5UniGramTokenizer {
	public:
	enum Status {
	OK,
	NO_PIECES_LOADED,
	NO_ENTRY_FOUND,
	BUILD_DOUBLE_ARRAY_FAILED,
	PIECE_ALREADY_DEFINED,
	INVLIAD_JSON
	};

	protected:
	MetaspacePreTokenizer pre_tokenizer;

	// all <piece, score> pairs
	std::vector<std::pair<std::string, float>> piece_score_pairs;

	float min_score_ = 0.0;
	float max_score_ = 0.0;
	std::unique_ptr<Darts::DoubleArray> trie_;

	// Maximum size of the return value of Trie, which corresponds
	// to the maximum size of shared common prefix in the sentence pieces.
	int trie_results_size_;
	// unknown id.
	int unk_id_ = 2;
	std::string eos_token_ = "</s>";
	int eos_id_ = 1;
	int pad_id_ = 0;
	// status.
	Status status_ = OK;

	float kUnkPenalty = 10.0;

	std::string replacement;
	bool add_prefix_space = true;

	void InitializePieces(const std::string& json_str) {
	nlohmann::json data;

	try {
	data = nlohmann::json::parse(json_str);
	} catch (const nlohmann::json::parse_error& e) {
	status_ = INVLIAD_JSON;
	return;
	}
	if (!data.contains("model")) {
	status_ = INVLIAD_JSON;
	return;
	}
	nlohmann::json model = data["model"];
	if (!model.contains("vocab")) {
	status_ = INVLIAD_JSON;
	return;
	}
	if (model.contains("unk_id")) {
	unk_id_ = model["unk_id"];
	}

	replacement = data["pre_tokenizer"]["replacement"];
	add_prefix_space = data["pre_tokenizer"]["add_prefix_space"];

	pre_tokenizer = MetaspacePreTokenizer(replacement, add_prefix_space);

	for (const auto& item : model["vocab"]) {
	if (item.size() != 2 \|\| !item[0].is_string() \|\| !item[1].is_number_float()) {
	status_ = INVLIAD_JSON;
	return;
	}
	std::string piece = item[0];
	float score = item[1];
	piece_score_pairs.emplace_back(piece, score);
	}
	}

	// Builds a Trie index.
	void BuildTrie(std::vector<std::pair<std::string, int>>* pieces) {
	if (status_ != OK)
	return;

	if (pieces->empty()) {
	status_ = NO_PIECES_LOADED;
	return;
	}

	// sort by sentencepiece since DoubleArray::build()
	// only accepts sorted strings.
	sort(pieces->begin(), pieces->end());

	// Makes key/value set for DoubleArrayTrie.
	std::vector<const char*> key(pieces->size());
	std::vector<int> value(pieces->size());
	for (size_t i = 0; i < pieces->size(); ++i) {
	key[i] = (*pieces)[i].first.data(); // sorted piece.
	value[i] = (*pieces)[i].second; // vocab_id
	}

	trie_ = std::unique_ptr<Darts::DoubleArray>(new Darts::DoubleArray());
	if (trie_->build(key.size(), const_cast<char**>(&key[0]), nullptr,
	&value[0]) != 0) {
	status_ = BUILD_DOUBLE_ARRAY_FAILED;
	return;
	}

	// Computes the maximum number of shared prefixes in the trie.
	const int kMaxTrieResultsSize = 1024;
	std::vector<Darts::DoubleArray::result_pair_type> results(
	kMaxTrieResultsSize);
	trie_results_size_ = 0;
	for (const auto& p : *pieces) {
	const int num_nodes = trie_->commonPrefixSearch(
	p.first.data(), results.data(), results.size(), p.first.size());
	trie_results_size_ = std::max(trie_results_size_, num_nodes);
	}

	if (trie_results_size_ == 0)
	status_ = NO_ENTRY_FOUND;
	}

	// Non-virtual (inlined) implementation for faster execution.
	inline float GetScoreInlined(int id) const {
	return piece_score_pairs[id].second;
	}

	inline bool IsUnusedInlined(int id) const {
	return false; // TODO
	}

	inline bool IsUserDefinedInlined(int id) const {
	return false; // TODO
	}

	inline size_t OneCharLen(const char* src) const {
	return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*src & 0xFF) >> 4];
	}

	// The optimized Viterbi encode.
	// Main differences from the original function:
	// 1. Memorizes the best path at each postion so far,
	// 2. No need to store the Lattice nodes,
	// 3. Works in utf-8 directly,
	// 4. Defines a new struct with fewer fields than Lattice,
	// 5. Does not depend on `class Lattice` nor call `SetSentence()`,
	// `PopulateNodes()`, or `Viterbi()`. It does everything in one function.
	// For detailed explanations please see the comments inside the function body.
	EncodeResult EncodeOptimized(const std::string& normalized) const {
	// An optimized Viterbi algorithm for unigram language models. Benchmarking
	// results show that it generates almost identical outputs and achieves 2.1x
	// speedup on average for 102 languages compared to the original
	// implementation. It's based on the following three ideas:
	//
	// 1. Because it uses the unigram model:
	// best_score(x1, x2, …, xt) = best_score(x1, x2, …, x{t-1}) + score(xt)
	// Deciding the best path (and score) can be decoupled into two isolated
	// terms: (a) the best path ended before the last token `best_score(x1, x2, …,
	// x{t-1})`, and (b) the last token and its `score(xt)`. The two terms are
	// not related to each other at all.
	//
	// Therefore, we can compute once and store the *best_path ending at
	// each character position*. In this way, when we know best_path_ends_at[M],
	// we can reuse it to compute all the best_path_ends_at_[...] where the last
	// token starts at the same character position M.
	//
	// This improves the time complexity from O(nkk) to O(n*k) because it
	// eliminates the extra loop of recomputing the best path ending at the same
	// position, where n is the input length and k is the maximum number of tokens
	// that can be recognized starting at each position.
	//
	// 2. Again, because it uses the unigram model, we don’t need to actually
	// store the lattice nodes. We still recognize all the tokens and lattice
	// nodes from the input, but along identifying them, we use and discard them
	// on the fly. There is no need to actually store them for best path Viterbi
	// decoding. The only thing we need to store is the best_path ending at
	// each character position.
	//
	// This improvement reduces the things needed to store in memory from O(n*k)
	// to O(n), where n is the input length and k is the maximum number of tokens
	// that can be recognized starting at each position.
	//
	// It also avoids the need of dynamic-size lattice node pool, because the
	// number of things to store is fixed as n.
	//
	// 3. SentencePiece is designed to work with unicode, taking utf-8 encoding
	// inputs. In the original implementation, the lattice positions are based on
	// unicode positions. A mapping from unicode position to the utf-8 position is
	// maintained to recover the utf-8 string piece.
	//
	// We found that it is sufficient and beneficial to directly work with utf-8
	// positions:
	//
	// Firstly, it saves the conversion and mapping between unicode positions and
	// utf-8 positions.
	//
	// Secondly, it reduces the number of fields we need to maintain in the
	// node/path structure. Specifically, there are 8 fields defined in
	// `Lattice::Node` used by the original encoder, but here in the optimized
	// encoder we only need to define 3 fields in `BestPathNode`.

	if (status() != OK \|\| normalized.empty()) {
	return {};
	}
	// Represents the last node of the best path.
	struct BestPathNode {
	int id = -1; // The vocab id. (maybe -1 for UNK)
	float best_path_score =
	0; // The total score of the best path ending at this node.
	int starts_at =
	-1; // The starting position (in utf-8) of this node. The entire best
	// path can be constructed by backtracking along this link.
	};
	const int size = normalized.size();
	const float unk_score = min_score() - kUnkPenalty;
	// The ends are exclusive.
	std::vector<BestPathNode> best_path_ends_at(size + 1);
	// Generate lattice on-the-fly (not stored) and update best_path_ends_at.
	int starts_at = 0;
	while (starts_at < size) {
	std::size_t node_pos = 0;
	std::size_t key_pos = starts_at;
	const auto best_path_score_till_here =
	best_path_ends_at[starts_at].best_path_score;
	bool has_single_node = false;
	const int mblen =
	std::min<int>(OneCharLen(normalized.data() + starts_at),
	size - starts_at);
	while (key_pos < size) {
	const int ret =
	trie_->traverse(normalized.data(), node_pos, key_pos, key_pos + 1);
	if (ret == -2)
	break;
	if (ret >= 0) {
	if (IsUnusedInlined(ret))
	continue;
	// Update the best path node.
	auto& target_node = best_path_ends_at[key_pos];
	const auto length = (key_pos - starts_at);
	// User defined symbol receives extra bonus to always be selected.
	const auto score = IsUserDefinedInlined(ret)
	? (length * max_score_ - 0.1)
	: GetScoreInlined(ret);
	const auto candidate_best_path_score =
	score + best_path_score_till_here;
	if (target_node.starts_at == -1 \|\|
	candidate_best_path_score > target_node.best_path_score) {
	target_node.best_path_score = candidate_best_path_score;
	target_node.starts_at = starts_at;
	target_node.id = ret;
	}
	if (!has_single_node && length == mblen) {
	has_single_node = true;
	}
	}
	}
	if (!has_single_node) {
	auto& target_node = best_path_ends_at[starts_at + mblen];
	const auto candidate_best_path_score =
	unk_score + best_path_score_till_here;
	if (target_node.starts_at == -1 \|\|
	candidate_best_path_score > target_node.best_path_score) {
	target_node.best_path_score = candidate_best_path_score;
	target_node.starts_at = starts_at;
	target_node.id = unk_id_;
	}
	}
	// Move by one unicode character.
	starts_at += mblen;
	}
	// Backtrack to identify the best path.
	EncodeResult results;
	int ends_at = size;
	while (ends_at > 0) {
	const auto& node = best_path_ends_at[ends_at];
	results.emplace_back(
	normalized.substr(node.starts_at, ends_at - node.starts_at), node.id);
	ends_at = node.starts_at;
	}
	std::reverse(results.begin(), results.end());
	return results;
	}

	public:
	explicit T5UniGramTokenizer(const std::string& json_str = "") {
	if (json_str.size() != 0) {
	InitializePieces(json_str);
	} else {
	InitializePieces(ModelLoader::load_t5_tokenizer_json());
	}

	min_score_ = FLT_MAX;
	max_score_ = FLT_MIN;

	std::vector<std::pair<std::string, int>> pieces;
	for (int i = 0; i < piece_score_pairs.size(); i++) {
	const auto& sp = piece_score_pairs[i];

	min_score_ = std::min(min_score_, sp.second);
	max_score_ = std::max(max_score_, sp.second);

	pieces.emplace_back(sp.first, i);
	}

	BuildTrie(&pieces);
	}
	~T5UniGramTokenizer(){};

	std::string Normalize(const std::string& input) const {
	// Ref: https://github.com/huggingface/tokenizers/blob/1ff56c0c70b045f0cd82da1af9ac08cd4c7a6f9f/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py#L29
	// TODO: nmt-nfkc
	std::string normalized = std::regex_replace(input, std::regex(" {2,}"), " ");
	return normalized;
	}

	std::vector<int> Encode(const std::string& input, bool append_eos_if_not_present = true) const {
	std::string normalized = Normalize(input);
	normalized = pre_tokenizer.tokenize(normalized);
	EncodeResult result = EncodeOptimized(normalized);
	if (result.size() > 0 && append_eos_if_not_present) {
	auto item = result[result.size() - 1];
	if (item.first != eos_token_) {
	result.emplace_back(eos_token_, eos_id_);
	}
	}
	std::vector<int> tokens;
	for (auto item : result) {
	tokens.push_back(item.second);
	}
	return tokens;
	}

	void pad_tokens(std::vector<int>& tokens,
	std::vector<float>& weights,
	size_t max_length = 0,
	bool padding = false) {
	if (max_length > 0 && padding) {
	size_t orig_token_num = tokens.size() - 1;
	size_t n = std::ceil(orig_token_num * 1.0 / (max_length - 1));
	if (n == 0) {
	n = 1;
	}
	size_t length = max_length * n;
	LOG_DEBUG("token length: %llu", length);
	std::vector<int> new_tokens;
	std::vector<float> new_weights;
	int token_idx = 0;
	for (int i = 0; i < length; i++) {
	if (token_idx >= orig_token_num) {
	break;
	}
	if (i % max_length == max_length - 1) {
	new_tokens.push_back(eos_id_);
	new_weights.push_back(1.0);
	} else {
	new_tokens.push_back(tokens[token_idx]);
	new_weights.push_back(weights[token_idx]);
	token_idx++;
	}
	}

	new_tokens.push_back(eos_id_);
	new_weights.push_back(1.0);
	tokens = new_tokens;
	weights = new_weights;

	if (padding) {
	int pad_token_id = pad_id_;
	tokens.insert(tokens.end(), length - tokens.size(), pad_token_id);
	weights.insert(weights.end(), length - weights.size(), 1.0);
	}
	}
	}

	// Returns the minimum score in sentence pieces.
	// min_score() - 10 is used for the cost of unknown sentence.
	float min_score() const { return min_score_; }

	// Returns the maximum score in sentence pieces.
	// max_score() is used for the cost of user defined symbols.
	float max_score() const { return max_score_; }

	Status status() const { return status_; }
	};

	class T5LayerNorm : public UnaryBlock {
	protected:
	int64_t hidden_size;
	float eps;

	void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
	enum ggml_type wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F32;
	params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size);
	}

	public:
	T5LayerNorm(int64_t hidden_size,
	float eps = 1e-06f)
	: hidden_size(hidden_size),
	eps(eps) {}

	struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
	struct ggml_tensor* w = params["weight"];
	x = ggml_rms_norm(ctx, x, eps);
	x = ggml_mul(ctx, x, w);
	return x;
	}
	};

	struct T5DenseActDense : public UnaryBlock {
	public:
	T5DenseActDense(int64_t model_dim, int64_t ff_dim) {
	blocks["wi"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));
	blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false));
	}

	struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
	// x: [N, n_token, model_dim]
	auto wi = std::dynamic_pointer_cast<Linear>(blocks["wi"]);
	auto wo = std::dynamic_pointer_cast<Linear>(blocks["wo"]);

	x = wi->forward(ctx, x);
	x = ggml_relu_inplace(ctx, x);
	x = wo->forward(ctx, x);
	return x;
	}
	};

	struct T5DenseGatedActDense : public UnaryBlock {
	public:
	T5DenseGatedActDense(int64_t model_dim, int64_t ff_dim) {
	blocks["wi_0"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));
	blocks["wi_1"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));
	blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false));
	}

	struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
	// x: [N, n_token, model_dim]
	auto wi_0 = std::dynamic_pointer_cast<Linear>(blocks["wi_0"]);
	auto wi_1 = std::dynamic_pointer_cast<Linear>(blocks["wi_1"]);
	auto wo = std::dynamic_pointer_cast<Linear>(blocks["wo"]);

	auto hidden_gelu = ggml_gelu_inplace(ctx, wi_0->forward(ctx, x));
	auto hidden_linear = wi_1->forward(ctx, x);
	x = ggml_mul_inplace(ctx, hidden_gelu, hidden_linear);
	x = wo->forward(ctx, x);
	return x;
	}
	};

	struct T5LayerFF : public UnaryBlock {
	public:
	T5LayerFF(int64_t model_dim, int64_t ff_dim) {
	blocks["DenseReluDense"] = std::shared_ptr<GGMLBlock>(new T5DenseGatedActDense(model_dim, ff_dim));
	blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
	}

	struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
	// x: [N, n_token, model_dim]
	auto DenseReluDense = std::dynamic_pointer_cast<T5DenseGatedActDense>(blocks["DenseReluDense"]);
	auto layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]);

	auto forwarded_states = layer_norm->forward(ctx, x);
	forwarded_states = DenseReluDense->forward(ctx, forwarded_states);
	x = ggml_add_inplace(ctx, forwarded_states, x);
	return x;
	}
	};

	class T5Attention : public GGMLBlock {
	protected:
	int64_t model_dim;
	int64_t inner_dim;
	int64_t num_heads;
	bool using_relative_attention_bias;
	int64_t relative_attention_num_buckets = 32;
	int64_t relative_attention_max_distance = 128;

	public:
	T5Attention(int64_t model_dim,
	int64_t inner_dim,
	int64_t num_heads,
	bool using_relative_attention_bias = false)
	: model_dim(model_dim),
	inner_dim(inner_dim),
	num_heads(num_heads),
	using_relative_attention_bias(using_relative_attention_bias) {
	blocks["q"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false));
	blocks["k"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false));
	blocks["v"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false));
	blocks["o"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, model_dim, false));
	if (using_relative_attention_bias) {
	blocks["relative_attention_bias"] = std::shared_ptr<GGMLBlock>(new Embedding(relative_attention_num_buckets, num_heads));
	}
	}

	struct ggml_tensor* compute_bias(struct ggml_context* ctx,
	struct ggml_tensor* relative_position_bucket) {
	auto relative_attention_bias = std::dynamic_pointer_cast<Embedding>(blocks["relative_attention_bias"]);

	auto values = relative_attention_bias->forward(ctx, relative_position_bucket); // shape (query_length, key_length, num_heads)
	values = ggml_cont(ctx, ggml_permute(ctx, values, 2, 0, 1, 3)); // shape (1, num_heads, query_length, key_length)
	return values;
	}

	// x: [N, n_token, model_dim]
	std::pair<struct ggml_tensor, struct ggml_tensor> forward(struct ggml_context* ctx,
	struct ggml_tensor* x,
	struct ggml_tensor* past_bias = NULL,
	struct ggml_tensor* mask = NULL,
	struct ggml_tensor* relative_position_bucket = NULL) {
	auto q_proj = std::dynamic_pointer_cast<Linear>(blocks["q"]);
	auto k_proj = std::dynamic_pointer_cast<Linear>(blocks["k"]);
	auto v_proj = std::dynamic_pointer_cast<Linear>(blocks["v"]);
	auto out_proj = std::dynamic_pointer_cast<Linear>(blocks["o"]);

	int64_t n_head = num_heads;
	int64_t d_head = inner_dim / n_head;

	auto q = q_proj->forward(ctx, x);
	auto k = k_proj->forward(ctx, x);
	auto v = v_proj->forward(ctx, x);

	if (using_relative_attention_bias && relative_position_bucket != NULL) {
	past_bias = compute_bias(ctx, relative_position_bucket);
	}
	if (past_bias != NULL) {
	if (mask != NULL) {
	mask = ggml_add(ctx, mask, past_bias);
	} else {
	mask = past_bias;
	}
	}

	k = ggml_scale_inplace(ctx, k, sqrt(d_head));

	x = ggml_nn_attention_ext(ctx, q, k, v, num_heads, mask); // [N, n_token, d_head * n_head]

	x = out_proj->forward(ctx, x); // [N, n_token, model_dim]
	return {x, past_bias};
	}
	};

	struct T5LayerSelfAttention : public GGMLBlock {
	public:
	T5LayerSelfAttention(int64_t model_dim,
	int64_t inner_dim,
	int64_t ff_dim,
	int64_t num_heads,
	bool using_relative_attention_bias) {
	blocks["SelfAttention"] = std::shared_ptr<GGMLBlock>(new T5Attention(model_dim, inner_dim, num_heads, using_relative_attention_bias));
	blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
	}

	std::pair<struct ggml_tensor, struct ggml_tensor> forward(struct ggml_context* ctx,
	struct ggml_tensor* x,
	struct ggml_tensor* past_bias = NULL,
	struct ggml_tensor* mask = NULL,
	struct ggml_tensor* relative_position_bucket = NULL) {
	// x: [N, n_token, model_dim]
	auto SelfAttention = std::dynamic_pointer_cast<T5Attention>(blocks["SelfAttention"]);
	auto layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]);

	auto normed_hidden_state = layer_norm->forward(ctx, x);
	auto ret = SelfAttention->forward(ctx, normed_hidden_state, past_bias, mask, relative_position_bucket);
	auto output = ret.first;
	past_bias = ret.second;

	x = ggml_add_inplace(ctx, output, x);
	return {x, past_bias};
	}
	};

	struct T5Block : public GGMLBlock {
	public:
	T5Block(int64_t model_dim, int64_t inner_dim, int64_t ff_dim, int64_t num_heads, bool using_relative_attention_bias) {
	blocks["layer.0"] = std::shared_ptr<GGMLBlock>(new T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, using_relative_attention_bias));
	blocks["layer.1"] = std::shared_ptr<GGMLBlock>(new T5LayerFF(model_dim, ff_dim));
	}

	std::pair<struct ggml_tensor, struct ggml_tensor> forward(struct ggml_context* ctx,
	struct ggml_tensor* x,
	struct ggml_tensor* past_bias = NULL,
	struct ggml_tensor* mask = NULL,
	struct ggml_tensor* relative_position_bucket = NULL) {
	// x: [N, n_token, model_dim]
	auto layer_0 = std::dynamic_pointer_cast<T5LayerSelfAttention>(blocks["layer.0"]);
	auto layer_1 = std::dynamic_pointer_cast<T5LayerFF>(blocks["layer.1"]);

	auto ret = layer_0->forward(ctx, x, past_bias, mask, relative_position_bucket);
	x = ret.first;
	past_bias = ret.second;
	x = layer_1->forward(ctx, x);
	return {x, past_bias};
	}
	};

	struct T5Stack : public GGMLBlock {
	int64_t num_layers;

	public:
	T5Stack(int64_t num_layers,
	int64_t model_dim,
	int64_t inner_dim,
	int64_t ff_dim,
	int64_t num_heads)
	: num_layers(num_layers) {
	for (int i = 0; i < num_layers; i++) {
	blocks["block." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new T5Block(model_dim, inner_dim, ff_dim, num_heads, i == 0));
	}

	blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
	}

	struct ggml_tensor* forward(struct ggml_context* ctx,
	struct ggml_tensor* x,
	struct ggml_tensor* past_bias = NULL,
	struct ggml_tensor* attention_mask = NULL,
	struct ggml_tensor* relative_position_bucket = NULL) {
	// x: [N, n_token, model_dim]
	for (int i = 0; i < num_layers; i++) {
	auto block = std::dynamic_pointer_cast<T5Block>(blocks["block." + std::to_string(i)]);

	auto ret = block->forward(ctx, x, past_bias, attention_mask, relative_position_bucket);
	x = ret.first;
	past_bias = ret.second;
	}

	auto final_layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["final_layer_norm"]);

	x = final_layer_norm->forward(ctx, x);
	return x;
	}
	};

	struct T5 : public GGMLBlock {
	public:
	T5(int64_t num_layers,
	int64_t model_dim,
	int64_t ff_dim,
	int64_t num_heads,
	int64_t vocab_size) {
	blocks["encoder"] = std::shared_ptr<GGMLBlock>(new T5Stack(num_layers, model_dim, model_dim, ff_dim, num_heads));
	blocks["shared"] = std::shared_ptr<GGMLBlock>(new Embedding(vocab_size, model_dim));
	}

	struct ggml_tensor* forward(struct ggml_context* ctx,
	struct ggml_tensor* input_ids,
	struct ggml_tensor* past_bias = NULL,
	struct ggml_tensor* attention_mask = NULL,
	struct ggml_tensor* relative_position_bucket = NULL) {
	// input_ids: [N, n_token]

	auto shared = std::dynamic_pointer_cast<Embedding>(blocks["shared"]);
	auto encoder = std::dynamic_pointer_cast<T5Stack>(blocks["encoder"]);

	auto x = shared->forward(ctx, input_ids);
	x = encoder->forward(ctx, x, past_bias, attention_mask, relative_position_bucket);
	return x;
	}
	};

	struct T5Runner : public GGMLRunner {
	T5 model;
	std::vector<int> relative_position_bucket_vec;

	T5Runner(ggml_backend_t backend,
	std::map<std::string, enum ggml_type>& tensor_types,
	const std::string prefix,
	int64_t num_layers = 24,
	int64_t model_dim = 4096,
	int64_t ff_dim = 10240,
	int64_t num_heads = 64,
	int64_t vocab_size = 32128)
	: GGMLRunner(backend), model(num_layers, model_dim, ff_dim, num_heads, vocab_size) {
	model.init(params_ctx, tensor_types, prefix);
	}

	std::string get_desc() {
	return "t5";
	}

	void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
	model.get_param_tensors(tensors, prefix);
	}

	struct ggml_tensor* forward(struct ggml_context* ctx,
	struct ggml_tensor* input_ids,
	struct ggml_tensor* relative_position_bucket) {
	size_t N = input_ids->ne[1];
	size_t n_token = input_ids->ne[0];

	auto hidden_states = model.forward(ctx, input_ids, NULL, NULL, relative_position_bucket); // [N, n_token, model_dim]
	return hidden_states;
	}

	struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids) {
	struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);

	input_ids = to_backend(input_ids);

	relative_position_bucket_vec = compute_relative_position_bucket(input_ids->ne[0], input_ids->ne[0]);

	// for (int i = 0; i < relative_position_bucket_vec.size(); i++) {
	// if (i % 77 == 0) {
	// printf("\n");
	// }
	// printf("%d ", relative_position_bucket_vec[i]);
	// }

	auto relative_position_bucket = ggml_new_tensor_2d(compute_ctx,
	GGML_TYPE_I32,
	input_ids->ne[0],
	input_ids->ne[0]);
	set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data());

	struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, relative_position_bucket);

	ggml_build_forward_expand(gf, hidden_states);

	return gf;
	}

	void compute(const int n_threads,
	struct ggml_tensor* input_ids,
	ggml_tensor** output,
	ggml_context* output_ctx = NULL) {
	auto get_graph = [&]() -> struct ggml_cgraph* {
	return build_graph(input_ids);
	};
	GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
	}

	static std::vector<int> _relative_position_bucket(const std::vector<int>& relative_position,
	bool bidirectional = true,
	int num_buckets = 32,
	int max_distance = 128) {
	std::vector<int> relative_buckets(relative_position.size(), 0);
	std::vector<int> abs_relative_position = relative_position;

	if (bidirectional) {
	num_buckets = num_buckets / 2;
	for (size_t i = 0; i < relative_position.size(); ++i) {
	if (relative_position[i] > 0) {
	relative_buckets[i] += num_buckets;
	}
	abs_relative_position[i] = std::abs(relative_position[i]);
	}
	} else {
	for (size_t i = 0; i < relative_position.size(); ++i) {
	abs_relative_position[i] = std::max(-relative_position[i], 0);
	}
	}

	int max_exact = num_buckets / 2;
	std::vector<int> relative_position_if_large(relative_position.size(), 0);

	for (size_t i = 0; i < relative_position.size(); ++i) {
	if (abs_relative_position[i] < max_exact) {
	relative_buckets[i] += abs_relative_position[i];
	} else {
	float log_pos = std::log(static_cast<float>(abs_relative_position[i]) / max_exact);
	float log_base = std::log(static_cast<float>(max_distance) / max_exact);
	relative_position_if_large[i] = max_exact + static_cast<int>((log_pos / log_base) * (num_buckets - max_exact));
	relative_position_if_large[i] = std::min(relative_position_if_large[i], num_buckets - 1);
	relative_buckets[i] += relative_position_if_large[i];
	}
	}

	return relative_buckets;
	}

	std::vector<int> compute_relative_position_bucket(int query_length,
	int key_length) {
	std::vector<int> context_position(query_length);
	std::vector<int> memory_position(key_length);

	for (int i = 0; i < query_length; ++i) {
	context_position[i] = i;
	}
	for (int i = 0; i < key_length; ++i) {
	memory_position[i] = i;
	}

	std::vector<std::vector<int>> relative_position(query_length, std::vector<int>(key_length, 0));
	for (int i = 0; i < query_length; ++i) {
	for (int j = 0; j < key_length; ++j) {
	relative_position[i][j] = memory_position[j] - context_position[i];
	}
	}

	std::vector<int> relative_position_bucket;
	for (int i = 0; i < query_length; ++i) {
	std::vector<int> result = _relative_position_bucket(relative_position[i], true);
	relative_position_bucket.insert(relative_position_bucket.end(), result.begin(), result.end());
	}

	return relative_position_bucket;
	}
	};

	struct T5Embedder {
	T5UniGramTokenizer tokenizer;
	T5Runner model;

	static std::map<std::string, enum ggml_type> empty_tensor_types;

	T5Embedder(ggml_backend_t backend,
	std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types,
	const std::string prefix = "",
	int64_t num_layers = 24,
	int64_t model_dim = 4096,
	int64_t ff_dim = 10240,
	int64_t num_heads = 64,
	int64_t vocab_size = 32128)
	: model(backend, tensor_types, prefix, num_layers, model_dim, ff_dim, num_heads, vocab_size) {
	}

	void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
	model.get_param_tensors(tensors, prefix);
	}

	void alloc_params_buffer() {
	model.alloc_params_buffer();
	}

	std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
	size_t max_length = 0,
	bool padding = false) {
	auto parsed_attention = parse_prompt_attention(text);

	{
	std::stringstream ss;
	ss << "[";
	for (const auto& item : parsed_attention) {
	ss << "['" << item.first << "', " << item.second << "], ";
	}
	ss << "]";
	LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
	}

	std::vector<int> tokens;
	std::vector<float> weights;
	for (const auto& item : parsed_attention) {
	const std::string& curr_text = item.first;
	float curr_weight = item.second;
	std::vector<int> curr_tokens = tokenizer.Encode(curr_text, false);
	tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
	weights.insert(weights.end(), curr_tokens.size(), curr_weight);
	}

	int EOS_TOKEN_ID = 1;
	tokens.push_back(EOS_TOKEN_ID);
	weights.push_back(1.0);

	tokenizer.pad_tokens(tokens, weights, max_length, padding);

	// for (int i = 0; i < tokens.size(); i++) {
	// std::cout << tokens[i] << ":" << weights[i] << ", ";
	// }
	// std::cout << std::endl;

	return {tokens, weights};
	}

	void test() {
	struct ggml_init_params params;
	params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
	params.mem_buffer = NULL;
	params.no_alloc = false;

	struct ggml_context* work_ctx = ggml_init(params);
	GGML_ASSERT(work_ctx != NULL);

	{
	// cpu f16: pass
	// cpu f32: pass
	// cuda f16: nan
	// cuda f32: pass
	// cuda q8_0: nan
	// TODO: fix cuda nan
	std::string text("a lovely cat");
	auto tokens_and_weights = tokenize(text, 77, true);
	std::vector<int>& tokens = tokens_and_weights.first;
	std::vector<float>& weights = tokens_and_weights.second;
	for (auto token : tokens) {
	printf("%d ", token);
	}
	printf("\n");
	auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);
	struct ggml_tensor* out = NULL;

	int t0 = ggml_time_ms();
	model.compute(8, input_ids, &out, work_ctx);
	int t1 = ggml_time_ms();

	print_ggml_tensor(out);
	LOG_DEBUG("t5 test done in %dms", t1 - t0);
	}
	}

	static void load_from_file_and_test(const std::string& file_path) {
	// ggml_backend_t backend = ggml_backend_cuda_init(0);
	ggml_backend_t backend = ggml_backend_cpu_init();
	ggml_type model_data_type = GGML_TYPE_F32;
	std::shared_ptr<T5Embedder> t5 = std::shared_ptr<T5Embedder>(new T5Embedder(backend));
	{
	LOG_INFO("loading from '%s'", file_path.c_str());

	t5->alloc_params_buffer();
	std::map<std::string, ggml_tensor*> tensors;
	t5->get_param_tensors(tensors, "");

	ModelLoader model_loader;
	if (!model_loader.init_from_file(file_path)) {
	LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
	return;
	}

	bool success = model_loader.load_tensors(tensors, backend);

	if (!success) {
	LOG_ERROR("load tensors from model loader failed");
	return;
	}

	LOG_INFO("t5 model loaded");
	}
	t5->test();
	}
	};

	#endif // __T5_HPP__