#ifndef __UNET_HPP__
#define __UNET_HPP__

#include "common.hpp"
#include "ggml_extend.hpp"
#include "model.h"

#define UNET_GRAPH_SIZE 10240

class SpatialVideoTransformer : public SpatialTransformer {
protected:
    int64_t time_depth;
    int64_t max_time_embed_period;

public:
    SpatialVideoTransformer(int64_t in_channels,
                            int64_t n_head,
                            int64_t d_head,
                            int64_t depth,
                            int64_t context_dim,
                            int64_t time_depth            = 1,
                            int64_t max_time_embed_period = 10000)
        : SpatialTransformer(in_channels, n_head, d_head, depth, context_dim),
          time_depth(time_depth),
          max_time_embed_period(max_time_embed_period) {
        int64_t inner_dim = n_head * d_head;

        GGML_ASSERT(depth == time_depth);
        GGML_ASSERT(in_channels == inner_dim);

        int64_t time_mix_d_head    = d_head;
        int64_t n_time_mix_heads   = n_head;
        int64_t time_mix_inner_dim = time_mix_d_head * n_time_mix_heads;  // == inner_dim
        int64_t time_context_dim   = context_dim;

        for (int i = 0; i < time_depth; i++) {
            std::string name = "time_stack." + std::to_string(i);
            blocks[name]     = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim,
                                                                                    n_time_mix_heads,
                                                                                    time_mix_d_head,
                                                                                    time_context_dim,
                                                                                    true));
        }

        int64_t time_embed_dim     = in_channels * 4;
        blocks["time_pos_embed.0"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, time_embed_dim));
        blocks["time_pos_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, in_channels));

        blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender());
    }

    struct ggml_tensor* forward(struct ggml_context* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* context,
                                int timesteps) {
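        // Shapes follow ggml's ne[] ordering, where ne[0] is the innermost dim:
        //   x:       [w, h, in_channels, n]  with n == b*t (frames flattened into the batch)
        //   context: [context_dim, n_context, n]
        // timesteps is the number of video frames per batch element.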
        auto norm             = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
        auto proj_in          = std::dynamic_pointer_cast<Conv2d>(blocks["proj_in"]);
        auto proj_out         = std::dynamic_pointer_cast<Conv2d>(blocks["proj_out"]);
        auto time_pos_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["time_pos_embed.0"]);
        auto time_pos_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["time_pos_embed.2"]);
        auto time_mixer       = std::dynamic_pointer_cast<AlphaBlender>(blocks["time_mixer"]);

        auto x_in         = x;
        int64_t n         = x->ne[3];
        int64_t h         = x->ne[1];
        int64_t w         = x->ne[0];
        int64_t inner_dim = n_head * d_head;

        GGML_ASSERT(n == timesteps);

        auto time_context    = context;
        auto spatial_context = context;

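        // The temporal blocks attend to a single shared context: take the first
        // frame's context (this assumes batch size 1, so ne[2] of the view is 1)
        // and tile it once per spatial position, giving h*w copies.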
        auto time_context_first_timestep = ggml_view_3d(ctx,
                                                        time_context,
                                                        time_context->ne[0],
                                                        time_context->ne[1],
                                                        1,
                                                        time_context->nb[1],
                                                        time_context->nb[2],
                                                        0);
        time_context = ggml_new_tensor_3d(ctx, GGML_TYPE_F32,
                                          time_context_first_timestep->ne[0],
                                          time_context_first_timestep->ne[1],
                                          time_context_first_timestep->ne[2] * h * w);
        time_context = ggml_repeat(ctx, time_context_first_timestep, time_context);

        x = norm->forward(ctx, x);
        x = proj_in->forward(ctx, x);  // [w, h, inner_dim, n]

        x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3));
        x = ggml_reshape_3d(ctx, x, inner_dim, w * h, n);  // [inner_dim, w*h, n]

        auto num_frames = ggml_arange(ctx, 0, timesteps, 1);
        // sinusoidal frame-index embedding, projected by two learned Linear layers
        auto t_emb = ggml_nn_timestep_embedding(ctx, num_frames, in_channels, max_time_embed_period);

        auto emb = time_pos_embed_0->forward(ctx, t_emb);
        emb      = ggml_silu_inplace(ctx, emb);
        emb      = time_pos_embed_2->forward(ctx, emb);
        emb      = ggml_reshape_3d(ctx, emb, emb->ne[0], 1, emb->ne[1]);  // [in_channels, 1, n]

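        // Each depth level interleaves a spatial transformer block with a temporal
        // ("time_stack") block. For the temporal pass, the token tensor is transposed
        // from [C, S, T*B] to [C, T, S*B] so attention runs across frames (T)
        // instead of across spatial positions (S), then transposed back.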
        for (int i = 0; i < depth; i++) {
            std::string transformer_name = "transformer_blocks." + std::to_string(i);
            std::string time_stack_name  = "time_stack." + std::to_string(i);

            auto block     = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[transformer_name]);
            auto mix_block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[time_stack_name]);

            x = block->forward(ctx, x, spatial_context);

            auto x_mix = x;
            x_mix      = ggml_add(ctx, x_mix, emb);

            int64_t N = x_mix->ne[2];
            int64_t T = timesteps;
            int64_t B = N / T;
            int64_t S = x_mix->ne[1];
            int64_t C = x_mix->ne[0];

            x_mix = ggml_reshape_4d(ctx, x_mix, C, S, T, B);
            x_mix = ggml_cont(ctx, ggml_permute(ctx, x_mix, 0, 2, 1, 3));
            x_mix = ggml_reshape_3d(ctx, x_mix, C, T, S * B);

            x_mix = mix_block->forward(ctx, x_mix, time_context);

            x_mix = ggml_reshape_4d(ctx, x_mix, C, T, S, B);
            x_mix = ggml_cont(ctx, ggml_permute(ctx, x_mix, 0, 2, 1, 3));
            x_mix = ggml_reshape_3d(ctx, x_mix, C, S, T * B);

            x = time_mixer->forward(ctx, x, x_mix);
        }

        x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3));
        x = ggml_reshape_4d(ctx, x, w, h, inner_dim, n);

        x = proj_out->forward(ctx, x);

        x = ggml_add(ctx, x, x_in);
        return x;
    }
};

class UnetModelBlock : public GGMLBlock {
protected:
    SDVersion version = VERSION_SD1;

    int in_channels                        = 4;
    int out_channels                       = 4;
    int num_res_blocks                     = 2;
    std::vector<int> attention_resolutions = {4, 2, 1};
    std::vector<int> channel_mult          = {1, 2, 4, 4};
    std::vector<int> transformer_depth     = {1, 1, 1, 1};
    int time_embed_dim                     = 1280;
    int num_heads                          = 8;
    int num_head_channels                  = -1;
    int context_dim                        = 768;

public:
    int model_channels  = 320;
    int adm_in_channels = 2816;

    UnetModelBlock(SDVersion version = VERSION_SD1, bool flash_attn = false)
        : version(version) {
        if (version == VERSION_SD2) {
            context_dim       = 1024;
            num_head_channels = 64;
            num_heads         = -1;
        } else if (version == VERSION_SDXL) {
            context_dim           = 2048;
            attention_resolutions = {4, 2};
            channel_mult          = {1, 2, 4};
            transformer_depth     = {1, 2, 10};
            num_head_channels     = 64;
            num_heads             = -1;
        } else if (version == VERSION_SVD) {
            in_channels       = 8;
            out_channels      = 4;
            context_dim       = 1024;
            adm_in_channels   = 768;
            num_head_channels = 64;
            num_heads         = -1;
        }
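
        // The defaults above are the SD 1.x configuration; the branches adjust
        // context width and head layout per version, and SVD additionally widens
        // in_channels to 8 so conditioning latents can be concatenated onto the
        // 4-channel noisy latent (see c_concat in forward()).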
        blocks["time_embed.0"] = std::shared_ptr<GGMLBlock>(new Linear(model_channels, time_embed_dim));
        blocks["time_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));

        if (version == VERSION_SDXL || version == VERSION_SVD) {
            blocks["label_emb.0.0"] = std::shared_ptr<GGMLBlock>(new Linear(adm_in_channels, time_embed_dim));
            blocks["label_emb.0.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
        }

        blocks["input_blocks.0.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1}));

        std::vector<int> input_block_chans;
        input_block_chans.push_back(model_channels);
        int ch              = model_channels;
        int input_block_idx = 0;
        int ds              = 1;

        auto get_resblock = [&](int64_t channels, int64_t emb_channels, int64_t out_channels) -> ResBlock* {
            if (version == VERSION_SVD) {
                return new VideoResBlock(channels, emb_channels, out_channels);
            } else {
                return new ResBlock(channels, emb_channels, out_channels);
            }
        };

        auto get_attention_layer = [&](int64_t in_channels,
                                       int64_t n_head,
                                       int64_t d_head,
                                       int64_t depth,
                                       int64_t context_dim) -> SpatialTransformer* {
            if (version == VERSION_SVD) {
                return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim);
            } else {
                return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, flash_attn);
            }
        };
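
        // Encoder: for each resolution level, num_res_blocks ResBlocks (with an
        // attention layer whenever the current downsample factor ds is listed in
        // attention_resolutions), then a DownSampleBlock between levels. Every
        // intermediate channel count is recorded in input_block_chans so the
        // decoder can size its skip connections.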
        size_t len_mults = channel_mult.size();
        for (int i = 0; i < len_mults; i++) {
            int mult = channel_mult[i];
            for (int j = 0; j < num_res_blocks; j++) {
                input_block_idx += 1;
                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
                blocks[name]     = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, mult * model_channels));

                ch = mult * model_channels;
                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                    int n_head = num_heads;
                    int d_head = ch / num_heads;
                    if (num_head_channels != -1) {
                        d_head = num_head_channels;
                        n_head = ch / d_head;
                    }
                    std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
                    blocks[name]     = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
                                                                                      n_head,
                                                                                      d_head,
                                                                                      transformer_depth[i],
                                                                                      context_dim));
                }
                input_block_chans.push_back(ch);
            }
            if (i != len_mults - 1) {
                input_block_idx += 1;
                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
                blocks[name]     = std::shared_ptr<GGMLBlock>(new DownSampleBlock(ch, ch));

                input_block_chans.push_back(ch);
                ds *= 2;
            }
        }

        int n_head = num_heads;
        int d_head = ch / num_heads;
        if (num_head_channels != -1) {
            d_head = num_head_channels;
            n_head = ch / d_head;
        }
        blocks["middle_block.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
        blocks["middle_block.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
                                                                                  n_head,
                                                                                  d_head,
                                                                                  transformer_depth[transformer_depth.size() - 1],
                                                                                  context_dim));
        blocks["middle_block.2"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
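
        // Decoder: mirrors the encoder in reverse. Each level runs
        // num_res_blocks + 1 ResBlocks, each consuming one skip tensor popped
        // off input_block_chans, with an UpSampleBlock at the end of every
        // level except the last.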
        int output_block_idx = 0;
        for (int i = (int)len_mults - 1; i >= 0; i--) {
            int mult = channel_mult[i];
            for (int j = 0; j < num_res_blocks + 1; j++) {
                int ich = input_block_chans.back();
                input_block_chans.pop_back();

                std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0";
                blocks[name]     = std::shared_ptr<GGMLBlock>(get_resblock(ch + ich, time_embed_dim, mult * model_channels));

                ch                = mult * model_channels;
                int up_sample_idx = 1;
                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                    int n_head = num_heads;
                    int d_head = ch / num_heads;
                    if (num_head_channels != -1) {
                        d_head = num_head_channels;
                        n_head = ch / d_head;
                    }
                    std::string name = "output_blocks." + std::to_string(output_block_idx) + ".1";
                    blocks[name]     = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, n_head, d_head, transformer_depth[i], context_dim));

                    up_sample_idx++;
                }

                if (i > 0 && j == num_res_blocks) {
                    std::string name = "output_blocks." + std::to_string(output_block_idx) + "." + std::to_string(up_sample_idx);
                    blocks[name]     = std::shared_ptr<GGMLBlock>(new UpSampleBlock(ch, ch));

                    ds /= 2;
                }

                output_block_idx += 1;
            }
        }

        blocks["out.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(ch));
        blocks["out.2"] = std::shared_ptr<GGMLBlock>(new Conv2d(model_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
    }

    struct ggml_tensor* resblock_forward(std::string name,
                                         struct ggml_context* ctx,
                                         struct ggml_tensor* x,
                                         struct ggml_tensor* emb,
                                         int num_video_frames) {
        if (version == VERSION_SVD) {
            auto block = std::dynamic_pointer_cast<VideoResBlock>(blocks[name]);

            return block->forward(ctx, x, emb, num_video_frames);
        } else {
            auto block = std::dynamic_pointer_cast<ResBlock>(blocks[name]);

            return block->forward(ctx, x, emb);
        }
    }

    struct ggml_tensor* attention_layer_forward(std::string name,
                                                struct ggml_context* ctx,
                                                struct ggml_tensor* x,
                                                struct ggml_tensor* context,
                                                int timesteps) {
        if (version == VERSION_SVD) {
            auto block = std::dynamic_pointer_cast<SpatialVideoTransformer>(blocks[name]);

            return block->forward(ctx, x, context, timesteps);
        } else {
            auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);

            return block->forward(ctx, x, context);
        }
    }

    struct ggml_tensor* forward(struct ggml_context* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* timesteps,
                                struct ggml_tensor* context,
                                struct ggml_tensor* c_concat              = NULL,
                                struct ggml_tensor* y                     = NULL,
                                int num_video_frames                      = -1,
                                std::vector<struct ggml_tensor*> controls = {},
                                float control_strength                    = 0.f) {
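        // x:         [w, h, in_channels, N] latent (N == num_video_frames for SVD)
        // timesteps: [N] diffusion timesteps
        // context:   [context_dim, n_context, N] cross-attention conditioning
        // c_concat:  extra channels concatenated onto x (e.g. SVD's conditioning
        //            frame latents), broadcast over the batch if needed
        // y:         [adm_in_channels, N] vector conditioning (SDXL / SVD only)
        // controls:  per-block residuals from a ControlNet, scaled by control_strength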
        if (context != NULL) {
            if (context->ne[2] != x->ne[3]) {
                context = ggml_repeat(ctx, context, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
            }
        }

        if (c_concat != NULL) {
            if (c_concat->ne[3] != x->ne[3]) {
                c_concat = ggml_repeat(ctx, c_concat, x);
            }
            x = ggml_concat(ctx, x, c_concat, 2);
        }

        if (y != NULL) {
            if (y->ne[1] != x->ne[3]) {
                y = ggml_repeat(ctx, y, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
            }
        }

        auto time_embed_0     = std::dynamic_pointer_cast<Linear>(blocks["time_embed.0"]);
        auto time_embed_2     = std::dynamic_pointer_cast<Linear>(blocks["time_embed.2"]);
        auto input_blocks_0_0 = std::dynamic_pointer_cast<Conv2d>(blocks["input_blocks.0.0"]);

        auto out_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["out.0"]);
        auto out_2 = std::dynamic_pointer_cast<Conv2d>(blocks["out.2"]);

        auto t_emb = ggml_nn_timestep_embedding(ctx, timesteps, model_channels);

        auto emb = time_embed_0->forward(ctx, t_emb);
        emb      = ggml_silu_inplace(ctx, emb);
        emb      = time_embed_2->forward(ctx, emb);

        if (y != NULL) {
            auto label_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.0"]);
            auto label_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.2"]);

            auto label_emb = label_embed_0->forward(ctx, y);
            label_emb      = ggml_silu_inplace(ctx, label_emb);
            label_emb      = label_embed_2->forward(ctx, label_emb);

            emb = ggml_add(ctx, emb, label_emb);
        }

        std::vector<struct ggml_tensor*> hs;

        auto h = input_blocks_0_0->forward(ctx, x);

        ggml_set_name(h, "bench-start");
        hs.push_back(h);

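        // Down path: push every intermediate activation onto hs; the up path
        // below pops them back off as U-Net skip connections.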
        size_t len_mults    = channel_mult.size();
        int input_block_idx = 0;
        int ds              = 1;
        for (int i = 0; i < len_mults; i++) {
            int mult = channel_mult[i];
            for (int j = 0; j < num_res_blocks; j++) {
                input_block_idx += 1;
                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
                h                = resblock_forward(name, ctx, h, emb, num_video_frames);
                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                    std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
                    h                = attention_layer_forward(name, ctx, h, context, num_video_frames);
                }
                hs.push_back(h);
            }
            if (i != len_mults - 1) {
                ds *= 2;
                input_block_idx += 1;

                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
                auto block       = std::dynamic_pointer_cast<DownSampleBlock>(blocks[name]);

                h = block->forward(ctx, h);
                hs.push_back(h);
            }
        }

        h = resblock_forward("middle_block.0", ctx, h, emb, num_video_frames);
        h = attention_layer_forward("middle_block.1", ctx, h, context, num_video_frames);
        h = resblock_forward("middle_block.2", ctx, h, emb, num_video_frames);

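        // ControlNet residuals: the last entry is added to the middle block
        // output; the remaining entries are walked backwards via control_offset,
        // one per skip connection popped in the up path.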
        if (controls.size() > 0) {
            auto cs = ggml_scale_inplace(ctx, controls[controls.size() - 1], control_strength);
            h       = ggml_add(ctx, h, cs);
        }
        int control_offset = controls.size() - 2;

        int output_block_idx = 0;
        for (int i = (int)len_mults - 1; i >= 0; i--) {
            for (int j = 0; j < num_res_blocks + 1; j++) {
                auto h_skip = hs.back();
                hs.pop_back();

                if (controls.size() > 0) {
                    auto cs = ggml_scale_inplace(ctx, controls[control_offset], control_strength);
                    h_skip  = ggml_add(ctx, h_skip, cs);
                    control_offset--;
                }

                h = ggml_concat(ctx, h, h_skip, 2);

                std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0";

                h = resblock_forward(name, ctx, h, emb, num_video_frames);

                int up_sample_idx = 1;
                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                    std::string name = "output_blocks." + std::to_string(output_block_idx) + ".1";

                    h = attention_layer_forward(name, ctx, h, context, num_video_frames);

                    up_sample_idx++;
                }

                if (i > 0 && j == num_res_blocks) {
                    std::string name = "output_blocks." + std::to_string(output_block_idx) + "." + std::to_string(up_sample_idx);
                    auto block       = std::dynamic_pointer_cast<UpSampleBlock>(blocks[name]);

                    h = block->forward(ctx, h);

                    ds /= 2;
                }

                output_block_idx += 1;
            }
        }

        h = out_0->forward(ctx, h);
        h = ggml_silu_inplace(ctx, h);
        h = out_2->forward(ctx, h);
        ggml_set_name(h, "bench-end");
        return h;
    }
};

struct UNetModelRunner : public GGMLRunner {
    UnetModelBlock unet;

    UNetModelRunner(ggml_backend_t backend,
                    std::map<std::string, enum ggml_type>& tensor_types,
                    const std::string prefix,
                    SDVersion version = VERSION_SD1,
                    bool flash_attn   = false)
        : GGMLRunner(backend), unet(version, flash_attn) {
        unet.init(params_ctx, tensor_types, prefix);
    }

    std::string get_desc() {
        return "unet";
    }

    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
        unet.get_param_tensors(tensors, prefix);
    }

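    // Moves the input tensors onto the backend via to_backend(), then records
    // one UNet forward pass into a ggml compute graph; UNET_GRAPH_SIZE bounds
    // the graph's node count.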
    struct ggml_cgraph* build_graph(struct ggml_tensor* x,
                                    struct ggml_tensor* timesteps,
                                    struct ggml_tensor* context,
                                    struct ggml_tensor* c_concat              = NULL,
                                    struct ggml_tensor* y                     = NULL,
                                    int num_video_frames                      = -1,
                                    std::vector<struct ggml_tensor*> controls = {},
                                    float control_strength                    = 0.f) {
        struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, UNET_GRAPH_SIZE, false);

        if (num_video_frames == -1) {
            num_video_frames = x->ne[3];
        }

        x         = to_backend(x);
        context   = to_backend(context);
        c_concat  = to_backend(c_concat);  // was missing; NULL passes through like y
        y         = to_backend(y);
        timesteps = to_backend(timesteps);

        for (int i = 0; i < controls.size(); i++) {
            controls[i] = to_backend(controls[i]);
        }

        struct ggml_tensor* out = unet.forward(compute_ctx,
                                               x,
                                               timesteps,
                                               context,
                                               c_concat,
                                               y,
                                               num_video_frames,
                                               controls,
                                               control_strength);

        ggml_build_forward_expand(gf, out);

        return gf;
    }

    void compute(int n_threads,
                 struct ggml_tensor* x,
                 struct ggml_tensor* timesteps,
                 struct ggml_tensor* context,
                 struct ggml_tensor* c_concat,
                 struct ggml_tensor* y,
                 int num_video_frames                      = -1,
                 std::vector<struct ggml_tensor*> controls = {},
                 float control_strength                    = 0.f,
                 struct ggml_tensor** output               = NULL,
                 struct ggml_context* output_ctx           = NULL) {
        auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_graph(x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength);
        };

        GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
    }

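    // Smoke test: runs a dummy SVD-shaped forward pass (8x8x8 latent, 3 frames,
    // 1024-wide context, 768-wide vector conditioning) on constant inputs and
    // prints the output tensor; the result is copied into work_ctx via the
    // output/output_ctx parameters of compute().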
    void test() {
        struct ggml_init_params params;
        params.mem_size   = static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
        params.mem_buffer = NULL;
        params.no_alloc   = false;

        struct ggml_context* work_ctx = ggml_init(params);
        GGML_ASSERT(work_ctx != NULL);

        {
            int num_video_frames = 3;

            auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 8, num_video_frames);
            ggml_set_f32(x, 0.5f);

            std::vector<float> timesteps_vec(num_video_frames, 999.f);
            auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);

            auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 1024, 1, num_video_frames);
            ggml_set_f32(context, 0.5f);

            auto y = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 768, num_video_frames);
            ggml_set_f32(y, 0.5f);

            struct ggml_tensor* out = NULL;

            int64_t t0 = ggml_time_ms();
            compute(8, x, timesteps, context, NULL, y, num_video_frames, {}, 0.f, &out, work_ctx);
            int64_t t1 = ggml_time_ms();

            print_ggml_tensor(out);
            LOG_DEBUG("unet test done in %dms", (int)(t1 - t0));
        }
    }
};

#endif  // __UNET_HPP__