From 524083313ba92e6952726491bbdda89cc147aeb8 Mon Sep 17 00:00:00 2001 From: rmatif Date: Mon, 2 Mar 2026 16:54:03 +0100 Subject: [PATCH 1/2] add step --- examples/cli/main.cpp | 142 +++- examples/common/common.hpp | 60 +- ggml | 2 +- include/stable-diffusion.h | 27 + src/ace.hpp | 1346 ++++++++++++++++++++++++++++++++++++ src/ace_vae.hpp | 306 ++++++++ src/conditioner.hpp | 820 ++++++++++++++++++++++ src/denoiser.hpp | 74 +- src/diffusion_model.hpp | 70 ++ src/ggml_extend.hpp | 557 ++++++++++++++- src/llm.hpp | 964 +++++++++++++++++++++++++- src/lora.hpp | 13 + src/model.cpp | 6 + src/model.h | 11 +- src/stable-diffusion.cpp | 405 ++++++++++- src/tokenize_util.cpp | 32 +- 16 files changed, 4788 insertions(+), 47 deletions(-) create mode 100644 src/ace.hpp create mode 100644 src/ace_vae.hpp diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index f9e4928ea..c65ee82c7 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -149,7 +150,7 @@ struct SDCliParams { options.manual_options = { {"-M", "--mode", - "run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen", + "run mode, one of [img_gen, vid_gen, audio_gen, upscale, convert], default: img_gen", on_mode_arg}, {"", "--preview", @@ -170,6 +171,10 @@ struct SDCliParams { return false; } + if (mode == AUDIO_GEN && output_path == "output.png") { + output_path = "output.wav"; + } + if (mode == CONVERT) { if (output_path == "output.png") { output_path = "output.gguf"; @@ -370,6 +375,87 @@ std::string format_frame_idx(std::string pattern, int frame_idx) { return result; } +static bool write_wav(const std::string& path, const sd_audio_t& audio) { + if (audio.data == nullptr || audio.sample_count == 0 || audio.channels == 0) { + return false; + } + + FILE* f = fopen(path.c_str(), "wb"); + if (!f) { + return false; + } + + uint32_t sample_rate = audio.sample_rate; + uint16_t channels = 
static_cast(audio.channels); + uint16_t bits_per_sample = 16; + uint32_t byte_rate = sample_rate * channels * (bits_per_sample / 8); + uint16_t block_align = channels * (bits_per_sample / 8); + uint32_t data_size = audio.sample_count * channels * (bits_per_sample / 8); + uint32_t chunk_size = 36 + data_size; + + fwrite("RIFF", 1, 4, f); + fwrite(&chunk_size, 4, 1, f); + fwrite("WAVE", 1, 4, f); + fwrite("fmt ", 1, 4, f); + uint32_t subchunk1_size = 16; + uint16_t audio_format = 1; + fwrite(&subchunk1_size, 4, 1, f); + fwrite(&audio_format, 2, 1, f); + fwrite(&channels, 2, 1, f); + fwrite(&sample_rate, 4, 1, f); + fwrite(&byte_rate, 4, 1, f); + fwrite(&block_align, 2, 1, f); + fwrite(&bits_per_sample, 2, 1, f); + fwrite("data", 1, 4, f); + fwrite(&data_size, 4, 1, f); + + for (uint32_t i = 0; i < audio.sample_count * audio.channels; ++i) { + float v = audio.data[i]; + if (v > 1.0f) v = 1.0f; + if (v < -1.0f) v = -1.0f; + int16_t s = (int16_t)std::lrintf(v * 32767.0f); + fwrite(&s, sizeof(int16_t), 1, f); + } + + fclose(f); + return true; +} + +bool save_audio_result(const SDCliParams& cli_params, + const SDGenerationParams& gen_params, + const sd_audio_t& audio) { + (void)gen_params; + namespace fs = std::filesystem; + fs::path out_path = cli_params.output_path; + + if (!out_path.parent_path().empty()) { + std::error_code ec; + fs::create_directories(out_path.parent_path(), ec); + if (ec) { + LOG_ERROR("failed to create directory '%s': %s", + out_path.parent_path().string().c_str(), ec.message().c_str()); + return false; + } + } + + fs::path base_path = out_path; + fs::path ext = out_path.has_extension() ? 
out_path.extension() : fs::path{}; + if (!ext.empty()) + base_path.replace_extension(); + + std::string ext_lower = ext.string(); + std::transform(ext_lower.begin(), ext_lower.end(), ext_lower.begin(), ::tolower); + if (ext_lower != ".wav") { + ext = ".wav"; + } + + fs::path audio_path = base_path; + audio_path += ext; + bool ok = write_wav(audio_path.string(), audio); + LOG_INFO("save result audio to '%s' (%s)", audio_path.string().c_str(), ok ? "success" : "failure"); + return ok; +} + bool save_results(const SDCliParams& cli_params, const SDContextParams& ctx_params, const SDGenerationParams& gen_params, @@ -501,6 +587,10 @@ int main(int argc, const char* argv[]) { cli_params.preview_fps = gen_params.fps; if (cli_params.preview_method == PREVIEW_PROJ) cli_params.preview_fps /= 4; + if (cli_params.mode == AUDIO_GEN) { + cli_params.preview_method = PREVIEW_NONE; + cli_params.preview_noisy = false; + } sd_set_log_callback(sd_log_cb, (void*)&cli_params); log_verbose = cli_params.verbose; @@ -540,6 +630,56 @@ int main(int argc, const char* argv[]) { } } + if (cli_params.mode == AUDIO_GEN) { + bool vae_decode_only = true; + sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(vae_decode_only, true, false); + + sd_ctx_t* sd_ctx = new_sd_ctx(&sd_ctx_params); + if (sd_ctx == nullptr) { + LOG_INFO("new_sd_ctx_t failed"); + return 1; + } + + if (gen_params.sample_params.sample_method == SAMPLE_METHOD_COUNT) { + gen_params.sample_params.sample_method = sd_get_default_sample_method(sd_ctx); + } + if (gen_params.sample_params.scheduler == SCHEDULER_COUNT) { + gen_params.sample_params.scheduler = sd_get_default_scheduler(sd_ctx, gen_params.sample_params.sample_method); + } + if (gen_params.sample_params.guidance.txt_cfg == 7.0f) { + gen_params.sample_params.guidance.txt_cfg = 1.0f; + } + + sd_audio_gen_params_t audio_params = { + gen_params.lora_vec.data(), + static_cast(gen_params.lora_vec.size()), + gen_params.prompt.c_str(), + gen_params.negative_prompt.c_str(), + 
gen_params.lyrics.c_str(), + gen_params.keyscale.c_str(), + gen_params.language.c_str(), + gen_params.bpm, + gen_params.duration, + gen_params.timesignature, + gen_params.lm_seed, + gen_params.sample_params, + gen_params.seed, + }; + + sd_audio_t* audio = generate_audio(sd_ctx, &audio_params); + if (audio == nullptr) { + LOG_ERROR("audio generation failed"); + free_sd_ctx(sd_ctx); + return 1; + } + + bool ok = save_audio_result(cli_params, gen_params, *audio); + free(audio->data); + free(audio); + free_sd_ctx(sd_ctx); + return ok ? 0 : 1; + } + bool vae_decode_only = true; sd_image_t init_image = {0, 0, 3, nullptr}; sd_image_t end_image = {0, 0, 3, nullptr}; diff --git a/examples/common/common.hpp b/examples/common/common.hpp index 369c1f07f..22c12f806 100644 --- a/examples/common/common.hpp +++ b/examples/common/common.hpp @@ -37,14 +37,16 @@ namespace fs = std::filesystem; const char* modes_str[] = { "img_gen", "vid_gen", + "audio_gen", "convert", "upscale", }; -#define SD_ALL_MODES_STR "img_gen, vid_gen, convert, upscale" +#define SD_ALL_MODES_STR "img_gen, vid_gen, audio_gen, convert, upscale" enum SDMode { IMG_GEN, VID_GEN, + AUDIO_GEN, CONVERT, UPSCALE, MODE_COUNT @@ -1024,6 +1026,13 @@ struct SDGenerationParams { std::string prompt; std::string prompt_with_lora; // for metadata record only std::string negative_prompt; + std::string lyrics; + std::string language = "en"; + std::string keyscale = "C major"; + float bpm = 120.f; + float duration = 120.f; + int timesignature = 2; + int lm_seed = 0; int clip_skip = -1; // <= 0 represents unspecified int width = -1; int height = -1; @@ -1090,6 +1099,18 @@ struct SDGenerationParams { "--negative-prompt", "the negative prompt (default: \"\")", &negative_prompt}, + {"", + "--lyrics", + "lyrics for ACE audio models", + &lyrics}, + {"", + "--language", + "language for ACE audio lyrics (default: en)", + &language}, + {"", + "--keyscale", + "keyscale for ACE audio (e.g. 
\"C major\")", + &keyscale}, {"-i", "--init-img", "path to the init image", @@ -1131,6 +1152,14 @@ struct SDGenerationParams { "--width", "image width, in pixel space (default: 512)", &width}, + {"", + "--timesignature", + "time signature for ACE audio (default: 2)", + &timesignature}, + {"", + "--lm-seed", + "seed for ACE audio semantic token generation (default: 0)", + &lm_seed}, {"", "--steps", "number of sample steps (default: 20)", @@ -1176,6 +1205,14 @@ struct SDGenerationParams { "--cfg-scale", "unconditional guidance scale: (default: 7.0)", &sample_params.guidance.txt_cfg}, + {"", + "--bpm", + "tempo in BPM for ACE audio (default: 120)", + &bpm}, + {"", + "--duration", + "duration in seconds for ACE audio (default: 120.0)", + &duration}, {"", "--img-cfg-scale", "image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)", @@ -1573,6 +1610,13 @@ struct SDGenerationParams { load_if_exists("prompt", prompt); load_if_exists("negative_prompt", negative_prompt); + load_if_exists("lyrics", lyrics); + load_if_exists("language", language); + load_if_exists("keyscale", keyscale); + load_if_exists("bpm", bpm); + load_if_exists("duration", duration); + load_if_exists("timesignature", timesignature); + load_if_exists("lm_seed", lm_seed); load_if_exists("cache_mode", cache_mode); load_if_exists("cache_option", cache_option); load_if_exists("cache_preset", cache_preset); @@ -1744,6 +1788,13 @@ struct SDGenerationParams { return false; } + if (mode == AUDIO_GEN) { + if (duration <= 0.f) { + LOG_ERROR("error: audio duration must be greater than 0\n"); + return false; + } + } + sd_cache_params_init(&cache_params); auto parse_named_params = [&](const std::string& opt_str) -> bool { @@ -1937,6 +1988,13 @@ struct SDGenerationParams { << " high_noise_loras: \"" << high_noise_loras_str << "\",\n" << " prompt: \"" << prompt << "\",\n" << " negative_prompt: \"" << negative_prompt << "\",\n" + << " lyrics: \"" << lyrics << "\",\n" + << " language: \"" << 
language << "\",\n" + << " keyscale: \"" << keyscale << "\",\n" + << " bpm: " << bpm << ",\n" + << " duration: " << duration << ",\n" + << " timesignature: " << timesignature << ",\n" + << " lm_seed: " << lm_seed << ",\n" << " clip_skip: " << clip_skip << ",\n" << " width: " << width << ",\n" << " height: " << height << ",\n" diff --git a/ggml b/ggml index a8db410a2..e1132c58a 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit a8db410a252c8c8f2d120c6f2e7133ebe032f35d +Subproject commit e1132c58a83813ca3485617663da744dc8e164e6 diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index 51b2b3291..1bf8af530 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -210,6 +210,13 @@ typedef struct { uint8_t* data; } sd_image_t; +typedef struct { + uint32_t sample_rate; + uint32_t channels; + uint32_t sample_count; + float* data; +} sd_audio_t; + typedef struct { int* layers; size_t layer_count; @@ -304,6 +311,22 @@ typedef struct { sd_cache_params_t cache; } sd_img_gen_params_t; +typedef struct { + const sd_lora_t* loras; + uint32_t lora_count; + const char* prompt; + const char* negative_prompt; + const char* lyrics; + const char* keyscale; + const char* language; + float bpm; + float duration; + int timesignature; + int lm_seed; + sd_sample_params_t sample_params; + int64_t seed; +} sd_audio_gen_params_t; + typedef struct { const sd_lora_t* loras; uint32_t lora_count; @@ -372,6 +395,10 @@ SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params); SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params); SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params); +SD_API void sd_audio_gen_params_init(sd_audio_gen_params_t* sd_audio_gen_params); +SD_API char* sd_audio_gen_params_to_str(const sd_audio_gen_params_t* sd_audio_gen_params); +SD_API sd_audio_t* generate_audio(sd_ctx_t* sd_ctx, const sd_audio_gen_params_t* sd_audio_gen_params); + 
SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params); SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, int* num_frames_out); diff --git a/src/ace.hpp b/src/ace.hpp new file mode 100644 index 000000000..d9b553a11 --- /dev/null +++ b/src/ace.hpp @@ -0,0 +1,1346 @@ +#ifndef __ACE_HPP__ +#define __ACE_HPP__ + +#include +#include +#include +#include +#include + +#include "ggml_extend.hpp" +#include "model.h" + +#define ACE_GRAPH_SIZE 81920 + +namespace ACE { + +static inline ggml_tensor* repeat_like(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* like) { + if (ggml_are_same_shape(x, like)) { + if (ggml_is_contiguous(x)) { + return x; + } + return ggml_cont(ctx->ggml_ctx, x); + } + return ggml_ext_repeat(ctx->ggml_ctx, x, like); +} + +static inline ggml_tensor* cont_if_needed(GGMLRunnerContext* ctx, ggml_tensor* x) { + if (ggml_is_contiguous(x)) { + return x; + } + return ggml_cont(ctx->ggml_ctx, x); +} + +static inline ggml_tensor* add_cont(GGMLRunnerContext* ctx, ggml_tensor* a, ggml_tensor* b) { + return ggml_add(ctx->ggml_ctx, cont_if_needed(ctx, a), cont_if_needed(ctx, b)); +} + +static inline ggml_tensor* repeat_kv_heads(GGMLRunnerContext* ctx, ggml_tensor* x, int64_t num_kv_heads, int64_t num_heads) { + if (num_kv_heads == num_heads) { + return x; + } + GGML_ASSERT(num_kv_heads > 0 && num_heads % num_kv_heads == 0); + + int64_t n_rep = num_heads / num_kv_heads; + int64_t d = x->ne[0]; + int64_t L = x->ne[2]; + int64_t B = x->ne[3]; + + auto x3 = ggml_reshape_3d(ctx->ggml_ctx, x, d, num_kv_heads, L * B); + auto x4 = ggml_reshape_4d(ctx->ggml_ctx, x3, d, 1, num_kv_heads, L * B); + auto repeat_target = ggml_new_tensor_4d(ctx->ggml_ctx, x->type, d, n_rep, num_kv_heads, L * B); + x4 = ggml_ext_repeat(ctx->ggml_ctx, x4, repeat_target); + auto x3r = ggml_reshape_3d(ctx->ggml_ctx, x4, d, num_heads, L * B); + auto x4r = ggml_reshape_4d(ctx->ggml_ctx, x3r, d, num_heads, L, B); + return x4r; +} + 
+static inline ggml_tensor* slice_dim1(GGMLRunnerContext* ctx, ggml_tensor* x, int64_t idx) { + return ggml_ext_slice(ctx->ggml_ctx, x, 1, idx, idx + 1, true); +} + +static inline ggml_tensor* swap_dim0_dim1(GGMLRunnerContext* ctx, ggml_tensor* x) { + return ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); +} + +struct AceMLP : public UnaryBlock { +public: + AceMLP(int64_t hidden_size, int64_t intermediate_size) { + blocks["gate_proj"] = std::make_shared(hidden_size, intermediate_size, false); + blocks["up_proj"] = std::make_shared(hidden_size, intermediate_size, false); + blocks["down_proj"] = std::make_shared(intermediate_size, hidden_size, false); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + auto gate_proj = std::dynamic_pointer_cast(blocks["gate_proj"]); + auto up_proj = std::dynamic_pointer_cast(blocks["up_proj"]); + auto down_proj = std::dynamic_pointer_cast(blocks["down_proj"]); + + auto gate = gate_proj->forward(ctx, x); + gate = ggml_silu_inplace(ctx->ggml_ctx, gate); + x = up_proj->forward(ctx, x); + x = ggml_mul(ctx->ggml_ctx, x, gate); + x = down_proj->forward(ctx, x); + return x; + } +}; + +struct TimestepEmbedding : public GGMLBlock { + int64_t in_channels; + int64_t time_embed_dim; + float time_factor; + + TimestepEmbedding(int64_t in_channels, int64_t time_embed_dim, float time_factor = 1000.f) + : in_channels(in_channels), time_embed_dim(time_embed_dim), time_factor(time_factor) { + blocks["linear_1"] = std::make_shared(in_channels, time_embed_dim, true); + blocks["linear_2"] = std::make_shared(time_embed_dim, time_embed_dim, true); + blocks["time_proj"] = std::make_shared(time_embed_dim, time_embed_dim * 6, true); + } + + std::pair forward(GGMLRunnerContext* ctx, ggml_tensor* t) { + auto linear_1 = std::dynamic_pointer_cast(blocks["linear_1"]); + auto linear_2 = std::dynamic_pointer_cast(blocks["linear_2"]); + auto time_proj = std::dynamic_pointer_cast(blocks["time_proj"]); + + auto t_freq 
= ggml_ext_timestep_embedding(ctx->ggml_ctx, t, (int)in_channels, 10000, time_factor); + auto temb = linear_1->forward(ctx, t_freq); + temb = ggml_silu_inplace(ctx->ggml_ctx, temb); + temb = linear_2->forward(ctx, temb); + + // ggml_dup_tensor only allocates shape; we need ggml_dup to feed actual temb values into SiLU. + auto temb_act = ggml_silu_inplace(ctx->ggml_ctx, ggml_dup(ctx->ggml_ctx, temb)); + auto proj = time_proj->forward(ctx, temb_act); // [hidden*6, B] + proj = ggml_reshape_3d(ctx->ggml_ctx, proj, time_embed_dim, 6, proj->ne[1]); + + return {temb, proj}; + } +}; + +struct AceStepAttention : public GGMLBlock { + int64_t hidden_size; + int64_t num_heads; + int64_t num_kv_heads; + int64_t head_dim; + bool is_cross_attention; + + AceStepAttention(int64_t hidden_size, + int64_t num_heads, + int64_t num_kv_heads, + int64_t head_dim, + bool is_cross_attention = false) + : hidden_size(hidden_size), + num_heads(num_heads), + num_kv_heads(num_kv_heads), + head_dim(head_dim), + is_cross_attention(is_cross_attention) { + blocks["q_proj"] = std::make_shared(hidden_size, num_heads * head_dim, false); + blocks["k_proj"] = std::make_shared(hidden_size, num_kv_heads * head_dim, false); + blocks["v_proj"] = std::make_shared(hidden_size, num_kv_heads * head_dim, false); + blocks["o_proj"] = std::make_shared(num_heads * head_dim, hidden_size, false); + blocks["q_norm"] = std::make_shared(head_dim, 1e-6f); + blocks["k_norm"] = std::make_shared(head_dim, 1e-6f); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* hidden_states, + ggml_tensor* encoder_hidden_states, + ggml_tensor* attention_mask, + ggml_tensor* input_pos) { + int64_t q_len = hidden_states->ne[1]; + int64_t B = hidden_states->ne[2]; + + auto q_proj = std::dynamic_pointer_cast(blocks["q_proj"]); + auto k_proj = std::dynamic_pointer_cast(blocks["k_proj"]); + auto v_proj = std::dynamic_pointer_cast(blocks["v_proj"]); + auto o_proj = std::dynamic_pointer_cast(blocks["o_proj"]); + auto q_norm = 
std::dynamic_pointer_cast(blocks["q_norm"]); + auto k_norm = std::dynamic_pointer_cast(blocks["k_norm"]); + + auto q = q_proj->forward(ctx, hidden_states); // [Hq, q_len, B] + + ggml_tensor* kv_states = hidden_states; + if (is_cross_attention && encoder_hidden_states != nullptr) { + kv_states = encoder_hidden_states; + } + + int64_t kv_len = kv_states->ne[1]; + auto k = k_proj->forward(ctx, kv_states); + auto v = v_proj->forward(ctx, kv_states); + + q = ggml_reshape_4d(ctx->ggml_ctx, q, head_dim, num_heads, q_len, B); + k = ggml_reshape_4d(ctx->ggml_ctx, k, head_dim, num_kv_heads, kv_len, B); + v = ggml_reshape_4d(ctx->ggml_ctx, v, head_dim, num_kv_heads, kv_len, B); + + q = q_norm->forward(ctx, q); + k = k_norm->forward(ctx, k); + + if (!is_cross_attention && input_pos != nullptr) { + // Match ACE 1.5 rotary config (base=1e6, max_position_embeddings=32768). + q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, (int)head_dim, GGML_ROPE_TYPE_NEOX, 32768, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); + k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, (int)head_dim, GGML_ROPE_TYPE_NEOX, 32768, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); + } + + int64_t kv_heads_effective = num_kv_heads; + if (num_kv_heads != num_heads) { + // Match Comfy/PyTorch: repeat GQA K/V heads explicitly before attention. 
+ k = repeat_kv_heads(ctx, k, num_kv_heads, num_heads); + v = repeat_kv_heads(ctx, v, num_kv_heads, num_heads); + kv_heads_effective = num_heads; + } + + q = ggml_cont(ctx->ggml_ctx, q); + k = ggml_cont(ctx->ggml_ctx, k); + q = ggml_reshape_3d(ctx->ggml_ctx, q, head_dim * num_heads, q_len, B); + k = ggml_reshape_3d(ctx->ggml_ctx, k, head_dim * kv_heads_effective, kv_len, B); + v = ggml_cont(ctx->ggml_ctx, v); + v = ggml_reshape_3d(ctx->ggml_ctx, v, head_dim * kv_heads_effective, kv_len, B); + + const int64_t attn_batch = hidden_states->ne[2] * hidden_states->ne[3]; + const bool use_flash_attn = ctx->flash_attn_enabled && attn_batch == 1 && attention_mask == nullptr; + + auto attn = ggml_ext_attention_ext(ctx->ggml_ctx, + ctx->backend, + q, + k, + v, + num_heads, + attention_mask, + false, + use_flash_attn); + attn = o_proj->forward(ctx, attn); + return attn; + } +}; + +struct AceStepDiTLayer : public GGMLBlock { + int64_t hidden_size; + bool use_sliding; + + AceStepDiTLayer(int64_t hidden_size, + int64_t num_heads, + int64_t num_kv_heads, + int64_t head_dim, + int64_t intermediate_size, + bool use_sliding) + : hidden_size(hidden_size), + use_sliding(use_sliding) { + blocks["self_attn_norm"] = std::make_shared(hidden_size, 1e-6f); + blocks["self_attn"] = std::make_shared(hidden_size, num_heads, num_kv_heads, head_dim, false); + blocks["cross_attn_norm"] = std::make_shared(hidden_size, 1e-6f); + blocks["cross_attn"] = std::make_shared(hidden_size, num_heads, num_kv_heads, head_dim, true); + blocks["mlp_norm"] = std::make_shared(hidden_size, 1e-6f); + blocks["mlp"] = std::make_shared(hidden_size, intermediate_size); + } + +protected: + void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + enum ggml_type wtype = get_type(prefix + "scale_shift_table", tensor_storage_map, GGML_TYPE_F32); + params["scale_shift_table"] = ggml_new_tensor_3d(ctx, wtype, hidden_size, 6, 1); + 
GGMLBlock::init_params(ctx, tensor_storage_map, prefix); + } + +public: + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* hidden_states, + ggml_tensor* temb, + ggml_tensor* encoder_hidden_states, + ggml_tensor* input_pos, + ggml_tensor* attention_mask) { + auto self_attn_norm = std::dynamic_pointer_cast(blocks["self_attn_norm"]); + auto self_attn = std::dynamic_pointer_cast(blocks["self_attn"]); + auto cross_attn_norm = std::dynamic_pointer_cast(blocks["cross_attn_norm"]); + auto cross_attn = std::dynamic_pointer_cast(blocks["cross_attn"]); + auto mlp_norm = std::dynamic_pointer_cast(blocks["mlp_norm"]); + auto mlp = std::dynamic_pointer_cast(blocks["mlp"]); + + auto scale_shift_table = params["scale_shift_table"]; + auto scale_shift = ggml_ext_repeat(ctx->ggml_ctx, scale_shift_table, temb); + auto modulation = add_cont(ctx, scale_shift, temb); + + auto shift_msa = slice_dim1(ctx, modulation, 0); + auto scale_msa = slice_dim1(ctx, modulation, 1); + auto gate_msa = slice_dim1(ctx, modulation, 2); + auto c_shift_msa = slice_dim1(ctx, modulation, 3); + auto c_scale_msa = slice_dim1(ctx, modulation, 4); + auto c_gate_msa = slice_dim1(ctx, modulation, 5); + + auto norm_hidden = self_attn_norm->forward(ctx, hidden_states); + auto scale_b = repeat_like(ctx, scale_msa, norm_hidden); + auto shift_b = repeat_like(ctx, shift_msa, norm_hidden); + auto ones = ggml_ext_ones(ctx->ggml_ctx, norm_hidden->ne[0], norm_hidden->ne[1], norm_hidden->ne[2], norm_hidden->ne[3]); + auto scale_one = add_cont(ctx, scale_b, ones); + norm_hidden = ggml_mul(ctx->ggml_ctx, norm_hidden, scale_one); + norm_hidden = add_cont(ctx, norm_hidden, shift_b); + + auto attn_out = self_attn->forward(ctx, norm_hidden, nullptr, use_sliding ? 
attention_mask : nullptr, input_pos); + auto gate_b = repeat_like(ctx, gate_msa, attn_out); + attn_out = ggml_mul(ctx->ggml_ctx, attn_out, gate_b); + hidden_states = add_cont(ctx, hidden_states, attn_out); + + auto norm_hidden_cross = cross_attn_norm->forward(ctx, hidden_states); + auto cross_out = cross_attn->forward(ctx, norm_hidden_cross, encoder_hidden_states, nullptr, nullptr); + hidden_states = add_cont(ctx, hidden_states, cross_out); + + auto norm_hidden_mlp = mlp_norm->forward(ctx, hidden_states); + auto c_scale_b = repeat_like(ctx, c_scale_msa, norm_hidden_mlp); + auto c_shift_b = repeat_like(ctx, c_shift_msa, norm_hidden_mlp); + auto c_scale_one = add_cont(ctx, c_scale_b, ones); + norm_hidden_mlp = ggml_mul(ctx->ggml_ctx, norm_hidden_mlp, c_scale_one); + norm_hidden_mlp = add_cont(ctx, norm_hidden_mlp, c_shift_b); + auto mlp_out = mlp->forward(ctx, norm_hidden_mlp); + auto c_gate_b = repeat_like(ctx, c_gate_msa, mlp_out); + mlp_out = ggml_mul(ctx->ggml_ctx, mlp_out, c_gate_b); + hidden_states = add_cont(ctx, hidden_states, mlp_out); + + return hidden_states; + } +}; + +struct AceStepEncoderLayer : public GGMLBlock { + AceStepEncoderLayer(int64_t hidden_size, + int64_t num_heads, + int64_t num_kv_heads, + int64_t head_dim, + int64_t intermediate_size) { + blocks["self_attn"] = std::make_shared(hidden_size, num_heads, num_kv_heads, head_dim, false); + blocks["input_layernorm"] = std::make_shared(hidden_size, 1e-6f); + blocks["post_attention_layernorm"] = std::make_shared(hidden_size, 1e-6f); + blocks["mlp"] = std::make_shared(hidden_size, intermediate_size); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* hidden_states, + ggml_tensor* input_pos, + ggml_tensor* attention_mask) { + auto self_attn = std::dynamic_pointer_cast(blocks["self_attn"]); + auto ln_in = std::dynamic_pointer_cast(blocks["input_layernorm"]); + auto ln_post = std::dynamic_pointer_cast(blocks["post_attention_layernorm"]); + auto mlp = 
std::dynamic_pointer_cast(blocks["mlp"]); + + auto residual = hidden_states; + hidden_states = ln_in->forward(ctx, hidden_states); + hidden_states = self_attn->forward(ctx, hidden_states, nullptr, attention_mask, input_pos); + hidden_states = add_cont(ctx, hidden_states, residual); + + residual = hidden_states; + hidden_states = ln_post->forward(ctx, hidden_states); + hidden_states = mlp->forward(ctx, hidden_states); + hidden_states = add_cont(ctx, hidden_states, residual); + return hidden_states; + } +}; + +struct AceStepLyricEncoder : public GGMLBlock { + int64_t num_layers; + + AceStepLyricEncoder(int64_t text_hidden_dim, + int64_t hidden_size, + int64_t num_layers, + int64_t num_heads, + int64_t num_kv_heads, + int64_t head_dim, + int64_t intermediate_size) + : num_layers(num_layers) { + blocks["embed_tokens"] = std::make_shared(text_hidden_dim, hidden_size, true, false, false); + blocks["norm"] = std::make_shared(hidden_size, 1e-6f); + for (int i = 0; i < num_layers; ++i) { + blocks["layers." + std::to_string(i)] = std::make_shared(hidden_size, num_heads, num_kv_heads, head_dim, intermediate_size); + } + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* inputs_embeds, + ggml_tensor* input_pos) { + auto embed_tokens = std::dynamic_pointer_cast(blocks["embed_tokens"]); + auto norm = std::dynamic_pointer_cast(blocks["norm"]); + + inputs_embeds = ggml_ext_cont(ctx->ggml_ctx, inputs_embeds); + if (auto w = embed_tokens->get_weight(); w && inputs_embeds->type != w->type) { + inputs_embeds = ggml_cast(ctx->ggml_ctx, inputs_embeds, w->type); + } + inputs_embeds = ggml_ext_cont(ctx->ggml_ctx, inputs_embeds); + auto hidden_states = embed_tokens->forward(ctx, inputs_embeds); + + for (int i = 0; i < num_layers; ++i) { + auto layer = std::dynamic_pointer_cast(blocks["layers." 
+ std::to_string(i)]); + hidden_states = layer->forward(ctx, hidden_states, input_pos, nullptr); + } + + hidden_states = norm->forward(ctx, hidden_states); + return hidden_states; + } +}; + +struct AceStepTimbreEncoder : public GGMLBlock { + int64_t num_layers; + int64_t hidden_size; + + AceStepTimbreEncoder(int64_t timbre_hidden_dim, + int64_t hidden_size, + int64_t num_layers, + int64_t num_heads, + int64_t num_kv_heads, + int64_t head_dim, + int64_t intermediate_size) + : num_layers(num_layers), + hidden_size(hidden_size) { + // Use F32 accumulation to avoid CUDA NaNs in timbre projection. + blocks["embed_tokens"] = std::make_shared(timbre_hidden_dim, hidden_size, true, false, false); + blocks["norm"] = std::make_shared(hidden_size, 1e-6f); + for (int i = 0; i < num_layers; ++i) { + blocks["layers." + std::to_string(i)] = std::make_shared(hidden_size, num_heads, num_kv_heads, head_dim, intermediate_size); + } + } + +protected: + void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + enum ggml_type wtype = get_type(prefix + "special_token", tensor_storage_map, GGML_TYPE_F32); + params["special_token"] = ggml_new_tensor_3d(ctx, wtype, hidden_size, 1, 1); + GGMLBlock::init_params(ctx, tensor_storage_map, prefix); + } + +public: + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* refer_audio, + ggml_tensor* input_pos) { + auto embed_tokens = std::dynamic_pointer_cast(blocks["embed_tokens"]); + auto norm = std::dynamic_pointer_cast(blocks["norm"]); + + refer_audio = ggml_ext_cont(ctx->ggml_ctx, refer_audio); + if (auto w = embed_tokens->get_weight(); w && refer_audio->type != w->type) { + refer_audio = ggml_cast(ctx->ggml_ctx, refer_audio, w->type); + } + refer_audio = ggml_ext_cont(ctx->ggml_ctx, refer_audio); + auto hidden_states = embed_tokens->forward(ctx, refer_audio); + for (int i = 0; i < num_layers; ++i) { + auto layer = std::dynamic_pointer_cast(blocks["layers." 
+ std::to_string(i)]); + hidden_states = layer->forward(ctx, hidden_states, input_pos, nullptr); + } + hidden_states = norm->forward(ctx, hidden_states); + + // take first token as timbre embedding + hidden_states = slice_dim1(ctx, hidden_states, 0); + return hidden_states; + } +}; + +struct AceStepConditionEncoder : public GGMLBlock { + AceStepConditionEncoder(int64_t text_hidden_dim, + int64_t timbre_hidden_dim, + int64_t hidden_size, + int64_t num_lyric_layers, + int64_t num_timbre_layers, + int64_t num_heads, + int64_t num_kv_heads, + int64_t head_dim, + int64_t intermediate_size) { + blocks["text_projector"] = std::make_shared(text_hidden_dim, hidden_size, false, false, false); + blocks["lyric_encoder"] = std::make_shared(text_hidden_dim, hidden_size, num_lyric_layers, num_heads, num_kv_heads, head_dim, intermediate_size); + blocks["timbre_encoder"] = std::make_shared(timbre_hidden_dim, hidden_size, num_timbre_layers, num_heads, num_kv_heads, head_dim, intermediate_size); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* text_hidden_states, + ggml_tensor* lyric_hidden_states, + ggml_tensor* refer_audio, + ggml_tensor* lyric_pos, + ggml_tensor* timbre_pos) { + auto text_projector = std::dynamic_pointer_cast(blocks["text_projector"]); + auto lyric_encoder = std::dynamic_pointer_cast(blocks["lyric_encoder"]); + auto timbre_encoder = std::dynamic_pointer_cast(blocks["timbre_encoder"]); + + text_hidden_states = ggml_ext_cont(ctx->ggml_ctx, text_hidden_states); + lyric_hidden_states = ggml_ext_cont(ctx->ggml_ctx, lyric_hidden_states); + refer_audio = ggml_ext_cont(ctx->ggml_ctx, refer_audio); + ggml_tensor* text_input = text_hidden_states; + int64_t text_len = text_hidden_states->ne[1]; + int64_t text_pad = (256 - (text_len % 256)) % 256; + if (text_pad > 0) { + auto pad_tensor = ggml_ext_full(ctx->ggml_ctx, 0.0f, text_hidden_states->ne[0], text_pad, text_hidden_states->ne[2], text_hidden_states->ne[3]); + if (text_hidden_states->type != 
GGML_TYPE_F32) { + pad_tensor = ggml_cast(ctx->ggml_ctx, pad_tensor, text_hidden_states->type); + } + text_input = ggml_concat(ctx->ggml_ctx, text_hidden_states, pad_tensor, 1); + } + if (auto w = text_projector->get_weight(); w && text_input->type != w->type) { + text_input = ggml_cast(ctx->ggml_ctx, text_input, w->type); + } + text_input = ggml_ext_cont(ctx->ggml_ctx, text_input); + auto text_emb = text_projector->forward(ctx, text_input); + if (text_pad > 0) { + text_emb = ggml_ext_slice(ctx->ggml_ctx, text_emb, 1, 0, text_len, true); + } + auto lyric_emb = lyric_encoder->forward(ctx, lyric_hidden_states, lyric_pos); + auto timbre_emb = timbre_encoder->forward(ctx, refer_audio, timbre_pos); + + // CUDA concat only supports f32 today; normalize to f32 for packing. + ggml_type merged_type = GGML_TYPE_F32; + if (text_emb->type != merged_type) { + text_emb = ggml_cast(ctx->ggml_ctx, text_emb, merged_type); + } + if (lyric_emb->type != merged_type) { + lyric_emb = ggml_cast(ctx->ggml_ctx, lyric_emb, merged_type); + } + if (timbre_emb->type != merged_type) { + timbre_emb = ggml_cast(ctx->ggml_ctx, timbre_emb, merged_type); + } + text_emb = ggml_ext_cont(ctx->ggml_ctx, text_emb); + lyric_emb = ggml_ext_cont(ctx->ggml_ctx, lyric_emb); + timbre_emb = ggml_ext_cont(ctx->ggml_ctx, timbre_emb); + + auto merged = ggml_concat(ctx->ggml_ctx, lyric_emb, timbre_emb, 1); + merged = ggml_concat(ctx->ggml_ctx, merged, text_emb, 1); + return merged; + } +}; + +struct AceStepAttentionPooler : public GGMLBlock { + int64_t hidden_size; + int64_t num_layers; + int64_t head_dim; + int64_t num_heads; + int64_t num_kv_heads; + int64_t intermediate_size; + float rms_norm_eps; + + AceStepAttentionPooler(int64_t hidden_size, + int64_t num_layers, + int64_t head_dim, + float rms_norm_eps = 1e-6f) + : hidden_size(hidden_size), + num_layers(num_layers), + head_dim(head_dim), + num_heads(16), + num_kv_heads(8), + intermediate_size(hidden_size * 3), + rms_norm_eps(rms_norm_eps) { + 
blocks["embed_tokens"] = std::make_shared(hidden_size, hidden_size, true); + blocks["norm"] = std::make_shared(hidden_size, rms_norm_eps); + for (int i = 0; i < num_layers; ++i) { + blocks["layers." + std::to_string(i)] = std::make_shared(hidden_size, + num_heads, + num_kv_heads, + head_dim, + intermediate_size); + } + } + +protected: + void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + enum ggml_type wtype = get_type(prefix + "special_token", tensor_storage_map, GGML_TYPE_F32); + params["special_token"] = ggml_new_tensor_3d(ctx, wtype, hidden_size, 1, 1); + GGMLBlock::init_params(ctx, tensor_storage_map, prefix); + } + +public: + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* input_pos) { + GGML_ASSERT(x != nullptr && ggml_n_dims(x) == 4); + + auto embed_tokens = std::dynamic_pointer_cast(blocks["embed_tokens"]); + auto norm = std::dynamic_pointer_cast(blocks["norm"]); + + int64_t P = x->ne[1]; + int64_t T = x->ne[2]; + int64_t B = x->ne[3]; + + auto x3 = ggml_reshape_3d(ctx->ggml_ctx, cont_if_needed(ctx, x), x->ne[0], P, T * B); + x3 = embed_tokens->forward(ctx, x3); + auto x4 = ggml_reshape_4d(ctx->ggml_ctx, x3, x3->ne[0], P, T, B); + + auto special = params["special_token"]; + special = ggml_reshape_4d(ctx->ggml_ctx, special, special->ne[0], 1, 1, 1); + auto repeat_target = ggml_new_tensor_4d(ctx->ggml_ctx, GGML_TYPE_F16, x4->ne[0], 1, T, B); + auto special_rep = ggml_ext_repeat(ctx->ggml_ctx, special, repeat_target); + + x4 = ggml_concat(ctx->ggml_ctx, special_rep, x4, 1); + x4 = ggml_cont(ctx->ggml_ctx, x4); + + int64_t seq_len = x4->ne[1]; + auto x_seq = ggml_reshape_3d(ctx->ggml_ctx, x4, x4->ne[0], seq_len, T * B); + + for (int i = 0; i < num_layers; ++i) { + auto layer = std::dynamic_pointer_cast(blocks["layers." 
+ std::to_string(i)]); + x_seq = layer->forward(ctx, x_seq, input_pos, nullptr); + } + + x_seq = norm->forward(ctx, x_seq); + x_seq = slice_dim1(ctx, x_seq, 0); + x_seq = ggml_cont(ctx->ggml_ctx, x_seq); + x_seq = ggml_reshape_3d(ctx->ggml_ctx, x_seq, x_seq->ne[0], T, B); + return x_seq; + } +}; + +struct AceStepDiTModel : public GGMLBlock { + int64_t hidden_size; + int64_t patch_size; + int64_t num_layers; + int64_t num_heads; + int64_t num_kv_heads; + int64_t head_dim; + int64_t intermediate_size; + int64_t audio_acoustic_hidden_dim; + std::vector sliding_layers; + + AceStepDiTModel(int64_t in_channels, + int64_t hidden_size, + int64_t num_layers, + int64_t num_heads, + int64_t num_kv_heads, + int64_t head_dim, + int64_t intermediate_size, + int64_t patch_size, + int64_t audio_acoustic_hidden_dim, + const std::vector& sliding_layers) + : hidden_size(hidden_size), + patch_size(patch_size), + num_layers(num_layers), + num_heads(num_heads), + num_kv_heads(num_kv_heads), + head_dim(head_dim), + intermediate_size(intermediate_size), + audio_acoustic_hidden_dim(audio_acoustic_hidden_dim), + sliding_layers(sliding_layers) { + blocks["proj_in.1"] = std::make_shared(in_channels, hidden_size, (int)patch_size, (int)patch_size, 0); + blocks["time_embed"] = std::make_shared(256, hidden_size); + blocks["time_embed_r"] = std::make_shared(256, hidden_size); + blocks["condition_embedder"] = std::make_shared(hidden_size, hidden_size, true); + + for (int i = 0; i < num_layers; ++i) { + blocks["layers." 
+ std::to_string(i)] = std::make_shared(hidden_size, + num_heads, + num_kv_heads, + head_dim, + intermediate_size, + sliding_layers[i]); + } + + blocks["norm_out"] = std::make_shared(hidden_size, 1e-6f); + blocks["proj_out.1"] = std::make_shared(hidden_size, audio_acoustic_hidden_dim, (int)patch_size, (int)patch_size, 0); + } + +protected: + void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + enum ggml_type wtype = get_type(prefix + "scale_shift_table", tensor_storage_map, GGML_TYPE_F32); + params["scale_shift_table"] = ggml_new_tensor_3d(ctx, wtype, hidden_size, 2, 1); + GGMLBlock::init_params(ctx, tensor_storage_map, prefix); + } + +public: + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* hidden_states, + ggml_tensor* timesteps, + ggml_tensor* timesteps_r, + ggml_tensor* encoder_hidden_states, + ggml_tensor* context_latents, + ggml_tensor* input_pos, + ggml_tensor* attention_mask) { + auto time_embed = std::dynamic_pointer_cast(blocks["time_embed"]); + auto time_embed_r = std::dynamic_pointer_cast(blocks["time_embed_r"]); + auto condition_embedder = std::dynamic_pointer_cast(blocks["condition_embedder"]); + auto norm_out = std::dynamic_pointer_cast(blocks["norm_out"]); + auto proj_in = std::dynamic_pointer_cast(blocks["proj_in.1"]); + auto proj_out = std::dynamic_pointer_cast(blocks["proj_out.1"]); + + ggml_tensor* t_delta = ggml_sub(ctx->ggml_ctx, timesteps, timesteps_r); + + auto temb_t_pair = time_embed->forward(ctx, timesteps); + auto temb_r_pair = time_embed_r->forward(ctx, t_delta); + + auto temb = add_cont(ctx, temb_t_pair.first, temb_r_pair.first); + auto timestep_proj = add_cont(ctx, temb_t_pair.second, temb_r_pair.second); + + GGML_ASSERT(context_latents->ne[1] == hidden_states->ne[1]); + GGML_ASSERT(context_latents->ne[2] == hidden_states->ne[2]); + GGML_ASSERT(context_latents->ne[3] == hidden_states->ne[3]); + + auto lhs = context_latents; + auto rhs = 
hidden_states; + if (rhs->type != lhs->type) { + rhs = ggml_cast(ctx->ggml_ctx, rhs, lhs->type); + } + + ggml_tensor* x = ggml_concat(ctx->ggml_ctx, lhs, rhs, 0); + + int64_t original_seq_len = x->ne[1]; + if (original_seq_len % patch_size != 0) { + int64_t pad_len = patch_size - (original_seq_len % patch_size); + auto pad4 = ggml_ext_zeros(ctx->ggml_ctx, x->ne[0], pad_len, x->ne[2], 1); + auto pad = ggml_reshape_3d(ctx->ggml_ctx, pad4, x->ne[0], pad_len, x->ne[2]); + x = ggml_concat(ctx->ggml_ctx, x, pad, 1); + } + + x = swap_dim0_dim1(ctx, x); + x = proj_in->forward(ctx, x); + x = swap_dim0_dim1(ctx, x); + + encoder_hidden_states = condition_embedder->forward(ctx, encoder_hidden_states); + + for (int i = 0; i < num_layers; ++i) { + auto layer = std::dynamic_pointer_cast(blocks["layers." + std::to_string(i)]); + x = layer->forward(ctx, x, timestep_proj, encoder_hidden_states, input_pos, attention_mask); + } + + auto scale_shift_table = params["scale_shift_table"]; + auto temb_reshaped = ggml_reshape_3d(ctx->ggml_ctx, temb, temb->ne[0], 1, temb->ne[1]); + auto repeat_target = ggml_new_tensor_3d(ctx->ggml_ctx, GGML_TYPE_F16, temb->ne[0], 2, temb->ne[1]); + auto scale_shift = ggml_ext_repeat(ctx->ggml_ctx, scale_shift_table, repeat_target); + auto temb_rep = ggml_ext_repeat(ctx->ggml_ctx, temb_reshaped, repeat_target); + auto modulation = add_cont(ctx, scale_shift, temb_rep); + + auto shift = slice_dim1(ctx, modulation, 0); + auto scale = slice_dim1(ctx, modulation, 1); + + x = norm_out->forward(ctx, x); + auto scale_b = repeat_like(ctx, scale, x); + auto shift_b = repeat_like(ctx, shift, x); + auto ones = ggml_ext_ones(ctx->ggml_ctx, x->ne[0], x->ne[1], x->ne[2], x->ne[3]); + auto scale_one = add_cont(ctx, scale_b, ones); + x = ggml_mul(ctx->ggml_ctx, x, scale_one); + x = add_cont(ctx, x, shift_b); + + x = swap_dim0_dim1(ctx, x); + x = proj_out->forward(ctx, x); + x = swap_dim0_dim1(ctx, x); + + x = ggml_ext_slice(ctx->ggml_ctx, x, 1, 0, original_seq_len, true); + 
return x; + } +}; + +struct AceQuantizer : public GGMLBlock { + std::vector levels; + std::vector basis; + int64_t codebook_dim; + int64_t hidden_size; + std::vector codes_buffer; + + AceQuantizer(int64_t hidden_size, const std::vector& levels) + : levels(levels), codebook_dim(levels.size()), hidden_size(hidden_size) { + blocks["project_in"] = std::make_shared(hidden_size, (int64_t)codebook_dim, true); + blocks["project_out"] = std::make_shared((int64_t)codebook_dim, hidden_size, true); + + basis.resize(codebook_dim); + int accum = 1; + for (size_t i = 0; i < codebook_dim; ++i) { + basis[i] = accum; + accum *= levels[i]; + } + } + + ggml_tensor* get_output_from_indices(GGMLRunnerContext* ctx, + const std::vector& indices, + int64_t T, + int64_t B) { + auto project_out = std::dynamic_pointer_cast(blocks["project_out"]); + + const int64_t total = codebook_dim * T * B; + if (total <= 0) { + return nullptr; + } + codes_buffer.assign(static_cast(total), 0.f); + + for (int64_t b = 0; b < B; ++b) { + for (int64_t t = 0; t < T; ++t) { + int idx = 35847; + if (!indices.empty() && t < (int64_t)indices.size()) { + idx = indices[t]; + } + for (int64_t d = 0; d < codebook_dim; ++d) { + int level = levels[d]; + int value = (idx / basis[d]) % level; + float scaled = value * (2.f / (level - 1)) - 1.f; + int64_t offset = d + codebook_dim * (t + T * b); + codes_buffer[static_cast(offset)] = scaled; + } + } + } + + auto codes = ggml_new_tensor_3d(ctx->ggml_ctx, GGML_TYPE_F32, codebook_dim, T, B); + if (codes->data != nullptr) { + memcpy(codes->data, codes_buffer.data(), static_cast(total) * sizeof(float)); + } else { + ctx->set_backend_tensor_data(codes, codes_buffer.data()); + } + + if (auto w = project_out->get_weight(); w && codes->type != w->type) { + codes = ggml_cast(ctx->ggml_ctx, codes, w->type); + } + auto out = project_out->forward(ctx, codes); + return out; + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + auto project_in = 
std::dynamic_pointer_cast(blocks["project_in"]); + auto project_out = std::dynamic_pointer_cast(blocks["project_out"]); + + auto z = project_in->forward(ctx, x); + z = ggml_cast(ctx->ggml_ctx, z, GGML_TYPE_F32); + + auto tanh_z = ggml_tanh(ctx->ggml_ctx, z); + auto ones = ggml_ext_ones(ctx->ggml_ctx, tanh_z->ne[0], tanh_z->ne[1], tanh_z->ne[2], tanh_z->ne[3]); + auto z_plus = ggml_add(ctx->ggml_ctx, tanh_z, ones); + auto z_scaled = ggml_scale(ctx->ggml_ctx, z_plus, 0.5f); + + std::vector levels_minus_1_vec(codebook_dim); + std::vector scales_vec(codebook_dim); + for (int i = 0; i < codebook_dim; ++i) { + float level_minus_1 = static_cast(levels[i] - 1); + levels_minus_1_vec[i] = level_minus_1; + scales_vec[i] = 2.f / level_minus_1; + } + + auto levels_minus_1 = ggml_new_tensor_3d(ctx->ggml_ctx, GGML_TYPE_F32, codebook_dim, 1, 1); + ctx->set_backend_tensor_data(levels_minus_1, levels_minus_1_vec.data()); + auto levels_rep = ggml_repeat(ctx->ggml_ctx, levels_minus_1, z_scaled); + auto scaled = ggml_mul(ctx->ggml_ctx, z_scaled, levels_rep); + auto rounded = ggml_round(ctx->ggml_ctx, scaled); + + auto scales = ggml_new_tensor_3d(ctx->ggml_ctx, GGML_TYPE_F32, codebook_dim, 1, 1); + ctx->set_backend_tensor_data(scales, scales_vec.data()); + auto scales_rep = ggml_repeat(ctx->ggml_ctx, scales, rounded); + auto codes = ggml_mul(ctx->ggml_ctx, rounded, scales_rep); + auto neg_ones = ggml_scale(ctx->ggml_ctx, ones, -1.f); + codes = ggml_add(ctx->ggml_ctx, codes, neg_ones); + + return project_out->forward(ctx, codes); + } +}; + +struct AudioTokenDetokenizer : public GGMLBlock { + int64_t pool_window_size; + int64_t hidden_size; + int64_t audio_acoustic_hidden_dim; + int64_t num_layers; + int64_t num_heads; + int64_t num_kv_heads; + int64_t head_dim; + int64_t intermediate_size; + + AudioTokenDetokenizer(int64_t hidden_size, + int64_t pool_window_size, + int64_t audio_acoustic_hidden_dim, + int64_t num_layers, + int64_t num_heads, + int64_t num_kv_heads, + int64_t head_dim, + 
int64_t intermediate_size) + : pool_window_size(pool_window_size), + hidden_size(hidden_size), + audio_acoustic_hidden_dim(audio_acoustic_hidden_dim), + num_layers(num_layers), + num_heads(num_heads), + num_kv_heads(num_kv_heads), + head_dim(head_dim), + intermediate_size(intermediate_size) { + blocks["embed_tokens"] = std::make_shared(hidden_size, hidden_size, true); + blocks["norm"] = std::make_shared(hidden_size, 1e-6f); + blocks["proj_out"] = std::make_shared(hidden_size, audio_acoustic_hidden_dim, true); + for (int i = 0; i < num_layers; ++i) { + blocks["layers." + std::to_string(i)] = std::make_shared(hidden_size, + num_heads, + num_kv_heads, + head_dim, + intermediate_size); + } + } + +protected: + void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + enum ggml_type wtype = get_type(prefix + "special_tokens", tensor_storage_map, GGML_TYPE_F32); + params["special_tokens"] = ggml_new_tensor_3d(ctx, wtype, hidden_size, pool_window_size, 1); + GGMLBlock::init_params(ctx, tensor_storage_map, prefix); + } + +public: + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* input_pos) { + auto embed_tokens = std::dynamic_pointer_cast(blocks["embed_tokens"]); + auto norm = std::dynamic_pointer_cast(blocks["norm"]); + auto proj_out = std::dynamic_pointer_cast(blocks["proj_out"]); + + int64_t T = x->ne[1]; + int64_t B = x->ne[2]; + + x = embed_tokens->forward(ctx, x); + x = ggml_cont(ctx->ggml_ctx, x); + const int64_t D = x->ne[0]; + const int64_t TB = x->ne[1] * x->ne[2]; + auto x3 = ggml_reshape_3d(ctx->ggml_ctx, x, D, TB, 1); + x3 = ggml_dup(ctx->ggml_ctx, x3); + auto repeat_target = ggml_new_tensor_3d(ctx->ggml_ctx, x3->type, D, TB, pool_window_size); + auto x3r = ggml_ext_repeat(ctx->ggml_ctx, x3, repeat_target); // (D, TB, P) + x3r = ggml_permute(ctx->ggml_ctx, x3r, 0, 2, 1, 3); // (D, P, TB) + x3r = ggml_cont(ctx->ggml_ctx, x3r); + + auto special = 
params["special_tokens"]; + special = ggml_reshape_3d(ctx->ggml_ctx, special, special->ne[0], special->ne[1], 1); // (D, P, 1) + special = ggml_dup(ctx->ggml_ctx, special); + auto special_rep = ggml_ext_repeat(ctx->ggml_ctx, special, x3r); + x = add_cont(ctx, x3r, special_rep); + + for (int i = 0; i < num_layers; ++i) { + auto layer = std::dynamic_pointer_cast(blocks["layers." + std::to_string(i)]); + x = layer->forward(ctx, x, input_pos, nullptr); + } + + x = norm->forward(ctx, x); + x = proj_out->forward(ctx, x); + + x = ggml_cont(ctx->ggml_ctx, x); + x = ggml_reshape_4d(ctx->ggml_ctx, x, x->ne[0], pool_window_size, T, B); + x = ggml_cont(ctx->ggml_ctx, x); + x = ggml_reshape_3d(ctx->ggml_ctx, x, x->ne[0], pool_window_size * T, B); + return x; + } +}; + +struct AceStepConditionGenerationModel : public GGMLBlock { + int64_t pool_window_size = 5; + std::vector fsq_levels = {8, 8, 8, 5, 5, 5}; + int64_t hidden_size = 2048; + int64_t audio_acoustic_hidden_dim = 64; + int64_t patch_size = 2; + bool use_tokenizer_path = false; // mirror Comfy: LM-only hints by default + + AceStepConditionGenerationModel() { + std::vector layer_types; + for (int i = 0; i < 24; ++i) { + layer_types.push_back((i % 2) == 0); + } + blocks["decoder"] = std::make_shared(192, hidden_size, 24, 16, 8, 128, 6144, patch_size, audio_acoustic_hidden_dim, layer_types); + blocks["encoder"] = std::make_shared(1024, audio_acoustic_hidden_dim, hidden_size, 8, 4, 16, 8, 128, 6144); + blocks["tokenizer.audio_acoustic_proj"] = std::make_shared(audio_acoustic_hidden_dim, hidden_size, true); + blocks["tokenizer.attention_pooler"] = std::make_shared(hidden_size, 2, 128, 1e-6f); + blocks["tokenizer.quantizer"] = std::make_shared(hidden_size, fsq_levels); + blocks["detokenizer"] = std::make_shared(hidden_size, pool_window_size, audio_acoustic_hidden_dim, 2, 16, 8, 128, hidden_size * 3); + } + +protected: + void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const 
std::string prefix = "") override { + enum ggml_type wtype = get_type(prefix + "null_condition_emb", tensor_storage_map, GGML_TYPE_F32); + params["null_condition_emb"] = ggml_new_tensor_3d(ctx, wtype, hidden_size, 1, 1); + GGMLBlock::init_params(ctx, tensor_storage_map, prefix); + } + +public: + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* timesteps, + ggml_tensor* context, + ggml_tensor* lyric_embed, + ggml_tensor* refer_audio, + const std::shared_ptr>& audio_codes, + ggml_tensor* decoder_pos, + ggml_tensor* lyric_pos, + ggml_tensor* timbre_pos, + ggml_tensor* tokenizer_pos, + ggml_tensor* detok_pos, + ggml_tensor* sliding_mask) { + auto encoder = std::dynamic_pointer_cast(blocks["encoder"]); + auto decoder = std::dynamic_pointer_cast(blocks["decoder"]); + auto audio_proj = std::dynamic_pointer_cast(blocks["tokenizer.audio_acoustic_proj"]); + auto pooler = std::dynamic_pointer_cast(blocks["tokenizer.attention_pooler"]); + auto quantizer = std::dynamic_pointer_cast(blocks["tokenizer.quantizer"]); + auto detokenizer = std::dynamic_pointer_cast(blocks["detokenizer"]); + + ggml_tensor* enc_hidden = nullptr; + if (lyric_embed == nullptr && refer_audio == nullptr && context && context->ne[0] == hidden_size) { + enc_hidden = context; + } else { + enc_hidden = encoder->forward(ctx, context, lyric_embed, refer_audio, lyric_pos, timbre_pos); + } + + auto src_latents = x; + auto chunk_masks = ggml_ext_ones(ctx->ggml_ctx, x->ne[0], x->ne[1], x->ne[2], x->ne[3]); + + int64_t T = x->ne[1]; + int64_t B = x->ne[2]; + int64_t T_codes = (T + pool_window_size - 1) / pool_window_size; + + ggml_tensor* tokenizer_hints_5hz = nullptr; + if (use_tokenizer_path && refer_audio && audio_proj && pooler && quantizer && tokenizer_pos) { + int64_t target_len = T_codes * pool_window_size; + ggml_tensor* tok_audio = refer_audio; + if (tok_audio->ne[1] < target_len) { + int64_t repeat_factor = (target_len + tok_audio->ne[1] - 1) / tok_audio->ne[1]; + auto 
repeat_target = ggml_new_tensor_3d(ctx->ggml_ctx, tok_audio->type, tok_audio->ne[0], tok_audio->ne[1] * repeat_factor, tok_audio->ne[2]); + tok_audio = ggml_ext_repeat(ctx->ggml_ctx, tok_audio, repeat_target); + } + if (tok_audio->ne[1] > target_len) { + tok_audio = ggml_ext_slice(ctx->ggml_ctx, tok_audio, 1, 0, target_len, true); + } + + auto tok_hidden = audio_proj->forward(ctx, tok_audio); + auto tok_hidden4 = ggml_reshape_4d(ctx->ggml_ctx, tok_hidden, tok_hidden->ne[0], pool_window_size, T_codes, tok_hidden->ne[2]); + auto pooled = pooler->forward(ctx, tok_hidden4, tokenizer_pos); + tokenizer_hints_5hz = quantizer->forward(ctx, pooled); + } + + ggml_tensor* lm_hints_5hz = nullptr; + if (audio_codes && !audio_codes->empty()) { + lm_hints_5hz = quantizer->get_output_from_indices(ctx, *audio_codes, T_codes, B); + } + if (lm_hints_5hz == nullptr) { + lm_hints_5hz = tokenizer_hints_5hz; + } else if (tokenizer_hints_5hz != nullptr) { + auto combined = add_cont(ctx, lm_hints_5hz, tokenizer_hints_5hz); + lm_hints_5hz = ggml_scale(ctx->ggml_ctx, combined, 0.5f); + } + + if (lm_hints_5hz != nullptr) { + auto lm_hints = detokenizer->forward(ctx, lm_hints_5hz, detok_pos); + lm_hints = ggml_ext_slice(ctx->ggml_ctx, lm_hints, 1, 0, T, true); + src_latents = lm_hints; + } + + auto context_latents = ggml_concat(ctx->ggml_ctx, src_latents, chunk_masks, 0); + + auto out = decoder->forward(ctx, + x, + timesteps, + timesteps, + enc_hidden, + context_latents, + decoder_pos, + sliding_mask); + + return out; + } +}; + +struct AceEncoderRunner : public GGMLRunner { + AceStepConditionEncoder encoder; + std::vector lyric_pos_vec; + std::vector timbre_pos_vec; + + AceEncoderRunner(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "model.diffusion_model.encoder") + : GGMLRunner(backend, offload_params_to_cpu), + encoder(1024, 64, 2048, 8, 4, 16, 8, 128, 6144) { + encoder.init(params_ctx, 
tensor_storage_map, prefix); + } + + std::string get_desc() override { + return "ace_step_1_5_encoder"; + } + + void get_param_tensors(std::map& tensors, const std::string prefix) { + encoder.get_param_tensors(tensors, prefix); + } + + ggml_tensor* build_input_pos(int64_t seq_len, std::vector& cache) { + cache.resize(seq_len); + for (int64_t i = 0; i < seq_len; ++i) { + cache[i] = (int)i; + } + auto input_pos = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, seq_len); + set_backend_tensor_data(input_pos, cache.data()); + return input_pos; + } + + struct ggml_cgraph* build_graph(ggml_tensor* text_hidden_states, + ggml_tensor* lyric_hidden_states, + ggml_tensor* refer_audio) { + struct ggml_cgraph* gf = new_graph_custom(ACE_GRAPH_SIZE); + + text_hidden_states = to_backend(text_hidden_states); + lyric_hidden_states = to_backend(lyric_hidden_states); + refer_audio = to_backend(refer_audio); + + int64_t lyric_len = lyric_hidden_states ? lyric_hidden_states->ne[1] : 1; + auto lyric_pos = build_input_pos(lyric_len, lyric_pos_vec); + int64_t timbre_len = refer_audio ? 
refer_audio->ne[1] : 1; + auto timbre_pos = build_input_pos(timbre_len, timbre_pos_vec); + + auto runner_ctx = get_context(); + ggml_tensor* out = encoder.forward(&runner_ctx, + text_hidden_states, + lyric_hidden_states, + refer_audio, + lyric_pos, + timbre_pos); + ggml_build_forward_expand(gf, out); + return gf; + } + + bool compute(int n_threads, + ggml_tensor* text_hidden_states, + ggml_tensor* lyric_hidden_states, + ggml_tensor* refer_audio, + ggml_tensor** output = nullptr, + ggml_context* output_ctx = nullptr) { + auto get_graph = [&]() -> ggml_cgraph* { + return build_graph(text_hidden_states, lyric_hidden_states, refer_audio); + }; + return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + } +}; + +struct AceRunner : public GGMLRunner { + AceStepConditionGenerationModel model; + std::vector input_pos_vec; + std::vector lyric_pos_vec; + std::vector timbre_pos_vec; + std::vector tokenizer_pos_vec; + std::vector detok_pos_vec; + std::vector sliding_mask_vec; + std::unique_ptr cpu_encoder; + bool use_cpu_encoder = false; + + struct EncCacheEntry { + const ggml_tensor* context = nullptr; + const ggml_tensor* lyric = nullptr; + const ggml_tensor* refer = nullptr; + int n_dims = 0; + int64_t ne[4] = {0, 0, 0, 0}; + std::vector data; + }; + + std::vector enc_cache; + const EncCacheEntry* active_enc_cache = nullptr; + + AceRunner(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "model.diffusion_model") + : GGMLRunner(backend, offload_params_to_cpu) { + model = AceStepConditionGenerationModel(); + model.init(params_ctx, tensor_storage_map, prefix); + } + + std::string get_desc() override { + return "ace_step_1_5"; + } + + void get_param_tensors(std::map& tensors, const std::string prefix) { + model.get_param_tensors(tensors, prefix); + } + + void reset_encoder_cache() { + enc_cache.clear(); + active_enc_cache = nullptr; + } + + void init_cpu_encoder(const 
String2TensorStorage& tensor_storage_map, + const std::map& model_tensors) { + if (cpu_encoder) { + use_cpu_encoder = true; + return; + } + ggml_backend_t cpu_backend = ggml_backend_cpu_init(); + cpu_encoder = std::make_unique(cpu_backend, false, tensor_storage_map, "model.diffusion_model.encoder"); + cpu_encoder->alloc_params_buffer(); + + std::map cpu_tensors; + cpu_encoder->get_param_tensors(cpu_tensors, "model.diffusion_model.encoder"); + int copied = 0; + for (const auto& kv : cpu_tensors) { + auto it = model_tensors.find(kv.first); + if (it == model_tensors.end()) { + LOG_WARN("ACE CPU encoder: missing tensor '%s'", kv.first.c_str()); + continue; + } + ggml_backend_tensor_copy(it->second, kv.second); + copied++; + } + LOG_INFO("ACE CPU encoder: copied %d/%d tensors", copied, (int)cpu_tensors.size()); + use_cpu_encoder = true; + } + + ggml_tensor* build_mask(int64_t seq_len, int64_t window) { + sliding_mask_vec.resize(seq_len * seq_len); + for (int64_t i0 = 0; i0 < seq_len; ++i0) { + for (int64_t i1 = 0; i1 < seq_len; ++i1) { + float value = 0.f; + if (std::abs(i0 - i1) > window) { + // Match Comfy/PyTorch sliding window bias: torch.finfo(dtype).min. + // Sliding attention path does not use flash attention in our backend. 
+ value = std::numeric_limits::lowest(); + } + sliding_mask_vec[i1 * seq_len + i0] = value; + } + } + auto mask = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, seq_len, seq_len); + if (mask->data != nullptr) { + memcpy(mask->data, sliding_mask_vec.data(), sliding_mask_vec.size() * sizeof(float)); + } else { + set_backend_tensor_data(mask, sliding_mask_vec.data()); + } + return mask; + } + + ggml_tensor* build_input_pos(int64_t seq_len, std::vector& cache) { + cache.resize(seq_len); + for (int64_t i = 0; i < seq_len; ++i) { + cache[i] = (int)i; + } + auto input_pos = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, seq_len); + set_backend_tensor_data(input_pos, cache.data()); + return input_pos; + } + + const EncCacheEntry* get_or_create_enc_cache(int n_threads, + ggml_tensor* context, + ggml_tensor* lyric_embed, + ggml_tensor* refer_audio) { + for (const auto& entry : enc_cache) { + if (entry.context == context && entry.lyric == lyric_embed && entry.refer == refer_audio) { + return &entry; + } + } + if (!cpu_encoder || !context || !lyric_embed || !refer_audio) { + return nullptr; + } + + ggml_context* out_ctx = nullptr; + { + struct ggml_init_params params; + params.mem_size = static_cast(256 * 1024 * 1024); + params.mem_buffer = nullptr; + params.no_alloc = false; + out_ctx = ggml_init(params); + } + if (!out_ctx) { + LOG_ERROR("ACE CPU encoder: failed to allocate output context"); + return nullptr; + } + + ggml_tensor* out = nullptr; + if (!cpu_encoder->compute(n_threads, context, lyric_embed, refer_audio, &out, out_ctx) || out == nullptr) { + LOG_ERROR("ACE CPU encoder: compute failed"); + ggml_free(out_ctx); + return nullptr; + } + if (out->data == nullptr) { + LOG_ERROR("ACE CPU encoder: output buffer is null (insufficient output context memory)"); + ggml_free(out_ctx); + return nullptr; + } + + EncCacheEntry entry; + entry.context = context; + entry.lyric = lyric_embed; + entry.refer = refer_audio; + entry.n_dims = ggml_n_dims(out); + entry.ne[0] = out->ne[0]; 
+ entry.ne[1] = out->ne[1]; + entry.ne[2] = out->ne[2]; + entry.ne[3] = out->ne[3]; + + int64_t n_elem = ggml_nelements(out); + entry.data.resize(static_cast(n_elem)); + const float* src = (const float*)out->data; + if (n_elem > 0) { + memcpy(entry.data.data(), src, sizeof(float) * n_elem); + } + + ggml_free(out_ctx); + enc_cache.push_back(std::move(entry)); + return &enc_cache.back(); + } + + struct ggml_cgraph* build_graph(ggml_tensor* x, + ggml_tensor* timesteps, + ggml_tensor* context, + ggml_tensor* lyric_embed, + ggml_tensor* refer_audio, + const std::shared_ptr>& audio_codes) { + struct ggml_cgraph* gf = new_graph_custom(ACE_GRAPH_SIZE); + + x = to_backend(x); + timesteps = to_backend(timesteps); + ggml_tensor* enc_cached = nullptr; + if (active_enc_cache != nullptr && active_enc_cache->n_dims > 0 && !active_enc_cache->data.empty()) { + enc_cached = ggml_new_tensor(compute_ctx, + GGML_TYPE_F32, + active_enc_cache->n_dims, + active_enc_cache->ne); + ggml_set_name(enc_cached, "ace_enc_hidden_cached"); + set_backend_tensor_data(enc_cached, active_enc_cache->data.data()); + } + bool use_cached = (enc_cached != nullptr); + if (use_cached) { + context = enc_cached; + lyric_embed = nullptr; + refer_audio = nullptr; + } else { + context = to_backend(context); + } + + if (lyric_embed) { + lyric_embed = to_backend(lyric_embed); + } + if (refer_audio) { + refer_audio = to_backend(refer_audio); + } + + int64_t seq_len = x->ne[1]; + int64_t patch_len = (seq_len + model.patch_size - 1) / model.patch_size; + auto decoder_pos = build_input_pos(patch_len, input_pos_vec); + + ggml_tensor* lyric_pos = nullptr; + ggml_tensor* timbre_pos = nullptr; + if (!use_cached) { + int64_t lyric_len = lyric_embed ? lyric_embed->ne[1] : 1; + lyric_pos = build_input_pos(lyric_len, lyric_pos_vec); + + int64_t timbre_len = refer_audio ? 
refer_audio->ne[1] : 1; + timbre_pos = build_input_pos(timbre_len, timbre_pos_vec); + } + + ggml_tensor* tokenizer_pos = nullptr; + if (model.use_tokenizer_path) { + tokenizer_pos = build_input_pos(model.pool_window_size + 1, tokenizer_pos_vec); + } + auto detok_pos = build_input_pos(model.pool_window_size, detok_pos_vec); + + auto sliding_mask = build_mask(patch_len, 128); + + auto runner_ctx = get_context(); + ggml_tensor* out = model.forward(&runner_ctx, + x, + timesteps, + context, + lyric_embed, + refer_audio, + audio_codes, + decoder_pos, + lyric_pos, + timbre_pos, + tokenizer_pos, + detok_pos, + sliding_mask); + + ggml_build_forward_expand(gf, out); + return gf; + } + + bool compute(int n_threads, + ggml_tensor* x, + ggml_tensor* timesteps, + ggml_tensor* context, + ggml_tensor* lyric_embed, + ggml_tensor* refer_audio, + const std::shared_ptr>& audio_codes, + ggml_tensor** output = nullptr, + ggml_context* output_ctx = nullptr) { + active_enc_cache = nullptr; + if (use_cpu_encoder) { + active_enc_cache = get_or_create_enc_cache(n_threads, context, lyric_embed, refer_audio); + } + auto get_graph = [&]() -> ggml_cgraph* { + return build_graph(x, timesteps, context, lyric_embed, refer_audio, audio_codes); + }; + return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + } +}; + +} // namespace ACE + +#endif // __ACE_HPP__ diff --git a/src/ace_vae.hpp b/src/ace_vae.hpp new file mode 100644 index 000000000..4288cfa03 --- /dev/null +++ b/src/ace_vae.hpp @@ -0,0 +1,306 @@ +#ifndef __ACE_VAE_HPP__ +#define __ACE_VAE_HPP__ + +#include +#include +#include +#include + +#include "ggml_extend.hpp" +#include "vae.hpp" + +class AudioResidualUnit : public UnaryBlock { +protected: + int64_t in_channels; + int64_t out_channels; + int dilation; + bool use_snake; + +public: + AudioResidualUnit(int64_t in_channels, + int64_t out_channels, + int dilation, + bool use_snake = true) + : in_channels(in_channels), + out_channels(out_channels), + dilation(dilation), 
+ use_snake(use_snake) { + (void)use_snake; + int padding = (dilation * (7 - 1)) / 2; + + blocks["layers.0"] = std::make_shared(out_channels, true); + blocks["layers.1"] = std::make_shared(in_channels, out_channels, 7, 1, padding, dilation, true); + blocks["layers.2"] = std::make_shared(out_channels, true); + blocks["layers.3"] = std::make_shared(out_channels, out_channels, 1, 1, 0, 1, true); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + auto act1 = std::dynamic_pointer_cast(blocks["layers.0"]); + auto conv1 = std::dynamic_pointer_cast(blocks["layers.1"]); + auto act2 = std::dynamic_pointer_cast(blocks["layers.2"]); + auto conv2 = std::dynamic_pointer_cast(blocks["layers.3"]); + + auto residual = x; + x = act1->forward(ctx, x); + x = conv1->forward(ctx, x); + x = act2->forward(ctx, x); + x = conv2->forward(ctx, x); + x = ggml_add(ctx->ggml_ctx, x, residual); + return x; + } +}; + +class AudioEncoderBlock : public UnaryBlock { +public: + AudioEncoderBlock(int64_t in_channels, + int64_t out_channels, + int stride, + bool use_snake = true) { + int padding = static_cast(std::ceil(stride / 2.0)); + int kernel_size = 2 * stride; + + blocks["layers.0"] = std::make_shared(in_channels, in_channels, 1, use_snake); + blocks["layers.1"] = std::make_shared(in_channels, in_channels, 3, use_snake); + blocks["layers.2"] = std::make_shared(in_channels, in_channels, 9, use_snake); + blocks["layers.3"] = std::make_shared(in_channels, true); + blocks["layers.4"] = std::make_shared(in_channels, out_channels, kernel_size, stride, padding, 1, true); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + for (int i = 0; i < 5; ++i) { + auto layer = std::dynamic_pointer_cast(blocks["layers." 
+ std::to_string(i)]); + x = layer->forward(ctx, x); + } + return x; + } +}; + +class AudioDecoderBlock : public UnaryBlock { +public: + AudioDecoderBlock(int64_t in_channels, + int64_t out_channels, + int stride, + bool use_snake = true) { + int padding = static_cast(std::ceil(stride / 2.0)); + int kernel_size = 2 * stride; + + blocks["layers.0"] = std::make_shared(in_channels, true); + blocks["layers.1"] = std::make_shared(in_channels, out_channels, kernel_size, stride, padding, 1, true); + blocks["layers.2"] = std::make_shared(out_channels, out_channels, 1, use_snake); + blocks["layers.3"] = std::make_shared(out_channels, out_channels, 3, use_snake); + blocks["layers.4"] = std::make_shared(out_channels, out_channels, 9, use_snake); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + for (int i = 0; i < 5; ++i) { + auto layer = std::dynamic_pointer_cast(blocks["layers." + std::to_string(i)]); + x = layer->forward(ctx, x); + } + return x; + } +}; + +class AudioOobleckEncoder : public UnaryBlock { +protected: + int64_t in_channels; + int64_t channels; + int64_t latent_dim; + std::vector c_mults; + std::vector strides; + int depth; + int num_layers; + +public: + AudioOobleckEncoder(int64_t in_channels, + int64_t channels, + int64_t latent_dim, + const std::vector& c_mults, + const std::vector& strides, + bool use_snake = true) + : in_channels(in_channels), + channels(channels), + latent_dim(latent_dim), + c_mults(c_mults), + strides(strides) { + std::vector c_mults_local = c_mults; + c_mults_local.insert(c_mults_local.begin(), 1); + depth = static_cast(c_mults_local.size()); + + int layer_idx = 0; + blocks["layers." + std::to_string(layer_idx++)] = std::make_shared(in_channels, c_mults_local[0] * channels, 7, 1, 3, 1, true); + + for (int i = 0; i < depth - 1; ++i) { + blocks["layers." 
+ std::to_string(layer_idx++)] = std::make_shared(c_mults_local[i] * channels, + c_mults_local[i + 1] * channels, + strides[i], + use_snake); + } + + blocks["layers." + std::to_string(layer_idx++)] = std::make_shared(c_mults_local.back() * channels, true); + blocks["layers." + std::to_string(layer_idx++)] = std::make_shared(c_mults_local.back() * channels, latent_dim, 3, 1, 1, 1, true); + + num_layers = layer_idx; + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + for (int i = 0; i < num_layers; ++i) { + auto layer = std::dynamic_pointer_cast(blocks["layers." + std::to_string(i)]); + x = layer->forward(ctx, x); + } + return x; + } +}; + +class AudioOobleckDecoder : public UnaryBlock { +protected: + int64_t out_channels; + int64_t channels; + int64_t latent_dim; + std::vector c_mults; + std::vector strides; + int depth; + int num_layers; + +public: + AudioOobleckDecoder(int64_t out_channels, + int64_t channels, + int64_t latent_dim, + const std::vector& c_mults, + const std::vector& strides, + bool use_snake = true, + bool final_tanh = false) + : out_channels(out_channels), + channels(channels), + latent_dim(latent_dim), + c_mults(c_mults), + strides(strides) { + (void)final_tanh; + std::vector c_mults_local = c_mults; + c_mults_local.insert(c_mults_local.begin(), 1); + depth = static_cast(c_mults_local.size()); + + int layer_idx = 0; + blocks["layers." + std::to_string(layer_idx++)] = std::make_shared(latent_dim, c_mults_local.back() * channels, 7, 1, 3, 1, true); + + for (int i = depth - 1; i > 0; --i) { + blocks["layers." + std::to_string(layer_idx++)] = std::make_shared(c_mults_local[i] * channels, + c_mults_local[i - 1] * channels, + strides[i - 1], + use_snake); + } + + blocks["layers." + std::to_string(layer_idx++)] = std::make_shared(c_mults_local.front() * channels, true); + blocks["layers." + std::to_string(layer_idx++)] = std::make_shared(c_mults_local.front() * channels, out_channels, 7, 1, 3, 1, false); + blocks["layers." 
+ std::to_string(layer_idx++)] = std::make_shared(); + + num_layers = layer_idx; + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + for (int i = 0; i < num_layers; ++i) { + auto layer = std::dynamic_pointer_cast(blocks["layers." + std::to_string(i)]); + x = layer->forward(ctx, x); + } + return x; + } +}; + +class AudioOobleckVAEModel : public GGMLBlock { +public: + AudioOobleckVAEModel(int64_t in_channels, + int64_t channels, + int64_t latent_dim, + const std::vector& c_mults, + const std::vector& strides, + bool use_snake = true, + bool final_tanh = false) { + blocks["encoder"] = std::make_shared(in_channels, channels, latent_dim * 2, c_mults, strides, use_snake); + blocks["decoder"] = std::make_shared(in_channels, channels, latent_dim, c_mults, strides, use_snake, final_tanh); + } + + ggml_tensor* encode(GGMLRunnerContext* ctx, ggml_tensor* x) { + auto encoder = std::dynamic_pointer_cast(blocks["encoder"]); + return encoder->forward(ctx, x); + } + + ggml_tensor* decode(GGMLRunnerContext* ctx, ggml_tensor* z) { + auto decoder = std::dynamic_pointer_cast(blocks["decoder"]); + return decoder->forward(ctx, z); + } +}; + +struct AudioOobleckVAE : public VAE { + bool decode_only = true; + std::shared_ptr vae; + + AudioOobleckVAE(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map, + const std::string& prefix, + bool decode_only = true) + : decode_only(decode_only), + VAE(backend, offload_params_to_cpu) { + std::vector strides = {2, 4, 4, 8, 8}; + std::string stride_key = prefix + ".decoder.layers.2.layers.1.weight_v"; + auto iter = tensor_storage_map.find(stride_key); + if (iter == tensor_storage_map.end()) { + stride_key = prefix + ".decoder.layers.2.layers.1.parametrizations.weight.original1"; + iter = tensor_storage_map.find(stride_key); + } + if (iter != tensor_storage_map.end()) { + int64_t k0 = iter->second.ne[0]; + int64_t k1 = iter->second.ne[1]; + int64_t k2 = iter->second.ne[2]; 
+ int kernel_size = static_cast(std::min(std::min(k0, k1), k2)); + if (kernel_size == 12) { + strides = {2, 4, 4, 6, 10}; + } + } + vae = std::make_shared(2, 128, 64, std::vector{1, 2, 4, 8, 16}, strides, true, false); + vae->init(params_ctx, tensor_storage_map, prefix); + } + + std::string get_desc() override { + return "audio_oobleck_vae"; + } + + void get_param_tensors(std::map& tensors, const std::string prefix) override { + if (vae) { + vae->get_param_tensors(tensors, prefix); + } + } + + struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) { + struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); + + z = to_backend(z); + auto runner_ctx = get_context(); + + ggml_tensor* x = z; + if (decode_graph) { + // input: [C, T, B] -> [T, C, B] + x = ggml_cont(compute_ctx, ggml_permute(compute_ctx, x, 1, 0, 2, 3)); + x = vae->decode(&runner_ctx, x); + } else { + x = ggml_cont(compute_ctx, ggml_permute(compute_ctx, x, 1, 0, 2, 3)); + x = vae->encode(&runner_ctx, x); + } + + ggml_build_forward_expand(gf, x); + return gf; + } + + bool compute(const int n_threads, + struct ggml_tensor* z, + bool decode_graph, + struct ggml_tensor** output, + struct ggml_context* output_ctx = nullptr) override { + GGML_ASSERT(!decode_only || decode_graph); + auto get_graph = [&]() -> struct ggml_cgraph* { + return build_graph(z, decode_graph); + }; + return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + } +}; + +#endif // __ACE_VAE_HPP__ diff --git a/src/conditioner.hpp b/src/conditioner.hpp index d4a3146b8..40d1bd030 100644 --- a/src/conditioner.hpp +++ b/src/conditioner.hpp @@ -4,11 +4,19 @@ #include "clip.hpp" #include "llm.hpp" #include "t5.hpp" +#include +#include +#include +#include +#include struct SDCondition { struct ggml_tensor* c_crossattn = nullptr; // aka context struct ggml_tensor* c_vector = nullptr; // aka y struct ggml_tensor* c_concat = nullptr; + struct ggml_tensor* c_lyrics = nullptr; // ace: lyric embedding + struct 
ggml_tensor* refer_audio = nullptr; // ace: reference audio (acoustic hidden states) + std::shared_ptr> audio_codes; // ace: semantic audio codes std::vector extra_c_crossattns; @@ -22,6 +30,13 @@ struct SDCondition { struct ConditionerParams { std::string text; + std::string lyrics; + std::string keyscale = "C major"; + std::string language = "en"; + float bpm = 120.f; + float duration = 120.f; + int timesignature = 2; + int lm_seed = 0; int clip_skip = -1; int width = -1; int height = -1; @@ -2151,4 +2166,809 @@ struct LLMEmbedder : public Conditioner { } }; +struct AceConditioner : public Conditioner { + std::shared_ptr tokenizer; + std::shared_ptr base_llm; + std::shared_ptr lm_llm; + std::string base_llm_prefix; + std::string lm_llm_prefix; + + static bool has_prefix(const String2TensorStorage& tensor_storage_map, const std::string& prefix) { + for (const auto& kv : tensor_storage_map) { + if (kv.first.rfind(prefix, 0) == 0) { + return true; + } + } + return false; + } + + static std::string resolve_prefix(const String2TensorStorage& tensor_storage_map, const std::string& base_prefix) { + // LLM blocks already add ".model" in their submodule names. Pick a prefix that + // results in ".model.*" matching the weight file. 
+ std::string transformer_model_prefix = base_prefix + ".transformer.model"; + if (has_prefix(tensor_storage_map, transformer_model_prefix + ".")) { + return base_prefix + ".transformer"; + } + std::string transformer_prefix = base_prefix + ".transformer"; + if (has_prefix(tensor_storage_map, transformer_prefix + ".")) { + return transformer_prefix; + } + std::string model_prefix = base_prefix + ".model"; + if (has_prefix(tensor_storage_map, model_prefix + ".")) { + return base_prefix; + } + return base_prefix; + } + + static float parse_qwen3_size(const std::string& name) { + std::string s = name; + if (s.rfind("qwen3_", 0) == 0) { + s = s.substr(6); + } + if (!s.empty() && s.back() == 'b') { + s.pop_back(); + } + if (s.empty()) { + return 0.f; + } + if (s.find('.') != std::string::npos) { + return std::stof(s); + } + if (s.size() > 1 && s[0] == '0') { + return std::stof("0." + s.substr(1)); + } + return std::stof(s); + } + + static std::vector> find_qwen3_variants(const String2TensorStorage& tensor_storage_map) { + std::map found; + for (const auto& kv : tensor_storage_map) { + if (kv.first.rfind("text_encoders.qwen3_", 0) != 0) { + continue; + } + auto end = kv.first.find('.', strlen("text_encoders.")); + if (end == std::string::npos) { + continue; + } + std::string model_name = kv.first.substr(strlen("text_encoders."), end - strlen("text_encoders.")); + if (found.find(model_name) == found.end()) { + found[model_name] = parse_qwen3_size(model_name); + } + } + std::vector> variants; + variants.reserve(found.size()); + for (const auto& kv : found) { + variants.emplace_back(kv.second, "text_encoders." + kv.first); + } + std::sort(variants.begin(), variants.end(), + [](const auto& a, const auto& b) { return a.first < b.first; }); + return variants; + } + + static bool has_qwen3_variant(const String2TensorStorage& tensor_storage_map, const std::string& name) { + std::string prefix = "text_encoders." 
+ name + "."; + return has_prefix(tensor_storage_map, prefix); + } + + static std::string resolve_qwen3_variant(const String2TensorStorage& tensor_storage_map, const std::string& name) { + std::string prefix = "text_encoders." + name; + return resolve_prefix(tensor_storage_map, prefix); + } + + AceConditioner(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map = {}) { + tokenizer = std::make_shared(); + + auto variants = find_qwen3_variants(tensor_storage_map); + std::string base_prefix = "text_encoders.qwen3_06b"; + if (!variants.empty()) { + base_prefix = variants.front().second; + } + + base_llm_prefix = resolve_prefix(tensor_storage_map, base_prefix); + base_llm = std::make_shared(LLM::LLMArch::QWEN3, + backend, + offload_params_to_cpu, + tensor_storage_map, + base_llm_prefix, + false); + + bool has_lm = has_prefix(tensor_storage_map, "text_encoders.llm."); + if (has_lm) { + lm_llm_prefix = resolve_prefix(tensor_storage_map, "text_encoders.llm"); + } else if (has_qwen3_variant(tensor_storage_map, "qwen3_2b")) { + lm_llm_prefix = resolve_qwen3_variant(tensor_storage_map, "qwen3_2b"); + } else if (has_qwen3_variant(tensor_storage_map, "qwen3_4b")) { + lm_llm_prefix = resolve_qwen3_variant(tensor_storage_map, "qwen3_4b"); + } else if (variants.size() > 1) { + std::string lm_prefix = variants.back().second; + if (lm_prefix != base_prefix) { + lm_llm_prefix = resolve_prefix(tensor_storage_map, lm_prefix); + } + } + + if (!lm_llm_prefix.empty()) { + lm_llm = std::make_shared(LLM::LLMArch::QWEN3, + backend, + offload_params_to_cpu, + tensor_storage_map, + lm_llm_prefix, + false); + } + } + + void get_param_tensors(std::map& tensors) override { + if (base_llm) { + base_llm->get_param_tensors(tensors, base_llm_prefix); + } + if (lm_llm) { + lm_llm->get_param_tensors(tensors, lm_llm_prefix); + } + } + + void alloc_params_buffer() override { + if (base_llm) { + base_llm->alloc_params_buffer(); + } + if (lm_llm) { + 
lm_llm->alloc_params_buffer(); + } + } + + void free_params_buffer() override { + if (base_llm) { + base_llm->free_params_buffer(); + } + if (lm_llm) { + lm_llm->free_params_buffer(); + } + } + + size_t get_params_buffer_size() override { + size_t size = 0; + if (base_llm) { + size += base_llm->get_params_buffer_size(); + } + if (lm_llm) { + size += lm_llm->get_params_buffer_size(); + } + return size; + } + + void set_flash_attention_enabled(bool enabled) override { + if (base_llm) { + base_llm->set_flash_attention_enabled(false); + } + if (lm_llm) { + lm_llm->set_flash_attention_enabled(enabled); + } + } + + void set_weight_adapter(const std::shared_ptr& adapter) override { + if (base_llm) { + base_llm->set_weight_adapter(adapter); + } + if (lm_llm) { + lm_llm->set_weight_adapter(adapter); + } + } + + static std::string format_meta_cap(int bpm, int timesignature, const std::string& keyscale, int duration) { + std::ostringstream oss; + oss << "- bpm: " << bpm << "\n"; + oss << "- timesignature: " << timesignature << "\n"; + oss << "- keyscale: " << keyscale << "\n"; + oss << "- duration: " << duration << "\n"; + return oss.str(); + } + + static std::string format_meta_lm(int bpm, int timesignature, const std::string& keyscale, int duration) { + std::ostringstream oss; + oss << "bpm: " << bpm << "\n"; + oss << "duration: " << duration << "\n"; + oss << "keyscale: " << keyscale << "\n"; + oss << "timesignature: " << timesignature; + return oss.str(); + } + + std::vector compute_logits(int n_threads, + LLM::LLMRunner* runner, + const std::vector& tokens, + int pad_len = 0) { + std::vector logits; + if (!runner) { + return logits; + } + + size_t vocab_size = static_cast(runner->params.vocab_size); + size_t logits_vocab_size = vocab_size; + int64_t logits_start = runner->get_logits_range_start(); + int64_t logits_end = runner->get_logits_range_end(); + if (logits_end > logits_start) { + logits_vocab_size = static_cast(logits_end - logits_start); + } + if 
(logits_vocab_size == 0 || tokens.empty()) { + return logits; + } + + size_t n_tokens = tokens.size(); + size_t mask_bytes = n_tokens * n_tokens * sizeof(float); + size_t mem_size = std::max({logits_vocab_size * sizeof(float) * 2, + 8 * 1024 * 1024, + mask_bytes + 1024 * 1024}); + struct ggml_init_params params; + params.mem_size = mem_size; + params.mem_buffer = nullptr; + params.no_alloc = false; + struct ggml_context* ctx = ggml_init(params); + if (!ctx) { + return logits; + } + + ggml_tensor* input_ids = vector_to_ggml_tensor_i32(ctx, tokens); + ggml_tensor* logits_tensor = nullptr; + std::vector> image_embeds; + + ggml_tensor* attention_mask = nullptr; + { + // Match Comfy: use finite negative mask values to avoid NaNs in some backends. + constexpr float kMaskNeg = -65504.0f; + int64_t n_tokens_i = static_cast(tokens.size()); + std::vector attention_mask_vec(static_cast(n_tokens_i) * static_cast(n_tokens_i), 0.f); + for (int64_t i0 = 0; i0 < n_tokens_i; ++i0) { + for (int64_t i1 = 0; i1 < n_tokens_i; ++i1) { + float value = 0.f; + if (i0 < pad_len) { // mask out pad tokens as keys (left padding) + value = kMaskNeg; + } else if (i0 > i1) { // causal mask + value = kMaskNeg; + } + attention_mask_vec[i1 * n_tokens_i + i0] = value; + } + } + attention_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_tokens_i, n_tokens_i); + if (attention_mask->data != nullptr) { + memcpy(attention_mask->data, attention_mask_vec.data(), attention_mask_vec.size() * sizeof(float)); + } else { + ggml_ext_tensor_iter(attention_mask, [&](ggml_tensor* mask, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { + size_t idx = static_cast(i1 * n_tokens_i + i0); + ggml_ext_tensor_set_f32(mask, attention_mask_vec[idx], i0, i1, i2, i3); + }); + } + } + + if (!runner->compute_logits(n_threads, input_ids, attention_mask, image_embeds, &logits_tensor, ctx) || logits_tensor == nullptr) { + ggml_free(ctx); + return logits; + } + + size_t n = static_cast(ggml_nelements(logits_tensor)); + 
logits.resize(std::min(n, logits_vocab_size)); + if (logits_tensor->type == GGML_TYPE_F32 && logits_tensor->buffer == nullptr) { + memcpy(logits.data(), logits_tensor->data, logits.size() * sizeof(float)); + } else { + for (size_t i = 0; i < logits.size(); ++i) { + logits[i] = ggml_ext_tensor_get_f32(logits_tensor, i); + } + } + ggml_free(ctx); + return logits; + } + + bool compute_logits_kv_cfg(int n_threads, + LLM::LLMRunner* runner, + const std::vector& cond_tokens, + const std::vector& uncond_tokens, + int64_t n_past, + int cond_pad_len, + int uncond_pad_len, + std::vector& cond_logits, + std::vector& uncond_logits) { + cond_logits.clear(); + uncond_logits.clear(); + if (!runner || cond_tokens.empty() || cond_tokens.size() != uncond_tokens.size()) { + return false; + } + + size_t vocab_size = static_cast(runner->params.vocab_size); + size_t logits_vocab_size = vocab_size; + int64_t logits_start = runner->get_logits_range_start(); + int64_t logits_end = runner->get_logits_range_end(); + if (logits_end > logits_start) { + logits_vocab_size = static_cast(logits_end - logits_start); + } + if (logits_vocab_size == 0) { + return false; + } + + const size_t n_tokens = cond_tokens.size(); + ggml_tensor* logits_tensor = nullptr; + bool ok = false; + + if (n_tokens == 1 && n_past > 0) { + size_t mem_size = std::max(logits_vocab_size * 2 * sizeof(float), 2 * 1024 * 1024); + struct ggml_init_params params; + params.mem_size = mem_size; + params.mem_buffer = nullptr; + params.no_alloc = false; + struct ggml_context* ctx = ggml_init(params); + if (!ctx) { + return false; + } + + std::vector ids = {cond_tokens[0], uncond_tokens[0]}; + std::vector pad_lens = {cond_pad_len, uncond_pad_len}; + ok = runner->compute_logits_kv_decode_1token(n_threads, ids, n_past, pad_lens, &logits_tensor, ctx); + if (!ok || logits_tensor == nullptr) { + ggml_free(ctx); + return false; + } + + const size_t n_vocab = std::min(logits_vocab_size, static_cast(logits_tensor->ne[0])); + 
cond_logits.resize(n_vocab); + uncond_logits.resize(n_vocab); + for (size_t i = 0; i < n_vocab; ++i) { + cond_logits[i] = ggml_ext_tensor_get_f32(logits_tensor, static_cast(i), 0, 0, 0); + uncond_logits[i] = ggml_ext_tensor_get_f32(logits_tensor, static_cast(i), 0, 1, 0); + } + + ggml_free(ctx); + return true; + } + + const size_t n_kv = static_cast(n_past) + n_tokens; + size_t mask_bytes = n_kv * n_tokens * 2 * sizeof(float); + size_t mem_size = std::max({logits_vocab_size * 2 * sizeof(float), + 8 * 1024 * 1024, + mask_bytes + 1024 * 1024}); + struct ggml_init_params params; + params.mem_size = mem_size; + params.mem_buffer = nullptr; + params.no_alloc = false; + struct ggml_context* ctx = ggml_init(params); + if (!ctx) { + return false; + } + + ggml_tensor* input_ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_tokens, 2); + { + int32_t* ids = static_cast(input_ids->data); + for (size_t i = 0; i < n_tokens; ++i) { + ids[i] = cond_tokens[i]; + ids[n_tokens + i] = uncond_tokens[i]; + } + } + + constexpr float kMaskNeg = -65504.0f; + ggml_tensor* attention_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_kv, n_tokens, 2); + std::vector attention_mask_vec(n_kv * n_tokens * 2, 0.f); + for (int b = 0; b < 2; ++b) { + int pad_len = b == 0 ? 
cond_pad_len : uncond_pad_len; + size_t offset = static_cast(b) * n_kv * n_tokens; + for (size_t q = 0; q < n_tokens; ++q) { + int64_t abs_q = n_past + static_cast(q); + for (size_t k = 0; k < n_kv; ++k) { + float value = 0.f; + if (static_cast(k) < pad_len || static_cast(k) > abs_q) { + value = kMaskNeg; + } + attention_mask_vec[offset + q * n_kv + k] = value; + } + } + } + if (attention_mask->data != nullptr) { + memcpy(attention_mask->data, attention_mask_vec.data(), attention_mask_vec.size() * sizeof(float)); + } else { + for (int b = 0; b < 2; ++b) { + size_t offset = static_cast(b) * n_kv * n_tokens; + for (size_t q = 0; q < n_tokens; ++q) { + for (size_t k = 0; k < n_kv; ++k) { + ggml_ext_tensor_set_f32(attention_mask, + attention_mask_vec[offset + q * n_kv + k], + static_cast(k), + static_cast(q), + b, + 0); + } + } + } + } + + std::vector> image_embeds; + ok = runner->compute_logits_kv(n_threads, + input_ids, + attention_mask, + image_embeds, + n_past, + &logits_tensor, + ctx); + if (!ok || logits_tensor == nullptr) { + ggml_free(ctx); + return false; + } + + const size_t n_vocab = std::min(logits_vocab_size, static_cast(logits_tensor->ne[0])); + cond_logits.resize(n_vocab); + uncond_logits.resize(n_vocab); + for (size_t i = 0; i < n_vocab; ++i) { + cond_logits[i] = ggml_ext_tensor_get_f32(logits_tensor, static_cast(i), 0, 0, 0); + uncond_logits[i] = ggml_ext_tensor_get_f32(logits_tensor, static_cast(i), 0, 1, 0); + } + + ggml_free(ctx); + return true; + } + + int sample_from_logits(const std::vector& logits, + float temperature, + float top_p, + std::mt19937_64& rng) { + if (logits.empty()) { + return -1; + } + + int vocab_size = static_cast(logits.size()); + if (temperature <= 0.f) { + int best = 0; + float best_val = logits[0]; + for (int i = 1; i < vocab_size; ++i) { + if (logits[i] > best_val) { + best_val = logits[i]; + best = i; + } + } + return best; + } + + float max_logit = -std::numeric_limits::infinity(); + for (int i = 0; i < vocab_size; ++i) 
{ + if (logits[i] > max_logit) { + max_logit = logits[i]; + } + } + + std::vector probs(vocab_size, 0.f); + double sum = 0.0; + for (int i = 0; i < vocab_size; ++i) { + float val = (logits[i] - max_logit) / temperature; + if (std::isinf(logits[i]) && logits[i] < 0) { + probs[i] = 0.f; + continue; + } + float p = std::exp(val); + probs[i] = p; + sum += p; + } + + if (sum <= 0.0) { + int best = 0; + float best_val = logits[0]; + for (int i = 1; i < vocab_size; ++i) { + if (logits[i] > best_val) { + best_val = logits[i]; + best = i; + } + } + return best; + } + + if (top_p < 1.0f) { + std::vector indices(vocab_size); + for (int i = 0; i < vocab_size; ++i) { + indices[i] = i; + } + std::sort(indices.begin(), indices.end(), [&](int a, int b) { return probs[a] > probs[b]; }); + + double cumulative = 0.0; + std::vector keep(vocab_size, 0); + for (int idx : indices) { + cumulative += probs[idx] / sum; + keep[idx] = 1; + if (cumulative >= top_p) { + break; + } + } + + double new_sum = 0.0; + for (int i = 0; i < vocab_size; ++i) { + if (!keep[i]) { + probs[i] = 0.f; + } else { + new_sum += probs[i]; + } + } + sum = new_sum > 0.0 ? new_sum : sum; + } + + std::uniform_real_distribution dist(0.0, sum); + double r = dist(rng); + double acc = 0.0; + for (int i = 0; i < vocab_size; ++i) { + acc += probs[i]; + if (acc >= r) { + return i; + } + } + return vocab_size - 1; + } + + std::shared_ptr> generate_audio_codes(int n_threads, + const std::string& lm_prompt, + const std::string& lm_prompt_negative, + int min_tokens, + int lm_seed) { + const int audio_start_id = 151669; + const float cfg_scale = 2.0f; + const float temperature = 0.85f; + const float top_p = 0.9f; + + std::shared_ptr> codes = std::make_shared>(); + auto runner = lm_llm ? 
lm_llm.get() : base_llm.get(); + if (!runner || !tokenizer) { + return codes; + } + + int64_t t0 = ggml_time_ms(); + LOG_INFO("ACE LM: generating %d audio tokens (seed=%d)", min_tokens, lm_seed); + + std::vector cond_tokens = tokenizer->tokenize(lm_prompt, nullptr); + std::vector uncond_tokens = tokenizer->tokenize(lm_prompt_negative, nullptr); + + const int pad_token_id = 151643; + int pos_pad = 0; + int neg_pad = 0; + if (uncond_tokens.size() < cond_tokens.size()) { + neg_pad = static_cast(cond_tokens.size() - uncond_tokens.size()); + uncond_tokens.insert(uncond_tokens.begin(), neg_pad, pad_token_id); + } else if (cond_tokens.size() < uncond_tokens.size()) { + pos_pad = static_cast(uncond_tokens.size() - cond_tokens.size()); + cond_tokens.insert(cond_tokens.begin(), pos_pad, pad_token_id); + } + + const int num_tokens_to_generate = min_tokens; + std::mt19937_64 rng(static_cast(lm_seed)); + bool use_kv_cache = true; + int64_t n_past = 0; + std::vector cond_tokens_full = cond_tokens; + std::vector uncond_tokens_full = uncond_tokens; + std::vector cond_step_tokens = cond_tokens; + std::vector uncond_step_tokens = uncond_tokens; + + const int64_t full_vocab_end = runner->params.vocab_size; + runner->set_logits_range(audio_start_id, full_vocab_end); + const int logits_id_offset = static_cast(runner->get_logits_range_start()); + + runner->reset_kv_cache(); + if (use_kv_cache) { + int64_t kv_capacity = static_cast(cond_tokens.size()) + static_cast(num_tokens_to_generate); + if (!runner->prepare_kv_cache(kv_capacity, 2)) { + use_kv_cache = false; + LOG_WARN("ACE LM: KV-cache allocation failed, falling back to full-sequence logits"); + } + } + + for (int step = 0; step < num_tokens_to_generate; ++step) { + std::vector cond_logits; + std::vector uncond_logits; + if (use_kv_cache) { + bool ok = compute_logits_kv_cfg(n_threads, + runner, + cond_step_tokens, + uncond_step_tokens, + n_past, + pos_pad, + neg_pad, + cond_logits, + uncond_logits); + if (ok) { + n_past += 
static_cast(cond_step_tokens.size()); + } + if (!ok) { + use_kv_cache = false; + runner->reset_kv_cache(); + LOG_WARN("ACE LM: KV-cache decode unavailable, falling back to full-sequence logits"); + } + } + if (!use_kv_cache) { + cond_logits = compute_logits(n_threads, runner, cond_tokens_full, pos_pad); + uncond_logits = compute_logits(n_threads, runner, uncond_tokens_full, neg_pad); + } + if (cond_logits.empty() || uncond_logits.empty() || cond_logits.size() != uncond_logits.size()) { + break; + } + std::vector cfg_logits(cond_logits.size(), 0.f); + for (size_t i = 0; i < cond_logits.size(); ++i) { + cfg_logits[i] = uncond_logits[i] + cfg_scale * (cond_logits[i] - uncond_logits[i]); + } + + const int mask_upto = std::max(0, audio_start_id - logits_id_offset); + for (int i = 0; i < mask_upto && i < static_cast(cfg_logits.size()); ++i) { + cfg_logits[i] = -std::numeric_limits::infinity(); + } + + if (top_p < 1.0f) { + std::vector indices(cfg_logits.size()); + for (size_t i = 0; i < indices.size(); ++i) { + indices[i] = static_cast(i); + } + std::sort(indices.begin(), indices.end(), + [&](int a, int b) { return cfg_logits[a] > cfg_logits[b]; }); + + float max_logit = -std::numeric_limits::infinity(); + for (float v : cfg_logits) { + if (v > max_logit) { + max_logit = v; + } + } + double sum = 0.0; + std::vector sorted_probs(indices.size(), 0.0); + for (size_t i = 0; i < indices.size(); ++i) { + float v = cfg_logits[indices[i]]; + if (std::isinf(v) && v < 0) { + sorted_probs[i] = 0.0; + continue; + } + double p = std::exp((double)(v - max_logit)); + sorted_probs[i] = p; + sum += p; + } + if (sum > 0.0) { + double cumulative = 0.0; + std::vector remove(indices.size(), 0); + for (size_t i = 0; i < indices.size(); ++i) { + cumulative += sorted_probs[i] / sum; + if (cumulative > top_p) { + remove[i] = 1; + } + } + for (int i = static_cast(remove.size()) - 1; i >= 1; --i) { + remove[i] = remove[i - 1]; + } + if (!remove.empty()) { + remove[0] = 0; + } + for (size_t i = 0; 
i < indices.size(); ++i) { + if (remove[i]) { + cfg_logits[indices[i]] = -std::numeric_limits::infinity(); + } + } + } + } + + int next_token = logits_id_offset + sample_from_logits(cfg_logits, temperature, 1.0f, rng); + if (next_token < audio_start_id) { + next_token = audio_start_id; + } + + codes->push_back(next_token - audio_start_id); + cond_tokens_full.push_back(next_token); + uncond_tokens_full.push_back(next_token); + cond_step_tokens.assign(1, next_token); + uncond_step_tokens.assign(1, next_token); + + if ((step + 1) % 10 == 0 || step + 1 == num_tokens_to_generate) { + int64_t t = ggml_time_ms(); + LOG_INFO("ACE LM: generated %d/%d tokens (elapsed %.2fs)", step + 1, num_tokens_to_generate, (t - t0) / 1000.0); + } + } + runner->reset_kv_cache(); + runner->set_logits_range(0, full_vocab_end); + int64_t t1 = ggml_time_ms(); + LOG_INFO("ACE LM: audio token generation done in %.2fs", (t1 - t0) / 1000.0); + + return codes; + } + + SDCondition get_learned_condition(ggml_context* work_ctx, + int n_threads, + const ConditionerParams& conditioner_params) override { + SDCondition cond; + + std::string caption = conditioner_params.text; + std::string lyrics = conditioner_params.lyrics; + std::string language = conditioner_params.language; + std::string keyscale = conditioner_params.keyscale; + int bpm = static_cast(std::round(conditioner_params.bpm)); + int timesignature = conditioner_params.timesignature; + int duration = std::max(1, static_cast(std::ceil(conditioner_params.duration))); + + std::string meta_lm = format_meta_lm(bpm, timesignature, keyscale, duration); + std::string meta_cap = format_meta_cap(bpm, timesignature, keyscale, duration); + + std::string lm_template = "<|im_start|>system\n# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n<|im_end|>\n<|im_start|>user\n# Caption\n"; + std::string lm_prompt = lm_template + caption + "\n" + lyrics + "\n<|im_end|>\n<|im_start|>assistant\n\n" + meta_lm + "\n\n\n<|im_end|>\n"; + 
std::string lm_prompt_negative = lm_template + caption + "\n" + lyrics + "\n<|im_end|>\n<|im_start|>assistant\n\n\n\n\n<|im_end|>\n"; + + std::string lyric_prompt = "# Languages\n" + language + "\n\n# Lyric" + lyrics + "<|endoftext|><|endoftext|>"; + std::string qwen_prompt = "# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n# Caption\n" + caption + "# Metas\n" + meta_cap + "<|endoftext|>\n<|endoftext|>"; + + auto tokens_and_weights = tokenize_with_weights(qwen_prompt); + auto tokens = tokens_and_weights.first; + auto weights = tokens_and_weights.second; + auto lyric_tokens = tokenizer->tokenize(lyric_prompt, nullptr); + + ggml_tensor* context = nullptr; + ggml_tensor* lyric_embed = nullptr; + if (base_llm) { + std::set out_layers; + auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); + std::vector> image_embeds; + base_llm->compute(n_threads, input_ids, nullptr, image_embeds, out_layers, &context, work_ctx); + if (context && !weights.empty()) { + float original_mean = ggml_ext_tensor_mean(context); + int64_t n_tokens = context->ne[1]; + int64_t weight_len = static_cast(weights.size()); + int64_t limit = std::min(n_tokens, weight_len); + for (int i2 = 0; i2 < context->ne[2]; ++i2) { + for (int i1 = 0; i1 < limit; ++i1) { + for (int i0 = 0; i0 < context->ne[0]; ++i0) { + float value = ggml_ext_tensor_get_f32(context, i0, i1, i2); + value *= weights[i1]; + ggml_ext_tensor_set_f32(context, value, i0, i1, i2); + } + } + } + float new_mean = ggml_ext_tensor_mean(context); + if (new_mean != 0.f) { + ggml_ext_tensor_scale_inplace(context, (original_mean / new_mean)); + } + } + + std::set lyric_layers = {0}; + auto lyric_ids = vector_to_ggml_tensor_i32(work_ctx, lyric_tokens); + base_llm->compute(n_threads, lyric_ids, nullptr, image_embeds, lyric_layers, &lyric_embed, work_ctx); + } + + static const float kReferAudioVec[64] = { + -1.3672e-01f, -1.5820e-01f, 5.8594e-01f, -5.7422e-01f, 3.0273e-02f, + 2.7930e-01f, -2.5940e-03f, 
-2.0703e-01f, -1.6113e-01f, -1.4746e-01f, + -2.7710e-02f, -1.8066e-01f, -2.9688e-01f, 1.6016e+00f, -2.6719e+00f, + 7.7734e-01f, -1.3516e+00f, -1.9434e-01f, -7.1289e-02f, -5.0938e+00f, + 2.4316e-01f, 4.7266e-01f, 4.6387e-02f, -6.6406e-01f, -2.1973e-01f, + -6.7578e-01f, -1.5723e-01f, 9.5312e-01f, -2.0020e-01f, -1.7109e+00f, + 5.8984e-01f, -5.7422e-01f, 5.1562e-01f, 2.8320e-01f, 1.4551e-01f, + -1.8750e-01f, -5.9814e-02f, 3.6719e-01f, -1.0059e-01f, -1.5723e-01f, + 2.0605e-01f, -4.3359e-01f, -8.2812e-01f, 4.5654e-02f, -6.6016e-01f, + 1.4844e-01f, 9.4727e-02f, 3.8477e-01f, -1.2578e+00f, -3.3203e-01f, + -8.5547e-01f, 4.3359e-01f, 4.2383e-01f, -8.9453e-01f, -5.0391e-01f, + -5.6152e-02f, -2.9219e+00f, -2.4658e-02f, 5.0391e-01f, 9.8438e-01f, + 7.2754e-02f, -2.1582e-01f, 6.3672e-01f, 1.0000e+00f + }; + + ggml_tensor* refer_audio = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 64, 750, 1); + for (int64_t t = 0; t < refer_audio->ne[1]; ++t) { + for (int64_t c = 0; c < refer_audio->ne[0]; ++c) { + ggml_ext_tensor_set_f32(refer_audio, kReferAudioVec[c], c, t, 0); + } + } + + int min_tokens = std::max(1, duration * 5); + std::shared_ptr> audio_codes = + generate_audio_codes(n_threads, lm_prompt, lm_prompt_negative, min_tokens, conditioner_params.lm_seed); + + cond.c_crossattn = context; + cond.c_lyrics = lyric_embed; + cond.refer_audio = refer_audio; + cond.audio_codes = audio_codes; + return cond; + } + +private: + std::pair, std::vector> tokenize_with_weights(const std::string& text) { + auto parsed_attention = parse_prompt_attention(text); + std::vector tokens; + std::vector weights; + for (const auto& item : parsed_attention) { + const std::string& curr_text = item.first; + float curr_weight = item.second; + auto curr_tokens = tokenizer->tokenize(curr_text, nullptr); + tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end()); + weights.insert(weights.end(), curr_tokens.size(), curr_weight); + } + tokenizer->pad_tokens(tokens, weights, 0, false); + return {tokens, 
weights}; + } +}; + #endif diff --git a/src/denoiser.hpp b/src/denoiser.hpp index 40bd7cb7f..94bdc0492 100644 --- a/src/denoiser.hpp +++ b/src/denoiser.hpp @@ -654,16 +654,19 @@ float time_snr_shift(float alpha, float t) { struct DiscreteFlowDenoiser : public Denoiser { float sigmas[TIMESTEPS]; float shift = 3.0f; + float multiplier = 1000.0f; float sigma_data = 1.0f; - DiscreteFlowDenoiser(float shift = 3.0f) { - set_shift(shift); + DiscreteFlowDenoiser(float shift = 3.0f, float multiplier = 1000.0f) + : shift(shift), multiplier(multiplier) { + set_parameters(); } void set_parameters() { for (int i = 1; i < TIMESTEPS + 1; i++) { - sigmas[i - 1] = t_to_sigma(static_cast(i)); + float t = (static_cast(i) / static_cast(TIMESTEPS)) * multiplier; + sigmas[i - 1] = t_to_sigma(t); } } @@ -681,12 +684,71 @@ struct DiscreteFlowDenoiser : public Denoiser { } float sigma_to_t(float sigma) override { - return sigma * 1000.f; + return sigma * multiplier; } float t_to_sigma(float t) override { - t = t + 1; - return time_snr_shift(shift, t / 1000.f); + return time_snr_shift(shift, t / multiplier); + } + + std::vector get_sigmas(uint32_t n, int /*image_seq_len*/, scheduler_t scheduler_type, SDVersion version) override { + auto scaled_t_to_sigma = [&](float t) { + float t_scaled = (t / static_cast(TIMESTEPS)) * multiplier; + return t_to_sigma(t_scaled); + }; + + std::shared_ptr scheduler; + switch (scheduler_type) { + case DISCRETE_SCHEDULER: + LOG_INFO("get_sigmas with discrete scheduler"); + scheduler = std::make_shared(); + break; + case KARRAS_SCHEDULER: + LOG_INFO("get_sigmas with Karras scheduler"); + scheduler = std::make_shared(); + break; + case EXPONENTIAL_SCHEDULER: + LOG_INFO("get_sigmas exponential scheduler"); + scheduler = std::make_shared(); + break; + case AYS_SCHEDULER: + LOG_INFO("get_sigmas with Align-Your-Steps scheduler"); + scheduler = std::make_shared(version); + break; + case GITS_SCHEDULER: + LOG_INFO("get_sigmas with GITS scheduler"); + scheduler = 
std::make_shared(); + break; + case SGM_UNIFORM_SCHEDULER: + LOG_INFO("get_sigmas with SGM Uniform scheduler"); + scheduler = std::make_shared(); + break; + case SIMPLE_SCHEDULER: + LOG_INFO("get_sigmas with Simple scheduler"); + scheduler = std::make_shared(); + break; + case SMOOTHSTEP_SCHEDULER: + LOG_INFO("get_sigmas with SmoothStep scheduler"); + scheduler = std::make_shared(); + break; + case BONG_TANGENT_SCHEDULER: + LOG_INFO("get_sigmas with bong_tangent scheduler"); + scheduler = std::make_shared(); + break; + case KL_OPTIMAL_SCHEDULER: + LOG_INFO("get_sigmas with KL Optimal scheduler"); + scheduler = std::make_shared(); + break; + case LCM_SCHEDULER: + LOG_INFO("get_sigmas with LCM scheduler"); + scheduler = std::make_shared(); + break; + default: + LOG_INFO("get_sigmas with discrete scheduler (default)"); + scheduler = std::make_shared(); + break; + } + return scheduler->get_sigmas(n, sigma_min(), sigma_max(), scaled_t_to_sigma); } std::vector get_scalings(float sigma) override { diff --git a/src/diffusion_model.hpp b/src/diffusion_model.hpp index 329bb9d9a..0494dad06 100644 --- a/src/diffusion_model.hpp +++ b/src/diffusion_model.hpp @@ -2,12 +2,14 @@ #define __DIFFUSION_MODEL_H__ #include "anima.hpp" +#include "ace.hpp" #include "flux.hpp" #include "mmdit.hpp" #include "qwen_image.hpp" #include "unet.hpp" #include "wan.hpp" #include "z_image.hpp" +#include struct DiffusionParams { struct ggml_tensor* x = nullptr; @@ -16,6 +18,9 @@ struct DiffusionParams { struct ggml_tensor* c_concat = nullptr; struct ggml_tensor* y = nullptr; struct ggml_tensor* guidance = nullptr; + struct ggml_tensor* lyric_embed = nullptr; + struct ggml_tensor* refer_audio = nullptr; + std::shared_ptr> audio_codes; std::vector ref_latents = {}; bool increase_ref_index = false; int num_video_frames = -1; @@ -43,6 +48,71 @@ struct DiffusionModel { virtual void set_circular_axes(bool circular_x, bool circular_y) = 0; }; +struct AceModel : public DiffusionModel { + ACE::AceRunner ace; + 
+ AceModel(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map = {}) + : ace(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model") { + } + + std::string get_desc() override { + return ace.get_desc(); + } + + void alloc_params_buffer() override { + ace.alloc_params_buffer(); + } + + void free_params_buffer() override { + ace.free_params_buffer(); + } + + void free_compute_buffer() override { + ace.free_compute_buffer(); + } + + void get_param_tensors(std::map& tensors) override { + ace.get_param_tensors(tensors, "model.diffusion_model"); + } + + size_t get_params_buffer_size() override { + return ace.get_params_buffer_size(); + } + + void set_weight_adapter(const std::shared_ptr& adapter) override { + ace.set_weight_adapter(adapter); + } + + int64_t get_adm_in_channels() override { + return 0; + } + + void set_flash_attention_enabled(bool enabled) override { + ace.set_flash_attention_enabled(enabled); + } + + void set_circular_axes(bool circular_x, bool circular_y) override { + ace.set_circular_axes(circular_x, circular_y); + } + + bool compute(int n_threads, + DiffusionParams diffusion_params, + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) override { + return ace.compute(n_threads, + diffusion_params.x, + diffusion_params.timesteps, + diffusion_params.context, + diffusion_params.lyric_embed, + diffusion_params.refer_audio, + diffusion_params.audio_codes, + output, + output_ctx); + } +}; + struct UNetModel : public DiffusionModel { UNetModelRunner unet; diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index 6642cfd5f..db1195871 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -173,13 +173,43 @@ __STATIC_INLINE__ void ggml_ext_tensor_set_f32(struct ggml_tensor* tensor, float } __STATIC_INLINE__ float ggml_ext_tensor_get_f32(const ggml_tensor* tensor, int64_t i0, int64_t i1 = 0, int64_t i2 = 0, int64_t i3 = 0) { + size_t offset = i3 * 
tensor->nb[3] + i2 * tensor->nb[2] + i1 * tensor->nb[1] + i0 * tensor->nb[0]; + if (tensor->buffer != nullptr) { - float value; - ggml_backend_tensor_get(tensor, &value, i3 * tensor->nb[3] + i2 * tensor->nb[2] + i1 * tensor->nb[1] + i0 * tensor->nb[0], sizeof(float)); - return value; + if (tensor->type == GGML_TYPE_F32) { + float value; + ggml_backend_tensor_get(tensor, &value, offset, sizeof(value)); + return value; + } else if (tensor->type == GGML_TYPE_F16) { + ggml_fp16_t value; + ggml_backend_tensor_get(tensor, &value, offset, sizeof(value)); + return ggml_fp16_to_fp32(value); + } else if (tensor->type == GGML_TYPE_BF16) { + ggml_bf16_t value; + ggml_backend_tensor_get(tensor, &value, offset, sizeof(value)); + return ggml_bf16_to_fp32(value); + } else if (tensor->type == GGML_TYPE_I32) { + int32_t value; + ggml_backend_tensor_get(tensor, &value, offset, sizeof(value)); + return (float)value; + } } - GGML_ASSERT(tensor->nb[0] == sizeof(float)); - return *(float*)((char*)(tensor->data) + i3 * tensor->nb[3] + i2 * tensor->nb[2] + i1 * tensor->nb[1] + i0 * tensor->nb[0]); + + if (tensor->type == GGML_TYPE_F32) { + return *(float*)((char*)(tensor->data) + offset); + } else if (tensor->type == GGML_TYPE_F16) { + ggml_fp16_t value = *(ggml_fp16_t*)((char*)(tensor->data) + offset); + return ggml_fp16_to_fp32(value); + } else if (tensor->type == GGML_TYPE_BF16) { + ggml_bf16_t value = *(ggml_bf16_t*)((char*)(tensor->data) + offset); + return ggml_bf16_to_fp32(value); + } else if (tensor->type == GGML_TYPE_I32) { + int32_t value = *(int32_t*)((char*)(tensor->data) + offset); + return (float)value; + } + + GGML_ASSERT(false); + return 0.0f; } __STATIC_INLINE__ int ggml_ext_tensor_get_i32(const ggml_tensor* tensor, int64_t i0, int64_t i1 = 0, int64_t i2 = 0, int64_t i3 = 0) { @@ -1211,6 +1241,21 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_full(struct ggml_context* ctx, return t; } +__STATIC_INLINE__ struct ggml_tensor* ggml_ext_repeat(struct ggml_context* ctx, + 
struct ggml_tensor* a, + struct ggml_tensor* like) { + if (like->type == GGML_TYPE_F16 || like->type == GGML_TYPE_F32) { + return ggml_cont(ctx, ggml_repeat(ctx, a, like)); + } + int n_dims = ggml_n_dims(like); + int64_t ne[GGML_MAX_DIMS] = {1, 1, 1, 1}; + for (int i = 0; i < n_dims; ++i) { + ne[i] = like->ne[i]; + } + auto target = ggml_new_tensor(ctx, GGML_TYPE_F16, n_dims, ne); + return ggml_cont(ctx, ggml_repeat(ctx, a, target)); +} + __STATIC_INLINE__ struct ggml_tensor* ggml_ext_zeros(struct ggml_context* ctx, int64_t ne0, int64_t ne1, @@ -1611,6 +1656,13 @@ struct GGMLRunnerContext { bool circular_x_enabled = false; bool circular_y_enabled = false; std::shared_ptr weight_adapter = nullptr; + std::map* backend_tensor_data_map = nullptr; + + void set_backend_tensor_data(struct ggml_tensor* tensor, const void* data) { + if (backend_tensor_data_map != nullptr) { + (*backend_tensor_data_map)[tensor] = data; + } + } }; struct GGMLRunner { @@ -1807,6 +1859,18 @@ struct GGMLRunner { auto tensor = kv.first; auto data = kv.second; + if (data == nullptr) { + LOG_WARN("%s: backend tensor data is null for '%s', skipping data copy", + get_desc().c_str(), + ggml_get_name(tensor)); + continue; + } + if (tensor->buffer == nullptr) { + LOG_WARN("%s: backend tensor buffer not set for '%s', skipping data copy", + get_desc().c_str(), + ggml_get_name(tensor)); + continue; + } ggml_backend_tensor_set(tensor, data, 0, ggml_nbytes(tensor)); } @@ -1928,6 +1992,7 @@ struct GGMLRunner { runner_ctx.circular_x_enabled = circular_x_enabled; runner_ctx.circular_y_enabled = circular_y_enabled; runner_ctx.weight_adapter = weight_adapter; + runner_ctx.backend_tensor_data_map = &backend_tensor_data_map; return runner_ctx; } @@ -2102,6 +2167,25 @@ class GGMLBlock { return wtype; } + bool tensor_has_shape_3d(const std::string& name, + const String2TensorStorage& tensor_storage_map, + int64_t n0, + int64_t n1, + int64_t n2) const { + auto iter = tensor_storage_map.find(name); + if (iter == 
tensor_storage_map.end()) { + return false; + } + const TensorStorage& tensor_storage = iter->second; + if (tensor_storage.n_dims < 3) { + return false; + } + if (tensor_storage.n_dims > 3 && tensor_storage.ne[3] != 1) { + return false; + } + return tensor_storage.ne[0] == n0 && tensor_storage.ne[1] == n1 && tensor_storage.ne[2] == n2; + } + void init_blocks(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") { for (auto& pair : blocks) { auto& block = pair.second; @@ -2223,6 +2307,22 @@ class Linear : public UnaryBlock { force_prec_f32(force_prec_f32), scale(scale) {} + struct ggml_tensor* get_weight() const { + auto iter = params.find("weight"); + if (iter == params.end()) { + return nullptr; + } + return iter->second; + } + + struct ggml_tensor* get_bias() const { + auto iter = params.find("bias"); + if (iter == params.end()) { + return nullptr; + } + return iter->second; + } + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { struct ggml_tensor* w = params["weight"]; struct ggml_tensor* b = nullptr; @@ -2266,6 +2366,14 @@ class Embedding : public UnaryBlock { num_embeddings(num_embeddings) { } + struct ggml_tensor* get_weight() const { + auto iter = params.find("weight"); + if (iter == params.end()) { + return nullptr; + } + return iter->second; + } + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* input_ids) { // input_ids: [N, n_token] @@ -2369,6 +2477,445 @@ class Conv2d : public UnaryBlock { } }; +class Conv1d : public UnaryBlock { +protected: + int64_t in_channels; + int64_t out_channels; + int kernel_size; + int stride; + int padding; + int dilation; + bool bias; + std::string prefix; + + void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override { + this->prefix = prefix; + enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32); + // PyTorch 
Conv1d weights are stored as [out_channels, in_channels, kernel_size] + // Some models store already-permuted [kernel_size, in_channels, out_channels]. + if (tensor_has_shape_3d(prefix + "weight", tensor_storage_map, kernel_size, in_channels, out_channels)) { + params["weight"] = ggml_new_tensor_3d(ctx, wtype, kernel_size, in_channels, out_channels); + } else { + params["weight"] = ggml_new_tensor_3d(ctx, wtype, out_channels, in_channels, kernel_size); + } + if (bias) { + enum ggml_type btype = get_type(prefix + "bias", tensor_storage_map, GGML_TYPE_F32); + params["bias"] = ggml_new_tensor_1d(ctx, btype, out_channels); + } + } + +public: + Conv1d(int64_t in_channels, + int64_t out_channels, + int kernel_size, + int stride = 1, + int padding = 0, + int dilation = 1, + bool bias = true) + : in_channels(in_channels), + out_channels(out_channels), + kernel_size(kernel_size), + stride(stride), + padding(padding), + dilation(dilation), + bias(bias) {} + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { + struct ggml_tensor* w = params["weight"]; + struct ggml_tensor* b = bias ? 
params["bias"] : nullptr; + + if (w->ne[0] == out_channels && w->ne[1] == in_channels && w->ne[2] == kernel_size) { + // Convert [out, in, k] -> [k, in, out] for ggml_conv_1d + w = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, w, 2, 1, 0, 3)); + } + + x = ggml_conv_1d(ctx->ggml_ctx, w, x, stride, padding, dilation); + + if (b != nullptr) { + auto b_view = ggml_reshape_3d(ctx->ggml_ctx, b, 1, b->ne[0], 1); + b_view = ggml_ext_repeat(ctx->ggml_ctx, b_view, x); + x = ggml_add_inplace(ctx->ggml_ctx, x, b_view); + } + return x; + } +}; + +class ConvTranspose1d : public UnaryBlock { +protected: + int64_t in_channels; + int64_t out_channels; + int kernel_size; + int stride; + int padding; + int dilation; + bool bias; + std::string prefix; + + void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override { + this->prefix = prefix; + enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32); + // PyTorch ConvTranspose1d weights are stored as [in_channels, out_channels, kernel_size] + // Some models store already-permuted [kernel_size, out_channels, in_channels]. 
+ if (tensor_has_shape_3d(prefix + "weight", tensor_storage_map, kernel_size, out_channels, in_channels)) { + params["weight"] = ggml_new_tensor_3d(ctx, wtype, kernel_size, out_channels, in_channels); + } else { + params["weight"] = ggml_new_tensor_3d(ctx, wtype, in_channels, out_channels, kernel_size); + } + if (bias) { + enum ggml_type btype = get_type(prefix + "bias", tensor_storage_map, GGML_TYPE_F32); + params["bias"] = ggml_new_tensor_1d(ctx, btype, out_channels); + } + } + +public: + ConvTranspose1d(int64_t in_channels, + int64_t out_channels, + int kernel_size, + int stride = 1, + int padding = 0, + int dilation = 1, + bool bias = true) + : in_channels(in_channels), + out_channels(out_channels), + kernel_size(kernel_size), + stride(stride), + padding(padding), + dilation(dilation), + bias(bias) {} + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { + GGML_ASSERT(dilation == 1); + GGML_ASSERT(padding >= 0); + + struct ggml_tensor* w = params["weight"]; + struct ggml_tensor* b = bias ? 
params["bias"] : nullptr; + + if (w->ne[0] == in_channels && w->ne[1] == out_channels && w->ne[2] == kernel_size) { + // Convert [in, out, k] -> [k, out, in] for ggml_conv_transpose_1d + w = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, w, 2, 1, 0, 3)); + } + + int64_t batch = x->ne[2]; + struct ggml_tensor* out = nullptr; + for (int64_t i = 0; i < batch; i++) { + struct ggml_tensor* x_i = ggml_view_3d(ctx->ggml_ctx, + x, + x->ne[0], + x->ne[1], + 1, + x->nb[1], + x->nb[2], + i * x->nb[2]); + x_i = ggml_reshape_2d(ctx->ggml_ctx, x_i, x_i->ne[0], x_i->ne[1]); + struct ggml_tensor* out_i = ggml_conv_transpose_1d(ctx->ggml_ctx, w, x_i, stride, 0, 1); + if (padding > 0) { + out_i = ggml_ext_slice(ctx->ggml_ctx, out_i, 0, padding, out_i->ne[0] - padding); + } + out_i = ggml_reshape_3d(ctx->ggml_ctx, out_i, out_i->ne[0], out_i->ne[1], 1); + if (out == nullptr) { + out = out_i; + } else { + out = ggml_concat(ctx->ggml_ctx, out, out_i, 2); + } + } + + if (b != nullptr) { + auto b_view = ggml_reshape_3d(ctx->ggml_ctx, b, 1, b->ne[0], 1); + b_view = ggml_ext_repeat(ctx->ggml_ctx, b_view, out); + out = ggml_add_inplace(ctx->ggml_ctx, out, b_view); + } + return out; + } +}; + +class Snake1d : public UnaryBlock { +protected: + int64_t channels; + bool logscale; + + void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override { + enum ggml_type wtype = get_type(prefix + "alpha", tensor_storage_map, GGML_TYPE_F32); + params["alpha"] = ggml_new_tensor_1d(ctx, wtype, channels); + wtype = get_type(prefix + "beta", tensor_storage_map, GGML_TYPE_F32); + params["beta"] = ggml_new_tensor_1d(ctx, wtype, channels); + } + + struct ggml_tensor* broadcast_param(struct ggml_context* ctx, struct ggml_tensor* p, struct ggml_tensor* x) const { + if (ggml_n_dims(x) == 4) { + p = ggml_reshape_4d(ctx, p, 1, p->ne[0], 1, 1); + } else { + p = ggml_reshape_3d(ctx, p, 1, p->ne[0], 1); + } + return ggml_ext_repeat(ctx, p, x); + 
} + +public: + Snake1d(int64_t channels, bool logscale = true) + : channels(channels), logscale(logscale) {} + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { + auto alpha = params["alpha"]; + auto beta = params["beta"]; + if (logscale) { + alpha = ggml_exp(ctx->ggml_ctx, alpha); + beta = ggml_exp(ctx->ggml_ctx, beta); + } + + alpha = broadcast_param(ctx->ggml_ctx, alpha, x); + beta = broadcast_param(ctx->ggml_ctx, beta, x); + + auto ax = ggml_mul(ctx->ggml_ctx, x, alpha); + auto sin_ax = ggml_sin(ctx->ggml_ctx, ax); + auto sin_sq = ggml_sqr(ctx->ggml_ctx, sin_ax); + + auto eps = ggml_ext_full(ctx->ggml_ctx, 1e-9f, x->ne[0], x->ne[1], x->ne[2], x->ne[3]); + auto inv = ggml_div(ctx->ggml_ctx, ggml_ext_ones(ctx->ggml_ctx, x->ne[0], x->ne[1], x->ne[2], x->ne[3]), ggml_add(ctx->ggml_ctx, beta, eps)); + auto add = ggml_mul(ctx->ggml_ctx, inv, sin_sq); + return ggml_add(ctx->ggml_ctx, x, add); + } +}; + +class WNConv1d : public UnaryBlock { +protected: + int64_t in_channels; + int64_t out_channels; + int kernel_size; + int stride; + int padding; + int dilation; + bool bias; + std::string weight_v_key = "weight_v"; + std::string weight_g_key = "weight_g"; + + void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override { + std::string param_g = "parametrizations.weight.original0"; + std::string param_v = "parametrizations.weight.original1"; + if (tensor_storage_map.find(prefix + param_g) != tensor_storage_map.end() || + tensor_storage_map.find(prefix + param_v) != tensor_storage_map.end()) { + bool g_is_original0 = tensor_has_shape_3d(prefix + param_g, tensor_storage_map, out_channels, 1, 1) || + tensor_has_shape_3d(prefix + param_g, tensor_storage_map, 1, 1, out_channels); + if (g_is_original0) { + weight_g_key = param_g; + weight_v_key = param_v; + } else { + weight_g_key = param_v; + weight_v_key = param_g; + } + } + + enum ggml_type wtype = get_type(prefix + 
weight_v_key, tensor_storage_map, GGML_TYPE_F32); + // PyTorch weight_norm Conv1d uses [out_channels, in_channels, kernel_size] + // Some models store already-permuted [kernel_size, in_channels, out_channels]. + if (tensor_has_shape_3d(prefix + weight_v_key, tensor_storage_map, kernel_size, in_channels, out_channels)) { + params[weight_v_key] = ggml_new_tensor_3d(ctx, wtype, kernel_size, in_channels, out_channels); + } else { + params[weight_v_key] = ggml_new_tensor_3d(ctx, wtype, out_channels, in_channels, kernel_size); + } + wtype = get_type(prefix + weight_g_key, tensor_storage_map, GGML_TYPE_F32); + if (tensor_has_shape_3d(prefix + weight_g_key, tensor_storage_map, 1, 1, out_channels)) { + params[weight_g_key] = ggml_new_tensor_3d(ctx, wtype, 1, 1, out_channels); + } else { + params[weight_g_key] = ggml_new_tensor_3d(ctx, wtype, out_channels, 1, 1); + } + if (bias) { + enum ggml_type btype = get_type(prefix + "bias", tensor_storage_map, GGML_TYPE_F32); + params["bias"] = ggml_new_tensor_1d(ctx, btype, out_channels); + } + } + + struct ggml_tensor* normalize_weight(GGMLRunnerContext* ctx, struct ggml_tensor* w_v, struct ggml_tensor* w_g) const { + auto w_v_f = ggml_cast(ctx->ggml_ctx, w_v, GGML_TYPE_F32); + auto w_g_f = ggml_cast(ctx->ggml_ctx, w_g, GGML_TYPE_F32); + w_v_f = ggml_cont(ctx->ggml_ctx, w_v_f); + w_g_f = ggml_cont(ctx->ggml_ctx, w_g_f); + + if (w_v_f->ne[0] == out_channels && w_v_f->ne[1] == in_channels && w_v_f->ne[2] == kernel_size) { + // Convert [out, in, k] -> [k, in, out] + w_v_f = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, w_v_f, 2, 1, 0, 3)); + } + + auto w_flat = ggml_reshape_2d(ctx->ggml_ctx, w_v_f, w_v_f->ne[0] * w_v_f->ne[1], w_v_f->ne[2]); + auto w_sq = ggml_sqr(ctx->ggml_ctx, w_flat); + auto w_norm = ggml_sum_rows(ctx->ggml_ctx, w_sq); + w_norm = ggml_sqrt(ctx->ggml_ctx, w_norm); + w_norm = ggml_reshape_1d(ctx->ggml_ctx, w_norm, ggml_nelements(w_norm)); + + auto g_1d = ggml_reshape_1d(ctx->ggml_ctx, w_g_f, 
ggml_nelements(w_norm)); + auto scale = ggml_div(ctx->ggml_ctx, g_1d, w_norm); + scale = ggml_reshape_3d(ctx->ggml_ctx, scale, 1, 1, scale->ne[0]); + scale = ggml_ext_repeat(ctx->ggml_ctx, scale, w_v_f); + + auto w = ggml_mul(ctx->ggml_ctx, w_v_f, scale); + if (w_v->type != GGML_TYPE_F32) { + w = ggml_cast(ctx->ggml_ctx, w, w_v->type); + } + return w; + } + +public: + WNConv1d(int64_t in_channels, + int64_t out_channels, + int kernel_size, + int stride = 1, + int padding = 0, + int dilation = 1, + bool bias = true) + : in_channels(in_channels), + out_channels(out_channels), + kernel_size(kernel_size), + stride(stride), + padding(padding), + dilation(dilation), + bias(bias) {} + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { + auto w = normalize_weight(ctx, params[weight_v_key], params[weight_g_key]); + struct ggml_tensor* b = bias ? params["bias"] : nullptr; + + x = ggml_conv_1d(ctx->ggml_ctx, w, x, stride, padding, dilation); + + if (b != nullptr) { + auto b_view = ggml_reshape_3d(ctx->ggml_ctx, b, 1, b->ne[0], 1); + b_view = ggml_ext_repeat(ctx->ggml_ctx, b_view, x); + x = ggml_add_inplace(ctx->ggml_ctx, x, b_view); + } + return x; + } +}; + +class WNConvTranspose1d : public UnaryBlock { +protected: + int64_t in_channels; + int64_t out_channels; + int kernel_size; + int stride; + int padding; + int dilation; + bool bias; + std::string weight_v_key = "weight_v"; + std::string weight_g_key = "weight_g"; + + void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override { + std::string param_g = "parametrizations.weight.original0"; + std::string param_v = "parametrizations.weight.original1"; + if (tensor_storage_map.find(prefix + param_g) != tensor_storage_map.end() || + tensor_storage_map.find(prefix + param_v) != tensor_storage_map.end()) { + bool g_is_original0 = tensor_has_shape_3d(prefix + param_g, tensor_storage_map, in_channels, 1, 1) || + 
tensor_has_shape_3d(prefix + param_g, tensor_storage_map, 1, 1, in_channels); + if (g_is_original0) { + weight_g_key = param_g; + weight_v_key = param_v; + } else { + weight_g_key = param_v; + weight_v_key = param_g; + } + } + + enum ggml_type wtype = get_type(prefix + weight_v_key, tensor_storage_map, GGML_TYPE_F32); + // PyTorch ConvTranspose1d weight_norm uses [in_channels, out_channels, kernel_size] + // Some models store already-permuted [kernel_size, out_channels, in_channels]. + if (tensor_has_shape_3d(prefix + weight_v_key, tensor_storage_map, kernel_size, out_channels, in_channels)) { + params[weight_v_key] = ggml_new_tensor_3d(ctx, wtype, kernel_size, out_channels, in_channels); + } else { + params[weight_v_key] = ggml_new_tensor_3d(ctx, wtype, in_channels, out_channels, kernel_size); + } + wtype = get_type(prefix + weight_g_key, tensor_storage_map, GGML_TYPE_F32); + if (tensor_has_shape_3d(prefix + weight_g_key, tensor_storage_map, 1, 1, in_channels)) { + params[weight_g_key] = ggml_new_tensor_3d(ctx, wtype, 1, 1, in_channels); + } else { + params[weight_g_key] = ggml_new_tensor_3d(ctx, wtype, in_channels, 1, 1); + } + if (bias) { + enum ggml_type btype = get_type(prefix + "bias", tensor_storage_map, GGML_TYPE_F32); + params["bias"] = ggml_new_tensor_1d(ctx, btype, out_channels); + } + } + + struct ggml_tensor* normalize_weight(GGMLRunnerContext* ctx, struct ggml_tensor* w_v, struct ggml_tensor* w_g) const { + auto w_v_f = ggml_cast(ctx->ggml_ctx, w_v, GGML_TYPE_F32); + auto w_g_f = ggml_cast(ctx->ggml_ctx, w_g, GGML_TYPE_F32); + w_v_f = ggml_cont(ctx->ggml_ctx, w_v_f); + w_g_f = ggml_cont(ctx->ggml_ctx, w_g_f); + + if (w_v_f->ne[0] == in_channels && w_v_f->ne[1] == out_channels && w_v_f->ne[2] == kernel_size) { + // Convert [in, out, k] -> [k, out, in] + w_v_f = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, w_v_f, 2, 1, 0, 3)); + } + + auto w_flat = ggml_reshape_2d(ctx->ggml_ctx, w_v_f, w_v_f->ne[0] * w_v_f->ne[1], w_v_f->ne[2]); + auto w_sq = 
ggml_sqr(ctx->ggml_ctx, w_flat); + auto w_norm = ggml_sum_rows(ctx->ggml_ctx, w_sq); + w_norm = ggml_sqrt(ctx->ggml_ctx, w_norm); + w_norm = ggml_reshape_1d(ctx->ggml_ctx, w_norm, ggml_nelements(w_norm)); + + auto g_1d = ggml_reshape_1d(ctx->ggml_ctx, w_g_f, ggml_nelements(w_norm)); + auto scale = ggml_div(ctx->ggml_ctx, g_1d, w_norm); + scale = ggml_reshape_3d(ctx->ggml_ctx, scale, 1, 1, scale->ne[0]); + scale = ggml_ext_repeat(ctx->ggml_ctx, scale, w_v_f); + + auto w = ggml_mul(ctx->ggml_ctx, w_v_f, scale); + if (w_v->type != GGML_TYPE_F32) { + w = ggml_cast(ctx->ggml_ctx, w, w_v->type); + } + return w; + } + +public: + WNConvTranspose1d(int64_t in_channels, + int64_t out_channels, + int kernel_size, + int stride = 1, + int padding = 0, + int dilation = 1, + bool bias = true) + : in_channels(in_channels), + out_channels(out_channels), + kernel_size(kernel_size), + stride(stride), + padding(padding), + dilation(dilation), + bias(bias) {} + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { + GGML_ASSERT(dilation == 1); + GGML_ASSERT(padding >= 0); + + auto w = normalize_weight(ctx, params[weight_v_key], params[weight_g_key]); + struct ggml_tensor* b = bias ? 
params["bias"] : nullptr; + + int64_t batch = x->ne[2]; + struct ggml_tensor* out = nullptr; + for (int64_t i = 0; i < batch; i++) { + struct ggml_tensor* x_i = ggml_view_3d(ctx->ggml_ctx, + x, + x->ne[0], + x->ne[1], + 1, + x->nb[1], + x->nb[2], + i * x->nb[2]); + x_i = ggml_reshape_2d(ctx->ggml_ctx, x_i, x_i->ne[0], x_i->ne[1]); + struct ggml_tensor* out_i = ggml_conv_transpose_1d(ctx->ggml_ctx, w, x_i, stride, 0, 1); + if (padding > 0) { + out_i = ggml_ext_slice(ctx->ggml_ctx, out_i, 0, padding, out_i->ne[0] - padding); + } + out_i = ggml_reshape_3d(ctx->ggml_ctx, out_i, out_i->ne[0], out_i->ne[1], 1); + if (out == nullptr) { + out = out_i; + } else { + out = ggml_concat(ctx->ggml_ctx, out, out_i, 2); + } + } + + if (b != nullptr) { + auto b_view = ggml_reshape_3d(ctx->ggml_ctx, b, 1, b->ne[0], 1); + b_view = ggml_ext_repeat(ctx->ggml_ctx, b_view, out); + out = ggml_add_inplace(ctx->ggml_ctx, out, b_view); + } + return out; + } +}; + class Conv3d : public UnaryBlock { protected: int64_t in_channels; diff --git a/src/llm.hpp b/src/llm.hpp index 5490f07c7..d49e653c5 100644 --- a/src/llm.hpp +++ b/src/llm.hpp @@ -2,6 +2,7 @@ #define __LLM_HPP__ #include +#include #include #include #include @@ -371,6 +372,225 @@ namespace LLM { } }; + class Qwen3Tokenizer : public BPETokenizer { + protected: + static std::string read_file_to_string(const std::string& path) { + std::ifstream ifs(path, std::ios::binary); + if (!ifs.good()) { + return ""; + } + std::ostringstream oss; + oss << ifs.rdbuf(); + return oss.str(); + } + + void load_from_merges(const std::string& merges_utf8_str) { + auto byte_unicode_pairs = bytes_to_unicode(); + byte_encoder = std::map(byte_unicode_pairs.begin(), byte_unicode_pairs.end()); + for (auto& pair : byte_unicode_pairs) { + byte_decoder[pair.second] = pair.first; + } + std::vector merges; + size_t start = 0; + size_t pos; + std::u32string merges_utf32_str = utf8_to_utf32(merges_utf8_str); + while ((pos = merges_utf32_str.find('\n', start)) != 
std::string::npos) { + merges.push_back(merges_utf32_str.substr(start, pos - start)); + start = pos + 1; + } + LOG_DEBUG("merges size %llu", merges.size()); + std::vector> merge_pairs; + merge_pairs.reserve(merges.size()); + for (const auto& merge : merges) { + size_t space_pos = merge.find(' '); + if (space_pos == std::u32string::npos) { + continue; + } + merge_pairs.emplace_back(merge.substr(0, space_pos), merge.substr(space_pos + 1)); + } + + std::vector tokens; + for (const auto& pair : byte_unicode_pairs) { + tokens.push_back(pair.second); + } + for (const auto& merge : merge_pairs) { + tokens.push_back(merge.first + merge.second); + } + for (auto& special_token : special_tokens) { + tokens.push_back(utf8_to_utf32(special_token)); + } + + int i = 0; + for (const auto& token : tokens) { + encoder[token] = i; + decoder[i] = token; + i++; + } + encoder_len = i; + LOG_DEBUG("vocab size: %d", encoder_len); + + int rank = 0; + for (const auto& merge : merge_pairs) { + bpe_ranks[merge] = rank++; + } + bpe_len = rank; + } + + void load_from_tokenizer_json(const std::string& json_str) { + nlohmann::json tok; + try { + tok = nlohmann::json::parse(json_str); + } catch (const nlohmann::json::parse_error&) { + GGML_ABORT("invalid qwen3 tokenizer json"); + } + + if (!tok.contains("model") || !tok["model"].contains("vocab") || !tok["model"].contains("merges")) { + GGML_ABORT("qwen3 tokenizer json missing vocab/merges"); + } + + auto vocab = tok["model"]["vocab"]; + int max_id = -1; + for (auto it = vocab.begin(); it != vocab.end(); ++it) { + const std::string token = it.key(); + int id = it.value(); + std::u32string token_u = utf8_to_utf32(token); + encoder[token_u] = id; + decoder[id] = token_u; + max_id = std::max(max_id, id); + } + + special_tokens.clear(); + if (tok.contains("added_tokens")) { + for (auto& item : tok["added_tokens"]) { + if (!item.contains("content") || !item.contains("id")) { + continue; + } + std::string content = item["content"]; + int id = 
item["id"]; + std::u32string u = utf8_to_utf32(content); + encoder[u] = id; + decoder[id] = u; + special_tokens.push_back(content); + if (content == "<|endoftext|>") { + UNK_TOKEN_ID = id; + EOS_TOKEN_ID = id; + PAD_TOKEN_ID = id; + } + max_id = std::max(max_id, id); + } + } + + encoder_len = max_id + 1; + LOG_DEBUG("qwen3 vocab size: %d", encoder_len); + + auto byte_unicode_pairs = bytes_to_unicode(); + byte_encoder = std::map(byte_unicode_pairs.begin(), byte_unicode_pairs.end()); + for (auto& pair : byte_unicode_pairs) { + byte_decoder[pair.second] = pair.first; + } + + std::vector> merge_pairs; + auto merges_json = tok["model"]["merges"]; + merge_pairs.reserve(merges_json.size()); + for (auto& merge_item : merges_json) { + if (merge_item.is_string()) { + std::string merge_str = merge_item.get(); + std::u32string merge_u32 = utf8_to_utf32(merge_str); + size_t space_pos = merge_u32.find(' '); + if (space_pos == std::u32string::npos) { + continue; + } + merge_pairs.emplace_back(merge_u32.substr(0, space_pos), merge_u32.substr(space_pos + 1)); + } else if (merge_item.is_array() && merge_item.size() == 2) { + std::string first = merge_item[0].get(); + std::string second = merge_item[1].get(); + merge_pairs.emplace_back(utf8_to_utf32(first), utf8_to_utf32(second)); + } + } + + int rank = 0; + for (const auto& merge : merge_pairs) { + bpe_ranks[merge] = rank++; + } + bpe_len = rank; + } + + public: + explicit Qwen3Tokenizer(const std::string& tokenizer_json_str = "") { + UNK_TOKEN = "<|endoftext|>"; + EOS_TOKEN = "<|endoftext|>"; + PAD_TOKEN = "<|endoftext|>"; + + UNK_TOKEN_ID = 151643; + EOS_TOKEN_ID = 151643; + PAD_TOKEN_ID = 151643; + + std::string json_str = tokenizer_json_str; + if (json_str.empty()) { + if (const char* env_path = std::getenv("QWEN3_TOKENIZER_PATH"); env_path && *env_path) { + json_str = read_file_to_string(env_path); + } else if (const char* env_path = std::getenv("ACE_QWEN3_TOKENIZER_PATH"); env_path && *env_path) { + json_str = 
read_file_to_string(env_path); + } else if (const char* env_root = std::getenv("ACE_STEP_HOME"); env_root && *env_root) { + std::string p = std::string(env_root) + "/checkpoints/Qwen3-Embedding-0.6B/tokenizer.json"; + json_str = read_file_to_string(p); + } else if (const char* env_root = std::getenv("ACE_STEP_PATH"); env_root && *env_root) { + std::string p = std::string(env_root) + "/checkpoints/Qwen3-Embedding-0.6B/tokenizer.json"; + json_str = read_file_to_string(p); + } else { + // common local fallback paths + const std::vector fallback_paths = { + "./ACE-Step-1.5/checkpoints/Qwen3-Embedding-0.6B/tokenizer.json", + "../ACE-Step-1.5/checkpoints/Qwen3-Embedding-0.6B/tokenizer.json", + "../../ACE-Step-1.5/checkpoints/Qwen3-Embedding-0.6B/tokenizer.json", + "./tokenizer.json", + }; + for (const auto& p : fallback_paths) { + json_str = read_file_to_string(p); + if (!json_str.empty()) { + break; + } + } + } + } + + if (!json_str.empty()) { + load_from_tokenizer_json(json_str); + } else { + LOG_WARN("Qwen3 tokenizer json not found, falling back to Qwen2 merges. 
Set QWEN3_TOKENIZER_PATH for correct tokenization."); + special_tokens = { + "<|endoftext|>", + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>", + "", + "", + "<|fim_prefix|>", + "<|fim_middle|>", + "<|fim_suffix|>", + "<|fim_pad|>", + "<|repo_name|>", + "<|file_sep|>", + "", + "", + "", + "", + }; + load_from_merges(load_qwen2_merges()); + } + } + }; + class MistralTokenizer : public BPETokenizer { protected: void load_from_merges(const std::string& merges_utf8_str, const std::string& vocab_utf8_str) { @@ -839,7 +1059,13 @@ namespace LLM { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* input_pos, - struct ggml_tensor* attention_mask = nullptr) { + struct ggml_tensor* attention_mask = nullptr, + struct ggml_tensor* past_k = nullptr, + struct ggml_tensor* past_v = nullptr, + struct ggml_tensor* kv_row_indices = nullptr, + int64_t kv_cache_len = -1, + struct ggml_tensor** present_k = nullptr, + struct ggml_tensor** present_v = nullptr) { // x: [N, n_token, hidden_size] int64_t n_token = x->ne[1]; int64_t N = x->ne[2]; @@ -876,13 +1102,65 @@ namespace LLM { k = ggml_rope_multi(ctx->ggml_ctx, k, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_MROPE, 128000, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); } + if (past_k != nullptr && past_v != nullptr) { + if (kv_row_indices != nullptr) { + auto k_flat = ggml_reshape_3d(ctx->ggml_ctx, k, head_dim * num_kv_heads, n_token, N); + auto v_flat = ggml_reshape_3d(ctx->ggml_ctx, v, head_dim * num_kv_heads, n_token, N); + if (k_flat->type != GGML_TYPE_F32) { + k_flat = ggml_cast(ctx->ggml_ctx, k_flat, GGML_TYPE_F32); + } + if (v_flat->type != GGML_TYPE_F32) { + v_flat = ggml_cast(ctx->ggml_ctx, v_flat, GGML_TYPE_F32); + } + k_flat = ggml_cont(ctx->ggml_ctx, k_flat); + v_flat = 
ggml_cont(ctx->ggml_ctx, v_flat); + + const int64_t cache_tokens = past_k->ne[1]; + const int64_t use_tokens = kv_cache_len > 0 ? kv_cache_len : cache_tokens; + GGML_ASSERT(use_tokens <= cache_tokens); + const int64_t start_token = std::max(0, use_tokens - n_token); + GGML_ASSERT(start_token + n_token <= cache_tokens); + + auto k_cache = ggml_set(ctx->ggml_ctx, + past_k, + k_flat, + past_k->nb[1], + past_k->nb[2], + past_k->nb[3], + static_cast(start_token) * past_k->nb[1]); + auto v_cache = ggml_set(ctx->ggml_ctx, + past_v, + v_flat, + past_v->nb[1], + past_v->nb[2], + past_v->nb[3], + static_cast(start_token) * past_v->nb[1]); + + auto k_cache_4d = ggml_reshape_4d(ctx->ggml_ctx, k_cache, head_dim, num_kv_heads, cache_tokens, N); + auto v_cache_4d = ggml_reshape_4d(ctx->ggml_ctx, v_cache, head_dim, num_kv_heads, cache_tokens, N); + + k = ggml_view_4d(ctx->ggml_ctx, k_cache_4d, head_dim, num_kv_heads, use_tokens, N, k_cache_4d->nb[1], k_cache_4d->nb[2], k_cache_4d->nb[3], 0); + v = ggml_view_4d(ctx->ggml_ctx, v_cache_4d, head_dim, num_kv_heads, use_tokens, N, v_cache_4d->nb[1], v_cache_4d->nb[2], v_cache_4d->nb[3], 0); + } else { + k = ggml_concat(ctx->ggml_ctx, past_k, k, 2); + v = ggml_concat(ctx->ggml_ctx, past_v, v, 2); + } + } + + if (present_k != nullptr) { + *present_k = k; + } + if (present_v != nullptr) { + *present_v = v; + } + q = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, q, 0, 2, 1, 3)); // [N, num_heads, n_token, head_dim] q = ggml_reshape_3d(ctx->ggml_ctx, q, q->ne[0], q->ne[1], q->ne[2] * q->ne[3]); // [N*num_heads, n_token, head_dim] k = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, k, 0, 2, 1, 3)); // [N, num_kv_heads, n_token, head_dim] k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]); // [N*num_kv_heads, n_token, head_dim] - x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, false); // [N, n_token, hidden_size] + x = 
ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, ctx->flash_attn_enabled); // [N, n_token, hidden_size] x = out_proj->forward(ctx, x); // [N, n_token, hidden_size] return x; @@ -901,7 +1179,13 @@ namespace LLM { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* input_pos, - struct ggml_tensor* attention_mask = nullptr) { + struct ggml_tensor* attention_mask = nullptr, + struct ggml_tensor* past_k = nullptr, + struct ggml_tensor* past_v = nullptr, + struct ggml_tensor* kv_row_indices = nullptr, + int64_t kv_cache_len = -1, + struct ggml_tensor** present_k = nullptr, + struct ggml_tensor** present_v = nullptr) { // x: [N, n_token, hidden_size] auto self_attn = std::dynamic_pointer_cast(blocks["self_attn"]); auto mlp = std::dynamic_pointer_cast(blocks["mlp"]); @@ -910,7 +1194,7 @@ namespace LLM { auto residual = x; x = input_layernorm->forward(ctx, x); - x = self_attn->forward(ctx, x, input_pos, attention_mask); + x = self_attn->forward(ctx, x, input_pos, attention_mask, past_k, past_v, kv_row_indices, kv_cache_len, present_k, present_v); x = ggml_add_inplace(ctx->ggml_ctx, x, residual); residual = x; @@ -941,7 +1225,13 @@ namespace LLM { struct ggml_tensor* input_pos, struct ggml_tensor* attention_mask, std::vector> image_embeds, - std::set out_layers) { + std::set out_layers, + const std::vector* past_k_cache = nullptr, + const std::vector* past_v_cache = nullptr, + struct ggml_tensor* kv_row_indices = nullptr, + int64_t kv_cache_len = -1, + std::vector* present_k_cache = nullptr, + std::vector* present_v_cache = nullptr) { // input_ids: [N, n_token] // return: [N, n_token, hidden_size] @@ -991,10 +1281,38 @@ namespace LLM { x = input_embed; } + if (out_layers.find(0) != out_layers.end()) { + intermediate_outputs.push_back(x); + } + + if (present_k_cache != nullptr) { + present_k_cache->clear(); + present_k_cache->reserve(num_layers); + } + if (present_v_cache != nullptr) { + 
present_v_cache->clear(); + present_v_cache->reserve(num_layers); + } + for (int i = 0; i < num_layers; i++) { auto block = std::dynamic_pointer_cast(blocks["layers." + std::to_string(i)]); - - x = block->forward(ctx, x, input_pos, attention_mask); + ggml_tensor* layer_past_k = nullptr; + ggml_tensor* layer_past_v = nullptr; + if (past_k_cache != nullptr && i < static_cast(past_k_cache->size())) { + layer_past_k = (*past_k_cache)[i]; + } + if (past_v_cache != nullptr && i < static_cast(past_v_cache->size())) { + layer_past_v = (*past_v_cache)[i]; + } + ggml_tensor* layer_present_k = nullptr; + ggml_tensor* layer_present_v = nullptr; + x = block->forward(ctx, x, input_pos, attention_mask, layer_past_k, layer_past_v, kv_row_indices, kv_cache_len, &layer_present_k, &layer_present_v); + if (present_k_cache != nullptr) { + present_k_cache->push_back(layer_present_k); + } + if (present_v_cache != nullptr) { + present_v_cache->push_back(layer_present_v); + } if (out_layers.find(i + 1) != out_layers.end()) { intermediate_outputs.push_back(x); } @@ -1010,6 +1328,11 @@ namespace LLM { } return x; } + + struct ggml_tensor* get_embedding_weight() const { + auto embed_tokens = std::dynamic_pointer_cast(blocks.at("embed_tokens")); + return embed_tokens ? 
embed_tokens->get_weight() : nullptr; + } }; struct LLM : public GGMLBlock { @@ -1042,14 +1365,36 @@ namespace LLM { struct ggml_tensor* input_pos, struct ggml_tensor* attention_mask, std::vector> image_embeds, - std::set out_layers) { + std::set out_layers, + const std::vector* past_k_cache = nullptr, + const std::vector* past_v_cache = nullptr, + struct ggml_tensor* kv_row_indices = nullptr, + int64_t kv_cache_len = -1, + std::vector* present_k_cache = nullptr, + std::vector* present_v_cache = nullptr) { // input_ids: [N, n_token] auto model = std::dynamic_pointer_cast(blocks["model"]); - auto x = model->forward(ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers); + auto x = model->forward(ctx, + input_ids, + input_pos, + attention_mask, + image_embeds, + out_layers, + past_k_cache, + past_v_cache, + kv_row_indices, + kv_cache_len, + present_k_cache, + present_v_cache); return x; } + struct ggml_tensor* get_embedding_weight() const { + auto model = std::dynamic_pointer_cast(blocks.at("model")); + return model ? 
model->get_embedding_weight() : nullptr; + } + struct ggml_tensor* vision_forward(GGMLRunnerContext* ctx, struct ggml_tensor* pixel_values, struct ggml_tensor* pe, @@ -1066,14 +1411,38 @@ namespace LLM { LLMParams params; bool enable_vision; LLM model; + ggml_tensor* logit_scale = nullptr; + std::string logit_scale_name; std::vector input_pos_vec; + std::vector kv_row_indices_vec; std::vector attention_mask_vec; std::vector window_mask_vec; std::vector window_index_vec; std::vector window_inverse_index_vec; std::vector pe_vec; + int64_t kv_cache_capacity = 0; + int64_t kv_cache_batch_size = 0; + int64_t logits_range_start = 0; + int64_t logits_range_end = -1; + + struct Decode1TokenGraphState { + bool ready = false; + int64_t batch_size = 0; + int64_t kv_capacity = 0; + struct ggml_cgraph* graph = nullptr; + ggml_tensor* input_ids = nullptr; + ggml_tensor* input_pos = nullptr; + ggml_tensor* attention_mask = nullptr; + ggml_tensor* kv_row_indices = nullptr; + ggml_tensor* logits = nullptr; + } decode_graph_state; + + std::vector decode_input_ids_vec; + std::vector decode_input_pos_vec; + std::vector decode_attention_mask_vec; + LLMRunner(LLMArch arch, ggml_backend_t backend, bool offload_params_to_cpu, @@ -1150,6 +1519,30 @@ namespace LLM { } model = LLM(params, enable_vision, llama_cpp_style); model.init(params_ctx, tensor_storage_map, prefix); + + std::string root_prefix = prefix; + if (ends_with(root_prefix, ".transformer.model")) { + root_prefix = root_prefix.substr(0, root_prefix.size() - strlen(".transformer.model")); + } else if (ends_with(root_prefix, ".transformer")) { + root_prefix = root_prefix.substr(0, root_prefix.size() - strlen(".transformer")); + } else if (ends_with(root_prefix, ".model")) { + root_prefix = root_prefix.substr(0, root_prefix.size() - strlen(".model")); + } + std::string candidate = root_prefix + ".logit_scale"; + auto it = tensor_storage_map.find(candidate); + if (it != tensor_storage_map.end()) { + logit_scale_name = candidate; + 
enum ggml_type wtype = GGML_TYPE_F32; + if (it->second.expected_type != GGML_TYPE_COUNT) { + wtype = it->second.expected_type; + } else { + wtype = it->second.type; + } + logit_scale = ggml_new_tensor_1d(params_ctx, wtype, 1); + } + + logits_range_start = 0; + logits_range_end = params.vocab_size; } std::string get_desc() override { @@ -1158,6 +1551,9 @@ namespace LLM { void get_param_tensors(std::map& tensors, const std::string prefix) { model.get_param_tensors(tensors, prefix); + if (logit_scale != nullptr && !logit_scale_name.empty()) { + tensors[logit_scale_name] = logit_scale; + } } struct ggml_tensor* forward(GGMLRunnerContext* ctx, @@ -1165,8 +1561,25 @@ namespace LLM { struct ggml_tensor* input_pos, struct ggml_tensor* attention_mask, std::vector> image_embeds, - std::set out_layers) { - auto hidden_states = model.forward(ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers); // [N, n_token, hidden_size] + std::set out_layers, + const std::vector* past_k_cache = nullptr, + const std::vector* past_v_cache = nullptr, + struct ggml_tensor* kv_row_indices = nullptr, + int64_t kv_cache_len = -1, + std::vector* present_k_cache = nullptr, + std::vector* present_v_cache = nullptr) { + auto hidden_states = model.forward(ctx, + input_ids, + input_pos, + attention_mask, + image_embeds, + out_layers, + past_k_cache, + past_v_cache, + kv_row_indices, + kv_cache_len, + present_k_cache, + present_v_cache); // [N, n_token, hidden_size] return hidden_states; } @@ -1180,6 +1593,280 @@ namespace LLM { return hidden_states; } + struct ggml_tensor* get_embedding_weight() const { + return model.get_embedding_weight(); + } + + bool ensure_kv_cache(int64_t required_tokens, int64_t batch_size) { + if (required_tokens <= 0 || batch_size <= 0) { + return false; + } + + int64_t target_capacity = required_tokens; + if (kv_cache_capacity > 0 && kv_cache_batch_size == batch_size && kv_cache_capacity < required_tokens) { + target_capacity = std::max(required_tokens, 
kv_cache_capacity * 2); + } else if (kv_cache_capacity == 0) { + target_capacity = std::max(required_tokens, 1024); + } + + bool need_realloc = cache_ctx == nullptr || + cache_buffer == nullptr || + kv_cache_batch_size != batch_size || + kv_cache_capacity < required_tokens; + if (!need_realloc) { + return true; + } + + free_cache_ctx_and_buffer(); + alloc_cache_ctx(); + + for (int i = 0; i < params.num_layers; ++i) { + std::string k_name = "llm.k." + std::to_string(i); + std::string v_name = "llm.v." + std::to_string(i); + auto k_cache = ggml_new_tensor_3d(cache_ctx, + GGML_TYPE_F32, + params.head_dim * params.num_kv_heads, + target_capacity, + batch_size); + auto v_cache = ggml_new_tensor_3d(cache_ctx, + GGML_TYPE_F32, + params.head_dim * params.num_kv_heads, + target_capacity, + batch_size); + ggml_set_name(k_cache, k_name.c_str()); + ggml_set_name(v_cache, v_name.c_str()); + } + + cache_buffer = ggml_backend_alloc_ctx_tensors(cache_ctx, runtime_backend); + if (cache_buffer == nullptr) { + LOG_ERROR("%s alloc kv cache backend buffer failed", get_desc().c_str()); + kv_cache_capacity = 0; + kv_cache_batch_size = 0; + return false; + } + + ggml_backend_buffer_clear(cache_buffer, 0); + kv_cache_capacity = target_capacity; + kv_cache_batch_size = batch_size; + return true; + } + + void set_logits_range(int64_t start, int64_t end) { + int64_t vocab = params.vocab_size; + start = std::max(0, start); + end = std::min(vocab, end); + if (end <= start) { + start = 0; + end = vocab; + } + logits_range_start = start; + logits_range_end = end; + } + + int64_t get_logits_range_start() const { + return logits_range_start; + } + + int64_t get_logits_range_end() const { + return logits_range_end; + } + + void invalidate_decode_graph_state() { + if (decode_graph_state.ready) { + free_compute_buffer(); + } + decode_graph_state = Decode1TokenGraphState{}; + } + + struct ggml_cgraph* build_graph_logits_kv_decode_1token(int64_t batch_size, int64_t kv_capacity) { + struct ggml_cgraph* 
gf = ggml_new_graph(compute_ctx); + + decode_input_ids_vec.resize(static_cast(batch_size), 0); + decode_input_pos_vec.resize((params.arch == LLMArch::MISTRAL_SMALL_3_2 || params.arch == LLMArch::QWEN3) ? 1 : 4, 0); + decode_attention_mask_vec.resize(static_cast(kv_capacity * batch_size), 0.f); + kv_row_indices_vec.resize(static_cast(batch_size), 0); + + auto input_ids = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_I32, 1, batch_size); + set_backend_tensor_data(input_ids, decode_input_ids_vec.data()); + + auto input_pos = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, decode_input_pos_vec.size()); + set_backend_tensor_data(input_pos, decode_input_pos_vec.data()); + + auto attention_mask = ggml_new_tensor_3d(compute_ctx, GGML_TYPE_F32, kv_capacity, 1, batch_size); + set_backend_tensor_data(attention_mask, decode_attention_mask_vec.data()); + + auto kv_row_indices = ggml_new_tensor_3d(compute_ctx, GGML_TYPE_I64, 1, batch_size, 1); + set_backend_tensor_data(kv_row_indices, kv_row_indices_vec.data()); + + std::vector past_k_cache(params.num_layers, nullptr); + std::vector past_v_cache(params.num_layers, nullptr); + for (int i = 0; i < params.num_layers; ++i) { + std::string k_name = "llm.k." + std::to_string(i); + std::string v_name = "llm.v." + std::to_string(i); + auto k_cache = get_cache_tensor_by_name(k_name); + auto v_cache = get_cache_tensor_by_name(v_name); + GGML_ASSERT(k_cache != nullptr && v_cache != nullptr); + past_k_cache[i] = k_cache; + past_v_cache[i] = v_cache; + } + + std::vector> image_embeds; + std::set out_layers; + auto runner_ctx = get_context(); + auto hidden_states = forward(&runner_ctx, + input_ids, + input_pos, + attention_mask, + image_embeds, + out_layers, + &past_k_cache, + &past_v_cache, + kv_row_indices, + kv_capacity); + + auto weight = get_embedding_weight(); + GGML_ASSERT(weight != nullptr); + int64_t start = std::max(0, logits_range_start); + int64_t end = logits_range_end > 0 ? 
std::min(logits_range_end, weight->ne[1]) : weight->ne[1]; + if (end <= start) { + start = 0; + end = weight->ne[1]; + } + auto weight_slice = ggml_ext_slice(compute_ctx, weight, 1, start, end); + auto logits = ggml_mul_mat(compute_ctx, weight_slice, hidden_states); + + ggml_build_forward_expand(gf, input_ids); + ggml_build_forward_expand(gf, input_pos); + ggml_build_forward_expand(gf, attention_mask); + ggml_build_forward_expand(gf, kv_row_indices); + ggml_build_forward_expand(gf, logits); + + decode_graph_state.graph = gf; + decode_graph_state.input_ids = input_ids; + decode_graph_state.input_pos = input_pos; + decode_graph_state.attention_mask = attention_mask; + decode_graph_state.kv_row_indices = kv_row_indices; + decode_graph_state.logits = logits; + + return gf; + } + + bool prepare_decode_graph_1token(int64_t batch_size, int64_t kv_capacity) { + if (decode_graph_state.ready && + decode_graph_state.batch_size == batch_size && + decode_graph_state.kv_capacity == kv_capacity && + decode_graph_state.graph != nullptr) { + return true; + } + + if (!ensure_kv_cache(kv_capacity, batch_size)) { + return false; + } + + auto get_graph = [&]() -> struct ggml_cgraph* { + return build_graph_logits_kv_decode_1token(batch_size, kv_capacity); + }; + + if (!alloc_compute_buffer(get_graph)) { + return false; + } + + reset_compute_ctx(); + struct ggml_cgraph* gf = get_compute_graph(get_graph); + if (!ggml_gallocr_alloc_graph(compute_allocr, gf)) { + LOG_ERROR("%s alloc decode graph failed", get_desc().c_str()); + invalidate_decode_graph_state(); + return false; + } + copy_data_to_backend_tensor(); + + decode_graph_state.graph = gf; + decode_graph_state.batch_size = batch_size; + decode_graph_state.kv_capacity = kv_capacity; + decode_graph_state.ready = true; + return true; + } + + bool compute_logits_kv_decode_1token(const int n_threads, + const std::vector& token_ids, + int64_t n_past, + const std::vector& pad_lens, + ggml_tensor** output, + ggml_context* output_ctx = 
nullptr) { + const int64_t batch_size = static_cast(token_ids.size()); + if (batch_size <= 0 || kv_cache_capacity <= 0 || kv_cache_batch_size != batch_size) { + return false; + } + if (static_cast(batch_size) != pad_lens.size()) { + return false; + } + + if (!prepare_decode_graph_1token(batch_size, kv_cache_capacity)) { + return false; + } + + decode_input_ids_vec.resize(static_cast(batch_size)); + for (int64_t b = 0; b < batch_size; ++b) { + decode_input_ids_vec[static_cast(b)] = token_ids[static_cast(b)]; + } + + if (params.arch == LLMArch::MISTRAL_SMALL_3_2 || params.arch == LLMArch::QWEN3) { + decode_input_pos_vec.resize(1); + decode_input_pos_vec[0] = static_cast(n_past); + } else { + decode_input_pos_vec.resize(4); + decode_input_pos_vec[0] = static_cast(n_past); + decode_input_pos_vec[1] = static_cast(n_past); + decode_input_pos_vec[2] = static_cast(n_past); + decode_input_pos_vec[3] = 0; + } + + constexpr float kMaskNeg = -65504.0f; + decode_attention_mask_vec.resize(static_cast(kv_cache_capacity * batch_size)); + for (int64_t b = 0; b < batch_size; ++b) { + const int pad_len = pad_lens[static_cast(b)]; + size_t batch_offset = static_cast(b * kv_cache_capacity); + for (int64_t k = 0; k < kv_cache_capacity; ++k) { + float value = 0.f; + if (k < pad_len || k > n_past) { + value = kMaskNeg; + } + decode_attention_mask_vec[batch_offset + static_cast(k)] = value; + } + } + + kv_row_indices_vec.resize(static_cast(batch_size)); + for (int64_t b = 0; b < batch_size; ++b) { + kv_row_indices_vec[static_cast(b)] = n_past; + } + + ggml_backend_tensor_set(decode_graph_state.input_ids, decode_input_ids_vec.data(), 0, ggml_nbytes(decode_graph_state.input_ids)); + ggml_backend_tensor_set(decode_graph_state.input_pos, decode_input_pos_vec.data(), 0, ggml_nbytes(decode_graph_state.input_pos)); + ggml_backend_tensor_set(decode_graph_state.attention_mask, decode_attention_mask_vec.data(), 0, ggml_nbytes(decode_graph_state.attention_mask)); + 
ggml_backend_tensor_set(decode_graph_state.kv_row_indices, kv_row_indices_vec.data(), 0, ggml_nbytes(decode_graph_state.kv_row_indices)); + + if (ggml_backend_is_cpu(runtime_backend)) { + ggml_backend_cpu_set_n_threads(runtime_backend, n_threads); + } + + ggml_status status = ggml_backend_graph_compute(runtime_backend, decode_graph_state.graph); + if (status != GGML_STATUS_SUCCESS) { + LOG_ERROR("%s decode compute failed: %s", get_desc().c_str(), ggml_status_to_string(status)); + return false; + } + + if (output != nullptr) { + auto result = decode_graph_state.logits; + if (*output == nullptr && output_ctx != nullptr) { + *output = ggml_dup_tensor(output_ctx, result); + } + if (*output != nullptr) { + ggml_ext_backend_tensor_get_and_sync(runtime_backend, result, (*output)->data, 0, ggml_nbytes(*output)); + } + } + return true; + } + struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids, struct ggml_tensor* attention_mask, std::vector> image_embeds, @@ -1216,12 +1903,13 @@ namespace LLM { if (attention_mask != nullptr) { attention_mask = to_backend(attention_mask); } else { + constexpr float kMaskNeg = -65504.0f; attention_mask_vec.resize(n_tokens * n_tokens); for (int i0 = 0; i0 < n_tokens; i0++) { for (int i1 = 0; i1 < n_tokens; i1++) { float value = 0.f; if (i0 > i1) { - value = -INFINITY; + value = kMaskNeg; } attention_mask_vec[i1 * n_tokens + i0] = value; } @@ -1234,11 +1922,215 @@ namespace LLM { struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers); + ggml_build_forward_expand(gf, input_pos); + if (attention_mask != nullptr) { + ggml_build_forward_expand(gf, attention_mask); + } ggml_build_forward_expand(gf, hidden_states); return gf; } + struct ggml_cgraph* build_graph_logits(struct ggml_tensor* input_ids, + struct ggml_tensor* attention_mask, + std::vector> image_embeds) { + struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); + + input_ids = to_backend(input_ids); + + for (auto& 
image_embed : image_embeds) { + image_embed.second = to_backend(image_embed.second); + } + + int64_t n_tokens = input_ids->ne[0]; + if (params.arch == LLMArch::MISTRAL_SMALL_3_2 || params.arch == LLMArch::QWEN3) { + input_pos_vec.resize(n_tokens); + for (int i = 0; i < n_tokens; ++i) { + input_pos_vec[i] = i; + } + } else { + input_pos_vec.resize(n_tokens * 4); + for (int i = 0; i < n_tokens; ++i) { + input_pos_vec[i] = i; + input_pos_vec[n_tokens + i] = i; + input_pos_vec[2 * n_tokens + i] = i; + input_pos_vec[3 * n_tokens + i] = 0; + } + } + + auto input_pos = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, input_pos_vec.size()); + set_backend_tensor_data(input_pos, input_pos_vec.data()); + + if (attention_mask != nullptr) { + attention_mask = to_backend(attention_mask); + } else { + constexpr float kMaskNeg = -65504.0f; + attention_mask_vec.resize(n_tokens * n_tokens); + for (int i0 = 0; i0 < n_tokens; i0++) { + for (int i1 = 0; i1 < n_tokens; i1++) { + float value = 0.f; + if (i0 > i1) { + value = kMaskNeg; + } + attention_mask_vec[i1 * n_tokens + i0] = value; + } + } + attention_mask = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, n_tokens, n_tokens); + set_backend_tensor_data(attention_mask, attention_mask_vec.data()); + } + + auto runner_ctx = get_context(); + std::set out_layers; + struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers); + + auto last = ggml_ext_slice(compute_ctx, hidden_states, 1, n_tokens - 1, n_tokens, true); + + auto weight = get_embedding_weight(); + GGML_ASSERT(weight != nullptr); + int64_t start = std::max(0, logits_range_start); + int64_t end = logits_range_end > 0 ? 
std::min(logits_range_end, weight->ne[1]) : weight->ne[1]; + if (end <= start) { + start = 0; + end = weight->ne[1]; + } + auto weight_slice = ggml_ext_slice(compute_ctx, weight, 1, start, end); + auto logits = ggml_mul_mat(compute_ctx, weight_slice, last); + + ggml_build_forward_expand(gf, input_pos); + if (attention_mask != nullptr) { + ggml_build_forward_expand(gf, attention_mask); + } + ggml_build_forward_expand(gf, logits); + + return gf; + } + + struct ggml_cgraph* build_graph_logits_kv(struct ggml_tensor* input_ids, + struct ggml_tensor* attention_mask, + std::vector> image_embeds, + int64_t n_past) { + struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); + + input_ids = to_backend(input_ids); + for (auto& image_embed : image_embeds) { + image_embed.second = to_backend(image_embed.second); + } + + const int64_t n_tokens = input_ids->ne[0]; + const int64_t batch_size = std::max(1, input_ids->ne[1]); + const int64_t n_kv = n_past + n_tokens; + + if (!ensure_kv_cache(n_kv, batch_size)) { + LOG_ERROR("%s failed to allocate kv cache (tokens=%" PRId64 ", batch=%" PRId64 ")", + get_desc().c_str(), + n_kv, + batch_size); + return gf; + } + + if (params.arch == LLMArch::MISTRAL_SMALL_3_2 || params.arch == LLMArch::QWEN3) { + input_pos_vec.resize(n_tokens); + for (int64_t i = 0; i < n_tokens; ++i) { + input_pos_vec[i] = static_cast(n_past + i); + } + } else { + input_pos_vec.resize(n_tokens * 4); + for (int64_t i = 0; i < n_tokens; ++i) { + int p = static_cast(n_past + i); + input_pos_vec[i] = p; + input_pos_vec[n_tokens + i] = p; + input_pos_vec[2 * n_tokens + i] = p; + input_pos_vec[3 * n_tokens + i] = 0; + } + } + + auto input_pos = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, input_pos_vec.size()); + set_backend_tensor_data(input_pos, input_pos_vec.data()); + + if (attention_mask != nullptr) { + attention_mask = to_backend(attention_mask); + } else { + constexpr float kMaskNeg = -65504.0f; + attention_mask_vec.resize(static_cast(n_kv * n_tokens * batch_size)); 
+ for (int64_t b = 0; b < batch_size; ++b) { + size_t batch_offset = static_cast(b * n_kv * n_tokens); + for (int64_t q = 0; q < n_tokens; ++q) { + const int64_t abs_q = n_past + q; + for (int64_t k = 0; k < n_kv; ++k) { + float value = 0.f; + if (k > abs_q) { + value = kMaskNeg; + } + attention_mask_vec[batch_offset + q * n_kv + k] = value; + } + } + } + if (batch_size == 1) { + attention_mask = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, n_kv, n_tokens); + } else { + attention_mask = ggml_new_tensor_3d(compute_ctx, GGML_TYPE_F32, n_kv, n_tokens, batch_size); + } + set_backend_tensor_data(attention_mask, attention_mask_vec.data()); + } + + kv_row_indices_vec.resize(static_cast(n_tokens * batch_size)); + for (int64_t b = 0; b < batch_size; ++b) { + size_t batch_offset = static_cast(b * n_tokens); + for (int64_t t = 0; t < n_tokens; ++t) { + kv_row_indices_vec[batch_offset + t] = n_past + t; + } + } + auto kv_row_indices = ggml_new_tensor_3d(compute_ctx, GGML_TYPE_I64, n_tokens, batch_size, 1); + set_backend_tensor_data(kv_row_indices, kv_row_indices_vec.data()); + + std::vector past_k_cache(params.num_layers, nullptr); + std::vector past_v_cache(params.num_layers, nullptr); + for (int i = 0; i < params.num_layers; ++i) { + std::string k_name = "llm.k." + std::to_string(i); + std::string v_name = "llm.v." 
+ std::to_string(i); + auto k_cache = get_cache_tensor_by_name(k_name); + auto v_cache = get_cache_tensor_by_name(v_name); + if (k_cache == nullptr || v_cache == nullptr) { + LOG_ERROR("%s kv cache tensor missing for layer %d", get_desc().c_str(), i); + return gf; + } + past_k_cache[i] = k_cache; + past_v_cache[i] = v_cache; + } + + auto runner_ctx = get_context(); + std::set out_layers; + struct ggml_tensor* hidden_states = forward(&runner_ctx, + input_ids, + input_pos, + attention_mask, + image_embeds, + out_layers, + &past_k_cache, + &past_v_cache, + kv_row_indices, + n_kv); + + auto last = ggml_ext_slice(compute_ctx, hidden_states, 1, n_tokens - 1, n_tokens, true); + auto weight = get_embedding_weight(); + GGML_ASSERT(weight != nullptr); + int64_t start = std::max(0, logits_range_start); + int64_t end = logits_range_end > 0 ? std::min(logits_range_end, weight->ne[1]) : weight->ne[1]; + if (end <= start) { + start = 0; + end = weight->ne[1]; + } + auto weight_slice = ggml_ext_slice(compute_ctx, weight, 1, start, end); + auto logits = ggml_mul_mat(compute_ctx, weight_slice, last); + + ggml_build_forward_expand(gf, input_pos); + ggml_build_forward_expand(gf, attention_mask); + ggml_build_forward_expand(gf, kv_row_indices); + ggml_build_forward_expand(gf, logits); + + return gf; + } + bool compute(const int n_threads, struct ggml_tensor* input_ids, struct ggml_tensor* attention_mask, @@ -1246,12 +2138,58 @@ namespace LLM { std::set out_layers, ggml_tensor** output, ggml_context* output_ctx = nullptr) { + invalidate_decode_graph_state(); auto get_graph = [&]() -> struct ggml_cgraph* { return build_graph(input_ids, attention_mask, image_embeds, out_layers); }; return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); } + bool compute_logits(const int n_threads, + struct ggml_tensor* input_ids, + struct ggml_tensor* attention_mask, + std::vector> image_embeds, + ggml_tensor** output, + ggml_context* output_ctx = nullptr) { + 
invalidate_decode_graph_state(); + auto get_graph = [&]() -> struct ggml_cgraph* { + return build_graph_logits(input_ids, attention_mask, image_embeds); + }; + return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); + } + + bool compute_logits_kv(const int n_threads, + struct ggml_tensor* input_ids, + struct ggml_tensor* attention_mask, + std::vector> image_embeds, + int64_t n_past, + ggml_tensor** output, + ggml_context* output_ctx = nullptr) { + invalidate_decode_graph_state(); + auto get_graph = [&]() -> struct ggml_cgraph* { + return build_graph_logits_kv(input_ids, attention_mask, image_embeds, n_past); + }; + return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); + } + + bool prepare_kv_cache(int64_t max_tokens, int64_t batch_size) { + invalidate_decode_graph_state(); + if (!ensure_kv_cache(max_tokens, batch_size)) { + return false; + } + if (cache_buffer != nullptr) { + ggml_backend_buffer_clear(cache_buffer, 0); + } + return true; + } + + void reset_kv_cache() { + invalidate_decode_graph_state(); + free_cache_ctx_and_buffer(); + kv_cache_capacity = 0; + kv_cache_batch_size = 0; + } + int64_t get_num_image_tokens(int64_t t, int64_t h, int64_t w) { int64_t grid_t = 1; int64_t grid_h = h / params.vision.patch_size; @@ -1435,6 +2373,8 @@ namespace LLM { : model(arch, backend, offload_params_to_cpu, tensor_storage_map, prefix, enable_vision) { if (arch == LLMArch::MISTRAL_SMALL_3_2) { tokenizer = std::make_shared(); + } else if (arch == LLMArch::QWEN3) { + tokenizer = std::make_shared(); } else { tokenizer = std::make_shared(); } diff --git a/src/lora.hpp b/src/lora.hpp index d2f91cd48..a2b6129ec 100644 --- a/src/lora.hpp +++ b/src/lora.hpp @@ -127,6 +127,19 @@ struct LoraModel : public GGMLRunner { lora_tensors = std::move(new_lora_tensors); } + + if (model_tensors.find("model.diffusion_model.decoder.layers.0.self_attn.q_proj.weight") != model_tensors.end()) { + std::unordered_map new_lora_tensors; + for (auto& 
[old_name, tensor] : lora_tensors) { + std::string new_name = old_name; + const std::string prefix = "lora.base_model.model."; + if (new_name.rfind(prefix, 0) == 0) { + new_name.replace(0, prefix.size(), "lora.model.diffusion_model.decoder."); + } + new_lora_tensors[new_name] = tensor; + } + lora_tensors = std::move(new_lora_tensors); + } } ggml_tensor* get_lora_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) { diff --git a/src/model.cpp b/src/model.cpp index 77b032c2c..d41601f0b 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -1044,6 +1044,12 @@ SDVersion ModelLoader::get_sd_version() { bool has_output_block_71 = false; for (auto& [name, tensor_storage] : tensor_storage_map) { + if (tensor_storage.name.find("model.diffusion_model.decoder.layers.0.self_attn.q_proj.weight") != std::string::npos || + tensor_storage.name.find("model.diffusion_model.decoder.layers.0.cross_attn.q_proj.weight") != std::string::npos || + tensor_storage.name.find("model.diffusion_model.encoder.lyric_encoder.layers.0.input_layernorm.weight") != std::string::npos || + tensor_storage.name.find("model.diffusion_model.tokenizer.quantizer.project_in.weight") != std::string::npos) { + return VERSION_ACE_STEP_1_5; + } if (!(is_xl)) { if (tensor_storage.name.find("model.diffusion_model.double_blocks.") != std::string::npos) { is_flux = true; diff --git a/src/model.h b/src/model.h index 5b9ce18ab..46dbc8d50 100644 --- a/src/model.h +++ b/src/model.h @@ -50,6 +50,7 @@ enum SDVersion { VERSION_FLUX2_KLEIN, VERSION_Z_IMAGE, VERSION_OVIS_IMAGE, + VERSION_ACE_STEP_1_5, VERSION_COUNT, }; @@ -137,6 +138,13 @@ static inline bool sd_version_is_z_image(SDVersion version) { return false; } +static inline bool sd_version_is_ace(SDVersion version) { + if (version == VERSION_ACE_STEP_1_5) { + return true; + } + return false; +} + static inline bool sd_version_is_inpaint(SDVersion version) { if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT || @@ -155,7 +163,8 @@ static 
inline bool sd_version_is_dit(SDVersion version) { sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version) || - sd_version_is_z_image(version)) { + sd_version_is_z_image(version) || + sd_version_is_ace(version)) { return true; } return false; diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index d769d45ca..6fa968be2 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -19,6 +19,7 @@ #include "tae.hpp" #include "ucache.hpp" #include "vae.hpp" +#include "ace_vae.hpp" #include "latent-preview.h" #include "name_conversion.h" @@ -53,6 +54,7 @@ const char* model_version_to_str[] = { "Flux.2 klein", "Z-Image", "Ovis Image", + "ACE Step 1.5", }; const char* sampling_methods_str[] = { @@ -273,11 +275,16 @@ class StableDiffusionGGML { } } - bool is_unet = sd_version_is_unet(model_loader.get_sd_version()); + SDVersion detected_version = model_loader.get_sd_version(); + bool is_unet = sd_version_is_unet(detected_version); + bool is_ace = sd_version_is_ace(detected_version); if (strlen(SAFE_STR(sd_ctx_params->clip_l_path)) > 0) { LOG_INFO("loading clip_l from '%s'", sd_ctx_params->clip_l_path); std::string prefix = is_unet ? "cond_stage_model.transformer." : "text_encoders.clip_l.transformer."; + if (is_ace) { + prefix = "text_encoders.qwen3_06b."; + } if (!model_loader.init_from_file(sd_ctx_params->clip_l_path, prefix)) { LOG_WARN("loading clip_l from '%s' failed", sd_ctx_params->clip_l_path); } @@ -339,10 +346,39 @@ class StableDiffusionGGML { auto& tensor_storage_map = model_loader.get_tensor_storage_map(); LOG_INFO("Version: %s ", model_version_to_str[version]); + std::string tensor_type_rules = SAFE_STR(sd_ctx_params->tensor_type_rules); ggml_type wtype = (int)sd_ctx_params->wtype < std::min(SD_TYPE_COUNT, GGML_TYPE_COUNT) ? 
(ggml_type)sd_ctx_params->wtype : GGML_TYPE_COUNT; - std::string tensor_type_rules = SAFE_STR(sd_ctx_params->tensor_type_rules); + if (wtype == GGML_TYPE_COUNT && sd_version_is_ace(version) && !ggml_backend_is_cpu(backend)) { + wtype = GGML_TYPE_F16; + } + if (sd_version_is_ace(version) && !ggml_backend_is_cpu(backend)) { + const char* ace_lm_rules = "^text_encoders\\.qwen3_2b\\.=bf16,^text_encoders\\.qwen3_4b\\.=bf16,^text_encoders\\.llm\\.=bf16"; + const char* ace_enc_rules = "^model\\.diffusion_model\\.encoder\\.=bf16"; + bool has_lm_rule = tensor_type_rules.find("text_encoders.qwen3_2b") != std::string::npos || + tensor_type_rules.find("text_encoders.qwen3_4b") != std::string::npos || + tensor_type_rules.find("text_encoders.llm") != std::string::npos; + bool has_enc_rule = tensor_type_rules.find("model.diffusion_model.encoder") != std::string::npos; + + std::string added_rules; + if (!has_lm_rule) { + added_rules = ace_lm_rules; + } + if (!has_enc_rule) { + if (!added_rules.empty()) { + added_rules += ","; + } + added_rules += ace_enc_rules; + } + if (!added_rules.empty()) { + if (tensor_type_rules.empty()) { + tensor_type_rules = added_rules; + } else { + tensor_type_rules = added_rules + "," + tensor_type_rules; + } + } + } if (wtype != GGML_TYPE_COUNT || tensor_type_rules.size() > 0) { model_loader.set_wtype_override(wtype, tensor_type_rules); } @@ -410,6 +446,9 @@ class StableDiffusionGGML { sd_version_is_flux2(version)) { scale_factor = 1.0f; shift_factor = 0.f; + } else if (sd_version_is_ace(version)) { + scale_factor = 1.0f; + shift_factor = 0.f; } if (sd_version_is_control(version)) { @@ -555,6 +594,13 @@ class StableDiffusionGGML { tensor_storage_map, "model.diffusion_model", version); + } else if (sd_version_is_ace(version)) { + cond_stage_model = std::make_shared(clip_backend, + offload_params_to_cpu, + tensor_storage_map); + diffusion_model = std::make_shared(backend, + offload_params_to_cpu, + tensor_storage_map); } else { // SD1.x SD2.x SDXL 
std::map embbeding_map; for (uint32_t i = 0; i < sd_ctx_params->embedding_count; i++) { @@ -616,6 +662,29 @@ class StableDiffusionGGML { version); first_stage_model->alloc_params_buffer(); first_stage_model->get_param_tensors(tensors, "first_stage_model"); + } else if (sd_version_is_ace(version)) { + auto has_prefix = [&](const std::string& prefix) { + for (const auto& kv : tensor_storage_map) { + if (kv.first.rfind(prefix, 0) == 0) { + return true; + } + } + return false; + }; + std::string vae_prefix = "first_stage_model"; + if (has_prefix("vae.")) { + vae_prefix = "vae"; + } else if (has_prefix("encoder.") || has_prefix("decoder.")) { + vae_prefix.clear(); + } + + first_stage_model = std::make_shared(vae_backend, + offload_params_to_cpu, + tensor_storage_map, + vae_prefix, + vae_decode_only); + first_stage_model->alloc_params_buffer(); + first_stage_model->get_param_tensors(tensors, vae_prefix); } else if (version == VERSION_CHROMA_RADIANCE) { first_stage_model = std::make_shared(vae_backend, offload_params_to_cpu); @@ -892,6 +961,7 @@ class StableDiffusionGGML { // init denoiser { prediction_t pred_type = sd_ctx_params->prediction; + float flow_multiplier = 1000.f; if (pred_type == PREDICTION_COUNT) { if (sd_version_is_sd2(version)) { @@ -915,13 +985,18 @@ class StableDiffusionGGML { sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version) || - sd_version_is_z_image(version)) { + sd_version_is_z_image(version) || + sd_version_is_ace(version)) { pred_type = FLOW_PRED; if (sd_version_is_wan(version)) { default_flow_shift = 5.f; } else { default_flow_shift = 3.f; } + if (sd_version_is_ace(version)) { + // Comfy uses multiplier=1.0 for ACE Step 1.5 flow scheduling. 
+ flow_multiplier = 1.0f; + } } else if (sd_version_is_flux(version)) { pred_type = FLUX_FLOW_PRED; @@ -953,7 +1028,8 @@ class StableDiffusionGGML { break; case FLOW_PRED: { LOG_INFO("running in FLOW mode"); - denoiser = std::make_shared(); + float init_flow_shift = std::isfinite(default_flow_shift) ? default_flow_shift : 3.0f; + denoiser = std::make_shared(init_flow_shift, flow_multiplier); break; } case FLUX_FLOW_PRED: { @@ -1833,6 +1909,9 @@ class StableDiffusionGGML { struct ggml_tensor* preview_tensor = nullptr; auto sd_preview_mode = sd_get_preview_mode(); + if (sd_version_is_ace(version)) { + sd_preview_mode = PREVIEW_NONE; + } if (sd_preview_mode != PREVIEW_NONE && sd_preview_mode != PREVIEW_PROJ) { int64_t W = x->ne[0] * get_vae_scale_factor(); int64_t H = x->ne[1] * get_vae_scale_factor(); @@ -1860,6 +1939,9 @@ class StableDiffusionGGML { auto sd_preview_cb = sd_get_preview_callback(); auto sd_preview_cb_data = sd_get_preview_callback_data(); auto sd_preview_mode = sd_get_preview_mode(); + if (sd_version_is_ace(version)) { + sd_preview_mode = PREVIEW_NONE; + } if (step == 1 || step == -1) { pretty_progress(0, (int)steps, 0); } @@ -2012,7 +2094,6 @@ class StableDiffusionGGML { copy_ggml_tensor(noised_input, input); // noised_input = noised_input * c_in ggml_ext_tensor_scale_inplace(noised_input, c_in); - if (denoise_mask != nullptr && version == VERSION_WAN2_2_TI2V) { apply_mask(noised_input, init_latent, denoise_mask); } @@ -2051,11 +2132,17 @@ class StableDiffusionGGML { diffusion_params.context = cond.c_crossattn; diffusion_params.c_concat = cond.c_concat; diffusion_params.y = cond.c_vector; + diffusion_params.lyric_embed = cond.c_lyrics; + diffusion_params.refer_audio = cond.refer_audio; + diffusion_params.audio_codes = cond.audio_codes; active_condition = &cond; } else { diffusion_params.context = id_cond.c_crossattn; diffusion_params.c_concat = cond.c_concat; diffusion_params.y = id_cond.c_vector; + diffusion_params.lyric_embed = id_cond.c_lyrics; + 
diffusion_params.refer_audio = id_cond.refer_audio; + diffusion_params.audio_codes = id_cond.audio_codes; active_condition = &id_cond; } @@ -2087,6 +2174,9 @@ class StableDiffusionGGML { diffusion_params.context = uncond.c_crossattn; diffusion_params.c_concat = uncond.c_concat; diffusion_params.y = uncond.c_vector; + diffusion_params.lyric_embed = uncond.c_lyrics; + diffusion_params.refer_audio = uncond.refer_audio; + diffusion_params.audio_codes = uncond.audio_codes; bool skip_uncond = cache_before_condition(&uncond, out_uncond); if (!skip_uncond) { if (!work_diffusion_model->compute(n_threads, @@ -2105,6 +2195,9 @@ class StableDiffusionGGML { diffusion_params.context = img_cond.c_crossattn; diffusion_params.c_concat = img_cond.c_concat; diffusion_params.y = img_cond.c_vector; + diffusion_params.lyric_embed = img_cond.c_lyrics; + diffusion_params.refer_audio = img_cond.refer_audio; + diffusion_params.audio_codes = img_cond.audio_codes; bool skip_img_cond = cache_before_condition(&img_cond, out_img_cond); if (!skip_img_cond) { if (!work_diffusion_model->compute(n_threads, @@ -2129,6 +2222,9 @@ class StableDiffusionGGML { diffusion_params.c_concat = cond.c_concat; diffusion_params.y = cond.c_vector; diffusion_params.skip_layers = skip_layers; + diffusion_params.lyric_embed = cond.c_lyrics; + diffusion_params.refer_audio = cond.refer_audio; + diffusion_params.audio_codes = cond.audio_codes; if (!work_diffusion_model->compute(n_threads, diffusion_params, &out_skip)) { @@ -2289,6 +2385,8 @@ class StableDiffusionGGML { vae_scale_factor = 16; } else if (version == VERSION_CHROMA_RADIANCE) { vae_scale_factor = 1; + } else if (sd_version_is_ace(version)) { + vae_scale_factor = 1920; } return vae_scale_factor; } @@ -2314,6 +2412,8 @@ class StableDiffusionGGML { latent_channel = 3; } else if (sd_version_is_flux2(version)) { latent_channel = 128; + } else if (sd_version_is_ace(version)) { + latent_channel = 64; } else { latent_channel = 16; } @@ -2349,6 +2449,18 @@ class 
StableDiffusionGGML { return init_latent; } + ggml_tensor* generate_init_audio_latent(ggml_context* work_ctx, + float duration_seconds, + int batch = 1) { + int64_t hop = get_vae_scale_factor(); + int64_t length = static_cast(std::round(duration_seconds * 48000.0f / hop)); + length = std::max(1, length); + int C = get_latent_channel(); + ggml_tensor* init_latent = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, C, length, batch); + ggml_set_f32(init_latent, shift_factor); + return init_latent; + } + void get_latents_mean_std_vec(ggml_tensor* latent, int channel_dim, std::vector& latents_mean_vec, std::vector& latents_std_vec) { GGML_ASSERT(latent->ne[channel_dim] == 16 || latent->ne[channel_dim] == 48 || latent->ne[channel_dim] == 128); if (latent->ne[channel_dim] == 16) { @@ -2409,6 +2521,9 @@ class StableDiffusionGGML { } void process_latent_in(ggml_tensor* latent) { + if (sd_version_is_ace(version)) { + return; + } if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version) || sd_version_is_flux2(version)) { int channel_dim = sd_version_is_flux2(version) ? 2 : 3; std::vector latents_mean_vec; @@ -2448,6 +2563,9 @@ class StableDiffusionGGML { } void process_latent_out(ggml_tensor* latent) { + if (sd_version_is_ace(version)) { + return; + } if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version) || sd_version_is_flux2(version)) { int channel_dim = sd_version_is_flux2(version) ? 2 : 3; std::vector latents_mean_vec; @@ -2727,12 +2845,24 @@ class StableDiffusionGGML { void set_flow_shift(float flow_shift = INFINITY) { auto flow_denoiser = std::dynamic_pointer_cast(denoiser); if (flow_denoiser) { - if (flow_shift == INFINITY) { - flow_shift = default_flow_shift; + if (!std::isfinite(flow_shift)) { + flow_shift = std::isfinite(default_flow_shift) ? 
default_flow_shift : 3.0f; } flow_denoiser->set_shift(flow_shift); } } + + ggml_tensor* decode_audio(ggml_context* work_ctx, ggml_tensor* x) { + ggml_tensor* result = nullptr; + if (!use_tiny_autoencoder) { + if (!first_stage_model->compute(n_threads, x, true, &result, work_ctx)) { + LOG_ERROR("audio vae decode failed"); + return nullptr; + } + first_stage_model->free_compute_buffer(); + } + return result; + } }; /*================================================= SD API ==================================================*/ @@ -3160,6 +3290,58 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) { return buf; } +void sd_audio_gen_params_init(sd_audio_gen_params_t* sd_audio_gen_params) { + *sd_audio_gen_params = {}; + sd_sample_params_init(&sd_audio_gen_params->sample_params); + sd_audio_gen_params->sample_params.sample_steps = 8; + sd_audio_gen_params->sample_params.guidance.txt_cfg = 1.0f; + sd_audio_gen_params->sample_params.scheduler = SIMPLE_SCHEDULER; + sd_audio_gen_params->sample_params.sample_method = EULER_SAMPLE_METHOD; + sd_audio_gen_params->bpm = 120.f; + sd_audio_gen_params->duration = 120.f; + sd_audio_gen_params->timesignature = 2; + sd_audio_gen_params->language = "en"; + sd_audio_gen_params->keyscale = "C major"; + sd_audio_gen_params->lm_seed = 0; + sd_audio_gen_params->seed = -1; +} + +char* sd_audio_gen_params_to_str(const sd_audio_gen_params_t* sd_audio_gen_params) { + char* buf = (char*)malloc(4096); + if (!buf) + return nullptr; + buf[0] = '\0'; + + char* sample_params_str = sd_sample_params_to_str(&sd_audio_gen_params->sample_params); + + snprintf(buf + strlen(buf), 4096 - strlen(buf), + "prompt: %s\n" + "negative_prompt: %s\n" + "lyrics: %s\n" + "bpm: %.2f\n" + "duration: %.2f\n" + "timesignature: %d\n" + "language: %s\n" + "keyscale: %s\n" + "lm_seed: %d\n" + "sample_params: %s\n" + "seed: %" PRId64 "\n", + SAFE_STR(sd_audio_gen_params->prompt), + SAFE_STR(sd_audio_gen_params->negative_prompt), + 
SAFE_STR(sd_audio_gen_params->lyrics), + sd_audio_gen_params->bpm, + sd_audio_gen_params->duration, + sd_audio_gen_params->timesignature, + SAFE_STR(sd_audio_gen_params->language), + SAFE_STR(sd_audio_gen_params->keyscale), + sd_audio_gen_params->lm_seed, + SAFE_STR(sample_params_str), + sd_audio_gen_params->seed); + + free(sample_params_str); + return buf; +} + void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) { *sd_vid_gen_params = {}; sd_sample_params_init(&sd_vid_gen_params->sample_params); @@ -3224,6 +3406,9 @@ enum scheduler_t sd_get_default_scheduler(const sd_ctx_t* sd_ctx, enum sample_me if (edm_v_denoiser) { return EXPONENTIAL_SCHEDULER; } + if (sd_version_is_ace(sd_ctx->sd->version)) { + return SIMPLE_SCHEDULER; + } } if (sample_method == LCM_SAMPLE_METHOD) { return LCM_SCHEDULER; @@ -3531,7 +3716,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g } struct ggml_init_params params; - params.mem_size = static_cast(1024 * 1024) * 1024; // 1G + params.mem_size = static_cast(1024 * 1024) * 4096; // 4G (ACE audio needs larger scratch for full lyric conditioning) params.mem_buffer = nullptr; params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); @@ -3801,6 +3986,210 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g return result_images; } +sd_audio_t* generate_audio(sd_ctx_t* sd_ctx, const sd_audio_gen_params_t* sd_audio_gen_params) { + if (sd_ctx == nullptr || sd_audio_gen_params == nullptr) { + return nullptr; + } + if (!sd_version_is_ace(sd_ctx->sd->version)) { + LOG_ERROR("audio generation is only supported with ACE Step 1.5 models"); + return nullptr; + } + if (auto ace_model = std::dynamic_pointer_cast(sd_ctx->sd->diffusion_model)) { + ace_model->ace.reset_encoder_cache(); + } + + struct ggml_init_params params; + params.mem_size = static_cast(1024 * 1024) * 1024; // 1G + params.mem_buffer = nullptr; + params.no_alloc = false; + + struct ggml_context* 
work_ctx = ggml_init(params); + if (!work_ctx) { + LOG_ERROR("ggml_init() failed"); + return nullptr; + } + + int64_t seed = sd_audio_gen_params->seed; + if (seed < 0) { + srand((int)time(nullptr)); + seed = rand(); + } + sd_ctx->sd->rng->manual_seed(seed); + sd_ctx->sd->sampler_rng->manual_seed(seed); + + sd_ctx->sd->apply_loras(sd_audio_gen_params->loras, sd_audio_gen_params->lora_count); + + enum sample_method_t sample_method = sd_audio_gen_params->sample_params.sample_method; + if (sample_method == SAMPLE_METHOD_COUNT) { + sample_method = sd_get_default_sample_method(sd_ctx); + } + LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); + + int sample_steps = sd_audio_gen_params->sample_params.sample_steps; + std::vector sigmas; + if (sd_audio_gen_params->sample_params.custom_sigmas_count > 0) { + sigmas = std::vector(sd_audio_gen_params->sample_params.custom_sigmas, + sd_audio_gen_params->sample_params.custom_sigmas + sd_audio_gen_params->sample_params.custom_sigmas_count); + if (sample_steps != sigmas.size() - 1) { + sample_steps = static_cast(sigmas.size()) - 1; + LOG_WARN("sample_steps != custom_sigmas_count - 1, set sample_steps to %d", sample_steps); + } + } + + ConditionerParams condition_params; + condition_params.text = SAFE_STR(sd_audio_gen_params->prompt); + condition_params.lyrics = SAFE_STR(sd_audio_gen_params->lyrics); + condition_params.keyscale = strlen(SAFE_STR(sd_audio_gen_params->keyscale)) > 0 ? SAFE_STR(sd_audio_gen_params->keyscale) : "C major"; + condition_params.language = strlen(SAFE_STR(sd_audio_gen_params->language)) > 0 ? 
SAFE_STR(sd_audio_gen_params->language) : "en"; + condition_params.bpm = sd_audio_gen_params->bpm; + condition_params.duration = sd_audio_gen_params->duration; + condition_params.timesignature = sd_audio_gen_params->timesignature; + condition_params.lm_seed = sd_audio_gen_params->lm_seed; + + SDCondition cond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx, + sd_ctx->sd->n_threads, + condition_params); + + SDCondition uncond; + if (sd_audio_gen_params->sample_params.guidance.txt_cfg != 1.0f) { + uncond = cond; + if (cond.c_crossattn) { + auto zero = ggml_dup_tensor(work_ctx, cond.c_crossattn); + ggml_set_f32(zero, 0.f); + uncond.c_crossattn = zero; + } + if (cond.c_lyrics) { + auto zero = ggml_dup_tensor(work_ctx, cond.c_lyrics); + ggml_set_f32(zero, 0.f); + uncond.c_lyrics = zero; + } + } + + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->cond_stage_model->free_params_buffer(); + } + + ggml_tensor* init_latent = sd_ctx->sd->generate_init_audio_latent(work_ctx, + sd_audio_gen_params->duration, + 1); + + ggml_tensor* noise = ggml_new_tensor_3d(work_ctx, + GGML_TYPE_F32, + init_latent->ne[0], + init_latent->ne[1], + init_latent->ne[2]); + ggml_ext_im_set_randn_f32(noise, sd_ctx->sd->rng); + + int seq_len = static_cast(init_latent->ne[1]); + if (sd_audio_gen_params->sample_params.custom_sigmas_count == 0) { + scheduler_t scheduler = sd_audio_gen_params->sample_params.scheduler; + if (scheduler == SCHEDULER_COUNT) { + scheduler = sd_get_default_scheduler(sd_ctx, sample_method); + } + sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps, + seq_len, + scheduler, + sd_ctx->sd->version); + } + + SDCondition img_cond; + SDCondition id_cond; + + ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, + sd_ctx->sd->diffusion_model, + true, + init_latent, + noise, + cond, + uncond, + img_cond, + nullptr, + 0.0f, + sd_audio_gen_params->sample_params.guidance, + sd_audio_gen_params->sample_params.eta, + sd_audio_gen_params->sample_params.shifted_timestep, + 
sample_method, + sigmas, + -1, + id_cond, + {}, + false, + nullptr, + nullptr, + 1.0f, + nullptr); + + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->diffusion_model->free_params_buffer(); + } + + if (x_0 == nullptr) { + LOG_ERROR("sampling failed"); + ggml_free(work_ctx); + return nullptr; + } + + { + float x_min = std::numeric_limits::infinity(); + float x_max = -std::numeric_limits::infinity(); + for (int64_t t = 0; t < x_0->ne[1]; ++t) { + for (int64_t c = 0; c < x_0->ne[0]; ++c) { + float v = ggml_ext_tensor_get_f32(x_0, c, t, 0); + x_min = std::min(x_min, v); + x_max = std::max(x_max, v); + } + } + LOG_INFO("audio latent stats: min=%.6f max=%.6f", x_min, x_max); + } + + ggml_tensor* audio_tensor = sd_ctx->sd->decode_audio(work_ctx, x_0); + if (audio_tensor == nullptr) { + ggml_free(work_ctx); + return nullptr; + } + + + if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) { + sd_ctx->sd->first_stage_model->free_params_buffer(); + } + + sd_ctx->sd->lora_stat(); + + int64_t sample_count = audio_tensor->ne[0]; + int64_t channels = audio_tensor->ne[1]; + sd_audio_t* result = (sd_audio_t*)calloc(1, sizeof(sd_audio_t)); + if (result == nullptr) { + ggml_free(work_ctx); + return nullptr; + } + + result->sample_rate = 48000; + result->channels = static_cast(channels); + result->sample_count = static_cast(sample_count); + result->data = (float*)malloc(sample_count * channels * sizeof(float)); + + if (!result->data) { + free(result); + ggml_free(work_ctx); + return nullptr; + } + + float v_min = std::numeric_limits::infinity(); + float v_max = -std::numeric_limits::infinity(); + for (int64_t t = 0; t < sample_count; ++t) { + for (int64_t c = 0; c < channels; ++c) { + float v = ggml_ext_tensor_get_f32(audio_tensor, t, c, 0); + v_min = std::min(v_min, v); + v_max = std::max(v_max, v); + result->data[t * channels + c] = v; + } + } + LOG_INFO("audio decode stats: min=%.6f max=%.6f", v_min, v_max); + + ggml_free(work_ctx); + return result; 
+} + SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, int* num_frames_out) { if (sd_ctx == nullptr || sd_vid_gen_params == nullptr) { return nullptr; diff --git a/src/tokenize_util.cpp b/src/tokenize_util.cpp index 22cf8ae2e..565fccec0 100644 --- a/src/tokenize_util.cpp +++ b/src/tokenize_util.cpp @@ -919,23 +919,31 @@ std::vector token_split(const std::string& text) { // `\s*[\r\n]+|\s+(?!\S)|\s+` if (is_space(cp)) { - std::string token; - bool saw_new_line = false; + // Match `\s*[\r\n]+` first: include any leading spaces before newline(s) + size_t j = i; + while (j < cps.size() && is_space(cps[j]) && cps[j] != U'\r' && cps[j] != U'\n') { + ++j; + } + if (j < cps.size() && (cps[j] == U'\r' || cps[j] == U'\n')) { + size_t k = j; + while (k < cps.size() && (cps[k] == U'\r' || cps[k] == U'\n')) { + ++k; + } + std::string token; + for (size_t idx = i; idx < k; ++idx) { + token += codepoint_to_utf8(cps[idx]); + } + tokens.push_back(token); + i = k; + continue; + } + // Fallback: consume a contiguous whitespace run + std::string token; while (i < cps.size() && is_space(cps[i])) { token += codepoint_to_utf8(cps[i]); - - if (cps[i] == U'\r' || cps[i] == U'\n') { - saw_new_line = true; - } else { - if (saw_new_line) { - break; - } - } - ++i; } - tokens.push_back(token); continue; } From 127ac7481467b44ca369587edd443e8191dde404 Mon Sep 17 00:00:00 2001 From: rmatif Date: Mon, 2 Mar 2026 17:30:34 +0100 Subject: [PATCH 2/2] fix ggml submodule fork --- .gitmodules | 3 ++- ggml | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index 5a7851973..5800aab97 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,4 @@ [submodule "ggml"] path = ggml - url = https://github.com/ggml-org/ggml.git + url = https://github.com/rmatif/ggml.git + branch = ace diff --git a/ggml b/ggml index e1132c58a..1666eeaa2 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit 
e1132c58a83813ca3485617663da744dc8e164e6 +Subproject commit 1666eeaa211a05aab1ce1cfa7d545c6021ceef1c