leejet · stduhpf · Dec 5, 2025 · Dec 5, 2025 · Dec 6, 2025 · Dec 6, 2025
diff --git a/src/anima.hpp b/src/anima.hpp
@@ -598,7 +598,8 @@ namespace Anima {
                                           {},
                                           empty_ref_latents,
                                           false,
-                                          1.0f);
+                                          1.0f,
+                                          false);
 
             std::vector<float> axis_thetas = {
                 static_cast<float>(theta) * calc_ntk_factor(t_extrapolation_ratio, axes_dim[0]),

diff --git a/src/auto_encoder_kl.hpp b/src/auto_encoder_kl.hpp
@@ -680,7 +680,7 @@ struct AutoEncoderKL : public VAE {
         } else if (sd_version_is_sd3(version)) {
             scale_factor = 1.5305f;
             shift_factor = 0.0609f;
-        } else if (sd_version_is_flux(version) || sd_version_is_z_image(version)) {
+        } else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_longcat(version)) {
             scale_factor = 0.3611f;
             shift_factor = 0.1159f;
         } else if (sd_version_uses_flux2_vae(version)) {

diff --git a/src/conditioner.hpp b/src/conditioner.hpp
@@ -1744,10 +1744,28 @@ struct LLMEmbedder : public Conditioner {
         }
     }
 
+    static size_t get_utf8_char_len(char c) {
+        unsigned char uc = static_cast<unsigned char>(c);
+        if ((uc & 0x80) == 0) {
+            return 1;
+        }
+        if ((uc & 0xE0) == 0xC0) {
+            return 2;
+        }
+        if ((uc & 0xF0) == 0xE0) {
+            return 3;
+        }
+        if ((uc & 0xF8) == 0xF0) {
+            return 4;
+        }
+        return 1;
+    }
+
     std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
                                                                                   const std::pair<int, int>& attn_range,
                                                                                   size_t min_length = 0,
-                                                                                  size_t max_length = 100000000) {
+                                                                                  size_t max_length = 100000000,
+                                                                                  bool spell_quotes = false) {
         std::vector<std::pair<std::string, float>> parsed_attention;
         if (attn_range.first >= 0 && attn_range.second > 0) {
             if (attn_range.first > 0) {
@@ -1781,9 +1799,44 @@ struct LLMEmbedder : public Conditioner {
         for (const auto& item : parsed_attention) {
             const std::string& curr_text = item.first;
             float curr_weight            = item.second;
-            std::vector<int> curr_tokens = tokenizer->encode(curr_text, nullptr);
-            tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
-            weights.insert(weights.end(), curr_tokens.size(), curr_weight);
+            auto append_tokens           = [&](const std::string& text_part) {
+                if (text_part.empty()) {
+                    return;
+                }
+                std::vector<int> curr_tokens = tokenizer->encode(text_part, nullptr);
+                tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
+                weights.insert(weights.end(), curr_tokens.size(), curr_weight);
+            };
+
+            if (spell_quotes) {
+                std::string buffer;
+                bool in_quote = false;
+                size_t i      = 0;
+                while (i < curr_text.size()) {
+                    size_t char_len = get_utf8_char_len(curr_text[i]);
+                    if (i + char_len > curr_text.size()) {
+                        char_len = curr_text.size() - i;
+                    }
+                    std::string uchar = curr_text.substr(i, char_len);
+                    i += char_len;
+
+                    if (uchar == "\"") {
+                        buffer += uchar;
+                        if (!in_quote) {
+                            append_tokens(buffer);
+                            buffer.clear();
+                        }
+                        in_quote = !in_quote;
+                    } else if (in_quote) {
+                        append_tokens(uchar);
+                    } else {
+                        buffer += uchar;
+                    }
+                }
+                append_tokens(buffer);
+            } else {
+                append_tokens(curr_text);
+            }
         }
 
         std::vector<float> mask;
@@ -1804,12 +1857,24 @@ struct LLMEmbedder : public Conditioner {
                                     int hidden_states_min_length,
                                     const std::vector<std::pair<int, sd::Tensor<float>>>& image_embeds,
                                     const std::set<int>& out_layers,
-                                    int prompt_template_encode_start_idx) {
-        auto tokens_weights_mask = tokenize(prompt, prompt_attn_range, min_length);
+                                    int prompt_template_encode_start_idx,
+                                    int prompt_template_encode_end_idx        = 0,
+                                    const std::string& prompt_template_suffix = "",
+                                    bool spell_quotes                         = false,
+                                    int max_length                            = 100000000) {
+        auto tokens_weights_mask = tokenize(prompt, prompt_attn_range, min_length, max_length, spell_quotes);
         auto& tokens             = std::get<0>(tokens_weights_mask);
         auto& weights            = std::get<1>(tokens_weights_mask);
         auto& mask               = std::get<2>(tokens_weights_mask);
 
+        if (!prompt_template_suffix.empty()) {
+            std::vector<int> suffix_tokens = tokenizer->encode(prompt_template_suffix, nullptr);
+            prompt_template_encode_end_idx = static_cast<int>(suffix_tokens.size());
+            tokens.insert(tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
+            weights.insert(weights.end(), suffix_tokens.size(), 1.f);
+            mask.insert(mask.end(), suffix_tokens.size(), 1.f);
+        }
+
         sd::Tensor<int32_t> input_ids({static_cast<int64_t>(tokens.size())}, tokens);
         sd::Tensor<float> attention_mask;
         if (!mask.empty()) {
@@ -1831,20 +1896,21 @@ struct LLMEmbedder : public Conditioner {
                                           image_embeds,
                                           out_layers);
         GGML_ASSERT(!hidden_states.empty());
-        hidden_states = apply_token_weights(std::move(hidden_states), weights);
-        GGML_ASSERT(hidden_states.shape()[1] > prompt_template_encode_start_idx);
+        hidden_states             = apply_token_weights(std::move(hidden_states), weights);
+        int64_t hidden_states_end = hidden_states.shape()[1] - prompt_template_encode_end_idx;
+        GGML_ASSERT(hidden_states_end > prompt_template_encode_start_idx);
 
         int64_t zero_pad_len = 0;
         if (hidden_states_min_length > 0) {
-            if (hidden_states.shape()[1] - prompt_template_encode_start_idx < hidden_states_min_length) {
-                zero_pad_len = hidden_states_min_length - hidden_states.shape()[1] + prompt_template_encode_start_idx;
+            if (hidden_states_end - prompt_template_encode_start_idx < hidden_states_min_length) {
+                zero_pad_len = hidden_states_min_length - hidden_states_end + prompt_template_encode_start_idx;
             }
         }
 
         sd::Tensor<float> new_hidden_states = sd::ops::slice(hidden_states,
                                                              1,
                                                              prompt_template_encode_start_idx,
-                                                             hidden_states.shape()[1]);
+                                                             hidden_states_end);
         if (zero_pad_len > 0) {
             auto pad_shape    = new_hidden_states.shape();
             pad_shape[1]      = zero_pad_len;
@@ -1864,8 +1930,12 @@ struct LLMEmbedder : public Conditioner {
         std::vector<std::pair<int, int>> extra_prompts_attn_range;
         std::vector<std::pair<int, sd::Tensor<float>>> image_embeds;
         int prompt_template_encode_start_idx = 34;
+        int prompt_template_encode_end_idx   = 0;
         int min_length                       = 0;  // pad tokens
+        int max_length                       = 100000000;
         int hidden_states_min_length         = 0;  // zero pad hidden_states
+        bool spell_quotes                    = false;
+        std::string prompt_template_suffix;
         std::set<int> out_layers;
 
         int64_t t0 = ggml_time_ms();
@@ -1938,6 +2008,72 @@ struct LLMEmbedder : public Conditioner {
 
                 prompt += "<|im_end|>\n<|im_start|>assistant\n";
             }
+        } else if (sd_version_is_longcat(version)) {
+            spell_quotes           = true;
+            prompt_template_suffix = "<|im_end|>\n<|im_start|>assistant\n";
+
+            if (llm->enable_vision && conditioner_params.ref_images != nullptr && !conditioner_params.ref_images->empty()) {
+                LOG_INFO("LongCatEditPipeline");
+                prompt_template_encode_start_idx = 67;
+                min_length                       = 512 + prompt_template_encode_start_idx;
+                max_length                       = min_length;
+                int image_embed_idx              = 36 + 6;
+
+                int min_pixels          = 384 * 384;
+                int max_pixels          = 560 * 560;
+                std::string placeholder = "<|image_pad|>";
+                std::string img_prompt;
+
+                for (int i = 0; i < conditioner_params.ref_images->size(); i++) {
+                    const auto& image = (*conditioner_params.ref_images)[i];
+                    double factor     = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size;
+                    int height        = static_cast<int>(image.shape()[1]);
+                    int width         = static_cast<int>(image.shape()[0]);
+                    int h_bar         = static_cast<int>(std::round(height / factor) * factor);
+                    int w_bar         = static_cast<int>(std::round(width / factor) * factor);
+
+                    if (static_cast<double>(h_bar) * w_bar > max_pixels) {
+                        double beta = std::sqrt((height * width) / static_cast<double>(max_pixels));
+                        h_bar       = std::max(static_cast<int>(factor),
+                                               static_cast<int>(std::floor(height / beta / factor)) * static_cast<int>(factor));
+                        w_bar       = std::max(static_cast<int>(factor),
+                                               static_cast<int>(std::floor(width / beta / factor)) * static_cast<int>(factor));
+                    } else if (static_cast<double>(h_bar) * w_bar < min_pixels) {
+                        double beta = std::sqrt(static_cast<double>(min_pixels) / (height * width));
+                        h_bar       = static_cast<int>(std::ceil(height * beta / factor)) * static_cast<int>(factor);
+                        w_bar       = static_cast<int>(std::ceil(width * beta / factor)) * static_cast<int>(factor);
+                    }
+
+                    LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, height, width, h_bar, w_bar);
+
+                    auto resized_image = clip_preprocess(image, w_bar, h_bar);
+                    auto image_embed   = llm->encode_image(n_threads, resized_image);
+                    GGML_ASSERT(!image_embed.empty());
+                    image_embeds.emplace_back(image_embed_idx, image_embed);
+                    image_embed_idx += 1 + static_cast<int>(image_embed.shape()[1]) + 6;
+
+                    img_prompt += "<|vision_start|>";
+                    int64_t num_image_tokens = image_embed.shape()[1];
+                    img_prompt.reserve(num_image_tokens * placeholder.size());
+                    for (int j = 0; j < num_image_tokens; j++) {
+                        img_prompt += placeholder;
+                    }
+                    img_prompt += "<|vision_end|>";
+                }
+
+                prompt = "<|im_start|>system\nAs an image editing expert, first analyze the content and attributes of the input image(s). Then, based on the user's editing instructions, clearly and precisely determine how to modify the given image(s), ensuring that only the specified parts are altered and all other aspects remain consistent with the original(s).<|im_end|>\n<|im_start|>user\n";
+                prompt += img_prompt;
+            } else {
+                prompt_template_encode_start_idx = 36;
+                min_length                       = 512 + prompt_template_encode_start_idx;
+                max_length                       = min_length;
+
+                prompt = "<|im_start|>system\nAs an image captioning expert, generate a descriptive text prompt based on an image content, suitable for input to a text-to-image model.<|im_end|>\n<|im_start|>user\n";
+            }
+
+            prompt_attn_range.first = static_cast<int>(prompt.size());
+            prompt += conditioner_params.text;
+            prompt_attn_range.second = static_cast<int>(prompt.size());
         } else if (version == VERSION_FLUX2) {
             prompt_template_encode_start_idx = 0;
             hidden_states_min_length         = 512;
@@ -2012,7 +2148,11 @@ struct LLMEmbedder : public Conditioner {
                                            hidden_states_min_length,
                                            image_embeds,
                                            out_layers,
-                                           prompt_template_encode_start_idx);
+                                           prompt_template_encode_start_idx,
+                                           prompt_template_encode_end_idx,
+                                           prompt_template_suffix,
+                                           spell_quotes,
+                                           max_length);
         std::vector<sd::Tensor<float>> extra_hidden_states_vec;
         for (int i = 0; i < extra_prompts.size(); i++) {
             auto extra_hidden_states = encode_prompt(n_threads,
@@ -2022,7 +2162,11 @@ struct LLMEmbedder : public Conditioner {
                                                      hidden_states_min_length,
                                                      image_embeds,
                                                      out_layers,
-                                                     prompt_template_encode_start_idx);
+                                                     prompt_template_encode_start_idx,
+                                                     prompt_template_encode_end_idx,
+                                                     prompt_template_suffix,
+                                                     spell_quotes,
+                                                     max_length);
             extra_hidden_states_vec.push_back(std::move(extra_hidden_states));
         }
 

diff --git a/src/flux.hpp b/src/flux.hpp
@@ -446,7 +446,6 @@ namespace Flux {
             if (use_yak_mlp || use_mlp_silu_act) {
                 mlp_mult_factor = 2;
             }
-
             blocks["linear1"]  = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size * 3 + mlp_hidden_dim * mlp_mult_factor, mlp_proj_bias));
             blocks["linear2"]  = std::shared_ptr<GGMLBlock>(new Linear(hidden_size + mlp_hidden_dim, hidden_size, mlp_proj_bias));
             blocks["norm"]     = std::shared_ptr<GGMLBlock>(new QKNorm(head_dim));
@@ -847,6 +846,17 @@ namespace Flux {
             }
         }
 
+        ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx,
+                                       ggml_tensor* x) {
+            int64_t W = x->ne[0];
+            int64_t H = x->ne[1];
+
+            int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size;
+            int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size;
+            x         = ggml_ext_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
+            return x;
+        }
+
         ggml_tensor* forward_orig(GGMLRunnerContext* ctx,
                                   ggml_tensor* img,
                                   ggml_tensor* txt,
@@ -1225,6 +1235,9 @@ namespace Flux {
                 flux_params.share_modulation = true;
                 flux_params.ref_index_scale  = 10.f;
                 flux_params.use_mlp_silu_act = true;
+            } else if (sd_version_is_longcat(version)) {
+                flux_params.context_in_dim = 3584;
+                flux_params.vec_in_dim     = 0;
             }
             int64_t head_dim                   = 0;
             int64_t actual_radiance_patch_size = -1;
@@ -1412,7 +1425,6 @@ namespace Flux {
             } else if (version == VERSION_OVIS_IMAGE) {
                 txt_arange_dims = {1, 2};
             }
-
             pe_vec      = Rope::gen_flux_pe(static_cast<int>(x->ne[1]),
                                             static_cast<int>(x->ne[0]),
                                             flux_params.patch_size,
@@ -1425,7 +1437,8 @@ namespace Flux {
                                             flux_params.theta,
                                             circular_y_enabled,
                                             circular_x_enabled,
-                                            flux_params.axes_dim);
+                                            flux_params.axes_dim,
+                                            sd_version_is_longcat(version));
             int pos_len = static_cast<int>(pe_vec.size() / flux_params.axes_dim_sum / 2);
             // LOG_DEBUG("pos_len %d", pos_len);
             auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, flux_params.axes_dim_sum / 2, pos_len);

diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp
@@ -953,11 +953,17 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_group_norm_32(ggml_context* ctx,
     return ggml_group_norm(ctx, a, 32, eps);
 }
 
+__STATIC_INLINE__ bool ggml_ext_is_padded_1d(const ggml_tensor* x) {
+    return x->nb[0] == ggml_type_size(x->type) &&
+           x->nb[2] == x->nb[1] * x->ne[1] &&
+           x->nb[3] == x->nb[2] * x->ne[2];
+}
+
 __STATIC_INLINE__ ggml_tensor* ggml_ext_scale(ggml_context* ctx,
                                               ggml_tensor* x,
                                               float factor,
                                               bool inplace = false) {
-    if (!ggml_is_contiguous(x)) {
+    if (!ggml_ext_is_padded_1d(x)) {
         x = ggml_cont(ctx, x);
     }
     if (inplace) {
@@ -3664,7 +3670,7 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_lokr_forward(
 
         ggml_tensor* hc  = ggml_transpose(ctx, hc_t);
         ggml_tensor* out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc), up * vp, batch);
-        return ggml_scale(ctx, out, scale);
+        return ggml_ext_scale(ctx, out, scale);
     } else {
         int batch = (int)h->ne[3];
         // 1. Reshape input: [W, H, vq*uq, batch] -> [W, H, vq, uq * batch]
@@ -3747,7 +3753,7 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_lokr_forward(
         ggml_tensor* hc = ggml_transpose(ctx, hc_t);
         // ungroup
         ggml_tensor* out = ggml_reshape_4d(ctx, ggml_cont(ctx, hc), w_out, h_out, up * vp, batch);
-        return ggml_scale(ctx, out, scale);
+        return ggml_ext_scale(ctx, out, scale);
     }
 }
-Original file line number
+Diff line change
@@ Expand Up / @@ -598,7 +598,8 @@ namespace Anima { @@
                                               {},
                                               empty_ref_latents,
                                               false,
-                                              1.0f);
+                                              1.0f,
+                                              false);
                 std::vector<float> axis_thetas = {
                     static_cast<float>(theta) * calc_ntk_factor(t_extrapolation_ratio, axes_dim[0]),
@@ Expand Down @@