Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ API and command-line option may change frequently.***
- [Chroma](./docs/chroma.md)
- [Chroma1-Radiance](./docs/chroma_radiance.md)
- [Qwen Image](./docs/qwen_image.md)
- [LongCat Image](./docs/longcat_image.md)
- [Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md)
- [Anima](./docs/anima.md)
Expand All @@ -48,6 +49,7 @@ API and command-line option may change frequently.***
- Image Edit Models
- [FLUX.1-Kontext-dev](./docs/kontext.md)
- [Qwen Image Edit series](./docs/qwen_image_edit.md)
- [LongCat Image Edit](./docs/longcat_image.md)
- Video Models
- [Wan2.1/Wan2.2](./docs/wan.md)
- [LTX-2.3](./docs/ltx2.md)
Expand Down Expand Up @@ -133,6 +135,7 @@ For runtime and parameter backend placement, see the [backend selection guide](.
- [Chroma](./docs/chroma.md)
- [🔥Qwen Image](./docs/qwen_image.md)
- [🔥Qwen Image Edit series](./docs/qwen_image_edit.md)
- [🔥LongCat Image / LongCat Image Edit](./docs/longcat_image.md)
- [🔥Wan2.1/Wan2.2](./docs/wan.md)
- [🔥LTX-2.3](./docs/ltx2.md)
- [🔥Z-Image](./docs/z_image.md)
Expand Down
Binary file added assets/longcat/example.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
30 changes: 30 additions & 0 deletions docs/longcat_image.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# How to Use

LongCat-Image uses a LongCat diffusion transformer, the FLUX VAE, and Qwen2.5-VL as the LLM text encoder.

## Download weights

- Download LongCat Image
- safetensors: https://huggingface.co/Comfy-Org/LongCat-Image/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/vantagewithai/LongCat-Image-GGUF/tree/main/comfy
- Download LongCat Image Edit
- LongCat Image Edit Turbo: https://huggingface.co/meituan-longcat/LongCat-Image-Edit-Turbo
- gguf: https://huggingface.co/vantagewithai/LongCat-Image-Edit-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
- Download qwen_2.5_vl 7b
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/text_encoders
- gguf: https://huggingface.co/mradermacher/Qwen2.5-VL-7B-Instruct-GGUF/tree/main
- For image editing with GGUF text encoders, also download the matching mmproj file and pass it with `--llm_vision`.

## Run

LongCat renders quoted text character by character. Put the text you want rendered in the image inside single quotes, double quotes, or Chinese quotes.

### LongCat Image

```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\LongCat-Image-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf -p "a lovely cat holding a sign says 'longcat.cpp'" --cfg-scale 5.0 --sampling-method euler --flow-shift 3 -v --offload-to-cpu --diffusion-fa
```

<img alt="longcat example" src="../assets/longcat/example.png" />
3 changes: 2 additions & 1 deletion src/anima.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -598,7 +598,8 @@ namespace Anima {
{},
empty_ref_latents,
false,
1.0f);
1.0f,
false);

std::vector<float> axis_thetas = {
static_cast<float>(theta) * calc_ntk_factor(t_extrapolation_ratio, axes_dim[0]),
Expand Down
2 changes: 1 addition & 1 deletion src/auto_encoder_kl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -680,7 +680,7 @@ struct AutoEncoderKL : public VAE {
} else if (sd_version_is_sd3(version)) {
scale_factor = 1.5305f;
shift_factor = 0.0609f;
} else if (sd_version_is_flux(version) || sd_version_is_z_image(version)) {
} else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_longcat(version)) {
scale_factor = 0.3611f;
shift_factor = 0.1159f;
} else if (sd_version_uses_flux2_vae(version)) {
Expand Down
84 changes: 79 additions & 5 deletions src/conditioner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1747,14 +1747,18 @@ struct LLMEmbedder : public Conditioner {
std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
const std::pair<int, int>& attn_range,
size_t min_length = 0,
size_t max_length = 100000000) {
size_t max_length = 100000000,
bool spell_quotes = false) {
std::vector<std::pair<std::string, float>> parsed_attention;
if (attn_range.first >= 0 && attn_range.second > 0) {
if (attn_range.first > 0) {
parsed_attention.emplace_back(text.substr(0, attn_range.first), 1.f);
}
if (attn_range.second - attn_range.first > 0) {
auto new_parsed_attention = parse_prompt_attention(text.substr(attn_range.first, attn_range.second - attn_range.first));
if (spell_quotes) {
new_parsed_attention = split_quotation_attention(new_parsed_attention);
}
parsed_attention.insert(parsed_attention.end(),
new_parsed_attention.begin(),
new_parsed_attention.end());
Expand Down Expand Up @@ -1804,8 +1808,10 @@ struct LLMEmbedder : public Conditioner {
int hidden_states_min_length,
const std::vector<std::pair<int, sd::Tensor<float>>>& image_embeds,
const std::set<int>& out_layers,
int prompt_template_encode_start_idx) {
auto tokens_weights_mask = tokenize(prompt, prompt_attn_range, min_length);
int prompt_template_encode_start_idx,
bool spell_quotes = false,
int max_length = 100000000) {
auto tokens_weights_mask = tokenize(prompt, prompt_attn_range, min_length, max_length, spell_quotes);
auto& tokens = std::get<0>(tokens_weights_mask);
auto& weights = std::get<1>(tokens_weights_mask);
auto& mask = std::get<2>(tokens_weights_mask);
Expand Down Expand Up @@ -1866,6 +1872,7 @@ struct LLMEmbedder : public Conditioner {
int prompt_template_encode_start_idx = 34;
int min_length = 0; // pad tokens
int hidden_states_min_length = 0; // zero pad hidden_states
bool spell_quotes = false;
std::set<int> out_layers;

int64_t t0 = ggml_time_ms();
Expand Down Expand Up @@ -1938,6 +1945,71 @@ struct LLMEmbedder : public Conditioner {

prompt += "<|im_end|>\n<|im_start|>assistant\n";
}
} else if (sd_version_is_longcat(version)) {
spell_quotes = true;

if (llm->enable_vision && conditioner_params.ref_images != nullptr && !conditioner_params.ref_images->empty()) {
LOG_INFO("LongCatEditPipeline");
prompt_template_encode_start_idx = 67;
min_length = 512 + prompt_template_encode_start_idx;
int image_embed_idx = 36 + 6;

int min_pixels = 384 * 384;
int max_pixels = 560 * 560;
std::string placeholder = "<|image_pad|>";
std::string img_prompt;

for (int i = 0; i < conditioner_params.ref_images->size(); i++) {
const auto& image = (*conditioner_params.ref_images)[i];
double factor = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size;
int height = static_cast<int>(image.shape()[1]);
int width = static_cast<int>(image.shape()[0]);
int h_bar = static_cast<int>(std::round(height / factor) * factor);
int w_bar = static_cast<int>(std::round(width / factor) * factor);

if (static_cast<double>(h_bar) * w_bar > max_pixels) {
double beta = std::sqrt((height * width) / static_cast<double>(max_pixels));
h_bar = std::max(static_cast<int>(factor),
static_cast<int>(std::floor(height / beta / factor)) * static_cast<int>(factor));
w_bar = std::max(static_cast<int>(factor),
static_cast<int>(std::floor(width / beta / factor)) * static_cast<int>(factor));
} else if (static_cast<double>(h_bar) * w_bar < min_pixels) {
double beta = std::sqrt(static_cast<double>(min_pixels) / (height * width));
h_bar = static_cast<int>(std::ceil(height * beta / factor)) * static_cast<int>(factor);
w_bar = static_cast<int>(std::ceil(width * beta / factor)) * static_cast<int>(factor);
}

LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, height, width, h_bar, w_bar);

auto resized_image = clip_preprocess(image, w_bar, h_bar);
auto image_embed = llm->encode_image(n_threads, resized_image);
GGML_ASSERT(!image_embed.empty());
image_embeds.emplace_back(image_embed_idx, image_embed);
image_embed_idx += 1 + static_cast<int>(image_embed.shape()[1]) + 6;

img_prompt += "<|vision_start|>";
int64_t num_image_tokens = image_embed.shape()[1];
img_prompt.reserve(num_image_tokens * placeholder.size());
for (int j = 0; j < num_image_tokens; j++) {
img_prompt += placeholder;
}
img_prompt += "<|vision_end|>";
}

prompt = "<|im_start|>system\nAs an image editing expert, first analyze the content and attributes of the input image(s). Then, based on the user's editing instructions, clearly and precisely determine how to modify the given image(s), ensuring that only the specified parts are altered and all other aspects remain consistent with the original(s).<|im_end|>\n<|im_start|>user\n";
prompt += img_prompt;
} else {
prompt_template_encode_start_idx = 36;
min_length = 512 + prompt_template_encode_start_idx;

prompt = "<|im_start|>system\nAs an image captioning expert, generate a descriptive text prompt based on an image content, suitable for input to a text-to-image model.<|im_end|>\n<|im_start|>user\n";
}

prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());

prompt += "<|im_end|>\n<|im_start|>assistant\n";
} else if (version == VERSION_FLUX2) {
prompt_template_encode_start_idx = 0;
hidden_states_min_length = 512;
Expand Down Expand Up @@ -2012,7 +2084,8 @@ struct LLMEmbedder : public Conditioner {
hidden_states_min_length,
image_embeds,
out_layers,
prompt_template_encode_start_idx);
prompt_template_encode_start_idx,
spell_quotes);
std::vector<sd::Tensor<float>> extra_hidden_states_vec;
for (int i = 0; i < extra_prompts.size(); i++) {
auto extra_hidden_states = encode_prompt(n_threads,
Expand All @@ -2022,7 +2095,8 @@ struct LLMEmbedder : public Conditioner {
hidden_states_min_length,
image_embeds,
out_layers,
prompt_template_encode_start_idx);
prompt_template_encode_start_idx,
spell_quotes);
extra_hidden_states_vec.push_back(std::move(extra_hidden_states));
}

Expand Down
8 changes: 5 additions & 3 deletions src/flux.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -446,7 +446,6 @@ namespace Flux {
if (use_yak_mlp || use_mlp_silu_act) {
mlp_mult_factor = 2;
}

blocks["linear1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size * 3 + mlp_hidden_dim * mlp_mult_factor, mlp_proj_bias));
blocks["linear2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size + mlp_hidden_dim, hidden_size, mlp_proj_bias));
blocks["norm"] = std::shared_ptr<GGMLBlock>(new QKNorm(head_dim));
Expand Down Expand Up @@ -1225,6 +1224,9 @@ namespace Flux {
flux_params.share_modulation = true;
flux_params.ref_index_scale = 10.f;
flux_params.use_mlp_silu_act = true;
} else if (sd_version_is_longcat(version)) {
flux_params.context_in_dim = 3584;
flux_params.vec_in_dim = 0;
}
int64_t head_dim = 0;
int64_t actual_radiance_patch_size = -1;
Expand Down Expand Up @@ -1412,7 +1414,6 @@ namespace Flux {
} else if (version == VERSION_OVIS_IMAGE) {
txt_arange_dims = {1, 2};
}

pe_vec = Rope::gen_flux_pe(static_cast<int>(x->ne[1]),
static_cast<int>(x->ne[0]),
flux_params.patch_size,
Expand All @@ -1425,7 +1426,8 @@ namespace Flux {
flux_params.theta,
circular_y_enabled,
circular_x_enabled,
flux_params.axes_dim);
flux_params.axes_dim,
sd_version_is_longcat(version));
int pos_len = static_cast<int>(pe_vec.size() / flux_params.axes_dim_sum / 2);
// LOG_DEBUG("pos_len %d", pos_len);
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, flux_params.axes_dim_sum / 2, pos_len);
Expand Down
12 changes: 9 additions & 3 deletions src/ggml_extend.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -953,11 +953,17 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_group_norm_32(ggml_context* ctx,
return ggml_group_norm(ctx, a, 32, eps);
}

// Stride check on `x`: returns true only when all three conditions hold.
//   1. nb[0] equals the size reported by ggml_type_size() for x's type.
//   2. nb[2] equals nb[1] * ne[1].
//   3. nb[3] equals nb[2] * ne[2].
// nb[1] itself is never compared against anything, so this is a weaker
// requirement than full contiguity.
// NOTE(review): presumably nb[] are byte strides, which would mean rows may
// carry trailing padding and still pass — confirm against ggml.h.
__STATIC_INLINE__ bool ggml_ext_is_padded_1d(const ggml_tensor* x) {
    if (x->nb[0] != ggml_type_size(x->type)) {
        return false;
    }
    if (x->nb[2] != x->nb[1] * x->ne[1]) {
        return false;
    }
    return x->nb[3] == x->nb[2] * x->ne[2];
}

__STATIC_INLINE__ ggml_tensor* ggml_ext_scale(ggml_context* ctx,
ggml_tensor* x,
float factor,
bool inplace = false) {
if (!ggml_is_contiguous(x)) {
if (!ggml_ext_is_padded_1d(x)) {
x = ggml_cont(ctx, x);
}
if (inplace) {
Expand Down Expand Up @@ -3664,7 +3670,7 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_lokr_forward(

ggml_tensor* hc = ggml_transpose(ctx, hc_t);
ggml_tensor* out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc), up * vp, batch);
return ggml_scale(ctx, out, scale);
return ggml_ext_scale(ctx, out, scale);
} else {
int batch = (int)h->ne[3];
// 1. Reshape input: [W, H, vq*uq, batch] -> [W, H, vq, uq * batch]
Expand Down Expand Up @@ -3747,7 +3753,7 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_lokr_forward(
ggml_tensor* hc = ggml_transpose(ctx, hc_t);
// ungroup
ggml_tensor* out = ggml_reshape_4d(ctx, ggml_cont(ctx, hc), w_out, h_out, up * vp, batch);
return ggml_scale(ctx, out, scale);
return ggml_ext_scale(ctx, out, scale);
}
}

Expand Down
30 changes: 19 additions & 11 deletions src/model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,7 @@ bool ModelLoader::init_from_diffusers_file(const std::string& file_path, const s
}

SDVersion ModelLoader::get_sd_version() {
TensorStorage token_embedding_weight, input_block_weight;
TensorStorage token_embedding_weight, input_block_weight, context_ebedding_weight;

bool has_multiple_encoders = false;
bool is_unet = false;
Expand All @@ -428,7 +428,8 @@ SDVersion ModelLoader::get_sd_version() {
bool has_attn_1024 = false;

for (auto& [name, tensor_storage] : tensor_storage_map) {
if (tensor_storage.name.find("model.diffusion_model.double_blocks.") != std::string::npos) {
if (tensor_storage.name.find("model.diffusion_model.double_blocks.") != std::string::npos ||
tensor_storage.name.find("model.diffusion_model.single_transformer_blocks.") != std::string::npos) {
is_flux = true;
}
if (tensor_storage.name.find("model.diffusion_model.nerf_final_layer_conv.") != std::string::npos) {
Expand Down Expand Up @@ -522,6 +523,9 @@ SDVersion ModelLoader::get_sd_version() {
tensor_storage.name == "unet.conv_in.weight") {
input_block_weight = tensor_storage;
}
if (tensor_storage.name == "model.diffusion_model.txt_in.weight" || tensor_storage.name == "model.diffusion_model.context_embedder.weight") {
context_ebedding_weight = tensor_storage;
}
}
if (is_wan) {
LOG_DEBUG("patch_embedding_channels %d", patch_embedding_channels);
Expand Down Expand Up @@ -552,16 +556,20 @@ SDVersion ModelLoader::get_sd_version() {
}

if (is_flux && !is_flux2) {
if (input_block_weight.ne[0] == 384) {
return VERSION_FLUX_FILL;
}
if (input_block_weight.ne[0] == 128) {
return VERSION_FLUX_CONTROLS;
}
if (input_block_weight.ne[0] == 196) {
return VERSION_FLEX_2;
if (context_ebedding_weight.ne[0] == 3584) {
return VERSION_LONGCAT;
} else {
if (input_block_weight.ne[0] == 384) {
return VERSION_FLUX_FILL;
}
if (input_block_weight.ne[0] == 128) {
return VERSION_FLUX_CONTROLS;
}
if (input_block_weight.ne[0] == 196) {
return VERSION_FLEX_2;
}
return VERSION_FLUX;
}
return VERSION_FLUX;
}

if (is_flux2) {
Expand Down
11 changes: 10 additions & 1 deletion src/model.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ enum SDVersion {
VERSION_Z_IMAGE,
VERSION_OVIS_IMAGE,
VERSION_ERNIE_IMAGE,
VERSION_LONGCAT,
VERSION_COUNT,
};

Expand Down Expand Up @@ -141,6 +142,13 @@ static inline bool sd_version_is_z_image(SDVersion version) {
return false;
}

// True only when `version` is VERSION_LONGCAT; false for every other value.
static inline bool sd_version_is_longcat(SDVersion version) {
    return version == VERSION_LONGCAT;
}

static inline bool sd_version_is_ernie_image(SDVersion version) {
if (version == VERSION_ERNIE_IMAGE) {
return true;
Expand Down Expand Up @@ -176,7 +184,8 @@ static inline bool sd_version_is_dit(SDVersion version) {
version == VERSION_HIDREAM_O1 ||
sd_version_is_anima(version) ||
sd_version_is_z_image(version) ||
sd_version_is_ernie_image(version)) {
sd_version_is_ernie_image(version) ||
sd_version_is_longcat(version)) {
return true;
}
return false;
Expand Down
Loading
Loading