From 3a1ba1a0e27811cc70d1ba1fcdc08fb58697cfde Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 20 Dec 2025 00:27:54 +0100 Subject: [PATCH 01/12] 3 files --- ..._blocks.py => modular_blocks_qwenimage.py} | 0 .../modular_blocks_qwenimage_edit.py | 1113 +++++++++++++++++ .../modular_blocks_qwenimage_edit_plus.py | 1113 +++++++++++++++++ 3 files changed, 2226 insertions(+) rename src/diffusers/modular_pipelines/qwenimage/{modular_blocks.py => modular_blocks_qwenimage.py} (100%) create mode 100644 src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py create mode 100644 src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py similarity index 100% rename from src/diffusers/modular_pipelines/qwenimage/modular_blocks.py rename to src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py new file mode 100644 index 000000000000..dcce0cab5dd1 --- /dev/null +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -0,0 +1,1113 @@ +# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ...utils import logging +from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import InsertableDict +from .before_denoise import ( + QwenImageControlNetBeforeDenoiserStep, + QwenImageCreateMaskLatentsStep, + QwenImageEditPlusRoPEInputsStep, + QwenImageEditRoPEInputsStep, + QwenImagePrepareLatentsStep, + QwenImagePrepareLatentsWithStrengthStep, + QwenImageRoPEInputsStep, + QwenImageSetTimestepsStep, + QwenImageSetTimestepsWithStrengthStep, +) +from .decoders import ( + QwenImageAfterDenoiseStep, + QwenImageDecoderStep, + QwenImageInpaintProcessImagesOutputStep, + QwenImageProcessImagesOutputStep, +) +from .denoise import ( + QwenImageControlNetDenoiseStep, + QwenImageDenoiseStep, + QwenImageEditDenoiseStep, + QwenImageEditInpaintDenoiseStep, + QwenImageInpaintControlNetDenoiseStep, + QwenImageInpaintDenoiseStep, + QwenImageLoopBeforeDenoiserControlNet, +) +from .encoders import ( + QwenImageControlNetVaeEncoderStep, + QwenImageEditPlusProcessImagesInputStep, + QwenImageEditPlusResizeDynamicStep, + QwenImageEditPlusTextEncoderStep, + QwenImageEditPlusVaeEncoderDynamicStep, + QwenImageEditResizeDynamicStep, + QwenImageEditTextEncoderStep, + QwenImageInpaintProcessImagesInputStep, + QwenImageProcessImagesInputStep, + QwenImageTextEncoderStep, + QwenImageVaeEncoderDynamicStep, +) +from .inputs import ( + QwenImageControlNetInputsStep, + QwenImageEditPlusInputsDynamicStep, + QwenImageInputsDynamicStep, + QwenImageTextInputsStep, +) + + +logger = logging.get_logger(__name__) + +# 1. 
# QwenImage

## 1.1 QwenImage/text2image

#### QwenImage/decode
#### (standard decode step works for most tasks except for inpaint)
QwenImageDecodeBlocks = InsertableDict(
    [
        ("decode", QwenImageDecoderStep()),
        ("postprocess", QwenImageProcessImagesOutputStep()),
    ]
)


class QwenImageDecodeStep(SequentialPipelineBlocks):
    # Sequential block built from the `decode` and `postprocess` entries of `QwenImageDecodeBlocks`.
    model_name = "qwenimage"
    block_classes = QwenImageDecodeBlocks.values()
    block_names = QwenImageDecodeBlocks.keys()

    @property
    def description(self):
        """Return the human-readable summary of this block."""
        return "Decode step that decodes the latents to images and postprocess the generated image."


#### QwenImage/text2image presets
TEXT2IMAGE_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageTextEncoderStep()),
        ("input", QwenImageTextInputsStep()),
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsStep()),
        ("prepare_rope_inputs", QwenImageRoPEInputsStep()),
        ("denoise", QwenImageDenoiseStep()),
        ("after_denoise", QwenImageAfterDenoiseStep()),
        ("decode", QwenImageDecodeStep()),
    ]
)


## 1.2 QwenImage/inpaint

#### QwenImage/inpaint vae encoder
QwenImageInpaintVaeEncoderBlocks = InsertableDict(
    [
        # image, mask_image -> processed_image, processed_mask_image, mask_overlay_kwargs
        # Instantiated here so the entry matches every other entry in this file
        # (the original passed the bare class object).
        ("preprocess", QwenImageInpaintProcessImagesInputStep()),
        ("encode", QwenImageVaeEncoderDynamicStep()),  # processed_image -> image_latents
    ]
)


class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks):
    # Sequential block built from the `preprocess` and `encode` entries of
    # `QwenImageInpaintVaeEncoderBlocks`.
    model_name = "qwenimage"
    block_classes = QwenImageInpaintVaeEncoderBlocks.values()
    block_names = QwenImageInpaintVaeEncoderBlocks.keys()

    @property
    def description(self) -> str:
        """Return the human-readable summary of this block."""
        return (
            "This step is used for processing image and mask inputs for inpainting tasks. It:\n"
            " - Resizes the image to the target size, based on `height` and `width`.\n"
            " - Processes and updates `image` and `mask_image`.\n"
            " - Creates `image_latents`."
        )


#### QwenImage/inpaint inputs
QwenImageInpaintInputBlocks = InsertableDict(
    [
        ("text_inputs", QwenImageTextInputsStep()),  # default step to process text embeddings
        (
            "additional_inputs",
            QwenImageInputsDynamicStep(
                image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]
            ),
        ),
    ]
)


class QwenImageInpaintInputStep(SequentialPipelineBlocks):
    # Sequential block built from the `text_inputs` and `additional_inputs` entries of
    # `QwenImageInpaintInputBlocks`.
    model_name = "qwenimage"
    block_classes = QwenImageInpaintInputBlocks.values()
    block_names = QwenImageInpaintInputBlocks.keys()

    @property
    def description(self):
        """Return the full multi-line summary of this block.

        The literals are wrapped in parentheses so that they are concatenated into a single
        string. Without the parentheses the `return` statement ends after the first literal
        and the remaining lines are unreachable expression statements.
        """
        return (
            "Input step that prepares the inputs for the inpainting denoising step. It:\n"
            " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents` and `processed_mask_image`).\n"
            " - update height/width based `image_latents`, patchify `image_latents`."
        )


# QwenImage/inpaint prepare latents
QwenImageInpaintPrepareLatentsBlocks = InsertableDict(
    [
        ("add_noise_to_latents", QwenImagePrepareLatentsWithStrengthStep()),
        ("create_mask_latents", QwenImageCreateMaskLatentsStep()),
    ]
)


class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
    # Sequential block built from the `add_noise_to_latents` and `create_mask_latents`
    # entries of `QwenImageInpaintPrepareLatentsBlocks`.
    model_name = "qwenimage"
    block_classes = QwenImageInpaintPrepareLatentsBlocks.values()
    block_names = QwenImageInpaintPrepareLatentsBlocks.keys()

    @property
    def description(self) -> str:
        """Return the human-readable summary of this block."""
        return (
            "This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It:\n"
            " - Add noise to the image latents to create the latents input for the denoiser.\n"
            " - Create the pachified latents `mask` based on the processedmask image.\n"
        )


#### QwenImage/inpaint decode
QwenImageInpaintDecodeBlocks = InsertableDict(
    [
        ("decode", QwenImageDecoderStep()),
        ("postprocess", QwenImageInpaintProcessImagesOutputStep()),
    ]
)


class QwenImageInpaintDecodeStep(SequentialPipelineBlocks):
    # Sequential block built from the `decode` and `postprocess` entries of
    # `QwenImageInpaintDecodeBlocks`.
    model_name = "qwenimage"
    block_classes = QwenImageInpaintDecodeBlocks.values()
    block_names = QwenImageInpaintDecodeBlocks.keys()

    @property
    def description(self):
        """Return the human-readable summary of this block."""
        return "Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image."


#### QwenImage/inpaint presets
INPAINT_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageTextEncoderStep()),
        ("vae_encoder", QwenImageInpaintVaeEncoderStep()),
        ("input", QwenImageInpaintInputStep()),
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()),
        ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()),
        ("prepare_rope_inputs", QwenImageRoPEInputsStep()),
        ("denoise", QwenImageInpaintDenoiseStep()),
        ("after_denoise", QwenImageAfterDenoiseStep()),
        ("decode", QwenImageInpaintDecodeStep()),
    ]
)


## 1.3 QwenImage/img2img

#### QwenImage/img2img vae encoder
QwenImageImg2ImgVaeEncoderBlocks = InsertableDict(
    [
        ("preprocess", QwenImageProcessImagesInputStep()),
        ("encode", QwenImageVaeEncoderDynamicStep()),
    ]
)


class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
    # Sequential block built from the `preprocess` and `encode` entries of
    # `QwenImageImg2ImgVaeEncoderBlocks`.
    model_name = "qwenimage"

    block_classes = QwenImageImg2ImgVaeEncoderBlocks.values()
    block_names = QwenImageImg2ImgVaeEncoderBlocks.keys()

    @property
    def description(self) -> str:
        """Return the human-readable summary of this block."""
        return "Vae encoder step that preprocess andencode the image inputs into their latent representations."


#### QwenImage/img2img inputs
QwenImageImg2ImgInputBlocks = InsertableDict(
    [
        ("text_inputs", QwenImageTextInputsStep()),  # default step to process text embeddings
        ("additional_inputs", QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"])),
    ]
)


class QwenImageImg2ImgInputStep(SequentialPipelineBlocks):
    # Sequential block built from the `text_inputs` and `additional_inputs` entries of
    # `QwenImageImg2ImgInputBlocks`.
    model_name = "qwenimage"
    block_classes = QwenImageImg2ImgInputBlocks.values()
    block_names = QwenImageImg2ImgInputBlocks.keys()

    @property
    def description(self):
        """Return the full multi-line summary of this block.

        The literals are wrapped in parentheses so that they are concatenated into a single
        string. Without the parentheses the `return` statement ends after the first literal
        and the remaining lines are unreachable expression statements.
        """
        return (
            "Input step that prepares the inputs for the img2img denoising step. It:\n"
            " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n"
            " - update height/width based `image_latents`, patchify `image_latents`."
        )


#### QwenImage/img2img presets
IMAGE2IMAGE_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageTextEncoderStep()),
        ("vae_encoder", QwenImageImg2ImgVaeEncoderStep()),
        ("input", QwenImageImg2ImgInputStep()),
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()),
        ("prepare_img2img_latents", QwenImagePrepareLatentsWithStrengthStep()),
        ("prepare_rope_inputs", QwenImageRoPEInputsStep()),
        ("denoise", QwenImageDenoiseStep()),
        ("after_denoise", QwenImageAfterDenoiseStep()),
        ("decode", QwenImageDecodeStep()),
    ]
)


## 1.4 QwenImage/controlnet

#### QwenImage/controlnet presets
CONTROLNET_BLOCKS = InsertableDict(
    [
        ("controlnet_vae_encoder", QwenImageControlNetVaeEncoderStep()),  # vae encoder step for control_image
        ("controlnet_inputs", QwenImageControlNetInputsStep()),  # additional input step for controlnet
        (
            "controlnet_before_denoise",
            QwenImageControlNetBeforeDenoiserStep(),
        ),  # before denoise step (after set_timesteps step)
        (
            "controlnet_denoise_loop_before",
            QwenImageLoopBeforeDenoiserControlNet(),
        ),  # controlnet loop step (insert before the denoiseloop_denoiser)
    ]
)


## 1.5 QwenImage/auto encoders


#### for inpaint and img2img tasks
class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks):
    # The three lists below are parallel: block_classes[i] is named block_names[i] and is
    # associated with block_trigger_inputs[i].
    block_classes = [QwenImageInpaintVaeEncoderStep, QwenImageImg2ImgVaeEncoderStep]
    block_names = ["inpaint", "img2img"]
    block_trigger_inputs = ["mask_image", "image"]

    @property
    def description(self):
        """Return the human-readable summary of this block."""
        return (
            "Vae encoder step that encode the image inputs into their latent representations.\n"
            + "This is an auto pipeline block.\n"
            + " - `QwenImageInpaintVaeEncoderStep` (inpaint) is used when `mask_image` is provided.\n"
            + " - `QwenImageImg2ImgVaeEncoderStep` (img2img) is used when `image` is provided.\n"
            + " - if `mask_image` or `image` is not provided, step will be skipped."
        )


# for controlnet tasks
class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks):
    block_classes = [QwenImageControlNetVaeEncoderStep]
    block_names = ["controlnet"]
    block_trigger_inputs = ["control_image"]

    @property
    def description(self):
        """Return the human-readable summary of this block."""
        return (
            "Vae encoder step that encode the image inputs into their latent representations.\n"
            + "This is an auto pipeline block.\n"
            + " - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided.\n"
            + " - if `control_image` is not provided, step will be skipped."
        )


## 1.6 QwenImage/auto inputs


# text2image/inpaint/img2img
class QwenImageAutoInputStep(AutoPipelineBlocks):
    # The trailing `None` trigger pairs with the text2image block, which the description
    # below lists as the case where neither of the other two inputs is provided.
    block_classes = [QwenImageInpaintInputStep, QwenImageImg2ImgInputStep, QwenImageTextInputsStep]
    block_names = ["inpaint", "img2img", "text2image"]
    block_trigger_inputs = ["processed_mask_image", "image_latents", None]

    @property
    def description(self):
        """Return the human-readable summary of this block."""
        return (
            "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n"
            " This is an auto pipeline block that works for text2image/inpaint/img2img tasks.\n"
            + " - `QwenImageInpaintInputStep` (inpaint) is used when `processed_mask_image` is provided.\n"
            + " - `QwenImageImg2ImgInputStep` (img2img) is used when `image_latents` is provided.\n"
            + " - `QwenImageTextInputsStep` (text2image) is used when both `processed_mask_image` and `image_latents` are not provided.\n"
        )


# controlnet
class QwenImageOptionalControlNetInputStep(AutoPipelineBlocks):
    block_classes = [QwenImageControlNetInputsStep]
    block_names = ["controlnet"]
    block_trigger_inputs = ["control_image_latents"]

    @property
    def description(self):
        """Return the human-readable summary of this block."""
        return (
            "Controlnet input step that prepare the control_image_latents input.\n"
            + "This is an auto pipeline block.\n"
            + " - `QwenImageControlNetInputsStep` (controlnet) is used when `control_image_latents` is provided.\n"
            + " - if `control_image_latents` is not provided, step will be skipped."
        )


## 1.7 QwenImage/auto before denoise step
# compose the steps into a BeforeDenoiseStep for text2image/img2img/inpaint tasks before combine into an auto step

# QwenImage/text2image before denoise
QwenImageText2ImageBeforeDenoiseBlocks = InsertableDict(
    [
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsStep()),
        ("prepare_rope_inputs", QwenImageRoPEInputsStep()),
    ]
)


class QwenImageText2ImageBeforeDenoiseStep(SequentialPipelineBlocks):
    # Sequential block built from the entries of `QwenImageText2ImageBeforeDenoiseBlocks`.
    model_name = "qwenimage"
    block_classes = QwenImageText2ImageBeforeDenoiseBlocks.values()
    block_names = QwenImageText2ImageBeforeDenoiseBlocks.keys()

    @property
    def description(self):
        """Return the human-readable summary of this block."""
        return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for text2image task."


# QwenImage/inpaint before denoise
QwenImageInpaintBeforeDenoiseBlocks = InsertableDict(
    [
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()),
        ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()),
        ("prepare_rope_inputs", QwenImageRoPEInputsStep()),
    ]
)


class QwenImageInpaintBeforeDenoiseStep(SequentialPipelineBlocks):
    # Sequential block built from the entries of `QwenImageInpaintBeforeDenoiseBlocks`.
    model_name = "qwenimage"
    block_classes = QwenImageInpaintBeforeDenoiseBlocks.values()
    block_names = QwenImageInpaintBeforeDenoiseBlocks.keys()

    @property
    def description(self):
        """Return the human-readable summary of this block."""
        return (
            "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) "
            "for the denoise step for inpaint task."
        )


# QwenImage/img2img before denoise
QwenImageImg2ImgBeforeDenoiseBlocks = InsertableDict(
    [
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()),
        ("prepare_img2img_latents", QwenImagePrepareLatentsWithStrengthStep()),
        ("prepare_rope_inputs", QwenImageRoPEInputsStep()),
    ]
)


class QwenImageImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks):
    # Sequential block built from the entries of `QwenImageImg2ImgBeforeDenoiseBlocks`.
    model_name = "qwenimage"
    block_classes = QwenImageImg2ImgBeforeDenoiseBlocks.values()
    block_names = QwenImageImg2ImgBeforeDenoiseBlocks.keys()

    @property
    def description(self):
        """Return the human-readable summary of this block."""
        return (
            "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) "
            "for the denoise step for img2img task."
        )


# auto before_denoise step for text2image, inpaint, img2img tasks
class QwenImageAutoBeforeDenoiseStep(AutoPipelineBlocks):
    # Parallel lists: block_classes[i] is named block_names[i] and is associated with
    # block_trigger_inputs[i]; the text2image entry carries the `None` trigger.
    block_classes = [
        QwenImageInpaintBeforeDenoiseStep,
        QwenImageImg2ImgBeforeDenoiseStep,
        QwenImageText2ImageBeforeDenoiseStep,
    ]
    block_names = ["inpaint", "img2img", "text2image"]
    block_trigger_inputs = ["processed_mask_image", "image_latents", None]

    @property
    def description(self):
        """Return the human-readable summary of this block."""
        return (
            "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step.\n"
            "This is an auto pipeline block that works for text2img, inpainting, img2img tasks.\n"
            " - `QwenImageInpaintBeforeDenoiseStep` (inpaint) is used when `processed_mask_image` is provided.\n"
            " - `QwenImageImg2ImgBeforeDenoiseStep` (img2img) is used when `image_latents` is provided.\n"
            " - `QwenImageText2ImageBeforeDenoiseStep` (text2image) is used when both `processed_mask_image` and `image_latents` are not provided.\n"
        )


# auto before_denoise step for controlnet tasks
class QwenImageOptionalControlNetBeforeDenoiseStep(AutoPipelineBlocks):
    block_classes = [QwenImageControlNetBeforeDenoiserStep]
    block_names = ["controlnet"]
    block_trigger_inputs = ["control_image_latents"]

    @property
    def description(self):
        """Return the human-readable summary of this block."""
        return (
            "Controlnet before denoise step that prepare the controlnet input.\n"
            "This is an auto pipeline block.\n"
            " - `QwenImageControlNetBeforeDenoiserStep` (controlnet) is used when `control_image_latents` is provided.\n"
            " - if `control_image_latents` is not provided, step will be skipped."
        )


## 1.8 QwenImage/auto denoise


# auto denoise step for controlnet tasks: works for all tasks with controlnet
class QwenImageControlNetAutoDenoiseStep(AutoPipelineBlocks):
    block_classes = [QwenImageInpaintControlNetDenoiseStep, QwenImageControlNetDenoiseStep]
    block_names = ["inpaint_denoise", "denoise"]
    block_trigger_inputs = ["mask", None]

    @property
    def description(self):
        """Return the human-readable summary of this block."""
        return (
            "Controlnet step during the denoising process. \n"
            " This is an auto pipeline block that works for inpaint and text2image/img2img tasks with controlnet.\n"
            " - `QwenImageInpaintControlNetDenoiseStep` (inpaint) is used when `mask` is provided.\n"
            " - `QwenImageControlNetDenoiseStep` (text2image/img2img) is used when `mask` is not provided.\n"
        )


# auto denoise step for everything: works for all tasks with or without controlnet
class QwenImageAutoDenoiseStep(AutoPipelineBlocks):
    block_classes = [
        QwenImageControlNetAutoDenoiseStep,
        QwenImageInpaintDenoiseStep,
        QwenImageDenoiseStep,
    ]
    block_names = ["controlnet_denoise", "inpaint_denoise", "denoise"]
    block_trigger_inputs = ["control_image_latents", "mask", None]

    @property
    def description(self):
        """Return the human-readable summary of this block."""
        return (
            "Denoise step that iteratively denoise the latents. \n"
            " This is an auto pipeline block that works for inpaint/text2image/img2img tasks. It also works with controlnet\n"
            " - `QwenImageControlNetAutoDenoiseStep` (controlnet) is used when `control_image_latents` is provided.\n"
            " - `QwenImageInpaintDenoiseStep` (inpaint) is used when `mask` is provided and `control_image_latents` is not provided.\n"
            " - `QwenImageDenoiseStep` (text2image/img2img) is used when `mask` is not provided and `control_image_latents` is not provided.\n"
        )


## 1.9 QwenImage/auto decode
# auto decode step for inpaint and text2image tasks


class QwenImageAutoDecodeStep(AutoPipelineBlocks):
    block_classes = [QwenImageInpaintDecodeStep, QwenImageDecodeStep]
    block_names = ["inpaint_decode", "decode"]
    block_trigger_inputs = ["mask", None]

    @property
    def description(self):
        """Return the human-readable summary of this block."""
        return (
            "Decode step that decode the latents into images. \n"
            " This is an auto pipeline block that works for inpaint/text2image/img2img tasks, for both QwenImage and QwenImage-Edit.\n"
            " - `QwenImageInpaintDecodeStep` (inpaint) is used when `mask` is provided.\n"
            " - `QwenImageDecodeStep` (text2image/img2img) is used when `mask` is not provided.\n"
        )


class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
    # block_classes and block_names are parallel lists (one name per class, same order).
    model_name = "qwenimage"
    block_classes = [
        QwenImageAutoInputStep,
        QwenImageOptionalControlNetInputStep,
        QwenImageAutoBeforeDenoiseStep,
        QwenImageOptionalControlNetBeforeDenoiseStep,
        QwenImageAutoDenoiseStep,
        QwenImageAfterDenoiseStep,
    ]
    block_names = [
        "input",
        "controlnet_input",
        "before_denoise",
        "controlnet_before_denoise",
        "denoise",
        "after_denoise",
    ]

    @property
    def description(self):
        """Return the human-readable summary of this block."""
        return (
            "Core step that performs the denoising process. \n"
            " - `QwenImageAutoInputStep` (input) standardizes the inputs for the denoising step.\n"
            " - `QwenImageOptionalControlNetInputStep` (controlnet_input) prepares the controlnet input.\n"
            " - `QwenImageAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
            " - `QwenImageOptionalControlNetBeforeDenoiseStep` (controlnet_before_denoise) prepares the controlnet input for the denoising step.\n"
            " - `QwenImageAutoDenoiseStep` (denoise) iteratively denoises the latents.\n"
            "This step support text-to-image, image-to-image, inpainting, and controlnet tasks for QwenImage:\n"
            " - for image-to-image generation, you need to provide `image_latents`\n"
            " - for inpainting, you need to provide `processed_mask_image` and `image_latents`\n"
            " - to run the controlnet workflow, you need to provide `control_image_latents`\n"
            " - for text-to-image generation, all you need to provide is prompt embeddings"
        )


## 1.10 QwenImage/auto block & presets
AUTO_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageTextEncoderStep()),
        ("vae_encoder", QwenImageAutoVaeEncoderStep()),
        ("controlnet_vae_encoder", QwenImageOptionalControlNetVaeEncoderStep()),
        ("denoise", QwenImageCoreDenoiseStep()),
        ("decode", QwenImageAutoDecodeStep()),
    ]
)


class QwenImageAutoBlocks(SequentialPipelineBlocks):
    # Sequential block built from the entries of `AUTO_BLOCKS`.
    model_name = "qwenimage"

    block_classes = AUTO_BLOCKS.values()
    block_names = AUTO_BLOCKS.keys()

    @property
    def description(self):
        """Return the human-readable summary of this block."""
        return (
            "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.\n"
            "- for image-to-image generation, you need to provide `image`\n"
            "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n"
            "- to run the controlnet workflow, you need to provide `control_image`\n"
            "- for text-to-image generation, all you need to provide is `prompt`"
        )


# 2.
# QwenImage-Edit

## 2.1 QwenImage-Edit/edit

#### QwenImage-Edit/edit vl encoder: take both image and text prompts
QwenImageEditVLEncoderBlocks = InsertableDict(
    [
        ("resize", QwenImageEditResizeDynamicStep()),
        ("encode", QwenImageEditTextEncoderStep()),
    ]
)


class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
    # Sequential block built from the `resize` and `encode` entries of
    # `QwenImageEditVLEncoderBlocks`.
    # NOTE(review): the sequential Edit steps in this section use model_name "qwenimage"
    # while the Edit auto steps further down the file use "qwenimage-edit" - confirm which
    # is intended.
    model_name = "qwenimage"
    block_classes = QwenImageEditVLEncoderBlocks.values()
    block_names = QwenImageEditVLEncoderBlocks.keys()

    @property
    def description(self) -> str:
        """Return the human-readable summary of this block."""
        return "QwenImage-Edit VL encoder step that encode the image an text prompts together."


#### QwenImage-Edit/edit vae encoder
QwenImageEditVaeEncoderBlocks = InsertableDict(
    [
        ("resize", QwenImageEditResizeDynamicStep()),  # edit has a different resize step
        ("preprocess", QwenImageProcessImagesInputStep()),  # resized_image -> processed_image
        ("encode", QwenImageVaeEncoderDynamicStep()),  # processed_image -> image_latents
    ]
)


class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
    # Sequential block built from the entries of `QwenImageEditVaeEncoderBlocks`.
    model_name = "qwenimage"
    block_classes = QwenImageEditVaeEncoderBlocks.values()
    block_names = QwenImageEditVaeEncoderBlocks.keys()

    @property
    def description(self) -> str:
        """Return the human-readable summary of this block."""
        return "Vae encoder step that encode the image inputs into their latent representations."


#### QwenImage-Edit/edit input
QwenImageEditInputBlocks = InsertableDict(
    [
        ("text_inputs", QwenImageTextInputsStep()),  # default step to process text embeddings
        ("additional_inputs", QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"])),
    ]
)


class QwenImageEditInputStep(SequentialPipelineBlocks):
    # Sequential block built from the `text_inputs` and `additional_inputs` entries of
    # `QwenImageEditInputBlocks`.
    model_name = "qwenimage"
    block_classes = QwenImageEditInputBlocks.values()
    block_names = QwenImageEditInputBlocks.keys()

    @property
    def description(self):
        """Return the full multi-line summary of this block.

        The literals are wrapped in parentheses so that they are concatenated into a single
        string. Without the parentheses the `return` statement ends after the first literal
        and the remaining lines are unreachable expression statements.
        """
        return (
            "Input step that prepares the inputs for the edit denoising step. It:\n"
            " - make sure the text embeddings have consistent batch size as well as the additional inputs: \n"
            " - `image_latents`.\n"
            " - update height/width based `image_latents`, patchify `image_latents`."
        )


#### QwenImage/edit presets
EDIT_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageEditVLEncoderStep()),
        ("vae_encoder", QwenImageEditVaeEncoderStep()),
        ("input", QwenImageEditInputStep()),
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsStep()),
        ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()),
        ("denoise", QwenImageEditDenoiseStep()),
        ("after_denoise", QwenImageAfterDenoiseStep()),
        ("decode", QwenImageDecodeStep()),
    ]
)


## 2.2 QwenImage-Edit/edit inpaint

#### QwenImage-Edit/edit inpaint vae encoder: the difference from regular inpaint is the resize step
QwenImageEditInpaintVaeEncoderBlocks = InsertableDict(
    [
        ("resize", QwenImageEditResizeDynamicStep()),  # image -> resized_image
        # resized_image, mask_image -> processed_image, processed_mask_image, mask_overlay_kwargs
        # Instantiated here so the entry matches every other entry in this file
        # (the original passed the bare class object).
        ("preprocess", QwenImageInpaintProcessImagesInputStep()),
        (
            "encode",
            QwenImageVaeEncoderDynamicStep(input_name="processed_image", output_name="image_latents"),
        ),  # processed_image -> image_latents
    ]
)


class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks):
    # Sequential block built from the entries of `QwenImageEditInpaintVaeEncoderBlocks`.
    model_name = "qwenimage"
    block_classes = QwenImageEditInpaintVaeEncoderBlocks.values()
    block_names = QwenImageEditInpaintVaeEncoderBlocks.keys()

    @property
    def description(self) -> str:
        """Return the human-readable summary of this block."""
        return (
            "This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It:\n"
            " - resize the image for target area (1024 * 1024) while maintaining the aspect ratio.\n"
            " - process the resized image and mask image.\n"
            " - create image latents."
        )


#### QwenImage-Edit/edit inpaint presets
EDIT_INPAINT_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageEditVLEncoderStep()),
        ("vae_encoder", QwenImageEditInpaintVaeEncoderStep()),
        ("input", QwenImageInpaintInputStep()),
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()),
        ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()),
        ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()),
        ("denoise", QwenImageEditInpaintDenoiseStep()),
        ("after_denoise", QwenImageAfterDenoiseStep()),
        ("decode", QwenImageInpaintDecodeStep()),
    ]
)


## 2.3 QwenImage-Edit/auto encoders


class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks):
    # Parallel lists: block_classes[i] is named block_names[i] and is associated with
    # block_trigger_inputs[i].
    block_classes = [
        QwenImageEditInpaintVaeEncoderStep,
        QwenImageEditVaeEncoderStep,
    ]
    block_names = ["edit_inpaint", "edit"]
    block_trigger_inputs = ["mask_image", "image"]

    @property
    def description(self):
        """Return the human-readable summary of this block."""
        return (
            "Vae encoder step that encode the image inputs into their latent representations. \n"
            " This is an auto pipeline block that works for edit and edit_inpaint tasks.\n"
            + " - `QwenImageEditInpaintVaeEncoderStep` (edit_inpaint) is used when `mask_image` is provided.\n"
            + " - `QwenImageEditVaeEncoderStep` (edit) is used when `image` is provided.\n"
            + " - if `mask_image` or `image` is not provided, step will be skipped."
        )


## 2.4 QwenImage-Edit/auto inputs
class QwenImageEditAutoInputStep(AutoPipelineBlocks):
    block_classes = [QwenImageInpaintInputStep, QwenImageEditInputStep]
    block_names = ["edit_inpaint", "edit"]
    block_trigger_inputs = ["processed_mask_image", "image_latents"]

    @property
    def description(self):
        """Return the human-readable summary of this block."""
        return (
            "Input step that prepares the inputs for the edit denoising step.\n"
            + " It is an auto pipeline block that works for edit and edit_inpaint tasks.\n"
            + " - `QwenImageInpaintInputStep` (edit_inpaint) is used when `processed_mask_image` is provided.\n"
            + " - `QwenImageEditInputStep` (edit) is used when `image_latents` is provided.\n"
            + " - if `processed_mask_image` or `image_latents` is not provided, step will be skipped."
        )


## 2.5 QwenImage-Edit/auto before denoise
# compose the steps into a BeforeDenoiseStep for edit and edit_inpaint tasks before combine into an auto step

#### QwenImage-Edit/edit before denoise
QwenImageEditBeforeDenoiseBlocks = InsertableDict(
    [
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsStep()),
        ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()),
    ]
)


class QwenImageEditBeforeDenoiseStep(SequentialPipelineBlocks):
    # Sequential block built from the entries of `QwenImageEditBeforeDenoiseBlocks`.
    model_name = "qwenimage"
    block_classes = QwenImageEditBeforeDenoiseBlocks.values()
    block_names = QwenImageEditBeforeDenoiseBlocks.keys()

    @property
    def description(self):
        """Return the human-readable summary of this block."""
        return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for edit task."


#### QwenImage-Edit/edit inpaint before denoise
QwenImageEditInpaintBeforeDenoiseBlocks = InsertableDict(
    [
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()),
        ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()),
        ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()),
    ]
)


class QwenImageEditInpaintBeforeDenoiseStep(SequentialPipelineBlocks):
    # Sequential block built from the entries of `QwenImageEditInpaintBeforeDenoiseBlocks`.
    model_name = "qwenimage"
    block_classes = QwenImageEditInpaintBeforeDenoiseBlocks.values()
    block_names = QwenImageEditInpaintBeforeDenoiseBlocks.keys()

    @property
    def description(self):
        """Return the human-readable summary of this block."""
        return (
            "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) "
            "for the denoise step for edit inpaint task."
        )


# auto before_denoise step for edit and edit_inpaint tasks
class QwenImageEditAutoBeforeDenoiseStep(AutoPipelineBlocks):
    # Parallel lists: block_classes[i] is named block_names[i] and is associated with
    # block_trigger_inputs[i].
    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageEditInpaintBeforeDenoiseStep,
        QwenImageEditBeforeDenoiseStep,
    ]
    block_names = ["edit_inpaint", "edit"]
    block_trigger_inputs = ["processed_mask_image", "image_latents"]

    @property
    def description(self):
        """Return the human-readable summary of this block."""
        return (
            "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step.\n"
            "This is an auto pipeline block that works for edit (img2img) and edit inpaint tasks.\n"
            " - `QwenImageEditInpaintBeforeDenoiseStep` (edit_inpaint) is used when `processed_mask_image` is provided.\n"
            " - `QwenImageEditBeforeDenoiseStep` (edit) is used when `image_latents` is provided and `processed_mask_image` is not provided.\n"
            " - if `image_latents` or `processed_mask_image` is not provided, step will be skipped."
        )


## 2.6 QwenImage-Edit/auto denoise


class QwenImageEditAutoDenoiseStep(AutoPipelineBlocks):
    model_name = "qwenimage-edit"

    block_classes = [QwenImageEditInpaintDenoiseStep, QwenImageEditDenoiseStep]
    block_names = ["inpaint_denoise", "denoise"]
    block_trigger_inputs = ["processed_mask_image", "image_latents"]

    @property
    def description(self):
        """Return the human-readable summary of this block."""
        return (
            "Denoise step that iteratively denoise the latents. \n"
            "This block supports edit (img2img) and edit inpaint tasks for QwenImage Edit. \n"
            " - `QwenImageEditInpaintDenoiseStep` (inpaint) is used when `processed_mask_image` is provided.\n"
            " - `QwenImageEditDenoiseStep` (img2img) is used when `image_latents` is provided.\n"
            " - if `processed_mask_image` or `image_latents` is not provided, step will be skipped."
        )


## 2.7 QwenImage-Edit/auto blocks & presets


class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
    # block_classes and block_names are parallel lists (one name per class, same order).
    model_name = "qwenimage-edit"
    block_classes = [
        QwenImageEditAutoInputStep,
        QwenImageEditAutoBeforeDenoiseStep,
        QwenImageEditAutoDenoiseStep,
        QwenImageAfterDenoiseStep,
    ]
    block_names = ["input", "before_denoise", "denoise", "after_denoise"]

    @property
    def description(self):
        """Return the human-readable summary of this block."""
        return (
            "Core step that performs the denoising process. \n"
            " - `QwenImageEditAutoInputStep` (input) standardizes the inputs for the denoising step.\n"
            " - `QwenImageEditAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
            " - `QwenImageEditAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n"
            "This step support edit (img2img) and edit inpainting workflow for QwenImage Edit:\n"
            " - When `processed_mask_image` is provided, it will be used for edit inpainting task.\n"
            " - When `image_latents` is provided, it will be used for edit (img2img) task.\n"
        )


EDIT_AUTO_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageEditVLEncoderStep()),
        ("vae_encoder", QwenImageEditAutoVaeEncoderStep()),
        ("denoise", QwenImageEditCoreDenoiseStep()),
        ("decode", QwenImageAutoDecodeStep()),
    ]
)


class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
    # Sequential block built from the entries of `EDIT_AUTO_BLOCKS`.
    model_name = "qwenimage-edit"
    block_classes = EDIT_AUTO_BLOCKS.values()
    block_names = EDIT_AUTO_BLOCKS.keys()

    @property
    def description(self):
        """Return the human-readable summary of this block."""
        return (
            "Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.\n"
            "- for edit (img2img) generation, you need to provide `image`\n"
            "- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n"
        )


#################### QwenImage Edit Plus #####################

# 3.
# QwenImage-Edit Plus

## 3.1 QwenImage-Edit Plus / edit

#### QwenImage-Edit Plus vl encoder: take both image and text prompts
# Sub-steps of the VL encoder block: a resize step followed by the text-encoder step.
QwenImageEditPlusVLEncoderBlocks = InsertableDict(
    [
        ("resize", QwenImageEditPlusResizeDynamicStep()),
        ("encode", QwenImageEditPlusTextEncoderStep()),
    ]
)


class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
    """Sequential block whose sub-blocks are the entries of `QwenImageEditPlusVLEncoderBlocks`."""

    model_name = "qwenimage"
    block_classes = QwenImageEditPlusVLEncoderBlocks.values()
    block_names = QwenImageEditPlusVLEncoderBlocks.keys()

    @property
    def description(self) -> str:
        summary = "QwenImage-Edit Plus VL encoder step that encode the image an text prompts together."
        return summary


#### QwenImage-Edit Plus vae encoder
QwenImageEditPlusVaeEncoderBlocks = InsertableDict(
    [
        # edit plus has a different resize step
        ("resize", QwenImageEditPlusResizeDynamicStep()),
        # vae_image -> processed_image
        ("preprocess", QwenImageEditPlusProcessImagesInputStep()),
        # processed_image -> image_latents
        ("encode", QwenImageEditPlusVaeEncoderDynamicStep()),
    ]
)


class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
    """Sequential block whose sub-blocks are the entries of `QwenImageEditPlusVaeEncoderBlocks`."""

    model_name = "qwenimage-edit-plus"
    block_classes = QwenImageEditPlusVaeEncoderBlocks.values()
    block_names = QwenImageEditPlusVaeEncoderBlocks.keys()

    @property
    def description(self) -> str:
        summary = "Vae encoder step that encode the image inputs into their latent representations."
        return summary
#### QwenImage Edit Plus input blocks
QwenImageEditPlusInputBlocks = InsertableDict(
    [
        # default step to process text embeddings
        ("text_inputs", QwenImageTextInputsStep()),
        ("additional_inputs", QwenImageEditPlusInputsDynamicStep(image_latent_inputs=["image_latents"])),
    ]
)


class QwenImageEditPlusInputStep(SequentialPipelineBlocks):
    """Sequential block whose sub-blocks are the entries of `QwenImageEditPlusInputBlocks`."""

    model_name = "qwenimage-edit-plus"
    block_classes = QwenImageEditPlusInputBlocks.values()
    block_names = QwenImageEditPlusInputBlocks.keys()


#### QwenImage Edit Plus presets
# Full edit-plus preset, from text encoding through decoding.
EDIT_PLUS_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageEditPlusVLEncoderStep()),
        ("vae_encoder", QwenImageEditPlusVaeEncoderStep()),
        ("input", QwenImageEditPlusInputStep()),
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsStep()),
        ("prepare_rope_inputs", QwenImageEditPlusRoPEInputsStep()),
        ("denoise", QwenImageEditDenoiseStep()),
        ("after_denoise", QwenImageAfterDenoiseStep()),
        ("decode", QwenImageDecodeStep()),
    ]
)


# Same three entries as the prepare_latents/set_timesteps/prepare_rope_inputs slice of EDIT_PLUS_BLOCKS.
QwenImageEditPlusBeforeDenoiseBlocks = InsertableDict(
    [
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsStep()),
        ("prepare_rope_inputs", QwenImageEditPlusRoPEInputsStep()),
    ]
)


class QwenImageEditPlusBeforeDenoiseStep(SequentialPipelineBlocks):
    """Sequential block whose sub-blocks are the entries of `QwenImageEditPlusBeforeDenoiseBlocks`."""

    model_name = "qwenimage-edit-plus"
    block_classes = QwenImageEditPlusBeforeDenoiseBlocks.values()
    block_names = QwenImageEditPlusBeforeDenoiseBlocks.keys()

    @property
    def description(self) -> str:
        summary = "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for edit task."
        return summary
# auto before_denoise step for edit tasks
class QwenImageEditPlusAutoBeforeDenoiseStep(AutoPipelineBlocks):
    """Auto block wrapping `QwenImageEditPlusBeforeDenoiseStep`, keyed on `image_latents`."""

    model_name = "qwenimage-edit-plus"
    block_classes = [QwenImageEditPlusBeforeDenoiseStep]
    block_names = ["edit"]
    block_trigger_inputs = ["image_latents"]

    @property
    def description(self) -> str:
        # `image_latents` is the only trigger declared above, so the text no longer
        # mentions `processed_mask_image`.
        return (
            "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step.\n"
            "This is an auto pipeline block that works for edit (img2img) task.\n"
            " - `QwenImageEditPlusBeforeDenoiseStep` (edit) is used when `image_latents` is provided.\n"
            " - if `image_latents` is not provided, step will be skipped."
        )


## 3.2 QwenImage-Edit Plus/auto encoders


class QwenImageEditPlusAutoVaeEncoderStep(AutoPipelineBlocks):
    """Auto block wrapping `QwenImageEditPlusVaeEncoderStep`, keyed on `image`."""

    # NOTE(review): no `model_name` is set here, unlike the sibling edit-plus blocks -- confirm intended.
    block_classes = [QwenImageEditPlusVaeEncoderStep]
    block_names = ["edit"]
    block_trigger_inputs = ["image"]

    @property
    def description(self) -> str:
        return (
            "Vae encoder step that encode the image inputs into their latent representations. \n"
            " This is an auto pipeline block that works for edit task.\n"
            " - `QwenImageEditPlusVaeEncoderStep` (edit) is used when `image` is provided.\n"
            " - if `image` is not provided, step will be skipped."
        )


## 3.3 QwenImage-Edit Plus/auto blocks & presets


class QwenImageEditPlusAutoInputStep(AutoPipelineBlocks):
    """Auto block wrapping `QwenImageEditPlusInputStep`, keyed on `image_latents`."""

    # NOTE(review): no `model_name` is set here, unlike the sibling edit-plus blocks -- confirm intended.
    block_classes = [QwenImageEditPlusInputStep]
    block_names = ["edit"]
    block_trigger_inputs = ["image_latents"]

    @property
    def description(self) -> str:
        return (
            "Input step that prepares the inputs for the edit denoising step.\n"
            " It is an auto pipeline block that works for edit task.\n"
            " - `QwenImageEditPlusInputStep` (edit) is used when `image_latents` is provided.\n"
            " - if `image_latents` is not provided, step will be skipped."
        )


class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
    """Sequential block chaining the input, before_denoise, denoise and after_denoise blocks for QwenImage-Edit Plus."""

    model_name = "qwenimage-edit-plus"
    # The denoise slot reuses the QwenImage-Edit auto denoise block.
    block_classes = [
        QwenImageEditPlusAutoInputStep,
        QwenImageEditPlusAutoBeforeDenoiseStep,
        QwenImageEditAutoDenoiseStep,
        QwenImageAfterDenoiseStep,
    ]
    block_names = ["input", "before_denoise", "denoise", "after_denoise"]

    @property
    def description(self) -> str:
        # The "input" line names QwenImageEditPlusAutoInputStep, the class actually
        # listed in `block_classes`; it previously named the non-Plus class.
        return (
            "Core step that performs the denoising process. \n"
            " - `QwenImageEditPlusAutoInputStep` (input) standardizes the inputs for the denoising step.\n"
            " - `QwenImageEditPlusAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
            " - `QwenImageEditAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n"
            "This step support edit (img2img) workflow for QwenImage Edit Plus:\n"
            " - When `image_latents` is provided, it will be used for edit (img2img) task.\n"
        )


# Top-level preset for the QwenImage-Edit Plus auto pipeline.
EDIT_PLUS_AUTO_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageEditPlusVLEncoderStep()),
        ("vae_encoder", QwenImageEditPlusAutoVaeEncoderStep()),
        ("denoise", QwenImageEditPlusCoreDenoiseStep()),
        ("decode", QwenImageAutoDecodeStep()),
    ]
)


class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
    """Sequential block whose sub-blocks are the entries of `EDIT_PLUS_AUTO_BLOCKS`."""

    model_name = "qwenimage-edit-plus"
    block_classes = EDIT_PLUS_AUTO_BLOCKS.values()
    block_names = EDIT_PLUS_AUTO_BLOCKS.keys()

    @property
    def description(self) -> str:
        return (
            "Auto Modular pipeline for edit (img2img) and edit tasks using QwenImage-Edit Plus.\n"
            "- for edit (img2img) generation, you need to provide `image`\n"
        )


# 3.
all block presets supported in QwenImage, QwenImage-Edit, QwenImage-Edit Plus + + +ALL_BLOCKS = { + "text2image": TEXT2IMAGE_BLOCKS, + "img2img": IMAGE2IMAGE_BLOCKS, + "edit": EDIT_BLOCKS, + "edit_inpaint": EDIT_INPAINT_BLOCKS, + "edit_plus": EDIT_PLUS_BLOCKS, + "inpaint": INPAINT_BLOCKS, + "controlnet": CONTROLNET_BLOCKS, + "auto": AUTO_BLOCKS, + "edit_auto": EDIT_AUTO_BLOCKS, + "edit_plus_auto": EDIT_PLUS_AUTO_BLOCKS, +} diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py new file mode 100644 index 000000000000..dcce0cab5dd1 --- /dev/null +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -0,0 +1,1113 @@ +# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ...utils import logging +from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import InsertableDict +from .before_denoise import ( + QwenImageControlNetBeforeDenoiserStep, + QwenImageCreateMaskLatentsStep, + QwenImageEditPlusRoPEInputsStep, + QwenImageEditRoPEInputsStep, + QwenImagePrepareLatentsStep, + QwenImagePrepareLatentsWithStrengthStep, + QwenImageRoPEInputsStep, + QwenImageSetTimestepsStep, + QwenImageSetTimestepsWithStrengthStep, +) +from .decoders import ( + QwenImageAfterDenoiseStep, + QwenImageDecoderStep, + QwenImageInpaintProcessImagesOutputStep, + QwenImageProcessImagesOutputStep, +) +from .denoise import ( + QwenImageControlNetDenoiseStep, + QwenImageDenoiseStep, + QwenImageEditDenoiseStep, + QwenImageEditInpaintDenoiseStep, + QwenImageInpaintControlNetDenoiseStep, + QwenImageInpaintDenoiseStep, + QwenImageLoopBeforeDenoiserControlNet, +) +from .encoders import ( + QwenImageControlNetVaeEncoderStep, + QwenImageEditPlusProcessImagesInputStep, + QwenImageEditPlusResizeDynamicStep, + QwenImageEditPlusTextEncoderStep, + QwenImageEditPlusVaeEncoderDynamicStep, + QwenImageEditResizeDynamicStep, + QwenImageEditTextEncoderStep, + QwenImageInpaintProcessImagesInputStep, + QwenImageProcessImagesInputStep, + QwenImageTextEncoderStep, + QwenImageVaeEncoderDynamicStep, +) +from .inputs import ( + QwenImageControlNetInputsStep, + QwenImageEditPlusInputsDynamicStep, + QwenImageInputsDynamicStep, + QwenImageTextInputsStep, +) + + +logger = logging.get_logger(__name__) + +# 1. 
# QwenImage

## 1.1 QwenImage/text2image

#### QwenImage/decode
#### (standard decode step works for most tasks except for inpaint)
QwenImageDecodeBlocks = InsertableDict(
    [
        ("decode", QwenImageDecoderStep()),
        ("postprocess", QwenImageProcessImagesOutputStep()),
    ]
)


class QwenImageDecodeStep(SequentialPipelineBlocks):
    """Sequential block whose sub-blocks are the entries of `QwenImageDecodeBlocks`."""

    model_name = "qwenimage"
    block_classes = QwenImageDecodeBlocks.values()
    block_names = QwenImageDecodeBlocks.keys()

    @property
    def description(self) -> str:
        return "Decode step that decodes the latents to images and postprocess the generated image."


#### QwenImage/text2image presets
TEXT2IMAGE_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageTextEncoderStep()),
        ("input", QwenImageTextInputsStep()),
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsStep()),
        ("prepare_rope_inputs", QwenImageRoPEInputsStep()),
        ("denoise", QwenImageDenoiseStep()),
        ("after_denoise", QwenImageAfterDenoiseStep()),
        ("decode", QwenImageDecodeStep()),
    ]
)


## 1.2 QwenImage/inpaint

#### QwenImage/inpaint vae encoder
QwenImageInpaintVaeEncoderBlocks = InsertableDict(
    [
        # NOTE(review): this entry is the class itself, while every sibling entry is an
        # instance -- confirm the block machinery accepts both.
        (
            "preprocess",
            QwenImageInpaintProcessImagesInputStep,
        ),  # image, mask_image -> processed_image, processed_mask_image, mask_overlay_kwargs
        ("encode", QwenImageVaeEncoderDynamicStep()),  # processed_image -> image_latents
    ]
)


class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks):
    """Sequential block whose sub-blocks are the entries of `QwenImageInpaintVaeEncoderBlocks`."""

    model_name = "qwenimage"
    block_classes = QwenImageInpaintVaeEncoderBlocks.values()
    block_names = QwenImageInpaintVaeEncoderBlocks.keys()

    @property
    def description(self) -> str:
        return (
            "This step is used for processing image and mask inputs for inpainting tasks. It:\n"
            " - Resizes the image to the target size, based on `height` and `width`.\n"
            " - Processes and updates `image` and `mask_image`.\n"
            " - Creates `image_latents`."
        )


#### QwenImage/inpaint inputs
QwenImageInpaintInputBlocks = InsertableDict(
    [
        ("text_inputs", QwenImageTextInputsStep()),  # default step to process text embeddings
        (
            "additional_inputs",
            QwenImageInputsDynamicStep(
                image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]
            ),
        ),
    ]
)


class QwenImageInpaintInputStep(SequentialPipelineBlocks):
    """Sequential block whose sub-blocks are the entries of `QwenImageInpaintInputBlocks`."""

    model_name = "qwenimage"
    block_classes = QwenImageInpaintInputBlocks.values()
    block_names = QwenImageInpaintInputBlocks.keys()

    @property
    def description(self) -> str:
        # The parentheses make the adjacent literals one concatenated string. Without
        # them only the first line was returned and the remaining literals were
        # unreachable expression statements.
        return (
            "Input step that prepares the inputs for the inpainting denoising step. It:\n"
            " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents` and `processed_mask_image`).\n"
            " - update height/width based `image_latents`, patchify `image_latents`."
        )


# QwenImage/inpaint prepare latents
QwenImageInpaintPrepareLatentsBlocks = InsertableDict(
    [
        ("add_noise_to_latents", QwenImagePrepareLatentsWithStrengthStep()),
        ("create_mask_latents", QwenImageCreateMaskLatentsStep()),
    ]
)


class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
    """Sequential block whose sub-blocks are the entries of `QwenImageInpaintPrepareLatentsBlocks`."""

    model_name = "qwenimage"
    block_classes = QwenImageInpaintPrepareLatentsBlocks.values()
    block_names = QwenImageInpaintPrepareLatentsBlocks.keys()

    @property
    def description(self) -> str:
        return (
            "This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It:\n"
            " - Add noise to the image latents to create the latents input for the denoiser.\n"
            " - Create the pachified latents `mask` based on the processedmask image.\n"
        )


#### QwenImage/inpaint decode
QwenImageInpaintDecodeBlocks = InsertableDict(
    [
        ("decode", QwenImageDecoderStep()),
        ("postprocess", QwenImageInpaintProcessImagesOutputStep()),
    ]
)


class QwenImageInpaintDecodeStep(SequentialPipelineBlocks):
    """Sequential block whose sub-blocks are the entries of `QwenImageInpaintDecodeBlocks`."""

    model_name = "qwenimage"
    block_classes = QwenImageInpaintDecodeBlocks.values()
    block_names = QwenImageInpaintDecodeBlocks.keys()

    @property
    def description(self) -> str:
        return "Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image."


#### QwenImage/inpaint presets
INPAINT_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageTextEncoderStep()),
        ("vae_encoder", QwenImageInpaintVaeEncoderStep()),
        ("input", QwenImageInpaintInputStep()),
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()),
        ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()),
        ("prepare_rope_inputs", QwenImageRoPEInputsStep()),
        ("denoise", QwenImageInpaintDenoiseStep()),
        ("after_denoise", QwenImageAfterDenoiseStep()),
        ("decode", QwenImageInpaintDecodeStep()),
    ]
)


## 1.3 QwenImage/img2img

#### QwenImage/img2img vae encoder
QwenImageImg2ImgVaeEncoderBlocks = InsertableDict(
    [
        ("preprocess", QwenImageProcessImagesInputStep()),
        ("encode", QwenImageVaeEncoderDynamicStep()),
    ]
)


class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
    """Sequential block whose sub-blocks are the entries of `QwenImageImg2ImgVaeEncoderBlocks`."""

    model_name = "qwenimage"

    block_classes = QwenImageImg2ImgVaeEncoderBlocks.values()
    block_names = QwenImageImg2ImgVaeEncoderBlocks.keys()

    @property
    def description(self) -> str:
        return "Vae encoder step that preprocess andencode the image inputs into their latent representations."
#### QwenImage/img2img inputs
QwenImageImg2ImgInputBlocks = InsertableDict(
    [
        ("text_inputs", QwenImageTextInputsStep()),  # default step to process text embeddings
        ("additional_inputs", QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"])),
    ]
)


class QwenImageImg2ImgInputStep(SequentialPipelineBlocks):
    """Sequential block whose sub-blocks are the entries of `QwenImageImg2ImgInputBlocks`."""

    model_name = "qwenimage"
    block_classes = QwenImageImg2ImgInputBlocks.values()
    block_names = QwenImageImg2ImgInputBlocks.keys()

    @property
    def description(self) -> str:
        # The parentheses make the adjacent literals one concatenated string. Without
        # them only the first line was returned and the remaining literals were
        # unreachable expression statements.
        return (
            "Input step that prepares the inputs for the img2img denoising step. It:\n"
            " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n"
            " - update height/width based `image_latents`, patchify `image_latents`."
        )


#### QwenImage/img2img presets
IMAGE2IMAGE_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageTextEncoderStep()),
        ("vae_encoder", QwenImageImg2ImgVaeEncoderStep()),
        ("input", QwenImageImg2ImgInputStep()),
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()),
        ("prepare_img2img_latents", QwenImagePrepareLatentsWithStrengthStep()),
        ("prepare_rope_inputs", QwenImageRoPEInputsStep()),
        ("denoise", QwenImageDenoiseStep()),
        ("after_denoise", QwenImageAfterDenoiseStep()),
        ("decode", QwenImageDecodeStep()),
    ]
)


## 1.4 QwenImage/controlnet

#### QwenImage/controlnet presets
CONTROLNET_BLOCKS = InsertableDict(
    [
        ("controlnet_vae_encoder", QwenImageControlNetVaeEncoderStep()),  # vae encoder step for control_image
        ("controlnet_inputs", QwenImageControlNetInputsStep()),  # additional input step for controlnet
        (
            "controlnet_before_denoise",
            QwenImageControlNetBeforeDenoiserStep(),
        ),  # before denoise step (after set_timesteps step)
        (
            "controlnet_denoise_loop_before",
            QwenImageLoopBeforeDenoiserControlNet(),
        ),  # controlnet loop step (insert before the denoiseloop_denoiser)
    ]
)


##
## 1.5 QwenImage/auto encoders


#### for inpaint and img2img tasks
class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks):
    """Auto block choosing between the inpaint and img2img VAE encoder steps (see `description`)."""

    # The three lists are positional: class i is named block_names[i] and keyed on trigger i.
    block_classes = [QwenImageInpaintVaeEncoderStep, QwenImageImg2ImgVaeEncoderStep]
    block_names = ["inpaint", "img2img"]
    block_trigger_inputs = ["mask_image", "image"]

    @property
    def description(self) -> str:
        return (
            "Vae encoder step that encode the image inputs into their latent representations.\n"
            "This is an auto pipeline block.\n"
            " - `QwenImageInpaintVaeEncoderStep` (inpaint) is used when `mask_image` is provided.\n"
            " - `QwenImageImg2ImgVaeEncoderStep` (img2img) is used when `image` is provided.\n"
            " - if `mask_image` or `image` is not provided, step will be skipped."
        )


# for controlnet tasks
class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks):
    """Auto block wrapping `QwenImageControlNetVaeEncoderStep`, keyed on `control_image`."""

    block_classes = [QwenImageControlNetVaeEncoderStep]
    block_names = ["controlnet"]
    block_trigger_inputs = ["control_image"]

    @property
    def description(self) -> str:
        return (
            "Vae encoder step that encode the image inputs into their latent representations.\n"
            "This is an auto pipeline block.\n"
            " - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided.\n"
            " - if `control_image` is not provided, step will be skipped."
        )


## 1.6 QwenImage/auto inputs


# text2image/inpaint/img2img
class QwenImageAutoInputStep(AutoPipelineBlocks):
    """Auto block choosing between the inpaint, img2img and text2image input steps (see `description`)."""

    block_classes = [QwenImageInpaintInputStep, QwenImageImg2ImgInputStep, QwenImageTextInputsStep]
    block_names = ["inpaint", "img2img", "text2image"]
    # The trailing None pairs with the text2image entry, which the description gives as the fallback.
    block_trigger_inputs = ["processed_mask_image", "image_latents", None]

    @property
    def description(self) -> str:
        return (
            "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n"
            " This is an auto pipeline block that works for text2image/inpaint/img2img tasks.\n"
            " - `QwenImageInpaintInputStep` (inpaint) is used when `processed_mask_image` is provided.\n"
            " - `QwenImageImg2ImgInputStep` (img2img) is used when `image_latents` is provided.\n"
            " - `QwenImageTextInputsStep` (text2image) is used when both `processed_mask_image` and `image_latents` are not provided.\n"
        )


# controlnet
class QwenImageOptionalControlNetInputStep(AutoPipelineBlocks):
    """Auto block wrapping `QwenImageControlNetInputsStep`, keyed on `control_image_latents`."""

    block_classes = [QwenImageControlNetInputsStep]
    block_names = ["controlnet"]
    block_trigger_inputs = ["control_image_latents"]

    @property
    def description(self) -> str:
        return (
            "Controlnet input step that prepare the control_image_latents input.\n"
            "This is an auto pipeline block.\n"
            " - `QwenImageControlNetInputsStep` (controlnet) is used when `control_image_latents` is provided.\n"
            " - if `control_image_latents` is not provided, step will be skipped."
        )


## 1.7 QwenImage/auto before denoise step
# compose the steps into a BeforeDenoiseStep for text2image/img2img/inpaint tasks before combine into an auto step

# QwenImage/text2image before denoise
QwenImageText2ImageBeforeDenoiseBlocks = InsertableDict(
    [
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsStep()),
        ("prepare_rope_inputs", QwenImageRoPEInputsStep()),
    ]
)


class QwenImageText2ImageBeforeDenoiseStep(SequentialPipelineBlocks):
    """Sequential block whose sub-blocks are the entries of `QwenImageText2ImageBeforeDenoiseBlocks`."""

    model_name = "qwenimage"
    block_classes = QwenImageText2ImageBeforeDenoiseBlocks.values()
    block_names = QwenImageText2ImageBeforeDenoiseBlocks.keys()

    @property
    def description(self) -> str:
        summary = "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for text2image task."
        return summary
# QwenImage/inpaint before denoise
QwenImageInpaintBeforeDenoiseBlocks = InsertableDict(
    [
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()),
        ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()),
        ("prepare_rope_inputs", QwenImageRoPEInputsStep()),
    ]
)


class QwenImageInpaintBeforeDenoiseStep(SequentialPipelineBlocks):
    """Sequential block whose sub-blocks are the entries of `QwenImageInpaintBeforeDenoiseBlocks`."""

    model_name = "qwenimage"
    block_classes = QwenImageInpaintBeforeDenoiseBlocks.values()
    block_names = QwenImageInpaintBeforeDenoiseBlocks.keys()

    @property
    def description(self) -> str:
        summary = "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task."
        return summary


# QwenImage/img2img before denoise
# Differs from the inpaint registry above only in its third entry.
QwenImageImg2ImgBeforeDenoiseBlocks = InsertableDict(
    [
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()),
        ("prepare_img2img_latents", QwenImagePrepareLatentsWithStrengthStep()),
        ("prepare_rope_inputs", QwenImageRoPEInputsStep()),
    ]
)


class QwenImageImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks):
    """Sequential block whose sub-blocks are the entries of `QwenImageImg2ImgBeforeDenoiseBlocks`."""

    model_name = "qwenimage"
    block_classes = QwenImageImg2ImgBeforeDenoiseBlocks.values()
    block_names = QwenImageImg2ImgBeforeDenoiseBlocks.keys()

    @property
    def description(self) -> str:
        summary = "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task."
        return summary
# auto before_denoise step for text2image, inpaint, img2img tasks
class QwenImageAutoBeforeDenoiseStep(AutoPipelineBlocks):
    """Auto block choosing between the inpaint, img2img and text2image before-denoise steps."""

    # The three lists are positional: class i is named block_names[i] and keyed on trigger i.
    block_classes = [
        QwenImageInpaintBeforeDenoiseStep,
        QwenImageImg2ImgBeforeDenoiseStep,
        QwenImageText2ImageBeforeDenoiseStep,
    ]
    block_names = ["inpaint", "img2img", "text2image"]
    block_trigger_inputs = ["processed_mask_image", "image_latents", None]

    @property
    def description(self) -> str:
        return (
            "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step.\n"
            "This is an auto pipeline block that works for text2img, inpainting, img2img tasks.\n"
            " - `QwenImageInpaintBeforeDenoiseStep` (inpaint) is used when `processed_mask_image` is provided.\n"
            " - `QwenImageImg2ImgBeforeDenoiseStep` (img2img) is used when `image_latents` is provided.\n"
            " - `QwenImageText2ImageBeforeDenoiseStep` (text2image) is used when both `processed_mask_image` and `image_latents` are not provided.\n"
        )


# auto before_denoise step for controlnet tasks
class QwenImageOptionalControlNetBeforeDenoiseStep(AutoPipelineBlocks):
    """Auto block wrapping `QwenImageControlNetBeforeDenoiserStep`, keyed on `control_image_latents`."""

    block_classes = [QwenImageControlNetBeforeDenoiserStep]
    block_names = ["controlnet"]
    block_trigger_inputs = ["control_image_latents"]

    @property
    def description(self) -> str:
        return (
            "Controlnet before denoise step that prepare the controlnet input.\n"
            "This is an auto pipeline block.\n"
            " - `QwenImageControlNetBeforeDenoiserStep` (controlnet) is used when `control_image_latents` is provided.\n"
            " - if `control_image_latents` is not provided, step will be skipped."
        )


## 1.8 QwenImage/auto denoise


# auto denoise step for controlnet tasks: works for all tasks with controlnet
class QwenImageControlNetAutoDenoiseStep(AutoPipelineBlocks):
    """Auto block choosing between the inpaint and regular controlnet denoise steps, keyed on `mask`."""

    block_classes = [QwenImageInpaintControlNetDenoiseStep, QwenImageControlNetDenoiseStep]
    block_names = ["inpaint_denoise", "denoise"]
    block_trigger_inputs = ["mask", None]

    @property
    def description(self) -> str:
        return (
            "Controlnet step during the denoising process. \n"
            " This is an auto pipeline block that works for inpaint and text2image/img2img tasks with controlnet.\n"
            " - `QwenImageInpaintControlNetDenoiseStep` (inpaint) is used when `mask` is provided.\n"
            " - `QwenImageControlNetDenoiseStep` (text2image/img2img) is used when `mask` is not provided.\n"
        )


# auto denoise step for everything: works for all tasks with or without controlnet
class QwenImageAutoDenoiseStep(AutoPipelineBlocks):
    """Auto block choosing between the controlnet, inpaint and plain denoise steps (see `description`)."""

    block_classes = [
        QwenImageControlNetAutoDenoiseStep,
        QwenImageInpaintDenoiseStep,
        QwenImageDenoiseStep,
    ]
    block_names = ["controlnet_denoise", "inpaint_denoise", "denoise"]
    block_trigger_inputs = ["control_image_latents", "mask", None]

    @property
    def description(self) -> str:
        return (
            "Denoise step that iteratively denoise the latents. \n"
            " This is an auto pipeline block that works for inpaint/text2image/img2img tasks. It also works with controlnet\n"
            " - `QwenImageControlNetAutoDenoiseStep` (controlnet) is used when `control_image_latents` is provided.\n"
            " - `QwenImageInpaintDenoiseStep` (inpaint) is used when `mask` is provided and `control_image_latents` is not provided.\n"
            " - `QwenImageDenoiseStep` (text2image/img2img) is used when `mask` is not provided and `control_image_latents` is not provided.\n"
        )


## 1.9 QwenImage/auto decode
# auto decode step for inpaint and text2image tasks


class QwenImageAutoDecodeStep(AutoPipelineBlocks):
    """Auto block choosing between the inpaint and standard decode steps, keyed on `mask`."""

    block_classes = [QwenImageInpaintDecodeStep, QwenImageDecodeStep]
    block_names = ["inpaint_decode", "decode"]
    block_trigger_inputs = ["mask", None]

    @property
    def description(self) -> str:
        return (
            "Decode step that decode the latents into images. \n"
            " This is an auto pipeline block that works for inpaint/text2image/img2img tasks, for both QwenImage and QwenImage-Edit.\n"
            " - `QwenImageInpaintDecodeStep` (inpaint) is used when `mask` is provided.\n"
            " - `QwenImageDecodeStep` (text2image/img2img) is used when `mask` is not provided.\n"
        )


class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
    """Sequential block chaining the input, controlnet, before_denoise, denoise and after_denoise blocks for QwenImage."""

    model_name = "qwenimage"
    # block_classes[i] is registered under block_names[i].
    block_classes = [
        QwenImageAutoInputStep,
        QwenImageOptionalControlNetInputStep,
        QwenImageAutoBeforeDenoiseStep,
        QwenImageOptionalControlNetBeforeDenoiseStep,
        QwenImageAutoDenoiseStep,
        QwenImageAfterDenoiseStep,
    ]
    block_names = [
        "input",
        "controlnet_input",
        "before_denoise",
        "controlnet_before_denoise",
        "denoise",
        "after_denoise",
    ]

    @property
    def description(self) -> str:
        return (
            "Core step that performs the denoising process. \n"
            " - `QwenImageAutoInputStep` (input) standardizes the inputs for the denoising step.\n"
            " - `QwenImageOptionalControlNetInputStep` (controlnet_input) prepares the controlnet input.\n"
            " - `QwenImageAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
            " - `QwenImageOptionalControlNetBeforeDenoiseStep` (controlnet_before_denoise) prepares the controlnet input for the denoising step.\n"
            " - `QwenImageAutoDenoiseStep` (denoise) iteratively denoises the latents.\n"
            "This step support text-to-image, image-to-image, inpainting, and controlnet tasks for QwenImage:\n"
            " - for image-to-image generation, you need to provide `image_latents`\n"
            " - for inpainting, you need to provide `processed_mask_image` and `image_latents`\n"
            " - to run the controlnet workflow, you need to provide `control_image_latents`\n"
            " - for text-to-image generation, all you need to provide is prompt embeddings"
        )


## 1.10 QwenImage/auto block & presets
AUTO_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageTextEncoderStep()),
        ("vae_encoder", QwenImageAutoVaeEncoderStep()),
        ("controlnet_vae_encoder", QwenImageOptionalControlNetVaeEncoderStep()),
        ("denoise", QwenImageCoreDenoiseStep()),
        ("decode", QwenImageAutoDecodeStep()),
    ]
)


class QwenImageAutoBlocks(SequentialPipelineBlocks):
    """Sequential block whose sub-blocks are the entries of `AUTO_BLOCKS`."""

    model_name = "qwenimage"

    block_classes = AUTO_BLOCKS.values()
    block_names = AUTO_BLOCKS.keys()

    @property
    def description(self) -> str:
        return (
            "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.\n"
            "- for image-to-image generation, you need to provide `image`\n"
            "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n"
            "- to run the controlnet workflow, you need to provide `control_image`\n"
            "- for text-to-image generation, all you need to provide is `prompt`"
        )


# 2.
# QwenImage-Edit

## 2.1 QwenImage-Edit/edit

#### QwenImage-Edit/edit vl encoder: take both image and text prompts
QwenImageEditVLEncoderBlocks = InsertableDict(
    [
        ("resize", QwenImageEditResizeDynamicStep()),
        ("encode", QwenImageEditTextEncoderStep()),
    ]
)


class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
    """Sequential block whose sub-blocks are the entries of `QwenImageEditVLEncoderBlocks`."""

    model_name = "qwenimage"
    block_classes = QwenImageEditVLEncoderBlocks.values()
    block_names = QwenImageEditVLEncoderBlocks.keys()

    @property
    def description(self) -> str:
        return "QwenImage-Edit VL encoder step that encode the image an text prompts together."


#### QwenImage-Edit/edit vae encoder
QwenImageEditVaeEncoderBlocks = InsertableDict(
    [
        ("resize", QwenImageEditResizeDynamicStep()),  # edit has a different resize step
        ("preprocess", QwenImageProcessImagesInputStep()),  # resized_image -> processed_image
        ("encode", QwenImageVaeEncoderDynamicStep()),  # processed_image -> image_latents
    ]
)


class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
    """Sequential block whose sub-blocks are the entries of `QwenImageEditVaeEncoderBlocks`."""

    model_name = "qwenimage"
    block_classes = QwenImageEditVaeEncoderBlocks.values()
    block_names = QwenImageEditVaeEncoderBlocks.keys()

    @property
    def description(self) -> str:
        return "Vae encoder step that encode the image inputs into their latent representations."


#### QwenImage-Edit/edit input
QwenImageEditInputBlocks = InsertableDict(
    [
        ("text_inputs", QwenImageTextInputsStep()),  # default step to process text embeddings
        ("additional_inputs", QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"])),
    ]
)


class QwenImageEditInputStep(SequentialPipelineBlocks):
    """Sequential block whose sub-blocks are the entries of `QwenImageEditInputBlocks`."""

    model_name = "qwenimage"
    block_classes = QwenImageEditInputBlocks.values()
    block_names = QwenImageEditInputBlocks.keys()

    @property
    def description(self) -> str:
        # The parentheses make the adjacent literals one concatenated string. Without
        # them only the first line was returned and the remaining literals were
        # unreachable expression statements.
        return (
            "Input step that prepares the inputs for the edit denoising step. It:\n"
            " - make sure the text embeddings have consistent batch size as well as the additional inputs: \n"
            " - `image_latents`.\n"
            " - update height/width based `image_latents`, patchify `image_latents`."
        )


#### QwenImage/edit presets
EDIT_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageEditVLEncoderStep()),
        ("vae_encoder", QwenImageEditVaeEncoderStep()),
        ("input", QwenImageEditInputStep()),
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsStep()),
        ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()),
        ("denoise", QwenImageEditDenoiseStep()),
        ("after_denoise", QwenImageAfterDenoiseStep()),
        ("decode", QwenImageDecodeStep()),
    ]
)


## 2.2 QwenImage-Edit/edit inpaint

#### QwenImage-Edit/edit inpaint vae encoder: the difference from regular inpaint is the resize step
QwenImageEditInpaintVaeEncoderBlocks = InsertableDict(
    [
        ("resize", QwenImageEditResizeDynamicStep()),  # image -> resized_image
        # NOTE(review): this entry is the class itself, while every sibling entry is an
        # instance -- confirm the block machinery accepts both.
        (
            "preprocess",
            QwenImageInpaintProcessImagesInputStep,
        ),  # resized_image, mask_image -> processed_image, processed_mask_image, mask_overlay_kwargs
        (
            "encode",
            QwenImageVaeEncoderDynamicStep(input_name="processed_image", output_name="image_latents"),
        ),  # processed_image -> image_latents
    ]
)


class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks):
    """Sequential block whose sub-blocks are the entries of `QwenImageEditInpaintVaeEncoderBlocks`."""

    model_name = "qwenimage"
    block_classes = QwenImageEditInpaintVaeEncoderBlocks.values()
    block_names = QwenImageEditInpaintVaeEncoderBlocks.keys()

    @property
    def description(self) -> str:
        return (
            "This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It:\n"
            " - resize the image for target area (1024 * 1024) while maintaining the aspect ratio.\n"
            " - process the resized image and mask image.\n"
            " - create image latents."
        )


#### QwenImage-Edit/edit inpaint presets
EDIT_INPAINT_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageEditVLEncoderStep()),
        ("vae_encoder", QwenImageEditInpaintVaeEncoderStep()),
        ("input", QwenImageInpaintInputStep()),
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()),
        ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()),
        ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()),
        ("denoise", QwenImageEditInpaintDenoiseStep()),
        ("after_denoise", QwenImageAfterDenoiseStep()),
        ("decode", QwenImageInpaintDecodeStep()),
    ]
)


## 2.3 QwenImage-Edit/auto encoders


class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks):
    """Auto block choosing between the edit-inpaint and edit VAE encoder steps (see `description`)."""

    block_classes = [
        QwenImageEditInpaintVaeEncoderStep,
        QwenImageEditVaeEncoderStep,
    ]
    block_names = ["edit_inpaint", "edit"]
    block_trigger_inputs = ["mask_image", "image"]

    @property
    def description(self) -> str:
        return (
            "Vae encoder step that encode the image inputs into their latent representations. \n"
            " This is an auto pipeline block that works for edit and edit_inpaint tasks.\n"
            " - `QwenImageEditInpaintVaeEncoderStep` (edit_inpaint) is used when `mask_image` is provided.\n"
            " - `QwenImageEditVaeEncoderStep` (edit) is used when `image` is provided.\n"
            " - if `mask_image` or `image` is not provided, step will be skipped."
        )


## 2.4 QwenImage-Edit/auto inputs
class QwenImageEditAutoInputStep(AutoPipelineBlocks):
    """Auto block choosing between the inpaint and edit input steps (see `description`)."""

    block_classes = [QwenImageInpaintInputStep, QwenImageEditInputStep]
    block_names = ["edit_inpaint", "edit"]
    block_trigger_inputs = ["processed_mask_image", "image_latents"]

    @property
    def description(self) -> str:
        return (
            "Input step that prepares the inputs for the edit denoising step.\n"
            " It is an auto pipeline block that works for edit and edit_inpaint tasks.\n"
            " - `QwenImageInpaintInputStep` (edit_inpaint) is used when `processed_mask_image` is provided.\n"
            " - `QwenImageEditInputStep` (edit) is used when `image_latents` is provided.\n"
            " - if `processed_mask_image` or `image_latents` is not provided, step will be skipped."
        )


## 2.5 QwenImage-Edit/auto before denoise
# compose the steps into a BeforeDenoiseStep for edit and edit_inpaint tasks before combine into an auto step

#### QwenImage-Edit/edit before denoise
QwenImageEditBeforeDenoiseBlocks = InsertableDict(
    [
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsStep()),
        ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()),
    ]
)


class QwenImageEditBeforeDenoiseStep(SequentialPipelineBlocks):
    """Sequential block whose sub-blocks are the entries of `QwenImageEditBeforeDenoiseBlocks`."""

    model_name = "qwenimage"
    block_classes = QwenImageEditBeforeDenoiseBlocks.values()
    block_names = QwenImageEditBeforeDenoiseBlocks.keys()

    @property
    def description(self) -> str:
        return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for edit task."
+ + +#### QwenImage-Edit/edit inpaint before denoise +QwenImageEditInpaintBeforeDenoiseBlocks = InsertableDict( + [ + ("prepare_latents", QwenImagePrepareLatentsStep()), + ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()), + ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()), + ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()), + ] +) + + +class QwenImageEditInpaintBeforeDenoiseStep(SequentialPipelineBlocks): + model_name = "qwenimage" + block_classes = QwenImageEditInpaintBeforeDenoiseBlocks.values() + block_names = QwenImageEditInpaintBeforeDenoiseBlocks.keys() + + @property + def description(self): + return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for edit inpaint task." + + +# auto before_denoise step for edit and edit_inpaint tasks +class QwenImageEditAutoBeforeDenoiseStep(AutoPipelineBlocks): + model_name = "qwenimage-edit" + block_classes = [ + QwenImageEditInpaintBeforeDenoiseStep, + QwenImageEditBeforeDenoiseStep, + ] + block_names = ["edit_inpaint", "edit"] + block_trigger_inputs = ["processed_mask_image", "image_latents"] + + @property + def description(self): + return ( + "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step.\n" + + "This is an auto pipeline block that works for edit (img2img) and edit inpaint tasks.\n" + + " - `QwenImageEditInpaintBeforeDenoiseStep` (edit_inpaint) is used when `processed_mask_image` is provided.\n" + + " - `QwenImageEditBeforeDenoiseStep` (edit) is used when `image_latents` is provided and `processed_mask_image` is not provided.\n" + + " - if `image_latents` or `processed_mask_image` is not provided, step will be skipped." 
+ ) + + +## 2.6 QwenImage-Edit/auto denoise + + +class QwenImageEditAutoDenoiseStep(AutoPipelineBlocks): + model_name = "qwenimage-edit" + + block_classes = [QwenImageEditInpaintDenoiseStep, QwenImageEditDenoiseStep] + block_names = ["inpaint_denoise", "denoise"] + block_trigger_inputs = ["processed_mask_image", "image_latents"] + + @property + def description(self): + return ( + "Denoise step that iteratively denoise the latents. \n" + + "This block supports edit (img2img) and edit inpaint tasks for QwenImage Edit. \n" + + " - `QwenImageEditInpaintDenoiseStep` (inpaint) is used when `processed_mask_image` is provided.\n" + + " - `QwenImageEditDenoiseStep` (img2img) is used when `image_latents` is provided.\n" + + " - if `processed_mask_image` or `image_latents` is not provided, step will be skipped." + ) + + +## 2.7 QwenImage-Edit/auto blocks & presets + + +class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): + model_name = "qwenimage-edit" + block_classes = [ + QwenImageEditAutoInputStep, + QwenImageEditAutoBeforeDenoiseStep, + QwenImageEditAutoDenoiseStep, + QwenImageAfterDenoiseStep, + ] + block_names = ["input", "before_denoise", "denoise", "after_denoise"] + + @property + def description(self): + return ( + "Core step that performs the denoising process. 
\n" + + " - `QwenImageEditAutoInputStep` (input) standardizes the inputs for the denoising step.\n" + + " - `QwenImageEditAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" + + " - `QwenImageEditAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n" + + "This step support edit (img2img) and edit inpainting workflow for QwenImage Edit:\n" + + " - When `processed_mask_image` is provided, it will be used for edit inpainting task.\n" + + " - When `image_latents` is provided, it will be used for edit (img2img) task.\n" + ) + + +EDIT_AUTO_BLOCKS = InsertableDict( + [ + ("text_encoder", QwenImageEditVLEncoderStep()), + ("vae_encoder", QwenImageEditAutoVaeEncoderStep()), + ("denoise", QwenImageEditCoreDenoiseStep()), + ("decode", QwenImageAutoDecodeStep()), + ] +) + + +class QwenImageEditAutoBlocks(SequentialPipelineBlocks): + model_name = "qwenimage-edit" + block_classes = EDIT_AUTO_BLOCKS.values() + block_names = EDIT_AUTO_BLOCKS.keys() + + @property + def description(self): + return ( + "Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.\n" + + "- for edit (img2img) generation, you need to provide `image`\n" + + "- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n" + ) + + +#################### QwenImage Edit Plus ##################### + +# 3. 
QwenImage-Edit Plus + +## 3.1 QwenImage-Edit Plus / edit + +#### QwenImage-Edit Plus vl encoder: take both image and text prompts +QwenImageEditPlusVLEncoderBlocks = InsertableDict( + [ + ("resize", QwenImageEditPlusResizeDynamicStep()), + ("encode", QwenImageEditPlusTextEncoderStep()), + ] +) + + +class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): + model_name = "qwenimage" + block_classes = QwenImageEditPlusVLEncoderBlocks.values() + block_names = QwenImageEditPlusVLEncoderBlocks.keys() + + @property + def description(self) -> str: + return "QwenImage-Edit Plus VL encoder step that encode the image an text prompts together." + + +#### QwenImage-Edit Plus vae encoder +QwenImageEditPlusVaeEncoderBlocks = InsertableDict( + [ + ("resize", QwenImageEditPlusResizeDynamicStep()), # edit plus has a different resize step + ("preprocess", QwenImageEditPlusProcessImagesInputStep()), # vae_image -> processed_image + ("encode", QwenImageEditPlusVaeEncoderDynamicStep()), # processed_image -> image_latents + ] +) + + +class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): + model_name = "qwenimage-edit-plus" + block_classes = QwenImageEditPlusVaeEncoderBlocks.values() + block_names = QwenImageEditPlusVaeEncoderBlocks.keys() + + @property + def description(self) -> str: + return "Vae encoder step that encode the image inputs into their latent representations." 
+ + +#### QwenImage Edit Plus input blocks +QwenImageEditPlusInputBlocks = InsertableDict( + [ + ("text_inputs", QwenImageTextInputsStep()), # default step to process text embeddings + ( + "additional_inputs", + QwenImageEditPlusInputsDynamicStep(image_latent_inputs=["image_latents"]), + ), + ] +) + + +class QwenImageEditPlusInputStep(SequentialPipelineBlocks): + model_name = "qwenimage-edit-plus" + block_classes = QwenImageEditPlusInputBlocks.values() + block_names = QwenImageEditPlusInputBlocks.keys() + + +#### QwenImage Edit Plus presets +EDIT_PLUS_BLOCKS = InsertableDict( + [ + ("text_encoder", QwenImageEditPlusVLEncoderStep()), + ("vae_encoder", QwenImageEditPlusVaeEncoderStep()), + ("input", QwenImageEditPlusInputStep()), + ("prepare_latents", QwenImagePrepareLatentsStep()), + ("set_timesteps", QwenImageSetTimestepsStep()), + ("prepare_rope_inputs", QwenImageEditPlusRoPEInputsStep()), + ("denoise", QwenImageEditDenoiseStep()), + ("after_denoise", QwenImageAfterDenoiseStep()), + ("decode", QwenImageDecodeStep()), + ] +) + + +QwenImageEditPlusBeforeDenoiseBlocks = InsertableDict( + [ + ("prepare_latents", QwenImagePrepareLatentsStep()), + ("set_timesteps", QwenImageSetTimestepsStep()), + ("prepare_rope_inputs", QwenImageEditPlusRoPEInputsStep()), + ] +) + + +class QwenImageEditPlusBeforeDenoiseStep(SequentialPipelineBlocks): + model_name = "qwenimage-edit-plus" + block_classes = QwenImageEditPlusBeforeDenoiseBlocks.values() + block_names = QwenImageEditPlusBeforeDenoiseBlocks.keys() + + @property + def description(self): + return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for edit task." 
+ + +# auto before_denoise step for edit tasks +class QwenImageEditPlusAutoBeforeDenoiseStep(AutoPipelineBlocks): + model_name = "qwenimage-edit-plus" + block_classes = [QwenImageEditPlusBeforeDenoiseStep] + block_names = ["edit"] + block_trigger_inputs = ["image_latents"] + + @property + def description(self): + return ( + "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step.\n" + + "This is an auto pipeline block that works for edit (img2img) task.\n" + + " - `QwenImageEditPlusBeforeDenoiseStep` (edit) is used when `image_latents` is provided and `processed_mask_image` is not provided.\n" + + " - if `image_latents` is not provided, step will be skipped." + ) + + +## 3.2 QwenImage-Edit Plus/auto encoders + + +class QwenImageEditPlusAutoVaeEncoderStep(AutoPipelineBlocks): + block_classes = [QwenImageEditPlusVaeEncoderStep] + block_names = ["edit"] + block_trigger_inputs = ["image"] + + @property + def description(self): + return ( + "Vae encoder step that encode the image inputs into their latent representations. \n" + " This is an auto pipeline block that works for edit task.\n" + + " - `QwenImageEditPlusVaeEncoderStep` (edit) is used when `image` is provided.\n" + + " - if `image` is not provided, step will be skipped." + ) + + +## 3.3 QwenImage-Edit/auto blocks & presets + + +class QwenImageEditPlusAutoInputStep(AutoPipelineBlocks): + block_classes = [QwenImageEditPlusInputStep] + block_names = ["edit"] + block_trigger_inputs = ["image_latents"] + + @property + def description(self): + return ( + "Input step that prepares the inputs for the edit denoising step.\n" + + " It is an auto pipeline block that works for edit task.\n" + + " - `QwenImageEditPlusInputStep` (edit) is used when `image_latents` is provided.\n" + + " - if `image_latents` is not provided, step will be skipped." 
+ ) + + +class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): + model_name = "qwenimage-edit-plus" + block_classes = [ + QwenImageEditPlusAutoInputStep, + QwenImageEditPlusAutoBeforeDenoiseStep, + QwenImageEditAutoDenoiseStep, + QwenImageAfterDenoiseStep, + ] + block_names = ["input", "before_denoise", "denoise", "after_denoise"] + + @property + def description(self): + return ( + "Core step that performs the denoising process. \n" + + " - `QwenImageEditAutoInputStep` (input) standardizes the inputs for the denoising step.\n" + + " - `QwenImageEditPlusAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" + + " - `QwenImageEditAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n" + + "This step support edit (img2img) workflow for QwenImage Edit Plus:\n" + + " - When `image_latents` is provided, it will be used for edit (img2img) task.\n" + ) + + +EDIT_PLUS_AUTO_BLOCKS = InsertableDict( + [ + ("text_encoder", QwenImageEditPlusVLEncoderStep()), + ("vae_encoder", QwenImageEditPlusAutoVaeEncoderStep()), + ("denoise", QwenImageEditPlusCoreDenoiseStep()), + ("decode", QwenImageAutoDecodeStep()), + ] +) + + +class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): + model_name = "qwenimage-edit-plus" + block_classes = EDIT_PLUS_AUTO_BLOCKS.values() + block_names = EDIT_PLUS_AUTO_BLOCKS.keys() + + @property + def description(self): + return ( + "Auto Modular pipeline for edit (img2img) and edit tasks using QwenImage-Edit Plus.\n" + + "- for edit (img2img) generation, you need to provide `image`\n" + ) + + +# 3. 
all block presets supported in QwenImage, QwenImage-Edit, QwenImage-Edit Plus + + +ALL_BLOCKS = { + "text2image": TEXT2IMAGE_BLOCKS, + "img2img": IMAGE2IMAGE_BLOCKS, + "edit": EDIT_BLOCKS, + "edit_inpaint": EDIT_INPAINT_BLOCKS, + "edit_plus": EDIT_PLUS_BLOCKS, + "inpaint": INPAINT_BLOCKS, + "controlnet": CONTROLNET_BLOCKS, + "auto": AUTO_BLOCKS, + "edit_auto": EDIT_AUTO_BLOCKS, + "edit_plus_auto": EDIT_PLUS_AUTO_BLOCKS, +} From a1af84516946ccdbd9b4179da140e67bd087fc5f Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 22 Dec 2025 01:01:16 +0100 Subject: [PATCH 02/12] add conditoinal pipeline --- .../modular_pipelines/modular_pipeline.py | 350 +++++++++--------- 1 file changed, 184 insertions(+), 166 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index c5fa4cf9921f..d710bf18eb48 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -231,7 +231,7 @@ def format_value(v): class ModularPipelineBlocks(ConfigMixin, PushToHubMixin): """ - Base class for all Pipeline Blocks: PipelineBlock, AutoPipelineBlocks, SequentialPipelineBlocks, + Base class for all Pipeline Blocks: ConditionalPipelineBlocks, AutoPipelineBlocks, SequentialPipelineBlocks, LoopSequentialPipelineBlocks [`ModularPipelineBlocks`] provides method to load and save the definition of pipeline blocks. @@ -527,9 +527,10 @@ def doc(self): ) -class AutoPipelineBlocks(ModularPipelineBlocks): +class ConditionalPipelineBlocks(ModularPipelineBlocks): """ - A Pipeline Blocks that automatically selects a block to run based on the inputs. + A Pipeline Blocks that conditionally selects a block to run based on the inputs. + Subclasses must implement the `select_block` method to define the logic for selecting the block. This class inherits from [`ModularPipelineBlocks`]. 
Check the superclass documentation for the generic methods the library implements for all the pipeline blocks (such as loading or saving etc.) @@ -539,12 +540,13 @@ class AutoPipelineBlocks(ModularPipelineBlocks): Attributes: block_classes: List of block classes to be used block_names: List of prefixes for each block - block_trigger_inputs: List of input names that trigger specific blocks, with None for default + block_trigger_inputs: List of input names that select_block() uses to determine which block to run """ block_classes = [] block_names = [] block_trigger_inputs = [] + default_block_name = None # name of the default block if no trigger inputs are provided, if None, this block can be skipped if no trigger inputs are provided def __init__(self): sub_blocks = InsertableDict() @@ -554,26 +556,15 @@ def __init__(self): else: sub_blocks[block_name] = block self.sub_blocks = sub_blocks - if not (len(self.block_classes) == len(self.block_names) == len(self.block_trigger_inputs)): + if not (len(self.block_classes) == len(self.block_names)): raise ValueError( - f"In {self.__class__.__name__}, the number of block_classes, block_names, and block_trigger_inputs must be the same." + f"In {self.__class__.__name__}, the number of block_classes and block_names must be the same." ) - default_blocks = [t for t in self.block_trigger_inputs if t is None] - # can only have 1 or 0 default block, and has to put in the last - # the order of blocks matters here because the first block with matching trigger will be dispatched - # e.g. 
blocks = [inpaint, img2img] and block_trigger_inputs = ["mask", "image"] - # as long as mask is provided, it is inpaint; if only image is provided, it is img2img - if len(default_blocks) > 1 or (len(default_blocks) == 1 and self.block_trigger_inputs[-1] is not None): + if self.default_block_name is not None and self.default_block_name not in self.block_names: raise ValueError( - f"In {self.__class__.__name__}, exactly one None must be specified as the last element " - "in block_trigger_inputs." + f"In {self.__class__.__name__}, default_block_name '{self.default_block_name}' must be one of block_names: {self.block_names}" ) - # Map trigger inputs to block objects - self.trigger_to_block_map = dict(zip(self.block_trigger_inputs, self.sub_blocks.values())) - self.trigger_to_block_name_map = dict(zip(self.block_trigger_inputs, self.sub_blocks.keys())) - self.block_to_trigger_map = dict(zip(self.sub_blocks.keys(), self.block_trigger_inputs)) - @property def model_name(self): return next(iter(self.sub_blocks.values())).model_name @@ -602,8 +593,11 @@ def expected_configs(self): @property def required_inputs(self) -> List[str]: - if None not in self.block_trigger_inputs: + + # no default block means this conditional block can be skipped entirely + if self.default_block_name is None: return [] + first_block = next(iter(self.sub_blocks.values())) required_by_all = set(getattr(first_block, "required_inputs", set())) @@ -614,7 +608,7 @@ def required_inputs(self) -> List[str]: return list(required_by_all) - # YiYi TODO: add test for this + @property def inputs(self) -> List[Tuple[str, Any]]: named_inputs = [(name, block.inputs) for name, block in self.sub_blocks.items()] @@ -639,36 +633,9 @@ def outputs(self) -> List[str]: combined_outputs = self.combine_outputs(*named_outputs) return combined_outputs - @torch.no_grad() - def __call__(self, pipeline, state: PipelineState) -> PipelineState: - # Find default block first (if any) - - block = self.trigger_to_block_map.get(None) - 
for input_name in self.block_trigger_inputs: - if input_name is not None and state.get(input_name) is not None: - block = self.trigger_to_block_map[input_name] - break - - if block is None: - logger.info(f"skipping auto block: {self.__class__.__name__}") - return pipeline, state - - try: - logger.info(f"Running block: {block.__class__.__name__}, trigger: {input_name}") - return block(pipeline, state) - except Exception as e: - error_msg = ( - f"\nError in block: {block.__class__.__name__}\n" - f"Error details: {str(e)}\n" - f"Traceback:\n{traceback.format_exc()}" - ) - logger.error(error_msg) - raise - - def _get_trigger_inputs(self): + def _get_trigger_inputs(self) -> set: """ - Returns a set of all unique trigger input values found in the blocks. Returns: Set[str] containing all unique - block_trigger_inputs values + Returns a set of all unique trigger input values found in this block and nested blocks. """ def fn_recursive_get_trigger(blocks): @@ -676,9 +643,8 @@ def fn_recursive_get_trigger(blocks): if blocks is not None: for name, block in blocks.items(): - # Check if current block has trigger inputs(i.e. 
auto block) + # Check if current block has block_trigger_inputs if hasattr(block, "block_trigger_inputs") and block.block_trigger_inputs is not None: - # Add all non-None values from the trigger inputs list trigger_values.update(t for t in block.block_trigger_inputs if t is not None) # If block has sub_blocks, recursively check them @@ -688,15 +654,58 @@ def fn_recursive_get_trigger(blocks): return trigger_values - trigger_inputs = set(self.block_trigger_inputs) - trigger_inputs.update(fn_recursive_get_trigger(self.sub_blocks)) + # Start with this block's block_trigger_inputs + all_triggers = set(t for t in self.block_trigger_inputs if t is not None) + # Add nested triggers + all_triggers.update(fn_recursive_get_trigger(self.sub_blocks)) - return trigger_inputs + return all_triggers @property def trigger_inputs(self): + """All trigger inputs including from nested blocks.""" return self._get_trigger_inputs() + def select_block(self, **kwargs) -> Optional[str]: + """ + Select the block to run based on the trigger inputs. + Subclasses must implement this method to define the logic for selecting the block. + + Args: + **kwargs: Trigger input names and their values from the state. + + Returns: + Optional[str]: The name of the block to run, or None to use default/skip. 
+ """ + raise NotImplementedError(f"Subclass {self.__class__.__name__} must implement the `select_block` method.") + + @torch.no_grad() + def __call__(self, pipeline, state: PipelineState) -> PipelineState: + + trigger_kwargs = {name: state.get(name) for name in self.block_trigger_inputs if name is not None} + block_name = self.select_block(**trigger_kwargs) + + if block_name is None: + block_name = self.default_block_name + + if block_name is None: + logger.info(f"skipping conditional block: {self.__class__.__name__}") + return pipeline, state + + block = self.sub_blocks[block_name] + + try: + logger.info(f"Running block: {block.__class__.__name__}") + return block(pipeline, state) + except Exception as e: + error_msg = ( + f"\nError in block: {block.__class__.__name__}\n" + f"Error details: {str(e)}\n" + f"Traceback:\n{traceback.format_exc()}" + ) + logger.error(error_msg) + raise + def __repr__(self): class_name = self.__class__.__name__ base_class = self.__class__.__bases__[0].__name__ @@ -708,7 +717,7 @@ def __repr__(self): header += "\n" header += " " + "=" * 100 + "\n" header += " This pipeline contains blocks that are selected at runtime based on inputs.\n" - header += f" Trigger Inputs: {[inp for inp in self.trigger_inputs if inp is not None]}\n" + header += f" Trigger Inputs: {sorted(self.trigger_inputs)}\n" header += " " + "=" * 100 + "\n\n" # Format description with proper indentation @@ -729,31 +738,20 @@ def __repr__(self): expected_configs = getattr(self, "expected_configs", []) configs_str = format_configs(expected_configs, indent_level=2, add_empty_lines=False) - # Blocks section - moved to the end with simplified format + # Blocks section blocks_str = " Sub-Blocks:\n" for i, (name, block) in enumerate(self.sub_blocks.items()): - # Get trigger input for this block - trigger = None - if hasattr(self, "block_to_trigger_map"): - trigger = self.block_to_trigger_map.get(name) - # Format the trigger info - if trigger is None: - trigger_str = "[default]" 
- elif isinstance(trigger, (list, tuple)): - trigger_str = f"[trigger: {', '.join(str(t) for t in trigger)}]" - else: - trigger_str = f"[trigger: {trigger}]" - # For AutoPipelineBlocks, add bullet points - blocks_str += f" • {name} {trigger_str} ({block.__class__.__name__})\n" + if name == self.default_block_name: + addtional_str = " [default]" else: - # For SequentialPipelineBlocks, show execution order - blocks_str += f" [{i}] {name} ({block.__class__.__name__})\n" + addtional_str = "" + blocks_str += f" • {name}{addtional_str} ({block.__class__.__name__})\n" # Add block description - desc_lines = block.description.split("\n") - indented_desc = desc_lines[0] - if len(desc_lines) > 1: - indented_desc += "\n" + "\n".join(" " + line for line in desc_lines[1:]) + block_desc_lines = block.description.split("\n") + indented_desc = block_desc_lines[0] + if len(block_desc_lines) > 1: + indented_desc += "\n" + "\n".join(" " + line for line in block_desc_lines[1:]) blocks_str += f" Description: {indented_desc}\n\n" # Build the representation with conditional sections @@ -784,6 +782,35 @@ def doc(self): ) +class AutoPipelineBlocks(ConditionalPipelineBlocks): + """ + A Pipeline Blocks that automatically selects a block to run based on the presence of trigger inputs. + """ + + def __init__(self): + super().__init__() + + if not (len(self.block_classes) == len(self.block_names) == len(self.block_trigger_inputs)): + raise ValueError( + f"In {self.__class__.__name__}, the number of block_classes, block_names, and block_trigger_inputs must be the same." 
+ ) + + @property + def default_block_name(self) -> Optional[str]: + """Derive default_block_name from block_trigger_inputs (None entry).""" + if None in self.block_trigger_inputs: + idx = self.block_trigger_inputs.index(None) + return self.block_names[idx] + return None + + def select_block(self, **kwargs) -> Optional[str]: + """Select block based on which trigger input is present (not None).""" + for trigger_input, block_name in zip(self.block_trigger_inputs, self.block_names): + if trigger_input is not None and kwargs.get(trigger_input) is not None: + return block_name + return None + + class SequentialPipelineBlocks(ModularPipelineBlocks): """ A Pipeline Blocks that combines multiple pipeline block classes into one. When called, it will call each block in @@ -885,7 +912,8 @@ def _get_inputs(self): # Only add outputs if the block cannot be skipped should_add_outputs = True - if hasattr(block, "block_trigger_inputs") and None not in block.block_trigger_inputs: + if isinstance(block, ConditionalPipelineBlocks) and block.default_block_name is None: + # ConditionalPipelineBlocks without default can be skipped should_add_outputs = False if should_add_outputs: @@ -948,8 +976,7 @@ def __call__(self, pipeline, state: PipelineState) -> PipelineState: def _get_trigger_inputs(self): """ - Returns a set of all unique trigger input values found in the blocks. Returns: Set[str] containing all unique - block_trigger_inputs values + Returns a set of all unique trigger input values found in the blocks. """ def fn_recursive_get_trigger(blocks): @@ -957,9 +984,8 @@ def fn_recursive_get_trigger(blocks): if blocks is not None: for name, block in blocks.items(): - # Check if current block has trigger inputs(i.e. 
auto block) + # Check if current block has block_trigger_inputs (ConditionalPipelineBlocks) if hasattr(block, "block_trigger_inputs") and block.block_trigger_inputs is not None: - # Add all non-None values from the trigger inputs list trigger_values.update(t for t in block.block_trigger_inputs if t is not None) # If block has sub_blocks, recursively check them @@ -975,82 +1001,85 @@ def fn_recursive_get_trigger(blocks): def trigger_inputs(self): return self._get_trigger_inputs() - def _traverse_trigger_blocks(self, trigger_inputs): - # Convert trigger_inputs to a set for easier manipulation - active_triggers = set(trigger_inputs) + def _traverse_trigger_blocks(self, active_inputs): + """ + Traverse blocks and select which ones would run given the active inputs. - def fn_recursive_traverse(block, block_name, active_triggers): + Args: + active_inputs: Dict of input names to values that are "present" + + Returns: + OrderedDict of block_name -> block that would execute + """ + + def fn_recursive_traverse(block, block_name, active_inputs): result_blocks = OrderedDict() - # sequential(include loopsequential) or PipelineBlock - if not hasattr(block, "block_trigger_inputs"): - if block.sub_blocks: - # sequential or LoopSequentialPipelineBlocks (keep traversing) - for sub_block_name, sub_block in block.sub_blocks.items(): - blocks_to_update = fn_recursive_traverse(sub_block, sub_block_name, active_triggers) - blocks_to_update = fn_recursive_traverse(sub_block, sub_block_name, active_triggers) - blocks_to_update = {f"{block_name}.{k}": v for k, v in blocks_to_update.items()} - result_blocks.update(blocks_to_update) + # ConditionalPipelineBlocks (includes AutoPipelineBlocks) + if isinstance(block, ConditionalPipelineBlocks): + trigger_kwargs = {name: active_inputs.get(name) for name in block.block_trigger_inputs} + selected_block_name = block.select_block(**trigger_kwargs) + + if selected_block_name is None: + selected_block_name = block.default_block_name + + if 
selected_block_name is None: + return result_blocks + + selected_block = block.sub_blocks[selected_block_name] + + if selected_block.sub_blocks: + result_blocks.update(fn_recursive_traverse(selected_block, block_name, active_inputs)) else: - # PipelineBlock - result_blocks[block_name] = block - # Add this block's output names to active triggers if defined - if hasattr(block, "outputs"): - active_triggers.update(out.name for out in block.outputs) + result_blocks[block_name] = selected_block + if hasattr(selected_block, "outputs"): + for out in selected_block.outputs: + active_inputs[out.name] = True + return result_blocks - # auto + # SequentialPipelineBlocks or LoopSequentialPipelineBlocks + if block.sub_blocks: + for sub_block_name, sub_block in block.sub_blocks.items(): + blocks_to_update = fn_recursive_traverse(sub_block, sub_block_name, active_inputs) + blocks_to_update = {f"{block_name}.{k}": v for k, v in blocks_to_update.items()} + result_blocks.update(blocks_to_update) else: - # Find first block_trigger_input that matches any value in our active_triggers - this_block = None - for trigger_input in block.block_trigger_inputs: - if trigger_input is not None and trigger_input in active_triggers: - this_block = block.trigger_to_block_map[trigger_input] - break - - # If no matches found, try to get the default (None) block - if this_block is None and None in block.block_trigger_inputs: - this_block = block.trigger_to_block_map[None] - - if this_block is not None: - # sequential/auto (keep traversing) - if this_block.sub_blocks: - result_blocks.update(fn_recursive_traverse(this_block, block_name, active_triggers)) - else: - # PipelineBlock - result_blocks[block_name] = this_block - # Add this block's output names to active triggers if defined - # YiYi TODO: do we need outputs here? can it just be intermediate_outputs? can we get rid of outputs attribute? 
- if hasattr(this_block, "outputs"): - active_triggers.update(out.name for out in this_block.outputs) + result_blocks[block_name] = block + if hasattr(block, "outputs"): + for out in block.outputs: + active_inputs[out.name] = True return result_blocks all_blocks = OrderedDict() for block_name, block in self.sub_blocks.items(): - blocks_to_update = fn_recursive_traverse(block, block_name, active_triggers) + blocks_to_update = fn_recursive_traverse(block, block_name, active_inputs) all_blocks.update(blocks_to_update) return all_blocks - def get_execution_blocks(self, *trigger_inputs): - trigger_inputs_all = self.trigger_inputs + def get_execution_blocks(self, **kwargs): + """ + Get the blocks that would execute given the specified inputs. - if trigger_inputs is not None: - if not isinstance(trigger_inputs, (list, tuple, set)): - trigger_inputs = [trigger_inputs] - invalid_inputs = [x for x in trigger_inputs if x not in trigger_inputs_all] - if invalid_inputs: - logger.warning( - f"The following trigger inputs will be ignored as they are not supported: {invalid_inputs}" - ) - trigger_inputs = [x for x in trigger_inputs if x in trigger_inputs_all] + Args: + **kwargs: Input names and values. Only trigger inputs affect block selection. + Pass any inputs that would be non-None at runtime. 
- if trigger_inputs is None: - if None in trigger_inputs_all: - trigger_inputs = [None] - else: - trigger_inputs = [trigger_inputs_all[0]] - blocks_triggered = self._traverse_trigger_blocks(trigger_inputs) + Returns: + SequentialPipelineBlocks containing only the blocks that would execute + + Example: + # Get blocks for inpainting workflow + blocks = pipeline.get_execution_blocks(prompt="a cat", mask=mask, image=image) + + # Get blocks for text2image workflow + blocks = pipeline.get_execution_blocks(prompt="a cat") + """ + # Filter out None values + active_inputs = {k: v for k, v in kwargs.items() if v is not None} + + blocks_triggered = self._traverse_trigger_blocks(active_inputs) return SequentialPipelineBlocks.from_blocks_dict(blocks_triggered) def __repr__(self): @@ -1067,7 +1096,7 @@ def __repr__(self): header += f" Trigger Inputs: {[inp for inp in self.trigger_inputs if inp is not None]}\n" # Get first trigger input as example example_input = next(t for t in self.trigger_inputs if t is not None) - header += f" Use `get_execution_blocks()` with input names to see selected blocks (e.g. `get_execution_blocks('{example_input}')`).\n" + header += f" Use `get_execution_blocks()` to see selected blocks (e.g. 
`get_execution_blocks({example_input}=...)`).\n" header += " " + "=" * 100 + "\n\n" # Format description with proper indentation @@ -1091,22 +1120,9 @@ def __repr__(self): # Blocks section - moved to the end with simplified format blocks_str = " Sub-Blocks:\n" for i, (name, block) in enumerate(self.sub_blocks.items()): - # Get trigger input for this block - trigger = None - if hasattr(self, "block_to_trigger_map"): - trigger = self.block_to_trigger_map.get(name) - # Format the trigger info - if trigger is None: - trigger_str = "[default]" - elif isinstance(trigger, (list, tuple)): - trigger_str = f"[trigger: {', '.join(str(t) for t in trigger)}]" - else: - trigger_str = f"[trigger: {trigger}]" - # For AutoPipelineBlocks, add bullet points - blocks_str += f" • {name} {trigger_str} ({block.__class__.__name__})\n" - else: - # For SequentialPipelineBlocks, show execution order - blocks_str += f" [{i}] {name} ({block.__class__.__name__})\n" + + # show execution order + blocks_str += f" [{i}] {name} ({block.__class__.__name__})\n" # Add block description desc_lines = block.description.split("\n") @@ -1230,15 +1246,9 @@ def _get_inputs(self): if inp.name not in outputs and inp not in inputs: inputs.append(inp) - # Only add outputs if the block cannot be skipped - should_add_outputs = True - if hasattr(block, "block_trigger_inputs") and None not in block.block_trigger_inputs: - should_add_outputs = False - - if should_add_outputs: - # Add this block's outputs - block_intermediate_outputs = [out.name for out in block.intermediate_outputs] - outputs.update(block_intermediate_outputs) + # Add this block's outputs + block_intermediate_outputs = [out.name for out in block.intermediate_outputs] + outputs.update(block_intermediate_outputs) for input_param in inputs: if input_param.name in self.required_inputs: @@ -1295,6 +1305,14 @@ def __init__(self): sub_blocks[block_name] = block self.sub_blocks = sub_blocks + # Validate that sub_blocks are only leaf blocks + for block_name, 
block in self.sub_blocks.items(): + if block.sub_blocks: + raise ValueError( + f"In {self.__class__.__name__}, sub_blocks must be leaf blocks (no sub_blocks). " + f"Block '{block_name}' ({block.__class__.__name__}) has sub_blocks." + ) + @classmethod def from_blocks_dict(cls, blocks_dict: Dict[str, Any]) -> "LoopSequentialPipelineBlocks": """ From 19e2ce1b2de31226c70e9479029d2439df3a01d8 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 22 Dec 2025 01:02:40 +0100 Subject: [PATCH 03/12] refactor qwen modular --- .../modular_pipelines/qwenimage/__init__.py | 42 +- .../qwenimage/before_denoise.py | 52 +- .../modular_pipelines/qwenimage/encoders.py | 339 ++--- .../modular_pipelines/qwenimage/inputs.py | 150 ++- .../qwenimage/modular_blocks_qwenimage.py | 1140 ++++------------- .../modular_blocks_qwenimage_edit.py | 1098 +++------------- .../modular_blocks_qwenimage_edit_plus.py | 1076 +--------------- 7 files changed, 825 insertions(+), 3072 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/__init__.py b/src/diffusers/modular_pipelines/qwenimage/__init__.py index ae4ec4799fbc..b62912825f12 100644 --- a/src/diffusers/modular_pipelines/qwenimage/__init__.py +++ b/src/diffusers/modular_pipelines/qwenimage/__init__.py @@ -21,21 +21,16 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - _import_structure["encoders"] = ["QwenImageTextEncoderStep"] - _import_structure["modular_blocks"] = [ - "ALL_BLOCKS", + _import_structure["modular_blocks_qwenimage"] = [ "AUTO_BLOCKS", - "CONTROLNET_BLOCKS", - "EDIT_AUTO_BLOCKS", - "EDIT_BLOCKS", - "EDIT_INPAINT_BLOCKS", - "EDIT_PLUS_AUTO_BLOCKS", - "EDIT_PLUS_BLOCKS", - "IMAGE2IMAGE_BLOCKS", - "INPAINT_BLOCKS", - "TEXT2IMAGE_BLOCKS", "QwenImageAutoBlocks", + ] + _import_structure["modular_blocks_qwenimage_edit"] = [ + "EDIT_AUTO_BLOCKS", "QwenImageEditAutoBlocks", + ] + _import_structure["modular_blocks_qwenimage_edit_plus"] = [ + "EDIT_PLUS_AUTO_BLOCKS", 
"QwenImageEditPlusAutoBlocks", ] _import_structure["modular_pipeline"] = [ @@ -51,23 +46,16 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .encoders import ( - QwenImageTextEncoderStep, - ) - from .modular_blocks import ( - ALL_BLOCKS, + from .modular_blocks_qwenimage import ( AUTO_BLOCKS, - CONTROLNET_BLOCKS, - EDIT_AUTO_BLOCKS, - EDIT_BLOCKS, - EDIT_INPAINT_BLOCKS, - EDIT_PLUS_AUTO_BLOCKS, - EDIT_PLUS_BLOCKS, - IMAGE2IMAGE_BLOCKS, - INPAINT_BLOCKS, - TEXT2IMAGE_BLOCKS, QwenImageAutoBlocks, + ) + from .modular_blocks_qwenimage_edit import ( + EDIT_AUTO_BLOCKS, QwenImageEditAutoBlocks, + ) + from .modular_blocks_qwenimage_edit_plus import ( + EDIT_PLUS_AUTO_BLOCKS, QwenImageEditPlusAutoBlocks, ) from .modular_pipeline import ( @@ -86,4 +74,4 @@ ) for name, value in _dummy_objects.items(): - setattr(sys.modules[__name__], name, value) + setattr(sys.modules[__name__], name, value) \ No newline at end of file diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index bd92d403539e..55968bd4fc93 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -639,19 +639,65 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state -class QwenImageEditPlusRoPEInputsStep(QwenImageEditRoPEInputsStep): +class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks): + """RoPE inputs step for Edit Plus that handles lists of image heights/widths.""" + model_name = "qwenimage-edit-plus" + @property + def description(self) -> str: + return ( + "Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit Plus.\n" + "Unlike Edit, Edit Plus handles lists of image_height/image_width for multiple reference images.\n" + "Should be placed after prepare_latents step." 
+ ) + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam(name="batch_size", required=True), + InputParam(name="image_height", required=True, type_hint=List[int]), + InputParam(name="image_width", required=True, type_hint=List[int]), + InputParam(name="height", required=True), + InputParam(name="width", required=True), + InputParam(name="prompt_embeds_mask"), + InputParam(name="negative_prompt_embeds_mask"), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + name="img_shapes", + type_hint=List[List[Tuple[int, int, int]]], + description="The shapes of the image latents, used for RoPE calculation", + ), + OutputParam( + name="txt_seq_lens", + kwargs_type="denoiser_input_fields", + type_hint=List[int], + description="The sequence lengths of the prompt embeds, used for RoPE calculation", + ), + OutputParam( + name="negative_txt_seq_lens", + kwargs_type="denoiser_input_fields", + type_hint=List[int], + description="The sequence lengths of the negative prompt embeds, used for RoPE calculation", + ), + ] + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) vae_scale_factor = components.vae_scale_factor + + # Edit Plus: image_height and image_width are lists block_state.img_shapes = [ [ (1, block_state.height // vae_scale_factor // 2, block_state.width // vae_scale_factor // 2), *[ - (1, vae_height // vae_scale_factor // 2, vae_width // vae_scale_factor // 2) - for vae_height, vae_width in zip(block_state.image_height, block_state.image_width) + (1, img_height // vae_scale_factor // 2, img_width // vae_scale_factor // 2) + for img_height, img_width in zip(block_state.image_height, block_state.image_width) ], ] ] * block_state.batch_size diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index b126a368bfdf..01385d38c99e 100644 --- 
a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -244,18 +244,19 @@ def encode_vae_image( class QwenImageEditResizeDynamicStep(ModularPipelineBlocks): model_name = "qwenimage" - def __init__(self, input_name: str = "image", output_name: str = "resized_image"): - """Create a configurable step for resizing images to the target area (1024 * 1024) while maintaining the aspect ratio. - - This block resizes an input image tensor and exposes the resized result under configurable input and output - names. Use this when you need to wire the resize step to different image fields (e.g., "image", - "control_image") - + def __init__( + self, + input_name: str = "image", + output_name: str = "resized_image", + target_area: int = 1024 * 1024, + ): + """Create a configurable step for resizing images to the target area while maintaining the aspect ratio. Args: input_name (str, optional): Name of the image field to read from the pipeline state. Defaults to "image". output_name (str, optional): Name of the resized image field to write back to the pipeline state. Defaults to "resized_image". + target_area (int, optional): Target area in pixels. Defaults to 1024*1024. """ if not isinstance(input_name, str) or not isinstance(output_name, str): raise ValueError( @@ -263,11 +264,12 @@ def __init__(self, input_name: str = "image", output_name: str = "resized_image" ) self._image_input_name = input_name self._resized_image_output_name = output_name + self._target_area = target_area super().__init__() @property def description(self) -> str: - return f"Image Resize step that resize the {self._image_input_name} to the target area (1024 * 1024) while maintaining the aspect ratio." + return f"Image Resize step that resize the {self._image_input_name} to the target area {self._target_area} while maintaining the aspect ratio." 
@property def expected_components(self) -> List[ComponentSpec]: @@ -320,48 +322,67 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): self.set_block_state(state, block_state) return components, state +class QwenImageEditPlusResizeDynamicStep(ModularPipelineBlocks): + """Resize each image independently based on its own aspect ratio. For QwenImage Edit Plus.""" -class QwenImageEditPlusResizeDynamicStep(QwenImageEditResizeDynamicStep): - model_name = "qwenimage" + model_name = "qwenimage-edit-plus" def __init__( - self, - input_name: str = "image", + self, + input_name: str = "image", output_name: str = "resized_image", - vae_image_output_name: str = "vae_image", + target_area: int = 1024 * 1024, ): - """Create a configurable step for resizing images to the target area (384 * 384) while maintaining the aspect ratio. + """Create a step for resizing images to a target area. - This block resizes an input image or a list input images and exposes the resized result under configurable - input and output names. Use this when you need to wire the resize step to different image fields (e.g., - "image", "control_image") + Each image is resized independently based on its own aspect ratio. + This is suitable for Edit Plus where multiple reference images can have different dimensions. Args: - input_name (str, optional): Name of the image field to read from the - pipeline state. Defaults to "image". - output_name (str, optional): Name of the resized image field to write - back to the pipeline state. Defaults to "resized_image". - vae_image_output_name (str, optional): Name of the image field - to write back to the pipeline state. This is used by the VAE encoder step later on. QwenImage Edit Plus - processes the input image(s) differently for the VL and the VAE. + input_name (str, optional): Name of the image field to read. Defaults to "image". + output_name (str, optional): Name of the resized image field to write. Defaults to "resized_image". 
+ target_area (int, optional): Target area in pixels. Defaults to 1024*1024. """ if not isinstance(input_name, str) or not isinstance(output_name, str): raise ValueError( f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}" ) - self.condition_image_size = 384 * 384 self._image_input_name = input_name self._resized_image_output_name = output_name - self._vae_image_output_name = vae_image_output_name + self._target_area = target_area super().__init__() + @property + def description(self) -> str: + return ( + f"Image Resize step that resizes {self._image_input_name} to target area {self._target_area}.\n" + "Each image is resized independently based on its own aspect ratio." + ) + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec( + "image_resize_processor", + VaeImageProcessor, + config=FrozenDict({"vae_scale_factor": 16}), + default_creation_method="from_config", + ), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam( + name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image(s) to resize" + ), + ] + @property def intermediate_outputs(self) -> List[OutputParam]: - return super().intermediate_outputs + [ + return [ OutputParam( - name=self._vae_image_output_name, - type_hint=List[PIL.Image.Image], - description="The images to be processed which will be further used by the VAE encoder.", + name=self._resized_image_output_name, type_hint=List[PIL.Image.Image], description="The resized images" ), ] @@ -374,26 +395,21 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): if not is_valid_image_imagelist(images): raise ValueError(f"Images must be image or list of images but are {type(images)}") - if ( - not isinstance(images, torch.Tensor) - and isinstance(images, PIL.Image.Image) - and not isinstance(images, list) - ): + if is_valid_image(images): images = [images] - # TODO (sayakpaul): 
revisit this when the inputs are `torch.Tensor`s - condition_images = [] - vae_images = [] - for img in images: - image_width, image_height = img.size - condition_width, condition_height, _ = calculate_dimensions( - self.condition_image_size, image_width / image_height + # Resize each image independently based on its own aspect ratio + resized_images = [] + for image in images: + image_width, image_height = image.size + calculated_width, calculated_height, _ = calculate_dimensions( + self._target_area, image_width / image_height + ) + resized_images.append( + components.image_resize_processor.resize(image, height=calculated_height, width=calculated_width) ) - condition_images.append(components.image_resize_processor.resize(img, condition_height, condition_width)) - vae_images.append(img) - setattr(block_state, self._resized_image_output_name, condition_images) - setattr(block_state, self._vae_image_output_name, vae_images) + setattr(block_state, self._resized_image_output_name, resized_images) self.set_block_state(state, block_state) return components, state @@ -647,8 +663,30 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state -class QwenImageEditPlusTextEncoderStep(QwenImageEditTextEncoderStep): - model_name = "qwenimage" +class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks): + """Text encoder for QwenImage Edit Plus that handles multiple reference images.""" + + model_name = "qwenimage-edit-plus" + + @property + def description(self) -> str: + return ( + "Text Encoder step for QwenImage Edit Plus that processes prompt and multiple images together " + "to generate text embeddings for guiding image generation." 
+ ) + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec("text_encoder", Qwen2_5_VLForConditionalGeneration), + ComponentSpec("processor", Qwen2VLProcessor), + ComponentSpec( + "guider", + ClassifierFreeGuidance, + config=FrozenDict({"guidance_scale": 4.0}), + default_creation_method="from_config", + ), + ] @property def expected_configs(self) -> List[ConfigSpec]: @@ -664,6 +702,60 @@ def expected_configs(self) -> List[ConfigSpec]: ConfigSpec(name="prompt_template_encode_start_idx", default=64), ] + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"), + InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"), + InputParam( + name="resized_cond_image", + required=True, + type_hint=torch.Tensor, + description="The image(s) to encode, can be a single image or list of images, should be resized to 384x384 using resize step", + ), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + name="prompt_embeds", + kwargs_type="denoiser_input_fields", + type_hint=torch.Tensor, + description="The prompt embeddings", + ), + OutputParam( + name="prompt_embeds_mask", + kwargs_type="denoiser_input_fields", + type_hint=torch.Tensor, + description="The encoder attention mask", + ), + OutputParam( + name="negative_prompt_embeds", + kwargs_type="denoiser_input_fields", + type_hint=torch.Tensor, + description="The negative prompt embeddings", + ), + OutputParam( + name="negative_prompt_embeds_mask", + kwargs_type="denoiser_input_fields", + type_hint=torch.Tensor, + description="The negative prompt embeddings mask", + ), + ] + + @staticmethod + def check_inputs(prompt, negative_prompt): + if not isinstance(prompt, str) and not isinstance(prompt, list): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if ( + 
negative_prompt is not None + and not isinstance(negative_prompt, str) + and not isinstance(negative_prompt, list) + ): + raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}") + @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState): block_state = self.get_block_state(state) @@ -676,7 +768,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): components.text_encoder, components.processor, prompt=block_state.prompt, - image=block_state.resized_image, + image=block_state.resized_cond_image, prompt_template_encode=components.config.prompt_template_encode, img_template_encode=components.config.img_template_encode, prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, @@ -692,7 +784,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): components.text_encoder, components.processor, prompt=negative_prompt, - image=block_state.resized_image, + image=block_state.resized_cond_image, prompt_template_encode=components.config.prompt_template_encode, img_template_encode=components.config.img_template_encode, prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, @@ -846,60 +938,60 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): self.set_block_state(state, block_state) return components, state - -class QwenImageEditPlusProcessImagesInputStep(QwenImageProcessImagesInputStep): +class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): model_name = "qwenimage-edit-plus" - def __init__(self): - self.vae_image_size = 1024 * 1024 - super().__init__() - @property def description(self) -> str: - return "Image Preprocess step for QwenImage Edit Plus. Unlike QwenImage Edit, QwenImage Edit Plus doesn't use the same resized image for further preprocessing." + return "Image Preprocess step. 
Images can be resized first using QwenImageEditResizeDynamicStep." + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec( + "image_processor", + VaeImageProcessor, + config=FrozenDict({"vae_scale_factor": 16}), + default_creation_method="from_config", + ), + ] @property def inputs(self) -> List[InputParam]: - return [InputParam("vae_image"), InputParam("image"), InputParam("height"), InputParam("width")] + return [InputParam("resized_image")] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [OutputParam(name="processed_image")] @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState): block_state = self.get_block_state(state) - if block_state.vae_image is None and block_state.image is None: - raise ValueError("`vae_image` and `image` cannot be None at the same time") - vae_image_sizes = None - if block_state.vae_image is None: - image = block_state.image - self.check_inputs( - height=block_state.height, width=block_state.width, vae_scale_factor=components.vae_scale_factor - ) - height = block_state.height or components.default_height - width = block_state.width or components.default_width - block_state.processed_image = components.image_processor.preprocess( - image=image, height=height, width=width - ) - else: - # QwenImage Edit Plus can allow multiple input images with varied resolutions - processed_images = [] - vae_image_sizes = [] - for img in block_state.vae_image: - width, height = img.size - vae_width, vae_height, _ = calculate_dimensions(self.vae_image_size, width / height) - vae_image_sizes.append((vae_width, vae_height)) - processed_images.append( - components.image_processor.preprocess(image=img, height=vae_height, width=vae_width) - ) - block_state.processed_image = processed_images - block_state.vae_image_sizes = vae_image_sizes + image = block_state.resized_image + + is_image_list = isinstance(image, list) + if not is_image_list: + image = 
[image] + + processed_images = [] + for img in image: + img_width, img_height = img.size + processed_images.append(components.image_processor.preprocess(image=img, height=img_height, width=img_width)) + block_state.processed_image = processed_images + if is_image_list: + block_state.processed_image = processed_images + else: + block_state.processed_image = processed_images[0] self.set_block_state(state, block_state) return components, state - class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks): + """VAE encoder that handles both single images and lists of images with varied resolutions.""" + model_name = "qwenimage" def __init__( @@ -909,21 +1001,12 @@ def __init__( ): """Initialize a VAE encoder step for converting images to latent representations. - Both the input and output names are configurable so this block can be configured to process to different image - inputs (e.g., "processed_image" -> "image_latents", "processed_control_image" -> "control_image_latents"). + Handles both single images and lists of images. When input is a list, outputs a list of latents. + When input is a single tensor, outputs a single latent tensor. Args: - input_name (str, optional): Name of the input image tensor. Defaults to "processed_image". - Examples: "processed_image" or "processed_control_image" - output_name (str, optional): Name of the output latent tensor. Defaults to "image_latents". - Examples: "image_latents" or "control_image_latents" - - Examples: - # Basic usage with default settings (includes image processor) QwenImageVaeEncoderDynamicStep() - - # Custom input/output names for control image QwenImageVaeEncoderDynamicStep( - input_name="processed_control_image", output_name="control_image_latents" - ) + input_name (str, optional): Name of the input image tensor or list. Defaults to "processed_image". + output_name (str, optional): Name of the output latent tensor or list. Defaults to "image_latents". 
""" self._image_input_name = input_name self._image_latents_output_name = output_name @@ -931,17 +1014,18 @@ def __init__( @property def description(self) -> str: - return f"Dynamic VAE Encoder step that converts {self._image_input_name} into latent representations {self._image_latents_output_name}.\n" + return ( + f"VAE Encoder step that converts {self._image_input_name} into latent representations {self._image_latents_output_name}.\n" + "Handles both single images and lists of images with varied resolutions." + ) @property def expected_components(self) -> List[ComponentSpec]: - components = [ComponentSpec("vae", AutoencoderKLQwenImage)] - return components + return [ComponentSpec("vae", AutoencoderKLQwenImage)] @property def inputs(self) -> List[InputParam]: - inputs = [InputParam(self._image_input_name, required=True), InputParam("generator")] - return inputs + return [InputParam(self._image_input_name, required=True), InputParam("generator")] @property def intermediate_outputs(self) -> List[OutputParam]: @@ -949,46 +1033,7 @@ def intermediate_outputs(self) -> List[OutputParam]: OutputParam( self._image_latents_output_name, type_hint=torch.Tensor, - description="The latents representing the reference image", - ) - ] - - @torch.no_grad() - def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: - block_state = self.get_block_state(state) - - device = components._execution_device - dtype = components.vae.dtype - - image = getattr(block_state, self._image_input_name) - - # Encode image into latents - image_latents = encode_vae_image( - image=image, - vae=components.vae, - generator=block_state.generator, - device=device, - dtype=dtype, - latent_channels=components.num_channels_latents, - ) - setattr(block_state, self._image_latents_output_name, image_latents) - - self.set_block_state(state, block_state) - - return components, state - - -class QwenImageEditPlusVaeEncoderDynamicStep(QwenImageVaeEncoderDynamicStep): - 
model_name = "qwenimage-edit-plus" - - @property - def intermediate_outputs(self) -> List[OutputParam]: - # Each reference image latent can have varied resolutions hence we return this as a list. - return [ - OutputParam( - self._image_latents_output_name, - type_hint=List[torch.Tensor], - description="The latents representing the reference image(s).", + description="The latents representing the reference image(s). Single tensor or list depending on input.", ) ] @@ -1000,8 +1045,11 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - dtype = components.vae.dtype image = getattr(block_state, self._image_input_name) + is_image_list = isinstance(image, list) + if not is_image_list: + image = [image] - # Encode image into latents + # Handle both single image and list of images image_latents = [] for img in image: image_latents.append( @@ -1014,9 +1062,12 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - latent_channels=components.num_channels_latents, ) ) + if not is_image_list: + image_latents = image_latents[0] setattr(block_state, self._image_latents_output_name, image_latents) + self.set_block_state(state, block_state) return components, state diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index 6e656e484847..5c3df4909f56 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -222,36 +222,15 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - class QwenImageInputsDynamicStep(ModularPipelineBlocks): - model_name = "qwenimage" - - def __init__(self, image_latent_inputs: List[str] = ["image_latents"], additional_batch_inputs: List[str] = []): - """Initialize a configurable step that standardizes the inputs for the denoising step. It:\n" - - This step handles multiple common tasks to prepare inputs for the denoising step: - 1. 
For encoded image latents, use it update height/width if None, patchifies, and expands batch size - 2. For additional_batch_inputs: Only expands batch dimensions to match final batch size - - This is a dynamic block that allows you to configure which inputs to process. - - Args: - image_latent_inputs (List[str], optional): Names of image latent tensors to process. - These will be used to determine height/width, patchified, and batch-expanded. Can be a single string or - list of strings. Defaults to ["image_latents"]. Examples: ["image_latents"], ["control_image_latents"] - additional_batch_inputs (List[str], optional): - Names of additional conditional input tensors to expand batch size. These tensors will only have their - batch dimensions adjusted to match the final batch size. Can be a single string or list of strings. - Defaults to []. Examples: ["processed_mask_image"] + """Input step for QwenImage: update height/width, expand batch, patchify.""" - Examples: - # Configure to process image_latents (default behavior) QwenImageInputsDynamicStep() - - # Configure to process multiple image latent inputs - QwenImageInputsDynamicStep(image_latent_inputs=["image_latents", "control_image_latents"]) + model_name = "qwenimage" - # Configure to process image latents and additional batch inputs QwenImageInputsDynamicStep( - image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"] - ) - """ + def __init__( + self, + image_latent_inputs: List[str] = ["image_latents"], + additional_batch_inputs: List[str] = [], + ): if not isinstance(image_latent_inputs, list): image_latent_inputs = [image_latent_inputs] if not isinstance(additional_batch_inputs, list): @@ -263,14 +242,12 @@ def __init__(self, image_latent_inputs: List[str] = ["image_latents"], additiona @property def description(self) -> str: - # Functionality section summary_section = ( "Input processing step that:\n" - " 1. 
For image latent inputs: Updates height/width if None, patchifies latents, and expands batch size\n" + " 1. For image latent inputs: Updates height/width if None, patchifies, and expands batch size\n" " 2. For additional batch inputs: Expands batch dimensions to match final batch size" ) - # Inputs info inputs_info = "" if self._image_latent_inputs or self._additional_batch_inputs: inputs_info = "\n\nConfigured inputs:" @@ -279,11 +256,16 @@ def description(self) -> str: if self._additional_batch_inputs: inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}" - # Placement guidance placement_section = "\n\nThis block should be placed after the encoder steps and the text input step." return summary_section + inputs_info + placement_section + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"), + ] + @property def inputs(self) -> List[InputParam]: inputs = [ @@ -293,11 +275,9 @@ def inputs(self) -> List[InputParam]: InputParam(name="width"), ] - # Add image latent inputs for image_latent_input_name in self._image_latent_inputs: inputs.append(InputParam(name=image_latent_input_name)) - # Add additional batch inputs for input_name in self._additional_batch_inputs: inputs.append(InputParam(name=input_name)) @@ -310,22 +290,16 @@ def intermediate_outputs(self) -> List[OutputParam]: OutputParam(name="image_width", type_hint=int, description="The width of the image latents"), ] - @property - def expected_components(self) -> List[ComponentSpec]: - return [ - ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"), - ] - def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) - # Process image latent inputs (height/width calculation, patchify, and batch expansion) + # Process image latent inputs for image_latent_input_name in 
self._image_latent_inputs: image_latent_tensor = getattr(block_state, image_latent_input_name) if image_latent_tensor is None: continue - # 1. Calculate height/width from latents + # 1. Calculate height/width from latents and update if not provided height, width = calculate_dimension_from_latents(image_latent_tensor, components.vae_scale_factor) block_state.height = block_state.height or height block_state.width = block_state.width or width @@ -335,7 +309,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - if not hasattr(block_state, "image_width"): block_state.image_width = width - # 2. Patchify the image latent tensor + # 2. Patchify image_latent_tensor = components.pachifier.pack_latents(image_latent_tensor) # 3. Expand batch size @@ -354,7 +328,6 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - if input_tensor is None: continue - # Only expand batch size input_tensor = repeat_tensor_to_batch_size( input_name=input_name, input_tensor=input_tensor, @@ -368,63 +341,130 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state -class QwenImageEditPlusInputsDynamicStep(QwenImageInputsDynamicStep): +class QwenImageEditPlusInputsDynamicStep(ModularPipelineBlocks): + """Input step for QwenImage Edit Plus: handles list of latents with different sizes.""" + model_name = "qwenimage-edit-plus" + def __init__( + self, + image_latent_inputs: List[str] = ["image_latents"], + additional_batch_inputs: List[str] = [], + ): + if not isinstance(image_latent_inputs, list): + image_latent_inputs = [image_latent_inputs] + if not isinstance(additional_batch_inputs, list): + additional_batch_inputs = [additional_batch_inputs] + + self._image_latent_inputs = image_latent_inputs + self._additional_batch_inputs = additional_batch_inputs + super().__init__() + + @property + def description(self) -> str: + summary_section = ( + "Input processing step for Edit Plus that:\n" 
+ " 1. For image latent inputs (list): Collects heights/widths, patchifies each, concatenates, expands batch\n" + " 2. For additional batch inputs: Expands batch dimensions to match final batch size\n" + " Height/width defaults to last image in the list." + ) + + inputs_info = "" + if self._image_latent_inputs or self._additional_batch_inputs: + inputs_info = "\n\nConfigured inputs:" + if self._image_latent_inputs: + inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}" + if self._additional_batch_inputs: + inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}" + + placement_section = "\n\nThis block should be placed after the encoder steps and the text input step." + + return summary_section + inputs_info + placement_section + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"), + ] + + @property + def inputs(self) -> List[InputParam]: + inputs = [ + InputParam(name="num_images_per_prompt", default=1), + InputParam(name="batch_size", required=True), + InputParam(name="height"), + InputParam(name="width"), + ] + + for image_latent_input_name in self._image_latent_inputs: + inputs.append(InputParam(name=image_latent_input_name)) + + for input_name in self._additional_batch_inputs: + inputs.append(InputParam(name=input_name)) + + return inputs + @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="image_height", type_hint=List[int], description="The height of the image latents"), - OutputParam(name="image_width", type_hint=List[int], description="The width of the image latents"), + OutputParam(name="image_height", type_hint=List[int], description="The heights of the image latents"), + OutputParam(name="image_width", type_hint=List[int], description="The widths of the image latents"), ] def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> 
PipelineState: block_state = self.get_block_state(state) - # Process image latent inputs (height/width calculation, patchify, and batch expansion) + # Process image latent inputs for image_latent_input_name in self._image_latent_inputs: image_latent_tensor = getattr(block_state, image_latent_input_name) if image_latent_tensor is None: continue - # Each image latent can have different size in QwenImage Edit Plus. + is_list = isinstance(image_latent_tensor, list) + if not is_list: + image_latent_tensor = [image_latent_tensor] + image_heights = [] image_widths = [] packed_image_latent_tensors = [] - for img_latent_tensor in image_latent_tensor: + for i, img_latent_tensor in enumerate(image_latent_tensor): # 1. Calculate height/width from latents height, width = calculate_dimension_from_latents(img_latent_tensor, components.vae_scale_factor) image_heights.append(height) image_widths.append(width) - # 2. Patchify the image latent tensor + # 2. Patchify img_latent_tensor = components.pachifier.pack_latents(img_latent_tensor) # 3. 
Expand batch size img_latent_tensor = repeat_tensor_to_batch_size( - input_name=image_latent_input_name, + input_name=f"{image_latent_input_name}[{i}]", input_tensor=img_latent_tensor, num_images_per_prompt=block_state.num_images_per_prompt, batch_size=block_state.batch_size, ) packed_image_latent_tensors.append(img_latent_tensor) + # Concatenate all packed latents along dim=1 packed_image_latent_tensors = torch.cat(packed_image_latent_tensors, dim=1) + + # Output lists of heights/widths block_state.image_height = image_heights block_state.image_width = image_widths - setattr(block_state, image_latent_input_name, packed_image_latent_tensors) + # Default height/width from last image block_state.height = block_state.height or image_heights[-1] block_state.width = block_state.width or image_widths[-1] + setattr(block_state, image_latent_input_name, packed_image_latent_tensors) + # Process additional batch inputs (only batch expansion) for input_name in self._additional_batch_inputs: input_tensor = getattr(block_state, input_name) if input_tensor is None: continue - # Only expand batch size input_tensor = repeat_tensor_to_batch_size( input_name=input_name, input_tensor=input_tensor, @@ -436,8 +476,6 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - self.set_block_state(state, block_state) return components, state - - class QwenImageControlNetInputsStep(ModularPipelineBlocks): model_name = "qwenimage" diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index dcce0cab5dd1..dea9d36082c1 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -13,13 +13,11 @@ # limitations under the License. 
from ...utils import logging -from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks, ConditionalPipelineBlocks from ..modular_pipeline_utils import InsertableDict from .before_denoise import ( QwenImageControlNetBeforeDenoiserStep, QwenImageCreateMaskLatentsStep, - QwenImageEditPlusRoPEInputsStep, - QwenImageEditRoPEInputsStep, QwenImagePrepareLatentsStep, QwenImagePrepareLatentsWithStrengthStep, QwenImageRoPEInputsStep, @@ -35,20 +33,12 @@ from .denoise import ( QwenImageControlNetDenoiseStep, QwenImageDenoiseStep, - QwenImageEditDenoiseStep, - QwenImageEditInpaintDenoiseStep, QwenImageInpaintControlNetDenoiseStep, QwenImageInpaintDenoiseStep, QwenImageLoopBeforeDenoiserControlNet, ) from .encoders import ( QwenImageControlNetVaeEncoderStep, - QwenImageEditPlusProcessImagesInputStep, - QwenImageEditPlusResizeDynamicStep, - QwenImageEditPlusTextEncoderStep, - QwenImageEditPlusVaeEncoderDynamicStep, - QwenImageEditResizeDynamicStep, - QwenImageEditTextEncoderStep, QwenImageInpaintProcessImagesInputStep, QwenImageProcessImagesInputStep, QwenImageTextEncoderStep, @@ -56,7 +46,6 @@ ) from .inputs import ( QwenImageControlNetInputsStep, - QwenImageEditPlusInputsDynamicStep, QwenImageInputsDynamicStep, QwenImageTextInputsStep, ) @@ -64,63 +53,15 @@ logger = logging.get_logger(__name__) -# 1. QwenImage -## 1.1 QwenImage/text2image - -#### QwenImage/decode -#### (standard decode step works for most tasks except for inpaint) -QwenImageDecodeBlocks = InsertableDict( - [ - ("decode", QwenImageDecoderStep()), - ("postprocess", QwenImageProcessImagesOutputStep()), - ] -) - - -class QwenImageDecodeStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageDecodeBlocks.values() - block_names = QwenImageDecodeBlocks.keys() - - @property - def description(self): - return "Decode step that decodes the latents to images and postprocess the generated image." 
- - -#### QwenImage/text2image presets -TEXT2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", QwenImageTextEncoderStep()), - ("input", QwenImageTextInputsStep()), - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsStep()), - ("prepare_rope_inputs", QwenImageRoPEInputsStep()), - ("denoise", QwenImageDenoiseStep()), - ("after_denoise", QwenImageAfterDenoiseStep()), - ("decode", QwenImageDecodeStep()), - ] -) - - -## 1.2 QwenImage/inpaint - -#### QwenImage/inpaint vae encoder -QwenImageInpaintVaeEncoderBlocks = InsertableDict( - [ - ( - "preprocess", - QwenImageInpaintProcessImagesInputStep, - ), # image, mask_image -> processed_image, processed_mask_image, mask_overlay_kwargs - ("encode", QwenImageVaeEncoderDynamicStep()), # processed_image -> image_latents - ] -) +# 1. VAE ENCODER +# inpaint vae encoder class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): model_name = "qwenimage" - block_classes = QwenImageInpaintVaeEncoderBlocks.values() - block_names = QwenImageInpaintVaeEncoderBlocks.keys() + block_classes = [QwenImageInpaintProcessImagesInputStep(), QwenImageVaeEncoderDynamicStep()] + block_names = ["preprocess", "encode"] @property def description(self) -> str: @@ -132,174 +73,19 @@ def description(self) -> str: ) -#### QwenImage/inpaint inputs -QwenImageInpaintInputBlocks = InsertableDict( - [ - ("text_inputs", QwenImageTextInputsStep()), # default step to process text embeddings - ( - "additional_inputs", - QwenImageInputsDynamicStep( - image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"] - ), - ), - ] -) - - -class QwenImageInpaintInputStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageInpaintInputBlocks.values() - block_names = QwenImageInpaintInputBlocks.keys() - - @property - def description(self): - return "Input step that prepares the inputs for the inpainting denoising step. 
It:\n" - " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents` and `processed_mask_image`).\n" - " - update height/width based `image_latents`, patchify `image_latents`." - - -# QwenImage/inpaint prepare latents -QwenImageInpaintPrepareLatentsBlocks = InsertableDict( - [ - ("add_noise_to_latents", QwenImagePrepareLatentsWithStrengthStep()), - ("create_mask_latents", QwenImageCreateMaskLatentsStep()), - ] -) - - -class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageInpaintPrepareLatentsBlocks.values() - block_names = QwenImageInpaintPrepareLatentsBlocks.keys() - - @property - def description(self) -> str: - return ( - "This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It:\n" - " - Add noise to the image latents to create the latents input for the denoiser.\n" - " - Create the pachified latents `mask` based on the processedmask image.\n" - ) - - -#### QwenImage/inpaint decode -QwenImageInpaintDecodeBlocks = InsertableDict( - [ - ("decode", QwenImageDecoderStep()), - ("postprocess", QwenImageInpaintProcessImagesOutputStep()), - ] -) - - -class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageInpaintDecodeBlocks.values() - block_names = QwenImageInpaintDecodeBlocks.keys() - - @property - def description(self): - return "Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image." 
- - -#### QwenImage/inpaint presets -INPAINT_BLOCKS = InsertableDict( - [ - ("text_encoder", QwenImageTextEncoderStep()), - ("vae_encoder", QwenImageInpaintVaeEncoderStep()), - ("input", QwenImageInpaintInputStep()), - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()), - ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()), - ("prepare_rope_inputs", QwenImageRoPEInputsStep()), - ("denoise", QwenImageInpaintDenoiseStep()), - ("after_denoise", QwenImageAfterDenoiseStep()), - ("decode", QwenImageInpaintDecodeStep()), - ] -) - - -## 1.3 QwenImage/img2img - -#### QwenImage/img2img vae encoder -QwenImageImg2ImgVaeEncoderBlocks = InsertableDict( - [ - ("preprocess", QwenImageProcessImagesInputStep()), - ("encode", QwenImageVaeEncoderDynamicStep()), - ] -) - - +# img2img vae encoder class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): model_name = "qwenimage" - block_classes = QwenImageImg2ImgVaeEncoderBlocks.values() - block_names = QwenImageImg2ImgVaeEncoderBlocks.keys() + block_classes = [QwenImageProcessImagesInputStep(), QwenImageVaeEncoderDynamicStep()] + block_names = ["preprocess", "encode"] @property def description(self) -> str: return "Vae encoder step that preprocess andencode the image inputs into their latent representations." -#### QwenImage/img2img inputs -QwenImageImg2ImgInputBlocks = InsertableDict( - [ - ("text_inputs", QwenImageTextInputsStep()), # default step to process text embeddings - ("additional_inputs", QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"])), - ] -) - - -class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageImg2ImgInputBlocks.values() - block_names = QwenImageImg2ImgInputBlocks.keys() - - @property - def description(self): - return "Input step that prepares the inputs for the img2img denoising step. 
It:\n" - " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n" - " - update height/width based `image_latents`, patchify `image_latents`." - - -#### QwenImage/img2img presets -IMAGE2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", QwenImageTextEncoderStep()), - ("vae_encoder", QwenImageImg2ImgVaeEncoderStep()), - ("input", QwenImageImg2ImgInputStep()), - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()), - ("prepare_img2img_latents", QwenImagePrepareLatentsWithStrengthStep()), - ("prepare_rope_inputs", QwenImageRoPEInputsStep()), - ("denoise", QwenImageDenoiseStep()), - ("after_denoise", QwenImageAfterDenoiseStep()), - ("decode", QwenImageDecodeStep()), - ] -) - - -## 1.4 QwenImage/controlnet - -#### QwenImage/controlnet presets -CONTROLNET_BLOCKS = InsertableDict( - [ - ("controlnet_vae_encoder", QwenImageControlNetVaeEncoderStep()), # vae encoder step for control_image - ("controlnet_inputs", QwenImageControlNetInputsStep()), # additional input step for controlnet - ( - "controlnet_before_denoise", - QwenImageControlNetBeforeDenoiserStep(), - ), # before denoise step (after set_timesteps step) - ( - "controlnet_denoise_loop_before", - QwenImageLoopBeforeDenoiserControlNet(), - ), # controlnet loop step (insert before the denoiseloop_denoiser) - ] -) - - -## 1.5 QwenImage/auto encoders - - -#### for inpaint and img2img tasks +# auto vae encoder class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks): block_classes = [QwenImageInpaintVaeEncoderStep, QwenImageImg2ImgVaeEncoderStep] block_names = ["inpaint", "img2img"] @@ -316,7 +102,7 @@ def description(self): ) -# for controlnet tasks +# optional controlnet vae encoder class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): block_classes = [QwenImageControlNetVaeEncoderStep] block_names = ["controlnet"] @@ -331,783 +117,349 @@ def description(self): + " - if `control_image` is 
not provided, step will be skipped." ) +# 2. DENOISE +# input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise -## 1.6 QwenImage/auto inputs - - -# text2image/inpaint/img2img -class QwenImageAutoInputStep(AutoPipelineBlocks): - block_classes = [QwenImageInpaintInputStep, QwenImageImg2ImgInputStep, QwenImageTextInputsStep] - block_names = ["inpaint", "img2img", "text2image"] - block_trigger_inputs = ["processed_mask_image", "image_latents", None] - - @property - def description(self): - return ( - "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n" - " This is an auto pipeline block that works for text2image/inpaint/img2img tasks.\n" - + " - `QwenImageInpaintInputStep` (inpaint) is used when `processed_mask_image` is provided.\n" - + " - `QwenImageImg2ImgInputStep` (img2img) is used when `image_latents` is provided.\n" - + " - `QwenImageTextInputsStep` (text2image) is used when both `processed_mask_image` and `image_latents` are not provided.\n" - ) - - -# controlnet -class QwenImageOptionalControlNetInputStep(AutoPipelineBlocks): - block_classes = [QwenImageControlNetInputsStep] - block_names = ["controlnet"] - block_trigger_inputs = ["control_image_latents"] - - @property - def description(self): - return ( - "Controlnet input step that prepare the control_image_latents input.\n" - + "This is an auto pipeline block.\n" - + " - `QwenImageControlNetInputsStep` (controlnet) is used when `control_image_latents` is provided.\n" - + " - if `control_image_latents` is not provided, step will be skipped." 
- ) - - -## 1.7 QwenImage/auto before denoise step -# compose the steps into a BeforeDenoiseStep for text2image/img2img/inpaint tasks before combine into an auto step - -# QwenImage/text2image before denoise -QwenImageText2ImageBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsStep()), - ("prepare_rope_inputs", QwenImageRoPEInputsStep()), - ] -) - - -class QwenImageText2ImageBeforeDenoiseStep(SequentialPipelineBlocks): +# img2img input +class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): model_name = "qwenimage" - block_classes = QwenImageText2ImageBeforeDenoiseBlocks.values() - block_names = QwenImageText2ImageBeforeDenoiseBlocks.keys() + block_classes = [QwenImageTextInputsStep(), QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"])] + block_names = ["text_inputs", "additional_inputs"] @property def description(self): - return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for text2image task." - - -# QwenImage/inpaint before denoise -QwenImageInpaintBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()), - ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()), - ("prepare_rope_inputs", QwenImageRoPEInputsStep()), - ] -) + return "Input step that prepares the inputs for the img2img denoising step. It:\n" + " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n" + " - update height/width based `image_latents`, patchify `image_latents`." 
-class QwenImageInpaintBeforeDenoiseStep(SequentialPipelineBlocks): +# inpaint input +class QwenImageInpaintInputStep(SequentialPipelineBlocks): model_name = "qwenimage" - block_classes = QwenImageInpaintBeforeDenoiseBlocks.values() - block_names = QwenImageInpaintBeforeDenoiseBlocks.keys() + block_classes = [QwenImageTextInputsStep(), QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"])] + block_names = ["text_inputs", "additional_inputs"] @property def description(self): - return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task." - - -# QwenImage/img2img before denoise -QwenImageImg2ImgBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()), - ("prepare_img2img_latents", QwenImagePrepareLatentsWithStrengthStep()), - ("prepare_rope_inputs", QwenImageRoPEInputsStep()), - ] -) - + return "Input step that prepares the inputs for the inpainting denoising step. It:\n" + " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents` and `processed_mask_image`).\n" + " - update height/width based `image_latents`, patchify `image_latents`." -class QwenImageImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks): +# inpaint prepare latents +class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): model_name = "qwenimage" - block_classes = QwenImageImg2ImgBeforeDenoiseBlocks.values() - block_names = QwenImageImg2ImgBeforeDenoiseBlocks.keys() - - @property - def description(self): - return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task." 
- - -# auto before_denoise step for text2image, inpaint, img2img tasks -class QwenImageAutoBeforeDenoiseStep(AutoPipelineBlocks): - block_classes = [ - QwenImageInpaintBeforeDenoiseStep, - QwenImageImg2ImgBeforeDenoiseStep, - QwenImageText2ImageBeforeDenoiseStep, - ] - block_names = ["inpaint", "img2img", "text2image"] - block_trigger_inputs = ["processed_mask_image", "image_latents", None] - - @property - def description(self): - return ( - "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step.\n" - + "This is an auto pipeline block that works for text2img, inpainting, img2img tasks.\n" - + " - `QwenImageInpaintBeforeDenoiseStep` (inpaint) is used when `processed_mask_image` is provided.\n" - + " - `QwenImageImg2ImgBeforeDenoiseStep` (img2img) is used when `image_latents` is provided.\n" - + " - `QwenImageText2ImageBeforeDenoiseStep` (text2image) is used when both `processed_mask_image` and `image_latents` are not provided.\n" - ) - - -# auto before_denoise step for controlnet tasks -class QwenImageOptionalControlNetBeforeDenoiseStep(AutoPipelineBlocks): - block_classes = [QwenImageControlNetBeforeDenoiserStep] - block_names = ["controlnet"] - block_trigger_inputs = ["control_image_latents"] - - @property - def description(self): - return ( - "Controlnet before denoise step that prepare the controlnet input.\n" - + "This is an auto pipeline block.\n" - + " - `QwenImageControlNetBeforeDenoiserStep` (controlnet) is used when `control_image_latents` is provided.\n" - + " - if `control_image_latents` is not provided, step will be skipped." 
- ) - - -## 1.8 QwenImage/auto denoise - - -# auto denoise step for controlnet tasks: works for all tasks with controlnet -class QwenImageControlNetAutoDenoiseStep(AutoPipelineBlocks): - block_classes = [QwenImageInpaintControlNetDenoiseStep, QwenImageControlNetDenoiseStep] - block_names = ["inpaint_denoise", "denoise"] - block_trigger_inputs = ["mask", None] - - @property - def description(self): - return ( - "Controlnet step during the denoising process. \n" - " This is an auto pipeline block that works for inpaint and text2image/img2img tasks with controlnet.\n" - + " - `QwenImageInpaintControlNetDenoiseStep` (inpaint) is used when `mask` is provided.\n" - + " - `QwenImageControlNetDenoiseStep` (text2image/img2img) is used when `mask` is not provided.\n" - ) - - -# auto denoise step for everything: works for all tasks with or without controlnet -class QwenImageAutoDenoiseStep(AutoPipelineBlocks): - block_classes = [ - QwenImageControlNetAutoDenoiseStep, - QwenImageInpaintDenoiseStep, - QwenImageDenoiseStep, - ] - block_names = ["controlnet_denoise", "inpaint_denoise", "denoise"] - block_trigger_inputs = ["control_image_latents", "mask", None] + block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()] + block_names = ["add_noise_to_latents", "create_mask_latents"] @property - def description(self): - return ( - "Denoise step that iteratively denoise the latents. \n" - " This is an auto pipeline block that works for inpaint/text2image/img2img tasks. 
It also works with controlnet\n" - + " - `QwenImageControlNetAutoDenoiseStep` (controlnet) is used when `control_image_latents` is provided.\n" - + " - `QwenImageInpaintDenoiseStep` (inpaint) is used when `mask` is provided and `control_image_latents` is not provided.\n" - + " - `QwenImageDenoiseStep` (text2image/img2img) is used when `mask` is not provided and `control_image_latents` is not provided.\n" - ) - - -## 1.9 QwenImage/auto decode -# auto decode step for inpaint and text2image tasks - - -class QwenImageAutoDecodeStep(AutoPipelineBlocks): - block_classes = [QwenImageInpaintDecodeStep, QwenImageDecodeStep] - block_names = ["inpaint_decode", "decode"] - block_trigger_inputs = ["mask", None] - - @property - def description(self): + def description(self) -> str: return ( - "Decode step that decode the latents into images. \n" - " This is an auto pipeline block that works for inpaint/text2image/img2img tasks, for both QwenImage and QwenImage-Edit.\n" - + " - `QwenImageInpaintDecodeStep` (inpaint) is used when `mask` is provided.\n" - + " - `QwenImageDecodeStep` (text2image/img2img) is used when `mask` is not provided.\n" + "This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It:\n" + " - Add noise to the image latents to create the latents input for the denoiser.\n" + " - Create the pachified latents `mask` based on the processedmask image.\n" ) +# CoreDenoiseStep: +# (input + prepare_latents + set_timesteps + prepare_rope_inputs + denoise + after_denoise) +# 1. 
text2image class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): model_name = "qwenimage" block_classes = [ - QwenImageAutoInputStep, - QwenImageOptionalControlNetInputStep, - QwenImageAutoBeforeDenoiseStep, - QwenImageOptionalControlNetBeforeDenoiseStep, - QwenImageAutoDenoiseStep, - QwenImageAfterDenoiseStep, + QwenImageTextInputsStep(), + QwenImagePrepareLatentsStep(), + QwenImageSetTimestepsStep(), + QwenImageRoPEInputsStep(), + QwenImageDenoiseStep(), + QwenImageAfterDenoiseStep(), ] block_names = [ "input", - "controlnet_input", - "before_denoise", - "controlnet_before_denoise", - "denoise", + "prepare_latents", + "set_timesteps", + "prepare_rope_inputs", + "denoise", "after_denoise", ] @property def description(self): - return ( - "Core step that performs the denoising process. \n" - + " - `QwenImageAutoInputStep` (input) standardizes the inputs for the denoising step.\n" - + " - `QwenImageOptionalControlNetInputStep` (controlnet_input) prepares the controlnet input.\n" - + " - `QwenImageAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" - + " - `QwenImageOptionalControlNetBeforeDenoiseStep` (controlnet_before_denoise) prepares the controlnet input for the denoising step.\n" - + " - `QwenImageAutoDenoiseStep` (denoise) iteratively denoises the latents.\n" - + "This step support text-to-image, image-to-image, inpainting, and controlnet tasks for QwenImage:\n" - + " - for image-to-image generation, you need to provide `image_latents`\n" - + " - for inpainting, you need to provide `processed_mask_image` and `image_latents`\n" - + " - to run the controlnet workflow, you need to provide `control_image_latents`\n" - + " - for text-to-image generation, all you need to provide is prompt embeddings" - ) - - -## 1.10 QwenImage/auto block & presets -AUTO_BLOCKS = InsertableDict( - [ - ("text_encoder", QwenImageTextEncoderStep()), - ("vae_encoder", QwenImageAutoVaeEncoderStep()), - ("controlnet_vae_encoder", 
QwenImageOptionalControlNetVaeEncoderStep()), - ("denoise", QwenImageCoreDenoiseStep()), - ("decode", QwenImageAutoDecodeStep()), - ] -) + return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)." -class QwenImageAutoBlocks(SequentialPipelineBlocks): +# 2.inpaint +class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): model_name = "qwenimage" - - block_classes = AUTO_BLOCKS.values() - block_names = AUTO_BLOCKS.keys() + block_classes = [ + QwenImageInpaintInputStep(), + QwenImagePrepareLatentsStep(), + QwenImageSetTimestepsWithStrengthStep(), + QwenImageInpaintPrepareLatentsStep(), + QwenImageRoPEInputsStep(), + QwenImageInpaintDenoiseStep(), + QwenImageAfterDenoiseStep(), + ] + block_names = [ + "input", + "prepare_latents", + "set_timesteps", + "prepare_inpaint_latents", + "prepare_rope_inputs", + "denoise", + "after_denoise", + ] @property def description(self): - return ( - "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.\n" - + "- for image-to-image generation, you need to provide `image`\n" - + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n" - + "- to run the controlnet workflow, you need to provide `control_image`\n" - + "- for text-to-image generation, all you need to provide is `prompt`" - ) - - -# 2. 
QwenImage-Edit - -## 2.1 QwenImage-Edit/edit - -#### QwenImage-Edit/edit vl encoder: take both image and text prompts -QwenImageEditVLEncoderBlocks = InsertableDict( - [ - ("resize", QwenImageEditResizeDynamicStep()), - ("encode", QwenImageEditTextEncoderStep()), - ] -) - - -class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageEditVLEncoderBlocks.values() - block_names = QwenImageEditVLEncoderBlocks.keys() - - @property - def description(self) -> str: - return "QwenImage-Edit VL encoder step that encode the image an text prompts together." - - -#### QwenImage-Edit/edit vae encoder -QwenImageEditVaeEncoderBlocks = InsertableDict( - [ - ("resize", QwenImageEditResizeDynamicStep()), # edit has a different resize step - ("preprocess", QwenImageProcessImagesInputStep()), # resized_image -> processed_image - ("encode", QwenImageVaeEncoderDynamicStep()), # processed_image -> image_latents - ] -) - - -class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageEditVaeEncoderBlocks.values() - block_names = QwenImageEditVaeEncoderBlocks.keys() - - @property - def description(self) -> str: - return "Vae encoder step that encode the image inputs into their latent representations." - - -#### QwenImage-Edit/edit input -QwenImageEditInputBlocks = InsertableDict( - [ - ("text_inputs", QwenImageTextInputsStep()), # default step to process text embeddings - ("additional_inputs", QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"])), - ] -) + return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task." -class QwenImageEditInputStep(SequentialPipelineBlocks): +# 3. 
img2img +class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): model_name = "qwenimage" - block_classes = QwenImageEditInputBlocks.values() - block_names = QwenImageEditInputBlocks.keys() + block_classes = [ + QwenImageImg2ImgInputStep(), + QwenImagePrepareLatentsStep(), + QwenImageSetTimestepsWithStrengthStep(), + QwenImagePrepareLatentsWithStrengthStep(), + QwenImageRoPEInputsStep(), + QwenImageDenoiseStep(), + QwenImageAfterDenoiseStep(), + ] + block_names = [ + "input", + "prepare_latents", + "set_timesteps", + "prepare_img2img_latents", + "prepare_rope_inputs", + "denoise", + "after_denoise", + ] @property def description(self): - return "Input step that prepares the inputs for the edit denoising step. It:\n" - " - make sure the text embeddings have consistent batch size as well as the additional inputs: \n" - " - `image_latents`.\n" - " - update height/width based `image_latents`, patchify `image_latents`." - - -#### QwenImage/edit presets -EDIT_BLOCKS = InsertableDict( - [ - ("text_encoder", QwenImageEditVLEncoderStep()), - ("vae_encoder", QwenImageEditVaeEncoderStep()), - ("input", QwenImageEditInputStep()), - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsStep()), - ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()), - ("denoise", QwenImageEditDenoiseStep()), - ("after_denoise", QwenImageAfterDenoiseStep()), - ("decode", QwenImageDecodeStep()), - ] -) + return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task." 
-## 2.2 QwenImage-Edit/edit inpaint -#### QwenImage-Edit/edit inpaint vae encoder: the difference from regular inpaint is the resize step -QwenImageEditInpaintVaeEncoderBlocks = InsertableDict( - [ - ("resize", QwenImageEditResizeDynamicStep()), # image -> resized_image - ( - "preprocess", - QwenImageInpaintProcessImagesInputStep, - ), # resized_image, mask_image -> processed_image, processed_mask_image, mask_overlay_kwargs - ( - "encode", - QwenImageVaeEncoderDynamicStep(input_name="processed_image", output_name="image_latents"), - ), # processed_image -> image_latents - ] -) - - -class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): +# 4. text2image + controlnet +class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): model_name = "qwenimage" - block_classes = QwenImageEditInpaintVaeEncoderBlocks.values() - block_names = QwenImageEditInpaintVaeEncoderBlocks.keys() - - @property - def description(self) -> str: - return ( - "This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It:\n" - " - resize the image for target area (1024 * 1024) while maintaining the aspect ratio.\n" - " - process the resized image and mask image.\n" - " - create image latents." 
- ) - - -#### QwenImage-Edit/edit inpaint presets -EDIT_INPAINT_BLOCKS = InsertableDict( - [ - ("text_encoder", QwenImageEditVLEncoderStep()), - ("vae_encoder", QwenImageEditInpaintVaeEncoderStep()), - ("input", QwenImageInpaintInputStep()), - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()), - ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()), - ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()), - ("denoise", QwenImageEditInpaintDenoiseStep()), - ("after_denoise", QwenImageAfterDenoiseStep()), - ("decode", QwenImageInpaintDecodeStep()), - ] -) - - -## 2.3 QwenImage-Edit/auto encoders - - -class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks): block_classes = [ - QwenImageEditInpaintVaeEncoderStep, - QwenImageEditVaeEncoderStep, + QwenImageTextInputsStep(), + QwenImageControlNetInputsStep(), + QwenImagePrepareLatentsStep(), + QwenImageSetTimestepsStep(), + QwenImageRoPEInputsStep(), + QwenImageControlNetBeforeDenoiserStep(), + QwenImageControlNetDenoiseStep(), + QwenImageAfterDenoiseStep(), ] - block_names = ["edit_inpaint", "edit"] - block_trigger_inputs = ["mask_image", "image"] - - @property - def description(self): - return ( - "Vae encoder step that encode the image inputs into their latent representations. \n" - " This is an auto pipeline block that works for edit and edit_inpaint tasks.\n" - + " - `QwenImageEditInpaintVaeEncoderStep` (edit_inpaint) is used when `mask_image` is provided.\n" - + " - `QwenImageEditVaeEncoderStep` (edit) is used when `image` is provided.\n" - + " - if `mask_image` or `image` is not provided, step will be skipped." 
- ) - - -## 2.4 QwenImage-Edit/auto inputs -class QwenImageEditAutoInputStep(AutoPipelineBlocks): - block_classes = [QwenImageInpaintInputStep, QwenImageEditInputStep] - block_names = ["edit_inpaint", "edit"] - block_trigger_inputs = ["processed_mask_image", "image_latents"] - - @property - def description(self): - return ( - "Input step that prepares the inputs for the edit denoising step.\n" - + " It is an auto pipeline block that works for edit and edit_inpaint tasks.\n" - + " - `QwenImageInpaintInputStep` (edit_inpaint) is used when `processed_mask_image` is provided.\n" - + " - `QwenImageEditInputStep` (edit) is used when `image_latents` is provided.\n" - + " - if `processed_mask_image` or `image_latents` is not provided, step will be skipped." - ) - - -## 2.5 QwenImage-Edit/auto before denoise -# compose the steps into a BeforeDenoiseStep for edit and edit_inpaint tasks before combine into an auto step - -#### QwenImage-Edit/edit before denoise -QwenImageEditBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsStep()), - ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()), + block_names = [ + "input", + "controlnet_input", + "prepare_latents", + "set_timesteps", + "prepare_rope_inputs", + "controlnet_before_denoise", + "controlnet_denoise", + "after_denoise", ] -) - - -class QwenImageEditBeforeDenoiseStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageEditBeforeDenoiseBlocks.values() - block_names = QwenImageEditBeforeDenoiseBlocks.keys() @property def description(self): - return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for edit task." + return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)." 
-#### QwenImage-Edit/edit inpaint before denoise -QwenImageEditInpaintBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()), - ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()), - ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()), - ] -) - - -class QwenImageEditInpaintBeforeDenoiseStep(SequentialPipelineBlocks): +# 5. inpaint + controlnet +class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): model_name = "qwenimage" - block_classes = QwenImageEditInpaintBeforeDenoiseBlocks.values() - block_names = QwenImageEditInpaintBeforeDenoiseBlocks.keys() - - @property - def description(self): - return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for edit inpaint task." - - -# auto before_denoise step for edit and edit_inpaint tasks -class QwenImageEditAutoBeforeDenoiseStep(AutoPipelineBlocks): - model_name = "qwenimage-edit" block_classes = [ - QwenImageEditInpaintBeforeDenoiseStep, - QwenImageEditBeforeDenoiseStep, - ] - block_names = ["edit_inpaint", "edit"] - block_trigger_inputs = ["processed_mask_image", "image_latents"] + QwenImageInpaintInputStep(), + QwenImageControlNetInputsStep(), + QwenImagePrepareLatentsStep(), + QwenImageSetTimestepsWithStrengthStep(), + QwenImageInpaintPrepareLatentsStep(), + QwenImageRoPEInputsStep(), + QwenImageControlNetBeforeDenoiserStep(), + QwenImageInpaintControlNetDenoiseStep(), + QwenImageAfterDenoiseStep(), + ] + block_names = [ + "input", + "controlnet_input", + "prepare_latents", + "set_timesteps", + "prepare_inpaint_latents", + "prepare_rope_inputs", + "controlnet_before_denoise", + "controlnet_denoise", + "after_denoise", + ] @property def description(self): - return ( - "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) 
for the denoise step.\n" - + "This is an auto pipeline block that works for edit (img2img) and edit inpaint tasks.\n" - + " - `QwenImageEditInpaintBeforeDenoiseStep` (edit_inpaint) is used when `processed_mask_image` is provided.\n" - + " - `QwenImageEditBeforeDenoiseStep` (edit) is used when `image_latents` is provided and `processed_mask_image` is not provided.\n" - + " - if `image_latents` or `processed_mask_image` is not provided, step will be skipped." - ) - - -## 2.6 QwenImage-Edit/auto denoise - + return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task." -class QwenImageEditAutoDenoiseStep(AutoPipelineBlocks): - model_name = "qwenimage-edit" - block_classes = [QwenImageEditInpaintDenoiseStep, QwenImageEditDenoiseStep] - block_names = ["inpaint_denoise", "denoise"] - block_trigger_inputs = ["processed_mask_image", "image_latents"] +# 6. img2img + controlnet +class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): + model_name = "qwenimage" + block_classes = [ + QwenImageImg2ImgInputStep(), + QwenImageControlNetInputsStep(), + QwenImagePrepareLatentsStep(), + QwenImageSetTimestepsWithStrengthStep(), + QwenImagePrepareLatentsWithStrengthStep(), + QwenImageRoPEInputsStep(), + QwenImageControlNetBeforeDenoiserStep(), + QwenImageControlNetDenoiseStep(), + QwenImageAfterDenoiseStep(), + ] + block_names = [ + "input", + "controlnet_input", + "prepare_latents", + "set_timesteps", + "prepare_img2img_latents", + "prepare_rope_inputs", + "controlnet_before_denoise", + "controlnet_denoise", + "after_denoise", + ] @property def description(self): - return ( - "Denoise step that iteratively denoise the latents. \n" - + "This block supports edit (img2img) and edit inpaint tasks for QwenImage Edit. 
\n" - + " - `QwenImageEditInpaintDenoiseStep` (inpaint) is used when `processed_mask_image` is provided.\n" - + " - `QwenImageEditDenoiseStep` (img2img) is used when `image_latents` is provided.\n" - + " - if `processed_mask_image` or `image_latents` is not provided, step will be skipped." - ) - - -## 2.7 QwenImage-Edit/auto blocks & presets + return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task." -class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): - model_name = "qwenimage-edit" +# auto denoise +# auto denoise step for controlnet tasks: works for all tasks with controlnet +class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks): block_classes = [ - QwenImageEditAutoInputStep, - QwenImageEditAutoBeforeDenoiseStep, - QwenImageEditAutoDenoiseStep, - QwenImageAfterDenoiseStep, + QwenImageCoreDenoiseStep, + QwenImageInpaintCoreDenoiseStep, + QwenImageImg2ImgCoreDenoiseStep, + QwenImageControlNetCoreDenoiseStep, + QwenImageControlNetInpaintCoreDenoiseStep, + QwenImageControlNetImg2ImgCoreDenoiseStep, ] - block_names = ["input", "before_denoise", "denoise", "after_denoise"] + block_names = [ + "text2image", + "inpaint", + "img2img", + "controlnet_text2image", + "controlnet_inpaint", + "controlnet_img2img"] + block_trigger_inputs = ["control_image_latents", "processed_mask_image", "image_latents"] + default_block_name = "text2image" + + def select_block(self, control_image_latents=None, processed_mask_image=None, image_latents=None): + + if control_image_latents is not None: + if processed_mask_image is not None: + return "controlnet_inpaint" + elif image_latents is not None: + return "controlnet_img2img" + else: + return "controlnet_text2image" + else: + if processed_mask_image is not None: + return "inpaint" + elif image_latents is not None: + return "img2img" + else: + return "text2image" @property def description(self): return ( "Core step that performs the denoising process. 
\n" - + " - `QwenImageEditAutoInputStep` (input) standardizes the inputs for the denoising step.\n" - + " - `QwenImageEditAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" - + " - `QwenImageEditAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n" - + "This step support edit (img2img) and edit inpainting workflow for QwenImage Edit:\n" - + " - When `processed_mask_image` is provided, it will be used for edit inpainting task.\n" - + " - When `image_latents` is provided, it will be used for edit (img2img) task.\n" - ) - - -EDIT_AUTO_BLOCKS = InsertableDict( - [ - ("text_encoder", QwenImageEditVLEncoderStep()), - ("vae_encoder", QwenImageEditAutoVaeEncoderStep()), - ("denoise", QwenImageEditCoreDenoiseStep()), - ("decode", QwenImageAutoDecodeStep()), - ] -) - - -class QwenImageEditAutoBlocks(SequentialPipelineBlocks): - model_name = "qwenimage-edit" - block_classes = EDIT_AUTO_BLOCKS.values() - block_names = EDIT_AUTO_BLOCKS.keys() - - @property - def description(self): - return ( - "Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.\n" - + "- for edit (img2img) generation, you need to provide `image`\n" - + "- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n" + + " - `QwenImageCoreDenoiseStep` (text2image) for text2image tasks.\n" + + " - `QwenImageInpaintCoreDenoiseStep` (inpaint) for inpaint tasks.\n" + + " - `QwenImageImg2ImgCoreDenoiseStep` (img2img) for img2img tasks.\n" + + " - `QwenImageControlNetCoreDenoiseStep` (controlnet_text2image) for text2image tasks with controlnet.\n" + + " - `QwenImageControlNetInpaintCoreDenoiseStep` (controlnet_inpaint) for inpaint tasks with controlnet.\n" + + " - `QwenImageControlNetImg2ImgCoreDenoiseStep` (controlnet_img2img) for img2img tasks with controlnet.\n" + + "This step support text-to-image, image-to-image, inpainting, and controlnet tasks for QwenImage:\n" + + " - for 
image-to-image generation, you need to provide `image_latents`\n" + + " - for inpainting, you need to provide `processed_mask_image` and `image_latents`\n" + + " - to run the controlnet workflow, you need to provide `control_image_latents`\n" + + " - for text-to-image generation, all you need to provide is prompt embeddings" ) -#################### QwenImage Edit Plus ##################### - -# 3. QwenImage-Edit Plus +# 4. DECODE -## 3.1 QwenImage-Edit Plus / edit - -#### QwenImage-Edit Plus vl encoder: take both image and text prompts -QwenImageEditPlusVLEncoderBlocks = InsertableDict( - [ - ("resize", QwenImageEditPlusResizeDynamicStep()), - ("encode", QwenImageEditPlusTextEncoderStep()), - ] -) +## 1.1 text2image +#### decode +#### (standard decode step works for most tasks except for inpaint) -class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): +class QwenImageDecodeStep(SequentialPipelineBlocks): model_name = "qwenimage" - block_classes = QwenImageEditPlusVLEncoderBlocks.values() - block_names = QwenImageEditPlusVLEncoderBlocks.keys() - - @property - def description(self) -> str: - return "QwenImage-Edit Plus VL encoder step that encode the image an text prompts together." - - -#### QwenImage-Edit Plus vae encoder -QwenImageEditPlusVaeEncoderBlocks = InsertableDict( - [ - ("resize", QwenImageEditPlusResizeDynamicStep()), # edit plus has a different resize step - ("preprocess", QwenImageEditPlusProcessImagesInputStep()), # vae_image -> processed_image - ("encode", QwenImageEditPlusVaeEncoderDynamicStep()), # processed_image -> image_latents - ] -) - - -class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): - model_name = "qwenimage-edit-plus" - block_classes = QwenImageEditPlusVaeEncoderBlocks.values() - block_names = QwenImageEditPlusVaeEncoderBlocks.keys() - - @property - def description(self) -> str: - return "Vae encoder step that encode the image inputs into their latent representations." 
- - -#### QwenImage Edit Plus input blocks -QwenImageEditPlusInputBlocks = InsertableDict( - [ - ("text_inputs", QwenImageTextInputsStep()), # default step to process text embeddings - ( - "additional_inputs", - QwenImageEditPlusInputsDynamicStep(image_latent_inputs=["image_latents"]), - ), - ] -) - - -class QwenImageEditPlusInputStep(SequentialPipelineBlocks): - model_name = "qwenimage-edit-plus" - block_classes = QwenImageEditPlusInputBlocks.values() - block_names = QwenImageEditPlusInputBlocks.keys() - - -#### QwenImage Edit Plus presets -EDIT_PLUS_BLOCKS = InsertableDict( - [ - ("text_encoder", QwenImageEditPlusVLEncoderStep()), - ("vae_encoder", QwenImageEditPlusVaeEncoderStep()), - ("input", QwenImageEditPlusInputStep()), - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsStep()), - ("prepare_rope_inputs", QwenImageEditPlusRoPEInputsStep()), - ("denoise", QwenImageEditDenoiseStep()), - ("after_denoise", QwenImageAfterDenoiseStep()), - ("decode", QwenImageDecodeStep()), - ] -) - - -QwenImageEditPlusBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsStep()), - ("prepare_rope_inputs", QwenImageEditPlusRoPEInputsStep()), - ] -) - - -class QwenImageEditPlusBeforeDenoiseStep(SequentialPipelineBlocks): - model_name = "qwenimage-edit-plus" - block_classes = QwenImageEditPlusBeforeDenoiseBlocks.values() - block_names = QwenImageEditPlusBeforeDenoiseBlocks.keys() + block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] + block_names = ["decode", "postprocess"] @property def description(self): - return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for edit task." 
- - -# auto before_denoise step for edit tasks -class QwenImageEditPlusAutoBeforeDenoiseStep(AutoPipelineBlocks): - model_name = "qwenimage-edit-plus" - block_classes = [QwenImageEditPlusBeforeDenoiseStep] - block_names = ["edit"] - block_trigger_inputs = ["image_latents"] - - @property - def description(self): - return ( - "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step.\n" - + "This is an auto pipeline block that works for edit (img2img) task.\n" - + " - `QwenImageEditPlusBeforeDenoiseStep` (edit) is used when `image_latents` is provided and `processed_mask_image` is not provided.\n" - + " - if `image_latents` is not provided, step will be skipped." - ) + return "Decode step that decodes the latents to images and postprocess the generated image." -## 3.2 QwenImage-Edit Plus/auto encoders +#### inpaint decode -class QwenImageEditPlusAutoVaeEncoderStep(AutoPipelineBlocks): - block_classes = [QwenImageEditPlusVaeEncoderStep] - block_names = ["edit"] - block_trigger_inputs = ["image"] +class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): + model_name = "qwenimage" + block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()] + block_names = ["decode", "postprocess"] @property def description(self): - return ( - "Vae encoder step that encode the image inputs into their latent representations. \n" - " This is an auto pipeline block that works for edit task.\n" - + " - `QwenImageEditPlusVaeEncoderStep` (edit) is used when `image` is provided.\n" - + " - if `image` is not provided, step will be skipped." - ) - - -## 3.3 QwenImage-Edit/auto blocks & presets + return "Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image." 
-class QwenImageEditPlusAutoInputStep(AutoPipelineBlocks): - block_classes = [QwenImageEditPlusInputStep] - block_names = ["edit"] - block_trigger_inputs = ["image_latents"] +# auto decode step for inpaint and text2image tasks +class QwenImageAutoDecodeStep(AutoPipelineBlocks): + block_classes = [QwenImageInpaintDecodeStep, QwenImageDecodeStep] + block_names = ["inpaint_decode", "decode"] + block_trigger_inputs = ["mask", None] @property def description(self): return ( - "Input step that prepares the inputs for the edit denoising step.\n" - + " It is an auto pipeline block that works for edit task.\n" - + " - `QwenImageEditPlusInputStep` (edit) is used when `image_latents` is provided.\n" - + " - if `image_latents` is not provided, step will be skipped." + "Decode step that decode the latents into images. \n" + " This is an auto pipeline block that works for inpaint/text2image/img2img tasks, for both QwenImage and QwenImage-Edit.\n" + + " - `QwenImageInpaintDecodeStep` (inpaint) is used when `mask` is provided.\n" + + " - `QwenImageDecodeStep` (text2image/img2img) is used when `mask` is not provided.\n" ) -class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): - model_name = "qwenimage-edit-plus" - block_classes = [ - QwenImageEditPlusAutoInputStep, - QwenImageEditPlusAutoBeforeDenoiseStep, - QwenImageEditAutoDenoiseStep, - QwenImageAfterDenoiseStep, - ] - block_names = ["input", "before_denoise", "denoise", "after_denoise"] - @property - def description(self): - return ( - "Core step that performs the denoising process. 
\n" - + " - `QwenImageEditAutoInputStep` (input) standardizes the inputs for the denoising step.\n" - + " - `QwenImageEditPlusAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" - + " - `QwenImageEditAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n" - + "This step support edit (img2img) workflow for QwenImage Edit Plus:\n" - + " - When `image_latents` is provided, it will be used for edit (img2img) task.\n" - ) - - -EDIT_PLUS_AUTO_BLOCKS = InsertableDict( +## 1.10 QwenImage/auto block & presets +AUTO_BLOCKS = InsertableDict( [ - ("text_encoder", QwenImageEditPlusVLEncoderStep()), - ("vae_encoder", QwenImageEditPlusAutoVaeEncoderStep()), - ("denoise", QwenImageEditPlusCoreDenoiseStep()), + ("text_encoder", QwenImageTextEncoderStep()), + ("vae_encoder", QwenImageAutoVaeEncoderStep()), + ("controlnet_vae_encoder", QwenImageOptionalControlNetVaeEncoderStep()), + ("denoise", QwenImageAutoCoreDenoiseStep()), ("decode", QwenImageAutoDecodeStep()), ] ) -class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): - model_name = "qwenimage-edit-plus" - block_classes = EDIT_PLUS_AUTO_BLOCKS.values() - block_names = EDIT_PLUS_AUTO_BLOCKS.keys() +class QwenImageAutoBlocks(SequentialPipelineBlocks): + model_name = "qwenimage" + + block_classes = AUTO_BLOCKS.values() + block_names = AUTO_BLOCKS.keys() @property def description(self): return ( - "Auto Modular pipeline for edit (img2img) and edit tasks using QwenImage-Edit Plus.\n" - + "- for edit (img2img) generation, you need to provide `image`\n" - ) - - -# 3. 
all block presets supported in QwenImage, QwenImage-Edit, QwenImage-Edit Plus - - -ALL_BLOCKS = { - "text2image": TEXT2IMAGE_BLOCKS, - "img2img": IMAGE2IMAGE_BLOCKS, - "edit": EDIT_BLOCKS, - "edit_inpaint": EDIT_INPAINT_BLOCKS, - "edit_plus": EDIT_PLUS_BLOCKS, - "inpaint": INPAINT_BLOCKS, - "controlnet": CONTROLNET_BLOCKS, - "auto": AUTO_BLOCKS, - "edit_auto": EDIT_AUTO_BLOCKS, - "edit_plus_auto": EDIT_PLUS_AUTO_BLOCKS, -} + "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.\n" + + "- for image-to-image generation, you need to provide `image`\n" + + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n" + + "- to run the controlnet workflow, you need to provide `control_image`\n" + + "- for text-to-image generation, all you need to provide is `prompt`" + ) \ No newline at end of file diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index dcce0cab5dd1..bcadd72b5909 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -12,17 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import Optional + from ...utils import logging -from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks from ..modular_pipeline_utils import InsertableDict from .before_denoise import ( - QwenImageControlNetBeforeDenoiserStep, QwenImageCreateMaskLatentsStep, - QwenImageEditPlusRoPEInputsStep, QwenImageEditRoPEInputsStep, QwenImagePrepareLatentsStep, QwenImagePrepareLatentsWithStrengthStep, - QwenImageRoPEInputsStep, QwenImageSetTimestepsStep, QwenImageSetTimestepsWithStrengthStep, ) @@ -33,30 +32,17 @@ QwenImageProcessImagesOutputStep, ) from .denoise import ( - QwenImageControlNetDenoiseStep, - QwenImageDenoiseStep, QwenImageEditDenoiseStep, QwenImageEditInpaintDenoiseStep, - QwenImageInpaintControlNetDenoiseStep, - QwenImageInpaintDenoiseStep, - QwenImageLoopBeforeDenoiserControlNet, ) from .encoders import ( - QwenImageControlNetVaeEncoderStep, - QwenImageEditPlusProcessImagesInputStep, - QwenImageEditPlusResizeDynamicStep, - QwenImageEditPlusTextEncoderStep, - QwenImageEditPlusVaeEncoderDynamicStep, QwenImageEditResizeDynamicStep, QwenImageEditTextEncoderStep, QwenImageInpaintProcessImagesInputStep, QwenImageProcessImagesInputStep, - QwenImageTextEncoderStep, QwenImageVaeEncoderDynamicStep, ) from .inputs import ( - QwenImageControlNetInputsStep, - QwenImageEditPlusInputsDynamicStep, QwenImageInputsDynamicStep, QwenImageTextInputsStep, ) @@ -64,824 +50,267 @@ logger = logging.get_logger(__name__) -# 1. 
QwenImage - -## 1.1 QwenImage/text2image - -#### QwenImage/decode -#### (standard decode step works for most tasks except for inpaint) -QwenImageDecodeBlocks = InsertableDict( - [ - ("decode", QwenImageDecoderStep()), - ("postprocess", QwenImageProcessImagesOutputStep()), - ] -) - - -class QwenImageDecodeStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageDecodeBlocks.values() - block_names = QwenImageDecodeBlocks.keys() - - @property - def description(self): - return "Decode step that decodes the latents to images and postprocess the generated image." - - -#### QwenImage/text2image presets -TEXT2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", QwenImageTextEncoderStep()), - ("input", QwenImageTextInputsStep()), - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsStep()), - ("prepare_rope_inputs", QwenImageRoPEInputsStep()), - ("denoise", QwenImageDenoiseStep()), - ("after_denoise", QwenImageAfterDenoiseStep()), - ("decode", QwenImageDecodeStep()), - ] -) +# ==================== +# 1. 
TEXT ENCODER +# ==================== -## 1.2 QwenImage/inpaint - -#### QwenImage/inpaint vae encoder -QwenImageInpaintVaeEncoderBlocks = InsertableDict( - [ - ( - "preprocess", - QwenImageInpaintProcessImagesInputStep, - ), # image, mask_image -> processed_image, processed_mask_image, mask_overlay_kwargs - ("encode", QwenImageVaeEncoderDynamicStep()), # processed_image -> image_latents +class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): + """VL encoder that takes both image and text prompts.""" + model_name = "qwenimage-edit" + block_classes = [ + QwenImageEditResizeDynamicStep(), + QwenImageEditTextEncoderStep(), ] -) - - -class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageInpaintVaeEncoderBlocks.values() - block_names = QwenImageInpaintVaeEncoderBlocks.keys() + block_names = ["resize", "encode"] @property def description(self) -> str: - return ( - "This step is used for processing image and mask inputs for inpainting tasks. It:\n" - " - Resizes the image to the target size, based on `height` and `width`.\n" - " - Processes and updates `image` and `mask_image`.\n" - " - Creates `image_latents`." - ) + return "QwenImage-Edit VL encoder step that encode the image and text prompts together." -#### QwenImage/inpaint inputs -QwenImageInpaintInputBlocks = InsertableDict( - [ - ("text_inputs", QwenImageTextInputsStep()), # default step to process text embeddings - ( - "additional_inputs", - QwenImageInputsDynamicStep( - image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"] - ), - ), - ] -) +# ==================== +# 2. VAE ENCODER +# ==================== - -class QwenImageInpaintInputStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageInpaintInputBlocks.values() - block_names = QwenImageInpaintInputBlocks.keys() - - @property - def description(self): - return "Input step that prepares the inputs for the inpainting denoising step. 
It:\n" - " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents` and `processed_mask_image`).\n" - " - update height/width based `image_latents`, patchify `image_latents`." - - -# QwenImage/inpaint prepare latents -QwenImageInpaintPrepareLatentsBlocks = InsertableDict( - [ - ("add_noise_to_latents", QwenImagePrepareLatentsWithStrengthStep()), - ("create_mask_latents", QwenImageCreateMaskLatentsStep()), +# Edit VAE encoder +class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): + model_name = "qwenimage-edit" + block_classes = [ + QwenImageEditResizeDynamicStep(), + QwenImageProcessImagesInputStep(), + QwenImageVaeEncoderDynamicStep(), ] -) - - -class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageInpaintPrepareLatentsBlocks.values() - block_names = QwenImageInpaintPrepareLatentsBlocks.keys() + block_names = ["resize", "preprocess", "encode"] @property def description(self) -> str: - return ( - "This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It:\n" - " - Add noise to the image latents to create the latents input for the denoiser.\n" - " - Create the pachified latents `mask` based on the processedmask image.\n" - ) - - -#### QwenImage/inpaint decode -QwenImageInpaintDecodeBlocks = InsertableDict( - [ - ("decode", QwenImageDecoderStep()), - ("postprocess", QwenImageInpaintProcessImagesOutputStep()), - ] -) - - -class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageInpaintDecodeBlocks.values() - block_names = QwenImageInpaintDecodeBlocks.keys() - - @property - def description(self): - return "Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image." 
- - -#### QwenImage/inpaint presets -INPAINT_BLOCKS = InsertableDict( - [ - ("text_encoder", QwenImageTextEncoderStep()), - ("vae_encoder", QwenImageInpaintVaeEncoderStep()), - ("input", QwenImageInpaintInputStep()), - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()), - ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()), - ("prepare_rope_inputs", QwenImageRoPEInputsStep()), - ("denoise", QwenImageInpaintDenoiseStep()), - ("after_denoise", QwenImageAfterDenoiseStep()), - ("decode", QwenImageInpaintDecodeStep()), - ] -) - + return "Vae encoder step that encode the image inputs into their latent representations." -## 1.3 QwenImage/img2img -#### QwenImage/img2img vae encoder -QwenImageImg2ImgVaeEncoderBlocks = InsertableDict( - [ - ("preprocess", QwenImageProcessImagesInputStep()), - ("encode", QwenImageVaeEncoderDynamicStep()), +# Edit Inpaint VAE encoder +class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): + model_name = "qwenimage-edit" + block_classes = [ + QwenImageEditResizeDynamicStep(), + QwenImageInpaintProcessImagesInputStep(), + QwenImageVaeEncoderDynamicStep(input_name="processed_image", output_name="image_latents"), ] -) - - -class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): - model_name = "qwenimage" - - block_classes = QwenImageImg2ImgVaeEncoderBlocks.values() - block_names = QwenImageImg2ImgVaeEncoderBlocks.keys() + block_names = ["resize", "preprocess", "encode"] @property def description(self) -> str: - return "Vae encoder step that preprocess andencode the image inputs into their latent representations." 
- - -#### QwenImage/img2img inputs -QwenImageImg2ImgInputBlocks = InsertableDict( - [ - ("text_inputs", QwenImageTextInputsStep()), # default step to process text embeddings - ("additional_inputs", QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"])), - ] -) - - -class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageImg2ImgInputBlocks.values() - block_names = QwenImageImg2ImgInputBlocks.keys() - - @property - def description(self): - return "Input step that prepares the inputs for the img2img denoising step. It:\n" - " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n" - " - update height/width based `image_latents`, patchify `image_latents`." - - -#### QwenImage/img2img presets -IMAGE2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", QwenImageTextEncoderStep()), - ("vae_encoder", QwenImageImg2ImgVaeEncoderStep()), - ("input", QwenImageImg2ImgInputStep()), - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()), - ("prepare_img2img_latents", QwenImagePrepareLatentsWithStrengthStep()), - ("prepare_rope_inputs", QwenImageRoPEInputsStep()), - ("denoise", QwenImageDenoiseStep()), - ("after_denoise", QwenImageAfterDenoiseStep()), - ("decode", QwenImageDecodeStep()), - ] -) - - -## 1.4 QwenImage/controlnet - -#### QwenImage/controlnet presets -CONTROLNET_BLOCKS = InsertableDict( - [ - ("controlnet_vae_encoder", QwenImageControlNetVaeEncoderStep()), # vae encoder step for control_image - ("controlnet_inputs", QwenImageControlNetInputsStep()), # additional input step for controlnet - ( - "controlnet_before_denoise", - QwenImageControlNetBeforeDenoiserStep(), - ), # before denoise step (after set_timesteps step) - ( - "controlnet_denoise_loop_before", - QwenImageLoopBeforeDenoiserControlNet(), - ), # controlnet loop step (insert before the denoiseloop_denoiser) - ] -) - - -## 
1.5 QwenImage/auto encoders - - -#### for inpaint and img2img tasks -class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks): - block_classes = [QwenImageInpaintVaeEncoderStep, QwenImageImg2ImgVaeEncoderStep] - block_names = ["inpaint", "img2img"] - block_trigger_inputs = ["mask_image", "image"] - - @property - def description(self): return ( - "Vae encoder step that encode the image inputs into their latent representations.\n" - + "This is an auto pipeline block.\n" - + " - `QwenImageInpaintVaeEncoderStep` (inpaint) is used when `mask_image` is provided.\n" - + " - `QwenImageImg2ImgVaeEncoderStep` (img2img) is used when `image` is provided.\n" - + " - if `mask_image` or `image` is not provided, step will be skipped." + "This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It:\n" + " - resize the image for target area (1024 * 1024) while maintaining the aspect ratio.\n" + " - process the resized image and mask image.\n" + " - create image latents." ) -# for controlnet tasks -class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): - block_classes = [QwenImageControlNetVaeEncoderStep] - block_names = ["controlnet"] - block_trigger_inputs = ["control_image"] +# Auto VAE encoder +class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks): + block_classes = [QwenImageEditInpaintVaeEncoderStep, QwenImageEditVaeEncoderStep] + block_names = ["edit_inpaint", "edit"] + block_trigger_inputs = ["mask_image", "image"] @property def description(self): return ( "Vae encoder step that encode the image inputs into their latent representations.\n" - + "This is an auto pipeline block.\n" - + " - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided.\n" - + " - if `control_image` is not provided, step will be skipped." 
- ) - - -## 1.6 QwenImage/auto inputs - - -# text2image/inpaint/img2img -class QwenImageAutoInputStep(AutoPipelineBlocks): - block_classes = [QwenImageInpaintInputStep, QwenImageImg2ImgInputStep, QwenImageTextInputsStep] - block_names = ["inpaint", "img2img", "text2image"] - block_trigger_inputs = ["processed_mask_image", "image_latents", None] - - @property - def description(self): - return ( - "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n" - " This is an auto pipeline block that works for text2image/inpaint/img2img tasks.\n" - + " - `QwenImageInpaintInputStep` (inpaint) is used when `processed_mask_image` is provided.\n" - + " - `QwenImageImg2ImgInputStep` (img2img) is used when `image_latents` is provided.\n" - + " - `QwenImageTextInputsStep` (text2image) is used when both `processed_mask_image` and `image_latents` are not provided.\n" + "This is an auto pipeline block.\n" + " - `QwenImageEditInpaintVaeEncoderStep` (edit_inpaint) is used when `mask_image` is provided.\n" + " - `QwenImageEditVaeEncoderStep` (edit) is used when `image` is provided.\n" + " - if `mask_image` or `image` is not provided, step will be skipped." ) -# controlnet -class QwenImageOptionalControlNetInputStep(AutoPipelineBlocks): - block_classes = [QwenImageControlNetInputsStep] - block_names = ["controlnet"] - block_trigger_inputs = ["control_image_latents"] - - @property - def description(self): - return ( - "Controlnet input step that prepare the control_image_latents input.\n" - + "This is an auto pipeline block.\n" - + " - `QwenImageControlNetInputsStep` (controlnet) is used when `control_image_latents` is provided.\n" - + " - if `control_image_latents` is not provided, step will be skipped." 
- ) - - -## 1.7 QwenImage/auto before denoise step -# compose the steps into a BeforeDenoiseStep for text2image/img2img/inpaint tasks before combine into an auto step - -# QwenImage/text2image before denoise -QwenImageText2ImageBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsStep()), - ("prepare_rope_inputs", QwenImageRoPEInputsStep()), - ] -) - - -class QwenImageText2ImageBeforeDenoiseStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageText2ImageBeforeDenoiseBlocks.values() - block_names = QwenImageText2ImageBeforeDenoiseBlocks.keys() +# ==================== +# 3. DENOISE - input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise +# ==================== - @property - def description(self): - return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for text2image task." - - -# QwenImage/inpaint before denoise -QwenImageInpaintBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()), - ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()), - ("prepare_rope_inputs", QwenImageRoPEInputsStep()), - ] -) - - -class QwenImageInpaintBeforeDenoiseStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageInpaintBeforeDenoiseBlocks.values() - block_names = QwenImageInpaintBeforeDenoiseBlocks.keys() - - @property - def description(self): - return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task." 
- - -# QwenImage/img2img before denoise -QwenImageImg2ImgBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()), - ("prepare_img2img_latents", QwenImagePrepareLatentsWithStrengthStep()), - ("prepare_rope_inputs", QwenImageRoPEInputsStep()), - ] -) - - -class QwenImageImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageImg2ImgBeforeDenoiseBlocks.values() - block_names = QwenImageImg2ImgBeforeDenoiseBlocks.keys() - - @property - def description(self): - return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task." - - -# auto before_denoise step for text2image, inpaint, img2img tasks -class QwenImageAutoBeforeDenoiseStep(AutoPipelineBlocks): +# Edit input step +class QwenImageEditInputStep(SequentialPipelineBlocks): + model_name = "qwenimage-edit" block_classes = [ - QwenImageInpaintBeforeDenoiseStep, - QwenImageImg2ImgBeforeDenoiseStep, - QwenImageText2ImageBeforeDenoiseStep, + QwenImageTextInputsStep(), + QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"]), ] - block_names = ["inpaint", "img2img", "text2image"] - block_trigger_inputs = ["processed_mask_image", "image_latents", None] + block_names = ["text_inputs", "additional_inputs"] @property def description(self): return ( - "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) 
for the denoise step.\n" - + "This is an auto pipeline block that works for text2img, inpainting, img2img tasks.\n" - + " - `QwenImageInpaintBeforeDenoiseStep` (inpaint) is used when `processed_mask_image` is provided.\n" - + " - `QwenImageImg2ImgBeforeDenoiseStep` (img2img) is used when `image_latents` is provided.\n" - + " - `QwenImageText2ImageBeforeDenoiseStep` (text2image) is used when both `processed_mask_image` and `image_latents` are not provided.\n" + "Input step that prepares the inputs for the edit denoising step. It:\n" + " - make sure the text embeddings have consistent batch size as well as the additional inputs.\n" + " - update height/width based `image_latents`, patchify `image_latents`." ) -# auto before_denoise step for controlnet tasks -class QwenImageOptionalControlNetBeforeDenoiseStep(AutoPipelineBlocks): - block_classes = [QwenImageControlNetBeforeDenoiserStep] - block_names = ["controlnet"] - block_trigger_inputs = ["control_image_latents"] - - @property - def description(self): - return ( - "Controlnet before denoise step that prepare the controlnet input.\n" - + "This is an auto pipeline block.\n" - + " - `QwenImageControlNetBeforeDenoiserStep` (controlnet) is used when `control_image_latents` is provided.\n" - + " - if `control_image_latents` is not provided, step will be skipped." - ) - - -## 1.8 QwenImage/auto denoise - - -# auto denoise step for controlnet tasks: works for all tasks with controlnet -class QwenImageControlNetAutoDenoiseStep(AutoPipelineBlocks): - block_classes = [QwenImageInpaintControlNetDenoiseStep, QwenImageControlNetDenoiseStep] - block_names = ["inpaint_denoise", "denoise"] - block_trigger_inputs = ["mask", None] - - @property - def description(self): - return ( - "Controlnet step during the denoising process. 
\n" - " This is an auto pipeline block that works for inpaint and text2image/img2img tasks with controlnet.\n" - + " - `QwenImageInpaintControlNetDenoiseStep` (inpaint) is used when `mask` is provided.\n" - + " - `QwenImageControlNetDenoiseStep` (text2image/img2img) is used when `mask` is not provided.\n" - ) - - -# auto denoise step for everything: works for all tasks with or without controlnet -class QwenImageAutoDenoiseStep(AutoPipelineBlocks): +# Edit Inpaint input step +class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): + model_name = "qwenimage-edit" block_classes = [ - QwenImageControlNetAutoDenoiseStep, - QwenImageInpaintDenoiseStep, - QwenImageDenoiseStep, + QwenImageTextInputsStep(), + QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]), ] - block_names = ["controlnet_denoise", "inpaint_denoise", "denoise"] - block_trigger_inputs = ["control_image_latents", "mask", None] + block_names = ["text_inputs", "additional_inputs"] @property def description(self): return ( - "Denoise step that iteratively denoise the latents. \n" - " This is an auto pipeline block that works for inpaint/text2image/img2img tasks. It also works with controlnet\n" - + " - `QwenImageControlNetAutoDenoiseStep` (controlnet) is used when `control_image_latents` is provided.\n" - + " - `QwenImageInpaintDenoiseStep` (inpaint) is used when `mask` is provided and `control_image_latents` is not provided.\n" - + " - `QwenImageDenoiseStep` (text2image/img2img) is used when `mask` is not provided and `control_image_latents` is not provided.\n" + "Input step that prepares the inputs for the edit inpaint denoising step. It:\n" + " - make sure the text embeddings have consistent batch size as well as the additional inputs.\n" + " - update height/width based `image_latents`, patchify `image_latents`." 
) -## 1.9 QwenImage/auto decode -# auto decode step for inpaint and text2image tasks - - -class QwenImageAutoDecodeStep(AutoPipelineBlocks): - block_classes = [QwenImageInpaintDecodeStep, QwenImageDecodeStep] - block_names = ["inpaint_decode", "decode"] - block_trigger_inputs = ["mask", None] +# Edit Inpaint prepare latents step +class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): + model_name = "qwenimage-edit" + block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()] + block_names = ["add_noise_to_latents", "create_mask_latents"] @property - def description(self): + def description(self) -> str: return ( - "Decode step that decode the latents into images. \n" - " This is an auto pipeline block that works for inpaint/text2image/img2img tasks, for both QwenImage and QwenImage-Edit.\n" - + " - `QwenImageInpaintDecodeStep` (inpaint) is used when `mask` is provided.\n" - + " - `QwenImageDecodeStep` (text2image/img2img) is used when `mask` is not provided.\n" + "This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It:\n" + " - Add noise to the image latents to create the latents input for the denoiser.\n" + " - Create the patchified latents `mask` based on the processed mask image.\n" ) -class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): - model_name = "qwenimage" +# 1. 
Edit (img2img) core denoise +class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): + model_name = "qwenimage-edit" block_classes = [ - QwenImageAutoInputStep, - QwenImageOptionalControlNetInputStep, - QwenImageAutoBeforeDenoiseStep, - QwenImageOptionalControlNetBeforeDenoiseStep, - QwenImageAutoDenoiseStep, - QwenImageAfterDenoiseStep, + QwenImageEditInputStep(), + QwenImagePrepareLatentsStep(), + QwenImageSetTimestepsStep(), + QwenImageEditRoPEInputsStep(), + QwenImageEditDenoiseStep(), + QwenImageAfterDenoiseStep(), ] block_names = [ "input", - "controlnet_input", - "before_denoise", - "controlnet_before_denoise", + "prepare_latents", + "set_timesteps", + "prepare_rope_inputs", "denoise", "after_denoise", ] @property def description(self): - return ( - "Core step that performs the denoising process. \n" - + " - `QwenImageAutoInputStep` (input) standardizes the inputs for the denoising step.\n" - + " - `QwenImageOptionalControlNetInputStep` (controlnet_input) prepares the controlnet input.\n" - + " - `QwenImageAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" - + " - `QwenImageOptionalControlNetBeforeDenoiseStep` (controlnet_before_denoise) prepares the controlnet input for the denoising step.\n" - + " - `QwenImageAutoDenoiseStep` (denoise) iteratively denoises the latents.\n" - + "This step support text-to-image, image-to-image, inpainting, and controlnet tasks for QwenImage:\n" - + " - for image-to-image generation, you need to provide `image_latents`\n" - + " - for inpainting, you need to provide `processed_mask_image` and `image_latents`\n" - + " - to run the controlnet workflow, you need to provide `control_image_latents`\n" - + " - for text-to-image generation, all you need to provide is prompt embeddings" - ) - - -## 1.10 QwenImage/auto block & presets -AUTO_BLOCKS = InsertableDict( - [ - ("text_encoder", QwenImageTextEncoderStep()), - ("vae_encoder", QwenImageAutoVaeEncoderStep()), - ("controlnet_vae_encoder", 
QwenImageOptionalControlNetVaeEncoderStep()), - ("denoise", QwenImageCoreDenoiseStep()), - ("decode", QwenImageAutoDecodeStep()), - ] -) - - -class QwenImageAutoBlocks(SequentialPipelineBlocks): - model_name = "qwenimage" - - block_classes = AUTO_BLOCKS.values() - block_names = AUTO_BLOCKS.keys() - - @property - def description(self): - return ( - "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.\n" - + "- for image-to-image generation, you need to provide `image`\n" - + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n" - + "- to run the controlnet workflow, you need to provide `control_image`\n" - + "- for text-to-image generation, all you need to provide is `prompt`" - ) - - -# 2. QwenImage-Edit - -## 2.1 QwenImage-Edit/edit - -#### QwenImage-Edit/edit vl encoder: take both image and text prompts -QwenImageEditVLEncoderBlocks = InsertableDict( - [ - ("resize", QwenImageEditResizeDynamicStep()), - ("encode", QwenImageEditTextEncoderStep()), - ] -) - - -class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageEditVLEncoderBlocks.values() - block_names = QwenImageEditVLEncoderBlocks.keys() - - @property - def description(self) -> str: - return "QwenImage-Edit VL encoder step that encode the image an text prompts together." + return "Core denoising workflow for QwenImage-Edit edit (img2img) task." -#### QwenImage-Edit/edit vae encoder -QwenImageEditVaeEncoderBlocks = InsertableDict( - [ - ("resize", QwenImageEditResizeDynamicStep()), # edit has a different resize step - ("preprocess", QwenImageProcessImagesInputStep()), # resized_image -> processed_image - ("encode", QwenImageVaeEncoderDynamicStep()), # processed_image -> image_latents +# 2. 
Edit Inpaint core denoise +class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): + model_name = "qwenimage-edit" + block_classes = [ + QwenImageEditInpaintInputStep(), + QwenImagePrepareLatentsStep(), + QwenImageSetTimestepsWithStrengthStep(), + QwenImageEditInpaintPrepareLatentsStep(), + QwenImageEditRoPEInputsStep(), + QwenImageEditInpaintDenoiseStep(), + QwenImageAfterDenoiseStep(), ] -) - - -class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageEditVaeEncoderBlocks.values() - block_names = QwenImageEditVaeEncoderBlocks.keys() - - @property - def description(self) -> str: - return "Vae encoder step that encode the image inputs into their latent representations." - - -#### QwenImage-Edit/edit input -QwenImageEditInputBlocks = InsertableDict( - [ - ("text_inputs", QwenImageTextInputsStep()), # default step to process text embeddings - ("additional_inputs", QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"])), + block_names = [ + "input", + "prepare_latents", + "set_timesteps", + "prepare_inpaint_latents", + "prepare_rope_inputs", + "denoise", + "after_denoise", ] -) - - -class QwenImageEditInputStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageEditInputBlocks.values() - block_names = QwenImageEditInputBlocks.keys() @property def description(self): - return "Input step that prepares the inputs for the edit denoising step. It:\n" - " - make sure the text embeddings have consistent batch size as well as the additional inputs: \n" - " - `image_latents`.\n" - " - update height/width based `image_latents`, patchify `image_latents`." 
- - -#### QwenImage/edit presets -EDIT_BLOCKS = InsertableDict( - [ - ("text_encoder", QwenImageEditVLEncoderStep()), - ("vae_encoder", QwenImageEditVaeEncoderStep()), - ("input", QwenImageEditInputStep()), - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsStep()), - ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()), - ("denoise", QwenImageEditDenoiseStep()), - ("after_denoise", QwenImageAfterDenoiseStep()), - ("decode", QwenImageDecodeStep()), - ] -) - - -## 2.2 QwenImage-Edit/edit inpaint - -#### QwenImage-Edit/edit inpaint vae encoder: the difference from regular inpaint is the resize step -QwenImageEditInpaintVaeEncoderBlocks = InsertableDict( - [ - ("resize", QwenImageEditResizeDynamicStep()), # image -> resized_image - ( - "preprocess", - QwenImageInpaintProcessImagesInputStep, - ), # resized_image, mask_image -> processed_image, processed_mask_image, mask_overlay_kwargs - ( - "encode", - QwenImageVaeEncoderDynamicStep(input_name="processed_image", output_name="image_latents"), - ), # processed_image -> image_latents - ] -) - - -class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageEditInpaintVaeEncoderBlocks.values() - block_names = QwenImageEditInpaintVaeEncoderBlocks.keys() - - @property - def description(self) -> str: - return ( - "This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It:\n" - " - resize the image for target area (1024 * 1024) while maintaining the aspect ratio.\n" - " - process the resized image and mask image.\n" - " - create image latents." 
- ) - - -#### QwenImage-Edit/edit inpaint presets -EDIT_INPAINT_BLOCKS = InsertableDict( - [ - ("text_encoder", QwenImageEditVLEncoderStep()), - ("vae_encoder", QwenImageEditInpaintVaeEncoderStep()), - ("input", QwenImageInpaintInputStep()), - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()), - ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()), - ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()), - ("denoise", QwenImageEditInpaintDenoiseStep()), - ("after_denoise", QwenImageAfterDenoiseStep()), - ("decode", QwenImageInpaintDecodeStep()), - ] -) + return "Core denoising workflow for QwenImage-Edit edit inpaint task." -## 2.3 QwenImage-Edit/auto encoders - - -class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks): +# Auto core denoise step +class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks): block_classes = [ - QwenImageEditInpaintVaeEncoderStep, - QwenImageEditVaeEncoderStep, + QwenImageEditInpaintCoreDenoiseStep, + QwenImageEditCoreDenoiseStep, ] block_names = ["edit_inpaint", "edit"] - block_trigger_inputs = ["mask_image", "image"] - - @property - def description(self): - return ( - "Vae encoder step that encode the image inputs into their latent representations. \n" - " This is an auto pipeline block that works for edit and edit_inpaint tasks.\n" - + " - `QwenImageEditInpaintVaeEncoderStep` (edit_inpaint) is used when `mask_image` is provided.\n" - + " - `QwenImageEditVaeEncoderStep` (edit) is used when `image` is provided.\n" - + " - if `mask_image` or `image` is not provided, step will be skipped." 
- ) - - -## 2.4 QwenImage-Edit/auto inputs -class QwenImageEditAutoInputStep(AutoPipelineBlocks): - block_classes = [QwenImageInpaintInputStep, QwenImageEditInputStep] - block_names = ["edit_inpaint", "edit"] block_trigger_inputs = ["processed_mask_image", "image_latents"] + default_block_name = "edit" + + def select_block(self, processed_mask_image=None, image_latents=None) -> Optional[str]: + if processed_mask_image is not None: + return "edit_inpaint" + elif image_latents is not None: + return "edit" + return None @property def description(self): return ( - "Input step that prepares the inputs for the edit denoising step.\n" - + " It is an auto pipeline block that works for edit and edit_inpaint tasks.\n" - + " - `QwenImageInpaintInputStep` (edit_inpaint) is used when `processed_mask_image` is provided.\n" - + " - `QwenImageEditInputStep` (edit) is used when `image_latents` is provided.\n" - + " - if `processed_mask_image` or `image_latents` is not provided, step will be skipped." + "Auto core denoising step that selects the appropriate workflow based on inputs.\n" + " - `QwenImageEditInpaintCoreDenoiseStep` when `processed_mask_image` is provided\n" + " - `QwenImageEditCoreDenoiseStep` when `image_latents` is provided\n" + "Supports edit (img2img) and edit inpainting tasks for QwenImage-Edit." 
) -## 2.5 QwenImage-Edit/auto before denoise -# compose the steps into a BeforeDenoiseStep for edit and edit_inpaint tasks before combine into an auto step - -#### QwenImage-Edit/edit before denoise -QwenImageEditBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsStep()), - ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()), - ] -) - - -class QwenImageEditBeforeDenoiseStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageEditBeforeDenoiseBlocks.values() - block_names = QwenImageEditBeforeDenoiseBlocks.keys() - - @property - def description(self): - return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for edit task." - - -#### QwenImage-Edit/edit inpaint before denoise -QwenImageEditInpaintBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()), - ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()), - ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()), - ] -) - - -class QwenImageEditInpaintBeforeDenoiseStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageEditInpaintBeforeDenoiseBlocks.values() - block_names = QwenImageEditInpaintBeforeDenoiseBlocks.keys() +# ==================== +# 4. DECODE +# ==================== - @property - def description(self): - return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for edit inpaint task." 
- - -# auto before_denoise step for edit and edit_inpaint tasks -class QwenImageEditAutoBeforeDenoiseStep(AutoPipelineBlocks): +# Decode step (standard) +class QwenImageEditDecodeStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" - block_classes = [ - QwenImageEditInpaintBeforeDenoiseStep, - QwenImageEditBeforeDenoiseStep, - ] - block_names = ["edit_inpaint", "edit"] - block_trigger_inputs = ["processed_mask_image", "image_latents"] + block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] + block_names = ["decode", "postprocess"] @property def description(self): - return ( - "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step.\n" - + "This is an auto pipeline block that works for edit (img2img) and edit inpaint tasks.\n" - + " - `QwenImageEditInpaintBeforeDenoiseStep` (edit_inpaint) is used when `processed_mask_image` is provided.\n" - + " - `QwenImageEditBeforeDenoiseStep` (edit) is used when `image_latents` is provided and `processed_mask_image` is not provided.\n" - + " - if `image_latents` or `processed_mask_image` is not provided, step will be skipped." - ) - - -## 2.6 QwenImage-Edit/auto denoise + return "Decode step that decodes the latents to images and postprocess the generated image." -class QwenImageEditAutoDenoiseStep(AutoPipelineBlocks): +# Inpaint decode step +class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" - - block_classes = [QwenImageEditInpaintDenoiseStep, QwenImageEditDenoiseStep] - block_names = ["inpaint_denoise", "denoise"] - block_trigger_inputs = ["processed_mask_image", "image_latents"] + block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()] + block_names = ["decode", "postprocess"] @property def description(self): - return ( - "Denoise step that iteratively denoise the latents. \n" - + "This block supports edit (img2img) and edit inpaint tasks for QwenImage Edit. 
\n" - + " - `QwenImageEditInpaintDenoiseStep` (inpaint) is used when `processed_mask_image` is provided.\n" - + " - `QwenImageEditDenoiseStep` (img2img) is used when `image_latents` is provided.\n" - + " - if `processed_mask_image` or `image_latents` is not provided, step will be skipped." - ) - - -## 2.7 QwenImage-Edit/auto blocks & presets + return "Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image." -class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): - model_name = "qwenimage-edit" - block_classes = [ - QwenImageEditAutoInputStep, - QwenImageEditAutoBeforeDenoiseStep, - QwenImageEditAutoDenoiseStep, - QwenImageAfterDenoiseStep, - ] - block_names = ["input", "before_denoise", "denoise", "after_denoise"] +# Auto decode step +class QwenImageEditAutoDecodeStep(AutoPipelineBlocks): + block_classes = [QwenImageEditInpaintDecodeStep, QwenImageEditDecodeStep] + block_names = ["inpaint_decode", "decode"] + block_trigger_inputs = ["mask", None] @property def description(self): return ( - "Core step that performs the denoising process. 
\n" - + " - `QwenImageEditAutoInputStep` (input) standardizes the inputs for the denoising step.\n" - + " - `QwenImageEditAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" - + " - `QwenImageEditAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n" - + "This step support edit (img2img) and edit inpainting workflow for QwenImage Edit:\n" - + " - When `processed_mask_image` is provided, it will be used for edit inpainting task.\n" - + " - When `image_latents` is provided, it will be used for edit (img2img) task.\n" + "Decode step that decode the latents into images.\n" + "This is an auto pipeline block.\n" + " - `QwenImageEditInpaintDecodeStep` (inpaint) is used when `mask` is provided.\n" + " - `QwenImageEditDecodeStep` (edit) is used when `mask` is not provided.\n" ) +# ==================== +# 5. AUTO BLOCKS & PRESETS +# ==================== + EDIT_AUTO_BLOCKS = InsertableDict( [ ("text_encoder", QwenImageEditVLEncoderStep()), ("vae_encoder", QwenImageEditAutoVaeEncoderStep()), - ("denoise", QwenImageEditCoreDenoiseStep()), - ("decode", QwenImageAutoDecodeStep()), + ("denoise", QwenImageEditAutoCoreDenoiseStep()), + ("decode", QwenImageEditAutoDecodeStep()), ] ) @@ -895,219 +324,6 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): def description(self): return ( "Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.\n" - + "- for edit (img2img) generation, you need to provide `image`\n" - + "- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n" - ) - - -#################### QwenImage Edit Plus ##################### - -# 3. 
QwenImage-Edit Plus - -## 3.1 QwenImage-Edit Plus / edit - -#### QwenImage-Edit Plus vl encoder: take both image and text prompts -QwenImageEditPlusVLEncoderBlocks = InsertableDict( - [ - ("resize", QwenImageEditPlusResizeDynamicStep()), - ("encode", QwenImageEditPlusTextEncoderStep()), - ] -) - - -class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageEditPlusVLEncoderBlocks.values() - block_names = QwenImageEditPlusVLEncoderBlocks.keys() - - @property - def description(self) -> str: - return "QwenImage-Edit Plus VL encoder step that encode the image an text prompts together." - - -#### QwenImage-Edit Plus vae encoder -QwenImageEditPlusVaeEncoderBlocks = InsertableDict( - [ - ("resize", QwenImageEditPlusResizeDynamicStep()), # edit plus has a different resize step - ("preprocess", QwenImageEditPlusProcessImagesInputStep()), # vae_image -> processed_image - ("encode", QwenImageEditPlusVaeEncoderDynamicStep()), # processed_image -> image_latents - ] -) - - -class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): - model_name = "qwenimage-edit-plus" - block_classes = QwenImageEditPlusVaeEncoderBlocks.values() - block_names = QwenImageEditPlusVaeEncoderBlocks.keys() - - @property - def description(self) -> str: - return "Vae encoder step that encode the image inputs into their latent representations." 
- - -#### QwenImage Edit Plus input blocks -QwenImageEditPlusInputBlocks = InsertableDict( - [ - ("text_inputs", QwenImageTextInputsStep()), # default step to process text embeddings - ( - "additional_inputs", - QwenImageEditPlusInputsDynamicStep(image_latent_inputs=["image_latents"]), - ), - ] -) - - -class QwenImageEditPlusInputStep(SequentialPipelineBlocks): - model_name = "qwenimage-edit-plus" - block_classes = QwenImageEditPlusInputBlocks.values() - block_names = QwenImageEditPlusInputBlocks.keys() - - -#### QwenImage Edit Plus presets -EDIT_PLUS_BLOCKS = InsertableDict( - [ - ("text_encoder", QwenImageEditPlusVLEncoderStep()), - ("vae_encoder", QwenImageEditPlusVaeEncoderStep()), - ("input", QwenImageEditPlusInputStep()), - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsStep()), - ("prepare_rope_inputs", QwenImageEditPlusRoPEInputsStep()), - ("denoise", QwenImageEditDenoiseStep()), - ("after_denoise", QwenImageAfterDenoiseStep()), - ("decode", QwenImageDecodeStep()), - ] -) - - -QwenImageEditPlusBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsStep()), - ("prepare_rope_inputs", QwenImageEditPlusRoPEInputsStep()), - ] -) - - -class QwenImageEditPlusBeforeDenoiseStep(SequentialPipelineBlocks): - model_name = "qwenimage-edit-plus" - block_classes = QwenImageEditPlusBeforeDenoiseBlocks.values() - block_names = QwenImageEditPlusBeforeDenoiseBlocks.keys() - - @property - def description(self): - return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for edit task." 
- - -# auto before_denoise step for edit tasks -class QwenImageEditPlusAutoBeforeDenoiseStep(AutoPipelineBlocks): - model_name = "qwenimage-edit-plus" - block_classes = [QwenImageEditPlusBeforeDenoiseStep] - block_names = ["edit"] - block_trigger_inputs = ["image_latents"] - - @property - def description(self): - return ( - "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step.\n" - + "This is an auto pipeline block that works for edit (img2img) task.\n" - + " - `QwenImageEditPlusBeforeDenoiseStep` (edit) is used when `image_latents` is provided and `processed_mask_image` is not provided.\n" - + " - if `image_latents` is not provided, step will be skipped." - ) - - -## 3.2 QwenImage-Edit Plus/auto encoders - - -class QwenImageEditPlusAutoVaeEncoderStep(AutoPipelineBlocks): - block_classes = [QwenImageEditPlusVaeEncoderStep] - block_names = ["edit"] - block_trigger_inputs = ["image"] - - @property - def description(self): - return ( - "Vae encoder step that encode the image inputs into their latent representations. \n" - " This is an auto pipeline block that works for edit task.\n" - + " - `QwenImageEditPlusVaeEncoderStep` (edit) is used when `image` is provided.\n" - + " - if `image` is not provided, step will be skipped." - ) - - -## 3.3 QwenImage-Edit/auto blocks & presets - - -class QwenImageEditPlusAutoInputStep(AutoPipelineBlocks): - block_classes = [QwenImageEditPlusInputStep] - block_names = ["edit"] - block_trigger_inputs = ["image_latents"] - - @property - def description(self): - return ( - "Input step that prepares the inputs for the edit denoising step.\n" - + " It is an auto pipeline block that works for edit task.\n" - + " - `QwenImageEditPlusInputStep` (edit) is used when `image_latents` is provided.\n" - + " - if `image_latents` is not provided, step will be skipped." 
- ) - - -class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): - model_name = "qwenimage-edit-plus" - block_classes = [ - QwenImageEditPlusAutoInputStep, - QwenImageEditPlusAutoBeforeDenoiseStep, - QwenImageEditAutoDenoiseStep, - QwenImageAfterDenoiseStep, - ] - block_names = ["input", "before_denoise", "denoise", "after_denoise"] - - @property - def description(self): - return ( - "Core step that performs the denoising process. \n" - + " - `QwenImageEditAutoInputStep` (input) standardizes the inputs for the denoising step.\n" - + " - `QwenImageEditPlusAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" - + " - `QwenImageEditAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n" - + "This step support edit (img2img) workflow for QwenImage Edit Plus:\n" - + " - When `image_latents` is provided, it will be used for edit (img2img) task.\n" - ) - - -EDIT_PLUS_AUTO_BLOCKS = InsertableDict( - [ - ("text_encoder", QwenImageEditPlusVLEncoderStep()), - ("vae_encoder", QwenImageEditPlusAutoVaeEncoderStep()), - ("denoise", QwenImageEditPlusCoreDenoiseStep()), - ("decode", QwenImageAutoDecodeStep()), - ] -) - - -class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): - model_name = "qwenimage-edit-plus" - block_classes = EDIT_PLUS_AUTO_BLOCKS.values() - block_names = EDIT_PLUS_AUTO_BLOCKS.keys() - - @property - def description(self): - return ( - "Auto Modular pipeline for edit (img2img) and edit tasks using QwenImage-Edit Plus.\n" - + "- for edit (img2img) generation, you need to provide `image`\n" - ) - - -# 3. 
all block presets supported in QwenImage, QwenImage-Edit, QwenImage-Edit Plus - - -ALL_BLOCKS = { - "text2image": TEXT2IMAGE_BLOCKS, - "img2img": IMAGE2IMAGE_BLOCKS, - "edit": EDIT_BLOCKS, - "edit_inpaint": EDIT_INPAINT_BLOCKS, - "edit_plus": EDIT_PLUS_BLOCKS, - "inpaint": INPAINT_BLOCKS, - "controlnet": CONTROLNET_BLOCKS, - "auto": AUTO_BLOCKS, - "edit_auto": EDIT_AUTO_BLOCKS, - "edit_plus_auto": EDIT_PLUS_AUTO_BLOCKS, -} + "- for edit (img2img) generation, you need to provide `image`\n" + "- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`\n" + ) \ No newline at end of file diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index dcce0cab5dd1..75b40ccc8ce2 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -16,1069 +16,146 @@ from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks from ..modular_pipeline_utils import InsertableDict from .before_denoise import ( - QwenImageControlNetBeforeDenoiserStep, - QwenImageCreateMaskLatentsStep, QwenImageEditPlusRoPEInputsStep, - QwenImageEditRoPEInputsStep, QwenImagePrepareLatentsStep, - QwenImagePrepareLatentsWithStrengthStep, - QwenImageRoPEInputsStep, QwenImageSetTimestepsStep, - QwenImageSetTimestepsWithStrengthStep, ) from .decoders import ( QwenImageAfterDenoiseStep, QwenImageDecoderStep, - QwenImageInpaintProcessImagesOutputStep, QwenImageProcessImagesOutputStep, ) from .denoise import ( - QwenImageControlNetDenoiseStep, - QwenImageDenoiseStep, QwenImageEditDenoiseStep, - QwenImageEditInpaintDenoiseStep, - QwenImageInpaintControlNetDenoiseStep, - QwenImageInpaintDenoiseStep, - QwenImageLoopBeforeDenoiserControlNet, ) from .encoders import ( - QwenImageControlNetVaeEncoderStep, - 
QwenImageEditPlusProcessImagesInputStep, QwenImageEditPlusResizeDynamicStep, QwenImageEditPlusTextEncoderStep, - QwenImageEditPlusVaeEncoderDynamicStep, - QwenImageEditResizeDynamicStep, - QwenImageEditTextEncoderStep, - QwenImageInpaintProcessImagesInputStep, - QwenImageProcessImagesInputStep, - QwenImageTextEncoderStep, + QwenImageEditPlusProcessImagesInputStep, QwenImageVaeEncoderDynamicStep, ) from .inputs import ( - QwenImageControlNetInputsStep, QwenImageEditPlusInputsDynamicStep, - QwenImageInputsDynamicStep, QwenImageTextInputsStep, ) logger = logging.get_logger(__name__) -# 1. QwenImage - -## 1.1 QwenImage/text2image - -#### QwenImage/decode -#### (standard decode step works for most tasks except for inpaint) -QwenImageDecodeBlocks = InsertableDict( - [ - ("decode", QwenImageDecoderStep()), - ("postprocess", QwenImageProcessImagesOutputStep()), - ] -) - - -class QwenImageDecodeStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageDecodeBlocks.values() - block_names = QwenImageDecodeBlocks.keys() - - @property - def description(self): - return "Decode step that decodes the latents to images and postprocess the generated image." 
- - -#### QwenImage/text2image presets -TEXT2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", QwenImageTextEncoderStep()), - ("input", QwenImageTextInputsStep()), - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsStep()), - ("prepare_rope_inputs", QwenImageRoPEInputsStep()), - ("denoise", QwenImageDenoiseStep()), - ("after_denoise", QwenImageAfterDenoiseStep()), - ("decode", QwenImageDecodeStep()), - ] -) - - -## 1.2 QwenImage/inpaint - -#### QwenImage/inpaint vae encoder -QwenImageInpaintVaeEncoderBlocks = InsertableDict( - [ - ( - "preprocess", - QwenImageInpaintProcessImagesInputStep, - ), # image, mask_image -> processed_image, processed_mask_image, mask_overlay_kwargs - ("encode", QwenImageVaeEncoderDynamicStep()), # processed_image -> image_latents - ] -) - - -class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageInpaintVaeEncoderBlocks.values() - block_names = QwenImageInpaintVaeEncoderBlocks.keys() - - @property - def description(self) -> str: - return ( - "This step is used for processing image and mask inputs for inpainting tasks. It:\n" - " - Resizes the image to the target size, based on `height` and `width`.\n" - " - Processes and updates `image` and `mask_image`.\n" - " - Creates `image_latents`." 
- ) - - -#### QwenImage/inpaint inputs -QwenImageInpaintInputBlocks = InsertableDict( - [ - ("text_inputs", QwenImageTextInputsStep()), # default step to process text embeddings - ( - "additional_inputs", - QwenImageInputsDynamicStep( - image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"] - ), - ), - ] -) - - -class QwenImageInpaintInputStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageInpaintInputBlocks.values() - block_names = QwenImageInpaintInputBlocks.keys() - - @property - def description(self): - return "Input step that prepares the inputs for the inpainting denoising step. It:\n" - " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents` and `processed_mask_image`).\n" - " - update height/width based `image_latents`, patchify `image_latents`." - - -# QwenImage/inpaint prepare latents -QwenImageInpaintPrepareLatentsBlocks = InsertableDict( - [ - ("add_noise_to_latents", QwenImagePrepareLatentsWithStrengthStep()), - ("create_mask_latents", QwenImageCreateMaskLatentsStep()), - ] -) - - -class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageInpaintPrepareLatentsBlocks.values() - block_names = QwenImageInpaintPrepareLatentsBlocks.keys() - - @property - def description(self) -> str: - return ( - "This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. 
It:\n" - " - Add noise to the image latents to create the latents input for the denoiser.\n" - " - Create the pachified latents `mask` based on the processedmask image.\n" - ) - - -#### QwenImage/inpaint decode -QwenImageInpaintDecodeBlocks = InsertableDict( - [ - ("decode", QwenImageDecoderStep()), - ("postprocess", QwenImageInpaintProcessImagesOutputStep()), - ] -) - - -class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageInpaintDecodeBlocks.values() - block_names = QwenImageInpaintDecodeBlocks.keys() - - @property - def description(self): - return "Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image." - - -#### QwenImage/inpaint presets -INPAINT_BLOCKS = InsertableDict( - [ - ("text_encoder", QwenImageTextEncoderStep()), - ("vae_encoder", QwenImageInpaintVaeEncoderStep()), - ("input", QwenImageInpaintInputStep()), - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()), - ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()), - ("prepare_rope_inputs", QwenImageRoPEInputsStep()), - ("denoise", QwenImageInpaintDenoiseStep()), - ("after_denoise", QwenImageAfterDenoiseStep()), - ("decode", QwenImageInpaintDecodeStep()), - ] -) - -## 1.3 QwenImage/img2img +# ==================== +# 1. TEXT ENCODER +# ==================== -#### QwenImage/img2img vae encoder -QwenImageImg2ImgVaeEncoderBlocks = InsertableDict( - [ - ("preprocess", QwenImageProcessImagesInputStep()), - ("encode", QwenImageVaeEncoderDynamicStep()), +class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): + """VL encoder that takes both image and text prompts. 
Uses 384x384 target area.""" + model_name = "qwenimage-edit-plus" + block_classes = [ + QwenImageEditPlusResizeDynamicStep(target_area=384 * 384, output_name="resized_cond_image"), + QwenImageEditPlusTextEncoderStep(), ] -) - - -class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): - model_name = "qwenimage" - - block_classes = QwenImageImg2ImgVaeEncoderBlocks.values() - block_names = QwenImageImg2ImgVaeEncoderBlocks.keys() + block_names = ["resize", "encode"] @property def description(self) -> str: - return "Vae encoder step that preprocess andencode the image inputs into their latent representations." - - -#### QwenImage/img2img inputs -QwenImageImg2ImgInputBlocks = InsertableDict( - [ - ("text_inputs", QwenImageTextInputsStep()), # default step to process text embeddings - ("additional_inputs", QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"])), - ] -) + return "QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together." -class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageImg2ImgInputBlocks.values() - block_names = QwenImageImg2ImgInputBlocks.keys() +# ==================== +# 2. VAE ENCODER +# ==================== - @property - def description(self): - return "Input step that prepares the inputs for the img2img denoising step. It:\n" - " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n" - " - update height/width based `image_latents`, patchify `image_latents`." 
- - -#### QwenImage/img2img presets -IMAGE2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", QwenImageTextEncoderStep()), - ("vae_encoder", QwenImageImg2ImgVaeEncoderStep()), - ("input", QwenImageImg2ImgInputStep()), - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()), - ("prepare_img2img_latents", QwenImagePrepareLatentsWithStrengthStep()), - ("prepare_rope_inputs", QwenImageRoPEInputsStep()), - ("denoise", QwenImageDenoiseStep()), - ("after_denoise", QwenImageAfterDenoiseStep()), - ("decode", QwenImageDecodeStep()), - ] -) - - -## 1.4 QwenImage/controlnet - -#### QwenImage/controlnet presets -CONTROLNET_BLOCKS = InsertableDict( - [ - ("controlnet_vae_encoder", QwenImageControlNetVaeEncoderStep()), # vae encoder step for control_image - ("controlnet_inputs", QwenImageControlNetInputsStep()), # additional input step for controlnet - ( - "controlnet_before_denoise", - QwenImageControlNetBeforeDenoiserStep(), - ), # before denoise step (after set_timesteps step) - ( - "controlnet_denoise_loop_before", - QwenImageLoopBeforeDenoiserControlNet(), - ), # controlnet loop step (insert before the denoiseloop_denoiser) - ] -) - - -## 1.5 QwenImage/auto encoders - - -#### for inpaint and img2img tasks -class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks): - block_classes = [QwenImageInpaintVaeEncoderStep, QwenImageImg2ImgVaeEncoderStep] - block_names = ["inpaint", "img2img"] - block_trigger_inputs = ["mask_image", "image"] - - @property - def description(self): - return ( - "Vae encoder step that encode the image inputs into their latent representations.\n" - + "This is an auto pipeline block.\n" - + " - `QwenImageInpaintVaeEncoderStep` (inpaint) is used when `mask_image` is provided.\n" - + " - `QwenImageImg2ImgVaeEncoderStep` (img2img) is used when `image` is provided.\n" - + " - if `mask_image` or `image` is not provided, step will be skipped." 
- ) - - -# for controlnet tasks -class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): - block_classes = [QwenImageControlNetVaeEncoderStep] - block_names = ["controlnet"] - block_trigger_inputs = ["control_image"] - - @property - def description(self): - return ( - "Vae encoder step that encode the image inputs into their latent representations.\n" - + "This is an auto pipeline block.\n" - + " - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided.\n" - + " - if `control_image` is not provided, step will be skipped." - ) - - -## 1.6 QwenImage/auto inputs - - -# text2image/inpaint/img2img -class QwenImageAutoInputStep(AutoPipelineBlocks): - block_classes = [QwenImageInpaintInputStep, QwenImageImg2ImgInputStep, QwenImageTextInputsStep] - block_names = ["inpaint", "img2img", "text2image"] - block_trigger_inputs = ["processed_mask_image", "image_latents", None] - - @property - def description(self): - return ( - "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. 
\n" - " This is an auto pipeline block that works for text2image/inpaint/img2img tasks.\n" - + " - `QwenImageInpaintInputStep` (inpaint) is used when `processed_mask_image` is provided.\n" - + " - `QwenImageImg2ImgInputStep` (img2img) is used when `image_latents` is provided.\n" - + " - `QwenImageTextInputsStep` (text2image) is used when both `processed_mask_image` and `image_latents` are not provided.\n" - ) - - -# controlnet -class QwenImageOptionalControlNetInputStep(AutoPipelineBlocks): - block_classes = [QwenImageControlNetInputsStep] - block_names = ["controlnet"] - block_trigger_inputs = ["control_image_latents"] - - @property - def description(self): - return ( - "Controlnet input step that prepare the control_image_latents input.\n" - + "This is an auto pipeline block.\n" - + " - `QwenImageControlNetInputsStep` (controlnet) is used when `control_image_latents` is provided.\n" - + " - if `control_image_latents` is not provided, step will be skipped." - ) - - -## 1.7 QwenImage/auto before denoise step -# compose the steps into a BeforeDenoiseStep for text2image/img2img/inpaint tasks before combine into an auto step - -# QwenImage/text2image before denoise -QwenImageText2ImageBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsStep()), - ("prepare_rope_inputs", QwenImageRoPEInputsStep()), - ] -) - - -class QwenImageText2ImageBeforeDenoiseStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageText2ImageBeforeDenoiseBlocks.values() - block_names = QwenImageText2ImageBeforeDenoiseBlocks.keys() - - @property - def description(self): - return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for text2image task." 
- - -# QwenImage/inpaint before denoise -QwenImageInpaintBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()), - ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()), - ("prepare_rope_inputs", QwenImageRoPEInputsStep()), - ] -) - - -class QwenImageInpaintBeforeDenoiseStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageInpaintBeforeDenoiseBlocks.values() - block_names = QwenImageInpaintBeforeDenoiseBlocks.keys() - - @property - def description(self): - return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task." - - -# QwenImage/img2img before denoise -QwenImageImg2ImgBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()), - ("prepare_img2img_latents", QwenImagePrepareLatentsWithStrengthStep()), - ("prepare_rope_inputs", QwenImageRoPEInputsStep()), - ] -) - - -class QwenImageImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageImg2ImgBeforeDenoiseBlocks.values() - block_names = QwenImageImg2ImgBeforeDenoiseBlocks.keys() - - @property - def description(self): - return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task." - - -# auto before_denoise step for text2image, inpaint, img2img tasks -class QwenImageAutoBeforeDenoiseStep(AutoPipelineBlocks): +class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): + """VAE encoder that handles multiple images with different sizes. 
Uses 1024x1024 target area.""" + model_name = "qwenimage-edit-plus" block_classes = [ - QwenImageInpaintBeforeDenoiseStep, - QwenImageImg2ImgBeforeDenoiseStep, - QwenImageText2ImageBeforeDenoiseStep, + QwenImageEditPlusResizeDynamicStep(target_area=1024 * 1024, output_name="resized_image"), + QwenImageEditPlusProcessImagesInputStep(), + QwenImageVaeEncoderDynamicStep(), ] - block_names = ["inpaint", "img2img", "text2image"] - block_trigger_inputs = ["processed_mask_image", "image_latents", None] + block_names = ["resize", "preprocess", "encode"] @property - def description(self): - return ( - "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step.\n" - + "This is an auto pipeline block that works for text2img, inpainting, img2img tasks.\n" - + " - `QwenImageInpaintBeforeDenoiseStep` (inpaint) is used when `processed_mask_image` is provided.\n" - + " - `QwenImageImg2ImgBeforeDenoiseStep` (img2img) is used when `image_latents` is provided.\n" - + " - `QwenImageText2ImageBeforeDenoiseStep` (text2image) is used when both `processed_mask_image` and `image_latents` are not provided.\n" - ) - - -# auto before_denoise step for controlnet tasks -class QwenImageOptionalControlNetBeforeDenoiseStep(AutoPipelineBlocks): - block_classes = [QwenImageControlNetBeforeDenoiserStep] - block_names = ["controlnet"] - block_trigger_inputs = ["control_image_latents"] - - @property - def description(self): + def description(self) -> str: return ( - "Controlnet before denoise step that prepare the controlnet input.\n" - + "This is an auto pipeline block.\n" - + " - `QwenImageControlNetBeforeDenoiserStep` (controlnet) is used when `control_image_latents` is provided.\n" - + " - if `control_image_latents` is not provided, step will be skipped." + "VAE encoder step that encodes image inputs into latent representations.\n" + "Each image is resized independently based on its own aspect ratio to 1024x1024 target area." 
) -## 1.8 QwenImage/auto denoise - +# ==================== +# 3. DENOISE - input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise +# ==================== -# auto denoise step for controlnet tasks: works for all tasks with controlnet -class QwenImageControlNetAutoDenoiseStep(AutoPipelineBlocks): - block_classes = [QwenImageInpaintControlNetDenoiseStep, QwenImageControlNetDenoiseStep] - block_names = ["inpaint_denoise", "denoise"] - block_trigger_inputs = ["mask", None] - - @property - def description(self): - return ( - "Controlnet step during the denoising process. \n" - " This is an auto pipeline block that works for inpaint and text2image/img2img tasks with controlnet.\n" - + " - `QwenImageInpaintControlNetDenoiseStep` (inpaint) is used when `mask` is provided.\n" - + " - `QwenImageControlNetDenoiseStep` (text2image/img2img) is used when `mask` is not provided.\n" - ) - - -# auto denoise step for everything: works for all tasks with or without controlnet -class QwenImageAutoDenoiseStep(AutoPipelineBlocks): +# Edit Plus input step +class QwenImageEditPlusInputStep(SequentialPipelineBlocks): + model_name = "qwenimage-edit-plus" block_classes = [ - QwenImageControlNetAutoDenoiseStep, - QwenImageInpaintDenoiseStep, - QwenImageDenoiseStep, + QwenImageTextInputsStep(), + QwenImageEditPlusInputsDynamicStep(image_latent_inputs=["image_latents"]), ] - block_names = ["controlnet_denoise", "inpaint_denoise", "denoise"] - block_trigger_inputs = ["control_image_latents", "mask", None] - - @property - def description(self): - return ( - "Denoise step that iteratively denoise the latents. \n" - " This is an auto pipeline block that works for inpaint/text2image/img2img tasks. 
It also works with controlnet\n" - + " - `QwenImageControlNetAutoDenoiseStep` (controlnet) is used when `control_image_latents` is provided.\n" - + " - `QwenImageInpaintDenoiseStep` (inpaint) is used when `mask` is provided and `control_image_latents` is not provided.\n" - + " - `QwenImageDenoiseStep` (text2image/img2img) is used when `mask` is not provided and `control_image_latents` is not provided.\n" - ) - - -## 1.9 QwenImage/auto decode -# auto decode step for inpaint and text2image tasks - - -class QwenImageAutoDecodeStep(AutoPipelineBlocks): - block_classes = [QwenImageInpaintDecodeStep, QwenImageDecodeStep] - block_names = ["inpaint_decode", "decode"] - block_trigger_inputs = ["mask", None] + block_names = ["text_inputs", "additional_inputs"] @property def description(self): return ( - "Decode step that decode the latents into images. \n" - " This is an auto pipeline block that works for inpaint/text2image/img2img tasks, for both QwenImage and QwenImage-Edit.\n" - + " - `QwenImageInpaintDecodeStep` (inpaint) is used when `mask` is provided.\n" - + " - `QwenImageDecodeStep` (text2image/img2img) is used when `mask` is not provided.\n" + "Input step that prepares the inputs for the Edit Plus denoising step. It:\n" + " - Standardizes text embeddings batch size.\n" + " - Processes list of image latents: patchifies, concatenates along dim=1, expands batch.\n" + " - Outputs lists of image_height/image_width for RoPE calculation.\n" + " - Defaults height/width from last image in the list." 
) -class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): - model_name = "qwenimage" +# Edit Plus core denoise +class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): + model_name = "qwenimage-edit-plus" block_classes = [ - QwenImageAutoInputStep, - QwenImageOptionalControlNetInputStep, - QwenImageAutoBeforeDenoiseStep, - QwenImageOptionalControlNetBeforeDenoiseStep, - QwenImageAutoDenoiseStep, - QwenImageAfterDenoiseStep, + QwenImageEditPlusInputStep(), + QwenImagePrepareLatentsStep(), + QwenImageSetTimestepsStep(), + QwenImageEditPlusRoPEInputsStep(), + QwenImageEditDenoiseStep(), + QwenImageAfterDenoiseStep(), ] block_names = [ "input", - "controlnet_input", - "before_denoise", - "controlnet_before_denoise", + "prepare_latents", + "set_timesteps", + "prepare_rope_inputs", "denoise", "after_denoise", ] @property def description(self): - return ( - "Core step that performs the denoising process. \n" - + " - `QwenImageAutoInputStep` (input) standardizes the inputs for the denoising step.\n" - + " - `QwenImageOptionalControlNetInputStep` (controlnet_input) prepares the controlnet input.\n" - + " - `QwenImageAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" - + " - `QwenImageOptionalControlNetBeforeDenoiseStep` (controlnet_before_denoise) prepares the controlnet input for the denoising step.\n" - + " - `QwenImageAutoDenoiseStep` (denoise) iteratively denoises the latents.\n" - + "This step support text-to-image, image-to-image, inpainting, and controlnet tasks for QwenImage:\n" - + " - for image-to-image generation, you need to provide `image_latents`\n" - + " - for inpainting, you need to provide `processed_mask_image` and `image_latents`\n" - + " - to run the controlnet workflow, you need to provide `control_image_latents`\n" - + " - for text-to-image generation, all you need to provide is prompt embeddings" - ) - - -## 1.10 QwenImage/auto block & presets -AUTO_BLOCKS = InsertableDict( - [ - ("text_encoder", 
QwenImageTextEncoderStep()), - ("vae_encoder", QwenImageAutoVaeEncoderStep()), - ("controlnet_vae_encoder", QwenImageOptionalControlNetVaeEncoderStep()), - ("denoise", QwenImageCoreDenoiseStep()), - ("decode", QwenImageAutoDecodeStep()), - ] -) - - -class QwenImageAutoBlocks(SequentialPipelineBlocks): - model_name = "qwenimage" - - block_classes = AUTO_BLOCKS.values() - block_names = AUTO_BLOCKS.keys() - - @property - def description(self): - return ( - "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.\n" - + "- for image-to-image generation, you need to provide `image`\n" - + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n" - + "- to run the controlnet workflow, you need to provide `control_image`\n" - + "- for text-to-image generation, all you need to provide is `prompt`" - ) - - -# 2. QwenImage-Edit - -## 2.1 QwenImage-Edit/edit - -#### QwenImage-Edit/edit vl encoder: take both image and text prompts -QwenImageEditVLEncoderBlocks = InsertableDict( - [ - ("resize", QwenImageEditResizeDynamicStep()), - ("encode", QwenImageEditTextEncoderStep()), - ] -) - - -class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageEditVLEncoderBlocks.values() - block_names = QwenImageEditVLEncoderBlocks.keys() - - @property - def description(self) -> str: - return "QwenImage-Edit VL encoder step that encode the image an text prompts together." 
- - -#### QwenImage-Edit/edit vae encoder -QwenImageEditVaeEncoderBlocks = InsertableDict( - [ - ("resize", QwenImageEditResizeDynamicStep()), # edit has a different resize step - ("preprocess", QwenImageProcessImagesInputStep()), # resized_image -> processed_image - ("encode", QwenImageVaeEncoderDynamicStep()), # processed_image -> image_latents - ] -) - - -class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageEditVaeEncoderBlocks.values() - block_names = QwenImageEditVaeEncoderBlocks.keys() - - @property - def description(self) -> str: - return "Vae encoder step that encode the image inputs into their latent representations." - - -#### QwenImage-Edit/edit input -QwenImageEditInputBlocks = InsertableDict( - [ - ("text_inputs", QwenImageTextInputsStep()), # default step to process text embeddings - ("additional_inputs", QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"])), - ] -) - - -class QwenImageEditInputStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageEditInputBlocks.values() - block_names = QwenImageEditInputBlocks.keys() - - @property - def description(self): - return "Input step that prepares the inputs for the edit denoising step. It:\n" - " - make sure the text embeddings have consistent batch size as well as the additional inputs: \n" - " - `image_latents`.\n" - " - update height/width based `image_latents`, patchify `image_latents`." 
- - -#### QwenImage/edit presets -EDIT_BLOCKS = InsertableDict( - [ - ("text_encoder", QwenImageEditVLEncoderStep()), - ("vae_encoder", QwenImageEditVaeEncoderStep()), - ("input", QwenImageEditInputStep()), - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsStep()), - ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()), - ("denoise", QwenImageEditDenoiseStep()), - ("after_denoise", QwenImageAfterDenoiseStep()), - ("decode", QwenImageDecodeStep()), - ] -) - - -## 2.2 QwenImage-Edit/edit inpaint - -#### QwenImage-Edit/edit inpaint vae encoder: the difference from regular inpaint is the resize step -QwenImageEditInpaintVaeEncoderBlocks = InsertableDict( - [ - ("resize", QwenImageEditResizeDynamicStep()), # image -> resized_image - ( - "preprocess", - QwenImageInpaintProcessImagesInputStep, - ), # resized_image, mask_image -> processed_image, processed_mask_image, mask_overlay_kwargs - ( - "encode", - QwenImageVaeEncoderDynamicStep(input_name="processed_image", output_name="image_latents"), - ), # processed_image -> image_latents - ] -) - - -class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageEditInpaintVaeEncoderBlocks.values() - block_names = QwenImageEditInpaintVaeEncoderBlocks.keys() - - @property - def description(self) -> str: - return ( - "This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It:\n" - " - resize the image for target area (1024 * 1024) while maintaining the aspect ratio.\n" - " - process the resized image and mask image.\n" - " - create image latents." 
- ) - - -#### QwenImage-Edit/edit inpaint presets -EDIT_INPAINT_BLOCKS = InsertableDict( - [ - ("text_encoder", QwenImageEditVLEncoderStep()), - ("vae_encoder", QwenImageEditInpaintVaeEncoderStep()), - ("input", QwenImageInpaintInputStep()), - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()), - ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()), - ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()), - ("denoise", QwenImageEditInpaintDenoiseStep()), - ("after_denoise", QwenImageAfterDenoiseStep()), - ("decode", QwenImageInpaintDecodeStep()), - ] -) - - -## 2.3 QwenImage-Edit/auto encoders - - -class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks): - block_classes = [ - QwenImageEditInpaintVaeEncoderStep, - QwenImageEditVaeEncoderStep, - ] - block_names = ["edit_inpaint", "edit"] - block_trigger_inputs = ["mask_image", "image"] - - @property - def description(self): - return ( - "Vae encoder step that encode the image inputs into their latent representations. \n" - " This is an auto pipeline block that works for edit and edit_inpaint tasks.\n" - + " - `QwenImageEditInpaintVaeEncoderStep` (edit_inpaint) is used when `mask_image` is provided.\n" - + " - `QwenImageEditVaeEncoderStep` (edit) is used when `image` is provided.\n" - + " - if `mask_image` or `image` is not provided, step will be skipped." 
- ) - - -## 2.4 QwenImage-Edit/auto inputs -class QwenImageEditAutoInputStep(AutoPipelineBlocks): - block_classes = [QwenImageInpaintInputStep, QwenImageEditInputStep] - block_names = ["edit_inpaint", "edit"] - block_trigger_inputs = ["processed_mask_image", "image_latents"] - - @property - def description(self): - return ( - "Input step that prepares the inputs for the edit denoising step.\n" - + " It is an auto pipeline block that works for edit and edit_inpaint tasks.\n" - + " - `QwenImageInpaintInputStep` (edit_inpaint) is used when `processed_mask_image` is provided.\n" - + " - `QwenImageEditInputStep` (edit) is used when `image_latents` is provided.\n" - + " - if `processed_mask_image` or `image_latents` is not provided, step will be skipped." - ) - - -## 2.5 QwenImage-Edit/auto before denoise -# compose the steps into a BeforeDenoiseStep for edit and edit_inpaint tasks before combine into an auto step - -#### QwenImage-Edit/edit before denoise -QwenImageEditBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsStep()), - ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()), - ] -) - - -class QwenImageEditBeforeDenoiseStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageEditBeforeDenoiseBlocks.values() - block_names = QwenImageEditBeforeDenoiseBlocks.keys() - - @property - def description(self): - return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for edit task." 
- - -#### QwenImage-Edit/edit inpaint before denoise -QwenImageEditInpaintBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()), - ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()), - ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()), - ] -) - - -class QwenImageEditInpaintBeforeDenoiseStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageEditInpaintBeforeDenoiseBlocks.values() - block_names = QwenImageEditInpaintBeforeDenoiseBlocks.keys() - - @property - def description(self): - return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for edit inpaint task." - - -# auto before_denoise step for edit and edit_inpaint tasks -class QwenImageEditAutoBeforeDenoiseStep(AutoPipelineBlocks): - model_name = "qwenimage-edit" - block_classes = [ - QwenImageEditInpaintBeforeDenoiseStep, - QwenImageEditBeforeDenoiseStep, - ] - block_names = ["edit_inpaint", "edit"] - block_trigger_inputs = ["processed_mask_image", "image_latents"] - - @property - def description(self): - return ( - "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step.\n" - + "This is an auto pipeline block that works for edit (img2img) and edit inpaint tasks.\n" - + " - `QwenImageEditInpaintBeforeDenoiseStep` (edit_inpaint) is used when `processed_mask_image` is provided.\n" - + " - `QwenImageEditBeforeDenoiseStep` (edit) is used when `image_latents` is provided and `processed_mask_image` is not provided.\n" - + " - if `image_latents` or `processed_mask_image` is not provided, step will be skipped." - ) - + return "Core denoising workflow for QwenImage-Edit Plus edit (img2img) task." -## 2.6 QwenImage-Edit/auto denoise +# ==================== +# 4. 
DECODE +# ==================== -class QwenImageEditAutoDenoiseStep(AutoPipelineBlocks): - model_name = "qwenimage-edit" - - block_classes = [QwenImageEditInpaintDenoiseStep, QwenImageEditDenoiseStep] - block_names = ["inpaint_denoise", "denoise"] - block_trigger_inputs = ["processed_mask_image", "image_latents"] - - @property - def description(self): - return ( - "Denoise step that iteratively denoise the latents. \n" - + "This block supports edit (img2img) and edit inpaint tasks for QwenImage Edit. \n" - + " - `QwenImageEditInpaintDenoiseStep` (inpaint) is used when `processed_mask_image` is provided.\n" - + " - `QwenImageEditDenoiseStep` (img2img) is used when `image_latents` is provided.\n" - + " - if `processed_mask_image` or `image_latents` is not provided, step will be skipped." - ) - - -## 2.7 QwenImage-Edit/auto blocks & presets - - -class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): - model_name = "qwenimage-edit" - block_classes = [ - QwenImageEditAutoInputStep, - QwenImageEditAutoBeforeDenoiseStep, - QwenImageEditAutoDenoiseStep, - QwenImageAfterDenoiseStep, - ] - block_names = ["input", "before_denoise", "denoise", "after_denoise"] - - @property - def description(self): - return ( - "Core step that performs the denoising process. 
\n" - + " - `QwenImageEditAutoInputStep` (input) standardizes the inputs for the denoising step.\n" - + " - `QwenImageEditAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" - + " - `QwenImageEditAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n" - + "This step support edit (img2img) and edit inpainting workflow for QwenImage Edit:\n" - + " - When `processed_mask_image` is provided, it will be used for edit inpainting task.\n" - + " - When `image_latents` is provided, it will be used for edit (img2img) task.\n" - ) - - -EDIT_AUTO_BLOCKS = InsertableDict( - [ - ("text_encoder", QwenImageEditVLEncoderStep()), - ("vae_encoder", QwenImageEditAutoVaeEncoderStep()), - ("denoise", QwenImageEditCoreDenoiseStep()), - ("decode", QwenImageAutoDecodeStep()), - ] -) - - -class QwenImageEditAutoBlocks(SequentialPipelineBlocks): - model_name = "qwenimage-edit" - block_classes = EDIT_AUTO_BLOCKS.values() - block_names = EDIT_AUTO_BLOCKS.keys() - - @property - def description(self): - return ( - "Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.\n" - + "- for edit (img2img) generation, you need to provide `image`\n" - + "- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n" - ) - - -#################### QwenImage Edit Plus ##################### - -# 3. 
QwenImage-Edit Plus - -## 3.1 QwenImage-Edit Plus / edit - -#### QwenImage-Edit Plus vl encoder: take both image and text prompts -QwenImageEditPlusVLEncoderBlocks = InsertableDict( - [ - ("resize", QwenImageEditPlusResizeDynamicStep()), - ("encode", QwenImageEditPlusTextEncoderStep()), - ] -) - - -class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageEditPlusVLEncoderBlocks.values() - block_names = QwenImageEditPlusVLEncoderBlocks.keys() - - @property - def description(self) -> str: - return "QwenImage-Edit Plus VL encoder step that encode the image an text prompts together." - - -#### QwenImage-Edit Plus vae encoder -QwenImageEditPlusVaeEncoderBlocks = InsertableDict( - [ - ("resize", QwenImageEditPlusResizeDynamicStep()), # edit plus has a different resize step - ("preprocess", QwenImageEditPlusProcessImagesInputStep()), # vae_image -> processed_image - ("encode", QwenImageEditPlusVaeEncoderDynamicStep()), # processed_image -> image_latents - ] -) - - -class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): +class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): model_name = "qwenimage-edit-plus" - block_classes = QwenImageEditPlusVaeEncoderBlocks.values() - block_names = QwenImageEditPlusVaeEncoderBlocks.keys() - - @property - def description(self) -> str: - return "Vae encoder step that encode the image inputs into their latent representations." 
- - -#### QwenImage Edit Plus input blocks -QwenImageEditPlusInputBlocks = InsertableDict( - [ - ("text_inputs", QwenImageTextInputsStep()), # default step to process text embeddings - ( - "additional_inputs", - QwenImageEditPlusInputsDynamicStep(image_latent_inputs=["image_latents"]), - ), - ] -) - - -class QwenImageEditPlusInputStep(SequentialPipelineBlocks): - model_name = "qwenimage-edit-plus" - block_classes = QwenImageEditPlusInputBlocks.values() - block_names = QwenImageEditPlusInputBlocks.keys() - - -#### QwenImage Edit Plus presets -EDIT_PLUS_BLOCKS = InsertableDict( - [ - ("text_encoder", QwenImageEditPlusVLEncoderStep()), - ("vae_encoder", QwenImageEditPlusVaeEncoderStep()), - ("input", QwenImageEditPlusInputStep()), - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsStep()), - ("prepare_rope_inputs", QwenImageEditPlusRoPEInputsStep()), - ("denoise", QwenImageEditDenoiseStep()), - ("after_denoise", QwenImageAfterDenoiseStep()), - ("decode", QwenImageDecodeStep()), - ] -) - - -QwenImageEditPlusBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsStep()), - ("prepare_rope_inputs", QwenImageEditPlusRoPEInputsStep()), - ] -) - - -class QwenImageEditPlusBeforeDenoiseStep(SequentialPipelineBlocks): - model_name = "qwenimage-edit-plus" - block_classes = QwenImageEditPlusBeforeDenoiseBlocks.values() - block_names = QwenImageEditPlusBeforeDenoiseBlocks.keys() + block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] + block_names = ["decode", "postprocess"] @property def description(self): - return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for edit task." 
- - -# auto before_denoise step for edit tasks -class QwenImageEditPlusAutoBeforeDenoiseStep(AutoPipelineBlocks): - model_name = "qwenimage-edit-plus" - block_classes = [QwenImageEditPlusBeforeDenoiseStep] - block_names = ["edit"] - block_trigger_inputs = ["image_latents"] + return "Decode step that decodes the latents to images and postprocesses the generated image." - @property - def description(self): - return ( - "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step.\n" - + "This is an auto pipeline block that works for edit (img2img) task.\n" - + " - `QwenImageEditPlusBeforeDenoiseStep` (edit) is used when `image_latents` is provided and `processed_mask_image` is not provided.\n" - + " - if `image_latents` is not provided, step will be skipped." - ) - - -## 3.2 QwenImage-Edit Plus/auto encoders - - -class QwenImageEditPlusAutoVaeEncoderStep(AutoPipelineBlocks): - block_classes = [QwenImageEditPlusVaeEncoderStep] - block_names = ["edit"] - block_trigger_inputs = ["image"] - - @property - def description(self): - return ( - "Vae encoder step that encode the image inputs into their latent representations. \n" - " This is an auto pipeline block that works for edit task.\n" - + " - `QwenImageEditPlusVaeEncoderStep` (edit) is used when `image` is provided.\n" - + " - if `image` is not provided, step will be skipped." - ) - - -## 3.3 QwenImage-Edit/auto blocks & presets - - -class QwenImageEditPlusAutoInputStep(AutoPipelineBlocks): - block_classes = [QwenImageEditPlusInputStep] - block_names = ["edit"] - block_trigger_inputs = ["image_latents"] - - @property - def description(self): - return ( - "Input step that prepares the inputs for the edit denoising step.\n" - + " It is an auto pipeline block that works for edit task.\n" - + " - `QwenImageEditPlusInputStep` (edit) is used when `image_latents` is provided.\n" - + " - if `image_latents` is not provided, step will be skipped." 
- ) - - -class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): - model_name = "qwenimage-edit-plus" - block_classes = [ - QwenImageEditPlusAutoInputStep, - QwenImageEditPlusAutoBeforeDenoiseStep, - QwenImageEditAutoDenoiseStep, - QwenImageAfterDenoiseStep, - ] - block_names = ["input", "before_denoise", "denoise", "after_denoise"] - - @property - def description(self): - return ( - "Core step that performs the denoising process. \n" - + " - `QwenImageEditAutoInputStep` (input) standardizes the inputs for the denoising step.\n" - + " - `QwenImageEditPlusAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" - + " - `QwenImageEditAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n" - + "This step support edit (img2img) workflow for QwenImage Edit Plus:\n" - + " - When `image_latents` is provided, it will be used for edit (img2img) task.\n" - ) +# ==================== +# 5. AUTO BLOCKS & PRESETS +# ==================== EDIT_PLUS_AUTO_BLOCKS = InsertableDict( [ ("text_encoder", QwenImageEditPlusVLEncoderStep()), - ("vae_encoder", QwenImageEditPlusAutoVaeEncoderStep()), + ("vae_encoder", QwenImageEditPlusVaeEncoderStep()), ("denoise", QwenImageEditPlusCoreDenoiseStep()), - ("decode", QwenImageAutoDecodeStep()), + ("decode", QwenImageEditPlusDecodeStep()), ] ) @@ -1091,23 +168,8 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): @property def description(self): return ( - "Auto Modular pipeline for edit (img2img) and edit tasks using QwenImage-Edit Plus.\n" - + "- for edit (img2img) generation, you need to provide `image`\n" - ) - - -# 3. 
all block presets supported in QwenImage, QwenImage-Edit, QwenImage-Edit Plus - - -ALL_BLOCKS = { - "text2image": TEXT2IMAGE_BLOCKS, - "img2img": IMAGE2IMAGE_BLOCKS, - "edit": EDIT_BLOCKS, - "edit_inpaint": EDIT_INPAINT_BLOCKS, - "edit_plus": EDIT_PLUS_BLOCKS, - "inpaint": INPAINT_BLOCKS, - "controlnet": CONTROLNET_BLOCKS, - "auto": AUTO_BLOCKS, - "edit_auto": EDIT_AUTO_BLOCKS, - "edit_plus_auto": EDIT_PLUS_AUTO_BLOCKS, -} + "Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus.\n" + "- `image` is required input (can be single image or list of images).\n" + "- Each image is resized independently based on its own aspect ratio.\n" + "- VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area." + ) \ No newline at end of file From 8a16d854ff566965d043ab88433b4f42a1ae785f Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 6 Jan 2026 19:37:56 +0100 Subject: [PATCH 04/12] add layered --- .../qwenimage/before_denoise.py | 116 +++++++ .../modular_pipelines/qwenimage/decoders.py | 123 +++++++- .../modular_pipelines/qwenimage/encoders.py | 202 +++++++++--- .../qwenimage/modular_blocks_qwenimage.py | 6 +- .../modular_blocks_qwenimage_edit.py | 14 +- .../modular_blocks_qwenimage_edit_plus.py | 10 +- .../modular_blocks_qwenimage_layered.py | 298 ++++++++++++++++++ .../qwenimage/modular_pipeline.py | 68 ++++ .../qwenimage/prompt_templates.py | 121 +++++++ 9 files changed, 901 insertions(+), 57 deletions(-) create mode 100644 src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py create mode 100644 src/diffusers/modular_pipelines/qwenimage/prompt_templates.py diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index 55968bd4fc93..4fbcfd207a66 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -420,6 +420,64 @@ def __call__(self, components: 
QwenImageModularPipeline, state: PipelineState) - return components, state +class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks): + model_name = "qwenimage-layered" + + @property + def description(self) -> str: + return "Set timesteps step for QwenImage Layered with custom mu calculation based on image_latents." + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam("num_inference_steps", default=50, type_hint=int), + InputParam("sigmas", type_hint=List[float]), + InputParam("image_latents", required=True, type_hint=torch.Tensor), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam(name="timesteps", type_hint=torch.Tensor), + ] + + @torch.no_grad() + def __call__(self, components, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + device = components._execution_device + + # Layered-specific mu calculation + base_seqlen = 256 * 256 / 16 / 16 # = 256 + mu = (block_state.image_latents.shape[1] / base_seqlen) ** 0.5 + + # Default sigmas if not provided + sigmas = ( + np.linspace(1.0, 1 / block_state.num_inference_steps, block_state.num_inference_steps) + if block_state.sigmas is None + else block_state.sigmas + ) + + block_state.timesteps, block_state.num_inference_steps = retrieve_timesteps( + components.scheduler, + block_state.num_inference_steps, + device, + sigmas=sigmas, + mu=mu, + ) + + components.scheduler.set_begin_index(0) + + self.set_block_state(state, block_state) + return components, state + + class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks): model_name = "qwenimage" @@ -716,6 +774,64 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks): + model_name = 
"qwenimage-layered" + + @property + def description(self) -> str: + return "Step that prepares the RoPE inputs for the denoising process. Should be place after prepare_latents step" + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam(name="batch_size", required=True), + InputParam(name="layers", required=True), + InputParam(name="height", required=True), + InputParam(name="width", required=True), + InputParam(name="prompt_embeds_mask"), + InputParam(name="negative_prompt_embeds_mask"), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam(name="img_shapes", type_hint=List[List[Tuple[int, int, int]]], kwargs_type="denoiser_input_fields", description="The shapes of the image latents, used for RoPE calculation"), + OutputParam(name="txt_seq_lens", type_hint=List[int], kwargs_type="denoiser_input_fields", description="The sequence lengths of the prompt embeds, used for RoPE calculation"), + OutputParam(name="negative_txt_seq_lens", type_hint=List[int], kwargs_type="denoiser_input_fields", description="The sequence lengths of the negative prompt embeds, used for RoPE calculation"), + OutputParam(name="additional_t_cond", type_hint=torch.Tensor, kwargs_type="denoiser_input_fields", description="The additional t cond, used for RoPE calculation"), + ] + + @torch.no_grad() + def __call__(self, components, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + device = components._execution_device + + # All shapes are the same for Layered + shape = (1, block_state.height // components.vae_scale_factor // 2, block_state.width // components.vae_scale_factor // 2) + + # layers+1 output shapes + 1 condition shape (all same) + block_state.img_shapes = [ + [shape] * (block_state.layers + 2) + ] * block_state.batch_size + + # txt_seq_lens + block_state.txt_seq_lens = ( + block_state.prompt_embeds_mask.sum(dim=1).tolist() + if block_state.prompt_embeds_mask is not None else None + 
) + block_state.negative_txt_seq_lens = ( + block_state.negative_prompt_embeds_mask.sum(dim=1).tolist() + if block_state.negative_prompt_embeds_mask is not None else None + ) + + + block_state.additional_t_cond = torch.tensor([0] * block_state.batch_size).to(device=device, dtype=torch.long) + + self.set_block_state(state, block_state) + return components, state + + ## ControlNet inputs for denoiser class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks): model_name = "qwenimage" diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 6e145f18550a..5f1281564b1d 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -24,7 +24,7 @@ from ...utils import logging from ..modular_pipeline import ModularPipelineBlocks, PipelineState from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam -from .modular_pipeline import QwenImageModularPipeline, QwenImagePachifier +from .modular_pipeline import QwenImageModularPipeline, QwenImagePachifier, QwenImageLayeredPachifier logger = logging.get_logger(__name__) @@ -71,6 +71,47 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks): + """Unpack latents after denoising for Layered.""" + + model_name = "qwenimage-layered" + + @property + def description(self) -> str: + return "Unpack latents from (B, seq, C*4) to (B, C, layers+1, H, W) after denoising." 
+ + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec("layered_pachifier", QwenImageLayeredPachifier, default_creation_method="from_config"), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam("latents", required=True, type_hint=torch.Tensor), + InputParam("height", required=True, type_hint=int), + InputParam("width", required=True, type_hint=int), + InputParam("layers", required=True, type_hint=int), + ] + + @torch.no_grad() + def __call__(self, components, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + # Unpack: (B, seq, C*4) -> (B, C, layers+1, H, W) + block_state.latents = components.layered_pachifier.unpack_latents( + block_state.latents, + block_state.height, + block_state.width, + block_state.layers, + components.vae_scale_factor, + ) + + self.set_block_state(state, block_state) + return components, state + + class QwenImageDecoderStep(ModularPipelineBlocks): model_name = "qwenimage" @@ -135,6 +176,86 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +class QwenImageLayeredDecoderStep(ModularPipelineBlocks): + """Decode unpacked layered latents into multiple layer images.""" + + model_name = "qwenimage-layered" + + @property + def description(self) -> str: + return "Decode unpacked latents (B, C, layers+1, H, W) into layer images." 
+ + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec("vae", AutoencoderKLQwenImage), + ComponentSpec( + "image_processor", + VaeImageProcessor, + config=FrozenDict({"vae_scale_factor": 16}), + default_creation_method="from_config", + ), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam("latents", required=True, type_hint=torch.Tensor), + InputParam("layers", required=True, type_hint=int), + InputParam("output_type", default="pil", type_hint=str), + ] + + @property + def outputs(self) -> List[OutputParam]: + return [ + OutputParam(name="images", type_hint=List[List[PIL.Image.Image]]), + ] + + @torch.no_grad() + def __call__(self, components, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + latents = block_state.latents + layers = block_state.layers + + # 1. VAE normalization + latents = latents.to(components.vae.dtype) + latents_mean = ( + torch.tensor(components.vae.config.latents_mean) + .view(1, components.vae.config.z_dim, 1, 1, 1) + .to(latents.device, latents.dtype) + ) + latents_std = ( + 1.0 / torch.tensor(components.vae.config.latents_std) + .view(1, components.vae.config.z_dim, 1, 1, 1) + .to(latents.device, latents.dtype) + ) + latents = latents / latents_std + latents_mean + + # 2. Remove first frame (composite), keep layers frames + latents = latents[:, :, 1:] + + # 3. Reshape for batch decoding: (B, C, layers, H, W) -> (B*layers, C, 1, H, W) + b, c, f, h, w = latents.shape + latents = latents.permute(0, 2, 1, 3, 4).reshape(b * f, c, 1, h, w) + + # 4. Decode: (B*layers, C, 1, H, W) -> (B*layers, C, H, W) + image = components.vae.decode(latents, return_dict=False)[0][:, :, 0] + + # 5. Postprocess - returns flat list of B*layers images + image = components.image_processor.postprocess(image, output_type=block_state.output_type) + + # 6. 
Chunk into list per batch item + images = [] + for bidx in range(b): + images.append(image[bidx * f : (bidx + 1) * f]) + + block_state.images = images + + self.set_block_state(state, block_state) + return components, state + + class QwenImageProcessImagesOutputStep(ModularPipelineBlocks): model_name = "qwenimage" diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 01385d38c99e..c7e9d1f114cf 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +""" +Text and VAE encoder blocks for QwenImage pipelines. +""" + from typing import Dict, List, Optional, Union import PIL @@ -28,6 +32,17 @@ from ..modular_pipeline import ModularPipelineBlocks, PipelineState from ..modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam, OutputParam from .modular_pipeline import QwenImageModularPipeline +from .prompt_templates import ( + QWENIMAGE_PROMPT_TEMPLATE, + QWENIMAGE_PROMPT_TEMPLATE_START_IDX, + QWENIMAGE_EDIT_PROMPT_TEMPLATE, + QWENIMAGE_EDIT_PROMPT_TEMPLATE_START_IDX, + QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE, + QWENIMAGE_EDIT_PLUS_IMG_TEMPLATE, + QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE_START_IDX, + QWENIMAGE_LAYERED_CAPTION_PROMPT_EN, + QWENIMAGE_LAYERED_CAPTION_PROMPT_CN, +) logger = logging.get_logger(__name__) @@ -45,8 +60,8 @@ def get_qwen_prompt_embeds( text_encoder, tokenizer, prompt: Union[str, List[str]] = None, - prompt_template_encode: str = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n", - prompt_template_encode_start_idx: int = 34, + prompt_template_encode: str = QWENIMAGE_PROMPT_TEMPLATE, + 
prompt_template_encode_start_idx: int = QWENIMAGE_PROMPT_TEMPLATE_START_IDX, tokenizer_max_length: int = 1024, device: Optional[torch.device] = None, ): @@ -86,8 +101,8 @@ def get_qwen_prompt_embeds_edit( processor, prompt: Union[str, List[str]] = None, image: Optional[torch.Tensor] = None, - prompt_template_encode: str = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n", - prompt_template_encode_start_idx: int = 64, + prompt_template_encode: str = QWENIMAGE_EDIT_PROMPT_TEMPLATE, + prompt_template_encode_start_idx: int = QWENIMAGE_EDIT_PROMPT_TEMPLATE_START_IDX, device: Optional[torch.device] = None, ): prompt = [prompt] if isinstance(prompt, str) else prompt @@ -133,9 +148,9 @@ def get_qwen_prompt_embeds_edit_plus( processor, prompt: Union[str, List[str]] = None, image: Optional[Union[torch.Tensor, List[PIL.Image.Image], PIL.Image.Image]] = None, - prompt_template_encode: str = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. 
Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n", - img_template_encode: str = "Picture {}: <|vision_start|><|image_pad|><|vision_end|>", - prompt_template_encode_start_idx: int = 64, + prompt_template_encode: str = QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE, + img_template_encode: str = QWENIMAGE_EDIT_PLUS_IMG_TEMPLATE, + prompt_template_encode_start_idx: int = QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE_START_IDX, device: Optional[torch.device] = None, ): prompt = [prompt] if isinstance(prompt, str) else prompt @@ -241,14 +256,13 @@ def encode_vae_image( return image_latents -class QwenImageEditResizeDynamicStep(ModularPipelineBlocks): +class QwenImageEditResizeStep(ModularPipelineBlocks): model_name = "qwenimage" def __init__( self, input_name: str = "image", output_name: str = "resized_image", - target_area: int = 1024 * 1024, ): """Create a configurable step for resizing images to the target area while maintaining the aspect ratio. Args: @@ -256,7 +270,6 @@ def __init__( pipeline state. Defaults to "image". output_name (str, optional): Name of the resized image field to write back to the pipeline state. Defaults to "resized_image". - target_area (int, optional): Target area in pixels. Defaults to 1024*1024. """ if not isinstance(input_name, str) or not isinstance(output_name, str): raise ValueError( @@ -264,12 +277,11 @@ def __init__( ) self._image_input_name = input_name self._resized_image_output_name = output_name - self._target_area = target_area super().__init__() @property def description(self) -> str: - return f"Image Resize step that resize the {self._image_input_name} to the target area {self._target_area} while maintaining the aspect ratio." + return f"Image Resize step that resize the {self._image_input_name} to target area while maintaining the aspect ratio." 
@property def expected_components(self) -> List[ComponentSpec]: @@ -288,6 +300,9 @@ def inputs(self) -> List[InputParam]: InputParam( name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image to resize" ), + InputParam( + name="target_area", default=1024 * 1024, type_hint=int, description="The target area to resize the image to" + ), ] @property @@ -311,7 +326,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): images = [images] image_width, image_height = images[0].size - calculated_width, calculated_height, _ = calculate_dimensions(1024 * 1024, image_width / image_height) + calculated_width, calculated_height, _ = calculate_dimensions(block_state.target_area, image_width / image_height) resized_images = [ components.image_resize_processor.resize(image, height=calculated_height, width=calculated_width) @@ -322,7 +337,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): self.set_block_state(state, block_state) return components, state -class QwenImageEditPlusResizeDynamicStep(ModularPipelineBlocks): +class QwenImageEditPlusResizeStep(ModularPipelineBlocks): """Resize each image independently based on its own aspect ratio. For QwenImage Edit Plus.""" model_name = "qwenimage-edit-plus" @@ -414,12 +429,98 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state + +class QwenImageLayeredGetImagePromptStep(ModularPipelineBlocks): + """ + Auto-caption step that generates a text prompt from the input image if none is provided. + Uses the VL model to generate a description of the image. + """ + + model_name = "qwenimage-layered" + + @property + def description(self) -> str: + return ( + "Auto-caption step that generates a text prompt from the input image if none is provided.\n" + "Uses the VL model (text_encoder) to generate a description of the image.\n" + "If prompt is already provided, this step passes through unchanged." 
+ ) + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec("text_encoder", Qwen2_5_VLForConditionalGeneration), + ComponentSpec("processor", Qwen2VLProcessor), + ] + + @property + def expected_configs(self) -> List[ConfigSpec]: + return [ + ConfigSpec(name="image_caption_prompt_en", default=QWENIMAGE_LAYERED_CAPTION_PROMPT_EN), + ConfigSpec(name="image_caption_prompt_cn", default=QWENIMAGE_LAYERED_CAPTION_PROMPT_CN), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam(name="prompt", type_hint=str, description="The prompt to encode"), + InputParam( + name="resized_image", + required=True, + type_hint=PIL.Image.Image, + description="The image to generate caption from, should be resized use the resize step", + ), + InputParam( + name="use_en_prompt", + default=False, + type_hint=bool, + description="Whether to use English prompt template", + ), + ] + + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + device = components._execution_device + + # If prompt is empty or None, generate caption from image + if block_state.prompt is None or block_state.prompt == "" or block_state.prompt == " ": + + if block_state.use_en_prompt: + caption_prompt = components.config.image_caption_prompt_en + else: + caption_prompt = components.config.image_caption_prompt_cn + + model_inputs = components.processor( + text=caption_prompt, + images=block_state.resized_image, + padding=True, + return_tensors="pt", + ).to(device) + + generated_ids = components.text_encoder.generate(**model_inputs, max_new_tokens=512) + generated_ids_trimmed = [ + out_ids[len(in_ids):] for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids) + ] + output_text = components.processor.batch_decode( + generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + )[0] + + block_state.prompt = 
output_text.strip() + + self.set_block_state(state, block_state) + return components, state + + + class QwenImageTextEncoderStep(ModularPipelineBlocks): model_name = "qwenimage" @property def description(self) -> str: - return "Text Encoder step that generate text_embeddings to guide the image generation" + return "Text Encoder step that generates text embeddings to guide the image generation." @property def expected_components(self) -> List[ComponentSpec]: @@ -437,11 +538,8 @@ def expected_components(self) -> List[ComponentSpec]: @property def expected_configs(self) -> List[ConfigSpec]: return [ - ConfigSpec( - name="prompt_template_encode", - default="<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n", - ), - ConfigSpec(name="prompt_template_encode_start_idx", default=34), + ConfigSpec(name="prompt_template_encode", default=QWENIMAGE_PROMPT_TEMPLATE), + ConfigSpec(name="prompt_template_encode_start_idx", default=QWENIMAGE_PROMPT_TEMPLATE_START_IDX), ConfigSpec(name="tokenizer_max_length", default=1024), ] @@ -548,7 +646,7 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks): @property def description(self) -> str: - return "Text Encoder step that processes both prompt and image together to generate text embeddings for guiding image generation" + return "Text Encoder step that processes both prompt and image together to generate text embeddings for guiding image generation." 
@property def expected_components(self) -> List[ComponentSpec]: @@ -566,11 +664,8 @@ def expected_components(self) -> List[ComponentSpec]: @property def expected_configs(self) -> List[ConfigSpec]: return [ - ConfigSpec( - name="prompt_template_encode", - default="<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n", - ), - ConfigSpec(name="prompt_template_encode_start_idx", default=64), + ConfigSpec(name="prompt_template_encode", default=QWENIMAGE_EDIT_PROMPT_TEMPLATE), + ConfigSpec(name="prompt_template_encode_start_idx", default=QWENIMAGE_EDIT_PROMPT_TEMPLATE_START_IDX), ] @property @@ -581,7 +676,7 @@ def inputs(self) -> List[InputParam]: InputParam( name="resized_image", required=True, - type_hint=torch.Tensor, + type_hint=PIL.Image.Image, description="The image prompt to encode, should be resized using resize step", ), ] @@ -664,7 +759,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks): - """Text encoder for QwenImage Edit Plus that handles multiple reference images.""" + """Text encoder for QwenImage Edit Plus (VL encoding with multiple images).""" model_name = "qwenimage-edit-plus" @@ -691,15 +786,9 @@ def expected_components(self) -> List[ComponentSpec]: @property def expected_configs(self) -> List[ConfigSpec]: return [ - ConfigSpec( - name="prompt_template_encode", - default="<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. 
Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n", - ), - ConfigSpec( - name="img_template_encode", - default="Picture {}: <|vision_start|><|image_pad|><|vision_end|>", - ), - ConfigSpec(name="prompt_template_encode_start_idx", default=64), + ConfigSpec(name="prompt_template_encode", default=QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE), + ConfigSpec(name="img_template_encode", default=QWENIMAGE_EDIT_PLUS_IMG_TEMPLATE), + ConfigSpec(name="prompt_template_encode_start_idx", default=QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE_START_IDX), ] @property @@ -801,7 +890,7 @@ class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): @property def description(self) -> str: - return "Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images can be resized first using QwenImageEditResizeDynamicStep." + return "Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images can be resized first using QwenImageEditResizeStep." @property def expected_components(self) -> List[ComponentSpec]: @@ -882,7 +971,7 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks): @property def description(self) -> str: - return "Image Preprocess step. Images can be resized first using QwenImageEditResizeDynamicStep." + return "Image Preprocess step. Images can be resized first using QwenImageEditResizeStep." @property def expected_components(self) -> List[ComponentSpec]: @@ -943,7 +1032,7 @@ class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): @property def description(self) -> str: - return "Image Preprocess step. Images can be resized first using QwenImageEditResizeDynamicStep." + return "Image Preprocess step. Images can be resized first using QwenImageEditResizeStep." 
@property def expected_components(self) -> List[ComponentSpec]: @@ -989,7 +1078,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): self.set_block_state(state, block_state) return components, state -class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks): +class QwenImageVaeEncoderStep(ModularPipelineBlocks): """VAE encoder that handles both single images and lists of images with varied resolutions.""" model_name = "qwenimage" @@ -1182,3 +1271,34 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - self.set_block_state(state, block_state) return components, state + + +class QwenImageLayeredPermuteLatentsStep(ModularPipelineBlocks): + """Permute image latents from VAE format to Layered format.""" + + model_name = "qwenimage-layered" + + def __init__(self, input_name: str = "image_latents"): + self._input_name = input_name + super().__init__() + + @property + def description(self) -> str: + return f"Permute {self._input_name} from (B, C, 1, H, W) to (B, 1, C, H, W) for Layered packing." 
+ + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam(self._input_name, required=True), + ] + + @torch.no_grad() + def __call__(self, components, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + # Permute: (B, C, 1, H, W) -> (B, 1, C, H, W) + latents = getattr(block_state, self._input_name) + setattr(block_state, self._input_name, latents.permute(0, 2, 1, 3, 4)) + + self.set_block_state(state, block_state) + return components, state \ No newline at end of file diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index dea9d36082c1..ac67099eac0f 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -42,7 +42,7 @@ QwenImageInpaintProcessImagesInputStep, QwenImageProcessImagesInputStep, QwenImageTextEncoderStep, - QwenImageVaeEncoderDynamicStep, + QwenImageVaeEncoderStep, ) from .inputs import ( QwenImageControlNetInputsStep, @@ -60,7 +60,7 @@ # inpaint vae encoder class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): model_name = "qwenimage" - block_classes = [QwenImageInpaintProcessImagesInputStep(), QwenImageVaeEncoderDynamicStep()] + block_classes = [QwenImageInpaintProcessImagesInputStep(), QwenImageVaeEncoderStep()] block_names = ["preprocess", "encode"] @property @@ -77,7 +77,7 @@ def description(self) -> str: class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): model_name = "qwenimage" - block_classes = [QwenImageProcessImagesInputStep(), QwenImageVaeEncoderDynamicStep()] + block_classes = [QwenImageProcessImagesInputStep(), QwenImageVaeEncoderStep()] block_names = ["preprocess", "encode"] @property diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 
bcadd72b5909..5ed287566727 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -36,11 +36,11 @@ QwenImageEditInpaintDenoiseStep, ) from .encoders import ( - QwenImageEditResizeDynamicStep, + QwenImageEditResizeStep, QwenImageEditTextEncoderStep, QwenImageInpaintProcessImagesInputStep, QwenImageProcessImagesInputStep, - QwenImageVaeEncoderDynamicStep, + QwenImageVaeEncoderStep, ) from .inputs import ( QwenImageInputsDynamicStep, @@ -59,7 +59,7 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): """VL encoder that takes both image and text prompts.""" model_name = "qwenimage-edit" block_classes = [ - QwenImageEditResizeDynamicStep(), + QwenImageEditResizeStep(), QwenImageEditTextEncoderStep(), ] block_names = ["resize", "encode"] @@ -77,9 +77,9 @@ def description(self) -> str: class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" block_classes = [ - QwenImageEditResizeDynamicStep(), + QwenImageEditResizeStep(), QwenImageProcessImagesInputStep(), - QwenImageVaeEncoderDynamicStep(), + QwenImageVaeEncoderStep(), ] block_names = ["resize", "preprocess", "encode"] @@ -92,9 +92,9 @@ def description(self) -> str: class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" block_classes = [ - QwenImageEditResizeDynamicStep(), + QwenImageEditResizeStep(), QwenImageInpaintProcessImagesInputStep(), - QwenImageVaeEncoderDynamicStep(input_name="processed_image", output_name="image_latents"), + QwenImageVaeEncoderStep(input_name="processed_image", output_name="image_latents"), ] block_names = ["resize", "preprocess", "encode"] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 75b40ccc8ce2..35ee8994c8dd 100644 --- 
a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -29,10 +29,10 @@ QwenImageEditDenoiseStep, ) from .encoders import ( - QwenImageEditPlusResizeDynamicStep, + QwenImageEditPlusResizeStep, QwenImageEditPlusTextEncoderStep, QwenImageEditPlusProcessImagesInputStep, - QwenImageVaeEncoderDynamicStep, + QwenImageVaeEncoderStep, ) from .inputs import ( QwenImageEditPlusInputsDynamicStep, @@ -51,7 +51,7 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): """VL encoder that takes both image and text prompts. Uses 384x384 target area.""" model_name = "qwenimage-edit-plus" block_classes = [ - QwenImageEditPlusResizeDynamicStep(target_area=384 * 384, output_name="resized_cond_image"), + QwenImageEditPlusResizeStep(target_area=384 * 384, output_name="resized_cond_image"), QwenImageEditPlusTextEncoderStep(), ] block_names = ["resize", "encode"] @@ -69,9 +69,9 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): """VAE encoder that handles multiple images with different sizes. Uses 1024x1024 target area.""" model_name = "qwenimage-edit-plus" block_classes = [ - QwenImageEditPlusResizeDynamicStep(target_area=1024 * 1024, output_name="resized_image"), + QwenImageEditPlusResizeStep(target_area=1024 * 1024, output_name="resized_image"), QwenImageEditPlusProcessImagesInputStep(), - QwenImageVaeEncoderDynamicStep(), + QwenImageVaeEncoderStep(), ] block_names = ["resize", "preprocess", "encode"] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py new file mode 100644 index 000000000000..9371bc4762e9 --- /dev/null +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -0,0 +1,298 @@ +# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +from ...utils import logging +from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import InsertableDict +from .before_denoise import ( + QwenImageCreateMaskLatentsStep, + QwenImageEditRoPEInputsStep, + QwenImagePrepareLatentsStep, + QwenImagePrepareLatentsWithStrengthStep, + QwenImageSetTimestepsStep, + QwenImageSetTimestepsWithStrengthStep, +) +from .decoders import ( + QwenImageAfterDenoiseStep, + QwenImageDecoderStep, + QwenImageInpaintProcessImagesOutputStep, + QwenImageProcessImagesOutputStep, +) +from .denoise import ( + QwenImageEditDenoiseStep, + QwenImageEditInpaintDenoiseStep, +) +from .encoders import ( + QwenImageEditResizeStep, + QwenImageTextEncoderStep, + QwenImageProcessImagesInputStep, + QwenImageVaeEncoderStep, + QwenImageLayeredGetImagePromptStep, + QwenImageLayeredPermuteLatentsStep, +) +from .inputs import ( + QwenImageInputsDynamicStep, + QwenImageTextInputsStep, +) + + +logger = logging.get_logger(__name__) + + +# ==================== +# 1. 
TEXT ENCODER +# ==================== + +class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): + """Text encoder that takes text prompt, will generate a prompt based on image if not provided.""" + model_name = "qwenimage-layered" + block_classes = [ + QwenImageEditResizeStep(), + QwenImageLayeredGetImagePromptStep(), + QwenImageTextEncoderStep(), + ] + block_names = ["resize", "get_image_prompt", "encode"] + + @property + def description(self) -> str: + return "QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided." + + +# ==================== +# 2. VAE ENCODER +# ==================== + +# Edit VAE encoder +class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): + model_name = "qwenimage-layered" + block_classes = [ + QwenImageEditResizeStep(), + QwenImageProcessImagesInputStep(), + QwenImageVaeEncoderStep(), + QwenImageLayeredPermuteLatentsStep(), + ] + block_names = ["resize", "preprocess", "encode", "permute"] + + @property + def description(self) -> str: + return "Vae encoder step that encode the image inputs into their latent representations." + + + + + +# ==================== +# 3. DENOISE - input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise +# ==================== + +# Edit input step +class QwenImageEditInputStep(SequentialPipelineBlocks): + model_name = "qwenimage-edit" + block_classes = [ + QwenImageTextInputsStep(), + QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"]), + ] + block_names = ["text_inputs", "additional_inputs"] + + @property + def description(self): + return ( + "Input step that prepares the inputs for the edit denoising step. It:\n" + " - make sure the text embeddings have consistent batch size as well as the additional inputs.\n" + " - update height/width based `image_latents`, patchify `image_latents`." 
+ ) + + +# Edit Inpaint input step +class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): + model_name = "qwenimage-edit" + block_classes = [ + QwenImageTextInputsStep(), + QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]), + ] + block_names = ["text_inputs", "additional_inputs"] + + @property + def description(self): + return ( + "Input step that prepares the inputs for the edit inpaint denoising step. It:\n" + " - make sure the text embeddings have consistent batch size as well as the additional inputs.\n" + " - update height/width based `image_latents`, patchify `image_latents`." + ) + + +# Edit Inpaint prepare latents step +class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): + model_name = "qwenimage-edit" + block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()] + block_names = ["add_noise_to_latents", "create_mask_latents"] + + @property + def description(self) -> str: + return ( + "This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It:\n" + " - Add noise to the image latents to create the latents input for the denoiser.\n" + " - Create the patchified latents `mask` based on the processed mask image.\n" + ) + + +# 1. Edit (img2img) core denoise +class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): + model_name = "qwenimage-edit" + block_classes = [ + QwenImageEditInputStep(), + QwenImagePrepareLatentsStep(), + QwenImageSetTimestepsStep(), + QwenImageEditRoPEInputsStep(), + QwenImageEditDenoiseStep(), + QwenImageAfterDenoiseStep(), + ] + block_names = [ + "input", + "prepare_latents", + "set_timesteps", + "prepare_rope_inputs", + "denoise", + "after_denoise", + ] + + @property + def description(self): + return "Core denoising workflow for QwenImage-Edit edit (img2img) task." + + +# 2. 
Edit Inpaint core denoise +class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): + model_name = "qwenimage-edit" + block_classes = [ + QwenImageEditInpaintInputStep(), + QwenImagePrepareLatentsStep(), + QwenImageSetTimestepsWithStrengthStep(), + QwenImageEditInpaintPrepareLatentsStep(), + QwenImageEditRoPEInputsStep(), + QwenImageEditInpaintDenoiseStep(), + QwenImageAfterDenoiseStep(), + ] + block_names = [ + "input", + "prepare_latents", + "set_timesteps", + "prepare_inpaint_latents", + "prepare_rope_inputs", + "denoise", + "after_denoise", + ] + + @property + def description(self): + return "Core denoising workflow for QwenImage-Edit edit inpaint task." + + +# Auto core denoise step +class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks): + block_classes = [ + QwenImageEditInpaintCoreDenoiseStep, + QwenImageEditCoreDenoiseStep, + ] + block_names = ["edit_inpaint", "edit"] + block_trigger_inputs = ["processed_mask_image", "image_latents"] + default_block_name = "edit" + + def select_block(self, processed_mask_image=None, image_latents=None) -> Optional[str]: + if processed_mask_image is not None: + return "edit_inpaint" + elif image_latents is not None: + return "edit" + return None + + @property + def description(self): + return ( + "Auto core denoising step that selects the appropriate workflow based on inputs.\n" + " - `QwenImageEditInpaintCoreDenoiseStep` when `processed_mask_image` is provided\n" + " - `QwenImageEditCoreDenoiseStep` when `image_latents` is provided\n" + "Supports edit (img2img) and edit inpainting tasks for QwenImage-Edit." + ) + + +# ==================== +# 4. 
DECODE +# ==================== + +# Decode step (standard) +class QwenImageEditDecodeStep(SequentialPipelineBlocks): + model_name = "qwenimage-edit" + block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] + block_names = ["decode", "postprocess"] + + @property + def description(self): + return "Decode step that decodes the latents to images and postprocess the generated image." + + +# Inpaint decode step +class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): + model_name = "qwenimage-edit" + block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()] + block_names = ["decode", "postprocess"] + + @property + def description(self): + return "Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image." + + +# Auto decode step +class QwenImageEditAutoDecodeStep(AutoPipelineBlocks): + block_classes = [QwenImageEditInpaintDecodeStep, QwenImageEditDecodeStep] + block_names = ["inpaint_decode", "decode"] + block_trigger_inputs = ["mask", None] + + @property + def description(self): + return ( + "Decode step that decode the latents into images.\n" + "This is an auto pipeline block.\n" + " - `QwenImageEditInpaintDecodeStep` (inpaint) is used when `mask` is provided.\n" + " - `QwenImageEditDecodeStep` (edit) is used when `mask` is not provided.\n" + ) + + +# ==================== +# 5. 
AUTO BLOCKS & PRESETS +# ==================== + +EDIT_AUTO_BLOCKS = InsertableDict( + [ + ("text_encoder", QwenImageEditVLEncoderStep()), + ("vae_encoder", QwenImageEditAutoVaeEncoderStep()), + ("denoise", QwenImageEditAutoCoreDenoiseStep()), + ("decode", QwenImageEditAutoDecodeStep()), + ] +) + + +class QwenImageEditAutoBlocks(SequentialPipelineBlocks): + model_name = "qwenimage-edit" + block_classes = EDIT_AUTO_BLOCKS.values() + block_names = EDIT_AUTO_BLOCKS.keys() + + @property + def description(self): + return ( + "Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.\n" + "- for edit (img2img) generation, you need to provide `image`\n" + "- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`\n" + ) \ No newline at end of file diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py b/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py index 59e1a13a5db2..30dc9dcec5ba 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py @@ -90,6 +90,74 @@ def unpack_latents(self, latents, height, width, vae_scale_factor=8): return latents +class QwenImageLayeredPachifier(ConfigMixin): + """ + A class to pack and unpack latents for QwenImage Layered. + + Unlike QwenImagePachifier, this handles 3D latents with shape (B, layers, C, H, W). + """ + + config_name = "config.json" + + @register_to_config + def __init__(self, patch_size: int = 2): + super().__init__() + + def pack_latents(self, latents, batch_size, num_channels_latents, height, width, layers): + """ + Pack latents from (B, layers, C, H, W) to (B, layers * H/2 * W/2, C*4). 
+ """ + patch_size = self.config.patch_size + + latents = latents.view( + batch_size, + layers, + num_channels_latents, + height // patch_size, + patch_size, + width // patch_size, + patch_size, + ) + latents = latents.permute(0, 1, 3, 5, 2, 4, 6) + latents = latents.reshape( + batch_size, + layers * (height // patch_size) * (width // patch_size), + num_channels_latents * patch_size * patch_size, + ) + return latents + + def unpack_latents(self, latents, height, width, layers, vae_scale_factor=8): + """ + Unpack latents from (B, seq, C*4) to (B, C, layers+1, H, W). + """ + batch_size, num_patches, channels = latents.shape + patch_size = self.config.patch_size + + height = patch_size * (int(height) // (vae_scale_factor * patch_size)) + width = patch_size * (int(width) // (vae_scale_factor * patch_size)) + + latents = latents.view( + batch_size, + layers + 1, + height // patch_size, + width // patch_size, + channels // (patch_size * patch_size), + patch_size, + patch_size, + ) + latents = latents.permute(0, 1, 4, 2, 5, 3, 6) + latents = latents.reshape( + batch_size, + layers + 1, + channels // (patch_size * patch_size), + height, + width, + ) + latents = latents.permute(0, 2, 1, 3, 4) # (b, c, f, h, w) + + return latents + + class QwenImageModularPipeline(ModularPipeline, QwenImageLoraLoaderMixin): """ A ModularPipeline for QwenImage. diff --git a/src/diffusers/modular_pipelines/qwenimage/prompt_templates.py b/src/diffusers/modular_pipelines/qwenimage/prompt_templates.py new file mode 100644 index 000000000000..068f768250c6 --- /dev/null +++ b/src/diffusers/modular_pipelines/qwenimage/prompt_templates.py @@ -0,0 +1,121 @@ +# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Prompt templates for QwenImage pipelines. + +This module centralizes all prompt templates used across different QwenImage pipeline variants: +- QwenImage (base): Text-only encoding for text-to-image generation +- QwenImage Edit: VL encoding with single image for image editing +- QwenImage Edit Plus: VL encoding with multiple images for multi-reference editing +- QwenImage Layered: Auto-captioning for image decomposition +""" + +# ============================================ +# QwenImage Base (text-only encoding) +# ============================================ +# Used for text-to-image generation where only text prompt is encoded + +QWENIMAGE_PROMPT_TEMPLATE = ( + "<|im_start|>system\n" + "Describe the image by detailing the color, shape, size, texture, quantity, text, " + "spatial relationships of the objects and background:<|im_end|>\n" + "<|im_start|>user\n{}<|im_end|>\n" + "<|im_start|>assistant\n" +) +QWENIMAGE_PROMPT_TEMPLATE_START_IDX = 34 + + +# ============================================ +# QwenImage Edit (VL encoding with single image) +# ============================================ +# Used for single-image editing where both image and text are encoded together + +QWENIMAGE_EDIT_PROMPT_TEMPLATE = ( + "<|im_start|>system\n" + "Describe the key features of the input image (color, shape, size, texture, objects, background), " + "then explain how the user's text instruction should alter or modify the image. 
" + "Generate a new image that meets the user's requirements while maintaining consistency " + "with the original input where appropriate.<|im_end|>\n" + "<|im_start|>user\n" + "<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n" + "<|im_start|>assistant\n" +) +QWENIMAGE_EDIT_PROMPT_TEMPLATE_START_IDX = 64 + + +# ============================================ +# QwenImage Edit Plus (VL encoding with multiple images) +# ============================================ +# Used for multi-reference editing where multiple images and text are encoded together +# The img_template is used to format each image in the prompt + +QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE = ( + "<|im_start|>system\n" + "Describe the key features of the input image (color, shape, size, texture, objects, background), " + "then explain how the user's text instruction should alter or modify the image. " + "Generate a new image that meets the user's requirements while maintaining consistency " + "with the original input where appropriate.<|im_end|>\n" + "<|im_start|>user\n{}<|im_end|>\n" + "<|im_start|>assistant\n" +) +QWENIMAGE_EDIT_PLUS_IMG_TEMPLATE = "Picture {}: <|vision_start|><|image_pad|><|vision_end|>" +QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE_START_IDX = 64 + + +# ============================================ +# QwenImage Layered (auto-captioning) +# ============================================ +# Used for image decomposition where the VL model generates a caption from the input image +# if no prompt is provided. These prompts instruct the model to describe the image in detail. + +QWENIMAGE_LAYERED_CAPTION_PROMPT_EN = ( + "<|im_start|>system\n" + "You are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n" + "# Image Annotator\n" + "You are a professional image annotator. Please write an image caption based on the input image:\n" + "1. Write the caption using natural, descriptive language without structured formats or rich text.\n" + "2. 
Enrich caption details by including:\n" + " - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on\n" + " - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, " + "attachment relations, action relations, comparative relations, causal relations, and so on\n" + " - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on\n" + " - Identify the text clearly visible in the image, without translation or explanation, " + "and highlight it in the caption with quotation marks\n" + "3. Maintain authenticity and accuracy:\n" + " - Avoid generalizations\n" + " - Describe all visible information in the image, while do not add information not explicitly shown in the image\n" + "<|vision_start|><|image_pad|><|vision_end|><|im_end|>\n" + "<|im_start|>assistant\n" +) + +QWENIMAGE_LAYERED_CAPTION_PROMPT_CN = ( + "<|im_start|>system\n" + "You are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n" + "# 图像标注器\n" + "你是一个专业的图像标注器。请基于输入图像,撰写图注:\n" + "1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。\n" + "2. 通过加入以下内容,丰富图注细节:\n" + " - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等\n" + " - 对象间的视觉关系:如空间关系、功能关系、动作关系、从属关系、比较关系、因果关系等\n" + " - 环境细节:例如天气、光照、颜色、纹理、气氛等\n" + " - 文字内容:识别图像中清晰可见的文字,不做翻译和解释,用引号在图注中强调\n" + "3. 
保持真实性与准确性:\n" + " - 不要使用笼统的描述\n" + " - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容\n" + "<|vision_start|><|image_pad|><|vision_end|><|im_end|>\n" + "<|im_start|>assistant\n" +) \ No newline at end of file From 710d665b12de0224a3b86fa7c6cd364eb27875d8 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 7 Jan 2026 18:17:10 +0100 Subject: [PATCH 05/12] up up --- .../modular_pipelines/flux/inputs.py | 2 +- .../qwenimage/before_denoise.py | 94 +++++++- .../modular_pipelines/qwenimage/decoders.py | 4 +- .../modular_pipelines/qwenimage/denoise.py | 22 ++ .../modular_pipelines/qwenimage/inputs.py | 129 ++++++++++- .../qwenimage/modular_blocks_qwenimage.py | 6 +- .../modular_blocks_qwenimage_edit.py | 6 +- .../modular_blocks_qwenimage_layered.py | 205 +++--------------- .../qwenimage/modular_pipeline.py | 36 ++- 9 files changed, 309 insertions(+), 195 deletions(-) diff --git a/src/diffusers/modular_pipelines/flux/inputs.py b/src/diffusers/modular_pipelines/flux/inputs.py index 8309eebfeb37..45b1c6bc136f 100644 --- a/src/diffusers/modular_pipelines/flux/inputs.py +++ b/src/diffusers/modular_pipelines/flux/inputs.py @@ -121,7 +121,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip return components, state -# Adapted from `QwenImageInputsDynamicStep` +# Adapted from `QwenImageAdditionalInputsStep` class FluxInputsDynamicStep(ModularPipelineBlocks): model_name = "flux" diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index 4fbcfd207a66..c441f51dd78a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -23,7 +23,7 @@ from ...utils.torch_utils import randn_tensor, unwrap_module from ..modular_pipeline import ModularPipelineBlocks, PipelineState from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam -from .modular_pipeline import QwenImageModularPipeline, 
QwenImagePachifier +from .modular_pipeline import QwenImageModularPipeline, QwenImagePachifier, QwenImageLayeredPachifier # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.calculate_shift @@ -207,6 +207,98 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks): + model_name = "qwenimage-layered" + + @property + def description(self) -> str: + return "Prepare initial random noise (B, layers+1, C, H, W) for the generation process" + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec("pachifier", QwenImageLayeredPachifier, default_creation_method="from_config"), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam("latents"), + InputParam(name="height"), + InputParam(name="width"), + InputParam(name="layers", default=4), + InputParam(name="num_images_per_prompt", default=1), + InputParam(name="generator"), + InputParam( + name="batch_size", + required=True, + type_hint=int, + description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. 
Can be generated in input step.", + ), + InputParam( + name="dtype", + required=True, + type_hint=torch.dtype, + description="The dtype of the model inputs, can be generated in input step.", + ), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + name="latents", + type_hint=torch.Tensor, + description="The initial latents to use for the denoising process", + ), + ] + + @staticmethod + def check_inputs(height, width, vae_scale_factor): + if height is not None and height % (vae_scale_factor * 2) != 0: + raise ValueError(f"Height must be divisible by {vae_scale_factor * 2} but is {height}") + + if width is not None and width % (vae_scale_factor * 2) != 0: + raise ValueError(f"Width must be divisible by {vae_scale_factor * 2} but is {width}") + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + self.check_inputs( + height=block_state.height, + width=block_state.width, + vae_scale_factor=components.vae_scale_factor, + ) + + device = components._execution_device + batch_size = block_state.batch_size * block_state.num_images_per_prompt + + # we can update the height and width here since it's used to generate the initial + block_state.height = block_state.height or components.default_height + block_state.width = block_state.width or components.default_width + + # VAE applies 8x compression on images but we must also account for packing which requires + # latent height and width to be divisible by 2. 
+ latent_height = 2 * (int(block_state.height) // (components.vae_scale_factor * 2)) + latent_width = 2 * (int(block_state.width) // (components.vae_scale_factor * 2)) + + shape = (batch_size, block_state.layers + 1, components.num_channels_latents, latent_height, latent_width) + if isinstance(block_state.generator, list) and len(block_state.generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(block_state.generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + if block_state.latents is None: + block_state.latents = randn_tensor( + shape, generator=block_state.generator, device=device, dtype=block_state.dtype + ) + block_state.latents = components.pachifier.pack_latents(block_state.latents) + + self.set_block_state(state, block_state) + return components, state + + class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks): model_name = "qwenimage" diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 5f1281564b1d..465b27df0301 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -83,7 +83,7 @@ def description(self) -> str: @property def expected_components(self) -> List[ComponentSpec]: return [ - ComponentSpec("layered_pachifier", QwenImageLayeredPachifier, default_creation_method="from_config"), + ComponentSpec("pachifier", QwenImageLayeredPachifier, default_creation_method="from_config"), ] @property @@ -201,7 +201,6 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam("latents", required=True, type_hint=torch.Tensor), - InputParam("layers", required=True, type_hint=int), InputParam("output_type", default="pil", type_hint=str), ] @@ -216,7 +215,6 @@ def __call__(self, components, state: PipelineState) -> 
PipelineState: block_state = self.get_block_state(state) latents = block_state.latents - layers = block_state.layers # 1. VAE normalization latents = latents.to(components.vae.dtype) diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py index 49acd2dc0295..265f6ba6a1f0 100644 --- a/src/diffusers/modular_pipelines/qwenimage/denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -682,3 +682,25 @@ def description(self) -> str: " - `QwenImageLoopAfterDenoiserInpaint`\n" "This block supports inpainting tasks for QwenImage Edit." ) + + +# actually same as QwenImageEditDenoiseStep +class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper): + block_classes = [ + QwenImageEditLoopBeforeDenoiser, + QwenImageEditLoopDenoiser, + QwenImageLoopAfterDenoiser, + ] + block_names = ["before_denoiser", "denoiser", "after_denoiser"] + + @property + def description(self) -> str: + return ( + "Denoise step that iteratively denoise the latents. \n" + "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method \n" + "At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n" + " - `QwenImageEditLoopBeforeDenoiser`\n" + " - `QwenImageEditLoopDenoiser`\n" + " - `QwenImageLoopAfterDenoiser`\n" + "This block supports QwenImage Layered." 
+ ) diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index 5c3df4909f56..598eb5346ccb 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -19,7 +19,7 @@ from ...models import QwenImageMultiControlNetModel from ..modular_pipeline import ModularPipelineBlocks, PipelineState from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam -from .modular_pipeline import QwenImageModularPipeline, QwenImagePachifier +from .modular_pipeline import QwenImageModularPipeline, QwenImagePachifier, QwenImageLayeredPachifier def repeat_tensor_to_batch_size( @@ -221,7 +221,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state -class QwenImageInputsDynamicStep(ModularPipelineBlocks): +class QwenImageAdditionalInputsStep(ModularPipelineBlocks): """Input step for QwenImage: update height/width, expand batch, patchify.""" model_name = "qwenimage" @@ -476,6 +476,131 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - self.set_block_state(state, block_state) return components, state + +# YiYi TODO: support define config default component from the ModularPipeline level. +# it is same as QwenImageAdditionalInputsStep, but with layered pachifier. 
+class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): + """Input step for QwenImage Layered: update height/width, expand batch, patchify with layered pachifier.""" + + model_name = "qwenimage-layered" + + def __init__( + self, + image_latent_inputs: List[str] = ["image_latents"], + additional_batch_inputs: List[str] = [], + ): + if not isinstance(image_latent_inputs, list): + image_latent_inputs = [image_latent_inputs] + if not isinstance(additional_batch_inputs, list): + additional_batch_inputs = [additional_batch_inputs] + + self._image_latent_inputs = image_latent_inputs + self._additional_batch_inputs = additional_batch_inputs + super().__init__() + + @property + def description(self) -> str: + summary_section = ( + "Input processing step for Layered that:\n" + " 1. For image latent inputs: Updates height/width if None, patchifies with layered pachifier, and expands batch size\n" + " 2. For additional batch inputs: Expands batch dimensions to match final batch size" + ) + + inputs_info = "" + if self._image_latent_inputs or self._additional_batch_inputs: + inputs_info = "\n\nConfigured inputs:" + if self._image_latent_inputs: + inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}" + if self._additional_batch_inputs: + inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}" + + placement_section = "\n\nThis block should be placed after the encoder steps and the text input step." 
+ + return summary_section + inputs_info + placement_section + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec("pachifier", QwenImageLayeredPachifier, default_creation_method="from_config"), + ] + + @property + def inputs(self) -> List[InputParam]: + inputs = [ + InputParam(name="num_images_per_prompt", default=1), + InputParam(name="batch_size", required=True), + InputParam(name="height"), + InputParam(name="width"), + ] + + for image_latent_input_name in self._image_latent_inputs: + inputs.append(InputParam(name=image_latent_input_name)) + + for input_name in self._additional_batch_inputs: + inputs.append(InputParam(name=input_name)) + + return inputs + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam(name="image_height", type_hint=int, description="The height of the image latents"), + OutputParam(name="image_width", type_hint=int, description="The width of the image latents"), + ] + + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + # Process image latent inputs + for image_latent_input_name in self._image_latent_inputs: + image_latent_tensor = getattr(block_state, image_latent_input_name) + if image_latent_tensor is None: + continue + + # 1. Calculate height/width from latents and update if not provided + # Layered latents are (B, layers, C, H, W) + height = image_latent_tensor.shape[3] * components.vae_scale_factor + width = image_latent_tensor.shape[4] * components.vae_scale_factor + block_state.height = block_state.height or height + block_state.width = block_state.width or width + + if not hasattr(block_state, "image_height"): + block_state.image_height = height + if not hasattr(block_state, "image_width"): + block_state.image_width = width + + # 2. Patchify with layered pachifier + image_latent_tensor = components.pachifier.pack_latents(image_latent_tensor) + + # 3. 
Expand batch size + image_latent_tensor = repeat_tensor_to_batch_size( + input_name=image_latent_input_name, + input_tensor=image_latent_tensor, + num_images_per_prompt=block_state.num_images_per_prompt, + batch_size=block_state.batch_size, + ) + + setattr(block_state, image_latent_input_name, image_latent_tensor) + + # Process additional batch inputs (only batch expansion) + for input_name in self._additional_batch_inputs: + input_tensor = getattr(block_state, input_name) + if input_tensor is None: + continue + + input_tensor = repeat_tensor_to_batch_size( + input_name=input_name, + input_tensor=input_tensor, + num_images_per_prompt=block_state.num_images_per_prompt, + batch_size=block_state.batch_size, + ) + + setattr(block_state, input_name, input_tensor) + + self.set_block_state(state, block_state) + return components, state + + class QwenImageControlNetInputsStep(ModularPipelineBlocks): model_name = "qwenimage" diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index ac67099eac0f..e180558455cb 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -46,7 +46,7 @@ ) from .inputs import ( QwenImageControlNetInputsStep, - QwenImageInputsDynamicStep, + QwenImageAdditionalInputsStep, QwenImageTextInputsStep, ) @@ -123,7 +123,7 @@ def description(self): # img2img input class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): model_name = "qwenimage" - block_classes = [QwenImageTextInputsStep(), QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"])] + block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"])] block_names = ["text_inputs", "additional_inputs"] @property @@ -136,7 +136,7 @@ def description(self): # inpaint input class QwenImageInpaintInputStep(SequentialPipelineBlocks): 
model_name = "qwenimage" - block_classes = [QwenImageTextInputsStep(), QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"])] + block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"])] block_names = ["text_inputs", "additional_inputs"] @property diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 5ed287566727..4dcaa9dc110f 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -43,7 +43,7 @@ QwenImageVaeEncoderStep, ) from .inputs import ( - QwenImageInputsDynamicStep, + QwenImageAdditionalInputsStep, QwenImageTextInputsStep, ) @@ -134,7 +134,7 @@ class QwenImageEditInputStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), - QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"]), + QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"]), ] block_names = ["text_inputs", "additional_inputs"] @@ -152,7 +152,7 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), - QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]), + QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]), ] block_names = ["text_inputs", "additional_inputs"] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 9371bc4762e9..23b953ce0732 100644 --- 
a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -18,22 +18,16 @@ from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks from ..modular_pipeline_utils import InsertableDict from .before_denoise import ( - QwenImageCreateMaskLatentsStep, - QwenImageEditRoPEInputsStep, - QwenImagePrepareLatentsStep, - QwenImagePrepareLatentsWithStrengthStep, - QwenImageSetTimestepsStep, - QwenImageSetTimestepsWithStrengthStep, + QwenImageLayeredPrepareLatentsStep, + QwenImageLayeredRoPEInputsStep, + QwenImageLayeredSetTimestepsStep, ) from .decoders import ( - QwenImageAfterDenoiseStep, - QwenImageDecoderStep, - QwenImageInpaintProcessImagesOutputStep, - QwenImageProcessImagesOutputStep, + QwenImageLayeredAfterDenoiseStep, + QwenImageLayeredDecoderStep, ) from .denoise import ( - QwenImageEditDenoiseStep, - QwenImageEditInpaintDenoiseStep, + QwenImageLayeredDenoiseStep, ) from .encoders import ( QwenImageEditResizeStep, @@ -44,7 +38,8 @@ QwenImageLayeredPermuteLatentsStep, ) from .inputs import ( - QwenImageInputsDynamicStep, + QwenImageAdditionalInputsStep, + QwenImageLayeredAdditionalInputsStep, QwenImageTextInputsStep, ) @@ -98,67 +93,34 @@ def description(self) -> str: # 3. DENOISE - input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise # ==================== -# Edit input step -class QwenImageEditInputStep(SequentialPipelineBlocks): - model_name = "qwenimage-edit" - block_classes = [ - QwenImageTextInputsStep(), - QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"]), - ] - block_names = ["text_inputs", "additional_inputs"] - - @property - def description(self): - return ( - "Input step that prepares the inputs for the edit denoising step. 
It:\n" - " - make sure the text embeddings have consistent batch size as well as the additional inputs.\n" - " - update height/width based `image_latents`, patchify `image_latents`." - ) - - -# Edit Inpaint input step -class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): - model_name = "qwenimage-edit" +# Layered input step +class QwenImageLayeredInputStep(SequentialPipelineBlocks): + model_name = "qwenimage-layered" block_classes = [ QwenImageTextInputsStep(), - QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]), + QwenImageLayeredAdditionalInputsStep(image_latent_inputs=["image_latents"]), ] block_names = ["text_inputs", "additional_inputs"] @property def description(self): return ( - "Input step that prepares the inputs for the edit inpaint denoising step. It:\n" + "Input step that prepares the inputs for the layered denoising step. It:\n" " - make sure the text embeddings have consistent batch size as well as the additional inputs.\n" " - update height/width based `image_latents`, patchify `image_latents`." ) -# Edit Inpaint prepare latents step -class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): - model_name = "qwenimage-edit" - block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()] - block_names = ["add_noise_to_latents", "create_mask_latents"] - - @property - def description(self) -> str: - return ( - "This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It:\n" - " - Add noise to the image latents to create the latents input for the denoiser.\n" - " - Create the patchified latents `mask` based on the processed mask image.\n" - ) - - -# 1. Edit (img2img) core denoise -class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): - model_name = "qwenimage-edit" +# 1. 
img2img core denoise +class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): + model_name = "qwenimage-layered" block_classes = [ - QwenImageEditInputStep(), - QwenImagePrepareLatentsStep(), - QwenImageSetTimestepsStep(), - QwenImageEditRoPEInputsStep(), - QwenImageEditDenoiseStep(), - QwenImageAfterDenoiseStep(), + QwenImageLayeredInputStep(), + QwenImageLayeredPrepareLatentsStep(), + QwenImageLayeredSetTimestepsStep(), + QwenImageLayeredRoPEInputsStep(), + QwenImageLayeredDenoiseStep(), + QwenImageLayeredAfterDenoiseStep(), ] block_names = [ "input", @@ -171,128 +133,19 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return "Core denoising workflow for QwenImage-Edit edit (img2img) task." - - -# 2. Edit Inpaint core denoise -class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): - model_name = "qwenimage-edit" - block_classes = [ - QwenImageEditInpaintInputStep(), - QwenImagePrepareLatentsStep(), - QwenImageSetTimestepsWithStrengthStep(), - QwenImageEditInpaintPrepareLatentsStep(), - QwenImageEditRoPEInputsStep(), - QwenImageEditInpaintDenoiseStep(), - QwenImageAfterDenoiseStep(), - ] - block_names = [ - "input", - "prepare_latents", - "set_timesteps", - "prepare_inpaint_latents", - "prepare_rope_inputs", - "denoise", - "after_denoise", - ] + return "Core denoising workflow for QwenImage-Layered img2img task." - @property - def description(self): - return "Core denoising workflow for QwenImage-Edit edit inpaint task." 
- - -# Auto core denoise step -class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks): - block_classes = [ - QwenImageEditInpaintCoreDenoiseStep, - QwenImageEditCoreDenoiseStep, - ] - block_names = ["edit_inpaint", "edit"] - block_trigger_inputs = ["processed_mask_image", "image_latents"] - default_block_name = "edit" - - def select_block(self, processed_mask_image=None, image_latents=None) -> Optional[str]: - if processed_mask_image is not None: - return "edit_inpaint" - elif image_latents is not None: - return "edit" - return None - - @property - def description(self): - return ( - "Auto core denoising step that selects the appropriate workflow based on inputs.\n" - " - `QwenImageEditInpaintCoreDenoiseStep` when `processed_mask_image` is provided\n" - " - `QwenImageEditCoreDenoiseStep` when `image_latents` is provided\n" - "Supports edit (img2img) and edit inpainting tasks for QwenImage-Edit." - ) - - -# ==================== -# 4. DECODE -# ==================== - -# Decode step (standard) -class QwenImageEditDecodeStep(SequentialPipelineBlocks): - model_name = "qwenimage-edit" - block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] - block_names = ["decode", "postprocess"] - - @property - def description(self): - return "Decode step that decodes the latents to images and postprocess the generated image." - - -# Inpaint decode step -class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): - model_name = "qwenimage-edit" - block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()] - block_names = ["decode", "postprocess"] - - @property - def description(self): - return "Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image." 
- - -# Auto decode step -class QwenImageEditAutoDecodeStep(AutoPipelineBlocks): - block_classes = [QwenImageEditInpaintDecodeStep, QwenImageEditDecodeStep] - block_names = ["inpaint_decode", "decode"] - block_trigger_inputs = ["mask", None] - - @property - def description(self): - return ( - "Decode step that decode the latents into images.\n" - "This is an auto pipeline block.\n" - " - `QwenImageEditInpaintDecodeStep` (inpaint) is used when `mask` is provided.\n" - " - `QwenImageEditDecodeStep` (edit) is used when `mask` is not provided.\n" - ) # ==================== # 5. AUTO BLOCKS & PRESETS # ==================== -EDIT_AUTO_BLOCKS = InsertableDict( +LAYERED_AUTO_BLOCKS = InsertableDict( [ - ("text_encoder", QwenImageEditVLEncoderStep()), - ("vae_encoder", QwenImageEditAutoVaeEncoderStep()), - ("denoise", QwenImageEditAutoCoreDenoiseStep()), - ("decode", QwenImageEditAutoDecodeStep()), + ("text_encoder", QwenImageLayeredTextEncoderStep()), + ("vae_encoder", QwenImageLayeredVaeEncoderStep()), + ("denoise", QwenImageLayeredCoreDenoiseStep()), + ("decode", QwenImageLayeredDecoderStep()), ] -) - - -class QwenImageEditAutoBlocks(SequentialPipelineBlocks): - model_name = "qwenimage-edit" - block_classes = EDIT_AUTO_BLOCKS.values() - block_names = EDIT_AUTO_BLOCKS.keys() - - @property - def description(self): - return ( - "Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.\n" - "- for edit (img2img) generation, you need to provide `image`\n" - "- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`\n" - ) \ No newline at end of file +) \ No newline at end of file diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py b/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py index 30dc9dcec5ba..3e580dde5f08 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py @@ 
-94,7 +94,7 @@ class QwenImageLayeredPachifier(ConfigMixin): """ A class to pack and unpack latents for QwenImage Layered. - Unlike QwenImagePachifier, this handles 3D latents with shape (B, layers, C, H, W). + Unlike QwenImagePachifier, this handles 5D latents with shape (B, layers+1, C, H, W). """ config_name = "config.json" @@ -103,25 +103,35 @@ class QwenImageLayeredPachifier(ConfigMixin): def __init__(self, patch_size: int = 2): super().__init__() - def pack_latents(self, latents, batch_size, num_channels_latents, height, width, layers): + def pack_latents(self, latents): """ Pack latents from (B, layers, C, H, W) to (B, layers * H/2 * W/2, C*4). """ + + if latents.ndim != 5: + raise ValueError(f"Latents must have 5 dimensions (B, layers, C, H, W), but got {latents.ndim}") + + batch_size, layers, num_channels_latents, latent_height, latent_width = latents.shape patch_size = self.config.patch_size + if latent_height % patch_size != 0 or latent_width % patch_size != 0: + raise ValueError( + f"Latent height and width must be divisible by {patch_size}, but got {latent_height} and {latent_width}" + ) + latents = latents.view( batch_size, layers, num_channels_latents, - height // patch_size, + latent_height // patch_size, patch_size, - width // patch_size, + latent_width // patch_size, patch_size, ) latents = latents.permute(0, 1, 3, 5, 2, 4, 6) latents = latents.reshape( batch_size, - layers * (height // patch_size) * (width // patch_size), + layers * (latent_height // patch_size) * (latent_width // patch_size), num_channels_latents * patch_size * patch_size, ) return latents @@ -130,7 +140,11 @@ def unpack_latents(self, latents, height, width, layers, vae_scale_factor=8): """ Unpack latents from (B, seq, C*4) to (B, C, layers+1, H, W). 
""" - batch_size, num_patches, channels = latents.shape + + if latents.ndim != 3: + raise ValueError(f"Latents must have 3 dimensions, but got {latents.ndim}") + + batch_size, _, channels = latents.shape patch_size = self.config.patch_size height = patch_size * (int(height) // (vae_scale_factor * patch_size)) @@ -271,3 +285,13 @@ class QwenImageEditPlusModularPipeline(QwenImageEditModularPipeline): """ default_blocks_name = "QwenImageEditPlusAutoBlocks" + + +class QwenImageLayeredModularPipeline(QwenImageModularPipeline): + """ + A ModularPipeline for QwenImage-Layered. + + > [!WARNING] > This is an experimental feature and is likely to change in the future. + """ + + default_blocks_name = "QwenImageLayeredAutoBlocks" From 4006793ff9f1515eb7afb5509953757526fc8f4e Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 8 Jan 2026 06:14:57 +0100 Subject: [PATCH 06/12] u p --- .../modular_pipelines/modular_pipeline.py | 1 + src/diffusers/modular_pipelines/node_utils.py | 661 ------------------ 2 files changed, 1 insertion(+), 661 deletions(-) delete mode 100644 src/diffusers/modular_pipelines/node_utils.py diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index d710bf18eb48..a4a506f6e703 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -62,6 +62,7 @@ ("qwenimage", "QwenImageModularPipeline"), ("qwenimage-edit", "QwenImageEditModularPipeline"), ("qwenimage-edit-plus", "QwenImageEditPlusModularPipeline"), + ("qwenimage-layered", "QwenImageLayeredModularPipeline"), ("z-image", "ZImageModularPipeline"), ] ) diff --git a/src/diffusers/modular_pipelines/node_utils.py b/src/diffusers/modular_pipelines/node_utils.py deleted file mode 100644 index f7ee1dd3097b..000000000000 --- a/src/diffusers/modular_pipelines/node_utils.py +++ /dev/null @@ -1,661 +0,0 @@ -import json -import logging -import os -from pathlib import Path -from typing 
import List, Optional, Tuple, Union - -import numpy as np -import PIL -import torch - -from ..configuration_utils import ConfigMixin -from ..image_processor import PipelineImageInput -from .modular_pipeline import ModularPipelineBlocks, SequentialPipelineBlocks -from .modular_pipeline_utils import InputParam - - -logger = logging.getLogger(__name__) - -# YiYi Notes: this is actually for SDXL, put it here for now -SDXL_INPUTS_SCHEMA = { - "prompt": InputParam( - "prompt", type_hint=Union[str, List[str]], description="The prompt or prompts to guide the image generation" - ), - "prompt_2": InputParam( - "prompt_2", - type_hint=Union[str, List[str]], - description="The prompt or prompts to be sent to the tokenizer_2 and text_encoder_2", - ), - "negative_prompt": InputParam( - "negative_prompt", - type_hint=Union[str, List[str]], - description="The prompt or prompts not to guide the image generation", - ), - "negative_prompt_2": InputParam( - "negative_prompt_2", - type_hint=Union[str, List[str]], - description="The negative prompt or prompts for text_encoder_2", - ), - "cross_attention_kwargs": InputParam( - "cross_attention_kwargs", - type_hint=Optional[dict], - description="Kwargs dictionary passed to the AttentionProcessor", - ), - "clip_skip": InputParam( - "clip_skip", type_hint=Optional[int], description="Number of layers to skip in CLIP text encoder" - ), - "image": InputParam( - "image", - type_hint=PipelineImageInput, - required=True, - description="The image(s) to modify for img2img or inpainting", - ), - "mask_image": InputParam( - "mask_image", - type_hint=PipelineImageInput, - required=True, - description="Mask image for inpainting, white pixels will be repainted", - ), - "generator": InputParam( - "generator", - type_hint=Optional[Union[torch.Generator, List[torch.Generator]]], - description="Generator(s) for deterministic generation", - ), - "height": InputParam("height", type_hint=Optional[int], description="Height in pixels of the generated image"), - 
"width": InputParam("width", type_hint=Optional[int], description="Width in pixels of the generated image"), - "num_images_per_prompt": InputParam( - "num_images_per_prompt", type_hint=int, default=1, description="Number of images to generate per prompt" - ), - "num_inference_steps": InputParam( - "num_inference_steps", type_hint=int, default=50, description="Number of denoising steps" - ), - "timesteps": InputParam( - "timesteps", type_hint=Optional[torch.Tensor], description="Custom timesteps for the denoising process" - ), - "sigmas": InputParam( - "sigmas", type_hint=Optional[torch.Tensor], description="Custom sigmas for the denoising process" - ), - "denoising_end": InputParam( - "denoising_end", - type_hint=Optional[float], - description="Fraction of denoising process to complete before termination", - ), - # YiYi Notes: img2img defaults to 0.3, inpainting defaults to 0.9999 - "strength": InputParam( - "strength", type_hint=float, default=0.3, description="How much to transform the reference image" - ), - "denoising_start": InputParam( - "denoising_start", type_hint=Optional[float], description="Starting point of the denoising process" - ), - "latents": InputParam( - "latents", type_hint=Optional[torch.Tensor], description="Pre-generated noisy latents for image generation" - ), - "padding_mask_crop": InputParam( - "padding_mask_crop", - type_hint=Optional[Tuple[int, int]], - description="Size of margin in crop for image and mask", - ), - "original_size": InputParam( - "original_size", - type_hint=Optional[Tuple[int, int]], - description="Original size of the image for SDXL's micro-conditioning", - ), - "target_size": InputParam( - "target_size", type_hint=Optional[Tuple[int, int]], description="Target size for SDXL's micro-conditioning" - ), - "negative_original_size": InputParam( - "negative_original_size", - type_hint=Optional[Tuple[int, int]], - description="Negative conditioning based on image resolution", - ), - "negative_target_size": InputParam( - 
"negative_target_size", - type_hint=Optional[Tuple[int, int]], - description="Negative conditioning based on target resolution", - ), - "crops_coords_top_left": InputParam( - "crops_coords_top_left", - type_hint=Tuple[int, int], - default=(0, 0), - description="Top-left coordinates for SDXL's micro-conditioning", - ), - "negative_crops_coords_top_left": InputParam( - "negative_crops_coords_top_left", - type_hint=Tuple[int, int], - default=(0, 0), - description="Negative conditioning crop coordinates", - ), - "aesthetic_score": InputParam( - "aesthetic_score", type_hint=float, default=6.0, description="Simulates aesthetic score of generated image" - ), - "negative_aesthetic_score": InputParam( - "negative_aesthetic_score", type_hint=float, default=2.0, description="Simulates negative aesthetic score" - ), - "eta": InputParam("eta", type_hint=float, default=0.0, description="Parameter η in the DDIM paper"), - "output_type": InputParam( - "output_type", type_hint=str, default="pil", description="Output format (pil/tensor/np.array)" - ), - "ip_adapter_image": InputParam( - "ip_adapter_image", - type_hint=PipelineImageInput, - required=True, - description="Image(s) to be used as IP adapter", - ), - "control_image": InputParam( - "control_image", type_hint=PipelineImageInput, required=True, description="ControlNet input condition" - ), - "control_guidance_start": InputParam( - "control_guidance_start", - type_hint=Union[float, List[float]], - default=0.0, - description="When ControlNet starts applying", - ), - "control_guidance_end": InputParam( - "control_guidance_end", - type_hint=Union[float, List[float]], - default=1.0, - description="When ControlNet stops applying", - ), - "controlnet_conditioning_scale": InputParam( - "controlnet_conditioning_scale", - type_hint=Union[float, List[float]], - default=1.0, - description="Scale factor for ControlNet outputs", - ), - "guess_mode": InputParam( - "guess_mode", - type_hint=bool, - default=False, - description="Enables 
ControlNet encoder to recognize input without prompts", - ), - "control_mode": InputParam( - "control_mode", type_hint=List[int], required=True, description="Control mode for union controlnet" - ), -} - -SDXL_INTERMEDIATE_INPUTS_SCHEMA = { - "prompt_embeds": InputParam( - "prompt_embeds", - type_hint=torch.Tensor, - required=True, - description="Text embeddings used to guide image generation", - ), - "negative_prompt_embeds": InputParam( - "negative_prompt_embeds", type_hint=torch.Tensor, description="Negative text embeddings" - ), - "pooled_prompt_embeds": InputParam( - "pooled_prompt_embeds", type_hint=torch.Tensor, required=True, description="Pooled text embeddings" - ), - "negative_pooled_prompt_embeds": InputParam( - "negative_pooled_prompt_embeds", type_hint=torch.Tensor, description="Negative pooled text embeddings" - ), - "batch_size": InputParam("batch_size", type_hint=int, required=True, description="Number of prompts"), - "dtype": InputParam("dtype", type_hint=torch.dtype, description="Data type of model tensor inputs"), - "preprocess_kwargs": InputParam( - "preprocess_kwargs", type_hint=Optional[dict], description="Kwargs for ImageProcessor" - ), - "latents": InputParam( - "latents", type_hint=torch.Tensor, required=True, description="Initial latents for denoising process" - ), - "timesteps": InputParam("timesteps", type_hint=torch.Tensor, required=True, description="Timesteps for inference"), - "num_inference_steps": InputParam( - "num_inference_steps", type_hint=int, required=True, description="Number of denoising steps" - ), - "latent_timestep": InputParam( - "latent_timestep", type_hint=torch.Tensor, required=True, description="Initial noise level timestep" - ), - "image_latents": InputParam( - "image_latents", type_hint=torch.Tensor, required=True, description="Latents representing reference image" - ), - "mask": InputParam("mask", type_hint=torch.Tensor, required=True, description="Mask for inpainting"), - "masked_image_latents": InputParam( - 
"masked_image_latents", type_hint=torch.Tensor, description="Masked image latents for inpainting" - ), - "add_time_ids": InputParam( - "add_time_ids", type_hint=torch.Tensor, required=True, description="Time ids for conditioning" - ), - "negative_add_time_ids": InputParam( - "negative_add_time_ids", type_hint=torch.Tensor, description="Negative time ids" - ), - "timestep_cond": InputParam("timestep_cond", type_hint=torch.Tensor, description="Timestep conditioning for LCM"), - "noise": InputParam("noise", type_hint=torch.Tensor, description="Noise added to image latents"), - "crops_coords": InputParam("crops_coords", type_hint=Optional[Tuple[int]], description="Crop coordinates"), - "ip_adapter_embeds": InputParam( - "ip_adapter_embeds", type_hint=List[torch.Tensor], description="Image embeddings for IP-Adapter" - ), - "negative_ip_adapter_embeds": InputParam( - "negative_ip_adapter_embeds", - type_hint=List[torch.Tensor], - description="Negative image embeddings for IP-Adapter", - ), - "images": InputParam( - "images", - type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]], - required=True, - description="Generated images", - ), -} - -SDXL_PARAM_SCHEMA = {**SDXL_INPUTS_SCHEMA, **SDXL_INTERMEDIATE_INPUTS_SCHEMA} - - -DEFAULT_PARAM_MAPS = { - "prompt": { - "label": "Prompt", - "type": "string", - "default": "a bear sitting in a chair drinking a milkshake", - "display": "textarea", - }, - "negative_prompt": { - "label": "Negative Prompt", - "type": "string", - "default": "deformed, ugly, wrong proportion, low res, bad anatomy, worst quality, low quality", - "display": "textarea", - }, - "num_inference_steps": { - "label": "Steps", - "type": "int", - "default": 25, - "min": 1, - "max": 1000, - }, - "seed": { - "label": "Seed", - "type": "int", - "default": 0, - "min": 0, - "display": "random", - }, - "width": { - "label": "Width", - "type": "int", - "display": "text", - "default": 1024, - "min": 8, - "max": 8192, - "step": 8, - "group": 
"dimensions", - }, - "height": { - "label": "Height", - "type": "int", - "display": "text", - "default": 1024, - "min": 8, - "max": 8192, - "step": 8, - "group": "dimensions", - }, - "images": { - "label": "Images", - "type": "image", - "display": "output", - }, - "image": { - "label": "Image", - "type": "image", - "display": "input", - }, -} - -DEFAULT_TYPE_MAPS = { - "int": { - "type": "int", - "default": 0, - "min": 0, - }, - "float": { - "type": "float", - "default": 0.0, - "min": 0.0, - }, - "str": { - "type": "string", - "default": "", - }, - "bool": { - "type": "boolean", - "default": False, - }, - "image": { - "type": "image", - }, -} - -DEFAULT_MODEL_KEYS = ["unet", "vae", "text_encoder", "tokenizer", "controlnet", "transformer", "image_encoder"] -DEFAULT_CATEGORY = "Modular Diffusers" -DEFAULT_EXCLUDE_MODEL_KEYS = ["processor", "feature_extractor", "safety_checker"] -DEFAULT_PARAMS_GROUPS_KEYS = { - "text_encoders": ["text_encoder", "tokenizer"], - "ip_adapter_embeds": ["ip_adapter_embeds"], - "prompt_embeddings": ["prompt_embeds"], -} - - -def get_group_name(name, group_params_keys=DEFAULT_PARAMS_GROUPS_KEYS): - """ - Get the group name for a given parameter name, if not part of a group, return None e.g. "prompt_embeds" -> - "text_embeds", "text_encoder" -> "text_encoders", "prompt" -> None - """ - if name is None: - return None - for group_name, group_keys in group_params_keys.items(): - for group_key in group_keys: - if group_key in name: - return group_name - return None - - -class ModularNode(ConfigMixin): - """ - A ModularNode is a base class to build UI nodes using diffusers. Currently only supports Mellon. It is a wrapper - around a ModularPipelineBlocks object. - - > [!WARNING] > This is an experimental feature and is likely to change in the future. 
- """ - - config_name = "node_config.json" - - @classmethod - def from_pretrained( - cls, - pretrained_model_name_or_path: str, - trust_remote_code: Optional[bool] = None, - **kwargs, - ): - blocks = ModularPipelineBlocks.from_pretrained( - pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs - ) - return cls(blocks, **kwargs) - - def __init__(self, blocks, category=DEFAULT_CATEGORY, label=None, **kwargs): - self.blocks = blocks - - if label is None: - label = self.blocks.__class__.__name__ - # blocks param name -> mellon param name - self.name_mapping = {} - - input_params = {} - # pass or create a default param dict for each input - # e.g. for prompt, - # prompt = { - # "name": "text_input", # the name of the input in node definition, could be different from the input name in diffusers - # "label": "Prompt", - # "type": "string", - # "default": "a bear sitting in a chair drinking a milkshake", - # "display": "textarea"} - # if type is not specified, it'll be a "custom" param of its own type - # e.g. you can pass ModularNode(scheduler = {name :"scheduler"}) - # it will get this spec in node definition {"scheduler": {"label": "Scheduler", "type": "scheduler", "display": "input"}} - # name can be a dict, in that case, it is part of a "dict" input in mellon nodes, e.g. text_encoder= {name: {"text_encoders": "text_encoder"}} - inputs = self.blocks.inputs + self.blocks.intermediate_inputs - for inp in inputs: - param = kwargs.pop(inp.name, None) - if param: - # user can pass a param dict for all inputs, e.g. 
ModularNode(prompt = {...}) - input_params[inp.name] = param - mellon_name = param.pop("name", inp.name) - if mellon_name != inp.name: - self.name_mapping[inp.name] = mellon_name - continue - - if inp.name not in DEFAULT_PARAM_MAPS and not inp.required and not get_group_name(inp.name): - continue - - if inp.name in DEFAULT_PARAM_MAPS: - # first check if it's in the default param map, if so, directly use that - param = DEFAULT_PARAM_MAPS[inp.name].copy() - elif get_group_name(inp.name): - param = get_group_name(inp.name) - if inp.name not in self.name_mapping: - self.name_mapping[inp.name] = param - else: - # if not, check if it's in the SDXL input schema, if so, - # 1. use the type hint to determine the type - # 2. use the default param dict for the type e.g. if "steps" is a "int" type, {"steps": {"type": "int", "default": 0, "min": 0}} - if inp.type_hint is not None: - type_str = str(inp.type_hint).lower() - else: - inp_spec = SDXL_PARAM_SCHEMA.get(inp.name, None) - type_str = str(inp_spec.type_hint).lower() if inp_spec else "" - for type_key, type_param in DEFAULT_TYPE_MAPS.items(): - if type_key in type_str: - param = type_param.copy() - param["label"] = inp.name - param["display"] = "input" - break - else: - param = inp.name - # add the param dict to the inp_params dict - input_params[inp.name] = param - - component_params = {} - for comp in self.blocks.expected_components: - param = kwargs.pop(comp.name, None) - if param: - component_params[comp.name] = param - mellon_name = param.pop("name", comp.name) - if mellon_name != comp.name: - self.name_mapping[comp.name] = mellon_name - continue - - to_exclude = False - for exclude_key in DEFAULT_EXCLUDE_MODEL_KEYS: - if exclude_key in comp.name: - to_exclude = True - break - if to_exclude: - continue - - if get_group_name(comp.name): - param = get_group_name(comp.name) - if comp.name not in self.name_mapping: - self.name_mapping[comp.name] = param - elif comp.name in DEFAULT_MODEL_KEYS: - param = {"label": 
comp.name, "type": "diffusers_auto_model", "display": "input"} - else: - param = comp.name - # add the param dict to the model_params dict - component_params[comp.name] = param - - output_params = {} - if isinstance(self.blocks, SequentialPipelineBlocks): - last_block_name = list(self.blocks.sub_blocks.keys())[-1] - outputs = self.blocks.sub_blocks[last_block_name].intermediate_outputs - else: - outputs = self.blocks.intermediate_outputs - - for out in outputs: - param = kwargs.pop(out.name, None) - if param: - output_params[out.name] = param - mellon_name = param.pop("name", out.name) - if mellon_name != out.name: - self.name_mapping[out.name] = mellon_name - continue - - if out.name in DEFAULT_PARAM_MAPS: - param = DEFAULT_PARAM_MAPS[out.name].copy() - param["display"] = "output" - else: - group_name = get_group_name(out.name) - if group_name: - param = group_name - if out.name not in self.name_mapping: - self.name_mapping[out.name] = param - else: - param = out.name - # add the param dict to the outputs dict - output_params[out.name] = param - - if len(kwargs) > 0: - logger.warning(f"Unused kwargs: {kwargs}") - - register_dict = { - "category": category, - "label": label, - "input_params": input_params, - "component_params": component_params, - "output_params": output_params, - "name_mapping": self.name_mapping, - } - self.register_to_config(**register_dict) - - def setup(self, components_manager, collection=None): - self.pipeline = self.blocks.init_pipeline(components_manager=components_manager, collection=collection) - self._components_manager = components_manager - - @property - def mellon_config(self): - return self._convert_to_mellon_config() - - def _convert_to_mellon_config(self): - node = {} - node["label"] = self.config.label - node["category"] = self.config.category - - node_param = {} - for inp_name, inp_param in self.config.input_params.items(): - if inp_name in self.name_mapping: - mellon_name = self.name_mapping[inp_name] - else: - mellon_name = 
inp_name - if isinstance(inp_param, str): - param = { - "label": inp_param, - "type": inp_param, - "display": "input", - } - else: - param = inp_param - - if mellon_name not in node_param: - node_param[mellon_name] = param - else: - logger.debug(f"Input param {mellon_name} already exists in node_param, skipping {inp_name}") - - for comp_name, comp_param in self.config.component_params.items(): - if comp_name in self.name_mapping: - mellon_name = self.name_mapping[comp_name] - else: - mellon_name = comp_name - if isinstance(comp_param, str): - param = { - "label": comp_param, - "type": comp_param, - "display": "input", - } - else: - param = comp_param - - if mellon_name not in node_param: - node_param[mellon_name] = param - else: - logger.debug(f"Component param {comp_param} already exists in node_param, skipping {comp_name}") - - for out_name, out_param in self.config.output_params.items(): - if out_name in self.name_mapping: - mellon_name = self.name_mapping[out_name] - else: - mellon_name = out_name - if isinstance(out_param, str): - param = { - "label": out_param, - "type": out_param, - "display": "output", - } - else: - param = out_param - - if mellon_name not in node_param: - node_param[mellon_name] = param - else: - logger.debug(f"Output param {out_param} already exists in node_param, skipping {out_name}") - node["params"] = node_param - return node - - def save_mellon_config(self, file_path): - """ - Save the Mellon configuration to a JSON file. 
- - Args: - file_path (str or Path): Path where the JSON file will be saved - - Returns: - Path: Path to the saved config file - """ - file_path = Path(file_path) - - # Create directory if it doesn't exist - os.makedirs(file_path.parent, exist_ok=True) - - # Create a combined dictionary with module definition and name mapping - config = {"module": self.mellon_config, "name_mapping": self.name_mapping} - - # Save the config to file - with open(file_path, "w", encoding="utf-8") as f: - json.dump(config, f, indent=2) - - logger.info(f"Mellon config and name mapping saved to {file_path}") - - return file_path - - @classmethod - def load_mellon_config(cls, file_path): - """ - Load a Mellon configuration from a JSON file. - - Args: - file_path (str or Path): Path to the JSON file containing Mellon config - - Returns: - dict: The loaded combined configuration containing 'module' and 'name_mapping' - """ - file_path = Path(file_path) - - if not file_path.exists(): - raise FileNotFoundError(f"Config file not found: {file_path}") - - with open(file_path, "r", encoding="utf-8") as f: - config = json.load(f) - - logger.info(f"Mellon config loaded from {file_path}") - - return config - - def process_inputs(self, **kwargs): - params_components = {} - for comp_name, comp_param in self.config.component_params.items(): - logger.debug(f"component: {comp_name}") - mellon_comp_name = self.name_mapping.get(comp_name, comp_name) - if mellon_comp_name in kwargs: - if isinstance(kwargs[mellon_comp_name], dict) and comp_name in kwargs[mellon_comp_name]: - comp = kwargs[mellon_comp_name].pop(comp_name) - else: - comp = kwargs.pop(mellon_comp_name) - if comp: - params_components[comp_name] = self._components_manager.get_one(comp["model_id"]) - - params_run = {} - for inp_name, inp_param in self.config.input_params.items(): - logger.debug(f"input: {inp_name}") - mellon_inp_name = self.name_mapping.get(inp_name, inp_name) - if mellon_inp_name in kwargs: - if isinstance(kwargs[mellon_inp_name], 
dict) and inp_name in kwargs[mellon_inp_name]: - inp = kwargs[mellon_inp_name].pop(inp_name) - else: - inp = kwargs.pop(mellon_inp_name) - if inp is not None: - params_run[inp_name] = inp - - return_output_names = list(self.config.output_params.keys()) - - return params_components, params_run, return_output_names - - def execute(self, **kwargs): - params_components, params_run, return_output_names = self.process_inputs(**kwargs) - - self.pipeline.update_components(**params_components) - output = self.pipeline(**params_run, output=return_output_names) - return output From f7305d80b7dd097d1f84bb478b52c9970c180406 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 8 Jan 2026 07:15:45 +0100 Subject: [PATCH 07/12] add to import --- src/diffusers/__init__.py | 4 ++++ src/diffusers/modular_pipelines/__init__.py | 4 ++++ src/diffusers/modular_pipelines/qwenimage/__init__.py | 10 ++++++++++ .../qwenimage/modular_blocks_qwenimage_layered.py | 11 ++++++++++- src/diffusers/pipelines/auto_pipeline.py | 2 ++ 5 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 3a50634d82d8..5724cf7a24ad 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -419,6 +419,8 @@ "QwenImageEditPlusAutoBlocks", "QwenImageEditPlusModularPipeline", "QwenImageModularPipeline", + "QwenImageLayeredAutoBlocks", + "QwenImageLayeredModularPipeline", "StableDiffusionXLAutoBlocks", "StableDiffusionXLModularPipeline", "Wan22AutoBlocks", @@ -1139,6 +1141,8 @@ QwenImageEditPlusAutoBlocks, QwenImageEditPlusModularPipeline, QwenImageModularPipeline, + QwenImageLayeredAutoBlocks, + QwenImageLayeredModularPipeline, StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline, Wan22AutoBlocks, diff --git a/src/diffusers/modular_pipelines/__init__.py b/src/diffusers/modular_pipelines/__init__.py index 5fcc1a176d1b..febabd56e6f5 100644 --- a/src/diffusers/modular_pipelines/__init__.py +++ b/src/diffusers/modular_pipelines/__init__.py 
@@ -63,6 +63,8 @@ "QwenImageEditAutoBlocks", "QwenImageEditPlusModularPipeline", "QwenImageEditPlusAutoBlocks", + "QwenImageLayeredModularPipeline", + "QwenImageLayeredAutoBlocks", ] _import_structure["z_image"] = [ "ZImageAutoBlocks", @@ -97,6 +99,8 @@ QwenImageEditPlusAutoBlocks, QwenImageEditPlusModularPipeline, QwenImageModularPipeline, + QwenImageLayeredAutoBlocks, + QwenImageLayeredModularPipeline, ) from .stable_diffusion_xl import StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline from .wan import Wan22AutoBlocks, WanAutoBlocks, WanModularPipeline diff --git a/src/diffusers/modular_pipelines/qwenimage/__init__.py b/src/diffusers/modular_pipelines/qwenimage/__init__.py index b62912825f12..37a44ac1117b 100644 --- a/src/diffusers/modular_pipelines/qwenimage/__init__.py +++ b/src/diffusers/modular_pipelines/qwenimage/__init__.py @@ -33,10 +33,15 @@ "EDIT_PLUS_AUTO_BLOCKS", "QwenImageEditPlusAutoBlocks", ] + _import_structure["modular_blocks_qwenimage_layered"] = [ + "LAYERED_AUTO_BLOCKS", + "QwenImageLayeredAutoBlocks", + ] _import_structure["modular_pipeline"] = [ "QwenImageEditModularPipeline", "QwenImageEditPlusModularPipeline", "QwenImageModularPipeline", + "QwenImageLayeredModularPipeline", ] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: @@ -58,10 +63,15 @@ EDIT_PLUS_AUTO_BLOCKS, QwenImageEditPlusAutoBlocks, ) + from .modular_blocks_qwenimage_layered import ( + LAYERED_AUTO_BLOCKS, + QwenImageLayeredAutoBlocks, + ) from .modular_pipeline import ( QwenImageEditModularPipeline, QwenImageEditPlusModularPipeline, QwenImageModularPipeline, + QwenImageLayeredModularPipeline, ) else: import sys diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 23b953ce0732..c752fde8a0c1 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ 
b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -148,4 +148,13 @@ def description(self): ("denoise", QwenImageLayeredCoreDenoiseStep()), ("decode", QwenImageLayeredDecoderStep()), ] -) \ No newline at end of file +) + +class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): + model_name = "qwenimage-layered" + block_classes = LAYERED_AUTO_BLOCKS.values() + block_names = LAYERED_AUTO_BLOCKS.keys() + + @property + def description(self): + return "Auto Modular pipeline for layered denoising tasks using QwenImage-Layered." \ No newline at end of file diff --git a/src/diffusers/pipelines/auto_pipeline.py b/src/diffusers/pipelines/auto_pipeline.py index c14910250b54..b5ebe1b81495 100644 --- a/src/diffusers/pipelines/auto_pipeline.py +++ b/src/diffusers/pipelines/auto_pipeline.py @@ -99,6 +99,7 @@ QwenImageEditPlusPipeline, QwenImageImg2ImgPipeline, QwenImageInpaintPipeline, + QwenImageLayeredPipeline, QwenImagePipeline, ) from .sana import SanaPipeline @@ -202,6 +203,7 @@ ("qwenimage", QwenImageImg2ImgPipeline), ("qwenimage-edit", QwenImageEditPipeline), ("qwenimage-edit-plus", QwenImageEditPlusPipeline), + ("qwenimage-layered", QwenImageLayeredPipeline), ("z-image", ZImageImg2ImgPipeline), ] ) From 704ccd2b4287184b021c742c0aeea869b9faba0f Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 8 Jan 2026 12:46:49 +0100 Subject: [PATCH 08/12] more refacotr, make layer work --- .../modular_pipelines/qwenimage/decoders.py | 17 +- .../modular_pipelines/qwenimage/denoise.py | 9 +- .../modular_pipelines/qwenimage/encoders.py | 245 +++++++++++++++--- .../modular_pipelines/qwenimage/inputs.py | 22 +- .../modular_blocks_qwenimage_edit.py | 8 +- .../modular_blocks_qwenimage_edit_plus.py | 4 +- .../modular_blocks_qwenimage_layered.py | 10 +- 7 files changed, 251 insertions(+), 64 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 465b27df0301..64a5250b3c46 
100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -100,7 +100,7 @@ def __call__(self, components, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) # Unpack: (B, seq, C*4) -> (B, C, layers+1, H, W) - block_state.latents = components.layered_pachifier.unpack_latents( + block_state.latents = components.pachifier.unpack_latents( block_state.latents, block_state.height, block_state.width, @@ -205,7 +205,7 @@ def inputs(self) -> List[InputParam]: ] @property - def outputs(self) -> List[OutputParam]: + def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam(name="images", type_hint=List[List[PIL.Image.Image]]), ] @@ -230,18 +230,19 @@ def __call__(self, components, state: PipelineState) -> PipelineState: ) latents = latents / latents_std + latents_mean - # 2. Remove first frame (composite), keep layers frames - latents = latents[:, :, 1:] - - # 3. Reshape for batch decoding: (B, C, layers, H, W) -> (B*layers, C, 1, H, W) + # 2. Reshape for batch decoding: (B, C, layers+1, H, W) -> (B*layers, C, 1, H, W) b, c, f, h, w = latents.shape - latents = latents.permute(0, 2, 1, 3, 4).reshape(b * f, c, 1, h, w) + # 3. Remove first frame (composite), keep layers frames + latents = latents[:, :, 1:] + latents = latents.permute(0, 2, 1, 3, 4).reshape(-1, c, 1, h, w) # 4. Decode: (B*layers, C, 1, H, W) -> (B*layers, C, H, W) - image = components.vae.decode(latents, return_dict=False)[0][:, :, 0] + image = components.vae.decode(latents, return_dict=False)[0] + image = image.squeeze(2) # 5. Postprocess - returns flat list of B*layers images image = components.image_processor.postprocess(image, output_type=block_state.output_type) + # 6. 
Chunk into list per batch item images = [] diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py index 265f6ba6a1f0..9eb98e7156de 100644 --- a/src/diffusers/modular_pipelines/qwenimage/denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -15,6 +15,7 @@ from typing import List, Tuple import torch +import inspect from ...configuration_utils import FrozenDict from ...guiders import ClassifierFreeGuidance @@ -351,6 +352,13 @@ def __call__(self, components: QwenImageModularPipeline, block_state: BlockState ), } + transformer_args = set(inspect.signature(components.transformer.forward).parameters.keys()) + additional_cond_kwargs = {} + for field_name, field_value in block_state.denoiser_input_fields.items(): + if field_name in transformer_args and field_name not in guider_inputs: + additional_cond_kwargs[field_name] = field_value + block_state.additional_cond_kwargs.update(additional_cond_kwargs) + components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t) guider_state = components.guider.prepare_inputs(guider_inputs) @@ -362,7 +370,6 @@ def __call__(self, components: QwenImageModularPipeline, block_state: BlockState guider_state_batch.noise_pred = components.transformer( hidden_states=block_state.latent_model_input, timestep=block_state.timestep / 1000, - img_shapes=block_state.img_shapes, attention_kwargs=block_state.attention_kwargs, return_dict=False, **cond_kwargs, diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index c7e9d1f114cf..5b7eb222dbd9 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -257,7 +257,86 @@ def encode_vae_image( class QwenImageEditResizeStep(ModularPipelineBlocks): - model_name = "qwenimage" + model_name = "qwenimage-edit" + + def __init__( + self, + input_name: str 
= "image", + output_name: str = "resized_image", + ): + """Create a configurable step for resizing images to the target area while maintaining the aspect ratio. + Args: + input_name (str, optional): Name of the image field to read from the + pipeline state. Defaults to "image". + output_name (str, optional): Name of the resized image field to write + back to the pipeline state. Defaults to "resized_image". + """ + if not isinstance(input_name, str) or not isinstance(output_name, str): + raise ValueError( + f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}" + ) + self._image_input_name = input_name + self._resized_image_output_name = output_name + super().__init__() + + @property + def description(self) -> str: + return f"Image Resize step that resize the {self._image_input_name} to target area while maintaining the aspect ratio." + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec( + "image_resize_processor", + VaeImageProcessor, + config=FrozenDict({"vae_scale_factor": 16}), + default_creation_method="from_config", + ), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam( + name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image to resize" + ), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + name=self._resized_image_output_name, type_hint=List[PIL.Image.Image], description="The resized images" + ), + ] + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, state: PipelineState): + block_state = self.get_block_state(state) + + images = getattr(block_state, self._image_input_name) + + if not is_valid_image_imagelist(images): + raise ValueError(f"Images must be image or list of images but are {type(images)}") + + if is_valid_image(images): + images = [images] + + image_width, image_height = images[0].size + calculated_width, 
calculated_height, _ = calculate_dimensions(1024 * 1024, image_width / image_height) + + resized_images = [ + components.image_resize_processor.resize(image, height=calculated_height, width=calculated_width) + for image in images + ] + + setattr(block_state, self._resized_image_output_name, resized_images) + self.set_block_state(state, block_state) + return components, state + + +class QwenImageLayeredResizeStep(ModularPipelineBlocks): + model_name = "qwenimage-layered" def __init__( self, @@ -301,7 +380,7 @@ def inputs(self) -> List[InputParam]: name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image to resize" ), InputParam( - name="target_area", default=1024 * 1024, type_hint=int, description="The target area to resize the image to" + name="resolution", default=640, type_hint=int, description="The target area to resize the image to, can be 1024 or 640" ), ] @@ -313,10 +392,17 @@ def intermediate_outputs(self) -> List[OutputParam]: ), ] + @staticmethod + def check_inputs(resolution: int): + if resolution not in [1024, 640]: + raise ValueError(f"Resolution must be 1024 or 640 but is {resolution}") + @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState): block_state = self.get_block_state(state) + self.check_inputs(resolution=block_state.resolution) + images = getattr(block_state, self._image_input_name) if not is_valid_image_imagelist(images): @@ -326,7 +412,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): images = [images] image_width, image_height = images[0].size - calculated_width, calculated_height, _ = calculate_dimensions(block_state.target_area, image_width / image_height) + target_area = block_state.resolution * block_state.resolution + calculated_width, calculated_height, _ = calculate_dimensions(target_area, image_width / image_height) resized_images = [ components.image_resize_processor.resize(image, height=calculated_height, 
width=calculated_width) @@ -337,6 +424,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): self.set_block_state(state, block_state) return components, state + class QwenImageEditPlusResizeStep(ModularPipelineBlocks): """Resize each image independently based on its own aspect ratio. For QwenImage Edit Plus.""" @@ -890,7 +978,7 @@ class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): @property def description(self) -> str: - return "Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images can be resized first using QwenImageEditResizeStep." + return "Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images will be resized to the given height and width." @property def expected_components(self) -> List[ComponentSpec]: @@ -907,8 +995,7 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam("mask_image", required=True), - InputParam("resized_image"), - InputParam("image"), + InputParam("image", required=True), InputParam("height"), InputParam("width"), InputParam("padding_mask_crop"), @@ -938,23 +1025,73 @@ def check_inputs(height, width, vae_scale_factor): def __call__(self, components: QwenImageModularPipeline, state: PipelineState): block_state = self.get_block_state(state) - if block_state.resized_image is None and block_state.image is None: - raise ValueError("resized_image and image cannot be None at the same time") + self.check_inputs( + height=block_state.height, width=block_state.width, vae_scale_factor=components.vae_scale_factor + ) + height = block_state.height or components.default_height + width = block_state.width or components.default_width - if block_state.resized_image is None: - image = block_state.image - self.check_inputs( - height=block_state.height, width=block_state.width, vae_scale_factor=components.vae_scale_factor + block_state.processed_image, 
block_state.processed_mask_image, block_state.mask_overlay_kwargs = ( + components.image_mask_processor.preprocess( + image=block_state.image, + mask=block_state.mask_image, + height=height, + width=width, + padding_mask_crop=block_state.padding_mask_crop, ) - height = block_state.height or components.default_height - width = block_state.width or components.default_width - else: - width, height = block_state.resized_image[0].size - image = block_state.resized_image + ) + + self.set_block_state(state, block_state) + return components, state + + +class QwenImageEditInpaintProcessImagesInputStep(ModularPipelineBlocks): + model_name = "qwenimage-edit" + + @property + def description(self) -> str: + return "Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images should be resized first." + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec( + "image_mask_processor", + InpaintProcessor, + config=FrozenDict({"vae_scale_factor": 16}), + default_creation_method="from_config", + ), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam("mask_image", required=True), + InputParam("resized_image", required=True), + InputParam("padding_mask_crop"), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam(name="processed_image"), + OutputParam(name="processed_mask_image"), + OutputParam( + name="mask_overlay_kwargs", + type_hint=Dict, + description="The kwargs for the postprocess step to apply the mask overlay", + ), + ] + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, state: PipelineState): + block_state = self.get_block_state(state) + + width, height = block_state.resized_image[0].size block_state.processed_image, block_state.processed_mask_image, block_state.mask_overlay_kwargs = ( components.image_mask_processor.preprocess( - image=image, + image=block_state.resized_image, 
mask=block_state.mask_image, height=height, width=width, @@ -971,7 +1108,7 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks): @property def description(self) -> str: - return "Image Preprocess step. Images can be resized first using QwenImageEditResizeStep." + return "Image Preprocess step. will resize the image to the given height and width." @property def expected_components(self) -> List[ComponentSpec]: @@ -986,7 +1123,11 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: - return [InputParam("resized_image"), InputParam("image"), InputParam("height"), InputParam("width")] + return [ + InputParam("image", required=True), + InputParam("height"), + InputParam("width"), + ] @property def intermediate_outputs(self) -> List[OutputParam]: @@ -1004,22 +1145,59 @@ def check_inputs(height, width, vae_scale_factor): def __call__(self, components: QwenImageModularPipeline, state: PipelineState): block_state = self.get_block_state(state) - if block_state.resized_image is None and block_state.image is None: - raise ValueError("resized_image and image cannot be None at the same time") + self.check_inputs( + height=block_state.height, width=block_state.width, vae_scale_factor=components.vae_scale_factor + ) + height = block_state.height or components.default_height + width = block_state.width or components.default_width - if block_state.resized_image is None: - image = block_state.image - self.check_inputs( - height=block_state.height, width=block_state.width, vae_scale_factor=components.vae_scale_factor - ) - height = block_state.height or components.default_height - width = block_state.width or components.default_width - else: - width, height = block_state.resized_image[0].size - image = block_state.resized_image block_state.processed_image = components.image_processor.preprocess( - image=image, + image=block_state.image, + height=height, + width=width, + ) + + self.set_block_state(state, block_state) + 
return components, state + + +class QwenImageEditProcessImagesInputStep(ModularPipelineBlocks): + model_name = "qwenimage-edit" + + @property + def description(self) -> str: + return "Image Preprocess step. Images needs to be resized first." + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec( + "image_processor", + VaeImageProcessor, + config=FrozenDict({"vae_scale_factor": 16}), + default_creation_method="from_config", + ), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam("resized_image", required=True), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [OutputParam(name="processed_image")] + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, state: PipelineState): + block_state = self.get_block_state(state) + + width, height = block_state.resized_image[0].size + + block_state.processed_image = components.image_processor.preprocess( + image=block_state.resized_image, height=height, width=width, ) @@ -1027,6 +1205,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): self.set_block_state(state, block_state) return components, state + class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): model_name = "qwenimage-edit-plus" diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index 598eb5346ccb..3959e616bcda 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -286,8 +286,8 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="image_height", type_hint=int, description="The height of the image latents"), - OutputParam(name="image_width", type_hint=int, description="The width of the image latents"), + OutputParam(name="image_height", type_hint=int, description="The image 
height calculated from the image latents dimension"), + OutputParam(name="image_width", type_hint=int, description="The image width calculated from the image latents dimension"), ] def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -341,7 +341,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state -class QwenImageEditPlusInputsDynamicStep(ModularPipelineBlocks): +class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks): """Input step for QwenImage Edit Plus: handles list of latents with different sizes.""" model_name = "qwenimage-edit-plus" @@ -407,8 +407,8 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="image_height", type_hint=List[int], description="The heights of the image latents"), - OutputParam(name="image_width", type_hint=List[int], description="The widths of the image latents"), + OutputParam(name="image_height", type_hint=List[int], description="The image heights calculated from the image latents dimension"), + OutputParam(name="image_width", type_hint=List[int], description="The image widths calculated from the image latents dimension"), ] def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -529,8 +529,6 @@ def inputs(self) -> List[InputParam]: inputs = [ InputParam(name="num_images_per_prompt", default=1), InputParam(name="batch_size", required=True), - InputParam(name="height"), - InputParam(name="width"), ] for image_latent_input_name in self._image_latent_inputs: @@ -544,8 +542,10 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="image_height", type_hint=int, description="The height of the image latents"), - OutputParam(name="image_width", type_hint=int, description="The width of the image latents"), + 
OutputParam(name="image_height", type_hint=int, description="The image height calculated from the image latents dimension"), + OutputParam(name="image_width", type_hint=int, description="The image width calculated from the image latents dimension"), + OutputParam(name="height", type_hint=int, description="The height of the image output"), + OutputParam(name="width", type_hint=int, description="The width of the image output"), ] def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -561,8 +561,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # Layered latents are (B, layers, C, H, W) height = image_latent_tensor.shape[3] * components.vae_scale_factor width = image_latent_tensor.shape[4] * components.vae_scale_factor - block_state.height = block_state.height or height - block_state.width = block_state.width or width + block_state.height = height + block_state.width = width if not hasattr(block_state, "image_height"): block_state.image_height = height diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 4dcaa9dc110f..09bf3e8a3ff0 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -38,8 +38,8 @@ from .encoders import ( QwenImageEditResizeStep, QwenImageEditTextEncoderStep, - QwenImageInpaintProcessImagesInputStep, - QwenImageProcessImagesInputStep, + QwenImageEditInpaintProcessImagesInputStep, + QwenImageEditProcessImagesInputStep, QwenImageVaeEncoderStep, ) from .inputs import ( @@ -78,7 +78,7 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" block_classes = [ QwenImageEditResizeStep(), - QwenImageProcessImagesInputStep(), + QwenImageEditProcessImagesInputStep(), QwenImageVaeEncoderStep(), ] block_names = 
["resize", "preprocess", "encode"] @@ -93,7 +93,7 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" block_classes = [ QwenImageEditResizeStep(), - QwenImageInpaintProcessImagesInputStep(), + QwenImageEditInpaintProcessImagesInputStep(), QwenImageVaeEncoderStep(input_name="processed_image", output_name="image_latents"), ] block_names = ["resize", "preprocess", "encode"] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 35ee8994c8dd..0b65c0ff039c 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -35,7 +35,7 @@ QwenImageVaeEncoderStep, ) from .inputs import ( - QwenImageEditPlusInputsDynamicStep, + QwenImageEditPlusAdditionalInputsStep, QwenImageTextInputsStep, ) @@ -92,7 +92,7 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks): model_name = "qwenimage-edit-plus" block_classes = [ QwenImageTextInputsStep(), - QwenImageEditPlusInputsDynamicStep(image_latent_inputs=["image_latents"]), + QwenImageEditPlusAdditionalInputsStep(image_latent_inputs=["image_latents"]), ] block_names = ["text_inputs", "additional_inputs"] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index c752fde8a0c1..f39fb208f1da 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -30,9 +30,9 @@ QwenImageLayeredDenoiseStep, ) from .encoders import ( - QwenImageEditResizeStep, + QwenImageLayeredResizeStep, QwenImageTextEncoderStep, - QwenImageProcessImagesInputStep, + QwenImageEditProcessImagesInputStep, QwenImageVaeEncoderStep, 
QwenImageLayeredGetImagePromptStep, QwenImageLayeredPermuteLatentsStep, @@ -55,7 +55,7 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """Text encoder that takes text prompt, will generate a prompt based on image if not provided.""" model_name = "qwenimage-layered" block_classes = [ - QwenImageEditResizeStep(), + QwenImageLayeredResizeStep(), QwenImageLayeredGetImagePromptStep(), QwenImageTextEncoderStep(), ] @@ -74,8 +74,8 @@ def description(self) -> str: class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): model_name = "qwenimage-layered" block_classes = [ - QwenImageEditResizeStep(), - QwenImageProcessImagesInputStep(), + QwenImageLayeredResizeStep(), + QwenImageEditProcessImagesInputStep(), QwenImageVaeEncoderStep(), QwenImageLayeredPermuteLatentsStep(), ] From cee9ac03f27852780ae31103405725f5725e2d78 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 9 Jan 2026 09:46:22 +0100 Subject: [PATCH 09/12] clean up a bit git add src --- .../qwenimage/before_denoise.py | 3 + .../modular_pipelines/qwenimage/denoise.py | 49 +++++++++++++---- .../qwenimage/modular_blocks_qwenimage.py | 55 +++++++++---------- .../modular_blocks_qwenimage_edit.py | 14 ++--- .../modular_blocks_qwenimage_edit_plus.py | 6 +- .../modular_blocks_qwenimage_layered.py | 8 +-- 6 files changed, 80 insertions(+), 55 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index c441f51dd78a..0f7a9a53a4de 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -672,6 +672,7 @@ def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam( name="img_shapes", + kwargs_type="denoiser_input_fields", type_hint=List[List[Tuple[int, int, int]]], description="The shapes of the images latents, used for RoPE calculation", ), @@ -739,6 +740,7 @@ def intermediate_outputs(self) -> 
List[OutputParam]: return [ OutputParam( name="img_shapes", + kwargs_type="denoiser_input_fields", type_hint=List[List[Tuple[int, int, int]]], description="The shapes of the images latents, used for RoPE calculation", ), @@ -819,6 +821,7 @@ def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam( name="img_shapes", + kwargs_type="denoiser_input_fields", type_hint=List[List[Tuple[int, int, int]]], description="The shapes of the image latents, used for RoPE calculation", ), diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py index 9eb98e7156de..36ea457e0b02 100644 --- a/src/diffusers/modular_pipelines/qwenimage/denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -29,7 +29,11 @@ logger = logging.get_logger(__name__) +# ==================== +# 1. LOOP STEPS (run at each denoising step) +# ==================== +# loop step:before denoiser class QwenImageLoopBeforeDenoiser(ModularPipelineBlocks): model_name = "qwenimage" @@ -61,7 +65,7 @@ def __call__(self, components: QwenImageModularPipeline, block_state: BlockState class QwenImageEditLoopBeforeDenoiser(ModularPipelineBlocks): - model_name = "qwenimage" + model_name = "qwenimage-edit" @property def description(self) -> str: @@ -186,6 +190,7 @@ def __call__(self, components: QwenImageModularPipeline, block_state: BlockState return components, block_state +# loop step:denoiser class QwenImageLoopDenoiser(ModularPipelineBlocks): model_name = "qwenimage" @@ -254,6 +259,14 @@ def __call__(self, components: QwenImageModularPipeline, block_state: BlockState ), } + transformer_args = set(inspect.signature(components.transformer.forward).parameters.keys()) + additional_cond_kwargs = {} + for field_name, field_value in block_state.denoiser_input_fields.items(): + if field_name in transformer_args and field_name not in guider_inputs: + additional_cond_kwargs[field_name] = field_value + 
block_state.additional_cond_kwargs.update(additional_cond_kwargs) + + components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t) guider_state = components.guider.prepare_inputs(guider_inputs) @@ -265,7 +278,6 @@ def __call__(self, components: QwenImageModularPipeline, block_state: BlockState guider_state_batch.noise_pred = components.transformer( hidden_states=block_state.latent_model_input, timestep=block_state.timestep / 1000, - img_shapes=block_state.img_shapes, attention_kwargs=block_state.attention_kwargs, return_dict=False, **cond_kwargs, @@ -285,7 +297,7 @@ def __call__(self, components: QwenImageModularPipeline, block_state: BlockState class QwenImageEditLoopDenoiser(ModularPipelineBlocks): - model_name = "qwenimage" + model_name = "qwenimage-edit" @property def description(self) -> str: @@ -390,7 +402,7 @@ def __call__(self, components: QwenImageModularPipeline, block_state: BlockState return components, block_state - +# loop step:after denoiser class QwenImageLoopAfterDenoiser(ModularPipelineBlocks): model_name = "qwenimage" @@ -488,6 +500,9 @@ def __call__(self, components: QwenImageModularPipeline, block_state: BlockState return components, block_state +# ==================== +# 2. DENOISE LOOP WRAPPER: define the denoising loop logic +# ==================== class QwenImageDenoiseLoopWrapper(LoopSequentialPipelineBlocks): model_name = "qwenimage" @@ -544,8 +559,14 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state -# composing the denoising loops +# ==================== +# 3. 
DENOISE STEPS: compose the denoising loop with loop wrapper + loop steps +# ==================== + +# Qwen Image (text2image, image2image) class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): + model_name = "qwenimage" + block_classes = [ QwenImageLoopBeforeDenoiser, QwenImageLoopDenoiser, @@ -565,9 +586,9 @@ def description(self) -> str: "This block supports text2image and image2image tasks for QwenImage." ) - -# composing the inpainting denoising loops +# Qwen Image (inpainting) class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): + model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, QwenImageLoopDenoiser, @@ -590,8 +611,9 @@ def description(self) -> str: ) -# composing the controlnet denoising loops +# Qwen Image (text2image, image2image) with controlnet class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): + model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, QwenImageLoopBeforeDenoiserControlNet, @@ -614,8 +636,9 @@ def description(self) -> str: ) -# composing the controlnet denoising loops +# Qwen Image (inpainting) with controlnet class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): + model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, QwenImageLoopBeforeDenoiserControlNet, @@ -646,8 +669,9 @@ def description(self) -> str: ) -# composing the denoising loops +# Qwen Image Edit (image2image) class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper): + model_name = "qwenimage-edit" block_classes = [ QwenImageEditLoopBeforeDenoiser, QwenImageEditLoopDenoiser, @@ -668,7 +692,9 @@ def description(self) -> str: ) +# Qwen Image Edit (inpainting) class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): + model_name = "qwenimage-edit" block_classes = [ QwenImageEditLoopBeforeDenoiser, QwenImageEditLoopDenoiser, @@ -691,8 +717,9 @@ def description(self) -> str: ) -# actually same as QwenImageEditDenoiseStep +# Qwen Image Layered (image2image) class 
QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper): + model_name = "qwenimage-layered" block_classes = [ QwenImageEditLoopBeforeDenoiser, QwenImageEditLoopDenoiser, diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index e180558455cb..784a8e6a6edb 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -54,10 +54,10 @@ logger = logging.get_logger(__name__) - +# ==================== # 1. VAE ENCODER +# ==================== -# inpaint vae encoder class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): model_name = "qwenimage" block_classes = [QwenImageInpaintProcessImagesInputStep(), QwenImageVaeEncoderStep()] @@ -73,7 +73,6 @@ def description(self) -> str: ) -# img2img vae encoder class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): model_name = "qwenimage" @@ -84,8 +83,7 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): def description(self) -> str: return "Vae encoder step that preprocess andencode the image inputs into their latent representations." - -# auto vae encoder +# Auto VAE encoder class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks): block_classes = [QwenImageInpaintVaeEncoderStep, QwenImageImg2ImgVaeEncoderStep] block_names = ["inpaint", "img2img"] @@ -117,10 +115,11 @@ def description(self): + " - if `control_image` is not provided, step will be skipped." ) -# 2. DENOISE -# input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise +# ==================== +# 2. 
DENOISE (input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise) +# ==================== -# img2img input +# assemble input steps class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): model_name = "qwenimage" block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"])] @@ -133,7 +132,6 @@ def description(self): " - update height/width based `image_latents`, patchify `image_latents`." -# inpaint input class QwenImageInpaintInputStep(SequentialPipelineBlocks): model_name = "qwenimage" block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"])] @@ -145,7 +143,7 @@ def description(self): " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents` and `processed_mask_image`).\n" " - update height/width based `image_latents`, patchify `image_latents`." -# inpaint prepare latents +# assemble prepare latents steps class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): model_name = "qwenimage" block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()] @@ -159,10 +157,9 @@ def description(self) -> str: " - Create the pachified latents `mask` based on the processedmask image.\n" ) -# CoreDenoiseStep: -# (input + prepare_latents + set_timesteps + prepare_rope_inputs + denoise + after_denoise) +# assemble denoising steps -# 1. text2image +# Qwen Image (text2image) class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): model_name = "qwenimage" block_classes = [ @@ -187,7 +184,7 @@ def description(self): return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)." 
-# 2.inpaint +# Qwen Image (inpainting) class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): model_name = "qwenimage" block_classes = [ @@ -214,7 +211,7 @@ def description(self): return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task." -# 3. img2img +# Qwen Image (image2image) class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): model_name = "qwenimage" block_classes = [ @@ -242,7 +239,7 @@ def description(self): -# 4. text2image + controlnet +# Qwen Image (text2image) with controlnet class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): model_name = "qwenimage" block_classes = [ @@ -271,7 +268,7 @@ def description(self): return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)." -# 5. inpaint + controlnet +# Qwen Image (inpainting) with controlnet class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): model_name = "qwenimage" block_classes = [ @@ -302,7 +299,7 @@ def description(self): return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task." -# 6. img2img + controlnet +# Qwen Image (image2image) with controlnet class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): model_name = "qwenimage" block_classes = [ @@ -333,8 +330,7 @@ def description(self): return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task." -# auto denoise -# auto denoise step for controlnet tasks: works for all tasks with controlnet +# Auto denoise step for QwenImage class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks): block_classes = [ QwenImageCoreDenoiseStep, @@ -389,13 +385,12 @@ def description(self): ) -# 4. DECODE - -## 1.1 text2image +# ==================== +# 3. 
DECODE +# ==================== -#### decode -#### (standard decode step works for most tasks except for inpaint) +# standard decode step works for most tasks except for inpaint class QwenImageDecodeStep(SequentialPipelineBlocks): model_name = "qwenimage" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] @@ -407,8 +402,7 @@ def description(self): -#### inpaint decode - +# Inpaint decode step class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): model_name = "qwenimage" block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()] @@ -419,7 +413,7 @@ def description(self): return "Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image." -# auto decode step for inpaint and text2image tasks +# Auto decode step for QwenImage class QwenImageAutoDecodeStep(AutoPipelineBlocks): block_classes = [QwenImageInpaintDecodeStep, QwenImageDecodeStep] block_names = ["inpaint_decode", "decode"] @@ -435,8 +429,9 @@ def description(self): ) - -## 1.10 QwenImage/auto block & presets +# ==================== +# 4. AUTO BLOCKS & PRESETS +# ==================== AUTO_BLOCKS = InsertableDict( [ ("text_encoder", QwenImageTextEncoderStep()), diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 09bf3e8a3ff0..d986c2e46aec 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -126,10 +126,10 @@ def description(self): # ==================== -# 3. DENOISE - input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise +# 3. 
DENOISE (input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise) # ==================== -# Edit input step +# assemble input steps class QwenImageEditInputStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" block_classes = [ @@ -147,7 +147,6 @@ def description(self): ) -# Edit Inpaint input step class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" block_classes = [ @@ -165,7 +164,7 @@ def description(self): ) -# Edit Inpaint prepare latents step +# assemble prepare latents steps class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()] @@ -180,7 +179,7 @@ def description(self) -> str: ) -# 1. Edit (img2img) core denoise +# Qwen Image Edit (image2image) core denoise step class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" block_classes = [ @@ -205,7 +204,7 @@ def description(self): return "Core denoising workflow for QwenImage-Edit edit (img2img) task." -# 2. Edit Inpaint core denoise +# Qwen Image Edit (inpainting) core denoise step class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" block_classes = [ @@ -232,8 +231,9 @@ def description(self): return "Core denoising workflow for QwenImage-Edit edit inpaint task." 
-# Auto core denoise step +# Auto core denoise step for QwenImage Edit class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks): + model_name = "qwenimage-edit" block_classes = [ QwenImageEditInpaintCoreDenoiseStep, QwenImageEditCoreDenoiseStep, diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 0b65c0ff039c..45698e14dc24 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -84,10 +84,10 @@ def description(self) -> str: # ==================== -# 3. DENOISE - input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise +# 3. DENOISE (input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise) # ==================== -# Edit Plus input step +# assemble input steps class QwenImageEditPlusInputStep(SequentialPipelineBlocks): model_name = "qwenimage-edit-plus" block_classes = [ @@ -107,7 +107,7 @@ def description(self): ) -# Edit Plus core denoise +# Qwen Image Edit Plus (image2image) core denoise step class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): model_name = "qwenimage-edit-plus" block_classes = [ diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index f39fb208f1da..1ff366bcb38c 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -90,10 +90,10 @@ def description(self) -> str: # ==================== -# 3. DENOISE - input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise +# 3. 
DENOISE (input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise) # ==================== -# Layered input step +# assemble input steps class QwenImageLayeredInputStep(SequentialPipelineBlocks): model_name = "qwenimage-layered" block_classes = [ @@ -111,7 +111,7 @@ def description(self): ) -# 1. img2img core denoise +# Qwen Image Layered (image2image) core denoise step class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): model_name = "qwenimage-layered" block_classes = [ @@ -138,7 +138,7 @@ def description(self): # ==================== -# 5. AUTO BLOCKS & PRESETS +# 4. AUTO BLOCKS & PRESETS # ==================== LAYERED_AUTO_BLOCKS = InsertableDict( From 4c1401fdbc9a014f734db80d626297a98b3bf0ab Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 9 Jan 2026 09:56:02 +0100 Subject: [PATCH 10/12] more --- .../qwenimage/before_denoise.py | 16 ++++++++------ .../modular_pipelines/qwenimage/decoders.py | 8 +++---- .../modular_pipelines/qwenimage/encoders.py | 22 +++++++++++++++---- 3 files changed, 30 insertions(+), 16 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index 0f7a9a53a4de..f9a851528407 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -113,7 +113,9 @@ def get_timesteps(scheduler, num_inference_steps, strength): return timesteps, num_inference_steps - t_start -# Prepare Latents steps +# ==================== +# 1. PREPARE LATENTS +# ==================== class QwenImagePrepareLatentsStep(ModularPipelineBlocks): @@ -443,7 +445,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state -# Set Timesteps steps +# ==================== +# 2. 
SET TIMESTEPS +# ==================== class QwenImageSetTimestepsStep(ModularPipelineBlocks): @@ -643,11 +647,12 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state -# other inputs for denoiser +# ==================== +# 3. OTHER INPUTS FOR DENOISER +# ==================== ## RoPE inputs for denoiser - class QwenImageRoPEInputsStep(ModularPipelineBlocks): model_name = "qwenimage" @@ -792,8 +797,6 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks): - """RoPE inputs step for Edit Plus that handles lists of image heights/widths.""" - model_name = "qwenimage-edit-plus" @property @@ -930,7 +933,6 @@ def __call__(self, components, state: PipelineState) -> PipelineState: ## ControlNet inputs for denoiser class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks): model_name = "qwenimage" - @property def expected_components(self) -> List[ComponentSpec]: return [ diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 64a5250b3c46..18ad2d72b2b1 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -30,6 +30,7 @@ logger = logging.get_logger(__name__) +# after denoising loop (unpack latents) class QwenImageAfterDenoiseStep(ModularPipelineBlocks): model_name = "qwenimage" @@ -72,7 +73,6 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks): - """Unpack latents after denoising for Layered.""" model_name = "qwenimage-layered" @@ -111,7 +111,7 @@ def __call__(self, components, state: PipelineState) -> PipelineState: self.set_block_state(state, block_state) return components, state - +# decode step class QwenImageDecoderStep(ModularPipelineBlocks): model_name = "qwenimage" @@ -177,8 
+177,6 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - class QwenImageLayeredDecoderStep(ModularPipelineBlocks): - """Decode unpacked layered latents into multiple layer images.""" - model_name = "qwenimage-layered" @property @@ -242,7 +240,6 @@ def __call__(self, components, state: PipelineState) -> PipelineState: # 5. Postprocess - returns flat list of B*layers images image = components.image_processor.postprocess(image, output_type=block_state.output_type) - # 6. Chunk into list per batch item images = [] @@ -255,6 +252,7 @@ def __call__(self, components, state: PipelineState) -> PipelineState: return components, state +# postprocess the decoded images class QwenImageProcessImagesOutputStep(ModularPipelineBlocks): model_name = "qwenimage" diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 5b7eb222dbd9..139d26bb705e 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -256,6 +256,9 @@ def encode_vae_image( return image_latents +# ==================== +# 1. RESIZE +# ==================== class QwenImageEditResizeStep(ModularPipelineBlocks): model_name = "qwenimage-edit" @@ -517,7 +520,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state - +# ==================== +# 2. GET IMAGE PROMPT +# ==================== class QwenImageLayeredGetImagePromptStep(ModularPipelineBlocks): """ Auto-caption step that generates a text prompt from the input image if none is provided. @@ -602,7 +607,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state - +# ==================== +# 3. 
TEXT ENCODER +# ==================== class QwenImageTextEncoderStep(ModularPipelineBlocks): model_name = "qwenimage" @@ -972,7 +979,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): self.set_block_state(state, block_state) return components, state - +# ==================== +# 4. IMAGE PREPROCESS +# ==================== class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): model_name = "qwenimage" @@ -1257,6 +1266,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): self.set_block_state(state, block_state) return components, state +# ==================== +# 5. VAE ENCODER +# ==================== class QwenImageVaeEncoderStep(ModularPipelineBlocks): """VAE encoder that handles both single images and lists of images with varied resolutions.""" @@ -1451,7 +1463,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state - +# ==================== +# 6. PERMUTE LATENTS +# ==================== class QwenImageLayeredPermuteLatentsStep(ModularPipelineBlocks): """Permute image latents from VAE format to Layered format.""" From d444b49b221bf44924eab943f4fd67d70ddaa09c Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 9 Jan 2026 10:02:10 +0100 Subject: [PATCH 11/12] style --- src/diffusers/__init__.py | 4 +- src/diffusers/modular_pipelines/__init__.py | 2 +- .../modular_pipelines/modular_pipeline.py | 37 +++--- .../modular_pipelines/qwenimage/__init__.py | 4 +- .../qwenimage/before_denoise.py | 55 ++++++--- .../modular_pipelines/qwenimage/decoders.py | 12 +- .../modular_pipelines/qwenimage/denoise.py | 7 +- .../modular_pipelines/qwenimage/encoders.py | 71 ++++++----- .../modular_pipelines/qwenimage/inputs.py | 39 ++++-- .../qwenimage/modular_blocks_qwenimage.py | 115 ++++++++++-------- .../modular_blocks_qwenimage_edit.py | 15 ++- .../modular_blocks_qwenimage_edit_plus.py | 12 +- .../modular_blocks_qwenimage_layered.py | 21 ++-- 
.../qwenimage/modular_pipeline.py | 2 +- .../qwenimage/prompt_templates.py | 2 +- 15 files changed, 236 insertions(+), 162 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 5724cf7a24ad..e415307f4cfb 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -418,9 +418,9 @@ "QwenImageEditModularPipeline", "QwenImageEditPlusAutoBlocks", "QwenImageEditPlusModularPipeline", - "QwenImageModularPipeline", "QwenImageLayeredAutoBlocks", "QwenImageLayeredModularPipeline", + "QwenImageModularPipeline", "StableDiffusionXLAutoBlocks", "StableDiffusionXLModularPipeline", "Wan22AutoBlocks", @@ -1140,9 +1140,9 @@ QwenImageEditModularPipeline, QwenImageEditPlusAutoBlocks, QwenImageEditPlusModularPipeline, - QwenImageModularPipeline, QwenImageLayeredAutoBlocks, QwenImageLayeredModularPipeline, + QwenImageModularPipeline, StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline, Wan22AutoBlocks, diff --git a/src/diffusers/modular_pipelines/__init__.py b/src/diffusers/modular_pipelines/__init__.py index febabd56e6f5..e64db23f3831 100644 --- a/src/diffusers/modular_pipelines/__init__.py +++ b/src/diffusers/modular_pipelines/__init__.py @@ -98,9 +98,9 @@ QwenImageEditModularPipeline, QwenImageEditPlusAutoBlocks, QwenImageEditPlusModularPipeline, - QwenImageModularPipeline, QwenImageLayeredAutoBlocks, QwenImageLayeredModularPipeline, + QwenImageModularPipeline, ) from .stable_diffusion_xl import StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline from .wan import Wan22AutoBlocks, WanAutoBlocks, WanModularPipeline diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index a4a506f6e703..d857fd040955 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -530,8 +530,8 @@ def doc(self): class ConditionalPipelineBlocks(ModularPipelineBlocks): """ - A Pipeline Blocks that conditionally 
selects a block to run based on the inputs. - Subclasses must implement the `select_block` method to define the logic for selecting the block. + A Pipeline Blocks that conditionally selects a block to run based on the inputs. Subclasses must implement the + `select_block` method to define the logic for selecting the block. This class inherits from [`ModularPipelineBlocks`]. Check the superclass documentation for the generic methods the library implements for all the pipeline blocks (such as loading or saving etc.) @@ -547,7 +547,7 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks): block_classes = [] block_names = [] block_trigger_inputs = [] - default_block_name = None # name of the default block if no trigger inputs are provided, if None, this block can be skipped if no trigger inputs are provided + default_block_name = None # name of the default block if no trigger inputs are provided, if None, this block can be skipped if no trigger inputs are provided def __init__(self): sub_blocks = InsertableDict() @@ -594,11 +594,10 @@ def expected_configs(self): @property def required_inputs(self) -> List[str]: - # no default block means this conditional block can be skipped entirely if self.default_block_name is None: return [] - + first_block = next(iter(self.sub_blocks.values())) required_by_all = set(getattr(first_block, "required_inputs", set())) @@ -609,7 +608,6 @@ def required_inputs(self) -> List[str]: return list(required_by_all) - @property def inputs(self) -> List[Tuple[str, Any]]: named_inputs = [(name, block.inputs) for name, block in self.sub_blocks.items()] @@ -656,7 +654,7 @@ def fn_recursive_get_trigger(blocks): return trigger_values # Start with this block's block_trigger_inputs - all_triggers = set(t for t in self.block_trigger_inputs if t is not None) + all_triggers = {t for t in self.block_trigger_inputs if t is not None} # Add nested triggers all_triggers.update(fn_recursive_get_trigger(self.sub_blocks)) @@ -669,8 +667,8 @@ def 
trigger_inputs(self): def select_block(self, **kwargs) -> Optional[str]: """ - Select the block to run based on the trigger inputs. - Subclasses must implement this method to define the logic for selecting the block. + Select the block to run based on the trigger inputs. Subclasses must implement this method to define the logic + for selecting the block. Args: **kwargs: Trigger input names and their values from the state. @@ -682,7 +680,6 @@ def select_block(self, **kwargs) -> Optional[str]: @torch.no_grad() def __call__(self, pipeline, state: PipelineState) -> PipelineState: - trigger_kwargs = {name: state.get(name) for name in self.block_trigger_inputs if name is not None} block_name = self.select_block(**trigger_kwargs) @@ -692,7 +689,7 @@ def __call__(self, pipeline, state: PipelineState) -> PipelineState: if block_name is None: logger.info(f"skipping conditional block: {self.__class__.__name__}") return pipeline, state - + block = self.sub_blocks[block_name] try: @@ -739,11 +736,11 @@ def __repr__(self): expected_configs = getattr(self, "expected_configs", []) configs_str = format_configs(expected_configs, indent_level=2, add_empty_lines=False) - # Blocks section + # Blocks section blocks_str = " Sub-Blocks:\n" for i, (name, block) in enumerate(self.sub_blocks.items()): if name == self.default_block_name: - addtional_str = " [default]" + addtional_str = " [default]" else: addtional_str = "" blocks_str += f" • {name}{addtional_str} ({block.__class__.__name__})\n" @@ -1069,17 +1066,16 @@ def get_execution_blocks(self, **kwargs): Returns: SequentialPipelineBlocks containing only the blocks that would execute - + Example: - # Get blocks for inpainting workflow - blocks = pipeline.get_execution_blocks(prompt="a cat", mask=mask, image=image) - - # Get blocks for text2image workflow - blocks = pipeline.get_execution_blocks(prompt="a cat") + # Get blocks for inpainting workflow blocks = pipeline.get_execution_blocks(prompt="a cat", mask=mask, + image=image) + + # Get 
blocks for text2image workflow blocks = pipeline.get_execution_blocks(prompt="a cat") """ # Filter out None values active_inputs = {k: v for k, v in kwargs.items() if v is not None} - + blocks_triggered = self._traverse_trigger_blocks(active_inputs) return SequentialPipelineBlocks.from_blocks_dict(blocks_triggered) @@ -1121,7 +1117,6 @@ def __repr__(self): # Blocks section - moved to the end with simplified format blocks_str = " Sub-Blocks:\n" for i, (name, block) in enumerate(self.sub_blocks.items()): - # show execution order blocks_str += f" [{i}] {name} ({block.__class__.__name__})\n" diff --git a/src/diffusers/modular_pipelines/qwenimage/__init__.py b/src/diffusers/modular_pipelines/qwenimage/__init__.py index 37a44ac1117b..2b01a5b5a4b5 100644 --- a/src/diffusers/modular_pipelines/qwenimage/__init__.py +++ b/src/diffusers/modular_pipelines/qwenimage/__init__.py @@ -70,8 +70,8 @@ from .modular_pipeline import ( QwenImageEditModularPipeline, QwenImageEditPlusModularPipeline, - QwenImageModularPipeline, QwenImageLayeredModularPipeline, + QwenImageModularPipeline, ) else: import sys @@ -84,4 +84,4 @@ ) for name, value in _dummy_objects.items(): - setattr(sys.modules[__name__], name, value) \ No newline at end of file + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index f9a851528407..0c66d6ea3303 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -23,7 +23,7 @@ from ...utils.torch_utils import randn_tensor, unwrap_module from ..modular_pipeline import ModularPipelineBlocks, PipelineState from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam -from .modular_pipeline import QwenImageModularPipeline, QwenImagePachifier, QwenImageLayeredPachifier +from .modular_pipeline import QwenImageLayeredPachifier, QwenImageModularPipeline, 
QwenImagePachifier # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.calculate_shift @@ -653,6 +653,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - ## RoPE inputs for denoiser + class QwenImageRoPEInputsStep(ModularPipelineBlocks): model_name = "qwenimage" @@ -877,7 +878,9 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks): @property def description(self) -> str: - return "Step that prepares the RoPE inputs for the denoising process. Should be place after prepare_latents step" + return ( + "Step that prepares the RoPE inputs for the denoising process. Should be place after prepare_latents step" + ) @property def inputs(self) -> List[InputParam]: @@ -893,10 +896,30 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="img_shapes", type_hint=List[List[Tuple[int, int, int]]], kwargs_type="denoiser_input_fields", description="The shapes of the image latents, used for RoPE calculation"), - OutputParam(name="txt_seq_lens", type_hint=List[int], kwargs_type="denoiser_input_fields", description="The sequence lengths of the prompt embeds, used for RoPE calculation"), - OutputParam(name="negative_txt_seq_lens", type_hint=List[int], kwargs_type="denoiser_input_fields", description="The sequence lengths of the negative prompt embeds, used for RoPE calculation"), - OutputParam(name="additional_t_cond", type_hint=torch.Tensor, kwargs_type="denoiser_input_fields", description="The additional t cond, used for RoPE calculation"), + OutputParam( + name="img_shapes", + type_hint=List[List[Tuple[int, int, int]]], + kwargs_type="denoiser_input_fields", + description="The shapes of the image latents, used for RoPE calculation", + ), + OutputParam( + name="txt_seq_lens", + type_hint=List[int], + kwargs_type="denoiser_input_fields", + description="The sequence lengths of the prompt embeds, used for RoPE calculation", + ), + OutputParam( + 
name="negative_txt_seq_lens", + type_hint=List[int], + kwargs_type="denoiser_input_fields", + description="The sequence lengths of the negative prompt embeds, used for RoPE calculation", + ), + OutputParam( + name="additional_t_cond", + type_hint=torch.Tensor, + kwargs_type="denoiser_input_fields", + description="The additional t cond, used for RoPE calculation", + ), ] @torch.no_grad() @@ -906,24 +929,25 @@ def __call__(self, components, state: PipelineState) -> PipelineState: device = components._execution_device # All shapes are the same for Layered - shape = (1, block_state.height // components.vae_scale_factor // 2, block_state.width // components.vae_scale_factor // 2) - + shape = ( + 1, + block_state.height // components.vae_scale_factor // 2, + block_state.width // components.vae_scale_factor // 2, + ) + # layers+1 output shapes + 1 condition shape (all same) - block_state.img_shapes = [ - [shape] * (block_state.layers + 2) - ] * block_state.batch_size + block_state.img_shapes = [[shape] * (block_state.layers + 2)] * block_state.batch_size # txt_seq_lens block_state.txt_seq_lens = ( - block_state.prompt_embeds_mask.sum(dim=1).tolist() - if block_state.prompt_embeds_mask is not None else None + block_state.prompt_embeds_mask.sum(dim=1).tolist() if block_state.prompt_embeds_mask is not None else None ) block_state.negative_txt_seq_lens = ( block_state.negative_prompt_embeds_mask.sum(dim=1).tolist() - if block_state.negative_prompt_embeds_mask is not None else None + if block_state.negative_prompt_embeds_mask is not None + else None ) - block_state.additional_t_cond = torch.tensor([0] * block_state.batch_size).to(device=device, dtype=torch.long) self.set_block_state(state, block_state) @@ -933,6 +957,7 @@ def __call__(self, components, state: PipelineState) -> PipelineState: ## ControlNet inputs for denoiser class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks): model_name = "qwenimage" + @property def expected_components(self) -> 
List[ComponentSpec]: return [ diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 18ad2d72b2b1..24a88ebfca3c 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -24,7 +24,7 @@ from ...utils import logging from ..modular_pipeline import ModularPipelineBlocks, PipelineState from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam -from .modular_pipeline import QwenImageModularPipeline, QwenImagePachifier, QwenImageLayeredPachifier +from .modular_pipeline import QwenImageLayeredPachifier, QwenImageModularPipeline, QwenImagePachifier logger = logging.get_logger(__name__) @@ -73,7 +73,6 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks): - model_name = "qwenimage-layered" @property @@ -111,6 +110,7 @@ def __call__(self, components, state: PipelineState) -> PipelineState: self.set_block_state(state, block_state) return components, state + # decode step class QwenImageDecoderStep(ModularPipelineBlocks): model_name = "qwenimage" @@ -221,11 +221,9 @@ def __call__(self, components, state: PipelineState) -> PipelineState: .view(1, components.vae.config.z_dim, 1, 1, 1) .to(latents.device, latents.dtype) ) - latents_std = ( - 1.0 / torch.tensor(components.vae.config.latents_std) - .view(1, components.vae.config.z_dim, 1, 1, 1) - .to(latents.device, latents.dtype) - ) + latents_std = 1.0 / torch.tensor(components.vae.config.latents_std).view( + 1, components.vae.config.z_dim, 1, 1, 1 + ).to(latents.device, latents.dtype) latents = latents / latents_std + latents_mean # 2. 
Reshape for batch decoding: (B, C, layers+1, H, W) -> (B*layers, C, 1, H, W) diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py index 36ea457e0b02..eb1e5a341c68 100644 --- a/src/diffusers/modular_pipelines/qwenimage/denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import inspect from typing import List, Tuple import torch -import inspect from ...configuration_utils import FrozenDict from ...guiders import ClassifierFreeGuidance @@ -33,6 +33,7 @@ # 1. LOOP STEPS (run at each denoising step) # ==================== + # loop step:before denoiser class QwenImageLoopBeforeDenoiser(ModularPipelineBlocks): model_name = "qwenimage" @@ -266,7 +267,6 @@ def __call__(self, components: QwenImageModularPipeline, block_state: BlockState additional_cond_kwargs[field_name] = field_value block_state.additional_cond_kwargs.update(additional_cond_kwargs) - components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t) guider_state = components.guider.prepare_inputs(guider_inputs) @@ -402,6 +402,7 @@ def __call__(self, components: QwenImageModularPipeline, block_state: BlockState return components, block_state + # loop step:after denoiser class QwenImageLoopAfterDenoiser(ModularPipelineBlocks): model_name = "qwenimage" @@ -563,6 +564,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # 3. DENOISE STEPS: compose the denoising loop with loop wrapper + loop steps # ==================== + # Qwen Image (text2image, image2image) class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): model_name = "qwenimage" @@ -586,6 +588,7 @@ def description(self) -> str: "This block supports text2image and image2image tasks for QwenImage." 
) + # Qwen Image (inpainting) class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): model_name = "qwenimage" diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 139d26bb705e..4b66dd32e521 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -33,15 +33,15 @@ from ..modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam, OutputParam from .modular_pipeline import QwenImageModularPipeline from .prompt_templates import ( - QWENIMAGE_PROMPT_TEMPLATE, - QWENIMAGE_PROMPT_TEMPLATE_START_IDX, - QWENIMAGE_EDIT_PROMPT_TEMPLATE, - QWENIMAGE_EDIT_PROMPT_TEMPLATE_START_IDX, - QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE, QWENIMAGE_EDIT_PLUS_IMG_TEMPLATE, + QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE, QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE_START_IDX, - QWENIMAGE_LAYERED_CAPTION_PROMPT_EN, + QWENIMAGE_EDIT_PROMPT_TEMPLATE, + QWENIMAGE_EDIT_PROMPT_TEMPLATE_START_IDX, QWENIMAGE_LAYERED_CAPTION_PROMPT_CN, + QWENIMAGE_LAYERED_CAPTION_PROMPT_EN, + QWENIMAGE_PROMPT_TEMPLATE, + QWENIMAGE_PROMPT_TEMPLATE_START_IDX, ) @@ -263,11 +263,12 @@ class QwenImageEditResizeStep(ModularPipelineBlocks): model_name = "qwenimage-edit" def __init__( - self, - input_name: str = "image", + self, + input_name: str = "image", output_name: str = "resized_image", ): """Create a configurable step for resizing images to the target area while maintaining the aspect ratio. + Args: input_name (str, optional): Name of the image field to read from the pipeline state. Defaults to "image". @@ -342,11 +343,12 @@ class QwenImageLayeredResizeStep(ModularPipelineBlocks): model_name = "qwenimage-layered" def __init__( - self, - input_name: str = "image", + self, + input_name: str = "image", output_name: str = "resized_image", ): """Create a configurable step for resizing images to the target area while maintaining the aspect ratio. 
+ Args: input_name (str, optional): Name of the image field to read from the pipeline state. Defaults to "image". @@ -383,7 +385,10 @@ def inputs(self) -> List[InputParam]: name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image to resize" ), InputParam( - name="resolution", default=640, type_hint=int, description="The target area to resize the image to, can be 1024 or 640" + name="resolution", + default=640, + type_hint=int, + description="The target area to resize the image to, can be 1024 or 640", ), ] @@ -434,15 +439,15 @@ class QwenImageEditPlusResizeStep(ModularPipelineBlocks): model_name = "qwenimage-edit-plus" def __init__( - self, - input_name: str = "image", + self, + input_name: str = "image", output_name: str = "resized_image", target_area: int = 1024 * 1024, ): """Create a step for resizing images to a target area. - Each image is resized independently based on its own aspect ratio. - This is suitable for Edit Plus where multiple reference images can have different dimensions. + Each image is resized independently based on its own aspect ratio. This is suitable for Edit Plus where + multiple reference images can have different dimensions. Args: input_name (str, optional): Name of the image field to read. Defaults to "image". @@ -480,7 +485,10 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam( - name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image(s) to resize" + name=self._image_input_name, + required=True, + type_hint=torch.Tensor, + description="The image(s) to resize", ), ] @@ -525,8 +533,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # ==================== class QwenImageLayeredGetImagePromptStep(ModularPipelineBlocks): """ - Auto-caption step that generates a text prompt from the input image if none is provided. - Uses the VL model to generate a description of the image. 
+ Auto-caption step that generates a text prompt from the input image if none is provided. Uses the VL model to + generate a description of the image. """ model_name = "qwenimage-layered" @@ -559,7 +567,7 @@ def inputs(self) -> List[InputParam]: InputParam(name="prompt", type_hint=str, description="The prompt to encode"), InputParam( name="resized_image", - required=True, + required=True, type_hint=PIL.Image.Image, description="The image to generate caption from, should be resized use the resize step", ), @@ -571,7 +579,6 @@ def inputs(self) -> List[InputParam]: ), ] - @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) @@ -580,7 +587,6 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # If prompt is empty or None, generate caption from image if block_state.prompt is None or block_state.prompt == "" or block_state.prompt == " ": - if block_state.use_en_prompt: caption_prompt = components.config.image_caption_prompt_en else: @@ -595,7 +601,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - generated_ids = components.text_encoder.generate(**model_inputs, max_new_tokens=512) generated_ids_trimmed = [ - out_ids[len(in_ids):] for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids) + out_ids[len(in_ids) :] for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids) ] output_text = components.processor.batch_decode( generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False @@ -979,6 +985,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): self.set_block_state(state, block_state) return components, state + # ==================== # 4. 
IMAGE PREPROCESS # ==================== @@ -1133,8 +1140,8 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("image", required=True), - InputParam("height"), + InputParam("image", required=True), + InputParam("height"), InputParam("width"), ] @@ -1160,7 +1167,6 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): height = block_state.height or components.default_height width = block_state.width or components.default_width - block_state.processed_image = components.image_processor.preprocess( image=block_state.image, height=height, @@ -1245,8 +1251,6 @@ def intermediate_outputs(self) -> List[OutputParam]: def __call__(self, components: QwenImageModularPipeline, state: PipelineState): block_state = self.get_block_state(state) - - image = block_state.resized_image is_image_list = isinstance(image, list) @@ -1256,7 +1260,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): processed_images = [] for img in image: img_width, img_height = img.size - processed_images.append(components.image_processor.preprocess(image=img, height=img_height, width=img_width)) + processed_images.append( + components.image_processor.preprocess(image=img, height=img_height, width=img_width) + ) block_state.processed_image = processed_images if is_image_list: block_state.processed_image = processed_images @@ -1266,6 +1272,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): self.set_block_state(state, block_state) return components, state + # ==================== # 5. VAE ENCODER # ==================== @@ -1281,8 +1288,8 @@ def __init__( ): """Initialize a VAE encoder step for converting images to latent representations. - Handles both single images and lists of images. When input is a list, outputs a list of latents. - When input is a single tensor, outputs a single latent tensor. 
+ Handles both single images and lists of images. When input is a list, outputs a list of latents. When input is + a single tensor, outputs a single latent tensor. Args: input_name (str, optional): Name of the input image tensor or list. Defaults to "processed_image". @@ -1347,7 +1354,6 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - setattr(block_state, self._image_latents_output_name, image_latents) - self.set_block_state(state, block_state) return components, state @@ -1463,6 +1469,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state + # ==================== # 6. PERMUTE LATENTS # ==================== @@ -1494,4 +1501,4 @@ def __call__(self, components, state: PipelineState) -> PipelineState: setattr(block_state, self._input_name, latents.permute(0, 2, 1, 3, 4)) self.set_block_state(state, block_state) - return components, state \ No newline at end of file + return components, state diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index 3959e616bcda..4a1cf3700c57 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -19,7 +19,7 @@ from ...models import QwenImageMultiControlNetModel from ..modular_pipeline import ModularPipelineBlocks, PipelineState from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam -from .modular_pipeline import QwenImageModularPipeline, QwenImagePachifier, QwenImageLayeredPachifier +from .modular_pipeline import QwenImageLayeredPachifier, QwenImageModularPipeline, QwenImagePachifier def repeat_tensor_to_batch_size( @@ -286,8 +286,16 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="image_height", type_hint=int, description="The image height calculated from the image latents dimension"), - 
OutputParam(name="image_width", type_hint=int, description="The image width calculated from the image latents dimension"), + OutputParam( + name="image_height", + type_hint=int, + description="The image height calculated from the image latents dimension", + ), + OutputParam( + name="image_width", + type_hint=int, + description="The image width calculated from the image latents dimension", + ), ] def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -407,8 +415,16 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="image_height", type_hint=List[int], description="The image heights calculated from the image latents dimension"), - OutputParam(name="image_width", type_hint=List[int], description="The image widths calculated from the image latents dimension"), + OutputParam( + name="image_height", + type_hint=List[int], + description="The image heights calculated from the image latents dimension", + ), + OutputParam( + name="image_width", + type_hint=List[int], + description="The image widths calculated from the image latents dimension", + ), ] def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -477,6 +493,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - self.set_block_state(state, block_state) return components, state + # YiYi TODO: support define config default component from the ModularPipeline level. # it is same as QwenImageAdditionalInputsStep, but with layered pachifier. 
class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): @@ -542,8 +559,16 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="image_height", type_hint=int, description="The image height calculated from the image latents dimension"), - OutputParam(name="image_width", type_hint=int, description="The image width calculated from the image latents dimension"), + OutputParam( + name="image_height", + type_hint=int, + description="The image height calculated from the image latents dimension", + ), + OutputParam( + name="image_width", + type_hint=int, + description="The image width calculated from the image latents dimension", + ), OutputParam(name="height", type_hint=int, description="The height of the image output"), OutputParam(name="width", type_hint=int, description="The width of the image output"), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 784a8e6a6edb..63e9f5a28372 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -13,7 +13,7 @@ # limitations under the License. 
from ...utils import logging -from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks, ConditionalPipelineBlocks +from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks from ..modular_pipeline_utils import InsertableDict from .before_denoise import ( QwenImageControlNetBeforeDenoiserStep, @@ -35,7 +35,6 @@ QwenImageDenoiseStep, QwenImageInpaintControlNetDenoiseStep, QwenImageInpaintDenoiseStep, - QwenImageLoopBeforeDenoiserControlNet, ) from .encoders import ( QwenImageControlNetVaeEncoderStep, @@ -45,8 +44,8 @@ QwenImageVaeEncoderStep, ) from .inputs import ( - QwenImageControlNetInputsStep, QwenImageAdditionalInputsStep, + QwenImageControlNetInputsStep, QwenImageTextInputsStep, ) @@ -58,6 +57,7 @@ # 1. VAE ENCODER # ==================== + class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): model_name = "qwenimage" block_classes = [QwenImageInpaintProcessImagesInputStep(), QwenImageVaeEncoderStep()] @@ -83,6 +83,7 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): def description(self) -> str: return "Vae encoder step that preprocess andencode the image inputs into their latent representations." + # Auto VAE encoder class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks): block_classes = [QwenImageInpaintVaeEncoderStep, QwenImageImg2ImgVaeEncoderStep] @@ -115,10 +116,12 @@ def description(self): + " - if `control_image` is not provided, step will be skipped." ) + # ==================== # 2. 
DENOISE (input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise) # ==================== + # assemble input steps class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): model_name = "qwenimage" @@ -134,7 +137,12 @@ def description(self): class QwenImageInpaintInputStep(SequentialPipelineBlocks): model_name = "qwenimage" - block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"])] + block_classes = [ + QwenImageTextInputsStep(), + QwenImageAdditionalInputsStep( + image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"] + ), + ] block_names = ["text_inputs", "additional_inputs"] @property @@ -143,6 +151,7 @@ def description(self): " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents` and `processed_mask_image`).\n" " - update height/width based `image_latents`, patchify `image_latents`." 
+ # assemble prepare latents steps class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): model_name = "qwenimage" @@ -157,25 +166,27 @@ def description(self) -> str: " - Create the pachified latents `mask` based on the processedmask image.\n" ) + # assemble denoising steps + # Qwen Image (text2image) class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): model_name = "qwenimage" block_classes = [ QwenImageTextInputsStep(), - QwenImagePrepareLatentsStep(), - QwenImageSetTimestepsStep(), - QwenImageRoPEInputsStep(), + QwenImagePrepareLatentsStep(), + QwenImageSetTimestepsStep(), + QwenImageRoPEInputsStep(), QwenImageDenoiseStep(), QwenImageAfterDenoiseStep(), ] block_names = [ "input", - "prepare_latents", - "set_timesteps", - "prepare_rope_inputs", - "denoise", + "prepare_latents", + "set_timesteps", + "prepare_rope_inputs", + "denoise", "after_denoise", ] @@ -189,22 +200,22 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): model_name = "qwenimage" block_classes = [ QwenImageInpaintInputStep(), - QwenImagePrepareLatentsStep(), - QwenImageSetTimestepsWithStrengthStep(), - QwenImageInpaintPrepareLatentsStep(), + QwenImagePrepareLatentsStep(), + QwenImageSetTimestepsWithStrengthStep(), + QwenImageInpaintPrepareLatentsStep(), QwenImageRoPEInputsStep(), QwenImageInpaintDenoiseStep(), QwenImageAfterDenoiseStep(), - ] + ] block_names = [ "input", "prepare_latents", - "set_timesteps", - "prepare_inpaint_latents", + "set_timesteps", + "prepare_inpaint_latents", "prepare_rope_inputs", "denoise", "after_denoise", - ] + ] @property def description(self): @@ -216,38 +227,37 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): model_name = "qwenimage" block_classes = [ QwenImageImg2ImgInputStep(), - QwenImagePrepareLatentsStep(), - QwenImageSetTimestepsWithStrengthStep(), - QwenImagePrepareLatentsWithStrengthStep(), + QwenImagePrepareLatentsStep(), + QwenImageSetTimestepsWithStrengthStep(), + 
QwenImagePrepareLatentsWithStrengthStep(), QwenImageRoPEInputsStep(), QwenImageDenoiseStep(), QwenImageAfterDenoiseStep(), - ] + ] block_names = [ "input", - "prepare_latents", - "set_timesteps", - "prepare_img2img_latents", + "prepare_latents", + "set_timesteps", + "prepare_img2img_latents", "prepare_rope_inputs", "denoise", "after_denoise", - ] + ] @property def description(self): return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task." - # Qwen Image (text2image) with controlnet class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): model_name = "qwenimage" block_classes = [ QwenImageTextInputsStep(), QwenImageControlNetInputsStep(), - QwenImagePrepareLatentsStep(), - QwenImageSetTimestepsStep(), - QwenImageRoPEInputsStep(), + QwenImagePrepareLatentsStep(), + QwenImageSetTimestepsStep(), + QwenImageRoPEInputsStep(), QwenImageControlNetBeforeDenoiserStep(), QwenImageControlNetDenoiseStep(), QwenImageAfterDenoiseStep(), @@ -255,9 +265,9 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): block_names = [ "input", "controlnet_input", - "prepare_latents", - "set_timesteps", - "prepare_rope_inputs", + "prepare_latents", + "set_timesteps", + "prepare_rope_inputs", "controlnet_before_denoise", "controlnet_denoise", "after_denoise", @@ -274,25 +284,25 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): block_classes = [ QwenImageInpaintInputStep(), QwenImageControlNetInputsStep(), - QwenImagePrepareLatentsStep(), - QwenImageSetTimestepsWithStrengthStep(), - QwenImageInpaintPrepareLatentsStep(), + QwenImagePrepareLatentsStep(), + QwenImageSetTimestepsWithStrengthStep(), + QwenImageInpaintPrepareLatentsStep(), QwenImageRoPEInputsStep(), QwenImageControlNetBeforeDenoiserStep(), QwenImageInpaintControlNetDenoiseStep(), QwenImageAfterDenoiseStep(), - ] + ] block_names = [ "input", "controlnet_input", "prepare_latents", - "set_timesteps", - 
"prepare_inpaint_latents", + "set_timesteps", + "prepare_inpaint_latents", "prepare_rope_inputs", "controlnet_before_denoise", "controlnet_denoise", "after_denoise", - ] + ] @property def description(self): @@ -305,25 +315,25 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): block_classes = [ QwenImageImg2ImgInputStep(), QwenImageControlNetInputsStep(), - QwenImagePrepareLatentsStep(), - QwenImageSetTimestepsWithStrengthStep(), - QwenImagePrepareLatentsWithStrengthStep(), + QwenImagePrepareLatentsStep(), + QwenImageSetTimestepsWithStrengthStep(), + QwenImagePrepareLatentsWithStrengthStep(), QwenImageRoPEInputsStep(), QwenImageControlNetBeforeDenoiserStep(), QwenImageControlNetDenoiseStep(), QwenImageAfterDenoiseStep(), - ] + ] block_names = [ "input", "controlnet_input", - "prepare_latents", - "set_timesteps", - "prepare_img2img_latents", + "prepare_latents", + "set_timesteps", + "prepare_img2img_latents", "prepare_rope_inputs", "controlnet_before_denoise", "controlnet_denoise", "after_denoise", - ] + ] @property def description(self): @@ -334,7 +344,7 @@ def description(self): class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks): block_classes = [ QwenImageCoreDenoiseStep, - QwenImageInpaintCoreDenoiseStep, + QwenImageInpaintCoreDenoiseStep, QwenImageImg2ImgCoreDenoiseStep, QwenImageControlNetCoreDenoiseStep, QwenImageControlNetInpaintCoreDenoiseStep, @@ -346,12 +356,12 @@ class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks): "img2img", "controlnet_text2image", "controlnet_inpaint", - "controlnet_img2img"] + "controlnet_img2img", + ] block_trigger_inputs = ["control_image_latents", "processed_mask_image", "image_latents"] default_block_name = "text2image" def select_block(self, control_image_latents=None, processed_mask_image=None, image_latents=None): - if control_image_latents is not None: if processed_mask_image is not None: return "controlnet_inpaint" @@ -401,7 +411,6 @@ def description(self): return "Decode step 
that decodes the latents to images and postprocess the generated image." - # Inpaint decode step class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): model_name = "qwenimage" @@ -457,4 +466,4 @@ def description(self): + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n" + "- to run the controlnet workflow, you need to provide `control_image`\n" + "- for text-to-image generation, all you need to provide is `prompt`" - ) \ No newline at end of file + ) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index d986c2e46aec..99a349994c19 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -36,10 +36,10 @@ QwenImageEditInpaintDenoiseStep, ) from .encoders import ( - QwenImageEditResizeStep, - QwenImageEditTextEncoderStep, QwenImageEditInpaintProcessImagesInputStep, QwenImageEditProcessImagesInputStep, + QwenImageEditResizeStep, + QwenImageEditTextEncoderStep, QwenImageVaeEncoderStep, ) from .inputs import ( @@ -55,8 +55,10 @@ # 1. TEXT ENCODER # ==================== + class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): """VL encoder that takes both image and text prompts.""" + model_name = "qwenimage-edit" block_classes = [ QwenImageEditResizeStep(), @@ -73,6 +75,7 @@ def description(self) -> str: # 2. VAE ENCODER # ==================== + # Edit VAE encoder class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" @@ -129,6 +132,7 @@ def description(self): # 3. 
DENOISE (input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise) # ==================== + # assemble input steps class QwenImageEditInputStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" @@ -151,7 +155,9 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), - QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]), + QwenImageAdditionalInputsStep( + image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"] + ), ] block_names = ["text_inputs", "additional_inputs"] @@ -263,6 +269,7 @@ def description(self): # 4. DECODE # ==================== + # Decode step (standard) class QwenImageEditDecodeStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" @@ -326,4 +333,4 @@ def description(self): "Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.\n" "- for edit (img2img) generation, you need to provide `image`\n" "- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`\n" - ) \ No newline at end of file + ) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 45698e14dc24..275e4288eb0a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -13,7 +13,7 @@ # limitations under the License. 
from ...utils import logging -from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline import SequentialPipelineBlocks from ..modular_pipeline_utils import InsertableDict from .before_denoise import ( QwenImageEditPlusRoPEInputsStep, @@ -29,9 +29,9 @@ QwenImageEditDenoiseStep, ) from .encoders import ( + QwenImageEditPlusProcessImagesInputStep, QwenImageEditPlusResizeStep, QwenImageEditPlusTextEncoderStep, - QwenImageEditPlusProcessImagesInputStep, QwenImageVaeEncoderStep, ) from .inputs import ( @@ -47,8 +47,10 @@ # 1. TEXT ENCODER # ==================== + class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): """VL encoder that takes both image and text prompts. Uses 384x384 target area.""" + model_name = "qwenimage-edit-plus" block_classes = [ QwenImageEditPlusResizeStep(target_area=384 * 384, output_name="resized_cond_image"), @@ -65,8 +67,10 @@ def description(self) -> str: # 2. VAE ENCODER # ==================== + class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): """VAE encoder that handles multiple images with different sizes. Uses 1024x1024 target area.""" + model_name = "qwenimage-edit-plus" block_classes = [ QwenImageEditPlusResizeStep(target_area=1024 * 1024, output_name="resized_image"), @@ -87,6 +91,7 @@ def description(self) -> str: # 3. DENOISE (input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise) # ==================== + # assemble input steps class QwenImageEditPlusInputStep(SequentialPipelineBlocks): model_name = "qwenimage-edit-plus" @@ -136,6 +141,7 @@ def description(self): # 4. 
DECODE # ==================== + class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): model_name = "qwenimage-edit-plus" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] @@ -172,4 +178,4 @@ def description(self): "- `image` is required input (can be single image or list of images).\n" "- Each image is resized independently based on its own aspect ratio.\n" "- VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area." - ) \ No newline at end of file + ) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 1ff366bcb38c..fe6f756789af 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional from ...utils import logging -from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline import SequentialPipelineBlocks from ..modular_pipeline_utils import InsertableDict from .before_denoise import ( QwenImageLayeredPrepareLatentsStep, @@ -30,15 +29,14 @@ QwenImageLayeredDenoiseStep, ) from .encoders import ( - QwenImageLayeredResizeStep, - QwenImageTextEncoderStep, QwenImageEditProcessImagesInputStep, - QwenImageVaeEncoderStep, QwenImageLayeredGetImagePromptStep, QwenImageLayeredPermuteLatentsStep, + QwenImageLayeredResizeStep, + QwenImageTextEncoderStep, + QwenImageVaeEncoderStep, ) from .inputs import ( - QwenImageAdditionalInputsStep, QwenImageLayeredAdditionalInputsStep, QwenImageTextInputsStep, ) @@ -51,8 +49,10 @@ # 1. 
TEXT ENCODER # ==================== + class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """Text encoder that takes text prompt, will generate a prompt based on image if not provided.""" + model_name = "qwenimage-layered" block_classes = [ QwenImageLayeredResizeStep(), @@ -70,6 +70,7 @@ def description(self) -> str: # 2. VAE ENCODER # ==================== + # Edit VAE encoder class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): model_name = "qwenimage-layered" @@ -86,13 +87,11 @@ def description(self) -> str: return "Vae encoder step that encode the image inputs into their latent representations." - - - # ==================== # 3. DENOISE (input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise) # ==================== + # assemble input steps class QwenImageLayeredInputStep(SequentialPipelineBlocks): model_name = "qwenimage-layered" @@ -136,7 +135,6 @@ def description(self): return "Core denoising workflow for QwenImage-Layered img2img task." - # ==================== # 4. AUTO BLOCKS & PRESETS # ==================== @@ -150,6 +148,7 @@ def description(self): ] ) + class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): model_name = "qwenimage-layered" block_classes = LAYERED_AUTO_BLOCKS.values() @@ -157,4 +156,4 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): @property def description(self): - return "Auto Modular pipeline for layered denoising tasks using QwenImage-Layered." \ No newline at end of file + return "Auto Modular pipeline for layered denoising tasks using QwenImage-Layered." 
diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py b/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py index 3e580dde5f08..892435989d00 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py @@ -93,7 +93,7 @@ def unpack_latents(self, latents, height, width, vae_scale_factor=8): class QwenImageLayeredPachifier(ConfigMixin): """ A class to pack and unpack latents for QwenImage Layered. - + Unlike QwenImagePachifier, this handles 5D latents with shape (B, layers+1, C, H, W). """ diff --git a/src/diffusers/modular_pipelines/qwenimage/prompt_templates.py b/src/diffusers/modular_pipelines/qwenimage/prompt_templates.py index 068f768250c6..8e7beb555760 100644 --- a/src/diffusers/modular_pipelines/qwenimage/prompt_templates.py +++ b/src/diffusers/modular_pipelines/qwenimage/prompt_templates.py @@ -118,4 +118,4 @@ " - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容\n" "<|vision_start|><|image_pad|><|vision_end|><|im_end|>\n" "<|im_start|>assistant\n" -) \ No newline at end of file +) From 66f5c2c6d1754888d676f3d263903e8cfd3a75e9 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 9 Jan 2026 10:02:38 +0100 Subject: [PATCH 12/12] style --- .../dummy_torch_and_transformers_objects.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 63cec365799b..47d27741fe88 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -167,6 +167,36 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class QwenImageLayeredAutoBlocks(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def 
from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class QwenImageLayeredModularPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class QwenImageModularPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"]