diff --git a/docs/source/en/api/models/ernie_image_transformer2d.md b/docs/source/en/api/models/ernie_image_transformer2d.md new file mode 100644 index 000000000000..9fe03090577f --- /dev/null +++ b/docs/source/en/api/models/ernie_image_transformer2d.md @@ -0,0 +1,21 @@ + + +# ErnieImageTransformer2DModel + +A Transformer model for image-like data from [ERNIE-Image](https://huggingface.co/baidu/ERNIE-Image). + +A Transformer model for image-like data from [ERNIE-Image-Turbo](https://huggingface.co/baidu/ERNIE-Image-Turbo). + +## ErnieImageTransformer2DModel + +[[autodoc]] ErnieImageTransformer2DModel \ No newline at end of file diff --git a/docs/source/en/api/pipelines/ernie_image.md b/docs/source/en/api/pipelines/ernie_image.md new file mode 100644 index 000000000000..69c0234d4cbf --- /dev/null +++ b/docs/source/en/api/pipelines/ernie_image.md @@ -0,0 +1,86 @@ + + +# Ernie-Image + +
+ LoRA +
+ +[ERNIE-Image] is a powerful and highly efficient image generation model with 8B parameters. Currently there's only two models to be released: + +|Model|Hugging Face| +|---|---| +|ERNIE-Image|https://huggingface.co/baidu/ERNIE-Image| +|ERNIE-Image-Turbo|https://huggingface.co/baidu/ERNIE-Image-Turbo| + +## ERNIE-Image + +ERNIE-Image is designed with a relatively compact architecture and solid instruction-following capability, emphasizing parameter efficiency. Based on an 8B DiT backbone, it provides performance that is comparable in some scenarios to larger (20B+) models, while maintaining reasonable parameter efficiency. It offers a relatively stable level of performance in instruction understanding and execution, text generation (e.g., English / Chinese / Japanese), and overall stability. + +## ERNIE-Image-Turbo + +ERNIE-Image-Turbo is a distilled variant of ERNIE-Image, requiring only 8 NFEs (Number of Function Evaluations) and offering a more efficient alternative with relatively comparable performance to the full model in certain cases. + +## ErnieImagePipeline + +Use [ErnieImagePipeline] to generate images from text prompts. The pipeline supports Prompt Enhancer (PE) by default, which enhances the user’s raw prompt to improve output quality, though it may reduce instruction-following accuracy. + +We provide a pretrained 3B-parameter PE model; however, using larger language models (e.g., Gemini or ChatGPT) for prompt enhancement may yield better results. The system prompt template is available at: https://huggingface.co/baidu/ERNIE-Image/blob/main/pe/chat_template.jinja. + +If you prefer not to use PE, set use_pe=False. + +```python +import torch +from diffusers import ErnieImagePipeline +from diffusers.utils import load_image + +pipe = ErnieImagePipeline.from_pretrained("baidu/ERNIE-Image", torch_dtype=torch.bfloat16) +pipe.to("cuda") +# 如果显存不足,可以开启offload +pipe.enable_model_cpu_offload() + +prompt = "一只黑白相间的中华田园犬" +images = pipe( + prompt=prompt, + height=1024, + width=1024, + num_inference_steps=50, + guidance_scale=5.0, + generator=generator, + use_pe=True, +).images +images[0].save("ernie-image-output.png") +``` + +```python +import torch +from diffusers import ErnieImagePipeline +from diffusers.utils import load_image + +pipe = ErnieImagePipeline.from_pretrained("baidu/ERNIE-Image-Turbo", torch_dtype=torch.bfloat16) +pipe.to("cuda") +# 如果显存不足,可以开启offload +pipe.enable_model_cpu_offload() + +prompt = "一只黑白相间的中华田园犬" +images = pipe( + prompt=prompt, + height=1024, + width=1024, + num_inference_steps=8, + guidance_scale=5.0, + generator=generator, + use_pe=True, +).images +images[0].save("ernie-image-turbo-output.png") +``` \ No newline at end of file diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index e9441ef71a31..80f2415384ae 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -302,6 +302,7 @@ "ZImageControlNetModel", "ZImageTransformer2DModel", "attention_backend", + "ErnieImageTransformer2DModel" ] ) _import_structure["modular_pipelines"].extend( @@ -744,6 +745,7 @@ "ZImageInpaintPipeline", "ZImageOmniPipeline", "ZImagePipeline", + "ErnieImagePipeline", ] ) @@ -1101,6 +1103,7 @@ ZImageControlNetModel, ZImageTransformer2DModel, attention_backend, + ErnieImageTransformer2DModel, ) from .modular_pipelines import ( AutoPipelineBlocks, @@ -1517,6 +1520,7 @@ ZImageInpaintPipeline, ZImageOmniPipeline, ZImagePipeline, + ErnieImagePipeline, ) try: diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index c0eb77652226..8eea0064496f 100755 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -101,6 +101,7 @@ _import_structure["transformers.transformer_cogview4"] = ["CogView4Transformer2DModel"] _import_structure["transformers.transformer_cosmos"] = ["CosmosTransformer3DModel"] _import_structure["transformers.transformer_easyanimate"] = ["EasyAnimateTransformer3DModel"] + _import_structure["transformers.transformer_ernie_image"] = ["ErnieImageTransformer2DModel"] _import_structure["transformers.transformer_flux"] = ["FluxTransformer2DModel"] _import_structure["transformers.transformer_flux2"] = ["Flux2Transformer2DModel"] _import_structure["transformers.transformer_glm_image"] = ["GlmImageTransformer2DModel"] @@ -219,6 +220,7 @@ DiTTransformer2DModel, DualTransformer2DModel, EasyAnimateTransformer3DModel, + ErnieImageTransformer2DModel, Flux2Transformer2DModel, FluxTransformer2DModel, GlmImageTransformer2DModel, diff --git a/src/diffusers/models/transformers/__init__.py b/src/diffusers/models/transformers/__init__.py index 7eca42e1210e..9087a2bc857d 100755 --- a/src/diffusers/models/transformers/__init__.py +++ b/src/diffusers/models/transformers/__init__.py @@ -53,3 +53,4 @@ from .transformer_wan_animate import WanAnimateTransformer3DModel from .transformer_wan_vace import WanVACETransformer3DModel from .transformer_z_image import ZImageTransformer2DModel + from .transformer_ernie_image import ErnieImageTransformer2DModel diff --git a/src/diffusers/models/transformers/transformer_ernie_image.py b/src/diffusers/models/transformers/transformer_ernie_image.py new file mode 100644 index 000000000000..c3171598366a --- /dev/null +++ b/src/diffusers/models/transformers/transformer_ernie_image.py @@ -0,0 +1,368 @@ +# Copyright 2025 Baidu ERNIE-Image Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Ernie-Image Transformer2DModel for HuggingFace Diffusers. +""" + +import math +import inspect +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from ...configuration_utils import ConfigMixin, register_to_config +from ..embeddings import Timesteps +from ..embeddings import TimestepEmbedding +from ..modeling_utils import ModelMixin +from ...utils import BaseOutput +from ..normalization import RMSNorm +from ..attention_processor import Attention +from ..attention_dispatch import dispatch_attention_fn +from ..attention import AttentionMixin, AttentionModuleMixin + + +@dataclass +class ErnieImageTransformer2DModelOutput(BaseOutput): + sample: torch.Tensor + + +def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor: + assert dim % 2 == 0 + scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim + omega = 1.0 / (theta ** scale) + out = torch.einsum("...n,d->...nd", pos, omega) + return out.float() + + +class ErnieImageEmbedND3(nn.Module): + def __init__(self, dim: int, theta: int, axes_dim: Tuple[int, int, int]): + super().__init__() + self.dim = dim + self.theta = theta + self.axes_dim = list(axes_dim) + + def forward(self, ids: torch.Tensor) -> torch.Tensor: + emb = torch.cat([rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(3)], dim=-1) + emb = emb.unsqueeze(2) # [B, S, 1, head_dim//2] + return torch.stack([emb, emb], dim=-1).reshape(*emb.shape[:-1], -1) # [B, S, 1, head_dim] + + +class ErnieImagePatchEmbedDynamic(nn.Module): + def __init__(self, in_channels: int, embed_dim: int, patch_size: int): + super().__init__() + self.patch_size = patch_size + self.proj = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size, bias=True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.proj(x) + batch_size, dim, height, width = x.shape + return x.reshape(batch_size, dim, height * width).transpose(1, 2).contiguous() + + +class ErnieImageSingleStreamAttnProcessor: + _attention_backend = None + _parallel_config = None + + def __init__(self): + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError( + "ErnieImageSingleStreamAttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to version 2.0 or higher." + ) + + def __call__( + self, + attn: Attention, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor | None = None, + freqs_cis: torch.Tensor | None = None, + ) -> torch.Tensor: + query = attn.to_q(hidden_states) + key = attn.to_k(hidden_states) + value = attn.to_v(hidden_states) + + query = query.unflatten(-1, (attn.heads, -1)) + key = key.unflatten(-1, (attn.heads, -1)) + value = value.unflatten(-1, (attn.heads, -1)) + + # Apply Norms + if attn.norm_q is not None: + query = attn.norm_q(query) + if attn.norm_k is not None: + key = attn.norm_k(key) + + # Apply RoPE: same rotate_half logic as Megatron _apply_rotary_pos_emb_bshd (rotary_interleaved=False) + # x_in: [B, S, heads, head_dim], freqs_cis: [B, S, 1, head_dim] with angles [θ0,θ0,θ1,θ1,...] + def apply_rotary_emb(x_in: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor: + rot_dim = freqs_cis.shape[-1] + x, x_pass = x_in[..., :rot_dim], x_in[..., rot_dim:] + cos_ = torch.cos(freqs_cis).to(x.dtype) + sin_ = torch.sin(freqs_cis).to(x.dtype) + # Non-interleaved rotate_half: [-x2, x1] + x1, x2 = x.chunk(2, dim=-1) + x_rotated = torch.cat((-x2, x1), dim=-1) + return torch.cat((x * cos_ + x_rotated * sin_, x_pass), dim=-1) + + if freqs_cis is not None: + query = apply_rotary_emb(query, freqs_cis) + key = apply_rotary_emb(key, freqs_cis) + + # Cast to correct dtype + dtype = query.dtype + query, key = query.to(dtype), key.to(dtype) + + # From [batch, seq_len] to [batch, 1, 1, seq_len] -> broadcast to [batch, heads, seq_len, seq_len] + if attention_mask is not None and attention_mask.ndim == 2: + attention_mask = attention_mask[:, None, None, :] + + # Compute joint attention + hidden_states = dispatch_attention_fn( + query, + key, + value, + attn_mask=attention_mask, + dropout_p=0.0, + is_causal=False, + backend=self._attention_backend, + parallel_config=self._parallel_config, + ) + + # Reshape back + hidden_states = hidden_states.flatten(2, 3) + hidden_states = hidden_states.to(dtype) + output = attn.to_out[0](hidden_states) + + return output + + +class ErnieImageAttention(torch.nn.Module, AttentionModuleMixin): + _default_processor_cls = ErnieImageSingleStreamAttnProcessor + + def __init__( + self, + query_dim: int, + heads: int = 8, + dim_head: int = 64, + dropout: float = 0.0, + bias: bool = False, + qk_norm: str = "rms_norm", + added_proj_bias: bool | None = True, + out_bias: bool = True, + eps: float = 1e-5, + out_dim: int = None, + elementwise_affine: bool = True, + processor=None, + ): + super().__init__() + + self.head_dim = dim_head + self.inner_dim = out_dim if out_dim is not None else dim_head * heads + self.query_dim = query_dim + self.out_dim = out_dim if out_dim is not None else query_dim + self.heads = out_dim // dim_head if out_dim is not None else heads + + self.use_bias = bias + self.dropout = dropout + + self.added_proj_bias = added_proj_bias + + self.to_q = torch.nn.Linear(query_dim, self.inner_dim, bias=bias) + self.to_k = torch.nn.Linear(query_dim, self.inner_dim, bias=bias) + self.to_v = torch.nn.Linear(query_dim, self.inner_dim, bias=bias) + + # QK Norm + if qk_norm == "layer_norm": + self.norm_q = torch.nn.LayerNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine) + self.norm_k = torch.nn.LayerNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine) + elif qk_norm == "rms_norm": + self.norm_q = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine) + self.norm_k = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine) + else: + raise ValueError( + f"unknown qk_norm: {qk_norm}. Should be one of None, 'layer_norm', 'fp32_layer_norm', 'layer_norm_across_heads', 'rms_norm', 'rms_norm_across_heads', 'l2'." + ) + + self.to_out = torch.nn.ModuleList([]) + self.to_out.append(torch.nn.Linear(self.inner_dim, self.out_dim, bias=out_bias)) + + if processor is None: + processor = self._default_processor_cls() + self.set_processor(processor) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor | None = None, + attention_mask: torch.Tensor | None = None, + image_rotary_emb: torch.Tensor | None = None, + **kwargs, + ) -> torch.Tensor: + attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys()) + unused_kwargs = [k for k, _ in kwargs.items() if k not in attn_parameters] + if len(unused_kwargs) > 0: + logger.warning( + f"joint_attention_kwargs {unused_kwargs} are not expected by {self.processor.__class__.__name__} and will be ignored." + ) + kwargs = {k: w for k, w in kwargs.items() if k in attn_parameters} + return self.processor(self, hidden_states, attention_mask, image_rotary_emb, **kwargs) + + +class ErnieImageFeedForward(nn.Module): + def __init__(self, hidden_size: int, ffn_hidden_size: int): + super().__init__() + # Separate gate and up projections (matches converted weights) + self.gate_proj = nn.Linear(hidden_size, ffn_hidden_size, bias=False) + self.up_proj = nn.Linear(hidden_size, ffn_hidden_size, bias=False) + self.linear_fc2 = nn.Linear(ffn_hidden_size, hidden_size, bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.linear_fc2(self.up_proj(x) * F.gelu(self.gate_proj(x))) + +class ErnieImageSharedAdaLNBlock(nn.Module): + def __init__(self, hidden_size: int, num_heads: int, ffn_hidden_size: int, eps: float = 1e-6, qk_layernorm: bool = True): + super().__init__() + self.adaLN_sa_ln = RMSNorm(hidden_size, eps=eps) + self.self_attention = ErnieImageAttention( + query_dim=hidden_size, + dim_head=hidden_size // num_heads, + heads=num_heads, + qk_norm="rms_norm" if qk_layernorm else None, + eps=eps, + bias=False, + out_bias=False, + processor=ErnieImageSingleStreamAttnProcessor(), + ) + self.adaLN_mlp_ln = RMSNorm(hidden_size, eps=eps) + self.mlp = ErnieImageFeedForward(hidden_size, ffn_hidden_size) + + def forward(self, x, rotary_pos_emb, shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp, attention_mask=None): + residual = x + x = self.adaLN_sa_ln(x) + x = (x.float() * (1 + scale_msa.float()) + shift_msa.float()).to(x.dtype) + x_bsh = x.permute(1, 0, 2) # [S, B, H] → [B, S, H] for diffusers Attention (batch-first) + attn_out = self.self_attention(x_bsh, attention_mask=attention_mask, image_rotary_emb=rotary_pos_emb) + attn_out = attn_out.permute(1, 0, 2) # [B, S, H] → [S, B, H] + x = residual + (gate_msa.float() * attn_out.float()).to(x.dtype) + residual = x + x = self.adaLN_mlp_ln(x) + x = (x.float() * (1 + scale_mlp.float()) + shift_mlp.float()).to(x.dtype) + return residual + (gate_mlp.float() * self.mlp(x).float()).to(x.dtype) + + +class ErnieImageAdaLNContinuous(nn.Module): + def __init__(self, hidden_size: int, eps: float = 1e-6): + super().__init__() + self.norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=eps) + self.linear = nn.Linear(hidden_size, hidden_size * 2) + + def forward(self, x: torch.Tensor, conditioning: torch.Tensor) -> torch.Tensor: + scale, shift = self.linear(conditioning).chunk(2, dim=-1) + x = self.norm(x) + # Broadcast conditioning to sequence dimension + x = x * (1 + scale.unsqueeze(0)) + shift.unsqueeze(0) + return x + + +class ErnieImageTransformer2DModel(ModelMixin, ConfigMixin): + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + hidden_size: int = 3072, + num_attention_heads: int = 24, + num_layers: int = 24, + ffn_hidden_size: int = 8192, + in_channels: int = 128, + out_channels: int = 128, + patch_size: int = 1, + text_in_dim: int = 2560, + rope_theta: int = 256, + rope_axes_dim: Tuple[int, int, int] = (32, 48, 48), + eps: float = 1e-6, + qk_layernorm: bool = True, + ): + super().__init__() + self.hidden_size = hidden_size + self.num_heads = num_attention_heads + self.head_dim = hidden_size // num_attention_heads + self.num_layers = num_layers + self.patch_size = patch_size + self.in_channels = in_channels + self.out_channels = out_channels + self.text_in_dim = text_in_dim + + self.x_embedder = ErnieImagePatchEmbedDynamic(in_channels, hidden_size, patch_size) + self.text_proj = nn.Linear(text_in_dim, hidden_size, bias=False) if text_in_dim != hidden_size else None + self.time_proj = Timesteps(hidden_size, flip_sin_to_cos=False, downscale_freq_shift=0) + self.time_embedding = TimestepEmbedding(hidden_size, hidden_size) + self.pos_embed = ErnieImageEmbedND3(dim=self.head_dim, theta=rope_theta, axes_dim=rope_axes_dim) + self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size)) + nn.init.zeros_(self.adaLN_modulation[-1].weight) + nn.init.zeros_(self.adaLN_modulation[-1].bias) + self.layers = nn.ModuleList([ErnieImageSharedAdaLNBlock(hidden_size, num_attention_heads, ffn_hidden_size, eps, qk_layernorm=qk_layernorm) for _ in range(num_layers)]) + self.final_norm = ErnieImageAdaLNContinuous(hidden_size, eps) + self.final_linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels) + nn.init.zeros_(self.final_linear.weight) + nn.init.zeros_(self.final_linear.bias) + + def forward( + self, + hidden_states: torch.Tensor, + timestep: torch.Tensor, + # encoder_hidden_states: List[torch.Tensor], + text_bth: torch.Tensor, + text_lens: torch.Tensor, + return_dict: bool = True + ): + device, dtype = hidden_states.device, hidden_states.dtype + B, C, H, W = hidden_states.shape + p, Hp, Wp = self.patch_size, H // self.patch_size, W // self.patch_size + N_img = Hp * Wp + + img_sbh = self.x_embedder(hidden_states).transpose(0, 1).contiguous() + # text_bth, text_lens = self._pad_text(encoder_hidden_states, device, dtype) + if self.text_proj is not None and text_bth.numel() > 0: + text_bth = self.text_proj(text_bth) + Tmax = text_bth.shape[1] + text_sbh = text_bth.transpose(0, 1).contiguous() + + x = torch.cat([img_sbh, text_sbh], dim=0) + S = x.shape[0] + + # Position IDs + text_ids = torch.cat([torch.arange(Tmax, device=device, dtype=torch.float32).view(1, Tmax, 1).expand(B, -1, -1), torch.zeros((B, Tmax, 2), device=device)], dim=-1) if Tmax > 0 else torch.zeros((B, 0, 3), device=device) + grid_yx = torch.stack(torch.meshgrid(torch.arange(Hp, device=device, dtype=torch.float32), torch.arange(Wp, device=device, dtype=torch.float32), indexing="ij"), dim=-1).reshape(-1, 2) + image_ids = torch.cat([text_lens.float().view(B, 1, 1).expand(-1, N_img, -1), grid_yx.view(1, N_img, 2).expand(B, -1, -1)], dim=-1) + rotary_pos_emb = self.pos_embed(torch.cat([image_ids, text_ids], dim=1)) + + # Attention mask: True = valid (attend), False = padding (mask out), matches sdpa bool convention + valid_text = torch.arange(Tmax, device=device).view(1, Tmax) < text_lens.view(B, 1) if Tmax > 0 else torch.zeros((B, 0), device=device, dtype=torch.bool) + attention_mask = torch.cat([torch.ones((B, N_img), device=device, dtype=torch.bool), valid_text], dim=1)[:, None, None, :] + + # AdaLN + sample = self.time_proj(timestep.to(dtype)) + sample = sample.to(self.time_embedding.linear_1.weight.dtype) + c = self.time_embedding(sample) + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = [t.unsqueeze(0).expand(S, -1, -1).contiguous() for t in self.adaLN_modulation(c).chunk(6, dim=-1)] + for layer in self.layers: + x = layer(x, rotary_pos_emb, shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp, attention_mask) + x = self.final_norm(x, c).type_as(x) + patches = self.final_linear(x)[:N_img].transpose(0, 1).contiguous() + output = patches.view(B, Hp, Wp, p, p, self.out_channels).permute(0, 5, 1, 3, 2, 4).contiguous().view(B, self.out_channels, H, W) + + return ErnieImageTransformer2DModelOutput(sample=output) if return_dict else (output,) + diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 26626b5f7efe..cd3437fcdaaf 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -335,6 +335,7 @@ ) _import_structure["mochi"] = ["MochiPipeline"] _import_structure["omnigen"] = ["OmniGenPipeline"] + _import_structure["ernie_image"] = ["ErnieImagePipeline"] _import_structure["ovis_image"] = ["OvisImagePipeline"] _import_structure["visualcloze"] = ["VisualClozePipeline", "VisualClozeGenerationPipeline"] _import_structure["pixart_alpha"] = ["PixArtAlphaPipeline", "PixArtSigmaPipeline"] @@ -771,6 +772,7 @@ from .mochi import MochiPipeline from .nucleusmoe_image import NucleusMoEImagePipeline from .omnigen import OmniGenPipeline + from .ernie_image import ErnieImagePipeline from .ovis_image import OvisImagePipeline from .pag import ( AnimateDiffPAGPipeline, diff --git a/src/diffusers/pipelines/ernie_image/__init__.py b/src/diffusers/pipelines/ernie_image/__init__.py new file mode 100644 index 000000000000..97355fb609f3 --- /dev/null +++ b/src/diffusers/pipelines/ernie_image/__init__.py @@ -0,0 +1,47 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa: F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_ernie_image"] = ["ErnieImagePipeline"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .pipeline_ernie_image import ErnieImagePipeline +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/ernie_image/pipeline_ernie_image.py b/src/diffusers/pipelines/ernie_image/pipeline_ernie_image.py new file mode 100644 index 000000000000..3fb1948739fa --- /dev/null +++ b/src/diffusers/pipelines/ernie_image/pipeline_ernie_image.py @@ -0,0 +1,365 @@ +# Copyright 2025 Baidu ERNIE-Image Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Ernie-Image Pipeline for HuggingFace Diffusers. +""" + +import json +import os +import numpy as np +import torch +from PIL import Image +from typing import Callable, List, Optional, Union +from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer + +from ...pipelines.pipeline_utils import DiffusionPipeline +from ...schedulers import FlowMatchEulerDiscreteScheduler +from ...models import AutoencoderKLFlux2 +from ...models.transformers import ErnieImageTransformer2DModel +from .pipeline_output import ErnieImagePipelineOutput + + +class ErnieImagePipeline(DiffusionPipeline): + """ + Pipeline for text-to-image generation using ErnieImageTransformer2DModel. + + This pipeline uses: + - A custom DiT transformer model + - A Flux2-style VAE for encoding/decoding latents + - A text encoder (e.g., Qwen) for text conditioning + - Flow Matching Euler Discrete Scheduler + """ + + model_cpu_offload_seq = "pe->text_encoder->transformer->vae" + # For SGLang fallback ... + _optional_components = ["pe", "pe_tokenizer"] + _callback_tensor_inputs = ["latents"] + + def __init__( + self, + transformer: ErnieImageTransformer2DModel, + vae: AutoencoderKLFlux2, + text_encoder: AutoModel, + tokenizer: AutoTokenizer, + scheduler: FlowMatchEulerDiscreteScheduler, + pe: Optional[AutoModelForCausalLM] = None, + pe_tokenizer: Optional[AutoTokenizer] = None, + ): + super().__init__() + self.register_modules( + transformer=transformer, + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + scheduler=scheduler, + pe=pe, + pe_tokenizer=pe_tokenizer, + ) + self.vae_scale_factor = 16 # VAE downsample factor + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1.0 + + @torch.no_grad() + def _enhance_prompt_with_pe( + self, + prompt: str, + device: torch.device, + width: int = 1024, + height: int = 1024, + system_prompt: Optional[str] = None, + temperature: float = 0.6, + top_p: float = 0.95, + ) -> str: + """Use PE model to rewrite/enhance a short prompt via chat_template.""" + # Build user message as JSON carrying prompt text and target resolution + user_content = json.dumps( + {"prompt": prompt, "width": width, "height": height}, + ensure_ascii=False, + ) + messages = [] + if system_prompt is not None: + messages.append({"role": "system", "content": system_prompt}) + messages.append({"role": "user", "content": user_content}) + + # apply_chat_template picks up the chat_template.jinja loaded with pe_tokenizer + input_text = self.pe_tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=False, # "Output:" is already in the user block + ) + inputs = self.pe_tokenizer(input_text, return_tensors="pt").to(device) + output_ids = self.pe.generate( + **inputs, + max_new_tokens=self.pe_tokenizer.model_max_length, + do_sample=temperature != 1.0 or top_p != 1.0, + temperature=temperature, + top_p=top_p, + pad_token_id=self.pe_tokenizer.pad_token_id, + eos_token_id=self.pe_tokenizer.eos_token_id, + ) + # Decode only newly generated tokens + generated_ids = output_ids[0][inputs["input_ids"].shape[1]:] + return self.pe_tokenizer.decode(generated_ids, skip_special_tokens=True).strip() + + @torch.no_grad() + def encode_prompt( + self, + prompt: Union[str, List[str]], + device: torch.device, + num_images_per_prompt: int = 1, + ) -> List[torch.Tensor]: + """Encode text prompts to embeddings.""" + if isinstance(prompt, str): + prompt = [prompt] + + text_hiddens = [] + + for p in prompt: + ids = self.tokenizer( + p, + add_special_tokens=True, + truncation=True, + padding=False, + )["input_ids"] + + if len(ids) == 0: + if self.tokenizer.bos_token_id is not None: + ids = [self.tokenizer.bos_token_id] + else: + ids = [0] + + input_ids = torch.tensor([ids], device=device) + with torch.no_grad(): + outputs = self.text_encoder( + input_ids=input_ids, + output_hidden_states=True, + ) + # Use second to last hidden state (matches training) + hidden = outputs.hidden_states[-2][0] # [T, H] + + # Repeat for num_images_per_prompt + for _ in range(num_images_per_prompt): + text_hiddens.append(hidden) + + return text_hiddens + + @staticmethod + def _patchify_latents(latents: torch.Tensor) -> torch.Tensor: + """2x2 patchify: [B, 32, H, W] -> [B, 128, H/2, W/2]""" + b, c, h, w = latents.shape + latents = latents.view(b, c, h // 2, 2, w // 2, 2) + latents = latents.permute(0, 1, 3, 5, 2, 4) + return latents.reshape(b, c * 4, h // 2, w // 2) + + @staticmethod + def _unpatchify_latents(latents: torch.Tensor) -> torch.Tensor: + """Reverse patchify: [B, 128, H/2, W/2] -> [B, 32, H, W]""" + b, c, h, w = latents.shape + latents = latents.reshape(b, c // 4, 2, 2, h, w) + latents = latents.permute(0, 1, 4, 2, 5, 3) + return latents.reshape(b, c // 4, h * 2, w * 2) + + @staticmethod + def _pad_text(text_hiddens: List[torch.Tensor], device: torch.device, dtype: torch.dtype, text_in_dim: int): + B = len(text_hiddens) + if B == 0: + return torch.zeros((0, 0, text_in_dim), device=device, dtype=dtype), torch.zeros((0,), device=device, dtype=torch.long) + normalized = [th.squeeze(1).to(device).to(dtype) if th.dim() == 3 else th.to(device).to(dtype) for th in text_hiddens] + lens = torch.tensor([t.shape[0] for t in normalized], device=device, dtype=torch.long) + Tmax = int(lens.max().item()) + text_bth = torch.zeros((B, Tmax, text_in_dim), device=device, dtype=dtype) + for i, t in enumerate(normalized): + text_bth[i, :t.shape[0], :] = t + return text_bth, lens + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]], + negative_prompt: Optional[Union[str, List[str]]] = "", + height: int = 1024, + width: int = 1024, + num_inference_steps: int = 50, + guidance_scale: float = 4.0, + num_images_per_prompt: int = 1, + generator: Optional[torch.Generator] = None, + latents: Optional[torch.Tensor] = None, + output_type: str = "pil", + return_dict: bool = True, + callback_on_step_end: Optional[Callable[[int, int, dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + use_pe: bool = True, # 默认使用PE进行改写 + ): + """ + Generate images from text prompts. + + Args: + prompt: Text prompt(s) + negative_prompt: Negative prompt(s) for CFG. Default is "". + height: Image height in pixels (must be divisible by 16). Default: 1024. + width: Image width in pixels (must be divisible by 16). Default: 1024. + num_inference_steps: Number of denoising steps + guidance_scale: CFG scale (1.0 = no guidance). Default: 4.0. + num_images_per_prompt: Number of images per prompt + generator: Random generator for reproducibility + latents: Pre-generated latents (optional) + output_type: "pil" or "latent" + return_dict: Whether to return a dataclass + callback_on_step_end: Optional callback invoked at the end of each denoising step. + Called as `callback_on_step_end(pipeline, step, timestep, callback_kwargs)` where + `callback_kwargs` contains the tensors listed in `callback_on_step_end_tensor_inputs`. + The callback may return a dict to override those tensors for subsequent steps. + callback_on_step_end_tensor_inputs: List of tensor names passed into the callback kwargs. + Must be a subset of `_callback_tensor_inputs` (default: `["latents"]`). + use_pe: Whether to use the PE model to enhance prompts before generation. + + Returns: + :class:`ErnieImagePipelineOutput` with `images` and `revised_prompts`. + """ + device = self._execution_device + dtype = self.transformer.dtype + + self._guidance_scale = guidance_scale + # Validate dimensions + if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0: + raise ValueError(f"Height and width must be divisible by {self.vae_scale_factor}") + + # Handle prompts + if isinstance(prompt, str): + prompt = [prompt] + + # [Phase 1] PE: enhance prompts + revised_prompts: Optional[List[str]] = None + if use_pe and self.pe is not None and self.pe_tokenizer is not None: + prompt = [ + self._enhance_prompt_with_pe(p, device, width=width, height=height) + for p in prompt + ] + revised_prompts = list(prompt) + + batch_size = len(prompt) + total_batch_size = batch_size * num_images_per_prompt + + # Handle negative prompt + if negative_prompt is None: + negative_prompt = "" + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] * batch_size + if len(negative_prompt) != batch_size: + raise ValueError(f"negative_prompt must have same length as prompt ({batch_size})") + + # [Phase 2] Text encoding + text_hiddens = self.encode_prompt(prompt, device, num_images_per_prompt) + + # CFG with negative prompt + if self.do_classifier_free_guidance: + uncond_text_hiddens = self.encode_prompt( + negative_prompt, device, num_images_per_prompt + ) + + # Latent dimensions + latent_h = height // self.vae_scale_factor + latent_w = width // self.vae_scale_factor + latent_channels = 128 # After patchify + + # Initialize latents + if latents is None: + latents = torch.randn( + (total_batch_size, latent_channels, latent_h, latent_w), + device=device, + dtype=dtype, + generator=generator, + ) + + # Setup scheduler + sigmas = torch.linspace(1.0, 0.0, num_inference_steps + 1) + self.scheduler.set_timesteps(sigmas=sigmas[:-1], device=device) + + # Denoising loop + if self.do_classifier_free_guidance: + cfg_text_hiddens = list(uncond_text_hiddens) + list(text_hiddens) + else: + cfg_text_hiddens = text_hiddens + text_bth, text_lens = self._pad_text(text_hiddens=cfg_text_hiddens, device=device, dtype=dtype, text_in_dim=self.transformer.config.text_in_dim) + + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(self.scheduler.timesteps): + if self.do_classifier_free_guidance: + latent_model_input = torch.cat([latents, latents], dim=0) + t_batch = torch.full((total_batch_size * 2,), t.item(), device=device, dtype=dtype) + else: + latent_model_input = latents + t_batch = torch.full((total_batch_size,), t.item(), device=device, dtype=dtype) + + # Model prediction + pred = self.transformer( + hidden_states=latent_model_input, + timestep=t_batch, + text_bth=text_bth, + text_lens=text_lens, + return_dict=False, + )[0] + + # Apply CFG + if self.do_classifier_free_guidance: + pred_uncond, pred_cond = pred.chunk(2, dim=0) + pred = pred_uncond + guidance_scale * (pred_cond - pred_uncond) + + # Scheduler step + latents = self.scheduler.step(pred, t, latents).prev_sample + + # Callback + if callback_on_step_end is not None: + callback_kwargs = {k: locals()[k] for k in callback_on_step_end_tensor_inputs} + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + latents = callback_outputs.pop("latents", latents) + + progress_bar.update() + + if output_type == "latent": + return latents + + # Decode latents to images + # Unnormalize latents using VAE's BN stats + bn_mean = self.vae.bn.running_mean.view(1, -1, 1, 1).to(device) + bn_std = torch.sqrt(self.vae.bn.running_var.view(1, -1, 1, 1) + 1e-5).to(device) + latents = latents * bn_std + bn_mean + + # Unpatchify + latents = self._unpatchify_latents(latents) + + # Decode + images = self.vae.decode(latents, return_dict=False)[0] + + # Post-process + images = (images.clamp(-1, 1) + 1) / 2 + images = images.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + images = [Image.fromarray((img * 255).astype("uint8")) for img in images] + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (images,) + + return ErnieImagePipelineOutput(images=images, revised_prompts=revised_prompts) diff --git a/src/diffusers/pipelines/ernie_image/pipeline_output.py b/src/diffusers/pipelines/ernie_image/pipeline_output.py new file mode 100644 index 000000000000..8919db0c0aca --- /dev/null +++ b/src/diffusers/pipelines/ernie_image/pipeline_output.py @@ -0,0 +1,36 @@ +# Copyright 2025 Baidu ERNIE-Image Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import List, Optional + +import PIL.Image + +from ...utils import BaseOutput + + +@dataclass +class ErnieImagePipelineOutput(BaseOutput): + """ + Output class for ERNIE-Image pipelines. + + Args: + images (`List[PIL.Image.Image]`): + List of generated images. + revised_prompts (`List[str]`, *optional*): + List of PE-revised prompts. `None` when PE is disabled or unavailable. + """ + + images: List[PIL.Image.Image] + revised_prompts: Optional[List[str]] diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py index 0bb9ee7b314a..6f26d738f5ef 100644 --- a/src/diffusers/utils/dummy_pt_objects.py +++ b/src/diffusers/utils/dummy_pt_objects.py @@ -1110,6 +1110,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch"]) +class ErnieImageTransformer2DModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + class Flux2Transformer2DModel(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index eff798a59051..fecc0882e695 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -2597,6 +2597,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class ErnieImagePipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class OvisImagePipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/tests/models/transformers/test_models_transformer_ernie_image.py b/tests/models/transformers/test_models_transformer_ernie_image.py new file mode 100644 index 000000000000..7ef855609ed8 --- /dev/null +++ b/tests/models/transformers/test_models_transformer_ernie_image.py @@ -0,0 +1,199 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import os +import unittest + +import torch + +from diffusers import ErnieImageTransformer2DModel + +from ...testing_utils import IS_GITHUB_ACTIONS, torch_device +from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin + + +# Ernie-Image requires torch.use_deterministic_algorithms(False) due to complex64 RoPE operations +# Cannot use enable_full_determinism() which sets it to True +os.environ["CUDA_LAUNCH_BLOCKING"] = "1" +os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8" +torch.use_deterministic_algorithms(False) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = False +if hasattr(torch.backends, "cuda"): + torch.backends.cuda.matmul.allow_tf32 = False + + +@unittest.skipIf( + IS_GITHUB_ACTIONS, + reason="Skipping test-suite inside the CI because the model has `torch.empty()` inside of it during init and we don't have a clear way to override it in the modeling tests.", +) +class ErnieImageTransformerTests(ModelTesterMixin, unittest.TestCase): + model_class = ErnieImageTransformer2DModel + main_input_name = "hidden_states" + # We override the items here because the transformer under consideration is small. + model_split_percents = [0.9, 0.9, 0.9] + + def prepare_dummy_input(self, height=16, width=16): + batch_size = 1 + num_channels = 16 + embedding_dim = 16 + sequence_length = 16 + + hidden_states = torch.randn((batch_size, num_channels, height, width)).to(torch_device) + encoder_hidden_states = [ + torch.randn((sequence_length, embedding_dim)).to(torch_device) for _ in range(batch_size) + ] + timestep = torch.tensor([1.0]).to(torch_device) + + return {"hidden_states": hidden_states, "encoder_hidden_states": encoder_hidden_states, "timestep": timestep} + + @property + def dummy_input(self): + return self.prepare_dummy_input() + + @property + def input_shape(self): + return (16, 16, 16) + + @property + def output_shape(self): + return (16, 16, 16) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = { + "hidden_size": 16, + "num_attention_heads": 1, + "num_layers": 1, + "ffn_hidden_size": 16, + "in_channels": 16, + "out_channels": 16, + "patch_size": 1, + "text_in_dim": 16, + "rope_theta": 256, + "rope_axes_dim": (8, 4, 4), + "eps": 1e-6, + "qk_layernorm": True, + } + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + def setUp(self): + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.synchronize() + torch.manual_seed(0) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(0) + + def tearDown(self): + super().tearDown() + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.synchronize() + torch.manual_seed(0) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(0) + + def test_gradient_checkpointing_is_applied(self): + expected_set = {"ErnieImageTransformer2DModel"} + super().test_gradient_checkpointing_is_applied(expected_set=expected_set) + + @unittest.skip("Test is not supported for handling main inputs that are lists.") + def test_training(self): + super().test_training() + + @unittest.skip("Test is not supported for handling main inputs that are lists.") + def test_ema_training(self): + super().test_ema_training() + + @unittest.skip("Test is not supported for handling main inputs that are lists.") + def test_effective_gradient_checkpointing(self): + super().test_effective_gradient_checkpointing() + + @unittest.skip( + "Test needs to be revisited. But we need to ensure `x_pad_token` and `cap_pad_token` are cast to the same dtype as the destination tensor before they are assigned to the padding indices." + ) + def test_layerwise_casting_training(self): + super().test_layerwise_casting_training() + + @unittest.skip( + "TimestepEmbedding uses explicit dtype casting that conflicts with float8 layerwise casting hooks." + ) + def test_layerwise_casting_inference(self): + super().test_layerwise_casting_inference() + + @unittest.skip( + "TimestepEmbedding uses explicit dtype casting that conflicts with float8 layerwise casting hooks." + ) + def test_layerwise_casting_memory(self): + super().test_layerwise_casting_memory() + + @unittest.skip( + "TimestepEmbedding uses explicit dtype casting that conflicts with float8 layerwise casting hooks." + ) + def test_group_offloading_with_layerwise_casting(self): + super().test_group_offloading_with_layerwise_casting() + + @unittest.skip( + "TimestepEmbedding uses explicit dtype casting that conflicts with float8 layerwise casting hooks." + ) + def test_group_offloading_with_layerwise_casting_0(self): + pass + + @unittest.skip( + "TimestepEmbedding uses explicit dtype casting that conflicts with float8 layerwise casting hooks." + ) + def test_group_offloading_with_layerwise_casting_1(self): + pass + + @unittest.skip("Test is not supported for handling main inputs that are lists.") + def test_outputs_equivalence(self): + super().test_outputs_equivalence() + + @unittest.skip("Test will pass if we change to deterministic values instead of empty in the DiT.") + def test_group_offloading(self): + super().test_group_offloading() + + @unittest.skip("Test will pass if we change to deterministic values instead of empty in the DiT.") + def test_group_offloading_with_disk(self): + super().test_group_offloading_with_disk() + + +class ErnieImageTransformerCompileTests(TorchCompileTesterMixin, unittest.TestCase): + model_class = ErnieImageTransformer2DModel + different_shapes_for_compilation = [(4, 4), (4, 8), (8, 8)] + + def prepare_init_args_and_inputs_for_common(self): + return ErnieImageTransformerTests().prepare_init_args_and_inputs_for_common() + + def prepare_dummy_input(self, height, width): + return ErnieImageTransformerTests().prepare_dummy_input(height=height, width=width) + + @unittest.skip( + "The repeated block in this model is ZImageTransformerBlock, which is used for noise_refiner, context_refiner, and layers. As a consequence of this, the inputs recorded for the block would vary during compilation and full compilation with fullgraph=True would trigger recompilation at least thrice." + ) + def test_torch_compile_recompilation_and_graph_break(self): + super().test_torch_compile_recompilation_and_graph_break() + + @unittest.skip("Fullgraph AoT is broken") + def test_compile_works_with_aot(self): + super().test_compile_works_with_aot() + + @unittest.skip("Fullgraph is broken") + def test_compile_on_different_shapes(self): + super().test_compile_on_different_shapes()