From 5e0ff8be3ab3c8fbbc1f9ce24c336f75455de525 Mon Sep 17 00:00:00 2001
From: Lincoln Stein
Date: Mon, 11 May 2026 09:17:31 -0400
Subject: [PATCH 1/2] fix(qwen): use distinct img_shapes for reference latents in Qwen Image Edit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Qwen Image Edit was applying identical RoPE positions to the noisy and
reference latent segments (both packed at the noisy latent's dimensions), so
cross-attention couldn't disentangle them — reference content bled into the
generation as a faintly offset ghost across the whole frame, outside the
masked edit region.

The denoise now keeps reference latents at their own (H, W) and uses those
dims in the reference segment of img_shapes, matching diffusers'
QwenImageEditPipeline / QwenImageEditPlusPipeline. The reference
qwen_image_i2l is resized to ~1024² area preserving aspect ratio (matching
diffusers' VAE_IMAGE_SIZE) so the reference token sequence stays in the
distribution the model was trained on.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../app/invocations/qwen_image_denoise.py     | 70 +++++++++++-----
 .../qwen_image_image_to_latents.py            |  4 +-
 .../generation/buildQwenImageGraph.test.ts    | 84 ++++++++++++++++++-
 .../graph/generation/buildQwenImageGraph.ts   | 36 ++++++--
 .../invocations/test_qwen_image_denoise.py    | 59 +++++++++++++
 5 files changed, 226 insertions(+), 27 deletions(-)

diff --git a/invokeai/app/invocations/qwen_image_denoise.py b/invokeai/app/invocations/qwen_image_denoise.py
index 04e21a26c3f..6f9165ad35a 100644
--- a/invokeai/app/invocations/qwen_image_denoise.py
+++ b/invokeai/app/invocations/qwen_image_denoise.py
@@ -176,6 +176,41 @@ def _unpack_latents(latents: torch.Tensor, height: int, width: int) -> torch.Ten
         latents = latents.reshape(batch_size, channels // 4, h, w)
         return latents
 
+    @staticmethod
+    def _align_ref_latent_dims(rh: int, rw: int) -> tuple[int, int]:
+        """Trim reference latent spatial dims to even values for 2x2 packing.
+
+        Raises ValueError if the aligned dims would be < 2 (i.e., the reference
+        latent is too small to produce any valid tokens).
+        """
+        rh_aligned = rh - (rh % 2)
+        rw_aligned = rw - (rw % 2)
+        if rh_aligned < 2 or rw_aligned < 2:
+            raise ValueError(
+                f"Reference latent spatial dims must be >= 2 after even alignment; "
+                f"got ({rh_aligned}, {rw_aligned}) from input shape ({rh}, {rw}). "
+                "Ensure the reference image is at least 16 pixels in each dimension."
+            )
+        return rh_aligned, rw_aligned
+
+    @staticmethod
+    def _build_img_shapes(
+        latent_height: int,
+        latent_width: int,
+        ref_latent_height: int | None = None,
+        ref_latent_width: int | None = None,
+    ) -> list[list[tuple[int, int, int]]]:
+        """Build the img_shapes argument for the transformer.
+
+        The reference segment (if present) must use its own dims so QwenEmbedRope's
+        spatial frequencies position ref tokens distinctly from noisy tokens —
+        otherwise reference content bleeds into the generation as a ghost.
+        """
+        shapes: list[tuple[int, int, int]] = [(1, latent_height // 2, latent_width // 2)]
+        if ref_latent_height is not None and ref_latent_width is not None:
+            shapes.append((1, ref_latent_height // 2, ref_latent_width // 2))
+        return [shapes]
+
     def _run_diffusion(self, context: InvocationContext):
         inference_dtype = torch.bfloat16
         device = TorchDevice.choose_torch_device()
@@ -332,35 +367,32 @@ def _run_diffusion(self, context: InvocationContext):
 
         use_ref_latents = has_zero_cond_t
         ref_latents_packed = None
+        ref_latent_height = latent_height
+        ref_latent_width = latent_width
         if use_ref_latents:
             if ref_latents is not None:
-                _, ref_ch, rh, rw = ref_latents.shape
-                if rh != latent_height or rw != latent_width:
-                    ref_latents = torch.nn.functional.interpolate(
-                        ref_latents, size=(latent_height, latent_width), mode="bilinear"
-                    )
+                _, _, rh, rw = ref_latents.shape
+                ref_latent_height, ref_latent_width = self._align_ref_latent_dims(rh, rw)
+                if ref_latent_height != rh or ref_latent_width != rw:
+                    ref_latents = ref_latents[..., :ref_latent_height, :ref_latent_width]
             else:
                 # No reference image provided — use zeros so the model still gets the
                 # expected sequence layout.
                 ref_latents = torch.zeros(
                     1, out_channels, latent_height, latent_width, device=device, dtype=inference_dtype
                 )
-            ref_latents_packed = self._pack_latents(ref_latents, 1, out_channels, latent_height, latent_width)
-
-        # img_shapes tells the transformer the spatial layout of patches.
+            ref_latents_packed = self._pack_latents(ref_latents, 1, out_channels, ref_latent_height, ref_latent_width)
+
+        # img_shapes tells the transformer the spatial layout of patches. The reference
+        # segment must use the reference latent's own dimensions so RoPE positions it
+        # distinctly from the noisy latent — otherwise the two segments share spatial
+        # positional encoding and the model can't disentangle them, producing a
+        # ghost/doubling artifact across the whole frame. Matches diffusers'
+        # QwenImageEditPipeline / QwenImageEditPlusPipeline.
         if use_ref_latents:
-            img_shapes = [
-                [
-                    (1, latent_height // 2, latent_width // 2),
-                    (1, latent_height // 2, latent_width // 2),
-                ]
-            ]
+            img_shapes = self._build_img_shapes(latent_height, latent_width, ref_latent_height, ref_latent_width)
         else:
-            img_shapes = [
-                [
-                    (1, latent_height // 2, latent_width // 2),
-                ]
-            ]
+            img_shapes = self._build_img_shapes(latent_height, latent_width)
 
         # Prepare inpaint extension (operates in 4D space, so unpack/repack around it)
         inpaint_mask = self._prep_inpaint_mask(context, noise)  # noise has the right 4D shape
diff --git a/invokeai/app/invocations/qwen_image_image_to_latents.py b/invokeai/app/invocations/qwen_image_image_to_latents.py
index c5fe1b5d5c8..ef88e03082b 100644
--- a/invokeai/app/invocations/qwen_image_image_to_latents.py
+++ b/invokeai/app/invocations/qwen_image_image_to_latents.py
@@ -83,7 +83,9 @@ def invoke(self, context: InvocationContext) -> LatentsOutput:
         if self.width is not None and self.height is not None:
             image = image.convert("RGB").resize((self.width, self.height), resample=PILImage.LANCZOS)
 
-        image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"))
+        # multiple_of=16 ensures the post-VAE latents (vae_scale_factor=8) have even
+        # spatial dims, which the transformer's 2x2 patch packing requires.
+        image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"), multiple_of=16)
 
         if image_tensor.dim() == 3:
             image_tensor = einops.rearrange(image_tensor, "c h w -> 1 c h w")
diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.test.ts b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.test.ts
index 3a5c2cde344..bb172d4d9f5 100644
--- a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.test.ts
+++ b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.test.ts
@@ -140,7 +140,12 @@ vi.mock('services/api/types', async () => {
   };
 });
 
-import { buildQwenImageGraph, isQwenImageEditModel, shouldUseCfg } from './buildQwenImageGraph';
+import {
+  buildQwenImageGraph,
+  calculateQwenImageEditRefDimensions,
+  isQwenImageEditModel,
+  shouldUseCfg,
+} from './buildQwenImageGraph';
 
 describe('isQwenImageEditModel', () => {
   afterEach(() => {
@@ -415,3 +420,80 @@ describe('buildQwenImageGraph', () => {
     expect(hasReferenceLatentsEdge).toBe(false);
   });
 });
+
+describe('calculateQwenImageEditRefDimensions', () => {
+  // Cross-checked against diffusers' calculate_dimensions(1024*1024, ratio)
+  // (see pipeline_qwenimage_edit.py / pipeline_qwenimage_edit_plus.py).
+  it('produces ~1024² area for a square input', () => {
+    const result = calculateQwenImageEditRefDimensions(512, 512);
+    expect(result).toEqual({ width: 1024, height: 1024 });
+  });
+
+  it('preserves aspect ratio for landscape inputs', () => {
+    expect(calculateQwenImageEditRefDimensions(1600, 1200)).toEqual({ width: 1184, height: 896 });
+    expect(calculateQwenImageEditRefDimensions(1920, 1080)).toEqual({ width: 1376, height: 768 });
+  });
+
+  it('preserves aspect ratio for portrait inputs', () => {
+    expect(calculateQwenImageEditRefDimensions(1200, 1600)).toEqual({ width: 896, height: 1184 });
+    expect(calculateQwenImageEditRefDimensions(1080, 1920)).toEqual({ width: 768, height: 1376 });
+  });
+
+  it('snaps dimensions to multiples of 32', () => {
+    const { width, height } = calculateQwenImageEditRefDimensions(1600, 1200);
+    expect(width % 32).toBe(0);
+    expect(height % 32).toBe(0);
+  });
+
+  it('clamps to a minimum of 32 for extreme aspect ratios', () => {
+    // 50000x100 has an extreme 500:1 aspect ratio; the snapped height must never fall below the 32-pixel floor.
+    const { width, height } = calculateQwenImageEditRefDimensions(50000, 100);
+    expect(height).toBeGreaterThanOrEqual(32);
+    expect(width).toBeGreaterThanOrEqual(32);
+    expect(width % 32).toBe(0);
+    expect(height % 32).toBe(0);
+  });
+
+  it('passes computed dims as width/height to the reference i2l node', async () => {
+    const { selectMainModelConfig } = await import('features/controlLayers/store/paramsSlice');
+    const editModel = { ...model, variant: 'edit' };
+    vi.mocked(selectMainModelConfig).mockReturnValue(editModel as never);
+
+    const { fetchModelConfigWithTypeGuard } = await import('features/metadata/util/modelFetchingHelpers');
+    vi.mocked(fetchModelConfigWithTypeGuard).mockResolvedValue(editModel as never);
+
+    const { selectRefImagesSlice } = await import('features/controlLayers/store/refImagesSlice');
+    vi.mocked(selectRefImagesSlice).mockReturnValue({
+      entities: [
+        {
+          id: 'ref-image-1',
+          isEnabled: true,
+          config: {
+            type: 'qwen_image_reference_image',
+            image: { original: { image: { image_name: 'ref.png', width: 1600, height: 1200 } } },
+          },
+        },
+      ],
+    } as never);
+
+    const { g } = await buildQwenImageGraph({
+      generationMode: 'txt2img',
+      manager: null,
+      state: {
+        system: { shouldUseNSFWChecker: false, shouldUseWatermarker: false },
+      } as never,
+    });
+
+    const graph = g.getGraph();
+    const refI2lNodeId = Object.keys(graph.nodes).find((id) => id.startsWith('qwen_ref_i2l:'));
+    expect(refI2lNodeId).toBeDefined();
+    const refI2lNode = graph.nodes[refI2lNodeId!] as { width?: number; height?: number };
+    expect(refI2lNode.width).toBe(1184);
+    expect(refI2lNode.height).toBe(896);
+
+    // Restore mocks
+    vi.mocked(selectMainModelConfig).mockReturnValue(model as never);
+    vi.mocked(fetchModelConfigWithTypeGuard).mockResolvedValue(model as never);
+    vi.mocked(selectRefImagesSlice).mockReturnValue(refImagesSlice as never);
+  });
+});
diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts
index 0d92d325afd..8c74b537424 100644
--- a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts
+++ b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts
@@ -51,6 +51,27 @@ export const shouldUseCfg = (cfgScale: number | number[]): boolean => {
   return cfgScale.some((value) => value > 1);
 };
 
+/**
+ * Compute the target dimensions for the VAE-encoded reference image, matching
+ * diffusers' `calculate_dimensions(VAE_IMAGE_SIZE=1024*1024, aspect_ratio)` used
+ * by QwenImageEditPipeline / QwenImageEditPlusPipeline. The reference is resized
+ * so its area is ~1024² while preserving aspect ratio, with each dimension
+ * snapped to a multiple of 32 (the model was trained at this scale; feeding it a
+ * much larger reference produces a sequence length it was not trained on).
+ */
+const QWEN_IMAGE_EDIT_REF_TARGET_AREA = 1024 * 1024;
+export const calculateQwenImageEditRefDimensions = (
+  width: number,
+  height: number
+): { width: number; height: number } => {
+  const ratio = width / height;
+  let w = Math.sqrt(QWEN_IMAGE_EDIT_REF_TARGET_AREA * ratio);
+  let h = w / ratio;
+  w = Math.max(32, Math.round(w / 32) * 32);
+  h = Math.max(32, Math.round(h / 32) * 32);
+  return { width: w, height: h };
+};
+
 export const buildQwenImageGraph = async (arg: GraphBuilderArg): Promise => {
   const { generationMode, state, manager } = arg;
@@ -175,15 +196,18 @@ export const buildQwenImageGraph = async (arg: GraphBuilderArg): Promise
Date: Mon, 11 May 2026 10:54:08 -0400
Subject: [PATCH 2/2] fix(qwen): clamp reference latents to VAE_IMAGE_SIZE in denoise
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The frontend resizes the reference image to ~1024² area before VAE encoding,
but direct API callers and older graph JSON can wire qwen_image_i2l →
qwen_image_denoise without explicit width/height, sending a native-resolution
reference latent into the transformer. Without the clamp the model receives an
out-of-distribution sequence length (artifact returns, VRAM spikes).

Mirror diffusers' QwenImageEdit(Plus) VAE_IMAGE_SIZE behavior in latent space:
bilinear-downscale the reference latent to calculate_dimensions(1024²,
aspect_ratio) snapped to multiples of 32 in pixel space (= multiples of 4 in
latent space, so always packable). In-budget latents pass through untouched.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../app/invocations/qwen_image_denoise.py     | 39 +++++++++-
 .../invocations/test_qwen_image_denoise.py    | 77 +++++++++++++++++++
 2 files changed, 115 insertions(+), 1 deletion(-)

diff --git a/invokeai/app/invocations/qwen_image_denoise.py b/invokeai/app/invocations/qwen_image_denoise.py
index 6f9165ad35a..2dabc929bb1 100644
--- a/invokeai/app/invocations/qwen_image_denoise.py
+++ b/invokeai/app/invocations/qwen_image_denoise.py
@@ -1,5 +1,6 @@
+import math
 from contextlib import ExitStack
-from typing import Callable, Iterator, Optional, Tuple
+from typing import Callable, ClassVar, Iterator, Optional, Tuple
 
 import torch
 import torchvision.transforms as tv_transforms
@@ -211,6 +212,37 @@ def _build_img_shapes(
             shapes.append((1, ref_latent_height // 2, ref_latent_width // 2))
         return [shapes]
 
+    # diffusers' QwenImageEdit(Plus)Pipeline VAE_IMAGE_SIZE = 1024 * 1024 pixels;
+    # ref images are resized to this area (preserving aspect, snapped to multiples
+    # of 32) before VAE encoding. We mirror this clamp in latent space so direct
+    # backend callers — whose i2l may not pass explicit width/height — don't feed
+    # the transformer an out-of-distribution reference sequence length (which
+    # also causes a VRAM spike for large inputs).
+    _REF_TARGET_PIXEL_AREA: ClassVar[int] = 1024 * 1024
+    _VAE_SCALE_FACTOR: ClassVar[int] = 8
+
+    @classmethod
+    def _maybe_clamp_ref_latent_size(cls, ref_latents: torch.Tensor) -> torch.Tensor:
+        """Bilinear-downscale the reference latent if it exceeds diffusers'
+        VAE_IMAGE_SIZE budget.
+
+        Returns the latent unchanged if it's already within budget.
+        """
+        _, _, rh, rw = ref_latents.shape
+        target_cells = cls._REF_TARGET_PIXEL_AREA // (cls._VAE_SCALE_FACTOR**2)
+        if rh * rw <= target_cells:
+            return ref_latents
+        aspect = rw / rh
+        target_w_px = math.sqrt(cls._REF_TARGET_PIXEL_AREA * aspect)
+        target_h_px = target_w_px / aspect
+        target_w_px = max(32, round(target_w_px / 32) * 32)
+        target_h_px = max(32, round(target_h_px / 32) * 32)
+        target_rh = target_h_px // cls._VAE_SCALE_FACTOR
+        target_rw = target_w_px // cls._VAE_SCALE_FACTOR
+        return torch.nn.functional.interpolate(
+            ref_latents, size=(target_rh, target_rw), mode="bilinear", antialias=False
+        )
+
     def _run_diffusion(self, context: InvocationContext):
         inference_dtype = torch.bfloat16
         device = TorchDevice.choose_torch_device()
@@ -371,6 +403,11 @@ def _run_diffusion(self, context: InvocationContext):
         ref_latent_width = latent_width
         if use_ref_latents:
             if ref_latents is not None:
+                # Defense-in-depth: backend callers (direct API, older graph JSON)
+                # may wire qwen_image_i2l without explicit width/height, producing
+                # a native-resolution reference latent. Clamp here so the
+                # transformer always sees an in-distribution sequence length.
+                ref_latents = self._maybe_clamp_ref_latent_size(ref_latents)
                 _, _, rh, rw = ref_latents.shape
                 ref_latent_height, ref_latent_width = self._align_ref_latent_dims(rh, rw)
                 if ref_latent_height != rh or ref_latent_width != rw:
diff --git a/tests/app/invocations/test_qwen_image_denoise.py b/tests/app/invocations/test_qwen_image_denoise.py
index 2d746ca7e5e..50187ea1535 100644
--- a/tests/app/invocations/test_qwen_image_denoise.py
+++ b/tests/app/invocations/test_qwen_image_denoise.py
@@ -89,6 +89,83 @@ def test_raises_on_one_dim(self):
             QwenImageDenoiseInvocation._align_ref_latent_dims(64, 1)
 
 
+class TestMaybeClampRefLatentSize:
+    """Test the diffusers-style VAE_IMAGE_SIZE clamp applied to reference latents
+    before packing. This is defense-in-depth for backend callers (direct API,
+    older graph JSON) that wire qwen_image_i2l without explicit width/height —
+    without the clamp, the transformer receives an out-of-distribution sequence
+    length and VRAM usage spikes on large reference images."""
+
+    def test_in_budget_latent_unchanged(self):
+        """A 1024² ref image → 128x128 latent → exactly the budget. Pass through."""
+        import torch
+
+        ref = torch.randn(1, 16, 128, 128)
+        result = QwenImageDenoiseInvocation._maybe_clamp_ref_latent_size(ref)
+        assert result.shape == (1, 16, 128, 128)
+        assert result is ref  # identity, no copy
+
+    def test_small_latent_unchanged(self):
+        """A 512² ref → 64x64 latent (4x under budget). Pass through unchanged."""
+        import torch
+
+        ref = torch.randn(1, 16, 64, 64)
+        result = QwenImageDenoiseInvocation._maybe_clamp_ref_latent_size(ref)
+        assert result.shape == (1, 16, 64, 64)
+        assert result is ref
+
+    def test_native_resolution_landscape_clamped(self):
+        """A native 1600x1200 image → 200x150 latents. Should clamp to the same
+        dims diffusers produces (1184x896 pixels → 148x112 latents)."""
+        import torch
+
+        ref = torch.randn(1, 16, 150, 200)
+        result = QwenImageDenoiseInvocation._maybe_clamp_ref_latent_size(ref)
+        assert result.shape == (1, 16, 112, 148)
+
+    def test_native_resolution_portrait_clamped(self):
+        """1200x1600 → 150x200 latents → diffusers target 896x1184 → 112x148."""
+        import torch
+
+        ref = torch.randn(1, 16, 200, 150)
+        result = QwenImageDenoiseInvocation._maybe_clamp_ref_latent_size(ref)
+        assert result.shape == (1, 16, 148, 112)
+
+    def test_huge_latent_clamped(self):
+        """A 4096x4096 image → 512x512 latents (16x budget). Clamp to 128x128
+        latents (= 1024² pixels), well within model's trained distribution."""
+        import torch
+
+        ref = torch.randn(1, 16, 512, 512)
+        result = QwenImageDenoiseInvocation._maybe_clamp_ref_latent_size(ref)
+        assert result.shape == (1, 16, 128, 128)
+
+    def test_clamp_preserves_aspect_ratio_within_rounding(self):
+        """Aspect ratio of the clamped latent should match the input to within
+        the 32-pixel snapping granularity used by diffusers."""
+        import torch
+
+        # 1920x1080 (16:9, ~2M pixels)
+        ref = torch.randn(1, 16, 135, 240)
+        result = QwenImageDenoiseInvocation._maybe_clamp_ref_latent_size(ref)
+        # diffusers: calculate_dimensions(1024², 16/9) → (1376, 768) px → (172, 96) latent
+        assert result.shape == (1, 16, 96, 172)
+
+    def test_clamp_output_is_packable(self):
+        """The clamped latent must have even spatial dims (required by 2x2 packing)
+        before _align_ref_latent_dims is called. Because the clamp snaps to 32px
+        in pixel space and vae_scale_factor=8, every clamp output is a multiple
+        of 4 in latent space (and therefore even)."""
+        import torch
+
+        for h, w in [(150, 200), (200, 150), (135, 240), (512, 512)]:
+            ref = torch.randn(1, 16, h, w)
+            result = QwenImageDenoiseInvocation._maybe_clamp_ref_latent_size(ref)
+            _, _, rh, rw = result.shape
+            assert rh % 2 == 0, f"clamp produced odd height {rh} for input ({h},{w})"
+            assert rw % 2 == 0, f"clamp produced odd width {rw} for input ({h},{w})"
+
+
 class TestBuildImgShapes:
     """Test img_shapes construction. Regression test for the ghosting/doubling
     bug where ref and noisy segments shared identical spatial RoPE positions."""