From 5e0ff8be3ab3c8fbbc1f9ce24c336f75455de525 Mon Sep 17 00:00:00 2001
From: Lincoln Stein
Date: Mon, 11 May 2026 09:17:31 -0400
Subject: [PATCH 1/2] fix(qwen): use distinct img_shapes for reference latents in Qwen Image Edit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Qwen Image Edit was applying identical RoPE positions to the noisy and
reference latent segments (both packed at the noisy latent's dimensions), so
cross-attention couldn't disentangle them — reference content bled into the
generation as a faintly offset ghost across the whole frame, outside the
masked edit region.

The denoise now keeps reference latents at their own (H, W) and uses those
dims in the reference segment of img_shapes, matching diffusers'
QwenImageEditPipeline / QwenImageEditPlusPipeline. The reference
qwen_image_i2l is resized to ~1024² area preserving aspect ratio (matching
diffusers' VAE_IMAGE_SIZE) so the reference token sequence stays in the
distribution the model was trained on.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../app/invocations/qwen_image_denoise.py     | 70 +++++++++++-----
 .../qwen_image_image_to_latents.py            |  4 +-
 .../generation/buildQwenImageGraph.test.ts    | 84 ++++++++++++++++++-
 .../graph/generation/buildQwenImageGraph.ts   | 36 ++++++--
 .../invocations/test_qwen_image_denoise.py    | 59 +++++++++++++
 5 files changed, 226 insertions(+), 27 deletions(-)

diff --git a/invokeai/app/invocations/qwen_image_denoise.py b/invokeai/app/invocations/qwen_image_denoise.py
index 04e21a26c3f..6f9165ad35a 100644
--- a/invokeai/app/invocations/qwen_image_denoise.py
+++ b/invokeai/app/invocations/qwen_image_denoise.py
@@ -176,6 +176,41 @@ def _unpack_latents(latents: torch.Tensor, height: int, width: int) -> torch.Ten
         latents = latents.reshape(batch_size, channels // 4, h, w)
         return latents
 
+    @staticmethod
+    def _align_ref_latent_dims(rh: int, rw: int) -> tuple[int, int]:
+        """Trim reference latent spatial dims to even values for 2x2 packing.
+
+        Raises ValueError if the aligned dims would be < 2 (i.e., the reference
+        latent is too small to produce any valid tokens).
+        """
+        rh_aligned = rh - (rh % 2)
+        rw_aligned = rw - (rw % 2)
+        if rh_aligned < 2 or rw_aligned < 2:
+            raise ValueError(
+                f"Reference latent spatial dims must be >= 2 after even alignment; "
+                f"got ({rh_aligned}, {rw_aligned}) from input shape ({rh}, {rw}). "
+                "Ensure the reference image is at least 16 pixels in each dimension."
+            )
+        return rh_aligned, rw_aligned
+
+    @staticmethod
+    def _build_img_shapes(
+        latent_height: int,
+        latent_width: int,
+        ref_latent_height: int | None = None,
+        ref_latent_width: int | None = None,
+    ) -> list[list[tuple[int, int, int]]]:
+        """Build the img_shapes argument for the transformer.
+
+        The reference segment (if present) must use its own dims so QwenEmbedRope's
+        spatial frequencies position ref tokens distinctly from noisy tokens —
+        otherwise reference content bleeds into the generation as a ghost.
+        """
+        shapes: list[tuple[int, int, int]] = [(1, latent_height // 2, latent_width // 2)]
+        if ref_latent_height is not None and ref_latent_width is not None:
+            shapes.append((1, ref_latent_height // 2, ref_latent_width // 2))
+        return [shapes]
+
     def _run_diffusion(self, context: InvocationContext):
         inference_dtype = torch.bfloat16
         device = TorchDevice.choose_torch_device()
@@ -332,35 +367,32 @@ def _run_diffusion(self, context: InvocationContext):
 
         use_ref_latents = has_zero_cond_t
         ref_latents_packed = None
+        ref_latent_height = latent_height
+        ref_latent_width = latent_width
         if use_ref_latents:
             if ref_latents is not None:
-                _, ref_ch, rh, rw = ref_latents.shape
-                if rh != latent_height or rw != latent_width:
-                    ref_latents = torch.nn.functional.interpolate(
-                        ref_latents, size=(latent_height, latent_width), mode="bilinear"
-                    )
+                _, _, rh, rw = ref_latents.shape
+                ref_latent_height, ref_latent_width = self._align_ref_latent_dims(rh, rw)
+                if ref_latent_height != rh or ref_latent_width != rw:
+                    ref_latents = ref_latents[..., :ref_latent_height, :ref_latent_width]
             else:
                 # No reference image provided — use zeros so the model still gets the
                 # expected sequence layout.
                 ref_latents = torch.zeros(
                     1, out_channels, latent_height, latent_width, device=device, dtype=inference_dtype
                 )
-            ref_latents_packed = self._pack_latents(ref_latents, 1, out_channels, latent_height, latent_width)
-
-        # img_shapes tells the transformer the spatial layout of patches.
+            ref_latents_packed = self._pack_latents(ref_latents, 1, out_channels, ref_latent_height, ref_latent_width)
+
+        # img_shapes tells the transformer the spatial layout of patches. The reference
+        # segment must use the reference latent's own dimensions so RoPE positions it
+        # distinctly from the noisy latent — otherwise the two segments share spatial
+        # positional encoding and the model can't disentangle them, producing a
+        # ghost/doubling artifact across the whole frame. Matches diffusers'
+        # QwenImageEditPipeline / QwenImageEditPlusPipeline.
         if use_ref_latents:
-            img_shapes = [
-                [
-                    (1, latent_height // 2, latent_width // 2),
-                    (1, latent_height // 2, latent_width // 2),
-                ]
-            ]
+            img_shapes = self._build_img_shapes(latent_height, latent_width, ref_latent_height, ref_latent_width)
         else:
-            img_shapes = [
-                [
-                    (1, latent_height // 2, latent_width // 2),
-                ]
-            ]
+            img_shapes = self._build_img_shapes(latent_height, latent_width)
 
         # Prepare inpaint extension (operates in 4D space, so unpack/repack around it)
         inpaint_mask = self._prep_inpaint_mask(context, noise)  # noise has the right 4D shape
diff --git a/invokeai/app/invocations/qwen_image_image_to_latents.py b/invokeai/app/invocations/qwen_image_image_to_latents.py
index c5fe1b5d5c8..ef88e03082b 100644
--- a/invokeai/app/invocations/qwen_image_image_to_latents.py
+++ b/invokeai/app/invocations/qwen_image_image_to_latents.py
@@ -83,7 +83,9 @@ def invoke(self, context: InvocationContext) -> LatentsOutput:
         if self.width is not None and self.height is not None:
             image = image.convert("RGB").resize((self.width, self.height), resample=PILImage.LANCZOS)
 
-        image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"))
+        # multiple_of=16 ensures the post-VAE latents (vae_scale_factor=8) have even
+        # spatial dims, which the transformer's 2x2 patch packing requires.
+        image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"), multiple_of=16)
 
         if image_tensor.dim() == 3:
             image_tensor = einops.rearrange(image_tensor, "c h w -> 1 c h w")
diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.test.ts b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.test.ts
index 3a5c2cde344..bb172d4d9f5 100644
--- a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.test.ts
+++ b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.test.ts
@@ -140,7 +140,12 @@ vi.mock('services/api/types', async () => {
   };
 });
 
-import { buildQwenImageGraph, isQwenImageEditModel, shouldUseCfg } from './buildQwenImageGraph';
+import {
+  buildQwenImageGraph,
+  calculateQwenImageEditRefDimensions,
+  isQwenImageEditModel,
+  shouldUseCfg,
+} from './buildQwenImageGraph';
 
 describe('isQwenImageEditModel', () => {
   afterEach(() => {
@@ -415,3 +420,80 @@ describe('buildQwenImageGraph', () => {
     expect(hasReferenceLatentsEdge).toBe(false);
   });
 });
+
+describe('calculateQwenImageEditRefDimensions', () => {
+  // Cross-checked against diffusers' calculate_dimensions(1024*1024, ratio)
+  // (see pipeline_qwenimage_edit.py / pipeline_qwenimage_edit_plus.py).
+  it('produces ~1024² area for a square input', () => {
+    const result = calculateQwenImageEditRefDimensions(512, 512);
+    expect(result).toEqual({ width: 1024, height: 1024 });
+  });
+
+  it('preserves aspect ratio for landscape inputs', () => {
+    expect(calculateQwenImageEditRefDimensions(1600, 1200)).toEqual({ width: 1184, height: 896 });
+    expect(calculateQwenImageEditRefDimensions(1920, 1080)).toEqual({ width: 1376, height: 768 });
+  });
+
+  it('preserves aspect ratio for portrait inputs', () => {
+    expect(calculateQwenImageEditRefDimensions(1200, 1600)).toEqual({ width: 896, height: 1184 });
+    expect(calculateQwenImageEditRefDimensions(1080, 1920)).toEqual({ width: 768, height: 1376 });
+  });
+
+  it('snaps dimensions to multiples of 32', () => {
+    const { width, height } = calculateQwenImageEditRefDimensions(1600, 1200);
+    expect(width % 32).toBe(0);
+    expect(height % 32).toBe(0);
+  });
+
+  it('clamps to a minimum of 32 for extreme aspect ratios', () => {
+    // 50000x100 has an extreme 500:1 aspect ratio; the snapped height must never fall below the 32-pixel floor.
+    const { width, height } = calculateQwenImageEditRefDimensions(50000, 100);
+    expect(height).toBeGreaterThanOrEqual(32);
+    expect(width).toBeGreaterThanOrEqual(32);
+    expect(width % 32).toBe(0);
+    expect(height % 32).toBe(0);
+  });
+
+  it('passes computed dims as width/height to the reference i2l node', async () => {
+    const { selectMainModelConfig } = await import('features/controlLayers/store/paramsSlice');
+    const editModel = { ...model, variant: 'edit' };
+    vi.mocked(selectMainModelConfig).mockReturnValue(editModel as never);
+
+    const { fetchModelConfigWithTypeGuard } = await import('features/metadata/util/modelFetchingHelpers');
+    vi.mocked(fetchModelConfigWithTypeGuard).mockResolvedValue(editModel as never);
+
+    const { selectRefImagesSlice } = await import('features/controlLayers/store/refImagesSlice');
+    vi.mocked(selectRefImagesSlice).mockReturnValue({
+      entities: [
+        {
+          id: 'ref-image-1',
+          isEnabled: true,
+          config: {
+            type: 'qwen_image_reference_image',
+            image: { original: { image: { image_name: 'ref.png', width: 1600, height: 1200 } } },
+          },
+        },
+      ],
+    } as never);
+
+    const { g } = await buildQwenImageGraph({
+      generationMode: 'txt2img',
+      manager: null,
+      state: {
+        system: { shouldUseNSFWChecker: false, shouldUseWatermarker: false },
+      } as never,
+    });
+
+    const graph = g.getGraph();
+    const refI2lNodeId = Object.keys(graph.nodes).find((id) => id.startsWith('qwen_ref_i2l:'));
+    expect(refI2lNodeId).toBeDefined();
+    const refI2lNode = graph.nodes[refI2lNodeId!] as { width?: number; height?: number };
+    expect(refI2lNode.width).toBe(1184);
+    expect(refI2lNode.height).toBe(896);
+
+    // Restore mocks
+    vi.mocked(selectMainModelConfig).mockReturnValue(model as never);
+    vi.mocked(fetchModelConfigWithTypeGuard).mockResolvedValue(model as never);
+    vi.mocked(selectRefImagesSlice).mockReturnValue(refImagesSlice as never);
+  });
+});
diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts
index 0d92d325afd..8c74b537424 100644
--- a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts
+++ b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts
@@ -51,6 +51,27 @@ export const shouldUseCfg = (cfgScale: number | number[]): boolean => {
   return cfgScale.some((value) => value > 1);
 };
 
+/**
+ * Compute the target dimensions for the VAE-encoded reference image, matching
+ * diffusers' `calculate_dimensions(VAE_IMAGE_SIZE=1024*1024, aspect_ratio)` used
+ * by QwenImageEditPipeline / QwenImageEditPlusPipeline. The reference is resized
+ * so its area is ~1024² while preserving aspect ratio, with each dimension
+ * snapped to a multiple of 32 (the model was trained at this scale; feeding it a
+ * much larger reference produces a sequence length it was not trained on).
+ */
+const QWEN_IMAGE_EDIT_REF_TARGET_AREA = 1024 * 1024;
+export const calculateQwenImageEditRefDimensions = (
+  width: number,
+  height: number
+): { width: number; height: number } => {
+  const ratio = width / height;
+  let w = Math.sqrt(QWEN_IMAGE_EDIT_REF_TARGET_AREA * ratio);
+  let h = w / ratio;
+  w = Math.max(32, Math.round(w / 32) * 32);
+  h = Math.max(32, Math.round(h / 32) * 32);
+  return { width: w, height: h };
+};
+
 export const buildQwenImageGraph = async (arg: GraphBuilderArg): Promise => {
   const { generationMode, state, manager } = arg;
@@ -175,15 +196,18 @@ export const buildQwenImageGraph = async (arg: GraphBuilderArg): Promise
Date: Mon, 11 May 2026 10:54:08 -0400
Subject: [PATCH 2/2] fix(qwen): clamp reference latents to VAE_IMAGE_SIZE in denoise
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The frontend resizes the reference image to ~1024² area before VAE encoding,
but direct API callers and older graph JSON can wire qwen_image_i2l →
qwen_image_denoise without explicit width/height, sending a native-resolution
reference latent into the transformer. Without the clamp the model receives an
out-of-distribution sequence length (artifact returns, VRAM spikes).

Mirror diffusers' QwenImageEdit(Plus) VAE_IMAGE_SIZE behavior in latent space:
bilinear-downscale the reference latent to calculate_dimensions(1024²,
aspect_ratio) snapped to multiples of 32 in pixel space (= multiples of 4 in
latent space, so always packable). In-budget latents pass through untouched.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../app/invocations/qwen_image_denoise.py     | 39 +++++++++-
 .../invocations/test_qwen_image_denoise.py    | 77 +++++++++++++++++++
 2 files changed, 115 insertions(+), 1 deletion(-)

diff --git a/invokeai/app/invocations/qwen_image_denoise.py b/invokeai/app/invocations/qwen_image_denoise.py
index 6f9165ad35a..2dabc929bb1 100644
--- a/invokeai/app/invocations/qwen_image_denoise.py
+++ b/invokeai/app/invocations/qwen_image_denoise.py
@@ -1,5 +1,6 @@
+import math
 from contextlib import ExitStack
-from typing import Callable, Iterator, Optional, Tuple
+from typing import Callable, ClassVar, Iterator, Optional, Tuple
 
 import torch
 import torchvision.transforms as tv_transforms
@@ -211,6 +212,37 @@ def _build_img_shapes(
             shapes.append((1, ref_latent_height // 2, ref_latent_width // 2))
         return [shapes]
 
+    # diffusers' QwenImageEdit(Plus)Pipeline VAE_IMAGE_SIZE = 1024 * 1024 pixels;
+    # ref images are resized to this area (preserving aspect, snapped to multiples
+    # of 32) before VAE encoding. We mirror this clamp in latent space so direct
+    # backend callers — whose i2l may not pass explicit width/height — don't feed
+    # the transformer an out-of-distribution reference sequence length (which
+    # also causes a VRAM spike for large inputs).
+    _REF_TARGET_PIXEL_AREA: ClassVar[int] = 1024 * 1024
+    _VAE_SCALE_FACTOR: ClassVar[int] = 8
+
+    @classmethod
+    def _maybe_clamp_ref_latent_size(cls, ref_latents: torch.Tensor) -> torch.Tensor:
+        """Bilinear-downscale the reference latent if it exceeds diffusers'
+        VAE_IMAGE_SIZE budget.
+
+        Returns the latent unchanged if it's already within budget.
+        """
+        _, _, rh, rw = ref_latents.shape
+        target_cells = cls._REF_TARGET_PIXEL_AREA // (cls._VAE_SCALE_FACTOR**2)
+        if rh * rw <= target_cells:
+            return ref_latents
+        aspect = rw / rh
+        target_w_px = math.sqrt(cls._REF_TARGET_PIXEL_AREA * aspect)
+        target_h_px = target_w_px / aspect
+        target_w_px = max(32, round(target_w_px / 32) * 32)
+        target_h_px = max(32, round(target_h_px / 32) * 32)
+        target_rh = target_h_px // cls._VAE_SCALE_FACTOR
+        target_rw = target_w_px // cls._VAE_SCALE_FACTOR
+        return torch.nn.functional.interpolate(
+            ref_latents, size=(target_rh, target_rw), mode="bilinear", antialias=False
+        )
+
     def _run_diffusion(self, context: InvocationContext):
         inference_dtype = torch.bfloat16
         device = TorchDevice.choose_torch_device()
@@ -371,6 +403,11 @@ def _run_diffusion(self, context: InvocationContext):
         ref_latent_width = latent_width
         if use_ref_latents:
             if ref_latents is not None:
+                # Defense-in-depth: backend callers (direct API, older graph JSON)
+                # may wire qwen_image_i2l without explicit width/height, producing
+                # a native-resolution reference latent. Clamp here so the
+                # transformer always sees an in-distribution sequence length.
+                ref_latents = self._maybe_clamp_ref_latent_size(ref_latents)
                 _, _, rh, rw = ref_latents.shape
                 ref_latent_height, ref_latent_width = self._align_ref_latent_dims(rh, rw)
                 if ref_latent_height != rh or ref_latent_width != rw:
diff --git a/tests/app/invocations/test_qwen_image_denoise.py b/tests/app/invocations/test_qwen_image_denoise.py
index 2d746ca7e5e..50187ea1535 100644
--- a/tests/app/invocations/test_qwen_image_denoise.py
+++ b/tests/app/invocations/test_qwen_image_denoise.py
@@ -89,6 +89,83 @@ def test_raises_on_one_dim(self):
             QwenImageDenoiseInvocation._align_ref_latent_dims(64, 1)
 
 
+class TestMaybeClampRefLatentSize:
+    """Test the diffusers-style VAE_IMAGE_SIZE clamp applied to reference latents
+    before packing. This is defense-in-depth for backend callers (direct API,
+    older graph JSON) that wire qwen_image_i2l without explicit width/height —
+    without the clamp, the transformer receives an out-of-distribution sequence
+    length and VRAM usage spikes on large reference images."""
+
+    def test_in_budget_latent_unchanged(self):
+        """A 1024² ref image → 128x128 latent → exactly the budget. Pass through."""
+        import torch
+
+        ref = torch.randn(1, 16, 128, 128)
+        result = QwenImageDenoiseInvocation._maybe_clamp_ref_latent_size(ref)
+        assert result.shape == (1, 16, 128, 128)
+        assert result is ref  # identity, no copy
+
+    def test_small_latent_unchanged(self):
+        """A 512² ref → 64x64 latent (4x under budget). Pass through unchanged."""
+        import torch
+
+        ref = torch.randn(1, 16, 64, 64)
+        result = QwenImageDenoiseInvocation._maybe_clamp_ref_latent_size(ref)
+        assert result.shape == (1, 16, 64, 64)
+        assert result is ref
+
+    def test_native_resolution_landscape_clamped(self):
+        """A native 1600x1200 image → 200x150 latents. Should clamp to the same
+        dims diffusers produces (1184x896 pixels → 148x112 latents)."""
+        import torch
+
+        ref = torch.randn(1, 16, 150, 200)
+        result = QwenImageDenoiseInvocation._maybe_clamp_ref_latent_size(ref)
+        assert result.shape == (1, 16, 112, 148)
+
+    def test_native_resolution_portrait_clamped(self):
+        """1200x1600 → 150x200 latents → diffusers target 896x1184 → 112x148."""
+        import torch
+
+        ref = torch.randn(1, 16, 200, 150)
+        result = QwenImageDenoiseInvocation._maybe_clamp_ref_latent_size(ref)
+        assert result.shape == (1, 16, 148, 112)
+
+    def test_huge_latent_clamped(self):
+        """A 4096x4096 image → 512x512 latents (16x budget). Clamp to 128x128
+        latents (= 1024² pixels), well within model's trained distribution."""
+        import torch
+
+        ref = torch.randn(1, 16, 512, 512)
+        result = QwenImageDenoiseInvocation._maybe_clamp_ref_latent_size(ref)
+        assert result.shape == (1, 16, 128, 128)
+
+    def test_clamp_preserves_aspect_ratio_within_rounding(self):
+        """Aspect ratio of the clamped latent should match the input to within
+        the 32-pixel snapping granularity used by diffusers."""
+        import torch
+
+        # 1920x1080 (16:9, ~2M pixels)
+        ref = torch.randn(1, 16, 135, 240)
+        result = QwenImageDenoiseInvocation._maybe_clamp_ref_latent_size(ref)
+        # diffusers: calculate_dimensions(1024², 16/9) → (1376, 768) px → (172, 96) latent
+        assert result.shape == (1, 16, 96, 172)
+
+    def test_clamp_output_is_packable(self):
+        """The clamped latent must have even spatial dims (required by 2x2 packing)
+        before _align_ref_latent_dims is called. Because the clamp snaps to 32px
+        in pixel space and vae_scale_factor=8, every clamp output is a multiple
+        of 4 in latent space (and therefore even)."""
+        import torch
+
+        for h, w in [(150, 200), (200, 150), (135, 240), (512, 512)]:
+            ref = torch.randn(1, 16, h, w)
+            result = QwenImageDenoiseInvocation._maybe_clamp_ref_latent_size(ref)
+            _, _, rh, rw = result.shape
+            assert rh % 2 == 0, f"clamp produced odd height {rh} for input ({h},{w})"
+            assert rw % 2 == 0, f"clamp produced odd width {rw} for input ({h},{w})"
+
+
 class TestBuildImgShapes:
     """Test img_shapes construction. Regression test for the ghosting/doubling
     bug where ref and noisy segments shared identical spatial RoPE positions."""