diff --git a/invokeai/app/invocations/qwen_image_denoise.py b/invokeai/app/invocations/qwen_image_denoise.py
index 04e21a26c3f..2dabc929bb1 100644
--- a/invokeai/app/invocations/qwen_image_denoise.py
+++ b/invokeai/app/invocations/qwen_image_denoise.py
@@ -1,5 +1,6 @@
+import math
 from contextlib import ExitStack
-from typing import Callable, Iterator, Optional, Tuple
+from typing import Callable, ClassVar, Iterator, Optional, Tuple
 
 import torch
 import torchvision.transforms as tv_transforms
@@ -176,6 +177,72 @@ def _unpack_latents(latents: torch.Tensor, height: int, width: int) -> torch.Tensor:
         latents = latents.reshape(batch_size, channels // 4, h, w)
         return latents
 
+    @staticmethod
+    def _align_ref_latent_dims(rh: int, rw: int) -> tuple[int, int]:
+        """Trim reference latent spatial dims to even values for 2x2 packing.
+
+        Raises ValueError if the aligned dims would be < 2 (i.e., the reference
+        latent is too small to produce any valid tokens).
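+
+        Illustrative examples: (rh=33, rw=47) trims to (32, 46); (rh=1, rw=8)
+        raises, since a single latent row can never form a 2x2 patch.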
+        """
+        rh_aligned = rh - (rh % 2)
+        rw_aligned = rw - (rw % 2)
+        if rh_aligned < 2 or rw_aligned < 2:
+            raise ValueError(
+                f"Reference latent spatial dims must be >= 2 after even alignment; "
+                f"got ({rh_aligned}, {rw_aligned}) from input shape ({rh}, {rw}). "
+                "Ensure the reference image is at least 16 pixels in each dimension."
+            )
+        return rh_aligned, rw_aligned
+
+    @staticmethod
+    def _build_img_shapes(
+        latent_height: int,
+        latent_width: int,
+        ref_latent_height: int | None = None,
+        ref_latent_width: int | None = None,
+    ) -> list[list[tuple[int, int, int]]]:
+        """Build the img_shapes argument for the transformer.
+
+        The reference segment (if present) must use its own dims so QwenEmbedRope's
+        spatial frequencies position ref tokens distinctly from noisy tokens —
+        otherwise reference content bleeds into the generation as a ghost.
+        """
+        shapes: list[tuple[int, int, int]] = [(1, latent_height // 2, latent_width // 2)]
+        if ref_latent_height is not None and ref_latent_width is not None:
+            shapes.append((1, ref_latent_height // 2, ref_latent_width // 2))
+        return [shapes]
+
+    # diffusers' QwenImageEdit(Plus)Pipeline VAE_IMAGE_SIZE = 1024 * 1024 pixels;
+    # ref images are resized to this area (preserving aspect, snapped to multiples
+    # of 32) before VAE encoding. We mirror this clamp in latent space so direct
+    # backend callers — whose i2l may not pass explicit width/height — don't feed
+    # the transformer an out-of-distribution reference sequence length (which
+    # also causes a VRAM spike for large inputs).
+    _REF_TARGET_PIXEL_AREA: ClassVar[int] = 1024 * 1024
+    _VAE_SCALE_FACTOR: ClassVar[int] = 8
+
+    @classmethod
+    def _maybe_clamp_ref_latent_size(cls, ref_latents: torch.Tensor) -> torch.Tensor:
+        """Bilinear-downscale the reference latent if it exceeds diffusers'
+        VAE_IMAGE_SIZE budget.
+
+        Returns the latent unchanged if it's already within budget.
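+
+        Illustrative example (assuming 16-channel latents): a (1, 16, 256, 192)
+        latent, i.e. a 1536x2048 px portrait reference, has 256 * 192 = 49152
+        cells, over the 1024*1024 / 8**2 = 16384 budget, so it is downscaled to
+        (1, 16, 148, 112), the 896x1184 px equivalent.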
+        """
+        _, _, rh, rw = ref_latents.shape
+        target_cells = cls._REF_TARGET_PIXEL_AREA // (cls._VAE_SCALE_FACTOR**2)
+        if rh * rw <= target_cells:
+            return ref_latents
+        aspect = rw / rh
+        target_w_px = math.sqrt(cls._REF_TARGET_PIXEL_AREA * aspect)
+        target_h_px = target_w_px / aspect
+        target_w_px = max(32, round(target_w_px / 32) * 32)
+        target_h_px = max(32, round(target_h_px / 32) * 32)
+        target_rh = target_h_px // cls._VAE_SCALE_FACTOR
+        target_rw = target_w_px // cls._VAE_SCALE_FACTOR
+        return torch.nn.functional.interpolate(
+            ref_latents, size=(target_rh, target_rw), mode="bilinear", antialias=False
+        )
+
     def _run_diffusion(self, context: InvocationContext):
         inference_dtype = torch.bfloat16
         device = TorchDevice.choose_torch_device()
@@ -332,35 +399,37 @@ def _run_diffusion(self, context: InvocationContext):
 
         use_ref_latents = has_zero_cond_t
         ref_latents_packed = None
+        ref_latent_height = latent_height
+        ref_latent_width = latent_width
         if use_ref_latents:
             if ref_latents is not None:
-                _, ref_ch, rh, rw = ref_latents.shape
-                if rh != latent_height or rw != latent_width:
-                    ref_latents = torch.nn.functional.interpolate(
-                        ref_latents, size=(latent_height, latent_width), mode="bilinear"
-                    )
+                # Defense-in-depth: backend callers (direct API, older graph JSON)
+                # may wire qwen_image_i2l without explicit width/height, producing
+                # a native-resolution reference latent. Clamp here so the
+                # transformer always sees an in-distribution sequence length.
+                ref_latents = self._maybe_clamp_ref_latent_size(ref_latents)
+                _, _, rh, rw = ref_latents.shape
+                ref_latent_height, ref_latent_width = self._align_ref_latent_dims(rh, rw)
+                if ref_latent_height != rh or ref_latent_width != rw:
+                    ref_latents = ref_latents[..., :ref_latent_height, :ref_latent_width]
             else:
                 # No reference image provided — use zeros so the model still gets the
                 # expected sequence layout.
                 ref_latents = torch.zeros(
                     1, out_channels, latent_height, latent_width, device=device, dtype=inference_dtype
                 )
-            ref_latents_packed = self._pack_latents(ref_latents, 1, out_channels, latent_height, latent_width)
-
-        # img_shapes tells the transformer the spatial layout of patches.
+            ref_latents_packed = self._pack_latents(ref_latents, 1, out_channels, ref_latent_height, ref_latent_width)
+
+        # img_shapes tells the transformer the spatial layout of patches. The reference
+        # segment must use the reference latent's own dimensions so RoPE positions it
+        # distinctly from the noisy latent — otherwise the two segments share spatial
+        # positional encoding and the model can't disentangle them, producing a
+        # ghost/doubling artifact across the whole frame. Matches diffusers'
+        # QwenImageEditPipeline / QwenImageEditPlusPipeline.
         if use_ref_latents:
-            img_shapes = [
-                [
-                    (1, latent_height // 2, latent_width // 2),
-                    (1, latent_height // 2, latent_width // 2),
-                ]
-            ]
+            img_shapes = self._build_img_shapes(latent_height, latent_width, ref_latent_height, ref_latent_width)
         else:
-            img_shapes = [
-                [
-                    (1, latent_height // 2, latent_width // 2),
-                ]
-            ]
+            img_shapes = self._build_img_shapes(latent_height, latent_width)
 
         # Prepare inpaint extension (operates in 4D space, so unpack/repack around it)
         inpaint_mask = self._prep_inpaint_mask(context, noise)  # noise has the right 4D shape
diff --git a/invokeai/app/invocations/qwen_image_image_to_latents.py b/invokeai/app/invocations/qwen_image_image_to_latents.py
index c5fe1b5d5c8..ef88e03082b 100644
--- a/invokeai/app/invocations/qwen_image_image_to_latents.py
+++ b/invokeai/app/invocations/qwen_image_image_to_latents.py
@@ -83,7 +83,9 @@ def invoke(self, context: InvocationContext) -> LatentsOutput:
         if self.width is not None and self.height is not None:
             image = image.convert("RGB").resize((self.width, self.height), resample=PILImage.LANCZOS)
 
-        image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"))
+        # multiple_of=16 ensures the post-VAE latents (vae_scale_factor=8) have even
+        # spatial dims, which the transformer's 2x2 patch packing requires.
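+        # Illustrative (assuming the helper trims down to the multiple): a
+        # 1023x767 px input trims to 1008x752 and encodes to even 126x94
+        # latents; the former multiple_of=8 default would give odd 127x95.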
+        image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"), multiple_of=16)
         if image_tensor.dim() == 3:
             image_tensor = einops.rearrange(image_tensor, "c h w -> 1 c h w")
diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.test.ts b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.test.ts
index 3a5c2cde344..bb172d4d9f5 100644
--- a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.test.ts
+++ b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.test.ts
@@ -140,7 +140,12 @@ vi.mock('services/api/types', async () => {
   };
 });
 
-import { buildQwenImageGraph, isQwenImageEditModel, shouldUseCfg } from './buildQwenImageGraph';
+import {
+  buildQwenImageGraph,
+  calculateQwenImageEditRefDimensions,
+  isQwenImageEditModel,
+  shouldUseCfg,
+} from './buildQwenImageGraph';
 
 describe('isQwenImageEditModel', () => {
   afterEach(() => {
@@ -415,3 +420,80 @@ describe('buildQwenImageGraph', () => {
     expect(hasReferenceLatentsEdge).toBe(false);
   });
 });
+
+describe('calculateQwenImageEditRefDimensions', () => {
+  // Cross-checked against diffusers' calculate_dimensions(1024*1024, ratio)
+  // (see pipeline_qwenimage_edit.py / pipeline_qwenimage_edit_plus.py).
+  it('produces ~1024² area for a square input', () => {
+    const result = calculateQwenImageEditRefDimensions(512, 512);
+    expect(result).toEqual({ width: 1024, height: 1024 });
+  });
+
+  it('preserves aspect ratio for landscape inputs', () => {
+    expect(calculateQwenImageEditRefDimensions(1600, 1200)).toEqual({ width: 1184, height: 896 });
+    expect(calculateQwenImageEditRefDimensions(1920, 1080)).toEqual({ width: 1376, height: 768 });
+  });
+
+  it('preserves aspect ratio for portrait inputs', () => {
+    expect(calculateQwenImageEditRefDimensions(1200, 1600)).toEqual({ width: 896, height: 1184 });
+    expect(calculateQwenImageEditRefDimensions(1080, 1920)).toEqual({ width: 768, height: 1376 });
+  });
+
+  it('snaps dimensions to multiples of 32', () => {
+    const { width, height } = calculateQwenImageEditRefDimensions(1600, 1200);
+    expect(width % 32).toBe(0);
+    expect(height % 32).toBe(0);
+  });
+
+  it('clamps to a minimum of 32 for extreme aspect ratios', () => {
+    // 50000x100 has aspect ratio 500:1, so the raw height (~46 px) snaps to the
+    // 32 px floor; the Math.max clamp guarantees a nonzero multiple at even more
+    // extreme ratios.
+    const { width, height } = calculateQwenImageEditRefDimensions(50000, 100);
+    expect(height).toBeGreaterThanOrEqual(32);
+    expect(width).toBeGreaterThanOrEqual(32);
+    expect(width % 32).toBe(0);
+    expect(height % 32).toBe(0);
+  });
+
+  it('passes computed dims as width/height to the reference i2l node', async () => {
+    const { selectMainModelConfig } = await import('features/controlLayers/store/paramsSlice');
+    const editModel = { ...model, variant: 'edit' };
+    vi.mocked(selectMainModelConfig).mockReturnValue(editModel as never);
+
+    const { fetchModelConfigWithTypeGuard } = await import('features/metadata/util/modelFetchingHelpers');
+    vi.mocked(fetchModelConfigWithTypeGuard).mockResolvedValue(editModel as never);
+
+    const { selectRefImagesSlice } = await import('features/controlLayers/store/refImagesSlice');
+    vi.mocked(selectRefImagesSlice).mockReturnValue({
+      entities: [
+        {
+          id: 'ref-image-1',
+          isEnabled: true,
+          config: {
+            type: 'qwen_image_reference_image',
+            image: { original: { image: { image_name: 'ref.png', width: 1600, height: 1200 } } },
+          },
+        },
+      ],
+    } as never);
+
+    const { g } = await buildQwenImageGraph({
+      generationMode: 'txt2img',
+      manager: null,
+      state: {
+        system: { shouldUseNSFWChecker: false, shouldUseWatermarker: false },
+      } as never,
+    });
+
+    const graph = g.getGraph();
+    const refI2lNodeId = Object.keys(graph.nodes).find((id) => id.startsWith('qwen_ref_i2l:'));
+    expect(refI2lNodeId).toBeDefined();
+    const refI2lNode = graph.nodes[refI2lNodeId!] as { width?: number; height?: number };
+    expect(refI2lNode.width).toBe(1184);
+    expect(refI2lNode.height).toBe(896);
+
+    // Restore mocks
+    vi.mocked(selectMainModelConfig).mockReturnValue(model as never);
+    vi.mocked(fetchModelConfigWithTypeGuard).mockResolvedValue(model as never);
+    vi.mocked(selectRefImagesSlice).mockReturnValue(refImagesSlice as never);
+  });
+});
diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts
index 0d92d325afd..8c74b537424 100644
--- a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts
+++ b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts
@@ -51,6 +51,27 @@ export const shouldUseCfg = (cfgScale: number | number[]): boolean => {
   return cfgScale.some((value) => value > 1);
 };
 
+/**
+ * Compute the target dimensions for the VAE-encoded reference image, matching
+ * diffusers' `calculate_dimensions(VAE_IMAGE_SIZE=1024*1024, aspect_ratio)` used
+ * by QwenImageEditPipeline / QwenImageEditPlusPipeline. The reference is resized
+ * so its area is ~1024² while preserving aspect ratio, with each dimension
+ * snapped to a multiple of 32 (the model was trained at this scale; feeding it a
+ * much larger reference produces a sequence length it was not trained on).
+ */
+const QWEN_IMAGE_EDIT_REF_TARGET_AREA = 1024 * 1024;
+export const calculateQwenImageEditRefDimensions = (
+  width: number,
+  height: number
+): { width: number; height: number } => {
+  const ratio = width / height;
+  let w = Math.sqrt(QWEN_IMAGE_EDIT_REF_TARGET_AREA * ratio);
+  let h = w / ratio;
+  w = Math.max(32, Math.round(w / 32) * 32);
+  h = Math.max(32, Math.round(h / 32) * 32);
+  return { width: w, height: h };
+};
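+// Worked example (matches the unit tests): (1600, 1200) has ratio 4/3;
+// sqrt(1048576 * 4/3) ≈ 1182.4 snaps to 1184, and 1182.4 / (4/3) ≈ 886.8
+// snaps to 896, giving { width: 1184, height: 896 }.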
+
 export const buildQwenImageGraph = async (arg: GraphBuilderArg): Promise<GraphBuilderReturn> => {
   const { generationMode, state, manager } = arg;
 
@@ -175,15 +196,18 @@ export const buildQwenImageGraph = async (arg: GraphBuilderArg): Promise<GraphBuilderReturn>