Skip to content

Commit 1e12dba

Browse files
committed
drm/xe/migrate: support MEM_COPY instruction
Make this the default on xe2+ when doing a copy. This has a few advantages over the existing copy instruction: 1) It has a special PAGE_COPY mode that claims to be optimised for page-in/page-out, which is the vast majority of current users. 2) It also has a simple BYTE_COPY mode that supports byte granularity copying without any restrictions. With 2) we can now easily skip the bounce buffer flow when copying buffers with strange sizing/alignment, like for memory_access. But that is left for the next patch. v2 (Matt Brost): - Use device info to check whether device should use the MEM_COPY path. This should fit better with making this a configfs tunable. - And with that also keep old path still functional on xe2 for possible experimentation. - Add a define for PAGE_COPY page-size. v3 (Matt Brost): - Fall back to an actual linear copy for pitch=1. - Also update NVL. BSpec: 57561 Signed-off-by: Matthew Auld <matthew.auld@intel.com> Cc: Matthew Brost <matthew.brost@intel.com> Reviewed-by: Matthew Brost <matthew.brost@intel.com> Link: https://lore.kernel.org/r/20251022163836.191405-7-matthew.auld@intel.com
1 parent 0171dcc commit 1e12dba

5 files changed

Lines changed: 72 additions & 3 deletions

File tree

drivers/gpu/drm/xe/instructions/xe_gpu_commands.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,12 @@
3131
#define XY_FAST_COPY_BLT_D1_DST_TILE4 REG_BIT(30)
3232
#define XE2_XY_FAST_COPY_BLT_MOCS_INDEX_MASK GENMASK(23, 20)
3333

34+
/*
 * MEM_COPY instruction encoding, consumed by emit_mem_copy() in xe_migrate.c.
 *
 * MEM_COPY_CMD is the base value of the first command dword; the
 * PAGE_COPY_MODE and MATRIX_COPY bits are OR'ed into that same dword.
 * The two MOCS index masks are both packed into the final command dword.
 *
 * NOTE(review): the low 0x8 in MEM_COPY_CMD is presumably the dword-length
 * field for the 10-dword command (length - 2) - confirm against BSpec 57561.
 */
#define MEM_COPY_CMD (2 << 29 | 0x5a << 22 | 0x8)
35+
#define MEM_COPY_PAGE_COPY_MODE REG_BIT(19) /* set: page-copy mode; clear: byte-copy mode */
36+
#define MEM_COPY_MATRIX_COPY REG_BIT(17) /* set: matrix copy; clear: linear copy */
37+
#define MEM_COPY_SRC_MOCS_INDEX_MASK GENMASK(31, 28)
38+
#define MEM_COPY_DST_MOCS_INDEX_MASK GENMASK(6, 3)
39+
3440
#define PVC_MEM_SET_CMD (2 << 29 | 0x5b << 22)
3541
#define PVC_MEM_SET_CMD_LEN_DW 7
3642
#define PVC_MEM_SET_MATRIX REG_BIT(17)

drivers/gpu/drm/xe/xe_device_types.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,8 @@ struct xe_device {
300300
* pcode mailbox commands.
301301
*/
302302
u8 has_mbx_power_limits:1;
303+
/** @info.has_mem_copy_instr: Device supports MEM_COPY instruction */
304+
u8 has_mem_copy_instr:1;
303305
/** @info.has_pxp: Device has PXP support */
304306
u8 has_pxp:1;
305307
/** @info.has_range_tlb_inval: Has range based TLB invalidations */

drivers/gpu/drm/xe/xe_migrate.c

Lines changed: 58 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -699,9 +699,9 @@ static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
699699
}
700700

701701
#define EMIT_COPY_DW 10
702-
static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
703-
u64 src_ofs, u64 dst_ofs, unsigned int size,
704-
unsigned int pitch)
702+
static void emit_xy_fast_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
703+
u64 dst_ofs, unsigned int size,
704+
unsigned int pitch)
705705
{
706706
struct xe_device *xe = gt_to_xe(gt);
707707
u32 mocs = 0;
@@ -730,6 +730,61 @@ static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
730730
bb->cs[bb->len++] = upper_32_bits(src_ofs);
731731
}
732732

733+
#define PAGE_COPY_MODE_PS SZ_256 /* hw uses 256 bytes as the page-size */
734+
/*
 * emit_mem_copy() - append one MEM_COPY command to a batch buffer.
 * @gt: GT whose uncached MOCS index is used for both source and destination
 * @bb: batch buffer; ten dwords are written at bb->cs[bb->len] and bb->len
 *      is advanced past them
 * @src_ofs: 64-bit source offset, emitted as a low/high dword pair
 * @dst_ofs: 64-bit destination offset, emitted as a low/high dword pair
 * @size: total number of bytes to copy; must be non-zero
 * @pitch: row length in bytes for the matrix form; must be non-zero and
 *         fit in 16 bits
 *
 * One of three encodings is selected:
 *  - page copy: size and the low 32 bits of both offsets are all aligned to
 *    PAGE_COPY_MODE_PS; the width is then counted in 256-byte pages.
 *  - matrix byte copy: pitch > 1; width is one row (pitch bytes) and the
 *    height dword carries the row count (size / pitch).
 *  - linear byte copy: pitch == 1; width is the whole size in bytes.
 *
 * Width, height and pitch are all written to the command as "value - 1".
 */
static void emit_mem_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
735+
u64 dst_ofs, unsigned int size, unsigned int pitch)
736+
{
737+
u32 mode, copy_type, width;
738+
739+
/* Sanity-check the caller's geometry before encoding anything. */
xe_gt_assert(gt, IS_ALIGNED(size, pitch));
740+
xe_gt_assert(gt, pitch <= U16_MAX);
741+
xe_gt_assert(gt, pitch);
742+
xe_gt_assert(gt, size);
743+
744+
/* Only the low 32 bits of each offset are tested for 256-byte alignment. */
if (IS_ALIGNED(size, PAGE_COPY_MODE_PS) &&
745+
IS_ALIGNED(lower_32_bits(src_ofs), PAGE_COPY_MODE_PS) &&
746+
IS_ALIGNED(lower_32_bits(dst_ofs), PAGE_COPY_MODE_PS)) {
747+
mode = MEM_COPY_PAGE_COPY_MODE;
748+
copy_type = 0; /* linear copy */
749+
width = size / PAGE_COPY_MODE_PS;
750+
} else if (pitch > 1) {
751+
/* The row count goes into the height dword below; it must fit in 16 bits. */
xe_gt_assert(gt, size / pitch <= U16_MAX);
752+
mode = 0; /* BYTE_COPY */
753+
copy_type = MEM_COPY_MATRIX_COPY;
754+
width = pitch;
755+
} else {
756+
/* pitch == 1: emit a plain linear byte copy of the whole size. */
mode = 0; /* BYTE_COPY */
757+
copy_type = 0; /* linear copy */
758+
width = size;
759+
}
760+
761+
/* Whichever unit width is in (pages or bytes), it must fit in 16 bits. */
xe_gt_assert(gt, width <= U16_MAX);
762+
763+
/* Dword 0: opcode plus mode and copy-type bits. */
bb->cs[bb->len++] = MEM_COPY_CMD | mode | copy_type;
764+
bb->cs[bb->len++] = width - 1;
765+
bb->cs[bb->len++] = size / pitch - 1; /* ignored by hw for page-copy/linear above */
766+
/* NOTE(review): the two pitch dwords are presumably source then destination pitch - confirm against BSpec. */
bb->cs[bb->len++] = pitch - 1;
767+
bb->cs[bb->len++] = pitch - 1;
768+
bb->cs[bb->len++] = lower_32_bits(src_ofs);
769+
bb->cs[bb->len++] = upper_32_bits(src_ofs);
770+
bb->cs[bb->len++] = lower_32_bits(dst_ofs);
771+
bb->cs[bb->len++] = upper_32_bits(dst_ofs);
772+
/* Final dword: the same uncached MOCS index for source and destination. */
bb->cs[bb->len++] = FIELD_PREP(MEM_COPY_SRC_MOCS_INDEX_MASK, gt->mocs.uc_index) |
773+
FIELD_PREP(MEM_COPY_DST_MOCS_INDEX_MASK, gt->mocs.uc_index);
774+
}
775+
776+
/*
 * emit_copy() - emit a copy command using the instruction the device supports.
 *
 * Forwards all arguments unchanged to emit_mem_copy() when the device info
 * has has_mem_copy_instr set, and to emit_xy_fast_copy() otherwise. The
 * name and signature match the previous emit_copy(), which this commit
 * renamed to emit_xy_fast_copy().
 */
static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
777+
u64 src_ofs, u64 dst_ofs, unsigned int size,
778+
unsigned int pitch)
779+
{
780+
struct xe_device *xe = gt_to_xe(gt);
781+
782+
/* has_mem_copy_instr is copied from the platform descriptor in xe_info_init_early(). */
if (xe->info.has_mem_copy_instr)
783+
emit_mem_copy(gt, bb, src_ofs, dst_ofs, size, pitch);
784+
else
785+
emit_xy_fast_copy(gt, bb, src_ofs, dst_ofs, size, pitch);
786+
}
787+
733788
static u64 xe_migrate_batch_base(struct xe_migrate *m, bool usm)
734789
{
735790
return usm ? m->usm_batch_base_ofs : m->batch_base_ofs;

drivers/gpu/drm/xe/xe_pci.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,7 @@ static const struct xe_device_desc lnl_desc = {
342342
.has_display = true,
343343
.has_flat_ccs = 1,
344344
.has_pxp = true,
345+
.has_mem_copy_instr = true,
345346
.max_gt_per_tile = 2,
346347
.needs_scratch = true,
347348
.va_bits = 48,
@@ -362,6 +363,7 @@ static const struct xe_device_desc bmg_desc = {
362363
.has_heci_cscfi = 1,
363364
.has_late_bind = true,
364365
.has_sriov = true,
366+
.has_mem_copy_instr = true,
365367
.max_gt_per_tile = 2,
366368
.needs_scratch = true,
367369
.subplatforms = (const struct xe_subplatform_desc[]) {
@@ -378,6 +380,7 @@ static const struct xe_device_desc ptl_desc = {
378380
.has_display = true,
379381
.has_flat_ccs = 1,
380382
.has_sriov = true,
383+
.has_mem_copy_instr = true,
381384
.max_gt_per_tile = 2,
382385
.needs_scratch = true,
383386
.needs_shared_vf_gt_wq = true,
@@ -390,6 +393,7 @@ static const struct xe_device_desc nvls_desc = {
390393
.dma_mask_size = 46,
391394
.has_display = true,
392395
.has_flat_ccs = 1,
396+
.has_mem_copy_instr = true,
393397
.max_gt_per_tile = 2,
394398
.require_force_probe = true,
395399
.va_bits = 48,
@@ -655,6 +659,7 @@ static int xe_info_init_early(struct xe_device *xe,
655659
xe->info.has_pxp = desc->has_pxp;
656660
xe->info.has_sriov = xe_configfs_primary_gt_allowed(to_pci_dev(xe->drm.dev)) &&
657661
desc->has_sriov;
662+
xe->info.has_mem_copy_instr = desc->has_mem_copy_instr;
658663
xe->info.skip_guc_pc = desc->skip_guc_pc;
659664
xe->info.skip_mtcfg = desc->skip_mtcfg;
660665
xe->info.skip_pcode = desc->skip_pcode;

drivers/gpu/drm/xe/xe_pci_types.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ struct xe_device_desc {
4646
u8 has_late_bind:1;
4747
u8 has_llc:1;
4848
u8 has_mbx_power_limits:1;
49+
u8 has_mem_copy_instr:1;
4950
u8 has_pxp:1;
5051
u8 has_sriov:1;
5152
u8 needs_scratch:1;

0 commit comments

Comments
 (0)