Skip to content

Commit d2bdcde

Browse files
Dapeng MiPeter Zijlstra
authored andcommitted
perf/x86/intel: Add support for PEBS memory auxiliary info field in DMR
With the introduction of the OMR feature, the PEBS memory auxiliary info field for load and store latency events has been restructured for DMR. The memory auxiliary info field's bit[8] indicates whether an L2 cache miss occurred for a memory load or store instruction. If bit[8] is 0, it signifies no L2 cache miss, and bits[7:0] specify the exact cache data source (up to the L2 cache level). If bit[8] is 1, bits[7:0] represent the OMR encoding, indicating the specific L3 cache or memory region involved in the memory access. A significant enhancement of the OMR encoding is the ability to provide up to 8 fine-grained memory regions in addition to the cache region, offering more detailed insights into memory access regions. For detailed information on the memory auxiliary info encoding, please refer to section 16.2 "PEBS LOAD LATENCY AND STORE LATENCY FACILITY" in the ISE documentation. This patch ensures that the PEBS memory auxiliary info field is correctly interpreted and utilized in DMR. Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://patch.msgid.link/20260114011750.350569-3-dapeng1.mi@linux.intel.com
1 parent 4e955c0 commit d2bdcde

4 files changed

Lines changed: 190 additions & 6 deletions

File tree

arch/x86/events/intel/ds.c

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,17 @@ struct pebs_record_32 {
3434
3535
*/
3636

37+
union omr_encoding {
38+
struct {
39+
u8 omr_source : 4;
40+
u8 omr_remote : 1;
41+
u8 omr_hitm : 1;
42+
u8 omr_snoop : 1;
43+
u8 omr_promoted : 1;
44+
};
45+
u8 omr_full;
46+
};
47+
3748
union intel_x86_pebs_dse {
3849
u64 val;
3950
struct {
@@ -73,6 +84,18 @@ union intel_x86_pebs_dse {
7384
unsigned int lnc_addr_blk:1;
7485
unsigned int ld_reserved6:18;
7586
};
87+
struct {
88+
unsigned int pnc_dse: 8;
89+
unsigned int pnc_l2_miss:1;
90+
unsigned int pnc_stlb_clean_hit:1;
91+
unsigned int pnc_stlb_any_hit:1;
92+
unsigned int pnc_stlb_miss:1;
93+
unsigned int pnc_locked:1;
94+
unsigned int pnc_data_blk:1;
95+
unsigned int pnc_addr_blk:1;
96+
unsigned int pnc_fb_full:1;
97+
unsigned int ld_reserved8:16;
98+
};
7699
};
77100

78101

@@ -228,6 +251,85 @@ void __init intel_pmu_pebs_data_source_lnl(void)
228251
__intel_pmu_pebs_data_source_cmt(data_source);
229252
}
230253

254+
/* Version for Panthercove and later */
255+
256+
/* L2 hit */
257+
#define PNC_PEBS_DATA_SOURCE_MAX 16
258+
static u64 pnc_pebs_l2_hit_data_source[PNC_PEBS_DATA_SOURCE_MAX] = {
259+
P(OP, LOAD) | P(LVL, NA) | LEVEL(NA) | P(SNOOP, NA), /* 0x00: non-cache access */
260+
OP_LH | LEVEL(L0) | P(SNOOP, NONE), /* 0x01: L0 hit */
261+
OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x02: L1 hit */
262+
OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x03: L1 Miss Handling Buffer hit */
263+
OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x04: L2 Hit Clean */
264+
0, /* 0x05: Reserved */
265+
0, /* 0x06: Reserved */
266+
OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, HIT), /* 0x07: L2 Hit Snoop HIT */
267+
OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, HITM), /* 0x08: L2 Hit Snoop Hit Modified */
268+
OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, MISS), /* 0x09: Prefetch Promotion */
269+
OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, MISS), /* 0x0a: Cross Core Prefetch Promotion */
270+
0, /* 0x0b: Reserved */
271+
0, /* 0x0c: Reserved */
272+
0, /* 0x0d: Reserved */
273+
0, /* 0x0e: Reserved */
274+
OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */
275+
};
276+
277+
/* L2 miss */
278+
#define OMR_DATA_SOURCE_MAX 16
279+
static u64 omr_data_source[OMR_DATA_SOURCE_MAX] = {
280+
P(OP, LOAD) | P(LVL, NA) | LEVEL(NA) | P(SNOOP, NA), /* 0x00: invalid */
281+
0, /* 0x01: Reserved */
282+
OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, L_SHARE), /* 0x02: local CA shared cache */
283+
OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, L_NON_SHARE),/* 0x03: local CA non-shared cache */
284+
OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_IO), /* 0x04: other CA IO agent */
285+
OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_SHARE), /* 0x05: other CA shared cache */
286+
OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_NON_SHARE),/* 0x06: other CA non-shared cache */
287+
OP_LH | LEVEL(RAM) | P(REGION, MMIO), /* 0x07: MMIO */
288+
OP_LH | LEVEL(RAM) | P(REGION, MEM0), /* 0x08: Memory region 0 */
289+
OP_LH | LEVEL(RAM) | P(REGION, MEM1), /* 0x09: Memory region 1 */
290+
OP_LH | LEVEL(RAM) | P(REGION, MEM2), /* 0x0a: Memory region 2 */
291+
OP_LH | LEVEL(RAM) | P(REGION, MEM3), /* 0x0b: Memory region 3 */
292+
OP_LH | LEVEL(RAM) | P(REGION, MEM4), /* 0x0c: Memory region 4 */
293+
OP_LH | LEVEL(RAM) | P(REGION, MEM5), /* 0x0d: Memory region 5 */
294+
OP_LH | LEVEL(RAM) | P(REGION, MEM6), /* 0x0e: Memory region 6 */
295+
OP_LH | LEVEL(RAM) | P(REGION, MEM7), /* 0x0f: Memory region 7 */
296+
};
297+
298+
static u64 parse_omr_data_source(u8 dse)
299+
{
300+
union omr_encoding omr;
301+
u64 val = 0;
302+
303+
omr.omr_full = dse;
304+
val = omr_data_source[omr.omr_source];
305+
if (omr.omr_source > 0x1 && omr.omr_source < 0x7)
306+
val |= omr.omr_remote ? P(LVL, REM_CCE1) : 0;
307+
else if (omr.omr_source > 0x7)
308+
val |= omr.omr_remote ? P(LVL, REM_RAM1) : P(LVL, LOC_RAM);
309+
310+
if (omr.omr_remote)
311+
val |= REM;
312+
313+
val |= omr.omr_hitm ? P(SNOOP, HITM) : P(SNOOP, HIT);
314+
315+
if (omr.omr_source == 0x2) {
316+
u8 snoop = omr.omr_snoop | omr.omr_promoted;
317+
318+
if (snoop == 0x0)
319+
val |= P(SNOOP, NA);
320+
else if (snoop == 0x1)
321+
val |= P(SNOOP, MISS);
322+
else if (snoop == 0x2)
323+
val |= P(SNOOP, HIT);
324+
else if (snoop == 0x3)
325+
val |= P(SNOOP, NONE);
326+
} else if (omr.omr_source > 0x2 && omr.omr_source < 0x7) {
327+
val |= omr.omr_snoop ? P(SNOOPX, FWD) : 0;
328+
}
329+
330+
return val;
331+
}
332+
231333
static u64 precise_store_data(u64 status)
232334
{
233335
union intel_x86_pebs_dse dse;
@@ -411,6 +513,44 @@ u64 arl_h_latency_data(struct perf_event *event, u64 status)
411513
return lnl_latency_data(event, status);
412514
}
413515

516+
u64 pnc_latency_data(struct perf_event *event, u64 status)
517+
{
518+
union intel_x86_pebs_dse dse;
519+
union perf_mem_data_src src;
520+
u64 val;
521+
522+
dse.val = status;
523+
524+
if (!dse.pnc_l2_miss)
525+
val = pnc_pebs_l2_hit_data_source[dse.pnc_dse & 0xf];
526+
else
527+
val = parse_omr_data_source(dse.pnc_dse);
528+
529+
if (!val)
530+
val = P(OP, LOAD) | LEVEL(NA) | P(SNOOP, NA);
531+
532+
if (dse.pnc_stlb_miss)
533+
val |= P(TLB, MISS) | P(TLB, L2);
534+
else
535+
val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
536+
537+
if (dse.pnc_locked)
538+
val |= P(LOCK, LOCKED);
539+
540+
if (dse.pnc_data_blk)
541+
val |= P(BLK, DATA);
542+
if (dse.pnc_addr_blk)
543+
val |= P(BLK, ADDR);
544+
if (!dse.pnc_data_blk && !dse.pnc_addr_blk)
545+
val |= P(BLK, NA);
546+
547+
src.val = val;
548+
if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
549+
src.mem_op = P(OP, STORE);
550+
551+
return src.val;
552+
}
553+
414554
static u64 load_latency_data(struct perf_event *event, u64 status)
415555
{
416556
union intel_x86_pebs_dse dse;

arch/x86/events/perf_event.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1664,6 +1664,8 @@ u64 lnl_latency_data(struct perf_event *event, u64 status);
16641664

16651665
u64 arl_h_latency_data(struct perf_event *event, u64 status);
16661666

1667+
u64 pnc_latency_data(struct perf_event *event, u64 status);
1668+
16671669
extern struct event_constraint intel_core2_pebs_event_constraints[];
16681670

16691671
extern struct event_constraint intel_atom_pebs_event_constraints[];

include/uapi/linux/perf_event.h

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1330,14 +1330,16 @@ union perf_mem_data_src {
13301330
mem_snoopx : 2, /* Snoop mode, ext */
13311331
mem_blk : 3, /* Access blocked */
13321332
mem_hops : 3, /* Hop level */
1333-
mem_rsvd : 18;
1333+
mem_region : 5, /* cache/memory regions */
1334+
mem_rsvd : 13;
13341335
};
13351336
};
13361337
#elif defined(__BIG_ENDIAN_BITFIELD)
13371338
union perf_mem_data_src {
13381339
__u64 val;
13391340
struct {
1340-
__u64 mem_rsvd : 18,
1341+
__u64 mem_rsvd : 13,
1342+
mem_region : 5, /* cache/memory regions */
13411343
mem_hops : 3, /* Hop level */
13421344
mem_blk : 3, /* Access blocked */
13431345
mem_snoopx : 2, /* Snoop mode, ext */
@@ -1394,7 +1396,7 @@ union perf_mem_data_src {
13941396
#define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */
13951397
#define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer */
13961398
#define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */
1397-
/* 0x007 available */
1399+
#define PERF_MEM_LVLNUM_L0 0x0007 /* L0 */
13981400
#define PERF_MEM_LVLNUM_UNC 0x0008 /* Uncached */
13991401
#define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */
14001402
#define PERF_MEM_LVLNUM_IO 0x000a /* I/O */
@@ -1447,6 +1449,25 @@ union perf_mem_data_src {
14471449
/* 5-7 available */
14481450
#define PERF_MEM_HOPS_SHIFT 43
14491451

1452+
/* Cache/Memory region */
1453+
#define PERF_MEM_REGION_NA 0x0 /* Invalid */
1454+
#define PERF_MEM_REGION_RSVD 0x01 /* Reserved */
1455+
#define PERF_MEM_REGION_L_SHARE 0x02 /* Local CA shared cache */
1456+
#define PERF_MEM_REGION_L_NON_SHARE 0x03 /* Local CA non-shared cache */
1457+
#define PERF_MEM_REGION_O_IO 0x04 /* Other CA IO agent */
1458+
#define PERF_MEM_REGION_O_SHARE 0x05 /* Other CA shared cache */
1459+
#define PERF_MEM_REGION_O_NON_SHARE 0x06 /* Other CA non-shared cache */
1460+
#define PERF_MEM_REGION_MMIO 0x07 /* MMIO */
1461+
#define PERF_MEM_REGION_MEM0 0x08 /* Memory region 0 */
1462+
#define PERF_MEM_REGION_MEM1 0x09 /* Memory region 1 */
1463+
#define PERF_MEM_REGION_MEM2 0x0a /* Memory region 2 */
1464+
#define PERF_MEM_REGION_MEM3 0x0b /* Memory region 3 */
1465+
#define PERF_MEM_REGION_MEM4 0x0c /* Memory region 4 */
1466+
#define PERF_MEM_REGION_MEM5 0x0d /* Memory region 5 */
1467+
#define PERF_MEM_REGION_MEM6 0x0e /* Memory region 6 */
1468+
#define PERF_MEM_REGION_MEM7 0x0f /* Memory region 7 */
1469+
#define PERF_MEM_REGION_SHIFT 46
1470+
14501471
#define PERF_MEM_S(a, s) \
14511472
(((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)
14521473

tools/include/uapi/linux/perf_event.h

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1330,14 +1330,16 @@ union perf_mem_data_src {
13301330
mem_snoopx : 2, /* Snoop mode, ext */
13311331
mem_blk : 3, /* Access blocked */
13321332
mem_hops : 3, /* Hop level */
1333-
mem_rsvd : 18;
1333+
mem_region : 5, /* cache/memory regions */
1334+
mem_rsvd : 13;
13341335
};
13351336
};
13361337
#elif defined(__BIG_ENDIAN_BITFIELD)
13371338
union perf_mem_data_src {
13381339
__u64 val;
13391340
struct {
1340-
__u64 mem_rsvd : 18,
1341+
__u64 mem_rsvd : 13,
1342+
mem_region : 5, /* cache/memory regions */
13411343
mem_hops : 3, /* Hop level */
13421344
mem_blk : 3, /* Access blocked */
13431345
mem_snoopx : 2, /* Snoop mode, ext */
@@ -1394,7 +1396,7 @@ union perf_mem_data_src {
13941396
#define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */
13951397
#define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer */
13961398
#define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */
1397-
/* 0x007 available */
1399+
#define PERF_MEM_LVLNUM_L0 0x0007 /* L0 */
13981400
#define PERF_MEM_LVLNUM_UNC 0x0008 /* Uncached */
13991401
#define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */
14001402
#define PERF_MEM_LVLNUM_IO 0x000a /* I/O */
@@ -1447,6 +1449,25 @@ union perf_mem_data_src {
14471449
/* 5-7 available */
14481450
#define PERF_MEM_HOPS_SHIFT 43
14491451

1452+
/* Cache/Memory region */
1453+
#define PERF_MEM_REGION_NA 0x0 /* Invalid */
1454+
#define PERF_MEM_REGION_RSVD 0x01 /* Reserved */
1455+
#define PERF_MEM_REGION_L_SHARE 0x02 /* Local CA shared cache */
1456+
#define PERF_MEM_REGION_L_NON_SHARE 0x03 /* Local CA non-shared cache */
1457+
#define PERF_MEM_REGION_O_IO 0x04 /* Other CA IO agent */
1458+
#define PERF_MEM_REGION_O_SHARE 0x05 /* Other CA shared cache */
1459+
#define PERF_MEM_REGION_O_NON_SHARE 0x06 /* Other CA non-shared cache */
1460+
#define PERF_MEM_REGION_MMIO 0x07 /* MMIO */
1461+
#define PERF_MEM_REGION_MEM0 0x08 /* Memory region 0 */
1462+
#define PERF_MEM_REGION_MEM1 0x09 /* Memory region 1 */
1463+
#define PERF_MEM_REGION_MEM2 0x0a /* Memory region 2 */
1464+
#define PERF_MEM_REGION_MEM3 0x0b /* Memory region 3 */
1465+
#define PERF_MEM_REGION_MEM4 0x0c /* Memory region 4 */
1466+
#define PERF_MEM_REGION_MEM5 0x0d /* Memory region 5 */
1467+
#define PERF_MEM_REGION_MEM6 0x0e /* Memory region 6 */
1468+
#define PERF_MEM_REGION_MEM7 0x0f /* Memory region 7 */
1469+
#define PERF_MEM_REGION_SHIFT 46
1470+
14501471
#define PERF_MEM_S(a, s) \
14511472
(((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)
14521473

0 commit comments

Comments
 (0)