AsahiLinux
diff --git a/‎tools/lib/api/io.h‎
Lines changed: 38 additions & 31 deletions b/‎tools/lib/api/io.h‎
Lines changed: 38 additions & 31 deletions
diff --git a/‎tools/lib/perf/include/perf/event.h‎
Lines changed: 6 additions & 0 deletions b/‎tools/lib/perf/include/perf/event.h‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎tools/perf/Build‎
Lines changed: 8 additions & 6 deletions b/‎tools/perf/Build‎
Lines changed: 8 additions & 6 deletions
diff --git a/‎tools/perf/Documentation/perf-amd-ibs.txt‎
Lines changed: 189 additions & 0 deletions b/‎tools/perf/Documentation/perf-amd-ibs.txt‎
Lines changed: 189 additions & 0 deletions
diff --git a/‎tools/perf/Documentation/perf-kwork.txt‎
Lines changed: 2 additions & 2 deletions b/‎tools/perf/Documentation/perf-kwork.txt‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎tools/perf/Documentation/perf-lock.txt‎
Lines changed: 2 additions & 2 deletions b/‎tools/perf/Documentation/perf-lock.txt‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎tools/perf/Documentation/perf-mem.txt‎
Lines changed: 1 addition & 1 deletion b/‎tools/perf/Documentation/perf-mem.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎tools/perf/Documentation/perf-record.txt‎
Lines changed: 2 additions & 2 deletions b/‎tools/perf/Documentation/perf-record.txt‎
Lines changed: 2 additions & 2 deletions
@@ -43,48 +43,55 @@ static inline void io__init(struct io *io, int fd,
 	io->eof = false;
 }
 
-/* Reads one character from the "io" file with similar semantics to fgetc. */
-static inline int io__get_char(struct io *io)
+/* Read from fd filling the buffer. Called when io->data == io->end. */
+static inline int io__fill_buffer(struct io *io)
 {
-	char *ptr = io->data;
+	ssize_t n;
 
 	if (io->eof)
 		return -1;
 
-	if (ptr == io->end) {
-		ssize_t n;
-
-		if (io->timeout_ms != 0) {
-			struct pollfd pfds[] = {
-				{
-					.fd = io->fd,
-					.events = POLLIN,
-				},
-			};
-
-			n = poll(pfds, 1, io->timeout_ms);
-			if (n == 0)
-				errno = ETIMEDOUT;
-			if (n > 0 && !(pfds[0].revents & POLLIN)) {
-				errno = EIO;
-				n = -1;
-			}
-			if (n <= 0) {
-				io->eof = true;
-				return -1;
-			}
+	if (io->timeout_ms != 0) {
+		struct pollfd pfds[] = {
+			{
+				.fd = io->fd,
+				.events = POLLIN,
+			},
+		};
+
+		n = poll(pfds, 1, io->timeout_ms);
+		if (n == 0)
+			errno = ETIMEDOUT;
+		if (n > 0 && !(pfds[0].revents & POLLIN)) {
+			errno = EIO;
+			n = -1;
 		}
-		n = read(io->fd, io->buf, io->buf_len);
-
 		if (n <= 0) {
 			io->eof = true;
 			return -1;
 		}
-		ptr = &io->buf[0];
-		io->end = &io->buf[n];
 	}
-	io->data = ptr + 1;
-	return *ptr;
+	n = read(io->fd, io->buf, io->buf_len);
+
+	if (n <= 0) {
+		io->eof = true;
+		return -1;
+	}
+	io->data = &io->buf[0];
+	io->end = &io->buf[n];
+	return 0;
+}
+
+/* Reads one character from the "io" file with similar semantics to fgetc. */
+static inline int io__get_char(struct io *io)
+{
+	if (io->data == io->end) {
+		int ret = io__fill_buffer(io);
+
+		if (ret)
+			return ret;
+	}
+	return *io->data++;
 }
 
 /* Read a hexadecimal value with no 0x prefix into the out argument hex. If the
 
@@ -77,6 +77,12 @@ struct perf_record_lost_samples {
 	__u64			 lost;
 };
 
+#define MAX_ID_HDR_ENTRIES  6
+struct perf_record_lost_samples_and_ids {
+	struct perf_record_lost_samples lost;
+	__u64 sample_ids[MAX_ID_HDR_ENTRIES];
+};
+
 /*
  * PERF_FORMAT_ENABLED | PERF_FORMAT_RUNNING | PERF_FORMAT_ID | PERF_FORMAT_LOST
  */
 
@@ -1,4 +1,4 @@
-perf-y += builtin-bench.o
+perf-bench-y += builtin-bench.o
 perf-y += builtin-annotate.o
 perf-y += builtin-config.o
 perf-y += builtin-diff.o
@@ -35,8 +35,8 @@ endif
 
 perf-$(CONFIG_LIBELF) += builtin-probe.o
 
-perf-y += bench/
-perf-y += tests/
+perf-bench-y += bench/
+perf-test-y += tests/
 
 perf-y += perf.o
 
@@ -53,10 +53,12 @@ CFLAGS_builtin-trace.o	   += -DSTRACE_GROUPS_DIR="BUILD_STR($(STRACE_GROUPS_DIR_
 CFLAGS_builtin-report.o	   += -DTIPDIR="BUILD_STR($(tipdir_SQ))"
 CFLAGS_builtin-report.o	   += -DDOCDIR="BUILD_STR($(srcdir_SQ)/Documentation)"
 
-perf-y += util/
+perf-util-y += util/
+perf-util-y += arch/
 perf-y += arch/
-perf-y += ui/
-perf-y += scripts/
+perf-test-y += arch/
+perf-ui-y += ui/
+perf-util-y += scripts/
 
 gtk-y += ui/gtk/
 
 
@@ -0,0 +1,189 @@
+perf-amd-ibs(1)
+===============
+
+NAME
+----
+perf-amd-ibs - Support for AMD Instruction-Based Sampling (IBS) with perf tool
+
+SYNOPSIS
+--------
+[verse]
+'perf record' -e ibs_op//
+'perf record' -e ibs_fetch//
+
+DESCRIPTION
+-----------
+
+Instruction-Based Sampling (IBS) provides precise Instruction Pointer (IP)
+profiling support on AMD platforms. IBS has two independent components: IBS
+Op and IBS Fetch. IBS Op sampling provides information about instruction
+execution (micro-op execution to be precise) with details like d-cache
+hit/miss, d-TLB hit/miss, cache miss latency, load/store data source, branch
+behavior etc. IBS Fetch sampling provides information about instruction fetch
+with details like i-cache hit/miss, i-TLB hit/miss, fetch latency etc. IBS is
+per-smt-thread i.e. each SMT hardware thread contains standalone IBS units.
+
+Both, IBS Op and IBS Fetch, are exposed as PMUs by Linux and can be exploited
+using the Linux perf utility. The following files will be created at boot time
+if IBS is supported by the hardware and kernel.
+
+  /sys/bus/event_source/devices/ibs_op/
+  /sys/bus/event_source/devices/ibs_fetch/
+
+IBS Op PMU supports two events: cycles and micro ops. IBS Fetch PMU supports
+one event: fetch ops.
+
+IBS PMUs do not have user/kernel filtering capability and thus require
+CAP_SYS_ADMIN or CAP_PERFMON privilege.
+
+IBS VS. REGULAR CORE PMU
+------------------------
+
+IBS gives samples with precise IP, i.e. the IP recorded with IBS sample has
+no skid. Whereas the IP recorded by regular core PMU will have some skid
+(sample was generated at IP X but perf would record it at IP X+n). Hence,
+regular core PMU might not help for profiling with instruction level
+precision. Further, IBS provides additional information about the sample in
+question. On the other hand, regular core PMU has its own advantages like
+plethora of events, counting mode (less interference), up to 6 parallel
+counters, event grouping support, filtering capabilities etc.
+
+Three regular core PMU events are internally forwarded to IBS Op PMU when
+precise_ip attribute is set:
+
+	-e cpu-cycles:p becomes -e ibs_op//
+	-e r076:p becomes -e ibs_op//
+	-e r0C1:p becomes -e ibs_op/cnt_ctl=1/
+
+EXAMPLES
+--------
+
+IBS Op PMU
+~~~~~~~~~~
+
+System-wide profile, cycles event, sampling period: 100000
+
+	# perf record -e ibs_op// -c 100000 -a
+
+Per-cpu profile (cpu10), cycles event, sampling period: 100000
+
+	# perf record -e ibs_op// -c 100000 -C 10
+
+Per-cpu profile (cpu10), cycles event, sampling freq: 1000
+
+	# perf record -e ibs_op// -F 1000 -C 10
+
+System-wide profile, uOps event, sampling period: 100000
+
+	# perf record -e ibs_op/cnt_ctl=1/ -c 100000 -a
+
+Same command, but also capture IBS register raw dump along with perf sample:
+
+	# perf record -e ibs_op/cnt_ctl=1/ -c 100000 -a --raw-samples
+
+System-wide profile, uOps event, sampling period: 100000, L3MissOnly (Zen4 onward)
+
+	# perf record -e ibs_op/cnt_ctl=1,l3missonly=1/ -c 100000 -a
+
+Per process (upstream v6.2 onward), attaching to PID 1234, uOps event, sampling period: 100000
+
+	# perf record -e ibs_op/cnt_ctl=1/ -c 100000 -p 1234
+
+Per process (upstream v6.2 onward), profiling a new command (ls), uOps event, sampling period: 100000
+
+	# perf record -e ibs_op/cnt_ctl=1/ -c 100000 -- ls
+
+To analyse recorded profile in aggregate mode
+
+	# perf report
+	/* Select a line and press 'a' to drill down at instruction level. */
+
+To go over each sample
+
+	# perf script
+
+Raw dump of IBS registers when profiled with --raw-samples
+
+	# perf report -D
+	/* Look for PERF_RECORD_SAMPLE */
+
+	Example register raw dump:
+
+	ibs_op_ctl:     000002c30006186a MaxCnt    100000 L3MissOnly 0 En 1
+		Val 1 CntCtl 0=cycles CurCnt       707
+	IbsOpRip:       ffffffff8204aea7
+	ibs_op_data:    0000010002550001 CompToRetCtr     1 TagToRetCtr   597
+		BrnRet 0  RipInvalid 0 BrnFuse 0 Microcode 1
+	ibs_op_data2:   0000000000000013 RmtNode 1 DataSrc 3=DRAM
+	ibs_op_data3:   0000000031960092 LdOp 0 StOp 1 DcL1TlbMiss 0
+		DcL2TlbMiss 0 DcL1TlbHit2M 1 DcL1TlbHit1G 0 DcL2TlbHit2M 0
+		DcMiss 1 DcMisAcc 0 DcWcMemAcc 0 DcUcMemAcc 0 DcLockedOp 0
+		DcMissNoMabAlloc 0 DcLinAddrValid 1 DcPhyAddrValid 1
+		DcL2TlbHit1G 0 L2Miss 1 SwPf 0 OpMemWidth 32 bytes
+		OpDcMissOpenMemReqs 12 DcMissLat     0 TlbRefillLat     0
+	IbsDCLinAd:     ff110008a5398920
+	IbsDCPhysAd:    00000008a5398920
+
+IBS applied in a real-world use case
+
+	~90% regression was observed in tbench with a specific scheduler hint,
+	which was counterintuitive. IBS profiles of the good and bad runs captured
+	using perf helped in identifying the exact cause of the problem:
+
+	https://lore.kernel.org/r/20220921063638.2489-1-kprateek.nayak@amd.com
+
+IBS Fetch PMU
+~~~~~~~~~~~~~
+
+Similar commands can be used with Fetch PMU as well.
+
+System-wide profile, fetch ops event, sampling period: 100000
+
+	# perf record -e ibs_fetch// -c 100000 -a
+
+System-wide profile, fetch ops event, sampling period: 100000, Random enable
+
+	# perf record -e ibs_fetch/rand_en=1/ -c 100000 -a
+
+	Random enable adds a small degree of variability to the sample period. This
+	helps in cases like long running loops where the PMU is tagging the same
+	instruction over and over because of a fixed sample period.
+
+etc.
+
+PERF MEM AND PERF C2C
+---------------------
+
+perf mem is a memory access profiler tool and perf c2c is a shared data
+cacheline analyser tool. Both of them internally use IBS Op PMU on AMD.
+Below is a simple example of the perf mem tool.
+
+	# perf mem record -c 100000 -- make
+	# perf mem report
+
+A normal perf mem report output will provide detailed memory access profile.
+However, it can also be aggregated based on output fields. For example:
+
+	# perf mem report -F mem,sample,snoop
+	Samples: 3M of event 'ibs_op//', Event count (approx.): 23524876
+	Memory access                                 Samples  Snoop
+	N/A                                           1903343  N/A
+	L1 hit                                        1056754  N/A
+	L2 hit                                          75231  N/A
+	L3 hit                                           9496  HitM
+	L3 hit                                           2270  N/A
+	RAM hit                                          8710  N/A
+	Remote node, same socket RAM hit                 3241  N/A
+	Remote core, same node Any cache hit             1572  HitM
+	Remote core, same node Any cache hit              514  N/A
+	Remote node, same socket Any cache hit           1216  HitM
+	Remote node, same socket Any cache hit            350  N/A
+	Uncached hit                                       18  N/A
+
+Please refer to their man pages for more detail.
+
+SEE ALSO
+--------
+
+linkperf:perf-record[1], linkperf:perf-script[1], linkperf:perf-report[1],
+linkperf:perf-mem[1], linkperf:perf-c2c[1]
@@ -1,4 +1,4 @@
-perf-kowrk(1)
+perf-kwork(1)
 =============
 
 NAME
@@ -35,7 +35,7 @@ There are several variants of 'perf kwork':
         perf kwork top
         perf kwork top -b
 
-   By default it shows the individual work events such as irq, workqeueu,
+   By default it shows the individual work events such as irq, workqueue,
    including the run time and delay (time between raise and actually entry):
 
       Runtime start      Runtime end        Cpu     Kwork name                 Runtime     Delaytime
 
@@ -111,11 +111,11 @@ INFO OPTIONS
 
 -t::
 --threads::
-	dump thread list in perf.data
+	dump only the thread list in perf.data
 
 -m::
 --map::
-	dump map of lock instances (address:name table)
+	dump only the map of lock instances (address:name table)
 
 
 CONTENTION OPTIONS
 
@@ -21,7 +21,7 @@ and stores are sampled. Use the -t option to limit to loads or stores.
 
 Note that on Intel systems the memory latency reported is the use-latency,
 not the pure load (or store latency). Use latency includes any pipeline
-queueing delays in addition to the memory subsystem latency.
+queuing delays in addition to the memory subsystem latency.
 
 On Arm64 this uses SPE to sample load and store operations, therefore hardware
 and kernel support is required. See linkperf:perf-arm-spe[1] for a setup guide.
 
@@ -200,7 +200,7 @@ OPTIONS
 	  ip, id, tid, pid, cpu, time, addr, period, txn, weight, phys_addr,
 	  code_pgsz, data_pgsz, weight1, weight2, weight3, ins_lat, retire_lat,
 	  p_stage_cyc, mem_op, mem_lvl, mem_snoop, mem_remote, mem_lock,
-	  mem_dtlb, mem_blk, mem_hops
+	  mem_dtlb, mem_blk, mem_hops, uid, gid
 
 	The <operator> can be one of:
 	  ==, !=, >, >=, <, <=, &
@@ -311,7 +311,7 @@ OPTIONS
 	User can change the size by passing the size after comma like
 	"--call-graph dwarf,4096".
 
-	When "fp" recording is used, perf tries to save stack enties
+	When "fp" recording is used, perf tries to save stack entries
 	up to the number specified in sysctl.kernel.perf_event_max_stack
 	by default.  User can change the number by passing it after comma
 	like "--call-graph fp,32".