Rust-for-Linux
diff --git a/‎tools/lib/perf/evlist.c‎
Lines changed: 7 additions & 6 deletions b/‎tools/lib/perf/evlist.c‎
Lines changed: 7 additions & 6 deletions
diff --git a/‎tools/lib/perf/evsel.c‎
Lines changed: 27 additions & 7 deletions b/‎tools/lib/perf/evsel.c‎
Lines changed: 27 additions & 7 deletions
diff --git a/‎tools/lib/perf/include/internal/evlist.h‎
Lines changed: 2 additions & 1 deletion b/‎tools/lib/perf/include/internal/evlist.h‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎tools/lib/perf/include/internal/lib.h‎
Lines changed: 2 additions & 0 deletions b/‎tools/lib/perf/include/internal/lib.h‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎tools/lib/perf/include/perf/cpumap.h‎
Lines changed: 3 additions & 0 deletions b/‎tools/lib/perf/include/perf/cpumap.h‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎tools/lib/perf/include/perf/evsel.h‎
Lines changed: 1 addition & 0 deletions b/‎tools/lib/perf/include/perf/evsel.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tools/lib/perf/lib.c‎
Lines changed: 20 additions & 0 deletions b/‎tools/lib/perf/lib.c‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎tools/perf/.gitignore‎
Lines changed: 0 additions & 1 deletion b/‎tools/perf/.gitignore‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎tools/perf/Documentation/perf-annotate.txt‎
Lines changed: 5 additions & 0 deletions b/‎tools/perf/Documentation/perf-annotate.txt‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎tools/perf/Documentation/perf-arm-spe.txt‎
Lines changed: 218 additions & 0 deletions b/‎tools/perf/Documentation/perf-arm-spe.txt‎
Lines changed: 218 additions & 0 deletions
@@ -59,6 +59,10 @@ static void perf_evlist__propagate_maps(struct perf_evlist *evlist)
 {
 	struct perf_evsel *evsel;
 
+	/* Recomputing all_cpus, so start with a blank slate. */
+	perf_cpu_map__put(evlist->all_cpus);
+	evlist->all_cpus = NULL;
+
 	perf_evlist__for_each_evsel(evlist, evsel)
 		__perf_evlist__propagate_maps(evlist, evsel);
 }
@@ -474,6 +478,9 @@ mmap_per_evsel(struct perf_evlist *evlist, struct perf_evlist_mmap_ops *ops,
 			 */
 			refcount_set(&map->refcnt, 2);
 
+			if (ops->idx)
+				ops->idx(evlist, evsel, mp, idx);
+
 			if (ops->mmap(map, mp, *output, evlist_cpu) < 0)
 				return -1;
 
@@ -516,9 +523,6 @@ mmap_per_thread(struct perf_evlist *evlist, struct perf_evlist_mmap_ops *ops,
 		int output = -1;
 		int output_overwrite = -1;
 
-		if (ops->idx)
-			ops->idx(evlist, mp, thread, false);
-
 		if (mmap_per_evsel(evlist, ops, thread, mp, 0, thread,
 				   &output, &output_overwrite))
 			goto out_unmap;
@@ -543,9 +547,6 @@ mmap_per_cpu(struct perf_evlist *evlist, struct perf_evlist_mmap_ops *ops,
 		int output = -1;
 		int output_overwrite = -1;
 
-		if (ops->idx)
-			ops->idx(evlist, mp, cpu, true);
-
 		for (thread = 0; thread < nr_threads; thread++) {
 			if (mmap_per_evsel(evlist, ops, cpu, mp, cpu,
 					   thread, &output, &output_overwrite))
 
@@ -328,20 +328,25 @@ int perf_evsel__read(struct perf_evsel *evsel, int cpu_map_idx, int thread,
 	return 0;
 }
 
+static int perf_evsel__ioctl(struct perf_evsel *evsel, int ioc, void *arg,
+			     int cpu_map_idx, int thread)
+{
+	int *fd = FD(evsel, cpu_map_idx, thread);
+
+	if (fd == NULL || *fd < 0)
+		return -1;
+
+	return ioctl(*fd, ioc, arg);
+}
+
 static int perf_evsel__run_ioctl(struct perf_evsel *evsel,
 				 int ioc,  void *arg,
 				 int cpu_map_idx)
 {
 	int thread;
 
 	for (thread = 0; thread < xyarray__max_y(evsel->fd); thread++) {
-		int err;
-		int *fd = FD(evsel, cpu_map_idx, thread);
-
-		if (fd == NULL || *fd < 0)
-			return -1;
-
-		err = ioctl(*fd, ioc, arg);
+		int err = perf_evsel__ioctl(evsel, ioc, arg, cpu_map_idx, thread);
 
 		if (err)
 			return err;
@@ -355,6 +360,21 @@ int perf_evsel__enable_cpu(struct perf_evsel *evsel, int cpu_map_idx)
 	return perf_evsel__run_ioctl(evsel, PERF_EVENT_IOC_ENABLE, NULL, cpu_map_idx);
 }
 
+int perf_evsel__enable_thread(struct perf_evsel *evsel, int thread)
+{
+	struct perf_cpu cpu __maybe_unused;
+	int idx;
+	int err;
+
+	perf_cpu_map__for_each_cpu(cpu, idx, evsel->cpus) {
+		err = perf_evsel__ioctl(evsel, PERF_EVENT_IOC_ENABLE, NULL, idx, thread);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 int perf_evsel__enable(struct perf_evsel *evsel)
 {
 	int i;
 
@@ -38,7 +38,8 @@ struct perf_evlist {
 };
 
 typedef void
-(*perf_evlist_mmap__cb_idx_t)(struct perf_evlist*, struct perf_mmap_param*, int, bool);
+(*perf_evlist_mmap__cb_idx_t)(struct perf_evlist*, struct perf_evsel*,
+			      struct perf_mmap_param*, int);
 typedef struct perf_mmap*
 (*perf_evlist_mmap__cb_get_t)(struct perf_evlist*, bool, int);
 typedef int
 
@@ -9,4 +9,6 @@ extern unsigned int page_size;
 ssize_t readn(int fd, void *buf, size_t n);
 ssize_t writen(int fd, const void *buf, size_t n);
 
+ssize_t preadn(int fd, void *buf, size_t n, off_t offs);
+
 #endif /* __LIBPERF_INTERNAL_CPUMAP_H */
@@ -31,4 +31,7 @@ LIBPERF_API bool perf_cpu_map__has(const struct perf_cpu_map *map, struct perf_c
 	     (idx) < perf_cpu_map__nr(cpus);			\
 	     (idx)++, (cpu) = perf_cpu_map__cpu(cpus, idx))
 
+#define perf_cpu_map__for_each_idx(idx, cpus)				\
+	for ((idx) = 0; (idx) < perf_cpu_map__nr(cpus); (idx)++)
+
 #endif /* __LIBPERF_CPUMAP_H */
@@ -36,6 +36,7 @@ LIBPERF_API int perf_evsel__read(struct perf_evsel *evsel, int cpu_map_idx, int
 				 struct perf_counts_values *count);
 LIBPERF_API int perf_evsel__enable(struct perf_evsel *evsel);
 LIBPERF_API int perf_evsel__enable_cpu(struct perf_evsel *evsel, int cpu_map_idx);
+LIBPERF_API int perf_evsel__enable_thread(struct perf_evsel *evsel, int thread);
 LIBPERF_API int perf_evsel__disable(struct perf_evsel *evsel);
 LIBPERF_API int perf_evsel__disable_cpu(struct perf_evsel *evsel, int cpu_map_idx);
 LIBPERF_API struct perf_cpu_map *perf_evsel__cpus(struct perf_evsel *evsel);
 
@@ -38,6 +38,26 @@ ssize_t readn(int fd, void *buf, size_t n)
 	return ion(true, fd, buf, n);
 }
 
+ssize_t preadn(int fd, void *buf, size_t n, off_t offs)
+{
+	size_t left = n;
+
+	while (left) {
+		ssize_t ret = pread(fd, buf, left, offs);
+
+		if (ret < 0 && errno == EINTR)
+			continue;
+		if (ret <= 0)
+			return ret;
+
+		left -= ret;
+		buf  += ret;
+		offs += ret;
+	}
+
+	return n;
+}
+
 /*
  * Write exactly 'n' bytes or return an error.
  */
 
@@ -19,7 +19,6 @@ perf.data
 perf.data.old
 output.svg
 perf-archive
-perf-with-kcore
 perf-iostat
 tags
 TAGS
 
@@ -147,6 +147,11 @@ include::itrace.txt[]
 	The period/hits keywords set the base the percentage is computed
 	on - the samples period or the number of samples (hits).
 
+--percent-limit::
+	Do not show functions which have an overhead under that percent on
+	stdio or stdio2 (Default: 0).  Note that this is about selection of
+	functions to display, not about lines within the function.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-report[1]
@@ -0,0 +1,218 @@
+perf-arm-spe(1)
+================
+
+NAME
+----
+perf-arm-spe - Support for Arm Statistical Profiling Extension within Perf tools
+
+SYNOPSIS
+--------
+[verse]
+'perf record' -e arm_spe//
+
+DESCRIPTION
+-----------
+
+The SPE (Statistical Profiling Extension) feature provides accurate attribution of latencies and
+ events down to individual instructions. Rather than being interrupt-driven, it picks an
+instruction to sample and then captures data for it during execution. Data includes execution time
+in cycles. For loads and stores it also includes data address, cache miss events, and data origin.
+
+The sampling has 5 stages:
+
+  1. Choose an operation
+  2. Collect data about the operation
+  3. Optionally discard the record based on a filter
+  4. Write the record to memory
+  5. Interrupt when the buffer is full
+
+Choose an operation
+~~~~~~~~~~~~~~~~~~~
+
+This is chosen from a sample population, for SPE this is an IMPLEMENTATION DEFINED choice of all
+architectural instructions or all micro-ops. Sampling happens at a programmable interval. The
+architecture provides a mechanism for the SPE driver to infer the minimum interval at which it should
+sample. This minimum interval is used by the driver if no interval is specified. A pseudo-random
+perturbation is also added to the sampling interval by default.
+
+Collect data about the operation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Program counter, PMU events, timings and data addresses related to the operation are recorded.
+Sampling ensures there is only one sampled operation is in flight.
+
+Optionally discard the record based on a filter
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Based on programmable criteria, choose whether to keep the record or discard it. If the record is
+discarded then the flow stops here for this sample.
+
+Write the record to memory
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The record is appended to a memory buffer
+
+Interrupt when the buffer is full
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When the buffer fills, an interrupt is sent and the driver signals Perf to collect the records.
+Perf saves the raw data in the perf.data file.
+
+Opening the file
+----------------
+
+Up until this point no decoding of the SPE data was done by either the kernel or Perf. Only when the
+recorded file is opened with 'perf report' or 'perf script' does the decoding happen. When decoding
+the data, Perf generates "synthetic samples" as if these were generated at the time of the
+recording. These samples are the same as if normal sampling was done by Perf without using SPE,
+although they may have more attributes associated with them. For example a normal sample may have
+just the instruction pointer, but an SPE sample can have data addresses and latency attributes.
+
+Why Sampling?
+-------------
+
+ - Sampling, rather than tracing, cuts down the profiling problem to something more manageable for
+ hardware. Only one sampled operation is in flight at a time.
+
+ - Allows precise attribution data, including: Full PC of instruction, data virtual and physical
+ addresses.
+
+ - Allows correlation between an instruction and events, such as TLB and cache miss. (Data source
+ indicates which particular cache was hit, but the meaning is implementation defined because
+ different implementations can have different cache configurations.)
+
+However, SPE does not provide any call-graph information, and relies on statistical methods.
+
+Collisions
+----------
+
+When an operation is sampled while a previous sampled operation has not finished, a collision
+occurs. The new sample is dropped. Collisions affect the integrity of the data, so the sample rate
+should be set to avoid collisions.
+
+The 'sample_collision' PMU event can be used to determine the number of lost samples. Although this
+count is based on collisions _before_ filtering occurs. Therefore this can not be used as an exact
+number for samples dropped that would have made it through the filter, but can be a rough
+guide.
+
+The effect of microarchitectural sampling
+-----------------------------------------
+
+If an implementation samples micro-operations instead of instructions, the results of sampling must
+be weighted accordingly.
+
+For example, if a given instruction A is always converted into two micro-operations, A0 and A1, it
+becomes twice as likely to appear in the sample population.
+
+The coarse effect of conversions, and, if applicable, sampling of speculative operations, can be
+estimated from the 'sample_pop' and 'inst_retired' PMU events.
+
+Kernel Requirements
+-------------------
+
+The ARM_SPE_PMU config must be set to build as either a module or statically.
+
+Depending on CPU model, the kernel may need to be booted with page table isolation disabled
+(kpti=off). If KPTI needs to be disabled, this will fail with a console message "profiling buffer
+inaccessible. Try passing 'kpti=off' on the kernel command line".
+
+Capturing SPE with perf command-line tools
+------------------------------------------
+
+You can record a session with SPE samples:
+
+  perf record -e arm_spe// -- ./mybench
+
+The sample period is set from the -c option, and because the minimum interval is used by default
+it's recommended to set this to a higher value. The value is written to PMSIRR.INTERVAL.
+
+Config parameters
+~~~~~~~~~~~~~~~~~
+
+These are placed between the // in the event and comma separated. For example '-e
+arm_spe/load_filter=1,min_latency=10/'
+
+  branch_filter=1     - collect branches only (PMSFCR.B)
+  event_filter=<mask> - filter on specific events (PMSEVFR) - see bitfield description below
+  jitter=1            - use jitter to avoid resonance when sampling (PMSIRR.RND)
+  load_filter=1       - collect loads only (PMSFCR.LD)
+  min_latency=<n>     - collect only samples with this latency or higher* (PMSLATFR)
+  pa_enable=1         - collect physical address (as well as VA) of loads/stores (PMSCR.PA) - requires privilege
+  pct_enable=1        - collect physical timestamp instead of virtual timestamp (PMSCR.PCT) - requires privilege
+  store_filter=1      - collect stores only (PMSFCR.ST)
+  ts_enable=1         - enable timestamping with value of generic timer (PMSCR.TS)
+
++++*+++ Latency is the total latency from the point at which sampling started on that instruction, rather
+than only the execution latency.
+
+Only some events can be filtered on; these include:
+
+  bit 1     - instruction retired (i.e. omit speculative instructions)
+  bit 3     - L1D refill
+  bit 5     - TLB refill
+  bit 7     - mispredict
+  bit 11    - misaligned access
+
+So to sample just retired instructions:
+
+  perf record -e arm_spe/event_filter=2/ -- ./mybench
+
+or just mispredicted branches:
+
+  perf record -e arm_spe/event_filter=0x80/ -- ./mybench
+
+Viewing the data
+~~~~~~~~~~~~~~~~~
+
+By default perf report and perf script will assign samples to separate groups depending on the
+attributes/events of the SPE record. Because instructions can have multiple events associated with
+them, the samples in these groups are not necessarily unique. For example perf report shows these
+groups:
+
+  Available samples
+  0 arm_spe//
+  0 dummy:u
+  21 l1d-miss
+  897 l1d-access
+  5 llc-miss
+  7 llc-access
+  2 tlb-miss
+  1K tlb-access
+  36 branch-miss
+  0 remote-access
+  900 memory
+
+The arm_spe// and dummy:u events are implementation details and are expected to be empty.
+
+To get a full list of unique samples that are not sorted into groups, set the itrace option to
+generate 'instruction' samples. The period option is also taken into account, so set it to 1
+instruction unless you want to further downsample the already sampled SPE data:
+
+  perf report --itrace=i1i
+
+Memory access details are also stored on the samples and this can be viewed with:
+
+  perf report --mem-mode
+
+Common errors
+~~~~~~~~~~~~~
+
+ - "Cannot find PMU `arm_spe'. Missing kernel support?"
+
+   Module not built or loaded, KPTI not disabled (see above), or running on a VM
+
+ - "Arm SPE CONTEXT packets not found in the traces."
+
+   Root privilege is required to collect context packets. But these only increase the accuracy of
+   assigning PIDs to kernel samples. For userspace sampling this can be ignored.
+
+ - Excessively large perf.data file size
+
+   Increase sampling interval (see above)
+
+
+SEE ALSO
+--------
+
+linkperf:perf-record[1], linkperf:perf-script[1], linkperf:perf-report[1],
+linkperf:perf-inject[1]