Skip to content

Commit d223575

Browse files
committed
Merge tag 'perf-tools-for-v5.19-2022-05-23' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux
Pull perf tool updates from Arnaldo Carvalho de Melo: "Intel PT: - Allow hardware tracing on KVM test programs. In this case, the VM is not running an OS, but only the functions loaded into it by the hypervisor test program, and conveniently, loaded at the same virtual addresses. - Improve documentation: - Add link to perf wiki's page - Cleanups: - Delete now unused perf-with-kcore.sh script - Remove unused machines__find_host() ARM SPE (Statistical Profile Extensions): - Add man page entry. Vendor Events: - Update various Intel event topics - Update various microarch events - Fix various cstate metrics - Fix Alderlake metric groups - Add sapphirerapids events - Add JSON files for ARM Cortex A34, A35, A55, A510, A65, A73, A75, A77, A78, A710, X1, X2 and Neoverse E1 - Update Cortex A57/A72 perf stat: - Introduce stats for the user and system rusage times perf c2c: - Prep work to support ARM systems perf annotate: - Add --percent-limit option perf lock: - Add -t/--thread option for report - Do not discard broken lock stats perf bench: - Add breakpoint benchmarks perf test: - Limit to only run executable scripts in tests - Add basic perf record tests - Add stat record+report test - Add basic stat and topdown group test - Skip several tests when the user hasn't permission to perform them - Fix test case 81 ("perf record tests") on s390x perf version: - debuginfod support improvements perf scripting python: - Expose symbol offset and source information perf build: - Error for BPF skeletons without LIBBPF - Use Python devtools for version autodetection rather than runtime Miscellaneous: - Add riscv64 support to 'perf jitdump' - Various fixes/tidy ups related to cpu_map - Fixes for handling Intel hybrid systems" * tag 'perf-tools-for-v5.19-2022-05-23' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux: (122 commits) perf intel-pt: Add guest_code support perf kvm report: Add guest_code support perf script: Add guest_code support perf tools: Add guest_code support 
perf tools: Factor out thread__set_guest_comm() perf tools: Add machine to machines back pointer perf vendors events arm64: Update Cortex A57/A72 perf vendors events arm64: Arm Neoverse E1 perf vendors events arm64: Arm Cortex-X2 perf vendors events arm64: Arm Cortex-X1 perf vendors events arm64: Arm Cortex-A710 perf vendors events arm64: Arm Cortex-A78 perf vendors events arm64: Arm Cortex-A77 perf vendors events arm64: Arm Cortex-A75 perf vendors events arm64: Arm Cortex-A73 perf vendors events arm64: Arm Cortex-A65 perf vendors events arm64: Arm Cortex-A510 perf vendors events arm64: Arm Cortex-A55 perf vendors events arm64: Arm Cortex-A35 perf vendors events arm64: Arm Cortex-A34 ...
2 parents e908305 + 5d2b6bc commit d223575

265 files changed

Lines changed: 19159 additions & 4151 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

tools/lib/perf/evlist.c

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,10 @@ static void perf_evlist__propagate_maps(struct perf_evlist *evlist)
5959
{
6060
struct perf_evsel *evsel;
6161

62+
/* Recomputing all_cpus, so start with a blank slate. */
63+
perf_cpu_map__put(evlist->all_cpus);
64+
evlist->all_cpus = NULL;
65+
6266
perf_evlist__for_each_evsel(evlist, evsel)
6367
__perf_evlist__propagate_maps(evlist, evsel);
6468
}
@@ -474,6 +478,9 @@ mmap_per_evsel(struct perf_evlist *evlist, struct perf_evlist_mmap_ops *ops,
474478
*/
475479
refcount_set(&map->refcnt, 2);
476480

481+
if (ops->idx)
482+
ops->idx(evlist, evsel, mp, idx);
483+
477484
if (ops->mmap(map, mp, *output, evlist_cpu) < 0)
478485
return -1;
479486

@@ -516,9 +523,6 @@ mmap_per_thread(struct perf_evlist *evlist, struct perf_evlist_mmap_ops *ops,
516523
int output = -1;
517524
int output_overwrite = -1;
518525

519-
if (ops->idx)
520-
ops->idx(evlist, mp, thread, false);
521-
522526
if (mmap_per_evsel(evlist, ops, thread, mp, 0, thread,
523527
&output, &output_overwrite))
524528
goto out_unmap;
@@ -543,9 +547,6 @@ mmap_per_cpu(struct perf_evlist *evlist, struct perf_evlist_mmap_ops *ops,
543547
int output = -1;
544548
int output_overwrite = -1;
545549

546-
if (ops->idx)
547-
ops->idx(evlist, mp, cpu, true);
548-
549550
for (thread = 0; thread < nr_threads; thread++) {
550551
if (mmap_per_evsel(evlist, ops, cpu, mp, cpu,
551552
thread, &output, &output_overwrite))

tools/lib/perf/evsel.c

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -328,20 +328,25 @@ int perf_evsel__read(struct perf_evsel *evsel, int cpu_map_idx, int thread,
328328
return 0;
329329
}
330330

331+
/*
 * Issue an ioctl on the perf event fd of a single (cpu map index, thread)
 * slot of this evsel.  Fails with -1 if that slot has no open fd.
 */
static int perf_evsel__ioctl(struct perf_evsel *evsel, int ioc, void *arg,
			     int cpu_map_idx, int thread)
{
	int *fdp = FD(evsel, cpu_map_idx, thread);

	if (!fdp || *fdp < 0)
		return -1;

	return ioctl(*fdp, ioc, arg);
}
341+
331342
static int perf_evsel__run_ioctl(struct perf_evsel *evsel,
332343
int ioc, void *arg,
333344
int cpu_map_idx)
334345
{
335346
int thread;
336347

337348
for (thread = 0; thread < xyarray__max_y(evsel->fd); thread++) {
338-
int err;
339-
int *fd = FD(evsel, cpu_map_idx, thread);
340-
341-
if (fd == NULL || *fd < 0)
342-
return -1;
343-
344-
err = ioctl(*fd, ioc, arg);
349+
int err = perf_evsel__ioctl(evsel, ioc, arg, cpu_map_idx, thread);
345350

346351
if (err)
347352
return err;
@@ -355,6 +360,21 @@ int perf_evsel__enable_cpu(struct perf_evsel *evsel, int cpu_map_idx)
355360
return perf_evsel__run_ioctl(evsel, PERF_EVENT_IOC_ENABLE, NULL, cpu_map_idx);
356361
}
357362

363+
int perf_evsel__enable_thread(struct perf_evsel *evsel, int thread)
364+
{
365+
struct perf_cpu cpu __maybe_unused;
366+
int idx;
367+
int err;
368+
369+
perf_cpu_map__for_each_cpu(cpu, idx, evsel->cpus) {
370+
err = perf_evsel__ioctl(evsel, PERF_EVENT_IOC_ENABLE, NULL, idx, thread);
371+
if (err)
372+
return err;
373+
}
374+
375+
return 0;
376+
}
377+
358378
int perf_evsel__enable(struct perf_evsel *evsel)
359379
{
360380
int i;

tools/lib/perf/include/internal/evlist.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,8 @@ struct perf_evlist {
3838
};
3939

4040
typedef void
41-
(*perf_evlist_mmap__cb_idx_t)(struct perf_evlist*, struct perf_mmap_param*, int, bool);
41+
(*perf_evlist_mmap__cb_idx_t)(struct perf_evlist*, struct perf_evsel*,
42+
struct perf_mmap_param*, int);
4243
typedef struct perf_mmap*
4344
(*perf_evlist_mmap__cb_get_t)(struct perf_evlist*, bool, int);
4445
typedef int

tools/lib/perf/include/internal/lib.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,6 @@ extern unsigned int page_size;
99
ssize_t readn(int fd, void *buf, size_t n);
1010
ssize_t writen(int fd, const void *buf, size_t n);
1111

12+
ssize_t preadn(int fd, void *buf, size_t n, off_t offs);
13+
1214
#endif /* __LIBPERF_INTERNAL_CPUMAP_H */

tools/lib/perf/include/perf/cpumap.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,7 @@ LIBPERF_API bool perf_cpu_map__has(const struct perf_cpu_map *map, struct perf_c
3131
(idx) < perf_cpu_map__nr(cpus); \
3232
(idx)++, (cpu) = perf_cpu_map__cpu(cpus, idx))
3333

34+
#define perf_cpu_map__for_each_idx(idx, cpus) \
35+
for ((idx) = 0; (idx) < perf_cpu_map__nr(cpus); (idx)++)
36+
3437
#endif /* __LIBPERF_CPUMAP_H */

tools/lib/perf/include/perf/evsel.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ LIBPERF_API int perf_evsel__read(struct perf_evsel *evsel, int cpu_map_idx, int
3636
struct perf_counts_values *count);
3737
LIBPERF_API int perf_evsel__enable(struct perf_evsel *evsel);
3838
LIBPERF_API int perf_evsel__enable_cpu(struct perf_evsel *evsel, int cpu_map_idx);
39+
LIBPERF_API int perf_evsel__enable_thread(struct perf_evsel *evsel, int thread);
3940
LIBPERF_API int perf_evsel__disable(struct perf_evsel *evsel);
4041
LIBPERF_API int perf_evsel__disable_cpu(struct perf_evsel *evsel, int cpu_map_idx);
4142
LIBPERF_API struct perf_cpu_map *perf_evsel__cpus(struct perf_evsel *evsel);

tools/lib/perf/lib.c

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,26 @@ ssize_t readn(int fd, void *buf, size_t n)
3838
return ion(true, fd, buf, n);
3939
}
4040

41+
/*
 * Read exactly 'n' bytes from 'fd' starting at file offset 'offs',
 * retrying on EINTR and on short reads.  The file offset of 'fd' itself
 * is not modified (pread() semantics).
 *
 * Returns 'n' on success, 0 on premature EOF, or -1 (with errno set) on
 * error, mirroring readn()/writen() above.
 */
ssize_t preadn(int fd, void *buf, size_t n, off_t offs)
{
	char *p = buf;	/* arithmetic on 'void *' is a GCC extension, not ISO C */
	size_t left = n;

	while (left) {
		ssize_t ret = pread(fd, p, left, offs);

		if (ret < 0 && errno == EINTR)
			continue;
		if (ret <= 0)
			return ret;

		left -= ret;
		p += ret;
		offs += ret;
	}

	return n;
}
60+
4161
/*
4262
* Write exactly 'n' bytes or return an error.
4363
*/

tools/perf/.gitignore

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ perf.data
1919
perf.data.old
2020
output.svg
2121
perf-archive
22-
perf-with-kcore
2322
perf-iostat
2423
tags
2524
TAGS

tools/perf/Documentation/perf-annotate.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,11 @@ include::itrace.txt[]
147147
The period/hits keywords set the base the percentage is computed
148148
on - the samples period or the number of samples (hits).
149149

150+
--percent-limit::
151+
Do not show functions which have an overhead under that percent on
152+
stdio or stdio2 (Default: 0). Note that this is about selection of
153+
functions to display, not about lines within the function.
154+
150155
SEE ALSO
151156
--------
152157
linkperf:perf-record[1], linkperf:perf-report[1]
Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
perf-arm-spe(1)
2+
================
3+
4+
NAME
5+
----
6+
perf-arm-spe - Support for Arm Statistical Profiling Extension within Perf tools
7+
8+
SYNOPSIS
9+
--------
10+
[verse]
11+
'perf record' -e arm_spe//
12+
13+
DESCRIPTION
14+
-----------
15+
16+
The SPE (Statistical Profiling Extension) feature provides accurate attribution of latencies and
17+
events down to individual instructions. Rather than being interrupt-driven, it picks an
18+
instruction to sample and then captures data for it during execution. Data includes execution time
19+
in cycles. For loads and stores it also includes data address, cache miss events, and data origin.
20+
21+
The sampling has 5 stages:
22+
23+
1. Choose an operation
24+
2. Collect data about the operation
25+
3. Optionally discard the record based on a filter
26+
4. Write the record to memory
27+
5. Interrupt when the buffer is full
28+
29+
Choose an operation
30+
~~~~~~~~~~~~~~~~~~~
31+
32+
This is chosen from a sample population, for SPE this is an IMPLEMENTATION DEFINED choice of all
33+
architectural instructions or all micro-ops. Sampling happens at a programmable interval. The
34+
architecture provides a mechanism for the SPE driver to infer the minimum interval at which it should
35+
sample. This minimum interval is used by the driver if no interval is specified. A pseudo-random
36+
perturbation is also added to the sampling interval by default.
37+
38+
Collect data about the operation
39+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
40+
41+
Program counter, PMU events, timings and data addresses related to the operation are recorded.
42+
Sampling ensures that only one sampled operation is in flight at a time.
43+
44+
Optionally discard the record based on a filter
45+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
46+
47+
Based on programmable criteria, choose whether to keep the record or discard it. If the record is
48+
discarded then the flow stops here for this sample.
49+
50+
Write the record to memory
51+
~~~~~~~~~~~~~~~~~~~~~~~~~~
52+
53+
The record is appended to a memory buffer
54+
55+
Interrupt when the buffer is full
56+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
57+
58+
When the buffer fills, an interrupt is sent and the driver signals Perf to collect the records.
59+
Perf saves the raw data in the perf.data file.
60+
61+
Opening the file
62+
----------------
63+
64+
Up until this point no decoding of the SPE data was done by either the kernel or Perf. Only when the
65+
recorded file is opened with 'perf report' or 'perf script' does the decoding happen. When decoding
66+
the data, Perf generates "synthetic samples" as if these were generated at the time of the
67+
recording. These samples are the same as if normal sampling was done by Perf without using SPE,
68+
although they may have more attributes associated with them. For example a normal sample may have
69+
just the instruction pointer, but an SPE sample can have data addresses and latency attributes.
70+
71+
Why Sampling?
72+
-------------
73+
74+
- Sampling, rather than tracing, cuts down the profiling problem to something more manageable for
75+
hardware. Only one sampled operation is in flight at a time.
76+
77+
- Allows precise attribution data, including: Full PC of instruction, data virtual and physical
78+
addresses.
79+
80+
- Allows correlation between an instruction and events, such as TLB and cache miss. (Data source
81+
indicates which particular cache was hit, but the meaning is implementation defined because
82+
different implementations can have different cache configurations.)
83+
84+
However, SPE does not provide any call-graph information, and relies on statistical methods.
85+
86+
Collisions
87+
----------
88+
89+
When an operation is sampled while a previous sampled operation has not finished, a collision
90+
occurs. The new sample is dropped. Collisions affect the integrity of the data, so the sample rate
91+
should be set to avoid collisions.
92+
93+
The 'sample_collision' PMU event can be used to determine the number of lost samples. Note that this
count is based on collisions _before_ filtering occurs, so it cannot be used as an exact
95+
number for samples dropped that would have made it through the filter, but can be a rough
96+
guide.
97+
98+
The effect of microarchitectural sampling
99+
-----------------------------------------
100+
101+
If an implementation samples micro-operations instead of instructions, the results of sampling must
102+
be weighted accordingly.
103+
104+
For example, if a given instruction A is always converted into two micro-operations, A0 and A1, it
105+
becomes twice as likely to appear in the sample population.
106+
107+
The coarse effect of conversions, and, if applicable, sampling of speculative operations, can be
108+
estimated from the 'sample_pop' and 'inst_retired' PMU events.
109+
110+
Kernel Requirements
111+
-------------------
112+
113+
The ARM_SPE_PMU config must be set to build as either a module or statically.
114+
115+
Depending on CPU model, the kernel may need to be booted with page table isolation disabled
116+
(kpti=off). If KPTI needs to be disabled, this will fail with a console message "profiling buffer
117+
inaccessible. Try passing 'kpti=off' on the kernel command line".
118+
119+
Capturing SPE with perf command-line tools
120+
------------------------------------------
121+
122+
You can record a session with SPE samples:
123+
124+
perf record -e arm_spe// -- ./mybench
125+
126+
The sample period is set from the -c option, and because the minimum interval is used by default
127+
it's recommended to set this to a higher value. The value is written to PMSIRR.INTERVAL.
128+
129+
Config parameters
130+
~~~~~~~~~~~~~~~~~
131+
132+
These are placed between the // in the event and comma separated. For example '-e
133+
arm_spe/load_filter=1,min_latency=10/'
134+
135+
branch_filter=1 - collect branches only (PMSFCR.B)
136+
event_filter=<mask> - filter on specific events (PMSEVFR) - see bitfield description below
137+
jitter=1 - use jitter to avoid resonance when sampling (PMSIRR.RND)
138+
load_filter=1 - collect loads only (PMSFCR.LD)
139+
min_latency=<n> - collect only samples with this latency or higher* (PMSLATFR)
140+
pa_enable=1 - collect physical address (as well as VA) of loads/stores (PMSCR.PA) - requires privilege
141+
pct_enable=1 - collect physical timestamp instead of virtual timestamp (PMSCR.PCT) - requires privilege
142+
store_filter=1 - collect stores only (PMSFCR.ST)
143+
ts_enable=1 - enable timestamping with value of generic timer (PMSCR.TS)
144+
145+
+++*+++ Latency is the total latency from the point at which sampling started on that instruction, rather
146+
than only the execution latency.
147+
148+
Only some events can be filtered on; these include:
149+
150+
bit 1 - instruction retired (i.e. omit speculative instructions)
151+
bit 3 - L1D refill
152+
bit 5 - TLB refill
153+
bit 7 - mispredict
154+
bit 11 - misaligned access
155+
156+
So to sample just retired instructions:
157+
158+
perf record -e arm_spe/event_filter=2/ -- ./mybench
159+
160+
or just mispredicted branches:
161+
162+
perf record -e arm_spe/event_filter=0x80/ -- ./mybench
163+
164+
Viewing the data
165+
~~~~~~~~~~~~~~~~~
166+
167+
By default perf report and perf script will assign samples to separate groups depending on the
168+
attributes/events of the SPE record. Because instructions can have multiple events associated with
169+
them, the samples in these groups are not necessarily unique. For example perf report shows these
170+
groups:
171+
172+
Available samples
173+
0 arm_spe//
174+
0 dummy:u
175+
21 l1d-miss
176+
897 l1d-access
177+
5 llc-miss
178+
7 llc-access
179+
2 tlb-miss
180+
1K tlb-access
181+
36 branch-miss
182+
0 remote-access
183+
900 memory
184+
185+
The arm_spe// and dummy:u events are implementation details and are expected to be empty.
186+
187+
To get a full list of unique samples that are not sorted into groups, set the itrace option to
188+
generate 'instruction' samples. The period option is also taken into account, so set it to 1
189+
instruction unless you want to further downsample the already sampled SPE data:
190+
191+
perf report --itrace=i1i
192+
193+
Memory access details are also stored on the samples and this can be viewed with:
194+
195+
perf report --mem-mode
196+
197+
Common errors
198+
~~~~~~~~~~~~~
199+
200+
- "Cannot find PMU `arm_spe'. Missing kernel support?"
201+
202+
Module not built or loaded, KPTI not disabled (see above), or running on a VM
203+
204+
- "Arm SPE CONTEXT packets not found in the traces."
205+
206+
Root privilege is required to collect context packets. But these only increase the accuracy of
207+
assigning PIDs to kernel samples. For userspace sampling this can be ignored.
208+
209+
- Excessively large perf.data file size
210+
211+
Increase sampling interval (see above)
212+
213+
214+
SEE ALSO
215+
--------
216+
217+
linkperf:perf-record[1], linkperf:perf-script[1], linkperf:perf-report[1],
218+
linkperf:perf-inject[1]

0 commit comments

Comments
 (0)