Skip to content

Commit edc41a1

Browse files
namhyung authored and acmel committed
perf record: Enable off-cpu analysis with BPF
Add --off-cpu option to enable the off-cpu profiling with BPF. It'd use a bpf_output event and rename it to "offcpu-time". Samples will be synthesized at the end of the record session using data from a BPF map which contains the aggregated off-cpu time at context switches. So it needs root privilege to get the off-cpu profiling. Each sample will have a separate user stacktrace so it will skip kernel threads. The sample ip will be set from the stacktrace and other sample data will be updated accordingly. Currently it only handles some basic sample types. The sample timestamp is set to a dummy value just not to bother with other events during the sorting. So it has a very big initial value and increase it on processing each samples. Good thing is that it can be used together with regular profiling like cpu cycles. If you don't want to that, you can use a dummy event to enable off-cpu profiling only. Example output: $ sudo perf record --off-cpu perf bench sched messaging -l 1000 $ sudo perf report --stdio --call-graph=no # Total Lost Samples: 0 # # Samples: 41K of event 'cycles' # Event count (approx.): 42137343851 ... # Samples: 1K of event 'offcpu-time' # Event count (approx.): 587990831640 # # Children Self Command Shared Object Symbol # ........ ........ ............... .................. ......................... # 81.66% 0.00% sched-messaging libc-2.33.so [.] __libc_start_main 81.66% 0.00% sched-messaging perf [.] cmd_bench 81.66% 0.00% sched-messaging perf [.] main 81.66% 0.00% sched-messaging perf [.] run_builtin 81.43% 0.00% sched-messaging perf [.] bench_sched_messaging 40.86% 40.86% sched-messaging libpthread-2.33.so [.] __read 37.66% 37.66% sched-messaging libpthread-2.33.so [.] __write 2.91% 2.91% sched-messaging libc-2.33.so [.] __poll ... As you can see it spent most of off-cpu time in read and write in bench_sched_messaging(). The --call-graph=no was added just to make the output concise here. 
It uses perf hooks facility to control BPF program during the record session rather than adding new BPF/off-cpu specific calls. Signed-off-by: Namhyung Kim <namhyung@kernel.org> Acked-by: Ian Rogers <irogers@google.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Blake Jones <blakejones@google.com> Cc: Hao Luo <haoluo@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Milian Wolff <milian.wolff@kdab.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Song Liu <songliubraving@fb.com> Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20220518224725.742882-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
1 parent 303ead4 commit edc41a1

7 files changed

Lines changed: 404 additions & 0 deletions

File tree

tools/perf/Documentation/perf-record.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -758,6 +758,16 @@ include::intel-hybrid.txt[]
758758
If the URLs is not specified, the value of DEBUGINFOD_URLS
759759
system environment variable is used.
760760

761+
--off-cpu::
762+
Enable off-cpu profiling with BPF. The BPF program will collect
763+
task scheduling information with (user) stacktrace and save them
764+
as sample data of a software event named "offcpu-time". The
765+
sample period will have the time the task slept in nanoseconds.
766+
767+
Note that BPF can collect stack traces using frame pointer ("fp")
768+
only, as of now. So the applications built without the frame
769+
pointer might see bogus addresses.
770+
761771
SEE ALSO
762772
--------
763773
linkperf:perf-stat[1], linkperf:perf-list[1], linkperf:perf-intel-pt[1]

tools/perf/Makefile.perf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1038,6 +1038,7 @@ SKEL_TMP_OUT := $(abspath $(SKEL_OUT)/.tmp)
10381038
SKELETONS := $(SKEL_OUT)/bpf_prog_profiler.skel.h
10391039
SKELETONS += $(SKEL_OUT)/bperf_leader.skel.h $(SKEL_OUT)/bperf_follower.skel.h
10401040
SKELETONS += $(SKEL_OUT)/bperf_cgroup.skel.h $(SKEL_OUT)/func_latency.skel.h
1041+
SKELETONS += $(SKEL_OUT)/off_cpu.skel.h
10411042

10421043
$(SKEL_TMP_OUT) $(LIBBPF_OUTPUT):
10431044
$(Q)$(MKDIR) -p $@

tools/perf/builtin-record.c

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
#include "util/clockid.h"
5050
#include "util/pmu-hybrid.h"
5151
#include "util/evlist-hybrid.h"
52+
#include "util/off_cpu.h"
5253
#include "asm/bug.h"
5354
#include "perf.h"
5455
#include "cputopo.h"
@@ -162,6 +163,7 @@ struct record {
162163
bool buildid_mmap;
163164
bool timestamp_filename;
164165
bool timestamp_boundary;
166+
bool off_cpu;
165167
struct switch_output switch_output;
166168
unsigned long long samples;
167169
unsigned long output_max_size; /* = 0: unlimited */
@@ -888,6 +890,11 @@ static int record__config_text_poke(struct evlist *evlist)
888890
return 0;
889891
}
890892

893+
/* Set up off-cpu profiling: adds the "offcpu-time" event and loads the BPF skeleton. */
static int record__config_off_cpu(struct record *rec)
894+
{
895+
return off_cpu_prepare(rec->evlist);
896+
}
897+
891898
static bool record__kcore_readable(struct machine *machine)
892899
{
893900
char kcore[PATH_MAX];
@@ -2591,6 +2598,9 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
25912598
} else
25922599
status = err;
25932600

2601+
if (rec->off_cpu)
2602+
rec->bytes_written += off_cpu_write(rec->session);
2603+
25942604
record__synthesize(rec, true);
25952605
/* this will be recalculated during process_buildids() */
25962606
rec->samples = 0;
@@ -3315,6 +3325,7 @@ static struct option __record_options[] = {
33153325
OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
33163326
"write collected trace data into several data files using parallel threads",
33173327
record__parse_threads),
3328+
OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
33183329
OPT_END()
33193330
};
33203331

@@ -3734,6 +3745,12 @@ int cmd_record(int argc, const char **argv)
37343745
set_nobuild('\0', "vmlinux", true);
37353746
# undef set_nobuild
37363747
# undef REASON
3748+
#endif
3749+
3750+
#ifndef HAVE_BPF_SKEL
3751+
# define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3752+
set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3753+
# undef set_nobuild
37373754
#endif
37383755

37393756
rec->opts.affinity = PERF_AFFINITY_SYS;
@@ -3972,6 +3989,14 @@ int cmd_record(int argc, const char **argv)
39723989
}
39733990
}
39743991

3992+
if (rec->off_cpu) {
3993+
err = record__config_off_cpu(rec);
3994+
if (err) {
3995+
pr_err("record__config_off_cpu failed, error %d\n", err);
3996+
goto out;
3997+
}
3998+
}
3999+
39754000
if (record_opts__config(&rec->opts)) {
39764001
err = -EINVAL;
39774002
goto out;

tools/perf/util/Build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ perf-$(CONFIG_LIBBPF) += bpf_map.o
147147
perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter.o
148148
perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter_cgroup.o
149149
perf-$(CONFIG_PERF_BPF_SKEL) += bpf_ftrace.o
150+
perf-$(CONFIG_PERF_BPF_SKEL) += bpf_off_cpu.o
150151
perf-$(CONFIG_BPF_PROLOGUE) += bpf-prologue.o
151152
perf-$(CONFIG_LIBELF) += symbol-elf.o
152153
perf-$(CONFIG_LIBELF) += probe-file.o

tools/perf/util/bpf_off_cpu.c

Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
// SPDX-License-Identifier: GPL-2.0
2+
#include "util/bpf_counter.h"
3+
#include "util/debug.h"
4+
#include "util/evsel.h"
5+
#include "util/evlist.h"
6+
#include "util/off_cpu.h"
7+
#include "util/perf-hooks.h"
8+
#include "util/session.h"
9+
#include <bpf/bpf.h>
10+
11+
#include "bpf_skel/off_cpu.skel.h"
12+
13+
#define MAX_STACKS 32
14+
/* we don't need actual timestamp, just want to put the samples at last */
15+
#define OFF_CPU_TIMESTAMP (~0ull << 32)
16+
17+
static struct off_cpu_bpf *skel;
18+
19+
/* Key of the BPF map aggregating off-cpu time: one entry per task and stack. */
struct off_cpu_key {
20+
u32 pid;
21+
u32 tgid;
22+
u32 stack_id; /* index into the "stacks" BPF map (see off_cpu_write) */
23+
u32 state; /* presumably the task state at switch-out -- confirm against BPF side */
24+
};
25+
26+
/*
 * Buffer for one synthesized PERF_RECORD_SAMPLE: a perf_event_header
 * overlaid on an array of u64 slots holding the sample payload.
 */
union off_cpu_data {
27+
struct perf_event_header hdr;
28+
u64 array[1024 / sizeof(u64)];
29+
};
30+
31+
static int off_cpu_config(struct evlist *evlist)
32+
{
33+
struct evsel *evsel;
34+
struct perf_event_attr attr = {
35+
.type = PERF_TYPE_SOFTWARE,
36+
.config = PERF_COUNT_SW_BPF_OUTPUT,
37+
.size = sizeof(attr), /* to capture ABI version */
38+
};
39+
char *evname = strdup(OFFCPU_EVENT);
40+
41+
if (evname == NULL)
42+
return -ENOMEM;
43+
44+
evsel = evsel__new(&attr);
45+
if (!evsel) {
46+
free(evname);
47+
return -ENOMEM;
48+
}
49+
50+
evsel->core.attr.freq = 1;
51+
evsel->core.attr.sample_period = 1;
52+
/* off-cpu analysis depends on stack trace */
53+
evsel->core.attr.sample_type = PERF_SAMPLE_CALLCHAIN;
54+
55+
evlist__add(evlist, evsel);
56+
57+
free(evsel->name);
58+
evsel->name = evname;
59+
60+
return 0;
61+
}
62+
63+
/* "record_start" perf hook: tell the BPF program to start aggregating. */
static void off_cpu_start(void *arg __maybe_unused)
{
	skel->bss->enabled = 1;
}
67+
68+
/* "record_end" perf hook: stop aggregation and tear down the skeleton. */
static void off_cpu_finish(void *arg __maybe_unused)
{
	skel->bss->enabled = 0;
	off_cpu_bpf__destroy(skel);
}
73+
74+
int off_cpu_prepare(struct evlist *evlist)
75+
{
76+
int err;
77+
78+
if (off_cpu_config(evlist) < 0) {
79+
pr_err("Failed to config off-cpu BPF event\n");
80+
return -1;
81+
}
82+
83+
set_max_rlimit();
84+
85+
skel = off_cpu_bpf__open_and_load();
86+
if (!skel) {
87+
pr_err("Failed to open off-cpu BPF skeleton\n");
88+
return -1;
89+
}
90+
91+
err = off_cpu_bpf__attach(skel);
92+
if (err) {
93+
pr_err("Failed to attach off-cpu BPF skeleton\n");
94+
goto out;
95+
}
96+
97+
if (perf_hooks__set_hook("record_start", off_cpu_start, NULL) ||
98+
perf_hooks__set_hook("record_end", off_cpu_finish, NULL)) {
99+
pr_err("Failed to attach off-cpu skeleton\n");
100+
goto out;
101+
}
102+
103+
return 0;
104+
105+
out:
106+
off_cpu_bpf__destroy(skel);
107+
return -1;
108+
}
109+
110+
int off_cpu_write(struct perf_session *session)
111+
{
112+
int bytes = 0, size;
113+
int fd, stack;
114+
u64 sample_type, val, sid = 0;
115+
struct evsel *evsel;
116+
struct perf_data_file *file = &session->data->file;
117+
struct off_cpu_key prev, key;
118+
union off_cpu_data data = {
119+
.hdr = {
120+
.type = PERF_RECORD_SAMPLE,
121+
.misc = PERF_RECORD_MISC_USER,
122+
},
123+
};
124+
u64 tstamp = OFF_CPU_TIMESTAMP;
125+
126+
skel->bss->enabled = 0;
127+
128+
evsel = evlist__find_evsel_by_str(session->evlist, OFFCPU_EVENT);
129+
if (evsel == NULL) {
130+
pr_err("%s evsel not found\n", OFFCPU_EVENT);
131+
return 0;
132+
}
133+
134+
sample_type = evsel->core.attr.sample_type;
135+
136+
if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) {
137+
if (evsel->core.id)
138+
sid = evsel->core.id[0];
139+
}
140+
141+
fd = bpf_map__fd(skel->maps.off_cpu);
142+
stack = bpf_map__fd(skel->maps.stacks);
143+
memset(&prev, 0, sizeof(prev));
144+
145+
while (!bpf_map_get_next_key(fd, &prev, &key)) {
146+
int n = 1; /* start from perf_event_header */
147+
int ip_pos = -1;
148+
149+
bpf_map_lookup_elem(fd, &key, &val);
150+
151+
if (sample_type & PERF_SAMPLE_IDENTIFIER)
152+
data.array[n++] = sid;
153+
if (sample_type & PERF_SAMPLE_IP) {
154+
ip_pos = n;
155+
data.array[n++] = 0; /* will be updated */
156+
}
157+
if (sample_type & PERF_SAMPLE_TID)
158+
data.array[n++] = (u64)key.pid << 32 | key.tgid;
159+
if (sample_type & PERF_SAMPLE_TIME)
160+
data.array[n++] = tstamp;
161+
if (sample_type & PERF_SAMPLE_ID)
162+
data.array[n++] = sid;
163+
if (sample_type & PERF_SAMPLE_CPU)
164+
data.array[n++] = 0;
165+
if (sample_type & PERF_SAMPLE_PERIOD)
166+
data.array[n++] = val;
167+
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
168+
int len = 0;
169+
170+
/* data.array[n] is callchain->nr (updated later) */
171+
data.array[n + 1] = PERF_CONTEXT_USER;
172+
data.array[n + 2] = 0;
173+
174+
bpf_map_lookup_elem(stack, &key.stack_id, &data.array[n + 2]);
175+
while (data.array[n + 2 + len])
176+
len++;
177+
178+
/* update length of callchain */
179+
data.array[n] = len + 1;
180+
181+
/* update sample ip with the first callchain entry */
182+
if (ip_pos >= 0)
183+
data.array[ip_pos] = data.array[n + 2];
184+
185+
/* calculate sample callchain data array length */
186+
n += len + 2;
187+
}
188+
/* TODO: handle more sample types */
189+
190+
size = n * sizeof(u64);
191+
data.hdr.size = size;
192+
bytes += size;
193+
194+
if (perf_data_file__write(file, &data, size) < 0) {
195+
pr_err("failed to write perf data, error: %m\n");
196+
return bytes;
197+
}
198+
199+
prev = key;
200+
/* increase dummy timestamp to sort later samples */
201+
tstamp++;
202+
}
203+
return bytes;
204+
}

0 commit comments

Comments
 (0)