Skip to content

Commit d1efa4a

Browse files
James-A-Clark authored and acmel committed
perf cs-etm: Add separate decode paths for timeless and per-thread modes
Timeless and per-thread are orthogonal concepts that are currently treated as if they are the same (per-thread == timeless). This breaks when you modify the command line or itrace options to something that the current logic doesn't expect. For example: # Force timeless with Z --itrace=Zi10i # Or inconsistent record options -e cs_etm/timestamp=1/ --per-thread Adding Z for decoding in per-cpu mode is particularly bad because in per-thread mode trace channel IDs are discarded and all assumed to be 0, which would mix trace from different CPUs in per-cpu mode. Although the results might not be perfect in all scenarios, if the user requests no timestamps, it should still be possible to decode in either mode. Especially if the relative times of samples in different processes aren't interesting, quite a bit of space can be saved by turning off timestamps in per-cpu mode. Signed-off-by: James Clark <james.clark@arm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Denis Nikitin <denik@google.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.g.garry@oracle.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Mathieu Poirier <mathieu.poirier@linaro.org> Cc: Mike Leach <mike.leach@linaro.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Suzuki Poulouse <suzuki.poulose@arm.com> Cc: Will Deacon <will@kernel.org> Cc: Yang Shi <shy828301@gmail.com> Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230424134748.228137-8-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
1 parent 1764ce0 commit d1efa4a

2 files changed

Lines changed: 148 additions & 38 deletions

File tree

tools/perf/tests/shell/test_arm_coresight.sh

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,8 @@ arm_cs_etm_system_wide_test() {
150150
echo "Recording trace with system wide mode"
151151
perf record -o ${perfdata} -e cs_etm// -a -- ls > /dev/null 2>&1
152152

153+
# System-wide mode should include perf samples so test for that
154+
# instead of ls
153155
perf_script_branch_samples perf &&
154156
perf_report_branch_samples perf &&
155157
perf_report_instruction_samples perf
@@ -182,7 +184,29 @@ arm_cs_etm_snapshot_test() {
182184
arm_cs_report "CoreSight snapshot testing" $err
183185
}
184186

187+
arm_cs_etm_basic_test() {
	# Record a trace of 'ls' with the given perf record options, then
	# verify that branch and instruction samples decode from the result.
	echo "Recording trace with '$*'"
	perf record -o ${perfdata} "$@" -- ls > /dev/null 2>&1

	perf_script_branch_samples ls &&
		perf_report_branch_samples ls &&
		perf_report_instruction_samples ls
	err=$?

	arm_cs_report "CoreSight basic testing with '$*'" $err
}
198+
185199
arm_cs_etm_traverse_path_test
arm_cs_etm_system_wide_test
arm_cs_etm_snapshot_test

# Exercise every combination of per-thread, system-wide (-a) and normal
# (per-cpu) recording, each both with and without trace timestamps.
arm_cs_etm_basic_test -e cs_etm/timestamp=0/ --per-thread
arm_cs_etm_basic_test -e cs_etm/timestamp=1/ --per-thread
arm_cs_etm_basic_test -e cs_etm/timestamp=0/ -a
arm_cs_etm_basic_test -e cs_etm/timestamp=1/ -a
arm_cs_etm_basic_test -e cs_etm/timestamp=0/
arm_cs_etm_basic_test -e cs_etm/timestamp=1/

exit $glb_err

tools/perf/util/cs-etm.c

Lines changed: 124 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,22 @@ struct cs_etm_auxtrace {
5050
struct thread *unknown_thread;
5151
struct perf_tsc_conversion tc;
5252

53+
/*
54+
* Timeless has no timestamps in the trace so overlapping mmap lookups
55+
* are less accurate but produces smaller trace data. We use context IDs
56+
* in the trace instead of matching timestamps with fork records so
57+
* they're not really needed in the general case. Overlapping mmaps
58+
* happen in cases like between a fork and an exec.
59+
*/
5360
bool timeless_decoding;
61+
62+
/*
63+
* Per-thread ignores the trace channel ID and instead assumes that
64+
* everything in a buffer comes from the same process regardless of
65+
* which CPU it ran on. It also implies no context IDs so the TID is
66+
* taken from the auxtrace buffer.
67+
*/
68+
bool per_thread_decoding;
5469
bool snapshot_mode;
5570
bool data_queued;
5671
bool has_virtual_ts; /* Virtual/Kernel timestamps in the trace. */
@@ -98,7 +113,7 @@ struct cs_etm_queue {
98113
/* RB tree for quick conversion between traceID and metadata pointers */
99114
static struct intlist *traceid_list;
100115

101-
static int cs_etm__process_queues(struct cs_etm_auxtrace *etm);
116+
static int cs_etm__process_timestamped_queues(struct cs_etm_auxtrace *etm);
102117
static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
103118
pid_t tid);
104119
static int cs_etm__get_data_block(struct cs_etm_queue *etmq);
@@ -492,7 +507,7 @@ static struct cs_etm_traceid_queue
492507
struct cs_etm_traceid_queue *tidq, **traceid_queues;
493508
struct cs_etm_auxtrace *etm = etmq->etm;
494509

495-
if (etm->timeless_decoding)
510+
if (etm->per_thread_decoding)
496511
trace_chan_id = CS_ETM_PER_THREAD_TRACEID;
497512

498513
traceid_queues_list = etmq->traceid_queues_list;
@@ -731,10 +746,15 @@ static int cs_etm__flush_events(struct perf_session *session,
731746
if (!tool->ordered_events)
732747
return -EINVAL;
733748

734-
if (etm->timeless_decoding)
749+
if (etm->timeless_decoding) {
750+
/*
751+
* Pass tid = -1 to process all queues. But likely they will have
752+
* already been processed on PERF_RECORD_EXIT anyway.
753+
*/
735754
return cs_etm__process_timeless_queues(etm, -1);
755+
}
736756

737-
return cs_etm__process_queues(etm);
757+
return cs_etm__process_timestamped_queues(etm);
738758
}
739759

740760
static void cs_etm__free_traceid_queues(struct cs_etm_queue *etmq)
@@ -1066,7 +1086,7 @@ static int cs_etm__queue_first_cs_timestamp(struct cs_etm_auxtrace *etm,
10661086
* chronological order.
10671087
*
10681088
* Note that packets decoded above are still in the traceID's packet
1069-
* queue and will be processed in cs_etm__process_queues().
1089+
* queue and will be processed in cs_etm__process_timestamped_queues().
10701090
*/
10711091
cs_queue_nr = TO_CS_QUEUE_NR(queue_nr, trace_chan_id);
10721092
ret = auxtrace_heap__add(&etm->heap, cs_queue_nr, cs_timestamp);
@@ -1347,9 +1367,7 @@ static inline u64 cs_etm__resolve_sample_time(struct cs_etm_queue *etmq,
13471367
struct cs_etm_auxtrace *etm = etmq->etm;
13481368
struct cs_etm_packet_queue *packet_queue = &tidq->packet_queue;
13491369

1350-
if (etm->timeless_decoding)
1351-
return 0;
1352-
else if (etm->has_virtual_ts)
1370+
if (!etm->timeless_decoding && etm->has_virtual_ts)
13531371
return packet_queue->cs_timestamp;
13541372
else
13551373
return etm->latest_kernel_timestamp;
@@ -2329,7 +2347,7 @@ static void cs_etm__clear_all_traceid_queues(struct cs_etm_queue *etmq)
23292347
}
23302348
}
23312349

2332-
static int cs_etm__run_decoder(struct cs_etm_queue *etmq)
2350+
static int cs_etm__run_per_thread_timeless_decoder(struct cs_etm_queue *etmq)
23332351
{
23342352
int err = 0;
23352353
struct cs_etm_traceid_queue *tidq;
@@ -2367,6 +2385,51 @@ static int cs_etm__run_decoder(struct cs_etm_queue *etmq)
23672385
return err;
23682386
}
23692387

2388+
static int cs_etm__run_per_cpu_timeless_decoder(struct cs_etm_queue *etmq)
2389+
{
2390+
int idx, err = 0;
2391+
struct cs_etm_traceid_queue *tidq;
2392+
struct int_node *inode;
2393+
2394+
/* Go through each buffer in the queue and decode them one by one */
2395+
while (1) {
2396+
err = cs_etm__get_data_block(etmq);
2397+
if (err <= 0)
2398+
return err;
2399+
2400+
/* Run trace decoder until buffer consumed or end of trace */
2401+
do {
2402+
err = cs_etm__decode_data_block(etmq);
2403+
if (err)
2404+
return err;
2405+
2406+
/*
2407+
* cs_etm__run_per_thread_timeless_decoder() runs on a
2408+
* single traceID queue because each TID has a separate
2409+
* buffer. But here in per-cpu mode we need to iterate
2410+
* over each channel instead.
2411+
*/
2412+
intlist__for_each_entry(inode,
2413+
etmq->traceid_queues_list) {
2414+
idx = (int)(intptr_t)inode->priv;
2415+
tidq = etmq->traceid_queues[idx];
2416+
cs_etm__process_traceid_queue(etmq, tidq);
2417+
}
2418+
} while (etmq->buf_len);
2419+
2420+
intlist__for_each_entry(inode, etmq->traceid_queues_list) {
2421+
idx = (int)(intptr_t)inode->priv;
2422+
tidq = etmq->traceid_queues[idx];
2423+
/* Flush any remaining branch stack entries */
2424+
err = cs_etm__end_block(etmq, tidq);
2425+
if (err)
2426+
return err;
2427+
}
2428+
}
2429+
2430+
return err;
2431+
}
2432+
23702433
static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
23712434
pid_t tid)
23722435
{
@@ -2381,22 +2444,30 @@ static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
23812444
if (!etmq)
23822445
continue;
23832446

2384-
tidq = cs_etm__etmq_get_traceid_queue(etmq,
2385-
CS_ETM_PER_THREAD_TRACEID);
2447+
/*
2448+
* Per-cpu mode has contextIDs in the trace and the decoder
2449+
* calls cs_etm__set_pid_tid_cpu() automatically so no need
2450+
* to do this here
2451+
*/
2452+
if (etm->per_thread_decoding) {
2453+
tidq = cs_etm__etmq_get_traceid_queue(
2454+
etmq, CS_ETM_PER_THREAD_TRACEID);
23862455

2387-
if (!tidq)
2388-
continue;
2456+
if (!tidq)
2457+
continue;
23892458

2390-
if ((tid == -1) || (tidq->tid == tid)) {
2391-
cs_etm__set_pid_tid_cpu(etm, tidq);
2392-
cs_etm__run_decoder(etmq);
2393-
}
2459+
if ((tid == -1) || (tidq->tid == tid)) {
2460+
cs_etm__set_pid_tid_cpu(etm, tidq);
2461+
cs_etm__run_per_thread_timeless_decoder(etmq);
2462+
}
2463+
} else
2464+
cs_etm__run_per_cpu_timeless_decoder(etmq);
23942465
}
23952466

23962467
return 0;
23972468
}
23982469

2399-
static int cs_etm__process_queues(struct cs_etm_auxtrace *etm)
2470+
static int cs_etm__process_timestamped_queues(struct cs_etm_auxtrace *etm)
24002471
{
24012472
int ret = 0;
24022473
unsigned int cs_queue_nr, queue_nr, i;
@@ -2573,7 +2644,6 @@ static int cs_etm__process_event(struct perf_session *session,
25732644
struct perf_sample *sample,
25742645
struct perf_tool *tool)
25752646
{
2576-
u64 sample_kernel_timestamp;
25772647
struct cs_etm_auxtrace *etm = container_of(session->auxtrace,
25782648
struct cs_etm_auxtrace,
25792649
auxtrace);
@@ -2586,33 +2656,39 @@ static int cs_etm__process_event(struct perf_session *session,
25862656
return -EINVAL;
25872657
}
25882658

2589-
if (sample->time && (sample->time != (u64) -1))
2590-
sample_kernel_timestamp = sample->time;
2591-
else
2592-
sample_kernel_timestamp = 0;
2593-
2594-
/*
2595-
* Don't wait for cs_etm__flush_events() in per-thread/timeless mode to start the decode. We
2596-
* need the tid of the PERF_RECORD_EXIT event to assign to the synthesised samples because
2597-
* ETM_OPT_CTXTID is not enabled.
2598-
*/
2599-
if (etm->timeless_decoding &&
2600-
event->header.type == PERF_RECORD_EXIT)
2601-
return cs_etm__process_timeless_queues(etm,
2602-
event->fork.tid);
2659+
switch (event->header.type) {
2660+
case PERF_RECORD_EXIT:
2661+
/*
2662+
* Don't need to wait for cs_etm__flush_events() in per-thread mode to
2663+
* start the decode because we know there will be no more trace from
2664+
* this thread. All this does is emit samples earlier than waiting for
2665+
* the flush in other modes, but with timestamps it makes sense to wait
2666+
* for flush so that events from different threads are interleaved
2667+
* properly.
2668+
*/
2669+
if (etm->per_thread_decoding && etm->timeless_decoding)
2670+
return cs_etm__process_timeless_queues(etm,
2671+
event->fork.tid);
2672+
break;
26032673

2604-
if (event->header.type == PERF_RECORD_ITRACE_START)
2674+
case PERF_RECORD_ITRACE_START:
26052675
return cs_etm__process_itrace_start(etm, event);
2606-
else if (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE)
2676+
2677+
case PERF_RECORD_SWITCH_CPU_WIDE:
26072678
return cs_etm__process_switch_cpu_wide(etm, event);
26082679

2609-
if (!etm->timeless_decoding && event->header.type == PERF_RECORD_AUX) {
2680+
case PERF_RECORD_AUX:
26102681
/*
26112682
* Record the latest kernel timestamp available in the header
26122683
* for samples so that synthesised samples occur from this point
26132684
* onwards.
26142685
*/
2615-
etm->latest_kernel_timestamp = sample_kernel_timestamp;
2686+
if (sample->time && (sample->time != (u64)-1))
2687+
etm->latest_kernel_timestamp = sample->time;
2688+
break;
2689+
2690+
default:
2691+
break;
26162692
}
26172693

26182694
return 0;
@@ -2821,10 +2897,20 @@ static int cs_etm__queue_aux_fragment(struct perf_session *session, off_t file_o
28212897
* Return 'not found' if mismatch.
28222898
*/
28232899
if (auxtrace_event->cpu == (__u32) -1) {
2900+
etm->per_thread_decoding = true;
28242901
if (auxtrace_event->tid != sample->tid)
28252902
return 1;
2826-
} else if (auxtrace_event->cpu != sample->cpu)
2903+
} else if (auxtrace_event->cpu != sample->cpu) {
2904+
if (etm->per_thread_decoding) {
2905+
/*
2906+
* Found a per-cpu buffer after a per-thread one was
2907+
* already found
2908+
*/
2909+
pr_err("CS ETM: Inconsistent per-thread/per-cpu mode.\n");
2910+
return -EINVAL;
2911+
}
28272912
return 1;
2913+
}
28282914

28292915
if (aux_event->flags & PERF_AUX_FLAG_OVERWRITE) {
28302916
/*

0 commit comments

Comments
 (0)