@@ -50,7 +50,22 @@ struct cs_etm_auxtrace {
5050 struct thread * unknown_thread ;
5151 struct perf_tsc_conversion tc ;
5252
53+ /*
54+ * Timeless has no timestamps in the trace so overlapping mmap lookups
55+ * are less accurate but produce smaller trace data. We use context IDs
56+ * in the trace instead of matching timestamps with fork records so
57+ * they're not really needed in the general case. Overlapping mmaps
58+ * happen in cases like between a fork and an exec.
59+ */
5360 bool timeless_decoding ;
61+
62+ /*
63+ * Per-thread ignores the trace channel ID and instead assumes that
64+ * everything in a buffer comes from the same process regardless of
65+ * which CPU it ran on. It also implies no context IDs so the TID is
66+ * taken from the auxtrace buffer.
67+ */
68+ bool per_thread_decoding ;
5469 bool snapshot_mode ;
5570 bool data_queued ;
5671 bool has_virtual_ts ; /* Virtual/Kernel timestamps in the trace. */
@@ -98,7 +113,7 @@ struct cs_etm_queue {
98113/* RB tree for quick conversion between traceID and metadata pointers */
99114static struct intlist * traceid_list ;
100115
101- static int cs_etm__process_queues (struct cs_etm_auxtrace * etm );
116+ static int cs_etm__process_timestamped_queues (struct cs_etm_auxtrace * etm );
102117static int cs_etm__process_timeless_queues (struct cs_etm_auxtrace * etm ,
103118 pid_t tid );
104119static int cs_etm__get_data_block (struct cs_etm_queue * etmq );
@@ -492,7 +507,7 @@ static struct cs_etm_traceid_queue
492507 struct cs_etm_traceid_queue * tidq , * * traceid_queues ;
493508 struct cs_etm_auxtrace * etm = etmq -> etm ;
494509
495- if (etm -> timeless_decoding )
510+ if (etm -> per_thread_decoding )
496511 trace_chan_id = CS_ETM_PER_THREAD_TRACEID ;
497512
498513 traceid_queues_list = etmq -> traceid_queues_list ;
@@ -731,10 +746,15 @@ static int cs_etm__flush_events(struct perf_session *session,
731746 if (!tool -> ordered_events )
732747 return - EINVAL ;
733748
734- if (etm -> timeless_decoding )
749+ if (etm -> timeless_decoding ) {
750+ /*
751+ * Pass tid = -1 to process all queues. But likely they will have
752+ * already been processed on PERF_RECORD_EXIT anyway.
753+ */
735754 return cs_etm__process_timeless_queues (etm , -1 );
755+ }
736756
737- return cs_etm__process_queues (etm );
757+ return cs_etm__process_timestamped_queues (etm );
738758}
739759
740760static void cs_etm__free_traceid_queues (struct cs_etm_queue * etmq )
@@ -1066,7 +1086,7 @@ static int cs_etm__queue_first_cs_timestamp(struct cs_etm_auxtrace *etm,
10661086 * chronological order.
10671087 *
10681088 * Note that packets decoded above are still in the traceID's packet
1069- * queue and will be processed in cs_etm__process_queues ().
1089+ * queue and will be processed in cs_etm__process_timestamped_queues ().
10701090 */
10711091 cs_queue_nr = TO_CS_QUEUE_NR (queue_nr , trace_chan_id );
10721092 ret = auxtrace_heap__add (& etm -> heap , cs_queue_nr , cs_timestamp );
@@ -1347,9 +1367,7 @@ static inline u64 cs_etm__resolve_sample_time(struct cs_etm_queue *etmq,
13471367 struct cs_etm_auxtrace * etm = etmq -> etm ;
13481368 struct cs_etm_packet_queue * packet_queue = & tidq -> packet_queue ;
13491369
1350- if (etm -> timeless_decoding )
1351- return 0 ;
1352- else if (etm -> has_virtual_ts )
1370+ if (!etm -> timeless_decoding && etm -> has_virtual_ts )
13531371 return packet_queue -> cs_timestamp ;
13541372 else
13551373 return etm -> latest_kernel_timestamp ;
@@ -2329,7 +2347,7 @@ static void cs_etm__clear_all_traceid_queues(struct cs_etm_queue *etmq)
23292347 }
23302348}
23312349
2332- static int cs_etm__run_decoder (struct cs_etm_queue * etmq )
2350+ static int cs_etm__run_per_thread_timeless_decoder (struct cs_etm_queue * etmq )
23332351{
23342352 int err = 0 ;
23352353 struct cs_etm_traceid_queue * tidq ;
@@ -2367,6 +2385,51 @@ static int cs_etm__run_decoder(struct cs_etm_queue *etmq)
23672385 return err ;
23682386}
23692387
2388+ static int cs_etm__run_per_cpu_timeless_decoder (struct cs_etm_queue * etmq )
2389+ {
2390+ int idx , err = 0 ;
2391+ struct cs_etm_traceid_queue * tidq ;
2392+ struct int_node * inode ;
2393+
2394+ /* Go through each buffer in the queue and decode them one by one */
2395+ while (1 ) {
2396+ err = cs_etm__get_data_block (etmq );
2397+ if (err <= 0 )
2398+ return err ;
2399+
2400+ /* Run trace decoder until buffer consumed or end of trace */
2401+ do {
2402+ err = cs_etm__decode_data_block (etmq );
2403+ if (err )
2404+ return err ;
2405+
2406+ /*
2407+ * cs_etm__run_per_thread_timeless_decoder() runs on a
2408+ * single traceID queue because each TID has a separate
2409+ * buffer. But here in per-cpu mode we need to iterate
2410+ * over each channel instead.
2411+ */
2412+ intlist__for_each_entry (inode ,
2413+ etmq -> traceid_queues_list ) {
2414+ idx = (int )(intptr_t )inode -> priv ;
2415+ tidq = etmq -> traceid_queues [idx ];
2416+ cs_etm__process_traceid_queue (etmq , tidq );
2417+ }
2418+ } while (etmq -> buf_len );
2419+
2420+ intlist__for_each_entry (inode , etmq -> traceid_queues_list ) {
2421+ idx = (int )(intptr_t )inode -> priv ;
2422+ tidq = etmq -> traceid_queues [idx ];
2423+ /* Flush any remaining branch stack entries */
2424+ err = cs_etm__end_block (etmq , tidq );
2425+ if (err )
2426+ return err ;
2427+ }
2428+ }
2429+
2430+ return err ;
2431+ }
2432+
23702433static int cs_etm__process_timeless_queues (struct cs_etm_auxtrace * etm ,
23712434 pid_t tid )
23722435{
@@ -2381,22 +2444,30 @@ static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
23812444 if (!etmq )
23822445 continue ;
23832446
2384- tidq = cs_etm__etmq_get_traceid_queue (etmq ,
2385- CS_ETM_PER_THREAD_TRACEID );
2447+ /*
2448+ * Per-cpu mode has contextIDs in the trace and the decoder
2449+ * calls cs_etm__set_pid_tid_cpu() automatically so no need
2450+ * to do this here
2451+ */
2452+ if (etm -> per_thread_decoding ) {
2453+ tidq = cs_etm__etmq_get_traceid_queue (
2454+ etmq , CS_ETM_PER_THREAD_TRACEID );
23862455
2387- if (!tidq )
2388- continue ;
2456+ if (!tidq )
2457+ continue ;
23892458
2390- if ((tid == -1 ) || (tidq -> tid == tid )) {
2391- cs_etm__set_pid_tid_cpu (etm , tidq );
2392- cs_etm__run_decoder (etmq );
2393- }
2459+ if ((tid == -1 ) || (tidq -> tid == tid )) {
2460+ cs_etm__set_pid_tid_cpu (etm , tidq );
2461+ cs_etm__run_per_thread_timeless_decoder (etmq );
2462+ }
2463+ } else
2464+ cs_etm__run_per_cpu_timeless_decoder (etmq );
23942465 }
23952466
23962467 return 0 ;
23972468}
23982469
2399- static int cs_etm__process_queues (struct cs_etm_auxtrace * etm )
2470+ static int cs_etm__process_timestamped_queues (struct cs_etm_auxtrace * etm )
24002471{
24012472 int ret = 0 ;
24022473 unsigned int cs_queue_nr , queue_nr , i ;
@@ -2573,7 +2644,6 @@ static int cs_etm__process_event(struct perf_session *session,
25732644 struct perf_sample * sample ,
25742645 struct perf_tool * tool )
25752646{
2576- u64 sample_kernel_timestamp ;
25772647 struct cs_etm_auxtrace * etm = container_of (session -> auxtrace ,
25782648 struct cs_etm_auxtrace ,
25792649 auxtrace );
@@ -2586,33 +2656,39 @@ static int cs_etm__process_event(struct perf_session *session,
25862656 return - EINVAL ;
25872657 }
25882658
2589- if ( sample -> time && ( sample -> time != ( u64 ) - 1 ))
2590- sample_kernel_timestamp = sample -> time ;
2591- else
2592- sample_kernel_timestamp = 0 ;
2593-
2594- /*
2595- * Don't wait for cs_etm__flush_events() in per-thread/timeless mode to start the decode. We
2596- * need the tid of the PERF_RECORD_EXIT event to assign to the synthesised samples because
2597- * ETM_OPT_CTXTID is not enabled .
2598- */
2599- if (etm -> timeless_decoding &&
2600- event -> header . type == PERF_RECORD_EXIT )
2601- return cs_etm__process_timeless_queues ( etm ,
2602- event -> fork . tid ) ;
2659+ switch ( event -> header . type ) {
2660+ case PERF_RECORD_EXIT :
2661+ /*
2662+ * Don't need to wait for cs_etm__flush_events() in per-thread mode to
2663+ * start the decode because we know there will be no more trace from
2664+ * this thread. All this does is emit samples earlier than waiting for
2665+ * the flush in other modes, but with timestamps it makes sense to wait
2666+ * for flush so that events from different threads are interleaved
2667+ * properly .
2668+ */
2669+ if (etm -> per_thread_decoding && etm -> timeless_decoding )
2670+ return cs_etm__process_timeless_queues ( etm ,
2671+ event -> fork . tid );
2672+ break ;
26032673
2604- if ( event -> header . type == PERF_RECORD_ITRACE_START )
2674+ case PERF_RECORD_ITRACE_START :
26052675 return cs_etm__process_itrace_start (etm , event );
2606- else if (event -> header .type == PERF_RECORD_SWITCH_CPU_WIDE )
2676+
2677+ case PERF_RECORD_SWITCH_CPU_WIDE :
26072678 return cs_etm__process_switch_cpu_wide (etm , event );
26082679
2609- if (! etm -> timeless_decoding && event -> header . type == PERF_RECORD_AUX ) {
2680+ case PERF_RECORD_AUX :
26102681 /*
26112682 * Record the latest kernel timestamp available in the header
26122683 * for samples so that synthesised samples occur from this point
26132684 * onwards.
26142685 */
2615- etm -> latest_kernel_timestamp = sample_kernel_timestamp ;
2686+ if (sample -> time && (sample -> time != (u64 )- 1 ))
2687+ etm -> latest_kernel_timestamp = sample -> time ;
2688+ break ;
2689+
2690+ default :
2691+ break ;
26162692 }
26172693
26182694 return 0 ;
@@ -2821,10 +2897,20 @@ static int cs_etm__queue_aux_fragment(struct perf_session *session, off_t file_o
28212897 * Return 'not found' if mismatch.
28222898 */
28232899 if (auxtrace_event -> cpu == (__u32 ) - 1 ) {
2900+ etm -> per_thread_decoding = true;
28242901 if (auxtrace_event -> tid != sample -> tid )
28252902 return 1 ;
2826- } else if (auxtrace_event -> cpu != sample -> cpu )
2903+ } else if (auxtrace_event -> cpu != sample -> cpu ) {
2904+ if (etm -> per_thread_decoding ) {
2905+ /*
2906+ * Found a per-cpu buffer after a per-thread one was
2907+ * already found
2908+ */
2909+ pr_err ("CS ETM: Inconsistent per-thread/per-cpu mode.\n" );
2910+ return - EINVAL ;
2911+ }
28272912 return 1 ;
2913+ }
28282914
28292915 if (aux_event -> flags & PERF_AUX_FLAG_OVERWRITE ) {
28302916 /*
0 commit comments