
Commit 5d95a2a

Author: Peter Zijlstra (authored and committed)
perf: Add context time freeze
Many of the context reschedule users are of the form:

  ctx_sched_out(.type = EVENT_TIME);
  ... modify context
  ctx_resched();

with the idea that the whole reschedule happens with a single
timestamp, rather than with each ctx_sched_out() advancing time and
ctx_sched_in() re-starting time, creating a non-atomic experience.

However, Kan noticed that since this completely stops time, it
actually loses a bit of time between the stop and the start. Worse,
now that we can do partial (per-PMU) reschedules, the PMUs that are
not scheduled out still observe the time glitch.

Replace this with:

  ctx_time_freeze();
  ... modify context
  ctx_resched();

with the assumption that this happens inside a perf_ctx_lock() /
perf_ctx_unlock() pair. The new ctx_time_freeze() updates time and
sets EVENT_FROZEN, and ensures EVENT_TIME and EVENT_FROZEN remain set;
this prevents perf_event_time_now() from observing a time wobble due
to not seeing EVENT_TIME for a little while. Additionally, it avoids
losing time between ctx_sched_out(EVENT_TIME) and ctx_sched_in(),
which would re-set the timestamp.

Reported-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20240807115550.250637571@infradead.org
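
The changelog's flag dance can be made concrete in isolation. Below is a
minimal user-space C sketch of the is_active transitions, under stated
assumptions: the ctx struct and the freeze()/sched_out()/unlock_cleanup()
helpers are simplified, hypothetical stand-ins for perf_event_context,
ctx_time_freeze(), ctx_sched_out() and __perf_ctx_unlock(); only the bit
manipulation is modeled, not the locking or the actual timekeeping.

  /*
   * Model of the EVENT_TIME/EVENT_FROZEN lifecycle: freeze inside the
   * lock, schedule out, and observe that EVENT_TIME never drops until
   * the final unlock-time cleanup.
   */
  #include <assert.h>
  #include <stdio.h>

  enum event_type_t {
          EVENT_FLEXIBLE    = 0x01,
          EVENT_PINNED      = 0x02,
          EVENT_TIME        = 0x04,
          EVENT_FROZEN      = 0x08,
          EVENT_ALL         = EVENT_FLEXIBLE | EVENT_PINNED,
          EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
  };

  struct ctx { int is_active; };

  /* like ctx_time_freeze(): keep EVENT_TIME visible, stop advancing it */
  static void freeze(struct ctx *ctx)
  {
          if (ctx->is_active & EVENT_TIME)
                  ctx->is_active |= EVENT_FROZEN;
  }

  /* like ctx_sched_out(): clear event bits, preserve TIME|FROZEN if frozen */
  static void sched_out(struct ctx *ctx, int event_type)
  {
          ctx->is_active &= ~event_type;
          if (!(ctx->is_active & EVENT_ALL)) {
                  if (ctx->is_active & EVENT_FROZEN)
                          ctx->is_active &= EVENT_TIME_FROZEN;
                  else
                          ctx->is_active = 0;
          }
  }

  /* like __perf_ctx_unlock(): drop FROZEN, or everything if nothing ran */
  static void unlock_cleanup(struct ctx *ctx)
  {
          if (ctx->is_active & EVENT_FROZEN) {
                  if (!(ctx->is_active & EVENT_ALL))
                          ctx->is_active = 0;
                  else
                          ctx->is_active &= ~EVENT_FROZEN;
          }
  }

  int main(void)
  {
          struct ctx ctx = { .is_active = EVENT_ALL | EVENT_TIME };

          freeze(&ctx);                         /* ctx_time_freeze()   */
          sched_out(&ctx, EVENT_ALL);           /* ... modify context  */
          assert(ctx.is_active & EVENT_TIME);   /* no hole for readers */

          unlock_cleanup(&ctx);                 /* perf_ctx_unlock()   */
          assert(ctx.is_active == 0);           /* nothing rescheduled */
          printf("EVENT_TIME stayed set across the reschedule window\n");
          return 0;
  }

The assertions restate the changelog's claim: a concurrent
perf_event_time_now()-style reader that checks EVENT_TIME never sees the
flag drop mid-reschedule; it is only cleared at unlock time, and only when
no events were scheduled back in.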
Parent: 558abc7

1 file changed: kernel/events/core.c (86 additions, 42 deletions)
@@ -155,20 +155,55 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info)
 	return data.ret;
 }
 
+enum event_type_t {
+	EVENT_FLEXIBLE		= 0x01,
+	EVENT_PINNED		= 0x02,
+	EVENT_TIME		= 0x04,
+	EVENT_FROZEN		= 0x08,
+	/* see ctx_resched() for details */
+	EVENT_CPU		= 0x10,
+	EVENT_CGROUP		= 0x20,
+
+	/* compound helpers */
+	EVENT_ALL		= EVENT_FLEXIBLE | EVENT_PINNED,
+	EVENT_TIME_FROZEN	= EVENT_TIME | EVENT_FROZEN,
+};
+
+static inline void __perf_ctx_lock(struct perf_event_context *ctx)
+{
+	raw_spin_lock(&ctx->lock);
+	WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN);
+}
+
 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
 			  struct perf_event_context *ctx)
 {
-	raw_spin_lock(&cpuctx->ctx.lock);
+	__perf_ctx_lock(&cpuctx->ctx);
 	if (ctx)
-		raw_spin_lock(&ctx->lock);
+		__perf_ctx_lock(ctx);
+}
+
+static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
+{
+	/*
+	 * If ctx_sched_in() didn't again set any ALL flags, clean up
+	 * after ctx_sched_out() by clearing is_active.
+	 */
+	if (ctx->is_active & EVENT_FROZEN) {
+		if (!(ctx->is_active & EVENT_ALL))
+			ctx->is_active = 0;
+		else
+			ctx->is_active &= ~EVENT_FROZEN;
+	}
+	raw_spin_unlock(&ctx->lock);
 }
 
 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
 			    struct perf_event_context *ctx)
 {
 	if (ctx)
-		raw_spin_unlock(&ctx->lock);
-	raw_spin_unlock(&cpuctx->ctx.lock);
+		__perf_ctx_unlock(ctx);
+	__perf_ctx_unlock(&cpuctx->ctx);
 }
 
 #define TASK_TOMBSTONE ((void *)-1L)
@@ -370,16 +405,6 @@ static void event_function_local(struct perf_event *event, event_f func, void *d
 	 (PERF_SAMPLE_BRANCH_KERNEL |\
 	  PERF_SAMPLE_BRANCH_HV)
 
-enum event_type_t {
-	EVENT_FLEXIBLE = 0x1,
-	EVENT_PINNED = 0x2,
-	EVENT_TIME = 0x4,
-	/* see ctx_resched() for details */
-	EVENT_CPU = 0x8,
-	EVENT_CGROUP = 0x10,
-	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
-};
-
 /*
  * perf_sched_events : >0 events exist
  */
@@ -2332,18 +2357,39 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
 }
 
 static inline void
-ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final)
 {
 	if (ctx->is_active & EVENT_TIME) {
+		if (ctx->is_active & EVENT_FROZEN)
+			return;
 		update_context_time(ctx);
-		update_cgrp_time_from_cpuctx(cpuctx, false);
+		update_cgrp_time_from_cpuctx(cpuctx, final);
 	}
 }
 
+static inline void
+ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+	__ctx_time_update(cpuctx, ctx, false);
+}
+
+/*
+ * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock().
+ */
+static inline void
+ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+	ctx_time_update(cpuctx, ctx);
+	if (ctx->is_active & EVENT_TIME)
+		ctx->is_active |= EVENT_FROZEN;
+}
+
 static inline void
 ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
 {
 	if (ctx->is_active & EVENT_TIME) {
+		if (ctx->is_active & EVENT_FROZEN)
+			return;
 		update_context_time(ctx);
 		update_cgrp_time_from_event(event);
 	}
@@ -2822,7 +2868,7 @@ static int __perf_install_in_context(void *info)
 #endif
 
 	if (reprogram) {
-		ctx_sched_out(ctx, NULL, EVENT_TIME);
+		ctx_time_freeze(cpuctx, ctx);
 		add_event_to_ctx(event, ctx);
 		ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
 			    get_event_type(event));
@@ -2968,28 +3014,23 @@ static void __perf_event_enable(struct perf_event *event,
 	    event->state <= PERF_EVENT_STATE_ERROR)
 		return;
 
-	if (ctx->is_active)
-		ctx_sched_out(ctx, NULL, EVENT_TIME);
+	ctx_time_freeze(cpuctx, ctx);
 
 	perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
 	perf_cgroup_event_enable(event, ctx);
 
 	if (!ctx->is_active)
 		return;
 
-	if (!event_filter_match(event)) {
-		ctx_sched_in(ctx, NULL, EVENT_TIME);
+	if (!event_filter_match(event))
 		return;
-	}
 
 	/*
 	 * If the event is in a group and isn't the group leader,
 	 * then don't put it on unless the group is on.
 	 */
-	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
-		ctx_sched_in(ctx, NULL, EVENT_TIME);
+	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
 		return;
-	}
 
 	task_ctx = cpuctx->task_ctx;
 	if (ctx->task)
@@ -3263,7 +3304,7 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
 	struct perf_event *event, *tmp;
 	struct pmu *pmu = pmu_ctx->pmu;
 
-	if (ctx->task && !ctx->is_active) {
+	if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
 		struct perf_cpu_pmu_context *cpc;
 
 		cpc = this_cpu_ptr(pmu->cpu_pmu_context);
@@ -3338,24 +3379,29 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t
 	 *
 	 * would only update time for the pinned events.
 	 */
-	if (is_active & EVENT_TIME) {
-		/* update (and stop) ctx time */
-		update_context_time(ctx);
-		update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
+	__ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);
+
+	/*
+	 * CPU-release for the below ->is_active store,
+	 * see __load_acquire() in perf_event_time_now()
+	 */
+	barrier();
+	ctx->is_active &= ~event_type;
+
+	if (!(ctx->is_active & EVENT_ALL)) {
 		/*
-		 * CPU-release for the below ->is_active store,
-		 * see __load_acquire() in perf_event_time_now()
+		 * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now()
+		 * does not observe a hole. perf_ctx_unlock() will clean up.
 		 */
-		barrier();
+		if (ctx->is_active & EVENT_FROZEN)
+			ctx->is_active &= EVENT_TIME_FROZEN;
+		else
+			ctx->is_active = 0;
 	}
 
-	ctx->is_active &= ~event_type;
-	if (!(ctx->is_active & EVENT_ALL))
-		ctx->is_active = 0;
-
 	if (ctx->task) {
 		WARN_ON_ONCE(cpuctx->task_ctx != ctx);
-		if (!ctx->is_active)
+		if (!(ctx->is_active & EVENT_ALL))
 			cpuctx->task_ctx = NULL;
 	}
 
@@ -3943,7 +3989,7 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t
 
 	ctx->is_active |= (event_type | EVENT_TIME);
 	if (ctx->task) {
-		if (!is_active)
+		if (!(is_active & EVENT_ALL))
 			cpuctx->task_ctx = ctx;
 		else
 			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
@@ -4424,7 +4470,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 
 	cpuctx = this_cpu_ptr(&perf_cpu_context);
 	perf_ctx_lock(cpuctx, ctx);
-	ctx_sched_out(ctx, NULL, EVENT_TIME);
+	ctx_time_freeze(cpuctx, ctx);
 
 	list_for_each_entry(event, &ctx->event_list, event_entry) {
 		enabled |= event_enable_on_exec(event, ctx);
@@ -4437,8 +4483,6 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 	if (enabled) {
 		clone_ctx = unclone_ctx(ctx);
 		ctx_resched(cpuctx, ctx, NULL, event_type);
-	} else {
-		ctx_sched_in(ctx, NULL, EVENT_TIME);
 	}
 	perf_ctx_unlock(cpuctx, ctx);
 
