
Commit 2d17cf1

Peter Zijlstra authored and committed
perf: Optimize context reschedule for single PMU cases
Currently, re-scheduling a context reschedules all active PMUs for that
context, even when it is known that only a single event was added.

Namhyung reported that changing this to reschedule only the affected PMU,
where possible, provides significant performance gains under certain
conditions.

Therefore, allow partial context reschedules for a specific PMU: that of
the modified event.

While the patch looks somewhat noisy, it mostly just propagates a new @pmu
argument through the callchain and modifies the epc loop to pick only the
'epc->pmu == @pmu' case.

Reported-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20240807115549.920950699@infradead.org
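To see the epc filter at the heart of the patch in isolation, here is a
minimal userspace sketch. The types below (struct epc with a plain next
pointer, a bare struct pmu) are simplified stand-ins, not the kernel's
perf_event_pmu_context or its list_for_each_entry() machinery; only the
macro's skip-or-run shape mirrors the real for_each_epc(). A NULL @pmu
visits every entry, preserving the old reschedule-everything behaviour,
while a non-NULL @pmu narrows the walk to the matching entry:

#include <stdio.h>
#include <stddef.h>

struct pmu { const char *name; };

struct epc {			/* stand-in for struct perf_event_pmu_context */
	struct pmu *pmu;
	int nr_cgroups;
	struct epc *next;
};

/* Same shape as the kernel macro: skip non-matching entries, else run body. */
#define for_each_epc(_epc, _head, _pmu, _cgroup)		\
	for ((_epc) = (_head); (_epc); (_epc) = (_epc)->next)	\
		if ((_cgroup) && !(_epc)->nr_cgroups)		\
			continue;				\
		else if ((_pmu) && (_epc)->pmu != (_pmu))	\
			continue;				\
		else

int main(void)
{
	struct pmu cpu = { "cpu" }, ibs = { "ibs" };
	struct epc b = { &ibs, 0, NULL };
	struct epc a = { &cpu, 1, &b };		/* two-entry list: a -> b */
	struct epc *epc;

	/* pmu == NULL: every entry, as the old full reschedule did */
	for_each_epc(epc, &a, NULL, 0)
		printf("all:    %s\n", epc->pmu->name);

	/* pmu == &ibs: only the affected PMU is visited */
	for_each_epc(epc, &a, &ibs, 0)
		printf("one:    %s\n", epc->pmu->name);

	/* cgroup pass: entries without cgroup events are skipped */
	for_each_epc(epc, &a, NULL, 1)
		printf("cgroup: %s\n", epc->pmu->name);

	return 0;
}

Note the macro's trailing else: the caller's statement becomes the else
branch, so skipped entries fall through the two continue arms without
needing braces at the call site.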
1 parent ea1992f commit 2d17cf1

1 file changed: kernel/events/core.c

Lines changed: 88 additions & 76 deletions
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -685,30 +685,32 @@ do { \
 	___p;							\
 })
 
+#define for_each_epc(_epc, _ctx, _pmu, _cgroup)			\
+	list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \
+		if (_cgroup && !_epc->nr_cgroups)		\
+			continue;				\
+		else if (_pmu && _epc->pmu != _pmu)		\
+			continue;				\
+		else
+
 static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
 {
 	struct perf_event_pmu_context *pmu_ctx;
 
-	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-		if (cgroup && !pmu_ctx->nr_cgroups)
-			continue;
+	for_each_epc(pmu_ctx, ctx, NULL, cgroup)
 		perf_pmu_disable(pmu_ctx->pmu);
-	}
 }
 
 static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
 {
 	struct perf_event_pmu_context *pmu_ctx;
 
-	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-		if (cgroup && !pmu_ctx->nr_cgroups)
-			continue;
+	for_each_epc(pmu_ctx, ctx, NULL, cgroup)
 		perf_pmu_enable(pmu_ctx->pmu);
-	}
 }
 
-static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
-static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
+static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
+static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
 
 #ifdef CONFIG_CGROUP_PERF
 
@@ -865,7 +867,7 @@ static void perf_cgroup_switch(struct task_struct *task)
 	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 	perf_ctx_disable(&cpuctx->ctx, true);
 
-	ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
+	ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
 	/*
 	 * must not be done before ctxswout due
 	 * to update_cgrp_time_from_cpuctx() in
@@ -877,7 +879,7 @@ static void perf_cgroup_switch(struct task_struct *task)
 	 * perf_cgroup_set_timestamp() in ctx_sched_in()
 	 * to not have to pass task around
 	 */
-	ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
+	ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
 
 	perf_ctx_enable(&cpuctx->ctx, true);
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -2656,7 +2658,8 @@ static void add_event_to_ctx(struct perf_event *event,
 }
 
 static void task_ctx_sched_out(struct perf_event_context *ctx,
-			       enum event_type_t event_type)
+			       struct pmu *pmu,
+			       enum event_type_t event_type)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
 
@@ -2666,18 +2669,19 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
 	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
 		return;
 
-	ctx_sched_out(ctx, event_type);
+	ctx_sched_out(ctx, pmu, event_type);
 }
 
 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
-				struct perf_event_context *ctx)
+				struct perf_event_context *ctx,
+				struct pmu *pmu)
 {
-	ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
+	ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED);
 	if (ctx)
-		ctx_sched_in(ctx, EVENT_PINNED);
-	ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
+		ctx_sched_in(ctx, pmu, EVENT_PINNED);
+	ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
 	if (ctx)
-		ctx_sched_in(ctx, EVENT_FLEXIBLE);
+		ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE);
 }
 
 /*
@@ -2695,16 +2699,12 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
  * event_type is a bit mask of the types of events involved. For CPU events,
  * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
  */
-/*
- * XXX: ctx_resched() reschedule entire perf_event_context while adding new
- * event to the context or enabling existing event in the context. We can
- * probably optimize it by rescheduling only affected pmu_ctx.
- */
 static void ctx_resched(struct perf_cpu_context *cpuctx,
 			struct perf_event_context *task_ctx,
-			enum event_type_t event_type)
+			struct pmu *pmu, enum event_type_t event_type)
 {
 	bool cpu_event = !!(event_type & EVENT_CPU);
+	struct perf_event_pmu_context *epc;
 
 	/*
 	 * If pinned groups are involved, flexible groups also need to be
@@ -2715,10 +2715,14 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
 
 	event_type &= EVENT_ALL;
 
-	perf_ctx_disable(&cpuctx->ctx, false);
+	for_each_epc(epc, &cpuctx->ctx, pmu, false)
+		perf_pmu_disable(epc->pmu);
+
 	if (task_ctx) {
-		perf_ctx_disable(task_ctx, false);
-		task_ctx_sched_out(task_ctx, event_type);
+		for_each_epc(epc, task_ctx, pmu, false)
+			perf_pmu_disable(epc->pmu);
+
+		task_ctx_sched_out(task_ctx, pmu, event_type);
 	}
 
 	/*
@@ -2729,15 +2733,19 @@
 	 *  - otherwise, do nothing more.
 	 */
 	if (cpu_event)
-		ctx_sched_out(&cpuctx->ctx, event_type);
+		ctx_sched_out(&cpuctx->ctx, pmu, event_type);
 	else if (event_type & EVENT_PINNED)
-		ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+		ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
+
+	perf_event_sched_in(cpuctx, task_ctx, pmu);
 
-	perf_event_sched_in(cpuctx, task_ctx);
+	for_each_epc(epc, &cpuctx->ctx, pmu, false)
+		perf_pmu_enable(epc->pmu);
 
-	perf_ctx_enable(&cpuctx->ctx, false);
-	if (task_ctx)
-		perf_ctx_enable(task_ctx, false);
+	if (task_ctx) {
+		for_each_epc(epc, task_ctx, pmu, false)
+			perf_pmu_enable(epc->pmu);
+	}
 }
 
 void perf_pmu_resched(struct pmu *pmu)
@@ -2746,7 +2754,7 @@ void perf_pmu_resched(struct pmu *pmu)
 	struct perf_event_context *task_ctx = cpuctx->task_ctx;
 
 	perf_ctx_lock(cpuctx, task_ctx);
-	ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
+	ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU);
 	perf_ctx_unlock(cpuctx, task_ctx);
 }
 
@@ -2802,9 +2810,10 @@ static int __perf_install_in_context(void *info)
 #endif
 
 	if (reprogram) {
-		ctx_sched_out(ctx, EVENT_TIME);
+		ctx_sched_out(ctx, NULL, EVENT_TIME);
 		add_event_to_ctx(event, ctx);
-		ctx_resched(cpuctx, task_ctx, get_event_type(event));
+		ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
+			    get_event_type(event));
 	} else {
 		add_event_to_ctx(event, ctx);
 	}
@@ -2948,7 +2957,7 @@ static void __perf_event_enable(struct perf_event *event,
 		return;
 
 	if (ctx->is_active)
-		ctx_sched_out(ctx, EVENT_TIME);
+		ctx_sched_out(ctx, NULL, EVENT_TIME);
 
 	perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
 	perf_cgroup_event_enable(event, ctx);
@@ -2957,7 +2966,7 @@ static void __perf_event_enable(struct perf_event *event,
 		return;
 
 	if (!event_filter_match(event)) {
-		ctx_sched_in(ctx, EVENT_TIME);
+		ctx_sched_in(ctx, NULL, EVENT_TIME);
 		return;
 	}
 
@@ -2966,15 +2975,15 @@
 	 * then don't put it on unless the group is on.
 	 */
 	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
-		ctx_sched_in(ctx, EVENT_TIME);
+		ctx_sched_in(ctx, NULL, EVENT_TIME);
 		return;
 	}
 
 	task_ctx = cpuctx->task_ctx;
 	if (ctx->task)
 		WARN_ON_ONCE(task_ctx != ctx);
 
-	ctx_resched(cpuctx, task_ctx, get_event_type(event));
+	ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event));
 }
 
 /*
@@ -3276,8 +3285,17 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
 	perf_pmu_enable(pmu);
 }
 
+/*
+ * Be very careful with the @pmu argument since this will change ctx state.
+ * The @pmu argument works for ctx_resched(), because that is symmetric in
+ * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant.
+ *
+ * However, if you were to be asymmetrical, you could end up with messed up
+ * state, eg. ctx->is_active cleared even though most EPCs would still actually
+ * be active.
+ */
 static void
-ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
+ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
 	struct perf_event_pmu_context *pmu_ctx;
@@ -3331,11 +3349,8 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
 
 	is_active ^= ctx->is_active; /* changed bits */
 
-	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-		if (cgroup && !pmu_ctx->nr_cgroups)
-			continue;
+	for_each_epc(pmu_ctx, ctx, pmu, cgroup)
 		__pmu_ctx_sched_out(pmu_ctx, is_active);
-	}
 }
 
 /*
@@ -3579,7 +3594,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
 
 inside_switch:
 	perf_ctx_sched_task_cb(ctx, false);
-	task_ctx_sched_out(ctx, EVENT_ALL);
+	task_ctx_sched_out(ctx, NULL, EVENT_ALL);
 
 	perf_ctx_enable(ctx, false);
 	raw_spin_unlock(&ctx->lock);
@@ -3877,29 +3892,22 @@ static void pmu_groups_sched_in(struct perf_event_context *ctx,
 			   merge_sched_in, &can_add_hw);
 }
 
-static void ctx_groups_sched_in(struct perf_event_context *ctx,
-				struct perf_event_groups *groups,
-				bool cgroup)
+static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
+			       enum event_type_t event_type)
 {
-	struct perf_event_pmu_context *pmu_ctx;
-
-	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-		if (cgroup && !pmu_ctx->nr_cgroups)
-			continue;
-		pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
-	}
-}
+	struct perf_event_context *ctx = pmu_ctx->ctx;
 
-static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
-			       struct pmu *pmu)
-{
-	pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
+	if (event_type & EVENT_PINNED)
+		pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
+	if (event_type & EVENT_FLEXIBLE)
+		pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
 }
 
 static void
-ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
+ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+	struct perf_event_pmu_context *pmu_ctx;
 	int is_active = ctx->is_active;
 	bool cgroup = event_type & EVENT_CGROUP;
 
@@ -3935,12 +3943,16 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
 	 * First go through the list and put on any pinned groups
 	 * in order to give them the best chance of going on.
 	 */
-	if (is_active & EVENT_PINNED)
-		ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
+	if (is_active & EVENT_PINNED) {
+		for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+			__pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED);
+	}
 
 	/* Then walk through the lower prio flexible groups */
-	if (is_active & EVENT_FLEXIBLE)
-		ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
+	if (is_active & EVENT_FLEXIBLE) {
+		for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+			__pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE);
+	}
 }
 
 static void perf_event_context_sched_in(struct task_struct *task)
@@ -3983,10 +3995,10 @@ static void perf_event_context_sched_in(struct task_struct *task)
 	 */
 	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
 		perf_ctx_disable(&cpuctx->ctx, false);
-		ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+		ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
 	}
 
-	perf_event_sched_in(cpuctx, ctx);
+	perf_event_sched_in(cpuctx, ctx, NULL);
 
 	perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
 
@@ -4327,14 +4339,14 @@ static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
 		update_context_time(&cpuctx->ctx);
 		__pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
 		rotate_ctx(&cpuctx->ctx, cpu_event);
-		__pmu_ctx_sched_in(&cpuctx->ctx, pmu);
+		__pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE);
 	}
 
 	if (task_event)
 		rotate_ctx(task_epc->ctx, task_event);
 
 	if (task_event || (task_epc && cpu_event))
-		__pmu_ctx_sched_in(task_epc->ctx, pmu);
+		__pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
 
 	perf_pmu_enable(pmu);
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -4400,7 +4412,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 
 	cpuctx = this_cpu_ptr(&perf_cpu_context);
 	perf_ctx_lock(cpuctx, ctx);
-	ctx_sched_out(ctx, EVENT_TIME);
+	ctx_sched_out(ctx, NULL, EVENT_TIME);
 
 	list_for_each_entry(event, &ctx->event_list, event_entry) {
 		enabled |= event_enable_on_exec(event, ctx);
@@ -4412,9 +4424,9 @@
 	 */
 	if (enabled) {
 		clone_ctx = unclone_ctx(ctx);
-		ctx_resched(cpuctx, ctx, event_type);
+		ctx_resched(cpuctx, ctx, NULL, event_type);
 	} else {
-		ctx_sched_in(ctx, EVENT_TIME);
+		ctx_sched_in(ctx, NULL, EVENT_TIME);
 	}
 	perf_ctx_unlock(cpuctx, ctx);
 
@@ -13202,7 +13214,7 @@ static void perf_event_exit_task_context(struct task_struct *child)
 	 * in.
 	 */
	raw_spin_lock_irq(&child_ctx->lock);
-	task_ctx_sched_out(child_ctx, EVENT_ALL);
+	task_ctx_sched_out(child_ctx, NULL, EVENT_ALL);
 
 	/*
 	 * Now that the context is inactive, destroy the task <-> ctx relation
@@ -13751,7 +13763,7 @@ static void __perf_event_exit_context(void *__info)
 	struct perf_event *event;
 
 	raw_spin_lock(&ctx->lock);
-	ctx_sched_out(ctx, EVENT_TIME);
+	ctx_sched_out(ctx, NULL, EVENT_TIME);
 	list_for_each_entry(event, &ctx->event_list, event_entry)
 		__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
 	raw_spin_unlock(&ctx->lock);
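The new comment above ctx_sched_out() is the subtle part of this change:
ctx-wide state such as ctx->is_active is updated unconditionally, while
only the matching EPCs are actually (de)scheduled. The following toy
userspace model (hypothetical names and simplified state, not kernel
code) illustrates why symmetric out/in pairs with the same @pmu, as in
ctx_resched(), leave the state invariant, while an asymmetric caller
would not:

#include <stdio.h>
#include <stdbool.h>
#include <string.h>

struct toy_epc { const char *pmu; bool active; };

struct toy_ctx {
	bool is_active;		/* ctx-wide flag, cleared for ALL pmus */
	struct toy_epc epc[2];	/* per-PMU state, filtered by @pmu */
};

static void toy_sched_out(struct toy_ctx *ctx, const char *pmu)
{
	ctx->is_active = false;			/* ctx-wide, unconditional */
	for (int i = 0; i < 2; i++)
		if (!pmu || strcmp(ctx->epc[i].pmu, pmu) == 0)
			ctx->epc[i].active = false;	/* filtered */
}

static void toy_sched_in(struct toy_ctx *ctx, const char *pmu)
{
	ctx->is_active = true;
	for (int i = 0; i < 2; i++)
		if (!pmu || strcmp(ctx->epc[i].pmu, pmu) == 0)
			ctx->epc[i].active = true;
}

int main(void)
{
	struct toy_ctx ctx = { true, { { "cpu", true }, { "ibs", true } } };

	/* Symmetric out/in with the same pmu, as ctx_resched() does:
	 * both ctx-wide and per-EPC state end up where they started. */
	toy_sched_out(&ctx, "ibs");
	toy_sched_in(&ctx, "ibs");
	printf("symmetric:  is_active=%d cpu=%d ibs=%d\n",
	       ctx.is_active, ctx.epc[0].active, ctx.epc[1].active);

	/* Asymmetric use: is_active is now false even though the "cpu"
	 * EPC was never scheduled out -- the messed up state the new
	 * comment warns about. */
	toy_sched_out(&ctx, "ibs");
	printf("asymmetric: is_active=%d cpu=%d ibs=%d\n",
	       ctx.is_active, ctx.epc[0].active, ctx.epc[1].active);
	return 0;
}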
