Skip to content

Commit 586b222

Browse files
committed
Merge tag 'sched-core-2023-04-27' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:

 - Allow unprivileged PSI poll()ing

 - Fix performance regression introduced by mm_cid

 - Improve livepatch stalls by adding livepatch task switching to
   cond_resched(). This resolves livepatching busy-loop stalls with
   certain CPU-bound kthreads

 - Improve sched_move_task() performance on autogroup configs

 - On core-scheduling CPUs, avoid selecting throttled tasks to run

 - Misc cleanups, fixes and improvements

* tag 'sched-core-2023-04-27' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/clock: Fix local_clock() before sched_clock_init()
  sched/rt: Fix bad task migration for rt tasks
  sched: Fix performance regression introduced by mm_cid
  sched/core: Make sched_dynamic_mutex static
  sched/psi: Allow unprivileged polling of N*2s period
  sched/psi: Extract update_triggers side effect
  sched/psi: Rename existing poll members in preparation
  sched/psi: Rearrange polling code in preparation
  sched/fair: Fix inaccurate tally of ttwu_move_affine
  vhost: Fix livepatch timeouts in vhost_worker()
  livepatch,sched: Add livepatch task switching to cond_resched()
  livepatch: Skip task_call_func() for current task
  livepatch: Convert stack entries array to percpu
  sched: Interleave cfs bandwidth timers for improved single thread performance at low utilization
  sched/core: Reduce cost of sched_move_task when config autogroup
  sched/core: Avoid selecting the task that is throttled to run when core-sched enable
  sched/topology: Make sched_energy_mutex,update static
2 parents 7c33977 + f31dcb1 commit 586b222

21 files changed

Lines changed: 1424 additions & 350 deletions

File tree

Documentation/accounting/psi.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,10 @@ prevent overly frequent polling. Max limit is chosen as a high enough number
105105
after which monitors are most likely not needed and psi averages can be used
106106
instead.
107107

108+
Unprivileged users can also create monitors, with the only limitation that the
109+
window size must be a multiple of 2s, in order to prevent excessive resource
110+
usage.
111+
108112
When activated, psi monitor stays active for at least the duration of one
109113
tracking window to avoid repeated activations/deactivations when system is
110114
bouncing in and out of the stall state.

drivers/vhost/vhost.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -361,8 +361,7 @@ static int vhost_worker(void *data)
361361
kcov_remote_start_common(worker->kcov_handle);
362362
work->fn(work);
363363
kcov_remote_stop();
364-
if (need_resched())
365-
schedule();
364+
cond_resched();
366365
}
367366
}
368367

include/linux/livepatch.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <linux/ftrace.h>
1414
#include <linux/completion.h>
1515
#include <linux/list.h>
16+
#include <linux/livepatch_sched.h>
1617

1718
#if IS_ENABLED(CONFIG_LIVEPATCH)
1819

include/linux/livepatch_sched.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
/* SPDX-License-Identifier: GPL-2.0-or-later */
2+
#ifndef _LINUX_LIVEPATCH_SCHED_H_
3+
#define _LINUX_LIVEPATCH_SCHED_H_
4+
5+
#include <linux/jump_label.h>
6+
#include <linux/static_call_types.h>
7+
8+
#ifdef CONFIG_LIVEPATCH
9+
10+
void __klp_sched_try_switch(void);
11+
12+
#if !defined(CONFIG_PREEMPT_DYNAMIC) || !defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
13+
14+
DECLARE_STATIC_KEY_FALSE(klp_sched_try_switch_key);
15+
16+
static __always_inline void klp_sched_try_switch(void)
17+
{
18+
if (static_branch_unlikely(&klp_sched_try_switch_key))
19+
__klp_sched_try_switch();
20+
}
21+
22+
#endif /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
23+
24+
#else /* !CONFIG_LIVEPATCH */
25+
static inline void klp_sched_try_switch(void) {}
26+
static inline void __klp_sched_try_switch(void) {}
27+
#endif /* CONFIG_LIVEPATCH */
28+
29+
#endif /* _LINUX_LIVEPATCH_SCHED_H_ */

include/linux/mm_types.h

Lines changed: 74 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -573,6 +573,13 @@ struct vm_area_struct {
573573
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
574574
} __randomize_layout;
575575

576+
#ifdef CONFIG_SCHED_MM_CID
577+
struct mm_cid {
578+
u64 time;
579+
int cid;
580+
};
581+
#endif
582+
576583
struct kioctx_table;
577584
struct mm_struct {
578585
struct {
@@ -623,15 +630,19 @@ struct mm_struct {
623630
atomic_t mm_count;
624631
#ifdef CONFIG_SCHED_MM_CID
625632
/**
626-
* @cid_lock: Protect cid bitmap updates vs lookups.
633+
* @pcpu_cid: Per-cpu current cid.
627634
*
628-
* Prevent situations where updates to the cid bitmap happen
629-
* concurrently with lookups. Those can lead to situations
630-
* where a lookup cannot find a free bit simply because it was
631-
* unlucky enough to load, non-atomically, bitmap words as they
632-
* were being concurrently updated by the updaters.
635+
* Keep track of the currently allocated mm_cid for each cpu.
636+
* The per-cpu mm_cid values are serialized by their respective
637+
* runqueue locks.
633638
*/
634-
raw_spinlock_t cid_lock;
639+
struct mm_cid __percpu *pcpu_cid;
640+
/*
641+
* @mm_cid_next_scan: Next mm_cid scan (in jiffies).
642+
*
643+
* When the next mm_cid scan is due (in jiffies).
644+
*/
645+
unsigned long mm_cid_next_scan;
635646
#endif
636647
#ifdef CONFIG_MMU
637648
atomic_long_t pgtables_bytes; /* size of all page tables */
@@ -899,6 +910,37 @@ static inline void vma_iter_init(struct vma_iterator *vmi,
899910
}
900911

901912
#ifdef CONFIG_SCHED_MM_CID
913+
914+
enum mm_cid_state {
915+
MM_CID_UNSET = -1U, /* Unset state has lazy_put flag set. */
916+
MM_CID_LAZY_PUT = (1U << 31),
917+
};
918+
919+
static inline bool mm_cid_is_unset(int cid)
920+
{
921+
return cid == MM_CID_UNSET;
922+
}
923+
924+
static inline bool mm_cid_is_lazy_put(int cid)
925+
{
926+
return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT);
927+
}
928+
929+
static inline bool mm_cid_is_valid(int cid)
930+
{
931+
return !(cid & MM_CID_LAZY_PUT);
932+
}
933+
934+
static inline int mm_cid_set_lazy_put(int cid)
935+
{
936+
return cid | MM_CID_LAZY_PUT;
937+
}
938+
939+
static inline int mm_cid_clear_lazy_put(int cid)
940+
{
941+
return cid & ~MM_CID_LAZY_PUT;
942+
}
943+
902944
/* Accessor for struct mm_struct's cidmask. */
903945
static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
904946
{
@@ -912,16 +954,40 @@ static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
912954

913955
static inline void mm_init_cid(struct mm_struct *mm)
914956
{
915-
raw_spin_lock_init(&mm->cid_lock);
957+
int i;
958+
959+
for_each_possible_cpu(i) {
960+
struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i);
961+
962+
pcpu_cid->cid = MM_CID_UNSET;
963+
pcpu_cid->time = 0;
964+
}
916965
cpumask_clear(mm_cidmask(mm));
917966
}
918967

968+
static inline int mm_alloc_cid(struct mm_struct *mm)
969+
{
970+
mm->pcpu_cid = alloc_percpu(struct mm_cid);
971+
if (!mm->pcpu_cid)
972+
return -ENOMEM;
973+
mm_init_cid(mm);
974+
return 0;
975+
}
976+
977+
static inline void mm_destroy_cid(struct mm_struct *mm)
978+
{
979+
free_percpu(mm->pcpu_cid);
980+
mm->pcpu_cid = NULL;
981+
}
982+
919983
static inline unsigned int mm_cid_size(void)
920984
{
921985
return cpumask_size();
922986
}
923987
#else /* CONFIG_SCHED_MM_CID */
924988
static inline void mm_init_cid(struct mm_struct *mm) { }
989+
static inline int mm_alloc_cid(struct mm_struct *mm) { return 0; }
990+
static inline void mm_destroy_cid(struct mm_struct *mm) { }
925991
static inline unsigned int mm_cid_size(void)
926992
{
927993
return 0;

include/linux/psi.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ void psi_memstall_leave(unsigned long *flags);
2424

2525
int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
2626
struct psi_trigger *psi_trigger_create(struct psi_group *group,
27-
char *buf, enum psi_res res);
27+
char *buf, enum psi_res res, struct file *file);
2828
void psi_trigger_destroy(struct psi_trigger *t);
2929

3030
__poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,

include/linux/psi_types.h

Lines changed: 25 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,9 @@ struct psi_trigger {
151151

152152
/* Deferred event(s) from previous ratelimit window */
153153
bool pending_event;
154+
155+
/* Trigger type - PSI_AVGS for unprivileged, PSI_POLL for RT */
156+
enum psi_aggregators aggregator;
154157
};
155158

156159
struct psi_group {
@@ -171,30 +174,34 @@ struct psi_group {
171174
/* Aggregator work control */
172175
struct delayed_work avgs_work;
173176

177+
/* Unprivileged triggers against N*PSI_FREQ windows */
178+
struct list_head avg_triggers;
179+
u32 avg_nr_triggers[NR_PSI_STATES - 1];
180+
174181
/* Total stall times and sampled pressure averages */
175182
u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1];
176183
unsigned long avg[NR_PSI_STATES - 1][3];
177184

178-
/* Monitor work control */
179-
struct task_struct __rcu *poll_task;
180-
struct timer_list poll_timer;
181-
wait_queue_head_t poll_wait;
182-
atomic_t poll_wakeup;
183-
atomic_t poll_scheduled;
185+
/* Monitor RT polling work control */
186+
struct task_struct __rcu *rtpoll_task;
187+
struct timer_list rtpoll_timer;
188+
wait_queue_head_t rtpoll_wait;
189+
atomic_t rtpoll_wakeup;
190+
atomic_t rtpoll_scheduled;
184191

185192
/* Protects data used by the monitor */
186-
struct mutex trigger_lock;
187-
188-
/* Configured polling triggers */
189-
struct list_head triggers;
190-
u32 nr_triggers[NR_PSI_STATES - 1];
191-
u32 poll_states;
192-
u64 poll_min_period;
193-
194-
/* Total stall times at the start of monitor activation */
195-
u64 polling_total[NR_PSI_STATES - 1];
196-
u64 polling_next_update;
197-
u64 polling_until;
193+
struct mutex rtpoll_trigger_lock;
194+
195+
/* Configured RT polling triggers */
196+
struct list_head rtpoll_triggers;
197+
u32 rtpoll_nr_triggers[NR_PSI_STATES - 1];
198+
u32 rtpoll_states;
199+
u64 rtpoll_min_period;
200+
201+
/* Total stall times at the start of RT polling monitor activation */
202+
u64 rtpoll_total[NR_PSI_STATES - 1];
203+
u64 rtpoll_next_update;
204+
u64 rtpoll_until;
198205
};
199206

200207
#else /* CONFIG_PSI */

include/linux/sched.h

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
#include <linux/seqlock.h>
3737
#include <linux/kcsan.h>
3838
#include <linux/rv.h>
39+
#include <linux/livepatch_sched.h>
3940
#include <asm/kmap_size.h>
4041

4142
/* task_struct member predeclarations (sorted alphabetically): */
@@ -1313,7 +1314,10 @@ struct task_struct {
13131314

13141315
#ifdef CONFIG_SCHED_MM_CID
13151316
int mm_cid; /* Current cid in mm */
1317+
int last_mm_cid; /* Most recent cid in mm */
1318+
int migrate_from_cpu;
13161319
int mm_cid_active; /* Whether cid bitmap is active */
1320+
struct callback_head cid_work;
13171321
#endif
13181322

13191323
struct tlbflush_unmap_batch tlb_ubc;
@@ -2067,6 +2071,9 @@ extern int __cond_resched(void);
20672071

20682072
#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
20692073

2074+
void sched_dynamic_klp_enable(void);
2075+
void sched_dynamic_klp_disable(void);
2076+
20702077
DECLARE_STATIC_CALL(cond_resched, __cond_resched);
20712078

20722079
static __always_inline int _cond_resched(void)
@@ -2075,27 +2082,33 @@ static __always_inline int _cond_resched(void)
20752082
}
20762083

20772084
#elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
2085+
20782086
extern int dynamic_cond_resched(void);
20792087

20802088
static __always_inline int _cond_resched(void)
20812089
{
20822090
return dynamic_cond_resched();
20832091
}
20842092

2085-
#else
2093+
#else /* !CONFIG_PREEMPTION */
20862094

20872095
static inline int _cond_resched(void)
20882096
{
2097+
klp_sched_try_switch();
20892098
return __cond_resched();
20902099
}
20912100

2092-
#endif /* CONFIG_PREEMPT_DYNAMIC */
2101+
#endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
20932102

2094-
#else
2103+
#else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */
20952104

2096-
static inline int _cond_resched(void) { return 0; }
2105+
static inline int _cond_resched(void)
2106+
{
2107+
klp_sched_try_switch();
2108+
return 0;
2109+
}
20972110

2098-
#endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */
2111+
#endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */
20992112

21002113
#define cond_resched() ({ \
21012114
__might_resched(__FILE__, __LINE__, 0); \

include/linux/sched/mm.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@ static inline void mmgrab(struct mm_struct *mm)
3737
atomic_inc(&mm->mm_count);
3838
}
3939

40+
static inline void smp_mb__after_mmgrab(void)
41+
{
42+
smp_mb__after_atomic();
43+
}
44+
4045
extern void __mmdrop(struct mm_struct *mm);
4146

4247
static inline void mmdrop(struct mm_struct *mm)

kernel/cgroup/cgroup.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3771,7 +3771,7 @@ static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
37713771
}
37723772

37733773
psi = cgroup_psi(cgrp);
3774-
new = psi_trigger_create(psi, buf, res);
3774+
new = psi_trigger_create(psi, buf, res, of->file);
37753775
if (IS_ERR(new)) {
37763776
cgroup_put(cgrp);
37773777
return PTR_ERR(new);

0 commit comments

Comments (0)