
Commit 6d2c10e

Merge tag 'sched-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:

 "Scalability and load-balancing improvements:

   - Enable scheduler feature NEXT_BUDDY (Mel Gorman)

   - Reimplement NEXT_BUDDY to align with EEVDF goals (Mel Gorman)

   - Skip sched_balance_running cmpxchg when balance is not due (Tim Chen)

   - Implement generic code for architecture-specific sched-domain NUMA
     distances (Tim Chen)

   - Optimize the NUMA distances of the sched-domain builds of the Intel
     Granite Rapids (GNR) and Clearwater Forest (CWF) platforms (Tim Chen)

   - Implement proportional newidle balance: a randomized algorithm that
     runs newidle balancing proportionally to its success rate (Peter
     Zijlstra)

  Scheduler infrastructure changes:

   - Implement the 'sched_change' scoped_guard() pattern for the entire
     scheduler (Peter Zijlstra)

   - Use the sched_change guard more broadly (Peter Zijlstra)

   - Add support for pick functions that take runqueue flags (Joel
     Fernandes)

   - Provide and use set_need_resched_current() (Peter Zijlstra)

  Fair scheduling enhancements:

   - Forfeit vruntime on yield (Fernand Sieber)

   - Only update stats for allowed CPUs when looking for the dst group
     (Adam Li)

  CPU-core scheduling enhancements:

   - Optimize the core cookie matching check (Fernand Sieber)

  Deadline scheduler fixes:

   - Only set free_cpus for online runqueues (Doug Berger)

   - Fix dl_server time accounting (Peter Zijlstra)

   - Fix dl_server stop condition (Peter Zijlstra)

  Proxy scheduling fixes:

   - Yield the donor task (Fernand Sieber)

  Fixes and cleanups:

   - Fix do_set_cpus_allowed() locking (Peter Zijlstra)

   - Fix migrate_disable_switch() locking (Peter Zijlstra)

   - Remove a double update_rq_clock() in __set_cpus_allowed_ptr_locked()
     (Hao Jia)

   - Increase the sched_tick_remote timeout (Phil Auld)

   - sched/deadline: Use cpumask_weight_and() in dl_bw_cpus() (Shrikanth
     Hegde)

   - sched/deadline: Clean up select_task_rq_dl() (Shrikanth Hegde)"

* tag 'sched-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (44 commits)
  sched: Provide and use set_need_resched_current()
  sched/fair: Proportional newidle balance
  sched/fair: Small cleanup to update_newidle_cost()
  sched/fair: Small cleanup to sched_balance_newidle()
  sched/fair: Revert max_newidle_lb_cost bump
  sched/fair: Reimplement NEXT_BUDDY to align with EEVDF goals
  sched/fair: Enable scheduler feature NEXT_BUDDY
  sched: Increase sched_tick_remote timeout
  sched/fair: Have SD_SERIALIZE affect newidle balancing
  sched/fair: Skip sched_balance_running cmpxchg when balance is not due
  sched/deadline: Minor cleanup in select_task_rq_dl()
  sched/deadline: Use cpumask_weight_and() in dl_bw_cpus
  sched/deadline: Document dl_server
  sched/deadline: Fix dl_server stop condition
  sched/deadline: Fix dl_server time accounting
  sched/core: Remove double update_rq_clock() in __set_cpus_allowed_ptr_locked()
  sched/eevdf: Fix min_vruntime vs avg_vruntime
  sched/core: Add comment explaining force-idle vruntime snapshots
  sched/core: Optimize core cookie matching check
  sched/proxy: Yield the donor task
  ...
2 parents 6c26fbe + c04507a commit 6d2c10e

28 files changed: 1417 additions & 803 deletions

arch/s390/mm/pfault.c

Lines changed: 1 addition & 2 deletions
@@ -199,8 +199,7 @@ static void pfault_interrupt(struct ext_code ext_code,
 		 * return to userspace schedule() to block.
 		 */
 		__set_current_state(TASK_UNINTERRUPTIBLE);
-		set_tsk_need_resched(tsk);
-		set_preempt_need_resched();
+		set_need_resched_current();
 	}
 }
 out:
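
For reference, the replacement helper (added to include/linux/sched.h later in this merge) folds the two calls into one and additionally asserts, via lockdep, that interrupts are disabled. That condition holds here, since pfault_interrupt() is an interrupt handler:

static inline void set_need_resched_current(void)
{
	lockdep_assert_irqs_disabled();
	set_tsk_need_resched(current);
	set_preempt_need_resched();
}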

arch/x86/include/asm/topology.h

Lines changed: 2 additions & 0 deletions
@@ -325,4 +325,6 @@ static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled
 extern void arch_scale_freq_tick(void);
 #define arch_scale_freq_tick arch_scale_freq_tick

+extern int arch_sched_node_distance(int from, int to);
+
 #endif /* _ASM_X86_TOPOLOGY_H */

arch/x86/kernel/smpboot.c

Lines changed: 70 additions & 0 deletions
@@ -515,6 +515,76 @@ static void __init build_sched_topology(void)
 	set_sched_topology(topology);
 }

+#ifdef CONFIG_NUMA
+static int sched_avg_remote_distance;
+static int avg_remote_numa_distance(void)
+{
+	int i, j;
+	int distance, nr_remote, total_distance;
+
+	if (sched_avg_remote_distance > 0)
+		return sched_avg_remote_distance;
+
+	nr_remote = 0;
+	total_distance = 0;
+	for_each_node_state(i, N_CPU) {
+		for_each_node_state(j, N_CPU) {
+			distance = node_distance(i, j);
+
+			if (distance >= REMOTE_DISTANCE) {
+				nr_remote++;
+				total_distance += distance;
+			}
+		}
+	}
+	if (nr_remote)
+		sched_avg_remote_distance = total_distance / nr_remote;
+	else
+		sched_avg_remote_distance = REMOTE_DISTANCE;
+
+	return sched_avg_remote_distance;
+}
+
+int arch_sched_node_distance(int from, int to)
+{
+	int d = node_distance(from, to);
+
+	switch (boot_cpu_data.x86_vfm) {
+	case INTEL_GRANITERAPIDS_X:
+	case INTEL_ATOM_DARKMONT_X:
+
+		if (!x86_has_numa_in_package || topology_max_packages() == 1 ||
+		    d < REMOTE_DISTANCE)
+			return d;
+
+		/*
+		 * With SNC enabled, there can be too many distinct remote
+		 * NUMA node distances, creating NUMA domain levels that mix
+		 * local nodes with a subset of the remote nodes.
+		 *
+		 * Trim the finer distance tuning for NUMA nodes in a remote
+		 * package when building sched domains, so that all NUMA
+		 * nodes of the remote package land in the same sched group.
+		 * This simplifies the NUMA domains and avoids extra NUMA
+		 * levels spanning different remote NUMA nodes and local
+		 * nodes.
+		 *
+		 * GNR and CWF are not expected in systems with more than 2
+		 * packages or more than 2 hops between packages. A single
+		 * average remote distance would not be appropriate with
+		 * more than 2 packages, as the average distance to
+		 * different remote packages could differ.
+		 */
+		WARN_ONCE(topology_max_packages() > 2,
+			  "sched: Expect only up to 2 packages for GNR or CWF, "
+			  "but saw %d packages when building sched domains.",
+			  topology_max_packages());
+
+		d = avg_remote_numa_distance();
+	}
+	return d;
+}
+#endif /* CONFIG_NUMA */
+
 void set_cpu_sibling_map(int cpu)
 {
 	bool has_smt = __max_threads_per_core > 1;
include/linux/cleanup.h

Lines changed: 5 additions & 0 deletions
@@ -348,6 +348,11 @@ _label: \
 #define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond) \
 	static __maybe_unused const bool class_##_name##_is_conditional = _is_cond

+#define DEFINE_CLASS_IS_UNCONDITIONAL(_name) \
+	__DEFINE_CLASS_IS_CONDITIONAL(_name, false); \
+	static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \
+	{ return (void *)1; }
+
 #define __GUARD_IS_ERR(_ptr) \
 ({ \
 	unsigned long _rc = (__force unsigned long)(_ptr); \
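
Expanded for a hypothetical class name "foo" (a mechanical expansion, shown only for illustration), the new macro yields:

static __maybe_unused const bool class_foo_is_conditional = false;
static inline void * class_foo_lock_ptr(class_foo_t *_T)
{ return (void *)1; }

The constant non-NULL "lock pointer" marks construction of the class as something that can never fail, so guard()/scoped_guard() users need no failure branch at the call site.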

include/linux/sched.h

Lines changed: 20 additions & 13 deletions
@@ -637,8 +637,8 @@ struct sched_rt_entity {
 #endif
 } __randomize_layout;

-typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
-typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);
+struct rq_flags;
+typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *, struct rq_flags *rf);

 struct sched_dl_entity {
 	struct rb_node rb_node;
@@ -685,20 +685,22 @@ struct sched_dl_entity {
 	 *
 	 * @dl_server tells if this is a server entity.
 	 *
-	 * @dl_defer tells if this is a deferred or regular server. For
-	 * now only defer server exists.
-	 *
-	 * @dl_defer_armed tells if the deferrable server is waiting
-	 * for the replenishment timer to activate it.
-	 *
 	 * @dl_server_active tells if the dlserver is active(started).
 	 * dlserver is started on first cfs enqueue on an idle runqueue
 	 * and is stopped when a dequeue results in 0 cfs tasks on the
 	 * runqueue. In other words, dlserver is active only when cpu's
 	 * runqueue has atleast one cfs task.
 	 *
+	 * @dl_defer tells if this is a deferred or regular server. For
+	 * now only defer server exists.
+	 *
+	 * @dl_defer_armed tells if the deferrable server is waiting
+	 * for the replenishment timer to activate it.
+	 *
 	 * @dl_defer_running tells if the deferrable server is actually
 	 * running, skipping the defer phase.
+	 *
+	 * @dl_defer_idle tracks idle state
 	 */
 	unsigned int dl_throttled : 1;
 	unsigned int dl_yielded : 1;
@@ -709,6 +711,7 @@ struct sched_dl_entity {
 	unsigned int dl_defer : 1;
 	unsigned int dl_defer_armed : 1;
 	unsigned int dl_defer_running : 1;
+	unsigned int dl_defer_idle : 1;

 	/*
 	 * Bandwidth enforcement timer. Each -deadline task has its
@@ -730,9 +733,6 @@ struct sched_dl_entity {
 	 * dl_server_update().
 	 *
 	 * @rq the runqueue this server is for
-	 *
-	 * @server_has_tasks() returns true if @server_pick return a
-	 * runnable task.
 	 */
 	struct rq *rq;
 	dl_server_pick_f server_pick_task;
@@ -1861,8 +1861,8 @@ extern int task_can_attach(struct task_struct *p);
 extern int dl_bw_alloc(int cpu, u64 dl_bw);
 extern void dl_bw_free(int cpu, u64 dl_bw);

-/* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */
-extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
+/* set_cpus_allowed_force() - consider using set_cpus_allowed_ptr() instead */
+extern void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask);

 /**
  * set_cpus_allowed_ptr - set CPU affinity mask of a task
@@ -2058,6 +2058,13 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
 	return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
 }

+static inline void set_need_resched_current(void)
+{
+	lockdep_assert_irqs_disabled();
+	set_tsk_need_resched(current);
+	set_preempt_need_resched();
+}
+
 /*
  * cond_resched() and cond_resched_lock(): latency reduction via
  * explicit rescheduling in places that are safe. The return
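
The lockdep assertion makes the helper's context requirement explicit: interrupts must already be off, as they are in interrupt handlers and the scheduler tick. A caller in other contexts has to disable them first; a minimal, hypothetical sketch (the function name is invented):

static void kick_resched_self(void)
{
	guard(irqsave)();		/* IRQs off until end of scope */
	set_need_resched_current();	/* lockdep assertion now holds */
}

This is exactly the shape the rcu_core() hunk in kernel/rcu/tree.c below takes with its new guard(irqsave)().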

include/linux/sched/topology.h

Lines changed: 3 additions & 0 deletions
@@ -92,6 +92,9 @@ struct sched_domain {
 	unsigned int nr_balance_failed; /* initialise to 0 */

 	/* idle_balance() stats */
+	unsigned int newidle_call;
+	unsigned int newidle_success;
+	unsigned int newidle_ratio;
 	u64 max_newidle_lb_cost;
 	unsigned long last_decay_max_lb_cost;
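
These three counters feed the "proportional newidle balance" change from the merge message: newidle balancing is attempted with a probability proportional to how often it has recently found work. A minimal sketch of that shape, using the field names from the diff but inventing the scaling (0..128), the refresh period, and both helper names for illustration; the actual kernel logic lives in the sched/fair commits listed above:

/* Attempt newidle balancing proportionally to past success. */
static bool newidle_should_balance(struct sched_domain *sd)
{
	sd->newidle_call++;
	return get_random_u32_below(128) < sd->newidle_ratio;
}

/* Record the outcome; periodically refresh the scaled success ratio. */
static void newidle_record(struct sched_domain *sd, bool pulled_task)
{
	if (pulled_task)
		sd->newidle_success++;
	if (sd->newidle_call >= 64) {
		sd->newidle_ratio = 128 * sd->newidle_success / sd->newidle_call;
		sd->newidle_call = sd->newidle_success = 0;
	}
}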

kernel/cgroup/cpuset.c

Lines changed: 1 addition & 1 deletion
@@ -4180,7 +4180,7 @@ bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 	rcu_read_lock();
 	cs_mask = task_cs(tsk)->cpus_allowed;
 	if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
-		do_set_cpus_allowed(tsk, cs_mask);
+		set_cpus_allowed_force(tsk, cs_mask);
 		changed = true;
 	}
 	rcu_read_unlock();

kernel/kthread.c

Lines changed: 5 additions & 10 deletions
@@ -593,18 +593,16 @@ EXPORT_SYMBOL(kthread_create_on_node);

 static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state)
 {
-	unsigned long flags;
-
 	if (!wait_task_inactive(p, state)) {
 		WARN_ON(1);
 		return;
 	}

+	scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
+		set_cpus_allowed_force(p, mask);
+
 	/* It's safe because the task is inactive. */
-	raw_spin_lock_irqsave(&p->pi_lock, flags);
-	do_set_cpus_allowed(p, mask);
 	p->flags |= PF_NO_SETAFFINITY;
-	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 }

 static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state)
@@ -857,7 +855,6 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
 {
 	struct kthread *kthread = to_kthread(p);
 	cpumask_var_t affinity;
-	unsigned long flags;
 	int ret = 0;

 	if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) {
@@ -882,10 +879,8 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
 	list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
 	kthread_fetch_affinity(kthread, affinity);

-	/* It's safe because the task is inactive. */
-	raw_spin_lock_irqsave(&p->pi_lock, flags);
-	do_set_cpus_allowed(p, affinity);
-	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+	scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
+		set_cpus_allowed_force(p, affinity);

 	mutex_unlock(&kthreads_hotplug_lock);
 out:
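
The scoped_guard() form is a drop-in replacement for the open-coded locking it deletes; the guard releases pi_lock and restores the interrupt state automatically when its statement ends, which is why the flags variable can go away. For comparison, the first conversion above is equivalent to:

	unsigned long flags;

	raw_spin_lock_irqsave(&p->pi_lock, flags);
	set_cpus_allowed_force(p, mask);
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);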

kernel/rcu/tiny.c

Lines changed: 3 additions & 5 deletions
@@ -70,12 +70,10 @@ void rcu_qs(void)
  */
 void rcu_sched_clock_irq(int user)
 {
-	if (user) {
+	if (user)
 		rcu_qs();
-	} else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) {
-		set_tsk_need_resched(current);
-		set_preempt_need_resched();
-	}
+	else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail)
+		set_need_resched_current();
 }

 /*

kernel/rcu/tree.c

Lines changed: 5 additions & 9 deletions
@@ -2696,10 +2696,8 @@ void rcu_sched_clock_irq(int user)
 	/* The load-acquire pairs with the store-release setting to true. */
 	if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
 		/* Idle and userspace execution already are quiescent states. */
-		if (!rcu_is_cpu_rrupt_from_idle() && !user) {
-			set_tsk_need_resched(current);
-			set_preempt_need_resched();
-		}
+		if (!rcu_is_cpu_rrupt_from_idle() && !user)
+			set_need_resched_current();
 		__this_cpu_write(rcu_data.rcu_urgent_qs, false);
 	}
 	rcu_flavor_sched_clock_irq(user);
@@ -2824,7 +2822,6 @@ static void strict_work_handler(struct work_struct *work)
 /* Perform RCU core processing work for the current CPU. */
 static __latent_entropy void rcu_core(void)
 {
-	unsigned long flags;
 	struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
 	struct rcu_node *rnp = rdp->mynode;

@@ -2837,8 +2834,8 @@ static __latent_entropy void rcu_core(void)
 	if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) {
 		rcu_preempt_deferred_qs(current);
 	} else if (rcu_preempt_need_deferred_qs(current)) {
-		set_tsk_need_resched(current);
-		set_preempt_need_resched();
+		guard(irqsave)();
+		set_need_resched_current();
 	}

 	/* Update RCU state based on any recent quiescent states. */
@@ -2847,10 +2844,9 @@ static __latent_entropy void rcu_core(void)
 	/* No grace period and unregistered callbacks? */
 	if (!rcu_gp_in_progress() &&
 	    rcu_segcblist_is_enabled(&rdp->cblist) && !rcu_rdp_is_offloaded(rdp)) {
-		local_irq_save(flags);
+		guard(irqsave)();
 		if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
 			rcu_accelerate_cbs_unlocked(rnp, rdp);
-		local_irq_restore(flags);
 	}

 	rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
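
The guard(irqsave)() conversions do double duty here. A sketch of the equivalence, in comment form:

/*
 * guard(irqsave)() behaves like:
 *
 *	unsigned long flags;
 *	local_irq_save(flags);
 *	...rest of the enclosing scope...
 *	local_irq_restore(flags);	// emitted automatically at scope exit
 *
 * In the last hunk it merely removes the explicit flags bookkeeping; in
 * the rcu_core() hunk above it also establishes the IRQs-disabled context
 * that the lockdep assertion in set_need_resched_current() demands, since
 * rcu_core() otherwise runs with interrupts enabled (softirq context).
 */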
