Skip to content

Commit dda5df9

Browse files
committed
Merge tag 'sched-urgent-2026-02-07' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar: "Miscellaneous MMCID fixes to address bugs and performance regressions in the recent rewrite of the SCHED_MM_CID management code: - Fix livelock triggered by BPF CI testing - Fix hard lockup on weakly ordered systems - Simplify the dropping of CIDs in the exit path by removing an unintended transition phase - Fix performance/scalability regression on a thread-pool benchmark by optimizing transitional CIDs when scheduling out" * tag 'sched-urgent-2026-02-07' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: sched/mmcid: Optimize transitional CIDs when scheduling out sched/mmcid: Drop per CPU CID immediately when switching to per task mode sched/mmcid: Protect transition on weakly ordered systems sched/mmcid: Prevent live lock on task to CPU mode transition
2 parents 7e0b172 + 4463c7a commit dda5df9

3 files changed

Lines changed: 163 additions & 71 deletions

File tree

include/linux/rseq_types.h

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,8 +121,7 @@ struct mm_cid_pcpu {
121121
/**
122122
* struct mm_mm_cid - Storage for per MM CID data
123123
* @pcpu: Per CPU storage for CIDs associated to a CPU
124-
* @percpu: Set, when CIDs are in per CPU mode
125-
* @transit: Set to MM_CID_TRANSIT during a mode change transition phase
124+
* @mode: Indicates per CPU and transition mode
126125
* @max_cids: The exclusive maximum CID value for allocation and convergence
127126
* @irq_work: irq_work to handle the affinity mode change case
128127
* @work: Regular work to handle the affinity mode change case
@@ -139,8 +138,7 @@ struct mm_cid_pcpu {
139138
struct mm_mm_cid {
140139
/* Hotpath read mostly members */
141140
struct mm_cid_pcpu __percpu *pcpu;
142-
unsigned int percpu;
143-
unsigned int transit;
141+
unsigned int mode;
144142
unsigned int max_cids;
145143

146144
/* Rarely used. Moves @lock and @mutex into the second cacheline */

kernel/sched/core.c

Lines changed: 126 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -10269,7 +10269,8 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
1026910269
* Serialization rules:
1027010270
*
1027110271
* mm::mm_cid::mutex: Serializes fork() and exit() and therefore
10272-
* protects mm::mm_cid::users.
10272+
* protects mm::mm_cid::users and mode switch
10273+
* transitions
1027310274
*
1027410275
* mm::mm_cid::lock: Serializes mm_update_max_cids() and
1027510276
* mm_update_cpus_allowed(). Nests in mm_cid::mutex
@@ -10285,14 +10286,70 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
1028510286
*
1028610287
* A CID is either owned by a task (stored in task_struct::mm_cid.cid) or
1028710288
* by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the
10288-
* MM_CID_ONCPU bit set. During transition from CPU to task ownership mode,
10289-
* MM_CID_TRANSIT is set on the per task CIDs. When this bit is set the
10290-
* task needs to drop the CID into the pool when scheduling out. Both bits
10291-
* (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is
10292-
* actually handed over to user space in the RSEQ memory.
10289+
* MM_CID_ONCPU bit set.
10290+
*
10291+
* During the transition of ownership mode, the MM_CID_TRANSIT bit is set
10292+
* on the CIDs. When this bit is set the tasks drop the CID back into the
10293+
* pool when scheduling out.
10294+
*
10295+
* Both bits (ONCPU and TRANSIT) are filtered out by task_cid() when the
10296+
* CID is actually handed over to user space in the RSEQ memory.
1029310297
*
1029410298
* Mode switching:
1029510299
*
10300+
 * The ownership mode is per process and stored in mm::mm_cid::mode with the
10301+
* following possible states:
10302+
*
10303+
* 0: Per task ownership
10304+
* 0 | MM_CID_TRANSIT: Transition from per CPU to per task
10305+
* MM_CID_ONCPU: Per CPU ownership
10306+
* MM_CID_ONCPU | MM_CID_TRANSIT: Transition from per task to per CPU
10307+
*
10308+
* All transitions of ownership mode happen in two phases:
10309+
*
10310+
 * 1) mm::mm_cid::mode has the MM_CID_TRANSIT bit set. This is OR'ed on the
10311+
* CIDs and denotes that the CID is only temporarily owned by a
10312+
* task. When the task schedules out it drops the CID back into the
10313+
* pool if this bit is set.
10314+
*
10315+
* 2) The initiating context walks the per CPU space or the tasks to fixup
10316+
* or drop the CIDs and after completion it clears MM_CID_TRANSIT in
10317+
 * mm::mm_cid::mode. After that point the CIDs are strictly task or CPU
10318+
* owned again.
10319+
*
10320+
* This two phase transition is required to prevent CID space exhaustion
10321+
* during the transition as a direct transfer of ownership would fail:
10322+
*
10323+
* - On task to CPU mode switch if a task is scheduled in on one CPU and
10324+
* then migrated to another CPU before the fixup freed enough per task
10325+
* CIDs.
10326+
*
10327+
* - On CPU to task mode switch if two tasks are scheduled in on the same
10328+
* CPU before the fixup freed per CPU CIDs.
10329+
*
10330+
 * Both scenarios can result in a livelock because sched_in() is invoked
10331+
* with runqueue lock held and loops in search of a CID and the fixup
10332+
* thread can't make progress freeing them up because it is stuck on the
10333+
* same runqueue lock.
10334+
*
10335+
* While MM_CID_TRANSIT is active during the transition phase the MM_CID
10336+
* bitmap can be contended, but that's a temporary contention bound to the
10337+
* transition period. After that everything goes back into steady state and
10338+
* nothing except fork() and exit() will touch the bitmap. This is an
10339+
* acceptable tradeoff as it completely avoids complex serialization,
10340+
* memory barriers and atomic operations for the common case.
10341+
*
10342+
 * Aside from that, this mechanism also ensures RT compatibility:
10343+
*
10344+
* - The task which runs the fixup is fully preemptible except for the
10345+
* short runqueue lock held sections.
10346+
*
10347+
* - The transient impact of the bitmap contention is only problematic
10348+
* when there is a thundering herd scenario of tasks scheduling in and
10349+
* out concurrently. There is not much which can be done about that
10350+
* except for avoiding mode switching by a proper overall system
10351+
* configuration.
10352+
*
1029610353
* Switching to per CPU mode happens when the user count becomes greater
1029710354
* than the maximum number of CIDs, which is calculated by:
1029810355
*
@@ -10306,12 +10363,13 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
1030610363
*
1030710364
* At the point of switching to per CPU mode the new user is not yet
1030810365
* visible in the system, so the task which initiated the fork() runs the
10309-
* fixup function: mm_cid_fixup_tasks_to_cpu() walks the thread list and
10310-
* either transfers each tasks owned CID to the CPU the task runs on or
10311-
* drops it into the CID pool if a task is not on a CPU at that point in
10312-
* time. Tasks which schedule in before the task walk reaches them do the
10313-
* handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes
10314-
* it's guaranteed that no task related to that MM owns a CID anymore.
10366+
* fixup function. mm_cid_fixup_tasks_to_cpu() walks the thread list and
10367+
* either marks each task owned CID with MM_CID_TRANSIT if the task is
10368+
* running on a CPU or drops it into the CID pool if a task is not on a
10369+
* CPU. Tasks which schedule in before the task walk reaches them do the
10370+
* handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus()
10371+
* completes it is guaranteed that no task related to that MM owns a CID
10372+
* anymore.
1031510373
*
1031610374
* Switching back to task mode happens when the user count goes below the
1031710375
* threshold which was recorded on the per CPU mode switch:
@@ -10327,28 +10385,11 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
1032710385
* run either in the deferred update function in context of a workqueue or
1032810386
* by a task which forks a new one or by a task which exits. Whatever
1032910387
* happens first. mm_cid_fixup_cpus_to_task() walks through the possible
10330-
* CPUs and either transfers the CPU owned CIDs to a related task which
10331-
* runs on the CPU or drops it into the pool. Tasks which schedule in on a
10332-
* CPU which the walk did not cover yet do the handover themself.
10333-
*
10334-
* This transition from CPU to per task ownership happens in two phases:
10335-
*
10336-
* 1) mm:mm_cid.transit contains MM_CID_TRANSIT This is OR'ed on the task
10337-
* CID and denotes that the CID is only temporarily owned by the
10338-
* task. When it schedules out the task drops the CID back into the
10339-
* pool if this bit is set.
10340-
*
10341-
* 2) The initiating context walks the per CPU space and after completion
10342-
* clears mm:mm_cid.transit. So after that point the CIDs are strictly
10343-
* task owned again.
10344-
*
10345-
* This two phase transition is required to prevent CID space exhaustion
10346-
* during the transition as a direct transfer of ownership would fail if
10347-
* two tasks are scheduled in on the same CPU before the fixup freed per
10348-
* CPU CIDs.
10349-
*
10350-
* When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID
10351-
* related to that MM is owned by a CPU anymore.
10388+
* CPUs and either marks the CPU owned CIDs with MM_CID_TRANSIT if a
10389+
* related task is running on the CPU or drops it into the pool. Tasks
10390+
* which are scheduled in before the fixup covered them do the handover
10391+
 * themselves. When mm_cid_fixup_cpus_to_tasks() completes it is guaranteed
10392+
* that no CID related to that MM is owned by a CPU anymore.
1035210393
*/
1035310394

1035410395
/*
@@ -10379,6 +10420,7 @@ static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc)
1037910420
static bool mm_update_max_cids(struct mm_struct *mm)
1038010421
{
1038110422
struct mm_mm_cid *mc = &mm->mm_cid;
10423+
bool percpu = cid_on_cpu(mc->mode);
1038210424

1038310425
lockdep_assert_held(&mm->mm_cid.lock);
1038410426

@@ -10387,7 +10429,7 @@ static bool mm_update_max_cids(struct mm_struct *mm)
1038710429
__mm_update_max_cids(mc);
1038810430

1038910431
/* Check whether owner mode must be changed */
10390-
if (!mc->percpu) {
10432+
if (!percpu) {
1039110433
/* Enable per CPU mode when the number of users is above max_cids */
1039210434
if (mc->users > mc->max_cids)
1039310435
mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
@@ -10398,12 +10440,17 @@ static bool mm_update_max_cids(struct mm_struct *mm)
1039810440
}
1039910441

1040010442
/* Mode change required? */
10401-
if (!!mc->percpu == !!mc->pcpu_thrs)
10443+
if (percpu == !!mc->pcpu_thrs)
1040210444
return false;
10403-
/* When switching back to per TASK mode, set the transition flag */
10404-
if (!mc->pcpu_thrs)
10405-
WRITE_ONCE(mc->transit, MM_CID_TRANSIT);
10406-
WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs);
10445+
10446+
/* Flip the mode and set the transition flag to bridge the transfer */
10447+
WRITE_ONCE(mc->mode, mc->mode ^ (MM_CID_TRANSIT | MM_CID_ONCPU));
10448+
/*
10449+
* Order the store against the subsequent fixups so that
10450+
* acquire(rq::lock) cannot be reordered by the CPU before the
10451+
* store.
10452+
*/
10453+
smp_mb();
1040710454
return true;
1040810455
}
1040910456

@@ -10428,7 +10475,7 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu
1042810475

1042910476
WRITE_ONCE(mc->nr_cpus_allowed, weight);
1043010477
__mm_update_max_cids(mc);
10431-
if (!mc->percpu)
10478+
if (!cid_on_cpu(mc->mode))
1043210479
return;
1043310480

1043410481
/* Adjust the threshold to the wider set */
@@ -10446,6 +10493,16 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu
1044610493
irq_work_queue(&mc->irq_work);
1044710494
}
1044810495

10496+
static inline void mm_cid_complete_transit(struct mm_struct *mm, unsigned int mode)
10497+
{
10498+
/*
10499+
* Ensure that the store removing the TRANSIT bit cannot be
10500+
* reordered by the CPU before the fixups have been completed.
10501+
*/
10502+
smp_mb();
10503+
WRITE_ONCE(mm->mm_cid.mode, mode);
10504+
}
10505+
1044910506
static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp)
1045010507
{
1045110508
if (cid_on_cpu(t->mm_cid.cid)) {
@@ -10489,14 +10546,13 @@ static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
1048910546
}
1049010547
}
1049110548
}
10492-
/* Clear the transition bit */
10493-
WRITE_ONCE(mm->mm_cid.transit, 0);
10549+
mm_cid_complete_transit(mm, 0);
1049410550
}
1049510551

10496-
static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
10552+
static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
1049710553
{
1049810554
if (cid_on_task(t->mm_cid.cid)) {
10499-
t->mm_cid.cid = cid_to_cpu_cid(t->mm_cid.cid);
10555+
t->mm_cid.cid = cid_to_transit_cid(t->mm_cid.cid);
1050010556
pcp->cid = t->mm_cid.cid;
1050110557
}
1050210558
}
@@ -10509,18 +10565,17 @@ static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm
1050910565
if (!t->mm_cid.active)
1051010566
return false;
1051110567
if (cid_on_task(t->mm_cid.cid)) {
10512-
/* If running on the CPU, transfer the CID, otherwise drop it */
10568+
/* If running on the CPU, put the CID in transit mode, otherwise drop it */
1051310569
if (task_rq(t)->curr == t)
10514-
mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
10570+
mm_cid_transit_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
1051510571
else
1051610572
mm_unset_cid_on_task(t);
1051710573
}
1051810574
return true;
1051910575
}
1052010576

10521-
static void mm_cid_fixup_tasks_to_cpus(void)
10577+
static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm)
1052210578
{
10523-
struct mm_struct *mm = current->mm;
1052410579
struct task_struct *p, *t;
1052510580
unsigned int users;
1052610581

@@ -10558,6 +10613,14 @@ static void mm_cid_fixup_tasks_to_cpus(void)
1055810613
}
1055910614
}
1056010615

10616+
static void mm_cid_fixup_tasks_to_cpus(void)
10617+
{
10618+
struct mm_struct *mm = current->mm;
10619+
10620+
mm_cid_do_fixup_tasks_to_cpus(mm);
10621+
mm_cid_complete_transit(mm, MM_CID_ONCPU);
10622+
}
10623+
1056110624
static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
1056210625
{
1056310626
t->mm_cid.active = 1;
@@ -10586,17 +10649,17 @@ void sched_mm_cid_fork(struct task_struct *t)
1058610649
}
1058710650

1058810651
if (!sched_mm_cid_add_user(t, mm)) {
10589-
if (!mm->mm_cid.percpu)
10652+
if (!cid_on_cpu(mm->mm_cid.mode))
1059010653
t->mm_cid.cid = mm_get_cid(mm);
1059110654
return;
1059210655
}
1059310656

1059410657
/* Handle the mode change and transfer current's CID */
10595-
percpu = !!mm->mm_cid.percpu;
10658+
percpu = cid_on_cpu(mm->mm_cid.mode);
1059610659
if (!percpu)
1059710660
mm_cid_transit_to_task(current, pcp);
1059810661
else
10599-
mm_cid_transfer_to_cpu(current, pcp);
10662+
mm_cid_transit_to_cpu(current, pcp);
1060010663
}
1060110664

1060210665
if (percpu) {
@@ -10631,7 +10694,7 @@ static bool __sched_mm_cid_exit(struct task_struct *t)
1063110694
* affinity change increased the number of allowed CPUs and the
1063210695
* deferred fixup did not run yet.
1063310696
*/
10634-
if (WARN_ON_ONCE(mm->mm_cid.percpu))
10697+
if (WARN_ON_ONCE(cid_on_cpu(mm->mm_cid.mode)))
1063510698
return false;
1063610699
/*
1063710700
* A failed fork(2) cleanup never gets here, so @current must have
@@ -10664,8 +10727,14 @@ void sched_mm_cid_exit(struct task_struct *t)
1066410727
scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
1066510728
if (!__sched_mm_cid_exit(t))
1066610729
return;
10667-
/* Mode change required. Transfer currents CID */
10668-
mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu));
10730+
/*
10731+
* Mode change. The task has the CID unset
10732+
* already. The CPU CID is still valid and
10733+
* does not have MM_CID_TRANSIT set as the
10734+
* mode change has just taken effect under
10735+
* mm::mm_cid::lock. Drop it.
10736+
*/
10737+
mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu));
1066910738
}
1067010739
mm_cid_fixup_cpus_to_tasks(mm);
1067110740
return;
@@ -10722,7 +10791,7 @@ static void mm_cid_work_fn(struct work_struct *work)
1072210791
if (!mm_update_max_cids(mm))
1072310792
return;
1072410793
/* Affinity changes can only switch back to task mode */
10725-
if (WARN_ON_ONCE(mm->mm_cid.percpu))
10794+
if (WARN_ON_ONCE(cid_on_cpu(mm->mm_cid.mode)))
1072610795
return;
1072710796
}
1072810797
mm_cid_fixup_cpus_to_tasks(mm);
@@ -10743,8 +10812,7 @@ static void mm_cid_irq_work(struct irq_work *work)
1074310812
void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
1074410813
{
1074510814
mm->mm_cid.max_cids = 0;
10746-
mm->mm_cid.percpu = 0;
10747-
mm->mm_cid.transit = 0;
10815+
mm->mm_cid.mode = 0;
1074810816
mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
1074910817
mm->mm_cid.users = 0;
1075010818
mm->mm_cid.pcpu_thrs = 0;

0 commit comments

Comments
 (0)