Skip to content

Commit 47ee94e

Browse files
Thomas Gleixner authored and Peter Zijlstra committed
sched/mmcid: Protect transition on weakly ordered systems
Shrikanth reported a hard lockup which he observed once. The stack trace shows the following CID related participants: watchdog: CPU 23 self-detected hard LOCKUP @ mm_get_cid+0xe8/0x188 NIP: mm_get_cid+0xe8/0x188 LR: mm_get_cid+0x108/0x188 mm_cid_switch_to+0x3c4/0x52c __schedule+0x47c/0x700 schedule_idle+0x3c/0x64 do_idle+0x160/0x1b0 cpu_startup_entry+0x48/0x50 start_secondary+0x284/0x288 start_secondary_prolog+0x10/0x14 watchdog: CPU 11 self-detected hard LOCKUP @ plpar_hcall_norets_notrace+0x18/0x2c NIP: plpar_hcall_norets_notrace+0x18/0x2c LR: queued_spin_lock_slowpath+0xd88/0x15d0 _raw_spin_lock+0x80/0xa0 raw_spin_rq_lock_nested+0x3c/0xf8 mm_cid_fixup_cpus_to_tasks+0xc8/0x28c sched_mm_cid_exit+0x108/0x22c do_exit+0xf4/0x5d0 make_task_dead+0x0/0x178 system_call_exception+0x128/0x390 system_call_vectored_common+0x15c/0x2ec The task on CPU11 is running the CID ownership mode change fixup function and is stuck on a runqueue lock. The task on CPU23 is trying to get a CID from the pool with the same runqueue lock held, but the pool is empty. After decoding a similar issue in the opposite direction switching from per task to per CPU mode the tool which models the possible scenarios failed to come up with a similar loophole. This showed up only once, was not reproducible and according to tooling not related to an overlooked scheduling scenario permutation. But the fact that it was observed on a PowerPC system gave the right hint: PowerPC is a weakly ordered architecture. The transition mechanism does: WRITE_ONCE(mm->mm_cid.transit, MM_CID_TRANSIT); WRITE_ONCE(mm->mm_cid.percpu, new_mode); fixup() WRITE_ONCE(mm->mm_cid.transit, 0); mm_cid_schedin() does: if (!READ_ONCE(mm->mm_cid.percpu)) ... cid |= READ_ONCE(mm->mm_cid.transit); so weakly ordered systems can observe percpu == false and transit == 0 even if the fixup function has not yet completed.
As a consequence the task will not drop the CID when scheduling out before the fixup is completed, which means the CID space can be exhausted and the next task scheduling in will loop in mm_get_cid() and the fixup thread can livelock on the held runqueue lock as above. This could obviously be solved by using: smp_store_release(&mm->mm_cid.percpu, true); and smp_load_acquire(&mm->mm_cid.percpu); but that brings a memory barrier back into the scheduler hotpath, which was just designed out by the CID rewrite. That can be completely avoided by combining the per CPU mode and the transit storage into a single mm_cid::mode member and ordering the stores against the fixup functions to prevent the CPU from reordering them. That makes the update of both states atomic and a concurrent read observes always consistent state. The price is an additional AND operation in mm_cid_schedin() to evaluate the per CPU or the per task path, but that's in the noise even on strongly ordered architectures as the actual load can be significantly more expensive and the conditional branch evaluation is there anyway. Fixes: fbd0e71 ("sched/mmcid: Provide CID ownership mode fixup functions") Closes: https://lore.kernel.org/bdfea828-4585-40e8-8835-247c6a8a76b0@linux.ibm.com Reported-by: Shrikanth Hegde <sshegde@linux.ibm.com> Signed-off-by: Thomas Gleixner <tglx@kernel.org> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Link: https://patch.msgid.link/20260201192834.965217106@kernel.org
1 parent 4327fb1 commit 47ee94e

3 files changed

Lines changed: 58 additions & 35 deletions

File tree

include/linux/rseq_types.h

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,8 +121,7 @@ struct mm_cid_pcpu {
121121
/**
122122
* struct mm_mm_cid - Storage for per MM CID data
123123
* @pcpu: Per CPU storage for CIDs associated to a CPU
124-
* @percpu: Set, when CIDs are in per CPU mode
125-
* @transit: Set to MM_CID_TRANSIT during a mode change transition phase
124+
* @mode: Indicates per CPU and transition mode
126125
* @max_cids: The exclusive maximum CID value for allocation and convergence
127126
* @irq_work: irq_work to handle the affinity mode change case
128127
* @work: Regular work to handle the affinity mode change case
@@ -139,8 +138,7 @@ struct mm_cid_pcpu {
139138
struct mm_mm_cid {
140139
/* Hotpath read mostly members */
141140
struct mm_cid_pcpu __percpu *pcpu;
142-
unsigned int percpu;
143-
unsigned int transit;
141+
unsigned int mode;
144142
unsigned int max_cids;
145143

146144
/* Rarely used. Moves @lock and @mutex into the second cacheline */

kernel/sched/core.c

Lines changed: 44 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -10297,16 +10297,25 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
1029710297
*
1029810298
* Mode switching:
1029910299
*
10300+
* The ownership mode is per process and stored in mm:mm_cid::mode with the
10301+
* following possible states:
10302+
*
10303+
* 0: Per task ownership
10304+
* 0 | MM_CID_TRANSIT: Transition from per CPU to per task
10305+
* MM_CID_ONCPU: Per CPU ownership
10306+
* MM_CID_ONCPU | MM_CID_TRANSIT: Transition from per task to per CPU
10307+
*
1030010308
* All transitions of ownership mode happen in two phases:
1030110309
*
10302-
* 1) mm:mm_cid.transit contains MM_CID_TRANSIT. This is OR'ed on the CIDs
10303-
* and denotes that the CID is only temporarily owned by a task. When
10304-
* the task schedules out it drops the CID back into the pool if this
10305-
* bit is set.
10310+
* 1) mm:mm_cid::mode has the MM_CID_TRANSIT bit set. This is OR'ed on the
10311+
* CIDs and denotes that the CID is only temporarily owned by a
10312+
* task. When the task schedules out it drops the CID back into the
10313+
* pool if this bit is set.
1030610314
*
1030710315
* 2) The initiating context walks the per CPU space or the tasks to fixup
10308-
* or drop the CIDs and after completion it clears mm:mm_cid.transit.
10309-
* After that point the CIDs are strictly task or CPU owned again.
10316+
* or drop the CIDs and after completion it clears MM_CID_TRANSIT in
10317+
* mm:mm_cid::mode. After that point the CIDs are strictly task or CPU
10318+
* owned again.
1031010319
*
1031110320
* This two phase transition is required to prevent CID space exhaustion
1031210321
* during the transition as a direct transfer of ownership would fail:
@@ -10411,6 +10420,7 @@ static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc)
1041110420
static bool mm_update_max_cids(struct mm_struct *mm)
1041210421
{
1041310422
struct mm_mm_cid *mc = &mm->mm_cid;
10423+
bool percpu = cid_on_cpu(mc->mode);
1041410424

1041510425
lockdep_assert_held(&mm->mm_cid.lock);
1041610426

@@ -10419,7 +10429,7 @@ static bool mm_update_max_cids(struct mm_struct *mm)
1041910429
__mm_update_max_cids(mc);
1042010430

1042110431
/* Check whether owner mode must be changed */
10422-
if (!mc->percpu) {
10432+
if (!percpu) {
1042310433
/* Enable per CPU mode when the number of users is above max_cids */
1042410434
if (mc->users > mc->max_cids)
1042510435
mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
@@ -10430,12 +10440,17 @@ static bool mm_update_max_cids(struct mm_struct *mm)
1043010440
}
1043110441

1043210442
/* Mode change required? */
10433-
if (!!mc->percpu == !!mc->pcpu_thrs)
10443+
if (percpu == !!mc->pcpu_thrs)
1043410444
return false;
1043510445

10436-
/* Set the transition flag to bridge the transfer */
10437-
WRITE_ONCE(mc->transit, MM_CID_TRANSIT);
10438-
WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs);
10446+
/* Flip the mode and set the transition flag to bridge the transfer */
10447+
WRITE_ONCE(mc->mode, mc->mode ^ (MM_CID_TRANSIT | MM_CID_ONCPU));
10448+
/*
10449+
* Order the store against the subsequent fixups so that
10450+
* acquire(rq::lock) cannot be reordered by the CPU before the
10451+
* store.
10452+
*/
10453+
smp_mb();
1043910454
return true;
1044010455
}
1044110456

@@ -10460,7 +10475,7 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu
1046010475

1046110476
WRITE_ONCE(mc->nr_cpus_allowed, weight);
1046210477
__mm_update_max_cids(mc);
10463-
if (!mc->percpu)
10478+
if (!cid_on_cpu(mc->mode))
1046410479
return;
1046510480

1046610481
/* Adjust the threshold to the wider set */
@@ -10478,6 +10493,16 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu
1047810493
irq_work_queue(&mc->irq_work);
1047910494
}
1048010495

10496+
static inline void mm_cid_complete_transit(struct mm_struct *mm, unsigned int mode)
10497+
{
10498+
/*
10499+
* Ensure that the store removing the TRANSIT bit cannot be
10500+
* reordered by the CPU before the fixups have been completed.
10501+
*/
10502+
smp_mb();
10503+
WRITE_ONCE(mm->mm_cid.mode, mode);
10504+
}
10505+
1048110506
static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp)
1048210507
{
1048310508
if (cid_on_cpu(t->mm_cid.cid)) {
@@ -10521,8 +10546,7 @@ static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
1052110546
}
1052210547
}
1052310548
}
10524-
/* Clear the transition bit */
10525-
WRITE_ONCE(mm->mm_cid.transit, 0);
10549+
mm_cid_complete_transit(mm, 0);
1052610550
}
1052710551

1052810552
static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
@@ -10594,8 +10618,7 @@ static void mm_cid_fixup_tasks_to_cpus(void)
1059410618
struct mm_struct *mm = current->mm;
1059510619

1059610620
mm_cid_do_fixup_tasks_to_cpus(mm);
10597-
/* Clear the transition bit */
10598-
WRITE_ONCE(mm->mm_cid.transit, 0);
10621+
mm_cid_complete_transit(mm, MM_CID_ONCPU);
1059910622
}
1060010623

1060110624
static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
@@ -10626,13 +10649,13 @@ void sched_mm_cid_fork(struct task_struct *t)
1062610649
}
1062710650

1062810651
if (!sched_mm_cid_add_user(t, mm)) {
10629-
if (!mm->mm_cid.percpu)
10652+
if (!cid_on_cpu(mm->mm_cid.mode))
1063010653
t->mm_cid.cid = mm_get_cid(mm);
1063110654
return;
1063210655
}
1063310656

1063410657
/* Handle the mode change and transfer current's CID */
10635-
percpu = !!mm->mm_cid.percpu;
10658+
percpu = cid_on_cpu(mm->mm_cid.mode);
1063610659
if (!percpu)
1063710660
mm_cid_transit_to_task(current, pcp);
1063810661
else
@@ -10671,7 +10694,7 @@ static bool __sched_mm_cid_exit(struct task_struct *t)
1067110694
* affinity change increased the number of allowed CPUs and the
1067210695
* deferred fixup did not run yet.
1067310696
*/
10674-
if (WARN_ON_ONCE(mm->mm_cid.percpu))
10697+
if (WARN_ON_ONCE(cid_on_cpu(mm->mm_cid.mode)))
1067510698
return false;
1067610699
/*
1067710700
* A failed fork(2) cleanup never gets here, so @current must have
@@ -10762,7 +10785,7 @@ static void mm_cid_work_fn(struct work_struct *work)
1076210785
if (!mm_update_max_cids(mm))
1076310786
return;
1076410787
/* Affinity changes can only switch back to task mode */
10765-
if (WARN_ON_ONCE(mm->mm_cid.percpu))
10788+
if (WARN_ON_ONCE(cid_on_cpu(mm->mm_cid.mode)))
1076610789
return;
1076710790
}
1076810791
mm_cid_fixup_cpus_to_tasks(mm);
@@ -10783,8 +10806,7 @@ static void mm_cid_irq_work(struct irq_work *work)
1078310806
void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
1078410807
{
1078510808
mm->mm_cid.max_cids = 0;
10786-
mm->mm_cid.percpu = 0;
10787-
mm->mm_cid.transit = 0;
10809+
mm->mm_cid.mode = 0;
1078810810
mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
1078910811
mm->mm_cid.users = 0;
1079010812
mm->mm_cid.pcpu_thrs = 0;

kernel/sched/sched.h

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3816,7 +3816,8 @@ static __always_inline void mm_cid_update_pcpu_cid(struct mm_struct *mm, unsigne
38163816
__this_cpu_write(mm->mm_cid.pcpu->cid, cid);
38173817
}
38183818

3819-
static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid)
3819+
static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid,
3820+
unsigned int mode)
38203821
{
38213822
unsigned int max_cids, tcid = t->mm_cid.cid;
38223823
struct mm_struct *mm = t->mm;
@@ -3842,15 +3843,16 @@ static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int
38423843
if (!cid_on_cpu(cpu_cid))
38433844
cpu_cid = cid_to_cpu_cid(mm_get_cid(mm));
38443845

3845-
/* Set the transition mode flag if required */
3846-
if (READ_ONCE(mm->mm_cid.transit))
3846+
/* Handle the transition mode flag if required */
3847+
if (mode & MM_CID_TRANSIT)
38473848
cpu_cid = cpu_cid_to_cid(cpu_cid) | MM_CID_TRANSIT;
38483849
}
38493850
mm_cid_update_pcpu_cid(mm, cpu_cid);
38503851
mm_cid_update_task_cid(t, cpu_cid);
38513852
}
38523853

3853-
static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid)
3854+
static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid,
3855+
unsigned int mode)
38543856
{
38553857
unsigned int max_cids, tcid = t->mm_cid.cid;
38563858
struct mm_struct *mm = t->mm;
@@ -3876,7 +3878,7 @@ static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int
38763878
if (!cid_on_task(tcid))
38773879
tcid = mm_get_cid(mm);
38783880
/* Set the transition mode flag if required */
3879-
tcid |= READ_ONCE(mm->mm_cid.transit);
3881+
tcid |= mode & MM_CID_TRANSIT;
38803882
}
38813883
mm_cid_update_pcpu_cid(mm, tcid);
38823884
mm_cid_update_task_cid(t, tcid);
@@ -3885,16 +3887,17 @@ static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int
38853887
static __always_inline void mm_cid_schedin(struct task_struct *next)
38863888
{
38873889
struct mm_struct *mm = next->mm;
3888-
unsigned int cpu_cid;
3890+
unsigned int cpu_cid, mode;
38893891

38903892
if (!next->mm_cid.active)
38913893
return;
38923894

38933895
cpu_cid = __this_cpu_read(mm->mm_cid.pcpu->cid);
3894-
if (likely(!READ_ONCE(mm->mm_cid.percpu)))
3895-
mm_cid_from_task(next, cpu_cid);
3896+
mode = READ_ONCE(mm->mm_cid.mode);
3897+
if (likely(!cid_on_cpu(mode)))
3898+
mm_cid_from_task(next, cpu_cid, mode);
38963899
else
3897-
mm_cid_from_cpu(next, cpu_cid);
3900+
mm_cid_from_cpu(next, cpu_cid, mode);
38983901
}
38993902

39003903
static __always_inline void mm_cid_schedout(struct task_struct *prev)

0 commit comments

Comments
 (0)