Skip to content

Commit 5ebec44

Browse files
committed
sched_ext: Exit dispatch and move operations immediately when aborting
62dcbab ("sched_ext: Avoid live-locking bypass mode switching") introduced the breather mechanism to inject delays during bypass mode switching. It maintains operation semantics unchanged while reducing lock contention to avoid live-locks on large NUMA systems. However, the breather only activates when exiting the scheduler, so there's no need to maintain operation semantics.

Simplify by exiting dispatch and move operations immediately when scx_aborting is set. In consume_dispatch_q(), break out of the task iteration loop. In scx_dsq_move(), return early before acquiring locks.

This also fixes cases the breather mechanism cannot handle. When a large system has many runnable threads affinitized to different CPU subsets and the BPF scheduler places them all into a single DSQ, many CPUs can scan the DSQ concurrently for tasks they can run. This can cause DSQ and RQ locks to be held for extended periods, leading to various failure modes. The breather cannot solve this because once in the consume loop, there's no exit. The new mechanism fixes this by exiting the loop immediately. The bypass DSQ is exempted to ensure the bypass mechanism itself can make progress.

v2: Use READ_ONCE() when reading scx_aborting (Andrea Righi).

Reported-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Reviewed-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Cc: Andrea Righi <arighi@nvidia.com>
Cc: Emil Tsalapatis <etsal@meta.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
1 parent a69040e commit 5ebec44

1 file changed

Lines changed: 18 additions & 44 deletions

File tree

kernel/sched/ext.c

Lines changed: 18 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1818,48 +1818,11 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch,
18181818
return dst_rq;
18191819
}
18201820

1821-
/*
1822-
* A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly
1823-
* banging on the same DSQ on a large NUMA system to the point where switching
1824-
* to the bypass mode can take a long time. Inject artificial delays while the
1825-
* bypass mode is switching to guarantee timely completion.
1826-
*/
1827-
static void scx_breather(struct rq *rq)
1828-
{
1829-
u64 until;
1830-
1831-
lockdep_assert_rq_held(rq);
1832-
1833-
if (likely(!READ_ONCE(scx_aborting)))
1834-
return;
1835-
1836-
raw_spin_rq_unlock(rq);
1837-
1838-
until = ktime_get_ns() + NSEC_PER_MSEC;
1839-
1840-
do {
1841-
int cnt = 1024;
1842-
while (READ_ONCE(scx_aborting) && --cnt)
1843-
cpu_relax();
1844-
} while (READ_ONCE(scx_aborting) &&
1845-
time_before64(ktime_get_ns(), until));
1846-
1847-
raw_spin_rq_lock(rq);
1848-
}
1849-
18501821
static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq,
18511822
struct scx_dispatch_q *dsq)
18521823
{
18531824
struct task_struct *p;
18541825
retry:
1855-
/*
1856-
* This retry loop can repeatedly race against scx_bypass() dequeueing
1857-
* tasks from @dsq trying to put the system into the bypass mode. On
1858-
* some multi-socket machines (e.g. 2x Intel 8480c), this can live-lock
1859-
* the machine into soft lockups. Give a breather.
1860-
*/
1861-
scx_breather(rq);
1862-
18631826
/*
18641827
* The caller can't expect to successfully consume a task if the task's
18651828
* addition to @dsq isn't guaranteed to be visible somehow. Test
@@ -1873,6 +1836,17 @@ static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq,
18731836
nldsq_for_each_task(p, dsq) {
18741837
struct rq *task_rq = task_rq(p);
18751838

1839+
/*
1840+
* This loop can lead to multiple lockup scenarios, e.g. the BPF
1841+
* scheduler can put an enormous number of affinitized tasks into
1842+
* a contended DSQ, or the outer retry loop can repeatedly race
1843+
* against scx_bypass() dequeueing tasks from @dsq trying to put
1844+
* the system into the bypass mode. This can easily live-lock the
1845+
* machine. If aborting, exit from all non-bypass DSQs.
1846+
*/
1847+
if (unlikely(READ_ONCE(scx_aborting)) && dsq->id != SCX_DSQ_BYPASS)
1848+
break;
1849+
18761850
if (rq == task_rq) {
18771851
task_unlink_from_dsq(p, dsq);
18781852
move_local_task_to_local_dsq(p, 0, dsq, rq);
@@ -5636,6 +5610,13 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
56365610
!scx_kf_allowed(sch, SCX_KF_DISPATCH))
56375611
return false;
56385612

5613+
/*
5614+
* If the BPF scheduler keeps calling this function repeatedly, it can
5615+
* cause similar live-lock conditions as consume_dispatch_q().
5616+
*/
5617+
if (unlikely(READ_ONCE(scx_aborting)))
5618+
return false;
5619+
56395620
/*
56405621
* Can be called from either ops.dispatch() locking this_rq() or any
56415622
* context where no rq lock is held. If latter, lock @p's task_rq which
@@ -5656,13 +5637,6 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
56565637
raw_spin_rq_lock(src_rq);
56575638
}
56585639

5659-
/*
5660-
* If the BPF scheduler keeps calling this function repeatedly, it can
5661-
* cause similar live-lock conditions as consume_dispatch_q(). Insert a
5662-
* breather if necessary.
5663-
*/
5664-
scx_breather(src_rq);
5665-
56665640
locked_rq = src_rq;
56675641
raw_spin_lock(&src_dsq->lock);
56685642

0 commit comments

Comments (0)