Skip to content

Commit 61debc2

Browse files
committed
sched_ext: Use per-CPU DSQs instead of per-node global DSQs in bypass mode
Bypass mode routes tasks through fallback dispatch queues. The fallback was originally a single global DSQ; b7b3b2d ("sched_ext: Split the global DSQ per NUMA node") changed this to per-node DSQs to resolve NUMA-related livelocks.

Dan Schatzberg found per-node DSQs can still livelock when many threads are pinned to different small CPU subsets: each CPU must scan many incompatible tasks to find runnable ones, causing severe contention with high CPU counts.

Switch to per-CPU bypass DSQs. Each task queues on its current CPU. Default idle CPU selection and direct dispatch handle most cases well.

This introduces a failure mode when tasks concentrate on one CPU in over-saturated systems. If the BPF scheduler severely skews placement before triggering bypass, that CPU's queue may be too long to drain, causing RCU stalls. A load balancer in a future patch will address this.

The bypass DSQ is separate from the local DSQ to enable load balancing: local DSQs use rq locks, preventing efficient scanning and transfer across CPUs, which is especially problematic when systems are already contended.

v2: Clarified why the bypass DSQ is separate from the local DSQ (Andrea Righi).

Reported-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Reviewed-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
1 parent 3546119 commit 61debc2

3 files changed

Lines changed: 15 additions & 3 deletions

File tree

include/linux/sched/ext.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,7 @@ enum scx_dsq_id_flags {
5757
SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0,
5858
SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1,
5959
SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2,
60+
SCX_DSQ_BYPASS = SCX_DSQ_FLAG_BUILTIN | 3,
6061
SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
6162
SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU,
6263
};

kernel/sched/ext.c

Lines changed: 13 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1298,7 +1298,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
12981298

12991299
if (scx_rq_bypassing(rq)) {
13001300
__scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
1301-
goto global;
1301+
goto bypass;
13021302
}
13031303

13041304
if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
@@ -1356,6 +1356,9 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
13561356
global:
13571357
dsq = find_global_dsq(sch, p);
13581358
goto enqueue;
1359+
bypass:
1360+
dsq = &task_rq(p)->scx.bypass_dsq;
1361+
goto enqueue;
13591362

13601363
enqueue:
13611364
/*
@@ -2154,8 +2157,14 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
21542157
if (consume_global_dsq(sch, rq))
21552158
goto has_tasks;
21562159

2157-
if (unlikely(!SCX_HAS_OP(sch, dispatch)) ||
2158-
scx_rq_bypassing(rq) || !scx_rq_online(rq))
2160+
if (scx_rq_bypassing(rq)) {
2161+
if (consume_dispatch_q(sch, rq, &rq->scx.bypass_dsq))
2162+
goto has_tasks;
2163+
else
2164+
goto no_tasks;
2165+
}
2166+
2167+
if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
21592168
goto no_tasks;
21602169

21612170
dspc->rq = rq;
@@ -5371,6 +5380,7 @@ void __init init_sched_ext_class(void)
53715380
int n = cpu_to_node(cpu);
53725381

53735382
init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
5383+
init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS);
53745384
INIT_LIST_HEAD(&rq->scx.runnable_list);
53755385
INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
53765386

kernel/sched/sched.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -808,6 +808,7 @@ struct scx_rq {
808808
struct balance_callback deferred_bal_cb;
809809
struct irq_work deferred_irq_work;
810810
struct irq_work kick_cpus_irq_work;
811+
struct scx_dispatch_q bypass_dsq;
811812
};
812813
#endif /* CONFIG_SCHED_CLASS_EXT */
813814

0 commit comments

Comments
 (0)