Skip to content

Commit bfd3749

Browse files
committed
sched_ext: Use shorter slice in bypass mode
There have been reported cases of bypass mode not making forward progress fast enough. The 20ms default slice is unnecessarily long for bypass mode where the primary goal is ensuring all tasks can make forward progress. Introduce SCX_SLICE_BYPASS set to 5ms and make the scheduler automatically switch to it when entering bypass mode. Also make the bypass slice value tunable through the slice_bypass_us module parameter (adjustable between 100us and 100ms) to make it easier to test whether slice durations are a factor in problem cases. v3: Use READ_ONCE/WRITE_ONCE for scx_slice_dfl access (Dan). v2: Removed slice_dfl_us module parameter. Fixed typos (Andrea). Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com> Reviewed-by: Andrea Righi <arighi@nvidia.com> Cc: Dan Schatzberg <schatzberg.dan@gmail.com> Signed-off-by: Tejun Heo <tj@kernel.org>
1 parent 5a629ec commit bfd3749

2 files changed

Lines changed: 42 additions & 3 deletions

File tree

include/linux/sched/ext.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,18 @@
 enum scx_public_consts {
 	SCX_OPS_NAME_LEN	= 128,
 
+	/*
+	 * %SCX_SLICE_DFL is used to refill slices when the BPF scheduler misses
+	 * to set the slice for a task that is selected for execution.
+	 * %SCX_EV_REFILL_SLICE_DFL counts the number of times the default slice
+	 * refill has been triggered.
+	 *
+	 * %SCX_SLICE_BYPASS is used as the slice for all tasks in the bypass
+	 * mode. As making forward progress for all tasks is the main goal of
+	 * the bypass mode, a shorter slice is used.
+	 */
 	SCX_SLICE_DFL		= 20 * 1000000,	/* 20ms */
+	SCX_SLICE_BYPASS	= 5 * 1000000,	/* 5ms */
 	SCX_SLICE_INF		= U64_MAX,	/* infinite, implies nohz */
 };

kernel/sched/ext.c

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,32 @@ static struct scx_dump_data scx_dump_data = {
 /* /sys/kernel/sched_ext interface */
 static struct kset *scx_kset;
 
+/*
+ * Parameters that can be adjusted through /sys/module/sched_ext/parameters.
+ * There usually is no reason to modify these as normal scheduler operation
+ * shouldn't be affected by them. The knobs are primarily for debugging.
+ */
+static u64 scx_slice_dfl = SCX_SLICE_DFL;
+static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC;
+
+static int set_slice_us(const char *val, const struct kernel_param *kp)
+{
+	return param_set_uint_minmax(val, kp, 100, 100 * USEC_PER_MSEC);
+}
+
+static const struct kernel_param_ops slice_us_param_ops = {
+	.set = set_slice_us,
+	.get = param_get_uint,
+};
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "sched_ext."
+
+module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600);
+MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)");
+
+#undef MODULE_PARAM_PREFIX
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched_ext.h>


@@ -919,7 +945,7 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
 
 static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
 {
-	p->scx.slice = SCX_SLICE_DFL;
+	p->scx.slice = READ_ONCE(scx_slice_dfl);
 	__scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);
 }

@@ -2896,7 +2922,7 @@ void init_scx_entity(struct sched_ext_entity *scx)
 	INIT_LIST_HEAD(&scx->runnable_node);
 	scx->runnable_at = jiffies;
 	scx->ddsp_dsq_id = SCX_DSQ_INVALID;
-	scx->slice = SCX_SLICE_DFL;
+	scx->slice = READ_ONCE(scx_slice_dfl);
 }
 
 void scx_pre_fork(struct task_struct *p)
@@ -3774,6 +3800,7 @@ static void scx_bypass(bool bypass)
 		WARN_ON_ONCE(scx_bypass_depth <= 0);
 		if (scx_bypass_depth != 1)
 			goto unlock;
+		WRITE_ONCE(scx_slice_dfl, scx_slice_bypass_us * NSEC_PER_USEC);
 		bypass_timestamp = ktime_get_ns();
 		if (sch)
 			scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
@@ -3782,6 +3809,7 @@ static void scx_bypass(bool bypass)
 		WARN_ON_ONCE(scx_bypass_depth < 0);
 		if (scx_bypass_depth != 0)
 			goto unlock;
+		WRITE_ONCE(scx_slice_dfl, SCX_SLICE_DFL);
 		if (sch)
 			scx_add_event(sch, SCX_EV_BYPASS_DURATION,
 				      ktime_get_ns() - bypass_timestamp);
@@ -4780,7 +4808,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 			queue_flags |= DEQUEUE_CLASS;
 
 		scoped_guard (sched_change, p, queue_flags) {
-			p->scx.slice = SCX_SLICE_DFL;
+			p->scx.slice = READ_ONCE(scx_slice_dfl);
 			p->sched_class = new_class;
 		}
 	}

0 commit comments

Comments (0)