Skip to content

Commit 75b607f

Browse files
committed
Merge tag 'sched_ext-for-6.12-rc2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext
Pull sched_ext fixes from Tejun Heo:

 - ops.enqueue() didn't have a way to tell whether select_task_rq_scx()
   and thus ops.select() were skipped. Some schedulers were incorrectly
   using SCX_ENQ_WAKEUP. Add SCX_ENQ_CPU_SELECTED and fix scx_qmap using
   it.

 - Remove a spurious WARN_ON_ONCE() in scx_cgroup_exit()

 - Fix error information clobbering during load

 - Add missing __weak markers to BPF helper declarations

 - Doc update

* tag 'sched_ext-for-6.12-rc2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
  sched_ext: Documentation: Update instructions for running example schedulers
  sched_ext, scx_qmap: Add and use SCX_ENQ_CPU_SELECTED
  sched/core: Add ENQUEUE_RQ_SELECTED to indicate whether ->select_task_rq() was called
  sched/core: Make select_task_rq() take the pointer to wake_flags instead of value
  sched_ext: scx_cgroup_exit() may be called without successful scx_cgroup_init()
  sched_ext: Improve error reporting during loading
  sched_ext: Add __weak markers to BPF helper function decalarations
2 parents 5b7c893 + e0ed521 commit 75b607f

6 files changed

Lines changed: 43 additions & 25 deletions

File tree

Documentation/scheduler/sched-ext.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ BPF scheduler and reverts all tasks back to CFS.
6666
.. code-block:: none
6767
6868
# make -j16 -C tools/sched_ext
69-
# tools/sched_ext/scx_simple
69+
# tools/sched_ext/build/bin/scx_simple
7070
local=0 global=3
7171
local=5 global=24
7272
local=9 global=44

kernel/sched/core.c

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3518,14 +3518,16 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
35183518
* The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
35193519
*/
35203520
static inline
3521-
int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
3521+
int select_task_rq(struct task_struct *p, int cpu, int *wake_flags)
35223522
{
35233523
lockdep_assert_held(&p->pi_lock);
35243524

3525-
if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
3526-
cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
3527-
else
3525+
if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) {
3526+
cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags);
3527+
*wake_flags |= WF_RQ_SELECTED;
3528+
} else {
35283529
cpu = cpumask_any(p->cpus_ptr);
3530+
}
35293531

35303532
/*
35313533
* In order not to call set_task_cpu() on a blocking task we need
@@ -3659,6 +3661,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
36593661
rq->nr_uninterruptible--;
36603662

36613663
#ifdef CONFIG_SMP
3664+
if (wake_flags & WF_RQ_SELECTED)
3665+
en_flags |= ENQUEUE_RQ_SELECTED;
36623666
if (wake_flags & WF_MIGRATED)
36633667
en_flags |= ENQUEUE_MIGRATED;
36643668
else
@@ -4120,6 +4124,8 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
41204124
guard(preempt)();
41214125
int cpu, success = 0;
41224126

4127+
wake_flags |= WF_TTWU;
4128+
41234129
if (p == current) {
41244130
/*
41254131
* We're waking current, this means 'p->on_rq' and 'task_cpu(p)
@@ -4252,7 +4258,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
42524258
*/
42534259
smp_cond_load_acquire(&p->on_cpu, !VAL);
42544260

4255-
cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
4261+
cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
42564262
if (task_cpu(p) != cpu) {
42574263
if (p->in_iowait) {
42584264
delayacct_blkio_end(p);
@@ -4793,6 +4799,7 @@ void wake_up_new_task(struct task_struct *p)
47934799
{
47944800
struct rq_flags rf;
47954801
struct rq *rq;
4802+
int wake_flags = WF_FORK;
47964803

47974804
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
47984805
WRITE_ONCE(p->__state, TASK_RUNNING);
@@ -4807,15 +4814,15 @@ void wake_up_new_task(struct task_struct *p)
48074814
*/
48084815
p->recent_used_cpu = task_cpu(p);
48094816
rseq_migrate(p);
4810-
__set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK));
4817+
__set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags));
48114818
#endif
48124819
rq = __task_rq_lock(p, &rf);
48134820
update_rq_clock(rq);
48144821
post_init_entity_util_avg(p);
48154822

48164823
activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL);
48174824
trace_sched_wakeup_new(p);
4818-
wakeup_preempt(rq, p, WF_FORK);
4825+
wakeup_preempt(rq, p, wake_flags);
48194826
#ifdef CONFIG_SMP
48204827
if (p->sched_class->task_woken) {
48214828
/*

kernel/sched/ext.c

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -625,6 +625,10 @@ struct sched_ext_ops {
625625
/**
626626
* exit - Clean up after the BPF scheduler
627627
* @info: Exit info
628+
*
629+
* ops.exit() is also called on ops.init() failure, which is a bit
630+
* unusual. This is to allow rich reporting through @info on how
631+
* ops.init() failed.
628632
*/
629633
void (*exit)(struct scx_exit_info *info);
630634

@@ -692,6 +696,7 @@ enum scx_enq_flags {
692696
/* expose select ENQUEUE_* flags as enums */
693697
SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP,
694698
SCX_ENQ_HEAD = ENQUEUE_HEAD,
699+
SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED,
695700

696701
/* high 32bits are SCX specific */
697702

@@ -4048,7 +4053,6 @@ static void scx_cgroup_exit(void)
40484053

40494054
percpu_rwsem_assert_held(&scx_cgroup_rwsem);
40504055

4051-
WARN_ON_ONCE(!scx_cgroup_enabled);
40524056
scx_cgroup_enabled = false;
40534057

40544058
/*
@@ -4117,6 +4121,7 @@ static int scx_cgroup_init(void)
41174121
css->cgroup, &args);
41184122
if (ret) {
41194123
css_put(css);
4124+
scx_ops_error("ops.cgroup_init() failed (%d)", ret);
41204125
return ret;
41214126
}
41224127
tg->scx_flags |= SCX_TG_INITED;
@@ -5041,6 +5046,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
50415046
if (ret) {
50425047
ret = ops_sanitize_err("init", ret);
50435048
cpus_read_unlock();
5049+
scx_ops_error("ops.init() failed (%d)", ret);
50445050
goto err_disable;
50455051
}
50465052
}
@@ -5150,8 +5156,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
51505156
spin_lock_irq(&scx_tasks_lock);
51515157
scx_task_iter_exit(&sti);
51525158
spin_unlock_irq(&scx_tasks_lock);
5153-
pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n",
5154-
ret, p->comm, p->pid);
5159+
scx_ops_error("ops.init_task() failed (%d) for %s[%d]",
5160+
ret, p->comm, p->pid);
51555161
goto err_disable_unlock_all;
51565162
}
51575163

@@ -5199,14 +5205,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
51995205

52005206
scx_ops_bypass(false);
52015207

5202-
/*
5203-
* Returning an error code here would lose the recorded error
5204-
* information. Exit indicating success so that the error is notified
5205-
* through ops.exit() with all the details.
5206-
*/
52075208
if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
52085209
WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
5209-
ret = 0;
52105210
goto err_disable;
52115211
}
52125212

@@ -5241,10 +5241,18 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
52415241
scx_ops_bypass(false);
52425242
err_disable:
52435243
mutex_unlock(&scx_ops_enable_mutex);
5244-
/* must be fully disabled before returning */
5245-
scx_ops_disable(SCX_EXIT_ERROR);
5244+
/*
5245+
* Returning an error code here would not pass all the error information
5246+
* to userspace. Record errno using scx_ops_error() for cases
5247+
* scx_ops_error() wasn't already invoked and exit indicating success so
5248+
* that the error is notified through ops.exit() with all the details.
5249+
*
5250+
* Flush scx_ops_disable_work to ensure that error is reported before
5251+
* init completion.
5252+
*/
5253+
scx_ops_error("scx_ops_enable() failed (%d)", ret);
52465254
kthread_flush_work(&scx_ops_disable_work);
5247-
return ret;
5255+
return 0;
52485256
}
52495257

52505258

kernel/sched/sched.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2292,6 +2292,7 @@ static inline int task_on_rq_migrating(struct task_struct *p)
22922292
#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */
22932293
#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
22942294
#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */
2295+
#define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */
22952296

22962297
#ifdef CONFIG_SMP
22972298
static_assert(WF_EXEC == SD_BALANCE_EXEC);
@@ -2334,6 +2335,7 @@ extern const u32 sched_prio_to_wmult[40];
23342335
* ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
23352336
* ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
23362337
* ENQUEUE_MIGRATED - the task was migrated during wakeup
2338+
* ENQUEUE_RQ_SELECTED - ->select_task_rq() was called
23372339
*
23382340
*/
23392341

@@ -2360,6 +2362,7 @@ extern const u32 sched_prio_to_wmult[40];
23602362
#define ENQUEUE_INITIAL 0x80
23612363
#define ENQUEUE_MIGRATING 0x100
23622364
#define ENQUEUE_DELAYED 0x200
2365+
#define ENQUEUE_RQ_SELECTED 0x400
23632366

23642367
#define RETRY_TASK ((void *)-1UL)
23652368

tools/sched_ext/include/scx/common.bpf.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@ void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vt
4141
u32 scx_bpf_dispatch_nr_slots(void) __ksym;
4242
void scx_bpf_dispatch_cancel(void) __ksym;
4343
bool scx_bpf_consume(u64 dsq_id) __ksym;
44-
void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym;
45-
void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym;
44+
void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
45+
void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
4646
bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
4747
bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
4848
u32 scx_bpf_reenqueue_local(void) __ksym;
@@ -71,7 +71,7 @@ s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
7171
bool scx_bpf_task_running(const struct task_struct *p) __ksym;
7272
s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
7373
struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
74-
struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym;
74+
struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak;
7575

7676
/*
7777
* Use the following as @it__iter when calling

tools/sched_ext/scx_qmap.bpf.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -230,8 +230,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
230230
return;
231231
}
232232

233-
/* if !WAKEUP, select_cpu() wasn't called, try direct dispatch */
234-
if (!(enq_flags & SCX_ENQ_WAKEUP) &&
233+
/* if select_cpu() wasn't called, try direct dispatch */
234+
if (!(enq_flags & SCX_ENQ_CPU_SELECTED) &&
235235
(cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
236236
__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
237237
scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);

0 commit comments

Comments
 (0)