@@ -34,6 +34,8 @@ DEFINE_STATIC_KEY_FALSE(__scx_enabled);
 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
 static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED);
 static int scx_bypass_depth;
+static cpumask_var_t scx_bypass_lb_donee_cpumask;
+static cpumask_var_t scx_bypass_lb_resched_cpumask;
 static bool scx_aborting;
 static bool scx_init_task_enabled;
 static bool scx_switching_all;
@@ -149,6 +151,7 @@ static struct kset *scx_kset;
  */
 static u64 scx_slice_dfl = SCX_SLICE_DFL;
 static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC;
+static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US;
 
 static int set_slice_us(const char *val, const struct kernel_param *kp)
 {
@@ -160,11 +163,23 @@ static const struct kernel_param_ops slice_us_param_ops = {
 	.get = param_get_uint,
 };
 
+static int set_bypass_lb_intv_us(const char *val, const struct kernel_param *kp)
+{
+	return param_set_uint_minmax(val, kp, 0, 10 * USEC_PER_SEC);
+}
+
+static const struct kernel_param_ops bypass_lb_intv_us_param_ops = {
+	.set = set_bypass_lb_intv_us,
+	.get = param_get_uint,
+};
+
 #undef MODULE_PARAM_PREFIX
 #define MODULE_PARAM_PREFIX "sched_ext."
 
 module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600);
 MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)");
+module_param_cb(bypass_lb_intv_us, &bypass_lb_intv_us_param_ops, &scx_bypass_lb_intv_us, 0600);
+MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microseconds (0 (disable) to 10s)");
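+/*
+ * With the "sched_ext." prefix above, the interval should be tunable at
+ * runtime via /sys/module/sched_ext/parameters/bypass_lb_intv_us; writing
+ * 0 disables the bypass load balancer.
+ */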
 
 #undef MODULE_PARAM_PREFIX
 
@@ -962,7 +977,9 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
 		     !RB_EMPTY_NODE(&p->scx.dsq_priq));
 
 	if (!is_local) {
-		raw_spin_lock(&dsq->lock);
+		raw_spin_lock_nested(&dsq->lock,
+				     (enq_flags & SCX_ENQ_NESTED) ? SINGLE_DEPTH_NESTING : 0);
+
 		if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
 			scx_error(sch, "attempting to dispatch to a destroyed dsq");
 			/* fall back to the global dsq */
@@ -3744,6 +3761,207 @@ bool scx_hardlockup(void)
 	return true;
 }
 
+static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
+			 struct cpumask *donee_mask, struct cpumask *resched_mask,
+			 u32 nr_donor_target, u32 nr_donee_target)
+{
+	struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq;
+	struct task_struct *p, *n;
+	struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, 0, 0);
+	s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target;
+	u32 nr_balanced = 0, min_delta_us;
+
+	/*
+	 * All we want to guarantee is reasonable forward progress. No reason to
+	 * fine tune. Assuming every task on @donor_dsq runs its full slice,
+	 * consider offloading iff the total queued duration is over the
+	 * threshold.
+	 */
+	min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV;
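+	/* @delta is in tasks; convert the time threshold to a task count too */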
+	if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us))
+		return 0;
+
+	raw_spin_rq_lock_irq(rq);
+	raw_spin_lock(&donor_dsq->lock);
+	list_add(&cursor.node, &donor_dsq->list);
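+	/*
+	 * @cursor is linked into @donor_dsq like a task and marks the
+	 * iteration position so that the locks can be dropped and the walk
+	 * resumed mid-list (see the batching below).
+	 */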
+resume:
+	n = container_of(&cursor, struct task_struct, scx.dsq_list);
+	n = nldsq_next_task(donor_dsq, n, false);
+
+	while ((p = n)) {
+		struct rq *donee_rq;
+		struct scx_dispatch_q *donee_dsq;
+		int donee;
+
+		n = nldsq_next_task(donor_dsq, n, false);
+
+		if (donor_dsq->nr <= nr_donor_target)
+			break;
+
+		if (cpumask_empty(donee_mask))
+			break;
+
+		donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr);
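+		/* no donee CPU in $p's allowed mask */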
+		if (donee >= nr_cpu_ids)
+			continue;
+
+		donee_rq = cpu_rq(donee);
+		donee_dsq = &donee_rq->scx.bypass_dsq;
+
+		/*
+		 * $p's rq is not locked but $p's DSQ lock protects its
+		 * scheduling properties, making this test safe.
+		 */
+		if (!task_can_run_on_remote_rq(sch, p, donee_rq, false))
+			continue;
+
+		/*
+		 * Moving $p from one non-local DSQ to another. The source rq
+		 * and DSQ are already locked. Do an abbreviated dequeue and
+		 * then perform the enqueue without unlocking $donor_dsq.
+		 *
+		 * We don't want to drop and reacquire the lock on each
+		 * iteration as @donor_dsq can be very long and potentially
+		 * highly contended. Donee DSQs are less likely to be contended.
+		 * The nested locking is safe as only this LB moves tasks
+		 * between bypass DSQs.
+		 */
+		dispatch_dequeue_locked(p, donor_dsq);
+		dispatch_enqueue(sch, donee_dsq, p, SCX_ENQ_NESTED);
+
+		/*
+		 * $donee might have been idle and need to be woken up. No need
+		 * to be clever. Kick every CPU that receives tasks.
+		 */
+		cpumask_set_cpu(donee, resched_mask);
+
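+		/* $donee reached its fill target; stop sending it tasks */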
+		if (READ_ONCE(donee_dsq->nr) >= nr_donee_target)
+			cpumask_clear_cpu(donee, donee_mask);
+
+		nr_balanced++;
+		if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) {
+			list_move_tail(&cursor.node, &n->scx.dsq_list.node);
+			raw_spin_unlock(&donor_dsq->lock);
+			raw_spin_rq_unlock_irq(rq);
+			cpu_relax();
+			raw_spin_rq_lock_irq(rq);
+			raw_spin_lock(&donor_dsq->lock);
+			goto resume;
+		}
+	}
+
+	list_del_init(&cursor.node);
+	raw_spin_unlock(&donor_dsq->lock);
+	raw_spin_rq_unlock_irq(rq);
+
+	return nr_balanced;
+}
+
+static void bypass_lb_node(struct scx_sched *sch, int node)
+{
+	const struct cpumask *node_mask = cpumask_of_node(node);
+	struct cpumask *donee_mask = scx_bypass_lb_donee_cpumask;
+	struct cpumask *resched_mask = scx_bypass_lb_resched_cpumask;
+	u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0;
+	u32 nr_target, nr_donor_target;
+	u32 before_min = U32_MAX, before_max = 0;
+	u32 after_min = U32_MAX, after_max = 0;
+	int cpu;
+
+	/* count the target tasks and CPUs */
+	for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+		u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr);
+
+		nr_tasks += nr;
+		nr_cpus++;
+
+		before_min = min(nr, before_min);
+		before_max = max(nr, before_max);
+	}
+
+	if (!nr_cpus)
+		return;
+
+	/*
+	 * We don't want CPUs to have more than $nr_donor_target tasks, and
+	 * balancing should fill donee CPUs up to $nr_target. Once the targets
+	 * are calculated, find the donee CPUs.
+	 */
+	nr_target = DIV_ROUND_UP(nr_tasks, nr_cpus);
+	nr_donor_target = DIV_ROUND_UP(nr_target * SCX_BYPASS_LB_DONOR_PCT, 100);
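+	/*
+	 * e.g. 100 tasks across 8 CPUs gives $nr_target = 13; if
+	 * SCX_BYPASS_LB_DONOR_PCT were 125, only CPUs with more than 17
+	 * queued tasks would donate. The margin keeps near-balanced CPUs
+	 * from trading tasks back and forth.
+	 */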
+
+	cpumask_clear(donee_mask);
+	for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+		if (READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr) < nr_target)
+			cpumask_set_cpu(cpu, donee_mask);
+	}
+
+	/* iterate the !donee CPUs and see if they should be offloaded */
+	cpumask_clear(resched_mask);
+	for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+		struct rq *rq = cpu_rq(cpu);
+		struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq;
+
+		if (cpumask_empty(donee_mask))
+			break;
+		if (cpumask_test_cpu(cpu, donee_mask))
+			continue;
+		if (READ_ONCE(donor_dsq->nr) <= nr_donor_target)
+			continue;
+
+		nr_balanced += bypass_lb_cpu(sch, rq, donee_mask, resched_mask,
+					     nr_donor_target, nr_target);
+	}
+
+	for_each_cpu(cpu, resched_mask) {
+		struct rq *rq = cpu_rq(cpu);
+
+		raw_spin_rq_lock_irq(rq);
+		resched_curr(rq);
+		raw_spin_rq_unlock_irq(rq);
+	}
+
+	for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+		u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr);
+
+		after_min = min(nr, after_min);
+		after_max = max(nr, after_max);
+	}
+
+	trace_sched_ext_bypass_lb(node, nr_cpus, nr_tasks, nr_balanced,
+				  before_min, before_max, after_min, after_max);
+}
+
+/*
+ * In bypass mode, all tasks are put on the per-CPU bypass DSQs. If the machine
+ * is over-saturated and the BPF scheduler has skewed tasks onto a few CPUs,
+ * some bypass DSQs can be overloaded. If there are enough tasks to saturate
+ * other lightly loaded CPUs, such imbalance can lead to very high execution
+ * latency on the overloaded CPUs and thus to hung tasks and RCU stalls. To
+ * avoid such outcomes, a simple load balancing mechanism is implemented by the
+ * following timer which runs periodically while bypass mode is in effect.
+ */
+static void scx_bypass_lb_timerfn(struct timer_list *timer)
+{
+	struct scx_sched *sch;
+	int node;
+	u32 intv_us;
+
+	sch = rcu_dereference_all(scx_root);
+	if (unlikely(!sch) || !READ_ONCE(scx_bypass_depth))
+		return;
+
+	for_each_node_with_cpus(node)
+		bypass_lb_node(sch, node);
+
+	intv_us = READ_ONCE(scx_bypass_lb_intv_us);
+	if (intv_us)
+		mod_timer(timer, jiffies + usecs_to_jiffies(intv_us));
+}
+
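+/* an ordinary timer: runs in softirq context and re-arms itself in bypass */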
+static DEFINE_TIMER(scx_bypass_lb_timer, scx_bypass_lb_timerfn);
+
 /**
  * scx_bypass - [Un]bypass scx_ops and guarantee forward progress
  * @bypass: true for bypass, false for unbypass
@@ -3787,16 +4005,25 @@ static void scx_bypass(bool bypass)
 	sch = rcu_dereference_bh(scx_root);
 
 	if (bypass) {
-		scx_bypass_depth++;
+		u32 intv_us;
+
+		WRITE_ONCE(scx_bypass_depth, scx_bypass_depth + 1);
 		WARN_ON_ONCE(scx_bypass_depth <= 0);
 		if (scx_bypass_depth != 1)
 			goto unlock;
 		WRITE_ONCE(scx_slice_dfl, scx_slice_bypass_us * NSEC_PER_USEC);
 		bypass_timestamp = ktime_get_ns();
 		if (sch)
			scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
+
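+		/* arm the LB timer on the first bypass level, if enabled */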
+		intv_us = READ_ONCE(scx_bypass_lb_intv_us);
+		if (intv_us && !timer_pending(&scx_bypass_lb_timer)) {
+			scx_bypass_lb_timer.expires =
+				jiffies + usecs_to_jiffies(intv_us);
+			add_timer_global(&scx_bypass_lb_timer);
+		}
 	} else {
-		scx_bypass_depth--;
+		WRITE_ONCE(scx_bypass_depth, scx_bypass_depth - 1);
 		WARN_ON_ONCE(scx_bypass_depth < 0);
 		if (scx_bypass_depth != 0)
 			goto unlock;
@@ -7052,6 +7279,12 @@ static int __init scx_init(void)
 		return ret;
 	}
 
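+	/* scratch cpumasks shared by the bypass load balancer */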
+	if (!alloc_cpumask_var(&scx_bypass_lb_donee_cpumask, GFP_KERNEL) ||
+	    !alloc_cpumask_var(&scx_bypass_lb_resched_cpumask, GFP_KERNEL)) {
+		pr_err("sched_ext: Failed to allocate cpumasks\n");
+		return -ENOMEM;
+	}
+
 	return 0;
 }
 __initcall(scx_init);