
Commit 79f3f9b

Author: Peter Zijlstra
sched/eevdf: Fix min_vruntime vs avg_vruntime
Basically, from the constraint that the sum of lag is zero, you can
infer that the 0-lag point is the weighted average of the individual
vruntime, which is what we're trying to compute:

          \Sum w_i * v_i
    avg = --------------
             \Sum w_i

Now, since vruntime takes the whole u64 (worse, it wraps), this
multiplication term in the numerator is not something we can compute;
instead we do the min_vruntime (v0 henceforth) thing like:

    v_i = (v_i - v0) + v0

This does two things:

 - it keeps the key: (v_i - v0) 'small';
 - it creates a relative 0-point in the modular space.

If you do that substitution and work it all out, you end up with:

          \Sum w_i * (v_i - v0)
    avg = --------------------- + v0
                \Sum w_i

Since you cannot very well track a ratio like that (and not suffer
terrible numerical problems) we simply track the numerator and
denominator individually and only perform the division when strictly
needed. Notably, the numerator lives in cfs_rq->avg_vruntime and the
denominator lives in cfs_rq->avg_load.

The one extra 'funny' is that these numbers track the entities in the
tree, and current is typically outside of the tree, so avg_vruntime()
adds current when needed before doing the division.
(vruntime_eligible() elides the division by cross-wise multiplication.)

Anyway, as mentioned above, we currently use the CFS-era min_vruntime
for this purpose. However, that thing can only move forward, while the
above avg can in fact move backward (when a non-eligible task leaves,
the average becomes smaller); this can cause trouble when, through
happenstance (or construction), these values drift far enough apart to
wreck the game.

Replace cfs_rq::min_vruntime with cfs_rq::zero_vruntime, which is kept
near/at avg_vruntime, following its motion. The down-side is that this
requires computing the avg more often.

Fixes: 147f3ef ("sched/fair: Implement an EEVDF-like scheduling policy")
Reported-by: Zicheng Qu <quzicheng@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251106111741.GC4068168@noisy.programming.kicks-ass.net
Cc: stable@vger.kernel.org
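To make the bookkeeping concrete, here is a minimal user-space C sketch of
the scheme the message describes (hypothetical sketch_* names, not the
kernel code; it drops scale_load_down(), the kernel's sign-aware rounding
in avg_vruntime(), and the handling of current sitting outside the tree):

#include <stdint.h>

struct cfs_rq_sketch {
	uint64_t zero_vruntime;	/* v0: the relative zero point */
	int64_t  avg_vruntime;	/* numerator:   \Sum (v_i - v0) * w_i */
	uint64_t avg_load;	/* denominator: \Sum w_i */
};

/* the key stays 'small' and wrap-safe as long as v0 tracks the pack */
static int64_t sketch_entity_key(const struct cfs_rq_sketch *rq, uint64_t v_i)
{
	return (int64_t)(v_i - rq->zero_vruntime);
}

static void sketch_enqueue(struct cfs_rq_sketch *rq, uint64_t v_i, uint64_t w_i)
{
	rq->avg_vruntime += sketch_entity_key(rq, v_i) * (int64_t)w_i;
	rq->avg_load     += w_i;
}

static void sketch_dequeue(struct cfs_rq_sketch *rq, uint64_t v_i, uint64_t w_i)
{
	rq->avg_vruntime -= sketch_entity_key(rq, v_i) * (int64_t)w_i;
	rq->avg_load     -= w_i;
}

/* avg = v0 + \Sum w_i * (v_i - v0) / \Sum w_i; divide only when asked */
static uint64_t sketch_avg_vruntime(const struct cfs_rq_sketch *rq)
{
	int64_t avg = rq->avg_vruntime;

	if (rq->avg_load)
		avg /= (int64_t)rq->avg_load;	/* the kernel uses div_s64() */

	return rq->zero_vruntime + (uint64_t)avg;
}

/* v is eligible iff v <= avg; cross-multiply to elide the division */
static int sketch_vruntime_eligible(const struct cfs_rq_sketch *rq, uint64_t v)
{
	return rq->avg_vruntime >= sketch_entity_key(rq, v) * (int64_t)rq->avg_load;
}

zero_vruntime exists purely as a reference point for this arithmetic;
entities never gain or lose service when it moves.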
1 parent 9359d97

3 files changed: 31 additions & 95 deletions


kernel/sched/debug.c

Lines changed: 4 additions & 4 deletions
@@ -796,7 +796,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
 
 void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 {
-	s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, left_deadline = -1, spread;
+	s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread;
 	struct sched_entity *last, *first, *root;
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
@@ -819,15 +819,15 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	last = __pick_last_entity(cfs_rq);
 	if (last)
 		right_vruntime = last->vruntime;
-	min_vruntime = cfs_rq->min_vruntime;
+	zero_vruntime = cfs_rq->zero_vruntime;
 	raw_spin_rq_unlock_irqrestore(rq, flags);
 
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "left_deadline",
 			SPLIT_NS(left_deadline));
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "left_vruntime",
 			SPLIT_NS(left_vruntime));
-	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "min_vruntime",
-			SPLIT_NS(min_vruntime));
+	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "zero_vruntime",
+			SPLIT_NS(zero_vruntime));
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "avg_vruntime",
 			SPLIT_NS(avg_vruntime(cfs_rq)));
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "right_vruntime",

kernel/sched/fair.c

Lines changed: 25 additions & 89 deletions
@@ -554,7 +554,7 @@ static inline bool entity_before(const struct sched_entity *a,
 
 static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	return (s64)(se->vruntime - cfs_rq->min_vruntime);
+	return (s64)(se->vruntime - cfs_rq->zero_vruntime);
 }
 
 #define __node_2_se(node) \
@@ -606,13 +606,13 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
  *
  * Which we track using:
  *
- *   v0 := cfs_rq->min_vruntime
+ *   v0 := cfs_rq->zero_vruntime
  *   \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
  *   \Sum w_i := cfs_rq->avg_load
  *
- * Since min_vruntime is a monotonic increasing variable that closely tracks
- * the per-task service, these deltas: (v_i - v), will be in the order of the
- * maximal (virtual) lag induced in the system due to quantisation.
+ * Since zero_vruntime closely tracks the per-task service, these
+ * deltas: (v_i - v), will be in the order of the maximal (virtual) lag
+ * induced in the system due to quantisation.
  *
  * Also, we use scale_load_down() to reduce the size.
  *
@@ -671,7 +671,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
 		avg = div_s64(avg, load);
 	}
 
-	return cfs_rq->min_vruntime + avg;
+	return cfs_rq->zero_vruntime + avg;
 }
 
 /*
@@ -732,50 +732,22 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
 		load += weight;
 	}
 
-	return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load;
+	return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load;
 }
 
 int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	return vruntime_eligible(cfs_rq, se->vruntime);
 }
 
-static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
-{
-	u64 min_vruntime = cfs_rq->min_vruntime;
-	/*
-	 * open coded max_vruntime() to allow updating avg_vruntime
-	 */
-	s64 delta = (s64)(vruntime - min_vruntime);
-	if (delta > 0) {
-		avg_vruntime_update(cfs_rq, delta);
-		min_vruntime = vruntime;
-	}
-	return min_vruntime;
-}
+static void update_zero_vruntime(struct cfs_rq *cfs_rq)
+{
+	u64 vruntime = avg_vruntime(cfs_rq);
+	s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime);
 
-static void update_min_vruntime(struct cfs_rq *cfs_rq)
-{
-	struct sched_entity *se = __pick_root_entity(cfs_rq);
-	struct sched_entity *curr = cfs_rq->curr;
-	u64 vruntime = cfs_rq->min_vruntime;
-
-	if (curr) {
-		if (curr->on_rq)
-			vruntime = curr->vruntime;
-		else
-			curr = NULL;
-	}
+	avg_vruntime_update(cfs_rq, delta);
 
-	if (se) {
-		if (!curr)
-			vruntime = se->min_vruntime;
-		else
-			vruntime = min_vruntime(vruntime, se->min_vruntime);
-	}
-
-	/* ensure we never gain time by being placed backwards. */
-	cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime);
+	cfs_rq->zero_vruntime = vruntime;
 }
 
 static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
@@ -848,6 +820,7 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	avg_vruntime_add(cfs_rq, se);
+	update_zero_vruntime(cfs_rq);
 	se->min_vruntime = se->vruntime;
 	se->min_slice = se->slice;
 	rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
@@ -859,6 +832,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
 				  &min_vruntime_cb);
 	avg_vruntime_sub(cfs_rq, se);
+	update_zero_vruntime(cfs_rq);
 }
 
 struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
@@ -1226,7 +1200,6 @@ static void update_curr(struct cfs_rq *cfs_rq)
 
 	curr->vruntime += calc_delta_fair(delta_exec, curr);
 	resched = update_deadline(cfs_rq, curr);
-	update_min_vruntime(cfs_rq);
 
 	if (entity_is_task(curr)) {
 		/*
@@ -3808,15 +3781,6 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 		if (!curr)
 			__enqueue_entity(cfs_rq, se);
 		cfs_rq->nr_queued++;
-
-		/*
-		 * The entity's vruntime has been adjusted, so let's check
-		 * whether the rq-wide min_vruntime needs updated too. Since
-		 * the calculations above require stable min_vruntime rather
-		 * than up-to-date one, we do the update at the end of the
-		 * reweight process.
-		 */
-		update_min_vruntime(cfs_rq);
 	}
 }

@@ -5429,15 +5393,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	update_cfs_group(se);
 
-	/*
-	 * Now advance min_vruntime if @se was the entity holding it back,
-	 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
-	 * put back on, and if we advance min_vruntime, we'll be placed back
-	 * further than we started -- i.e. we'll be penalized.
-	 */
-	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
-		update_min_vruntime(cfs_rq);
-
 	if (flags & DEQUEUE_DELAYED)
 		finish_delayed_dequeue_entity(se);
 
@@ -9015,7 +8970,6 @@ static void yield_task_fair(struct rq *rq)
 	if (entity_eligible(cfs_rq, se)) {
 		se->vruntime = se->deadline;
 		se->deadline += calc_delta_fair(se->slice, se);
-		update_min_vruntime(cfs_rq);
 	}
 }
 
@@ -13078,23 +13032,6 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
  * Which shows that S and s_i transform alike (which makes perfect sense
  * given that S is basically the (weighted) average of s_i).
  *
- * Then:
- *
- *   x -> s_min := min{s_i}				(8)
- *
- * to obtain:
- *
- *                 \Sum_i w_i (s_i - s_min)
- *   S = s_min + --------------------------		(9)
- *                       \Sum_i w_i
- *
- * Which already looks familiar, and is the basis for our current
- * approximation:
- *
- *   S ~= s_min						(10)
- *
- * Now, obviously, (10) is absolute crap :-), but it sorta works.
- *
  * So the thing to remember is that the above is strictly UP. It is
  * possible to generalize to multiple runqueues -- however it gets really
  * yuck when you have to add affinity support, as illustrated by our very
@@ -13116,23 +13053,23 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
  * Let, for our runqueue 'k':
  *
  *   T_k = \Sum_i w_i s_i
- *   W_k = \Sum_i w_i ; for all i of k			(11)
+ *   W_k = \Sum_i w_i ; for all i of k			(8)
  *
  * Then we can write (6) like:
  *
  *         T_k
- *   S_k = ---						(12)
+ *   S_k = ---						(9)
  *         W_k
  *
  * From which immediately follows that:
  *
  *           T_k + T_l
- *   S_k+l = ---------					(13)
+ *   S_k+l = ---------					(10)
  *           W_k + W_l
  *
  * On which we can define a combined lag:
  *
- *   lag_k+l(i) := S_k+l - s_i				(14)
+ *   lag_k+l(i) := S_k+l - s_i				(11)
  *
  * And that gives us the tools to compare tasks across a combined runqueue.
  *
@@ -13143,7 +13080,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
  *    using (7); this only requires storing single 'time'-stamps.
  *
  * b) when comparing tasks between 2 runqueues of which one is forced-idle,
- *    compare the combined lag, per (14).
+ *    compare the combined lag, per (11).
  *
  * Now, of course cgroups (I so hate them) make this more interesting in
  * that a) seems to suggest we need to iterate all cgroup on a CPU at such
@@ -13191,12 +13128,11 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
  * every tick. This limits the observed divergence due to the work
  * conservancy.
  *
- * On top of that, we can improve upon things by moving away from our
- * horrible (10) hack and moving to (9) and employing (13) here.
+ * On top of that, we can improve upon things by employing (10) here.
  */
 
 /*
- * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
+ * se_fi_update - Update the cfs_rq->zero_vruntime_fi in a CFS hierarchy if needed.
  */
 static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
 			 bool forceidle)
@@ -13210,7 +13146,7 @@ static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
 			cfs_rq->forceidle_seq = fi_seq;
 		}
 
-		cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
+		cfs_rq->zero_vruntime_fi = cfs_rq->zero_vruntime;
 	}
 }

@@ -13263,11 +13199,11 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
 
 	/*
 	 * Find delta after normalizing se's vruntime with its cfs_rq's
-	 * min_vruntime_fi, which would have been updated in prior calls
+	 * zero_vruntime_fi, which would have been updated in prior calls
 	 * to se_fi_update().
 	 */
 	delta = (s64)(sea->vruntime - seb->vruntime) +
-		(s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
+		(s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi);
 
 	return delta > 0;
 }
@@ -13513,7 +13449,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
 void init_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->tasks_timeline = RB_ROOT_CACHED;
-	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
+	cfs_rq->zero_vruntime = (u64)(-(1LL << 20));
 	raw_spin_lock_init(&cfs_rq->removed.lock);
 }
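The crux of the fix is the new update_zero_vruntime() above: on every
enqueue/dequeue the zero point is re-based onto the current average.
Moving v0 by delta changes every key (v_i - v0) by -delta, so the tracked
numerator must be adjusted by -delta * \Sum w_i, which is what
avg_vruntime_update() does there. Continuing the user-space sketch from
above (same hypothetical sketch_* names):

/* re-base v0 onto the current 0-lag point, keeping the sums consistent */
static void sketch_update_zero_vruntime(struct cfs_rq_sketch *rq)
{
	uint64_t v = sketch_avg_vruntime(rq);
	int64_t delta = (int64_t)(v - rq->zero_vruntime);

	/* every key shrinks by delta: numerator drops by delta * \Sum w_i */
	rq->avg_vruntime -= (int64_t)rq->avg_load * delta;
	rq->zero_vruntime = v;	/* v0 now sits at/near the average */
}

Note the average itself is unchanged by the re-base: v0 grows by delta
while the numerator shrinks by delta * \Sum w_i, so only the
representation moves, never the 0-lag point.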

kernel/sched/sched.h

Lines changed: 2 additions & 2 deletions
@@ -681,10 +681,10 @@ struct cfs_rq {
 	s64			avg_vruntime;
 	u64			avg_load;
 
-	u64			min_vruntime;
+	u64			zero_vruntime;
 #ifdef CONFIG_SCHED_CORE
 	unsigned int		forceidle_seq;
-	u64			min_vruntime_fi;
+	u64			zero_vruntime_fi;
 #endif
 
 	struct rb_root_cached	tasks_timeline;
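To see why the forward-only min_vruntime was a problem, a toy run of the
sketch (arbitrary vruntime values, equal weights of 1024) shows the 0-lag
point moving backwards when a far-ahead, non-eligible entity leaves --
motion a monotonic min_vruntime cannot follow, which is exactly the drift
this patch eliminates:

#include <stdio.h>

int main(void)
{
	struct cfs_rq_sketch rq = { .zero_vruntime = 1000 };

	sketch_enqueue(&rq, 1000, 1024);	/* task at the pack */
	sketch_enqueue(&rq, 5000, 1024);	/* task far ahead */
	printf("avg=%llu eligible(5000)=%d\n",
	       (unsigned long long)sketch_avg_vruntime(&rq),	/* 3000 */
	       sketch_vruntime_eligible(&rq, 5000));		/* 0 */

	sketch_dequeue(&rq, 5000, 1024);	/* non-eligible task leaves */
	printf("avg=%llu\n",
	       (unsigned long long)sketch_avg_vruntime(&rq));	/* 1000 again */
	return 0;
}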
