Skip to content

Commit b3d99f4

Browse files
author
Peter Zijlstra
committed
sched/fair: Fix zero_vruntime tracking
It turns out that zero_vruntime tracking is broken when there is but a single task running. Current update paths are through __{en,de}queue_entity(), and when there is but a single task, pick_next_task() will always return that one task, and put_prev_set_next_task() will end up in neither function. This can cause entity_key() to grow indefinitely large and cause overflows, leading to much pain and suffering. Furthermore, doing update_zero_vruntime() from __{de,en}queue_entity(), which are called from {set_next,put_prev}_entity() has problems because: - set_next_entity() calls __dequeue_entity() before it does cfs_rq->curr = se. This means the avg_vruntime() will see the removal but not current, missing the entity for accounting. - put_prev_entity() calls __enqueue_entity() before it does cfs_rq->curr = NULL. This means the avg_vruntime() will see the addition *and* current, leading to double accounting. Both cases are incorrect/inconsistent. Noting that avg_vruntime is already called on each {en,de}queue, remove the explicit avg_vruntime() calls (which removes an extra 64bit division for each {en,de}queue) and have avg_vruntime() update zero_vruntime itself. Additionally, have the tick call avg_vruntime() -- discarding the result, but for the side-effect of updating zero_vruntime. While there, optimize avg_vruntime() by noting that the average of one value is rather trivial to compute. 
Test case: # taskset -c -p 1 $$ # taskset -c 2 bash -c 'while :; do :; done&' # cat /sys/kernel/debug/sched/debug | awk '/^cpu#/ {P=0} /^cpu#2,/ {P=1} {if (P) print $0}' | grep -e zero_vruntime -e "^>" PRE: .zero_vruntime : 31316.407903 >R bash 487 50787.345112 E 50789.145972 2.800000 50780.298364 16 120 0.000000 0.000000 0.000000 / .zero_vruntime : 382548.253179 >R bash 487 427275.204288 E 427276.003584 2.800000 427268.157540 23 120 0.000000 0.000000 0.000000 / POST: .zero_vruntime : 17259.709467 >R bash 526 17259.709467 E 17262.509467 2.800000 16915.031624 9 120 0.000000 0.000000 0.000000 / .zero_vruntime : 18702.723356 >R bash 526 18702.723356 E 18705.523356 2.800000 18358.045513 9 120 0.000000 0.000000 0.000000 / Fixes: 79f3f9b ("sched/eevdf: Fix min_vruntime vs avg_vruntime") Reported-by: K Prateek Nayak <kprateek.nayak@amd.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com> Tested-by: Shubhang Kaushik <shubhang@os.amperecomputing.com> Link: https://patch.msgid.link/20260219080624.438854780%40infradead.org
1 parent 6de23f8 commit b3d99f4

1 file changed

Lines changed: 57 additions & 27 deletions

File tree

kernel/sched/fair.c

Lines changed: 57 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -589,6 +589,21 @@ static inline bool entity_before(const struct sched_entity *a,
589589
return vruntime_cmp(a->deadline, "<", b->deadline);
590590
}
591591

592+
/*
593+
* Per avg_vruntime() below, cfs_rq::zero_vruntime is only slightly stale
594+
* and this value should be no more than two lag bounds. Which puts it in the
595+
* general order of:
596+
*
597+
* (slice + TICK_NSEC) << NICE_0_LOAD_SHIFT
598+
*
599+
* which is around 44 bits in size (on 64bit); that is 20 for
600+
* NICE_0_LOAD_SHIFT, another 20 for NSEC_PER_MSEC and then a handful for
601+
* however many msec the actual slice+tick ends up being.
602+
*
603+
* (disregarding the actual divide-by-weight part makes for the worst case
604+
* weight of 2, which nicely cancels vs the fuzz in zero_vruntime not actually
605+
* being the zero-lag point).
606+
*/
592607
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
593608
{
594609
return vruntime_op(se->vruntime, "-", cfs_rq->zero_vruntime);
@@ -676,39 +691,61 @@ sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
676691
}
677692

678693
static inline
679-
void sum_w_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
694+
void update_zero_vruntime(struct cfs_rq *cfs_rq, s64 delta)
680695
{
681696
/*
682-
* v' = v + d ==> sum_w_vruntime' = sum_runtime - d*sum_weight
697+
* v' = v + d ==> sum_w_vruntime' = sum_w_vruntime - d*sum_weight
683698
*/
684699
cfs_rq->sum_w_vruntime -= cfs_rq->sum_weight * delta;
700+
cfs_rq->zero_vruntime += delta;
685701
}
686702

687703
/*
688-
* Specifically: avg_runtime() + 0 must result in entity_eligible() := true
704+
* Specifically: avg_vruntime() + 0 must result in entity_eligible() := true
689705
* For this to be so, the result of this function must have a left bias.
706+
*
707+
* Called in:
708+
* - place_entity() -- before enqueue
709+
* - update_entity_lag() -- before dequeue
710+
* - entity_tick()
711+
*
712+
* This means it is one entry 'behind' but that puts it close enough to where
713+
* the bound on entity_key() is at most two lag bounds.
690714
*/
691715
u64 avg_vruntime(struct cfs_rq *cfs_rq)
692716
{
693717
struct sched_entity *curr = cfs_rq->curr;
694-
s64 avg = cfs_rq->sum_w_vruntime;
695-
long load = cfs_rq->sum_weight;
718+
long weight = cfs_rq->sum_weight;
719+
s64 delta = 0;
696720

697-
if (curr && curr->on_rq) {
698-
unsigned long weight = scale_load_down(curr->load.weight);
721+
if (curr && !curr->on_rq)
722+
curr = NULL;
699723

700-
avg += entity_key(cfs_rq, curr) * weight;
701-
load += weight;
702-
}
724+
if (weight) {
725+
s64 runtime = cfs_rq->sum_w_vruntime;
726+
727+
if (curr) {
728+
unsigned long w = scale_load_down(curr->load.weight);
729+
730+
runtime += entity_key(cfs_rq, curr) * w;
731+
weight += w;
732+
}
703733

704-
if (load) {
705734
/* sign flips effective floor / ceiling */
706-
if (avg < 0)
707-
avg -= (load - 1);
708-
avg = div_s64(avg, load);
735+
if (runtime < 0)
736+
runtime -= (weight - 1);
737+
738+
delta = div_s64(runtime, weight);
739+
} else if (curr) {
740+
/*
741+
* When there is but one element, it is the average.
742+
*/
743+
delta = curr->vruntime - cfs_rq->zero_vruntime;
709744
}
710745

711-
return cfs_rq->zero_vruntime + avg;
746+
update_zero_vruntime(cfs_rq, delta);
747+
748+
return cfs_rq->zero_vruntime;
712749
}
713750

714751
/*
@@ -777,16 +814,6 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
777814
return vruntime_eligible(cfs_rq, se->vruntime);
778815
}
779816

780-
static void update_zero_vruntime(struct cfs_rq *cfs_rq)
781-
{
782-
u64 vruntime = avg_vruntime(cfs_rq);
783-
s64 delta = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime);
784-
785-
sum_w_vruntime_update(cfs_rq, delta);
786-
787-
cfs_rq->zero_vruntime = vruntime;
788-
}
789-
790817
static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
791818
{
792819
struct sched_entity *root = __pick_root_entity(cfs_rq);
@@ -856,7 +883,6 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
856883
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
857884
{
858885
sum_w_vruntime_add(cfs_rq, se);
859-
update_zero_vruntime(cfs_rq);
860886
se->min_vruntime = se->vruntime;
861887
se->min_slice = se->slice;
862888
rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
@@ -868,7 +894,6 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
868894
rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
869895
&min_vruntime_cb);
870896
sum_w_vruntime_sub(cfs_rq, se);
871-
update_zero_vruntime(cfs_rq);
872897
}
873898

874899
struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
@@ -5524,6 +5549,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
55245549
update_load_avg(cfs_rq, curr, UPDATE_TG);
55255550
update_cfs_group(curr);
55265551

5552+
/*
5553+
* Pulls along cfs_rq::zero_vruntime.
5554+
*/
5555+
avg_vruntime(cfs_rq);
5556+
55275557
#ifdef CONFIG_SCHED_HRTICK
55285558
/*
55295559
* queued ticks are scheduled to match the slice, so don't bother

0 commit comments

Comments
 (0)