@@ -715,6 +715,15 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
 	return cfs_rq->min_vruntime + avg;
 }
 
+/*
+ * lag_i = S - s_i = w_i * (V - v_i)
+ */
+void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	SCHED_WARN_ON(!se->on_rq);
+	se->vlag = avg_vruntime(cfs_rq) - se->vruntime;
+}
+
 static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
 {
 	u64 min_vruntime = cfs_rq->min_vruntime;
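
update_entity_lag() stores only the virtual lag vl_i = V - v_i; the weighted lag lag_i = w_i * vl_i is never kept directly. A minimal userspace sketch of that relationship, with made-up weights and vruntimes (the kernel derives V incrementally through avg_vruntime() rather than by summing over entities as done here):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for a queued entity; not the kernel's sched_entity. */
struct entity {
	int64_t weight;
	int64_t vruntime;
};

/* V = (\Sum w_j*v_j) / W, the weighted average the new comment refers to. */
static int64_t weighted_avg_vruntime(const struct entity *e, int n)
{
	int64_t sum = 0, w = 0;

	for (int i = 0; i < n; i++) {
		sum += e[i].weight * e[i].vruntime;
		w += e[i].weight;
	}
	return sum / w;
}

int main(void)
{
	struct entity rq[] = { { 1024, 100 }, { 2048, 90 }, { 512, 130 } };
	int64_t V = weighted_avg_vruntime(rq, 3);

	for (int i = 0; i < 3; i++) {
		int64_t vlag = V - rq[i].vruntime;	/* what se->vlag stores */
		int64_t lag = rq[i].weight * vlag;	/* lag_i = w_i*(V - v_i) */

		printf("v_i=%lld vl_i=%lld lag_i=%lld\n",
		       (long long)rq[i].vruntime, (long long)vlag,
		       (long long)lag);
	}
	return 0;
}
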
@@ -3492,6 +3501,8 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 			    unsigned long weight)
 {
+	unsigned long old_weight = se->load.weight;
+
 	if (se->on_rq) {
 		/* commit outstanding execution time */
 		if (cfs_rq->curr == se)
@@ -3504,6 +3515,14 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 
 	update_load_set(&se->load, weight);
 
+	if (!se->on_rq) {
+		/*
+		 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
+		 * we need to scale se->vlag when w_i changes.
+		 */
+		se->vlag = div_s64(se->vlag * old_weight, weight);
+	}
+
 #ifdef CONFIG_SMP
 	do {
 		u32 divider = get_pelt_divider(&se->avg);
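
The !se->on_rq rescaling keeps the weighted lag w_i * vl_i invariant across the weight change: since lag_i = w_i * vl_i, the stored virtual lag must become vl'_i = vl_i * w_old / w_new. A standalone sketch with arbitrary example values and a userspace stand-in for the kernel's div_s64():

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the kernel's div_s64(): signed 64-bit division. */
static int64_t div_s64(int64_t dividend, int32_t divisor)
{
	return dividend / divisor;
}

int main(void)
{
	int64_t vlag = -300;			/* vl_i = V - v_i before reweight */
	int64_t old_weight = 1024, weight = 4096;

	/* lag_i = w_i * vl_i should survive the weight change, so scale vl_i. */
	int64_t old_lag = old_weight * vlag;

	vlag = div_s64(vlag * old_weight, (int32_t)weight);

	printf("lag before=%lld, lag after=%lld, scaled vl_i=%lld\n",
	       (long long)old_lag, (long long)(weight * vlag),
	       (long long)vlag);
	return 0;
}

With these values the scaled vl_i is -75, and 4096 * -75 reproduces the original lag of -307200 exactly; in general the division truncates, so the invariant holds up to rounding.
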
@@ -4853,49 +4872,119 @@ static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 {
 	u64 vruntime = avg_vruntime(cfs_rq);
+	s64 lag = 0;
 
-	/* sleeps up to a single latency don't count. */
-	if (!initial) {
-		unsigned long thresh;
+	/*
+	 * Due to how V is constructed as the weighted average of entities,
+	 * adding tasks with positive lag, or removing tasks with negative lag
+	 * will move 'time' backwards; this can screw around with the lag of
+	 * other tasks.
+	 *
+	 * EEVDF: placement strategy #1 / #2
+	 */
+	if (sched_feat(PLACE_LAG) && cfs_rq->nr_running > 1) {
+		struct sched_entity *curr = cfs_rq->curr;
+		unsigned long load;
 
-		if (se_is_idle(se))
-			thresh = sysctl_sched_min_granularity;
-		else
-			thresh = sysctl_sched_latency;
+		lag = se->vlag;
 
 		/*
-		 * Halve their sleep time's effect, to allow
-		 * for a gentler effect of sleepers:
+		 * If we want to place a task and preserve lag, we have to
+		 * consider the effect of the new entity on the weighted
+		 * average and compensate for this; otherwise lag can quickly
+		 * evaporate.
+		 *
+		 * Lag is defined as:
+		 *
+		 *   lag_i = S - s_i = w_i * (V - v_i)
+		 *
+		 * To avoid the 'w_i' term all over the place, we only track
+		 * the virtual lag:
+		 *
+		 *   vl_i = V - v_i <=> v_i = V - vl_i
+		 *
+		 * And we take V to be the weighted average of all v:
+		 *
+		 *   V = (\Sum w_j*v_j) / W
+		 *
+		 * Where W is: \Sum w_j
+		 *
+		 * Then, the weighted average after adding an entity with lag
+		 * vl_i is given by:
+		 *
+		 *   V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
+		 *      = (W*V + w_i*(V - vl_i)) / (W + w_i)
+		 *      = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
+		 *      = (V*(W + w_i) - w_i*vl_i) / (W + w_i)
+		 *      = V - w_i*vl_i / (W + w_i)
+		 *
+		 * And the actual lag after adding an entity with vl_i is:
+		 *
+		 *   vl'_i = V' - v_i
+		 *         = V - w_i*vl_i / (W + w_i) - (V - vl_i)
+		 *         = vl_i - w_i*vl_i / (W + w_i)
+		 *
+		 * Which is strictly smaller in magnitude than vl_i (adding the
+		 * entity pulls V towards v_i). So in order to preserve lag we
+		 * should inflate the lag before placement such that the
+		 * effective lag after placement comes out right.
+		 *
+		 * As such, invert the above relation for vl'_i to get the vl_i
+		 * we need to use such that the lag after placement is the lag
+		 * we computed before dequeue.
+		 *
+		 *   vl'_i = vl_i - w_i*vl_i / (W + w_i)
+		 *         = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
+		 *
+		 *   (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
+		 *                   = W*vl_i
+		 *
+		 *   vl_i = (W + w_i)*vl'_i / W
 		 */
-		if (sched_feat(GENTLE_FAIR_SLEEPERS))
-			thresh >>= 1;
-
-		vruntime -= thresh;
-	}
-
-	/*
-	 * Pull vruntime of the entity being placed to the base level of
-	 * cfs_rq, to prevent boosting it if placed backwards.
-	 * However, min_vruntime can advance much faster than real time, with
-	 * the extreme being when an entity with the minimal weight always runs
-	 * on the cfs_rq. If the waking entity slept for a long time, its
-	 * vruntime difference from min_vruntime may overflow s64 and their
-	 * comparison may get inversed, so ignore the entity's original
-	 * vruntime in that case.
-	 * The maximal vruntime speedup is given by the ratio of normal to
-	 * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES.
-	 * When placing a migrated waking entity, its exec_start has been set
-	 * from a different rq. In order to take into account a possible
-	 * divergence between new and prev rq's clocks task because of irq and
-	 * stolen time, we take an additional margin.
-	 * So, cutting off on the sleep time of
-	 * 2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days
-	 * should be safe.
-	 */
-	if (entity_is_long_sleeper(se))
-		se->vruntime = vruntime;
-	else
-		se->vruntime = max_vruntime(se->vruntime, vruntime);
+		load = cfs_rq->avg_load;
+		if (curr && curr->on_rq)
+			load += curr->load.weight;
+
+		lag *= load + se->load.weight;
+		if (WARN_ON_ONCE(!load))
+			load = 1;
+		lag = div_s64(lag, load);
+
+		vruntime -= lag;
+	}
+
+	if (sched_feat(FAIR_SLEEPERS)) {
+
+		/* sleeps up to a single latency don't count. */
+		if (!initial) {
+			unsigned long thresh;
+
+			if (se_is_idle(se))
+				thresh = sysctl_sched_min_granularity;
+			else
+				thresh = sysctl_sched_latency;
+
+			/*
+			 * Halve their sleep time's effect, to allow
+			 * for a gentler effect of sleepers:
+			 */
+			if (sched_feat(GENTLE_FAIR_SLEEPERS))
+				thresh >>= 1;
+
+			vruntime -= thresh;
+		}
+
+		/*
+		 * Pull vruntime of the entity being placed to the base level of
+		 * cfs_rq, to prevent boosting it if placed backwards. If the entity
+		 * slept for a long time, don't even try to compare its vruntime with
+		 * the base as it may be too far off and the comparison may get
+		 * inversed due to s64 overflow.
+		 */
+		if (!entity_is_long_sleeper(se))
+			vruntime = max_vruntime(se->vruntime, vruntime);
+	}
+
+	se->vruntime = vruntime;
 }
 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
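
The PLACE_LAG block above implements the final relation, vl_i = (W + w_i) * vl'_i / W, with cfs_rq->avg_load (plus the running entity's weight while it is still on the queue, since curr is kept off the rbtree) standing in for W. A quick userspace check, with hypothetical weights, that the inflation cancels the shift the new entity causes in V:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t W = 3072;			/* assumed total queue weight */
	int64_t w_i = 1024;			/* weight of the waking entity */
	int64_t vlag_at_dequeue = -200;		/* vl'_i we want after placement */

	/* Inflate as place_entity() does: vl_i = (W + w_i) * vl'_i / W. */
	int64_t lag = vlag_at_dequeue * (W + w_i) / W;

	/*
	 * Adding the entity drags V towards v_i, per the comment's algebra:
	 * vl_after = vl_i - w_i*vl_i / (W + w_i).
	 */
	int64_t vl_after = lag - w_i * lag / (W + w_i);

	printf("inflated vl_i=%lld, lag after placement=%lld, target=%lld\n",
	       (long long)lag, (long long)vl_after,
	       (long long)vlag_at_dequeue);
	return 0;
}

Here the inflated lag is -266 and the post-placement lag comes back to -200, the value recorded at dequeue.
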
@@ -5077,6 +5166,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	clear_buddies(cfs_rq, se);
 
+	if (flags & DEQUEUE_SLEEP)
+		update_entity_lag(cfs_rq, se);
+
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	se->on_rq = 0;
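
Taken together, DEQUEUE_SLEEP snapshots vl_i and the next place_entity() re-establishes it. The toy model below (arbitrary values; V computed as a plain weighted average rather than via the kernel's augmented rbtree) contrasts naive placement at v_i = V - vl_i, where the stored lag visibly decays, with the compensated placement from this patch:

#include <stdint.h>
#include <stdio.h>

struct ent {
	int64_t w, v;
};

/* True weighted average V over every queued entity. */
static int64_t avg_v(const struct ent *e, int n)
{
	int64_t num = 0, den = 0;

	for (int i = 0; i < n; i++) {
		num += e[i].w * e[i].v;
		den += e[i].w;
	}
	return num / den;
}

int main(void)
{
	struct ent rq[4] = { { 1024, 1000 }, { 1024, 1010 }, { 1024, 990 } };
	int64_t vlag = -50;	/* recorded by update_entity_lag() at dequeue */
	int64_t w_i = 1024;	/* weight of the waking entity */
	int64_t W = 3 * 1024;	/* weight already on the queue */
	int64_t V = avg_v(rq, 3);

	/* Naive placement: v_i = V - vl_i, no compensation. */
	rq[3] = (struct ent){ w_i, V - vlag };
	int64_t naive = avg_v(rq, 4) - rq[3].v;

	/* Compensated placement: inflate vl_i by (W + w_i)/W first. */
	rq[3] = (struct ent){ w_i, V - vlag * (W + w_i) / W };
	int64_t comp = avg_v(rq, 4) - rq[3].v;

	printf("target vl=%lld naive=%lld compensated=%lld\n",
	       (long long)vlag, (long long)naive, (long long)comp);
	return 0;
}

With these numbers the naive placement yields a lag of -38 against a target of -50, while the compensated placement reproduces -50.
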