@@ -601,9 +601,134 @@ static inline bool entity_before(const struct sched_entity *a,
 	return (s64)(a->vruntime - b->vruntime) < 0;
 }
 
+static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	return (s64)(se->vruntime - cfs_rq->min_vruntime);
+}
+
 #define __node_2_se(node) \
 	rb_entry((node), struct sched_entity, run_node)
 
+/*
+ * Compute virtual time from the per-task service numbers:
+ *
+ * Fair schedulers conserve lag:
+ *
+ *   \Sum lag_i = 0
+ *
+ * Where lag_i is given by:
+ *
+ *   lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * Where S is the ideal service time and V is its virtual time counterpart.
+ * Therefore:
+ *
+ *   \Sum lag_i = 0
+ *   \Sum w_i * (V - v_i) = 0
+ *   \Sum w_i * V - w_i * v_i = 0
+ *
+ * From which we can solve an expression for V in v_i (which we have in
+ * se->vruntime):
+ *
+ *       \Sum v_i * w_i   \Sum v_i * w_i
+ *   V = -------------- = --------------
+ *          \Sum w_i            W
+ *
+ * Specifically, this is the weighted average of all entity virtual runtimes.
+ *
+ * [[ NOTE: this is only equal to the ideal scheduler under the condition
+ *          that join/leave operations happen at lag_i = 0, otherwise the
+ *          virtual time has non-contiguous motion equivalent to:
+ *
+ *            V +-= lag_i / W
+ *
+ *          Also see the comment in place_entity() that deals with this. ]]
+ *
+ * However, since v_i is u64, and the multiplication could easily overflow,
+ * transform it into a relative form that uses smaller quantities:
+ *
+ * Substitute: v_i == (v_i - v0) + v0
+ *
+ *     \Sum ((v_i - v0) + v0) * w_i   \Sum (v_i - v0) * w_i
+ * V = ---------------------------- = --------------------- + v0
+ *                  W                            W
+ *
+ * Which we track using:
+ *
+ *                    v0 := cfs_rq->min_vruntime
+ * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
+ *              \Sum w_i := cfs_rq->avg_load
+ *
+ * Since min_vruntime is a monotonically increasing variable that closely
+ * tracks the per-task service, these deltas (v_i - v0) will be in the order
+ * of the maximal (virtual) lag induced in the system due to quantisation.
+ *
+ * Also, we use scale_load_down() to reduce the size.
+ *
+ * As measured, the max (key * weight) value was ~44 bits for a kernel build.
+ */
+static void
+avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	unsigned long weight = scale_load_down(se->load.weight);
+	s64 key = entity_key(cfs_rq, se);
+
+	cfs_rq->avg_vruntime += key * weight;
+	cfs_rq->avg_load += weight;
+}
+
+static void
+avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	unsigned long weight = scale_load_down(se->load.weight);
+	s64 key = entity_key(cfs_rq, se);
+
+	cfs_rq->avg_vruntime -= key * weight;
+	cfs_rq->avg_load -= weight;
+}
+
+static inline
+void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+{
+	/*
+	 * v' = v + d ==> avg_vruntime' = avg_vruntime - d*avg_load
+	 */
+	cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
+}
+
+u64 avg_vruntime(struct cfs_rq *cfs_rq)
+{
+	struct sched_entity *curr = cfs_rq->curr;
+	s64 avg = cfs_rq->avg_vruntime;
+	long load = cfs_rq->avg_load;
+
+	if (curr && curr->on_rq) {
+		unsigned long weight = scale_load_down(curr->load.weight);
+
+		avg += entity_key(cfs_rq, curr) * weight;
+		load += weight;
+	}
+
+	if (load)
+		avg = div_s64(avg, load);
+
+	return cfs_rq->min_vruntime + avg;
+}
+
+static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
+{
+	u64 min_vruntime = cfs_rq->min_vruntime;
+	/*
+	 * open coded max_vruntime() to allow updating avg_vruntime
+	 */
+	s64 delta = (s64)(vruntime - min_vruntime);
+	if (delta > 0) {
+		avg_vruntime_update(cfs_rq, delta);
+		min_vruntime = vruntime;
+	}
+	return min_vruntime;
+}
+
 static void update_min_vruntime(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr = cfs_rq->curr;
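To make the bookkeeping above concrete, here is a small userspace sketch (not kernel code; the struct, helper names and numbers are made up for illustration) that models avg_vruntime/avg_load in the same relative form, checks that the result matches the plain weighted average, and exercises the rebase that avg_vruntime_update() performs when min_vruntime advances:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* toy model of the cfs_rq fields used above */
struct toy_rq {
	uint64_t min_vruntime;	/* v0 */
	int64_t  avg_vruntime;	/* \Sum (v_i - v0) * w_i */
	long     avg_load;	/* \Sum w_i */
};

static void toy_add(struct toy_rq *rq, uint64_t v, long w)
{
	rq->avg_vruntime += (int64_t)(v - rq->min_vruntime) * w;
	rq->avg_load += w;
}

/* v0' = v0 + d  ==>  avg_vruntime' = avg_vruntime - d * avg_load */
static void toy_advance_v0(struct toy_rq *rq, int64_t d)
{
	rq->avg_vruntime -= rq->avg_load * d;
	rq->min_vruntime += d;
}

/* assumes avg_load != 0, mirroring the load check in avg_vruntime() */
static uint64_t toy_zero_lag(struct toy_rq *rq)
{
	return rq->min_vruntime + rq->avg_vruntime / rq->avg_load;
}

int main(void)
{
	struct toy_rq rq = { .min_vruntime = 1000 };
	uint64_t v[] = { 1000, 1300, 1600 };	/* entity vruntimes */
	long     w[] = {    1,    2,    1 };	/* entity weights */

	for (int i = 0; i < 3; i++)
		toy_add(&rq, v[i], w[i]);

	/* naive weighted average: (1000*1 + 1300*2 + 1600*1) / 4 = 1300 */
	assert(toy_zero_lag(&rq) == 1300);

	/* moving v0 forward must not change the zero-lag point */
	toy_advance_v0(&rq, 200);
	assert(toy_zero_lag(&rq) == 1300);

	printf("zero-lag point: %llu\n",
	       (unsigned long long)toy_zero_lag(&rq));
	return 0;
}

The point of the relative form is visible in toy_add(): only small deltas (v_i - v0) are multiplied by the weight, which is what keeps the cached product within ~44 bits in practice.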
@@ -629,7 +754,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
 
 	/* ensure we never gain time by being placed backwards. */
 	u64_u32_store(cfs_rq->min_vruntime,
 		      __update_min_vruntime(cfs_rq, vruntime));
-		      max_vruntime(cfs_rq->min_vruntime, vruntime));
+		      __update_min_vruntime(cfs_rq, vruntime));
 }
 
 static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
@@ -642,12 +767,14 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
  */
 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+	avg_vruntime_add(cfs_rq, se);
 	rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
 }
 
 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
+	avg_vruntime_sub(cfs_rq, se);
 }
 
 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
@@ -3379,6 +3506,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 		/* commit outstanding execution time */
 		if (cfs_rq->curr == se)
 			update_curr(cfs_rq);
+		else
+			avg_vruntime_sub(cfs_rq, se);
 		update_load_sub(&cfs_rq->load, se->load.weight);
 	}
 	dequeue_load_avg(cfs_rq, se);
@@ -3394,9 +3523,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 #endif
 
 	enqueue_load_avg(cfs_rq, se);
-	if (se->on_rq)
+	if (se->on_rq) {
 		update_load_add(&cfs_rq->load, se->load.weight);
-
+		if (cfs_rq->curr != se)
+			avg_vruntime_add(cfs_rq, se);
+	}
 }
 
 void reweight_task(struct task_struct *p, int prio)
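A brief note on the reweight_entity() hunks above: the cached sum is \Sum key * weight per enqueued entity, so the stale contribution has to be removed with avg_vruntime_sub() before se->load.weight changes and re-added with avg_vruntime_add() afterwards (the currently running entity is skipped because avg_vruntime() folds it in on the fly). The self-contained sketch below uses an invented toy struct and values, not kernel code, to show that bracketing the weight change this way keeps the sums exact:

#include <assert.h>
#include <stdint.h>

struct toy_rq {
	uint64_t min_vruntime;
	int64_t  avg_vruntime;	/* \Sum (v_i - v0) * w_i */
	long     avg_load;	/* \Sum w_i */
};

struct toy_se {
	uint64_t vruntime;
	long     weight;
};

static int64_t toy_key(struct toy_rq *rq, struct toy_se *se)
{
	return (int64_t)(se->vruntime - rq->min_vruntime);
}

static void toy_add(struct toy_rq *rq, struct toy_se *se)
{
	rq->avg_vruntime += toy_key(rq, se) * se->weight;
	rq->avg_load += se->weight;
}

static void toy_sub(struct toy_rq *rq, struct toy_se *se)
{
	rq->avg_vruntime -= toy_key(rq, se) * se->weight;
	rq->avg_load -= se->weight;
}

/* mirror the reweight_entity() pattern: remove, change weight, re-add */
static void toy_reweight(struct toy_rq *rq, struct toy_se *se, long new_weight)
{
	toy_sub(rq, se);
	se->weight = new_weight;
	toy_add(rq, se);
}

int main(void)
{
	struct toy_rq rq = { .min_vruntime = 1000 };
	struct toy_se a = { .vruntime = 1200, .weight = 1 };
	struct toy_se b = { .vruntime = 1500, .weight = 3 };

	toy_add(&rq, &a);
	toy_add(&rq, &b);

	toy_reweight(&rq, &b, 1);

	/* the cached sums now reflect the new weight exactly */
	assert(rq.avg_load == 2);
	assert(rq.avg_vruntime == toy_key(&rq, &a) * a.weight +
				  toy_key(&rq, &b) * b.weight);
	return 0;
}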