@@ -187,12 +187,18 @@ EXPORT_SYMBOL(jiffies_64);
 #define WHEEL_SIZE	(LVL_SIZE * LVL_DEPTH)
 
 #ifdef CONFIG_NO_HZ_COMMON
-# define NR_BASES	2
-# define BASE_STD	0
-# define BASE_DEF	1
+/*
+ * If multiple bases need to be locked, use the base ordering for lock
+ * nesting, i.e. lowest number first.
+ */
+# define NR_BASES	3
+# define BASE_LOCAL	0
+# define BASE_GLOBAL	1
+# define BASE_DEF	2
 #else
 # define NR_BASES	1
-# define BASE_STD	0
+# define BASE_LOCAL	0
+# define BASE_GLOBAL	0
 # define BASE_DEF	0
 #endif
 
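The comment added above establishes a lock-ordering rule for the new bases: whenever more than one per-CPU base lock must be held, they are taken in ascending index order (BASE_LOCAL, then BASE_GLOBAL, then BASE_DEF). Below is a minimal standalone sketch of that convention, not kernel code; the names are illustrative:

/*
 * Userspace model of the "lowest base number locks first" rule.
 * Every path that needs two base locks nests them the same way,
 * which rules out lock-order inversions between those paths.
 */
#include <pthread.h>

enum { MY_BASE_LOCAL, MY_BASE_GLOBAL, MY_BASE_DEF, MY_NR_BASES };

static pthread_mutex_t base_locks[MY_NR_BASES] = {
	PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER,
};

/* Always acquire the lower-numbered base's lock first. */
static void lock_two_bases(int a, int b)
{
	int first = a < b ? a : b;
	int second = a < b ? b : a;

	pthread_mutex_lock(&base_locks[first]);
	pthread_mutex_lock(&base_locks[second]);
}

/* Release in the reverse order of acquisition. */
static void unlock_two_bases(int a, int b)
{
	int first = a < b ? a : b;
	int second = a < b ? b : a;

	pthread_mutex_unlock(&base_locks[second]);
	pthread_mutex_unlock(&base_locks[first]);
}

int main(void)
{
	/* Regardless of argument order, locks nest lowest index first. */
	lock_two_bases(MY_BASE_GLOBAL, MY_BASE_LOCAL);
	unlock_two_bases(MY_BASE_GLOBAL, MY_BASE_LOCAL);
	return 0;
}

The __get_next_timer_interrupt() hunk further down follows this rule by taking base_local->lock before base_global->lock (the latter via raw_spin_lock_nested()).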
@@ -944,7 +950,10 @@ static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
 
 static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
 {
-	struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
+	int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL;
+	struct timer_base *base;
+
+	base = per_cpu_ptr(&timer_bases[index], cpu);
 
 	/*
 	 * If the timer is deferrable and NO_HZ_COMMON is set then we need
@@ -957,7 +966,10 @@ static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
 
 static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
 {
-	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+	int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL;
+	struct timer_base *base;
+
+	base = this_cpu_ptr(&timer_bases[index]);
 
 	/*
 	 * If the timer is deferrable and NO_HZ_COMMON is set then we need
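Both helpers above now derive the base index from the TIMER_PINNED flag: a pinned timer is queued on the CPU-bound local base, everything else on the global base. A standalone model of that selection follows; the flag value is illustrative, not the real one from include/linux/timer.h:

/*
 * Userspace model (not kernel code) of the index selection shared by
 * get_timer_cpu_base() and get_timer_this_cpu_base().
 */
#include <assert.h>
#include <stdint.h>

#define MY_TIMER_PINNED	0x00040000u	/* illustrative flag bit */

enum { MY_BASE_LOCAL, MY_BASE_GLOBAL };

static int base_index_for(uint32_t tflags)
{
	/* Pinned timers stay on the local base; others go global. */
	return (tflags & MY_TIMER_PINNED) ? MY_BASE_LOCAL : MY_BASE_GLOBAL;
}

int main(void)
{
	assert(base_index_for(MY_TIMER_PINNED) == MY_BASE_LOCAL);
	assert(base_index_for(0) == MY_BASE_GLOBAL);
	return 0;
}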
@@ -2006,6 +2018,9 @@ static unsigned long next_timer_interrupt(struct timer_base *base,
 	 * Move next_expiry for the empty base into the future to prevent an
 	 * unnecessary raise of the timer softirq when the next_expiry value
 	 * will be reached even if there is no timer pending.
+	 *
+	 * This update is also required to make timer_base::next_expiry values
+	 * easily comparable to find out which base holds the first pending timer.
 	 */
 	if (!base->timers_pending)
 		base->next_expiry = basej + NEXT_TIMER_MAX_DELTA;
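Pushing an empty base's next_expiry out to basej + NEXT_TIMER_MAX_DELTA means the "first event" comparison in __get_next_timer_interrupt() never picks the empty base. A standalone illustration, assuming the kernel's signed-difference comparison; the NEXT_TIMER_MAX_DELTA value here is illustrative:

/*
 * Userspace sketch (not kernel code): with both expiry values
 * anchored near basej, the base with a real pending timer always
 * wins against a base parked at the far-future sentinel.
 */
#include <assert.h>

#define MY_NEXT_TIMER_MAX_DELTA	((1UL << 30) - 1)	/* illustrative */
#define my_time_before_eq(a, b)	((long)((a) - (b)) <= 0)

int main(void)
{
	unsigned long basej = 1000;
	unsigned long local_expiry = basej + 5;	/* pending timer */
	unsigned long global_expiry = basej + MY_NEXT_TIMER_MAX_DELTA; /* empty base */

	/* The empty base never looks like the earlier event. */
	assert(my_time_before_eq(local_expiry, global_expiry));
	return 0;
}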
@@ -2016,9 +2031,10 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
 static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
 					     bool *idle)
 {
-	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+	unsigned long nextevt, nextevt_local, nextevt_global;
+	struct timer_base *base_local, *base_global;
 	u64 expires = KTIME_MAX;
-	unsigned long nextevt;
+	bool local_first;
 
 	/*
 	 * Pretend that there is no timer pending if the cpu is offline.
@@ -2030,10 +2046,20 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
 		return expires;
 	}
 
-	raw_spin_lock(&base->lock);
-	nextevt = next_timer_interrupt(base, basej);
+	base_local = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
+	base_global = this_cpu_ptr(&timer_bases[BASE_GLOBAL]);
 
-	if (base->timers_pending) {
+	raw_spin_lock(&base_local->lock);
+	raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING);
+
+	nextevt_local = next_timer_interrupt(base_local, basej);
+	nextevt_global = next_timer_interrupt(base_global, basej);
+
+	local_first = time_before_eq(nextevt_local, nextevt_global);
+
+	nextevt = local_first ? nextevt_local : nextevt_global;
+
+	if (base_local->timers_pending || base_global->timers_pending) {
 		/* If we missed a tick already, force 0 delta */
 		if (time_before(nextevt, basej))
 			nextevt = basej;
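local_first is computed with time_before_eq() rather than a plain <=, so the result stays correct across a jiffies wraparound, and a tie is resolved in favour of the local base. A standalone demonstration of the wraparound behaviour; my_time_before_eq() mirrors the kernel macro's signed-difference trick:

/*
 * Userspace sketch (not kernel code): a signed subtraction keeps
 * ordering correct when the jiffies counter wraps, where a plain
 * unsigned comparison gets it backwards.
 */
#include <stdio.h>

#define my_time_before_eq(a, b)	((long)((a) - (b)) <= 0)

int main(void)
{
	unsigned long near_wrap = (unsigned long)-2;	/* just before wrap */
	unsigned long wrapped = 1;			/* just after wrap */

	/* Plain comparison is wrong here: near_wrap > wrapped numerically. */
	printf("plain <=      : %d\n", near_wrap <= wrapped);			/* 0 */
	/* Signed difference correctly sees near_wrap as the earlier time. */
	printf("time_before_eq: %d\n", my_time_before_eq(near_wrap, wrapped));	/* 1 */
	return 0;
}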
@@ -2044,31 +2070,31 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
 	 * We have a fresh next event. Check whether we can forward the
 	 * base.
 	 */
-	__forward_timer_base(base, basej);
+	__forward_timer_base(base_local, basej);
+	__forward_timer_base(base_global, basej);
 
 	/*
 	 * Set base->is_idle only when caller is timer_base_try_to_set_idle()
 	 */
 	if (idle) {
 		/*
-		 * Base is idle if the next event is more than a tick away.
+		 * Bases are idle if the next event is more than a tick away.
 		 *
 		 * If the base is marked idle then any timer add operation must
 		 * forward the base clk itself to keep granularity small. This
-		 * idle logic is only maintained for the BASE_STD base,
-		 * deferrable timers may still see large granularity skew (by
-		 * design).
+		 * idle logic is only maintained for the BASE_LOCAL and
+		 * BASE_GLOBAL base, deferrable timers may still see large
+		 * granularity skew (by design).
 		 */
-		if (!base->is_idle) {
-			if (time_after(nextevt, basej + 1)) {
-				base->is_idle = true;
-				trace_timer_base_idle(true, base->cpu);
-			}
+		if (!base_local->is_idle && time_after(nextevt, basej + 1)) {
+			base_local->is_idle = base_global->is_idle = true;
+			trace_timer_base_idle(true, base_local->cpu);
 		}
-		*idle = base->is_idle;
+		*idle = base_local->is_idle;
 	}
 
-	raw_spin_unlock(&base->lock);
+	raw_spin_unlock(&base_global->lock);
+	raw_spin_unlock(&base_local->lock);
 
 	return cmp_next_hrtimer_event(basem, expires);
 }
@@ -2112,15 +2138,14 @@ u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle)
  */
 void timer_clear_idle(void)
 {
-	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
-
 	/*
 	 * We do this unlocked. The worst outcome is a remote enqueue sending
 	 * a pointless IPI, but taking the lock would just make the window for
 	 * sending the IPI a few instructions smaller for the cost of taking
 	 * the lock in the exit from idle path.
 	 */
-	base->is_idle = false;
+	__this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false);
+	__this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false);
 	trace_timer_base_idle(false, smp_processor_id());
 }
 #endif
@@ -2171,19 +2196,21 @@ static inline void __run_timers(struct timer_base *base)
  */
 static __latent_entropy void run_timer_softirq(struct softirq_action *h)
 {
-	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
 
 	__run_timers(base);
-	if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
+	if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) {
+		__run_timers(this_cpu_ptr(&timer_bases[BASE_GLOBAL]));
 		__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
+	}
 }
 
 /*
  * Called by the local, per-CPU timer interrupt on SMP.
  */
 static void run_local_timers(void)
 {
-	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
 
 	hrtimer_run_queues();
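With the split, the softirq handler expires the bases in index order: the local base unconditionally, then, under NO_HZ_COMMON, the global and deferrable bases. A standalone model of that control flow, with an invented expire() standing in for __run_timers():

/*
 * Userspace model (not kernel code) of the expiry order in
 * run_timer_softirq() above.
 */
#include <stdbool.h>
#include <stdio.h>

enum { MY_BASE_LOCAL, MY_BASE_GLOBAL, MY_BASE_DEF, MY_NR_BASES };

static const char *const base_names[MY_NR_BASES] = {
	"local", "global", "deferrable",
};

/* Stand-in for __run_timers(): just report which base would run. */
static void expire(int base)
{
	printf("running %s base\n", base_names[base]);
}

static void model_run_timer_softirq(bool nohz_common)
{
	expire(MY_BASE_LOCAL);
	if (nohz_common) {
		expire(MY_BASE_GLOBAL);
		expire(MY_BASE_DEF);
	}
}

int main(void)
{
	model_run_timer_softirq(true);
	return 0;
}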