Skip to content

Commit 83a665d

Browse files
anna-marialx and KAGA-KOKO
authored and committed
timers: Keep the pinned timers separate from the others
Separate the storage space for pinned timers. Deferrable timers (doesn't matter if pinned or non pinned) are still enqueued into their own base. This is preparatory work for changing the NOHZ timer placement from a push at enqueue time to a pull at expiry time model. Originally-by: Richard Cochran (linutronix GmbH) <richardcochran@gmail.com> Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Frederic Weisbecker <frederic@kernel.org> Link: https://lore.kernel.org/r/20240221090548.36600-11-anna-maria@linutronix.de
1 parent 9f6a3c6 commit 83a665d

1 file changed

Lines changed: 56 additions & 29 deletions

File tree

kernel/time/timer.c

Lines changed: 56 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -187,12 +187,18 @@ EXPORT_SYMBOL(jiffies_64);
187187
#define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH)
188188

189189
#ifdef CONFIG_NO_HZ_COMMON
190-
# define NR_BASES 2
191-
# define BASE_STD 0
192-
# define BASE_DEF 1
190+
/*
191+
* If multiple bases need to be locked, use the base ordering for lock
192+
* nesting, i.e. lowest number first.
193+
*/
194+
# define NR_BASES 3
195+
# define BASE_LOCAL 0
196+
# define BASE_GLOBAL 1
197+
# define BASE_DEF 2
193198
#else
194199
# define NR_BASES 1
195-
# define BASE_STD 0
200+
# define BASE_LOCAL 0
201+
# define BASE_GLOBAL 0
196202
# define BASE_DEF 0
197203
#endif
198204

@@ -944,7 +950,10 @@ static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
944950

945951
static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
946952
{
947-
struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
953+
int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL;
954+
struct timer_base *base;
955+
956+
base = per_cpu_ptr(&timer_bases[index], cpu);
948957

949958
/*
950959
* If the timer is deferrable and NO_HZ_COMMON is set then we need
@@ -957,7 +966,10 @@ static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
957966

958967
static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
959968
{
960-
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
969+
int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL;
970+
struct timer_base *base;
971+
972+
base = this_cpu_ptr(&timer_bases[index]);
961973

962974
/*
963975
* If the timer is deferrable and NO_HZ_COMMON is set then we need
@@ -2006,6 +2018,9 @@ static unsigned long next_timer_interrupt(struct timer_base *base,
20062018
* Move next_expiry for the empty base into the future to prevent an
20072019
* unnecessary raise of the timer softirq when the next_expiry value
20082020
* will be reached even if there is no timer pending.
2021+
*
2022+
* This update is also required to make timer_base::next_expiry values
2023+
* easy comparable to find out which base holds the first pending timer.
20092024
*/
20102025
if (!base->timers_pending)
20112026
base->next_expiry = basej + NEXT_TIMER_MAX_DELTA;
@@ -2016,9 +2031,10 @@ static unsigned long next_timer_interrupt(struct timer_base *base,
20162031
static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
20172032
bool *idle)
20182033
{
2019-
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
2034+
unsigned long nextevt, nextevt_local, nextevt_global;
2035+
struct timer_base *base_local, *base_global;
20202036
u64 expires = KTIME_MAX;
2021-
unsigned long nextevt;
2037+
bool local_first;
20222038

20232039
/*
20242040
* Pretend that there is no timer pending if the cpu is offline.
@@ -2030,10 +2046,20 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
20302046
return expires;
20312047
}
20322048

2033-
raw_spin_lock(&base->lock);
2034-
nextevt = next_timer_interrupt(base, basej);
2049+
base_local = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
2050+
base_global = this_cpu_ptr(&timer_bases[BASE_GLOBAL]);
20352051

2036-
if (base->timers_pending) {
2052+
raw_spin_lock(&base_local->lock);
2053+
raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING);
2054+
2055+
nextevt_local = next_timer_interrupt(base_local, basej);
2056+
nextevt_global = next_timer_interrupt(base_global, basej);
2057+
2058+
local_first = time_before_eq(nextevt_local, nextevt_global);
2059+
2060+
nextevt = local_first ? nextevt_local : nextevt_global;
2061+
2062+
if (base_local->timers_pending || base_global->timers_pending) {
20372063
/* If we missed a tick already, force 0 delta */
20382064
if (time_before(nextevt, basej))
20392065
nextevt = basej;
@@ -2044,31 +2070,31 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
20442070
* We have a fresh next event. Check whether we can forward the
20452071
* base.
20462072
*/
2047-
__forward_timer_base(base, basej);
2073+
__forward_timer_base(base_local, basej);
2074+
__forward_timer_base(base_global, basej);
20482075

20492076
/*
20502077
* Set base->is_idle only when caller is timer_base_try_to_set_idle()
20512078
*/
20522079
if (idle) {
20532080
/*
2054-
* Base is idle if the next event is more than a tick away.
2081+
* Bases are idle if the next event is more than a tick away.
20552082
*
20562083
* If the base is marked idle then any timer add operation must
20572084
* forward the base clk itself to keep granularity small. This
2058-
* idle logic is only maintained for the BASE_STD base,
2059-
* deferrable timers may still see large granularity skew (by
2060-
* design).
2085+
* idle logic is only maintained for the BASE_LOCAL and
2086+
* BASE_GLOBAL base, deferrable timers may still see large
2087+
* granularity skew (by design).
20612088
*/
2062-
if (!base->is_idle) {
2063-
if (time_after(nextevt, basej + 1)) {
2064-
base->is_idle = true;
2065-
trace_timer_base_idle(true, base->cpu);
2066-
}
2089+
if (!base_local->is_idle && time_after(nextevt, basej + 1)) {
2090+
base_local->is_idle = base_global->is_idle = true;
2091+
trace_timer_base_idle(true, base_local->cpu);
20672092
}
2068-
*idle = base->is_idle;
2093+
*idle = base_local->is_idle;
20692094
}
20702095

2071-
raw_spin_unlock(&base->lock);
2096+
raw_spin_unlock(&base_global->lock);
2097+
raw_spin_unlock(&base_local->lock);
20722098

20732099
return cmp_next_hrtimer_event(basem, expires);
20742100
}
@@ -2112,15 +2138,14 @@ u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle)
21122138
*/
21132139
void timer_clear_idle(void)
21142140
{
2115-
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
2116-
21172141
/*
21182142
* We do this unlocked. The worst outcome is a remote enqueue sending
21192143
* a pointless IPI, but taking the lock would just make the window for
21202144
* sending the IPI a few instructions smaller for the cost of taking
21212145
* the lock in the exit from idle path.
21222146
*/
2123-
base->is_idle = false;
2147+
__this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false);
2148+
__this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false);
21242149
trace_timer_base_idle(false, smp_processor_id());
21252150
}
21262151
#endif
@@ -2171,19 +2196,21 @@ static inline void __run_timers(struct timer_base *base)
21712196
*/
21722197
static __latent_entropy void run_timer_softirq(struct softirq_action *h)
21732198
{
2174-
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
2199+
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
21752200

21762201
__run_timers(base);
2177-
if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
2202+
if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) {
2203+
__run_timers(this_cpu_ptr(&timer_bases[BASE_GLOBAL]));
21782204
__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
2205+
}
21792206
}
21802207

21812208
/*
21822209
* Called by the local, per-CPU timer interrupt on SMP.
21832210
*/
21842211
static void run_local_timers(void)
21852212
{
2186-
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
2213+
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
21872214

21882215
hrtimer_run_queues();
21892216

0 commit comments

Comments
 (0)