Skip to content

Commit d42e504

Browse files
committed
Merge tag 'timers-core-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull timer core updates from Thomas Gleixner:

 - Prevent a thundering herd problem when the timekeeper CPU is delayed
   and a large number of CPUs compete to acquire jiffies_lock to do the
   update. Limit it to one CPU with a separate "uncontended" atomic
   variable.

 - A set of improvements for the timer migration mechanism:

     - Support imbalanced NUMA trees correctly

     - Support dynamic exclusion of CPUs from the migrator duty to allow
       the cpuset/isolation mechanism to exclude them from handling
       timers of remote idle CPUs

 - The usual small updates, cleanups and enhancements

* tag 'timers-core-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  timers/migration: Exclude isolated cpus from hierarchy
  cpumask: Add initialiser to use cleanup helpers
  sched/isolation: Force housekeeping if isolcpus and nohz_full don't leave any
  cgroup/cpuset: Rename update_unbound_workqueue_cpumask() to update_isolation_cpumasks()
  timers/migration: Use scoped_guard on available flag set/clear
  timers/migration: Add mask for CPUs available in the hierarchy
  timers/migration: Rename 'online' bit to 'available'
  selftests/timers/nanosleep: Add tests for return of remaining time
  selftests/timers: Clean up kernel version check in posix_timers
  time: Fix a few typos in time[r] related code comments
  time: tick-oneshot: Add missing Return and parameter descriptions to kernel-doc
  hrtimer: Store time as ktime_t in restart block
  timers/migration: Remove dead code handling idle CPU checking for remote timers
  timers/migration: Remove unused "cpu" parameter from tmigr_get_group()
  timers/migration: Assert that hotplug preparing CPU is part of stable active hierarchy
  timers/migration: Fix imbalanced NUMA trees
  timers/migration: Remove locking on group connection
  timers/migration: Convert "while" loops to use "for"
  tick/sched: Limit non-timekeeper CPUs calling jiffies update
2 parents 5028f42 + 7dec062 commit d42e504

16 files changed

Lines changed: 503 additions & 196 deletions

File tree

include/linux/cpumask.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1022,6 +1022,7 @@ static __always_inline unsigned int cpumask_size(void)
10221022

10231023
#define this_cpu_cpumask_var_ptr(x) this_cpu_read(x)
10241024
#define __cpumask_var_read_mostly __read_mostly
1025+
#define CPUMASK_VAR_NULL NULL
10251026

10261027
bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);
10271028

@@ -1068,6 +1069,7 @@ static __always_inline bool cpumask_available(cpumask_var_t mask)
10681069

10691070
#define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x)
10701071
#define __cpumask_var_read_mostly
1072+
#define CPUMASK_VAR_NULL {}
10711073

10721074
static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
10731075
{

include/linux/delay.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ void usleep_range_state(unsigned long min, unsigned long max,
6868
* @min: Minimum time in microseconds to sleep
6969
* @max: Maximum time in microseconds to sleep
7070
*
71-
* For basic information please refere to usleep_range_state().
71+
* For basic information please refer to usleep_range_state().
7272
*
7373
* The task will be in the state TASK_UNINTERRUPTIBLE during the sleep.
7474
*/
@@ -82,10 +82,10 @@ static inline void usleep_range(unsigned long min, unsigned long max)
8282
* @min: Minimum time in microseconds to sleep
8383
* @max: Maximum time in microseconds to sleep
8484
*
85-
* For basic information please refere to usleep_range_state().
85+
* For basic information please refer to usleep_range_state().
8686
*
8787
* The sleeping task has the state TASK_IDLE during the sleep to prevent
88-
* contribution to the load avarage.
88+
* contribution to the load average.
8989
*/
9090
static inline void usleep_range_idle(unsigned long min, unsigned long max)
9191
{
@@ -96,7 +96,7 @@ static inline void usleep_range_idle(unsigned long min, unsigned long max)
9696
* ssleep - wrapper for seconds around msleep
9797
* @seconds: Requested sleep duration in seconds
9898
*
99-
* Please refere to msleep() for detailed information.
99+
* Please refer to msleep() for detailed information.
100100
*/
101101
static inline void ssleep(unsigned int seconds)
102102
{

include/linux/restart_block.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ struct restart_block {
4343
struct __kernel_timespec __user *rmtp;
4444
struct old_timespec32 __user *compat_rmtp;
4545
};
46-
u64 expires;
46+
ktime_t expires;
4747
} nanosleep;
4848
/* For poll */
4949
struct {

include/linux/timer.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,4 +188,13 @@ int timers_dead_cpu(unsigned int cpu);
188188
#define timers_dead_cpu NULL
189189
#endif
190190

191+
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
192+
extern int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask);
193+
#else
194+
static inline int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask)
195+
{
196+
return 0;
197+
}
198+
#endif
199+
191200
#endif

include/trace/events/timer_migration.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -173,14 +173,14 @@ DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_active,
173173
TP_ARGS(tmc)
174174
);
175175

176-
DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_online,
176+
DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_available,
177177

178178
TP_PROTO(struct tmigr_cpu *tmc),
179179

180180
TP_ARGS(tmc)
181181
);
182182

183-
DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_offline,
183+
DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_unavailable,
184184

185185
TP_PROTO(struct tmigr_cpu *tmc),
186186

kernel/cgroup/cpuset.c

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1391,7 +1391,7 @@ static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
13911391
return isolcpus_updated;
13921392
}
13931393

1394-
static void update_unbound_workqueue_cpumask(bool isolcpus_updated)
1394+
static void update_isolation_cpumasks(bool isolcpus_updated)
13951395
{
13961396
int ret;
13971397

@@ -1402,6 +1402,9 @@ static void update_unbound_workqueue_cpumask(bool isolcpus_updated)
14021402

14031403
ret = workqueue_unbound_exclude_cpumask(isolated_cpus);
14041404
WARN_ON_ONCE(ret < 0);
1405+
1406+
ret = tmigr_isolated_exclude_cpumask(isolated_cpus);
1407+
WARN_ON_ONCE(ret < 0);
14051408
}
14061409

14071410
/**
@@ -1555,7 +1558,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
15551558
list_add(&cs->remote_sibling, &remote_children);
15561559
cpumask_copy(cs->effective_xcpus, tmp->new_cpus);
15571560
spin_unlock_irq(&callback_lock);
1558-
update_unbound_workqueue_cpumask(isolcpus_updated);
1561+
update_isolation_cpumasks(isolcpus_updated);
15591562
cpuset_force_rebuild();
15601563
cs->prs_err = 0;
15611564

@@ -1596,7 +1599,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
15961599
compute_excpus(cs, cs->effective_xcpus);
15971600
reset_partition_data(cs);
15981601
spin_unlock_irq(&callback_lock);
1599-
update_unbound_workqueue_cpumask(isolcpus_updated);
1602+
update_isolation_cpumasks(isolcpus_updated);
16001603
cpuset_force_rebuild();
16011604

16021605
/*
@@ -1665,7 +1668,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,
16651668
if (xcpus)
16661669
cpumask_copy(cs->exclusive_cpus, xcpus);
16671670
spin_unlock_irq(&callback_lock);
1668-
update_unbound_workqueue_cpumask(isolcpus_updated);
1671+
update_isolation_cpumasks(isolcpus_updated);
16691672
if (adding || deleting)
16701673
cpuset_force_rebuild();
16711674

@@ -2023,7 +2026,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
20232026
WARN_ON_ONCE(parent->nr_subparts < 0);
20242027
}
20252028
spin_unlock_irq(&callback_lock);
2026-
update_unbound_workqueue_cpumask(isolcpus_updated);
2029+
update_isolation_cpumasks(isolcpus_updated);
20272030

20282031
if ((old_prs != new_prs) && (cmd == partcmd_update))
20292032
update_partition_exclusive_flag(cs, new_prs);
@@ -3043,7 +3046,7 @@ static int update_prstate(struct cpuset *cs, int new_prs)
30433046
else if (isolcpus_updated)
30443047
isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus);
30453048
spin_unlock_irq(&callback_lock);
3046-
update_unbound_workqueue_cpumask(isolcpus_updated);
3049+
update_isolation_cpumasks(isolcpus_updated);
30473050

30483051
/* Force update if switching back to member & update effective_xcpus */
30493052
update_cpumasks_hier(cs, &tmpmask, !new_prs);

kernel/sched/isolation.c

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,29 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
167167
}
168168
}
169169

170+
/*
171+
* Check the combination of nohz_full and isolcpus=domain,
172+
* necessary to avoid problems with the timer migration
173+
* hierarchy. managed_irq is ignored by this check since it
174+
* isn't considered in the timer migration logic.
175+
*/
176+
iter_flags = housekeeping.flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN);
177+
type = find_first_bit(&iter_flags, HK_TYPE_MAX);
178+
/*
179+
* Pass the check if none of these flags were previously set or
180+
* none of them are in the current selection.
181+
*/
182+
iter_flags = flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN);
183+
first_cpu = (type == HK_TYPE_MAX || !iter_flags) ? 0 :
184+
cpumask_first_and_and(cpu_present_mask,
185+
housekeeping_staging, housekeeping.cpumasks[type]);
186+
if (first_cpu >= min(nr_cpu_ids, setup_max_cpus)) {
187+
pr_warn("Housekeeping: must include one present CPU "
188+
"neither in nohz_full= nor in isolcpus=domain, "
189+
"ignoring setting %s\n", str);
190+
goto free_housekeeping_staging;
191+
}
192+
170193
iter_flags = flags & ~housekeeping.flags;
171194

172195
for_each_set_bit(type, &iter_flags, HK_TYPE_MAX)

kernel/time/hrtimer.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2145,7 +2145,7 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
21452145
int ret;
21462146

21472147
hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS);
2148-
hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
2148+
hrtimer_set_expires(&t.timer, restart->nanosleep.expires);
21492149
ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
21502150
destroy_hrtimer_on_stack(&t.timer);
21512151
return ret;
@@ -2172,7 +2172,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
21722172

21732173
restart = &current->restart_block;
21742174
restart->nanosleep.clockid = t.timer.base->clockid;
2175-
restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
2175+
restart->nanosleep.expires = hrtimer_get_expires(&t.timer);
21762176
set_restart_fn(restart, hrtimer_nanosleep_restart);
21772177
out:
21782178
destroy_hrtimer_on_stack(&t.timer);

kernel/time/posix-cpu-timers.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1557,7 +1557,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
15571557
* Report back to the user the time still remaining.
15581558
*/
15591559
restart = &current->restart_block;
1560-
restart->nanosleep.expires = expires;
1560+
restart->nanosleep.expires = ns_to_ktime(expires);
15611561
if (restart->nanosleep.type != TT_NONE)
15621562
error = nanosleep_copyout(restart, &it.it_value);
15631563
}
@@ -1599,7 +1599,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
15991599
clockid_t which_clock = restart_block->nanosleep.clockid;
16001600
struct timespec64 t;
16011601

1602-
t = ns_to_timespec64(restart_block->nanosleep.expires);
1602+
t = ktime_to_timespec64(restart_block->nanosleep.expires);
16031603

16041604
return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t);
16051605
}

kernel/time/posix-timers.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1242,7 +1242,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
12421242
* sys_clock_settime(). The kernel internal timekeeping is always using
12431243
* nanoseconds precision independent of the clocksource device which is
12441244
* used to read the time from. The resolution of that device only
1245-
* affects the presicion of the time returned by sys_clock_gettime().
1245+
* affects the precision of the time returned by sys_clock_gettime().
12461246
*
12471247
* Returns:
12481248
* 0 Success. @tp contains the resolution

0 commit comments

Comments
 (0)