Skip to content

Commit 2b09f48

Browse files
committed
Merge tag 'core-rseq-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull rseq updates from Thomas Gleixner: "A large overhaul of the restartable sequences and CID management: The recent enablement of RSEQ in glibc resulted in regressions which are caused by the related overhead. It turned out that the decision to invoke the exit to user work was not really a decision. More or less each context switch caused that. There is a long list of small issues which sums up nicely and results in a 3-4% regression in I/O benchmarks. The other detail which caused issues due to extra work in context switch and task migration is the CID (memory context ID) management. It also requires the use of a task work to consolidate the CID space, which is executed in the context of an arbitrary task and results in sporadic uncontrolled exit latencies. The rewrite addresses this by: - Removing deprecated and long unsupported functionality - Moving the related data into dedicated data structures which are optimized for fast path processing. - Caching values so actual decisions can be made - Replacing the current implementation with an optimized inlined variant. - Separating fast and slow path for architectures which use the generic entry code, so that only fault and error handling goes into the TIF_NOTIFY_RESUME handler. - Rewriting the CID management so that it becomes mostly invisible in the context switch path. That moves the work of switching modes into the fork/exit path, which is a reasonable tradeoff. That work is only required when a process creates more threads than the cpuset it is allowed to run on or when enough threads exit after that. An artificial thread pool benchmark which triggers this did not degrade; it actually improved significantly. 
The main effect in migration heavy scenarios is that runqueue lock held time and therefore contention goes down significantly" * tag 'core-rseq-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (54 commits) sched/mmcid: Switch over to the new mechanism sched/mmcid: Implement deferred mode change irqwork: Move data struct to a types header sched/mmcid: Provide CID ownership mode fixup functions sched/mmcid: Provide new scheduler CID mechanism sched/mmcid: Introduce per task/CPU ownership infrastructure sched/mmcid: Serialize sched_mm_cid_fork()/exit() with a mutex sched/mmcid: Provide precomputed maximal value sched/mmcid: Move initialization out of line signal: Move MMCID exit out of sighand lock sched/mmcid: Convert mm CID mask to a bitmap cpumask: Cache num_possible_cpus() sched/mmcid: Use cpumask_weighted_or() cpumask: Introduce cpumask_weighted_or() sched/mmcid: Prevent pointless work in mm_update_cpus_allowed() sched/mmcid: Move scheduler code out of global header sched: Fixup whitespace damage sched/mmcid: Cacheline align MM CID storage sched/mmcid: Use proper data structures sched/mmcid: Revert the complex CID management ...
2 parents 1dce506 + 653fda7 commit 2b09f48

40 files changed

Lines changed: 2064 additions & 1429 deletions

Documentation/admin-guide/kernel-parameters.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6500,6 +6500,10 @@
65006500
Memory area to be used by remote processor image,
65016501
managed by CMA.
65026502

6503+
rseq_debug= [KNL] Enable or disable restartable sequence
6504+
debug mode. Defaults to CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE.
6505+
Format: <bool>
6506+
65036507
rt_group_sched= [KNL] Enable or disable SCHED_RR/FIFO group scheduling
65046508
when CONFIG_RT_GROUP_SCHED=y. Defaults to
65056509
!CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED.

arch/arm64/kernel/entry-common.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ static __always_inline void arm64_enter_from_user_mode(struct pt_regs *regs)
100100
static __always_inline void arm64_exit_to_user_mode(struct pt_regs *regs)
101101
{
102102
local_irq_disable();
103-
exit_to_user_mode_prepare(regs);
103+
exit_to_user_mode_prepare_legacy(regs);
104104
local_daif_mask();
105105
mte_check_tfsr_exit();
106106
exit_to_user_mode();

arch/x86/entry/syscall_32.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -274,9 +274,10 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
274274
* fetch EBP before invoking any of the syscall entry work
275275
* functions.
276276
*/
277-
syscall_enter_from_user_mode_prepare(regs);
277+
enter_from_user_mode(regs);
278278

279279
instrumentation_begin();
280+
local_irq_enable();
280281
/* Fetch EBP from where the vDSO stashed it. */
281282
if (IS_ENABLED(CONFIG_X86_64)) {
282283
/*

arch/x86/include/asm/ptrace.h

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -187,12 +187,12 @@ convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
187187
extern void send_sigtrap(struct pt_regs *regs, int error_code, int si_code);
188188

189189

190-
static inline unsigned long regs_return_value(struct pt_regs *regs)
190+
static __always_inline unsigned long regs_return_value(struct pt_regs *regs)
191191
{
192192
return regs->ax;
193193
}
194194

195-
static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
195+
static __always_inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
196196
{
197197
regs->ax = rc;
198198
}
@@ -277,34 +277,34 @@ static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs)
277277
}
278278
#endif
279279

280-
static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
280+
static __always_inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
281281
{
282282
return regs->sp;
283283
}
284284

285-
static inline unsigned long instruction_pointer(struct pt_regs *regs)
285+
static __always_inline unsigned long instruction_pointer(struct pt_regs *regs)
286286
{
287287
return regs->ip;
288288
}
289289

290-
static inline void instruction_pointer_set(struct pt_regs *regs,
291-
unsigned long val)
290+
static __always_inline
291+
void instruction_pointer_set(struct pt_regs *regs, unsigned long val)
292292
{
293293
regs->ip = val;
294294
}
295295

296-
static inline unsigned long frame_pointer(struct pt_regs *regs)
296+
static __always_inline unsigned long frame_pointer(struct pt_regs *regs)
297297
{
298298
return regs->bp;
299299
}
300300

301-
static inline unsigned long user_stack_pointer(struct pt_regs *regs)
301+
static __always_inline unsigned long user_stack_pointer(struct pt_regs *regs)
302302
{
303303
return regs->sp;
304304
}
305305

306-
static inline void user_stack_pointer_set(struct pt_regs *regs,
307-
unsigned long val)
306+
static __always_inline
307+
void user_stack_pointer_set(struct pt_regs *regs, unsigned long val)
308308
{
309309
regs->sp = val;
310310
}

drivers/hv/mshv_root_main.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include <linux/crash_dump.h>
3030
#include <linux/panic_notifier.h>
3131
#include <linux/vmalloc.h>
32+
#include <linux/rseq.h>
3233

3334
#include "mshv_eventfd.h"
3435
#include "mshv.h"
@@ -560,6 +561,8 @@ static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
560561
}
561562
} while (!vp->run.flags.intercept_suspend);
562563

564+
rseq_virt_userspace_exit();
565+
563566
return ret;
564567
}
565568

fs/binfmt_elf.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
#include <linux/cred.h>
4747
#include <linux/dax.h>
4848
#include <linux/uaccess.h>
49-
#include <linux/rseq.h>
49+
#include <uapi/linux/rseq.h>
5050
#include <asm/param.h>
5151
#include <asm/page.h>
5252

fs/exec.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1774,7 +1774,7 @@ static int bprm_execve(struct linux_binprm *bprm)
17741774
force_fatal_sig(SIGSEGV);
17751775

17761776
sched_mm_cid_after_execve(current);
1777-
rseq_set_notify_resume(current);
1777+
rseq_force_update();
17781778
current->in_execve = 0;
17791779

17801780
return retval;

include/asm-generic/thread_info_tif.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,4 +45,7 @@
4545
# define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK)
4646
#endif
4747

48+
#define TIF_RSEQ 11 // Run RSEQ fast path
49+
#define _TIF_RSEQ BIT(TIF_RSEQ)
50+
4851
#endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */

include/linux/bitmap.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ struct device;
4545
* bitmap_copy(dst, src, nbits) *dst = *src
4646
* bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2
4747
* bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2
48+
* bitmap_weighted_or(dst, src1, src2, nbits) *dst = *src1 | *src2. Returns Hamming Weight of dst
4849
* bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2
4950
* bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2)
5051
* bitmap_complement(dst, src, nbits) *dst = ~(*src)
@@ -165,6 +166,8 @@ bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
165166
const unsigned long *bitmap2, unsigned int nbits);
166167
void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
167168
const unsigned long *bitmap2, unsigned int nbits);
169+
unsigned int __bitmap_weighted_or(unsigned long *dst, const unsigned long *bitmap1,
170+
const unsigned long *bitmap2, unsigned int nbits);
168171
void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
169172
const unsigned long *bitmap2, unsigned int nbits);
170173
bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
@@ -337,6 +340,18 @@ void bitmap_or(unsigned long *dst, const unsigned long *src1,
337340
__bitmap_or(dst, src1, src2, nbits);
338341
}
339342

343+
static __always_inline
344+
unsigned int bitmap_weighted_or(unsigned long *dst, const unsigned long *src1,
345+
const unsigned long *src2, unsigned int nbits)
346+
{
347+
if (small_const_nbits(nbits)) {
348+
*dst = *src1 | *src2;
349+
return hweight_long(*dst & BITMAP_LAST_WORD_MASK(nbits));
350+
} else {
351+
return __bitmap_weighted_or(dst, src1, src2, nbits);
352+
}
353+
}
354+
340355
static __always_inline
341356
void bitmap_xor(unsigned long *dst, const unsigned long *src1,
342357
const unsigned long *src2, unsigned int nbits)

include/linux/cleanup.h

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@
208208
*/
209209

210210
#define DEFINE_FREE(_name, _type, _free) \
211-
static inline void __free_##_name(void *p) { _type _T = *(_type *)p; _free; }
211+
static __always_inline void __free_##_name(void *p) { _type _T = *(_type *)p; _free; }
212212

213213
#define __free(_name) __cleanup(__free_##_name)
214214

@@ -220,7 +220,7 @@
220220
__val; \
221221
})
222222

223-
static inline __must_check
223+
static __always_inline __must_check
224224
const volatile void * __must_check_fn(const volatile void *val)
225225
{ return val; }
226226

@@ -278,16 +278,16 @@ const volatile void * __must_check_fn(const volatile void *val)
278278

279279
#define DEFINE_CLASS(_name, _type, _exit, _init, _init_args...) \
280280
typedef _type class_##_name##_t; \
281-
static inline void class_##_name##_destructor(_type *p) \
281+
static __always_inline void class_##_name##_destructor(_type *p) \
282282
{ _type _T = *p; _exit; } \
283-
static inline _type class_##_name##_constructor(_init_args) \
283+
static __always_inline _type class_##_name##_constructor(_init_args) \
284284
{ _type t = _init; return t; }
285285

286286
#define EXTEND_CLASS(_name, ext, _init, _init_args...) \
287287
typedef class_##_name##_t class_##_name##ext##_t; \
288-
static inline void class_##_name##ext##_destructor(class_##_name##_t *p)\
288+
static __always_inline void class_##_name##ext##_destructor(class_##_name##_t *p) \
289289
{ class_##_name##_destructor(p); } \
290-
static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \
290+
static __always_inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \
291291
{ class_##_name##_t t = _init; return t; }
292292

293293
#define CLASS(_name, var) \
@@ -360,15 +360,15 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond
360360
})
361361

362362
#define __DEFINE_GUARD_LOCK_PTR(_name, _exp) \
363-
static inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \
363+
static __always_inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \
364364
{ \
365365
void *_ptr = (void *)(__force unsigned long)*(_exp); \
366366
if (IS_ERR(_ptr)) { \
367367
_ptr = NULL; \
368368
} \
369369
return _ptr; \
370370
} \
371-
static inline int class_##_name##_lock_err(class_##_name##_t *_T) \
371+
static __always_inline int class_##_name##_lock_err(class_##_name##_t *_T) \
372372
{ \
373373
long _rc = (__force unsigned long)*(_exp); \
374374
if (!_rc) { \
@@ -397,9 +397,9 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond
397397
EXTEND_CLASS(_name, _ext, \
398398
({ void *_t = _T; int _RET = (_lock); if (_T && !(_cond)) _t = ERR_PTR(_RET); _t; }), \
399399
class_##_name##_t _T) \
400-
static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \
400+
static __always_inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \
401401
{ return class_##_name##_lock_ptr(_T); } \
402-
static inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \
402+
static __always_inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \
403403
{ return class_##_name##_lock_err(_T); }
404404

405405
/*
@@ -479,23 +479,23 @@ typedef struct { \
479479
__VA_ARGS__; \
480480
} class_##_name##_t; \
481481
\
482-
static inline void class_##_name##_destructor(class_##_name##_t *_T) \
482+
static __always_inline void class_##_name##_destructor(class_##_name##_t *_T) \
483483
{ \
484484
if (!__GUARD_IS_ERR(_T->lock)) { _unlock; } \
485485
} \
486486
\
487487
__DEFINE_GUARD_LOCK_PTR(_name, &_T->lock)
488488

489489
#define __DEFINE_LOCK_GUARD_1(_name, _type, _lock) \
490-
static inline class_##_name##_t class_##_name##_constructor(_type *l) \
490+
static __always_inline class_##_name##_t class_##_name##_constructor(_type *l) \
491491
{ \
492492
class_##_name##_t _t = { .lock = l }, *_T = &_t; \
493493
_lock; \
494494
return _t; \
495495
}
496496

497497
#define __DEFINE_LOCK_GUARD_0(_name, _lock) \
498-
static inline class_##_name##_t class_##_name##_constructor(void) \
498+
static __always_inline class_##_name##_t class_##_name##_constructor(void) \
499499
{ \
500500
class_##_name##_t _t = { .lock = (void*)1 }, \
501501
*_T __maybe_unused = &_t; \
@@ -521,9 +521,9 @@ __DEFINE_LOCK_GUARD_0(_name, _lock)
521521
if (_T->lock && !(_cond)) _T->lock = ERR_PTR(_RET);\
522522
_t; }), \
523523
typeof_member(class_##_name##_t, lock) l) \
524-
static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \
524+
static __always_inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \
525525
{ return class_##_name##_lock_ptr(_T); } \
526-
static inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \
526+
static __always_inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \
527527
{ return class_##_name##_lock_err(_T); }
528528

529529
#define DEFINE_LOCK_GUARD_1_COND_3(_name, _ext, _lock) \

0 commit comments

Comments
 (0)