Skip to content

Commit 3db6b38

Browse files
KAGA-KOKO authored and ingomolnar committed
rseq: Switch to fast path processing on exit to user
Now that all bits and pieces are in place, hook the RSEQ handling fast path function into exit_to_user_mode_prepare() after the TIF work bits have been handled. If case of fast path failure, TIF_NOTIFY_RESUME has been raised and the caller needs to take another turn through the TIF handling slow path. This only works for architectures which use the generic entry code. Architectures who still have their own incomplete hacks are not supported and won't be. This results in the following improvements: Kernel build Before After Reduction exit to user 80692981 80514451 signal checks: 32581 121 99% slowpath runs: 1201408 1.49% 198 0.00% 100% fastpath runs: 675941 0.84% N/A id updates: 1233989 1.53% 50541 0.06% 96% cs checks: 1125366 1.39% 0 0.00% 100% cs cleared: 1125366 100% 0 100% cs fixup: 0 0% 0 RSEQ selftests Before After Reduction exit to user: 386281778 387373750 signal checks: 35661203 0 100% slowpath runs: 140542396 36.38% 100 0.00% 100% fastpath runs: 9509789 2.51% N/A id updates: 176203599 45.62% 9087994 2.35% 95% cs checks: 175587856 45.46% 4728394 1.22% 98% cs cleared: 172359544 98.16% 1319307 27.90% 99% cs fixup: 3228312 1.84% 3409087 72.10% The 'cs cleared' and 'cs fixup' percentages are not relative to the exit to user invocations, they are relative to the actual 'cs check' invocations. While some of this could have been avoided in the original code, like the obvious clearing of CS when it's already clear, the main problem of going through TIF_NOTIFY_RESUME cannot be solved. In some workloads the RSEQ notify handler is invoked more than once before going out to user space. Doing this once when everything has stabilized is the only solution to avoid this. The initial attempt to completely decouple it from the TIF work turned out to be suboptimal for workloads, which do a lot of quick and short system calls. 
Even if the fast path decision is only 4 instructions (including a conditional branch), this adds up quickly and becomes measurable when the rate for actually having to handle rseq is in the low single digit percentage range of user/kernel transitions. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Signed-off-by: Ingo Molnar <mingo@kernel.org> Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Link: https://patch.msgid.link/20251027084307.701201365@linutronix.de
1 parent 05b44ae commit 3db6b38

6 files changed

Lines changed: 41 additions & 22 deletions

File tree

include/linux/irq-entry-common.h

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -197,11 +197,8 @@ static __always_inline void arch_exit_to_user_mode(void) { }
197197
*/
198198
void arch_do_signal_or_restart(struct pt_regs *regs);
199199

200-
/**
201-
* exit_to_user_mode_loop - do any pending work before leaving to user space
202-
*/
203-
unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
204-
unsigned long ti_work);
200+
/* Handle pending TIF work */
201+
unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work);
205202

206203
/**
207204
* exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required

include/linux/resume_user_mode.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ static inline void resume_user_mode_work(struct pt_regs *regs)
5959
mem_cgroup_handle_over_high(GFP_KERNEL);
6060
blkcg_maybe_throttle_current();
6161

62-
rseq_handle_notify_resume(regs);
62+
rseq_handle_slowpath(regs);
6363
}
6464

6565
#endif /* LINUX_RESUME_USER_MODE_H */

include/linux/rseq.h

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,19 @@
77

88
#include <uapi/linux/rseq.h>
99

10-
void __rseq_handle_notify_resume(struct pt_regs *regs);
10+
void __rseq_handle_slowpath(struct pt_regs *regs);
1111

12-
static inline void rseq_handle_notify_resume(struct pt_regs *regs)
12+
/* Invoked from resume_user_mode_work() */
13+
static inline void rseq_handle_slowpath(struct pt_regs *regs)
1314
{
14-
/* '&' is intentional to spare one conditional branch */
15-
if (current->rseq.event.sched_switch & current->rseq.event.has_rseq)
16-
__rseq_handle_notify_resume(regs);
15+
if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
16+
if (current->rseq.event.slowpath)
17+
__rseq_handle_slowpath(regs);
18+
} else {
19+
/* '&' is intentional to spare one conditional branch */
20+
if (current->rseq.event.sched_switch & current->rseq.event.has_rseq)
21+
__rseq_handle_slowpath(regs);
22+
}
1723
}
1824

1925
void __rseq_signal_deliver(int sig, struct pt_regs *regs);
@@ -152,7 +158,7 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
152158
}
153159

154160
#else /* CONFIG_RSEQ */
155-
static inline void rseq_handle_notify_resume(struct pt_regs *regs) { }
161+
static inline void rseq_handle_slowpath(struct pt_regs *regs) { }
156162
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
157163
static inline void rseq_sched_switch_event(struct task_struct *t) { }
158164
static inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) { }

init/Kconfig

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1941,7 +1941,7 @@ config RSEQ_DEBUG_DEFAULT_ENABLE
19411941
config DEBUG_RSEQ
19421942
default n
19431943
bool "Enable debugging of rseq() system call" if EXPERT
1944-
depends on RSEQ && DEBUG_KERNEL
1944+
depends on RSEQ && DEBUG_KERNEL && !GENERIC_ENTRY
19451945
select RSEQ_DEBUG_DEFAULT_ENABLE
19461946
help
19471947
Enable extra debugging checks for the rseq system call.

kernel/entry/common.c

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,8 @@
1111
/* Workaround to allow gradual conversion of architecture code */
1212
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
1313

14-
/**
15-
* exit_to_user_mode_loop - do any pending work before leaving to user space
16-
* @regs: Pointer to pt_regs on entry stack
17-
* @ti_work: TIF work flags as read by the caller
18-
*/
19-
__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
20-
unsigned long ti_work)
14+
static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs,
15+
unsigned long ti_work)
2116
{
2217
/*
2318
* Before returning to user space ensure that all pending work
@@ -62,6 +57,23 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
6257
return ti_work;
6358
}
6459

60+
/**
61+
* exit_to_user_mode_loop - do any pending work before leaving to user space
62+
* @regs: Pointer to pt_regs on entry stack
63+
* @ti_work: TIF work flags as read by the caller
64+
*/
65+
__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
66+
unsigned long ti_work)
67+
{
68+
for (;;) {
69+
ti_work = __exit_to_user_mode_loop(regs, ti_work);
70+
71+
if (likely(!rseq_exit_to_user_mode_restart(regs)))
72+
return ti_work;
73+
ti_work = read_thread_flags();
74+
}
75+
}
76+
6577
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
6678
{
6779
irqentry_state_t ret = {

kernel/rseq.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,11 @@ static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs)
237237

238238
static void rseq_slowpath_update_usr(struct pt_regs *regs)
239239
{
240-
/* Preserve rseq state and user_irq state for exit to user */
240+
/*
241+
* Preserve rseq state and user_irq state. The generic entry code
242+
* clears user_irq on the way out, the non-generic entry
243+
* architectures are not having user_irq.
244+
*/
241245
const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, };
242246
struct task_struct *t = current;
243247
struct rseq_ids ids;
@@ -289,7 +293,7 @@ static void rseq_slowpath_update_usr(struct pt_regs *regs)
289293
}
290294
}
291295

292-
void __rseq_handle_notify_resume(struct pt_regs *regs)
296+
void __rseq_handle_slowpath(struct pt_regs *regs)
293297
{
294298
/*
295299
* If invoked from hypervisors before entering the guest via

0 commit comments

Comments
 (0)