1919#include <linux/nospec.h>
2020#include <linux/syscalls.h>
2121#include <linux/uaccess.h>
22+ #include <linux/init.h>
2223
2324#ifdef CONFIG_XEN_PV
2425#include <xen/xen-ops.h>
@@ -70,7 +71,8 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
7071 return false;
7172}
7273
73- __visible noinstr void do_syscall_64 (struct pt_regs * regs , int nr )
74+ /* Returns true to return using SYSRET, or false to use IRET */
75+ __visible noinstr bool do_syscall_64 (struct pt_regs * regs , int nr )
7476{
7577 add_random_kstack_offset ();
7678 nr = syscall_enter_from_user_mode (regs , nr );
@@ -84,6 +86,46 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
8486
8587 instrumentation_end ();
8688 syscall_exit_to_user_mode (regs );
89+
90+ /*
91+ * Check that the register state is valid for using SYSRET to exit
92+ * to userspace. Otherwise use the slower but fully capable IRET
93+ * exit path.
94+ */
95+
96+ /* XEN PV guests always use the IRET path */
97+ if (cpu_feature_enabled (X86_FEATURE_XENPV ))
98+ return false;
99+
100+ /* SYSRET requires RCX == RIP and R11 == EFLAGS */
101+ if (unlikely (regs -> cx != regs -> ip || regs -> r11 != regs -> flags ))
102+ return false;
103+
104+ /* CS and SS must match the values set in MSR_STAR */
105+ if (unlikely (regs -> cs != __USER_CS || regs -> ss != __USER_DS ))
106+ return false;
107+
108+ /*
109+ * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
110+ * in kernel space. This essentially lets the user take over
111+ * the kernel, since userspace controls RSP.
112+ *
113+ * TASK_SIZE_MAX covers all user-accessible addresses other than
114+ * the deprecated vsyscall page.
115+ */
116+ if (unlikely (regs -> ip >= TASK_SIZE_MAX ))
117+ return false;
118+
119+ /*
120+ * SYSRET cannot restore RF. It can restore TF, but unlike IRET,
121+ * restoring TF results in a trap from userspace immediately after
122+ * SYSRET.
123+ */
124+ if (unlikely (regs -> flags & (X86_EFLAGS_RF | X86_EFLAGS_TF )))
125+ return false;
126+
127+ /* Use SYSRET to exit to userspace */
128+ return true;
87129}
88130#endif
89131
@@ -96,6 +138,16 @@ static __always_inline int syscall_32_enter(struct pt_regs *regs)
96138 return (int )regs -> orig_ax ;
97139}
98140
#ifdef CONFIG_IA32_EMULATION
/*
 * Boot-time switch for IA32 emulation. The default is "enabled" unless
 * CONFIG_IA32_EMULATION_DEFAULT_DISABLED is set. The variable is
 * __ro_after_init, so it can only be changed while init code is running.
 */
bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED);

/*
 * Handler for the "ia32_emulation=" kernel command-line option.
 *
 * @arg: the option value as passed on the command line.
 *
 * Parses @arg as a boolean with kstrtobool() and stores the result in
 * __ia32_enabled. Returns whatever kstrtobool() returns.
 *
 * The handler is only reachable through early_param() during boot, so it
 * is marked __init to let its text be discarded together with the rest of
 * the init sections.
 */
static int __init ia32_emulation_override_cmdline(char *arg)
{
	return kstrtobool(arg, &__ia32_enabled);
}
early_param("ia32_emulation", ia32_emulation_override_cmdline);
#endif
150+
99151/*
100152 * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL.
101153 */
@@ -182,8 +234,8 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
182234 return true;
183235}
184236
185- /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
186- __visible noinstr long do_fast_syscall_32 (struct pt_regs * regs )
237+ /* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
238+ __visible noinstr bool do_fast_syscall_32 (struct pt_regs * regs )
187239{
188240 /*
189241 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
@@ -201,41 +253,36 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
201253
202254 /* Invoke the syscall. If it failed, keep it simple: use IRET. */
203255 if (!__do_fast_syscall_32 (regs ))
204- return 0 ;
256+ return false ;
205257
206- #ifdef CONFIG_X86_64
207258 /*
208- * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
209- * SYSRETL is available on all 64-bit CPUs, so we don't need to
210- * bother with SYSEXIT.
211- *
212- * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
213- * because the ECX fixup above will ensure that this is essentially
214- * never the case.
215- */
216- return regs -> cs == __USER32_CS && regs -> ss == __USER_DS &&
217- regs -> ip == landing_pad &&
218- (regs -> flags & (X86_EFLAGS_RF | X86_EFLAGS_TF )) == 0 ;
219- #else
220- /*
221- * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
222- *
223- * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
224- * because the ECX fixup above will ensure that this is essentially
225- * never the case.
226- *
227- * We don't allow syscalls at all from VM86 mode, but we still
228- * need to check VM, because we might be returning from sys_vm86.
259+ * Check that the register state is valid for using SYSRETL/SYSEXIT
260+ * to exit to userspace. Otherwise use the slower but fully capable
261+ * IRET exit path.
229262 */
230- return static_cpu_has (X86_FEATURE_SEP ) &&
231- regs -> cs == __USER_CS && regs -> ss == __USER_DS &&
232- regs -> ip == landing_pad &&
233- (regs -> flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM )) == 0 ;
234- #endif
263+
264+ /* XEN PV guests always use the IRET path */
265+ if (cpu_feature_enabled (X86_FEATURE_XENPV ))
266+ return false;
267+
268+ /* EIP must point to the VDSO landing pad */
269+ if (unlikely (regs -> ip != landing_pad ))
270+ return false;
271+
272+ /* CS and SS must match the values set in MSR_STAR */
273+ if (unlikely (regs -> cs != __USER32_CS || regs -> ss != __USER_DS ))
274+ return false;
275+
276+ /* If the TF, RF, or VM flags are set, use IRET */
277+ if (unlikely (regs -> flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM )))
278+ return false;
279+
280+ /* Use SYSRETL/SYSEXIT to exit to userspace */
281+ return true;
235282}
236283
237- /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
238- __visible noinstr long do_SYSENTER_32 (struct pt_regs * regs )
284+ /* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
285+ __visible noinstr bool do_SYSENTER_32 (struct pt_regs * regs )
239286{
240287 /* SYSENTER loses RSP, but the vDSO saved it in RBP. */
241288 regs -> sp = regs -> bp ;
0 commit comments