xref: /linux/arch/x86/entry/syscall_64.c (revision 17e548405a81665fd14cee960db7d093d1396400)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* 64-bit system call dispatch */
3 
4 #include <linux/linkage.h>
5 #include <linux/sys.h>
6 #include <linux/cache.h>
7 #include <linux/syscalls.h>
8 #include <linux/entry-common.h>
9 #include <linux/nospec.h>
10 #include <asm/syscall.h>
11 
/*
 * First expansion pass: every __SYSCALL()/__SYSCALL_NORETURN() entry in the
 * included syscall lists becomes an extern prototype for __x64_<sym>, taking
 * a const struct pt_regs * and returning long, so the handlers can be
 * referenced further down in this file.
 *
 * NOTE(review): the syscalls_*.h headers are not visible here; presumably
 * they consist solely of __SYSCALL(nr, sym) / __SYSCALL_NORETURN(nr, sym)
 * entries -- confirm against the generated headers.
 */
#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *);
#include <asm/syscalls_64.h>
#ifdef CONFIG_X86_X32_ABI
#include <asm/syscalls_x32.h>
#endif
#undef  __SYSCALL

/*
 * The __noreturn annotation only matters for the prototypes above.  From
 * here on a __SYSCALL_NORETURN entry expands exactly like a __SYSCALL entry,
 * using whatever __SYSCALL is defined as at the point of inclusion.
 */
#undef  __SYSCALL_NORETURN
#define __SYSCALL_NORETURN __SYSCALL
22 
/*
 * The sys_call_table[] is no longer used for system calls, but
 * kernel/trace/trace_syscalls.c still wants to know the system
 * call address.
 *
 * For this pass each list entry expands to the handler symbol followed by
 * a comma, i.e. one array initializer per entry.  The nr argument is not
 * used by the expansion, so entries are placed positionally in list order.
 * Only the 64-bit list is included; x32 entries are not part of this table.
 */
#define __SYSCALL(nr, sym) __x64_##sym,
const sys_call_ptr_t sys_call_table[] = {
#include <asm/syscalls_64.h>
};
#undef  __SYSCALL
33 
/*
 * Dispatch expansion: each list entry becomes
 *	case nr: return __x64_<sym>(regs);
 * This definition is not #undef'd after x64_sys_call(); the x32 dispatcher
 * that follows expands its own list with the same macro.
 */
#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
/*
 * x64_sys_call - invoke the 64-bit system call handler selected by @nr.
 * @regs: register state, passed through unchanged to the handler
 * @nr:   system call number
 *
 * Returns the selected handler's return value.  A number that has no case
 * in the included list is routed to __x64_sys_ni_syscall().
 */
long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
{
	switch (nr) {
	/* One "case nr: return ...;" per entry of the 64-bit syscall list. */
	#include <asm/syscalls_64.h>
	default: return __x64_sys_ni_syscall(regs);
	}
}
42 
#ifdef CONFIG_X86_X32_ABI
/*
 * x32_sys_call - invoke the x32 system call handler selected by @nr.
 * @regs: register state, passed through unchanged to the handler
 * @nr:   system call number as used by the case labels of the x32 list
 *
 * Only built when CONFIG_X86_X32_ABI is set.  Relies on __SYSCALL still
 * being defined as the "case nr: return __x64_<sym>(regs);" expansion set
 * up ahead of x64_sys_call().
 *
 * Returns the selected handler's return value.  A number that has no case
 * in the included list is routed to __x64_sys_ni_syscall().
 */
long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
{
	switch (nr) {
	/* One "case nr: return ...;" per entry of the x32 syscall list. */
	#include <asm/syscalls_x32.h>
	default: return __x64_sys_ni_syscall(regs);
	}
}
#endif
52 
53 static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
54 {
55 	/*
56 	 * Convert negative numbers to very high and thus out of range
57 	 * numbers for comparisons.
58 	 */
59 	unsigned int unr = nr;
60 
61 	if (likely(unr < NR_syscalls)) {
62 		unr = array_index_nospec(unr, NR_syscalls);
63 		regs->ax = x64_sys_call(regs, unr);
64 		return true;
65 	}
66 	return false;
67 }
68 
69 static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
70 {
71 	/*
72 	 * Adjust the starting offset of the table, and convert numbers
73 	 * < __X32_SYSCALL_BIT to very high and thus out of range
74 	 * numbers for comparisons.
75 	 */
76 	unsigned int xnr = nr - __X32_SYSCALL_BIT;
77 
78 	if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
79 		xnr = array_index_nospec(xnr, X32_NR_syscalls);
80 		regs->ax = x32_sys_call(regs, xnr);
81 		return true;
82 	}
83 	return false;
84 }
85 
86 /* Returns true to return using SYSRET, or false to use IRET */
87 __visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
88 {
89 	add_random_kstack_offset();
90 	nr = syscall_enter_from_user_mode(regs, nr);
91 
92 	instrumentation_begin();
93 
94 	if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
95 		/* Invalid system call, but still a system call. */
96 		regs->ax = __x64_sys_ni_syscall(regs);
97 	}
98 
99 	instrumentation_end();
100 	syscall_exit_to_user_mode(regs);
101 
102 	/*
103 	 * Check that the register state is valid for using SYSRET to exit
104 	 * to userspace.  Otherwise use the slower but fully capable IRET
105 	 * exit path.
106 	 */
107 
108 	/* XEN PV guests always use the IRET path */
109 	if (cpu_feature_enabled(X86_FEATURE_XENPV))
110 		return false;
111 
112 	/* SYSRET requires RCX == RIP and R11 == EFLAGS */
113 	if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
114 		return false;
115 
116 	/* CS and SS must match the values set in MSR_STAR */
117 	if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
118 		return false;
119 
120 	/*
121 	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
122 	 * in kernel space.  This essentially lets the user take over
123 	 * the kernel, since userspace controls RSP.
124 	 *
125 	 * TASK_SIZE_MAX covers all user-accessible addresses other than
126 	 * the deprecated vsyscall page.
127 	 */
128 	if (unlikely(regs->ip >= TASK_SIZE_MAX))
129 		return false;
130 
131 	/*
132 	 * SYSRET cannot restore RF.  It can restore TF, but unlike IRET,
133 	 * restoring TF results in a trap from userspace immediately after
134 	 * SYSRET.
135 	 */
136 	if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
137 		return false;
138 
139 	/* Use SYSRET to exit to userspace */
140 	return true;
141 }
142