xref: /linux/arch/x86/entry/common.c (revision 0678df8271820bcf8fb4f877129f05d68a237de4)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * common.c - C code for kernel entry and exit
 * Copyright (c) 2015 Andrew Lutomirski
 *
 * Based on asm and ptrace code by many authors.  The code here originated
 * in ptrace.c and signal.c.
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/entry-common.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/export.h>
#include <linux/nospec.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <linux/init.h>

#ifdef CONFIG_XEN_PV
#include <xen/xen-ops.h>
#include <xen/events.h>
#endif

#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/vdso.h>
#include <asm/cpufeature.h>
#include <asm/fpu/api.h>
#include <asm/nospec-branch.h>
#include <asm/io_bitmap.h>
#include <asm/syscall.h>
#include <asm/irq_stack.h>

#ifdef CONFIG_X86_64

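/*
 * Try to dispatch @nr through the native 64-bit syscall table.  Returns
 * true if the number was in range and the syscall was invoked, false so
 * the caller can try other tables or fall back to -ENOSYS.
 */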
static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
{
	/*
	 * Convert negative numbers to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int unr = nr;

	if (likely(unr < NR_syscalls)) {
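		/*
		 * array_index_nospec() clamps the index even under
		 * speculative execution, closing the Spectre v1 window
		 * between the bounds check above and the table load below.
		 */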
		unr = array_index_nospec(unr, NR_syscalls);
		regs->ax = sys_call_table[unr](regs);
		return true;
	}
	return false;
}

static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
{
	/*
	 * Adjust the starting offset of the table, and convert numbers
	 * < __X32_SYSCALL_BIT to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int xnr = nr - __X32_SYSCALL_BIT;

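	/*
	 * x32 userspace requests the x32 ABI by setting __X32_SYSCALL_BIT
	 * (bit 30) in the syscall number, so only numbers at or above that
	 * bit can land in the x32 table.
	 */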
	if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
		xnr = array_index_nospec(xnr, X32_NR_syscalls);
		regs->ax = x32_sys_call_table[xnr](regs);
		return true;
	}
	return false;
}

/* Returns true to return using SYSRET, or false to use IRET */
__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
{
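	/*
	 * With CONFIG_RANDOMIZE_KSTACK_OFFSET enabled this applies a small
	 * per-syscall random offset to the kernel stack, making attacks
	 * that rely on a predictable stack layout harder to aim.
	 */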
	add_random_kstack_offset();
	nr = syscall_enter_from_user_mode(regs, nr);

	instrumentation_begin();

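	/*
	 * nr == -1 means the syscall was rejected or skipped (e.g. by
	 * seccomp or a ptrace tracer); regs->ax already holds the intended
	 * return value, so it must not be overwritten with -ENOSYS.
	 */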
	if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
		/* Invalid system call, but still a system call. */
		regs->ax = __x64_sys_ni_syscall(regs);
	}

	instrumentation_end();
	syscall_exit_to_user_mode(regs);

	/*
	 * Check that the register state is valid for using SYSRET to exit
	 * to userspace.  Otherwise use the slower but fully capable IRET
	 * exit path.
	 */

	/* XEN PV guests always use the IRET path */
	if (cpu_feature_enabled(X86_FEATURE_XENPV))
		return false;

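	/*
	 * SYSRET reloads RIP from RCX and RFLAGS from R11 rather than from
	 * the saved frame, which is why the checks below insist that the
	 * saved values already match.
	 */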
	/* SYSRET requires RCX == RIP and R11 == EFLAGS */
	if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
		return false;

	/* CS and SS must match the values set in MSR_STAR */
	if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
		return false;

	/*
	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
	 * in kernel space.  This essentially lets the user take over
	 * the kernel, since userspace controls RSP.
	 *
	 * TASK_SIZE_MAX covers all user-accessible addresses other than
	 * the deprecated vsyscall page.
	 */
	if (unlikely(regs->ip >= TASK_SIZE_MAX))
		return false;

	/*
	 * SYSRET cannot restore RF.  It can restore TF, but unlike IRET,
	 * restoring TF results in a trap from userspace immediately after
	 * SYSRET.
	 */
	if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
		return false;

	/* Use SYSRET to exit to userspace */
	return true;
}
#endif

#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static __always_inline int syscall_32_enter(struct pt_regs *regs)
{
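	/*
	 * TS_COMPAT marks this task as being inside a 32-bit compat
	 * syscall, so syscall number/argument accessors and signal
	 * delivery use the 32-bit ABI for the duration of the call.
	 */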
	if (IS_ENABLED(CONFIG_IA32_EMULATION))
		current_thread_info()->status |= TS_COMPAT;

	return (int)regs->orig_ax;
}

#ifdef CONFIG_IA32_EMULATION
bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED);

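/*
 * "ia32_emulation=" on the kernel command line overrides the Kconfig
 * default above; the value is parsed with kstrtobool(), so the usual
 * on/off/0/1/y/n forms are accepted.
 */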
static int ia32_emulation_override_cmdline(char *arg)
{
	return kstrtobool(arg, &__ia32_enabled);
}
early_param("ia32_emulation", ia32_emulation_override_cmdline);
#endif

/*
 * Invoke a 32-bit syscall.  Called with IRQs on in CONTEXT_KERNEL.
 */
static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
{
	/*
	 * Convert negative numbers to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int unr = nr;

	if (likely(unr < IA32_NR_syscalls)) {
		unr = array_index_nospec(unr, IA32_NR_syscalls);
		regs->ax = ia32_sys_call_table[unr](regs);
	} else if (nr != -1) {
		regs->ax = __ia32_sys_ni_syscall(regs);
	}
}

/* Handles int $0x80 */
__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
{
	int nr = syscall_32_enter(regs);

	add_random_kstack_offset();
	/*
	 * Subtlety here: if ptrace pokes something larger than 2^31-1 into
	 * orig_ax, the int return value truncates it. This matches
	 * the semantics of syscall_get_nr().
	 */
	nr = syscall_enter_from_user_mode(regs, nr);
	instrumentation_begin();

	do_syscall_32_irqs_on(regs, nr);

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
}

static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
{
	int nr = syscall_32_enter(regs);
	int res;

	add_random_kstack_offset();
	/*
	 * This cannot use syscall_enter_from_user_mode() as it has to
	 * fetch EBP before invoking any of the syscall entry work
	 * functions.
	 */
	syscall_enter_from_user_mode_prepare(regs);

	instrumentation_begin();
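	/*
	 * In the int $0x80 ABI the sixth syscall argument is passed in EBP,
	 * but the vDSO's __kernel_vsyscall pushed the user's EBP onto the
	 * user stack before repurposing the register for the fast-syscall
	 * sequence.  Read it back into regs->bp so the syscall sees the
	 * usual register layout.
	 */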
	/* Fetch EBP from where the vDSO stashed it. */
	if (IS_ENABLED(CONFIG_X86_64)) {
		/*
		 * Micro-optimization: the pointer we're following is
		 * explicitly 32 bits, so it can't be out of range.
		 */
		res = __get_user(*(u32 *)&regs->bp,
			 (u32 __user __force *)(unsigned long)(u32)regs->sp);
	} else {
		res = get_user(*(u32 *)&regs->bp,
		       (u32 __user __force *)(unsigned long)(u32)regs->sp);
	}

	if (res) {
		/* User code screwed up. */
		regs->ax = -EFAULT;

		local_irq_disable();
		instrumentation_end();
		irqentry_exit_to_user_mode(regs);
		return false;
	}

	nr = syscall_enter_from_user_mode_work(regs, nr);

	/* Now this is just like a normal syscall. */
	do_syscall_32_irqs_on(regs, nr);

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
	return true;
}

/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
__visible noinstr bool do_fast_syscall_32(struct pt_regs *regs)
{
	/*
	 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
	 * convention.  Adjust regs so it looks like we entered using int80.
	 */
	unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
					vdso_image_32.sym_int80_landing_pad;

	/*
	 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
	 * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
	 * Fix it up.
	 */
	regs->ip = landing_pad;
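	/*
	 * Syscall restart (e.g. after a signal interrupts a restartable
	 * syscall) works by subtracting 2 from the saved IP; int $0x80 is
	 * exactly two bytes, so pointing IP at the vDSO's int80 landing pad
	 * makes such a restart re-execute a valid syscall instruction.
	 */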

	/* Invoke the syscall. If it failed, keep it simple: use IRET. */
	if (!__do_fast_syscall_32(regs))
		return false;

	/*
	 * Check that the register state is valid for using SYSRETL/SYSEXIT
	 * to exit to userspace.  Otherwise use the slower but fully capable
	 * IRET exit path.
	 */

	/* XEN PV guests always use the IRET path */
	if (cpu_feature_enabled(X86_FEATURE_XENPV))
		return false;

	/* EIP must point to the VDSO landing pad */
	if (unlikely(regs->ip != landing_pad))
		return false;

	/* CS and SS must match the values set in MSR_STAR */
	if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS))
		return false;

	/* If the TF, RF, or VM flags are set, use IRET */
	if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)))
		return false;

	/* Use SYSRETL/SYSEXIT to exit to userspace */
	return true;
}

/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
__visible noinstr bool do_SYSENTER_32(struct pt_regs *regs)
{
	/* SYSENTER loses RSP, but the vDSO saved it in RBP. */
	regs->sp = regs->bp;

	/* SYSENTER clobbers EFLAGS.IF.  Assume it was set in usermode. */
	regs->flags |= X86_EFLAGS_IF;

	return do_fast_syscall_32(regs);
}
#endif

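/*
 * Stub used both for the explicit out-of-range calls above and for the
 * unimplemented slots in the syscall tables; it simply fails with -ENOSYS.
 */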
SYSCALL_DEFINE0(ni_syscall)
{
	return -ENOSYS;
}

#ifdef CONFIG_XEN_PV
#ifndef CONFIG_PREEMPTION
/*
 * Some hypercalls issued by the toolstack can take many 10s of
 * seconds. Allow tasks running hypercalls via the privcmd driver to
 * be voluntarily preempted even if full kernel preemption is
 * disabled.
 *
 * Such preemptible hypercalls are bracketed by
 * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
 * calls.
 */
DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);

/*
 * In case of scheduling, the flag must be cleared and restored after
 * returning from schedule, as the task might move to a different CPU.
 */
static __always_inline bool get_and_clear_inhcall(void)
{
	bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);

	__this_cpu_write(xen_in_preemptible_hcall, false);
	return inhcall;
}

static __always_inline void restore_inhcall(bool inhcall)
{
	__this_cpu_write(xen_in_preemptible_hcall, inhcall);
}
#else
static __always_inline bool get_and_clear_inhcall(void) { return false; }
static __always_inline void restore_inhcall(bool inhcall) { }
#endif

static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
{
	struct pt_regs *old_regs = set_irq_regs(regs);

	inc_irq_stat(irq_hv_callback_count);

	xen_evtchn_do_upcall();

	set_irq_regs(old_regs);
}

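/*
 * Xen PV event channel upcall entry: run the handler on the IRQ stack if
 * needed, then, if the upcall interrupted a preemptible privcmd hypercall,
 * offer a reschedule point before returning.
 */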
__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
{
	irqentry_state_t state = irqentry_enter(regs);
	bool inhcall;

	instrumentation_begin();
	run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);

	inhcall = get_and_clear_inhcall();
	if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
		irqentry_exit_cond_resched();
		instrumentation_end();
		restore_inhcall(inhcall);
	} else {
		instrumentation_end();
		irqentry_exit(regs, state);
	}
}
#endif /* CONFIG_XEN_PV */