xref: /linux/arch/x86/kernel/shstk.c (revision 001821b0e79716c4e17c71d8e053a23599a7a508)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * shstk.c - Intel shadow stack support
4  *
5  * Copyright (c) 2021, Intel Corporation.
6  * Yu-cheng Yu <yu-cheng.yu@intel.com>
7  */
8 
9 #include <linux/sched.h>
10 #include <linux/bitops.h>
11 #include <linux/types.h>
12 #include <linux/mm.h>
13 #include <linux/mman.h>
14 #include <linux/slab.h>
15 #include <linux/uaccess.h>
16 #include <linux/sched/signal.h>
17 #include <linux/compat.h>
18 #include <linux/sizes.h>
19 #include <linux/user.h>
20 #include <linux/syscalls.h>
21 #include <asm/msr.h>
22 #include <asm/fpu/xstate.h>
23 #include <asm/fpu/types.h>
24 #include <asm/shstk.h>
25 #include <asm/special_insns.h>
26 #include <asm/fpu/api.h>
27 #include <asm/prctl.h>
28 
/*
 * Step, in bytes, by which this file moves the shadow stack pointer when it
 * pushes one 64-bit entry (restore token, sigframe marker, restorer address).
 */
#define SS_FRAME_SIZE 8
30 
31 static bool features_enabled(unsigned long features)
32 {
33 	return current->thread.features & features;
34 }
35 
36 static void features_set(unsigned long features)
37 {
38 	current->thread.features |= features;
39 }
40 
41 static void features_clr(unsigned long features)
42 {
43 	current->thread.features &= ~features;
44 }
45 
46 /*
47  * Create a restore token on the shadow stack.  A token is always 8-byte
48  * and aligned to 8.
49  */
50 static int create_rstor_token(unsigned long ssp, unsigned long *token_addr)
51 {
52 	unsigned long addr;
53 
54 	/* Token must be aligned */
55 	if (!IS_ALIGNED(ssp, 8))
56 		return -EINVAL;
57 
58 	addr = ssp - SS_FRAME_SIZE;
59 
60 	/*
61 	 * SSP is aligned, so reserved bits and mode bit are a zero, just mark
62 	 * the token 64-bit.
63 	 */
64 	ssp |= BIT(0);
65 
66 	if (write_user_shstk_64((u64 __user *)addr, (u64)ssp))
67 		return -EFAULT;
68 
69 	if (token_addr)
70 		*token_addr = addr;
71 
72 	return 0;
73 }
74 
75 /*
76  * VM_SHADOW_STACK will have a guard page. This helps userspace protect
77  * itself from attacks. The reasoning is as follows:
78  *
79  * The shadow stack pointer(SSP) is moved by CALL, RET, and INCSSPQ. The
80  * INCSSP instruction can increment the shadow stack pointer. It is the
81  * shadow stack analog of an instruction like:
82  *
83  *   addq $0x80, %rsp
84  *
85  * However, there is one important difference between an ADD on %rsp
86  * and INCSSP. In addition to modifying SSP, INCSSP also reads from the
87  * memory of the first and last elements that were "popped". It can be
88  * thought of as acting like this:
89  *
90  * READ_ONCE(ssp);       // read+discard top element on stack
91  * ssp += nr_to_pop * 8; // move the shadow stack
92  * READ_ONCE(ssp-8);     // read+discard last popped stack element
93  *
94  * The maximum distance INCSSP can move the SSP is 2040 bytes, before
95  * it would read the memory. Therefore a single page gap will be enough
96  * to prevent any operation from shifting the SSP to an adjacent stack,
97  * since it would have to land in the gap at least once, causing a
98  * fault.
99  */
100 static unsigned long alloc_shstk(unsigned long addr, unsigned long size,
101 				 unsigned long token_offset, bool set_res_tok)
102 {
103 	int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G;
104 	struct mm_struct *mm = current->mm;
105 	unsigned long mapped_addr, unused;
106 
107 	if (addr)
108 		flags |= MAP_FIXED_NOREPLACE;
109 
110 	mmap_write_lock(mm);
111 	mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags,
112 			      VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL);
113 	mmap_write_unlock(mm);
114 
115 	if (!set_res_tok || IS_ERR_VALUE(mapped_addr))
116 		goto out;
117 
118 	if (create_rstor_token(mapped_addr + token_offset, NULL)) {
119 		vm_munmap(mapped_addr, size);
120 		return -EINVAL;
121 	}
122 
123 out:
124 	return mapped_addr;
125 }
126 
127 static unsigned long adjust_shstk_size(unsigned long size)
128 {
129 	if (size)
130 		return PAGE_ALIGN(size);
131 
132 	return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G));
133 }
134 
135 static void unmap_shadow_stack(u64 base, u64 size)
136 {
137 	int r;
138 
139 	r = vm_munmap(base, size);
140 
141 	/*
142 	 * mmap_write_lock_killable() failed with -EINTR. This means
143 	 * the process is about to die and have it's MM cleaned up.
144 	 * This task shouldn't ever make it back to userspace. In this
145 	 * case it is ok to leak a shadow stack, so just exit out.
146 	 */
147 	if (r == -EINTR)
148 		return;
149 
150 	/*
151 	 * For all other types of vm_munmap() failure, either the
152 	 * system is out of memory or there is bug.
153 	 */
154 	WARN_ON_ONCE(r);
155 }
156 
157 static int shstk_setup(void)
158 {
159 	struct thread_shstk *shstk = &current->thread.shstk;
160 	unsigned long addr, size;
161 
162 	/* Already enabled */
163 	if (features_enabled(ARCH_SHSTK_SHSTK))
164 		return 0;
165 
166 	/* Also not supported for 32 bit */
167 	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_ia32_syscall())
168 		return -EOPNOTSUPP;
169 
170 	size = adjust_shstk_size(0);
171 	addr = alloc_shstk(0, size, 0, false);
172 	if (IS_ERR_VALUE(addr))
173 		return PTR_ERR((void *)addr);
174 
175 	fpregs_lock_and_load();
176 	wrmsrl(MSR_IA32_PL3_SSP, addr + size);
177 	wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN);
178 	fpregs_unlock();
179 
180 	shstk->base = addr;
181 	shstk->size = size;
182 	features_set(ARCH_SHSTK_SHSTK);
183 
184 	return 0;
185 }
186 
187 void reset_thread_features(void)
188 {
189 	memset(&current->thread.shstk, 0, sizeof(struct thread_shstk));
190 	current->thread.features = 0;
191 	current->thread.features_locked = 0;
192 }
193 
194 unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags,
195 				       unsigned long stack_size)
196 {
197 	struct thread_shstk *shstk = &tsk->thread.shstk;
198 	unsigned long addr, size;
199 
200 	/*
201 	 * If shadow stack is not enabled on the new thread, skip any
202 	 * switch to a new shadow stack.
203 	 */
204 	if (!features_enabled(ARCH_SHSTK_SHSTK))
205 		return 0;
206 
207 	/*
208 	 * For CLONE_VFORK the child will share the parents shadow stack.
209 	 * Make sure to clear the internal tracking of the thread shadow
210 	 * stack so the freeing logic run for child knows to leave it alone.
211 	 */
212 	if (clone_flags & CLONE_VFORK) {
213 		shstk->base = 0;
214 		shstk->size = 0;
215 		return 0;
216 	}
217 
218 	/*
219 	 * For !CLONE_VM the child will use a copy of the parents shadow
220 	 * stack.
221 	 */
222 	if (!(clone_flags & CLONE_VM))
223 		return 0;
224 
225 	size = adjust_shstk_size(stack_size);
226 	addr = alloc_shstk(0, size, 0, false);
227 	if (IS_ERR_VALUE(addr))
228 		return addr;
229 
230 	shstk->base = addr;
231 	shstk->size = size;
232 
233 	return addr + size;
234 }
235 
236 static unsigned long get_user_shstk_addr(void)
237 {
238 	unsigned long long ssp;
239 
240 	fpregs_lock_and_load();
241 
242 	rdmsrl(MSR_IA32_PL3_SSP, ssp);
243 
244 	fpregs_unlock();
245 
246 	return ssp;
247 }
248 
249 #define SHSTK_DATA_BIT BIT(63)
250 
251 static int put_shstk_data(u64 __user *addr, u64 data)
252 {
253 	if (WARN_ON_ONCE(data & SHSTK_DATA_BIT))
254 		return -EINVAL;
255 
256 	/*
257 	 * Mark the high bit so that the sigframe can't be processed as a
258 	 * return address.
259 	 */
260 	if (write_user_shstk_64(addr, data | SHSTK_DATA_BIT))
261 		return -EFAULT;
262 	return 0;
263 }
264 
265 static int get_shstk_data(unsigned long *data, unsigned long __user *addr)
266 {
267 	unsigned long ldata;
268 
269 	if (unlikely(get_user(ldata, addr)))
270 		return -EFAULT;
271 
272 	if (!(ldata & SHSTK_DATA_BIT))
273 		return -EINVAL;
274 
275 	*data = ldata & ~SHSTK_DATA_BIT;
276 
277 	return 0;
278 }
279 
280 static int shstk_push_sigframe(unsigned long *ssp)
281 {
282 	unsigned long target_ssp = *ssp;
283 
284 	/* Token must be aligned */
285 	if (!IS_ALIGNED(target_ssp, 8))
286 		return -EINVAL;
287 
288 	*ssp -= SS_FRAME_SIZE;
289 	if (put_shstk_data((void __user *)*ssp, target_ssp))
290 		return -EFAULT;
291 
292 	return 0;
293 }
294 
295 static int shstk_pop_sigframe(unsigned long *ssp)
296 {
297 	struct vm_area_struct *vma;
298 	unsigned long token_addr;
299 	bool need_to_check_vma;
300 	int err = 1;
301 
302 	/*
303 	 * It is possible for the SSP to be off the end of a shadow stack by 4
304 	 * or 8 bytes. If the shadow stack is at the start of a page or 4 bytes
305 	 * before it, it might be this case, so check that the address being
306 	 * read is actually shadow stack.
307 	 */
308 	if (!IS_ALIGNED(*ssp, 8))
309 		return -EINVAL;
310 
311 	need_to_check_vma = PAGE_ALIGN(*ssp) == *ssp;
312 
313 	if (need_to_check_vma)
314 		mmap_read_lock_killable(current->mm);
315 
316 	err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp);
317 	if (unlikely(err))
318 		goto out_err;
319 
320 	if (need_to_check_vma) {
321 		vma = find_vma(current->mm, *ssp);
322 		if (!vma || !(vma->vm_flags & VM_SHADOW_STACK)) {
323 			err = -EFAULT;
324 			goto out_err;
325 		}
326 
327 		mmap_read_unlock(current->mm);
328 	}
329 
330 	/* Restore SSP aligned? */
331 	if (unlikely(!IS_ALIGNED(token_addr, 8)))
332 		return -EINVAL;
333 
334 	/* SSP in userspace? */
335 	if (unlikely(token_addr >= TASK_SIZE_MAX))
336 		return -EINVAL;
337 
338 	*ssp = token_addr;
339 
340 	return 0;
341 out_err:
342 	if (need_to_check_vma)
343 		mmap_read_unlock(current->mm);
344 	return err;
345 }
346 
347 int setup_signal_shadow_stack(struct ksignal *ksig)
348 {
349 	void __user *restorer = ksig->ka.sa.sa_restorer;
350 	unsigned long ssp;
351 	int err;
352 
353 	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
354 	    !features_enabled(ARCH_SHSTK_SHSTK))
355 		return 0;
356 
357 	if (!restorer)
358 		return -EINVAL;
359 
360 	ssp = get_user_shstk_addr();
361 	if (unlikely(!ssp))
362 		return -EINVAL;
363 
364 	err = shstk_push_sigframe(&ssp);
365 	if (unlikely(err))
366 		return err;
367 
368 	/* Push restorer address */
369 	ssp -= SS_FRAME_SIZE;
370 	err = write_user_shstk_64((u64 __user *)ssp, (u64)restorer);
371 	if (unlikely(err))
372 		return -EFAULT;
373 
374 	fpregs_lock_and_load();
375 	wrmsrl(MSR_IA32_PL3_SSP, ssp);
376 	fpregs_unlock();
377 
378 	return 0;
379 }
380 
381 int restore_signal_shadow_stack(void)
382 {
383 	unsigned long ssp;
384 	int err;
385 
386 	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
387 	    !features_enabled(ARCH_SHSTK_SHSTK))
388 		return 0;
389 
390 	ssp = get_user_shstk_addr();
391 	if (unlikely(!ssp))
392 		return -EINVAL;
393 
394 	err = shstk_pop_sigframe(&ssp);
395 	if (unlikely(err))
396 		return err;
397 
398 	fpregs_lock_and_load();
399 	wrmsrl(MSR_IA32_PL3_SSP, ssp);
400 	fpregs_unlock();
401 
402 	return 0;
403 }
404 
405 void shstk_free(struct task_struct *tsk)
406 {
407 	struct thread_shstk *shstk = &tsk->thread.shstk;
408 
409 	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
410 	    !features_enabled(ARCH_SHSTK_SHSTK))
411 		return;
412 
413 	/*
414 	 * When fork() with CLONE_VM fails, the child (tsk) already has a
415 	 * shadow stack allocated, and exit_thread() calls this function to
416 	 * free it.  In this case the parent (current) and the child share
417 	 * the same mm struct.
418 	 */
419 	if (!tsk->mm || tsk->mm != current->mm)
420 		return;
421 
422 	/*
423 	 * If shstk->base is NULL, then this task is not managing its
424 	 * own shadow stack (CLONE_VFORK). So skip freeing it.
425 	 */
426 	if (!shstk->base)
427 		return;
428 
429 	/*
430 	 * shstk->base is NULL for CLONE_VFORK child tasks, and so is
431 	 * normal. But size = 0 on a shstk->base is not normal and
432 	 * indicated an attempt to free the thread shadow stack twice.
433 	 * Warn about it.
434 	 */
435 	if (WARN_ON(!shstk->size))
436 		return;
437 
438 	unmap_shadow_stack(shstk->base, shstk->size);
439 
440 	shstk->size = 0;
441 }
442 
443 static int wrss_control(bool enable)
444 {
445 	u64 msrval;
446 
447 	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
448 		return -EOPNOTSUPP;
449 
450 	/*
451 	 * Only enable WRSS if shadow stack is enabled. If shadow stack is not
452 	 * enabled, WRSS will already be disabled, so don't bother clearing it
453 	 * when disabling.
454 	 */
455 	if (!features_enabled(ARCH_SHSTK_SHSTK))
456 		return -EPERM;
457 
458 	/* Already enabled/disabled? */
459 	if (features_enabled(ARCH_SHSTK_WRSS) == enable)
460 		return 0;
461 
462 	fpregs_lock_and_load();
463 	rdmsrl(MSR_IA32_U_CET, msrval);
464 
465 	if (enable) {
466 		features_set(ARCH_SHSTK_WRSS);
467 		msrval |= CET_WRSS_EN;
468 	} else {
469 		features_clr(ARCH_SHSTK_WRSS);
470 		if (!(msrval & CET_WRSS_EN))
471 			goto unlock;
472 
473 		msrval &= ~CET_WRSS_EN;
474 	}
475 
476 	wrmsrl(MSR_IA32_U_CET, msrval);
477 
478 unlock:
479 	fpregs_unlock();
480 
481 	return 0;
482 }
483 
484 static int shstk_disable(void)
485 {
486 	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
487 		return -EOPNOTSUPP;
488 
489 	/* Already disabled? */
490 	if (!features_enabled(ARCH_SHSTK_SHSTK))
491 		return 0;
492 
493 	fpregs_lock_and_load();
494 	/* Disable WRSS too when disabling shadow stack */
495 	wrmsrl(MSR_IA32_U_CET, 0);
496 	wrmsrl(MSR_IA32_PL3_SSP, 0);
497 	fpregs_unlock();
498 
499 	shstk_free(current);
500 	features_clr(ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS);
501 
502 	return 0;
503 }
504 
505 SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
506 {
507 	bool set_tok = flags & SHADOW_STACK_SET_TOKEN;
508 	unsigned long aligned_size;
509 
510 	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
511 		return -EOPNOTSUPP;
512 
513 	if (flags & ~SHADOW_STACK_SET_TOKEN)
514 		return -EINVAL;
515 
516 	/* If there isn't space for a token */
517 	if (set_tok && size < 8)
518 		return -ENOSPC;
519 
520 	if (addr && addr < SZ_4G)
521 		return -ERANGE;
522 
523 	/*
524 	 * An overflow would result in attempting to write the restore token
525 	 * to the wrong location. Not catastrophic, but just return the right
526 	 * error code and block it.
527 	 */
528 	aligned_size = PAGE_ALIGN(size);
529 	if (aligned_size < size)
530 		return -EOVERFLOW;
531 
532 	return alloc_shstk(addr, aligned_size, size, set_tok);
533 }
534 
535 long shstk_prctl(struct task_struct *task, int option, unsigned long arg2)
536 {
537 	unsigned long features = arg2;
538 
539 	if (option == ARCH_SHSTK_STATUS) {
540 		return put_user(task->thread.features, (unsigned long __user *)arg2);
541 	}
542 
543 	if (option == ARCH_SHSTK_LOCK) {
544 		task->thread.features_locked |= features;
545 		return 0;
546 	}
547 
548 	/* Only allow via ptrace */
549 	if (task != current) {
550 		if (option == ARCH_SHSTK_UNLOCK && IS_ENABLED(CONFIG_CHECKPOINT_RESTORE)) {
551 			task->thread.features_locked &= ~features;
552 			return 0;
553 		}
554 		return -EINVAL;
555 	}
556 
557 	/* Do not allow to change locked features */
558 	if (features & task->thread.features_locked)
559 		return -EPERM;
560 
561 	/* Only support enabling/disabling one feature at a time. */
562 	if (hweight_long(features) > 1)
563 		return -EINVAL;
564 
565 	if (option == ARCH_SHSTK_DISABLE) {
566 		if (features & ARCH_SHSTK_WRSS)
567 			return wrss_control(false);
568 		if (features & ARCH_SHSTK_SHSTK)
569 			return shstk_disable();
570 		return -EINVAL;
571 	}
572 
573 	/* Handle ARCH_SHSTK_ENABLE */
574 	if (features & ARCH_SHSTK_SHSTK)
575 		return shstk_setup();
576 	if (features & ARCH_SHSTK_WRSS)
577 		return wrss_control(true);
578 	return -EINVAL;
579 }
580