xref: /linux/arch/x86/kernel/shstk.c (revision 5ea5880764cbb164afb17a62e76ca75dc371409d)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * shstk.c - Intel shadow stack support
4  *
5  * Copyright (c) 2021, Intel Corporation.
6  * Yu-cheng Yu <yu-cheng.yu@intel.com>
7  */
8 
9 #include <linux/sched.h>
10 #include <linux/bitops.h>
11 #include <linux/types.h>
12 #include <linux/mm.h>
13 #include <linux/mman.h>
14 #include <linux/slab.h>
15 #include <linux/uaccess.h>
16 #include <linux/sched/signal.h>
17 #include <linux/compat.h>
18 #include <linux/sizes.h>
19 #include <linux/user.h>
20 #include <linux/syscalls.h>
21 #include <asm/msr.h>
22 #include <asm/fpu/xstate.h>
23 #include <asm/fpu/types.h>
24 #include <asm/shstk.h>
25 #include <asm/special_insns.h>
26 #include <asm/fpu/api.h>
27 #include <asm/prctl.h>
28 
/*
 * Size in bytes of one shadow stack entry as used in this file: the SSP is
 * moved by this amount for every 64-bit value pushed or popped.
 */
#define SS_FRAME_SIZE 8
30 
31 static bool features_enabled(unsigned long features)
32 {
33 	return current->thread.features & features;
34 }
35 
36 static void features_set(unsigned long features)
37 {
38 	current->thread.features |= features;
39 }
40 
41 static void features_clr(unsigned long features)
42 {
43 	current->thread.features &= ~features;
44 }
45 
46 /*
47  * Create a restore token on the shadow stack.  A token is always 8-byte
48  * and aligned to 8.
49  */
50 static int create_rstor_token(unsigned long ssp, unsigned long *token_addr)
51 {
52 	unsigned long addr;
53 
54 	/* Token must be aligned */
55 	if (!IS_ALIGNED(ssp, 8))
56 		return -EINVAL;
57 
58 	addr = ssp - SS_FRAME_SIZE;
59 
60 	/*
61 	 * SSP is aligned, so reserved bits and mode bit are a zero, just mark
62 	 * the token 64-bit.
63 	 */
64 	ssp |= BIT(0);
65 
66 	if (write_user_shstk_64((u64 __user *)addr, (u64)ssp))
67 		return -EFAULT;
68 
69 	if (token_addr)
70 		*token_addr = addr;
71 
72 	return 0;
73 }
74 
75 /*
76  * VM_SHADOW_STACK will have a guard page. This helps userspace protect
77  * itself from attacks. The reasoning is as follows:
78  *
79  * The shadow stack pointer(SSP) is moved by CALL, RET, and INCSSPQ. The
80  * INCSSP instruction can increment the shadow stack pointer. It is the
81  * shadow stack analog of an instruction like:
82  *
83  *   addq $0x80, %rsp
84  *
85  * However, there is one important difference between an ADD on %rsp
86  * and INCSSP. In addition to modifying SSP, INCSSP also reads from the
87  * memory of the first and last elements that were "popped". It can be
88  * thought of as acting like this:
89  *
90  * READ_ONCE(ssp);       // read+discard top element on stack
91  * ssp += nr_to_pop * 8; // move the shadow stack
92  * READ_ONCE(ssp-8);     // read+discard last popped stack element
93  *
94  * The maximum distance INCSSP can move the SSP is 2040 bytes, before
95  * it would read the memory. Therefore a single page gap will be enough
96  * to prevent any operation from shifting the SSP to an adjacent stack,
97  * since it would have to land in the gap at least once, causing a
98  * fault.
99  */
100 static unsigned long alloc_shstk(unsigned long addr, unsigned long size,
101 				 unsigned long token_offset, bool set_res_tok)
102 {
103 	unsigned long mapped_addr;
104 
105 	mapped_addr = vm_mmap_shadow_stack(addr, size, MAP_ABOVE4G);
106 
107 	if (!set_res_tok || IS_ERR_VALUE(mapped_addr))
108 		goto out;
109 
110 	if (create_rstor_token(mapped_addr + token_offset, NULL)) {
111 		vm_munmap(mapped_addr, size);
112 		return -EINVAL;
113 	}
114 
115 out:
116 	return mapped_addr;
117 }
118 
119 static unsigned long adjust_shstk_size(unsigned long size)
120 {
121 	if (size)
122 		return PAGE_ALIGN(size);
123 
124 	return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G));
125 }
126 
127 static void unmap_shadow_stack(u64 base, u64 size)
128 {
129 	int r;
130 
131 	r = vm_munmap(base, size);
132 
133 	/*
134 	 * mmap_write_lock_killable() failed with -EINTR. This means
135 	 * the process is about to die and have it's MM cleaned up.
136 	 * This task shouldn't ever make it back to userspace. In this
137 	 * case it is ok to leak a shadow stack, so just exit out.
138 	 */
139 	if (r == -EINTR)
140 		return;
141 
142 	/*
143 	 * For all other types of vm_munmap() failure, either the
144 	 * system is out of memory or there is bug.
145 	 */
146 	WARN_ON_ONCE(r);
147 }
148 
149 static int shstk_setup(void)
150 {
151 	struct thread_shstk *shstk = &current->thread.shstk;
152 	unsigned long addr, size;
153 
154 	/* Already enabled */
155 	if (features_enabled(ARCH_SHSTK_SHSTK))
156 		return 0;
157 
158 	/* Also not supported for 32 bit */
159 	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_ia32_syscall())
160 		return -EOPNOTSUPP;
161 
162 	size = adjust_shstk_size(0);
163 	addr = alloc_shstk(0, size, 0, false);
164 	if (IS_ERR_VALUE(addr))
165 		return PTR_ERR((void *)addr);
166 
167 	fpregs_lock_and_load();
168 	wrmsrq(MSR_IA32_PL3_SSP, addr + size);
169 	wrmsrq(MSR_IA32_U_CET, CET_SHSTK_EN);
170 	fpregs_unlock();
171 
172 	shstk->base = addr;
173 	shstk->size = size;
174 	features_set(ARCH_SHSTK_SHSTK);
175 
176 	return 0;
177 }
178 
179 void reset_thread_features(void)
180 {
181 	memset(&current->thread.shstk, 0, sizeof(struct thread_shstk));
182 	current->thread.features = 0;
183 	current->thread.features_locked = 0;
184 }
185 
186 unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, u64 clone_flags,
187 				       unsigned long stack_size)
188 {
189 	struct thread_shstk *shstk = &tsk->thread.shstk;
190 	unsigned long addr, size;
191 
192 	/*
193 	 * If shadow stack is not enabled on the new thread, skip any
194 	 * switch to a new shadow stack.
195 	 */
196 	if (!features_enabled(ARCH_SHSTK_SHSTK))
197 		return 0;
198 
199 	/*
200 	 * For CLONE_VFORK the child will share the parents shadow stack.
201 	 * Make sure to clear the internal tracking of the thread shadow
202 	 * stack so the freeing logic run for child knows to leave it alone.
203 	 */
204 	if (clone_flags & CLONE_VFORK) {
205 		shstk->base = 0;
206 		shstk->size = 0;
207 		return 0;
208 	}
209 
210 	/*
211 	 * For !CLONE_VM the child will use a copy of the parents shadow
212 	 * stack.
213 	 */
214 	if (!(clone_flags & CLONE_VM))
215 		return 0;
216 
217 	size = adjust_shstk_size(stack_size);
218 	addr = alloc_shstk(0, size, 0, false);
219 	if (IS_ERR_VALUE(addr))
220 		return addr;
221 
222 	shstk->base = addr;
223 	shstk->size = size;
224 
225 	return addr + size;
226 }
227 
228 static unsigned long get_user_shstk_addr(void)
229 {
230 	unsigned long long ssp;
231 
232 	fpregs_lock_and_load();
233 
234 	rdmsrq(MSR_IA32_PL3_SSP, ssp);
235 
236 	fpregs_unlock();
237 
238 	return ssp;
239 }
240 
241 int shstk_pop(u64 *val)
242 {
243 	int ret = 0;
244 	u64 ssp;
245 
246 	if (!features_enabled(ARCH_SHSTK_SHSTK))
247 		return -ENOTSUPP;
248 
249 	fpregs_lock_and_load();
250 
251 	rdmsrq(MSR_IA32_PL3_SSP, ssp);
252 	if (val && get_user(*val, (__user u64 *)ssp))
253 		ret = -EFAULT;
254 	else
255 		wrmsrq(MSR_IA32_PL3_SSP, ssp + SS_FRAME_SIZE);
256 	fpregs_unlock();
257 
258 	return ret;
259 }
260 
261 int shstk_push(u64 val)
262 {
263 	u64 ssp;
264 	int ret;
265 
266 	if (!features_enabled(ARCH_SHSTK_SHSTK))
267 		return -ENOTSUPP;
268 
269 	fpregs_lock_and_load();
270 
271 	rdmsrq(MSR_IA32_PL3_SSP, ssp);
272 	ssp -= SS_FRAME_SIZE;
273 	ret = write_user_shstk_64((__user void *)ssp, val);
274 	if (!ret)
275 		wrmsrq(MSR_IA32_PL3_SSP, ssp);
276 	fpregs_unlock();
277 
278 	return ret;
279 }
280 
281 #define SHSTK_DATA_BIT BIT(63)
282 
283 static int put_shstk_data(u64 __user *addr, u64 data)
284 {
285 	if (WARN_ON_ONCE(data & SHSTK_DATA_BIT))
286 		return -EINVAL;
287 
288 	/*
289 	 * Mark the high bit so that the sigframe can't be processed as a
290 	 * return address.
291 	 */
292 	if (write_user_shstk_64(addr, data | SHSTK_DATA_BIT))
293 		return -EFAULT;
294 	return 0;
295 }
296 
297 static int get_shstk_data(unsigned long *data, unsigned long __user *addr)
298 {
299 	unsigned long ldata;
300 
301 	if (unlikely(get_user(ldata, addr)))
302 		return -EFAULT;
303 
304 	if (!(ldata & SHSTK_DATA_BIT))
305 		return -EINVAL;
306 
307 	*data = ldata & ~SHSTK_DATA_BIT;
308 
309 	return 0;
310 }
311 
312 static int shstk_push_sigframe(unsigned long *ssp)
313 {
314 	unsigned long target_ssp = *ssp;
315 
316 	/* Token must be aligned */
317 	if (!IS_ALIGNED(target_ssp, 8))
318 		return -EINVAL;
319 
320 	*ssp -= SS_FRAME_SIZE;
321 	if (put_shstk_data((void __user *)*ssp, target_ssp))
322 		return -EFAULT;
323 
324 	return 0;
325 }
326 
327 static int shstk_pop_sigframe(unsigned long *ssp)
328 {
329 	struct vm_area_struct *vma;
330 	unsigned long token_addr;
331 	bool need_to_check_vma;
332 	int err = 1;
333 
334 	/*
335 	 * It is possible for the SSP to be off the end of a shadow stack by 4
336 	 * or 8 bytes. If the shadow stack is at the start of a page or 4 bytes
337 	 * before it, it might be this case, so check that the address being
338 	 * read is actually shadow stack.
339 	 */
340 	if (!IS_ALIGNED(*ssp, 8))
341 		return -EINVAL;
342 
343 	need_to_check_vma = PAGE_ALIGN(*ssp) == *ssp;
344 
345 	if (need_to_check_vma)
346 		if (mmap_read_lock_killable(current->mm))
347 			return -EINTR;
348 
349 	err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp);
350 	if (unlikely(err))
351 		goto out_err;
352 
353 	if (need_to_check_vma) {
354 		vma = find_vma(current->mm, *ssp);
355 		if (!vma || !(vma->vm_flags & VM_SHADOW_STACK)) {
356 			err = -EFAULT;
357 			goto out_err;
358 		}
359 
360 		mmap_read_unlock(current->mm);
361 	}
362 
363 	/* Restore SSP aligned? */
364 	if (unlikely(!IS_ALIGNED(token_addr, 8)))
365 		return -EINVAL;
366 
367 	/* SSP in userspace? */
368 	if (unlikely(token_addr >= TASK_SIZE_MAX))
369 		return -EINVAL;
370 
371 	*ssp = token_addr;
372 
373 	return 0;
374 out_err:
375 	if (need_to_check_vma)
376 		mmap_read_unlock(current->mm);
377 	return err;
378 }
379 
380 int setup_signal_shadow_stack(struct ksignal *ksig)
381 {
382 	void __user *restorer = ksig->ka.sa.sa_restorer;
383 	unsigned long ssp;
384 	int err;
385 
386 	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
387 	    !features_enabled(ARCH_SHSTK_SHSTK))
388 		return 0;
389 
390 	if (!restorer)
391 		return -EINVAL;
392 
393 	ssp = get_user_shstk_addr();
394 	if (unlikely(!ssp))
395 		return -EINVAL;
396 
397 	err = shstk_push_sigframe(&ssp);
398 	if (unlikely(err))
399 		return err;
400 
401 	/* Push restorer address */
402 	ssp -= SS_FRAME_SIZE;
403 	err = write_user_shstk_64((u64 __user *)ssp, (u64)restorer);
404 	if (unlikely(err))
405 		return -EFAULT;
406 
407 	fpregs_lock_and_load();
408 	wrmsrq(MSR_IA32_PL3_SSP, ssp);
409 	fpregs_unlock();
410 
411 	return 0;
412 }
413 
414 int restore_signal_shadow_stack(void)
415 {
416 	unsigned long ssp;
417 	int err;
418 
419 	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
420 	    !features_enabled(ARCH_SHSTK_SHSTK))
421 		return 0;
422 
423 	ssp = get_user_shstk_addr();
424 	if (unlikely(!ssp))
425 		return -EINVAL;
426 
427 	err = shstk_pop_sigframe(&ssp);
428 	if (unlikely(err))
429 		return err;
430 
431 	fpregs_lock_and_load();
432 	wrmsrq(MSR_IA32_PL3_SSP, ssp);
433 	fpregs_unlock();
434 
435 	return 0;
436 }
437 
438 void shstk_free(struct task_struct *tsk)
439 {
440 	struct thread_shstk *shstk = &tsk->thread.shstk;
441 
442 	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
443 	    !features_enabled(ARCH_SHSTK_SHSTK))
444 		return;
445 
446 	/*
447 	 * When fork() with CLONE_VM fails, the child (tsk) already has a
448 	 * shadow stack allocated, and exit_thread() calls this function to
449 	 * free it.  In this case the parent (current) and the child share
450 	 * the same mm struct.
451 	 */
452 	if (!tsk->mm || tsk->mm != current->mm)
453 		return;
454 
455 	/*
456 	 * If shstk->base is NULL, then this task is not managing its
457 	 * own shadow stack (CLONE_VFORK). So skip freeing it.
458 	 */
459 	if (!shstk->base)
460 		return;
461 
462 	/*
463 	 * shstk->base is NULL for CLONE_VFORK child tasks, and so is
464 	 * normal. But size = 0 on a shstk->base is not normal and
465 	 * indicated an attempt to free the thread shadow stack twice.
466 	 * Warn about it.
467 	 */
468 	if (WARN_ON(!shstk->size))
469 		return;
470 
471 	unmap_shadow_stack(shstk->base, shstk->size);
472 
473 	shstk->size = 0;
474 }
475 
476 static int wrss_control(bool enable)
477 {
478 	u64 msrval;
479 
480 	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
481 		return -EOPNOTSUPP;
482 
483 	/*
484 	 * Only enable WRSS if shadow stack is enabled. If shadow stack is not
485 	 * enabled, WRSS will already be disabled, so don't bother clearing it
486 	 * when disabling.
487 	 */
488 	if (!features_enabled(ARCH_SHSTK_SHSTK))
489 		return -EPERM;
490 
491 	/* Already enabled/disabled? */
492 	if (features_enabled(ARCH_SHSTK_WRSS) == enable)
493 		return 0;
494 
495 	fpregs_lock_and_load();
496 	rdmsrq(MSR_IA32_U_CET, msrval);
497 
498 	if (enable) {
499 		features_set(ARCH_SHSTK_WRSS);
500 		msrval |= CET_WRSS_EN;
501 	} else {
502 		features_clr(ARCH_SHSTK_WRSS);
503 		if (!(msrval & CET_WRSS_EN))
504 			goto unlock;
505 
506 		msrval &= ~CET_WRSS_EN;
507 	}
508 
509 	wrmsrq(MSR_IA32_U_CET, msrval);
510 
511 unlock:
512 	fpregs_unlock();
513 
514 	return 0;
515 }
516 
517 static int shstk_disable(void)
518 {
519 	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
520 		return -EOPNOTSUPP;
521 
522 	/* Already disabled? */
523 	if (!features_enabled(ARCH_SHSTK_SHSTK))
524 		return 0;
525 
526 	fpregs_lock_and_load();
527 	/* Disable WRSS too when disabling shadow stack */
528 	wrmsrq(MSR_IA32_U_CET, 0);
529 	wrmsrq(MSR_IA32_PL3_SSP, 0);
530 	fpregs_unlock();
531 
532 	shstk_free(current);
533 	features_clr(ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS);
534 
535 	return 0;
536 }
537 
538 SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
539 {
540 	bool set_tok = flags & SHADOW_STACK_SET_TOKEN;
541 	unsigned long aligned_size;
542 
543 	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
544 		return -EOPNOTSUPP;
545 
546 	if (flags & ~SHADOW_STACK_SET_TOKEN)
547 		return -EINVAL;
548 
549 	/* If there isn't space for a token */
550 	if (set_tok && size < 8)
551 		return -ENOSPC;
552 
553 	if (addr && addr < SZ_4G)
554 		return -ERANGE;
555 
556 	/*
557 	 * An overflow would result in attempting to write the restore token
558 	 * to the wrong location. Not catastrophic, but just return the right
559 	 * error code and block it.
560 	 */
561 	aligned_size = PAGE_ALIGN(size);
562 	if (aligned_size < size)
563 		return -EOVERFLOW;
564 
565 	return alloc_shstk(addr, aligned_size, size, set_tok);
566 }
567 
568 long shstk_prctl(struct task_struct *task, int option, unsigned long arg2)
569 {
570 	unsigned long features = arg2;
571 
572 	if (option == ARCH_SHSTK_STATUS) {
573 		return put_user(task->thread.features, (unsigned long __user *)arg2);
574 	}
575 
576 	if (option == ARCH_SHSTK_LOCK) {
577 		task->thread.features_locked |= features;
578 		return 0;
579 	}
580 
581 	/* Only allow via ptrace */
582 	if (task != current) {
583 		if (option == ARCH_SHSTK_UNLOCK && IS_ENABLED(CONFIG_CHECKPOINT_RESTORE)) {
584 			task->thread.features_locked &= ~features;
585 			return 0;
586 		}
587 		return -EINVAL;
588 	}
589 
590 	/* Do not allow to change locked features */
591 	if (features & task->thread.features_locked)
592 		return -EPERM;
593 
594 	/* Only support enabling/disabling one feature at a time. */
595 	if (hweight_long(features) > 1)
596 		return -EINVAL;
597 
598 	if (option == ARCH_SHSTK_DISABLE) {
599 		if (features & ARCH_SHSTK_WRSS)
600 			return wrss_control(false);
601 		if (features & ARCH_SHSTK_SHSTK)
602 			return shstk_disable();
603 		return -EINVAL;
604 	}
605 
606 	/* Handle ARCH_SHSTK_ENABLE */
607 	if (features & ARCH_SHSTK_SHSTK)
608 		return shstk_setup();
609 	if (features & ARCH_SHSTK_WRSS)
610 		return wrss_control(true);
611 	return -EINVAL;
612 }
613 
614 int shstk_update_last_frame(unsigned long val)
615 {
616 	unsigned long ssp;
617 
618 	if (!features_enabled(ARCH_SHSTK_SHSTK))
619 		return 0;
620 
621 	ssp = get_user_shstk_addr();
622 	return write_user_shstk_64((u64 __user *)ssp, (u64)val);
623 }
624 
625 bool shstk_is_enabled(void)
626 {
627 	return features_enabled(ARCH_SHSTK_SHSTK);
628 }
629