// SPDX-License-Identifier: GPL-2.0
/*
 * shstk.c - Intel shadow stack support
 *
 * Copyright (c) 2021, Intel Corporation.
 * Yu-cheng Yu <yu-cheng.yu@intel.com>
 */

#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/sched/signal.h>
#include <linux/compat.h>
#include <linux/sizes.h>
#include <linux/user.h>
#include <linux/syscalls.h>
#include <asm/msr.h>
#include <asm/fpu/xstate.h>
#include <asm/fpu/types.h>
#include <asm/shstk.h>
#include <asm/special_insns.h>
#include <asm/fpu/api.h>
#include <asm/prctl.h>

#define SS_FRAME_SIZE 8
static bool features_enabled(unsigned long features)
{
        return current->thread.features & features;
}

static void features_set(unsigned long features)
{
        current->thread.features |= features;
}

static void features_clr(unsigned long features)
{
        current->thread.features &= ~features;
}

/*
 * Create a restore token on the shadow stack. A token is always 8 bytes
 * in size and aligned to 8 bytes.
 */
static int create_rstor_token(unsigned long ssp, unsigned long *token_addr)
{
        unsigned long addr;

        /* Token must be aligned */
        if (!IS_ALIGNED(ssp, 8))
                return -EINVAL;

        addr = ssp - SS_FRAME_SIZE;

        /*
         * SSP is aligned, so the reserved bits and the mode bit are zero.
         * Just mark the token 64-bit.
         */
        ssp |= BIT(0);

        if (write_user_shstk_64((u64 __user *)addr, (u64)ssp))
                return -EFAULT;

        if (token_addr)
                *token_addr = addr;

        return 0;
}

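/*
 * For illustration, given an 8-byte-aligned ssp, create_rstor_token()
 * leaves memory looking like this (a sketch of the code above, not an
 * extra format definition):
 *
 *       ssp -> +--------------------+
 *              |   ssp | BIT(0)     |  <- restore token: the target SSP
 *   ssp - 8 -> +--------------------+     with bit 0 flagging 64-bit mode
 *
 * RSTORSSP consumes a token of this form to switch the SSP over to the
 * recorded address.
 */
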
/*
 * VM_SHADOW_STACK will have a guard page. This helps userspace protect
 * itself from attacks. The reasoning is as follows:
 *
 * The shadow stack pointer (SSP) is moved by CALL, RET, and INCSSPQ. The
 * INCSSP instruction can increment the shadow stack pointer. It is the
 * shadow stack analog of an instruction like:
 *
 *   addq $0x80, %rsp
 *
 * However, there is one important difference between an ADD on %rsp
 * and INCSSP. In addition to modifying SSP, INCSSP also reads from the
 * memory of the first and last elements that were "popped". It can be
 * thought of as acting like this:
 *
 * READ_ONCE(ssp);       // read+discard top element on stack
 * ssp += nr_to_pop * 8; // move the shadow stack
 * READ_ONCE(ssp-8);     // read+discard last popped stack element
 *
 * The maximum distance INCSSP can move the SSP is 2040 bytes, before
 * it would read the memory. Therefore a single page gap will be enough
 * to prevent any operation from shifting the SSP to an adjacent stack,
 * since it would have to land in the gap at least once, causing a
 * fault.
 */
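/*
 * Spelling out the arithmetic behind the 2040-byte figure above (an
 * illustrative note, not new policy): INCSSPQ pops at most 255 entries
 * of 8 bytes each, so the farthest the SSP can move in one instruction
 * is 255 * 8 = 2040 bytes, and the trailing READ_ONCE(ssp - 8) lands at
 * most 2032 bytes past the starting point. Both are well under a
 * 4096-byte guard page, so any attempt to hop over the gap must touch
 * it and fault.
 */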
static unsigned long alloc_shstk(unsigned long addr, unsigned long size,
                                 unsigned long token_offset, bool set_res_tok)
{
        int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G;
        struct mm_struct *mm = current->mm;
        unsigned long mapped_addr, unused;

        if (addr)
                flags |= MAP_FIXED_NOREPLACE;

        mmap_write_lock(mm);
        mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags,
                              VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL);
        mmap_write_unlock(mm);

        if (!set_res_tok || IS_ERR_VALUE(mapped_addr))
                goto out;

        if (create_rstor_token(mapped_addr + token_offset, NULL)) {
                vm_munmap(mapped_addr, size);
                return -EINVAL;
        }

out:
        return mapped_addr;
}

static unsigned long adjust_shstk_size(unsigned long size)
{
        if (size)
                return PAGE_ALIGN(size);

        return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G));
}

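/*
 * A worked example of the sizing above (the 8 MiB figure is only a
 * common default stack rlimit, not something this code depends on):
 * with RLIMIT_STACK at 8 MiB, adjust_shstk_size(0) returns 8 MiB; with
 * an unlimited stack rlimit the result is capped at SZ_4G. An explicit
 * size, such as a clone3() stack_size, is simply page-aligned and used
 * as-is.
 */
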
static void unmap_shadow_stack(u64 base, u64 size)
{
        int r;

        r = vm_munmap(base, size);

        /*
         * mmap_write_lock_killable() failed with -EINTR. This means
         * the process is about to die and have its MM cleaned up.
         * This task shouldn't ever make it back to userspace. In this
         * case it is ok to leak a shadow stack, so just exit out.
         */
        if (r == -EINTR)
                return;

        /*
         * For all other types of vm_munmap() failure, either the
         * system is out of memory or there is a bug.
         */
        WARN_ON_ONCE(r);
}

static int shstk_setup(void)
{
        struct thread_shstk *shstk = &current->thread.shstk;
        unsigned long addr, size;

        /* Already enabled */
        if (features_enabled(ARCH_SHSTK_SHSTK))
                return 0;

        /* Also not supported for 32-bit */
        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_ia32_syscall())
                return -EOPNOTSUPP;

        size = adjust_shstk_size(0);
        addr = alloc_shstk(0, size, 0, false);
        if (IS_ERR_VALUE(addr))
                return PTR_ERR((void *)addr);

        fpregs_lock_and_load();
        wrmsrq(MSR_IA32_PL3_SSP, addr + size);
        wrmsrq(MSR_IA32_U_CET, CET_SHSTK_EN);
        fpregs_unlock();

        shstk->base = addr;
        shstk->size = size;
        features_set(ARCH_SHSTK_SHSTK);

        return 0;
}

void reset_thread_features(void)
{
        memset(&current->thread.shstk, 0, sizeof(struct thread_shstk));
        current->thread.features = 0;
        current->thread.features_locked = 0;
}

unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, u64 clone_flags,
                                       unsigned long stack_size)
{
        struct thread_shstk *shstk = &tsk->thread.shstk;
        unsigned long addr, size;

        /*
         * If shadow stack is not enabled on the new thread, skip any
         * switch to a new shadow stack.
         */
        if (!features_enabled(ARCH_SHSTK_SHSTK))
                return 0;

        /*
         * For CLONE_VFORK the child will share the parent's shadow stack.
         * Make sure to clear the internal tracking of the thread shadow
         * stack so the freeing logic run for the child knows to leave it
         * alone.
         */
        if (clone_flags & CLONE_VFORK) {
                shstk->base = 0;
                shstk->size = 0;
                return 0;
        }

        /*
         * For !CLONE_VM the child will use a copy of the parent's shadow
         * stack.
         */
        if (!(clone_flags & CLONE_VM))
                return 0;

        size = adjust_shstk_size(stack_size);
        addr = alloc_shstk(0, size, 0, false);
        if (IS_ERR_VALUE(addr))
                return addr;

        shstk->base = addr;
        shstk->size = size;

        return addr + size;
}

static unsigned long get_user_shstk_addr(void)
{
        unsigned long long ssp;

        fpregs_lock_and_load();

        rdmsrq(MSR_IA32_PL3_SSP, ssp);

        fpregs_unlock();

        return ssp;
}

int shstk_pop(u64 *val)
{
        int ret = 0;
        u64 ssp;

        if (!features_enabled(ARCH_SHSTK_SHSTK))
                return -ENOTSUPP;

        fpregs_lock_and_load();

        rdmsrq(MSR_IA32_PL3_SSP, ssp);
        if (val && get_user(*val, (u64 __user *)ssp))
                ret = -EFAULT;
        else
                wrmsrq(MSR_IA32_PL3_SSP, ssp + SS_FRAME_SIZE);
        fpregs_unlock();

        return ret;
}

int shstk_push(u64 val)
{
        u64 ssp;
        int ret;

        if (!features_enabled(ARCH_SHSTK_SHSTK))
                return -ENOTSUPP;

        fpregs_lock_and_load();

        rdmsrq(MSR_IA32_PL3_SSP, ssp);
        ssp -= SS_FRAME_SIZE;
        ret = write_user_shstk_64((u64 __user *)ssp, val);
        if (!ret)
                wrmsrq(MSR_IA32_PL3_SSP, ssp);
        fpregs_unlock();

        return ret;
}

#define SHSTK_DATA_BIT BIT(63)

static int put_shstk_data(u64 __user *addr, u64 data)
{
        if (WARN_ON_ONCE(data & SHSTK_DATA_BIT))
                return -EINVAL;

        /*
         * Mark the high bit so that the sigframe can't be processed as a
         * return address.
         */
        if (write_user_shstk_64(addr, data | SHSTK_DATA_BIT))
                return -EFAULT;
        return 0;
}

static int get_shstk_data(unsigned long *data, unsigned long __user *addr)
{
        unsigned long ldata;

        if (unlikely(get_user(ldata, addr)))
                return -EFAULT;

        if (!(ldata & SHSTK_DATA_BIT))
                return -EINVAL;

        *data = ldata & ~SHSTK_DATA_BIT;

        return 0;
}

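/*
 * Why bit 63 in particular: a user pointer can never have bit 63 set,
 * so a value tagged with SHSTK_DATA_BIT is a non-canonical address and
 * cannot be usefully consumed as a return address. A small round trip,
 * as a sketch of the two helpers above (0x7ffd1000 is just an
 * illustrative user address):
 *
 *   put_shstk_data(addr, 0x7ffd1000)  writes  0x800000007ffd1000
 *   get_shstk_data(&v, addr)          yields  v == 0x7ffd1000
 */
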
static int shstk_push_sigframe(unsigned long *ssp)
{
        unsigned long target_ssp = *ssp;

        /* Token must be aligned */
        if (!IS_ALIGNED(target_ssp, 8))
                return -EINVAL;

        *ssp -= SS_FRAME_SIZE;
        if (put_shstk_data((void __user *)*ssp, target_ssp))
                return -EFAULT;

        return 0;
}

static int shstk_pop_sigframe(unsigned long *ssp)
{
        struct vm_area_struct *vma;
        unsigned long token_addr;
        bool need_to_check_vma;
        int err = 1;

        /*
         * It is possible for the SSP to be off the end of a shadow stack by 4
         * or 8 bytes. If the shadow stack is at the start of a page or 4 bytes
         * before it, it might be this case, so check that the address being
         * read is actually shadow stack.
         */
        if (!IS_ALIGNED(*ssp, 8))
                return -EINVAL;

        need_to_check_vma = PAGE_ALIGN(*ssp) == *ssp;

        if (need_to_check_vma)
                mmap_read_lock_killable(current->mm);

        err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp);
        if (unlikely(err))
                goto out_err;

        if (need_to_check_vma) {
                vma = find_vma(current->mm, *ssp);
                if (!vma || !(vma->vm_flags & VM_SHADOW_STACK)) {
                        err = -EFAULT;
                        goto out_err;
                }

                mmap_read_unlock(current->mm);
        }

        /* Restore SSP aligned? */
        if (unlikely(!IS_ALIGNED(token_addr, 8)))
                return -EINVAL;

        /* SSP in userspace? */
        if (unlikely(token_addr >= TASK_SIZE_MAX))
                return -EINVAL;

        *ssp = token_addr;

        return 0;
out_err:
        if (need_to_check_vma)
                mmap_read_unlock(current->mm);
        return err;
}

int setup_signal_shadow_stack(struct ksignal *ksig)
{
        void __user *restorer = ksig->ka.sa.sa_restorer;
        unsigned long ssp;
        int err;

        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
            !features_enabled(ARCH_SHSTK_SHSTK))
                return 0;

        if (!restorer)
                return -EINVAL;

        ssp = get_user_shstk_addr();
        if (unlikely(!ssp))
                return -EINVAL;

        err = shstk_push_sigframe(&ssp);
        if (unlikely(err))
                return err;

        /* Push restorer address */
        ssp -= SS_FRAME_SIZE;
        err = write_user_shstk_64((u64 __user *)ssp, (u64)restorer);
        if (unlikely(err))
                return -EFAULT;

        fpregs_lock_and_load();
        wrmsrq(MSR_IA32_PL3_SSP, ssp);
        fpregs_unlock();

        return 0;
}

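/*
 * After setup_signal_shadow_stack() the shadow stack looks roughly like
 * this (a sketch assembled from the two pushes above):
 *
 *   old SSP -> +--------------------------+
 *              | old SSP | SHSTK_DATA_BIT |  <- shstk_push_sigframe()
 *              +--------------------------+
 *   new SSP -> | restorer address         |  <- popped by the RET that
 *              +--------------------------+     returns to the restorer
 *
 * restore_signal_shadow_stack() below later undoes the first push via
 * shstk_pop_sigframe(), recovering the interrupted SSP.
 */
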
int restore_signal_shadow_stack(void)
{
        unsigned long ssp;
        int err;

        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
            !features_enabled(ARCH_SHSTK_SHSTK))
                return 0;

        ssp = get_user_shstk_addr();
        if (unlikely(!ssp))
                return -EINVAL;

        err = shstk_pop_sigframe(&ssp);
        if (unlikely(err))
                return err;

        fpregs_lock_and_load();
        wrmsrq(MSR_IA32_PL3_SSP, ssp);
        fpregs_unlock();

        return 0;
}

void shstk_free(struct task_struct *tsk)
{
        struct thread_shstk *shstk = &tsk->thread.shstk;

        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
            !features_enabled(ARCH_SHSTK_SHSTK))
                return;

        /*
         * When fork() with CLONE_VM fails, the child (tsk) already has a
         * shadow stack allocated, and exit_thread() calls this function to
         * free it. In this case the parent (current) and the child share
         * the same mm struct.
         */
        if (!tsk->mm || tsk->mm != current->mm)
                return;

        /*
         * If shstk->base is NULL, then this task is not managing its
         * own shadow stack (CLONE_VFORK). So skip freeing it.
         */
        if (!shstk->base)
                return;

        /*
         * shstk->base is NULL for CLONE_VFORK child tasks, and so is
         * normal. But size = 0 with a non-NULL shstk->base is not normal
         * and indicates an attempt to free the thread shadow stack twice.
         * Warn about it.
         */
        if (WARN_ON(!shstk->size))
                return;

        unmap_shadow_stack(shstk->base, shstk->size);

        shstk->size = 0;
}

static int wrss_control(bool enable)
{
        u64 msrval;

        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
                return -EOPNOTSUPP;

        /*
         * Only enable WRSS if shadow stack is enabled. If shadow stack is not
         * enabled, WRSS will already be disabled, so don't bother clearing it
         * when disabling.
         */
        if (!features_enabled(ARCH_SHSTK_SHSTK))
                return -EPERM;

        /* Already enabled/disabled? */
        if (features_enabled(ARCH_SHSTK_WRSS) == enable)
                return 0;

        fpregs_lock_and_load();
        rdmsrq(MSR_IA32_U_CET, msrval);

        if (enable) {
                features_set(ARCH_SHSTK_WRSS);
                msrval |= CET_WRSS_EN;
        } else {
                features_clr(ARCH_SHSTK_WRSS);
                if (!(msrval & CET_WRSS_EN))
                        goto unlock;

                msrval &= ~CET_WRSS_EN;
        }

        wrmsrq(MSR_IA32_U_CET, msrval);

unlock:
        fpregs_unlock();

        return 0;
}

static int shstk_disable(void)
{
        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
                return -EOPNOTSUPP;

        /* Already disabled? */
        if (!features_enabled(ARCH_SHSTK_SHSTK))
                return 0;

        fpregs_lock_and_load();
        /* Disable WRSS too when disabling shadow stack */
        wrmsrq(MSR_IA32_U_CET, 0);
        wrmsrq(MSR_IA32_PL3_SSP, 0);
        fpregs_unlock();

        shstk_free(current);
        features_clr(ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS);

        return 0;
}

SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
{
        bool set_tok = flags & SHADOW_STACK_SET_TOKEN;
        unsigned long aligned_size;

        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
                return -EOPNOTSUPP;

        if (flags & ~SHADOW_STACK_SET_TOKEN)
                return -EINVAL;

        /* If there isn't space for a token */
        if (set_tok && size < 8)
                return -ENOSPC;

        if (addr && addr < SZ_4G)
                return -ERANGE;

        /*
         * An overflow would result in attempting to write the restore token
         * to the wrong location. Not catastrophic, but just return the right
         * error code and block it.
         */
        aligned_size = PAGE_ALIGN(size);
        if (aligned_size < size)
                return -EOVERFLOW;

        return alloc_shstk(addr, aligned_size, size, set_tok);
}

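/*
 * From userspace, the syscall above can carve out a stack for a new
 * thread. A minimal sketch, assuming a kernel and headers that provide
 * __NR_map_shadow_stack and SHADOW_STACK_SET_TOKEN (the size is
 * arbitrary, for illustration only):
 *
 *   #include <unistd.h>
 *   #include <sys/syscall.h>
 *   #include <asm/mman.h>
 *
 *   unsigned long size = 0x20000;
 *   void *base = (void *)syscall(__NR_map_shadow_stack, 0, size,
 *                                SHADOW_STACK_SET_TOKEN);
 *
 * On success the restore token sits at base + size - 8, where RSTORSSP
 * can pick it up to switch onto the new stack.
 */
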
long shstk_prctl(struct task_struct *task, int option, unsigned long arg2)
{
        unsigned long features = arg2;

        if (option == ARCH_SHSTK_STATUS) {
                return put_user(task->thread.features, (unsigned long __user *)arg2);
        }

        if (option == ARCH_SHSTK_LOCK) {
                task->thread.features_locked |= features;
                return 0;
        }

        /* Only allow via ptrace */
        if (task != current) {
                if (option == ARCH_SHSTK_UNLOCK && IS_ENABLED(CONFIG_CHECKPOINT_RESTORE)) {
                        task->thread.features_locked &= ~features;
                        return 0;
                }
                return -EINVAL;
        }

        /* Do not allow to change locked features */
        if (features & task->thread.features_locked)
                return -EPERM;

        /* Only support enabling/disabling one feature at a time. */
        if (hweight_long(features) > 1)
                return -EINVAL;

        if (option == ARCH_SHSTK_DISABLE) {
                if (features & ARCH_SHSTK_WRSS)
                        return wrss_control(false);
                if (features & ARCH_SHSTK_SHSTK)
                        return shstk_disable();
                return -EINVAL;
        }

        /* Handle ARCH_SHSTK_ENABLE */
        if (features & ARCH_SHSTK_SHSTK)
                return shstk_setup();
        if (features & ARCH_SHSTK_WRSS)
                return wrss_control(true);
        return -EINVAL;
}

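/*
 * The handler above is reached via arch_prctl(2). A hedged sketch of
 * enabling and then locking shadow stack from userspace, assuming
 * <asm/prctl.h> exposes the ARCH_SHSTK_* constants (glibc has no
 * dedicated wrapper, so raw syscall() is used):
 *
 *   #include <unistd.h>
 *   #include <sys/syscall.h>
 *   #include <asm/prctl.h>
 *
 *   syscall(SYS_arch_prctl, ARCH_SHSTK_ENABLE, ARCH_SHSTK_SHSTK);
 *   syscall(SYS_arch_prctl, ARCH_SHSTK_LOCK, ARCH_SHSTK_SHSTK);
 *
 * After the LOCK call, a later ARCH_SHSTK_DISABLE of the same feature
 * fails with -EPERM unless a ptracer unlocks it first.
 */
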
int shstk_update_last_frame(unsigned long val)
{
        unsigned long ssp;

        if (!features_enabled(ARCH_SHSTK_SHSTK))
                return 0;

        ssp = get_user_shstk_addr();
        return write_user_shstk_64((u64 __user *)ssp, (u64)val);
}

bool shstk_is_enabled(void)
{
        return features_enabled(ARCH_SHSTK_SHSTK);
}