// SPDX-License-Identifier: GPL-2.0
/*
 * shstk.c - Intel shadow stack support
 *
 * Copyright (c) 2021, Intel Corporation.
 * Yu-cheng Yu <yu-cheng.yu@intel.com>
 */

#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/sched/signal.h>
#include <linux/compat.h>
#include <linux/sizes.h>
#include <linux/user.h>
#include <linux/syscalls.h>
#include <asm/msr.h>
#include <asm/fpu/xstate.h>
#include <asm/fpu/types.h>
#include <asm/shstk.h>
#include <asm/special_insns.h>
#include <asm/fpu/api.h>
#include <asm/prctl.h>

#define SS_FRAME_SIZE 8

features_enabled(unsigned long features)31 static bool features_enabled(unsigned long features)
32 {
33 return current->thread.features & features;
34 }
35
features_set(unsigned long features)36 static void features_set(unsigned long features)
37 {
38 current->thread.features |= features;
39 }
40
features_clr(unsigned long features)41 static void features_clr(unsigned long features)
42 {
43 current->thread.features &= ~features;
44 }
45
46 /*
47 * Create a restore token on the shadow stack. A token is always 8-byte
48 * and aligned to 8.
49 */
create_rstor_token(unsigned long ssp,unsigned long * token_addr)50 static int create_rstor_token(unsigned long ssp, unsigned long *token_addr)
51 {
52 unsigned long addr;
53
54 /* Token must be aligned */
55 if (!IS_ALIGNED(ssp, 8))
56 return -EINVAL;
57
58 addr = ssp - SS_FRAME_SIZE;
59
60 /*
61 * SSP is aligned, so reserved bits and mode bit are a zero, just mark
62 * the token 64-bit.
63 */
64 ssp |= BIT(0);
65
66 if (write_user_shstk_64((u64 __user *)addr, (u64)ssp))
67 return -EFAULT;
68
69 if (token_addr)
70 *token_addr = addr;
71
72 return 0;
73 }
74
75 /*
76 * VM_SHADOW_STACK will have a guard page. This helps userspace protect
77 * itself from attacks. The reasoning is as follows:
78 *
79 * The shadow stack pointer(SSP) is moved by CALL, RET, and INCSSPQ. The
80 * INCSSP instruction can increment the shadow stack pointer. It is the
81 * shadow stack analog of an instruction like:
82 *
83 * addq $0x80, %rsp
84 *
85 * However, there is one important difference between an ADD on %rsp
86 * and INCSSP. In addition to modifying SSP, INCSSP also reads from the
87 * memory of the first and last elements that were "popped". It can be
88 * thought of as acting like this:
89 *
90 * READ_ONCE(ssp); // read+discard top element on stack
91 * ssp += nr_to_pop * 8; // move the shadow stack
92 * READ_ONCE(ssp-8); // read+discard last popped stack element
93 *
94 * The maximum distance INCSSP can move the SSP is 2040 bytes, before
95 * it would read the memory. Therefore a single page gap will be enough
96 * to prevent any operation from shifting the SSP to an adjacent stack,
97 * since it would have to land in the gap at least once, causing a
98 * fault.
99 */
alloc_shstk(unsigned long addr,unsigned long size,unsigned long token_offset,bool set_res_tok)100 static unsigned long alloc_shstk(unsigned long addr, unsigned long size,
101 unsigned long token_offset, bool set_res_tok)
102 {
103 unsigned long mapped_addr;
104
105 mapped_addr = vm_mmap_shadow_stack(addr, size, MAP_ABOVE4G);
106
107 if (!set_res_tok || IS_ERR_VALUE(mapped_addr))
108 goto out;
109
110 if (create_rstor_token(mapped_addr + token_offset, NULL)) {
111 vm_munmap(mapped_addr, size);
112 return -EINVAL;
113 }
114
115 out:
116 return mapped_addr;
117 }
118
adjust_shstk_size(unsigned long size)119 static unsigned long adjust_shstk_size(unsigned long size)
120 {
121 if (size)
122 return PAGE_ALIGN(size);
123
124 return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G));
125 }
126
/* Unmap a shadow stack region, warning on unexpected failure. */
static void unmap_shadow_stack(u64 base, u64 size)
{
	int ret = vm_munmap(base, size);

	/*
	 * mmap_write_lock_killable() failed with -EINTR. This means
	 * the process is about to die and have its MM cleaned up.
	 * This task shouldn't ever make it back to userspace. In this
	 * case it is ok to leak a shadow stack, so just exit out.
	 */
	if (ret == -EINTR)
		return;

	/*
	 * For all other types of vm_munmap() failure, either the
	 * system is out of memory or there is bug.
	 */
	WARN_ON_ONCE(ret);
}

shstk_setup(void)149 static int shstk_setup(void)
150 {
151 struct thread_shstk *shstk = ¤t->thread.shstk;
152 unsigned long addr, size;
153
154 /* Already enabled */
155 if (features_enabled(ARCH_SHSTK_SHSTK))
156 return 0;
157
158 /* Also not supported for 32 bit */
159 if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_ia32_syscall())
160 return -EOPNOTSUPP;
161
162 size = adjust_shstk_size(0);
163 addr = alloc_shstk(0, size, 0, false);
164 if (IS_ERR_VALUE(addr))
165 return PTR_ERR((void *)addr);
166
167 fpregs_lock_and_load();
168 wrmsrq(MSR_IA32_PL3_SSP, addr + size);
169 wrmsrq(MSR_IA32_U_CET, CET_SHSTK_EN);
170 fpregs_unlock();
171
172 shstk->base = addr;
173 shstk->size = size;
174 features_set(ARCH_SHSTK_SHSTK);
175
176 return 0;
177 }
178
reset_thread_features(void)179 void reset_thread_features(void)
180 {
181 memset(¤t->thread.shstk, 0, sizeof(struct thread_shstk));
182 current->thread.features = 0;
183 current->thread.features_locked = 0;
184 }
185
/*
 * Allocate a shadow stack for a new thread being cloned. Returns the
 * new shadow stack pointer for the child, 0 when no new shadow stack
 * is needed, or a negative error value from the allocation.
 */
unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, u64 clone_flags,
				       unsigned long stack_size)
{
	struct thread_shstk *shstk = &tsk->thread.shstk;
	unsigned long alloc_size, base;

	/* Nothing to switch to if the new thread has no shadow stack. */
	if (!features_enabled(ARCH_SHSTK_SHSTK))
		return 0;

	/*
	 * For CLONE_VFORK the child will share the parent's shadow stack.
	 * Clear the internal tracking of the thread shadow stack so the
	 * freeing logic run for the child knows to leave it alone.
	 */
	if (clone_flags & CLONE_VFORK) {
		shstk->base = 0;
		shstk->size = 0;
		return 0;
	}

	/*
	 * For !CLONE_VM the child will use a copy of the parent's shadow
	 * stack.
	 */
	if (!(clone_flags & CLONE_VM))
		return 0;

	alloc_size = adjust_shstk_size(stack_size);
	base = alloc_shstk(0, alloc_size, 0, false);
	if (IS_ERR_VALUE(base))
		return base;

	shstk->base = base;
	shstk->size = alloc_size;

	/* The child's SSP starts at the high end of the new region. */
	return base + alloc_size;
}

get_user_shstk_addr(void)228 static unsigned long get_user_shstk_addr(void)
229 {
230 unsigned long long ssp;
231
232 fpregs_lock_and_load();
233
234 rdmsrq(MSR_IA32_PL3_SSP, ssp);
235
236 fpregs_unlock();
237
238 return ssp;
239 }
240
/*
 * Pop one frame off the current shadow stack, optionally returning the
 * popped value in *val (pass NULL to discard it). Returns 0 on success,
 * -ENOTSUPP if shadow stack is off, -EFAULT if the read faults.
 */
int shstk_pop(u64 *val)
{
	u64 ssp;
	int err = 0;

	if (!features_enabled(ARCH_SHSTK_SHSTK))
		return -ENOTSUPP;

	fpregs_lock_and_load();
	rdmsrq(MSR_IA32_PL3_SSP, ssp);

	/* Only advance the SSP if the (optional) read-back succeeded. */
	if (!val || !get_user(*val, (__user u64 *)ssp))
		wrmsrq(MSR_IA32_PL3_SSP, ssp + SS_FRAME_SIZE);
	else
		err = -EFAULT;

	fpregs_unlock();

	return err;
}

/*
 * Push @val onto the current shadow stack. Returns 0 on success,
 * -ENOTSUPP if shadow stack is off, or the error from the shadow
 * stack write.
 */
int shstk_push(u64 val)
{
	u64 new_ssp;
	int err;

	if (!features_enabled(ARCH_SHSTK_SHSTK))
		return -ENOTSUPP;

	fpregs_lock_and_load();
	rdmsrq(MSR_IA32_PL3_SSP, new_ssp);
	new_ssp -= SS_FRAME_SIZE;

	/* Only move the SSP down once the value has actually landed. */
	err = write_user_shstk_64((__user void *)new_ssp, val);
	if (!err)
		wrmsrq(MSR_IA32_PL3_SSP, new_ssp);

	fpregs_unlock();

	return err;
}

281 #define SHSTK_DATA_BIT BIT(63)
282
put_shstk_data(u64 __user * addr,u64 data)283 static int put_shstk_data(u64 __user *addr, u64 data)
284 {
285 if (WARN_ON_ONCE(data & SHSTK_DATA_BIT))
286 return -EINVAL;
287
288 /*
289 * Mark the high bit so that the sigframe can't be processed as a
290 * return address.
291 */
292 if (write_user_shstk_64(addr, data | SHSTK_DATA_BIT))
293 return -EFAULT;
294 return 0;
295 }
296
/*
 * Read a tagged data value from the shadow stack and strip the
 * SHSTK_DATA_BIT marker. Fails with -EINVAL if the marker is absent.
 */
static int get_shstk_data(unsigned long *data, unsigned long __user *addr)
{
	unsigned long raw;

	if (unlikely(get_user(raw, addr)))
		return -EFAULT;

	/* Only values carrying the data marker are trusted here. */
	if (!(raw & SHSTK_DATA_BIT))
		return -EINVAL;

	*data = raw & ~SHSTK_DATA_BIT;

	return 0;
}

shstk_push_sigframe(unsigned long * ssp)312 static int shstk_push_sigframe(unsigned long *ssp)
313 {
314 unsigned long target_ssp = *ssp;
315
316 /* Token must be aligned */
317 if (!IS_ALIGNED(target_ssp, 8))
318 return -EINVAL;
319
320 *ssp -= SS_FRAME_SIZE;
321 if (put_shstk_data((void __user *)*ssp, target_ssp))
322 return -EFAULT;
323
324 return 0;
325 }
326
/*
 * Pop the sigframe restore token at *ssp and, on success, point *ssp at
 * the pre-signal shadow stack location it encodes. The token is only
 * trusted if it was read from a shadow stack VMA; a retry loop guards
 * against the VMA changing between the lookup and the read.
 */
static int shstk_pop_sigframe(unsigned long *ssp)
{
	unsigned long token_addr;
	unsigned int seq;

	/*
	 * It is possible for the SSP to be off the end of a shadow stack by 4
	 * or 8 bytes. If the shadow stack is at the start of a page or 4 bytes
	 * before it, it might be this case, so check that the address being
	 * read is actually shadow stack.
	 */
	if (!IS_ALIGNED(*ssp, 8))
		return -EINVAL;

	do {
		struct vm_area_struct *vma;
		bool valid_vma;
		int err;

		if (mmap_read_lock_killable(current->mm))
			return -EINTR;

		/* *ssp must land in a shadow stack VMA to be trusted. */
		vma = find_vma(current->mm, *ssp);
		valid_vma = vma && (vma->vm_flags & VM_SHADOW_STACK);

		/*
		 * VMAs can change between get_shstk_data() and find_vma().
		 * Watch for changes and ensure that 'token_addr' comes from
		 * 'vma' by recording a seqcount.
		 *
		 * Ignore the return value of mmap_lock_speculate_try_begin()
		 * because the mmap lock excludes the possibility of writers.
		 */
		mmap_lock_speculate_try_begin(current->mm, &seq);
		mmap_read_unlock(current->mm);

		if (!valid_vma)
			return -EINVAL;

		err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp);
		if (err)
			return err;
	} while (mmap_lock_speculate_retry(current->mm, seq));

	/* Restore SSP aligned? */
	if (unlikely(!IS_ALIGNED(token_addr, 8)))
		return -EINVAL;

	/* SSP in userspace? */
	if (unlikely(token_addr >= TASK_SIZE_MAX))
		return -EINVAL;

	*ssp = token_addr;

	return 0;
}

/*
 * Prepare the shadow stack for signal delivery: push a sigframe restore
 * token (so sigreturn can recover the old SSP) and the restorer
 * address, then move MSR_IA32_PL3_SSP below them.
 *
 * Returns 0 on success (or when shadow stack is not enabled), -EINVAL
 * for a missing restorer or unusable SSP, -EFAULT when a shadow stack
 * write fails, or an error from pushing the sigframe token.
 */
int setup_signal_shadow_stack(struct ksignal *ksig)
{
	void __user *restorer = ksig->ka.sa.sa_restorer;
	unsigned long ssp;
	int err;

	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
	    !features_enabled(ARCH_SHSTK_SHSTK))
		return 0;

	/* A restorer is required so the handler returns through it. */
	if (!restorer)
		return -EINVAL;

	ssp = get_user_shstk_addr();
	if (unlikely(!ssp))
		return -EINVAL;

	err = shstk_push_sigframe(&ssp);
	if (unlikely(err))
		return err;

	/* Push restorer address */
	ssp -= SS_FRAME_SIZE;
	err = write_user_shstk_64((u64 __user *)ssp, (u64)restorer);
	if (unlikely(err))
		return -EFAULT;

	fpregs_lock_and_load();
	wrmsrq(MSR_IA32_PL3_SSP, ssp);
	fpregs_unlock();

	return 0;
}

/*
 * Undo setup_signal_shadow_stack() at sigreturn: validate and pop the
 * sigframe restore token, then move MSR_IA32_PL3_SSP back to the
 * pre-signal shadow stack pointer.
 *
 * Returns 0 on success (or when shadow stack is not enabled), otherwise
 * a negative error from reading/validating the token.
 */
int restore_signal_shadow_stack(void)
{
	unsigned long ssp;
	int err;

	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
	    !features_enabled(ARCH_SHSTK_SHSTK))
		return 0;

	ssp = get_user_shstk_addr();
	if (unlikely(!ssp))
		return -EINVAL;

	err = shstk_pop_sigframe(&ssp);
	if (unlikely(err))
		return err;

	fpregs_lock_and_load();
	wrmsrq(MSR_IA32_PL3_SSP, ssp);
	fpregs_unlock();

	return 0;
}

/*
 * Free @tsk's thread shadow stack if this task actually owns one.
 * Tolerates tasks that never enabled shadow stack, tasks sharing a
 * parent's shadow stack (CLONE_VFORK), and tasks whose mm is gone.
 */
void shstk_free(struct task_struct *tsk)
{
	struct thread_shstk *shstk = &tsk->thread.shstk;

	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
	    !features_enabled(ARCH_SHSTK_SHSTK))
		return;

	/*
	 * When fork() with CLONE_VM fails, the child (tsk) already has a
	 * shadow stack allocated, and exit_thread() calls this function to
	 * free it. In this case the parent (current) and the child share
	 * the same mm struct.
	 */
	if (!tsk->mm || tsk->mm != current->mm)
		return;

	/*
	 * If shstk->base is NULL, then this task is not managing its
	 * own shadow stack (CLONE_VFORK). So skip freeing it.
	 */
	if (!shstk->base)
		return;

	/*
	 * shstk->base is NULL for CLONE_VFORK child tasks, and so is
	 * normal. But size = 0 on a shstk->base is not normal and
	 * indicated an attempt to free the thread shadow stack twice.
	 * Warn about it.
	 */
	if (WARN_ON(!shstk->size))
		return;

	unmap_shadow_stack(shstk->base, shstk->size);

	/* Zero size so a second call trips the WARN above, not a re-free. */
	shstk->size = 0;
}

/*
 * Enable or disable the WRSS instruction for the current task by
 * toggling CET_WRSS_EN in MSR_IA32_U_CET and the matching feature bit.
 *
 * Returns 0 on success or if already in the requested state,
 * -EOPNOTSUPP without CPU support, -EPERM if shadow stack is off.
 */
static int wrss_control(bool enable)
{
	u64 msrval;

	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
		return -EOPNOTSUPP;

	/*
	 * Only enable WRSS if shadow stack is enabled. If shadow stack is not
	 * enabled, WRSS will already be disabled, so don't bother clearing it
	 * when disabling.
	 */
	if (!features_enabled(ARCH_SHSTK_SHSTK))
		return -EPERM;

	/* Already in the requested state? */
	if (features_enabled(ARCH_SHSTK_WRSS) == enable)
		return 0;

	fpregs_lock_and_load();
	rdmsrq(MSR_IA32_U_CET, msrval);

	if (enable) {
		features_set(ARCH_SHSTK_WRSS);
		wrmsrq(MSR_IA32_U_CET, msrval | CET_WRSS_EN);
	} else {
		features_clr(ARCH_SHSTK_WRSS);
		/* Skip the MSR write when the bit is already clear. */
		if (msrval & CET_WRSS_EN)
			wrmsrq(MSR_IA32_U_CET, msrval & ~CET_WRSS_EN);
	}

	fpregs_unlock();

	return 0;
}

/*
 * Turn shadow stack enforcement off for the current task: clear the CET
 * MSRs, free the thread's shadow stack, and drop the SHSTK and WRSS
 * feature bits.
 *
 * Returns 0 on success (including when already disabled), or
 * -EOPNOTSUPP when the CPU lacks user shadow stack support.
 */
static int shstk_disable(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
		return -EOPNOTSUPP;

	/* Already disabled? */
	if (!features_enabled(ARCH_SHSTK_SHSTK))
		return 0;

	fpregs_lock_and_load();
	/* Disable WRSS too when disabling shadow stack */
	wrmsrq(MSR_IA32_U_CET, 0);
	wrmsrq(MSR_IA32_PL3_SSP, 0);
	fpregs_unlock();

	shstk_free(current);
	features_clr(ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS);

	return 0;
}

/*
 * map_shadow_stack() syscall: map a shadow stack region of @size bytes
 * at @addr (or anywhere if @addr is zero; mappings go above 4G via
 * MAP_ABOVE4G). With SHADOW_STACK_SET_TOKEN in @flags, a restore token
 * is written at offset @size into the region.
 *
 * Returns the mapped address on success, or a negative error value.
 */
SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
{
	bool set_tok = flags & SHADOW_STACK_SET_TOKEN;
	unsigned long aligned_size;

	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
		return -EOPNOTSUPP;

	/* SHADOW_STACK_SET_TOKEN is the only recognized flag. */
	if (flags & ~SHADOW_STACK_SET_TOKEN)
		return -EINVAL;

	/* If there isn't space for a token */
	if (set_tok && size < 8)
		return -ENOSPC;

	/* Explicit placement below 4G conflicts with MAP_ABOVE4G. */
	if (addr && addr < SZ_4G)
		return -ERANGE;

	/*
	 * An overflow would result in attempting to write the restore token
	 * to the wrong location. Not catastrophic, but just return the right
	 * error code and block it.
	 */
	aligned_size = PAGE_ALIGN(size);
	if (aligned_size < size)
		return -EOVERFLOW;

	/* The token (if requested) sits at the unaligned @size offset. */
	return alloc_shstk(addr, aligned_size, size, set_tok);
}

/*
 * arch_prctl() handler for the ARCH_SHSTK_* options. @arg2 is either a
 * user pointer (ARCH_SHSTK_STATUS) or a mask of ARCH_SHSTK_* feature
 * bits for the remaining options. Note the fall-through structure:
 * any option other than STATUS, LOCK, UNLOCK (ptrace) and DISABLE is
 * handled by the ENABLE path at the bottom.
 */
long shstk_prctl(struct task_struct *task, int option, unsigned long arg2)
{
	unsigned long features = arg2;

	/* Report the task's current feature bits to userspace. */
	if (option == ARCH_SHSTK_STATUS) {
		return put_user(task->thread.features, (unsigned long __user *)arg2);
	}

	/* Locking features is allowed on self or (via ptrace) another task. */
	if (option == ARCH_SHSTK_LOCK) {
		task->thread.features_locked |= features;
		return 0;
	}

	/* Only allow via ptrace */
	if (task != current) {
		if (option == ARCH_SHSTK_UNLOCK && IS_ENABLED(CONFIG_CHECKPOINT_RESTORE)) {
			task->thread.features_locked &= ~features;
			return 0;
		}
		return -EINVAL;
	}

	/* Do not allow to change locked features */
	if (features & task->thread.features_locked)
		return -EPERM;

	/* Only support enabling/disabling one feature at a time. */
	if (hweight_long(features) > 1)
		return -EINVAL;

	if (option == ARCH_SHSTK_DISABLE) {
		if (features & ARCH_SHSTK_WRSS)
			return wrss_control(false);
		if (features & ARCH_SHSTK_SHSTK)
			return shstk_disable();
		return -EINVAL;
	}

	/* Handle ARCH_SHSTK_ENABLE */
	if (features & ARCH_SHSTK_SHSTK)
		return shstk_setup();
	if (features & ARCH_SHSTK_WRSS)
		return wrss_control(true);
	return -EINVAL;
}

/*
 * Overwrite the entry at the current shadow stack pointer (the last
 * shadow stack frame) with @val. Returns 0 when shadow stack is
 * disabled or on success, otherwise the shadow stack write error.
 */
int shstk_update_last_frame(unsigned long val)
{
	unsigned long ssp;

	if (!features_enabled(ARCH_SHSTK_SHSTK))
		return 0;

	ssp = get_user_shstk_addr();
	return write_user_shstk_64((u64 __user *)ssp, (u64)val);
}

shstk_is_enabled(void)629 bool shstk_is_enabled(void)
630 {
631 return features_enabled(ARCH_SHSTK_SHSTK);
632 }
633