// SPDX-License-Identifier: GPL-2.0
/*
 * shstk.c - Intel shadow stack support
 *
 * Copyright (c) 2021, Intel Corporation.
 * Yu-cheng Yu <yu-cheng.yu@intel.com>
 */

#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/sched/signal.h>
#include <linux/compat.h>
#include <linux/sizes.h>
#include <linux/user.h>
#include <linux/syscalls.h>
#include <asm/msr.h>
#include <asm/fpu/xstate.h>
#include <asm/fpu/types.h>
#include <asm/shstk.h>
#include <asm/special_insns.h>
#include <asm/fpu/api.h>
#include <asm/prctl.h>

#define SS_FRAME_SIZE 8

static bool features_enabled(unsigned long features)
{
	return current->thread.features & features;
}

static void features_set(unsigned long features)
{
	current->thread.features |= features;
}

static void features_clr(unsigned long features)
{
	current->thread.features &= ~features;
}

/*
 * Create a restore token on the shadow stack. A token is always 8 bytes
 * in size and aligned to 8.
 */
static int create_rstor_token(unsigned long ssp, unsigned long *token_addr)
{
	unsigned long addr;

	/* Token must be aligned */
	if (!IS_ALIGNED(ssp, 8))
		return -EINVAL;

	addr = ssp - SS_FRAME_SIZE;

	/*
	 * SSP is aligned, so the reserved bits and the mode bit are zero.
	 * Just mark the token 64-bit.
	 */
	ssp |= BIT(0);

	if (write_user_shstk_64((u64 __user *)addr, (u64)ssp))
		return -EFAULT;

	if (token_addr)
		*token_addr = addr;

	return 0;
}
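/*
 * Illustration only (not kernel code): a rough sketch of how userspace
 * might consume a restore token like the one written above when pivoting
 * to a new shadow stack, assuming a 64-bit build with shadow stacks
 * enabled and 'token' being the user address of an 8-byte restore token
 * (for example the one map_shadow_stack() places at the top of a new
 * mapping):
 *
 *	asm volatile("rstorssp (%0)\n\t"  // switch SSP onto the new stack
 *		     "saveprevssp"        // leave a token on the old stack
 *		     : : "r" (token) : "memory");
 *
 * RSTORSSP checks, among other things, that the token is 8-byte aligned
 * and has the 64-bit mode bit set, which is exactly what
 * create_rstor_token() arranges.
 */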
/*
 * VM_SHADOW_STACK will have a guard page. This helps userspace protect
 * itself from attacks. The reasoning is as follows:
 *
 * The shadow stack pointer (SSP) is moved by CALL, RET, and INCSSPQ. The
 * INCSSP instruction can increment the shadow stack pointer. It is the
 * shadow stack analog of an instruction like:
 *
 * addq $0x80, %rsp
 *
 * However, there is one important difference between an ADD on %rsp
 * and INCSSP. In addition to modifying SSP, INCSSP also reads from the
 * memory of the first and last elements that were "popped". It can be
 * thought of as acting like this:
 *
 * READ_ONCE(ssp);       // read+discard top element on stack
 * ssp += nr_to_pop * 8; // move the shadow stack
 * READ_ONCE(ssp-8);     // read+discard last popped stack element
 *
 * The maximum distance INCSSP can move the SSP is 2040 bytes, before
 * it would read the memory. Therefore a single page gap will be enough
 * to prevent any operation from shifting the SSP to an adjacent stack,
 * since it would have to land in the gap at least once, causing a
 * fault.
 */
static unsigned long alloc_shstk(unsigned long addr, unsigned long size,
				 unsigned long token_offset, bool set_res_tok)
{
	unsigned long mapped_addr;

	mapped_addr = vm_mmap_shadow_stack(addr, size, MAP_ABOVE4G);

	if (!set_res_tok || IS_ERR_VALUE(mapped_addr))
		goto out;

	if (create_rstor_token(mapped_addr + token_offset, NULL)) {
		vm_munmap(mapped_addr, size);
		return -EINVAL;
	}

out:
	return mapped_addr;
}

static unsigned long adjust_shstk_size(unsigned long size)
{
	if (size)
		return PAGE_ALIGN(size);

	return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G));
}
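/*
 * Worked example (illustration only): with a typical RLIMIT_STACK of 8 MiB,
 * adjust_shstk_size(0) returns 8 MiB; with RLIMIT_STACK set to unlimited
 * the default is clamped to SZ_4G. An explicit request is only rounded up
 * to a whole page, e.g. adjust_shstk_size(0x3100) returns 0x4000 with
 * 4K pages.
 */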
static void unmap_shadow_stack(u64 base, u64 size)
{
	int r;

	r = vm_munmap(base, size);

	/*
	 * mmap_write_lock_killable() failed with -EINTR. This means
	 * the process is about to die and have its MM cleaned up.
	 * This task shouldn't ever make it back to userspace. In this
	 * case it is ok to leak a shadow stack, so just exit out.
	 */
	if (r == -EINTR)
		return;

	/*
	 * For all other types of vm_munmap() failure, either the
	 * system is out of memory or there is a bug.
	 */
	WARN_ON_ONCE(r);
}

static int shstk_setup(void)
{
	struct thread_shstk *shstk = &current->thread.shstk;
	unsigned long addr, size;

	/* Already enabled */
	if (features_enabled(ARCH_SHSTK_SHSTK))
		return 0;

	/* Also not supported for 32 bit */
	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_ia32_syscall())
		return -EOPNOTSUPP;

	size = adjust_shstk_size(0);
	addr = alloc_shstk(0, size, 0, false);
	if (IS_ERR_VALUE(addr))
		return PTR_ERR((void *)addr);

	fpregs_lock_and_load();
	wrmsrq(MSR_IA32_PL3_SSP, addr + size);
	wrmsrq(MSR_IA32_U_CET, CET_SHSTK_EN);
	fpregs_unlock();

	shstk->base = addr;
	shstk->size = size;
	features_set(ARCH_SHSTK_SHSTK);

	return 0;
}

void reset_thread_features(void)
{
	memset(&current->thread.shstk, 0, sizeof(struct thread_shstk));
	current->thread.features = 0;
	current->thread.features_locked = 0;
}

unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, u64 clone_flags,
				       unsigned long stack_size)
{
	struct thread_shstk *shstk = &tsk->thread.shstk;
	unsigned long addr, size;

	/*
	 * If shadow stack is not enabled on the new thread, skip any
	 * switch to a new shadow stack.
	 */
	if (!features_enabled(ARCH_SHSTK_SHSTK))
		return 0;

	/*
	 * For CLONE_VFORK the child will share the parent's shadow stack.
	 * Make sure to clear the internal tracking of the thread shadow
	 * stack so the freeing logic run for the child knows to leave it
	 * alone.
	 */
	if (clone_flags & CLONE_VFORK) {
		shstk->base = 0;
		shstk->size = 0;
		return 0;
	}

	/*
	 * For !CLONE_VM the child will use a copy of the parent's shadow
	 * stack.
	 */
	if (!(clone_flags & CLONE_VM))
		return 0;

	size = adjust_shstk_size(stack_size);
	addr = alloc_shstk(0, size, 0, false);
	if (IS_ERR_VALUE(addr))
		return addr;

	shstk->base = addr;
	shstk->size = size;

	return addr + size;
}

static unsigned long get_user_shstk_addr(void)
{
	unsigned long long ssp;

	fpregs_lock_and_load();

	rdmsrq(MSR_IA32_PL3_SSP, ssp);

	fpregs_unlock();

	return ssp;
}

int shstk_pop(u64 *val)
{
	int ret = 0;
	u64 ssp;

	if (!features_enabled(ARCH_SHSTK_SHSTK))
		return -ENOTSUPP;

	fpregs_lock_and_load();

	rdmsrq(MSR_IA32_PL3_SSP, ssp);
	if (val && get_user(*val, (__user u64 *)ssp))
		ret = -EFAULT;
	else
		wrmsrq(MSR_IA32_PL3_SSP, ssp + SS_FRAME_SIZE);
	fpregs_unlock();

	return ret;
}

int shstk_push(u64 val)
{
	u64 ssp;
	int ret;

	if (!features_enabled(ARCH_SHSTK_SHSTK))
		return -ENOTSUPP;

	fpregs_lock_and_load();

	rdmsrq(MSR_IA32_PL3_SSP, ssp);
	ssp -= SS_FRAME_SIZE;
	ret = write_user_shstk_64((__user void *)ssp, val);
	if (!ret)
		wrmsrq(MSR_IA32_PL3_SSP, ssp);
	fpregs_unlock();

	return ret;
}

#define SHSTK_DATA_BIT BIT(63)

static int put_shstk_data(u64 __user *addr, u64 data)
{
	if (WARN_ON_ONCE(data & SHSTK_DATA_BIT))
		return -EINVAL;

	/*
	 * Mark the high bit so that the sigframe can't be processed as a
	 * return address.
	 */
	if (write_user_shstk_64(addr, data | SHSTK_DATA_BIT))
		return -EFAULT;
	return 0;
}

static int get_shstk_data(unsigned long *data, unsigned long __user *addr)
{
	unsigned long ldata;

	if (unlikely(get_user(ldata, addr)))
		return -EFAULT;

	if (!(ldata & SHSTK_DATA_BIT))
		return -EINVAL;

	*data = ldata & ~SHSTK_DATA_BIT;

	return 0;
}

static int shstk_push_sigframe(unsigned long *ssp)
{
	unsigned long target_ssp = *ssp;

	/* Token must be aligned */
	if (!IS_ALIGNED(target_ssp, 8))
		return -EINVAL;

	*ssp -= SS_FRAME_SIZE;
	if (put_shstk_data((void __user *)*ssp, target_ssp))
		return -EFAULT;

	return 0;
}

static int shstk_pop_sigframe(unsigned long *ssp)
{
	unsigned long token_addr;
	unsigned int seq;

	/*
	 * It is possible for the SSP to be off the end of a shadow stack by 4
	 * or 8 bytes. If the shadow stack is at the start of a page or 4 bytes
	 * before it, it might be this case, so check that the address being
	 * read is actually shadow stack.
	 */
	if (!IS_ALIGNED(*ssp, 8))
		return -EINVAL;

	do {
		struct vm_area_struct *vma;
		bool valid_vma;
		int err;

		if (mmap_read_lock_killable(current->mm))
			return -EINTR;

		vma = find_vma(current->mm, *ssp);
		valid_vma = vma && (vma->vm_flags & VM_SHADOW_STACK);

		/*
		 * VMAs can change between get_shstk_data() and find_vma().
		 * Watch for changes and ensure that 'token_addr' comes from
		 * 'vma' by recording a seqcount.
		 *
		 * Ignore the return value of mmap_lock_speculate_try_begin()
		 * because the mmap lock excludes the possibility of writers.
		 */
		mmap_lock_speculate_try_begin(current->mm, &seq);
		mmap_read_unlock(current->mm);

		if (!valid_vma)
			return -EINVAL;

		err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp);
		if (err)
			return err;
	} while (mmap_lock_speculate_retry(current->mm, seq));

	/* Restore SSP aligned? */
	if (unlikely(!IS_ALIGNED(token_addr, 8)))
		return -EINVAL;

	/* SSP in userspace? */
	if (unlikely(token_addr >= TASK_SIZE_MAX))
		return -EINVAL;

	*ssp = token_addr;

	return 0;
}

int setup_signal_shadow_stack(struct ksignal *ksig)
{
	void __user *restorer = ksig->ka.sa.sa_restorer;
	unsigned long ssp;
	int err;

	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
	    !features_enabled(ARCH_SHSTK_SHSTK))
		return 0;

	if (!restorer)
		return -EINVAL;

	ssp = get_user_shstk_addr();
	if (unlikely(!ssp))
		return -EINVAL;

	err = shstk_push_sigframe(&ssp);
	if (unlikely(err))
		return err;

	/* Push restorer address */
	ssp -= SS_FRAME_SIZE;
	err = write_user_shstk_64((u64 __user *)ssp, (u64)restorer);
	if (unlikely(err))
		return -EFAULT;

	fpregs_lock_and_load();
	wrmsrq(MSR_IA32_PL3_SSP, ssp);
	fpregs_unlock();

	return 0;
}
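/*
 * Resulting shadow stack layout at signal delivery (illustration only),
 * where 'old_ssp' is the SSP value before the signal:
 *
 *	old_ssp - 8:	old_ssp | BIT(63)   <- written by shstk_push_sigframe()
 *	old_ssp - 16:	restorer address    <- new SSP
 *
 * Bit 63 marks the saved SSP as data so a RET cannot consume it as a
 * return address. The handler's return through the restorer pops the
 * restorer entry, and sigreturn then restores old_ssp via
 * restore_signal_shadow_stack() below.
 */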
int restore_signal_shadow_stack(void)
{
	unsigned long ssp;
	int err;

	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
	    !features_enabled(ARCH_SHSTK_SHSTK))
		return 0;

	ssp = get_user_shstk_addr();
	if (unlikely(!ssp))
		return -EINVAL;

	err = shstk_pop_sigframe(&ssp);
	if (unlikely(err))
		return err;

	fpregs_lock_and_load();
	wrmsrq(MSR_IA32_PL3_SSP, ssp);
	fpregs_unlock();

	return 0;
}

void shstk_free(struct task_struct *tsk)
{
	struct thread_shstk *shstk = &tsk->thread.shstk;

	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
	    !features_enabled(ARCH_SHSTK_SHSTK))
		return;

	/*
	 * When fork() with CLONE_VM fails, the child (tsk) already has a
	 * shadow stack allocated, and exit_thread() calls this function to
	 * free it. In this case the parent (current) and the child share
	 * the same mm struct.
	 */
	if (!tsk->mm || tsk->mm != current->mm)
		return;

	/*
	 * If shstk->base is NULL, then this task is not managing its
	 * own shadow stack (CLONE_VFORK). So skip freeing it.
	 */
	if (!shstk->base)
		return;

	/*
	 * shstk->base is NULL for CLONE_VFORK child tasks, and so is
	 * normal. But size = 0 on a shstk->base is not normal and
	 * indicates an attempt to free the thread shadow stack twice.
	 * Warn about it.
	 */
	if (WARN_ON(!shstk->size))
		return;

	unmap_shadow_stack(shstk->base, shstk->size);

	shstk->size = 0;
}

static int wrss_control(bool enable)
{
	u64 msrval;

	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
		return -EOPNOTSUPP;

	/*
	 * Only enable WRSS if shadow stack is enabled. If shadow stack is not
	 * enabled, WRSS will already be disabled, so don't bother clearing it
	 * when disabling.
	 */
	if (!features_enabled(ARCH_SHSTK_SHSTK))
		return -EPERM;

	/* Already enabled/disabled? */
	if (features_enabled(ARCH_SHSTK_WRSS) == enable)
		return 0;

	fpregs_lock_and_load();
	rdmsrq(MSR_IA32_U_CET, msrval);

	if (enable) {
		features_set(ARCH_SHSTK_WRSS);
		msrval |= CET_WRSS_EN;
	} else {
		features_clr(ARCH_SHSTK_WRSS);
		if (!(msrval & CET_WRSS_EN))
			goto unlock;

		msrval &= ~CET_WRSS_EN;
	}

	wrmsrq(MSR_IA32_U_CET, msrval);

unlock:
	fpregs_unlock();

	return 0;
}

static int shstk_disable(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
		return -EOPNOTSUPP;

	/* Already disabled? */
	if (!features_enabled(ARCH_SHSTK_SHSTK))
		return 0;

	fpregs_lock_and_load();
	/* Disable WRSS too when disabling shadow stack */
	wrmsrq(MSR_IA32_U_CET, 0);
	wrmsrq(MSR_IA32_PL3_SSP, 0);
	fpregs_unlock();

	shstk_free(current);
	features_clr(ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS);

	return 0;
}

SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
{
	bool set_tok = flags & SHADOW_STACK_SET_TOKEN;
	unsigned long aligned_size;

	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
		return -EOPNOTSUPP;

	if (flags & ~SHADOW_STACK_SET_TOKEN)
		return -EINVAL;

	/* If there isn't space for a token */
	if (set_tok && size < 8)
		return -ENOSPC;

	if (addr && addr < SZ_4G)
		return -ERANGE;

	/*
	 * An overflow would result in attempting to write the restore token
	 * to the wrong location. Not catastrophic, but just return the right
	 * error code and block it.
	 */
	aligned_size = PAGE_ALIGN(size);
	if (aligned_size < size)
		return -EOVERFLOW;

	return alloc_shstk(addr, aligned_size, size, set_tok);
}
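/*
 * Illustration only: a minimal userspace call, assuming a libc syscall()
 * wrapper and the uapi definitions of __NR_map_shadow_stack and
 * SHADOW_STACK_SET_TOKEN:
 *
 *	void *ss = (void *)syscall(__NR_map_shadow_stack, 0, 0x20000,
 *				   SHADOW_STACK_SET_TOKEN);
 *
 * On success this returns a 128 KiB shadow stack mapped above 4 GB, with a
 * restore token in its topmost 8 bytes ready to be consumed by RSTORSSP.
 */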
long shstk_prctl(struct task_struct *task, int option, unsigned long arg2)
{
	unsigned long features = arg2;

	if (option == ARCH_SHSTK_STATUS) {
		return put_user(task->thread.features, (unsigned long __user *)arg2);
	}

	if (option == ARCH_SHSTK_LOCK) {
		task->thread.features_locked |= features;
		return 0;
	}

	/* Only allow via ptrace */
	if (task != current) {
		if (option == ARCH_SHSTK_UNLOCK && IS_ENABLED(CONFIG_CHECKPOINT_RESTORE)) {
			task->thread.features_locked &= ~features;
			return 0;
		}
		return -EINVAL;
	}

	/* Do not allow changing locked features */
	if (features & task->thread.features_locked)
		return -EPERM;

	/* Only support enabling/disabling one feature at a time. */
	if (hweight_long(features) > 1)
		return -EINVAL;

	if (option == ARCH_SHSTK_DISABLE) {
		if (features & ARCH_SHSTK_WRSS)
			return wrss_control(false);
		if (features & ARCH_SHSTK_SHSTK)
			return shstk_disable();
		return -EINVAL;
	}

	/* Handle ARCH_SHSTK_ENABLE */
	if (features & ARCH_SHSTK_SHSTK)
		return shstk_setup();
	if (features & ARCH_SHSTK_WRSS)
		return wrss_control(true);
	return -EINVAL;
}

int shstk_update_last_frame(unsigned long val)
{
	unsigned long ssp;

	if (!features_enabled(ARCH_SHSTK_SHSTK))
		return 0;

	ssp = get_user_shstk_addr();
	return write_user_shstk_64((u64 __user *)ssp, (u64)val);
}

bool shstk_is_enabled(void)
{
	return features_enabled(ARCH_SHSTK_SHSTK);
}
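/*
 * Illustration only: the typical userspace sequence driving shstk_prctl()
 * above, assuming <asm/prctl.h> and a raw arch_prctl(2) call via syscall():
 *
 *	syscall(SYS_arch_prctl, ARCH_SHSTK_ENABLE, ARCH_SHSTK_SHSTK);
 *	// ... once the desired feature set is final ...
 *	syscall(SYS_arch_prctl, ARCH_SHSTK_LOCK, ARCH_SHSTK_SHSTK);
 *
 * After ARCH_SHSTK_LOCK, later attempts to enable or disable the locked
 * feature fail with -EPERM (see the features_locked check above).
 */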