1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * fsgsbase.c, an fsgsbase test 4 * Copyright (c) 2014-2016 Andy Lutomirski 5 */ 6 7 #define _GNU_SOURCE 8 #include <stdio.h> 9 #include <stdlib.h> 10 #include <stdbool.h> 11 #include <string.h> 12 #include <sys/syscall.h> 13 #include <unistd.h> 14 #include <err.h> 15 #include <sys/user.h> 16 #include <asm/prctl.h> 17 #include <sys/prctl.h> 18 #include <signal.h> 19 #include <limits.h> 20 #include <sys/ucontext.h> 21 #include <sched.h> 22 #include <linux/futex.h> 23 #include <pthread.h> 24 #include <asm/ldt.h> 25 #include <sys/mman.h> 26 #include <stddef.h> 27 #include <sys/ptrace.h> 28 #include <sys/wait.h> 29 #include <setjmp.h> 30 31 #ifndef __x86_64__ 32 # error This test is 64-bit only 33 #endif 34 35 static volatile sig_atomic_t want_segv; 36 static volatile unsigned long segv_addr; 37 38 static unsigned short *shared_scratch; 39 40 static int nerrs; 41 42 static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), 43 int flags) 44 { 45 struct sigaction sa; 46 memset(&sa, 0, sizeof(sa)); 47 sa.sa_sigaction = handler; 48 sa.sa_flags = SA_SIGINFO | flags; 49 sigemptyset(&sa.sa_mask); 50 if (sigaction(sig, &sa, 0)) 51 err(1, "sigaction"); 52 } 53 54 static void clearhandler(int sig) 55 { 56 struct sigaction sa; 57 memset(&sa, 0, sizeof(sa)); 58 sa.sa_handler = SIG_DFL; 59 sigemptyset(&sa.sa_mask); 60 if (sigaction(sig, &sa, 0)) 61 err(1, "sigaction"); 62 } 63 64 static void sigsegv(int sig, siginfo_t *si, void *ctx_void) 65 { 66 ucontext_t *ctx = (ucontext_t*)ctx_void; 67 68 if (!want_segv) { 69 clearhandler(SIGSEGV); 70 return; /* Crash cleanly. */ 71 } 72 73 want_segv = false; 74 segv_addr = (unsigned long)si->si_addr; 75 76 ctx->uc_mcontext.gregs[REG_RIP] += 4; /* Skip the faulting mov */ 77 78 } 79 80 static jmp_buf jmpbuf; 81 82 static void sigill(int sig, siginfo_t *si, void *ctx_void) 83 { 84 siglongjmp(jmpbuf, 1); 85 } 86 87 static bool have_fsgsbase; 88 89 static inline unsigned long rdgsbase(void) 90 { 91 unsigned long gsbase; 92 93 asm volatile("rdgsbase %0" : "=r" (gsbase) :: "memory"); 94 95 return gsbase; 96 } 97 98 static inline unsigned long rdfsbase(void) 99 { 100 unsigned long fsbase; 101 102 asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory"); 103 104 return fsbase; 105 } 106 107 static inline void wrgsbase(unsigned long gsbase) 108 { 109 asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory"); 110 } 111 112 enum which_base { FS, GS }; 113 114 static unsigned long read_base(enum which_base which) 115 { 116 unsigned long offset; 117 /* 118 * Unless we have FSGSBASE, there's no direct way to do this from 119 * user mode. We can get at it indirectly using signals, though. 120 */ 121 122 want_segv = true; 123 124 offset = 0; 125 if (which == FS) { 126 /* Use a constant-length instruction here. */ 127 asm volatile ("mov %%fs:(%%rcx), %%rax" : : "c" (offset) : "rax"); 128 } else { 129 asm volatile ("mov %%gs:(%%rcx), %%rax" : : "c" (offset) : "rax"); 130 } 131 if (!want_segv) 132 return segv_addr + offset; 133 134 /* 135 * If that didn't segfault, try the other end of the address space. 136 * Unless we get really unlucky and run into the vsyscall page, this 137 * is guaranteed to segfault. 138 */ 139 140 offset = (ULONG_MAX >> 1) + 1; 141 if (which == FS) { 142 asm volatile ("mov %%fs:(%%rcx), %%rax" 143 : : "c" (offset) : "rax"); 144 } else { 145 asm volatile ("mov %%gs:(%%rcx), %%rax" 146 : : "c" (offset) : "rax"); 147 } 148 if (!want_segv) 149 return segv_addr + offset; 150 151 abort(); 152 } 153 154 static void check_gs_value(unsigned long value) 155 { 156 unsigned long base; 157 unsigned short sel; 158 159 printf("[RUN]\tARCH_SET_GS to 0x%lx\n", value); 160 if (syscall(SYS_arch_prctl, ARCH_SET_GS, value) != 0) 161 err(1, "ARCH_SET_GS"); 162 163 asm volatile ("mov %%gs, %0" : "=rm" (sel)); 164 base = read_base(GS); 165 if (base == value) { 166 printf("[OK]\tGSBASE was set as expected (selector 0x%hx)\n", 167 sel); 168 } else { 169 nerrs++; 170 printf("[FAIL]\tGSBASE was not as expected: got 0x%lx (selector 0x%hx)\n", 171 base, sel); 172 } 173 174 if (syscall(SYS_arch_prctl, ARCH_GET_GS, &base) != 0) 175 err(1, "ARCH_GET_GS"); 176 if (base == value) { 177 printf("[OK]\tARCH_GET_GS worked as expected (selector 0x%hx)\n", 178 sel); 179 } else { 180 nerrs++; 181 printf("[FAIL]\tARCH_GET_GS was not as expected: got 0x%lx (selector 0x%hx)\n", 182 base, sel); 183 } 184 } 185 186 static void mov_0_gs(unsigned long initial_base, bool schedule) 187 { 188 unsigned long base, arch_base; 189 190 printf("[RUN]\tARCH_SET_GS to 0x%lx then mov 0 to %%gs%s\n", initial_base, schedule ? " and schedule " : ""); 191 if (syscall(SYS_arch_prctl, ARCH_SET_GS, initial_base) != 0) 192 err(1, "ARCH_SET_GS"); 193 194 if (schedule) 195 usleep(10); 196 197 asm volatile ("mov %0, %%gs" : : "rm" (0)); 198 base = read_base(GS); 199 if (syscall(SYS_arch_prctl, ARCH_GET_GS, &arch_base) != 0) 200 err(1, "ARCH_GET_GS"); 201 if (base == arch_base) { 202 printf("[OK]\tGSBASE is 0x%lx\n", base); 203 } else { 204 nerrs++; 205 printf("[FAIL]\tGSBASE changed to 0x%lx but kernel reports 0x%lx\n", base, arch_base); 206 } 207 } 208 209 static volatile unsigned long remote_base; 210 static volatile unsigned int ftx; 211 212 /* 213 * ARCH_SET_FS/GS(0) may or may not program a selector of zero. HARD_ZERO 214 * means to force the selector to zero to improve test coverage. 215 */ 216 #define HARD_ZERO 0xa1fa5f343cb85fa4 217 218 static void do_remote_base() 219 { 220 unsigned long to_set = remote_base; 221 bool hard_zero = false; 222 if (to_set == HARD_ZERO) { 223 to_set = 0; 224 hard_zero = true; 225 } 226 227 if (syscall(SYS_arch_prctl, ARCH_SET_GS, to_set) != 0) 228 err(1, "ARCH_SET_GS"); 229 230 if (hard_zero) 231 asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0)); 232 233 unsigned short sel; 234 asm volatile ("mov %%gs, %0" : "=rm" (sel)); 235 printf("\tother thread: ARCH_SET_GS(0x%lx)%s -- sel is 0x%hx\n", 236 to_set, hard_zero ? " and clear gs" : "", sel); 237 } 238 239 static __thread int set_thread_area_entry_number = -1; 240 241 static unsigned short load_gs(void) 242 { 243 /* 244 * Sets GS != 0 and GSBASE != 0 but arranges for the kernel to think 245 * that GSBASE == 0 (i.e. thread.gsbase == 0). 246 */ 247 248 /* Step 1: tell the kernel that we have GSBASE == 0. */ 249 if (syscall(SYS_arch_prctl, ARCH_SET_GS, 0) != 0) 250 err(1, "ARCH_SET_GS"); 251 252 /* Step 2: change GSBASE without telling the kernel. */ 253 struct user_desc desc = { 254 .entry_number = 0, 255 .base_addr = 0xBAADF00D, 256 .limit = 0xfffff, 257 .seg_32bit = 1, 258 .contents = 0, /* Data, grow-up */ 259 .read_exec_only = 0, 260 .limit_in_pages = 1, 261 .seg_not_present = 0, 262 .useable = 0 263 }; 264 if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) == 0) { 265 printf("\tusing LDT slot 0\n"); 266 asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0x7)); 267 return 0x7; 268 } else { 269 /* No modify_ldt for us (configured out, perhaps) */ 270 271 struct user_desc *low_desc = mmap( 272 NULL, sizeof(desc), 273 PROT_READ | PROT_WRITE, 274 MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0); 275 memcpy(low_desc, &desc, sizeof(desc)); 276 277 low_desc->entry_number = set_thread_area_entry_number; 278 279 /* 32-bit set_thread_area */ 280 long ret; 281 asm volatile ("int $0x80" 282 : "=a" (ret), "+m" (*low_desc) 283 : "a" (243), "b" (low_desc) 284 : "r8", "r9", "r10", "r11"); 285 memcpy(&desc, low_desc, sizeof(desc)); 286 munmap(low_desc, sizeof(desc)); 287 288 if (ret != 0) { 289 printf("[NOTE]\tcould not create a segment -- test won't do anything\n"); 290 return 0; 291 } 292 printf("\tusing GDT slot %d\n", desc.entry_number); 293 set_thread_area_entry_number = desc.entry_number; 294 295 unsigned short gs = (unsigned short)((desc.entry_number << 3) | 0x3); 296 asm volatile ("mov %0, %%gs" : : "rm" (gs)); 297 return gs; 298 } 299 } 300 301 void test_wrbase(unsigned short index, unsigned long base) 302 { 303 unsigned short newindex; 304 unsigned long newbase; 305 306 printf("[RUN]\tGS = 0x%hx, GSBASE = 0x%lx\n", index, base); 307 308 asm volatile ("mov %0, %%gs" : : "rm" (index)); 309 wrgsbase(base); 310 311 remote_base = 0; 312 ftx = 1; 313 syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0); 314 while (ftx != 0) 315 syscall(SYS_futex, &ftx, FUTEX_WAIT, 1, NULL, NULL, 0); 316 317 asm volatile ("mov %%gs, %0" : "=rm" (newindex)); 318 newbase = rdgsbase(); 319 320 if (newindex == index && newbase == base) { 321 printf("[OK]\tIndex and base were preserved\n"); 322 } else { 323 printf("[FAIL]\tAfter switch, GS = 0x%hx and GSBASE = 0x%lx\n", 324 newindex, newbase); 325 nerrs++; 326 } 327 } 328 329 static void *threadproc(void *ctx) 330 { 331 while (1) { 332 while (ftx == 0) 333 syscall(SYS_futex, &ftx, FUTEX_WAIT, 0, NULL, NULL, 0); 334 if (ftx == 3) 335 return NULL; 336 337 if (ftx == 1) { 338 do_remote_base(); 339 } else if (ftx == 2) { 340 /* 341 * On AMD chips, this causes GSBASE != 0, GS == 0, and 342 * thread.gsbase == 0. 343 */ 344 345 load_gs(); 346 asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0)); 347 } else { 348 errx(1, "helper thread got bad command"); 349 } 350 351 ftx = 0; 352 syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0); 353 } 354 } 355 356 static void set_gs_and_switch_to(unsigned long local, 357 unsigned short force_sel, 358 unsigned long remote) 359 { 360 unsigned long base; 361 unsigned short sel_pre_sched, sel_post_sched; 362 363 bool hard_zero = false; 364 if (local == HARD_ZERO) { 365 hard_zero = true; 366 local = 0; 367 } 368 369 printf("[RUN]\tARCH_SET_GS(0x%lx)%s, then schedule to 0x%lx\n", 370 local, hard_zero ? " and clear gs" : "", remote); 371 if (force_sel) 372 printf("\tBefore schedule, set selector to 0x%hx\n", force_sel); 373 if (syscall(SYS_arch_prctl, ARCH_SET_GS, local) != 0) 374 err(1, "ARCH_SET_GS"); 375 if (hard_zero) 376 asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0)); 377 378 if (read_base(GS) != local) { 379 nerrs++; 380 printf("[FAIL]\tGSBASE wasn't set as expected\n"); 381 } 382 383 if (force_sel) { 384 asm volatile ("mov %0, %%gs" : : "rm" (force_sel)); 385 sel_pre_sched = force_sel; 386 local = read_base(GS); 387 388 /* 389 * Signal delivery is quite likely to change a selector 390 * of 1, 2, or 3 back to 0 due to IRET being defective. 391 */ 392 asm volatile ("mov %0, %%gs" : : "rm" (force_sel)); 393 } else { 394 asm volatile ("mov %%gs, %0" : "=rm" (sel_pre_sched)); 395 } 396 397 remote_base = remote; 398 ftx = 1; 399 syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0); 400 while (ftx != 0) 401 syscall(SYS_futex, &ftx, FUTEX_WAIT, 1, NULL, NULL, 0); 402 403 asm volatile ("mov %%gs, %0" : "=rm" (sel_post_sched)); 404 base = read_base(GS); 405 if (base == local && sel_pre_sched == sel_post_sched) { 406 printf("[OK]\tGS/BASE remained 0x%hx/0x%lx\n", 407 sel_pre_sched, local); 408 } else if (base == local && sel_pre_sched >= 1 && sel_pre_sched <= 3 && 409 sel_post_sched == 0) { 410 /* 411 * IRET is misdesigned and will squash selectors 1, 2, or 3 412 * to zero. Don't fail the test just because this happened. 413 */ 414 printf("[OK]\tGS/BASE changed from 0x%hx/0x%lx to 0x%hx/0x%lx because IRET is defective\n", 415 sel_pre_sched, local, sel_post_sched, base); 416 } else { 417 nerrs++; 418 printf("[FAIL]\tGS/BASE changed from 0x%hx/0x%lx to 0x%hx/0x%lx\n", 419 sel_pre_sched, local, sel_post_sched, base); 420 } 421 } 422 423 static void test_unexpected_base(void) 424 { 425 unsigned long base; 426 427 printf("[RUN]\tARCH_SET_GS(0), clear gs, then manipulate GSBASE in a different thread\n"); 428 if (syscall(SYS_arch_prctl, ARCH_SET_GS, 0) != 0) 429 err(1, "ARCH_SET_GS"); 430 asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0)); 431 432 ftx = 2; 433 syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0); 434 while (ftx != 0) 435 syscall(SYS_futex, &ftx, FUTEX_WAIT, 1, NULL, NULL, 0); 436 437 base = read_base(GS); 438 if (base == 0) { 439 printf("[OK]\tGSBASE remained 0\n"); 440 } else { 441 nerrs++; 442 printf("[FAIL]\tGSBASE changed to 0x%lx\n", base); 443 } 444 } 445 446 #define USER_REGS_OFFSET(r) offsetof(struct user_regs_struct, r) 447 448 static void test_ptrace_write_gs_read_base(void) 449 { 450 int status; 451 pid_t child = fork(); 452 453 if (child < 0) 454 err(1, "fork"); 455 456 if (child == 0) { 457 printf("[RUN]\tPTRACE_POKE GS, read GSBASE back\n"); 458 459 printf("[RUN]\tARCH_SET_GS to 1\n"); 460 if (syscall(SYS_arch_prctl, ARCH_SET_GS, 1) != 0) 461 err(1, "ARCH_SET_GS"); 462 463 if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) != 0) 464 err(1, "PTRACE_TRACEME"); 465 466 raise(SIGTRAP); 467 _exit(0); 468 } 469 470 wait(&status); 471 472 if (WSTOPSIG(status) == SIGTRAP) { 473 unsigned long base; 474 unsigned long gs_offset = USER_REGS_OFFSET(gs); 475 unsigned long base_offset = USER_REGS_OFFSET(gs_base); 476 477 /* Read the initial base. It should be 1. */ 478 base = ptrace(PTRACE_PEEKUSER, child, base_offset, NULL); 479 if (base == 1) { 480 printf("[OK]\tGSBASE started at 1\n"); 481 } else { 482 nerrs++; 483 printf("[FAIL]\tGSBASE started at 0x%lx\n", base); 484 } 485 486 printf("[RUN]\tSet GS = 0x7, read GSBASE\n"); 487 488 /* Poke an LDT selector into GS. */ 489 if (ptrace(PTRACE_POKEUSER, child, gs_offset, 0x7) != 0) 490 err(1, "PTRACE_POKEUSER"); 491 492 /* And read the base. */ 493 base = ptrace(PTRACE_PEEKUSER, child, base_offset, NULL); 494 495 if (base == 0 || base == 1) { 496 printf("[OK]\tGSBASE reads as 0x%lx with invalid GS\n", base); 497 } else { 498 nerrs++; 499 printf("[FAIL]\tGSBASE=0x%lx (should be 0 or 1)\n", base); 500 } 501 } 502 503 ptrace(PTRACE_CONT, child, NULL, NULL); 504 505 wait(&status); 506 if (!WIFEXITED(status)) 507 printf("[WARN]\tChild didn't exit cleanly.\n"); 508 } 509 510 static void test_ptrace_write_gsbase(void) 511 { 512 int status; 513 pid_t child = fork(); 514 515 if (child < 0) 516 err(1, "fork"); 517 518 if (child == 0) { 519 printf("[RUN]\tPTRACE_POKE(), write GSBASE from ptracer\n"); 520 521 *shared_scratch = load_gs(); 522 523 if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) != 0) 524 err(1, "PTRACE_TRACEME"); 525 526 raise(SIGTRAP); 527 _exit(0); 528 } 529 530 wait(&status); 531 532 if (WSTOPSIG(status) == SIGTRAP) { 533 unsigned long gs, base; 534 unsigned long gs_offset = USER_REGS_OFFSET(gs); 535 unsigned long base_offset = USER_REGS_OFFSET(gs_base); 536 537 gs = ptrace(PTRACE_PEEKUSER, child, gs_offset, NULL); 538 539 if (gs != *shared_scratch) { 540 nerrs++; 541 printf("[FAIL]\tGS is not prepared with nonzero\n"); 542 goto END; 543 } 544 545 if (ptrace(PTRACE_POKEUSER, child, base_offset, 0xFF) != 0) 546 err(1, "PTRACE_POKEUSER"); 547 548 gs = ptrace(PTRACE_PEEKUSER, child, gs_offset, NULL); 549 base = ptrace(PTRACE_PEEKUSER, child, base_offset, NULL); 550 551 /* 552 * In a non-FSGSBASE system, the nonzero selector will load 553 * GSBASE (again). But what is tested here is whether the 554 * selector value is changed or not by the GSBASE write in 555 * a ptracer. 556 */ 557 if (gs != *shared_scratch) { 558 nerrs++; 559 printf("[FAIL]\tGS changed to %lx\n", gs); 560 561 /* 562 * On older kernels, poking a nonzero value into the 563 * base would zero the selector. On newer kernels, 564 * this behavior has changed -- poking the base 565 * changes only the base and, if FSGSBASE is not 566 * available, this may have no effect once the tracee 567 * is resumed. 568 */ 569 if (gs == 0) 570 printf("\tNote: this is expected behavior on older kernels.\n"); 571 } else if (have_fsgsbase && (base != 0xFF)) { 572 nerrs++; 573 printf("[FAIL]\tGSBASE changed to %lx\n", base); 574 } else { 575 printf("[OK]\tGS remained 0x%hx", *shared_scratch); 576 if (have_fsgsbase) 577 printf(" and GSBASE changed to 0xFF"); 578 printf("\n"); 579 } 580 } 581 582 END: 583 ptrace(PTRACE_CONT, child, NULL, NULL); 584 wait(&status); 585 if (!WIFEXITED(status)) 586 printf("[WARN]\tChild didn't exit cleanly.\n"); 587 } 588 589 int main() 590 { 591 pthread_t thread; 592 593 shared_scratch = mmap(NULL, 4096, PROT_READ | PROT_WRITE, 594 MAP_ANONYMOUS | MAP_SHARED, -1, 0); 595 596 /* Do these tests before we have an LDT. */ 597 test_ptrace_write_gs_read_base(); 598 599 /* Probe FSGSBASE */ 600 sethandler(SIGILL, sigill, 0); 601 if (sigsetjmp(jmpbuf, 1) == 0) { 602 rdfsbase(); 603 have_fsgsbase = true; 604 printf("\tFSGSBASE instructions are enabled\n"); 605 } else { 606 printf("\tFSGSBASE instructions are disabled\n"); 607 } 608 clearhandler(SIGILL); 609 610 sethandler(SIGSEGV, sigsegv, 0); 611 612 check_gs_value(0); 613 check_gs_value(1); 614 check_gs_value(0x200000000); 615 check_gs_value(0); 616 check_gs_value(0x200000000); 617 check_gs_value(1); 618 619 for (int sched = 0; sched < 2; sched++) { 620 mov_0_gs(0, !!sched); 621 mov_0_gs(1, !!sched); 622 mov_0_gs(0x200000000, !!sched); 623 } 624 625 /* Set up for multithreading. */ 626 627 cpu_set_t cpuset; 628 CPU_ZERO(&cpuset); 629 CPU_SET(0, &cpuset); 630 if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) 631 err(1, "sched_setaffinity to CPU 0"); /* should never fail */ 632 633 if (pthread_create(&thread, 0, threadproc, 0) != 0) 634 err(1, "pthread_create"); 635 636 static unsigned long bases_with_hard_zero[] = { 637 0, HARD_ZERO, 1, 0x200000000, 638 }; 639 640 for (int local = 0; local < 4; local++) { 641 for (int remote = 0; remote < 4; remote++) { 642 for (unsigned short s = 0; s < 5; s++) { 643 unsigned short sel = s; 644 if (s == 4) 645 asm ("mov %%ss, %0" : "=rm" (sel)); 646 set_gs_and_switch_to( 647 bases_with_hard_zero[local], 648 sel, 649 bases_with_hard_zero[remote]); 650 } 651 } 652 } 653 654 test_unexpected_base(); 655 656 if (have_fsgsbase) { 657 unsigned short ss; 658 659 asm volatile ("mov %%ss, %0" : "=rm" (ss)); 660 661 test_wrbase(0, 0); 662 test_wrbase(0, 1); 663 test_wrbase(0, 0x200000000); 664 test_wrbase(0, 0xffffffffffffffff); 665 test_wrbase(ss, 0); 666 test_wrbase(ss, 1); 667 test_wrbase(ss, 0x200000000); 668 test_wrbase(ss, 0xffffffffffffffff); 669 } 670 671 ftx = 3; /* Kill the thread. */ 672 syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0); 673 674 if (pthread_join(thread, NULL) != 0) 675 err(1, "pthread_join"); 676 677 test_ptrace_write_gsbase(); 678 679 return nerrs == 0 ? 0 : 1; 680 } 681