1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst) 4 * 5 * There are examples in here of: 6 * * how to set protection keys on memory 7 * * how to set/clear bits in pkey registers (the rights register) 8 * * how to handle SEGV_PKUERR signals and extract pkey-relevant 9 * information from the siginfo 10 * 11 * Things to add: 12 * make sure KSM and KSM COW breaking works 13 * prefault pages in at malloc, or not 14 * protect MPX bounds tables with protection keys? 15 * make sure VMA splitting/merging is working correctly 16 * OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys 17 * look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel 18 * do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks 19 * 20 * Compile like this: 21 * gcc -mxsave -o protection_keys -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm 22 * gcc -mxsave -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm 23 */ 24 #define _GNU_SOURCE 25 #define __SANE_USERSPACE_TYPES__ 26 #include <errno.h> 27 #include <linux/elf.h> 28 #include <linux/futex.h> 29 #include <time.h> 30 #include <sys/time.h> 31 #include <sys/syscall.h> 32 #include <string.h> 33 #include <stdio.h> 34 #include <stdint.h> 35 #include <stdbool.h> 36 #include <signal.h> 37 #include <assert.h> 38 #include <stdlib.h> 39 #include <ucontext.h> 40 #include <sys/mman.h> 41 #include <sys/types.h> 42 #include <sys/wait.h> 43 #include <sys/stat.h> 44 #include <fcntl.h> 45 #include <unistd.h> 46 #include <sys/ptrace.h> 47 #include <setjmp.h> 48 49 #include "pkey-helpers.h" 50 51 int iteration_nr = 1; 52 int test_nr; 53 54 u64 shadow_pkey_reg; 55 int dprint_in_signal; 56 char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; 57 char buf[256]; 58 59 void cat_into_file(char *str, char *file) 60 { 61 int fd = open(file, O_RDWR); 62 int ret; 63 64 
dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file); 65 /* 66 * these need to be raw because they are called under 67 * pkey_assert() 68 */ 69 if (fd < 0) { 70 fprintf(stderr, "error opening '%s'\n", str); 71 perror("error: "); 72 exit(__LINE__); 73 } 74 75 ret = write(fd, str, strlen(str)); 76 if (ret != strlen(str)) { 77 perror("write to file failed"); 78 fprintf(stderr, "filename: '%s' str: '%s'\n", file, str); 79 exit(__LINE__); 80 } 81 close(fd); 82 } 83 84 #if CONTROL_TRACING > 0 85 static int warned_tracing; 86 int tracing_root_ok(void) 87 { 88 if (geteuid() != 0) { 89 if (!warned_tracing) 90 fprintf(stderr, "WARNING: not run as root, " 91 "can not do tracing control\n"); 92 warned_tracing = 1; 93 return 0; 94 } 95 return 1; 96 } 97 #endif 98 99 void tracing_on(void) 100 { 101 #if CONTROL_TRACING > 0 102 #define TRACEDIR "/sys/kernel/tracing" 103 char pidstr[32]; 104 105 if (!tracing_root_ok()) 106 return; 107 108 sprintf(pidstr, "%d", getpid()); 109 cat_into_file("0", TRACEDIR "/tracing_on"); 110 cat_into_file("\n", TRACEDIR "/trace"); 111 if (1) { 112 cat_into_file("function_graph", TRACEDIR "/current_tracer"); 113 cat_into_file("1", TRACEDIR "/options/funcgraph-proc"); 114 } else { 115 cat_into_file("nop", TRACEDIR "/current_tracer"); 116 } 117 cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid"); 118 cat_into_file("1", TRACEDIR "/tracing_on"); 119 dprintf1("enabled tracing\n"); 120 #endif 121 } 122 123 void tracing_off(void) 124 { 125 #if CONTROL_TRACING > 0 126 if (!tracing_root_ok()) 127 return; 128 cat_into_file("0", "/sys/kernel/tracing/tracing_on"); 129 #endif 130 } 131 132 void abort_hooks(void) 133 { 134 fprintf(stderr, "running %s()...\n", __func__); 135 tracing_off(); 136 #ifdef SLEEP_ON_ABORT 137 sleep(SLEEP_ON_ABORT); 138 #endif 139 } 140 141 /* 142 * This attempts to have roughly a page of instructions followed by a few 143 * instructions that do a write, and another page of instructions. 
That 144 * way, we are pretty sure that the write is in the second page of 145 * instructions and has at least a page of padding behind it. 146 * 147 * *That* lets us be sure to madvise() away the write instruction, which 148 * will then fault, which makes sure that the fault code handles 149 * execute-only memory properly. 150 */ 151 #ifdef __powerpc64__ 152 /* This way, both 4K and 64K alignment are maintained */ 153 __attribute__((__aligned__(65536))) 154 #else 155 __attribute__((__aligned__(PAGE_SIZE))) 156 #endif 157 void lots_o_noops_around_write(int *write_to_me) 158 { 159 dprintf3("running %s()\n", __func__); 160 __page_o_noops(); 161 /* Assume this happens in the second page of instructions: */ 162 *write_to_me = __LINE__; 163 /* pad out by another page: */ 164 __page_o_noops(); 165 dprintf3("%s() done\n", __func__); 166 } 167 168 void dump_mem(void *dumpme, int len_bytes) 169 { 170 char *c = (void *)dumpme; 171 int i; 172 173 for (i = 0; i < len_bytes; i += sizeof(u64)) { 174 u64 *ptr = (u64 *)(c + i); 175 dprintf1("dump[%03d][@%p]: %016llx\n", i, ptr, *ptr); 176 } 177 } 178 179 static u32 hw_pkey_get(int pkey, unsigned long flags) 180 { 181 u64 pkey_reg = __read_pkey_reg(); 182 183 dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", 184 __func__, pkey, flags, 0, 0); 185 dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg); 186 187 return (u32) get_pkey_bits(pkey_reg, pkey); 188 } 189 190 static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) 191 { 192 u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); 193 u64 old_pkey_reg = __read_pkey_reg(); 194 u64 new_pkey_reg; 195 196 /* make sure that 'rights' only contains the bits we expect: */ 197 assert(!(rights & ~mask)); 198 199 /* modify bits accordingly in old pkey_reg and assign it */ 200 new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights); 201 202 __write_pkey_reg(new_pkey_reg); 203 204 dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x" 205 " pkey_reg now: %016llx 
old_pkey_reg: %016llx\n", 206 __func__, pkey, rights, flags, 0, __read_pkey_reg(), 207 old_pkey_reg); 208 return 0; 209 } 210 211 void pkey_disable_set(int pkey, int flags) 212 { 213 unsigned long syscall_flags = 0; 214 int ret; 215 int pkey_rights; 216 u64 orig_pkey_reg = read_pkey_reg(); 217 218 dprintf1("START->%s(%d, 0x%x)\n", __func__, 219 pkey, flags); 220 pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); 221 222 pkey_rights = hw_pkey_get(pkey, syscall_flags); 223 224 dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, 225 pkey, pkey, pkey_rights); 226 227 pkey_assert(pkey_rights >= 0); 228 229 pkey_rights |= flags; 230 231 ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); 232 assert(!ret); 233 /* pkey_reg and flags have the same format */ 234 shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); 235 dprintf1("%s(%d) shadow: 0x%016llx\n", 236 __func__, pkey, shadow_pkey_reg); 237 238 pkey_assert(ret >= 0); 239 240 pkey_rights = hw_pkey_get(pkey, syscall_flags); 241 dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, 242 pkey, pkey, pkey_rights); 243 244 dprintf1("%s(%d) pkey_reg: 0x%016llx\n", 245 __func__, pkey, read_pkey_reg()); 246 if (flags) 247 pkey_assert(read_pkey_reg() >= orig_pkey_reg); 248 dprintf1("END<---%s(%d, 0x%x)\n", __func__, 249 pkey, flags); 250 } 251 252 void pkey_disable_clear(int pkey, int flags) 253 { 254 unsigned long syscall_flags = 0; 255 int ret; 256 int pkey_rights = hw_pkey_get(pkey, syscall_flags); 257 u64 orig_pkey_reg = read_pkey_reg(); 258 259 pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); 260 261 dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, 262 pkey, pkey, pkey_rights); 263 pkey_assert(pkey_rights >= 0); 264 265 pkey_rights &= ~flags; 266 267 ret = hw_pkey_set(pkey, pkey_rights, 0); 268 shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); 269 pkey_assert(ret >= 0); 270 271 pkey_rights = hw_pkey_get(pkey, syscall_flags); 272 dprintf1("%s(%d) 
hw_pkey_get(%d): %x\n", __func__, 273 pkey, pkey, pkey_rights); 274 275 dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, 276 pkey, read_pkey_reg()); 277 if (flags) 278 assert(read_pkey_reg() <= orig_pkey_reg); 279 } 280 281 void pkey_write_allow(int pkey) 282 { 283 pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); 284 } 285 void pkey_write_deny(int pkey) 286 { 287 pkey_disable_set(pkey, PKEY_DISABLE_WRITE); 288 } 289 void pkey_access_allow(int pkey) 290 { 291 pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); 292 } 293 void pkey_access_deny(int pkey) 294 { 295 pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); 296 } 297 298 static char *si_code_str(int si_code) 299 { 300 if (si_code == SEGV_MAPERR) 301 return "SEGV_MAPERR"; 302 if (si_code == SEGV_ACCERR) 303 return "SEGV_ACCERR"; 304 if (si_code == SEGV_BNDERR) 305 return "SEGV_BNDERR"; 306 if (si_code == SEGV_PKUERR) 307 return "SEGV_PKUERR"; 308 return "UNKNOWN"; 309 } 310 311 int pkey_faults; 312 int last_si_pkey = -1; 313 void signal_handler(int signum, siginfo_t *si, void *vucontext) 314 { 315 ucontext_t *uctxt = vucontext; 316 int trapno; 317 unsigned long ip; 318 char *fpregs; 319 #if defined(__i386__) || defined(__x86_64__) /* arch */ 320 u32 *pkey_reg_ptr; 321 int pkey_reg_offset; 322 #endif /* arch */ 323 u64 siginfo_pkey; 324 u32 *si_pkey_ptr; 325 326 dprint_in_signal = 1; 327 dprintf1(">>>>===============SIGSEGV============================\n"); 328 dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", 329 __func__, __LINE__, 330 __read_pkey_reg(), shadow_pkey_reg); 331 332 trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; 333 ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; 334 fpregs = (char *) uctxt->uc_mcontext.fpregs; 335 336 dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n", 337 __func__, trapno, ip, si_code_str(si->si_code), 338 si->si_code); 339 340 #if defined(__i386__) || defined(__x86_64__) /* arch */ 341 #ifdef __i386__ 342 /* 343 * 32-bit has some extra padding so that userspace can tell 
whether 344 * the XSTATE header is present in addition to the "legacy" FPU 345 * state. We just assume that it is here. 346 */ 347 fpregs += 0x70; 348 #endif /* i386 */ 349 pkey_reg_offset = pkey_reg_xstate_offset(); 350 pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]); 351 352 /* 353 * If we got a PKEY fault, we *HAVE* to have at least one bit set in 354 * here. 355 */ 356 dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset()); 357 if (DEBUG_LEVEL > 4) 358 dump_mem(pkey_reg_ptr - 128, 256); 359 pkey_assert(*pkey_reg_ptr); 360 #endif /* arch */ 361 362 dprintf1("siginfo: %p\n", si); 363 dprintf1(" fpregs: %p\n", fpregs); 364 365 if ((si->si_code == SEGV_MAPERR) || 366 (si->si_code == SEGV_ACCERR) || 367 (si->si_code == SEGV_BNDERR)) { 368 printf("non-PK si_code, exiting...\n"); 369 exit(4); 370 } 371 372 si_pkey_ptr = siginfo_get_pkey_ptr(si); 373 dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr); 374 dump_mem((u8 *)si_pkey_ptr - 8, 24); 375 siginfo_pkey = *si_pkey_ptr; 376 pkey_assert(siginfo_pkey < NR_PKEYS); 377 last_si_pkey = siginfo_pkey; 378 379 /* 380 * need __read_pkey_reg() version so we do not do shadow_pkey_reg 381 * checking 382 */ 383 dprintf1("signal pkey_reg from pkey_reg: %016llx\n", 384 __read_pkey_reg()); 385 dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey); 386 #if defined(__i386__) || defined(__x86_64__) /* arch */ 387 dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr); 388 *(u64 *)pkey_reg_ptr = 0x00000000; 389 dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n"); 390 #elif defined(__powerpc64__) /* arch */ 391 /* restore access and let the faulting instruction continue */ 392 pkey_access_allow(siginfo_pkey); 393 #endif /* arch */ 394 pkey_faults++; 395 dprintf1("<<<<==================================================\n"); 396 dprint_in_signal = 0; 397 } 398 399 int wait_all_children(void) 400 { 401 int status; 402 return waitpid(-1, &status, 0); 403 } 404 405 void sig_chld(int x) 406 { 407 
dprint_in_signal = 1; 408 dprintf2("[%d] SIGCHLD: %d\n", getpid(), x); 409 dprint_in_signal = 0; 410 } 411 412 void setup_sigsegv_handler(void) 413 { 414 int r, rs; 415 struct sigaction newact; 416 struct sigaction oldact; 417 418 /* #PF is mapped to sigsegv */ 419 int signum = SIGSEGV; 420 421 newact.sa_handler = 0; 422 newact.sa_sigaction = signal_handler; 423 424 /*sigset_t - signals to block while in the handler */ 425 /* get the old signal mask. */ 426 rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask); 427 pkey_assert(rs == 0); 428 429 /* call sa_sigaction, not sa_handler*/ 430 newact.sa_flags = SA_SIGINFO; 431 432 newact.sa_restorer = 0; /* void(*)(), obsolete */ 433 r = sigaction(signum, &newact, &oldact); 434 r = sigaction(SIGALRM, &newact, &oldact); 435 pkey_assert(r == 0); 436 } 437 438 void setup_handlers(void) 439 { 440 signal(SIGCHLD, &sig_chld); 441 setup_sigsegv_handler(); 442 } 443 444 pid_t fork_lazy_child(void) 445 { 446 pid_t forkret; 447 448 forkret = fork(); 449 pkey_assert(forkret >= 0); 450 dprintf3("[%d] fork() ret: %d\n", getpid(), forkret); 451 452 if (!forkret) { 453 /* in the child */ 454 while (1) { 455 dprintf1("child sleeping...\n"); 456 sleep(30); 457 } 458 } 459 return forkret; 460 } 461 462 int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, 463 unsigned long pkey) 464 { 465 int sret; 466 467 dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__, 468 ptr, size, orig_prot, pkey); 469 470 errno = 0; 471 sret = syscall(__NR_pkey_mprotect, ptr, size, orig_prot, pkey); 472 if (errno) { 473 dprintf2("SYS_mprotect_key sret: %d\n", sret); 474 dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot); 475 dprintf2("SYS_mprotect_key failed, errno: %d\n", errno); 476 if (DEBUG_LEVEL >= 2) 477 perror("SYS_mprotect_pkey"); 478 } 479 return sret; 480 } 481 482 int sys_pkey_alloc(unsigned long flags, unsigned long init_val) 483 { 484 int ret = syscall(SYS_pkey_alloc, flags, init_val); 485 dprintf1("%s(flags=%lx, init_val=%lx) 
syscall ret: %d errno: %d\n", 486 __func__, flags, init_val, ret, errno); 487 return ret; 488 } 489 490 int alloc_pkey(void) 491 { 492 int ret; 493 unsigned long init_val = 0x0; 494 495 dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", 496 __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg); 497 ret = sys_pkey_alloc(0, init_val); 498 /* 499 * pkey_alloc() sets PKEY register, so we need to reflect it in 500 * shadow_pkey_reg: 501 */ 502 dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" 503 " shadow: 0x%016llx\n", 504 __func__, __LINE__, ret, __read_pkey_reg(), 505 shadow_pkey_reg); 506 if (ret > 0) { 507 /* clear both the bits: */ 508 shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, 509 ~PKEY_MASK); 510 dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" 511 " shadow: 0x%016llx\n", 512 __func__, 513 __LINE__, ret, __read_pkey_reg(), 514 shadow_pkey_reg); 515 /* 516 * move the new state in from init_val 517 * (remember, we cheated and init_val == pkey_reg format) 518 */ 519 shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, 520 init_val); 521 } 522 dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" 523 " shadow: 0x%016llx\n", 524 __func__, __LINE__, ret, __read_pkey_reg(), 525 shadow_pkey_reg); 526 dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno); 527 /* for shadow checking: */ 528 read_pkey_reg(); 529 dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" 530 " shadow: 0x%016llx\n", 531 __func__, __LINE__, ret, __read_pkey_reg(), 532 shadow_pkey_reg); 533 return ret; 534 } 535 536 int sys_pkey_free(unsigned long pkey) 537 { 538 int ret = syscall(SYS_pkey_free, pkey); 539 dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret); 540 return ret; 541 } 542 543 /* 544 * I had a bug where pkey bits could be set by mprotect() but 545 * not cleared. This ensures we get lots of random bit sets 546 * and clears on the vma and pte pkey bits. 
547 */ 548 int alloc_random_pkey(void) 549 { 550 int max_nr_pkey_allocs; 551 int ret; 552 int i; 553 int alloced_pkeys[NR_PKEYS]; 554 int nr_alloced = 0; 555 int random_index; 556 memset(alloced_pkeys, 0, sizeof(alloced_pkeys)); 557 558 /* allocate every possible key and make a note of which ones we got */ 559 max_nr_pkey_allocs = NR_PKEYS; 560 for (i = 0; i < max_nr_pkey_allocs; i++) { 561 int new_pkey = alloc_pkey(); 562 if (new_pkey < 0) 563 break; 564 alloced_pkeys[nr_alloced++] = new_pkey; 565 } 566 567 pkey_assert(nr_alloced > 0); 568 /* select a random one out of the allocated ones */ 569 random_index = rand() % nr_alloced; 570 ret = alloced_pkeys[random_index]; 571 /* now zero it out so we don't free it next */ 572 alloced_pkeys[random_index] = 0; 573 574 /* go through the allocated ones that we did not want and free them */ 575 for (i = 0; i < nr_alloced; i++) { 576 int free_ret; 577 if (!alloced_pkeys[i]) 578 continue; 579 free_ret = sys_pkey_free(alloced_pkeys[i]); 580 pkey_assert(!free_ret); 581 } 582 dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" 583 " shadow: 0x%016llx\n", __func__, 584 __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); 585 return ret; 586 } 587 588 int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, 589 unsigned long pkey) 590 { 591 int nr_iterations = random() % 100; 592 int ret; 593 594 while (0) { 595 int rpkey = alloc_random_pkey(); 596 ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); 597 dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", 598 ptr, size, orig_prot, pkey, ret); 599 if (nr_iterations-- < 0) 600 break; 601 602 dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" 603 " shadow: 0x%016llx\n", 604 __func__, __LINE__, ret, __read_pkey_reg(), 605 shadow_pkey_reg); 606 sys_pkey_free(rpkey); 607 dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" 608 " shadow: 0x%016llx\n", 609 __func__, __LINE__, ret, __read_pkey_reg(), 610 shadow_pkey_reg); 611 } 612 pkey_assert(pkey < NR_PKEYS); 613 614 
ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); 615 dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", 616 ptr, size, orig_prot, pkey, ret); 617 pkey_assert(!ret); 618 dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" 619 " shadow: 0x%016llx\n", __func__, 620 __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); 621 return ret; 622 } 623 624 struct pkey_malloc_record { 625 void *ptr; 626 long size; 627 int prot; 628 }; 629 struct pkey_malloc_record *pkey_malloc_records; 630 struct pkey_malloc_record *pkey_last_malloc_record; 631 long nr_pkey_malloc_records; 632 void record_pkey_malloc(void *ptr, long size, int prot) 633 { 634 long i; 635 struct pkey_malloc_record *rec = NULL; 636 637 for (i = 0; i < nr_pkey_malloc_records; i++) { 638 rec = &pkey_malloc_records[i]; 639 /* find a free record */ 640 if (rec) 641 break; 642 } 643 if (!rec) { 644 /* every record is full */ 645 size_t old_nr_records = nr_pkey_malloc_records; 646 size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1); 647 size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record); 648 dprintf2("new_nr_records: %zd\n", new_nr_records); 649 dprintf2("new_size: %zd\n", new_size); 650 pkey_malloc_records = realloc(pkey_malloc_records, new_size); 651 pkey_assert(pkey_malloc_records != NULL); 652 rec = &pkey_malloc_records[nr_pkey_malloc_records]; 653 /* 654 * realloc() does not initialize memory, so zero it from 655 * the first new record all the way to the end. 
656 */ 657 for (i = 0; i < new_nr_records - old_nr_records; i++) 658 memset(rec + i, 0, sizeof(*rec)); 659 } 660 dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n", 661 (int)(rec - pkey_malloc_records), rec, ptr, size); 662 rec->ptr = ptr; 663 rec->size = size; 664 rec->prot = prot; 665 pkey_last_malloc_record = rec; 666 nr_pkey_malloc_records++; 667 } 668 669 void free_pkey_malloc(void *ptr) 670 { 671 long i; 672 int ret; 673 dprintf3("%s(%p)\n", __func__, ptr); 674 for (i = 0; i < nr_pkey_malloc_records; i++) { 675 struct pkey_malloc_record *rec = &pkey_malloc_records[i]; 676 dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n", 677 ptr, i, rec, rec->ptr, rec->size); 678 if ((ptr < rec->ptr) || 679 (ptr >= rec->ptr + rec->size)) 680 continue; 681 682 dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n", 683 ptr, i, rec, rec->ptr, rec->size); 684 nr_pkey_malloc_records--; 685 ret = munmap(rec->ptr, rec->size); 686 dprintf3("munmap ret: %d\n", ret); 687 pkey_assert(!ret); 688 dprintf3("clearing rec->ptr, rec: %p\n", rec); 689 rec->ptr = NULL; 690 dprintf3("done clearing rec->ptr, rec: %p\n", rec); 691 return; 692 } 693 pkey_assert(false); 694 } 695 696 697 void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) 698 { 699 void *ptr; 700 int ret; 701 702 read_pkey_reg(); 703 dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, 704 size, prot, pkey); 705 pkey_assert(pkey < NR_PKEYS); 706 ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); 707 pkey_assert(ptr != (void *)-1); 708 ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); 709 pkey_assert(!ret); 710 record_pkey_malloc(ptr, size, prot); 711 read_pkey_reg(); 712 713 dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); 714 return ptr; 715 } 716 717 void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) 718 { 719 int ret; 720 void *ptr; 721 722 dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, 723 size, prot, pkey); 724 /* 725 * Guarantee we can fit 
at least one huge page in the resulting 726 * allocation by allocating space for 2: 727 */ 728 size = ALIGN_UP(size, HPAGE_SIZE * 2); 729 ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); 730 pkey_assert(ptr != (void *)-1); 731 record_pkey_malloc(ptr, size, prot); 732 mprotect_pkey(ptr, size, prot, pkey); 733 734 dprintf1("unaligned ptr: %p\n", ptr); 735 ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE); 736 dprintf1(" aligned ptr: %p\n", ptr); 737 ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE); 738 dprintf1("MADV_HUGEPAGE ret: %d\n", ret); 739 ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED); 740 dprintf1("MADV_WILLNEED ret: %d\n", ret); 741 memset(ptr, 0, HPAGE_SIZE); 742 743 dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr); 744 return ptr; 745 } 746 747 int hugetlb_setup_ok; 748 #define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages" 749 #define GET_NR_HUGE_PAGES 10 750 void setup_hugetlbfs(void) 751 { 752 int err; 753 int fd; 754 char buf[256]; 755 long hpagesz_kb; 756 long hpagesz_mb; 757 758 if (geteuid() != 0) { 759 fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n"); 760 return; 761 } 762 763 cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages"); 764 765 /* 766 * Now go make sure that we got the pages and that they 767 * are PMD-level pages. Someone might have made PUD-level 768 * pages the default. 
769 */ 770 hpagesz_kb = HPAGE_SIZE / 1024; 771 hpagesz_mb = hpagesz_kb / 1024; 772 sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb); 773 fd = open(buf, O_RDONLY); 774 if (fd < 0) { 775 fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n", 776 hpagesz_mb, strerror(errno)); 777 return; 778 } 779 780 /* -1 to guarantee leaving the trailing \0 */ 781 err = read(fd, buf, sizeof(buf)-1); 782 close(fd); 783 if (err <= 0) { 784 fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n", 785 hpagesz_mb, strerror(errno)); 786 return; 787 } 788 789 if (atoi(buf) != GET_NR_HUGE_PAGES) { 790 fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n", 791 hpagesz_mb, buf, GET_NR_HUGE_PAGES); 792 return; 793 } 794 795 hugetlb_setup_ok = 1; 796 } 797 798 void *malloc_pkey_hugetlb(long size, int prot, u16 pkey) 799 { 800 void *ptr; 801 int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB; 802 803 if (!hugetlb_setup_ok) 804 return PTR_ERR_ENOTSUP; 805 806 dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey); 807 size = ALIGN_UP(size, HPAGE_SIZE * 2); 808 pkey_assert(pkey < NR_PKEYS); 809 ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0); 810 pkey_assert(ptr != (void *)-1); 811 mprotect_pkey(ptr, size, prot, pkey); 812 813 record_pkey_malloc(ptr, size, prot); 814 815 dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr); 816 return ptr; 817 } 818 819 void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey) 820 { 821 void *ptr; 822 int fd; 823 824 dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, 825 size, prot, pkey); 826 pkey_assert(pkey < NR_PKEYS); 827 fd = open("/dax/foo", O_RDWR); 828 pkey_assert(fd >= 0); 829 830 ptr = mmap(0, size, prot, MAP_SHARED, fd, 0); 831 pkey_assert(ptr != (void *)-1); 832 833 mprotect_pkey(ptr, size, prot, pkey); 834 835 record_pkey_malloc(ptr, size, prot); 836 837 dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr); 838 close(fd); 839 return ptr; 840 } 841 842 void *(*pkey_malloc[])(long size, int prot, u16 
pkey) = { 843 844 malloc_pkey_with_mprotect, 845 malloc_pkey_with_mprotect_subpage, 846 malloc_pkey_anon_huge, 847 malloc_pkey_hugetlb 848 /* can not do direct with the pkey_mprotect() API: 849 malloc_pkey_mmap_direct, 850 malloc_pkey_mmap_dax, 851 */ 852 }; 853 854 void *malloc_pkey(long size, int prot, u16 pkey) 855 { 856 void *ret; 857 static int malloc_type; 858 int nr_malloc_types = ARRAY_SIZE(pkey_malloc); 859 860 pkey_assert(pkey < NR_PKEYS); 861 862 while (1) { 863 pkey_assert(malloc_type < nr_malloc_types); 864 865 ret = pkey_malloc[malloc_type](size, prot, pkey); 866 pkey_assert(ret != (void *)-1); 867 868 malloc_type++; 869 if (malloc_type >= nr_malloc_types) 870 malloc_type = (random()%nr_malloc_types); 871 872 /* try again if the malloc_type we tried is unsupported */ 873 if (ret == PTR_ERR_ENOTSUP) 874 continue; 875 876 break; 877 } 878 879 dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__, 880 size, prot, pkey, ret); 881 return ret; 882 } 883 884 int last_pkey_faults; 885 #define UNKNOWN_PKEY -2 886 void expected_pkey_fault(int pkey) 887 { 888 dprintf2("%s(): last_pkey_faults: %d pkey_faults: %d\n", 889 __func__, last_pkey_faults, pkey_faults); 890 dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey); 891 pkey_assert(last_pkey_faults + 1 == pkey_faults); 892 893 /* 894 * For exec-only memory, we do not know the pkey in 895 * advance, so skip this check. 896 */ 897 if (pkey != UNKNOWN_PKEY) 898 pkey_assert(last_si_pkey == pkey); 899 900 #if defined(__i386__) || defined(__x86_64__) /* arch */ 901 /* 902 * The signal handler shold have cleared out PKEY register to let the 903 * test program continue. We now have to restore it. 
904 */ 905 if (__read_pkey_reg() != 0) 906 #else /* arch */ 907 if (__read_pkey_reg() != shadow_pkey_reg) 908 #endif /* arch */ 909 pkey_assert(0); 910 911 __write_pkey_reg(shadow_pkey_reg); 912 dprintf1("%s() set pkey_reg=%016llx to restore state after signal " 913 "nuked it\n", __func__, shadow_pkey_reg); 914 last_pkey_faults = pkey_faults; 915 last_si_pkey = -1; 916 } 917 918 #define do_not_expect_pkey_fault(msg) do { \ 919 if (last_pkey_faults != pkey_faults) \ 920 dprintf0("unexpected PKey fault: %s\n", msg); \ 921 pkey_assert(last_pkey_faults == pkey_faults); \ 922 } while (0) 923 924 int test_fds[10] = { -1 }; 925 int nr_test_fds; 926 void __save_test_fd(int fd) 927 { 928 pkey_assert(fd >= 0); 929 pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds)); 930 test_fds[nr_test_fds] = fd; 931 nr_test_fds++; 932 } 933 934 int get_test_read_fd(void) 935 { 936 int test_fd = open("/etc/passwd", O_RDONLY); 937 __save_test_fd(test_fd); 938 return test_fd; 939 } 940 941 void close_test_fds(void) 942 { 943 int i; 944 945 for (i = 0; i < nr_test_fds; i++) { 946 if (test_fds[i] < 0) 947 continue; 948 close(test_fds[i]); 949 test_fds[i] = -1; 950 } 951 nr_test_fds = 0; 952 } 953 954 #define barrier() __asm__ __volatile__("": : :"memory") 955 __attribute__((noinline)) int read_ptr(int *ptr) 956 { 957 /* 958 * Keep GCC from optimizing this away somehow 959 */ 960 barrier(); 961 return *ptr; 962 } 963 964 void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) 965 { 966 int i, err; 967 int max_nr_pkey_allocs; 968 int alloced_pkeys[NR_PKEYS]; 969 int nr_alloced = 0; 970 long size; 971 972 pkey_assert(pkey_last_malloc_record); 973 size = pkey_last_malloc_record->size; 974 /* 975 * This is a bit of a hack. But mprotect() requires 976 * huge-page-aligned sizes when operating on hugetlbfs. 977 * So, make sure that we use something that's a multiple 978 * of a huge page when we can. 
979 */ 980 if (size >= HPAGE_SIZE) 981 size = HPAGE_SIZE; 982 983 /* allocate every possible key and make sure key-0 never got allocated */ 984 max_nr_pkey_allocs = NR_PKEYS; 985 for (i = 0; i < max_nr_pkey_allocs; i++) { 986 int new_pkey = alloc_pkey(); 987 pkey_assert(new_pkey != 0); 988 989 if (new_pkey < 0) 990 break; 991 alloced_pkeys[nr_alloced++] = new_pkey; 992 } 993 /* free all the allocated keys */ 994 for (i = 0; i < nr_alloced; i++) { 995 int free_ret; 996 997 if (!alloced_pkeys[i]) 998 continue; 999 free_ret = sys_pkey_free(alloced_pkeys[i]); 1000 pkey_assert(!free_ret); 1001 } 1002 1003 /* attach key-0 in various modes */ 1004 err = sys_mprotect_pkey(ptr, size, PROT_READ, 0); 1005 pkey_assert(!err); 1006 err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0); 1007 pkey_assert(!err); 1008 err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0); 1009 pkey_assert(!err); 1010 err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0); 1011 pkey_assert(!err); 1012 err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0); 1013 pkey_assert(!err); 1014 } 1015 1016 void test_read_of_write_disabled_region(int *ptr, u16 pkey) 1017 { 1018 int ptr_contents; 1019 1020 dprintf1("disabling write access to PKEY[1], doing read\n"); 1021 pkey_write_deny(pkey); 1022 ptr_contents = read_ptr(ptr); 1023 dprintf1("*ptr: %d\n", ptr_contents); 1024 dprintf1("\n"); 1025 } 1026 void test_read_of_access_disabled_region(int *ptr, u16 pkey) 1027 { 1028 int ptr_contents; 1029 1030 dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr); 1031 read_pkey_reg(); 1032 pkey_access_deny(pkey); 1033 ptr_contents = read_ptr(ptr); 1034 dprintf1("*ptr: %d\n", ptr_contents); 1035 expected_pkey_fault(pkey); 1036 } 1037 1038 void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr, 1039 u16 pkey) 1040 { 1041 int ptr_contents; 1042 1043 dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", 1044 pkey, ptr); 1045 ptr_contents = read_ptr(ptr); 1046 
	dprintf1("reading ptr before disabling the read : %d\n",
			ptr_contents);
	read_pkey_reg();
	pkey_access_deny(pkey);
	ptr_contents = read_ptr(ptr);
	dprintf1("*ptr: %d\n", ptr_contents);
	expected_pkey_fault(pkey);
}

/*
 * Write to '*ptr' once while writes are still allowed, then deny writes
 * through 'pkey', write again and expect one pkey fault.
 */
void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr,
		u16 pkey)
{
	*ptr = __LINE__;
	dprintf1("disabling write access; after accessing the page, "
		"to PKEY[%02d], doing write\n", pkey);
	pkey_write_deny(pkey);
	*ptr = __LINE__;
	expected_pkey_fault(pkey);
}

/* Deny writes through 'pkey', write to '*ptr' and expect one pkey fault. */
void test_write_of_write_disabled_region(int *ptr, u16 pkey)
{
	dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey);
	pkey_write_deny(pkey);
	*ptr = __LINE__;
	expected_pkey_fault(pkey);
}
/* Deny all access through 'pkey', write to '*ptr' and expect one pkey fault. */
void test_write_of_access_disabled_region(int *ptr, u16 pkey)
{
	dprintf1("disabling access to PKEY[%02d], doing write\n", pkey);
	pkey_access_deny(pkey);
	*ptr = __LINE__;
	expected_pkey_fault(pkey);
}

/*
 * Same as test_write_of_access_disabled_region(), but '*ptr' is written
 * once before access is denied.
 */
void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr,
			u16 pkey)
{
	*ptr = __LINE__;
	dprintf1("disabling access; after accessing the page, "
		" to PKEY[%02d], doing write\n", pkey);
	pkey_access_deny(pkey);
	*ptr = __LINE__;
	expected_pkey_fault(pkey);
}

/*
 * Deny all access through 'pkey', then have the kernel fill the buffer
 * via read(2) from the test fd.  Only asserts that read() returned
 * non-zero.
 */
void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey)
{
	int ret;
	int test_fd = get_test_read_fd();

	dprintf1("disabling access to PKEY[%02d], "
		 "having kernel read() to buffer\n", pkey);
	pkey_access_deny(pkey);
	ret = read(test_fd, ptr, 1);
	dprintf1("read ret: %d\n", ret);
	pkey_assert(ret);
}
/*
 * Deny writes through 'pkey', then have the kernel fill the buffer via
 * read(2).  A negative result is tolerated; only a zero return fails
 * the assertion.
 */
void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey)
{
	int ret;
	int test_fd = get_test_read_fd();

	pkey_write_deny(pkey);
	ret = read(test_fd, ptr, 100);
	dprintf1("read ret: %d\n", ret);
	if (ret < 0 && (DEBUG_LEVEL > 0))
		perror("verbose read result (OK for this to be bad)");
	pkey_assert(ret);
}

/*
 * Deny all access through 'pkey' and assert that vmsplice() of one page
 * starting at 'ptr' into a pipe fails with -1.
 */
void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey)
{
	int pipe_ret, vmsplice_ret;
	struct iovec iov;
	int pipe_fds[2];

	pipe_ret = pipe(pipe_fds);

	pkey_assert(pipe_ret == 0);
	dprintf1("disabling access to PKEY[%02d], "
		 "having kernel vmsplice from buffer\n", pkey);
	pkey_access_deny(pkey);
	iov.iov_base = ptr;
	iov.iov_len = PAGE_SIZE;
	vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT);
	dprintf1("vmsplice() ret: %d\n", vmsplice_ret);
	pkey_assert(vmsplice_ret == -1);

	close(pipe_fds[0]);
	close(pipe_fds[1]);
}

/*
 * Store a value in '*ptr', deny writes through 'pkey' and issue a
 * FUTEX_WAIT on 'ptr' with a non-matching expected value.  The result
 * is only logged, never asserted.
 */
void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey)
{
	int ignored = 0xdada;
	int futex_ret;
	int some_int = __LINE__;

	dprintf1("disabling write to PKEY[%02d], "
		 "doing futex gunk in buffer\n", pkey);
	*ptr = some_int;
	pkey_write_deny(pkey);
	futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL,
			&ignored, ignored);
	if (DEBUG_LEVEL > 0)
		perror("futex");
	dprintf1("futex() ret: %d\n", futex_ret);
}

/* Assumes that all pkeys other than 'pkey' are unallocated */
void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey)
{
	int err;
	int i;

	/* Note: 0 is the default pkey, so don't mess with it */
	for (i = 1; i < NR_PKEYS; i++) {
		if (pkey == i)
			continue;

		dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i);
		/* NOTE(review): the free is issued twice in a row -- confirm intent */
		err = sys_pkey_free(i);
		pkey_assert(err);

		err = sys_pkey_free(i);
		pkey_assert(err);

		err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i);
		pkey_assert(err);
	}
}

/* Assumes that all pkeys other than 'pkey' are unallocated */
void test_pkey_syscalls_bad_args(int *ptr, u16 pkey)
{
	int err;
	int bad_pkey = NR_PKEYS+99;

	/* pass a known-invalid pkey in: */
	err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey);
	pkey_assert(err);
}

/*
 * fork(); the child returns and carries on with the test, the parent
 * exits with status 0.
 */
void become_child(void)
{
	pid_t forkret;

	forkret = fork();
	pkey_assert(forkret >= 0);
	dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);

	if (!forkret) {
		/* in the child */
		return;
	}
	exit(0);
}

/* Assumes that all pkeys other than 'pkey' are unallocated */
void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
{
	int err;
	int allocated_pkeys[NR_PKEYS] = {0};
	int nr_allocated_pkeys = 0;
	int i;

	for (i = 0; i < NR_PKEYS*3; i++) {
		int new_pkey;
		dprintf1("%s() alloc loop: %d\n", __func__, i);
		new_pkey = alloc_pkey();
		/*
		 * NOTE(review): 'err' has not been assigned anywhere in
		 * this loop, so it is printed uninitialized here.
		 */
		dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx"
				" shadow: 0x%016llx\n",
				__func__, __LINE__, err, __read_pkey_reg(),
				shadow_pkey_reg);
		read_pkey_reg(); /* for shadow checking */
		dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC);
		if ((new_pkey == -1) && (errno == ENOSPC)) {
			dprintf2("%s() failed to allocate pkey after %d tries\n",
				__func__, nr_allocated_pkeys);
		} else {
			/*
			 * Ensure the number of successes never
			 * exceeds the number of keys supported
			 * in the hardware.
			 */
			pkey_assert(nr_allocated_pkeys < NR_PKEYS);
			allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
		}

		/*
		 * Make sure that allocation state is properly
		 * preserved across fork().
		 */
		if (i == NR_PKEYS*2)
			become_child();
	}

	dprintf3("%s()::%d\n", __func__, __LINE__);

	/*
	 * On x86:
	 * There are 16 pkeys supported in hardware.  Three are
	 * allocated by the time we get here:
	 *   1. The default key (0)
	 *   2. One possibly consumed by an execute-only mapping.
	 *   3.
One allocated by the test code and passed in via 1253 * 'pkey' to this function. 1254 * Ensure that we can allocate at least another 13 (16-3). 1255 * 1256 * On powerpc: 1257 * There are either 5, 28, 29 or 32 pkeys supported in 1258 * hardware depending on the page size (4K or 64K) and 1259 * platform (powernv or powervm). Four are allocated by 1260 * the time we get here. These include pkey-0, pkey-1, 1261 * exec-only pkey and the one allocated by the test code. 1262 * Ensure that we can allocate the remaining. 1263 */ 1264 pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1)); 1265 1266 for (i = 0; i < nr_allocated_pkeys; i++) { 1267 err = sys_pkey_free(allocated_pkeys[i]); 1268 pkey_assert(!err); 1269 read_pkey_reg(); /* for shadow checking */ 1270 } 1271 } 1272 1273 void arch_force_pkey_reg_init(void) 1274 { 1275 #if defined(__i386__) || defined(__x86_64__) /* arch */ 1276 u64 *buf; 1277 1278 /* 1279 * All keys should be allocated and set to allow reads and 1280 * writes, so the register should be all 0. If not, just 1281 * skip the test. 1282 */ 1283 if (read_pkey_reg()) 1284 return; 1285 1286 /* 1287 * Just allocate an absurd about of memory rather than 1288 * doing the XSAVE size enumeration dance. 1289 */ 1290 buf = mmap(NULL, 1*MB, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); 1291 1292 /* These __builtins require compiling with -mxsave */ 1293 1294 /* XSAVE to build a valid buffer: */ 1295 __builtin_ia32_xsave(buf, XSTATE_PKEY); 1296 /* Clear XSTATE_BV[PKRU]: */ 1297 buf[XSTATE_BV_OFFSET/sizeof(u64)] &= ~XSTATE_PKEY; 1298 /* XRSTOR will likely get PKRU back to the init state: */ 1299 __builtin_ia32_xrstor(buf, XSTATE_PKEY); 1300 1301 munmap(buf, 1*MB); 1302 #endif 1303 } 1304 1305 1306 /* 1307 * This is mostly useless on ppc for now. But it will not 1308 * hurt anything and should give some better coverage as 1309 * a long-running test that continually checks the pkey 1310 * register. 
1311 */ 1312 void test_pkey_init_state(int *ptr, u16 pkey) 1313 { 1314 int err; 1315 int allocated_pkeys[NR_PKEYS] = {0}; 1316 int nr_allocated_pkeys = 0; 1317 int i; 1318 1319 for (i = 0; i < NR_PKEYS; i++) { 1320 int new_pkey = alloc_pkey(); 1321 1322 if (new_pkey < 0) 1323 continue; 1324 allocated_pkeys[nr_allocated_pkeys++] = new_pkey; 1325 } 1326 1327 dprintf3("%s()::%d\n", __func__, __LINE__); 1328 1329 arch_force_pkey_reg_init(); 1330 1331 /* 1332 * Loop for a bit, hoping to get exercise the kernel 1333 * context switch code. 1334 */ 1335 for (i = 0; i < 1000000; i++) 1336 read_pkey_reg(); 1337 1338 for (i = 0; i < nr_allocated_pkeys; i++) { 1339 err = sys_pkey_free(allocated_pkeys[i]); 1340 pkey_assert(!err); 1341 read_pkey_reg(); /* for shadow checking */ 1342 } 1343 } 1344 1345 /* 1346 * pkey 0 is special. It is allocated by default, so you do not 1347 * have to call pkey_alloc() to use it first. Make sure that it 1348 * is usable. 1349 */ 1350 void test_mprotect_with_pkey_0(int *ptr, u16 pkey) 1351 { 1352 long size; 1353 int prot; 1354 1355 assert(pkey_last_malloc_record); 1356 size = pkey_last_malloc_record->size; 1357 /* 1358 * This is a bit of a hack. But mprotect() requires 1359 * huge-page-aligned sizes when operating on hugetlbfs. 1360 * So, make sure that we use something that's a multiple 1361 * of a huge page when we can. 1362 */ 1363 if (size >= HPAGE_SIZE) 1364 size = HPAGE_SIZE; 1365 prot = pkey_last_malloc_record->prot; 1366 1367 /* Use pkey 0 */ 1368 mprotect_pkey(ptr, size, prot, 0); 1369 1370 /* Make sure that we can set it back to the original pkey. */ 1371 mprotect_pkey(ptr, size, prot, pkey); 1372 } 1373 1374 void test_ptrace_of_child(int *ptr, u16 pkey) 1375 { 1376 __attribute__((__unused__)) int peek_result; 1377 pid_t child_pid; 1378 void *ignored = 0; 1379 long ret; 1380 int status; 1381 /* 1382 * This is the "control" for our little expermient. Make sure 1383 * we can always access it when ptracing. 
1384 */ 1385 int *plain_ptr_unaligned = malloc(HPAGE_SIZE); 1386 int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE); 1387 1388 /* 1389 * Fork a child which is an exact copy of this process, of course. 1390 * That means we can do all of our tests via ptrace() and then plain 1391 * memory access and ensure they work differently. 1392 */ 1393 child_pid = fork_lazy_child(); 1394 dprintf1("[%d] child pid: %d\n", getpid(), child_pid); 1395 1396 ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored); 1397 if (ret) 1398 perror("attach"); 1399 dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__); 1400 pkey_assert(ret != -1); 1401 ret = waitpid(child_pid, &status, WUNTRACED); 1402 if ((ret != child_pid) || !(WIFSTOPPED(status))) { 1403 fprintf(stderr, "weird waitpid result %ld stat %x\n", 1404 ret, status); 1405 pkey_assert(0); 1406 } 1407 dprintf2("waitpid ret: %ld\n", ret); 1408 dprintf2("waitpid status: %d\n", status); 1409 1410 pkey_access_deny(pkey); 1411 pkey_write_deny(pkey); 1412 1413 /* Write access, untested for now: 1414 ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data); 1415 pkey_assert(ret != -1); 1416 dprintf1("poke at %p: %ld\n", peek_at, ret); 1417 */ 1418 1419 /* 1420 * Try to access the pkey-protected "ptr" via ptrace: 1421 */ 1422 ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored); 1423 /* expect it to work, without an error: */ 1424 pkey_assert(ret != -1); 1425 /* Now access from the current task, and expect an exception: */ 1426 peek_result = read_ptr(ptr); 1427 expected_pkey_fault(pkey); 1428 1429 /* 1430 * Try to access the NON-pkey-protected "plain_ptr" via ptrace: 1431 */ 1432 ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored); 1433 /* expect it to work, without an error: */ 1434 pkey_assert(ret != -1); 1435 /* Now access from the current task, and expect NO exception: */ 1436 peek_result = read_ptr(plain_ptr); 1437 do_not_expect_pkey_fault("read plain pointer after ptrace"); 1438 1439 ret = 
ptrace(PTRACE_DETACH, child_pid, ignored, 0); 1440 pkey_assert(ret != -1); 1441 1442 ret = kill(child_pid, SIGKILL); 1443 pkey_assert(ret != -1); 1444 1445 wait(&status); 1446 1447 free(plain_ptr_unaligned); 1448 } 1449 1450 void *get_pointer_to_instructions(void) 1451 { 1452 void *p1; 1453 1454 p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE); 1455 dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write); 1456 /* lots_o_noops_around_write should be page-aligned already */ 1457 assert(p1 == &lots_o_noops_around_write); 1458 1459 /* Point 'p1' at the *second* page of the function: */ 1460 p1 += PAGE_SIZE; 1461 1462 /* 1463 * Try to ensure we fault this in on next touch to ensure 1464 * we get an instruction fault as opposed to a data one 1465 */ 1466 madvise(p1, PAGE_SIZE, MADV_DONTNEED); 1467 1468 return p1; 1469 } 1470 1471 void test_executing_on_unreadable_memory(int *ptr, u16 pkey) 1472 { 1473 void *p1; 1474 int scratch; 1475 int ptr_contents; 1476 int ret; 1477 1478 p1 = get_pointer_to_instructions(); 1479 lots_o_noops_around_write(&scratch); 1480 ptr_contents = read_ptr(p1); 1481 dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); 1482 1483 ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey); 1484 pkey_assert(!ret); 1485 pkey_access_deny(pkey); 1486 1487 dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); 1488 1489 /* 1490 * Make sure this is an *instruction* fault 1491 */ 1492 madvise(p1, PAGE_SIZE, MADV_DONTNEED); 1493 lots_o_noops_around_write(&scratch); 1494 do_not_expect_pkey_fault("executing on PROT_EXEC memory"); 1495 expect_fault_on_read_execonly_key(p1, pkey); 1496 } 1497 1498 void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) 1499 { 1500 void *p1; 1501 int scratch; 1502 int ptr_contents; 1503 int ret; 1504 1505 dprintf1("%s() start\n", __func__); 1506 1507 p1 = get_pointer_to_instructions(); 1508 lots_o_noops_around_write(&scratch); 1509 ptr_contents = read_ptr(p1); 1510 dprintf2("ptr (%p) contents@%d: 
%x\n", p1, __LINE__, ptr_contents); 1511 1512 /* Use a *normal* mprotect(), not mprotect_pkey(): */ 1513 ret = mprotect(p1, PAGE_SIZE, PROT_EXEC); 1514 pkey_assert(!ret); 1515 1516 /* 1517 * Reset the shadow, assuming that the above mprotect() 1518 * correctly changed PKRU, but to an unknown value since 1519 * the actual allocated pkey is unknown. 1520 */ 1521 shadow_pkey_reg = __read_pkey_reg(); 1522 1523 dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); 1524 1525 /* Make sure this is an *instruction* fault */ 1526 madvise(p1, PAGE_SIZE, MADV_DONTNEED); 1527 lots_o_noops_around_write(&scratch); 1528 do_not_expect_pkey_fault("executing on PROT_EXEC memory"); 1529 expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY); 1530 1531 /* 1532 * Put the memory back to non-PROT_EXEC. Should clear the 1533 * exec-only pkey off the VMA and allow it to be readable 1534 * again. Go to PROT_NONE first to check for a kernel bug 1535 * that did not clear the pkey when doing PROT_NONE. 1536 */ 1537 ret = mprotect(p1, PAGE_SIZE, PROT_NONE); 1538 pkey_assert(!ret); 1539 1540 ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC); 1541 pkey_assert(!ret); 1542 ptr_contents = read_ptr(p1); 1543 do_not_expect_pkey_fault("plain read on recently PROT_EXEC area"); 1544 } 1545 1546 #if defined(__i386__) || defined(__x86_64__) 1547 void test_ptrace_modifies_pkru(int *ptr, u16 pkey) 1548 { 1549 u32 new_pkru; 1550 pid_t child; 1551 int status, ret; 1552 int pkey_offset = pkey_reg_xstate_offset(); 1553 size_t xsave_size = cpu_max_xsave_size(); 1554 void *xsave; 1555 u32 *pkey_register; 1556 u64 *xstate_bv; 1557 struct iovec iov; 1558 1559 new_pkru = ~read_pkey_reg(); 1560 /* Don't make PROT_EXEC mappings inaccessible */ 1561 new_pkru &= ~3; 1562 1563 child = fork(); 1564 pkey_assert(child >= 0); 1565 dprintf3("[%d] fork() ret: %d\n", getpid(), child); 1566 if (!child) { 1567 ptrace(PTRACE_TRACEME, 0, 0, 0); 1568 /* Stop and allow the tracer to modify PKRU directly */ 1569 raise(SIGSTOP); 1570 1571 /* 
1572 * need __read_pkey_reg() version so we do not do shadow_pkey_reg 1573 * checking 1574 */ 1575 if (__read_pkey_reg() != new_pkru) 1576 exit(1); 1577 1578 /* Stop and allow the tracer to clear XSTATE_BV for PKRU */ 1579 raise(SIGSTOP); 1580 1581 if (__read_pkey_reg() != 0) 1582 exit(1); 1583 1584 /* Stop and allow the tracer to examine PKRU */ 1585 raise(SIGSTOP); 1586 1587 exit(0); 1588 } 1589 1590 pkey_assert(child == waitpid(child, &status, 0)); 1591 dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); 1592 pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); 1593 1594 xsave = (void *)malloc(xsave_size); 1595 pkey_assert(xsave > 0); 1596 1597 /* Modify the PKRU register directly */ 1598 iov.iov_base = xsave; 1599 iov.iov_len = xsave_size; 1600 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1601 pkey_assert(ret == 0); 1602 1603 pkey_register = (u32 *)(xsave + pkey_offset); 1604 pkey_assert(*pkey_register == read_pkey_reg()); 1605 1606 *pkey_register = new_pkru; 1607 1608 ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1609 pkey_assert(ret == 0); 1610 1611 /* Test that the modification is visible in ptrace before any execution */ 1612 memset(xsave, 0xCC, xsave_size); 1613 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1614 pkey_assert(ret == 0); 1615 pkey_assert(*pkey_register == new_pkru); 1616 1617 /* Execute the tracee */ 1618 ret = ptrace(PTRACE_CONT, child, 0, 0); 1619 pkey_assert(ret == 0); 1620 1621 /* Test that the tracee saw the PKRU value change */ 1622 pkey_assert(child == waitpid(child, &status, 0)); 1623 dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); 1624 pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); 1625 1626 /* Test that the modification is visible in ptrace after execution */ 1627 memset(xsave, 0xCC, xsave_size); 1628 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1629 pkey_assert(ret == 0); 1630 
pkey_assert(*pkey_register == new_pkru); 1631 1632 /* Clear the PKRU bit from XSTATE_BV */ 1633 xstate_bv = (u64 *)(xsave + 512); 1634 *xstate_bv &= ~(1 << 9); 1635 1636 ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1637 pkey_assert(ret == 0); 1638 1639 /* Test that the modification is visible in ptrace before any execution */ 1640 memset(xsave, 0xCC, xsave_size); 1641 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1642 pkey_assert(ret == 0); 1643 pkey_assert(*pkey_register == 0); 1644 1645 ret = ptrace(PTRACE_CONT, child, 0, 0); 1646 pkey_assert(ret == 0); 1647 1648 /* Test that the tracee saw the PKRU value go to 0 */ 1649 pkey_assert(child == waitpid(child, &status, 0)); 1650 dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); 1651 pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); 1652 1653 /* Test that the modification is visible in ptrace after execution */ 1654 memset(xsave, 0xCC, xsave_size); 1655 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1656 pkey_assert(ret == 0); 1657 pkey_assert(*pkey_register == 0); 1658 1659 ret = ptrace(PTRACE_CONT, child, 0, 0); 1660 pkey_assert(ret == 0); 1661 pkey_assert(child == waitpid(child, &status, 0)); 1662 dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); 1663 pkey_assert(WIFEXITED(status)); 1664 pkey_assert(WEXITSTATUS(status) == 0); 1665 free(xsave); 1666 } 1667 #endif 1668 1669 void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) 1670 { 1671 int size = PAGE_SIZE; 1672 int sret; 1673 1674 if (cpu_has_pkeys()) { 1675 dprintf1("SKIP: %s: no CPU support\n", __func__); 1676 return; 1677 } 1678 1679 sret = syscall(__NR_pkey_mprotect, ptr, size, PROT_READ, pkey); 1680 pkey_assert(sret < 0); 1681 } 1682 1683 void (*pkey_tests[])(int *ptr, u16 pkey) = { 1684 test_read_of_write_disabled_region, 1685 test_read_of_access_disabled_region, 1686 test_read_of_access_disabled_region_with_page_already_mapped, 1687 
test_write_of_write_disabled_region, 1688 test_write_of_write_disabled_region_with_page_already_mapped, 1689 test_write_of_access_disabled_region, 1690 test_write_of_access_disabled_region_with_page_already_mapped, 1691 test_kernel_write_of_access_disabled_region, 1692 test_kernel_write_of_write_disabled_region, 1693 test_kernel_gup_of_access_disabled_region, 1694 test_kernel_gup_write_to_write_disabled_region, 1695 test_executing_on_unreadable_memory, 1696 test_implicit_mprotect_exec_only_memory, 1697 test_mprotect_with_pkey_0, 1698 test_ptrace_of_child, 1699 test_pkey_init_state, 1700 test_pkey_syscalls_on_non_allocated_pkey, 1701 test_pkey_syscalls_bad_args, 1702 test_pkey_alloc_exhaust, 1703 test_pkey_alloc_free_attach_pkey0, 1704 #if defined(__i386__) || defined(__x86_64__) 1705 test_ptrace_modifies_pkru, 1706 #endif 1707 }; 1708 1709 void run_tests_once(void) 1710 { 1711 int *ptr; 1712 int prot = PROT_READ|PROT_WRITE; 1713 1714 for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) { 1715 int pkey; 1716 int orig_pkey_faults = pkey_faults; 1717 1718 dprintf1("======================\n"); 1719 dprintf1("test %d preparing...\n", test_nr); 1720 1721 tracing_on(); 1722 pkey = alloc_random_pkey(); 1723 dprintf1("test %d starting with pkey: %d\n", test_nr, pkey); 1724 ptr = malloc_pkey(PAGE_SIZE, prot, pkey); 1725 dprintf1("test %d starting...\n", test_nr); 1726 pkey_tests[test_nr](ptr, pkey); 1727 dprintf1("freeing test memory: %p\n", ptr); 1728 free_pkey_malloc(ptr); 1729 sys_pkey_free(pkey); 1730 1731 dprintf1("pkey_faults: %d\n", pkey_faults); 1732 dprintf1("orig_pkey_faults: %d\n", orig_pkey_faults); 1733 1734 tracing_off(); 1735 close_test_fds(); 1736 1737 printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr); 1738 dprintf1("======================\n\n"); 1739 } 1740 iteration_nr++; 1741 } 1742 1743 void pkey_setup_shadow(void) 1744 { 1745 shadow_pkey_reg = __read_pkey_reg(); 1746 } 1747 1748 pid_t parent_pid; 1749 1750 void 
restore_settings_atexit(void) 1751 { 1752 if (parent_pid == getpid()) 1753 cat_into_file(buf, "/proc/sys/vm/nr_hugepages"); 1754 } 1755 1756 void save_settings(void) 1757 { 1758 int fd; 1759 int err; 1760 1761 if (geteuid()) 1762 return; 1763 1764 fd = open("/proc/sys/vm/nr_hugepages", O_RDONLY); 1765 if (fd < 0) { 1766 fprintf(stderr, "error opening\n"); 1767 perror("error: "); 1768 exit(__LINE__); 1769 } 1770 1771 /* -1 to guarantee leaving the trailing \0 */ 1772 err = read(fd, buf, sizeof(buf)-1); 1773 if (err < 0) { 1774 fprintf(stderr, "error reading\n"); 1775 perror("error: "); 1776 exit(__LINE__); 1777 } 1778 1779 parent_pid = getpid(); 1780 atexit(restore_settings_atexit); 1781 close(fd); 1782 } 1783 1784 int main(void) 1785 { 1786 int nr_iterations = 22; 1787 int pkeys_supported = is_pkeys_supported(); 1788 1789 srand((unsigned int)time(NULL)); 1790 1791 save_settings(); 1792 setup_handlers(); 1793 1794 printf("has pkeys: %d\n", pkeys_supported); 1795 1796 if (!pkeys_supported) { 1797 int size = PAGE_SIZE; 1798 int *ptr; 1799 1800 printf("running PKEY tests for unsupported CPU/OS\n"); 1801 1802 ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); 1803 assert(ptr != (void *)-1); 1804 test_mprotect_pkey_on_unsupported_cpu(ptr, 1); 1805 exit(0); 1806 } 1807 1808 pkey_setup_shadow(); 1809 printf("startup pkey_reg: %016llx\n", read_pkey_reg()); 1810 setup_hugetlbfs(); 1811 1812 while (nr_iterations-- > 0) 1813 run_tests_once(); 1814 1815 printf("done (all tests OK)\n"); 1816 return 0; 1817 } 1818