1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst) 4 * 5 * There are examples in here of: 6 * * how to set protection keys on memory 7 * * how to set/clear bits in pkey registers (the rights register) 8 * * how to handle SEGV_PKUERR signals and extract pkey-relevant 9 * information from the siginfo 10 * 11 * Things to add: 12 * make sure KSM and KSM COW breaking works 13 * prefault pages in at malloc, or not 14 * protect MPX bounds tables with protection keys? 15 * make sure VMA splitting/merging is working correctly 16 * OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys 17 * look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel 18 * do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks 19 * 20 * Compile like this: 21 * gcc -mxsave -o protection_keys -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm 22 * gcc -mxsave -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm 23 */ 24 #define _GNU_SOURCE 25 #define __SANE_USERSPACE_TYPES__ 26 #include <errno.h> 27 #include <linux/elf.h> 28 #include <linux/futex.h> 29 #include <time.h> 30 #include <sys/time.h> 31 #include <sys/syscall.h> 32 #include <string.h> 33 #include <stdio.h> 34 #include <stdint.h> 35 #include <stdbool.h> 36 #include <signal.h> 37 #include <assert.h> 38 #include <stdlib.h> 39 #include <ucontext.h> 40 #include <sys/mman.h> 41 #include <sys/types.h> 42 #include <sys/wait.h> 43 #include <sys/stat.h> 44 #include <fcntl.h> 45 #include <asm-generic/unistd.h> 46 #include <sys/ptrace.h> 47 #include <setjmp.h> 48 49 #include "pkey-helpers.h" 50 51 int iteration_nr = 1; 52 int test_nr; 53 54 u64 shadow_pkey_reg; 55 int dprint_in_signal; 56 char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; 57 58 void cat_into_file(char *str, char *file) 59 { 60 int fd = open(file, O_RDWR); 61 int ret; 62 63 dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file); 64 /* 65 * these need to be raw because they are called under 66 * pkey_assert() 67 */ 68 if (fd < 0) { 69 fprintf(stderr, "error opening '%s'\n", str); 70 perror("error: "); 71 exit(__LINE__); 72 } 73 74 ret = write(fd, str, strlen(str)); 75 if (ret != strlen(str)) { 76 perror("write to file failed"); 77 fprintf(stderr, "filename: '%s' str: '%s'\n", file, str); 78 exit(__LINE__); 79 } 80 close(fd); 81 } 82 83 #if CONTROL_TRACING > 0 84 static int warned_tracing; 85 int tracing_root_ok(void) 86 { 87 if (geteuid() != 0) { 88 if (!warned_tracing) 89 fprintf(stderr, "WARNING: not run as root, " 90 "can not do tracing control\n"); 91 warned_tracing = 1; 92 return 0; 93 } 94 return 1; 95 } 96 #endif 97 98 void tracing_on(void) 99 { 100 #if CONTROL_TRACING > 0 101 #define TRACEDIR "/sys/kernel/tracing" 102 char pidstr[32]; 103 104 if (!tracing_root_ok()) 105 return; 106 107 sprintf(pidstr, "%d", getpid()); 108 cat_into_file("0", TRACEDIR "/tracing_on"); 109 cat_into_file("\n", TRACEDIR "/trace"); 110 if (1) { 111 cat_into_file("function_graph", TRACEDIR "/current_tracer"); 112 cat_into_file("1", TRACEDIR "/options/funcgraph-proc"); 113 } else { 114 cat_into_file("nop", TRACEDIR "/current_tracer"); 115 } 116 cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid"); 117 cat_into_file("1", TRACEDIR "/tracing_on"); 118 dprintf1("enabled tracing\n"); 119 #endif 120 } 121 122 void tracing_off(void) 123 { 124 #if CONTROL_TRACING > 0 125 if (!tracing_root_ok()) 126 return; 127 cat_into_file("0", "/sys/kernel/tracing/tracing_on"); 128 #endif 129 } 130 131 void abort_hooks(void) 132 { 133 fprintf(stderr, "running %s()...\n", __func__); 134 tracing_off(); 135 #ifdef SLEEP_ON_ABORT 136 sleep(SLEEP_ON_ABORT); 137 #endif 138 } 139 140 /* 141 * This attempts to have roughly a page of instructions followed by a few 142 * instructions that do a write, and another page of instructions. That 143 * way, we are pretty sure that the write is in the second page of 144 * instructions and has at least a page of padding behind it. 145 * 146 * *That* lets us be sure to madvise() away the write instruction, which 147 * will then fault, which makes sure that the fault code handles 148 * execute-only memory properly. 149 */ 150 #if defined(__powerpc64__) || defined(__aarch64__) 151 /* This way, both 4K and 64K alignment are maintained */ 152 __attribute__((__aligned__(65536))) 153 #else 154 __attribute__((__aligned__(PAGE_SIZE))) 155 #endif 156 void lots_o_noops_around_write(int *write_to_me) 157 { 158 dprintf3("running %s()\n", __func__); 159 __page_o_noops(); 160 /* Assume this happens in the second page of instructions: */ 161 *write_to_me = __LINE__; 162 /* pad out by another page: */ 163 __page_o_noops(); 164 dprintf3("%s() done\n", __func__); 165 } 166 167 void dump_mem(void *dumpme, int len_bytes) 168 { 169 char *c = (void *)dumpme; 170 int i; 171 172 for (i = 0; i < len_bytes; i += sizeof(u64)) { 173 u64 *ptr = (u64 *)(c + i); 174 dprintf1("dump[%03d][@%p]: %016llx\n", i, ptr, *ptr); 175 } 176 } 177 178 static u32 hw_pkey_get(int pkey, unsigned long flags) 179 { 180 u64 pkey_reg = __read_pkey_reg(); 181 182 dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", 183 __func__, pkey, flags, 0, 0); 184 dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg); 185 186 return (u32) get_pkey_bits(pkey_reg, pkey); 187 } 188 189 static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) 190 { 191 u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); 192 u64 old_pkey_reg = __read_pkey_reg(); 193 u64 new_pkey_reg; 194 195 /* make sure that 'rights' only contains the bits we expect: */ 196 assert(!(rights & ~mask)); 197 198 /* modify bits accordingly in old pkey_reg and assign it */ 199 new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights); 200 201 __write_pkey_reg(new_pkey_reg); 202 203 dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x" 204 " pkey_reg now: %016llx old_pkey_reg: %016llx\n", 205 __func__, pkey, rights, flags, 0, __read_pkey_reg(), 206 old_pkey_reg); 207 return 0; 208 } 209 210 void pkey_disable_set(int pkey, int flags) 211 { 212 unsigned long syscall_flags = 0; 213 int ret; 214 int pkey_rights; 215 216 dprintf1("START->%s(%d, 0x%x)\n", __func__, 217 pkey, flags); 218 pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); 219 220 pkey_rights = hw_pkey_get(pkey, syscall_flags); 221 222 dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, 223 pkey, pkey, pkey_rights); 224 225 pkey_assert(pkey_rights >= 0); 226 227 pkey_rights |= flags; 228 229 ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); 230 assert(!ret); 231 /* pkey_reg and flags have the same format */ 232 shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); 233 dprintf1("%s(%d) shadow: 0x%016llx\n", 234 __func__, pkey, shadow_pkey_reg); 235 236 pkey_assert(ret >= 0); 237 238 pkey_rights = hw_pkey_get(pkey, syscall_flags); 239 dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, 240 pkey, pkey, pkey_rights); 241 242 dprintf1("%s(%d) pkey_reg: 0x%016llx\n", 243 __func__, pkey, read_pkey_reg()); 244 dprintf1("END<---%s(%d, 0x%x)\n", __func__, 245 pkey, flags); 246 } 247 248 void pkey_disable_clear(int pkey, int flags) 249 { 250 unsigned long syscall_flags = 0; 251 int ret; 252 int pkey_rights = hw_pkey_get(pkey, syscall_flags); 253 254 pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); 255 256 dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, 257 pkey, pkey, pkey_rights); 258 pkey_assert(pkey_rights >= 0); 259 260 pkey_rights &= ~flags; 261 262 ret = hw_pkey_set(pkey, pkey_rights, 0); 263 shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); 264 pkey_assert(ret >= 0); 265 266 pkey_rights = hw_pkey_get(pkey, syscall_flags); 267 dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, 268 pkey, pkey, pkey_rights); 269 270 dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, 271 pkey, read_pkey_reg()); 272 } 273 274 void pkey_write_allow(int pkey) 275 { 276 pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); 277 } 278 void pkey_write_deny(int pkey) 279 { 280 pkey_disable_set(pkey, PKEY_DISABLE_WRITE); 281 } 282 void pkey_access_allow(int pkey) 283 { 284 pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); 285 } 286 void pkey_access_deny(int pkey) 287 { 288 pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); 289 } 290 291 static char *si_code_str(int si_code) 292 { 293 if (si_code == SEGV_MAPERR) 294 return "SEGV_MAPERR"; 295 if (si_code == SEGV_ACCERR) 296 return "SEGV_ACCERR"; 297 if (si_code == SEGV_BNDERR) 298 return "SEGV_BNDERR"; 299 if (si_code == SEGV_PKUERR) 300 return "SEGV_PKUERR"; 301 return "UNKNOWN"; 302 } 303 304 int pkey_faults; 305 int last_si_pkey = -1; 306 void signal_handler(int signum, siginfo_t *si, void *vucontext) 307 { 308 ucontext_t *uctxt = vucontext; 309 int trapno; 310 unsigned long ip; 311 #ifdef MCONTEXT_FPREGS 312 char *fpregs; 313 #endif 314 #if defined(__i386__) || defined(__x86_64__) /* arch */ 315 u32 *pkey_reg_ptr; 316 int pkey_reg_offset; 317 #endif /* arch */ 318 u64 siginfo_pkey; 319 u32 *si_pkey_ptr; 320 321 dprint_in_signal = 1; 322 dprintf1(">>>>===============SIGSEGV============================\n"); 323 dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", 324 __func__, __LINE__, 325 __read_pkey_reg(), shadow_pkey_reg); 326 327 trapno = MCONTEXT_TRAPNO(uctxt->uc_mcontext); 328 ip = MCONTEXT_IP(uctxt->uc_mcontext); 329 #ifdef MCONTEXT_FPREGS 330 fpregs = (char *) uctxt->uc_mcontext.fpregs; 331 #endif 332 333 dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n", 334 __func__, trapno, ip, si_code_str(si->si_code), 335 si->si_code); 336 337 #if defined(__i386__) || defined(__x86_64__) /* arch */ 338 #ifdef __i386__ 339 /* 340 * 32-bit has some extra padding so that userspace can tell whether 341 * the XSTATE header is present in addition to the "legacy" FPU 342 * state. We just assume that it is here. 343 */ 344 fpregs += 0x70; 345 #endif /* i386 */ 346 pkey_reg_offset = pkey_reg_xstate_offset(); 347 pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]); 348 349 /* 350 * If we got a PKEY fault, we *HAVE* to have at least one bit set in 351 * here. 352 */ 353 dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset()); 354 if (DEBUG_LEVEL > 4) 355 dump_mem(pkey_reg_ptr - 128, 256); 356 pkey_assert(*pkey_reg_ptr); 357 #endif /* arch */ 358 359 dprintf1("siginfo: %p\n", si); 360 #ifdef MCONTEXT_FPREGS 361 dprintf1(" fpregs: %p\n", fpregs); 362 #endif 363 364 if ((si->si_code == SEGV_MAPERR) || 365 (si->si_code == SEGV_ACCERR) || 366 (si->si_code == SEGV_BNDERR)) { 367 printf("non-PK si_code, exiting...\n"); 368 exit(4); 369 } 370 371 si_pkey_ptr = siginfo_get_pkey_ptr(si); 372 dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr); 373 dump_mem((u8 *)si_pkey_ptr - 8, 24); 374 siginfo_pkey = *si_pkey_ptr; 375 pkey_assert(siginfo_pkey < NR_PKEYS); 376 last_si_pkey = siginfo_pkey; 377 378 /* 379 * need __read_pkey_reg() version so we do not do shadow_pkey_reg 380 * checking 381 */ 382 dprintf1("signal pkey_reg from pkey_reg: %016llx\n", 383 __read_pkey_reg()); 384 dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey); 385 #if defined(__i386__) || defined(__x86_64__) /* arch */ 386 dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr); 387 *(u64 *)pkey_reg_ptr = 0x00000000; 388 dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n"); 389 #elif defined(__powerpc64__) /* arch */ 390 /* restore access and let the faulting instruction continue */ 391 pkey_access_allow(siginfo_pkey); 392 #elif defined(__aarch64__) 393 aarch64_write_signal_pkey(uctxt, PKEY_ALLOW_ALL); 394 #endif /* arch */ 395 pkey_faults++; 396 dprintf1("<<<<==================================================\n"); 397 dprint_in_signal = 0; 398 } 399 400 int wait_all_children(void) 401 { 402 int status; 403 return waitpid(-1, &status, 0); 404 } 405 406 void sig_chld(int x) 407 { 408 dprint_in_signal = 1; 409 dprintf2("[%d] SIGCHLD: %d\n", getpid(), x); 410 dprint_in_signal = 0; 411 } 412 413 void setup_sigsegv_handler(void) 414 { 415 int r, rs; 416 struct sigaction newact; 417 struct sigaction oldact; 418 419 /* #PF is mapped to sigsegv */ 420 int signum = SIGSEGV; 421 422 newact.sa_handler = 0; 423 newact.sa_sigaction = signal_handler; 424 425 /*sigset_t - signals to block while in the handler */ 426 /* get the old signal mask. */ 427 rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask); 428 pkey_assert(rs == 0); 429 430 /* call sa_sigaction, not sa_handler*/ 431 newact.sa_flags = SA_SIGINFO; 432 433 newact.sa_restorer = 0; /* void(*)(), obsolete */ 434 r = sigaction(signum, &newact, &oldact); 435 r = sigaction(SIGALRM, &newact, &oldact); 436 pkey_assert(r == 0); 437 } 438 439 void setup_handlers(void) 440 { 441 signal(SIGCHLD, &sig_chld); 442 setup_sigsegv_handler(); 443 } 444 445 pid_t fork_lazy_child(void) 446 { 447 pid_t forkret; 448 449 forkret = fork(); 450 pkey_assert(forkret >= 0); 451 dprintf3("[%d] fork() ret: %d\n", getpid(), forkret); 452 453 if (!forkret) { 454 /* in the child */ 455 while (1) { 456 dprintf1("child sleeping...\n"); 457 sleep(30); 458 } 459 } 460 return forkret; 461 } 462 463 int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, 464 unsigned long pkey) 465 { 466 int sret; 467 468 dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__, 469 ptr, size, orig_prot, pkey); 470 471 errno = 0; 472 sret = syscall(__NR_pkey_mprotect, ptr, size, orig_prot, pkey); 473 if (errno) { 474 dprintf2("SYS_mprotect_key sret: %d\n", sret); 475 dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot); 476 dprintf2("SYS_mprotect_key failed, errno: %d\n", errno); 477 if (DEBUG_LEVEL >= 2) 478 perror("SYS_mprotect_pkey"); 479 } 480 return sret; 481 } 482 483 int sys_pkey_alloc(unsigned long flags, unsigned long init_val) 484 { 485 int ret = syscall(SYS_pkey_alloc, flags, init_val); 486 dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n", 487 __func__, flags, init_val, ret, errno); 488 return ret; 489 } 490 491 int alloc_pkey(void) 492 { 493 int ret; 494 unsigned long init_val = 0x0; 495 496 dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", 497 __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg); 498 ret = sys_pkey_alloc(0, init_val); 499 /* 500 * pkey_alloc() sets PKEY register, so we need to reflect it in 501 * shadow_pkey_reg: 502 */ 503 dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" 504 " shadow: 0x%016llx\n", 505 __func__, __LINE__, ret, __read_pkey_reg(), 506 shadow_pkey_reg); 507 if (ret > 0) { 508 /* clear both the bits: */ 509 shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, 510 ~PKEY_MASK); 511 dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" 512 " shadow: 0x%016llx\n", 513 __func__, 514 __LINE__, ret, __read_pkey_reg(), 515 shadow_pkey_reg); 516 /* 517 * move the new state in from init_val 518 * (remember, we cheated and init_val == pkey_reg format) 519 */ 520 shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, 521 init_val); 522 } 523 dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" 524 " shadow: 0x%016llx\n", 525 __func__, __LINE__, ret, __read_pkey_reg(), 526 shadow_pkey_reg); 527 dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno); 528 /* for shadow checking: */ 529 read_pkey_reg(); 530 dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" 531 " shadow: 0x%016llx\n", 532 __func__, __LINE__, ret, __read_pkey_reg(), 533 shadow_pkey_reg); 534 return ret; 535 } 536 537 int sys_pkey_free(unsigned long pkey) 538 { 539 int ret = syscall(SYS_pkey_free, pkey); 540 dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret); 541 return ret; 542 } 543 544 /* 545 * I had a bug where pkey bits could be set by mprotect() but 546 * not cleared. This ensures we get lots of random bit sets 547 * and clears on the vma and pte pkey bits. 548 */ 549 int alloc_random_pkey(void) 550 { 551 int max_nr_pkey_allocs; 552 int ret; 553 int i; 554 int alloced_pkeys[NR_PKEYS]; 555 int nr_alloced = 0; 556 int random_index; 557 memset(alloced_pkeys, 0, sizeof(alloced_pkeys)); 558 559 /* allocate every possible key and make a note of which ones we got */ 560 max_nr_pkey_allocs = NR_PKEYS; 561 for (i = 0; i < max_nr_pkey_allocs; i++) { 562 int new_pkey = alloc_pkey(); 563 if (new_pkey < 0) 564 break; 565 alloced_pkeys[nr_alloced++] = new_pkey; 566 } 567 568 pkey_assert(nr_alloced > 0); 569 /* select a random one out of the allocated ones */ 570 random_index = rand() % nr_alloced; 571 ret = alloced_pkeys[random_index]; 572 /* now zero it out so we don't free it next */ 573 alloced_pkeys[random_index] = 0; 574 575 /* go through the allocated ones that we did not want and free them */ 576 for (i = 0; i < nr_alloced; i++) { 577 int free_ret; 578 if (!alloced_pkeys[i]) 579 continue; 580 free_ret = sys_pkey_free(alloced_pkeys[i]); 581 pkey_assert(!free_ret); 582 } 583 dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" 584 " shadow: 0x%016llx\n", __func__, 585 __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); 586 return ret; 587 } 588 589 int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, 590 unsigned long pkey) 591 { 592 int nr_iterations = random() % 100; 593 int ret; 594 595 while (0) { 596 int rpkey = alloc_random_pkey(); 597 ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); 598 dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", 599 ptr, size, orig_prot, pkey, ret); 600 if (nr_iterations-- < 0) 601 break; 602 603 dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" 604 " shadow: 0x%016llx\n", 605 __func__, __LINE__, ret, __read_pkey_reg(), 606 shadow_pkey_reg); 607 sys_pkey_free(rpkey); 608 dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" 609 " shadow: 0x%016llx\n", 610 __func__, __LINE__, ret, __read_pkey_reg(), 611 shadow_pkey_reg); 612 } 613 pkey_assert(pkey < NR_PKEYS); 614 615 ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); 616 dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", 617 ptr, size, orig_prot, pkey, ret); 618 pkey_assert(!ret); 619 dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" 620 " shadow: 0x%016llx\n", __func__, 621 __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); 622 return ret; 623 } 624 625 struct pkey_malloc_record { 626 void *ptr; 627 long size; 628 int prot; 629 }; 630 struct pkey_malloc_record *pkey_malloc_records; 631 struct pkey_malloc_record *pkey_last_malloc_record; 632 long nr_pkey_malloc_records; 633 void record_pkey_malloc(void *ptr, long size, int prot) 634 { 635 long i; 636 struct pkey_malloc_record *rec = NULL; 637 638 for (i = 0; i < nr_pkey_malloc_records; i++) { 639 rec = &pkey_malloc_records[i]; 640 /* find a free record */ 641 if (rec) 642 break; 643 } 644 if (!rec) { 645 /* every record is full */ 646 size_t old_nr_records = nr_pkey_malloc_records; 647 size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1); 648 size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record); 649 dprintf2("new_nr_records: %zd\n", new_nr_records); 650 dprintf2("new_size: %zd\n", new_size); 651 pkey_malloc_records = realloc(pkey_malloc_records, new_size); 652 pkey_assert(pkey_malloc_records != NULL); 653 rec = &pkey_malloc_records[nr_pkey_malloc_records]; 654 /* 655 * realloc() does not initialize memory, so zero it from 656 * the first new record all the way to the end. 657 */ 658 for (i = 0; i < new_nr_records - old_nr_records; i++) 659 memset(rec + i, 0, sizeof(*rec)); 660 } 661 dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n", 662 (int)(rec - pkey_malloc_records), rec, ptr, size); 663 rec->ptr = ptr; 664 rec->size = size; 665 rec->prot = prot; 666 pkey_last_malloc_record = rec; 667 nr_pkey_malloc_records++; 668 } 669 670 void free_pkey_malloc(void *ptr) 671 { 672 long i; 673 int ret; 674 dprintf3("%s(%p)\n", __func__, ptr); 675 for (i = 0; i < nr_pkey_malloc_records; i++) { 676 struct pkey_malloc_record *rec = &pkey_malloc_records[i]; 677 dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n", 678 ptr, i, rec, rec->ptr, rec->size); 679 if ((ptr < rec->ptr) || 680 (ptr >= rec->ptr + rec->size)) 681 continue; 682 683 dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n", 684 ptr, i, rec, rec->ptr, rec->size); 685 nr_pkey_malloc_records--; 686 ret = munmap(rec->ptr, rec->size); 687 dprintf3("munmap ret: %d\n", ret); 688 pkey_assert(!ret); 689 dprintf3("clearing rec->ptr, rec: %p\n", rec); 690 rec->ptr = NULL; 691 dprintf3("done clearing rec->ptr, rec: %p\n", rec); 692 return; 693 } 694 pkey_assert(false); 695 } 696 697 698 void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) 699 { 700 void *ptr; 701 int ret; 702 703 read_pkey_reg(); 704 dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, 705 size, prot, pkey); 706 pkey_assert(pkey < NR_PKEYS); 707 ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); 708 pkey_assert(ptr != (void *)-1); 709 ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); 710 pkey_assert(!ret); 711 record_pkey_malloc(ptr, size, prot); 712 read_pkey_reg(); 713 714 dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); 715 return ptr; 716 } 717 718 void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) 719 { 720 int ret; 721 void *ptr; 722 723 dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, 724 size, prot, pkey); 725 /* 726 * Guarantee we can fit at least one huge page in the resulting 727 * allocation by allocating space for 2: 728 */ 729 size = ALIGN_UP(size, HPAGE_SIZE * 2); 730 ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); 731 pkey_assert(ptr != (void *)-1); 732 record_pkey_malloc(ptr, size, prot); 733 mprotect_pkey(ptr, size, prot, pkey); 734 735 dprintf1("unaligned ptr: %p\n", ptr); 736 ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE); 737 dprintf1(" aligned ptr: %p\n", ptr); 738 ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE); 739 dprintf1("MADV_HUGEPAGE ret: %d\n", ret); 740 ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED); 741 dprintf1("MADV_WILLNEED ret: %d\n", ret); 742 memset(ptr, 0, HPAGE_SIZE); 743 744 dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr); 745 return ptr; 746 } 747 748 int hugetlb_setup_ok; 749 #define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages" 750 #define GET_NR_HUGE_PAGES 10 751 void setup_hugetlbfs(void) 752 { 753 int err; 754 int fd; 755 char buf[256]; 756 long hpagesz_kb; 757 long hpagesz_mb; 758 759 if (geteuid() != 0) { 760 fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n"); 761 return; 762 } 763 764 cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages"); 765 766 /* 767 * Now go make sure that we got the pages and that they 768 * are PMD-level pages. Someone might have made PUD-level 769 * pages the default. 770 */ 771 hpagesz_kb = HPAGE_SIZE / 1024; 772 hpagesz_mb = hpagesz_kb / 1024; 773 sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb); 774 fd = open(buf, O_RDONLY); 775 if (fd < 0) { 776 fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n", 777 hpagesz_mb, strerror(errno)); 778 return; 779 } 780 781 /* -1 to guarantee leaving the trailing \0 */ 782 err = read(fd, buf, sizeof(buf)-1); 783 close(fd); 784 if (err <= 0) { 785 fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n", 786 hpagesz_mb, strerror(errno)); 787 return; 788 } 789 790 if (atoi(buf) != GET_NR_HUGE_PAGES) { 791 fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n", 792 hpagesz_mb, buf, GET_NR_HUGE_PAGES); 793 return; 794 } 795 796 hugetlb_setup_ok = 1; 797 } 798 799 void *malloc_pkey_hugetlb(long size, int prot, u16 pkey) 800 { 801 void *ptr; 802 int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB; 803 804 if (!hugetlb_setup_ok) 805 return PTR_ERR_ENOTSUP; 806 807 dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey); 808 size = ALIGN_UP(size, HPAGE_SIZE * 2); 809 pkey_assert(pkey < NR_PKEYS); 810 ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0); 811 pkey_assert(ptr != (void *)-1); 812 mprotect_pkey(ptr, size, prot, pkey); 813 814 record_pkey_malloc(ptr, size, prot); 815 816 dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr); 817 return ptr; 818 } 819 820 void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey) 821 { 822 void *ptr; 823 int fd; 824 825 dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, 826 size, prot, pkey); 827 pkey_assert(pkey < NR_PKEYS); 828 fd = open("/dax/foo", O_RDWR); 829 pkey_assert(fd >= 0); 830 831 ptr = mmap(0, size, prot, MAP_SHARED, fd, 0); 832 pkey_assert(ptr != (void *)-1); 833 834 mprotect_pkey(ptr, size, prot, pkey); 835 836 record_pkey_malloc(ptr, size, prot); 837 838 dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr); 839 close(fd); 840 return ptr; 841 } 842 843 void *(*pkey_malloc[])(long size, int prot, u16 pkey) = { 844 845 malloc_pkey_with_mprotect, 846 malloc_pkey_with_mprotect_subpage, 847 malloc_pkey_anon_huge, 848 malloc_pkey_hugetlb 849 /* can not do direct with the pkey_mprotect() API: 850 malloc_pkey_mmap_direct, 851 malloc_pkey_mmap_dax, 852 */ 853 }; 854 855 void *malloc_pkey(long size, int prot, u16 pkey) 856 { 857 void *ret; 858 static int malloc_type; 859 int nr_malloc_types = ARRAY_SIZE(pkey_malloc); 860 861 pkey_assert(pkey < NR_PKEYS); 862 863 while (1) { 864 pkey_assert(malloc_type < nr_malloc_types); 865 866 ret = pkey_malloc[malloc_type](size, prot, pkey); 867 pkey_assert(ret != (void *)-1); 868 869 malloc_type++; 870 if (malloc_type >= nr_malloc_types) 871 malloc_type = (random()%nr_malloc_types); 872 873 /* try again if the malloc_type we tried is unsupported */ 874 if (ret == PTR_ERR_ENOTSUP) 875 continue; 876 877 break; 878 } 879 880 dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__, 881 size, prot, pkey, ret); 882 return ret; 883 } 884 885 int last_pkey_faults; 886 #define UNKNOWN_PKEY -2 887 void expected_pkey_fault(int pkey) 888 { 889 dprintf2("%s(): last_pkey_faults: %d pkey_faults: %d\n", 890 __func__, last_pkey_faults, pkey_faults); 891 dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey); 892 pkey_assert(last_pkey_faults + 1 == pkey_faults); 893 894 /* 895 * For exec-only memory, we do not know the pkey in 896 * advance, so skip this check. 897 */ 898 if (pkey != UNKNOWN_PKEY) 899 pkey_assert(last_si_pkey == pkey); 900 901 #if defined(__i386__) || defined(__x86_64__) /* arch */ 902 /* 903 * The signal handler shold have cleared out PKEY register to let the 904 * test program continue. We now have to restore it. 905 */ 906 if (__read_pkey_reg() != 0) 907 #elif defined(__aarch64__) 908 if (__read_pkey_reg() != PKEY_ALLOW_ALL) 909 #else 910 if (__read_pkey_reg() != shadow_pkey_reg) 911 #endif /* arch */ 912 pkey_assert(0); 913 914 __write_pkey_reg(shadow_pkey_reg); 915 dprintf1("%s() set pkey_reg=%016llx to restore state after signal " 916 "nuked it\n", __func__, shadow_pkey_reg); 917 last_pkey_faults = pkey_faults; 918 last_si_pkey = -1; 919 } 920 921 #define do_not_expect_pkey_fault(msg) do { \ 922 if (last_pkey_faults != pkey_faults) \ 923 dprintf0("unexpected PKey fault: %s\n", msg); \ 924 pkey_assert(last_pkey_faults == pkey_faults); \ 925 } while (0) 926 927 int test_fds[10] = { -1 }; 928 int nr_test_fds; 929 void __save_test_fd(int fd) 930 { 931 pkey_assert(fd >= 0); 932 pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds)); 933 test_fds[nr_test_fds] = fd; 934 nr_test_fds++; 935 } 936 937 int get_test_read_fd(void) 938 { 939 int test_fd = open("/etc/passwd", O_RDONLY); 940 __save_test_fd(test_fd); 941 return test_fd; 942 } 943 944 void close_test_fds(void) 945 { 946 int i; 947 948 for (i = 0; i < nr_test_fds; i++) { 949 if (test_fds[i] < 0) 950 continue; 951 close(test_fds[i]); 952 test_fds[i] = -1; 953 } 954 nr_test_fds = 0; 955 } 956 957 void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) 958 { 959 int i, err; 960 int max_nr_pkey_allocs; 961 int alloced_pkeys[NR_PKEYS]; 962 int nr_alloced = 0; 963 long size; 964 965 pkey_assert(pkey_last_malloc_record); 966 size = pkey_last_malloc_record->size; 967 /* 968 * This is a bit of a hack. But mprotect() requires 969 * huge-page-aligned sizes when operating on hugetlbfs. 970 * So, make sure that we use something that's a multiple 971 * of a huge page when we can. 972 */ 973 if (size >= HPAGE_SIZE) 974 size = HPAGE_SIZE; 975 976 /* allocate every possible key and make sure key-0 never got allocated */ 977 max_nr_pkey_allocs = NR_PKEYS; 978 for (i = 0; i < max_nr_pkey_allocs; i++) { 979 int new_pkey = alloc_pkey(); 980 pkey_assert(new_pkey != 0); 981 982 if (new_pkey < 0) 983 break; 984 alloced_pkeys[nr_alloced++] = new_pkey; 985 } 986 /* free all the allocated keys */ 987 for (i = 0; i < nr_alloced; i++) { 988 int free_ret; 989 990 if (!alloced_pkeys[i]) 991 continue; 992 free_ret = sys_pkey_free(alloced_pkeys[i]); 993 pkey_assert(!free_ret); 994 } 995 996 /* attach key-0 in various modes */ 997 err = sys_mprotect_pkey(ptr, size, PROT_READ, 0); 998 pkey_assert(!err); 999 err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0); 1000 pkey_assert(!err); 1001 err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0); 1002 pkey_assert(!err); 1003 err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0); 1004 pkey_assert(!err); 1005 err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0); 1006 pkey_assert(!err); 1007 } 1008 1009 void test_read_of_write_disabled_region(int *ptr, u16 pkey) 1010 { 1011 int ptr_contents; 1012 1013 dprintf1("disabling write access to PKEY[1], doing read\n"); 1014 pkey_write_deny(pkey); 1015 ptr_contents = read_ptr(ptr); 1016 dprintf1("*ptr: %d\n", ptr_contents); 1017 dprintf1("\n"); 1018 } 1019 void test_read_of_access_disabled_region(int *ptr, u16 pkey) 1020 { 1021 int ptr_contents; 1022 1023 dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr); 1024 read_pkey_reg(); 1025 pkey_access_deny(pkey); 1026 ptr_contents = read_ptr(ptr); 1027 dprintf1("*ptr: %d\n", ptr_contents); 1028 expected_pkey_fault(pkey); 1029 } 1030 1031 void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr, 1032 u16 pkey) 1033 { 1034 int ptr_contents; 1035 1036 dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", 1037 pkey, ptr); 1038 ptr_contents = read_ptr(ptr); 1039 dprintf1("reading ptr before disabling the read : %d\n", 1040 ptr_contents); 1041 read_pkey_reg(); 1042 pkey_access_deny(pkey); 1043 ptr_contents = read_ptr(ptr); 1044 dprintf1("*ptr: %d\n", ptr_contents); 1045 expected_pkey_fault(pkey); 1046 } 1047 1048 void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr, 1049 u16 pkey) 1050 { 1051 *ptr = __LINE__; 1052 dprintf1("disabling write access; after accessing the page, " 1053 "to PKEY[%02d], doing write\n", pkey); 1054 pkey_write_deny(pkey); 1055 *ptr = __LINE__; 1056 expected_pkey_fault(pkey); 1057 } 1058 1059 void test_write_of_write_disabled_region(int *ptr, u16 pkey) 1060 { 1061 dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey); 1062 pkey_write_deny(pkey); 1063 *ptr = __LINE__; 1064 expected_pkey_fault(pkey); 1065 } 1066 void test_write_of_access_disabled_region(int *ptr, u16 pkey) 1067 { 1068 dprintf1("disabling access to PKEY[%02d], doing write\n", pkey); 1069 pkey_access_deny(pkey); 1070 *ptr = __LINE__; 1071 expected_pkey_fault(pkey); 1072 } 1073 1074 void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr, 1075 u16 pkey) 1076 { 1077 *ptr = __LINE__; 1078 dprintf1("disabling access; after accessing the page, " 1079 " to PKEY[%02d], doing write\n", pkey); 1080 pkey_access_deny(pkey); 1081 *ptr = __LINE__; 1082 expected_pkey_fault(pkey); 1083 } 1084 1085 void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) 1086 { 1087 int ret; 1088 int test_fd = get_test_read_fd(); 1089 1090 dprintf1("disabling access to PKEY[%02d], " 1091 "having kernel read() to buffer\n", pkey); 1092 pkey_access_deny(pkey); 1093 ret = read(test_fd, ptr, 1); 1094 dprintf1("read ret: %d\n", ret); 1095 pkey_assert(ret); 1096 } 1097 void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey) 1098 { 1099 int ret; 1100 int test_fd = get_test_read_fd(); 1101 1102 pkey_write_deny(pkey); 1103 ret = read(test_fd, ptr, 100); 1104 dprintf1("read ret: %d\n", ret); 1105 if (ret < 0 && (DEBUG_LEVEL > 0)) 1106 perror("verbose read result (OK for this to be bad)"); 1107 pkey_assert(ret); 1108 } 1109 1110 void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey) 1111 { 1112 int pipe_ret, vmsplice_ret; 1113 struct iovec iov; 1114 int pipe_fds[2]; 1115 1116 pipe_ret = pipe(pipe_fds); 1117 1118 pkey_assert(pipe_ret == 0); 1119 dprintf1("disabling access to PKEY[%02d], " 1120 "having kernel vmsplice from buffer\n", pkey); 1121 pkey_access_deny(pkey); 1122 iov.iov_base = ptr; 1123 iov.iov_len = PAGE_SIZE; 1124 vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT); 1125 dprintf1("vmsplice() ret: %d\n", vmsplice_ret); 1126 pkey_assert(vmsplice_ret == -1); 1127 1128 close(pipe_fds[0]); 1129 close(pipe_fds[1]); 1130 } 1131 1132 void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey) 1133 { 1134 int ignored = 0xdada; 1135 int futex_ret; 1136 int some_int = __LINE__; 1137 1138 dprintf1("disabling write to PKEY[%02d], " 1139 "doing futex gunk in buffer\n", pkey); 1140 *ptr = some_int; 1141 pkey_write_deny(pkey); 1142 futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL, 1143 &ignored, ignored); 1144 if (DEBUG_LEVEL > 0) 1145 perror("futex"); 1146 dprintf1("futex() ret: %d\n", futex_ret); 1147 } 1148 1149 /* Assumes that all pkeys other than 'pkey' are unallocated */ 1150 void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey) 1151 { 1152 int err; 1153 int i; 1154 1155 /* Note: 0 is the default pkey, so don't mess with it */ 1156 for (i = 1; i < NR_PKEYS; i++) { 1157 if (pkey == i) 1158 continue; 1159 1160 dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i); 1161 err = sys_pkey_free(i); 1162 pkey_assert(err); 1163 1164 err = sys_pkey_free(i); 1165 pkey_assert(err); 1166 1167 err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i); 1168 pkey_assert(err); 1169 } 1170 } 1171 1172 /* Assumes that all pkeys other than 'pkey' are unallocated */ 1173 void test_pkey_syscalls_bad_args(int *ptr, u16 pkey) 1174 { 1175 int err; 1176 int bad_pkey = NR_PKEYS+99; 1177 1178 /* pass a known-invalid pkey in: */ 1179 err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey); 1180 pkey_assert(err); 1181 } 1182 1183 void become_child(void) 1184 { 1185 pid_t forkret; 1186 1187 forkret = fork(); 1188 pkey_assert(forkret >= 0); 1189 dprintf3("[%d] fork() ret: %d\n", getpid(), forkret); 1190 1191 if (!forkret) { 1192 /* in the child */ 1193 return; 1194 } 1195 exit(0); 1196 } 1197 1198 /* Assumes that all pkeys other than 'pkey' are unallocated */ 1199 void test_pkey_alloc_exhaust(int *ptr, u16 pkey) 1200 { 1201 int err; 1202 int allocated_pkeys[NR_PKEYS] = {0}; 1203 int nr_allocated_pkeys = 0; 1204 int i; 1205 1206 for (i = 0; i < NR_PKEYS*3; i++) { 1207 int new_pkey; 1208 dprintf1("%s() alloc loop: %d\n", __func__, i); 1209 new_pkey = alloc_pkey(); 1210 dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx" 1211 " shadow: 0x%016llx\n", 1212 __func__, __LINE__, err, __read_pkey_reg(), 1213 shadow_pkey_reg); 1214 read_pkey_reg(); /* for shadow checking */ 1215 dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC); 1216 if ((new_pkey == -1) && (errno == ENOSPC)) { 1217 dprintf2("%s() failed to allocate pkey after %d tries\n", 1218 __func__, nr_allocated_pkeys); 1219 } else { 1220 /* 1221 * Ensure the number of successes never 1222 * exceeds the number of keys supported 1223 * in the hardware. 1224 */ 1225 pkey_assert(nr_allocated_pkeys < NR_PKEYS); 1226 allocated_pkeys[nr_allocated_pkeys++] = new_pkey; 1227 } 1228 1229 /* 1230 * Make sure that allocation state is properly 1231 * preserved across fork(). 1232 */ 1233 if (i == NR_PKEYS*2) 1234 become_child(); 1235 } 1236 1237 dprintf3("%s()::%d\n", __func__, __LINE__); 1238 1239 /* 1240 * On x86: 1241 * There are 16 pkeys supported in hardware. Three are 1242 * allocated by the time we get here: 1243 * 1. The default key (0) 1244 * 2. One possibly consumed by an execute-only mapping. 1245 * 3. One allocated by the test code and passed in via 1246 * 'pkey' to this function. 1247 * Ensure that we can allocate at least another 13 (16-3). 1248 * 1249 * On powerpc: 1250 * There are either 5, 28, 29 or 32 pkeys supported in 1251 * hardware depending on the page size (4K or 64K) and 1252 * platform (powernv or powervm). Four are allocated by 1253 * the time we get here. These include pkey-0, pkey-1, 1254 * exec-only pkey and the one allocated by the test code. 1255 * Ensure that we can allocate the remaining. 1256 */ 1257 pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1)); 1258 1259 for (i = 0; i < nr_allocated_pkeys; i++) { 1260 err = sys_pkey_free(allocated_pkeys[i]); 1261 pkey_assert(!err); 1262 read_pkey_reg(); /* for shadow checking */ 1263 } 1264 } 1265 1266 void arch_force_pkey_reg_init(void) 1267 { 1268 #if defined(__i386__) || defined(__x86_64__) /* arch */ 1269 u64 *buf; 1270 1271 /* 1272 * All keys should be allocated and set to allow reads and 1273 * writes, so the register should be all 0. If not, just 1274 * skip the test. 1275 */ 1276 if (read_pkey_reg()) 1277 return; 1278 1279 /* 1280 * Just allocate an absurd about of memory rather than 1281 * doing the XSAVE size enumeration dance. 1282 */ 1283 buf = mmap(NULL, 1*MB, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); 1284 1285 /* These __builtins require compiling with -mxsave */ 1286 1287 /* XSAVE to build a valid buffer: */ 1288 __builtin_ia32_xsave(buf, XSTATE_PKEY); 1289 /* Clear XSTATE_BV[PKRU]: */ 1290 buf[XSTATE_BV_OFFSET/sizeof(u64)] &= ~XSTATE_PKEY; 1291 /* XRSTOR will likely get PKRU back to the init state: */ 1292 __builtin_ia32_xrstor(buf, XSTATE_PKEY); 1293 1294 munmap(buf, 1*MB); 1295 #endif 1296 } 1297 1298 1299 /* 1300 * This is mostly useless on ppc for now. But it will not 1301 * hurt anything and should give some better coverage as 1302 * a long-running test that continually checks the pkey 1303 * register. 1304 */ 1305 void test_pkey_init_state(int *ptr, u16 pkey) 1306 { 1307 int err; 1308 int allocated_pkeys[NR_PKEYS] = {0}; 1309 int nr_allocated_pkeys = 0; 1310 int i; 1311 1312 for (i = 0; i < NR_PKEYS; i++) { 1313 int new_pkey = alloc_pkey(); 1314 1315 if (new_pkey < 0) 1316 continue; 1317 allocated_pkeys[nr_allocated_pkeys++] = new_pkey; 1318 } 1319 1320 dprintf3("%s()::%d\n", __func__, __LINE__); 1321 1322 arch_force_pkey_reg_init(); 1323 1324 /* 1325 * Loop for a bit, hoping to get exercise the kernel 1326 * context switch code. 1327 */ 1328 for (i = 0; i < 1000000; i++) 1329 read_pkey_reg(); 1330 1331 for (i = 0; i < nr_allocated_pkeys; i++) { 1332 err = sys_pkey_free(allocated_pkeys[i]); 1333 pkey_assert(!err); 1334 read_pkey_reg(); /* for shadow checking */ 1335 } 1336 } 1337 1338 /* 1339 * pkey 0 is special. It is allocated by default, so you do not 1340 * have to call pkey_alloc() to use it first. Make sure that it 1341 * is usable. 1342 */ 1343 void test_mprotect_with_pkey_0(int *ptr, u16 pkey) 1344 { 1345 long size; 1346 int prot; 1347 1348 assert(pkey_last_malloc_record); 1349 size = pkey_last_malloc_record->size; 1350 /* 1351 * This is a bit of a hack. But mprotect() requires 1352 * huge-page-aligned sizes when operating on hugetlbfs. 1353 * So, make sure that we use something that's a multiple 1354 * of a huge page when we can. 1355 */ 1356 if (size >= HPAGE_SIZE) 1357 size = HPAGE_SIZE; 1358 prot = pkey_last_malloc_record->prot; 1359 1360 /* Use pkey 0 */ 1361 mprotect_pkey(ptr, size, prot, 0); 1362 1363 /* Make sure that we can set it back to the original pkey. */ 1364 mprotect_pkey(ptr, size, prot, pkey); 1365 } 1366 1367 void test_ptrace_of_child(int *ptr, u16 pkey) 1368 { 1369 __attribute__((__unused__)) int peek_result; 1370 pid_t child_pid; 1371 void *ignored = 0; 1372 long ret; 1373 int status; 1374 /* 1375 * This is the "control" for our little expermient. Make sure 1376 * we can always access it when ptracing. 1377 */ 1378 int *plain_ptr_unaligned = malloc(HPAGE_SIZE); 1379 int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE); 1380 1381 /* 1382 * Fork a child which is an exact copy of this process, of course. 1383 * That means we can do all of our tests via ptrace() and then plain 1384 * memory access and ensure they work differently. 1385 */ 1386 child_pid = fork_lazy_child(); 1387 dprintf1("[%d] child pid: %d\n", getpid(), child_pid); 1388 1389 ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored); 1390 if (ret) 1391 perror("attach"); 1392 dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__); 1393 pkey_assert(ret != -1); 1394 ret = waitpid(child_pid, &status, WUNTRACED); 1395 if ((ret != child_pid) || !(WIFSTOPPED(status))) { 1396 fprintf(stderr, "weird waitpid result %ld stat %x\n", 1397 ret, status); 1398 pkey_assert(0); 1399 } 1400 dprintf2("waitpid ret: %ld\n", ret); 1401 dprintf2("waitpid status: %d\n", status); 1402 1403 pkey_access_deny(pkey); 1404 pkey_write_deny(pkey); 1405 1406 /* Write access, untested for now: 1407 ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data); 1408 pkey_assert(ret != -1); 1409 dprintf1("poke at %p: %ld\n", peek_at, ret); 1410 */ 1411 1412 /* 1413 * Try to access the pkey-protected "ptr" via ptrace: 1414 */ 1415 ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored); 1416 /* expect it to work, without an error: */ 1417 pkey_assert(ret != -1); 1418 /* Now access from the current task, and expect an exception: */ 1419 peek_result = read_ptr(ptr); 1420 expected_pkey_fault(pkey); 1421 1422 /* 1423 * Try to access the NON-pkey-protected "plain_ptr" via ptrace: 1424 */ 1425 ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored); 1426 /* expect it to work, without an error: */ 1427 pkey_assert(ret != -1); 1428 /* Now access from the current task, and expect NO exception: */ 1429 peek_result = read_ptr(plain_ptr); 1430 do_not_expect_pkey_fault("read plain pointer after ptrace"); 1431 1432 ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0); 1433 pkey_assert(ret != -1); 1434 1435 ret = kill(child_pid, SIGKILL); 1436 pkey_assert(ret != -1); 1437 1438 wait(&status); 1439 1440 free(plain_ptr_unaligned); 1441 } 1442 1443 void *get_pointer_to_instructions(void) 1444 { 1445 void *p1; 1446 1447 p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE); 1448 dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write); 1449 /* lots_o_noops_around_write should be page-aligned already */ 1450 assert(p1 == &lots_o_noops_around_write); 1451 1452 /* Point 'p1' at the *second* page of the function: */ 1453 p1 += PAGE_SIZE; 1454 1455 /* 1456 * Try to ensure we fault this in on next touch to ensure 1457 * we get an instruction fault as opposed to a data one 1458 */ 1459 madvise(p1, PAGE_SIZE, MADV_DONTNEED); 1460 1461 return p1; 1462 } 1463 1464 void test_executing_on_unreadable_memory(int *ptr, u16 pkey) 1465 { 1466 void *p1; 1467 int scratch; 1468 int ptr_contents; 1469 int ret; 1470 1471 p1 = get_pointer_to_instructions(); 1472 lots_o_noops_around_write(&scratch); 1473 ptr_contents = read_ptr(p1); 1474 dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); 1475 1476 ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey); 1477 pkey_assert(!ret); 1478 pkey_access_deny(pkey); 1479 1480 dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); 1481 1482 /* 1483 * Make sure this is an *instruction* fault 1484 */ 1485 madvise(p1, PAGE_SIZE, MADV_DONTNEED); 1486 lots_o_noops_around_write(&scratch); 1487 do_not_expect_pkey_fault("executing on PROT_EXEC memory"); 1488 expect_fault_on_read_execonly_key(p1, pkey); 1489 1490 // Reset back to PROT_EXEC | PROT_READ for architectures that support 1491 // non-PKEY execute-only permissions. 1492 ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC | PROT_READ, (u64)pkey); 1493 pkey_assert(!ret); 1494 } 1495 1496 void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) 1497 { 1498 void *p1; 1499 int scratch; 1500 int ptr_contents; 1501 int ret; 1502 1503 dprintf1("%s() start\n", __func__); 1504 1505 p1 = get_pointer_to_instructions(); 1506 lots_o_noops_around_write(&scratch); 1507 ptr_contents = read_ptr(p1); 1508 dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); 1509 1510 /* Use a *normal* mprotect(), not mprotect_pkey(): */ 1511 ret = mprotect(p1, PAGE_SIZE, PROT_EXEC); 1512 pkey_assert(!ret); 1513 1514 /* 1515 * Reset the shadow, assuming that the above mprotect() 1516 * correctly changed PKRU, but to an unknown value since 1517 * the actual allocated pkey is unknown. 1518 */ 1519 shadow_pkey_reg = __read_pkey_reg(); 1520 1521 dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); 1522 1523 /* Make sure this is an *instruction* fault */ 1524 madvise(p1, PAGE_SIZE, MADV_DONTNEED); 1525 lots_o_noops_around_write(&scratch); 1526 do_not_expect_pkey_fault("executing on PROT_EXEC memory"); 1527 expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY); 1528 1529 /* 1530 * Put the memory back to non-PROT_EXEC. Should clear the 1531 * exec-only pkey off the VMA and allow it to be readable 1532 * again. Go to PROT_NONE first to check for a kernel bug 1533 * that did not clear the pkey when doing PROT_NONE. 1534 */ 1535 ret = mprotect(p1, PAGE_SIZE, PROT_NONE); 1536 pkey_assert(!ret); 1537 1538 ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC); 1539 pkey_assert(!ret); 1540 ptr_contents = read_ptr(p1); 1541 do_not_expect_pkey_fault("plain read on recently PROT_EXEC area"); 1542 } 1543 1544 #if defined(__i386__) || defined(__x86_64__) 1545 void test_ptrace_modifies_pkru(int *ptr, u16 pkey) 1546 { 1547 u32 new_pkru; 1548 pid_t child; 1549 int status, ret; 1550 int pkey_offset = pkey_reg_xstate_offset(); 1551 size_t xsave_size = cpu_max_xsave_size(); 1552 void *xsave; 1553 u32 *pkey_register; 1554 u64 *xstate_bv; 1555 struct iovec iov; 1556 1557 new_pkru = ~read_pkey_reg(); 1558 /* Don't make PROT_EXEC mappings inaccessible */ 1559 new_pkru &= ~3; 1560 1561 child = fork(); 1562 pkey_assert(child >= 0); 1563 dprintf3("[%d] fork() ret: %d\n", getpid(), child); 1564 if (!child) { 1565 ptrace(PTRACE_TRACEME, 0, 0, 0); 1566 /* Stop and allow the tracer to modify PKRU directly */ 1567 raise(SIGSTOP); 1568 1569 /* 1570 * need __read_pkey_reg() version so we do not do shadow_pkey_reg 1571 * checking 1572 */ 1573 if (__read_pkey_reg() != new_pkru) 1574 exit(1); 1575 1576 /* Stop and allow the tracer to clear XSTATE_BV for PKRU */ 1577 raise(SIGSTOP); 1578 1579 if (__read_pkey_reg() != 0) 1580 exit(1); 1581 1582 /* Stop and allow the tracer to examine PKRU */ 1583 raise(SIGSTOP); 1584 1585 exit(0); 1586 } 1587 1588 pkey_assert(child == waitpid(child, &status, 0)); 1589 dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); 1590 pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); 1591 1592 xsave = (void *)malloc(xsave_size); 1593 pkey_assert(xsave > 0); 1594 1595 /* Modify the PKRU register directly */ 1596 iov.iov_base = xsave; 1597 iov.iov_len = xsave_size; 1598 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1599 pkey_assert(ret == 0); 1600 1601 pkey_register = (u32 *)(xsave + pkey_offset); 1602 pkey_assert(*pkey_register == read_pkey_reg()); 1603 1604 *pkey_register = new_pkru; 1605 1606 ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1607 pkey_assert(ret == 0); 1608 1609 /* Test that the modification is visible in ptrace before any execution */ 1610 memset(xsave, 0xCC, xsave_size); 1611 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1612 pkey_assert(ret == 0); 1613 pkey_assert(*pkey_register == new_pkru); 1614 1615 /* Execute the tracee */ 1616 ret = ptrace(PTRACE_CONT, child, 0, 0); 1617 pkey_assert(ret == 0); 1618 1619 /* Test that the tracee saw the PKRU value change */ 1620 pkey_assert(child == waitpid(child, &status, 0)); 1621 dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); 1622 pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); 1623 1624 /* Test that the modification is visible in ptrace after execution */ 1625 memset(xsave, 0xCC, xsave_size); 1626 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1627 pkey_assert(ret == 0); 1628 pkey_assert(*pkey_register == new_pkru); 1629 1630 /* Clear the PKRU bit from XSTATE_BV */ 1631 xstate_bv = (u64 *)(xsave + 512); 1632 *xstate_bv &= ~(1 << 9); 1633 1634 ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1635 pkey_assert(ret == 0); 1636 1637 /* Test that the modification is visible in ptrace before any execution */ 1638 memset(xsave, 0xCC, xsave_size); 1639 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1640 pkey_assert(ret == 0); 1641 pkey_assert(*pkey_register == 0); 1642 1643 ret = ptrace(PTRACE_CONT, child, 0, 0); 1644 pkey_assert(ret == 0); 1645 1646 /* Test that the tracee saw the PKRU value go to 0 */ 1647 pkey_assert(child == waitpid(child, &status, 0)); 1648 dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); 1649 pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); 1650 1651 /* Test that the modification is visible in ptrace after execution */ 1652 memset(xsave, 0xCC, xsave_size); 1653 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1654 pkey_assert(ret == 0); 1655 pkey_assert(*pkey_register == 0); 1656 1657 ret = ptrace(PTRACE_CONT, child, 0, 0); 1658 pkey_assert(ret == 0); 1659 pkey_assert(child == waitpid(child, &status, 0)); 1660 dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); 1661 pkey_assert(WIFEXITED(status)); 1662 pkey_assert(WEXITSTATUS(status) == 0); 1663 free(xsave); 1664 } 1665 #endif 1666 1667 #if defined(__aarch64__) 1668 void test_ptrace_modifies_pkru(int *ptr, u16 pkey) 1669 { 1670 pid_t child; 1671 int status, ret; 1672 struct iovec iov; 1673 u64 trace_pkey; 1674 /* Just a random pkey value.. */ 1675 u64 new_pkey = (POE_X << PKEY_BITS_PER_PKEY * 2) | 1676 (POE_NONE << PKEY_BITS_PER_PKEY) | 1677 POE_RWX; 1678 1679 child = fork(); 1680 pkey_assert(child >= 0); 1681 dprintf3("[%d] fork() ret: %d\n", getpid(), child); 1682 if (!child) { 1683 ptrace(PTRACE_TRACEME, 0, 0, 0); 1684 1685 /* Stop and allow the tracer to modify PKRU directly */ 1686 raise(SIGSTOP); 1687 1688 /* 1689 * need __read_pkey_reg() version so we do not do shadow_pkey_reg 1690 * checking 1691 */ 1692 if (__read_pkey_reg() != new_pkey) 1693 exit(1); 1694 1695 raise(SIGSTOP); 1696 1697 exit(0); 1698 } 1699 1700 pkey_assert(child == waitpid(child, &status, 0)); 1701 dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); 1702 pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); 1703 1704 iov.iov_base = &trace_pkey; 1705 iov.iov_len = 8; 1706 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov); 1707 pkey_assert(ret == 0); 1708 pkey_assert(trace_pkey == read_pkey_reg()); 1709 1710 trace_pkey = new_pkey; 1711 1712 ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_ARM_POE, &iov); 1713 pkey_assert(ret == 0); 1714 1715 /* Test that the modification is visible in ptrace before any execution */ 1716 memset(&trace_pkey, 0, sizeof(trace_pkey)); 1717 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov); 1718 pkey_assert(ret == 0); 1719 pkey_assert(trace_pkey == new_pkey); 1720 1721 /* Execute the tracee */ 1722 ret = ptrace(PTRACE_CONT, child, 0, 0); 1723 pkey_assert(ret == 0); 1724 1725 /* Test that the tracee saw the PKRU value change */ 1726 pkey_assert(child == waitpid(child, &status, 0)); 1727 dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); 1728 pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); 1729 1730 /* Test that the modification is visible in ptrace after execution */ 1731 memset(&trace_pkey, 0, sizeof(trace_pkey)); 1732 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov); 1733 pkey_assert(ret == 0); 1734 pkey_assert(trace_pkey == new_pkey); 1735 1736 ret = ptrace(PTRACE_CONT, child, 0, 0); 1737 pkey_assert(ret == 0); 1738 pkey_assert(child == waitpid(child, &status, 0)); 1739 dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); 1740 pkey_assert(WIFEXITED(status)); 1741 pkey_assert(WEXITSTATUS(status) == 0); 1742 } 1743 #endif 1744 1745 void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) 1746 { 1747 int size = PAGE_SIZE; 1748 int sret; 1749 1750 if (cpu_has_pkeys()) { 1751 dprintf1("SKIP: %s: no CPU support\n", __func__); 1752 return; 1753 } 1754 1755 sret = syscall(__NR_pkey_mprotect, ptr, size, PROT_READ, pkey); 1756 pkey_assert(sret < 0); 1757 } 1758 1759 void (*pkey_tests[])(int *ptr, u16 pkey) = { 1760 test_read_of_write_disabled_region, 1761 test_read_of_access_disabled_region, 1762 test_read_of_access_disabled_region_with_page_already_mapped, 1763 test_write_of_write_disabled_region, 1764 test_write_of_write_disabled_region_with_page_already_mapped, 1765 test_write_of_access_disabled_region, 1766 test_write_of_access_disabled_region_with_page_already_mapped, 1767 test_kernel_write_of_access_disabled_region, 1768 test_kernel_write_of_write_disabled_region, 1769 test_kernel_gup_of_access_disabled_region, 1770 test_kernel_gup_write_to_write_disabled_region, 1771 test_executing_on_unreadable_memory, 1772 test_implicit_mprotect_exec_only_memory, 1773 test_mprotect_with_pkey_0, 1774 test_ptrace_of_child, 1775 test_pkey_init_state, 1776 test_pkey_syscalls_on_non_allocated_pkey, 1777 test_pkey_syscalls_bad_args, 1778 test_pkey_alloc_exhaust, 1779 test_pkey_alloc_free_attach_pkey0, 1780 #if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__) 1781 test_ptrace_modifies_pkru, 1782 #endif 1783 }; 1784 1785 void run_tests_once(void) 1786 { 1787 int *ptr; 1788 int prot = PROT_READ|PROT_WRITE; 1789 1790 for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) { 1791 int pkey; 1792 int orig_pkey_faults = pkey_faults; 1793 1794 dprintf1("======================\n"); 1795 dprintf1("test %d preparing...\n", test_nr); 1796 1797 tracing_on(); 1798 pkey = alloc_random_pkey(); 1799 dprintf1("test %d starting with pkey: %d\n", test_nr, pkey); 1800 ptr = malloc_pkey(PAGE_SIZE, prot, pkey); 1801 dprintf1("test %d starting...\n", test_nr); 1802 pkey_tests[test_nr](ptr, pkey); 1803 dprintf1("freeing test memory: %p\n", ptr); 1804 free_pkey_malloc(ptr); 1805 sys_pkey_free(pkey); 1806 1807 dprintf1("pkey_faults: %d\n", pkey_faults); 1808 dprintf1("orig_pkey_faults: %d\n", orig_pkey_faults); 1809 1810 tracing_off(); 1811 close_test_fds(); 1812 1813 printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr); 1814 dprintf1("======================\n\n"); 1815 } 1816 iteration_nr++; 1817 } 1818 1819 void pkey_setup_shadow(void) 1820 { 1821 shadow_pkey_reg = __read_pkey_reg(); 1822 } 1823 1824 int main(void) 1825 { 1826 int nr_iterations = 22; 1827 int pkeys_supported = is_pkeys_supported(); 1828 1829 srand((unsigned int)time(NULL)); 1830 1831 setup_handlers(); 1832 1833 printf("has pkeys: %d\n", pkeys_supported); 1834 1835 if (!pkeys_supported) { 1836 int size = PAGE_SIZE; 1837 int *ptr; 1838 1839 printf("running PKEY tests for unsupported CPU/OS\n"); 1840 1841 ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); 1842 assert(ptr != (void *)-1); 1843 test_mprotect_pkey_on_unsupported_cpu(ptr, 1); 1844 exit(0); 1845 } 1846 1847 pkey_setup_shadow(); 1848 printf("startup pkey_reg: %016llx\n", read_pkey_reg()); 1849 setup_hugetlbfs(); 1850 1851 while (nr_iterations-- > 0) 1852 run_tests_once(); 1853 1854 printf("done (all tests OK)\n"); 1855 return 0; 1856 } 1857