1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst) 4 * 5 * There are examples in here of: 6 * * how to set protection keys on memory 7 * * how to set/clear bits in pkey registers (the rights register) 8 * * how to handle SEGV_PKUERR signals and extract pkey-relevant 9 * information from the siginfo 10 * 11 * Things to add: 12 * make sure KSM and KSM COW breaking works 13 * prefault pages in at malloc, or not 14 * protect MPX bounds tables with protection keys? 15 * make sure VMA splitting/merging is working correctly 16 * OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys 17 * look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel 18 * do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks 19 * 20 * Compile like this: 21 * gcc -mxsave -o protection_keys -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm 22 * gcc -mxsave -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm 23 */ 24 #define _GNU_SOURCE 25 #define __SANE_USERSPACE_TYPES__ 26 #include <errno.h> 27 #include <linux/elf.h> 28 #include <linux/futex.h> 29 #include <time.h> 30 #include <sys/time.h> 31 #include <sys/syscall.h> 32 #include <string.h> 33 #include <stdio.h> 34 #include <stdint.h> 35 #include <stdbool.h> 36 #include <signal.h> 37 #include <assert.h> 38 #include <stdlib.h> 39 #include <ucontext.h> 40 #include <sys/mman.h> 41 #include <sys/types.h> 42 #include <sys/wait.h> 43 #include <sys/stat.h> 44 #include <fcntl.h> 45 #include <unistd.h> 46 #include <sys/ptrace.h> 47 #include <setjmp.h> 48 49 #include "hugepage_settings.h" 50 #include "pkey-helpers.h" 51 52 int iteration_nr = 1; 53 int test_nr; 54 55 u64 shadow_pkey_reg; 56 int dprint_in_signal; 57 58 noinline int read_ptr(int *ptr) 59 { 60 /* Keep GCC from optimizing this away somehow */ 61 barrier(); 62 return *ptr; 63 } 64 65 #if CONTROL_TRACING > 0 66 static void cat_into_file(char *str, char *file) 67 { 68 int fd = open(file, O_RDWR); 69 int ret; 70 71 dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file); 72 /* 73 * these need to be raw because they are called under 74 * pkey_assert() 75 */ 76 if (fd < 0) { 77 fprintf(stderr, "error opening '%s'\n", str); 78 perror("error: "); 79 exit(__LINE__); 80 } 81 82 ret = write(fd, str, strlen(str)); 83 if (ret != strlen(str)) { 84 perror("write to file failed"); 85 fprintf(stderr, "filename: '%s' str: '%s'\n", file, str); 86 exit(__LINE__); 87 } 88 close(fd); 89 } 90 91 static int warned_tracing; 92 static int tracing_root_ok(void) 93 { 94 if (geteuid() != 0) { 95 if (!warned_tracing) 96 fprintf(stderr, "WARNING: not run as root, " 97 "can not do tracing control\n"); 98 warned_tracing = 1; 99 return 0; 100 } 101 return 1; 102 } 103 #endif 104 105 static void tracing_on(void) 106 { 107 #if CONTROL_TRACING > 0 108 #define TRACEDIR "/sys/kernel/tracing" 109 char pidstr[32]; 110 111 if (!tracing_root_ok()) 112 return; 113 114 sprintf(pidstr, "%d", getpid()); 115 cat_into_file("0", TRACEDIR "/tracing_on"); 116 cat_into_file("\n", TRACEDIR "/trace"); 117 if (1) { 118 cat_into_file("function_graph", TRACEDIR "/current_tracer"); 119 cat_into_file("1", TRACEDIR "/options/funcgraph-proc"); 120 } else { 121 cat_into_file("nop", TRACEDIR "/current_tracer"); 122 } 123 cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid"); 124 cat_into_file("1", TRACEDIR "/tracing_on"); 125 dprintf1("enabled tracing\n"); 126 #endif 127 } 128 129 static void tracing_off(void) 130 { 131 #if CONTROL_TRACING > 0 132 if (!tracing_root_ok()) 133 return; 134 cat_into_file("0", "/sys/kernel/tracing/tracing_on"); 135 #endif 136 } 137 138 void abort_hooks(void) 139 { 140 fflush(stdout); 141 fprintf(stderr, "running %s()...\n", __func__); 142 tracing_off(); 143 #ifdef SLEEP_ON_ABORT 144 sleep(SLEEP_ON_ABORT); 145 #endif 146 } 147 148 /* 149 * This attempts to have roughly a page of instructions followed by a few 150 * instructions that do a write, and another page of instructions. That 151 * way, we are pretty sure that the write is in the second page of 152 * instructions and has at least a page of padding behind it. 153 * 154 * *That* lets us be sure to madvise() away the write instruction, which 155 * will then fault, which makes sure that the fault code handles 156 * execute-only memory properly. 157 */ 158 #if defined(__powerpc64__) || defined(__aarch64__) 159 /* This way, both 4K and 64K alignment are maintained */ 160 __attribute__((__aligned__(65536))) 161 #else 162 __attribute__((__aligned__(PAGE_SIZE))) 163 #endif 164 static void lots_o_noops_around_write(int *write_to_me) 165 { 166 dprintf3("running %s()\n", __func__); 167 __page_o_noops(); 168 /* Assume this happens in the second page of instructions: */ 169 *write_to_me = __LINE__; 170 /* pad out by another page: */ 171 __page_o_noops(); 172 dprintf3("%s() done\n", __func__); 173 } 174 175 static void dump_mem(void *dumpme, int len_bytes) 176 { 177 char *c = (void *)dumpme; 178 int i; 179 180 for (i = 0; i < len_bytes; i += sizeof(u64)) { 181 u64 *ptr = (u64 *)(c + i); 182 dprintf1("dump[%03d][@%p]: %016llx\n", i, ptr, *ptr); 183 } 184 } 185 186 static u32 hw_pkey_get(int pkey, unsigned long flags) 187 { 188 u64 pkey_reg = __read_pkey_reg(); 189 190 dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", 191 __func__, pkey, flags, 0, 0); 192 dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg); 193 194 return (u32) get_pkey_bits(pkey_reg, pkey); 195 } 196 197 static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) 198 { 199 u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); 200 u64 old_pkey_reg = __read_pkey_reg(); 201 u64 new_pkey_reg; 202 203 /* make sure that 'rights' only contains the bits we expect: */ 204 assert(!(rights & ~mask)); 205 206 /* modify bits accordingly in old pkey_reg and assign it */ 207 new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights); 208 209 __write_pkey_reg(new_pkey_reg); 210 211 dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x" 212 " pkey_reg now: %016llx old_pkey_reg: %016llx\n", 213 __func__, pkey, rights, flags, 0, __read_pkey_reg(), 214 old_pkey_reg); 215 return 0; 216 } 217 218 static void pkey_disable_set(int pkey, int flags) 219 { 220 unsigned long syscall_flags = 0; 221 int ret; 222 int pkey_rights; 223 224 dprintf1("START->%s(%d, 0x%x)\n", __func__, 225 pkey, flags); 226 pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); 227 228 pkey_rights = hw_pkey_get(pkey, syscall_flags); 229 230 dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, 231 pkey, pkey, pkey_rights); 232 233 pkey_assert(pkey_rights >= 0); 234 235 pkey_rights |= flags; 236 237 ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); 238 assert(!ret); 239 /* pkey_reg and flags have the same format */ 240 shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); 241 dprintf1("%s(%d) shadow: 0x%016llx\n", 242 __func__, pkey, shadow_pkey_reg); 243 244 pkey_assert(ret >= 0); 245 246 pkey_rights = hw_pkey_get(pkey, syscall_flags); 247 dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, 248 pkey, pkey, pkey_rights); 249 250 dprintf1("%s(%d) pkey_reg: 0x%016llx\n", 251 __func__, pkey, read_pkey_reg()); 252 dprintf1("END<---%s(%d, 0x%x)\n", __func__, 253 pkey, flags); 254 } 255 256 static void pkey_disable_clear(int pkey, int flags) 257 { 258 unsigned long syscall_flags = 0; 259 int ret; 260 int pkey_rights = hw_pkey_get(pkey, syscall_flags); 261 262 pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); 263 264 dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, 265 pkey, pkey, pkey_rights); 266 pkey_assert(pkey_rights >= 0); 267 268 pkey_rights &= ~flags; 269 270 ret = hw_pkey_set(pkey, pkey_rights, 0); 271 shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); 272 pkey_assert(ret >= 0); 273 274 pkey_rights = hw_pkey_get(pkey, syscall_flags); 275 dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, 276 pkey, pkey, pkey_rights); 277 278 dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, 279 pkey, read_pkey_reg()); 280 } 281 282 __maybe_unused static void pkey_write_allow(int pkey) 283 { 284 pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); 285 } 286 __maybe_unused static void pkey_write_deny(int pkey) 287 { 288 pkey_disable_set(pkey, PKEY_DISABLE_WRITE); 289 } 290 __maybe_unused static void pkey_access_allow(int pkey) 291 { 292 pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); 293 } 294 __maybe_unused static void pkey_access_deny(int pkey) 295 { 296 pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); 297 } 298 299 static char *si_code_str(int si_code) 300 { 301 if (si_code == SEGV_MAPERR) 302 return "SEGV_MAPERR"; 303 if (si_code == SEGV_ACCERR) 304 return "SEGV_ACCERR"; 305 if (si_code == SEGV_BNDERR) 306 return "SEGV_BNDERR"; 307 if (si_code == SEGV_PKUERR) 308 return "SEGV_PKUERR"; 309 return "UNKNOWN"; 310 } 311 312 static int pkey_faults; 313 static int last_si_pkey = -1; 314 static void signal_handler(int signum, siginfo_t *si, void *vucontext) 315 { 316 ucontext_t *uctxt = vucontext; 317 int trapno; 318 unsigned long ip; 319 #ifdef MCONTEXT_FPREGS 320 char *fpregs; 321 #endif 322 #if defined(__i386__) || defined(__x86_64__) /* arch */ 323 u32 *pkey_reg_ptr; 324 int pkey_reg_offset; 325 #endif /* arch */ 326 u64 siginfo_pkey; 327 u32 *si_pkey_ptr; 328 329 dprint_in_signal = 1; 330 dprintf1(">>>>===============SIGSEGV============================\n"); 331 dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", 332 __func__, __LINE__, 333 __read_pkey_reg(), shadow_pkey_reg); 334 335 trapno = MCONTEXT_TRAPNO(uctxt->uc_mcontext); 336 ip = MCONTEXT_IP(uctxt->uc_mcontext); 337 #ifdef MCONTEXT_FPREGS 338 fpregs = (char *) uctxt->uc_mcontext.fpregs; 339 #endif 340 341 dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n", 342 __func__, trapno, ip, si_code_str(si->si_code), 343 si->si_code); 344 345 #if defined(__i386__) || defined(__x86_64__) /* arch */ 346 #ifdef __i386__ 347 /* 348 * 32-bit has some extra padding so that userspace can tell whether 349 * the XSTATE header is present in addition to the "legacy" FPU 350 * state. We just assume that it is here. 351 */ 352 fpregs += 0x70; 353 #endif /* i386 */ 354 pkey_reg_offset = pkey_reg_xstate_offset(); 355 pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]); 356 357 /* 358 * If we got a PKEY fault, we *HAVE* to have at least one bit set in 359 * here. 360 */ 361 dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset()); 362 if (DEBUG_LEVEL > 4) 363 dump_mem(pkey_reg_ptr - 128, 256); 364 pkey_assert(*pkey_reg_ptr); 365 #endif /* arch */ 366 367 dprintf1("siginfo: %p\n", si); 368 #ifdef MCONTEXT_FPREGS 369 dprintf1(" fpregs: %p\n", fpregs); 370 #endif 371 372 if ((si->si_code == SEGV_MAPERR) || 373 (si->si_code == SEGV_ACCERR) || 374 (si->si_code == SEGV_BNDERR)) { 375 dprintf0("# non-PK si_code: %d, exiting...\n", si->si_code); 376 exit(1); 377 } 378 379 si_pkey_ptr = siginfo_get_pkey_ptr(si); 380 dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr); 381 dump_mem((u8 *)si_pkey_ptr - 8, 24); 382 siginfo_pkey = *si_pkey_ptr; 383 pkey_assert(siginfo_pkey < NR_PKEYS); 384 last_si_pkey = siginfo_pkey; 385 386 /* 387 * need __read_pkey_reg() version so we do not do shadow_pkey_reg 388 * checking 389 */ 390 dprintf1("signal pkey_reg from pkey_reg: %016llx\n", 391 __read_pkey_reg()); 392 dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey); 393 #if defined(__i386__) || defined(__x86_64__) /* arch */ 394 dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr); 395 *(u64 *)pkey_reg_ptr = 0x00000000; 396 dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n"); 397 #elif defined(__powerpc64__) /* arch */ 398 /* restore access and let the faulting instruction continue */ 399 pkey_access_allow(siginfo_pkey); 400 #elif defined(__aarch64__) 401 aarch64_write_signal_pkey(uctxt, PKEY_REG_ALLOW_ALL); 402 #endif /* arch */ 403 pkey_faults++; 404 dprintf1("<<<<==================================================\n"); 405 dprint_in_signal = 0; 406 } 407 408 static void sig_chld(int x) 409 { 410 dprint_in_signal = 1; 411 dprintf2("[%d] SIGCHLD: %d\n", getpid(), x); 412 dprint_in_signal = 0; 413 } 414 415 static void setup_sigsegv_handler(void) 416 { 417 int r, rs; 418 struct sigaction newact; 419 struct sigaction oldact; 420 421 /* #PF is mapped to sigsegv */ 422 int signum = SIGSEGV; 423 424 newact.sa_handler = 0; 425 newact.sa_sigaction = signal_handler; 426 427 /*sigset_t - signals to block while in the handler */ 428 /* get the old signal mask. */ 429 rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask); 430 pkey_assert(rs == 0); 431 432 /* call sa_sigaction, not sa_handler*/ 433 newact.sa_flags = SA_SIGINFO; 434 435 newact.sa_restorer = 0; /* void(*)(), obsolete */ 436 r = sigaction(signum, &newact, &oldact); 437 r = sigaction(SIGALRM, &newact, &oldact); 438 pkey_assert(r == 0); 439 } 440 441 static void setup_handlers(void) 442 { 443 signal(SIGCHLD, &sig_chld); 444 setup_sigsegv_handler(); 445 } 446 447 static pid_t fork_lazy_child(void) 448 { 449 pid_t forkret; 450 451 forkret = fork(); 452 pkey_assert(forkret >= 0); 453 dprintf3("[%d] fork() ret: %d\n", getpid(), forkret); 454 455 if (!forkret) { 456 /* in the child */ 457 while (1) { 458 dprintf1("child sleeping...\n"); 459 sleep(30); 460 } 461 } 462 return forkret; 463 } 464 465 static int alloc_pkey(void) 466 { 467 int ret; 468 unsigned long init_val = PKEY_UNRESTRICTED; 469 470 dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", 471 __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg); 472 ret = sys_pkey_alloc(0, init_val); 473 /* 474 * pkey_alloc() sets PKEY register, so we need to reflect it in 475 * shadow_pkey_reg: 476 */ 477 dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" 478 " shadow: 0x%016llx\n", 479 __func__, __LINE__, ret, __read_pkey_reg(), 480 shadow_pkey_reg); 481 if (ret > 0) { 482 /* clear both the bits: */ 483 shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, 484 ~PKEY_MASK); 485 dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" 486 " shadow: 0x%016llx\n", 487 __func__, 488 __LINE__, ret, __read_pkey_reg(), 489 shadow_pkey_reg); 490 /* 491 * move the new state in from init_val 492 * (remember, we cheated and init_val == pkey_reg format) 493 */ 494 shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, 495 init_val); 496 } 497 dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" 498 " shadow: 0x%016llx\n", 499 __func__, __LINE__, ret, __read_pkey_reg(), 500 shadow_pkey_reg); 501 dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno); 502 /* for shadow checking: */ 503 read_pkey_reg(); 504 dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" 505 " shadow: 0x%016llx\n", 506 __func__, __LINE__, ret, __read_pkey_reg(), 507 shadow_pkey_reg); 508 return ret; 509 } 510 511 /* 512 * I had a bug where pkey bits could be set by mprotect() but 513 * not cleared. This ensures we get lots of random bit sets 514 * and clears on the vma and pte pkey bits. 515 */ 516 static int alloc_random_pkey(void) 517 { 518 int max_nr_pkey_allocs; 519 int ret; 520 int i; 521 int alloced_pkeys[NR_PKEYS]; 522 int nr_alloced = 0; 523 int random_index; 524 memset(alloced_pkeys, 0, sizeof(alloced_pkeys)); 525 526 /* allocate every possible key and make a note of which ones we got */ 527 max_nr_pkey_allocs = NR_PKEYS; 528 for (i = 0; i < max_nr_pkey_allocs; i++) { 529 int new_pkey = alloc_pkey(); 530 if (new_pkey < 0) 531 break; 532 alloced_pkeys[nr_alloced++] = new_pkey; 533 } 534 535 pkey_assert(nr_alloced > 0); 536 /* select a random one out of the allocated ones */ 537 random_index = rand() % nr_alloced; 538 ret = alloced_pkeys[random_index]; 539 /* now zero it out so we don't free it next */ 540 alloced_pkeys[random_index] = 0; 541 542 /* go through the allocated ones that we did not want and free them */ 543 for (i = 0; i < nr_alloced; i++) { 544 int free_ret; 545 if (!alloced_pkeys[i]) 546 continue; 547 free_ret = sys_pkey_free(alloced_pkeys[i]); 548 pkey_assert(!free_ret); 549 } 550 dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" 551 " shadow: 0x%016llx\n", __func__, 552 __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); 553 return ret; 554 } 555 556 int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, 557 unsigned long pkey) 558 { 559 int nr_iterations = random() % 100; 560 int ret; 561 562 while (nr_iterations-- >= 0) { 563 int rpkey = alloc_random_pkey(); 564 ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); 565 dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", 566 ptr, size, orig_prot, pkey, ret); 567 568 dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" 569 " shadow: 0x%016llx\n", 570 __func__, __LINE__, ret, __read_pkey_reg(), 571 shadow_pkey_reg); 572 sys_pkey_free(rpkey); 573 dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" 574 " shadow: 0x%016llx\n", 575 __func__, __LINE__, ret, __read_pkey_reg(), 576 shadow_pkey_reg); 577 } 578 pkey_assert(pkey < NR_PKEYS); 579 580 ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); 581 dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", 582 ptr, size, orig_prot, pkey, ret); 583 pkey_assert(!ret); 584 dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" 585 " shadow: 0x%016llx\n", __func__, 586 __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); 587 return ret; 588 } 589 590 struct pkey_malloc_record { 591 void *ptr; 592 long size; 593 int prot; 594 }; 595 struct pkey_malloc_record *pkey_malloc_records; 596 struct pkey_malloc_record *pkey_last_malloc_record; 597 static long nr_pkey_malloc_records; 598 void record_pkey_malloc(void *ptr, long size, int prot) 599 { 600 long i; 601 struct pkey_malloc_record *rec = NULL; 602 603 for (i = 0; i < nr_pkey_malloc_records; i++) { 604 rec = &pkey_malloc_records[i]; 605 /* find a free record */ 606 if (rec) 607 break; 608 } 609 if (!rec) { 610 /* every record is full */ 611 size_t old_nr_records = nr_pkey_malloc_records; 612 size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1); 613 size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record); 614 dprintf2("new_nr_records: %zd\n", new_nr_records); 615 dprintf2("new_size: %zd\n", new_size); 616 pkey_malloc_records = realloc(pkey_malloc_records, new_size); 617 pkey_assert(pkey_malloc_records != NULL); 618 rec = &pkey_malloc_records[nr_pkey_malloc_records]; 619 /* 620 * realloc() does not initialize memory, so zero it from 621 * the first new record all the way to the end. 622 */ 623 for (i = 0; i < new_nr_records - old_nr_records; i++) 624 memset(rec + i, 0, sizeof(*rec)); 625 } 626 dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n", 627 (int)(rec - pkey_malloc_records), rec, ptr, size); 628 rec->ptr = ptr; 629 rec->size = size; 630 rec->prot = prot; 631 pkey_last_malloc_record = rec; 632 nr_pkey_malloc_records++; 633 } 634 635 static void free_pkey_malloc(void *ptr) 636 { 637 long i; 638 int ret; 639 dprintf3("%s(%p)\n", __func__, ptr); 640 for (i = 0; i < nr_pkey_malloc_records; i++) { 641 struct pkey_malloc_record *rec = &pkey_malloc_records[i]; 642 dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n", 643 ptr, i, rec, rec->ptr, rec->size); 644 if ((ptr < rec->ptr) || 645 (ptr >= rec->ptr + rec->size)) 646 continue; 647 648 dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n", 649 ptr, i, rec, rec->ptr, rec->size); 650 nr_pkey_malloc_records--; 651 ret = munmap(rec->ptr, rec->size); 652 dprintf3("munmap ret: %d\n", ret); 653 pkey_assert(!ret); 654 dprintf3("clearing rec->ptr, rec: %p\n", rec); 655 rec->ptr = NULL; 656 dprintf3("done clearing rec->ptr, rec: %p\n", rec); 657 return; 658 } 659 pkey_assert(false); 660 } 661 662 static void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) 663 { 664 void *ptr; 665 int ret; 666 667 read_pkey_reg(); 668 dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, 669 size, prot, pkey); 670 pkey_assert(pkey < NR_PKEYS); 671 ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); 672 pkey_assert(ptr != (void *)-1); 673 ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); 674 pkey_assert(!ret); 675 record_pkey_malloc(ptr, size, prot); 676 read_pkey_reg(); 677 678 dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); 679 return ptr; 680 } 681 682 static void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) 683 { 684 int ret; 685 void *ptr; 686 687 dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, 688 size, prot, pkey); 689 /* 690 * Guarantee we can fit at least one huge page in the resulting 691 * allocation by allocating space for 2: 692 */ 693 size = ALIGN_UP(size, HPAGE_SIZE * 2); 694 ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); 695 pkey_assert(ptr != (void *)-1); 696 record_pkey_malloc(ptr, size, prot); 697 mprotect_pkey(ptr, size, prot, pkey); 698 699 dprintf1("unaligned ptr: %p\n", ptr); 700 ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE); 701 dprintf1(" aligned ptr: %p\n", ptr); 702 ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE); 703 dprintf1("MADV_HUGEPAGE ret: %d\n", ret); 704 ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED); 705 dprintf1("MADV_WILLNEED ret: %d\n", ret); 706 memset(ptr, 0, HPAGE_SIZE); 707 708 dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr); 709 return ptr; 710 } 711 712 static int hugetlb_setup_ok; 713 #define GET_NR_HUGE_PAGES 10 714 static void setup_hugetlbfs(void) 715 { 716 long hpagesz_mb = HPAGE_SIZE / 1024 / 1024; 717 unsigned long free_pages; 718 719 if (geteuid() != 0) { 720 ksft_print_msg("WARNING: not run as root, can not do hugetlb test\n"); 721 return; 722 } 723 724 /* 725 * Make sure that we got the pages and that they 726 * are PMD-level pages. Someone might have made PUD-level 727 * pages the default. 728 */ 729 hugetlb_save_settings(); 730 hugetlb_set_nr_pages(HPAGE_SIZE, GET_NR_HUGE_PAGES); 731 free_pages = hugetlb_free_pages(HPAGE_SIZE); 732 if (free_pages < GET_NR_HUGE_PAGES) { 733 ksft_print_msg("could not confirm %ldM pages, got: '%lu' expected %d\n", 734 hpagesz_mb, free_pages, GET_NR_HUGE_PAGES); 735 return; 736 } 737 738 hugetlb_setup_ok = 1; 739 } 740 741 static void *malloc_pkey_hugetlb(long size, int prot, u16 pkey) 742 { 743 void *ptr; 744 int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB; 745 746 if (!hugetlb_setup_ok) 747 return PTR_ERR_ENOTSUP; 748 749 dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey); 750 size = ALIGN_UP(size, HPAGE_SIZE * 2); 751 pkey_assert(pkey < NR_PKEYS); 752 ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0); 753 pkey_assert(ptr != (void *)-1); 754 mprotect_pkey(ptr, size, prot, pkey); 755 756 record_pkey_malloc(ptr, size, prot); 757 758 dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr); 759 return ptr; 760 } 761 762 static void *(*pkey_malloc[])(long size, int prot, u16 pkey) = { 763 764 malloc_pkey_with_mprotect, 765 malloc_pkey_with_mprotect_subpage, 766 malloc_pkey_anon_huge, 767 malloc_pkey_hugetlb 768 }; 769 770 static void *malloc_pkey(long size, int prot, u16 pkey) 771 { 772 void *ret; 773 static int malloc_type; 774 int nr_malloc_types = ARRAY_SIZE(pkey_malloc); 775 776 pkey_assert(pkey < NR_PKEYS); 777 778 while (1) { 779 pkey_assert(malloc_type < nr_malloc_types); 780 781 ret = pkey_malloc[malloc_type](size, prot, pkey); 782 pkey_assert(ret != (void *)-1); 783 784 malloc_type++; 785 if (malloc_type >= nr_malloc_types) 786 malloc_type = (random()%nr_malloc_types); 787 788 /* try again if the malloc_type we tried is unsupported */ 789 if (ret == PTR_ERR_ENOTSUP) 790 continue; 791 792 break; 793 } 794 795 dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__, 796 size, prot, pkey, ret); 797 return ret; 798 } 799 800 static int last_pkey_faults; 801 #define UNKNOWN_PKEY -2 802 void expected_pkey_fault(int pkey) 803 { 804 dprintf2("%s(): last_pkey_faults: %d pkey_faults: %d\n", 805 __func__, last_pkey_faults, pkey_faults); 806 dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey); 807 pkey_assert(last_pkey_faults + 1 == pkey_faults); 808 809 /* 810 * For exec-only memory, we do not know the pkey in 811 * advance, so skip this check. 812 */ 813 if (pkey != UNKNOWN_PKEY) 814 pkey_assert(last_si_pkey == pkey); 815 816 #if defined(__i386__) || defined(__x86_64__) /* arch */ 817 /* 818 * The signal handler shold have cleared out PKEY register to let the 819 * test program continue. We now have to restore it. 820 */ 821 if (__read_pkey_reg() != 0) 822 #elif defined(__aarch64__) 823 if (__read_pkey_reg() != PKEY_REG_ALLOW_ALL) 824 #else 825 if (__read_pkey_reg() != shadow_pkey_reg) 826 #endif /* arch */ 827 pkey_assert(0); 828 829 __write_pkey_reg(shadow_pkey_reg); 830 dprintf1("%s() set pkey_reg=%016llx to restore state after signal " 831 "nuked it\n", __func__, shadow_pkey_reg); 832 last_pkey_faults = pkey_faults; 833 last_si_pkey = -1; 834 } 835 836 #define do_not_expect_pkey_fault(msg) do { \ 837 if (last_pkey_faults != pkey_faults) \ 838 dprintf0("# unexpected PKey fault: %s\n", msg); \ 839 pkey_assert(last_pkey_faults == pkey_faults); \ 840 } while (0) 841 842 static int test_fds[10] = { -1 }; 843 static int nr_test_fds; 844 static void __save_test_fd(int fd) 845 { 846 pkey_assert(fd >= 0); 847 pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds)); 848 test_fds[nr_test_fds] = fd; 849 nr_test_fds++; 850 } 851 852 static int get_test_read_fd(void) 853 { 854 int test_fd = open("/etc/passwd", O_RDONLY); 855 __save_test_fd(test_fd); 856 return test_fd; 857 } 858 859 static void close_test_fds(void) 860 { 861 int i; 862 863 for (i = 0; i < nr_test_fds; i++) { 864 if (test_fds[i] < 0) 865 continue; 866 close(test_fds[i]); 867 test_fds[i] = -1; 868 } 869 nr_test_fds = 0; 870 } 871 872 static void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) 873 { 874 int i, err; 875 int max_nr_pkey_allocs; 876 int alloced_pkeys[NR_PKEYS]; 877 int nr_alloced = 0; 878 long size; 879 880 pkey_assert(pkey_last_malloc_record); 881 size = pkey_last_malloc_record->size; 882 /* 883 * This is a bit of a hack. But mprotect() requires 884 * huge-page-aligned sizes when operating on hugetlbfs. 885 * So, make sure that we use something that's a multiple 886 * of a huge page when we can. 887 */ 888 if (size >= HPAGE_SIZE) 889 size = HPAGE_SIZE; 890 891 /* allocate every possible key and make sure key-0 never got allocated */ 892 max_nr_pkey_allocs = NR_PKEYS; 893 for (i = 0; i < max_nr_pkey_allocs; i++) { 894 int new_pkey = alloc_pkey(); 895 pkey_assert(new_pkey != 0); 896 897 if (new_pkey < 0) 898 break; 899 alloced_pkeys[nr_alloced++] = new_pkey; 900 } 901 /* free all the allocated keys */ 902 for (i = 0; i < nr_alloced; i++) { 903 int free_ret; 904 905 if (!alloced_pkeys[i]) 906 continue; 907 free_ret = sys_pkey_free(alloced_pkeys[i]); 908 pkey_assert(!free_ret); 909 } 910 911 /* attach key-0 in various modes */ 912 err = sys_mprotect_pkey(ptr, size, PROT_READ, 0); 913 pkey_assert(!err); 914 err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0); 915 pkey_assert(!err); 916 err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0); 917 pkey_assert(!err); 918 err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0); 919 pkey_assert(!err); 920 err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0); 921 pkey_assert(!err); 922 } 923 924 static void test_read_of_write_disabled_region(int *ptr, u16 pkey) 925 { 926 int ptr_contents; 927 928 dprintf1("disabling write access to PKEY[1], doing read\n"); 929 pkey_write_deny(pkey); 930 ptr_contents = read_ptr(ptr); 931 dprintf1("*ptr: %d\n", ptr_contents); 932 dprintf1("\n"); 933 } 934 static void test_read_of_access_disabled_region(int *ptr, u16 pkey) 935 { 936 int ptr_contents; 937 938 dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr); 939 read_pkey_reg(); 940 pkey_access_deny(pkey); 941 ptr_contents = read_ptr(ptr); 942 dprintf1("*ptr: %d\n", ptr_contents); 943 expected_pkey_fault(pkey); 944 } 945 946 static void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr, 947 u16 pkey) 948 { 949 int ptr_contents; 950 951 dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", 952 pkey, ptr); 953 ptr_contents = read_ptr(ptr); 954 dprintf1("reading ptr before disabling the read : %d\n", 955 ptr_contents); 956 read_pkey_reg(); 957 pkey_access_deny(pkey); 958 ptr_contents = read_ptr(ptr); 959 dprintf1("*ptr: %d\n", ptr_contents); 960 expected_pkey_fault(pkey); 961 } 962 963 static void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr, 964 u16 pkey) 965 { 966 *ptr = __LINE__; 967 dprintf1("disabling write access; after accessing the page, " 968 "to PKEY[%02d], doing write\n", pkey); 969 pkey_write_deny(pkey); 970 *ptr = __LINE__; 971 expected_pkey_fault(pkey); 972 } 973 974 static void test_write_of_write_disabled_region(int *ptr, u16 pkey) 975 { 976 dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey); 977 pkey_write_deny(pkey); 978 *ptr = __LINE__; 979 expected_pkey_fault(pkey); 980 } 981 static void test_write_of_access_disabled_region(int *ptr, u16 pkey) 982 { 983 dprintf1("disabling access to PKEY[%02d], doing write\n", pkey); 984 pkey_access_deny(pkey); 985 *ptr = __LINE__; 986 expected_pkey_fault(pkey); 987 } 988 989 static void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr, 990 u16 pkey) 991 { 992 *ptr = __LINE__; 993 dprintf1("disabling access; after accessing the page, " 994 " to PKEY[%02d], doing write\n", pkey); 995 pkey_access_deny(pkey); 996 *ptr = __LINE__; 997 expected_pkey_fault(pkey); 998 } 999 1000 static void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) 1001 { 1002 int ret; 1003 int test_fd = get_test_read_fd(); 1004 1005 dprintf1("disabling access to PKEY[%02d], " 1006 "having kernel read() to buffer\n", pkey); 1007 pkey_access_deny(pkey); 1008 ret = read(test_fd, ptr, 1); 1009 dprintf1("read ret: %d\n", ret); 1010 pkey_assert(ret); 1011 } 1012 1013 static void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey) 1014 { 1015 int ret; 1016 int test_fd = get_test_read_fd(); 1017 1018 pkey_write_deny(pkey); 1019 ret = read(test_fd, ptr, 100); 1020 dprintf1("read ret: %d\n", ret); 1021 if (ret < 0 && (DEBUG_LEVEL > 0)) 1022 perror("verbose read result (OK for this to be bad)"); 1023 pkey_assert(ret); 1024 } 1025 1026 static void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey) 1027 { 1028 int pipe_ret, vmsplice_ret; 1029 struct iovec iov; 1030 int pipe_fds[2]; 1031 1032 pipe_ret = pipe(pipe_fds); 1033 1034 pkey_assert(pipe_ret == 0); 1035 dprintf1("disabling access to PKEY[%02d], " 1036 "having kernel vmsplice from buffer\n", pkey); 1037 pkey_access_deny(pkey); 1038 iov.iov_base = ptr; 1039 iov.iov_len = PAGE_SIZE; 1040 vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT); 1041 dprintf1("vmsplice() ret: %d\n", vmsplice_ret); 1042 pkey_assert(vmsplice_ret == -1); 1043 1044 close(pipe_fds[0]); 1045 close(pipe_fds[1]); 1046 } 1047 1048 static void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey) 1049 { 1050 int ignored = 0xdada; 1051 int futex_ret; 1052 int some_int = __LINE__; 1053 1054 dprintf1("disabling write to PKEY[%02d], " 1055 "doing futex gunk in buffer\n", pkey); 1056 *ptr = some_int; 1057 pkey_write_deny(pkey); 1058 futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL, 1059 &ignored, ignored); 1060 if (DEBUG_LEVEL > 0) 1061 perror("futex"); 1062 dprintf1("futex() ret: %d\n", futex_ret); 1063 } 1064 1065 /* Assumes that all pkeys other than 'pkey' are unallocated */ 1066 static void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey) 1067 { 1068 int err; 1069 int i; 1070 1071 /* Note: 0 is the default pkey, so don't mess with it */ 1072 for (i = 1; i < NR_PKEYS; i++) { 1073 if (pkey == i) 1074 continue; 1075 1076 dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i); 1077 err = sys_pkey_free(i); 1078 pkey_assert(err); 1079 1080 err = sys_pkey_free(i); 1081 pkey_assert(err); 1082 1083 err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i); 1084 pkey_assert(err); 1085 } 1086 } 1087 1088 /* Assumes that all pkeys other than 'pkey' are unallocated */ 1089 static void test_pkey_syscalls_bad_args(int *ptr, u16 pkey) 1090 { 1091 int err; 1092 int bad_pkey = NR_PKEYS+99; 1093 1094 /* pass a known-invalid pkey in: */ 1095 err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey); 1096 pkey_assert(err); 1097 } 1098 1099 static void become_child(void) 1100 { 1101 pid_t forkret; 1102 1103 forkret = fork(); 1104 pkey_assert(forkret >= 0); 1105 dprintf3("[%d] fork() ret: %d\n", getpid(), forkret); 1106 1107 if (!forkret) { 1108 /* in the child */ 1109 return; 1110 } 1111 _exit(0); 1112 } 1113 1114 /* Assumes that all pkeys other than 'pkey' are unallocated */ 1115 static void test_pkey_alloc_exhaust(int *ptr, u16 pkey) 1116 { 1117 int err; 1118 int allocated_pkeys[NR_PKEYS] = {0}; 1119 int nr_allocated_pkeys = 0; 1120 int i; 1121 1122 for (i = 0; i < NR_PKEYS*3; i++) { 1123 int new_pkey; 1124 dprintf1("%s() alloc loop: %d\n", __func__, i); 1125 new_pkey = alloc_pkey(); 1126 dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx" 1127 " shadow: 0x%016llx\n", 1128 __func__, __LINE__, err, __read_pkey_reg(), 1129 shadow_pkey_reg); 1130 read_pkey_reg(); /* for shadow checking */ 1131 dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC); 1132 if ((new_pkey == -1) && (errno == ENOSPC)) { 1133 dprintf2("%s() failed to allocate pkey after %d tries\n", 1134 __func__, nr_allocated_pkeys); 1135 } else { 1136 /* 1137 * Ensure the number of successes never 1138 * exceeds the number of keys supported 1139 * in the hardware. 1140 */ 1141 pkey_assert(nr_allocated_pkeys < NR_PKEYS); 1142 allocated_pkeys[nr_allocated_pkeys++] = new_pkey; 1143 } 1144 1145 /* 1146 * Make sure that allocation state is properly 1147 * preserved across fork(). 1148 */ 1149 if (i == NR_PKEYS*2) 1150 become_child(); 1151 } 1152 1153 dprintf3("%s()::%d\n", __func__, __LINE__); 1154 1155 /* 1156 * On x86: 1157 * There are 16 pkeys supported in hardware. Three are 1158 * allocated by the time we get here: 1159 * 1. The default key (0) 1160 * 2. One possibly consumed by an execute-only mapping. 1161 * 3. One allocated by the test code and passed in via 1162 * 'pkey' to this function. 1163 * Ensure that we can allocate at least another 13 (16-3). 1164 * 1165 * On powerpc: 1166 * There are either 5, 28, 29 or 32 pkeys supported in 1167 * hardware depending on the page size (4K or 64K) and 1168 * platform (powernv or powervm). Four are allocated by 1169 * the time we get here. These include pkey-0, pkey-1, 1170 * exec-only pkey and the one allocated by the test code. 1171 * Ensure that we can allocate the remaining. 1172 */ 1173 pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1)); 1174 1175 for (i = 0; i < nr_allocated_pkeys; i++) { 1176 err = sys_pkey_free(allocated_pkeys[i]); 1177 pkey_assert(!err); 1178 read_pkey_reg(); /* for shadow checking */ 1179 } 1180 } 1181 1182 static void arch_force_pkey_reg_init(void) 1183 { 1184 #if defined(__i386__) || defined(__x86_64__) /* arch */ 1185 u64 *buf; 1186 1187 /* 1188 * All keys should be allocated and set to allow reads and 1189 * writes, so the register should be all 0. If not, just 1190 * skip the test. 1191 */ 1192 if (read_pkey_reg()) 1193 return; 1194 1195 /* 1196 * Just allocate an absurd about of memory rather than 1197 * doing the XSAVE size enumeration dance. 1198 */ 1199 buf = mmap(NULL, 1*MB, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); 1200 1201 /* These __builtins require compiling with -mxsave */ 1202 1203 /* XSAVE to build a valid buffer: */ 1204 __builtin_ia32_xsave(buf, XSTATE_PKEY); 1205 /* Clear XSTATE_BV[PKRU]: */ 1206 buf[XSTATE_BV_OFFSET/sizeof(u64)] &= ~XSTATE_PKEY; 1207 /* XRSTOR will likely get PKRU back to the init state: */ 1208 __builtin_ia32_xrstor(buf, XSTATE_PKEY); 1209 1210 munmap(buf, 1*MB); 1211 #endif 1212 } 1213 1214 1215 /* 1216 * This is mostly useless on ppc for now. But it will not 1217 * hurt anything and should give some better coverage as 1218 * a long-running test that continually checks the pkey 1219 * register. 1220 */ 1221 static void test_pkey_init_state(int *ptr, u16 pkey) 1222 { 1223 int err; 1224 int allocated_pkeys[NR_PKEYS] = {0}; 1225 int nr_allocated_pkeys = 0; 1226 int i; 1227 1228 for (i = 0; i < NR_PKEYS; i++) { 1229 int new_pkey = alloc_pkey(); 1230 1231 if (new_pkey < 0) 1232 continue; 1233 allocated_pkeys[nr_allocated_pkeys++] = new_pkey; 1234 } 1235 1236 dprintf3("%s()::%d\n", __func__, __LINE__); 1237 1238 arch_force_pkey_reg_init(); 1239 1240 /* 1241 * Loop for a bit, hoping to get exercise the kernel 1242 * context switch code. 1243 */ 1244 for (i = 0; i < 1000000; i++) 1245 read_pkey_reg(); 1246 1247 for (i = 0; i < nr_allocated_pkeys; i++) { 1248 err = sys_pkey_free(allocated_pkeys[i]); 1249 pkey_assert(!err); 1250 read_pkey_reg(); /* for shadow checking */ 1251 } 1252 } 1253 1254 /* 1255 * pkey 0 is special. It is allocated by default, so you do not 1256 * have to call pkey_alloc() to use it first. Make sure that it 1257 * is usable. 1258 */ 1259 static void test_mprotect_with_pkey_0(int *ptr, u16 pkey) 1260 { 1261 long size; 1262 int prot; 1263 1264 assert(pkey_last_malloc_record); 1265 size = pkey_last_malloc_record->size; 1266 /* 1267 * This is a bit of a hack. But mprotect() requires 1268 * huge-page-aligned sizes when operating on hugetlbfs. 1269 * So, make sure that we use something that's a multiple 1270 * of a huge page when we can. 1271 */ 1272 if (size >= HPAGE_SIZE) 1273 size = HPAGE_SIZE; 1274 prot = pkey_last_malloc_record->prot; 1275 1276 /* Use pkey 0 */ 1277 mprotect_pkey(ptr, size, prot, 0); 1278 1279 /* Make sure that we can set it back to the original pkey. */ 1280 mprotect_pkey(ptr, size, prot, pkey); 1281 } 1282 1283 static void test_ptrace_of_child(int *ptr, u16 pkey) 1284 { 1285 __always_unused int peek_result; 1286 pid_t child_pid; 1287 void *ignored = 0; 1288 long ret; 1289 int status; 1290 /* 1291 * This is the "control" for our little expermient. Make sure 1292 * we can always access it when ptracing. 1293 */ 1294 int *plain_ptr_unaligned = malloc(HPAGE_SIZE); 1295 int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE); 1296 1297 /* 1298 * Fork a child which is an exact copy of this process, of course. 1299 * That means we can do all of our tests via ptrace() and then plain 1300 * memory access and ensure they work differently. 1301 */ 1302 child_pid = fork_lazy_child(); 1303 dprintf1("[%d] child pid: %d\n", getpid(), child_pid); 1304 1305 ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored); 1306 if (ret) 1307 perror("attach"); 1308 dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__); 1309 pkey_assert(ret != -1); 1310 ret = waitpid(child_pid, &status, WUNTRACED); 1311 if ((ret != child_pid) || !(WIFSTOPPED(status))) { 1312 fprintf(stderr, "weird waitpid result %ld stat %x\n", 1313 ret, status); 1314 pkey_assert(0); 1315 } 1316 dprintf2("waitpid ret: %ld\n", ret); 1317 dprintf2("waitpid status: %d\n", status); 1318 1319 pkey_access_deny(pkey); 1320 pkey_write_deny(pkey); 1321 1322 /* Write access, untested for now: 1323 ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data); 1324 pkey_assert(ret != -1); 1325 dprintf1("poke at %p: %ld\n", peek_at, ret); 1326 */ 1327 1328 /* 1329 * Try to access the pkey-protected "ptr" via ptrace: 1330 */ 1331 ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored); 1332 /* expect it to work, without an error: */ 1333 pkey_assert(ret != -1); 1334 /* Now access from the current task, and expect an exception: */ 1335 peek_result = read_ptr(ptr); 1336 expected_pkey_fault(pkey); 1337 1338 /* 1339 * Try to access the NON-pkey-protected "plain_ptr" via ptrace: 1340 */ 1341 ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored); 1342 /* expect it to work, without an error: */ 1343 pkey_assert(ret != -1); 1344 /* Now access from the current task, and expect NO exception: */ 1345 peek_result = read_ptr(plain_ptr); 1346 do_not_expect_pkey_fault("read plain pointer after ptrace"); 1347 1348 ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0); 1349 pkey_assert(ret != -1); 1350 1351 ret = kill(child_pid, SIGKILL); 1352 pkey_assert(ret != -1); 1353 1354 wait(&status); 1355 1356 free(plain_ptr_unaligned); 1357 } 1358 1359 static void *get_pointer_to_instructions(void) 1360 { 1361 void *p1; 1362 1363 p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE); 1364 dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write); 1365 /* lots_o_noops_around_write should be page-aligned already */ 1366 assert(p1 == &lots_o_noops_around_write); 1367 1368 /* Point 'p1' at the *second* page of the function: */ 1369 p1 += PAGE_SIZE; 1370 1371 /* 1372 * Try to ensure we fault this in on next touch to ensure 1373 * we get an instruction fault as opposed to a data one 1374 */ 1375 madvise(p1, PAGE_SIZE, MADV_DONTNEED); 1376 1377 return p1; 1378 } 1379 1380 static void test_executing_on_unreadable_memory(int *ptr, u16 pkey) 1381 { 1382 void *p1; 1383 int scratch; 1384 int ptr_contents; 1385 int ret; 1386 1387 p1 = get_pointer_to_instructions(); 1388 lots_o_noops_around_write(&scratch); 1389 ptr_contents = read_ptr(p1); 1390 dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); 1391 1392 ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey); 1393 pkey_assert(!ret); 1394 pkey_access_deny(pkey); 1395 1396 dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); 1397 1398 /* 1399 * Make sure this is an *instruction* fault 1400 */ 1401 madvise(p1, PAGE_SIZE, MADV_DONTNEED); 1402 lots_o_noops_around_write(&scratch); 1403 do_not_expect_pkey_fault("executing on PROT_EXEC memory"); 1404 expect_fault_on_read_execonly_key(p1, pkey); 1405 1406 // Reset back to PROT_EXEC | PROT_READ for architectures that support 1407 // non-PKEY execute-only permissions. 1408 ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC | PROT_READ, (u64)pkey); 1409 pkey_assert(!ret); 1410 } 1411 1412 static void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) 1413 { 1414 void *p1; 1415 int scratch; 1416 int ptr_contents; 1417 int ret; 1418 1419 dprintf1("%s() start\n", __func__); 1420 1421 p1 = get_pointer_to_instructions(); 1422 lots_o_noops_around_write(&scratch); 1423 ptr_contents = read_ptr(p1); 1424 dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); 1425 1426 /* Use a *normal* mprotect(), not mprotect_pkey(): */ 1427 ret = mprotect(p1, PAGE_SIZE, PROT_EXEC); 1428 pkey_assert(!ret); 1429 1430 /* 1431 * Reset the shadow, assuming that the above mprotect() 1432 * correctly changed PKRU, but to an unknown value since 1433 * the actual allocated pkey is unknown. 1434 */ 1435 shadow_pkey_reg = __read_pkey_reg(); 1436 1437 dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); 1438 1439 /* Make sure this is an *instruction* fault */ 1440 madvise(p1, PAGE_SIZE, MADV_DONTNEED); 1441 lots_o_noops_around_write(&scratch); 1442 do_not_expect_pkey_fault("executing on PROT_EXEC memory"); 1443 expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY); 1444 1445 /* 1446 * Put the memory back to non-PROT_EXEC. Should clear the 1447 * exec-only pkey off the VMA and allow it to be readable 1448 * again. Go to PROT_NONE first to check for a kernel bug 1449 * that did not clear the pkey when doing PROT_NONE. 1450 */ 1451 ret = mprotect(p1, PAGE_SIZE, PROT_NONE); 1452 pkey_assert(!ret); 1453 1454 ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC); 1455 pkey_assert(!ret); 1456 ptr_contents = read_ptr(p1); 1457 do_not_expect_pkey_fault("plain read on recently PROT_EXEC area"); 1458 } 1459 1460 #if defined(__i386__) || defined(__x86_64__) 1461 static void test_ptrace_modifies_pkru(int *ptr, u16 pkey) 1462 { 1463 u32 new_pkru; 1464 pid_t child; 1465 int status, ret; 1466 int pkey_offset = pkey_reg_xstate_offset(); 1467 size_t xsave_size = cpu_max_xsave_size(); 1468 void *xsave; 1469 u32 *pkey_register; 1470 u64 *xstate_bv; 1471 struct iovec iov; 1472 1473 new_pkru = ~read_pkey_reg(); 1474 /* Don't make PROT_EXEC mappings inaccessible */ 1475 new_pkru &= ~3; 1476 1477 child = fork(); 1478 pkey_assert(child >= 0); 1479 dprintf3("[%d] fork() ret: %d\n", getpid(), child); 1480 if (!child) { 1481 ptrace(PTRACE_TRACEME, 0, 0, 0); 1482 /* Stop and allow the tracer to modify PKRU directly */ 1483 raise(SIGSTOP); 1484 1485 /* 1486 * need __read_pkey_reg() version so we do not do shadow_pkey_reg 1487 * checking 1488 */ 1489 if (__read_pkey_reg() != new_pkru) 1490 _exit(1); 1491 1492 /* Stop and allow the tracer to clear XSTATE_BV for PKRU */ 1493 raise(SIGSTOP); 1494 1495 if (__read_pkey_reg() != 0) 1496 _exit(1); 1497 1498 /* Stop and allow the tracer to examine PKRU */ 1499 raise(SIGSTOP); 1500 1501 _exit(0); 1502 } 1503 1504 pkey_assert(child == waitpid(child, &status, 0)); 1505 dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); 1506 pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); 1507 1508 xsave = (void *)malloc(xsave_size); 1509 pkey_assert(xsave > 0); 1510 1511 /* Modify the PKRU register directly */ 1512 iov.iov_base = xsave; 1513 iov.iov_len = xsave_size; 1514 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1515 pkey_assert(ret == 0); 1516 1517 pkey_register = (u32 *)(xsave + pkey_offset); 1518 pkey_assert(*pkey_register == read_pkey_reg()); 1519 1520 *pkey_register = new_pkru; 1521 1522 ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1523 pkey_assert(ret == 0); 1524 1525 /* Test that the modification is visible in ptrace before any execution */ 1526 memset(xsave, 0xCC, xsave_size); 1527 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1528 pkey_assert(ret == 0); 1529 pkey_assert(*pkey_register == new_pkru); 1530 1531 /* Execute the tracee */ 1532 ret = ptrace(PTRACE_CONT, child, 0, 0); 1533 pkey_assert(ret == 0); 1534 1535 /* Test that the tracee saw the PKRU value change */ 1536 pkey_assert(child == waitpid(child, &status, 0)); 1537 dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); 1538 pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); 1539 1540 /* Test that the modification is visible in ptrace after execution */ 1541 memset(xsave, 0xCC, xsave_size); 1542 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1543 pkey_assert(ret == 0); 1544 pkey_assert(*pkey_register == new_pkru); 1545 1546 /* Clear the PKRU bit from XSTATE_BV */ 1547 xstate_bv = (u64 *)(xsave + 512); 1548 *xstate_bv &= ~(1 << 9); 1549 1550 ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1551 pkey_assert(ret == 0); 1552 1553 /* Test that the modification is visible in ptrace before any execution */ 1554 memset(xsave, 0xCC, xsave_size); 1555 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1556 pkey_assert(ret == 0); 1557 pkey_assert(*pkey_register == 0); 1558 1559 ret = ptrace(PTRACE_CONT, child, 0, 0); 1560 pkey_assert(ret == 0); 1561 1562 /* Test that the tracee saw the PKRU value go to 0 */ 1563 pkey_assert(child == waitpid(child, &status, 0)); 1564 dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); 1565 pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); 1566 1567 /* Test that the modification is visible in ptrace after execution */ 1568 memset(xsave, 0xCC, xsave_size); 1569 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1570 pkey_assert(ret == 0); 1571 pkey_assert(*pkey_register == 0); 1572 1573 ret = ptrace(PTRACE_CONT, child, 0, 0); 1574 pkey_assert(ret == 0); 1575 pkey_assert(child == waitpid(child, &status, 0)); 1576 dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); 1577 pkey_assert(WIFEXITED(status)); 1578 pkey_assert(WEXITSTATUS(status) == 0); 1579 free(xsave); 1580 } 1581 #endif 1582 1583 #if defined(__aarch64__) 1584 static void test_ptrace_modifies_pkru(int *ptr, u16 pkey) 1585 { 1586 pid_t child; 1587 int status, ret; 1588 struct iovec iov; 1589 u64 trace_pkey; 1590 /* Just a random pkey value.. */ 1591 u64 new_pkey = (POE_X << PKEY_BITS_PER_PKEY * 2) | 1592 (POE_NONE << PKEY_BITS_PER_PKEY) | 1593 POE_RWX; 1594 1595 child = fork(); 1596 pkey_assert(child >= 0); 1597 dprintf3("[%d] fork() ret: %d\n", getpid(), child); 1598 if (!child) { 1599 ptrace(PTRACE_TRACEME, 0, 0, 0); 1600 1601 /* Stop and allow the tracer to modify PKRU directly */ 1602 raise(SIGSTOP); 1603 1604 /* 1605 * need __read_pkey_reg() version so we do not do shadow_pkey_reg 1606 * checking 1607 */ 1608 if (__read_pkey_reg() != new_pkey) 1609 exit(1); 1610 1611 raise(SIGSTOP); 1612 1613 exit(0); 1614 } 1615 1616 pkey_assert(child == waitpid(child, &status, 0)); 1617 dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); 1618 pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); 1619 1620 iov.iov_base = &trace_pkey; 1621 iov.iov_len = 8; 1622 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov); 1623 pkey_assert(ret == 0); 1624 pkey_assert(trace_pkey == read_pkey_reg()); 1625 1626 trace_pkey = new_pkey; 1627 1628 ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_ARM_POE, &iov); 1629 pkey_assert(ret == 0); 1630 1631 /* Test that the modification is visible in ptrace before any execution */ 1632 memset(&trace_pkey, 0, sizeof(trace_pkey)); 1633 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov); 1634 pkey_assert(ret == 0); 1635 pkey_assert(trace_pkey == new_pkey); 1636 1637 /* Execute the tracee */ 1638 ret = ptrace(PTRACE_CONT, child, 0, 0); 1639 pkey_assert(ret == 0); 1640 1641 /* Test that the tracee saw the PKRU value change */ 1642 pkey_assert(child == waitpid(child, &status, 0)); 1643 dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); 1644 pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); 1645 1646 /* Test that the modification is visible in ptrace after execution */ 1647 memset(&trace_pkey, 0, sizeof(trace_pkey)); 1648 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov); 1649 pkey_assert(ret == 0); 1650 pkey_assert(trace_pkey == new_pkey); 1651 1652 ret = ptrace(PTRACE_CONT, child, 0, 0); 1653 pkey_assert(ret == 0); 1654 pkey_assert(child == waitpid(child, &status, 0)); 1655 dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); 1656 pkey_assert(WIFEXITED(status)); 1657 pkey_assert(WEXITSTATUS(status) == 0); 1658 } 1659 #endif 1660 1661 static void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) 1662 { 1663 int size = PAGE_SIZE; 1664 int sret; 1665 1666 if (cpu_has_pkeys()) { 1667 dprintf1("SKIP: %s: no CPU support\n", __func__); 1668 return; 1669 } 1670 1671 sret = syscall(__NR_pkey_mprotect, ptr, size, PROT_READ, pkey); 1672 pkey_assert(sret < 0); 1673 } 1674 1675 struct pkey_test { 1676 void (*func)(int *ptr, u16 pkey); 1677 const char *name; 1678 }; 1679 1680 #define PKEY_TEST(fn) { fn, #fn } 1681 1682 static struct pkey_test pkey_tests[] = { 1683 PKEY_TEST(test_read_of_write_disabled_region), 1684 PKEY_TEST(test_read_of_access_disabled_region), 1685 PKEY_TEST(test_read_of_access_disabled_region_with_page_already_mapped), 1686 PKEY_TEST(test_write_of_write_disabled_region), 1687 PKEY_TEST(test_write_of_write_disabled_region_with_page_already_mapped), 1688 PKEY_TEST(test_write_of_access_disabled_region), 1689 PKEY_TEST(test_write_of_access_disabled_region_with_page_already_mapped), 1690 PKEY_TEST(test_kernel_write_of_access_disabled_region), 1691 PKEY_TEST(test_kernel_write_of_write_disabled_region), 1692 PKEY_TEST(test_kernel_gup_of_access_disabled_region), 1693 PKEY_TEST(test_kernel_gup_write_to_write_disabled_region), 1694 PKEY_TEST(test_executing_on_unreadable_memory), 1695 PKEY_TEST(test_implicit_mprotect_exec_only_memory), 1696 PKEY_TEST(test_mprotect_with_pkey_0), 1697 PKEY_TEST(test_ptrace_of_child), 1698 PKEY_TEST(test_pkey_init_state), 1699 PKEY_TEST(test_pkey_syscalls_on_non_allocated_pkey), 1700 PKEY_TEST(test_pkey_syscalls_bad_args), 1701 PKEY_TEST(test_pkey_alloc_exhaust), 1702 PKEY_TEST(test_pkey_alloc_free_attach_pkey0), 1703 #if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__) 1704 PKEY_TEST(test_ptrace_modifies_pkru), 1705 #endif 1706 }; 1707 1708 static void run_tests_once(void) 1709 { 1710 int *ptr; 1711 int prot = PROT_READ|PROT_WRITE; 1712 1713 for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) { 1714 int pkey; 1715 int orig_pkey_faults = pkey_faults; 1716 1717 dprintf1("======================\n"); 1718 dprintf1("test %d preparing...\n", test_nr); 1719 1720 tracing_on(); 1721 pkey = alloc_random_pkey(); 1722 dprintf1("test %d starting with pkey: %d\n", test_nr, pkey); 1723 ptr = malloc_pkey(PAGE_SIZE, prot, pkey); 1724 dprintf1("test %d starting...\n", test_nr); 1725 pkey_tests[test_nr].func(ptr, pkey); 1726 dprintf1("freeing test memory: %p\n", ptr); 1727 free_pkey_malloc(ptr); 1728 sys_pkey_free(pkey); 1729 1730 dprintf1("pkey_faults: %d\n", pkey_faults); 1731 dprintf1("orig_pkey_faults: %d\n", orig_pkey_faults); 1732 1733 tracing_off(); 1734 close_test_fds(); 1735 1736 ksft_test_result_pass("test %s (iteration %d)\n", pkey_tests[test_nr].name, iteration_nr); 1737 dprintf1("======================\n\n"); 1738 } 1739 iteration_nr++; 1740 } 1741 1742 static void pkey_setup_shadow(void) 1743 { 1744 shadow_pkey_reg = __read_pkey_reg(); 1745 } 1746 1747 int main(void) 1748 { 1749 int nr_iterations = 22; 1750 int pkeys_supported = is_pkeys_supported(); 1751 1752 srand((unsigned int)time(NULL)); 1753 1754 setup_handlers(); 1755 1756 ksft_print_header(); 1757 1758 if (!pkeys_supported) { 1759 int size = PAGE_SIZE; 1760 int *ptr; 1761 1762 ksft_set_plan(1); 1763 ksft_print_msg("running PKEY tests for unsupported CPU/OS\n"); 1764 1765 ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); 1766 assert(ptr != (void *)-1); 1767 test_mprotect_pkey_on_unsupported_cpu(ptr, 1); 1768 ksft_test_result_pass("pkey on unsupported CPU/OS\n"); 1769 ksft_finished(); 1770 } 1771 1772 ksft_set_plan(ARRAY_SIZE(pkey_tests) * nr_iterations); 1773 1774 pkey_setup_shadow(); 1775 ksft_print_msg("startup pkey_reg: %016llx\n", read_pkey_reg()); 1776 setup_hugetlbfs(); 1777 1778 while (nr_iterations-- > 0) 1779 run_tests_once(); 1780 1781 ksft_finished(); 1782 } 1783