#define JEMALLOC_PAGES_C_
#include "jemalloc/internal/jemalloc_preamble.h"

#include "jemalloc/internal/pages.h"

#include "jemalloc/internal/jemalloc_internal_includes.h"

#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/malloc_io.h"

#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
#include <sys/sysctl.h>
#ifdef __FreeBSD__
#include <sys/auxv.h>
#include <vm/vm_param.h>
#include <vm/vm.h>
#endif
#endif

/******************************************************************************/
/* Data. */

/* Actual operating system page size, detected during bootstrap, <= PAGE. */
static size_t	os_page;

#ifndef _WIN32
#  define PAGES_PROT_COMMIT (PROT_READ | PROT_WRITE)
#  define PAGES_PROT_DECOMMIT (PROT_NONE)
static int	mmap_flags;
#endif
static bool	os_overcommits;

const char *thp_mode_names[] = {
	"default",
	"always",
	"never",
	"not supported"
};
thp_mode_t opt_thp = THP_MODE_DEFAULT;
thp_mode_t init_system_thp_mode;

/* Runtime support for lazy purge. Irrelevant when !pages_can_purge_lazy. */
static bool pages_can_purge_lazy_runtime = true;

/******************************************************************************/
/*
 * Function prototypes for static functions that are referenced prior to
 * definition.
 */

static void os_pages_unmap(void *addr, size_t size);

/******************************************************************************/

static void *
os_pages_map(void *addr, size_t size, size_t alignment, bool *commit) {
	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
	assert(ALIGNMENT_CEILING(size, os_page) == size);
	assert(size != 0);

	if (os_overcommits) {
		*commit = true;
	}

	void *ret;
#ifdef _WIN32
	/*
	 * If VirtualAlloc can't allocate at the given address when one is
	 * given, it fails and returns NULL.
	 */
	ret = VirtualAlloc(addr, size, MEM_RESERVE | (*commit ? MEM_COMMIT : 0),
	    PAGE_READWRITE);
#else
	/*
	 * We don't use MAP_FIXED here, because it can cause the *replacement*
	 * of existing mappings, and we only want to create new mappings.
	 */
	{
		int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;

		ret = mmap(addr, size, prot, mmap_flags, -1, 0);
	}
	assert(ret != NULL);

	if (ret == MAP_FAILED) {
		ret = NULL;
	} else if (addr != NULL && ret != addr) {
		/*
		 * We succeeded in mapping memory, but not in the right place.
		 */
		os_pages_unmap(ret, size);
		ret = NULL;
	}
#endif
	assert(ret == NULL || (addr == NULL && ret != addr) || (addr != NULL &&
	    ret == addr));
	return ret;
}
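
/*
 * Illustrative sketch: the commit parameter is in/out.  A caller may request
 * uncommitted pages, but on systems that overcommit, os_pages_map() upgrades
 * the request and reports that back:
 *
 *	bool commit = false;
 *	void *p = os_pages_map(NULL, PAGE, os_page, &commit);
 *	// With os_overcommits, commit is now true and p is immediately
 *	// usable; otherwise p remains a PROT_NONE reservation until
 *	// pages_commit() is called.
 */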

static void *
os_pages_trim(void *addr, size_t alloc_size, size_t leadsize, size_t size,
    bool *commit) {
	void *ret = (void *)((uintptr_t)addr + leadsize);

	assert(alloc_size >= leadsize + size);
#ifdef _WIN32
	os_pages_unmap(addr, alloc_size);
	void *new_addr = os_pages_map(ret, size, PAGE, commit);
	if (new_addr == ret) {
		return ret;
	}
	if (new_addr != NULL) {
		os_pages_unmap(new_addr, size);
	}
	return NULL;
#else
	size_t trailsize = alloc_size - leadsize - size;

	if (leadsize != 0) {
		os_pages_unmap(addr, leadsize);
	}
	if (trailsize != 0) {
		os_pages_unmap((void *)((uintptr_t)ret + size), trailsize);
	}
	return ret;
#endif
}

static void
os_pages_unmap(void *addr, size_t size) {
	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
	assert(ALIGNMENT_CEILING(size, os_page) == size);

#ifdef _WIN32
	if (VirtualFree(addr, 0, MEM_RELEASE) == 0)
#else
	if (munmap(addr, size) == -1)
#endif
	{
		char buf[BUFERROR_BUF];

		buferror(get_errno(), buf, sizeof(buf));
		malloc_printf("<jemalloc>: Error in "
#ifdef _WIN32
		    "VirtualFree"
#else
		    "munmap"
#endif
		    "(): %s\n", buf);
		if (opt_abort) {
			abort();
		}
	}
}

static void *
pages_map_slow(size_t size, size_t alignment, bool *commit) {
	size_t alloc_size = size + alignment - os_page;
	/* Beware size_t wrap-around. */
	if (alloc_size < size) {
		return NULL;
	}

	void *ret;
	do {
		void *pages = os_pages_map(NULL, alloc_size, alignment,
		    commit);
		if (pages == NULL) {
			return NULL;
		}
		size_t leadsize = ALIGNMENT_CEILING((uintptr_t)pages,
		    alignment) - (uintptr_t)pages;
		ret = os_pages_trim(pages, alloc_size, leadsize, size, commit);
	} while (ret == NULL);

	assert(ret != NULL);
	assert(PAGE_ADDR2BASE(ret) == ret);
	return ret;
}

void *
pages_map(void *addr, size_t size, size_t alignment, bool *commit) {
	assert(alignment >= PAGE);
	assert(ALIGNMENT_ADDR2BASE(addr, alignment) == addr);

#if defined(__FreeBSD__) && defined(MAP_EXCL)
	/*
	 * FreeBSD has mechanisms both to mmap at a specific address without
	 * touching existing mappings, and to mmap with a specific alignment.
	 */
	{
		if (os_overcommits) {
			*commit = true;
		}

		int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;
		int flags = mmap_flags;

		if (addr != NULL) {
			flags |= MAP_FIXED | MAP_EXCL;
		} else {
			unsigned alignment_bits = ffs_zu(alignment);
			assert(alignment_bits > 1);
			flags |= MAP_ALIGNED(alignment_bits - 1);
		}

		void *ret = mmap(addr, size, prot, flags, -1, 0);
		if (ret == MAP_FAILED) {
			ret = NULL;
		}

		return ret;
	}
#endif
	/*
	 * Ideally, there would be a way to specify alignment to mmap() (like
	 * NetBSD has), but in the absence of such a feature, we have to work
	 * hard to efficiently create aligned mappings.  The reliable, but
	 * slow method is to create a mapping that is over-sized, then trim
	 * the excess.  However, that always results in one or two calls to
	 * os_pages_unmap(), and it can leave holes in the process's virtual
	 * memory map if memory grows downward.
	 *
	 * Optimistically try mapping precisely the right amount before
	 * falling back to the slow method, with the expectation that the
	 * optimistic approach works most of the time.
	 */

	void *ret = os_pages_map(addr, size, os_page, commit);
	if (ret == NULL || ret == addr) {
		return ret;
	}
	assert(addr == NULL);
	if (ALIGNMENT_ADDR2OFFSET(ret, alignment) != 0) {
		os_pages_unmap(ret, size);
		return pages_map_slow(size, alignment, commit);
	}

	assert(PAGE_ADDR2BASE(ret) == ret);
	return ret;
}
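
/*
 * Worked example for the slow path, assuming 4 KiB OS pages and a 2 MiB
 * alignment request with size == PAGE: pages_map_slow() over-allocates
 * alloc_size = size + alignment - os_page = 0x200000 bytes.  If the kernel
 * happens to return pages == 0x7f0000201000, then
 *
 *	leadsize  = ALIGNMENT_CEILING(pages, alignment) - pages
 *	          = 0x7f0000400000 - 0x7f0000201000 = 0x1ff000
 *	trailsize = alloc_size - leadsize - size = 0
 *
 * so os_pages_trim() unmaps the 0x1ff000-byte head (and, when nonzero, the
 * tail), leaving exactly [0x7f0000400000, 0x7f0000401000) aligned as
 * requested.
 */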

void
pages_unmap(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);

	os_pages_unmap(addr, size);
}

static bool
pages_commit_impl(void *addr, size_t size, bool commit) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);

	if (os_overcommits) {
		return true;
	}

#ifdef _WIN32
	return (commit ? (addr != VirtualAlloc(addr, size, MEM_COMMIT,
	    PAGE_READWRITE)) : (!VirtualFree(addr, size, MEM_DECOMMIT)));
#else
	{
		int prot = commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;
		void *result = mmap(addr, size, prot, mmap_flags | MAP_FIXED,
		    -1, 0);
		if (result == MAP_FAILED) {
			return true;
		}
		if (result != addr) {
			/*
			 * We succeeded in mapping memory, but not in the
			 * right place.
			 */
			os_pages_unmap(result, size);
			return true;
		}
		return false;
	}
#endif
}

bool
pages_commit(void *addr, size_t size) {
	return pages_commit_impl(addr, size, true);
}

bool
pages_decommit(void *addr, size_t size) {
	return pages_commit_impl(addr, size, false);
}

bool
pages_purge_lazy(void *addr, size_t size) {
	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
	assert(PAGE_CEILING(size) == size);

	if (!pages_can_purge_lazy) {
		return true;
	}
	if (!pages_can_purge_lazy_runtime) {
		/*
		 * Built with lazy purge enabled, but detected it was not
		 * supported on the current system.
		 */
		return true;
	}

#ifdef _WIN32
	VirtualAlloc(addr, size, MEM_RESET, PAGE_READWRITE);
	return false;
#elif defined(JEMALLOC_PURGE_MADVISE_FREE)
	return (madvise(addr, size,
#  ifdef MADV_FREE
	    MADV_FREE
#  else
	    JEMALLOC_MADV_FREE
#  endif
	    ) != 0);
#elif defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \
    !defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)
	return (madvise(addr, size, MADV_DONTNEED) != 0);
#else
	not_reached();
#endif
}

bool
pages_purge_forced(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);

	if (!pages_can_purge_forced) {
		return true;
	}

#if defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \
    defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)
	return (madvise(addr, size, MADV_DONTNEED) != 0);
#elif defined(JEMALLOC_MAPS_COALESCE)
	/* Try to overlay a new demand-zeroed mapping. */
	return pages_commit(addr, size);
#else
	not_reached();
#endif
}
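
/*
 * Purge semantics at a glance (both functions return true on failure, per
 * the convention used throughout this file): lazy purging (e.g. MADV_FREE)
 * merely marks the contents disposable, so reads may keep returning the old
 * bytes until the kernel actually reclaims the pages; forced purging
 * (MADV_DONTNEED on Linux) guarantees that the next access observes
 * zero-filled pages.  An illustrative fallback pattern:
 *
 *	if (pages_purge_lazy(addr, size) && pages_purge_forced(addr, size)) {
 *		// Neither purge mode is available; the pages stay resident.
 *	}
 */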

static bool
pages_huge_impl(void *addr, size_t size, bool aligned) {
	if (aligned) {
		assert(HUGEPAGE_ADDR2BASE(addr) == addr);
		assert(HUGEPAGE_CEILING(size) == size);
	}
#ifdef JEMALLOC_HAVE_MADVISE_HUGE
	return (madvise(addr, size, MADV_HUGEPAGE) != 0);
#else
	return true;
#endif
}

bool
pages_huge(void *addr, size_t size) {
	return pages_huge_impl(addr, size, true);
}

static bool
pages_huge_unaligned(void *addr, size_t size) {
	return pages_huge_impl(addr, size, false);
}

static bool
pages_nohuge_impl(void *addr, size_t size, bool aligned) {
	if (aligned) {
		assert(HUGEPAGE_ADDR2BASE(addr) == addr);
		assert(HUGEPAGE_CEILING(size) == size);
	}

#ifdef JEMALLOC_HAVE_MADVISE_HUGE
	return (madvise(addr, size, MADV_NOHUGEPAGE) != 0);
#else
	return false;
#endif
}

bool
pages_nohuge(void *addr, size_t size) {
	return pages_nohuge_impl(addr, size, true);
}

static bool
pages_nohuge_unaligned(void *addr, size_t size) {
	return pages_nohuge_impl(addr, size, false);
}

bool
pages_dontdump(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);
#ifdef JEMALLOC_MADVISE_DONTDUMP
	return madvise(addr, size, MADV_DONTDUMP) != 0;
#else
	return false;
#endif
}

bool
pages_dodump(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);
#ifdef JEMALLOC_MADVISE_DONTDUMP
	return madvise(addr, size, MADV_DODUMP) != 0;
#else
	return false;
#endif
}

static size_t
os_page_detect(void) {
#ifdef _WIN32
	SYSTEM_INFO si;
	GetSystemInfo(&si);
	return si.dwPageSize;
#elif defined(__FreeBSD__)
	/*
	 * This returns the value obtained from the auxv vector, avoiding a
	 * syscall.
	 */
	return getpagesize();
#else
	long result = sysconf(_SC_PAGESIZE);
	if (result == -1) {
		return LG_PAGE;
	}
	return (size_t)result;
#endif
}

#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
static bool
os_overcommits_sysctl(void) {
	int vm_overcommit;
	size_t sz;

#ifdef ELF_BSDF_VMNOOVERCOMMIT
	int bsdflags;

	if (_elf_aux_info(AT_BSDFLAGS, &bsdflags, sizeof(bsdflags)) == 0) {
		return ((bsdflags & ELF_BSDF_VMNOOVERCOMMIT) == 0);
	}
#endif

	sz = sizeof(vm_overcommit);
#if defined(__FreeBSD__) && defined(VM_OVERCOMMIT)
	int mib[2];

	mib[0] = CTL_VM;
	mib[1] = VM_OVERCOMMIT;
	if (sysctl(mib, 2, &vm_overcommit, &sz, NULL, 0) != 0) {
		return false; /* Error. */
	}
#else
	if (sysctlbyname("vm.overcommit", &vm_overcommit, &sz, NULL, 0) != 0) {
		return false; /* Error. */
	}
#endif

#ifndef SWAP_RESERVE_FORCE_ON
#define SWAP_RESERVE_FORCE_ON (1 << 0)
#define SWAP_RESERVE_RLIMIT_ON (1 << 1)
#endif
	return ((vm_overcommit & (SWAP_RESERVE_FORCE_ON |
	    SWAP_RESERVE_RLIMIT_ON)) == 0);
}
#endif
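
/*
 * Worked example, assuming FreeBSD defaults: "sysctl vm.overcommit" reports
 * a bitmask, and with vm.overcommit == 0 neither SWAP_RESERVE_FORCE_ON (0x1)
 * nor SWAP_RESERVE_RLIMIT_ON (0x2) is set, so os_overcommits_sysctl()
 * returns true and commit/decommit become no-ops in pages_commit_impl().
 */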

#ifdef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY
/*
 * Use syscall(2) rather than {open,read,close}(2) when possible to avoid
 * reentry during bootstrapping if another library has interposed system call
 * wrappers.
 */
static bool
os_overcommits_proc(void) {
	int fd;
	char buf[1];

#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open)
#if defined(O_CLOEXEC)
	fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory",
	    O_RDONLY | O_CLOEXEC);
#else
	fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory",
	    O_RDONLY);
	if (fd != -1) {
		fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
	}
#endif
#elif defined(JEMALLOC_USE_SYSCALL) && defined(SYS_openat)
#if defined(O_CLOEXEC)
	fd = (int)syscall(SYS_openat, AT_FDCWD,
	    "/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC);
#else
	fd = (int)syscall(SYS_openat, AT_FDCWD,
	    "/proc/sys/vm/overcommit_memory", O_RDONLY);
	if (fd != -1) {
		fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
	}
#endif
#else
#if defined(O_CLOEXEC)
	fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC);
#else
	fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY);
	if (fd != -1) {
		fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
	}
#endif
#endif

	if (fd == -1) {
		return false; /* Error. */
	}

	ssize_t nread = malloc_read_fd(fd, &buf, sizeof(buf));
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close)
	syscall(SYS_close, fd);
#else
	close(fd);
#endif

	if (nread < 1) {
		return false; /* Error. */
	}
	/*
	 * /proc/sys/vm/overcommit_memory meanings:
	 * 0: Heuristic overcommit.
	 * 1: Always overcommit.
	 * 2: Never overcommit.
	 */
	return (buf[0] == '0' || buf[0] == '1');
}
#endif

void
pages_set_thp_state(void *ptr, size_t size) {
	if (opt_thp == thp_mode_default || opt_thp == init_system_thp_mode) {
		return;
	}
	assert(opt_thp != thp_mode_not_supported &&
	    init_system_thp_mode != thp_mode_not_supported);

	if (opt_thp == thp_mode_always &&
	    init_system_thp_mode != thp_mode_never) {
		assert(init_system_thp_mode == thp_mode_default);
		pages_huge_unaligned(ptr, size);
	} else if (opt_thp == thp_mode_never) {
		assert(init_system_thp_mode == thp_mode_default ||
		    init_system_thp_mode == thp_mode_always);
		pages_nohuge_unaligned(ptr, size);
	}
}

static void
init_thp_state(void) {
	if (!have_madvise_huge) {
		if (metadata_thp_enabled() && opt_abort) {
			malloc_write("<jemalloc>: no MADV_HUGEPAGE support\n");
			abort();
		}
		goto label_error;
	}

	static const char sys_state_madvise[] = "always [madvise] never\n";
	static const char sys_state_always[] = "[always] madvise never\n";
	static const char sys_state_never[] = "always madvise [never]\n";
	char buf[sizeof(sys_state_madvise)];

#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open)
	int fd = (int)syscall(SYS_open,
	    "/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY);
#else
	int fd = open("/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY);
#endif
	if (fd == -1) {
		goto label_error;
	}

	ssize_t nread = malloc_read_fd(fd, &buf, sizeof(buf));
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close)
	syscall(SYS_close, fd);
#else
	close(fd);
#endif

	if (nread < 0) {
		goto label_error;
	}

	if (strncmp(buf, sys_state_madvise, (size_t)nread) == 0) {
		init_system_thp_mode = thp_mode_default;
	} else if (strncmp(buf, sys_state_always, (size_t)nread) == 0) {
		init_system_thp_mode = thp_mode_always;
	} else if (strncmp(buf, sys_state_never, (size_t)nread) == 0) {
		init_system_thp_mode = thp_mode_never;
	} else {
		goto label_error;
	}
	return;
label_error:
	opt_thp = init_system_thp_mode = thp_mode_not_supported;
}
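
/*
 * Example of the detection above: if
 * /sys/kernel/mm/transparent_hugepage/enabled reads "always [madvise] never",
 * init_thp_state() records thp_mode_default, and pages_set_thp_state() later
 * issues MADV_HUGEPAGE or MADV_NOHUGEPAGE only when opt_thp diverges from
 * that system-wide setting.
 */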

bool
pages_boot(void) {
	os_page = os_page_detect();
	if (os_page > PAGE) {
		malloc_write("<jemalloc>: Unsupported system page size\n");
		if (opt_abort) {
			abort();
		}
		return true;
	}

#ifndef _WIN32
	mmap_flags = MAP_PRIVATE | MAP_ANON;
#endif

#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
	os_overcommits = os_overcommits_sysctl();
#elif defined(JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY)
	os_overcommits = os_overcommits_proc();
#  ifdef MAP_NORESERVE
	if (os_overcommits) {
		mmap_flags |= MAP_NORESERVE;
	}
#  endif
#else
	os_overcommits = false;
#endif

	init_thp_state();

#ifdef __FreeBSD__
	/*
	 * FreeBSD doesn't need the check; madvise(2) is known to work.
	 */
#else
	/* Detect lazy purge runtime support. */
	if (pages_can_purge_lazy) {
		bool committed = false;
		void *madv_free_page = os_pages_map(NULL, PAGE, PAGE,
		    &committed);
		if (madv_free_page == NULL) {
			return true;
		}
		assert(pages_can_purge_lazy_runtime);
		if (pages_purge_lazy(madv_free_page, PAGE)) {
			pages_can_purge_lazy_runtime = false;
		}
		os_pages_unmap(madv_free_page, PAGE);
	}
#endif

	return false;
}
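
/*
 * Bootstrap usage sketch (illustrative): pages_boot() must run once before
 * any other pages_* function, since it detects the OS page size, overcommit
 * behavior, and THP state that the functions above rely on:
 *
 *	if (pages_boot()) {
 *		// Unsupported page size or failed runtime detection.
 *	}
 *	bool commit = true;
 *	void *p = pages_map(NULL, 2 * PAGE, PAGE, &commit);
 *	if (p != NULL) {
 *		pages_purge_lazy(p, 2 * PAGE);
 *		pages_unmap(p, 2 * PAGE);
 *	}
 */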