#define JEMALLOC_PAGES_C_
#include "jemalloc/internal/jemalloc_preamble.h"

#include "jemalloc/internal/pages.h"

#include "jemalloc/internal/jemalloc_internal_includes.h"

#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/malloc_io.h"

#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
#include <sys/sysctl.h>
#ifdef __FreeBSD__
#include <sys/auxv.h>
#include <vm/vm_param.h>
#include <vm/vm.h>
#endif
#endif

/******************************************************************************/
/* Data. */

/* Actual operating system page size, detected during bootstrap, <= PAGE. */
static size_t os_page;

#ifndef _WIN32
# define PAGES_PROT_COMMIT (PROT_READ | PROT_WRITE)
# define PAGES_PROT_DECOMMIT (PROT_NONE)
static int mmap_flags;
#endif
static bool os_overcommits;

const char *thp_mode_names[] = {
	"default",
	"always",
	"never",
	"not supported"
};
thp_mode_t opt_thp = THP_MODE_DEFAULT;
thp_mode_t init_system_thp_mode;

/* Runtime support for lazy purge. Irrelevant when !pages_can_purge_lazy. */
static bool pages_can_purge_lazy_runtime = true;

/******************************************************************************/
/*
 * Function prototypes for static functions that are referenced prior to
 * definition.
 */

static void os_pages_unmap(void *addr, size_t size);

/******************************************************************************/

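/*
 * Map `size` bytes at the page-aligned hint `addr` using the raw OS
 * primitive (VirtualAlloc() on Windows, mmap() elsewhere).  If the OS
 * places the mapping somewhere other than a non-NULL hint, the mapping is
 * released and NULL is returned.  When the system overcommits, *commit is
 * forced to true, since all mapped memory is effectively committed.
 */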
static void *
os_pages_map(void *addr, size_t size, size_t alignment, bool *commit) {
	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
	assert(ALIGNMENT_CEILING(size, os_page) == size);
	assert(size != 0);

	if (os_overcommits) {
		*commit = true;
	}

	void *ret;
#ifdef _WIN32
	/*
	 * If VirtualAlloc can't allocate at the given address when one is
	 * given, it fails and returns NULL.
	 */
	ret = VirtualAlloc(addr, size, MEM_RESERVE | (*commit ? MEM_COMMIT : 0),
	    PAGE_READWRITE);
#else
	/*
	 * We don't use MAP_FIXED here, because it can cause the *replacement*
	 * of existing mappings, and we only want to create new mappings.
	 */
	{
		int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;

		ret = mmap(addr, size, prot, mmap_flags, -1, 0);
	}
	assert(ret != NULL);

	if (ret == MAP_FAILED) {
		ret = NULL;
	} else if (addr != NULL && ret != addr) {
		/*
		 * We succeeded in mapping memory, but not in the right place.
		 */
		os_pages_unmap(ret, size);
		ret = NULL;
	}
#endif
	assert(ret == NULL || (addr == NULL && ret != addr) || (addr != NULL &&
	    ret == addr));
	return ret;
}

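/*
 * Carve the size-byte subrange beginning leadsize bytes into the over-sized
 * mapping at addr.  Windows cannot unmap part of a VirtualAlloc() region,
 * so the whole mapping is released and the desired subrange re-mapped
 * (which can fail, yielding NULL); elsewhere the unwanted head and tail
 * are simply munmap()ed.
 */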
static void *
os_pages_trim(void *addr, size_t alloc_size, size_t leadsize, size_t size,
    bool *commit) {
	void *ret = (void *)((uintptr_t)addr + leadsize);

	assert(alloc_size >= leadsize + size);
#ifdef _WIN32
	os_pages_unmap(addr, alloc_size);
	void *new_addr = os_pages_map(ret, size, PAGE, commit);
	if (new_addr == ret) {
		return ret;
	}
	if (new_addr != NULL) {
		os_pages_unmap(new_addr, size);
	}
	return NULL;
#else
	size_t trailsize = alloc_size - leadsize - size;

	if (leadsize != 0) {
		os_pages_unmap(addr, leadsize);
	}
	if (trailsize != 0) {
		os_pages_unmap((void *)((uintptr_t)ret + size), trailsize);
	}
	return ret;
#endif
}

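/*
 * Return a mapping to the OS; failure is logged and, with opt_abort,
 * fatal, since it indicates the address range was not a valid mapping.
 */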
static void
os_pages_unmap(void *addr, size_t size) {
	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
	assert(ALIGNMENT_CEILING(size, os_page) == size);

#ifdef _WIN32
	if (VirtualFree(addr, 0, MEM_RELEASE) == 0)
#else
	if (munmap(addr, size) == -1)
#endif
	{
		char buf[BUFERROR_BUF];

		buferror(get_errno(), buf, sizeof(buf));
		malloc_printf("<jemalloc>: Error in "
#ifdef _WIN32
		    "VirtualFree"
#else
		    "munmap"
#endif
		    "(): %s\n", buf);
		if (opt_abort) {
			abort();
		}
	}
}

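/*
 * Reliable but slow path for aligned mapping: over-allocate by
 * (alignment - os_page) bytes so that an aligned size-byte subrange must
 * exist, then trim the excess.  Illustrative example (hypothetical
 * addresses): with os_page = 4K, size = 8K, alignment = 16K, alloc_size is
 * 20K; if mmap() returns 0x9000, the next 16K-aligned address is 0xc000,
 * so leadsize = 12K and trailsize = 0, and trimming leaves an 8K mapping
 * at 0xc000.  Only on Windows can the trim fail, causing a retry.
 */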
static void *
pages_map_slow(size_t size, size_t alignment, bool *commit) {
	size_t alloc_size = size + alignment - os_page;
	/* Beware size_t wrap-around. */
	if (alloc_size < size) {
		return NULL;
	}

	void *ret;
	do {
		void *pages = os_pages_map(NULL, alloc_size, alignment, commit);
		if (pages == NULL) {
			return NULL;
		}
		size_t leadsize = ALIGNMENT_CEILING((uintptr_t)pages, alignment)
		    - (uintptr_t)pages;
		ret = os_pages_trim(pages, alloc_size, leadsize, size, commit);
	} while (ret == NULL);

	assert(ret != NULL);
	assert(PAGE_ADDR2BASE(ret) == ret);
	return ret;
}

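/*
 * Public entry point: map size bytes with at least `alignment` alignment
 * (>= PAGE), optionally at hint addr.  *commit is both an input (desired
 * commit state) and an output (actual state, e.g. forced true when the
 * system overcommits).
 */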
void *
pages_map(void *addr, size_t size, size_t alignment, bool *commit) {
	assert(alignment >= PAGE);
	assert(ALIGNMENT_ADDR2BASE(addr, alignment) == addr);

#if defined(__FreeBSD__) && defined(MAP_EXCL)
	/*
	 * FreeBSD has mechanisms both to mmap at specific address without
	 * touching existing mappings, and to mmap with specific alignment.
	 */
	{
		if (os_overcommits) {
			*commit = true;
		}

		int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;
		int flags = mmap_flags;

		if (addr != NULL) {
			flags |= MAP_FIXED | MAP_EXCL;
		} else {
			unsigned alignment_bits = ffs_zu(alignment);
			assert(alignment_bits > 1);
			flags |= MAP_ALIGNED(alignment_bits - 1);
		}

		void *ret = mmap(addr, size, prot, flags, -1, 0);
		if (ret == MAP_FAILED) {
			ret = NULL;
		}

		return ret;
	}
#endif
	/*
	 * Ideally, there would be a way to specify alignment to mmap() (like
	 * NetBSD has), but in the absence of such a feature, we have to work
	 * hard to efficiently create aligned mappings.  The reliable, but
	 * slow method is to create a mapping that is over-sized, then trim the
	 * excess.  However, that always results in one or two calls to
	 * os_pages_unmap(), and it can leave holes in the process's virtual
	 * memory map if memory grows downward.
	 *
	 * Optimistically try mapping precisely the right amount before falling
	 * back to the slow method, with the expectation that the optimistic
	 * approach works most of the time.
	 */

	void *ret = os_pages_map(addr, size, os_page, commit);
	if (ret == NULL || ret == addr) {
		return ret;
	}
	assert(addr == NULL);
	if (ALIGNMENT_ADDR2OFFSET(ret, alignment) != 0) {
		os_pages_unmap(ret, size);
		return pages_map_slow(size, alignment, commit);
	}

	assert(PAGE_ADDR2BASE(ret) == ret);
	return ret;
}

void
pages_unmap(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);

	os_pages_unmap(addr, size);
}

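/*
 * Toggle the commit state of an existing mapping.  As with the other
 * helpers in this file, the return value is true on failure.  Under
 * overcommit the commit state cannot usefully change, so "failure" is
 * returned and callers treat the pages as permanently committed.
 */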
static bool
pages_commit_impl(void *addr, size_t size, bool commit) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);

	if (os_overcommits) {
		return true;
	}

#ifdef _WIN32
	return (commit ? (addr != VirtualAlloc(addr, size, MEM_COMMIT,
	    PAGE_READWRITE)) : (!VirtualFree(addr, size, MEM_DECOMMIT)));
#else
	{
		int prot = commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;
		void *result = mmap(addr, size, prot, mmap_flags | MAP_FIXED,
		    -1, 0);
		if (result == MAP_FAILED) {
			return true;
		}
		if (result != addr) {
			/*
			 * We succeeded in mapping memory, but not in the right
			 * place.
			 */
			os_pages_unmap(result, size);
			return true;
		}
		return false;
	}
#endif
}

bool
pages_commit(void *addr, size_t size) {
	return pages_commit_impl(addr, size, true);
}

bool
pages_decommit(void *addr, size_t size) {
	return pages_commit_impl(addr, size, false);
}

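/*
 * Lazy purge: advise the OS that the pages may be reclaimed (MADV_FREE /
 * MEM_RESET), while their contents stay valid until the OS actually needs
 * the memory.  Returns true if the pages could not be purged.
 */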
bool
pages_purge_lazy(void *addr, size_t size) {
	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
	assert(PAGE_CEILING(size) == size);

	if (!pages_can_purge_lazy) {
		return true;
	}
	if (!pages_can_purge_lazy_runtime) {
		/*
		 * Built with lazy purge enabled, but detected it was not
		 * supported on the current system.
		 */
		return true;
	}

#ifdef _WIN32
	VirtualAlloc(addr, size, MEM_RESET, PAGE_READWRITE);
	return false;
#elif defined(JEMALLOC_PURGE_MADVISE_FREE)
	return (madvise(addr, size,
# ifdef MADV_FREE
	    MADV_FREE
# else
	    JEMALLOC_MADV_FREE
# endif
	    ) != 0);
#elif defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \
    !defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)
	return (madvise(addr, size, MADV_DONTNEED) != 0);
#else
	not_reached();
#endif
}

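/*
 * Forced purge: the pages must read back as zeros afterwards, via
 * MADV_DONTNEED where it is known to zero, or by overlaying a fresh
 * demand-zeroed mapping.  Returns true if the pages could not be purged.
 */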
bool
pages_purge_forced(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);

	if (!pages_can_purge_forced) {
		return true;
	}

#if defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \
    defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)
	return (madvise(addr, size, MADV_DONTNEED) != 0);
#elif defined(JEMALLOC_MAPS_COALESCE)
	/* Try to overlay a new demand-zeroed mapping. */
	return pages_commit(addr, size);
#else
	not_reached();
#endif
}

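/*
 * Request (MADV_HUGEPAGE) or forbid (MADV_NOHUGEPAGE) transparent huge
 * pages for a range.  The public pages_huge()/pages_nohuge() insist on
 * HUGEPAGE alignment; the *_unaligned() variants serve callers that only
 * care about THP state, not layout.
 */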
static bool
pages_huge_impl(void *addr, size_t size, bool aligned) {
	if (aligned) {
		assert(HUGEPAGE_ADDR2BASE(addr) == addr);
		assert(HUGEPAGE_CEILING(size) == size);
	}
#ifdef JEMALLOC_HAVE_MADVISE_HUGE
	return (madvise(addr, size, MADV_HUGEPAGE) != 0);
#else
	return true;
#endif
}

bool
pages_huge(void *addr, size_t size) {
	return pages_huge_impl(addr, size, true);
}

static bool
pages_huge_unaligned(void *addr, size_t size) {
	return pages_huge_impl(addr, size, false);
}

static bool
pages_nohuge_impl(void *addr, size_t size, bool aligned) {
	if (aligned) {
		assert(HUGEPAGE_ADDR2BASE(addr) == addr);
		assert(HUGEPAGE_CEILING(size) == size);
	}

#ifdef JEMALLOC_HAVE_MADVISE_HUGE
	return (madvise(addr, size, MADV_NOHUGEPAGE) != 0);
#else
	return false;
#endif
}

bool
pages_nohuge(void *addr, size_t size) {
	return pages_nohuge_impl(addr, size, true);
}

static bool
pages_nohuge_unaligned(void *addr, size_t size) {
	return pages_nohuge_impl(addr, size, false);
}

bool
pages_dontdump(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);
#ifdef JEMALLOC_MADVISE_DONTDUMP
	return madvise(addr, size, MADV_DONTDUMP) != 0;
#else
	return false;
#endif
}

bool
pages_dodump(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);
#ifdef JEMALLOC_MADVISE_DONTDUMP
	return madvise(addr, size, MADV_DODUMP) != 0;
#else
	return false;
#endif
}

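/*
 * Query the OS for its actual page size; platform-specific paths avoid a
 * syscall where possible, with sysconf(_SC_PAGESIZE) as the portable
 * fallback.
 */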
static size_t
os_page_detect(void) {
#ifdef _WIN32
	SYSTEM_INFO si;
	GetSystemInfo(&si);
	return si.dwPageSize;
#elif defined(__FreeBSD__)
	/*
	 * This returns the value obtained from
	 * the auxv vector, avoiding a syscall.
	 */
	return getpagesize();
#else
	long result = sysconf(_SC_PAGESIZE);
	if (result == -1) {
		/* Fall back to the compile-time page size, not its log2. */
		return PAGE;
	}
	return (size_t)result;
#endif
}

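/*
 * FreeBSD: determine whether the kernel overcommits.  The AT_BSDFLAGS ELF
 * auxiliary vector entry is preferred (no syscall); otherwise fall back to
 * the vm.overcommit sysctl.  Overcommit is in effect unless swap reserve
 * enforcement (SWAP_RESERVE_FORCE_ON / SWAP_RESERVE_RLIMIT_ON) is active.
 */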
#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
static bool
os_overcommits_sysctl(void) {
	int vm_overcommit;
	size_t sz;
	int bsdflags;

	if (_elf_aux_info(AT_BSDFLAGS, &bsdflags, sizeof(bsdflags)) == 0) {
		return ((bsdflags & ELF_BSDF_VMNOOVERCOMMIT) == 0);
	}

	sz = sizeof(vm_overcommit);
#if defined(__FreeBSD__) && defined(VM_OVERCOMMIT)
	int mib[2];

	mib[0] = CTL_VM;
	mib[1] = VM_OVERCOMMIT;
	if (sysctl(mib, 2, &vm_overcommit, &sz, NULL, 0) != 0) {
		return false; /* Error. */
	}
#else
	if (sysctlbyname("vm.overcommit", &vm_overcommit, &sz, NULL, 0) != 0) {
		return false; /* Error. */
	}
#endif

	return ((vm_overcommit & (SWAP_RESERVE_FORCE_ON |
	    SWAP_RESERVE_RLIMIT_ON)) == 0);
}
#endif

#ifdef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY
/*
 * Use syscall(2) rather than {open,read,close}(2) when possible to avoid
 * reentry during bootstrapping if another library has interposed system call
 * wrappers.
 */
static bool
os_overcommits_proc(void) {
	int fd;
	char buf[1];

#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open)
#if defined(O_CLOEXEC)
	fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY |
	    O_CLOEXEC);
#else
	fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY);
	if (fd != -1) {
		fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
	}
#endif
#elif defined(JEMALLOC_USE_SYSCALL) && defined(SYS_openat)
#if defined(O_CLOEXEC)
	fd = (int)syscall(SYS_openat,
	    AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC);
#else
	fd = (int)syscall(SYS_openat,
	    AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY);
	if (fd != -1) {
		fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
	}
#endif
#else
#if defined(O_CLOEXEC)
	fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC);
#else
	fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY);
	if (fd != -1) {
		fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
	}
#endif
#endif

	if (fd == -1) {
		return false; /* Error. */
	}

	ssize_t nread = malloc_read_fd(fd, &buf, sizeof(buf));
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close)
	syscall(SYS_close, fd);
#else
	close(fd);
#endif

	if (nread < 1) {
		return false; /* Error. */
	}
	/*
	 * /proc/sys/vm/overcommit_memory meanings:
	 * 0: Heuristic overcommit.
	 * 1: Always overcommit.
	 * 2: Never overcommit.
	 */
	return (buf[0] == '0' || buf[0] == '1');
}
#endif

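/*
 * Apply opt_thp to a range when it differs from the THP state the range
 * inherited from the system default (init_system_thp_mode).
 */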
void
pages_set_thp_state(void *ptr, size_t size) {
	if (opt_thp == thp_mode_default || opt_thp == init_system_thp_mode) {
		return;
	}
	assert(opt_thp != thp_mode_not_supported &&
	    init_system_thp_mode != thp_mode_not_supported);

	if (opt_thp == thp_mode_always
	    && init_system_thp_mode != thp_mode_never) {
		assert(init_system_thp_mode == thp_mode_default);
		pages_huge_unaligned(ptr, size);
	} else if (opt_thp == thp_mode_never) {
		assert(init_system_thp_mode == thp_mode_default ||
		    init_system_thp_mode == thp_mode_always);
		pages_nohuge_unaligned(ptr, size);
	}
}

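/*
 * Determine the boot-time system THP mode by parsing
 * /sys/kernel/mm/transparent_hugepage/enabled, which brackets the active
 * setting, e.g. "always [madvise] never".  Without MADV_HUGEPAGE support,
 * or if the file is unreadable or unrecognized, THP is marked unsupported.
 */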
static void
init_thp_state(void) {
	if (!have_madvise_huge) {
		if (metadata_thp_enabled() && opt_abort) {
			malloc_write("<jemalloc>: no MADV_HUGEPAGE support\n");
			abort();
		}
		goto label_error;
	}

	static const char sys_state_madvise[] = "always [madvise] never\n";
	static const char sys_state_always[] = "[always] madvise never\n";
	static const char sys_state_never[] = "always madvise [never]\n";
	char buf[sizeof(sys_state_madvise)];

#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open)
	int fd = (int)syscall(SYS_open,
	    "/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY);
#else
	int fd = open("/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY);
#endif
	if (fd == -1) {
		goto label_error;
	}

	ssize_t nread = malloc_read_fd(fd, &buf, sizeof(buf));
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close)
	syscall(SYS_close, fd);
#else
	close(fd);
#endif

	/* Treat an empty read as an error, as os_overcommits_proc() does. */
	if (nread < 1) {
		goto label_error;
	}

	if (strncmp(buf, sys_state_madvise, (size_t)nread) == 0) {
		init_system_thp_mode = thp_mode_default;
	} else if (strncmp(buf, sys_state_always, (size_t)nread) == 0) {
		init_system_thp_mode = thp_mode_always;
	} else if (strncmp(buf, sys_state_never, (size_t)nread) == 0) {
		init_system_thp_mode = thp_mode_never;
	} else {
		goto label_error;
	}
	return;
label_error:
	opt_thp = init_system_thp_mode = thp_mode_not_supported;
}

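/*
 * One-time bootstrap: detect the OS page size, set base mmap flags, probe
 * overcommit behavior, record the system THP mode, and verify at runtime
 * that lazy purge actually works.  Returns true on failure.
 */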
bool
pages_boot(void) {
	os_page = os_page_detect();
	if (os_page > PAGE) {
		malloc_write("<jemalloc>: Unsupported system page size\n");
		if (opt_abort) {
			abort();
		}
		return true;
	}

#ifndef _WIN32
	mmap_flags = MAP_PRIVATE | MAP_ANON;
#endif

#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
	os_overcommits = os_overcommits_sysctl();
#elif defined(JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY)
	os_overcommits = os_overcommits_proc();
# ifdef MAP_NORESERVE
	if (os_overcommits) {
		mmap_flags |= MAP_NORESERVE;
	}
# endif
#else
	os_overcommits = false;
#endif

	init_thp_state();

#ifdef __FreeBSD__
	/*
	 * FreeBSD doesn't need the check; madvise(2) is known to work.
	 */
#else
	/* Detect lazy purge runtime support. */
	if (pages_can_purge_lazy) {
		bool committed = false;
		void *madv_free_page = os_pages_map(NULL, PAGE, PAGE, &committed);
		if (madv_free_page == NULL) {
			return true;
		}
		assert(pages_can_purge_lazy_runtime);
		if (pages_purge_lazy(madv_free_page, PAGE)) {
			pages_can_purge_lazy_runtime = false;
		}
		os_pages_unmap(madv_free_page, PAGE);
	}
#endif

	return false;
}