xref: /freebsd/contrib/jemalloc/src/pages.c (revision 99282790b7d01ec3c4072621d46a0d7302517ad4)
1 #define JEMALLOC_PAGES_C_
2 #include "jemalloc/internal/jemalloc_preamble.h"
3 
4 #include "jemalloc/internal/pages.h"
5 
6 #include "jemalloc/internal/jemalloc_internal_includes.h"
7 
8 #include "jemalloc/internal/assert.h"
9 #include "jemalloc/internal/malloc_io.h"
10 
11 #ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
12 #include <sys/sysctl.h>
13 #ifdef __FreeBSD__
14 #include <vm/vm_param.h>
15 #endif
16 #endif
17 
18 /******************************************************************************/
19 /* Data. */
20 
21 /* Actual operating system page size, detected during bootstrap, <= PAGE. */
22 static size_t	os_page;
23 
24 #ifndef _WIN32
25 #  define PAGES_PROT_COMMIT (PROT_READ | PROT_WRITE)
26 #  define PAGES_PROT_DECOMMIT (PROT_NONE)
27 static int	mmap_flags;
28 #endif
29 static bool	os_overcommits;
30 
31 const char *thp_mode_names[] = {
32 	"default",
33 	"always",
34 	"never",
35 	"not supported"
36 };
37 thp_mode_t opt_thp = THP_MODE_DEFAULT;
38 thp_mode_t init_system_thp_mode;
39 
40 /* Runtime support for lazy purge. Irrelevant when !pages_can_purge_lazy. */
41 static bool pages_can_purge_lazy_runtime = true;
42 
43 /******************************************************************************/
44 /*
45  * Function prototypes for static functions that are referenced prior to
46  * definition.
47  */
48 
49 static void os_pages_unmap(void *addr, size_t size);
50 
51 /******************************************************************************/
52 
53 static void *
54 os_pages_map(void *addr, size_t size, size_t alignment, bool *commit) {
55 	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
56 	assert(ALIGNMENT_CEILING(size, os_page) == size);
57 	assert(size != 0);
58 
59 	if (os_overcommits) {
60 		*commit = true;
61 	}
62 
63 	void *ret;
64 #ifdef _WIN32
65 	/*
66 	 * If VirtualAlloc can't allocate at the given address when one is
67 	 * given, it fails and returns NULL.
68 	 */
69 	ret = VirtualAlloc(addr, size, MEM_RESERVE | (*commit ? MEM_COMMIT : 0),
70 	    PAGE_READWRITE);
71 #else
72 	/*
73 	 * We don't use MAP_FIXED here, because it can cause the *replacement*
74 	 * of existing mappings, and we only want to create new mappings.
75 	 */
76 	{
77 		int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;
78 
79 		ret = mmap(addr, size, prot, mmap_flags, -1, 0);
80 	}
81 	assert(ret != NULL);
82 
83 	if (ret == MAP_FAILED) {
84 		ret = NULL;
85 	} else if (addr != NULL && ret != addr) {
86 		/*
87 		 * We succeeded in mapping memory, but not in the right place.
88 		 */
89 		os_pages_unmap(ret, size);
90 		ret = NULL;
91 	}
92 #endif
93 	assert(ret == NULL || (addr == NULL && ret != addr) || (addr != NULL &&
94 	    ret == addr));
95 	return ret;
96 }
97 
/*
 * Reduce the over-sized mapping [addr, addr + alloc_size) to the size-byte
 * region that begins leadsize bytes into it.
 *
 * On Windows the whole mapping is released and the wanted region is mapped
 * again, which can fail; NULL is returned in that case.  Elsewhere the
 * leading and trailing excess are unmapped and the region is always returned.
 */
static void *
os_pages_trim(void *addr, size_t alloc_size, size_t leadsize, size_t size,
    bool *commit) {
	void *region = (void *)((uintptr_t)addr + leadsize);

	assert(alloc_size >= leadsize + size);
#ifdef _WIN32
	os_pages_unmap(addr, alloc_size);
	void *remapped = os_pages_map(region, size, PAGE, commit);
	if (remapped != region) {
		/* Either nothing was mapped, or it landed somewhere else. */
		if (remapped != NULL) {
			os_pages_unmap(remapped, size);
		}
		return NULL;
	}
	return region;
#else
	size_t trailsize = alloc_size - leadsize - size;

	if (leadsize != 0) {
		os_pages_unmap(addr, leadsize);
	}
	if (trailsize != 0) {
		void *tail = (void *)((uintptr_t)region + size);
		os_pages_unmap(tail, trailsize);
	}
	return region;
#endif
}
126 
127 static void
128 os_pages_unmap(void *addr, size_t size) {
129 	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
130 	assert(ALIGNMENT_CEILING(size, os_page) == size);
131 
132 #ifdef _WIN32
133 	if (VirtualFree(addr, 0, MEM_RELEASE) == 0)
134 #else
135 	if (munmap(addr, size) == -1)
136 #endif
137 	{
138 		char buf[BUFERROR_BUF];
139 
140 		buferror(get_errno(), buf, sizeof(buf));
141 		malloc_printf("<jemalloc>: Error in "
142 #ifdef _WIN32
143 		    "VirtualFree"
144 #else
145 		    "munmap"
146 #endif
147 		    "(): %s\n", buf);
148 		if (opt_abort) {
149 			abort();
150 		}
151 	}
152 }
153 
154 static void *
155 pages_map_slow(size_t size, size_t alignment, bool *commit) {
156 	size_t alloc_size = size + alignment - os_page;
157 	/* Beware size_t wrap-around. */
158 	if (alloc_size < size) {
159 		return NULL;
160 	}
161 
162 	void *ret;
163 	do {
164 		void *pages = os_pages_map(NULL, alloc_size, alignment, commit);
165 		if (pages == NULL) {
166 			return NULL;
167 		}
168 		size_t leadsize = ALIGNMENT_CEILING((uintptr_t)pages, alignment)
169 		    - (uintptr_t)pages;
170 		ret = os_pages_trim(pages, alloc_size, leadsize, size, commit);
171 	} while (ret == NULL);
172 
173 	assert(ret != NULL);
174 	assert(PAGE_ADDR2BASE(ret) == ret);
175 	return ret;
176 }
177 
178 void *
179 pages_map(void *addr, size_t size, size_t alignment, bool *commit) {
180 	assert(alignment >= PAGE);
181 	assert(ALIGNMENT_ADDR2BASE(addr, alignment) == addr);
182 
183 #if defined(__FreeBSD__) && defined(MAP_EXCL)
184 	/*
185 	 * FreeBSD has mechanisms both to mmap at specific address without
186 	 * touching existing mappings, and to mmap with specific alignment.
187 	 */
188 	{
189 		if (os_overcommits) {
190 			*commit = true;
191 		}
192 
193 		int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;
194 		int flags = mmap_flags;
195 
196 		if (addr != NULL) {
197 			flags |= MAP_FIXED | MAP_EXCL;
198 		} else {
199 			unsigned alignment_bits = ffs_zu(alignment);
200 			assert(alignment_bits > 1);
201 			flags |= MAP_ALIGNED(alignment_bits - 1);
202 		}
203 
204 		void *ret = mmap(addr, size, prot, flags, -1, 0);
205 		if (ret == MAP_FAILED) {
206 			ret = NULL;
207 		}
208 
209 		return ret;
210 	}
211 #endif
212 	/*
213 	 * Ideally, there would be a way to specify alignment to mmap() (like
214 	 * NetBSD has), but in the absence of such a feature, we have to work
215 	 * hard to efficiently create aligned mappings.  The reliable, but
216 	 * slow method is to create a mapping that is over-sized, then trim the
217 	 * excess.  However, that always results in one or two calls to
218 	 * os_pages_unmap(), and it can leave holes in the process's virtual
219 	 * memory map if memory grows downward.
220 	 *
221 	 * Optimistically try mapping precisely the right amount before falling
222 	 * back to the slow method, with the expectation that the optimistic
223 	 * approach works most of the time.
224 	 */
225 
226 	void *ret = os_pages_map(addr, size, os_page, commit);
227 	if (ret == NULL || ret == addr) {
228 		return ret;
229 	}
230 	assert(addr == NULL);
231 	if (ALIGNMENT_ADDR2OFFSET(ret, alignment) != 0) {
232 		os_pages_unmap(ret, size);
233 		return pages_map_slow(size, alignment, commit);
234 	}
235 
236 	assert(PAGE_ADDR2BASE(ret) == ret);
237 	return ret;
238 }
239 
/*
 * Unmap the PAGE-aligned region [addr, addr + size); size must be a multiple
 * of PAGE.  Failures are handled inside os_pages_unmap().
 */
void
pages_unmap(void *addr, size_t size) {
	assert(PAGE_CEILING(size) == size);
	assert(PAGE_ADDR2BASE(addr) == addr);

	os_pages_unmap(addr, size);
}
247 
248 static bool
249 pages_commit_impl(void *addr, size_t size, bool commit) {
250 	assert(PAGE_ADDR2BASE(addr) == addr);
251 	assert(PAGE_CEILING(size) == size);
252 
253 	if (os_overcommits) {
254 		return true;
255 	}
256 
257 #ifdef _WIN32
258 	return (commit ? (addr != VirtualAlloc(addr, size, MEM_COMMIT,
259 	    PAGE_READWRITE)) : (!VirtualFree(addr, size, MEM_DECOMMIT)));
260 #else
261 	{
262 		int prot = commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;
263 		void *result = mmap(addr, size, prot, mmap_flags | MAP_FIXED,
264 		    -1, 0);
265 		if (result == MAP_FAILED) {
266 			return true;
267 		}
268 		if (result != addr) {
269 			/*
270 			 * We succeeded in mapping memory, but not in the right
271 			 * place.
272 			 */
273 			os_pages_unmap(result, size);
274 			return true;
275 		}
276 		return false;
277 	}
278 #endif
279 }
280 
/*
 * Commit the region [addr, addr + size).  Forwards to pages_commit_impl()
 * and returns its result (false on success).
 */
bool
pages_commit(void *addr, size_t size) {
	bool failed = pages_commit_impl(addr, size, true);
	return failed;
}
285 
/*
 * Decommit the region [addr, addr + size).  Forwards to pages_commit_impl()
 * and returns its result (false on success).
 */
bool
pages_decommit(void *addr, size_t size) {
	bool failed = pages_commit_impl(addr, size, false);
	return failed;
}
290 
291 bool
292 pages_purge_lazy(void *addr, size_t size) {
293 	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
294 	assert(PAGE_CEILING(size) == size);
295 
296 	if (!pages_can_purge_lazy) {
297 		return true;
298 	}
299 	if (!pages_can_purge_lazy_runtime) {
300 		/*
301 		 * Built with lazy purge enabled, but detected it was not
302 		 * supported on the current system.
303 		 */
304 		return true;
305 	}
306 
307 #ifdef _WIN32
308 	VirtualAlloc(addr, size, MEM_RESET, PAGE_READWRITE);
309 	return false;
310 #elif defined(JEMALLOC_PURGE_MADVISE_FREE)
311 	return (madvise(addr, size,
312 #  ifdef MADV_FREE
313 	    MADV_FREE
314 #  else
315 	    JEMALLOC_MADV_FREE
316 #  endif
317 	    ) != 0);
318 #elif defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \
319     !defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)
320 	return (madvise(addr, size, MADV_DONTNEED) != 0);
321 #else
322 	not_reached();
323 #endif
324 }
325 
326 bool
327 pages_purge_forced(void *addr, size_t size) {
328 	assert(PAGE_ADDR2BASE(addr) == addr);
329 	assert(PAGE_CEILING(size) == size);
330 
331 	if (!pages_can_purge_forced) {
332 		return true;
333 	}
334 
335 #if defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \
336     defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)
337 	return (madvise(addr, size, MADV_DONTNEED) != 0);
338 #elif defined(JEMALLOC_MAPS_COALESCE)
339 	/* Try to overlay a new demand-zeroed mapping. */
340 	return pages_commit(addr, size);
341 #else
342 	not_reached();
343 #endif
344 }
345 
/*
 * Apply MADV_HUGEPAGE to [addr, addr + size).  When aligned is set, addr and
 * size are asserted to be huge-page aligned.
 *
 * Returns true if madvise() fails or MADV_HUGEPAGE support was not compiled
 * in; returns false on success.
 */
static bool
pages_huge_impl(void *addr, size_t size, bool aligned) {
	if (aligned) {
		assert(HUGEPAGE_CEILING(size) == size);
		assert(HUGEPAGE_ADDR2BASE(addr) == addr);
	}
#ifdef JEMALLOC_HAVE_MADVISE_HUGE
	int rc = madvise(addr, size, MADV_HUGEPAGE);
	return (rc != 0);
#else
	return true;
#endif
}
358 
/*
 * Apply MADV_HUGEPAGE to a huge-page-aligned region; returns the result of
 * pages_huge_impl() (false on success).
 */
bool
pages_huge(void *addr, size_t size) {
	bool failed = pages_huge_impl(addr, size, true);
	return failed;
}
363 
/*
 * Same as pages_huge(), but without the huge-page alignment assertions on
 * addr and size.
 */
static bool
pages_huge_unaligned(void *addr, size_t size) {
	bool failed = pages_huge_impl(addr, size, false);
	return failed;
}
368 
/*
 * Apply MADV_NOHUGEPAGE to [addr, addr + size).  When aligned is set, addr
 * and size are asserted to be huge-page aligned.
 *
 * Returns true if madvise() fails.  Returns false on success, and also when
 * MADV_HUGEPAGE support was not compiled in.
 */
static bool
pages_nohuge_impl(void *addr, size_t size, bool aligned) {
	if (aligned) {
		assert(HUGEPAGE_CEILING(size) == size);
		assert(HUGEPAGE_ADDR2BASE(addr) == addr);
	}

#ifdef JEMALLOC_HAVE_MADVISE_HUGE
	int rc = madvise(addr, size, MADV_NOHUGEPAGE);
	return (rc != 0);
#else
	return false;
#endif
}
382 
/*
 * Apply MADV_NOHUGEPAGE to a huge-page-aligned region; returns the result of
 * pages_nohuge_impl() (false on success).
 */
bool
pages_nohuge(void *addr, size_t size) {
	bool failed = pages_nohuge_impl(addr, size, true);
	return failed;
}
387 
/*
 * Same as pages_nohuge(), but without the huge-page alignment assertions on
 * addr and size.
 */
static bool
pages_nohuge_unaligned(void *addr, size_t size) {
	bool failed = pages_nohuge_impl(addr, size, false);
	return failed;
}
392 
/*
 * Apply MADV_DONTDUMP to the PAGE-aligned region [addr, addr + size).
 * Returns true if madvise() fails; returns false on success or when
 * JEMALLOC_MADVISE_DONTDUMP is not defined.
 */
bool
pages_dontdump(void *addr, size_t size) {
	assert(PAGE_CEILING(size) == size);
	assert(PAGE_ADDR2BASE(addr) == addr);
#ifdef JEMALLOC_MADVISE_DONTDUMP
	int rc = madvise(addr, size, MADV_DONTDUMP);
	return rc != 0;
#else
	return false;
#endif
}
403 
/*
 * Apply MADV_DODUMP to the PAGE-aligned region [addr, addr + size).
 * Returns true if madvise() fails; returns false on success or when
 * JEMALLOC_MADVISE_DONTDUMP is not defined.
 */
bool
pages_dodump(void *addr, size_t size) {
	assert(PAGE_CEILING(size) == size);
	assert(PAGE_ADDR2BASE(addr) == addr);
#ifdef JEMALLOC_MADVISE_DONTDUMP
	int rc = madvise(addr, size, MADV_DODUMP);
	return rc != 0;
#else
	return false;
#endif
}
414 
415 
416 static size_t
417 os_page_detect(void) {
418 #ifdef _WIN32
419 	SYSTEM_INFO si;
420 	GetSystemInfo(&si);
421 	return si.dwPageSize;
422 #elif defined(__FreeBSD__)
423 	/*
424 	 * This returns the value obtained from
425 	 * the auxv vector, avoiding a syscall.
426 	 */
427 	return getpagesize();
428 #else
429 	long result = sysconf(_SC_PAGESIZE);
430 	if (result == -1) {
431 		return LG_PAGE;
432 	}
433 	return (size_t)result;
434 #endif
435 }
436 
#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
/*
 * Query the vm.overcommit sysctl.  Returns true when the low two bits of the
 * value are both clear; returns false otherwise, including when the sysctl
 * call itself fails.
 */
static bool
os_overcommits_sysctl(void) {
	int vm_overcommit;
	size_t sz = sizeof(vm_overcommit);
	int rc;

#if defined(__FreeBSD__) && defined(VM_OVERCOMMIT)
	int mib[2] = {CTL_VM, VM_OVERCOMMIT};

	rc = sysctl(mib, 2, &vm_overcommit, &sz, NULL, 0);
#else
	rc = sysctlbyname("vm.overcommit", &vm_overcommit, &sz, NULL, 0);
#endif
	if (rc != 0) {
		return false; /* Error. */
	}

	return ((vm_overcommit & 0x3) == 0);
}
#endif
461 
#ifdef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY
/*
 * Read the first byte of /proc/sys/vm/overcommit_memory and report whether
 * the kernel overcommits.  Returns false if the file cannot be opened or
 * read.
 *
 * Use syscall(2) rather than {open,read,close}(2) when possible to avoid
 * reentry during bootstrapping if another library has interposed system call
 * wrappers.
 */
static bool
os_overcommits_proc(void) {
	static const char path[] = "/proc/sys/vm/overcommit_memory";
	int fd;
	char first[1];

	/* Ask for close-on-exec atomically at open time when available. */
#if defined(O_CLOEXEC)
	const int oflags = O_RDONLY | O_CLOEXEC;
#else
	const int oflags = O_RDONLY;
#endif

#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open)
	fd = (int)syscall(SYS_open, path, oflags);
#elif defined(JEMALLOC_USE_SYSCALL) && defined(SYS_openat)
	fd = (int)syscall(SYS_openat, AT_FDCWD, path, oflags);
#else
	fd = open(path, oflags);
#endif

#if !defined(O_CLOEXEC)
	/* No O_CLOEXEC: set the close-on-exec flag after the fact. */
	if (fd != -1) {
		fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
	}
#endif

	if (fd == -1) {
		return false; /* Error. */
	}

	ssize_t nread = malloc_read_fd(fd, &first, sizeof(first));
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close)
	syscall(SYS_close, fd);
#else
	close(fd);
#endif

	if (nread < 1) {
		return false; /* Error. */
	}
	/*
	 * /proc/sys/vm/overcommit_memory meanings:
	 * 0: Heuristic overcommit.
	 * 1: Always overcommit.
	 * 2: Never overcommit.
	 */
	return (first[0] == '0' || first[0] == '1');
}
#endif
528 
529 void
530 pages_set_thp_state (void *ptr, size_t size) {
531 	if (opt_thp == thp_mode_default || opt_thp == init_system_thp_mode) {
532 		return;
533 	}
534 	assert(opt_thp != thp_mode_not_supported &&
535 	    init_system_thp_mode != thp_mode_not_supported);
536 
537 	if (opt_thp == thp_mode_always
538 	    && init_system_thp_mode != thp_mode_never) {
539 		assert(init_system_thp_mode == thp_mode_default);
540 		pages_huge_unaligned(ptr, size);
541 	} else if (opt_thp == thp_mode_never) {
542 		assert(init_system_thp_mode == thp_mode_default ||
543 		    init_system_thp_mode == thp_mode_always);
544 		pages_nohuge_unaligned(ptr, size);
545 	}
546 }
547 
548 static void
549 init_thp_state(void) {
550 	if (!have_madvise_huge) {
551 		if (metadata_thp_enabled() && opt_abort) {
552 			malloc_write("<jemalloc>: no MADV_HUGEPAGE support\n");
553 			abort();
554 		}
555 		goto label_error;
556 	}
557 
558 	static const char sys_state_madvise[] = "always [madvise] never\n";
559 	static const char sys_state_always[] = "[always] madvise never\n";
560 	static const char sys_state_never[] = "always madvise [never]\n";
561 	char buf[sizeof(sys_state_madvise)];
562 
563 #if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open)
564 	int fd = (int)syscall(SYS_open,
565 	    "/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY);
566 #else
567 	int fd = open("/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY);
568 #endif
569 	if (fd == -1) {
570 		goto label_error;
571 	}
572 
573 	ssize_t nread = malloc_read_fd(fd, &buf, sizeof(buf));
574 #if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close)
575 	syscall(SYS_close, fd);
576 #else
577 	close(fd);
578 #endif
579 
580         if (nread < 0) {
581 		goto label_error;
582         }
583 
584 	if (strncmp(buf, sys_state_madvise, (size_t)nread) == 0) {
585 		init_system_thp_mode = thp_mode_default;
586 	} else if (strncmp(buf, sys_state_always, (size_t)nread) == 0) {
587 		init_system_thp_mode = thp_mode_always;
588 	} else if (strncmp(buf, sys_state_never, (size_t)nread) == 0) {
589 		init_system_thp_mode = thp_mode_never;
590 	} else {
591 		goto label_error;
592 	}
593 	return;
594 label_error:
595 	opt_thp = init_system_thp_mode = thp_mode_not_supported;
596 }
597 
598 bool
599 pages_boot(void) {
600 	os_page = os_page_detect();
601 	if (os_page > PAGE) {
602 		malloc_write("<jemalloc>: Unsupported system page size\n");
603 		if (opt_abort) {
604 			abort();
605 		}
606 		return true;
607 	}
608 
609 #ifndef _WIN32
610 	mmap_flags = MAP_PRIVATE | MAP_ANON;
611 #endif
612 
613 #ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
614 	os_overcommits = os_overcommits_sysctl();
615 #elif defined(JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY)
616 	os_overcommits = os_overcommits_proc();
617 #  ifdef MAP_NORESERVE
618 	if (os_overcommits) {
619 		mmap_flags |= MAP_NORESERVE;
620 	}
621 #  endif
622 #else
623 	os_overcommits = false;
624 #endif
625 
626 	init_thp_state();
627 
628 #ifdef __FreeBSD__
629 	/*
630 	 * FreeBSD doesn't need the check; madvise(2) is known to work.
631 	 */
632 #else
633 	/* Detect lazy purge runtime support. */
634 	if (pages_can_purge_lazy) {
635 		bool committed = false;
636 		void *madv_free_page = os_pages_map(NULL, PAGE, PAGE, &committed);
637 		if (madv_free_page == NULL) {
638 			return true;
639 		}
640 		assert(pages_can_purge_lazy_runtime);
641 		if (pages_purge_lazy(madv_free_page, PAGE)) {
642 			pages_can_purge_lazy_runtime = false;
643 		}
644 		os_pages_unmap(madv_free_page, PAGE);
645 	}
646 #endif
647 
648 	return false;
649 }
650