#define JEMALLOC_PAGES_C_
#include "jemalloc/internal/jemalloc_preamble.h"

#include "jemalloc/internal/pages.h"

#include "jemalloc/internal/jemalloc_internal_includes.h"

#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/malloc_io.h"

#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
#include <sys/sysctl.h>
#ifdef __FreeBSD__
#include <sys/auxv.h>
#include <vm/vm_param.h>
#include <vm/vm.h>
#endif
#endif

/******************************************************************************/
/* Data. */

/* Actual operating system page size, detected during bootstrap, <= PAGE. */
static size_t	os_page;

#ifndef _WIN32
#  define PAGES_PROT_COMMIT (PROT_READ | PROT_WRITE)
#  define PAGES_PROT_DECOMMIT (PROT_NONE)
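/*
 * Decommitted ranges keep their address space reservation but are mapped
 * PROT_NONE, so stray accesses fault rather than silently touching memory.
 */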
static int	mmap_flags;
#endif
static bool	os_overcommits;

const char *thp_mode_names[] = {
	"default",
	"always",
	"never",
	"not supported"
};
thp_mode_t opt_thp = THP_MODE_DEFAULT;
thp_mode_t init_system_thp_mode;

/* Runtime support for lazy purge. Irrelevant when !pages_can_purge_lazy. */
static bool pages_can_purge_lazy_runtime = true;

/******************************************************************************/
/*
 * Function prototypes for static functions that are referenced prior to
 * definition.
 */

static void os_pages_unmap(void *addr, size_t size);

/******************************************************************************/

static void *
os_pages_map(void *addr, size_t size, size_t alignment, bool *commit) {
	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
	assert(ALIGNMENT_CEILING(size, os_page) == size);
	assert(size != 0);

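	/*
	 * Under OS-level overcommit the kernel reserves no backing store up
	 * front, so every mapping is effectively committed on first touch;
	 * report it as such.
	 */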
	if (os_overcommits) {
		*commit = true;
	}

	void *ret;
#ifdef _WIN32
	/*
	 * If VirtualAlloc can't allocate at the given address when one is
	 * given, it fails and returns NULL.
	 */
	ret = VirtualAlloc(addr, size, MEM_RESERVE | (*commit ? MEM_COMMIT : 0),
	    PAGE_READWRITE);
#else
	/*
	 * We don't use MAP_FIXED here, because it can cause the *replacement*
	 * of existing mappings, and we only want to create new mappings.
	 */
	{
		int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;

		ret = mmap(addr, size, prot, mmap_flags, -1, 0);
	}
	assert(ret != NULL);

	if (ret == MAP_FAILED) {
		ret = NULL;
	} else if (addr != NULL && ret != addr) {
		/*
		 * We succeeded in mapping memory, but not in the right place.
		 */
		os_pages_unmap(ret, size);
		ret = NULL;
	}
#endif
	assert(ret == NULL || (addr == NULL && ret != addr) || (addr != NULL &&
	    ret == addr));
	return ret;
}

static void *
os_pages_trim(void *addr, size_t alloc_size, size_t leadsize, size_t size,
    bool *commit) {
	void *ret = (void *)((uintptr_t)addr + leadsize);

	assert(alloc_size >= leadsize + size);
#ifdef _WIN32
	os_pages_unmap(addr, alloc_size);
	void *new_addr = os_pages_map(ret, size, PAGE, commit);
	if (new_addr == ret) {
		return ret;
	}
	if (new_addr != NULL) {
		os_pages_unmap(new_addr, size);
	}
	return NULL;
#else
	size_t trailsize = alloc_size - leadsize - size;

	if (leadsize != 0) {
		os_pages_unmap(addr, leadsize);
	}
	if (trailsize != 0) {
		os_pages_unmap((void *)((uintptr_t)ret + size), trailsize);
	}
	return ret;
#endif
}

static void
os_pages_unmap(void *addr, size_t size) {
	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
	assert(ALIGNMENT_CEILING(size, os_page) == size);

#ifdef _WIN32
	if (VirtualFree(addr, 0, MEM_RELEASE) == 0)
#else
	if (munmap(addr, size) == -1)
#endif
	{
		char buf[BUFERROR_BUF];

		buferror(get_errno(), buf, sizeof(buf));
		malloc_printf("<jemalloc>: Error in "
#ifdef _WIN32
		    "VirtualFree"
#else
		    "munmap"
#endif
		    "(): %s\n", buf);
		if (opt_abort) {
			abort();
		}
	}
}

static void *
pages_map_slow(size_t size, size_t alignment, bool *commit) {
	size_t alloc_size = size + alignment - os_page;
	/* Beware size_t wrap-around. */
	if (alloc_size < size) {
		return NULL;
	}

	void *ret;
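	/*
	 * Over-allocate, then trim the mapping down to an aligned sub-range.
	 * On Windows the trim is itself an unmap/remap pair that can lose a
	 * race with another thread, so retry until it sticks.
	 */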
	do {
		void *pages = os_pages_map(NULL, alloc_size, alignment, commit);
		if (pages == NULL) {
			return NULL;
		}
		size_t leadsize = ALIGNMENT_CEILING((uintptr_t)pages, alignment)
		    - (uintptr_t)pages;
		ret = os_pages_trim(pages, alloc_size, leadsize, size, commit);
	} while (ret == NULL);

	assert(ret != NULL);
	assert(PAGE_ADDR2BASE(ret) == ret);
	return ret;
}

void *
pages_map(void *addr, size_t size, size_t alignment, bool *commit) {
	assert(alignment >= PAGE);
	assert(ALIGNMENT_ADDR2BASE(addr, alignment) == addr);

#if defined(__FreeBSD__) && defined(MAP_EXCL)
	/*
	 * FreeBSD has mechanisms both to mmap at a specific address without
	 * touching existing mappings, and to mmap with a specific alignment.
	 */
	{
		if (os_overcommits) {
			*commit = true;
		}

		int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;
		int flags = mmap_flags;

		if (addr != NULL) {
			flags |= MAP_FIXED | MAP_EXCL;
		} else {
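			/*
			 * MAP_ALIGNED(n) requests 2^n alignment.  ffs_zu() is
			 * one-based, so for alignment == 2^k it returns k + 1;
			 * subtract one to recover k.
			 */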
			unsigned alignment_bits = ffs_zu(alignment);
			assert(alignment_bits > 1);
			flags |= MAP_ALIGNED(alignment_bits - 1);
		}

		void *ret = mmap(addr, size, prot, flags, -1, 0);
		if (ret == MAP_FAILED) {
			ret = NULL;
		}

		return ret;
	}
#endif
	/*
	 * Ideally, there would be a way to specify alignment to mmap() (like
	 * NetBSD has), but in the absence of such a feature, we have to work
	 * hard to efficiently create aligned mappings.  The reliable, but
	 * slow method is to create a mapping that is over-sized, then trim the
	 * excess.  However, that always results in one or two calls to
	 * os_pages_unmap(), and it can leave holes in the process's virtual
	 * memory map if memory grows downward.
	 *
	 * Optimistically try mapping precisely the right amount before falling
	 * back to the slow method, with the expectation that the optimistic
	 * approach works most of the time.
	 */

	void *ret = os_pages_map(addr, size, os_page, commit);
	if (ret == NULL || ret == addr) {
		return ret;
	}
	assert(addr == NULL);
	if (ALIGNMENT_ADDR2OFFSET(ret, alignment) != 0) {
		os_pages_unmap(ret, size);
		return pages_map_slow(size, alignment, commit);
	}

	assert(PAGE_ADDR2BASE(ret) == ret);
	return ret;
}
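
/*
 * Illustrative usage sketch: a caller wanting a hugepage-aligned,
 * initially uncommitted reservation might do
 *
 *	bool commit = false;
 *	void *p = pages_map(NULL, HUGEPAGE, HUGEPAGE, &commit);
 *	if (p != NULL) {
 *		...
 *		pages_unmap(p, HUGEPAGE);
 *	}
 *
 * On overcommitting systems the call forces commit to true.
 */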

void
pages_unmap(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);

	os_pages_unmap(addr, size);
}

static bool
pages_commit_impl(void *addr, size_t size, bool commit) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);

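	/*
	 * With overcommit, pages cannot be explicitly (de)committed; return
	 * true (failure) so the caller keeps treating the whole range as
	 * committed.
	 */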
	if (os_overcommits) {
		return true;
	}

#ifdef _WIN32
	return (commit ? (addr != VirtualAlloc(addr, size, MEM_COMMIT,
	    PAGE_READWRITE)) : (!VirtualFree(addr, size, MEM_DECOMMIT)));
#else
	{
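		/*
		 * Overlaying a fresh MAP_FIXED mapping changes the protection
		 * and discards the old page contents in a single step.
		 */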
		int prot = commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;
		void *result = mmap(addr, size, prot, mmap_flags | MAP_FIXED,
		    -1, 0);
		if (result == MAP_FAILED) {
			return true;
		}
		if (result != addr) {
			/*
			 * We succeeded in mapping memory, but not in the right
			 * place.
			 */
			os_pages_unmap(result, size);
			return true;
		}
		return false;
	}
#endif
}

bool
pages_commit(void *addr, size_t size) {
	return pages_commit_impl(addr, size, true);
}

bool
pages_decommit(void *addr, size_t size) {
	return pages_commit_impl(addr, size, false);
}

bool
pages_purge_lazy(void *addr, size_t size) {
	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
	assert(PAGE_CEILING(size) == size);

	if (!pages_can_purge_lazy) {
		return true;
	}
	if (!pages_can_purge_lazy_runtime) {
		/*
		 * Built with lazy purge enabled, but detected it was not
		 * supported on the current system.
		 */
		return true;
	}

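	/*
	 * Lazy purge only hints that the kernel may reclaim the pages; their
	 * contents stay valid until it actually does (MEM_RESET on Windows,
	 * MADV_FREE where available).
	 */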
#ifdef _WIN32
	VirtualAlloc(addr, size, MEM_RESET, PAGE_READWRITE);
	return false;
#elif defined(JEMALLOC_PURGE_MADVISE_FREE)
	return (madvise(addr, size,
#  ifdef MADV_FREE
	    MADV_FREE
#  else
	    JEMALLOC_MADV_FREE
#  endif
	    ) != 0);
#elif defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \
    !defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)
	return (madvise(addr, size, MADV_DONTNEED) != 0);
#else
	not_reached();
#endif
}

bool
pages_purge_forced(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);

	if (!pages_can_purge_forced) {
		return true;
	}

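	/*
	 * Forced purge must leave the pages reading back as zeros.  Linux's
	 * MADV_DONTNEED has that property; otherwise overlay a fresh
	 * demand-zeroed mapping.
	 */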
#if defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \
    defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)
	return (madvise(addr, size, MADV_DONTNEED) != 0);
#elif defined(JEMALLOC_MAPS_COALESCE)
	/* Try to overlay a new demand-zeroed mapping. */
	return pages_commit(addr, size);
#else
	not_reached();
#endif
}

static bool
pages_huge_impl(void *addr, size_t size, bool aligned) {
	if (aligned) {
		assert(HUGEPAGE_ADDR2BASE(addr) == addr);
		assert(HUGEPAGE_CEILING(size) == size);
	}
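	/*
	 * MADV_HUGEPAGE is advisory: it marks the range as eligible for
	 * transparent huge pages rather than guaranteeing them.
	 */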
#ifdef JEMALLOC_HAVE_MADVISE_HUGE
	return (madvise(addr, size, MADV_HUGEPAGE) != 0);
#else
	return true;
#endif
}

bool
pages_huge(void *addr, size_t size) {
	return pages_huge_impl(addr, size, true);
}

static bool
pages_huge_unaligned(void *addr, size_t size) {
	return pages_huge_impl(addr, size, false);
}

static bool
pages_nohuge_impl(void *addr, size_t size, bool aligned) {
	if (aligned) {
		assert(HUGEPAGE_ADDR2BASE(addr) == addr);
		assert(HUGEPAGE_CEILING(size) == size);
	}

#ifdef JEMALLOC_HAVE_MADVISE_HUGE
	return (madvise(addr, size, MADV_NOHUGEPAGE) != 0);
#else
	return false;
#endif
}

bool
pages_nohuge(void *addr, size_t size) {
	return pages_nohuge_impl(addr, size, true);
}

static bool
pages_nohuge_unaligned(void *addr, size_t size) {
	return pages_nohuge_impl(addr, size, false);
}

bool
pages_dontdump(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);
#ifdef JEMALLOC_MADVISE_DONTDUMP
	return madvise(addr, size, MADV_DONTDUMP) != 0;
#else
	return false;
#endif
}

bool
pages_dodump(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);
#ifdef JEMALLOC_MADVISE_DONTDUMP
	return madvise(addr, size, MADV_DODUMP) != 0;
#else
	return false;
#endif
}

static size_t
os_page_detect(void) {
#ifdef _WIN32
	SYSTEM_INFO si;
	GetSystemInfo(&si);
	return si.dwPageSize;
#elif defined(__FreeBSD__)
	/*
	 * This returns the value obtained from
	 * the auxv vector, avoiding a syscall.
	 */
	return getpagesize();
#else
	long result = sysconf(_SC_PAGESIZE);
	if (result == -1) {
		return LG_PAGE;
	}
	return (size_t)result;
#endif
}

#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
static bool
os_overcommits_sysctl(void) {
	int vm_overcommit;
	size_t sz;
	int bsdflags;

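	/*
	 * Fast path: newer FreeBSD kernels expose the overcommit setting via
	 * the AT_BSDFLAGS auxv entry, avoiding a sysctl during bootstrap.
	 */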
	if (_elf_aux_info(AT_BSDFLAGS, &bsdflags, sizeof(bsdflags)) == 0)
		return ((bsdflags & ELF_BSDF_VMNOOVERCOMMIT) == 0);

	sz = sizeof(vm_overcommit);
#if defined(__FreeBSD__) && defined(VM_OVERCOMMIT)
	int mib[2];

	mib[0] = CTL_VM;
	mib[1] = VM_OVERCOMMIT;
	if (sysctl(mib, 2, &vm_overcommit, &sz, NULL, 0) != 0) {
		return false; /* Error. */
	}
#else
	if (sysctlbyname("vm.overcommit", &vm_overcommit, &sz, NULL, 0) != 0) {
		return false; /* Error. */
	}
#endif

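	/*
	 * Overcommit is in effect only when neither swap-reserve enforcement
	 * mode (forced or rlimit-based) is enabled.
	 */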
	return ((vm_overcommit & (SWAP_RESERVE_FORCE_ON |
	    SWAP_RESERVE_RLIMIT_ON)) == 0);
}
#endif

#ifdef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY
/*
 * Use syscall(2) rather than {open,read,close}(2) when possible to avoid
 * reentry during bootstrapping if another library has interposed system call
 * wrappers.
 */
static bool
os_overcommits_proc(void) {
	int fd;
	char buf[1];

#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open)
	#if defined(O_CLOEXEC)
		fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY |
			O_CLOEXEC);
	#else
		fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY);
		if (fd != -1) {
			fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
		}
	#endif
#elif defined(JEMALLOC_USE_SYSCALL) && defined(SYS_openat)
	#if defined(O_CLOEXEC)
		fd = (int)syscall(SYS_openat,
			AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC);
	#else
		fd = (int)syscall(SYS_openat,
			AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY);
		if (fd != -1) {
			fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
		}
	#endif
#else
	#if defined(O_CLOEXEC)
		fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC);
	#else
		fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY);
		if (fd != -1) {
			fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
		}
	#endif
#endif

	if (fd == -1) {
		return false; /* Error. */
	}

	ssize_t nread = malloc_read_fd(fd, &buf, sizeof(buf));
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close)
	syscall(SYS_close, fd);
#else
	close(fd);
#endif

	if (nread < 1) {
		return false; /* Error. */
	}
	/*
	 * /proc/sys/vm/overcommit_memory meanings:
	 * 0: Heuristic overcommit.
	 * 1: Always overcommit.
	 * 2: Never overcommit.
	 */
	return (buf[0] == '0' || buf[0] == '1');
}
#endif

void
pages_set_thp_state(void *ptr, size_t size) {
	if (opt_thp == thp_mode_default || opt_thp == init_system_thp_mode) {
		return;
	}
	assert(opt_thp != thp_mode_not_supported &&
	    init_system_thp_mode != thp_mode_not_supported);

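	/*
	 * Nudge the mapping from the system-wide default toward the requested
	 * mode; only the always/never overrides need an explicit madvise().
	 */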
	if (opt_thp == thp_mode_always
	    && init_system_thp_mode != thp_mode_never) {
		assert(init_system_thp_mode == thp_mode_default);
		pages_huge_unaligned(ptr, size);
	} else if (opt_thp == thp_mode_never) {
		assert(init_system_thp_mode == thp_mode_default ||
		    init_system_thp_mode == thp_mode_always);
		pages_nohuge_unaligned(ptr, size);
	}
}

static void
init_thp_state(void) {
	if (!have_madvise_huge) {
		if (metadata_thp_enabled() && opt_abort) {
			malloc_write("<jemalloc>: no MADV_HUGEPAGE support\n");
			abort();
		}
		goto label_error;
	}

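	/*
	 * The kernel reports the active THP mode by bracketing it, e.g.
	 * "always [madvise] never", so compare the whole line against each
	 * template.
	 */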
	static const char sys_state_madvise[] = "always [madvise] never\n";
	static const char sys_state_always[] = "[always] madvise never\n";
	static const char sys_state_never[] = "always madvise [never]\n";
	char buf[sizeof(sys_state_madvise)];

#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open)
	int fd = (int)syscall(SYS_open,
	    "/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY);
#else
	int fd = open("/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY);
#endif
	if (fd == -1) {
		goto label_error;
	}

	ssize_t nread = malloc_read_fd(fd, &buf, sizeof(buf));
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close)
	syscall(SYS_close, fd);
#else
	close(fd);
#endif

	if (nread < 0) {
		goto label_error;
	}

	if (strncmp(buf, sys_state_madvise, (size_t)nread) == 0) {
		init_system_thp_mode = thp_mode_default;
	} else if (strncmp(buf, sys_state_always, (size_t)nread) == 0) {
		init_system_thp_mode = thp_mode_always;
	} else if (strncmp(buf, sys_state_never, (size_t)nread) == 0) {
		init_system_thp_mode = thp_mode_never;
	} else {
		goto label_error;
	}
	return;
label_error:
	opt_thp = init_system_thp_mode = thp_mode_not_supported;
}

bool
pages_boot(void) {
	os_page = os_page_detect();
	if (os_page > PAGE) {
		malloc_write("<jemalloc>: Unsupported system page size\n");
		if (opt_abort) {
			abort();
		}
		return true;
	}

#ifndef _WIN32
	mmap_flags = MAP_PRIVATE | MAP_ANON;
#endif

#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
	os_overcommits = os_overcommits_sysctl();
#elif defined(JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY)
	os_overcommits = os_overcommits_proc();
#  ifdef MAP_NORESERVE
	if (os_overcommits) {
		mmap_flags |= MAP_NORESERVE;
	}
#  endif
#else
	os_overcommits = false;
#endif

	init_thp_state();

#ifdef __FreeBSD__
	/*
	 * FreeBSD doesn't need the check; madvise(2) is known to work.
	 */
#else
	/* Detect lazy purge runtime support. */
	if (pages_can_purge_lazy) {
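		/*
		 * Probe with a scratch page: if the madvise() call is rejected
		 * by the running kernel, disable lazy purge at runtime even
		 * though it was detected at build time.
		 */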
		bool committed = false;
		void *madv_free_page = os_pages_map(NULL, PAGE, PAGE, &committed);
		if (madv_free_page == NULL) {
			return true;
		}
		assert(pages_can_purge_lazy_runtime);
		if (pages_purge_lazy(madv_free_page, PAGE)) {
			pages_can_purge_lazy_runtime = false;
		}
		os_pages_unmap(madv_free_page, PAGE);
	}
#endif

	return false;
}
657