xref: /freebsd/contrib/jemalloc/src/hpa.c (revision c43cad87172039ccf38172129c79755ea79e6102)
1 #include "jemalloc/internal/jemalloc_preamble.h"
2 #include "jemalloc/internal/jemalloc_internal_includes.h"
3 
4 #include "jemalloc/internal/hpa.h"
5 
6 #include "jemalloc/internal/fb.h"
7 #include "jemalloc/internal/witness.h"
8 
9 #define HPA_EDEN_SIZE (128 * HUGEPAGE)
10 
11 static edata_t *hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size,
12     size_t alignment, bool zero, bool guarded, bool frequent_reuse,
13     bool *deferred_work_generated);
14 static size_t hpa_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size,
15     size_t nallocs, edata_list_active_t *results, bool *deferred_work_generated);
16 static bool hpa_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata,
17     size_t old_size, size_t new_size, bool zero, bool *deferred_work_generated);
18 static bool hpa_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
19     size_t old_size, size_t new_size, bool *deferred_work_generated);
20 static void hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata,
21     bool *deferred_work_generated);
22 static void hpa_dalloc_batch(tsdn_t *tsdn, pai_t *self,
23     edata_list_active_t *list, bool *deferred_work_generated);
24 static uint64_t hpa_time_until_deferred_work(tsdn_t *tsdn, pai_t *self);
25 
26 bool
27 hpa_supported() {
28 #ifdef _WIN32
29 	/*
30 	 * At least until the API and implementation are somewhat settled, we
31 	 * don't want to try to debug the VM subsystem on the hardest-to-test
32 	 * platform.
33 	 */
34 	return false;
35 #endif
36 	if (!pages_can_hugify) {
37 		return false;
38 	}
39 	/*
40 	 * We fundamentally rely on an address-space-hungry growth strategy for
41 	 * hugepages.
42 	 */
43 	if (LG_SIZEOF_PTR != 3) {
44 		return false;
45 	}
46 	/*
47 	 * If we couldn't detect the value of HUGEPAGE, HUGEPAGE_PAGES becomes
48 	 * this sentinel value -- see the comment in pages.h.
49 	 */
50 	if (HUGEPAGE_PAGES == 1) {
51 		return false;
52 	}
53 	return true;
54 }
55 
56 static void
57 hpa_do_consistency_checks(hpa_shard_t *shard) {
58 	assert(shard->base != NULL);
59 }
60 
61 bool
62 hpa_central_init(hpa_central_t *central, base_t *base, const hpa_hooks_t *hooks) {
63 	/* malloc_conf processing should have filtered out these cases. */
64 	assert(hpa_supported());
65 	bool err;
66 	err = malloc_mutex_init(&central->grow_mtx, "hpa_central_grow",
67 	    WITNESS_RANK_HPA_CENTRAL_GROW, malloc_mutex_rank_exclusive);
68 	if (err) {
69 		return true;
70 	}
71 	err = malloc_mutex_init(&central->mtx, "hpa_central",
72 	    WITNESS_RANK_HPA_CENTRAL, malloc_mutex_rank_exclusive);
73 	if (err) {
74 		return true;
75 	}
76 	central->base = base;
77 	central->eden = NULL;
78 	central->eden_len = 0;
79 	central->age_counter = 0;
80 	central->hooks = *hooks;
81 	return false;
82 }
83 
84 static hpdata_t *
85 hpa_alloc_ps(tsdn_t *tsdn, hpa_central_t *central) {
86 	return (hpdata_t *)base_alloc(tsdn, central->base, sizeof(hpdata_t),
87 	    CACHELINE);
88 }
89 
90 hpdata_t *
91 hpa_central_extract(tsdn_t *tsdn, hpa_central_t *central, size_t size,
92     bool *oom) {
93 	/* Don't yet support big allocations; these should get filtered out. */
94 	assert(size <= HUGEPAGE);
95 	/*
96 	 * Should only try to extract from the central allocator if the local
97 	 * shard is exhausted.  We should hold the grow_mtx on that shard.
98 	 */
99 	witness_assert_positive_depth_to_rank(
100 	    tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_HPA_SHARD_GROW);
101 
102 	malloc_mutex_lock(tsdn, &central->grow_mtx);
103 	*oom = false;
104 
105 	hpdata_t *ps = NULL;
106 
107 	/* Is eden a perfect fit? */
108 	if (central->eden != NULL && central->eden_len == HUGEPAGE) {
109 		ps = hpa_alloc_ps(tsdn, central);
110 		if (ps == NULL) {
111 			*oom = true;
112 			malloc_mutex_unlock(tsdn, &central->grow_mtx);
113 			return NULL;
114 		}
115 		hpdata_init(ps, central->eden, central->age_counter++);
116 		central->eden = NULL;
117 		central->eden_len = 0;
118 		malloc_mutex_unlock(tsdn, &central->grow_mtx);
119 		return ps;
120 	}
121 
122 	/*
123 	 * We're about to try to allocate from eden by splitting.  If eden is
124 	 * NULL, we have to allocate it too.  Otherwise, we just have to
125 	 * allocate an hpdata_t for the new pageslab.
126 	 */
127 	if (central->eden == NULL) {
128 		/*
129 		 * During development, we're primarily concerned with systems
130 		 * with overcommit.  Eventually, we should be more careful here.
131 		 */
132 		bool commit = true;
133 		/* Allocate address space, bailing if we fail. */
134 		void *new_eden = pages_map(NULL, HPA_EDEN_SIZE, HUGEPAGE,
135 		    &commit);
136 		if (new_eden == NULL) {
137 			*oom = true;
138 			malloc_mutex_unlock(tsdn, &central->grow_mtx);
139 			return NULL;
140 		}
141 		ps = hpa_alloc_ps(tsdn, central);
142 		if (ps == NULL) {
143 			pages_unmap(new_eden, HPA_EDEN_SIZE);
144 			*oom = true;
145 			malloc_mutex_unlock(tsdn, &central->grow_mtx);
146 			return NULL;
147 		}
148 		central->eden = new_eden;
149 		central->eden_len = HPA_EDEN_SIZE;
150 	} else {
151 		/* Eden is already nonempty; only need an hpdata_t for ps. */
152 		ps = hpa_alloc_ps(tsdn, central);
153 		if (ps == NULL) {
154 			*oom = true;
155 			malloc_mutex_unlock(tsdn, &central->grow_mtx);
156 			return NULL;
157 		}
158 	}
159 	assert(ps != NULL);
160 	assert(central->eden != NULL);
161 	assert(central->eden_len > HUGEPAGE);
162 	assert(central->eden_len % HUGEPAGE == 0);
163 	assert(HUGEPAGE_ADDR2BASE(central->eden) == central->eden);
164 
165 	hpdata_init(ps, central->eden, central->age_counter++);
166 
167 	char *eden_char = (char *)central->eden;
168 	eden_char += HUGEPAGE;
169 	central->eden = (void *)eden_char;
170 	central->eden_len -= HUGEPAGE;
171 
172 	malloc_mutex_unlock(tsdn, &central->grow_mtx);
173 
174 	return ps;
175 }
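
/*
 * Illustrative arithmetic for the eden-carving above (hugepage size is
 * platform dependent; 2 MiB is assumed here purely as an example): a fresh
 * eden mapping is HPA_EDEN_SIZE == 128 * HUGEPAGE == 256 MiB.  Each extract
 * peels one hugepage off the front, so eden_len shrinks 256 MiB -> 254 MiB ->
 * ... -> 2 MiB, at which point the "perfect fit" fast path consumes the last
 * hugepage and resets eden to NULL, triggering a new pages_map on the next
 * extraction.
 */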
176 
177 bool
178 hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap,
179     base_t *base, edata_cache_t *edata_cache, unsigned ind,
180     const hpa_shard_opts_t *opts) {
181 	/* malloc_conf processing should have filtered out these cases. */
182 	assert(hpa_supported());
183 	bool err;
184 	err = malloc_mutex_init(&shard->grow_mtx, "hpa_shard_grow",
185 	    WITNESS_RANK_HPA_SHARD_GROW, malloc_mutex_rank_exclusive);
186 	if (err) {
187 		return true;
188 	}
189 	err = malloc_mutex_init(&shard->mtx, "hpa_shard",
190 	    WITNESS_RANK_HPA_SHARD, malloc_mutex_rank_exclusive);
191 	if (err) {
192 		return true;
193 	}
194 
195 	assert(edata_cache != NULL);
196 	shard->central = central;
197 	shard->base = base;
198 	edata_cache_fast_init(&shard->ecf, edata_cache);
199 	psset_init(&shard->psset);
200 	shard->age_counter = 0;
201 	shard->ind = ind;
202 	shard->emap = emap;
203 
204 	shard->opts = *opts;
205 
206 	shard->npending_purge = 0;
207 	nstime_init_zero(&shard->last_purge);
208 
209 	shard->stats.npurge_passes = 0;
210 	shard->stats.npurges = 0;
211 	shard->stats.nhugifies = 0;
212 	shard->stats.ndehugifies = 0;
213 
214 	/*
215 	 * Fill these in last, so that if an hpa_shard gets used despite
216 	 * initialization failing, we'll at least crash instead of just
217 	 * operating on corrupted data.
218 	 */
219 	shard->pai.alloc = &hpa_alloc;
220 	shard->pai.alloc_batch = &hpa_alloc_batch;
221 	shard->pai.expand = &hpa_expand;
222 	shard->pai.shrink = &hpa_shrink;
223 	shard->pai.dalloc = &hpa_dalloc;
224 	shard->pai.dalloc_batch = &hpa_dalloc_batch;
225 	shard->pai.time_until_deferred_work = &hpa_time_until_deferred_work;
226 
227 	hpa_do_consistency_checks(shard);
228 
229 	return false;
230 }
231 
232 /*
233  * Note that the stats functions here follow the usual stats naming conventions;
234  * "merge" obtains the stats from some live object instance, while "accum"
235  * only combines the stats from one stats object into another.  Hence the lack of
236  * locking here.
237  */
238 static void
239 hpa_shard_nonderived_stats_accum(hpa_shard_nonderived_stats_t *dst,
240     hpa_shard_nonderived_stats_t *src) {
241 	dst->npurge_passes += src->npurge_passes;
242 	dst->npurges += src->npurges;
243 	dst->nhugifies += src->nhugifies;
244 	dst->ndehugifies += src->ndehugifies;
245 }
246 
247 void
248 hpa_shard_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src) {
249 	psset_stats_accum(&dst->psset_stats, &src->psset_stats);
250 	hpa_shard_nonderived_stats_accum(&dst->nonderived_stats,
251 	    &src->nonderived_stats);
252 }
253 
254 void
255 hpa_shard_stats_merge(tsdn_t *tsdn, hpa_shard_t *shard,
256     hpa_shard_stats_t *dst) {
257 	hpa_do_consistency_checks(shard);
258 
259 	malloc_mutex_lock(tsdn, &shard->grow_mtx);
260 	malloc_mutex_lock(tsdn, &shard->mtx);
261 	psset_stats_accum(&dst->psset_stats, &shard->psset.stats);
262 	hpa_shard_nonderived_stats_accum(&dst->nonderived_stats, &shard->stats);
263 	malloc_mutex_unlock(tsdn, &shard->mtx);
264 	malloc_mutex_unlock(tsdn, &shard->grow_mtx);
265 }
266 
267 static bool
268 hpa_good_hugification_candidate(hpa_shard_t *shard, hpdata_t *ps) {
269 	/*
270 	 * Note that this needs to be >= rather than just >, because of the
271 	 * important special case in which the hugification threshold is exactly
272 	 * HUGEPAGE.
273 	 */
274 	return hpdata_nactive_get(ps) * PAGE
275 	    >= shard->opts.hugification_threshold;
276 }
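
/*
 * Worked example (hypothetical sizes: 4 KiB pages, 2 MiB hugepages): with a
 * hugification threshold of 0.95 * HUGEPAGE == 1992294.4 bytes, a pageslab
 * needs ceil(1992294.4 / 4096) == 487 active pages to qualify, while
 * 486 * 4096 == 1990656 falls just short.  With the threshold set to exactly
 * HUGEPAGE, all 512 pages must be active, which is why the comparison above
 * is >= rather than >.
 */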
277 
278 static size_t
279 hpa_adjusted_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) {
280 	malloc_mutex_assert_owner(tsdn, &shard->mtx);
281 	return psset_ndirty(&shard->psset) - shard->npending_purge;
282 }
283 
284 static size_t
285 hpa_ndirty_max(tsdn_t *tsdn, hpa_shard_t *shard) {
286 	malloc_mutex_assert_owner(tsdn, &shard->mtx);
287 	if (shard->opts.dirty_mult == (fxp_t)-1) {
288 		return (size_t)-1;
289 	}
290 	return fxp_mul_frac(psset_nactive(&shard->psset),
291 	    shard->opts.dirty_mult);
292 }
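
/*
 * Worked example (hypothetical numbers): if dirty_mult encodes the fraction
 * 0.25 and the psset has 1000 active pages, fxp_mul_frac(1000, dirty_mult)
 * == 250, so purging becomes eligible once the adjusted dirty count exceeds
 * 250 pages.  A dirty_mult of (fxp_t)-1 is the sentinel for "no limit" and
 * disables the cap entirely.
 */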
293 
294 static bool
295 hpa_hugify_blocked_by_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) {
296 	malloc_mutex_assert_owner(tsdn, &shard->mtx);
297 	hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
298 	if (to_hugify == NULL) {
299 		return false;
300 	}
301 	return hpa_adjusted_ndirty(tsdn, shard)
302 	    + hpdata_nretained_get(to_hugify) > hpa_ndirty_max(tsdn, shard);
303 }
304 
305 static bool
306 hpa_should_purge(tsdn_t *tsdn, hpa_shard_t *shard) {
307 	malloc_mutex_assert_owner(tsdn, &shard->mtx);
308 	if (hpa_adjusted_ndirty(tsdn, shard) > hpa_ndirty_max(tsdn, shard)) {
309 		return true;
310 	}
311 	if (hpa_hugify_blocked_by_ndirty(tsdn, shard)) {
312 		return true;
313 	}
314 	return false;
315 }
316 
317 static void
318 hpa_update_purge_hugify_eligibility(tsdn_t *tsdn, hpa_shard_t *shard,
319     hpdata_t *ps) {
320 	malloc_mutex_assert_owner(tsdn, &shard->mtx);
321 	if (hpdata_changing_state_get(ps)) {
322 		hpdata_purge_allowed_set(ps, false);
323 		hpdata_disallow_hugify(ps);
324 		return;
325 	}
326 	/*
327 	 * Hugepages are distinctly costly to purge, so try to avoid it unless
328 	 * they're *particularly* full of dirty pages.  Eventually, we should
329 	 * use a smarter / more dynamic heuristic for situations where we have
330 	 * to manually hugify.
331 	 *
332 	 * In situations where we don't manually hugify, this problem is
333 	 * reduced.  The "bad" situation we're trying to avoid is one that's
334 	 * common in some Linux configurations (where both enabled and defrag
335 	 * are set to madvise) that can lead to long latency spikes on the first
336 	 * access after a hugification.  The ideal policy in such configurations
337 	 * is probably time-based for both purging and hugifying; only hugify a
338 	 * hugepage if it's met the criteria for some extended period of time,
339 	 * and only dehugify it if it's failed to meet the criteria for an
340 	 * extended period of time.  When background threads are on, we should
341 	 * try to take this hit on one of them, as well.
342 	 *
343 	 * I think the ideal setting is THP always enabled, and defrag set to
344 	 * deferred; in that case we don't need any explicit calls on the
345 	 * allocator's end at all; we just try to pack allocations in a
346 	 * hugepage-friendly manner and let the OS hugify in the background.
347 	 */
348 	hpdata_purge_allowed_set(ps, hpdata_ndirty_get(ps) > 0);
349 	if (hpa_good_hugification_candidate(shard, ps)
350 	    && !hpdata_huge_get(ps)) {
351 		nstime_t now;
352 		shard->central->hooks.curtime(&now, /* first_reading */ true);
353 		hpdata_allow_hugify(ps, now);
354 	}
355 	/*
356 	 * Once a hugepage has become eligible for hugification, we don't mark
357 	 * it as ineligible just because it stops meeting the criteria (this
358 	 * could lead to situations where a hugepage that spends most of its
359 	 * time meeting the criteria never quite gets hugified if there are
360 	 * intervening deallocations).  The idea is that the hugification delay
361 	 * will allow them to get purged, resetting their "hugify-allowed" bit.
362 	 * If they don't get purged, then the hugification isn't hurting and
363 	 * might help.  As an exception, we don't hugify hugepages that are now
364 	 * empty; it definitely doesn't help there until the hugepage gets
365 	 * reused, which is likely not for a while.
366 	 */
367 	if (hpdata_nactive_get(ps) == 0) {
368 		hpdata_disallow_hugify(ps);
369 	}
370 }
371 
372 static bool
373 hpa_shard_has_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) {
374 	malloc_mutex_assert_owner(tsdn, &shard->mtx);
375 	hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
376 	return to_hugify != NULL || hpa_should_purge(tsdn, shard);
377 }
378 
379 /* Returns whether or not we purged anything. */
380 static bool
381 hpa_try_purge(tsdn_t *tsdn, hpa_shard_t *shard) {
382 	malloc_mutex_assert_owner(tsdn, &shard->mtx);
383 
384 	hpdata_t *to_purge = psset_pick_purge(&shard->psset);
385 	if (to_purge == NULL) {
386 		return false;
387 	}
388 	assert(hpdata_purge_allowed_get(to_purge));
389 	assert(!hpdata_changing_state_get(to_purge));
390 
391 	/*
392 	 * Don't let anyone else purge or hugify this page while
393 	 * we're purging it (allocations and deallocations are
394 	 * OK).
395 	 */
396 	psset_update_begin(&shard->psset, to_purge);
397 	assert(hpdata_alloc_allowed_get(to_purge));
398 	hpdata_mid_purge_set(to_purge, true);
399 	hpdata_purge_allowed_set(to_purge, false);
400 	hpdata_disallow_hugify(to_purge);
401 	/*
402 	 * Unlike with hugification (where concurrent
403 	 * allocations are allowed), concurrent allocation out
404 	 * of a hugepage being purged is unsafe; we might hand
405 	 * out an extent for an allocation and then purge it
406 	 * (clearing out user data).
407 	 */
408 	hpdata_alloc_allowed_set(to_purge, false);
409 	psset_update_end(&shard->psset, to_purge);
410 
411 	/* Gather all the metadata we'll need during the purge. */
412 	bool dehugify = hpdata_huge_get(to_purge);
413 	hpdata_purge_state_t purge_state;
414 	size_t num_to_purge = hpdata_purge_begin(to_purge, &purge_state);
415 
416 	shard->npending_purge += num_to_purge;
417 
418 	malloc_mutex_unlock(tsdn, &shard->mtx);
419 
420 	/* Actually do the purging, now that the lock is dropped. */
421 	if (dehugify) {
422 		shard->central->hooks.dehugify(hpdata_addr_get(to_purge),
423 		    HUGEPAGE);
424 	}
425 	size_t total_purged = 0;
426 	uint64_t purges_this_pass = 0;
427 	void *purge_addr;
428 	size_t purge_size;
429 	while (hpdata_purge_next(to_purge, &purge_state, &purge_addr,
430 	    &purge_size)) {
431 		total_purged += purge_size;
432 		assert(total_purged <= HUGEPAGE);
433 		purges_this_pass++;
434 		shard->central->hooks.purge(purge_addr, purge_size);
435 	}
436 
437 	malloc_mutex_lock(tsdn, &shard->mtx);
438 	/* The shard updates. */
439 	shard->npending_purge -= num_to_purge;
440 	shard->stats.npurge_passes++;
441 	shard->stats.npurges += purges_this_pass;
442 	shard->central->hooks.curtime(&shard->last_purge,
443 	    /* first_reading */ false);
444 	if (dehugify) {
445 		shard->stats.ndehugifies++;
446 	}
447 
448 	/* The hpdata updates. */
449 	psset_update_begin(&shard->psset, to_purge);
450 	if (dehugify) {
451 		hpdata_dehugify(to_purge);
452 	}
453 	hpdata_purge_end(to_purge, &purge_state);
454 	hpdata_mid_purge_set(to_purge, false);
455 
456 	hpdata_alloc_allowed_set(to_purge, true);
457 	hpa_update_purge_hugify_eligibility(tsdn, shard, to_purge);
458 
459 	psset_update_end(&shard->psset, to_purge);
460 
461 	return true;
462 }
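
/*
 * Summary of the purge protocol above, for reference: (1) under the shard
 * mutex, mark the chosen pageslab mid-purge and disallow allocation out of
 * it; (2) with the mutex dropped, issue the (possibly slow) dehugify/purge
 * hooks; (3) retake the mutex and publish the stats and hpdata state
 * transitions.  npending_purge bridges the unlocked window so that concurrent
 * callers of hpa_adjusted_ndirty() don't treat pages whose purge is already
 * in flight as still needing to be purged.
 */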
463 
464 /* Returns whether or not we hugified anything. */
465 static bool
466 hpa_try_hugify(tsdn_t *tsdn, hpa_shard_t *shard) {
467 	malloc_mutex_assert_owner(tsdn, &shard->mtx);
468 
469 	if (hpa_hugify_blocked_by_ndirty(tsdn, shard)) {
470 		return false;
471 	}
472 
473 	hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
474 	if (to_hugify == NULL) {
475 		return false;
476 	}
477 	assert(hpdata_hugify_allowed_get(to_hugify));
478 	assert(!hpdata_changing_state_get(to_hugify));
479 
480 	/* Make sure that it's been hugifiable for long enough. */
481 	nstime_t time_hugify_allowed = hpdata_time_hugify_allowed(to_hugify);
482 	uint64_t millis = shard->central->hooks.ms_since(&time_hugify_allowed);
483 	if (millis < shard->opts.hugify_delay_ms) {
484 		return false;
485 	}
486 
487 	/*
488 	 * Don't let anyone else purge or hugify this page while
489 	 * we're hugifying it (allocations and deallocations are
490 	 * OK).
491 	 */
492 	psset_update_begin(&shard->psset, to_hugify);
493 	hpdata_mid_hugify_set(to_hugify, true);
494 	hpdata_purge_allowed_set(to_hugify, false);
495 	hpdata_disallow_hugify(to_hugify);
496 	assert(hpdata_alloc_allowed_get(to_hugify));
497 	psset_update_end(&shard->psset, to_hugify);
498 
499 	malloc_mutex_unlock(tsdn, &shard->mtx);
500 
501 	shard->central->hooks.hugify(hpdata_addr_get(to_hugify), HUGEPAGE);
502 
503 	malloc_mutex_lock(tsdn, &shard->mtx);
504 	shard->stats.nhugifies++;
505 
506 	psset_update_begin(&shard->psset, to_hugify);
507 	hpdata_hugify(to_hugify);
508 	hpdata_mid_hugify_set(to_hugify, false);
509 	hpa_update_purge_hugify_eligibility(tsdn, shard, to_hugify);
510 	psset_update_end(&shard->psset, to_hugify);
511 
512 	return true;
513 }
514 
515 /*
516  * Execution of deferred work is forced if it's triggered by an explicit
517  * hpa_shard_do_deferred_work() call.
518  */
519 static void
520 hpa_shard_maybe_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard,
521     bool forced) {
522 	malloc_mutex_assert_owner(tsdn, &shard->mtx);
523 	if (!forced && shard->opts.deferral_allowed) {
524 		return;
525 	}
526 	/*
527 	 * If we're on a background thread, do work so long as there's work to
528 	 * be done.  Otherwise, bound latency to not be *too* bad by doing at
529 	 * most a small fixed number of operations.
530 	 */
531 	bool hugified = false;
532 	bool purged = false;
533 	size_t max_ops = (forced ? (size_t)-1 : 16);
534 	size_t nops = 0;
535 	do {
536 		/*
537 		 * Always purge before hugifying, to make sure we get some
538 		 * ability to hit our quiescence targets.
539 		 */
540 		purged = false;
541 		while (hpa_should_purge(tsdn, shard) && nops < max_ops) {
542 			purged = hpa_try_purge(tsdn, shard);
543 			if (purged) {
544 				nops++;
545 			}
546 		}
547 		hugified = hpa_try_hugify(tsdn, shard);
548 		if (hugified) {
549 			nops++;
550 		}
551 		malloc_mutex_assert_owner(tsdn, &shard->mtx);
553 	} while ((hugified || purged) && nops < max_ops);
554 }
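
/*
 * Example of the bound above (numbers from the code, scenario hypothetical):
 * an unforced call from the allocation/deallocation path performs at most 16
 * purge/hugify operations before returning, keeping tail latency bounded; a
 * forced call (from hpa_shard_do_deferred_work() or when deferral is being
 * turned off) keeps going until there is no work left.
 */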
555 
556 static edata_t *
557 hpa_try_alloc_one_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
558     bool *oom) {
559 	bool err;
560 	edata_t *edata = edata_cache_fast_get(tsdn, &shard->ecf);
561 	if (edata == NULL) {
562 		*oom = true;
563 		return NULL;
564 	}
565 
566 	hpdata_t *ps = psset_pick_alloc(&shard->psset, size);
567 	if (ps == NULL) {
568 		edata_cache_fast_put(tsdn, &shard->ecf, edata);
569 		return NULL;
570 	}
571 
572 	psset_update_begin(&shard->psset, ps);
573 
574 	if (hpdata_empty(ps)) {
575 		/*
576 		 * If the pageslab used to be empty, treat it as though it's
577 		 * brand new for fragmentation-avoidance purposes; what we're
578 		 * trying to approximate is the age of the allocations *in* that
579 		 * pageslab, and the allocations in the new pageslab are
580 		 * definitionally the youngest in this hpa shard.
581 		 */
582 		hpdata_age_set(ps, shard->age_counter++);
583 	}
584 
585 	void *addr = hpdata_reserve_alloc(ps, size);
586 	edata_init(edata, shard->ind, addr, size, /* slab */ false,
587 	    SC_NSIZES, /* sn */ hpdata_age_get(ps), extent_state_active,
588 	    /* zeroed */ false, /* committed */ true, EXTENT_PAI_HPA,
589 	    EXTENT_NOT_HEAD);
590 	edata_ps_set(edata, ps);
591 
592 	/*
593 	 * This could theoretically be moved outside of the critical section,
594 	 * but that introduces the potential for a race.  Without the lock, the
595 	 * (initially nonempty, since this is the reuse pathway) pageslab we
596 	 * allocated out of could become otherwise empty while the lock is
597 	 * dropped.  This would force us to deal with a pageslab eviction down
598 	 * the error pathway, which is a pain.
599 	 */
600 	err = emap_register_boundary(tsdn, shard->emap, edata,
601 	    SC_NSIZES, /* slab */ false);
602 	if (err) {
603 		hpdata_unreserve(ps, edata_addr_get(edata),
604 		    edata_size_get(edata));
605 		/*
606 		 * We should arguably reset dirty state here, but this would
607 		 * require some sort of prepare + commit functionality that's a
608 		 * little much to deal with for now.
609 		 *
610 		 * We don't have a do_deferred_work down this pathway, on the
611 		 * principle that we didn't *really* affect shard state (we
612 		 * tweaked the stats, but our tweaks weren't really accurate).
613 		 */
614 		psset_update_end(&shard->psset, ps);
615 		edata_cache_fast_put(tsdn, &shard->ecf, edata);
616 		*oom = true;
617 		return NULL;
618 	}
619 
620 	hpa_update_purge_hugify_eligibility(tsdn, shard, ps);
621 	psset_update_end(&shard->psset, ps);
622 	return edata;
623 }
624 
625 static size_t
626 hpa_try_alloc_batch_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
627     bool *oom, size_t nallocs, edata_list_active_t *results,
628     bool *deferred_work_generated) {
629 	malloc_mutex_lock(tsdn, &shard->mtx);
630 	size_t nsuccess = 0;
631 	for (; nsuccess < nallocs; nsuccess++) {
632 		edata_t *edata = hpa_try_alloc_one_no_grow(tsdn, shard, size,
633 		    oom);
634 		if (edata == NULL) {
635 			break;
636 		}
637 		edata_list_active_append(results, edata);
638 	}
639 
640 	hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ false);
641 	*deferred_work_generated = hpa_shard_has_deferred_work(tsdn, shard);
642 	malloc_mutex_unlock(tsdn, &shard->mtx);
643 	return nsuccess;
644 }
645 
646 static size_t
647 hpa_alloc_batch_psset(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
648     size_t nallocs, edata_list_active_t *results,
649     bool *deferred_work_generated) {
650 	assert(size <= shard->opts.slab_max_alloc);
651 	bool oom = false;
652 
653 	size_t nsuccess = hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
654 	    nallocs, results, deferred_work_generated);
655 
656 	if (nsuccess == nallocs || oom) {
657 		return nsuccess;
658 	}
659 
660 	/*
661 	 * We didn't OOM, but weren't able to fill everything requested of us;
662 	 * try to grow.
663 	 */
664 	malloc_mutex_lock(tsdn, &shard->grow_mtx);
665 	/*
666 	 * Check for grow races; maybe some earlier thread expanded the psset
667 	 * in between when we dropped the main mutex and grabbed the grow mutex.
668 	 */
669 	nsuccess += hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
670 	    nallocs - nsuccess, results, deferred_work_generated);
671 	if (nsuccess == nallocs || oom) {
672 		malloc_mutex_unlock(tsdn, &shard->grow_mtx);
673 		return nsuccess;
674 	}
675 
676 	/*
677 	 * Note that we don't hold shard->mtx here (while growing);
678 	 * deallocations (and allocations of smaller sizes) may still succeed
679 	 * while we're doing this potentially expensive system call.
680 	 */
681 	hpdata_t *ps = hpa_central_extract(tsdn, shard->central, size, &oom);
682 	if (ps == NULL) {
683 		malloc_mutex_unlock(tsdn, &shard->grow_mtx);
684 		return nsuccess;
685 	}
686 
687 	/*
688 	 * We got the pageslab; allocate from it.  This does an unlock followed
689 	 * by a lock on the same mutex, and holds the grow mutex while doing
690 	 * deferred work, but this is an uncommon path; the simplicity is worth
691 	 * it.
692 	 */
693 	malloc_mutex_lock(tsdn, &shard->mtx);
694 	psset_insert(&shard->psset, ps);
695 	malloc_mutex_unlock(tsdn, &shard->mtx);
696 
697 	nsuccess += hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
698 	    nallocs - nsuccess, results, deferred_work_generated);
699 	/*
700 	 * Drop grow_mtx before doing deferred work; other threads blocked on it
701 	 * should be allowed to proceed while we're working.
702 	 */
703 	malloc_mutex_unlock(tsdn, &shard->grow_mtx);
704 
705 	return nsuccess;
706 }
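
/*
 * Summary of the flow above, for reference: first attempt the allocation out
 * of the existing psset; if that comes up short without OOMing, take
 * grow_mtx, retry (another thread may have grown the psset in the interim),
 * and only then pull a fresh pageslab from hpa_central and retry once more.
 * grow_mtx serializes growth without blocking concurrent frees, which only
 * need the shard mutex.
 */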
707 
708 static hpa_shard_t *
709 hpa_from_pai(pai_t *self) {
710 	assert(self->alloc == &hpa_alloc);
711 	assert(self->expand == &hpa_expand);
712 	assert(self->shrink == &hpa_shrink);
713 	assert(self->dalloc == &hpa_dalloc);
714 	return (hpa_shard_t *)self;
715 }
716 
717 static size_t
718 hpa_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size, size_t nallocs,
719     edata_list_active_t *results, bool *deferred_work_generated) {
720 	assert(nallocs > 0);
721 	assert((size & PAGE_MASK) == 0);
722 	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
723 	    WITNESS_RANK_CORE, 0);
724 	hpa_shard_t *shard = hpa_from_pai(self);
725 
726 	if (size > shard->opts.slab_max_alloc) {
727 		return 0;
728 	}
729 
730 	size_t nsuccess = hpa_alloc_batch_psset(tsdn, shard, size, nallocs,
731 	    results, deferred_work_generated);
732 
733 	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
734 	    WITNESS_RANK_CORE, 0);
735 
736 	/*
737 	 * Guard the sanity checks with config_debug because the loop cannot be
738 	 * proven non-circular by the compiler, even if everything within the
739 	 * loop is optimized away.
740 	 */
741 	if (config_debug) {
742 		edata_t *edata;
743 		ql_foreach(edata, &results->head, ql_link_active) {
744 			emap_assert_mapped(tsdn, shard->emap, edata);
745 			assert(edata_pai_get(edata) == EXTENT_PAI_HPA);
746 			assert(edata_state_get(edata) == extent_state_active);
747 			assert(edata_arena_ind_get(edata) == shard->ind);
748 			assert(edata_szind_get_maybe_invalid(edata) ==
749 			    SC_NSIZES);
750 			assert(!edata_slab_get(edata));
751 			assert(edata_committed_get(edata));
752 			assert(edata_base_get(edata) == edata_addr_get(edata));
753 			assert(edata_base_get(edata) != NULL);
754 		}
755 	}
756 	return nsuccess;
757 }
758 
759 static edata_t *
760 hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero,
761     bool guarded, bool frequent_reuse, bool *deferred_work_generated) {
762 	assert((size & PAGE_MASK) == 0);
763 	assert(!guarded);
764 	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
765 	    WITNESS_RANK_CORE, 0);
766 
767 	/* We don't handle alignment or zeroing for now. */
768 	if (alignment > PAGE || zero) {
769 		return NULL;
770 	}
771 	/*
772 	 * An alloc with alignment == PAGE and zero == false is equivalent to a
773 	 * batch alloc of 1.  Just do that, so we can share code.
774 	 */
775 	edata_list_active_t results;
776 	edata_list_active_init(&results);
777 	size_t nallocs = hpa_alloc_batch(tsdn, self, size, /* nallocs */ 1,
778 	    &results, deferred_work_generated);
779 	assert(nallocs == 0 || nallocs == 1);
780 	edata_t *edata = edata_list_active_first(&results);
781 	return edata;
782 }
783 
784 static bool
785 hpa_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
786     size_t new_size, bool zero, bool *deferred_work_generated) {
787 	/* Expand not yet supported. */
788 	return true;
789 }
790 
791 static bool
792 hpa_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
793     size_t old_size, size_t new_size, bool *deferred_work_generated) {
794 	/* Shrink not yet supported. */
795 	return true;
796 }
797 
798 static void
799 hpa_dalloc_prepare_unlocked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) {
800 	malloc_mutex_assert_not_owner(tsdn, &shard->mtx);
801 
802 	assert(edata_pai_get(edata) == EXTENT_PAI_HPA);
803 	assert(edata_state_get(edata) == extent_state_active);
804 	assert(edata_arena_ind_get(edata) == shard->ind);
805 	assert(edata_szind_get_maybe_invalid(edata) == SC_NSIZES);
806 	assert(edata_committed_get(edata));
807 	assert(edata_base_get(edata) != NULL);
808 
809 	/*
810 	 * Another thread shouldn't be trying to touch the metadata of an
811 	 * allocation being freed.  The one exception is a merge attempt from a
812 	 * lower-addressed PAC extent; in this case we have a nominal race on
813 	 * the edata metadata bits, but in practice the fact that the PAI bits
814 	 * are different will prevent any further access.  The race is bad, but
815 	 * benign in practice, and the long term plan is to track enough state
816 	 * in the rtree to prevent these merge attempts in the first place.
817 	 */
818 	edata_addr_set(edata, edata_base_get(edata));
819 	edata_zeroed_set(edata, false);
820 	emap_deregister_boundary(tsdn, shard->emap, edata);
821 }
822 
823 static void
824 hpa_dalloc_locked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) {
825 	malloc_mutex_assert_owner(tsdn, &shard->mtx);
826 
827 	/*
828 	 * Release the metadata early, to avoid having to remember to do it
829 	 * while we're also doing tricky purging logic.  First, we need to grab
830 	 * a few bits of metadata from it.
831 	 *
832 	 * Note that the shard mutex protects ps's metadata too; it wouldn't be
833 	 * correct to try to read most information out of it without the lock.
834 	 */
835 	hpdata_t *ps = edata_ps_get(edata);
836 	/* Currently, all edatas come from pageslabs. */
837 	assert(ps != NULL);
838 	void *unreserve_addr = edata_addr_get(edata);
839 	size_t unreserve_size = edata_size_get(edata);
840 	edata_cache_fast_put(tsdn, &shard->ecf, edata);
841 
842 	psset_update_begin(&shard->psset, ps);
843 	hpdata_unreserve(ps, unreserve_addr, unreserve_size);
844 	hpa_update_purge_hugify_eligibility(tsdn, shard, ps);
845 	psset_update_end(&shard->psset, ps);
846 }
847 
848 static void
849 hpa_dalloc_batch(tsdn_t *tsdn, pai_t *self, edata_list_active_t *list,
850     bool *deferred_work_generated) {
851 	hpa_shard_t *shard = hpa_from_pai(self);
852 
853 	edata_t *edata;
854 	ql_foreach(edata, &list->head, ql_link_active) {
855 		hpa_dalloc_prepare_unlocked(tsdn, shard, edata);
856 	}
857 
858 	malloc_mutex_lock(tsdn, &shard->mtx);
859 	/* Now, remove from the list. */
860 	while ((edata = edata_list_active_first(list)) != NULL) {
861 		edata_list_active_remove(list, edata);
862 		hpa_dalloc_locked(tsdn, shard, edata);
863 	}
864 	hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ false);
865 	*deferred_work_generated =
866 	    hpa_shard_has_deferred_work(tsdn, shard);
867 
868 	malloc_mutex_unlock(tsdn, &shard->mtx);
869 }
870 
871 static void
872 hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata,
873     bool *deferred_work_generated) {
874 	assert(!edata_guarded_get(edata));
875 	/* Just a dalloc_batch of size 1; this lets us share logic. */
876 	edata_list_active_t dalloc_list;
877 	edata_list_active_init(&dalloc_list);
878 	edata_list_active_append(&dalloc_list, edata);
879 	hpa_dalloc_batch(tsdn, self, &dalloc_list, deferred_work_generated);
880 }
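
/*
 * Minimal usage sketch (illustrative only; excluded from the build): how a
 * caller drives a shard through its pai vtable, as the pa_shard code does.
 * "tsdn" and "shard" are assumed to be a valid thread state and an
 * initialized hpa_shard_t; the size is an arbitrary example.
 */
#if 0
static void
hpa_usage_sketch(tsdn_t *tsdn, hpa_shard_t *shard) {
	bool deferred = false;
	/* A page-aligned, page-multiple, non-zeroed, unguarded allocation. */
	edata_t *e = shard->pai.alloc(tsdn, &shard->pai, 2 * PAGE, PAGE,
	    /* zero */ false, /* guarded */ false, /* frequent_reuse */ false,
	    &deferred);
	if (e != NULL) {
		shard->pai.dalloc(tsdn, &shard->pai, e, &deferred);
	}
	/* "deferred" now reports whether background work was generated. */
}
#endif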
881 
882 /*
883  * Calculate time until either purging or hugification ought to happen.
884  * Called by background threads.
885  */
886 static uint64_t
887 hpa_time_until_deferred_work(tsdn_t *tsdn, pai_t *self) {
888 	hpa_shard_t *shard = hpa_from_pai(self);
889 	uint64_t time_ns = BACKGROUND_THREAD_DEFERRED_MAX;
890 
891 	malloc_mutex_lock(tsdn, &shard->mtx);
892 
893 	hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
894 	if (to_hugify != NULL) {
895 		nstime_t time_hugify_allowed =
896 		    hpdata_time_hugify_allowed(to_hugify);
897 		uint64_t since_hugify_allowed_ms =
898 		    shard->central->hooks.ms_since(&time_hugify_allowed);
899 		/*
900 		 * If not enough time has passed since hugification was allowed,
901 		 * sleep for the rest.
902 		 */
903 		if (since_hugify_allowed_ms < shard->opts.hugify_delay_ms) {
904 			time_ns = shard->opts.hugify_delay_ms -
905 			    since_hugify_allowed_ms;
906 			time_ns *= 1000 * 1000;
907 		} else {
908 			malloc_mutex_unlock(tsdn, &shard->mtx);
909 			return BACKGROUND_THREAD_DEFERRED_MIN;
910 		}
911 	}
912 
913 	if (hpa_should_purge(tsdn, shard)) {
914 		/*
915 		 * If we haven't purged before, no need to check interval
916 		 * between purges. Simply purge as soon as possible.
917 		 */
918 		if (shard->stats.npurge_passes == 0) {
919 			malloc_mutex_unlock(tsdn, &shard->mtx);
920 			return BACKGROUND_THREAD_DEFERRED_MIN;
921 		}
922 		uint64_t since_last_purge_ms = shard->central->hooks.ms_since(
923 		    &shard->last_purge);
924 
925 		if (since_last_purge_ms < shard->opts.min_purge_interval_ms) {
926 			uint64_t until_purge_ns;
927 			until_purge_ns = shard->opts.min_purge_interval_ms -
928 			    since_last_purge_ms;
929 			until_purge_ns *= 1000 * 1000;
930 
931 			if (until_purge_ns < time_ns) {
932 				time_ns = until_purge_ns;
933 			}
934 		} else {
935 			time_ns = BACKGROUND_THREAD_DEFERRED_MIN;
936 		}
937 	}
938 	malloc_mutex_unlock(tsdn, &shard->mtx);
939 	return time_ns;
940 }
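
/*
 * Worked example for the conversion above (hypothetical numbers): with
 * hugify_delay_ms == 10000 and 2500 ms elapsed since hugification became
 * allowed, the shard asks to be woken in (10000 - 2500) * 1000 * 1000 ==
 * 7.5e9 ns, unless an upcoming purge deadline is sooner.
 */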
941 
942 void
943 hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard) {
944 	hpa_do_consistency_checks(shard);
945 
946 	malloc_mutex_lock(tsdn, &shard->mtx);
947 	edata_cache_fast_disable(tsdn, &shard->ecf);
948 	malloc_mutex_unlock(tsdn, &shard->mtx);
949 }
950 
951 static void
952 hpa_shard_assert_stats_empty(psset_bin_stats_t *bin_stats) {
953 	assert(bin_stats->npageslabs == 0);
954 	assert(bin_stats->nactive == 0);
955 }
956 
957 static void
958 hpa_assert_empty(tsdn_t *tsdn, hpa_shard_t *shard, psset_t *psset) {
959 	malloc_mutex_assert_owner(tsdn, &shard->mtx);
960 	for (int huge = 0; huge <= 1; huge++) {
961 		hpa_shard_assert_stats_empty(&psset->stats.full_slabs[huge]);
962 		for (pszind_t i = 0; i < PSSET_NPSIZES; i++) {
963 			hpa_shard_assert_stats_empty(
964 			    &psset->stats.nonfull_slabs[i][huge]);
965 		}
966 	}
967 }
968 
969 void
970 hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard) {
971 	hpa_do_consistency_checks(shard);
972 	/*
973 	 * By the time we're here, the arena code should have dalloc'd all the
974 	 * active extents, which means we should have eventually evicted
975 	 * everything from the psset, so it shouldn't be able to serve even a
976 	 * 1-page allocation.
977 	 */
978 	if (config_debug) {
979 		malloc_mutex_lock(tsdn, &shard->mtx);
980 		hpa_assert_empty(tsdn, shard, &shard->psset);
981 		malloc_mutex_unlock(tsdn, &shard->mtx);
982 	}
983 	hpdata_t *ps;
984 	while ((ps = psset_pick_alloc(&shard->psset, PAGE)) != NULL) {
985 		/* There should be no allocations anywhere. */
986 		assert(hpdata_empty(ps));
987 		psset_remove(&shard->psset, ps);
988 		shard->central->hooks.unmap(hpdata_addr_get(ps), HUGEPAGE);
989 	}
990 }
991 
992 void
993 hpa_shard_set_deferral_allowed(tsdn_t *tsdn, hpa_shard_t *shard,
994     bool deferral_allowed) {
995 	hpa_do_consistency_checks(shard);
996 
997 	malloc_mutex_lock(tsdn, &shard->mtx);
998 	bool deferral_previously_allowed = shard->opts.deferral_allowed;
999 	shard->opts.deferral_allowed = deferral_allowed;
1000 	if (deferral_previously_allowed && !deferral_allowed) {
1001 		hpa_shard_maybe_do_deferred_work(tsdn, shard,
1002 		    /* forced */ true);
1003 	}
1004 	malloc_mutex_unlock(tsdn, &shard->mtx);
1005 }
1006 
1007 void
1008 hpa_shard_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) {
1009 	hpa_do_consistency_checks(shard);
1010 
1011 	malloc_mutex_lock(tsdn, &shard->mtx);
1012 	hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ true);
1013 	malloc_mutex_unlock(tsdn, &shard->mtx);
1014 }
1015 
1016 void
1017 hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard) {
1018 	hpa_do_consistency_checks(shard);
1019 
1020 	malloc_mutex_prefork(tsdn, &shard->grow_mtx);
1021 }
1022 
1023 void
1024 hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard) {
1025 	hpa_do_consistency_checks(shard);
1026 
1027 	malloc_mutex_prefork(tsdn, &shard->mtx);
1028 }
1029 
1030 void
1031 hpa_shard_postfork_parent(tsdn_t *tsdn, hpa_shard_t *shard) {
1032 	hpa_do_consistency_checks(shard);
1033 
1034 	malloc_mutex_postfork_parent(tsdn, &shard->grow_mtx);
1035 	malloc_mutex_postfork_parent(tsdn, &shard->mtx);
1036 }
1037 
1038 void
1039 hpa_shard_postfork_child(tsdn_t *tsdn, hpa_shard_t *shard) {
1040 	hpa_do_consistency_checks(shard);
1041 
1042 	malloc_mutex_postfork_child(tsdn, &shard->grow_mtx);
1043 	malloc_mutex_postfork_child(tsdn, &shard->mtx);
1044 }
1045