#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_includes.h"

#include "jemalloc/internal/hpa.h"

#include "jemalloc/internal/fb.h"
#include "jemalloc/internal/witness.h"

#define HPA_EDEN_SIZE (128 * HUGEPAGE)

static edata_t *hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size,
    size_t alignment, bool zero, bool guarded, bool frequent_reuse,
    bool *deferred_work_generated);
static size_t hpa_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size,
    size_t nallocs, edata_list_active_t *results, bool *deferred_work_generated);
static bool hpa_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool zero, bool *deferred_work_generated);
static bool hpa_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool *deferred_work_generated);
static void hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    bool *deferred_work_generated);
static void hpa_dalloc_batch(tsdn_t *tsdn, pai_t *self,
    edata_list_active_t *list, bool *deferred_work_generated);
static uint64_t hpa_time_until_deferred_work(tsdn_t *tsdn, pai_t *self);

bool
hpa_supported() {
#ifdef _WIN32
	/*
	 * At least until the API and implementation is somewhat settled, we
	 * don't want to try to debug the VM subsystem on the hardest-to-test
	 * platform.
	 */
	return false;
#endif
	if (!pages_can_hugify) {
		return false;
	}
	/*
	 * We fundamentally rely on an address-space-hungry growth strategy for
	 * hugepages.
	 */
	if (LG_SIZEOF_PTR != 3) {
		return false;
	}
	/*
	 * If we couldn't detect the value of HUGEPAGE, HUGEPAGE_PAGES becomes
	 * this sentinel value -- see the comment in pages.h.
	 */
	if (HUGEPAGE_PAGES == 1) {
		return false;
	}
	return true;
}

static void
hpa_do_consistency_checks(hpa_shard_t *shard) {
	assert(shard->base != NULL);
}

bool
hpa_central_init(hpa_central_t *central, base_t *base, const hpa_hooks_t *hooks) {
	/* malloc_conf processing should have filtered out these cases. */
	assert(hpa_supported());
	bool err;
	err = malloc_mutex_init(&central->grow_mtx, "hpa_central_grow",
	    WITNESS_RANK_HPA_CENTRAL_GROW, malloc_mutex_rank_exclusive);
	if (err) {
		return true;
	}
	err = malloc_mutex_init(&central->mtx, "hpa_central",
	    WITNESS_RANK_HPA_CENTRAL, malloc_mutex_rank_exclusive);
	if (err) {
		return true;
	}
	central->base = base;
	central->eden = NULL;
	central->eden_len = 0;
	central->age_counter = 0;
	central->hooks = *hooks;
	return false;
}

static hpdata_t *
hpa_alloc_ps(tsdn_t *tsdn, hpa_central_t *central) {
	return (hpdata_t *)base_alloc(tsdn, central->base, sizeof(hpdata_t),
	    CACHELINE);
}

hpdata_t *
hpa_central_extract(tsdn_t *tsdn, hpa_central_t *central, size_t size,
    bool *oom) {
	/* Don't yet support big allocations; these should get filtered out. */
	assert(size <= HUGEPAGE);
	/*
	 * Should only try to extract from the central allocator if the local
	 * shard is exhausted. We should hold the grow_mtx on that shard.
	 */
	witness_assert_positive_depth_to_rank(
	    tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_HPA_SHARD_GROW);

	malloc_mutex_lock(tsdn, &central->grow_mtx);
	*oom = false;

	hpdata_t *ps = NULL;

	/* Is eden a perfect fit? */
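	/*
	 * (If it is -- i.e. exactly one hugepage of eden remains -- hand the
	 * whole thing out as the new pageslab and reset eden; otherwise fall
	 * through and carve one hugepage off the front of eden below.)
	 */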
	if (central->eden != NULL && central->eden_len == HUGEPAGE) {
		ps = hpa_alloc_ps(tsdn, central);
		if (ps == NULL) {
			*oom = true;
			malloc_mutex_unlock(tsdn, &central->grow_mtx);
			return NULL;
		}
		hpdata_init(ps, central->eden, central->age_counter++);
		central->eden = NULL;
		central->eden_len = 0;
		malloc_mutex_unlock(tsdn, &central->grow_mtx);
		return ps;
	}

	/*
	 * We're about to try to allocate from eden by splitting. If eden is
	 * NULL, we have to allocate it too. Otherwise, we just have to
	 * allocate an edata_t for the new psset.
	 */
	if (central->eden == NULL) {
		/*
		 * During development, we're primarily concerned with systems
		 * with overcommit. Eventually, we should be more careful here.
		 */
		bool commit = true;
		/* Allocate address space, bailing if we fail. */
		void *new_eden = pages_map(NULL, HPA_EDEN_SIZE, HUGEPAGE,
		    &commit);
		if (new_eden == NULL) {
			*oom = true;
			malloc_mutex_unlock(tsdn, &central->grow_mtx);
			return NULL;
		}
		ps = hpa_alloc_ps(tsdn, central);
		if (ps == NULL) {
			pages_unmap(new_eden, HPA_EDEN_SIZE);
			*oom = true;
			malloc_mutex_unlock(tsdn, &central->grow_mtx);
			return NULL;
		}
		central->eden = new_eden;
		central->eden_len = HPA_EDEN_SIZE;
	} else {
		/* Eden is already nonempty; only need an edata for ps. */
		ps = hpa_alloc_ps(tsdn, central);
		if (ps == NULL) {
			*oom = true;
			malloc_mutex_unlock(tsdn, &central->grow_mtx);
			return NULL;
		}
	}
	assert(ps != NULL);
	assert(central->eden != NULL);
	assert(central->eden_len > HUGEPAGE);
	assert(central->eden_len % HUGEPAGE == 0);
	assert(HUGEPAGE_ADDR2BASE(central->eden) == central->eden);

	hpdata_init(ps, central->eden, central->age_counter++);

	char *eden_char = (char *)central->eden;
	eden_char += HUGEPAGE;
	central->eden = (void *)eden_char;
	central->eden_len -= HUGEPAGE;

	malloc_mutex_unlock(tsdn, &central->grow_mtx);

	return ps;
}

bool
hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap,
    base_t *base, edata_cache_t *edata_cache, unsigned ind,
    const hpa_shard_opts_t *opts) {
	/* malloc_conf processing should have filtered out these cases. */
	assert(hpa_supported());
	bool err;
	err = malloc_mutex_init(&shard->grow_mtx, "hpa_shard_grow",
	    WITNESS_RANK_HPA_SHARD_GROW, malloc_mutex_rank_exclusive);
	if (err) {
		return true;
	}
	err = malloc_mutex_init(&shard->mtx, "hpa_shard",
	    WITNESS_RANK_HPA_SHARD, malloc_mutex_rank_exclusive);
	if (err) {
		return true;
	}

	assert(edata_cache != NULL);
	shard->central = central;
	shard->base = base;
	edata_cache_fast_init(&shard->ecf, edata_cache);
	psset_init(&shard->psset);
	shard->age_counter = 0;
	shard->ind = ind;
	shard->emap = emap;

	shard->opts = *opts;

	shard->npending_purge = 0;
	nstime_init_zero(&shard->last_purge);

	shard->stats.npurge_passes = 0;
	shard->stats.npurges = 0;
	shard->stats.nhugifies = 0;
	shard->stats.ndehugifies = 0;

	/*
	 * Fill these in last, so that if an hpa_shard gets used despite
	 * initialization failing, we'll at least crash instead of just
	 * operating on corrupted data.
	 */
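	/*
	 * This table is what lets callers drive the shard through the generic
	 * pai_t interface; hpa_from_pai() below checks these exact values
	 * before casting a pai_t back to its enclosing hpa_shard_t.
	 */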
	shard->pai.alloc = &hpa_alloc;
	shard->pai.alloc_batch = &hpa_alloc_batch;
	shard->pai.expand = &hpa_expand;
	shard->pai.shrink = &hpa_shrink;
	shard->pai.dalloc = &hpa_dalloc;
	shard->pai.dalloc_batch = &hpa_dalloc_batch;
	shard->pai.time_until_deferred_work = &hpa_time_until_deferred_work;

	hpa_do_consistency_checks(shard);

	return false;
}

/*
 * Note that the stats functions here follow the usual stats naming conventions;
 * "merge" obtains the stats from some live object or instance, while "accum"
 * only combines the stats from one stats object to another. Hence the lack of
 * locking here.
 */
static void
hpa_shard_nonderived_stats_accum(hpa_shard_nonderived_stats_t *dst,
    hpa_shard_nonderived_stats_t *src) {
	dst->npurge_passes += src->npurge_passes;
	dst->npurges += src->npurges;
	dst->nhugifies += src->nhugifies;
	dst->ndehugifies += src->ndehugifies;
}

void
hpa_shard_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src) {
	psset_stats_accum(&dst->psset_stats, &src->psset_stats);
	hpa_shard_nonderived_stats_accum(&dst->nonderived_stats,
	    &src->nonderived_stats);
}

void
hpa_shard_stats_merge(tsdn_t *tsdn, hpa_shard_t *shard,
    hpa_shard_stats_t *dst) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_lock(tsdn, &shard->grow_mtx);
	malloc_mutex_lock(tsdn, &shard->mtx);
	psset_stats_accum(&dst->psset_stats, &shard->psset.stats);
	hpa_shard_nonderived_stats_accum(&dst->nonderived_stats, &shard->stats);
	malloc_mutex_unlock(tsdn, &shard->mtx);
	malloc_mutex_unlock(tsdn, &shard->grow_mtx);
}

static bool
hpa_good_hugification_candidate(hpa_shard_t *shard, hpdata_t *ps) {
	/*
	 * Note that this needs to be >= rather than just >, because of the
	 * important special case in which the hugification threshold is exactly
	 * HUGEPAGE.
	 */
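	/*
	 * (Illustrative numbers only: with 4K pages and 2M hugepages, a
	 * threshold of exactly HUGEPAGE requires all 512 pages of the slab to
	 * be active, and a strict > could never be satisfied at all.)
	 */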
	return hpdata_nactive_get(ps) * PAGE
	    >= shard->opts.hugification_threshold;
}

static size_t
hpa_adjusted_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	return psset_ndirty(&shard->psset) - shard->npending_purge;
}

static size_t
hpa_ndirty_max(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	if (shard->opts.dirty_mult == (fxp_t)-1) {
		return (size_t)-1;
	}
	return fxp_mul_frac(psset_nactive(&shard->psset),
	    shard->opts.dirty_mult);
}

static bool
hpa_hugify_blocked_by_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
	if (to_hugify == NULL) {
		return false;
	}
	return hpa_adjusted_ndirty(tsdn, shard)
	    + hpdata_nretained_get(to_hugify) > hpa_ndirty_max(tsdn, shard);
}

static bool
hpa_should_purge(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	if (hpa_adjusted_ndirty(tsdn, shard) > hpa_ndirty_max(tsdn, shard)) {
		return true;
	}
	if (hpa_hugify_blocked_by_ndirty(tsdn, shard)) {
		return true;
	}
	return false;
}

static void
hpa_update_purge_hugify_eligibility(tsdn_t *tsdn, hpa_shard_t *shard,
    hpdata_t *ps) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	if (hpdata_changing_state_get(ps)) {
		hpdata_purge_allowed_set(ps, false);
		hpdata_disallow_hugify(ps);
		return;
	}
	/*
	 * Hugepages are distinctly costly to purge, so try to avoid it unless
	 * they're *particularly* full of dirty pages. Eventually, we should
	 * use a smarter / more dynamic heuristic for situations where we have
	 * to manually hugify.
	 *
	 * In situations where we don't manually hugify, this problem is
	 * reduced. The "bad" situation we're trying to avoid is one that's
	 * common in some Linux configurations (where both enabled and defrag
	 * are set to madvise) that can lead to long latency spikes on the first
	 * access after a hugification. The ideal policy in such configurations
	 * is probably time-based for both purging and hugifying; only hugify a
	 * hugepage if it's met the criteria for some extended period of time,
	 * and only dehugify it if it's failed to meet the criteria for an
	 * extended period of time. When background threads are on, we should
	 * try to take this hit on one of them, as well.
	 *
	 * I think the ideal setting is THP always enabled, and defrag set to
	 * deferred; in that case we don't need any explicit calls on the
	 * allocator's end at all; we just try to pack allocations in a
	 * hugepage-friendly manner and let the OS hugify in the background.
	 */
	hpdata_purge_allowed_set(ps, hpdata_ndirty_get(ps) > 0);
	if (hpa_good_hugification_candidate(shard, ps)
	    && !hpdata_huge_get(ps)) {
		nstime_t now;
		shard->central->hooks.curtime(&now, /* first_reading */ true);
		hpdata_allow_hugify(ps, now);
	}
	/*
	 * Once a hugepage has become eligible for hugification, we don't mark
	 * it as ineligible just because it stops meeting the criteria (this
	 * could lead to situations where a hugepage that spends most of its
	 * time meeting the criteria never quite gets hugified if there are
	 * intervening deallocations).
	 * The idea is that the hugification delay
	 * will allow them to get purged, resetting their "hugify-allowed" bit.
	 * If they don't get purged, then the hugification isn't hurting and
	 * might help. As an exception, we don't hugify hugepages that are now
	 * empty; it definitely doesn't help there until the hugepage gets
	 * reused, which is likely not for a while.
	 */
	if (hpdata_nactive_get(ps) == 0) {
		hpdata_disallow_hugify(ps);
	}
}

static bool
hpa_shard_has_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
	return to_hugify != NULL || hpa_should_purge(tsdn, shard);
}

/* Returns whether or not we purged anything. */
static bool
hpa_try_purge(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);

	hpdata_t *to_purge = psset_pick_purge(&shard->psset);
	if (to_purge == NULL) {
		return false;
	}
	assert(hpdata_purge_allowed_get(to_purge));
	assert(!hpdata_changing_state_get(to_purge));

	/*
	 * Don't let anyone else purge or hugify this page while
	 * we're purging it (allocations and deallocations are
	 * OK).
	 */
	psset_update_begin(&shard->psset, to_purge);
	assert(hpdata_alloc_allowed_get(to_purge));
	hpdata_mid_purge_set(to_purge, true);
	hpdata_purge_allowed_set(to_purge, false);
	hpdata_disallow_hugify(to_purge);
	/*
	 * Unlike with hugification (where concurrent
	 * allocations are allowed), concurrent allocation out
	 * of a hugepage being purged is unsafe; we might hand
	 * out an extent for an allocation and then purge it
	 * (clearing out user data).
	 */
	hpdata_alloc_allowed_set(to_purge, false);
	psset_update_end(&shard->psset, to_purge);

	/* Gather all the metadata we'll need during the purge. */
	bool dehugify = hpdata_huge_get(to_purge);
	hpdata_purge_state_t purge_state;
	size_t num_to_purge = hpdata_purge_begin(to_purge, &purge_state);

	shard->npending_purge += num_to_purge;

	malloc_mutex_unlock(tsdn, &shard->mtx);

	/* Actually do the purging, now that the lock is dropped. */
	if (dehugify) {
		shard->central->hooks.dehugify(hpdata_addr_get(to_purge),
		    HUGEPAGE);
	}
	size_t total_purged = 0;
	uint64_t purges_this_pass = 0;
	void *purge_addr;
	size_t purge_size;
	while (hpdata_purge_next(to_purge, &purge_state, &purge_addr,
	    &purge_size)) {
		total_purged += purge_size;
		assert(total_purged <= HUGEPAGE);
		purges_this_pass++;
		shard->central->hooks.purge(purge_addr, purge_size);
	}

	malloc_mutex_lock(tsdn, &shard->mtx);
	/* The shard updates. */
	shard->npending_purge -= num_to_purge;
	shard->stats.npurge_passes++;
	shard->stats.npurges += purges_this_pass;
	shard->central->hooks.curtime(&shard->last_purge,
	    /* first_reading */ false);
	if (dehugify) {
		shard->stats.ndehugifies++;
	}

	/* The hpdata updates. */
	psset_update_begin(&shard->psset, to_purge);
	if (dehugify) {
		hpdata_dehugify(to_purge);
	}
	hpdata_purge_end(to_purge, &purge_state);
	hpdata_mid_purge_set(to_purge, false);

	hpdata_alloc_allowed_set(to_purge, true);
	hpa_update_purge_hugify_eligibility(tsdn, shard, to_purge);

	psset_update_end(&shard->psset, to_purge);

	return true;
}

/* Returns whether or not we hugified anything. */
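/*
 * Like hpa_try_purge() above, this drops shard->mtx around the (potentially
 * slow) hugify hook call and retakes it for the stats and psset updates.
 */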
static bool
hpa_try_hugify(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);

	if (hpa_hugify_blocked_by_ndirty(tsdn, shard)) {
		return false;
	}

	hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
	if (to_hugify == NULL) {
		return false;
	}
	assert(hpdata_hugify_allowed_get(to_hugify));
	assert(!hpdata_changing_state_get(to_hugify));

	/* Make sure that it's been hugifiable for long enough. */
	nstime_t time_hugify_allowed = hpdata_time_hugify_allowed(to_hugify);
	uint64_t millis = shard->central->hooks.ms_since(&time_hugify_allowed);
	if (millis < shard->opts.hugify_delay_ms) {
		return false;
	}

	/*
	 * Don't let anyone else purge or hugify this page while
	 * we're hugifying it (allocations and deallocations are
	 * OK).
	 */
	psset_update_begin(&shard->psset, to_hugify);
	hpdata_mid_hugify_set(to_hugify, true);
	hpdata_purge_allowed_set(to_hugify, false);
	hpdata_disallow_hugify(to_hugify);
	assert(hpdata_alloc_allowed_get(to_hugify));
	psset_update_end(&shard->psset, to_hugify);

	malloc_mutex_unlock(tsdn, &shard->mtx);

	shard->central->hooks.hugify(hpdata_addr_get(to_hugify), HUGEPAGE);

	malloc_mutex_lock(tsdn, &shard->mtx);
	shard->stats.nhugifies++;

	psset_update_begin(&shard->psset, to_hugify);
	hpdata_hugify(to_hugify);
	hpdata_mid_hugify_set(to_hugify, false);
	hpa_update_purge_hugify_eligibility(tsdn, shard, to_hugify);
	psset_update_end(&shard->psset, to_hugify);

	return true;
}

/*
 * Execution of deferred work is forced if it's triggered by an explicit
 * hpa_shard_do_deferred_work() call.
 */
static void
hpa_shard_maybe_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard,
    bool forced) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	if (!forced && shard->opts.deferral_allowed) {
		return;
	}
	/*
	 * If we're on a background thread, do work so long as there's work to
	 * be done. Otherwise, bound latency to not be *too* bad by doing at
	 * most a small fixed number of operations.
	 */
	bool hugified = false;
	bool purged = false;
	size_t max_ops = (forced ? (size_t)-1 : 16);
	size_t nops = 0;
	do {
		/*
		 * Always purge before hugifying, to make sure we get some
		 * ability to hit our quiescence targets.
		 */
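		/*
		 * (Purging also lowers the adjusted dirty-page count, which is
		 * what can unblock a hugification that
		 * hpa_hugify_blocked_by_ndirty() is currently vetoing.)
		 */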
		purged = false;
		while (hpa_should_purge(tsdn, shard) && nops < max_ops) {
			purged = hpa_try_purge(tsdn, shard);
			if (purged) {
				nops++;
			}
		}
		hugified = hpa_try_hugify(tsdn, shard);
		if (hugified) {
			nops++;
		}
		malloc_mutex_assert_owner(tsdn, &shard->mtx);
	} while ((hugified || purged) && nops < max_ops);
}

static edata_t *
hpa_try_alloc_one_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
    bool *oom) {
	bool err;
	edata_t *edata = edata_cache_fast_get(tsdn, &shard->ecf);
	if (edata == NULL) {
		*oom = true;
		return NULL;
	}

	hpdata_t *ps = psset_pick_alloc(&shard->psset, size);
	if (ps == NULL) {
		edata_cache_fast_put(tsdn, &shard->ecf, edata);
		return NULL;
	}

	psset_update_begin(&shard->psset, ps);

	if (hpdata_empty(ps)) {
		/*
		 * If the pageslab used to be empty, treat it as though it's
		 * brand new for fragmentation-avoidance purposes; what we're
		 * trying to approximate is the age of the allocations *in* that
		 * pageslab, and the allocations in the new pageslab are
		 * definitionally the youngest in this hpa shard.
		 */
		hpdata_age_set(ps, shard->age_counter++);
	}

	void *addr = hpdata_reserve_alloc(ps, size);
	edata_init(edata, shard->ind, addr, size, /* slab */ false,
	    SC_NSIZES, /* sn */ hpdata_age_get(ps), extent_state_active,
	    /* zeroed */ false, /* committed */ true, EXTENT_PAI_HPA,
	    EXTENT_NOT_HEAD);
	edata_ps_set(edata, ps);

	/*
	 * This could theoretically be moved outside of the critical section,
	 * but that introduces the potential for a race. Without the lock, the
	 * (initially nonempty, since this is the reuse pathway) pageslab we
	 * allocated out of could become otherwise empty while the lock is
	 * dropped. This would force us to deal with a pageslab eviction down
	 * the error pathway, which is a pain.
	 */
	err = emap_register_boundary(tsdn, shard->emap, edata,
	    SC_NSIZES, /* slab */ false);
	if (err) {
		hpdata_unreserve(ps, edata_addr_get(edata),
		    edata_size_get(edata));
		/*
		 * We should arguably reset dirty state here, but this would
		 * require some sort of prepare + commit functionality that's a
		 * little much to deal with for now.
		 *
		 * We don't have a do_deferred_work down this pathway, on the
		 * principle that we didn't *really* affect shard state (we
		 * tweaked the stats, but our tweaks weren't really accurate).
		 */
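		/*
		 * Unwind in reverse order: finish the psset update we began
		 * above, then hand the edata back to the cache.
		 */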
		psset_update_end(&shard->psset, ps);
		edata_cache_fast_put(tsdn, &shard->ecf, edata);
		*oom = true;
		return NULL;
	}

	hpa_update_purge_hugify_eligibility(tsdn, shard, ps);
	psset_update_end(&shard->psset, ps);
	return edata;
}

static size_t
hpa_try_alloc_batch_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
    bool *oom, size_t nallocs, edata_list_active_t *results,
    bool *deferred_work_generated) {
	malloc_mutex_lock(tsdn, &shard->mtx);
	size_t nsuccess = 0;
	for (; nsuccess < nallocs; nsuccess++) {
		edata_t *edata = hpa_try_alloc_one_no_grow(tsdn, shard, size,
		    oom);
		if (edata == NULL) {
			break;
		}
		edata_list_active_append(results, edata);
	}

	hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ false);
	*deferred_work_generated = hpa_shard_has_deferred_work(tsdn, shard);
	malloc_mutex_unlock(tsdn, &shard->mtx);
	return nsuccess;
}

static size_t
hpa_alloc_batch_psset(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
    size_t nallocs, edata_list_active_t *results,
    bool *deferred_work_generated) {
	assert(size <= shard->opts.slab_max_alloc);
	bool oom = false;

	size_t nsuccess = hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
	    nallocs, results, deferred_work_generated);

	if (nsuccess == nallocs || oom) {
		return nsuccess;
	}

	/*
	 * We didn't OOM, but weren't able to fill everything requested of us;
	 * try to grow.
	 */
	malloc_mutex_lock(tsdn, &shard->grow_mtx);
	/*
	 * Check for grow races; maybe some earlier thread expanded the psset
	 * in between when we dropped the main mutex and grabbed the grow mutex.
	 */
	nsuccess += hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
	    nallocs - nsuccess, results, deferred_work_generated);
	if (nsuccess == nallocs || oom) {
		malloc_mutex_unlock(tsdn, &shard->grow_mtx);
		return nsuccess;
	}

	/*
	 * Note that we don't hold shard->mtx here (while growing);
	 * deallocations (and allocations of smaller sizes) may still succeed
	 * while we're doing this potentially expensive system call.
	 */
	hpdata_t *ps = hpa_central_extract(tsdn, shard->central, size, &oom);
	if (ps == NULL) {
		malloc_mutex_unlock(tsdn, &shard->grow_mtx);
		return nsuccess;
	}

	/*
	 * We got the pageslab; allocate from it. This does an unlock followed
	 * by a lock on the same mutex, and holds the grow mutex while doing
	 * deferred work, but this is an uncommon path; the simplicity is worth
	 * it.
	 */
	malloc_mutex_lock(tsdn, &shard->mtx);
	psset_insert(&shard->psset, ps);
	malloc_mutex_unlock(tsdn, &shard->mtx);

	nsuccess += hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
	    nallocs - nsuccess, results, deferred_work_generated);
	/*
	 * Drop grow_mtx before doing deferred work; other threads blocked on it
	 * should be allowed to proceed while we're working.
	 */
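	/*
	 * (Any deferred work already happened inside the
	 * hpa_try_alloc_batch_no_grow() call above, under shard->mtx; all
	 * that's left to do here is release grow_mtx.)
	 */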
	malloc_mutex_unlock(tsdn, &shard->grow_mtx);

	return nsuccess;
}

static hpa_shard_t *
hpa_from_pai(pai_t *self) {
	assert(self->alloc == &hpa_alloc);
	assert(self->expand == &hpa_expand);
	assert(self->shrink == &hpa_shrink);
	assert(self->dalloc == &hpa_dalloc);
	return (hpa_shard_t *)self;
}

static size_t
hpa_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size, size_t nallocs,
    edata_list_active_t *results, bool *deferred_work_generated) {
	assert(nallocs > 0);
	assert((size & PAGE_MASK) == 0);
	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
	    WITNESS_RANK_CORE, 0);
	hpa_shard_t *shard = hpa_from_pai(self);

	if (size > shard->opts.slab_max_alloc) {
		return 0;
	}

	size_t nsuccess = hpa_alloc_batch_psset(tsdn, shard, size, nallocs,
	    results, deferred_work_generated);

	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
	    WITNESS_RANK_CORE, 0);

	/*
	 * Guard the sanity checks with config_debug because the loop cannot be
	 * proven non-circular by the compiler, even if everything within the
	 * loop is optimized away.
	 */
	if (config_debug) {
		edata_t *edata;
		ql_foreach(edata, &results->head, ql_link_active) {
			emap_assert_mapped(tsdn, shard->emap, edata);
			assert(edata_pai_get(edata) == EXTENT_PAI_HPA);
			assert(edata_state_get(edata) == extent_state_active);
			assert(edata_arena_ind_get(edata) == shard->ind);
			assert(edata_szind_get_maybe_invalid(edata) ==
			    SC_NSIZES);
			assert(!edata_slab_get(edata));
			assert(edata_committed_get(edata));
			assert(edata_base_get(edata) == edata_addr_get(edata));
			assert(edata_base_get(edata) != NULL);
		}
	}
	return nsuccess;
}

static edata_t *
hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero,
    bool guarded, bool frequent_reuse, bool *deferred_work_generated) {
	assert((size & PAGE_MASK) == 0);
	assert(!guarded);
	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
	    WITNESS_RANK_CORE, 0);

	/* We don't handle alignment or zeroing for now. */
	if (alignment > PAGE || zero) {
		return NULL;
	}
	/*
	 * An alloc with alignment == PAGE and zero == false is equivalent to a
	 * batch alloc of 1. Just do that, so we can share code.
	 */
	edata_list_active_t results;
	edata_list_active_init(&results);
	size_t nallocs = hpa_alloc_batch(tsdn, self, size, /* nallocs */ 1,
	    &results, deferred_work_generated);
	assert(nallocs == 0 || nallocs == 1);
	edata_t *edata = edata_list_active_first(&results);
	return edata;
}

static bool
hpa_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
    size_t new_size, bool zero, bool *deferred_work_generated) {
	/* Expand not yet supported. */
	return true;
}

static bool
hpa_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool *deferred_work_generated) {
	/* Shrink not yet supported. */
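	/*
	 * (As in hpa_expand() above, returning true is the failure convention,
	 * so callers just treat HPA extents as not resizable in place.)
	 */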
	return true;
}

static void
hpa_dalloc_prepare_unlocked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) {
	malloc_mutex_assert_not_owner(tsdn, &shard->mtx);

	assert(edata_pai_get(edata) == EXTENT_PAI_HPA);
	assert(edata_state_get(edata) == extent_state_active);
	assert(edata_arena_ind_get(edata) == shard->ind);
	assert(edata_szind_get_maybe_invalid(edata) == SC_NSIZES);
	assert(edata_committed_get(edata));
	assert(edata_base_get(edata) != NULL);

	/*
	 * Another thread shouldn't be trying to touch the metadata of an
	 * allocation being freed. The one exception is a merge attempt from a
	 * lower-addressed PAC extent; in this case we have a nominal race on
	 * the edata metadata bits, but in practice the fact that the PAI bits
	 * are different will prevent any further access. The race is bad, but
	 * benign in practice, and the long term plan is to track enough state
	 * in the rtree to prevent these merge attempts in the first place.
	 */
	edata_addr_set(edata, edata_base_get(edata));
	edata_zeroed_set(edata, false);
	emap_deregister_boundary(tsdn, shard->emap, edata);
}

static void
hpa_dalloc_locked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);

	/*
	 * Release the metadata early, to avoid having to remember to do it
	 * while we're also doing tricky purging logic. First, we need to grab
	 * a few bits of metadata from it.
	 *
	 * Note that the shard mutex protects ps's metadata too; it wouldn't be
	 * correct to try to read most information out of it without the lock.
	 */
	hpdata_t *ps = edata_ps_get(edata);
	/* Currently, all edatas come from pageslabs. */
	assert(ps != NULL);
	void *unreserve_addr = edata_addr_get(edata);
	size_t unreserve_size = edata_size_get(edata);
	edata_cache_fast_put(tsdn, &shard->ecf, edata);

	psset_update_begin(&shard->psset, ps);
	hpdata_unreserve(ps, unreserve_addr, unreserve_size);
	hpa_update_purge_hugify_eligibility(tsdn, shard, ps);
	psset_update_end(&shard->psset, ps);
}

static void
hpa_dalloc_batch(tsdn_t *tsdn, pai_t *self, edata_list_active_t *list,
    bool *deferred_work_generated) {
	hpa_shard_t *shard = hpa_from_pai(self);

	edata_t *edata;
	ql_foreach(edata, &list->head, ql_link_active) {
		hpa_dalloc_prepare_unlocked(tsdn, shard, edata);
	}

	malloc_mutex_lock(tsdn, &shard->mtx);
	/* Now, remove from the list. */
	while ((edata = edata_list_active_first(list)) != NULL) {
		edata_list_active_remove(list, edata);
		hpa_dalloc_locked(tsdn, shard, edata);
	}
	hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ false);
	*deferred_work_generated =
	    hpa_shard_has_deferred_work(tsdn, shard);

	malloc_mutex_unlock(tsdn, &shard->mtx);
}

static void
hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    bool *deferred_work_generated) {
	assert(!edata_guarded_get(edata));
	/* Just a dalloc_batch of size 1; this lets us share logic. */
	edata_list_active_t dalloc_list;
	edata_list_active_init(&dalloc_list);
	edata_list_active_append(&dalloc_list, edata);
	hpa_dalloc_batch(tsdn, self, &dalloc_list, deferred_work_generated);
}

/*
 * Calculate time until either purging or hugification ought to happen.
 * Called by background threads.
 */
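/*
 * (Worked example with made-up numbers: if hugify_delay_ms is 10000 and a slab
 * became hugify-eligible 4000ms ago, the hugification branch below asks to be
 * woken in 6000ms, i.e. 6000 * 1000 * 1000 ns, unless the purge branch
 * computes an earlier deadline.)
 */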
static uint64_t
hpa_time_until_deferred_work(tsdn_t *tsdn, pai_t *self) {
	hpa_shard_t *shard = hpa_from_pai(self);
	uint64_t time_ns = BACKGROUND_THREAD_DEFERRED_MAX;

	malloc_mutex_lock(tsdn, &shard->mtx);

	hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
	if (to_hugify != NULL) {
		nstime_t time_hugify_allowed =
		    hpdata_time_hugify_allowed(to_hugify);
		uint64_t since_hugify_allowed_ms =
		    shard->central->hooks.ms_since(&time_hugify_allowed);
		/*
		 * If not enough time has passed since hugification was allowed,
		 * sleep for the rest.
		 */
		if (since_hugify_allowed_ms < shard->opts.hugify_delay_ms) {
			time_ns = shard->opts.hugify_delay_ms -
			    since_hugify_allowed_ms;
			time_ns *= 1000 * 1000;
		} else {
			malloc_mutex_unlock(tsdn, &shard->mtx);
			return BACKGROUND_THREAD_DEFERRED_MIN;
		}
	}

	if (hpa_should_purge(tsdn, shard)) {
		/*
		 * If we haven't purged before, no need to check interval
		 * between purges. Simply purge as soon as possible.
		 */
		if (shard->stats.npurge_passes == 0) {
			malloc_mutex_unlock(tsdn, &shard->mtx);
			return BACKGROUND_THREAD_DEFERRED_MIN;
		}
		uint64_t since_last_purge_ms = shard->central->hooks.ms_since(
		    &shard->last_purge);

		if (since_last_purge_ms < shard->opts.min_purge_interval_ms) {
			uint64_t until_purge_ns;
			until_purge_ns = shard->opts.min_purge_interval_ms -
			    since_last_purge_ms;
			until_purge_ns *= 1000 * 1000;

			if (until_purge_ns < time_ns) {
				time_ns = until_purge_ns;
			}
		} else {
			time_ns = BACKGROUND_THREAD_DEFERRED_MIN;
		}
	}
	malloc_mutex_unlock(tsdn, &shard->mtx);
	return time_ns;
}

void
hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_lock(tsdn, &shard->mtx);
	edata_cache_fast_disable(tsdn, &shard->ecf);
	malloc_mutex_unlock(tsdn, &shard->mtx);
}

static void
hpa_shard_assert_stats_empty(psset_bin_stats_t *bin_stats) {
	assert(bin_stats->npageslabs == 0);
	assert(bin_stats->nactive == 0);
}

static void
hpa_assert_empty(tsdn_t *tsdn, hpa_shard_t *shard, psset_t *psset) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	for (int huge = 0; huge <= 1; huge++) {
		hpa_shard_assert_stats_empty(&psset->stats.full_slabs[huge]);
		for (pszind_t i = 0; i < PSSET_NPSIZES; i++) {
			hpa_shard_assert_stats_empty(
			    &psset->stats.nonfull_slabs[i][huge]);
		}
	}
}

void
hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);
	/*
	 * By the time we're here, the arena code should have dalloc'd all the
	 * active extents, which means we should have eventually evicted
	 * everything from the psset, so it shouldn't be able to serve even a
	 * 1-page allocation.
	 */
	if (config_debug) {
		malloc_mutex_lock(tsdn, &shard->mtx);
		hpa_assert_empty(tsdn, shard, &shard->psset);
		malloc_mutex_unlock(tsdn, &shard->mtx);
	}
	hpdata_t *ps;
	while ((ps = psset_pick_alloc(&shard->psset, PAGE)) != NULL) {
		/* There should be no allocations anywhere. */
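		/*
		 * (psset_pick_alloc(PAGE) above doubles as an "any pageslabs
		 * left?" query: an empty slab can still serve a one-page
		 * allocation, so slabs keep getting returned until we've
		 * removed and unmapped them all.)
		 */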
		assert(hpdata_empty(ps));
		psset_remove(&shard->psset, ps);
		shard->central->hooks.unmap(hpdata_addr_get(ps), HUGEPAGE);
	}
}

void
hpa_shard_set_deferral_allowed(tsdn_t *tsdn, hpa_shard_t *shard,
    bool deferral_allowed) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_lock(tsdn, &shard->mtx);
	bool deferral_previously_allowed = shard->opts.deferral_allowed;
	shard->opts.deferral_allowed = deferral_allowed;
	if (deferral_previously_allowed && !deferral_allowed) {
		hpa_shard_maybe_do_deferred_work(tsdn, shard,
		    /* forced */ true);
	}
	malloc_mutex_unlock(tsdn, &shard->mtx);
}

void
hpa_shard_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_lock(tsdn, &shard->mtx);
	hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ true);
	malloc_mutex_unlock(tsdn, &shard->mtx);
}

void
hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_prefork(tsdn, &shard->grow_mtx);
}

void
hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_prefork(tsdn, &shard->mtx);
}

void
hpa_shard_postfork_parent(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_postfork_parent(tsdn, &shard->grow_mtx);
	malloc_mutex_postfork_parent(tsdn, &shard->mtx);
}

void
hpa_shard_postfork_child(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_postfork_child(tsdn, &shard->grow_mtx);
	malloc_mutex_postfork_child(tsdn, &shard->mtx);
}