#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_includes.h"

#include "jemalloc/internal/hpa.h"

#include "jemalloc/internal/fb.h"
#include "jemalloc/internal/witness.h"

#define HPA_EDEN_SIZE	(128 * HUGEPAGE)

static edata_t *hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size,
    size_t alignment, bool zero, bool guarded, bool frequent_reuse,
    bool *deferred_work_generated);
static size_t hpa_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size,
    size_t nallocs, edata_list_active_t *results, bool *deferred_work_generated);
static bool hpa_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool zero, bool *deferred_work_generated);
static bool hpa_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool *deferred_work_generated);
static void hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    bool *deferred_work_generated);
static void hpa_dalloc_batch(tsdn_t *tsdn, pai_t *self,
    edata_list_active_t *list, bool *deferred_work_generated);
static uint64_t hpa_time_until_deferred_work(tsdn_t *tsdn, pai_t *self);

bool
hpa_supported() {
#ifdef _WIN32
	/*
	 * At least until the API and implementation is somewhat settled, we
	 * don't want to try to debug the VM subsystem on the hardest-to-test
	 * platform.
	 */
	return false;
#endif
	if (!pages_can_hugify) {
		return false;
	}
	/*
	 * We fundamentally rely on an address-space-hungry growth strategy
	 * for hugepages.
	 */
	if (LG_SIZEOF_PTR != 3) {
		return false;
	}
	/*
	 * If we couldn't detect the value of HUGEPAGE, HUGEPAGE_PAGES becomes
	 * this sentinel value -- see the comment in pages.h.
	 */
	if (HUGEPAGE_PAGES == 1) {
		return false;
	}
	return true;
}

static void
hpa_do_consistency_checks(hpa_shard_t *shard) {
	assert(shard->base != NULL);
}

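/*
 * Initializes the central allocator from which all shards obtain their
 * hugepage-sized pageslabs. Returns true if either mutex fails to
 * initialize, false on success.
 */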
bool
hpa_central_init(hpa_central_t *central, base_t *base, const hpa_hooks_t *hooks) {
	/* malloc_conf processing should have filtered out these cases. */
	assert(hpa_supported());
	bool err;
	err = malloc_mutex_init(&central->grow_mtx, "hpa_central_grow",
	    WITNESS_RANK_HPA_CENTRAL_GROW, malloc_mutex_rank_exclusive);
	if (err) {
		return true;
	}
	err = malloc_mutex_init(&central->mtx, "hpa_central",
	    WITNESS_RANK_HPA_CENTRAL, malloc_mutex_rank_exclusive);
	if (err) {
		return true;
	}
	central->base = base;
	central->eden = NULL;
	central->eden_len = 0;
	central->age_counter = 0;
	central->hooks = *hooks;
	return false;
}

static hpdata_t *
hpa_alloc_ps(tsdn_t *tsdn, hpa_central_t *central) {
	return (hpdata_t *)base_alloc(tsdn, central->base, sizeof(hpdata_t),
	    CACHELINE);
}

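/*
 * Hands out the next hugepage-aligned, HUGEPAGE-sized run from the central
 * eden region, mapping a fresh HPA_EDEN_SIZE region first if eden is empty.
 * Returns an initialized pageslab on success; on failure, returns NULL and
 * sets *oom.
 */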
hpdata_t *
hpa_central_extract(tsdn_t *tsdn, hpa_central_t *central, size_t size,
    bool *oom) {
	/* Don't yet support big allocations; these should get filtered out. */
	assert(size <= HUGEPAGE);
	/*
	 * Should only try to extract from the central allocator if the local
	 * shard is exhausted. We should hold the grow_mtx on that shard.
	 */
	witness_assert_positive_depth_to_rank(
	    tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_HPA_SHARD_GROW);

	malloc_mutex_lock(tsdn, &central->grow_mtx);
	*oom = false;

	hpdata_t *ps = NULL;

	/* Is eden a perfect fit? */
	if (central->eden != NULL && central->eden_len == HUGEPAGE) {
		ps = hpa_alloc_ps(tsdn, central);
		if (ps == NULL) {
			*oom = true;
			malloc_mutex_unlock(tsdn, &central->grow_mtx);
			return NULL;
		}
		hpdata_init(ps, central->eden, central->age_counter++);
		central->eden = NULL;
		central->eden_len = 0;
		malloc_mutex_unlock(tsdn, &central->grow_mtx);
		return ps;
	}

	/*
	 * We're about to try to allocate from eden by splitting. If eden is
	 * NULL, we have to allocate it too. Otherwise, we just have to
	 * allocate an hpdata_t for the new pageslab.
	 */
	if (central->eden == NULL) {
		/*
		 * During development, we're primarily concerned with systems
		 * with overcommit. Eventually, we should be more careful here.
		 */
		bool commit = true;
		/* Allocate address space, bailing if we fail. */
		void *new_eden = pages_map(NULL, HPA_EDEN_SIZE, HUGEPAGE,
		    &commit);
		if (new_eden == NULL) {
			*oom = true;
			malloc_mutex_unlock(tsdn, &central->grow_mtx);
			return NULL;
		}
		ps = hpa_alloc_ps(tsdn, central);
		if (ps == NULL) {
			pages_unmap(new_eden, HPA_EDEN_SIZE);
			*oom = true;
			malloc_mutex_unlock(tsdn, &central->grow_mtx);
			return NULL;
		}
		central->eden = new_eden;
		central->eden_len = HPA_EDEN_SIZE;
	} else {
		/* Eden is already nonempty; only need an hpdata_t for ps. */
		ps = hpa_alloc_ps(tsdn, central);
		if (ps == NULL) {
			*oom = true;
			malloc_mutex_unlock(tsdn, &central->grow_mtx);
			return NULL;
		}
	}
	assert(ps != NULL);
	assert(central->eden != NULL);
	assert(central->eden_len > HUGEPAGE);
	assert(central->eden_len % HUGEPAGE == 0);
	assert(HUGEPAGE_ADDR2BASE(central->eden) == central->eden);

	hpdata_init(ps, central->eden, central->age_counter++);

	char *eden_char = (char *)central->eden;
	eden_char += HUGEPAGE;
	central->eden = (void *)eden_char;
	central->eden_len -= HUGEPAGE;

	malloc_mutex_unlock(tsdn, &central->grow_mtx);

	return ps;
}

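/*
 * Initializes one HPA shard, which serves page-aligned allocations no larger
 * than opts->slab_max_alloc out of pageslabs obtained from central. Returns
 * true on mutex initialization failure, false on success.
 */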
bool
hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap,
    base_t *base, edata_cache_t *edata_cache, unsigned ind,
    const hpa_shard_opts_t *opts) {
	/* malloc_conf processing should have filtered out these cases. */
	assert(hpa_supported());
	bool err;
	err = malloc_mutex_init(&shard->grow_mtx, "hpa_shard_grow",
	    WITNESS_RANK_HPA_SHARD_GROW, malloc_mutex_rank_exclusive);
	if (err) {
		return true;
	}
	err = malloc_mutex_init(&shard->mtx, "hpa_shard",
	    WITNESS_RANK_HPA_SHARD, malloc_mutex_rank_exclusive);
	if (err) {
		return true;
	}

	assert(edata_cache != NULL);
	shard->central = central;
	shard->base = base;
	edata_cache_fast_init(&shard->ecf, edata_cache);
	psset_init(&shard->psset);
	shard->age_counter = 0;
	shard->ind = ind;
	shard->emap = emap;

	shard->opts = *opts;

	shard->npending_purge = 0;
	nstime_init_zero(&shard->last_purge);

	shard->stats.npurge_passes = 0;
	shard->stats.npurges = 0;
	shard->stats.nhugifies = 0;
	shard->stats.ndehugifies = 0;

	/*
	 * Fill these in last, so that if an hpa_shard gets used despite
	 * initialization failing, we'll at least crash instead of just
	 * operating on corrupted data.
	 */
	shard->pai.alloc = &hpa_alloc;
	shard->pai.alloc_batch = &hpa_alloc_batch;
	shard->pai.expand = &hpa_expand;
	shard->pai.shrink = &hpa_shrink;
	shard->pai.dalloc = &hpa_dalloc;
	shard->pai.dalloc_batch = &hpa_dalloc_batch;
	shard->pai.time_until_deferred_work = &hpa_time_until_deferred_work;

	hpa_do_consistency_checks(shard);

	return false;
}

/*
 * Note that the stats functions here follow the usual stats naming
 * conventions; "merge" obtains the stats from some live instance, while
 * "accum" only combines the stats from one stats object into another. Hence
 * the lack of locking here.
 */
static void
hpa_shard_nonderived_stats_accum(hpa_shard_nonderived_stats_t *dst,
    hpa_shard_nonderived_stats_t *src) {
	dst->npurge_passes += src->npurge_passes;
	dst->npurges += src->npurges;
	dst->nhugifies += src->nhugifies;
	dst->ndehugifies += src->ndehugifies;
}

void
hpa_shard_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src) {
	psset_stats_accum(&dst->psset_stats, &src->psset_stats);
	hpa_shard_nonderived_stats_accum(&dst->nonderived_stats,
	    &src->nonderived_stats);
}

void
hpa_shard_stats_merge(tsdn_t *tsdn, hpa_shard_t *shard,
    hpa_shard_stats_t *dst) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_lock(tsdn, &shard->grow_mtx);
	malloc_mutex_lock(tsdn, &shard->mtx);
	psset_stats_accum(&dst->psset_stats, &shard->psset.stats);
	hpa_shard_nonderived_stats_accum(&dst->nonderived_stats, &shard->stats);
	malloc_mutex_unlock(tsdn, &shard->mtx);
	malloc_mutex_unlock(tsdn, &shard->grow_mtx);
}

static bool
hpa_good_hugification_candidate(hpa_shard_t *shard, hpdata_t *ps) {
	/*
	 * Note that this needs to be >= rather than just >, because of the
	 * important special case in which the hugification threshold is
	 * exactly HUGEPAGE.
	 */
	return hpdata_nactive_get(ps) * PAGE
	    >= shard->opts.hugification_threshold;
}

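/*
 * Dirty-page accounting: hpa_adjusted_ndirty is the shard's dirty page count
 * less the pages already queued for purging, and hpa_ndirty_max is the limit
 * implied by opts.dirty_mult (unlimited when dirty_mult is (fxp_t)-1). Both
 * require the shard mutex.
 */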
static size_t
hpa_adjusted_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	return psset_ndirty(&shard->psset) - shard->npending_purge;
}

static size_t
hpa_ndirty_max(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	if (shard->opts.dirty_mult == (fxp_t)-1) {
		return (size_t)-1;
	}
	return fxp_mul_frac(psset_nactive(&shard->psset),
	    shard->opts.dirty_mult);
}

static bool
hpa_hugify_blocked_by_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
	if (to_hugify == NULL) {
		return false;
	}
	return hpa_adjusted_ndirty(tsdn, shard)
	    + hpdata_nretained_get(to_hugify) > hpa_ndirty_max(tsdn, shard);
}

static bool
hpa_should_purge(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	if (hpa_adjusted_ndirty(tsdn, shard) > hpa_ndirty_max(tsdn, shard)) {
		return true;
	}
	if (hpa_hugify_blocked_by_ndirty(tsdn, shard)) {
		return true;
	}
	return false;
}

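/*
 * Recomputes whether ps may be purged or hugified after its contents have
 * changed (an allocation, deallocation, purge, or hugification). Must be
 * called with the shard mutex held.
 */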
static void
hpa_update_purge_hugify_eligibility(tsdn_t *tsdn, hpa_shard_t *shard,
    hpdata_t *ps) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	if (hpdata_changing_state_get(ps)) {
		hpdata_purge_allowed_set(ps, false);
		hpdata_disallow_hugify(ps);
		return;
	}
	/*
	 * Hugepages are distinctly costly to purge, so try to avoid it unless
	 * they're *particularly* full of dirty pages. Eventually, we should
	 * use a smarter / more dynamic heuristic for situations where we have
	 * to manually hugify.
	 *
	 * In situations where we don't manually hugify, this problem is
	 * reduced. The "bad" situation we're trying to avoid is one that's
	 * common in some Linux configurations (where both enabled and defrag
	 * are set to madvise) that can lead to long latency spikes on the
	 * first access after a hugification. The ideal policy in such
	 * configurations is probably time-based for both purging and
	 * hugifying; only hugify a hugepage if it's met the criteria for some
	 * extended period of time, and only dehugify it if it's failed to
	 * meet the criteria for an extended period of time. When background
	 * threads are on, we should try to take this hit on one of them, as
	 * well.
	 *
	 * I think the ideal setting is THP always enabled, and defrag set to
	 * deferred; in that case we don't need any explicit calls on the
	 * allocator's end at all; we just try to pack allocations in a
	 * hugepage-friendly manner and let the OS hugify in the background.
	 */
	hpdata_purge_allowed_set(ps, hpdata_ndirty_get(ps) > 0);
	if (hpa_good_hugification_candidate(shard, ps)
	    && !hpdata_huge_get(ps)) {
		nstime_t now;
		shard->central->hooks.curtime(&now, /* first_reading */ true);
		hpdata_allow_hugify(ps, now);
	}
	/*
	 * Once a hugepage has become eligible for hugification, we don't mark
	 * it as ineligible just because it stops meeting the criteria (this
	 * could lead to situations where a hugepage that spends most of its
	 * time meeting the criteria never quite gets hugified if there are
	 * intervening deallocations). The idea is that the hugification delay
	 * will allow it to get purged, resetting its "hugify-allowed" bit.
	 * If it doesn't get purged, then the hugification isn't hurting and
	 * might help. As an exception, we don't hugify hugepages that are now
	 * empty; it definitely doesn't help there until the hugepage gets
	 * reused, which is likely not for a while.
	 */
	if (hpdata_nactive_get(ps) == 0) {
		hpdata_disallow_hugify(ps);
	}
}

static bool
hpa_shard_has_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
	return to_hugify != NULL || hpa_should_purge(tsdn, shard);
}

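/*
 * Purging happens in three steps: mark the chosen pageslab mid-purge under
 * the shard mutex (blocking concurrent allocation, purging, and hugification
 * of it), drop the mutex to run the potentially slow dehugify/purge hooks,
 * then retake it to update stats and restore the pageslab's eligibility
 * flags.
 */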
/* Returns whether or not we purged anything. */
static bool
hpa_try_purge(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);

	hpdata_t *to_purge = psset_pick_purge(&shard->psset);
	if (to_purge == NULL) {
		return false;
	}
	assert(hpdata_purge_allowed_get(to_purge));
	assert(!hpdata_changing_state_get(to_purge));

	/*
	 * Don't let anyone else purge or hugify this page while
	 * we're purging it (allocations and deallocations are
	 * OK).
	 */
	psset_update_begin(&shard->psset, to_purge);
	assert(hpdata_alloc_allowed_get(to_purge));
	hpdata_mid_purge_set(to_purge, true);
	hpdata_purge_allowed_set(to_purge, false);
	hpdata_disallow_hugify(to_purge);
	/*
	 * Unlike with hugification (where concurrent
	 * allocations are allowed), concurrent allocation out
	 * of a hugepage being purged is unsafe; we might hand
	 * out an extent for an allocation and then purge it
	 * (clearing out user data).
	 */
	hpdata_alloc_allowed_set(to_purge, false);
	psset_update_end(&shard->psset, to_purge);

	/* Gather all the metadata we'll need during the purge. */
	bool dehugify = hpdata_huge_get(to_purge);
	hpdata_purge_state_t purge_state;
	size_t num_to_purge = hpdata_purge_begin(to_purge, &purge_state);

	shard->npending_purge += num_to_purge;

	malloc_mutex_unlock(tsdn, &shard->mtx);

	/* Actually do the purging, now that the lock is dropped. */
	if (dehugify) {
		shard->central->hooks.dehugify(hpdata_addr_get(to_purge),
		    HUGEPAGE);
	}
	size_t total_purged = 0;
	uint64_t purges_this_pass = 0;
	void *purge_addr;
	size_t purge_size;
	while (hpdata_purge_next(to_purge, &purge_state, &purge_addr,
	    &purge_size)) {
		total_purged += purge_size;
		assert(total_purged <= HUGEPAGE);
		purges_this_pass++;
		shard->central->hooks.purge(purge_addr, purge_size);
	}

	malloc_mutex_lock(tsdn, &shard->mtx);
	/* The shard updates. */
	shard->npending_purge -= num_to_purge;
	shard->stats.npurge_passes++;
	shard->stats.npurges += purges_this_pass;
	shard->central->hooks.curtime(&shard->last_purge,
	    /* first_reading */ false);
	if (dehugify) {
		shard->stats.ndehugifies++;
	}

	/* The hpdata updates. */
	psset_update_begin(&shard->psset, to_purge);
	if (dehugify) {
		hpdata_dehugify(to_purge);
	}
	hpdata_purge_end(to_purge, &purge_state);
	hpdata_mid_purge_set(to_purge, false);

	hpdata_alloc_allowed_set(to_purge, true);
	hpa_update_purge_hugify_eligibility(tsdn, shard, to_purge);

	psset_update_end(&shard->psset, to_purge);

	return true;
}

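/*
 * Hugification mirrors purging: flag the pageslab under the shard mutex,
 * drop the mutex for the (possibly slow) hugify hook -- concurrent
 * allocations out of the pageslab remain allowed -- then retake the mutex to
 * record stats and refresh eligibility.
 */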
/* Returns whether or not we hugified anything. */
static bool
hpa_try_hugify(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);

	if (hpa_hugify_blocked_by_ndirty(tsdn, shard)) {
		return false;
	}

	hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
	if (to_hugify == NULL) {
		return false;
	}
	assert(hpdata_hugify_allowed_get(to_hugify));
	assert(!hpdata_changing_state_get(to_hugify));

	/* Make sure that it's been hugifiable for long enough. */
	nstime_t time_hugify_allowed = hpdata_time_hugify_allowed(to_hugify);
	uint64_t millis = shard->central->hooks.ms_since(&time_hugify_allowed);
	if (millis < shard->opts.hugify_delay_ms) {
		return false;
	}

	/*
	 * Don't let anyone else purge or hugify this page while
	 * we're hugifying it (allocations and deallocations are
	 * OK).
	 */
	psset_update_begin(&shard->psset, to_hugify);
	hpdata_mid_hugify_set(to_hugify, true);
	hpdata_purge_allowed_set(to_hugify, false);
	hpdata_disallow_hugify(to_hugify);
	assert(hpdata_alloc_allowed_get(to_hugify));
	psset_update_end(&shard->psset, to_hugify);

	malloc_mutex_unlock(tsdn, &shard->mtx);

	shard->central->hooks.hugify(hpdata_addr_get(to_hugify), HUGEPAGE);

	malloc_mutex_lock(tsdn, &shard->mtx);
	shard->stats.nhugifies++;

	psset_update_begin(&shard->psset, to_hugify);
	hpdata_hugify(to_hugify);
	hpdata_mid_hugify_set(to_hugify, false);
	hpa_update_purge_hugify_eligibility(tsdn, shard, to_hugify);
	psset_update_end(&shard->psset, to_hugify);

	return true;
}

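/*
 * Deferred work consists of purging and hugifying. When deferral is allowed,
 * the non-forced callers below skip both and leave them to a later
 * hpa_shard_do_deferred_work() call (e.g. from a background thread).
 */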
/*
 * Execution of deferred work is forced if it's triggered by an explicit
 * hpa_shard_do_deferred_work() call.
 */
static void
hpa_shard_maybe_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard,
    bool forced) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	if (!forced && shard->opts.deferral_allowed) {
		return;
	}
	/*
	 * If we're on a background thread, do work so long as there's work to
	 * be done. Otherwise, bound latency to not be *too* bad by doing at
	 * most a small fixed number of operations.
	 */
	bool hugified = false;
	bool purged = false;
	size_t max_ops = (forced ? (size_t)-1 : 16);
	size_t nops = 0;
	do {
		/*
		 * Always purge before hugifying, to make sure we get some
		 * ability to hit our quiescence targets.
		 */
		purged = false;
		while (hpa_should_purge(tsdn, shard) && nops < max_ops) {
			purged = hpa_try_purge(tsdn, shard);
			if (purged) {
				nops++;
			}
		}
		hugified = hpa_try_hugify(tsdn, shard);
		if (hugified) {
			nops++;
		}
		malloc_mutex_assert_owner(tsdn, &shard->mtx);
	} while ((hugified || purged) && nops < max_ops);
}

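/*
 * Tries to serve one allocation out of the pageslabs the shard already owns,
 * without growing. Returns NULL if no pageslab can fit the request; *oom is
 * set only if metadata allocation or emap registration fails. Requires the
 * shard mutex.
 */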
static edata_t *
hpa_try_alloc_one_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
    bool *oom) {
	bool err;
	edata_t *edata = edata_cache_fast_get(tsdn, &shard->ecf);
	if (edata == NULL) {
		*oom = true;
		return NULL;
	}

	hpdata_t *ps = psset_pick_alloc(&shard->psset, size);
	if (ps == NULL) {
		edata_cache_fast_put(tsdn, &shard->ecf, edata);
		return NULL;
	}

	psset_update_begin(&shard->psset, ps);

	if (hpdata_empty(ps)) {
		/*
		 * If the pageslab used to be empty, treat it as though it's
		 * brand new for fragmentation-avoidance purposes; what we're
		 * trying to approximate is the age of the allocations *in*
		 * that pageslab, and the allocations in the new pageslab are
		 * definitionally the youngest in this hpa shard.
		 */
		hpdata_age_set(ps, shard->age_counter++);
	}

	void *addr = hpdata_reserve_alloc(ps, size);
	edata_init(edata, shard->ind, addr, size, /* slab */ false,
	    SC_NSIZES, /* sn */ hpdata_age_get(ps), extent_state_active,
	    /* zeroed */ false, /* committed */ true, EXTENT_PAI_HPA,
	    EXTENT_NOT_HEAD);
	edata_ps_set(edata, ps);

	/*
	 * This could theoretically be moved outside of the critical section,
	 * but that introduces the potential for a race. Without the lock, the
	 * (initially nonempty, since this is the reuse pathway) pageslab we
	 * allocated out of could become otherwise empty while the lock is
	 * dropped. This would force us to deal with a pageslab eviction down
	 * the error pathway, which is a pain.
	 */
	err = emap_register_boundary(tsdn, shard->emap, edata,
	    SC_NSIZES, /* slab */ false);
	if (err) {
		hpdata_unreserve(ps, edata_addr_get(edata),
		    edata_size_get(edata));
		/*
		 * We should arguably reset dirty state here, but this would
		 * require some sort of prepare + commit functionality that's
		 * a little much to deal with for now.
		 *
		 * We don't have a do_deferred_work down this pathway, on the
		 * principle that we didn't *really* affect shard state (we
		 * tweaked the stats, but our tweaks weren't really accurate).
		 */
		psset_update_end(&shard->psset, ps);
		edata_cache_fast_put(tsdn, &shard->ecf, edata);
		*oom = true;
		return NULL;
	}

	hpa_update_purge_hugify_eligibility(tsdn, shard, ps);
	psset_update_end(&shard->psset, ps);
	return edata;
}

static size_t
hpa_try_alloc_batch_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
    bool *oom, size_t nallocs, edata_list_active_t *results,
    bool *deferred_work_generated) {
	malloc_mutex_lock(tsdn, &shard->mtx);
	size_t nsuccess = 0;
	for (; nsuccess < nallocs; nsuccess++) {
		edata_t *edata = hpa_try_alloc_one_no_grow(tsdn, shard, size,
		    oom);
		if (edata == NULL) {
			break;
		}
		edata_list_active_append(results, edata);
	}

	hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ false);
	*deferred_work_generated = hpa_shard_has_deferred_work(tsdn, shard);
	malloc_mutex_unlock(tsdn, &shard->mtx);
	return nsuccess;
}

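/*
 * Batch allocation with growth: first try the pageslabs already in the
 * psset; if the request isn't fully satisfied (and we didn't hit OOM), pull
 * a fresh pageslab from the central allocator under grow_mtx and retry.
 */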
static size_t
hpa_alloc_batch_psset(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
    size_t nallocs, edata_list_active_t *results,
    bool *deferred_work_generated) {
	assert(size <= shard->opts.slab_max_alloc);
	bool oom = false;

	size_t nsuccess = hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
	    nallocs, results, deferred_work_generated);

	if (nsuccess == nallocs || oom) {
		return nsuccess;
	}

	/*
	 * We didn't OOM, but weren't able to fill everything requested of us;
	 * try to grow.
	 */
	malloc_mutex_lock(tsdn, &shard->grow_mtx);
	/*
	 * Check for grow races; maybe some earlier thread expanded the psset
	 * in between when we dropped the main mutex and grabbed the grow
	 * mutex.
	 */
	nsuccess += hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
	    nallocs - nsuccess, results, deferred_work_generated);
	if (nsuccess == nallocs || oom) {
		malloc_mutex_unlock(tsdn, &shard->grow_mtx);
		return nsuccess;
	}

	/*
	 * Note that we don't hold shard->mtx here (while growing);
	 * deallocations (and allocations of smaller sizes) may still succeed
	 * while we're doing this potentially expensive system call.
	 */
	hpdata_t *ps = hpa_central_extract(tsdn, shard->central, size, &oom);
	if (ps == NULL) {
		malloc_mutex_unlock(tsdn, &shard->grow_mtx);
		return nsuccess;
	}

	/*
	 * We got the pageslab; allocate from it. This does an unlock followed
	 * by a lock on the same mutex, and holds the grow mutex while doing
	 * deferred work, but this is an uncommon path; the simplicity is
	 * worth it.
	 */
	malloc_mutex_lock(tsdn, &shard->mtx);
	psset_insert(&shard->psset, ps);
	malloc_mutex_unlock(tsdn, &shard->mtx);

	nsuccess += hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
	    nallocs - nsuccess, results, deferred_work_generated);
	/*
	 * Drop grow_mtx before doing deferred work; other threads blocked on
	 * it should be allowed to proceed while we're working.
	 */
	malloc_mutex_unlock(tsdn, &shard->grow_mtx);

	return nsuccess;
}

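/*
 * Recovers the owning shard from a pai_t interface pointer; the cast relies
 * on the pai_t being embedded at the start of hpa_shard_t, and the asserts
 * sanity-check that the vtable really is ours.
 */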
static hpa_shard_t *
hpa_from_pai(pai_t *self) {
	assert(self->alloc == &hpa_alloc);
	assert(self->expand == &hpa_expand);
	assert(self->shrink == &hpa_shrink);
	assert(self->dalloc == &hpa_dalloc);
	return (hpa_shard_t *)self;
}

static size_t
hpa_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size, size_t nallocs,
    edata_list_active_t *results, bool *deferred_work_generated) {
	assert(nallocs > 0);
	assert((size & PAGE_MASK) == 0);
	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
	    WITNESS_RANK_CORE, 0);
	hpa_shard_t *shard = hpa_from_pai(self);

	if (size > shard->opts.slab_max_alloc) {
		return 0;
	}

	size_t nsuccess = hpa_alloc_batch_psset(tsdn, shard, size, nallocs,
	    results, deferred_work_generated);

	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
	    WITNESS_RANK_CORE, 0);

	/*
	 * Guard the sanity checks with config_debug because the loop cannot
	 * be proven non-circular by the compiler, even if everything within
	 * the loop is optimized away.
	 */
	if (config_debug) {
		edata_t *edata;
		ql_foreach(edata, &results->head, ql_link_active) {
			emap_assert_mapped(tsdn, shard->emap, edata);
			assert(edata_pai_get(edata) == EXTENT_PAI_HPA);
			assert(edata_state_get(edata) == extent_state_active);
			assert(edata_arena_ind_get(edata) == shard->ind);
			assert(edata_szind_get_maybe_invalid(edata) ==
			    SC_NSIZES);
			assert(!edata_slab_get(edata));
			assert(edata_committed_get(edata));
			assert(edata_base_get(edata) == edata_addr_get(edata));
			assert(edata_base_get(edata) != NULL);
		}
	}
	return nsuccess;
}

static edata_t *
hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero,
    bool guarded, bool frequent_reuse, bool *deferred_work_generated) {
	assert((size & PAGE_MASK) == 0);
	assert(!guarded);
	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
	    WITNESS_RANK_CORE, 0);

	/* We don't handle alignment or zeroing for now. */
	if (alignment > PAGE || zero) {
		return NULL;
	}
	/*
	 * An alloc with alignment == PAGE and zero == false is equivalent to
	 * a batch alloc of 1. Just do that, so we can share code.
	 */
	edata_list_active_t results;
	edata_list_active_init(&results);
	size_t nallocs = hpa_alloc_batch(tsdn, self, size, /* nallocs */ 1,
	    &results, deferred_work_generated);
	assert(nallocs == 0 || nallocs == 1);
	edata_t *edata = edata_list_active_first(&results);
	return edata;
}

static bool
hpa_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
    size_t new_size, bool zero, bool *deferred_work_generated) {
	/* Expand not yet supported. */
	return true;
}

static bool
hpa_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool *deferred_work_generated) {
	/* Shrink not yet supported. */
	return true;
}

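/*
 * Deallocation happens in two phases: an unlocked prepare step that
 * deregisters the extent from the emap, followed by a locked step that
 * returns the edata to the cache and unreserves its range in the owning
 * pageslab (possibly making that pageslab purgeable again).
 */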
static void
hpa_dalloc_prepare_unlocked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) {
	malloc_mutex_assert_not_owner(tsdn, &shard->mtx);

	assert(edata_pai_get(edata) == EXTENT_PAI_HPA);
	assert(edata_state_get(edata) == extent_state_active);
	assert(edata_arena_ind_get(edata) == shard->ind);
	assert(edata_szind_get_maybe_invalid(edata) == SC_NSIZES);
	assert(edata_committed_get(edata));
	assert(edata_base_get(edata) != NULL);

	/*
	 * Another thread shouldn't be trying to touch the metadata of an
	 * allocation being freed. The one exception is a merge attempt from a
	 * lower-addressed PAC extent; in this case we have a nominal race on
	 * the edata metadata bits, but in practice the fact that the PAI bits
	 * are different will prevent any further access. The race is bad, but
	 * benign in practice, and the long term plan is to track enough state
	 * in the rtree to prevent these merge attempts in the first place.
	 */
	edata_addr_set(edata, edata_base_get(edata));
	edata_zeroed_set(edata, false);
	emap_deregister_boundary(tsdn, shard->emap, edata);
}

static void
hpa_dalloc_locked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);

	/*
	 * Release the metadata early, to avoid having to remember to do it
	 * while we're also doing tricky purging logic. First, we need to grab
	 * a few bits of metadata from it.
	 *
	 * Note that the shard mutex protects ps's metadata too; it wouldn't
	 * be correct to try to read most information out of it without the
	 * lock.
	 */
	hpdata_t *ps = edata_ps_get(edata);
	/* Currently, all edatas come from pageslabs. */
	assert(ps != NULL);
	void *unreserve_addr = edata_addr_get(edata);
	size_t unreserve_size = edata_size_get(edata);
	edata_cache_fast_put(tsdn, &shard->ecf, edata);

	psset_update_begin(&shard->psset, ps);
	hpdata_unreserve(ps, unreserve_addr, unreserve_size);
	hpa_update_purge_hugify_eligibility(tsdn, shard, ps);
	psset_update_end(&shard->psset, ps);
}

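/*
 * pai_t dalloc_batch entry point: prepare every extent outside the lock,
 * then take the shard mutex once to return them all, doing maintenance work
 * afterwards if deferral is disallowed.
 */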
static void
hpa_dalloc_batch(tsdn_t *tsdn, pai_t *self, edata_list_active_t *list,
    bool *deferred_work_generated) {
	hpa_shard_t *shard = hpa_from_pai(self);

	edata_t *edata;
	ql_foreach(edata, &list->head, ql_link_active) {
		hpa_dalloc_prepare_unlocked(tsdn, shard, edata);
	}

	malloc_mutex_lock(tsdn, &shard->mtx);
	/* Now, remove from the list. */
	while ((edata = edata_list_active_first(list)) != NULL) {
		edata_list_active_remove(list, edata);
		hpa_dalloc_locked(tsdn, shard, edata);
	}
	hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ false);
	*deferred_work_generated =
	    hpa_shard_has_deferred_work(tsdn, shard);

	malloc_mutex_unlock(tsdn, &shard->mtx);
}

static void
hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    bool *deferred_work_generated) {
	assert(!edata_guarded_get(edata));
	/* Just a dalloc_batch of size 1; this lets us share logic. */
	edata_list_active_t dalloc_list;
	edata_list_active_init(&dalloc_list);
	edata_list_active_append(&dalloc_list, edata);
	hpa_dalloc_batch(tsdn, self, &dalloc_list, deferred_work_generated);
}

/*
 * Calculate time until either purging or hugification ought to happen.
 * Called by background threads.
 */
static uint64_t
hpa_time_until_deferred_work(tsdn_t *tsdn, pai_t *self) {
	hpa_shard_t *shard = hpa_from_pai(self);
	uint64_t time_ns = BACKGROUND_THREAD_DEFERRED_MAX;

	malloc_mutex_lock(tsdn, &shard->mtx);

	hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
	if (to_hugify != NULL) {
		nstime_t time_hugify_allowed =
		    hpdata_time_hugify_allowed(to_hugify);
		uint64_t since_hugify_allowed_ms =
		    shard->central->hooks.ms_since(&time_hugify_allowed);
		/*
		 * If not enough time has passed since hugification was
		 * allowed, sleep for the rest.
		 */
		if (since_hugify_allowed_ms < shard->opts.hugify_delay_ms) {
			time_ns = shard->opts.hugify_delay_ms -
			    since_hugify_allowed_ms;
			time_ns *= 1000 * 1000;
		} else {
			malloc_mutex_unlock(tsdn, &shard->mtx);
			return BACKGROUND_THREAD_DEFERRED_MIN;
		}
	}

	if (hpa_should_purge(tsdn, shard)) {
		/*
		 * If we haven't purged before, no need to check interval
		 * between purges. Simply purge as soon as possible.
		 */
		if (shard->stats.npurge_passes == 0) {
			malloc_mutex_unlock(tsdn, &shard->mtx);
			return BACKGROUND_THREAD_DEFERRED_MIN;
		}
		uint64_t since_last_purge_ms = shard->central->hooks.ms_since(
		    &shard->last_purge);

		if (since_last_purge_ms < shard->opts.min_purge_interval_ms) {
			uint64_t until_purge_ns;
			until_purge_ns = shard->opts.min_purge_interval_ms -
			    since_last_purge_ms;
			until_purge_ns *= 1000 * 1000;

			if (until_purge_ns < time_ns) {
				time_ns = until_purge_ns;
			}
		} else {
			time_ns = BACKGROUND_THREAD_DEFERRED_MIN;
		}
	}
	malloc_mutex_unlock(tsdn, &shard->mtx);
	return time_ns;
}

void
hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_lock(tsdn, &shard->mtx);
	edata_cache_fast_disable(tsdn, &shard->ecf);
	malloc_mutex_unlock(tsdn, &shard->mtx);
}

static void
hpa_shard_assert_stats_empty(psset_bin_stats_t *bin_stats) {
	assert(bin_stats->npageslabs == 0);
	assert(bin_stats->nactive == 0);
}

static void
hpa_assert_empty(tsdn_t *tsdn, hpa_shard_t *shard, psset_t *psset) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	for (int huge = 0; huge <= 1; huge++) {
		hpa_shard_assert_stats_empty(&psset->stats.full_slabs[huge]);
		for (pszind_t i = 0; i < PSSET_NPSIZES; i++) {
			hpa_shard_assert_stats_empty(
			    &psset->stats.nonfull_slabs[i][huge]);
		}
	}
}

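/*
 * Tears down a shard by unmapping every (now empty) pageslab it still owns.
 * Only valid once all extents allocated from the shard have been dalloc'd.
 */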
void
hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);
	/*
	 * By the time we're here, the arena code should have dalloc'd all the
	 * active extents, which means we should have eventually evicted
	 * everything from the psset, so it shouldn't be able to serve even a
	 * 1-page allocation.
	 */
	if (config_debug) {
		malloc_mutex_lock(tsdn, &shard->mtx);
		hpa_assert_empty(tsdn, shard, &shard->psset);
		malloc_mutex_unlock(tsdn, &shard->mtx);
	}
	hpdata_t *ps;
	while ((ps = psset_pick_alloc(&shard->psset, PAGE)) != NULL) {
		/* There should be no allocations anywhere. */
		assert(hpdata_empty(ps));
		psset_remove(&shard->psset, ps);
		shard->central->hooks.unmap(hpdata_addr_get(ps), HUGEPAGE);
	}
}

void
hpa_shard_set_deferral_allowed(tsdn_t *tsdn, hpa_shard_t *shard,
    bool deferral_allowed) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_lock(tsdn, &shard->mtx);
	bool deferral_previously_allowed = shard->opts.deferral_allowed;
	shard->opts.deferral_allowed = deferral_allowed;
	if (deferral_previously_allowed && !deferral_allowed) {
		hpa_shard_maybe_do_deferred_work(tsdn, shard,
		    /* forced */ true);
	}
	malloc_mutex_unlock(tsdn, &shard->mtx);
}

void
hpa_shard_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_lock(tsdn, &shard->mtx);
	hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ true);
	malloc_mutex_unlock(tsdn, &shard->mtx);
}

void
hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_prefork(tsdn, &shard->grow_mtx);
}

void
hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_prefork(tsdn, &shard->mtx);
}

void
hpa_shard_postfork_parent(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_postfork_parent(tsdn, &shard->grow_mtx);
	malloc_mutex_postfork_parent(tsdn, &shard->mtx);
}

void
hpa_shard_postfork_child(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_postfork_child(tsdn, &shard->grow_mtx);
	malloc_mutex_postfork_child(tsdn, &shard->mtx);
}