1*c43cad87SWarner Losh #include "jemalloc/internal/jemalloc_preamble.h" 2*c43cad87SWarner Losh #include "jemalloc/internal/jemalloc_internal_includes.h" 3*c43cad87SWarner Losh 4*c43cad87SWarner Losh #include "jemalloc/internal/hpdata.h" 5*c43cad87SWarner Losh 6*c43cad87SWarner Losh static int 7*c43cad87SWarner Losh hpdata_age_comp(const hpdata_t *a, const hpdata_t *b) { 8*c43cad87SWarner Losh uint64_t a_age = hpdata_age_get(a); 9*c43cad87SWarner Losh uint64_t b_age = hpdata_age_get(b); 10*c43cad87SWarner Losh /* 11*c43cad87SWarner Losh * hpdata ages are operation counts in the psset; no two should be the 12*c43cad87SWarner Losh * same. 13*c43cad87SWarner Losh */ 14*c43cad87SWarner Losh assert(a_age != b_age); 15*c43cad87SWarner Losh return (a_age > b_age) - (a_age < b_age); 16*c43cad87SWarner Losh } 17*c43cad87SWarner Losh 18*c43cad87SWarner Losh ph_gen(, hpdata_age_heap, hpdata_t, age_link, hpdata_age_comp) 19*c43cad87SWarner Losh 20*c43cad87SWarner Losh void 21*c43cad87SWarner Losh hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age) { 22*c43cad87SWarner Losh hpdata_addr_set(hpdata, addr); 23*c43cad87SWarner Losh hpdata_age_set(hpdata, age); 24*c43cad87SWarner Losh hpdata->h_huge = false; 25*c43cad87SWarner Losh hpdata->h_alloc_allowed = true; 26*c43cad87SWarner Losh hpdata->h_in_psset_alloc_container = false; 27*c43cad87SWarner Losh hpdata->h_purge_allowed = false; 28*c43cad87SWarner Losh hpdata->h_hugify_allowed = false; 29*c43cad87SWarner Losh hpdata->h_in_psset_hugify_container = false; 30*c43cad87SWarner Losh hpdata->h_mid_purge = false; 31*c43cad87SWarner Losh hpdata->h_mid_hugify = false; 32*c43cad87SWarner Losh hpdata->h_updating = false; 33*c43cad87SWarner Losh hpdata->h_in_psset = false; 34*c43cad87SWarner Losh hpdata_longest_free_range_set(hpdata, HUGEPAGE_PAGES); 35*c43cad87SWarner Losh hpdata->h_nactive = 0; 36*c43cad87SWarner Losh fb_init(hpdata->active_pages, HUGEPAGE_PAGES); 37*c43cad87SWarner Losh hpdata->h_ntouched = 0; 38*c43cad87SWarner Losh fb_init(hpdata->touched_pages, HUGEPAGE_PAGES); 39*c43cad87SWarner Losh 40*c43cad87SWarner Losh hpdata_assert_consistent(hpdata); 41*c43cad87SWarner Losh } 42*c43cad87SWarner Losh 43*c43cad87SWarner Losh void * 44*c43cad87SWarner Losh hpdata_reserve_alloc(hpdata_t *hpdata, size_t sz) { 45*c43cad87SWarner Losh hpdata_assert_consistent(hpdata); 46*c43cad87SWarner Losh /* 47*c43cad87SWarner Losh * This is a metadata change; the hpdata should therefore either not be 48*c43cad87SWarner Losh * in the psset, or should have explicitly marked itself as being 49*c43cad87SWarner Losh * mid-update. 50*c43cad87SWarner Losh */ 51*c43cad87SWarner Losh assert(!hpdata->h_in_psset || hpdata->h_updating); 52*c43cad87SWarner Losh assert(hpdata->h_alloc_allowed); 53*c43cad87SWarner Losh assert((sz & PAGE_MASK) == 0); 54*c43cad87SWarner Losh size_t npages = sz >> LG_PAGE; 55*c43cad87SWarner Losh assert(npages <= hpdata_longest_free_range_get(hpdata)); 56*c43cad87SWarner Losh 57*c43cad87SWarner Losh size_t result; 58*c43cad87SWarner Losh 59*c43cad87SWarner Losh size_t start = 0; 60*c43cad87SWarner Losh /* 61*c43cad87SWarner Losh * These are dead stores, but the compiler will issue warnings on them 62*c43cad87SWarner Losh * since it can't tell statically that found is always true below. 63*c43cad87SWarner Losh */ 64*c43cad87SWarner Losh size_t begin = 0; 65*c43cad87SWarner Losh size_t len = 0; 66*c43cad87SWarner Losh 67*c43cad87SWarner Losh size_t largest_unchosen_range = 0; 68*c43cad87SWarner Losh while (true) { 69*c43cad87SWarner Losh bool found = fb_urange_iter(hpdata->active_pages, 70*c43cad87SWarner Losh HUGEPAGE_PAGES, start, &begin, &len); 71*c43cad87SWarner Losh /* 72*c43cad87SWarner Losh * A precondition to this function is that hpdata must be able 73*c43cad87SWarner Losh * to serve the allocation. 74*c43cad87SWarner Losh */ 75*c43cad87SWarner Losh assert(found); 76*c43cad87SWarner Losh assert(len <= hpdata_longest_free_range_get(hpdata)); 77*c43cad87SWarner Losh if (len >= npages) { 78*c43cad87SWarner Losh /* 79*c43cad87SWarner Losh * We use first-fit within the page slabs; this gives 80*c43cad87SWarner Losh * bounded worst-case fragmentation within a slab. It's 81*c43cad87SWarner Losh * not necessarily right; we could experiment with 82*c43cad87SWarner Losh * various other options. 83*c43cad87SWarner Losh */ 84*c43cad87SWarner Losh break; 85*c43cad87SWarner Losh } 86*c43cad87SWarner Losh if (len > largest_unchosen_range) { 87*c43cad87SWarner Losh largest_unchosen_range = len; 88*c43cad87SWarner Losh } 89*c43cad87SWarner Losh start = begin + len; 90*c43cad87SWarner Losh } 91*c43cad87SWarner Losh /* We found a range; remember it. */ 92*c43cad87SWarner Losh result = begin; 93*c43cad87SWarner Losh fb_set_range(hpdata->active_pages, HUGEPAGE_PAGES, begin, npages); 94*c43cad87SWarner Losh hpdata->h_nactive += npages; 95*c43cad87SWarner Losh 96*c43cad87SWarner Losh /* 97*c43cad87SWarner Losh * We might be about to dirty some memory for the first time; update our 98*c43cad87SWarner Losh * count if so. 99*c43cad87SWarner Losh */ 100*c43cad87SWarner Losh size_t new_dirty = fb_ucount(hpdata->touched_pages, HUGEPAGE_PAGES, 101*c43cad87SWarner Losh result, npages); 102*c43cad87SWarner Losh fb_set_range(hpdata->touched_pages, HUGEPAGE_PAGES, result, npages); 103*c43cad87SWarner Losh hpdata->h_ntouched += new_dirty; 104*c43cad87SWarner Losh 105*c43cad87SWarner Losh /* 106*c43cad87SWarner Losh * If we allocated out of a range that was the longest in the hpdata, it 107*c43cad87SWarner Losh * might be the only one of that size and we'll have to adjust the 108*c43cad87SWarner Losh * metadata. 109*c43cad87SWarner Losh */ 110*c43cad87SWarner Losh if (len == hpdata_longest_free_range_get(hpdata)) { 111*c43cad87SWarner Losh start = begin + npages; 112*c43cad87SWarner Losh while (start < HUGEPAGE_PAGES) { 113*c43cad87SWarner Losh bool found = fb_urange_iter(hpdata->active_pages, 114*c43cad87SWarner Losh HUGEPAGE_PAGES, start, &begin, &len); 115*c43cad87SWarner Losh if (!found) { 116*c43cad87SWarner Losh break; 117*c43cad87SWarner Losh } 118*c43cad87SWarner Losh assert(len <= hpdata_longest_free_range_get(hpdata)); 119*c43cad87SWarner Losh if (len == hpdata_longest_free_range_get(hpdata)) { 120*c43cad87SWarner Losh largest_unchosen_range = len; 121*c43cad87SWarner Losh break; 122*c43cad87SWarner Losh } 123*c43cad87SWarner Losh if (len > largest_unchosen_range) { 124*c43cad87SWarner Losh largest_unchosen_range = len; 125*c43cad87SWarner Losh } 126*c43cad87SWarner Losh start = begin + len; 127*c43cad87SWarner Losh } 128*c43cad87SWarner Losh hpdata_longest_free_range_set(hpdata, largest_unchosen_range); 129*c43cad87SWarner Losh } 130*c43cad87SWarner Losh 131*c43cad87SWarner Losh hpdata_assert_consistent(hpdata); 132*c43cad87SWarner Losh return (void *)( 133*c43cad87SWarner Losh (uintptr_t)hpdata_addr_get(hpdata) + (result << LG_PAGE)); 134*c43cad87SWarner Losh } 135*c43cad87SWarner Losh 136*c43cad87SWarner Losh void 137*c43cad87SWarner Losh hpdata_unreserve(hpdata_t *hpdata, void *addr, size_t sz) { 138*c43cad87SWarner Losh hpdata_assert_consistent(hpdata); 139*c43cad87SWarner Losh /* See the comment in reserve. */ 140*c43cad87SWarner Losh assert(!hpdata->h_in_psset || hpdata->h_updating); 141*c43cad87SWarner Losh assert(((uintptr_t)addr & PAGE_MASK) == 0); 142*c43cad87SWarner Losh assert((sz & PAGE_MASK) == 0); 143*c43cad87SWarner Losh size_t begin = ((uintptr_t)addr - (uintptr_t)hpdata_addr_get(hpdata)) 144*c43cad87SWarner Losh >> LG_PAGE; 145*c43cad87SWarner Losh assert(begin < HUGEPAGE_PAGES); 146*c43cad87SWarner Losh size_t npages = sz >> LG_PAGE; 147*c43cad87SWarner Losh size_t old_longest_range = hpdata_longest_free_range_get(hpdata); 148*c43cad87SWarner Losh 149*c43cad87SWarner Losh fb_unset_range(hpdata->active_pages, HUGEPAGE_PAGES, begin, npages); 150*c43cad87SWarner Losh /* We might have just created a new, larger range. */ 151*c43cad87SWarner Losh size_t new_begin = (fb_fls(hpdata->active_pages, HUGEPAGE_PAGES, 152*c43cad87SWarner Losh begin) + 1); 153*c43cad87SWarner Losh size_t new_end = fb_ffs(hpdata->active_pages, HUGEPAGE_PAGES, 154*c43cad87SWarner Losh begin + npages - 1); 155*c43cad87SWarner Losh size_t new_range_len = new_end - new_begin; 156*c43cad87SWarner Losh 157*c43cad87SWarner Losh if (new_range_len > old_longest_range) { 158*c43cad87SWarner Losh hpdata_longest_free_range_set(hpdata, new_range_len); 159*c43cad87SWarner Losh } 160*c43cad87SWarner Losh 161*c43cad87SWarner Losh hpdata->h_nactive -= npages; 162*c43cad87SWarner Losh 163*c43cad87SWarner Losh hpdata_assert_consistent(hpdata); 164*c43cad87SWarner Losh } 165*c43cad87SWarner Losh 166*c43cad87SWarner Losh size_t 167*c43cad87SWarner Losh hpdata_purge_begin(hpdata_t *hpdata, hpdata_purge_state_t *purge_state) { 168*c43cad87SWarner Losh hpdata_assert_consistent(hpdata); 169*c43cad87SWarner Losh /* 170*c43cad87SWarner Losh * See the comment below; we might purge any inactive extent, so it's 171*c43cad87SWarner Losh * unsafe for any other thread to turn any inactive extent active while 172*c43cad87SWarner Losh * we're operating on it. 173*c43cad87SWarner Losh */ 174*c43cad87SWarner Losh assert(!hpdata_alloc_allowed_get(hpdata)); 175*c43cad87SWarner Losh 176*c43cad87SWarner Losh purge_state->npurged = 0; 177*c43cad87SWarner Losh purge_state->next_purge_search_begin = 0; 178*c43cad87SWarner Losh 179*c43cad87SWarner Losh /* 180*c43cad87SWarner Losh * Initialize to_purge. 181*c43cad87SWarner Losh * 182*c43cad87SWarner Losh * It's possible to end up in situations where two dirty extents are 183*c43cad87SWarner Losh * separated by a retained extent: 184*c43cad87SWarner Losh * - 1 page allocated. 185*c43cad87SWarner Losh * - 1 page allocated. 186*c43cad87SWarner Losh * - 1 pages allocated. 187*c43cad87SWarner Losh * 188*c43cad87SWarner Losh * If the middle page is freed and purged, and then the first and third 189*c43cad87SWarner Losh * pages are freed, and then another purge pass happens, the hpdata 190*c43cad87SWarner Losh * looks like this: 191*c43cad87SWarner Losh * - 1 page dirty. 192*c43cad87SWarner Losh * - 1 page retained. 193*c43cad87SWarner Losh * - 1 page dirty. 194*c43cad87SWarner Losh * 195*c43cad87SWarner Losh * But it's safe to do a single 3-page purge. 196*c43cad87SWarner Losh * 197*c43cad87SWarner Losh * We do this by first computing the dirty pages, and then filling in 198*c43cad87SWarner Losh * any gaps by extending each range in the dirty bitmap to extend until 199*c43cad87SWarner Losh * the next active page. This purges more pages, but the expensive part 200*c43cad87SWarner Losh * of purging is the TLB shootdowns, rather than the kernel state 201*c43cad87SWarner Losh * tracking; doing a little bit more of the latter is fine if it saves 202*c43cad87SWarner Losh * us from doing some of the former. 203*c43cad87SWarner Losh */ 204*c43cad87SWarner Losh 205*c43cad87SWarner Losh /* 206*c43cad87SWarner Losh * The dirty pages are those that are touched but not active. Note that 207*c43cad87SWarner Losh * in a normal-ish case, HUGEPAGE_PAGES is something like 512 and the 208*c43cad87SWarner Losh * fb_group_t is 64 bits, so this is 64 bytes, spread across 8 209*c43cad87SWarner Losh * fb_group_ts. 210*c43cad87SWarner Losh */ 211*c43cad87SWarner Losh fb_group_t dirty_pages[FB_NGROUPS(HUGEPAGE_PAGES)]; 212*c43cad87SWarner Losh fb_init(dirty_pages, HUGEPAGE_PAGES); 213*c43cad87SWarner Losh fb_bit_not(dirty_pages, hpdata->active_pages, HUGEPAGE_PAGES); 214*c43cad87SWarner Losh fb_bit_and(dirty_pages, dirty_pages, hpdata->touched_pages, 215*c43cad87SWarner Losh HUGEPAGE_PAGES); 216*c43cad87SWarner Losh 217*c43cad87SWarner Losh fb_init(purge_state->to_purge, HUGEPAGE_PAGES); 218*c43cad87SWarner Losh size_t next_bit = 0; 219*c43cad87SWarner Losh while (next_bit < HUGEPAGE_PAGES) { 220*c43cad87SWarner Losh size_t next_dirty = fb_ffs(dirty_pages, HUGEPAGE_PAGES, 221*c43cad87SWarner Losh next_bit); 222*c43cad87SWarner Losh /* Recall that fb_ffs returns nbits if no set bit is found. */ 223*c43cad87SWarner Losh if (next_dirty == HUGEPAGE_PAGES) { 224*c43cad87SWarner Losh break; 225*c43cad87SWarner Losh } 226*c43cad87SWarner Losh size_t next_active = fb_ffs(hpdata->active_pages, 227*c43cad87SWarner Losh HUGEPAGE_PAGES, next_dirty); 228*c43cad87SWarner Losh /* 229*c43cad87SWarner Losh * Don't purge past the end of the dirty extent, into retained 230*c43cad87SWarner Losh * pages. This helps the kernel a tiny bit, but honestly it's 231*c43cad87SWarner Losh * mostly helpful for testing (where we tend to write test cases 232*c43cad87SWarner Losh * that think in terms of the dirty ranges). 233*c43cad87SWarner Losh */ 234*c43cad87SWarner Losh ssize_t last_dirty = fb_fls(dirty_pages, HUGEPAGE_PAGES, 235*c43cad87SWarner Losh next_active - 1); 236*c43cad87SWarner Losh assert(last_dirty >= 0); 237*c43cad87SWarner Losh assert((size_t)last_dirty >= next_dirty); 238*c43cad87SWarner Losh assert((size_t)last_dirty - next_dirty + 1 <= HUGEPAGE_PAGES); 239*c43cad87SWarner Losh 240*c43cad87SWarner Losh fb_set_range(purge_state->to_purge, HUGEPAGE_PAGES, next_dirty, 241*c43cad87SWarner Losh last_dirty - next_dirty + 1); 242*c43cad87SWarner Losh next_bit = next_active + 1; 243*c43cad87SWarner Losh } 244*c43cad87SWarner Losh 245*c43cad87SWarner Losh /* We should purge, at least, everything dirty. */ 246*c43cad87SWarner Losh size_t ndirty = hpdata->h_ntouched - hpdata->h_nactive; 247*c43cad87SWarner Losh purge_state->ndirty_to_purge = ndirty; 248*c43cad87SWarner Losh assert(ndirty <= fb_scount( 249*c43cad87SWarner Losh purge_state->to_purge, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES)); 250*c43cad87SWarner Losh assert(ndirty == fb_scount(dirty_pages, HUGEPAGE_PAGES, 0, 251*c43cad87SWarner Losh HUGEPAGE_PAGES)); 252*c43cad87SWarner Losh 253*c43cad87SWarner Losh hpdata_assert_consistent(hpdata); 254*c43cad87SWarner Losh 255*c43cad87SWarner Losh return ndirty; 256*c43cad87SWarner Losh } 257*c43cad87SWarner Losh 258*c43cad87SWarner Losh bool 259*c43cad87SWarner Losh hpdata_purge_next(hpdata_t *hpdata, hpdata_purge_state_t *purge_state, 260*c43cad87SWarner Losh void **r_purge_addr, size_t *r_purge_size) { 261*c43cad87SWarner Losh /* 262*c43cad87SWarner Losh * Note that we don't have a consistency check here; we're accessing 263*c43cad87SWarner Losh * hpdata without synchronization, and therefore have no right to expect 264*c43cad87SWarner Losh * a consistent state. 265*c43cad87SWarner Losh */ 266*c43cad87SWarner Losh assert(!hpdata_alloc_allowed_get(hpdata)); 267*c43cad87SWarner Losh 268*c43cad87SWarner Losh if (purge_state->next_purge_search_begin == HUGEPAGE_PAGES) { 269*c43cad87SWarner Losh return false; 270*c43cad87SWarner Losh } 271*c43cad87SWarner Losh size_t purge_begin; 272*c43cad87SWarner Losh size_t purge_len; 273*c43cad87SWarner Losh bool found_range = fb_srange_iter(purge_state->to_purge, HUGEPAGE_PAGES, 274*c43cad87SWarner Losh purge_state->next_purge_search_begin, &purge_begin, &purge_len); 275*c43cad87SWarner Losh if (!found_range) { 276*c43cad87SWarner Losh return false; 277*c43cad87SWarner Losh } 278*c43cad87SWarner Losh 279*c43cad87SWarner Losh *r_purge_addr = (void *)( 280*c43cad87SWarner Losh (uintptr_t)hpdata_addr_get(hpdata) + purge_begin * PAGE); 281*c43cad87SWarner Losh *r_purge_size = purge_len * PAGE; 282*c43cad87SWarner Losh 283*c43cad87SWarner Losh purge_state->next_purge_search_begin = purge_begin + purge_len; 284*c43cad87SWarner Losh purge_state->npurged += purge_len; 285*c43cad87SWarner Losh assert(purge_state->npurged <= HUGEPAGE_PAGES); 286*c43cad87SWarner Losh 287*c43cad87SWarner Losh return true; 288*c43cad87SWarner Losh } 289*c43cad87SWarner Losh 290*c43cad87SWarner Losh void 291*c43cad87SWarner Losh hpdata_purge_end(hpdata_t *hpdata, hpdata_purge_state_t *purge_state) { 292*c43cad87SWarner Losh assert(!hpdata_alloc_allowed_get(hpdata)); 293*c43cad87SWarner Losh hpdata_assert_consistent(hpdata); 294*c43cad87SWarner Losh /* See the comment in reserve. */ 295*c43cad87SWarner Losh assert(!hpdata->h_in_psset || hpdata->h_updating); 296*c43cad87SWarner Losh 297*c43cad87SWarner Losh assert(purge_state->npurged == fb_scount(purge_state->to_purge, 298*c43cad87SWarner Losh HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES)); 299*c43cad87SWarner Losh assert(purge_state->npurged >= purge_state->ndirty_to_purge); 300*c43cad87SWarner Losh 301*c43cad87SWarner Losh fb_bit_not(purge_state->to_purge, purge_state->to_purge, 302*c43cad87SWarner Losh HUGEPAGE_PAGES); 303*c43cad87SWarner Losh fb_bit_and(hpdata->touched_pages, hpdata->touched_pages, 304*c43cad87SWarner Losh purge_state->to_purge, HUGEPAGE_PAGES); 305*c43cad87SWarner Losh assert(hpdata->h_ntouched >= purge_state->ndirty_to_purge); 306*c43cad87SWarner Losh hpdata->h_ntouched -= purge_state->ndirty_to_purge; 307*c43cad87SWarner Losh 308*c43cad87SWarner Losh hpdata_assert_consistent(hpdata); 309*c43cad87SWarner Losh } 310*c43cad87SWarner Losh 311*c43cad87SWarner Losh void 312*c43cad87SWarner Losh hpdata_hugify(hpdata_t *hpdata) { 313*c43cad87SWarner Losh hpdata_assert_consistent(hpdata); 314*c43cad87SWarner Losh hpdata->h_huge = true; 315*c43cad87SWarner Losh fb_set_range(hpdata->touched_pages, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES); 316*c43cad87SWarner Losh hpdata->h_ntouched = HUGEPAGE_PAGES; 317*c43cad87SWarner Losh hpdata_assert_consistent(hpdata); 318*c43cad87SWarner Losh } 319*c43cad87SWarner Losh 320*c43cad87SWarner Losh void 321*c43cad87SWarner Losh hpdata_dehugify(hpdata_t *hpdata) { 322*c43cad87SWarner Losh hpdata_assert_consistent(hpdata); 323*c43cad87SWarner Losh hpdata->h_huge = false; 324*c43cad87SWarner Losh hpdata_assert_consistent(hpdata); 325*c43cad87SWarner Losh } 326