1 #include "jemalloc/internal/jemalloc_preamble.h" 2 #include "jemalloc/internal/jemalloc_internal_includes.h" 3 4 #include "jemalloc/internal/sec.h" 5 6 static edata_t *sec_alloc(tsdn_t *tsdn, pai_t *self, size_t size, 7 size_t alignment, bool zero, bool guarded, bool frequent_reuse, 8 bool *deferred_work_generated); 9 static bool sec_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, 10 size_t old_size, size_t new_size, bool zero, bool *deferred_work_generated); 11 static bool sec_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata, 12 size_t old_size, size_t new_size, bool *deferred_work_generated); 13 static void sec_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata, 14 bool *deferred_work_generated); 15 16 static void 17 sec_bin_init(sec_bin_t *bin) { 18 bin->being_batch_filled = false; 19 bin->bytes_cur = 0; 20 edata_list_active_init(&bin->freelist); 21 } 22 23 bool 24 sec_init(tsdn_t *tsdn, sec_t *sec, base_t *base, pai_t *fallback, 25 const sec_opts_t *opts) { 26 assert(opts->max_alloc >= PAGE); 27 28 size_t max_alloc = PAGE_FLOOR(opts->max_alloc); 29 pszind_t npsizes = sz_psz2ind(max_alloc) + 1; 30 31 size_t sz_shards = opts->nshards * sizeof(sec_shard_t); 32 size_t sz_bins = opts->nshards * (size_t)npsizes * sizeof(sec_bin_t); 33 size_t sz_alloc = sz_shards + sz_bins; 34 void *dynalloc = base_alloc(tsdn, base, sz_alloc, CACHELINE); 35 if (dynalloc == NULL) { 36 return true; 37 } 38 sec_shard_t *shard_cur = (sec_shard_t *)dynalloc; 39 sec->shards = shard_cur; 40 sec_bin_t *bin_cur = (sec_bin_t *)&shard_cur[opts->nshards]; 41 /* Just for asserts, below. */ 42 sec_bin_t *bin_start = bin_cur; 43 44 for (size_t i = 0; i < opts->nshards; i++) { 45 sec_shard_t *shard = shard_cur; 46 shard_cur++; 47 bool err = malloc_mutex_init(&shard->mtx, "sec_shard", 48 WITNESS_RANK_SEC_SHARD, malloc_mutex_rank_exclusive); 49 if (err) { 50 return true; 51 } 52 shard->enabled = true; 53 shard->bins = bin_cur; 54 for (pszind_t j = 0; j < npsizes; j++) { 55 sec_bin_init(&shard->bins[j]); 56 bin_cur++; 57 } 58 shard->bytes_cur = 0; 59 shard->to_flush_next = 0; 60 } 61 /* 62 * Should have exactly matched the bin_start to the first unused byte 63 * after the shards. 64 */ 65 assert((void *)shard_cur == (void *)bin_start); 66 /* And the last bin to use up the last bytes of the allocation. */ 67 assert((char *)bin_cur == ((char *)dynalloc + sz_alloc)); 68 sec->fallback = fallback; 69 70 71 sec->opts = *opts; 72 sec->npsizes = npsizes; 73 74 /* 75 * Initialize these last so that an improper use of an SEC whose 76 * initialization failed will segfault in an easy-to-spot way. 77 */ 78 sec->pai.alloc = &sec_alloc; 79 sec->pai.alloc_batch = &pai_alloc_batch_default; 80 sec->pai.expand = &sec_expand; 81 sec->pai.shrink = &sec_shrink; 82 sec->pai.dalloc = &sec_dalloc; 83 sec->pai.dalloc_batch = &pai_dalloc_batch_default; 84 85 return false; 86 } 87 88 static sec_shard_t * 89 sec_shard_pick(tsdn_t *tsdn, sec_t *sec) { 90 /* 91 * Eventually, we should implement affinity, tracking source shard using 92 * the edata_t's newly freed up fields. For now, just randomly 93 * distribute across all shards. 94 */ 95 if (tsdn_null(tsdn)) { 96 return &sec->shards[0]; 97 } 98 tsd_t *tsd = tsdn_tsd(tsdn); 99 uint8_t *idxp = tsd_sec_shardp_get(tsd); 100 if (*idxp == (uint8_t)-1) { 101 /* 102 * First use; initialize using the trick from Daniel Lemire's 103 * "A fast alternative to the modulo reduction. Use a 64 bit 104 * number to store 32 bits, since we'll deliberately overflow 105 * when we multiply by the number of shards. 
static sec_shard_t *
sec_shard_pick(tsdn_t *tsdn, sec_t *sec) {
	/*
	 * Eventually, we should implement affinity, tracking source shard
	 * using the edata_t's newly freed up fields.  For now, just randomly
	 * distribute across all shards.
	 */
	if (tsdn_null(tsdn)) {
		return &sec->shards[0];
	}
	tsd_t *tsd = tsdn_tsd(tsdn);
	uint8_t *idxp = tsd_sec_shardp_get(tsd);
	if (*idxp == (uint8_t)-1) {
		/*
		 * First use; initialize using the trick from Daniel Lemire's
		 * "A fast alternative to the modulo reduction".  Use a 64-bit
		 * number to store 32 bits, since we'll deliberately overflow
		 * when we multiply by the number of shards.
		 */
		uint64_t rand32 = prng_lg_range_u64(tsd_prng_statep_get(tsd), 32);
		uint32_t idx =
		    (uint32_t)((rand32 * (uint64_t)sec->opts.nshards) >> 32);
		assert(idx < (uint32_t)sec->opts.nshards);
		*idxp = (uint8_t)idx;
	}
	return &sec->shards[*idxp];
}

/*
 * Perhaps surprisingly, this can be called on the alloc pathways; if we hit an
 * empty cache, we'll try to fill it, which can push the shard over its limit.
 */
static void
sec_flush_some_and_unlock(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	edata_list_active_t to_flush;
	edata_list_active_init(&to_flush);
	while (shard->bytes_cur > sec->opts.bytes_after_flush) {
		/* Pick a victim. */
		sec_bin_t *bin = &shard->bins[shard->to_flush_next];

		/* Update our victim-picking state. */
		shard->to_flush_next++;
		if (shard->to_flush_next == sec->npsizes) {
			shard->to_flush_next = 0;
		}

		assert(shard->bytes_cur >= bin->bytes_cur);
		if (bin->bytes_cur != 0) {
			shard->bytes_cur -= bin->bytes_cur;
			bin->bytes_cur = 0;
			edata_list_active_concat(&to_flush, &bin->freelist);
		}
		/*
		 * Either bin->bytes_cur was 0, in which case we didn't touch
		 * the bin list but it should be empty anyways (or else we
		 * missed a bytes_cur update on a list modification), or it
		 * *wasn't* 0 and we emptied it ourselves.  Either way, it
		 * should be empty now.
		 */
		assert(edata_list_active_empty(&bin->freelist));
	}

	malloc_mutex_unlock(tsdn, &shard->mtx);
	bool deferred_work_generated = false;
	pai_dalloc_batch(tsdn, sec->fallback, &to_flush,
	    &deferred_work_generated);
}

static edata_t *
sec_shard_alloc_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
    sec_bin_t *bin) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	if (!shard->enabled) {
		return NULL;
	}
	edata_t *edata = edata_list_active_first(&bin->freelist);
	if (edata != NULL) {
		edata_list_active_remove(&bin->freelist, edata);
		assert(edata_size_get(edata) <= bin->bytes_cur);
		bin->bytes_cur -= edata_size_get(edata);
		assert(edata_size_get(edata) <= shard->bytes_cur);
		shard->bytes_cur -= edata_size_get(edata);
	}
	return edata;
}

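/*
 * Worked example of the batch-fill accounting below (the numbers are made up,
 * not defaults): a miss on a bin with opts.batch_fill_extra == 3 requests
 * 1 + 3 = 4 extents of `size` bytes from the fallback in one pai_alloc_batch()
 * call.  The first extent is returned to the caller; if all 4 succeed, the
 * remaining (nalloc - 1) * size == 3 * size bytes are stashed in the bin and
 * counted against the shard, possibly triggering a flush.
 */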
static edata_t *
sec_batch_fill_and_alloc(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
    sec_bin_t *bin, size_t size) {
	malloc_mutex_assert_not_owner(tsdn, &shard->mtx);

	edata_list_active_t result;
	edata_list_active_init(&result);
	bool deferred_work_generated = false;
	size_t nalloc = pai_alloc_batch(tsdn, sec->fallback, size,
	    1 + sec->opts.batch_fill_extra, &result, &deferred_work_generated);

	edata_t *ret = edata_list_active_first(&result);
	if (ret != NULL) {
		edata_list_active_remove(&result, ret);
	}

	malloc_mutex_lock(tsdn, &shard->mtx);
	bin->being_batch_filled = false;
	/*
	 * Handle the easy case first: nothing to cache.  Note that this can
	 * only happen in case of OOM, since sec_alloc checks the expected
	 * number of allocs, and doesn't bother going down the batch_fill
	 * pathway if there won't be anything left to cache.  So to be in this
	 * code path, we must have asked for > 1 alloc, but gotten at most 1
	 * back.
	 */
	if (nalloc <= 1) {
		malloc_mutex_unlock(tsdn, &shard->mtx);
		return ret;
	}

	size_t new_cached_bytes = (nalloc - 1) * size;

	edata_list_active_concat(&bin->freelist, &result);
	bin->bytes_cur += new_cached_bytes;
	shard->bytes_cur += new_cached_bytes;

	if (shard->bytes_cur > sec->opts.max_bytes) {
		sec_flush_some_and_unlock(tsdn, sec, shard);
	} else {
		malloc_mutex_unlock(tsdn, &shard->mtx);
	}

	return ret;
}

static edata_t *
sec_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero,
    bool guarded, bool frequent_reuse, bool *deferred_work_generated) {
	assert((size & PAGE_MASK) == 0);
	assert(!guarded);

	sec_t *sec = (sec_t *)self;

	if (zero || alignment > PAGE || sec->opts.nshards == 0
	    || size > sec->opts.max_alloc) {
		return pai_alloc(tsdn, sec->fallback, size, alignment, zero,
		    /* guarded */ false, frequent_reuse,
		    deferred_work_generated);
	}
	pszind_t pszind = sz_psz2ind(size);
	assert(pszind < sec->npsizes);

	sec_shard_t *shard = sec_shard_pick(tsdn, sec);
	sec_bin_t *bin = &shard->bins[pszind];
	bool do_batch_fill = false;

	malloc_mutex_lock(tsdn, &shard->mtx);
	edata_t *edata = sec_shard_alloc_locked(tsdn, sec, shard, bin);
	if (edata == NULL) {
		if (!bin->being_batch_filled
		    && sec->opts.batch_fill_extra > 0) {
			bin->being_batch_filled = true;
			do_batch_fill = true;
		}
	}
	malloc_mutex_unlock(tsdn, &shard->mtx);
	if (edata == NULL) {
		if (do_batch_fill) {
			edata = sec_batch_fill_and_alloc(tsdn, sec, shard, bin,
			    size);
		} else {
			edata = pai_alloc(tsdn, sec->fallback, size, alignment,
			    zero, /* guarded */ false, frequent_reuse,
			    deferred_work_generated);
		}
	}
	return edata;
}

static bool
sec_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
    size_t new_size, bool zero, bool *deferred_work_generated) {
	sec_t *sec = (sec_t *)self;
	return pai_expand(tsdn, sec->fallback, edata, old_size, new_size, zero,
	    deferred_work_generated);
}

static bool
sec_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
    size_t new_size, bool *deferred_work_generated) {
	sec_t *sec = (sec_t *)self;
	return pai_shrink(tsdn, sec->fallback, edata, old_size, new_size,
	    deferred_work_generated);
}

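/*
 * Flush every bin of the shard, handing the cached extents back to the
 * fallback.  Unlike sec_flush_some_and_unlock() above, this empties the shard
 * completely and performs the batch deallocation while still holding the
 * shard mutex (see the comment at the end of the function for why that's
 * acceptable here).
 */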
static void
sec_flush_all_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	shard->bytes_cur = 0;
	edata_list_active_t to_flush;
	edata_list_active_init(&to_flush);
	for (pszind_t i = 0; i < sec->npsizes; i++) {
		sec_bin_t *bin = &shard->bins[i];
		bin->bytes_cur = 0;
		edata_list_active_concat(&to_flush, &bin->freelist);
	}

	/*
	 * Ordinarily we would try to avoid doing the batch deallocation while
	 * holding the shard mutex, but the flush_all pathways only happen when
	 * we're disabling the HPA or resetting the arena, both of which are
	 * rare pathways.
	 */
	bool deferred_work_generated = false;
	pai_dalloc_batch(tsdn, sec->fallback, &to_flush,
	    &deferred_work_generated);
}

static void
sec_shard_dalloc_and_unlock(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
    edata_t *edata) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	assert(shard->bytes_cur <= sec->opts.max_bytes);
	size_t size = edata_size_get(edata);
	pszind_t pszind = sz_psz2ind(size);
	assert(pszind < sec->npsizes);
	/*
	 * Prepending here results in LIFO allocation per bin, which seems
	 * reasonable.
	 */
	sec_bin_t *bin = &shard->bins[pszind];
	edata_list_active_prepend(&bin->freelist, edata);
	bin->bytes_cur += size;
	shard->bytes_cur += size;
	if (shard->bytes_cur > sec->opts.max_bytes) {
		/*
		 * We've exceeded the shard limit.  We make a nod in the
		 * direction of fragmentation avoidance: rather than flushing
		 * only the bin we just added to, we flush bins in round-robin
		 * order until the shard drops back down to bytes_after_flush,
		 * and we batch the deallocations to the fallback after
		 * releasing the lock.  This has the extra advantage of not
		 * requiring advanced cache balancing strategies.
		 */
		sec_flush_some_and_unlock(tsdn, sec, shard);
		malloc_mutex_assert_not_owner(tsdn, &shard->mtx);
	} else {
		malloc_mutex_unlock(tsdn, &shard->mtx);
	}
}

static void
sec_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    bool *deferred_work_generated) {
	sec_t *sec = (sec_t *)self;
	if (sec->opts.nshards == 0
	    || edata_size_get(edata) > sec->opts.max_alloc) {
		pai_dalloc(tsdn, sec->fallback, edata,
		    deferred_work_generated);
		return;
	}
	sec_shard_t *shard = sec_shard_pick(tsdn, sec);
	malloc_mutex_lock(tsdn, &shard->mtx);
	if (shard->enabled) {
		sec_shard_dalloc_and_unlock(tsdn, sec, shard, edata);
	} else {
		malloc_mutex_unlock(tsdn, &shard->mtx);
		pai_dalloc(tsdn, sec->fallback, edata,
		    deferred_work_generated);
	}
}

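/*
 * Sketch of how a caller drives the cache through the pai vtable once
 * sec_init() has succeeded (hypothetical caller code, not part of this file;
 * the tsdn and deferred-work plumbing are assumed to come from the owning
 * arena):
 *
 *	bool deferred = false;
 *	edata_t *edata = pai_alloc(tsdn, &sec->pai, 2 * PAGE, PAGE,
 *	    false, false, false, &deferred);
 *	if (edata != NULL) {
 *		pai_dalloc(tsdn, &sec->pai, edata, &deferred);
 *	}
 *
 * Requests that are zeroed, over-aligned, or larger than opts.max_alloc fall
 * straight through to the fallback; everything else may be served from (and
 * returned to) a shard's bins.
 */
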
void
sec_flush(tsdn_t *tsdn, sec_t *sec) {
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
		sec_flush_all_locked(tsdn, sec, &sec->shards[i]);
		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
	}
}

void
sec_disable(tsdn_t *tsdn, sec_t *sec) {
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
		sec->shards[i].enabled = false;
		sec_flush_all_locked(tsdn, sec, &sec->shards[i]);
		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
	}
}

void
sec_stats_merge(tsdn_t *tsdn, sec_t *sec, sec_stats_t *stats) {
	size_t sum = 0;
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		/*
		 * We could save these lock acquisitions by making bytes_cur
		 * atomic, but stats collection is rare anyways and we expect
		 * the number and type of stats to get more interesting.
		 */
		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
		sum += sec->shards[i].bytes_cur;
		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
	}
	stats->bytes += sum;
}

void
sec_mutex_stats_read(tsdn_t *tsdn, sec_t *sec,
    mutex_prof_data_t *mutex_prof_data) {
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
		malloc_mutex_prof_accum(tsdn, mutex_prof_data,
		    &sec->shards[i].mtx);
		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
	}
}

void
sec_prefork2(tsdn_t *tsdn, sec_t *sec) {
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		malloc_mutex_prefork(tsdn, &sec->shards[i].mtx);
	}
}

void
sec_postfork_parent(tsdn_t *tsdn, sec_t *sec) {
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		malloc_mutex_postfork_parent(tsdn, &sec->shards[i].mtx);
	}
}

void
sec_postfork_child(tsdn_t *tsdn, sec_t *sec) {
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		malloc_mutex_postfork_child(tsdn, &sec->shards[i].mtx);
	}
}