#ifndef JEMALLOC_INTERNAL_TSD_H
#define JEMALLOC_INTERNAL_TSD_H

#include "jemalloc/internal/activity_callback.h"
#include "jemalloc/internal/arena_types.h"
#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/bin_types.h"
#include "jemalloc/internal/jemalloc_internal_externs.h"
#include "jemalloc/internal/peak.h"
#include "jemalloc/internal/prof_types.h"
#include "jemalloc/internal/ql.h"
#include "jemalloc/internal/rtree_tsd.h"
#include "jemalloc/internal/tcache_types.h"
#include "jemalloc/internal/tcache_structs.h"
#include "jemalloc/internal/util.h"
#include "jemalloc/internal/witness.h"

/*
 * Thread-Specific-Data layout
 *
 * At least some thread-local data gets touched on the fast path of almost all
 * malloc operations.  But much of it is only necessary down slow paths, or
 * for testing.  We want to colocate the fast-path data so that it can live on
 * the same cacheline if possible.  So we define three tiers of hotness:
 * TSD_DATA_FAST: Touched on the alloc/dalloc fast paths.
 * TSD_DATA_SLOW: Touched down slow paths.  "Slow" here is sort of general;
 *     there are "semi-slow" paths like "not a sized deallocation, but can
 *     still live in the tcache".  We'll want to keep these closer to the
 *     fast-path data.
 * TSD_DATA_SLOWER: Only touched in test or debug modes, or not touched at
 *     all.
 *
 * An additional concern is that the larger tcache bins won't be used (we have
 * a bin per size class, but by default only cache relatively small objects).
 * So the earlier bins are in the TSD_DATA_FAST tier, but the later ones are
 * in the TSD_DATA_SLOWER tier.
 *
 * As a result of all this, we put the slow data first, then the fast data,
 * then the slower data, while keeping the tcache as the last element of the
 * fast data (so that the fast -> slower transition happens midway through the
 * tcache).  While we don't yet play alignment tricks to guarantee it, this
 * increases our odds of getting some cache/page locality on fast paths.
 */
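
/*
 * A rough sketch of the resulting field order; this is illustrative only, not
 * the real definition (see struct tsd_s below), but shows how the tiers
 * interleave with the state byte:
 *
 *     struct tsd_s {
 *         ... TSD_DATA_SLOW fields ...
 *         tsd_state_t state;
 *         ... TSD_DATA_FAST fields, with the tcache last ...
 *         ... TSD_DATA_SLOWER fields ...
 *     };
 */
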
#ifdef JEMALLOC_JET
typedef void (*test_callback_t)(int *);
# define MALLOC_TSD_TEST_DATA_INIT 0x72b65c10
# define MALLOC_TEST_TSD \
        O(test_data, int, int) \
        O(test_callback, test_callback_t, int)
# define MALLOC_TEST_TSD_INITIALIZER , MALLOC_TSD_TEST_DATA_INIT, NULL
#else
# define MALLOC_TEST_TSD
# define MALLOC_TEST_TSD_INITIALIZER
#endif

typedef ql_elm(tsd_t) tsd_link_t;

/* O(name, type, nullable type) */
#define TSD_DATA_SLOW \
        O(tcache_enabled, bool, bool) \
        O(reentrancy_level, int8_t, int8_t) \
        O(thread_allocated_last_event, uint64_t, uint64_t) \
        O(thread_allocated_next_event, uint64_t, uint64_t) \
        O(thread_deallocated_last_event, uint64_t, uint64_t) \
        O(thread_deallocated_next_event, uint64_t, uint64_t) \
        O(tcache_gc_event_wait, uint64_t, uint64_t) \
        O(tcache_gc_dalloc_event_wait, uint64_t, uint64_t) \
        O(prof_sample_event_wait, uint64_t, uint64_t) \
        O(prof_sample_last_event, uint64_t, uint64_t) \
        O(stats_interval_event_wait, uint64_t, uint64_t) \
        O(stats_interval_last_event, uint64_t, uint64_t) \
        O(peak_alloc_event_wait, uint64_t, uint64_t) \
        O(peak_dalloc_event_wait, uint64_t, uint64_t) \
        O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \
        O(prng_state, uint64_t, uint64_t) \
        O(san_extents_until_guard_small, uint64_t, uint64_t) \
        O(san_extents_until_guard_large, uint64_t, uint64_t) \
        O(iarena, arena_t *, arena_t *) \
        O(arena, arena_t *, arena_t *) \
        O(arena_decay_ticker, ticker_geom_t, ticker_geom_t) \
        O(sec_shard, uint8_t, uint8_t) \
        O(binshards, tsd_binshards_t, tsd_binshards_t) \
        O(tsd_link, tsd_link_t, tsd_link_t) \
        O(in_hook, bool, bool) \
        O(peak, peak_t, peak_t) \
        O(activity_callback_thunk, activity_callback_thunk_t, \
            activity_callback_thunk_t) \
        O(tcache_slow, tcache_slow_t, tcache_slow_t) \
        O(rtree_ctx, rtree_ctx_t, rtree_ctx_t)

#define TSD_DATA_SLOW_INITIALIZER \
        /* tcache_enabled */ TCACHE_ENABLED_ZERO_INITIALIZER, \
        /* reentrancy_level */ 0, \
        /* thread_allocated_last_event */ 0, \
        /* thread_allocated_next_event */ 0, \
        /* thread_deallocated_last_event */ 0, \
        /* thread_deallocated_next_event */ 0, \
        /* tcache_gc_event_wait */ 0, \
        /* tcache_gc_dalloc_event_wait */ 0, \
        /* prof_sample_event_wait */ 0, \
        /* prof_sample_last_event */ 0, \
        /* stats_interval_event_wait */ 0, \
        /* stats_interval_last_event */ 0, \
        /* peak_alloc_event_wait */ 0, \
        /* peak_dalloc_event_wait */ 0, \
        /* prof_tdata */ NULL, \
        /* prng_state */ 0, \
        /* san_extents_until_guard_small */ 0, \
        /* san_extents_until_guard_large */ 0, \
        /* iarena */ NULL, \
        /* arena */ NULL, \
        /* arena_decay_ticker */ \
            TICKER_GEOM_INIT(ARENA_DECAY_NTICKS_PER_UPDATE), \
        /* sec_shard */ (uint8_t)-1, \
        /* binshards */ TSD_BINSHARDS_ZERO_INITIALIZER, \
        /* tsd_link */ {NULL}, \
        /* in_hook */ false, \
        /* peak */ PEAK_INITIALIZER, \
        /* activity_callback_thunk */ \
            ACTIVITY_CALLBACK_THUNK_INITIALIZER, \
        /* tcache_slow */ TCACHE_SLOW_ZERO_INITIALIZER, \
        /* rtree_ctx */ RTREE_CTX_INITIALIZER,
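
/*
 * Illustrative only: each O(name, type, nullable type) entry in these lists
 * is expanded by the machinery below into a (mangled) struct field and a
 * family of accessors.  For example, O(tcache_enabled, bool, bool) yields
 * roughly:
 *
 *     bool cant_access_tsd_items_directly_use_a_getter_or_setter_tcache_enabled;
 *     bool tsd_tcache_enabled_get(tsd_t *tsd);
 *     void tsd_tcache_enabled_set(tsd_t *tsd, bool val);
 *     bool *tsd_tcache_enabledp_get(tsd_t *tsd);
 *     bool *tsdn_tcache_enabledp_get(tsdn_t *tsdn);
 */
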
/* O(name, type, nullable type) */
#define TSD_DATA_FAST \
        O(thread_allocated, uint64_t, uint64_t) \
        O(thread_allocated_next_event_fast, uint64_t, uint64_t) \
        O(thread_deallocated, uint64_t, uint64_t) \
        O(thread_deallocated_next_event_fast, uint64_t, uint64_t) \
        O(tcache, tcache_t, tcache_t)

#define TSD_DATA_FAST_INITIALIZER \
        /* thread_allocated */ 0, \
        /* thread_allocated_next_event_fast */ 0, \
        /* thread_deallocated */ 0, \
        /* thread_deallocated_next_event_fast */ 0, \
        /* tcache */ TCACHE_ZERO_INITIALIZER,

/* O(name, type, nullable type) */
#define TSD_DATA_SLOWER \
        O(witness_tsd, witness_tsd_t, witness_tsdn_t) \
        MALLOC_TEST_TSD

#define TSD_DATA_SLOWER_INITIALIZER \
        /* witness */ WITNESS_TSD_INITIALIZER \
        /* test data */ MALLOC_TEST_TSD_INITIALIZER

#define TSD_INITIALIZER { \
        TSD_DATA_SLOW_INITIALIZER \
        /* state */ ATOMIC_INIT(tsd_state_uninitialized), \
        TSD_DATA_FAST_INITIALIZER \
        TSD_DATA_SLOWER_INITIALIZER \
}

#if defined(JEMALLOC_MALLOC_THREAD_CLEANUP) || defined(_WIN32)
void _malloc_tsd_cleanup_register(bool (*f)(void));
#endif

void *malloc_tsd_malloc(size_t size);
void malloc_tsd_dalloc(void *wrapper);
tsd_t *malloc_tsd_boot0(void);
void malloc_tsd_boot1(void);
void tsd_cleanup(void *arg);
tsd_t *tsd_fetch_slow(tsd_t *tsd, bool minimal);
void tsd_state_set(tsd_t *tsd, uint8_t new_state);
void tsd_slow_update(tsd_t *tsd);
void tsd_prefork(tsd_t *tsd);
void tsd_postfork_parent(tsd_t *tsd);
void tsd_postfork_child(tsd_t *tsd);

/*
 * Call ..._inc when your module wants to take all threads down the slow
 * paths, and ..._dec when it no longer needs to.
 */
void tsd_global_slow_inc(tsdn_t *tsdn);
void tsd_global_slow_dec(tsdn_t *tsdn);
bool tsd_global_slow(void);

enum {
        /* Common case --> jnz. */
        tsd_state_nominal = 0,
        /* Initialized but on slow path. */
        tsd_state_nominal_slow = 1,
        /*
         * Some thread has changed global state in such a way that all
         * nominal threads need to recompute their fast / slow status the
         * next time they get a chance.
         *
         * Any thread can change another thread's status *to* recompute, but
         * only the thread itself can change its status *from* recompute.
         */
        tsd_state_nominal_recompute = 2,
        /*
         * The above nominal states should be lower values.  We use
         * tsd_state_nominal_max to separate nominal states from threads in
         * the process of being born / dying.
         */
        tsd_state_nominal_max = 2,

        /*
         * A thread might free() during its death as its only allocator
         * action; in such scenarios, we need tsd, but set up in such a way
         * that no cleanup is necessary.
         */
        tsd_state_minimal_initialized = 3,
        /* States during which we know we're in thread death. */
        tsd_state_purgatory = 4,
        tsd_state_reincarnated = 5,
        /*
         * What it says on the tin; tsd that hasn't been initialized.  Note
         * that even when the tsd struct lives in TLS, we need to keep track
         * of stuff like whether or not our pthread destructors have been
         * scheduled, so this really truly is different from the nominal
         * state.
         */
        tsd_state_uninitialized = 6
};
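
/*
 * Illustrative only: the numeric ordering of the states is load-bearing.  A
 * "nominal" check can compile down to a single unsigned comparison, e.g.:
 *
 *     bool nominal = tsd_state_get(tsd) <= tsd_state_nominal_max;
 *
 * which is exactly how tsd_nominal() below is implemented.
 */
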
/*
 * Some TSD accesses can only be done in a nominal state.  To enforce this, we
 * wrap TSD member access in a function that asserts on TSD state, and mangle
 * field names to prevent touching them accidentally.
 */
#define TSD_MANGLE(n) cant_access_tsd_items_directly_use_a_getter_or_setter_##n

#ifdef JEMALLOC_U8_ATOMICS
# define tsd_state_t atomic_u8_t
# define tsd_atomic_load atomic_load_u8
# define tsd_atomic_store atomic_store_u8
# define tsd_atomic_exchange atomic_exchange_u8
#else
# define tsd_state_t atomic_u32_t
# define tsd_atomic_load atomic_load_u32
# define tsd_atomic_store atomic_store_u32
# define tsd_atomic_exchange atomic_exchange_u32
#endif

/* The actual tsd. */
struct tsd_s {
        /*
         * The contents should be treated as totally opaque outside the tsd
         * module.  Access any thread-local state through the getters and
         * setters below.
         */

#define O(n, t, nt) \
        t TSD_MANGLE(n);

        TSD_DATA_SLOW
        /*
         * We manually limit the state to just a single byte, unless the
         * 8-bit atomics are unavailable (which is rare).
         */
        tsd_state_t state;
        TSD_DATA_FAST
        TSD_DATA_SLOWER
#undef O
/* AddressSanitizer requires TLS data to be aligned to at least 8 bytes. */
} JEMALLOC_ALIGNED(16);

JEMALLOC_ALWAYS_INLINE uint8_t
tsd_state_get(tsd_t *tsd) {
        /*
         * This should be an atomic load.  Unfortunately, compilers right now
         * can't tell that it could be done as a memory comparison, and
         * instead force a load into a register, which hurts fast-path
         * performance.
         */
        /* return atomic_load_u8(&tsd->state, ATOMIC_RELAXED); */
        return *(uint8_t *)&tsd->state;
}

/*
 * Wrapper around tsd_t that makes it possible to avoid implicit conversion
 * between tsd_t and tsdn_t, where tsdn_t is "nullable" and has to be
 * explicitly converted to tsd_t, which is non-nullable.
 */
struct tsdn_s {
        tsd_t tsd;
};
#define TSDN_NULL ((tsdn_t *)0)

JEMALLOC_ALWAYS_INLINE tsdn_t *
tsd_tsdn(tsd_t *tsd) {
        return (tsdn_t *)tsd;
}

JEMALLOC_ALWAYS_INLINE bool
tsdn_null(const tsdn_t *tsdn) {
        return tsdn == NULL;
}

JEMALLOC_ALWAYS_INLINE tsd_t *
tsdn_tsd(tsdn_t *tsdn) {
        assert(!tsdn_null(tsdn));

        return &tsdn->tsd;
}

/*
 * We put the platform-specific data declarations and inlines into their own
 * header files to avoid cluttering this file.  They define tsd_boot0,
 * tsd_boot1, tsd_boot, tsd_booted_get, tsd_get_allocates, tsd_get, and
 * tsd_set.
 */
#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP
#include "jemalloc/internal/tsd_malloc_thread_cleanup.h"
#elif (defined(JEMALLOC_TLS))
#include "jemalloc/internal/tsd_tls.h"
#elif (defined(_WIN32))
#include "jemalloc/internal/tsd_win.h"
#else
#include "jemalloc/internal/tsd_generic.h"
#endif

/*
 * tsd_foop_get_unsafe(tsd) returns a pointer to the thread-local instance of
 * foo.  This omits some safety checks, and so can be used during tsd
 * initialization and cleanup.
 */
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE t * \
tsd_##n##p_get_unsafe(tsd_t *tsd) { \
        return &tsd->TSD_MANGLE(n); \
}
TSD_DATA_SLOW
TSD_DATA_FAST
TSD_DATA_SLOWER
#undef O
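
/*
 * Illustrative pattern (a hypothetical helper, not part of this header):
 * code that may run with or without a thread context typically takes a
 * tsdn_t * and upgrades it only after a NULL check:
 *
 *     static void
 *     frob_ctx(tsdn_t *tsdn) {
 *         if (!tsdn_null(tsdn)) {
 *             tsd_t *tsd = tsdn_tsd(tsdn);
 *             ... tsd-only accessors are safe here ...
 *         }
 *     }
 */
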
/* tsd_foop_get(tsd) returns a pointer to the thread-local instance of foo. */
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE t * \
tsd_##n##p_get(tsd_t *tsd) { \
        /* \
         * Because the state might change asynchronously if it's \
         * nominal, we need to make sure that we only read it once. \
         */ \
        uint8_t state = tsd_state_get(tsd); \
        assert(state == tsd_state_nominal || \
            state == tsd_state_nominal_slow || \
            state == tsd_state_nominal_recompute || \
            state == tsd_state_reincarnated || \
            state == tsd_state_minimal_initialized); \
        return tsd_##n##p_get_unsafe(tsd); \
}
TSD_DATA_SLOW
TSD_DATA_FAST
TSD_DATA_SLOWER
#undef O

/*
 * tsdn_foop_get(tsdn) returns either the thread-local instance of foo (if
 * tsdn isn't NULL), or NULL (if tsdn is NULL), cast to the nullable pointer
 * type.
 */
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE nt * \
tsdn_##n##p_get(tsdn_t *tsdn) { \
        if (tsdn_null(tsdn)) { \
                return NULL; \
        } \
        tsd_t *tsd = tsdn_tsd(tsdn); \
        return (nt *)tsd_##n##p_get(tsd); \
}
TSD_DATA_SLOW
TSD_DATA_FAST
TSD_DATA_SLOWER
#undef O

/* tsd_foo_get(tsd) returns the value of the thread-local instance of foo. */
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE t \
tsd_##n##_get(tsd_t *tsd) { \
        return *tsd_##n##p_get(tsd); \
}
TSD_DATA_SLOW
TSD_DATA_FAST
TSD_DATA_SLOWER
#undef O

/* tsd_foo_set(tsd, val) updates the thread-local instance of foo to be val. */
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE void \
tsd_##n##_set(tsd_t *tsd, t val) { \
        assert(tsd_state_get(tsd) != tsd_state_reincarnated && \
            tsd_state_get(tsd) != tsd_state_minimal_initialized); \
        *tsd_##n##p_get(tsd) = val; \
}
TSD_DATA_SLOW
TSD_DATA_FAST
TSD_DATA_SLOWER
#undef O

JEMALLOC_ALWAYS_INLINE void
tsd_assert_fast(tsd_t *tsd) {
        /*
         * Note that our fastness assertion does *not* include global
         * slowness counters; it's not in general possible to ensure that
         * they won't change asynchronously from underneath us.
         */
        assert(!malloc_slow && tsd_tcache_enabled_get(tsd) &&
            tsd_reentrancy_level_get(tsd) == 0);
}

JEMALLOC_ALWAYS_INLINE bool
tsd_fast(tsd_t *tsd) {
        bool fast = (tsd_state_get(tsd) == tsd_state_nominal);
        if (fast) {
                tsd_assert_fast(tsd);
        }

        return fast;
}

JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_fetch_impl(bool init, bool minimal) {
        tsd_t *tsd = tsd_get(init);

        if (!init && tsd_get_allocates() && tsd == NULL) {
                return NULL;
        }
        assert(tsd != NULL);

        if (unlikely(tsd_state_get(tsd) != tsd_state_nominal)) {
                return tsd_fetch_slow(tsd, minimal);
        }
        assert(tsd_fast(tsd));
        tsd_assert_fast(tsd);

        return tsd;
}

/* Get a minimal TSD that requires no cleanup.  See comments in free(). */
JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_fetch_min(void) {
        return tsd_fetch_impl(true, true);
}
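
/*
 * Illustrative use of the generated accessors (a hypothetical caller, not
 * part of this header); tsd_fetch() is defined just below:
 *
 *     tsd_t *tsd = tsd_fetch();
 *     if (tsd_reentrancy_level_get(tsd) == 0) {
 *         uint64_t allocated = tsd_thread_allocated_get(tsd);
 *         tsd_prng_state_set(tsd, allocated);
 *     }
 */
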
/* For use by internal background threads only. */
JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_internal_fetch(void) {
        tsd_t *tsd = tsd_fetch_min();
        /* Use reincarnated state to prevent full initialization. */
        tsd_state_set(tsd, tsd_state_reincarnated);

        return tsd;
}

JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_fetch(void) {
        return tsd_fetch_impl(true, false);
}

static inline bool
tsd_nominal(tsd_t *tsd) {
        bool nominal = tsd_state_get(tsd) <= tsd_state_nominal_max;
        assert(nominal || tsd_reentrancy_level_get(tsd) > 0);

        return nominal;
}

JEMALLOC_ALWAYS_INLINE tsdn_t *
tsdn_fetch(void) {
        if (!tsd_booted_get()) {
                return NULL;
        }

        return tsd_tsdn(tsd_fetch_impl(false, false));
}

JEMALLOC_ALWAYS_INLINE rtree_ctx_t *
tsd_rtree_ctx(tsd_t *tsd) {
        return tsd_rtree_ctxp_get(tsd);
}

JEMALLOC_ALWAYS_INLINE rtree_ctx_t *
tsdn_rtree_ctx(tsdn_t *tsdn, rtree_ctx_t *fallback) {
        /*
         * If tsd cannot be accessed, initialize the fallback rtree_ctx and
         * return a pointer to it.
         */
        if (unlikely(tsdn_null(tsdn))) {
                rtree_ctx_data_init(fallback);
                return fallback;
        }
        return tsd_rtree_ctx(tsdn_tsd(tsdn));
}

static inline bool
tsd_state_nocleanup(tsd_t *tsd) {
        return tsd_state_get(tsd) == tsd_state_reincarnated ||
            tsd_state_get(tsd) == tsd_state_minimal_initialized;
}

/*
 * These "raw" tsd reentrancy functions don't have any debug checking to make
 * sure that we're not touching arena 0.  Prefer pre_reentrancy and
 * post_reentrancy where possible.
 */
static inline void
tsd_pre_reentrancy_raw(tsd_t *tsd) {
        bool fast = tsd_fast(tsd);
        assert(tsd_reentrancy_level_get(tsd) < INT8_MAX);
        ++*tsd_reentrancy_levelp_get(tsd);
        if (fast) {
                /* Prepare slow path for reentrancy. */
                tsd_slow_update(tsd);
                assert(tsd_state_get(tsd) == tsd_state_nominal_slow);
        }
}

static inline void
tsd_post_reentrancy_raw(tsd_t *tsd) {
        int8_t *reentrancy_level = tsd_reentrancy_levelp_get(tsd);
        assert(*reentrancy_level > 0);
        if (--*reentrancy_level == 0) {
                tsd_slow_update(tsd);
        }
}

#endif /* JEMALLOC_INTERNAL_TSD_H */