#ifndef JEMALLOC_INTERNAL_TSD_H
#define JEMALLOC_INTERNAL_TSD_H

#include "jemalloc/internal/activity_callback.h"
#include "jemalloc/internal/arena_types.h"
#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/bin_types.h"
#include "jemalloc/internal/jemalloc_internal_externs.h"
#include "jemalloc/internal/peak.h"
#include "jemalloc/internal/prof_types.h"
#include "jemalloc/internal/ql.h"
#include "jemalloc/internal/rtree_tsd.h"
#include "jemalloc/internal/tcache_types.h"
#include "jemalloc/internal/tcache_structs.h"
#include "jemalloc/internal/util.h"
#include "jemalloc/internal/witness.h"

/*
 * Thread-Specific-Data layout
 *
 * At least some thread-local data gets touched on the fast path of almost all
 * malloc operations.  But much of it is only necessary down slow paths, or in
 * testing.  We want to colocate the fast-path data so that it can live on the
 * same cacheline if possible.  So we define three tiers of hotness:
 * TSD_DATA_FAST: Touched on the alloc/dalloc fast paths.
 * TSD_DATA_SLOW: Touched down slow paths.  "Slow" here is sort of general;
 *     there are "semi-slow" paths like "not a sized deallocation, but can still
 *     live in the tcache".  We'll want to keep these closer to the fast-path
 *     data.
 * TSD_DATA_SLOWER: Only touched in test or debug modes, or not touched at all.
 *
 * An additional concern is that the larger tcache bins won't be used (we have a
 * bin per size class, but by default only cache relatively small objects).  So
 * the earlier bins are in the TSD_DATA_FAST tier, but the later ones are in the
 * TSD_DATA_SLOWER tier.
 *
 * As a result of all this, we put the slow data first, then the fast data, then
 * the slower data, while keeping the tcache as the last element of the fast
 * data (so that the fast -> slower transition happens midway through the
 * tcache).  While we don't yet play alignment tricks to guarantee it, this
 * increases our odds of getting some cache/page locality on fast paths.
 */
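
/*
 * As a rough sketch, the resulting member order inside tsd_t (built below via
 * the TSD_DATA_* x-macros) is:
 *
 *	struct tsd_s {
 *		...TSD_DATA_SLOW fields...	(tcache_enabled, ..., rtree_ctx)
 *		tsd_state_t state;
 *		...TSD_DATA_FAST fields...	(thread_allocated, ..., tcache)
 *		...TSD_DATA_SLOWER fields...	(witness_tsd, optional test data)
 *	};
 */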

#ifdef JEMALLOC_JET
typedef void (*test_callback_t)(int *);
#  define MALLOC_TSD_TEST_DATA_INIT 0x72b65c10
#  define MALLOC_TEST_TSD \
    O(test_data,		int,			int)		\
    O(test_callback,		test_callback_t,	int)
#  define MALLOC_TEST_TSD_INITIALIZER , MALLOC_TSD_TEST_DATA_INIT, NULL
#else
#  define MALLOC_TEST_TSD
#  define MALLOC_TEST_TSD_INITIALIZER
#endif

typedef ql_elm(tsd_t) tsd_link_t;

/*  O(name,			type,			nullable type) */
#define TSD_DATA_SLOW							\
    O(tcache_enabled,		bool,			bool)		\
    O(reentrancy_level,		int8_t,			int8_t)		\
    O(thread_allocated_last_event,	uint64_t,	uint64_t)	\
    O(thread_allocated_next_event,	uint64_t,	uint64_t)	\
    O(thread_deallocated_last_event,	uint64_t,	uint64_t)	\
    O(thread_deallocated_next_event,	uint64_t,	uint64_t)	\
    O(tcache_gc_event_wait,	uint64_t,		uint64_t)	\
    O(tcache_gc_dalloc_event_wait,	uint64_t,	uint64_t)	\
    O(prof_sample_event_wait,	uint64_t,		uint64_t)	\
    O(prof_sample_last_event,	uint64_t,		uint64_t)	\
    O(stats_interval_event_wait,	uint64_t,	uint64_t)	\
    O(stats_interval_last_event,	uint64_t,	uint64_t)	\
    O(peak_alloc_event_wait,	uint64_t,		uint64_t)	\
    O(peak_dalloc_event_wait,	uint64_t,		uint64_t)	\
    O(prof_tdata,		prof_tdata_t *,		prof_tdata_t *)	\
    O(prng_state,		uint64_t,		uint64_t)	\
    O(san_extents_until_guard_small,	uint64_t,	uint64_t)	\
    O(san_extents_until_guard_large,	uint64_t,	uint64_t)	\
    O(iarena,			arena_t *,		arena_t *)	\
    O(arena,			arena_t *,		arena_t *)	\
    O(arena_decay_ticker,	ticker_geom_t,		ticker_geom_t)	\
    O(sec_shard,		uint8_t,		uint8_t)	\
    O(binshards,		tsd_binshards_t,	tsd_binshards_t)\
    O(tsd_link,			tsd_link_t,		tsd_link_t)	\
    O(in_hook,			bool,			bool)		\
    O(peak,			peak_t,			peak_t)		\
    O(activity_callback_thunk,	activity_callback_thunk_t,		\
	activity_callback_thunk_t)					\
    O(tcache_slow,		tcache_slow_t,		tcache_slow_t)	\
    O(rtree_ctx,		rtree_ctx_t,		rtree_ctx_t)

#define TSD_DATA_SLOW_INITIALIZER					\
    /* tcache_enabled */	TCACHE_ENABLED_ZERO_INITIALIZER,	\
    /* reentrancy_level */	0,					\
    /* thread_allocated_last_event */	0,				\
    /* thread_allocated_next_event */	0,				\
    /* thread_deallocated_last_event */	0,				\
    /* thread_deallocated_next_event */	0,				\
    /* tcache_gc_event_wait */		0,				\
    /* tcache_gc_dalloc_event_wait */	0,				\
    /* prof_sample_event_wait */	0,				\
    /* prof_sample_last_event */	0,				\
    /* stats_interval_event_wait */	0,				\
    /* stats_interval_last_event */	0,				\
    /* peak_alloc_event_wait */		0,				\
    /* peak_dalloc_event_wait */	0,				\
    /* prof_tdata */		NULL,					\
    /* prng_state */		0,					\
    /* san_extents_until_guard_small */	0,				\
    /* san_extents_until_guard_large */	0,				\
    /* iarena */		NULL,					\
    /* arena */			NULL,					\
    /* arena_decay_ticker */						\
	TICKER_GEOM_INIT(ARENA_DECAY_NTICKS_PER_UPDATE),		\
    /* sec_shard */		(uint8_t)-1,				\
    /* binshards */		TSD_BINSHARDS_ZERO_INITIALIZER,		\
    /* tsd_link */		{NULL},					\
    /* in_hook */		false,					\
    /* peak */			PEAK_INITIALIZER,			\
    /* activity_callback_thunk */					\
	ACTIVITY_CALLBACK_THUNK_INITIALIZER,				\
    /* tcache_slow */		TCACHE_SLOW_ZERO_INITIALIZER,		\
    /* rtree_ctx */		RTREE_CTX_INITIALIZER,

/*  O(name,			type,			nullable type) */
#define TSD_DATA_FAST							\
    O(thread_allocated,		uint64_t,		uint64_t)	\
    O(thread_allocated_next_event_fast,	uint64_t,	uint64_t)	\
    O(thread_deallocated,	uint64_t,		uint64_t)	\
    O(thread_deallocated_next_event_fast, uint64_t,	uint64_t)	\
    O(tcache,			tcache_t,		tcache_t)

#define TSD_DATA_FAST_INITIALIZER					\
    /* thread_allocated */	0,					\
    /* thread_allocated_next_event_fast */	0,			\
    /* thread_deallocated */	0,					\
    /* thread_deallocated_next_event_fast */	0,			\
    /* tcache */		TCACHE_ZERO_INITIALIZER,

/*  O(name,			type,			nullable type) */
#define TSD_DATA_SLOWER							\
    O(witness_tsd,		witness_tsd_t,		witness_tsdn_t)	\
    MALLOC_TEST_TSD

#define TSD_DATA_SLOWER_INITIALIZER					\
    /* witness */		WITNESS_TSD_INITIALIZER			\
    /* test data */		MALLOC_TEST_TSD_INITIALIZER


#define TSD_INITIALIZER {						\
				TSD_DATA_SLOW_INITIALIZER		\
    /* state */			ATOMIC_INIT(tsd_state_uninitialized),	\
				TSD_DATA_FAST_INITIALIZER		\
				TSD_DATA_SLOWER_INITIALIZER		\
}

#if defined(JEMALLOC_MALLOC_THREAD_CLEANUP) || defined(_WIN32)
void _malloc_tsd_cleanup_register(bool (*f)(void));
#endif

void *malloc_tsd_malloc(size_t size);
void malloc_tsd_dalloc(void *wrapper);
tsd_t *malloc_tsd_boot0(void);
void malloc_tsd_boot1(void);
void tsd_cleanup(void *arg);
tsd_t *tsd_fetch_slow(tsd_t *tsd, bool internal);
void tsd_state_set(tsd_t *tsd, uint8_t new_state);
void tsd_slow_update(tsd_t *tsd);
void tsd_prefork(tsd_t *tsd);
void tsd_postfork_parent(tsd_t *tsd);
void tsd_postfork_child(tsd_t *tsd);

/*
 * Call ..._inc when your module wants to take all threads down the slow paths,
 * and ..._dec when it no longer needs to.
 */
void tsd_global_slow_inc(tsdn_t *tsdn);
void tsd_global_slow_dec(tsdn_t *tsdn);
bool tsd_global_slow(void);
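
/*
 * A minimal usage sketch: a module that temporarily needs every thread on the
 * slow path (installing hooks is one assumed example of such a caller) would
 * bracket its update roughly as follows:
 *
 *	tsd_global_slow_inc(tsdn);	// push all threads onto slow paths
 *	// ... publish the new global state ...
 *	tsd_global_slow_dec(tsdn);	// let threads recompute and go fast again
 */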

enum {
	/* Common case --> jnz. */
	tsd_state_nominal = 0,
	/* Initialized but on slow path. */
	tsd_state_nominal_slow = 1,
	/*
	 * Some thread has changed global state in such a way that all nominal
	 * threads need to recompute their fast / slow status the next time they
	 * get a chance.
	 *
	 * Any thread can change another thread's status *to* recompute, but
	 * threads are the only ones who can change their own status *from*
	 * recompute.
	 */
	tsd_state_nominal_recompute = 2,
	/*
	 * The above nominal states should be lower values.  We use
	 * tsd_state_nominal_max to separate nominal states from threads in the
	 * process of being born / dying.
	 */
	tsd_state_nominal_max = 2,

	/*
	 * A thread might free() during its death as its only allocator action;
	 * in such scenarios, we need tsd, but set it up in such a way that no
	 * cleanup is necessary.
	 */
	tsd_state_minimal_initialized = 3,
	/* States during which we know we're in thread death. */
	tsd_state_purgatory = 4,
	tsd_state_reincarnated = 5,
	/*
	 * What it says on the tin; tsd that hasn't been initialized.  Note
	 * that even when the tsd struct lives in TLS, we need to keep track
	 * of stuff like whether or not our pthread destructors have been
	 * scheduled, so this really truly is different from the nominal state.
	 */
	tsd_state_uninitialized = 6
};

/*
 * Some TSD accesses can only be done in a nominal state.  To enforce this, we
 * wrap TSD member access in a function that asserts on TSD state, and mangle
 * field names to prevent touching them accidentally.
 */
#define TSD_MANGLE(n) cant_access_tsd_items_directly_use_a_getter_or_setter_##n
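
/*
 * For instance, TSD_MANGLE(arena) token-pastes to
 * cant_access_tsd_items_directly_use_a_getter_or_setter_arena, so code that
 * spells tsd->arena directly will not compile; it has to go through the
 * tsd_arena_get()/tsd_arena_set() accessors defined below.
 */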

#ifdef JEMALLOC_U8_ATOMICS
#  define tsd_state_t atomic_u8_t
#  define tsd_atomic_load atomic_load_u8
#  define tsd_atomic_store atomic_store_u8
#  define tsd_atomic_exchange atomic_exchange_u8
#else
#  define tsd_state_t atomic_u32_t
#  define tsd_atomic_load atomic_load_u32
#  define tsd_atomic_store atomic_store_u32
#  define tsd_atomic_exchange atomic_exchange_u32
#endif

/* The actual tsd. */
struct tsd_s {
	/*
	 * The contents should be treated as totally opaque outside the tsd
	 * module.  Access any thread-local state through the getters and
	 * setters below.
	 */

#define O(n, t, nt)							\
	t TSD_MANGLE(n);

	TSD_DATA_SLOW
	/*
	 * We manually limit the state to just a single byte whenever 8-bit
	 * atomics are available; on the rare platforms where they are not,
	 * the state falls back to the 32-bit type selected above.
	 */
	tsd_state_t state;
	TSD_DATA_FAST
	TSD_DATA_SLOWER
#undef O
/* AddressSanitizer requires TLS data to be aligned to at least 8 bytes. */
} JEMALLOC_ALIGNED(16);
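
/*
 * To illustrate how the x-macro expands, the TSD_DATA_FAST list above turns
 * into roughly the following members (mangled names abbreviated):
 *
 *	uint64_t cant_access_..._thread_allocated;
 *	uint64_t cant_access_..._thread_allocated_next_event_fast;
 *	uint64_t cant_access_..._thread_deallocated;
 *	uint64_t cant_access_..._thread_deallocated_next_event_fast;
 *	tcache_t cant_access_..._tcache;
 *
 * with the slow-tier members laid out before the state byte and the slower
 * tier after the tcache, matching the layout comment at the top of this file.
 */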

JEMALLOC_ALWAYS_INLINE uint8_t
tsd_state_get(tsd_t *tsd) {
	/*
	 * This should be an atomic load.  Unfortunately, compilers currently
	 * cannot tell that it could be done as a plain memory comparison, and
	 * instead force a load into a register, which hurts fast-path
	 * performance.
	 */
	/* return atomic_load_u8(&tsd->state, ATOMIC_RELAXED); */
	return *(uint8_t *)&tsd->state;
}

/*
 * Wrapper around tsd_t that makes it possible to avoid implicit conversion
 * between tsd_t and tsdn_t, where tsdn_t is "nullable" and has to be
 * explicitly converted to tsd_t, which is non-nullable.
 */
struct tsdn_s {
	tsd_t tsd;
};
#define TSDN_NULL ((tsdn_t *)0)
JEMALLOC_ALWAYS_INLINE tsdn_t *
tsd_tsdn(tsd_t *tsd) {
	return (tsdn_t *)tsd;
}

JEMALLOC_ALWAYS_INLINE bool
tsdn_null(const tsdn_t *tsdn) {
	return tsdn == NULL;
}

JEMALLOC_ALWAYS_INLINE tsd_t *
tsdn_tsd(tsdn_t *tsdn) {
	assert(!tsdn_null(tsdn));

	return &tsdn->tsd;
}
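
/*
 * Usage sketch for the nullable wrapper: internal APIs that may run before TSD
 * is available take a tsdn_t * and check it, while code that already holds a
 * tsd_t * converts for free.  (example_fn is a hypothetical caller.)
 *
 *	void example_fn(tsdn_t *tsdn) {
 *		if (tsdn_null(tsdn)) {
 *			return;			// no thread-local state yet
 *		}
 *		tsd_t *tsd = tsdn_tsd(tsdn);
 *		// ... use tsd ...
 *	}
 *	// and in the other direction: example_fn(tsd_tsdn(tsd));
 */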

/*
 * We put the platform-specific data declarations and inlines into their own
 * header files to avoid cluttering this file.  They define tsd_boot0,
 * tsd_boot1, tsd_boot, tsd_booted_get, tsd_get_allocates, tsd_get, and tsd_set.
 */
#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP
#include "jemalloc/internal/tsd_malloc_thread_cleanup.h"
#elif (defined(JEMALLOC_TLS))
#include "jemalloc/internal/tsd_tls.h"
#elif (defined(_WIN32))
#include "jemalloc/internal/tsd_win.h"
#else
#include "jemalloc/internal/tsd_generic.h"
#endif

/*
 * tsd_foop_get_unsafe(tsd) returns a pointer to the thread-local instance of
 * foo.  This omits some safety checks, and so can be used during tsd
 * initialization and cleanup.
 */
#define O(n, t, nt)							\
JEMALLOC_ALWAYS_INLINE t *						\
tsd_##n##p_get_unsafe(tsd_t *tsd) {					\
	return &tsd->TSD_MANGLE(n);					\
}
TSD_DATA_SLOW
TSD_DATA_FAST
TSD_DATA_SLOWER
#undef O

/* tsd_foop_get(tsd) returns a pointer to the thread-local instance of foo. */
#define O(n, t, nt)							\
JEMALLOC_ALWAYS_INLINE t *						\
tsd_##n##p_get(tsd_t *tsd) {						\
	/*								\
	 * Because the state might change asynchronously if it's	\
	 * nominal, we need to make sure that we only read it once.	\
	 */								\
	uint8_t state = tsd_state_get(tsd);				\
	assert(state == tsd_state_nominal ||				\
	    state == tsd_state_nominal_slow ||				\
	    state == tsd_state_nominal_recompute ||			\
	    state == tsd_state_reincarnated ||				\
	    state == tsd_state_minimal_initialized);			\
	return tsd_##n##p_get_unsafe(tsd);				\
}
TSD_DATA_SLOW
TSD_DATA_FAST
TSD_DATA_SLOWER
#undef O

/*
 * tsdn_foop_get(tsdn) returns either the thread-local instance of foo (if tsdn
 * isn't NULL), or NULL (if tsdn is NULL), cast to the nullable pointer type.
 */
#define O(n, t, nt)							\
JEMALLOC_ALWAYS_INLINE nt *						\
tsdn_##n##p_get(tsdn_t *tsdn) {						\
	if (tsdn_null(tsdn)) {						\
		return NULL;						\
	}								\
	tsd_t *tsd = tsdn_tsd(tsdn);					\
	return (nt *)tsd_##n##p_get(tsd);				\
}
TSD_DATA_SLOW
TSD_DATA_FAST
TSD_DATA_SLOWER
#undef O
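
/*
 * The "nullable type" column matters here: for witness_tsd the nullable type
 * is witness_tsdn_t, so tsdn_witness_tsdp_get() yields a pointer type that is
 * allowed to be NULL.  A rough usage sketch:
 *
 *	arena_t **arenap = tsdn_arenap_get(tsdn);	// NULL when tsdn is NULL
 *	if (arenap != NULL) {
 *		// safe to dereference; tsdn wrapped a real tsd
 *	}
 */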

/* tsd_foo_get(tsd) returns the value of the thread-local instance of foo. */
#define O(n, t, nt)							\
JEMALLOC_ALWAYS_INLINE t						\
tsd_##n##_get(tsd_t *tsd) {						\
	return *tsd_##n##p_get(tsd);					\
}
TSD_DATA_SLOW
TSD_DATA_FAST
TSD_DATA_SLOWER
#undef O

/* tsd_foo_set(tsd, val) updates the thread-local instance of foo to be val. */
#define O(n, t, nt)							\
JEMALLOC_ALWAYS_INLINE void						\
tsd_##n##_set(tsd_t *tsd, t val) {					\
	assert(tsd_state_get(tsd) != tsd_state_reincarnated &&		\
	    tsd_state_get(tsd) != tsd_state_minimal_initialized);	\
	*tsd_##n##p_get(tsd) = val;					\
}
TSD_DATA_SLOW
TSD_DATA_FAST
TSD_DATA_SLOWER
#undef O
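
/*
 * Taken together, each O(name, type, nullable type) entry generates a small
 * accessor family.  For the prng_state field, for example, the macros above
 * expand to (roughly):
 *
 *	uint64_t *tsd_prng_statep_get_unsafe(tsd_t *tsd);
 *	uint64_t *tsd_prng_statep_get(tsd_t *tsd);
 *	uint64_t *tsdn_prng_statep_get(tsdn_t *tsdn);
 *	uint64_t  tsd_prng_state_get(tsd_t *tsd);
 *	void      tsd_prng_state_set(tsd_t *tsd, uint64_t val);
 */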

JEMALLOC_ALWAYS_INLINE void
tsd_assert_fast(tsd_t *tsd) {
	/*
	 * Note that our fastness assertion does *not* include global slowness
	 * counters; it's not in general possible to ensure that they won't
	 * change asynchronously from underneath us.
	 */
	assert(!malloc_slow && tsd_tcache_enabled_get(tsd) &&
	    tsd_reentrancy_level_get(tsd) == 0);
}

JEMALLOC_ALWAYS_INLINE bool
tsd_fast(tsd_t *tsd) {
	bool fast = (tsd_state_get(tsd) == tsd_state_nominal);
	if (fast) {
		tsd_assert_fast(tsd);
	}

	return fast;
}

JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_fetch_impl(bool init, bool minimal) {
	tsd_t *tsd = tsd_get(init);

	if (!init && tsd_get_allocates() && tsd == NULL) {
		return NULL;
	}
	assert(tsd != NULL);

	if (unlikely(tsd_state_get(tsd) != tsd_state_nominal)) {
		return tsd_fetch_slow(tsd, minimal);
	}
	assert(tsd_fast(tsd));
	tsd_assert_fast(tsd);

	return tsd;
}

/* Get a minimal TSD that requires no cleanup.  See comments in free(). */
JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_fetch_min(void) {
	return tsd_fetch_impl(true, true);
}

/* For use by internal background threads only. */
JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_internal_fetch(void) {
	tsd_t *tsd = tsd_fetch_min();
	/* Use reincarnated state to prevent full initialization. */
	tsd_state_set(tsd, tsd_state_reincarnated);

	return tsd;
}

JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_fetch(void) {
	return tsd_fetch_impl(true, false);
}
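
/*
 * Typical internal call sites fetch tsd once near the API boundary and thread
 * it through; a minimal sketch (usize is just a stand-in for a computed
 * allocation size):
 *
 *	tsd_t *tsd = tsd_fetch();
 *	uint64_t allocated = tsd_thread_allocated_get(tsd);
 *	tsd_thread_allocated_set(tsd, allocated + usize);
 */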

static inline bool
tsd_nominal(tsd_t *tsd) {
	bool nominal = tsd_state_get(tsd) <= tsd_state_nominal_max;
	assert(nominal || tsd_reentrancy_level_get(tsd) > 0);

	return nominal;
}

JEMALLOC_ALWAYS_INLINE tsdn_t *
tsdn_fetch(void) {
	if (!tsd_booted_get()) {
		return NULL;
	}

	return tsd_tsdn(tsd_fetch_impl(false, false));
}

JEMALLOC_ALWAYS_INLINE rtree_ctx_t *
tsd_rtree_ctx(tsd_t *tsd) {
	return tsd_rtree_ctxp_get(tsd);
}

JEMALLOC_ALWAYS_INLINE rtree_ctx_t *
tsdn_rtree_ctx(tsdn_t *tsdn, rtree_ctx_t *fallback) {
	/*
	 * If tsd cannot be accessed, initialize the fallback rtree_ctx and
	 * return a pointer to it.
	 */
	if (unlikely(tsdn_null(tsdn))) {
		rtree_ctx_data_init(fallback);
		return fallback;
	}
	return tsd_rtree_ctx(tsdn_tsd(tsdn));
}
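
/*
 * Callers that may run without tsd keep a caller-owned fallback on the stack;
 * sketched:
 *
 *	rtree_ctx_t fallback;
 *	rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &fallback);
 *	// rtree_ctx now points at either the thread-local cache or fallback
 */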

static inline bool
tsd_state_nocleanup(tsd_t *tsd) {
	return tsd_state_get(tsd) == tsd_state_reincarnated ||
	    tsd_state_get(tsd) == tsd_state_minimal_initialized;
}

/*
 * These "raw" tsd reentrancy functions don't have any debug checking to make
 * sure that we're not touching arena 0.  It is better to call pre_reentrancy
 * and post_reentrancy when that is possible.
 */
static inline void
tsd_pre_reentrancy_raw(tsd_t *tsd) {
	bool fast = tsd_fast(tsd);
	assert(tsd_reentrancy_level_get(tsd) < INT8_MAX);
	++*tsd_reentrancy_levelp_get(tsd);
	if (fast) {
		/* Prepare slow path for reentrancy. */
		tsd_slow_update(tsd);
		assert(tsd_state_get(tsd) == tsd_state_nominal_slow);
	}
}

static inline void
tsd_post_reentrancy_raw(tsd_t *tsd) {
	int8_t *reentrancy_level = tsd_reentrancy_levelp_get(tsd);
	assert(*reentrancy_level > 0);
	if (--*reentrancy_level == 0) {
		tsd_slow_update(tsd);
	}
}
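
/*
 * The intended pairing, sketched around a call that might re-enter the
 * allocator (call_user_callback is a hypothetical reentrancy-prone call):
 *
 *	tsd_pre_reentrancy_raw(tsd);
 *	call_user_callback();
 *	tsd_post_reentrancy_raw(tsd);
 *
 * Bumping the reentrancy level pushes the thread onto the slow path, so any
 * allocation made inside the callback avoids the fast-path assumptions above.
 */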

#endif /* JEMALLOC_INTERNAL_TSD_H */