xref: /freebsd/contrib/jemalloc/src/sec.c (revision c43cad87172039ccf38172129c79755ea79e6102)
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_includes.h"

#include "jemalloc/internal/sec.h"

static edata_t *sec_alloc(tsdn_t *tsdn, pai_t *self, size_t size,
    size_t alignment, bool zero, bool guarded, bool frequent_reuse,
    bool *deferred_work_generated);
static bool sec_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool zero, bool *deferred_work_generated);
static bool sec_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool *deferred_work_generated);
static void sec_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    bool *deferred_work_generated);

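/* Reset a bin to its empty state: no batch fill in flight, nothing cached. */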
static void
sec_bin_init(sec_bin_t *bin) {
	bin->being_batch_filled = false;
	bin->bytes_cur = 0;
	edata_list_active_init(&bin->freelist);
}

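/*
 * Initialize an SEC on top of the fallback PAI.  All shards and their bins
 * are carved out of a single base allocation, laid out as
 *     [shard 0 .. shard nshards-1][bins for shard 0][bins for shard 1] ...
 * with npsizes bins per shard (one per page-size class up to max_alloc).
 * Returns true on failure, per the usual jemalloc convention.
 */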
bool
sec_init(tsdn_t *tsdn, sec_t *sec, base_t *base, pai_t *fallback,
    const sec_opts_t *opts) {
	assert(opts->max_alloc >= PAGE);

	size_t max_alloc = PAGE_FLOOR(opts->max_alloc);
	pszind_t npsizes = sz_psz2ind(max_alloc) + 1;

	size_t sz_shards = opts->nshards * sizeof(sec_shard_t);
	size_t sz_bins = opts->nshards * (size_t)npsizes * sizeof(sec_bin_t);
	size_t sz_alloc = sz_shards + sz_bins;
	void *dynalloc = base_alloc(tsdn, base, sz_alloc, CACHELINE);
	if (dynalloc == NULL) {
		return true;
	}
	sec_shard_t *shard_cur = (sec_shard_t *)dynalloc;
	sec->shards = shard_cur;
	sec_bin_t *bin_cur = (sec_bin_t *)&shard_cur[opts->nshards];
	/* Just for asserts, below. */
	sec_bin_t *bin_start = bin_cur;

	for (size_t i = 0; i < opts->nshards; i++) {
		sec_shard_t *shard = shard_cur;
		shard_cur++;
		bool err = malloc_mutex_init(&shard->mtx, "sec_shard",
		    WITNESS_RANK_SEC_SHARD, malloc_mutex_rank_exclusive);
		if (err) {
			return true;
		}
		shard->enabled = true;
		shard->bins = bin_cur;
		for (pszind_t j = 0; j < npsizes; j++) {
			sec_bin_init(&shard->bins[j]);
			bin_cur++;
		}
		shard->bytes_cur = 0;
		shard->to_flush_next = 0;
	}
	/*
	 * The bins should start exactly at the first unused byte after the
	 * shards.
	 */
	assert((void *)shard_cur == (void *)bin_start);
	/* And the last bin should end exactly at the end of the allocation. */
	assert((char *)bin_cur == ((char *)dynalloc + sz_alloc));
	sec->fallback = fallback;

	sec->opts = *opts;
	sec->npsizes = npsizes;

	/*
	 * Initialize these last so that an improper use of an SEC whose
	 * initialization failed will segfault in an easy-to-spot way.
	 */
	sec->pai.alloc = &sec_alloc;
	sec->pai.alloc_batch = &pai_alloc_batch_default;
	sec->pai.expand = &sec_expand;
	sec->pai.shrink = &sec_shrink;
	sec->pai.dalloc = &sec_dalloc;
	sec->pai.dalloc_batch = &pai_dalloc_batch_default;

	return false;
}

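/*
 * Map the calling thread to a shard.  The choice is made once per thread and
 * cached in TSD; callers without TSD all fall back to shard 0.
 */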
static sec_shard_t *
sec_shard_pick(tsdn_t *tsdn, sec_t *sec) {
	/*
	 * Eventually, we should implement affinity, tracking source shard using
	 * the edata_t's newly freed up fields.  For now, just randomly
	 * distribute across all shards.
	 */
	if (tsdn_null(tsdn)) {
		return &sec->shards[0];
	}
	tsd_t *tsd = tsdn_tsd(tsdn);
	uint8_t *idxp = tsd_sec_shardp_get(tsd);
	if (*idxp == (uint8_t)-1) {
		/*
		 * First use; initialize using the trick from Daniel Lemire's
		 * "A fast alternative to the modulo reduction."  The 32-bit
		 * random value is held in a 64-bit variable because the
		 * multiplication by the number of shards deliberately
		 * overflows 32 bits; the high 32 bits of the product then
		 * give an index in [0, nshards).  E.g. with 4 shards, idx is
		 * simply the top two bits of the random value.
		 */
		uint64_t rand32 = prng_lg_range_u64(tsd_prng_statep_get(tsd), 32);
		uint32_t idx =
		    (uint32_t)((rand32 * (uint64_t)sec->opts.nshards) >> 32);
		assert(idx < (uint32_t)sec->opts.nshards);
		*idxp = (uint8_t)idx;
	}
	return &sec->shards[*idxp];
}

/*
 * Perhaps surprisingly, this can be called in the alloc pathways; if we hit an
 * empty cache, we'll try to fill it, which can push the shard over its limit.
 */
static void
sec_flush_some_and_unlock(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	edata_list_active_t to_flush;
	edata_list_active_init(&to_flush);
	while (shard->bytes_cur > sec->opts.bytes_after_flush) {
		/* Pick a victim. */
		sec_bin_t *bin = &shard->bins[shard->to_flush_next];

		/* Update our victim-picking state. */
		shard->to_flush_next++;
		if (shard->to_flush_next == sec->npsizes) {
			shard->to_flush_next = 0;
		}

		assert(shard->bytes_cur >= bin->bytes_cur);
		if (bin->bytes_cur != 0) {
			shard->bytes_cur -= bin->bytes_cur;
			bin->bytes_cur = 0;
			edata_list_active_concat(&to_flush, &bin->freelist);
		}
		/*
		 * Either bin->bytes_cur was 0, in which case we didn't touch
		 * the bin list but it should be empty anyway (or else we
		 * missed a bytes_cur update on a list modification), or it
		 * *wasn't* 0 and we emptied it ourselves.  Either way, it
		 * should be empty now.
		 */
		assert(edata_list_active_empty(&bin->freelist));
	}

	malloc_mutex_unlock(tsdn, &shard->mtx);
	bool deferred_work_generated = false;
	pai_dalloc_batch(tsdn, sec->fallback, &to_flush,
	    &deferred_work_generated);
}

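/*
 * Try to satisfy an allocation from a bin's freelist.  Returns NULL if the
 * shard is disabled or the bin is empty; the caller must hold shard->mtx.
 */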
static edata_t *
sec_shard_alloc_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
    sec_bin_t *bin) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	if (!shard->enabled) {
		return NULL;
	}
	edata_t *edata = edata_list_active_first(&bin->freelist);
	if (edata != NULL) {
		edata_list_active_remove(&bin->freelist, edata);
		assert(edata_size_get(edata) <= bin->bytes_cur);
		bin->bytes_cur -= edata_size_get(edata);
		assert(edata_size_get(edata) <= shard->bytes_cur);
		shard->bytes_cur -= edata_size_get(edata);
	}
	return edata;
}

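/*
 * Refill an empty bin from the fallback: request 1 + batch_fill_extra extents
 * of the given size, return the first to the caller, and cache the rest.  If
 * caching the extras pushes the shard past max_bytes, flush before unlocking.
 */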
static edata_t *
sec_batch_fill_and_alloc(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
    sec_bin_t *bin, size_t size) {
	malloc_mutex_assert_not_owner(tsdn, &shard->mtx);

	edata_list_active_t result;
	edata_list_active_init(&result);
	bool deferred_work_generated = false;
	size_t nalloc = pai_alloc_batch(tsdn, sec->fallback, size,
	    1 + sec->opts.batch_fill_extra, &result, &deferred_work_generated);

	edata_t *ret = edata_list_active_first(&result);
	if (ret != NULL) {
		edata_list_active_remove(&result, ret);
	}

	malloc_mutex_lock(tsdn, &shard->mtx);
	bin->being_batch_filled = false;
	/*
	 * Handle the easy case first: nothing to cache.  Note that this can
	 * only happen in case of OOM, since sec_alloc checks the expected
	 * number of allocs, and doesn't bother going down the batch_fill
	 * pathway if there won't be anything left to cache.  So to be in this
	 * code path, we must have asked for > 1 alloc, but gotten at most one
	 * back.
	 */
	if (nalloc <= 1) {
		malloc_mutex_unlock(tsdn, &shard->mtx);
		return ret;
	}

	size_t new_cached_bytes = (nalloc - 1) * size;

	edata_list_active_concat(&bin->freelist, &result);
	bin->bytes_cur += new_cached_bytes;
	shard->bytes_cur += new_cached_bytes;

	if (shard->bytes_cur > sec->opts.max_bytes) {
		sec_flush_some_and_unlock(tsdn, sec, shard);
	} else {
		malloc_mutex_unlock(tsdn, &shard->mtx);
	}

	return ret;
}

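/*
 * pai_t alloc hook.  Requests that can't be cached (zeroed, aligned beyond a
 * page, larger than max_alloc, or with no shards configured) go straight to
 * the fallback.  Otherwise we try the bin for this size class, and on a miss
 * either batch-fill it or fall back for just this one extent.
 */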
static edata_t *
sec_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero,
    bool guarded, bool frequent_reuse, bool *deferred_work_generated) {
	assert((size & PAGE_MASK) == 0);
	assert(!guarded);

	sec_t *sec = (sec_t *)self;

	if (zero || alignment > PAGE || sec->opts.nshards == 0
	    || size > sec->opts.max_alloc) {
		return pai_alloc(tsdn, sec->fallback, size, alignment, zero,
		    /* guarded */ false, frequent_reuse,
		    deferred_work_generated);
	}
	pszind_t pszind = sz_psz2ind(size);
	assert(pszind < sec->npsizes);

	sec_shard_t *shard = sec_shard_pick(tsdn, sec);
	sec_bin_t *bin = &shard->bins[pszind];
	bool do_batch_fill = false;

	malloc_mutex_lock(tsdn, &shard->mtx);
	edata_t *edata = sec_shard_alloc_locked(tsdn, sec, shard, bin);
	if (edata == NULL) {
		if (!bin->being_batch_filled
		    && sec->opts.batch_fill_extra > 0) {
			bin->being_batch_filled = true;
			do_batch_fill = true;
		}
	}
	malloc_mutex_unlock(tsdn, &shard->mtx);
	if (edata == NULL) {
		if (do_batch_fill) {
			edata = sec_batch_fill_and_alloc(tsdn, sec, shard, bin,
			    size);
		} else {
			edata = pai_alloc(tsdn, sec->fallback, size, alignment,
			    zero, /* guarded */ false, frequent_reuse,
			    deferred_work_generated);
		}
	}
	return edata;
}

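/*
 * Expand and shrink never interact with the cache; the SEC only caches whole
 * freed extents, so both operations simply forward to the fallback.
 */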
static bool
sec_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
    size_t new_size, bool zero, bool *deferred_work_generated) {
	sec_t *sec = (sec_t *)self;
	return pai_expand(tsdn, sec->fallback, edata, old_size, new_size, zero,
	    deferred_work_generated);
}

static bool
sec_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
    size_t new_size, bool *deferred_work_generated) {
	sec_t *sec = (sec_t *)self;
	return pai_shrink(tsdn, sec->fallback, edata, old_size, new_size,
	    deferred_work_generated);
}

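/* Return every cached extent in the shard to the fallback allocator. */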
static void
sec_flush_all_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	shard->bytes_cur = 0;
	edata_list_active_t to_flush;
	edata_list_active_init(&to_flush);
	for (pszind_t i = 0; i < sec->npsizes; i++) {
		sec_bin_t *bin = &shard->bins[i];
		bin->bytes_cur = 0;
		edata_list_active_concat(&to_flush, &bin->freelist);
	}

	/*
	 * Ordinarily we would try to avoid doing the batch deallocation while
	 * holding the shard mutex, but the flush_all pathways only happen when
	 * we're disabling the HPA or resetting the arena, both of which are
	 * rare pathways.
	 */
	bool deferred_work_generated = false;
	pai_dalloc_batch(tsdn, sec->fallback, &to_flush,
	    &deferred_work_generated);
}

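/*
 * Cache a freed extent in its size-class bin, then drop the shard lock.  If
 * the addition pushes the shard past max_bytes, bins are flushed round-robin
 * until bytes_cur drops back to bytes_after_flush.
 */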
static void
sec_shard_dalloc_and_unlock(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
    edata_t *edata) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	assert(shard->bytes_cur <= sec->opts.max_bytes);
	size_t size = edata_size_get(edata);
	pszind_t pszind = sz_psz2ind(size);
	assert(pszind < sec->npsizes);
	/*
	 * Prepending here results in LIFO allocation per bin, which seems
	 * reasonable.
	 */
	sec_bin_t *bin = &shard->bins[pszind];
	edata_list_active_prepend(&bin->freelist, edata);
	bin->bytes_cur += size;
	shard->bytes_cur += size;
	if (shard->bytes_cur > sec->opts.max_bytes) {
		/*
		 * We've exceeded the shard limit.  Rather than evicting
		 * individual extents, we flush whole bins in round-robin
		 * order until the shard is back under bytes_after_flush; the
		 * flushed extents are handed back to the fallback only after
		 * the shard lock has been dropped (see
		 * sec_flush_some_and_unlock).
		 */
		sec_flush_some_and_unlock(tsdn, sec, shard);
		malloc_mutex_assert_not_owner(tsdn, &shard->mtx);
	} else {
		malloc_mutex_unlock(tsdn, &shard->mtx);
	}
}

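/*
 * pai_t dalloc hook.  Extents too large to cache (or an SEC with no shards,
 * or a disabled shard) go straight back to the fallback; everything else is
 * cached in the calling thread's shard.
 */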
static void
sec_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    bool *deferred_work_generated) {
	sec_t *sec = (sec_t *)self;
	if (sec->opts.nshards == 0
	    || edata_size_get(edata) > sec->opts.max_alloc) {
		pai_dalloc(tsdn, sec->fallback, edata,
		    deferred_work_generated);
		return;
	}
	sec_shard_t *shard = sec_shard_pick(tsdn, sec);
	malloc_mutex_lock(tsdn, &shard->mtx);
	if (shard->enabled) {
		sec_shard_dalloc_and_unlock(tsdn, sec, shard, edata);
	} else {
		malloc_mutex_unlock(tsdn, &shard->mtx);
		pai_dalloc(tsdn, sec->fallback, edata,
		    deferred_work_generated);
	}
}

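/* Flush every shard's cache back to the fallback allocator. */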
void
sec_flush(tsdn_t *tsdn, sec_t *sec) {
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
		sec_flush_all_locked(tsdn, sec, &sec->shards[i]);
		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
	}
}

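/* Stop caching in every shard and flush whatever is currently cached. */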
void
sec_disable(tsdn_t *tsdn, sec_t *sec) {
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
		sec->shards[i].enabled = false;
		sec_flush_all_locked(tsdn, sec, &sec->shards[i]);
		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
	}
}

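/* Add the number of cached bytes across all shards to stats->bytes. */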
void
sec_stats_merge(tsdn_t *tsdn, sec_t *sec, sec_stats_t *stats) {
	size_t sum = 0;
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		/*
		 * We could save these lock acquisitions by making bytes_cur
		 * atomic, but stats collection is rare anyway and we expect
		 * the number and type of stats to get more interesting.
		 */
		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
		sum += sec->shards[i].bytes_cur;
		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
	}
	stats->bytes += sum;
}

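/* Accumulate profiling data from every shard mutex into mutex_prof_data. */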
void
sec_mutex_stats_read(tsdn_t *tsdn, sec_t *sec,
    mutex_prof_data_t *mutex_prof_data) {
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
		malloc_mutex_prof_accum(tsdn, mutex_prof_data,
		    &sec->shards[i].mtx);
		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
	}
}

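/*
 * Fork handlers: every shard mutex is acquired before fork() so that no shard
 * is mid-operation when the process forks.
 */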
void
sec_prefork2(tsdn_t *tsdn, sec_t *sec) {
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		malloc_mutex_prefork(tsdn, &sec->shards[i].mtx);
	}
}

void
sec_postfork_parent(tsdn_t *tsdn, sec_t *sec) {
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		malloc_mutex_postfork_parent(tsdn, &sec->shards[i].mtx);
	}
}

void
sec_postfork_child(tsdn_t *tsdn, sec_t *sec) {
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		malloc_mutex_postfork_child(tsdn, &sec->shards[i].mtx);
	}
}