#ifndef JEMALLOC_INTERNAL_BIT_UTIL_H
#define JEMALLOC_INTERNAL_BIT_UTIL_H

#include "jemalloc/internal/assert.h"

/* Sanity check. */
#if !defined(JEMALLOC_INTERNAL_FFSLL) || !defined(JEMALLOC_INTERNAL_FFSL) \
    || !defined(JEMALLOC_INTERNAL_FFS)
#  error JEMALLOC_INTERNAL_FFS{,L,LL} should have been defined by configure
#endif

/*
 * Unlike the builtin and POSIX ffs functions, our ffs requires a non-zero
 * input and returns the position of the lowest set bit (the POSIX versions
 * return 1 more than that position and use a return value of zero as a
 * sentinel).  This tends to simplify logic in callers, and allows for
 * consistency with the builtins we build fls on top of.
 */
static inline unsigned
ffs_llu(unsigned long long x) {
	util_assume(x != 0);
	return JEMALLOC_INTERNAL_FFSLL(x) - 1;
}

static inline unsigned
ffs_lu(unsigned long x) {
	util_assume(x != 0);
	return JEMALLOC_INTERNAL_FFSL(x) - 1;
}

static inline unsigned
ffs_u(unsigned x) {
	util_assume(x != 0);
	return JEMALLOC_INTERNAL_FFS(x) - 1;
}

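/*
 * Illustrative sketch (editorial addition, not part of the original header):
 * a throwaway helper showing the 0-based convention described above.  The
 * helper name is hypothetical.
 */
static inline void
ffs_u_example(void) {
	/* 0x18 == 0b11000: the lowest set bit is bit 3 (0-based). */
	assert(ffs_u(0x18U) == 3);
	/* POSIX ffs() would report 4 (1-based) for the same input. */
	/* A lone bit at position 0 yields 0; zero is never a sentinel. */
	assert(ffs_u(1U) == 0);
}
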
#define DO_FLS_SLOW(x, suffix) do {					\
	util_assume(x != 0);						\
	x |= (x >> 1);							\
	x |= (x >> 2);							\
	x |= (x >> 4);							\
	x |= (x >> 8);							\
	x |= (x >> 16);							\
	if (sizeof(x) > 4) {						\
		/*							\
		 * If sizeof(x) is 4, then the expression "x >> 32"	\
		 * will generate compiler warnings even if the code	\
		 * never executes.  This circumvents the warning, and	\
		 * gets compiled out in optimized builds.		\
		 */							\
		int constant_32 = sizeof(x) * 4;			\
		x |= (x >> constant_32);				\
	}								\
	x++;								\
	if (x == 0) {							\
		return 8 * sizeof(x) - 1;				\
	}								\
	return ffs_##suffix(x) - 1;					\
} while(0)

static inline unsigned
fls_llu_slow(unsigned long long x) {
	DO_FLS_SLOW(x, llu);
}

static inline unsigned
fls_lu_slow(unsigned long x) {
	DO_FLS_SLOW(x, lu);
}

static inline unsigned
fls_u_slow(unsigned x) {
	DO_FLS_SLOW(x, u);
}

#undef DO_FLS_SLOW

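/*
 * Editorial sketch (not part of the original header): a worked example of
 * the slow path above.  For x = 0x50 (binary 1010000), the shifts smear the
 * highest set bit downward, giving 0x7f; the increment yields 0x80, whose
 * lowest (and only) set bit sits one position above the original highest set
 * bit, so ffs_u(0x80) - 1 == 6.  The helper name is hypothetical.
 */
static inline void
fls_u_slow_example(void) {
	assert(fls_u_slow(0x50U) == 6);
}
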
#ifdef JEMALLOC_HAVE_BUILTIN_CLZ
static inline unsigned
fls_llu(unsigned long long x) {
	util_assume(x != 0);
	/*
	 * Note that the xor here is more naturally written as subtraction;
	 * the index of the last set bit is the number of bits in the type,
	 * minus one, minus the number of leading zero bits.  But GCC
	 * implements that as:
	 *    bsr     edi, edi
	 *    mov     eax, 31
	 *    xor     edi, 31
	 *    sub     eax, edi
	 * If we write it as xor instead, then we get
	 *    bsr     eax, edi
	 * as desired.
	 */
	return (8 * sizeof(x) - 1) ^ __builtin_clzll(x);
}

static inline unsigned
fls_lu(unsigned long x) {
	util_assume(x != 0);
	return (8 * sizeof(x) - 1) ^ __builtin_clzl(x);
}

static inline unsigned
fls_u(unsigned x) {
	util_assume(x != 0);
	return (8 * sizeof(x) - 1) ^ __builtin_clz(x);
}
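
/*
 * Editorial sketch (hypothetical helper, not part of jemalloc): since the
 * builtins above are only reached with x != 0, clz(x) never exceeds
 * 8 * sizeof(x) - 1 (an all-ones bit pattern), so the xor computes the same
 * value as the subtraction it replaces.  E.g. for x = 0x10, clz(x) == 27 on
 * a 32-bit unsigned, and both 31 - 27 and 31 ^ 27 equal 4.
 */
static inline void
fls_u_xor_example(void) {
	/* Assumes the common case of a 32-bit unsigned int. */
	assert(fls_u(0x10U) == 4);
	assert((8 * sizeof(unsigned) - 1) - __builtin_clz(0x10U) == 4);
}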
#elif defined(_MSC_VER)

#if LG_SIZEOF_PTR == 3
#define DO_BSR64(bit, x) _BitScanReverse64(&bit, x)
#else
/*
 * This never actually runs; we're just dodging a compiler error for the
 * never-taken branch where sizeof(void *) == 8.
 */
#define DO_BSR64(bit, x) bit = 0; unreachable()
#endif

#define DO_FLS(x) do {							\
	if (x == 0) {							\
		return 8 * sizeof(x);					\
	}								\
	unsigned long bit;						\
	if (sizeof(x) == 4) {						\
		_BitScanReverse(&bit, (unsigned)x);			\
		return (unsigned)bit;					\
	}								\
	if (sizeof(x) == 8 && sizeof(void *) == 8) {			\
		DO_BSR64(bit, x);					\
		return (unsigned)bit;					\
	}								\
	if (sizeof(x) == 8 && sizeof(void *) == 4) {			\
		/* Dodge a compiler warning, as above. */		\
		int constant_32 = sizeof(x) * 4;			\
		if (_BitScanReverse(&bit,				\
		    (unsigned)(x >> constant_32))) {			\
			return 32 + (unsigned)bit;			\
		} else {						\
			_BitScanReverse(&bit, (unsigned)x);		\
			return (unsigned)bit;				\
		}							\
	}								\
	unreachable();							\
} while (0)

static inline unsigned
fls_llu(unsigned long long x) {
	DO_FLS(x);
}

static inline unsigned
fls_lu(unsigned long x) {
	DO_FLS(x);
}

static inline unsigned
fls_u(unsigned x) {
	DO_FLS(x);
}

#undef DO_FLS
#undef DO_BSR64
#else

static inline unsigned
fls_llu(unsigned long long x) {
	return fls_llu_slow(x);
}

static inline unsigned
fls_lu(unsigned long x) {
	return fls_lu_slow(x);
}

static inline unsigned
fls_u(unsigned x) {
	return fls_u_slow(x);
}
#endif

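/*
 * Illustrative usage (editorial addition, not part of the original header):
 * whichever implementation was selected above, fls_* returns the 0-based
 * index of the highest set bit, mirroring the 0-based ffs_* convention.  The
 * helper name is hypothetical.
 */
static inline void
fls_examples(void) {
	assert(fls_u(1U) == 0);
	assert(fls_lu(0x80UL) == 7);
	assert(fls_llu(0x100000000ULL) == 32);
}
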
#if LG_SIZEOF_LONG_LONG > 3
#  error "Haven't implemented popcount for 16-byte ints."
#endif

#define DO_POPCOUNT(x, type) do {					\
	/*								\
	 * Algorithm from an old AMD optimization reference manual.	\
	 * We're putting a little bit more work than you might expect	\
	 * into the no-intrinsic case, since we only support the	\
	 * GCC intrinsics spelling of popcount (for now).  Detecting	\
	 * whether or not the popcount builtin is actually usable in	\
	 * MSVC is nontrivial.						\
	 */								\
									\
	type bmul = (type)0x0101010101010101ULL;			\
									\
	/*								\
	 * Replace each 2 bits with the sideways sum of the original	\
	 * values.  0x5 = 0b0101.					\
	 *								\
	 * You might expect this to be:					\
	 *   x = (x & 0x55...) + ((x >> 1) & 0x55...).			\
	 * That costs an extra mask relative to this, though.		\
	 */								\
	x = x - ((x >> 1) & (0x55U * bmul));				\
	/* Replace each 4 bits with their sideways sum.  0x3 = 0b0011. */\
	x = (x & (bmul * 0x33U)) + ((x >> 2) & (bmul * 0x33U));		\
	/*								\
	 * Replace each 8 bits with their sideways sum.  Note that we	\
	 * can't overflow within each 4-bit sum here, so we can skip	\
	 * the initial mask.						\
	 */								\
	x = (x + (x >> 4)) & (bmul * 0x0FU);				\
	/*								\
	 * None of the partial sums in this multiplication (viewed in	\
	 * base-256) can overflow into the next digit.  So the least	\
	 * significant byte of the product will be the least		\
	 * significant byte of the original value, the second least	\
	 * significant byte will be the sum of the two least		\
	 * significant bytes of the original value, and so on.		\
	 * Importantly, the high byte will be the byte-wise sum of all	\
	 * the bytes of the original value.				\
	 */								\
	x = x * bmul;							\
	x >>= ((sizeof(x) - 1) * 8);					\
	return (unsigned)x;						\
} while(0)

229 
230 static inline unsigned
231 popcount_u_slow(unsigned bitmap) {
232 	DO_POPCOUNT(bitmap, unsigned);
233 }
234 
235 static inline unsigned
236 popcount_lu_slow(unsigned long bitmap) {
237 	DO_POPCOUNT(bitmap, unsigned long);
238 }
239 
240 static inline unsigned
241 popcount_llu_slow(unsigned long long bitmap) {
242 	DO_POPCOUNT(bitmap, unsigned long long);
243 }
244 
245 #undef DO_POPCOUNT
246 
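/*
 * Editorial sketch (not part of the original header): spot checks of the
 * bit-twiddling popcount above.  The helper name is hypothetical.
 */
static inline void
popcount_slow_example(void) {
	/* Four bytes of four set bits each. */
	assert(popcount_u_slow(0xF0F0F0F0U) == 16);
	/* All 64 bits set. */
	assert(popcount_llu_slow(~0ULL) == 64);
	assert(popcount_lu_slow(0UL) == 0);
}
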
static inline unsigned
popcount_u(unsigned bitmap) {
#ifdef JEMALLOC_INTERNAL_POPCOUNT
	return JEMALLOC_INTERNAL_POPCOUNT(bitmap);
#else
	return popcount_u_slow(bitmap);
#endif
}

static inline unsigned
popcount_lu(unsigned long bitmap) {
#ifdef JEMALLOC_INTERNAL_POPCOUNTL
	return JEMALLOC_INTERNAL_POPCOUNTL(bitmap);
#else
	return popcount_lu_slow(bitmap);
#endif
}

static inline unsigned
popcount_llu(unsigned long long bitmap) {
#ifdef JEMALLOC_INTERNAL_POPCOUNTLL
	return JEMALLOC_INTERNAL_POPCOUNTLL(bitmap);
#else
	return popcount_llu_slow(bitmap);
#endif
}

/*
 * Clears the lowest set bit in bitmap, and returns the position of that
 * bit.  bitmap *must not* be 0.
 */
static inline size_t
cfs_lu(unsigned long* bitmap) {
	util_assume(*bitmap != 0);
	size_t bit = ffs_lu(*bitmap);
	*bitmap ^= ZU(1) << bit;
	return bit;
}

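/*
 * Illustrative usage (editorial addition): the classic loop for visiting
 * each set bit of a word from lowest to highest, consuming the bitmap as it
 * goes.  The helper name is hypothetical.
 */
static inline unsigned
cfs_lu_count_example(unsigned long bitmap) {
	unsigned n = 0;
	while (bitmap != 0) {
		size_t bit = cfs_lu(&bitmap);
		(void)bit;	/* 0-based index of the bit just cleared. */
		n++;
	}
	return n;	/* Equals popcount_lu() of the original value. */
}
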
static inline unsigned
ffs_zu(size_t x) {
#if LG_SIZEOF_PTR == LG_SIZEOF_INT
	return ffs_u(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
	return ffs_lu(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
	return ffs_llu(x);
#else
#error No implementation for size_t ffs()
#endif
}

static inline unsigned
fls_zu(size_t x) {
#if LG_SIZEOF_PTR == LG_SIZEOF_INT
	return fls_u(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
	return fls_lu(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
	return fls_llu(x);
#else
#error No implementation for size_t fls()
#endif
}

static inline unsigned
ffs_u64(uint64_t x) {
#if LG_SIZEOF_LONG == 3
	return ffs_lu(x);
#elif LG_SIZEOF_LONG_LONG == 3
	return ffs_llu(x);
#else
#error No implementation for 64-bit ffs()
#endif
}

static inline unsigned
fls_u64(uint64_t x) {
#if LG_SIZEOF_LONG == 3
	return fls_lu(x);
#elif LG_SIZEOF_LONG_LONG == 3
	return fls_llu(x);
#else
#error No implementation for 64-bit fls()
#endif
}

static inline unsigned
ffs_u32(uint32_t x) {
#if LG_SIZEOF_INT == 2
	return ffs_u(x);
#else
#error No implementation for 32-bit ffs()
#endif
	return ffs_u(x);
}

static inline unsigned
fls_u32(uint32_t x) {
#if LG_SIZEOF_INT == 2
	return fls_u(x);
#else
#error No implementation for 32-bit fls()
#endif
	return fls_u(x);
}

static inline uint64_t
pow2_ceil_u64(uint64_t x) {
	if (unlikely(x <= 1)) {
		return x;
	}
	size_t msb_on_index = fls_u64(x - 1);
	/*
	 * Range-check; it's on the callers to ensure that the result of this
	 * call won't overflow.
	 */
	assert(msb_on_index < 63);
	return 1ULL << (msb_on_index + 1);
}

static inline uint32_t
pow2_ceil_u32(uint32_t x) {
	if (unlikely(x <= 1)) {
		return x;
	}
	size_t msb_on_index = fls_u32(x - 1);
	/* As above. */
	assert(msb_on_index < 31);
	return 1U << (msb_on_index + 1);
}

/* Compute the smallest power of 2 that is >= x. */
static inline size_t
pow2_ceil_zu(size_t x) {
#if (LG_SIZEOF_PTR == 3)
	return pow2_ceil_u64(x);
#else
	return pow2_ceil_u32(x);
#endif
}

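/*
 * Editorial sketch (not part of the original header): the rounding behavior
 * of pow2_ceil_*.  Powers of two are returned unchanged; anything else
 * rounds up to the next power of two.  The helper name is hypothetical.
 */
static inline void
pow2_ceil_example(void) {
	assert(pow2_ceil_u32(5) == 8);
	assert(pow2_ceil_u64(8) == 8);
	assert(pow2_ceil_zu(1) == 1);
}
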
static inline unsigned
lg_floor(size_t x) {
	util_assume(x != 0);
#if (LG_SIZEOF_PTR == 3)
	return fls_u64(x);
#else
	return fls_u32(x);
#endif
}

static inline unsigned
lg_ceil(size_t x) {
	return lg_floor(x) + ((x & (x - 1)) == 0 ? 0 : 1);
}

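/*
 * Illustrative usage (editorial addition): lg_floor rounds the base-2 log
 * down, lg_ceil rounds it up; they agree exactly on powers of two.  The
 * helper name is hypothetical.
 */
static inline void
lg_floor_ceil_example(void) {
	assert(lg_floor(8) == 3 && lg_ceil(8) == 3);
	assert(lg_floor(9) == 3 && lg_ceil(9) == 4);
}
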
/* A compile-time version of lg_floor and lg_ceil. */
#define LG_FLOOR_1(x) 0
#define LG_FLOOR_2(x) (x < (1ULL << 1) ? LG_FLOOR_1(x) : 1 + LG_FLOOR_1(x >> 1))
#define LG_FLOOR_4(x) (x < (1ULL << 2) ? LG_FLOOR_2(x) : 2 + LG_FLOOR_2(x >> 2))
#define LG_FLOOR_8(x) (x < (1ULL << 4) ? LG_FLOOR_4(x) : 4 + LG_FLOOR_4(x >> 4))
#define LG_FLOOR_16(x) (x < (1ULL << 8) ? LG_FLOOR_8(x) : 8 + LG_FLOOR_8(x >> 8))
#define LG_FLOOR_32(x) (x < (1ULL << 16) ? LG_FLOOR_16(x) : 16 + LG_FLOOR_16(x >> 16))
#define LG_FLOOR_64(x) (x < (1ULL << 32) ? LG_FLOOR_32(x) : 32 + LG_FLOOR_32(x >> 32))
#if LG_SIZEOF_PTR == 2
#  define LG_FLOOR(x) LG_FLOOR_32((x))
#else
#  define LG_FLOOR(x) LG_FLOOR_64((x))
#endif

#define LG_CEIL(x) (LG_FLOOR(x) + (((x) & ((x) - 1)) == 0 ? 0 : 1))

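/*
 * Editorial sketch (not part of the original header): LG_FLOOR/LG_CEIL are
 * usable in constant expressions, e.g. to size arrays or enumerators at
 * compile time.  The typedef below is a hypothetical compile-time spot
 * check: it only compiles if both comparisons hold.
 */
typedef char lg_floor_compile_time_example
    [LG_FLOOR(4096) == 12 && LG_CEIL(4097) == 13 ? 1 : -1];
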
#endif /* JEMALLOC_INTERNAL_BIT_UTIL_H */