#ifndef JEMALLOC_INTERNAL_BIT_UTIL_H
#define JEMALLOC_INTERNAL_BIT_UTIL_H

#include "jemalloc/internal/assert.h"

/* Sanity check. */
#if !defined(JEMALLOC_INTERNAL_FFSLL) || !defined(JEMALLOC_INTERNAL_FFSL) \
    || !defined(JEMALLOC_INTERNAL_FFS)
# error JEMALLOC_INTERNAL_FFS{,L,LL} should have been defined by configure
#endif

/*
 * Unlike the builtins and posix ffs functions, our ffs requires a non-zero
 * input, and returns the position of the lowest bit set (as opposed to the
 * posix versions, which return 1 larger than that position and use a return
 * value of zero as a sentinel).  This tends to simplify logic in callers, and
 * allows for consistency with the builtins we build fls on top of.
 */
static inline unsigned
ffs_llu(unsigned long long x) {
	util_assume(x != 0);
	return JEMALLOC_INTERNAL_FFSLL(x) - 1;
}

static inline unsigned
ffs_lu(unsigned long x) {
	util_assume(x != 0);
	return JEMALLOC_INTERNAL_FFSL(x) - 1;
}

static inline unsigned
ffs_u(unsigned x) {
	util_assume(x != 0);
	return JEMALLOC_INTERNAL_FFS(x) - 1;
}

#define DO_FLS_SLOW(x, suffix) do { \
	util_assume(x != 0); \
	x |= (x >> 1); \
	x |= (x >> 2); \
	x |= (x >> 4); \
	x |= (x >> 8); \
	x |= (x >> 16); \
	if (sizeof(x) > 4) { \
		/* \
		 * If sizeof(x) is 4, then the expression "x >> 32" \
		 * will generate compiler warnings even if the code \
		 * never executes.  This circumvents the warning, and \
		 * gets compiled out in optimized builds. \
		 */ \
		int constant_32 = sizeof(x) * 4; \
		x |= (x >> constant_32); \
	} \
	x++; \
	if (x == 0) { \
		return 8 * sizeof(x) - 1; \
	} \
	return ffs_##suffix(x) - 1; \
} while(0)

static inline unsigned
fls_llu_slow(unsigned long long x) {
	DO_FLS_SLOW(x, llu);
}

static inline unsigned
fls_lu_slow(unsigned long x) {
	DO_FLS_SLOW(x, lu);
}

static inline unsigned
fls_u_slow(unsigned x) {
	DO_FLS_SLOW(x, u);
}

#undef DO_FLS_SLOW

#ifdef JEMALLOC_HAVE_BUILTIN_CLZ
static inline unsigned
fls_llu(unsigned long long x) {
	util_assume(x != 0);
	/*
	 * Note that the xor here is more naturally written as subtraction;
	 * the position of the last bit set is the number of bits in the
	 * type, minus one, minus the number of leading zero bits.  But GCC
	 * implements that as:
	 *     bsr edi, edi
	 *     mov eax, 31
	 *     xor edi, 31
	 *     sub eax, edi
	 * If we write it as xor instead, then we get
	 *     bsr eax, edi
	 * as desired.
	 */
	return (8 * sizeof(x) - 1) ^ __builtin_clzll(x);
}

static inline unsigned
fls_lu(unsigned long x) {
	util_assume(x != 0);
	return (8 * sizeof(x) - 1) ^ __builtin_clzl(x);
}

static inline unsigned
fls_u(unsigned x) {
	util_assume(x != 0);
	return (8 * sizeof(x) - 1) ^ __builtin_clz(x);
}
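
/*
 * A quick worked example of the conventions above, assuming a 32-bit
 * unsigned: for x = 0x10, __builtin_clz(x) is 27 and 31 ^ 27 == 4, so
 * fls_u(0x10) == 4.  Since only one bit is set, ffs_u(0x10) == 4 as well,
 * whereas POSIX ffs(0x10) would return 5.
 */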
#elif defined(_MSC_VER)

#if LG_SIZEOF_PTR == 3
#define DO_BSR64(bit, x) _BitScanReverse64(&bit, x)
#else
/*
 * This never actually runs; we're just dodging a compiler error for the
 * never-taken branch where sizeof(void *) == 8.
 */
#define DO_BSR64(bit, x) bit = 0; unreachable()
#endif

#define DO_FLS(x) do { \
	if (x == 0) { \
		return 8 * sizeof(x); \
	} \
	unsigned long bit; \
	if (sizeof(x) == 4) { \
		_BitScanReverse(&bit, (unsigned)x); \
		return (unsigned)bit; \
	} \
	if (sizeof(x) == 8 && sizeof(void *) == 8) { \
		DO_BSR64(bit, x); \
		return (unsigned)bit; \
	} \
	if (sizeof(x) == 8 && sizeof(void *) == 4) { \
		/* Dodge a compiler warning, as above. */ \
		int constant_32 = sizeof(x) * 4; \
		if (_BitScanReverse(&bit, \
		    (unsigned)(x >> constant_32))) { \
			return 32 + (unsigned)bit; \
		} else { \
			_BitScanReverse(&bit, (unsigned)x); \
			return (unsigned)bit; \
		} \
	} \
	unreachable(); \
} while (0)

static inline unsigned
fls_llu(unsigned long long x) {
	DO_FLS(x);
}

static inline unsigned
fls_lu(unsigned long x) {
	DO_FLS(x);
}

static inline unsigned
fls_u(unsigned x) {
	DO_FLS(x);
}

#undef DO_FLS
#undef DO_BSR64
#else

static inline unsigned
fls_llu(unsigned long long x) {
	return fls_llu_slow(x);
}

static inline unsigned
fls_lu(unsigned long x) {
	return fls_lu_slow(x);
}

static inline unsigned
fls_u(unsigned x) {
	return fls_u_slow(x);
}
#endif

#if LG_SIZEOF_LONG_LONG > 3
# error "Haven't implemented popcount for 16-byte ints."
#endif

#define DO_POPCOUNT(x, type) do { \
	/* \
	 * Algorithm from an old AMD optimization reference manual. \
	 * We're putting a little bit more work than you might expect \
	 * into the no-intrinsic case, since we only support the \
	 * GCC intrinsics spelling of popcount (for now).  Detecting \
	 * whether or not the popcount builtin is actually usable in \
	 * MSVC is nontrivial. \
	 */ \
	\
	type bmul = (type)0x0101010101010101ULL; \
	\
	/* \
	 * Replace each 2 bits with the sideways sum of the original \
	 * values.  0x5 = 0b0101. \
	 * \
	 * You might expect this to be: \
	 *     x = (x & 0x55...) + ((x >> 1) & 0x55...). \
	 * That costs an extra mask relative to this, though. \
	 */ \
	x = x - ((x >> 1) & (0x55U * bmul)); \
	/* Replace each 4 bits with their sideways sum.  0x3 = 0b0011. */ \
	x = (x & (bmul * 0x33U)) + ((x >> 2) & (bmul * 0x33U)); \
	/* \
	 * Replace each 8 bits with their sideways sum.  Note that we \
	 * can't overflow within each 4-bit sum here, so we can skip \
	 * the initial mask. \
	 */ \
	x = (x + (x >> 4)) & (bmul * 0x0FU); \
	/* \
	 * None of the partial sums in this multiplication (viewed in \
	 * base-256) can overflow into the next digit.  So the least \
	 * significant byte of the product will be the least \
	 * significant byte of the original value, the second least \
	 * significant byte will be the sum of the two least \
	 * significant bytes of the original value, and so on. \
	 * Importantly, the high byte will be the byte-wise sum of all \
	 * the bytes of the original value. \
	 */ \
	x = x * bmul; \
	x >>= ((sizeof(x) - 1) * 8); \
	return (unsigned)x; \
} while(0)
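
/*
 * For reference, a worked trace of DO_POPCOUNT for x = 0xF0 with a 32-bit
 * type, so bmul == 0x01010101:
 *   after the 2-bit step:  x == 0xA0        (pair counts 2, 2, 0, 0)
 *   after the 4-bit step:  x == 0x40        (nibble counts 4, 0)
 *   after the 8-bit step:  x == 0x04        (byte counts 0, 0, 0, 4)
 *   after the multiply:    x == 0x04040404  (high byte holds the total)
 * Shifting right by 24 then yields popcount(0xF0) == 4.
 */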

static inline unsigned
popcount_u_slow(unsigned bitmap) {
	DO_POPCOUNT(bitmap, unsigned);
}

static inline unsigned
popcount_lu_slow(unsigned long bitmap) {
	DO_POPCOUNT(bitmap, unsigned long);
}

static inline unsigned
popcount_llu_slow(unsigned long long bitmap) {
	DO_POPCOUNT(bitmap, unsigned long long);
}

#undef DO_POPCOUNT

static inline unsigned
popcount_u(unsigned bitmap) {
#ifdef JEMALLOC_INTERNAL_POPCOUNT
	return JEMALLOC_INTERNAL_POPCOUNT(bitmap);
#else
	return popcount_u_slow(bitmap);
#endif
}

static inline unsigned
popcount_lu(unsigned long bitmap) {
#ifdef JEMALLOC_INTERNAL_POPCOUNTL
	return JEMALLOC_INTERNAL_POPCOUNTL(bitmap);
#else
	return popcount_lu_slow(bitmap);
#endif
}

static inline unsigned
popcount_llu(unsigned long long bitmap) {
#ifdef JEMALLOC_INTERNAL_POPCOUNTLL
	return JEMALLOC_INTERNAL_POPCOUNTLL(bitmap);
#else
	return popcount_llu_slow(bitmap);
#endif
}

/*
 * Clears the first set bit in bitmap, and returns the position of that bit.
 * bitmap *must not* be 0.
 */
static inline size_t
cfs_lu(unsigned long *bitmap) {
	util_assume(*bitmap != 0);
	size_t bit = ffs_lu(*bitmap);
	*bitmap ^= ZU(1) << bit;
	return bit;
}

static inline unsigned
ffs_zu(size_t x) {
#if LG_SIZEOF_PTR == LG_SIZEOF_INT
	return ffs_u(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
	return ffs_lu(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
	return ffs_llu(x);
#else
#error No implementation for size_t ffs()
#endif
}

static inline unsigned
fls_zu(size_t x) {
#if LG_SIZEOF_PTR == LG_SIZEOF_INT
	return fls_u(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
	return fls_lu(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
	return fls_llu(x);
#else
#error No implementation for size_t fls()
#endif
}

static inline unsigned
ffs_u64(uint64_t x) {
#if LG_SIZEOF_LONG == 3
	return ffs_lu(x);
#elif LG_SIZEOF_LONG_LONG == 3
	return ffs_llu(x);
#else
#error No implementation for 64-bit ffs()
#endif
}

static inline unsigned
fls_u64(uint64_t x) {
#if LG_SIZEOF_LONG == 3
	return fls_lu(x);
#elif LG_SIZEOF_LONG_LONG == 3
	return fls_llu(x);
#else
#error No implementation for 64-bit fls()
#endif
}

static inline unsigned
ffs_u32(uint32_t x) {
#if LG_SIZEOF_INT == 2
	return ffs_u(x);
#else
#error No implementation for 32-bit ffs()
#endif
	return ffs_u(x);
}

static inline unsigned
fls_u32(uint32_t x) {
#if LG_SIZEOF_INT == 2
	return fls_u(x);
#else
#error No implementation for 32-bit fls()
#endif
	return fls_u(x);
}

static inline uint64_t
pow2_ceil_u64(uint64_t x) {
	if (unlikely(x <= 1)) {
		return x;
	}
	size_t msb_on_index = fls_u64(x - 1);
	/*
	 * Range-check; it's on the callers to ensure that the result of this
	 * call won't overflow.
	 */
	assert(msb_on_index < 63);
	return 1ULL << (msb_on_index + 1);
}

static inline uint32_t
pow2_ceil_u32(uint32_t x) {
	if (unlikely(x <= 1)) {
		return x;
	}
	size_t msb_on_index = fls_u32(x - 1);
	/* As above. */
	assert(msb_on_index < 31);
	return 1U << (msb_on_index + 1);
}
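
/*
 * For reference, a few expected values under the definitions above:
 *   pow2_ceil_u32(0) == 0,  pow2_ceil_u32(1) == 1,  pow2_ceil_u32(5) == 8,
 *   pow2_ceil_u32(8) == 8,  pow2_ceil_u32(9) == 16.
 */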

/* Compute the smallest power of 2 that is >= x. */
static inline size_t
pow2_ceil_zu(size_t x) {
#if (LG_SIZEOF_PTR == 3)
	return pow2_ceil_u64(x);
#else
	return pow2_ceil_u32(x);
#endif
}

static inline unsigned
lg_floor(size_t x) {
	util_assume(x != 0);
#if (LG_SIZEOF_PTR == 3)
	return fls_u64(x);
#else
	return fls_u32(x);
#endif
}

static inline unsigned
lg_ceil(size_t x) {
	return lg_floor(x) + ((x & (x - 1)) == 0 ? 0 : 1);
}

/* A compile-time version of lg_floor and lg_ceil. */
#define LG_FLOOR_1(x) 0
#define LG_FLOOR_2(x) (x < (1ULL << 1) ? LG_FLOOR_1(x) : 1 + LG_FLOOR_1(x >> 1))
#define LG_FLOOR_4(x) (x < (1ULL << 2) ? LG_FLOOR_2(x) : 2 + LG_FLOOR_2(x >> 2))
#define LG_FLOOR_8(x) (x < (1ULL << 4) ? LG_FLOOR_4(x) : 4 + LG_FLOOR_4(x >> 4))
#define LG_FLOOR_16(x) (x < (1ULL << 8) ? LG_FLOOR_8(x) : 8 + LG_FLOOR_8(x >> 8))
#define LG_FLOOR_32(x) (x < (1ULL << 16) ? LG_FLOOR_16(x) : 16 + LG_FLOOR_16(x >> 16))
#define LG_FLOOR_64(x) (x < (1ULL << 32) ? LG_FLOOR_32(x) : 32 + LG_FLOOR_32(x >> 32))
#if LG_SIZEOF_PTR == 2
# define LG_FLOOR(x) LG_FLOOR_32((x))
#else
# define LG_FLOOR(x) LG_FLOOR_64((x))
#endif

#define LG_CEIL(x) (LG_FLOOR(x) + (((x) & ((x) - 1)) == 0 ? 0 : 1))
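
/*
 * A minimal usage sketch of the compile-time variants (with a hypothetical
 * lg_table array); unlike lg_floor() and lg_ceil(), they can appear in
 * constant expressions:
 *
 *   char lg_table[LG_CEIL(4096) + 1];    (LG_CEIL(4096) == 12)
 *
 * LG_FLOOR(1) == 0, LG_FLOOR(9) == 3, and LG_CEIL(9) == 4, matching the
 * runtime functions.
 */

#endif /* JEMALLOC_INTERNAL_BIT_UTIL_H */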