// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_integer.h
/// \brief      Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
///   - Unaligned native endian reads: readXXne(ptr)
///   - Unaligned native endian writes: writeXXne(ptr, num)
///   - Aligned native endian reads: aligned_readXXne(ptr)
///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: byteswapXX(num)
///   - Byte order conversions to/from native (byteswaps if Y isn't
///     the native endianness): convXXYe(num)
///   - Unaligned reads: readXXYe(ptr)
///   - Unaligned writes: writeXXYe(ptr, num)
///   - Aligned reads: aligned_readXXYe(ptr)
///   - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
///
/// Bit scan operations for non-zero 32-bit integers (inline functions):
///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
///   - Count leading zeros: clz32(num)
///   - Count trailing zeros: ctz32(num)
///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
///
/// The above bit scan operations return 0-31. If num is zero,
/// the result is undefined.
//
//  Authors:    Lasse Collin
//              Joachim Henke
//
///////////////////////////////////////////////////////////////////////////////

#ifndef TUKLIB_INTEGER_H
#define TUKLIB_INTEGER_H

#include "tuklib_common.h"
#include <string.h>

// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
// and such functions.
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
#	include <immintrin.h>
// Only include <intrin.h> when it is needed. GCC and Clang can both
// use __builtin's, so we only need Windows intrinsics when using MSVC.
// GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
// cases explicitly.
#elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
#	include <intrin.h>
#endif


///////////////////
// Byte swapping //
///////////////////

// Pick the best available byteswapXX() implementation: compiler builtins
// first, then libc/OS headers, finally a portable shift-and-mask fallback.
#if defined(HAVE___BUILTIN_BSWAPXX)
	// GCC >= 4.8 and Clang
#	define byteswap16(num) __builtin_bswap16(num)
#	define byteswap32(num) __builtin_bswap32(num)
#	define byteswap64(num) __builtin_bswap64(num)

#elif defined(HAVE_BYTESWAP_H)
	// glibc, uClibc, dietlibc
#	include <byteswap.h>
#	ifdef HAVE_BSWAP_16
#		define byteswap16(num) bswap_16(num)
#	endif
#	ifdef HAVE_BSWAP_32
#		define byteswap32(num) bswap_32(num)
#	endif
#	ifdef HAVE_BSWAP_64
#		define byteswap64(num) bswap_64(num)
#	endif

#elif defined(HAVE_SYS_ENDIAN_H)
	// *BSDs and Darwin
#	include <sys/endian.h>
#	define byteswap16(num) bswap16(num)
#	define byteswap32(num) bswap32(num)
#	define byteswap64(num) bswap64(num)

#elif defined(HAVE_SYS_BYTEORDER_H)
	// Solaris
#	include <sys/byteorder.h>
#	ifdef BSWAP_16
#		define byteswap16(num) BSWAP_16(num)
#	endif
#	ifdef BSWAP_32
#		define byteswap32(num) BSWAP_32(num)
#	endif
#	ifdef BSWAP_64
#		define byteswap64(num) BSWAP_64(num)
#	endif
#	ifdef BE_16
#		define conv16be(num) BE_16(num)
#	endif
#	ifdef BE_32
#		define conv32be(num) BE_32(num)
#	endif
#	ifdef BE_64
#		define conv64be(num) BE_64(num)
#	endif
#	ifdef LE_16
#		define conv16le(num) LE_16(num)
#	endif
#	ifdef LE_32
#		define conv32le(num) LE_32(num)
#	endif
#	ifdef LE_64
#		define conv64le(num) LE_64(num)
#	endif
#endif

// Generic fallbacks for any byteswapXX() that wasn't defined above.
#ifndef byteswap16
#	define byteswap16(n) (uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	)
#endif

#ifndef byteswap32
#	define byteswap32(n) (uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	)
#endif

#ifndef byteswap64
#	define byteswap64(n) (uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	)
#endif

// Define conversion macros using the basic byte swapping macros.
// convXXbe()/convXXle() are identity on matching-endian hosts and
// a byteswap otherwise.
#ifdef WORDS_BIGENDIAN
#	ifndef conv16be
#		define conv16be(num) ((uint16_t)(num))
#	endif
#	ifndef conv32be
#		define conv32be(num) ((uint32_t)(num))
#	endif
#	ifndef conv64be
#		define conv64be(num) ((uint64_t)(num))
#	endif
#	ifndef conv16le
#		define conv16le(num) byteswap16(num)
#	endif
#	ifndef conv32le
#		define conv32le(num) byteswap32(num)
#	endif
#	ifndef conv64le
#		define conv64le(num) byteswap64(num)
#	endif
#else
#	ifndef conv16be
#		define conv16be(num) byteswap16(num)
#	endif
#	ifndef conv32be
#		define conv32be(num) byteswap32(num)
#	endif
#	ifndef conv64be
#		define conv64be(num) byteswap64(num)
#	endif
#	ifndef conv16le
#		define conv16le(num) ((uint16_t)(num))
#	endif
#	ifndef conv32le
#		define conv32le(num) ((uint32_t)(num))
#	endif
#	ifndef conv64le
#		define conv64le(num) ((uint64_t)(num))
#	endif
#endif


////////////////////////////////
// Unaligned reads and writes //
////////////////////////////////

// No-strict-align archs like x86-64
// ---------------------------------
//
// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because this kind
// of casts break strict aliasing rules and result in undefined behavior.
// With unaligned pointers it's even worse: compilers may emit vector
// instructions that require aligned pointers even if non-vector
// instructions work with unaligned pointers.
//
// Using memcpy() is the standard compliant way to do unaligned access.
// Many modern compilers inline it so there is no function call overhead.
// For those compilers that don't handle the memcpy() method well, the
// old casting method (that violates strict aliasing) can be requested at
// build time. A third method, casting to a packed struct, would also be
// an option but isn't provided to keep things simpler (it's already a mess).
// Hopefully this is flexible enough in practice.
//
// Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
//
//     buf[0] | (buf[1] << 8)
//
// reads a 16-bit value and can emit a single 16-bit load and produce
// identical code than with the memcpy() method. In other cases Clang and GCC
// produce either the same or better code with memcpy(). For example, Clang 9
// on x86-64 can detect 32-bit load but not 16-bit load.
//
// MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
// code for "buf[0] | (buf[1] << 8)".
//
// Conclusion: The memcpy() method is the best choice when unaligned access
// is supported.
//
// Strict-align archs like SPARC
// -----------------------------
//
// GCC versions from around 4.x to at least 13.2.0 produce worse code
// from the memcpy() method than from simple byte-by-byte shift-or code
// when reading a 32-bit integer:
//
//     (1) It may be constructed on stack using four 8-bit loads,
//         four 8-bit stores to stack, and finally one 32-bit load from stack.
//
//     (2) Especially with -Os, an actual memcpy() call may be emitted.
//
// This is true on at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
// RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
// some processors but not all so this is relevant only in the case when
// GCC assumes that unaligned is not supported or -mstrict-align or
// -mno-unaligned-access is used.
//
// For Clang it makes little difference. ARM64 with -O2 -mstrict-align
// was one of the very few with a minor difference: the memcpy() version
// was one instruction longer.
//
// Conclusion: At least in case of GCC and Clang, byte-by-byte code is
// the best choice for strict-align archs to do unaligned access.
//
// See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
//
// Thanks to <https://godbolt.org/> it was easy to test different compilers.
// The following is for little endian targets:
/*
#include <stdint.h>
#include <string.h>

uint32_t bytes16(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8);
}

uint32_t copy16(const uint8_t *b)
{
	uint16_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

uint32_t bytes32(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8)
		| ((uint32_t)b[2] << 16)
		| ((uint32_t)b[3] << 24);
}

uint32_t copy32(const uint8_t *b)
{
	uint32_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

void wbytes16(uint8_t *b, uint16_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
}

void wcopy16(uint8_t *b, uint16_t v)
{
	memcpy(b, &v, sizeof(v));
}

void wbytes32(uint8_t *b, uint32_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
	b[2] = (uint8_t)(v >> 16);
	b[3] = (uint8_t)(v >> 24);
}

void wcopy32(uint8_t *b, uint32_t v)
{
	memcpy(b, &v, sizeof(v));
}
*/


#ifdef TUKLIB_FAST_UNALIGNED_ACCESS

// Unaligned native-endian 16-bit read. The memcpy() form is the strict
// aliasing safe default; the cast form is an opt-in build-time choice.
static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


// Unaligned native-endian 32-bit read; see read16ne() for the rationale.
static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}
// Unaligned native-endian 64-bit read. Type punning is used only when
// explicitly requested at build time; memcpy() is the compliant default.
static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint64_t *)buf;
#else
	uint64_t v;
	memcpy(&v, buf, sizeof(v));
	return v;
#endif
}


// Unaligned native-endian writes; same punning-vs-memcpy choice as above.
static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
}


static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
}


static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
}


// Endianness-specific unaligned reads: native read + conversion.
static inline uint16_t
read16be(const uint8_t *buf)
{
	return conv16be(read16ne(buf));
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	return conv16le(read16ne(buf));
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	return conv32be(read32ne(buf));
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	return conv32le(read32ne(buf));
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	return conv64be(read64ne(buf));
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	return conv64le(read64ne(buf));
}


// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
449ca6a6373SXin LI #define write16be(buf, num) write16ne(buf, conv16be(num)) 450ca6a6373SXin LI #define write32be(buf, num) write32ne(buf, conv32be(num)) 451ca6a6373SXin LI #define write64be(buf, num) write64ne(buf, conv64be(num)) 452ca6a6373SXin LI #define write16le(buf, num) write16ne(buf, conv16le(num)) 453ca6a6373SXin LI #define write32le(buf, num) write32ne(buf, conv32le(num)) 454ca6a6373SXin LI #define write64le(buf, num) write64ne(buf, conv64le(num)) 455ca6a6373SXin LI 45673ed8e77SXin LI #else 457ca6a6373SXin LI 458ca6a6373SXin LI #ifdef WORDS_BIGENDIAN 459ca6a6373SXin LI # define read16ne read16be 460ca6a6373SXin LI # define read32ne read32be 461ca6a6373SXin LI # define read64ne read64be 462ca6a6373SXin LI # define write16ne write16be 463ca6a6373SXin LI # define write32ne write32be 464ca6a6373SXin LI # define write64ne write64be 465ca6a6373SXin LI #else 466ca6a6373SXin LI # define read16ne read16le 467ca6a6373SXin LI # define read32ne read32le 468ca6a6373SXin LI # define read64ne read64le 469ca6a6373SXin LI # define write16ne write16le 470ca6a6373SXin LI # define write32ne write32le 471ca6a6373SXin LI # define write64ne write64le 472ca6a6373SXin LI #endif 473ca6a6373SXin LI 474ca6a6373SXin LI 475ca6a6373SXin LI static inline uint16_t 476ca6a6373SXin LI read16be(const uint8_t *buf) 477ca6a6373SXin LI { 478ca6a6373SXin LI uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1]; 479ca6a6373SXin LI return num; 480ca6a6373SXin LI } 481ca6a6373SXin LI 482ca6a6373SXin LI 483ca6a6373SXin LI static inline uint16_t 484ca6a6373SXin LI read16le(const uint8_t *buf) 485ca6a6373SXin LI { 486ca6a6373SXin LI uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8); 487ca6a6373SXin LI return num; 488ca6a6373SXin LI } 489ca6a6373SXin LI 490ca6a6373SXin LI 491ca6a6373SXin LI static inline uint32_t 492ca6a6373SXin LI read32be(const uint8_t *buf) 493ca6a6373SXin LI { 494ca6a6373SXin LI uint32_t num = (uint32_t)buf[0] << 24; 495ca6a6373SXin LI num |= (uint32_t)buf[1] << 16; 
496ca6a6373SXin LI num |= (uint32_t)buf[2] << 8; 497ca6a6373SXin LI num |= (uint32_t)buf[3]; 498ca6a6373SXin LI return num; 499ca6a6373SXin LI } 500ca6a6373SXin LI 501ca6a6373SXin LI 502ca6a6373SXin LI static inline uint32_t 503ca6a6373SXin LI read32le(const uint8_t *buf) 504ca6a6373SXin LI { 505ca6a6373SXin LI uint32_t num = (uint32_t)buf[0]; 506ca6a6373SXin LI num |= (uint32_t)buf[1] << 8; 507ca6a6373SXin LI num |= (uint32_t)buf[2] << 16; 508ca6a6373SXin LI num |= (uint32_t)buf[3] << 24; 509ca6a6373SXin LI return num; 510ca6a6373SXin LI } 511ca6a6373SXin LI 512ca6a6373SXin LI 513ca6a6373SXin LI static inline uint64_t 514ca6a6373SXin LI read64be(const uint8_t *buf) 515ca6a6373SXin LI { 51673ed8e77SXin LI uint64_t num = (uint64_t)buf[0] << 56; 51773ed8e77SXin LI num |= (uint64_t)buf[1] << 48; 51873ed8e77SXin LI num |= (uint64_t)buf[2] << 40; 51973ed8e77SXin LI num |= (uint64_t)buf[3] << 32; 52073ed8e77SXin LI num |= (uint64_t)buf[4] << 24; 52173ed8e77SXin LI num |= (uint64_t)buf[5] << 16; 52273ed8e77SXin LI num |= (uint64_t)buf[6] << 8; 52373ed8e77SXin LI num |= (uint64_t)buf[7]; 52473ed8e77SXin LI return num; 52573ed8e77SXin LI } 52673ed8e77SXin LI 52773ed8e77SXin LI 52873ed8e77SXin LI static inline uint64_t 52973ed8e77SXin LI read64le(const uint8_t *buf) 53073ed8e77SXin LI { 53173ed8e77SXin LI uint64_t num = (uint64_t)buf[0]; 53273ed8e77SXin LI num |= (uint64_t)buf[1] << 8; 53373ed8e77SXin LI num |= (uint64_t)buf[2] << 16; 53473ed8e77SXin LI num |= (uint64_t)buf[3] << 24; 53573ed8e77SXin LI num |= (uint64_t)buf[4] << 32; 53673ed8e77SXin LI num |= (uint64_t)buf[5] << 40; 53773ed8e77SXin LI num |= (uint64_t)buf[6] << 48; 53873ed8e77SXin LI num |= (uint64_t)buf[7] << 56; 53973ed8e77SXin LI return num; 54073ed8e77SXin LI } 54173ed8e77SXin LI 54273ed8e77SXin LI 54381ad8388SMartin Matuska static inline void 544a8675d92SXin LI write16be(uint8_t *buf, uint16_t num) 54581ad8388SMartin Matuska { 546342bcb12SXin LI buf[0] = (uint8_t)(num >> 8); 547342bcb12SXin LI buf[1] = 
(uint8_t)num; 54881ad8388SMartin Matuska return; 54981ad8388SMartin Matuska } 55081ad8388SMartin Matuska 55181ad8388SMartin Matuska 55281ad8388SMartin Matuska static inline void 553a8675d92SXin LI write16le(uint8_t *buf, uint16_t num) 55481ad8388SMartin Matuska { 555342bcb12SXin LI buf[0] = (uint8_t)num; 556342bcb12SXin LI buf[1] = (uint8_t)(num >> 8); 55781ad8388SMartin Matuska return; 55881ad8388SMartin Matuska } 55981ad8388SMartin Matuska 56081ad8388SMartin Matuska 56181ad8388SMartin Matuska static inline void 562a8675d92SXin LI write32be(uint8_t *buf, uint32_t num) 56381ad8388SMartin Matuska { 564342bcb12SXin LI buf[0] = (uint8_t)(num >> 24); 565342bcb12SXin LI buf[1] = (uint8_t)(num >> 16); 566342bcb12SXin LI buf[2] = (uint8_t)(num >> 8); 567342bcb12SXin LI buf[3] = (uint8_t)num; 56881ad8388SMartin Matuska return; 56981ad8388SMartin Matuska } 57081ad8388SMartin Matuska 57181ad8388SMartin Matuska 57281ad8388SMartin Matuska static inline void 573a8675d92SXin LI write32le(uint8_t *buf, uint32_t num) 57481ad8388SMartin Matuska { 575342bcb12SXin LI buf[0] = (uint8_t)num; 576342bcb12SXin LI buf[1] = (uint8_t)(num >> 8); 577342bcb12SXin LI buf[2] = (uint8_t)(num >> 16); 578342bcb12SXin LI buf[3] = (uint8_t)(num >> 24); 57981ad8388SMartin Matuska return; 58081ad8388SMartin Matuska } 581ca6a6373SXin LI 582ca6a6373SXin LI 583ca6a6373SXin LI static inline void 584ca6a6373SXin LI write64be(uint8_t *buf, uint64_t num) 585ca6a6373SXin LI { 586ca6a6373SXin LI buf[0] = (uint8_t)(num >> 56); 587ca6a6373SXin LI buf[1] = (uint8_t)(num >> 48); 588ca6a6373SXin LI buf[2] = (uint8_t)(num >> 40); 589ca6a6373SXin LI buf[3] = (uint8_t)(num >> 32); 590ca6a6373SXin LI buf[4] = (uint8_t)(num >> 24); 591ca6a6373SXin LI buf[5] = (uint8_t)(num >> 16); 592ca6a6373SXin LI buf[6] = (uint8_t)(num >> 8); 593ca6a6373SXin LI buf[7] = (uint8_t)num; 594ca6a6373SXin LI return; 595ca6a6373SXin LI } 596ca6a6373SXin LI 597ca6a6373SXin LI 598ca6a6373SXin LI static inline void 599ca6a6373SXin LI 
write64le(uint8_t *buf, uint64_t num) 600ca6a6373SXin LI { 601ca6a6373SXin LI buf[0] = (uint8_t)num; 602ca6a6373SXin LI buf[1] = (uint8_t)(num >> 8); 603ca6a6373SXin LI buf[2] = (uint8_t)(num >> 16); 604ca6a6373SXin LI buf[3] = (uint8_t)(num >> 24); 605ca6a6373SXin LI buf[4] = (uint8_t)(num >> 32); 606ca6a6373SXin LI buf[5] = (uint8_t)(num >> 40); 607ca6a6373SXin LI buf[6] = (uint8_t)(num >> 48); 608ca6a6373SXin LI buf[7] = (uint8_t)(num >> 56); 609ca6a6373SXin LI return; 610ca6a6373SXin LI } 611ca6a6373SXin LI 61281ad8388SMartin Matuska #endif 61381ad8388SMartin Matuska 61481ad8388SMartin Matuska 615a8675d92SXin LI ////////////////////////////// 616a8675d92SXin LI // Aligned reads and writes // 617a8675d92SXin LI ////////////////////////////// 618a8675d92SXin LI 619a8675d92SXin LI // Separate functions for aligned reads and writes are provided since on 620a8675d92SXin LI // strict-align archs aligned access is much faster than unaligned access. 621a8675d92SXin LI // 622a8675d92SXin LI // Just like in the unaligned case, memcpy() is needed to avoid 623a8675d92SXin LI // strict aliasing violations. However, on archs that don't support 624a8675d92SXin LI // unaligned access the compiler cannot know that the pointers given 625a8675d92SXin LI // to memcpy() are aligned which results in slow code. As of C11 there is 626a8675d92SXin LI // no standard way to tell the compiler that we know that the address is 627a8675d92SXin LI // aligned but some compilers have language extensions to do that. With 628a8675d92SXin LI // such language extensions the memcpy() method gives excellent results. 629a8675d92SXin LI // 630*3b35e7eeSXin LI // What to do on a strict-align system when no known language extensions 631a8675d92SXin LI // are available? Falling back to byte-by-byte access would be safe but ruin 632a8675d92SXin LI // optimizations that have been made specifically with aligned access in mind. 
// As a compromise, aligned reads will fall back to non-compliant type punning
// but aligned writes will be byte-by-byte, that is, fast reads are preferred
// over fast writes. This obviously isn't great but hopefully it's a working
// compromise for now.
//
// __builtin_assume_aligned is supported by GCC >= 4.7 and clang >= 3.6.
#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#else
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#endif


// Aligned native-endian reads. The caller guarantees that buf is properly
// aligned for the access size; punning is used when requested or when no
// way to tell the compiler about the alignment is available (see above).
static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


// Aligned native-endian writes. Note that unlike reads, writes never use
// unsafe punning unless TUKLIB_USE_UNSAFE_TYPE_PUNNING was requested.
static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


// Aligned endianness-specific reads: aligned native read + conversion.
static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64le(num);
}


// These need to be macros like in the unaligned case.
// Aligned writes with byte order conversion. These are macros (not inline
// functions) so that convXXYe(num), which itself may be a macro, is applied
// to the caller's expression; the arguments may be evaluated more than once.
#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))


////////////////////
// Bit operations //
////////////////////

// Bit scan reverse: return the index (0-31) of the highest set bit of n.
// n must be non-zero; otherwise the result is undefined.
static inline uint32_t
bsr32(uint32_t n)
{
	// Check for ICC first, since it tends to define __GNUC__ too.
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	// GCC >= 3.4 has __builtin_clz(), which gives good results on
	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
	// either plain BSR (so the XOR gets optimized away) or LZCNT and
	// XOR (if -march indicates that SSE4a instructions are supported).
	// UINT_MAX == UINT32_MAX is required because __builtin_clz()
	// counts relative to the width of unsigned int.
	return (uint32_t)__builtin_clz(n) ^ 31U;

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	// Old GCC on x86: use the BSR instruction directly.
	uint32_t i;
	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	// MSVC intrinsic (from <intrin.h>, included near the top of
	// this file).
	unsigned long i;
	_BitScanReverse(&i, n);
	return i;

#else
	// Portable fallback: binary search for the highest set bit.
	// Each step shifts the remaining bits to the top half and
	// adjusts the index accordingly.
	uint32_t i = 31;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 15;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i -= 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i -= 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i -= 2;
	}

	if ((n & 0x80000000) == 0)
		--i;

	return i;
#endif
}


// Count leading zeros: return the number (0-31) of zero bits above the
// highest set bit of n (clz32(n) == 31 - bsr32(n)). n must be non-zero;
// otherwise the result is undefined.
static inline uint32_t
clz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n) ^ 31U;

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	return (uint32_t)__builtin_clz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	// BSR gives the highest bit index; XOR with 31 converts it to
	// the leading-zero count.
	uint32_t i;
	__asm__("bsrl %1, %0\n\t"
		"xorl $31, %0"
		: "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i ^ 31U;

#else
	// Portable fallback: binary search, accumulating the count of
	// leading zeros as the set bits are shifted toward the top.
	uint32_t i = 0;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 16;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i += 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i += 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i += 2;
	}

	if ((n & 0x80000000) == 0)
		++i;

	return i;
#endif
}


// Count trailing zeros: return the number (0-31) of zero bits below the
// lowest set bit of n. n must be non-zero; otherwise the result is
// undefined.
static inline uint32_t
ctz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_forward(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
	// Note ">=" here vs. "==" in bsr32()/clz32(): a wider unsigned
	// int doesn't change the trailing-zero count of a non-zero value,
	// so __builtin_ctz() is usable as long as unsigned int has at
	// least 32 bits.
	return (uint32_t)__builtin_ctz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanForward(&i, n);
	return i;

#else
	// Portable fallback: binary search toward the lowest set bit,
	// mirroring the clz32() fallback but shifting right.
	uint32_t i = 0;

	if ((n & 0x0000FFFF) == 0) {
		n >>= 16;
		i = 16;
	}

	if ((n & 0x000000FF) == 0) {
		n >>= 8;
		i += 8;
	}

	if ((n & 0x0000000F) == 0) {
		n >>= 4;
		i += 4;
	}

	if ((n & 0x00000003) == 0) {
		n >>= 2;
		i += 2;
	}

	if ((n & 0x00000001) == 0)
		++i;

	return i;
#endif
}

// Bit scan forward is the same operation as count trailing zeros.
#define bsf32 ctz32

#endif