///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_integer.h
/// \brief      Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
///   - Unaligned native endian reads: readXXne(ptr)
///   - Unaligned native endian writes: writeXXne(ptr, num)
///   - Aligned native endian reads: aligned_readXXne(ptr)
///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: bswapXX(num)
///   - Byte order conversions to/from native (byteswaps if Y isn't
///     the native endianness): convXXYe(num)
///   - Unaligned reads: readXXYe(ptr)
///   - Unaligned writes: writeXXYe(ptr, num)
///   - Aligned reads: aligned_readXXYe(ptr)
///   - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
///
/// Bit scan operations for non-zero 32-bit integers (inline functions):
///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
///   - Count leading zeros: clz32(num)
///   - Count trailing zeros: ctz32(num)
///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
///
/// The above bit scan operations return 0-31. If num is zero,
/// the result is undefined.
//
//  Authors:    Lasse Collin
//              Joachim Henke
//
//  This file has been put into the public domain.
//  You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////

#ifndef TUKLIB_INTEGER_H
#define TUKLIB_INTEGER_H

#include "tuklib_common.h"
#include <string.h>

// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
// and such functions.
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
#	include <immintrin.h>
// Only include <intrin.h> when it is needed. GCC and Clang can both
// use __builtin's, so we only need Windows intrinsics when using MSVC.
// GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
// cases explicitly.
#elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
#	include <intrin.h>
#endif


///////////////////
// Byte swapping //
///////////////////

#if defined(HAVE___BUILTIN_BSWAPXX)
	// GCC >= 4.8 and Clang
#	define bswap16(n) __builtin_bswap16(n)
#	define bswap32(n) __builtin_bswap32(n)
#	define bswap64(n) __builtin_bswap64(n)

#elif defined(HAVE_BYTESWAP_H)
	// glibc, uClibc, dietlibc
#	include <byteswap.h>
#	ifdef HAVE_BSWAP_16
#		define bswap16(num) bswap_16(num)
#	endif
#	ifdef HAVE_BSWAP_32
#		define bswap32(num) bswap_32(num)
#	endif
#	ifdef HAVE_BSWAP_64
#		define bswap64(num) bswap_64(num)
#	endif

#elif defined(HAVE_SYS_ENDIAN_H)
	// *BSDs and Darwin
#	include <sys/endian.h>

#elif defined(HAVE_SYS_BYTEORDER_H)
	// Solaris
#	include <sys/byteorder.h>
#	ifdef BSWAP_16
#		define bswap16(num) BSWAP_16(num)
#	endif
#	ifdef BSWAP_32
#		define bswap32(num) BSWAP_32(num)
#	endif
#	ifdef BSWAP_64
#		define bswap64(num) BSWAP_64(num)
#	endif
#	ifdef BE_16
#		define conv16be(num) BE_16(num)
#	endif
#	ifdef BE_32
#		define conv32be(num) BE_32(num)
#	endif
#	ifdef BE_64
#		define conv64be(num) BE_64(num)
#	endif
#	ifdef LE_16
#		define conv16le(num) LE_16(num)
#	endif
#	ifdef LE_32
#		define conv32le(num) LE_32(num)
#	endif
#	ifdef LE_64
#		define conv64le(num) LE_64(num)
#	endif
#endif

#ifndef bswap16
#	define bswap16(n) (uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	)
#endif

#ifndef bswap32
#	define bswap32(n) (uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	)
#endif

#ifndef bswap64
#	define bswap64(n) (uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	)
#endif

// Define conversion macros using the basic byte swapping macros.
#ifdef WORDS_BIGENDIAN
#	ifndef conv16be
#		define conv16be(num) ((uint16_t)(num))
#	endif
#	ifndef conv32be
#		define conv32be(num) ((uint32_t)(num))
#	endif
#	ifndef conv64be
#		define conv64be(num) ((uint64_t)(num))
#	endif
#	ifndef conv16le
#		define conv16le(num) bswap16(num)
#	endif
#	ifndef conv32le
#		define conv32le(num) bswap32(num)
#	endif
#	ifndef conv64le
#		define conv64le(num) bswap64(num)
#	endif
#else
#	ifndef conv16be
#		define conv16be(num) bswap16(num)
#	endif
#	ifndef conv32be
#		define conv32be(num) bswap32(num)
#	endif
#	ifndef conv64be
#		define conv64be(num) bswap64(num)
#	endif
#	ifndef conv16le
#		define conv16le(num) ((uint16_t)(num))
#	endif
#	ifndef conv32le
#		define conv32le(num) ((uint32_t)(num))
#	endif
#	ifndef conv64le
#		define conv64le(num) ((uint64_t)(num))
#	endif
#endif
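
// The following is only an illustration of how the macros above are meant
// to be used; it is not part of the API. It is kept inside a comment (like
// the compiler test snippet further below) so it has no effect on this
// header. It assumes nothing more than what the macros guarantee:
// bswapXX() always reverses the byte order while convXXYe() swaps only
// when Y differs from the native endianness.
/*
#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	uint32_t x = UINT32_C(0x11223344);

	// Always 44332211, regardless of the native byte order.
	printf("%08" PRIX32 "\n", bswap32(x));

	// On a little endian build conv32be() swaps and conv32le() is a
	// no-op; on a big endian build it is the other way around.
	printf("%08" PRIX32 "\n", conv32be(x));
	printf("%08" PRIX32 "\n", conv32le(x));

	return 0;
}
*/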


////////////////////////////////
// Unaligned reads and writes //
////////////////////////////////

// No-strict-align archs like x86-64
// ---------------------------------
//
// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because this kind
// of cast breaks strict aliasing rules and results in undefined behavior.
// With unaligned pointers it's even worse: compilers may emit vector
// instructions that require aligned pointers even if non-vector
// instructions work with unaligned pointers.
//
// Using memcpy() is the standard compliant way to do unaligned access.
// Many modern compilers inline it so there is no function call overhead.
// For those compilers that don't handle the memcpy() method well, the
// old casting method (that violates strict aliasing) can be requested at
// build time. A third method, casting to a packed struct, would also be
// an option but isn't provided to keep things simpler (it's already a mess).
// Hopefully this is flexible enough in practice.
//
// Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
//
//     buf[0] | (buf[1] << 8)
//
// reads a 16-bit value and can emit a single 16-bit load and produce
// code identical to the memcpy() method. In other cases Clang and GCC
// produce either the same or better code with memcpy(). For example,
// Clang 9 on x86-64 can detect a 32-bit load but not a 16-bit load.
//
// MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
// code for "buf[0] | (buf[1] << 8)".
//
// Conclusion: The memcpy() method is the best choice when unaligned access
// is supported.
//
// Strict-align archs like SPARC
// -----------------------------
//
// GCC versions from around 4.x to at least 13.2.0 produce worse code
// from the memcpy() method than from simple byte-by-byte shift-or code
// when reading a 32-bit integer:
//
//     (1) It may be constructed on stack using four 8-bit loads,
//         four 8-bit stores to stack, and finally one 32-bit load from stack.
//
//     (2) Especially with -Os, an actual memcpy() call may be emitted.
//
// This is true at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
// RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
// some processors but not all, so this is relevant only in the case when
// GCC assumes that unaligned access is not supported or -mstrict-align or
// -mno-unaligned-access is used.
//
// For Clang it makes little difference. ARM64 with -O2 -mstrict-align
// was one of the very few with a minor difference: the memcpy() version
// was one instruction longer.
//
// Conclusion: At least in the case of GCC and Clang, byte-by-byte code is
// the best choice for strict-align archs to do unaligned access.
//
// See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
//
// Thanks to <https://godbolt.org/> it was easy to test different compilers.
// The following is for little endian targets:
/*
#include <stdint.h>
#include <string.h>

uint32_t bytes16(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8);
}

uint32_t copy16(const uint8_t *b)
{
	uint16_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

uint32_t bytes32(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8)
		| ((uint32_t)b[2] << 16)
		| ((uint32_t)b[3] << 24);
}

uint32_t copy32(const uint8_t *b)
{
	uint32_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

void wbytes16(uint8_t *b, uint16_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
}

void wcopy16(uint8_t *b, uint16_t v)
{
	memcpy(b, &v, sizeof(v));
}

void wbytes32(uint8_t *b, uint32_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
	b[2] = (uint8_t)(v >> 16);
	b[3] = (uint8_t)(v >> 24);
}

void wcopy32(uint8_t *b, uint32_t v)
{
	memcpy(b, &v, sizeof(v));
}
*/


#ifdef TUKLIB_FAST_UNALIGNED_ACCESS

static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64le(num);
}


// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
#define write16be(buf, num) write16ne(buf, conv16be(num))
#define write32be(buf, num) write32ne(buf, conv32be(num))
#define write64be(buf, num) write64ne(buf, conv64be(num))
#define write16le(buf, num) write16ne(buf, conv16le(num))
#define write32le(buf, num) write32ne(buf, conv32le(num))
#define write64le(buf, num) write64ne(buf, conv64le(num))

#else

#ifdef WORDS_BIGENDIAN
#	define read16ne read16be
#	define read32ne read32be
#	define read64ne read64be
#	define write16ne write16be
#	define write32ne write32be
#	define write64ne write64be
#else
#	define read16ne read16le
#	define read32ne read32le
#	define read64ne read64le
#	define write16ne write16le
#	define write32ne write32le
#	define write64ne write64le
#endif


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
	return num;
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
	return num;
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0] << 24;
	num |= (uint32_t)buf[1] << 16;
	num |= (uint32_t)buf[2] << 8;
	num |= (uint32_t)buf[3];
	return num;
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0];
	num |= (uint32_t)buf[1] << 8;
	num |= (uint32_t)buf[2] << 16;
	num |= (uint32_t)buf[3] << 24;
	return num;
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0] << 56;
	num |= (uint64_t)buf[1] << 48;
	num |= (uint64_t)buf[2] << 40;
	num |= (uint64_t)buf[3] << 32;
	num |= (uint64_t)buf[4] << 24;
	num |= (uint64_t)buf[5] << 16;
	num |= (uint64_t)buf[6] << 8;
	num |= (uint64_t)buf[7];
	return num;
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0];
	num |= (uint64_t)buf[1] << 8;
	num |= (uint64_t)buf[2] << 16;
	num |= (uint64_t)buf[3] << 24;
	num |= (uint64_t)buf[4] << 32;
	num |= (uint64_t)buf[5] << 40;
	num |= (uint64_t)buf[6] << 48;
	num |= (uint64_t)buf[7] << 56;
	return num;
}


static inline void
write16be(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)(num >> 8);
	buf[1] = (uint8_t)num;
	return;
}


static inline void
write16le(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	return;
}


static inline void
write32be(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)(num >> 24);
	buf[1] = (uint8_t)(num >> 16);
	buf[2] = (uint8_t)(num >> 8);
	buf[3] = (uint8_t)num;
	return;
}


static inline void
write32le(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	return;
}


static inline void
write64be(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)(num >> 56);
	buf[1] = (uint8_t)(num >> 48);
	buf[2] = (uint8_t)(num >> 40);
	buf[3] = (uint8_t)(num >> 32);
	buf[4] = (uint8_t)(num >> 24);
	buf[5] = (uint8_t)(num >> 16);
	buf[6] = (uint8_t)(num >> 8);
	buf[7] = (uint8_t)num;
	return;
}


static inline void
write64le(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	buf[4] = (uint8_t)(num >> 32);
	buf[5] = (uint8_t)(num >> 40);
	buf[6] = (uint8_t)(num >> 48);
	buf[7] = (uint8_t)(num >> 56);
	return;
}

#endif
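
// The snippet below is only an illustration of how the readXXYe() and
// writeXXYe() functions above are typically used; it is not part of the
// API and is kept inside a comment so it has no effect on this header.
// The record layout, the struct, and the function names are made up for
// the example.
/*
#include <stdint.h>

struct example_header {
	uint32_t magic;    // stored big endian
	uint32_t length;   // stored little endian
	uint32_t crc;      // stored in native byte order
};

// buf may have any alignment; the readXXYe() functions take care of it.
static void
example_parse(struct example_header *h, const uint8_t *buf)
{
	h->magic = read32be(buf);
	h->length = read32le(buf + 4);
	h->crc = read32ne(buf + 8);
}

static void
example_serialize(uint8_t *buf, const struct example_header *h)
{
	write32be(buf, h->magic);
	write32le(buf + 4, h->length);
	write32ne(buf + 8, h->crc);
}
*/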


//////////////////////////////
// Aligned reads and writes //
//////////////////////////////

// Separate functions for aligned reads and writes are provided since on
// strict-align archs aligned access is much faster than unaligned access.
//
// Just like in the unaligned case, memcpy() is needed to avoid
// strict aliasing violations. However, on archs that don't support
// unaligned access the compiler cannot know that the pointers given
// to memcpy() are aligned which results in slow code. As of C11 there is
// no standard way to tell the compiler that we know that the address is
// aligned but some compilers have language extensions to do that. With
// such language extensions the memcpy() method gives excellent results.
//
// What to do on a strict-align system when no known language extensions
// are available? Falling back to byte-by-byte access would be safe but ruin
// optimizations that have been made specifically with aligned access in mind.
// As a compromise, aligned reads will fall back to non-compliant type punning
// but aligned writes will be byte-by-byte, that is, fast reads are preferred
// over fast writes. This obviously isn't great but hopefully it's a working
// compromise for now.
//
// __builtin_assume_aligned is supported by GCC >= 4.7 and clang >= 3.6.
#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#else
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#endif


static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64le(num);
}


// These need to be macros like in the unaligned case.
#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))
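
// An illustration of the aligned variants; not part of the API and kept
// inside a comment so it has no effect on this header. The function and
// its 8-byte alignment assumption are made up for the example: making
// sure that buf really is suitably aligned is the caller's job, exactly
// as the aligned_* functions require.
/*
#include <stddef.h>
#include <stdint.h>

// Sum count big endian 64-bit integers from a buffer whose start is
// known to be 8-byte aligned (for example, memory from malloc()).
static uint64_t
example_sum64be(const uint8_t *buf, size_t count)
{
	uint64_t sum = 0;

	for (size_t i = 0; i < count; ++i)
		sum += aligned_read64be(buf + i * 8);

	return sum;
}
*/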

////////////////////
// Bit operations //
////////////////////

static inline uint32_t
bsr32(uint32_t n)
{
	// Check for ICC first, since it tends to define __GNUC__ too.
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	// GCC >= 3.4 has __builtin_clz(), which gives good results on
	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
	// either plain BSR (so the XOR gets optimized away) or LZCNT and
	// XOR (if -march indicates that SSE4a instructions are supported).
	return (uint32_t)__builtin_clz(n) ^ 31U;

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i;

#else
	uint32_t i = 31;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 15;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i -= 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i -= 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i -= 2;
	}

	if ((n & 0x80000000) == 0)
		--i;

	return i;
#endif
}


static inline uint32_t
clz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n) ^ 31U;

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	return (uint32_t)__builtin_clz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0\n\t"
		"xorl $31, %0"
		: "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i ^ 31U;

#else
	uint32_t i = 0;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 16;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i += 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i += 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i += 2;
	}

	if ((n & 0x80000000) == 0)
		++i;

	return i;
#endif
}


static inline uint32_t
ctz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_forward(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
	return (uint32_t)__builtin_ctz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanForward(&i, n);
	return i;

#else
	uint32_t i = 0;

	if ((n & 0x0000FFFF) == 0) {
		n >>= 16;
		i = 16;
	}

	if ((n & 0x000000FF) == 0) {
		n >>= 8;
		i += 8;
	}

	if ((n & 0x0000000F) == 0) {
		n >>= 4;
		i += 4;
	}

	if ((n & 0x00000003) == 0) {
		n >>= 2;
		i += 2;
	}

	if ((n & 0x00000001) == 0)
		++i;

	return i;
#endif
}

#define bsf32 ctz32
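
// A small illustration of the bit scan helpers above; not part of the API
// and kept inside a comment so it has no effect on this header. Remember
// that the results are undefined when the argument is zero. The two helper
// functions are made up for the example.
/*
#include <stdint.h>

// For any non-zero 32-bit n:
//   bsr32(n) == index of the highest set bit, e.g. bsr32(1) == 0
//   clz32(n) == 31 - bsr32(n)
//   ctz32(n) == index of the lowest set bit
//   bsf32(n) == ctz32(n)

// Number of bits needed to represent a non-zero value.
static inline uint32_t
example_bit_width(uint32_t n)
{
	return bsr32(n) + 1;
}

// Round a non-zero value down to the nearest power of two.
static inline uint32_t
example_floor_pow2(uint32_t n)
{
	return UINT32_C(1) << bsr32(n);
}
*/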

#endif