// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_integer.h
/// \brief      Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
///   - Unaligned native endian reads: readXXne(ptr)
///   - Unaligned native endian writes: writeXXne(ptr, num)
///   - Aligned native endian reads: aligned_readXXne(ptr)
///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: byteswapXX(num)
///   - Byte order conversions to/from native (byteswaps if Y isn't
///     the native endianness): convXXYe(num)
///   - Unaligned reads: readXXYe(ptr)
///   - Unaligned writes: writeXXYe(ptr, num)
///   - Aligned reads: aligned_readXXYe(ptr)
///   - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
2981ad8388SMartin Matuska /// 30a8675d92SXin LI /// Bit scan operations for non-zero 32-bit integers (inline functions): 3181ad8388SMartin Matuska /// - Bit scan reverse (find highest non-zero bit): bsr32(num) 3281ad8388SMartin Matuska /// - Count leading zeros: clz32(num) 3381ad8388SMartin Matuska /// - Count trailing zeros: ctz32(num) 3481ad8388SMartin Matuska /// - Bit scan forward (simply an alias for ctz32()): bsf32(num) 3581ad8388SMartin Matuska /// 3681ad8388SMartin Matuska /// The above bit scan operations return 0-31. If num is zero, 3781ad8388SMartin Matuska /// the result is undefined. 3881ad8388SMartin Matuska // 3981ad8388SMartin Matuska // Authors: Lasse Collin 4081ad8388SMartin Matuska // Joachim Henke 4181ad8388SMartin Matuska // 4281ad8388SMartin Matuska /////////////////////////////////////////////////////////////////////////////// 4381ad8388SMartin Matuska 4481ad8388SMartin Matuska #ifndef TUKLIB_INTEGER_H 4581ad8388SMartin Matuska #define TUKLIB_INTEGER_H 4681ad8388SMartin Matuska 4781ad8388SMartin Matuska #include "tuklib_common.h" 48a8675d92SXin LI #include <string.h> 49a8675d92SXin LI 50a8675d92SXin LI // Newer Intel C compilers require immintrin.h for _bit_scan_reverse() 51a8675d92SXin LI // and such functions. 52a8675d92SXin LI #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500) 53a8675d92SXin LI # include <immintrin.h> 54b333cd44SXin LI // Only include <intrin.h> when it is needed. GCC and Clang can both 55b333cd44SXin LI // use __builtin's, so we only need Windows instrincs when using MSVC. 56b333cd44SXin LI // GCC and Clang can set _MSC_VER on Windows, so we need to exclude these 57b333cd44SXin LI // cases explicitly. 
58b333cd44SXin LI #elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__) 59b333cd44SXin LI # include <intrin.h> 60a8675d92SXin LI #endif 6181ad8388SMartin Matuska 6281ad8388SMartin Matuska 63a8675d92SXin LI /////////////////// 64a8675d92SXin LI // Byte swapping // 65a8675d92SXin LI /////////////////// 6681ad8388SMartin Matuska 67a8675d92SXin LI #if defined(HAVE___BUILTIN_BSWAPXX) 68a8675d92SXin LI // GCC >= 4.8 and Clang 693b35e7eeSXin LI # define byteswap16(num) __builtin_bswap16(num) 703b35e7eeSXin LI # define byteswap32(num) __builtin_bswap32(num) 713b35e7eeSXin LI # define byteswap64(num) __builtin_bswap64(num) 72a8675d92SXin LI 73a8675d92SXin LI #elif defined(HAVE_BYTESWAP_H) 7481ad8388SMartin Matuska // glibc, uClibc, dietlibc 7581ad8388SMartin Matuska # include <byteswap.h> 7681ad8388SMartin Matuska # ifdef HAVE_BSWAP_16 773b35e7eeSXin LI # define byteswap16(num) bswap_16(num) 7881ad8388SMartin Matuska # endif 7981ad8388SMartin Matuska # ifdef HAVE_BSWAP_32 803b35e7eeSXin LI # define byteswap32(num) bswap_32(num) 8181ad8388SMartin Matuska # endif 8281ad8388SMartin Matuska # ifdef HAVE_BSWAP_64 833b35e7eeSXin LI # define byteswap64(num) bswap_64(num) 8481ad8388SMartin Matuska # endif 8581ad8388SMartin Matuska 8681ad8388SMartin Matuska #elif defined(HAVE_SYS_ENDIAN_H) 8781ad8388SMartin Matuska // *BSDs and Darwin 8881ad8388SMartin Matuska # include <sys/endian.h> 89*26743408SXin LI # ifdef __OpenBSD__ 90*26743408SXin LI # define byteswap16(num) swap16(num) 91*26743408SXin LI # define byteswap32(num) swap32(num) 92*26743408SXin LI # define byteswap64(num) swap64(num) 93*26743408SXin LI # else 943b35e7eeSXin LI # define byteswap16(num) bswap16(num) 953b35e7eeSXin LI # define byteswap32(num) bswap32(num) 963b35e7eeSXin LI # define byteswap64(num) bswap64(num) 97*26743408SXin LI # endif 9881ad8388SMartin Matuska 9981ad8388SMartin Matuska #elif defined(HAVE_SYS_BYTEORDER_H) 10081ad8388SMartin Matuska // Solaris 10181ad8388SMartin Matuska # include 
<sys/byteorder.h> 10281ad8388SMartin Matuska # ifdef BSWAP_16 1033b35e7eeSXin LI # define byteswap16(num) BSWAP_16(num) 10481ad8388SMartin Matuska # endif 10581ad8388SMartin Matuska # ifdef BSWAP_32 1063b35e7eeSXin LI # define byteswap32(num) BSWAP_32(num) 10781ad8388SMartin Matuska # endif 10881ad8388SMartin Matuska # ifdef BSWAP_64 1093b35e7eeSXin LI # define byteswap64(num) BSWAP_64(num) 11081ad8388SMartin Matuska # endif 11181ad8388SMartin Matuska # ifdef BE_16 11281ad8388SMartin Matuska # define conv16be(num) BE_16(num) 11381ad8388SMartin Matuska # endif 11481ad8388SMartin Matuska # ifdef BE_32 11581ad8388SMartin Matuska # define conv32be(num) BE_32(num) 11681ad8388SMartin Matuska # endif 11781ad8388SMartin Matuska # ifdef BE_64 11881ad8388SMartin Matuska # define conv64be(num) BE_64(num) 11981ad8388SMartin Matuska # endif 12081ad8388SMartin Matuska # ifdef LE_16 12181ad8388SMartin Matuska # define conv16le(num) LE_16(num) 12281ad8388SMartin Matuska # endif 12381ad8388SMartin Matuska # ifdef LE_32 12481ad8388SMartin Matuska # define conv32le(num) LE_32(num) 12581ad8388SMartin Matuska # endif 12681ad8388SMartin Matuska # ifdef LE_64 12781ad8388SMartin Matuska # define conv64le(num) LE_64(num) 12881ad8388SMartin Matuska # endif 12981ad8388SMartin Matuska #endif 13081ad8388SMartin Matuska 1313b35e7eeSXin LI #ifndef byteswap16 1323b35e7eeSXin LI # define byteswap16(n) (uint16_t)( \ 133a8675d92SXin LI (((n) & 0x00FFU) << 8) \ 134a8675d92SXin LI | (((n) & 0xFF00U) >> 8) \ 135a8675d92SXin LI ) 13681ad8388SMartin Matuska #endif 13781ad8388SMartin Matuska 1383b35e7eeSXin LI #ifndef byteswap32 1393b35e7eeSXin LI # define byteswap32(n) (uint32_t)( \ 140a8675d92SXin LI (((n) & UINT32_C(0x000000FF)) << 24) \ 141a8675d92SXin LI | (((n) & UINT32_C(0x0000FF00)) << 8) \ 142a8675d92SXin LI | (((n) & UINT32_C(0x00FF0000)) >> 8) \ 143a8675d92SXin LI | (((n) & UINT32_C(0xFF000000)) >> 24) \ 144a8675d92SXin LI ) 14581ad8388SMartin Matuska #endif 14681ad8388SMartin Matuska 
1473b35e7eeSXin LI #ifndef byteswap64 1483b35e7eeSXin LI # define byteswap64(n) (uint64_t)( \ 149a8675d92SXin LI (((n) & UINT64_C(0x00000000000000FF)) << 56) \ 150a8675d92SXin LI | (((n) & UINT64_C(0x000000000000FF00)) << 40) \ 151a8675d92SXin LI | (((n) & UINT64_C(0x0000000000FF0000)) << 24) \ 152a8675d92SXin LI | (((n) & UINT64_C(0x00000000FF000000)) << 8) \ 153a8675d92SXin LI | (((n) & UINT64_C(0x000000FF00000000)) >> 8) \ 154a8675d92SXin LI | (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \ 155a8675d92SXin LI | (((n) & UINT64_C(0x00FF000000000000)) >> 40) \ 156a8675d92SXin LI | (((n) & UINT64_C(0xFF00000000000000)) >> 56) \ 157a8675d92SXin LI ) 15881ad8388SMartin Matuska #endif 15981ad8388SMartin Matuska 16081ad8388SMartin Matuska // Define conversion macros using the basic byte swapping macros. 16181ad8388SMartin Matuska #ifdef WORDS_BIGENDIAN 16281ad8388SMartin Matuska # ifndef conv16be 16381ad8388SMartin Matuska # define conv16be(num) ((uint16_t)(num)) 16481ad8388SMartin Matuska # endif 16581ad8388SMartin Matuska # ifndef conv32be 16681ad8388SMartin Matuska # define conv32be(num) ((uint32_t)(num)) 16781ad8388SMartin Matuska # endif 16881ad8388SMartin Matuska # ifndef conv64be 16981ad8388SMartin Matuska # define conv64be(num) ((uint64_t)(num)) 17081ad8388SMartin Matuska # endif 17181ad8388SMartin Matuska # ifndef conv16le 1723b35e7eeSXin LI # define conv16le(num) byteswap16(num) 17381ad8388SMartin Matuska # endif 17481ad8388SMartin Matuska # ifndef conv32le 1753b35e7eeSXin LI # define conv32le(num) byteswap32(num) 17681ad8388SMartin Matuska # endif 17781ad8388SMartin Matuska # ifndef conv64le 1783b35e7eeSXin LI # define conv64le(num) byteswap64(num) 17981ad8388SMartin Matuska # endif 18081ad8388SMartin Matuska #else 18181ad8388SMartin Matuska # ifndef conv16be 1823b35e7eeSXin LI # define conv16be(num) byteswap16(num) 18381ad8388SMartin Matuska # endif 18481ad8388SMartin Matuska # ifndef conv32be 1853b35e7eeSXin LI # define conv32be(num) byteswap32(num) 
18681ad8388SMartin Matuska # endif 18781ad8388SMartin Matuska # ifndef conv64be 1883b35e7eeSXin LI # define conv64be(num) byteswap64(num) 18981ad8388SMartin Matuska # endif 19081ad8388SMartin Matuska # ifndef conv16le 19181ad8388SMartin Matuska # define conv16le(num) ((uint16_t)(num)) 19281ad8388SMartin Matuska # endif 19381ad8388SMartin Matuska # ifndef conv32le 19481ad8388SMartin Matuska # define conv32le(num) ((uint32_t)(num)) 19581ad8388SMartin Matuska # endif 19681ad8388SMartin Matuska # ifndef conv64le 19781ad8388SMartin Matuska # define conv64le(num) ((uint64_t)(num)) 19881ad8388SMartin Matuska # endif 19981ad8388SMartin Matuska #endif 20081ad8388SMartin Matuska 20181ad8388SMartin Matuska 202a8675d92SXin LI //////////////////////////////// 203a8675d92SXin LI // Unaligned reads and writes // 204a8675d92SXin LI //////////////////////////////// 205a8675d92SXin LI 206ca6a6373SXin LI // No-strict-align archs like x86-64 207ca6a6373SXin LI // --------------------------------- 208ca6a6373SXin LI // 209a8675d92SXin LI // The traditional way of casting e.g. *(const uint16_t *)uint8_pointer 210a8675d92SXin LI // is bad even if the uint8_pointer is properly aligned because this kind 211a8675d92SXin LI // of casts break strict aliasing rules and result in undefined behavior. 212a8675d92SXin LI // With unaligned pointers it's even worse: compilers may emit vector 213a8675d92SXin LI // instructions that require aligned pointers even if non-vector 214a8675d92SXin LI // instructions work with unaligned pointers. 215a8675d92SXin LI // 216a8675d92SXin LI // Using memcpy() is the standard compliant way to do unaligned access. 217a8675d92SXin LI // Many modern compilers inline it so there is no function call overhead. 218a8675d92SXin LI // For those compilers that don't handle the memcpy() method well, the 219a8675d92SXin LI // old casting method (that violates strict aliasing) can be requested at 220a8675d92SXin LI // build time. 
A third method, casting to a packed struct, would also be 221a8675d92SXin LI // an option but isn't provided to keep things simpler (it's already a mess). 222a8675d92SXin LI // Hopefully this is flexible enough in practice. 223ca6a6373SXin LI // 224ca6a6373SXin LI // Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that 225ca6a6373SXin LI // 226ca6a6373SXin LI // buf[0] | (buf[1] << 8) 227ca6a6373SXin LI // 228ca6a6373SXin LI // reads a 16-bit value and can emit a single 16-bit load and produce 229ca6a6373SXin LI // identical code than with the memcpy() method. In other cases Clang and GCC 230ca6a6373SXin LI // produce either the same or better code with memcpy(). For example, Clang 9 231ca6a6373SXin LI // on x86-64 can detect 32-bit load but not 16-bit load. 232ca6a6373SXin LI // 233ca6a6373SXin LI // MSVC uses unaligned access with the memcpy() method but emits byte-by-byte 234ca6a6373SXin LI // code for "buf[0] | (buf[1] << 8)". 235ca6a6373SXin LI // 236ca6a6373SXin LI // Conclusion: The memcpy() method is the best choice when unaligned access 237ca6a6373SXin LI // is supported. 238ca6a6373SXin LI // 239ca6a6373SXin LI // Strict-align archs like SPARC 240ca6a6373SXin LI // ----------------------------- 241ca6a6373SXin LI // 242ca6a6373SXin LI // GCC versions from around 4.x to to at least 13.2.0 produce worse code 243ca6a6373SXin LI // from the memcpy() method than from simple byte-by-byte shift-or code 244ca6a6373SXin LI // when reading a 32-bit integer: 245ca6a6373SXin LI // 246*26743408SXin LI // (1) It may be constructed on stack using four 8-bit loads, 247ca6a6373SXin LI // four 8-bit stores to stack, and finally one 32-bit load from stack. 248ca6a6373SXin LI // 249ca6a6373SXin LI // (2) Especially with -Os, an actual memcpy() call may be emitted. 250ca6a6373SXin LI // 251ca6a6373SXin LI // This is true on at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and 252ca6a6373SXin LI // RISC-V. 
Of these, ARM, ARM64, and RISC-V support unaligned access in 253ca6a6373SXin LI // some processors but not all so this is relevant only in the case when 254ca6a6373SXin LI // GCC assumes that unaligned is not supported or -mstrict-align or 255ca6a6373SXin LI // -mno-unaligned-access is used. 256ca6a6373SXin LI // 257ca6a6373SXin LI // For Clang it makes little difference. ARM64 with -O2 -mstrict-align 258ca6a6373SXin LI // was one the very few with a minor difference: the memcpy() version 259ca6a6373SXin LI // was one instruction longer. 260ca6a6373SXin LI // 261ca6a6373SXin LI // Conclusion: At least in case of GCC and Clang, byte-by-byte code is 2623b35e7eeSXin LI // the best choice for strict-align archs to do unaligned access. 263ca6a6373SXin LI // 264ca6a6373SXin LI // See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502 265ca6a6373SXin LI // 266ca6a6373SXin LI // Thanks to <https://godbolt.org/> it was easy to test different compilers. 267ca6a6373SXin LI // The following is for little endian targets: 268ca6a6373SXin LI /* 269ca6a6373SXin LI #include <stdint.h> 270ca6a6373SXin LI #include <string.h> 271ca6a6373SXin LI 272ca6a6373SXin LI uint32_t bytes16(const uint8_t *b) 273ca6a6373SXin LI { 274ca6a6373SXin LI return (uint32_t)b[0] 275ca6a6373SXin LI | ((uint32_t)b[1] << 8); 276ca6a6373SXin LI } 277ca6a6373SXin LI 278ca6a6373SXin LI uint32_t copy16(const uint8_t *b) 279ca6a6373SXin LI { 280ca6a6373SXin LI uint16_t v; 281ca6a6373SXin LI memcpy(&v, b, sizeof(v)); 282ca6a6373SXin LI return v; 283ca6a6373SXin LI } 284ca6a6373SXin LI 285ca6a6373SXin LI uint32_t bytes32(const uint8_t *b) 286ca6a6373SXin LI { 287ca6a6373SXin LI return (uint32_t)b[0] 288ca6a6373SXin LI | ((uint32_t)b[1] << 8) 289ca6a6373SXin LI | ((uint32_t)b[2] << 16) 290ca6a6373SXin LI | ((uint32_t)b[3] << 24); 291ca6a6373SXin LI } 292ca6a6373SXin LI 293ca6a6373SXin LI uint32_t copy32(const uint8_t *b) 294ca6a6373SXin LI { 295ca6a6373SXin LI uint32_t v; 296ca6a6373SXin LI memcpy(&v, b, 
sizeof(v)); 297ca6a6373SXin LI return v; 298ca6a6373SXin LI } 299ca6a6373SXin LI 300ca6a6373SXin LI void wbytes16(uint8_t *b, uint16_t v) 301ca6a6373SXin LI { 302ca6a6373SXin LI b[0] = (uint8_t)v; 303ca6a6373SXin LI b[1] = (uint8_t)(v >> 8); 304ca6a6373SXin LI } 305ca6a6373SXin LI 306ca6a6373SXin LI void wcopy16(uint8_t *b, uint16_t v) 307ca6a6373SXin LI { 308ca6a6373SXin LI memcpy(b, &v, sizeof(v)); 309ca6a6373SXin LI } 310ca6a6373SXin LI 311ca6a6373SXin LI void wbytes32(uint8_t *b, uint32_t v) 312ca6a6373SXin LI { 313ca6a6373SXin LI b[0] = (uint8_t)v; 314ca6a6373SXin LI b[1] = (uint8_t)(v >> 8); 315ca6a6373SXin LI b[2] = (uint8_t)(v >> 16); 316ca6a6373SXin LI b[3] = (uint8_t)(v >> 24); 317ca6a6373SXin LI } 318ca6a6373SXin LI 319ca6a6373SXin LI void wcopy32(uint8_t *b, uint32_t v) 320ca6a6373SXin LI { 321ca6a6373SXin LI memcpy(b, &v, sizeof(v)); 322ca6a6373SXin LI } 323ca6a6373SXin LI */ 324ca6a6373SXin LI 325ca6a6373SXin LI 326ca6a6373SXin LI #ifdef TUKLIB_FAST_UNALIGNED_ACCESS 32781ad8388SMartin Matuska 32881ad8388SMartin Matuska static inline uint16_t 329a8675d92SXin LI read16ne(const uint8_t *buf) 33081ad8388SMartin Matuska { 331ca6a6373SXin LI #ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING 332a8675d92SXin LI return *(const uint16_t *)buf; 333a8675d92SXin LI #else 334a8675d92SXin LI uint16_t num; 335a8675d92SXin LI memcpy(&num, buf, sizeof(num)); 336a8675d92SXin LI return num; 337a8675d92SXin LI #endif 33881ad8388SMartin Matuska } 33981ad8388SMartin Matuska 34081ad8388SMartin Matuska 34181ad8388SMartin Matuska static inline uint32_t 342a8675d92SXin LI read32ne(const uint8_t *buf) 34381ad8388SMartin Matuska { 344ca6a6373SXin LI #ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING 345a8675d92SXin LI return *(const uint32_t *)buf; 346a8675d92SXin LI #else 347a8675d92SXin LI uint32_t num; 348a8675d92SXin LI memcpy(&num, buf, sizeof(num)); 349a8675d92SXin LI return num; 350a8675d92SXin LI #endif 35181ad8388SMartin Matuska } 35281ad8388SMartin Matuska 35381ad8388SMartin Matuska 
// Reads a 64-bit native endian integer from a possibly unaligned buffer.
// memcpy() is the default; the pointer cast is used only when
// TUKLIB_USE_UNSAFE_TYPE_PUNNING is defined.
static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


// Writes a 16-bit integer in native byte order to a possibly unaligned
// buffer.
static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


// Writes a 32-bit integer in native byte order to a possibly unaligned
// buffer.
static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


// Writes a 64-bit integer in native byte order to a possibly unaligned
// buffer.
static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


// The fixed-endian readers below do a native endian read and then pass
// the value through the matching convXXYe() macro.

static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64le(num);
}


// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
455ca6a6373SXin LI #define write16be(buf, num) write16ne(buf, conv16be(num)) 456ca6a6373SXin LI #define write32be(buf, num) write32ne(buf, conv32be(num)) 457ca6a6373SXin LI #define write64be(buf, num) write64ne(buf, conv64be(num)) 458ca6a6373SXin LI #define write16le(buf, num) write16ne(buf, conv16le(num)) 459ca6a6373SXin LI #define write32le(buf, num) write32ne(buf, conv32le(num)) 460ca6a6373SXin LI #define write64le(buf, num) write64ne(buf, conv64le(num)) 461ca6a6373SXin LI 46273ed8e77SXin LI #else 463ca6a6373SXin LI 464ca6a6373SXin LI #ifdef WORDS_BIGENDIAN 465ca6a6373SXin LI # define read16ne read16be 466ca6a6373SXin LI # define read32ne read32be 467ca6a6373SXin LI # define read64ne read64be 468ca6a6373SXin LI # define write16ne write16be 469ca6a6373SXin LI # define write32ne write32be 470ca6a6373SXin LI # define write64ne write64be 471ca6a6373SXin LI #else 472ca6a6373SXin LI # define read16ne read16le 473ca6a6373SXin LI # define read32ne read32le 474ca6a6373SXin LI # define read64ne read64le 475ca6a6373SXin LI # define write16ne write16le 476ca6a6373SXin LI # define write32ne write32le 477ca6a6373SXin LI # define write64ne write64le 478ca6a6373SXin LI #endif 479ca6a6373SXin LI 480ca6a6373SXin LI 481ca6a6373SXin LI static inline uint16_t 482ca6a6373SXin LI read16be(const uint8_t *buf) 483ca6a6373SXin LI { 484ca6a6373SXin LI uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1]; 485ca6a6373SXin LI return num; 486ca6a6373SXin LI } 487ca6a6373SXin LI 488ca6a6373SXin LI 489ca6a6373SXin LI static inline uint16_t 490ca6a6373SXin LI read16le(const uint8_t *buf) 491ca6a6373SXin LI { 492ca6a6373SXin LI uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8); 493ca6a6373SXin LI return num; 494ca6a6373SXin LI } 495ca6a6373SXin LI 496ca6a6373SXin LI 497ca6a6373SXin LI static inline uint32_t 498ca6a6373SXin LI read32be(const uint8_t *buf) 499ca6a6373SXin LI { 500ca6a6373SXin LI uint32_t num = (uint32_t)buf[0] << 24; 501ca6a6373SXin LI num |= (uint32_t)buf[1] << 16; 
502ca6a6373SXin LI num |= (uint32_t)buf[2] << 8; 503ca6a6373SXin LI num |= (uint32_t)buf[3]; 504ca6a6373SXin LI return num; 505ca6a6373SXin LI } 506ca6a6373SXin LI 507ca6a6373SXin LI 508ca6a6373SXin LI static inline uint32_t 509ca6a6373SXin LI read32le(const uint8_t *buf) 510ca6a6373SXin LI { 511ca6a6373SXin LI uint32_t num = (uint32_t)buf[0]; 512ca6a6373SXin LI num |= (uint32_t)buf[1] << 8; 513ca6a6373SXin LI num |= (uint32_t)buf[2] << 16; 514ca6a6373SXin LI num |= (uint32_t)buf[3] << 24; 515ca6a6373SXin LI return num; 516ca6a6373SXin LI } 517ca6a6373SXin LI 518ca6a6373SXin LI 519ca6a6373SXin LI static inline uint64_t 520ca6a6373SXin LI read64be(const uint8_t *buf) 521ca6a6373SXin LI { 52273ed8e77SXin LI uint64_t num = (uint64_t)buf[0] << 56; 52373ed8e77SXin LI num |= (uint64_t)buf[1] << 48; 52473ed8e77SXin LI num |= (uint64_t)buf[2] << 40; 52573ed8e77SXin LI num |= (uint64_t)buf[3] << 32; 52673ed8e77SXin LI num |= (uint64_t)buf[4] << 24; 52773ed8e77SXin LI num |= (uint64_t)buf[5] << 16; 52873ed8e77SXin LI num |= (uint64_t)buf[6] << 8; 52973ed8e77SXin LI num |= (uint64_t)buf[7]; 53073ed8e77SXin LI return num; 53173ed8e77SXin LI } 53273ed8e77SXin LI 53373ed8e77SXin LI 53473ed8e77SXin LI static inline uint64_t 53573ed8e77SXin LI read64le(const uint8_t *buf) 53673ed8e77SXin LI { 53773ed8e77SXin LI uint64_t num = (uint64_t)buf[0]; 53873ed8e77SXin LI num |= (uint64_t)buf[1] << 8; 53973ed8e77SXin LI num |= (uint64_t)buf[2] << 16; 54073ed8e77SXin LI num |= (uint64_t)buf[3] << 24; 54173ed8e77SXin LI num |= (uint64_t)buf[4] << 32; 54273ed8e77SXin LI num |= (uint64_t)buf[5] << 40; 54373ed8e77SXin LI num |= (uint64_t)buf[6] << 48; 54473ed8e77SXin LI num |= (uint64_t)buf[7] << 56; 54573ed8e77SXin LI return num; 54673ed8e77SXin LI } 54773ed8e77SXin LI 54873ed8e77SXin LI 54981ad8388SMartin Matuska static inline void 550a8675d92SXin LI write16be(uint8_t *buf, uint16_t num) 55181ad8388SMartin Matuska { 552342bcb12SXin LI buf[0] = (uint8_t)(num >> 8); 553342bcb12SXin LI buf[1] = 
(uint8_t)num; 55481ad8388SMartin Matuska return; 55581ad8388SMartin Matuska } 55681ad8388SMartin Matuska 55781ad8388SMartin Matuska 55881ad8388SMartin Matuska static inline void 559a8675d92SXin LI write16le(uint8_t *buf, uint16_t num) 56081ad8388SMartin Matuska { 561342bcb12SXin LI buf[0] = (uint8_t)num; 562342bcb12SXin LI buf[1] = (uint8_t)(num >> 8); 56381ad8388SMartin Matuska return; 56481ad8388SMartin Matuska } 56581ad8388SMartin Matuska 56681ad8388SMartin Matuska 56781ad8388SMartin Matuska static inline void 568a8675d92SXin LI write32be(uint8_t *buf, uint32_t num) 56981ad8388SMartin Matuska { 570342bcb12SXin LI buf[0] = (uint8_t)(num >> 24); 571342bcb12SXin LI buf[1] = (uint8_t)(num >> 16); 572342bcb12SXin LI buf[2] = (uint8_t)(num >> 8); 573342bcb12SXin LI buf[3] = (uint8_t)num; 57481ad8388SMartin Matuska return; 57581ad8388SMartin Matuska } 57681ad8388SMartin Matuska 57781ad8388SMartin Matuska 57881ad8388SMartin Matuska static inline void 579a8675d92SXin LI write32le(uint8_t *buf, uint32_t num) 58081ad8388SMartin Matuska { 581342bcb12SXin LI buf[0] = (uint8_t)num; 582342bcb12SXin LI buf[1] = (uint8_t)(num >> 8); 583342bcb12SXin LI buf[2] = (uint8_t)(num >> 16); 584342bcb12SXin LI buf[3] = (uint8_t)(num >> 24); 58581ad8388SMartin Matuska return; 58681ad8388SMartin Matuska } 587ca6a6373SXin LI 588ca6a6373SXin LI 589ca6a6373SXin LI static inline void 590ca6a6373SXin LI write64be(uint8_t *buf, uint64_t num) 591ca6a6373SXin LI { 592ca6a6373SXin LI buf[0] = (uint8_t)(num >> 56); 593ca6a6373SXin LI buf[1] = (uint8_t)(num >> 48); 594ca6a6373SXin LI buf[2] = (uint8_t)(num >> 40); 595ca6a6373SXin LI buf[3] = (uint8_t)(num >> 32); 596ca6a6373SXin LI buf[4] = (uint8_t)(num >> 24); 597ca6a6373SXin LI buf[5] = (uint8_t)(num >> 16); 598ca6a6373SXin LI buf[6] = (uint8_t)(num >> 8); 599ca6a6373SXin LI buf[7] = (uint8_t)num; 600ca6a6373SXin LI return; 601ca6a6373SXin LI } 602ca6a6373SXin LI 603ca6a6373SXin LI 604ca6a6373SXin LI static inline void 605ca6a6373SXin LI 
write64le(uint8_t *buf, uint64_t num) 606ca6a6373SXin LI { 607ca6a6373SXin LI buf[0] = (uint8_t)num; 608ca6a6373SXin LI buf[1] = (uint8_t)(num >> 8); 609ca6a6373SXin LI buf[2] = (uint8_t)(num >> 16); 610ca6a6373SXin LI buf[3] = (uint8_t)(num >> 24); 611ca6a6373SXin LI buf[4] = (uint8_t)(num >> 32); 612ca6a6373SXin LI buf[5] = (uint8_t)(num >> 40); 613ca6a6373SXin LI buf[6] = (uint8_t)(num >> 48); 614ca6a6373SXin LI buf[7] = (uint8_t)(num >> 56); 615ca6a6373SXin LI return; 616ca6a6373SXin LI } 617ca6a6373SXin LI 61881ad8388SMartin Matuska #endif 61981ad8388SMartin Matuska 62081ad8388SMartin Matuska 621a8675d92SXin LI ////////////////////////////// 622a8675d92SXin LI // Aligned reads and writes // 623a8675d92SXin LI ////////////////////////////// 624a8675d92SXin LI 625a8675d92SXin LI // Separate functions for aligned reads and writes are provided since on 626a8675d92SXin LI // strict-align archs aligned access is much faster than unaligned access. 627a8675d92SXin LI // 628a8675d92SXin LI // Just like in the unaligned case, memcpy() is needed to avoid 629a8675d92SXin LI // strict aliasing violations. However, on archs that don't support 630a8675d92SXin LI // unaligned access the compiler cannot know that the pointers given 631a8675d92SXin LI // to memcpy() are aligned which results in slow code. As of C11 there is 632a8675d92SXin LI // no standard way to tell the compiler that we know that the address is 633a8675d92SXin LI // aligned but some compilers have language extensions to do that. With 634a8675d92SXin LI // such language extensions the memcpy() method gives excellent results. 635a8675d92SXin LI // 6363b35e7eeSXin LI // What to do on a strict-align system when no known language extensions 637a8675d92SXin LI // are available? Falling back to byte-by-byte access would be safe but ruin 638a8675d92SXin LI // optimizations that have been made specifically with aligned access in mind. 
// As a compromise, aligned reads will fall back to non-compliant type punning
// but aligned writes will be byte-by-byte, that is, fast reads are preferred
// over fast writes. This obviously isn't great but hopefully it's a working
// compromise for now.
//
// __builtin_assume_aligned is supported by GCC >= 4.7 and clang >= 3.6.
//
// Two variants are needed because the alignment promise applies to a
// different argument depending on the direction:
//   - tuklib_memcpy_aligned():      src (the buffer being read) is aligned
//   - tuklib_memcpy_aligned_dest(): dest (the buffer being written) is aligned
// The other argument is always the address of a local integer.
#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#	define tuklib_memcpy_aligned_dest(dest, src, size) \
		memcpy(__builtin_assume_aligned(dest, size), src, size)
#else
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
#	define tuklib_memcpy_aligned_dest(dest, src, size) \
		memcpy(dest, src, size)
#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#endif


// Reads a 16-bit native endian integer. buf must be aligned for uint16_t.
static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


// Reads a 32-bit native endian integer. buf must be aligned for uint32_t.
static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


// Reads a 64-bit native endian integer. buf must be aligned for uint64_t.
static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


// Writes a 16-bit integer in native byte order. buf must be aligned for
// uint16_t; the alignment hint is applied to buf, not to &num.
static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	tuklib_memcpy_aligned_dest(buf, &num, sizeof(num));
#endif
	return;
}


// Writes a 32-bit integer in native byte order. buf must be aligned for
// uint32_t.
static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	tuklib_memcpy_aligned_dest(buf, &num, sizeof(num));
#endif
	return;
}


// Writes a 64-bit integer in native byte order. buf must be aligned for
// uint64_t.
static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	tuklib_memcpy_aligned_dest(buf, &num, sizeof(num));
#endif
	return;
}


// The fixed-endian aligned readers do an aligned native endian read and
// then pass the value through the matching convXXYe() macro.

static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64le(num);
}


// These need to be macros like in the unaligned case.
784a8675d92SXin LI #define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num)) 785a8675d92SXin LI #define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num)) 786a8675d92SXin LI #define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num)) 787a8675d92SXin LI #define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num)) 788a8675d92SXin LI #define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num)) 789a8675d92SXin LI #define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num)) 790a8675d92SXin LI 791a8675d92SXin LI 792a8675d92SXin LI //////////////////// 793a8675d92SXin LI // Bit operations // 794a8675d92SXin LI //////////////////// 795a8675d92SXin LI 79681ad8388SMartin Matuska static inline uint32_t 79781ad8388SMartin Matuska bsr32(uint32_t n) 79881ad8388SMartin Matuska { 79981ad8388SMartin Matuska // Check for ICC first, since it tends to define __GNUC__ too. 80081ad8388SMartin Matuska #if defined(__INTEL_COMPILER) 80181ad8388SMartin Matuska return _bit_scan_reverse(n); 80281ad8388SMartin Matuska 803b333cd44SXin LI #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX 80481ad8388SMartin Matuska // GCC >= 3.4 has __builtin_clz(), which gives good results on 80581ad8388SMartin Matuska // multiple architectures. On x86, __builtin_clz() ^ 31U becomes 80681ad8388SMartin Matuska // either plain BSR (so the XOR gets optimized away) or LZCNT and 80781ad8388SMartin Matuska // XOR (if -march indicates that SSE4a instructions are supported). 
808a8675d92SXin LI return (uint32_t)__builtin_clz(n) ^ 31U; 80981ad8388SMartin Matuska 81081ad8388SMartin Matuska #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) 81181ad8388SMartin Matuska uint32_t i; 81281ad8388SMartin Matuska __asm__("bsrl %1, %0" : "=r" (i) : "rm" (n)); 81381ad8388SMartin Matuska return i; 81481ad8388SMartin Matuska 815a8675d92SXin LI #elif defined(_MSC_VER) 816a8675d92SXin LI unsigned long i; 817a8675d92SXin LI _BitScanReverse(&i, n); 81881ad8388SMartin Matuska return i; 81981ad8388SMartin Matuska 82081ad8388SMartin Matuska #else 82181ad8388SMartin Matuska uint32_t i = 31; 82281ad8388SMartin Matuska 823a8675d92SXin LI if ((n & 0xFFFF0000) == 0) { 82481ad8388SMartin Matuska n <<= 16; 82581ad8388SMartin Matuska i = 15; 82681ad8388SMartin Matuska } 82781ad8388SMartin Matuska 828a8675d92SXin LI if ((n & 0xFF000000) == 0) { 82981ad8388SMartin Matuska n <<= 8; 83081ad8388SMartin Matuska i -= 8; 83181ad8388SMartin Matuska } 83281ad8388SMartin Matuska 833a8675d92SXin LI if ((n & 0xF0000000) == 0) { 83481ad8388SMartin Matuska n <<= 4; 83581ad8388SMartin Matuska i -= 4; 83681ad8388SMartin Matuska } 83781ad8388SMartin Matuska 838a8675d92SXin LI if ((n & 0xC0000000) == 0) { 83981ad8388SMartin Matuska n <<= 2; 84081ad8388SMartin Matuska i -= 2; 84181ad8388SMartin Matuska } 84281ad8388SMartin Matuska 843a8675d92SXin LI if ((n & 0x80000000) == 0) 84481ad8388SMartin Matuska --i; 84581ad8388SMartin Matuska 84681ad8388SMartin Matuska return i; 84781ad8388SMartin Matuska #endif 84881ad8388SMartin Matuska } 84981ad8388SMartin Matuska 85081ad8388SMartin Matuska 85181ad8388SMartin Matuska static inline uint32_t 85281ad8388SMartin Matuska clz32(uint32_t n) 85381ad8388SMartin Matuska { 85481ad8388SMartin Matuska #if defined(__INTEL_COMPILER) 85581ad8388SMartin Matuska return _bit_scan_reverse(n) ^ 31U; 85681ad8388SMartin Matuska 857b333cd44SXin LI #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX 858a8675d92SXin LI 
return (uint32_t)__builtin_clz(n); 85981ad8388SMartin Matuska 86081ad8388SMartin Matuska #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) 86181ad8388SMartin Matuska uint32_t i; 86281ad8388SMartin Matuska __asm__("bsrl %1, %0\n\t" 86381ad8388SMartin Matuska "xorl $31, %0" 86481ad8388SMartin Matuska : "=r" (i) : "rm" (n)); 86581ad8388SMartin Matuska return i; 86681ad8388SMartin Matuska 867a8675d92SXin LI #elif defined(_MSC_VER) 868a8675d92SXin LI unsigned long i; 869a8675d92SXin LI _BitScanReverse(&i, n); 87081ad8388SMartin Matuska return i ^ 31U; 87181ad8388SMartin Matuska 87281ad8388SMartin Matuska #else 87381ad8388SMartin Matuska uint32_t i = 0; 87481ad8388SMartin Matuska 875a8675d92SXin LI if ((n & 0xFFFF0000) == 0) { 87681ad8388SMartin Matuska n <<= 16; 87781ad8388SMartin Matuska i = 16; 87881ad8388SMartin Matuska } 87981ad8388SMartin Matuska 880a8675d92SXin LI if ((n & 0xFF000000) == 0) { 88181ad8388SMartin Matuska n <<= 8; 88281ad8388SMartin Matuska i += 8; 88381ad8388SMartin Matuska } 88481ad8388SMartin Matuska 885a8675d92SXin LI if ((n & 0xF0000000) == 0) { 88681ad8388SMartin Matuska n <<= 4; 88781ad8388SMartin Matuska i += 4; 88881ad8388SMartin Matuska } 88981ad8388SMartin Matuska 890a8675d92SXin LI if ((n & 0xC0000000) == 0) { 89181ad8388SMartin Matuska n <<= 2; 89281ad8388SMartin Matuska i += 2; 89381ad8388SMartin Matuska } 89481ad8388SMartin Matuska 895a8675d92SXin LI if ((n & 0x80000000) == 0) 89681ad8388SMartin Matuska ++i; 89781ad8388SMartin Matuska 89881ad8388SMartin Matuska return i; 89981ad8388SMartin Matuska #endif 90081ad8388SMartin Matuska } 90181ad8388SMartin Matuska 90281ad8388SMartin Matuska 90381ad8388SMartin Matuska static inline uint32_t 90481ad8388SMartin Matuska ctz32(uint32_t n) 90581ad8388SMartin Matuska { 90681ad8388SMartin Matuska #if defined(__INTEL_COMPILER) 90781ad8388SMartin Matuska return _bit_scan_forward(n); 90881ad8388SMartin Matuska 909b333cd44SXin LI #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) 
&& UINT_MAX >= UINT32_MAX 910a8675d92SXin LI return (uint32_t)__builtin_ctz(n); 91181ad8388SMartin Matuska 91281ad8388SMartin Matuska #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) 91381ad8388SMartin Matuska uint32_t i; 91481ad8388SMartin Matuska __asm__("bsfl %1, %0" : "=r" (i) : "rm" (n)); 91581ad8388SMartin Matuska return i; 91681ad8388SMartin Matuska 917a8675d92SXin LI #elif defined(_MSC_VER) 918a8675d92SXin LI unsigned long i; 919a8675d92SXin LI _BitScanForward(&i, n); 92081ad8388SMartin Matuska return i; 92181ad8388SMartin Matuska 92281ad8388SMartin Matuska #else 92381ad8388SMartin Matuska uint32_t i = 0; 92481ad8388SMartin Matuska 925a8675d92SXin LI if ((n & 0x0000FFFF) == 0) { 92681ad8388SMartin Matuska n >>= 16; 92781ad8388SMartin Matuska i = 16; 92881ad8388SMartin Matuska } 92981ad8388SMartin Matuska 930a8675d92SXin LI if ((n & 0x000000FF) == 0) { 93181ad8388SMartin Matuska n >>= 8; 93281ad8388SMartin Matuska i += 8; 93381ad8388SMartin Matuska } 93481ad8388SMartin Matuska 935a8675d92SXin LI if ((n & 0x0000000F) == 0) { 93681ad8388SMartin Matuska n >>= 4; 93781ad8388SMartin Matuska i += 4; 93881ad8388SMartin Matuska } 93981ad8388SMartin Matuska 940a8675d92SXin LI if ((n & 0x00000003) == 0) { 94181ad8388SMartin Matuska n >>= 2; 94281ad8388SMartin Matuska i += 2; 94381ad8388SMartin Matuska } 94481ad8388SMartin Matuska 945a8675d92SXin LI if ((n & 0x00000001) == 0) 94681ad8388SMartin Matuska ++i; 94781ad8388SMartin Matuska 94881ad8388SMartin Matuska return i; 94981ad8388SMartin Matuska #endif 95081ad8388SMartin Matuska } 95181ad8388SMartin Matuska 95281ad8388SMartin Matuska #define bsf32 ctz32 95381ad8388SMartin Matuska 95481ad8388SMartin Matuska #endif 955