xref: /freebsd/contrib/xz/src/common/tuklib_integer.h (revision ca6a6373bdaed010d6cbfb27f7249ae96009409d)
181ad8388SMartin Matuska ///////////////////////////////////////////////////////////////////////////////
281ad8388SMartin Matuska //
381ad8388SMartin Matuska /// \file       tuklib_integer.h
481ad8388SMartin Matuska /// \brief      Various integer and bit operations
581ad8388SMartin Matuska ///
681ad8388SMartin Matuska /// This file provides macros or functions to do some basic integer and bit
781ad8388SMartin Matuska /// operations.
881ad8388SMartin Matuska ///
9a8675d92SXin LI /// Native endian inline functions (XX = 16, 32, or 64):
10a8675d92SXin LI ///   - Unaligned native endian reads: readXXne(ptr)
11a8675d92SXin LI ///   - Unaligned native endian writes: writeXXne(ptr, num)
12a8675d92SXin LI ///   - Aligned native endian reads: aligned_readXXne(ptr)
13a8675d92SXin LI ///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
14a8675d92SXin LI ///
15a8675d92SXin LI /// Endianness-converting integer operations (these can be macros!)
16a8675d92SXin LI /// (XX = 16, 32, or 64; Y = b or l):
1781ad8388SMartin Matuska ///   - Byte swapping: bswapXX(num)
18a8675d92SXin LI ///   - Byte order conversions to/from native (byteswaps if Y isn't
19a8675d92SXin LI ///     the native endianness): convXXYe(num)
2073ed8e77SXin LI ///   - Unaligned reads: readXXYe(ptr)
2173ed8e77SXin LI ///   - Unaligned writes: writeXXYe(ptr, num)
22a8675d92SXin LI ///   - Aligned reads: aligned_readXXYe(ptr)
23a8675d92SXin LI ///   - Aligned writes: aligned_writeXXYe(ptr, num)
2481ad8388SMartin Matuska ///
/// Since the above can be macros, the arguments should have no side effects
26a8675d92SXin LI /// because they may be evaluated more than once.
2781ad8388SMartin Matuska ///
28a8675d92SXin LI /// Bit scan operations for non-zero 32-bit integers (inline functions):
2981ad8388SMartin Matuska ///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
3081ad8388SMartin Matuska ///   - Count leading zeros: clz32(num)
3181ad8388SMartin Matuska ///   - Count trailing zeros: ctz32(num)
3281ad8388SMartin Matuska ///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
3381ad8388SMartin Matuska ///
3481ad8388SMartin Matuska /// The above bit scan operations return 0-31. If num is zero,
3581ad8388SMartin Matuska /// the result is undefined.
3681ad8388SMartin Matuska //
3781ad8388SMartin Matuska //  Authors:    Lasse Collin
3881ad8388SMartin Matuska //              Joachim Henke
3981ad8388SMartin Matuska //
4081ad8388SMartin Matuska //  This file has been put into the public domain.
4181ad8388SMartin Matuska //  You can do whatever you want with this file.
4281ad8388SMartin Matuska //
4381ad8388SMartin Matuska ///////////////////////////////////////////////////////////////////////////////
4481ad8388SMartin Matuska 
4581ad8388SMartin Matuska #ifndef TUKLIB_INTEGER_H
4681ad8388SMartin Matuska #define TUKLIB_INTEGER_H
4781ad8388SMartin Matuska 
4881ad8388SMartin Matuska #include "tuklib_common.h"
49a8675d92SXin LI #include <string.h>
50a8675d92SXin LI 
51a8675d92SXin LI // Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
52a8675d92SXin LI // and such functions.
53a8675d92SXin LI #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
54a8675d92SXin LI #	include <immintrin.h>
55b333cd44SXin LI // Only include <intrin.h> when it is needed. GCC and Clang can both
// use __builtin's, so we only need Windows intrinsics when using MSVC.
57b333cd44SXin LI // GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
58b333cd44SXin LI // cases explicitly.
59b333cd44SXin LI #elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
60b333cd44SXin LI #	include <intrin.h>
61a8675d92SXin LI #endif
6281ad8388SMartin Matuska 
6381ad8388SMartin Matuska 
64a8675d92SXin LI ///////////////////
65a8675d92SXin LI // Byte swapping //
66a8675d92SXin LI ///////////////////
6781ad8388SMartin Matuska 
// Select the byte swapping primitives offered by the compiler or the
// platform headers, based on the HAVE_* macros set by the build system.
// Whatever is still undefined after this chain is supplied by the generic
// "#ifndef bswapXX" / "#ifndef convXXYe" fallbacks that follow in this file.
#if defined(HAVE___BUILTIN_BSWAPXX)
	// GCC >= 4.8 and Clang
#	define bswap16(n) __builtin_bswap16(n)
#	define bswap32(n) __builtin_bswap32(n)
#	define bswap64(n) __builtin_bswap64(n)

#elif defined(HAVE_BYTESWAP_H)
	// glibc, uClibc, dietlibc
#	include <byteswap.h>
#	ifdef HAVE_BSWAP_16
#		define bswap16(num) bswap_16(num)
#	endif
#	ifdef HAVE_BSWAP_32
#		define bswap32(num) bswap_32(num)
#	endif
#	ifdef HAVE_BSWAP_64
#		define bswap64(num) bswap_64(num)
#	endif

#elif defined(HAVE_SYS_ENDIAN_H)
	// *BSDs and Darwin
	// NOTE(review): nothing is defined here, so the header itself is
	// presumably expected to provide bswapXX(); otherwise the generic
	// fallbacks are used.
#	include <sys/endian.h>

#elif defined(HAVE_SYS_BYTEORDER_H)
	// Solaris: map the upper-case system macros to the names used in
	// this file, but only those that the header actually provides.
#	include <sys/byteorder.h>
#	ifdef BSWAP_16
#		define bswap16(num) BSWAP_16(num)
#	endif
#	ifdef BSWAP_32
#		define bswap32(num) BSWAP_32(num)
#	endif
#	ifdef BSWAP_64
#		define bswap64(num) BSWAP_64(num)
#	endif
#	ifdef BE_16
#		define conv16be(num) BE_16(num)
#	endif
#	ifdef BE_32
#		define conv32be(num) BE_32(num)
#	endif
#	ifdef BE_64
#		define conv64be(num) BE_64(num)
#	endif
#	ifdef LE_16
#		define conv16le(num) LE_16(num)
#	endif
#	ifdef LE_32
#		define conv32le(num) LE_32(num)
#	endif
#	ifdef LE_64
#		define conv64le(num) LE_64(num)
#	endif
#endif
12281ad8388SMartin Matuska 
#ifndef bswap16
	// Generic fallback: swap the two low bytes of n. The argument is
	// evaluated twice. The whole expansion is parenthesized so that it
	// behaves like a primary expression (e.g. "sizeof bswap16(x)" works).
#	define bswap16(n) ((uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	))
#endif
12981ad8388SMartin Matuska 
#ifndef bswap32
	// Generic fallback: reverse the order of the four low bytes of n.
	// The argument is evaluated four times. The whole expansion is
	// parenthesized so that it behaves like a primary expression.
#	define bswap32(n) ((uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	))
#endif
13881ad8388SMartin Matuska 
#ifndef bswap64
	// Generic fallback: reverse the order of the eight bytes of n.
	// The argument is evaluated eight times. The whole expansion is
	// parenthesized so that it behaves like a primary expression.
#	define bswap64(n) ((uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	))
#endif
15181ad8388SMartin Matuska 
// Define conversion macros using the basic byte swapping macros.
//
// convXXYe(num) converts between native byte order and byte order Y
// (b = big endian, l = little endian). When Y is the native order the
// macro is just a cast to the fixed-width type; otherwise it byte swaps.
// Each macro is defined only if the platform headers didn't already
// provide it.
#ifdef WORDS_BIGENDIAN
	// Native order is big endian: "be" is a no-op, "le" swaps.
#	ifndef conv16be
#		define conv16be(num) ((uint16_t)(num))
#	endif
#	ifndef conv32be
#		define conv32be(num) ((uint32_t)(num))
#	endif
#	ifndef conv64be
#		define conv64be(num) ((uint64_t)(num))
#	endif
#	ifndef conv16le
#		define conv16le(num) bswap16(num)
#	endif
#	ifndef conv32le
#		define conv32le(num) bswap32(num)
#	endif
#	ifndef conv64le
#		define conv64le(num) bswap64(num)
#	endif
#else
	// Native order is little endian: "le" is a no-op, "be" swaps.
#	ifndef conv16be
#		define conv16be(num) bswap16(num)
#	endif
#	ifndef conv32be
#		define conv32be(num) bswap32(num)
#	endif
#	ifndef conv64be
#		define conv64be(num) bswap64(num)
#	endif
#	ifndef conv16le
#		define conv16le(num) ((uint16_t)(num))
#	endif
#	ifndef conv32le
#		define conv32le(num) ((uint32_t)(num))
#	endif
#	ifndef conv64le
#		define conv64le(num) ((uint64_t)(num))
#	endif
#endif
19281ad8388SMartin Matuska 
19381ad8388SMartin Matuska 
194a8675d92SXin LI ////////////////////////////////
195a8675d92SXin LI // Unaligned reads and writes //
196a8675d92SXin LI ////////////////////////////////
197a8675d92SXin LI 
198*ca6a6373SXin LI // No-strict-align archs like x86-64
199*ca6a6373SXin LI // ---------------------------------
200*ca6a6373SXin LI //
201a8675d92SXin LI // The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
202a8675d92SXin LI // is bad even if the uint8_pointer is properly aligned because this kind
203a8675d92SXin LI // of casts break strict aliasing rules and result in undefined behavior.
204a8675d92SXin LI // With unaligned pointers it's even worse: compilers may emit vector
205a8675d92SXin LI // instructions that require aligned pointers even if non-vector
206a8675d92SXin LI // instructions work with unaligned pointers.
207a8675d92SXin LI //
208a8675d92SXin LI // Using memcpy() is the standard compliant way to do unaligned access.
209a8675d92SXin LI // Many modern compilers inline it so there is no function call overhead.
210a8675d92SXin LI // For those compilers that don't handle the memcpy() method well, the
211a8675d92SXin LI // old casting method (that violates strict aliasing) can be requested at
212a8675d92SXin LI // build time. A third method, casting to a packed struct, would also be
213a8675d92SXin LI // an option but isn't provided to keep things simpler (it's already a mess).
214a8675d92SXin LI // Hopefully this is flexible enough in practice.
215*ca6a6373SXin LI //
216*ca6a6373SXin LI // Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
217*ca6a6373SXin LI //
218*ca6a6373SXin LI //     buf[0] | (buf[1] << 8)
219*ca6a6373SXin LI //
220*ca6a6373SXin LI // reads a 16-bit value and can emit a single 16-bit load and produce
// code identical to that of the memcpy() method. In other cases Clang and GCC
222*ca6a6373SXin LI // produce either the same or better code with memcpy(). For example, Clang 9
223*ca6a6373SXin LI // on x86-64 can detect 32-bit load but not 16-bit load.
224*ca6a6373SXin LI //
225*ca6a6373SXin LI // MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
226*ca6a6373SXin LI // code for "buf[0] | (buf[1] << 8)".
227*ca6a6373SXin LI //
228*ca6a6373SXin LI // Conclusion: The memcpy() method is the best choice when unaligned access
229*ca6a6373SXin LI // is supported.
230*ca6a6373SXin LI //
231*ca6a6373SXin LI // Strict-align archs like SPARC
232*ca6a6373SXin LI // -----------------------------
233*ca6a6373SXin LI //
// GCC versions from around 4.x to at least 13.2.0 produce worse code
235*ca6a6373SXin LI // from the memcpy() method than from simple byte-by-byte shift-or code
236*ca6a6373SXin LI // when reading a 32-bit integer:
237*ca6a6373SXin LI //
//     (1) It may be constructed on stack using four 8-bit loads,
239*ca6a6373SXin LI //         four 8-bit stores to stack, and finally one 32-bit load from stack.
240*ca6a6373SXin LI //
241*ca6a6373SXin LI //     (2) Especially with -Os, an actual memcpy() call may be emitted.
242*ca6a6373SXin LI //
// This is true at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
244*ca6a6373SXin LI // RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
245*ca6a6373SXin LI // some processors but not all so this is relevant only in the case when
246*ca6a6373SXin LI // GCC assumes that unaligned is not supported or -mstrict-align or
247*ca6a6373SXin LI // -mno-unaligned-access is used.
248*ca6a6373SXin LI //
249*ca6a6373SXin LI // For Clang it makes little difference. ARM64 with -O2 -mstrict-align
// was one of the very few with a minor difference: the memcpy() version
251*ca6a6373SXin LI // was one instruction longer.
252*ca6a6373SXin LI //
253*ca6a6373SXin LI // Conclusion: At least in case of GCC and Clang, byte-by-byte code is
// the best choice for strict-align archs to do unaligned access.
255*ca6a6373SXin LI //
256*ca6a6373SXin LI // See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
257*ca6a6373SXin LI //
258*ca6a6373SXin LI // Thanks to <https://godbolt.org/> it was easy to test different compilers.
259*ca6a6373SXin LI // The following is for little endian targets:
260*ca6a6373SXin LI /*
261*ca6a6373SXin LI #include <stdint.h>
262*ca6a6373SXin LI #include <string.h>
263*ca6a6373SXin LI 
264*ca6a6373SXin LI uint32_t bytes16(const uint8_t *b)
265*ca6a6373SXin LI {
266*ca6a6373SXin LI     return (uint32_t)b[0]
267*ca6a6373SXin LI         | ((uint32_t)b[1] << 8);
268*ca6a6373SXin LI }
269*ca6a6373SXin LI 
270*ca6a6373SXin LI uint32_t copy16(const uint8_t *b)
271*ca6a6373SXin LI {
272*ca6a6373SXin LI     uint16_t v;
273*ca6a6373SXin LI     memcpy(&v, b, sizeof(v));
274*ca6a6373SXin LI     return v;
275*ca6a6373SXin LI }
276*ca6a6373SXin LI 
277*ca6a6373SXin LI uint32_t bytes32(const uint8_t *b)
278*ca6a6373SXin LI {
279*ca6a6373SXin LI     return (uint32_t)b[0]
280*ca6a6373SXin LI         | ((uint32_t)b[1] << 8)
281*ca6a6373SXin LI         | ((uint32_t)b[2] << 16)
282*ca6a6373SXin LI         | ((uint32_t)b[3] << 24);
283*ca6a6373SXin LI }
284*ca6a6373SXin LI 
285*ca6a6373SXin LI uint32_t copy32(const uint8_t *b)
286*ca6a6373SXin LI {
287*ca6a6373SXin LI     uint32_t v;
288*ca6a6373SXin LI     memcpy(&v, b, sizeof(v));
289*ca6a6373SXin LI     return v;
290*ca6a6373SXin LI }
291*ca6a6373SXin LI 
292*ca6a6373SXin LI void wbytes16(uint8_t *b, uint16_t v)
293*ca6a6373SXin LI {
294*ca6a6373SXin LI     b[0] = (uint8_t)v;
295*ca6a6373SXin LI     b[1] = (uint8_t)(v >> 8);
296*ca6a6373SXin LI }
297*ca6a6373SXin LI 
298*ca6a6373SXin LI void wcopy16(uint8_t *b, uint16_t v)
299*ca6a6373SXin LI {
300*ca6a6373SXin LI     memcpy(b, &v, sizeof(v));
301*ca6a6373SXin LI }
302*ca6a6373SXin LI 
303*ca6a6373SXin LI void wbytes32(uint8_t *b, uint32_t v)
304*ca6a6373SXin LI {
305*ca6a6373SXin LI     b[0] = (uint8_t)v;
306*ca6a6373SXin LI     b[1] = (uint8_t)(v >> 8);
307*ca6a6373SXin LI     b[2] = (uint8_t)(v >> 16);
308*ca6a6373SXin LI     b[3] = (uint8_t)(v >> 24);
309*ca6a6373SXin LI }
310*ca6a6373SXin LI 
311*ca6a6373SXin LI void wcopy32(uint8_t *b, uint32_t v)
312*ca6a6373SXin LI {
313*ca6a6373SXin LI     memcpy(b, &v, sizeof(v));
314*ca6a6373SXin LI }
315*ca6a6373SXin LI */
316*ca6a6373SXin LI 
317*ca6a6373SXin LI 
318*ca6a6373SXin LI #ifdef TUKLIB_FAST_UNALIGNED_ACCESS
31981ad8388SMartin Matuska 
/// \brief      Read a 16-bit native endian integer from an unaligned buffer
///
/// With TUKLIB_USE_UNSAFE_TYPE_PUNNING the value is loaded through a cast
/// pointer (this violates strict aliasing); otherwise it is copied out with
/// memcpy().
///
/// \param      buf     Pointer to at least two readable bytes
///
/// \return     The two bytes at buf interpreted in native byte order
static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}
33181ad8388SMartin Matuska 
33281ad8388SMartin Matuska 
/// \brief      Read a 32-bit native endian integer from an unaligned buffer
///
/// With TUKLIB_USE_UNSAFE_TYPE_PUNNING the value is loaded through a cast
/// pointer (this violates strict aliasing); otherwise it is copied out with
/// memcpy().
///
/// \param      buf     Pointer to at least four readable bytes
///
/// \return     The four bytes at buf interpreted in native byte order
static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}
34481ad8388SMartin Matuska 
34581ad8388SMartin Matuska 
/// \brief      Read a 64-bit native endian integer from an unaligned buffer
///
/// With TUKLIB_USE_UNSAFE_TYPE_PUNNING the value is loaded through a cast
/// pointer (this violates strict aliasing); otherwise it is copied out with
/// memcpy().
///
/// \param      buf     Pointer to at least eight readable bytes
///
/// \return     The eight bytes at buf interpreted in native byte order
static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}
35781ad8388SMartin Matuska 
35881ad8388SMartin Matuska 
/// \brief      Write a 16-bit native endian integer to an unaligned buffer
///
/// With TUKLIB_USE_UNSAFE_TYPE_PUNNING the value is stored through a cast
/// pointer (this violates strict aliasing); otherwise it is copied in with
/// memcpy().
///
/// \param      buf     Pointer to at least two writable bytes
/// \param      num     Value to store in native byte order
static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
}
36981ad8388SMartin Matuska 
37081ad8388SMartin Matuska 
/// \brief      Write a 32-bit native endian integer to an unaligned buffer
///
/// With TUKLIB_USE_UNSAFE_TYPE_PUNNING the value is stored through a cast
/// pointer (this violates strict aliasing); otherwise it is copied in with
/// memcpy().
///
/// \param      buf     Pointer to at least four writable bytes
/// \param      num     Value to store in native byte order
static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
}
38181ad8388SMartin Matuska 
38281ad8388SMartin Matuska 
/// \brief      Write a 64-bit native endian integer to an unaligned buffer
///
/// With TUKLIB_USE_UNSAFE_TYPE_PUNNING the value is stored through a cast
/// pointer (this violates strict aliasing); otherwise it is copied in with
/// memcpy().
///
/// \param      buf     Pointer to at least eight writable bytes
/// \param      num     Value to store in native byte order
static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
}
39381ad8388SMartin Matuska 
39481ad8388SMartin Matuska 
/// \brief      Read a 16-bit big endian integer from an unaligned buffer
///
/// Loads the value in native byte order with read16ne() and then converts
/// it with conv16be().
static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16be(num);
}
40181ad8388SMartin Matuska 
40281ad8388SMartin Matuska 
/// \brief      Read a 16-bit little endian integer from an unaligned buffer
///
/// Loads the value in native byte order with read16ne() and then converts
/// it with conv16le().
static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16le(num);
}
40981ad8388SMartin Matuska 
41081ad8388SMartin Matuska 
/// \brief      Read a 32-bit big endian integer from an unaligned buffer
///
/// Loads the value in native byte order with read32ne() and then converts
/// it with conv32be().
static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32be(num);
}
41781ad8388SMartin Matuska 
41881ad8388SMartin Matuska 
/// \brief      Read a 32-bit little endian integer from an unaligned buffer
///
/// Loads the value in native byte order with read32ne() and then converts
/// it with conv32le().
static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32le(num);
}
42581ad8388SMartin Matuska 
42681ad8388SMartin Matuska 
/// \brief      Read a 64-bit big endian integer from an unaligned buffer
///
/// Loads the value in native byte order with read64ne() and then converts
/// it with conv64be().
static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64be(num);
}
433*ca6a6373SXin LI 
434*ca6a6373SXin LI 
/// \brief      Read a 64-bit little endian integer from an unaligned buffer
///
/// Loads the value in native byte order with read64ne() and then converts
/// it with conv64le().
static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64le(num);
}
441*ca6a6373SXin LI 
442*ca6a6373SXin LI 
// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
//
// Each macro converts num to the requested byte order with convXXYe() and
// passes the result to the matching native endian writeXXne() function.
#define write16be(buf, num) write16ne(buf, conv16be(num))
#define write32be(buf, num) write32ne(buf, conv32be(num))
#define write64be(buf, num) write64ne(buf, conv64be(num))
#define write16le(buf, num) write16ne(buf, conv16le(num))
#define write32le(buf, num) write32ne(buf, conv32le(num))
#define write64le(buf, num) write64ne(buf, conv64le(num))
453*ca6a6373SXin LI 
45473ed8e77SXin LI #else
455*ca6a6373SXin LI 
// Without fast unaligned access the native endian helpers are plain
// aliases for the byte-by-byte functions of the matching byte order.
#ifdef WORDS_BIGENDIAN
#	define read16ne read16be
#	define read32ne read32be
#	define read64ne read64be
#	define write16ne write16be
#	define write32ne write32be
#	define write64ne write64be
#else
#	define read16ne read16le
#	define read32ne read32le
#	define read64ne read64le
#	define write16ne write16le
#	define write32ne write32le
#	define write64ne write64le
#endif
471*ca6a6373SXin LI 
472*ca6a6373SXin LI 
/// \brief      Read a 16-bit big endian integer byte by byte
///
/// Works with any alignment because only single bytes are loaded.
///
/// \param      buf     Pointer to at least two readable bytes
///
/// \return     buf[0] as the high byte and buf[1] as the low byte
static inline uint16_t
read16be(const uint8_t *buf)
{
	// The operands are promoted for the shift/or, so narrow the result
	// back to 16 bits explicitly.
	return (uint16_t)(((unsigned int)buf[0] << 8) | (unsigned int)buf[1]);
}
479*ca6a6373SXin LI 
480*ca6a6373SXin LI 
/// \brief      Read a 16-bit little endian integer byte by byte
///
/// Works with any alignment because only single bytes are loaded.
///
/// \param      buf     Pointer to at least two readable bytes
///
/// \return     buf[0] as the low byte and buf[1] as the high byte
static inline uint16_t
read16le(const uint8_t *buf)
{
	// The operands are promoted for the shift/or, so narrow the result
	// back to 16 bits explicitly.
	return (uint16_t)((unsigned int)buf[0] | ((unsigned int)buf[1] << 8));
}
487*ca6a6373SXin LI 
488*ca6a6373SXin LI 
/// \brief      Read a 32-bit big endian integer byte by byte
///
/// Works with any alignment because only single bytes are loaded.
///
/// \param      buf     Pointer to at least four readable bytes
///
/// \return     The four bytes combined with buf[0] as the most
///             significant byte
static inline uint32_t
read32be(const uint8_t *buf)
{
	// Each byte is widened to uint32_t before shifting so that the
	// shift into the top bits is done in an unsigned 32-bit type.
	return ((uint32_t)buf[0] << 24)
			| ((uint32_t)buf[1] << 16)
			| ((uint32_t)buf[2] << 8)
			| (uint32_t)buf[3];
}
498*ca6a6373SXin LI 
499*ca6a6373SXin LI 
/// \brief      Read a 32-bit little endian integer byte by byte
///
/// Works with any alignment because only single bytes are loaded.
///
/// \param      buf     Pointer to at least four readable bytes
///
/// \return     The four bytes combined with buf[0] as the least
///             significant byte
static inline uint32_t
read32le(const uint8_t *buf)
{
	// Each byte is widened to uint32_t before shifting so that the
	// shift into the top bits is done in an unsigned 32-bit type.
	return (uint32_t)buf[0]
			| ((uint32_t)buf[1] << 8)
			| ((uint32_t)buf[2] << 16)
			| ((uint32_t)buf[3] << 24);
}
509*ca6a6373SXin LI 
510*ca6a6373SXin LI 
/// \brief      Read a 64-bit big endian integer byte by byte
///
/// Works with any alignment because only single bytes are loaded.
///
/// \param      buf     Pointer to at least eight readable bytes
///
/// \return     The eight bytes combined with buf[0] as the most
///             significant byte
static inline uint64_t
read64be(const uint8_t *buf)
{
	return ((uint64_t)buf[0] << 56)
			| ((uint64_t)buf[1] << 48)
			| ((uint64_t)buf[2] << 40)
			| ((uint64_t)buf[3] << 32)
			| ((uint64_t)buf[4] << 24)
			| ((uint64_t)buf[5] << 16)
			| ((uint64_t)buf[6] << 8)
			| (uint64_t)buf[7];
}
52473ed8e77SXin LI 
52573ed8e77SXin LI 
/// \brief      Read a 64-bit little endian integer byte by byte
///
/// Works with any alignment because only single bytes are loaded.
///
/// \param      buf     Pointer to at least eight readable bytes
///
/// \return     The eight bytes combined with buf[0] as the least
///             significant byte
static inline uint64_t
read64le(const uint8_t *buf)
{
	return (uint64_t)buf[0]
			| ((uint64_t)buf[1] << 8)
			| ((uint64_t)buf[2] << 16)
			| ((uint64_t)buf[3] << 24)
			| ((uint64_t)buf[4] << 32)
			| ((uint64_t)buf[5] << 40)
			| ((uint64_t)buf[6] << 48)
			| ((uint64_t)buf[7] << 56);
}
53973ed8e77SXin LI 
54073ed8e77SXin LI 
/// \brief      Write a 16-bit integer in big endian order byte by byte
///
/// \param      buf     Pointer to at least two writable bytes
/// \param      num     Value to store; the high byte goes to buf[0]
static inline void
write16be(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)(num >> 8);
	buf[1] = (uint8_t)num;
}
54881ad8388SMartin Matuska 
54981ad8388SMartin Matuska 
/// \brief      Write a 16-bit integer in little endian order byte by byte
///
/// \param      buf     Pointer to at least two writable bytes
/// \param      num     Value to store; the low byte goes to buf[0]
static inline void
write16le(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
}
55781ad8388SMartin Matuska 
55881ad8388SMartin Matuska 
/// \brief      Write a 32-bit integer in big endian order byte by byte
///
/// \param      buf     Pointer to at least four writable bytes
/// \param      num     Value to store; the most significant byte goes
///                     to buf[0]
static inline void
write32be(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)(num >> 24);
	buf[1] = (uint8_t)(num >> 16);
	buf[2] = (uint8_t)(num >> 8);
	buf[3] = (uint8_t)num;
}
56881ad8388SMartin Matuska 
56981ad8388SMartin Matuska 
/// \brief      Write a 32-bit integer in little endian order byte by byte
///
/// \param      buf     Pointer to at least four writable bytes
/// \param      num     Value to store; the least significant byte goes
///                     to buf[0]
static inline void
write32le(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
}
579*ca6a6373SXin LI 
580*ca6a6373SXin LI 
/// \brief      Write a 64-bit integer in big endian order byte by byte
///
/// \param      buf     Pointer to at least eight writable bytes
/// \param      num     Value to store; the most significant byte goes
///                     to buf[0]
static inline void
write64be(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)(num >> 56);
	buf[1] = (uint8_t)(num >> 48);
	buf[2] = (uint8_t)(num >> 40);
	buf[3] = (uint8_t)(num >> 32);
	buf[4] = (uint8_t)(num >> 24);
	buf[5] = (uint8_t)(num >> 16);
	buf[6] = (uint8_t)(num >> 8);
	buf[7] = (uint8_t)num;
}
594*ca6a6373SXin LI 
595*ca6a6373SXin LI 
/// \brief      Write a 64-bit integer in little endian order byte by byte
///
/// \param      buf     Pointer to at least eight writable bytes
/// \param      num     Value to store; the least significant byte goes
///                     to buf[0]
static inline void
write64le(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	buf[4] = (uint8_t)(num >> 32);
	buf[5] = (uint8_t)(num >> 40);
	buf[6] = (uint8_t)(num >> 48);
	buf[7] = (uint8_t)(num >> 56);
}
609*ca6a6373SXin LI 
61081ad8388SMartin Matuska #endif
61181ad8388SMartin Matuska 
61281ad8388SMartin Matuska 
613a8675d92SXin LI //////////////////////////////
614a8675d92SXin LI // Aligned reads and writes //
615a8675d92SXin LI //////////////////////////////
616a8675d92SXin LI 
617a8675d92SXin LI // Separate functions for aligned reads and writes are provided since on
618a8675d92SXin LI // strict-align archs aligned access is much faster than unaligned access.
619a8675d92SXin LI //
620a8675d92SXin LI // Just like in the unaligned case, memcpy() is needed to avoid
621a8675d92SXin LI // strict aliasing violations. However, on archs that don't support
622a8675d92SXin LI // unaligned access the compiler cannot know that the pointers given
623a8675d92SXin LI // to memcpy() are aligned which results in slow code. As of C11 there is
624a8675d92SXin LI // no standard way to tell the compiler that we know that the address is
625a8675d92SXin LI // aligned but some compilers have language extensions to do that. With
626a8675d92SXin LI // such language extensions the memcpy() method gives excellent results.
627a8675d92SXin LI //
// What to do on a strict-align system when no known language extensions
629a8675d92SXin LI // are available? Falling back to byte-by-byte access would be safe but ruin
630a8675d92SXin LI // optimizations that have been made specifically with aligned access in mind.
631a8675d92SXin LI // As a compromise, aligned reads will fall back to non-compliant type punning
632a8675d92SXin LI // but aligned writes will be byte-by-byte, that is, fast reads are preferred
633a8675d92SXin LI // over fast writes. This obviously isn't great but hopefully it's a working
634a8675d92SXin LI // compromise for now.
635a8675d92SXin LI //
// __builtin_assume_aligned is supported by GCC >= 4.7 and clang >= 3.6.
// tuklib_memcpy_aligned(dest, src, size) copies size bytes like memcpy().
// When __builtin_assume_aligned is available, the compiler is additionally
// told that src is aligned to a multiple of size bytes.
//
// Without the builtin it is a plain memcpy(). In that case, if fast
// unaligned access isn't available either,
// TUKLIB_USE_UNSAFE_ALIGNED_READS is defined so that the aligned read
// functions use a cast pointer instead of memcpy().
#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#else
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#endif
647a8675d92SXin LI 
648a8675d92SXin LI 
/// \brief      Read a 16-bit native endian integer from an aligned buffer
///
/// If TUKLIB_USE_UNSAFE_TYPE_PUNNING or TUKLIB_USE_UNSAFE_ALIGNED_READS
/// is defined, the value is loaded through a cast pointer; otherwise it is
/// copied out with tuklib_memcpy_aligned().
///
/// \param      buf     Pointer to two readable bytes; the caller is
///                     expected to guarantee suitable alignment
static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}
661a8675d92SXin LI 
662a8675d92SXin LI 
/// \brief      Read a 32-bit native endian integer from an aligned buffer
///
/// If TUKLIB_USE_UNSAFE_TYPE_PUNNING or TUKLIB_USE_UNSAFE_ALIGNED_READS
/// is defined, the value is loaded through a cast pointer; otherwise it is
/// copied out with tuklib_memcpy_aligned().
///
/// \param      buf     Pointer to four readable bytes; the caller is
///                     expected to guarantee suitable alignment
static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}
675a8675d92SXin LI 
676a8675d92SXin LI 
/// \brief      Read a 64-bit native endian integer from an aligned buffer
///
/// If TUKLIB_USE_UNSAFE_TYPE_PUNNING or TUKLIB_USE_UNSAFE_ALIGNED_READS
/// is defined, the value is loaded through a cast pointer; otherwise it is
/// copied out with tuklib_memcpy_aligned().
///
/// \param      buf     Pointer to eight readable bytes; the caller is
///                     expected to guarantee suitable alignment
static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}
689a8675d92SXin LI 
690a8675d92SXin LI 
// Writes the 16-bit integer num to buf in native byte order. buf is
// expected to be aligned for uint16_t: the type-punning path stores
// through a uint16_t pointer.
//
// NOTE(review): tuklib_memcpy_aligned() is called with &num as its source
// argument, so any alignment hint that macro gives applies to the local
// variable, not to buf - confirm whether that is intended.
static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// Default path: copy the bytes of num into buf.
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#else
	// Opt-in path: store directly through a casted pointer.
	*(uint16_t *)buf = num;
#endif
}
701a8675d92SXin LI 
702a8675d92SXin LI 
// Writes the 32-bit integer num to buf in native byte order. buf is
// expected to be aligned for uint32_t: the type-punning path stores
// through a uint32_t pointer.
//
// NOTE(review): tuklib_memcpy_aligned() is called with &num as its source
// argument, so any alignment hint that macro gives applies to the local
// variable, not to buf - confirm whether that is intended.
static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// Default path: copy the bytes of num into buf.
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#else
	// Opt-in path: store directly through a casted pointer.
	*(uint32_t *)buf = num;
#endif
}
713a8675d92SXin LI 
714a8675d92SXin LI 
// Writes the 64-bit integer num to buf in native byte order. buf is
// expected to be aligned for uint64_t: the type-punning path stores
// through a uint64_t pointer.
//
// NOTE(review): tuklib_memcpy_aligned() is called with &num as its source
// argument, so any alignment hint that macro gives applies to the local
// variable, not to buf - confirm whether that is intended.
static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// Default path: copy the bytes of num into buf.
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#else
	// Opt-in path: store directly through a casted pointer.
	*(uint64_t *)buf = num;
#endif
}
725a8675d92SXin LI 
726a8675d92SXin LI 
// Reads a 16-bit big endian integer from an aligned buffer and returns
// it in native byte order.
static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	// conv16be() can be a macro that evaluates its argument more than
	// once, so the memory read is done into a variable first.
	const uint16_t raw = aligned_read16ne(buf);
	return conv16be(raw);
}
733a8675d92SXin LI 
734a8675d92SXin LI 
// Reads a 16-bit little endian integer from an aligned buffer and returns
// it in native byte order.
static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	// conv16le() can be a macro that evaluates its argument more than
	// once, so the memory read is done into a variable first.
	const uint16_t raw = aligned_read16ne(buf);
	return conv16le(raw);
}
741a8675d92SXin LI 
742a8675d92SXin LI 
// Reads a 32-bit big endian integer from an aligned buffer and returns
// it in native byte order.
static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	// conv32be() can be a macro that evaluates its argument more than
	// once, so the memory read is done into a variable first.
	const uint32_t raw = aligned_read32ne(buf);
	return conv32be(raw);
}
749a8675d92SXin LI 
750a8675d92SXin LI 
// Reads a 32-bit little endian integer from an aligned buffer and returns
// it in native byte order.
static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	// conv32le() can be a macro that evaluates its argument more than
	// once, so the memory read is done into a variable first.
	const uint32_t raw = aligned_read32ne(buf);
	return conv32le(raw);
}
757a8675d92SXin LI 
758a8675d92SXin LI 
// Reads a 64-bit big endian integer from an aligned buffer and returns
// it in native byte order.
static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	// conv64be() can be a macro that evaluates its argument more than
	// once, so the memory read is done into a variable first.
	const uint64_t raw = aligned_read64ne(buf);
	return conv64be(raw);
}
765a8675d92SXin LI 
766a8675d92SXin LI 
// Reads a 64-bit little endian integer from an aligned buffer and returns
// it in native byte order.
static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	// conv64le() can be a macro that evaluates its argument more than
	// once, so the memory read is done into a variable first.
	const uint64_t raw = aligned_read64ne(buf);
	return conv64le(raw);
}
773a8675d92SXin LI 
774a8675d92SXin LI 
// Aligned writes with byte order conversion. These need to be macros like
// in the unaligned case. Each one passes num through the matching
// convXXYe() and hands the result to the native endian aligned writer.
// Since convXXYe() can be a macro that evaluates its argument more than
// once, num should have no side effects.
#define aligned_write16be(buf, num) \
		aligned_write16ne((buf), conv16be(num))
#define aligned_write16le(buf, num) \
		aligned_write16ne((buf), conv16le(num))

#define aligned_write32be(buf, num) \
		aligned_write32ne((buf), conv32be(num))
#define aligned_write32le(buf, num) \
		aligned_write32ne((buf), conv32le(num))

#define aligned_write64be(buf, num) \
		aligned_write64ne((buf), conv64be(num))
#define aligned_write64le(buf, num) \
		aligned_write64ne((buf), conv64le(num))
782a8675d92SXin LI 
783a8675d92SXin LI 
784a8675d92SXin LI ////////////////////
785a8675d92SXin LI // Bit operations //
786a8675d92SXin LI ////////////////////
787a8675d92SXin LI 
78881ad8388SMartin Matuska static inline uint32_t
78981ad8388SMartin Matuska bsr32(uint32_t n)
79081ad8388SMartin Matuska {
79181ad8388SMartin Matuska 	// Check for ICC first, since it tends to define __GNUC__ too.
79281ad8388SMartin Matuska #if defined(__INTEL_COMPILER)
79381ad8388SMartin Matuska 	return _bit_scan_reverse(n);
79481ad8388SMartin Matuska 
795b333cd44SXin LI #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
79681ad8388SMartin Matuska 	// GCC >= 3.4 has __builtin_clz(), which gives good results on
79781ad8388SMartin Matuska 	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
79881ad8388SMartin Matuska 	// either plain BSR (so the XOR gets optimized away) or LZCNT and
79981ad8388SMartin Matuska 	// XOR (if -march indicates that SSE4a instructions are supported).
800a8675d92SXin LI 	return (uint32_t)__builtin_clz(n) ^ 31U;
80181ad8388SMartin Matuska 
80281ad8388SMartin Matuska #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
80381ad8388SMartin Matuska 	uint32_t i;
80481ad8388SMartin Matuska 	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
80581ad8388SMartin Matuska 	return i;
80681ad8388SMartin Matuska 
807a8675d92SXin LI #elif defined(_MSC_VER)
808a8675d92SXin LI 	unsigned long i;
809a8675d92SXin LI 	_BitScanReverse(&i, n);
81081ad8388SMartin Matuska 	return i;
81181ad8388SMartin Matuska 
81281ad8388SMartin Matuska #else
81381ad8388SMartin Matuska 	uint32_t i = 31;
81481ad8388SMartin Matuska 
815a8675d92SXin LI 	if ((n & 0xFFFF0000) == 0) {
81681ad8388SMartin Matuska 		n <<= 16;
81781ad8388SMartin Matuska 		i = 15;
81881ad8388SMartin Matuska 	}
81981ad8388SMartin Matuska 
820a8675d92SXin LI 	if ((n & 0xFF000000) == 0) {
82181ad8388SMartin Matuska 		n <<= 8;
82281ad8388SMartin Matuska 		i -= 8;
82381ad8388SMartin Matuska 	}
82481ad8388SMartin Matuska 
825a8675d92SXin LI 	if ((n & 0xF0000000) == 0) {
82681ad8388SMartin Matuska 		n <<= 4;
82781ad8388SMartin Matuska 		i -= 4;
82881ad8388SMartin Matuska 	}
82981ad8388SMartin Matuska 
830a8675d92SXin LI 	if ((n & 0xC0000000) == 0) {
83181ad8388SMartin Matuska 		n <<= 2;
83281ad8388SMartin Matuska 		i -= 2;
83381ad8388SMartin Matuska 	}
83481ad8388SMartin Matuska 
835a8675d92SXin LI 	if ((n & 0x80000000) == 0)
83681ad8388SMartin Matuska 		--i;
83781ad8388SMartin Matuska 
83881ad8388SMartin Matuska 	return i;
83981ad8388SMartin Matuska #endif
84081ad8388SMartin Matuska }
84181ad8388SMartin Matuska 
84281ad8388SMartin Matuska 
84381ad8388SMartin Matuska static inline uint32_t
84481ad8388SMartin Matuska clz32(uint32_t n)
84581ad8388SMartin Matuska {
84681ad8388SMartin Matuska #if defined(__INTEL_COMPILER)
84781ad8388SMartin Matuska 	return _bit_scan_reverse(n) ^ 31U;
84881ad8388SMartin Matuska 
849b333cd44SXin LI #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
850a8675d92SXin LI 	return (uint32_t)__builtin_clz(n);
85181ad8388SMartin Matuska 
85281ad8388SMartin Matuska #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
85381ad8388SMartin Matuska 	uint32_t i;
85481ad8388SMartin Matuska 	__asm__("bsrl %1, %0\n\t"
85581ad8388SMartin Matuska 		"xorl $31, %0"
85681ad8388SMartin Matuska 		: "=r" (i) : "rm" (n));
85781ad8388SMartin Matuska 	return i;
85881ad8388SMartin Matuska 
859a8675d92SXin LI #elif defined(_MSC_VER)
860a8675d92SXin LI 	unsigned long i;
861a8675d92SXin LI 	_BitScanReverse(&i, n);
86281ad8388SMartin Matuska 	return i ^ 31U;
86381ad8388SMartin Matuska 
86481ad8388SMartin Matuska #else
86581ad8388SMartin Matuska 	uint32_t i = 0;
86681ad8388SMartin Matuska 
867a8675d92SXin LI 	if ((n & 0xFFFF0000) == 0) {
86881ad8388SMartin Matuska 		n <<= 16;
86981ad8388SMartin Matuska 		i = 16;
87081ad8388SMartin Matuska 	}
87181ad8388SMartin Matuska 
872a8675d92SXin LI 	if ((n & 0xFF000000) == 0) {
87381ad8388SMartin Matuska 		n <<= 8;
87481ad8388SMartin Matuska 		i += 8;
87581ad8388SMartin Matuska 	}
87681ad8388SMartin Matuska 
877a8675d92SXin LI 	if ((n & 0xF0000000) == 0) {
87881ad8388SMartin Matuska 		n <<= 4;
87981ad8388SMartin Matuska 		i += 4;
88081ad8388SMartin Matuska 	}
88181ad8388SMartin Matuska 
882a8675d92SXin LI 	if ((n & 0xC0000000) == 0) {
88381ad8388SMartin Matuska 		n <<= 2;
88481ad8388SMartin Matuska 		i += 2;
88581ad8388SMartin Matuska 	}
88681ad8388SMartin Matuska 
887a8675d92SXin LI 	if ((n & 0x80000000) == 0)
88881ad8388SMartin Matuska 		++i;
88981ad8388SMartin Matuska 
89081ad8388SMartin Matuska 	return i;
89181ad8388SMartin Matuska #endif
89281ad8388SMartin Matuska }
89381ad8388SMartin Matuska 
89481ad8388SMartin Matuska 
89581ad8388SMartin Matuska static inline uint32_t
89681ad8388SMartin Matuska ctz32(uint32_t n)
89781ad8388SMartin Matuska {
89881ad8388SMartin Matuska #if defined(__INTEL_COMPILER)
89981ad8388SMartin Matuska 	return _bit_scan_forward(n);
90081ad8388SMartin Matuska 
901b333cd44SXin LI #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
902a8675d92SXin LI 	return (uint32_t)__builtin_ctz(n);
90381ad8388SMartin Matuska 
90481ad8388SMartin Matuska #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
90581ad8388SMartin Matuska 	uint32_t i;
90681ad8388SMartin Matuska 	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
90781ad8388SMartin Matuska 	return i;
90881ad8388SMartin Matuska 
909a8675d92SXin LI #elif defined(_MSC_VER)
910a8675d92SXin LI 	unsigned long i;
911a8675d92SXin LI 	_BitScanForward(&i, n);
91281ad8388SMartin Matuska 	return i;
91381ad8388SMartin Matuska 
91481ad8388SMartin Matuska #else
91581ad8388SMartin Matuska 	uint32_t i = 0;
91681ad8388SMartin Matuska 
917a8675d92SXin LI 	if ((n & 0x0000FFFF) == 0) {
91881ad8388SMartin Matuska 		n >>= 16;
91981ad8388SMartin Matuska 		i = 16;
92081ad8388SMartin Matuska 	}
92181ad8388SMartin Matuska 
922a8675d92SXin LI 	if ((n & 0x000000FF) == 0) {
92381ad8388SMartin Matuska 		n >>= 8;
92481ad8388SMartin Matuska 		i += 8;
92581ad8388SMartin Matuska 	}
92681ad8388SMartin Matuska 
927a8675d92SXin LI 	if ((n & 0x0000000F) == 0) {
92881ad8388SMartin Matuska 		n >>= 4;
92981ad8388SMartin Matuska 		i += 4;
93081ad8388SMartin Matuska 	}
93181ad8388SMartin Matuska 
932a8675d92SXin LI 	if ((n & 0x00000003) == 0) {
93381ad8388SMartin Matuska 		n >>= 2;
93481ad8388SMartin Matuska 		i += 2;
93581ad8388SMartin Matuska 	}
93681ad8388SMartin Matuska 
937a8675d92SXin LI 	if ((n & 0x00000001) == 0)
93881ad8388SMartin Matuska 		++i;
93981ad8388SMartin Matuska 
94081ad8388SMartin Matuska 	return i;
94181ad8388SMartin Matuska #endif
94281ad8388SMartin Matuska }
94381ad8388SMartin Matuska 
94481ad8388SMartin Matuska #define bsf32 ctz32
94581ad8388SMartin Matuska 
94681ad8388SMartin Matuska #endif
947