xref: /freebsd/contrib/xz/src/common/tuklib_integer.h (revision 3b35e7ee8de9b0260149a2b77e87a2b9c7a36244)
1*3b35e7eeSXin LI // SPDX-License-Identifier: 0BSD
2*3b35e7eeSXin LI 
381ad8388SMartin Matuska ///////////////////////////////////////////////////////////////////////////////
481ad8388SMartin Matuska //
581ad8388SMartin Matuska /// \file       tuklib_integer.h
681ad8388SMartin Matuska /// \brief      Various integer and bit operations
781ad8388SMartin Matuska ///
881ad8388SMartin Matuska /// This file provides macros or functions to do some basic integer and bit
981ad8388SMartin Matuska /// operations.
1081ad8388SMartin Matuska ///
11a8675d92SXin LI /// Native endian inline functions (XX = 16, 32, or 64):
12a8675d92SXin LI ///   - Unaligned native endian reads: readXXne(ptr)
13a8675d92SXin LI ///   - Unaligned native endian writes: writeXXne(ptr, num)
14a8675d92SXin LI ///   - Aligned native endian reads: aligned_readXXne(ptr)
15a8675d92SXin LI ///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
16a8675d92SXin LI ///
17a8675d92SXin LI /// Endianness-converting integer operations (these can be macros!)
18a8675d92SXin LI /// (XX = 16, 32, or 64; Y = b or l):
19*3b35e7eeSXin LI ///   - Byte swapping: byteswapXX(num)
20a8675d92SXin LI ///   - Byte order conversions to/from native (byteswaps if Y isn't
21a8675d92SXin LI ///     the native endianness): convXXYe(num)
2273ed8e77SXin LI ///   - Unaligned reads: readXXYe(ptr)
2373ed8e77SXin LI ///   - Unaligned writes: writeXXYe(ptr, num)
24a8675d92SXin LI ///   - Aligned reads: aligned_readXXYe(ptr)
25a8675d92SXin LI ///   - Aligned writes: aligned_writeXXYe(ptr, num)
2681ad8388SMartin Matuska ///
27a8675d92SXin LI /// Since the above can macros, the arguments should have no side effects
28a8675d92SXin LI /// because they may be evaluated more than once.
2981ad8388SMartin Matuska ///
30a8675d92SXin LI /// Bit scan operations for non-zero 32-bit integers (inline functions):
3181ad8388SMartin Matuska ///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
3281ad8388SMartin Matuska ///   - Count leading zeros: clz32(num)
3381ad8388SMartin Matuska ///   - Count trailing zeros: ctz32(num)
3481ad8388SMartin Matuska ///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
3581ad8388SMartin Matuska ///
3681ad8388SMartin Matuska /// The above bit scan operations return 0-31. If num is zero,
3781ad8388SMartin Matuska /// the result is undefined.
3881ad8388SMartin Matuska //
3981ad8388SMartin Matuska //  Authors:    Lasse Collin
4081ad8388SMartin Matuska //              Joachim Henke
4181ad8388SMartin Matuska //
4281ad8388SMartin Matuska ///////////////////////////////////////////////////////////////////////////////
4381ad8388SMartin Matuska 
4481ad8388SMartin Matuska #ifndef TUKLIB_INTEGER_H
4581ad8388SMartin Matuska #define TUKLIB_INTEGER_H
4681ad8388SMartin Matuska 
4781ad8388SMartin Matuska #include "tuklib_common.h"
48a8675d92SXin LI #include <string.h>
49a8675d92SXin LI 
50a8675d92SXin LI // Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
51a8675d92SXin LI // and such functions.
52a8675d92SXin LI #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
53a8675d92SXin LI #	include <immintrin.h>
54b333cd44SXin LI // Only include <intrin.h> when it is needed. GCC and Clang can both
55b333cd44SXin LI // use __builtin's, so we only need Windows instrincs when using MSVC.
56b333cd44SXin LI // GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
57b333cd44SXin LI // cases explicitly.
58b333cd44SXin LI #elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
59b333cd44SXin LI #	include <intrin.h>
60a8675d92SXin LI #endif
6181ad8388SMartin Matuska 
6281ad8388SMartin Matuska 
63a8675d92SXin LI ///////////////////
64a8675d92SXin LI // Byte swapping //
65a8675d92SXin LI ///////////////////
6681ad8388SMartin Matuska 
67a8675d92SXin LI #if defined(HAVE___BUILTIN_BSWAPXX)
68a8675d92SXin LI 	// GCC >= 4.8 and Clang
69*3b35e7eeSXin LI #	define byteswap16(num) __builtin_bswap16(num)
70*3b35e7eeSXin LI #	define byteswap32(num) __builtin_bswap32(num)
71*3b35e7eeSXin LI #	define byteswap64(num) __builtin_bswap64(num)
72a8675d92SXin LI 
73a8675d92SXin LI #elif defined(HAVE_BYTESWAP_H)
7481ad8388SMartin Matuska 	// glibc, uClibc, dietlibc
7581ad8388SMartin Matuska #	include <byteswap.h>
7681ad8388SMartin Matuska #	ifdef HAVE_BSWAP_16
77*3b35e7eeSXin LI #		define byteswap16(num) bswap_16(num)
7881ad8388SMartin Matuska #	endif
7981ad8388SMartin Matuska #	ifdef HAVE_BSWAP_32
80*3b35e7eeSXin LI #		define byteswap32(num) bswap_32(num)
8181ad8388SMartin Matuska #	endif
8281ad8388SMartin Matuska #	ifdef HAVE_BSWAP_64
83*3b35e7eeSXin LI #		define byteswap64(num) bswap_64(num)
8481ad8388SMartin Matuska #	endif
8581ad8388SMartin Matuska 
8681ad8388SMartin Matuska #elif defined(HAVE_SYS_ENDIAN_H)
8781ad8388SMartin Matuska 	// *BSDs and Darwin
8881ad8388SMartin Matuska #	include <sys/endian.h>
89*3b35e7eeSXin LI #	define byteswap16(num) bswap16(num)
90*3b35e7eeSXin LI #	define byteswap32(num) bswap32(num)
91*3b35e7eeSXin LI #	define byteswap64(num) bswap64(num)
9281ad8388SMartin Matuska 
9381ad8388SMartin Matuska #elif defined(HAVE_SYS_BYTEORDER_H)
9481ad8388SMartin Matuska 	// Solaris
9581ad8388SMartin Matuska #	include <sys/byteorder.h>
9681ad8388SMartin Matuska #	ifdef BSWAP_16
97*3b35e7eeSXin LI #		define byteswap16(num) BSWAP_16(num)
9881ad8388SMartin Matuska #	endif
9981ad8388SMartin Matuska #	ifdef BSWAP_32
100*3b35e7eeSXin LI #		define byteswap32(num) BSWAP_32(num)
10181ad8388SMartin Matuska #	endif
10281ad8388SMartin Matuska #	ifdef BSWAP_64
103*3b35e7eeSXin LI #		define byteswap64(num) BSWAP_64(num)
10481ad8388SMartin Matuska #	endif
10581ad8388SMartin Matuska #	ifdef BE_16
10681ad8388SMartin Matuska #		define conv16be(num) BE_16(num)
10781ad8388SMartin Matuska #	endif
10881ad8388SMartin Matuska #	ifdef BE_32
10981ad8388SMartin Matuska #		define conv32be(num) BE_32(num)
11081ad8388SMartin Matuska #	endif
11181ad8388SMartin Matuska #	ifdef BE_64
11281ad8388SMartin Matuska #		define conv64be(num) BE_64(num)
11381ad8388SMartin Matuska #	endif
11481ad8388SMartin Matuska #	ifdef LE_16
11581ad8388SMartin Matuska #		define conv16le(num) LE_16(num)
11681ad8388SMartin Matuska #	endif
11781ad8388SMartin Matuska #	ifdef LE_32
11881ad8388SMartin Matuska #		define conv32le(num) LE_32(num)
11981ad8388SMartin Matuska #	endif
12081ad8388SMartin Matuska #	ifdef LE_64
12181ad8388SMartin Matuska #		define conv64le(num) LE_64(num)
12281ad8388SMartin Matuska #	endif
12381ad8388SMartin Matuska #endif
12481ad8388SMartin Matuska 
125*3b35e7eeSXin LI #ifndef byteswap16
126*3b35e7eeSXin LI #	define byteswap16(n) (uint16_t)( \
127a8675d92SXin LI 		  (((n) & 0x00FFU) << 8) \
128a8675d92SXin LI 		| (((n) & 0xFF00U) >> 8) \
129a8675d92SXin LI 	)
13081ad8388SMartin Matuska #endif
13181ad8388SMartin Matuska 
132*3b35e7eeSXin LI #ifndef byteswap32
133*3b35e7eeSXin LI #	define byteswap32(n) (uint32_t)( \
134a8675d92SXin LI 		  (((n) & UINT32_C(0x000000FF)) << 24) \
135a8675d92SXin LI 		| (((n) & UINT32_C(0x0000FF00)) << 8) \
136a8675d92SXin LI 		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
137a8675d92SXin LI 		| (((n) & UINT32_C(0xFF000000)) >> 24) \
138a8675d92SXin LI 	)
13981ad8388SMartin Matuska #endif
14081ad8388SMartin Matuska 
141*3b35e7eeSXin LI #ifndef byteswap64
142*3b35e7eeSXin LI #	define byteswap64(n) (uint64_t)( \
143a8675d92SXin LI 		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
144a8675d92SXin LI 		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
145a8675d92SXin LI 		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
146a8675d92SXin LI 		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
147a8675d92SXin LI 		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
148a8675d92SXin LI 		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
149a8675d92SXin LI 		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
150a8675d92SXin LI 		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
151a8675d92SXin LI 	)
15281ad8388SMartin Matuska #endif
15381ad8388SMartin Matuska 
15481ad8388SMartin Matuska // Define conversion macros using the basic byte swapping macros.
15581ad8388SMartin Matuska #ifdef WORDS_BIGENDIAN
15681ad8388SMartin Matuska #	ifndef conv16be
15781ad8388SMartin Matuska #		define conv16be(num) ((uint16_t)(num))
15881ad8388SMartin Matuska #	endif
15981ad8388SMartin Matuska #	ifndef conv32be
16081ad8388SMartin Matuska #		define conv32be(num) ((uint32_t)(num))
16181ad8388SMartin Matuska #	endif
16281ad8388SMartin Matuska #	ifndef conv64be
16381ad8388SMartin Matuska #		define conv64be(num) ((uint64_t)(num))
16481ad8388SMartin Matuska #	endif
16581ad8388SMartin Matuska #	ifndef conv16le
166*3b35e7eeSXin LI #		define conv16le(num) byteswap16(num)
16781ad8388SMartin Matuska #	endif
16881ad8388SMartin Matuska #	ifndef conv32le
169*3b35e7eeSXin LI #		define conv32le(num) byteswap32(num)
17081ad8388SMartin Matuska #	endif
17181ad8388SMartin Matuska #	ifndef conv64le
172*3b35e7eeSXin LI #		define conv64le(num) byteswap64(num)
17381ad8388SMartin Matuska #	endif
17481ad8388SMartin Matuska #else
17581ad8388SMartin Matuska #	ifndef conv16be
176*3b35e7eeSXin LI #		define conv16be(num) byteswap16(num)
17781ad8388SMartin Matuska #	endif
17881ad8388SMartin Matuska #	ifndef conv32be
179*3b35e7eeSXin LI #		define conv32be(num) byteswap32(num)
18081ad8388SMartin Matuska #	endif
18181ad8388SMartin Matuska #	ifndef conv64be
182*3b35e7eeSXin LI #		define conv64be(num) byteswap64(num)
18381ad8388SMartin Matuska #	endif
18481ad8388SMartin Matuska #	ifndef conv16le
18581ad8388SMartin Matuska #		define conv16le(num) ((uint16_t)(num))
18681ad8388SMartin Matuska #	endif
18781ad8388SMartin Matuska #	ifndef conv32le
18881ad8388SMartin Matuska #		define conv32le(num) ((uint32_t)(num))
18981ad8388SMartin Matuska #	endif
19081ad8388SMartin Matuska #	ifndef conv64le
19181ad8388SMartin Matuska #		define conv64le(num) ((uint64_t)(num))
19281ad8388SMartin Matuska #	endif
19381ad8388SMartin Matuska #endif
19481ad8388SMartin Matuska 
19581ad8388SMartin Matuska 
196a8675d92SXin LI ////////////////////////////////
197a8675d92SXin LI // Unaligned reads and writes //
198a8675d92SXin LI ////////////////////////////////
199a8675d92SXin LI 
200ca6a6373SXin LI // No-strict-align archs like x86-64
201ca6a6373SXin LI // ---------------------------------
202ca6a6373SXin LI //
203a8675d92SXin LI // The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
204a8675d92SXin LI // is bad even if the uint8_pointer is properly aligned because this kind
205a8675d92SXin LI // of casts break strict aliasing rules and result in undefined behavior.
206a8675d92SXin LI // With unaligned pointers it's even worse: compilers may emit vector
207a8675d92SXin LI // instructions that require aligned pointers even if non-vector
208a8675d92SXin LI // instructions work with unaligned pointers.
209a8675d92SXin LI //
210a8675d92SXin LI // Using memcpy() is the standard compliant way to do unaligned access.
211a8675d92SXin LI // Many modern compilers inline it so there is no function call overhead.
212a8675d92SXin LI // For those compilers that don't handle the memcpy() method well, the
213a8675d92SXin LI // old casting method (that violates strict aliasing) can be requested at
214a8675d92SXin LI // build time. A third method, casting to a packed struct, would also be
215a8675d92SXin LI // an option but isn't provided to keep things simpler (it's already a mess).
216a8675d92SXin LI // Hopefully this is flexible enough in practice.
217ca6a6373SXin LI //
218ca6a6373SXin LI // Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
219ca6a6373SXin LI //
220ca6a6373SXin LI //     buf[0] | (buf[1] << 8)
221ca6a6373SXin LI //
222ca6a6373SXin LI // reads a 16-bit value and can emit a single 16-bit load and produce
223ca6a6373SXin LI // identical code than with the memcpy() method. In other cases Clang and GCC
224ca6a6373SXin LI // produce either the same or better code with memcpy(). For example, Clang 9
225ca6a6373SXin LI // on x86-64 can detect 32-bit load but not 16-bit load.
226ca6a6373SXin LI //
227ca6a6373SXin LI // MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
228ca6a6373SXin LI // code for "buf[0] | (buf[1] << 8)".
229ca6a6373SXin LI //
230ca6a6373SXin LI // Conclusion: The memcpy() method is the best choice when unaligned access
231ca6a6373SXin LI // is supported.
232ca6a6373SXin LI //
233ca6a6373SXin LI // Strict-align archs like SPARC
234ca6a6373SXin LI // -----------------------------
235ca6a6373SXin LI //
236ca6a6373SXin LI // GCC versions from around 4.x to to at least 13.2.0 produce worse code
237ca6a6373SXin LI // from the memcpy() method than from simple byte-by-byte shift-or code
238ca6a6373SXin LI // when reading a 32-bit integer:
239ca6a6373SXin LI //
240ca6a6373SXin LI //     (1) It may be constructed on stack using using four 8-bit loads,
241ca6a6373SXin LI //         four 8-bit stores to stack, and finally one 32-bit load from stack.
242ca6a6373SXin LI //
243ca6a6373SXin LI //     (2) Especially with -Os, an actual memcpy() call may be emitted.
244ca6a6373SXin LI //
245ca6a6373SXin LI // This is true on at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
246ca6a6373SXin LI // RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
247ca6a6373SXin LI // some processors but not all so this is relevant only in the case when
248ca6a6373SXin LI // GCC assumes that unaligned is not supported or -mstrict-align or
249ca6a6373SXin LI // -mno-unaligned-access is used.
250ca6a6373SXin LI //
251ca6a6373SXin LI // For Clang it makes little difference. ARM64 with -O2 -mstrict-align
252ca6a6373SXin LI // was one the very few with a minor difference: the memcpy() version
253ca6a6373SXin LI // was one instruction longer.
254ca6a6373SXin LI //
255ca6a6373SXin LI // Conclusion: At least in case of GCC and Clang, byte-by-byte code is
256*3b35e7eeSXin LI // the best choice for strict-align archs to do unaligned access.
257ca6a6373SXin LI //
258ca6a6373SXin LI // See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
259ca6a6373SXin LI //
260ca6a6373SXin LI // Thanks to <https://godbolt.org/> it was easy to test different compilers.
261ca6a6373SXin LI // The following is for little endian targets:
262ca6a6373SXin LI /*
263ca6a6373SXin LI #include <stdint.h>
264ca6a6373SXin LI #include <string.h>
265ca6a6373SXin LI 
266ca6a6373SXin LI uint32_t bytes16(const uint8_t *b)
267ca6a6373SXin LI {
268ca6a6373SXin LI     return (uint32_t)b[0]
269ca6a6373SXin LI         | ((uint32_t)b[1] << 8);
270ca6a6373SXin LI }
271ca6a6373SXin LI 
272ca6a6373SXin LI uint32_t copy16(const uint8_t *b)
273ca6a6373SXin LI {
274ca6a6373SXin LI     uint16_t v;
275ca6a6373SXin LI     memcpy(&v, b, sizeof(v));
276ca6a6373SXin LI     return v;
277ca6a6373SXin LI }
278ca6a6373SXin LI 
279ca6a6373SXin LI uint32_t bytes32(const uint8_t *b)
280ca6a6373SXin LI {
281ca6a6373SXin LI     return (uint32_t)b[0]
282ca6a6373SXin LI         | ((uint32_t)b[1] << 8)
283ca6a6373SXin LI         | ((uint32_t)b[2] << 16)
284ca6a6373SXin LI         | ((uint32_t)b[3] << 24);
285ca6a6373SXin LI }
286ca6a6373SXin LI 
287ca6a6373SXin LI uint32_t copy32(const uint8_t *b)
288ca6a6373SXin LI {
289ca6a6373SXin LI     uint32_t v;
290ca6a6373SXin LI     memcpy(&v, b, sizeof(v));
291ca6a6373SXin LI     return v;
292ca6a6373SXin LI }
293ca6a6373SXin LI 
294ca6a6373SXin LI void wbytes16(uint8_t *b, uint16_t v)
295ca6a6373SXin LI {
296ca6a6373SXin LI     b[0] = (uint8_t)v;
297ca6a6373SXin LI     b[1] = (uint8_t)(v >> 8);
298ca6a6373SXin LI }
299ca6a6373SXin LI 
300ca6a6373SXin LI void wcopy16(uint8_t *b, uint16_t v)
301ca6a6373SXin LI {
302ca6a6373SXin LI     memcpy(b, &v, sizeof(v));
303ca6a6373SXin LI }
304ca6a6373SXin LI 
305ca6a6373SXin LI void wbytes32(uint8_t *b, uint32_t v)
306ca6a6373SXin LI {
307ca6a6373SXin LI     b[0] = (uint8_t)v;
308ca6a6373SXin LI     b[1] = (uint8_t)(v >> 8);
309ca6a6373SXin LI     b[2] = (uint8_t)(v >> 16);
310ca6a6373SXin LI     b[3] = (uint8_t)(v >> 24);
311ca6a6373SXin LI }
312ca6a6373SXin LI 
313ca6a6373SXin LI void wcopy32(uint8_t *b, uint32_t v)
314ca6a6373SXin LI {
315ca6a6373SXin LI     memcpy(b, &v, sizeof(v));
316ca6a6373SXin LI }
317ca6a6373SXin LI */
318ca6a6373SXin LI 
319ca6a6373SXin LI 
320ca6a6373SXin LI #ifdef TUKLIB_FAST_UNALIGNED_ACCESS
32181ad8388SMartin Matuska 
32281ad8388SMartin Matuska static inline uint16_t
323a8675d92SXin LI read16ne(const uint8_t *buf)
32481ad8388SMartin Matuska {
325ca6a6373SXin LI #ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
326a8675d92SXin LI 	return *(const uint16_t *)buf;
327a8675d92SXin LI #else
328a8675d92SXin LI 	uint16_t num;
329a8675d92SXin LI 	memcpy(&num, buf, sizeof(num));
330a8675d92SXin LI 	return num;
331a8675d92SXin LI #endif
33281ad8388SMartin Matuska }
33381ad8388SMartin Matuska 
33481ad8388SMartin Matuska 
33581ad8388SMartin Matuska static inline uint32_t
336a8675d92SXin LI read32ne(const uint8_t *buf)
33781ad8388SMartin Matuska {
338ca6a6373SXin LI #ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
339a8675d92SXin LI 	return *(const uint32_t *)buf;
340a8675d92SXin LI #else
341a8675d92SXin LI 	uint32_t num;
342a8675d92SXin LI 	memcpy(&num, buf, sizeof(num));
343a8675d92SXin LI 	return num;
344a8675d92SXin LI #endif
34581ad8388SMartin Matuska }
34681ad8388SMartin Matuska 
34781ad8388SMartin Matuska 
34881ad8388SMartin Matuska static inline uint64_t
349a8675d92SXin LI read64ne(const uint8_t *buf)
35081ad8388SMartin Matuska {
351ca6a6373SXin LI #ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
352a8675d92SXin LI 	return *(const uint64_t *)buf;
353a8675d92SXin LI #else
354a8675d92SXin LI 	uint64_t num;
355a8675d92SXin LI 	memcpy(&num, buf, sizeof(num));
356a8675d92SXin LI 	return num;
357a8675d92SXin LI #endif
35881ad8388SMartin Matuska }
35981ad8388SMartin Matuska 
36081ad8388SMartin Matuska 
36181ad8388SMartin Matuska static inline void
36281ad8388SMartin Matuska write16ne(uint8_t *buf, uint16_t num)
36381ad8388SMartin Matuska {
364ca6a6373SXin LI #ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
36581ad8388SMartin Matuska 	*(uint16_t *)buf = num;
366a8675d92SXin LI #else
367a8675d92SXin LI 	memcpy(buf, &num, sizeof(num));
368a8675d92SXin LI #endif
36981ad8388SMartin Matuska 	return;
37081ad8388SMartin Matuska }
37181ad8388SMartin Matuska 
37281ad8388SMartin Matuska 
37381ad8388SMartin Matuska static inline void
37481ad8388SMartin Matuska write32ne(uint8_t *buf, uint32_t num)
37581ad8388SMartin Matuska {
376ca6a6373SXin LI #ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
37781ad8388SMartin Matuska 	*(uint32_t *)buf = num;
378a8675d92SXin LI #else
379a8675d92SXin LI 	memcpy(buf, &num, sizeof(num));
380a8675d92SXin LI #endif
38181ad8388SMartin Matuska 	return;
38281ad8388SMartin Matuska }
38381ad8388SMartin Matuska 
38481ad8388SMartin Matuska 
38581ad8388SMartin Matuska static inline void
38681ad8388SMartin Matuska write64ne(uint8_t *buf, uint64_t num)
38781ad8388SMartin Matuska {
388ca6a6373SXin LI #ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
38981ad8388SMartin Matuska 	*(uint64_t *)buf = num;
390a8675d92SXin LI #else
391a8675d92SXin LI 	memcpy(buf, &num, sizeof(num));
392a8675d92SXin LI #endif
39381ad8388SMartin Matuska 	return;
39481ad8388SMartin Matuska }
39581ad8388SMartin Matuska 
39681ad8388SMartin Matuska 
39781ad8388SMartin Matuska static inline uint16_t
398a8675d92SXin LI read16be(const uint8_t *buf)
39981ad8388SMartin Matuska {
400a8675d92SXin LI 	uint16_t num = read16ne(buf);
401a8675d92SXin LI 	return conv16be(num);
40281ad8388SMartin Matuska }
40381ad8388SMartin Matuska 
40481ad8388SMartin Matuska 
40581ad8388SMartin Matuska static inline uint16_t
406a8675d92SXin LI read16le(const uint8_t *buf)
40781ad8388SMartin Matuska {
408a8675d92SXin LI 	uint16_t num = read16ne(buf);
409a8675d92SXin LI 	return conv16le(num);
41081ad8388SMartin Matuska }
41181ad8388SMartin Matuska 
41281ad8388SMartin Matuska 
41381ad8388SMartin Matuska static inline uint32_t
414a8675d92SXin LI read32be(const uint8_t *buf)
41581ad8388SMartin Matuska {
416a8675d92SXin LI 	uint32_t num = read32ne(buf);
417a8675d92SXin LI 	return conv32be(num);
41881ad8388SMartin Matuska }
41981ad8388SMartin Matuska 
42081ad8388SMartin Matuska 
42181ad8388SMartin Matuska static inline uint32_t
422a8675d92SXin LI read32le(const uint8_t *buf)
42381ad8388SMartin Matuska {
424a8675d92SXin LI 	uint32_t num = read32ne(buf);
425a8675d92SXin LI 	return conv32le(num);
42681ad8388SMartin Matuska }
42781ad8388SMartin Matuska 
42881ad8388SMartin Matuska 
42973ed8e77SXin LI static inline uint64_t
43073ed8e77SXin LI read64be(const uint8_t *buf)
43173ed8e77SXin LI {
43273ed8e77SXin LI 	uint64_t num = read64ne(buf);
43373ed8e77SXin LI 	return conv64be(num);
434ca6a6373SXin LI }
435ca6a6373SXin LI 
436ca6a6373SXin LI 
437ca6a6373SXin LI static inline uint64_t
438ca6a6373SXin LI read64le(const uint8_t *buf)
439ca6a6373SXin LI {
440ca6a6373SXin LI 	uint64_t num = read64ne(buf);
441ca6a6373SXin LI 	return conv64le(num);
442ca6a6373SXin LI }
443ca6a6373SXin LI 
444ca6a6373SXin LI 
445ca6a6373SXin LI // NOTE: Possible byte swapping must be done in a macro to allow the compiler
446ca6a6373SXin LI // to optimize byte swapping of constants when using glibc's or *BSD's
447ca6a6373SXin LI // byte swapping macros. The actual write is done in an inline function
448ca6a6373SXin LI // to make type checking of the buf pointer possible.
449ca6a6373SXin LI #define write16be(buf, num) write16ne(buf, conv16be(num))
450ca6a6373SXin LI #define write32be(buf, num) write32ne(buf, conv32be(num))
451ca6a6373SXin LI #define write64be(buf, num) write64ne(buf, conv64be(num))
452ca6a6373SXin LI #define write16le(buf, num) write16ne(buf, conv16le(num))
453ca6a6373SXin LI #define write32le(buf, num) write32ne(buf, conv32le(num))
454ca6a6373SXin LI #define write64le(buf, num) write64ne(buf, conv64le(num))
455ca6a6373SXin LI 
45673ed8e77SXin LI #else
457ca6a6373SXin LI 
458ca6a6373SXin LI #ifdef WORDS_BIGENDIAN
459ca6a6373SXin LI #	define read16ne read16be
460ca6a6373SXin LI #	define read32ne read32be
461ca6a6373SXin LI #	define read64ne read64be
462ca6a6373SXin LI #	define write16ne write16be
463ca6a6373SXin LI #	define write32ne write32be
464ca6a6373SXin LI #	define write64ne write64be
465ca6a6373SXin LI #else
466ca6a6373SXin LI #	define read16ne read16le
467ca6a6373SXin LI #	define read32ne read32le
468ca6a6373SXin LI #	define read64ne read64le
469ca6a6373SXin LI #	define write16ne write16le
470ca6a6373SXin LI #	define write32ne write32le
471ca6a6373SXin LI #	define write64ne write64le
472ca6a6373SXin LI #endif
473ca6a6373SXin LI 
474ca6a6373SXin LI 
475ca6a6373SXin LI static inline uint16_t
476ca6a6373SXin LI read16be(const uint8_t *buf)
477ca6a6373SXin LI {
478ca6a6373SXin LI 	uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
479ca6a6373SXin LI 	return num;
480ca6a6373SXin LI }
481ca6a6373SXin LI 
482ca6a6373SXin LI 
483ca6a6373SXin LI static inline uint16_t
484ca6a6373SXin LI read16le(const uint8_t *buf)
485ca6a6373SXin LI {
486ca6a6373SXin LI 	uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
487ca6a6373SXin LI 	return num;
488ca6a6373SXin LI }
489ca6a6373SXin LI 
490ca6a6373SXin LI 
491ca6a6373SXin LI static inline uint32_t
492ca6a6373SXin LI read32be(const uint8_t *buf)
493ca6a6373SXin LI {
494ca6a6373SXin LI 	uint32_t num = (uint32_t)buf[0] << 24;
495ca6a6373SXin LI 	num |= (uint32_t)buf[1] << 16;
496ca6a6373SXin LI 	num |= (uint32_t)buf[2] << 8;
497ca6a6373SXin LI 	num |= (uint32_t)buf[3];
498ca6a6373SXin LI 	return num;
499ca6a6373SXin LI }
500ca6a6373SXin LI 
501ca6a6373SXin LI 
502ca6a6373SXin LI static inline uint32_t
503ca6a6373SXin LI read32le(const uint8_t *buf)
504ca6a6373SXin LI {
505ca6a6373SXin LI 	uint32_t num = (uint32_t)buf[0];
506ca6a6373SXin LI 	num |= (uint32_t)buf[1] << 8;
507ca6a6373SXin LI 	num |= (uint32_t)buf[2] << 16;
508ca6a6373SXin LI 	num |= (uint32_t)buf[3] << 24;
509ca6a6373SXin LI 	return num;
510ca6a6373SXin LI }
511ca6a6373SXin LI 
512ca6a6373SXin LI 
513ca6a6373SXin LI static inline uint64_t
514ca6a6373SXin LI read64be(const uint8_t *buf)
515ca6a6373SXin LI {
51673ed8e77SXin LI 	uint64_t num = (uint64_t)buf[0] << 56;
51773ed8e77SXin LI 	num |= (uint64_t)buf[1] << 48;
51873ed8e77SXin LI 	num |= (uint64_t)buf[2] << 40;
51973ed8e77SXin LI 	num |= (uint64_t)buf[3] << 32;
52073ed8e77SXin LI 	num |= (uint64_t)buf[4] << 24;
52173ed8e77SXin LI 	num |= (uint64_t)buf[5] << 16;
52273ed8e77SXin LI 	num |= (uint64_t)buf[6] << 8;
52373ed8e77SXin LI 	num |= (uint64_t)buf[7];
52473ed8e77SXin LI 	return num;
52573ed8e77SXin LI }
52673ed8e77SXin LI 
52773ed8e77SXin LI 
52873ed8e77SXin LI static inline uint64_t
52973ed8e77SXin LI read64le(const uint8_t *buf)
53073ed8e77SXin LI {
53173ed8e77SXin LI 	uint64_t num = (uint64_t)buf[0];
53273ed8e77SXin LI 	num |= (uint64_t)buf[1] << 8;
53373ed8e77SXin LI 	num |= (uint64_t)buf[2] << 16;
53473ed8e77SXin LI 	num |= (uint64_t)buf[3] << 24;
53573ed8e77SXin LI 	num |= (uint64_t)buf[4] << 32;
53673ed8e77SXin LI 	num |= (uint64_t)buf[5] << 40;
53773ed8e77SXin LI 	num |= (uint64_t)buf[6] << 48;
53873ed8e77SXin LI 	num |= (uint64_t)buf[7] << 56;
53973ed8e77SXin LI 	return num;
54073ed8e77SXin LI }
54173ed8e77SXin LI 
54273ed8e77SXin LI 
54381ad8388SMartin Matuska static inline void
544a8675d92SXin LI write16be(uint8_t *buf, uint16_t num)
54581ad8388SMartin Matuska {
546342bcb12SXin LI 	buf[0] = (uint8_t)(num >> 8);
547342bcb12SXin LI 	buf[1] = (uint8_t)num;
54881ad8388SMartin Matuska 	return;
54981ad8388SMartin Matuska }
55081ad8388SMartin Matuska 
55181ad8388SMartin Matuska 
55281ad8388SMartin Matuska static inline void
553a8675d92SXin LI write16le(uint8_t *buf, uint16_t num)
55481ad8388SMartin Matuska {
555342bcb12SXin LI 	buf[0] = (uint8_t)num;
556342bcb12SXin LI 	buf[1] = (uint8_t)(num >> 8);
55781ad8388SMartin Matuska 	return;
55881ad8388SMartin Matuska }
55981ad8388SMartin Matuska 
56081ad8388SMartin Matuska 
56181ad8388SMartin Matuska static inline void
562a8675d92SXin LI write32be(uint8_t *buf, uint32_t num)
56381ad8388SMartin Matuska {
564342bcb12SXin LI 	buf[0] = (uint8_t)(num >> 24);
565342bcb12SXin LI 	buf[1] = (uint8_t)(num >> 16);
566342bcb12SXin LI 	buf[2] = (uint8_t)(num >> 8);
567342bcb12SXin LI 	buf[3] = (uint8_t)num;
56881ad8388SMartin Matuska 	return;
56981ad8388SMartin Matuska }
57081ad8388SMartin Matuska 
57181ad8388SMartin Matuska 
57281ad8388SMartin Matuska static inline void
573a8675d92SXin LI write32le(uint8_t *buf, uint32_t num)
57481ad8388SMartin Matuska {
575342bcb12SXin LI 	buf[0] = (uint8_t)num;
576342bcb12SXin LI 	buf[1] = (uint8_t)(num >> 8);
577342bcb12SXin LI 	buf[2] = (uint8_t)(num >> 16);
578342bcb12SXin LI 	buf[3] = (uint8_t)(num >> 24);
57981ad8388SMartin Matuska 	return;
58081ad8388SMartin Matuska }
581ca6a6373SXin LI 
582ca6a6373SXin LI 
583ca6a6373SXin LI static inline void
584ca6a6373SXin LI write64be(uint8_t *buf, uint64_t num)
585ca6a6373SXin LI {
586ca6a6373SXin LI 	buf[0] = (uint8_t)(num >> 56);
587ca6a6373SXin LI 	buf[1] = (uint8_t)(num >> 48);
588ca6a6373SXin LI 	buf[2] = (uint8_t)(num >> 40);
589ca6a6373SXin LI 	buf[3] = (uint8_t)(num >> 32);
590ca6a6373SXin LI 	buf[4] = (uint8_t)(num >> 24);
591ca6a6373SXin LI 	buf[5] = (uint8_t)(num >> 16);
592ca6a6373SXin LI 	buf[6] = (uint8_t)(num >> 8);
593ca6a6373SXin LI 	buf[7] = (uint8_t)num;
594ca6a6373SXin LI 	return;
595ca6a6373SXin LI }
596ca6a6373SXin LI 
597ca6a6373SXin LI 
598ca6a6373SXin LI static inline void
599ca6a6373SXin LI write64le(uint8_t *buf, uint64_t num)
600ca6a6373SXin LI {
601ca6a6373SXin LI 	buf[0] = (uint8_t)num;
602ca6a6373SXin LI 	buf[1] = (uint8_t)(num >> 8);
603ca6a6373SXin LI 	buf[2] = (uint8_t)(num >> 16);
604ca6a6373SXin LI 	buf[3] = (uint8_t)(num >> 24);
605ca6a6373SXin LI 	buf[4] = (uint8_t)(num >> 32);
606ca6a6373SXin LI 	buf[5] = (uint8_t)(num >> 40);
607ca6a6373SXin LI 	buf[6] = (uint8_t)(num >> 48);
608ca6a6373SXin LI 	buf[7] = (uint8_t)(num >> 56);
609ca6a6373SXin LI 	return;
610ca6a6373SXin LI }
611ca6a6373SXin LI 
61281ad8388SMartin Matuska #endif
61381ad8388SMartin Matuska 
61481ad8388SMartin Matuska 
615a8675d92SXin LI //////////////////////////////
616a8675d92SXin LI // Aligned reads and writes //
617a8675d92SXin LI //////////////////////////////
618a8675d92SXin LI 
619a8675d92SXin LI // Separate functions for aligned reads and writes are provided since on
620a8675d92SXin LI // strict-align archs aligned access is much faster than unaligned access.
621a8675d92SXin LI //
622a8675d92SXin LI // Just like in the unaligned case, memcpy() is needed to avoid
623a8675d92SXin LI // strict aliasing violations. However, on archs that don't support
624a8675d92SXin LI // unaligned access the compiler cannot know that the pointers given
625a8675d92SXin LI // to memcpy() are aligned which results in slow code. As of C11 there is
626a8675d92SXin LI // no standard way to tell the compiler that we know that the address is
627a8675d92SXin LI // aligned but some compilers have language extensions to do that. With
628a8675d92SXin LI // such language extensions the memcpy() method gives excellent results.
629a8675d92SXin LI //
630*3b35e7eeSXin LI // What to do on a strict-align system when no known language extensions
631a8675d92SXin LI // are available? Falling back to byte-by-byte access would be safe but ruin
632a8675d92SXin LI // optimizations that have been made specifically with aligned access in mind.
633a8675d92SXin LI // As a compromise, aligned reads will fall back to non-compliant type punning
634a8675d92SXin LI // but aligned writes will be byte-by-byte, that is, fast reads are preferred
635a8675d92SXin LI // over fast writes. This obviously isn't great but hopefully it's a working
636a8675d92SXin LI // compromise for now.
637a8675d92SXin LI //
638a8675d92SXin LI // __builtin_assume_aligned is support by GCC >= 4.7 and clang >= 3.6.
639a8675d92SXin LI #ifdef HAVE___BUILTIN_ASSUME_ALIGNED
640a8675d92SXin LI #	define tuklib_memcpy_aligned(dest, src, size) \
641a8675d92SXin LI 		memcpy(dest, __builtin_assume_aligned(src, size), size)
642a8675d92SXin LI #else
643a8675d92SXin LI #	define tuklib_memcpy_aligned(dest, src, size) \
644a8675d92SXin LI 		memcpy(dest, src, size)
645a8675d92SXin LI #	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
646a8675d92SXin LI #		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
647a8675d92SXin LI #	endif
648a8675d92SXin LI #endif
649a8675d92SXin LI 
650a8675d92SXin LI 
651a8675d92SXin LI static inline uint16_t
652a8675d92SXin LI aligned_read16ne(const uint8_t *buf)
653a8675d92SXin LI {
654a8675d92SXin LI #if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
655a8675d92SXin LI 		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
656a8675d92SXin LI 	return *(const uint16_t *)buf;
657a8675d92SXin LI #else
658a8675d92SXin LI 	uint16_t num;
659a8675d92SXin LI 	tuklib_memcpy_aligned(&num, buf, sizeof(num));
660a8675d92SXin LI 	return num;
661a8675d92SXin LI #endif
662a8675d92SXin LI }
663a8675d92SXin LI 
664a8675d92SXin LI 
665a8675d92SXin LI static inline uint32_t
666a8675d92SXin LI aligned_read32ne(const uint8_t *buf)
667a8675d92SXin LI {
668a8675d92SXin LI #if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
669a8675d92SXin LI 		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
670a8675d92SXin LI 	return *(const uint32_t *)buf;
671a8675d92SXin LI #else
672a8675d92SXin LI 	uint32_t num;
673a8675d92SXin LI 	tuklib_memcpy_aligned(&num, buf, sizeof(num));
674a8675d92SXin LI 	return num;
675a8675d92SXin LI #endif
676a8675d92SXin LI }
677a8675d92SXin LI 
678a8675d92SXin LI 
679a8675d92SXin LI static inline uint64_t
680a8675d92SXin LI aligned_read64ne(const uint8_t *buf)
681a8675d92SXin LI {
682a8675d92SXin LI #if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
683a8675d92SXin LI 		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
684a8675d92SXin LI 	return *(const uint64_t *)buf;
685a8675d92SXin LI #else
686a8675d92SXin LI 	uint64_t num;
687a8675d92SXin LI 	tuklib_memcpy_aligned(&num, buf, sizeof(num));
688a8675d92SXin LI 	return num;
689a8675d92SXin LI #endif
690a8675d92SXin LI }
691a8675d92SXin LI 
692a8675d92SXin LI 
693a8675d92SXin LI static inline void
694a8675d92SXin LI aligned_write16ne(uint8_t *buf, uint16_t num)
695a8675d92SXin LI {
696a8675d92SXin LI #ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
697a8675d92SXin LI 	*(uint16_t *)buf = num;
698a8675d92SXin LI #else
699a8675d92SXin LI 	tuklib_memcpy_aligned(buf, &num, sizeof(num));
700a8675d92SXin LI #endif
701a8675d92SXin LI 	return;
702a8675d92SXin LI }
703a8675d92SXin LI 
704a8675d92SXin LI 
705a8675d92SXin LI static inline void
706a8675d92SXin LI aligned_write32ne(uint8_t *buf, uint32_t num)
707a8675d92SXin LI {
708a8675d92SXin LI #ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
709a8675d92SXin LI 	*(uint32_t *)buf = num;
710a8675d92SXin LI #else
711a8675d92SXin LI 	tuklib_memcpy_aligned(buf, &num, sizeof(num));
712a8675d92SXin LI #endif
713a8675d92SXin LI 	return;
714a8675d92SXin LI }
715a8675d92SXin LI 
716a8675d92SXin LI 
717a8675d92SXin LI static inline void
718a8675d92SXin LI aligned_write64ne(uint8_t *buf, uint64_t num)
719a8675d92SXin LI {
720a8675d92SXin LI #ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
721a8675d92SXin LI 	*(uint64_t *)buf = num;
722a8675d92SXin LI #else
723a8675d92SXin LI 	tuklib_memcpy_aligned(buf, &num, sizeof(num));
724a8675d92SXin LI #endif
725a8675d92SXin LI 	return;
726a8675d92SXin LI }
727a8675d92SXin LI 
728a8675d92SXin LI 
729a8675d92SXin LI static inline uint16_t
730a8675d92SXin LI aligned_read16be(const uint8_t *buf)
731a8675d92SXin LI {
732a8675d92SXin LI 	uint16_t num = aligned_read16ne(buf);
733a8675d92SXin LI 	return conv16be(num);
734a8675d92SXin LI }
735a8675d92SXin LI 
736a8675d92SXin LI 
737a8675d92SXin LI static inline uint16_t
738a8675d92SXin LI aligned_read16le(const uint8_t *buf)
739a8675d92SXin LI {
740a8675d92SXin LI 	uint16_t num = aligned_read16ne(buf);
741a8675d92SXin LI 	return conv16le(num);
742a8675d92SXin LI }
743a8675d92SXin LI 
744a8675d92SXin LI 
745a8675d92SXin LI static inline uint32_t
746a8675d92SXin LI aligned_read32be(const uint8_t *buf)
747a8675d92SXin LI {
748a8675d92SXin LI 	uint32_t num = aligned_read32ne(buf);
749a8675d92SXin LI 	return conv32be(num);
750a8675d92SXin LI }
751a8675d92SXin LI 
752a8675d92SXin LI 
753a8675d92SXin LI static inline uint32_t
754a8675d92SXin LI aligned_read32le(const uint8_t *buf)
755a8675d92SXin LI {
756a8675d92SXin LI 	uint32_t num = aligned_read32ne(buf);
757a8675d92SXin LI 	return conv32le(num);
758a8675d92SXin LI }
759a8675d92SXin LI 
760a8675d92SXin LI 
761a8675d92SXin LI static inline uint64_t
762a8675d92SXin LI aligned_read64be(const uint8_t *buf)
763a8675d92SXin LI {
764a8675d92SXin LI 	uint64_t num = aligned_read64ne(buf);
765a8675d92SXin LI 	return conv64be(num);
766a8675d92SXin LI }
767a8675d92SXin LI 
768a8675d92SXin LI 
769a8675d92SXin LI static inline uint64_t
770a8675d92SXin LI aligned_read64le(const uint8_t *buf)
771a8675d92SXin LI {
772a8675d92SXin LI 	uint64_t num = aligned_read64ne(buf);
773a8675d92SXin LI 	return conv64le(num);
774a8675d92SXin LI }
775a8675d92SXin LI 
776a8675d92SXin LI 
777a8675d92SXin LI // These need to be macros like in the unaligned case.
778a8675d92SXin LI #define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
779a8675d92SXin LI #define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
780a8675d92SXin LI #define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
781a8675d92SXin LI #define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
782a8675d92SXin LI #define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
783a8675d92SXin LI #define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))
784a8675d92SXin LI 
785a8675d92SXin LI 
786a8675d92SXin LI ////////////////////
787a8675d92SXin LI // Bit operations //
788a8675d92SXin LI ////////////////////
789a8675d92SXin LI 
79081ad8388SMartin Matuska static inline uint32_t
79181ad8388SMartin Matuska bsr32(uint32_t n)
79281ad8388SMartin Matuska {
79381ad8388SMartin Matuska 	// Check for ICC first, since it tends to define __GNUC__ too.
79481ad8388SMartin Matuska #if defined(__INTEL_COMPILER)
79581ad8388SMartin Matuska 	return _bit_scan_reverse(n);
79681ad8388SMartin Matuska 
797b333cd44SXin LI #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
79881ad8388SMartin Matuska 	// GCC >= 3.4 has __builtin_clz(), which gives good results on
79981ad8388SMartin Matuska 	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
80081ad8388SMartin Matuska 	// either plain BSR (so the XOR gets optimized away) or LZCNT and
80181ad8388SMartin Matuska 	// XOR (if -march indicates that SSE4a instructions are supported).
802a8675d92SXin LI 	return (uint32_t)__builtin_clz(n) ^ 31U;
80381ad8388SMartin Matuska 
80481ad8388SMartin Matuska #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
80581ad8388SMartin Matuska 	uint32_t i;
80681ad8388SMartin Matuska 	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
80781ad8388SMartin Matuska 	return i;
80881ad8388SMartin Matuska 
809a8675d92SXin LI #elif defined(_MSC_VER)
810a8675d92SXin LI 	unsigned long i;
811a8675d92SXin LI 	_BitScanReverse(&i, n);
81281ad8388SMartin Matuska 	return i;
81381ad8388SMartin Matuska 
81481ad8388SMartin Matuska #else
81581ad8388SMartin Matuska 	uint32_t i = 31;
81681ad8388SMartin Matuska 
817a8675d92SXin LI 	if ((n & 0xFFFF0000) == 0) {
81881ad8388SMartin Matuska 		n <<= 16;
81981ad8388SMartin Matuska 		i = 15;
82081ad8388SMartin Matuska 	}
82181ad8388SMartin Matuska 
822a8675d92SXin LI 	if ((n & 0xFF000000) == 0) {
82381ad8388SMartin Matuska 		n <<= 8;
82481ad8388SMartin Matuska 		i -= 8;
82581ad8388SMartin Matuska 	}
82681ad8388SMartin Matuska 
827a8675d92SXin LI 	if ((n & 0xF0000000) == 0) {
82881ad8388SMartin Matuska 		n <<= 4;
82981ad8388SMartin Matuska 		i -= 4;
83081ad8388SMartin Matuska 	}
83181ad8388SMartin Matuska 
832a8675d92SXin LI 	if ((n & 0xC0000000) == 0) {
83381ad8388SMartin Matuska 		n <<= 2;
83481ad8388SMartin Matuska 		i -= 2;
83581ad8388SMartin Matuska 	}
83681ad8388SMartin Matuska 
837a8675d92SXin LI 	if ((n & 0x80000000) == 0)
83881ad8388SMartin Matuska 		--i;
83981ad8388SMartin Matuska 
84081ad8388SMartin Matuska 	return i;
84181ad8388SMartin Matuska #endif
84281ad8388SMartin Matuska }
84381ad8388SMartin Matuska 
84481ad8388SMartin Matuska 
84581ad8388SMartin Matuska static inline uint32_t
84681ad8388SMartin Matuska clz32(uint32_t n)
84781ad8388SMartin Matuska {
84881ad8388SMartin Matuska #if defined(__INTEL_COMPILER)
84981ad8388SMartin Matuska 	return _bit_scan_reverse(n) ^ 31U;
85081ad8388SMartin Matuska 
851b333cd44SXin LI #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
852a8675d92SXin LI 	return (uint32_t)__builtin_clz(n);
85381ad8388SMartin Matuska 
85481ad8388SMartin Matuska #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
85581ad8388SMartin Matuska 	uint32_t i;
85681ad8388SMartin Matuska 	__asm__("bsrl %1, %0\n\t"
85781ad8388SMartin Matuska 		"xorl $31, %0"
85881ad8388SMartin Matuska 		: "=r" (i) : "rm" (n));
85981ad8388SMartin Matuska 	return i;
86081ad8388SMartin Matuska 
861a8675d92SXin LI #elif defined(_MSC_VER)
862a8675d92SXin LI 	unsigned long i;
863a8675d92SXin LI 	_BitScanReverse(&i, n);
86481ad8388SMartin Matuska 	return i ^ 31U;
86581ad8388SMartin Matuska 
86681ad8388SMartin Matuska #else
86781ad8388SMartin Matuska 	uint32_t i = 0;
86881ad8388SMartin Matuska 
869a8675d92SXin LI 	if ((n & 0xFFFF0000) == 0) {
87081ad8388SMartin Matuska 		n <<= 16;
87181ad8388SMartin Matuska 		i = 16;
87281ad8388SMartin Matuska 	}
87381ad8388SMartin Matuska 
874a8675d92SXin LI 	if ((n & 0xFF000000) == 0) {
87581ad8388SMartin Matuska 		n <<= 8;
87681ad8388SMartin Matuska 		i += 8;
87781ad8388SMartin Matuska 	}
87881ad8388SMartin Matuska 
879a8675d92SXin LI 	if ((n & 0xF0000000) == 0) {
88081ad8388SMartin Matuska 		n <<= 4;
88181ad8388SMartin Matuska 		i += 4;
88281ad8388SMartin Matuska 	}
88381ad8388SMartin Matuska 
884a8675d92SXin LI 	if ((n & 0xC0000000) == 0) {
88581ad8388SMartin Matuska 		n <<= 2;
88681ad8388SMartin Matuska 		i += 2;
88781ad8388SMartin Matuska 	}
88881ad8388SMartin Matuska 
889a8675d92SXin LI 	if ((n & 0x80000000) == 0)
89081ad8388SMartin Matuska 		++i;
89181ad8388SMartin Matuska 
89281ad8388SMartin Matuska 	return i;
89381ad8388SMartin Matuska #endif
89481ad8388SMartin Matuska }
89581ad8388SMartin Matuska 
89681ad8388SMartin Matuska 
89781ad8388SMartin Matuska static inline uint32_t
89881ad8388SMartin Matuska ctz32(uint32_t n)
89981ad8388SMartin Matuska {
90081ad8388SMartin Matuska #if defined(__INTEL_COMPILER)
90181ad8388SMartin Matuska 	return _bit_scan_forward(n);
90281ad8388SMartin Matuska 
903b333cd44SXin LI #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
904a8675d92SXin LI 	return (uint32_t)__builtin_ctz(n);
90581ad8388SMartin Matuska 
90681ad8388SMartin Matuska #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
90781ad8388SMartin Matuska 	uint32_t i;
90881ad8388SMartin Matuska 	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
90981ad8388SMartin Matuska 	return i;
91081ad8388SMartin Matuska 
911a8675d92SXin LI #elif defined(_MSC_VER)
912a8675d92SXin LI 	unsigned long i;
913a8675d92SXin LI 	_BitScanForward(&i, n);
91481ad8388SMartin Matuska 	return i;
91581ad8388SMartin Matuska 
91681ad8388SMartin Matuska #else
91781ad8388SMartin Matuska 	uint32_t i = 0;
91881ad8388SMartin Matuska 
919a8675d92SXin LI 	if ((n & 0x0000FFFF) == 0) {
92081ad8388SMartin Matuska 		n >>= 16;
92181ad8388SMartin Matuska 		i = 16;
92281ad8388SMartin Matuska 	}
92381ad8388SMartin Matuska 
924a8675d92SXin LI 	if ((n & 0x000000FF) == 0) {
92581ad8388SMartin Matuska 		n >>= 8;
92681ad8388SMartin Matuska 		i += 8;
92781ad8388SMartin Matuska 	}
92881ad8388SMartin Matuska 
929a8675d92SXin LI 	if ((n & 0x0000000F) == 0) {
93081ad8388SMartin Matuska 		n >>= 4;
93181ad8388SMartin Matuska 		i += 4;
93281ad8388SMartin Matuska 	}
93381ad8388SMartin Matuska 
934a8675d92SXin LI 	if ((n & 0x00000003) == 0) {
93581ad8388SMartin Matuska 		n >>= 2;
93681ad8388SMartin Matuska 		i += 2;
93781ad8388SMartin Matuska 	}
93881ad8388SMartin Matuska 
939a8675d92SXin LI 	if ((n & 0x00000001) == 0)
94081ad8388SMartin Matuska 		++i;
94181ad8388SMartin Matuska 
94281ad8388SMartin Matuska 	return i;
94381ad8388SMartin Matuska #endif
94481ad8388SMartin Matuska }
94581ad8388SMartin Matuska 
94681ad8388SMartin Matuska #define bsf32 ctz32
94781ad8388SMartin Matuska 
94881ad8388SMartin Matuska #endif
949