xref: /freebsd/contrib/xz/src/common/tuklib_integer.h (revision 26743408e9ff53ac0e041407c359ed3c17c15596)
13b35e7eeSXin LI // SPDX-License-Identifier: 0BSD
23b35e7eeSXin LI 
381ad8388SMartin Matuska ///////////////////////////////////////////////////////////////////////////////
481ad8388SMartin Matuska //
581ad8388SMartin Matuska /// \file       tuklib_integer.h
681ad8388SMartin Matuska /// \brief      Various integer and bit operations
781ad8388SMartin Matuska ///
881ad8388SMartin Matuska /// This file provides macros or functions to do some basic integer and bit
981ad8388SMartin Matuska /// operations.
1081ad8388SMartin Matuska ///
11a8675d92SXin LI /// Native endian inline functions (XX = 16, 32, or 64):
12a8675d92SXin LI ///   - Unaligned native endian reads: readXXne(ptr)
13a8675d92SXin LI ///   - Unaligned native endian writes: writeXXne(ptr, num)
14a8675d92SXin LI ///   - Aligned native endian reads: aligned_readXXne(ptr)
15a8675d92SXin LI ///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
16a8675d92SXin LI ///
17a8675d92SXin LI /// Endianness-converting integer operations (these can be macros!)
18a8675d92SXin LI /// (XX = 16, 32, or 64; Y = b or l):
193b35e7eeSXin LI ///   - Byte swapping: byteswapXX(num)
20a8675d92SXin LI ///   - Byte order conversions to/from native (byteswaps if Y isn't
21a8675d92SXin LI ///     the native endianness): convXXYe(num)
2273ed8e77SXin LI ///   - Unaligned reads: readXXYe(ptr)
2373ed8e77SXin LI ///   - Unaligned writes: writeXXYe(ptr, num)
24a8675d92SXin LI ///   - Aligned reads: aligned_readXXYe(ptr)
25a8675d92SXin LI ///   - Aligned writes: aligned_writeXXYe(ptr, num)
2681ad8388SMartin Matuska ///
27a8675d92SXin LI /// Since the above can be macros, the arguments should have no side effects
28a8675d92SXin LI /// because they may be evaluated more than once.
2981ad8388SMartin Matuska ///
30a8675d92SXin LI /// Bit scan operations for non-zero 32-bit integers (inline functions):
3181ad8388SMartin Matuska ///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
3281ad8388SMartin Matuska ///   - Count leading zeros: clz32(num)
3381ad8388SMartin Matuska ///   - Count trailing zeros: ctz32(num)
3481ad8388SMartin Matuska ///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
3581ad8388SMartin Matuska ///
3681ad8388SMartin Matuska /// The above bit scan operations return 0-31. If num is zero,
3781ad8388SMartin Matuska /// the result is undefined.
3881ad8388SMartin Matuska //
3981ad8388SMartin Matuska //  Authors:    Lasse Collin
4081ad8388SMartin Matuska //              Joachim Henke
4181ad8388SMartin Matuska //
4281ad8388SMartin Matuska ///////////////////////////////////////////////////////////////////////////////
4381ad8388SMartin Matuska 
4481ad8388SMartin Matuska #ifndef TUKLIB_INTEGER_H
4581ad8388SMartin Matuska #define TUKLIB_INTEGER_H
4681ad8388SMartin Matuska 
4781ad8388SMartin Matuska #include "tuklib_common.h"
48a8675d92SXin LI #include <string.h>
49a8675d92SXin LI 
50a8675d92SXin LI // Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
51a8675d92SXin LI // and such functions.
52a8675d92SXin LI #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
53a8675d92SXin LI #	include <immintrin.h>
54b333cd44SXin LI // Only include <intrin.h> when it is needed. GCC and Clang can both
55b333cd44SXin LI // use __builtin's, so we only need Windows intrinsics when using MSVC.
56b333cd44SXin LI // GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
57b333cd44SXin LI // cases explicitly.
58b333cd44SXin LI #elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
59b333cd44SXin LI #	include <intrin.h>
60a8675d92SXin LI #endif
6181ad8388SMartin Matuska 
6281ad8388SMartin Matuska 
63a8675d92SXin LI ///////////////////
64a8675d92SXin LI // Byte swapping //
65a8675d92SXin LI ///////////////////
6681ad8388SMartin Matuska 
// byteswap16(num), byteswap32(num), and byteswap64(num) reverse the byte
// order of a 16-bit, 32-bit, or 64-bit unsigned integer.
//
// A compiler builtin or a system header is used when the build system has
// detected one. Anything that is still missing afterwards gets a portable
// shift-and-mask fallback at the end of this section.
//
// These can be macros that evaluate the argument more than once, so
// the argument must not have side effects.
#if defined(HAVE___BUILTIN_BSWAPXX)
	// GCC >= 4.8 and Clang
#	define byteswap16(num) __builtin_bswap16(num)
#	define byteswap32(num) __builtin_bswap32(num)
#	define byteswap64(num) __builtin_bswap64(num)

#elif defined(HAVE_BYTESWAP_H)
	// glibc, uClibc, dietlibc
#	include <byteswap.h>
#	ifdef HAVE_BSWAP_16
#		define byteswap16(num) bswap_16(num)
#	endif
#	ifdef HAVE_BSWAP_32
#		define byteswap32(num) bswap_32(num)
#	endif
#	ifdef HAVE_BSWAP_64
#		define byteswap64(num) bswap_64(num)
#	endif

#elif defined(HAVE_SYS_ENDIAN_H)
	// *BSDs and Darwin
#	include <sys/endian.h>
#	ifdef __OpenBSD__
#		define byteswap16(num) swap16(num)
#		define byteswap32(num) swap32(num)
#		define byteswap64(num) swap64(num)
#	else
#		define byteswap16(num) bswap16(num)
#		define byteswap32(num) bswap32(num)
#		define byteswap64(num) bswap64(num)
#	endif

#elif defined(HAVE_SYS_BYTEORDER_H)
	// Solaris
#	include <sys/byteorder.h>
#	ifdef BSWAP_16
#		define byteswap16(num) BSWAP_16(num)
#	endif
#	ifdef BSWAP_32
#		define byteswap32(num) BSWAP_32(num)
#	endif
#	ifdef BSWAP_64
#		define byteswap64(num) BSWAP_64(num)
#	endif
	// This header can also provide the endianness conversions directly.
#	ifdef BE_16
#		define conv16be(num) BE_16(num)
#	endif
#	ifdef BE_32
#		define conv32be(num) BE_32(num)
#	endif
#	ifdef BE_64
#		define conv64be(num) BE_64(num)
#	endif
#	ifdef LE_16
#		define conv16le(num) LE_16(num)
#	endif
#	ifdef LE_32
#		define conv32le(num) LE_32(num)
#	endif
#	ifdef LE_64
#		define conv64le(num) LE_64(num)
#	endif
#endif

// Portable fallbacks.
//
// The whole expansion is wrapped in parentheses so that the macro behaves
// like a function call in every expression context. Without the outer
// parentheses "sizeof byteswap16(x)" would be parsed as "sizeof(uint16_t)"
// followed by a stray parenthesized expression.
#ifndef byteswap16
#	define byteswap16(n) ((uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	))
#endif

#ifndef byteswap32
#	define byteswap32(n) ((uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	))
#endif

#ifndef byteswap64
#	define byteswap64(n) ((uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	))
#endif
15981ad8388SMartin Matuska 
// Endianness conversions between native byte order and big/little endian.
//
// Each convXXYe() is defined only if an earlier section did not already
// provide it. When Y matches the native byte order (as told by
// WORDS_BIGENDIAN) the conversion is just a cast to the fixed-width type;
// otherwise it is a byte swap.

// To/from big endian
#ifndef conv16be
#	ifdef WORDS_BIGENDIAN
#		define conv16be(num) ((uint16_t)(num))
#	else
#		define conv16be(num) byteswap16(num)
#	endif
#endif

#ifndef conv32be
#	ifdef WORDS_BIGENDIAN
#		define conv32be(num) ((uint32_t)(num))
#	else
#		define conv32be(num) byteswap32(num)
#	endif
#endif

#ifndef conv64be
#	ifdef WORDS_BIGENDIAN
#		define conv64be(num) ((uint64_t)(num))
#	else
#		define conv64be(num) byteswap64(num)
#	endif
#endif

// To/from little endian
#ifndef conv16le
#	ifdef WORDS_BIGENDIAN
#		define conv16le(num) byteswap16(num)
#	else
#		define conv16le(num) ((uint16_t)(num))
#	endif
#endif

#ifndef conv32le
#	ifdef WORDS_BIGENDIAN
#		define conv32le(num) byteswap32(num)
#	else
#		define conv32le(num) ((uint32_t)(num))
#	endif
#endif

#ifndef conv64le
#	ifdef WORDS_BIGENDIAN
#		define conv64le(num) byteswap64(num)
#	else
#		define conv64le(num) ((uint64_t)(num))
#	endif
#endif
20081ad8388SMartin Matuska 
20181ad8388SMartin Matuska 
202a8675d92SXin LI ////////////////////////////////
203a8675d92SXin LI // Unaligned reads and writes //
204a8675d92SXin LI ////////////////////////////////
205a8675d92SXin LI 
206ca6a6373SXin LI // No-strict-align archs like x86-64
207ca6a6373SXin LI // ---------------------------------
208ca6a6373SXin LI //
209a8675d92SXin LI // The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
210a8675d92SXin LI // is bad even if the uint8_pointer is properly aligned because this kind
211a8675d92SXin LI // of casts break strict aliasing rules and result in undefined behavior.
212a8675d92SXin LI // With unaligned pointers it's even worse: compilers may emit vector
213a8675d92SXin LI // instructions that require aligned pointers even if non-vector
214a8675d92SXin LI // instructions work with unaligned pointers.
215a8675d92SXin LI //
216a8675d92SXin LI // Using memcpy() is the standard compliant way to do unaligned access.
217a8675d92SXin LI // Many modern compilers inline it so there is no function call overhead.
218a8675d92SXin LI // For those compilers that don't handle the memcpy() method well, the
219a8675d92SXin LI // old casting method (that violates strict aliasing) can be requested at
220a8675d92SXin LI // build time. A third method, casting to a packed struct, would also be
221a8675d92SXin LI // an option but isn't provided to keep things simpler (it's already a mess).
222a8675d92SXin LI // Hopefully this is flexible enough in practice.
223ca6a6373SXin LI //
224ca6a6373SXin LI // Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
225ca6a6373SXin LI //
226ca6a6373SXin LI //     buf[0] | (buf[1] << 8)
227ca6a6373SXin LI //
228ca6a6373SXin LI // reads a 16-bit value and can emit a single 16-bit load and produce
229ca6a6373SXin LI // the same code as with the memcpy() method. In other cases Clang and GCC
230ca6a6373SXin LI // produce either the same or better code with memcpy(). For example, Clang 9
231ca6a6373SXin LI // on x86-64 can detect 32-bit load but not 16-bit load.
232ca6a6373SXin LI //
233ca6a6373SXin LI // MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
234ca6a6373SXin LI // code for "buf[0] | (buf[1] << 8)".
235ca6a6373SXin LI //
236ca6a6373SXin LI // Conclusion: The memcpy() method is the best choice when unaligned access
237ca6a6373SXin LI // is supported.
238ca6a6373SXin LI //
239ca6a6373SXin LI // Strict-align archs like SPARC
240ca6a6373SXin LI // -----------------------------
241ca6a6373SXin LI //
242ca6a6373SXin LI // GCC versions from around 4.x to at least 13.2.0 produce worse code
243ca6a6373SXin LI // from the memcpy() method than from simple byte-by-byte shift-or code
244ca6a6373SXin LI // when reading a 32-bit integer:
245ca6a6373SXin LI //
246*26743408SXin LI //     (1) It may be constructed on stack using four 8-bit loads,
247ca6a6373SXin LI //         four 8-bit stores to stack, and finally one 32-bit load from stack.
248ca6a6373SXin LI //
249ca6a6373SXin LI //     (2) Especially with -Os, an actual memcpy() call may be emitted.
250ca6a6373SXin LI //
251ca6a6373SXin LI // This is true at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
252ca6a6373SXin LI // RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
253ca6a6373SXin LI // some processors but not all so this is relevant only in the case when
254ca6a6373SXin LI // GCC assumes that unaligned is not supported or -mstrict-align or
255ca6a6373SXin LI // -mno-unaligned-access is used.
256ca6a6373SXin LI //
257ca6a6373SXin LI // For Clang it makes little difference. ARM64 with -O2 -mstrict-align
258ca6a6373SXin LI // was one of the very few with a minor difference: the memcpy() version
259ca6a6373SXin LI // was one instruction longer.
260ca6a6373SXin LI //
261ca6a6373SXin LI // Conclusion: At least in case of GCC and Clang, byte-by-byte code is
2623b35e7eeSXin LI // the best choice for strict-align archs to do unaligned access.
263ca6a6373SXin LI //
264ca6a6373SXin LI // See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
265ca6a6373SXin LI //
266ca6a6373SXin LI // Thanks to <https://godbolt.org/> it was easy to test different compilers.
267ca6a6373SXin LI // The following is for little endian targets:
268ca6a6373SXin LI /*
269ca6a6373SXin LI #include <stdint.h>
270ca6a6373SXin LI #include <string.h>
271ca6a6373SXin LI 
272ca6a6373SXin LI uint32_t bytes16(const uint8_t *b)
273ca6a6373SXin LI {
274ca6a6373SXin LI     return (uint32_t)b[0]
275ca6a6373SXin LI         | ((uint32_t)b[1] << 8);
276ca6a6373SXin LI }
277ca6a6373SXin LI 
278ca6a6373SXin LI uint32_t copy16(const uint8_t *b)
279ca6a6373SXin LI {
280ca6a6373SXin LI     uint16_t v;
281ca6a6373SXin LI     memcpy(&v, b, sizeof(v));
282ca6a6373SXin LI     return v;
283ca6a6373SXin LI }
284ca6a6373SXin LI 
285ca6a6373SXin LI uint32_t bytes32(const uint8_t *b)
286ca6a6373SXin LI {
287ca6a6373SXin LI     return (uint32_t)b[0]
288ca6a6373SXin LI         | ((uint32_t)b[1] << 8)
289ca6a6373SXin LI         | ((uint32_t)b[2] << 16)
290ca6a6373SXin LI         | ((uint32_t)b[3] << 24);
291ca6a6373SXin LI }
292ca6a6373SXin LI 
293ca6a6373SXin LI uint32_t copy32(const uint8_t *b)
294ca6a6373SXin LI {
295ca6a6373SXin LI     uint32_t v;
296ca6a6373SXin LI     memcpy(&v, b, sizeof(v));
297ca6a6373SXin LI     return v;
298ca6a6373SXin LI }
299ca6a6373SXin LI 
300ca6a6373SXin LI void wbytes16(uint8_t *b, uint16_t v)
301ca6a6373SXin LI {
302ca6a6373SXin LI     b[0] = (uint8_t)v;
303ca6a6373SXin LI     b[1] = (uint8_t)(v >> 8);
304ca6a6373SXin LI }
305ca6a6373SXin LI 
306ca6a6373SXin LI void wcopy16(uint8_t *b, uint16_t v)
307ca6a6373SXin LI {
308ca6a6373SXin LI     memcpy(b, &v, sizeof(v));
309ca6a6373SXin LI }
310ca6a6373SXin LI 
311ca6a6373SXin LI void wbytes32(uint8_t *b, uint32_t v)
312ca6a6373SXin LI {
313ca6a6373SXin LI     b[0] = (uint8_t)v;
314ca6a6373SXin LI     b[1] = (uint8_t)(v >> 8);
315ca6a6373SXin LI     b[2] = (uint8_t)(v >> 16);
316ca6a6373SXin LI     b[3] = (uint8_t)(v >> 24);
317ca6a6373SXin LI }
318ca6a6373SXin LI 
319ca6a6373SXin LI void wcopy32(uint8_t *b, uint32_t v)
320ca6a6373SXin LI {
321ca6a6373SXin LI     memcpy(b, &v, sizeof(v));
322ca6a6373SXin LI }
323ca6a6373SXin LI */
324ca6a6373SXin LI 
325ca6a6373SXin LI 
326ca6a6373SXin LI #ifdef TUKLIB_FAST_UNALIGNED_ACCESS
32781ad8388SMartin Matuska 
/// Read a 16-bit integer in native byte order from a possibly
/// unaligned address.
///
/// \param      buf     Pointer to at least two readable bytes
///
/// \return     The bytes at buf interpreted as a native endian uint16_t
static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// Copying into a local with memcpy() is the standard compliant
	// way to do an unaligned load.
	uint16_t value;
	memcpy(&value, buf, sizeof value);
	return value;
#else
	// Requested at build time: direct cast, which violates
	// strict aliasing.
	return *(const uint16_t *)buf;
#endif
}
33981ad8388SMartin Matuska 
34081ad8388SMartin Matuska 
/// Read a 32-bit integer in native byte order from a possibly
/// unaligned address.
///
/// \param      buf     Pointer to at least four readable bytes
///
/// \return     The bytes at buf interpreted as a native endian uint32_t
static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// Copying into a local with memcpy() is the standard compliant
	// way to do an unaligned load.
	uint32_t value;
	memcpy(&value, buf, sizeof value);
	return value;
#else
	// Requested at build time: direct cast, which violates
	// strict aliasing.
	return *(const uint32_t *)buf;
#endif
}
35281ad8388SMartin Matuska 
35381ad8388SMartin Matuska 
/// Read a 64-bit integer in native byte order from a possibly
/// unaligned address.
///
/// \param      buf     Pointer to at least eight readable bytes
///
/// \return     The bytes at buf interpreted as a native endian uint64_t
static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// Copying into a local with memcpy() is the standard compliant
	// way to do an unaligned load.
	uint64_t value;
	memcpy(&value, buf, sizeof value);
	return value;
#else
	// Requested at build time: direct cast, which violates
	// strict aliasing.
	return *(const uint64_t *)buf;
#endif
}
36581ad8388SMartin Matuska 
36681ad8388SMartin Matuska 
/// Write a 16-bit integer in native byte order to a possibly
/// unaligned address.
///
/// \param      buf     Pointer to at least two writable bytes
/// \param      num     Value to store
static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// memcpy() is the standard compliant way to do an unaligned store.
	memcpy(buf, &num, sizeof num);
#else
	// Requested at build time: direct cast, which violates
	// strict aliasing.
	*(uint16_t *)buf = num;
#endif
}
37781ad8388SMartin Matuska 
37881ad8388SMartin Matuska 
/// Write a 32-bit integer in native byte order to a possibly
/// unaligned address.
///
/// \param      buf     Pointer to at least four writable bytes
/// \param      num     Value to store
static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// memcpy() is the standard compliant way to do an unaligned store.
	memcpy(buf, &num, sizeof num);
#else
	// Requested at build time: direct cast, which violates
	// strict aliasing.
	*(uint32_t *)buf = num;
#endif
}
38981ad8388SMartin Matuska 
39081ad8388SMartin Matuska 
/// Write a 64-bit integer in native byte order to a possibly
/// unaligned address.
///
/// \param      buf     Pointer to at least eight writable bytes
/// \param      num     Value to store
static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// memcpy() is the standard compliant way to do an unaligned store.
	memcpy(buf, &num, sizeof num);
#else
	// Requested at build time: direct cast, which violates
	// strict aliasing.
	*(uint64_t *)buf = num;
#endif
}
40181ad8388SMartin Matuska 
40281ad8388SMartin Matuska 
/// Read a 16-bit big endian integer from a possibly unaligned address.
///
/// \param      buf     Pointer to at least two readable bytes
///
/// \return     The value converted with conv16be()
static inline uint16_t
read16be(const uint8_t *buf)
{
	// conv16be() can be a macro that evaluates its argument more than
	// once, so the load is done into a local first.
	uint16_t native = read16ne(buf);
	return conv16be(native);
}
40981ad8388SMartin Matuska 
41081ad8388SMartin Matuska 
/// Read a 16-bit little endian integer from a possibly unaligned address.
///
/// \param      buf     Pointer to at least two readable bytes
///
/// \return     The value converted with conv16le()
static inline uint16_t
read16le(const uint8_t *buf)
{
	// conv16le() can be a macro that evaluates its argument more than
	// once, so the load is done into a local first.
	uint16_t native = read16ne(buf);
	return conv16le(native);
}
41781ad8388SMartin Matuska 
41881ad8388SMartin Matuska 
/// Read a 32-bit big endian integer from a possibly unaligned address.
///
/// \param      buf     Pointer to at least four readable bytes
///
/// \return     The value converted with conv32be()
static inline uint32_t
read32be(const uint8_t *buf)
{
	// conv32be() can be a macro that evaluates its argument more than
	// once, so the load is done into a local first.
	uint32_t native = read32ne(buf);
	return conv32be(native);
}
42581ad8388SMartin Matuska 
42681ad8388SMartin Matuska 
/// Read a 32-bit little endian integer from a possibly unaligned address.
///
/// \param      buf     Pointer to at least four readable bytes
///
/// \return     The value converted with conv32le()
static inline uint32_t
read32le(const uint8_t *buf)
{
	// conv32le() can be a macro that evaluates its argument more than
	// once, so the load is done into a local first.
	uint32_t native = read32ne(buf);
	return conv32le(native);
}
43381ad8388SMartin Matuska 
43481ad8388SMartin Matuska 
/// Read a 64-bit big endian integer from a possibly unaligned address.
///
/// \param      buf     Pointer to at least eight readable bytes
///
/// \return     The value converted with conv64be()
static inline uint64_t
read64be(const uint8_t *buf)
{
	// conv64be() can be a macro that evaluates its argument more than
	// once, so the load is done into a local first.
	uint64_t native = read64ne(buf);
	return conv64be(native);
}
441ca6a6373SXin LI 
442ca6a6373SXin LI 
/// Read a 64-bit little endian integer from a possibly unaligned address.
///
/// \param      buf     Pointer to at least eight readable bytes
///
/// \return     The value converted with conv64le()
static inline uint64_t
read64le(const uint8_t *buf)
{
	// conv64le() can be a macro that evaluates its argument more than
	// once, so the load is done into a local first.
	uint64_t native = read64ne(buf);
	return conv64le(native);
}
449ca6a6373SXin LI 
450ca6a6373SXin LI 
// Unaligned writes with a fixed byte order.
//
// NOTE: These are macros on purpose. The possible byte swap has to happen
// in a macro so that the compiler can optimize byte swapping of constants
// when glibc's or *BSD's byte swapping macros are in use. The store itself
// goes through the writeXXne() inline functions, which keeps the buf
// pointer type-checked.
#define write16be(dest, value) write16ne(dest, conv16be(value))
#define write16le(dest, value) write16ne(dest, conv16le(value))
#define write32be(dest, value) write32ne(dest, conv32be(value))
#define write32le(dest, value) write32ne(dest, conv32le(value))
#define write64be(dest, value) write64ne(dest, conv64be(value))
#define write64le(dest, value) write64ne(dest, conv64le(value))
461ca6a6373SXin LI 
46273ed8e77SXin LI #else
463ca6a6373SXin LI 
// In this branch the native endian names are plain aliases of the
// byte-by-byte functions below that match the byte order selected by
// WORDS_BIGENDIAN.
#ifndef WORDS_BIGENDIAN
	// Little endian
#	define read16ne read16le
#	define read32ne read32le
#	define read64ne read64le
#	define write16ne write16le
#	define write32ne write32le
#	define write64ne write64le
#else
	// Big endian
#	define read16ne read16be
#	define read32ne read32be
#	define read64ne read64be
#	define write16ne write16be
#	define write32ne write32be
#	define write64ne write64be
#endif
479ca6a6373SXin LI 
480ca6a6373SXin LI 
/// Read a 16-bit big endian integer one byte at a time.
///
/// \param      buf     Pointer to at least two readable bytes
///
/// \return     buf[0] as the high byte and buf[1] as the low byte
static inline uint16_t
read16be(const uint8_t *buf)
{
	return (uint16_t)(((unsigned int)buf[0] << 8) | buf[1]);
}
487ca6a6373SXin LI 
488ca6a6373SXin LI 
/// Read a 16-bit little endian integer one byte at a time.
///
/// \param      buf     Pointer to at least two readable bytes
///
/// \return     buf[0] as the low byte and buf[1] as the high byte
static inline uint16_t
read16le(const uint8_t *buf)
{
	return (uint16_t)(buf[0] | ((unsigned int)buf[1] << 8));
}
495ca6a6373SXin LI 
496ca6a6373SXin LI 
/// Read a 32-bit big endian integer one byte at a time.
///
/// \param      buf     Pointer to at least four readable bytes
///
/// \return     The four bytes combined with buf[0] as the most
///             significant byte
static inline uint32_t
read32be(const uint8_t *buf)
{
	return ((uint32_t)buf[0] << 24)
		| ((uint32_t)buf[1] << 16)
		| ((uint32_t)buf[2] << 8)
		| (uint32_t)buf[3];
}
506ca6a6373SXin LI 
507ca6a6373SXin LI 
/// Read a 32-bit little endian integer one byte at a time.
///
/// \param      buf     Pointer to at least four readable bytes
///
/// \return     The four bytes combined with buf[0] as the least
///             significant byte
static inline uint32_t
read32le(const uint8_t *buf)
{
	return (uint32_t)buf[0]
		| ((uint32_t)buf[1] << 8)
		| ((uint32_t)buf[2] << 16)
		| ((uint32_t)buf[3] << 24);
}
517ca6a6373SXin LI 
518ca6a6373SXin LI 
/// Read a 64-bit big endian integer one byte at a time.
///
/// \param      buf     Pointer to at least eight readable bytes
///
/// \return     The eight bytes combined with buf[0] as the most
///             significant byte
static inline uint64_t
read64be(const uint8_t *buf)
{
	return ((uint64_t)buf[0] << 56)
		| ((uint64_t)buf[1] << 48)
		| ((uint64_t)buf[2] << 40)
		| ((uint64_t)buf[3] << 32)
		| ((uint64_t)buf[4] << 24)
		| ((uint64_t)buf[5] << 16)
		| ((uint64_t)buf[6] << 8)
		| (uint64_t)buf[7];
}
53273ed8e77SXin LI 
53373ed8e77SXin LI 
/// Read a 64-bit little endian integer one byte at a time.
///
/// \param      buf     Pointer to at least eight readable bytes
///
/// \return     The eight bytes combined with buf[0] as the least
///             significant byte
static inline uint64_t
read64le(const uint8_t *buf)
{
	return (uint64_t)buf[0]
		| ((uint64_t)buf[1] << 8)
		| ((uint64_t)buf[2] << 16)
		| ((uint64_t)buf[3] << 24)
		| ((uint64_t)buf[4] << 32)
		| ((uint64_t)buf[5] << 40)
		| ((uint64_t)buf[6] << 48)
		| ((uint64_t)buf[7] << 56);
}
54773ed8e77SXin LI 
54873ed8e77SXin LI 
/// Write a 16-bit integer as big endian one byte at a time.
///
/// \param      buf     Pointer to at least two writable bytes
/// \param      num     Value to store
static inline void
write16be(uint8_t *buf, uint16_t num)
{
	const uint8_t high = (uint8_t)(num >> 8);
	const uint8_t low = (uint8_t)num;

	buf[0] = high;
	buf[1] = low;
}
55681ad8388SMartin Matuska 
55781ad8388SMartin Matuska 
/// Write a 16-bit integer as little endian one byte at a time.
///
/// \param      buf     Pointer to at least two writable bytes
/// \param      num     Value to store
static inline void
write16le(uint8_t *buf, uint16_t num)
{
	const uint8_t low = (uint8_t)num;
	const uint8_t high = (uint8_t)(num >> 8);

	buf[0] = low;
	buf[1] = high;
}
56581ad8388SMartin Matuska 
56681ad8388SMartin Matuska 
/// Write a 32-bit integer as big endian one byte at a time.
///
/// \param      buf     Pointer to at least four writable bytes
/// \param      num     Value to store
static inline void
write32be(uint8_t *buf, uint32_t num)
{
	// Fill from the last byte backwards, consuming the least
	// significant byte of num at each step.
	buf[3] = (uint8_t)num;
	num >>= 8;
	buf[2] = (uint8_t)num;
	num >>= 8;
	buf[1] = (uint8_t)num;
	num >>= 8;
	buf[0] = (uint8_t)num;
}
57681ad8388SMartin Matuska 
57781ad8388SMartin Matuska 
/// Write a 32-bit integer as little endian one byte at a time.
///
/// \param      buf     Pointer to at least four writable bytes
/// \param      num     Value to store
static inline void
write32le(uint8_t *buf, uint32_t num)
{
	// Fill from the first byte onwards, consuming the least
	// significant byte of num at each step.
	buf[0] = (uint8_t)num;
	num >>= 8;
	buf[1] = (uint8_t)num;
	num >>= 8;
	buf[2] = (uint8_t)num;
	num >>= 8;
	buf[3] = (uint8_t)num;
}
587ca6a6373SXin LI 
588ca6a6373SXin LI 
/// Write a 64-bit integer as big endian one byte at a time.
///
/// \param      buf     Pointer to at least eight writable bytes
/// \param      num     Value to store
static inline void
write64be(uint8_t *buf, uint64_t num)
{
	// Fill from the last byte backwards, consuming the least
	// significant byte of num at each step.
	buf[7] = (uint8_t)num;
	num >>= 8;
	buf[6] = (uint8_t)num;
	num >>= 8;
	buf[5] = (uint8_t)num;
	num >>= 8;
	buf[4] = (uint8_t)num;
	num >>= 8;
	buf[3] = (uint8_t)num;
	num >>= 8;
	buf[2] = (uint8_t)num;
	num >>= 8;
	buf[1] = (uint8_t)num;
	num >>= 8;
	buf[0] = (uint8_t)num;
}
602ca6a6373SXin LI 
603ca6a6373SXin LI 
/// Write a 64-bit integer as little endian one byte at a time.
///
/// \param      buf     Pointer to at least eight writable bytes
/// \param      num     Value to store
static inline void
write64le(uint8_t *buf, uint64_t num)
{
	// Fill from the first byte onwards, consuming the least
	// significant byte of num at each step.
	buf[0] = (uint8_t)num;
	num >>= 8;
	buf[1] = (uint8_t)num;
	num >>= 8;
	buf[2] = (uint8_t)num;
	num >>= 8;
	buf[3] = (uint8_t)num;
	num >>= 8;
	buf[4] = (uint8_t)num;
	num >>= 8;
	buf[5] = (uint8_t)num;
	num >>= 8;
	buf[6] = (uint8_t)num;
	num >>= 8;
	buf[7] = (uint8_t)num;
}
617ca6a6373SXin LI 
61881ad8388SMartin Matuska #endif
61981ad8388SMartin Matuska 
62081ad8388SMartin Matuska 
621a8675d92SXin LI //////////////////////////////
622a8675d92SXin LI // Aligned reads and writes //
623a8675d92SXin LI //////////////////////////////
624a8675d92SXin LI 
625a8675d92SXin LI // Separate functions for aligned reads and writes are provided since on
626a8675d92SXin LI // strict-align archs aligned access is much faster than unaligned access.
627a8675d92SXin LI //
628a8675d92SXin LI // Just like in the unaligned case, memcpy() is needed to avoid
629a8675d92SXin LI // strict aliasing violations. However, on archs that don't support
630a8675d92SXin LI // unaligned access the compiler cannot know that the pointers given
631a8675d92SXin LI // to memcpy() are aligned which results in slow code. As of C11 there is
632a8675d92SXin LI // no standard way to tell the compiler that we know that the address is
633a8675d92SXin LI // aligned but some compilers have language extensions to do that. With
634a8675d92SXin LI // such language extensions the memcpy() method gives excellent results.
635a8675d92SXin LI //
6363b35e7eeSXin LI // What to do on a strict-align system when no known language extensions
637a8675d92SXin LI // are available? Falling back to byte-by-byte access would be safe but ruin
638a8675d92SXin LI // optimizations that have been made specifically with aligned access in mind.
639a8675d92SXin LI // As a compromise, aligned reads will fall back to non-compliant type punning
640a8675d92SXin LI // but aligned writes will be byte-by-byte, that is, fast reads are preferred
641a8675d92SXin LI // over fast writes. This obviously isn't great but hopefully it's a working
642a8675d92SXin LI // compromise for now.
643a8675d92SXin LI //
644a8675d92SXin LI // __builtin_assume_aligned is supported by GCC >= 4.7 and clang >= 3.6.
// tuklib_memcpy_aligned(dest, src, size) is memcpy() with, when the
// compiler supports it, a hint that src is aligned to size bytes.
// Note that the hint is attached to src only.
#ifndef HAVE___BUILTIN_ASSUME_ALIGNED
	// No alignment hint available: plain memcpy().
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
	// Without fast unaligned access the aligned reads fall back to
	// type punning instead of the unhinted memcpy().
#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#else
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#endif
655a8675d92SXin LI 
656a8675d92SXin LI 
/// Read a 16-bit native endian integer from an aligned address.
///
/// \param      buf     Pointer to two readable bytes; the caller is
///                     expected to pass a suitably aligned pointer
///
/// \return     The bytes at buf interpreted as a native endian uint16_t
static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if !defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		&& !defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	uint16_t value;
	tuklib_memcpy_aligned(&value, buf, sizeof(value));
	return value;
#else
	// Non-compliant type punning, selected at build time or used as
	// the fallback when no alignment hint is available.
	return *(const uint16_t *)buf;
#endif
}
669a8675d92SXin LI 
670a8675d92SXin LI 
/// Read a 32-bit native endian integer from an aligned address.
///
/// \param      buf     Pointer to four readable bytes; the caller is
///                     expected to pass a suitably aligned pointer
///
/// \return     The bytes at buf interpreted as a native endian uint32_t
static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if !defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		&& !defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	uint32_t value;
	tuklib_memcpy_aligned(&value, buf, sizeof(value));
	return value;
#else
	// Non-compliant type punning, selected at build time or used as
	// the fallback when no alignment hint is available.
	return *(const uint32_t *)buf;
#endif
}
683a8675d92SXin LI 
684a8675d92SXin LI 
/// Read a 64-bit native endian integer from an aligned address.
///
/// \param      buf     Pointer to eight readable bytes; the caller is
///                     expected to pass a suitably aligned pointer
///
/// \return     The bytes at buf interpreted as a native endian uint64_t
static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if !defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		&& !defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	uint64_t value;
	tuklib_memcpy_aligned(&value, buf, sizeof(value));
	return value;
#else
	// Non-compliant type punning, selected at build time or used as
	// the fallback when no alignment hint is available.
	return *(const uint64_t *)buf;
#endif
}
697a8675d92SXin LI 
698a8675d92SXin LI 
/// Stores the 16-bit integer num in native byte order at buf.
///
/// buf is expected to be aligned for uint16_t access. The store goes through
/// tuklib_memcpy_aligned() unless TUKLIB_USE_UNSAFE_TYPE_PUNNING is defined,
/// in which case num is written directly through a uint16_t pointer.
static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#else
	*(uint16_t *)buf = num;
#endif
}
709a8675d92SXin LI 
710a8675d92SXin LI 
/// Stores the 32-bit integer num in native byte order at buf.
///
/// buf is expected to be aligned for uint32_t access. The store goes through
/// tuklib_memcpy_aligned() unless TUKLIB_USE_UNSAFE_TYPE_PUNNING is defined,
/// in which case num is written directly through a uint32_t pointer.
static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#else
	*(uint32_t *)buf = num;
#endif
}
721a8675d92SXin LI 
722a8675d92SXin LI 
/// Stores the 64-bit integer num in native byte order at buf.
///
/// buf is expected to be aligned for uint64_t access. The store goes through
/// tuklib_memcpy_aligned() unless TUKLIB_USE_UNSAFE_TYPE_PUNNING is defined,
/// in which case num is written directly through a uint64_t pointer.
static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#else
	*(uint64_t *)buf = num;
#endif
}
733a8675d92SXin LI 
734a8675d92SXin LI 
/// Reads an aligned 16-bit big endian integer from buf.
///
/// The value is first read in native byte order and then passed through
/// conv16be(). The temporary is kept because the conversion may be a macro
/// that evaluates its argument more than once.
static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	const uint16_t native = aligned_read16ne(buf);
	return conv16be(native);
}
741a8675d92SXin LI 
742a8675d92SXin LI 
/// Reads an aligned 16-bit little endian integer from buf.
///
/// The value is first read in native byte order and then passed through
/// conv16le(). The temporary is kept because the conversion may be a macro
/// that evaluates its argument more than once.
static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	const uint16_t native = aligned_read16ne(buf);
	return conv16le(native);
}
749a8675d92SXin LI 
750a8675d92SXin LI 
/// Reads an aligned 32-bit big endian integer from buf.
///
/// The value is first read in native byte order and then passed through
/// conv32be(). The temporary is kept because the conversion may be a macro
/// that evaluates its argument more than once.
static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	const uint32_t native = aligned_read32ne(buf);
	return conv32be(native);
}
757a8675d92SXin LI 
758a8675d92SXin LI 
/// Reads an aligned 32-bit little endian integer from buf.
///
/// The value is first read in native byte order and then passed through
/// conv32le(). The temporary is kept because the conversion may be a macro
/// that evaluates its argument more than once.
static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	const uint32_t native = aligned_read32ne(buf);
	return conv32le(native);
}
765a8675d92SXin LI 
766a8675d92SXin LI 
/// Reads an aligned 64-bit big endian integer from buf.
///
/// The value is first read in native byte order and then passed through
/// conv64be(). The temporary is kept because the conversion may be a macro
/// that evaluates its argument more than once.
static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	const uint64_t native = aligned_read64ne(buf);
	return conv64be(native);
}
773a8675d92SXin LI 
774a8675d92SXin LI 
/// Reads an aligned 64-bit little endian integer from buf.
///
/// The value is first read in native byte order and then passed through
/// conv64le(). The temporary is kept because the conversion may be a macro
/// that evaluates its argument more than once.
static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	const uint64_t native = aligned_read64ne(buf);
	return conv64le(native);
}
781a8675d92SXin LI 
782a8675d92SXin LI 
// Aligned writes with byte order conversion. Each one converts num with
// the matching convXXYe() and stores the result with aligned_writeXXne().
// These need to be macros like in the unaligned case.
#define aligned_write16be(buf, num) \
		aligned_write16ne((buf), conv16be(num))
#define aligned_write16le(buf, num) \
		aligned_write16ne((buf), conv16le(num))
#define aligned_write32be(buf, num) \
		aligned_write32ne((buf), conv32be(num))
#define aligned_write32le(buf, num) \
		aligned_write32ne((buf), conv32le(num))
#define aligned_write64be(buf, num) \
		aligned_write64ne((buf), conv64be(num))
#define aligned_write64le(buf, num) \
		aligned_write64ne((buf), conv64le(num))
790a8675d92SXin LI 
791a8675d92SXin LI 
792a8675d92SXin LI ////////////////////
793a8675d92SXin LI // Bit operations //
794a8675d92SXin LI ////////////////////
795a8675d92SXin LI 
79681ad8388SMartin Matuska static inline uint32_t
79781ad8388SMartin Matuska bsr32(uint32_t n)
79881ad8388SMartin Matuska {
79981ad8388SMartin Matuska 	// Check for ICC first, since it tends to define __GNUC__ too.
80081ad8388SMartin Matuska #if defined(__INTEL_COMPILER)
80181ad8388SMartin Matuska 	return _bit_scan_reverse(n);
80281ad8388SMartin Matuska 
803b333cd44SXin LI #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
80481ad8388SMartin Matuska 	// GCC >= 3.4 has __builtin_clz(), which gives good results on
80581ad8388SMartin Matuska 	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
80681ad8388SMartin Matuska 	// either plain BSR (so the XOR gets optimized away) or LZCNT and
80781ad8388SMartin Matuska 	// XOR (if -march indicates that SSE4a instructions are supported).
808a8675d92SXin LI 	return (uint32_t)__builtin_clz(n) ^ 31U;
80981ad8388SMartin Matuska 
81081ad8388SMartin Matuska #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
81181ad8388SMartin Matuska 	uint32_t i;
81281ad8388SMartin Matuska 	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
81381ad8388SMartin Matuska 	return i;
81481ad8388SMartin Matuska 
815a8675d92SXin LI #elif defined(_MSC_VER)
816a8675d92SXin LI 	unsigned long i;
817a8675d92SXin LI 	_BitScanReverse(&i, n);
81881ad8388SMartin Matuska 	return i;
81981ad8388SMartin Matuska 
82081ad8388SMartin Matuska #else
82181ad8388SMartin Matuska 	uint32_t i = 31;
82281ad8388SMartin Matuska 
823a8675d92SXin LI 	if ((n & 0xFFFF0000) == 0) {
82481ad8388SMartin Matuska 		n <<= 16;
82581ad8388SMartin Matuska 		i = 15;
82681ad8388SMartin Matuska 	}
82781ad8388SMartin Matuska 
828a8675d92SXin LI 	if ((n & 0xFF000000) == 0) {
82981ad8388SMartin Matuska 		n <<= 8;
83081ad8388SMartin Matuska 		i -= 8;
83181ad8388SMartin Matuska 	}
83281ad8388SMartin Matuska 
833a8675d92SXin LI 	if ((n & 0xF0000000) == 0) {
83481ad8388SMartin Matuska 		n <<= 4;
83581ad8388SMartin Matuska 		i -= 4;
83681ad8388SMartin Matuska 	}
83781ad8388SMartin Matuska 
838a8675d92SXin LI 	if ((n & 0xC0000000) == 0) {
83981ad8388SMartin Matuska 		n <<= 2;
84081ad8388SMartin Matuska 		i -= 2;
84181ad8388SMartin Matuska 	}
84281ad8388SMartin Matuska 
843a8675d92SXin LI 	if ((n & 0x80000000) == 0)
84481ad8388SMartin Matuska 		--i;
84581ad8388SMartin Matuska 
84681ad8388SMartin Matuska 	return i;
84781ad8388SMartin Matuska #endif
84881ad8388SMartin Matuska }
84981ad8388SMartin Matuska 
85081ad8388SMartin Matuska 
85181ad8388SMartin Matuska static inline uint32_t
85281ad8388SMartin Matuska clz32(uint32_t n)
85381ad8388SMartin Matuska {
85481ad8388SMartin Matuska #if defined(__INTEL_COMPILER)
85581ad8388SMartin Matuska 	return _bit_scan_reverse(n) ^ 31U;
85681ad8388SMartin Matuska 
857b333cd44SXin LI #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
858a8675d92SXin LI 	return (uint32_t)__builtin_clz(n);
85981ad8388SMartin Matuska 
86081ad8388SMartin Matuska #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
86181ad8388SMartin Matuska 	uint32_t i;
86281ad8388SMartin Matuska 	__asm__("bsrl %1, %0\n\t"
86381ad8388SMartin Matuska 		"xorl $31, %0"
86481ad8388SMartin Matuska 		: "=r" (i) : "rm" (n));
86581ad8388SMartin Matuska 	return i;
86681ad8388SMartin Matuska 
867a8675d92SXin LI #elif defined(_MSC_VER)
868a8675d92SXin LI 	unsigned long i;
869a8675d92SXin LI 	_BitScanReverse(&i, n);
87081ad8388SMartin Matuska 	return i ^ 31U;
87181ad8388SMartin Matuska 
87281ad8388SMartin Matuska #else
87381ad8388SMartin Matuska 	uint32_t i = 0;
87481ad8388SMartin Matuska 
875a8675d92SXin LI 	if ((n & 0xFFFF0000) == 0) {
87681ad8388SMartin Matuska 		n <<= 16;
87781ad8388SMartin Matuska 		i = 16;
87881ad8388SMartin Matuska 	}
87981ad8388SMartin Matuska 
880a8675d92SXin LI 	if ((n & 0xFF000000) == 0) {
88181ad8388SMartin Matuska 		n <<= 8;
88281ad8388SMartin Matuska 		i += 8;
88381ad8388SMartin Matuska 	}
88481ad8388SMartin Matuska 
885a8675d92SXin LI 	if ((n & 0xF0000000) == 0) {
88681ad8388SMartin Matuska 		n <<= 4;
88781ad8388SMartin Matuska 		i += 4;
88881ad8388SMartin Matuska 	}
88981ad8388SMartin Matuska 
890a8675d92SXin LI 	if ((n & 0xC0000000) == 0) {
89181ad8388SMartin Matuska 		n <<= 2;
89281ad8388SMartin Matuska 		i += 2;
89381ad8388SMartin Matuska 	}
89481ad8388SMartin Matuska 
895a8675d92SXin LI 	if ((n & 0x80000000) == 0)
89681ad8388SMartin Matuska 		++i;
89781ad8388SMartin Matuska 
89881ad8388SMartin Matuska 	return i;
89981ad8388SMartin Matuska #endif
90081ad8388SMartin Matuska }
90181ad8388SMartin Matuska 
90281ad8388SMartin Matuska 
90381ad8388SMartin Matuska static inline uint32_t
90481ad8388SMartin Matuska ctz32(uint32_t n)
90581ad8388SMartin Matuska {
90681ad8388SMartin Matuska #if defined(__INTEL_COMPILER)
90781ad8388SMartin Matuska 	return _bit_scan_forward(n);
90881ad8388SMartin Matuska 
909b333cd44SXin LI #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
910a8675d92SXin LI 	return (uint32_t)__builtin_ctz(n);
91181ad8388SMartin Matuska 
91281ad8388SMartin Matuska #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
91381ad8388SMartin Matuska 	uint32_t i;
91481ad8388SMartin Matuska 	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
91581ad8388SMartin Matuska 	return i;
91681ad8388SMartin Matuska 
917a8675d92SXin LI #elif defined(_MSC_VER)
918a8675d92SXin LI 	unsigned long i;
919a8675d92SXin LI 	_BitScanForward(&i, n);
92081ad8388SMartin Matuska 	return i;
92181ad8388SMartin Matuska 
92281ad8388SMartin Matuska #else
92381ad8388SMartin Matuska 	uint32_t i = 0;
92481ad8388SMartin Matuska 
925a8675d92SXin LI 	if ((n & 0x0000FFFF) == 0) {
92681ad8388SMartin Matuska 		n >>= 16;
92781ad8388SMartin Matuska 		i = 16;
92881ad8388SMartin Matuska 	}
92981ad8388SMartin Matuska 
930a8675d92SXin LI 	if ((n & 0x000000FF) == 0) {
93181ad8388SMartin Matuska 		n >>= 8;
93281ad8388SMartin Matuska 		i += 8;
93381ad8388SMartin Matuska 	}
93481ad8388SMartin Matuska 
935a8675d92SXin LI 	if ((n & 0x0000000F) == 0) {
93681ad8388SMartin Matuska 		n >>= 4;
93781ad8388SMartin Matuska 		i += 4;
93881ad8388SMartin Matuska 	}
93981ad8388SMartin Matuska 
940a8675d92SXin LI 	if ((n & 0x00000003) == 0) {
94181ad8388SMartin Matuska 		n >>= 2;
94281ad8388SMartin Matuska 		i += 2;
94381ad8388SMartin Matuska 	}
94481ad8388SMartin Matuska 
945a8675d92SXin LI 	if ((n & 0x00000001) == 0)
94681ad8388SMartin Matuska 		++i;
94781ad8388SMartin Matuska 
94881ad8388SMartin Matuska 	return i;
94981ad8388SMartin Matuska #endif
95081ad8388SMartin Matuska }
95181ad8388SMartin Matuska 
95281ad8388SMartin Matuska #define bsf32 ctz32
95381ad8388SMartin Matuska 
95481ad8388SMartin Matuska #endif
955