xref: /freebsd/contrib/xz/src/common/tuklib_integer.h (revision e6bfd18d21b225af6a0ed67ceeaf1293b7b9eba5)
1 ///////////////////////////////////////////////////////////////////////////////
2 //
3 /// \file       tuklib_integer.h
4 /// \brief      Various integer and bit operations
5 ///
6 /// This file provides macros or functions to do some basic integer and bit
7 /// operations.
8 ///
9 /// Native endian inline functions (XX = 16, 32, or 64):
10 ///   - Unaligned native endian reads: readXXne(ptr)
11 ///   - Unaligned native endian writes: writeXXne(ptr, num)
12 ///   - Aligned native endian reads: aligned_readXXne(ptr)
13 ///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
14 ///
15 /// Endianness-converting integer operations (these can be macros!)
16 /// (XX = 16, 32, or 64; Y = b or l):
17 ///   - Byte swapping: bswapXX(num)
18 ///   - Byte order conversions to/from native (byteswaps if Y isn't
19 ///     the native endianness): convXXYe(num)
20 ///   - Unaligned reads: readXXYe(ptr)
21 ///   - Unaligned writes: writeXXYe(ptr, num)
22 ///   - Aligned reads: aligned_readXXYe(ptr)
23 ///   - Aligned writes: aligned_writeXXYe(ptr, num)
24 ///
25 /// Since the above can macros, the arguments should have no side effects
26 /// because they may be evaluated more than once.
27 ///
28 /// Bit scan operations for non-zero 32-bit integers (inline functions):
29 ///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
30 ///   - Count leading zeros: clz32(num)
31 ///   - Count trailing zeros: ctz32(num)
32 ///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
33 ///
34 /// The above bit scan operations return 0-31. If num is zero,
35 /// the result is undefined.
36 //
37 //  Authors:    Lasse Collin
38 //              Joachim Henke
39 //
40 //  This file has been put into the public domain.
41 //  You can do whatever you want with this file.
42 //
43 ///////////////////////////////////////////////////////////////////////////////
44 
45 #ifndef TUKLIB_INTEGER_H
46 #define TUKLIB_INTEGER_H
47 
48 #include "tuklib_common.h"
49 #include <string.h>
50 
51 // Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
52 // and such functions.
53 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
54 #	include <immintrin.h>
55 // Only include <intrin.h> when it is needed. GCC and Clang can both
56 // use __builtin's, so we only need Windows instrincs when using MSVC.
57 // GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
58 // cases explicitly.
59 #elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
60 #	include <intrin.h>
61 #endif
62 
63 
64 ///////////////////
65 // Byte swapping //
66 ///////////////////
67 
68 #if defined(HAVE___BUILTIN_BSWAPXX)
69 	// GCC >= 4.8 and Clang
70 #	define bswap16(n) __builtin_bswap16(n)
71 #	define bswap32(n) __builtin_bswap32(n)
72 #	define bswap64(n) __builtin_bswap64(n)
73 
74 #elif defined(HAVE_BYTESWAP_H)
75 	// glibc, uClibc, dietlibc
76 #	include <byteswap.h>
77 #	ifdef HAVE_BSWAP_16
78 #		define bswap16(num) bswap_16(num)
79 #	endif
80 #	ifdef HAVE_BSWAP_32
81 #		define bswap32(num) bswap_32(num)
82 #	endif
83 #	ifdef HAVE_BSWAP_64
84 #		define bswap64(num) bswap_64(num)
85 #	endif
86 
87 #elif defined(HAVE_SYS_ENDIAN_H)
88 	// *BSDs and Darwin
89 #	include <sys/endian.h>
90 
91 #elif defined(HAVE_SYS_BYTEORDER_H)
92 	// Solaris
93 #	include <sys/byteorder.h>
94 #	ifdef BSWAP_16
95 #		define bswap16(num) BSWAP_16(num)
96 #	endif
97 #	ifdef BSWAP_32
98 #		define bswap32(num) BSWAP_32(num)
99 #	endif
100 #	ifdef BSWAP_64
101 #		define bswap64(num) BSWAP_64(num)
102 #	endif
103 #	ifdef BE_16
104 #		define conv16be(num) BE_16(num)
105 #	endif
106 #	ifdef BE_32
107 #		define conv32be(num) BE_32(num)
108 #	endif
109 #	ifdef BE_64
110 #		define conv64be(num) BE_64(num)
111 #	endif
112 #	ifdef LE_16
113 #		define conv16le(num) LE_16(num)
114 #	endif
115 #	ifdef LE_32
116 #		define conv32le(num) LE_32(num)
117 #	endif
118 #	ifdef LE_64
119 #		define conv64le(num) LE_64(num)
120 #	endif
121 #endif
122 
123 #ifndef bswap16
124 #	define bswap16(n) (uint16_t)( \
125 		  (((n) & 0x00FFU) << 8) \
126 		| (((n) & 0xFF00U) >> 8) \
127 	)
128 #endif
129 
130 #ifndef bswap32
131 #	define bswap32(n) (uint32_t)( \
132 		  (((n) & UINT32_C(0x000000FF)) << 24) \
133 		| (((n) & UINT32_C(0x0000FF00)) << 8) \
134 		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
135 		| (((n) & UINT32_C(0xFF000000)) >> 24) \
136 	)
137 #endif
138 
139 #ifndef bswap64
140 #	define bswap64(n) (uint64_t)( \
141 		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
142 		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
143 		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
144 		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
145 		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
146 		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
147 		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
148 		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
149 	)
150 #endif
151 
152 // Define conversion macros using the basic byte swapping macros.
153 #ifdef WORDS_BIGENDIAN
154 #	ifndef conv16be
155 #		define conv16be(num) ((uint16_t)(num))
156 #	endif
157 #	ifndef conv32be
158 #		define conv32be(num) ((uint32_t)(num))
159 #	endif
160 #	ifndef conv64be
161 #		define conv64be(num) ((uint64_t)(num))
162 #	endif
163 #	ifndef conv16le
164 #		define conv16le(num) bswap16(num)
165 #	endif
166 #	ifndef conv32le
167 #		define conv32le(num) bswap32(num)
168 #	endif
169 #	ifndef conv64le
170 #		define conv64le(num) bswap64(num)
171 #	endif
172 #else
173 #	ifndef conv16be
174 #		define conv16be(num) bswap16(num)
175 #	endif
176 #	ifndef conv32be
177 #		define conv32be(num) bswap32(num)
178 #	endif
179 #	ifndef conv64be
180 #		define conv64be(num) bswap64(num)
181 #	endif
182 #	ifndef conv16le
183 #		define conv16le(num) ((uint16_t)(num))
184 #	endif
185 #	ifndef conv32le
186 #		define conv32le(num) ((uint32_t)(num))
187 #	endif
188 #	ifndef conv64le
189 #		define conv64le(num) ((uint64_t)(num))
190 #	endif
191 #endif
192 
193 
194 ////////////////////////////////
195 // Unaligned reads and writes //
196 ////////////////////////////////
197 
198 // The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
199 // is bad even if the uint8_pointer is properly aligned because this kind
200 // of casts break strict aliasing rules and result in undefined behavior.
201 // With unaligned pointers it's even worse: compilers may emit vector
202 // instructions that require aligned pointers even if non-vector
203 // instructions work with unaligned pointers.
204 //
205 // Using memcpy() is the standard compliant way to do unaligned access.
206 // Many modern compilers inline it so there is no function call overhead.
207 // For those compilers that don't handle the memcpy() method well, the
208 // old casting method (that violates strict aliasing) can be requested at
209 // build time. A third method, casting to a packed struct, would also be
210 // an option but isn't provided to keep things simpler (it's already a mess).
211 // Hopefully this is flexible enough in practice.
212 
213 static inline uint16_t
214 read16ne(const uint8_t *buf)
215 {
216 #if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
217 		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
218 	return *(const uint16_t *)buf;
219 #else
220 	uint16_t num;
221 	memcpy(&num, buf, sizeof(num));
222 	return num;
223 #endif
224 }
225 
226 
227 static inline uint32_t
228 read32ne(const uint8_t *buf)
229 {
230 #if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
231 		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
232 	return *(const uint32_t *)buf;
233 #else
234 	uint32_t num;
235 	memcpy(&num, buf, sizeof(num));
236 	return num;
237 #endif
238 }
239 
240 
241 static inline uint64_t
242 read64ne(const uint8_t *buf)
243 {
244 #if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
245 		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
246 	return *(const uint64_t *)buf;
247 #else
248 	uint64_t num;
249 	memcpy(&num, buf, sizeof(num));
250 	return num;
251 #endif
252 }
253 
254 
255 static inline void
256 write16ne(uint8_t *buf, uint16_t num)
257 {
258 #if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
259 		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
260 	*(uint16_t *)buf = num;
261 #else
262 	memcpy(buf, &num, sizeof(num));
263 #endif
264 	return;
265 }
266 
267 
268 static inline void
269 write32ne(uint8_t *buf, uint32_t num)
270 {
271 #if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
272 		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
273 	*(uint32_t *)buf = num;
274 #else
275 	memcpy(buf, &num, sizeof(num));
276 #endif
277 	return;
278 }
279 
280 
281 static inline void
282 write64ne(uint8_t *buf, uint64_t num)
283 {
284 #if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
285 		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
286 	*(uint64_t *)buf = num;
287 #else
288 	memcpy(buf, &num, sizeof(num));
289 #endif
290 	return;
291 }
292 
293 
294 static inline uint16_t
295 read16be(const uint8_t *buf)
296 {
297 #if defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
298 	uint16_t num = read16ne(buf);
299 	return conv16be(num);
300 #else
301 	uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
302 	return num;
303 #endif
304 }
305 
306 
307 static inline uint16_t
308 read16le(const uint8_t *buf)
309 {
310 #if !defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
311 	uint16_t num = read16ne(buf);
312 	return conv16le(num);
313 #else
314 	uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
315 	return num;
316 #endif
317 }
318 
319 
320 static inline uint32_t
321 read32be(const uint8_t *buf)
322 {
323 #if defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
324 	uint32_t num = read32ne(buf);
325 	return conv32be(num);
326 #else
327 	uint32_t num = (uint32_t)buf[0] << 24;
328 	num |= (uint32_t)buf[1] << 16;
329 	num |= (uint32_t)buf[2] << 8;
330 	num |= (uint32_t)buf[3];
331 	return num;
332 #endif
333 }
334 
335 
336 static inline uint32_t
337 read32le(const uint8_t *buf)
338 {
339 #if !defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
340 	uint32_t num = read32ne(buf);
341 	return conv32le(num);
342 #else
343 	uint32_t num = (uint32_t)buf[0];
344 	num |= (uint32_t)buf[1] << 8;
345 	num |= (uint32_t)buf[2] << 16;
346 	num |= (uint32_t)buf[3] << 24;
347 	return num;
348 #endif
349 }
350 
351 
352 static inline uint64_t
353 read64be(const uint8_t *buf)
354 {
355 #if defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
356 	uint64_t num = read64ne(buf);
357 	return conv64be(num);
358 #else
359 	uint64_t num = (uint64_t)buf[0] << 56;
360 	num |= (uint64_t)buf[1] << 48;
361 	num |= (uint64_t)buf[2] << 40;
362 	num |= (uint64_t)buf[3] << 32;
363 	num |= (uint64_t)buf[4] << 24;
364 	num |= (uint64_t)buf[5] << 16;
365 	num |= (uint64_t)buf[6] << 8;
366 	num |= (uint64_t)buf[7];
367 	return num;
368 #endif
369 }
370 
371 
372 static inline uint64_t
373 read64le(const uint8_t *buf)
374 {
375 #if !defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
376 	uint64_t num = read64ne(buf);
377 	return conv64le(num);
378 #else
379 	uint64_t num = (uint64_t)buf[0];
380 	num |= (uint64_t)buf[1] << 8;
381 	num |= (uint64_t)buf[2] << 16;
382 	num |= (uint64_t)buf[3] << 24;
383 	num |= (uint64_t)buf[4] << 32;
384 	num |= (uint64_t)buf[5] << 40;
385 	num |= (uint64_t)buf[6] << 48;
386 	num |= (uint64_t)buf[7] << 56;
387 	return num;
388 #endif
389 }
390 
391 
392 // NOTE: Possible byte swapping must be done in a macro to allow the compiler
393 // to optimize byte swapping of constants when using glibc's or *BSD's
394 // byte swapping macros. The actual write is done in an inline function
395 // to make type checking of the buf pointer possible.
396 #if defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
397 #	define write16be(buf, num) write16ne(buf, conv16be(num))
398 #	define write32be(buf, num) write32ne(buf, conv32be(num))
399 #	define write64be(buf, num) write64ne(buf, conv64be(num))
400 #endif
401 
402 #if !defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
403 #	define write16le(buf, num) write16ne(buf, conv16le(num))
404 #	define write32le(buf, num) write32ne(buf, conv32le(num))
405 #	define write64le(buf, num) write64ne(buf, conv64le(num))
406 #endif
407 
408 
409 #ifndef write16be
410 static inline void
411 write16be(uint8_t *buf, uint16_t num)
412 {
413 	buf[0] = (uint8_t)(num >> 8);
414 	buf[1] = (uint8_t)num;
415 	return;
416 }
417 #endif
418 
419 
420 #ifndef write16le
421 static inline void
422 write16le(uint8_t *buf, uint16_t num)
423 {
424 	buf[0] = (uint8_t)num;
425 	buf[1] = (uint8_t)(num >> 8);
426 	return;
427 }
428 #endif
429 
430 
431 #ifndef write32be
432 static inline void
433 write32be(uint8_t *buf, uint32_t num)
434 {
435 	buf[0] = (uint8_t)(num >> 24);
436 	buf[1] = (uint8_t)(num >> 16);
437 	buf[2] = (uint8_t)(num >> 8);
438 	buf[3] = (uint8_t)num;
439 	return;
440 }
441 #endif
442 
443 
444 #ifndef write32le
445 static inline void
446 write32le(uint8_t *buf, uint32_t num)
447 {
448 	buf[0] = (uint8_t)num;
449 	buf[1] = (uint8_t)(num >> 8);
450 	buf[2] = (uint8_t)(num >> 16);
451 	buf[3] = (uint8_t)(num >> 24);
452 	return;
453 }
454 #endif
455 
456 
457 //////////////////////////////
458 // Aligned reads and writes //
459 //////////////////////////////
460 
461 // Separate functions for aligned reads and writes are provided since on
462 // strict-align archs aligned access is much faster than unaligned access.
463 //
464 // Just like in the unaligned case, memcpy() is needed to avoid
465 // strict aliasing violations. However, on archs that don't support
466 // unaligned access the compiler cannot know that the pointers given
467 // to memcpy() are aligned which results in slow code. As of C11 there is
468 // no standard way to tell the compiler that we know that the address is
469 // aligned but some compilers have language extensions to do that. With
470 // such language extensions the memcpy() method gives excellent results.
471 //
472 // What to do on a strict-align system when no known language extentensions
473 // are available? Falling back to byte-by-byte access would be safe but ruin
474 // optimizations that have been made specifically with aligned access in mind.
475 // As a compromise, aligned reads will fall back to non-compliant type punning
476 // but aligned writes will be byte-by-byte, that is, fast reads are preferred
477 // over fast writes. This obviously isn't great but hopefully it's a working
478 // compromise for now.
479 //
480 // __builtin_assume_aligned is support by GCC >= 4.7 and clang >= 3.6.
481 #ifdef HAVE___BUILTIN_ASSUME_ALIGNED
482 #	define tuklib_memcpy_aligned(dest, src, size) \
483 		memcpy(dest, __builtin_assume_aligned(src, size), size)
484 #else
485 #	define tuklib_memcpy_aligned(dest, src, size) \
486 		memcpy(dest, src, size)
487 #	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
488 #		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
489 #	endif
490 #endif
491 
492 
493 static inline uint16_t
494 aligned_read16ne(const uint8_t *buf)
495 {
496 #if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
497 		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
498 	return *(const uint16_t *)buf;
499 #else
500 	uint16_t num;
501 	tuklib_memcpy_aligned(&num, buf, sizeof(num));
502 	return num;
503 #endif
504 }
505 
506 
507 static inline uint32_t
508 aligned_read32ne(const uint8_t *buf)
509 {
510 #if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
511 		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
512 	return *(const uint32_t *)buf;
513 #else
514 	uint32_t num;
515 	tuklib_memcpy_aligned(&num, buf, sizeof(num));
516 	return num;
517 #endif
518 }
519 
520 
521 static inline uint64_t
522 aligned_read64ne(const uint8_t *buf)
523 {
524 #if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
525 		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
526 	return *(const uint64_t *)buf;
527 #else
528 	uint64_t num;
529 	tuklib_memcpy_aligned(&num, buf, sizeof(num));
530 	return num;
531 #endif
532 }
533 
534 
535 static inline void
536 aligned_write16ne(uint8_t *buf, uint16_t num)
537 {
538 #ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
539 	*(uint16_t *)buf = num;
540 #else
541 	tuklib_memcpy_aligned(buf, &num, sizeof(num));
542 #endif
543 	return;
544 }
545 
546 
547 static inline void
548 aligned_write32ne(uint8_t *buf, uint32_t num)
549 {
550 #ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
551 	*(uint32_t *)buf = num;
552 #else
553 	tuklib_memcpy_aligned(buf, &num, sizeof(num));
554 #endif
555 	return;
556 }
557 
558 
559 static inline void
560 aligned_write64ne(uint8_t *buf, uint64_t num)
561 {
562 #ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
563 	*(uint64_t *)buf = num;
564 #else
565 	tuklib_memcpy_aligned(buf, &num, sizeof(num));
566 #endif
567 	return;
568 }
569 
570 
571 static inline uint16_t
572 aligned_read16be(const uint8_t *buf)
573 {
574 	uint16_t num = aligned_read16ne(buf);
575 	return conv16be(num);
576 }
577 
578 
579 static inline uint16_t
580 aligned_read16le(const uint8_t *buf)
581 {
582 	uint16_t num = aligned_read16ne(buf);
583 	return conv16le(num);
584 }
585 
586 
587 static inline uint32_t
588 aligned_read32be(const uint8_t *buf)
589 {
590 	uint32_t num = aligned_read32ne(buf);
591 	return conv32be(num);
592 }
593 
594 
595 static inline uint32_t
596 aligned_read32le(const uint8_t *buf)
597 {
598 	uint32_t num = aligned_read32ne(buf);
599 	return conv32le(num);
600 }
601 
602 
603 static inline uint64_t
604 aligned_read64be(const uint8_t *buf)
605 {
606 	uint64_t num = aligned_read64ne(buf);
607 	return conv64be(num);
608 }
609 
610 
611 static inline uint64_t
612 aligned_read64le(const uint8_t *buf)
613 {
614 	uint64_t num = aligned_read64ne(buf);
615 	return conv64le(num);
616 }
617 
618 
619 // These need to be macros like in the unaligned case.
620 #define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
621 #define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
622 #define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
623 #define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
624 #define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
625 #define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))
626 
627 
628 ////////////////////
629 // Bit operations //
630 ////////////////////
631 
632 static inline uint32_t
633 bsr32(uint32_t n)
634 {
635 	// Check for ICC first, since it tends to define __GNUC__ too.
636 #if defined(__INTEL_COMPILER)
637 	return _bit_scan_reverse(n);
638 
639 #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
640 	// GCC >= 3.4 has __builtin_clz(), which gives good results on
641 	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
642 	// either plain BSR (so the XOR gets optimized away) or LZCNT and
643 	// XOR (if -march indicates that SSE4a instructions are supported).
644 	return (uint32_t)__builtin_clz(n) ^ 31U;
645 
646 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
647 	uint32_t i;
648 	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
649 	return i;
650 
651 #elif defined(_MSC_VER)
652 	unsigned long i;
653 	_BitScanReverse(&i, n);
654 	return i;
655 
656 #else
657 	uint32_t i = 31;
658 
659 	if ((n & 0xFFFF0000) == 0) {
660 		n <<= 16;
661 		i = 15;
662 	}
663 
664 	if ((n & 0xFF000000) == 0) {
665 		n <<= 8;
666 		i -= 8;
667 	}
668 
669 	if ((n & 0xF0000000) == 0) {
670 		n <<= 4;
671 		i -= 4;
672 	}
673 
674 	if ((n & 0xC0000000) == 0) {
675 		n <<= 2;
676 		i -= 2;
677 	}
678 
679 	if ((n & 0x80000000) == 0)
680 		--i;
681 
682 	return i;
683 #endif
684 }
685 
686 
687 static inline uint32_t
688 clz32(uint32_t n)
689 {
690 #if defined(__INTEL_COMPILER)
691 	return _bit_scan_reverse(n) ^ 31U;
692 
693 #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
694 	return (uint32_t)__builtin_clz(n);
695 
696 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
697 	uint32_t i;
698 	__asm__("bsrl %1, %0\n\t"
699 		"xorl $31, %0"
700 		: "=r" (i) : "rm" (n));
701 	return i;
702 
703 #elif defined(_MSC_VER)
704 	unsigned long i;
705 	_BitScanReverse(&i, n);
706 	return i ^ 31U;
707 
708 #else
709 	uint32_t i = 0;
710 
711 	if ((n & 0xFFFF0000) == 0) {
712 		n <<= 16;
713 		i = 16;
714 	}
715 
716 	if ((n & 0xFF000000) == 0) {
717 		n <<= 8;
718 		i += 8;
719 	}
720 
721 	if ((n & 0xF0000000) == 0) {
722 		n <<= 4;
723 		i += 4;
724 	}
725 
726 	if ((n & 0xC0000000) == 0) {
727 		n <<= 2;
728 		i += 2;
729 	}
730 
731 	if ((n & 0x80000000) == 0)
732 		++i;
733 
734 	return i;
735 #endif
736 }
737 
738 
739 static inline uint32_t
740 ctz32(uint32_t n)
741 {
742 #if defined(__INTEL_COMPILER)
743 	return _bit_scan_forward(n);
744 
745 #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
746 	return (uint32_t)__builtin_ctz(n);
747 
748 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
749 	uint32_t i;
750 	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
751 	return i;
752 
753 #elif defined(_MSC_VER)
754 	unsigned long i;
755 	_BitScanForward(&i, n);
756 	return i;
757 
758 #else
759 	uint32_t i = 0;
760 
761 	if ((n & 0x0000FFFF) == 0) {
762 		n >>= 16;
763 		i = 16;
764 	}
765 
766 	if ((n & 0x000000FF) == 0) {
767 		n >>= 8;
768 		i += 8;
769 	}
770 
771 	if ((n & 0x0000000F) == 0) {
772 		n >>= 4;
773 		i += 4;
774 	}
775 
776 	if ((n & 0x00000003) == 0) {
777 		n >>= 2;
778 		i += 2;
779 	}
780 
781 	if ((n & 0x00000001) == 0)
782 		++i;
783 
784 	return i;
785 #endif
786 }
787 
788 #define bsf32 ctz32
789 
790 #endif
791