// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_integer.h
/// \brief      Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
///   - Unaligned native endian reads: readXXne(ptr)
///   - Unaligned native endian writes: writeXXne(ptr, num)
///   - Aligned native endian reads: aligned_readXXne(ptr)
///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: bswapXX(num)
///   - Byte order conversions to/from native (byteswaps if Y isn't
///     the native endianness): convXXYe(num)
///   - Unaligned reads: readXXYe(ptr)
///   - Unaligned writes: writeXXYe(ptr, num)
///   - Aligned reads: aligned_readXXYe(ptr)
///   - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
///
/// Bit scan operations for non-zero 32-bit integers (inline functions):
///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
///   - Count leading zeros: clz32(num)
///   - Count trailing zeros: ctz32(num)
///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
///
/// The above bit scan operations return 0-31. If num is zero,
/// the result is undefined.
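///
/// For example, an illustrative usage sketch (values follow directly from
/// the semantics described above; not tied to any particular target):
///
///     uint8_t buf[4];
///     write32le(buf, 0x12345678);  // buf = { 0x78, 0x56, 0x34, 0x12 }
///     uint32_t v = read32be(buf);  // v == 0x78563412
///     uint32_t b = bsr32(v);       // b == 30 (bit 30 is the highest set bit)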
//
//  Authors:    Lasse Collin
//              Joachim Henke
//
///////////////////////////////////////////////////////////////////////////////

#ifndef TUKLIB_INTEGER_H
#define TUKLIB_INTEGER_H

#include "tuklib_common.h"
#include <string.h>

// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
// and such functions.
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
#	include <immintrin.h>
// Only include <intrin.h> when it is needed. GCC and Clang can both
// use __builtin's, so we only need Windows intrinsics when using MSVC.
// GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
// cases explicitly.
#elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
#	include <intrin.h>
#endif


///////////////////
// Byte swapping //
///////////////////

#if defined(HAVE___BUILTIN_BSWAPXX)
	// GCC >= 4.8 and Clang
#	define bswap16(n) __builtin_bswap16(n)
#	define bswap32(n) __builtin_bswap32(n)
#	define bswap64(n) __builtin_bswap64(n)

#elif defined(HAVE_BYTESWAP_H)
	// glibc, uClibc, dietlibc
#	include <byteswap.h>
#	ifdef HAVE_BSWAP_16
#		define bswap16(num) bswap_16(num)
#	endif
#	ifdef HAVE_BSWAP_32
#		define bswap32(num) bswap_32(num)
#	endif
#	ifdef HAVE_BSWAP_64
#		define bswap64(num) bswap_64(num)
#	endif

#elif defined(HAVE_SYS_ENDIAN_H)
	// *BSDs and Darwin
#	include <sys/endian.h>

#elif defined(HAVE_SYS_BYTEORDER_H)
	// Solaris
#	include <sys/byteorder.h>
#	ifdef BSWAP_16
#		define bswap16(num) BSWAP_16(num)
#	endif
#	ifdef BSWAP_32
#		define bswap32(num) BSWAP_32(num)
#	endif
#	ifdef BSWAP_64
#		define bswap64(num) BSWAP_64(num)
#	endif
#	ifdef BE_16
#		define conv16be(num) BE_16(num)
#	endif
#	ifdef BE_32
#		define conv32be(num) BE_32(num)
#	endif
#	ifdef BE_64
#		define conv64be(num) BE_64(num)
#	endif
#	ifdef LE_16
#		define conv16le(num) LE_16(num)
#	endif
#	ifdef LE_32
#		define conv32le(num) LE_32(num)
#	endif
#	ifdef LE_64
#		define conv64le(num) LE_64(num)
#	endif
#endif

#ifndef bswap16
#	define bswap16(n) (uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	)
#endif

#ifndef bswap32
#	define bswap32(n) (uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	)
#endif

#ifndef bswap64
#	define bswap64(n) (uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	)
#endif
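
// For example (illustrative only): bswap32(UINT32_C(0x11223344)) evaluates
// to 0x44332211 no matter which of the bswap32 definitions above is in use.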

// Define conversion macros using the basic byte swapping macros.
#ifdef WORDS_BIGENDIAN
#	ifndef conv16be
#		define conv16be(num) ((uint16_t)(num))
#	endif
#	ifndef conv32be
#		define conv32be(num) ((uint32_t)(num))
#	endif
#	ifndef conv64be
#		define conv64be(num) ((uint64_t)(num))
#	endif
#	ifndef conv16le
#		define conv16le(num) bswap16(num)
#	endif
#	ifndef conv32le
#		define conv32le(num) bswap32(num)
#	endif
#	ifndef conv64le
#		define conv64le(num) bswap64(num)
#	endif
#else
#	ifndef conv16be
#		define conv16be(num) bswap16(num)
#	endif
#	ifndef conv32be
#		define conv32be(num) bswap32(num)
#	endif
#	ifndef conv64be
#		define conv64be(num) bswap64(num)
#	endif
#	ifndef conv16le
#		define conv16le(num) ((uint16_t)(num))
#	endif
#	ifndef conv32le
#		define conv32le(num) ((uint32_t)(num))
#	endif
#	ifndef conv64le
#		define conv64le(num) ((uint64_t)(num))
#	endif
#endif


////////////////////////////////
// Unaligned reads and writes //
////////////////////////////////

// No-strict-align archs like x86-64
// ---------------------------------
//
// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because this kind
// of cast breaks strict aliasing rules and results in undefined behavior.
// With unaligned pointers it's even worse: compilers may emit vector
// instructions that require aligned pointers even if non-vector
// instructions work with unaligned pointers.
//
// Using memcpy() is the standard compliant way to do unaligned access.
// Many modern compilers inline it so there is no function call overhead.
// For those compilers that don't handle the memcpy() method well, the
// old casting method (that violates strict aliasing) can be requested at
// build time. A third method, casting to a packed struct, would also be
// an option but isn't provided to keep things simpler (it's already a mess).
// Hopefully this is flexible enough in practice.
//
// Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
//
//     buf[0] | (buf[1] << 8)
//
// reads a 16-bit value and can emit a single 16-bit load and produce
// code identical to the memcpy() method. In other cases Clang and GCC
// produce either the same or better code with memcpy(). For example, Clang 9
// on x86-64 can detect a 32-bit load but not a 16-bit load.
//
// MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
// code for "buf[0] | (buf[1] << 8)".
//
// Conclusion: The memcpy() method is the best choice when unaligned access
// is supported.
//
// Strict-align archs like SPARC
// -----------------------------
//
// GCC versions from around 4.x to at least 13.2.0 produce worse code
// from the memcpy() method than from simple byte-by-byte shift-or code
// when reading a 32-bit integer:
//
//     (1) It may be constructed on stack using four 8-bit loads,
//         four 8-bit stores to stack, and finally one 32-bit load from stack.
//
//     (2) Especially with -Os, an actual memcpy() call may be emitted.
//
// This is true at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
// RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
// some processors but not all, so this is relevant only when GCC assumes
// that unaligned access is not supported or when -mstrict-align or
// -mno-unaligned-access is used.
//
// For Clang it makes little difference. ARM64 with -O2 -mstrict-align
// was one of the very few with a minor difference: the memcpy() version
// was one instruction longer.
//
// Conclusion: At least in the case of GCC and Clang, byte-by-byte code is
// the best choice for strict-align archs to do unaligned access.
//
// See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
//
// Thanks to <https://godbolt.org/> it was easy to test different compilers.
// The following is for little endian targets:
/*
#include <stdint.h>
#include <string.h>

uint32_t bytes16(const uint8_t *b)
{
    return (uint32_t)b[0]
        | ((uint32_t)b[1] << 8);
}

uint32_t copy16(const uint8_t *b)
{
    uint16_t v;
    memcpy(&v, b, sizeof(v));
    return v;
}

uint32_t bytes32(const uint8_t *b)
{
    return (uint32_t)b[0]
        | ((uint32_t)b[1] << 8)
        | ((uint32_t)b[2] << 16)
        | ((uint32_t)b[3] << 24);
}

uint32_t copy32(const uint8_t *b)
{
    uint32_t v;
    memcpy(&v, b, sizeof(v));
    return v;
}

void wbytes16(uint8_t *b, uint16_t v)
{
    b[0] = (uint8_t)v;
    b[1] = (uint8_t)(v >> 8);
}

void wcopy16(uint8_t *b, uint16_t v)
{
    memcpy(b, &v, sizeof(v));
}

void wbytes32(uint8_t *b, uint32_t v)
{
    b[0] = (uint8_t)v;
    b[1] = (uint8_t)(v >> 8);
    b[2] = (uint8_t)(v >> 16);
    b[3] = (uint8_t)(v >> 24);
}

void wcopy32(uint8_t *b, uint32_t v)
{
    memcpy(b, &v, sizeof(v));
}
*/


#ifdef TUKLIB_FAST_UNALIGNED_ACCESS

static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64le(num);
}


// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
#define write16be(buf, num) write16ne(buf, conv16be(num))
#define write32be(buf, num) write32ne(buf, conv32be(num))
#define write64be(buf, num) write64ne(buf, conv64be(num))
#define write16le(buf, num) write16ne(buf, conv16le(num))
#define write32le(buf, num) write32ne(buf, conv32le(num))
#define write64le(buf, num) write64ne(buf, conv64le(num))
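//
// For example (an illustrative sketch; assumes a little endian target):
//
//     write32be(buf, UINT32_C(0x11223344));
//     // expands to write32ne(buf, conv32be(UINT32_C(0x11223344))), so the
//     // byte swap folds at compile time and a single 32-bit store of the
//     // constant 0x44332211 can be emitted.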

#else

#ifdef WORDS_BIGENDIAN
#	define read16ne read16be
#	define read32ne read32be
#	define read64ne read64be
#	define write16ne write16be
#	define write32ne write32be
#	define write64ne write64be
#else
#	define read16ne read16le
#	define read32ne read32le
#	define read64ne read64le
#	define write16ne write16le
#	define write32ne write32le
#	define write64ne write64le
#endif


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
	return num;
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
	return num;
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0] << 24;
	num |= (uint32_t)buf[1] << 16;
	num |= (uint32_t)buf[2] << 8;
	num |= (uint32_t)buf[3];
	return num;
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0];
	num |= (uint32_t)buf[1] << 8;
	num |= (uint32_t)buf[2] << 16;
	num |= (uint32_t)buf[3] << 24;
	return num;
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0] << 56;
	num |= (uint64_t)buf[1] << 48;
	num |= (uint64_t)buf[2] << 40;
	num |= (uint64_t)buf[3] << 32;
	num |= (uint64_t)buf[4] << 24;
	num |= (uint64_t)buf[5] << 16;
	num |= (uint64_t)buf[6] << 8;
	num |= (uint64_t)buf[7];
	return num;
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0];
	num |= (uint64_t)buf[1] << 8;
	num |= (uint64_t)buf[2] << 16;
	num |= (uint64_t)buf[3] << 24;
	num |= (uint64_t)buf[4] << 32;
	num |= (uint64_t)buf[5] << 40;
	num |= (uint64_t)buf[6] << 48;
	num |= (uint64_t)buf[7] << 56;
	return num;
}


static inline void
write16be(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)(num >> 8);
	buf[1] = (uint8_t)num;
	return;
}


static inline void
write16le(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	return;
}


static inline void
write32be(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)(num >> 24);
	buf[1] = (uint8_t)(num >> 16);
	buf[2] = (uint8_t)(num >> 8);
	buf[3] = (uint8_t)num;
	return;
}


static inline void
write32le(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	return;
}


static inline void
write64be(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)(num >> 56);
	buf[1] = (uint8_t)(num >> 48);
	buf[2] = (uint8_t)(num >> 40);
	buf[3] = (uint8_t)(num >> 32);
	buf[4] = (uint8_t)(num >> 24);
	buf[5] = (uint8_t)(num >> 16);
	buf[6] = (uint8_t)(num >> 8);
	buf[7] = (uint8_t)num;
	return;
}


static inline void
write64le(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	buf[4] = (uint8_t)(num >> 32);
	buf[5] = (uint8_t)(num >> 40);
	buf[6] = (uint8_t)(num >> 48);
	buf[7] = (uint8_t)(num >> 56);
	return;
}

#endif


//////////////////////////////
// Aligned reads and writes //
//////////////////////////////

// Separate functions for aligned reads and writes are provided since on
// strict-align archs aligned access is much faster than unaligned access.
//
// Just like in the unaligned case, memcpy() is needed to avoid
// strict aliasing violations. However, on archs that don't support
// unaligned access the compiler cannot know that the pointers given
// to memcpy() are aligned which results in slow code. As of C11 there is
// no standard way to tell the compiler that we know that the address is
// aligned but some compilers have language extensions to do that. With
// such language extensions the memcpy() method gives excellent results.
//
// What to do on a strict-align system when no known language extensions
// are available? Falling back to byte-by-byte access would be safe but ruin
// optimizations that have been made specifically with aligned access in mind.
// As a compromise, aligned reads will fall back to non-compliant type punning
// but aligned writes will be byte-by-byte, that is, fast reads are preferred
// over fast writes. This obviously isn't great but hopefully it's a working
// compromise for now.
//
// __builtin_assume_aligned is supported by GCC >= 4.7 and Clang >= 3.6.
#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#else
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#endif
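
// Illustrative sketch only (sum32le is a hypothetical caller, not part of
// this header): when the caller can guarantee 4-byte alignment of p, the
// aligned variants below let strict-align targets use one load per element:
//
//     uint32_t sum32le(const uint8_t *p, size_t count) // p 4-byte aligned
//     {
//         uint32_t sum = 0;
//         for (size_t i = 0; i < count; ++i)
//             sum += aligned_read32le(p + 4 * i);
//         return sum;
//     }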


static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64le(num);
}


// These need to be macros like in the unaligned case.
#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))


////////////////////
// Bit operations //
////////////////////

static inline uint32_t
bsr32(uint32_t n)
{
	// Check for ICC first, since it tends to define __GNUC__ too.
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	// GCC >= 3.4 has __builtin_clz(), which gives good results on
	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
	// either plain BSR (so the XOR gets optimized away) or LZCNT and
	// XOR (if -march indicates that SSE4a instructions are supported).
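	//
	// Worked example (illustrative): for n = 0x00010000 the highest set
	// bit is bit 16, __builtin_clz(n) == 15, and 15 ^ 31 == 16.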
	return (uint32_t)__builtin_clz(n) ^ 31U;

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i;

#else
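	// Generic fallback: a branching binary search that narrows down the
	// position of the highest set bit 16, 8, 4, 2, and 1 bits at a time.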
	uint32_t i = 31;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 15;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i -= 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i -= 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i -= 2;
	}

	if ((n & 0x80000000) == 0)
		--i;

	return i;
#endif
}


static inline uint32_t
clz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n) ^ 31U;

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	return (uint32_t)__builtin_clz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0\n\t"
		"xorl $31, %0"
		: "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i ^ 31U;

#else
	uint32_t i = 0;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 16;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i += 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i += 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i += 2;
	}

	if ((n & 0x80000000) == 0)
		++i;

	return i;
#endif
}


static inline uint32_t
ctz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_forward(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
	return (uint32_t)__builtin_ctz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanForward(&i, n);
	return i;

#else
	uint32_t i = 0;

	if ((n & 0x0000FFFF) == 0) {
		n >>= 16;
		i = 16;
	}

	if ((n & 0x000000FF) == 0) {
		n >>= 8;
		i += 8;
	}

	if ((n & 0x0000000F) == 0) {
		n >>= 4;
		i += 4;
	}

	if ((n & 0x00000003) == 0) {
		n >>= 2;
		i += 2;
	}

	if ((n & 0x00000001) == 0)
		++i;

	return i;
#endif
}

#define bsf32 ctz32

#endif