xref: /freebsd/contrib/xz/src/common/tuklib_integer.h (revision 3a56015a2f5d630910177fa79a522bb95511ccf7)
1 // SPDX-License-Identifier: 0BSD
2 
3 ///////////////////////////////////////////////////////////////////////////////
4 //
5 /// \file       tuklib_integer.h
6 /// \brief      Various integer and bit operations
7 ///
8 /// This file provides macros or functions to do some basic integer and bit
9 /// operations.
10 ///
11 /// Native endian inline functions (XX = 16, 32, or 64):
12 ///   - Unaligned native endian reads: readXXne(ptr)
13 ///   - Unaligned native endian writes: writeXXne(ptr, num)
14 ///   - Aligned native endian reads: aligned_readXXne(ptr)
15 ///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
16 ///
17 /// Endianness-converting integer operations (these can be macros!)
18 /// (XX = 16, 32, or 64; Y = b or l):
19 ///   - Byte swapping: byteswapXX(num)
20 ///   - Byte order conversions to/from native (byteswaps if Y isn't
21 ///     the native endianness): convXXYe(num)
22 ///   - Unaligned reads: readXXYe(ptr)
23 ///   - Unaligned writes: writeXXYe(ptr, num)
24 ///   - Aligned reads: aligned_readXXYe(ptr)
25 ///   - Aligned writes: aligned_writeXXYe(ptr, num)
26 ///
/// Since the above can be macros, the arguments should have no side effects
28 /// because they may be evaluated more than once.
29 ///
30 /// Bit scan operations for non-zero 32-bit integers (inline functions):
31 ///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
32 ///   - Count leading zeros: clz32(num)
33 ///   - Count trailing zeros: ctz32(num)
34 ///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
35 ///
36 /// The above bit scan operations return 0-31. If num is zero,
37 /// the result is undefined.
38 //
39 //  Authors:    Lasse Collin
40 //              Joachim Henke
41 //
42 ///////////////////////////////////////////////////////////////////////////////
43 
44 #ifndef TUKLIB_INTEGER_H
45 #define TUKLIB_INTEGER_H
46 
47 #include "tuklib_common.h"
48 #include <string.h>
49 
50 // Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
51 // and such functions.
52 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
53 #	include <immintrin.h>
// Only include <intrin.h> when it is needed. GCC and Clang can both
// use __builtin's, so we only need Windows intrinsics when using MSVC.
56 // GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
57 // cases explicitly.
58 #elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
59 #	include <intrin.h>
60 #endif
61 
62 
63 ///////////////////
64 // Byte swapping //
65 ///////////////////
66 
// Pick the platform's native byte swapping primitives when the build
// configuration says they exist. Anything left undefined here is covered
// by the generic fallback macros that follow.
#if defined(HAVE___BUILTIN_BSWAPXX)
	// Compiler builtins: GCC >= 4.8 and Clang
#	define byteswap16(n) __builtin_bswap16(n)
#	define byteswap32(n) __builtin_bswap32(n)
#	define byteswap64(n) __builtin_bswap64(n)

#elif defined(HAVE_BYTESWAP_H)
	// <byteswap.h>: glibc, uClibc, dietlibc
#	include <byteswap.h>
#	if defined(HAVE_BSWAP_16)
#		define byteswap16(n) bswap_16(n)
#	endif
#	if defined(HAVE_BSWAP_32)
#		define byteswap32(n) bswap_32(n)
#	endif
#	if defined(HAVE_BSWAP_64)
#		define byteswap64(n) bswap_64(n)
#	endif

#elif defined(HAVE_SYS_ENDIAN_H)
	// <sys/endian.h>: *BSDs and Darwin
#	include <sys/endian.h>
#	ifndef __OpenBSD__
#		define byteswap16(n) bswap16(n)
#		define byteswap32(n) bswap32(n)
#		define byteswap64(n) bswap64(n)
#	else
		// OpenBSD names these swapXX() instead of bswapXX().
#		define byteswap16(n) swap16(n)
#		define byteswap32(n) swap32(n)
#		define byteswap64(n) swap64(n)
#	endif

#elif defined(HAVE_SYS_BYTEORDER_H)
	// <sys/byteorder.h>: Solaris. Besides plain byte swapping it may
	// also provide ready-made endianness conversion macros.
#	include <sys/byteorder.h>
#	if defined(BSWAP_16)
#		define byteswap16(n) BSWAP_16(n)
#	endif
#	if defined(BSWAP_32)
#		define byteswap32(n) BSWAP_32(n)
#	endif
#	if defined(BSWAP_64)
#		define byteswap64(n) BSWAP_64(n)
#	endif
#	if defined(BE_16)
#		define conv16be(n) BE_16(n)
#	endif
#	if defined(BE_32)
#		define conv32be(n) BE_32(n)
#	endif
#	if defined(BE_64)
#		define conv64be(n) BE_64(n)
#	endif
#	if defined(LE_16)
#		define conv16le(n) LE_16(n)
#	endif
#	if defined(LE_32)
#		define conv32le(n) LE_32(n)
#	endif
#	if defined(LE_64)
#		define conv64le(n) LE_64(n)
#	endif
#endif
130 
#ifndef byteswap16
// Portable fallback: exchange the two low bytes of n and truncate the
// result to 16 bits. Being a macro, it evaluates n more than once, so
// the argument must not have side effects. The whole expansion is
// parenthesized so that it can be used as an operand of any operator
// (sizeof included), just like the convXX macros.
#	define byteswap16(n) ((uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	))
#endif
137 
#ifndef byteswap32
// Portable fallback: reverse the order of the four bytes of n.
// Being a macro, it evaluates n more than once, so the argument must
// not have side effects. The whole expansion is parenthesized so that
// it can be used as an operand of any operator (sizeof included).
#	define byteswap32(n) ((uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	))
#endif
146 
#ifndef byteswap64
// Portable fallback: reverse the order of the eight bytes of n.
// Being a macro, it evaluates n more than once, so the argument must
// not have side effects. The whole expansion is parenthesized so that
// it can be used as an operand of any operator (sizeof included).
#	define byteswap64(n) ((uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	))
#endif
159 
// Build the convXXYe() conversions on top of the byte swapping macros.
// Converting to or from the native byte order is only a cast to the
// target width; the other byte order needs a byte swap. Conversions
// that have already been defined are left untouched.
#ifndef WORDS_BIGENDIAN
	// Little endian host: "le" is native, "be" is swapped.
#	if !defined(conv16le)
#		define conv16le(num) ((uint16_t)(num))
#	endif
#	if !defined(conv32le)
#		define conv32le(num) ((uint32_t)(num))
#	endif
#	if !defined(conv64le)
#		define conv64le(num) ((uint64_t)(num))
#	endif
#	if !defined(conv16be)
#		define conv16be(num) byteswap16(num)
#	endif
#	if !defined(conv32be)
#		define conv32be(num) byteswap32(num)
#	endif
#	if !defined(conv64be)
#		define conv64be(num) byteswap64(num)
#	endif
#else
	// Big endian host: "be" is native, "le" is swapped.
#	if !defined(conv16be)
#		define conv16be(num) ((uint16_t)(num))
#	endif
#	if !defined(conv32be)
#		define conv32be(num) ((uint32_t)(num))
#	endif
#	if !defined(conv64be)
#		define conv64be(num) ((uint64_t)(num))
#	endif
#	if !defined(conv16le)
#		define conv16le(num) byteswap16(num)
#	endif
#	if !defined(conv32le)
#		define conv32le(num) byteswap32(num)
#	endif
#	if !defined(conv64le)
#		define conv64le(num) byteswap64(num)
#	endif
#endif
200 
201 
202 ////////////////////////////////
203 // Unaligned reads and writes //
204 ////////////////////////////////
205 
206 // No-strict-align archs like x86-64
207 // ---------------------------------
208 //
// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because this kind
// of cast breaks strict aliasing rules and results in undefined behavior.
212 // With unaligned pointers it's even worse: compilers may emit vector
213 // instructions that require aligned pointers even if non-vector
214 // instructions work with unaligned pointers.
215 //
216 // Using memcpy() is the standard compliant way to do unaligned access.
217 // Many modern compilers inline it so there is no function call overhead.
218 // For those compilers that don't handle the memcpy() method well, the
219 // old casting method (that violates strict aliasing) can be requested at
220 // build time. A third method, casting to a packed struct, would also be
221 // an option but isn't provided to keep things simpler (it's already a mess).
222 // Hopefully this is flexible enough in practice.
223 //
224 // Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
225 //
226 //     buf[0] | (buf[1] << 8)
227 //
// reads a 16-bit value and can emit a single 16-bit load, producing
// code identical to the memcpy() method. In other cases Clang and GCC
230 // produce either the same or better code with memcpy(). For example, Clang 9
231 // on x86-64 can detect 32-bit load but not 16-bit load.
232 //
233 // MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
234 // code for "buf[0] | (buf[1] << 8)".
235 //
236 // Conclusion: The memcpy() method is the best choice when unaligned access
237 // is supported.
238 //
239 // Strict-align archs like SPARC
240 // -----------------------------
241 //
// GCC versions from around 4.x to at least 13.2.0 produce worse code
243 // from the memcpy() method than from simple byte-by-byte shift-or code
244 // when reading a 32-bit integer:
245 //
246 //     (1) It may be constructed on stack using four 8-bit loads,
247 //         four 8-bit stores to stack, and finally one 32-bit load from stack.
248 //
249 //     (2) Especially with -Os, an actual memcpy() call may be emitted.
250 //
// This is true at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
252 // RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
253 // some processors but not all so this is relevant only in the case when
254 // GCC assumes that unaligned is not supported or -mstrict-align or
255 // -mno-unaligned-access is used.
256 //
// For Clang it makes little difference. ARM64 with -O2 -mstrict-align
// was one of the very few with a minor difference: the memcpy() version
259 // was one instruction longer.
260 //
261 // Conclusion: At least in case of GCC and Clang, byte-by-byte code is
262 // the best choice for strict-align archs to do unaligned access.
263 //
264 // See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
265 //
266 // Thanks to <https://godbolt.org/> it was easy to test different compilers.
267 // The following is for little endian targets:
268 /*
269 #include <stdint.h>
270 #include <string.h>
271 
272 uint32_t bytes16(const uint8_t *b)
273 {
274     return (uint32_t)b[0]
275         | ((uint32_t)b[1] << 8);
276 }
277 
278 uint32_t copy16(const uint8_t *b)
279 {
280     uint16_t v;
281     memcpy(&v, b, sizeof(v));
282     return v;
283 }
284 
285 uint32_t bytes32(const uint8_t *b)
286 {
287     return (uint32_t)b[0]
288         | ((uint32_t)b[1] << 8)
289         | ((uint32_t)b[2] << 16)
290         | ((uint32_t)b[3] << 24);
291 }
292 
293 uint32_t copy32(const uint8_t *b)
294 {
295     uint32_t v;
296     memcpy(&v, b, sizeof(v));
297     return v;
298 }
299 
300 void wbytes16(uint8_t *b, uint16_t v)
301 {
302     b[0] = (uint8_t)v;
303     b[1] = (uint8_t)(v >> 8);
304 }
305 
306 void wcopy16(uint8_t *b, uint16_t v)
307 {
308     memcpy(b, &v, sizeof(v));
309 }
310 
311 void wbytes32(uint8_t *b, uint32_t v)
312 {
313     b[0] = (uint8_t)v;
314     b[1] = (uint8_t)(v >> 8);
315     b[2] = (uint8_t)(v >> 16);
316     b[3] = (uint8_t)(v >> 24);
317 }
318 
319 void wcopy32(uint8_t *b, uint32_t v)
320 {
321     memcpy(b, &v, sizeof(v));
322 }
323 */
324 
325 
326 #ifdef TUKLIB_FAST_UNALIGNED_ACCESS
327 
// Read a 16-bit integer in native byte order from buf.
// buf doesn't need to be aligned.
static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// Standard-compliant way: copy the bytes into a properly typed object.
	uint16_t value;
	memcpy(&value, buf, sizeof(value));
	return value;
#else
	// Requested at build time: access the buffer through a cast pointer.
	return *(const uint16_t *)buf;
#endif
}
339 
340 
// Read a 32-bit integer in native byte order from buf.
// buf doesn't need to be aligned.
static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// Standard-compliant way: copy the bytes into a properly typed object.
	uint32_t value;
	memcpy(&value, buf, sizeof(value));
	return value;
#else
	// Requested at build time: access the buffer through a cast pointer.
	return *(const uint32_t *)buf;
#endif
}
352 
353 
// Read a 64-bit integer in native byte order from buf.
// buf doesn't need to be aligned.
static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// Standard-compliant way: copy the bytes into a properly typed object.
	uint64_t value;
	memcpy(&value, buf, sizeof(value));
	return value;
#else
	// Requested at build time: access the buffer through a cast pointer.
	return *(const uint64_t *)buf;
#endif
}
365 
366 
// Store num as a 16-bit integer in native byte order to buf.
// buf doesn't need to be aligned.
static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// Standard-compliant way: copy the object representation.
	memcpy(buf, &num, sizeof(num));
#else
	// Requested at build time: store through a cast pointer.
	*(uint16_t *)buf = num;
#endif
}
377 
378 
// Store num as a 32-bit integer in native byte order to buf.
// buf doesn't need to be aligned.
static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// Standard-compliant way: copy the object representation.
	memcpy(buf, &num, sizeof(num));
#else
	// Requested at build time: store through a cast pointer.
	*(uint32_t *)buf = num;
#endif
}
389 
390 
// Store num as a 64-bit integer in native byte order to buf.
// buf doesn't need to be aligned.
static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// Standard-compliant way: copy the object representation.
	memcpy(buf, &num, sizeof(num));
#else
	// Requested at build time: store through a cast pointer.
	*(uint64_t *)buf = num;
#endif
}
401 
402 
// Read an unaligned big endian 16-bit integer from buf.
static inline uint16_t
read16be(const uint8_t *buf)
{
	// Load in native byte order into a variable first: conv16be() can be
	// a macro that evaluates its argument more than once.
	uint16_t native = read16ne(buf);
	return conv16be(native);
}
409 
410 
// Read an unaligned little endian 16-bit integer from buf.
static inline uint16_t
read16le(const uint8_t *buf)
{
	// Load in native byte order into a variable first: conv16le() can be
	// a macro that evaluates its argument more than once.
	uint16_t native = read16ne(buf);
	return conv16le(native);
}
417 
418 
// Read an unaligned big endian 32-bit integer from buf.
static inline uint32_t
read32be(const uint8_t *buf)
{
	// Load in native byte order into a variable first: conv32be() can be
	// a macro that evaluates its argument more than once.
	uint32_t native = read32ne(buf);
	return conv32be(native);
}
425 
426 
// Read an unaligned little endian 32-bit integer from buf.
static inline uint32_t
read32le(const uint8_t *buf)
{
	// Load in native byte order into a variable first: conv32le() can be
	// a macro that evaluates its argument more than once.
	uint32_t native = read32ne(buf);
	return conv32le(native);
}
433 
434 
// Read an unaligned big endian 64-bit integer from buf.
static inline uint64_t
read64be(const uint8_t *buf)
{
	// Load in native byte order into a variable first: conv64be() can be
	// a macro that evaluates its argument more than once.
	uint64_t native = read64ne(buf);
	return conv64be(native);
}
441 
442 
// Read an unaligned little endian 64-bit integer from buf.
static inline uint64_t
read64le(const uint8_t *buf)
{
	// Load in native byte order into a variable first: conv64le() can be
	// a macro that evaluates its argument more than once.
	uint64_t native = read64ne(buf);
	return conv64le(native);
}
449 
450 
// Unaligned writes in a specific byte order.
//
// The possible byte swap has to happen in a macro: that lets the compiler
// optimize byte swapping of constants when glibc's or *BSD's byte swapping
// macros are in use. The store itself is done by an inline function so
// that the type of the buffer pointer still gets checked.
#define write16le(dest, value) write16ne((dest), conv16le(value))
#define write16be(dest, value) write16ne((dest), conv16be(value))
#define write32le(dest, value) write32ne((dest), conv32le(value))
#define write32be(dest, value) write32ne((dest), conv32be(value))
#define write64le(dest, value) write64ne((dest), conv64le(value))
#define write64be(dest, value) write64ne((dest), conv64be(value))
461 
462 #else
463 
// Without fast unaligned access the native endian functions are plain
// aliases of the byte-by-byte functions of the matching byte order.
#ifndef WORDS_BIGENDIAN
#	define read16ne read16le
#	define write16ne write16le
#	define read32ne read32le
#	define write32ne write32le
#	define read64ne read64le
#	define write64ne write64le
#else
#	define read16ne read16be
#	define write16ne write16be
#	define read32ne read32be
#	define write32ne write32be
#	define read64ne read64be
#	define write64ne write64be
#endif
479 
480 
// Read an unaligned big endian 16-bit integer from buf one byte at a time.
static inline uint16_t
read16be(const uint8_t *buf)
{
	const uint16_t hi = buf[0];
	const uint16_t lo = buf[1];
	return (uint16_t)((hi << 8) | lo);
}
487 
488 
// Read an unaligned little endian 16-bit integer from buf one byte
// at a time.
static inline uint16_t
read16le(const uint8_t *buf)
{
	const uint16_t lo = buf[0];
	const uint16_t hi = buf[1];
	return (uint16_t)(lo | (hi << 8));
}
495 
496 
// Read an unaligned big endian 32-bit integer from buf one byte at a time.
static inline uint32_t
read32be(const uint8_t *buf)
{
	// buf[0] is the most significant byte.
	return ((uint32_t)buf[0] << 24)
			| ((uint32_t)buf[1] << 16)
			| ((uint32_t)buf[2] << 8)
			| (uint32_t)buf[3];
}
506 
507 
// Read an unaligned little endian 32-bit integer from buf one byte
// at a time.
static inline uint32_t
read32le(const uint8_t *buf)
{
	// buf[0] is the least significant byte.
	return (uint32_t)buf[0]
			| ((uint32_t)buf[1] << 8)
			| ((uint32_t)buf[2] << 16)
			| ((uint32_t)buf[3] << 24);
}
517 
518 
// Read an unaligned big endian 64-bit integer from buf one byte at a time.
static inline uint64_t
read64be(const uint8_t *buf)
{
	// buf[0] is the most significant byte.
	return ((uint64_t)buf[0] << 56)
			| ((uint64_t)buf[1] << 48)
			| ((uint64_t)buf[2] << 40)
			| ((uint64_t)buf[3] << 32)
			| ((uint64_t)buf[4] << 24)
			| ((uint64_t)buf[5] << 16)
			| ((uint64_t)buf[6] << 8)
			| (uint64_t)buf[7];
}
532 
533 
// Read an unaligned little endian 64-bit integer from buf one byte
// at a time.
static inline uint64_t
read64le(const uint8_t *buf)
{
	// buf[0] is the least significant byte.
	return (uint64_t)buf[0]
			| ((uint64_t)buf[1] << 8)
			| ((uint64_t)buf[2] << 16)
			| ((uint64_t)buf[3] << 24)
			| ((uint64_t)buf[4] << 32)
			| ((uint64_t)buf[5] << 40)
			| ((uint64_t)buf[6] << 48)
			| ((uint64_t)buf[7] << 56);
}
547 
548 
// Store num to buf as an unaligned big endian 16-bit integer,
// one byte at a time.
static inline void
write16be(uint8_t *buf, uint16_t num)
{
	// Most significant byte first.
	buf[0] = (uint8_t)((num >> 8) & 0xFF);
	buf[1] = (uint8_t)(num & 0xFF);
}
556 
557 
// Store num to buf as an unaligned little endian 16-bit integer,
// one byte at a time.
static inline void
write16le(uint8_t *buf, uint16_t num)
{
	// Least significant byte first.
	buf[0] = (uint8_t)(num & 0xFF);
	buf[1] = (uint8_t)((num >> 8) & 0xFF);
}
565 
566 
// Store num to buf as an unaligned big endian 32-bit integer,
// one byte at a time.
static inline void
write32be(uint8_t *buf, uint32_t num)
{
	// Most significant byte first.
	buf[0] = (uint8_t)((num >> 24) & 0xFF);
	buf[1] = (uint8_t)((num >> 16) & 0xFF);
	buf[2] = (uint8_t)((num >> 8) & 0xFF);
	buf[3] = (uint8_t)(num & 0xFF);
}
576 
577 
// Store num to buf as an unaligned little endian 32-bit integer,
// one byte at a time.
static inline void
write32le(uint8_t *buf, uint32_t num)
{
	// Least significant byte first.
	buf[0] = (uint8_t)(num & 0xFF);
	buf[1] = (uint8_t)((num >> 8) & 0xFF);
	buf[2] = (uint8_t)((num >> 16) & 0xFF);
	buf[3] = (uint8_t)((num >> 24) & 0xFF);
}
587 
588 
// Store num to buf as an unaligned big endian 64-bit integer,
// one byte at a time.
static inline void
write64be(uint8_t *buf, uint64_t num)
{
	// Most significant byte first.
	buf[0] = (uint8_t)((num >> 56) & 0xFF);
	buf[1] = (uint8_t)((num >> 48) & 0xFF);
	buf[2] = (uint8_t)((num >> 40) & 0xFF);
	buf[3] = (uint8_t)((num >> 32) & 0xFF);
	buf[4] = (uint8_t)((num >> 24) & 0xFF);
	buf[5] = (uint8_t)((num >> 16) & 0xFF);
	buf[6] = (uint8_t)((num >> 8) & 0xFF);
	buf[7] = (uint8_t)(num & 0xFF);
}
602 
603 
// Store num to buf as an unaligned little endian 64-bit integer,
// one byte at a time.
static inline void
write64le(uint8_t *buf, uint64_t num)
{
	// Least significant byte first.
	buf[0] = (uint8_t)(num & 0xFF);
	buf[1] = (uint8_t)((num >> 8) & 0xFF);
	buf[2] = (uint8_t)((num >> 16) & 0xFF);
	buf[3] = (uint8_t)((num >> 24) & 0xFF);
	buf[4] = (uint8_t)((num >> 32) & 0xFF);
	buf[5] = (uint8_t)((num >> 40) & 0xFF);
	buf[6] = (uint8_t)((num >> 48) & 0xFF);
	buf[7] = (uint8_t)((num >> 56) & 0xFF);
}
617 
618 #endif
619 
620 
621 //////////////////////////////
622 // Aligned reads and writes //
623 //////////////////////////////
624 
625 // Separate functions for aligned reads and writes are provided since on
626 // strict-align archs aligned access is much faster than unaligned access.
627 //
628 // Just like in the unaligned case, memcpy() is needed to avoid
629 // strict aliasing violations. However, on archs that don't support
630 // unaligned access the compiler cannot know that the pointers given
631 // to memcpy() are aligned which results in slow code. As of C11 there is
632 // no standard way to tell the compiler that we know that the address is
633 // aligned but some compilers have language extensions to do that. With
634 // such language extensions the memcpy() method gives excellent results.
635 //
636 // What to do on a strict-align system when no known language extensions
637 // are available? Falling back to byte-by-byte access would be safe but ruin
638 // optimizations that have been made specifically with aligned access in mind.
639 // As a compromise, aligned reads will fall back to non-compliant type punning
640 // but aligned writes will be byte-by-byte, that is, fast reads are preferred
641 // over fast writes. This obviously isn't great but hopefully it's a working
642 // compromise for now.
643 //
// __builtin_assume_aligned is supported by GCC >= 4.7 and clang >= 3.6.
#ifndef HAVE___BUILTIN_ASSUME_ALIGNED
	// The alignment cannot be communicated to the compiler:
	// this is a plain memcpy().
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
	// If fast unaligned access isn't available either, make
	// the aligned_readXXne() functions use type punning instead.
#	if !defined(TUKLIB_FAST_UNALIGNED_ACCESS)
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#else
	// memcpy() with a promise to the compiler that src is aligned
	// to a multiple of size bytes.
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#endif
655 
656 
// Read a 16-bit integer in native byte order from buf, which is expected
// to be aligned for a 16-bit access.
static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if !defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		&& !defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	uint16_t value;
	tuklib_memcpy_aligned(&value, buf, sizeof(value));
	return value;
#else
	// Type punning: requested at build time or used as a fallback.
	return *(const uint16_t *)buf;
#endif
}
669 
670 
// Read a 32-bit integer in native byte order from buf, which is expected
// to be aligned for a 32-bit access.
static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if !defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		&& !defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	uint32_t value;
	tuklib_memcpy_aligned(&value, buf, sizeof(value));
	return value;
#else
	// Type punning: requested at build time or used as a fallback.
	return *(const uint32_t *)buf;
#endif
}
683 
684 
// Read a 64-bit integer in native byte order from buf, which is expected
// to be aligned for a 64-bit access.
static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if !defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		&& !defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	uint64_t value;
	tuklib_memcpy_aligned(&value, buf, sizeof(value));
	return value;
#else
	// Type punning: requested at build time or used as a fallback.
	return *(const uint64_t *)buf;
#endif
}
697 
698 
// Store num as a 16-bit integer in native byte order to buf, which is
// expected to be aligned for a 16-bit access.
static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#else
	// Requested at build time: store through a cast pointer.
	*(uint16_t *)buf = num;
#endif
}
709 
710 
// Store num as a 32-bit integer in native byte order to buf, which is
// expected to be aligned for a 32-bit access.
static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#else
	// Requested at build time: store through a cast pointer.
	*(uint32_t *)buf = num;
#endif
}
721 
722 
// Store num as a 64-bit integer in native byte order to buf, which is
// expected to be aligned for a 64-bit access.
static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#else
	// Requested at build time: store through a cast pointer.
	*(uint64_t *)buf = num;
#endif
}
733 
734 
// Read an aligned big endian 16-bit integer from buf.
static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	// Load into a variable first: conv16be() can be a macro that
	// evaluates its argument more than once.
	uint16_t native = aligned_read16ne(buf);
	return conv16be(native);
}
741 
742 
// Read an aligned little endian 16-bit integer from buf.
static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	// Load into a variable first: conv16le() can be a macro that
	// evaluates its argument more than once.
	uint16_t native = aligned_read16ne(buf);
	return conv16le(native);
}
749 
750 
// Read an aligned big endian 32-bit integer from buf.
static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	// Load into a variable first: conv32be() can be a macro that
	// evaluates its argument more than once.
	uint32_t native = aligned_read32ne(buf);
	return conv32be(native);
}
757 
758 
// Read an aligned little endian 32-bit integer from buf.
static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	// Load into a variable first: conv32le() can be a macro that
	// evaluates its argument more than once.
	uint32_t native = aligned_read32ne(buf);
	return conv32le(native);
}
765 
766 
// Read an aligned big endian 64-bit integer from buf.
static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	// Load into a variable first: conv64be() can be a macro that
	// evaluates its argument more than once.
	uint64_t native = aligned_read64ne(buf);
	return conv64be(native);
}
773 
774 
// Read an aligned little endian 64-bit integer from buf.
static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	// Load into a variable first: conv64le() can be a macro that
	// evaluates its argument more than once.
	uint64_t native = aligned_read64ne(buf);
	return conv64le(native);
}
781 
782 
// Aligned writes in a specific byte order. As with the unaligned writes,
// these have to be macros: the byte order conversion is applied to
// the value before it is passed to the native endian store function.
#define aligned_write16be(dest, value) \
		aligned_write16ne((dest), conv16be(value))
#define aligned_write16le(dest, value) \
		aligned_write16ne((dest), conv16le(value))
#define aligned_write32be(dest, value) \
		aligned_write32ne((dest), conv32be(value))
#define aligned_write32le(dest, value) \
		aligned_write32ne((dest), conv32le(value))
#define aligned_write64be(dest, value) \
		aligned_write64ne((dest), conv64be(value))
#define aligned_write64le(dest, value) \
		aligned_write64ne((dest), conv64le(value))
790 
791 
792 ////////////////////
793 // Bit operations //
794 ////////////////////
795 
796 static inline uint32_t
797 bsr32(uint32_t n)
798 {
799 	// Check for ICC first, since it tends to define __GNUC__ too.
800 #if defined(__INTEL_COMPILER)
801 	return _bit_scan_reverse(n);
802 
803 #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
804 	// GCC >= 3.4 has __builtin_clz(), which gives good results on
805 	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
806 	// either plain BSR (so the XOR gets optimized away) or LZCNT and
807 	// XOR (if -march indicates that SSE4a instructions are supported).
808 	return (uint32_t)__builtin_clz(n) ^ 31U;
809 
810 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
811 	uint32_t i;
812 	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
813 	return i;
814 
815 #elif defined(_MSC_VER)
816 	unsigned long i;
817 	_BitScanReverse(&i, n);
818 	return i;
819 
820 #else
821 	uint32_t i = 31;
822 
823 	if ((n & 0xFFFF0000) == 0) {
824 		n <<= 16;
825 		i = 15;
826 	}
827 
828 	if ((n & 0xFF000000) == 0) {
829 		n <<= 8;
830 		i -= 8;
831 	}
832 
833 	if ((n & 0xF0000000) == 0) {
834 		n <<= 4;
835 		i -= 4;
836 	}
837 
838 	if ((n & 0xC0000000) == 0) {
839 		n <<= 2;
840 		i -= 2;
841 	}
842 
843 	if ((n & 0x80000000) == 0)
844 		--i;
845 
846 	return i;
847 #endif
848 }
849 
850 
851 static inline uint32_t
852 clz32(uint32_t n)
853 {
854 #if defined(__INTEL_COMPILER)
855 	return _bit_scan_reverse(n) ^ 31U;
856 
857 #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
858 	return (uint32_t)__builtin_clz(n);
859 
860 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
861 	uint32_t i;
862 	__asm__("bsrl %1, %0\n\t"
863 		"xorl $31, %0"
864 		: "=r" (i) : "rm" (n));
865 	return i;
866 
867 #elif defined(_MSC_VER)
868 	unsigned long i;
869 	_BitScanReverse(&i, n);
870 	return i ^ 31U;
871 
872 #else
873 	uint32_t i = 0;
874 
875 	if ((n & 0xFFFF0000) == 0) {
876 		n <<= 16;
877 		i = 16;
878 	}
879 
880 	if ((n & 0xFF000000) == 0) {
881 		n <<= 8;
882 		i += 8;
883 	}
884 
885 	if ((n & 0xF0000000) == 0) {
886 		n <<= 4;
887 		i += 4;
888 	}
889 
890 	if ((n & 0xC0000000) == 0) {
891 		n <<= 2;
892 		i += 2;
893 	}
894 
895 	if ((n & 0x80000000) == 0)
896 		++i;
897 
898 	return i;
899 #endif
900 }
901 
902 
903 static inline uint32_t
904 ctz32(uint32_t n)
905 {
906 #if defined(__INTEL_COMPILER)
907 	return _bit_scan_forward(n);
908 
909 #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
910 	return (uint32_t)__builtin_ctz(n);
911 
912 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
913 	uint32_t i;
914 	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
915 	return i;
916 
917 #elif defined(_MSC_VER)
918 	unsigned long i;
919 	_BitScanForward(&i, n);
920 	return i;
921 
922 #else
923 	uint32_t i = 0;
924 
925 	if ((n & 0x0000FFFF) == 0) {
926 		n >>= 16;
927 		i = 16;
928 	}
929 
930 	if ((n & 0x000000FF) == 0) {
931 		n >>= 8;
932 		i += 8;
933 	}
934 
935 	if ((n & 0x0000000F) == 0) {
936 		n >>= 4;
937 		i += 4;
938 	}
939 
940 	if ((n & 0x00000003) == 0) {
941 		n >>= 2;
942 		i += 2;
943 	}
944 
945 	if ((n & 0x00000001) == 0)
946 		++i;
947 
948 	return i;
949 #endif
950 }
951 
952 #define bsf32 ctz32
953 
954 #endif
955