xref: /freebsd/contrib/xz/src/common/tuklib_integer.h (revision 9cbf1de7e34a6fced041388fad5d9180cb7705fe)
1 // SPDX-License-Identifier: 0BSD
2 
3 ///////////////////////////////////////////////////////////////////////////////
4 //
5 /// \file       tuklib_integer.h
6 /// \brief      Various integer and bit operations
7 ///
8 /// This file provides macros or functions to do some basic integer and bit
9 /// operations.
10 ///
11 /// Native endian inline functions (XX = 16, 32, or 64):
12 ///   - Unaligned native endian reads: readXXne(ptr)
13 ///   - Unaligned native endian writes: writeXXne(ptr, num)
14 ///   - Aligned native endian reads: aligned_readXXne(ptr)
15 ///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
16 ///
17 /// Endianness-converting integer operations (these can be macros!)
18 /// (XX = 16, 32, or 64; Y = b or l):
19 ///   - Byte swapping: byteswapXX(num)
20 ///   - Byte order conversions to/from native (byteswaps if Y isn't
21 ///     the native endianness): convXXYe(num)
22 ///   - Unaligned reads: readXXYe(ptr)
23 ///   - Unaligned writes: writeXXYe(ptr, num)
24 ///   - Aligned reads: aligned_readXXYe(ptr)
25 ///   - Aligned writes: aligned_writeXXYe(ptr, num)
26 ///
27 /// Since the above can be macros, the arguments should have no side effects
28 /// because they may be evaluated more than once.
29 ///
30 /// Bit scan operations for non-zero 32-bit integers (inline functions):
31 ///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
32 ///   - Count leading zeros: clz32(num)
33 ///   - Count trailing zeros: ctz32(num)
34 ///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
35 ///
36 /// The above bit scan operations return 0-31. If num is zero,
37 /// the result is undefined.
38 //
39 //  Authors:    Lasse Collin
40 //              Joachim Henke
41 //
42 ///////////////////////////////////////////////////////////////////////////////
43 
44 #ifndef TUKLIB_INTEGER_H
45 #define TUKLIB_INTEGER_H
46 
47 #include "tuklib_common.h"
48 #include <string.h>
49 
50 // Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
51 // and such functions.
52 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
53 #	include <immintrin.h>
54 // Only include <intrin.h> when it is needed. GCC and Clang can both
55 // use __builtin's, so we only need Windows intrinsics when using MSVC.
56 // GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
57 // cases explicitly.
58 #elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
59 #	include <intrin.h>
60 #endif
61 
62 
63 ///////////////////
64 // Byte swapping //
65 ///////////////////
66 
// Pick the best available byte swapping primitives. The first matching
// source wins; anything that is still undefined after this chain is
// expected to be provided by a generic fallback.
#if defined(HAVE___BUILTIN_BSWAPXX)
	// Compiler builtins: GCC >= 4.8 and Clang
#	define byteswap16(n) __builtin_bswap16(n)
#	define byteswap32(n) __builtin_bswap32(n)
#	define byteswap64(n) __builtin_bswap64(n)

#elif defined(HAVE_BYTESWAP_H)
	// <byteswap.h>: glibc, uClibc, dietlibc
#	include <byteswap.h>
#	if defined(HAVE_BSWAP_16)
#		define byteswap16(n) bswap_16(n)
#	endif
#	if defined(HAVE_BSWAP_32)
#		define byteswap32(n) bswap_32(n)
#	endif
#	if defined(HAVE_BSWAP_64)
#		define byteswap64(n) bswap_64(n)
#	endif

#elif defined(HAVE_SYS_ENDIAN_H)
	// <sys/endian.h>: *BSDs and Darwin
#	include <sys/endian.h>
#	define byteswap16(n) bswap16(n)
#	define byteswap32(n) bswap32(n)
#	define byteswap64(n) bswap64(n)

#elif defined(HAVE_SYS_BYTEORDER_H)
	// <sys/byteorder.h>: Solaris. Besides plain byte swapping it may
	// also provide ready-made endianness conversion macros.
#	include <sys/byteorder.h>
#	if defined(BSWAP_16)
#		define byteswap16(n) BSWAP_16(n)
#	endif
#	if defined(BSWAP_32)
#		define byteswap32(n) BSWAP_32(n)
#	endif
#	if defined(BSWAP_64)
#		define byteswap64(n) BSWAP_64(n)
#	endif
#	if defined(BE_16)
#		define conv16be(n) BE_16(n)
#	endif
#	if defined(BE_32)
#		define conv32be(n) BE_32(n)
#	endif
#	if defined(BE_64)
#		define conv64be(n) BE_64(n)
#	endif
#	if defined(LE_16)
#		define conv16le(n) LE_16(n)
#	endif
#	if defined(LE_32)
#		define conv32le(n) LE_32(n)
#	endif
#	if defined(LE_64)
#		define conv64le(n) LE_64(n)
#	endif
#endif
124 
// Generic 16-bit byte swap, used when no platform-specific byteswap16 was
// found. The argument is evaluated twice, so it must not have side effects.
// The whole expansion is parenthesized so that it behaves as a single
// primary expression in every context (for example after sizeof).
#ifndef byteswap16
#	define byteswap16(n) ((uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	))
#endif
131 
// Generic 32-bit byte swap, used when no platform-specific byteswap32 was
// found. The argument is evaluated four times, so it must not have side
// effects. The whole expansion is parenthesized so that it behaves as
// a single primary expression in every context (for example after sizeof).
#ifndef byteswap32
#	define byteswap32(n) ((uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	))
#endif
140 
// Generic 64-bit byte swap, used when no platform-specific byteswap64 was
// found. The argument is evaluated eight times, so it must not have side
// effects. The whole expansion is parenthesized so that it behaves as
// a single primary expression in every context (for example after sizeof).
#ifndef byteswap64
#	define byteswap64(n) ((uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	))
#endif
153 
// Endianness conversion macros, built on the basic byte swapping macros.
// Each macro is defined only if the platform headers did not already
// provide it. Converting to/from the native byte order is just a cast;
// the other direction swaps the bytes.
#ifndef conv16be
#	ifdef WORDS_BIGENDIAN
#		define conv16be(num) ((uint16_t)(num))
#	else
#		define conv16be(num) byteswap16(num)
#	endif
#endif

#ifndef conv32be
#	ifdef WORDS_BIGENDIAN
#		define conv32be(num) ((uint32_t)(num))
#	else
#		define conv32be(num) byteswap32(num)
#	endif
#endif

#ifndef conv64be
#	ifdef WORDS_BIGENDIAN
#		define conv64be(num) ((uint64_t)(num))
#	else
#		define conv64be(num) byteswap64(num)
#	endif
#endif

#ifndef conv16le
#	ifdef WORDS_BIGENDIAN
#		define conv16le(num) byteswap16(num)
#	else
#		define conv16le(num) ((uint16_t)(num))
#	endif
#endif

#ifndef conv32le
#	ifdef WORDS_BIGENDIAN
#		define conv32le(num) byteswap32(num)
#	else
#		define conv32le(num) ((uint32_t)(num))
#	endif
#endif

#ifndef conv64le
#	ifdef WORDS_BIGENDIAN
#		define conv64le(num) byteswap64(num)
#	else
#		define conv64le(num) ((uint64_t)(num))
#	endif
#endif
194 
195 
196 ////////////////////////////////
197 // Unaligned reads and writes //
198 ////////////////////////////////
199 
200 // No-strict-align archs like x86-64
201 // ---------------------------------
202 //
203 // The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
204 // is bad even if the uint8_pointer is properly aligned because this kind
205 // of casts break strict aliasing rules and result in undefined behavior.
206 // With unaligned pointers it's even worse: compilers may emit vector
207 // instructions that require aligned pointers even if non-vector
208 // instructions work with unaligned pointers.
209 //
210 // Using memcpy() is the standard compliant way to do unaligned access.
211 // Many modern compilers inline it so there is no function call overhead.
212 // For those compilers that don't handle the memcpy() method well, the
213 // old casting method (that violates strict aliasing) can be requested at
214 // build time. A third method, casting to a packed struct, would also be
215 // an option but isn't provided to keep things simpler (it's already a mess).
216 // Hopefully this is flexible enough in practice.
217 //
218 // Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
219 //
220 //     buf[0] | (buf[1] << 8)
221 //
222 // reads a 16-bit value and can emit a single 16-bit load and produce
223 // identical code as with the memcpy() method. In other cases Clang and GCC
224 // produce either the same or better code with memcpy(). For example, Clang 9
225 // on x86-64 can detect 32-bit load but not 16-bit load.
226 //
227 // MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
228 // code for "buf[0] | (buf[1] << 8)".
229 //
230 // Conclusion: The memcpy() method is the best choice when unaligned access
231 // is supported.
232 //
233 // Strict-align archs like SPARC
234 // -----------------------------
235 //
236 // GCC versions from around 4.x to at least 13.2.0 produce worse code
237 // from the memcpy() method than from simple byte-by-byte shift-or code
238 // when reading a 32-bit integer:
239 //
240 //     (1) It may be constructed on stack using four 8-bit loads,
241 //         four 8-bit stores to stack, and finally one 32-bit load from stack.
242 //
243 //     (2) Especially with -Os, an actual memcpy() call may be emitted.
244 //
245 // This is true at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
246 // RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
247 // some processors but not all so this is relevant only in the case when
248 // GCC assumes that unaligned is not supported or -mstrict-align or
249 // -mno-unaligned-access is used.
250 //
251 // For Clang it makes little difference. ARM64 with -O2 -mstrict-align
252 // was one of the very few with a minor difference: the memcpy() version
253 // was one instruction longer.
254 //
255 // Conclusion: At least in case of GCC and Clang, byte-by-byte code is
256 // the best choice for strict-align archs to do unaligned access.
257 //
258 // See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
259 //
260 // Thanks to <https://godbolt.org/> it was easy to test different compilers.
261 // The following is for little endian targets:
262 /*
263 #include <stdint.h>
264 #include <string.h>
265 
266 uint32_t bytes16(const uint8_t *b)
267 {
268     return (uint32_t)b[0]
269         | ((uint32_t)b[1] << 8);
270 }
271 
272 uint32_t copy16(const uint8_t *b)
273 {
274     uint16_t v;
275     memcpy(&v, b, sizeof(v));
276     return v;
277 }
278 
279 uint32_t bytes32(const uint8_t *b)
280 {
281     return (uint32_t)b[0]
282         | ((uint32_t)b[1] << 8)
283         | ((uint32_t)b[2] << 16)
284         | ((uint32_t)b[3] << 24);
285 }
286 
287 uint32_t copy32(const uint8_t *b)
288 {
289     uint32_t v;
290     memcpy(&v, b, sizeof(v));
291     return v;
292 }
293 
294 void wbytes16(uint8_t *b, uint16_t v)
295 {
296     b[0] = (uint8_t)v;
297     b[1] = (uint8_t)(v >> 8);
298 }
299 
300 void wcopy16(uint8_t *b, uint16_t v)
301 {
302     memcpy(b, &v, sizeof(v));
303 }
304 
305 void wbytes32(uint8_t *b, uint32_t v)
306 {
307     b[0] = (uint8_t)v;
308     b[1] = (uint8_t)(v >> 8);
309     b[2] = (uint8_t)(v >> 16);
310     b[3] = (uint8_t)(v >> 24);
311 }
312 
313 void wcopy32(uint8_t *b, uint32_t v)
314 {
315     memcpy(b, &v, sizeof(v));
316 }
317 */
318 
319 
320 #ifdef TUKLIB_FAST_UNALIGNED_ACCESS
321 
/// Reads a 16-bit native endian integer from a possibly unaligned buffer.
static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// Standard compliant way: copy the bytes into a properly typed object.
	uint16_t value;
	memcpy(&value, buf, sizeof(value));
	return value;
#else
	// Requested at build time: direct access that violates strict aliasing.
	const uint16_t *src = (const uint16_t *)buf;
	return *src;
#endif
}
333 
334 
/// Reads a 32-bit native endian integer from a possibly unaligned buffer.
static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// Standard compliant way: copy the bytes into a properly typed object.
	uint32_t value;
	memcpy(&value, buf, sizeof(value));
	return value;
#else
	// Requested at build time: direct access that violates strict aliasing.
	const uint32_t *src = (const uint32_t *)buf;
	return *src;
#endif
}
346 
347 
/// Reads a 64-bit native endian integer from a possibly unaligned buffer.
static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// Standard compliant way: copy the bytes into a properly typed object.
	uint64_t value;
	memcpy(&value, buf, sizeof(value));
	return value;
#else
	// Requested at build time: direct access that violates strict aliasing.
	const uint64_t *src = (const uint64_t *)buf;
	return *src;
#endif
}
359 
360 
/// Writes a 16-bit integer in native byte order to a possibly unaligned
/// buffer.
static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// Standard compliant way: copy the object representation.
	memcpy(buf, &num, sizeof(num));
#else
	// Requested at build time: direct access that violates strict aliasing.
	uint16_t *dest = (uint16_t *)buf;
	*dest = num;
#endif
}
371 
372 
/// Writes a 32-bit integer in native byte order to a possibly unaligned
/// buffer.
static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// Standard compliant way: copy the object representation.
	memcpy(buf, &num, sizeof(num));
#else
	// Requested at build time: direct access that violates strict aliasing.
	uint32_t *dest = (uint32_t *)buf;
	*dest = num;
#endif
}
383 
384 
/// Writes a 64-bit integer in native byte order to a possibly unaligned
/// buffer.
static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// Standard compliant way: copy the object representation.
	memcpy(buf, &num, sizeof(num));
#else
	// Requested at build time: direct access that violates strict aliasing.
	uint64_t *dest = (uint64_t *)buf;
	*dest = num;
#endif
}
395 
396 
/// Reads an unaligned 16-bit big endian integer.
static inline uint16_t
read16be(const uint8_t *buf)
{
	// Load into a local first: conv16be() may be a macro that
	// evaluates its argument more than once.
	const uint16_t native = read16ne(buf);
	return conv16be(native);
}
403 
404 
/// Reads an unaligned 16-bit little endian integer.
static inline uint16_t
read16le(const uint8_t *buf)
{
	// Load into a local first: conv16le() may be a macro that
	// evaluates its argument more than once.
	const uint16_t native = read16ne(buf);
	return conv16le(native);
}
411 
412 
/// Reads an unaligned 32-bit big endian integer.
static inline uint32_t
read32be(const uint8_t *buf)
{
	// Load into a local first: conv32be() may be a macro that
	// evaluates its argument more than once.
	const uint32_t native = read32ne(buf);
	return conv32be(native);
}
419 
420 
/// Reads an unaligned 32-bit little endian integer.
static inline uint32_t
read32le(const uint8_t *buf)
{
	// Load into a local first: conv32le() may be a macro that
	// evaluates its argument more than once.
	const uint32_t native = read32ne(buf);
	return conv32le(native);
}
427 
428 
/// Reads an unaligned 64-bit big endian integer.
static inline uint64_t
read64be(const uint8_t *buf)
{
	// Load into a local first: conv64be() may be a macro that
	// evaluates its argument more than once.
	const uint64_t native = read64ne(buf);
	return conv64be(native);
}
435 
436 
/// Reads an unaligned 64-bit little endian integer.
static inline uint64_t
read64le(const uint8_t *buf)
{
	// Load into a local first: conv64le() may be a macro that
	// evaluates its argument more than once.
	const uint64_t native = read64ne(buf);
	return conv64le(native);
}
443 
444 
// Unaligned big/little endian writes.
//
// These are macros on purpose: when the byte swap is expressed through
// glibc's or *BSD's byte swapping macros, doing the conversion at the
// macro level lets the compiler fold byte swaps of constants. The store
// itself goes through the inline writeXXne() functions so that the type
// of the buffer pointer is still checked.
#define write16be(ptr, val) write16ne((ptr), conv16be(val))
#define write16le(ptr, val) write16ne((ptr), conv16le(val))
#define write32be(ptr, val) write32ne((ptr), conv32be(val))
#define write32le(ptr, val) write32ne((ptr), conv32le(val))
#define write64be(ptr, val) write64ne((ptr), conv64be(val))
#define write64le(ptr, val) write64ne((ptr), conv64le(val))
455 
456 #else
457 
// Map the native endian accessor names onto the explicit-endianness
// functions that match the byte order of the target.
#ifndef WORDS_BIGENDIAN
#	define read16ne read16le
#	define write16ne write16le
#	define read32ne read32le
#	define write32ne write32le
#	define read64ne read64le
#	define write64ne write64le
#else
#	define read16ne read16be
#	define write16ne write16be
#	define read32ne read32be
#	define write32ne write32be
#	define read64ne read64be
#	define write64ne write64be
#endif
473 
474 
/// Reads an unaligned 16-bit big endian integer one byte at a time.
static inline uint16_t
read16be(const uint8_t *buf)
{
	const uint32_t high = buf[0];
	const uint32_t low = buf[1];
	return (uint16_t)((high << 8) | low);
}
481 
482 
/// Reads an unaligned 16-bit little endian integer one byte at a time.
static inline uint16_t
read16le(const uint8_t *buf)
{
	const uint32_t low = buf[0];
	const uint32_t high = buf[1];
	return (uint16_t)(low | (high << 8));
}
489 
490 
/// Reads an unaligned 32-bit big endian integer one byte at a time.
static inline uint32_t
read32be(const uint8_t *buf)
{
	// buf[0] is the most significant byte.
	return ((uint32_t)buf[0] << 24)
			| ((uint32_t)buf[1] << 16)
			| ((uint32_t)buf[2] << 8)
			| (uint32_t)buf[3];
}
500 
501 
/// Reads an unaligned 32-bit little endian integer one byte at a time.
static inline uint32_t
read32le(const uint8_t *buf)
{
	// buf[0] is the least significant byte.
	return (uint32_t)buf[0]
			| ((uint32_t)buf[1] << 8)
			| ((uint32_t)buf[2] << 16)
			| ((uint32_t)buf[3] << 24);
}
511 
512 
/// Reads an unaligned 64-bit big endian integer one byte at a time.
static inline uint64_t
read64be(const uint8_t *buf)
{
	// buf[0] is the most significant byte.
	return ((uint64_t)buf[0] << 56)
			| ((uint64_t)buf[1] << 48)
			| ((uint64_t)buf[2] << 40)
			| ((uint64_t)buf[3] << 32)
			| ((uint64_t)buf[4] << 24)
			| ((uint64_t)buf[5] << 16)
			| ((uint64_t)buf[6] << 8)
			| (uint64_t)buf[7];
}
526 
527 
/// Reads an unaligned 64-bit little endian integer one byte at a time.
static inline uint64_t
read64le(const uint8_t *buf)
{
	// buf[0] is the least significant byte.
	return (uint64_t)buf[0]
			| ((uint64_t)buf[1] << 8)
			| ((uint64_t)buf[2] << 16)
			| ((uint64_t)buf[3] << 24)
			| ((uint64_t)buf[4] << 32)
			| ((uint64_t)buf[5] << 40)
			| ((uint64_t)buf[6] << 48)
			| ((uint64_t)buf[7] << 56);
}
541 
542 
/// Writes a 16-bit integer as big endian, one byte at a time.
static inline void
write16be(uint8_t *buf, uint16_t num)
{
	// Most significant byte goes first.
	buf[0] = (uint8_t)((num >> 8) & 0xFF);
	buf[1] = (uint8_t)(num & 0xFF);
}
550 
551 
/// Writes a 16-bit integer as little endian, one byte at a time.
static inline void
write16le(uint8_t *buf, uint16_t num)
{
	// Least significant byte goes first.
	buf[0] = (uint8_t)(num & 0xFF);
	buf[1] = (uint8_t)((num >> 8) & 0xFF);
}
559 
560 
/// Writes a 32-bit integer as big endian, one byte at a time.
static inline void
write32be(uint8_t *buf, uint32_t num)
{
	// Most significant byte goes first.
	buf[0] = (uint8_t)((num >> 24) & 0xFF);
	buf[1] = (uint8_t)((num >> 16) & 0xFF);
	buf[2] = (uint8_t)((num >> 8) & 0xFF);
	buf[3] = (uint8_t)(num & 0xFF);
}
570 
571 
/// Writes a 32-bit integer as little endian, one byte at a time.
static inline void
write32le(uint8_t *buf, uint32_t num)
{
	// Least significant byte goes first.
	buf[0] = (uint8_t)(num & 0xFF);
	buf[1] = (uint8_t)((num >> 8) & 0xFF);
	buf[2] = (uint8_t)((num >> 16) & 0xFF);
	buf[3] = (uint8_t)((num >> 24) & 0xFF);
}
581 
582 
/// Writes a 64-bit integer as big endian, one byte at a time.
static inline void
write64be(uint8_t *buf, uint64_t num)
{
	// Most significant byte goes first.
	buf[0] = (uint8_t)((num >> 56) & 0xFF);
	buf[1] = (uint8_t)((num >> 48) & 0xFF);
	buf[2] = (uint8_t)((num >> 40) & 0xFF);
	buf[3] = (uint8_t)((num >> 32) & 0xFF);
	buf[4] = (uint8_t)((num >> 24) & 0xFF);
	buf[5] = (uint8_t)((num >> 16) & 0xFF);
	buf[6] = (uint8_t)((num >> 8) & 0xFF);
	buf[7] = (uint8_t)(num & 0xFF);
}
596 
597 
/// Writes a 64-bit integer as little endian, one byte at a time.
static inline void
write64le(uint8_t *buf, uint64_t num)
{
	// Least significant byte goes first.
	buf[0] = (uint8_t)(num & 0xFF);
	buf[1] = (uint8_t)((num >> 8) & 0xFF);
	buf[2] = (uint8_t)((num >> 16) & 0xFF);
	buf[3] = (uint8_t)((num >> 24) & 0xFF);
	buf[4] = (uint8_t)((num >> 32) & 0xFF);
	buf[5] = (uint8_t)((num >> 40) & 0xFF);
	buf[6] = (uint8_t)((num >> 48) & 0xFF);
	buf[7] = (uint8_t)((num >> 56) & 0xFF);
}
611 
612 #endif
613 
614 
615 //////////////////////////////
616 // Aligned reads and writes //
617 //////////////////////////////
618 
619 // Separate functions for aligned reads and writes are provided since on
620 // strict-align archs aligned access is much faster than unaligned access.
621 //
622 // Just like in the unaligned case, memcpy() is needed to avoid
623 // strict aliasing violations. However, on archs that don't support
624 // unaligned access the compiler cannot know that the pointers given
625 // to memcpy() are aligned which results in slow code. As of C11 there is
626 // no standard way to tell the compiler that we know that the address is
627 // aligned but some compilers have language extensions to do that. With
628 // such language extensions the memcpy() method gives excellent results.
629 //
630 // What to do on a strict-align system when no known language extensions
631 // are available? Falling back to byte-by-byte access would be safe but ruin
632 // optimizations that have been made specifically with aligned access in mind.
633 // As a compromise, aligned reads will fall back to non-compliant type punning
634 // but aligned writes will be byte-by-byte, that is, fast reads are preferred
635 // over fast writes. This obviously isn't great but hopefully it's a working
636 // compromise for now.
637 //
638 // __builtin_assume_aligned is supported by GCC >= 4.7 and clang >= 3.6.
// tuklib_memcpy_aligned(dest, src, size) is memcpy() that, when the
// compiler supports it, additionally promises that src is aligned to
// size bytes.
//
// Without the builtin it is plain memcpy(). If fast unaligned access
// isn't available either, aligned reads are switched to direct
// (non-compliant) pointer access via TUKLIB_USE_UNSAFE_ALIGNED_READS.
#if !defined(HAVE___BUILTIN_ASSUME_ALIGNED)
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
#	if !defined(TUKLIB_FAST_UNALIGNED_ACCESS)
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#else
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#endif
649 
650 
/// Reads a 16-bit native endian integer from a buffer that the caller
/// guarantees to be suitably aligned.
static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if !defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		&& !defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	// memcpy() based read that avoids strict aliasing violations.
	uint16_t value;
	tuklib_memcpy_aligned(&value, buf, sizeof(value));
	return value;
#else
	// Direct access through a casted pointer (not standard compliant).
	const uint16_t *word = (const uint16_t *)buf;
	return *word;
#endif
}
663 
664 
/// Reads a 32-bit native endian integer from a buffer that the caller
/// guarantees to be suitably aligned.
static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if !defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		&& !defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	// memcpy() based read that avoids strict aliasing violations.
	uint32_t value;
	tuklib_memcpy_aligned(&value, buf, sizeof(value));
	return value;
#else
	// Direct access through a casted pointer (not standard compliant).
	const uint32_t *word = (const uint32_t *)buf;
	return *word;
#endif
}
677 
678 
/// Reads a 64-bit native endian integer from a buffer that the caller
/// guarantees to be suitably aligned.
static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if !defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		&& !defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	// memcpy() based read that avoids strict aliasing violations.
	uint64_t value;
	tuklib_memcpy_aligned(&value, buf, sizeof(value));
	return value;
#else
	// Direct access through a casted pointer (not standard compliant).
	const uint64_t *word = (const uint64_t *)buf;
	return *word;
#endif
}
691 
692 
/// Writes a 16-bit integer in native byte order to an aligned buffer.
///
/// \param      buf     Destination; must be aligned to sizeof(uint16_t)
/// \param      num     Value to store
static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
	*(uint16_t *)buf = num;
#elif defined(HAVE___BUILTIN_ASSUME_ALIGNED)
	// For a write the pointer whose alignment matters is the
	// destination buffer, so the alignment promise is attached to buf
	// and not to the address of the local copy of num.
	memcpy(__builtin_assume_aligned(buf, sizeof(num)),
			&num, sizeof(num));
#else
	memcpy(buf, &num, sizeof(num));
#endif
}
703 
704 
/// Writes a 32-bit integer in native byte order to an aligned buffer.
///
/// \param      buf     Destination; must be aligned to sizeof(uint32_t)
/// \param      num     Value to store
static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
	*(uint32_t *)buf = num;
#elif defined(HAVE___BUILTIN_ASSUME_ALIGNED)
	// For a write the pointer whose alignment matters is the
	// destination buffer, so the alignment promise is attached to buf
	// and not to the address of the local copy of num.
	memcpy(__builtin_assume_aligned(buf, sizeof(num)),
			&num, sizeof(num));
#else
	memcpy(buf, &num, sizeof(num));
#endif
}
715 
716 
/// Writes a 64-bit integer in native byte order to an aligned buffer.
///
/// \param      buf     Destination; must be aligned to sizeof(uint64_t)
/// \param      num     Value to store
static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
	*(uint64_t *)buf = num;
#elif defined(HAVE___BUILTIN_ASSUME_ALIGNED)
	// For a write the pointer whose alignment matters is the
	// destination buffer, so the alignment promise is attached to buf
	// and not to the address of the local copy of num. The local is
	// not guaranteed to be 8-byte aligned on every ABI.
	memcpy(__builtin_assume_aligned(buf, sizeof(num)),
			&num, sizeof(num));
#else
	memcpy(buf, &num, sizeof(num));
#endif
}
727 
728 
/// Reads an aligned 16-bit big endian integer.
static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	// Load into a local first: conv16be() may be a macro that
	// evaluates its argument more than once.
	const uint16_t native = aligned_read16ne(buf);
	return conv16be(native);
}
735 
736 
/// Reads an aligned 16-bit little endian integer.
static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	// Load into a local first: conv16le() may be a macro that
	// evaluates its argument more than once.
	const uint16_t native = aligned_read16ne(buf);
	return conv16le(native);
}
743 
744 
/// Reads an aligned 32-bit big endian integer.
static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	// Load into a local first: conv32be() may be a macro that
	// evaluates its argument more than once.
	const uint32_t native = aligned_read32ne(buf);
	return conv32be(native);
}
751 
752 
/// Reads an aligned 32-bit little endian integer.
static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	// Load into a local first: conv32le() may be a macro that
	// evaluates its argument more than once.
	const uint32_t native = aligned_read32ne(buf);
	return conv32le(native);
}
759 
760 
/// Reads an aligned 64-bit big endian integer.
static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	// Load into a local first: conv64be() may be a macro that
	// evaluates its argument more than once.
	const uint64_t native = aligned_read64ne(buf);
	return conv64be(native);
}
767 
768 
/// Reads an aligned 64-bit little endian integer.
static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	// Load into a local first: conv64le() may be a macro that
	// evaluates its argument more than once.
	const uint64_t native = aligned_read64ne(buf);
	return conv64le(native);
}
775 
776 
// Aligned big/little endian writes. The byte order conversion is done
// at the macro level and the store goes through the inline
// aligned_writeXXne() functions, mirroring the unaligned write macros.
// val may be evaluated more than once.
#define aligned_write16be(ptr, val) aligned_write16ne((ptr), conv16be(val))
#define aligned_write32be(ptr, val) aligned_write32ne((ptr), conv32be(val))
#define aligned_write64be(ptr, val) aligned_write64ne((ptr), conv64be(val))
#define aligned_write16le(ptr, val) aligned_write16ne((ptr), conv16le(val))
#define aligned_write32le(ptr, val) aligned_write32ne((ptr), conv32le(val))
#define aligned_write64le(ptr, val) aligned_write64ne((ptr), conv64le(val))
784 
785 
786 ////////////////////
787 // Bit operations //
788 ////////////////////
789 
790 static inline uint32_t
791 bsr32(uint32_t n)
792 {
793 	// Check for ICC first, since it tends to define __GNUC__ too.
794 #if defined(__INTEL_COMPILER)
795 	return _bit_scan_reverse(n);
796 
797 #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
798 	// GCC >= 3.4 has __builtin_clz(), which gives good results on
799 	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
800 	// either plain BSR (so the XOR gets optimized away) or LZCNT and
801 	// XOR (if -march indicates that SSE4a instructions are supported).
802 	return (uint32_t)__builtin_clz(n) ^ 31U;
803 
804 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
805 	uint32_t i;
806 	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
807 	return i;
808 
809 #elif defined(_MSC_VER)
810 	unsigned long i;
811 	_BitScanReverse(&i, n);
812 	return i;
813 
814 #else
815 	uint32_t i = 31;
816 
817 	if ((n & 0xFFFF0000) == 0) {
818 		n <<= 16;
819 		i = 15;
820 	}
821 
822 	if ((n & 0xFF000000) == 0) {
823 		n <<= 8;
824 		i -= 8;
825 	}
826 
827 	if ((n & 0xF0000000) == 0) {
828 		n <<= 4;
829 		i -= 4;
830 	}
831 
832 	if ((n & 0xC0000000) == 0) {
833 		n <<= 2;
834 		i -= 2;
835 	}
836 
837 	if ((n & 0x80000000) == 0)
838 		--i;
839 
840 	return i;
841 #endif
842 }
843 
844 
845 static inline uint32_t
846 clz32(uint32_t n)
847 {
848 #if defined(__INTEL_COMPILER)
849 	return _bit_scan_reverse(n) ^ 31U;
850 
851 #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
852 	return (uint32_t)__builtin_clz(n);
853 
854 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
855 	uint32_t i;
856 	__asm__("bsrl %1, %0\n\t"
857 		"xorl $31, %0"
858 		: "=r" (i) : "rm" (n));
859 	return i;
860 
861 #elif defined(_MSC_VER)
862 	unsigned long i;
863 	_BitScanReverse(&i, n);
864 	return i ^ 31U;
865 
866 #else
867 	uint32_t i = 0;
868 
869 	if ((n & 0xFFFF0000) == 0) {
870 		n <<= 16;
871 		i = 16;
872 	}
873 
874 	if ((n & 0xFF000000) == 0) {
875 		n <<= 8;
876 		i += 8;
877 	}
878 
879 	if ((n & 0xF0000000) == 0) {
880 		n <<= 4;
881 		i += 4;
882 	}
883 
884 	if ((n & 0xC0000000) == 0) {
885 		n <<= 2;
886 		i += 2;
887 	}
888 
889 	if ((n & 0x80000000) == 0)
890 		++i;
891 
892 	return i;
893 #endif
894 }
895 
896 
897 static inline uint32_t
898 ctz32(uint32_t n)
899 {
900 #if defined(__INTEL_COMPILER)
901 	return _bit_scan_forward(n);
902 
903 #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
904 	return (uint32_t)__builtin_ctz(n);
905 
906 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
907 	uint32_t i;
908 	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
909 	return i;
910 
911 #elif defined(_MSC_VER)
912 	unsigned long i;
913 	_BitScanForward(&i, n);
914 	return i;
915 
916 #else
917 	uint32_t i = 0;
918 
919 	if ((n & 0x0000FFFF) == 0) {
920 		n >>= 16;
921 		i = 16;
922 	}
923 
924 	if ((n & 0x000000FF) == 0) {
925 		n >>= 8;
926 		i += 8;
927 	}
928 
929 	if ((n & 0x0000000F) == 0) {
930 		n >>= 4;
931 		i += 4;
932 	}
933 
934 	if ((n & 0x00000003) == 0) {
935 		n >>= 2;
936 		i += 2;
937 	}
938 
939 	if ((n & 0x00000001) == 0)
940 		++i;
941 
942 	return i;
943 #endif
944 }
945 
946 #define bsf32 ctz32
947 
948 #endif
949