// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_integer.h
/// \brief      Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
///   - Unaligned native endian reads: readXXne(ptr)
///   - Unaligned native endian writes: writeXXne(ptr, num)
///   - Aligned native endian reads: aligned_readXXne(ptr)
///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: byteswapXX(num)
///   - Byte order conversions to/from native (byteswaps if Y isn't
///     the native endianness): convXXYe(num)
///   - Unaligned reads: readXXYe(ptr)
///   - Unaligned writes: writeXXYe(ptr, num)
///   - Aligned reads: aligned_readXXYe(ptr)
///   - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
///
/// Bit scan operations for non-zero 32-bit integers (inline functions):
///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
///   - Count leading zeros: clz32(num)
///   - Count trailing zeros: ctz32(num)
///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
///
/// The above bit scan operations return 0-31. If num is zero,
/// the result is undefined.
//
//  Authors:    Lasse Collin
//              Joachim Henke
//
///////////////////////////////////////////////////////////////////////////////

#ifndef TUKLIB_INTEGER_H
#define TUKLIB_INTEGER_H

#include "tuklib_common.h"
#include <string.h>
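
// A minimal usage sketch (hypothetical, not part of this header): callers
// pass plain uint8_t pointers and let these helpers handle alignment and
// byte order instead of casting buffers to wider integer types.
/*
static void
example(uint8_t *buf)
{
	// Read a 32-bit little endian value from an arbitrary offset.
	uint32_t a = read32le(buf + 1);

	// Store the same value back in big endian order.
	write32be(buf + 1, a);
}
*/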

// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
// and such functions.
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
#	include <immintrin.h>
// Only include <intrin.h> when it is needed. GCC and Clang can both use
// __builtin's, so we only need the Windows intrinsics when using MSVC.
// GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
// cases explicitly.
#elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
#	include <intrin.h>
#endif


///////////////////
// Byte swapping //
///////////////////

#if defined(HAVE___BUILTIN_BSWAPXX)
	// GCC >= 4.8 and Clang
#	define byteswap16(num) __builtin_bswap16(num)
#	define byteswap32(num) __builtin_bswap32(num)
#	define byteswap64(num) __builtin_bswap64(num)

#elif defined(HAVE_BYTESWAP_H)
	// glibc, uClibc, dietlibc
#	include <byteswap.h>
#	ifdef HAVE_BSWAP_16
#		define byteswap16(num) bswap_16(num)
#	endif
#	ifdef HAVE_BSWAP_32
#		define byteswap32(num) bswap_32(num)
#	endif
#	ifdef HAVE_BSWAP_64
#		define byteswap64(num) bswap_64(num)
#	endif

#elif defined(HAVE_SYS_ENDIAN_H)
	// *BSDs and Darwin
#	include <sys/endian.h>
#	ifdef __OpenBSD__
#		define byteswap16(num) swap16(num)
#		define byteswap32(num) swap32(num)
#		define byteswap64(num) swap64(num)
#	else
#		define byteswap16(num) bswap16(num)
#		define byteswap32(num) bswap32(num)
#		define byteswap64(num) bswap64(num)
#	endif

#elif defined(HAVE_SYS_BYTEORDER_H)
	// Solaris
#	include <sys/byteorder.h>
#	ifdef BSWAP_16
#		define byteswap16(num) BSWAP_16(num)
#	endif
#	ifdef BSWAP_32
#		define byteswap32(num) BSWAP_32(num)
#	endif
#	ifdef BSWAP_64
#		define byteswap64(num) BSWAP_64(num)
#	endif
#	ifdef BE_16
#		define conv16be(num) BE_16(num)
#	endif
#	ifdef BE_32
#		define conv32be(num) BE_32(num)
#	endif
#	ifdef BE_64
#		define conv64be(num) BE_64(num)
#	endif
#	ifdef LE_16
#		define conv16le(num) LE_16(num)
#	endif
#	ifdef LE_32
#		define conv32le(num) LE_32(num)
#	endif
#	ifdef LE_64
#		define conv64le(num) LE_64(num)
#	endif
#endif

#ifndef byteswap16
#	define byteswap16(n) (uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	)
#endif

#ifndef byteswap32
#	define byteswap32(n) (uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	)
#endif

#ifndef byteswap64
#	define byteswap64(n) (uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	)
#endif
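
// A quick illustration (hypothetical snippet): with the fallback macros
// above, the argument expands multiple times, which is why the file-level
// comment warns against side effects in macro arguments.
/*
uint32_t ok = byteswap32(UINT32_C(0x11223344));  // ok == 0x44332211
uint32_t bad = byteswap32(*p++);  // Wrong: *p++ may be evaluated four times.
*/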

// Define conversion macros using the basic byte swapping macros.
#ifdef WORDS_BIGENDIAN
#	ifndef conv16be
#		define conv16be(num) ((uint16_t)(num))
#	endif
#	ifndef conv32be
#		define conv32be(num) ((uint32_t)(num))
#	endif
#	ifndef conv64be
#		define conv64be(num) ((uint64_t)(num))
#	endif
#	ifndef conv16le
#		define conv16le(num) byteswap16(num)
#	endif
#	ifndef conv32le
#		define conv32le(num) byteswap32(num)
#	endif
#	ifndef conv64le
#		define conv64le(num) byteswap64(num)
#	endif
#else
#	ifndef conv16be
#		define conv16be(num) byteswap16(num)
#	endif
#	ifndef conv32be
#		define conv32be(num) byteswap32(num)
#	endif
#	ifndef conv64be
#		define conv64be(num) byteswap64(num)
#	endif
#	ifndef conv16le
#		define conv16le(num) ((uint16_t)(num))
#	endif
#	ifndef conv32le
#		define conv32le(num) ((uint32_t)(num))
#	endif
#	ifndef conv64le
#		define conv64le(num) ((uint64_t)(num))
#	endif
#endif
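
// Example (hypothetical): conv32be() is an identity cast on big endian
// targets and byteswap32() on little endian targets, so applying it twice
// returns the original value on either endianness.
/*
uint32_t wire = conv32be(host_value);   // native to big endian
uint32_t host = conv32be(wire);         // big endian back to native
*/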


////////////////////////////////
// Unaligned reads and writes //
////////////////////////////////

// No-strict-align archs like x86-64
// ---------------------------------
//
// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because this kind
// of cast breaks strict aliasing rules and results in undefined behavior.
// With unaligned pointers it's even worse: compilers may emit vector
// instructions that require aligned pointers even if non-vector
// instructions work with unaligned pointers.
//
// Using memcpy() is the standards-compliant way to do unaligned access.
// Many modern compilers inline it so there is no function call overhead.
// For those compilers that don't handle the memcpy() method well, the
// old casting method (that violates strict aliasing) can be requested at
// build time. A third method, casting to a packed struct, would also be
// an option but isn't provided to keep things simpler (it's already a mess).
// Hopefully this is flexible enough in practice.
//
// Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
//
//     buf[0] | (buf[1] << 8)
//
// reads a 16-bit value and can emit a single 16-bit load and produce
// code identical to the memcpy() method. In other cases Clang and GCC
// produce either the same or better code with memcpy(). For example,
// Clang 9 on x86-64 can detect the 32-bit load but not the 16-bit load.
//
// MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
// code for "buf[0] | (buf[1] << 8)".
//
// Conclusion: The memcpy() method is the best choice when unaligned access
// is supported.
//
// Strict-align archs like SPARC
// -----------------------------
//
// GCC versions from around 4.x to at least 13.2.0 produce worse code
// from the memcpy() method than from simple byte-by-byte shift-or code
// when reading a 32-bit integer:
//
//     (1) It may be constructed on stack using four 8-bit loads,
//         four 8-bit stores to stack, and finally one 32-bit load from stack.
//
//     (2) Especially with -Os, an actual memcpy() call may be emitted.
//
// This is true at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
// RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
// some processors but not all, so this is relevant only when GCC assumes
// that unaligned access is not supported or when -mstrict-align or
// -mno-unaligned-access is used.
//
// For Clang it makes little difference. ARM64 with -O2 -mstrict-align
// was one of the very few with a minor difference: the memcpy() version
// was one instruction longer.
//
// Conclusion: At least in the case of GCC and Clang, byte-by-byte code is
// the best choice for strict-align archs to do unaligned access.
//
// See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
//
// Thanks to <https://godbolt.org/> it was easy to test different compilers.
// The following is for little endian targets:
/*
#include <stdint.h>
#include <string.h>

uint32_t bytes16(const uint8_t *b)
{
	return (uint32_t)b[0]
			| ((uint32_t)b[1] << 8);
}

uint32_t copy16(const uint8_t *b)
{
	uint16_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

uint32_t bytes32(const uint8_t *b)
{
	return (uint32_t)b[0]
			| ((uint32_t)b[1] << 8)
			| ((uint32_t)b[2] << 16)
			| ((uint32_t)b[3] << 24);
}

uint32_t copy32(const uint8_t *b)
{
	uint32_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

void wbytes16(uint8_t *b, uint16_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
}

void wcopy16(uint8_t *b, uint16_t v)
{
	memcpy(b, &v, sizeof(v));
}

void wbytes32(uint8_t *b, uint32_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
	b[2] = (uint8_t)(v >> 16);
	b[3] = (uint8_t)(v >> 24);
}

void wcopy32(uint8_t *b, uint32_t v)
{
	memcpy(b, &v, sizeof(v));
}
*/


#ifdef TUKLIB_FAST_UNALIGNED_ACCESS

static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}
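

// A round-trip sanity sketch (hypothetical, for illustration): whichever
// implementation was selected at build time, writeXXne() and readXXne()
// are inverses of each other.
/*
uint8_t buf[4];
write32ne(buf, UINT32_C(0xAABBCCDD));
// read32ne(buf) == UINT32_C(0xAABBCCDD)
*/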


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64le(num);
}


// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
#define write16be(buf, num) write16ne(buf, conv16be(num))
#define write32be(buf, num) write32ne(buf, conv32be(num))
#define write64be(buf, num) write64ne(buf, conv64be(num))
#define write16le(buf, num) write16ne(buf, conv16le(num))
#define write32le(buf, num) write32ne(buf, conv32le(num))
#define write64le(buf, num) write64ne(buf, conv64le(num))

#else

#ifdef WORDS_BIGENDIAN
#	define read16ne read16be
#	define read32ne read32be
#	define read64ne read64be
#	define write16ne write16be
#	define write32ne write32be
#	define write64ne write64be
#else
#	define read16ne read16le
#	define read32ne read32le
#	define read64ne read64le
#	define write16ne write16le
#	define write32ne write32le
#	define write64ne write64le
#endif


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
	return num;
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
	return num;
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0] << 24;
	num |= (uint32_t)buf[1] << 16;
	num |= (uint32_t)buf[2] << 8;
	num |= (uint32_t)buf[3];
	return num;
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0];
	num |= (uint32_t)buf[1] << 8;
	num |= (uint32_t)buf[2] << 16;
	num |= (uint32_t)buf[3] << 24;
	return num;
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0] << 56;
	num |= (uint64_t)buf[1] << 48;
	num |= (uint64_t)buf[2] << 40;
	num |= (uint64_t)buf[3] << 32;
	num |= (uint64_t)buf[4] << 24;
	num |= (uint64_t)buf[5] << 16;
	num |= (uint64_t)buf[6] << 8;
	num |= (uint64_t)buf[7];
	return num;
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0];
	num |= (uint64_t)buf[1] << 8;
	num |= (uint64_t)buf[2] << 16;
	num |= (uint64_t)buf[3] << 24;
	num |= (uint64_t)buf[4] << 32;
	num |= (uint64_t)buf[5] << 40;
	num |= (uint64_t)buf[6] << 48;
	num |= (uint64_t)buf[7] << 56;
	return num;
}


static inline void
write16be(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)(num >> 8);
	buf[1] = (uint8_t)num;
	return;
}


static inline void
write16le(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	return;
}


static inline void
write32be(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)(num >> 24);
	buf[1] = (uint8_t)(num >> 16);
	buf[2] = (uint8_t)(num >> 8);
	buf[3] = (uint8_t)num;
	return;
}


static inline void
write32le(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	return;
}


static inline void
write64be(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)(num >> 56);
	buf[1] = (uint8_t)(num >> 48);
	buf[2] = (uint8_t)(num >> 40);
	buf[3] = (uint8_t)(num >> 32);
	buf[4] = (uint8_t)(num >> 24);
	buf[5] = (uint8_t)(num >> 16);
	buf[6] = (uint8_t)(num >> 8);
	buf[7] = (uint8_t)num;
	return;
}


static inline void
write64le(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	buf[4] = (uint8_t)(num >> 32);
	buf[5] = (uint8_t)(num >> 40);
	buf[6] = (uint8_t)(num >> 48);
	buf[7] = (uint8_t)(num >> 56);
	return;
}

#endif


//////////////////////////////
// Aligned reads and writes //
//////////////////////////////

// Separate functions for aligned reads and writes are provided since on
// strict-align archs aligned access is much faster than unaligned access.
//
// Just like in the unaligned case, memcpy() is needed to avoid
// strict aliasing violations. However, on archs that don't support
// unaligned access the compiler cannot know that the pointers given
// to memcpy() are aligned, which results in slow code. As of C11 there is
// no standard way to tell the compiler that we know that the address is
// aligned, but some compilers have language extensions to do that. With
// such language extensions the memcpy() method gives excellent results.
//
// What to do on a strict-align system when no known language extensions
// are available? Falling back to byte-by-byte access would be safe but
// would ruin optimizations that have been made specifically with aligned
// access in mind. As a compromise, aligned reads will fall back to
// non-compliant type punning but aligned writes will be byte-by-byte,
// that is, fast reads are preferred over fast writes. This obviously isn't
// great but hopefully it's a working compromise for now.
//
// __builtin_assume_aligned is supported by GCC >= 4.7 and Clang >= 3.6.
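
// A rough illustration (hypothetical code, in the spirit of the godbolt
// tests above): __builtin_assume_aligned tells the compiler the pointer is
// suitably aligned, so even a strict-align target can use a single aligned
// load for the memcpy().
/*
uint32_t aligned_load32(const uint8_t *b)
{
	uint32_t v;
	memcpy(&v, __builtin_assume_aligned(b, sizeof(v)), sizeof(v));
	return v;
}
*/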
#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#else
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#endif


static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64le(num);
}
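

// Usage sketch (hypothetical): the aligned variants are only safe when the
// pointer really is aligned to the access size.
/*
uint64_t storage[2];                    // at least 4-byte aligned
uint8_t *p = (uint8_t *)storage;
aligned_write32ne(p, 123);              // OK: p is 4-byte aligned
uint32_t v = aligned_read32be(p + 4);   // OK: p + 4 is too
*/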


// These need to be macros like in the unaligned case.
#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))


////////////////////
// Bit operations //
////////////////////

static inline uint32_t
bsr32(uint32_t n)
{
	// Check for ICC first, since it tends to define __GNUC__ too.
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	// GCC >= 3.4 has __builtin_clz(), which gives good results on
	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
	// either plain BSR (so the XOR gets optimized away) or LZCNT and
	// XOR (if -march indicates that SSE4a instructions are supported).
	return (uint32_t)__builtin_clz(n) ^ 31U;

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i;

#else
	uint32_t i = 31;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 15;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i -= 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i -= 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i -= 2;
	}

	if ((n & 0x80000000) == 0)
		--i;

	return i;
#endif
}


static inline uint32_t
clz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n) ^ 31U;

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	return (uint32_t)__builtin_clz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0\n\t"
			"xorl $31, %0"
			: "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i ^ 31U;

#else
	uint32_t i = 0;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 16;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i += 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i += 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i += 2;
	}

	if ((n & 0x80000000) == 0)
		++i;

	return i;
#endif
}
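

// Relationship between the bit scan helpers (a quick illustration, not part
// of the API): for any non-zero n, bsr32(n) == 31 - clz32(n).
/*
assert(bsr32(1) == 0 && clz32(1) == 31);
assert(bsr32(UINT32_C(0x80000000)) == 31
		&& clz32(UINT32_C(0x80000000)) == 0);
*/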


static inline uint32_t
ctz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_forward(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
	return (uint32_t)__builtin_ctz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanForward(&i, n);
	return i;

#else
	uint32_t i = 0;

	if ((n & 0x0000FFFF) == 0) {
		n >>= 16;
		i = 16;
	}

	if ((n & 0x000000FF) == 0) {
		n >>= 8;
		i += 8;
	}

	if ((n & 0x0000000F) == 0) {
		n >>= 4;
		i += 4;
	}

	if ((n & 0x00000003) == 0) {
		n >>= 2;
		i += 2;
	}

	if ((n & 0x00000001) == 0)
		++i;

	return i;
#endif
}

#define bsf32 ctz32

#endif