// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_integer.h
/// \brief      Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
///   - Unaligned native endian reads: readXXne(ptr)
///   - Unaligned native endian writes: writeXXne(ptr, num)
///   - Aligned native endian reads: aligned_readXXne(ptr)
///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: bswapXX(num)
///   - Byte order conversions to/from native (byteswaps if Y isn't
///     the native endianness): convXXYe(num)
///   - Unaligned reads: readXXYe(ptr)
///   - Unaligned writes: writeXXYe(ptr, num)
///   - Aligned reads: aligned_readXXYe(ptr)
///   - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
///
/// Bit scan operations for non-zero 32-bit integers (inline functions):
///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
///   - Count leading zeros: clz32(num)
///   - Count trailing zeros: ctz32(num)
///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
///
/// The above bit scan operations return 0-31. If num is zero,
/// the result is undefined.
//
//  Authors:    Lasse Collin
//              Joachim Henke
//
///////////////////////////////////////////////////////////////////////////////

#ifndef TUKLIB_INTEGER_H
#define TUKLIB_INTEGER_H

#include "tuklib_common.h"
#include <string.h>
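
// The sketch below illustrates how the endianness-converting functions and
// macros documented above are meant to be used (illustrative only, not part
// of this header; the buffer contents are made up for demonstration):
/*
static void
usage_example(void)
{
	uint8_t buf[8] = { 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88 };

	// Both reads return the same values on big and little endian
	// systems: 0x11223344 and 0x44332211 respectively.
	uint32_t b = read32be(buf);
	uint32_t l = read32le(buf);

	// Stores the bytes 0x88 0x77 0x66 0x55 0x44 0x33 0x22 0x11.
	write64le(buf, UINT64_C(0x1122334455667788));

	(void)b;
	(void)l;
}
*/
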
// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
// and such functions.
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
#	include <immintrin.h>
// Only include <intrin.h> when it is needed. GCC and Clang can both
// use __builtin's, so we only need Windows intrinsics when using MSVC.
// GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
// cases explicitly.
#elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
#	include <intrin.h>
#endif


///////////////////
// Byte swapping //
///////////////////

#if defined(HAVE___BUILTIN_BSWAPXX)
	// GCC >= 4.8 and Clang
#	define bswap16(n) __builtin_bswap16(n)
#	define bswap32(n) __builtin_bswap32(n)
#	define bswap64(n) __builtin_bswap64(n)

#elif defined(HAVE_BYTESWAP_H)
	// glibc, uClibc, dietlibc
#	include <byteswap.h>
#	ifdef HAVE_BSWAP_16
#		define bswap16(num) bswap_16(num)
#	endif
#	ifdef HAVE_BSWAP_32
#		define bswap32(num) bswap_32(num)
#	endif
#	ifdef HAVE_BSWAP_64
#		define bswap64(num) bswap_64(num)
#	endif

#elif defined(HAVE_SYS_ENDIAN_H)
	// *BSDs and Darwin
#	include <sys/endian.h>

#elif defined(HAVE_SYS_BYTEORDER_H)
	// Solaris
#	include <sys/byteorder.h>
#	ifdef BSWAP_16
#		define bswap16(num) BSWAP_16(num)
#	endif
#	ifdef BSWAP_32
#		define bswap32(num) BSWAP_32(num)
#	endif
#	ifdef BSWAP_64
#		define bswap64(num) BSWAP_64(num)
#	endif
#	ifdef BE_16
#		define conv16be(num) BE_16(num)
#	endif
#	ifdef BE_32
#		define conv32be(num) BE_32(num)
#	endif
#	ifdef BE_64
#		define conv64be(num) BE_64(num)
#	endif
#	ifdef LE_16
#		define conv16le(num) LE_16(num)
#	endif
#	ifdef LE_32
#		define conv32le(num) LE_32(num)
#	endif
#	ifdef LE_64
#		define conv64le(num) LE_64(num)
#	endif
#endif

#ifndef bswap16
#	define bswap16(n) (uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	)
#endif

#ifndef bswap32
#	define bswap32(n) (uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	)
#endif

#ifndef bswap64
#	define bswap64(n) (uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	)
#endif
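
// A minimal sanity check of the byte swapping macros above (illustrative
// only; remember that the fallback macros evaluate their argument more
// than once, so it must not have side effects):
/*
#include <assert.h>

static void
bswap_example(void)
{
	assert(bswap16(0x1122) == 0x2211);
	assert(bswap32(UINT32_C(0x11223344)) == UINT32_C(0x44332211));
	assert(bswap64(UINT64_C(0x1122334455667788))
			== UINT64_C(0x8877665544332211));
}
*/
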
// Define conversion macros using the basic byte swapping macros.
#ifdef WORDS_BIGENDIAN
#	ifndef conv16be
#		define conv16be(num) ((uint16_t)(num))
#	endif
#	ifndef conv32be
#		define conv32be(num) ((uint32_t)(num))
#	endif
#	ifndef conv64be
#		define conv64be(num) ((uint64_t)(num))
#	endif
#	ifndef conv16le
#		define conv16le(num) bswap16(num)
#	endif
#	ifndef conv32le
#		define conv32le(num) bswap32(num)
#	endif
#	ifndef conv64le
#		define conv64le(num) bswap64(num)
#	endif
#else
#	ifndef conv16be
#		define conv16be(num) bswap16(num)
#	endif
#	ifndef conv32be
#		define conv32be(num) bswap32(num)
#	endif
#	ifndef conv64be
#		define conv64be(num) bswap64(num)
#	endif
#	ifndef conv16le
#		define conv16le(num) ((uint16_t)(num))
#	endif
#	ifndef conv32le
#		define conv32le(num) ((uint32_t)(num))
#	endif
#	ifndef conv64le
#		define conv64le(num) ((uint64_t)(num))
#	endif
#endif
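
// For example, packing a host value into a fixed big endian wire format
// uses the same code on either native byte order (a sketch, illustrative
// only):
/*
static uint32_t
to_wire_be(uint32_t host_value)
{
	// On little endian systems this byteswaps; on big endian systems
	// it is just a cast.
	return conv32be(host_value);
}
*/
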
////////////////////////////////
// Unaligned reads and writes //
////////////////////////////////

// No-strict-align archs like x86-64
// ---------------------------------
//
// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because this kind
// of cast breaks strict aliasing rules and results in undefined behavior.
// With unaligned pointers it's even worse: compilers may emit vector
// instructions that require aligned pointers even if non-vector
// instructions work with unaligned pointers.
//
// Using memcpy() is the standard compliant way to do unaligned access.
// Many modern compilers inline it so there is no function call overhead.
// For those compilers that don't handle the memcpy() method well, the
// old casting method (that violates strict aliasing) can be requested at
// build time. A third method, casting to a packed struct, would also be
// an option but isn't provided to keep things simpler (it's already a mess).
// Hopefully this is flexible enough in practice.
//
// Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
//
//     buf[0] | (buf[1] << 8)
//
// reads a 16-bit value and can emit a single 16-bit load and produce
// code identical to the memcpy() method. In other cases Clang and GCC
// produce either the same or better code with memcpy(). For example,
// Clang 9 on x86-64 can detect the 32-bit load but not the 16-bit load.
//
// MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
// code for "buf[0] | (buf[1] << 8)".
//
// Conclusion: The memcpy() method is the best choice when unaligned access
// is supported.
//
// Strict-align archs like SPARC
// -----------------------------
//
// GCC versions from around 4.x to at least 13.2.0 produce worse code
// from the memcpy() method than from simple byte-by-byte shift-or code
// when reading a 32-bit integer:
//
//     (1) It may be constructed on the stack using four 8-bit loads,
//         four 8-bit stores to the stack, and finally one 32-bit load
//         from the stack.
//
//     (2) Especially with -Os, an actual memcpy() call may be emitted.
//
// This is true at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
// RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
// some processors but not all, so this is relevant only when GCC assumes
// that unaligned access is not supported or when -mstrict-align or
// -mno-unaligned-access is used.
//
// For Clang it makes little difference. ARM64 with -O2 -mstrict-align
// was one of the very few with a minor difference: the memcpy() version
// was one instruction longer.
//
// Conclusion: At least in the case of GCC and Clang, byte-by-byte code is
// the best choice for strict-align archs to do unaligned access.
//
// See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
//
// Thanks to <https://godbolt.org/> it was easy to test different compilers.
// The following is for little endian targets:
/*
#include <stdint.h>
#include <string.h>

uint32_t bytes16(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8);
}

uint32_t copy16(const uint8_t *b)
{
	uint16_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

uint32_t bytes32(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8)
		| ((uint32_t)b[2] << 16)
		| ((uint32_t)b[3] << 24);
}

uint32_t copy32(const uint8_t *b)
{
	uint32_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

void wbytes16(uint8_t *b, uint16_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
}

void wcopy16(uint8_t *b, uint16_t v)
{
	memcpy(b, &v, sizeof(v));
}

void wbytes32(uint8_t *b, uint32_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
	b[2] = (uint8_t)(v >> 16);
	b[3] = (uint8_t)(v >> 24);
}

void wcopy32(uint8_t *b, uint32_t v)
{
	memcpy(b, &v, sizeof(v));
}
*/


#ifdef TUKLIB_FAST_UNALIGNED_ACCESS

static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64le(num);
}


// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
#define write16be(buf, num) write16ne(buf, conv16be(num))
#define write32be(buf, num) write32ne(buf, conv32be(num))
#define write64be(buf, num) write64ne(buf, conv64be(num))
#define write16le(buf, num) write16ne(buf, conv16le(num))
#define write32le(buf, num) write32ne(buf, conv32le(num))
#define write64le(buf, num) write64ne(buf, conv64le(num))
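
// Because the byte swap happens in the macros above (see the NOTE), a write
// of a constant can be precomputed at compile time. A minimal sketch
// (illustrative only):
/*
static void
put_magic(uint8_t *buf)
{
	// conv32be() of a constant folds at compile time, so this becomes
	// a single unaligned 32-bit store also on little endian systems.
	write32be(buf, UINT32_C(0x12345678));
}
*/
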
#else

#ifdef WORDS_BIGENDIAN
#	define read16ne read16be
#	define read32ne read32be
#	define read64ne read64be
#	define write16ne write16be
#	define write32ne write32be
#	define write64ne write64be
#else
#	define read16ne read16le
#	define read32ne read32le
#	define read64ne read64le
#	define write16ne write16le
#	define write32ne write32le
#	define write64ne write64le
#endif


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
	return num;
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
	return num;
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0] << 24;
	num |= (uint32_t)buf[1] << 16;
	num |= (uint32_t)buf[2] << 8;
	num |= (uint32_t)buf[3];
	return num;
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0];
	num |= (uint32_t)buf[1] << 8;
	num |= (uint32_t)buf[2] << 16;
	num |= (uint32_t)buf[3] << 24;
	return num;
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0] << 56;
	num |= (uint64_t)buf[1] << 48;
	num |= (uint64_t)buf[2] << 40;
	num |= (uint64_t)buf[3] << 32;
	num |= (uint64_t)buf[4] << 24;
	num |= (uint64_t)buf[5] << 16;
	num |= (uint64_t)buf[6] << 8;
	num |= (uint64_t)buf[7];
	return num;
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0];
	num |= (uint64_t)buf[1] << 8;
	num |= (uint64_t)buf[2] << 16;
	num |= (uint64_t)buf[3] << 24;
	num |= (uint64_t)buf[4] << 32;
	num |= (uint64_t)buf[5] << 40;
	num |= (uint64_t)buf[6] << 48;
	num |= (uint64_t)buf[7] << 56;
	return num;
}


static inline void
write16be(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)(num >> 8);
	buf[1] = (uint8_t)num;
	return;
}


static inline void
write16le(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	return;
}


static inline void
write32be(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)(num >> 24);
	buf[1] = (uint8_t)(num >> 16);
	buf[2] = (uint8_t)(num >> 8);
	buf[3] = (uint8_t)num;
	return;
}


static inline void
write32le(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	return;
}


static inline void
write64be(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)(num >> 56);
	buf[1] = (uint8_t)(num >> 48);
	buf[2] = (uint8_t)(num >> 40);
	buf[3] = (uint8_t)(num >> 32);
	buf[4] = (uint8_t)(num >> 24);
	buf[5] = (uint8_t)(num >> 16);
	buf[6] = (uint8_t)(num >> 8);
	buf[7] = (uint8_t)num;
	return;
}


static inline void
write64le(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	buf[4] = (uint8_t)(num >> 32);
	buf[5] = (uint8_t)(num >> 40);
	buf[6] = (uint8_t)(num >> 48);
	buf[7] = (uint8_t)(num >> 56);
	return;
}

#endif
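
// A concrete example of the fixed byte orders implemented above
// (illustrative only):
/*
static void
byte_order_example(void)
{
	static const uint8_t be[4] = { 0x12, 0x34, 0x56, 0x78 };
	uint8_t out[4];

	// v is 0x12345678 regardless of the native byte order.
	uint32_t v = read32be(be);

	// Stores the bytes 0x78 0x56 0x34 0x12.
	write32le(out, v);
}
*/
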
//////////////////////////////
// Aligned reads and writes //
//////////////////////////////

// Separate functions for aligned reads and writes are provided since on
// strict-align archs aligned access is much faster than unaligned access.
//
// Just like in the unaligned case, memcpy() is needed to avoid
// strict aliasing violations. However, on archs that don't support
// unaligned access the compiler cannot know that the pointers given
// to memcpy() are aligned, which results in slow code. As of C11 there is
// no standard way to tell the compiler that we know that the address is
// aligned, but some compilers have language extensions to do that. With
// such language extensions the memcpy() method gives excellent results.
//
// What to do on a strict-align system when no known language extensions
// are available? Falling back to byte-by-byte access would be safe but
// would ruin optimizations that have been made specifically with aligned
// access in mind. As a compromise, aligned reads will fall back to
// non-compliant type punning but aligned writes will be byte-by-byte,
// that is, fast reads are preferred over fast writes. This obviously
// isn't great but hopefully it's a working compromise for now.
//
// __builtin_assume_aligned is supported by GCC >= 4.7 and Clang >= 3.6.
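
// A sketch of what the extension buys (illustrative only): with
// __builtin_assume_aligned the compiler knows the pointer's alignment,
// so it may turn the memcpy() into a single aligned 32-bit load even on
// a strict-align target.
/*
static uint32_t
assume_aligned_example(const uint8_t *buf)
{
	uint32_t num;
	memcpy(&num, __builtin_assume_aligned(buf, 4), sizeof(num));
	return num;
}
*/
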
#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#else
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#endif


static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64le(num);
}
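
// The aligned variants require that buf really is suitably aligned;
// otherwise the behavior is undefined. A minimal sketch (illustrative only):
/*
static uint32_t
aligned_example(void)
{
	// A uint32_t object is guaranteed to be aligned for 32-bit access.
	uint32_t storage = 0;
	uint8_t *buf = (uint8_t *)&storage;

	aligned_write32ne(buf, 123);
	return aligned_read32ne(buf);
}
*/
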
// These need to be macros like in the unaligned case.
#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))


////////////////////
// Bit operations //
////////////////////

static inline uint32_t
bsr32(uint32_t n)
{
	// Check for ICC first, since it tends to define __GNUC__ too.
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	// GCC >= 3.4 has __builtin_clz(), which gives good results on
	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
	// either plain BSR (so the XOR gets optimized away) or LZCNT and
	// XOR (if -march indicates that SSE4a instructions are supported).
	return (uint32_t)__builtin_clz(n) ^ 31U;

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i;

#else
	uint32_t i = 31;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 15;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i -= 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i -= 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i -= 2;
	}

	if ((n & 0x80000000) == 0)
		--i;

	return i;
#endif
}


static inline uint32_t
clz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n) ^ 31U;

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	return (uint32_t)__builtin_clz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0\n\t"
		"xorl $31, %0"
		: "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i ^ 31U;

#else
	uint32_t i = 0;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 16;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i += 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i += 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i += 2;
	}

	if ((n & 0x80000000) == 0)
		++i;

	return i;
#endif
}
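
// Example values for the bit scan operations above and for ctz32() below
// (illustrative only; remember that the argument must be non-zero):
/*
#include <assert.h>

static void
bit_scan_example(void)
{
	assert(bsr32(1) == 0 && bsr32(UINT32_C(0x80000000)) == 31);
	assert(clz32(1) == 31 && clz32(UINT32_C(0x80000000)) == 0);
	assert(ctz32(1) == 0 && ctz32(UINT32_C(0x80000000)) == 31);
}
*/
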
static inline uint32_t
ctz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_forward(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
	return (uint32_t)__builtin_ctz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanForward(&i, n);
	return i;

#else
	uint32_t i = 0;

	if ((n & 0x0000FFFF) == 0) {
		n >>= 16;
		i = 16;
	}

	if ((n & 0x000000FF) == 0) {
		n >>= 8;
		i += 8;
	}

	if ((n & 0x0000000F) == 0) {
		n >>= 4;
		i += 4;
	}

	if ((n & 0x00000003) == 0) {
		n >>= 2;
		i += 2;
	}

	if ((n & 0x00000001) == 0)
		++i;

	return i;
#endif
}

#define bsf32 ctz32

#endif