// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_integer.h
/// \brief      Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
///   - Unaligned native endian reads: readXXne(ptr)
///   - Unaligned native endian writes: writeXXne(ptr, num)
///   - Aligned native endian reads: aligned_readXXne(ptr)
///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: byteswapXX(num)
///   - Byte order conversions to/from native (byteswaps if Y isn't
///     the native endianness): convXXYe(num)
///   - Unaligned reads: readXXYe(ptr)
///   - Unaligned writes: writeXXYe(ptr, num)
///   - Aligned reads: aligned_readXXYe(ptr)
///   - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
///
/// Bit scan operations for non-zero 32-bit integers (inline functions):
///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
///   - Count leading zeros: clz32(num)
///   - Count trailing zeros: ctz32(num)
///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
///
/// The above bit scan operations return 0-31. If num is zero,
/// the result is undefined.
//
//  Authors:    Lasse Collin
//              Joachim Henke
//
///////////////////////////////////////////////////////////////////////////////

#ifndef TUKLIB_INTEGER_H
#define TUKLIB_INTEGER_H

#include "tuklib_common.h"
#include <string.h>
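
// The following is an illustrative usage example (not part of this file's
// code) of the byte order converting reads and writes defined further below.
// The function name and the field layout are made up for the example; it
// assumes this header has been included.
/*
#include <inttypes.h>
#include <stdio.h>

static void
example_parse(const uint8_t *buf)
{
	// Unaligned reads work at any byte offset.
	uint16_t len = read16be(buf);
	uint32_t val = read32le(buf + 2);
	printf("len=%" PRIu16 " val=%" PRIu32 "\n", len, val);
}
*/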

// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
// and such functions.
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
#	include <immintrin.h>
// Only include <intrin.h> when it is needed. GCC and Clang can both
// use __builtin's, so we only need Windows intrinsics when using MSVC.
// GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
// cases explicitly.
#elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
#	include <intrin.h>
#endif


///////////////////
// Byte swapping //
///////////////////

#if defined(HAVE___BUILTIN_BSWAPXX)
	// GCC >= 4.8 and Clang
#	define byteswap16(num) __builtin_bswap16(num)
#	define byteswap32(num) __builtin_bswap32(num)
#	define byteswap64(num) __builtin_bswap64(num)

#elif defined(HAVE_BYTESWAP_H)
	// glibc, uClibc, dietlibc
#	include <byteswap.h>
#	ifdef HAVE_BSWAP_16
#		define byteswap16(num) bswap_16(num)
#	endif
#	ifdef HAVE_BSWAP_32
#		define byteswap32(num) bswap_32(num)
#	endif
#	ifdef HAVE_BSWAP_64
#		define byteswap64(num) bswap_64(num)
#	endif

#elif defined(HAVE_SYS_ENDIAN_H)
	// *BSDs and Darwin
#	include <sys/endian.h>
#	define byteswap16(num) bswap16(num)
#	define byteswap32(num) bswap32(num)
#	define byteswap64(num) bswap64(num)

#elif defined(HAVE_SYS_BYTEORDER_H)
	// Solaris
#	include <sys/byteorder.h>
#	ifdef BSWAP_16
#		define byteswap16(num) BSWAP_16(num)
#	endif
#	ifdef BSWAP_32
#		define byteswap32(num) BSWAP_32(num)
#	endif
#	ifdef BSWAP_64
#		define byteswap64(num) BSWAP_64(num)
#	endif
#	ifdef BE_16
#		define conv16be(num) BE_16(num)
#	endif
#	ifdef BE_32
#		define conv32be(num) BE_32(num)
#	endif
#	ifdef BE_64
#		define conv64be(num) BE_64(num)
#	endif
#	ifdef LE_16
#		define conv16le(num) LE_16(num)
#	endif
#	ifdef LE_32
#		define conv32le(num) LE_32(num)
#	endif
#	ifdef LE_64
#		define conv64le(num) LE_64(num)
#	endif
#endif

#ifndef byteswap16
#	define byteswap16(n) (uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	)
#endif

#ifndef byteswap32
#	define byteswap32(n) (uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	)
#endif

#ifndef byteswap64
#	define byteswap64(n) (uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	)
#endif

// Define conversion macros using the basic byte swapping macros.
#ifdef WORDS_BIGENDIAN
#	ifndef conv16be
#		define conv16be(num) ((uint16_t)(num))
#	endif
#	ifndef conv32be
#		define conv32be(num) ((uint32_t)(num))
#	endif
#	ifndef conv64be
#		define conv64be(num) ((uint64_t)(num))
#	endif
#	ifndef conv16le
#		define conv16le(num) byteswap16(num)
#	endif
#	ifndef conv32le
#		define conv32le(num) byteswap32(num)
#	endif
#	ifndef conv64le
#		define conv64le(num) byteswap64(num)
#	endif
#else
#	ifndef conv16be
#		define conv16be(num) byteswap16(num)
#	endif
#	ifndef conv32be
#		define conv32be(num) byteswap32(num)
#	endif
#	ifndef conv64be
#		define conv64be(num) byteswap64(num)
#	endif
#	ifndef conv16le
#		define conv16le(num) ((uint16_t)(num))
#	endif
#	ifndef conv32le
#		define conv32le(num) ((uint32_t)(num))
#	endif
#	ifndef conv64le
#		define conv64le(num) ((uint64_t)(num))
#	endif
#endif
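
// A small illustrative check of the macros above (not part of this file's
// code); the constant values are arbitrary:
/*
#include <assert.h>

static void
example_byteswap(void)
{
	assert(byteswap16(UINT16_C(0x1122)) == UINT16_C(0x2211));
	assert(byteswap32(UINT32_C(0x11223344)) == UINT32_C(0x44332211));

	// convXXbe()/convXXle() byte swap only when the native endianness
	// differs from the requested one.
#ifdef WORDS_BIGENDIAN
	assert(conv32be(UINT32_C(0x11223344)) == UINT32_C(0x11223344));
	assert(conv32le(UINT32_C(0x11223344)) == UINT32_C(0x44332211));
#else
	assert(conv32be(UINT32_C(0x11223344)) == UINT32_C(0x44332211));
	assert(conv32le(UINT32_C(0x11223344)) == UINT32_C(0x11223344));
#endif
}
*/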


////////////////////////////////
// Unaligned reads and writes //
////////////////////////////////

// No-strict-align archs like x86-64
// ---------------------------------
//
// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because this kind
// of cast breaks strict aliasing rules and results in undefined behavior.
// With unaligned pointers it's even worse: compilers may emit vector
// instructions that require aligned pointers even if non-vector
// instructions work with unaligned pointers.
//
// Using memcpy() is the standard compliant way to do unaligned access.
// Many modern compilers inline it so there is no function call overhead.
// For those compilers that don't handle the memcpy() method well, the
// old casting method (that violates strict aliasing) can be requested at
// build time. A third method, casting to a packed struct, would also be
// an option but isn't provided to keep things simpler (it's already a mess).
// Hopefully this is flexible enough in practice.
//
// Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
//
//     buf[0] | (buf[1] << 8)
//
// reads a 16-bit value and can emit a single 16-bit load and produce
// identical code to the memcpy() method. In other cases Clang and GCC
// produce either the same or better code with memcpy(). For example,
// Clang 9 on x86-64 can detect a 32-bit load but not a 16-bit load.
//
// MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
// code for "buf[0] | (buf[1] << 8)".
//
// Conclusion: The memcpy() method is the best choice when unaligned access
// is supported.
//
// Strict-align archs like SPARC
// -----------------------------
//
// GCC versions from around 4.x to at least 13.2.0 produce worse code
// from the memcpy() method than from simple byte-by-byte shift-or code
// when reading a 32-bit integer:
//
//   (1) It may be constructed on stack using four 8-bit loads,
//       four 8-bit stores to stack, and finally one 32-bit load from stack.
//
//   (2) Especially with -Os, an actual memcpy() call may be emitted.
//
// This is true at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
// RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
// some processors but not all, so this is relevant only when GCC assumes
// that unaligned access is not supported or -mstrict-align or
// -mno-unaligned-access is used.
//
// For Clang it makes little difference. ARM64 with -O2 -mstrict-align
// was one of the very few with a minor difference: the memcpy() version
// was one instruction longer.
//
// Conclusion: At least in the case of GCC and Clang, byte-by-byte code is
// the best choice for strict-align archs to do unaligned access.
//
// See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
//
// Thanks to <https://godbolt.org/> it was easy to test different compilers.
// The following is for little endian targets:
/*
#include <stdint.h>
#include <string.h>

uint32_t bytes16(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8);
}

uint32_t copy16(const uint8_t *b)
{
	uint16_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

uint32_t bytes32(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8)
		| ((uint32_t)b[2] << 16)
		| ((uint32_t)b[3] << 24);
}

uint32_t copy32(const uint8_t *b)
{
	uint32_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

void wbytes16(uint8_t *b, uint16_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
}

void wcopy16(uint8_t *b, uint16_t v)
{
	memcpy(b, &v, sizeof(v));
}

void wbytes32(uint8_t *b, uint32_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
	b[2] = (uint8_t)(v >> 16);
	b[3] = (uint8_t)(v >> 24);
}

void wcopy32(uint8_t *b, uint32_t v)
{
	memcpy(b, &v, sizeof(v));
}
*/


#ifdef TUKLIB_FAST_UNALIGNED_ACCESS

static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64le(num);
}


// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
#define write16be(buf, num) write16ne(buf, conv16be(num))
#define write32be(buf, num) write32ne(buf, conv32be(num))
#define write64be(buf, num) write64ne(buf, conv64be(num))
#define write16le(buf, num) write16ne(buf, conv16le(num))
#define write32le(buf, num) write32ne(buf, conv32le(num))
#define write64le(buf, num) write64ne(buf, conv64le(num))
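
// For example (purely illustrative), in a call with a constant value like
//
//     write32be(buf, UINT32_C(0x12345678));
//
// conv32be() is applied to a compile-time constant, so a reasonable compiler
// can do the byte swapping (if any) at compile time and only the plain
// native endian store remains at runtime.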

#else

#ifdef WORDS_BIGENDIAN
#	define read16ne read16be
#	define read32ne read32be
#	define read64ne read64be
#	define write16ne write16be
#	define write32ne write32be
#	define write64ne write64be
#else
#	define read16ne read16le
#	define read32ne read32le
#	define read64ne read64le
#	define write16ne write16le
#	define write32ne write32le
#	define write64ne write64le
#endif


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
	return num;
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
	return num;
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0] << 24;
	num |= (uint32_t)buf[1] << 16;
	num |= (uint32_t)buf[2] << 8;
	num |= (uint32_t)buf[3];
	return num;
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0];
	num |= (uint32_t)buf[1] << 8;
	num |= (uint32_t)buf[2] << 16;
	num |= (uint32_t)buf[3] << 24;
	return num;
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0] << 56;
	num |= (uint64_t)buf[1] << 48;
	num |= (uint64_t)buf[2] << 40;
	num |= (uint64_t)buf[3] << 32;
	num |= (uint64_t)buf[4] << 24;
	num |= (uint64_t)buf[5] << 16;
	num |= (uint64_t)buf[6] << 8;
	num |= (uint64_t)buf[7];
	return num;
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0];
	num |= (uint64_t)buf[1] << 8;
	num |= (uint64_t)buf[2] << 16;
	num |= (uint64_t)buf[3] << 24;
	num |= (uint64_t)buf[4] << 32;
	num |= (uint64_t)buf[5] << 40;
	num |= (uint64_t)buf[6] << 48;
	num |= (uint64_t)buf[7] << 56;
	return num;
}


static inline void
write16be(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)(num >> 8);
	buf[1] = (uint8_t)num;
	return;
}


static inline void
write16le(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	return;
}


static inline void
write32be(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)(num >> 24);
	buf[1] = (uint8_t)(num >> 16);
	buf[2] = (uint8_t)(num >> 8);
	buf[3] = (uint8_t)num;
	return;
}


static inline void
write32le(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	return;
}


static inline void
write64be(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)(num >> 56);
	buf[1] = (uint8_t)(num >> 48);
	buf[2] = (uint8_t)(num >> 40);
	buf[3] = (uint8_t)(num >> 32);
	buf[4] = (uint8_t)(num >> 24);
	buf[5] = (uint8_t)(num >> 16);
	buf[6] = (uint8_t)(num >> 8);
	buf[7] = (uint8_t)num;
	return;
}


static inline void
write64le(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	buf[4] = (uint8_t)(num >> 32);
	buf[5] = (uint8_t)(num >> 40);
	buf[6] = (uint8_t)(num >> 48);
	buf[7] = (uint8_t)(num >> 56);
	return;
}

#endif


//////////////////////////////
// Aligned reads and writes //
//////////////////////////////

// Separate functions for aligned reads and writes are provided since on
// strict-align archs aligned access is much faster than unaligned access.
//
// Just like in the unaligned case, memcpy() is needed to avoid
// strict aliasing violations. However, on archs that don't support
// unaligned access the compiler cannot know that the pointers given
// to memcpy() are aligned which results in slow code. As of C11 there is
// no standard way to tell the compiler that we know that the address is
// aligned but some compilers have language extensions to do that. With
// such language extensions the memcpy() method gives excellent results.
//
// What to do on a strict-align system when no known language extensions
// are available? Falling back to byte-by-byte access would be safe but ruin
// optimizations that have been made specifically with aligned access in mind.
// As a compromise, aligned reads will fall back to non-compliant type punning
// but aligned writes will be byte-by-byte, that is, fast reads are preferred
// over fast writes. This obviously isn't great but hopefully it's a working
// compromise for now.
//
// __builtin_assume_aligned is supported by GCC >= 4.7 and clang >= 3.6.
#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#else
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#endif
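
// For example (an illustrative sketch, not a guarantee), on a strict-align
// target a read like
//
//     uint32_t num;
//     tuklib_memcpy_aligned(&num, buf, sizeof(num));
//
// can compile to a single aligned 32-bit load when __builtin_assume_aligned
// is available, because the compiler is told that buf is aligned to
// sizeof(num) bytes; without such an extension the same memcpy() may get
// expanded byte by byte as described above.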


static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64le(num);
}


// These need to be macros like in the unaligned case.
#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))
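
// Use the aligned variants only when buf is known to be suitably aligned
// for the access size. A purely illustrative example (the offset and the
// field are made up): reading a big endian 32-bit field that starts at a
// 4-byte-aligned offset inside an allocation obtained from malloc():
//
//     uint32_t field = aligned_read32be(buf + 8);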


////////////////////
// Bit operations //
////////////////////

static inline uint32_t
bsr32(uint32_t n)
{
	// Check for ICC first, since it tends to define __GNUC__ too.
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	// GCC >= 3.4 has __builtin_clz(), which gives good results on
	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
	// either plain BSR (so the XOR gets optimized away) or LZCNT and
	// XOR (if -march indicates that SSE4a instructions are supported).
	return (uint32_t)__builtin_clz(n) ^ 31U;

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i;

#else
	uint32_t i = 31;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 15;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i -= 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i -= 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i -= 2;
	}

	if ((n & 0x80000000) == 0)
		--i;

	return i;
#endif
}


static inline uint32_t
clz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n) ^ 31U;

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	return (uint32_t)__builtin_clz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0\n\t"
		"xorl $31, %0"
		: "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i ^ 31U;

#else
	uint32_t i = 0;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 16;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i += 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i += 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i += 2;
	}

	if ((n & 0x80000000) == 0)
		++i;

	return i;
#endif
}


static inline uint32_t
ctz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_forward(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
	return (uint32_t)__builtin_ctz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanForward(&i, n);
	return i;

#else
	uint32_t i = 0;

	if ((n & 0x0000FFFF) == 0) {
		n >>= 16;
		i = 16;
	}

	if ((n & 0x000000FF) == 0) {
		n >>= 8;
		i += 8;
	}

	if ((n & 0x0000000F) == 0) {
		n >>= 4;
		i += 4;
	}

	if ((n & 0x00000003) == 0) {
		n >>= 2;
		i += 2;
	}

	if ((n & 0x00000001) == 0)
		++i;

	return i;
#endif
}

#define bsf32 ctz32
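
// Worked examples of the bit scan functions above (num must be non-zero);
// the values are illustrative only:
//
//     bsr32(UINT32_C(0x00012345)) == 16   // highest set bit is bit 16
//     clz32(UINT32_C(0x00012345)) == 15   // 15 leading zero bits
//     ctz32(UINT32_C(0x00012340)) == 6    // lowest set bit is bit 6
//     bsf32(UINT32_C(0x00012340)) == 6    // alias for ctz32()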

#endif