///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_integer.h
/// \brief      Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
///   - Unaligned native endian reads: readXXne(ptr)
///   - Unaligned native endian writes: writeXXne(ptr, num)
///   - Aligned native endian reads: aligned_readXXne(ptr)
///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: bswapXX(num)
///   - Byte order conversions to/from native (byteswaps if Y isn't
///     the native endianness): convXXYe(num)
///   - Unaligned reads: readXXYe(ptr)
///   - Unaligned writes: writeXXYe(ptr, num)
///   - Aligned reads: aligned_readXXYe(ptr)
///   - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
///
/// Bit scan operations for non-zero 32-bit integers (inline functions):
///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
///   - Count leading zeros: clz32(num)
///   - Count trailing zeros: ctz32(num)
///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
///
/// The above bit scan operations return 0-31. If num is zero,
/// the result is undefined.
//
//  Authors:    Lasse Collin
//              Joachim Henke
//
//  This file has been put into the public domain.
//  You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////
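//
// A minimal usage sketch (illustrative only): the record layout and the
// helper functions below are hypothetical and not part of this file.
/*
#include "tuklib_integer.h"

// Parse a hypothetical 8-byte record: a little endian 32-bit value
// followed by a big endian 32-bit value.
static void
parse_record(const uint8_t *buf, uint32_t *a, uint32_t *b)
{
	*a = read32le(buf);
	*b = read32be(buf + 4);
}

// Round n up to the next power of two using bsr32().
// n must be in the range [1, UINT32_C(1) << 31].
static uint32_t
next_pow2(uint32_t n)
{
	return n <= 1 ? 1 : UINT32_C(1) << (bsr32(n - 1) + 1);
}
*/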
#ifndef TUKLIB_INTEGER_H
#define TUKLIB_INTEGER_H

#include "tuklib_common.h"
#include <string.h>

// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
// and such functions.
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
#	include <immintrin.h>
// Only include <intrin.h> when it is needed. GCC and Clang can both
// use __builtin's, so we only need Windows intrinsics when using MSVC.
// GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
// cases explicitly.
#elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
#	include <intrin.h>
#endif


///////////////////
// Byte swapping //
///////////////////

#if defined(HAVE___BUILTIN_BSWAPXX)
	// GCC >= 4.8 and Clang
#	define bswap16(n) __builtin_bswap16(n)
#	define bswap32(n) __builtin_bswap32(n)
#	define bswap64(n) __builtin_bswap64(n)

#elif defined(HAVE_BYTESWAP_H)
	// glibc, uClibc, dietlibc
#	include <byteswap.h>
#	ifdef HAVE_BSWAP_16
#		define bswap16(num) bswap_16(num)
#	endif
#	ifdef HAVE_BSWAP_32
#		define bswap32(num) bswap_32(num)
#	endif
#	ifdef HAVE_BSWAP_64
#		define bswap64(num) bswap_64(num)
#	endif

#elif defined(HAVE_SYS_ENDIAN_H)
	// *BSDs and Darwin
#	include <sys/endian.h>

#elif defined(HAVE_SYS_BYTEORDER_H)
	// Solaris
#	include <sys/byteorder.h>
#	ifdef BSWAP_16
#		define bswap16(num) BSWAP_16(num)
#	endif
#	ifdef BSWAP_32
#		define bswap32(num) BSWAP_32(num)
#	endif
#	ifdef BSWAP_64
#		define bswap64(num) BSWAP_64(num)
#	endif
#	ifdef BE_16
#		define conv16be(num) BE_16(num)
#	endif
#	ifdef BE_32
#		define conv32be(num) BE_32(num)
#	endif
#	ifdef BE_64
#		define conv64be(num) BE_64(num)
#	endif
#	ifdef LE_16
#		define conv16le(num) LE_16(num)
#	endif
#	ifdef LE_32
#		define conv32le(num) LE_32(num)
#	endif
#	ifdef LE_64
#		define conv64le(num) LE_64(num)
#	endif
#endif

#ifndef bswap16
#	define bswap16(n) (uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	)
#endif

#ifndef bswap32
#	define bswap32(n) (uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	)
#endif

#ifndef bswap64
#	define bswap64(n) (uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	)
#endif
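// A couple of spot checks of the expected semantics (illustrative only;
// remember that the macro arguments must be free of side effects):
/*
#include "tuklib_integer.h"

static void
bswap_examples(void)
{
	// bswap32() always reverses the byte order.
	uint32_t a = bswap32(UINT32_C(0x11223344)); // 0x44332211

	// conv32be(x) equals x on big endian hosts and bswap32(x) on
	// little endian hosts; conv32le() is the opposite.
	uint32_t b = conv32be(UINT32_C(0x11223344));

	(void)a;
	(void)b;
}
*/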
// Define conversion macros using the basic byte swapping macros.
#ifdef WORDS_BIGENDIAN
#	ifndef conv16be
#		define conv16be(num) ((uint16_t)(num))
#	endif
#	ifndef conv32be
#		define conv32be(num) ((uint32_t)(num))
#	endif
#	ifndef conv64be
#		define conv64be(num) ((uint64_t)(num))
#	endif
#	ifndef conv16le
#		define conv16le(num) bswap16(num)
#	endif
#	ifndef conv32le
#		define conv32le(num) bswap32(num)
#	endif
#	ifndef conv64le
#		define conv64le(num) bswap64(num)
#	endif
#else
#	ifndef conv16be
#		define conv16be(num) bswap16(num)
#	endif
#	ifndef conv32be
#		define conv32be(num) bswap32(num)
#	endif
#	ifndef conv64be
#		define conv64be(num) bswap64(num)
#	endif
#	ifndef conv16le
#		define conv16le(num) ((uint16_t)(num))
#	endif
#	ifndef conv32le
#		define conv32le(num) ((uint32_t)(num))
#	endif
#	ifndef conv64le
#		define conv64le(num) ((uint64_t)(num))
#	endif
#endif


////////////////////////////////
// Unaligned reads and writes //
////////////////////////////////

// No-strict-align archs like x86-64
// ---------------------------------
//
// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because this kind
// of cast breaks strict aliasing rules and results in undefined behavior.
// With unaligned pointers it's even worse: compilers may emit vector
// instructions that require aligned pointers even if non-vector
// instructions work with unaligned pointers.
//
// Using memcpy() is the standard compliant way to do unaligned access.
// Many modern compilers inline it so there is no function call overhead.
// For those compilers that don't handle the memcpy() method well, the
// old casting method (that violates strict aliasing) can be requested at
// build time. A third method, casting to a packed struct, would also be
// an option but isn't provided to keep things simpler (it's already a mess).
// Hopefully this is flexible enough in practice.
//
// Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
//
//     buf[0] | (buf[1] << 8)
//
// reads a 16-bit value and can emit a single 16-bit load and produce
// code identical to the memcpy() method. In other cases Clang and GCC
// produce either the same or better code with memcpy(). For example, Clang 9
// on x86-64 can detect a 32-bit load but not a 16-bit load.
//
// MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
// code for "buf[0] | (buf[1] << 8)".
//
// Conclusion: The memcpy() method is the best choice when unaligned access
// is supported.
//
// Strict-align archs like SPARC
// -----------------------------
//
// GCC versions from around 4.x to at least 13.2.0 produce worse code
// from the memcpy() method than from simple byte-by-byte shift-or code
// when reading a 32-bit integer:
//
//     (1) It may be constructed on stack using four 8-bit loads,
//         four 8-bit stores to stack, and finally one 32-bit load from stack.
//
//     (2) Especially with -Os, an actual memcpy() call may be emitted.
//
// This is true at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
// RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
// some processors but not all so this is relevant only in the case when
// GCC assumes that unaligned is not supported or -mstrict-align or
// -mno-unaligned-access is used.
//
// For Clang it makes little difference. ARM64 with -O2 -mstrict-align
// was one of the very few with a minor difference: the memcpy() version
// was one instruction longer.
//
// Conclusion: At least in case of GCC and Clang, byte-by-byte code is
// the best choice for strict-align archs to do unaligned access.
//
// See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
//
// Thanks to <https://godbolt.org/> it was easy to test different compilers.
// The following is for little endian targets:
/*
#include <stdint.h>
#include <string.h>

uint32_t bytes16(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8);
}

uint32_t copy16(const uint8_t *b)
{
	uint16_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

uint32_t bytes32(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8)
		| ((uint32_t)b[2] << 16)
		| ((uint32_t)b[3] << 24);
}

uint32_t copy32(const uint8_t *b)
{
	uint32_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

void wbytes16(uint8_t *b, uint16_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
}

void wcopy16(uint8_t *b, uint16_t v)
{
	memcpy(b, &v, sizeof(v));
}

void wbytes32(uint8_t *b, uint32_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
	b[2] = (uint8_t)(v >> 16);
	b[3] = (uint8_t)(v >> 24);
}

void wcopy32(uint8_t *b, uint32_t v)
{
	memcpy(b, &v, sizeof(v));
}
*/


#ifdef TUKLIB_FAST_UNALIGNED_ACCESS

static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16le(num);
}
static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64le(num);
}


// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
#define write16be(buf, num) write16ne(buf, conv16be(num))
#define write32be(buf, num) write32ne(buf, conv32be(num))
#define write64be(buf, num) write64ne(buf, conv64be(num))
#define write16le(buf, num) write16ne(buf, conv16le(num))
#define write32le(buf, num) write32ne(buf, conv32le(num))
#define write64le(buf, num) write64ne(buf, conv64le(num))

#else

#ifdef WORDS_BIGENDIAN
#	define read16ne read16be
#	define read32ne read32be
#	define read64ne read64be
#	define write16ne write16be
#	define write32ne write32be
#	define write64ne write64be
#else
#	define read16ne read16le
#	define read32ne read32le
#	define read64ne read64le
#	define write16ne write16le
#	define write32ne write32le
#	define write64ne write64le
#endif


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
	return num;
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
	return num;
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0] << 24;
	num |= (uint32_t)buf[1] << 16;
	num |= (uint32_t)buf[2] << 8;
	num |= (uint32_t)buf[3];
	return num;
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0];
	num |= (uint32_t)buf[1] << 8;
	num |= (uint32_t)buf[2] << 16;
	num |= (uint32_t)buf[3] << 24;
	return num;
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0] << 56;
	num |= (uint64_t)buf[1] << 48;
	num |= (uint64_t)buf[2] << 40;
	num |= (uint64_t)buf[3] << 32;
	num |= (uint64_t)buf[4] << 24;
	num |= (uint64_t)buf[5] << 16;
	num |= (uint64_t)buf[6] << 8;
	num |= (uint64_t)buf[7];
	return num;
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0];
	num |= (uint64_t)buf[1] << 8;
	num |= (uint64_t)buf[2] << 16;
	num |= (uint64_t)buf[3] << 24;
	num |= (uint64_t)buf[4] << 32;
	num |= (uint64_t)buf[5] << 40;
	num |= (uint64_t)buf[6] << 48;
	num |= (uint64_t)buf[7] << 56;
	return num;
}


static inline void
write16be(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)(num >> 8);
	buf[1] = (uint8_t)num;
	return;
}


static inline void
write16le(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	return;
}
static inline void
write32be(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)(num >> 24);
	buf[1] = (uint8_t)(num >> 16);
	buf[2] = (uint8_t)(num >> 8);
	buf[3] = (uint8_t)num;
	return;
}


static inline void
write32le(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	return;
}


static inline void
write64be(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)(num >> 56);
	buf[1] = (uint8_t)(num >> 48);
	buf[2] = (uint8_t)(num >> 40);
	buf[3] = (uint8_t)(num >> 32);
	buf[4] = (uint8_t)(num >> 24);
	buf[5] = (uint8_t)(num >> 16);
	buf[6] = (uint8_t)(num >> 8);
	buf[7] = (uint8_t)num;
	return;
}


static inline void
write64le(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	buf[4] = (uint8_t)(num >> 32);
	buf[5] = (uint8_t)(num >> 40);
	buf[6] = (uint8_t)(num >> 48);
	buf[7] = (uint8_t)(num >> 56);
	return;
}

#endif
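
// A small round-trip sketch of the byte order guarantees above
// (illustrative only; the values are arbitrary):
/*
#include "tuklib_integer.h"

static void
endian_example(void)
{
	uint8_t buf[4];

	// write32be() stores the most significant byte first regardless
	// of the host byte order.
	write32be(buf, UINT32_C(0x11223344));
	// buf[0] == 0x11, buf[1] == 0x22, buf[2] == 0x33, buf[3] == 0x44

	// Reading the bytes back with the matching function returns the
	// original value on any host.
	uint32_t be = read32be(buf); // 0x11223344

	// Reading the same bytes as little endian gives the byteswapped
	// value instead.
	uint32_t le = read32le(buf); // 0x44332211

	(void)be;
	(void)le;
}
*/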

//////////////////////////////
// Aligned reads and writes //
//////////////////////////////

// Separate functions for aligned reads and writes are provided since on
// strict-align archs aligned access is much faster than unaligned access.
//
// Just like in the unaligned case, memcpy() is needed to avoid
// strict aliasing violations. However, on archs that don't support
// unaligned access the compiler cannot know that the pointers given
// to memcpy() are aligned which results in slow code. As of C11 there is
// no standard way to tell the compiler that we know that the address is
// aligned but some compilers have language extensions to do that. With
// such language extensions the memcpy() method gives excellent results.
//
// What to do on a strict-align system when no known language extensions
// are available? Falling back to byte-by-byte access would be safe but ruin
// optimizations that have been made specifically with aligned access in mind.
// As a compromise, aligned reads will fall back to non-compliant type punning
// but aligned writes will be byte-by-byte, that is, fast reads are preferred
// over fast writes. This obviously isn't great but hopefully it's a working
// compromise for now.
//
// __builtin_assume_aligned is supported by GCC >= 4.7 and clang >= 3.6.
#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#else
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#endif


static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64le(num);
}
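
// A sketch of the intended use of the aligned readers (illustrative only;
// the function below is hypothetical and the caller is responsible for
// guaranteeing the alignment):
/*
#include <stddef.h>
#include "tuklib_integer.h"

// Sum "count" big endian 32-bit integers starting at "buf".
// "buf" must be 4-byte aligned.
static uint32_t
sum32be_aligned(const uint8_t *buf, size_t count)
{
	uint32_t sum = 0;
	for (size_t i = 0; i < count; ++i)
		sum += aligned_read32be(buf + 4 * i);

	return sum;
}
*/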

// These need to be macros like in the unaligned case.
#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))


////////////////////
// Bit operations //
////////////////////

static inline uint32_t
bsr32(uint32_t n)
{
	// Check for ICC first, since it tends to define __GNUC__ too.
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	// GCC >= 3.4 has __builtin_clz(), which gives good results on
	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
	// either plain BSR (so the XOR gets optimized away) or LZCNT and
	// XOR (if -march indicates that SSE4a instructions are supported).
	return (uint32_t)__builtin_clz(n) ^ 31U;

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i;

#else
	uint32_t i = 31;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 15;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i -= 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i -= 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i -= 2;
	}

	if ((n & 0x80000000) == 0)
		--i;

	return i;
#endif
}


static inline uint32_t
clz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n) ^ 31U;

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	return (uint32_t)__builtin_clz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0\n\t"
		"xorl $31, %0"
		: "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i ^ 31U;

#else
	uint32_t i = 0;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 16;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i += 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i += 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i += 2;
	}

	if ((n & 0x80000000) == 0)
		++i;

	return i;
#endif
}
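
// A few spot checks of the expected results (illustrative only; remember
// that the argument must be non-zero). ctz32() and bsf32() are defined
// below.
/*
#include <assert.h>
#include "tuklib_integer.h"

static void
bit_scan_examples(void)
{
	assert(bsr32(UINT32_C(1)) == 0);
	assert(bsr32(UINT32_C(0x80000000)) == 31);
	assert(clz32(UINT32_C(1)) == 31);
	assert(clz32(UINT32_C(0x00010000)) == 15);
	assert(ctz32(UINT32_C(0x00010000)) == 16);
	assert(bsf32(UINT32_C(8)) == 3);
}
*/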

static inline uint32_t
ctz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_forward(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
	return (uint32_t)__builtin_ctz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanForward(&i, n);
	return i;

#else
	uint32_t i = 0;

	if ((n & 0x0000FFFF) == 0) {
		n >>= 16;
		i = 16;
	}

	if ((n & 0x000000FF) == 0) {
		n >>= 8;
		i += 8;
	}

	if ((n & 0x0000000F) == 0) {
		n >>= 4;
		i += 4;
	}

	if ((n & 0x00000003) == 0) {
		n >>= 2;
		i += 2;
	}

	if ((n & 0x00000001) == 0)
		++i;

	return i;
#endif
}

#define bsf32 ctz32

#endif