Lines Matching +full:1 +full:x64 +full:- +full:bit
2 * xxHash - Extremely Fast Hash algorithm
3 * Copyright (C) 2012-2023, Yann Collet
5 * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
31 * - xxHash homepage: http://www.xxhash.com
32 * - xxHash source repository : https://github.com/Cyan4973/xxHash
53 #define LLVM_XXH_USE_NEON 1
67 return (X << R) | (X >> (64 - R)); in rotl64()
111 const unsigned char *const Limit = BEnd - 32; in xxHash64()
115 uint64_t V4 = Seed - PRIME64_1; in xxHash64()
128 H64 = rotl64(V1, 1) + rotl64(V2, 7) + rotl64(V3, 12) + rotl64(V4, 18); in xxHash64()
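For context, each accumulator V1..V4 merged above is advanced one lane at a time by XXH64's round function; a minimal sketch of the round and merge steps, assuming the PRIME64_* constants defined earlier in the file (the LLVM port names these round and mergeRound):

#include <cstddef>
#include <cstdint>

static const uint64_t PRIME64_1 = 0x9E3779B185EBCA87ULL;
static const uint64_t PRIME64_2 = 0xC2B2AE3D27D4EB4FULL;
static const uint64_t PRIME64_4 = 0x85EBCA77C2B2AE63ULL;

static uint64_t rotl64(uint64_t X, size_t R) {
  return (X << R) | (X >> (64 - R));
}

// One XXH64 round: fold a 64-bit input lane into an accumulator.
static uint64_t round(uint64_t Acc, uint64_t Input) {
  Acc += Input * PRIME64_2;
  Acc = rotl64(Acc, 31);
  Acc *= PRIME64_1;
  return Acc;
}

// Merge one accumulator into the combined H64 produced above.
static uint64_t mergeRound(uint64_t Acc, uint64_t Val) {
  Val = round(0, Val);
  Acc ^= Val;
  Acc = Acc * PRIME64_1 + PRIME64_4;
  return Acc;
}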
171 // clang-format off
178 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
180 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
186 // clang-format on
191 // Calculates a 64-bit to 128-bit multiply, then XOR folds it.
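A minimal sketch of that operation, assuming a compiler with __uint128_t support (the real file also carries MSVC and portable scalar fallbacks, excerpted further below):

#include <cstdint>

static inline uint64_t mul128_fold64_sketch(uint64_t lhs, uint64_t rhs) {
  __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs;
  // XOR-fold the high 64 bits into the low 64 bits.
  return (uint64_t)product ^ (uint64_t)(product >> 64);
}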
228 const uint8_t c2 = input[len >> 1]; in XXH3_len_1to3_64b()
229 const uint8_t c3 = input[len - 1]; in XXH3_len_1to3_64b()
242 const uint32_t input2 = endian::read32le(input + len - 4); in XXH3_len_4to8_64b()
244 (endian::read64le(secret + 8) ^ endian::read64le(secret + 16)) - seed; in XXH3_len_4to8_64b()
260 (endian::read64le(secret + 40) ^ endian::read64le(secret + 48)) - seed; in XXH3_len_9to16_64b()
262 input_hi ^= endian::read64le(input + len - 8); in XXH3_len_9to16_64b()
284 uint64_t rhs = 0U - seed; in XXH3_mix16B()
292 /* For mid range keys, XXH3 uses a Mum-hash variant. */
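A sketch of that mixer, reconstructed around the rhs line quoted above (it mirrors the shape of the file's XXH3_mix16B; endian::read64le is from llvm/Support/Endian.h, and the fold is the 64-bit to 128-bit multiply/XOR-fold described earlier):

static uint64_t mix16B_sketch(const uint8_t *input, const uint8_t *secret,
                              uint64_t seed) {
  uint64_t lhs = seed;
  uint64_t rhs = 0U - seed;
  lhs += endian::read64le(secret);
  rhs += endian::read64le(secret + 8);
  lhs ^= endian::read64le(input);
  rhs ^= endian::read64le(input + 8);
  return XXH3_mul128_fold64(lhs, rhs);
}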
299 acc_end = XXH3_mix16B(input + len - 16, secret + 16, seed); in XXH3_len_17to128_64b()
302 acc_end += XXH3_mix16B(input + len - 32, secret + 48, seed); in XXH3_len_17to128_64b()
305 acc_end += XXH3_mix16B(input + len - 48, secret + 80, seed); in XXH3_len_17to128_64b()
308 acc_end += XXH3_mix16B(input + len - 64, secret + 112, seed); in XXH3_len_17to128_64b()
330 secret + 16 * (i - 8) + XXH3_MIDSIZE_STARTOFFSET, seed); in XXH3_len_129to240_64b()
334 XXH3_mix16B(input + len - 16, in XXH3_len_129to240_64b()
335 secret + XXH3_SECRETSIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); in XXH3_len_129to240_64b()
346 // - workaround for suboptimal codegen on older GCC
347 // - compiler barriers against instruction reordering
348 // - WebAssembly SIMD support
349 // - configurable split between NEON and scalar lanes (benchmarking shows no
350 //   penalty when fully doing NEON on the Apple M1)
375 uint64x2_t data_vec_2 = XXH_vld1q_u64(input + ((i + 1) * 16)); in XXH3_accumulate_512_neon()
379 uint64x2_t key_vec_2 = XXH_vld1q_u64(secret + ((i + 1) * 16)); in XXH3_accumulate_512_neon()
382 uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1); in XXH3_accumulate_512_neon()
383 uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1); in XXH3_accumulate_512_neon()
391 * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to in XXH3_accumulate_512_neon()
392 * get one vector with the low 32 bits of each lane and one vector with in XXH3_accumulate_512_neon()
393 * the high 32 bits of each lane. in XXH3_accumulate_512_neon()
395 * The intrinsic returns a double vector because the original ARMv7-a in XXH3_accumulate_512_neon()
396 * instruction modified both arguments in place. AArch64 and SIMD128 emit in XXH3_accumulate_512_neon()
397 * two instructions from this intrinsic. in XXH3_accumulate_512_neon()
399 * [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ] in XXH3_accumulate_512_neon()
400 * [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ] in XXH3_accumulate_512_neon()
408 uint32x4_t data_key_hi = unzipped.val[1]; in XXH3_accumulate_512_neon()
425 xacc[i + 1] = vaddq_u64(xacc[i + 1], sum_2); in XXH3_accumulate_512_neon()
483 acc[i ^ 1] += data_val; in XXH3_accumulate_512_scalar()
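The scalar lane above is the reference behavior the NEON path vectorizes; a sketch of one full scalar stripe, reconstructed around that line (it mirrors XXH3_accumulate_512_scalar; endian::read64le is assumed from llvm/Support/Endian.h):

static void accumulate_512_scalar_sketch(uint64_t *acc, const uint8_t *input,
                                         const uint8_t *secret) {
  for (size_t i = 0; i < 8; ++i) { // 8 lanes per 64-byte stripe
    uint64_t data_val = endian::read64le(input + 8 * i);
    uint64_t data_key = data_val ^ endian::read64le(secret + 8 * i);
    acc[i ^ 1] += data_val; // swap which accumulator takes the raw data
    acc[i] += (uint64_t)(uint32_t)data_key * (data_key >> 32); // 32x32->64
  }
}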
509 acc[1] ^ endian::read64le(secret + 8)); in XXH3_mix2Accs()
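The line above is the second operand of the pairwise merge; reconstructed, the whole helper folds one accumulator pair against 16 secret bytes (a sketch of XXH3_mix2Accs, using the fold described earlier):

static uint64_t mix2Accs_sketch(const uint64_t *acc, const uint8_t *secret) {
  return XXH3_mul128_fold64(acc[0] ^ endian::read64le(secret),
                            acc[1] ^ endian::read64le(secret + 8));
}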
524 (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; in XXH3_hashLong_64b()
526 const size_t nb_blocks = (len - 1) / block_len; in XXH3_hashLong_64b()
533 XXH3_scrambleAcc(acc, secret + secretSize - XXH_STRIPE_LEN); in XXH3_hashLong_64b()
537 const size_t nbStripes = (len - 1 - (block_len * nb_blocks)) / XXH_STRIPE_LEN; in XXH3_hashLong_64b()
543 XXH3_accumulate_512(acc, input + len - XXH_STRIPE_LEN, in XXH3_hashLong_64b()
544 secret + secretSize - XXH_STRIPE_LEN - in XXH3_hashLong_64b()
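A worked example of the block geometry computed above, assuming the default 192-byte kSecret (the byte table excerpted near the top of this file):

#include <cstddef>

constexpr size_t XXH_STRIPE_LEN = 64;
constexpr size_t XXH_SECRET_CONSUME_RATE = 8; // secret bytes consumed per stripe
constexpr size_t secretSize = 192;            // sizeof(kSecret)

constexpr size_t nbStripesPerBlock =
    (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; // 16 stripes
constexpr size_t block_len = XXH_STRIPE_LEN * nbStripesPerBlock; // 1024 bytes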
568 * XXH3's 128-bit variant has better mixing and strength than the 64-bit
569 * variant, even without counting the significantly larger output size.
571 * For example, extra steps are taken to avoid the seed-dependent collisions
572 * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
574 * This strength naturally comes at the cost of some speed, especially on short
575 * lengths. Note that longer hashes are about as fast as the 64-bit version
576 * due to it using only a slight modification of the 64-bit loop.
578 * XXH128 is also more oriented towards 64-bit machines. It is still extremely
579 * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
585 * @brief 32-bit rotate left.
587 * @param x The 32-bit integer to be rotated.
605 #define XXH_rotl32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))
606 #define XXH_rotl64(x, r) (((x) << (r)) | ((x) >> (64 - (r))))
612 * @brief Calculates a 64->128-bit long multiply.
617 * @param lhs , rhs The 64-bit integers to be multiplied
618 * @return The 128-bit result represented in an @ref XXH128_hash_t.
624 * On most 64-bit targets, GCC and Clang define a __uint128_t type. in XXH_mult64to128()
625 * This is usually the best way as it usually uses a native long 64-bit in XXH_mult64to128()
626 * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. in XXH_mult64to128()
630 * Despite being a 32-bit platform, Clang (and emscripten) define this type in XXH_mult64to128()
631 * despite not having the arithmetic for it. This results in a laggy in XXH_mult64to128()
632 * compiler builtin call which calculates a full 128-bit multiply. in XXH_mult64to128()
633 * In that case it is best to use the portable one. in XXH_mult64to128()
634 * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 in XXH_mult64to128()
647 * MSVC for x64's _umul128 method. in XXH_mult64to128()
649 * uint64_t _umul128(uint64_t Multiplier, uint64_t Multiplicand, uint64_t *HighProduct); in XXH_mult64to128()
652 * This compiles to single operand MUL on x64. in XXH_mult64to128()
683 * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. in XXH_mult64to128()
688 * 9 3 // D2 lhs = 93 in XXH_mult64to128()
689 * 7 5 // D2 rhs = 75 in XXH_mult64to128()
690 * ---------- in XXH_mult64to128()
691 * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 in XXH_mult64to128()
692 * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 in XXH_mult64to128()
693 * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 in XXH_mult64to128()
694 * 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 in XXH_mult64to128()
695 * --------- in XXH_mult64to128()
696 * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 in XXH_mult64to128()
697 * 6 9 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 69 in XXH_mult64to128()
698 * --------- in XXH_mult64to128()
699 * 6 9 7 5 // D4 res = (69 * 100) + (27 % 10) * 10 + (15 % 10) = 6975 in XXH_mult64to128()
702 * 1. It avoids manual carry tracking. Just like how in XXH_mult64to128()
708 * in 32-bit ARMv6 and later, which is shown below: in XXH_mult64to128()
719 * comparable to some 64-bit ALUs. in XXH_mult64to128()
722 * of 32-bit ADD/ADCs. in XXH_mult64to128()
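Putting the decimal diagram back into code: a portable sketch of the scalar path, splitting each 64-bit operand into 32-bit halves (names are illustrative; the carry handling matches the cross/upper rows above, and cross cannot overflow a uint64_t):

#include <cstdint>

struct U128 { uint64_t low64, high64; };

static inline uint64_t mult32to64(uint32_t a, uint32_t b) {
  return (uint64_t)a * b;
}

static U128 mult64to128_scalar(uint64_t lhs, uint64_t rhs) {
  uint64_t lo_lo = mult32to64((uint32_t)lhs, (uint32_t)rhs);
  uint64_t hi_lo = mult32to64((uint32_t)(lhs >> 32), (uint32_t)rhs);
  uint64_t lo_hi = mult32to64((uint32_t)lhs, (uint32_t)(rhs >> 32));
  uint64_t hi_hi = mult32to64((uint32_t)(lhs >> 32), (uint32_t)(rhs >> 32));
  // Same trick as the decimal example: fold the partial carries into cross.
  uint64_t cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
  uint64_t upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
  uint64_t lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
  return {lower, upper};
}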
752 /* A doubled version of 1to3_64b with different constants. */ in XXH3_len_1to3_128b()
754 * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } in XXH3_len_1to3_128b()
755 * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } in XXH3_len_1to3_128b()
756 * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } in XXH3_len_1to3_128b()
759 uint8_t const c2 = input[len >> 1]; in XXH3_len_1to3_128b()
760 uint8_t const c3 = input[len - 1]; in XXH3_len_1to3_128b()
767 (endian::read32le(secret + 8) ^ endian::read32le(secret + 12)) - seed; in XXH3_len_1to3_128b()
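A sketch of the byte packing the table above describes; rotl32 and byteswap32 stand in for the rotate and byte-swap helpers (upstream derives the high half combinedh from combinedl this way):

uint8_t const c1 = input[0];
uint8_t const c2 = input[len >> 1];
uint8_t const c3 = input[len - 1];
// Little-endian byte order of combinedl: { c3, (uint8_t)len, c1, c2 }.
uint32_t const combinedl = ((uint32_t)c1 << 16) | ((uint32_t)c2 << 24) |
                           ((uint32_t)c3 << 0) | ((uint32_t)len << 8);
uint32_t const combinedh = rotl32(byteswap32(combinedl), 13);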
781 uint32_t const input_hi = endian::read32le(input + len - 4); in XXH3_len_4to8_128b()
791 m128.high64 += (m128.low64 << 1); in XXH3_len_4to8_128b()
805 (endian::read64le(secret + 32) ^ endian::read64le(secret + 40)) - seed; in XXH3_len_9to16_128b()
809 uint64_t input_hi = endian::read64le(input + len - 8); in XXH3_len_9to16_128b()
813 * Put len in the middle of m128 to ensure that the length gets mixed to in XXH3_len_9to16_128b()
814 * both the low and high bits in the 128x64 multiply below. in XXH3_len_9to16_128b()
816 m128.low64 += (uint64_t)(len - 1) << 54; in XXH3_len_9to16_128b()
823 * The best approach to this operation is different on 32-bit and 64-bit. in XXH3_len_9to16_128b()
825 if (sizeof(void *) < sizeof(uint64_t)) { /* 32-bit */ in XXH3_len_9to16_128b()
827 * 32-bit optimized version, which is more readable. in XXH3_len_9to16_128b()
829 * On 32-bit, it removes an ADC and delays a dependency between the two in XXH3_len_9to16_128b()
830 * halves of m128.high64, but it generates an extra mask on 64-bit. in XXH3_len_9to16_128b()
836 * 64-bit optimized (albeit more confusing) version. in XXH3_len_9to16_128b()
846 * Inverse Property: x + y - x == y in XXH3_len_9to16_128b()
847 * a + (b * (1 + c - 1)) in XXH3_len_9to16_128b()
848 * Distributive Property: x * (y + z) == (x * y) + (x * z) in XXH3_len_9to16_128b()
849 * a + (b * 1) + (b * (c - 1)) in XXH3_len_9to16_128b()
850 * Identity Property: x * 1 == x in XXH3_len_9to16_128b()
851 * a + b + (b * (c - 1)) in XXH3_len_9to16_128b()
853 * Substitute a, b, and c back in: in XXH3_len_9to16_128b()
854 * input_hi.hi + input_hi.lo + ((uint64_t)input_hi.lo * (PRIME32_2 in XXH3_len_9to16_128b()
855 * - 1)) in XXH3_len_9to16_128b()
857 * Since input_hi.hi + input_hi.lo == input_hi, this simplifies to: in XXH3_len_9to16_128b()
858 * input_hi + ((uint64_t)input_hi.lo * (PRIME32_2 - 1)) in XXH3_len_9to16_128b()
860 m128.high64 += input_hi + XXH_mult32to64((uint32_t)input_hi, PRIME32_2 - 1); in XXH3_len_9to16_128b()
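The two formulations can be sanity-checked directly; a tiny self-contained check (a hypothetical harness, not part of the file), relying on unsigned wraparound behaving identically on both sides:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t PRIME32_2 = 0x85EBCA77U;
  uint64_t input_hi = 0x0123456789ABCDEFULL;
  uint64_t lo = (uint32_t)input_hi;
  // 32-bit friendly form: keep the high half in place, multiply the low half.
  uint64_t a = (input_hi & 0xFFFFFFFF00000000ULL) + lo * PRIME32_2;
  // 64-bit friendly form derived above.
  uint64_t b = input_hi + lo * (PRIME32_2 - 1);
  assert(a == b);
  return 0;
}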
865 /* 128x64 multiply: h128 = m128 * PRIME64_2; */ in XXH3_len_9to16_128b()
897 * A bit slower than XXH3_mix16B, but handles multiply by zero better.
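Reconstructed shape of that 32-byte mixer (a sketch after upstream XXH128_mix32B, reusing the mix16B sketch from earlier; the cross-wiring of input_1 and input_2 into opposite halves is what keeps a zeroed 16-byte half from nulling the whole product):

static XXH128_hash_t mix32B_sketch(XXH128_hash_t acc, const uint8_t *input_1,
                                   const uint8_t *input_2,
                                   const uint8_t *secret, uint64_t seed) {
  acc.low64 += mix16B_sketch(input_1, secret + 0, seed);
  acc.low64 ^= endian::read64le(input_2) + endian::read64le(input_2 + 8);
  acc.high64 += mix16B_sketch(input_2, secret + 16, seed);
  acc.high64 ^= endian::read64le(input_1) + endian::read64le(input_1 + 8);
  return acc;
}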
922 XXH128_mix32B(acc, input + 48, input + len - 64, secret + 96, seed); in XXH3_len_17to128_128b()
924 acc = XXH128_mix32B(acc, input + 32, input + len - 48, secret + 64, seed); in XXH3_len_17to128_128b()
926 acc = XXH128_mix32B(acc, input + 16, input + len - 32, secret + 32, seed); in XXH3_len_17to128_128b()
928 acc = XXH128_mix32B(acc, input, input + len - 16, secret, seed); in XXH3_len_17to128_128b()
932 ((len - seed) * PRIME64_2); in XXH3_len_17to128_128b()
934 h128.high64 = (uint64_t)0 - XXH3_avalanche(h128.high64); in XXH3_len_17to128_128b()
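Reconstructed finalization around the two fragments above (a sketch after the upstream shape; XXH3_avalanche and the PRIME64_* constants are quoted elsewhere in this file):

static XXH128_hash_t len_17to128_finish_sketch(XXH128_hash_t acc, size_t len,
                                               uint64_t seed) {
  XXH128_hash_t h128;
  h128.low64 = acc.low64 + acc.high64;
  h128.high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) +
                (((uint64_t)len - seed) * PRIME64_2);
  h128.low64 = XXH3_avalanche(h128.low64);
  h128.high64 = (uint64_t)0 - XXH3_avalanche(h128.high64);
  return h128;
}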
954 acc = XXH128_mix32B(acc, input + i - 32, input + i - 16, secret + i - 32, in XXH3_len_129to240_128b()
960 * NB: `i <= len` will duplicate the last 32-bytes if in XXH3_len_129to240_128b()
961 * len % 32 == 0. in XXH3_len_129to240_128b()
965 acc = XXH128_mix32B(acc, input + i - 32, input + i - 16, in XXH3_len_129to240_128b()
966 secret + XXH3_MIDSIZE_STARTOFFSET + i - 160, seed); in XXH3_len_129to240_128b()
970 XXH128_mix32B(acc, input + len - 16, input + len - 32, in XXH3_len_129to240_128b()
971 secret + XXH3_SECRETSIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, in XXH3_len_129to240_128b()
972 (uint64_t)0 - seed); in XXH3_len_129to240_128b()
977 ((len - seed) * PRIME64_2); in XXH3_len_129to240_128b()
979 h128.high64 = (uint64_t)0 - XXH3_avalanche(h128.high64); in XXH3_len_129to240_128b()
987 (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; in XXH3_hashLong_128b()
989 const size_t nb_blocks = (len - 1) / block_len; in XXH3_hashLong_128b()
997 XXH3_scrambleAcc(acc, secret + secretSize - XXH_STRIPE_LEN); in XXH3_hashLong_128b()
1001 const size_t nbStripes = (len - 1 - (block_len * nb_blocks)) / XXH_STRIPE_LEN; in XXH3_hashLong_128b()
1007 XXH3_accumulate_512(acc, input + len - XXH_STRIPE_LEN, in XXH3_hashLong_128b()
1008 secret + secretSize - XXH_STRIPE_LEN - in XXH3_hashLong_128b()
1018 acc, secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START, in XXH3_hashLong_128b()
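The fragment above is the high-half merge; reconstructed, the finalization derives both halves from the same accumulators (a sketch mirroring upstream XXH3_hashLong_128b, assuming XXH3_mergeAccs and the upstream constant XXH_SECRET_MERGEACCS_START = 11):

static XXH128_hash_t hashLong_128b_finish_sketch(const uint64_t *acc,
                                                 const uint8_t *secret,
                                                 size_t secretSize,
                                                 size_t len) {
  XXH128_hash_t h128;
  h128.low64 = XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START,
                              (uint64_t)len * PRIME64_1);
  h128.high64 = XXH3_mergeAccs(acc,
                               secret + secretSize - 64 /* sizeof(acc) */ -
                                   XXH_SECRET_MERGEACCS_START,
                               ~((uint64_t)len * PRIME64_2));
  return h128;
}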
1030 * For now, it's a contract pre-condition. in xxh3_128bits()