1 /* 2 * xxHash - Extremely Fast Hash algorithm 3 * Copyright (C) 2012-2023, Yann Collet 4 * 5 * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are 9 * met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above 14 * copyright notice, this list of conditions and the following disclaimer 15 * in the documentation and/or other materials provided with the 16 * distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 * 30 * You can contact the author at : 31 * - xxHash homepage: http://www.xxhash.com 32 * - xxHash source repository : https://github.com/Cyan4973/xxHash 33 */ 34 35 // xxhash64 is based on commit d2df04efcbef7d7f6886d345861e5dfda4edacc1. Removed 36 // everything but a simple interface for computing xxh64. 37 38 // xxh3_64bits is based on commit d5891596637d21366b9b1dcf2c0007a3edb26a9e (July 39 // 2023). 40 41 // xxh3_128bits is based on commit b0adcc54188c3130b1793e7b19c62eb1e669f7df 42 // (June 2024). 43 44 #include "llvm/Support/xxhash.h" 45 #include "llvm/Support/Compiler.h" 46 #include "llvm/Support/Endian.h" 47 48 #include <stdlib.h> 49 50 #if !defined(LLVM_XXH_USE_NEON) 51 #if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) && \ 52 !defined(__ARM_BIG_ENDIAN) 53 #define LLVM_XXH_USE_NEON 1 54 #else 55 #define LLVM_XXH_USE_NEON 0 56 #endif 57 #endif 58 59 #if LLVM_XXH_USE_NEON 60 #include <arm_neon.h> 61 #endif 62 63 using namespace llvm; 64 using namespace support; 65 66 static uint64_t rotl64(uint64_t X, size_t R) { 67 return (X << R) | (X >> (64 - R)); 68 } 69 70 constexpr uint32_t PRIME32_1 = 0x9E3779B1; 71 constexpr uint32_t PRIME32_2 = 0x85EBCA77; 72 constexpr uint32_t PRIME32_3 = 0xC2B2AE3D; 73 74 static const uint64_t PRIME64_1 = 11400714785074694791ULL; 75 static const uint64_t PRIME64_2 = 14029467366897019727ULL; 76 static const uint64_t PRIME64_3 = 1609587929392839161ULL; 77 static const uint64_t PRIME64_4 = 9650029242287828579ULL; 78 static const uint64_t PRIME64_5 = 2870177450012600261ULL; 79 80 static uint64_t round(uint64_t Acc, uint64_t Input) { 81 Acc += Input * PRIME64_2; 82 Acc = rotl64(Acc, 31); 83 Acc *= PRIME64_1; 84 return Acc; 85 } 86 87 static uint64_t mergeRound(uint64_t Acc, uint64_t Val) { 88 Val = round(0, Val); 89 Acc ^= Val; 90 Acc = Acc * PRIME64_1 + PRIME64_4; 91 return Acc; 92 } 93 94 static uint64_t XXH64_avalanche(uint64_t hash) { 95 hash ^= hash >> 33; 96 hash *= PRIME64_2; 97 hash ^= hash >> 29; 98 hash *= PRIME64_3; 99 hash ^= hash >> 32; 100 return hash; 101 } 102 103 uint64_t llvm::xxHash64(StringRef Data) { 104 size_t Len = Data.size(); 105 uint64_t Seed = 0; 106 const unsigned char *P = Data.bytes_begin(); 107 const unsigned char *const BEnd = Data.bytes_end(); 108 uint64_t H64; 109 110 if (Len >= 32) { 111 const unsigned char *const Limit = BEnd - 32; 112 uint64_t V1 = Seed + PRIME64_1 + PRIME64_2; 113 uint64_t V2 = Seed + PRIME64_2; 114 uint64_t V3 = Seed + 0; 115 uint64_t V4 = Seed - PRIME64_1; 116 117 do { 118 V1 = round(V1, endian::read64le(P)); 119 P += 8; 120 V2 = round(V2, endian::read64le(P)); 121 P += 8; 122 V3 = round(V3, endian::read64le(P)); 123 P += 8; 124 V4 = round(V4, endian::read64le(P)); 125 P += 8; 126 } while (P <= Limit); 127 128 H64 = rotl64(V1, 1) + rotl64(V2, 7) + rotl64(V3, 12) + rotl64(V4, 18); 129 H64 = mergeRound(H64, V1); 130 H64 = mergeRound(H64, V2); 131 H64 = mergeRound(H64, V3); 132 H64 = mergeRound(H64, V4); 133 134 } else { 135 H64 = Seed + PRIME64_5; 136 } 137 138 H64 += (uint64_t)Len; 139 140 while (reinterpret_cast<uintptr_t>(P) + 8 <= 141 reinterpret_cast<uintptr_t>(BEnd)) { 142 uint64_t const K1 = round(0, endian::read64le(P)); 143 H64 ^= K1; 144 H64 = rotl64(H64, 27) * PRIME64_1 + PRIME64_4; 145 P += 8; 146 } 147 148 if (reinterpret_cast<uintptr_t>(P) + 4 <= reinterpret_cast<uintptr_t>(BEnd)) { 149 H64 ^= (uint64_t)(endian::read32le(P)) * PRIME64_1; 150 H64 = rotl64(H64, 23) * PRIME64_2 + PRIME64_3; 151 P += 4; 152 } 153 154 while (P < BEnd) { 155 H64 ^= (*P) * PRIME64_5; 156 H64 = rotl64(H64, 11) * PRIME64_1; 157 P++; 158 } 159 160 return XXH64_avalanche(H64); 161 } 162 163 uint64_t llvm::xxHash64(ArrayRef<uint8_t> Data) { 164 return xxHash64({(const char *)Data.data(), Data.size()}); 165 } 166 167 constexpr size_t XXH3_SECRETSIZE_MIN = 136; 168 constexpr size_t XXH_SECRET_DEFAULT_SIZE = 192; 169 170 /* Pseudorandom data taken directly from FARSH */ 171 // clang-format off 172 constexpr uint8_t kSecret[XXH_SECRET_DEFAULT_SIZE] = { 173 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, 174 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, 175 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, 176 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, 177 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, 178 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, 179 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, 180 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, 181 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, 182 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, 183 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, 184 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, 185 }; 186 // clang-format on 187 188 constexpr uint64_t PRIME_MX1 = 0x165667919E3779F9; 189 constexpr uint64_t PRIME_MX2 = 0x9FB21C651E98DF25; 190 191 // Calculates a 64-bit to 128-bit multiply, then XOR folds it. 192 static uint64_t XXH3_mul128_fold64(uint64_t lhs, uint64_t rhs) { 193 #if defined(__SIZEOF_INT128__) || \ 194 (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) 195 __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs; 196 return uint64_t(product) ^ uint64_t(product >> 64); 197 198 #else 199 /* First calculate all of the cross products. */ 200 const uint64_t lo_lo = (lhs & 0xFFFFFFFF) * (rhs & 0xFFFFFFFF); 201 const uint64_t hi_lo = (lhs >> 32) * (rhs & 0xFFFFFFFF); 202 const uint64_t lo_hi = (lhs & 0xFFFFFFFF) * (rhs >> 32); 203 const uint64_t hi_hi = (lhs >> 32) * (rhs >> 32); 204 205 /* Now add the products together. These will never overflow. */ 206 const uint64_t cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; 207 const uint64_t upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; 208 const uint64_t lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); 209 210 return upper ^ lower; 211 #endif 212 } 213 214 constexpr size_t XXH_STRIPE_LEN = 64; 215 constexpr size_t XXH_SECRET_CONSUME_RATE = 8; 216 constexpr size_t XXH_ACC_NB = XXH_STRIPE_LEN / sizeof(uint64_t); 217 218 static uint64_t XXH3_avalanche(uint64_t hash) { 219 hash ^= hash >> 37; 220 hash *= PRIME_MX1; 221 hash ^= hash >> 32; 222 return hash; 223 } 224 225 static uint64_t XXH3_len_1to3_64b(const uint8_t *input, size_t len, 226 const uint8_t *secret, uint64_t seed) { 227 const uint8_t c1 = input[0]; 228 const uint8_t c2 = input[len >> 1]; 229 const uint8_t c3 = input[len - 1]; 230 uint32_t combined = ((uint32_t)c1 << 16) | ((uint32_t)c2 << 24) | 231 ((uint32_t)c3 << 0) | ((uint32_t)len << 8); 232 uint64_t bitflip = 233 (uint64_t)(endian::read32le(secret) ^ endian::read32le(secret + 4)) + 234 seed; 235 return XXH64_avalanche(uint64_t(combined) ^ bitflip); 236 } 237 238 static uint64_t XXH3_len_4to8_64b(const uint8_t *input, size_t len, 239 const uint8_t *secret, uint64_t seed) { 240 seed ^= (uint64_t)byteswap(uint32_t(seed)) << 32; 241 const uint32_t input1 = endian::read32le(input); 242 const uint32_t input2 = endian::read32le(input + len - 4); 243 uint64_t acc = 244 (endian::read64le(secret + 8) ^ endian::read64le(secret + 16)) - seed; 245 const uint64_t input64 = (uint64_t)input2 | ((uint64_t)input1 << 32); 246 acc ^= input64; 247 // XXH3_rrmxmx(acc, len) 248 acc ^= rotl64(acc, 49) ^ rotl64(acc, 24); 249 acc *= PRIME_MX2; 250 acc ^= (acc >> 35) + (uint64_t)len; 251 acc *= PRIME_MX2; 252 return acc ^ (acc >> 28); 253 } 254 255 static uint64_t XXH3_len_9to16_64b(const uint8_t *input, size_t len, 256 const uint8_t *secret, uint64_t const seed) { 257 uint64_t input_lo = 258 (endian::read64le(secret + 24) ^ endian::read64le(secret + 32)) + seed; 259 uint64_t input_hi = 260 (endian::read64le(secret + 40) ^ endian::read64le(secret + 48)) - seed; 261 input_lo ^= endian::read64le(input); 262 input_hi ^= endian::read64le(input + len - 8); 263 uint64_t acc = uint64_t(len) + byteswap(input_lo) + input_hi + 264 XXH3_mul128_fold64(input_lo, input_hi); 265 return XXH3_avalanche(acc); 266 } 267 268 LLVM_ATTRIBUTE_ALWAYS_INLINE 269 static uint64_t XXH3_len_0to16_64b(const uint8_t *input, size_t len, 270 const uint8_t *secret, uint64_t const seed) { 271 if (LLVM_LIKELY(len > 8)) 272 return XXH3_len_9to16_64b(input, len, secret, seed); 273 if (LLVM_LIKELY(len >= 4)) 274 return XXH3_len_4to8_64b(input, len, secret, seed); 275 if (len != 0) 276 return XXH3_len_1to3_64b(input, len, secret, seed); 277 return XXH64_avalanche(seed ^ endian::read64le(secret + 56) ^ 278 endian::read64le(secret + 64)); 279 } 280 281 static uint64_t XXH3_mix16B(const uint8_t *input, uint8_t const *secret, 282 uint64_t seed) { 283 uint64_t lhs = seed; 284 uint64_t rhs = 0U - seed; 285 lhs += endian::read64le(secret); 286 rhs += endian::read64le(secret + 8); 287 lhs ^= endian::read64le(input); 288 rhs ^= endian::read64le(input + 8); 289 return XXH3_mul128_fold64(lhs, rhs); 290 } 291 292 /* For mid range keys, XXH3 uses a Mum-hash variant. */ 293 LLVM_ATTRIBUTE_ALWAYS_INLINE 294 static uint64_t XXH3_len_17to128_64b(const uint8_t *input, size_t len, 295 const uint8_t *secret, 296 uint64_t const seed) { 297 uint64_t acc = len * PRIME64_1, acc_end; 298 acc += XXH3_mix16B(input + 0, secret + 0, seed); 299 acc_end = XXH3_mix16B(input + len - 16, secret + 16, seed); 300 if (len > 32) { 301 acc += XXH3_mix16B(input + 16, secret + 32, seed); 302 acc_end += XXH3_mix16B(input + len - 32, secret + 48, seed); 303 if (len > 64) { 304 acc += XXH3_mix16B(input + 32, secret + 64, seed); 305 acc_end += XXH3_mix16B(input + len - 48, secret + 80, seed); 306 if (len > 96) { 307 acc += XXH3_mix16B(input + 48, secret + 96, seed); 308 acc_end += XXH3_mix16B(input + len - 64, secret + 112, seed); 309 } 310 } 311 } 312 return XXH3_avalanche(acc + acc_end); 313 } 314 315 constexpr size_t XXH3_MIDSIZE_MAX = 240; 316 constexpr size_t XXH3_MIDSIZE_STARTOFFSET = 3; 317 constexpr size_t XXH3_MIDSIZE_LASTOFFSET = 17; 318 319 LLVM_ATTRIBUTE_NOINLINE 320 static uint64_t XXH3_len_129to240_64b(const uint8_t *input, size_t len, 321 const uint8_t *secret, uint64_t seed) { 322 uint64_t acc = (uint64_t)len * PRIME64_1; 323 const unsigned nbRounds = len / 16; 324 for (unsigned i = 0; i < 8; ++i) 325 acc += XXH3_mix16B(input + 16 * i, secret + 16 * i, seed); 326 acc = XXH3_avalanche(acc); 327 328 for (unsigned i = 8; i < nbRounds; ++i) { 329 acc += XXH3_mix16B(input + 16 * i, 330 secret + 16 * (i - 8) + XXH3_MIDSIZE_STARTOFFSET, seed); 331 } 332 /* last bytes */ 333 acc += 334 XXH3_mix16B(input + len - 16, 335 secret + XXH3_SECRETSIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); 336 return XXH3_avalanche(acc); 337 } 338 339 #if LLVM_XXH_USE_NEON 340 341 #define XXH3_accumulate_512 XXH3_accumulate_512_neon 342 #define XXH3_scrambleAcc XXH3_scrambleAcc_neon 343 344 // NEON implementation based on commit a57f6cce2698049863af8c25787084ae0489d849 345 // (July 2024), with the following removed: 346 // - workaround for suboptimal codegen on older GCC 347 // - compiler barriers against instruction reordering 348 // - WebAssembly SIMD support 349 // - configurable split between NEON and scalar lanes (benchmarking shows no 350 // penalty when fully doing SIMD on the Apple M1) 351 352 #if defined(__GNUC__) || defined(__clang__) 353 #define XXH_ALIASING __attribute__((__may_alias__)) 354 #else 355 #define XXH_ALIASING /* nothing */ 356 #endif 357 358 typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING; 359 360 LLVM_ATTRIBUTE_ALWAYS_INLINE static uint64x2_t XXH_vld1q_u64(void const *ptr) { 361 return vreinterpretq_u64_u8(vld1q_u8((uint8_t const *)ptr)); 362 } 363 364 LLVM_ATTRIBUTE_ALWAYS_INLINE 365 static void XXH3_accumulate_512_neon(uint64_t *acc, const uint8_t *input, 366 const uint8_t *secret) { 367 xxh_aliasing_uint64x2_t *const xacc = (xxh_aliasing_uint64x2_t *)acc; 368 369 #ifdef __clang__ 370 #pragma clang loop unroll(full) 371 #endif 372 for (size_t i = 0; i < XXH_ACC_NB / 2; i += 2) { 373 /* data_vec = input[i]; */ 374 uint64x2_t data_vec_1 = XXH_vld1q_u64(input + (i * 16)); 375 uint64x2_t data_vec_2 = XXH_vld1q_u64(input + ((i + 1) * 16)); 376 377 /* key_vec = secret[i]; */ 378 uint64x2_t key_vec_1 = XXH_vld1q_u64(secret + (i * 16)); 379 uint64x2_t key_vec_2 = XXH_vld1q_u64(secret + ((i + 1) * 16)); 380 381 /* data_swap = swap(data_vec) */ 382 uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1); 383 uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1); 384 385 /* data_key = data_vec ^ key_vec; */ 386 uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1); 387 uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2); 388 389 /* 390 * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a 391 * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to 392 * get one vector with the low 32 bits of each lane, and one vector 393 * with the high 32 bits of each lane. 394 * 395 * The intrinsic returns a double vector because the original ARMv7-a 396 * instruction modified both arguments in place. AArch64 and SIMD128 emit 397 * two instructions from this intrinsic. 398 * 399 * [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ] 400 * [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ] 401 */ 402 uint32x4x2_t unzipped = vuzpq_u32(vreinterpretq_u32_u64(data_key_1), 403 vreinterpretq_u32_u64(data_key_2)); 404 405 /* data_key_lo = data_key & 0xFFFFFFFF */ 406 uint32x4_t data_key_lo = unzipped.val[0]; 407 /* data_key_hi = data_key >> 32 */ 408 uint32x4_t data_key_hi = unzipped.val[1]; 409 410 /* 411 * Then, we can split the vectors horizontally and multiply which, as for 412 * most widening intrinsics, have a variant that works on both high half 413 * vectors for free on AArch64. A similar instruction is available on 414 * SIMD128. 415 * 416 * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi 417 */ 418 uint64x2_t sum_1 = vmlal_u32(data_swap_1, vget_low_u32(data_key_lo), 419 vget_low_u32(data_key_hi)); 420 uint64x2_t sum_2 = vmlal_u32(data_swap_2, vget_high_u32(data_key_lo), 421 vget_high_u32(data_key_hi)); 422 423 /* xacc[i] = acc_vec + sum; */ 424 xacc[i] = vaddq_u64(xacc[i], sum_1); 425 xacc[i + 1] = vaddq_u64(xacc[i + 1], sum_2); 426 } 427 } 428 429 LLVM_ATTRIBUTE_ALWAYS_INLINE 430 static void XXH3_scrambleAcc_neon(uint64_t *acc, const uint8_t *secret) { 431 xxh_aliasing_uint64x2_t *const xacc = (xxh_aliasing_uint64x2_t *)acc; 432 433 /* { prime32_1, prime32_1 } */ 434 uint32x2_t const kPrimeLo = vdup_n_u32(PRIME32_1); 435 /* { 0, prime32_1, 0, prime32_1 } */ 436 uint32x4_t const kPrimeHi = 437 vreinterpretq_u32_u64(vdupq_n_u64((uint64_t)PRIME32_1 << 32)); 438 439 for (size_t i = 0; i < XXH_ACC_NB / 2; ++i) { 440 /* xacc[i] ^= (xacc[i] >> 47); */ 441 uint64x2_t acc_vec = XXH_vld1q_u64(acc + (2 * i)); 442 uint64x2_t shifted = vshrq_n_u64(acc_vec, 47); 443 uint64x2_t data_vec = veorq_u64(acc_vec, shifted); 444 445 /* xacc[i] ^= secret[i]; */ 446 uint64x2_t key_vec = XXH_vld1q_u64(secret + (i * 16)); 447 uint64x2_t data_key = veorq_u64(data_vec, key_vec); 448 449 /* 450 * xacc[i] *= XXH_PRIME32_1 451 * 452 * Expanded version with portable NEON intrinsics 453 * 454 * lo(x) * lo(y) + (hi(x) * lo(y) << 32) 455 * 456 * prod_hi = hi(data_key) * lo(prime) << 32 457 * 458 * Since we only need 32 bits of this multiply a trick can be used, 459 * reinterpreting the vector as a uint32x4_t and multiplying by 460 * { 0, prime, 0, prime } to cancel out the unwanted bits and avoid the 461 * shift. 462 */ 463 uint32x4_t prod_hi = vmulq_u32(vreinterpretq_u32_u64(data_key), kPrimeHi); 464 465 /* Extract low bits for vmlal_u32 */ 466 uint32x2_t data_key_lo = vmovn_u64(data_key); 467 468 /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */ 469 xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo); 470 } 471 } 472 #else 473 474 #define XXH3_accumulate_512 XXH3_accumulate_512_scalar 475 #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar 476 477 LLVM_ATTRIBUTE_ALWAYS_INLINE 478 static void XXH3_accumulate_512_scalar(uint64_t *acc, const uint8_t *input, 479 const uint8_t *secret) { 480 for (size_t i = 0; i < XXH_ACC_NB; ++i) { 481 uint64_t data_val = endian::read64le(input + 8 * i); 482 uint64_t data_key = data_val ^ endian::read64le(secret + 8 * i); 483 acc[i ^ 1] += data_val; 484 acc[i] += uint32_t(data_key) * (data_key >> 32); 485 } 486 } 487 488 LLVM_ATTRIBUTE_ALWAYS_INLINE 489 static void XXH3_scrambleAcc_scalar(uint64_t *acc, const uint8_t *secret) { 490 for (size_t i = 0; i < XXH_ACC_NB; ++i) { 491 acc[i] ^= acc[i] >> 47; 492 acc[i] ^= endian::read64le(secret + 8 * i); 493 acc[i] *= PRIME32_1; 494 } 495 } 496 #endif 497 498 LLVM_ATTRIBUTE_ALWAYS_INLINE 499 static void XXH3_accumulate(uint64_t *acc, const uint8_t *input, 500 const uint8_t *secret, size_t nbStripes) { 501 for (size_t n = 0; n < nbStripes; ++n) { 502 XXH3_accumulate_512(acc, input + n * XXH_STRIPE_LEN, 503 secret + n * XXH_SECRET_CONSUME_RATE); 504 } 505 } 506 507 static uint64_t XXH3_mix2Accs(const uint64_t *acc, const uint8_t *secret) { 508 return XXH3_mul128_fold64(acc[0] ^ endian::read64le(secret), 509 acc[1] ^ endian::read64le(secret + 8)); 510 } 511 512 static uint64_t XXH3_mergeAccs(const uint64_t *acc, const uint8_t *key, 513 uint64_t start) { 514 uint64_t result64 = start; 515 for (size_t i = 0; i < 4; ++i) 516 result64 += XXH3_mix2Accs(acc + 2 * i, key + 16 * i); 517 return XXH3_avalanche(result64); 518 } 519 520 LLVM_ATTRIBUTE_NOINLINE 521 static uint64_t XXH3_hashLong_64b(const uint8_t *input, size_t len, 522 const uint8_t *secret, size_t secretSize) { 523 const size_t nbStripesPerBlock = 524 (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; 525 const size_t block_len = XXH_STRIPE_LEN * nbStripesPerBlock; 526 const size_t nb_blocks = (len - 1) / block_len; 527 alignas(16) uint64_t acc[XXH_ACC_NB] = { 528 PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, 529 PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1, 530 }; 531 for (size_t n = 0; n < nb_blocks; ++n) { 532 XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock); 533 XXH3_scrambleAcc(acc, secret + secretSize - XXH_STRIPE_LEN); 534 } 535 536 /* last partial block */ 537 const size_t nbStripes = (len - 1 - (block_len * nb_blocks)) / XXH_STRIPE_LEN; 538 assert(nbStripes <= secretSize / XXH_SECRET_CONSUME_RATE); 539 XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes); 540 541 /* last stripe */ 542 constexpr size_t XXH_SECRET_LASTACC_START = 7; 543 XXH3_accumulate_512(acc, input + len - XXH_STRIPE_LEN, 544 secret + secretSize - XXH_STRIPE_LEN - 545 XXH_SECRET_LASTACC_START); 546 547 /* converge into final hash */ 548 constexpr size_t XXH_SECRET_MERGEACCS_START = 11; 549 return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, 550 (uint64_t)len * PRIME64_1); 551 } 552 553 uint64_t llvm::xxh3_64bits(ArrayRef<uint8_t> data) { 554 auto *in = data.data(); 555 size_t len = data.size(); 556 if (len <= 16) 557 return XXH3_len_0to16_64b(in, len, kSecret, 0); 558 if (len <= 128) 559 return XXH3_len_17to128_64b(in, len, kSecret, 0); 560 if (len <= XXH3_MIDSIZE_MAX) 561 return XXH3_len_129to240_64b(in, len, kSecret, 0); 562 return XXH3_hashLong_64b(in, len, kSecret, sizeof(kSecret)); 563 } 564 565 /* ========================================== 566 * XXH3 128 bits (a.k.a XXH128) 567 * ========================================== 568 * XXH3's 128-bit variant has better mixing and strength than the 64-bit 569 * variant, even without counting the significantly larger output size. 570 * 571 * For example, extra steps are taken to avoid the seed-dependent collisions 572 * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). 573 * 574 * This strength naturally comes at the cost of some speed, especially on short 575 * lengths. Note that longer hashes are about as fast as the 64-bit version 576 * due to it using only a slight modification of the 64-bit loop. 577 * 578 * XXH128 is also more oriented towards 64-bit machines. It is still extremely 579 * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). 580 */ 581 582 /*! 583 * @internal 584 * @def XXH_rotl32(x,r) 585 * @brief 32-bit rotate left. 586 * 587 * @param x The 32-bit integer to be rotated. 588 * @param r The number of bits to rotate. 589 * @pre 590 * @p r > 0 && @p r < 32 591 * @note 592 * @p x and @p r may be evaluated multiple times. 593 * @return The rotated result. 594 */ 595 #if __has_builtin(__builtin_rotateleft32) && \ 596 __has_builtin(__builtin_rotateleft64) 597 #define XXH_rotl32 __builtin_rotateleft32 598 #define XXH_rotl64 __builtin_rotateleft64 599 /* Note: although _rotl exists for minGW (GCC under windows), performance seems 600 * poor */ 601 #elif defined(_MSC_VER) 602 #define XXH_rotl32(x, r) _rotl(x, r) 603 #define XXH_rotl64(x, r) _rotl64(x, r) 604 #else 605 #define XXH_rotl32(x, r) (((x) << (r)) | ((x) >> (32 - (r)))) 606 #define XXH_rotl64(x, r) (((x) << (r)) | ((x) >> (64 - (r)))) 607 #endif 608 609 #define XXH_mult32to64(x, y) ((uint64_t)(uint32_t)(x) * (uint64_t)(uint32_t)(y)) 610 611 /*! 612 * @brief Calculates a 64->128-bit long multiply. 613 * 614 * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar 615 * version. 616 * 617 * @param lhs , rhs The 64-bit integers to be multiplied 618 * @return The 128-bit result represented in an @ref XXH128_hash_t. 619 */ 620 static XXH128_hash_t XXH_mult64to128(uint64_t lhs, uint64_t rhs) { 621 /* 622 * GCC/Clang __uint128_t method. 623 * 624 * On most 64-bit targets, GCC and Clang define a __uint128_t type. 625 * This is usually the best way as it usually uses a native long 64-bit 626 * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. 627 * 628 * Usually. 629 * 630 * Despite being a 32-bit platform, Clang (and emscripten) define this type 631 * despite not having the arithmetic for it. This results in a laggy 632 * compiler builtin call which calculates a full 128-bit multiply. 633 * In that case it is best to use the portable one. 634 * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 635 */ 636 #if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) && \ 637 defined(__SIZEOF_INT128__) || \ 638 (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) 639 640 __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs; 641 XXH128_hash_t r128; 642 r128.low64 = (uint64_t)(product); 643 r128.high64 = (uint64_t)(product >> 64); 644 return r128; 645 646 /* 647 * MSVC for x64's _umul128 method. 648 * 649 * uint64_t _umul128(uint64_t Multiplier, uint64_t Multiplicand, uint64_t 650 * *HighProduct); 651 * 652 * This compiles to single operand MUL on x64. 653 */ 654 #elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC) 655 656 #ifndef _MSC_VER 657 #pragma intrinsic(_umul128) 658 #endif 659 uint64_t product_high; 660 uint64_t const product_low = _umul128(lhs, rhs, &product_high); 661 XXH128_hash_t r128; 662 r128.low64 = product_low; 663 r128.high64 = product_high; 664 return r128; 665 666 /* 667 * MSVC for ARM64's __umulh method. 668 * 669 * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method. 670 */ 671 #elif defined(_M_ARM64) || defined(_M_ARM64EC) 672 673 #ifndef _MSC_VER 674 #pragma intrinsic(__umulh) 675 #endif 676 XXH128_hash_t r128; 677 r128.low64 = lhs * rhs; 678 r128.high64 = __umulh(lhs, rhs); 679 return r128; 680 681 #else 682 /* 683 * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. 684 * 685 * This is a fast and simple grade school multiply, which is shown below 686 * with base 10 arithmetic instead of base 0x100000000. 687 * 688 * 9 3 // D2 lhs = 93 689 * x 7 5 // D2 rhs = 75 690 * ---------- 691 * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 692 * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 693 * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 694 * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 695 * --------- 696 * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 697 * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 698 * --------- 699 * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 700 * 701 * The reasons for adding the products like this are: 702 * 1. It avoids manual carry tracking. Just like how 703 * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. 704 * This avoids a lot of complexity. 705 * 706 * 2. It hints for, and on Clang, compiles to, the powerful UMAAL 707 * instruction available in ARM's Digital Signal Processing extension 708 * in 32-bit ARMv6 and later, which is shown below: 709 * 710 * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) 711 * { 712 * uint64_t product = (uint64_t)*RdLo * (uint64_t)*RdHi + Rn + Rm; 713 * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); 714 * *RdHi = (xxh_u32)(product >> 32); 715 * } 716 * 717 * This instruction was designed for efficient long multiplication, and 718 * allows this to be calculated in only 4 instructions at speeds 719 * comparable to some 64-bit ALUs. 720 * 721 * 3. It isn't terrible on other platforms. Usually this will be a couple 722 * of 32-bit ADD/ADCs. 723 */ 724 725 /* First calculate all of the cross products. */ 726 uint64_t const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); 727 uint64_t const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); 728 uint64_t const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); 729 uint64_t const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32); 730 731 /* Now add the products together. These will never overflow. */ 732 uint64_t const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; 733 uint64_t const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; 734 uint64_t const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); 735 736 XXH128_hash_t r128; 737 r128.low64 = lower; 738 r128.high64 = upper; 739 return r128; 740 #endif 741 } 742 743 /*! Seems to produce slightly better code on GCC for some reason. */ 744 LLVM_ATTRIBUTE_ALWAYS_INLINE constexpr uint64_t XXH_xorshift64(uint64_t v64, 745 int shift) { 746 return v64 ^ (v64 >> shift); 747 } 748 749 LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t 750 XXH3_len_1to3_128b(const uint8_t *input, size_t len, const uint8_t *secret, 751 uint64_t seed) { 752 /* A doubled version of 1to3_64b with different constants. */ 753 /* 754 * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } 755 * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } 756 * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } 757 */ 758 uint8_t const c1 = input[0]; 759 uint8_t const c2 = input[len >> 1]; 760 uint8_t const c3 = input[len - 1]; 761 uint32_t const combinedl = ((uint32_t)c1 << 16) | ((uint32_t)c2 << 24) | 762 ((uint32_t)c3 << 0) | ((uint32_t)len << 8); 763 uint32_t const combinedh = XXH_rotl32(byteswap(combinedl), 13); 764 uint64_t const bitflipl = 765 (endian::read32le(secret) ^ endian::read32le(secret + 4)) + seed; 766 uint64_t const bitfliph = 767 (endian::read32le(secret + 8) ^ endian::read32le(secret + 12)) - seed; 768 uint64_t const keyed_lo = (uint64_t)combinedl ^ bitflipl; 769 uint64_t const keyed_hi = (uint64_t)combinedh ^ bitfliph; 770 XXH128_hash_t h128; 771 h128.low64 = XXH64_avalanche(keyed_lo); 772 h128.high64 = XXH64_avalanche(keyed_hi); 773 return h128; 774 } 775 776 LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t 777 XXH3_len_4to8_128b(const uint8_t *input, size_t len, const uint8_t *secret, 778 uint64_t seed) { 779 seed ^= (uint64_t)byteswap((uint32_t)seed) << 32; 780 uint32_t const input_lo = endian::read32le(input); 781 uint32_t const input_hi = endian::read32le(input + len - 4); 782 uint64_t const input_64 = input_lo + ((uint64_t)input_hi << 32); 783 uint64_t const bitflip = 784 (endian::read64le(secret + 16) ^ endian::read64le(secret + 24)) + seed; 785 uint64_t const keyed = input_64 ^ bitflip; 786 787 /* Shift len to the left to ensure it is even, this avoids even multiplies. 788 */ 789 XXH128_hash_t m128 = XXH_mult64to128(keyed, PRIME64_1 + (len << 2)); 790 791 m128.high64 += (m128.low64 << 1); 792 m128.low64 ^= (m128.high64 >> 3); 793 794 m128.low64 = XXH_xorshift64(m128.low64, 35); 795 m128.low64 *= PRIME_MX2; 796 m128.low64 = XXH_xorshift64(m128.low64, 28); 797 m128.high64 = XXH3_avalanche(m128.high64); 798 return m128; 799 } 800 801 LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t 802 XXH3_len_9to16_128b(const uint8_t *input, size_t len, const uint8_t *secret, 803 uint64_t seed) { 804 uint64_t const bitflipl = 805 (endian::read64le(secret + 32) ^ endian::read64le(secret + 40)) - seed; 806 uint64_t const bitfliph = 807 (endian::read64le(secret + 48) ^ endian::read64le(secret + 56)) + seed; 808 uint64_t const input_lo = endian::read64le(input); 809 uint64_t input_hi = endian::read64le(input + len - 8); 810 XXH128_hash_t m128 = 811 XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, PRIME64_1); 812 /* 813 * Put len in the middle of m128 to ensure that the length gets mixed to 814 * both the low and high bits in the 128x64 multiply below. 815 */ 816 m128.low64 += (uint64_t)(len - 1) << 54; 817 input_hi ^= bitfliph; 818 /* 819 * Add the high 32 bits of input_hi to the high 32 bits of m128, then 820 * add the long product of the low 32 bits of input_hi and PRIME32_2 to 821 * the high 64 bits of m128. 822 * 823 * The best approach to this operation is different on 32-bit and 64-bit. 824 */ 825 if (sizeof(void *) < sizeof(uint64_t)) { /* 32-bit */ 826 /* 827 * 32-bit optimized version, which is more readable. 828 * 829 * On 32-bit, it removes an ADC and delays a dependency between the two 830 * halves of m128.high64, but it generates an extra mask on 64-bit. 831 */ 832 m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + 833 XXH_mult32to64((uint32_t)input_hi, PRIME32_2); 834 } else { 835 /* 836 * 64-bit optimized (albeit more confusing) version. 837 * 838 * Uses some properties of addition and multiplication to remove the mask: 839 * 840 * Let: 841 * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) 842 * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) 843 * c = PRIME32_2 844 * 845 * a + (b * c) 846 * Inverse Property: x + y - x == y 847 * a + (b * (1 + c - 1)) 848 * Distributive Property: x * (y + z) == (x * y) + (x * z) 849 * a + (b * 1) + (b * (c - 1)) 850 * Identity Property: x * 1 == x 851 * a + b + (b * (c - 1)) 852 * 853 * Substitute a, b, and c: 854 * input_hi.hi + input_hi.lo + ((uint64_t)input_hi.lo * (PRIME32_2 855 * - 1)) 856 * 857 * Since input_hi.hi + input_hi.lo == input_hi, we get this: 858 * input_hi + ((uint64_t)input_hi.lo * (PRIME32_2 - 1)) 859 */ 860 m128.high64 += input_hi + XXH_mult32to64((uint32_t)input_hi, PRIME32_2 - 1); 861 } 862 /* m128 ^= XXH_swap64(m128 >> 64); */ 863 m128.low64 ^= byteswap(m128.high64); 864 865 /* 128x64 multiply: h128 = m128 * PRIME64_2; */ 866 XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2); 867 h128.high64 += m128.high64 * PRIME64_2; 868 869 h128.low64 = XXH3_avalanche(h128.low64); 870 h128.high64 = XXH3_avalanche(h128.high64); 871 return h128; 872 } 873 874 /* 875 * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN 876 */ 877 LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t 878 XXH3_len_0to16_128b(const uint8_t *input, size_t len, const uint8_t *secret, 879 uint64_t seed) { 880 if (len > 8) 881 return XXH3_len_9to16_128b(input, len, secret, seed); 882 if (len >= 4) 883 return XXH3_len_4to8_128b(input, len, secret, seed); 884 if (len) 885 return XXH3_len_1to3_128b(input, len, secret, seed); 886 XXH128_hash_t h128; 887 uint64_t const bitflipl = 888 endian::read64le(secret + 64) ^ endian::read64le(secret + 72); 889 uint64_t const bitfliph = 890 endian::read64le(secret + 80) ^ endian::read64le(secret + 88); 891 h128.low64 = XXH64_avalanche(seed ^ bitflipl); 892 h128.high64 = XXH64_avalanche(seed ^ bitfliph); 893 return h128; 894 } 895 896 /* 897 * A bit slower than XXH3_mix16B, but handles multiply by zero better. 898 */ 899 LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t 900 XXH128_mix32B(XXH128_hash_t acc, const uint8_t *input_1, const uint8_t *input_2, 901 const uint8_t *secret, uint64_t seed) { 902 acc.low64 += XXH3_mix16B(input_1, secret + 0, seed); 903 acc.low64 ^= endian::read64le(input_2) + endian::read64le(input_2 + 8); 904 acc.high64 += XXH3_mix16B(input_2, secret + 16, seed); 905 acc.high64 ^= endian::read64le(input_1) + endian::read64le(input_1 + 8); 906 return acc; 907 } 908 909 LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t 910 XXH3_len_17to128_128b(const uint8_t *input, size_t len, const uint8_t *secret, 911 size_t secretSize, uint64_t seed) { 912 (void)secretSize; 913 914 XXH128_hash_t acc; 915 acc.low64 = len * PRIME64_1; 916 acc.high64 = 0; 917 918 if (len > 32) { 919 if (len > 64) { 920 if (len > 96) { 921 acc = 922 XXH128_mix32B(acc, input + 48, input + len - 64, secret + 96, seed); 923 } 924 acc = XXH128_mix32B(acc, input + 32, input + len - 48, secret + 64, seed); 925 } 926 acc = XXH128_mix32B(acc, input + 16, input + len - 32, secret + 32, seed); 927 } 928 acc = XXH128_mix32B(acc, input, input + len - 16, secret, seed); 929 XXH128_hash_t h128; 930 h128.low64 = acc.low64 + acc.high64; 931 h128.high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) + 932 ((len - seed) * PRIME64_2); 933 h128.low64 = XXH3_avalanche(h128.low64); 934 h128.high64 = (uint64_t)0 - XXH3_avalanche(h128.high64); 935 return h128; 936 } 937 938 LLVM_ATTRIBUTE_NOINLINE static XXH128_hash_t 939 XXH3_len_129to240_128b(const uint8_t *input, size_t len, const uint8_t *secret, 940 size_t secretSize, uint64_t seed) { 941 (void)secretSize; 942 943 XXH128_hash_t acc; 944 unsigned i; 945 acc.low64 = len * PRIME64_1; 946 acc.high64 = 0; 947 /* 948 * We set as `i` as offset + 32. We do this so that unchanged 949 * `len` can be used as upper bound. This reaches a sweet spot 950 * where both x86 and aarch64 get simple agen and good codegen 951 * for the loop. 952 */ 953 for (i = 32; i < 160; i += 32) { 954 acc = XXH128_mix32B(acc, input + i - 32, input + i - 16, secret + i - 32, 955 seed); 956 } 957 acc.low64 = XXH3_avalanche(acc.low64); 958 acc.high64 = XXH3_avalanche(acc.high64); 959 /* 960 * NB: `i <= len` will duplicate the last 32-bytes if 961 * len % 32 was zero. This is an unfortunate necessity to keep 962 * the hash result stable. 963 */ 964 for (i = 160; i <= len; i += 32) { 965 acc = XXH128_mix32B(acc, input + i - 32, input + i - 16, 966 secret + XXH3_MIDSIZE_STARTOFFSET + i - 160, seed); 967 } 968 /* last bytes */ 969 acc = 970 XXH128_mix32B(acc, input + len - 16, input + len - 32, 971 secret + XXH3_SECRETSIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, 972 (uint64_t)0 - seed); 973 974 XXH128_hash_t h128; 975 h128.low64 = acc.low64 + acc.high64; 976 h128.high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) + 977 ((len - seed) * PRIME64_2); 978 h128.low64 = XXH3_avalanche(h128.low64); 979 h128.high64 = (uint64_t)0 - XXH3_avalanche(h128.high64); 980 return h128; 981 } 982 983 LLVM_ATTRIBUTE_ALWAYS_INLINE XXH128_hash_t 984 XXH3_hashLong_128b(const uint8_t *input, size_t len, const uint8_t *secret, 985 size_t secretSize) { 986 const size_t nbStripesPerBlock = 987 (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; 988 const size_t block_len = XXH_STRIPE_LEN * nbStripesPerBlock; 989 const size_t nb_blocks = (len - 1) / block_len; 990 alignas(16) uint64_t acc[XXH_ACC_NB] = { 991 PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, 992 PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1, 993 }; 994 995 for (size_t n = 0; n < nb_blocks; ++n) { 996 XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock); 997 XXH3_scrambleAcc(acc, secret + secretSize - XXH_STRIPE_LEN); 998 } 999 1000 /* last partial block */ 1001 const size_t nbStripes = (len - 1 - (block_len * nb_blocks)) / XXH_STRIPE_LEN; 1002 assert(nbStripes <= secretSize / XXH_SECRET_CONSUME_RATE); 1003 XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes); 1004 1005 /* last stripe */ 1006 constexpr size_t XXH_SECRET_LASTACC_START = 7; 1007 XXH3_accumulate_512(acc, input + len - XXH_STRIPE_LEN, 1008 secret + secretSize - XXH_STRIPE_LEN - 1009 XXH_SECRET_LASTACC_START); 1010 1011 /* converge into final hash */ 1012 static_assert(sizeof(acc) == 64); 1013 XXH128_hash_t h128; 1014 constexpr size_t XXH_SECRET_MERGEACCS_START = 11; 1015 h128.low64 = XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, 1016 (uint64_t)len * PRIME64_1); 1017 h128.high64 = XXH3_mergeAccs( 1018 acc, secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START, 1019 ~((uint64_t)len * PRIME64_2)); 1020 return h128; 1021 } 1022 1023 llvm::XXH128_hash_t llvm::xxh3_128bits(ArrayRef<uint8_t> data) { 1024 size_t len = data.size(); 1025 const uint8_t *input = data.data(); 1026 1027 /* 1028 * If an action is to be taken if `secret` conditions are not respected, 1029 * it should be done here. 1030 * For now, it's a contract pre-condition. 1031 * Adding a check and a branch here would cost performance at every hash. 1032 */ 1033 if (len <= 16) 1034 return XXH3_len_0to16_128b(input, len, kSecret, /*seed64=*/0); 1035 if (len <= 128) 1036 return XXH3_len_17to128_128b(input, len, kSecret, sizeof(kSecret), 1037 /*seed64=*/0); 1038 if (len <= XXH3_MIDSIZE_MAX) 1039 return XXH3_len_129to240_128b(input, len, kSecret, sizeof(kSecret), 1040 /*seed64=*/0); 1041 return XXH3_hashLong_128b(input, len, kSecret, sizeof(kSecret)); 1042 } 1043