/*
 *  xxHash - Extremely Fast Hash algorithm
 *  Copyright (C) 2012-2023, Yann Collet
 *
 *  BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions are
 *  met:
 *
 *  * Redistributions of source code must retain the above copyright
 *  notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above
 *  copyright notice, this list of conditions and the following disclaimer
 *  in the documentation and/or other materials provided with the
 *  distribution.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 *  You can contact the author at :
 *  - xxHash homepage: http://www.xxhash.com
 *  - xxHash source repository : https://github.com/Cyan4973/xxHash
 */

// xxhash64 is based on commit d2df04efcbef7d7f6886d345861e5dfda4edacc1. Removed
// everything but a simple interface for computing xxh64.

// xxh3_64bits is based on commit d5891596637d21366b9b1dcf2c0007a3edb26a9e (July
// 2023).

// xxh3_128bits is based on commit b0adcc54188c3130b1793e7b19c62eb1e669f7df
// (June 2024).

#include "llvm/Support/xxhash.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Endian.h"

#include <stdlib.h>

#if !defined(LLVM_XXH_USE_NEON)
#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) &&      \
    !defined(__ARM_BIG_ENDIAN)
#define LLVM_XXH_USE_NEON 1
#else
#define LLVM_XXH_USE_NEON 0
#endif
#endif

#if LLVM_XXH_USE_NEON
#include <arm_neon.h>
#endif

using namespace llvm;
using namespace support;

static uint64_t rotl64(uint64_t X, size_t R) {
  return (X << R) | (X >> (64 - R));
}

constexpr uint32_t PRIME32_1 = 0x9E3779B1;
constexpr uint32_t PRIME32_2 = 0x85EBCA77;
constexpr uint32_t PRIME32_3 = 0xC2B2AE3D;

static const uint64_t PRIME64_1 = 11400714785074694791ULL;
static const uint64_t PRIME64_2 = 14029467366897019727ULL;
static const uint64_t PRIME64_3 = 1609587929392839161ULL;
static const uint64_t PRIME64_4 = 9650029242287828579ULL;
static const uint64_t PRIME64_5 = 2870177450012600261ULL;

static uint64_t round(uint64_t Acc, uint64_t Input) {
  Acc += Input * PRIME64_2;
  Acc = rotl64(Acc, 31);
  Acc *= PRIME64_1;
  return Acc;
}

static uint64_t mergeRound(uint64_t Acc, uint64_t Val) {
  Val = round(0, Val);
  Acc ^= Val;
  Acc = Acc * PRIME64_1 + PRIME64_4;
  return Acc;
}

static uint64_t XXH64_avalanche(uint64_t hash) {
  hash ^= hash >> 33;
  hash *= PRIME64_2;
  hash ^= hash >> 29;
  hash *= PRIME64_3;
  hash ^= hash >> 32;
  return hash;
}

uint64_t llvm::xxHash64(StringRef Data) {
  size_t Len = Data.size();
  uint64_t Seed = 0;
  const unsigned char *P = Data.bytes_begin();
  const unsigned char *const BEnd = Data.bytes_end();
  uint64_t H64;

  if (Len >= 32) {
    const unsigned char *const Limit = BEnd - 32;
    uint64_t V1 = Seed + PRIME64_1 + PRIME64_2;
    uint64_t V2 = Seed + PRIME64_2;
    uint64_t V3 = Seed + 0;
    uint64_t V4 = Seed - PRIME64_1;

    do {
      V1 = round(V1, endian::read64le(P));
      P += 8;
      V2 = round(V2, endian::read64le(P));
      P += 8;
      V3 = round(V3, endian::read64le(P));
      P += 8;
      V4 = round(V4, endian::read64le(P));
      P += 8;
    } while (P <= Limit);

    H64 = rotl64(V1, 1) + rotl64(V2, 7) + rotl64(V3, 12) + rotl64(V4, 18);
    H64 = mergeRound(H64, V1);
    H64 = mergeRound(H64, V2);
    H64 = mergeRound(H64, V3);
    H64 = mergeRound(H64, V4);

  } else {
    H64 = Seed + PRIME64_5;
  }

  H64 += (uint64_t)Len;

  while (reinterpret_cast<uintptr_t>(P) + 8 <=
         reinterpret_cast<uintptr_t>(BEnd)) {
    uint64_t const K1 = round(0, endian::read64le(P));
    H64 ^= K1;
    H64 = rotl64(H64, 27) * PRIME64_1 + PRIME64_4;
    P += 8;
  }

  if (reinterpret_cast<uintptr_t>(P) + 4 <= reinterpret_cast<uintptr_t>(BEnd)) {
    H64 ^= (uint64_t)(endian::read32le(P)) * PRIME64_1;
    H64 = rotl64(H64, 23) * PRIME64_2 + PRIME64_3;
    P += 4;
  }

  while (P < BEnd) {
    H64 ^= (*P) * PRIME64_5;
    H64 = rotl64(H64, 11) * PRIME64_1;
    P++;
  }

  return XXH64_avalanche(H64);
}

uint64_t llvm::xxHash64(ArrayRef<uint8_t> Data) {
  return xxHash64({(const char *)Data.data(), Data.size()});
}
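
// Usage sketch (illustrative; not part of the upstream xxHash sources). Both
// overloads above hash raw bytes with an implicit seed of 0, so a StringRef and
// an ArrayRef<uint8_t> over the same bytes hash to the same value:
//
//   llvm::StringRef S = "example";
//   uint64_t H1 = llvm::xxHash64(S);
//   uint64_t H2 = llvm::xxHash64(llvm::ArrayRef<uint8_t>(
//       reinterpret_cast<const uint8_t *>(S.data()), S.size()));
//   assert(H1 == H2);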

constexpr size_t XXH3_SECRETSIZE_MIN = 136;
constexpr size_t XXH_SECRET_DEFAULT_SIZE = 192;

/* Pseudorandom data taken directly from FARSH */
// clang-format off
constexpr uint8_t kSecret[XXH_SECRET_DEFAULT_SIZE] = {
    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
    0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
    0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
    0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
    0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
    0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
    0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
};
// clang-format on

constexpr uint64_t PRIME_MX1 = 0x165667919E3779F9;
constexpr uint64_t PRIME_MX2 = 0x9FB21C651E98DF25;

// Calculates a 64-bit to 128-bit multiply, then XOR folds it.
static uint64_t XXH3_mul128_fold64(uint64_t lhs, uint64_t rhs) {
#if defined(__SIZEOF_INT128__) ||                                              \
    (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
  __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs;
  return uint64_t(product) ^ uint64_t(product >> 64);

#else
  /* First calculate all of the cross products. */
  const uint64_t lo_lo = (lhs & 0xFFFFFFFF) * (rhs & 0xFFFFFFFF);
  const uint64_t hi_lo = (lhs >> 32) * (rhs & 0xFFFFFFFF);
  const uint64_t lo_hi = (lhs & 0xFFFFFFFF) * (rhs >> 32);
  const uint64_t hi_hi = (lhs >> 32) * (rhs >> 32);

  /* Now add the products together. These will never overflow. */
  const uint64_t cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
  const uint64_t upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
  const uint64_t lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);

  return upper ^ lower;
#endif
}
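
// Illustrative sanity check (not from the upstream sources): with
// lhs = rhs = 2^32 the 128-bit product is exactly 2^64, i.e. high64 == 1 and
// low64 == 0, so the XOR fold returns 1.
//
//   assert(XXH3_mul128_fold64(1ULL << 32, 1ULL << 32) == 1);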

constexpr size_t XXH_STRIPE_LEN = 64;
constexpr size_t XXH_SECRET_CONSUME_RATE = 8;
constexpr size_t XXH_ACC_NB = XXH_STRIPE_LEN / sizeof(uint64_t);

static uint64_t XXH3_avalanche(uint64_t hash) {
  hash ^= hash >> 37;
  hash *= PRIME_MX1;
  hash ^= hash >> 32;
  return hash;
}

static uint64_t XXH3_len_1to3_64b(const uint8_t *input, size_t len,
                                  const uint8_t *secret, uint64_t seed) {
  const uint8_t c1 = input[0];
  const uint8_t c2 = input[len >> 1];
  const uint8_t c3 = input[len - 1];
  uint32_t combined = ((uint32_t)c1 << 16) | ((uint32_t)c2 << 24) |
                      ((uint32_t)c3 << 0) | ((uint32_t)len << 8);
  uint64_t bitflip =
      (uint64_t)(endian::read32le(secret) ^ endian::read32le(secret + 4)) +
      seed;
  return XXH64_avalanche(uint64_t(combined) ^ bitflip);
}

static uint64_t XXH3_len_4to8_64b(const uint8_t *input, size_t len,
                                  const uint8_t *secret, uint64_t seed) {
  seed ^= (uint64_t)byteswap(uint32_t(seed)) << 32;
  const uint32_t input1 = endian::read32le(input);
  const uint32_t input2 = endian::read32le(input + len - 4);
  uint64_t acc =
      (endian::read64le(secret + 8) ^ endian::read64le(secret + 16)) - seed;
  const uint64_t input64 = (uint64_t)input2 | ((uint64_t)input1 << 32);
  acc ^= input64;
  // XXH3_rrmxmx(acc, len)
  acc ^= rotl64(acc, 49) ^ rotl64(acc, 24);
  acc *= PRIME_MX2;
  acc ^= (acc >> 35) + (uint64_t)len;
  acc *= PRIME_MX2;
  return acc ^ (acc >> 28);
}

static uint64_t XXH3_len_9to16_64b(const uint8_t *input, size_t len,
                                   const uint8_t *secret, uint64_t const seed) {
  uint64_t input_lo =
      (endian::read64le(secret + 24) ^ endian::read64le(secret + 32)) + seed;
  uint64_t input_hi =
      (endian::read64le(secret + 40) ^ endian::read64le(secret + 48)) - seed;
  input_lo ^= endian::read64le(input);
  input_hi ^= endian::read64le(input + len - 8);
  uint64_t acc = uint64_t(len) + byteswap(input_lo) + input_hi +
                 XXH3_mul128_fold64(input_lo, input_hi);
  return XXH3_avalanche(acc);
}

LLVM_ATTRIBUTE_ALWAYS_INLINE
static uint64_t XXH3_len_0to16_64b(const uint8_t *input, size_t len,
                                   const uint8_t *secret, uint64_t const seed) {
  if (LLVM_LIKELY(len > 8))
    return XXH3_len_9to16_64b(input, len, secret, seed);
  if (LLVM_LIKELY(len >= 4))
    return XXH3_len_4to8_64b(input, len, secret, seed);
  if (len != 0)
    return XXH3_len_1to3_64b(input, len, secret, seed);
  return XXH64_avalanche(seed ^ endian::read64le(secret + 56) ^
                         endian::read64le(secret + 64));
}

static uint64_t XXH3_mix16B(const uint8_t *input, uint8_t const *secret,
                            uint64_t seed) {
  uint64_t lhs = seed;
  uint64_t rhs = 0U - seed;
  lhs += endian::read64le(secret);
  rhs += endian::read64le(secret + 8);
  lhs ^= endian::read64le(input);
  rhs ^= endian::read64le(input + 8);
  return XXH3_mul128_fold64(lhs, rhs);
}

/* For mid range keys, XXH3 uses a Mum-hash variant. */
LLVM_ATTRIBUTE_ALWAYS_INLINE
static uint64_t XXH3_len_17to128_64b(const uint8_t *input, size_t len,
                                     const uint8_t *secret,
                                     uint64_t const seed) {
  uint64_t acc = len * PRIME64_1, acc_end;
  acc += XXH3_mix16B(input + 0, secret + 0, seed);
  acc_end = XXH3_mix16B(input + len - 16, secret + 16, seed);
  if (len > 32) {
    acc += XXH3_mix16B(input + 16, secret + 32, seed);
    acc_end += XXH3_mix16B(input + len - 32, secret + 48, seed);
    if (len > 64) {
      acc += XXH3_mix16B(input + 32, secret + 64, seed);
      acc_end += XXH3_mix16B(input + len - 48, secret + 80, seed);
      if (len > 96) {
        acc += XXH3_mix16B(input + 48, secret + 96, seed);
        acc_end += XXH3_mix16B(input + len - 64, secret + 112, seed);
      }
    }
  }
  return XXH3_avalanche(acc + acc_end);
}

constexpr size_t XXH3_MIDSIZE_MAX = 240;
constexpr size_t XXH3_MIDSIZE_STARTOFFSET = 3;
constexpr size_t XXH3_MIDSIZE_LASTOFFSET = 17;

LLVM_ATTRIBUTE_NOINLINE
static uint64_t XXH3_len_129to240_64b(const uint8_t *input, size_t len,
                                      const uint8_t *secret, uint64_t seed) {
  uint64_t acc = (uint64_t)len * PRIME64_1;
  const unsigned nbRounds = len / 16;
  for (unsigned i = 0; i < 8; ++i)
    acc += XXH3_mix16B(input + 16 * i, secret + 16 * i, seed);
  acc = XXH3_avalanche(acc);

  for (unsigned i = 8; i < nbRounds; ++i) {
    acc += XXH3_mix16B(input + 16 * i,
                       secret + 16 * (i - 8) + XXH3_MIDSIZE_STARTOFFSET, seed);
  }
  /* last bytes */
  acc +=
      XXH3_mix16B(input + len - 16,
                  secret + XXH3_SECRETSIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
  return XXH3_avalanche(acc);
}

#if LLVM_XXH_USE_NEON

#define XXH3_accumulate_512 XXH3_accumulate_512_neon
#define XXH3_scrambleAcc XXH3_scrambleAcc_neon

// NEON implementation based on commit a57f6cce2698049863af8c25787084ae0489d849
// (July 2024), with the following removed:
// - workaround for suboptimal codegen on older GCC
// - compiler barriers against instruction reordering
// - WebAssembly SIMD support
// - configurable split between NEON and scalar lanes (benchmarking shows no
//   penalty when fully doing SIMD on the Apple M1)

#if defined(__GNUC__) || defined(__clang__)
#define XXH_ALIASING __attribute__((__may_alias__))
#else
#define XXH_ALIASING /* nothing */
#endif

typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING;

LLVM_ATTRIBUTE_ALWAYS_INLINE static uint64x2_t XXH_vld1q_u64(void const *ptr) {
  return vreinterpretq_u64_u8(vld1q_u8((uint8_t const *)ptr));
}

LLVM_ATTRIBUTE_ALWAYS_INLINE
static void XXH3_accumulate_512_neon(uint64_t *acc, const uint8_t *input,
                                     const uint8_t *secret) {
  xxh_aliasing_uint64x2_t *const xacc = (xxh_aliasing_uint64x2_t *)acc;

#ifdef __clang__
#pragma clang loop unroll(full)
#endif
  for (size_t i = 0; i < XXH_ACC_NB / 2; i += 2) {
    /* data_vec = input[i]; */
    uint64x2_t data_vec_1 = XXH_vld1q_u64(input + (i * 16));
    uint64x2_t data_vec_2 = XXH_vld1q_u64(input + ((i + 1) * 16));

    /* key_vec  = secret[i];  */
    uint64x2_t key_vec_1 = XXH_vld1q_u64(secret + (i * 16));
    uint64x2_t key_vec_2 = XXH_vld1q_u64(secret + ((i + 1) * 16));

    /* data_swap = swap(data_vec) */
    uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1);
    uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1);

    /* data_key = data_vec ^ key_vec; */
    uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1);
    uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2);

    /*
     * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a
     * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to
     * get one vector with the low 32 bits of each lane, and one vector
     * with the high 32 bits of each lane.
     *
     * The intrinsic returns a double vector because the original ARMv7-a
     * instruction modified both arguments in place. AArch64 and SIMD128 emit
     * two instructions from this intrinsic.
     *
     *  [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ]
     *  [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ]
     */
    uint32x4x2_t unzipped = vuzpq_u32(vreinterpretq_u32_u64(data_key_1),
                                      vreinterpretq_u32_u64(data_key_2));

    /* data_key_lo = data_key & 0xFFFFFFFF */
    uint32x4_t data_key_lo = unzipped.val[0];
    /* data_key_hi = data_key >> 32 */
    uint32x4_t data_key_hi = unzipped.val[1];

    /*
     * Then we can split the vectors horizontally and multiply; as with most
     * widening intrinsics, there is a variant that works on the high halves
     * for free on AArch64. A similar instruction is available on SIMD128.
     *
     * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi
     */
    uint64x2_t sum_1 = vmlal_u32(data_swap_1, vget_low_u32(data_key_lo),
                                 vget_low_u32(data_key_hi));
    uint64x2_t sum_2 = vmlal_u32(data_swap_2, vget_high_u32(data_key_lo),
                                 vget_high_u32(data_key_hi));

    /* xacc[i] = acc_vec + sum; */
    xacc[i] = vaddq_u64(xacc[i], sum_1);
    xacc[i + 1] = vaddq_u64(xacc[i + 1], sum_2);
  }
}

LLVM_ATTRIBUTE_ALWAYS_INLINE
static void XXH3_scrambleAcc_neon(uint64_t *acc, const uint8_t *secret) {
  xxh_aliasing_uint64x2_t *const xacc = (xxh_aliasing_uint64x2_t *)acc;

  /* { prime32_1, prime32_1 } */
  uint32x2_t const kPrimeLo = vdup_n_u32(PRIME32_1);
  /* { 0, prime32_1, 0, prime32_1 } */
  uint32x4_t const kPrimeHi =
      vreinterpretq_u32_u64(vdupq_n_u64((uint64_t)PRIME32_1 << 32));

  for (size_t i = 0; i < XXH_ACC_NB / 2; ++i) {
    /* xacc[i] ^= (xacc[i] >> 47); */
    uint64x2_t acc_vec = XXH_vld1q_u64(acc + (2 * i));
    uint64x2_t shifted = vshrq_n_u64(acc_vec, 47);
    uint64x2_t data_vec = veorq_u64(acc_vec, shifted);

    /* xacc[i] ^= secret[i]; */
    uint64x2_t key_vec = XXH_vld1q_u64(secret + (i * 16));
    uint64x2_t data_key = veorq_u64(data_vec, key_vec);

    /*
     * xacc[i] *= XXH_PRIME32_1
     *
     * Expanded version with portable NEON intrinsics
     *
     *    lo(x) * lo(y) + (hi(x) * lo(y) << 32)
     *
     * prod_hi = hi(data_key) * lo(prime) << 32
     *
     * Since we only need 32 bits of this multiply, a trick can be used:
     * reinterpreting the vector as a uint32x4_t and multiplying by
     * { 0, prime, 0, prime } cancels out the unwanted bits and avoids the
     * shift.
     */
    uint32x4_t prod_hi = vmulq_u32(vreinterpretq_u32_u64(data_key), kPrimeHi);

    /* Extract low bits for vmlal_u32  */
    uint32x2_t data_key_lo = vmovn_u64(data_key);

    /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */
    xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo);
  }
}
#else

#define XXH3_accumulate_512 XXH3_accumulate_512_scalar
#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar

LLVM_ATTRIBUTE_ALWAYS_INLINE
static void XXH3_accumulate_512_scalar(uint64_t *acc, const uint8_t *input,
                                       const uint8_t *secret) {
  for (size_t i = 0; i < XXH_ACC_NB; ++i) {
    uint64_t data_val = endian::read64le(input + 8 * i);
    uint64_t data_key = data_val ^ endian::read64le(secret + 8 * i);
    acc[i ^ 1] += data_val;
    acc[i] += uint32_t(data_key) * (data_key >> 32);
  }
}

LLVM_ATTRIBUTE_ALWAYS_INLINE
static void XXH3_scrambleAcc_scalar(uint64_t *acc, const uint8_t *secret) {
  for (size_t i = 0; i < XXH_ACC_NB; ++i) {
    acc[i] ^= acc[i] >> 47;
    acc[i] ^= endian::read64le(secret + 8 * i);
    acc[i] *= PRIME32_1;
  }
}
#endif

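// XXH3_accumulate below walks the input one 64-byte stripe at a time
// (XXH_STRIPE_LEN) while advancing the secret by only 8 bytes per stripe
// (XXH_SECRET_CONSUME_RATE), so consecutive stripes mix against heavily
// overlapping 64-byte windows of the secret.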
LLVM_ATTRIBUTE_ALWAYS_INLINE
static void XXH3_accumulate(uint64_t *acc, const uint8_t *input,
                            const uint8_t *secret, size_t nbStripes) {
  for (size_t n = 0; n < nbStripes; ++n) {
    XXH3_accumulate_512(acc, input + n * XXH_STRIPE_LEN,
                        secret + n * XXH_SECRET_CONSUME_RATE);
  }
}

static uint64_t XXH3_mix2Accs(const uint64_t *acc, const uint8_t *secret) {
  return XXH3_mul128_fold64(acc[0] ^ endian::read64le(secret),
                            acc[1] ^ endian::read64le(secret + 8));
}

static uint64_t XXH3_mergeAccs(const uint64_t *acc, const uint8_t *key,
                               uint64_t start) {
  uint64_t result64 = start;
  for (size_t i = 0; i < 4; ++i)
    result64 += XXH3_mix2Accs(acc + 2 * i, key + 16 * i);
  return XXH3_avalanche(result64);
}

LLVM_ATTRIBUTE_NOINLINE
static uint64_t XXH3_hashLong_64b(const uint8_t *input, size_t len,
                                  const uint8_t *secret, size_t secretSize) {
  const size_t nbStripesPerBlock =
      (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
  const size_t block_len = XXH_STRIPE_LEN * nbStripesPerBlock;
  const size_t nb_blocks = (len - 1) / block_len;
  alignas(16) uint64_t acc[XXH_ACC_NB] = {
      PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3,
      PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1,
  };
  for (size_t n = 0; n < nb_blocks; ++n) {
    XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock);
    XXH3_scrambleAcc(acc, secret + secretSize - XXH_STRIPE_LEN);
  }

  /* last partial block */
  const size_t nbStripes = (len - 1 - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
  assert(nbStripes <= secretSize / XXH_SECRET_CONSUME_RATE);
  XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes);

  /* last stripe */
  constexpr size_t XXH_SECRET_LASTACC_START = 7;
  XXH3_accumulate_512(acc, input + len - XXH_STRIPE_LEN,
                      secret + secretSize - XXH_STRIPE_LEN -
                          XXH_SECRET_LASTACC_START);

  /* converge into final hash */
  constexpr size_t XXH_SECRET_MERGEACCS_START = 11;
  return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START,
                        (uint64_t)len * PRIME64_1);
}

uint64_t llvm::xxh3_64bits(ArrayRef<uint8_t> data) {
  auto *in = data.data();
  size_t len = data.size();
  if (len <= 16)
    return XXH3_len_0to16_64b(in, len, kSecret, 0);
  if (len <= 128)
    return XXH3_len_17to128_64b(in, len, kSecret, 0);
  if (len <= XXH3_MIDSIZE_MAX)
    return XXH3_len_129to240_64b(in, len, kSecret, 0);
  return XXH3_hashLong_64b(in, len, kSecret, sizeof(kSecret));
}
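
// Usage sketch (illustrative; not part of the upstream sources): every input
// goes through this one entry point, which picks the short, mid-size, or long
// routine purely from the length.
//
//   std::vector<uint8_t> Buf(1000, 0xAB);
//   llvm::ArrayRef<uint8_t> Bytes(Buf);
//   uint64_t Small = llvm::xxh3_64bits(Bytes.take_front(8));   // 0-16 byte path
//   uint64_t Mid = llvm::xxh3_64bits(Bytes.take_front(200));   // 129-240 byte path
//   uint64_t Long = llvm::xxh3_64bits(Bytes);                  // > 240 bytes, XXH3_hashLong_64b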

/* ==========================================
 * XXH3 128 bits (a.k.a XXH128)
 * ==========================================
 * XXH3's 128-bit variant has better mixing and strength than the 64-bit
 * variant, even without counting the significantly larger output size.
 *
 * For example, extra steps are taken to avoid the seed-dependent collisions
 * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
 *
 * This strength naturally comes at the cost of some speed, especially on short
 * lengths. Note that longer hashes are about as fast as the 64-bit version
 * due to it using only a slight modification of the 64-bit loop.
 *
 * XXH128 is also more oriented towards 64-bit machines. It is still extremely
 * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
 */

/*!
 * @internal
 * @def XXH_rotl32(x,r)
 * @brief 32-bit rotate left.
 *
 * @param x The 32-bit integer to be rotated.
 * @param r The number of bits to rotate.
 * @pre
 *   @p r > 0 && @p r < 32
 * @note
 *   @p x and @p r may be evaluated multiple times.
 * @return The rotated result.
 */
#if __has_builtin(__builtin_rotateleft32) &&                                   \
    __has_builtin(__builtin_rotateleft64)
#define XXH_rotl32 __builtin_rotateleft32
#define XXH_rotl64 __builtin_rotateleft64
/* Note: although _rotl exists for minGW (GCC under windows), performance seems
 * poor */
#elif defined(_MSC_VER)
#define XXH_rotl32(x, r) _rotl(x, r)
#define XXH_rotl64(x, r) _rotl64(x, r)
#else
#define XXH_rotl32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))
#define XXH_rotl64(x, r) (((x) << (r)) | ((x) >> (64 - (r))))
#endif

#define XXH_mult32to64(x, y) ((uint64_t)(uint32_t)(x) * (uint64_t)(uint32_t)(y))

/*!
 * @brief Calculates a 64->128-bit long multiply.
 *
 * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar
 * version.
 *
 * @param lhs , rhs The 64-bit integers to be multiplied
 * @return The 128-bit result represented in an @ref XXH128_hash_t.
 */
static XXH128_hash_t XXH_mult64to128(uint64_t lhs, uint64_t rhs) {
  /*
   * GCC/Clang __uint128_t method.
   *
   * On most 64-bit targets, GCC and Clang define a __uint128_t type.
   * This is usually the best way as it usually uses a native long 64-bit
   * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
   *
   * Usually.
   *
   * Despite wasm being a 32-bit platform, Clang (and Emscripten) define this
   * type even though they lack native arithmetic for it. This results in a
   * slow compiler-builtin call which calculates a full 128-bit multiply.
   * In that case it is best to use the portable one.
   * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
   */
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) &&         \
        defined(__SIZEOF_INT128__) ||                                          \
    (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)

  __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
  XXH128_hash_t r128;
  r128.low64 = (uint64_t)(product);
  r128.high64 = (uint64_t)(product >> 64);
  return r128;

  /*
   * MSVC for x64's _umul128 method.
   *
   * uint64_t _umul128(uint64_t Multiplier, uint64_t Multiplicand, uint64_t
   * *HighProduct);
   *
   * This compiles to single operand MUL on x64.
   */
#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC)

#ifndef _MSC_VER
#pragma intrinsic(_umul128)
#endif
  uint64_t product_high;
  uint64_t const product_low = _umul128(lhs, rhs, &product_high);
  XXH128_hash_t r128;
  r128.low64 = product_low;
  r128.high64 = product_high;
  return r128;

  /*
   * MSVC for ARM64's __umulh method.
   *
   * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method.
   */
#elif defined(_M_ARM64) || defined(_M_ARM64EC)

#ifndef _MSC_VER
#pragma intrinsic(__umulh)
#endif
  XXH128_hash_t r128;
  r128.low64 = lhs * rhs;
  r128.high64 = __umulh(lhs, rhs);
  return r128;

#else
  /*
   * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
   *
   * This is a fast and simple grade school multiply, which is shown below
   * with base 10 arithmetic instead of base 0x100000000.
   *
   *           9 3 // D2 lhs = 93
   *         x 7 5 // D2 rhs = 75
   *     ----------
   *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
   *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
   *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
   *     + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
   *     ---------
   *         2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
   *     + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67
   *     ---------
   *       6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975
   *
   * The reasons for adding the products like this are:
   *  1. It avoids manual carry tracking. Just like how
   *     (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
   *     This avoids a lot of complexity.
   *
   *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
   *     instruction available in ARM's Digital Signal Processing extension
   *     in 32-bit ARMv6 and later, which is shown below:
   *
   *         void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
   *         {
   *             uint64_t product = (uint64_t)*RdLo * (uint64_t)*RdHi + Rn + Rm;
   *             *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
   *             *RdHi = (xxh_u32)(product >> 32);
   *         }
   *
   *     This instruction was designed for efficient long multiplication, and
   *     allows this to be calculated in only 4 instructions at speeds
   *     comparable to some 64-bit ALUs.
   *
   *  3. It isn't terrible on other platforms. Usually this will be a couple
   *     of 32-bit ADD/ADCs.
   */

  /* First calculate all of the cross products. */
  uint64_t const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
  uint64_t const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
  uint64_t const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
  uint64_t const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);

  /* Now add the products together. These will never overflow. */
  uint64_t const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
  uint64_t const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
  uint64_t const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);

  XXH128_hash_t r128;
  r128.low64 = lower;
  r128.high64 = upper;
  return r128;
#endif
}
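
// Illustrative check (not from the upstream sources): (2^64 - 1) * 2 equals
// 2^65 - 2, which splits into high64 == 1 and low64 == 0xFFFFFFFFFFFFFFFE
// regardless of which branch above was compiled in.
//
//   XXH128_hash_t P = XXH_mult64to128(~0ULL, 2);
//   assert(P.high64 == 1 && P.low64 == 0xFFFFFFFFFFFFFFFEULL);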

/*! Seems to produce slightly better code on GCC for some reason. */
LLVM_ATTRIBUTE_ALWAYS_INLINE constexpr uint64_t XXH_xorshift64(uint64_t v64,
                                                               int shift) {
  return v64 ^ (v64 >> shift);
}

LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t
XXH3_len_1to3_128b(const uint8_t *input, size_t len, const uint8_t *secret,
                   uint64_t seed) {
  /* A doubled version of 1to3_64b with different constants. */
  /*
   * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
   * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
   * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
   */
  uint8_t const c1 = input[0];
  uint8_t const c2 = input[len >> 1];
  uint8_t const c3 = input[len - 1];
  uint32_t const combinedl = ((uint32_t)c1 << 16) | ((uint32_t)c2 << 24) |
                             ((uint32_t)c3 << 0) | ((uint32_t)len << 8);
  uint32_t const combinedh = XXH_rotl32(byteswap(combinedl), 13);
  uint64_t const bitflipl =
      (endian::read32le(secret) ^ endian::read32le(secret + 4)) + seed;
  uint64_t const bitfliph =
      (endian::read32le(secret + 8) ^ endian::read32le(secret + 12)) - seed;
  uint64_t const keyed_lo = (uint64_t)combinedl ^ bitflipl;
  uint64_t const keyed_hi = (uint64_t)combinedh ^ bitfliph;
  XXH128_hash_t h128;
  h128.low64 = XXH64_avalanche(keyed_lo);
  h128.high64 = XXH64_avalanche(keyed_hi);
  return h128;
}

LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t
XXH3_len_4to8_128b(const uint8_t *input, size_t len, const uint8_t *secret,
                   uint64_t seed) {
  seed ^= (uint64_t)byteswap((uint32_t)seed) << 32;
  uint32_t const input_lo = endian::read32le(input);
  uint32_t const input_hi = endian::read32le(input + len - 4);
  uint64_t const input_64 = input_lo + ((uint64_t)input_hi << 32);
  uint64_t const bitflip =
      (endian::read64le(secret + 16) ^ endian::read64le(secret + 24)) + seed;
  uint64_t const keyed = input_64 ^ bitflip;

  /* Shift len to the left to ensure it is even; this avoids even multiplies.
   */
  XXH128_hash_t m128 = XXH_mult64to128(keyed, PRIME64_1 + (len << 2));

  m128.high64 += (m128.low64 << 1);
  m128.low64 ^= (m128.high64 >> 3);

  m128.low64 = XXH_xorshift64(m128.low64, 35);
  m128.low64 *= PRIME_MX2;
  m128.low64 = XXH_xorshift64(m128.low64, 28);
  m128.high64 = XXH3_avalanche(m128.high64);
  return m128;
}

LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t
XXH3_len_9to16_128b(const uint8_t *input, size_t len, const uint8_t *secret,
                    uint64_t seed) {
  uint64_t const bitflipl =
      (endian::read64le(secret + 32) ^ endian::read64le(secret + 40)) - seed;
  uint64_t const bitfliph =
      (endian::read64le(secret + 48) ^ endian::read64le(secret + 56)) + seed;
  uint64_t const input_lo = endian::read64le(input);
  uint64_t input_hi = endian::read64le(input + len - 8);
  XXH128_hash_t m128 =
      XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, PRIME64_1);
  /*
   * Put len in the middle of m128 to ensure that the length gets mixed to
   * both the low and high bits in the 128x64 multiply below.
   */
  m128.low64 += (uint64_t)(len - 1) << 54;
  input_hi ^= bitfliph;
  /*
   * Add the high 32 bits of input_hi to the high 32 bits of m128, then
   * add the long product of the low 32 bits of input_hi and PRIME32_2 to
   * the high 64 bits of m128.
   *
   * The best approach to this operation is different on 32-bit and 64-bit.
   */
  if (sizeof(void *) < sizeof(uint64_t)) { /* 32-bit */
    /*
     * 32-bit optimized version, which is more readable.
     *
     * On 32-bit, it removes an ADC and delays a dependency between the two
     * halves of m128.high64, but it generates an extra mask on 64-bit.
     */
    m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) +
                   XXH_mult32to64((uint32_t)input_hi, PRIME32_2);
  } else {
    /*
     * 64-bit optimized (albeit more confusing) version.
     *
     * Uses some properties of addition and multiplication to remove the mask:
     *
     * Let:
     *    a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
     *    b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
     *    c = PRIME32_2
     *
     *    a + (b * c)
     * Inverse Property: x + y - x == y
     *    a + (b * (1 + c - 1))
     * Distributive Property: x * (y + z) == (x * y) + (x * z)
     *    a + (b * 1) + (b * (c - 1))
     * Identity Property: x * 1 == x
     *    a + b + (b * (c - 1))
     *
     * Substitute a, b, and c:
     *    input_hi.hi + input_hi.lo + ((uint64_t)input_hi.lo * (PRIME32_2
     * - 1))
     *
     * Since input_hi.hi + input_hi.lo == input_hi, we get this:
     *    input_hi + ((uint64_t)input_hi.lo * (PRIME32_2 - 1))
     */
    m128.high64 += input_hi + XXH_mult32to64((uint32_t)input_hi, PRIME32_2 - 1);
  }
  /* m128 ^= XXH_swap64(m128 >> 64); */
  m128.low64 ^= byteswap(m128.high64);

  /* 128x64 multiply: h128 = m128 * PRIME64_2; */
  XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2);
  h128.high64 += m128.high64 * PRIME64_2;

  h128.low64 = XXH3_avalanche(h128.low64);
  h128.high64 = XXH3_avalanche(h128.high64);
  return h128;
}

/*
 * Assumption: `secret` size is >= XXH3_SECRETSIZE_MIN
 */
LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t
XXH3_len_0to16_128b(const uint8_t *input, size_t len, const uint8_t *secret,
                    uint64_t seed) {
  if (len > 8)
    return XXH3_len_9to16_128b(input, len, secret, seed);
  if (len >= 4)
    return XXH3_len_4to8_128b(input, len, secret, seed);
  if (len)
    return XXH3_len_1to3_128b(input, len, secret, seed);
  XXH128_hash_t h128;
  uint64_t const bitflipl =
      endian::read64le(secret + 64) ^ endian::read64le(secret + 72);
  uint64_t const bitfliph =
      endian::read64le(secret + 80) ^ endian::read64le(secret + 88);
  h128.low64 = XXH64_avalanche(seed ^ bitflipl);
  h128.high64 = XXH64_avalanche(seed ^ bitfliph);
  return h128;
}

/*
 * A bit slower than XXH3_mix16B, but handles multiply by zero better.
 */
LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t
XXH128_mix32B(XXH128_hash_t acc, const uint8_t *input_1, const uint8_t *input_2,
              const uint8_t *secret, uint64_t seed) {
  acc.low64 += XXH3_mix16B(input_1, secret + 0, seed);
  acc.low64 ^= endian::read64le(input_2) + endian::read64le(input_2 + 8);
  acc.high64 += XXH3_mix16B(input_2, secret + 16, seed);
  acc.high64 ^= endian::read64le(input_1) + endian::read64le(input_1 + 8);
  return acc;
}

LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t
XXH3_len_17to128_128b(const uint8_t *input, size_t len, const uint8_t *secret,
                      size_t secretSize, uint64_t seed) {
  (void)secretSize;

  XXH128_hash_t acc;
  acc.low64 = len * PRIME64_1;
  acc.high64 = 0;

  if (len > 32) {
    if (len > 64) {
      if (len > 96) {
        acc =
            XXH128_mix32B(acc, input + 48, input + len - 64, secret + 96, seed);
      }
      acc = XXH128_mix32B(acc, input + 32, input + len - 48, secret + 64, seed);
    }
    acc = XXH128_mix32B(acc, input + 16, input + len - 32, secret + 32, seed);
  }
  acc = XXH128_mix32B(acc, input, input + len - 16, secret, seed);
  XXH128_hash_t h128;
  h128.low64 = acc.low64 + acc.high64;
  h128.high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) +
                ((len - seed) * PRIME64_2);
  h128.low64 = XXH3_avalanche(h128.low64);
  h128.high64 = (uint64_t)0 - XXH3_avalanche(h128.high64);
  return h128;
}

LLVM_ATTRIBUTE_NOINLINE static XXH128_hash_t
XXH3_len_129to240_128b(const uint8_t *input, size_t len, const uint8_t *secret,
                       size_t secretSize, uint64_t seed) {
  (void)secretSize;

  XXH128_hash_t acc;
  unsigned i;
  acc.low64 = len * PRIME64_1;
  acc.high64 = 0;
  /*
   * We set `i` to offset + 32 so that the unchanged `len` can be used as the
   * upper bound. This hits a sweet spot where both x86 and aarch64 get simple
   * address generation and good codegen for the loop.
   */
  for (i = 32; i < 160; i += 32) {
    acc = XXH128_mix32B(acc, input + i - 32, input + i - 16, secret + i - 32,
                        seed);
  }
  acc.low64 = XXH3_avalanche(acc.low64);
  acc.high64 = XXH3_avalanche(acc.high64);
  /*
   * NB: `i <= len` will duplicate the last 32 bytes if len % 32 is zero. This
   * is an unfortunate necessity to keep the hash result stable.
   */
  for (i = 160; i <= len; i += 32) {
    acc = XXH128_mix32B(acc, input + i - 32, input + i - 16,
                        secret + XXH3_MIDSIZE_STARTOFFSET + i - 160, seed);
  }
  /* last bytes */
  acc =
      XXH128_mix32B(acc, input + len - 16, input + len - 32,
                    secret + XXH3_SECRETSIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
                    (uint64_t)0 - seed);

  XXH128_hash_t h128;
  h128.low64 = acc.low64 + acc.high64;
  h128.high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) +
                ((len - seed) * PRIME64_2);
  h128.low64 = XXH3_avalanche(h128.low64);
  h128.high64 = (uint64_t)0 - XXH3_avalanche(h128.high64);
  return h128;
}

LLVM_ATTRIBUTE_ALWAYS_INLINE XXH128_hash_t
XXH3_hashLong_128b(const uint8_t *input, size_t len, const uint8_t *secret,
                   size_t secretSize) {
  const size_t nbStripesPerBlock =
      (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
  const size_t block_len = XXH_STRIPE_LEN * nbStripesPerBlock;
  const size_t nb_blocks = (len - 1) / block_len;
  alignas(16) uint64_t acc[XXH_ACC_NB] = {
      PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3,
      PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1,
  };

  for (size_t n = 0; n < nb_blocks; ++n) {
    XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock);
    XXH3_scrambleAcc(acc, secret + secretSize - XXH_STRIPE_LEN);
  }

  /* last partial block */
  const size_t nbStripes = (len - 1 - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
  assert(nbStripes <= secretSize / XXH_SECRET_CONSUME_RATE);
  XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes);

  /* last stripe */
  constexpr size_t XXH_SECRET_LASTACC_START = 7;
  XXH3_accumulate_512(acc, input + len - XXH_STRIPE_LEN,
                      secret + secretSize - XXH_STRIPE_LEN -
                          XXH_SECRET_LASTACC_START);

  /* converge into final hash */
  static_assert(sizeof(acc) == 64);
  XXH128_hash_t h128;
  constexpr size_t XXH_SECRET_MERGEACCS_START = 11;
  h128.low64 = XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START,
                              (uint64_t)len * PRIME64_1);
  h128.high64 = XXH3_mergeAccs(
      acc, secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
      ~((uint64_t)len * PRIME64_2));
  return h128;
}

llvm::XXH128_hash_t llvm::xxh3_128bits(ArrayRef<uint8_t> data) {
  size_t len = data.size();
  const uint8_t *input = data.data();

  /*
   * If an action is to be taken if `secret` conditions are not respected,
   * it should be done here.
   * For now, it's a contract pre-condition.
   * Adding a check and a branch here would cost performance at every hash.
   */
  if (len <= 16)
    return XXH3_len_0to16_128b(input, len, kSecret, /*seed64=*/0);
  if (len <= 128)
    return XXH3_len_17to128_128b(input, len, kSecret, sizeof(kSecret),
                                 /*seed64=*/0);
  if (len <= XXH3_MIDSIZE_MAX)
    return XXH3_len_129to240_128b(input, len, kSecret, sizeof(kSecret),
                                  /*seed64=*/0);
  return XXH3_hashLong_128b(input, len, kSecret, sizeof(kSecret));
}
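
// Usage sketch (illustrative; not part of the upstream sources), assuming
// `Bytes` is any contiguous byte buffer viewable as an ArrayRef<uint8_t>:
//
//   llvm::XXH128_hash_t H = llvm::xxh3_128bits(Bytes);
//   uint64_t Lo = H.low64;   // low 64 bits of the 128-bit digest
//   uint64_t Hi = H.high64;  // high 64 bits of the 128-bit digest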