xref: /freebsd/contrib/llvm-project/llvm/lib/Support/xxhash.cpp (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 /*
2  *  xxHash - Extremely Fast Hash algorithm
3  *  Copyright (C) 2012-2023, Yann Collet
4  *
5  *  BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
6  *
7  *  Redistribution and use in source and binary forms, with or without
8  *  modification, are permitted provided that the following conditions are
9  *  met:
10  *
11  *  * Redistributions of source code must retain the above copyright
12  *  notice, this list of conditions and the following disclaimer.
13  *  * Redistributions in binary form must reproduce the above
14  *  copyright notice, this list of conditions and the following disclaimer
15  *  in the documentation and/or other materials provided with the
16  *  distribution.
17  *
18  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21  *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23  *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24  *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28  *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  *
30  *  You can contact the author at :
31  *  - xxHash homepage: http://www.xxhash.com
32  *  - xxHash source repository : https://github.com/Cyan4973/xxHash
33  */
34 
35 // xxhash64 is based on commit d2df04efcbef7d7f6886d345861e5dfda4edacc1. Removed
36 // everything but a simple interface for computing xxh64.
37 
38 // xxh3_64bits is based on commit d5891596637d21366b9b1dcf2c0007a3edb26a9e (July
39 // 2023).
40 
41 // xxh3_128bits is based on commit b0adcc54188c3130b1793e7b19c62eb1e669f7df
42 // (June 2024).
43 
44 #include "llvm/Support/xxhash.h"
45 #include "llvm/Support/Compiler.h"
46 #include "llvm/Support/Endian.h"
47 
48 #include <stdlib.h>
49 
50 #if !defined(LLVM_XXH_USE_NEON)
51 #if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) &&      \
52     !defined(__ARM_BIG_ENDIAN)
53 #define LLVM_XXH_USE_NEON 1
54 #else
55 #define LLVM_XXH_USE_NEON 0
56 #endif
57 #endif
58 
59 #if LLVM_XXH_USE_NEON
60 #include <arm_neon.h>
61 #endif
62 
63 using namespace llvm;
64 using namespace support;
65 
66 static uint64_t rotl64(uint64_t X, size_t R) {
67   return (X << R) | (X >> (64 - R));
68 }
69 
70 constexpr uint32_t PRIME32_1 = 0x9E3779B1;
71 constexpr uint32_t PRIME32_2 = 0x85EBCA77;
72 constexpr uint32_t PRIME32_3 = 0xC2B2AE3D;
73 
74 static const uint64_t PRIME64_1 = 11400714785074694791ULL;
75 static const uint64_t PRIME64_2 = 14029467366897019727ULL;
76 static const uint64_t PRIME64_3 = 1609587929392839161ULL;
77 static const uint64_t PRIME64_4 = 9650029242287828579ULL;
78 static const uint64_t PRIME64_5 = 2870177450012600261ULL;
79 
80 static uint64_t round(uint64_t Acc, uint64_t Input) {
81   Acc += Input * PRIME64_2;
82   Acc = rotl64(Acc, 31);
83   Acc *= PRIME64_1;
84   return Acc;
85 }
86 
87 static uint64_t mergeRound(uint64_t Acc, uint64_t Val) {
88   Val = round(0, Val);
89   Acc ^= Val;
90   Acc = Acc * PRIME64_1 + PRIME64_4;
91   return Acc;
92 }
93 
94 static uint64_t XXH64_avalanche(uint64_t hash) {
95   hash ^= hash >> 33;
96   hash *= PRIME64_2;
97   hash ^= hash >> 29;
98   hash *= PRIME64_3;
99   hash ^= hash >> 32;
100   return hash;
101 }
102 
103 uint64_t llvm::xxHash64(StringRef Data) {
104   size_t Len = Data.size();
105   uint64_t Seed = 0;
106   const unsigned char *P = Data.bytes_begin();
107   const unsigned char *const BEnd = Data.bytes_end();
108   uint64_t H64;
109 
110   if (Len >= 32) {
111     const unsigned char *const Limit = BEnd - 32;
112     uint64_t V1 = Seed + PRIME64_1 + PRIME64_2;
113     uint64_t V2 = Seed + PRIME64_2;
114     uint64_t V3 = Seed + 0;
115     uint64_t V4 = Seed - PRIME64_1;
116 
117     do {
118       V1 = round(V1, endian::read64le(P));
119       P += 8;
120       V2 = round(V2, endian::read64le(P));
121       P += 8;
122       V3 = round(V3, endian::read64le(P));
123       P += 8;
124       V4 = round(V4, endian::read64le(P));
125       P += 8;
126     } while (P <= Limit);
127 
128     H64 = rotl64(V1, 1) + rotl64(V2, 7) + rotl64(V3, 12) + rotl64(V4, 18);
129     H64 = mergeRound(H64, V1);
130     H64 = mergeRound(H64, V2);
131     H64 = mergeRound(H64, V3);
132     H64 = mergeRound(H64, V4);
133 
134   } else {
135     H64 = Seed + PRIME64_5;
136   }
137 
138   H64 += (uint64_t)Len;
139 
140   while (reinterpret_cast<uintptr_t>(P) + 8 <=
141          reinterpret_cast<uintptr_t>(BEnd)) {
142     uint64_t const K1 = round(0, endian::read64le(P));
143     H64 ^= K1;
144     H64 = rotl64(H64, 27) * PRIME64_1 + PRIME64_4;
145     P += 8;
146   }
147 
148   if (reinterpret_cast<uintptr_t>(P) + 4 <= reinterpret_cast<uintptr_t>(BEnd)) {
149     H64 ^= (uint64_t)(endian::read32le(P)) * PRIME64_1;
150     H64 = rotl64(H64, 23) * PRIME64_2 + PRIME64_3;
151     P += 4;
152   }
153 
154   while (P < BEnd) {
155     H64 ^= (*P) * PRIME64_5;
156     H64 = rotl64(H64, 11) * PRIME64_1;
157     P++;
158   }
159 
160   return XXH64_avalanche(H64);
161 }
162 
163 uint64_t llvm::xxHash64(ArrayRef<uint8_t> Data) {
164   return xxHash64({(const char *)Data.data(), Data.size()});
165 }
166 
167 constexpr size_t XXH3_SECRETSIZE_MIN = 136;
168 constexpr size_t XXH_SECRET_DEFAULT_SIZE = 192;
169 
170 /* Pseudorandom data taken directly from FARSH */
171 // clang-format off
172 constexpr uint8_t kSecret[XXH_SECRET_DEFAULT_SIZE] = {
173     0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
174     0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
175     0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
176     0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
177     0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
178     0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
179     0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
180     0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
181     0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
182     0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
183     0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
184     0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
185 };
186 // clang-format on
187 
188 constexpr uint64_t PRIME_MX1 = 0x165667919E3779F9;
189 constexpr uint64_t PRIME_MX2 = 0x9FB21C651E98DF25;
190 
191 // Calculates a 64-bit to 128-bit multiply, then XOR folds it.
192 static uint64_t XXH3_mul128_fold64(uint64_t lhs, uint64_t rhs) {
193 #if defined(__SIZEOF_INT128__) ||                                              \
194     (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
195   __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs;
196   return uint64_t(product) ^ uint64_t(product >> 64);
197 
198 #else
199   /* First calculate all of the cross products. */
200   const uint64_t lo_lo = (lhs & 0xFFFFFFFF) * (rhs & 0xFFFFFFFF);
201   const uint64_t hi_lo = (lhs >> 32) * (rhs & 0xFFFFFFFF);
202   const uint64_t lo_hi = (lhs & 0xFFFFFFFF) * (rhs >> 32);
203   const uint64_t hi_hi = (lhs >> 32) * (rhs >> 32);
204 
205   /* Now add the products together. These will never overflow. */
206   const uint64_t cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
207   const uint64_t upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
208   const uint64_t lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
209 
210   return upper ^ lower;
211 #endif
212 }
213 
214 constexpr size_t XXH_STRIPE_LEN = 64;
215 constexpr size_t XXH_SECRET_CONSUME_RATE = 8;
216 constexpr size_t XXH_ACC_NB = XXH_STRIPE_LEN / sizeof(uint64_t);
217 
218 static uint64_t XXH3_avalanche(uint64_t hash) {
219   hash ^= hash >> 37;
220   hash *= PRIME_MX1;
221   hash ^= hash >> 32;
222   return hash;
223 }
224 
225 static uint64_t XXH3_len_1to3_64b(const uint8_t *input, size_t len,
226                                   const uint8_t *secret, uint64_t seed) {
227   const uint8_t c1 = input[0];
228   const uint8_t c2 = input[len >> 1];
229   const uint8_t c3 = input[len - 1];
230   uint32_t combined = ((uint32_t)c1 << 16) | ((uint32_t)c2 << 24) |
231                       ((uint32_t)c3 << 0) | ((uint32_t)len << 8);
232   uint64_t bitflip =
233       (uint64_t)(endian::read32le(secret) ^ endian::read32le(secret + 4)) +
234       seed;
235   return XXH64_avalanche(uint64_t(combined) ^ bitflip);
236 }
237 
238 static uint64_t XXH3_len_4to8_64b(const uint8_t *input, size_t len,
239                                   const uint8_t *secret, uint64_t seed) {
240   seed ^= (uint64_t)byteswap(uint32_t(seed)) << 32;
241   const uint32_t input1 = endian::read32le(input);
242   const uint32_t input2 = endian::read32le(input + len - 4);
243   uint64_t acc =
244       (endian::read64le(secret + 8) ^ endian::read64le(secret + 16)) - seed;
245   const uint64_t input64 = (uint64_t)input2 | ((uint64_t)input1 << 32);
246   acc ^= input64;
247   // XXH3_rrmxmx(acc, len)
248   acc ^= rotl64(acc, 49) ^ rotl64(acc, 24);
249   acc *= PRIME_MX2;
250   acc ^= (acc >> 35) + (uint64_t)len;
251   acc *= PRIME_MX2;
252   return acc ^ (acc >> 28);
253 }
254 
255 static uint64_t XXH3_len_9to16_64b(const uint8_t *input, size_t len,
256                                    const uint8_t *secret, uint64_t const seed) {
257   uint64_t input_lo =
258       (endian::read64le(secret + 24) ^ endian::read64le(secret + 32)) + seed;
259   uint64_t input_hi =
260       (endian::read64le(secret + 40) ^ endian::read64le(secret + 48)) - seed;
261   input_lo ^= endian::read64le(input);
262   input_hi ^= endian::read64le(input + len - 8);
263   uint64_t acc = uint64_t(len) + byteswap(input_lo) + input_hi +
264                  XXH3_mul128_fold64(input_lo, input_hi);
265   return XXH3_avalanche(acc);
266 }
267 
268 LLVM_ATTRIBUTE_ALWAYS_INLINE
269 static uint64_t XXH3_len_0to16_64b(const uint8_t *input, size_t len,
270                                    const uint8_t *secret, uint64_t const seed) {
271   if (LLVM_LIKELY(len > 8))
272     return XXH3_len_9to16_64b(input, len, secret, seed);
273   if (LLVM_LIKELY(len >= 4))
274     return XXH3_len_4to8_64b(input, len, secret, seed);
275   if (len != 0)
276     return XXH3_len_1to3_64b(input, len, secret, seed);
277   return XXH64_avalanche(seed ^ endian::read64le(secret + 56) ^
278                          endian::read64le(secret + 64));
279 }
280 
281 static uint64_t XXH3_mix16B(const uint8_t *input, uint8_t const *secret,
282                             uint64_t seed) {
283   uint64_t lhs = seed;
284   uint64_t rhs = 0U - seed;
285   lhs += endian::read64le(secret);
286   rhs += endian::read64le(secret + 8);
287   lhs ^= endian::read64le(input);
288   rhs ^= endian::read64le(input + 8);
289   return XXH3_mul128_fold64(lhs, rhs);
290 }
291 
292 /* For mid range keys, XXH3 uses a Mum-hash variant. */
293 LLVM_ATTRIBUTE_ALWAYS_INLINE
294 static uint64_t XXH3_len_17to128_64b(const uint8_t *input, size_t len,
295                                      const uint8_t *secret,
296                                      uint64_t const seed) {
297   uint64_t acc = len * PRIME64_1, acc_end;
298   acc += XXH3_mix16B(input + 0, secret + 0, seed);
299   acc_end = XXH3_mix16B(input + len - 16, secret + 16, seed);
300   if (len > 32) {
301     acc += XXH3_mix16B(input + 16, secret + 32, seed);
302     acc_end += XXH3_mix16B(input + len - 32, secret + 48, seed);
303     if (len > 64) {
304       acc += XXH3_mix16B(input + 32, secret + 64, seed);
305       acc_end += XXH3_mix16B(input + len - 48, secret + 80, seed);
306       if (len > 96) {
307         acc += XXH3_mix16B(input + 48, secret + 96, seed);
308         acc_end += XXH3_mix16B(input + len - 64, secret + 112, seed);
309       }
310     }
311   }
312   return XXH3_avalanche(acc + acc_end);
313 }
314 
315 constexpr size_t XXH3_MIDSIZE_MAX = 240;
316 constexpr size_t XXH3_MIDSIZE_STARTOFFSET = 3;
317 constexpr size_t XXH3_MIDSIZE_LASTOFFSET = 17;
318 
319 LLVM_ATTRIBUTE_NOINLINE
320 static uint64_t XXH3_len_129to240_64b(const uint8_t *input, size_t len,
321                                       const uint8_t *secret, uint64_t seed) {
322   uint64_t acc = (uint64_t)len * PRIME64_1;
323   const unsigned nbRounds = len / 16;
324   for (unsigned i = 0; i < 8; ++i)
325     acc += XXH3_mix16B(input + 16 * i, secret + 16 * i, seed);
326   acc = XXH3_avalanche(acc);
327 
328   for (unsigned i = 8; i < nbRounds; ++i) {
329     acc += XXH3_mix16B(input + 16 * i,
330                        secret + 16 * (i - 8) + XXH3_MIDSIZE_STARTOFFSET, seed);
331   }
332   /* last bytes */
333   acc +=
334       XXH3_mix16B(input + len - 16,
335                   secret + XXH3_SECRETSIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
336   return XXH3_avalanche(acc);
337 }
338 
339 #if LLVM_XXH_USE_NEON
340 
341 #define XXH3_accumulate_512 XXH3_accumulate_512_neon
342 #define XXH3_scrambleAcc XXH3_scrambleAcc_neon
343 
344 // NEON implementation based on commit a57f6cce2698049863af8c25787084ae0489d849
345 // (July 2024), with the following removed:
346 // - workaround for suboptimal codegen on older GCC
347 // - compiler barriers against instruction reordering
348 // - WebAssembly SIMD support
349 // - configurable split between NEON and scalar lanes (benchmarking shows no
350 //   penalty when fully doing SIMD on the Apple M1)
351 
352 #if defined(__GNUC__) || defined(__clang__)
353 #define XXH_ALIASING __attribute__((__may_alias__))
354 #else
355 #define XXH_ALIASING /* nothing */
356 #endif
357 
358 typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING;
359 
360 LLVM_ATTRIBUTE_ALWAYS_INLINE static uint64x2_t XXH_vld1q_u64(void const *ptr) {
361   return vreinterpretq_u64_u8(vld1q_u8((uint8_t const *)ptr));
362 }
363 
364 LLVM_ATTRIBUTE_ALWAYS_INLINE
365 static void XXH3_accumulate_512_neon(uint64_t *acc, const uint8_t *input,
366                                      const uint8_t *secret) {
367   xxh_aliasing_uint64x2_t *const xacc = (xxh_aliasing_uint64x2_t *)acc;
368 
369 #ifdef __clang__
370 #pragma clang loop unroll(full)
371 #endif
372   for (size_t i = 0; i < XXH_ACC_NB / 2; i += 2) {
373     /* data_vec = input[i]; */
374     uint64x2_t data_vec_1 = XXH_vld1q_u64(input + (i * 16));
375     uint64x2_t data_vec_2 = XXH_vld1q_u64(input + ((i + 1) * 16));
376 
377     /* key_vec  = secret[i];  */
378     uint64x2_t key_vec_1 = XXH_vld1q_u64(secret + (i * 16));
379     uint64x2_t key_vec_2 = XXH_vld1q_u64(secret + ((i + 1) * 16));
380 
381     /* data_swap = swap(data_vec) */
382     uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1);
383     uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1);
384 
385     /* data_key = data_vec ^ key_vec; */
386     uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1);
387     uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2);
388 
389     /*
390      * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a
391      * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to
392      * get one vector with the low 32 bits of each lane, and one vector
393      * with the high 32 bits of each lane.
394      *
395      * The intrinsic returns a double vector because the original ARMv7-a
396      * instruction modified both arguments in place. AArch64 and SIMD128 emit
397      * two instructions from this intrinsic.
398      *
399      *  [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ]
400      *  [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ]
401      */
402     uint32x4x2_t unzipped = vuzpq_u32(vreinterpretq_u32_u64(data_key_1),
403                                       vreinterpretq_u32_u64(data_key_2));
404 
405     /* data_key_lo = data_key & 0xFFFFFFFF */
406     uint32x4_t data_key_lo = unzipped.val[0];
407     /* data_key_hi = data_key >> 32 */
408     uint32x4_t data_key_hi = unzipped.val[1];
409 
410     /*
411      * Then, we can split the vectors horizontally and multiply which, as for
412      * most widening intrinsics, have a variant that works on both high half
413      * vectors for free on AArch64. A similar instruction is available on
414      * SIMD128.
415      *
416      * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi
417      */
418     uint64x2_t sum_1 = vmlal_u32(data_swap_1, vget_low_u32(data_key_lo),
419                                  vget_low_u32(data_key_hi));
420     uint64x2_t sum_2 = vmlal_u32(data_swap_2, vget_high_u32(data_key_lo),
421                                  vget_high_u32(data_key_hi));
422 
423     /* xacc[i] = acc_vec + sum; */
424     xacc[i] = vaddq_u64(xacc[i], sum_1);
425     xacc[i + 1] = vaddq_u64(xacc[i + 1], sum_2);
426   }
427 }
428 
429 LLVM_ATTRIBUTE_ALWAYS_INLINE
430 static void XXH3_scrambleAcc_neon(uint64_t *acc, const uint8_t *secret) {
431   xxh_aliasing_uint64x2_t *const xacc = (xxh_aliasing_uint64x2_t *)acc;
432 
433   /* { prime32_1, prime32_1 } */
434   uint32x2_t const kPrimeLo = vdup_n_u32(PRIME32_1);
435   /* { 0, prime32_1, 0, prime32_1 } */
436   uint32x4_t const kPrimeHi =
437       vreinterpretq_u32_u64(vdupq_n_u64((uint64_t)PRIME32_1 << 32));
438 
439   for (size_t i = 0; i < XXH_ACC_NB / 2; ++i) {
440     /* xacc[i] ^= (xacc[i] >> 47); */
441     uint64x2_t acc_vec = XXH_vld1q_u64(acc + (2 * i));
442     uint64x2_t shifted = vshrq_n_u64(acc_vec, 47);
443     uint64x2_t data_vec = veorq_u64(acc_vec, shifted);
444 
445     /* xacc[i] ^= secret[i]; */
446     uint64x2_t key_vec = XXH_vld1q_u64(secret + (i * 16));
447     uint64x2_t data_key = veorq_u64(data_vec, key_vec);
448 
449     /*
450      * xacc[i] *= XXH_PRIME32_1
451      *
452      * Expanded version with portable NEON intrinsics
453      *
454      *    lo(x) * lo(y) + (hi(x) * lo(y) << 32)
455      *
456      * prod_hi = hi(data_key) * lo(prime) << 32
457      *
458      * Since we only need 32 bits of this multiply a trick can be used,
459      * reinterpreting the vector as a uint32x4_t and multiplying by
460      * { 0, prime, 0, prime } to cancel out the unwanted bits and avoid the
461      * shift.
462      */
463     uint32x4_t prod_hi = vmulq_u32(vreinterpretq_u32_u64(data_key), kPrimeHi);
464 
465     /* Extract low bits for vmlal_u32  */
466     uint32x2_t data_key_lo = vmovn_u64(data_key);
467 
468     /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */
469     xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo);
470   }
471 }
472 #else
473 
474 #define XXH3_accumulate_512 XXH3_accumulate_512_scalar
475 #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
476 
477 LLVM_ATTRIBUTE_ALWAYS_INLINE
478 static void XXH3_accumulate_512_scalar(uint64_t *acc, const uint8_t *input,
479                                        const uint8_t *secret) {
480   for (size_t i = 0; i < XXH_ACC_NB; ++i) {
481     uint64_t data_val = endian::read64le(input + 8 * i);
482     uint64_t data_key = data_val ^ endian::read64le(secret + 8 * i);
483     acc[i ^ 1] += data_val;
484     acc[i] += uint32_t(data_key) * (data_key >> 32);
485   }
486 }
487 
488 LLVM_ATTRIBUTE_ALWAYS_INLINE
489 static void XXH3_scrambleAcc_scalar(uint64_t *acc, const uint8_t *secret) {
490   for (size_t i = 0; i < XXH_ACC_NB; ++i) {
491     acc[i] ^= acc[i] >> 47;
492     acc[i] ^= endian::read64le(secret + 8 * i);
493     acc[i] *= PRIME32_1;
494   }
495 }
496 #endif
497 
498 LLVM_ATTRIBUTE_ALWAYS_INLINE
499 static void XXH3_accumulate(uint64_t *acc, const uint8_t *input,
500                             const uint8_t *secret, size_t nbStripes) {
501   for (size_t n = 0; n < nbStripes; ++n) {
502     XXH3_accumulate_512(acc, input + n * XXH_STRIPE_LEN,
503                         secret + n * XXH_SECRET_CONSUME_RATE);
504   }
505 }
506 
507 static uint64_t XXH3_mix2Accs(const uint64_t *acc, const uint8_t *secret) {
508   return XXH3_mul128_fold64(acc[0] ^ endian::read64le(secret),
509                             acc[1] ^ endian::read64le(secret + 8));
510 }
511 
512 static uint64_t XXH3_mergeAccs(const uint64_t *acc, const uint8_t *key,
513                                uint64_t start) {
514   uint64_t result64 = start;
515   for (size_t i = 0; i < 4; ++i)
516     result64 += XXH3_mix2Accs(acc + 2 * i, key + 16 * i);
517   return XXH3_avalanche(result64);
518 }
519 
520 LLVM_ATTRIBUTE_NOINLINE
521 static uint64_t XXH3_hashLong_64b(const uint8_t *input, size_t len,
522                                   const uint8_t *secret, size_t secretSize) {
523   const size_t nbStripesPerBlock =
524       (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
525   const size_t block_len = XXH_STRIPE_LEN * nbStripesPerBlock;
526   const size_t nb_blocks = (len - 1) / block_len;
527   alignas(16) uint64_t acc[XXH_ACC_NB] = {
528       PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3,
529       PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1,
530   };
531   for (size_t n = 0; n < nb_blocks; ++n) {
532     XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock);
533     XXH3_scrambleAcc(acc, secret + secretSize - XXH_STRIPE_LEN);
534   }
535 
536   /* last partial block */
537   const size_t nbStripes = (len - 1 - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
538   assert(nbStripes <= secretSize / XXH_SECRET_CONSUME_RATE);
539   XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes);
540 
541   /* last stripe */
542   constexpr size_t XXH_SECRET_LASTACC_START = 7;
543   XXH3_accumulate_512(acc, input + len - XXH_STRIPE_LEN,
544                       secret + secretSize - XXH_STRIPE_LEN -
545                           XXH_SECRET_LASTACC_START);
546 
547   /* converge into final hash */
548   constexpr size_t XXH_SECRET_MERGEACCS_START = 11;
549   return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START,
550                         (uint64_t)len * PRIME64_1);
551 }
552 
553 uint64_t llvm::xxh3_64bits(ArrayRef<uint8_t> data) {
554   auto *in = data.data();
555   size_t len = data.size();
556   if (len <= 16)
557     return XXH3_len_0to16_64b(in, len, kSecret, 0);
558   if (len <= 128)
559     return XXH3_len_17to128_64b(in, len, kSecret, 0);
560   if (len <= XXH3_MIDSIZE_MAX)
561     return XXH3_len_129to240_64b(in, len, kSecret, 0);
562   return XXH3_hashLong_64b(in, len, kSecret, sizeof(kSecret));
563 }
564 
565 /* ==========================================
566  * XXH3 128 bits (a.k.a XXH128)
567  * ==========================================
568  * XXH3's 128-bit variant has better mixing and strength than the 64-bit
569  * variant, even without counting the significantly larger output size.
570  *
571  * For example, extra steps are taken to avoid the seed-dependent collisions
572  * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
573  *
574  * This strength naturally comes at the cost of some speed, especially on short
575  * lengths. Note that longer hashes are about as fast as the 64-bit version
576  * due to it using only a slight modification of the 64-bit loop.
577  *
578  * XXH128 is also more oriented towards 64-bit machines. It is still extremely
579  * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
580  */
581 
582 /*!
583  * @internal
584  * @def XXH_rotl32(x,r)
585  * @brief 32-bit rotate left.
586  *
587  * @param x The 32-bit integer to be rotated.
588  * @param r The number of bits to rotate.
589  * @pre
590  *   @p r > 0 && @p r < 32
591  * @note
592  *   @p x and @p r may be evaluated multiple times.
593  * @return The rotated result.
594  */
595 #if __has_builtin(__builtin_rotateleft32) &&                                   \
596     __has_builtin(__builtin_rotateleft64)
597 #define XXH_rotl32 __builtin_rotateleft32
598 #define XXH_rotl64 __builtin_rotateleft64
599 /* Note: although _rotl exists for minGW (GCC under windows), performance seems
600  * poor */
601 #elif defined(_MSC_VER)
602 #define XXH_rotl32(x, r) _rotl(x, r)
603 #define XXH_rotl64(x, r) _rotl64(x, r)
604 #else
605 #define XXH_rotl32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))
606 #define XXH_rotl64(x, r) (((x) << (r)) | ((x) >> (64 - (r))))
607 #endif
608 
609 #define XXH_mult32to64(x, y) ((uint64_t)(uint32_t)(x) * (uint64_t)(uint32_t)(y))
610 
611 /*!
612  * @brief Calculates a 64->128-bit long multiply.
613  *
614  * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar
615  * version.
616  *
617  * @param lhs , rhs The 64-bit integers to be multiplied
618  * @return The 128-bit result represented in an @ref XXH128_hash_t.
619  */
620 static XXH128_hash_t XXH_mult64to128(uint64_t lhs, uint64_t rhs) {
621   /*
622    * GCC/Clang __uint128_t method.
623    *
624    * On most 64-bit targets, GCC and Clang define a __uint128_t type.
625    * This is usually the best way as it usually uses a native long 64-bit
626    * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
627    *
628    * Usually.
629    *
630    * Despite being a 32-bit platform, Clang (and emscripten) define this type
631    * despite not having the arithmetic for it. This results in a laggy
632    * compiler builtin call which calculates a full 128-bit multiply.
633    * In that case it is best to use the portable one.
634    * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
635    */
636 #if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) &&         \
637         defined(__SIZEOF_INT128__) ||                                          \
638     (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
639 
640   __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
641   XXH128_hash_t r128;
642   r128.low64 = (uint64_t)(product);
643   r128.high64 = (uint64_t)(product >> 64);
644   return r128;
645 
646   /*
647    * MSVC for x64's _umul128 method.
648    *
649    * uint64_t _umul128(uint64_t Multiplier, uint64_t Multiplicand, uint64_t
650    * *HighProduct);
651    *
652    * This compiles to single operand MUL on x64.
653    */
654 #elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC)
655 
656 #ifndef _MSC_VER
657 #pragma intrinsic(_umul128)
658 #endif
659   uint64_t product_high;
660   uint64_t const product_low = _umul128(lhs, rhs, &product_high);
661   XXH128_hash_t r128;
662   r128.low64 = product_low;
663   r128.high64 = product_high;
664   return r128;
665 
666   /*
667    * MSVC for ARM64's __umulh method.
668    *
669    * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method.
670    */
671 #elif defined(_M_ARM64) || defined(_M_ARM64EC)
672 
673 #ifndef _MSC_VER
674 #pragma intrinsic(__umulh)
675 #endif
676   XXH128_hash_t r128;
677   r128.low64 = lhs * rhs;
678   r128.high64 = __umulh(lhs, rhs);
679   return r128;
680 
681 #else
682   /*
683    * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
684    *
685    * This is a fast and simple grade school multiply, which is shown below
686    * with base 10 arithmetic instead of base 0x100000000.
687    *
688    *           9 3 // D2 lhs = 93
689    *         x 7 5 // D2 rhs = 75
690    *     ----------
691    *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
692    *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
693    *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
694    *     + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
695    *     ---------
696    *         2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
697    *     + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67
698    *     ---------
699    *       6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975
700    *
701    * The reasons for adding the products like this are:
702    *  1. It avoids manual carry tracking. Just like how
703    *     (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
704    *     This avoids a lot of complexity.
705    *
706    *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
707    *     instruction available in ARM's Digital Signal Processing extension
708    *     in 32-bit ARMv6 and later, which is shown below:
709    *
710    *         void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
711    *         {
712    *             uint64_t product = (uint64_t)*RdLo * (uint64_t)*RdHi + Rn + Rm;
713    *             *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
714    *             *RdHi = (xxh_u32)(product >> 32);
715    *         }
716    *
717    *     This instruction was designed for efficient long multiplication, and
718    *     allows this to be calculated in only 4 instructions at speeds
719    *     comparable to some 64-bit ALUs.
720    *
721    *  3. It isn't terrible on other platforms. Usually this will be a couple
722    *     of 32-bit ADD/ADCs.
723    */
724 
725   /* First calculate all of the cross products. */
726   uint64_t const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
727   uint64_t const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
728   uint64_t const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
729   uint64_t const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);
730 
731   /* Now add the products together. These will never overflow. */
732   uint64_t const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
733   uint64_t const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
734   uint64_t const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
735 
736   XXH128_hash_t r128;
737   r128.low64 = lower;
738   r128.high64 = upper;
739   return r128;
740 #endif
741 }
742 
743 /*! Seems to produce slightly better code on GCC for some reason. */
744 LLVM_ATTRIBUTE_ALWAYS_INLINE constexpr uint64_t XXH_xorshift64(uint64_t v64,
745                                                                int shift) {
746   return v64 ^ (v64 >> shift);
747 }
748 
749 LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t
750 XXH3_len_1to3_128b(const uint8_t *input, size_t len, const uint8_t *secret,
751                    uint64_t seed) {
752   /* A doubled version of 1to3_64b with different constants. */
753   /*
754    * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
755    * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
756    * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
757    */
758   uint8_t const c1 = input[0];
759   uint8_t const c2 = input[len >> 1];
760   uint8_t const c3 = input[len - 1];
761   uint32_t const combinedl = ((uint32_t)c1 << 16) | ((uint32_t)c2 << 24) |
762                              ((uint32_t)c3 << 0) | ((uint32_t)len << 8);
763   uint32_t const combinedh = XXH_rotl32(byteswap(combinedl), 13);
764   uint64_t const bitflipl =
765       (endian::read32le(secret) ^ endian::read32le(secret + 4)) + seed;
766   uint64_t const bitfliph =
767       (endian::read32le(secret + 8) ^ endian::read32le(secret + 12)) - seed;
768   uint64_t const keyed_lo = (uint64_t)combinedl ^ bitflipl;
769   uint64_t const keyed_hi = (uint64_t)combinedh ^ bitfliph;
770   XXH128_hash_t h128;
771   h128.low64 = XXH64_avalanche(keyed_lo);
772   h128.high64 = XXH64_avalanche(keyed_hi);
773   return h128;
774 }
775 
776 LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t
777 XXH3_len_4to8_128b(const uint8_t *input, size_t len, const uint8_t *secret,
778                    uint64_t seed) {
779   seed ^= (uint64_t)byteswap((uint32_t)seed) << 32;
780   uint32_t const input_lo = endian::read32le(input);
781   uint32_t const input_hi = endian::read32le(input + len - 4);
782   uint64_t const input_64 = input_lo + ((uint64_t)input_hi << 32);
783   uint64_t const bitflip =
784       (endian::read64le(secret + 16) ^ endian::read64le(secret + 24)) + seed;
785   uint64_t const keyed = input_64 ^ bitflip;
786 
787   /* Shift len to the left to ensure it is even, this avoids even multiplies.
788    */
789   XXH128_hash_t m128 = XXH_mult64to128(keyed, PRIME64_1 + (len << 2));
790 
791   m128.high64 += (m128.low64 << 1);
792   m128.low64 ^= (m128.high64 >> 3);
793 
794   m128.low64 = XXH_xorshift64(m128.low64, 35);
795   m128.low64 *= PRIME_MX2;
796   m128.low64 = XXH_xorshift64(m128.low64, 28);
797   m128.high64 = XXH3_avalanche(m128.high64);
798   return m128;
799 }
800 
801 LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t
802 XXH3_len_9to16_128b(const uint8_t *input, size_t len, const uint8_t *secret,
803                     uint64_t seed) {
804   uint64_t const bitflipl =
805       (endian::read64le(secret + 32) ^ endian::read64le(secret + 40)) - seed;
806   uint64_t const bitfliph =
807       (endian::read64le(secret + 48) ^ endian::read64le(secret + 56)) + seed;
808   uint64_t const input_lo = endian::read64le(input);
809   uint64_t input_hi = endian::read64le(input + len - 8);
810   XXH128_hash_t m128 =
811       XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, PRIME64_1);
812   /*
813    * Put len in the middle of m128 to ensure that the length gets mixed to
814    * both the low and high bits in the 128x64 multiply below.
815    */
816   m128.low64 += (uint64_t)(len - 1) << 54;
817   input_hi ^= bitfliph;
818   /*
819    * Add the high 32 bits of input_hi to the high 32 bits of m128, then
820    * add the long product of the low 32 bits of input_hi and PRIME32_2 to
821    * the high 64 bits of m128.
822    *
823    * The best approach to this operation is different on 32-bit and 64-bit.
824    */
825   if (sizeof(void *) < sizeof(uint64_t)) { /* 32-bit */
826     /*
827      * 32-bit optimized version, which is more readable.
828      *
829      * On 32-bit, it removes an ADC and delays a dependency between the two
830      * halves of m128.high64, but it generates an extra mask on 64-bit.
831      */
832     m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) +
833                    XXH_mult32to64((uint32_t)input_hi, PRIME32_2);
834   } else {
835     /*
836      * 64-bit optimized (albeit more confusing) version.
837      *
838      * Uses some properties of addition and multiplication to remove the mask:
839      *
840      * Let:
841      *    a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
842      *    b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
843      *    c = PRIME32_2
844      *
845      *    a + (b * c)
846      * Inverse Property: x + y - x == y
847      *    a + (b * (1 + c - 1))
848      * Distributive Property: x * (y + z) == (x * y) + (x * z)
849      *    a + (b * 1) + (b * (c - 1))
850      * Identity Property: x * 1 == x
851      *    a + b + (b * (c - 1))
852      *
853      * Substitute a, b, and c:
854      *    input_hi.hi + input_hi.lo + ((uint64_t)input_hi.lo * (PRIME32_2
855      * - 1))
856      *
857      * Since input_hi.hi + input_hi.lo == input_hi, we get this:
858      *    input_hi + ((uint64_t)input_hi.lo * (PRIME32_2 - 1))
859      */
860     m128.high64 += input_hi + XXH_mult32to64((uint32_t)input_hi, PRIME32_2 - 1);
861   }
862   /* m128 ^= XXH_swap64(m128 >> 64); */
863   m128.low64 ^= byteswap(m128.high64);
864 
865   /* 128x64 multiply: h128 = m128 * PRIME64_2; */
866   XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2);
867   h128.high64 += m128.high64 * PRIME64_2;
868 
869   h128.low64 = XXH3_avalanche(h128.low64);
870   h128.high64 = XXH3_avalanche(h128.high64);
871   return h128;
872 }
873 
874 /*
875  * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
876  */
877 LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t
878 XXH3_len_0to16_128b(const uint8_t *input, size_t len, const uint8_t *secret,
879                     uint64_t seed) {
880   if (len > 8)
881     return XXH3_len_9to16_128b(input, len, secret, seed);
882   if (len >= 4)
883     return XXH3_len_4to8_128b(input, len, secret, seed);
884   if (len)
885     return XXH3_len_1to3_128b(input, len, secret, seed);
886   XXH128_hash_t h128;
887   uint64_t const bitflipl =
888       endian::read64le(secret + 64) ^ endian::read64le(secret + 72);
889   uint64_t const bitfliph =
890       endian::read64le(secret + 80) ^ endian::read64le(secret + 88);
891   h128.low64 = XXH64_avalanche(seed ^ bitflipl);
892   h128.high64 = XXH64_avalanche(seed ^ bitfliph);
893   return h128;
894 }
895 
896 /*
897  * A bit slower than XXH3_mix16B, but handles multiply by zero better.
898  */
899 LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t
900 XXH128_mix32B(XXH128_hash_t acc, const uint8_t *input_1, const uint8_t *input_2,
901               const uint8_t *secret, uint64_t seed) {
902   acc.low64 += XXH3_mix16B(input_1, secret + 0, seed);
903   acc.low64 ^= endian::read64le(input_2) + endian::read64le(input_2 + 8);
904   acc.high64 += XXH3_mix16B(input_2, secret + 16, seed);
905   acc.high64 ^= endian::read64le(input_1) + endian::read64le(input_1 + 8);
906   return acc;
907 }
908 
909 LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t
910 XXH3_len_17to128_128b(const uint8_t *input, size_t len, const uint8_t *secret,
911                       size_t secretSize, uint64_t seed) {
912   (void)secretSize;
913 
914   XXH128_hash_t acc;
915   acc.low64 = len * PRIME64_1;
916   acc.high64 = 0;
917 
918   if (len > 32) {
919     if (len > 64) {
920       if (len > 96) {
921         acc =
922             XXH128_mix32B(acc, input + 48, input + len - 64, secret + 96, seed);
923       }
924       acc = XXH128_mix32B(acc, input + 32, input + len - 48, secret + 64, seed);
925     }
926     acc = XXH128_mix32B(acc, input + 16, input + len - 32, secret + 32, seed);
927   }
928   acc = XXH128_mix32B(acc, input, input + len - 16, secret, seed);
929   XXH128_hash_t h128;
930   h128.low64 = acc.low64 + acc.high64;
931   h128.high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) +
932                 ((len - seed) * PRIME64_2);
933   h128.low64 = XXH3_avalanche(h128.low64);
934   h128.high64 = (uint64_t)0 - XXH3_avalanche(h128.high64);
935   return h128;
936 }
937 
938 LLVM_ATTRIBUTE_NOINLINE static XXH128_hash_t
939 XXH3_len_129to240_128b(const uint8_t *input, size_t len, const uint8_t *secret,
940                        size_t secretSize, uint64_t seed) {
941   (void)secretSize;
942 
943   XXH128_hash_t acc;
944   unsigned i;
945   acc.low64 = len * PRIME64_1;
946   acc.high64 = 0;
947   /*
948    *  We set as `i` as offset + 32. We do this so that unchanged
949    * `len` can be used as upper bound. This reaches a sweet spot
950    * where both x86 and aarch64 get simple agen and good codegen
951    * for the loop.
952    */
953   for (i = 32; i < 160; i += 32) {
954     acc = XXH128_mix32B(acc, input + i - 32, input + i - 16, secret + i - 32,
955                         seed);
956   }
957   acc.low64 = XXH3_avalanche(acc.low64);
958   acc.high64 = XXH3_avalanche(acc.high64);
959   /*
960    * NB: `i <= len` will duplicate the last 32-bytes if
961    * len % 32 was zero. This is an unfortunate necessity to keep
962    * the hash result stable.
963    */
964   for (i = 160; i <= len; i += 32) {
965     acc = XXH128_mix32B(acc, input + i - 32, input + i - 16,
966                         secret + XXH3_MIDSIZE_STARTOFFSET + i - 160, seed);
967   }
968   /* last bytes */
969   acc =
970       XXH128_mix32B(acc, input + len - 16, input + len - 32,
971                     secret + XXH3_SECRETSIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
972                     (uint64_t)0 - seed);
973 
974   XXH128_hash_t h128;
975   h128.low64 = acc.low64 + acc.high64;
976   h128.high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) +
977                 ((len - seed) * PRIME64_2);
978   h128.low64 = XXH3_avalanche(h128.low64);
979   h128.high64 = (uint64_t)0 - XXH3_avalanche(h128.high64);
980   return h128;
981 }
982 
983 LLVM_ATTRIBUTE_ALWAYS_INLINE XXH128_hash_t
984 XXH3_hashLong_128b(const uint8_t *input, size_t len, const uint8_t *secret,
985                    size_t secretSize) {
986   const size_t nbStripesPerBlock =
987       (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
988   const size_t block_len = XXH_STRIPE_LEN * nbStripesPerBlock;
989   const size_t nb_blocks = (len - 1) / block_len;
990   alignas(16) uint64_t acc[XXH_ACC_NB] = {
991       PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3,
992       PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1,
993   };
994 
995   for (size_t n = 0; n < nb_blocks; ++n) {
996     XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock);
997     XXH3_scrambleAcc(acc, secret + secretSize - XXH_STRIPE_LEN);
998   }
999 
1000   /* last partial block */
1001   const size_t nbStripes = (len - 1 - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
1002   assert(nbStripes <= secretSize / XXH_SECRET_CONSUME_RATE);
1003   XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes);
1004 
1005   /* last stripe */
1006   constexpr size_t XXH_SECRET_LASTACC_START = 7;
1007   XXH3_accumulate_512(acc, input + len - XXH_STRIPE_LEN,
1008                       secret + secretSize - XXH_STRIPE_LEN -
1009                           XXH_SECRET_LASTACC_START);
1010 
1011   /* converge into final hash */
1012   static_assert(sizeof(acc) == 64);
1013   XXH128_hash_t h128;
1014   constexpr size_t XXH_SECRET_MERGEACCS_START = 11;
1015   h128.low64 = XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START,
1016                               (uint64_t)len * PRIME64_1);
1017   h128.high64 = XXH3_mergeAccs(
1018       acc, secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
1019       ~((uint64_t)len * PRIME64_2));
1020   return h128;
1021 }
1022 
1023 llvm::XXH128_hash_t llvm::xxh3_128bits(ArrayRef<uint8_t> data) {
1024   size_t len = data.size();
1025   const uint8_t *input = data.data();
1026 
1027   /*
1028    * If an action is to be taken if `secret` conditions are not respected,
1029    * it should be done here.
1030    * For now, it's a contract pre-condition.
1031    * Adding a check and a branch here would cost performance at every hash.
1032    */
1033   if (len <= 16)
1034     return XXH3_len_0to16_128b(input, len, kSecret, /*seed64=*/0);
1035   if (len <= 128)
1036     return XXH3_len_17to128_128b(input, len, kSecret, sizeof(kSecret),
1037                                  /*seed64=*/0);
1038   if (len <= XXH3_MIDSIZE_MAX)
1039     return XXH3_len_129to240_128b(input, len, kSecret, sizeof(kSecret),
1040                                   /*seed64=*/0);
1041   return XXH3_hashLong_128b(input, len, kSecret, sizeof(kSecret));
1042 }
1043