/*
 * xxHash - Extremely Fast Hash algorithm
 * Copyright (C) 2012-2023, Yann Collet
 *
 * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above
 *     copyright notice, this list of conditions and the following disclaimer
 *     in the documentation and/or other materials provided with the
 *     distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * You can contact the author at :
 *   - xxHash homepage: http://www.xxhash.com
 *   - xxHash source repository : https://github.com/Cyan4973/xxHash
 */

// xxhash64 is based on commit d2df04efcbef7d7f6886d345861e5dfda4edacc1. Removed
// everything but a simple interface for computing xxh64.

// xxh3_64bits is based on commit d5891596637d21366b9b1dcf2c0007a3edb26a9e (July
// 2023).

// xxh3_128bits is based on commit b0adcc54188c3130b1793e7b19c62eb1e669f7df
// (June 2024).

#include "llvm/Support/xxhash.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Endian.h"

#include <stdlib.h>

#if !defined(LLVM_XXH_USE_NEON)
#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) &&     \
    !defined(__ARM_BIG_ENDIAN)
#define LLVM_XXH_USE_NEON 1
#else
#define LLVM_XXH_USE_NEON 0
#endif
#endif

#if LLVM_XXH_USE_NEON
#include <arm_neon.h>
#endif

using namespace llvm;
using namespace support;

static uint64_t rotl64(uint64_t X, size_t R) {
  return (X << R) | (X >> (64 - R));
}

constexpr uint32_t PRIME32_1 = 0x9E3779B1;
constexpr uint32_t PRIME32_2 = 0x85EBCA77;
constexpr uint32_t PRIME32_3 = 0xC2B2AE3D;

static const uint64_t PRIME64_1 = 11400714785074694791ULL;
static const uint64_t PRIME64_2 = 14029467366897019727ULL;
static const uint64_t PRIME64_3 = 1609587929392839161ULL;
static const uint64_t PRIME64_4 = 9650029242287828579ULL;
static const uint64_t PRIME64_5 = 2870177450012600261ULL;

static uint64_t round(uint64_t Acc, uint64_t Input) {
  Acc += Input * PRIME64_2;
  Acc = rotl64(Acc, 31);
  Acc *= PRIME64_1;
  return Acc;
}

static uint64_t mergeRound(uint64_t Acc, uint64_t Val) {
  Val = round(0, Val);
  Acc ^= Val;
  Acc = Acc * PRIME64_1 + PRIME64_4;
  return Acc;
}

static uint64_t XXH64_avalanche(uint64_t hash) {
  hash ^= hash >> 33;
  hash *= PRIME64_2;
  hash ^= hash >> 29;
  hash *= PRIME64_3;
  hash ^= hash >> 32;
  return hash;
}

uint64_t llvm::xxHash64(StringRef Data) {
  size_t Len = Data.size();
  uint64_t Seed = 0;
  const unsigned char *P = Data.bytes_begin();
  const unsigned char *const BEnd = Data.bytes_end();
  uint64_t H64;

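  // Bulk phase: inputs of at least 32 bytes are consumed in 32-byte stripes
  // across four independent accumulators, which are then rotated and merged.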
  if (Len >= 32) {
    const unsigned char *const Limit = BEnd - 32;
    uint64_t V1 = Seed + PRIME64_1 + PRIME64_2;
    uint64_t V2 = Seed + PRIME64_2;
    uint64_t V3 = Seed + 0;
    uint64_t V4 = Seed - PRIME64_1;

    do {
      V1 = round(V1, endian::read64le(P));
      P += 8;
      V2 = round(V2, endian::read64le(P));
      P += 8;
      V3 = round(V3, endian::read64le(P));
      P += 8;
      V4 = round(V4, endian::read64le(P));
      P += 8;
    } while (P <= Limit);

    H64 = rotl64(V1, 1) + rotl64(V2, 7) + rotl64(V3, 12) + rotl64(V4, 18);
    H64 = mergeRound(H64, V1);
    H64 = mergeRound(H64, V2);
    H64 = mergeRound(H64, V3);
    H64 = mergeRound(H64, V4);

  } else {
    H64 = Seed + PRIME64_5;
  }

  H64 += (uint64_t)Len;

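  // Tail phase: fold in the remaining bytes in 8-, 4-, and 1-byte steps.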
  while (reinterpret_cast<uintptr_t>(P) + 8 <=
         reinterpret_cast<uintptr_t>(BEnd)) {
    uint64_t const K1 = round(0, endian::read64le(P));
    H64 ^= K1;
    H64 = rotl64(H64, 27) * PRIME64_1 + PRIME64_4;
    P += 8;
  }

  if (reinterpret_cast<uintptr_t>(P) + 4 <= reinterpret_cast<uintptr_t>(BEnd)) {
    H64 ^= (uint64_t)(endian::read32le(P)) * PRIME64_1;
    H64 = rotl64(H64, 23) * PRIME64_2 + PRIME64_3;
    P += 4;
  }

  while (P < BEnd) {
    H64 ^= (*P) * PRIME64_5;
    H64 = rotl64(H64, 11) * PRIME64_1;
    P++;
  }

  return XXH64_avalanche(H64);
}

uint64_t llvm::xxHash64(ArrayRef<uint8_t> Data) {
  return xxHash64({(const char *)Data.data(), Data.size()});
}

constexpr size_t XXH3_SECRETSIZE_MIN = 136;
constexpr size_t XXH_SECRET_DEFAULT_SIZE = 192;

/* Pseudorandom data taken directly from FARSH */
// clang-format off
constexpr uint8_t kSecret[XXH_SECRET_DEFAULT_SIZE] = {
    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
    0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
    0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
    0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
    0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
    0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
    0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
};
// clang-format on

constexpr uint64_t PRIME_MX1 = 0x165667919E3779F9;
constexpr uint64_t PRIME_MX2 = 0x9FB21C651E98DF25;

// Calculates a 64-bit to 128-bit multiply, then XOR folds it.
static uint64_t XXH3_mul128_fold64(uint64_t lhs, uint64_t rhs) {
#if defined(__SIZEOF_INT128__) ||                                             \
    (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
  __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs;
  return uint64_t(product) ^ uint64_t(product >> 64);

#else
  /* First calculate all of the cross products. */
  const uint64_t lo_lo = (lhs & 0xFFFFFFFF) * (rhs & 0xFFFFFFFF);
  const uint64_t hi_lo = (lhs >> 32) * (rhs & 0xFFFFFFFF);
  const uint64_t lo_hi = (lhs & 0xFFFFFFFF) * (rhs >> 32);
  const uint64_t hi_hi = (lhs >> 32) * (rhs >> 32);

  /* Now add the products together. These will never overflow. */
  const uint64_t cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
  const uint64_t upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
  const uint64_t lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);

  return upper ^ lower;
#endif
}

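// Long-input hashing state: the input is consumed in 64-byte stripes, each
// stripe advancing 8 bytes further into the secret, with eight 64-bit
// accumulator lanes carrying the running state.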
constexpr size_t XXH_STRIPE_LEN = 64;
constexpr size_t XXH_SECRET_CONSUME_RATE = 8;
constexpr size_t XXH_ACC_NB = XXH_STRIPE_LEN / sizeof(uint64_t);

static uint64_t XXH3_avalanche(uint64_t hash) {
  hash ^= hash >> 37;
  hash *= PRIME_MX1;
  hash ^= hash >> 32;
  return hash;
}

static uint64_t XXH3_len_1to3_64b(const uint8_t *input, size_t len,
                                  const uint8_t *secret, uint64_t seed) {
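  /*
   * Packed little-endian byte layout of `combined` (mirrors the comment on
   * the 128-bit variant below):
   *   len = 1: combined = { input[0], 0x01, input[0], input[0] }
   *   len = 2: combined = { input[1], 0x02, input[0], input[1] }
   *   len = 3: combined = { input[2], 0x03, input[0], input[1] }
   */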
  const uint8_t c1 = input[0];
  const uint8_t c2 = input[len >> 1];
  const uint8_t c3 = input[len - 1];
  uint32_t combined = ((uint32_t)c1 << 16) | ((uint32_t)c2 << 24) |
                      ((uint32_t)c3 << 0) | ((uint32_t)len << 8);
  uint64_t bitflip =
      (uint64_t)(endian::read32le(secret) ^ endian::read32le(secret + 4)) +
      seed;
  return XXH64_avalanche(uint64_t(combined) ^ bitflip);
}

static uint64_t XXH3_len_4to8_64b(const uint8_t *input, size_t len,
                                  const uint8_t *secret, uint64_t seed) {
  seed ^= (uint64_t)byteswap(uint32_t(seed)) << 32;
  const uint32_t input1 = endian::read32le(input);
  const uint32_t input2 = endian::read32le(input + len - 4);
  uint64_t acc =
      (endian::read64le(secret + 8) ^ endian::read64le(secret + 16)) - seed;
  const uint64_t input64 = (uint64_t)input2 | ((uint64_t)input1 << 32);
  acc ^= input64;
  // XXH3_rrmxmx(acc, len)
  acc ^= rotl64(acc, 49) ^ rotl64(acc, 24);
  acc *= PRIME_MX2;
  acc ^= (acc >> 35) + (uint64_t)len;
  acc *= PRIME_MX2;
  return acc ^ (acc >> 28);
}

static uint64_t XXH3_len_9to16_64b(const uint8_t *input, size_t len,
                                   const uint8_t *secret, uint64_t const seed) {
  uint64_t input_lo =
      (endian::read64le(secret + 24) ^ endian::read64le(secret + 32)) + seed;
  uint64_t input_hi =
      (endian::read64le(secret + 40) ^ endian::read64le(secret + 48)) - seed;
  input_lo ^= endian::read64le(input);
  input_hi ^= endian::read64le(input + len - 8);
  uint64_t acc = uint64_t(len) + byteswap(input_lo) + input_hi +
                 XXH3_mul128_fold64(input_lo, input_hi);
  return XXH3_avalanche(acc);
}

LLVM_ATTRIBUTE_ALWAYS_INLINE
static uint64_t XXH3_len_0to16_64b(const uint8_t *input, size_t len,
                                   const uint8_t *secret, uint64_t const seed) {
  if (LLVM_LIKELY(len > 8))
    return XXH3_len_9to16_64b(input, len, secret, seed);
  if (LLVM_LIKELY(len >= 4))
    return XXH3_len_4to8_64b(input, len, secret, seed);
  if (len != 0)
    return XXH3_len_1to3_64b(input, len, secret, seed);
  return XXH64_avalanche(seed ^ endian::read64le(secret + 56) ^
                         endian::read64le(secret + 64));
}

static uint64_t XXH3_mix16B(const uint8_t *input, uint8_t const *secret,
                            uint64_t seed) {
  uint64_t lhs = seed;
  uint64_t rhs = 0U - seed;
  lhs += endian::read64le(secret);
  rhs += endian::read64le(secret + 8);
  lhs ^= endian::read64le(input);
  rhs ^= endian::read64le(input + 8);
  return XXH3_mul128_fold64(lhs, rhs);
}

/* For mid range keys, XXH3 uses a Mum-hash variant. */
LLVM_ATTRIBUTE_ALWAYS_INLINE
static uint64_t XXH3_len_17to128_64b(const uint8_t *input, size_t len,
                                     const uint8_t *secret,
                                     uint64_t const seed) {
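  // Mix 16-byte chunks pairwise from both ends of the input, walking the
  // secret forward, so every byte is covered for lengths of 17 to 128.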
  uint64_t acc = len * PRIME64_1, acc_end;
  acc += XXH3_mix16B(input + 0, secret + 0, seed);
  acc_end = XXH3_mix16B(input + len - 16, secret + 16, seed);
  if (len > 32) {
    acc += XXH3_mix16B(input + 16, secret + 32, seed);
    acc_end += XXH3_mix16B(input + len - 32, secret + 48, seed);
    if (len > 64) {
      acc += XXH3_mix16B(input + 32, secret + 64, seed);
      acc_end += XXH3_mix16B(input + len - 48, secret + 80, seed);
      if (len > 96) {
        acc += XXH3_mix16B(input + 48, secret + 96, seed);
        acc_end += XXH3_mix16B(input + len - 64, secret + 112, seed);
      }
    }
  }
  return XXH3_avalanche(acc + acc_end);
}

constexpr size_t XXH3_MIDSIZE_MAX = 240;
constexpr size_t XXH3_MIDSIZE_STARTOFFSET = 3;
constexpr size_t XXH3_MIDSIZE_LASTOFFSET = 17;

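// Inputs of 129 to 240 bytes: 16-byte mix rounds, where the first eight rounds
// walk the start of the secret and later rounds restart at
// XXH3_MIDSIZE_STARTOFFSET, with one final round over the last 16 bytes.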
LLVM_ATTRIBUTE_NOINLINE
static uint64_t XXH3_len_129to240_64b(const uint8_t *input, size_t len,
                                      const uint8_t *secret, uint64_t seed) {
  uint64_t acc = (uint64_t)len * PRIME64_1;
  const unsigned nbRounds = len / 16;
  for (unsigned i = 0; i < 8; ++i)
    acc += XXH3_mix16B(input + 16 * i, secret + 16 * i, seed);
  acc = XXH3_avalanche(acc);

  for (unsigned i = 8; i < nbRounds; ++i) {
    acc += XXH3_mix16B(input + 16 * i,
                       secret + 16 * (i - 8) + XXH3_MIDSIZE_STARTOFFSET, seed);
  }
  /* last bytes */
  acc +=
      XXH3_mix16B(input + len - 16,
                  secret + XXH3_SECRETSIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
  return XXH3_avalanche(acc);
}

#if LLVM_XXH_USE_NEON

#define XXH3_accumulate_512 XXH3_accumulate_512_neon
#define XXH3_scrambleAcc XXH3_scrambleAcc_neon

// NEON implementation based on commit a57f6cce2698049863af8c25787084ae0489d849
// (July 2024), with the following removed:
// - workaround for suboptimal codegen on older GCC
// - compiler barriers against instruction reordering
// - WebAssembly SIMD support
// - configurable split between NEON and scalar lanes (benchmarking shows no
//   penalty when fully doing SIMD on the Apple M1)

#if defined(__GNUC__) || defined(__clang__)
#define XXH_ALIASING __attribute__((__may_alias__))
#else
#define XXH_ALIASING /* nothing */
#endif

typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING;

LLVM_ATTRIBUTE_ALWAYS_INLINE static uint64x2_t XXH_vld1q_u64(void const *ptr) {
  return vreinterpretq_u64_u8(vld1q_u8((uint8_t const *)ptr));
}

LLVM_ATTRIBUTE_ALWAYS_INLINE
static void XXH3_accumulate_512_neon(uint64_t *acc, const uint8_t *input,
                                     const uint8_t *secret) {
  xxh_aliasing_uint64x2_t *const xacc = (xxh_aliasing_uint64x2_t *)acc;

#ifdef __clang__
#pragma clang loop unroll(full)
#endif
  for (size_t i = 0; i < XXH_ACC_NB / 2; i += 2) {
    /* data_vec = input[i]; */
    uint64x2_t data_vec_1 = XXH_vld1q_u64(input + (i * 16));
    uint64x2_t data_vec_2 = XXH_vld1q_u64(input + ((i + 1) * 16));

    /* key_vec = secret[i]; */
    uint64x2_t key_vec_1 = XXH_vld1q_u64(secret + (i * 16));
    uint64x2_t key_vec_2 = XXH_vld1q_u64(secret + ((i + 1) * 16));

    /* data_swap = swap(data_vec) */
    uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1);
    uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1);

    /* data_key = data_vec ^ key_vec; */
    uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1);
    uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2);

    /*
     * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a
     * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to
     * get one vector with the low 32 bits of each lane, and one vector
     * with the high 32 bits of each lane.
     *
     * The intrinsic returns a double vector because the original ARMv7-a
     * instruction modified both arguments in place. AArch64 and SIMD128 emit
     * two instructions from this intrinsic.
     *
     * [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ]
     * [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ]
     */
    uint32x4x2_t unzipped = vuzpq_u32(vreinterpretq_u32_u64(data_key_1),
                                      vreinterpretq_u32_u64(data_key_2));

    /* data_key_lo = data_key & 0xFFFFFFFF */
    uint32x4_t data_key_lo = unzipped.val[0];
    /* data_key_hi = data_key >> 32 */
    uint32x4_t data_key_hi = unzipped.val[1];

    /*
     * Then, we can split the vectors horizontally and multiply. As with most
     * widening intrinsics, the multiply has a variant that works on both high
     * half vectors for free on AArch64. A similar instruction is available on
     * SIMD128.
     *
     * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi
     */
    uint64x2_t sum_1 = vmlal_u32(data_swap_1, vget_low_u32(data_key_lo),
                                 vget_low_u32(data_key_hi));
    uint64x2_t sum_2 = vmlal_u32(data_swap_2, vget_high_u32(data_key_lo),
                                 vget_high_u32(data_key_hi));

    /* xacc[i] = acc_vec + sum; */
    xacc[i] = vaddq_u64(xacc[i], sum_1);
    xacc[i + 1] = vaddq_u64(xacc[i + 1], sum_2);
  }
}

LLVM_ATTRIBUTE_ALWAYS_INLINE
static void XXH3_scrambleAcc_neon(uint64_t *acc, const uint8_t *secret) {
  xxh_aliasing_uint64x2_t *const xacc = (xxh_aliasing_uint64x2_t *)acc;

  /* { prime32_1, prime32_1 } */
  uint32x2_t const kPrimeLo = vdup_n_u32(PRIME32_1);
  /* { 0, prime32_1, 0, prime32_1 } */
  uint32x4_t const kPrimeHi =
      vreinterpretq_u32_u64(vdupq_n_u64((uint64_t)PRIME32_1 << 32));

  for (size_t i = 0; i < XXH_ACC_NB / 2; ++i) {
    /* xacc[i] ^= (xacc[i] >> 47); */
    uint64x2_t acc_vec = XXH_vld1q_u64(acc + (2 * i));
    uint64x2_t shifted = vshrq_n_u64(acc_vec, 47);
    uint64x2_t data_vec = veorq_u64(acc_vec, shifted);

    /* xacc[i] ^= secret[i]; */
    uint64x2_t key_vec = XXH_vld1q_u64(secret + (i * 16));
    uint64x2_t data_key = veorq_u64(data_vec, key_vec);

    /*
     * xacc[i] *= XXH_PRIME32_1
     *
     * Expanded version with portable NEON intrinsics
     *
     *    lo(x) * lo(y) + (hi(x) * lo(y) << 32)
     *
     * prod_hi = hi(data_key) * lo(prime) << 32
     *
     * Since we only need 32 bits of this multiply, a trick can be used:
     * reinterpreting the vector as a uint32x4_t and multiplying by
     * { 0, prime, 0, prime } to cancel out the unwanted bits and avoid the
     * shift.
     */
    uint32x4_t prod_hi = vmulq_u32(vreinterpretq_u32_u64(data_key), kPrimeHi);

    /* Extract low bits for vmlal_u32 */
    uint32x2_t data_key_lo = vmovn_u64(data_key);

    /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */
    xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo);
  }
}
#else

#define XXH3_accumulate_512 XXH3_accumulate_512_scalar
#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar

LLVM_ATTRIBUTE_ALWAYS_INLINE
static void XXH3_accumulate_512_scalar(uint64_t *acc, const uint8_t *input,
                                       const uint8_t *secret) {
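  // One 64-byte stripe: each lane XORs its 8 input bytes with 8 secret bytes,
  // adds the raw input word to the neighbouring lane (i ^ 1), and adds the
  // 32x32->64-bit product of the keyed word's halves to its own lane.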
  for (size_t i = 0; i < XXH_ACC_NB; ++i) {
    uint64_t data_val = endian::read64le(input + 8 * i);
    uint64_t data_key = data_val ^ endian::read64le(secret + 8 * i);
    acc[i ^ 1] += data_val;
    acc[i] += uint32_t(data_key) * (data_key >> 32);
  }
}

LLVM_ATTRIBUTE_ALWAYS_INLINE
static void XXH3_scrambleAcc_scalar(uint64_t *acc, const uint8_t *secret) {
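  // Scramble each lane: xorshift by 47, XOR with the secret, then multiply by
  // a 32-bit prime to keep the accumulators well mixed between blocks.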
  for (size_t i = 0; i < XXH_ACC_NB; ++i) {
    acc[i] ^= acc[i] >> 47;
    acc[i] ^= endian::read64le(secret + 8 * i);
    acc[i] *= PRIME32_1;
  }
}
#endif

LLVM_ATTRIBUTE_ALWAYS_INLINE
static void XXH3_accumulate(uint64_t *acc, const uint8_t *input,
                            const uint8_t *secret, size_t nbStripes) {
  for (size_t n = 0; n < nbStripes; ++n) {
    XXH3_accumulate_512(acc, input + n * XXH_STRIPE_LEN,
                        secret + n * XXH_SECRET_CONSUME_RATE);
  }
}

static uint64_t XXH3_mix2Accs(const uint64_t *acc, const uint8_t *secret) {
  return XXH3_mul128_fold64(acc[0] ^ endian::read64le(secret),
                            acc[1] ^ endian::read64le(secret + 8));
}

static uint64_t XXH3_mergeAccs(const uint64_t *acc, const uint8_t *key,
                               uint64_t start) {
  uint64_t result64 = start;
  for (size_t i = 0; i < 4; ++i)
    result64 += XXH3_mix2Accs(acc + 2 * i, key + 16 * i);
  return XXH3_avalanche(result64);
}

LLVM_ATTRIBUTE_NOINLINE
static uint64_t XXH3_hashLong_64b(const uint8_t *input, size_t len,
                                  const uint8_t *secret, size_t secretSize) {
  const size_t nbStripesPerBlock =
      (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
  const size_t block_len = XXH_STRIPE_LEN * nbStripesPerBlock;
  const size_t nb_blocks = (len - 1) / block_len;
  alignas(16) uint64_t acc[XXH_ACC_NB] = {
      PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3,
      PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1,
  };
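  // Process whole blocks of nbStripesPerBlock stripes, scrambling the
  // accumulators with the end of the secret after each block; the final
  // partial block and last stripe are handled below.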
  for (size_t n = 0; n < nb_blocks; ++n) {
    XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock);
    XXH3_scrambleAcc(acc, secret + secretSize - XXH_STRIPE_LEN);
  }

  /* last partial block */
  const size_t nbStripes = (len - 1 - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
  assert(nbStripes <= secretSize / XXH_SECRET_CONSUME_RATE);
  XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes);

  /* last stripe */
  constexpr size_t XXH_SECRET_LASTACC_START = 7;
  XXH3_accumulate_512(acc, input + len - XXH_STRIPE_LEN,
                      secret + secretSize - XXH_STRIPE_LEN -
                          XXH_SECRET_LASTACC_START);

  /* converge into final hash */
  constexpr size_t XXH_SECRET_MERGEACCS_START = 11;
  return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START,
                        (uint64_t)len * PRIME64_1);
}

uint64_t llvm::xxh3_64bits(ArrayRef<uint8_t> data) {
  auto *in = data.data();
  size_t len = data.size();
  if (len <= 16)
    return XXH3_len_0to16_64b(in, len, kSecret, 0);
  if (len <= 128)
    return XXH3_len_17to128_64b(in, len, kSecret, 0);
  if (len <= XXH3_MIDSIZE_MAX)
    return XXH3_len_129to240_64b(in, len, kSecret, 0);
  return XXH3_hashLong_64b(in, len, kSecret, sizeof(kSecret));
}
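// Usage sketch (hypothetical caller, for illustration only; BufPtr/BufSize
// describe an arbitrary byte buffer):
//   ArrayRef<uint8_t> Bytes(BufPtr, BufSize);
//   uint64_t Digest = llvm::xxh3_64bits(Bytes);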

/* ==========================================
 * XXH3 128 bits (a.k.a XXH128)
 * ==========================================
 * XXH3's 128-bit variant has better mixing and strength than the 64-bit
 * variant, even without counting the significantly larger output size.
 *
 * For example, extra steps are taken to avoid the seed-dependent collisions
 * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
 *
 * This strength naturally comes at the cost of some speed, especially on short
 * lengths. Note that longer hashes are about as fast as the 64-bit version
 * due to it using only a slight modification of the 64-bit loop.
 *
 * XXH128 is also more oriented towards 64-bit machines. It is still extremely
 * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
 */

/*!
 * @internal
 * @def XXH_rotl32(x,r)
 * @brief 32-bit rotate left.
 *
 * @param x The 32-bit integer to be rotated.
 * @param r The number of bits to rotate.
 * @pre
 *   @p r > 0 && @p r < 32
 * @note
 *   @p x and @p r may be evaluated multiple times.
 * @return The rotated result.
 */
#if __has_builtin(__builtin_rotateleft32) &&                                   \
    __has_builtin(__builtin_rotateleft64)
#define XXH_rotl32 __builtin_rotateleft32
#define XXH_rotl64 __builtin_rotateleft64
/* Note: although _rotl exists for MinGW (GCC under Windows), performance seems
 * poor */
#elif defined(_MSC_VER)
#define XXH_rotl32(x, r) _rotl(x, r)
#define XXH_rotl64(x, r) _rotl64(x, r)
#else
#define XXH_rotl32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))
#define XXH_rotl64(x, r) (((x) << (r)) | ((x) >> (64 - (r))))
#endif

#define XXH_mult32to64(x, y) ((uint64_t)(uint32_t)(x) * (uint64_t)(uint32_t)(y))

/*!
 * @brief Calculates a 64->128-bit long multiply.
 *
 * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar
 * version.
 *
 * @param lhs , rhs The 64-bit integers to be multiplied
 * @return The 128-bit result represented in an @ref XXH128_hash_t.
 */
static XXH128_hash_t XXH_mult64to128(uint64_t lhs, uint64_t rhs) {
  /*
   * GCC/Clang __uint128_t method.
   *
   * On most 64-bit targets, GCC and Clang define a __uint128_t type.
   * This is usually the best way as it usually uses a native long 64-bit
   * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
   *
   * Usually.
   *
   * Despite being a 32-bit platform, Clang (and Emscripten) define this type
   * even though the platform lacks the arithmetic for it. This results in a laggy
   * compiler builtin call which calculates a full 128-bit multiply.
   * In that case it is best to use the portable one.
   * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
   */
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) &&        \
        defined(__SIZEOF_INT128__) ||                                         \
    (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)

  __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
  XXH128_hash_t r128;
  r128.low64 = (uint64_t)(product);
  r128.high64 = (uint64_t)(product >> 64);
  return r128;

  /*
   * MSVC for x64's _umul128 method.
   *
   * uint64_t _umul128(uint64_t Multiplier, uint64_t Multiplicand, uint64_t
   * *HighProduct);
   *
   * This compiles to single operand MUL on x64.
   */
#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC)

#ifndef _MSC_VER
#pragma intrinsic(_umul128)
#endif
  uint64_t product_high;
  uint64_t const product_low = _umul128(lhs, rhs, &product_high);
  XXH128_hash_t r128;
  r128.low64 = product_low;
  r128.high64 = product_high;
  return r128;

  /*
   * MSVC for ARM64's __umulh method.
   *
   * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method.
   */
#elif defined(_M_ARM64) || defined(_M_ARM64EC)

#ifndef _MSC_VER
#pragma intrinsic(__umulh)
#endif
  XXH128_hash_t r128;
  r128.low64 = lhs * rhs;
  r128.high64 = __umulh(lhs, rhs);
  return r128;

#else
  /*
   * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
   *
   * This is a fast and simple grade school multiply, which is shown below
   * with base 10 arithmetic instead of base 0x100000000.
   *
   *           9 3 // D2 lhs = 93
   *         x 7 5 // D2 rhs = 75
   *     ----------
   *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
   *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
   *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
   *       + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
   *     ---------
   *         2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
   *     + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67
   *     ---------
   *       6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975
   *
   * The reasons for adding the products like this are:
   *  1. It avoids manual carry tracking. Just like how
   *     (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
   *     This avoids a lot of complexity.
   *
   *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
   *     instruction available in ARM's Digital Signal Processing extension
   *     in 32-bit ARMv6 and later, which is shown below:
   *
   *         void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
   *         {
   *             uint64_t product = (uint64_t)*RdLo * (uint64_t)*RdHi + Rn + Rm;
   *             *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
   *             *RdHi = (xxh_u32)(product >> 32);
   *         }
   *
   *     This instruction was designed for efficient long multiplication, and
   *     allows this to be calculated in only 4 instructions at speeds
   *     comparable to some 64-bit ALUs.
   *
   *  3. It isn't terrible on other platforms. Usually this will be a couple
   *     of 32-bit ADD/ADCs.
   */

  /* First calculate all of the cross products. */
  uint64_t const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
  uint64_t const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
  uint64_t const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
  uint64_t const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);

  /* Now add the products together. These will never overflow. */
  uint64_t const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
  uint64_t const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
  uint64_t const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);

  XXH128_hash_t r128;
  r128.low64 = lower;
  r128.high64 = upper;
  return r128;
#endif
}

/*! Seems to produce slightly better code on GCC for some reason. */
LLVM_ATTRIBUTE_ALWAYS_INLINE constexpr uint64_t XXH_xorshift64(uint64_t v64,
                                                               int shift) {
  return v64 ^ (v64 >> shift);
}

LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t
XXH3_len_1to3_128b(const uint8_t *input, size_t len, const uint8_t *secret,
                   uint64_t seed) {
  /* A doubled version of 1to3_64b with different constants. */
  /*
   * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
   * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
   * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
   */
  uint8_t const c1 = input[0];
  uint8_t const c2 = input[len >> 1];
  uint8_t const c3 = input[len - 1];
  uint32_t const combinedl = ((uint32_t)c1 << 16) | ((uint32_t)c2 << 24) |
                             ((uint32_t)c3 << 0) | ((uint32_t)len << 8);
  uint32_t const combinedh = XXH_rotl32(byteswap(combinedl), 13);
  uint64_t const bitflipl =
      (endian::read32le(secret) ^ endian::read32le(secret + 4)) + seed;
  uint64_t const bitfliph =
      (endian::read32le(secret + 8) ^ endian::read32le(secret + 12)) - seed;
  uint64_t const keyed_lo = (uint64_t)combinedl ^ bitflipl;
  uint64_t const keyed_hi = (uint64_t)combinedh ^ bitfliph;
  XXH128_hash_t h128;
  h128.low64 = XXH64_avalanche(keyed_lo);
  h128.high64 = XXH64_avalanche(keyed_hi);
  return h128;
}

LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t
XXH3_len_4to8_128b(const uint8_t *input, size_t len, const uint8_t *secret,
                   uint64_t seed) {
  seed ^= (uint64_t)byteswap((uint32_t)seed) << 32;
  uint32_t const input_lo = endian::read32le(input);
  uint32_t const input_hi = endian::read32le(input + len - 4);
  uint64_t const input_64 = input_lo + ((uint64_t)input_hi << 32);
  uint64_t const bitflip =
      (endian::read64le(secret + 16) ^ endian::read64le(secret + 24)) + seed;
  uint64_t const keyed = input_64 ^ bitflip;

  /* Shift len to the left to ensure the added value is even; this keeps the
   * multiplier odd and avoids even multiplies. */
  XXH128_hash_t m128 = XXH_mult64to128(keyed, PRIME64_1 + (len << 2));

  m128.high64 += (m128.low64 << 1);
  m128.low64 ^= (m128.high64 >> 3);

  m128.low64 = XXH_xorshift64(m128.low64, 35);
  m128.low64 *= PRIME_MX2;
  m128.low64 = XXH_xorshift64(m128.low64, 28);
  m128.high64 = XXH3_avalanche(m128.high64);
  return m128;
}

LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t
XXH3_len_9to16_128b(const uint8_t *input, size_t len, const uint8_t *secret,
                    uint64_t seed) {
  uint64_t const bitflipl =
      (endian::read64le(secret + 32) ^ endian::read64le(secret + 40)) - seed;
  uint64_t const bitfliph =
      (endian::read64le(secret + 48) ^ endian::read64le(secret + 56)) + seed;
  uint64_t const input_lo = endian::read64le(input);
  uint64_t input_hi = endian::read64le(input + len - 8);
  XXH128_hash_t m128 =
      XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, PRIME64_1);
  /*
   * Put len in the middle of m128 to ensure that the length gets mixed to
   * both the low and high bits in the 128x64 multiply below.
   */
  m128.low64 += (uint64_t)(len - 1) << 54;
  input_hi ^= bitfliph;
  /*
   * Add the high 32 bits of input_hi to the high 32 bits of m128, then
   * add the long product of the low 32 bits of input_hi and PRIME32_2 to
   * the high 64 bits of m128.
   *
   * The best approach to this operation is different on 32-bit and 64-bit.
   */
  if (sizeof(void *) < sizeof(uint64_t)) { /* 32-bit */
    /*
     * 32-bit optimized version, which is more readable.
     *
     * On 32-bit, it removes an ADC and delays a dependency between the two
     * halves of m128.high64, but it generates an extra mask on 64-bit.
     */
    m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) +
                   XXH_mult32to64((uint32_t)input_hi, PRIME32_2);
  } else {
    /*
     * 64-bit optimized (albeit more confusing) version.
     *
     * Uses some properties of addition and multiplication to remove the mask:
     *
     * Let:
     *    a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
     *    b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
     *    c = PRIME32_2
     *
     *    a + (b * c)
     * Inverse Property: x + y - x == y
     *    a + (b * (1 + c - 1))
     * Distributive Property: x * (y + z) == (x * y) + (x * z)
     *    a + (b * 1) + (b * (c - 1))
     * Identity Property: x * 1 == x
     *    a + b + (b * (c - 1))
     *
     * Substitute a, b, and c:
     *    input_hi.hi + input_hi.lo + ((uint64_t)input_hi.lo * (PRIME32_2
     *    - 1))
     *
     * Since input_hi.hi + input_hi.lo == input_hi, we get this:
     *    input_hi + ((uint64_t)input_hi.lo * (PRIME32_2 - 1))
     */
    m128.high64 += input_hi + XXH_mult32to64((uint32_t)input_hi, PRIME32_2 - 1);
  }
  /* m128 ^= XXH_swap64(m128 >> 64); */
  m128.low64 ^= byteswap(m128.high64);

  /* 128x64 multiply: h128 = m128 * PRIME64_2; */
  XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2);
  h128.high64 += m128.high64 * PRIME64_2;

  h128.low64 = XXH3_avalanche(h128.low64);
  h128.high64 = XXH3_avalanche(h128.high64);
  return h128;
}

/*
 * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
 */
LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t
XXH3_len_0to16_128b(const uint8_t *input, size_t len, const uint8_t *secret,
                    uint64_t seed) {
  if (len > 8)
    return XXH3_len_9to16_128b(input, len, secret, seed);
  if (len >= 4)
    return XXH3_len_4to8_128b(input, len, secret, seed);
  if (len)
    return XXH3_len_1to3_128b(input, len, secret, seed);
  XXH128_hash_t h128;
  uint64_t const bitflipl =
      endian::read64le(secret + 64) ^ endian::read64le(secret + 72);
  uint64_t const bitfliph =
      endian::read64le(secret + 80) ^ endian::read64le(secret + 88);
  h128.low64 = XXH64_avalanche(seed ^ bitflipl);
  h128.high64 = XXH64_avalanche(seed ^ bitfliph);
  return h128;
}

/*
 * A bit slower than XXH3_mix16B, but handles multiply by zero better.
 */
LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t
XXH128_mix32B(XXH128_hash_t acc, const uint8_t *input_1, const uint8_t *input_2,
              const uint8_t *secret, uint64_t seed) {
  acc.low64 += XXH3_mix16B(input_1, secret + 0, seed);
  acc.low64 ^= endian::read64le(input_2) + endian::read64le(input_2 + 8);
  acc.high64 += XXH3_mix16B(input_2, secret + 16, seed);
  acc.high64 ^= endian::read64le(input_1) + endian::read64le(input_1 + 8);
  return acc;
}

LLVM_ATTRIBUTE_ALWAYS_INLINE static XXH128_hash_t
XXH3_len_17to128_128b(const uint8_t *input, size_t len, const uint8_t *secret,
                      size_t secretSize, uint64_t seed) {
  (void)secretSize;

  XXH128_hash_t acc;
  acc.low64 = len * PRIME64_1;
  acc.high64 = 0;

  if (len > 32) {
    if (len > 64) {
      if (len > 96) {
        acc =
            XXH128_mix32B(acc, input + 48, input + len - 64, secret + 96, seed);
      }
      acc = XXH128_mix32B(acc, input + 32, input + len - 48, secret + 64, seed);
    }
    acc = XXH128_mix32B(acc, input + 16, input + len - 32, secret + 32, seed);
  }
  acc = XXH128_mix32B(acc, input, input + len - 16, secret, seed);
  XXH128_hash_t h128;
  h128.low64 = acc.low64 + acc.high64;
  h128.high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) +
                ((len - seed) * PRIME64_2);
  h128.low64 = XXH3_avalanche(h128.low64);
  h128.high64 = (uint64_t)0 - XXH3_avalanche(h128.high64);
  return h128;
}

LLVM_ATTRIBUTE_NOINLINE static XXH128_hash_t
XXH3_len_129to240_128b(const uint8_t *input, size_t len, const uint8_t *secret,
                       size_t secretSize, uint64_t seed) {
  (void)secretSize;

  XXH128_hash_t acc;
  unsigned i;
  acc.low64 = len * PRIME64_1;
  acc.high64 = 0;
  /*
   * We set `i` to offset + 32. We do this so that unchanged
   * `len` can be used as upper bound. This reaches a sweet spot
   * where both x86 and aarch64 get simple agen and good codegen
   * for the loop.
   */
  for (i = 32; i < 160; i += 32) {
    acc = XXH128_mix32B(acc, input + i - 32, input + i - 16, secret + i - 32,
                        seed);
  }
  acc.low64 = XXH3_avalanche(acc.low64);
  acc.high64 = XXH3_avalanche(acc.high64);
  /*
   * NB: `i <= len` will duplicate the last 32 bytes if
   * len % 32 is zero. This is an unfortunate necessity to keep
   * the hash result stable.
   */
  for (i = 160; i <= len; i += 32) {
    acc = XXH128_mix32B(acc, input + i - 32, input + i - 16,
                        secret + XXH3_MIDSIZE_STARTOFFSET + i - 160, seed);
  }
  /* last bytes */
  acc =
      XXH128_mix32B(acc, input + len - 16, input + len - 32,
                    secret + XXH3_SECRETSIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
                    (uint64_t)0 - seed);

  XXH128_hash_t h128;
  h128.low64 = acc.low64 + acc.high64;
  h128.high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) +
                ((len - seed) * PRIME64_2);
  h128.low64 = XXH3_avalanche(h128.low64);
  h128.high64 = (uint64_t)0 - XXH3_avalanche(h128.high64);
  return h128;
}

LLVM_ATTRIBUTE_ALWAYS_INLINE XXH128_hash_t
XXH3_hashLong_128b(const uint8_t *input, size_t len, const uint8_t *secret,
                   size_t secretSize) {
  const size_t nbStripesPerBlock =
      (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
  const size_t block_len = XXH_STRIPE_LEN * nbStripesPerBlock;
  const size_t nb_blocks = (len - 1) / block_len;
  alignas(16) uint64_t acc[XXH_ACC_NB] = {
      PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3,
      PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1,
  };

  for (size_t n = 0; n < nb_blocks; ++n) {
    XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock);
    XXH3_scrambleAcc(acc, secret + secretSize - XXH_STRIPE_LEN);
  }

  /* last partial block */
  const size_t nbStripes = (len - 1 - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
  assert(nbStripes <= secretSize / XXH_SECRET_CONSUME_RATE);
  XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes);

  /* last stripe */
  constexpr size_t XXH_SECRET_LASTACC_START = 7;
  XXH3_accumulate_512(acc, input + len - XXH_STRIPE_LEN,
                      secret + secretSize - XXH_STRIPE_LEN -
                          XXH_SECRET_LASTACC_START);

  /* converge into final hash */
  static_assert(sizeof(acc) == 64);
  XXH128_hash_t h128;
  constexpr size_t XXH_SECRET_MERGEACCS_START = 11;
  h128.low64 = XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START,
                              (uint64_t)len * PRIME64_1);
  h128.high64 = XXH3_mergeAccs(
      acc, secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
      ~((uint64_t)len * PRIME64_2));
  return h128;
}

llvm::XXH128_hash_t llvm::xxh3_128bits(ArrayRef<uint8_t> data) {
  size_t len = data.size();
  const uint8_t *input = data.data();

  /*
   * If an action is to be taken if `secret` conditions are not respected,
   * it should be done here.
   * For now, it's a contract pre-condition.
   * Adding a check and a branch here would cost performance at every hash.
   */
  if (len <= 16)
    return XXH3_len_0to16_128b(input, len, kSecret, /*seed64=*/0);
  if (len <= 128)
    return XXH3_len_17to128_128b(input, len, kSecret, sizeof(kSecret),
                                 /*seed64=*/0);
  if (len <= XXH3_MIDSIZE_MAX)
    return XXH3_len_129to240_128b(input, len, kSecret, sizeof(kSecret),
                                  /*seed64=*/0);
  return XXH3_hashLong_128b(input, len, kSecret, sizeof(kSecret));
}