Lines Matching +full:16 +full:- +full:input
2 * xxHash - Fast Hash algorithm
6 * - xxHash homepage: http://www.xxhash.com
7 * - xxHash source repository : https://github.com/Cyan4973/xxHash
9 * This source code is licensed under both the BSD-style license (found in the
12 * You may select, at your option, one of the above-listed licenses.
49 MD5-32 0.33 GB/s 10 Ronald L. Rivest
50 SHA1-32 0.28 GB/s 10
57 Other speed-oriented implementations can be faster,
59 https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c349009…
61 A 64-bit version, named XXH64, is available since r35.
62 It offers much better speed, but for 64-bit applications only.
79 * expressed as a compile-time constant:
81 * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
335 /*-**********************************************************************
336 * 32-bit hash
340 * @brief An unsigned 32-bit integer.
360 # error "unsupported platform: need a 32-bit type"
370 * Contains functions used in the classic 32-bit xxHash algorithm.
373 * XXH32 is useful for older platforms, with no or poor 64-bit performance.
375 * for both 32-bit and 64-bit systems, and offers true 64/128 bit hash results.
383 * @brief Calculates the 32-bit hash of @p input using xxHash32.
387 * @param input The block of data to be hashed, at least @p length bytes in size.
388 * @param length The length of @p input, in bytes.
389 * @param seed The 32-bit seed to alter the hash's output predictably.
392 * The memory between @p input and @p input + @p length must be valid,
393 * readable, contiguous memory. However, if @p length is `0`, @p input may be
396 * @return The calculated 32-bit hash value.
404 XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
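A minimal sketch of calling the one-shot function declared above (buffer, length, and seed values here are purely illustrative):

    #include <stdio.h>
    #include "xxhash.h"

    int main(void)
    {
        static const char data[] = "hello world";           /* example input */
        XXH32_hash_t const seed = 0;                         /* 0 == default seed */
        XXH32_hash_t const h = XXH32(data, sizeof(data)-1, seed);
        printf("XXH32 = 0x%08X\n", (unsigned)h);
        return 0;
    }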
407 * Streaming functions generate the xxHash value from an incremental input.
408 * This method is slower than single-call functions, due to state management.
421  * This function returns the nn-bit hash as an int or long long.
423 * It's still possible to continue inserting input into the hash state after a
495 * @param seed The 32-bit seed to alter the hash result predictably.
505 * @brief Consumes a block of @p input to an @ref XXH32_state_t.
510 * @param input The block of data to be hashed, at least @p length bytes in size.
511 * @param length The length of @p input, in bytes.
516 * The memory between @p input and @p input + @p length must be valid,
517 * readable, contiguous memory. However, if @p length is `0`, @p input may be
522 XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t lengt…
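A sketch of the streaming flow documented above, using the public create/reset/update/digest/free calls; the chunk size is arbitrary and chosen only for illustration:

    #include "xxhash.h"

    XXH32_hash_t hash_in_chunks(const void* data, size_t len, size_t chunkSize)
    {
        XXH32_state_t* const state = XXH32_createState();
        const unsigned char* p = (const unsigned char*)data;
        XXH32_hash_t hash;
        if (state == NULL) return 0;               /* allocation failure */
        (void)XXH32_reset(state, 0 /* seed */);
        while (len > 0) {
            size_t const n = (len < chunkSize) ? len : chunkSize;
            (void)XXH32_update(state, p, n);       /* feed one chunk at a time */
            p += n; len -= n;
        }
        hash = XXH32_digest(state);                /* same result as one-shot XXH32() */
        XXH32_freeState(state);
        return hash;
    }

As noted above, the digest can be read at any point and more input can still be added afterwards.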
545  * This is the simplest and fastest format for further post-processing.
550 * The canonical representation settles this issue by mandating big-endian
551 * convention, the same convention as human-readable numbers (large digits first).
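A sketch of producing and reading back that canonical (big-endian) form with the corresponding public helpers:

    #include "xxhash.h"

    void store_hash_portably(XXH32_hash_t h, unsigned char out[4])
    {
        XXH32_canonical_t canon;
        XXH32_canonicalFromHash(&canon, h);   /* serializes big-endian, regardless of host */
        /* canon.digest holds 4 bytes suitable for storage or transmission */
        out[0] = canon.digest[0]; out[1] = canon.digest[1];
        out[2] = canon.digest[2]; out[3] = canon.digest[3];
        /* XXH32_hashFromCanonical(&canon) recovers the same XXH32_hash_t on any platform */
    }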
598 /* C-language Attributes are added in C23. */
634 /*-**********************************************************************
635 * 64-bit hash
639 * @brief An unsigned 64-bit integer.
655 /* the following type must have a width of 64-bit */
666 * Contains functions used in the classic 64-bit xxHash algorithm.
669 * XXH3 provides competitive speed for both 32-bit and 64-bit systems,
676 * @brief Calculates the 64-bit hash of @p input using xxHash64.
678 * This function usually runs faster on 64-bit systems, but slower on 32-bit
681 * @param input The block of data to be hashed, at least @p length bytes in size.
682 * @param length The length of @p input, in bytes.
683 * @param seed The 64-bit seed to alter the hash's output predictably.
686 * The memory between @p input and @p input + @p length must be valid,
687 * readable, contiguous memory. However, if @p length is `0`, @p input may be
690 * @return The calculated 64-bit hash.
698 /* Begin FreeBSD - This symbol is needed by dll-linked CLI zstd(1). */
701 XXH_PUBLIC_API XXH64_hash_t XXH64(const void* input, size_t length, XXH64_hash_t seed);
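The 64-bit one-shot call mirrors XXH32(); a small illustrative sketch:

    #include "xxhash.h"

    XXH64_hash_t hash_message(const void* msg, size_t msgSize)
    {
        XXH64_hash_t const seed = 0;        /* 0 == default seed */
        return XXH64(msg, msgSize, seed);   /* typically fastest on 64-bit targets */
    }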
715 XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t lengt…
732 * - Improved speed for both small and large inputs
733 * - True 64-bit and 128-bit outputs
734 * - SIMD acceleration
735 * - Improved 32-bit viability
739 * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
745 * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic,
747 * Any 32-bit and 64-bit targets that can run XXH32 smoothly
766 * The API supports one-shot hashing, streaming mode, and custom secrets.
769 /*-**********************************************************************
770 * XXH3 64-bit variant
774 * default 64-bit variant, using default secret and default seed of 0.
820 * As a consequence, streaming is slower than one-shot hashing.
821 * For better performance, prefer one-shot functions whenever applicable.
849  * Similar to the one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`,
857 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t …
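A sketch of the XXH3 64-bit variant in both forms described above, one-shot and streaming (error codes are ignored here for brevity):

    #include "xxhash.h"

    XXH64_hash_t xxh3_oneshot(const void* data, size_t len)
    {
        return XXH3_64bits(data, len);          /* default secret, default seed of 0 */
    }

    XXH64_hash_t xxh3_streamed(const void* data, size_t len)
    {
        XXH3_state_t* const state = XXH3_createState();
        XXH64_hash_t hash;
        if (state == NULL) return 0;
        (void)XXH3_64bits_reset(state);         /* same parameters => same result as one-shot */
        (void)XXH3_64bits_update(state, data, len);
        hash = XXH3_64bits_digest(state);
        XXH3_freeState(state);
        return hash;
    }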
864 /*-**********************************************************************
865 * XXH3 128-bit variant
869 * @brief The return value from 128-bit hashes.
887 * As a consequence, streaming is slower than one-shot hashing.
888 * For better performance, prefer one-shot functions whenever applicable.
893  * All reset and streaming functions have the same meaning as their 64-bit counterparts.
900 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t…
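A corresponding sketch for the 128-bit variant; the result arrives as an XXH128_hash_t with low64/high64 fields and can be compared with XXH128_isEqual():

    #include "xxhash.h"

    int same_digest(const void* a, size_t lenA, const void* b, size_t lenB)
    {
        XXH128_hash_t const hA = XXH3_128bits(a, lenA);
        XXH128_hash_t const hB = XXH3_128bits(b, lenB);
        return XXH128_isEqual(hA, hB);   /* compares both 64-bit halves */
    }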
971    XXH32_hash_t large_len;    /*!< Whether the input length is >= 16 (handles @ref total_len_32 overflow) */
973 …XXH32_hash_t mem32[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */
979 #ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */
994 XXH64_hash_t total_len; /*!< Total length hashed. This is always 64-bit. */
1078 /*!< Reserved field. Needed for padding on 64-bit. */
1082 /*!< Total length hashed. 64-bit even on 32-bit targets. */
1100 * @brief Initializes a stack-allocated `XXH3_state_s`.
1110 #define XXH3_INITSTATE(XXH3_state_ptr) { (XXH3_state_ptr)->seed = 0; }
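A sketch of the stack-allocation pattern this macro exists for; the state definition is only visible with XXH_STATIC_LINKING_ONLY, and the reset call afterwards performs the actual initialization:

    #define XXH_STATIC_LINKING_ONLY   /* exposes XXH3_state_t's definition */
    #include "xxhash.h"

    XXH64_hash_t hash_on_stack(const void* data, size_t len)
    {
        XXH3_state_t state;           /* no heap allocation */
        XXH3_INITSTATE(&state);       /* makes the uninitialized state safe to reset */
        (void)XXH3_64bits_reset(&state);
        (void)XXH3_64bits_update(&state, data, len);
        return XXH3_64bits_digest(&state);
    }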
1114 * simple alias to pre-selected XXH3_128bits variant
1125 * Derive a high-entropy secret from any user-defined content, named customSeed.
1127  * The `_withSecret()` variants are useful to provide a higher level of protection than a 64-bit seed,
1130 * The function accepts as input a custom seed of any length and any content,
1131  * and derives from it a high-entropy secret of length @p secretSize
1140 * _and_ feature very high entropy (consist of random-looking bytes).
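A hedged sketch of that flow: derive a secret from arbitrary seed material with XXH3_generateSecret(), then hash with it (the buffer size is chosen as the public minimum, XXH3_SECRET_SIZE_MIN, for illustration):

    #include "xxhash.h"

    XXH64_hash_t hash_with_derived_secret(const void* data, size_t len,
                                          const void* seedMaterial, size_t seedSize)
    {
        unsigned char secret[XXH3_SECRET_SIZE_MIN];   /* any size >= the minimum is legal */
        /* derive a high-entropy secret from user-defined content of any length */
        if (XXH3_generateSecret(secret, sizeof(secret), seedMaterial, seedSize) != XXH_OK)
            return 0;
        return XXH3_64bits_withSecret(data, len, secret, sizeof(secret));
    }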
1183 * hence offering only a pure speed benefit on "large" input,
1184 * by skipping the need to regenerate the secret for every large input.
1186 * Another usage scenario is to hash the secret to a 64-bit hash value,
1230 /*-**********************************************************************
1232 *-**********************************************************************
1268 * @brief Define this to disable 64-bit code.
1281 * is sub-optimal.
1288 * - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy`
1293 * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((packed))`
1299 * - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast
1307 * - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift
1310 * inline small `memcpy()` calls, and it might also be faster on big-endian
1316 * Methods 1 and 2 rely on implementation-defined behavior. Use these with
1320 * See http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
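These macros are compile-time switches; a sketch of selecting the byteshift load (method 3) for a target where unaligned or type-punned reads are unsafe:

    /* define before including xxhash.h, or pass -DXXH_FORCE_MEMORY_ACCESS=3 */
    #define XXH_FORCE_MEMORY_ACCESS 3   /* 0: memcpy (default), 1: packed attribute,
                                           2: direct cast, 3: byteshift */
    #include "xxhash.h"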
1328 * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32()
1334 * It checks for input alignment, and when conditions are met, uses a "fast
1335 * path" employing direct 32-bit/64-bit reads, resulting in _dramatically
1356 * @brief When non-zero, sets all functions to `static`.
1371 * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
1372 * -fno-inline with GCC or Clang, this will automatically be defined.
1432 # if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \
1433 || defined(__NO_INLINE__) /* -O0, -fno-inline */
1529 # define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } whi…
1580 * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness.
1585 * @return The 32-bit native endian integer from the bytes at @p ptr.
1591 * @brief Reads an unaligned 32-bit little endian integer from @p ptr.
1596 * @return The 32-bit little endian integer from the bytes at @p ptr.
1602 * @brief Reads an unaligned 32-bit big endian integer from @p ptr.
1607 * @return The 32-bit big endian integer from the bytes at @p ptr.
1624 * @return The 32-bit little endian integer from the bytes at @p ptr.
1654 return ((const xxh_unalign*)ptr)->u32; in XXH_read32()
1661 * see: http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
1713 * Portable and well-defined behavior. in XXH_isLittleEndian()
1727 * Compiler-specific Functions and Macros
1740 * @brief 32-bit rotate left.
1742 * @param x The 32-bit integer to be rotated.
1759 # define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
1760 # define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
1766 * @brief A 32-bit byteswap.
1768 * @param x The 32-bit integer to byteswap.
1800 * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.
1811 | ((xxh_u32)bytePtr[2] << 16) in XXH_readLE32()
1820 | ((xxh_u32)bytePtr[1] << 16) in XXH_readBE32()
1855 * 32-bit hash functions
1882 * This shuffles the bits so that any bit from @p input impacts several bits in
1886 * @param input The stripe of input to mix.
1889 static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) in XXH32_round() argument
1891 acc += input * XXH_PRIME32_2; in XXH32_round()
1904 * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on in XXH32_round()
1910 * - Four instructions are required to rotate, in XXH32_round()
1919 * - Instruction level parallelism is actually more beneficial here because in XXH32_round()
1937 * The final mix ensures that all input bits have a chance to impact any bit in
1949 h32 ^= h32 >> 16; in XXH32_avalanche()
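For context, the line above is the last step of XXH32's finalization mix; the full sequence, as published for XXH32, interleaves xor-shifts with multiplications by the prime constants:

    static xxh_u32 XXH32_avalanche(xxh_u32 h32)
    {
        h32 ^= h32 >> 15;
        h32 *= XXH_PRIME32_2;    /* 2246822519U */
        h32 ^= h32 >> 13;
        h32 *= XXH_PRIME32_3;    /* 3266489917U */
        h32 ^= h32 >> 16;        /* the step shown above */
        return h32;
    }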
1957 * @brief Processes the last 0-15 bytes of @p ptr.
1959 * There may be up to 15 bytes remaining to consume from the input.
1960 * This final stage will digest them to ensure that all input bytes are present
1964 * @param ptr The pointer to the remaining input.
1965 * @param len The remaining length, modulo 16.
1990 len -= 4; in XXH32_finalize()
1994 --len; in XXH32_finalize()
1998 switch(len&15) /* or switch(bEnd - p) */ { in XXH32_finalize()
2054 * @param input , len , seed Directly passed from @ref XXH32().
2055 * @param align Whether @p input is aligned.
2059 XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) in XXH32_endian_align() argument
2063 if (input==NULL) XXH_ASSERT(len == 0); in XXH32_endian_align()
2065 if (len>=16) { in XXH32_endian_align()
2066 const xxh_u8* const bEnd = input + len; in XXH32_endian_align()
2067 const xxh_u8* const limit = bEnd - 15; in XXH32_endian_align()
2071 xxh_u32 v4 = seed - XXH_PRIME32_1; in XXH32_endian_align()
2074 v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; in XXH32_endian_align()
2075 v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4; in XXH32_endian_align()
2076 v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4; in XXH32_endian_align()
2077 v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4; in XXH32_endian_align()
2078 } while (input < limit); in XXH32_endian_align()
2088 return XXH32_finalize(h32, input, len&15, align); in XXH32_endian_align()
2092 XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) in XXH32() argument
2098 XXH32_update(&state, (const xxh_u8*)input, len); in XXH32()
2102 … if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ in XXH32()
2103 return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); in XXH32()
2106 return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); in XXH32()
2138 statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2; in XXH32_reset()
2139 statePtr->v[1] = seed + XXH_PRIME32_2; in XXH32_reset()
2140 statePtr->v[2] = seed + 0; in XXH32_reset()
2141 statePtr->v[3] = seed - XXH_PRIME32_1; in XXH32_reset()
2148 XXH32_update(XXH32_state_t* state, const void* input, size_t len) in XXH32_update() argument
2150 if (input==NULL) { in XXH32_update()
2155 { const xxh_u8* p = (const xxh_u8*)input; in XXH32_update()
2158 state->total_len_32 += (XXH32_hash_t)len; in XXH32_update()
2159 state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); in XXH32_update()
2161 if (state->memsize + len < 16) { /* fill in tmp buffer */ in XXH32_update()
2162 XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len); in XXH32_update()
2163 state->memsize += (XXH32_hash_t)len; in XXH32_update()
2167 if (state->memsize) { /* some data left from previous update */ in XXH32_update()
2168 XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); in XXH32_update()
2169 { const xxh_u32* p32 = state->mem32; in XXH32_update()
2170 state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++; in XXH32_update()
2171 state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++; in XXH32_update()
2172 state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++; in XXH32_update()
2173 state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32)); in XXH32_update()
2175 p += 16-state->memsize; in XXH32_update()
2176 state->memsize = 0; in XXH32_update()
2179 if (p <= bEnd-16) { in XXH32_update()
2180 const xxh_u8* const limit = bEnd - 16; in XXH32_update()
2183 state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4; in XXH32_update()
2184 state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4; in XXH32_update()
2185 state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4; in XXH32_update()
2186 state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4; in XXH32_update()
2192 XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); in XXH32_update()
2193 state->memsize = (unsigned)(bEnd-p); in XXH32_update()
2206 if (state->large_len) { in XXH32_digest()
2207 h32 = XXH_rotl32(state->v[0], 1) in XXH32_digest()
2208 + XXH_rotl32(state->v[1], 7) in XXH32_digest()
2209 + XXH_rotl32(state->v[2], 12) in XXH32_digest()
2210 + XXH_rotl32(state->v[3], 18); in XXH32_digest()
2212 h32 = state->v[2] /* == seed */ + XXH_PRIME32_5; in XXH32_digest()
2215 h32 += state->total_len_32; in XXH32_digest()
2217 return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); in XXH32_digest()
2229 * as human-readable numbers (large digits first).
2253 * 64-bit hash functions
2295 return ((const xxh_unalign64*)ptr)->u64; in XXH_read64()
2302 * see: http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
2332 /* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */
2340 | ((xxh_u64)bytePtr[2] << 16) in XXH_readLE64()
2353 | ((xxh_u64)bytePtr[5] << 16) in XXH_readBE64()
2405 static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) in XXH64_round() argument
2407 acc += input * XXH_PRIME64_2; in XXH64_round()
2444 len -= 8; in XXH64_finalize()
2450 len -= 4; in XXH64_finalize()
2455 --len; in XXH64_finalize()
2471 XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) in XXH64_endian_align() argument
2474 if (input==NULL) XXH_ASSERT(len == 0); in XXH64_endian_align()
2477 const xxh_u8* const bEnd = input + len; in XXH64_endian_align()
2478 const xxh_u8* const limit = bEnd - 31; in XXH64_endian_align()
2482 xxh_u64 v4 = seed - XXH_PRIME64_1; in XXH64_endian_align()
2485 v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8; in XXH64_endian_align()
2486 v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8; in XXH64_endian_align()
2487 v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8; in XXH64_endian_align()
2488 v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8; in XXH64_endian_align()
2489 } while (input<limit); in XXH64_endian_align()
2503 return XXH64_finalize(h64, input, len, align); in XXH64_endian_align()
2508 XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed) in XXH64() argument
2514 XXH64_update(&state, (const xxh_u8*)input, len); in XXH64()
2518 if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */ in XXH64()
2519 return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); in XXH64()
2522 return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); in XXH64()
2552 statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2; in XXH64_reset()
2553 statePtr->v[1] = seed + XXH_PRIME64_2; in XXH64_reset()
2554 statePtr->v[2] = seed + 0; in XXH64_reset()
2555 statePtr->v[3] = seed - XXH_PRIME64_1; in XXH64_reset()
2561 XXH64_update (XXH64_state_t* state, const void* input, size_t len) in XXH64_update() argument
2563 if (input==NULL) { in XXH64_update()
2568 { const xxh_u8* p = (const xxh_u8*)input; in XXH64_update()
2571 state->total_len += len; in XXH64_update()
2573 if (state->memsize + len < 32) { /* fill in tmp buffer */ in XXH64_update()
2574 XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len); in XXH64_update()
2575 state->memsize += (xxh_u32)len; in XXH64_update()
2579 if (state->memsize) { /* tmp buffer is full */ in XXH64_update()
2580 XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize); in XXH64_update()
2581 state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0)); in XXH64_update()
2582 state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1)); in XXH64_update()
2583 state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2)); in XXH64_update()
2584 state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3)); in XXH64_update()
2585 p += 32 - state->memsize; in XXH64_update()
2586 state->memsize = 0; in XXH64_update()
2590 const xxh_u8* const limit = bEnd - 32; in XXH64_update()
2593 state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8; in XXH64_update()
2594 state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8; in XXH64_update()
2595 state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8; in XXH64_update()
2596 state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8; in XXH64_update()
2602 XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); in XXH64_update()
2603 state->memsize = (unsigned)(bEnd-p); in XXH64_update()
2616 if (state->total_len >= 32) { in XXH64_digest()
2617 …h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_… in XXH64_digest()
2618 h64 = XXH64_mergeRound(h64, state->v[0]); in XXH64_digest()
2619 h64 = XXH64_mergeRound(h64, state->v[1]); in XXH64_digest()
2620 h64 = XXH64_mergeRound(h64, state->v[2]); in XXH64_digest()
2621 h64 = XXH64_mergeRound(h64, state->v[3]); in XXH64_digest()
2623 h64 = state->v[2] /*seed*/ + XXH_PRIME64_5; in XXH64_digest()
2626 h64 += (xxh_u64) state->total_len; in XXH64_digest()
2628 return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); in XXH64_digest()
2701 * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
2702 * remaining a true 64-bit/128-bit hash function.
2704 * This is done by prioritizing a subset of 64-bit operations that can be
2705 * emulated without too many steps on the average 32-bit machine.
2707 * For example, these two lines seem similar, and run equally fast on 64-bit:
2713 * However, to a 32-bit machine, there is a major difference.
2717 * x.lo ^= (x.hi >> (47 - 32));
2722 * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
2727 * - All the bits we need are in the upper 32 bits, so we can ignore the lower
2729 * - The shift result will always fit in the lower 32 bits, and therefore,
2734 * - Usable unaligned access
2735 * - A 32-bit or 64-bit ALU
2736 * - If 32-bit, a decent ADC instruction
2737 * - A 32 or 64-bit multiply with a 64-bit result
2738 * - For the 128-bit variant, a decent byteswap helps short inputs.
2740 * The first two are already required by XXH32, and almost all 32-bit and 64-bit
2743 * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
2746 * First of all, Thumb-1 lacks support for the UMULL instruction which
2755 * do a 32->64 multiply with UMULL, and the flexible operand allowing free
2760 * If compiling Thumb-1 for a target which supports ARM instructions, we will
2764 * to specify -march, as you likely meant to compile for a newer architecture.
2770 # warning "XXH3 is highly inefficient without ARM or Thumb-2."
2808 XXH_NEON = 4, /*!< NEON for most ARMv7-A and all AArch64 */
2809 XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */
2867 # define XXH_ACC_ALIGN 16
2871 # define XXH_ACC_ALIGN 16
2873 # define XXH_ACC_ALIGN 16
2888 * GCC usually generates the best code with -O3 for xxHash.
2898 * -O2 -mavx2 -march=haswell
2900 * -O2 -mavx2 -mno-avx256-split-unaligned-load
2904 * -O2, but the other one we can't control without "failed to inline always
2909 && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
2911 # pragma GCC optimize("-O2")
2922 * To do the same operation, the 128-bit 'Q' register needs to be split into
2923 * two 64-bit 'D' registers, performing this operation::
2926 * | '---------. .--------' |
2928 * | .---------' '--------. |
2932 * completely different than the fastest method for ARMv7-A.
2934 * ARMv7-A treats D registers as unions overlaying Q registers, so modifying
2936 * will only affect bits 8-15 of AX on x86.
2941 * On ARMv7-A, this strangely modifies both parameters in place instead of
2942 * taking the usual 3-operand form.
2944 * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the
2946 * halves where we want - all in one instruction.
2961 * aarch64 cannot access the high bits of a Q-form register, and writes to a
2962 * D-form register zero the high bits, similar to how writes to W-form scalar
2984 * This is available on ARMv7-A, but is less efficient than a single VZIP.32.
2988 * Function-like macro:
3002 /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \
3003 … /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \
3024 * emulated 64-bit arithmetic is too slow.
3028 * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but it can't
3029 * have more than 2 NEON (F0/F1) micro-ops. If you are only using NEON instructions,
3033 * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once.
3040 * This change benefits CPUs with large micro-op buffers without negatively affecting
3044 * |:----------------------|:--------------------|----------:|-----------:|------:|
3045 * | Snapdragon 730 (A76) | 2 NEON/8 micro-ops | 8.8 GB/s | 10.1 GB/s | ~16% |
3046 * | Snapdragon 835 (A73) | 2 NEON/3 micro-ops | 5.1 GB/s | 5.3 GB/s | ~5% |
3047 * | Marvell PXA1928 (A53) | In-order dual-issue | 1.9 GB/s | 1.9 GB/s | 0% |
3098 # warning "-maltivec=be is not recommended. Please use native endianness."
3173 # include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
3216 * @brief Calculates a 32-bit to 64-bit long multiply.
3225 * If you are compiling for platforms like Thumb-1 and don't have a better option,
3229 * @return 64-bit product of the low 32 bits of @p x and @p y.
3241 * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
3244 * and perform a full 64x64 multiply -- entirely redundant on 32-bit.
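The downcast-then-upcast trick mentioned above amounts to truncating both operands to 32 bits before widening them for the multiply; a sketch:

    /* portable 32x32->64 multiply: truncate to 32 bits, then widen */
    #define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))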
3250 * @brief Calculates a 64->128-bit long multiply.
3255 * @param lhs , rhs The 64-bit integers to be multiplied
3256 * @return The 128-bit result represented in an @ref XXH128_hash_t.
3264 * On most 64-bit targets, GCC and Clang define a __uint128_t type. in XXH_mult64to128()
3265 * This is usually the best way as it usually uses a native long 64-bit in XXH_mult64to128()
3270 * Despite being a 32-bit platform, Clang (and emscripten) define this type in XXH_mult64to128()
3272 * compiler builtin call which calculates a full 128-bit multiply. in XXH_mult64to128()
3274 * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 in XXH_mult64to128()
3322 * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. in XXH_mult64to128()
3329 * ---------- in XXH_mult64to128()
3334 * --------- in XXH_mult64to128()
3337 * --------- in XXH_mult64to128()
3347 * in 32-bit ARMv6 and later, which is shown below: in XXH_mult64to128()
3358 * comparable to some 64-bit ALUs. in XXH_mult64to128()
3361 * of 32-bit ADD/ADCs. in XXH_mult64to128()
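When the compiler offers a 128-bit integer type, the whole routine collapses into one widening multiply; a sketch of that path (the function name here is illustrative; the scalar fallback discussed above assembles the same product from four 32x32->64 partial products):

    #if defined(__SIZEOF_INT128__)
    static XXH128_hash_t XXH_mult64to128_via_u128(xxh_u64 lhs, xxh_u64 rhs)
    {
        __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
        XXH128_hash_t r128;
        r128.low64  = (xxh_u64)(product);         /* lower 64 bits of the product */
        r128.high64 = (xxh_u64)(product >> 64);   /* upper 64 bits of the product */
        return r128;
    }
    #endif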
3383 * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it.
3388 * @param lhs , rhs The 64-bit integers to multiply
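The fold step described above simply XORs the two halves of that 128-bit product back down to 64 bits; a minimal sketch:

    static xxh_u64 XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
    {
        XXH128_hash_t const product = XXH_mult64to128(lhs, rhs);
        return product.low64 ^ product.high64;   /* XOR-fold 128 -> 64 bits */
    }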
3408 * suitable when input bits are already partially mixed
3421 * preferable when input has not been previously mixed
3438 * sub-optimal on short lengths. It used an iterative algorithm which strongly
3445 * reduces latency, especially when emulating 64-bit multiplies on 32-bit.
3452 * At very short lengths, there isn't enough input to fully hide secrets, or use
3468 XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) in XXH3_len_1to3_64b() argument
3470 XXH_ASSERT(input != NULL); in XXH3_len_1to3_64b()
3474 * len = 1: combined = { input[0], 0x01, input[0], input[0] } in XXH3_len_1to3_64b()
3475 * len = 2: combined = { input[1], 0x02, input[0], input[1] } in XXH3_len_1to3_64b()
3476 * len = 3: combined = { input[2], 0x03, input[0], input[1] } in XXH3_len_1to3_64b()
3478 { xxh_u8 const c1 = input[0]; in XXH3_len_1to3_64b()
3479 xxh_u8 const c2 = input[len >> 1]; in XXH3_len_1to3_64b()
3480 xxh_u8 const c3 = input[len - 1]; in XXH3_len_1to3_64b()
3481 xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) in XXH3_len_1to3_64b()
3490 XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) in XXH3_len_4to8_64b() argument
3492 XXH_ASSERT(input != NULL); in XXH3_len_4to8_64b()
3496 { xxh_u32 const input1 = XXH_readLE32(input); in XXH3_len_4to8_64b()
3497 xxh_u32 const input2 = XXH_readLE32(input + len - 4); in XXH3_len_4to8_64b()
3498 xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; in XXH3_len_4to8_64b()
3506 XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) in XXH3_len_9to16_64b() argument
3508 XXH_ASSERT(input != NULL); in XXH3_len_9to16_64b()
3510 XXH_ASSERT(9 <= len && len <= 16); in XXH3_len_9to16_64b()
3512 xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; in XXH3_len_9to16_64b()
3513 xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; in XXH3_len_9to16_64b()
3514 xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; in XXH3_len_9to16_64b()
3523 XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) in XXH3_len_0to16_64b() argument
3525 XXH_ASSERT(len <= 16); in XXH3_len_0to16_64b()
3526 { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); in XXH3_len_0to16_64b()
3527 if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); in XXH3_len_0to16_64b()
3528 if (len) return XXH3_len_1to3_64b(input, len, secret, seed); in XXH3_len_0to16_64b()
3534 * DISCLAIMER: There are known *seed-dependent* multicollisions here due to
3540 * unseeded non-cryptographic hashes, it does not attempt to defend itself
3549 * function that is only called up to 16 times per hash with up to 240 bytes of
3550 * input.
3552 * This is not too bad for a non-cryptographic hash function, especially with
3555 * The 128-bit variant (which trades some speed for strength) is NOT affected
3559 XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input, in XXH3_mix16B() argument
3567 * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in in XXH3_mix16B()
3573 * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600, in XXH3_mix16B()
3582 { xxh_u64 const input_lo = XXH_readLE64(input); in XXH3_mix16B()
3583 xxh_u64 const input_hi = XXH_readLE64(input+8); in XXH3_mix16B()
3586 input_hi ^ (XXH_readLE64(secret+8) - seed64) in XXH3_mix16B()
3591 /* For mid-range keys, XXH3 uses a Mum-hash variant. */
3593 XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, in XXH3_len_17to128_64b() argument
3598 XXH_ASSERT(16 < len && len <= 128); in XXH3_len_17to128_64b()
3604 acc += XXH3_mix16B(input+48, secret+96, seed); in XXH3_len_17to128_64b()
3605 acc += XXH3_mix16B(input+len-64, secret+112, seed); in XXH3_len_17to128_64b()
3607 acc += XXH3_mix16B(input+32, secret+64, seed); in XXH3_len_17to128_64b()
3608 acc += XXH3_mix16B(input+len-48, secret+80, seed); in XXH3_len_17to128_64b()
3610 acc += XXH3_mix16B(input+16, secret+32, seed); in XXH3_len_17to128_64b()
3611 acc += XXH3_mix16B(input+len-32, secret+48, seed); in XXH3_len_17to128_64b()
3613 acc += XXH3_mix16B(input+0, secret+0, seed); in XXH3_len_17to128_64b()
3614 acc += XXH3_mix16B(input+len-16, secret+16, seed); in XXH3_len_17to128_64b()
3623 XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, in XXH3_len_129to240_64b() argument
3634 int const nbRounds = (int)len / 16; in XXH3_len_129to240_64b()
3637 acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed); in XXH3_len_129to240_64b()
3646 * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86. in XXH3_len_129to240_64b()
3649 * For 64->128-bit multiplies, even if the NEON was 100% optimal, it in XXH3_len_129to240_64b()
3667 acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed); in XXH3_len_129to240_64b()
3670 …acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed… in XXH3_len_129to240_64b()
3703 /* the following type must have a width of 64-bit */
3713 * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
3716  * We harden it by mixing the original input into the accumulators as well as the product.
3719 * original input is preserved.
3721 * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
3722 * cross-pollination, as otherwise the upper and lower halves would be
3725 * This doesn't matter on 64-bit hashes since they all get merged together in
3740 const void* XXH_RESTRICT input, in XXH3_accumulate_512_avx512() argument
3748 /* data_vec = input[0]; */ in XXH3_accumulate_512_avx512()
3749 __m512i const data_vec = _mm512_loadu_si512 (input); in XXH3_accumulate_512_avx512()
3771 * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
3819 …st seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((xxh_i64)seed64), 0xAA, (xxh_i64)(0U - seed64)); in XXH3_initCustomSecret_avx512()
3849 const void* XXH_RESTRICT input, in XXH3_accumulate_512_avx2() argument
3856 const __m256i* const xinput = (const __m256i *) input; in XXH3_accumulate_512_avx2()
3917 … __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - … in XXH3_initCustomSecret_avx2()
3925 * - do not extract the secret from sse registers in the internal loop in XXH3_initCustomSecret_avx2()
3926 * - use less common registers, and avoid pushing these reg into stack in XXH3_initCustomSecret_avx2()
3933 /* GCC -O2 need unroll loop manually */ in XXH3_initCustomSecret_avx2()
3954 const void* XXH_RESTRICT input, in XXH3_accumulate_512_sse2() argument
3957 /* SSE2 is just a half-scale version of the AVX2 version. */ in XXH3_accumulate_512_sse2()
3962 const __m128i* const xinput = (const __m128i *) input; in XXH3_accumulate_512_sse2()
4024 XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) }; in XXH3_initCustomSecret_sse2()
4027 __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64); in XXH3_initCustomSecret_sse2()
4036 * - do not extract the secret from sse registers in the internal loop in XXH3_initCustomSecret_sse2()
4037 * - use less common registers, and avoid pushing these reg into stack in XXH3_initCustomSecret_sse2()
4055 XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input,
4074 const void* XXH_RESTRICT input, in XXH3_accumulate_512_neon() argument
4082 uint8_t const* const xinput = (const uint8_t *) input; in XXH3_accumulate_512_neon()
4089 uint8x16_t data_vec = vld1q_u8(xinput + (i * 16)); in XXH3_accumulate_512_neon()
4091 uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); in XXH3_accumulate_512_neon()
4110 XXH3_scalarRound(acc, input, secret, i); in XXH3_accumulate_512_neon()
4133 uint8x16_t key_vec = vld1q_u8 (xsecret + (i * 16)); in XXH3_scrambleAcc_neon()
4153 * However, unlike SSE, Clang lacks a 64-bit multiply routine in XXH3_scrambleAcc_neon()
4154 * for NEON, and it scalarizes two 64-bit multiplies instead. in XXH3_scrambleAcc_neon()
4180 const void* XXH_RESTRICT input, in XXH3_accumulate_512_vsx() argument
4185 xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */ in XXH3_accumulate_512_vsx()
4246 /* scalar variants - universal */
4257 void const* XXH_RESTRICT input, in XXH3_scalarRound() argument
4262 xxh_u8 const* xinput = (xxh_u8 const*) input; in XXH3_scalarRound()
4265 XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); in XXH3_scalarRound()
4280 const void* XXH_RESTRICT input, in XXH3_accumulate_512_scalar() argument
4285 XXH3_scalarRound(acc, input, secret, i); in XXH3_accumulate_512_scalar()
4303 XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); in XXH3_scalarScrambleRound()
4333 * which requires a non-const pointer. in XXH3_initCustomSecret_scalar()
4345 * While MOVK is great for generating constants (2 cycles for a 64-bit in XXH3_initCustomSecret_scalar()
4380 { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; in XXH3_initCustomSecret_scalar()
4389 xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; in XXH3_initCustomSecret_scalar()
4390 xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; in XXH3_initCustomSecret_scalar()
4391 XXH_writeLE64((xxh_u8*)customSecret + 16*i, lo); in XXH3_initCustomSecret_scalar()
4392 XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi); in XXH3_initCustomSecret_scalar()
4461 const xxh_u8* XXH_RESTRICT input, in XXH3_accumulate() argument
4468 const xxh_u8* const in = input + n*XXH_STRIPE_LEN; in XXH3_accumulate()
4478 const xxh_u8* XXH_RESTRICT input, size_t len, in XXH3_hashLong_internal_loop() argument
4483 size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; in XXH3_hashLong_internal_loop()
4485 size_t const nb_blocks = (len - 1) / block_len; in XXH3_hashLong_internal_loop()
4492 XXH3_accumulate(acc, input + n*block_len, secret, nbStripesPerBlock, f_acc512); in XXH3_hashLong_internal_loop()
4493 f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN); in XXH3_hashLong_internal_loop()
4498 { size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN; in XXH3_hashLong_internal_loop()
4500 XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, f_acc512); in XXH3_hashLong_internal_loop()
4503 { const xxh_u8* const p = input + len - XXH_STRIPE_LEN; in XXH3_hashLong_internal_loop()
4505 f_acc512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START); in XXH3_hashLong_internal_loop()
4524 result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i); in XXH3_mergeAccs()
4531 * Prevent autovectorization on Clang ARMv7-a. Exact same problem as in XXH3_mergeAccs()
4548 XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len, in XXH3_hashLong_64b_internal() argument
4555 …XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_a… in XXH3_hashLong_64b_internal()
4571 XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len, in XXH3_hashLong_64b_withSecret() argument
4575 …return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate_512, XXH3_scrambl… in XXH3_hashLong_64b_withSecret()
4585 XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len, in XXH3_hashLong_64b_default() argument
4589 …return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate_… in XXH3_hashLong_64b_default()
4604 XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len, in XXH3_hashLong_64b_withSeed_internal() argument
4611 return XXH3_hashLong_64b_internal(input, len, in XXH3_hashLong_64b_withSeed_internal()
4616 return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret), in XXH3_hashLong_64b_withSeed_internal()
4625 XXH3_hashLong_64b_withSeed(const void* input, size_t len, in XXH3_hashLong_64b_withSeed() argument
4629 return XXH3_hashLong_64b_withSeed_internal(input, len, seed, in XXH3_hashLong_64b_withSeed()
4638 XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len, in XXH3_64bits_internal() argument
4646 * For now, it's a contract pre-condition. in XXH3_64bits_internal()
4650 if (len <= 16) in XXH3_64bits_internal()
4651 return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); in XXH3_64bits_internal()
4653 … return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); in XXH3_64bits_internal()
4655 … return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); in XXH3_64bits_internal()
4656 return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen); in XXH3_64bits_internal()
4663 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len) in XXH3_64bits() argument
4665 …return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_d… in XXH3_64bits()
4670 XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) in XXH3_64bits_withSecret() argument
4672 return XXH3_64bits_internal(input, len, 0, secret, secretSize, XXH3_hashLong_64b_withSecret); in XXH3_64bits_withSecret()
4677 XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) in XXH3_64bits_withSeed() argument
4679 …return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64… in XXH3_64bits_withSeed()
4683 XXH3_64bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize,… in XXH3_64bits_withSecretandSeed() argument
4686 return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); in XXH3_64bits_withSecretandSeed()
4687 return XXH3_hashLong_64b_withSecret(input, len, seed, (const xxh_u8*)secret, secretSize); in XXH3_64bits_withSecretandSeed()
4698  * malloc typically guarantees 16-byte alignment on 64-bit systems and 8-byte
4699  * alignment on 32-bit. This isn't enough for the 32-byte aligned loads in AVX2
4700  * or, on 32-bit, the 16-byte aligned loads in SSE2 and NEON.
4719 XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */ in XXH_alignedMalloc()
4730 size_t offset = align - ((size_t)base & (align - 1)); /* base % align */ in XXH_alignedMalloc()
4731 /* Add the offset for the now-aligned pointer */ in XXH_alignedMalloc()
4737 ptr[-1] = (xxh_u8)offset; in XXH_alignedMalloc()
4752 xxh_u8 offset = ptr[-1]; in XXH_alignedFree()
4754 xxh_u8* base = ptr - offset; in XXH_alignedFree()
4787 size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart; in XXH3_reset_internal()
4792 statePtr->acc[0] = XXH_PRIME32_3; in XXH3_reset_internal()
4793 statePtr->acc[1] = XXH_PRIME64_1; in XXH3_reset_internal()
4794 statePtr->acc[2] = XXH_PRIME64_2; in XXH3_reset_internal()
4795 statePtr->acc[3] = XXH_PRIME64_3; in XXH3_reset_internal()
4796 statePtr->acc[4] = XXH_PRIME64_4; in XXH3_reset_internal()
4797 statePtr->acc[5] = XXH_PRIME32_2; in XXH3_reset_internal()
4798 statePtr->acc[6] = XXH_PRIME64_5; in XXH3_reset_internal()
4799 statePtr->acc[7] = XXH_PRIME32_1; in XXH3_reset_internal()
4800 statePtr->seed = seed; in XXH3_reset_internal()
4801 statePtr->useSeed = (seed != 0); in XXH3_reset_internal()
4802 statePtr->extSecret = (const unsigned char*)secret; in XXH3_reset_internal()
4804 statePtr->secretLimit = secretSize - XXH_STRIPE_LEN; in XXH3_reset_internal()
4805 statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; in XXH3_reset_internal()
4834 if ((seed != statePtr->seed) || (statePtr->extSecret != NULL)) in XXH3_64bits_reset_withSeed()
4835 XXH3_initCustomSecret(statePtr->customSecret, seed); in XXH3_64bits_reset_withSeed()
4848 statePtr->useSeed = 1; /* always, even if seed64==0 */ in XXH3_64bits_reset_withSecretandSeed()
4853  * there must be a guarantee that at least one more byte will be consumed from input
4858 const xxh_u8* XXH_RESTRICT input, size_t nbStripes, in XXH3_consumeStripes() argument
4865 if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) { in XXH3_consumeStripes()
4867 size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr; in XXH3_consumeStripes()
4868 size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock; in XXH3_consumeStripes()
4869 …XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripesToEn… in XXH3_consumeStripes()
4871 …XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock, … in XXH3_consumeStripes()
4874 …XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, f_… in XXH3_consumeStripes()
4889 const xxh_u8* XXH_RESTRICT input, size_t len, in XXH3_update() argument
4893 if (input==NULL) { in XXH3_update()
4899 { const xxh_u8* const bEnd = input + len; in XXH3_update()
4900 …const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extS… in XXH3_update()
4906 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; memcpy(acc, state->acc, sizeof(acc)); in XXH3_update()
4908 xxh_u64* XXH_RESTRICT const acc = state->acc; in XXH3_update()
4910 state->totalLen += len; in XXH3_update()
4911 XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE); in XXH3_update()
4913 /* small input : just fill in tmp buffer */ in XXH3_update()
4914 if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { in XXH3_update()
4915 XXH_memcpy(state->buffer + state->bufferedSize, input, len); in XXH3_update()
4916 state->bufferedSize += (XXH32_hash_t)len; in XXH3_update()
4920 /* total input is now > XXH3_INTERNALBUFFER_SIZE */ in XXH3_update()
4928 if (state->bufferedSize) { in XXH3_update()
4929 size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; in XXH3_update()
4930 XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); in XXH3_update()
4931 input += loadSize; in XXH3_update()
4933 &state->nbStripesSoFar, state->nbStripesPerBlock, in XXH3_update()
4934 state->buffer, XXH3_INTERNALBUFFER_STRIPES, in XXH3_update()
4935 secret, state->secretLimit, in XXH3_update()
4937 state->bufferedSize = 0; in XXH3_update()
4939 XXH_ASSERT(input < bEnd); in XXH3_update()
4941 /* large input to consume : ingest per full block */ in XXH3_update()
4942 if ((size_t)(bEnd - input) > state->nbStripesPerBlock * XXH_STRIPE_LEN) { in XXH3_update()
4943 size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN; in XXH3_update()
4944 XXH_ASSERT(state->nbStripesPerBlock >= state->nbStripesSoFar); in XXH3_update()
4946 { size_t const nbStripesToEnd = state->nbStripesPerBlock - state->nbStripesSoFar; in XXH3_update()
4948 …XXH3_accumulate(acc, input, secret + state->nbStripesSoFar * XXH_SECRET_CONSUME_RATE, nbStripesToE… in XXH3_update()
4949 f_scramble(acc, secret + state->secretLimit); in XXH3_update()
4950 state->nbStripesSoFar = 0; in XXH3_update()
4951 input += nbStripesToEnd * XXH_STRIPE_LEN; in XXH3_update()
4952 nbStripes -= nbStripesToEnd; in XXH3_update()
4955 while(nbStripes >= state->nbStripesPerBlock) { in XXH3_update()
4956 XXH3_accumulate(acc, input, secret, state->nbStripesPerBlock, f_acc512); in XXH3_update()
4957 f_scramble(acc, secret + state->secretLimit); in XXH3_update()
4958 input += state->nbStripesPerBlock * XXH_STRIPE_LEN; in XXH3_update()
4959 nbStripes -= state->nbStripesPerBlock; in XXH3_update()
4962 XXH3_accumulate(acc, input, secret, nbStripes, f_acc512); in XXH3_update()
4963 input += nbStripes * XXH_STRIPE_LEN; in XXH3_update()
4964 XXH_ASSERT(input < bEnd); /* at least some bytes left */ in XXH3_update()
4965 state->nbStripesSoFar = nbStripes; in XXH3_update()
4967 …XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STR… in XXH3_update()
4968 XXH_ASSERT(bEnd - input <= XXH_STRIPE_LEN); in XXH3_update()
4971 /* Consume input by a multiple of internal buffer size */ in XXH3_update()
4972 if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) { in XXH3_update()
4973 const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE; in XXH3_update()
4976 &state->nbStripesSoFar, state->nbStripesPerBlock, in XXH3_update()
4977 input, XXH3_INTERNALBUFFER_STRIPES, in XXH3_update()
4978 secret, state->secretLimit, in XXH3_update()
4980 input += XXH3_INTERNALBUFFER_SIZE; in XXH3_update()
4981 } while (input<limit); in XXH3_update()
4983 …XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STR… in XXH3_update()
4987 /* Some remaining input (always) : buffer it */ in XXH3_update()
4988 XXH_ASSERT(input < bEnd); in XXH3_update()
4989 XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE); in XXH3_update()
4990 XXH_ASSERT(state->bufferedSize == 0); in XXH3_update()
4991 XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); in XXH3_update()
4992 state->bufferedSize = (XXH32_hash_t)(bEnd-input); in XXH3_update()
4995 memcpy(state->acc, acc, sizeof(acc)); in XXH3_update()
5004 XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len) in XXH3_64bits_update() argument
5006 return XXH3_update(state, (const xxh_u8*)input, len, in XXH3_64bits_update()
5018 * continue ingesting more input afterwards. in XXH3_digest_long()
5020 XXH_memcpy(acc, state->acc, sizeof(state->acc)); in XXH3_digest_long()
5021 if (state->bufferedSize >= XXH_STRIPE_LEN) { in XXH3_digest_long()
5022 size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN; in XXH3_digest_long()
5023 size_t nbStripesSoFar = state->nbStripesSoFar; in XXH3_digest_long()
5025 &nbStripesSoFar, state->nbStripesPerBlock, in XXH3_digest_long()
5026 state->buffer, nbStripes, in XXH3_digest_long()
5027 secret, state->secretLimit, in XXH3_digest_long()
5031 state->buffer + state->bufferedSize - XXH_STRIPE_LEN, in XXH3_digest_long()
5032 secret + state->secretLimit - XXH_SECRET_LASTACC_START); in XXH3_digest_long()
5035 size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize; in XXH3_digest_long()
5036 XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */ in XXH3_digest_long()
5037 XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); in XXH3_digest_long()
5038 XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); in XXH3_digest_long()
5041 secret + state->secretLimit - XXH_SECRET_LASTACC_START); in XXH3_digest_long()
5048 …const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extS… in XXH3_64bits_digest()
5049 if (state->totalLen > XXH3_MIDSIZE_MAX) { in XXH3_64bits_digest()
5054 (xxh_u64)state->totalLen * XXH_PRIME64_1); in XXH3_64bits_digest()
5056 /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */ in XXH3_64bits_digest()
5057 if (state->useSeed) in XXH3_64bits_digest()
5058 return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); in XXH3_64bits_digest()
5059 return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), in XXH3_64bits_digest()
5060 secret, state->secretLimit + XXH_STRIPE_LEN); in XXH3_64bits_digest()
5068 * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,
5071 * For example, extra steps are taken to avoid the seed-dependent collisions
5072 * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
5075 * lengths. Note that longer hashes are about as fast as the 64-bit version
5076 * due to it using only a slight modification of the 64-bit loop.
5078 * XXH128 is also more oriented towards 64-bit machines. It is still extremely
5079 * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
5083 XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) in XXH3_len_1to3_128b() argument
5086 XXH_ASSERT(input != NULL); in XXH3_len_1to3_128b()
5090 * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } in XXH3_len_1to3_128b()
5091 * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } in XXH3_len_1to3_128b()
5092 * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } in XXH3_len_1to3_128b()
5094 { xxh_u8 const c1 = input[0]; in XXH3_len_1to3_128b()
5095 xxh_u8 const c2 = input[len >> 1]; in XXH3_len_1to3_128b()
5096 xxh_u8 const c3 = input[len - 1]; in XXH3_len_1to3_128b()
5097 xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) in XXH3_len_1to3_128b()
5101 xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; in XXH3_len_1to3_128b()
5112 XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) in XXH3_len_4to8_128b() argument
5114 XXH_ASSERT(input != NULL); in XXH3_len_4to8_128b()
5118 { xxh_u32 const input_lo = XXH_readLE32(input); in XXH3_len_4to8_128b()
5119 xxh_u32 const input_hi = XXH_readLE32(input + len - 4); in XXH3_len_4to8_128b()
5121 xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; in XXH3_len_4to8_128b()
5139 XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) in XXH3_len_9to16_128b() argument
5141 XXH_ASSERT(input != NULL); in XXH3_len_9to16_128b()
5143 XXH_ASSERT(9 <= len && len <= 16); in XXH3_len_9to16_128b()
5144 { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; in XXH3_len_9to16_128b()
5146 xxh_u64 const input_lo = XXH_readLE64(input); in XXH3_len_9to16_128b()
5147 xxh_u64 input_hi = XXH_readLE64(input + len - 8); in XXH3_len_9to16_128b()
5153 m128.low64 += (xxh_u64)(len - 1) << 54; in XXH3_len_9to16_128b()
5160 * The best approach to this operation is different on 32-bit and 64-bit. in XXH3_len_9to16_128b()
5162 if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ in XXH3_len_9to16_128b()
5164 * 32-bit optimized version, which is more readable. in XXH3_len_9to16_128b()
5166 * On 32-bit, it removes an ADC and delays a dependency between the two in XXH3_len_9to16_128b()
5167 * halves of m128.high64, but it generates an extra mask on 64-bit. in XXH3_len_9to16_128b()
5172 * 64-bit optimized (albeit more confusing) version. in XXH3_len_9to16_128b()
5182 * Inverse Property: x + y - x == y in XXH3_len_9to16_128b()
5183 * a + (b * (1 + c - 1)) in XXH3_len_9to16_128b()
5185 * a + (b * 1) + (b * (c - 1)) in XXH3_len_9to16_128b()
5187 * a + b + (b * (c - 1)) in XXH3_len_9to16_128b()
5190 * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) in XXH3_len_9to16_128b()
5193 * input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) in XXH3_len_9to16_128b()
5195 m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1); in XXH3_len_9to16_128b()
5214 XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) in XXH3_len_0to16_128b() argument
5216 XXH_ASSERT(len <= 16); in XXH3_len_0to16_128b()
5217 { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); in XXH3_len_0to16_128b()
5218 if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); in XXH3_len_0to16_128b()
5219 if (len) return XXH3_len_1to3_128b(input, len, secret, seed); in XXH3_len_0to16_128b()
5238 acc.high64 += XXH3_mix16B (input_2, secret+16, seed); in XXH128_mix32B()
5245 XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, in XXH3_len_17to128_128b() argument
5250 XXH_ASSERT(16 < len && len <= 128); in XXH3_len_17to128_128b()
5258 acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed); in XXH3_len_17to128_128b()
5260 acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed); in XXH3_len_17to128_128b()
5262 acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); in XXH3_len_17to128_128b()
5264 acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); in XXH3_len_17to128_128b()
5269 + ((len - seed) * XXH_PRIME64_2); in XXH3_len_17to128_128b()
5271 h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); in XXH3_len_17to128_128b()
5278 XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, in XXH3_len_129to240_128b() argument
5292 input + (32 * i), in XXH3_len_129to240_128b()
5293 input + (32 * i) + 16, in XXH3_len_129to240_128b()
5302 input + (32 * i), in XXH3_len_129to240_128b()
5303 input + (32 * i) + 16, in XXH3_len_129to240_128b()
5304 secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)), in XXH3_len_129to240_128b()
5309 input + len - 16, in XXH3_len_129to240_128b()
5310 input + len - 32, in XXH3_len_129to240_128b()
5311 secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, in XXH3_len_129to240_128b()
5312 0ULL - seed); in XXH3_len_129to240_128b()
5318 + ((len - seed) * XXH_PRIME64_2); in XXH3_len_129to240_128b()
5320 h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); in XXH3_len_129to240_128b()
5327 XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len, in XXH3_hashLong_128b_internal() argument
5334 …XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc512, f_scramb… in XXH3_hashLong_128b_internal()
5345 - sizeof(acc) - XXH_SECRET_MERGEACCS_START, in XXH3_hashLong_128b_internal()
5355 XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len, in XXH3_hashLong_128b_default() argument
5360 return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), in XXH3_hashLong_128b_default()
5369 XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len, in XXH3_hashLong_128b_withSecret() argument
5374 return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen, in XXH3_hashLong_128b_withSecret()
5379 XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len, in XXH3_hashLong_128b_withSeed_internal() argument
5386 return XXH3_hashLong_128b_internal(input, len, in XXH3_hashLong_128b_withSeed_internal()
5391 return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret), in XXH3_hashLong_128b_withSeed_internal()
5400 XXH3_hashLong_128b_withSeed(const void* input, size_t len, in XXH3_hashLong_128b_withSeed() argument
5404 return XXH3_hashLong_128b_withSeed_internal(input, len, seed64, in XXH3_hashLong_128b_withSeed()
5412 XXH3_128bits_internal(const void* input, size_t len, in XXH3_128bits_internal() argument
5420 * For now, it's a contract pre-condition. in XXH3_128bits_internal()
5423 if (len <= 16) in XXH3_128bits_internal()
5424 return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); in XXH3_128bits_internal()
5426 … return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); in XXH3_128bits_internal()
5428 …return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); in XXH3_128bits_internal()
5429 return f_hl128(input, len, seed64, secret, secretLen); in XXH3_128bits_internal()
5436 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len) in XXH3_128bits() argument
5438 return XXH3_128bits_internal(input, len, 0, in XXH3_128bits()
5445 XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) in XXH3_128bits_withSecret() argument
5447 return XXH3_128bits_internal(input, len, 0, in XXH3_128bits_withSecret()
5454 XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) in XXH3_128bits_withSeed() argument
5456 return XXH3_128bits_internal(input, len, seed, in XXH3_128bits_withSeed()
5463 XXH3_128bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize… in XXH3_128bits_withSecretandSeed() argument
5466 return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); in XXH3_128bits_withSecretandSeed()
5467 return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize); in XXH3_128bits_withSecretandSeed()
5472 XXH128(const void* input, size_t len, XXH64_hash_t seed) in XXH128() argument
5474 return XXH3_128bits_withSeed(input, len, seed); in XXH128()
5478 /* === XXH3 128-bit streaming === */
5481 * All initialization and update functions are identical to 64-bit streaming variant.
5515 XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len) in XXH3_128bits_update() argument
5517 return XXH3_update(state, (const xxh_u8*)input, len, in XXH3_128bits_update()
5524 …const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extS… in XXH3_128bits_digest()
5525 if (state->totalLen > XXH3_MIDSIZE_MAX) { in XXH3_128bits_digest()
5528 XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); in XXH3_128bits_digest()
5532 (xxh_u64)state->totalLen * XXH_PRIME64_1); in XXH3_128bits_digest()
5534 secret + state->secretLimit + XXH_STRIPE_LEN in XXH3_128bits_digest()
5535 - sizeof(acc) - XXH_SECRET_MERGEACCS_START, in XXH3_128bits_digest()
5536 ~((xxh_u64)state->totalLen * XXH_PRIME64_2)); in XXH3_128bits_digest()
5541 if (state->seed) in XXH3_128bits_digest()
5542 return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); in XXH3_128bits_digest()
5543 return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen), in XXH3_128bits_digest()
5544 secret, state->secretLimit + XXH_STRIPE_LEN); in XXH3_128bits_digest()
5547 /* 128-bit utility functions */
5568 int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64); in XXH128_cmp()
5571 return (h1.low64 > h2.low64) - (h2.low64 > h1.low64); in XXH128_cmp()
5595 h.low64 = XXH_readBE64(src->digest + 8); in XXH128_hashFromCanonical()
5636 /* Fill secretBuffer with a copy of customSeed - repeat as needed */ in XXH3_generateSecret()
5639 size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize); in XXH3_generateSecret()
5644 { size_t const nbSeg16 = secretSize / 16; in XXH3_generateSecret()
5650 XXH3_combine16((char*)secretBuffer + n*16, h128); in XXH3_generateSecret()
5653 XXH3_combine16((char*)secretBuffer + secretSize - 16, XXH128_hashFromCanonical(&scrambler)); in XXH3_generateSecret()
5673 && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */