xref: /freebsd/sys/contrib/zstd/lib/common/xxhash.c (revision 25ecdc7d52770caf1c9b44b5ec11f468f6b636f3)
1 /*
2  *  xxHash - Fast Hash algorithm
3  *  Copyright (c) 2012-2020, Yann Collet, Facebook, Inc.
4  *
5  *  You can contact the author at :
6  *  - xxHash homepage: http://www.xxhash.com
7  *  - xxHash source repository : https://github.com/Cyan4973/xxHash
8  *
9  * This source code is licensed under both the BSD-style license (found in the
10  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
11  * in the COPYING file in the root directory of this source tree).
12  * You may select, at your option, one of the above-listed licenses.
13 */
14 
15 
16 /* *************************************
17 *  Tuning parameters
18 ***************************************/
19 /*!XXH_FORCE_MEMORY_ACCESS :
20  * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
21  * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
22  * The below switch allow to select different access method for improved performance.
23  * Method 0 (default) : use `memcpy()`. Safe and portable.
24  * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
25  *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
26  * Method 2 : direct access. This method doesn't depend on compiler but violate C standard.
27  *            It can generate buggy code on targets which do not support unaligned memory accesses.
28  *            But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
29  * See http://stackoverflow.com/a/32095106/646947 for details.
30  * Prefer these methods in priority order (0 > 1 > 2)
31  */
32 #ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
33 #  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
34 #    define XXH_FORCE_MEMORY_ACCESS 2
35 #  elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \
36   (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) )) || \
37   defined(__ICCARM__)
38 #    define XXH_FORCE_MEMORY_ACCESS 1
39 #  endif
40 #endif
41 
42 /*!XXH_ACCEPT_NULL_INPUT_POINTER :
43  * If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer.
44  * When this option is enabled, xxHash output for null input pointers will be the same as a null-length input.
45  * By default, this option is disabled. To enable it, uncomment below define :
46  */
47 /* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */
48 
49 /*!XXH_FORCE_NATIVE_FORMAT :
50  * By default, xxHash library provides endian-independent Hash values, based on little-endian convention.
51  * Results are therefore identical for little-endian and big-endian CPU.
52  * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
53  * Should endian-independence be of no importance for your application, you may set the #define below to 1,
54  * to improve speed for Big-endian CPU.
55  * This option has no impact on Little_Endian CPU.
56  */
57 #ifndef XXH_FORCE_NATIVE_FORMAT   /* can be defined externally */
58 #  define XXH_FORCE_NATIVE_FORMAT 0
59 #endif
60 
61 /*!XXH_FORCE_ALIGN_CHECK :
62  * This is a minor performance trick, only useful with lots of very small keys.
63  * It means : check for aligned/unaligned input.
64  * The check costs one initial branch per hash; set to 0 when the input data
65  * is guaranteed to be aligned.
66  */
67 #ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
68 #  if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
69 #    define XXH_FORCE_ALIGN_CHECK 0
70 #  else
71 #    define XXH_FORCE_ALIGN_CHECK 1
72 #  endif
73 #endif
74 
75 
76 /* *************************************
77 *  Includes & Memory related functions
78 ***************************************/
79 /* Modify the local functions below should you wish to use some other memory routines */
80 /* for ZSTD_malloc(), ZSTD_free() */
81 #define ZSTD_DEPS_NEED_MALLOC
82 #include "zstd_deps.h"  /* size_t, ZSTD_malloc, ZSTD_free, ZSTD_memcpy */
83 static void* XXH_malloc(size_t s) { return ZSTD_malloc(s); }
84 static void  XXH_free  (void* p)  { ZSTD_free(p); }
85 static void* XXH_memcpy(void* dest, const void* src, size_t size) { return ZSTD_memcpy(dest,src,size); }
86 
87 #ifndef XXH_STATIC_LINKING_ONLY
88 #  define XXH_STATIC_LINKING_ONLY
89 #endif
90 #include "xxhash.h"
91 
92 
93 /* *************************************
94 *  Compiler Specific Options
95 ***************************************/
96 #include "compiler.h"
97 
98 
99 /* *************************************
100 *  Basic Types
101 ***************************************/
102 #include "mem.h"  /* BYTE, U32, U64, size_t */
103 
104 #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
105 
106 /* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
107 static U32 XXH_read32(const void* memPtr) { return *(const U32*) memPtr; }
108 static U64 XXH_read64(const void* memPtr) { return *(const U64*) memPtr; }
109 
110 #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
111 
112 /* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
113 /* currently only defined for gcc and icc */
114 typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign;
115 
116 static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
117 static U64 XXH_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
118 
119 #else
120 
121 /* portable and safe solution. Generally efficient.
122  * see : http://stackoverflow.com/a/32095106/646947
123  */
124 
125 static U32 XXH_read32(const void* memPtr)
126 {
127     U32 val;
128     ZSTD_memcpy(&val, memPtr, sizeof(val));
129     return val;
130 }
131 
132 static U64 XXH_read64(const void* memPtr)
133 {
134     U64 val;
135     ZSTD_memcpy(&val, memPtr, sizeof(val));
136     return val;
137 }
138 
139 #endif   /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
140 
141 
142 /* ****************************************
143 *  Compiler-specific Functions and Macros
144 ******************************************/
145 #define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
146 
147 /* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */
148 #if defined(_MSC_VER)
149 #  define XXH_rotl32(x,r) _rotl(x,r)
150 #  define XXH_rotl64(x,r) _rotl64(x,r)
151 #else
152 #if defined(__ICCARM__)
153 #  include <intrinsics.h>
154 #  define XXH_rotl32(x,r) __ROR(x,(32 - r))
155 #else
156 #  define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r)))
157 #endif
158 #  define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r)))
159 #endif
160 
161 #if defined(_MSC_VER)     /* Visual Studio */
162 #  define XXH_swap32 _byteswap_ulong
163 #  define XXH_swap64 _byteswap_uint64
164 #elif GCC_VERSION >= 403
165 #  define XXH_swap32 __builtin_bswap32
166 #  define XXH_swap64 __builtin_bswap64
167 #else
168 static U32 XXH_swap32 (U32 x)
169 {
170     return  ((x << 24) & 0xff000000 ) |
171             ((x <<  8) & 0x00ff0000 ) |
172             ((x >>  8) & 0x0000ff00 ) |
173             ((x >> 24) & 0x000000ff );
174 }
175 static U64 XXH_swap64 (U64 x)
176 {
177     return  ((x << 56) & 0xff00000000000000ULL) |
178             ((x << 40) & 0x00ff000000000000ULL) |
179             ((x << 24) & 0x0000ff0000000000ULL) |
180             ((x << 8)  & 0x000000ff00000000ULL) |
181             ((x >> 8)  & 0x00000000ff000000ULL) |
182             ((x >> 24) & 0x0000000000ff0000ULL) |
183             ((x >> 40) & 0x000000000000ff00ULL) |
184             ((x >> 56) & 0x00000000000000ffULL);
185 }
186 #endif
187 
188 
189 /* *************************************
190 *  Architecture Macros
191 ***************************************/
192 typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
193 
194 /* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */
195 #ifndef XXH_CPU_LITTLE_ENDIAN
196     static const int g_one = 1;
197 #   define XXH_CPU_LITTLE_ENDIAN   (*(const char*)(&g_one))
198 #endif
199 
200 
201 /* ***************************
202 *  Memory reads
203 *****************************/
204 typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
205 
206 FORCE_INLINE_TEMPLATE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
207 {
208     if (align==XXH_unaligned)
209         return endian==XXH_littleEndian ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
210     else
211         return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr);
212 }
213 
214 FORCE_INLINE_TEMPLATE U32 XXH_readLE32(const void* ptr, XXH_endianess endian)
215 {
216     return XXH_readLE32_align(ptr, endian, XXH_unaligned);
217 }
218 
219 static U32 XXH_readBE32(const void* ptr)
220 {
221     return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
222 }
223 
224 FORCE_INLINE_TEMPLATE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
225 {
226     if (align==XXH_unaligned)
227         return endian==XXH_littleEndian ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
228     else
229         return endian==XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr);
230 }
231 
232 FORCE_INLINE_TEMPLATE U64 XXH_readLE64(const void* ptr, XXH_endianess endian)
233 {
234     return XXH_readLE64_align(ptr, endian, XXH_unaligned);
235 }
236 
237 static U64 XXH_readBE64(const void* ptr)
238 {
239     return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
240 }
241 
242 
243 /* *************************************
244 *  Macros
245 ***************************************/
246 #define XXH_STATIC_ASSERT(c)   { enum { XXH_static_assert = 1/(int)(!!(c)) }; }    /* use only *after* variable declarations */
247 
248 
249 /* *************************************
250 *  Constants
251 ***************************************/
252 static const U32 PRIME32_1 = 2654435761U;
253 static const U32 PRIME32_2 = 2246822519U;
254 static const U32 PRIME32_3 = 3266489917U;
255 static const U32 PRIME32_4 =  668265263U;
256 static const U32 PRIME32_5 =  374761393U;
257 
258 static const U64 PRIME64_1 = 11400714785074694791ULL;
259 static const U64 PRIME64_2 = 14029467366897019727ULL;
260 static const U64 PRIME64_3 =  1609587929392839161ULL;
261 static const U64 PRIME64_4 =  9650029242287828579ULL;
262 static const U64 PRIME64_5 =  2870177450012600261ULL;
263 
264 XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
265 
266 
267 /* **************************
268 *  Utils
269 ****************************/
270 XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* restrict dstState, const XXH32_state_t* restrict srcState)
271 {
272     ZSTD_memcpy(dstState, srcState, sizeof(*dstState));
273 }
274 
275 XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* restrict dstState, const XXH64_state_t* restrict srcState)
276 {
277     ZSTD_memcpy(dstState, srcState, sizeof(*dstState));
278 }
279 
280 
281 /* ***************************
282 *  Simple Hash Functions
283 *****************************/
284 
285 static U32 XXH32_round(U32 seed, U32 input)
286 {
287     seed += input * PRIME32_2;
288     seed  = XXH_rotl32(seed, 13);
289     seed *= PRIME32_1;
290     return seed;
291 }
292 
293 FORCE_INLINE_TEMPLATE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align)
294 {
295     const BYTE* p = (const BYTE*)input;
296     const BYTE* bEnd = p + len;
297     U32 h32;
298 #define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
299 
300 #ifdef XXH_ACCEPT_NULL_INPUT_POINTER
301     if (p==NULL) {
302         len=0;
303         bEnd=p=(const BYTE*)(size_t)16;
304     }
305 #endif
306 
307     if (len>=16) {
308         const BYTE* const limit = bEnd - 16;
309         U32 v1 = seed + PRIME32_1 + PRIME32_2;
310         U32 v2 = seed + PRIME32_2;
311         U32 v3 = seed + 0;
312         U32 v4 = seed - PRIME32_1;
313 
314         do {
315             v1 = XXH32_round(v1, XXH_get32bits(p)); p+=4;
316             v2 = XXH32_round(v2, XXH_get32bits(p)); p+=4;
317             v3 = XXH32_round(v3, XXH_get32bits(p)); p+=4;
318             v4 = XXH32_round(v4, XXH_get32bits(p)); p+=4;
319         } while (p<=limit);
320 
321         h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
322     } else {
323         h32  = seed + PRIME32_5;
324     }
325 
326     h32 += (U32) len;
327 
328     while (p+4<=bEnd) {
329         h32 += XXH_get32bits(p) * PRIME32_3;
330         h32  = XXH_rotl32(h32, 17) * PRIME32_4 ;
331         p+=4;
332     }
333 
334     while (p<bEnd) {
335         h32 += (*p) * PRIME32_5;
336         h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
337         p++;
338     }
339 
340     h32 ^= h32 >> 15;
341     h32 *= PRIME32_2;
342     h32 ^= h32 >> 13;
343     h32 *= PRIME32_3;
344     h32 ^= h32 >> 16;
345 
346     return h32;
347 }
348 
349 
350 XXH_PUBLIC_API unsigned int XXH32 (const void* input, size_t len, unsigned int seed)
351 {
352 #if 0
353     /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
354     XXH32_CREATESTATE_STATIC(state);
355     XXH32_reset(state, seed);
356     XXH32_update(state, input, len);
357     return XXH32_digest(state);
358 #else
359     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
360 
361     if (XXH_FORCE_ALIGN_CHECK) {
362         if ((((size_t)input) & 3) == 0) {   /* Input is 4-bytes aligned, leverage the speed benefit */
363             if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
364                 return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
365             else
366                 return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
367     }   }
368 
369     if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
370         return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
371     else
372         return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
373 #endif
374 }
375 
376 
377 static U64 XXH64_round(U64 acc, U64 input)
378 {
379     acc += input * PRIME64_2;
380     acc  = XXH_rotl64(acc, 31);
381     acc *= PRIME64_1;
382     return acc;
383 }
384 
385 static U64 XXH64_mergeRound(U64 acc, U64 val)
386 {
387     val  = XXH64_round(0, val);
388     acc ^= val;
389     acc  = acc * PRIME64_1 + PRIME64_4;
390     return acc;
391 }
392 
393 FORCE_INLINE_TEMPLATE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align)
394 {
395     const BYTE* p = (const BYTE*)input;
396     const BYTE* const bEnd = p + len;
397     U64 h64;
398 #define XXH_get64bits(p) XXH_readLE64_align(p, endian, align)
399 
400 #ifdef XXH_ACCEPT_NULL_INPUT_POINTER
401     if (p==NULL) {
402         len=0;
403         bEnd=p=(const BYTE*)(size_t)32;
404     }
405 #endif
406 
407     if (len>=32) {
408         const BYTE* const limit = bEnd - 32;
409         U64 v1 = seed + PRIME64_1 + PRIME64_2;
410         U64 v2 = seed + PRIME64_2;
411         U64 v3 = seed + 0;
412         U64 v4 = seed - PRIME64_1;
413 
414         do {
415             v1 = XXH64_round(v1, XXH_get64bits(p)); p+=8;
416             v2 = XXH64_round(v2, XXH_get64bits(p)); p+=8;
417             v3 = XXH64_round(v3, XXH_get64bits(p)); p+=8;
418             v4 = XXH64_round(v4, XXH_get64bits(p)); p+=8;
419         } while (p<=limit);
420 
421         h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
422         h64 = XXH64_mergeRound(h64, v1);
423         h64 = XXH64_mergeRound(h64, v2);
424         h64 = XXH64_mergeRound(h64, v3);
425         h64 = XXH64_mergeRound(h64, v4);
426 
427     } else {
428         h64  = seed + PRIME64_5;
429     }
430 
431     h64 += (U64) len;
432 
433     while (p+8<=bEnd) {
434         U64 const k1 = XXH64_round(0, XXH_get64bits(p));
435         h64 ^= k1;
436         h64  = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
437         p+=8;
438     }
439 
440     if (p+4<=bEnd) {
441         h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1;
442         h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
443         p+=4;
444     }
445 
446     while (p<bEnd) {
447         h64 ^= (*p) * PRIME64_5;
448         h64 = XXH_rotl64(h64, 11) * PRIME64_1;
449         p++;
450     }
451 
452     h64 ^= h64 >> 33;
453     h64 *= PRIME64_2;
454     h64 ^= h64 >> 29;
455     h64 *= PRIME64_3;
456     h64 ^= h64 >> 32;
457 
458     return h64;
459 }
460 
461 
462 XXH_PUBLIC_API unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed)
463 {
464 #if 0
465     /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
466     XXH64_CREATESTATE_STATIC(state);
467     XXH64_reset(state, seed);
468     XXH64_update(state, input, len);
469     return XXH64_digest(state);
470 #else
471     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
472 
473     if (XXH_FORCE_ALIGN_CHECK) {
474         if ((((size_t)input) & 7)==0) {  /* Input is aligned, let's leverage the speed advantage */
475             if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
476                 return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
477             else
478                 return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
479     }   }
480 
481     if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
482         return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
483     else
484         return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
485 #endif
486 }
487 
488 
489 /* **************************************************
490 *  Advanced Hash Functions
491 ****************************************************/
492 
493 XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)
494 {
495     return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
496 }
497 XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
498 {
499     XXH_free(statePtr);
500     return XXH_OK;
501 }
502 
503 XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
504 {
505     return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
506 }
507 XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
508 {
509     XXH_free(statePtr);
510     return XXH_OK;
511 }
512 
513 
514 /*** Hash feed ***/
515 
516 XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int seed)
517 {
518     XXH32_state_t state;   /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
519     ZSTD_memset(&state, 0, sizeof(state)-4);   /* do not write into reserved, for future removal */
520     state.v1 = seed + PRIME32_1 + PRIME32_2;
521     state.v2 = seed + PRIME32_2;
522     state.v3 = seed + 0;
523     state.v4 = seed - PRIME32_1;
524     ZSTD_memcpy(statePtr, &state, sizeof(state));
525     return XXH_OK;
526 }
527 
528 
529 XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed)
530 {
531     XXH64_state_t state;   /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
532     ZSTD_memset(&state, 0, sizeof(state)-8);   /* do not write into reserved, for future removal */
533     state.v1 = seed + PRIME64_1 + PRIME64_2;
534     state.v2 = seed + PRIME64_2;
535     state.v3 = seed + 0;
536     state.v4 = seed - PRIME64_1;
537     ZSTD_memcpy(statePtr, &state, sizeof(state));
538     return XXH_OK;
539 }
540 
541 
542 FORCE_INLINE_TEMPLATE XXH_errorcode XXH32_update_endian (XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian)
543 {
544     const BYTE* p = (const BYTE*)input;
545     const BYTE* const bEnd = p + len;
546 
547 #ifdef XXH_ACCEPT_NULL_INPUT_POINTER
548     if (input==NULL) return XXH_ERROR;
549 #endif
550 
551     state->total_len_32 += (unsigned)len;
552     state->large_len |= (len>=16) | (state->total_len_32>=16);
553 
554     if (state->memsize + len < 16)  {   /* fill in tmp buffer */
555         XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len);
556         state->memsize += (unsigned)len;
557         return XXH_OK;
558     }
559 
560     if (state->memsize) {   /* some data left from previous update */
561         XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize);
562         {   const U32* p32 = state->mem32;
563             state->v1 = XXH32_round(state->v1, XXH_readLE32(p32, endian)); p32++;
564             state->v2 = XXH32_round(state->v2, XXH_readLE32(p32, endian)); p32++;
565             state->v3 = XXH32_round(state->v3, XXH_readLE32(p32, endian)); p32++;
566             state->v4 = XXH32_round(state->v4, XXH_readLE32(p32, endian)); p32++;
567         }
568         p += 16-state->memsize;
569         state->memsize = 0;
570     }
571 
572     if (p <= bEnd-16) {
573         const BYTE* const limit = bEnd - 16;
574         U32 v1 = state->v1;
575         U32 v2 = state->v2;
576         U32 v3 = state->v3;
577         U32 v4 = state->v4;
578 
579         do {
580             v1 = XXH32_round(v1, XXH_readLE32(p, endian)); p+=4;
581             v2 = XXH32_round(v2, XXH_readLE32(p, endian)); p+=4;
582             v3 = XXH32_round(v3, XXH_readLE32(p, endian)); p+=4;
583             v4 = XXH32_round(v4, XXH_readLE32(p, endian)); p+=4;
584         } while (p<=limit);
585 
586         state->v1 = v1;
587         state->v2 = v2;
588         state->v3 = v3;
589         state->v4 = v4;
590     }
591 
592     if (p < bEnd) {
593         XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));
594         state->memsize = (unsigned)(bEnd-p);
595     }
596 
597     return XXH_OK;
598 }
599 
600 XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len)
601 {
602     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
603 
604     if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
605         return XXH32_update_endian(state_in, input, len, XXH_littleEndian);
606     else
607         return XXH32_update_endian(state_in, input, len, XXH_bigEndian);
608 }
609 
610 
611 
612 FORCE_INLINE_TEMPLATE U32 XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian)
613 {
614     const BYTE * p = (const BYTE*)state->mem32;
615     const BYTE* const bEnd = (const BYTE*)(state->mem32) + state->memsize;
616     U32 h32;
617 
618     if (state->large_len) {
619         h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
620     } else {
621         h32 = state->v3 /* == seed */ + PRIME32_5;
622     }
623 
624     h32 += state->total_len_32;
625 
626     while (p+4<=bEnd) {
627         h32 += XXH_readLE32(p, endian) * PRIME32_3;
628         h32  = XXH_rotl32(h32, 17) * PRIME32_4;
629         p+=4;
630     }
631 
632     while (p<bEnd) {
633         h32 += (*p) * PRIME32_5;
634         h32  = XXH_rotl32(h32, 11) * PRIME32_1;
635         p++;
636     }
637 
638     h32 ^= h32 >> 15;
639     h32 *= PRIME32_2;
640     h32 ^= h32 >> 13;
641     h32 *= PRIME32_3;
642     h32 ^= h32 >> 16;
643 
644     return h32;
645 }
646 
647 
648 XXH_PUBLIC_API unsigned int XXH32_digest (const XXH32_state_t* state_in)
649 {
650     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
651 
652     if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
653         return XXH32_digest_endian(state_in, XXH_littleEndian);
654     else
655         return XXH32_digest_endian(state_in, XXH_bigEndian);
656 }
657 
658 
659 
660 /* **** XXH64 **** */
661 
662 FORCE_INLINE_TEMPLATE XXH_errorcode XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian)
663 {
664     const BYTE* p = (const BYTE*)input;
665     const BYTE* const bEnd = p + len;
666 
667 #ifdef XXH_ACCEPT_NULL_INPUT_POINTER
668     if (input==NULL) return XXH_ERROR;
669 #endif
670 
671     state->total_len += len;
672 
673     if (state->memsize + len < 32) {  /* fill in tmp buffer */
674         if (input != NULL) {
675             XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len);
676         }
677         state->memsize += (U32)len;
678         return XXH_OK;
679     }
680 
681     if (state->memsize) {   /* tmp buffer is full */
682         XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize);
683         state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0, endian));
684         state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1, endian));
685         state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2, endian));
686         state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3, endian));
687         p += 32-state->memsize;
688         state->memsize = 0;
689     }
690 
691     if (p+32 <= bEnd) {
692         const BYTE* const limit = bEnd - 32;
693         U64 v1 = state->v1;
694         U64 v2 = state->v2;
695         U64 v3 = state->v3;
696         U64 v4 = state->v4;
697 
698         do {
699             v1 = XXH64_round(v1, XXH_readLE64(p, endian)); p+=8;
700             v2 = XXH64_round(v2, XXH_readLE64(p, endian)); p+=8;
701             v3 = XXH64_round(v3, XXH_readLE64(p, endian)); p+=8;
702             v4 = XXH64_round(v4, XXH_readLE64(p, endian)); p+=8;
703         } while (p<=limit);
704 
705         state->v1 = v1;
706         state->v2 = v2;
707         state->v3 = v3;
708         state->v4 = v4;
709     }
710 
711     if (p < bEnd) {
712         XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
713         state->memsize = (unsigned)(bEnd-p);
714     }
715 
716     return XXH_OK;
717 }
718 
719 XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len)
720 {
721     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
722 
723     if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
724         return XXH64_update_endian(state_in, input, len, XXH_littleEndian);
725     else
726         return XXH64_update_endian(state_in, input, len, XXH_bigEndian);
727 }
728 
729 
730 
731 FORCE_INLINE_TEMPLATE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian)
732 {
733     const BYTE * p = (const BYTE*)state->mem64;
734     const BYTE* const bEnd = (const BYTE*)state->mem64 + state->memsize;
735     U64 h64;
736 
737     if (state->total_len >= 32) {
738         U64 const v1 = state->v1;
739         U64 const v2 = state->v2;
740         U64 const v3 = state->v3;
741         U64 const v4 = state->v4;
742 
743         h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
744         h64 = XXH64_mergeRound(h64, v1);
745         h64 = XXH64_mergeRound(h64, v2);
746         h64 = XXH64_mergeRound(h64, v3);
747         h64 = XXH64_mergeRound(h64, v4);
748     } else {
749         h64  = state->v3 + PRIME64_5;
750     }
751 
752     h64 += (U64) state->total_len;
753 
754     while (p+8<=bEnd) {
755         U64 const k1 = XXH64_round(0, XXH_readLE64(p, endian));
756         h64 ^= k1;
757         h64  = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
758         p+=8;
759     }
760 
761     if (p+4<=bEnd) {
762         h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1;
763         h64  = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
764         p+=4;
765     }
766 
767     while (p<bEnd) {
768         h64 ^= (*p) * PRIME64_5;
769         h64  = XXH_rotl64(h64, 11) * PRIME64_1;
770         p++;
771     }
772 
773     h64 ^= h64 >> 33;
774     h64 *= PRIME64_2;
775     h64 ^= h64 >> 29;
776     h64 *= PRIME64_3;
777     h64 ^= h64 >> 32;
778 
779     return h64;
780 }
781 
782 
783 XXH_PUBLIC_API unsigned long long XXH64_digest (const XXH64_state_t* state_in)
784 {
785     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
786 
787     if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
788         return XXH64_digest_endian(state_in, XXH_littleEndian);
789     else
790         return XXH64_digest_endian(state_in, XXH_bigEndian);
791 }
792 
793 
794 /* **************************
795 *  Canonical representation
796 ****************************/
797 
798 /*! Default XXH result types are basic unsigned 32 and 64 bits.
799 *   The canonical representation follows human-readable write convention, aka big-endian (large digits first).
800 *   These functions allow transformation of hash result into and from its canonical format.
801 *   This way, hash values can be written into a file or buffer, and remain comparable across different systems and programs.
802 */
803 
804 XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
805 {
806     XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
807     if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
808     ZSTD_memcpy(dst, &hash, sizeof(*dst));
809 }
810 
811 XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
812 {
813     XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
814     if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
815     ZSTD_memcpy(dst, &hash, sizeof(*dst));
816 }
817 
818 XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
819 {
820     return XXH_readBE32(src);
821 }
822 
823 XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
824 {
825     return XXH_readBE64(src);
826 }
827