/* Copyright (c) 2016 Vladimir Makarov <vmakarov@gcc.gnu.org>

   Permission is hereby granted, free of charge, to any person
   obtaining a copy of this software and associated documentation
   files (the "Software"), to deal in the Software without
   restriction, including without limitation the rights to use, copy,
   modify, merge, publish, distribute, sublicense, and/or sell copies
   of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

/* This file implements MUM (MUltiply and Mix) hashing.  We randomize
   input data by a 64x64-bit multiplication, mix the high and low
   parts of the multiplication result with an addition, and then mix
   the sum into the current state.  The multipliers are prime numbers
   generated at random with equal probability for each bit value.
   When all primes have been used once, the state is randomized and
   the same primes are reused for further data randomization.

   MUM hashing passes all SMHasher tests.  A pseudo-random number
   generator based on MUM also passes the NIST Statistical Test Suite
   for Random and Pseudorandom Number Generators for Cryptographic
   Applications (version 2.2.1) with 1000 bitstreams, each containing
   1M bits.  MUM hashing is also faster than Spooky64 and City64 on
   small strings (at least up to 512 bits) on Haswell and Power7.
   The MUM bulk speed (speed on very long data) is higher than that
   of Spooky and City on Power7.  On Haswell the bulk speed is higher
   than Spooky's and close to City's.  */
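
/* Example usage (a minimal sketch of the interface defined below;
   error handling omitted):

     #include "mum.h"
     #include <stdio.h>

     int main (void) {
       const char key[] = "hello";
       printf ("%016llx\n",
               (unsigned long long) mum_hash (key, sizeof (key) - 1, 42));
       return 0;
     }
*/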

#ifndef __MUM_HASH__
#define __MUM_HASH__

#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>

#ifdef _MSC_VER
typedef unsigned __int16 uint16_t;
typedef unsigned __int32 uint32_t;
typedef unsigned __int64 uint64_t;
#else
#include <stdint.h>
#endif

/* Macro saying to use 128-bit integers implemented by GCC for some
   targets.  */
#ifndef _MUM_USE_INT128
/* In GCC uint128_t is defined if HOST_BITS_PER_WIDE_INT >= 64.
   HOST_WIDE_INT is long if HOST_BITS_PER_LONG > HOST_BITS_PER_INT,
   otherwise int. */
#if defined(__GNUC__) && UINT_MAX != ULONG_MAX
#define _MUM_USE_INT128 1
#else
#define _MUM_USE_INT128 0
#endif
#endif

#if defined(__GNUC__) && ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 9) || (__GNUC__ > 4))
#define _MUM_FRESH_GCC
#endif

#if defined(__GNUC__) && !defined(__llvm__) && defined(_MUM_FRESH_GCC)
#define _MUM_ATTRIBUTE_UNUSED  __attribute__((unused))
#define _MUM_OPTIMIZE(opts) __attribute__((__optimize__ (opts)))
#define _MUM_TARGET(opts) __attribute__((__target__ (opts)))
#else
#define _MUM_ATTRIBUTE_UNUSED
#define _MUM_OPTIMIZE(opts)
#define _MUM_TARGET(opts)
#endif


/* Here are different primes randomly generated with equal
   probability of their bit values.  They are used to randomize input
   values.  */
static uint64_t _mum_hash_step_prime = 0x2e0bb864e9ea7df5ULL;
static uint64_t _mum_key_step_prime = 0xcdb32970830fcaa1ULL;
static uint64_t _mum_block_start_prime = 0xc42b5e2e6480b23bULL;
static uint64_t _mum_unroll_prime = 0x7b51ec3d22f7096fULL;
static uint64_t _mum_tail_prime = 0xaf47d47c99b1461bULL;
static uint64_t _mum_finish_prime1 = 0xa9a7ae7ceff79f3fULL;
static uint64_t _mum_finish_prime2 = 0xaf47d47c99b1461bULL;

static uint64_t _mum_primes [] = {
  0x9ebdcae10d981691, 0x32b9b9b97a27ac7d, 0x29b5584d83d35bbd, 0x4b04e0e61401255f,
  0x25e8f7b1f1c9d027, 0x80d4c8c000f3e881, 0xbd1255431904b9dd, 0x8a3bd4485eee6d81,
  0x3bc721b2aad05197, 0x71b1a19b907d6e33, 0x525e6c1084a8534b, 0x9e4c2cd340c1299f,
  0xde3add92e94caa37, 0x7e14eadb1f65311d, 0x3f5aa40f89812853, 0x33b15a3b587d15c9,
};

/* Multiply 64-bit V and P and return the sum of the high and low
   parts of the result.  */
static inline uint64_t
_mum (uint64_t v, uint64_t p) {
  uint64_t hi, lo;
#if _MUM_USE_INT128
#if defined(__aarch64__)
  /* AARCH64 needs 2 insns to calculate the 128-bit result of the
     multiplication.  If we use generic code we actually call a
     function doing 128x128->128 bit multiplication.  The function is
     very slow.  */
  lo = v * p;
  asm ("umulh %0, %1, %2" : "=r" (hi) : "r" (v), "r" (p));
#else
  __uint128_t r = (__uint128_t) v * (__uint128_t) p;
  hi = (uint64_t) (r >> 64);
  lo = (uint64_t) r;
#endif
#else
  /* Implementation of 64x64->128-bit multiplication by four
     32x32->64-bit multiplications.  */
  uint64_t hv = v >> 32, hp = p >> 32;
  uint64_t lv = (uint32_t) v, lp = (uint32_t) p;
  uint64_t rh = hv * hp;
  uint64_t rm_0 = hv * lp;
  uint64_t rm_1 = hp * lv;
  uint64_t rl = lv * lp;
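  /* By the schoolbook decomposition
       v * p = rh * 2^64 + (rm_0 + rm_1) * 2^32 + rl,
     so the high 64 bits of the product are rh plus the upper halves
     of rm_0 and rm_1 plus the carries out of the low 64 bits.  */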
  uint64_t t, carry = 0;

  /* We could ignore the carry bit here if we did not care about
     getting the same hash on 32-bit and 64-bit targets.  */
  t = rl + (rm_0 << 32);
#ifdef MUM_TARGET_INDEPENDENT_HASH
  carry = t < rl;
#endif
  lo = t + (rm_1 << 32);
#ifdef MUM_TARGET_INDEPENDENT_HASH
  carry += lo < t;
#endif
  hi = rh + (rm_0 >> 32) + (rm_1 >> 32) + carry;
#endif
  /* We could use XOR here too but, for some reason, on Haswell and
     Power7 using an addition improves hashing performance by 10% for
     small strings.  */
  return hi + lo;
}
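
/* Note that _mum is symmetric in its arguments: _mum (a, b) ==
   _mum (b, a), since the 64x64->128-bit product does not depend on
   the argument order.  */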

#if defined(_MSC_VER)
#define _mum_bswap32(x) _byteswap_ulong (x)
#define _mum_bswap64(x) _byteswap_uint64 (x)
#elif defined(__APPLE__)
#include <libkern/OSByteOrder.h>
#define _mum_bswap32(x) OSSwapInt32 (x)
#define _mum_bswap64(x) OSSwapInt64 (x)
#elif defined(__GNUC__)
#define _mum_bswap32(x) __builtin_bswap32 (x)
#define _mum_bswap64(x) __builtin_bswap64 (x)
#else
#include <byteswap.h>
#define _mum_bswap32(x) bswap_32 (x)
#define _mum_bswap64(x) bswap_64 (x)
#endif
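
/* For example, _mum_bswap64 (0x0123456789abcdefULL) yields
   0xefcdab8967452301ULL.  */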

static inline uint64_t
_mum_le (uint64_t v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(MUM_TARGET_INDEPENDENT_HASH)
  return v;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return _mum_bswap64 (v);
#else
#error "Unknown endianness"
#endif
}

static inline uint32_t
_mum_le32 (uint32_t v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(MUM_TARGET_INDEPENDENT_HASH)
  return v;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return _mum_bswap32 (v);
#else
#error "Unknown endianness"
#endif
}

/* Macro defining how many times the innermost loop in
   _mum_hash_aligned will be unrolled by the compiler (although the
   compiler can make its own decision).  Use only a constant here to
   help the compiler unroll the major loop.

   The macro value affects the resulting hash for strings longer than
   128 bits.  The unroll factor greatly affects the hashing speed.
   We prefer the speed.  */
#ifndef _MUM_UNROLL_FACTOR_POWER
#if defined(__PPC64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
#define _MUM_UNROLL_FACTOR_POWER 3
#elif defined(__aarch64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
#define _MUM_UNROLL_FACTOR_POWER 4
#else
#define _MUM_UNROLL_FACTOR_POWER 2
#endif
#endif

#if _MUM_UNROLL_FACTOR_POWER < 1
#error "too small unroll factor"
#elif _MUM_UNROLL_FACTOR_POWER > 4
#error "We do not have enough primes for such an unroll factor"
#endif

#define _MUM_UNROLL_FACTOR (1 << _MUM_UNROLL_FACTOR_POWER)

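/* For example, with the generic default _MUM_UNROLL_FACTOR_POWER of
   2 the unroll factor is 4, so the main loop below consumes
   4 * 8 = 32 bytes of input per iteration.  */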
static inline uint64_t _MUM_OPTIMIZE("unroll-loops")
_mum_hash_aligned (uint64_t start, const void *key, size_t len) {
  uint64_t result = start;
  const unsigned char *str = (const unsigned char *) key;
  uint64_t u64;
  int i;
  size_t n;

  result = _mum (result, _mum_block_start_prime);
  while (len > _MUM_UNROLL_FACTOR * sizeof (uint64_t)) {
    /* This loop could be vectorized when we have vector insns for
       64x64->128-bit multiplication.  AVX2 currently has only a
       vector insn for four 32x32->64-bit multiplications.  */
    for (i = 0; i < _MUM_UNROLL_FACTOR; i++)
      result ^= _mum (_mum_le (((uint64_t *) str)[i]), _mum_primes[i]);
    len -= _MUM_UNROLL_FACTOR * sizeof (uint64_t);
    str += _MUM_UNROLL_FACTOR * sizeof (uint64_t);
    /* We will use the same prime numbers on the next iterations --
       randomize the state.  */
    result = _mum (result, _mum_unroll_prime);
  }
  n = len / sizeof (uint64_t);
  for (i = 0; i < (int) n; i++)
    result ^= _mum (_mum_le (((uint64_t *) str)[i]), _mum_primes[i]);
  len -= n * sizeof (uint64_t); str += n * sizeof (uint64_t);
  switch (len) {
  case 7:
    u64 = _mum_le32 (*(uint32_t *) str);
    u64 |= (uint64_t) str[4] << 32;
    u64 |= (uint64_t) str[5] << 40;
    u64 |= (uint64_t) str[6] << 48;
    return result ^ _mum (u64, _mum_tail_prime);
  case 6:
    u64 = _mum_le32 (*(uint32_t *) str);
    u64 |= (uint64_t) str[4] << 32;
    u64 |= (uint64_t) str[5] << 40;
    return result ^ _mum (u64, _mum_tail_prime);
  case 5:
    u64 = _mum_le32 (*(uint32_t *) str);
    u64 |= (uint64_t) str[4] << 32;
    return result ^ _mum (u64, _mum_tail_prime);
  case 4:
    u64 = _mum_le32 (*(uint32_t *) str);
    return result ^ _mum (u64, _mum_tail_prime);
  case 3:
    u64 = str[0];
    u64 |= (uint64_t) str[1] << 8;
    u64 |= (uint64_t) str[2] << 16;
    return result ^ _mum (u64, _mum_tail_prime);
  case 2:
    u64 = str[0];
    u64 |= (uint64_t) str[1] << 8;
    return result ^ _mum (u64, _mum_tail_prime);
  case 1:
    u64 = str[0];
    return result ^ _mum (u64, _mum_tail_prime);
  }
  return result;
}

/* Final randomization of H.  */
static inline uint64_t
_mum_final (uint64_t h) {
  h ^= _mum (h, _mum_finish_prime1);
  h ^= _mum (h, _mum_finish_prime2);
  return h;
}

#if defined(__x86_64__) && defined(_MUM_FRESH_GCC)

/* We want to use the BMI2 insn MULX (introduced on Haswell alongside
   AVX2) instead of the generic x86-64 MULQ where possible.  Although
   on modern Intel processors MULQ takes 3 cycles vs. 4 for MULX,
   MULX permits more freedom in insn scheduling as it uses fewer
   fixed registers.  */
static inline uint64_t _MUM_TARGET("arch=haswell")
_mum_hash_avx2 (const void *key, size_t len, uint64_t seed) {
  return _mum_final (_mum_hash_aligned (seed + len, key, len));
}
#endif

#ifndef _MUM_UNALIGNED_ACCESS
#if defined(__x86_64__) || defined(__i386__) || defined(__PPC64__) \
    || defined(__s390__) || defined(__m32c__) || defined(cris)     \
    || defined(__CR16__) || defined(__vax__) || defined(__m68k__) \
    || defined(__aarch64__)
#define _MUM_UNALIGNED_ACCESS 1
#else
#define _MUM_UNALIGNED_ACCESS 0
#endif
#endif

/* When we need aligned access to the data being hashed, we copy the
   unaligned data into an aligned block of the given size and then
   process it, block by block.  */
#ifndef _MUM_BLOCK_LEN
#define _MUM_BLOCK_LEN 1024
#endif

#if _MUM_BLOCK_LEN < 8
#error "too small block length"
#endif

static inline uint64_t
#if defined(__x86_64__)
_MUM_TARGET("inline-all-stringops")
#endif
_mum_hash_default (const void *key, size_t len, uint64_t seed) {
  uint64_t result;
  const unsigned char *str = (const unsigned char *) key;
  size_t block_len;
  uint64_t buf[_MUM_BLOCK_LEN / sizeof (uint64_t)];

  result = seed + len;
  if (_MUM_UNALIGNED_ACCESS || ((size_t) str & 0x7) == 0)
    result = _mum_hash_aligned (result, key, len);
  else {
    while (len != 0) {
      block_len = len < _MUM_BLOCK_LEN ? len : _MUM_BLOCK_LEN;
      memmove (buf, str, block_len);
      result = _mum_hash_aligned (result, buf, block_len);
      len -= block_len;
      str += block_len;
    }
  }
  return _mum_final (result);
}

static inline uint64_t
_mum_next_factor (void) {
  uint64_t start = 0;
  int i;

  for (i = 0; i < 8; i++)
    start = (start << 8) | rand () % 256;
  return start;
}

/* ++++++++++++++++++++++++++ Interface functions: +++++++++++++++++++  */

/* Set random multipliers depending on SEED.  */
static inline void
mum_hash_randomize (uint64_t seed) {
  int i;

  srand (seed);
  _mum_hash_step_prime = _mum_next_factor ();
  _mum_key_step_prime = _mum_next_factor ();
  _mum_finish_prime1 = _mum_next_factor ();
  _mum_finish_prime2 = _mum_next_factor ();
  _mum_block_start_prime = _mum_next_factor ();
  _mum_unroll_prime = _mum_next_factor ();
  _mum_tail_prime = _mum_next_factor ();
  for (i = 0; i < (int) (sizeof (_mum_primes) / sizeof (uint64_t)); i++)
    _mum_primes[i] = _mum_next_factor ();
}

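/* For example (a sketch; assumes <time.h> is included for time):

     mum_hash_randomize ((uint64_t) time (NULL));

   Call it at most once, before any hashing, since it changes the
   multipliers used by all the hashing functions below.  */
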
/* Start hashing data with SEED.  Return the state.  */
static inline uint64_t
mum_hash_init (uint64_t seed) {
  return seed;
}

/* Process data KEY with the state H and return the updated state.  */
static inline uint64_t
mum_hash_step (uint64_t h, uint64_t key)
{
  return _mum (h, _mum_hash_step_prime) ^ _mum (key, _mum_key_step_prime);
}

/* Return the result of hashing using the current state H.  */
static inline uint64_t
mum_hash_finish (uint64_t h) {
  return _mum_final (h);
}

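/* Example of the incremental interface above (a minimal sketch):
   hashing two 64-bit words W1 and W2 with SEED:

     uint64_t h = mum_hash_init (seed);
     h = mum_hash_step (h, w1);
     h = mum_hash_step (h, w2);
     uint64_t res = mum_hash_finish (h);
*/
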
/* Fast hashing of KEY with SEED.  The hash is always the same for the
   same key on any target. */
static inline size_t
mum_hash64 (uint64_t key, uint64_t seed) {
  return mum_hash_finish (mum_hash_step (mum_hash_init (seed), key));
}

/* Hash data KEY of length LEN with SEED.  The hash depends on the
   target endianness and the unroll factor.  */
static inline uint64_t
mum_hash (const void *key, size_t len, uint64_t seed) {
#if defined(__x86_64__) && defined(_MUM_FRESH_GCC)
  static int avx2_support = 0;

  if (avx2_support > 0)
    return _mum_hash_avx2 (key, len, seed);
  else if (! avx2_support) {
    __builtin_cpu_init ();
    avx2_support = __builtin_cpu_supports ("avx2") ? 1 : -1;
    if (avx2_support > 0)
      return _mum_hash_avx2 (key, len, seed);
  }
#endif
  return _mum_hash_default (key, len, seed);
}

#endif