/* Copyright (c) 2016 Vladimir Makarov <vmakarov@gcc.gnu.org>

   Permission is hereby granted, free of charge, to any person
   obtaining a copy of this software and associated documentation
   files (the "Software"), to deal in the Software without
   restriction, including without limitation the rights to use, copy,
   modify, merge, publish, distribute, sublicense, and/or sell copies
   of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

/* This file implements MUM (MUltiply and Mix) hashing.  We randomize
   input data by a 64x64-bit multiplication, mix the high and low
   parts of the multiplication result with an addition, and then mix
   the sum into the current state.  For the multiplication we use
   prime numbers randomly generated with equal probability of each
   bit value.  When all primes have been used once, the state is
   randomized and the same primes are used again for data
   randomization.

   MUM hashing passes all SMHasher tests.  A pseudo-random number
   generator based on MUM also passes the NIST Statistical Test Suite
   for Random and Pseudorandom Number Generators for Cryptographic
   Applications (version 2.2.1) with 1000 bitstreams, each containing
   1M bits.  MUM hashing is also faster than Spooky64 and City64 on
   small strings (at least up to 512 bits) on Haswell and Power7.  The
   MUM bulk speed (the speed on very long data) is higher than that of
   Spooky and City on Power7.  On Haswell the bulk speed is higher
   than Spooky's and close to City's.  */
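
/* Illustration only (a minimal sketch, not used by the implementation
   below): one MUM step multiplies a 64-bit value V by a 64-bit prime
   P and folds the 128-bit product by adding its high and low halves:

       __uint128_t r = (__uint128_t) v * p;
       uint64_t mixed = (uint64_t) (r >> 64) + (uint64_t) r;

   The function _mum below implements exactly this, with fallbacks for
   targets that lack a 128-bit integer type.  */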

#ifndef __MUM_HASH__
#define __MUM_HASH__

#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>

#ifdef _MSC_VER
typedef unsigned __int16 uint16_t;
typedef unsigned __int32 uint32_t;
typedef unsigned __int64 uint64_t;
#else
#include <stdint.h>
#endif

/* Macro saying whether to use the 128-bit integers that GCC
   implements for some targets.  */
#ifndef _MUM_USE_INT128
/* In GCC uint128_t is defined if HOST_BITS_PER_WIDE_INT >= 64.
   HOST_WIDE_INT is long if HOST_BITS_PER_LONG > HOST_BITS_PER_INT,
   otherwise int.  */
#if defined(__GNUC__) && UINT_MAX != ULONG_MAX
#define _MUM_USE_INT128 1
#else
#define _MUM_USE_INT128 0
#endif
#endif

#if defined(__GNUC__) && ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 9) || (__GNUC__ > 4))
#define _MUM_FRESH_GCC
#endif

#if defined(__GNUC__) && !defined(__llvm__) && defined(_MUM_FRESH_GCC)
#define _MUM_ATTRIBUTE_UNUSED __attribute__((unused))
#define _MUM_OPTIMIZE(opts) __attribute__((__optimize__(opts)))
#define _MUM_TARGET(opts) __attribute__((__target__(opts)))
#else
#define _MUM_ATTRIBUTE_UNUSED
#define _MUM_OPTIMIZE(opts)
#define _MUM_TARGET(opts)
#endif

/* Here are different primes randomly generated with equal probability
   of each bit value.  They are used to randomize input values.  */
static uint64_t _mum_hash_step_prime = 0x2e0bb864e9ea7df5ULL;
static uint64_t _mum_key_step_prime = 0xcdb32970830fcaa1ULL;
static uint64_t _mum_block_start_prime = 0xc42b5e2e6480b23bULL;
static uint64_t _mum_unroll_prime = 0x7b51ec3d22f7096fULL;
static uint64_t _mum_tail_prime = 0xaf47d47c99b1461bULL;
static uint64_t _mum_finish_prime1 = 0xa9a7ae7ceff79f3fULL;
static uint64_t _mum_finish_prime2 = 0xaf47d47c99b1461bULL;

static uint64_t _mum_primes[] = {
	0x9ebdcae10d981691,
	0x32b9b9b97a27ac7d,
	0x29b5584d83d35bbd,
	0x4b04e0e61401255f,
	0x25e8f7b1f1c9d027,
	0x80d4c8c000f3e881,
	0xbd1255431904b9dd,
	0x8a3bd4485eee6d81,
	0x3bc721b2aad05197,
	0x71b1a19b907d6e33,
	0x525e6c1084a8534b,
	0x9e4c2cd340c1299f,
	0xde3add92e94caa37,
	0x7e14eadb1f65311d,
	0x3f5aa40f89812853,
	0x33b15a3b587d15c9,
};

/* Multiply 64-bit V and P and return the sum of the high and low
   parts of the result.  */
static inline uint64_t
_mum(uint64_t v, uint64_t p)
{
	uint64_t hi, lo;
#if _MUM_USE_INT128
#if defined(__aarch64__)
	/* AArch64 needs 2 insns to calculate the 128-bit result of the
	   multiplication.  If we used the generic code, we would actually
	   call a function doing 128x128->128-bit multiplication, which is
	   very slow.  */
	lo = v * p;
	asm("umulh %0, %1, %2" : "=r"(hi) : "r"(v), "r"(p));
#else
	__uint128_t r = (__uint128_t) v * (__uint128_t) p;
	hi = (uint64_t) (r >> 64);
	lo = (uint64_t) r;
#endif
#else
	/* Implementation of 64x64->128-bit multiplication by four
	   32x32->64-bit multiplications.  */
	uint64_t hv = v >> 32, hp = p >> 32;
	uint64_t lv = (uint32_t) v, lp = (uint32_t) p;
	uint64_t rh = hv * hp;
	uint64_t rm_0 = hv * lp;
	uint64_t rm_1 = hp * lv;
	uint64_t rl = lv * lp;
	uint64_t t, carry = 0;

	/* We could ignore the carry bit here if we did not care about
	   getting the same hash on 32-bit and 64-bit targets.  */
	t = rl + (rm_0 << 32);
#ifdef MUM_TARGET_INDEPENDENT_HASH
	carry = t < rl;
#endif
	lo = t + (rm_1 << 32);
#ifdef MUM_TARGET_INDEPENDENT_HASH
	carry += lo < t;
#endif
	hi = rh + (rm_0 >> 32) + (rm_1 >> 32) + carry;
#endif
	/* We could use XOR here too but, for some reason, on Haswell and
	   Power7 using an addition improves hashing performance by 10% for
	   small strings.  */
	return hi + lo;
}

#if defined(_MSC_VER)
#define _mum_bswap32(x) _byteswap_ulong(x)
#define _mum_bswap64(x) _byteswap_uint64(x)
#elif defined(__APPLE__)
#include <libkern/OSByteOrder.h>
#define _mum_bswap32(x) OSSwapInt32(x)
#define _mum_bswap64(x) OSSwapInt64(x)
#elif defined(__GNUC__)
#define _mum_bswap32(x) __builtin_bswap32(x)
#define _mum_bswap64(x) __builtin_bswap64(x)
#else
#include <byteswap.h>
#define _mum_bswap32(x) bswap_32(x)
#define _mum_bswap64(x) bswap_64(x)
#endif

static inline uint64_t
_mum_le(uint64_t v)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(MUM_TARGET_INDEPENDENT_HASH)
	return v;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	return _mum_bswap64(v);
#else
#error "Unknown endianness"
#endif
}

static inline uint32_t
_mum_le32(uint32_t v)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(MUM_TARGET_INDEPENDENT_HASH)
	return v;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	return _mum_bswap32(v);
#else
#error "Unknown endianness"
#endif
}

/* Macro defining how many times the most nested loop in
   _mum_hash_aligned will be unrolled by the compiler (although the
   compiler can make its own decision :).  Use only a constant here to
   help the compiler unroll a major loop.

   The macro value affects the resulting hash for strings longer than
   128 bits.  The unroll factor greatly affects the hashing speed.  We
   prefer the speed.  */
#ifndef _MUM_UNROLL_FACTOR_POWER
#if defined(__PPC64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
#define _MUM_UNROLL_FACTOR_POWER 3
#elif defined(__aarch64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
#define _MUM_UNROLL_FACTOR_POWER 4
#else
#define _MUM_UNROLL_FACTOR_POWER 2
#endif
#endif

#if _MUM_UNROLL_FACTOR_POWER < 1
#error "too small unroll factor"
#elif _MUM_UNROLL_FACTOR_POWER > 4
#error "not enough primes for such an unroll factor"
#endif

#define _MUM_UNROLL_FACTOR (1 << _MUM_UNROLL_FACTOR_POWER)
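
/* For example, with the default _MUM_UNROLL_FACTOR_POWER of 2 the
   unroll factor is 1 << 2 = 4, so each iteration of the unrolled loop
   in _mum_hash_aligned consumes 4 * sizeof(uint64_t) = 32 bytes of
   input; the maximum power of 4 gives a factor of 16, matching the
   16 entries of _mum_primes.  */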

static inline uint64_t _MUM_OPTIMIZE("unroll-loops")
	_mum_hash_aligned(uint64_t start, const void *key, size_t len)
{
	uint64_t result = start;
	const unsigned char *str = (const unsigned char *) key;
	uint64_t u64;
	int i;
	size_t n;

	result = _mum(result, _mum_block_start_prime);
	while (len > _MUM_UNROLL_FACTOR * sizeof(uint64_t)) {
		/* This loop could be vectorized when we have vector insns for
		   64x64->128-bit multiplication.  AVX2 currently has only a
		   vector insn for four 32x32->64-bit multiplications.  */
		for (i = 0; i < _MUM_UNROLL_FACTOR; i++)
			result ^= _mum(_mum_le(((const uint64_t *) str)[i]), _mum_primes[i]);
		len -= _MUM_UNROLL_FACTOR * sizeof(uint64_t);
		str += _MUM_UNROLL_FACTOR * sizeof(uint64_t);
		/* We will use the same prime numbers on the next iterations,
		   so randomize the state.  */
		result = _mum(result, _mum_unroll_prime);
	}
	n = len / sizeof(uint64_t);
	for (i = 0; i < (int) n; i++)
		result ^= _mum(_mum_le(((const uint64_t *) str)[i]), _mum_primes[i]);
	len -= n * sizeof(uint64_t);
	str += n * sizeof(uint64_t);
	switch (len) {
	case 7:
		u64 = _mum_le32(*(const uint32_t *) str);
		u64 |= (uint64_t) str[4] << 32;
		u64 |= (uint64_t) str[5] << 40;
		u64 |= (uint64_t) str[6] << 48;
		return result ^ _mum(u64, _mum_tail_prime);
	case 6:
		u64 = _mum_le32(*(const uint32_t *) str);
		u64 |= (uint64_t) str[4] << 32;
		u64 |= (uint64_t) str[5] << 40;
		return result ^ _mum(u64, _mum_tail_prime);
	case 5:
		u64 = _mum_le32(*(const uint32_t *) str);
		u64 |= (uint64_t) str[4] << 32;
		return result ^ _mum(u64, _mum_tail_prime);
	case 4:
		u64 = _mum_le32(*(const uint32_t *) str);
		return result ^ _mum(u64, _mum_tail_prime);
	case 3:
		u64 = str[0];
		u64 |= (uint64_t) str[1] << 8;
		u64 |= (uint64_t) str[2] << 16;
		return result ^ _mum(u64, _mum_tail_prime);
	case 2:
		u64 = str[0];
		u64 |= (uint64_t) str[1] << 8;
		return result ^ _mum(u64, _mum_tail_prime);
	case 1:
		u64 = str[0];
		return result ^ _mum(u64, _mum_tail_prime);
	}
	return result;
}

/* Final randomization of H.  */
static inline uint64_t
_mum_final(uint64_t h)
{
	h ^= _mum(h, _mum_finish_prime1);
	h ^= _mum(h, _mum_finish_prime2);
	return h;
}

#if defined(__x86_64__) && defined(_MUM_FRESH_GCC)

/* We want to use the MULX instruction (a BMI2 instruction introduced
   alongside AVX2 on Haswell) instead of the generic x86-64 MULQ where
   possible.  Although on modern Intel processors MULQ takes 3 cycles
   vs. 4 for MULX, MULX permits more freedom in insn scheduling as it
   uses fewer fixed registers.  */
static inline uint64_t _MUM_TARGET("arch=haswell")
	_mum_hash_avx2(const void *key, size_t len, uint64_t seed)
{
	return _mum_final(_mum_hash_aligned(seed + len, key, len));
}
#endif

#ifndef _MUM_UNALIGNED_ACCESS
#if defined(__x86_64__) || defined(__i386__) || defined(__PPC64__) || defined(__s390__) || defined(__m32c__) || defined(cris) || defined(__CR16__) || defined(__vax__) || defined(__m68k__) || defined(__aarch64__)
#define _MUM_UNALIGNED_ACCESS 1
#else
#define _MUM_UNALIGNED_ACCESS 0
#endif
#endif

/* When we need aligned access to the data being hashed, we copy part
   of the unaligned data into an aligned block of the given size,
   process it, and repeat block by block.  */
#ifndef _MUM_BLOCK_LEN
#define _MUM_BLOCK_LEN 1024
#endif

#if _MUM_BLOCK_LEN < 8
#error "too small block length"
#endif

static inline uint64_t
#if defined(__x86_64__)
	_MUM_TARGET("inline-all-stringops")
#endif
		_mum_hash_default(const void *key, size_t len, uint64_t seed)
{
	uint64_t result;
	const unsigned char *str = (const unsigned char *) key;
	size_t block_len;
	uint64_t buf[_MUM_BLOCK_LEN / sizeof(uint64_t)];

	result = seed + len;
	if (_MUM_UNALIGNED_ACCESS || ((size_t) str & 0x7) == 0)
		result = _mum_hash_aligned(result, key, len);
	else {
		while (len != 0) {
			block_len = len < _MUM_BLOCK_LEN ? len : _MUM_BLOCK_LEN;
			memmove(buf, str, block_len);
			result = _mum_hash_aligned(result, buf, block_len);
			len -= block_len;
			str += block_len;
		}
	}
	return _mum_final(result);
}

static inline uint64_t
_mum_next_factor(void)
{
	uint64_t start = 0;
	int i;

	for (i = 0; i < 8; i++)
		start = (start << 8) | rand() % 256;
	return start;
}

/* ++++++++++++++++++++++++++ Interface functions: +++++++++++++++++++  */

/* Set random multipliers depending on SEED.  */
static inline void
mum_hash_randomize(uint64_t seed)
{
	int i;

	srand(seed);
	_mum_hash_step_prime = _mum_next_factor();
	_mum_key_step_prime = _mum_next_factor();
	_mum_finish_prime1 = _mum_next_factor();
	_mum_finish_prime2 = _mum_next_factor();
	_mum_block_start_prime = _mum_next_factor();
	_mum_unroll_prime = _mum_next_factor();
	_mum_tail_prime = _mum_next_factor();
	for (i = 0; i < (int) (sizeof(_mum_primes) / sizeof(uint64_t)); i++)
		_mum_primes[i] = _mum_next_factor();
}

/* Start hashing data with SEED.  Return the state.  */
static inline uint64_t
mum_hash_init(uint64_t seed)
{
	return seed;
}

/* Process data KEY with the state H and return the updated state.  */
static inline uint64_t
mum_hash_step(uint64_t h, uint64_t key)
{
	return _mum(h, _mum_hash_step_prime) ^ _mum(key, _mum_key_step_prime);
}

/* Return the result of hashing using the current state H.  */
static inline uint64_t
mum_hash_finish(uint64_t h)
{
	return _mum_final(h);
}
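
/* A minimal usage sketch for the incremental interface above (the
   variable names and values are illustrative only):

       uint64_t state = mum_hash_init(42);
       state = mum_hash_step(state, key_part1);
       state = mum_hash_step(state, key_part2);
       uint64_t hash = mum_hash_finish(state);
*/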

/* Fast hashing of KEY with SEED.  The hash is always the same for the
   same key on any target. */
static inline size_t
mum_hash64(uint64_t key, uint64_t seed)
{
	return mum_hash_finish(mum_hash_step(mum_hash_init(seed), key));
}

/* Hash data KEY of length LEN with SEED.  The hash depends on the
   target endianness and the unroll factor.  */
static inline uint64_t
mum_hash(const void *key, size_t len, uint64_t seed)
{
#if defined(__x86_64__) && defined(_MUM_FRESH_GCC)
	static int avx2_support = 0;

	if (avx2_support > 0)
		return _mum_hash_avx2(key, len, seed);
	else if (!avx2_support) {
		__builtin_cpu_init();
		avx2_support = __builtin_cpu_supports("avx2") ? 1 : -1;
		if (avx2_support > 0)
			return _mum_hash_avx2(key, len, seed);
	}
#endif
	return _mum_hash_default(key, len, seed);
}
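
/* A minimal usage sketch for the one-shot interface (the buffer and
   seed are arbitrary illustrative values):

       const char buf[] = "some key";
       uint64_t h = mum_hash(buf, sizeof(buf) - 1, 0x42);

   Call mum_hash_randomize(seed) beforehand to derive a different set
   of multipliers from SEED, e.g. with a run-time random seed to make
   hash values unpredictable.  */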

#endif