1 /* 2 BLAKE2 reference source code package - optimized C implementations 3 4 Written in 2012 by Samuel Neves <sneves@dei.uc.pt> 5 6 To the extent possible under law, the author(s) have dedicated all copyright 7 and related and neighboring rights to this software to the public domain 8 worldwide. This software is distributed without any warranty. 9 10 You should have received a copy of the CC0 Public Domain Dedication along with 11 this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>. 12 */ 13 14 #include <stdint.h> 15 #include <string.h> 16 #include <stdio.h> 17 18 #include "blake2.h" 19 #include "blake2-impl.h" 20 21 #include "blake2-config.h" 22 23 #if defined(_MSC_VER) 24 #include <intrin.h> 25 #endif 26 27 #if defined(HAVE_SSE2) 28 #include <emmintrin.h> 29 // MSVC only defines _mm_set_epi64x for x86_64... 30 #if defined(_MSC_VER) && !defined(_M_X64) 31 static inline __m128i _mm_set_epi64x( const uint64_t u1, const uint64_t u0 ) 32 { 33 return _mm_set_epi32( u1 >> 32, u1, u0 >> 32, u0 ); 34 } 35 #endif 36 #endif 37 38 #if defined(HAVE_SSSE3) 39 #include <tmmintrin.h> 40 #endif 41 #if defined(HAVE_SSE4_1) 42 #include <smmintrin.h> 43 #endif 44 #if defined(HAVE_AVX) 45 #include <immintrin.h> 46 #endif 47 #if defined(HAVE_XOP) && !defined(_MSC_VER) 48 #include <x86intrin.h> 49 #endif 50 51 52 53 #include "blake2b-round.h" 54 55 static const uint64_t blake2b_IV[8] = 56 { 57 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, 58 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, 59 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, 60 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL 61 }; 62 63 static const uint8_t blake2b_sigma[12][16] = 64 { 65 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , 66 { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , 67 { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , 68 { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , 69 { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , 70 { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , 71 { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , 72 { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , 73 { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , 74 { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , 75 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , 76 { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } 77 }; 78 79 80 /* Some helper functions, not necessarily useful */ 81 static inline int blake2b_set_lastnode( blake2b_state *S ) 82 { 83 S->f[1] = ~0ULL; 84 return 0; 85 } 86 87 static inline int blake2b_clear_lastnode( blake2b_state *S ) 88 { 89 S->f[1] = 0ULL; 90 return 0; 91 } 92 93 static inline int blake2b_set_lastblock( blake2b_state *S ) 94 { 95 if( S->last_node ) blake2b_set_lastnode( S ); 96 97 S->f[0] = ~0ULL; 98 return 0; 99 } 100 101 static inline int blake2b_clear_lastblock( blake2b_state *S ) 102 { 103 if( S->last_node ) blake2b_clear_lastnode( S ); 104 105 S->f[0] = 0ULL; 106 return 0; 107 } 108 109 110 static inline int blake2b_increment_counter( blake2b_state *S, const uint64_t inc ) 111 { 112 #if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__)) 113 // ADD/ADC chain 114 __uint128_t t = ( ( __uint128_t )S->t[1] << 64 ) | S->t[0]; 115 t += inc; 116 S->t[0] = ( uint64_t )( t >> 0 ); 117 S->t[1] = ( uint64_t )( t >> 64 ); 118 #else 119 S->t[0] += inc; 120 S->t[1] += ( S->t[0] < inc ); 121 #endif 122 return 0; 123 } 124 125 126 // Parameter-related functions 127 static inline int blake2b_param_set_digest_length( blake2b_param *P, const uint8_t digest_length ) 128 { 129 P->digest_length = digest_length; 130 return 0; 131 } 132 133 static inline int blake2b_param_set_fanout( blake2b_param *P, const uint8_t fanout ) 134 { 135 P->fanout = fanout; 136 return 0; 137 } 138 139 static inline int blake2b_param_set_max_depth( blake2b_param *P, const uint8_t depth ) 140 { 141 P->depth = depth; 142 return 0; 143 } 144 145 static inline int blake2b_param_set_leaf_length( blake2b_param *P, const uint32_t leaf_length ) 146 { 147 P->leaf_length = leaf_length; 148 return 0; 149 } 150 151 static inline int blake2b_param_set_node_offset( blake2b_param *P, const uint64_t node_offset ) 152 { 153 P->node_offset = node_offset; 154 return 0; 155 } 156 157 static inline int blake2b_param_set_node_depth( blake2b_param *P, const uint8_t node_depth ) 158 { 159 P->node_depth = node_depth; 160 return 0; 161 } 162 163 static inline int blake2b_param_set_inner_length( blake2b_param *P, const uint8_t inner_length ) 164 { 165 P->inner_length = inner_length; 166 return 0; 167 } 168 169 static inline int blake2b_param_set_salt( blake2b_param *P, const uint8_t salt[BLAKE2B_SALTBYTES] ) 170 { 171 memcpy( P->salt, salt, BLAKE2B_SALTBYTES ); 172 return 0; 173 } 174 175 static inline int blake2b_param_set_personal( blake2b_param *P, const uint8_t personal[BLAKE2B_PERSONALBYTES] ) 176 { 177 memcpy( P->personal, personal, BLAKE2B_PERSONALBYTES ); 178 return 0; 179 } 180 181 static inline int blake2b_init0( blake2b_state *S ) 182 { 183 memset( S, 0, sizeof( blake2b_state ) ); 184 185 for( int i = 0; i < 8; ++i ) S->h[i] = blake2b_IV[i]; 186 187 return 0; 188 } 189 190 191 192 #define blake2b_init BLAKE2_IMPL_NAME(blake2b_init) 193 #define blake2b_init_param BLAKE2_IMPL_NAME(blake2b_init_param) 194 #define blake2b_init_key BLAKE2_IMPL_NAME(blake2b_init_key) 195 #define blake2b_update BLAKE2_IMPL_NAME(blake2b_update) 196 #define blake2b_final BLAKE2_IMPL_NAME(blake2b_final) 197 #define blake2b BLAKE2_IMPL_NAME(blake2b) 198 199 #if defined(__cplusplus) 200 extern "C" { 201 #endif 202 int blake2b_init( blake2b_state *S, size_t outlen ); 203 int blake2b_init_param( blake2b_state *S, const blake2b_param *P ); 204 int blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t keylen ); 205 int blake2b_update( blake2b_state *S, const uint8_t *in, size_t inlen ); 206 int blake2b_final( blake2b_state *S, uint8_t *out, size_t outlen ); 207 int blake2b( uint8_t *out, const void *in, const void *key, size_t outlen, size_t inlen, size_t keylen ); 208 #if defined(__cplusplus) 209 } 210 #endif 211 212 /* init xors IV with input parameter block */ 213 int blake2b_init_param( blake2b_state *S, const blake2b_param *P ) 214 { 215 uint8_t *p, *h, *v; 216 //blake2b_init0( S ); 217 v = ( uint8_t * )( blake2b_IV ); 218 h = ( uint8_t * )( S->h ); 219 p = ( uint8_t * )( P ); 220 /* IV XOR ParamBlock */ 221 memset( S, 0, sizeof( blake2b_state ) ); 222 223 for( int i = 0; i < BLAKE2B_OUTBYTES; ++i ) h[i] = v[i] ^ p[i]; 224 225 S->outlen = P->digest_length; 226 return 0; 227 } 228 229 230 /* Some sort of default parameter block initialization, for sequential blake2b */ 231 232 int blake2b_init( blake2b_state *S, size_t outlen ) 233 { 234 if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1; 235 236 const blake2b_param P = 237 { 238 ( uint8_t ) outlen, 239 0, 240 1, 241 1, 242 0, 243 0, 244 0, 245 0, 246 {0}, 247 {0}, 248 {0} 249 }; 250 return blake2b_init_param( S, &P ); 251 } 252 253 int blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t keylen ) 254 { 255 if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1; 256 257 if ( ( !keylen ) || keylen > BLAKE2B_KEYBYTES ) return -1; 258 259 const blake2b_param P = 260 { 261 ( uint8_t ) outlen, 262 ( uint8_t ) keylen, 263 1, 264 1, 265 0, 266 0, 267 0, 268 0, 269 {0}, 270 {0}, 271 {0} 272 }; 273 274 if( blake2b_init_param( S, &P ) < 0 ) 275 return 0; 276 277 { 278 uint8_t block[BLAKE2B_BLOCKBYTES]; 279 memset( block, 0, BLAKE2B_BLOCKBYTES ); 280 memcpy( block, key, keylen ); 281 blake2b_update( S, block, BLAKE2B_BLOCKBYTES ); 282 secure_zero_memory( block, BLAKE2B_BLOCKBYTES ); /* Burn the key from stack */ 283 } 284 return 0; 285 } 286 287 static inline int blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] ) 288 { 289 __m128i row1l, row1h; 290 __m128i row2l, row2h; 291 __m128i row3l, row3h; 292 __m128i row4l, row4h; 293 __m128i b0, b1; 294 __m128i t0, t1; 295 #if defined(HAVE_SSSE3) && !defined(HAVE_XOP) 296 const __m128i r16 = _mm_setr_epi8( 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 ); 297 const __m128i r24 = _mm_setr_epi8( 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 ); 298 #endif 299 #if defined(HAVE_SSE4_1) 300 const __m128i m0 = LOADU( block + 00 ); 301 const __m128i m1 = LOADU( block + 16 ); 302 const __m128i m2 = LOADU( block + 32 ); 303 const __m128i m3 = LOADU( block + 48 ); 304 const __m128i m4 = LOADU( block + 64 ); 305 const __m128i m5 = LOADU( block + 80 ); 306 const __m128i m6 = LOADU( block + 96 ); 307 const __m128i m7 = LOADU( block + 112 ); 308 #else 309 const uint64_t m0 = ( ( uint64_t * )block )[ 0]; 310 const uint64_t m1 = ( ( uint64_t * )block )[ 1]; 311 const uint64_t m2 = ( ( uint64_t * )block )[ 2]; 312 const uint64_t m3 = ( ( uint64_t * )block )[ 3]; 313 const uint64_t m4 = ( ( uint64_t * )block )[ 4]; 314 const uint64_t m5 = ( ( uint64_t * )block )[ 5]; 315 const uint64_t m6 = ( ( uint64_t * )block )[ 6]; 316 const uint64_t m7 = ( ( uint64_t * )block )[ 7]; 317 const uint64_t m8 = ( ( uint64_t * )block )[ 8]; 318 const uint64_t m9 = ( ( uint64_t * )block )[ 9]; 319 const uint64_t m10 = ( ( uint64_t * )block )[10]; 320 const uint64_t m11 = ( ( uint64_t * )block )[11]; 321 const uint64_t m12 = ( ( uint64_t * )block )[12]; 322 const uint64_t m13 = ( ( uint64_t * )block )[13]; 323 const uint64_t m14 = ( ( uint64_t * )block )[14]; 324 const uint64_t m15 = ( ( uint64_t * )block )[15]; 325 #endif 326 row1l = LOADU( &S->h[0] ); 327 row1h = LOADU( &S->h[2] ); 328 row2l = LOADU( &S->h[4] ); 329 row2h = LOADU( &S->h[6] ); 330 row3l = LOADU( &blake2b_IV[0] ); 331 row3h = LOADU( &blake2b_IV[2] ); 332 row4l = _mm_xor_si128( LOADU( &blake2b_IV[4] ), LOADU( &S->t[0] ) ); 333 row4h = _mm_xor_si128( LOADU( &blake2b_IV[6] ), LOADU( &S->f[0] ) ); 334 ROUND( 0 ); 335 ROUND( 1 ); 336 ROUND( 2 ); 337 ROUND( 3 ); 338 ROUND( 4 ); 339 ROUND( 5 ); 340 ROUND( 6 ); 341 ROUND( 7 ); 342 ROUND( 8 ); 343 ROUND( 9 ); 344 ROUND( 10 ); 345 ROUND( 11 ); 346 row1l = _mm_xor_si128( row3l, row1l ); 347 row1h = _mm_xor_si128( row3h, row1h ); 348 STOREU( &S->h[0], _mm_xor_si128( LOADU( &S->h[0] ), row1l ) ); 349 STOREU( &S->h[2], _mm_xor_si128( LOADU( &S->h[2] ), row1h ) ); 350 row2l = _mm_xor_si128( row4l, row2l ); 351 row2h = _mm_xor_si128( row4h, row2h ); 352 STOREU( &S->h[4], _mm_xor_si128( LOADU( &S->h[4] ), row2l ) ); 353 STOREU( &S->h[6], _mm_xor_si128( LOADU( &S->h[6] ), row2h ) ); 354 return 0; 355 } 356 357 358 int blake2b_update( blake2b_state *S, const uint8_t *in, size_t inlen ) 359 { 360 while( inlen > 0 ) 361 { 362 uint32_t left = S->buflen; 363 uint32_t fill = 2 * BLAKE2B_BLOCKBYTES - left; 364 365 if( inlen > fill ) 366 { 367 memcpy( S->buf + left, in, fill ); // Fill buffer 368 S->buflen += fill; 369 blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES ); 370 blake2b_compress( S, S->buf ); // Compress 371 memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES ); // Shift buffer left 372 S->buflen -= BLAKE2B_BLOCKBYTES; 373 in += fill; 374 inlen -= fill; 375 } 376 else // inlen <= fill 377 { 378 memcpy( S->buf + left, in, inlen ); 379 S->buflen += ( uint32_t ) inlen; // Be lazy, do not compress 380 in += inlen; 381 inlen -= inlen; 382 } 383 } 384 385 return 0; 386 } 387 388 389 int blake2b_final( blake2b_state *S, uint8_t *out, size_t outlen ) 390 { 391 if(S->outlen != outlen) return -1; 392 393 if( S->buflen > BLAKE2B_BLOCKBYTES ) 394 { 395 blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES ); 396 blake2b_compress( S, S->buf ); 397 S->buflen -= BLAKE2B_BLOCKBYTES; 398 memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen ); 399 } 400 401 blake2b_increment_counter( S, S->buflen ); 402 blake2b_set_lastblock( S ); 403 memset( S->buf + S->buflen, 0, 2 * BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */ 404 blake2b_compress( S, S->buf ); 405 memcpy( out, &S->h[0], outlen ); 406 return 0; 407 } 408 409 410 int blake2b( uint8_t *out, const void *in, const void *key, size_t outlen, size_t inlen, size_t keylen ) 411 { 412 blake2b_state S[1]; 413 414 /* Verify parameters */ 415 if ( NULL == in && inlen > 0 ) return -1; 416 417 if ( NULL == out ) return -1; 418 419 if( NULL == key && keylen > 0 ) return -1; 420 421 if( !outlen || outlen > BLAKE2B_OUTBYTES ) return -1; 422 423 if( keylen > BLAKE2B_KEYBYTES ) return -1; 424 425 if( keylen ) 426 { 427 if( blake2b_init_key( S, outlen, key, keylen ) < 0 ) return -1; 428 } 429 else 430 { 431 if( blake2b_init( S, outlen ) < 0 ) return -1; 432 } 433 434 if( blake2b_update( S, ( uint8_t * )in, inlen ) < 0) return -1; 435 return blake2b_final( S, out, outlen ); 436 } 437 438 #if defined(SUPERCOP) 439 int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen ) 440 { 441 return blake2b( out, in, NULL, BLAKE2B_OUTBYTES, inlen, 0 ); 442 } 443 #endif 444