1
2 #define BLAKE2_USE_SSSE3
3 #define BLAKE2_USE_SSE41
4
5 #include <stdint.h>
6 #include <string.h>
7
8 #include "blake2.h"
9 #include "private/common.h"
10 #include "private/sse2_64_32.h"
11
12 #if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) && \
13 defined(HAVE_SMMINTRIN_H)
14
15 # ifdef __GNUC__
16 # pragma GCC target("sse2")
17 # pragma GCC target("ssse3")
18 # pragma GCC target("sse4.1")
19 # endif
20
21 # include <emmintrin.h>
22 # include <smmintrin.h>
23 # include <tmmintrin.h>
24
25 # include "blake2b-compress-sse41.h"
26
27 CRYPTO_ALIGN(64)
28 static const uint64_t blake2b_IV[8] = {
29 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, 0x3c6ef372fe94f82bULL,
30 0xa54ff53a5f1d36f1ULL, 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
31 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
32 };
33
34 int
blake2b_compress_sse41(blake2b_state * S,const uint8_t block[BLAKE2B_BLOCKBYTES])35 blake2b_compress_sse41(blake2b_state *S,
36 const uint8_t block[BLAKE2B_BLOCKBYTES])
37 {
38 __m128i row1l, row1h;
39 __m128i row2l, row2h;
40 __m128i row3l, row3h;
41 __m128i row4l, row4h;
42 __m128i b0, b1;
43 __m128i t0, t1;
44 const __m128i r16 =
45 _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
46 const __m128i r24 =
47 _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
48 const __m128i m0 = LOADU(block + 00);
49 const __m128i m1 = LOADU(block + 16);
50 const __m128i m2 = LOADU(block + 32);
51 const __m128i m3 = LOADU(block + 48);
52 const __m128i m4 = LOADU(block + 64);
53 const __m128i m5 = LOADU(block + 80);
54 const __m128i m6 = LOADU(block + 96);
55 const __m128i m7 = LOADU(block + 112);
56 row1l = LOADU(&S->h[0]);
57 row1h = LOADU(&S->h[2]);
58 row2l = LOADU(&S->h[4]);
59 row2h = LOADU(&S->h[6]);
60 row3l = LOADU(&blake2b_IV[0]);
61 row3h = LOADU(&blake2b_IV[2]);
62 row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0]));
63 row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0]));
64 ROUND(0);
65 ROUND(1);
66 ROUND(2);
67 ROUND(3);
68 ROUND(4);
69 ROUND(5);
70 ROUND(6);
71 ROUND(7);
72 ROUND(8);
73 ROUND(9);
74 ROUND(10);
75 ROUND(11);
76 row1l = _mm_xor_si128(row3l, row1l);
77 row1h = _mm_xor_si128(row3h, row1h);
78 STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l));
79 STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h));
80 row2l = _mm_xor_si128(row4l, row2l);
81 row2h = _mm_xor_si128(row4h, row2h);
82 STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l));
83 STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h));
84 return 0;
85 }
86
87 #endif
88