1 #ifndef blamka_round_ssse3_H
2 #define blamka_round_ssse3_H
3
4 #include "private/common.h"
5 #include "private/sse2_64_32.h"
6
/* Byte-index masks for _mm_shuffle_epi8 that rotate each 64-bit lane of a
 * 128-bit vector right by 16 bits (r16) or 24 bits (r24): within each
 * 8-byte half, output byte i takes source byte (i + 2) mod 8, resp.
 * (i + 3) mod 8. */
#define r16 \
    (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
#define r24 \
    (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
/* Rotate each 64-bit lane of x right by -c bits (c is a negative constant,
 * following the sign convention of the XOP _mm_roti_epi64 intrinsic this
 * macro emulates).  The BLAKE2b rotation counts get cheap special cases:
 * 32 via a 32-bit shuffle, 24 and 16 via SSSE3 byte shuffles (r24/r16),
 * 63 via shift+add (x+x == x<<1); anything else falls back to
 * shift/shift/xor.  The whole expansion is parenthesized so the macro is
 * safe to use inside any larger expression. */
#define _mm_roti_epi64(x, c)                                                  \
    ((-(c) == 32)                                                             \
         ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1))                    \
         : (-(c) == 24)                                                       \
               ? _mm_shuffle_epi8((x), r24)                                   \
               : (-(c) == 16)                                                 \
                     ? _mm_shuffle_epi8((x), r16)                             \
                     : (-(c) == 63)                                           \
                           ? _mm_xor_si128(_mm_srli_epi64((x), -(c)),         \
                                           _mm_add_epi64((x), (x)))           \
                           : _mm_xor_si128(_mm_srli_epi64((x), -(c)),         \
                                           _mm_slli_epi64((x), 64 - (-(c)))))
23
/* Argon2's BlaMka mixing primitive, two 64-bit lanes at a time:
 * for each lane, returns x + y + 2 * (low32(x) * low32(y)), where
 * _mm_mul_epu32 supplies the 32x32->64 product of the low halves. */
static inline __m128i
fBlaMka(__m128i x, __m128i y)
{
    const __m128i lo_product = _mm_mul_epu32(x, y);
    const __m128i lane_sum   = _mm_add_epi64(x, y);

    return _mm_add_epi64(lane_sum, _mm_add_epi64(lo_product, lo_product));
}
30
/* First half of the BLAKE2b G function, applied to two register pairs at
 * once (each of A/B/C/D spans two __m128i registers holding two 64-bit
 * words apiece).  Plain 64-bit addition is replaced by Argon2's fBlaMka;
 * the word rotations (by 32, then 24 bits) use the _mm_roti_epi64 macro
 * with its negative-count convention. */
#define G1(A0, B0, C0, D0, A1, B1, C1, D1)                                    \
    do {                                                                      \
        A0 = fBlaMka(A0, B0);                                                 \
        A1 = fBlaMka(A1, B1);                                                 \
                                                                              \
        D0 = _mm_xor_si128(D0, A0);                                           \
        D1 = _mm_xor_si128(D1, A1);                                           \
                                                                              \
        D0 = _mm_roti_epi64(D0, -32);                                         \
        D1 = _mm_roti_epi64(D1, -32);                                         \
                                                                              \
        C0 = fBlaMka(C0, D0);                                                 \
        C1 = fBlaMka(C1, D1);                                                 \
                                                                              \
        B0 = _mm_xor_si128(B0, C0);                                           \
        B1 = _mm_xor_si128(B1, C1);                                           \
                                                                              \
        B0 = _mm_roti_epi64(B0, -24);                                         \
        B1 = _mm_roti_epi64(B1, -24);                                         \
    } while ((void) 0, 0)
51
/* Second half of the BLAKE2b G function (rotations by 16, then 63 bits),
 * again with fBlaMka in place of 64-bit addition and operating on both
 * register pairs per invocation.  Together, G1 followed by G2 is one
 * complete G application. */
#define G2(A0, B0, C0, D0, A1, B1, C1, D1)                                    \
    do {                                                                      \
        A0 = fBlaMka(A0, B0);                                                 \
        A1 = fBlaMka(A1, B1);                                                 \
                                                                              \
        D0 = _mm_xor_si128(D0, A0);                                           \
        D1 = _mm_xor_si128(D1, A1);                                           \
                                                                              \
        D0 = _mm_roti_epi64(D0, -16);                                         \
        D1 = _mm_roti_epi64(D1, -16);                                         \
                                                                              \
        C0 = fBlaMka(C0, D0);                                                 \
        C1 = fBlaMka(C1, D1);                                                 \
                                                                              \
        B0 = _mm_xor_si128(B0, C0);                                           \
        B1 = _mm_xor_si128(B1, C1);                                           \
                                                                              \
        B0 = _mm_roti_epi64(B0, -63);                                         \
        B1 = _mm_roti_epi64(B1, -63);                                         \
    } while ((void) 0, 0)
72
/* Rotate the B, C and D rows of the 4x4 word state so the next G1/G2 pass
 * mixes diagonals instead of columns.  Each row is four 64-bit words laid
 * out as [X0_lo, X0_hi, X1_lo, X1_hi]; _mm_alignr_epi8(hi, lo, 8) picks
 * [lo_hi, hi_lo], i.e. rotates such a pair by one word.  Net effect: B is
 * rotated left by one word, C by two (a plain register swap), D by three.
 * A0/A1 are untouched; they appear only so all four macros share one
 * parameter list. */
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                           \
    do {                                                                      \
        __m128i t0 = _mm_alignr_epi8(B1, B0, 8);                              \
        __m128i t1 = _mm_alignr_epi8(B0, B1, 8);                              \
        B0         = t0;                                                      \
        B1         = t1;                                                      \
                                                                              \
        t0 = C0;                                                              \
        C0 = C1;                                                              \
        C1 = t0;                                                              \
                                                                              \
        t0 = _mm_alignr_epi8(D1, D0, 8);                                      \
        t1 = _mm_alignr_epi8(D0, D1, 8);                                      \
        D0 = t1;                                                              \
        D1 = t0;                                                              \
    } while ((void) 0, 0)
89
/* Exact inverse of DIAGONALIZE: rotate B right by one word, swap C back,
 * and rotate D right by three words (note the alignr operand order and the
 * D0 = t1 / D1 = t0 assignments are mirrored relative to DIAGONALIZE),
 * restoring column order after the diagonal G1/G2 pass. */
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                         \
    do {                                                                      \
        __m128i t0 = _mm_alignr_epi8(B0, B1, 8);                              \
        __m128i t1 = _mm_alignr_epi8(B1, B0, 8);                              \
        B0         = t0;                                                      \
        B1         = t1;                                                      \
                                                                              \
        t0 = C0;                                                              \
        C0 = C1;                                                              \
        C1 = t0;                                                              \
                                                                              \
        t0 = _mm_alignr_epi8(D0, D1, 8);                                      \
        t1 = _mm_alignr_epi8(D1, D0, 8);                                      \
        D0 = t1;                                                              \
        D1 = t0;                                                              \
    } while ((void) 0, 0)
106
/* One full round of the Argon2 permutation over a 4x4 matrix of 64-bit
 * words held in eight __m128i registers: mix the columns (G1 + G2),
 * rotate rows onto diagonals, mix again, then undo the rotation.
 * NOTE: the parameter order here is A0, A1, B0, B1, ... — different from
 * the A0, B0, ... order used by the inner macros. */
#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1)                          \
    do {                                                                      \
        G1(A0, B0, C0, D0, A1, B1, C1, D1);                                   \
        G2(A0, B0, C0, D0, A1, B1, C1, D1);                                   \
                                                                              \
        DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1);                          \
                                                                              \
        G1(A0, B0, C0, D0, A1, B1, C1, D1);                                   \
        G2(A0, B0, C0, D0, A1, B1, C1, D1);                                   \
                                                                              \
        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1);                        \
    } while ((void) 0, 0)
119
120 #endif
121