1 2 #ifndef blake2b_compress_ssse3_H 3 #define blake2b_compress_ssse3_H 4 5 #define LOADU(p) _mm_loadu_si128((const __m128i *) (const void *) (p)) 6 #define STOREU(p, r) _mm_storeu_si128((__m128i *) (void *) (p), r) 7 8 #define _mm_roti_epi64(x, c) \ 9 (-(c) == 32) \ 10 ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \ 11 : (-(c) == 24) \ 12 ? _mm_shuffle_epi8((x), r24) \ 13 : (-(c) == 16) \ 14 ? _mm_shuffle_epi8((x), r16) \ 15 : (-(c) == 63) \ 16 ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \ 17 _mm_add_epi64((x), (x))) \ 18 : _mm_xor_si128(_mm_srli_epi64((x), -(c)), \ 19 _mm_slli_epi64((x), 64 - (-(c)))) 20 21 #define G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \ 22 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ 23 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ 24 \ 25 row4l = _mm_xor_si128(row4l, row1l); \ 26 row4h = _mm_xor_si128(row4h, row1h); \ 27 \ 28 row4l = _mm_roti_epi64(row4l, -32); \ 29 row4h = _mm_roti_epi64(row4h, -32); \ 30 \ 31 row3l = _mm_add_epi64(row3l, row4l); \ 32 row3h = _mm_add_epi64(row3h, row4h); \ 33 \ 34 row2l = _mm_xor_si128(row2l, row3l); \ 35 row2h = _mm_xor_si128(row2h, row3h); \ 36 \ 37 row2l = _mm_roti_epi64(row2l, -24); \ 38 row2h = _mm_roti_epi64(row2h, -24); 39 40 #define G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \ 41 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ 42 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ 43 \ 44 row4l = _mm_xor_si128(row4l, row1l); \ 45 row4h = _mm_xor_si128(row4h, row1h); \ 46 \ 47 row4l = _mm_roti_epi64(row4l, -16); \ 48 row4h = _mm_roti_epi64(row4h, -16); \ 49 \ 50 row3l = _mm_add_epi64(row3l, row4l); \ 51 row3h = _mm_add_epi64(row3h, row4h); \ 52 \ 53 row2l = _mm_xor_si128(row2l, row3l); \ 54 row2h = _mm_xor_si128(row2h, row3h); \ 55 \ 56 row2l = _mm_roti_epi64(row2l, -63); \ 57 row2h = _mm_roti_epi64(row2h, -63); 58 59 #define DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \ 60 t0 = _mm_alignr_epi8(row2h, row2l, 8); \ 61 t1 = _mm_alignr_epi8(row2l, row2h, 8); \ 62 row2l = t0; \ 63 row2h = t1; \ 64 \ 65 t0 = row3l; \ 66 row3l = row3h; \ 67 row3h = t0; \ 68 \ 69 t0 = _mm_alignr_epi8(row4h, row4l, 8); \ 70 t1 = _mm_alignr_epi8(row4l, row4h, 8); \ 71 row4l = t1; \ 72 row4h = t0; 73 74 #define UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \ 75 t0 = _mm_alignr_epi8(row2l, row2h, 8); \ 76 t1 = _mm_alignr_epi8(row2h, row2l, 8); \ 77 row2l = t0; \ 78 row2h = t1; \ 79 \ 80 t0 = row3l; \ 81 row3l = row3h; \ 82 row3h = t0; \ 83 \ 84 t0 = _mm_alignr_epi8(row4l, row4h, 8); \ 85 t1 = _mm_alignr_epi8(row4h, row4l, 8); \ 86 row4l = t1; \ 87 row4h = t0; 88 89 #include "blake2b-load-sse2.h" 90 91 #define ROUND(r) \ 92 LOAD_MSG_##r##_1(b0, b1); \ 93 G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \ 94 LOAD_MSG_##r##_2(b0, b1); \ 95 G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \ 96 DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ 97 LOAD_MSG_##r##_3(b0, b1); \ 98 G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \ 99 LOAD_MSG_##r##_4(b0, b1); \ 100 G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \ 101 UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); 102 103 #endif 104