1*0e33efe4SConrad Meyer /* 2*0e33efe4SConrad Meyer BLAKE2 reference source code package - optimized C implementations 3*0e33efe4SConrad Meyer 4*0e33efe4SConrad Meyer Written in 2012 by Samuel Neves <sneves@dei.uc.pt> 5*0e33efe4SConrad Meyer 6*0e33efe4SConrad Meyer To the extent possible under law, the author(s) have dedicated all copyright 7*0e33efe4SConrad Meyer and related and neighboring rights to this software to the public domain 8*0e33efe4SConrad Meyer worldwide. This software is distributed without any warranty. 9*0e33efe4SConrad Meyer 10*0e33efe4SConrad Meyer You should have received a copy of the CC0 Public Domain Dedication along with 11*0e33efe4SConrad Meyer this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>. 12*0e33efe4SConrad Meyer */ 13*0e33efe4SConrad Meyer #pragma once 14*0e33efe4SConrad Meyer #ifndef __BLAKE2B_ROUND_H__ 15*0e33efe4SConrad Meyer #define __BLAKE2B_ROUND_H__ 16*0e33efe4SConrad Meyer 17*0e33efe4SConrad Meyer #define LOAD(p) _mm_load_si128( (__m128i *)(p) ) 18*0e33efe4SConrad Meyer #define STORE(p,r) _mm_store_si128((__m128i *)(p), r) 19*0e33efe4SConrad Meyer 20*0e33efe4SConrad Meyer #define LOADU(p) _mm_loadu_si128( (__m128i *)(p) ) 21*0e33efe4SConrad Meyer #define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r) 22*0e33efe4SConrad Meyer 23*0e33efe4SConrad Meyer #define TOF(reg) _mm_castsi128_ps((reg)) 24*0e33efe4SConrad Meyer #define TOI(reg) _mm_castps_si128((reg)) 25*0e33efe4SConrad Meyer 26*0e33efe4SConrad Meyer #define LIKELY(x) __builtin_expect((x),1) 27*0e33efe4SConrad Meyer 28*0e33efe4SConrad Meyer 29*0e33efe4SConrad Meyer /* Microarchitecture-specific macros */ 30*0e33efe4SConrad Meyer #ifndef HAVE_XOP 31*0e33efe4SConrad Meyer #ifdef HAVE_SSSE3 32*0e33efe4SConrad Meyer #define _mm_roti_epi64(x, c) \ 33*0e33efe4SConrad Meyer (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \ 34*0e33efe4SConrad Meyer : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \ 35*0e33efe4SConrad Meyer : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \ 36*0e33efe4SConrad Meyer : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \ 37*0e33efe4SConrad Meyer : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))) 38*0e33efe4SConrad Meyer #else 39*0e33efe4SConrad Meyer #define _mm_roti_epi64(r, c) _mm_xor_si128(_mm_srli_epi64( (r), -(c) ),_mm_slli_epi64( (r), 64-(-(c)) )) 40*0e33efe4SConrad Meyer #endif 41*0e33efe4SConrad Meyer #else 42*0e33efe4SConrad Meyer /* ... */ 43*0e33efe4SConrad Meyer #endif 44*0e33efe4SConrad Meyer 45*0e33efe4SConrad Meyer 46*0e33efe4SConrad Meyer 47*0e33efe4SConrad Meyer #define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ 48*0e33efe4SConrad Meyer row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ 49*0e33efe4SConrad Meyer row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ 50*0e33efe4SConrad Meyer \ 51*0e33efe4SConrad Meyer row4l = _mm_xor_si128(row4l, row1l); \ 52*0e33efe4SConrad Meyer row4h = _mm_xor_si128(row4h, row1h); \ 53*0e33efe4SConrad Meyer \ 54*0e33efe4SConrad Meyer row4l = _mm_roti_epi64(row4l, -32); \ 55*0e33efe4SConrad Meyer row4h = _mm_roti_epi64(row4h, -32); \ 56*0e33efe4SConrad Meyer \ 57*0e33efe4SConrad Meyer row3l = _mm_add_epi64(row3l, row4l); \ 58*0e33efe4SConrad Meyer row3h = _mm_add_epi64(row3h, row4h); \ 59*0e33efe4SConrad Meyer \ 60*0e33efe4SConrad Meyer row2l = _mm_xor_si128(row2l, row3l); \ 61*0e33efe4SConrad Meyer row2h = _mm_xor_si128(row2h, row3h); \ 62*0e33efe4SConrad Meyer \ 63*0e33efe4SConrad Meyer row2l = _mm_roti_epi64(row2l, -24); \ 64*0e33efe4SConrad Meyer row2h = _mm_roti_epi64(row2h, -24); \ 65*0e33efe4SConrad Meyer 66*0e33efe4SConrad Meyer #define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ 67*0e33efe4SConrad Meyer row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ 68*0e33efe4SConrad Meyer row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ 69*0e33efe4SConrad Meyer \ 70*0e33efe4SConrad Meyer row4l = _mm_xor_si128(row4l, row1l); \ 71*0e33efe4SConrad Meyer row4h = _mm_xor_si128(row4h, row1h); \ 72*0e33efe4SConrad Meyer \ 73*0e33efe4SConrad Meyer row4l = _mm_roti_epi64(row4l, -16); \ 74*0e33efe4SConrad Meyer row4h = _mm_roti_epi64(row4h, -16); \ 75*0e33efe4SConrad Meyer \ 76*0e33efe4SConrad Meyer row3l = _mm_add_epi64(row3l, row4l); \ 77*0e33efe4SConrad Meyer row3h = _mm_add_epi64(row3h, row4h); \ 78*0e33efe4SConrad Meyer \ 79*0e33efe4SConrad Meyer row2l = _mm_xor_si128(row2l, row3l); \ 80*0e33efe4SConrad Meyer row2h = _mm_xor_si128(row2h, row3h); \ 81*0e33efe4SConrad Meyer \ 82*0e33efe4SConrad Meyer row2l = _mm_roti_epi64(row2l, -63); \ 83*0e33efe4SConrad Meyer row2h = _mm_roti_epi64(row2h, -63); \ 84*0e33efe4SConrad Meyer 85*0e33efe4SConrad Meyer #if defined(HAVE_SSSE3) 86*0e33efe4SConrad Meyer #define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ 87*0e33efe4SConrad Meyer t0 = _mm_alignr_epi8(row2h, row2l, 8); \ 88*0e33efe4SConrad Meyer t1 = _mm_alignr_epi8(row2l, row2h, 8); \ 89*0e33efe4SConrad Meyer row2l = t0; \ 90*0e33efe4SConrad Meyer row2h = t1; \ 91*0e33efe4SConrad Meyer \ 92*0e33efe4SConrad Meyer t0 = row3l; \ 93*0e33efe4SConrad Meyer row3l = row3h; \ 94*0e33efe4SConrad Meyer row3h = t0; \ 95*0e33efe4SConrad Meyer \ 96*0e33efe4SConrad Meyer t0 = _mm_alignr_epi8(row4h, row4l, 8); \ 97*0e33efe4SConrad Meyer t1 = _mm_alignr_epi8(row4l, row4h, 8); \ 98*0e33efe4SConrad Meyer row4l = t1; \ 99*0e33efe4SConrad Meyer row4h = t0; 100*0e33efe4SConrad Meyer 101*0e33efe4SConrad Meyer #define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ 102*0e33efe4SConrad Meyer t0 = _mm_alignr_epi8(row2l, row2h, 8); \ 103*0e33efe4SConrad Meyer t1 = _mm_alignr_epi8(row2h, row2l, 8); \ 104*0e33efe4SConrad Meyer row2l = t0; \ 105*0e33efe4SConrad Meyer row2h = t1; \ 106*0e33efe4SConrad Meyer \ 107*0e33efe4SConrad Meyer t0 = row3l; \ 108*0e33efe4SConrad Meyer row3l = row3h; \ 109*0e33efe4SConrad Meyer row3h = t0; \ 110*0e33efe4SConrad Meyer \ 111*0e33efe4SConrad Meyer t0 = _mm_alignr_epi8(row4l, row4h, 8); \ 112*0e33efe4SConrad Meyer t1 = _mm_alignr_epi8(row4h, row4l, 8); \ 113*0e33efe4SConrad Meyer row4l = t1; \ 114*0e33efe4SConrad Meyer row4h = t0; 115*0e33efe4SConrad Meyer #else 116*0e33efe4SConrad Meyer 117*0e33efe4SConrad Meyer #define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ 118*0e33efe4SConrad Meyer t0 = row4l;\ 119*0e33efe4SConrad Meyer t1 = row2l;\ 120*0e33efe4SConrad Meyer row4l = row3l;\ 121*0e33efe4SConrad Meyer row3l = row3h;\ 122*0e33efe4SConrad Meyer row3h = row4l;\ 123*0e33efe4SConrad Meyer row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \ 124*0e33efe4SConrad Meyer row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \ 125*0e33efe4SConrad Meyer row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \ 126*0e33efe4SConrad Meyer row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)) 127*0e33efe4SConrad Meyer 128*0e33efe4SConrad Meyer #define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ 129*0e33efe4SConrad Meyer t0 = row3l;\ 130*0e33efe4SConrad Meyer row3l = row3h;\ 131*0e33efe4SConrad Meyer row3h = t0;\ 132*0e33efe4SConrad Meyer t0 = row2l;\ 133*0e33efe4SConrad Meyer t1 = row4l;\ 134*0e33efe4SConrad Meyer row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \ 135*0e33efe4SConrad Meyer row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \ 136*0e33efe4SConrad Meyer row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \ 137*0e33efe4SConrad Meyer row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)) 138*0e33efe4SConrad Meyer 139*0e33efe4SConrad Meyer #endif 140*0e33efe4SConrad Meyer 141*0e33efe4SConrad Meyer #if defined(HAVE_SSE4_1) 142*0e33efe4SConrad Meyer #include "blake2b-load-sse41.h" 143*0e33efe4SConrad Meyer #else 144*0e33efe4SConrad Meyer #include "blake2b-load-sse2.h" 145*0e33efe4SConrad Meyer #endif 146*0e33efe4SConrad Meyer 147*0e33efe4SConrad Meyer #define ROUND(r) \ 148*0e33efe4SConrad Meyer LOAD_MSG_ ##r ##_1(b0, b1); \ 149*0e33efe4SConrad Meyer G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ 150*0e33efe4SConrad Meyer LOAD_MSG_ ##r ##_2(b0, b1); \ 151*0e33efe4SConrad Meyer G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ 152*0e33efe4SConrad Meyer DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ 153*0e33efe4SConrad Meyer LOAD_MSG_ ##r ##_3(b0, b1); \ 154*0e33efe4SConrad Meyer G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ 155*0e33efe4SConrad Meyer LOAD_MSG_ ##r ##_4(b0, b1); \ 156*0e33efe4SConrad Meyer G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ 157*0e33efe4SConrad Meyer UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); 158*0e33efe4SConrad Meyer 159*0e33efe4SConrad Meyer #endif 160*0e33efe4SConrad Meyer 161