/*
   BLAKE2 reference source code package - optimized C implementations

   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>

   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.

   You should have received a copy of the CC0 Public Domain Dedication along with
   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2S_LOAD_SSE41_H__
#define __BLAKE2S_LOAD_SSE41_H__

/*
 * LOAD_MSG_<r>_<k>(buf) macros.
 *
 * Each macro stores one 128-bit vector into `buf`, assembled from the vectors
 * m0, m1, m2 and m3 by shuffle / blend / unpack / byte-shift intrinsics.
 * NOTE(review): <r> (0-9) is presumably the BLAKE2s round index and <k> (1-4)
 * the message-vector slot within that round -- confirm against the code that
 * expands these macros.
 *
 * Requirements on the including file (nothing below is defined or included
 * by this header):
 *   - m0..m3 must be in scope; they are only read here, never assigned.
 *   - t0, t1 and t2 must be in scope as assignable vector temporaries; most
 *     macros overwrite one or more of them.
 *   - TOF() and TOI() must be defined (used by the LOAD_MSG_0_x macros to wrap
 *     the operands and the result of _mm_shuffle_ps).
 *   - The intrinsic headers must already be included, e.g. <smmintrin.h> for
 *     _mm_blend_epi16.
 *
 * Usage notes:
 *   - Every expansion already ends in ';' and most expand to several
 *     statements, so never use one as the unbraced body of if/else/for/while.
 *   - `buf` is pasted unparenthesized as an assignment target; pass a plain
 *     lvalue.
 */

/* ---- LOAD_MSG_0_x ---- */

#define LOAD_MSG_0_1(buf) \
    buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(2,0,2,0)));

#define LOAD_MSG_0_2(buf) \
    buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(3,1,3,1)));

#define LOAD_MSG_0_3(buf) \
    buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(2,0,2,0)));

#define LOAD_MSG_0_4(buf) \
    buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(3,1,3,1)));

/* ---- LOAD_MSG_1_x ---- */

#define LOAD_MSG_1_1(buf) \
    t0 = _mm_blend_epi16(m1, m2, 0x0C); \
    t1 = _mm_slli_si128(m3, 4); \
    t2 = _mm_blend_epi16(t0, t1, 0xF0); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3));

#define LOAD_MSG_1_2(buf) \
    t0 = _mm_shuffle_epi32(m2,_MM_SHUFFLE(0,0,2,0)); \
    t1 = _mm_blend_epi16(m1,m3,0xC0); \
    t2 = _mm_blend_epi16(t0, t1, 0xF0); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));

#define LOAD_MSG_1_3(buf) \
    t0 = _mm_slli_si128(m1, 4); \
    t1 = _mm_blend_epi16(m2, t0, 0x30); \
    t2 = _mm_blend_epi16(m0, t1, 0xF0); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));

#define LOAD_MSG_1_4(buf) \
    t0 = _mm_unpackhi_epi32(m0,m1); \
    t1 = _mm_slli_si128(m3, 4); \
    t2 = _mm_blend_epi16(t0, t1, 0x0C); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));

/* ---- LOAD_MSG_2_x ---- */

#define LOAD_MSG_2_1(buf) \
    t0 = _mm_unpackhi_epi32(m2,m3); \
    t1 = _mm_blend_epi16(m3,m1,0x0C); \
    t2 = _mm_blend_epi16(t0, t1, 0x0F); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));

#define LOAD_MSG_2_2(buf) \
    t0 = _mm_unpacklo_epi32(m2,m0); \
    t1 = _mm_blend_epi16(t0, m0, 0xF0); \
    t2 = _mm_slli_si128(m3, 8); \
    buf = _mm_blend_epi16(t1, t2, 0xC0);

#define LOAD_MSG_2_3(buf) \
    t0 = _mm_blend_epi16(m0, m2, 0x3C); \
    t1 = _mm_srli_si128(m1, 12); \
    t2 = _mm_blend_epi16(t0,t1,0x03); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2));

#define LOAD_MSG_2_4(buf) \
    t0 = _mm_slli_si128(m3, 4); \
    t1 = _mm_blend_epi16(m0, m1, 0x33); \
    t2 = _mm_blend_epi16(t1, t0, 0xC0); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3));

/* ---- LOAD_MSG_3_x ---- */

#define LOAD_MSG_3_1(buf) \
    t0 = _mm_unpackhi_epi32(m0,m1); \
    t1 = _mm_unpackhi_epi32(t0, m2); \
    t2 = _mm_blend_epi16(t1, m3, 0x0C); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));

#define LOAD_MSG_3_2(buf) \
    t0 = _mm_slli_si128(m2, 8); \
    t1 = _mm_blend_epi16(m3,m0,0x0C); \
    t2 = _mm_blend_epi16(t1, t0, 0xC0); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));

#define LOAD_MSG_3_3(buf) \
    t0 = _mm_blend_epi16(m0,m1,0x0F); \
    t1 = _mm_blend_epi16(t0, m3, 0xC0); \
    buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));

#define LOAD_MSG_3_4(buf) \
    t0 = _mm_unpacklo_epi32(m0,m2); \
    t1 = _mm_unpackhi_epi32(m1,m2); \
    buf = _mm_unpacklo_epi64(t1,t0);

/* ---- LOAD_MSG_4_x ---- */

#define LOAD_MSG_4_1(buf) \
    t0 = _mm_unpacklo_epi64(m1,m2); \
    t1 = _mm_unpackhi_epi64(m0,m2); \
    t2 = _mm_blend_epi16(t0,t1,0x33); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));

#define LOAD_MSG_4_2(buf) \
    t0 = _mm_unpackhi_epi64(m1,m3); \
    t1 = _mm_unpacklo_epi64(m0,m1); \
    buf = _mm_blend_epi16(t0,t1,0x33);

#define LOAD_MSG_4_3(buf) \
    t0 = _mm_unpackhi_epi64(m3,m1); \
    t1 = _mm_unpackhi_epi64(m2,m0); \
    buf = _mm_blend_epi16(t1,t0,0x33);

#define LOAD_MSG_4_4(buf) \
    t0 = _mm_blend_epi16(m0,m2,0x03); \
    t1 = _mm_slli_si128(t0, 8); \
    t2 = _mm_blend_epi16(t1,m3,0x0F); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3));

/* ---- LOAD_MSG_5_x ---- */

#define LOAD_MSG_5_1(buf) \
    t0 = _mm_unpackhi_epi32(m0,m1); \
    t1 = _mm_unpacklo_epi32(m0,m2); \
    buf = _mm_unpacklo_epi64(t0,t1);

#define LOAD_MSG_5_2(buf) \
    t0 = _mm_srli_si128(m2, 4); \
    t1 = _mm_blend_epi16(m0,m3,0x03); \
    buf = _mm_blend_epi16(t1,t0,0x3C);

#define LOAD_MSG_5_3(buf) \
    t0 = _mm_blend_epi16(m1,m0,0x0C); \
    t1 = _mm_srli_si128(m3, 4); \
    t2 = _mm_blend_epi16(t0,t1,0x30); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0));

#define LOAD_MSG_5_4(buf) \
    t0 = _mm_unpacklo_epi64(m1,m2); \
    t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1)); \
    buf = _mm_blend_epi16(t0,t1,0x33);

/* ---- LOAD_MSG_6_x ---- */

#define LOAD_MSG_6_1(buf) \
    t0 = _mm_slli_si128(m1, 12); \
    t1 = _mm_blend_epi16(m0,m3,0x33); \
    buf = _mm_blend_epi16(t1,t0,0xC0);

#define LOAD_MSG_6_2(buf) \
    t0 = _mm_blend_epi16(m3,m2,0x30); \
    t1 = _mm_srli_si128(m1, 4); \
    t2 = _mm_blend_epi16(t0,t1,0x03); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0));

/* Unlike its siblings, this one feeds the blend straight into the shuffle
   and therefore leaves t2 untouched. */
#define LOAD_MSG_6_3(buf) \
    t0 = _mm_unpacklo_epi64(m0,m2); \
    t1 = _mm_srli_si128(m1, 4); \
    buf = _mm_shuffle_epi32(_mm_blend_epi16(t0,t1,0x0C), _MM_SHUFFLE(2,3,1,0));

#define LOAD_MSG_6_4(buf) \
    t0 = _mm_unpackhi_epi32(m1,m2); \
    t1 = _mm_unpackhi_epi64(m0,t0); \
    buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));

/* ---- LOAD_MSG_7_x ---- */

#define LOAD_MSG_7_1(buf) \
    t0 = _mm_unpackhi_epi32(m0,m1); \
    t1 = _mm_blend_epi16(t0,m3,0x0F); \
    buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(2,0,3,1));

#define LOAD_MSG_7_2(buf) \
    t0 = _mm_blend_epi16(m2,m3,0x30); \
    t1 = _mm_srli_si128(m0,4); \
    t2 = _mm_blend_epi16(t0,t1,0x03); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3));

#define LOAD_MSG_7_3(buf) \
    t0 = _mm_unpackhi_epi64(m0,m3); \
    t1 = _mm_unpacklo_epi64(m1,m2); \
    t2 = _mm_blend_epi16(t0,t1,0x3C); \
    buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,2,3,1));

#define LOAD_MSG_7_4(buf) \
    t0 = _mm_unpacklo_epi32(m0,m1); \
    t1 = _mm_unpackhi_epi32(m1,m2); \
    buf = _mm_unpacklo_epi64(t0,t1);

/* ---- LOAD_MSG_8_x ---- */

/* The final step here is _mm_shufflehi_epi16 (16-bit lanes of the high half),
   not _mm_shuffle_epi32 as in most other macros. */
#define LOAD_MSG_8_1(buf) \
    t0 = _mm_unpackhi_epi32(m1,m3); \
    t1 = _mm_unpacklo_epi64(t0,m0); \
    t2 = _mm_blend_epi16(t1,m2,0xC0); \
    buf = _mm_shufflehi_epi16(t2,_MM_SHUFFLE(1,0,3,2));

#define LOAD_MSG_8_2(buf) \
    t0 = _mm_unpackhi_epi32(m0,m3); \
    t1 = _mm_blend_epi16(m2,t0,0xF0); \
    buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(0,2,1,3));

#define LOAD_MSG_8_3(buf) \
    t0 = _mm_blend_epi16(m2,m0,0x0C); \
    t1 = _mm_slli_si128(t0,4); \
    buf = _mm_blend_epi16(t1,m3,0x0F);

#define LOAD_MSG_8_4(buf) \
    t0 = _mm_blend_epi16(m1,m0,0x30); \
    buf = _mm_shuffle_epi32(t0,_MM_SHUFFLE(1,0,3,2));

/* ---- LOAD_MSG_9_x ---- */

#define LOAD_MSG_9_1(buf) \
    t0 = _mm_blend_epi16(m0,m2,0x03); \
    t1 = _mm_blend_epi16(m1,m2,0x30); \
    t2 = _mm_blend_epi16(t1,t0,0x0F); \
    buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(1,3,0,2));

#define LOAD_MSG_9_2(buf) \
    t0 = _mm_slli_si128(m0,4); \
    t1 = _mm_blend_epi16(m1,t0,0xC0); \
    buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(1,2,0,3));

#define LOAD_MSG_9_3(buf) \
    t0 = _mm_unpackhi_epi32(m0,m3); \
    t1 = _mm_unpacklo_epi32(m2,m3); \
    t2 = _mm_unpackhi_epi64(t0,t1); \
    buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(3,0,2,1));

#define LOAD_MSG_9_4(buf) \
    t0 = _mm_blend_epi16(m3,m2,0xC0); \
    t1 = _mm_unpacklo_epi32(m0,m3); \
    t2 = _mm_blend_epi16(t0,t1,0x0F); \
    buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,1,2,3));

#endif