/*
   BLAKE2 reference source code package - optimized C implementations

   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>

   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.

   You should have received a copy of the CC0 Public Domain Dedication along with
   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2S_LOAD_XOP_H__
#define __BLAKE2S_LOAD_XOP_H__

/*
   TOB(x): turn a 32-bit word index x into a 32-bit group of four byte indices.

   (x)*4*0x01010101 replicates the byte offset 4*x into all four bytes, and
   adding 0x03020100 makes them 4*x+0, 4*x+1, 4*x+2, 4*x+3 (lowest byte first).
   Example: TOB(1) == 0x07060504.

   The results are used below as per-lane byte selectors for _mm_perm_epi8.
   As the reference emulation below shows, selector bytes 0..15 pick from the
   first source and 16..31 from the second, so TOB(0)..TOB(3) address the four
   words of the first operand and TOB(4)..TOB(7) those of the second.
*/
#define TOB(x) ((x)*4*0x01010101 + 0x03020100) // ..or not TOB

/* Basic VPPERM emulation, for testing purposes */
/*static __m128i _mm_perm_epi8(const __m128i src1, const __m128i src2, const __m128i sel)
{
   const __m128i sixteen = _mm_set1_epi8(16);
   const __m128i t0 = _mm_shuffle_epi8(src1, sel);
   const __m128i s1 = _mm_shuffle_epi8(src2, _mm_sub_epi8(sel, sixteen));
   const __m128i mask = _mm_or_si128(_mm_cmpeq_epi8(sel, sixteen),
                                     _mm_cmpgt_epi8(sel, sixteen)); // (>=16) = 0xff : 00
   return _mm_blendv_epi8(t0, s1, mask);
}*/

/*
   LOAD_MSG_<r>_<i>(buf): assemble four 32-bit words into `buf` with one to
   three _mm_perm_epi8 permutes.

   Requirements at the expansion site:
     - m0, m1, m2, m3 must be in scope (read only);
     - t0 and t1 must be in scope and writable; multi-step macros use them as
       scratch and overwrite them;
     - `buf` must be an assignable lvalue.
   NOTE(review): all of these are presumably __m128i, with m0..m3 holding the
   16 message words of the current block and <r>/<i> naming the round and the
   load within that round -- confirm against the including file.

   Every expansion already ends with its own ';'.
   In _mm_set_epi32 the arguments run from the most significant lane down, so
   the last TOB() selects the word that lands in lane 0 of the result.
*/

/* ---- LOAD_MSG_0_* ---- */
#define LOAD_MSG_0_1(buf) \
  buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(6),TOB(4),TOB(2),TOB(0)) );

#define LOAD_MSG_0_2(buf) \
  buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(7),TOB(5),TOB(3),TOB(1)) );

#define LOAD_MSG_0_3(buf) \
  buf = _mm_perm_epi8(m2, m3, _mm_set_epi32(TOB(6),TOB(4),TOB(2),TOB(0)) );

#define LOAD_MSG_0_4(buf) \
  buf = _mm_perm_epi8(m2, m3, _mm_set_epi32(TOB(7),TOB(5),TOB(3),TOB(1)) );

/* ---- LOAD_MSG_1_* ---- */
#define LOAD_MSG_1_1(buf) \
  t0 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(0),TOB(5),TOB(0),TOB(0)) ); \
  buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(6)) );

#define LOAD_MSG_1_2(buf) \
  t1 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(2),TOB(0),TOB(4),TOB(6)) ); \
  buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) );

#define LOAD_MSG_1_3(buf) \
  t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5),TOB(0),TOB(0),TOB(1)) ); \
  buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) );

#define LOAD_MSG_1_4(buf) \
  t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(3),TOB(7),TOB(2),TOB(0)) ); \
  buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(4)) );

/* ---- LOAD_MSG_2_* ---- */
#define LOAD_MSG_2_1(buf) \
  t0 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(0),TOB(1),TOB(0),TOB(7)) ); \
  buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(7),TOB(2),TOB(4),TOB(0)) );

#define LOAD_MSG_2_2(buf) \
  t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(2),TOB(0),TOB(4)) ); \
  buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(0)) );

#define LOAD_MSG_2_3(buf) \
  t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(7),TOB(3),TOB(0)) ); \
  buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(6)) );

#define LOAD_MSG_2_4(buf) \
  t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(4),TOB(1),TOB(6),TOB(0)) ); \
  buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(6)) );

/* ---- LOAD_MSG_3_* ---- */
#define LOAD_MSG_3_1(buf) \
  t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(3),TOB(7)) ); \
  t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) ); \
  buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(5),TOB(1),TOB(0)) );

#define LOAD_MSG_3_2(buf) \
  t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(0),TOB(1),TOB(5)) ); \
  buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(6),TOB(4),TOB(1),TOB(0)) );

#define LOAD_MSG_3_3(buf) \
  t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(4),TOB(5),TOB(2)) ); \
  buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) );

#define LOAD_MSG_3_4(buf) \
  t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(6)) ); \
  buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(4),TOB(2),TOB(6),TOB(0)) );

/* ---- LOAD_MSG_4_* ---- */
#define LOAD_MSG_4_1(buf) \
  t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(2),TOB(5),TOB(0)) ); \
  buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(6),TOB(2),TOB(1),TOB(5)) );

#define LOAD_MSG_4_2(buf) \
  t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(4),TOB(7),TOB(0)) ); \
  buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) );

#define LOAD_MSG_4_3(buf) \
  t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(3),TOB(6),TOB(0),TOB(0)) ); \
  t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(2),TOB(7),TOB(0)) ); \
  buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(6)) );

#define LOAD_MSG_4_4(buf) \
  t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(4),TOB(0),TOB(1)) ); \
  buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(4),TOB(0)) );

/* ---- LOAD_MSG_5_* ---- */
#define LOAD_MSG_5_1(buf) \
  t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(6),TOB(2)) ); \
  buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(4),TOB(2),TOB(1),TOB(0)) );

#define LOAD_MSG_5_2(buf) \
  t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(3),TOB(7),TOB(6),TOB(0)) ); \
  buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(4)) );

#define LOAD_MSG_5_3(buf) \
  t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(1),TOB(0),TOB(7),TOB(4)) ); \
  buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) );

#define LOAD_MSG_5_4(buf) \
  t1 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(5),TOB(0),TOB(1),TOB(0)) ); \
  buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(6),TOB(1),TOB(5)) );

/* ---- LOAD_MSG_6_* ---- */
#define LOAD_MSG_6_1(buf) \
  t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(4),TOB(0),TOB(1),TOB(0)) ); \
  buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(6),TOB(1),TOB(4)) );

#define LOAD_MSG_6_2(buf) \
  t1 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(6),TOB(0),TOB(0),TOB(1)) ); \
  buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(5),TOB(7),TOB(0)) );

#define LOAD_MSG_6_3(buf) \
  t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(6),TOB(0)) ); \
  buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(4),TOB(5),TOB(1),TOB(0)) );

#define LOAD_MSG_6_4(buf) \
  t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(2),TOB(3),TOB(7)) ); \
  buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) );

/* ---- LOAD_MSG_7_* ---- */
#define LOAD_MSG_7_1(buf) \
  t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(3),TOB(0),TOB(7),TOB(0)) ); \
  buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(4),TOB(1),TOB(5)) );

#define LOAD_MSG_7_2(buf) \
  t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(5),TOB(1),TOB(0),TOB(7)) ); \
  buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(6),TOB(0)) );

#define LOAD_MSG_7_3(buf) \
  t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(2),TOB(0),TOB(0),TOB(5)) ); \
  t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(4),TOB(1),TOB(0)) ); \
  buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(7),TOB(0)) );

#define LOAD_MSG_7_4(buf) \
  t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(6),TOB(4),TOB(0)) ); \
  buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(6),TOB(2),TOB(1),TOB(0)) );

/* ---- LOAD_MSG_8_* ---- */
#define LOAD_MSG_8_1(buf) \
  t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(6)) ); \
  t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) ); \
  buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(6),TOB(0)) );

#define LOAD_MSG_8_2(buf) \
  t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(4),TOB(3),TOB(5),TOB(0)) ); \
  buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(7)) );

/* The final line deliberately has no line continuation: the definition must
   end here so it can never swallow the directive that follows it. */
#define LOAD_MSG_8_3(buf) \
  t0 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(6),TOB(1),TOB(0),TOB(0)) ); \
  buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(5),TOB(4)) );

#define LOAD_MSG_8_4(buf) \
  buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5),TOB(4),TOB(7),TOB(2)) );

/* ---- LOAD_MSG_9_* ---- */
#define LOAD_MSG_9_1(buf) \
  t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(1),TOB(7),TOB(0),TOB(0)) ); \
  buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(2),TOB(4),TOB(6)) );

#define LOAD_MSG_9_2(buf) \
  buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5),TOB(6),TOB(4),TOB(2)) );

#define LOAD_MSG_9_3(buf) \
  t0 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(3),TOB(5),TOB(0)) ); \
  buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(7)) );

#define LOAD_MSG_9_4(buf) \
  t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(7)) ); \
  buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(4),TOB(6),TOB(0)) );

#endif
