/*
   BLAKE2 reference source code package - optimized C implementations

   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>

   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.

   You should have received a copy of the CC0 Public Domain Dedication along with
   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
/*
 * NOTE(review): the guard name starts with a double underscore, a spelling C
 * reserves for the implementation. It is kept unchanged so that anything
 * testing this macro keeps working.
 */
#ifndef __BLAKE2S_LOAD_SSE41_H__
#define __BLAKE2S_LOAD_SSE41_H__

/*
 * Message-load macros: LOAD_MSG_<r>_<i>(buf), r = 0..9, i = 1..4.
 *
 * Each macro assigns one vector to `buf`, assembled from the variables
 * m0, m1, m2 and m3 with SSE shuffle / blend / unpack / byte-shift
 * intrinsics. None of those variables is declared here: m0..m3 (inputs) and
 * t0..t2 (scratch, overwritten by most macros) must already exist in the
 * scope where the macro is expanded, as must the TOI()/TOF() helpers and the
 * intrinsic declarations.
 *
 * Assumptions to confirm against the including file:
 *   - m0..m3, t0..t2 and buf are __m128i (the operand type of the integer
 *     intrinsics used below);
 *   - TOF()/TOI() reinterpret between __m128i and __m128 for _mm_shuffle_ps;
 *   - <r> selects a round's message permutation and <i> one quarter of it.
 *
 * Every macro expands to a single `do { ... } while (0)` statement with no
 * trailing semicolon, so an invocation followed by `;` behaves as exactly one
 * statement, including under an unbraced if/else or loop. Invocations must
 * therefore be terminated with a semicolon by the caller.
 *
 * The immediates and shuffle orders are a fixed table; do not "simplify" or
 * reorder them.
 */

#define LOAD_MSG_0_1(buf) \
  do { \
    (buf) = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(2,0,2,0))); \
  } while (0)

#define LOAD_MSG_0_2(buf) \
  do { \
    (buf) = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(3,1,3,1))); \
  } while (0)

#define LOAD_MSG_0_3(buf) \
  do { \
    (buf) = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(2,0,2,0))); \
  } while (0)

#define LOAD_MSG_0_4(buf) \
  do { \
    (buf) = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(3,1,3,1))); \
  } while (0)

#define LOAD_MSG_1_1(buf) \
  do { \
    t0 = _mm_blend_epi16(m1, m2, 0x0C); \
    t1 = _mm_slli_si128(m3, 4); \
    t2 = _mm_blend_epi16(t0, t1, 0xF0); \
    (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3)); \
  } while (0)

#define LOAD_MSG_1_2(buf) \
  do { \
    t0 = _mm_shuffle_epi32(m2, _MM_SHUFFLE(0,0,2,0)); \
    t1 = _mm_blend_epi16(m1, m3, 0xC0); \
    t2 = _mm_blend_epi16(t0, t1, 0xF0); \
    (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); \
  } while (0)

#define LOAD_MSG_1_3(buf) \
  do { \
    t0 = _mm_slli_si128(m1, 4); \
    t1 = _mm_blend_epi16(m2, t0, 0x30); \
    t2 = _mm_blend_epi16(m0, t1, 0xF0); \
    (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); \
  } while (0)

#define LOAD_MSG_1_4(buf) \
  do { \
    t0 = _mm_unpackhi_epi32(m0, m1); \
    t1 = _mm_slli_si128(m3, 4); \
    t2 = _mm_blend_epi16(t0, t1, 0x0C); \
    (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); \
  } while (0)

#define LOAD_MSG_2_1(buf) \
  do { \
    t0 = _mm_unpackhi_epi32(m2, m3); \
    t1 = _mm_blend_epi16(m3, m1, 0x0C); \
    t2 = _mm_blend_epi16(t0, t1, 0x0F); \
    (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2)); \
  } while (0)

#define LOAD_MSG_2_2(buf) \
  do { \
    t0 = _mm_unpacklo_epi32(m2, m0); \
    t1 = _mm_blend_epi16(t0, m0, 0xF0); \
    t2 = _mm_slli_si128(m3, 8); \
    (buf) = _mm_blend_epi16(t1, t2, 0xC0); \
  } while (0)

#define LOAD_MSG_2_3(buf) \
  do { \
    t0 = _mm_blend_epi16(m0, m2, 0x3C); \
    t1 = _mm_srli_si128(m1, 12); \
    t2 = _mm_blend_epi16(t0, t1, 0x03); \
    (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2)); \
  } while (0)

#define LOAD_MSG_2_4(buf) \
  do { \
    t0 = _mm_slli_si128(m3, 4); \
    t1 = _mm_blend_epi16(m0, m1, 0x33); \
    t2 = _mm_blend_epi16(t1, t0, 0xC0); \
    (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3)); \
  } while (0)

#define LOAD_MSG_3_1(buf) \
  do { \
    t0 = _mm_unpackhi_epi32(m0, m1); \
    t1 = _mm_unpackhi_epi32(t0, m2); \
    t2 = _mm_blend_epi16(t1, m3, 0x0C); \
    (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2)); \
  } while (0)

#define LOAD_MSG_3_2(buf) \
  do { \
    t0 = _mm_slli_si128(m2, 8); \
    t1 = _mm_blend_epi16(m3, m0, 0x0C); \
    t2 = _mm_blend_epi16(t1, t0, 0xC0); \
    (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3)); \
  } while (0)

#define LOAD_MSG_3_3(buf) \
  do { \
    t0 = _mm_blend_epi16(m0, m1, 0x0F); \
    t1 = _mm_blend_epi16(t0, m3, 0xC0); \
    (buf) = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2)); \
  } while (0)

#define LOAD_MSG_3_4(buf) \
  do { \
    t0 = _mm_unpacklo_epi32(m0, m2); \
    t1 = _mm_unpackhi_epi32(m1, m2); \
    (buf) = _mm_unpacklo_epi64(t1, t0); \
  } while (0)

#define LOAD_MSG_4_1(buf) \
  do { \
    t0 = _mm_unpacklo_epi64(m1, m2); \
    t1 = _mm_unpackhi_epi64(m0, m2); \
    t2 = _mm_blend_epi16(t0, t1, 0x33); \
    (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3)); \
  } while (0)

#define LOAD_MSG_4_2(buf) \
  do { \
    t0 = _mm_unpackhi_epi64(m1, m3); \
    t1 = _mm_unpacklo_epi64(m0, m1); \
    (buf) = _mm_blend_epi16(t0, t1, 0x33); \
  } while (0)

#define LOAD_MSG_4_3(buf) \
  do { \
    t0 = _mm_unpackhi_epi64(m3, m1); \
    t1 = _mm_unpackhi_epi64(m2, m0); \
    (buf) = _mm_blend_epi16(t1, t0, 0x33); \
  } while (0)

#define LOAD_MSG_4_4(buf) \
  do { \
    t0 = _mm_blend_epi16(m0, m2, 0x03); \
    t1 = _mm_slli_si128(t0, 8); \
    t2 = _mm_blend_epi16(t1, m3, 0x0F); \
    (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3)); \
  } while (0)

#define LOAD_MSG_5_1(buf) \
  do { \
    t0 = _mm_unpackhi_epi32(m0, m1); \
    t1 = _mm_unpacklo_epi32(m0, m2); \
    (buf) = _mm_unpacklo_epi64(t0, t1); \
  } while (0)

#define LOAD_MSG_5_2(buf) \
  do { \
    t0 = _mm_srli_si128(m2, 4); \
    t1 = _mm_blend_epi16(m0, m3, 0x03); \
    (buf) = _mm_blend_epi16(t1, t0, 0x3C); \
  } while (0)

#define LOAD_MSG_5_3(buf) \
  do { \
    t0 = _mm_blend_epi16(m1, m0, 0x0C); \
    t1 = _mm_srli_si128(m3, 4); \
    t2 = _mm_blend_epi16(t0, t1, 0x30); \
    (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0)); \
  } while (0)

#define LOAD_MSG_5_4(buf) \
  do { \
    t0 = _mm_unpacklo_epi64(m1, m2); \
    t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1)); \
    (buf) = _mm_blend_epi16(t0, t1, 0x33); \
  } while (0)

#define LOAD_MSG_6_1(buf) \
  do { \
    t0 = _mm_slli_si128(m1, 12); \
    t1 = _mm_blend_epi16(m0, m3, 0x33); \
    (buf) = _mm_blend_epi16(t1, t0, 0xC0); \
  } while (0)

#define LOAD_MSG_6_2(buf) \
  do { \
    t0 = _mm_blend_epi16(m3, m2, 0x30); \
    t1 = _mm_srli_si128(m1, 4); \
    t2 = _mm_blend_epi16(t0, t1, 0x03); \
    (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0)); \
  } while (0)

#define LOAD_MSG_6_3(buf) \
  do { \
    t0 = _mm_unpacklo_epi64(m0, m2); \
    t1 = _mm_srli_si128(m1, 4); \
    (buf) = _mm_shuffle_epi32(_mm_blend_epi16(t0, t1, 0x0C), _MM_SHUFFLE(2,3,1,0)); \
  } while (0)

#define LOAD_MSG_6_4(buf) \
  do { \
    t0 = _mm_unpackhi_epi32(m1, m2); \
    t1 = _mm_unpackhi_epi64(m0, t0); \
    (buf) = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2)); \
  } while (0)

#define LOAD_MSG_7_1(buf) \
  do { \
    t0 = _mm_unpackhi_epi32(m0, m1); \
    t1 = _mm_blend_epi16(t0, m3, 0x0F); \
    (buf) = _mm_shuffle_epi32(t1, _MM_SHUFFLE(2,0,3,1)); \
  } while (0)

#define LOAD_MSG_7_2(buf) \
  do { \
    t0 = _mm_blend_epi16(m2, m3, 0x30); \
    t1 = _mm_srli_si128(m0, 4); \
    t2 = _mm_blend_epi16(t0, t1, 0x03); \
    (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3)); \
  } while (0)

#define LOAD_MSG_7_3(buf) \
  do { \
    t0 = _mm_unpackhi_epi64(m0, m3); \
    t1 = _mm_unpacklo_epi64(m1, m2); \
    t2 = _mm_blend_epi16(t0, t1, 0x3C); \
    (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,2,3,1)); \
  } while (0)

#define LOAD_MSG_7_4(buf) \
  do { \
    t0 = _mm_unpacklo_epi32(m0, m1); \
    t1 = _mm_unpackhi_epi32(m1, m2); \
    (buf) = _mm_unpacklo_epi64(t0, t1); \
  } while (0)

/* Unlike its siblings, this one finishes with _mm_shufflehi_epi16 (16-bit
   shuffle of the upper half) rather than _mm_shuffle_epi32. */
#define LOAD_MSG_8_1(buf) \
  do { \
    t0 = _mm_unpackhi_epi32(m1, m3); \
    t1 = _mm_unpacklo_epi64(t0, m0); \
    t2 = _mm_blend_epi16(t1, m2, 0xC0); \
    (buf) = _mm_shufflehi_epi16(t2, _MM_SHUFFLE(1,0,3,2)); \
  } while (0)

#define LOAD_MSG_8_2(buf) \
  do { \
    t0 = _mm_unpackhi_epi32(m0, m3); \
    t1 = _mm_blend_epi16(m2, t0, 0xF0); \
    (buf) = _mm_shuffle_epi32(t1, _MM_SHUFFLE(0,2,1,3)); \
  } while (0)

#define LOAD_MSG_8_3(buf) \
  do { \
    t0 = _mm_blend_epi16(m2, m0, 0x0C); \
    t1 = _mm_slli_si128(t0, 4); \
    (buf) = _mm_blend_epi16(t1, m3, 0x0F); \
  } while (0)

#define LOAD_MSG_8_4(buf) \
  do { \
    t0 = _mm_blend_epi16(m1, m0, 0x30); \
    (buf) = _mm_shuffle_epi32(t0, _MM_SHUFFLE(1,0,3,2)); \
  } while (0)

#define LOAD_MSG_9_1(buf) \
  do { \
    t0 = _mm_blend_epi16(m0, m2, 0x03); \
    t1 = _mm_blend_epi16(m1, m2, 0x30); \
    t2 = _mm_blend_epi16(t1, t0, 0x0F); \
    (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,3,0,2)); \
  } while (0)

#define LOAD_MSG_9_2(buf) \
  do { \
    t0 = _mm_slli_si128(m0, 4); \
    t1 = _mm_blend_epi16(m1, t0, 0xC0); \
    (buf) = _mm_shuffle_epi32(t1, _MM_SHUFFLE(1,2,0,3)); \
  } while (0)

#define LOAD_MSG_9_3(buf) \
  do { \
    t0 = _mm_unpackhi_epi32(m0, m3); \
    t1 = _mm_unpacklo_epi32(m2, m3); \
    t2 = _mm_unpackhi_epi64(t0, t1); \
    (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,0,2,1)); \
  } while (0)

#define LOAD_MSG_9_4(buf) \
  do { \
    t0 = _mm_blend_epi16(m3, m2, 0xC0); \
    t1 = _mm_unpacklo_epi32(m0, m3); \
    t2 = _mm_blend_epi16(t0, t1, 0x0F); \
    (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3)); \
  } while (0)

#endif