/*
   BLAKE2 reference source code package - optimized C implementations

   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>

   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.

   You should have received a copy of the CC0 Public Domain Dedication along with
   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
/* Guard name deliberately avoids leading/double underscores, which C reserves
   for the implementation. */
#ifndef BLAKE2B_LOAD_SSE41_H
#define BLAKE2B_LOAD_SSE41_H

/*
 * Message-loading macros for the SSE4.1 BLAKE2b code path.
 *
 * LOAD_MSG_<r>_<i>(b0, b1), with r in 0..11 and i in 1..4, assigns to b0 and
 * b1 two 128-bit values assembled from the vectors m0..m7.
 *
 * Requirements on the expansion site:
 *   - m0..m7 are NOT macro parameters; variables with exactly these names
 *     must be in scope wherever a macro is expanded.
 *   - This header includes nothing itself; the including file must make the
 *     intrinsics used below available first (e.g. via <smmintrin.h>).
 *
 * NOTE(review): presumably m<k> holds 64-bit message words 2k (low half) and
 * 2k+1 (high half), and <r> selects the per-round message permutation --
 * confirm against the compression function that includes this header.
 *
 * How each intrinsic is used here, viewing a vector as {low, high} 64-bit
 * halves:
 *   _mm_unpacklo_epi64(a, b)                      -> { a.low,  b.low  }
 *   _mm_unpackhi_epi64(a, b)                      -> { a.high, b.high }
 *   _mm_alignr_epi8(a, b, 8)                      -> { b.high, a.low  }
 *   _mm_blend_epi16(a, b, 0xF0)                   -> { a.low,  b.high }
 *   _mm_shuffle_epi32(a, _MM_SHUFFLE(1,0,3,2))    -> { a.high, a.low  }
 *
 * Every macro is wrapped in do { ... } while (0) so that it acts as a single
 * statement (safe in an unbraced if/else) and requires a trailing semicolon.
 */

/* ---- r = 0 ---- */

#define LOAD_MSG_0_1(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m0, m1); \
    (b1) = _mm_unpacklo_epi64(m2, m3); \
  } while (0)

#define LOAD_MSG_0_2(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m0, m1); \
    (b1) = _mm_unpackhi_epi64(m2, m3); \
  } while (0)

#define LOAD_MSG_0_3(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m4, m5); \
    (b1) = _mm_unpacklo_epi64(m6, m7); \
  } while (0)

#define LOAD_MSG_0_4(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m4, m5); \
    (b1) = _mm_unpackhi_epi64(m6, m7); \
  } while (0)

/* ---- r = 1 ---- */

#define LOAD_MSG_1_1(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m7, m2); \
    (b1) = _mm_unpackhi_epi64(m4, m6); \
  } while (0)

#define LOAD_MSG_1_2(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m5, m4); \
    (b1) = _mm_alignr_epi8(m3, m7, 8); \
  } while (0)

#define LOAD_MSG_1_3(b0, b1) \
  do { \
    (b0) = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
    (b1) = _mm_unpackhi_epi64(m5, m2); \
  } while (0)

#define LOAD_MSG_1_4(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m6, m1); \
    (b1) = _mm_unpackhi_epi64(m3, m1); \
  } while (0)

/* ---- r = 2 ---- */

#define LOAD_MSG_2_1(b0, b1) \
  do { \
    (b0) = _mm_alignr_epi8(m6, m5, 8); \
    (b1) = _mm_unpackhi_epi64(m2, m7); \
  } while (0)

#define LOAD_MSG_2_2(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m4, m0); \
    (b1) = _mm_blend_epi16(m1, m6, 0xF0); \
  } while (0)

#define LOAD_MSG_2_3(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m5, m1, 0xF0); \
    (b1) = _mm_unpackhi_epi64(m3, m4); \
  } while (0)

#define LOAD_MSG_2_4(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m7, m3); \
    (b1) = _mm_alignr_epi8(m2, m0, 8); \
  } while (0)

/* ---- r = 3 ---- */

#define LOAD_MSG_3_1(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m3, m1); \
    (b1) = _mm_unpackhi_epi64(m6, m5); \
  } while (0)

#define LOAD_MSG_3_2(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m4, m0); \
    (b1) = _mm_unpacklo_epi64(m6, m7); \
  } while (0)

#define LOAD_MSG_3_3(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m1, m2, 0xF0); \
    (b1) = _mm_blend_epi16(m2, m7, 0xF0); \
  } while (0)

#define LOAD_MSG_3_4(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m3, m5); \
    (b1) = _mm_unpacklo_epi64(m0, m4); \
  } while (0)

/* ---- r = 4 ---- */

#define LOAD_MSG_4_1(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m4, m2); \
    (b1) = _mm_unpacklo_epi64(m1, m5); \
  } while (0)

#define LOAD_MSG_4_2(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m0, m3, 0xF0); \
    (b1) = _mm_blend_epi16(m2, m7, 0xF0); \
  } while (0)

#define LOAD_MSG_4_3(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m7, m5, 0xF0); \
    (b1) = _mm_blend_epi16(m3, m1, 0xF0); \
  } while (0)

#define LOAD_MSG_4_4(b0, b1) \
  do { \
    (b0) = _mm_alignr_epi8(m6, m0, 8); \
    (b1) = _mm_blend_epi16(m4, m6, 0xF0); \
  } while (0)

/* ---- r = 5 ---- */

#define LOAD_MSG_5_1(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m1, m3); \
    (b1) = _mm_unpacklo_epi64(m0, m4); \
  } while (0)

#define LOAD_MSG_5_2(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m6, m5); \
    (b1) = _mm_unpackhi_epi64(m5, m1); \
  } while (0)

#define LOAD_MSG_5_3(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m2, m3, 0xF0); \
    (b1) = _mm_unpackhi_epi64(m7, m0); \
  } while (0)

#define LOAD_MSG_5_4(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m6, m2); \
    (b1) = _mm_blend_epi16(m7, m4, 0xF0); \
  } while (0)

/* ---- r = 6 ---- */

#define LOAD_MSG_6_1(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m6, m0, 0xF0); \
    (b1) = _mm_unpacklo_epi64(m7, m2); \
  } while (0)

#define LOAD_MSG_6_2(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m2, m7); \
    (b1) = _mm_alignr_epi8(m5, m6, 8); \
  } while (0)

#define LOAD_MSG_6_3(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m0, m3); \
    (b1) = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
  } while (0)

#define LOAD_MSG_6_4(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m3, m1); \
    (b1) = _mm_blend_epi16(m1, m5, 0xF0); \
  } while (0)

/* ---- r = 7 ---- */

#define LOAD_MSG_7_1(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m6, m3); \
    (b1) = _mm_blend_epi16(m6, m1, 0xF0); \
  } while (0)

#define LOAD_MSG_7_2(b0, b1) \
  do { \
    (b0) = _mm_alignr_epi8(m7, m5, 8); \
    (b1) = _mm_unpackhi_epi64(m0, m4); \
  } while (0)

#define LOAD_MSG_7_3(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m2, m7); \
    (b1) = _mm_unpacklo_epi64(m4, m1); \
  } while (0)

#define LOAD_MSG_7_4(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m0, m2); \
    (b1) = _mm_unpacklo_epi64(m3, m5); \
  } while (0)

/* ---- r = 8 ---- */

#define LOAD_MSG_8_1(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m3, m7); \
    (b1) = _mm_alignr_epi8(m0, m5, 8); \
  } while (0)

#define LOAD_MSG_8_2(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m7, m4); \
    (b1) = _mm_alignr_epi8(m4, m1, 8); \
  } while (0)

/* b0 is a plain copy of m6: no shuffle is applied. */
#define LOAD_MSG_8_3(b0, b1) \
  do { \
    (b0) = m6; \
    (b1) = _mm_alignr_epi8(m5, m0, 8); \
  } while (0)

/* b1 is a plain copy of m2: no shuffle is applied. */
#define LOAD_MSG_8_4(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m1, m3, 0xF0); \
    (b1) = m2; \
  } while (0)

/* ---- r = 9 ---- */

#define LOAD_MSG_9_1(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m5, m4); \
    (b1) = _mm_unpackhi_epi64(m3, m0); \
  } while (0)

#define LOAD_MSG_9_2(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m1, m2); \
    (b1) = _mm_blend_epi16(m3, m2, 0xF0); \
  } while (0)

#define LOAD_MSG_9_3(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m7, m4); \
    (b1) = _mm_unpackhi_epi64(m1, m6); \
  } while (0)

#define LOAD_MSG_9_4(b0, b1) \
  do { \
    (b0) = _mm_alignr_epi8(m7, m5, 8); \
    (b1) = _mm_unpacklo_epi64(m6, m0); \
  } while (0)

/*
 * ---- r = 10 and r = 11 ----
 * These select exactly the same halves as r = 0 and r = 1 respectively, so
 * they are defined as aliases instead of duplicated bodies; the two sets can
 * therefore never drift apart.
 */

#define LOAD_MSG_10_1(b0, b1) LOAD_MSG_0_1(b0, b1)
#define LOAD_MSG_10_2(b0, b1) LOAD_MSG_0_2(b0, b1)
#define LOAD_MSG_10_3(b0, b1) LOAD_MSG_0_3(b0, b1)
#define LOAD_MSG_10_4(b0, b1) LOAD_MSG_0_4(b0, b1)

#define LOAD_MSG_11_1(b0, b1) LOAD_MSG_1_1(b0, b1)
#define LOAD_MSG_11_2(b0, b1) LOAD_MSG_1_2(b0, b1)
#define LOAD_MSG_11_3(b0, b1) LOAD_MSG_1_3(b0, b1)
#define LOAD_MSG_11_4(b0, b1) LOAD_MSG_1_4(b0, b1)

#endif /* BLAKE2B_LOAD_SSE41_H */