/*
   BLAKE2 reference source code package - optimized C implementations

   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>

   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.

   You should have received a copy of the CC0 Public Domain Dedication along with
   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2B_LOAD_SSE41_H__
#define __BLAKE2B_LOAD_SSE41_H__

/*
 * Message-load macros, named LOAD_MSG_<r>_<i>(b0, b1) for r = 0..11 and
 * i = 1..4.
 *
 * Each macro assigns one value to b0 and one to b1.  Both values are built
 * from the identifiers m0..m7, which are NOT macro parameters: they must be
 * visible in the scope where the macro is expanded.
 *
 * This header includes nothing itself.  The intrinsics used below
 * (_mm_unpacklo_epi64, _mm_unpackhi_epi64, _mm_shuffle_epi32,
 * _mm_alignr_epi8, _mm_blend_epi16) and _MM_SHUFFLE must already be declared
 * by the including file before any of these macros is expanded.
 *
 * NOTE(review): presumably <r> is the BLAKE2b round number and <i> the load
 * step within that round, with m0..m7 holding the message block as eight
 * two-lane 64-bit vectors -- confirm against the round macro that expands
 * these.
 *
 * b0 and b1 must be assignable expressions.  They are parenthesized in the
 * expansions so that any lvalue expression can be passed, and every body is
 * wrapped in do { } while (0) so that an invocation followed by ';' behaves
 * as a single statement.
 */

/* ---- r = 0 ---- */

#define LOAD_MSG_0_1(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m0, m1); \
    (b1) = _mm_unpacklo_epi64(m2, m3); \
  } while (0)

#define LOAD_MSG_0_2(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m0, m1); \
    (b1) = _mm_unpackhi_epi64(m2, m3); \
  } while (0)

#define LOAD_MSG_0_3(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m4, m5); \
    (b1) = _mm_unpacklo_epi64(m6, m7); \
  } while (0)

#define LOAD_MSG_0_4(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m4, m5); \
    (b1) = _mm_unpackhi_epi64(m6, m7); \
  } while (0)

/* ---- r = 1 ---- */

#define LOAD_MSG_1_1(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m7, m2); \
    (b1) = _mm_unpackhi_epi64(m4, m6); \
  } while (0)

#define LOAD_MSG_1_2(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m5, m4); \
    (b1) = _mm_alignr_epi8(m3, m7, 8); \
  } while (0)

#define LOAD_MSG_1_3(b0, b1) \
  do { \
    (b0) = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
    (b1) = _mm_unpackhi_epi64(m5, m2); \
  } while (0)

#define LOAD_MSG_1_4(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m6, m1); \
    (b1) = _mm_unpackhi_epi64(m3, m1); \
  } while (0)

/* ---- r = 2 ---- */

#define LOAD_MSG_2_1(b0, b1) \
  do { \
    (b0) = _mm_alignr_epi8(m6, m5, 8); \
    (b1) = _mm_unpackhi_epi64(m2, m7); \
  } while (0)

#define LOAD_MSG_2_2(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m4, m0); \
    (b1) = _mm_blend_epi16(m1, m6, 0xF0); \
  } while (0)

#define LOAD_MSG_2_3(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m5, m1, 0xF0); \
    (b1) = _mm_unpackhi_epi64(m3, m4); \
  } while (0)

#define LOAD_MSG_2_4(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m7, m3); \
    (b1) = _mm_alignr_epi8(m2, m0, 8); \
  } while (0)

/* ---- r = 3 ---- */

#define LOAD_MSG_3_1(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m3, m1); \
    (b1) = _mm_unpackhi_epi64(m6, m5); \
  } while (0)

#define LOAD_MSG_3_2(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m4, m0); \
    (b1) = _mm_unpacklo_epi64(m6, m7); \
  } while (0)

#define LOAD_MSG_3_3(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m1, m2, 0xF0); \
    (b1) = _mm_blend_epi16(m2, m7, 0xF0); \
  } while (0)

#define LOAD_MSG_3_4(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m3, m5); \
    (b1) = _mm_unpacklo_epi64(m0, m4); \
  } while (0)

/* ---- r = 4 ---- */

#define LOAD_MSG_4_1(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m4, m2); \
    (b1) = _mm_unpacklo_epi64(m1, m5); \
  } while (0)

#define LOAD_MSG_4_2(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m0, m3, 0xF0); \
    (b1) = _mm_blend_epi16(m2, m7, 0xF0); \
  } while (0)

#define LOAD_MSG_4_3(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m7, m5, 0xF0); \
    (b1) = _mm_blend_epi16(m3, m1, 0xF0); \
  } while (0)

#define LOAD_MSG_4_4(b0, b1) \
  do { \
    (b0) = _mm_alignr_epi8(m6, m0, 8); \
    (b1) = _mm_blend_epi16(m4, m6, 0xF0); \
  } while (0)

/* ---- r = 5 ---- */

#define LOAD_MSG_5_1(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m1, m3); \
    (b1) = _mm_unpacklo_epi64(m0, m4); \
  } while (0)

#define LOAD_MSG_5_2(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m6, m5); \
    (b1) = _mm_unpackhi_epi64(m5, m1); \
  } while (0)

#define LOAD_MSG_5_3(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m2, m3, 0xF0); \
    (b1) = _mm_unpackhi_epi64(m7, m0); \
  } while (0)

#define LOAD_MSG_5_4(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m6, m2); \
    (b1) = _mm_blend_epi16(m7, m4, 0xF0); \
  } while (0)

/* ---- r = 6 ---- */

#define LOAD_MSG_6_1(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m6, m0, 0xF0); \
    (b1) = _mm_unpacklo_epi64(m7, m2); \
  } while (0)

#define LOAD_MSG_6_2(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m2, m7); \
    (b1) = _mm_alignr_epi8(m5, m6, 8); \
  } while (0)

#define LOAD_MSG_6_3(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m0, m3); \
    (b1) = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
  } while (0)

#define LOAD_MSG_6_4(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m3, m1); \
    (b1) = _mm_blend_epi16(m1, m5, 0xF0); \
  } while (0)

/* ---- r = 7 ---- */

#define LOAD_MSG_7_1(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m6, m3); \
    (b1) = _mm_blend_epi16(m6, m1, 0xF0); \
  } while (0)

#define LOAD_MSG_7_2(b0, b1) \
  do { \
    (b0) = _mm_alignr_epi8(m7, m5, 8); \
    (b1) = _mm_unpackhi_epi64(m0, m4); \
  } while (0)

#define LOAD_MSG_7_3(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m2, m7); \
    (b1) = _mm_unpacklo_epi64(m4, m1); \
  } while (0)

#define LOAD_MSG_7_4(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m0, m2); \
    (b1) = _mm_unpacklo_epi64(m3, m5); \
  } while (0)

/* ---- r = 8 ---- */

#define LOAD_MSG_8_1(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m3, m7); \
    (b1) = _mm_alignr_epi8(m0, m5, 8); \
  } while (0)

#define LOAD_MSG_8_2(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m7, m4); \
    (b1) = _mm_alignr_epi8(m4, m1, 8); \
  } while (0)

/* b0 is taken from m6 unchanged; no shuffle is applied. */
#define LOAD_MSG_8_3(b0, b1) \
  do { \
    (b0) = m6; \
    (b1) = _mm_alignr_epi8(m5, m0, 8); \
  } while (0)

/* b1 is taken from m2 unchanged; no shuffle is applied. */
#define LOAD_MSG_8_4(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m1, m3, 0xF0); \
    (b1) = m2; \
  } while (0)

/* ---- r = 9 ---- */

#define LOAD_MSG_9_1(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m5, m4); \
    (b1) = _mm_unpackhi_epi64(m3, m0); \
  } while (0)

#define LOAD_MSG_9_2(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m1, m2); \
    (b1) = _mm_blend_epi16(m3, m2, 0xF0); \
  } while (0)

#define LOAD_MSG_9_3(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m7, m4); \
    (b1) = _mm_unpackhi_epi64(m1, m6); \
  } while (0)

#define LOAD_MSG_9_4(b0, b1) \
  do { \
    (b0) = _mm_alignr_epi8(m7, m5, 8); \
    (b1) = _mm_unpacklo_epi64(m6, m0); \
  } while (0)

/*
 * ---- r = 10 and r = 11 ----
 *
 * These load exactly the same values as r = 0 and r = 1 respectively, so
 * they are defined as aliases rather than as duplicated bodies; the two
 * sets can therefore never drift apart.
 */

#define LOAD_MSG_10_1(b0, b1) LOAD_MSG_0_1(b0, b1)
#define LOAD_MSG_10_2(b0, b1) LOAD_MSG_0_2(b0, b1)
#define LOAD_MSG_10_3(b0, b1) LOAD_MSG_0_3(b0, b1)
#define LOAD_MSG_10_4(b0, b1) LOAD_MSG_0_4(b0, b1)

#define LOAD_MSG_11_1(b0, b1) LOAD_MSG_1_1(b0, b1)
#define LOAD_MSG_11_2(b0, b1) LOAD_MSG_1_2(b0, b1)
#define LOAD_MSG_11_3(b0, b1) LOAD_MSG_1_3(b0, b1)
#define LOAD_MSG_11_4(b0, b1) LOAD_MSG_1_4(b0, b1)

#endif