#ifndef blake2b_load_avx2_H
#define blake2b_load_avx2_H

/*
 * BLAKE2b message-loading macros for the AVX2 code path.
 *
 * BLAKE2B_LOAD_MSG_<r>_<i>(b0) assembles one 256-bit vector from the
 * message registers and assigns it to b0; <r> runs over 0..11 and <i>
 * over 1..4.
 *
 * Nothing the macros use is declared in this header.  At every expansion
 * site the following must already be in scope:
 *   - m0 .. m7 : 256-bit integer vectors, only read;
 *   - t0, t1   : assignable 256-bit scratch vectors, overwritten by every
 *                macro (their previous contents are lost);
 *   - the AVX2 intrinsics and _MM_SHUFFLE (normally from <immintrin.h>).
 *
 * Every macro has the same three steps: compute t0, compute t1, then
 * blend with mask 0xF0 so that b0 receives the low 128 bits of t0 and the
 * high 128 bits of t1.  The building blocks are:
 *   - unpacklo/unpackhi_epi64(a, b): per 128-bit lane, the low/high 64-bit
 *     element of a followed by the low/high 64-bit element of b;
 *   - blend_epi32(a, b, 0x33): per 128-bit lane, low 64 bits of b and high
 *     64 bits of a;
 *   - alignr_epi8(a, b, 8): per 128-bit lane, high 64 bits of b followed
 *     by low 64 bits of a;
 *   - shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)): per 128-bit lane, the two
 *     64-bit elements of a swapped.
 *
 * NOTE(review): the "-> words ..." annotations list the 64-bit elements of
 * b0 from lowest to highest as indices into the 16-word message block.
 * They assume mK holds message words 2K and 2K+1 in BOTH of its 128-bit
 * lanes; m0..m7 are declared by the including file, so confirm there.
 */

/* ---- round 0 ---- */

/* -> words 0, 2, 4, 6 */
#define BLAKE2B_LOAD_MSG_0_1(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m0, m1); \
        t1 = _mm256_unpacklo_epi64(m2, m3); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 1, 3, 5, 7 */
#define BLAKE2B_LOAD_MSG_0_2(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m0, m1); \
        t1 = _mm256_unpackhi_epi64(m2, m3); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 8, 10, 12, 14 */
#define BLAKE2B_LOAD_MSG_0_3(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m4, m5); \
        t1 = _mm256_unpacklo_epi64(m6, m7); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 9, 11, 13, 15 */
#define BLAKE2B_LOAD_MSG_0_4(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m4, m5); \
        t1 = _mm256_unpackhi_epi64(m6, m7); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* ---- round 1 ---- */

/* -> words 14, 4, 9, 13 */
#define BLAKE2B_LOAD_MSG_1_1(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m7, m2); \
        t1 = _mm256_unpackhi_epi64(m4, m6); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 10, 8, 15, 6 */
#define BLAKE2B_LOAD_MSG_1_2(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m5, m4); \
        t1 = _mm256_alignr_epi8(m3, m7, 8); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 1, 0, 11, 5 */
#define BLAKE2B_LOAD_MSG_1_3(b0) \
    do { \
        t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
        t1 = _mm256_unpackhi_epi64(m5, m2); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 12, 2, 7, 3 */
#define BLAKE2B_LOAD_MSG_1_4(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m6, m1); \
        t1 = _mm256_unpackhi_epi64(m3, m1); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* ---- round 2 ---- */

/* -> words 11, 12, 5, 15 */
#define BLAKE2B_LOAD_MSG_2_1(b0) \
    do { \
        t0 = _mm256_alignr_epi8(m6, m5, 8); \
        t1 = _mm256_unpackhi_epi64(m2, m7); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 8, 0, 2, 13 */
#define BLAKE2B_LOAD_MSG_2_2(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m4, m0); \
        t1 = _mm256_blend_epi32(m6, m1, 0x33); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 10, 3, 7, 9 */
#define BLAKE2B_LOAD_MSG_2_3(b0) \
    do { \
        t0 = _mm256_blend_epi32(m1, m5, 0x33); \
        t1 = _mm256_unpackhi_epi64(m3, m4); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 14, 6, 1, 4 */
#define BLAKE2B_LOAD_MSG_2_4(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m7, m3); \
        t1 = _mm256_alignr_epi8(m2, m0, 8); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* ---- round 3 ---- */

/* -> words 7, 3, 13, 11 */
#define BLAKE2B_LOAD_MSG_3_1(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m3, m1); \
        t1 = _mm256_unpackhi_epi64(m6, m5); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 9, 1, 12, 14 */
#define BLAKE2B_LOAD_MSG_3_2(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m4, m0); \
        t1 = _mm256_unpacklo_epi64(m6, m7); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 2, 5, 4, 15 */
#define BLAKE2B_LOAD_MSG_3_3(b0) \
    do { \
        t0 = _mm256_blend_epi32(m2, m1, 0x33); \
        t1 = _mm256_blend_epi32(m7, m2, 0x33); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 6, 10, 0, 8 */
#define BLAKE2B_LOAD_MSG_3_4(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m3, m5); \
        t1 = _mm256_unpacklo_epi64(m0, m4); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* ---- round 4 ---- */

/* -> words 9, 5, 2, 10 */
#define BLAKE2B_LOAD_MSG_4_1(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m4, m2); \
        t1 = _mm256_unpacklo_epi64(m1, m5); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 0, 7, 4, 15 */
#define BLAKE2B_LOAD_MSG_4_2(b0) \
    do { \
        t0 = _mm256_blend_epi32(m3, m0, 0x33); \
        t1 = _mm256_blend_epi32(m7, m2, 0x33); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 14, 11, 6, 3 */
#define BLAKE2B_LOAD_MSG_4_3(b0) \
    do { \
        t0 = _mm256_blend_epi32(m5, m7, 0x33); \
        t1 = _mm256_blend_epi32(m1, m3, 0x33); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 1, 12, 8, 13 */
#define BLAKE2B_LOAD_MSG_4_4(b0) \
    do { \
        t0 = _mm256_alignr_epi8(m6, m0, 8); \
        t1 = _mm256_blend_epi32(m6, m4, 0x33); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* ---- round 5 ---- */

/* -> words 2, 6, 0, 8 */
#define BLAKE2B_LOAD_MSG_5_1(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m1, m3); \
        t1 = _mm256_unpacklo_epi64(m0, m4); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 12, 10, 11, 3 */
#define BLAKE2B_LOAD_MSG_5_2(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m6, m5); \
        t1 = _mm256_unpackhi_epi64(m5, m1); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 4, 7, 15, 1 */
#define BLAKE2B_LOAD_MSG_5_3(b0) \
    do { \
        t0 = _mm256_blend_epi32(m3, m2, 0x33); \
        t1 = _mm256_unpackhi_epi64(m7, m0); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 13, 5, 14, 9 */
#define BLAKE2B_LOAD_MSG_5_4(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m6, m2); \
        t1 = _mm256_blend_epi32(m4, m7, 0x33); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* ---- round 6 ---- */

/* -> words 12, 1, 14, 4 */
#define BLAKE2B_LOAD_MSG_6_1(b0) \
    do { \
        t0 = _mm256_blend_epi32(m0, m6, 0x33); \
        t1 = _mm256_unpacklo_epi64(m7, m2); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 5, 15, 13, 10 */
#define BLAKE2B_LOAD_MSG_6_2(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m2, m7); \
        t1 = _mm256_alignr_epi8(m5, m6, 8); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 0, 6, 9, 8 */
#define BLAKE2B_LOAD_MSG_6_3(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m0, m3); \
        t1 = _mm256_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 7, 3, 2, 11 */
#define BLAKE2B_LOAD_MSG_6_4(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m3, m1); \
        t1 = _mm256_blend_epi32(m5, m1, 0x33); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* ---- round 7 ---- */

/* -> words 13, 7, 12, 3 */
#define BLAKE2B_LOAD_MSG_7_1(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m6, m3); \
        t1 = _mm256_blend_epi32(m1, m6, 0x33); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 11, 14, 1, 9 */
#define BLAKE2B_LOAD_MSG_7_2(b0) \
    do { \
        t0 = _mm256_alignr_epi8(m7, m5, 8); \
        t1 = _mm256_unpackhi_epi64(m0, m4); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 5, 15, 8, 2 */
#define BLAKE2B_LOAD_MSG_7_3(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m2, m7); \
        t1 = _mm256_unpacklo_epi64(m4, m1); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 0, 4, 6, 10 */
#define BLAKE2B_LOAD_MSG_7_4(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m0, m2); \
        t1 = _mm256_unpacklo_epi64(m3, m5); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* ---- round 8 ---- */

/* -> words 6, 14, 11, 0 */
#define BLAKE2B_LOAD_MSG_8_1(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m3, m7); \
        t1 = _mm256_alignr_epi8(m0, m5, 8); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 15, 9, 3, 8 */
#define BLAKE2B_LOAD_MSG_8_2(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m7, m4); \
        t1 = _mm256_alignr_epi8(m4, m1, 8); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 12, 13, 1, 10 (the low half is m6 taken as is) */
#define BLAKE2B_LOAD_MSG_8_3(b0) \
    do { \
        t0 = m6; \
        t1 = _mm256_alignr_epi8(m5, m0, 8); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 2, 7, 4, 5 (the high half is m2 taken as is) */
#define BLAKE2B_LOAD_MSG_8_4(b0) \
    do { \
        t0 = _mm256_blend_epi32(m3, m1, 0x33); \
        t1 = m2; \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* ---- round 9 ---- */

/* -> words 10, 8, 7, 1 */
#define BLAKE2B_LOAD_MSG_9_1(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m5, m4); \
        t1 = _mm256_unpackhi_epi64(m3, m0); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 2, 4, 6, 5 */
#define BLAKE2B_LOAD_MSG_9_2(b0) \
    do { \
        t0 = _mm256_unpacklo_epi64(m1, m2); \
        t1 = _mm256_blend_epi32(m2, m3, 0x33); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 15, 9, 3, 13 */
#define BLAKE2B_LOAD_MSG_9_3(b0) \
    do { \
        t0 = _mm256_unpackhi_epi64(m7, m4); \
        t1 = _mm256_unpackhi_epi64(m1, m6); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/* -> words 11, 14, 12, 0 */
#define BLAKE2B_LOAD_MSG_9_4(b0) \
    do { \
        t0 = _mm256_alignr_epi8(m7, m5, 8); \
        t1 = _mm256_unpacklo_epi64(m6, m0); \
        (b0) = _mm256_blend_epi32(t0, t1, 0xF0); \
    } while (0)

/*
 * ---- rounds 10 and 11 ----
 *
 * These use exactly the same loads as rounds 0 and 1, so they are aliases
 * rather than repeated bodies; a fix to a round-0/1 macro automatically
 * applies here as well.
 */
#define BLAKE2B_LOAD_MSG_10_1(b0) BLAKE2B_LOAD_MSG_0_1(b0)
#define BLAKE2B_LOAD_MSG_10_2(b0) BLAKE2B_LOAD_MSG_0_2(b0)
#define BLAKE2B_LOAD_MSG_10_3(b0) BLAKE2B_LOAD_MSG_0_3(b0)
#define BLAKE2B_LOAD_MSG_10_4(b0) BLAKE2B_LOAD_MSG_0_4(b0)

#define BLAKE2B_LOAD_MSG_11_1(b0) BLAKE2B_LOAD_MSG_1_1(b0)
#define BLAKE2B_LOAD_MSG_11_2(b0) BLAKE2B_LOAD_MSG_1_2(b0)
#define BLAKE2B_LOAD_MSG_11_3(b0) BLAKE2B_LOAD_MSG_1_3(b0)
#define BLAKE2B_LOAD_MSG_11_4(b0) BLAKE2B_LOAD_MSG_1_4(b0)

#endif