1#if defined(__x86_64__) 2 3#include "llvm_blake3_prefix.h" 4 5#if defined(__ELF__) && !(defined(__sun__) && defined(__svr4__)) 6.section .note.GNU-stack,"",%progbits 7#endif 8 9#if defined(__ELF__) && defined(__CET__) && defined(__has_include) 10#if __has_include(<cet.h>) 11#include <cet.h> 12#endif 13#endif 14 15#if !defined(_CET_ENDBR) 16#define _CET_ENDBR 17#endif 18 19#ifdef __APPLE__ 20#define HIDDEN .private_extern 21#else 22#define HIDDEN .hidden 23#endif 24 25.intel_syntax noprefix 26HIDDEN blake3_hash_many_sse2 27HIDDEN _blake3_hash_many_sse2 28HIDDEN blake3_compress_in_place_sse2 29HIDDEN _blake3_compress_in_place_sse2 30HIDDEN blake3_compress_xof_sse2 31HIDDEN _blake3_compress_xof_sse2 32.global blake3_hash_many_sse2 33.global _blake3_hash_many_sse2 34.global blake3_compress_in_place_sse2 35.global _blake3_compress_in_place_sse2 36.global blake3_compress_xof_sse2 37.global _blake3_compress_xof_sse2 38#ifdef __APPLE__ 39.text 40#else 41.section .text 42#endif 43 .p2align 6 44_blake3_hash_many_sse2: 45blake3_hash_many_sse2: 46 _CET_ENDBR 47 push r15 48 push r14 49 push r13 50 push r12 51 push rbx 52 push rbp 53 mov rbp, rsp 54 sub rsp, 360 55 and rsp, 0xFFFFFFFFFFFFFFC0 56 neg r9d 57 movd xmm0, r9d 58 pshufd xmm0, xmm0, 0x00 59 movdqa xmmword ptr [rsp+0x130], xmm0 60 movdqa xmm1, xmm0 61 pand xmm1, xmmword ptr [ADD0+rip] 62 pand xmm0, xmmword ptr [ADD1+rip] 63 movdqa xmmword ptr [rsp+0x150], xmm0 64 movd xmm0, r8d 65 pshufd xmm0, xmm0, 0x00 66 paddd xmm0, xmm1 67 movdqa xmmword ptr [rsp+0x110], xmm0 68 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] 69 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] 70 pcmpgtd xmm1, xmm0 71 shr r8, 32 72 movd xmm2, r8d 73 pshufd xmm2, xmm2, 0x00 74 psubd xmm2, xmm1 75 movdqa xmmword ptr [rsp+0x120], xmm2 76 mov rbx, qword ptr [rbp+0x50] 77 mov r15, rdx 78 shl r15, 6 79 movzx r13d, byte ptr [rbp+0x38] 80 movzx r12d, byte ptr [rbp+0x48] 81 cmp rsi, 4 82 jc 3f 832: 84 movdqu xmm3, xmmword ptr [rcx] 85 pshufd xmm0, xmm3, 0x00 86 pshufd 
xmm1, xmm3, 0x55 87 pshufd xmm2, xmm3, 0xAA 88 pshufd xmm3, xmm3, 0xFF 89 movdqu xmm7, xmmword ptr [rcx+0x10] 90 pshufd xmm4, xmm7, 0x00 91 pshufd xmm5, xmm7, 0x55 92 pshufd xmm6, xmm7, 0xAA 93 pshufd xmm7, xmm7, 0xFF 94 mov r8, qword ptr [rdi] 95 mov r9, qword ptr [rdi+0x8] 96 mov r10, qword ptr [rdi+0x10] 97 mov r11, qword ptr [rdi+0x18] 98 movzx eax, byte ptr [rbp+0x40] 99 or eax, r13d 100 xor edx, edx 1019: 102 mov r14d, eax 103 or eax, r12d 104 add rdx, 64 105 cmp rdx, r15 106 cmovne eax, r14d 107 movdqu xmm8, xmmword ptr [r8+rdx-0x40] 108 movdqu xmm9, xmmword ptr [r9+rdx-0x40] 109 movdqu xmm10, xmmword ptr [r10+rdx-0x40] 110 movdqu xmm11, xmmword ptr [r11+rdx-0x40] 111 movdqa xmm12, xmm8 112 punpckldq xmm8, xmm9 113 punpckhdq xmm12, xmm9 114 movdqa xmm14, xmm10 115 punpckldq xmm10, xmm11 116 punpckhdq xmm14, xmm11 117 movdqa xmm9, xmm8 118 punpcklqdq xmm8, xmm10 119 punpckhqdq xmm9, xmm10 120 movdqa xmm13, xmm12 121 punpcklqdq xmm12, xmm14 122 punpckhqdq xmm13, xmm14 123 movdqa xmmword ptr [rsp], xmm8 124 movdqa xmmword ptr [rsp+0x10], xmm9 125 movdqa xmmword ptr [rsp+0x20], xmm12 126 movdqa xmmword ptr [rsp+0x30], xmm13 127 movdqu xmm8, xmmword ptr [r8+rdx-0x30] 128 movdqu xmm9, xmmword ptr [r9+rdx-0x30] 129 movdqu xmm10, xmmword ptr [r10+rdx-0x30] 130 movdqu xmm11, xmmword ptr [r11+rdx-0x30] 131 movdqa xmm12, xmm8 132 punpckldq xmm8, xmm9 133 punpckhdq xmm12, xmm9 134 movdqa xmm14, xmm10 135 punpckldq xmm10, xmm11 136 punpckhdq xmm14, xmm11 137 movdqa xmm9, xmm8 138 punpcklqdq xmm8, xmm10 139 punpckhqdq xmm9, xmm10 140 movdqa xmm13, xmm12 141 punpcklqdq xmm12, xmm14 142 punpckhqdq xmm13, xmm14 143 movdqa xmmword ptr [rsp+0x40], xmm8 144 movdqa xmmword ptr [rsp+0x50], xmm9 145 movdqa xmmword ptr [rsp+0x60], xmm12 146 movdqa xmmword ptr [rsp+0x70], xmm13 147 movdqu xmm8, xmmword ptr [r8+rdx-0x20] 148 movdqu xmm9, xmmword ptr [r9+rdx-0x20] 149 movdqu xmm10, xmmword ptr [r10+rdx-0x20] 150 movdqu xmm11, xmmword ptr [r11+rdx-0x20] 151 movdqa xmm12, xmm8 152 
punpckldq xmm8, xmm9 153 punpckhdq xmm12, xmm9 154 movdqa xmm14, xmm10 155 punpckldq xmm10, xmm11 156 punpckhdq xmm14, xmm11 157 movdqa xmm9, xmm8 158 punpcklqdq xmm8, xmm10 159 punpckhqdq xmm9, xmm10 160 movdqa xmm13, xmm12 161 punpcklqdq xmm12, xmm14 162 punpckhqdq xmm13, xmm14 163 movdqa xmmword ptr [rsp+0x80], xmm8 164 movdqa xmmword ptr [rsp+0x90], xmm9 165 movdqa xmmword ptr [rsp+0xA0], xmm12 166 movdqa xmmword ptr [rsp+0xB0], xmm13 167 movdqu xmm8, xmmword ptr [r8+rdx-0x10] 168 movdqu xmm9, xmmword ptr [r9+rdx-0x10] 169 movdqu xmm10, xmmword ptr [r10+rdx-0x10] 170 movdqu xmm11, xmmword ptr [r11+rdx-0x10] 171 movdqa xmm12, xmm8 172 punpckldq xmm8, xmm9 173 punpckhdq xmm12, xmm9 174 movdqa xmm14, xmm10 175 punpckldq xmm10, xmm11 176 punpckhdq xmm14, xmm11 177 movdqa xmm9, xmm8 178 punpcklqdq xmm8, xmm10 179 punpckhqdq xmm9, xmm10 180 movdqa xmm13, xmm12 181 punpcklqdq xmm12, xmm14 182 punpckhqdq xmm13, xmm14 183 movdqa xmmword ptr [rsp+0xC0], xmm8 184 movdqa xmmword ptr [rsp+0xD0], xmm9 185 movdqa xmmword ptr [rsp+0xE0], xmm12 186 movdqa xmmword ptr [rsp+0xF0], xmm13 187 movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] 188 movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] 189 movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] 190 movdqa xmm12, xmmword ptr [rsp+0x110] 191 movdqa xmm13, xmmword ptr [rsp+0x120] 192 movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] 193 movd xmm15, eax 194 pshufd xmm15, xmm15, 0x00 195 prefetcht0 [r8+rdx+0x80] 196 prefetcht0 [r9+rdx+0x80] 197 prefetcht0 [r10+rdx+0x80] 198 prefetcht0 [r11+rdx+0x80] 199 paddd xmm0, xmmword ptr [rsp] 200 paddd xmm1, xmmword ptr [rsp+0x20] 201 paddd xmm2, xmmword ptr [rsp+0x40] 202 paddd xmm3, xmmword ptr [rsp+0x60] 203 paddd xmm0, xmm4 204 paddd xmm1, xmm5 205 paddd xmm2, xmm6 206 paddd xmm3, xmm7 207 pxor xmm12, xmm0 208 pxor xmm13, xmm1 209 pxor xmm14, xmm2 210 pxor xmm15, xmm3 211 pshuflw xmm12, xmm12, 0xB1 212 pshufhw xmm12, xmm12, 0xB1 213 pshuflw xmm13, xmm13, 0xB1 214 pshufhw xmm13, xmm13, 0xB1 215 pshuflw xmm14, 
xmm14, 0xB1 216 pshufhw xmm14, xmm14, 0xB1 217 pshuflw xmm15, xmm15, 0xB1 218 pshufhw xmm15, xmm15, 0xB1 219 movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] 220 paddd xmm8, xmm12 221 paddd xmm9, xmm13 222 paddd xmm10, xmm14 223 paddd xmm11, xmm15 224 pxor xmm4, xmm8 225 pxor xmm5, xmm9 226 pxor xmm6, xmm10 227 pxor xmm7, xmm11 228 movdqa xmmword ptr [rsp+0x100], xmm8 229 movdqa xmm8, xmm4 230 psrld xmm8, 12 231 pslld xmm4, 20 232 por xmm4, xmm8 233 movdqa xmm8, xmm5 234 psrld xmm8, 12 235 pslld xmm5, 20 236 por xmm5, xmm8 237 movdqa xmm8, xmm6 238 psrld xmm8, 12 239 pslld xmm6, 20 240 por xmm6, xmm8 241 movdqa xmm8, xmm7 242 psrld xmm8, 12 243 pslld xmm7, 20 244 por xmm7, xmm8 245 paddd xmm0, xmmword ptr [rsp+0x10] 246 paddd xmm1, xmmword ptr [rsp+0x30] 247 paddd xmm2, xmmword ptr [rsp+0x50] 248 paddd xmm3, xmmword ptr [rsp+0x70] 249 paddd xmm0, xmm4 250 paddd xmm1, xmm5 251 paddd xmm2, xmm6 252 paddd xmm3, xmm7 253 pxor xmm12, xmm0 254 pxor xmm13, xmm1 255 pxor xmm14, xmm2 256 pxor xmm15, xmm3 257 movdqa xmm8, xmm12 258 psrld xmm12, 8 259 pslld xmm8, 24 260 pxor xmm12, xmm8 261 movdqa xmm8, xmm13 262 psrld xmm13, 8 263 pslld xmm8, 24 264 pxor xmm13, xmm8 265 movdqa xmm8, xmm14 266 psrld xmm14, 8 267 pslld xmm8, 24 268 pxor xmm14, xmm8 269 movdqa xmm8, xmm15 270 psrld xmm15, 8 271 pslld xmm8, 24 272 pxor xmm15, xmm8 273 movdqa xmm8, xmmword ptr [rsp+0x100] 274 paddd xmm8, xmm12 275 paddd xmm9, xmm13 276 paddd xmm10, xmm14 277 paddd xmm11, xmm15 278 pxor xmm4, xmm8 279 pxor xmm5, xmm9 280 pxor xmm6, xmm10 281 pxor xmm7, xmm11 282 movdqa xmmword ptr [rsp+0x100], xmm8 283 movdqa xmm8, xmm4 284 psrld xmm8, 7 285 pslld xmm4, 25 286 por xmm4, xmm8 287 movdqa xmm8, xmm5 288 psrld xmm8, 7 289 pslld xmm5, 25 290 por xmm5, xmm8 291 movdqa xmm8, xmm6 292 psrld xmm8, 7 293 pslld xmm6, 25 294 por xmm6, xmm8 295 movdqa xmm8, xmm7 296 psrld xmm8, 7 297 pslld xmm7, 25 298 por xmm7, xmm8 299 paddd xmm0, xmmword ptr [rsp+0x80] 300 paddd xmm1, xmmword ptr [rsp+0xA0] 301 paddd xmm2, 
xmmword ptr [rsp+0xC0] 302 paddd xmm3, xmmword ptr [rsp+0xE0] 303 paddd xmm0, xmm5 304 paddd xmm1, xmm6 305 paddd xmm2, xmm7 306 paddd xmm3, xmm4 307 pxor xmm15, xmm0 308 pxor xmm12, xmm1 309 pxor xmm13, xmm2 310 pxor xmm14, xmm3 311 pshuflw xmm15, xmm15, 0xB1 312 pshufhw xmm15, xmm15, 0xB1 313 pshuflw xmm12, xmm12, 0xB1 314 pshufhw xmm12, xmm12, 0xB1 315 pshuflw xmm13, xmm13, 0xB1 316 pshufhw xmm13, xmm13, 0xB1 317 pshuflw xmm14, xmm14, 0xB1 318 pshufhw xmm14, xmm14, 0xB1 319 paddd xmm10, xmm15 320 paddd xmm11, xmm12 321 movdqa xmm8, xmmword ptr [rsp+0x100] 322 paddd xmm8, xmm13 323 paddd xmm9, xmm14 324 pxor xmm5, xmm10 325 pxor xmm6, xmm11 326 pxor xmm7, xmm8 327 pxor xmm4, xmm9 328 movdqa xmmword ptr [rsp+0x100], xmm8 329 movdqa xmm8, xmm5 330 psrld xmm8, 12 331 pslld xmm5, 20 332 por xmm5, xmm8 333 movdqa xmm8, xmm6 334 psrld xmm8, 12 335 pslld xmm6, 20 336 por xmm6, xmm8 337 movdqa xmm8, xmm7 338 psrld xmm8, 12 339 pslld xmm7, 20 340 por xmm7, xmm8 341 movdqa xmm8, xmm4 342 psrld xmm8, 12 343 pslld xmm4, 20 344 por xmm4, xmm8 345 paddd xmm0, xmmword ptr [rsp+0x90] 346 paddd xmm1, xmmword ptr [rsp+0xB0] 347 paddd xmm2, xmmword ptr [rsp+0xD0] 348 paddd xmm3, xmmword ptr [rsp+0xF0] 349 paddd xmm0, xmm5 350 paddd xmm1, xmm6 351 paddd xmm2, xmm7 352 paddd xmm3, xmm4 353 pxor xmm15, xmm0 354 pxor xmm12, xmm1 355 pxor xmm13, xmm2 356 pxor xmm14, xmm3 357 movdqa xmm8, xmm15 358 psrld xmm15, 8 359 pslld xmm8, 24 360 pxor xmm15, xmm8 361 movdqa xmm8, xmm12 362 psrld xmm12, 8 363 pslld xmm8, 24 364 pxor xmm12, xmm8 365 movdqa xmm8, xmm13 366 psrld xmm13, 8 367 pslld xmm8, 24 368 pxor xmm13, xmm8 369 movdqa xmm8, xmm14 370 psrld xmm14, 8 371 pslld xmm8, 24 372 pxor xmm14, xmm8 373 paddd xmm10, xmm15 374 paddd xmm11, xmm12 375 movdqa xmm8, xmmword ptr [rsp+0x100] 376 paddd xmm8, xmm13 377 paddd xmm9, xmm14 378 pxor xmm5, xmm10 379 pxor xmm6, xmm11 380 pxor xmm7, xmm8 381 pxor xmm4, xmm9 382 movdqa xmmword ptr [rsp+0x100], xmm8 383 movdqa xmm8, xmm5 384 psrld xmm8, 7 385 
pslld xmm5, 25 386 por xmm5, xmm8 387 movdqa xmm8, xmm6 388 psrld xmm8, 7 389 pslld xmm6, 25 390 por xmm6, xmm8 391 movdqa xmm8, xmm7 392 psrld xmm8, 7 393 pslld xmm7, 25 394 por xmm7, xmm8 395 movdqa xmm8, xmm4 396 psrld xmm8, 7 397 pslld xmm4, 25 398 por xmm4, xmm8 399 paddd xmm0, xmmword ptr [rsp+0x20] 400 paddd xmm1, xmmword ptr [rsp+0x30] 401 paddd xmm2, xmmword ptr [rsp+0x70] 402 paddd xmm3, xmmword ptr [rsp+0x40] 403 paddd xmm0, xmm4 404 paddd xmm1, xmm5 405 paddd xmm2, xmm6 406 paddd xmm3, xmm7 407 pxor xmm12, xmm0 408 pxor xmm13, xmm1 409 pxor xmm14, xmm2 410 pxor xmm15, xmm3 411 pshuflw xmm12, xmm12, 0xB1 412 pshufhw xmm12, xmm12, 0xB1 413 pshuflw xmm13, xmm13, 0xB1 414 pshufhw xmm13, xmm13, 0xB1 415 pshuflw xmm14, xmm14, 0xB1 416 pshufhw xmm14, xmm14, 0xB1 417 pshuflw xmm15, xmm15, 0xB1 418 pshufhw xmm15, xmm15, 0xB1 419 movdqa xmm8, xmmword ptr [rsp+0x100] 420 paddd xmm8, xmm12 421 paddd xmm9, xmm13 422 paddd xmm10, xmm14 423 paddd xmm11, xmm15 424 pxor xmm4, xmm8 425 pxor xmm5, xmm9 426 pxor xmm6, xmm10 427 pxor xmm7, xmm11 428 movdqa xmmword ptr [rsp+0x100], xmm8 429 movdqa xmm8, xmm4 430 psrld xmm8, 12 431 pslld xmm4, 20 432 por xmm4, xmm8 433 movdqa xmm8, xmm5 434 psrld xmm8, 12 435 pslld xmm5, 20 436 por xmm5, xmm8 437 movdqa xmm8, xmm6 438 psrld xmm8, 12 439 pslld xmm6, 20 440 por xmm6, xmm8 441 movdqa xmm8, xmm7 442 psrld xmm8, 12 443 pslld xmm7, 20 444 por xmm7, xmm8 445 paddd xmm0, xmmword ptr [rsp+0x60] 446 paddd xmm1, xmmword ptr [rsp+0xA0] 447 paddd xmm2, xmmword ptr [rsp] 448 paddd xmm3, xmmword ptr [rsp+0xD0] 449 paddd xmm0, xmm4 450 paddd xmm1, xmm5 451 paddd xmm2, xmm6 452 paddd xmm3, xmm7 453 pxor xmm12, xmm0 454 pxor xmm13, xmm1 455 pxor xmm14, xmm2 456 pxor xmm15, xmm3 457 movdqa xmm8, xmm12 458 psrld xmm12, 8 459 pslld xmm8, 24 460 pxor xmm12, xmm8 461 movdqa xmm8, xmm13 462 psrld xmm13, 8 463 pslld xmm8, 24 464 pxor xmm13, xmm8 465 movdqa xmm8, xmm14 466 psrld xmm14, 8 467 pslld xmm8, 24 468 pxor xmm14, xmm8 469 movdqa xmm8, xmm15 
470 psrld xmm15, 8 471 pslld xmm8, 24 472 pxor xmm15, xmm8 473 movdqa xmm8, xmmword ptr [rsp+0x100] 474 paddd xmm8, xmm12 475 paddd xmm9, xmm13 476 paddd xmm10, xmm14 477 paddd xmm11, xmm15 478 pxor xmm4, xmm8 479 pxor xmm5, xmm9 480 pxor xmm6, xmm10 481 pxor xmm7, xmm11 482 movdqa xmmword ptr [rsp+0x100], xmm8 483 movdqa xmm8, xmm4 484 psrld xmm8, 7 485 pslld xmm4, 25 486 por xmm4, xmm8 487 movdqa xmm8, xmm5 488 psrld xmm8, 7 489 pslld xmm5, 25 490 por xmm5, xmm8 491 movdqa xmm8, xmm6 492 psrld xmm8, 7 493 pslld xmm6, 25 494 por xmm6, xmm8 495 movdqa xmm8, xmm7 496 psrld xmm8, 7 497 pslld xmm7, 25 498 por xmm7, xmm8 499 paddd xmm0, xmmword ptr [rsp+0x10] 500 paddd xmm1, xmmword ptr [rsp+0xC0] 501 paddd xmm2, xmmword ptr [rsp+0x90] 502 paddd xmm3, xmmword ptr [rsp+0xF0] 503 paddd xmm0, xmm5 504 paddd xmm1, xmm6 505 paddd xmm2, xmm7 506 paddd xmm3, xmm4 507 pxor xmm15, xmm0 508 pxor xmm12, xmm1 509 pxor xmm13, xmm2 510 pxor xmm14, xmm3 511 pshuflw xmm15, xmm15, 0xB1 512 pshufhw xmm15, xmm15, 0xB1 513 pshuflw xmm12, xmm12, 0xB1 514 pshufhw xmm12, xmm12, 0xB1 515 pshuflw xmm13, xmm13, 0xB1 516 pshufhw xmm13, xmm13, 0xB1 517 pshuflw xmm14, xmm14, 0xB1 518 pshufhw xmm14, xmm14, 0xB1 519 paddd xmm10, xmm15 520 paddd xmm11, xmm12 521 movdqa xmm8, xmmword ptr [rsp+0x100] 522 paddd xmm8, xmm13 523 paddd xmm9, xmm14 524 pxor xmm5, xmm10 525 pxor xmm6, xmm11 526 pxor xmm7, xmm8 527 pxor xmm4, xmm9 528 movdqa xmmword ptr [rsp+0x100], xmm8 529 movdqa xmm8, xmm5 530 psrld xmm8, 12 531 pslld xmm5, 20 532 por xmm5, xmm8 533 movdqa xmm8, xmm6 534 psrld xmm8, 12 535 pslld xmm6, 20 536 por xmm6, xmm8 537 movdqa xmm8, xmm7 538 psrld xmm8, 12 539 pslld xmm7, 20 540 por xmm7, xmm8 541 movdqa xmm8, xmm4 542 psrld xmm8, 12 543 pslld xmm4, 20 544 por xmm4, xmm8 545 paddd xmm0, xmmword ptr [rsp+0xB0] 546 paddd xmm1, xmmword ptr [rsp+0x50] 547 paddd xmm2, xmmword ptr [rsp+0xE0] 548 paddd xmm3, xmmword ptr [rsp+0x80] 549 paddd xmm0, xmm5 550 paddd xmm1, xmm6 551 paddd xmm2, xmm7 552 paddd 
xmm3, xmm4 553 pxor xmm15, xmm0 554 pxor xmm12, xmm1 555 pxor xmm13, xmm2 556 pxor xmm14, xmm3 557 movdqa xmm8, xmm15 558 psrld xmm15, 8 559 pslld xmm8, 24 560 pxor xmm15, xmm8 561 movdqa xmm8, xmm12 562 psrld xmm12, 8 563 pslld xmm8, 24 564 pxor xmm12, xmm8 565 movdqa xmm8, xmm13 566 psrld xmm13, 8 567 pslld xmm8, 24 568 pxor xmm13, xmm8 569 movdqa xmm8, xmm14 570 psrld xmm14, 8 571 pslld xmm8, 24 572 pxor xmm14, xmm8 573 paddd xmm10, xmm15 574 paddd xmm11, xmm12 575 movdqa xmm8, xmmword ptr [rsp+0x100] 576 paddd xmm8, xmm13 577 paddd xmm9, xmm14 578 pxor xmm5, xmm10 579 pxor xmm6, xmm11 580 pxor xmm7, xmm8 581 pxor xmm4, xmm9 582 movdqa xmmword ptr [rsp+0x100], xmm8 583 movdqa xmm8, xmm5 584 psrld xmm8, 7 585 pslld xmm5, 25 586 por xmm5, xmm8 587 movdqa xmm8, xmm6 588 psrld xmm8, 7 589 pslld xmm6, 25 590 por xmm6, xmm8 591 movdqa xmm8, xmm7 592 psrld xmm8, 7 593 pslld xmm7, 25 594 por xmm7, xmm8 595 movdqa xmm8, xmm4 596 psrld xmm8, 7 597 pslld xmm4, 25 598 por xmm4, xmm8 599 paddd xmm0, xmmword ptr [rsp+0x30] 600 paddd xmm1, xmmword ptr [rsp+0xA0] 601 paddd xmm2, xmmword ptr [rsp+0xD0] 602 paddd xmm3, xmmword ptr [rsp+0x70] 603 paddd xmm0, xmm4 604 paddd xmm1, xmm5 605 paddd xmm2, xmm6 606 paddd xmm3, xmm7 607 pxor xmm12, xmm0 608 pxor xmm13, xmm1 609 pxor xmm14, xmm2 610 pxor xmm15, xmm3 611 pshuflw xmm12, xmm12, 0xB1 612 pshufhw xmm12, xmm12, 0xB1 613 pshuflw xmm13, xmm13, 0xB1 614 pshufhw xmm13, xmm13, 0xB1 615 pshuflw xmm14, xmm14, 0xB1 616 pshufhw xmm14, xmm14, 0xB1 617 pshuflw xmm15, xmm15, 0xB1 618 pshufhw xmm15, xmm15, 0xB1 619 movdqa xmm8, xmmword ptr [rsp+0x100] 620 paddd xmm8, xmm12 621 paddd xmm9, xmm13 622 paddd xmm10, xmm14 623 paddd xmm11, xmm15 624 pxor xmm4, xmm8 625 pxor xmm5, xmm9 626 pxor xmm6, xmm10 627 pxor xmm7, xmm11 628 movdqa xmmword ptr [rsp+0x100], xmm8 629 movdqa xmm8, xmm4 630 psrld xmm8, 12 631 pslld xmm4, 20 632 por xmm4, xmm8 633 movdqa xmm8, xmm5 634 psrld xmm8, 12 635 pslld xmm5, 20 636 por xmm5, xmm8 637 movdqa xmm8, xmm6 638 
psrld xmm8, 12 639 pslld xmm6, 20 640 por xmm6, xmm8 641 movdqa xmm8, xmm7 642 psrld xmm8, 12 643 pslld xmm7, 20 644 por xmm7, xmm8 645 paddd xmm0, xmmword ptr [rsp+0x40] 646 paddd xmm1, xmmword ptr [rsp+0xC0] 647 paddd xmm2, xmmword ptr [rsp+0x20] 648 paddd xmm3, xmmword ptr [rsp+0xE0] 649 paddd xmm0, xmm4 650 paddd xmm1, xmm5 651 paddd xmm2, xmm6 652 paddd xmm3, xmm7 653 pxor xmm12, xmm0 654 pxor xmm13, xmm1 655 pxor xmm14, xmm2 656 pxor xmm15, xmm3 657 movdqa xmm8, xmm12 658 psrld xmm12, 8 659 pslld xmm8, 24 660 pxor xmm12, xmm8 661 movdqa xmm8, xmm13 662 psrld xmm13, 8 663 pslld xmm8, 24 664 pxor xmm13, xmm8 665 movdqa xmm8, xmm14 666 psrld xmm14, 8 667 pslld xmm8, 24 668 pxor xmm14, xmm8 669 movdqa xmm8, xmm15 670 psrld xmm15, 8 671 pslld xmm8, 24 672 pxor xmm15, xmm8 673 movdqa xmm8, xmmword ptr [rsp+0x100] 674 paddd xmm8, xmm12 675 paddd xmm9, xmm13 676 paddd xmm10, xmm14 677 paddd xmm11, xmm15 678 pxor xmm4, xmm8 679 pxor xmm5, xmm9 680 pxor xmm6, xmm10 681 pxor xmm7, xmm11 682 movdqa xmmword ptr [rsp+0x100], xmm8 683 movdqa xmm8, xmm4 684 psrld xmm8, 7 685 pslld xmm4, 25 686 por xmm4, xmm8 687 movdqa xmm8, xmm5 688 psrld xmm8, 7 689 pslld xmm5, 25 690 por xmm5, xmm8 691 movdqa xmm8, xmm6 692 psrld xmm8, 7 693 pslld xmm6, 25 694 por xmm6, xmm8 695 movdqa xmm8, xmm7 696 psrld xmm8, 7 697 pslld xmm7, 25 698 por xmm7, xmm8 699 paddd xmm0, xmmword ptr [rsp+0x60] 700 paddd xmm1, xmmword ptr [rsp+0x90] 701 paddd xmm2, xmmword ptr [rsp+0xB0] 702 paddd xmm3, xmmword ptr [rsp+0x80] 703 paddd xmm0, xmm5 704 paddd xmm1, xmm6 705 paddd xmm2, xmm7 706 paddd xmm3, xmm4 707 pxor xmm15, xmm0 708 pxor xmm12, xmm1 709 pxor xmm13, xmm2 710 pxor xmm14, xmm3 711 pshuflw xmm15, xmm15, 0xB1 712 pshufhw xmm15, xmm15, 0xB1 713 pshuflw xmm12, xmm12, 0xB1 714 pshufhw xmm12, xmm12, 0xB1 715 pshuflw xmm13, xmm13, 0xB1 716 pshufhw xmm13, xmm13, 0xB1 717 pshuflw xmm14, xmm14, 0xB1 718 pshufhw xmm14, xmm14, 0xB1 719 paddd xmm10, xmm15 720 paddd xmm11, xmm12 721 movdqa xmm8, xmmword ptr 
[rsp+0x100] 722 paddd xmm8, xmm13 723 paddd xmm9, xmm14 724 pxor xmm5, xmm10 725 pxor xmm6, xmm11 726 pxor xmm7, xmm8 727 pxor xmm4, xmm9 728 movdqa xmmword ptr [rsp+0x100], xmm8 729 movdqa xmm8, xmm5 730 psrld xmm8, 12 731 pslld xmm5, 20 732 por xmm5, xmm8 733 movdqa xmm8, xmm6 734 psrld xmm8, 12 735 pslld xmm6, 20 736 por xmm6, xmm8 737 movdqa xmm8, xmm7 738 psrld xmm8, 12 739 pslld xmm7, 20 740 por xmm7, xmm8 741 movdqa xmm8, xmm4 742 psrld xmm8, 12 743 pslld xmm4, 20 744 por xmm4, xmm8 745 paddd xmm0, xmmword ptr [rsp+0x50] 746 paddd xmm1, xmmword ptr [rsp] 747 paddd xmm2, xmmword ptr [rsp+0xF0] 748 paddd xmm3, xmmword ptr [rsp+0x10] 749 paddd xmm0, xmm5 750 paddd xmm1, xmm6 751 paddd xmm2, xmm7 752 paddd xmm3, xmm4 753 pxor xmm15, xmm0 754 pxor xmm12, xmm1 755 pxor xmm13, xmm2 756 pxor xmm14, xmm3 757 movdqa xmm8, xmm15 758 psrld xmm15, 8 759 pslld xmm8, 24 760 pxor xmm15, xmm8 761 movdqa xmm8, xmm12 762 psrld xmm12, 8 763 pslld xmm8, 24 764 pxor xmm12, xmm8 765 movdqa xmm8, xmm13 766 psrld xmm13, 8 767 pslld xmm8, 24 768 pxor xmm13, xmm8 769 movdqa xmm8, xmm14 770 psrld xmm14, 8 771 pslld xmm8, 24 772 pxor xmm14, xmm8 773 paddd xmm10, xmm15 774 paddd xmm11, xmm12 775 movdqa xmm8, xmmword ptr [rsp+0x100] 776 paddd xmm8, xmm13 777 paddd xmm9, xmm14 778 pxor xmm5, xmm10 779 pxor xmm6, xmm11 780 pxor xmm7, xmm8 781 pxor xmm4, xmm9 782 movdqa xmmword ptr [rsp+0x100], xmm8 783 movdqa xmm8, xmm5 784 psrld xmm8, 7 785 pslld xmm5, 25 786 por xmm5, xmm8 787 movdqa xmm8, xmm6 788 psrld xmm8, 7 789 pslld xmm6, 25 790 por xmm6, xmm8 791 movdqa xmm8, xmm7 792 psrld xmm8, 7 793 pslld xmm7, 25 794 por xmm7, xmm8 795 movdqa xmm8, xmm4 796 psrld xmm8, 7 797 pslld xmm4, 25 798 por xmm4, xmm8 799 paddd xmm0, xmmword ptr [rsp+0xA0] 800 paddd xmm1, xmmword ptr [rsp+0xC0] 801 paddd xmm2, xmmword ptr [rsp+0xE0] 802 paddd xmm3, xmmword ptr [rsp+0xD0] 803 paddd xmm0, xmm4 804 paddd xmm1, xmm5 805 paddd xmm2, xmm6 806 paddd xmm3, xmm7 807 pxor xmm12, xmm0 808 pxor xmm13, xmm1 809 pxor 
xmm14, xmm2 810 pxor xmm15, xmm3 811 pshuflw xmm12, xmm12, 0xB1 812 pshufhw xmm12, xmm12, 0xB1 813 pshuflw xmm13, xmm13, 0xB1 814 pshufhw xmm13, xmm13, 0xB1 815 pshuflw xmm14, xmm14, 0xB1 816 pshufhw xmm14, xmm14, 0xB1 817 pshuflw xmm15, xmm15, 0xB1 818 pshufhw xmm15, xmm15, 0xB1 819 movdqa xmm8, xmmword ptr [rsp+0x100] 820 paddd xmm8, xmm12 821 paddd xmm9, xmm13 822 paddd xmm10, xmm14 823 paddd xmm11, xmm15 824 pxor xmm4, xmm8 825 pxor xmm5, xmm9 826 pxor xmm6, xmm10 827 pxor xmm7, xmm11 828 movdqa xmmword ptr [rsp+0x100], xmm8 829 movdqa xmm8, xmm4 830 psrld xmm8, 12 831 pslld xmm4, 20 832 por xmm4, xmm8 833 movdqa xmm8, xmm5 834 psrld xmm8, 12 835 pslld xmm5, 20 836 por xmm5, xmm8 837 movdqa xmm8, xmm6 838 psrld xmm8, 12 839 pslld xmm6, 20 840 por xmm6, xmm8 841 movdqa xmm8, xmm7 842 psrld xmm8, 12 843 pslld xmm7, 20 844 por xmm7, xmm8 845 paddd xmm0, xmmword ptr [rsp+0x70] 846 paddd xmm1, xmmword ptr [rsp+0x90] 847 paddd xmm2, xmmword ptr [rsp+0x30] 848 paddd xmm3, xmmword ptr [rsp+0xF0] 849 paddd xmm0, xmm4 850 paddd xmm1, xmm5 851 paddd xmm2, xmm6 852 paddd xmm3, xmm7 853 pxor xmm12, xmm0 854 pxor xmm13, xmm1 855 pxor xmm14, xmm2 856 pxor xmm15, xmm3 857 movdqa xmm8, xmm12 858 psrld xmm12, 8 859 pslld xmm8, 24 860 pxor xmm12, xmm8 861 movdqa xmm8, xmm13 862 psrld xmm13, 8 863 pslld xmm8, 24 864 pxor xmm13, xmm8 865 movdqa xmm8, xmm14 866 psrld xmm14, 8 867 pslld xmm8, 24 868 pxor xmm14, xmm8 869 movdqa xmm8, xmm15 870 psrld xmm15, 8 871 pslld xmm8, 24 872 pxor xmm15, xmm8 873 movdqa xmm8, xmmword ptr [rsp+0x100] 874 paddd xmm8, xmm12 875 paddd xmm9, xmm13 876 paddd xmm10, xmm14 877 paddd xmm11, xmm15 878 pxor xmm4, xmm8 879 pxor xmm5, xmm9 880 pxor xmm6, xmm10 881 pxor xmm7, xmm11 882 movdqa xmmword ptr [rsp+0x100], xmm8 883 movdqa xmm8, xmm4 884 psrld xmm8, 7 885 pslld xmm4, 25 886 por xmm4, xmm8 887 movdqa xmm8, xmm5 888 psrld xmm8, 7 889 pslld xmm5, 25 890 por xmm5, xmm8 891 movdqa xmm8, xmm6 892 psrld xmm8, 7 893 pslld xmm6, 25 894 por xmm6, xmm8 895 
movdqa xmm8, xmm7 896 psrld xmm8, 7 897 pslld xmm7, 25 898 por xmm7, xmm8 899 paddd xmm0, xmmword ptr [rsp+0x40] 900 paddd xmm1, xmmword ptr [rsp+0xB0] 901 paddd xmm2, xmmword ptr [rsp+0x50] 902 paddd xmm3, xmmword ptr [rsp+0x10] 903 paddd xmm0, xmm5 904 paddd xmm1, xmm6 905 paddd xmm2, xmm7 906 paddd xmm3, xmm4 907 pxor xmm15, xmm0 908 pxor xmm12, xmm1 909 pxor xmm13, xmm2 910 pxor xmm14, xmm3 911 pshuflw xmm15, xmm15, 0xB1 912 pshufhw xmm15, xmm15, 0xB1 913 pshuflw xmm12, xmm12, 0xB1 914 pshufhw xmm12, xmm12, 0xB1 915 pshuflw xmm13, xmm13, 0xB1 916 pshufhw xmm13, xmm13, 0xB1 917 pshuflw xmm14, xmm14, 0xB1 918 pshufhw xmm14, xmm14, 0xB1 919 paddd xmm10, xmm15 920 paddd xmm11, xmm12 921 movdqa xmm8, xmmword ptr [rsp+0x100] 922 paddd xmm8, xmm13 923 paddd xmm9, xmm14 924 pxor xmm5, xmm10 925 pxor xmm6, xmm11 926 pxor xmm7, xmm8 927 pxor xmm4, xmm9 928 movdqa xmmword ptr [rsp+0x100], xmm8 929 movdqa xmm8, xmm5 930 psrld xmm8, 12 931 pslld xmm5, 20 932 por xmm5, xmm8 933 movdqa xmm8, xmm6 934 psrld xmm8, 12 935 pslld xmm6, 20 936 por xmm6, xmm8 937 movdqa xmm8, xmm7 938 psrld xmm8, 12 939 pslld xmm7, 20 940 por xmm7, xmm8 941 movdqa xmm8, xmm4 942 psrld xmm8, 12 943 pslld xmm4, 20 944 por xmm4, xmm8 945 paddd xmm0, xmmword ptr [rsp] 946 paddd xmm1, xmmword ptr [rsp+0x20] 947 paddd xmm2, xmmword ptr [rsp+0x80] 948 paddd xmm3, xmmword ptr [rsp+0x60] 949 paddd xmm0, xmm5 950 paddd xmm1, xmm6 951 paddd xmm2, xmm7 952 paddd xmm3, xmm4 953 pxor xmm15, xmm0 954 pxor xmm12, xmm1 955 pxor xmm13, xmm2 956 pxor xmm14, xmm3 957 movdqa xmm8, xmm15 958 psrld xmm15, 8 959 pslld xmm8, 24 960 pxor xmm15, xmm8 961 movdqa xmm8, xmm12 962 psrld xmm12, 8 963 pslld xmm8, 24 964 pxor xmm12, xmm8 965 movdqa xmm8, xmm13 966 psrld xmm13, 8 967 pslld xmm8, 24 968 pxor xmm13, xmm8 969 movdqa xmm8, xmm14 970 psrld xmm14, 8 971 pslld xmm8, 24 972 pxor xmm14, xmm8 973 paddd xmm10, xmm15 974 paddd xmm11, xmm12 975 movdqa xmm8, xmmword ptr [rsp+0x100] 976 paddd xmm8, xmm13 977 paddd xmm9, xmm14 978 
pxor xmm5, xmm10 979 pxor xmm6, xmm11 980 pxor xmm7, xmm8 981 pxor xmm4, xmm9 982 movdqa xmmword ptr [rsp+0x100], xmm8 983 movdqa xmm8, xmm5 984 psrld xmm8, 7 985 pslld xmm5, 25 986 por xmm5, xmm8 987 movdqa xmm8, xmm6 988 psrld xmm8, 7 989 pslld xmm6, 25 990 por xmm6, xmm8 991 movdqa xmm8, xmm7 992 psrld xmm8, 7 993 pslld xmm7, 25 994 por xmm7, xmm8 995 movdqa xmm8, xmm4 996 psrld xmm8, 7 997 pslld xmm4, 25 998 por xmm4, xmm8 999 paddd xmm0, xmmword ptr [rsp+0xC0] 1000 paddd xmm1, xmmword ptr [rsp+0x90] 1001 paddd xmm2, xmmword ptr [rsp+0xF0] 1002 paddd xmm3, xmmword ptr [rsp+0xE0] 1003 paddd xmm0, xmm4 1004 paddd xmm1, xmm5 1005 paddd xmm2, xmm6 1006 paddd xmm3, xmm7 1007 pxor xmm12, xmm0 1008 pxor xmm13, xmm1 1009 pxor xmm14, xmm2 1010 pxor xmm15, xmm3 1011 pshuflw xmm12, xmm12, 0xB1 1012 pshufhw xmm12, xmm12, 0xB1 1013 pshuflw xmm13, xmm13, 0xB1 1014 pshufhw xmm13, xmm13, 0xB1 1015 pshuflw xmm14, xmm14, 0xB1 1016 pshufhw xmm14, xmm14, 0xB1 1017 pshuflw xmm15, xmm15, 0xB1 1018 pshufhw xmm15, xmm15, 0xB1 1019 movdqa xmm8, xmmword ptr [rsp+0x100] 1020 paddd xmm8, xmm12 1021 paddd xmm9, xmm13 1022 paddd xmm10, xmm14 1023 paddd xmm11, xmm15 1024 pxor xmm4, xmm8 1025 pxor xmm5, xmm9 1026 pxor xmm6, xmm10 1027 pxor xmm7, xmm11 1028 movdqa xmmword ptr [rsp+0x100], xmm8 1029 movdqa xmm8, xmm4 1030 psrld xmm8, 12 1031 pslld xmm4, 20 1032 por xmm4, xmm8 1033 movdqa xmm8, xmm5 1034 psrld xmm8, 12 1035 pslld xmm5, 20 1036 por xmm5, xmm8 1037 movdqa xmm8, xmm6 1038 psrld xmm8, 12 1039 pslld xmm6, 20 1040 por xmm6, xmm8 1041 movdqa xmm8, xmm7 1042 psrld xmm8, 12 1043 pslld xmm7, 20 1044 por xmm7, xmm8 1045 paddd xmm0, xmmword ptr [rsp+0xD0] 1046 paddd xmm1, xmmword ptr [rsp+0xB0] 1047 paddd xmm2, xmmword ptr [rsp+0xA0] 1048 paddd xmm3, xmmword ptr [rsp+0x80] 1049 paddd xmm0, xmm4 1050 paddd xmm1, xmm5 1051 paddd xmm2, xmm6 1052 paddd xmm3, xmm7 1053 pxor xmm12, xmm0 1054 pxor xmm13, xmm1 1055 pxor xmm14, xmm2 1056 pxor xmm15, xmm3 1057 movdqa xmm8, xmm12 1058 psrld xmm12, 8 
1059 pslld xmm8, 24 1060 pxor xmm12, xmm8 1061 movdqa xmm8, xmm13 1062 psrld xmm13, 8 1063 pslld xmm8, 24 1064 pxor xmm13, xmm8 1065 movdqa xmm8, xmm14 1066 psrld xmm14, 8 1067 pslld xmm8, 24 1068 pxor xmm14, xmm8 1069 movdqa xmm8, xmm15 1070 psrld xmm15, 8 1071 pslld xmm8, 24 1072 pxor xmm15, xmm8 1073 movdqa xmm8, xmmword ptr [rsp+0x100] 1074 paddd xmm8, xmm12 1075 paddd xmm9, xmm13 1076 paddd xmm10, xmm14 1077 paddd xmm11, xmm15 1078 pxor xmm4, xmm8 1079 pxor xmm5, xmm9 1080 pxor xmm6, xmm10 1081 pxor xmm7, xmm11 1082 movdqa xmmword ptr [rsp+0x100], xmm8 1083 movdqa xmm8, xmm4 1084 psrld xmm8, 7 1085 pslld xmm4, 25 1086 por xmm4, xmm8 1087 movdqa xmm8, xmm5 1088 psrld xmm8, 7 1089 pslld xmm5, 25 1090 por xmm5, xmm8 1091 movdqa xmm8, xmm6 1092 psrld xmm8, 7 1093 pslld xmm6, 25 1094 por xmm6, xmm8 1095 movdqa xmm8, xmm7 1096 psrld xmm8, 7 1097 pslld xmm7, 25 1098 por xmm7, xmm8 1099 paddd xmm0, xmmword ptr [rsp+0x70] 1100 paddd xmm1, xmmword ptr [rsp+0x50] 1101 paddd xmm2, xmmword ptr [rsp] 1102 paddd xmm3, xmmword ptr [rsp+0x60] 1103 paddd xmm0, xmm5 1104 paddd xmm1, xmm6 1105 paddd xmm2, xmm7 1106 paddd xmm3, xmm4 1107 pxor xmm15, xmm0 1108 pxor xmm12, xmm1 1109 pxor xmm13, xmm2 1110 pxor xmm14, xmm3 1111 pshuflw xmm15, xmm15, 0xB1 1112 pshufhw xmm15, xmm15, 0xB1 1113 pshuflw xmm12, xmm12, 0xB1 1114 pshufhw xmm12, xmm12, 0xB1 1115 pshuflw xmm13, xmm13, 0xB1 1116 pshufhw xmm13, xmm13, 0xB1 1117 pshuflw xmm14, xmm14, 0xB1 1118 pshufhw xmm14, xmm14, 0xB1 1119 paddd xmm10, xmm15 1120 paddd xmm11, xmm12 1121 movdqa xmm8, xmmword ptr [rsp+0x100] 1122 paddd xmm8, xmm13 1123 paddd xmm9, xmm14 1124 pxor xmm5, xmm10 1125 pxor xmm6, xmm11 1126 pxor xmm7, xmm8 1127 pxor xmm4, xmm9 1128 movdqa xmmword ptr [rsp+0x100], xmm8 1129 movdqa xmm8, xmm5 1130 psrld xmm8, 12 1131 pslld xmm5, 20 1132 por xmm5, xmm8 1133 movdqa xmm8, xmm6 1134 psrld xmm8, 12 1135 pslld xmm6, 20 1136 por xmm6, xmm8 1137 movdqa xmm8, xmm7 1138 psrld xmm8, 12 1139 pslld xmm7, 20 1140 por xmm7, xmm8 1141 
movdqa xmm8, xmm4 1142 psrld xmm8, 12 1143 pslld xmm4, 20 1144 por xmm4, xmm8 1145 paddd xmm0, xmmword ptr [rsp+0x20] 1146 paddd xmm1, xmmword ptr [rsp+0x30] 1147 paddd xmm2, xmmword ptr [rsp+0x10] 1148 paddd xmm3, xmmword ptr [rsp+0x40] 1149 paddd xmm0, xmm5 1150 paddd xmm1, xmm6 1151 paddd xmm2, xmm7 1152 paddd xmm3, xmm4 1153 pxor xmm15, xmm0 1154 pxor xmm12, xmm1 1155 pxor xmm13, xmm2 1156 pxor xmm14, xmm3 1157 movdqa xmm8, xmm15 1158 psrld xmm15, 8 1159 pslld xmm8, 24 1160 pxor xmm15, xmm8 1161 movdqa xmm8, xmm12 1162 psrld xmm12, 8 1163 pslld xmm8, 24 1164 pxor xmm12, xmm8 1165 movdqa xmm8, xmm13 1166 psrld xmm13, 8 1167 pslld xmm8, 24 1168 pxor xmm13, xmm8 1169 movdqa xmm8, xmm14 1170 psrld xmm14, 8 1171 pslld xmm8, 24 1172 pxor xmm14, xmm8 1173 paddd xmm10, xmm15 1174 paddd xmm11, xmm12 1175 movdqa xmm8, xmmword ptr [rsp+0x100] 1176 paddd xmm8, xmm13 1177 paddd xmm9, xmm14 1178 pxor xmm5, xmm10 1179 pxor xmm6, xmm11 1180 pxor xmm7, xmm8 1181 pxor xmm4, xmm9 1182 movdqa xmmword ptr [rsp+0x100], xmm8 1183 movdqa xmm8, xmm5 1184 psrld xmm8, 7 1185 pslld xmm5, 25 1186 por xmm5, xmm8 1187 movdqa xmm8, xmm6 1188 psrld xmm8, 7 1189 pslld xmm6, 25 1190 por xmm6, xmm8 1191 movdqa xmm8, xmm7 1192 psrld xmm8, 7 1193 pslld xmm7, 25 1194 por xmm7, xmm8 1195 movdqa xmm8, xmm4 1196 psrld xmm8, 7 1197 pslld xmm4, 25 1198 por xmm4, xmm8 1199 paddd xmm0, xmmword ptr [rsp+0x90] 1200 paddd xmm1, xmmword ptr [rsp+0xB0] 1201 paddd xmm2, xmmword ptr [rsp+0x80] 1202 paddd xmm3, xmmword ptr [rsp+0xF0] 1203 paddd xmm0, xmm4 1204 paddd xmm1, xmm5 1205 paddd xmm2, xmm6 1206 paddd xmm3, xmm7 1207 pxor xmm12, xmm0 1208 pxor xmm13, xmm1 1209 pxor xmm14, xmm2 1210 pxor xmm15, xmm3 1211 pshuflw xmm12, xmm12, 0xB1 1212 pshufhw xmm12, xmm12, 0xB1 1213 pshuflw xmm13, xmm13, 0xB1 1214 pshufhw xmm13, xmm13, 0xB1 1215 pshuflw xmm14, xmm14, 0xB1 1216 pshufhw xmm14, xmm14, 0xB1 1217 pshuflw xmm15, xmm15, 0xB1 1218 pshufhw xmm15, xmm15, 0xB1 1219 movdqa xmm8, xmmword ptr [rsp+0x100] 1220 paddd 
xmm8, xmm12 1221 paddd xmm9, xmm13 1222 paddd xmm10, xmm14 1223 paddd xmm11, xmm15 1224 pxor xmm4, xmm8 1225 pxor xmm5, xmm9 1226 pxor xmm6, xmm10 1227 pxor xmm7, xmm11 1228 movdqa xmmword ptr [rsp+0x100], xmm8 1229 movdqa xmm8, xmm4 1230 psrld xmm8, 12 1231 pslld xmm4, 20 1232 por xmm4, xmm8 1233 movdqa xmm8, xmm5 1234 psrld xmm8, 12 1235 pslld xmm5, 20 1236 por xmm5, xmm8 1237 movdqa xmm8, xmm6 1238 psrld xmm8, 12 1239 pslld xmm6, 20 1240 por xmm6, xmm8 1241 movdqa xmm8, xmm7 1242 psrld xmm8, 12 1243 pslld xmm7, 20 1244 por xmm7, xmm8 1245 paddd xmm0, xmmword ptr [rsp+0xE0] 1246 paddd xmm1, xmmword ptr [rsp+0x50] 1247 paddd xmm2, xmmword ptr [rsp+0xC0] 1248 paddd xmm3, xmmword ptr [rsp+0x10] 1249 paddd xmm0, xmm4 1250 paddd xmm1, xmm5 1251 paddd xmm2, xmm6 1252 paddd xmm3, xmm7 1253 pxor xmm12, xmm0 1254 pxor xmm13, xmm1 1255 pxor xmm14, xmm2 1256 pxor xmm15, xmm3 1257 movdqa xmm8, xmm12 1258 psrld xmm12, 8 1259 pslld xmm8, 24 1260 pxor xmm12, xmm8 1261 movdqa xmm8, xmm13 1262 psrld xmm13, 8 1263 pslld xmm8, 24 1264 pxor xmm13, xmm8 1265 movdqa xmm8, xmm14 1266 psrld xmm14, 8 1267 pslld xmm8, 24 1268 pxor xmm14, xmm8 1269 movdqa xmm8, xmm15 1270 psrld xmm15, 8 1271 pslld xmm8, 24 1272 pxor xmm15, xmm8 1273 movdqa xmm8, xmmword ptr [rsp+0x100] 1274 paddd xmm8, xmm12 1275 paddd xmm9, xmm13 1276 paddd xmm10, xmm14 1277 paddd xmm11, xmm15 1278 pxor xmm4, xmm8 1279 pxor xmm5, xmm9 1280 pxor xmm6, xmm10 1281 pxor xmm7, xmm11 1282 movdqa xmmword ptr [rsp+0x100], xmm8 1283 movdqa xmm8, xmm4 1284 psrld xmm8, 7 1285 pslld xmm4, 25 1286 por xmm4, xmm8 1287 movdqa xmm8, xmm5 1288 psrld xmm8, 7 1289 pslld xmm5, 25 1290 por xmm5, xmm8 1291 movdqa xmm8, xmm6 1292 psrld xmm8, 7 1293 pslld xmm6, 25 1294 por xmm6, xmm8 1295 movdqa xmm8, xmm7 1296 psrld xmm8, 7 1297 pslld xmm7, 25 1298 por xmm7, xmm8 1299 paddd xmm0, xmmword ptr [rsp+0xD0] 1300 paddd xmm1, xmmword ptr [rsp] 1301 paddd xmm2, xmmword ptr [rsp+0x20] 1302 paddd xmm3, xmmword ptr [rsp+0x40] 1303 paddd xmm0, xmm5 1304 
paddd xmm1, xmm6 1305 paddd xmm2, xmm7 1306 paddd xmm3, xmm4 1307 pxor xmm15, xmm0 1308 pxor xmm12, xmm1 1309 pxor xmm13, xmm2 1310 pxor xmm14, xmm3 1311 pshuflw xmm15, xmm15, 0xB1 1312 pshufhw xmm15, xmm15, 0xB1 1313 pshuflw xmm12, xmm12, 0xB1 1314 pshufhw xmm12, xmm12, 0xB1 1315 pshuflw xmm13, xmm13, 0xB1 1316 pshufhw xmm13, xmm13, 0xB1 1317 pshuflw xmm14, xmm14, 0xB1 1318 pshufhw xmm14, xmm14, 0xB1 1319 paddd xmm10, xmm15 1320 paddd xmm11, xmm12 1321 movdqa xmm8, xmmword ptr [rsp+0x100] 1322 paddd xmm8, xmm13 1323 paddd xmm9, xmm14 1324 pxor xmm5, xmm10 1325 pxor xmm6, xmm11 1326 pxor xmm7, xmm8 1327 pxor xmm4, xmm9 1328 movdqa xmmword ptr [rsp+0x100], xmm8 1329 movdqa xmm8, xmm5 1330 psrld xmm8, 12 1331 pslld xmm5, 20 1332 por xmm5, xmm8 1333 movdqa xmm8, xmm6 1334 psrld xmm8, 12 1335 pslld xmm6, 20 1336 por xmm6, xmm8 1337 movdqa xmm8, xmm7 1338 psrld xmm8, 12 1339 pslld xmm7, 20 1340 por xmm7, xmm8 1341 movdqa xmm8, xmm4 1342 psrld xmm8, 12 1343 pslld xmm4, 20 1344 por xmm4, xmm8 1345 paddd xmm0, xmmword ptr [rsp+0x30] 1346 paddd xmm1, xmmword ptr [rsp+0xA0] 1347 paddd xmm2, xmmword ptr [rsp+0x60] 1348 paddd xmm3, xmmword ptr [rsp+0x70] 1349 paddd xmm0, xmm5 1350 paddd xmm1, xmm6 1351 paddd xmm2, xmm7 1352 paddd xmm3, xmm4 1353 pxor xmm15, xmm0 1354 pxor xmm12, xmm1 1355 pxor xmm13, xmm2 1356 pxor xmm14, xmm3 1357 movdqa xmm8, xmm15 1358 psrld xmm15, 8 1359 pslld xmm8, 24 1360 pxor xmm15, xmm8 1361 movdqa xmm8, xmm12 1362 psrld xmm12, 8 1363 pslld xmm8, 24 1364 pxor xmm12, xmm8 1365 movdqa xmm8, xmm13 1366 psrld xmm13, 8 1367 pslld xmm8, 24 1368 pxor xmm13, xmm8 1369 movdqa xmm8, xmm14 1370 psrld xmm14, 8 1371 pslld xmm8, 24 1372 pxor xmm14, xmm8 1373 paddd xmm10, xmm15 1374 paddd xmm11, xmm12 1375 movdqa xmm8, xmmword ptr [rsp+0x100] 1376 paddd xmm8, xmm13 1377 paddd xmm9, xmm14 1378 pxor xmm5, xmm10 1379 pxor xmm6, xmm11 1380 pxor xmm7, xmm8 1381 pxor xmm4, xmm9 1382 movdqa xmmword ptr [rsp+0x100], xmm8 1383 movdqa xmm8, xmm5 1384 psrld xmm8, 7 1385 pslld 
xmm5, 25 1386 por xmm5, xmm8 1387 movdqa xmm8, xmm6 1388 psrld xmm8, 7 1389 pslld xmm6, 25 1390 por xmm6, xmm8 1391 movdqa xmm8, xmm7 1392 psrld xmm8, 7 1393 pslld xmm7, 25 1394 por xmm7, xmm8 1395 movdqa xmm8, xmm4 1396 psrld xmm8, 7 1397 pslld xmm4, 25 1398 por xmm4, xmm8 1399 paddd xmm0, xmmword ptr [rsp+0xB0] 1400 paddd xmm1, xmmword ptr [rsp+0x50] 1401 paddd xmm2, xmmword ptr [rsp+0x10] 1402 paddd xmm3, xmmword ptr [rsp+0x80] 1403 paddd xmm0, xmm4 1404 paddd xmm1, xmm5 1405 paddd xmm2, xmm6 1406 paddd xmm3, xmm7 1407 pxor xmm12, xmm0 1408 pxor xmm13, xmm1 1409 pxor xmm14, xmm2 1410 pxor xmm15, xmm3 1411 pshuflw xmm12, xmm12, 0xB1 1412 pshufhw xmm12, xmm12, 0xB1 1413 pshuflw xmm13, xmm13, 0xB1 1414 pshufhw xmm13, xmm13, 0xB1 1415 pshuflw xmm14, xmm14, 0xB1 1416 pshufhw xmm14, xmm14, 0xB1 1417 pshuflw xmm15, xmm15, 0xB1 1418 pshufhw xmm15, xmm15, 0xB1 1419 movdqa xmm8, xmmword ptr [rsp+0x100] 1420 paddd xmm8, xmm12 1421 paddd xmm9, xmm13 1422 paddd xmm10, xmm14 1423 paddd xmm11, xmm15 1424 pxor xmm4, xmm8 1425 pxor xmm5, xmm9 1426 pxor xmm6, xmm10 1427 pxor xmm7, xmm11 1428 movdqa xmmword ptr [rsp+0x100], xmm8 1429 movdqa xmm8, xmm4 1430 psrld xmm8, 12 1431 pslld xmm4, 20 1432 por xmm4, xmm8 1433 movdqa xmm8, xmm5 1434 psrld xmm8, 12 1435 pslld xmm5, 20 1436 por xmm5, xmm8 1437 movdqa xmm8, xmm6 1438 psrld xmm8, 12 1439 pslld xmm6, 20 1440 por xmm6, xmm8 1441 movdqa xmm8, xmm7 1442 psrld xmm8, 12 1443 pslld xmm7, 20 1444 por xmm7, xmm8 1445 paddd xmm0, xmmword ptr [rsp+0xF0] 1446 paddd xmm1, xmmword ptr [rsp] 1447 paddd xmm2, xmmword ptr [rsp+0x90] 1448 paddd xmm3, xmmword ptr [rsp+0x60] 1449 paddd xmm0, xmm4 1450 paddd xmm1, xmm5 1451 paddd xmm2, xmm6 1452 paddd xmm3, xmm7 1453 pxor xmm12, xmm0 1454 pxor xmm13, xmm1 1455 pxor xmm14, xmm2 1456 pxor xmm15, xmm3 1457 movdqa xmm8, xmm12 1458 psrld xmm12, 8 1459 pslld xmm8, 24 1460 pxor xmm12, xmm8 1461 movdqa xmm8, xmm13 1462 psrld xmm13, 8 1463 pslld xmm8, 24 1464 pxor xmm13, xmm8 1465 movdqa xmm8, xmm14 1466 
psrld xmm14, 8 1467 pslld xmm8, 24 1468 pxor xmm14, xmm8 1469 movdqa xmm8, xmm15 1470 psrld xmm15, 8 1471 pslld xmm8, 24 1472 pxor xmm15, xmm8 1473 movdqa xmm8, xmmword ptr [rsp+0x100] 1474 paddd xmm8, xmm12 1475 paddd xmm9, xmm13 1476 paddd xmm10, xmm14 1477 paddd xmm11, xmm15 1478 pxor xmm4, xmm8 1479 pxor xmm5, xmm9 1480 pxor xmm6, xmm10 1481 pxor xmm7, xmm11 1482 movdqa xmmword ptr [rsp+0x100], xmm8 1483 movdqa xmm8, xmm4 1484 psrld xmm8, 7 1485 pslld xmm4, 25 1486 por xmm4, xmm8 1487 movdqa xmm8, xmm5 1488 psrld xmm8, 7 1489 pslld xmm5, 25 1490 por xmm5, xmm8 1491 movdqa xmm8, xmm6 1492 psrld xmm8, 7 1493 pslld xmm6, 25 1494 por xmm6, xmm8 1495 movdqa xmm8, xmm7 1496 psrld xmm8, 7 1497 pslld xmm7, 25 1498 por xmm7, xmm8 1499 paddd xmm0, xmmword ptr [rsp+0xE0] 1500 paddd xmm1, xmmword ptr [rsp+0x20] 1501 paddd xmm2, xmmword ptr [rsp+0x30] 1502 paddd xmm3, xmmword ptr [rsp+0x70] 1503 paddd xmm0, xmm5 1504 paddd xmm1, xmm6 1505 paddd xmm2, xmm7 1506 paddd xmm3, xmm4 1507 pxor xmm15, xmm0 1508 pxor xmm12, xmm1 1509 pxor xmm13, xmm2 1510 pxor xmm14, xmm3 1511 pshuflw xmm15, xmm15, 0xB1 1512 pshufhw xmm15, xmm15, 0xB1 1513 pshuflw xmm12, xmm12, 0xB1 1514 pshufhw xmm12, xmm12, 0xB1 1515 pshuflw xmm13, xmm13, 0xB1 1516 pshufhw xmm13, xmm13, 0xB1 1517 pshuflw xmm14, xmm14, 0xB1 1518 pshufhw xmm14, xmm14, 0xB1 1519 paddd xmm10, xmm15 1520 paddd xmm11, xmm12 1521 movdqa xmm8, xmmword ptr [rsp+0x100] 1522 paddd xmm8, xmm13 1523 paddd xmm9, xmm14 1524 pxor xmm5, xmm10 1525 pxor xmm6, xmm11 1526 pxor xmm7, xmm8 1527 pxor xmm4, xmm9 1528 movdqa xmmword ptr [rsp+0x100], xmm8 1529 movdqa xmm8, xmm5 1530 psrld xmm8, 12 1531 pslld xmm5, 20 1532 por xmm5, xmm8 1533 movdqa xmm8, xmm6 1534 psrld xmm8, 12 1535 pslld xmm6, 20 1536 por xmm6, xmm8 1537 movdqa xmm8, xmm7 1538 psrld xmm8, 12 1539 pslld xmm7, 20 1540 por xmm7, xmm8 1541 movdqa xmm8, xmm4 1542 psrld xmm8, 12 1543 pslld xmm4, 20 1544 por xmm4, xmm8 1545 paddd xmm0, xmmword ptr [rsp+0xA0] 1546 paddd xmm1, xmmword ptr 
[rsp+0xC0] 1547 paddd xmm2, xmmword ptr [rsp+0x40] 1548 paddd xmm3, xmmword ptr [rsp+0xD0] 1549 paddd xmm0, xmm5 1550 paddd xmm1, xmm6 1551 paddd xmm2, xmm7 1552 paddd xmm3, xmm4 1553 pxor xmm15, xmm0 1554 pxor xmm12, xmm1 1555 pxor xmm13, xmm2 1556 pxor xmm14, xmm3 1557 movdqa xmm8, xmm15 1558 psrld xmm15, 8 1559 pslld xmm8, 24 1560 pxor xmm15, xmm8 1561 movdqa xmm8, xmm12 1562 psrld xmm12, 8 1563 pslld xmm8, 24 1564 pxor xmm12, xmm8 1565 movdqa xmm8, xmm13 1566 psrld xmm13, 8 1567 pslld xmm8, 24 1568 pxor xmm13, xmm8 1569 movdqa xmm8, xmm14 1570 psrld xmm14, 8 1571 pslld xmm8, 24 1572 pxor xmm14, xmm8 1573 paddd xmm10, xmm15 1574 paddd xmm11, xmm12 1575 movdqa xmm8, xmmword ptr [rsp+0x100] 1576 paddd xmm8, xmm13 1577 paddd xmm9, xmm14 1578 pxor xmm5, xmm10 1579 pxor xmm6, xmm11 1580 pxor xmm7, xmm8 1581 pxor xmm4, xmm9 1582 pxor xmm0, xmm8 1583 pxor xmm1, xmm9 1584 pxor xmm2, xmm10 1585 pxor xmm3, xmm11 1586 movdqa xmm8, xmm5 1587 psrld xmm8, 7 1588 pslld xmm5, 25 1589 por xmm5, xmm8 1590 movdqa xmm8, xmm6 1591 psrld xmm8, 7 1592 pslld xmm6, 25 1593 por xmm6, xmm8 1594 movdqa xmm8, xmm7 1595 psrld xmm8, 7 1596 pslld xmm7, 25 1597 por xmm7, xmm8 1598 movdqa xmm8, xmm4 1599 psrld xmm8, 7 1600 pslld xmm4, 25 1601 por xmm4, xmm8 1602 pxor xmm4, xmm12 1603 pxor xmm5, xmm13 1604 pxor xmm6, xmm14 1605 pxor xmm7, xmm15 1606 mov eax, r13d 1607 jne 9b 1608 movdqa xmm9, xmm0 1609 punpckldq xmm0, xmm1 1610 punpckhdq xmm9, xmm1 1611 movdqa xmm11, xmm2 1612 punpckldq xmm2, xmm3 1613 punpckhdq xmm11, xmm3 1614 movdqa xmm1, xmm0 1615 punpcklqdq xmm0, xmm2 1616 punpckhqdq xmm1, xmm2 1617 movdqa xmm3, xmm9 1618 punpcklqdq xmm9, xmm11 1619 punpckhqdq xmm3, xmm11 1620 movdqu xmmword ptr [rbx], xmm0 1621 movdqu xmmword ptr [rbx+0x20], xmm1 1622 movdqu xmmword ptr [rbx+0x40], xmm9 1623 movdqu xmmword ptr [rbx+0x60], xmm3 1624 movdqa xmm9, xmm4 1625 punpckldq xmm4, xmm5 1626 punpckhdq xmm9, xmm5 1627 movdqa xmm11, xmm6 1628 punpckldq xmm6, xmm7 1629 punpckhdq xmm11, xmm7 1630 movdqa 
xmm5, xmm4 1631 punpcklqdq xmm4, xmm6 1632 punpckhqdq xmm5, xmm6 1633 movdqa xmm7, xmm9 1634 punpcklqdq xmm9, xmm11 1635 punpckhqdq xmm7, xmm11 1636 movdqu xmmword ptr [rbx+0x10], xmm4 1637 movdqu xmmword ptr [rbx+0x30], xmm5 1638 movdqu xmmword ptr [rbx+0x50], xmm9 1639 movdqu xmmword ptr [rbx+0x70], xmm7 1640 movdqa xmm1, xmmword ptr [rsp+0x110] 1641 movdqa xmm0, xmm1 1642 paddd xmm1, xmmword ptr [rsp+0x150] 1643 movdqa xmmword ptr [rsp+0x110], xmm1 1644 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] 1645 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] 1646 pcmpgtd xmm0, xmm1 1647 movdqa xmm1, xmmword ptr [rsp+0x120] 1648 psubd xmm1, xmm0 1649 movdqa xmmword ptr [rsp+0x120], xmm1 1650 add rbx, 128 1651 add rdi, 32 1652 sub rsi, 4 1653 cmp rsi, 4 1654 jnc 2b 1655 test rsi, rsi 1656 jnz 3f 16574: 1658 mov rsp, rbp 1659 pop rbp 1660 pop rbx 1661 pop r12 1662 pop r13 1663 pop r14 1664 pop r15 1665 ret 1666.p2align 5 16673: 1668 test esi, 0x2 1669 je 3f 1670 movups xmm0, xmmword ptr [rcx] 1671 movups xmm1, xmmword ptr [rcx+0x10] 1672 movaps xmm8, xmm0 1673 movaps xmm9, xmm1 1674 movd xmm13, dword ptr [rsp+0x110] 1675 movd xmm14, dword ptr [rsp+0x120] 1676 punpckldq xmm13, xmm14 1677 movaps xmmword ptr [rsp], xmm13 1678 movd xmm14, dword ptr [rsp+0x114] 1679 movd xmm13, dword ptr [rsp+0x124] 1680 punpckldq xmm14, xmm13 1681 movaps xmmword ptr [rsp+0x10], xmm14 1682 mov r8, qword ptr [rdi] 1683 mov r9, qword ptr [rdi+0x8] 1684 movzx eax, byte ptr [rbp+0x40] 1685 or eax, r13d 1686 xor edx, edx 16872: 1688 mov r14d, eax 1689 or eax, r12d 1690 add rdx, 64 1691 cmp rdx, r15 1692 cmovne eax, r14d 1693 movaps xmm2, xmmword ptr [BLAKE3_IV+rip] 1694 movaps xmm10, xmm2 1695 movups xmm4, xmmword ptr [r8+rdx-0x40] 1696 movups xmm5, xmmword ptr [r8+rdx-0x30] 1697 movaps xmm3, xmm4 1698 shufps xmm4, xmm5, 136 1699 shufps xmm3, xmm5, 221 1700 movaps xmm5, xmm3 1701 movups xmm6, xmmword ptr [r8+rdx-0x20] 1702 movups xmm7, xmmword ptr [r8+rdx-0x10] 1703 movaps xmm3, xmm6 1704 shufps xmm6, xmm7, 
136 1705 pshufd xmm6, xmm6, 0x93 1706 shufps xmm3, xmm7, 221 1707 pshufd xmm7, xmm3, 0x93 1708 movups xmm12, xmmword ptr [r9+rdx-0x40] 1709 movups xmm13, xmmword ptr [r9+rdx-0x30] 1710 movaps xmm11, xmm12 1711 shufps xmm12, xmm13, 136 1712 shufps xmm11, xmm13, 221 1713 movaps xmm13, xmm11 1714 movups xmm14, xmmword ptr [r9+rdx-0x20] 1715 movups xmm15, xmmword ptr [r9+rdx-0x10] 1716 movaps xmm11, xmm14 1717 shufps xmm14, xmm15, 136 1718 pshufd xmm14, xmm14, 0x93 1719 shufps xmm11, xmm15, 221 1720 pshufd xmm15, xmm11, 0x93 1721 shl rax, 0x20 1722 or rax, 0x40 1723 movq xmm3, rax 1724 movdqa xmmword ptr [rsp+0x20], xmm3 1725 movaps xmm3, xmmword ptr [rsp] 1726 movaps xmm11, xmmword ptr [rsp+0x10] 1727 punpcklqdq xmm3, xmmword ptr [rsp+0x20] 1728 punpcklqdq xmm11, xmmword ptr [rsp+0x20] 1729 mov al, 7 17309: 1731 paddd xmm0, xmm4 1732 paddd xmm8, xmm12 1733 movaps xmmword ptr [rsp+0x20], xmm4 1734 movaps xmmword ptr [rsp+0x30], xmm12 1735 paddd xmm0, xmm1 1736 paddd xmm8, xmm9 1737 pxor xmm3, xmm0 1738 pxor xmm11, xmm8 1739 pshuflw xmm3, xmm3, 0xB1 1740 pshufhw xmm3, xmm3, 0xB1 1741 pshuflw xmm11, xmm11, 0xB1 1742 pshufhw xmm11, xmm11, 0xB1 1743 paddd xmm2, xmm3 1744 paddd xmm10, xmm11 1745 pxor xmm1, xmm2 1746 pxor xmm9, xmm10 1747 movdqa xmm4, xmm1 1748 pslld xmm1, 20 1749 psrld xmm4, 12 1750 por xmm1, xmm4 1751 movdqa xmm4, xmm9 1752 pslld xmm9, 20 1753 psrld xmm4, 12 1754 por xmm9, xmm4 1755 paddd xmm0, xmm5 1756 paddd xmm8, xmm13 1757 movaps xmmword ptr [rsp+0x40], xmm5 1758 movaps xmmword ptr [rsp+0x50], xmm13 1759 paddd xmm0, xmm1 1760 paddd xmm8, xmm9 1761 pxor xmm3, xmm0 1762 pxor xmm11, xmm8 1763 movdqa xmm13, xmm3 1764 psrld xmm3, 8 1765 pslld xmm13, 24 1766 pxor xmm3, xmm13 1767 movdqa xmm13, xmm11 1768 psrld xmm11, 8 1769 pslld xmm13, 24 1770 pxor xmm11, xmm13 1771 paddd xmm2, xmm3 1772 paddd xmm10, xmm11 1773 pxor xmm1, xmm2 1774 pxor xmm9, xmm10 1775 movdqa xmm4, xmm1 1776 pslld xmm1, 25 1777 psrld xmm4, 7 1778 por xmm1, xmm4 1779 movdqa xmm4, xmm9 1780 
pslld xmm9, 25 1781 psrld xmm4, 7 1782 por xmm9, xmm4 1783 pshufd xmm0, xmm0, 0x93 1784 pshufd xmm8, xmm8, 0x93 1785 pshufd xmm3, xmm3, 0x4E 1786 pshufd xmm11, xmm11, 0x4E 1787 pshufd xmm2, xmm2, 0x39 1788 pshufd xmm10, xmm10, 0x39 1789 paddd xmm0, xmm6 1790 paddd xmm8, xmm14 1791 paddd xmm0, xmm1 1792 paddd xmm8, xmm9 1793 pxor xmm3, xmm0 1794 pxor xmm11, xmm8 1795 pshuflw xmm3, xmm3, 0xB1 1796 pshufhw xmm3, xmm3, 0xB1 1797 pshuflw xmm11, xmm11, 0xB1 1798 pshufhw xmm11, xmm11, 0xB1 1799 paddd xmm2, xmm3 1800 paddd xmm10, xmm11 1801 pxor xmm1, xmm2 1802 pxor xmm9, xmm10 1803 movdqa xmm4, xmm1 1804 pslld xmm1, 20 1805 psrld xmm4, 12 1806 por xmm1, xmm4 1807 movdqa xmm4, xmm9 1808 pslld xmm9, 20 1809 psrld xmm4, 12 1810 por xmm9, xmm4 1811 paddd xmm0, xmm7 1812 paddd xmm8, xmm15 1813 paddd xmm0, xmm1 1814 paddd xmm8, xmm9 1815 pxor xmm3, xmm0 1816 pxor xmm11, xmm8 1817 movdqa xmm13, xmm3 1818 psrld xmm3, 8 1819 pslld xmm13, 24 1820 pxor xmm3, xmm13 1821 movdqa xmm13, xmm11 1822 psrld xmm11, 8 1823 pslld xmm13, 24 1824 pxor xmm11, xmm13 1825 paddd xmm2, xmm3 1826 paddd xmm10, xmm11 1827 pxor xmm1, xmm2 1828 pxor xmm9, xmm10 1829 movdqa xmm4, xmm1 1830 pslld xmm1, 25 1831 psrld xmm4, 7 1832 por xmm1, xmm4 1833 movdqa xmm4, xmm9 1834 pslld xmm9, 25 1835 psrld xmm4, 7 1836 por xmm9, xmm4 1837 pshufd xmm0, xmm0, 0x39 1838 pshufd xmm8, xmm8, 0x39 1839 pshufd xmm3, xmm3, 0x4E 1840 pshufd xmm11, xmm11, 0x4E 1841 pshufd xmm2, xmm2, 0x93 1842 pshufd xmm10, xmm10, 0x93 1843 dec al 1844 je 9f 1845 movdqa xmm12, xmmword ptr [rsp+0x20] 1846 movdqa xmm5, xmmword ptr [rsp+0x40] 1847 pshufd xmm13, xmm12, 0x0F 1848 shufps xmm12, xmm5, 214 1849 pshufd xmm4, xmm12, 0x39 1850 movdqa xmm12, xmm6 1851 shufps xmm12, xmm7, 250 1852 pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip] 1853 pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip] 1854 por xmm13, xmm12 1855 movdqa xmmword ptr [rsp+0x20], xmm13 1856 movdqa xmm12, xmm7 1857 punpcklqdq xmm12, xmm5 1858 movdqa xmm13, xmm6 1859 pand xmm12, 
xmmword ptr [PBLENDW_0x3F_MASK+rip] 1860 pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip] 1861 por xmm12, xmm13 1862 pshufd xmm12, xmm12, 0x78 1863 punpckhdq xmm5, xmm7 1864 punpckldq xmm6, xmm5 1865 pshufd xmm7, xmm6, 0x1E 1866 movdqa xmmword ptr [rsp+0x40], xmm12 1867 movdqa xmm5, xmmword ptr [rsp+0x30] 1868 movdqa xmm13, xmmword ptr [rsp+0x50] 1869 pshufd xmm6, xmm5, 0x0F 1870 shufps xmm5, xmm13, 214 1871 pshufd xmm12, xmm5, 0x39 1872 movdqa xmm5, xmm14 1873 shufps xmm5, xmm15, 250 1874 pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] 1875 pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] 1876 por xmm6, xmm5 1877 movdqa xmm5, xmm15 1878 punpcklqdq xmm5, xmm13 1879 movdqa xmmword ptr [rsp+0x30], xmm2 1880 movdqa xmm2, xmm14 1881 pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] 1882 pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] 1883 por xmm5, xmm2 1884 movdqa xmm2, xmmword ptr [rsp+0x30] 1885 pshufd xmm5, xmm5, 0x78 1886 punpckhdq xmm13, xmm15 1887 punpckldq xmm14, xmm13 1888 pshufd xmm15, xmm14, 0x1E 1889 movdqa xmm13, xmm6 1890 movdqa xmm14, xmm5 1891 movdqa xmm5, xmmword ptr [rsp+0x20] 1892 movdqa xmm6, xmmword ptr [rsp+0x40] 1893 jmp 9b 18949: 1895 pxor xmm0, xmm2 1896 pxor xmm1, xmm3 1897 pxor xmm8, xmm10 1898 pxor xmm9, xmm11 1899 mov eax, r13d 1900 cmp rdx, r15 1901 jne 2b 1902 movups xmmword ptr [rbx], xmm0 1903 movups xmmword ptr [rbx+0x10], xmm1 1904 movups xmmword ptr [rbx+0x20], xmm8 1905 movups xmmword ptr [rbx+0x30], xmm9 1906 mov eax, dword ptr [rsp+0x130] 1907 neg eax 1908 mov r10d, dword ptr [rsp+0x110+8*rax] 1909 mov r11d, dword ptr [rsp+0x120+8*rax] 1910 mov dword ptr [rsp+0x110], r10d 1911 mov dword ptr [rsp+0x120], r11d 1912 add rdi, 16 1913 add rbx, 64 1914 sub rsi, 2 19153: 1916 test esi, 0x1 1917 je 4b 1918 movups xmm0, xmmword ptr [rcx] 1919 movups xmm1, xmmword ptr [rcx+0x10] 1920 movd xmm13, dword ptr [rsp+0x110] 1921 movd xmm14, dword ptr [rsp+0x120] 1922 punpckldq xmm13, xmm14 1923 mov r8, qword ptr [rdi] 1924 movzx eax, byte ptr [rbp+0x40] 
1925 or eax, r13d 1926 xor edx, edx 19272: 1928 mov r14d, eax 1929 or eax, r12d 1930 add rdx, 64 1931 cmp rdx, r15 1932 cmovne eax, r14d 1933 movaps xmm2, xmmword ptr [BLAKE3_IV+rip] 1934 shl rax, 32 1935 or rax, 64 1936 movq xmm12, rax 1937 movdqa xmm3, xmm13 1938 punpcklqdq xmm3, xmm12 1939 movups xmm4, xmmword ptr [r8+rdx-0x40] 1940 movups xmm5, xmmword ptr [r8+rdx-0x30] 1941 movaps xmm8, xmm4 1942 shufps xmm4, xmm5, 136 1943 shufps xmm8, xmm5, 221 1944 movaps xmm5, xmm8 1945 movups xmm6, xmmword ptr [r8+rdx-0x20] 1946 movups xmm7, xmmword ptr [r8+rdx-0x10] 1947 movaps xmm8, xmm6 1948 shufps xmm6, xmm7, 136 1949 pshufd xmm6, xmm6, 0x93 1950 shufps xmm8, xmm7, 221 1951 pshufd xmm7, xmm8, 0x93 1952 mov al, 7 19539: 1954 paddd xmm0, xmm4 1955 paddd xmm0, xmm1 1956 pxor xmm3, xmm0 1957 pshuflw xmm3, xmm3, 0xB1 1958 pshufhw xmm3, xmm3, 0xB1 1959 paddd xmm2, xmm3 1960 pxor xmm1, xmm2 1961 movdqa xmm11, xmm1 1962 pslld xmm1, 20 1963 psrld xmm11, 12 1964 por xmm1, xmm11 1965 paddd xmm0, xmm5 1966 paddd xmm0, xmm1 1967 pxor xmm3, xmm0 1968 movdqa xmm14, xmm3 1969 psrld xmm3, 8 1970 pslld xmm14, 24 1971 pxor xmm3, xmm14 1972 paddd xmm2, xmm3 1973 pxor xmm1, xmm2 1974 movdqa xmm11, xmm1 1975 pslld xmm1, 25 1976 psrld xmm11, 7 1977 por xmm1, xmm11 1978 pshufd xmm0, xmm0, 0x93 1979 pshufd xmm3, xmm3, 0x4E 1980 pshufd xmm2, xmm2, 0x39 1981 paddd xmm0, xmm6 1982 paddd xmm0, xmm1 1983 pxor xmm3, xmm0 1984 pshuflw xmm3, xmm3, 0xB1 1985 pshufhw xmm3, xmm3, 0xB1 1986 paddd xmm2, xmm3 1987 pxor xmm1, xmm2 1988 movdqa xmm11, xmm1 1989 pslld xmm1, 20 1990 psrld xmm11, 12 1991 por xmm1, xmm11 1992 paddd xmm0, xmm7 1993 paddd xmm0, xmm1 1994 pxor xmm3, xmm0 1995 movdqa xmm14, xmm3 1996 psrld xmm3, 8 1997 pslld xmm14, 24 1998 pxor xmm3, xmm14 1999 paddd xmm2, xmm3 2000 pxor xmm1, xmm2 2001 movdqa xmm11, xmm1 2002 pslld xmm1, 25 2003 psrld xmm11, 7 2004 por xmm1, xmm11 2005 pshufd xmm0, xmm0, 0x39 2006 pshufd xmm3, xmm3, 0x4E 2007 pshufd xmm2, xmm2, 0x93 2008 dec al 2009 jz 9f 2010 
/* Remainder of blake3_hash_many_sse2 (its entry point, the labels 2: and 4:,
 * and the round label 9: that is jumped back to from here all lie above this
 * span).  The shuffle sequence below is the same one the two compress
 * routines run between passes: it rearranges the message registers
 * xmm4-xmm7 and then re-enters the round body. */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por     xmm9, xmm8
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa  xmm10, xmm6
        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por     xmm8, xmm10
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* All passes done: fold rows 2/3 into rows 0/1.  If rdx has not yet
         * reached r15, go around again for the next 64-byte block of this
         * input (eax is reloaded from r13d first); otherwise store the
         * 32-byte result at [rbx] and leave through the epilogue at 4:. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        mov     eax, r13d
        cmp     rdx, r15
        jne     2b
        movups  xmmword ptr [rbx], xmm0
        movups  xmmword ptr [rbx+0x10], xmm1
        jmp     4b

/*
 * blake3_compress_in_place_sse2
 *
 * Runs the 7-pass round loop over one 64-byte block and writes the 32-byte
 * result back over the state it started from.
 *
 * In (System V AMD64 argument registers, as consumed by the code below):
 *   rdi  pointer to 32 bytes of state; loaded at entry, overwritten at exit
 *   rsi  pointer to the 64-byte block (read with unaligned loads)
 *   dl   8-bit value placed in lane 2 of row 3 (presumably block_len;
 *        confirm against the C prototype)
 *   rcx  64-bit value placed in lanes 0-1 of row 3 (presumably the counter)
 *   r8b  8-bit value placed in lane 3 of row 3 (presumably flags)
 * Out:
 *   [rdi], [rdi+0x10] = row0 ^ row2, row1 ^ row3
 * Clobbers: rax, rdx, xmm0-xmm11, xmm14, flags
 */
.p2align 6
blake3_compress_in_place_sse2:
_blake3_compress_in_place_sse2:
        _CET_ENDBR
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /* Build rdx = dl | (r8b << 32).  Both byte arguments are
         * zero-extended explicitly, exactly as blake3_compress_xof_sse2 does:
         * the psABI does not promise that the caller cleared the bits above
         * an 8-bit argument, and any stray bit would otherwise end up in
         * row 3 (xmm3) and change the result. */
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax
        /* xmm3 = { rcx[31:0], rcx[63:32], dl, r8b } */
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4
        /* Load the block and split it into even/odd 32-bit words:
         * shufps 136 (0x88) keeps dwords 0,2 of each source, 221 (0xDD)
         * keeps dwords 1,3.  The second pair is additionally rotated by one
         * lane (pshufd 0x93). */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93
        mov     al, 7                   /* pass counter: round body runs 7 times */
9:
        /* First half: add message (xmm4, then xmm5) into row 0 and mix. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1        /* swap 16-bit halves = rotate each dword by 16 */
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1             /* rotate right by 12 */
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3             /* rotate right by 8 */
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1             /* rotate right by 7 */
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Rotate the lanes of rows 0, 3 and 2 so the second half pairs up
         * different lanes. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* Second half: same mixing with message xmm6, then xmm7. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Undo the lane rotation. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        dec     al
        jz      9f
        /* Rearrange the message registers xmm4-xmm7 for the next pass.  The
         * pand/pand/por triples stand in for pblendw, which SSE2 lacks. */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por     xmm9, xmm8
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa  xmm10, xmm6
        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por     xmm8, xmm10
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* Fold rows 2/3 into rows 0/1 and store back over the input state. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rdi], xmm0
        movups  xmmword ptr [rdi+0x10], xmm1
        ret

/*
 * blake3_compress_xof_sse2
 *
 * Same round loop as blake3_compress_in_place_sse2, but the input state is
 * left untouched and a 64-byte result is written to a separate buffer.
 *
 * In (System V AMD64 argument registers, as consumed by the code below):
 *   rdi  pointer to 32 bytes of state (read at entry and again at the end)
 *   rsi  pointer to the 64-byte block (read with unaligned loads)
 *   dl   8-bit value placed in lane 2 of row 3 (presumably block_len)
 *   rcx  64-bit value placed in lanes 0-1 of row 3 (presumably the counter)
 *   r8b  8-bit value placed in lane 3 of row 3 (presumably flags)
 *   r9   pointer to the 64-byte output buffer (unaligned stores)
 * Out:
 *   [r9 .. r9+0x3F] = row0 ^ row2, row1 ^ row3, row2 ^ [rdi], row3 ^ [rdi+0x10]
 * Clobbers: rax, rdx, xmm0-xmm11, xmm14, flags
 */
.p2align 6
blake3_compress_xof_sse2:
_blake3_compress_xof_sse2:
        _CET_ENDBR
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /* rdx = dl | (r8b << 32); only the low byte of each argument is used. */
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax
        /* xmm3 = { rcx[31:0], rcx[63:32], dl, r8b } */
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4
        /* Load the block and split it into even/odd 32-bit words; the second
         * pair is additionally rotated by one lane (pshufd 0x93). */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93
        mov     al, 7                   /* pass counter: round body runs 7 times */
9:
        /* First half: message xmm4, then xmm5. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1        /* rotate each dword by 16 */
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1             /* rotate right by 12 */
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3             /* rotate right by 8 */
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1             /* rotate right by 7 */
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Rotate the lanes of rows 0, 3 and 2 for the second half. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* Second half: message xmm6, then xmm7. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Undo the lane rotation. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        dec     al
        jz      9f
        /* Rearrange the message registers xmm4-xmm7 for the next pass
         * (pand/pand/por stands in for pblendw). */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por     xmm9, xmm8
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa  xmm10, xmm6
        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por     xmm8, xmm10
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* Reload the untouched input state; the message registers xmm4/xmm5
         * are free now.  Rows 0/1 get the usual fold, rows 2/3 are combined
         * with the input state, and all four rows go to [r9]. */
        movdqu  xmm4, xmmword ptr [rdi]
        movdqu  xmm5, xmmword ptr [rdi+0x10]
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        pxor    xmm2, xmm4
        pxor    xmm3, xmm5
        movups  xmmword ptr [r9], xmm0
        movups  xmmword ptr [r9+0x10], xmm1
        movups  xmmword ptr [r9+0x20], xmm2
        movups  xmmword ptr [r9+0x30], xmm3
        ret


/* Read-only constant tables.  The section starts 64-byte aligned and every
 * table is exactly 16 bytes, so each label is 16-byte aligned - required
 * because the tables are used as memory operands of movaps/movdqa/pand/pxor,
 * which fault on unaligned addresses. */
#ifdef __APPLE__
.static_data
#else
.section .rodata
#endif
.p2align 6
/* Loaded into row 2 (xmm2) by the single-block paths. */
BLAKE3_IV:
        .long  0x6A09E667, 0xBB67AE85
        .long  0x3C6EF372, 0xA54FF53A
/* Per-lane offsets {0,1,2,3} and the per-iteration step {4,4,4,4}; both are
 * masked against the broadcast of -r9d at the top of blake3_hash_many_sse2. */
ADD0:
        .long  0, 1, 2, 3
ADD1:
        .long  4, 4, 4, 4
/* Each IV word broadcast to all four lanes.  Not referenced in the visible
 * span; presumably used by the 4-way path of blake3_hash_many_sse2. */
BLAKE3_IV_0:
        .long  0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
BLAKE3_IV_1:
        .long  0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
BLAKE3_IV_2:
        .long  0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
BLAKE3_IV_3:
        .long  0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
/* 64 broadcast to all lanes.  Not referenced in the visible span; presumably
 * the block length operand of the 4-way path. */
BLAKE3_BLOCK_LEN:
        .long  64, 64, 64, 64
/* XORed into both operands before pcmpgtd so that the signed compare yields
 * an unsigned ordering (used for the counter carry). */
CMP_MSB_MASK:
        .long  0x80000000, 0x80000000, 0x80000000, 0x80000000
/* Dword-lane masks used in pand/pand/por triples in place of pblendw with
 * the immediate named in each label. */
PBLENDW_0x33_MASK:
        .long  0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
PBLENDW_0xCC_MASK:
        .long  0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
PBLENDW_0x3F_MASK:
        .long  0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
PBLENDW_0xC0_MASK:
        .long  0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF

#endif