1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22/* 23 * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 24 * Copyright (c) 2019-2020 Samuel Neves and Matthew Krupcale 25 * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de> 26 */ 27 28#if defined(HAVE_SSE2) 29 30#define _ASM 31#include <sys/asm_linkage.h> 32 33.intel_syntax noprefix 34 35SECTION_TEXT 36 37ENTRY_ALIGN(zfs_blake3_hash_many_sse2, 64) 38 ENDBR 39 push r15 40 push r14 41 push r13 42 push r12 43 push rbx 44 push rbp 45 mov rbp, rsp 46 sub rsp, 360 47 and rsp, 0xFFFFFFFFFFFFFFC0 48 neg r9d 49 movd xmm0, r9d 50 pshufd xmm0, xmm0, 0x00 51 movdqa xmmword ptr [rsp+0x130], xmm0 52 movdqa xmm1, xmm0 53 pand xmm1, xmmword ptr [ADD0+rip] 54 pand xmm0, xmmword ptr [ADD1+rip] 55 movdqa xmmword ptr [rsp+0x150], xmm0 56 movd xmm0, r8d 57 pshufd xmm0, xmm0, 0x00 58 paddd xmm0, xmm1 59 movdqa xmmword ptr [rsp+0x110], xmm0 60 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] 61 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] 62 pcmpgtd xmm1, xmm0 63 shr r8, 32 64 movd xmm2, r8d 65 pshufd xmm2, xmm2, 0x00 66 psubd xmm2, xmm1 67 movdqa xmmword ptr [rsp+0x120], xmm2 68 mov rbx, qword ptr [rbp+0x50] 69 mov r15, rdx 70 shl r15, 
6 71 movzx r13d, byte ptr [rbp+0x38] 72 movzx r12d, byte ptr [rbp+0x48] 73 cmp rsi, 4 74 jc 3f 752: 76 movdqu xmm3, xmmword ptr [rcx] 77 pshufd xmm0, xmm3, 0x00 78 pshufd xmm1, xmm3, 0x55 79 pshufd xmm2, xmm3, 0xAA 80 pshufd xmm3, xmm3, 0xFF 81 movdqu xmm7, xmmword ptr [rcx+0x10] 82 pshufd xmm4, xmm7, 0x00 83 pshufd xmm5, xmm7, 0x55 84 pshufd xmm6, xmm7, 0xAA 85 pshufd xmm7, xmm7, 0xFF 86 mov r8, qword ptr [rdi] 87 mov r9, qword ptr [rdi+0x8] 88 mov r10, qword ptr [rdi+0x10] 89 mov r11, qword ptr [rdi+0x18] 90 movzx eax, byte ptr [rbp+0x40] 91 or eax, r13d 92 xor edx, edx 939: 94 mov r14d, eax 95 or eax, r12d 96 add rdx, 64 97 cmp rdx, r15 98 cmovne eax, r14d 99 movdqu xmm8, xmmword ptr [r8+rdx-0x40] 100 movdqu xmm9, xmmword ptr [r9+rdx-0x40] 101 movdqu xmm10, xmmword ptr [r10+rdx-0x40] 102 movdqu xmm11, xmmword ptr [r11+rdx-0x40] 103 movdqa xmm12, xmm8 104 punpckldq xmm8, xmm9 105 punpckhdq xmm12, xmm9 106 movdqa xmm14, xmm10 107 punpckldq xmm10, xmm11 108 punpckhdq xmm14, xmm11 109 movdqa xmm9, xmm8 110 punpcklqdq xmm8, xmm10 111 punpckhqdq xmm9, xmm10 112 movdqa xmm13, xmm12 113 punpcklqdq xmm12, xmm14 114 punpckhqdq xmm13, xmm14 115 movdqa xmmword ptr [rsp], xmm8 116 movdqa xmmword ptr [rsp+0x10], xmm9 117 movdqa xmmword ptr [rsp+0x20], xmm12 118 movdqa xmmword ptr [rsp+0x30], xmm13 119 movdqu xmm8, xmmword ptr [r8+rdx-0x30] 120 movdqu xmm9, xmmword ptr [r9+rdx-0x30] 121 movdqu xmm10, xmmword ptr [r10+rdx-0x30] 122 movdqu xmm11, xmmword ptr [r11+rdx-0x30] 123 movdqa xmm12, xmm8 124 punpckldq xmm8, xmm9 125 punpckhdq xmm12, xmm9 126 movdqa xmm14, xmm10 127 punpckldq xmm10, xmm11 128 punpckhdq xmm14, xmm11 129 movdqa xmm9, xmm8 130 punpcklqdq xmm8, xmm10 131 punpckhqdq xmm9, xmm10 132 movdqa xmm13, xmm12 133 punpcklqdq xmm12, xmm14 134 punpckhqdq xmm13, xmm14 135 movdqa xmmword ptr [rsp+0x40], xmm8 136 movdqa xmmword ptr [rsp+0x50], xmm9 137 movdqa xmmword ptr [rsp+0x60], xmm12 138 movdqa xmmword ptr [rsp+0x70], xmm13 139 movdqu xmm8, xmmword ptr [r8+rdx-0x20] 
140 movdqu xmm9, xmmword ptr [r9+rdx-0x20] 141 movdqu xmm10, xmmword ptr [r10+rdx-0x20] 142 movdqu xmm11, xmmword ptr [r11+rdx-0x20] 143 movdqa xmm12, xmm8 144 punpckldq xmm8, xmm9 145 punpckhdq xmm12, xmm9 146 movdqa xmm14, xmm10 147 punpckldq xmm10, xmm11 148 punpckhdq xmm14, xmm11 149 movdqa xmm9, xmm8 150 punpcklqdq xmm8, xmm10 151 punpckhqdq xmm9, xmm10 152 movdqa xmm13, xmm12 153 punpcklqdq xmm12, xmm14 154 punpckhqdq xmm13, xmm14 155 movdqa xmmword ptr [rsp+0x80], xmm8 156 movdqa xmmword ptr [rsp+0x90], xmm9 157 movdqa xmmword ptr [rsp+0xA0], xmm12 158 movdqa xmmword ptr [rsp+0xB0], xmm13 159 movdqu xmm8, xmmword ptr [r8+rdx-0x10] 160 movdqu xmm9, xmmword ptr [r9+rdx-0x10] 161 movdqu xmm10, xmmword ptr [r10+rdx-0x10] 162 movdqu xmm11, xmmword ptr [r11+rdx-0x10] 163 movdqa xmm12, xmm8 164 punpckldq xmm8, xmm9 165 punpckhdq xmm12, xmm9 166 movdqa xmm14, xmm10 167 punpckldq xmm10, xmm11 168 punpckhdq xmm14, xmm11 169 movdqa xmm9, xmm8 170 punpcklqdq xmm8, xmm10 171 punpckhqdq xmm9, xmm10 172 movdqa xmm13, xmm12 173 punpcklqdq xmm12, xmm14 174 punpckhqdq xmm13, xmm14 175 movdqa xmmword ptr [rsp+0xC0], xmm8 176 movdqa xmmword ptr [rsp+0xD0], xmm9 177 movdqa xmmword ptr [rsp+0xE0], xmm12 178 movdqa xmmword ptr [rsp+0xF0], xmm13 179 movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] 180 movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] 181 movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] 182 movdqa xmm12, xmmword ptr [rsp+0x110] 183 movdqa xmm13, xmmword ptr [rsp+0x120] 184 movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] 185 movd xmm15, eax 186 pshufd xmm15, xmm15, 0x00 187 prefetcht0 [r8+rdx+0x80] 188 prefetcht0 [r9+rdx+0x80] 189 prefetcht0 [r10+rdx+0x80] 190 prefetcht0 [r11+rdx+0x80] 191 paddd xmm0, xmmword ptr [rsp] 192 paddd xmm1, xmmword ptr [rsp+0x20] 193 paddd xmm2, xmmword ptr [rsp+0x40] 194 paddd xmm3, xmmword ptr [rsp+0x60] 195 paddd xmm0, xmm4 196 paddd xmm1, xmm5 197 paddd xmm2, xmm6 198 paddd xmm3, xmm7 199 pxor xmm12, xmm0 200 pxor xmm13, xmm1 201 pxor xmm14, xmm2 202 
pxor xmm15, xmm3 203 pshuflw xmm12, xmm12, 0xB1 204 pshufhw xmm12, xmm12, 0xB1 205 pshuflw xmm13, xmm13, 0xB1 206 pshufhw xmm13, xmm13, 0xB1 207 pshuflw xmm14, xmm14, 0xB1 208 pshufhw xmm14, xmm14, 0xB1 209 pshuflw xmm15, xmm15, 0xB1 210 pshufhw xmm15, xmm15, 0xB1 211 movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] 212 paddd xmm8, xmm12 213 paddd xmm9, xmm13 214 paddd xmm10, xmm14 215 paddd xmm11, xmm15 216 pxor xmm4, xmm8 217 pxor xmm5, xmm9 218 pxor xmm6, xmm10 219 pxor xmm7, xmm11 220 movdqa xmmword ptr [rsp+0x100], xmm8 221 movdqa xmm8, xmm4 222 psrld xmm8, 12 223 pslld xmm4, 20 224 por xmm4, xmm8 225 movdqa xmm8, xmm5 226 psrld xmm8, 12 227 pslld xmm5, 20 228 por xmm5, xmm8 229 movdqa xmm8, xmm6 230 psrld xmm8, 12 231 pslld xmm6, 20 232 por xmm6, xmm8 233 movdqa xmm8, xmm7 234 psrld xmm8, 12 235 pslld xmm7, 20 236 por xmm7, xmm8 237 paddd xmm0, xmmword ptr [rsp+0x10] 238 paddd xmm1, xmmword ptr [rsp+0x30] 239 paddd xmm2, xmmword ptr [rsp+0x50] 240 paddd xmm3, xmmword ptr [rsp+0x70] 241 paddd xmm0, xmm4 242 paddd xmm1, xmm5 243 paddd xmm2, xmm6 244 paddd xmm3, xmm7 245 pxor xmm12, xmm0 246 pxor xmm13, xmm1 247 pxor xmm14, xmm2 248 pxor xmm15, xmm3 249 movdqa xmm8, xmm12 250 psrld xmm12, 8 251 pslld xmm8, 24 252 pxor xmm12, xmm8 253 movdqa xmm8, xmm13 254 psrld xmm13, 8 255 pslld xmm8, 24 256 pxor xmm13, xmm8 257 movdqa xmm8, xmm14 258 psrld xmm14, 8 259 pslld xmm8, 24 260 pxor xmm14, xmm8 261 movdqa xmm8, xmm15 262 psrld xmm15, 8 263 pslld xmm8, 24 264 pxor xmm15, xmm8 265 movdqa xmm8, xmmword ptr [rsp+0x100] 266 paddd xmm8, xmm12 267 paddd xmm9, xmm13 268 paddd xmm10, xmm14 269 paddd xmm11, xmm15 270 pxor xmm4, xmm8 271 pxor xmm5, xmm9 272 pxor xmm6, xmm10 273 pxor xmm7, xmm11 274 movdqa xmmword ptr [rsp+0x100], xmm8 275 movdqa xmm8, xmm4 276 psrld xmm8, 7 277 pslld xmm4, 25 278 por xmm4, xmm8 279 movdqa xmm8, xmm5 280 psrld xmm8, 7 281 pslld xmm5, 25 282 por xmm5, xmm8 283 movdqa xmm8, xmm6 284 psrld xmm8, 7 285 pslld xmm6, 25 286 por xmm6, xmm8 287 movdqa xmm8, 
xmm7 288 psrld xmm8, 7 289 pslld xmm7, 25 290 por xmm7, xmm8 291 paddd xmm0, xmmword ptr [rsp+0x80] 292 paddd xmm1, xmmword ptr [rsp+0xA0] 293 paddd xmm2, xmmword ptr [rsp+0xC0] 294 paddd xmm3, xmmword ptr [rsp+0xE0] 295 paddd xmm0, xmm5 296 paddd xmm1, xmm6 297 paddd xmm2, xmm7 298 paddd xmm3, xmm4 299 pxor xmm15, xmm0 300 pxor xmm12, xmm1 301 pxor xmm13, xmm2 302 pxor xmm14, xmm3 303 pshuflw xmm15, xmm15, 0xB1 304 pshufhw xmm15, xmm15, 0xB1 305 pshuflw xmm12, xmm12, 0xB1 306 pshufhw xmm12, xmm12, 0xB1 307 pshuflw xmm13, xmm13, 0xB1 308 pshufhw xmm13, xmm13, 0xB1 309 pshuflw xmm14, xmm14, 0xB1 310 pshufhw xmm14, xmm14, 0xB1 311 paddd xmm10, xmm15 312 paddd xmm11, xmm12 313 movdqa xmm8, xmmword ptr [rsp+0x100] 314 paddd xmm8, xmm13 315 paddd xmm9, xmm14 316 pxor xmm5, xmm10 317 pxor xmm6, xmm11 318 pxor xmm7, xmm8 319 pxor xmm4, xmm9 320 movdqa xmmword ptr [rsp+0x100], xmm8 321 movdqa xmm8, xmm5 322 psrld xmm8, 12 323 pslld xmm5, 20 324 por xmm5, xmm8 325 movdqa xmm8, xmm6 326 psrld xmm8, 12 327 pslld xmm6, 20 328 por xmm6, xmm8 329 movdqa xmm8, xmm7 330 psrld xmm8, 12 331 pslld xmm7, 20 332 por xmm7, xmm8 333 movdqa xmm8, xmm4 334 psrld xmm8, 12 335 pslld xmm4, 20 336 por xmm4, xmm8 337 paddd xmm0, xmmword ptr [rsp+0x90] 338 paddd xmm1, xmmword ptr [rsp+0xB0] 339 paddd xmm2, xmmword ptr [rsp+0xD0] 340 paddd xmm3, xmmword ptr [rsp+0xF0] 341 paddd xmm0, xmm5 342 paddd xmm1, xmm6 343 paddd xmm2, xmm7 344 paddd xmm3, xmm4 345 pxor xmm15, xmm0 346 pxor xmm12, xmm1 347 pxor xmm13, xmm2 348 pxor xmm14, xmm3 349 movdqa xmm8, xmm15 350 psrld xmm15, 8 351 pslld xmm8, 24 352 pxor xmm15, xmm8 353 movdqa xmm8, xmm12 354 psrld xmm12, 8 355 pslld xmm8, 24 356 pxor xmm12, xmm8 357 movdqa xmm8, xmm13 358 psrld xmm13, 8 359 pslld xmm8, 24 360 pxor xmm13, xmm8 361 movdqa xmm8, xmm14 362 psrld xmm14, 8 363 pslld xmm8, 24 364 pxor xmm14, xmm8 365 paddd xmm10, xmm15 366 paddd xmm11, xmm12 367 movdqa xmm8, xmmword ptr [rsp+0x100] 368 paddd xmm8, xmm13 369 paddd xmm9, xmm14 370 pxor 
xmm5, xmm10 371 pxor xmm6, xmm11 372 pxor xmm7, xmm8 373 pxor xmm4, xmm9 374 movdqa xmmword ptr [rsp+0x100], xmm8 375 movdqa xmm8, xmm5 376 psrld xmm8, 7 377 pslld xmm5, 25 378 por xmm5, xmm8 379 movdqa xmm8, xmm6 380 psrld xmm8, 7 381 pslld xmm6, 25 382 por xmm6, xmm8 383 movdqa xmm8, xmm7 384 psrld xmm8, 7 385 pslld xmm7, 25 386 por xmm7, xmm8 387 movdqa xmm8, xmm4 388 psrld xmm8, 7 389 pslld xmm4, 25 390 por xmm4, xmm8 391 paddd xmm0, xmmword ptr [rsp+0x20] 392 paddd xmm1, xmmword ptr [rsp+0x30] 393 paddd xmm2, xmmword ptr [rsp+0x70] 394 paddd xmm3, xmmword ptr [rsp+0x40] 395 paddd xmm0, xmm4 396 paddd xmm1, xmm5 397 paddd xmm2, xmm6 398 paddd xmm3, xmm7 399 pxor xmm12, xmm0 400 pxor xmm13, xmm1 401 pxor xmm14, xmm2 402 pxor xmm15, xmm3 403 pshuflw xmm12, xmm12, 0xB1 404 pshufhw xmm12, xmm12, 0xB1 405 pshuflw xmm13, xmm13, 0xB1 406 pshufhw xmm13, xmm13, 0xB1 407 pshuflw xmm14, xmm14, 0xB1 408 pshufhw xmm14, xmm14, 0xB1 409 pshuflw xmm15, xmm15, 0xB1 410 pshufhw xmm15, xmm15, 0xB1 411 movdqa xmm8, xmmword ptr [rsp+0x100] 412 paddd xmm8, xmm12 413 paddd xmm9, xmm13 414 paddd xmm10, xmm14 415 paddd xmm11, xmm15 416 pxor xmm4, xmm8 417 pxor xmm5, xmm9 418 pxor xmm6, xmm10 419 pxor xmm7, xmm11 420 movdqa xmmword ptr [rsp+0x100], xmm8 421 movdqa xmm8, xmm4 422 psrld xmm8, 12 423 pslld xmm4, 20 424 por xmm4, xmm8 425 movdqa xmm8, xmm5 426 psrld xmm8, 12 427 pslld xmm5, 20 428 por xmm5, xmm8 429 movdqa xmm8, xmm6 430 psrld xmm8, 12 431 pslld xmm6, 20 432 por xmm6, xmm8 433 movdqa xmm8, xmm7 434 psrld xmm8, 12 435 pslld xmm7, 20 436 por xmm7, xmm8 437 paddd xmm0, xmmword ptr [rsp+0x60] 438 paddd xmm1, xmmword ptr [rsp+0xA0] 439 paddd xmm2, xmmword ptr [rsp] 440 paddd xmm3, xmmword ptr [rsp+0xD0] 441 paddd xmm0, xmm4 442 paddd xmm1, xmm5 443 paddd xmm2, xmm6 444 paddd xmm3, xmm7 445 pxor xmm12, xmm0 446 pxor xmm13, xmm1 447 pxor xmm14, xmm2 448 pxor xmm15, xmm3 449 movdqa xmm8, xmm12 450 psrld xmm12, 8 451 pslld xmm8, 24 452 pxor xmm12, xmm8 453 movdqa xmm8, xmm13 454 
psrld xmm13, 8 455 pslld xmm8, 24 456 pxor xmm13, xmm8 457 movdqa xmm8, xmm14 458 psrld xmm14, 8 459 pslld xmm8, 24 460 pxor xmm14, xmm8 461 movdqa xmm8, xmm15 462 psrld xmm15, 8 463 pslld xmm8, 24 464 pxor xmm15, xmm8 465 movdqa xmm8, xmmword ptr [rsp+0x100] 466 paddd xmm8, xmm12 467 paddd xmm9, xmm13 468 paddd xmm10, xmm14 469 paddd xmm11, xmm15 470 pxor xmm4, xmm8 471 pxor xmm5, xmm9 472 pxor xmm6, xmm10 473 pxor xmm7, xmm11 474 movdqa xmmword ptr [rsp+0x100], xmm8 475 movdqa xmm8, xmm4 476 psrld xmm8, 7 477 pslld xmm4, 25 478 por xmm4, xmm8 479 movdqa xmm8, xmm5 480 psrld xmm8, 7 481 pslld xmm5, 25 482 por xmm5, xmm8 483 movdqa xmm8, xmm6 484 psrld xmm8, 7 485 pslld xmm6, 25 486 por xmm6, xmm8 487 movdqa xmm8, xmm7 488 psrld xmm8, 7 489 pslld xmm7, 25 490 por xmm7, xmm8 491 paddd xmm0, xmmword ptr [rsp+0x10] 492 paddd xmm1, xmmword ptr [rsp+0xC0] 493 paddd xmm2, xmmword ptr [rsp+0x90] 494 paddd xmm3, xmmword ptr [rsp+0xF0] 495 paddd xmm0, xmm5 496 paddd xmm1, xmm6 497 paddd xmm2, xmm7 498 paddd xmm3, xmm4 499 pxor xmm15, xmm0 500 pxor xmm12, xmm1 501 pxor xmm13, xmm2 502 pxor xmm14, xmm3 503 pshuflw xmm15, xmm15, 0xB1 504 pshufhw xmm15, xmm15, 0xB1 505 pshuflw xmm12, xmm12, 0xB1 506 pshufhw xmm12, xmm12, 0xB1 507 pshuflw xmm13, xmm13, 0xB1 508 pshufhw xmm13, xmm13, 0xB1 509 pshuflw xmm14, xmm14, 0xB1 510 pshufhw xmm14, xmm14, 0xB1 511 paddd xmm10, xmm15 512 paddd xmm11, xmm12 513 movdqa xmm8, xmmword ptr [rsp+0x100] 514 paddd xmm8, xmm13 515 paddd xmm9, xmm14 516 pxor xmm5, xmm10 517 pxor xmm6, xmm11 518 pxor xmm7, xmm8 519 pxor xmm4, xmm9 520 movdqa xmmword ptr [rsp+0x100], xmm8 521 movdqa xmm8, xmm5 522 psrld xmm8, 12 523 pslld xmm5, 20 524 por xmm5, xmm8 525 movdqa xmm8, xmm6 526 psrld xmm8, 12 527 pslld xmm6, 20 528 por xmm6, xmm8 529 movdqa xmm8, xmm7 530 psrld xmm8, 12 531 pslld xmm7, 20 532 por xmm7, xmm8 533 movdqa xmm8, xmm4 534 psrld xmm8, 12 535 pslld xmm4, 20 536 por xmm4, xmm8 537 paddd xmm0, xmmword ptr [rsp+0xB0] 538 paddd xmm1, xmmword ptr 
[rsp+0x50] 539 paddd xmm2, xmmword ptr [rsp+0xE0] 540 paddd xmm3, xmmword ptr [rsp+0x80] 541 paddd xmm0, xmm5 542 paddd xmm1, xmm6 543 paddd xmm2, xmm7 544 paddd xmm3, xmm4 545 pxor xmm15, xmm0 546 pxor xmm12, xmm1 547 pxor xmm13, xmm2 548 pxor xmm14, xmm3 549 movdqa xmm8, xmm15 550 psrld xmm15, 8 551 pslld xmm8, 24 552 pxor xmm15, xmm8 553 movdqa xmm8, xmm12 554 psrld xmm12, 8 555 pslld xmm8, 24 556 pxor xmm12, xmm8 557 movdqa xmm8, xmm13 558 psrld xmm13, 8 559 pslld xmm8, 24 560 pxor xmm13, xmm8 561 movdqa xmm8, xmm14 562 psrld xmm14, 8 563 pslld xmm8, 24 564 pxor xmm14, xmm8 565 paddd xmm10, xmm15 566 paddd xmm11, xmm12 567 movdqa xmm8, xmmword ptr [rsp+0x100] 568 paddd xmm8, xmm13 569 paddd xmm9, xmm14 570 pxor xmm5, xmm10 571 pxor xmm6, xmm11 572 pxor xmm7, xmm8 573 pxor xmm4, xmm9 574 movdqa xmmword ptr [rsp+0x100], xmm8 575 movdqa xmm8, xmm5 576 psrld xmm8, 7 577 pslld xmm5, 25 578 por xmm5, xmm8 579 movdqa xmm8, xmm6 580 psrld xmm8, 7 581 pslld xmm6, 25 582 por xmm6, xmm8 583 movdqa xmm8, xmm7 584 psrld xmm8, 7 585 pslld xmm7, 25 586 por xmm7, xmm8 587 movdqa xmm8, xmm4 588 psrld xmm8, 7 589 pslld xmm4, 25 590 por xmm4, xmm8 591 paddd xmm0, xmmword ptr [rsp+0x30] 592 paddd xmm1, xmmword ptr [rsp+0xA0] 593 paddd xmm2, xmmword ptr [rsp+0xD0] 594 paddd xmm3, xmmword ptr [rsp+0x70] 595 paddd xmm0, xmm4 596 paddd xmm1, xmm5 597 paddd xmm2, xmm6 598 paddd xmm3, xmm7 599 pxor xmm12, xmm0 600 pxor xmm13, xmm1 601 pxor xmm14, xmm2 602 pxor xmm15, xmm3 603 pshuflw xmm12, xmm12, 0xB1 604 pshufhw xmm12, xmm12, 0xB1 605 pshuflw xmm13, xmm13, 0xB1 606 pshufhw xmm13, xmm13, 0xB1 607 pshuflw xmm14, xmm14, 0xB1 608 pshufhw xmm14, xmm14, 0xB1 609 pshuflw xmm15, xmm15, 0xB1 610 pshufhw xmm15, xmm15, 0xB1 611 movdqa xmm8, xmmword ptr [rsp+0x100] 612 paddd xmm8, xmm12 613 paddd xmm9, xmm13 614 paddd xmm10, xmm14 615 paddd xmm11, xmm15 616 pxor xmm4, xmm8 617 pxor xmm5, xmm9 618 pxor xmm6, xmm10 619 pxor xmm7, xmm11 620 movdqa xmmword ptr [rsp+0x100], xmm8 621 movdqa xmm8, xmm4 
622 psrld xmm8, 12 623 pslld xmm4, 20 624 por xmm4, xmm8 625 movdqa xmm8, xmm5 626 psrld xmm8, 12 627 pslld xmm5, 20 628 por xmm5, xmm8 629 movdqa xmm8, xmm6 630 psrld xmm8, 12 631 pslld xmm6, 20 632 por xmm6, xmm8 633 movdqa xmm8, xmm7 634 psrld xmm8, 12 635 pslld xmm7, 20 636 por xmm7, xmm8 637 paddd xmm0, xmmword ptr [rsp+0x40] 638 paddd xmm1, xmmword ptr [rsp+0xC0] 639 paddd xmm2, xmmword ptr [rsp+0x20] 640 paddd xmm3, xmmword ptr [rsp+0xE0] 641 paddd xmm0, xmm4 642 paddd xmm1, xmm5 643 paddd xmm2, xmm6 644 paddd xmm3, xmm7 645 pxor xmm12, xmm0 646 pxor xmm13, xmm1 647 pxor xmm14, xmm2 648 pxor xmm15, xmm3 649 movdqa xmm8, xmm12 650 psrld xmm12, 8 651 pslld xmm8, 24 652 pxor xmm12, xmm8 653 movdqa xmm8, xmm13 654 psrld xmm13, 8 655 pslld xmm8, 24 656 pxor xmm13, xmm8 657 movdqa xmm8, xmm14 658 psrld xmm14, 8 659 pslld xmm8, 24 660 pxor xmm14, xmm8 661 movdqa xmm8, xmm15 662 psrld xmm15, 8 663 pslld xmm8, 24 664 pxor xmm15, xmm8 665 movdqa xmm8, xmmword ptr [rsp+0x100] 666 paddd xmm8, xmm12 667 paddd xmm9, xmm13 668 paddd xmm10, xmm14 669 paddd xmm11, xmm15 670 pxor xmm4, xmm8 671 pxor xmm5, xmm9 672 pxor xmm6, xmm10 673 pxor xmm7, xmm11 674 movdqa xmmword ptr [rsp+0x100], xmm8 675 movdqa xmm8, xmm4 676 psrld xmm8, 7 677 pslld xmm4, 25 678 por xmm4, xmm8 679 movdqa xmm8, xmm5 680 psrld xmm8, 7 681 pslld xmm5, 25 682 por xmm5, xmm8 683 movdqa xmm8, xmm6 684 psrld xmm8, 7 685 pslld xmm6, 25 686 por xmm6, xmm8 687 movdqa xmm8, xmm7 688 psrld xmm8, 7 689 pslld xmm7, 25 690 por xmm7, xmm8 691 paddd xmm0, xmmword ptr [rsp+0x60] 692 paddd xmm1, xmmword ptr [rsp+0x90] 693 paddd xmm2, xmmword ptr [rsp+0xB0] 694 paddd xmm3, xmmword ptr [rsp+0x80] 695 paddd xmm0, xmm5 696 paddd xmm1, xmm6 697 paddd xmm2, xmm7 698 paddd xmm3, xmm4 699 pxor xmm15, xmm0 700 pxor xmm12, xmm1 701 pxor xmm13, xmm2 702 pxor xmm14, xmm3 703 pshuflw xmm15, xmm15, 0xB1 704 pshufhw xmm15, xmm15, 0xB1 705 pshuflw xmm12, xmm12, 0xB1 706 pshufhw xmm12, xmm12, 0xB1 707 pshuflw xmm13, xmm13, 0xB1 708 
pshufhw xmm13, xmm13, 0xB1 709 pshuflw xmm14, xmm14, 0xB1 710 pshufhw xmm14, xmm14, 0xB1 711 paddd xmm10, xmm15 712 paddd xmm11, xmm12 713 movdqa xmm8, xmmword ptr [rsp+0x100] 714 paddd xmm8, xmm13 715 paddd xmm9, xmm14 716 pxor xmm5, xmm10 717 pxor xmm6, xmm11 718 pxor xmm7, xmm8 719 pxor xmm4, xmm9 720 movdqa xmmword ptr [rsp+0x100], xmm8 721 movdqa xmm8, xmm5 722 psrld xmm8, 12 723 pslld xmm5, 20 724 por xmm5, xmm8 725 movdqa xmm8, xmm6 726 psrld xmm8, 12 727 pslld xmm6, 20 728 por xmm6, xmm8 729 movdqa xmm8, xmm7 730 psrld xmm8, 12 731 pslld xmm7, 20 732 por xmm7, xmm8 733 movdqa xmm8, xmm4 734 psrld xmm8, 12 735 pslld xmm4, 20 736 por xmm4, xmm8 737 paddd xmm0, xmmword ptr [rsp+0x50] 738 paddd xmm1, xmmword ptr [rsp] 739 paddd xmm2, xmmword ptr [rsp+0xF0] 740 paddd xmm3, xmmword ptr [rsp+0x10] 741 paddd xmm0, xmm5 742 paddd xmm1, xmm6 743 paddd xmm2, xmm7 744 paddd xmm3, xmm4 745 pxor xmm15, xmm0 746 pxor xmm12, xmm1 747 pxor xmm13, xmm2 748 pxor xmm14, xmm3 749 movdqa xmm8, xmm15 750 psrld xmm15, 8 751 pslld xmm8, 24 752 pxor xmm15, xmm8 753 movdqa xmm8, xmm12 754 psrld xmm12, 8 755 pslld xmm8, 24 756 pxor xmm12, xmm8 757 movdqa xmm8, xmm13 758 psrld xmm13, 8 759 pslld xmm8, 24 760 pxor xmm13, xmm8 761 movdqa xmm8, xmm14 762 psrld xmm14, 8 763 pslld xmm8, 24 764 pxor xmm14, xmm8 765 paddd xmm10, xmm15 766 paddd xmm11, xmm12 767 movdqa xmm8, xmmword ptr [rsp+0x100] 768 paddd xmm8, xmm13 769 paddd xmm9, xmm14 770 pxor xmm5, xmm10 771 pxor xmm6, xmm11 772 pxor xmm7, xmm8 773 pxor xmm4, xmm9 774 movdqa xmmword ptr [rsp+0x100], xmm8 775 movdqa xmm8, xmm5 776 psrld xmm8, 7 777 pslld xmm5, 25 778 por xmm5, xmm8 779 movdqa xmm8, xmm6 780 psrld xmm8, 7 781 pslld xmm6, 25 782 por xmm6, xmm8 783 movdqa xmm8, xmm7 784 psrld xmm8, 7 785 pslld xmm7, 25 786 por xmm7, xmm8 787 movdqa xmm8, xmm4 788 psrld xmm8, 7 789 pslld xmm4, 25 790 por xmm4, xmm8 791 paddd xmm0, xmmword ptr [rsp+0xA0] 792 paddd xmm1, xmmword ptr [rsp+0xC0] 793 paddd xmm2, xmmword ptr [rsp+0xE0] 794 paddd 
xmm3, xmmword ptr [rsp+0xD0] 795 paddd xmm0, xmm4 796 paddd xmm1, xmm5 797 paddd xmm2, xmm6 798 paddd xmm3, xmm7 799 pxor xmm12, xmm0 800 pxor xmm13, xmm1 801 pxor xmm14, xmm2 802 pxor xmm15, xmm3 803 pshuflw xmm12, xmm12, 0xB1 804 pshufhw xmm12, xmm12, 0xB1 805 pshuflw xmm13, xmm13, 0xB1 806 pshufhw xmm13, xmm13, 0xB1 807 pshuflw xmm14, xmm14, 0xB1 808 pshufhw xmm14, xmm14, 0xB1 809 pshuflw xmm15, xmm15, 0xB1 810 pshufhw xmm15, xmm15, 0xB1 811 movdqa xmm8, xmmword ptr [rsp+0x100] 812 paddd xmm8, xmm12 813 paddd xmm9, xmm13 814 paddd xmm10, xmm14 815 paddd xmm11, xmm15 816 pxor xmm4, xmm8 817 pxor xmm5, xmm9 818 pxor xmm6, xmm10 819 pxor xmm7, xmm11 820 movdqa xmmword ptr [rsp+0x100], xmm8 821 movdqa xmm8, xmm4 822 psrld xmm8, 12 823 pslld xmm4, 20 824 por xmm4, xmm8 825 movdqa xmm8, xmm5 826 psrld xmm8, 12 827 pslld xmm5, 20 828 por xmm5, xmm8 829 movdqa xmm8, xmm6 830 psrld xmm8, 12 831 pslld xmm6, 20 832 por xmm6, xmm8 833 movdqa xmm8, xmm7 834 psrld xmm8, 12 835 pslld xmm7, 20 836 por xmm7, xmm8 837 paddd xmm0, xmmword ptr [rsp+0x70] 838 paddd xmm1, xmmword ptr [rsp+0x90] 839 paddd xmm2, xmmword ptr [rsp+0x30] 840 paddd xmm3, xmmword ptr [rsp+0xF0] 841 paddd xmm0, xmm4 842 paddd xmm1, xmm5 843 paddd xmm2, xmm6 844 paddd xmm3, xmm7 845 pxor xmm12, xmm0 846 pxor xmm13, xmm1 847 pxor xmm14, xmm2 848 pxor xmm15, xmm3 849 movdqa xmm8, xmm12 850 psrld xmm12, 8 851 pslld xmm8, 24 852 pxor xmm12, xmm8 853 movdqa xmm8, xmm13 854 psrld xmm13, 8 855 pslld xmm8, 24 856 pxor xmm13, xmm8 857 movdqa xmm8, xmm14 858 psrld xmm14, 8 859 pslld xmm8, 24 860 pxor xmm14, xmm8 861 movdqa xmm8, xmm15 862 psrld xmm15, 8 863 pslld xmm8, 24 864 pxor xmm15, xmm8 865 movdqa xmm8, xmmword ptr [rsp+0x100] 866 paddd xmm8, xmm12 867 paddd xmm9, xmm13 868 paddd xmm10, xmm14 869 paddd xmm11, xmm15 870 pxor xmm4, xmm8 871 pxor xmm5, xmm9 872 pxor xmm6, xmm10 873 pxor xmm7, xmm11 874 movdqa xmmword ptr [rsp+0x100], xmm8 875 movdqa xmm8, xmm4 876 psrld xmm8, 7 877 pslld xmm4, 25 878 por xmm4, xmm8 
879 movdqa xmm8, xmm5 880 psrld xmm8, 7 881 pslld xmm5, 25 882 por xmm5, xmm8 883 movdqa xmm8, xmm6 884 psrld xmm8, 7 885 pslld xmm6, 25 886 por xmm6, xmm8 887 movdqa xmm8, xmm7 888 psrld xmm8, 7 889 pslld xmm7, 25 890 por xmm7, xmm8 891 paddd xmm0, xmmword ptr [rsp+0x40] 892 paddd xmm1, xmmword ptr [rsp+0xB0] 893 paddd xmm2, xmmword ptr [rsp+0x50] 894 paddd xmm3, xmmword ptr [rsp+0x10] 895 paddd xmm0, xmm5 896 paddd xmm1, xmm6 897 paddd xmm2, xmm7 898 paddd xmm3, xmm4 899 pxor xmm15, xmm0 900 pxor xmm12, xmm1 901 pxor xmm13, xmm2 902 pxor xmm14, xmm3 903 pshuflw xmm15, xmm15, 0xB1 904 pshufhw xmm15, xmm15, 0xB1 905 pshuflw xmm12, xmm12, 0xB1 906 pshufhw xmm12, xmm12, 0xB1 907 pshuflw xmm13, xmm13, 0xB1 908 pshufhw xmm13, xmm13, 0xB1 909 pshuflw xmm14, xmm14, 0xB1 910 pshufhw xmm14, xmm14, 0xB1 911 paddd xmm10, xmm15 912 paddd xmm11, xmm12 913 movdqa xmm8, xmmword ptr [rsp+0x100] 914 paddd xmm8, xmm13 915 paddd xmm9, xmm14 916 pxor xmm5, xmm10 917 pxor xmm6, xmm11 918 pxor xmm7, xmm8 919 pxor xmm4, xmm9 920 movdqa xmmword ptr [rsp+0x100], xmm8 921 movdqa xmm8, xmm5 922 psrld xmm8, 12 923 pslld xmm5, 20 924 por xmm5, xmm8 925 movdqa xmm8, xmm6 926 psrld xmm8, 12 927 pslld xmm6, 20 928 por xmm6, xmm8 929 movdqa xmm8, xmm7 930 psrld xmm8, 12 931 pslld xmm7, 20 932 por xmm7, xmm8 933 movdqa xmm8, xmm4 934 psrld xmm8, 12 935 pslld xmm4, 20 936 por xmm4, xmm8 937 paddd xmm0, xmmword ptr [rsp] 938 paddd xmm1, xmmword ptr [rsp+0x20] 939 paddd xmm2, xmmword ptr [rsp+0x80] 940 paddd xmm3, xmmword ptr [rsp+0x60] 941 paddd xmm0, xmm5 942 paddd xmm1, xmm6 943 paddd xmm2, xmm7 944 paddd xmm3, xmm4 945 pxor xmm15, xmm0 946 pxor xmm12, xmm1 947 pxor xmm13, xmm2 948 pxor xmm14, xmm3 949 movdqa xmm8, xmm15 950 psrld xmm15, 8 951 pslld xmm8, 24 952 pxor xmm15, xmm8 953 movdqa xmm8, xmm12 954 psrld xmm12, 8 955 pslld xmm8, 24 956 pxor xmm12, xmm8 957 movdqa xmm8, xmm13 958 psrld xmm13, 8 959 pslld xmm8, 24 960 pxor xmm13, xmm8 961 movdqa xmm8, xmm14 962 psrld xmm14, 8 963 pslld xmm8, 
24 964 pxor xmm14, xmm8 965 paddd xmm10, xmm15 966 paddd xmm11, xmm12 967 movdqa xmm8, xmmword ptr [rsp+0x100] 968 paddd xmm8, xmm13 969 paddd xmm9, xmm14 970 pxor xmm5, xmm10 971 pxor xmm6, xmm11 972 pxor xmm7, xmm8 973 pxor xmm4, xmm9 974 movdqa xmmword ptr [rsp+0x100], xmm8 975 movdqa xmm8, xmm5 976 psrld xmm8, 7 977 pslld xmm5, 25 978 por xmm5, xmm8 979 movdqa xmm8, xmm6 980 psrld xmm8, 7 981 pslld xmm6, 25 982 por xmm6, xmm8 983 movdqa xmm8, xmm7 984 psrld xmm8, 7 985 pslld xmm7, 25 986 por xmm7, xmm8 987 movdqa xmm8, xmm4 988 psrld xmm8, 7 989 pslld xmm4, 25 990 por xmm4, xmm8 991 paddd xmm0, xmmword ptr [rsp+0xC0] 992 paddd xmm1, xmmword ptr [rsp+0x90] 993 paddd xmm2, xmmword ptr [rsp+0xF0] 994 paddd xmm3, xmmword ptr [rsp+0xE0] 995 paddd xmm0, xmm4 996 paddd xmm1, xmm5 997 paddd xmm2, xmm6 998 paddd xmm3, xmm7 999 pxor xmm12, xmm0 1000 pxor xmm13, xmm1 1001 pxor xmm14, xmm2 1002 pxor xmm15, xmm3 1003 pshuflw xmm12, xmm12, 0xB1 1004 pshufhw xmm12, xmm12, 0xB1 1005 pshuflw xmm13, xmm13, 0xB1 1006 pshufhw xmm13, xmm13, 0xB1 1007 pshuflw xmm14, xmm14, 0xB1 1008 pshufhw xmm14, xmm14, 0xB1 1009 pshuflw xmm15, xmm15, 0xB1 1010 pshufhw xmm15, xmm15, 0xB1 1011 movdqa xmm8, xmmword ptr [rsp+0x100] 1012 paddd xmm8, xmm12 1013 paddd xmm9, xmm13 1014 paddd xmm10, xmm14 1015 paddd xmm11, xmm15 1016 pxor xmm4, xmm8 1017 pxor xmm5, xmm9 1018 pxor xmm6, xmm10 1019 pxor xmm7, xmm11 1020 movdqa xmmword ptr [rsp+0x100], xmm8 1021 movdqa xmm8, xmm4 1022 psrld xmm8, 12 1023 pslld xmm4, 20 1024 por xmm4, xmm8 1025 movdqa xmm8, xmm5 1026 psrld xmm8, 12 1027 pslld xmm5, 20 1028 por xmm5, xmm8 1029 movdqa xmm8, xmm6 1030 psrld xmm8, 12 1031 pslld xmm6, 20 1032 por xmm6, xmm8 1033 movdqa xmm8, xmm7 1034 psrld xmm8, 12 1035 pslld xmm7, 20 1036 por xmm7, xmm8 1037 paddd xmm0, xmmword ptr [rsp+0xD0] 1038 paddd xmm1, xmmword ptr [rsp+0xB0] 1039 paddd xmm2, xmmword ptr [rsp+0xA0] 1040 paddd xmm3, xmmword ptr [rsp+0x80] 1041 paddd xmm0, xmm4 1042 paddd xmm1, xmm5 1043 paddd xmm2, xmm6 1044 
paddd xmm3, xmm7 1045 pxor xmm12, xmm0 1046 pxor xmm13, xmm1 1047 pxor xmm14, xmm2 1048 pxor xmm15, xmm3 1049 movdqa xmm8, xmm12 1050 psrld xmm12, 8 1051 pslld xmm8, 24 1052 pxor xmm12, xmm8 1053 movdqa xmm8, xmm13 1054 psrld xmm13, 8 1055 pslld xmm8, 24 1056 pxor xmm13, xmm8 1057 movdqa xmm8, xmm14 1058 psrld xmm14, 8 1059 pslld xmm8, 24 1060 pxor xmm14, xmm8 1061 movdqa xmm8, xmm15 1062 psrld xmm15, 8 1063 pslld xmm8, 24 1064 pxor xmm15, xmm8 1065 movdqa xmm8, xmmword ptr [rsp+0x100] 1066 paddd xmm8, xmm12 1067 paddd xmm9, xmm13 1068 paddd xmm10, xmm14 1069 paddd xmm11, xmm15 1070 pxor xmm4, xmm8 1071 pxor xmm5, xmm9 1072 pxor xmm6, xmm10 1073 pxor xmm7, xmm11 1074 movdqa xmmword ptr [rsp+0x100], xmm8 1075 movdqa xmm8, xmm4 1076 psrld xmm8, 7 1077 pslld xmm4, 25 1078 por xmm4, xmm8 1079 movdqa xmm8, xmm5 1080 psrld xmm8, 7 1081 pslld xmm5, 25 1082 por xmm5, xmm8 1083 movdqa xmm8, xmm6 1084 psrld xmm8, 7 1085 pslld xmm6, 25 1086 por xmm6, xmm8 1087 movdqa xmm8, xmm7 1088 psrld xmm8, 7 1089 pslld xmm7, 25 1090 por xmm7, xmm8 1091 paddd xmm0, xmmword ptr [rsp+0x70] 1092 paddd xmm1, xmmword ptr [rsp+0x50] 1093 paddd xmm2, xmmword ptr [rsp] 1094 paddd xmm3, xmmword ptr [rsp+0x60] 1095 paddd xmm0, xmm5 1096 paddd xmm1, xmm6 1097 paddd xmm2, xmm7 1098 paddd xmm3, xmm4 1099 pxor xmm15, xmm0 1100 pxor xmm12, xmm1 1101 pxor xmm13, xmm2 1102 pxor xmm14, xmm3 1103 pshuflw xmm15, xmm15, 0xB1 1104 pshufhw xmm15, xmm15, 0xB1 1105 pshuflw xmm12, xmm12, 0xB1 1106 pshufhw xmm12, xmm12, 0xB1 1107 pshuflw xmm13, xmm13, 0xB1 1108 pshufhw xmm13, xmm13, 0xB1 1109 pshuflw xmm14, xmm14, 0xB1 1110 pshufhw xmm14, xmm14, 0xB1 1111 paddd xmm10, xmm15 1112 paddd xmm11, xmm12 1113 movdqa xmm8, xmmword ptr [rsp+0x100] 1114 paddd xmm8, xmm13 1115 paddd xmm9, xmm14 1116 pxor xmm5, xmm10 1117 pxor xmm6, xmm11 1118 pxor xmm7, xmm8 1119 pxor xmm4, xmm9 1120 movdqa xmmword ptr [rsp+0x100], xmm8 1121 movdqa xmm8, xmm5 1122 psrld xmm8, 12 1123 pslld xmm5, 20 1124 por xmm5, xmm8 1125 movdqa xmm8, xmm6 
1126 psrld xmm8, 12 1127 pslld xmm6, 20 1128 por xmm6, xmm8 1129 movdqa xmm8, xmm7 1130 psrld xmm8, 12 1131 pslld xmm7, 20 1132 por xmm7, xmm8 1133 movdqa xmm8, xmm4 1134 psrld xmm8, 12 1135 pslld xmm4, 20 1136 por xmm4, xmm8 1137 paddd xmm0, xmmword ptr [rsp+0x20] 1138 paddd xmm1, xmmword ptr [rsp+0x30] 1139 paddd xmm2, xmmword ptr [rsp+0x10] 1140 paddd xmm3, xmmword ptr [rsp+0x40] 1141 paddd xmm0, xmm5 1142 paddd xmm1, xmm6 1143 paddd xmm2, xmm7 1144 paddd xmm3, xmm4 1145 pxor xmm15, xmm0 1146 pxor xmm12, xmm1 1147 pxor xmm13, xmm2 1148 pxor xmm14, xmm3 1149 movdqa xmm8, xmm15 1150 psrld xmm15, 8 1151 pslld xmm8, 24 1152 pxor xmm15, xmm8 1153 movdqa xmm8, xmm12 1154 psrld xmm12, 8 1155 pslld xmm8, 24 1156 pxor xmm12, xmm8 1157 movdqa xmm8, xmm13 1158 psrld xmm13, 8 1159 pslld xmm8, 24 1160 pxor xmm13, xmm8 1161 movdqa xmm8, xmm14 1162 psrld xmm14, 8 1163 pslld xmm8, 24 1164 pxor xmm14, xmm8 1165 paddd xmm10, xmm15 1166 paddd xmm11, xmm12 1167 movdqa xmm8, xmmword ptr [rsp+0x100] 1168 paddd xmm8, xmm13 1169 paddd xmm9, xmm14 1170 pxor xmm5, xmm10 1171 pxor xmm6, xmm11 1172 pxor xmm7, xmm8 1173 pxor xmm4, xmm9 1174 movdqa xmmword ptr [rsp+0x100], xmm8 1175 movdqa xmm8, xmm5 1176 psrld xmm8, 7 1177 pslld xmm5, 25 1178 por xmm5, xmm8 1179 movdqa xmm8, xmm6 1180 psrld xmm8, 7 1181 pslld xmm6, 25 1182 por xmm6, xmm8 1183 movdqa xmm8, xmm7 1184 psrld xmm8, 7 1185 pslld xmm7, 25 1186 por xmm7, xmm8 1187 movdqa xmm8, xmm4 1188 psrld xmm8, 7 1189 pslld xmm4, 25 1190 por xmm4, xmm8 1191 paddd xmm0, xmmword ptr [rsp+0x90] 1192 paddd xmm1, xmmword ptr [rsp+0xB0] 1193 paddd xmm2, xmmword ptr [rsp+0x80] 1194 paddd xmm3, xmmword ptr [rsp+0xF0] 1195 paddd xmm0, xmm4 1196 paddd xmm1, xmm5 1197 paddd xmm2, xmm6 1198 paddd xmm3, xmm7 1199 pxor xmm12, xmm0 1200 pxor xmm13, xmm1 1201 pxor xmm14, xmm2 1202 pxor xmm15, xmm3 1203 pshuflw xmm12, xmm12, 0xB1 1204 pshufhw xmm12, xmm12, 0xB1 1205 pshuflw xmm13, xmm13, 0xB1 1206 pshufhw xmm13, xmm13, 0xB1 1207 pshuflw xmm14, xmm14, 0xB1 1208 
pshufhw xmm14, xmm14, 0xB1 1209 pshuflw xmm15, xmm15, 0xB1 1210 pshufhw xmm15, xmm15, 0xB1 1211 movdqa xmm8, xmmword ptr [rsp+0x100] 1212 paddd xmm8, xmm12 1213 paddd xmm9, xmm13 1214 paddd xmm10, xmm14 1215 paddd xmm11, xmm15 1216 pxor xmm4, xmm8 1217 pxor xmm5, xmm9 1218 pxor xmm6, xmm10 1219 pxor xmm7, xmm11 1220 movdqa xmmword ptr [rsp+0x100], xmm8 1221 movdqa xmm8, xmm4 1222 psrld xmm8, 12 1223 pslld xmm4, 20 1224 por xmm4, xmm8 1225 movdqa xmm8, xmm5 1226 psrld xmm8, 12 1227 pslld xmm5, 20 1228 por xmm5, xmm8 1229 movdqa xmm8, xmm6 1230 psrld xmm8, 12 1231 pslld xmm6, 20 1232 por xmm6, xmm8 1233 movdqa xmm8, xmm7 1234 psrld xmm8, 12 1235 pslld xmm7, 20 1236 por xmm7, xmm8 1237 paddd xmm0, xmmword ptr [rsp+0xE0] 1238 paddd xmm1, xmmword ptr [rsp+0x50] 1239 paddd xmm2, xmmword ptr [rsp+0xC0] 1240 paddd xmm3, xmmword ptr [rsp+0x10] 1241 paddd xmm0, xmm4 1242 paddd xmm1, xmm5 1243 paddd xmm2, xmm6 1244 paddd xmm3, xmm7 1245 pxor xmm12, xmm0 1246 pxor xmm13, xmm1 1247 pxor xmm14, xmm2 1248 pxor xmm15, xmm3 1249 movdqa xmm8, xmm12 1250 psrld xmm12, 8 1251 pslld xmm8, 24 1252 pxor xmm12, xmm8 1253 movdqa xmm8, xmm13 1254 psrld xmm13, 8 1255 pslld xmm8, 24 1256 pxor xmm13, xmm8 1257 movdqa xmm8, xmm14 1258 psrld xmm14, 8 1259 pslld xmm8, 24 1260 pxor xmm14, xmm8 1261 movdqa xmm8, xmm15 1262 psrld xmm15, 8 1263 pslld xmm8, 24 1264 pxor xmm15, xmm8 1265 movdqa xmm8, xmmword ptr [rsp+0x100] 1266 paddd xmm8, xmm12 1267 paddd xmm9, xmm13 1268 paddd xmm10, xmm14 1269 paddd xmm11, xmm15 1270 pxor xmm4, xmm8 1271 pxor xmm5, xmm9 1272 pxor xmm6, xmm10 1273 pxor xmm7, xmm11 1274 movdqa xmmword ptr [rsp+0x100], xmm8 1275 movdqa xmm8, xmm4 1276 psrld xmm8, 7 1277 pslld xmm4, 25 1278 por xmm4, xmm8 1279 movdqa xmm8, xmm5 1280 psrld xmm8, 7 1281 pslld xmm5, 25 1282 por xmm5, xmm8 1283 movdqa xmm8, xmm6 1284 psrld xmm8, 7 1285 pslld xmm6, 25 1286 por xmm6, xmm8 1287 movdqa xmm8, xmm7 1288 psrld xmm8, 7 1289 pslld xmm7, 25 1290 por xmm7, xmm8 1291 paddd xmm0, xmmword ptr [rsp+0xD0] 
1292 paddd xmm1, xmmword ptr [rsp] 1293 paddd xmm2, xmmword ptr [rsp+0x20] 1294 paddd xmm3, xmmword ptr [rsp+0x40] 1295 paddd xmm0, xmm5 1296 paddd xmm1, xmm6 1297 paddd xmm2, xmm7 1298 paddd xmm3, xmm4 1299 pxor xmm15, xmm0 1300 pxor xmm12, xmm1 1301 pxor xmm13, xmm2 1302 pxor xmm14, xmm3 1303 pshuflw xmm15, xmm15, 0xB1 1304 pshufhw xmm15, xmm15, 0xB1 1305 pshuflw xmm12, xmm12, 0xB1 1306 pshufhw xmm12, xmm12, 0xB1 1307 pshuflw xmm13, xmm13, 0xB1 1308 pshufhw xmm13, xmm13, 0xB1 1309 pshuflw xmm14, xmm14, 0xB1 1310 pshufhw xmm14, xmm14, 0xB1 1311 paddd xmm10, xmm15 1312 paddd xmm11, xmm12 1313 movdqa xmm8, xmmword ptr [rsp+0x100] 1314 paddd xmm8, xmm13 1315 paddd xmm9, xmm14 1316 pxor xmm5, xmm10 1317 pxor xmm6, xmm11 1318 pxor xmm7, xmm8 1319 pxor xmm4, xmm9 1320 movdqa xmmword ptr [rsp+0x100], xmm8 1321 movdqa xmm8, xmm5 1322 psrld xmm8, 12 1323 pslld xmm5, 20 1324 por xmm5, xmm8 1325 movdqa xmm8, xmm6 1326 psrld xmm8, 12 1327 pslld xmm6, 20 1328 por xmm6, xmm8 1329 movdqa xmm8, xmm7 1330 psrld xmm8, 12 1331 pslld xmm7, 20 1332 por xmm7, xmm8 1333 movdqa xmm8, xmm4 1334 psrld xmm8, 12 1335 pslld xmm4, 20 1336 por xmm4, xmm8 1337 paddd xmm0, xmmword ptr [rsp+0x30] 1338 paddd xmm1, xmmword ptr [rsp+0xA0] 1339 paddd xmm2, xmmword ptr [rsp+0x60] 1340 paddd xmm3, xmmword ptr [rsp+0x70] 1341 paddd xmm0, xmm5 1342 paddd xmm1, xmm6 1343 paddd xmm2, xmm7 1344 paddd xmm3, xmm4 1345 pxor xmm15, xmm0 1346 pxor xmm12, xmm1 1347 pxor xmm13, xmm2 1348 pxor xmm14, xmm3 1349 movdqa xmm8, xmm15 1350 psrld xmm15, 8 1351 pslld xmm8, 24 1352 pxor xmm15, xmm8 1353 movdqa xmm8, xmm12 1354 psrld xmm12, 8 1355 pslld xmm8, 24 1356 pxor xmm12, xmm8 1357 movdqa xmm8, xmm13 1358 psrld xmm13, 8 1359 pslld xmm8, 24 1360 pxor xmm13, xmm8 1361 movdqa xmm8, xmm14 1362 psrld xmm14, 8 1363 pslld xmm8, 24 1364 pxor xmm14, xmm8 1365 paddd xmm10, xmm15 1366 paddd xmm11, xmm12 1367 movdqa xmm8, xmmword ptr [rsp+0x100] 1368 paddd xmm8, xmm13 1369 paddd xmm9, xmm14 1370 pxor xmm5, xmm10 1371 pxor xmm6, 
xmm11 1372 pxor xmm7, xmm8 1373 pxor xmm4, xmm9 1374 movdqa xmmword ptr [rsp+0x100], xmm8 1375 movdqa xmm8, xmm5 1376 psrld xmm8, 7 1377 pslld xmm5, 25 1378 por xmm5, xmm8 1379 movdqa xmm8, xmm6 1380 psrld xmm8, 7 1381 pslld xmm6, 25 1382 por xmm6, xmm8 1383 movdqa xmm8, xmm7 1384 psrld xmm8, 7 1385 pslld xmm7, 25 1386 por xmm7, xmm8 1387 movdqa xmm8, xmm4 1388 psrld xmm8, 7 1389 pslld xmm4, 25 1390 por xmm4, xmm8 1391 paddd xmm0, xmmword ptr [rsp+0xB0] 1392 paddd xmm1, xmmword ptr [rsp+0x50] 1393 paddd xmm2, xmmword ptr [rsp+0x10] 1394 paddd xmm3, xmmword ptr [rsp+0x80] 1395 paddd xmm0, xmm4 1396 paddd xmm1, xmm5 1397 paddd xmm2, xmm6 1398 paddd xmm3, xmm7 1399 pxor xmm12, xmm0 1400 pxor xmm13, xmm1 1401 pxor xmm14, xmm2 1402 pxor xmm15, xmm3 1403 pshuflw xmm12, xmm12, 0xB1 1404 pshufhw xmm12, xmm12, 0xB1 1405 pshuflw xmm13, xmm13, 0xB1 1406 pshufhw xmm13, xmm13, 0xB1 1407 pshuflw xmm14, xmm14, 0xB1 1408 pshufhw xmm14, xmm14, 0xB1 1409 pshuflw xmm15, xmm15, 0xB1 1410 pshufhw xmm15, xmm15, 0xB1 1411 movdqa xmm8, xmmword ptr [rsp+0x100] 1412 paddd xmm8, xmm12 1413 paddd xmm9, xmm13 1414 paddd xmm10, xmm14 1415 paddd xmm11, xmm15 1416 pxor xmm4, xmm8 1417 pxor xmm5, xmm9 1418 pxor xmm6, xmm10 1419 pxor xmm7, xmm11 1420 movdqa xmmword ptr [rsp+0x100], xmm8 1421 movdqa xmm8, xmm4 1422 psrld xmm8, 12 1423 pslld xmm4, 20 1424 por xmm4, xmm8 1425 movdqa xmm8, xmm5 1426 psrld xmm8, 12 1427 pslld xmm5, 20 1428 por xmm5, xmm8 1429 movdqa xmm8, xmm6 1430 psrld xmm8, 12 1431 pslld xmm6, 20 1432 por xmm6, xmm8 1433 movdqa xmm8, xmm7 1434 psrld xmm8, 12 1435 pslld xmm7, 20 1436 por xmm7, xmm8 1437 paddd xmm0, xmmword ptr [rsp+0xF0] 1438 paddd xmm1, xmmword ptr [rsp] 1439 paddd xmm2, xmmword ptr [rsp+0x90] 1440 paddd xmm3, xmmword ptr [rsp+0x60] 1441 paddd xmm0, xmm4 1442 paddd xmm1, xmm5 1443 paddd xmm2, xmm6 1444 paddd xmm3, xmm7 1445 pxor xmm12, xmm0 1446 pxor xmm13, xmm1 1447 pxor xmm14, xmm2 1448 pxor xmm15, xmm3 1449 movdqa xmm8, xmm12 1450 psrld xmm12, 8 1451 pslld xmm8, 
24 1452 pxor xmm12, xmm8 1453 movdqa xmm8, xmm13 1454 psrld xmm13, 8 1455 pslld xmm8, 24 1456 pxor xmm13, xmm8 1457 movdqa xmm8, xmm14 1458 psrld xmm14, 8 1459 pslld xmm8, 24 1460 pxor xmm14, xmm8 1461 movdqa xmm8, xmm15 1462 psrld xmm15, 8 1463 pslld xmm8, 24 1464 pxor xmm15, xmm8 1465 movdqa xmm8, xmmword ptr [rsp+0x100] 1466 paddd xmm8, xmm12 1467 paddd xmm9, xmm13 1468 paddd xmm10, xmm14 1469 paddd xmm11, xmm15 1470 pxor xmm4, xmm8 1471 pxor xmm5, xmm9 1472 pxor xmm6, xmm10 1473 pxor xmm7, xmm11 1474 movdqa xmmword ptr [rsp+0x100], xmm8 1475 movdqa xmm8, xmm4 1476 psrld xmm8, 7 1477 pslld xmm4, 25 1478 por xmm4, xmm8 1479 movdqa xmm8, xmm5 1480 psrld xmm8, 7 1481 pslld xmm5, 25 1482 por xmm5, xmm8 1483 movdqa xmm8, xmm6 1484 psrld xmm8, 7 1485 pslld xmm6, 25 1486 por xmm6, xmm8 1487 movdqa xmm8, xmm7 1488 psrld xmm8, 7 1489 pslld xmm7, 25 1490 por xmm7, xmm8 1491 paddd xmm0, xmmword ptr [rsp+0xE0] 1492 paddd xmm1, xmmword ptr [rsp+0x20] 1493 paddd xmm2, xmmword ptr [rsp+0x30] 1494 paddd xmm3, xmmword ptr [rsp+0x70] 1495 paddd xmm0, xmm5 1496 paddd xmm1, xmm6 1497 paddd xmm2, xmm7 1498 paddd xmm3, xmm4 1499 pxor xmm15, xmm0 1500 pxor xmm12, xmm1 1501 pxor xmm13, xmm2 1502 pxor xmm14, xmm3 1503 pshuflw xmm15, xmm15, 0xB1 1504 pshufhw xmm15, xmm15, 0xB1 1505 pshuflw xmm12, xmm12, 0xB1 1506 pshufhw xmm12, xmm12, 0xB1 1507 pshuflw xmm13, xmm13, 0xB1 1508 pshufhw xmm13, xmm13, 0xB1 1509 pshuflw xmm14, xmm14, 0xB1 1510 pshufhw xmm14, xmm14, 0xB1 1511 paddd xmm10, xmm15 1512 paddd xmm11, xmm12 1513 movdqa xmm8, xmmword ptr [rsp+0x100] 1514 paddd xmm8, xmm13 1515 paddd xmm9, xmm14 1516 pxor xmm5, xmm10 1517 pxor xmm6, xmm11 1518 pxor xmm7, xmm8 1519 pxor xmm4, xmm9 1520 movdqa xmmword ptr [rsp+0x100], xmm8 1521 movdqa xmm8, xmm5 1522 psrld xmm8, 12 1523 pslld xmm5, 20 1524 por xmm5, xmm8 1525 movdqa xmm8, xmm6 1526 psrld xmm8, 12 1527 pslld xmm6, 20 1528 por xmm6, xmm8 1529 movdqa xmm8, xmm7 1530 psrld xmm8, 12 1531 pslld xmm7, 20 1532 por xmm7, xmm8 1533 movdqa xmm8, 
xmm4 1534 psrld xmm8, 12 1535 pslld xmm4, 20 1536 por xmm4, xmm8 1537 paddd xmm0, xmmword ptr [rsp+0xA0] 1538 paddd xmm1, xmmword ptr [rsp+0xC0] 1539 paddd xmm2, xmmword ptr [rsp+0x40] 1540 paddd xmm3, xmmword ptr [rsp+0xD0] 1541 paddd xmm0, xmm5 1542 paddd xmm1, xmm6 1543 paddd xmm2, xmm7 1544 paddd xmm3, xmm4 1545 pxor xmm15, xmm0 1546 pxor xmm12, xmm1 1547 pxor xmm13, xmm2 1548 pxor xmm14, xmm3 1549 movdqa xmm8, xmm15 1550 psrld xmm15, 8 1551 pslld xmm8, 24 1552 pxor xmm15, xmm8 1553 movdqa xmm8, xmm12 1554 psrld xmm12, 8 1555 pslld xmm8, 24 1556 pxor xmm12, xmm8 1557 movdqa xmm8, xmm13 1558 psrld xmm13, 8 1559 pslld xmm8, 24 1560 pxor xmm13, xmm8 1561 movdqa xmm8, xmm14 1562 psrld xmm14, 8 1563 pslld xmm8, 24 1564 pxor xmm14, xmm8 1565 paddd xmm10, xmm15 1566 paddd xmm11, xmm12 1567 movdqa xmm8, xmmword ptr [rsp+0x100] 1568 paddd xmm8, xmm13 1569 paddd xmm9, xmm14 1570 pxor xmm5, xmm10 1571 pxor xmm6, xmm11 1572 pxor xmm7, xmm8 1573 pxor xmm4, xmm9 1574 pxor xmm0, xmm8 1575 pxor xmm1, xmm9 1576 pxor xmm2, xmm10 1577 pxor xmm3, xmm11 1578 movdqa xmm8, xmm5 1579 psrld xmm8, 7 1580 pslld xmm5, 25 1581 por xmm5, xmm8 1582 movdqa xmm8, xmm6 1583 psrld xmm8, 7 1584 pslld xmm6, 25 1585 por xmm6, xmm8 1586 movdqa xmm8, xmm7 1587 psrld xmm8, 7 1588 pslld xmm7, 25 1589 por xmm7, xmm8 1590 movdqa xmm8, xmm4 1591 psrld xmm8, 7 1592 pslld xmm4, 25 1593 por xmm4, xmm8 1594 pxor xmm4, xmm12 1595 pxor xmm5, xmm13 1596 pxor xmm6, xmm14 1597 pxor xmm7, xmm15 1598 mov eax, r13d 1599 jne 9b 1600 movdqa xmm9, xmm0 1601 punpckldq xmm0, xmm1 1602 punpckhdq xmm9, xmm1 1603 movdqa xmm11, xmm2 1604 punpckldq xmm2, xmm3 1605 punpckhdq xmm11, xmm3 1606 movdqa xmm1, xmm0 1607 punpcklqdq xmm0, xmm2 1608 punpckhqdq xmm1, xmm2 1609 movdqa xmm3, xmm9 1610 punpcklqdq xmm9, xmm11 1611 punpckhqdq xmm3, xmm11 1612 movdqu xmmword ptr [rbx], xmm0 1613 movdqu xmmword ptr [rbx+0x20], xmm1 1614 movdqu xmmword ptr [rbx+0x40], xmm9 1615 movdqu xmmword ptr [rbx+0x60], xmm3 1616 movdqa xmm9, xmm4 1617 
punpckldq xmm4, xmm5 1618 punpckhdq xmm9, xmm5 1619 movdqa xmm11, xmm6 1620 punpckldq xmm6, xmm7 1621 punpckhdq xmm11, xmm7 1622 movdqa xmm5, xmm4 1623 punpcklqdq xmm4, xmm6 1624 punpckhqdq xmm5, xmm6 1625 movdqa xmm7, xmm9 1626 punpcklqdq xmm9, xmm11 1627 punpckhqdq xmm7, xmm11 1628 movdqu xmmword ptr [rbx+0x10], xmm4 1629 movdqu xmmword ptr [rbx+0x30], xmm5 1630 movdqu xmmword ptr [rbx+0x50], xmm9 1631 movdqu xmmword ptr [rbx+0x70], xmm7 1632 movdqa xmm1, xmmword ptr [rsp+0x110] 1633 movdqa xmm0, xmm1 1634 paddd xmm1, xmmword ptr [rsp+0x150] 1635 movdqa xmmword ptr [rsp+0x110], xmm1 1636 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] 1637 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] 1638 pcmpgtd xmm0, xmm1 1639 movdqa xmm1, xmmword ptr [rsp+0x120] 1640 psubd xmm1, xmm0 1641 movdqa xmmword ptr [rsp+0x120], xmm1 1642 add rbx, 128 1643 add rdi, 32 1644 sub rsi, 4 1645 cmp rsi, 4 1646 jnc 2b 1647 test rsi, rsi 1648 jnz 3f 16494: 1650 mov rsp, rbp 1651 pop rbp 1652 pop rbx 1653 pop r12 1654 pop r13 1655 pop r14 1656 pop r15 1657 RET 1658.p2align 5 16593: 1660 test esi, 0x2 1661 je 3f 1662 movups xmm0, xmmword ptr [rcx] 1663 movups xmm1, xmmword ptr [rcx+0x10] 1664 movaps xmm8, xmm0 1665 movaps xmm9, xmm1 1666 movd xmm13, dword ptr [rsp+0x110] 1667 movd xmm14, dword ptr [rsp+0x120] 1668 punpckldq xmm13, xmm14 1669 movaps xmmword ptr [rsp], xmm13 1670 movd xmm14, dword ptr [rsp+0x114] 1671 movd xmm13, dword ptr [rsp+0x124] 1672 punpckldq xmm14, xmm13 1673 movaps xmmword ptr [rsp+0x10], xmm14 1674 mov r8, qword ptr [rdi] 1675 mov r9, qword ptr [rdi+0x8] 1676 movzx eax, byte ptr [rbp+0x40] 1677 or eax, r13d 1678 xor edx, edx 16792: 1680 mov r14d, eax 1681 or eax, r12d 1682 add rdx, 64 1683 cmp rdx, r15 1684 cmovne eax, r14d 1685 movaps xmm2, xmmword ptr [BLAKE3_IV+rip] 1686 movaps xmm10, xmm2 1687 movups xmm4, xmmword ptr [r8+rdx-0x40] 1688 movups xmm5, xmmword ptr [r8+rdx-0x30] 1689 movaps xmm3, xmm4 1690 shufps xmm4, xmm5, 136 1691 shufps xmm3, xmm5, 221 1692 movaps xmm5, xmm3 
1693 movups xmm6, xmmword ptr [r8+rdx-0x20] 1694 movups xmm7, xmmword ptr [r8+rdx-0x10] 1695 movaps xmm3, xmm6 1696 shufps xmm6, xmm7, 136 1697 pshufd xmm6, xmm6, 0x93 1698 shufps xmm3, xmm7, 221 1699 pshufd xmm7, xmm3, 0x93 1700 movups xmm12, xmmword ptr [r9+rdx-0x40] 1701 movups xmm13, xmmword ptr [r9+rdx-0x30] 1702 movaps xmm11, xmm12 1703 shufps xmm12, xmm13, 136 1704 shufps xmm11, xmm13, 221 1705 movaps xmm13, xmm11 1706 movups xmm14, xmmword ptr [r9+rdx-0x20] 1707 movups xmm15, xmmword ptr [r9+rdx-0x10] 1708 movaps xmm11, xmm14 1709 shufps xmm14, xmm15, 136 1710 pshufd xmm14, xmm14, 0x93 1711 shufps xmm11, xmm15, 221 1712 pshufd xmm15, xmm11, 0x93 1713 shl rax, 0x20 1714 or rax, 0x40 1715 movq xmm3, rax 1716 movdqa xmmword ptr [rsp+0x20], xmm3 1717 movaps xmm3, xmmword ptr [rsp] 1718 movaps xmm11, xmmword ptr [rsp+0x10] 1719 punpcklqdq xmm3, xmmword ptr [rsp+0x20] 1720 punpcklqdq xmm11, xmmword ptr [rsp+0x20] 1721 mov al, 7 17229: 1723 paddd xmm0, xmm4 1724 paddd xmm8, xmm12 1725 movaps xmmword ptr [rsp+0x20], xmm4 1726 movaps xmmword ptr [rsp+0x30], xmm12 1727 paddd xmm0, xmm1 1728 paddd xmm8, xmm9 1729 pxor xmm3, xmm0 1730 pxor xmm11, xmm8 1731 pshuflw xmm3, xmm3, 0xB1 1732 pshufhw xmm3, xmm3, 0xB1 1733 pshuflw xmm11, xmm11, 0xB1 1734 pshufhw xmm11, xmm11, 0xB1 1735 paddd xmm2, xmm3 1736 paddd xmm10, xmm11 1737 pxor xmm1, xmm2 1738 pxor xmm9, xmm10 1739 movdqa xmm4, xmm1 1740 pslld xmm1, 20 1741 psrld xmm4, 12 1742 por xmm1, xmm4 1743 movdqa xmm4, xmm9 1744 pslld xmm9, 20 1745 psrld xmm4, 12 1746 por xmm9, xmm4 1747 paddd xmm0, xmm5 1748 paddd xmm8, xmm13 1749 movaps xmmword ptr [rsp+0x40], xmm5 1750 movaps xmmword ptr [rsp+0x50], xmm13 1751 paddd xmm0, xmm1 1752 paddd xmm8, xmm9 1753 pxor xmm3, xmm0 1754 pxor xmm11, xmm8 1755 movdqa xmm13, xmm3 1756 psrld xmm3, 8 1757 pslld xmm13, 24 1758 pxor xmm3, xmm13 1759 movdqa xmm13, xmm11 1760 psrld xmm11, 8 1761 pslld xmm13, 24 1762 pxor xmm11, xmm13 1763 paddd xmm2, xmm3 1764 paddd xmm10, xmm11 1765 pxor xmm1, 
xmm2 1766 pxor xmm9, xmm10 1767 movdqa xmm4, xmm1 1768 pslld xmm1, 25 1769 psrld xmm4, 7 1770 por xmm1, xmm4 1771 movdqa xmm4, xmm9 1772 pslld xmm9, 25 1773 psrld xmm4, 7 1774 por xmm9, xmm4 1775 pshufd xmm0, xmm0, 0x93 1776 pshufd xmm8, xmm8, 0x93 1777 pshufd xmm3, xmm3, 0x4E 1778 pshufd xmm11, xmm11, 0x4E 1779 pshufd xmm2, xmm2, 0x39 1780 pshufd xmm10, xmm10, 0x39 1781 paddd xmm0, xmm6 1782 paddd xmm8, xmm14 1783 paddd xmm0, xmm1 1784 paddd xmm8, xmm9 1785 pxor xmm3, xmm0 1786 pxor xmm11, xmm8 1787 pshuflw xmm3, xmm3, 0xB1 1788 pshufhw xmm3, xmm3, 0xB1 1789 pshuflw xmm11, xmm11, 0xB1 1790 pshufhw xmm11, xmm11, 0xB1 1791 paddd xmm2, xmm3 1792 paddd xmm10, xmm11 1793 pxor xmm1, xmm2 1794 pxor xmm9, xmm10 1795 movdqa xmm4, xmm1 1796 pslld xmm1, 20 1797 psrld xmm4, 12 1798 por xmm1, xmm4 1799 movdqa xmm4, xmm9 1800 pslld xmm9, 20 1801 psrld xmm4, 12 1802 por xmm9, xmm4 1803 paddd xmm0, xmm7 1804 paddd xmm8, xmm15 1805 paddd xmm0, xmm1 1806 paddd xmm8, xmm9 1807 pxor xmm3, xmm0 1808 pxor xmm11, xmm8 1809 movdqa xmm13, xmm3 1810 psrld xmm3, 8 1811 pslld xmm13, 24 1812 pxor xmm3, xmm13 1813 movdqa xmm13, xmm11 1814 psrld xmm11, 8 1815 pslld xmm13, 24 1816 pxor xmm11, xmm13 1817 paddd xmm2, xmm3 1818 paddd xmm10, xmm11 1819 pxor xmm1, xmm2 1820 pxor xmm9, xmm10 1821 movdqa xmm4, xmm1 1822 pslld xmm1, 25 1823 psrld xmm4, 7 1824 por xmm1, xmm4 1825 movdqa xmm4, xmm9 1826 pslld xmm9, 25 1827 psrld xmm4, 7 1828 por xmm9, xmm4 1829 pshufd xmm0, xmm0, 0x39 1830 pshufd xmm8, xmm8, 0x39 1831 pshufd xmm3, xmm3, 0x4E 1832 pshufd xmm11, xmm11, 0x4E 1833 pshufd xmm2, xmm2, 0x93 1834 pshufd xmm10, xmm10, 0x93 1835 dec al 1836 je 9f 1837 movdqa xmm12, xmmword ptr [rsp+0x20] 1838 movdqa xmm5, xmmword ptr [rsp+0x40] 1839 pshufd xmm13, xmm12, 0x0F 1840 shufps xmm12, xmm5, 214 1841 pshufd xmm4, xmm12, 0x39 1842 movdqa xmm12, xmm6 1843 shufps xmm12, xmm7, 250 1844 pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip] 1845 pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip] 1846 por xmm13, xmm12 1847 
movdqa xmmword ptr [rsp+0x20], xmm13 1848 movdqa xmm12, xmm7 1849 punpcklqdq xmm12, xmm5 1850 movdqa xmm13, xmm6 1851 pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] 1852 pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip] 1853 por xmm12, xmm13 1854 pshufd xmm12, xmm12, 0x78 1855 punpckhdq xmm5, xmm7 1856 punpckldq xmm6, xmm5 1857 pshufd xmm7, xmm6, 0x1E 1858 movdqa xmmword ptr [rsp+0x40], xmm12 1859 movdqa xmm5, xmmword ptr [rsp+0x30] 1860 movdqa xmm13, xmmword ptr [rsp+0x50] 1861 pshufd xmm6, xmm5, 0x0F 1862 shufps xmm5, xmm13, 214 1863 pshufd xmm12, xmm5, 0x39 1864 movdqa xmm5, xmm14 1865 shufps xmm5, xmm15, 250 1866 pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] 1867 pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] 1868 por xmm6, xmm5 1869 movdqa xmm5, xmm15 1870 punpcklqdq xmm5, xmm13 1871 movdqa xmmword ptr [rsp+0x30], xmm2 1872 movdqa xmm2, xmm14 1873 pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] 1874 pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] 1875 por xmm5, xmm2 1876 movdqa xmm2, xmmword ptr [rsp+0x30] 1877 pshufd xmm5, xmm5, 0x78 1878 punpckhdq xmm13, xmm15 1879 punpckldq xmm14, xmm13 1880 pshufd xmm15, xmm14, 0x1E 1881 movdqa xmm13, xmm6 1882 movdqa xmm14, xmm5 1883 movdqa xmm5, xmmword ptr [rsp+0x20] 1884 movdqa xmm6, xmmword ptr [rsp+0x40] 1885 jmp 9b 18869: 1887 pxor xmm0, xmm2 1888 pxor xmm1, xmm3 1889 pxor xmm8, xmm10 1890 pxor xmm9, xmm11 1891 mov eax, r13d 1892 cmp rdx, r15 1893 jne 2b 1894 movups xmmword ptr [rbx], xmm0 1895 movups xmmword ptr [rbx+0x10], xmm1 1896 movups xmmword ptr [rbx+0x20], xmm8 1897 movups xmmword ptr [rbx+0x30], xmm9 1898 mov eax, dword ptr [rsp+0x130] 1899 neg eax 1900 mov r10d, dword ptr [rsp+0x110+8*rax] 1901 mov r11d, dword ptr [rsp+0x120+8*rax] 1902 mov dword ptr [rsp+0x110], r10d 1903 mov dword ptr [rsp+0x120], r11d 1904 add rdi, 16 1905 add rbx, 64 1906 sub rsi, 2 19073: 1908 test esi, 0x1 1909 je 4b 1910 movups xmm0, xmmword ptr [rcx] 1911 movups xmm1, xmmword ptr [rcx+0x10] 1912 movd xmm13, dword ptr [rsp+0x110] 
1913 movd xmm14, dword ptr [rsp+0x120] 1914 punpckldq xmm13, xmm14 1915 mov r8, qword ptr [rdi] 1916 movzx eax, byte ptr [rbp+0x40] 1917 or eax, r13d 1918 xor edx, edx 19192: 1920 mov r14d, eax 1921 or eax, r12d 1922 add rdx, 64 1923 cmp rdx, r15 1924 cmovne eax, r14d 1925 movaps xmm2, xmmword ptr [BLAKE3_IV+rip] 1926 shl rax, 32 1927 or rax, 64 1928 movq xmm12, rax 1929 movdqa xmm3, xmm13 1930 punpcklqdq xmm3, xmm12 1931 movups xmm4, xmmword ptr [r8+rdx-0x40] 1932 movups xmm5, xmmword ptr [r8+rdx-0x30] 1933 movaps xmm8, xmm4 1934 shufps xmm4, xmm5, 136 1935 shufps xmm8, xmm5, 221 1936 movaps xmm5, xmm8 1937 movups xmm6, xmmword ptr [r8+rdx-0x20] 1938 movups xmm7, xmmword ptr [r8+rdx-0x10] 1939 movaps xmm8, xmm6 1940 shufps xmm6, xmm7, 136 1941 pshufd xmm6, xmm6, 0x93 1942 shufps xmm8, xmm7, 221 1943 pshufd xmm7, xmm8, 0x93 1944 mov al, 7 19459: 1946 paddd xmm0, xmm4 1947 paddd xmm0, xmm1 1948 pxor xmm3, xmm0 1949 pshuflw xmm3, xmm3, 0xB1 1950 pshufhw xmm3, xmm3, 0xB1 1951 paddd xmm2, xmm3 1952 pxor xmm1, xmm2 1953 movdqa xmm11, xmm1 1954 pslld xmm1, 20 1955 psrld xmm11, 12 1956 por xmm1, xmm11 1957 paddd xmm0, xmm5 1958 paddd xmm0, xmm1 1959 pxor xmm3, xmm0 1960 movdqa xmm14, xmm3 1961 psrld xmm3, 8 1962 pslld xmm14, 24 1963 pxor xmm3, xmm14 1964 paddd xmm2, xmm3 1965 pxor xmm1, xmm2 1966 movdqa xmm11, xmm1 1967 pslld xmm1, 25 1968 psrld xmm11, 7 1969 por xmm1, xmm11 1970 pshufd xmm0, xmm0, 0x93 1971 pshufd xmm3, xmm3, 0x4E 1972 pshufd xmm2, xmm2, 0x39 1973 paddd xmm0, xmm6 1974 paddd xmm0, xmm1 1975 pxor xmm3, xmm0 1976 pshuflw xmm3, xmm3, 0xB1 1977 pshufhw xmm3, xmm3, 0xB1 1978 paddd xmm2, xmm3 1979 pxor xmm1, xmm2 1980 movdqa xmm11, xmm1 1981 pslld xmm1, 20 1982 psrld xmm11, 12 1983 por xmm1, xmm11 1984 paddd xmm0, xmm7 1985 paddd xmm0, xmm1 1986 pxor xmm3, xmm0 1987 movdqa xmm14, xmm3 1988 psrld xmm3, 8 1989 pslld xmm14, 24 1990 pxor xmm3, xmm14 1991 paddd xmm2, xmm3 1992 pxor xmm1, xmm2 1993 movdqa xmm11, xmm1 1994 pslld xmm1, 25 1995 psrld xmm11, 7 1996 por 
xmm1, xmm11
	pshufd xmm0, xmm0, 0x39
	pshufd xmm3, xmm3, 0x4E
	pshufd xmm2, xmm2, 0x93
	dec al
	jz 9f
	/* Rearrange the message registers xmm4-xmm7 before the next pass. */
	movdqa xmm8, xmm4
	shufps xmm8, xmm5, 214
	pshufd xmm9, xmm4, 0x0F
	pshufd xmm4, xmm8, 0x39
	movdqa xmm8, xmm6
	shufps xmm8, xmm7, 250
	pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
	pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
	por xmm9, xmm8
	movdqa xmm8, xmm7
	punpcklqdq xmm8, xmm5
	movdqa xmm10, xmm6
	pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
	pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
	por xmm8, xmm10
	pshufd xmm8, xmm8, 0x78
	punpckhdq xmm5, xmm7
	punpckldq xmm6, xmm5
	pshufd xmm7, xmm6, 0x1E
	movdqa xmm5, xmm9
	movdqa xmm6, xmm8
	jmp 9b
9:
	pxor xmm0, xmm2
	pxor xmm1, xmm3
	mov eax, r13d
	cmp rdx, r15
	jne 2b
	movups xmmword ptr [rbx], xmm0
	movups xmmword ptr [rbx+0x10], xmm1
	jmp 4b
SET_SIZE(zfs_blake3_hash_many_sse2)

/*
 * zfs_blake3_compress_in_place_sse2
 *
 * Runs the 7-pass compression loop over one 64-byte block and writes the
 * 32-byte result back over the input state.
 *
 * In:
 *	rdi	pointer to a 32-byte state; read at entry, overwritten at exit
 *	rsi	pointer to the 64-byte message block (unaligned loads)
 *	dl	8-bit value placed in dword 2 of row 3
 *		(presumably block_len - confirm against the C prototype)
 *	rcx	64-bit value placed in dwords 0-1 of row 3
 *		(presumably the block counter - confirm against the C prototype)
 *	r8b	8-bit value placed in dword 3 of row 3
 *		(presumably the flags - confirm against the C prototype)
 *
 * Register use inside the loop:
 *	xmm0-xmm3	the four 128-bit state rows
 *	xmm4-xmm7	the message, de-interleaved and re-shuffled every pass
 *	xmm8-xmm11, xmm14	scratch
 *	al		pass counter (7 down to 0)
 *
 * Modifies rax, rdx, xmm0-xmm11, xmm14 and the flags.
 */
ENTRY_ALIGN(zfs_blake3_compress_in_place_sse2, 64)
	ENDBR
	movups xmm0, xmmword ptr [rdi]
	movups xmm1, xmmword ptr [rdi+0x10]
	movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
	/*
	 * Zero-extend the two 8-bit arguments before merging them, so the
	 * result does not depend on whatever the caller left in the upper
	 * bits of rdx and r8.  zfs_blake3_compress_xof_sse2 builds the same
	 * value the same way.
	 */
	movzx eax, r8b
	movzx edx, dl
	shl rax, 32
	add rdx, rax
	/* xmm3 = { rcx (low qword), dl | r8b << 32 (high qword) } */
	movq xmm3, rcx
	movq xmm4, rdx
	punpcklqdq xmm3, xmm4
	/*
	 * Load the block and split it: shufps 136 keeps the even-indexed
	 * dwords of each 32-byte half, shufps 221 the odd-indexed ones.
	 */
	movups xmm4, xmmword ptr [rsi]
	movups xmm5, xmmword ptr [rsi+0x10]
	movaps xmm8, xmm4
	shufps xmm4, xmm5, 136
	shufps xmm8, xmm5, 221
	movaps xmm5, xmm8
	movups xmm6, xmmword ptr [rsi+0x20]
	movups xmm7, xmmword ptr [rsi+0x30]
	movaps xmm8, xmm6
	shufps xmm6, xmm7, 136
	pshufd xmm6, xmm6, 0x93
	shufps xmm8, xmm7, 221
	pshufd xmm7, xmm8, 0x93
	mov al, 7			/* seven passes through the loop */
9:
	/* First half-step: add xmm4, rotate right by 16 then by 12. */
	paddd xmm0, xmm4
	paddd xmm0, xmm1
	pxor xmm3, xmm0
	pshuflw xmm3, xmm3, 0xB1	/* swap 16-bit halves: rotate by 16 */
	pshufhw xmm3, xmm3, 0xB1
	paddd xmm2, xmm3
	pxor xmm1, xmm2
	movdqa xmm11, xmm1
	pslld xmm1, 20			/* (x << 20) | (x >> 12): rotate right 12 */
	psrld xmm11, 12
	por xmm1, xmm11
	/* Second half-step: add xmm5, rotate right by 8 then by 7. */
	paddd xmm0, xmm5
	paddd xmm0, xmm1
	pxor xmm3, xmm0
	movdqa xmm14, xmm3
	psrld xmm3, 8			/* (x >> 8) ^ (x << 24): rotate right 8 */
	pslld xmm14, 24
	pxor xmm3, xmm14
	paddd xmm2, xmm3
	pxor xmm1, xmm2
	movdqa xmm11, xmm1
	pslld xmm1, 25			/* (x << 25) | (x >> 7): rotate right 7 */
	psrld xmm11, 7
	por xmm1, xmm11
	/* Rotate the dword lanes of rows 0, 3 and 2 for the next two steps. */
	pshufd xmm0, xmm0, 0x93
	pshufd xmm3, xmm3, 0x4E
	pshufd xmm2, xmm2, 0x39
	/* Third half-step: add xmm6, rotate right by 16 then by 12. */
	paddd xmm0, xmm6
	paddd xmm0, xmm1
	pxor xmm3, xmm0
	pshuflw xmm3, xmm3, 0xB1
	pshufhw xmm3, xmm3, 0xB1
	paddd xmm2, xmm3
	pxor xmm1, xmm2
	movdqa xmm11, xmm1
	pslld xmm1, 20
	psrld xmm11, 12
	por xmm1, xmm11
	/* Fourth half-step: add xmm7, rotate right by 8 then by 7. */
	paddd xmm0, xmm7
	paddd xmm0, xmm1
	pxor xmm3, xmm0
	movdqa xmm14, xmm3
	psrld xmm3, 8
	pslld xmm14, 24
	pxor xmm3, xmm14
	paddd xmm2, xmm3
	pxor xmm1, xmm2
	movdqa xmm11, xmm1
	pslld xmm1, 25
	psrld xmm11, 7
	por xmm1, xmm11
	/* Undo the lane rotation applied above. */
	pshufd xmm0, xmm0, 0x39
	pshufd xmm3, xmm3, 0x4E
	pshufd xmm2, xmm2, 0x93
	dec al
	jz 9f
	/*
	 * Rearrange the message registers xmm4-xmm7 for the next pass.  Each
	 * pand/pand/por triple merges two registers dword-by-dword using a
	 * mask and its complement.
	 */
	movdqa xmm8, xmm4
	shufps xmm8, xmm5, 214
	pshufd xmm9, xmm4, 0x0F
	pshufd xmm4, xmm8, 0x39
	movdqa xmm8, xmm6
	shufps xmm8, xmm7, 250
	pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
	pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
	por xmm9, xmm8
	movdqa xmm8, xmm7
	punpcklqdq xmm8, xmm5
	movdqa xmm10, xmm6
	pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
	pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
	por xmm8, xmm10
	pshufd xmm8, xmm8, 0x78
	punpckhdq xmm5, xmm7
	punpckldq xmm6, xmm5
	pshufd xmm7, xmm6, 0x1E
	movdqa xmm5, xmm9
	movdqa xmm6, xmm8
	jmp 9b
9:
	/* Fold rows 2 and 3 into rows 0 and 1 and store over the input state. */
	pxor xmm0, xmm2
	pxor xmm1, xmm3
	movups xmmword ptr [rdi], xmm0
	movups xmmword ptr [rdi+0x10], xmm1
	RET
SET_SIZE(zfs_blake3_compress_in_place_sse2)

/*
 * zfs_blake3_compress_xof_sse2
 *
 * Runs the same 7-pass loop as zfs_blake3_compress_in_place_sse2, but
 * leaves the input state untouched and writes a 64-byte result to [r9].
 *
 * In:
 *	rdi	pointer to a 32-byte state (read only; re-read at the end)
 *	rsi	pointer to the 64-byte message block (unaligned loads)
 *	dl	8-bit value placed in dword 2 of row 3
 *		(presumably block_len - confirm against the C prototype)
 *	rcx	64-bit value placed in dwords 0-1 of row 3
 *		(presumably the block counter - confirm against the C prototype)
 *	r8b	8-bit value placed in dword 3 of row 3
 *		(presumably the flags - confirm against the C prototype)
 *	r9	pointer to the 64-byte output buffer (unaligned stores)
 *
 * Modifies rax, rdx, xmm0-xmm11, xmm14 and the flags.
 */
ENTRY_ALIGN(zfs_blake3_compress_xof_sse2, 64)
	ENDBR
	movups xmm0, xmmword ptr [rdi]
	movups xmm1, xmmword ptr [rdi+0x10]
	movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
	/* Zero-extend the two 8-bit arguments, then pack them into rdx. */
	movzx eax, r8b
	movzx edx, dl
	shl rax, 32
	add rdx, rax
	/* xmm3 = { rcx (low qword), dl | r8b << 32 (high qword) } */
	movq xmm3, rcx
	movq xmm4, rdx
	punpcklqdq xmm3, xmm4
	/* Load the block; split even-indexed (136) and odd-indexed (221) dwords. */
	movups xmm4, xmmword ptr [rsi]
	movups xmm5, xmmword ptr [rsi+0x10]
	movaps xmm8, xmm4
	shufps xmm4, xmm5, 136
	shufps xmm8, xmm5, 221
	movaps xmm5, xmm8
	movups xmm6, xmmword ptr [rsi+0x20]
	movups xmm7, xmmword ptr [rsi+0x30]
	movaps xmm8, xmm6
	shufps xmm6, xmm7, 136
	pshufd xmm6, xmm6, 0x93
	shufps xmm8, xmm7, 221
	pshufd xmm7, xmm8, 0x93
	mov al, 7			/* seven passes through the loop */
9:
	/* First half-step: add xmm4, rotate right by 16 then by 12. */
	paddd xmm0, xmm4
	paddd xmm0, xmm1
	pxor xmm3, xmm0
	pshuflw xmm3, xmm3, 0xB1
	pshufhw xmm3, xmm3, 0xB1
	paddd xmm2, xmm3
	pxor xmm1, xmm2
	movdqa xmm11, xmm1
	pslld xmm1, 20
	psrld xmm11, 12
	por xmm1, xmm11
	/* Second half-step: add xmm5, rotate right by 8 then by 7. */
	paddd xmm0, xmm5
	paddd xmm0, xmm1
	pxor xmm3, xmm0
	movdqa xmm14, xmm3
	psrld xmm3, 8
	pslld xmm14, 24
	pxor xmm3, xmm14
	paddd xmm2, xmm3
	pxor xmm1, xmm2
	movdqa xmm11, xmm1
	pslld xmm1, 25
	psrld xmm11, 7
	por xmm1, xmm11
	/* Rotate the dword lanes of rows 0, 3 and 2 for the next two steps. */
	pshufd xmm0, xmm0, 0x93
	pshufd xmm3, xmm3, 0x4E
	pshufd xmm2, xmm2, 0x39
	/* Third half-step: add xmm6, rotate right by 16 then by 12. */
	paddd xmm0, xmm6
	paddd xmm0, xmm1
	pxor xmm3, xmm0
	pshuflw xmm3, xmm3, 0xB1
	pshufhw xmm3, xmm3, 0xB1
	paddd xmm2, xmm3
	pxor xmm1, xmm2
	movdqa xmm11, xmm1
	pslld xmm1, 20
	psrld xmm11, 12
	por xmm1, xmm11
	/* Fourth half-step: add xmm7, rotate right by 8 then by 7. */
	paddd xmm0, xmm7
	paddd xmm0, xmm1
	pxor xmm3, xmm0
	movdqa xmm14, xmm3
	psrld xmm3, 8
	pslld xmm14, 24
	pxor xmm3, xmm14
	paddd xmm2, xmm3
	pxor xmm1, xmm2
	movdqa xmm11, xmm1
	pslld xmm1, 25
	psrld xmm11, 7
	por xmm1, xmm11
	/* Undo the lane rotation applied above. */
	pshufd xmm0, xmm0, 0x39
	pshufd xmm3, xmm3, 0x4E
	pshufd xmm2, xmm2, 0x93
	dec al
	jz 9f
	/* Rearrange the message registers xmm4-xmm7 for the next pass. */
	movdqa xmm8, xmm4
	shufps xmm8, xmm5, 214
	pshufd xmm9, xmm4, 0x0F
	pshufd xmm4, xmm8, 0x39
	movdqa xmm8, xmm6
	shufps xmm8, xmm7, 250
	pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
	pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
	por xmm9, xmm8
	movdqa xmm8, xmm7
	punpcklqdq xmm8, xmm5
	movdqa xmm10, xmm6
	pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
	pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
	por xmm8, xmm10
	pshufd xmm8, xmm8, 0x78
	punpckhdq xmm5, xmm7
	punpckldq xmm6, xmm5
	pshufd xmm7, xmm6, 0x1E
	movdqa xmm5, xmm9
	movdqa xmm6, xmm8
	jmp 9b
9:
	/*
	 * Output bytes 0-31:  rows 0,1 XOR rows 2,3.
	 * Output bytes 32-63: rows 2,3 XOR the original state at [rdi].
	 * Rows 0/1 are folded first because they need the unmodified rows 2/3.
	 */
	movdqu xmm4, xmmword ptr [rdi]
	movdqu xmm5, xmmword ptr [rdi+0x10]
	pxor xmm0, xmm2
	pxor xmm1, xmm3
	pxor xmm2, xmm4
	pxor xmm3, xmm5
	movups xmmword ptr [r9], xmm0
	movups xmmword ptr [r9+0x10], xmm1
	movups xmmword ptr [r9+0x20], xmm2
	movups xmmword ptr [r9+0x30], xmm3
	RET
SET_SIZE(zfs_blake3_compress_xof_sse2)

/*
 * Constant tables.  The section is aligned to 64 bytes and every table is
 * 16 bytes long, so each label is 16-byte aligned as the movaps/movdqa/pand
 * memory operands above require.
 */
SECTION_STATIC
.p2align 6
/* The four initialisation words, loaded as row 2 of the state. */
BLAKE3_IV:
	.long 0x6A09E667, 0xBB67AE85
	.long 0x3C6EF372, 0xA54FF53A
/* Per-lane counter offsets and the per-iteration counter step. */
ADD0:
	.long 0, 1, 2, 3
ADD1:
	.long 4, 4, 4, 4
/* Each initialisation word broadcast to all four lanes. */
BLAKE3_IV_0:
	.long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
BLAKE3_IV_1:
	.long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
BLAKE3_IV_2:
	.long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
BLAKE3_IV_3:
	.long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
BLAKE3_BLOCK_LEN:
	.long 64, 64, 64, 64
/* XORed into both operands before pcmpgtd in zfs_blake3_hash_many_sse2. */
CMP_MSB_MASK:
	.long 0x80000000, 0x80000000, 0x80000000, 0x80000000
/*
 * Dword-select masks for the pand/pand/por merges above.  The hex number in
 * each name is the pblendw immediate that selects the same 16-bit words.
 */
PBLENDW_0x33_MASK:
	.long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
PBLENDW_0xCC_MASK:
	.long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
PBLENDW_0x3F_MASK:
	.long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
PBLENDW_0xC0_MASK:
	.long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF

#endif /* HAVE_SSE2 */

#ifdef __ELF__
.section .note.GNU-stack,"",%progbits
#endif