1// SPDX-License-Identifier: CDDL-1.0 2/* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 23/* 24 * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 25 * Copyright (c) 2019-2020 Samuel Neves and Matthew Krupcale 26 * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de> 27 */ 28 29#if defined(HAVE_SSE2) 30 31#define _ASM 32#include <sys/asm_linkage.h> 33 34.intel_syntax noprefix 35 36SECTION_TEXT 37 38ENTRY_ALIGN(zfs_blake3_hash_many_sse2, 64) 39 ENDBR 40 push r15 41 push r14 42 push r13 43 push r12 44 push rbx 45 push rbp 46 mov rbp, rsp 47 sub rsp, 360 48 and rsp, 0xFFFFFFFFFFFFFFC0 49 neg r9d 50 movd xmm0, r9d 51 pshufd xmm0, xmm0, 0x00 52 movdqa xmmword ptr [rsp+0x130], xmm0 53 movdqa xmm1, xmm0 54 pand xmm1, xmmword ptr [ADD0+rip] 55 pand xmm0, xmmword ptr [ADD1+rip] 56 movdqa xmmword ptr [rsp+0x150], xmm0 57 movd xmm0, r8d 58 pshufd xmm0, xmm0, 0x00 59 paddd xmm0, xmm1 60 movdqa xmmword ptr [rsp+0x110], xmm0 61 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] 62 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] 63 pcmpgtd xmm1, xmm0 64 shr r8, 32 65 movd xmm2, r8d 66 pshufd xmm2, xmm2, 0x00 67 psubd xmm2, xmm1 68 movdqa xmmword ptr [rsp+0x120], xmm2 69 mov rbx, qword ptr 
[rbp+0x50] 70 mov r15, rdx 71 shl r15, 6 72 movzx r13d, byte ptr [rbp+0x38] 73 movzx r12d, byte ptr [rbp+0x48] 74 cmp rsi, 4 75 jc 3f 762: 77 movdqu xmm3, xmmword ptr [rcx] 78 pshufd xmm0, xmm3, 0x00 79 pshufd xmm1, xmm3, 0x55 80 pshufd xmm2, xmm3, 0xAA 81 pshufd xmm3, xmm3, 0xFF 82 movdqu xmm7, xmmword ptr [rcx+0x10] 83 pshufd xmm4, xmm7, 0x00 84 pshufd xmm5, xmm7, 0x55 85 pshufd xmm6, xmm7, 0xAA 86 pshufd xmm7, xmm7, 0xFF 87 mov r8, qword ptr [rdi] 88 mov r9, qword ptr [rdi+0x8] 89 mov r10, qword ptr [rdi+0x10] 90 mov r11, qword ptr [rdi+0x18] 91 movzx eax, byte ptr [rbp+0x40] 92 or eax, r13d 93 xor edx, edx 949: 95 mov r14d, eax 96 or eax, r12d 97 add rdx, 64 98 cmp rdx, r15 99 cmovne eax, r14d 100 movdqu xmm8, xmmword ptr [r8+rdx-0x40] 101 movdqu xmm9, xmmword ptr [r9+rdx-0x40] 102 movdqu xmm10, xmmword ptr [r10+rdx-0x40] 103 movdqu xmm11, xmmword ptr [r11+rdx-0x40] 104 movdqa xmm12, xmm8 105 punpckldq xmm8, xmm9 106 punpckhdq xmm12, xmm9 107 movdqa xmm14, xmm10 108 punpckldq xmm10, xmm11 109 punpckhdq xmm14, xmm11 110 movdqa xmm9, xmm8 111 punpcklqdq xmm8, xmm10 112 punpckhqdq xmm9, xmm10 113 movdqa xmm13, xmm12 114 punpcklqdq xmm12, xmm14 115 punpckhqdq xmm13, xmm14 116 movdqa xmmword ptr [rsp], xmm8 117 movdqa xmmword ptr [rsp+0x10], xmm9 118 movdqa xmmword ptr [rsp+0x20], xmm12 119 movdqa xmmword ptr [rsp+0x30], xmm13 120 movdqu xmm8, xmmword ptr [r8+rdx-0x30] 121 movdqu xmm9, xmmword ptr [r9+rdx-0x30] 122 movdqu xmm10, xmmword ptr [r10+rdx-0x30] 123 movdqu xmm11, xmmword ptr [r11+rdx-0x30] 124 movdqa xmm12, xmm8 125 punpckldq xmm8, xmm9 126 punpckhdq xmm12, xmm9 127 movdqa xmm14, xmm10 128 punpckldq xmm10, xmm11 129 punpckhdq xmm14, xmm11 130 movdqa xmm9, xmm8 131 punpcklqdq xmm8, xmm10 132 punpckhqdq xmm9, xmm10 133 movdqa xmm13, xmm12 134 punpcklqdq xmm12, xmm14 135 punpckhqdq xmm13, xmm14 136 movdqa xmmword ptr [rsp+0x40], xmm8 137 movdqa xmmword ptr [rsp+0x50], xmm9 138 movdqa xmmword ptr [rsp+0x60], xmm12 139 movdqa xmmword ptr [rsp+0x70], xmm13 140 
movdqu xmm8, xmmword ptr [r8+rdx-0x20] 141 movdqu xmm9, xmmword ptr [r9+rdx-0x20] 142 movdqu xmm10, xmmword ptr [r10+rdx-0x20] 143 movdqu xmm11, xmmword ptr [r11+rdx-0x20] 144 movdqa xmm12, xmm8 145 punpckldq xmm8, xmm9 146 punpckhdq xmm12, xmm9 147 movdqa xmm14, xmm10 148 punpckldq xmm10, xmm11 149 punpckhdq xmm14, xmm11 150 movdqa xmm9, xmm8 151 punpcklqdq xmm8, xmm10 152 punpckhqdq xmm9, xmm10 153 movdqa xmm13, xmm12 154 punpcklqdq xmm12, xmm14 155 punpckhqdq xmm13, xmm14 156 movdqa xmmword ptr [rsp+0x80], xmm8 157 movdqa xmmword ptr [rsp+0x90], xmm9 158 movdqa xmmword ptr [rsp+0xA0], xmm12 159 movdqa xmmword ptr [rsp+0xB0], xmm13 160 movdqu xmm8, xmmword ptr [r8+rdx-0x10] 161 movdqu xmm9, xmmword ptr [r9+rdx-0x10] 162 movdqu xmm10, xmmword ptr [r10+rdx-0x10] 163 movdqu xmm11, xmmword ptr [r11+rdx-0x10] 164 movdqa xmm12, xmm8 165 punpckldq xmm8, xmm9 166 punpckhdq xmm12, xmm9 167 movdqa xmm14, xmm10 168 punpckldq xmm10, xmm11 169 punpckhdq xmm14, xmm11 170 movdqa xmm9, xmm8 171 punpcklqdq xmm8, xmm10 172 punpckhqdq xmm9, xmm10 173 movdqa xmm13, xmm12 174 punpcklqdq xmm12, xmm14 175 punpckhqdq xmm13, xmm14 176 movdqa xmmword ptr [rsp+0xC0], xmm8 177 movdqa xmmword ptr [rsp+0xD0], xmm9 178 movdqa xmmword ptr [rsp+0xE0], xmm12 179 movdqa xmmword ptr [rsp+0xF0], xmm13 180 movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] 181 movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] 182 movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] 183 movdqa xmm12, xmmword ptr [rsp+0x110] 184 movdqa xmm13, xmmword ptr [rsp+0x120] 185 movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] 186 movd xmm15, eax 187 pshufd xmm15, xmm15, 0x00 188 prefetcht0 [r8+rdx+0x80] 189 prefetcht0 [r9+rdx+0x80] 190 prefetcht0 [r10+rdx+0x80] 191 prefetcht0 [r11+rdx+0x80] 192 paddd xmm0, xmmword ptr [rsp] 193 paddd xmm1, xmmword ptr [rsp+0x20] 194 paddd xmm2, xmmword ptr [rsp+0x40] 195 paddd xmm3, xmmword ptr [rsp+0x60] 196 paddd xmm0, xmm4 197 paddd xmm1, xmm5 198 paddd xmm2, xmm6 199 paddd xmm3, xmm7 200 pxor xmm12, xmm0 201 
pxor xmm13, xmm1 202 pxor xmm14, xmm2 203 pxor xmm15, xmm3 204 pshuflw xmm12, xmm12, 0xB1 205 pshufhw xmm12, xmm12, 0xB1 206 pshuflw xmm13, xmm13, 0xB1 207 pshufhw xmm13, xmm13, 0xB1 208 pshuflw xmm14, xmm14, 0xB1 209 pshufhw xmm14, xmm14, 0xB1 210 pshuflw xmm15, xmm15, 0xB1 211 pshufhw xmm15, xmm15, 0xB1 212 movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] 213 paddd xmm8, xmm12 214 paddd xmm9, xmm13 215 paddd xmm10, xmm14 216 paddd xmm11, xmm15 217 pxor xmm4, xmm8 218 pxor xmm5, xmm9 219 pxor xmm6, xmm10 220 pxor xmm7, xmm11 221 movdqa xmmword ptr [rsp+0x100], xmm8 222 movdqa xmm8, xmm4 223 psrld xmm8, 12 224 pslld xmm4, 20 225 por xmm4, xmm8 226 movdqa xmm8, xmm5 227 psrld xmm8, 12 228 pslld xmm5, 20 229 por xmm5, xmm8 230 movdqa xmm8, xmm6 231 psrld xmm8, 12 232 pslld xmm6, 20 233 por xmm6, xmm8 234 movdqa xmm8, xmm7 235 psrld xmm8, 12 236 pslld xmm7, 20 237 por xmm7, xmm8 238 paddd xmm0, xmmword ptr [rsp+0x10] 239 paddd xmm1, xmmword ptr [rsp+0x30] 240 paddd xmm2, xmmword ptr [rsp+0x50] 241 paddd xmm3, xmmword ptr [rsp+0x70] 242 paddd xmm0, xmm4 243 paddd xmm1, xmm5 244 paddd xmm2, xmm6 245 paddd xmm3, xmm7 246 pxor xmm12, xmm0 247 pxor xmm13, xmm1 248 pxor xmm14, xmm2 249 pxor xmm15, xmm3 250 movdqa xmm8, xmm12 251 psrld xmm12, 8 252 pslld xmm8, 24 253 pxor xmm12, xmm8 254 movdqa xmm8, xmm13 255 psrld xmm13, 8 256 pslld xmm8, 24 257 pxor xmm13, xmm8 258 movdqa xmm8, xmm14 259 psrld xmm14, 8 260 pslld xmm8, 24 261 pxor xmm14, xmm8 262 movdqa xmm8, xmm15 263 psrld xmm15, 8 264 pslld xmm8, 24 265 pxor xmm15, xmm8 266 movdqa xmm8, xmmword ptr [rsp+0x100] 267 paddd xmm8, xmm12 268 paddd xmm9, xmm13 269 paddd xmm10, xmm14 270 paddd xmm11, xmm15 271 pxor xmm4, xmm8 272 pxor xmm5, xmm9 273 pxor xmm6, xmm10 274 pxor xmm7, xmm11 275 movdqa xmmword ptr [rsp+0x100], xmm8 276 movdqa xmm8, xmm4 277 psrld xmm8, 7 278 pslld xmm4, 25 279 por xmm4, xmm8 280 movdqa xmm8, xmm5 281 psrld xmm8, 7 282 pslld xmm5, 25 283 por xmm5, xmm8 284 movdqa xmm8, xmm6 285 psrld xmm8, 7 286 pslld 
xmm6, 25 287 por xmm6, xmm8 288 movdqa xmm8, xmm7 289 psrld xmm8, 7 290 pslld xmm7, 25 291 por xmm7, xmm8 292 paddd xmm0, xmmword ptr [rsp+0x80] 293 paddd xmm1, xmmword ptr [rsp+0xA0] 294 paddd xmm2, xmmword ptr [rsp+0xC0] 295 paddd xmm3, xmmword ptr [rsp+0xE0] 296 paddd xmm0, xmm5 297 paddd xmm1, xmm6 298 paddd xmm2, xmm7 299 paddd xmm3, xmm4 300 pxor xmm15, xmm0 301 pxor xmm12, xmm1 302 pxor xmm13, xmm2 303 pxor xmm14, xmm3 304 pshuflw xmm15, xmm15, 0xB1 305 pshufhw xmm15, xmm15, 0xB1 306 pshuflw xmm12, xmm12, 0xB1 307 pshufhw xmm12, xmm12, 0xB1 308 pshuflw xmm13, xmm13, 0xB1 309 pshufhw xmm13, xmm13, 0xB1 310 pshuflw xmm14, xmm14, 0xB1 311 pshufhw xmm14, xmm14, 0xB1 312 paddd xmm10, xmm15 313 paddd xmm11, xmm12 314 movdqa xmm8, xmmword ptr [rsp+0x100] 315 paddd xmm8, xmm13 316 paddd xmm9, xmm14 317 pxor xmm5, xmm10 318 pxor xmm6, xmm11 319 pxor xmm7, xmm8 320 pxor xmm4, xmm9 321 movdqa xmmword ptr [rsp+0x100], xmm8 322 movdqa xmm8, xmm5 323 psrld xmm8, 12 324 pslld xmm5, 20 325 por xmm5, xmm8 326 movdqa xmm8, xmm6 327 psrld xmm8, 12 328 pslld xmm6, 20 329 por xmm6, xmm8 330 movdqa xmm8, xmm7 331 psrld xmm8, 12 332 pslld xmm7, 20 333 por xmm7, xmm8 334 movdqa xmm8, xmm4 335 psrld xmm8, 12 336 pslld xmm4, 20 337 por xmm4, xmm8 338 paddd xmm0, xmmword ptr [rsp+0x90] 339 paddd xmm1, xmmword ptr [rsp+0xB0] 340 paddd xmm2, xmmword ptr [rsp+0xD0] 341 paddd xmm3, xmmword ptr [rsp+0xF0] 342 paddd xmm0, xmm5 343 paddd xmm1, xmm6 344 paddd xmm2, xmm7 345 paddd xmm3, xmm4 346 pxor xmm15, xmm0 347 pxor xmm12, xmm1 348 pxor xmm13, xmm2 349 pxor xmm14, xmm3 350 movdqa xmm8, xmm15 351 psrld xmm15, 8 352 pslld xmm8, 24 353 pxor xmm15, xmm8 354 movdqa xmm8, xmm12 355 psrld xmm12, 8 356 pslld xmm8, 24 357 pxor xmm12, xmm8 358 movdqa xmm8, xmm13 359 psrld xmm13, 8 360 pslld xmm8, 24 361 pxor xmm13, xmm8 362 movdqa xmm8, xmm14 363 psrld xmm14, 8 364 pslld xmm8, 24 365 pxor xmm14, xmm8 366 paddd xmm10, xmm15 367 paddd xmm11, xmm12 368 movdqa xmm8, xmmword ptr [rsp+0x100] 369 paddd 
xmm8, xmm13 370 paddd xmm9, xmm14 371 pxor xmm5, xmm10 372 pxor xmm6, xmm11 373 pxor xmm7, xmm8 374 pxor xmm4, xmm9 375 movdqa xmmword ptr [rsp+0x100], xmm8 376 movdqa xmm8, xmm5 377 psrld xmm8, 7 378 pslld xmm5, 25 379 por xmm5, xmm8 380 movdqa xmm8, xmm6 381 psrld xmm8, 7 382 pslld xmm6, 25 383 por xmm6, xmm8 384 movdqa xmm8, xmm7 385 psrld xmm8, 7 386 pslld xmm7, 25 387 por xmm7, xmm8 388 movdqa xmm8, xmm4 389 psrld xmm8, 7 390 pslld xmm4, 25 391 por xmm4, xmm8 392 paddd xmm0, xmmword ptr [rsp+0x20] 393 paddd xmm1, xmmword ptr [rsp+0x30] 394 paddd xmm2, xmmword ptr [rsp+0x70] 395 paddd xmm3, xmmword ptr [rsp+0x40] 396 paddd xmm0, xmm4 397 paddd xmm1, xmm5 398 paddd xmm2, xmm6 399 paddd xmm3, xmm7 400 pxor xmm12, xmm0 401 pxor xmm13, xmm1 402 pxor xmm14, xmm2 403 pxor xmm15, xmm3 404 pshuflw xmm12, xmm12, 0xB1 405 pshufhw xmm12, xmm12, 0xB1 406 pshuflw xmm13, xmm13, 0xB1 407 pshufhw xmm13, xmm13, 0xB1 408 pshuflw xmm14, xmm14, 0xB1 409 pshufhw xmm14, xmm14, 0xB1 410 pshuflw xmm15, xmm15, 0xB1 411 pshufhw xmm15, xmm15, 0xB1 412 movdqa xmm8, xmmword ptr [rsp+0x100] 413 paddd xmm8, xmm12 414 paddd xmm9, xmm13 415 paddd xmm10, xmm14 416 paddd xmm11, xmm15 417 pxor xmm4, xmm8 418 pxor xmm5, xmm9 419 pxor xmm6, xmm10 420 pxor xmm7, xmm11 421 movdqa xmmword ptr [rsp+0x100], xmm8 422 movdqa xmm8, xmm4 423 psrld xmm8, 12 424 pslld xmm4, 20 425 por xmm4, xmm8 426 movdqa xmm8, xmm5 427 psrld xmm8, 12 428 pslld xmm5, 20 429 por xmm5, xmm8 430 movdqa xmm8, xmm6 431 psrld xmm8, 12 432 pslld xmm6, 20 433 por xmm6, xmm8 434 movdqa xmm8, xmm7 435 psrld xmm8, 12 436 pslld xmm7, 20 437 por xmm7, xmm8 438 paddd xmm0, xmmword ptr [rsp+0x60] 439 paddd xmm1, xmmword ptr [rsp+0xA0] 440 paddd xmm2, xmmword ptr [rsp] 441 paddd xmm3, xmmword ptr [rsp+0xD0] 442 paddd xmm0, xmm4 443 paddd xmm1, xmm5 444 paddd xmm2, xmm6 445 paddd xmm3, xmm7 446 pxor xmm12, xmm0 447 pxor xmm13, xmm1 448 pxor xmm14, xmm2 449 pxor xmm15, xmm3 450 movdqa xmm8, xmm12 451 psrld xmm12, 8 452 pslld xmm8, 24 453 pxor 
xmm12, xmm8 454 movdqa xmm8, xmm13 455 psrld xmm13, 8 456 pslld xmm8, 24 457 pxor xmm13, xmm8 458 movdqa xmm8, xmm14 459 psrld xmm14, 8 460 pslld xmm8, 24 461 pxor xmm14, xmm8 462 movdqa xmm8, xmm15 463 psrld xmm15, 8 464 pslld xmm8, 24 465 pxor xmm15, xmm8 466 movdqa xmm8, xmmword ptr [rsp+0x100] 467 paddd xmm8, xmm12 468 paddd xmm9, xmm13 469 paddd xmm10, xmm14 470 paddd xmm11, xmm15 471 pxor xmm4, xmm8 472 pxor xmm5, xmm9 473 pxor xmm6, xmm10 474 pxor xmm7, xmm11 475 movdqa xmmword ptr [rsp+0x100], xmm8 476 movdqa xmm8, xmm4 477 psrld xmm8, 7 478 pslld xmm4, 25 479 por xmm4, xmm8 480 movdqa xmm8, xmm5 481 psrld xmm8, 7 482 pslld xmm5, 25 483 por xmm5, xmm8 484 movdqa xmm8, xmm6 485 psrld xmm8, 7 486 pslld xmm6, 25 487 por xmm6, xmm8 488 movdqa xmm8, xmm7 489 psrld xmm8, 7 490 pslld xmm7, 25 491 por xmm7, xmm8 492 paddd xmm0, xmmword ptr [rsp+0x10] 493 paddd xmm1, xmmword ptr [rsp+0xC0] 494 paddd xmm2, xmmword ptr [rsp+0x90] 495 paddd xmm3, xmmword ptr [rsp+0xF0] 496 paddd xmm0, xmm5 497 paddd xmm1, xmm6 498 paddd xmm2, xmm7 499 paddd xmm3, xmm4 500 pxor xmm15, xmm0 501 pxor xmm12, xmm1 502 pxor xmm13, xmm2 503 pxor xmm14, xmm3 504 pshuflw xmm15, xmm15, 0xB1 505 pshufhw xmm15, xmm15, 0xB1 506 pshuflw xmm12, xmm12, 0xB1 507 pshufhw xmm12, xmm12, 0xB1 508 pshuflw xmm13, xmm13, 0xB1 509 pshufhw xmm13, xmm13, 0xB1 510 pshuflw xmm14, xmm14, 0xB1 511 pshufhw xmm14, xmm14, 0xB1 512 paddd xmm10, xmm15 513 paddd xmm11, xmm12 514 movdqa xmm8, xmmword ptr [rsp+0x100] 515 paddd xmm8, xmm13 516 paddd xmm9, xmm14 517 pxor xmm5, xmm10 518 pxor xmm6, xmm11 519 pxor xmm7, xmm8 520 pxor xmm4, xmm9 521 movdqa xmmword ptr [rsp+0x100], xmm8 522 movdqa xmm8, xmm5 523 psrld xmm8, 12 524 pslld xmm5, 20 525 por xmm5, xmm8 526 movdqa xmm8, xmm6 527 psrld xmm8, 12 528 pslld xmm6, 20 529 por xmm6, xmm8 530 movdqa xmm8, xmm7 531 psrld xmm8, 12 532 pslld xmm7, 20 533 por xmm7, xmm8 534 movdqa xmm8, xmm4 535 psrld xmm8, 12 536 pslld xmm4, 20 537 por xmm4, xmm8 538 paddd xmm0, xmmword ptr 
[rsp+0xB0] 539 paddd xmm1, xmmword ptr [rsp+0x50] 540 paddd xmm2, xmmword ptr [rsp+0xE0] 541 paddd xmm3, xmmword ptr [rsp+0x80] 542 paddd xmm0, xmm5 543 paddd xmm1, xmm6 544 paddd xmm2, xmm7 545 paddd xmm3, xmm4 546 pxor xmm15, xmm0 547 pxor xmm12, xmm1 548 pxor xmm13, xmm2 549 pxor xmm14, xmm3 550 movdqa xmm8, xmm15 551 psrld xmm15, 8 552 pslld xmm8, 24 553 pxor xmm15, xmm8 554 movdqa xmm8, xmm12 555 psrld xmm12, 8 556 pslld xmm8, 24 557 pxor xmm12, xmm8 558 movdqa xmm8, xmm13 559 psrld xmm13, 8 560 pslld xmm8, 24 561 pxor xmm13, xmm8 562 movdqa xmm8, xmm14 563 psrld xmm14, 8 564 pslld xmm8, 24 565 pxor xmm14, xmm8 566 paddd xmm10, xmm15 567 paddd xmm11, xmm12 568 movdqa xmm8, xmmword ptr [rsp+0x100] 569 paddd xmm8, xmm13 570 paddd xmm9, xmm14 571 pxor xmm5, xmm10 572 pxor xmm6, xmm11 573 pxor xmm7, xmm8 574 pxor xmm4, xmm9 575 movdqa xmmword ptr [rsp+0x100], xmm8 576 movdqa xmm8, xmm5 577 psrld xmm8, 7 578 pslld xmm5, 25 579 por xmm5, xmm8 580 movdqa xmm8, xmm6 581 psrld xmm8, 7 582 pslld xmm6, 25 583 por xmm6, xmm8 584 movdqa xmm8, xmm7 585 psrld xmm8, 7 586 pslld xmm7, 25 587 por xmm7, xmm8 588 movdqa xmm8, xmm4 589 psrld xmm8, 7 590 pslld xmm4, 25 591 por xmm4, xmm8 592 paddd xmm0, xmmword ptr [rsp+0x30] 593 paddd xmm1, xmmword ptr [rsp+0xA0] 594 paddd xmm2, xmmword ptr [rsp+0xD0] 595 paddd xmm3, xmmword ptr [rsp+0x70] 596 paddd xmm0, xmm4 597 paddd xmm1, xmm5 598 paddd xmm2, xmm6 599 paddd xmm3, xmm7 600 pxor xmm12, xmm0 601 pxor xmm13, xmm1 602 pxor xmm14, xmm2 603 pxor xmm15, xmm3 604 pshuflw xmm12, xmm12, 0xB1 605 pshufhw xmm12, xmm12, 0xB1 606 pshuflw xmm13, xmm13, 0xB1 607 pshufhw xmm13, xmm13, 0xB1 608 pshuflw xmm14, xmm14, 0xB1 609 pshufhw xmm14, xmm14, 0xB1 610 pshuflw xmm15, xmm15, 0xB1 611 pshufhw xmm15, xmm15, 0xB1 612 movdqa xmm8, xmmword ptr [rsp+0x100] 613 paddd xmm8, xmm12 614 paddd xmm9, xmm13 615 paddd xmm10, xmm14 616 paddd xmm11, xmm15 617 pxor xmm4, xmm8 618 pxor xmm5, xmm9 619 pxor xmm6, xmm10 620 pxor xmm7, xmm11 621 movdqa xmmword ptr 
[rsp+0x100], xmm8 622 movdqa xmm8, xmm4 623 psrld xmm8, 12 624 pslld xmm4, 20 625 por xmm4, xmm8 626 movdqa xmm8, xmm5 627 psrld xmm8, 12 628 pslld xmm5, 20 629 por xmm5, xmm8 630 movdqa xmm8, xmm6 631 psrld xmm8, 12 632 pslld xmm6, 20 633 por xmm6, xmm8 634 movdqa xmm8, xmm7 635 psrld xmm8, 12 636 pslld xmm7, 20 637 por xmm7, xmm8 638 paddd xmm0, xmmword ptr [rsp+0x40] 639 paddd xmm1, xmmword ptr [rsp+0xC0] 640 paddd xmm2, xmmword ptr [rsp+0x20] 641 paddd xmm3, xmmword ptr [rsp+0xE0] 642 paddd xmm0, xmm4 643 paddd xmm1, xmm5 644 paddd xmm2, xmm6 645 paddd xmm3, xmm7 646 pxor xmm12, xmm0 647 pxor xmm13, xmm1 648 pxor xmm14, xmm2 649 pxor xmm15, xmm3 650 movdqa xmm8, xmm12 651 psrld xmm12, 8 652 pslld xmm8, 24 653 pxor xmm12, xmm8 654 movdqa xmm8, xmm13 655 psrld xmm13, 8 656 pslld xmm8, 24 657 pxor xmm13, xmm8 658 movdqa xmm8, xmm14 659 psrld xmm14, 8 660 pslld xmm8, 24 661 pxor xmm14, xmm8 662 movdqa xmm8, xmm15 663 psrld xmm15, 8 664 pslld xmm8, 24 665 pxor xmm15, xmm8 666 movdqa xmm8, xmmword ptr [rsp+0x100] 667 paddd xmm8, xmm12 668 paddd xmm9, xmm13 669 paddd xmm10, xmm14 670 paddd xmm11, xmm15 671 pxor xmm4, xmm8 672 pxor xmm5, xmm9 673 pxor xmm6, xmm10 674 pxor xmm7, xmm11 675 movdqa xmmword ptr [rsp+0x100], xmm8 676 movdqa xmm8, xmm4 677 psrld xmm8, 7 678 pslld xmm4, 25 679 por xmm4, xmm8 680 movdqa xmm8, xmm5 681 psrld xmm8, 7 682 pslld xmm5, 25 683 por xmm5, xmm8 684 movdqa xmm8, xmm6 685 psrld xmm8, 7 686 pslld xmm6, 25 687 por xmm6, xmm8 688 movdqa xmm8, xmm7 689 psrld xmm8, 7 690 pslld xmm7, 25 691 por xmm7, xmm8 692 paddd xmm0, xmmword ptr [rsp+0x60] 693 paddd xmm1, xmmword ptr [rsp+0x90] 694 paddd xmm2, xmmword ptr [rsp+0xB0] 695 paddd xmm3, xmmword ptr [rsp+0x80] 696 paddd xmm0, xmm5 697 paddd xmm1, xmm6 698 paddd xmm2, xmm7 699 paddd xmm3, xmm4 700 pxor xmm15, xmm0 701 pxor xmm12, xmm1 702 pxor xmm13, xmm2 703 pxor xmm14, xmm3 704 pshuflw xmm15, xmm15, 0xB1 705 pshufhw xmm15, xmm15, 0xB1 706 pshuflw xmm12, xmm12, 0xB1 707 pshufhw xmm12, xmm12, 0xB1 
708 pshuflw xmm13, xmm13, 0xB1 709 pshufhw xmm13, xmm13, 0xB1 710 pshuflw xmm14, xmm14, 0xB1 711 pshufhw xmm14, xmm14, 0xB1 712 paddd xmm10, xmm15 713 paddd xmm11, xmm12 714 movdqa xmm8, xmmword ptr [rsp+0x100] 715 paddd xmm8, xmm13 716 paddd xmm9, xmm14 717 pxor xmm5, xmm10 718 pxor xmm6, xmm11 719 pxor xmm7, xmm8 720 pxor xmm4, xmm9 721 movdqa xmmword ptr [rsp+0x100], xmm8 722 movdqa xmm8, xmm5 723 psrld xmm8, 12 724 pslld xmm5, 20 725 por xmm5, xmm8 726 movdqa xmm8, xmm6 727 psrld xmm8, 12 728 pslld xmm6, 20 729 por xmm6, xmm8 730 movdqa xmm8, xmm7 731 psrld xmm8, 12 732 pslld xmm7, 20 733 por xmm7, xmm8 734 movdqa xmm8, xmm4 735 psrld xmm8, 12 736 pslld xmm4, 20 737 por xmm4, xmm8 738 paddd xmm0, xmmword ptr [rsp+0x50] 739 paddd xmm1, xmmword ptr [rsp] 740 paddd xmm2, xmmword ptr [rsp+0xF0] 741 paddd xmm3, xmmword ptr [rsp+0x10] 742 paddd xmm0, xmm5 743 paddd xmm1, xmm6 744 paddd xmm2, xmm7 745 paddd xmm3, xmm4 746 pxor xmm15, xmm0 747 pxor xmm12, xmm1 748 pxor xmm13, xmm2 749 pxor xmm14, xmm3 750 movdqa xmm8, xmm15 751 psrld xmm15, 8 752 pslld xmm8, 24 753 pxor xmm15, xmm8 754 movdqa xmm8, xmm12 755 psrld xmm12, 8 756 pslld xmm8, 24 757 pxor xmm12, xmm8 758 movdqa xmm8, xmm13 759 psrld xmm13, 8 760 pslld xmm8, 24 761 pxor xmm13, xmm8 762 movdqa xmm8, xmm14 763 psrld xmm14, 8 764 pslld xmm8, 24 765 pxor xmm14, xmm8 766 paddd xmm10, xmm15 767 paddd xmm11, xmm12 768 movdqa xmm8, xmmword ptr [rsp+0x100] 769 paddd xmm8, xmm13 770 paddd xmm9, xmm14 771 pxor xmm5, xmm10 772 pxor xmm6, xmm11 773 pxor xmm7, xmm8 774 pxor xmm4, xmm9 775 movdqa xmmword ptr [rsp+0x100], xmm8 776 movdqa xmm8, xmm5 777 psrld xmm8, 7 778 pslld xmm5, 25 779 por xmm5, xmm8 780 movdqa xmm8, xmm6 781 psrld xmm8, 7 782 pslld xmm6, 25 783 por xmm6, xmm8 784 movdqa xmm8, xmm7 785 psrld xmm8, 7 786 pslld xmm7, 25 787 por xmm7, xmm8 788 movdqa xmm8, xmm4 789 psrld xmm8, 7 790 pslld xmm4, 25 791 por xmm4, xmm8 792 paddd xmm0, xmmword ptr [rsp+0xA0] 793 paddd xmm1, xmmword ptr [rsp+0xC0] 794 paddd 
xmm2, xmmword ptr [rsp+0xE0] 795 paddd xmm3, xmmword ptr [rsp+0xD0] 796 paddd xmm0, xmm4 797 paddd xmm1, xmm5 798 paddd xmm2, xmm6 799 paddd xmm3, xmm7 800 pxor xmm12, xmm0 801 pxor xmm13, xmm1 802 pxor xmm14, xmm2 803 pxor xmm15, xmm3 804 pshuflw xmm12, xmm12, 0xB1 805 pshufhw xmm12, xmm12, 0xB1 806 pshuflw xmm13, xmm13, 0xB1 807 pshufhw xmm13, xmm13, 0xB1 808 pshuflw xmm14, xmm14, 0xB1 809 pshufhw xmm14, xmm14, 0xB1 810 pshuflw xmm15, xmm15, 0xB1 811 pshufhw xmm15, xmm15, 0xB1 812 movdqa xmm8, xmmword ptr [rsp+0x100] 813 paddd xmm8, xmm12 814 paddd xmm9, xmm13 815 paddd xmm10, xmm14 816 paddd xmm11, xmm15 817 pxor xmm4, xmm8 818 pxor xmm5, xmm9 819 pxor xmm6, xmm10 820 pxor xmm7, xmm11 821 movdqa xmmword ptr [rsp+0x100], xmm8 822 movdqa xmm8, xmm4 823 psrld xmm8, 12 824 pslld xmm4, 20 825 por xmm4, xmm8 826 movdqa xmm8, xmm5 827 psrld xmm8, 12 828 pslld xmm5, 20 829 por xmm5, xmm8 830 movdqa xmm8, xmm6 831 psrld xmm8, 12 832 pslld xmm6, 20 833 por xmm6, xmm8 834 movdqa xmm8, xmm7 835 psrld xmm8, 12 836 pslld xmm7, 20 837 por xmm7, xmm8 838 paddd xmm0, xmmword ptr [rsp+0x70] 839 paddd xmm1, xmmword ptr [rsp+0x90] 840 paddd xmm2, xmmword ptr [rsp+0x30] 841 paddd xmm3, xmmword ptr [rsp+0xF0] 842 paddd xmm0, xmm4 843 paddd xmm1, xmm5 844 paddd xmm2, xmm6 845 paddd xmm3, xmm7 846 pxor xmm12, xmm0 847 pxor xmm13, xmm1 848 pxor xmm14, xmm2 849 pxor xmm15, xmm3 850 movdqa xmm8, xmm12 851 psrld xmm12, 8 852 pslld xmm8, 24 853 pxor xmm12, xmm8 854 movdqa xmm8, xmm13 855 psrld xmm13, 8 856 pslld xmm8, 24 857 pxor xmm13, xmm8 858 movdqa xmm8, xmm14 859 psrld xmm14, 8 860 pslld xmm8, 24 861 pxor xmm14, xmm8 862 movdqa xmm8, xmm15 863 psrld xmm15, 8 864 pslld xmm8, 24 865 pxor xmm15, xmm8 866 movdqa xmm8, xmmword ptr [rsp+0x100] 867 paddd xmm8, xmm12 868 paddd xmm9, xmm13 869 paddd xmm10, xmm14 870 paddd xmm11, xmm15 871 pxor xmm4, xmm8 872 pxor xmm5, xmm9 873 pxor xmm6, xmm10 874 pxor xmm7, xmm11 875 movdqa xmmword ptr [rsp+0x100], xmm8 876 movdqa xmm8, xmm4 877 psrld xmm8, 7 
878 pslld xmm4, 25 879 por xmm4, xmm8 880 movdqa xmm8, xmm5 881 psrld xmm8, 7 882 pslld xmm5, 25 883 por xmm5, xmm8 884 movdqa xmm8, xmm6 885 psrld xmm8, 7 886 pslld xmm6, 25 887 por xmm6, xmm8 888 movdqa xmm8, xmm7 889 psrld xmm8, 7 890 pslld xmm7, 25 891 por xmm7, xmm8 892 paddd xmm0, xmmword ptr [rsp+0x40] 893 paddd xmm1, xmmword ptr [rsp+0xB0] 894 paddd xmm2, xmmword ptr [rsp+0x50] 895 paddd xmm3, xmmword ptr [rsp+0x10] 896 paddd xmm0, xmm5 897 paddd xmm1, xmm6 898 paddd xmm2, xmm7 899 paddd xmm3, xmm4 900 pxor xmm15, xmm0 901 pxor xmm12, xmm1 902 pxor xmm13, xmm2 903 pxor xmm14, xmm3 904 pshuflw xmm15, xmm15, 0xB1 905 pshufhw xmm15, xmm15, 0xB1 906 pshuflw xmm12, xmm12, 0xB1 907 pshufhw xmm12, xmm12, 0xB1 908 pshuflw xmm13, xmm13, 0xB1 909 pshufhw xmm13, xmm13, 0xB1 910 pshuflw xmm14, xmm14, 0xB1 911 pshufhw xmm14, xmm14, 0xB1 912 paddd xmm10, xmm15 913 paddd xmm11, xmm12 914 movdqa xmm8, xmmword ptr [rsp+0x100] 915 paddd xmm8, xmm13 916 paddd xmm9, xmm14 917 pxor xmm5, xmm10 918 pxor xmm6, xmm11 919 pxor xmm7, xmm8 920 pxor xmm4, xmm9 921 movdqa xmmword ptr [rsp+0x100], xmm8 922 movdqa xmm8, xmm5 923 psrld xmm8, 12 924 pslld xmm5, 20 925 por xmm5, xmm8 926 movdqa xmm8, xmm6 927 psrld xmm8, 12 928 pslld xmm6, 20 929 por xmm6, xmm8 930 movdqa xmm8, xmm7 931 psrld xmm8, 12 932 pslld xmm7, 20 933 por xmm7, xmm8 934 movdqa xmm8, xmm4 935 psrld xmm8, 12 936 pslld xmm4, 20 937 por xmm4, xmm8 938 paddd xmm0, xmmword ptr [rsp] 939 paddd xmm1, xmmword ptr [rsp+0x20] 940 paddd xmm2, xmmword ptr [rsp+0x80] 941 paddd xmm3, xmmword ptr [rsp+0x60] 942 paddd xmm0, xmm5 943 paddd xmm1, xmm6 944 paddd xmm2, xmm7 945 paddd xmm3, xmm4 946 pxor xmm15, xmm0 947 pxor xmm12, xmm1 948 pxor xmm13, xmm2 949 pxor xmm14, xmm3 950 movdqa xmm8, xmm15 951 psrld xmm15, 8 952 pslld xmm8, 24 953 pxor xmm15, xmm8 954 movdqa xmm8, xmm12 955 psrld xmm12, 8 956 pslld xmm8, 24 957 pxor xmm12, xmm8 958 movdqa xmm8, xmm13 959 psrld xmm13, 8 960 pslld xmm8, 24 961 pxor xmm13, xmm8 962 movdqa xmm8, 
xmm14 963 psrld xmm14, 8 964 pslld xmm8, 24 965 pxor xmm14, xmm8 966 paddd xmm10, xmm15 967 paddd xmm11, xmm12 968 movdqa xmm8, xmmword ptr [rsp+0x100] 969 paddd xmm8, xmm13 970 paddd xmm9, xmm14 971 pxor xmm5, xmm10 972 pxor xmm6, xmm11 973 pxor xmm7, xmm8 974 pxor xmm4, xmm9 975 movdqa xmmword ptr [rsp+0x100], xmm8 976 movdqa xmm8, xmm5 977 psrld xmm8, 7 978 pslld xmm5, 25 979 por xmm5, xmm8 980 movdqa xmm8, xmm6 981 psrld xmm8, 7 982 pslld xmm6, 25 983 por xmm6, xmm8 984 movdqa xmm8, xmm7 985 psrld xmm8, 7 986 pslld xmm7, 25 987 por xmm7, xmm8 988 movdqa xmm8, xmm4 989 psrld xmm8, 7 990 pslld xmm4, 25 991 por xmm4, xmm8 992 paddd xmm0, xmmword ptr [rsp+0xC0] 993 paddd xmm1, xmmword ptr [rsp+0x90] 994 paddd xmm2, xmmword ptr [rsp+0xF0] 995 paddd xmm3, xmmword ptr [rsp+0xE0] 996 paddd xmm0, xmm4 997 paddd xmm1, xmm5 998 paddd xmm2, xmm6 999 paddd xmm3, xmm7 1000 pxor xmm12, xmm0 1001 pxor xmm13, xmm1 1002 pxor xmm14, xmm2 1003 pxor xmm15, xmm3 1004 pshuflw xmm12, xmm12, 0xB1 1005 pshufhw xmm12, xmm12, 0xB1 1006 pshuflw xmm13, xmm13, 0xB1 1007 pshufhw xmm13, xmm13, 0xB1 1008 pshuflw xmm14, xmm14, 0xB1 1009 pshufhw xmm14, xmm14, 0xB1 1010 pshuflw xmm15, xmm15, 0xB1 1011 pshufhw xmm15, xmm15, 0xB1 1012 movdqa xmm8, xmmword ptr [rsp+0x100] 1013 paddd xmm8, xmm12 1014 paddd xmm9, xmm13 1015 paddd xmm10, xmm14 1016 paddd xmm11, xmm15 1017 pxor xmm4, xmm8 1018 pxor xmm5, xmm9 1019 pxor xmm6, xmm10 1020 pxor xmm7, xmm11 1021 movdqa xmmword ptr [rsp+0x100], xmm8 1022 movdqa xmm8, xmm4 1023 psrld xmm8, 12 1024 pslld xmm4, 20 1025 por xmm4, xmm8 1026 movdqa xmm8, xmm5 1027 psrld xmm8, 12 1028 pslld xmm5, 20 1029 por xmm5, xmm8 1030 movdqa xmm8, xmm6 1031 psrld xmm8, 12 1032 pslld xmm6, 20 1033 por xmm6, xmm8 1034 movdqa xmm8, xmm7 1035 psrld xmm8, 12 1036 pslld xmm7, 20 1037 por xmm7, xmm8 1038 paddd xmm0, xmmword ptr [rsp+0xD0] 1039 paddd xmm1, xmmword ptr [rsp+0xB0] 1040 paddd xmm2, xmmword ptr [rsp+0xA0] 1041 paddd xmm3, xmmword ptr [rsp+0x80] 1042 paddd xmm0, xmm4 1043 
paddd xmm1, xmm5 1044 paddd xmm2, xmm6 1045 paddd xmm3, xmm7 1046 pxor xmm12, xmm0 1047 pxor xmm13, xmm1 1048 pxor xmm14, xmm2 1049 pxor xmm15, xmm3 1050 movdqa xmm8, xmm12 1051 psrld xmm12, 8 1052 pslld xmm8, 24 1053 pxor xmm12, xmm8 1054 movdqa xmm8, xmm13 1055 psrld xmm13, 8 1056 pslld xmm8, 24 1057 pxor xmm13, xmm8 1058 movdqa xmm8, xmm14 1059 psrld xmm14, 8 1060 pslld xmm8, 24 1061 pxor xmm14, xmm8 1062 movdqa xmm8, xmm15 1063 psrld xmm15, 8 1064 pslld xmm8, 24 1065 pxor xmm15, xmm8 1066 movdqa xmm8, xmmword ptr [rsp+0x100] 1067 paddd xmm8, xmm12 1068 paddd xmm9, xmm13 1069 paddd xmm10, xmm14 1070 paddd xmm11, xmm15 1071 pxor xmm4, xmm8 1072 pxor xmm5, xmm9 1073 pxor xmm6, xmm10 1074 pxor xmm7, xmm11 1075 movdqa xmmword ptr [rsp+0x100], xmm8 1076 movdqa xmm8, xmm4 1077 psrld xmm8, 7 1078 pslld xmm4, 25 1079 por xmm4, xmm8 1080 movdqa xmm8, xmm5 1081 psrld xmm8, 7 1082 pslld xmm5, 25 1083 por xmm5, xmm8 1084 movdqa xmm8, xmm6 1085 psrld xmm8, 7 1086 pslld xmm6, 25 1087 por xmm6, xmm8 1088 movdqa xmm8, xmm7 1089 psrld xmm8, 7 1090 pslld xmm7, 25 1091 por xmm7, xmm8 1092 paddd xmm0, xmmword ptr [rsp+0x70] 1093 paddd xmm1, xmmword ptr [rsp+0x50] 1094 paddd xmm2, xmmword ptr [rsp] 1095 paddd xmm3, xmmword ptr [rsp+0x60] 1096 paddd xmm0, xmm5 1097 paddd xmm1, xmm6 1098 paddd xmm2, xmm7 1099 paddd xmm3, xmm4 1100 pxor xmm15, xmm0 1101 pxor xmm12, xmm1 1102 pxor xmm13, xmm2 1103 pxor xmm14, xmm3 1104 pshuflw xmm15, xmm15, 0xB1 1105 pshufhw xmm15, xmm15, 0xB1 1106 pshuflw xmm12, xmm12, 0xB1 1107 pshufhw xmm12, xmm12, 0xB1 1108 pshuflw xmm13, xmm13, 0xB1 1109 pshufhw xmm13, xmm13, 0xB1 1110 pshuflw xmm14, xmm14, 0xB1 1111 pshufhw xmm14, xmm14, 0xB1 1112 paddd xmm10, xmm15 1113 paddd xmm11, xmm12 1114 movdqa xmm8, xmmword ptr [rsp+0x100] 1115 paddd xmm8, xmm13 1116 paddd xmm9, xmm14 1117 pxor xmm5, xmm10 1118 pxor xmm6, xmm11 1119 pxor xmm7, xmm8 1120 pxor xmm4, xmm9 1121 movdqa xmmword ptr [rsp+0x100], xmm8 1122 movdqa xmm8, xmm5 1123 psrld xmm8, 12 1124 pslld xmm5, 20 
1125 por xmm5, xmm8 1126 movdqa xmm8, xmm6 1127 psrld xmm8, 12 1128 pslld xmm6, 20 1129 por xmm6, xmm8 1130 movdqa xmm8, xmm7 1131 psrld xmm8, 12 1132 pslld xmm7, 20 1133 por xmm7, xmm8 1134 movdqa xmm8, xmm4 1135 psrld xmm8, 12 1136 pslld xmm4, 20 1137 por xmm4, xmm8 1138 paddd xmm0, xmmword ptr [rsp+0x20] 1139 paddd xmm1, xmmword ptr [rsp+0x30] 1140 paddd xmm2, xmmword ptr [rsp+0x10] 1141 paddd xmm3, xmmword ptr [rsp+0x40] 1142 paddd xmm0, xmm5 1143 paddd xmm1, xmm6 1144 paddd xmm2, xmm7 1145 paddd xmm3, xmm4 1146 pxor xmm15, xmm0 1147 pxor xmm12, xmm1 1148 pxor xmm13, xmm2 1149 pxor xmm14, xmm3 1150 movdqa xmm8, xmm15 1151 psrld xmm15, 8 1152 pslld xmm8, 24 1153 pxor xmm15, xmm8 1154 movdqa xmm8, xmm12 1155 psrld xmm12, 8 1156 pslld xmm8, 24 1157 pxor xmm12, xmm8 1158 movdqa xmm8, xmm13 1159 psrld xmm13, 8 1160 pslld xmm8, 24 1161 pxor xmm13, xmm8 1162 movdqa xmm8, xmm14 1163 psrld xmm14, 8 1164 pslld xmm8, 24 1165 pxor xmm14, xmm8 1166 paddd xmm10, xmm15 1167 paddd xmm11, xmm12 1168 movdqa xmm8, xmmword ptr [rsp+0x100] 1169 paddd xmm8, xmm13 1170 paddd xmm9, xmm14 1171 pxor xmm5, xmm10 1172 pxor xmm6, xmm11 1173 pxor xmm7, xmm8 1174 pxor xmm4, xmm9 1175 movdqa xmmword ptr [rsp+0x100], xmm8 1176 movdqa xmm8, xmm5 1177 psrld xmm8, 7 1178 pslld xmm5, 25 1179 por xmm5, xmm8 1180 movdqa xmm8, xmm6 1181 psrld xmm8, 7 1182 pslld xmm6, 25 1183 por xmm6, xmm8 1184 movdqa xmm8, xmm7 1185 psrld xmm8, 7 1186 pslld xmm7, 25 1187 por xmm7, xmm8 1188 movdqa xmm8, xmm4 1189 psrld xmm8, 7 1190 pslld xmm4, 25 1191 por xmm4, xmm8 1192 paddd xmm0, xmmword ptr [rsp+0x90] 1193 paddd xmm1, xmmword ptr [rsp+0xB0] 1194 paddd xmm2, xmmword ptr [rsp+0x80] 1195 paddd xmm3, xmmword ptr [rsp+0xF0] 1196 paddd xmm0, xmm4 1197 paddd xmm1, xmm5 1198 paddd xmm2, xmm6 1199 paddd xmm3, xmm7 1200 pxor xmm12, xmm0 1201 pxor xmm13, xmm1 1202 pxor xmm14, xmm2 1203 pxor xmm15, xmm3 1204 pshuflw xmm12, xmm12, 0xB1 1205 pshufhw xmm12, xmm12, 0xB1 1206 pshuflw xmm13, xmm13, 0xB1 1207 pshufhw xmm13, xmm13, 
0xB1 1208 pshuflw xmm14, xmm14, 0xB1 1209 pshufhw xmm14, xmm14, 0xB1 1210 pshuflw xmm15, xmm15, 0xB1 1211 pshufhw xmm15, xmm15, 0xB1 1212 movdqa xmm8, xmmword ptr [rsp+0x100] 1213 paddd xmm8, xmm12 1214 paddd xmm9, xmm13 1215 paddd xmm10, xmm14 1216 paddd xmm11, xmm15 1217 pxor xmm4, xmm8 1218 pxor xmm5, xmm9 1219 pxor xmm6, xmm10 1220 pxor xmm7, xmm11 1221 movdqa xmmword ptr [rsp+0x100], xmm8 1222 movdqa xmm8, xmm4 1223 psrld xmm8, 12 1224 pslld xmm4, 20 1225 por xmm4, xmm8 1226 movdqa xmm8, xmm5 1227 psrld xmm8, 12 1228 pslld xmm5, 20 1229 por xmm5, xmm8 1230 movdqa xmm8, xmm6 1231 psrld xmm8, 12 1232 pslld xmm6, 20 1233 por xmm6, xmm8 1234 movdqa xmm8, xmm7 1235 psrld xmm8, 12 1236 pslld xmm7, 20 1237 por xmm7, xmm8 1238 paddd xmm0, xmmword ptr [rsp+0xE0] 1239 paddd xmm1, xmmword ptr [rsp+0x50] 1240 paddd xmm2, xmmword ptr [rsp+0xC0] 1241 paddd xmm3, xmmword ptr [rsp+0x10] 1242 paddd xmm0, xmm4 1243 paddd xmm1, xmm5 1244 paddd xmm2, xmm6 1245 paddd xmm3, xmm7 1246 pxor xmm12, xmm0 1247 pxor xmm13, xmm1 1248 pxor xmm14, xmm2 1249 pxor xmm15, xmm3 1250 movdqa xmm8, xmm12 1251 psrld xmm12, 8 1252 pslld xmm8, 24 1253 pxor xmm12, xmm8 1254 movdqa xmm8, xmm13 1255 psrld xmm13, 8 1256 pslld xmm8, 24 1257 pxor xmm13, xmm8 1258 movdqa xmm8, xmm14 1259 psrld xmm14, 8 1260 pslld xmm8, 24 1261 pxor xmm14, xmm8 1262 movdqa xmm8, xmm15 1263 psrld xmm15, 8 1264 pslld xmm8, 24 1265 pxor xmm15, xmm8 1266 movdqa xmm8, xmmword ptr [rsp+0x100] 1267 paddd xmm8, xmm12 1268 paddd xmm9, xmm13 1269 paddd xmm10, xmm14 1270 paddd xmm11, xmm15 1271 pxor xmm4, xmm8 1272 pxor xmm5, xmm9 1273 pxor xmm6, xmm10 1274 pxor xmm7, xmm11 1275 movdqa xmmword ptr [rsp+0x100], xmm8 1276 movdqa xmm8, xmm4 1277 psrld xmm8, 7 1278 pslld xmm4, 25 1279 por xmm4, xmm8 1280 movdqa xmm8, xmm5 1281 psrld xmm8, 7 1282 pslld xmm5, 25 1283 por xmm5, xmm8 1284 movdqa xmm8, xmm6 1285 psrld xmm8, 7 1286 pslld xmm6, 25 1287 por xmm6, xmm8 1288 movdqa xmm8, xmm7 1289 psrld xmm8, 7 1290 pslld xmm7, 25 1291 por xmm7, 
xmm8 1292 paddd xmm0, xmmword ptr [rsp+0xD0] 1293 paddd xmm1, xmmword ptr [rsp] 1294 paddd xmm2, xmmword ptr [rsp+0x20] 1295 paddd xmm3, xmmword ptr [rsp+0x40] 1296 paddd xmm0, xmm5 1297 paddd xmm1, xmm6 1298 paddd xmm2, xmm7 1299 paddd xmm3, xmm4 1300 pxor xmm15, xmm0 1301 pxor xmm12, xmm1 1302 pxor xmm13, xmm2 1303 pxor xmm14, xmm3 1304 pshuflw xmm15, xmm15, 0xB1 1305 pshufhw xmm15, xmm15, 0xB1 1306 pshuflw xmm12, xmm12, 0xB1 1307 pshufhw xmm12, xmm12, 0xB1 1308 pshuflw xmm13, xmm13, 0xB1 1309 pshufhw xmm13, xmm13, 0xB1 1310 pshuflw xmm14, xmm14, 0xB1 1311 pshufhw xmm14, xmm14, 0xB1 1312 paddd xmm10, xmm15 1313 paddd xmm11, xmm12 1314 movdqa xmm8, xmmword ptr [rsp+0x100] 1315 paddd xmm8, xmm13 1316 paddd xmm9, xmm14 1317 pxor xmm5, xmm10 1318 pxor xmm6, xmm11 1319 pxor xmm7, xmm8 1320 pxor xmm4, xmm9 1321 movdqa xmmword ptr [rsp+0x100], xmm8 1322 movdqa xmm8, xmm5 1323 psrld xmm8, 12 1324 pslld xmm5, 20 1325 por xmm5, xmm8 1326 movdqa xmm8, xmm6 1327 psrld xmm8, 12 1328 pslld xmm6, 20 1329 por xmm6, xmm8 1330 movdqa xmm8, xmm7 1331 psrld xmm8, 12 1332 pslld xmm7, 20 1333 por xmm7, xmm8 1334 movdqa xmm8, xmm4 1335 psrld xmm8, 12 1336 pslld xmm4, 20 1337 por xmm4, xmm8 1338 paddd xmm0, xmmword ptr [rsp+0x30] 1339 paddd xmm1, xmmword ptr [rsp+0xA0] 1340 paddd xmm2, xmmword ptr [rsp+0x60] 1341 paddd xmm3, xmmword ptr [rsp+0x70] 1342 paddd xmm0, xmm5 1343 paddd xmm1, xmm6 1344 paddd xmm2, xmm7 1345 paddd xmm3, xmm4 1346 pxor xmm15, xmm0 1347 pxor xmm12, xmm1 1348 pxor xmm13, xmm2 1349 pxor xmm14, xmm3 1350 movdqa xmm8, xmm15 1351 psrld xmm15, 8 1352 pslld xmm8, 24 1353 pxor xmm15, xmm8 1354 movdqa xmm8, xmm12 1355 psrld xmm12, 8 1356 pslld xmm8, 24 1357 pxor xmm12, xmm8 1358 movdqa xmm8, xmm13 1359 psrld xmm13, 8 1360 pslld xmm8, 24 1361 pxor xmm13, xmm8 1362 movdqa xmm8, xmm14 1363 psrld xmm14, 8 1364 pslld xmm8, 24 1365 pxor xmm14, xmm8 1366 paddd xmm10, xmm15 1367 paddd xmm11, xmm12 1368 movdqa xmm8, xmmword ptr [rsp+0x100] 1369 paddd xmm8, xmm13 1370 paddd xmm9, 
xmm14 1371 pxor xmm5, xmm10 1372 pxor xmm6, xmm11 1373 pxor xmm7, xmm8 1374 pxor xmm4, xmm9 1375 movdqa xmmword ptr [rsp+0x100], xmm8 1376 movdqa xmm8, xmm5 1377 psrld xmm8, 7 1378 pslld xmm5, 25 1379 por xmm5, xmm8 1380 movdqa xmm8, xmm6 1381 psrld xmm8, 7 1382 pslld xmm6, 25 1383 por xmm6, xmm8 1384 movdqa xmm8, xmm7 1385 psrld xmm8, 7 1386 pslld xmm7, 25 1387 por xmm7, xmm8 1388 movdqa xmm8, xmm4 1389 psrld xmm8, 7 1390 pslld xmm4, 25 1391 por xmm4, xmm8 1392 paddd xmm0, xmmword ptr [rsp+0xB0] 1393 paddd xmm1, xmmword ptr [rsp+0x50] 1394 paddd xmm2, xmmword ptr [rsp+0x10] 1395 paddd xmm3, xmmword ptr [rsp+0x80] 1396 paddd xmm0, xmm4 1397 paddd xmm1, xmm5 1398 paddd xmm2, xmm6 1399 paddd xmm3, xmm7 1400 pxor xmm12, xmm0 1401 pxor xmm13, xmm1 1402 pxor xmm14, xmm2 1403 pxor xmm15, xmm3 1404 pshuflw xmm12, xmm12, 0xB1 1405 pshufhw xmm12, xmm12, 0xB1 1406 pshuflw xmm13, xmm13, 0xB1 1407 pshufhw xmm13, xmm13, 0xB1 1408 pshuflw xmm14, xmm14, 0xB1 1409 pshufhw xmm14, xmm14, 0xB1 1410 pshuflw xmm15, xmm15, 0xB1 1411 pshufhw xmm15, xmm15, 0xB1 1412 movdqa xmm8, xmmword ptr [rsp+0x100] 1413 paddd xmm8, xmm12 1414 paddd xmm9, xmm13 1415 paddd xmm10, xmm14 1416 paddd xmm11, xmm15 1417 pxor xmm4, xmm8 1418 pxor xmm5, xmm9 1419 pxor xmm6, xmm10 1420 pxor xmm7, xmm11 1421 movdqa xmmword ptr [rsp+0x100], xmm8 1422 movdqa xmm8, xmm4 1423 psrld xmm8, 12 1424 pslld xmm4, 20 1425 por xmm4, xmm8 1426 movdqa xmm8, xmm5 1427 psrld xmm8, 12 1428 pslld xmm5, 20 1429 por xmm5, xmm8 1430 movdqa xmm8, xmm6 1431 psrld xmm8, 12 1432 pslld xmm6, 20 1433 por xmm6, xmm8 1434 movdqa xmm8, xmm7 1435 psrld xmm8, 12 1436 pslld xmm7, 20 1437 por xmm7, xmm8 1438 paddd xmm0, xmmword ptr [rsp+0xF0] 1439 paddd xmm1, xmmword ptr [rsp] 1440 paddd xmm2, xmmword ptr [rsp+0x90] 1441 paddd xmm3, xmmword ptr [rsp+0x60] 1442 paddd xmm0, xmm4 1443 paddd xmm1, xmm5 1444 paddd xmm2, xmm6 1445 paddd xmm3, xmm7 1446 pxor xmm12, xmm0 1447 pxor xmm13, xmm1 1448 pxor xmm14, xmm2 1449 pxor xmm15, xmm3 1450 movdqa xmm8, 
xmm12 1451 psrld xmm12, 8 1452 pslld xmm8, 24 1453 pxor xmm12, xmm8 1454 movdqa xmm8, xmm13 1455 psrld xmm13, 8 1456 pslld xmm8, 24 1457 pxor xmm13, xmm8 1458 movdqa xmm8, xmm14 1459 psrld xmm14, 8 1460 pslld xmm8, 24 1461 pxor xmm14, xmm8 1462 movdqa xmm8, xmm15 1463 psrld xmm15, 8 1464 pslld xmm8, 24 1465 pxor xmm15, xmm8 1466 movdqa xmm8, xmmword ptr [rsp+0x100] 1467 paddd xmm8, xmm12 1468 paddd xmm9, xmm13 1469 paddd xmm10, xmm14 1470 paddd xmm11, xmm15 1471 pxor xmm4, xmm8 1472 pxor xmm5, xmm9 1473 pxor xmm6, xmm10 1474 pxor xmm7, xmm11 1475 movdqa xmmword ptr [rsp+0x100], xmm8 1476 movdqa xmm8, xmm4 1477 psrld xmm8, 7 1478 pslld xmm4, 25 1479 por xmm4, xmm8 1480 movdqa xmm8, xmm5 1481 psrld xmm8, 7 1482 pslld xmm5, 25 1483 por xmm5, xmm8 1484 movdqa xmm8, xmm6 1485 psrld xmm8, 7 1486 pslld xmm6, 25 1487 por xmm6, xmm8 1488 movdqa xmm8, xmm7 1489 psrld xmm8, 7 1490 pslld xmm7, 25 1491 por xmm7, xmm8 1492 paddd xmm0, xmmword ptr [rsp+0xE0] 1493 paddd xmm1, xmmword ptr [rsp+0x20] 1494 paddd xmm2, xmmword ptr [rsp+0x30] 1495 paddd xmm3, xmmword ptr [rsp+0x70] 1496 paddd xmm0, xmm5 1497 paddd xmm1, xmm6 1498 paddd xmm2, xmm7 1499 paddd xmm3, xmm4 1500 pxor xmm15, xmm0 1501 pxor xmm12, xmm1 1502 pxor xmm13, xmm2 1503 pxor xmm14, xmm3 1504 pshuflw xmm15, xmm15, 0xB1 1505 pshufhw xmm15, xmm15, 0xB1 1506 pshuflw xmm12, xmm12, 0xB1 1507 pshufhw xmm12, xmm12, 0xB1 1508 pshuflw xmm13, xmm13, 0xB1 1509 pshufhw xmm13, xmm13, 0xB1 1510 pshuflw xmm14, xmm14, 0xB1 1511 pshufhw xmm14, xmm14, 0xB1 1512 paddd xmm10, xmm15 1513 paddd xmm11, xmm12 1514 movdqa xmm8, xmmword ptr [rsp+0x100] 1515 paddd xmm8, xmm13 1516 paddd xmm9, xmm14 1517 pxor xmm5, xmm10 1518 pxor xmm6, xmm11 1519 pxor xmm7, xmm8 1520 pxor xmm4, xmm9 1521 movdqa xmmword ptr [rsp+0x100], xmm8 1522 movdqa xmm8, xmm5 1523 psrld xmm8, 12 1524 pslld xmm5, 20 1525 por xmm5, xmm8 1526 movdqa xmm8, xmm6 1527 psrld xmm8, 12 1528 pslld xmm6, 20 1529 por xmm6, xmm8 1530 movdqa xmm8, xmm7 1531 psrld xmm8, 12 1532 pslld xmm7, 
20 1533 por xmm7, xmm8 1534 movdqa xmm8, xmm4 1535 psrld xmm8, 12 1536 pslld xmm4, 20 1537 por xmm4, xmm8 1538 paddd xmm0, xmmword ptr [rsp+0xA0] 1539 paddd xmm1, xmmword ptr [rsp+0xC0] 1540 paddd xmm2, xmmword ptr [rsp+0x40] 1541 paddd xmm3, xmmword ptr [rsp+0xD0] 1542 paddd xmm0, xmm5 1543 paddd xmm1, xmm6 1544 paddd xmm2, xmm7 1545 paddd xmm3, xmm4 1546 pxor xmm15, xmm0 1547 pxor xmm12, xmm1 1548 pxor xmm13, xmm2 1549 pxor xmm14, xmm3 1550 movdqa xmm8, xmm15 1551 psrld xmm15, 8 1552 pslld xmm8, 24 1553 pxor xmm15, xmm8 1554 movdqa xmm8, xmm12 1555 psrld xmm12, 8 1556 pslld xmm8, 24 1557 pxor xmm12, xmm8 1558 movdqa xmm8, xmm13 1559 psrld xmm13, 8 1560 pslld xmm8, 24 1561 pxor xmm13, xmm8 1562 movdqa xmm8, xmm14 1563 psrld xmm14, 8 1564 pslld xmm8, 24 1565 pxor xmm14, xmm8 1566 paddd xmm10, xmm15 1567 paddd xmm11, xmm12 1568 movdqa xmm8, xmmword ptr [rsp+0x100] 1569 paddd xmm8, xmm13 1570 paddd xmm9, xmm14 1571 pxor xmm5, xmm10 1572 pxor xmm6, xmm11 1573 pxor xmm7, xmm8 1574 pxor xmm4, xmm9 1575 pxor xmm0, xmm8 1576 pxor xmm1, xmm9 1577 pxor xmm2, xmm10 1578 pxor xmm3, xmm11 1579 movdqa xmm8, xmm5 1580 psrld xmm8, 7 1581 pslld xmm5, 25 1582 por xmm5, xmm8 1583 movdqa xmm8, xmm6 1584 psrld xmm8, 7 1585 pslld xmm6, 25 1586 por xmm6, xmm8 1587 movdqa xmm8, xmm7 1588 psrld xmm8, 7 1589 pslld xmm7, 25 1590 por xmm7, xmm8 1591 movdqa xmm8, xmm4 1592 psrld xmm8, 7 1593 pslld xmm4, 25 1594 por xmm4, xmm8 1595 pxor xmm4, xmm12 1596 pxor xmm5, xmm13 1597 pxor xmm6, xmm14 1598 pxor xmm7, xmm15 1599 mov eax, r13d 1600 jne 9b 1601 movdqa xmm9, xmm0 1602 punpckldq xmm0, xmm1 1603 punpckhdq xmm9, xmm1 1604 movdqa xmm11, xmm2 1605 punpckldq xmm2, xmm3 1606 punpckhdq xmm11, xmm3 1607 movdqa xmm1, xmm0 1608 punpcklqdq xmm0, xmm2 1609 punpckhqdq xmm1, xmm2 1610 movdqa xmm3, xmm9 1611 punpcklqdq xmm9, xmm11 1612 punpckhqdq xmm3, xmm11 1613 movdqu xmmword ptr [rbx], xmm0 1614 movdqu xmmword ptr [rbx+0x20], xmm1 1615 movdqu xmmword ptr [rbx+0x40], xmm9 1616 movdqu xmmword ptr 
[rbx+0x60], xmm3 1617 movdqa xmm9, xmm4 1618 punpckldq xmm4, xmm5 1619 punpckhdq xmm9, xmm5 1620 movdqa xmm11, xmm6 1621 punpckldq xmm6, xmm7 1622 punpckhdq xmm11, xmm7 1623 movdqa xmm5, xmm4 1624 punpcklqdq xmm4, xmm6 1625 punpckhqdq xmm5, xmm6 1626 movdqa xmm7, xmm9 1627 punpcklqdq xmm9, xmm11 1628 punpckhqdq xmm7, xmm11 1629 movdqu xmmword ptr [rbx+0x10], xmm4 1630 movdqu xmmword ptr [rbx+0x30], xmm5 1631 movdqu xmmword ptr [rbx+0x50], xmm9 1632 movdqu xmmword ptr [rbx+0x70], xmm7 1633 movdqa xmm1, xmmword ptr [rsp+0x110] 1634 movdqa xmm0, xmm1 1635 paddd xmm1, xmmword ptr [rsp+0x150] 1636 movdqa xmmword ptr [rsp+0x110], xmm1 1637 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] 1638 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] 1639 pcmpgtd xmm0, xmm1 1640 movdqa xmm1, xmmword ptr [rsp+0x120] 1641 psubd xmm1, xmm0 1642 movdqa xmmword ptr [rsp+0x120], xmm1 1643 add rbx, 128 1644 add rdi, 32 1645 sub rsi, 4 1646 cmp rsi, 4 1647 jnc 2b 1648 test rsi, rsi 1649 jnz 3f 16504: 1651 mov rsp, rbp 1652 pop rbp 1653 pop rbx 1654 pop r12 1655 pop r13 1656 pop r14 1657 pop r15 1658 RET 1659.p2align 5 16603: 1661 test esi, 0x2 1662 je 3f 1663 movups xmm0, xmmword ptr [rcx] 1664 movups xmm1, xmmword ptr [rcx+0x10] 1665 movaps xmm8, xmm0 1666 movaps xmm9, xmm1 1667 movd xmm13, dword ptr [rsp+0x110] 1668 movd xmm14, dword ptr [rsp+0x120] 1669 punpckldq xmm13, xmm14 1670 movaps xmmword ptr [rsp], xmm13 1671 movd xmm14, dword ptr [rsp+0x114] 1672 movd xmm13, dword ptr [rsp+0x124] 1673 punpckldq xmm14, xmm13 1674 movaps xmmword ptr [rsp+0x10], xmm14 1675 mov r8, qword ptr [rdi] 1676 mov r9, qword ptr [rdi+0x8] 1677 movzx eax, byte ptr [rbp+0x40] 1678 or eax, r13d 1679 xor edx, edx 16802: 1681 mov r14d, eax 1682 or eax, r12d 1683 add rdx, 64 1684 cmp rdx, r15 1685 cmovne eax, r14d 1686 movaps xmm2, xmmword ptr [BLAKE3_IV+rip] 1687 movaps xmm10, xmm2 1688 movups xmm4, xmmword ptr [r8+rdx-0x40] 1689 movups xmm5, xmmword ptr [r8+rdx-0x30] 1690 movaps xmm3, xmm4 1691 shufps xmm4, xmm5, 136 1692 
shufps xmm3, xmm5, 221 1693 movaps xmm5, xmm3 1694 movups xmm6, xmmword ptr [r8+rdx-0x20] 1695 movups xmm7, xmmword ptr [r8+rdx-0x10] 1696 movaps xmm3, xmm6 1697 shufps xmm6, xmm7, 136 1698 pshufd xmm6, xmm6, 0x93 1699 shufps xmm3, xmm7, 221 1700 pshufd xmm7, xmm3, 0x93 1701 movups xmm12, xmmword ptr [r9+rdx-0x40] 1702 movups xmm13, xmmword ptr [r9+rdx-0x30] 1703 movaps xmm11, xmm12 1704 shufps xmm12, xmm13, 136 1705 shufps xmm11, xmm13, 221 1706 movaps xmm13, xmm11 1707 movups xmm14, xmmword ptr [r9+rdx-0x20] 1708 movups xmm15, xmmword ptr [r9+rdx-0x10] 1709 movaps xmm11, xmm14 1710 shufps xmm14, xmm15, 136 1711 pshufd xmm14, xmm14, 0x93 1712 shufps xmm11, xmm15, 221 1713 pshufd xmm15, xmm11, 0x93 1714 shl rax, 0x20 1715 or rax, 0x40 1716 movq xmm3, rax 1717 movdqa xmmword ptr [rsp+0x20], xmm3 1718 movaps xmm3, xmmword ptr [rsp] 1719 movaps xmm11, xmmword ptr [rsp+0x10] 1720 punpcklqdq xmm3, xmmword ptr [rsp+0x20] 1721 punpcklqdq xmm11, xmmword ptr [rsp+0x20] 1722 mov al, 7 17239: 1724 paddd xmm0, xmm4 1725 paddd xmm8, xmm12 1726 movaps xmmword ptr [rsp+0x20], xmm4 1727 movaps xmmword ptr [rsp+0x30], xmm12 1728 paddd xmm0, xmm1 1729 paddd xmm8, xmm9 1730 pxor xmm3, xmm0 1731 pxor xmm11, xmm8 1732 pshuflw xmm3, xmm3, 0xB1 1733 pshufhw xmm3, xmm3, 0xB1 1734 pshuflw xmm11, xmm11, 0xB1 1735 pshufhw xmm11, xmm11, 0xB1 1736 paddd xmm2, xmm3 1737 paddd xmm10, xmm11 1738 pxor xmm1, xmm2 1739 pxor xmm9, xmm10 1740 movdqa xmm4, xmm1 1741 pslld xmm1, 20 1742 psrld xmm4, 12 1743 por xmm1, xmm4 1744 movdqa xmm4, xmm9 1745 pslld xmm9, 20 1746 psrld xmm4, 12 1747 por xmm9, xmm4 1748 paddd xmm0, xmm5 1749 paddd xmm8, xmm13 1750 movaps xmmword ptr [rsp+0x40], xmm5 1751 movaps xmmword ptr [rsp+0x50], xmm13 1752 paddd xmm0, xmm1 1753 paddd xmm8, xmm9 1754 pxor xmm3, xmm0 1755 pxor xmm11, xmm8 1756 movdqa xmm13, xmm3 1757 psrld xmm3, 8 1758 pslld xmm13, 24 1759 pxor xmm3, xmm13 1760 movdqa xmm13, xmm11 1761 psrld xmm11, 8 1762 pslld xmm13, 24 1763 pxor xmm11, xmm13 1764 paddd xmm2, 
xmm3 1765 paddd xmm10, xmm11 1766 pxor xmm1, xmm2 1767 pxor xmm9, xmm10 1768 movdqa xmm4, xmm1 1769 pslld xmm1, 25 1770 psrld xmm4, 7 1771 por xmm1, xmm4 1772 movdqa xmm4, xmm9 1773 pslld xmm9, 25 1774 psrld xmm4, 7 1775 por xmm9, xmm4 1776 pshufd xmm0, xmm0, 0x93 1777 pshufd xmm8, xmm8, 0x93 1778 pshufd xmm3, xmm3, 0x4E 1779 pshufd xmm11, xmm11, 0x4E 1780 pshufd xmm2, xmm2, 0x39 1781 pshufd xmm10, xmm10, 0x39 1782 paddd xmm0, xmm6 1783 paddd xmm8, xmm14 1784 paddd xmm0, xmm1 1785 paddd xmm8, xmm9 1786 pxor xmm3, xmm0 1787 pxor xmm11, xmm8 1788 pshuflw xmm3, xmm3, 0xB1 1789 pshufhw xmm3, xmm3, 0xB1 1790 pshuflw xmm11, xmm11, 0xB1 1791 pshufhw xmm11, xmm11, 0xB1 1792 paddd xmm2, xmm3 1793 paddd xmm10, xmm11 1794 pxor xmm1, xmm2 1795 pxor xmm9, xmm10 1796 movdqa xmm4, xmm1 1797 pslld xmm1, 20 1798 psrld xmm4, 12 1799 por xmm1, xmm4 1800 movdqa xmm4, xmm9 1801 pslld xmm9, 20 1802 psrld xmm4, 12 1803 por xmm9, xmm4 1804 paddd xmm0, xmm7 1805 paddd xmm8, xmm15 1806 paddd xmm0, xmm1 1807 paddd xmm8, xmm9 1808 pxor xmm3, xmm0 1809 pxor xmm11, xmm8 1810 movdqa xmm13, xmm3 1811 psrld xmm3, 8 1812 pslld xmm13, 24 1813 pxor xmm3, xmm13 1814 movdqa xmm13, xmm11 1815 psrld xmm11, 8 1816 pslld xmm13, 24 1817 pxor xmm11, xmm13 1818 paddd xmm2, xmm3 1819 paddd xmm10, xmm11 1820 pxor xmm1, xmm2 1821 pxor xmm9, xmm10 1822 movdqa xmm4, xmm1 1823 pslld xmm1, 25 1824 psrld xmm4, 7 1825 por xmm1, xmm4 1826 movdqa xmm4, xmm9 1827 pslld xmm9, 25 1828 psrld xmm4, 7 1829 por xmm9, xmm4 1830 pshufd xmm0, xmm0, 0x39 1831 pshufd xmm8, xmm8, 0x39 1832 pshufd xmm3, xmm3, 0x4E 1833 pshufd xmm11, xmm11, 0x4E 1834 pshufd xmm2, xmm2, 0x93 1835 pshufd xmm10, xmm10, 0x93 1836 dec al 1837 je 9f 1838 movdqa xmm12, xmmword ptr [rsp+0x20] 1839 movdqa xmm5, xmmword ptr [rsp+0x40] 1840 pshufd xmm13, xmm12, 0x0F 1841 shufps xmm12, xmm5, 214 1842 pshufd xmm4, xmm12, 0x39 1843 movdqa xmm12, xmm6 1844 shufps xmm12, xmm7, 250 1845 pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip] 1846 pand xmm12, xmmword ptr 
[PBLENDW_0xCC_MASK+rip] 1847 por xmm13, xmm12 1848 movdqa xmmword ptr [rsp+0x20], xmm13 1849 movdqa xmm12, xmm7 1850 punpcklqdq xmm12, xmm5 1851 movdqa xmm13, xmm6 1852 pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] 1853 pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip] 1854 por xmm12, xmm13 1855 pshufd xmm12, xmm12, 0x78 1856 punpckhdq xmm5, xmm7 1857 punpckldq xmm6, xmm5 1858 pshufd xmm7, xmm6, 0x1E 1859 movdqa xmmword ptr [rsp+0x40], xmm12 1860 movdqa xmm5, xmmword ptr [rsp+0x30] 1861 movdqa xmm13, xmmword ptr [rsp+0x50] 1862 pshufd xmm6, xmm5, 0x0F 1863 shufps xmm5, xmm13, 214 1864 pshufd xmm12, xmm5, 0x39 1865 movdqa xmm5, xmm14 1866 shufps xmm5, xmm15, 250 1867 pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] 1868 pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] 1869 por xmm6, xmm5 1870 movdqa xmm5, xmm15 1871 punpcklqdq xmm5, xmm13 1872 movdqa xmmword ptr [rsp+0x30], xmm2 1873 movdqa xmm2, xmm14 1874 pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] 1875 pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] 1876 por xmm5, xmm2 1877 movdqa xmm2, xmmword ptr [rsp+0x30] 1878 pshufd xmm5, xmm5, 0x78 1879 punpckhdq xmm13, xmm15 1880 punpckldq xmm14, xmm13 1881 pshufd xmm15, xmm14, 0x1E 1882 movdqa xmm13, xmm6 1883 movdqa xmm14, xmm5 1884 movdqa xmm5, xmmword ptr [rsp+0x20] 1885 movdqa xmm6, xmmword ptr [rsp+0x40] 1886 jmp 9b 18879: 1888 pxor xmm0, xmm2 1889 pxor xmm1, xmm3 1890 pxor xmm8, xmm10 1891 pxor xmm9, xmm11 1892 mov eax, r13d 1893 cmp rdx, r15 1894 jne 2b 1895 movups xmmword ptr [rbx], xmm0 1896 movups xmmword ptr [rbx+0x10], xmm1 1897 movups xmmword ptr [rbx+0x20], xmm8 1898 movups xmmword ptr [rbx+0x30], xmm9 1899 mov eax, dword ptr [rsp+0x130] 1900 neg eax 1901 mov r10d, dword ptr [rsp+0x110+8*rax] 1902 mov r11d, dword ptr [rsp+0x120+8*rax] 1903 mov dword ptr [rsp+0x110], r10d 1904 mov dword ptr [rsp+0x120], r11d 1905 add rdi, 16 1906 add rbx, 64 1907 sub rsi, 2 19083: 1909 test esi, 0x1 1910 je 4b 1911 movups xmm0, xmmword ptr [rcx] 1912 movups xmm1, xmmword ptr 
[rcx+0x10] 1913 movd xmm13, dword ptr [rsp+0x110] 1914 movd xmm14, dword ptr [rsp+0x120] 1915 punpckldq xmm13, xmm14 1916 mov r8, qword ptr [rdi] 1917 movzx eax, byte ptr [rbp+0x40] 1918 or eax, r13d 1919 xor edx, edx 19202: 1921 mov r14d, eax 1922 or eax, r12d 1923 add rdx, 64 1924 cmp rdx, r15 1925 cmovne eax, r14d 1926 movaps xmm2, xmmword ptr [BLAKE3_IV+rip] 1927 shl rax, 32 1928 or rax, 64 1929 movq xmm12, rax 1930 movdqa xmm3, xmm13 1931 punpcklqdq xmm3, xmm12 1932 movups xmm4, xmmword ptr [r8+rdx-0x40] 1933 movups xmm5, xmmword ptr [r8+rdx-0x30] 1934 movaps xmm8, xmm4 1935 shufps xmm4, xmm5, 136 1936 shufps xmm8, xmm5, 221 1937 movaps xmm5, xmm8 1938 movups xmm6, xmmword ptr [r8+rdx-0x20] 1939 movups xmm7, xmmword ptr [r8+rdx-0x10] 1940 movaps xmm8, xmm6 1941 shufps xmm6, xmm7, 136 1942 pshufd xmm6, xmm6, 0x93 1943 shufps xmm8, xmm7, 221 1944 pshufd xmm7, xmm8, 0x93 1945 mov al, 7 19469: 1947 paddd xmm0, xmm4 1948 paddd xmm0, xmm1 1949 pxor xmm3, xmm0 1950 pshuflw xmm3, xmm3, 0xB1 1951 pshufhw xmm3, xmm3, 0xB1 1952 paddd xmm2, xmm3 1953 pxor xmm1, xmm2 1954 movdqa xmm11, xmm1 1955 pslld xmm1, 20 1956 psrld xmm11, 12 1957 por xmm1, xmm11 1958 paddd xmm0, xmm5 1959 paddd xmm0, xmm1 1960 pxor xmm3, xmm0 1961 movdqa xmm14, xmm3 1962 psrld xmm3, 8 1963 pslld xmm14, 24 1964 pxor xmm3, xmm14 1965 paddd xmm2, xmm3 1966 pxor xmm1, xmm2 1967 movdqa xmm11, xmm1 1968 pslld xmm1, 25 1969 psrld xmm11, 7 1970 por xmm1, xmm11 1971 pshufd xmm0, xmm0, 0x93 1972 pshufd xmm3, xmm3, 0x4E 1973 pshufd xmm2, xmm2, 0x39 1974 paddd xmm0, xmm6 1975 paddd xmm0, xmm1 1976 pxor xmm3, xmm0 1977 pshuflw xmm3, xmm3, 0xB1 1978 pshufhw xmm3, xmm3, 0xB1 1979 paddd xmm2, xmm3 1980 pxor xmm1, xmm2 1981 movdqa xmm11, xmm1 1982 pslld xmm1, 20 1983 psrld xmm11, 12 1984 por xmm1, xmm11 1985 paddd xmm0, xmm7 1986 paddd xmm0, xmm1 1987 pxor xmm3, xmm0 1988 movdqa xmm14, xmm3 1989 psrld xmm3, 8 1990 pslld xmm14, 24 1991 pxor xmm3, xmm14 1992 paddd xmm2, xmm3 1993 pxor xmm1, xmm2 1994 movdqa xmm11, xmm1 
/*
 * Tail of zfs_blake3_hash_many_sse2: end of the round loop of its final
 * path, followed by the write of the 32-byte result to [rbx].  The
 * instructions are unchanged; they are only laid out one per line.
 */
        pslld      xmm1, 25
        psrld      xmm11, 7
        por        xmm1, xmm11
        pshufd     xmm0, xmm0, 0x39
        pshufd     xmm3, xmm3, 0x4E
        pshufd     xmm2, xmm2, 0x93
        dec        al
        jz         9f
        /* Rearrange the message registers xmm4-xmm7 for the next round. */
        movdqa     xmm8, xmm4
        shufps     xmm8, xmm5, 214
        pshufd     xmm9, xmm4, 0x0F
        pshufd     xmm4, xmm8, 0x39
        movdqa     xmm8, xmm6
        shufps     xmm8, xmm7, 250
        pand       xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand       xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por        xmm9, xmm8
        movdqa     xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa     xmm10, xmm6
        pand       xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand       xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por        xmm8, xmm10
        pshufd     xmm8, xmm8, 0x78
        punpckhdq  xmm5, xmm7
        punpckldq  xmm6, xmm5
        pshufd     xmm7, xmm6, 0x1E
        movdqa     xmm5, xmm9
        movdqa     xmm6, xmm8
        jmp        9b
9:
        pxor       xmm0, xmm2
        pxor       xmm1, xmm3
        mov        eax, r13d
        cmp        rdx, r15
        jne        2b
        movups     xmmword ptr [rbx], xmm0
        movups     xmmword ptr [rbx+0x10], xmm1
        jmp        4b
SET_SIZE(zfs_blake3_hash_many_sse2)

/*
 * zfs_blake3_compress_in_place_sse2
 *
 * Runs seven rounds over a 4x4 state of 32-bit words and writes the
 * 32-byte result back over the input at [rdi].
 *
 * Inputs, as read by the code:
 *   rdi - 32 bytes, loaded as state rows 0-1 and overwritten on return
 *   rsi - 64-byte message block (unaligned loads)
 *   dl  - byte placed in state word 14
 *   rcx - 64-bit value placed in state words 12-13
 *   r8b - byte placed in state word 15
 * NOTE(review): these presumably are cv, block, block_len, counter and
 * flags of the C prototype - confirm against the declaring header.
 *
 * Clobbers: rax, rdx, xmm0-xmm11, xmm14, flags.
 */
ENTRY_ALIGN(zfs_blake3_compress_in_place_sse2, 64)
        ENDBR
        movups     xmm0, xmmword ptr [rdi]
        movups     xmm1, xmmword ptr [rdi+0x10]
        movaps     xmm2, xmmword ptr [BLAKE3_IV+rip]
        /*
         * Zero-extend the two byte-sized arguments before packing them,
         * the same way zfs_blake3_compress_xof_sse2 does.  Shifting and
         * adding the full 64-bit r8/rdx would fold whatever the caller
         * left in bits 8..63 into state words 14 and 15.
         */
        movzx      eax, r8b
        movzx      edx, dl
        shl        rax, 32
        add        rdx, rax
        /* xmm3 = { rcx[31:0], rcx[63:32], dl, r8b } */
        movq       xmm3, rcx
        movq       xmm4, rdx
        punpcklqdq xmm3, xmm4
        /*
         * Load the message.  shufps 136 keeps the even dwords of a
         * 32-byte half, shufps 221 the odd ones; the second half is
         * additionally lane-rotated with pshufd 0x93.
         */
        movups     xmm4, xmmword ptr [rsi]
        movups     xmm5, xmmword ptr [rsi+0x10]
        movaps     xmm8, xmm4
        shufps     xmm4, xmm5, 136
        shufps     xmm8, xmm5, 221
        movaps     xmm5, xmm8
        movups     xmm6, xmmword ptr [rsi+0x20]
        movups     xmm7, xmmword ptr [rsi+0x30]
        movaps     xmm8, xmm6
        shufps     xmm6, xmm7, 136
        pshufd     xmm6, xmm6, 0x93
        shufps     xmm8, xmm7, 221
        pshufd     xmm7, xmm8, 0x93
        mov        al, 7                /* round counter */
9:
        /* First half of the round: rows as loaded. */
        paddd      xmm0, xmm4
        paddd      xmm0, xmm1
        pxor       xmm3, xmm0
        pshuflw    xmm3, xmm3, 0xB1     /* rotate each dword by 16 */
        pshufhw    xmm3, xmm3, 0xB1
        paddd      xmm2, xmm3
        pxor       xmm1, xmm2
        movdqa     xmm11, xmm1          /* rotate right by 12 */
        pslld      xmm1, 20
        psrld      xmm11, 12
        por        xmm1, xmm11
        paddd      xmm0, xmm5
        paddd      xmm0, xmm1
        pxor       xmm3, xmm0
        movdqa     xmm14, xmm3          /* rotate right by 8 */
        psrld      xmm3, 8
        pslld      xmm14, 24
        pxor       xmm3, xmm14
        paddd      xmm2, xmm3
        pxor       xmm1, xmm2
        movdqa     xmm11, xmm1          /* rotate right by 7 */
        pslld      xmm1, 25
        psrld      xmm11, 7
        por        xmm1, xmm11
        /* Lane-rotate rows 0, 3 and 2 for the second half. */
        pshufd     xmm0, xmm0, 0x93
        pshufd     xmm3, xmm3, 0x4E
        pshufd     xmm2, xmm2, 0x39
        paddd      xmm0, xmm6
        paddd      xmm0, xmm1
        pxor       xmm3, xmm0
        pshuflw    xmm3, xmm3, 0xB1
        pshufhw    xmm3, xmm3, 0xB1
        paddd      xmm2, xmm3
        pxor       xmm1, xmm2
        movdqa     xmm11, xmm1
        pslld      xmm1, 20
        psrld      xmm11, 12
        por        xmm1, xmm11
        paddd      xmm0, xmm7
        paddd      xmm0, xmm1
        pxor       xmm3, xmm0
        movdqa     xmm14, xmm3
        psrld      xmm3, 8
        pslld      xmm14, 24
        pxor       xmm3, xmm14
        paddd      xmm2, xmm3
        pxor       xmm1, xmm2
        movdqa     xmm11, xmm1
        pslld      xmm1, 25
        psrld      xmm11, 7
        por        xmm1, xmm11
        /* Undo the lane rotation. */
        pshufd     xmm0, xmm0, 0x39
        pshufd     xmm3, xmm3, 0x4E
        pshufd     xmm2, xmm2, 0x93
        dec        al
        jz         9f
        /* Rearrange the message registers xmm4-xmm7 for the next round. */
        movdqa     xmm8, xmm4
        shufps     xmm8, xmm5, 214
        pshufd     xmm9, xmm4, 0x0F
        pshufd     xmm4, xmm8, 0x39
        movdqa     xmm8, xmm6
        shufps     xmm8, xmm7, 250
        pand       xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand       xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por        xmm9, xmm8
        movdqa     xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa     xmm10, xmm6
        pand       xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand       xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por        xmm8, xmm10
        pshufd     xmm8, xmm8, 0x78
        punpckhdq  xmm5, xmm7
        punpckldq  xmm6, xmm5
        pshufd     xmm7, xmm6, 0x1E
        movdqa     xmm5, xmm9
        movdqa     xmm6, xmm8
        jmp        9b
9:
        /* Result = rows 0-1 XOR rows 2-3, stored back over the input. */
        pxor       xmm0, xmm2
        pxor       xmm1, xmm3
        movups     xmmword ptr [rdi], xmm0
        movups     xmmword ptr [rdi+0x10], xmm1
        RET
SET_SIZE(zfs_blake3_compress_in_place_sse2)

/*
 * zfs_blake3_compress_xof_sse2
 *
 * Same seven-round computation as zfs_blake3_compress_in_place_sse2, but
 * the 32 bytes at [rdi] are left untouched and 64 bytes are written to
 * [r9]: rows 0-1 XOR rows 2-3, followed by rows 2-3 XOR the 32 bytes
 * originally at [rdi].
 *
 * Inputs, as read by the code:
 *   rdi - 32 bytes, loaded as state rows 0-1 (read only)
 *   rsi - 64-byte message block (unaligned loads)
 *   dl  - byte placed in state word 14
 *   rcx - 64-bit value placed in state words 12-13
 *   r8b - byte placed in state word 15
 *   r9  - 64-byte output buffer (unaligned stores)
 * NOTE(review): presumably cv, block, block_len, counter, flags and out
 * of the C prototype - confirm against the declaring header.
 *
 * Clobbers: rax, rdx, xmm0-xmm11, xmm14, flags.
 */
ENTRY_ALIGN(zfs_blake3_compress_xof_sse2, 64)
        ENDBR
        movups     xmm0, xmmword ptr [rdi]
        movups     xmm1, xmmword ptr [rdi+0x10]
        movaps     xmm2, xmmword ptr [BLAKE3_IV+rip]
        /* Only the low bytes of r8 and rdx are used. */
        movzx      eax, r8b
        movzx      edx, dl
        shl        rax, 32
        add        rdx, rax
        /* xmm3 = { rcx[31:0], rcx[63:32], dl, r8b } */
        movq       xmm3, rcx
        movq       xmm4, rdx
        punpcklqdq xmm3, xmm4
        /* Load the message: even/odd dword split, second half lane-rotated. */
        movups     xmm4, xmmword ptr [rsi]
        movups     xmm5, xmmword ptr [rsi+0x10]
        movaps     xmm8, xmm4
        shufps     xmm4, xmm5, 136
        shufps     xmm8, xmm5, 221
        movaps     xmm5, xmm8
        movups     xmm6, xmmword ptr [rsi+0x20]
        movups     xmm7, xmmword ptr [rsi+0x30]
        movaps     xmm8, xmm6
        shufps     xmm6, xmm7, 136
        pshufd     xmm6, xmm6, 0x93
        shufps     xmm8, xmm7, 221
        pshufd     xmm7, xmm8, 0x93
        mov        al, 7                /* round counter */
9:
        /* First half of the round: rows as loaded. */
        paddd      xmm0, xmm4
        paddd      xmm0, xmm1
        pxor       xmm3, xmm0
        pshuflw    xmm3, xmm3, 0xB1     /* rotate each dword by 16 */
        pshufhw    xmm3, xmm3, 0xB1
        paddd      xmm2, xmm3
        pxor       xmm1, xmm2
        movdqa     xmm11, xmm1          /* rotate right by 12 */
        pslld      xmm1, 20
        psrld      xmm11, 12
        por        xmm1, xmm11
        paddd      xmm0, xmm5
        paddd      xmm0, xmm1
        pxor       xmm3, xmm0
        movdqa     xmm14, xmm3          /* rotate right by 8 */
        psrld      xmm3, 8
        pslld      xmm14, 24
        pxor       xmm3, xmm14
        paddd      xmm2, xmm3
        pxor       xmm1, xmm2
        movdqa     xmm11, xmm1          /* rotate right by 7 */
        pslld      xmm1, 25
        psrld      xmm11, 7
        por        xmm1, xmm11
        /* Lane-rotate rows 0, 3 and 2 for the second half. */
        pshufd     xmm0, xmm0, 0x93
        pshufd     xmm3, xmm3, 0x4E
        pshufd     xmm2, xmm2, 0x39
        paddd      xmm0, xmm6
        paddd      xmm0, xmm1
        pxor       xmm3, xmm0
        pshuflw    xmm3, xmm3, 0xB1
        pshufhw    xmm3, xmm3, 0xB1
        paddd      xmm2, xmm3
        pxor       xmm1, xmm2
        movdqa     xmm11, xmm1
        pslld      xmm1, 20
        psrld      xmm11, 12
        por        xmm1, xmm11
        paddd      xmm0, xmm7
        paddd      xmm0, xmm1
        pxor       xmm3, xmm0
        movdqa     xmm14, xmm3
        psrld      xmm3, 8
        pslld      xmm14, 24
        pxor       xmm3, xmm14
        paddd      xmm2, xmm3
        pxor       xmm1, xmm2
        movdqa     xmm11, xmm1
        pslld      xmm1, 25
        psrld      xmm11, 7
        por        xmm1, xmm11
        /* Undo the lane rotation. */
        pshufd     xmm0, xmm0, 0x39
        pshufd     xmm3, xmm3, 0x4E
        pshufd     xmm2, xmm2, 0x93
        dec        al
        jz         9f
        /* Rearrange the message registers xmm4-xmm7 for the next round. */
        movdqa     xmm8, xmm4
        shufps     xmm8, xmm5, 214
        pshufd     xmm9, xmm4, 0x0F
        pshufd     xmm4, xmm8, 0x39
        movdqa     xmm8, xmm6
        shufps     xmm8, xmm7, 250
        pand       xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand       xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por        xmm9, xmm8
        movdqa     xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa     xmm10, xmm6
        pand       xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand       xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por        xmm8, xmm10
        pshufd     xmm8, xmm8, 0x78
        punpckhdq  xmm5, xmm7
        punpckldq  xmm6, xmm5
        pshufd     xmm7, xmm6, 0x1E
        movdqa     xmm5, xmm9
        movdqa     xmm6, xmm8
        jmp        9b
9:
        /* Reload the untouched input rows to form the upper 32 output bytes. */
        movdqu     xmm4, xmmword ptr [rdi]
        movdqu     xmm5, xmmword ptr [rdi+0x10]
        pxor       xmm0, xmm2
        pxor       xmm1, xmm3
        pxor       xmm2, xmm4
        pxor       xmm3, xmm5
        movups     xmmword ptr [r9], xmm0
        movups     xmmword ptr [r9+0x10], xmm1
        movups     xmmword ptr [r9+0x20], xmm2
        movups     xmmword ptr [r9+0x30], xmm3
        RET
SET_SIZE(zfs_blake3_compress_xof_sse2)

/*
 * Constant tables.  The section starts 64-byte aligned and every table is
 * exactly 16 bytes, so each label is 16-byte aligned, which the movaps
 * loads and the pand/pxor memory operands in this file require.
 */
SECTION_STATIC
.p2align 6
/* Loaded into xmm2 as the initial third state row. */
BLAKE3_IV:
        .long 0x6A09E667, 0xBB67AE85
        .long 0x3C6EF372, 0xA54FF53A
/* Per-lane offsets and per-iteration step used by zfs_blake3_hash_many_sse2. */
ADD0:
        .long 0, 1, 2, 3
ADD1:
        .long 4, 4, 4, 4
/*
 * Each BLAKE3_IV word, and the value 64, broadcast to all four lanes.
 * Not referenced by the two compress entry points in this file.
 */
BLAKE3_IV_0:
        .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
BLAKE3_IV_1:
        .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
BLAKE3_IV_2:
        .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
BLAKE3_IV_3:
        .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
BLAKE3_BLOCK_LEN:
        .long 64, 64, 64, 64
/* XORed into both pcmpgtd operands so the signed compare orders unsigned values. */
CMP_MSB_MASK:
        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
/*
 * Dword-select masks combined with pand/por when rearranging the message
 * registers.  NOTE(review): the names suggest they stand in for pblendw
 * with the given immediates - confirm.
 */
PBLENDW_0x33_MASK:
        .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
PBLENDW_0xCC_MASK:
        .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
PBLENDW_0x3F_MASK:
        .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
PBLENDW_0xC0_MASK:
        .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF

#endif /* HAVE_SSE2 */

#ifdef __ELF__
.section .note.GNU-stack,"",%progbits
#endif