1/* Do not modify. This file is auto-generated from chacha-x86_64.pl. */ 2.text 3 4 5 6.align 64 7.Lzero: 8.long 0,0,0,0 9.Lone: 10.long 1,0,0,0 11.Linc: 12.long 0,1,2,3 13.Lfour: 14.long 4,4,4,4 15.Lincy: 16.long 0,2,4,6,1,3,5,7 17.Leight: 18.long 8,8,8,8,8,8,8,8 19.Lrot16: 20.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd 21.Lrot24: 22.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe 23.Ltwoy: 24.long 2,0,0,0, 2,0,0,0 25.align 64 26.Lzeroz: 27.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 28.Lfourz: 29.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 30.Lincz: 31.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 32.Lsixteen: 33.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 34.Lsigma: 35.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 36.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 37.globl ChaCha20_ctr32 38.type ChaCha20_ctr32,@function 39.align 64 40ChaCha20_ctr32: 41.cfi_startproc 42 cmpq $0,%rdx 43 je .Lno_data 44 movq OPENSSL_ia32cap_P+4(%rip),%r10 45 testl $512,%r10d 46 jnz .LChaCha20_ssse3 47 48 pushq %rbx 49.cfi_adjust_cfa_offset 8 50.cfi_offset %rbx,-16 51 pushq %rbp 52.cfi_adjust_cfa_offset 8 53.cfi_offset %rbp,-24 54 pushq %r12 55.cfi_adjust_cfa_offset 8 56.cfi_offset %r12,-32 57 pushq %r13 58.cfi_adjust_cfa_offset 8 59.cfi_offset %r13,-40 60 pushq %r14 61.cfi_adjust_cfa_offset 8 62.cfi_offset %r14,-48 63 pushq %r15 64.cfi_adjust_cfa_offset 8 65.cfi_offset %r15,-56 66 subq $64+24,%rsp 67.cfi_adjust_cfa_offset 64+24 68.Lctr32_body: 69 70 71 movdqu (%rcx),%xmm1 72 movdqu 16(%rcx),%xmm2 73 movdqu (%r8),%xmm3 74 movdqa .Lone(%rip),%xmm4 75 76 77 movdqa %xmm1,16(%rsp) 78 movdqa %xmm2,32(%rsp) 79 movdqa %xmm3,48(%rsp) 80 movq %rdx,%rbp 81 jmp .Loop_outer 82 83.align 32 84.Loop_outer: 85 movl $0x61707865,%eax 86 movl $0x3320646e,%ebx 87 movl $0x79622d32,%ecx 88 movl $0x6b206574,%edx 89 movl 16(%rsp),%r8d 90 movl 20(%rsp),%r9d 91 movl 24(%rsp),%r10d 92 movl 28(%rsp),%r11d 93 movd %xmm3,%r12d 94 movl 52(%rsp),%r13d 95 movl 56(%rsp),%r14d 96 movl 60(%rsp),%r15d 97 98 movq %rbp,64+0(%rsp) 99 movl $10,%ebp 100 movq %rsi,64+8(%rsp) 101.byte 102,72,15,126,214 102 movq %rdi,64+16(%rsp) 103 movq %rsi,%rdi 104 shrq $32,%rdi 105 jmp .Loop 106 107.align 32 108.Loop: 109 addl %r8d,%eax 110 xorl %eax,%r12d 111 roll $16,%r12d 112 addl %r9d,%ebx 113 xorl %ebx,%r13d 114 roll $16,%r13d 115 addl %r12d,%esi 116 xorl %esi,%r8d 117 roll $12,%r8d 118 addl %r13d,%edi 119 xorl %edi,%r9d 120 roll $12,%r9d 121 addl %r8d,%eax 122 xorl %eax,%r12d 123 roll $8,%r12d 124 addl %r9d,%ebx 125 xorl %ebx,%r13d 126 roll $8,%r13d 127 addl %r12d,%esi 128 xorl %esi,%r8d 129 roll $7,%r8d 130 addl %r13d,%edi 131 xorl %edi,%r9d 132 roll $7,%r9d 133 movl %esi,32(%rsp) 134 movl %edi,36(%rsp) 135 movl 40(%rsp),%esi 136 movl 44(%rsp),%edi 137 addl %r10d,%ecx 138 xorl %ecx,%r14d 139 roll $16,%r14d 140 addl %r11d,%edx 141 xorl %edx,%r15d 142 roll $16,%r15d 143 addl %r14d,%esi 144 xorl %esi,%r10d 145 roll $12,%r10d 146 addl %r15d,%edi 147 xorl %edi,%r11d 148 roll $12,%r11d 149 addl %r10d,%ecx 150 xorl %ecx,%r14d 151 roll $8,%r14d 152 addl %r11d,%edx 153 xorl %edx,%r15d 154 roll $8,%r15d 155 addl %r14d,%esi 156 xorl %esi,%r10d 157 roll $7,%r10d 158 addl %r15d,%edi 159 xorl %edi,%r11d 160 roll $7,%r11d 161 addl %r9d,%eax 162 xorl %eax,%r15d 163 roll $16,%r15d 164 addl %r10d,%ebx 165 xorl %ebx,%r12d 166 roll $16,%r12d 167 addl %r15d,%esi 168 xorl %esi,%r9d 169 roll $12,%r9d 170 addl %r12d,%edi 171 xorl %edi,%r10d 172 roll $12,%r10d 173 addl %r9d,%eax 174 xorl %eax,%r15d 175 roll $8,%r15d 176 addl %r10d,%ebx 177 xorl %ebx,%r12d 178 roll $8,%r12d 179 addl %r15d,%esi 180 xorl %esi,%r9d 181 roll $7,%r9d 182 addl %r12d,%edi 183 xorl %edi,%r10d 184 roll $7,%r10d 185 movl %esi,40(%rsp) 186 movl %edi,44(%rsp) 187 movl 32(%rsp),%esi 188 movl 36(%rsp),%edi 189 addl %r11d,%ecx 190 xorl %ecx,%r13d 191 roll $16,%r13d 192 addl %r8d,%edx 193 xorl %edx,%r14d 194 roll $16,%r14d 195 addl %r13d,%esi 196 xorl %esi,%r11d 197 roll $12,%r11d 198 addl %r14d,%edi 199 xorl %edi,%r8d 200 roll $12,%r8d 201 addl %r11d,%ecx 202 xorl %ecx,%r13d 203 roll $8,%r13d 204 addl %r8d,%edx 205 xorl %edx,%r14d 206 roll $8,%r14d 207 addl %r13d,%esi 208 xorl %esi,%r11d 209 roll $7,%r11d 210 addl %r14d,%edi 211 xorl %edi,%r8d 212 roll $7,%r8d 213 decl %ebp 214 jnz .Loop 215 movl %edi,36(%rsp) 216 movl %esi,32(%rsp) 217 movq 64(%rsp),%rbp 218 movdqa %xmm2,%xmm1 219 movq 64+8(%rsp),%rsi 220 paddd %xmm4,%xmm3 221 movq 64+16(%rsp),%rdi 222 223 addl $0x61707865,%eax 224 addl $0x3320646e,%ebx 225 addl $0x79622d32,%ecx 226 addl $0x6b206574,%edx 227 addl 16(%rsp),%r8d 228 addl 20(%rsp),%r9d 229 addl 24(%rsp),%r10d 230 addl 28(%rsp),%r11d 231 addl 48(%rsp),%r12d 232 addl 52(%rsp),%r13d 233 addl 56(%rsp),%r14d 234 addl 60(%rsp),%r15d 235 paddd 32(%rsp),%xmm1 236 237 cmpq $64,%rbp 238 jb .Ltail 239 240 xorl 0(%rsi),%eax 241 xorl 4(%rsi),%ebx 242 xorl 8(%rsi),%ecx 243 xorl 12(%rsi),%edx 244 xorl 16(%rsi),%r8d 245 xorl 20(%rsi),%r9d 246 xorl 24(%rsi),%r10d 247 xorl 28(%rsi),%r11d 248 movdqu 32(%rsi),%xmm0 249 xorl 48(%rsi),%r12d 250 xorl 52(%rsi),%r13d 251 xorl 56(%rsi),%r14d 252 xorl 60(%rsi),%r15d 253 leaq 64(%rsi),%rsi 254 pxor %xmm1,%xmm0 255 256 movdqa %xmm2,32(%rsp) 257 movd %xmm3,48(%rsp) 258 259 movl %eax,0(%rdi) 260 movl %ebx,4(%rdi) 261 movl %ecx,8(%rdi) 262 movl %edx,12(%rdi) 263 movl %r8d,16(%rdi) 264 movl %r9d,20(%rdi) 265 movl %r10d,24(%rdi) 266 movl %r11d,28(%rdi) 267 movdqu %xmm0,32(%rdi) 268 movl %r12d,48(%rdi) 269 movl %r13d,52(%rdi) 270 movl %r14d,56(%rdi) 271 movl %r15d,60(%rdi) 272 leaq 64(%rdi),%rdi 273 274 subq $64,%rbp 275 jnz .Loop_outer 276 277 jmp .Ldone 278 279.align 16 280.Ltail: 281 movl %eax,0(%rsp) 282 movl %ebx,4(%rsp) 283 xorq %rbx,%rbx 284 movl %ecx,8(%rsp) 285 movl %edx,12(%rsp) 286 movl %r8d,16(%rsp) 287 movl %r9d,20(%rsp) 288 movl %r10d,24(%rsp) 289 movl %r11d,28(%rsp) 290 movdqa %xmm1,32(%rsp) 291 movl %r12d,48(%rsp) 292 movl %r13d,52(%rsp) 293 movl %r14d,56(%rsp) 294 movl %r15d,60(%rsp) 295 296.Loop_tail: 297 movzbl (%rsi,%rbx,1),%eax 298 movzbl (%rsp,%rbx,1),%edx 299 leaq 1(%rbx),%rbx 300 xorl %edx,%eax 301 movb %al,-1(%rdi,%rbx,1) 302 decq %rbp 303 jnz .Loop_tail 304 305.Ldone: 306 leaq 64+24+48(%rsp),%rsi 307.cfi_def_cfa %rsi,8 308 movq -48(%rsi),%r15 309.cfi_restore %r15 310 movq -40(%rsi),%r14 311.cfi_restore %r14 312 movq -32(%rsi),%r13 313.cfi_restore %r13 314 movq -24(%rsi),%r12 315.cfi_restore %r12 316 movq -16(%rsi),%rbp 317.cfi_restore %rbp 318 movq -8(%rsi),%rbx 319.cfi_restore %rbx 320 leaq (%rsi),%rsp 321.cfi_def_cfa_register %rsp 322.Lno_data: 323 .byte 0xf3,0xc3 324.cfi_endproc 325.size ChaCha20_ctr32,.-ChaCha20_ctr32 326.type ChaCha20_ssse3,@function 327.align 32 328ChaCha20_ssse3: 329.cfi_startproc 330.LChaCha20_ssse3: 331 movq %rsp,%r9 332.cfi_def_cfa_register %r9 333 testl $2048,%r10d 334 jnz .LChaCha20_4xop 335 cmpq $128,%rdx 336 je .LChaCha20_128 337 ja .LChaCha20_4x 338 339.Ldo_sse3_after_all: 340 subq $64+8,%rsp 341 movdqa .Lsigma(%rip),%xmm0 342 movdqu (%rcx),%xmm1 343 movdqu 16(%rcx),%xmm2 344 movdqu (%r8),%xmm3 345 movdqa .Lrot16(%rip),%xmm6 346 movdqa .Lrot24(%rip),%xmm7 347 348 movdqa %xmm0,0(%rsp) 349 movdqa %xmm1,16(%rsp) 350 movdqa %xmm2,32(%rsp) 351 movdqa %xmm3,48(%rsp) 352 movq $10,%r8 353 jmp .Loop_ssse3 354 355.align 32 356.Loop_outer_ssse3: 357 movdqa .Lone(%rip),%xmm3 358 movdqa 0(%rsp),%xmm0 359 movdqa 16(%rsp),%xmm1 360 movdqa 32(%rsp),%xmm2 361 paddd 48(%rsp),%xmm3 362 movq $10,%r8 363 movdqa %xmm3,48(%rsp) 364 jmp .Loop_ssse3 365 366.align 32 367.Loop_ssse3: 368 paddd %xmm1,%xmm0 369 pxor %xmm0,%xmm3 370.byte 102,15,56,0,222 371 paddd %xmm3,%xmm2 372 pxor %xmm2,%xmm1 373 movdqa %xmm1,%xmm4 374 psrld $20,%xmm1 375 pslld $12,%xmm4 376 por %xmm4,%xmm1 377 paddd %xmm1,%xmm0 378 pxor %xmm0,%xmm3 379.byte 102,15,56,0,223 380 paddd %xmm3,%xmm2 381 pxor %xmm2,%xmm1 382 movdqa %xmm1,%xmm4 383 psrld $25,%xmm1 384 pslld $7,%xmm4 385 por %xmm4,%xmm1 386 pshufd $78,%xmm2,%xmm2 387 pshufd $57,%xmm1,%xmm1 388 pshufd $147,%xmm3,%xmm3 389 nop 390 paddd %xmm1,%xmm0 391 pxor %xmm0,%xmm3 392.byte 102,15,56,0,222 393 paddd %xmm3,%xmm2 394 pxor %xmm2,%xmm1 395 movdqa %xmm1,%xmm4 396 psrld $20,%xmm1 397 pslld $12,%xmm4 398 por %xmm4,%xmm1 399 paddd %xmm1,%xmm0 400 pxor %xmm0,%xmm3 401.byte 102,15,56,0,223 402 paddd %xmm3,%xmm2 403 pxor %xmm2,%xmm1 404 movdqa %xmm1,%xmm4 405 psrld $25,%xmm1 406 pslld $7,%xmm4 407 por %xmm4,%xmm1 408 pshufd $78,%xmm2,%xmm2 409 pshufd $147,%xmm1,%xmm1 410 pshufd $57,%xmm3,%xmm3 411 decq %r8 412 jnz .Loop_ssse3 413 paddd 0(%rsp),%xmm0 414 paddd 16(%rsp),%xmm1 415 paddd 32(%rsp),%xmm2 416 paddd 48(%rsp),%xmm3 417 418 cmpq $64,%rdx 419 jb .Ltail_ssse3 420 421 movdqu 0(%rsi),%xmm4 422 movdqu 16(%rsi),%xmm5 423 pxor %xmm4,%xmm0 424 movdqu 32(%rsi),%xmm4 425 pxor %xmm5,%xmm1 426 movdqu 48(%rsi),%xmm5 427 leaq 64(%rsi),%rsi 428 pxor %xmm4,%xmm2 429 pxor %xmm5,%xmm3 430 431 movdqu %xmm0,0(%rdi) 432 movdqu %xmm1,16(%rdi) 433 movdqu %xmm2,32(%rdi) 434 movdqu %xmm3,48(%rdi) 435 leaq 64(%rdi),%rdi 436 437 subq $64,%rdx 438 jnz .Loop_outer_ssse3 439 440 jmp .Ldone_ssse3 441 442.align 16 443.Ltail_ssse3: 444 movdqa %xmm0,0(%rsp) 445 movdqa %xmm1,16(%rsp) 446 movdqa %xmm2,32(%rsp) 447 movdqa %xmm3,48(%rsp) 448 xorq %r8,%r8 449 450.Loop_tail_ssse3: 451 movzbl (%rsi,%r8,1),%eax 452 movzbl (%rsp,%r8,1),%ecx 453 leaq 1(%r8),%r8 454 xorl %ecx,%eax 455 movb %al,-1(%rdi,%r8,1) 456 decq %rdx 457 jnz .Loop_tail_ssse3 458 459.Ldone_ssse3: 460 leaq (%r9),%rsp 461.cfi_def_cfa_register %rsp 462.Lssse3_epilogue: 463 .byte 0xf3,0xc3 464.cfi_endproc 465.size ChaCha20_ssse3,.-ChaCha20_ssse3 466.type ChaCha20_128,@function 467.align 32 468ChaCha20_128: 469.cfi_startproc 470.LChaCha20_128: 471 movq %rsp,%r9 472.cfi_def_cfa_register %r9 473 subq $64+8,%rsp 474 movdqa .Lsigma(%rip),%xmm8 475 movdqu (%rcx),%xmm9 476 movdqu 16(%rcx),%xmm2 477 movdqu (%r8),%xmm3 478 movdqa .Lone(%rip),%xmm1 479 movdqa .Lrot16(%rip),%xmm6 480 movdqa .Lrot24(%rip),%xmm7 481 482 movdqa %xmm8,%xmm10 483 movdqa %xmm8,0(%rsp) 484 movdqa %xmm9,%xmm11 485 movdqa %xmm9,16(%rsp) 486 movdqa %xmm2,%xmm0 487 movdqa %xmm2,32(%rsp) 488 paddd %xmm3,%xmm1 489 movdqa %xmm3,48(%rsp) 490 movq $10,%r8 491 jmp .Loop_128 492 493.align 32 494.Loop_128: 495 paddd %xmm9,%xmm8 496 pxor %xmm8,%xmm3 497 paddd %xmm11,%xmm10 498 pxor %xmm10,%xmm1 499.byte 102,15,56,0,222 500.byte 102,15,56,0,206 501 paddd %xmm3,%xmm2 502 paddd %xmm1,%xmm0 503 pxor %xmm2,%xmm9 504 pxor %xmm0,%xmm11 505 movdqa %xmm9,%xmm4 506 psrld $20,%xmm9 507 movdqa %xmm11,%xmm5 508 pslld $12,%xmm4 509 psrld $20,%xmm11 510 por %xmm4,%xmm9 511 pslld $12,%xmm5 512 por %xmm5,%xmm11 513 paddd %xmm9,%xmm8 514 pxor %xmm8,%xmm3 515 paddd %xmm11,%xmm10 516 pxor %xmm10,%xmm1 517.byte 102,15,56,0,223 518.byte 102,15,56,0,207 519 paddd %xmm3,%xmm2 520 paddd %xmm1,%xmm0 521 pxor %xmm2,%xmm9 522 pxor %xmm0,%xmm11 523 movdqa %xmm9,%xmm4 524 psrld $25,%xmm9 525 movdqa %xmm11,%xmm5 526 pslld $7,%xmm4 527 psrld $25,%xmm11 528 por %xmm4,%xmm9 529 pslld $7,%xmm5 530 por %xmm5,%xmm11 531 pshufd $78,%xmm2,%xmm2 532 pshufd $57,%xmm9,%xmm9 533 pshufd $147,%xmm3,%xmm3 534 pshufd $78,%xmm0,%xmm0 535 pshufd $57,%xmm11,%xmm11 536 pshufd $147,%xmm1,%xmm1 537 paddd %xmm9,%xmm8 538 pxor %xmm8,%xmm3 539 paddd %xmm11,%xmm10 540 pxor %xmm10,%xmm1 541.byte 102,15,56,0,222 542.byte 102,15,56,0,206 543 paddd %xmm3,%xmm2 544 paddd %xmm1,%xmm0 545 pxor %xmm2,%xmm9 546 pxor %xmm0,%xmm11 547 movdqa %xmm9,%xmm4 548 psrld $20,%xmm9 549 movdqa %xmm11,%xmm5 550 pslld $12,%xmm4 551 psrld $20,%xmm11 552 por %xmm4,%xmm9 553 pslld $12,%xmm5 554 por %xmm5,%xmm11 555 paddd %xmm9,%xmm8 556 pxor %xmm8,%xmm3 557 paddd %xmm11,%xmm10 558 pxor %xmm10,%xmm1 559.byte 102,15,56,0,223 560.byte 102,15,56,0,207 561 paddd %xmm3,%xmm2 562 paddd %xmm1,%xmm0 563 pxor %xmm2,%xmm9 564 pxor %xmm0,%xmm11 565 movdqa %xmm9,%xmm4 566 psrld $25,%xmm9 567 movdqa %xmm11,%xmm5 568 pslld $7,%xmm4 569 psrld $25,%xmm11 570 por %xmm4,%xmm9 571 pslld $7,%xmm5 572 por %xmm5,%xmm11 573 pshufd $78,%xmm2,%xmm2 574 pshufd $147,%xmm9,%xmm9 575 pshufd $57,%xmm3,%xmm3 576 pshufd $78,%xmm0,%xmm0 577 pshufd $147,%xmm11,%xmm11 578 pshufd $57,%xmm1,%xmm1 579 decq %r8 580 jnz .Loop_128 581 paddd 0(%rsp),%xmm8 582 paddd 16(%rsp),%xmm9 583 paddd 32(%rsp),%xmm2 584 paddd 48(%rsp),%xmm3 585 paddd .Lone(%rip),%xmm1 586 paddd 0(%rsp),%xmm10 587 paddd 16(%rsp),%xmm11 588 paddd 32(%rsp),%xmm0 589 paddd 48(%rsp),%xmm1 590 591 movdqu 0(%rsi),%xmm4 592 movdqu 16(%rsi),%xmm5 593 pxor %xmm4,%xmm8 594 movdqu 32(%rsi),%xmm4 595 pxor %xmm5,%xmm9 596 movdqu 48(%rsi),%xmm5 597 pxor %xmm4,%xmm2 598 movdqu 64(%rsi),%xmm4 599 pxor %xmm5,%xmm3 600 movdqu 80(%rsi),%xmm5 601 pxor %xmm4,%xmm10 602 movdqu 96(%rsi),%xmm4 603 pxor %xmm5,%xmm11 604 movdqu 112(%rsi),%xmm5 605 pxor %xmm4,%xmm0 606 pxor %xmm5,%xmm1 607 608 movdqu %xmm8,0(%rdi) 609 movdqu %xmm9,16(%rdi) 610 movdqu %xmm2,32(%rdi) 611 movdqu %xmm3,48(%rdi) 612 movdqu %xmm10,64(%rdi) 613 movdqu %xmm11,80(%rdi) 614 movdqu %xmm0,96(%rdi) 615 movdqu %xmm1,112(%rdi) 616 leaq (%r9),%rsp 617.cfi_def_cfa_register %rsp 618.L128_epilogue: 619 .byte 0xf3,0xc3 620.cfi_endproc 621.size ChaCha20_128,.-ChaCha20_128 622.type ChaCha20_4x,@function 623.align 32 624ChaCha20_4x: 625.cfi_startproc 626.LChaCha20_4x: 627 movq %rsp,%r9 628.cfi_def_cfa_register %r9 629 movq %r10,%r11 630 shrq $32,%r10 631 testq $32,%r10 632 jnz .LChaCha20_8x 633 cmpq $192,%rdx 634 ja .Lproceed4x 635 636 andq $71303168,%r11 637 cmpq $4194304,%r11 638 je .Ldo_sse3_after_all 639 640.Lproceed4x: 641 subq $0x140+8,%rsp 642 movdqa .Lsigma(%rip),%xmm11 643 movdqu (%rcx),%xmm15 644 movdqu 16(%rcx),%xmm7 645 movdqu (%r8),%xmm3 646 leaq 256(%rsp),%rcx 647 leaq .Lrot16(%rip),%r10 648 leaq .Lrot24(%rip),%r11 649 650 pshufd $0x00,%xmm11,%xmm8 651 pshufd $0x55,%xmm11,%xmm9 652 movdqa %xmm8,64(%rsp) 653 pshufd $0xaa,%xmm11,%xmm10 654 movdqa %xmm9,80(%rsp) 655 pshufd $0xff,%xmm11,%xmm11 656 movdqa %xmm10,96(%rsp) 657 movdqa %xmm11,112(%rsp) 658 659 pshufd $0x00,%xmm15,%xmm12 660 pshufd $0x55,%xmm15,%xmm13 661 movdqa %xmm12,128-256(%rcx) 662 pshufd $0xaa,%xmm15,%xmm14 663 movdqa %xmm13,144-256(%rcx) 664 pshufd $0xff,%xmm15,%xmm15 665 movdqa %xmm14,160-256(%rcx) 666 movdqa %xmm15,176-256(%rcx) 667 668 pshufd $0x00,%xmm7,%xmm4 669 pshufd $0x55,%xmm7,%xmm5 670 movdqa %xmm4,192-256(%rcx) 671 pshufd $0xaa,%xmm7,%xmm6 672 movdqa %xmm5,208-256(%rcx) 673 pshufd $0xff,%xmm7,%xmm7 674 movdqa %xmm6,224-256(%rcx) 675 movdqa %xmm7,240-256(%rcx) 676 677 pshufd $0x00,%xmm3,%xmm0 678 pshufd $0x55,%xmm3,%xmm1 679 paddd .Linc(%rip),%xmm0 680 pshufd $0xaa,%xmm3,%xmm2 681 movdqa %xmm1,272-256(%rcx) 682 pshufd $0xff,%xmm3,%xmm3 683 movdqa %xmm2,288-256(%rcx) 684 movdqa %xmm3,304-256(%rcx) 685 686 jmp .Loop_enter4x 687 688.align 32 689.Loop_outer4x: 690 movdqa 64(%rsp),%xmm8 691 movdqa 80(%rsp),%xmm9 692 movdqa 96(%rsp),%xmm10 693 movdqa 112(%rsp),%xmm11 694 movdqa 128-256(%rcx),%xmm12 695 movdqa 144-256(%rcx),%xmm13 696 movdqa 160-256(%rcx),%xmm14 697 movdqa 176-256(%rcx),%xmm15 698 movdqa 192-256(%rcx),%xmm4 699 movdqa 208-256(%rcx),%xmm5 700 movdqa 224-256(%rcx),%xmm6 701 movdqa 240-256(%rcx),%xmm7 702 movdqa 256-256(%rcx),%xmm0 703 movdqa 272-256(%rcx),%xmm1 704 movdqa 288-256(%rcx),%xmm2 705 movdqa 304-256(%rcx),%xmm3 706 paddd .Lfour(%rip),%xmm0 707 708.Loop_enter4x: 709 movdqa %xmm6,32(%rsp) 710 movdqa %xmm7,48(%rsp) 711 movdqa (%r10),%xmm7 712 movl $10,%eax 713 movdqa %xmm0,256-256(%rcx) 714 jmp .Loop4x 715 716.align 32 717.Loop4x: 718 paddd %xmm12,%xmm8 719 paddd %xmm13,%xmm9 720 pxor %xmm8,%xmm0 721 pxor %xmm9,%xmm1 722.byte 102,15,56,0,199 723.byte 102,15,56,0,207 724 paddd %xmm0,%xmm4 725 paddd %xmm1,%xmm5 726 pxor %xmm4,%xmm12 727 pxor %xmm5,%xmm13 728 movdqa %xmm12,%xmm6 729 pslld $12,%xmm12 730 psrld $20,%xmm6 731 movdqa %xmm13,%xmm7 732 pslld $12,%xmm13 733 por %xmm6,%xmm12 734 psrld $20,%xmm7 735 movdqa (%r11),%xmm6 736 por %xmm7,%xmm13 737 paddd %xmm12,%xmm8 738 paddd %xmm13,%xmm9 739 pxor %xmm8,%xmm0 740 pxor %xmm9,%xmm1 741.byte 102,15,56,0,198 742.byte 102,15,56,0,206 743 paddd %xmm0,%xmm4 744 paddd %xmm1,%xmm5 745 pxor %xmm4,%xmm12 746 pxor %xmm5,%xmm13 747 movdqa %xmm12,%xmm7 748 pslld $7,%xmm12 749 psrld $25,%xmm7 750 movdqa %xmm13,%xmm6 751 pslld $7,%xmm13 752 por %xmm7,%xmm12 753 psrld $25,%xmm6 754 movdqa (%r10),%xmm7 755 por %xmm6,%xmm13 756 movdqa %xmm4,0(%rsp) 757 movdqa %xmm5,16(%rsp) 758 movdqa 32(%rsp),%xmm4 759 movdqa 48(%rsp),%xmm5 760 paddd %xmm14,%xmm10 761 paddd %xmm15,%xmm11 762 pxor %xmm10,%xmm2 763 pxor %xmm11,%xmm3 764.byte 102,15,56,0,215 765.byte 102,15,56,0,223 766 paddd %xmm2,%xmm4 767 paddd %xmm3,%xmm5 768 pxor %xmm4,%xmm14 769 pxor %xmm5,%xmm15 770 movdqa %xmm14,%xmm6 771 pslld $12,%xmm14 772 psrld $20,%xmm6 773 movdqa %xmm15,%xmm7 774 pslld $12,%xmm15 775 por %xmm6,%xmm14 776 psrld $20,%xmm7 777 movdqa (%r11),%xmm6 778 por %xmm7,%xmm15 779 paddd %xmm14,%xmm10 780 paddd %xmm15,%xmm11 781 pxor %xmm10,%xmm2 782 pxor %xmm11,%xmm3 783.byte 102,15,56,0,214 784.byte 102,15,56,0,222 785 paddd %xmm2,%xmm4 786 paddd %xmm3,%xmm5 787 pxor %xmm4,%xmm14 788 pxor %xmm5,%xmm15 789 movdqa %xmm14,%xmm7 790 pslld $7,%xmm14 791 psrld $25,%xmm7 792 movdqa %xmm15,%xmm6 793 pslld $7,%xmm15 794 por %xmm7,%xmm14 795 psrld $25,%xmm6 796 movdqa (%r10),%xmm7 797 por %xmm6,%xmm15 798 paddd %xmm13,%xmm8 799 paddd %xmm14,%xmm9 800 pxor %xmm8,%xmm3 801 pxor %xmm9,%xmm0 802.byte 102,15,56,0,223 803.byte 102,15,56,0,199 804 paddd %xmm3,%xmm4 805 paddd %xmm0,%xmm5 806 pxor %xmm4,%xmm13 807 pxor %xmm5,%xmm14 808 movdqa %xmm13,%xmm6 809 pslld $12,%xmm13 810 psrld $20,%xmm6 811 movdqa %xmm14,%xmm7 812 pslld $12,%xmm14 813 por %xmm6,%xmm13 814 psrld $20,%xmm7 815 movdqa (%r11),%xmm6 816 por %xmm7,%xmm14 817 paddd %xmm13,%xmm8 818 paddd %xmm14,%xmm9 819 pxor %xmm8,%xmm3 820 pxor %xmm9,%xmm0 821.byte 102,15,56,0,222 822.byte 102,15,56,0,198 823 paddd %xmm3,%xmm4 824 paddd %xmm0,%xmm5 825 pxor %xmm4,%xmm13 826 pxor %xmm5,%xmm14 827 movdqa %xmm13,%xmm7 828 pslld $7,%xmm13 829 psrld $25,%xmm7 830 movdqa %xmm14,%xmm6 831 pslld $7,%xmm14 832 por %xmm7,%xmm13 833 psrld $25,%xmm6 834 movdqa (%r10),%xmm7 835 por %xmm6,%xmm14 836 movdqa %xmm4,32(%rsp) 837 movdqa %xmm5,48(%rsp) 838 movdqa 0(%rsp),%xmm4 839 movdqa 16(%rsp),%xmm5 840 paddd %xmm15,%xmm10 841 paddd %xmm12,%xmm11 842 pxor %xmm10,%xmm1 843 pxor %xmm11,%xmm2 844.byte 102,15,56,0,207 845.byte 102,15,56,0,215 846 paddd %xmm1,%xmm4 847 paddd %xmm2,%xmm5 848 pxor %xmm4,%xmm15 849 pxor %xmm5,%xmm12 850 movdqa %xmm15,%xmm6 851 pslld $12,%xmm15 852 psrld $20,%xmm6 853 movdqa %xmm12,%xmm7 854 pslld $12,%xmm12 855 por %xmm6,%xmm15 856 psrld $20,%xmm7 857 movdqa (%r11),%xmm6 858 por %xmm7,%xmm12 859 paddd %xmm15,%xmm10 860 paddd %xmm12,%xmm11 861 pxor %xmm10,%xmm1 862 pxor %xmm11,%xmm2 863.byte 102,15,56,0,206 864.byte 102,15,56,0,214 865 paddd %xmm1,%xmm4 866 paddd %xmm2,%xmm5 867 pxor %xmm4,%xmm15 868 pxor %xmm5,%xmm12 869 movdqa %xmm15,%xmm7 870 pslld $7,%xmm15 871 psrld $25,%xmm7 872 movdqa %xmm12,%xmm6 873 pslld $7,%xmm12 874 por %xmm7,%xmm15 875 psrld $25,%xmm6 876 movdqa (%r10),%xmm7 877 por %xmm6,%xmm12 878 decl %eax 879 jnz .Loop4x 880 881 paddd 64(%rsp),%xmm8 882 paddd 80(%rsp),%xmm9 883 paddd 96(%rsp),%xmm10 884 paddd 112(%rsp),%xmm11 885 886 movdqa %xmm8,%xmm6 887 punpckldq %xmm9,%xmm8 888 movdqa %xmm10,%xmm7 889 punpckldq %xmm11,%xmm10 890 punpckhdq %xmm9,%xmm6 891 punpckhdq %xmm11,%xmm7 892 movdqa %xmm8,%xmm9 893 punpcklqdq %xmm10,%xmm8 894 movdqa %xmm6,%xmm11 895 punpcklqdq %xmm7,%xmm6 896 punpckhqdq %xmm10,%xmm9 897 punpckhqdq %xmm7,%xmm11 898 paddd 128-256(%rcx),%xmm12 899 paddd 144-256(%rcx),%xmm13 900 paddd 160-256(%rcx),%xmm14 901 paddd 176-256(%rcx),%xmm15 902 903 movdqa %xmm8,0(%rsp) 904 movdqa %xmm9,16(%rsp) 905 movdqa 32(%rsp),%xmm8 906 movdqa 48(%rsp),%xmm9 907 908 movdqa %xmm12,%xmm10 909 punpckldq %xmm13,%xmm12 910 movdqa %xmm14,%xmm7 911 punpckldq %xmm15,%xmm14 912 punpckhdq %xmm13,%xmm10 913 punpckhdq %xmm15,%xmm7 914 movdqa %xmm12,%xmm13 915 punpcklqdq %xmm14,%xmm12 916 movdqa %xmm10,%xmm15 917 punpcklqdq %xmm7,%xmm10 918 punpckhqdq %xmm14,%xmm13 919 punpckhqdq %xmm7,%xmm15 920 paddd 192-256(%rcx),%xmm4 921 paddd 208-256(%rcx),%xmm5 922 paddd 224-256(%rcx),%xmm8 923 paddd 240-256(%rcx),%xmm9 924 925 movdqa %xmm6,32(%rsp) 926 movdqa %xmm11,48(%rsp) 927 928 movdqa %xmm4,%xmm14 929 punpckldq %xmm5,%xmm4 930 movdqa %xmm8,%xmm7 931 punpckldq %xmm9,%xmm8 932 punpckhdq %xmm5,%xmm14 933 punpckhdq %xmm9,%xmm7 934 movdqa %xmm4,%xmm5 935 punpcklqdq %xmm8,%xmm4 936 movdqa %xmm14,%xmm9 937 punpcklqdq %xmm7,%xmm14 938 punpckhqdq %xmm8,%xmm5 939 punpckhqdq %xmm7,%xmm9 940 paddd 256-256(%rcx),%xmm0 941 paddd 272-256(%rcx),%xmm1 942 paddd 288-256(%rcx),%xmm2 943 paddd 304-256(%rcx),%xmm3 944 945 movdqa %xmm0,%xmm8 946 punpckldq %xmm1,%xmm0 947 movdqa %xmm2,%xmm7 948 punpckldq %xmm3,%xmm2 949 punpckhdq %xmm1,%xmm8 950 punpckhdq %xmm3,%xmm7 951 movdqa %xmm0,%xmm1 952 punpcklqdq %xmm2,%xmm0 953 movdqa %xmm8,%xmm3 954 punpcklqdq %xmm7,%xmm8 955 punpckhqdq %xmm2,%xmm1 956 punpckhqdq %xmm7,%xmm3 957 cmpq $256,%rdx 958 jb .Ltail4x 959 960 movdqu 0(%rsi),%xmm6 961 movdqu 16(%rsi),%xmm11 962 movdqu 32(%rsi),%xmm2 963 movdqu 48(%rsi),%xmm7 964 pxor 0(%rsp),%xmm6 965 pxor %xmm12,%xmm11 966 pxor %xmm4,%xmm2 967 pxor %xmm0,%xmm7 968 969 movdqu %xmm6,0(%rdi) 970 movdqu 64(%rsi),%xmm6 971 movdqu %xmm11,16(%rdi) 972 movdqu 80(%rsi),%xmm11 973 movdqu %xmm2,32(%rdi) 974 movdqu 96(%rsi),%xmm2 975 movdqu %xmm7,48(%rdi) 976 movdqu 112(%rsi),%xmm7 977 leaq 128(%rsi),%rsi 978 pxor 16(%rsp),%xmm6 979 pxor %xmm13,%xmm11 980 pxor %xmm5,%xmm2 981 pxor %xmm1,%xmm7 982 983 movdqu %xmm6,64(%rdi) 984 movdqu 0(%rsi),%xmm6 985 movdqu %xmm11,80(%rdi) 986 movdqu 16(%rsi),%xmm11 987 movdqu %xmm2,96(%rdi) 988 movdqu 32(%rsi),%xmm2 989 movdqu %xmm7,112(%rdi) 990 leaq 128(%rdi),%rdi 991 movdqu 48(%rsi),%xmm7 992 pxor 32(%rsp),%xmm6 993 pxor %xmm10,%xmm11 994 pxor %xmm14,%xmm2 995 pxor %xmm8,%xmm7 996 997 movdqu %xmm6,0(%rdi) 998 movdqu 64(%rsi),%xmm6 999 movdqu %xmm11,16(%rdi) 1000 movdqu 80(%rsi),%xmm11 1001 movdqu %xmm2,32(%rdi) 1002 movdqu 96(%rsi),%xmm2 1003 movdqu %xmm7,48(%rdi) 1004 movdqu 112(%rsi),%xmm7 1005 leaq 128(%rsi),%rsi 1006 pxor 48(%rsp),%xmm6 1007 pxor %xmm15,%xmm11 1008 pxor %xmm9,%xmm2 1009 pxor %xmm3,%xmm7 1010 movdqu %xmm6,64(%rdi) 1011 movdqu %xmm11,80(%rdi) 1012 movdqu %xmm2,96(%rdi) 1013 movdqu %xmm7,112(%rdi) 1014 leaq 128(%rdi),%rdi 1015 1016 subq $256,%rdx 1017 jnz .Loop_outer4x 1018 1019 jmp .Ldone4x 1020 1021.Ltail4x: 1022 cmpq $192,%rdx 1023 jae .L192_or_more4x 1024 cmpq $128,%rdx 1025 jae .L128_or_more4x 1026 cmpq $64,%rdx 1027 jae .L64_or_more4x 1028 1029 1030 xorq %r10,%r10 1031 1032 movdqa %xmm12,16(%rsp) 1033 movdqa %xmm4,32(%rsp) 1034 movdqa %xmm0,48(%rsp) 1035 jmp .Loop_tail4x 1036 1037.align 32 1038.L64_or_more4x: 1039 movdqu 0(%rsi),%xmm6 1040 movdqu 16(%rsi),%xmm11 1041 movdqu 32(%rsi),%xmm2 1042 movdqu 48(%rsi),%xmm7 1043 pxor 0(%rsp),%xmm6 1044 pxor %xmm12,%xmm11 1045 pxor %xmm4,%xmm2 1046 pxor %xmm0,%xmm7 1047 movdqu %xmm6,0(%rdi) 1048 movdqu %xmm11,16(%rdi) 1049 movdqu %xmm2,32(%rdi) 1050 movdqu %xmm7,48(%rdi) 1051 je .Ldone4x 1052 1053 movdqa 16(%rsp),%xmm6 1054 leaq 64(%rsi),%rsi 1055 xorq %r10,%r10 1056 movdqa %xmm6,0(%rsp) 1057 movdqa %xmm13,16(%rsp) 1058 leaq 64(%rdi),%rdi 1059 movdqa %xmm5,32(%rsp) 1060 subq $64,%rdx 1061 movdqa %xmm1,48(%rsp) 1062 jmp .Loop_tail4x 1063 1064.align 32 1065.L128_or_more4x: 1066 movdqu 0(%rsi),%xmm6 1067 movdqu 16(%rsi),%xmm11 1068 movdqu 32(%rsi),%xmm2 1069 movdqu 48(%rsi),%xmm7 1070 pxor 0(%rsp),%xmm6 1071 pxor %xmm12,%xmm11 1072 pxor %xmm4,%xmm2 1073 pxor %xmm0,%xmm7 1074 1075 movdqu %xmm6,0(%rdi) 1076 movdqu 64(%rsi),%xmm6 1077 movdqu %xmm11,16(%rdi) 1078 movdqu 80(%rsi),%xmm11 1079 movdqu %xmm2,32(%rdi) 1080 movdqu 96(%rsi),%xmm2 1081 movdqu %xmm7,48(%rdi) 1082 movdqu 112(%rsi),%xmm7 1083 pxor 16(%rsp),%xmm6 1084 pxor %xmm13,%xmm11 1085 pxor %xmm5,%xmm2 1086 pxor %xmm1,%xmm7 1087 movdqu %xmm6,64(%rdi) 1088 movdqu %xmm11,80(%rdi) 1089 movdqu %xmm2,96(%rdi) 1090 movdqu %xmm7,112(%rdi) 1091 je .Ldone4x 1092 1093 movdqa 32(%rsp),%xmm6 1094 leaq 128(%rsi),%rsi 1095 xorq %r10,%r10 1096 movdqa %xmm6,0(%rsp) 1097 movdqa %xmm10,16(%rsp) 1098 leaq 128(%rdi),%rdi 1099 movdqa %xmm14,32(%rsp) 1100 subq $128,%rdx 1101 movdqa %xmm8,48(%rsp) 1102 jmp .Loop_tail4x 1103 1104.align 32 1105.L192_or_more4x: 1106 movdqu 0(%rsi),%xmm6 1107 movdqu 16(%rsi),%xmm11 1108 movdqu 32(%rsi),%xmm2 1109 movdqu 48(%rsi),%xmm7 1110 pxor 0(%rsp),%xmm6 1111 pxor %xmm12,%xmm11 1112 pxor %xmm4,%xmm2 1113 pxor %xmm0,%xmm7 1114 1115 movdqu %xmm6,0(%rdi) 1116 movdqu 64(%rsi),%xmm6 1117 movdqu %xmm11,16(%rdi) 1118 movdqu 80(%rsi),%xmm11 1119 movdqu %xmm2,32(%rdi) 1120 movdqu 96(%rsi),%xmm2 1121 movdqu %xmm7,48(%rdi) 1122 movdqu 112(%rsi),%xmm7 1123 leaq 128(%rsi),%rsi 1124 pxor 16(%rsp),%xmm6 1125 pxor %xmm13,%xmm11 1126 pxor %xmm5,%xmm2 1127 pxor %xmm1,%xmm7 1128 1129 movdqu %xmm6,64(%rdi) 1130 movdqu 0(%rsi),%xmm6 1131 movdqu %xmm11,80(%rdi) 1132 movdqu 16(%rsi),%xmm11 1133 movdqu %xmm2,96(%rdi) 1134 movdqu 32(%rsi),%xmm2 1135 movdqu %xmm7,112(%rdi) 1136 leaq 128(%rdi),%rdi 1137 movdqu 48(%rsi),%xmm7 1138 pxor 32(%rsp),%xmm6 1139 pxor %xmm10,%xmm11 1140 pxor %xmm14,%xmm2 1141 pxor %xmm8,%xmm7 1142 movdqu %xmm6,0(%rdi) 1143 movdqu %xmm11,16(%rdi) 1144 movdqu %xmm2,32(%rdi) 1145 movdqu %xmm7,48(%rdi) 1146 je .Ldone4x 1147 1148 movdqa 48(%rsp),%xmm6 1149 leaq 64(%rsi),%rsi 1150 xorq %r10,%r10 1151 movdqa %xmm6,0(%rsp) 1152 movdqa %xmm15,16(%rsp) 1153 leaq 64(%rdi),%rdi 1154 movdqa %xmm9,32(%rsp) 1155 subq $192,%rdx 1156 movdqa %xmm3,48(%rsp) 1157 1158.Loop_tail4x: 1159 movzbl (%rsi,%r10,1),%eax 1160 movzbl (%rsp,%r10,1),%ecx 1161 leaq 1(%r10),%r10 1162 xorl %ecx,%eax 1163 movb %al,-1(%rdi,%r10,1) 1164 decq %rdx 1165 jnz .Loop_tail4x 1166 1167.Ldone4x: 1168 leaq (%r9),%rsp 1169.cfi_def_cfa_register %rsp 1170.L4x_epilogue: 1171 .byte 0xf3,0xc3 1172.cfi_endproc 1173.size ChaCha20_4x,.-ChaCha20_4x 1174.type ChaCha20_4xop,@function 1175.align 32 1176ChaCha20_4xop: 1177.cfi_startproc 1178.LChaCha20_4xop: 1179 movq %rsp,%r9 1180.cfi_def_cfa_register %r9 1181 subq $0x140+8,%rsp 1182 vzeroupper 1183 1184 vmovdqa .Lsigma(%rip),%xmm11 1185 vmovdqu (%rcx),%xmm3 1186 vmovdqu 16(%rcx),%xmm15 1187 vmovdqu (%r8),%xmm7 1188 leaq 256(%rsp),%rcx 1189 1190 vpshufd $0x00,%xmm11,%xmm8 1191 vpshufd $0x55,%xmm11,%xmm9 1192 vmovdqa %xmm8,64(%rsp) 1193 vpshufd $0xaa,%xmm11,%xmm10 1194 vmovdqa %xmm9,80(%rsp) 1195 vpshufd $0xff,%xmm11,%xmm11 1196 vmovdqa %xmm10,96(%rsp) 1197 vmovdqa %xmm11,112(%rsp) 1198 1199 vpshufd $0x00,%xmm3,%xmm0 1200 vpshufd $0x55,%xmm3,%xmm1 1201 vmovdqa %xmm0,128-256(%rcx) 1202 vpshufd $0xaa,%xmm3,%xmm2 1203 vmovdqa %xmm1,144-256(%rcx) 1204 vpshufd $0xff,%xmm3,%xmm3 1205 vmovdqa %xmm2,160-256(%rcx) 1206 vmovdqa %xmm3,176-256(%rcx) 1207 1208 vpshufd $0x00,%xmm15,%xmm12 1209 vpshufd $0x55,%xmm15,%xmm13 1210 vmovdqa %xmm12,192-256(%rcx) 1211 vpshufd $0xaa,%xmm15,%xmm14 1212 vmovdqa %xmm13,208-256(%rcx) 1213 vpshufd $0xff,%xmm15,%xmm15 1214 vmovdqa %xmm14,224-256(%rcx) 1215 vmovdqa %xmm15,240-256(%rcx) 1216 1217 vpshufd $0x00,%xmm7,%xmm4 1218 vpshufd $0x55,%xmm7,%xmm5 1219 vpaddd .Linc(%rip),%xmm4,%xmm4 1220 vpshufd $0xaa,%xmm7,%xmm6 1221 vmovdqa %xmm5,272-256(%rcx) 1222 vpshufd $0xff,%xmm7,%xmm7 1223 vmovdqa %xmm6,288-256(%rcx) 1224 vmovdqa %xmm7,304-256(%rcx) 1225 1226 jmp .Loop_enter4xop 1227 1228.align 32 1229.Loop_outer4xop: 1230 vmovdqa 64(%rsp),%xmm8 1231 vmovdqa 80(%rsp),%xmm9 1232 vmovdqa 96(%rsp),%xmm10 1233 vmovdqa 112(%rsp),%xmm11 1234 vmovdqa 128-256(%rcx),%xmm0 1235 vmovdqa 144-256(%rcx),%xmm1 1236 vmovdqa 160-256(%rcx),%xmm2 1237 vmovdqa 176-256(%rcx),%xmm3 1238 vmovdqa 192-256(%rcx),%xmm12 1239 vmovdqa 208-256(%rcx),%xmm13 1240 vmovdqa 224-256(%rcx),%xmm14 1241 vmovdqa 240-256(%rcx),%xmm15 1242 vmovdqa 256-256(%rcx),%xmm4 1243 vmovdqa 272-256(%rcx),%xmm5 1244 vmovdqa 288-256(%rcx),%xmm6 1245 vmovdqa 304-256(%rcx),%xmm7 1246 vpaddd .Lfour(%rip),%xmm4,%xmm4 1247 1248.Loop_enter4xop: 1249 movl $10,%eax 1250 vmovdqa %xmm4,256-256(%rcx) 1251 jmp .Loop4xop 1252 1253.align 32 1254.Loop4xop: 1255 vpaddd %xmm0,%xmm8,%xmm8 1256 vpaddd %xmm1,%xmm9,%xmm9 1257 vpaddd %xmm2,%xmm10,%xmm10 1258 vpaddd %xmm3,%xmm11,%xmm11 1259 vpxor %xmm4,%xmm8,%xmm4 1260 vpxor %xmm5,%xmm9,%xmm5 1261 vpxor %xmm6,%xmm10,%xmm6 1262 vpxor %xmm7,%xmm11,%xmm7 1263.byte 143,232,120,194,228,16 1264.byte 143,232,120,194,237,16 1265.byte 143,232,120,194,246,16 1266.byte 143,232,120,194,255,16 1267 vpaddd %xmm4,%xmm12,%xmm12 1268 vpaddd %xmm5,%xmm13,%xmm13 1269 vpaddd %xmm6,%xmm14,%xmm14 1270 vpaddd %xmm7,%xmm15,%xmm15 1271 vpxor %xmm0,%xmm12,%xmm0 1272 vpxor %xmm1,%xmm13,%xmm1 1273 vpxor %xmm14,%xmm2,%xmm2 1274 vpxor %xmm15,%xmm3,%xmm3 1275.byte 143,232,120,194,192,12 1276.byte 143,232,120,194,201,12 1277.byte 143,232,120,194,210,12 1278.byte 143,232,120,194,219,12 1279 vpaddd %xmm8,%xmm0,%xmm8 1280 vpaddd %xmm9,%xmm1,%xmm9 1281 vpaddd %xmm2,%xmm10,%xmm10 1282 vpaddd %xmm3,%xmm11,%xmm11 1283 vpxor %xmm4,%xmm8,%xmm4 1284 vpxor %xmm5,%xmm9,%xmm5 1285 vpxor %xmm6,%xmm10,%xmm6 1286 vpxor %xmm7,%xmm11,%xmm7 1287.byte 143,232,120,194,228,8 1288.byte 143,232,120,194,237,8 1289.byte 143,232,120,194,246,8 1290.byte 143,232,120,194,255,8 1291 vpaddd %xmm4,%xmm12,%xmm12 1292 vpaddd %xmm5,%xmm13,%xmm13 1293 vpaddd %xmm6,%xmm14,%xmm14 1294 vpaddd %xmm7,%xmm15,%xmm15 1295 vpxor %xmm0,%xmm12,%xmm0 1296 vpxor %xmm1,%xmm13,%xmm1 1297 vpxor %xmm14,%xmm2,%xmm2 1298 vpxor %xmm15,%xmm3,%xmm3 1299.byte 143,232,120,194,192,7 1300.byte 143,232,120,194,201,7 1301.byte 143,232,120,194,210,7 1302.byte 143,232,120,194,219,7 1303 vpaddd %xmm1,%xmm8,%xmm8 1304 vpaddd %xmm2,%xmm9,%xmm9 1305 vpaddd %xmm3,%xmm10,%xmm10 1306 vpaddd %xmm0,%xmm11,%xmm11 1307 vpxor %xmm7,%xmm8,%xmm7 1308 vpxor %xmm4,%xmm9,%xmm4 1309 vpxor %xmm5,%xmm10,%xmm5 1310 vpxor %xmm6,%xmm11,%xmm6 1311.byte 143,232,120,194,255,16 1312.byte 143,232,120,194,228,16 1313.byte 143,232,120,194,237,16 1314.byte 143,232,120,194,246,16 1315 vpaddd %xmm7,%xmm14,%xmm14 1316 vpaddd %xmm4,%xmm15,%xmm15 1317 vpaddd %xmm5,%xmm12,%xmm12 1318 vpaddd %xmm6,%xmm13,%xmm13 1319 vpxor %xmm1,%xmm14,%xmm1 1320 vpxor %xmm2,%xmm15,%xmm2 1321 vpxor %xmm12,%xmm3,%xmm3 1322 vpxor %xmm13,%xmm0,%xmm0 1323.byte 143,232,120,194,201,12 1324.byte 143,232,120,194,210,12 1325.byte 143,232,120,194,219,12 1326.byte 143,232,120,194,192,12 1327 vpaddd %xmm8,%xmm1,%xmm8 1328 vpaddd %xmm9,%xmm2,%xmm9 1329 vpaddd %xmm3,%xmm10,%xmm10 1330 vpaddd %xmm0,%xmm11,%xmm11 1331 vpxor %xmm7,%xmm8,%xmm7 1332 vpxor %xmm4,%xmm9,%xmm4 1333 vpxor %xmm5,%xmm10,%xmm5 1334 vpxor %xmm6,%xmm11,%xmm6 1335.byte 143,232,120,194,255,8 1336.byte 143,232,120,194,228,8 1337.byte 143,232,120,194,237,8 1338.byte 143,232,120,194,246,8 1339 vpaddd %xmm7,%xmm14,%xmm14 1340 vpaddd %xmm4,%xmm15,%xmm15 1341 vpaddd %xmm5,%xmm12,%xmm12 1342 vpaddd %xmm6,%xmm13,%xmm13 1343 vpxor %xmm1,%xmm14,%xmm1 1344 vpxor %xmm2,%xmm15,%xmm2 1345 vpxor %xmm12,%xmm3,%xmm3 1346 vpxor %xmm13,%xmm0,%xmm0 1347.byte 143,232,120,194,201,7 1348.byte 143,232,120,194,210,7 1349.byte 143,232,120,194,219,7 1350.byte 143,232,120,194,192,7 1351 decl %eax 1352 jnz .Loop4xop 1353 1354 vpaddd 64(%rsp),%xmm8,%xmm8 1355 vpaddd 80(%rsp),%xmm9,%xmm9 1356 vpaddd 96(%rsp),%xmm10,%xmm10 1357 vpaddd 112(%rsp),%xmm11,%xmm11 1358 1359 vmovdqa %xmm14,32(%rsp) 1360 vmovdqa %xmm15,48(%rsp) 1361 1362 vpunpckldq %xmm9,%xmm8,%xmm14 1363 vpunpckldq %xmm11,%xmm10,%xmm15 1364 vpunpckhdq %xmm9,%xmm8,%xmm8 1365 vpunpckhdq %xmm11,%xmm10,%xmm10 1366 vpunpcklqdq %xmm15,%xmm14,%xmm9 1367 vpunpckhqdq %xmm15,%xmm14,%xmm14 1368 vpunpcklqdq %xmm10,%xmm8,%xmm11 1369 vpunpckhqdq %xmm10,%xmm8,%xmm8 1370 vpaddd 128-256(%rcx),%xmm0,%xmm0 1371 vpaddd 144-256(%rcx),%xmm1,%xmm1 1372 vpaddd 160-256(%rcx),%xmm2,%xmm2 1373 vpaddd 176-256(%rcx),%xmm3,%xmm3 1374 1375 vmovdqa %xmm9,0(%rsp) 1376 vmovdqa %xmm14,16(%rsp) 1377 vmovdqa 32(%rsp),%xmm9 1378 vmovdqa 48(%rsp),%xmm14 1379 1380 vpunpckldq %xmm1,%xmm0,%xmm10 1381 vpunpckldq %xmm3,%xmm2,%xmm15 1382 vpunpckhdq %xmm1,%xmm0,%xmm0 1383 vpunpckhdq %xmm3,%xmm2,%xmm2 1384 vpunpcklqdq %xmm15,%xmm10,%xmm1 1385 vpunpckhqdq %xmm15,%xmm10,%xmm10 1386 vpunpcklqdq %xmm2,%xmm0,%xmm3 1387 vpunpckhqdq %xmm2,%xmm0,%xmm0 1388 vpaddd 192-256(%rcx),%xmm12,%xmm12 1389 vpaddd 208-256(%rcx),%xmm13,%xmm13 1390 vpaddd 224-256(%rcx),%xmm9,%xmm9 1391 vpaddd 240-256(%rcx),%xmm14,%xmm14 1392 1393 vpunpckldq %xmm13,%xmm12,%xmm2 1394 vpunpckldq %xmm14,%xmm9,%xmm15 1395 vpunpckhdq %xmm13,%xmm12,%xmm12 1396 vpunpckhdq %xmm14,%xmm9,%xmm9 1397 vpunpcklqdq %xmm15,%xmm2,%xmm13 1398 vpunpckhqdq %xmm15,%xmm2,%xmm2 1399 vpunpcklqdq %xmm9,%xmm12,%xmm14 1400 vpunpckhqdq %xmm9,%xmm12,%xmm12 1401 vpaddd 256-256(%rcx),%xmm4,%xmm4 1402 vpaddd 272-256(%rcx),%xmm5,%xmm5 1403 vpaddd 288-256(%rcx),%xmm6,%xmm6 1404 vpaddd 304-256(%rcx),%xmm7,%xmm7 1405 1406 vpunpckldq %xmm5,%xmm4,%xmm9 1407 vpunpckldq %xmm7,%xmm6,%xmm15 1408 vpunpckhdq %xmm5,%xmm4,%xmm4 1409 vpunpckhdq %xmm7,%xmm6,%xmm6 1410 vpunpcklqdq %xmm15,%xmm9,%xmm5 1411 vpunpckhqdq %xmm15,%xmm9,%xmm9 1412 vpunpcklqdq %xmm6,%xmm4,%xmm7 1413 vpunpckhqdq %xmm6,%xmm4,%xmm4 1414 vmovdqa 0(%rsp),%xmm6 1415 vmovdqa 16(%rsp),%xmm15 1416 1417 cmpq $256,%rdx 1418 jb .Ltail4xop 1419 1420 vpxor 0(%rsi),%xmm6,%xmm6 1421 vpxor 16(%rsi),%xmm1,%xmm1 1422 vpxor 32(%rsi),%xmm13,%xmm13 1423 vpxor 48(%rsi),%xmm5,%xmm5 1424 vpxor 64(%rsi),%xmm15,%xmm15 1425 vpxor 80(%rsi),%xmm10,%xmm10 1426 vpxor 96(%rsi),%xmm2,%xmm2 1427 vpxor 112(%rsi),%xmm9,%xmm9 1428 leaq 128(%rsi),%rsi 1429 vpxor 0(%rsi),%xmm11,%xmm11 1430 vpxor 16(%rsi),%xmm3,%xmm3 1431 vpxor 32(%rsi),%xmm14,%xmm14 1432 vpxor 48(%rsi),%xmm7,%xmm7 1433 vpxor 64(%rsi),%xmm8,%xmm8 1434 vpxor 80(%rsi),%xmm0,%xmm0 1435 vpxor 96(%rsi),%xmm12,%xmm12 1436 vpxor 112(%rsi),%xmm4,%xmm4 1437 leaq 128(%rsi),%rsi 1438 1439 vmovdqu %xmm6,0(%rdi) 1440 vmovdqu %xmm1,16(%rdi) 1441 vmovdqu %xmm13,32(%rdi) 1442 vmovdqu %xmm5,48(%rdi) 1443 vmovdqu %xmm15,64(%rdi) 1444 vmovdqu %xmm10,80(%rdi) 1445 vmovdqu %xmm2,96(%rdi) 1446 vmovdqu %xmm9,112(%rdi) 1447 leaq 128(%rdi),%rdi 1448 vmovdqu %xmm11,0(%rdi) 1449 vmovdqu %xmm3,16(%rdi) 1450 vmovdqu %xmm14,32(%rdi) 1451 vmovdqu %xmm7,48(%rdi) 1452 vmovdqu %xmm8,64(%rdi) 1453 vmovdqu %xmm0,80(%rdi) 1454 vmovdqu %xmm12,96(%rdi) 1455 vmovdqu %xmm4,112(%rdi) 1456 leaq 128(%rdi),%rdi 1457 1458 subq $256,%rdx 1459 jnz .Loop_outer4xop 1460 1461 jmp .Ldone4xop 1462 1463.align 32 1464.Ltail4xop: 1465 cmpq $192,%rdx 1466 jae .L192_or_more4xop 1467 cmpq $128,%rdx 1468 jae .L128_or_more4xop 1469 cmpq $64,%rdx 1470 jae .L64_or_more4xop 1471 1472 xorq %r10,%r10 1473 vmovdqa %xmm6,0(%rsp) 1474 vmovdqa %xmm1,16(%rsp) 1475 vmovdqa %xmm13,32(%rsp) 1476 vmovdqa %xmm5,48(%rsp) 1477 jmp .Loop_tail4xop 1478 1479.align 32 1480.L64_or_more4xop: 1481 vpxor 0(%rsi),%xmm6,%xmm6 1482 vpxor 16(%rsi),%xmm1,%xmm1 1483 vpxor 32(%rsi),%xmm13,%xmm13 1484 vpxor 48(%rsi),%xmm5,%xmm5 1485 vmovdqu %xmm6,0(%rdi) 1486 vmovdqu %xmm1,16(%rdi) 1487 vmovdqu %xmm13,32(%rdi) 1488 vmovdqu %xmm5,48(%rdi) 1489 je .Ldone4xop 1490 1491 leaq 64(%rsi),%rsi 1492 vmovdqa %xmm15,0(%rsp) 1493 xorq %r10,%r10 1494 vmovdqa %xmm10,16(%rsp) 1495 leaq 64(%rdi),%rdi 1496 vmovdqa %xmm2,32(%rsp) 1497 subq $64,%rdx 1498 vmovdqa %xmm9,48(%rsp) 1499 jmp .Loop_tail4xop 1500 1501.align 32 1502.L128_or_more4xop: 1503 vpxor 0(%rsi),%xmm6,%xmm6 1504 vpxor 16(%rsi),%xmm1,%xmm1 1505 vpxor 32(%rsi),%xmm13,%xmm13 1506 vpxor 48(%rsi),%xmm5,%xmm5 1507 vpxor 64(%rsi),%xmm15,%xmm15 1508 vpxor 80(%rsi),%xmm10,%xmm10 1509 vpxor 96(%rsi),%xmm2,%xmm2 1510 vpxor 112(%rsi),%xmm9,%xmm9 1511 1512 vmovdqu %xmm6,0(%rdi) 1513 vmovdqu %xmm1,16(%rdi) 1514 vmovdqu %xmm13,32(%rdi) 1515 vmovdqu %xmm5,48(%rdi) 1516 vmovdqu %xmm15,64(%rdi) 1517 vmovdqu %xmm10,80(%rdi) 1518 vmovdqu %xmm2,96(%rdi) 1519 vmovdqu %xmm9,112(%rdi) 1520 je .Ldone4xop 1521 1522 leaq 128(%rsi),%rsi 1523 vmovdqa %xmm11,0(%rsp) 1524 xorq %r10,%r10 1525 vmovdqa %xmm3,16(%rsp) 1526 leaq 128(%rdi),%rdi 1527 vmovdqa %xmm14,32(%rsp) 1528 subq $128,%rdx 1529 vmovdqa %xmm7,48(%rsp) 1530 jmp .Loop_tail4xop 1531 1532.align 32 1533.L192_or_more4xop: 1534 vpxor 0(%rsi),%xmm6,%xmm6 1535 vpxor 16(%rsi),%xmm1,%xmm1 1536 vpxor 32(%rsi),%xmm13,%xmm13 1537 vpxor 48(%rsi),%xmm5,%xmm5 1538 vpxor 64(%rsi),%xmm15,%xmm15 1539 vpxor 80(%rsi),%xmm10,%xmm10 1540 vpxor 96(%rsi),%xmm2,%xmm2 1541 vpxor 112(%rsi),%xmm9,%xmm9 1542 leaq 128(%rsi),%rsi 1543 vpxor 0(%rsi),%xmm11,%xmm11 1544 vpxor 16(%rsi),%xmm3,%xmm3 1545 vpxor 32(%rsi),%xmm14,%xmm14 1546 vpxor 48(%rsi),%xmm7,%xmm7 1547 1548 vmovdqu %xmm6,0(%rdi) 1549 vmovdqu %xmm1,16(%rdi) 1550 vmovdqu %xmm13,32(%rdi) 1551 vmovdqu %xmm5,48(%rdi) 1552 vmovdqu %xmm15,64(%rdi) 1553 vmovdqu %xmm10,80(%rdi) 1554 vmovdqu %xmm2,96(%rdi) 1555 vmovdqu %xmm9,112(%rdi) 1556 leaq 128(%rdi),%rdi 1557 vmovdqu %xmm11,0(%rdi) 1558 vmovdqu %xmm3,16(%rdi) 1559 vmovdqu %xmm14,32(%rdi) 1560 vmovdqu %xmm7,48(%rdi) 1561 je .Ldone4xop 1562 1563 leaq 64(%rsi),%rsi 1564 vmovdqa %xmm8,0(%rsp) 1565 xorq %r10,%r10 1566 vmovdqa %xmm0,16(%rsp) 1567 leaq 64(%rdi),%rdi 1568 vmovdqa %xmm12,32(%rsp) 1569 subq $192,%rdx 1570 vmovdqa %xmm4,48(%rsp) 1571 1572.Loop_tail4xop: 1573 movzbl (%rsi,%r10,1),%eax 1574 movzbl (%rsp,%r10,1),%ecx 1575 leaq 1(%r10),%r10 1576 xorl %ecx,%eax 1577 movb %al,-1(%rdi,%r10,1) 1578 decq %rdx 1579 jnz .Loop_tail4xop 1580 1581.Ldone4xop: 1582 vzeroupper 1583 leaq (%r9),%rsp 1584.cfi_def_cfa_register %rsp 1585.L4xop_epilogue: 1586 .byte 0xf3,0xc3 1587.cfi_endproc 1588.size ChaCha20_4xop,.-ChaCha20_4xop 1589.type ChaCha20_8x,@function 1590.align 32 1591ChaCha20_8x: 1592.cfi_startproc 1593.LChaCha20_8x: 1594 movq %rsp,%r9 1595.cfi_def_cfa_register %r9 1596 subq $0x280+8,%rsp 1597 andq $-32,%rsp 1598 vzeroupper 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 vbroadcasti128 .Lsigma(%rip),%ymm11 1610 vbroadcasti128 (%rcx),%ymm3 1611 vbroadcasti128 16(%rcx),%ymm15 1612 vbroadcasti128 (%r8),%ymm7 1613 leaq 256(%rsp),%rcx 1614 leaq 512(%rsp),%rax 1615 leaq .Lrot16(%rip),%r10 1616 leaq .Lrot24(%rip),%r11 1617 1618 vpshufd $0x00,%ymm11,%ymm8 1619 vpshufd $0x55,%ymm11,%ymm9 1620 vmovdqa %ymm8,128-256(%rcx) 1621 vpshufd $0xaa,%ymm11,%ymm10 1622 vmovdqa %ymm9,160-256(%rcx) 1623 vpshufd $0xff,%ymm11,%ymm11 1624 vmovdqa %ymm10,192-256(%rcx) 1625 vmovdqa %ymm11,224-256(%rcx) 1626 1627 vpshufd $0x00,%ymm3,%ymm0 1628 vpshufd $0x55,%ymm3,%ymm1 1629 vmovdqa %ymm0,256-256(%rcx) 1630 vpshufd $0xaa,%ymm3,%ymm2 1631 vmovdqa %ymm1,288-256(%rcx) 1632 vpshufd $0xff,%ymm3,%ymm3 1633 vmovdqa %ymm2,320-256(%rcx) 1634 vmovdqa %ymm3,352-256(%rcx) 1635 1636 vpshufd $0x00,%ymm15,%ymm12 1637 vpshufd $0x55,%ymm15,%ymm13 1638 vmovdqa %ymm12,384-512(%rax) 1639 vpshufd $0xaa,%ymm15,%ymm14 1640 vmovdqa %ymm13,416-512(%rax) 1641 vpshufd $0xff,%ymm15,%ymm15 1642 vmovdqa %ymm14,448-512(%rax) 1643 vmovdqa %ymm15,480-512(%rax) 1644 1645 vpshufd $0x00,%ymm7,%ymm4 1646 vpshufd $0x55,%ymm7,%ymm5 1647 vpaddd .Lincy(%rip),%ymm4,%ymm4 1648 vpshufd $0xaa,%ymm7,%ymm6 1649 vmovdqa %ymm5,544-512(%rax) 1650 vpshufd $0xff,%ymm7,%ymm7 1651 vmovdqa %ymm6,576-512(%rax) 1652 vmovdqa %ymm7,608-512(%rax) 1653 1654 jmp .Loop_enter8x 1655 1656.align 32 1657.Loop_outer8x: 1658 vmovdqa 128-256(%rcx),%ymm8 1659 vmovdqa 160-256(%rcx),%ymm9 1660 vmovdqa 192-256(%rcx),%ymm10 1661 vmovdqa 224-256(%rcx),%ymm11 1662 vmovdqa 256-256(%rcx),%ymm0 1663 vmovdqa 288-256(%rcx),%ymm1 1664 vmovdqa 320-256(%rcx),%ymm2 1665 vmovdqa 352-256(%rcx),%ymm3 1666 vmovdqa 384-512(%rax),%ymm12 1667 vmovdqa 416-512(%rax),%ymm13 1668 vmovdqa 448-512(%rax),%ymm14 1669 vmovdqa 480-512(%rax),%ymm15 1670 vmovdqa 512-512(%rax),%ymm4 1671 vmovdqa 544-512(%rax),%ymm5 1672 vmovdqa 576-512(%rax),%ymm6 1673 vmovdqa 608-512(%rax),%ymm7 1674 vpaddd .Leight(%rip),%ymm4,%ymm4 1675 1676.Loop_enter8x: 1677 vmovdqa %ymm14,64(%rsp) 1678 vmovdqa %ymm15,96(%rsp) 1679 vbroadcasti128 (%r10),%ymm15 1680 vmovdqa %ymm4,512-512(%rax) 1681 movl $10,%eax 1682 jmp .Loop8x 1683 1684.align 32 1685.Loop8x: 1686 vpaddd %ymm0,%ymm8,%ymm8 1687 vpxor %ymm4,%ymm8,%ymm4 1688 vpshufb %ymm15,%ymm4,%ymm4 1689 vpaddd %ymm1,%ymm9,%ymm9 1690 vpxor %ymm5,%ymm9,%ymm5 1691 vpshufb %ymm15,%ymm5,%ymm5 1692 vpaddd %ymm4,%ymm12,%ymm12 1693 vpxor %ymm0,%ymm12,%ymm0 1694 vpslld $12,%ymm0,%ymm14 1695 vpsrld $20,%ymm0,%ymm0 1696 vpor %ymm0,%ymm14,%ymm0 1697 vbroadcasti128 (%r11),%ymm14 1698 vpaddd %ymm5,%ymm13,%ymm13 1699 vpxor %ymm1,%ymm13,%ymm1 1700 vpslld $12,%ymm1,%ymm15 1701 vpsrld $20,%ymm1,%ymm1 1702 vpor %ymm1,%ymm15,%ymm1 1703 vpaddd %ymm0,%ymm8,%ymm8 1704 vpxor %ymm4,%ymm8,%ymm4 1705 vpshufb %ymm14,%ymm4,%ymm4 1706 vpaddd %ymm1,%ymm9,%ymm9 1707 vpxor %ymm5,%ymm9,%ymm5 1708 vpshufb %ymm14,%ymm5,%ymm5 1709 vpaddd %ymm4,%ymm12,%ymm12 1710 vpxor %ymm0,%ymm12,%ymm0 1711 vpslld $7,%ymm0,%ymm15 1712 vpsrld $25,%ymm0,%ymm0 1713 vpor %ymm0,%ymm15,%ymm0 1714 vbroadcasti128 (%r10),%ymm15 1715 vpaddd %ymm5,%ymm13,%ymm13 1716 vpxor %ymm1,%ymm13,%ymm1 1717 vpslld $7,%ymm1,%ymm14 1718 vpsrld $25,%ymm1,%ymm1 1719 vpor %ymm1,%ymm14,%ymm1 1720 vmovdqa %ymm12,0(%rsp) 1721 vmovdqa %ymm13,32(%rsp) 1722 vmovdqa 64(%rsp),%ymm12 1723 vmovdqa 96(%rsp),%ymm13 1724 vpaddd %ymm2,%ymm10,%ymm10 1725 vpxor %ymm6,%ymm10,%ymm6 1726 vpshufb %ymm15,%ymm6,%ymm6 1727 vpaddd %ymm3,%ymm11,%ymm11 1728 vpxor %ymm7,%ymm11,%ymm7 1729 vpshufb %ymm15,%ymm7,%ymm7 1730 vpaddd %ymm6,%ymm12,%ymm12 1731 vpxor %ymm2,%ymm12,%ymm2 1732 vpslld $12,%ymm2,%ymm14 1733 vpsrld $20,%ymm2,%ymm2 1734 vpor %ymm2,%ymm14,%ymm2 1735 vbroadcasti128 (%r11),%ymm14 1736 vpaddd %ymm7,%ymm13,%ymm13 1737 vpxor %ymm3,%ymm13,%ymm3 1738 vpslld $12,%ymm3,%ymm15 1739 vpsrld $20,%ymm3,%ymm3 1740 vpor %ymm3,%ymm15,%ymm3 1741 vpaddd %ymm2,%ymm10,%ymm10 1742 vpxor %ymm6,%ymm10,%ymm6 1743 vpshufb %ymm14,%ymm6,%ymm6 1744 vpaddd %ymm3,%ymm11,%ymm11 1745 vpxor %ymm7,%ymm11,%ymm7 1746 vpshufb %ymm14,%ymm7,%ymm7 1747 vpaddd %ymm6,%ymm12,%ymm12 1748 vpxor %ymm2,%ymm12,%ymm2 1749 vpslld $7,%ymm2,%ymm15 1750 vpsrld $25,%ymm2,%ymm2 1751 vpor %ymm2,%ymm15,%ymm2 1752 vbroadcasti128 (%r10),%ymm15 1753 vpaddd %ymm7,%ymm13,%ymm13 1754 vpxor %ymm3,%ymm13,%ymm3 1755 vpslld $7,%ymm3,%ymm14 1756 vpsrld $25,%ymm3,%ymm3 1757 vpor %ymm3,%ymm14,%ymm3 1758 vpaddd %ymm1,%ymm8,%ymm8 1759 vpxor %ymm7,%ymm8,%ymm7 1760 vpshufb %ymm15,%ymm7,%ymm7 1761 vpaddd %ymm2,%ymm9,%ymm9 1762 vpxor %ymm4,%ymm9,%ymm4 1763 vpshufb %ymm15,%ymm4,%ymm4 1764 vpaddd %ymm7,%ymm12,%ymm12 1765 vpxor %ymm1,%ymm12,%ymm1 1766 vpslld $12,%ymm1,%ymm14 1767 vpsrld $20,%ymm1,%ymm1 1768 vpor %ymm1,%ymm14,%ymm1 1769 vbroadcasti128 (%r11),%ymm14 1770 vpaddd %ymm4,%ymm13,%ymm13 1771 vpxor %ymm2,%ymm13,%ymm2 1772 vpslld $12,%ymm2,%ymm15 1773 vpsrld $20,%ymm2,%ymm2 1774 vpor %ymm2,%ymm15,%ymm2 1775 vpaddd %ymm1,%ymm8,%ymm8 1776 vpxor %ymm7,%ymm8,%ymm7 1777 vpshufb %ymm14,%ymm7,%ymm7 1778 vpaddd %ymm2,%ymm9,%ymm9 1779 vpxor %ymm4,%ymm9,%ymm4 1780 vpshufb %ymm14,%ymm4,%ymm4 1781 vpaddd %ymm7,%ymm12,%ymm12 1782 vpxor %ymm1,%ymm12,%ymm1 1783 vpslld $7,%ymm1,%ymm15 1784 vpsrld $25,%ymm1,%ymm1 1785 vpor %ymm1,%ymm15,%ymm1 1786 vbroadcasti128 (%r10),%ymm15 1787 vpaddd %ymm4,%ymm13,%ymm13 1788 vpxor %ymm2,%ymm13,%ymm2 1789 vpslld $7,%ymm2,%ymm14 1790 vpsrld $25,%ymm2,%ymm2 1791 vpor %ymm2,%ymm14,%ymm2 1792 vmovdqa %ymm12,64(%rsp) 1793 vmovdqa %ymm13,96(%rsp) 1794 vmovdqa 0(%rsp),%ymm12 1795 vmovdqa 32(%rsp),%ymm13 1796 vpaddd %ymm3,%ymm10,%ymm10 1797 vpxor %ymm5,%ymm10,%ymm5 1798 vpshufb %ymm15,%ymm5,%ymm5 1799 vpaddd %ymm0,%ymm11,%ymm11 1800 vpxor %ymm6,%ymm11,%ymm6 1801 vpshufb %ymm15,%ymm6,%ymm6 1802 vpaddd %ymm5,%ymm12,%ymm12 1803 vpxor %ymm3,%ymm12,%ymm3 1804 vpslld $12,%ymm3,%ymm14 1805 vpsrld $20,%ymm3,%ymm3 1806 vpor %ymm3,%ymm14,%ymm3 1807 vbroadcasti128 (%r11),%ymm14 1808 vpaddd %ymm6,%ymm13,%ymm13 1809 vpxor %ymm0,%ymm13,%ymm0 1810 vpslld $12,%ymm0,%ymm15 1811 vpsrld $20,%ymm0,%ymm0 1812 vpor %ymm0,%ymm15,%ymm0 1813 vpaddd %ymm3,%ymm10,%ymm10 1814 vpxor %ymm5,%ymm10,%ymm5 1815 vpshufb %ymm14,%ymm5,%ymm5 1816 vpaddd %ymm0,%ymm11,%ymm11 1817 vpxor %ymm6,%ymm11,%ymm6 1818 vpshufb %ymm14,%ymm6,%ymm6 1819 vpaddd %ymm5,%ymm12,%ymm12 1820 vpxor %ymm3,%ymm12,%ymm3 1821 vpslld $7,%ymm3,%ymm15 1822 vpsrld $25,%ymm3,%ymm3 1823 vpor %ymm3,%ymm15,%ymm3 1824 vbroadcasti128 (%r10),%ymm15 1825 vpaddd %ymm6,%ymm13,%ymm13 1826 vpxor %ymm0,%ymm13,%ymm0 1827 vpslld $7,%ymm0,%ymm14 1828 vpsrld $25,%ymm0,%ymm0 1829 vpor %ymm0,%ymm14,%ymm0 1830 decl %eax 1831 jnz .Loop8x 1832 1833 leaq 512(%rsp),%rax 1834 vpaddd 128-256(%rcx),%ymm8,%ymm8 1835 vpaddd 160-256(%rcx),%ymm9,%ymm9 1836 vpaddd 192-256(%rcx),%ymm10,%ymm10 1837 vpaddd 224-256(%rcx),%ymm11,%ymm11 1838 1839 vpunpckldq %ymm9,%ymm8,%ymm14 1840 vpunpckldq %ymm11,%ymm10,%ymm15 1841 vpunpckhdq %ymm9,%ymm8,%ymm8 1842 vpunpckhdq %ymm11,%ymm10,%ymm10 1843 vpunpcklqdq %ymm15,%ymm14,%ymm9 1844 vpunpckhqdq %ymm15,%ymm14,%ymm14 1845 vpunpcklqdq %ymm10,%ymm8,%ymm11 1846 vpunpckhqdq %ymm10,%ymm8,%ymm8 1847 vpaddd 256-256(%rcx),%ymm0,%ymm0 1848 vpaddd 288-256(%rcx),%ymm1,%ymm1 1849 vpaddd 320-256(%rcx),%ymm2,%ymm2 1850 vpaddd 352-256(%rcx),%ymm3,%ymm3 1851 1852 vpunpckldq %ymm1,%ymm0,%ymm10 1853 vpunpckldq %ymm3,%ymm2,%ymm15 1854 vpunpckhdq %ymm1,%ymm0,%ymm0 1855 vpunpckhdq %ymm3,%ymm2,%ymm2 1856 vpunpcklqdq %ymm15,%ymm10,%ymm1 1857 vpunpckhqdq %ymm15,%ymm10,%ymm10 1858 vpunpcklqdq %ymm2,%ymm0,%ymm3 1859 vpunpckhqdq %ymm2,%ymm0,%ymm0 1860 vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 1861 vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 1862 vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 1863 vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 1864 vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 1865 vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 1866 vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 1867 vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 1868 vmovdqa %ymm15,0(%rsp) 1869 vmovdqa %ymm9,32(%rsp) 1870 vmovdqa 64(%rsp),%ymm15 1871 vmovdqa 96(%rsp),%ymm9 1872 1873 vpaddd 384-512(%rax),%ymm12,%ymm12 1874 vpaddd 416-512(%rax),%ymm13,%ymm13 1875 vpaddd 448-512(%rax),%ymm15,%ymm15 1876 vpaddd 480-512(%rax),%ymm9,%ymm9 1877 1878 vpunpckldq %ymm13,%ymm12,%ymm2 1879 vpunpckldq %ymm9,%ymm15,%ymm8 1880 vpunpckhdq %ymm13,%ymm12,%ymm12 1881 vpunpckhdq %ymm9,%ymm15,%ymm15 1882 vpunpcklqdq %ymm8,%ymm2,%ymm13 1883 vpunpckhqdq %ymm8,%ymm2,%ymm2 1884 vpunpcklqdq %ymm15,%ymm12,%ymm9 1885 vpunpckhqdq %ymm15,%ymm12,%ymm12 1886 vpaddd 512-512(%rax),%ymm4,%ymm4 1887 vpaddd 544-512(%rax),%ymm5,%ymm5 1888 vpaddd 576-512(%rax),%ymm6,%ymm6 1889 vpaddd 608-512(%rax),%ymm7,%ymm7 1890 1891 vpunpckldq %ymm5,%ymm4,%ymm15 1892 vpunpckldq %ymm7,%ymm6,%ymm8 1893 vpunpckhdq %ymm5,%ymm4,%ymm4 1894 vpunpckhdq %ymm7,%ymm6,%ymm6 1895 vpunpcklqdq %ymm8,%ymm15,%ymm5 1896 vpunpckhqdq %ymm8,%ymm15,%ymm15 1897 vpunpcklqdq %ymm6,%ymm4,%ymm7 1898 vpunpckhqdq %ymm6,%ymm4,%ymm4 1899 vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 1900 vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 1901 vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 1902 vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 1903 vperm2i128 $0x20,%ymm7,%ymm9,%ymm2 1904 vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 1905 vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 1906 vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 1907 vmovdqa 0(%rsp),%ymm6 1908 vmovdqa 32(%rsp),%ymm12 1909 1910 cmpq $512,%rdx 1911 jb .Ltail8x 1912 1913 vpxor 0(%rsi),%ymm6,%ymm6 1914 vpxor 32(%rsi),%ymm8,%ymm8 1915 vpxor 64(%rsi),%ymm1,%ymm1 1916 vpxor 96(%rsi),%ymm5,%ymm5 1917 leaq 128(%rsi),%rsi 1918 vmovdqu %ymm6,0(%rdi) 1919 vmovdqu %ymm8,32(%rdi) 1920 vmovdqu %ymm1,64(%rdi) 1921 vmovdqu %ymm5,96(%rdi) 1922 leaq 128(%rdi),%rdi 1923 1924 vpxor 0(%rsi),%ymm12,%ymm12 1925 vpxor 32(%rsi),%ymm13,%ymm13 1926 vpxor 64(%rsi),%ymm10,%ymm10 1927 vpxor 96(%rsi),%ymm15,%ymm15 1928 leaq 128(%rsi),%rsi 1929 vmovdqu %ymm12,0(%rdi) 1930 vmovdqu %ymm13,32(%rdi) 1931 vmovdqu %ymm10,64(%rdi) 1932 vmovdqu %ymm15,96(%rdi) 1933 leaq 128(%rdi),%rdi 1934 1935 vpxor 0(%rsi),%ymm14,%ymm14 1936 vpxor 32(%rsi),%ymm2,%ymm2 1937 vpxor 64(%rsi),%ymm3,%ymm3 1938 vpxor 96(%rsi),%ymm7,%ymm7 1939 leaq 128(%rsi),%rsi 1940 vmovdqu %ymm14,0(%rdi) 1941 vmovdqu %ymm2,32(%rdi) 1942 vmovdqu %ymm3,64(%rdi) 1943 vmovdqu %ymm7,96(%rdi) 1944 leaq 128(%rdi),%rdi 1945 1946 vpxor 0(%rsi),%ymm11,%ymm11 1947 vpxor 32(%rsi),%ymm9,%ymm9 1948 vpxor 64(%rsi),%ymm0,%ymm0 1949 vpxor 96(%rsi),%ymm4,%ymm4 1950 leaq 128(%rsi),%rsi 1951 vmovdqu %ymm11,0(%rdi) 1952 vmovdqu %ymm9,32(%rdi) 1953 vmovdqu %ymm0,64(%rdi) 1954 vmovdqu %ymm4,96(%rdi) 1955 leaq 128(%rdi),%rdi 1956 1957 subq $512,%rdx 1958 jnz .Loop_outer8x 1959 1960 jmp .Ldone8x 1961 1962.Ltail8x: 1963 cmpq $448,%rdx 1964 jae .L448_or_more8x 1965 cmpq $384,%rdx 1966 jae .L384_or_more8x 1967 cmpq $320,%rdx 1968 jae .L320_or_more8x 1969 cmpq $256,%rdx 1970 jae .L256_or_more8x 1971 cmpq $192,%rdx 1972 jae .L192_or_more8x 1973 cmpq $128,%rdx 1974 jae .L128_or_more8x 1975 cmpq $64,%rdx 1976 jae .L64_or_more8x 1977 1978 xorq %r10,%r10 1979 vmovdqa %ymm6,0(%rsp) 1980 vmovdqa %ymm8,32(%rsp) 1981 jmp .Loop_tail8x 1982 1983.align 32 1984.L64_or_more8x: 1985 vpxor 0(%rsi),%ymm6,%ymm6 1986 vpxor 32(%rsi),%ymm8,%ymm8 1987 vmovdqu %ymm6,0(%rdi) 1988 vmovdqu %ymm8,32(%rdi) 1989 je .Ldone8x 1990 1991 leaq 64(%rsi),%rsi 1992 xorq %r10,%r10 1993 vmovdqa %ymm1,0(%rsp) 1994 leaq 64(%rdi),%rdi 1995 subq $64,%rdx 1996 vmovdqa %ymm5,32(%rsp) 1997 jmp .Loop_tail8x 1998 1999.align 32 2000.L128_or_more8x: 2001 vpxor 0(%rsi),%ymm6,%ymm6 2002 vpxor 32(%rsi),%ymm8,%ymm8 2003 vpxor 64(%rsi),%ymm1,%ymm1 2004 vpxor 96(%rsi),%ymm5,%ymm5 2005 vmovdqu %ymm6,0(%rdi) 2006 vmovdqu %ymm8,32(%rdi) 2007 vmovdqu %ymm1,64(%rdi) 2008 vmovdqu %ymm5,96(%rdi) 2009 je .Ldone8x 2010 2011 leaq 128(%rsi),%rsi 2012 xorq %r10,%r10 2013 vmovdqa %ymm12,0(%rsp) 2014 leaq 128(%rdi),%rdi 2015 subq $128,%rdx 2016 vmovdqa %ymm13,32(%rsp) 2017 jmp .Loop_tail8x 2018 2019.align 32 2020.L192_or_more8x: 2021 vpxor 0(%rsi),%ymm6,%ymm6 2022 vpxor 32(%rsi),%ymm8,%ymm8 2023 vpxor 64(%rsi),%ymm1,%ymm1 2024 vpxor 96(%rsi),%ymm5,%ymm5 2025 vpxor 128(%rsi),%ymm12,%ymm12 2026 vpxor 160(%rsi),%ymm13,%ymm13 2027 vmovdqu %ymm6,0(%rdi) 2028 vmovdqu %ymm8,32(%rdi) 2029 vmovdqu %ymm1,64(%rdi) 2030 vmovdqu %ymm5,96(%rdi) 2031 vmovdqu %ymm12,128(%rdi) 2032 vmovdqu %ymm13,160(%rdi) 2033 je .Ldone8x 2034 2035 leaq 192(%rsi),%rsi 2036 xorq %r10,%r10 2037 vmovdqa %ymm10,0(%rsp) 2038 leaq 192(%rdi),%rdi 2039 subq $192,%rdx 2040 vmovdqa %ymm15,32(%rsp) 2041 jmp .Loop_tail8x 2042 2043.align 32 2044.L256_or_more8x: 2045 vpxor 0(%rsi),%ymm6,%ymm6 2046 vpxor 32(%rsi),%ymm8,%ymm8 2047 vpxor 64(%rsi),%ymm1,%ymm1 2048 vpxor 96(%rsi),%ymm5,%ymm5 2049 vpxor 128(%rsi),%ymm12,%ymm12 2050 vpxor 160(%rsi),%ymm13,%ymm13 2051 vpxor 192(%rsi),%ymm10,%ymm10 2052 vpxor 224(%rsi),%ymm15,%ymm15 2053 vmovdqu %ymm6,0(%rdi) 2054 vmovdqu %ymm8,32(%rdi) 2055 vmovdqu %ymm1,64(%rdi) 2056 vmovdqu %ymm5,96(%rdi) 2057 vmovdqu %ymm12,128(%rdi) 2058 vmovdqu %ymm13,160(%rdi) 2059 vmovdqu %ymm10,192(%rdi) 2060 vmovdqu %ymm15,224(%rdi) 2061 je .Ldone8x 2062 2063 leaq 256(%rsi),%rsi 2064 xorq %r10,%r10 2065 vmovdqa %ymm14,0(%rsp) 2066 leaq 256(%rdi),%rdi 2067 subq $256,%rdx 2068 vmovdqa %ymm2,32(%rsp) 2069 jmp .Loop_tail8x 2070 2071.align 32 2072.L320_or_more8x: 2073 vpxor 0(%rsi),%ymm6,%ymm6 2074 vpxor 32(%rsi),%ymm8,%ymm8 2075 vpxor 64(%rsi),%ymm1,%ymm1 2076 vpxor 96(%rsi),%ymm5,%ymm5 2077 vpxor 128(%rsi),%ymm12,%ymm12 2078 vpxor 160(%rsi),%ymm13,%ymm13 2079 vpxor 192(%rsi),%ymm10,%ymm10 2080 vpxor 224(%rsi),%ymm15,%ymm15 2081 vpxor 256(%rsi),%ymm14,%ymm14 2082 vpxor 288(%rsi),%ymm2,%ymm2 2083 vmovdqu %ymm6,0(%rdi) 2084 vmovdqu %ymm8,32(%rdi) 2085 vmovdqu %ymm1,64(%rdi) 2086 vmovdqu %ymm5,96(%rdi) 2087 vmovdqu %ymm12,128(%rdi) 2088 vmovdqu %ymm13,160(%rdi) 2089 vmovdqu %ymm10,192(%rdi) 2090 vmovdqu %ymm15,224(%rdi) 2091 vmovdqu %ymm14,256(%rdi) 2092 vmovdqu %ymm2,288(%rdi) 2093 je .Ldone8x 2094 2095 leaq 320(%rsi),%rsi 2096 xorq %r10,%r10 2097 vmovdqa %ymm3,0(%rsp) 2098 leaq 320(%rdi),%rdi 2099 subq $320,%rdx 2100 vmovdqa %ymm7,32(%rsp) 2101 jmp .Loop_tail8x 2102 2103.align 32 2104.L384_or_more8x: 2105 vpxor 0(%rsi),%ymm6,%ymm6 2106 vpxor 32(%rsi),%ymm8,%ymm8 2107 vpxor 64(%rsi),%ymm1,%ymm1 2108 vpxor 96(%rsi),%ymm5,%ymm5 2109 vpxor 128(%rsi),%ymm12,%ymm12 2110 vpxor 160(%rsi),%ymm13,%ymm13 2111 vpxor 192(%rsi),%ymm10,%ymm10 2112 vpxor 224(%rsi),%ymm15,%ymm15 2113 vpxor 256(%rsi),%ymm14,%ymm14 2114 vpxor 288(%rsi),%ymm2,%ymm2 2115 vpxor 320(%rsi),%ymm3,%ymm3 2116 vpxor 352(%rsi),%ymm7,%ymm7 2117 vmovdqu %ymm6,0(%rdi) 2118 vmovdqu %ymm8,32(%rdi) 2119 vmovdqu %ymm1,64(%rdi) 2120 vmovdqu %ymm5,96(%rdi) 2121 vmovdqu %ymm12,128(%rdi) 2122 vmovdqu %ymm13,160(%rdi) 2123 vmovdqu %ymm10,192(%rdi) 2124 vmovdqu %ymm15,224(%rdi) 2125 vmovdqu %ymm14,256(%rdi) 2126 vmovdqu %ymm2,288(%rdi) 2127 vmovdqu %ymm3,320(%rdi) 2128 vmovdqu %ymm7,352(%rdi) 2129 je .Ldone8x 2130 2131 leaq 384(%rsi),%rsi 2132 xorq %r10,%r10 2133 vmovdqa %ymm11,0(%rsp) 2134 leaq 384(%rdi),%rdi 2135 subq $384,%rdx 2136 vmovdqa %ymm9,32(%rsp) 2137 jmp .Loop_tail8x 2138 2139.align 32 2140.L448_or_more8x: 2141 vpxor 0(%rsi),%ymm6,%ymm6 2142 vpxor 32(%rsi),%ymm8,%ymm8 2143 vpxor 64(%rsi),%ymm1,%ymm1 2144 vpxor 96(%rsi),%ymm5,%ymm5 2145 vpxor 128(%rsi),%ymm12,%ymm12 2146 vpxor 160(%rsi),%ymm13,%ymm13 2147 vpxor 192(%rsi),%ymm10,%ymm10 2148 vpxor 224(%rsi),%ymm15,%ymm15 2149 vpxor 256(%rsi),%ymm14,%ymm14 2150 vpxor 288(%rsi),%ymm2,%ymm2 2151 vpxor 320(%rsi),%ymm3,%ymm3 2152 vpxor 352(%rsi),%ymm7,%ymm7 2153 vpxor 384(%rsi),%ymm11,%ymm11 2154 vpxor 416(%rsi),%ymm9,%ymm9 2155 vmovdqu %ymm6,0(%rdi) 2156 vmovdqu %ymm8,32(%rdi) 2157 vmovdqu %ymm1,64(%rdi) 2158 vmovdqu %ymm5,96(%rdi) 2159 vmovdqu %ymm12,128(%rdi) 2160 vmovdqu %ymm13,160(%rdi) 2161 vmovdqu %ymm10,192(%rdi) 2162 vmovdqu %ymm15,224(%rdi) 2163 vmovdqu %ymm14,256(%rdi) 2164 vmovdqu %ymm2,288(%rdi) 2165 vmovdqu %ymm3,320(%rdi) 2166 vmovdqu %ymm7,352(%rdi) 2167 vmovdqu %ymm11,384(%rdi) 2168 vmovdqu %ymm9,416(%rdi) 2169 je .Ldone8x 2170 2171 leaq 448(%rsi),%rsi 2172 xorq %r10,%r10 2173 vmovdqa %ymm0,0(%rsp) 2174 leaq 448(%rdi),%rdi 2175 subq $448,%rdx 2176 vmovdqa %ymm4,32(%rsp) 2177 2178.Loop_tail8x: 2179 movzbl (%rsi,%r10,1),%eax 2180 movzbl (%rsp,%r10,1),%ecx 2181 leaq 1(%r10),%r10 2182 xorl %ecx,%eax 2183 movb %al,-1(%rdi,%r10,1) 2184 decq %rdx 2185 jnz .Loop_tail8x 2186 2187.Ldone8x: 2188 vzeroall 2189 leaq (%r9),%rsp 2190.cfi_def_cfa_register %rsp 2191.L8x_epilogue: 2192 .byte 0xf3,0xc3 2193.cfi_endproc 2194.size ChaCha20_8x,.-ChaCha20_8x 2195 .section ".note.gnu.property", "a" 2196 .p2align 3 2197 .long 1f - 0f 2198 .long 4f - 1f 2199 .long 5 22000: 2201 # "GNU" encoded with .byte, since .asciz isn't supported 2202 # on Solaris. 2203 .byte 0x47 2204 .byte 0x4e 2205 .byte 0x55 2206 .byte 0 22071: 2208 .p2align 3 2209 .long 0xc0000002 2210 .long 3f - 2f 22112: 2212 .long 3 22133: 2214 .p2align 3 22154: 2216