/* Do not modify. This file is auto-generated from chacha-x86_64.pl. */
.text



.section .rodata
.align	64
.Lzero:
.long	0,0,0,0
.Lone:
.long	1,0,0,0
.Linc:
.long	0,1,2,3
.Lfour:
.long	4,4,4,4
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Ltwoy:
.long	2,0,0,0, 2,0,0,0
.align	64
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.Lsigma:
.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.previous
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,@function
.align	64
ChaCha20_ctr32:
.cfi_startproc
	cmpq	$0,%rdx
	je	.Lno_data
	movq	OPENSSL_ia32cap_P+4(%rip),%r10
	testl	$512,%r10d
	jnz	.LChaCha20_ssse3

	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
	subq	$64+24,%rsp
.cfi_adjust_cfa_offset	64+24
.Lctr32_body:


	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lone(%rip),%xmm4


	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	%rdx,%rbp
	jmp	.Loop_outer

.align	32
.Loop_outer:
	movl	$0x61707865,%eax
	movl	$0x3320646e,%ebx
	movl	$0x79622d32,%ecx
	movl	$0x6b206574,%edx
	movl	16(%rsp),%r8d
	movl	20(%rsp),%r9d
	movl	24(%rsp),%r10d
	movl	28(%rsp),%r11d
	movd	%xmm3,%r12d
	movl	52(%rsp),%r13d
	movl	56(%rsp),%r14d
	movl	60(%rsp),%r15d

	movq	%rbp,64+0(%rsp)
	movl	$10,%ebp
	movq	%rsi,64+8(%rsp)
.byte	102,72,15,126,214
	movq	%rdi,64+16(%rsp)
	movq	%rsi,%rdi
	shrq	$32,%rdi
	jmp	.Loop

.align	32
.Loop:
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$16,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$16,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$12,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$12,%r9d
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$8,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$8,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$7,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$7,%r9d
	movl	%esi,32(%rsp)
	movl	%edi,36(%rsp)
	movl	40(%rsp),%esi
	movl	44(%rsp),%edi
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$16,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$16,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$12,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$12,%r11d
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$8,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$8,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$7,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$7,%r11d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$16,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$16,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$12,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$12,%r10d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$8,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$8,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$7,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$7,%r10d
	movl	%esi,40(%rsp)
	movl	%edi,44(%rsp)
	movl	32(%rsp),%esi
	movl	36(%rsp),%edi
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$16,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$16,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$12,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$12,%r8d
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$8,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$8,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$7,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$7,%r8d
	decl	%ebp
	jnz	.Loop
	movl	%edi,36(%rsp)
	movl	%esi,32(%rsp)
	movq	64(%rsp),%rbp
	movdqa	%xmm2,%xmm1
	movq	64+8(%rsp),%rsi
	paddd	%xmm4,%xmm3
	movq	64+16(%rsp),%rdi

	addl	$0x61707865,%eax
	addl	$0x3320646e,%ebx
	addl	$0x79622d32,%ecx
	addl	$0x6b206574,%edx
	addl	16(%rsp),%r8d
	addl	20(%rsp),%r9d
	addl	24(%rsp),%r10d
	addl	28(%rsp),%r11d
	addl	48(%rsp),%r12d
	addl	52(%rsp),%r13d
	addl	56(%rsp),%r14d
	addl	60(%rsp),%r15d
	paddd	32(%rsp),%xmm1

	cmpq	$64,%rbp
	jb	.Ltail

	xorl	0(%rsi),%eax
	xorl	4(%rsi),%ebx
	xorl	8(%rsi),%ecx
	xorl	12(%rsi),%edx
	xorl	16(%rsi),%r8d
	xorl	20(%rsi),%r9d
	xorl	24(%rsi),%r10d
	xorl	28(%rsi),%r11d
	movdqu	32(%rsi),%xmm0
	xorl	48(%rsi),%r12d
	xorl	52(%rsi),%r13d
	xorl	56(%rsi),%r14d
	xorl	60(%rsi),%r15d
	leaq	64(%rsi),%rsi
	pxor	%xmm1,%xmm0

	movdqa	%xmm2,32(%rsp)
	movd	%xmm3,48(%rsp)

	movl	%eax,0(%rdi)
	movl	%ebx,4(%rdi)
	movl	%ecx,8(%rdi)
	movl	%edx,12(%rdi)
	movl	%r8d,16(%rdi)
	movl	%r9d,20(%rdi)
	movl	%r10d,24(%rdi)
	movl	%r11d,28(%rdi)
	movdqu	%xmm0,32(%rdi)
	movl	%r12d,48(%rdi)
	movl	%r13d,52(%rdi)
	movl	%r14d,56(%rdi)
	movl	%r15d,60(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rbp
	jnz	.Loop_outer

	jmp	.Ldone

.align	16
.Ltail:
	movl	%eax,0(%rsp)
	movl	%ebx,4(%rsp)
	xorq	%rbx,%rbx
	movl	%ecx,8(%rsp)
	movl	%edx,12(%rsp)
	movl	%r8d,16(%rsp)
	movl	%r9d,20(%rsp)
	movl	%r10d,24(%rsp)
	movl	%r11d,28(%rsp)
	movdqa	%xmm1,32(%rsp)
	movl	%r12d,48(%rsp)
	movl	%r13d,52(%rsp)
	movl	%r14d,56(%rsp)
	movl	%r15d,60(%rsp)

.Loop_tail:
	movzbl	(%rsi,%rbx,1),%eax
	movzbl	(%rsp,%rbx,1),%edx
	leaq	1(%rbx),%rbx
	xorl	%edx,%eax
	movb	%al,-1(%rdi,%rbx,1)
	decq	%rbp
	jnz	.Loop_tail

.Ldone:
	leaq	64+24+48(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lno_data:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
.type	ChaCha20_ssse3,@function
.align	32
ChaCha20_ssse3:
.cfi_startproc
.LChaCha20_ssse3:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	testl	$2048,%r10d
	jnz	.LChaCha20_4xop
	cmpq	$128,%rdx
	je	.LChaCha20_128
	ja	.LChaCha20_4x

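/* SSSE3 path: one 64-byte block per iteration, with the whole state held
   in %xmm0-%xmm3 and the 16- and 24-bit rotates done via pshufb using the
   .Lrot16/.Lrot24 masks loaded below. */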
.Ldo_sse3_after_all:
	subq	$64+8,%rsp
	movdqa	.Lsigma(%rip),%xmm0
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lrot16(%rip),%xmm6
	movdqa	.Lrot24(%rip),%xmm7

	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8
	jmp	.Loop_ssse3

.align	32
.Loop_outer_ssse3:
	movdqa	.Lone(%rip),%xmm3
	movdqa	0(%rsp),%xmm0
	movdqa	16(%rsp),%xmm1
	movdqa	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	movq	$10,%r8
	movdqa	%xmm3,48(%rsp)
	jmp	.Loop_ssse3

.align	32
.Loop_ssse3:
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decq	%r8
	jnz	.Loop_ssse3
	paddd	0(%rsp),%xmm0
	paddd	16(%rsp),%xmm1
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3

	cmpq	$64,%rdx
	jb	.Ltail_ssse3

	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%rsi),%xmm5
	leaq	64(%rsi),%rsi
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3

	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm1,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rdx
	jnz	.Loop_outer_ssse3

	jmp	.Ldone_ssse3

.align	16
.Ltail_ssse3:
	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	xorq	%r8,%r8

.Loop_tail_ssse3:
	movzbl	(%rsi,%r8,1),%eax
	movzbl	(%rsp,%r8,1),%ecx
	leaq	1(%r8),%r8
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r8,1)
	decq	%rdx
	jnz	.Loop_tail_ssse3

.Ldone_ssse3:
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.Lssse3_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_ssse3,.-ChaCha20_ssse3
.type	ChaCha20_128,@function
.align	32
ChaCha20_128:
.cfi_startproc
.LChaCha20_128:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	subq	$64+8,%rsp
	movdqa	.Lsigma(%rip),%xmm8
	movdqu	(%rcx),%xmm9
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lone(%rip),%xmm1
	movdqa	.Lrot16(%rip),%xmm6
	movdqa	.Lrot24(%rip),%xmm7

	movdqa	%xmm8,%xmm10
	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,%xmm11
	movdqa	%xmm9,16(%rsp)
	movdqa	%xmm2,%xmm0
	movdqa	%xmm2,32(%rsp)
	paddd	%xmm3,%xmm1
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8
	jmp	.Loop_128

.align	32
.Loop_128:
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,222
.byte	102,15,56,0,206
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$20,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$12,%xmm4
	psrld	$20,%xmm11
	por	%xmm4,%xmm9
	pslld	$12,%xmm5
	por	%xmm5,%xmm11
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,223
.byte	102,15,56,0,207
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$25,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$7,%xmm4
	psrld	$25,%xmm11
	por	%xmm4,%xmm9
	pslld	$7,%xmm5
	por	%xmm5,%xmm11
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm9,%xmm9
	pshufd	$147,%xmm3,%xmm3
	pshufd	$78,%xmm0,%xmm0
	pshufd	$57,%xmm11,%xmm11
	pshufd	$147,%xmm1,%xmm1
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,222
.byte	102,15,56,0,206
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$20,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$12,%xmm4
	psrld	$20,%xmm11
	por	%xmm4,%xmm9
	pslld	$12,%xmm5
	por	%xmm5,%xmm11
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,223
.byte	102,15,56,0,207
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$25,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$7,%xmm4
	psrld	$25,%xmm11
	por	%xmm4,%xmm9
	pslld	$7,%xmm5
	por	%xmm5,%xmm11
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm9,%xmm9
	pshufd	$57,%xmm3,%xmm3
	pshufd	$78,%xmm0,%xmm0
	pshufd	$147,%xmm11,%xmm11
	pshufd	$57,%xmm1,%xmm1
	decq	%r8
	jnz	.Loop_128
	paddd	0(%rsp),%xmm8
	paddd	16(%rsp),%xmm9
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	paddd	.Lone(%rip),%xmm1
	paddd	0(%rsp),%xmm10
	paddd	16(%rsp),%xmm11
	paddd	32(%rsp),%xmm0
	paddd	48(%rsp),%xmm1

	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm8
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm9
	movdqu	48(%rsi),%xmm5
	pxor	%xmm4,%xmm2
	movdqu	64(%rsi),%xmm4
	pxor	%xmm5,%xmm3
	movdqu	80(%rsi),%xmm5
	pxor	%xmm4,%xmm10
	movdqu	96(%rsi),%xmm4
	pxor	%xmm5,%xmm11
	movdqu	112(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	pxor	%xmm5,%xmm1

	movdqu	%xmm8,0(%rdi)
	movdqu	%xmm9,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	movdqu	%xmm10,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm0,96(%rdi)
	movdqu	%xmm1,112(%rdi)
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L128_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_128,.-ChaCha20_128
.type	ChaCha20_4x,@function
.align	32
ChaCha20_4x:
.cfi_startproc
.LChaCha20_4x:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	movq	%r10,%r11
	shrq	$32,%r10
	testq	$32,%r10
	jnz	.LChaCha20_8x
	cmpq	$192,%rdx
	ja	.Lproceed4x

	andq	$71303168,%r11
	cmpq	$4194304,%r11
	je	.Ldo_sse3_after_all

.Lproceed4x:
	subq	$0x140+8,%rsp
	movdqa	.Lsigma(%rip),%xmm11
	movdqu	(%rcx),%xmm15
	movdqu	16(%rcx),%xmm7
	movdqu	(%r8),%xmm3
	leaq	256(%rsp),%rcx
	leaq	.Lrot16(%rip),%r10
	leaq	.Lrot24(%rip),%r11

	pshufd	$0x00,%xmm11,%xmm8
	pshufd	$0x55,%xmm11,%xmm9
	movdqa	%xmm8,64(%rsp)
	pshufd	$0xaa,%xmm11,%xmm10
	movdqa	%xmm9,80(%rsp)
	pshufd	$0xff,%xmm11,%xmm11
	movdqa	%xmm10,96(%rsp)
	movdqa	%xmm11,112(%rsp)

	pshufd	$0x00,%xmm15,%xmm12
	pshufd	$0x55,%xmm15,%xmm13
	movdqa	%xmm12,128-256(%rcx)
	pshufd	$0xaa,%xmm15,%xmm14
	movdqa	%xmm13,144-256(%rcx)
	pshufd	$0xff,%xmm15,%xmm15
	movdqa	%xmm14,160-256(%rcx)
	movdqa	%xmm15,176-256(%rcx)

	pshufd	$0x00,%xmm7,%xmm4
	pshufd	$0x55,%xmm7,%xmm5
	movdqa	%xmm4,192-256(%rcx)
	pshufd	$0xaa,%xmm7,%xmm6
	movdqa	%xmm5,208-256(%rcx)
	pshufd	$0xff,%xmm7,%xmm7
	movdqa	%xmm6,224-256(%rcx)
	movdqa	%xmm7,240-256(%rcx)

	pshufd	$0x00,%xmm3,%xmm0
	pshufd	$0x55,%xmm3,%xmm1
	paddd	.Linc(%rip),%xmm0
	pshufd	$0xaa,%xmm3,%xmm2
	movdqa	%xmm1,272-256(%rcx)
	pshufd	$0xff,%xmm3,%xmm3
	movdqa	%xmm2,288-256(%rcx)
	movdqa	%xmm3,304-256(%rcx)

	jmp	.Loop_enter4x

.align	32
.Loop_outer4x:
	movdqa	64(%rsp),%xmm8
	movdqa	80(%rsp),%xmm9
	movdqa	96(%rsp),%xmm10
	movdqa	112(%rsp),%xmm11
	movdqa	128-256(%rcx),%xmm12
	movdqa	144-256(%rcx),%xmm13
	movdqa	160-256(%rcx),%xmm14
	movdqa	176-256(%rcx),%xmm15
	movdqa	192-256(%rcx),%xmm4
	movdqa	208-256(%rcx),%xmm5
	movdqa	224-256(%rcx),%xmm6
	movdqa	240-256(%rcx),%xmm7
	movdqa	256-256(%rcx),%xmm0
	movdqa	272-256(%rcx),%xmm1
	movdqa	288-256(%rcx),%xmm2
	movdqa	304-256(%rcx),%xmm3
	paddd	.Lfour(%rip),%xmm0

.Loop_enter4x:
	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm7,48(%rsp)
	movdqa	(%r10),%xmm7
	movl	$10,%eax
	movdqa	%xmm0,256-256(%rcx)
	jmp	.Loop4x

.align	32
.Loop4x:
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,199
.byte	102,15,56,0,207
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm6
	pslld	$12,%xmm12
	psrld	$20,%xmm6
	movdqa	%xmm13,%xmm7
	pslld	$12,%xmm13
	por	%xmm6,%xmm12
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm13
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,198
.byte	102,15,56,0,206
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm7
	pslld	$7,%xmm12
	psrld	$25,%xmm7
	movdqa	%xmm13,%xmm6
	pslld	$7,%xmm13
	por	%xmm7,%xmm12
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm13
	movdqa	%xmm4,0(%rsp)
	movdqa	%xmm5,16(%rsp)
	movdqa	32(%rsp),%xmm4
	movdqa	48(%rsp),%xmm5
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,215
.byte	102,15,56,0,223
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm6
	pslld	$12,%xmm14
	psrld	$20,%xmm6
	movdqa	%xmm15,%xmm7
	pslld	$12,%xmm15
	por	%xmm6,%xmm14
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm15
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,214
.byte	102,15,56,0,222
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm7
	pslld	$7,%xmm14
	psrld	$25,%xmm7
	movdqa	%xmm15,%xmm6
	pslld	$7,%xmm15
	por	%xmm7,%xmm14
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm15
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,223
.byte	102,15,56,0,199
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm6
	pslld	$12,%xmm13
	psrld	$20,%xmm6
	movdqa	%xmm14,%xmm7
	pslld	$12,%xmm14
	por	%xmm6,%xmm13
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm14
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,222
.byte	102,15,56,0,198
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm7
	pslld	$7,%xmm13
	psrld	$25,%xmm7
	movdqa	%xmm14,%xmm6
	pslld	$7,%xmm14
	por	%xmm7,%xmm13
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm14
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm5,48(%rsp)
	movdqa	0(%rsp),%xmm4
	movdqa	16(%rsp),%xmm5
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,207
.byte	102,15,56,0,215
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm6
	pslld	$12,%xmm15
	psrld	$20,%xmm6
	movdqa	%xmm12,%xmm7
	pslld	$12,%xmm12
	por	%xmm6,%xmm15
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm12
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,206
.byte	102,15,56,0,214
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm7
	pslld	$7,%xmm15
	psrld	$25,%xmm7
	movdqa	%xmm12,%xmm6
	pslld	$7,%xmm12
	por	%xmm7,%xmm15
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm12
	decl	%eax
	jnz	.Loop4x

	paddd	64(%rsp),%xmm8
	paddd	80(%rsp),%xmm9
	paddd	96(%rsp),%xmm10
	paddd	112(%rsp),%xmm11

	movdqa	%xmm8,%xmm6
	punpckldq	%xmm9,%xmm8
	movdqa	%xmm10,%xmm7
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm9,%xmm6
	punpckhdq	%xmm11,%xmm7
	movdqa	%xmm8,%xmm9
	punpcklqdq	%xmm10,%xmm8
	movdqa	%xmm6,%xmm11
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm10,%xmm9
	punpckhqdq	%xmm7,%xmm11
	paddd	128-256(%rcx),%xmm12
	paddd	144-256(%rcx),%xmm13
	paddd	160-256(%rcx),%xmm14
	paddd	176-256(%rcx),%xmm15

	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,16(%rsp)
	movdqa	32(%rsp),%xmm8
	movdqa	48(%rsp),%xmm9

	movdqa	%xmm12,%xmm10
	punpckldq	%xmm13,%xmm12
	movdqa	%xmm14,%xmm7
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm13,%xmm10
	punpckhdq	%xmm15,%xmm7
	movdqa	%xmm12,%xmm13
	punpcklqdq	%xmm14,%xmm12
	movdqa	%xmm10,%xmm15
	punpcklqdq	%xmm7,%xmm10
	punpckhqdq	%xmm14,%xmm13
	punpckhqdq	%xmm7,%xmm15
	paddd	192-256(%rcx),%xmm4
	paddd	208-256(%rcx),%xmm5
	paddd	224-256(%rcx),%xmm8
	paddd	240-256(%rcx),%xmm9

	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm11,48(%rsp)

	movdqa	%xmm4,%xmm14
	punpckldq	%xmm5,%xmm4
	movdqa	%xmm8,%xmm7
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm5,%xmm14
	punpckhdq	%xmm9,%xmm7
	movdqa	%xmm4,%xmm5
	punpcklqdq	%xmm8,%xmm4
	movdqa	%xmm14,%xmm9
	punpcklqdq	%xmm7,%xmm14
	punpckhqdq	%xmm8,%xmm5
	punpckhqdq	%xmm7,%xmm9
	paddd	256-256(%rcx),%xmm0
	paddd	272-256(%rcx),%xmm1
	paddd	288-256(%rcx),%xmm2
	paddd	304-256(%rcx),%xmm3

	movdqa	%xmm0,%xmm8
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm8
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm8,%xmm3
	punpcklqdq	%xmm7,%xmm8
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	cmpq	$256,%rdx
	jb	.Ltail4x

	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	48(%rsp),%xmm6
	pxor	%xmm15,%xmm11
	pxor	%xmm9,%xmm2
	pxor	%xmm3,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi

	subq	$256,%rdx
	jnz	.Loop_outer4x

	jmp	.Ldone4x

.Ltail4x:
	cmpq	$192,%rdx
	jae	.L192_or_more4x
	cmpq	$128,%rdx
	jae	.L128_or_more4x
	cmpq	$64,%rdx
	jae	.L64_or_more4x


	xorq	%r10,%r10

	movdqa	%xmm12,16(%rsp)
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm0,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L64_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	.Ldone4x

	movdqa	16(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm13,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm5,32(%rsp)
	subq	$64,%rdx
	movdqa	%xmm1,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L128_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	je	.Ldone4x

	movdqa	32(%rsp),%xmm6
	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm10,16(%rsp)
	leaq	128(%rdi),%rdi
	movdqa	%xmm14,32(%rsp)
	subq	$128,%rdx
	movdqa	%xmm8,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L192_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
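/* xor the second 64-byte block of input with the key-stream words saved
   at 16(%rsp) and still live in %xmm13/%xmm5/%xmm1 */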
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	.Ldone4x

	movdqa	48(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm15,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm9,32(%rsp)
	subq	$192,%rdx
	movdqa	%xmm3,48(%rsp)

.Loop_tail4x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail4x

.Ldone4x:
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L4x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_4x,.-ChaCha20_4x
.type	ChaCha20_4xop,@function
.align	32
ChaCha20_4xop:
.cfi_startproc
.LChaCha20_4xop:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	subq	$0x140+8,%rsp
	vzeroupper

	vmovdqa	.Lsigma(%rip),%xmm11
	vmovdqu	(%rcx),%xmm3
	vmovdqu	16(%rcx),%xmm15
	vmovdqu	(%r8),%xmm7
	leaq	256(%rsp),%rcx

	vpshufd	$0x00,%xmm11,%xmm8
	vpshufd	$0x55,%xmm11,%xmm9
	vmovdqa	%xmm8,64(%rsp)
	vpshufd	$0xaa,%xmm11,%xmm10
	vmovdqa	%xmm9,80(%rsp)
	vpshufd	$0xff,%xmm11,%xmm11
	vmovdqa	%xmm10,96(%rsp)
	vmovdqa	%xmm11,112(%rsp)

	vpshufd	$0x00,%xmm3,%xmm0
	vpshufd	$0x55,%xmm3,%xmm1
	vmovdqa	%xmm0,128-256(%rcx)
	vpshufd	$0xaa,%xmm3,%xmm2
	vmovdqa	%xmm1,144-256(%rcx)
	vpshufd	$0xff,%xmm3,%xmm3
	vmovdqa	%xmm2,160-256(%rcx)
	vmovdqa	%xmm3,176-256(%rcx)

	vpshufd	$0x00,%xmm15,%xmm12
	vpshufd	$0x55,%xmm15,%xmm13
	vmovdqa	%xmm12,192-256(%rcx)
	vpshufd	$0xaa,%xmm15,%xmm14
	vmovdqa	%xmm13,208-256(%rcx)
	vpshufd	$0xff,%xmm15,%xmm15
	vmovdqa	%xmm14,224-256(%rcx)
	vmovdqa	%xmm15,240-256(%rcx)

	vpshufd	$0x00,%xmm7,%xmm4
	vpshufd	$0x55,%xmm7,%xmm5
	vpaddd	.Linc(%rip),%xmm4,%xmm4
	vpshufd	$0xaa,%xmm7,%xmm6
	vmovdqa	%xmm5,272-256(%rcx)
	vpshufd	$0xff,%xmm7,%xmm7
	vmovdqa	%xmm6,288-256(%rcx)
	vmovdqa	%xmm7,304-256(%rcx)

	jmp	.Loop_enter4xop

.align	32
.Loop_outer4xop:
	vmovdqa	64(%rsp),%xmm8
	vmovdqa	80(%rsp),%xmm9
	vmovdqa	96(%rsp),%xmm10
	vmovdqa	112(%rsp),%xmm11
	vmovdqa	128-256(%rcx),%xmm0
	vmovdqa	144-256(%rcx),%xmm1
	vmovdqa	160-256(%rcx),%xmm2
	vmovdqa	176-256(%rcx),%xmm3
	vmovdqa	192-256(%rcx),%xmm12
	vmovdqa	208-256(%rcx),%xmm13
	vmovdqa	224-256(%rcx),%xmm14
	vmovdqa	240-256(%rcx),%xmm15
	vmovdqa	256-256(%rcx),%xmm4
	vmovdqa	272-256(%rcx),%xmm5
	vmovdqa	288-256(%rcx),%xmm6
	vmovdqa	304-256(%rcx),%xmm7
	vpaddd	.Lfour(%rip),%xmm4,%xmm4

.Loop_enter4xop:
	movl	$10,%eax
	vmovdqa	%xmm4,256-256(%rcx)
	jmp	.Loop4xop

.align	32
.Loop4xop:
	vpaddd	%xmm0,%xmm8,%xmm8
	vpaddd	%xmm1,%xmm9,%xmm9
	vpaddd	%xmm2,%xmm10,%xmm10
	vpaddd	%xmm3,%xmm11,%xmm11
	vpxor	%xmm4,%xmm8,%xmm4
	vpxor	%xmm5,%xmm9,%xmm5
	vpxor	%xmm6,%xmm10,%xmm6
	vpxor	%xmm7,%xmm11,%xmm7
.byte	143,232,120,194,228,16
.byte	143,232,120,194,237,16
.byte	143,232,120,194,246,16
.byte	143,232,120,194,255,16
	vpaddd	%xmm4,%xmm12,%xmm12
	vpaddd	%xmm5,%xmm13,%xmm13
	vpaddd	%xmm6,%xmm14,%xmm14
	vpaddd	%xmm7,%xmm15,%xmm15
	vpxor	%xmm0,%xmm12,%xmm0
	vpxor	%xmm1,%xmm13,%xmm1
	vpxor	%xmm14,%xmm2,%xmm2
	vpxor	%xmm15,%xmm3,%xmm3
.byte	143,232,120,194,192,12
.byte	143,232,120,194,201,12
.byte	143,232,120,194,210,12
.byte	143,232,120,194,219,12
	vpaddd	%xmm8,%xmm0,%xmm8
	vpaddd	%xmm9,%xmm1,%xmm9
	vpaddd	%xmm2,%xmm10,%xmm10
	vpaddd	%xmm3,%xmm11,%xmm11
	vpxor	%xmm4,%xmm8,%xmm4
	vpxor	%xmm5,%xmm9,%xmm5
	vpxor	%xmm6,%xmm10,%xmm6
	vpxor	%xmm7,%xmm11,%xmm7
.byte	143,232,120,194,228,8
.byte	143,232,120,194,237,8
.byte	143,232,120,194,246,8
.byte	143,232,120,194,255,8
	vpaddd	%xmm4,%xmm12,%xmm12
	vpaddd	%xmm5,%xmm13,%xmm13
	vpaddd	%xmm6,%xmm14,%xmm14
	vpaddd	%xmm7,%xmm15,%xmm15
	vpxor	%xmm0,%xmm12,%xmm0
	vpxor	%xmm1,%xmm13,%xmm1
	vpxor	%xmm14,%xmm2,%xmm2
	vpxor	%xmm15,%xmm3,%xmm3
.byte	143,232,120,194,192,7
.byte	143,232,120,194,201,7
.byte	143,232,120,194,210,7
.byte	143,232,120,194,219,7
	vpaddd	%xmm1,%xmm8,%xmm8
	vpaddd	%xmm2,%xmm9,%xmm9
	vpaddd	%xmm3,%xmm10,%xmm10
	vpaddd	%xmm0,%xmm11,%xmm11
	vpxor	%xmm7,%xmm8,%xmm7
	vpxor	%xmm4,%xmm9,%xmm4
	vpxor	%xmm5,%xmm10,%xmm5
	vpxor	%xmm6,%xmm11,%xmm6
.byte	143,232,120,194,255,16
.byte	143,232,120,194,228,16
.byte	143,232,120,194,237,16
.byte	143,232,120,194,246,16
	vpaddd	%xmm7,%xmm14,%xmm14
	vpaddd	%xmm4,%xmm15,%xmm15
	vpaddd	%xmm5,%xmm12,%xmm12
	vpaddd	%xmm6,%xmm13,%xmm13
	vpxor	%xmm1,%xmm14,%xmm1
	vpxor	%xmm2,%xmm15,%xmm2
	vpxor	%xmm12,%xmm3,%xmm3
	vpxor	%xmm13,%xmm0,%xmm0
.byte	143,232,120,194,201,12
.byte	143,232,120,194,210,12
.byte	143,232,120,194,219,12
.byte	143,232,120,194,192,12
	vpaddd	%xmm8,%xmm1,%xmm8
	vpaddd	%xmm9,%xmm2,%xmm9
	vpaddd	%xmm3,%xmm10,%xmm10
	vpaddd	%xmm0,%xmm11,%xmm11
	vpxor	%xmm7,%xmm8,%xmm7
	vpxor	%xmm4,%xmm9,%xmm4
	vpxor	%xmm5,%xmm10,%xmm5
	vpxor	%xmm6,%xmm11,%xmm6
.byte	143,232,120,194,255,8
.byte	143,232,120,194,228,8
.byte	143,232,120,194,237,8
.byte	143,232,120,194,246,8
	vpaddd	%xmm7,%xmm14,%xmm14
	vpaddd	%xmm4,%xmm15,%xmm15
	vpaddd	%xmm5,%xmm12,%xmm12
	vpaddd	%xmm6,%xmm13,%xmm13
	vpxor	%xmm1,%xmm14,%xmm1
	vpxor	%xmm2,%xmm15,%xmm2
	vpxor	%xmm12,%xmm3,%xmm3
	vpxor	%xmm13,%xmm0,%xmm0
.byte	143,232,120,194,201,7
.byte	143,232,120,194,210,7
.byte	143,232,120,194,219,7
.byte	143,232,120,194,192,7
	decl	%eax
	jnz	.Loop4xop

	vpaddd	64(%rsp),%xmm8,%xmm8
	vpaddd	80(%rsp),%xmm9,%xmm9
	vpaddd	96(%rsp),%xmm10,%xmm10
	vpaddd	112(%rsp),%xmm11,%xmm11

	vmovdqa	%xmm14,32(%rsp)
	vmovdqa	%xmm15,48(%rsp)

	vpunpckldq	%xmm9,%xmm8,%xmm14
	vpunpckldq	%xmm11,%xmm10,%xmm15
	vpunpckhdq	%xmm9,%xmm8,%xmm8
	vpunpckhdq	%xmm11,%xmm10,%xmm10
	vpunpcklqdq	%xmm15,%xmm14,%xmm9
	vpunpckhqdq	%xmm15,%xmm14,%xmm14
	vpunpcklqdq	%xmm10,%xmm8,%xmm11
	vpunpckhqdq	%xmm10,%xmm8,%xmm8
	vpaddd	128-256(%rcx),%xmm0,%xmm0
	vpaddd	144-256(%rcx),%xmm1,%xmm1
	vpaddd	160-256(%rcx),%xmm2,%xmm2
	vpaddd	176-256(%rcx),%xmm3,%xmm3

	vmovdqa	%xmm9,0(%rsp)
	vmovdqa	%xmm14,16(%rsp)
	vmovdqa	32(%rsp),%xmm9
	vmovdqa	48(%rsp),%xmm14
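/* transpose the next group of four key-stream rows from lane order back
   into per-block order */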

	vpunpckldq	%xmm1,%xmm0,%xmm10
	vpunpckldq	%xmm3,%xmm2,%xmm15
	vpunpckhdq	%xmm1,%xmm0,%xmm0
	vpunpckhdq	%xmm3,%xmm2,%xmm2
	vpunpcklqdq	%xmm15,%xmm10,%xmm1
	vpunpckhqdq	%xmm15,%xmm10,%xmm10
	vpunpcklqdq	%xmm2,%xmm0,%xmm3
	vpunpckhqdq	%xmm2,%xmm0,%xmm0
	vpaddd	192-256(%rcx),%xmm12,%xmm12
	vpaddd	208-256(%rcx),%xmm13,%xmm13
	vpaddd	224-256(%rcx),%xmm9,%xmm9
	vpaddd	240-256(%rcx),%xmm14,%xmm14

	vpunpckldq	%xmm13,%xmm12,%xmm2
	vpunpckldq	%xmm14,%xmm9,%xmm15
	vpunpckhdq	%xmm13,%xmm12,%xmm12
	vpunpckhdq	%xmm14,%xmm9,%xmm9
	vpunpcklqdq	%xmm15,%xmm2,%xmm13
	vpunpckhqdq	%xmm15,%xmm2,%xmm2
	vpunpcklqdq	%xmm9,%xmm12,%xmm14
	vpunpckhqdq	%xmm9,%xmm12,%xmm12
	vpaddd	256-256(%rcx),%xmm4,%xmm4
	vpaddd	272-256(%rcx),%xmm5,%xmm5
	vpaddd	288-256(%rcx),%xmm6,%xmm6
	vpaddd	304-256(%rcx),%xmm7,%xmm7

	vpunpckldq	%xmm5,%xmm4,%xmm9
	vpunpckldq	%xmm7,%xmm6,%xmm15
	vpunpckhdq	%xmm5,%xmm4,%xmm4
	vpunpckhdq	%xmm7,%xmm6,%xmm6
	vpunpcklqdq	%xmm15,%xmm9,%xmm5
	vpunpckhqdq	%xmm15,%xmm9,%xmm9
	vpunpcklqdq	%xmm6,%xmm4,%xmm7
	vpunpckhqdq	%xmm6,%xmm4,%xmm4
	vmovdqa	0(%rsp),%xmm6
	vmovdqa	16(%rsp),%xmm15

	cmpq	$256,%rdx
	jb	.Ltail4xop

	vpxor	0(%rsi),%xmm6,%xmm6
	vpxor	16(%rsi),%xmm1,%xmm1
	vpxor	32(%rsi),%xmm13,%xmm13
	vpxor	48(%rsi),%xmm5,%xmm5
	vpxor	64(%rsi),%xmm15,%xmm15
	vpxor	80(%rsi),%xmm10,%xmm10
	vpxor	96(%rsi),%xmm2,%xmm2
	vpxor	112(%rsi),%xmm9,%xmm9
	leaq	128(%rsi),%rsi
	vpxor	0(%rsi),%xmm11,%xmm11
	vpxor	16(%rsi),%xmm3,%xmm3
	vpxor	32(%rsi),%xmm14,%xmm14
	vpxor	48(%rsi),%xmm7,%xmm7
	vpxor	64(%rsi),%xmm8,%xmm8
	vpxor	80(%rsi),%xmm0,%xmm0
	vpxor	96(%rsi),%xmm12,%xmm12
	vpxor	112(%rsi),%xmm4,%xmm4
	leaq	128(%rsi),%rsi

	vmovdqu	%xmm6,0(%rdi)
	vmovdqu	%xmm1,16(%rdi)
	vmovdqu	%xmm13,32(%rdi)
	vmovdqu	%xmm5,48(%rdi)
	vmovdqu	%xmm15,64(%rdi)
	vmovdqu	%xmm10,80(%rdi)
	vmovdqu	%xmm2,96(%rdi)
	vmovdqu	%xmm9,112(%rdi)
	leaq	128(%rdi),%rdi
	vmovdqu	%xmm11,0(%rdi)
	vmovdqu	%xmm3,16(%rdi)
	vmovdqu	%xmm14,32(%rdi)
	vmovdqu	%xmm7,48(%rdi)
	vmovdqu	%xmm8,64(%rdi)
	vmovdqu	%xmm0,80(%rdi)
	vmovdqu	%xmm12,96(%rdi)
	vmovdqu	%xmm4,112(%rdi)
	leaq	128(%rdi),%rdi

	subq	$256,%rdx
	jnz	.Loop_outer4xop

	jmp	.Ldone4xop

.align	32
.Ltail4xop:
	cmpq	$192,%rdx
	jae	.L192_or_more4xop
	cmpq	$128,%rdx
	jae	.L128_or_more4xop
	cmpq	$64,%rdx
	jae	.L64_or_more4xop

	xorq	%r10,%r10
	vmovdqa	%xmm6,0(%rsp)
	vmovdqa	%xmm1,16(%rsp)
	vmovdqa	%xmm13,32(%rsp)
	vmovdqa	%xmm5,48(%rsp)
	jmp	.Loop_tail4xop

.align	32
.L64_or_more4xop:
	vpxor	0(%rsi),%xmm6,%xmm6
	vpxor	16(%rsi),%xmm1,%xmm1
	vpxor	32(%rsi),%xmm13,%xmm13
	vpxor	48(%rsi),%xmm5,%xmm5
	vmovdqu	%xmm6,0(%rdi)
	vmovdqu	%xmm1,16(%rdi)
	vmovdqu	%xmm13,32(%rdi)
	vmovdqu	%xmm5,48(%rdi)
	je	.Ldone4xop

	leaq	64(%rsi),%rsi
	vmovdqa	%xmm15,0(%rsp)
	xorq	%r10,%r10
	vmovdqa	%xmm10,16(%rsp)
	leaq	64(%rdi),%rdi
	vmovdqa	%xmm2,32(%rsp)
	subq	$64,%rdx
	vmovdqa	%xmm9,48(%rsp)
	jmp	.Loop_tail4xop

.align	32
.L128_or_more4xop:
	vpxor	0(%rsi),%xmm6,%xmm6
	vpxor	16(%rsi),%xmm1,%xmm1
	vpxor	32(%rsi),%xmm13,%xmm13
	vpxor	48(%rsi),%xmm5,%xmm5
	vpxor	64(%rsi),%xmm15,%xmm15
	vpxor	80(%rsi),%xmm10,%xmm10
	vpxor	96(%rsi),%xmm2,%xmm2
	vpxor	112(%rsi),%xmm9,%xmm9

	vmovdqu	%xmm6,0(%rdi)
	vmovdqu	%xmm1,16(%rdi)
	vmovdqu	%xmm13,32(%rdi)
	vmovdqu	%xmm5,48(%rdi)
	vmovdqu	%xmm15,64(%rdi)
	vmovdqu	%xmm10,80(%rdi)
	vmovdqu	%xmm2,96(%rdi)
	vmovdqu	%xmm9,112(%rdi)
	je	.Ldone4xop

	leaq	128(%rsi),%rsi
	vmovdqa	%xmm11,0(%rsp)
	xorq	%r10,%r10
	vmovdqa	%xmm3,16(%rsp)
	leaq	128(%rdi),%rdi
	vmovdqa	%xmm14,32(%rsp)
	subq	$128,%rdx
	vmovdqa	%xmm7,48(%rsp)
	jmp	.Loop_tail4xop

.align	32
.L192_or_more4xop:
	vpxor	0(%rsi),%xmm6,%xmm6
	vpxor	16(%rsi),%xmm1,%xmm1
	vpxor	32(%rsi),%xmm13,%xmm13
	vpxor	48(%rsi),%xmm5,%xmm5
	vpxor	64(%rsi),%xmm15,%xmm15
	vpxor	80(%rsi),%xmm10,%xmm10
	vpxor	96(%rsi),%xmm2,%xmm2
	vpxor	112(%rsi),%xmm9,%xmm9
	leaq	128(%rsi),%rsi
	vpxor	0(%rsi),%xmm11,%xmm11
	vpxor	16(%rsi),%xmm3,%xmm3
	vpxor	32(%rsi),%xmm14,%xmm14
	vpxor	48(%rsi),%xmm7,%xmm7

	vmovdqu	%xmm6,0(%rdi)
	vmovdqu	%xmm1,16(%rdi)
	vmovdqu	%xmm13,32(%rdi)
	vmovdqu	%xmm5,48(%rdi)
	vmovdqu	%xmm15,64(%rdi)
	vmovdqu	%xmm10,80(%rdi)
	vmovdqu	%xmm2,96(%rdi)
	vmovdqu	%xmm9,112(%rdi)
	leaq	128(%rdi),%rdi
	vmovdqu	%xmm11,0(%rdi)
	vmovdqu	%xmm3,16(%rdi)
	vmovdqu	%xmm14,32(%rdi)
	vmovdqu	%xmm7,48(%rdi)
	je	.Ldone4xop

	leaq	64(%rsi),%rsi
	vmovdqa	%xmm8,0(%rsp)
	xorq	%r10,%r10
	vmovdqa	%xmm0,16(%rsp)
	leaq	64(%rdi),%rdi
	vmovdqa	%xmm12,32(%rsp)
	subq	$192,%rdx
	vmovdqa	%xmm4,48(%rsp)

.Loop_tail4xop:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail4xop

.Ldone4xop:
	vzeroupper
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L4xop_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_4xop,.-ChaCha20_4xop
.type	ChaCha20_8x,@function
.align	32
ChaCha20_8x:
.cfi_startproc
.LChaCha20_8x:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	subq	$0x280+8,%rsp
	andq	$-32,%rsp
	vzeroupper

	vbroadcasti128	.Lsigma(%rip),%ymm11
	vbroadcasti128	(%rcx),%ymm3
	vbroadcasti128	16(%rcx),%ymm15
	vbroadcasti128	(%r8),%ymm7
	leaq	256(%rsp),%rcx
	leaq	512(%rsp),%rax
	leaq	.Lrot16(%rip),%r10
	leaq	.Lrot24(%rip),%r11

	vpshufd	$0x00,%ymm11,%ymm8
	vpshufd	$0x55,%ymm11,%ymm9
	vmovdqa	%ymm8,128-256(%rcx)
	vpshufd	$0xaa,%ymm11,%ymm10
	vmovdqa	%ymm9,160-256(%rcx)
	vpshufd	$0xff,%ymm11,%ymm11
	vmovdqa	%ymm10,192-256(%rcx)
	vmovdqa	%ymm11,224-256(%rcx)

	vpshufd	$0x00,%ymm3,%ymm0
	vpshufd	$0x55,%ymm3,%ymm1
	vmovdqa	%ymm0,256-256(%rcx)
	vpshufd	$0xaa,%ymm3,%ymm2
	vmovdqa	%ymm1,288-256(%rcx)
	vpshufd	$0xff,%ymm3,%ymm3
	vmovdqa	%ymm2,320-256(%rcx)
	vmovdqa	%ymm3,352-256(%rcx)

	vpshufd	$0x00,%ymm15,%ymm12
	vpshufd	$0x55,%ymm15,%ymm13
	vmovdqa	%ymm12,384-512(%rax)
	vpshufd	$0xaa,%ymm15,%ymm14
	vmovdqa	%ymm13,416-512(%rax)
	vpshufd	$0xff,%ymm15,%ymm15
	vmovdqa	%ymm14,448-512(%rax)
	vmovdqa	%ymm15,480-512(%rax)

	vpshufd	$0x00,%ymm7,%ymm4
	vpshufd	$0x55,%ymm7,%ymm5
	vpaddd	.Lincy(%rip),%ymm4,%ymm4
	vpshufd	$0xaa,%ymm7,%ymm6
	vmovdqa	%ymm5,544-512(%rax)
	vpshufd	$0xff,%ymm7,%ymm7
	vmovdqa	%ymm6,576-512(%rax)
	vmovdqa	%ymm7,608-512(%rax)
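/* sigma, key and counter/nonce words are now replicated across all eight
   lanes; enter the 8-way round loop */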

	jmp	.Loop_enter8x

.align	32
.Loop_outer8x:
	vmovdqa	128-256(%rcx),%ymm8
	vmovdqa	160-256(%rcx),%ymm9
	vmovdqa	192-256(%rcx),%ymm10
	vmovdqa	224-256(%rcx),%ymm11
	vmovdqa	256-256(%rcx),%ymm0
	vmovdqa	288-256(%rcx),%ymm1
	vmovdqa	320-256(%rcx),%ymm2
	vmovdqa	352-256(%rcx),%ymm3
	vmovdqa	384-512(%rax),%ymm12
	vmovdqa	416-512(%rax),%ymm13
	vmovdqa	448-512(%rax),%ymm14
	vmovdqa	480-512(%rax),%ymm15
	vmovdqa	512-512(%rax),%ymm4
	vmovdqa	544-512(%rax),%ymm5
	vmovdqa	576-512(%rax),%ymm6
	vmovdqa	608-512(%rax),%ymm7
	vpaddd	.Leight(%rip),%ymm4,%ymm4

.Loop_enter8x:
	vmovdqa	%ymm14,64(%rsp)
	vmovdqa	%ymm15,96(%rsp)
	vbroadcasti128	(%r10),%ymm15
	vmovdqa	%ymm4,512-512(%rax)
	movl	$10,%eax
	jmp	.Loop8x

.align	32
.Loop8x:
	vpaddd	%ymm0,%ymm8,%ymm8
	vpxor	%ymm4,%ymm8,%ymm4
	vpshufb	%ymm15,%ymm4,%ymm4
	vpaddd	%ymm1,%ymm9,%ymm9
	vpxor	%ymm5,%ymm9,%ymm5
	vpshufb	%ymm15,%ymm5,%ymm5
	vpaddd	%ymm4,%ymm12,%ymm12
	vpxor	%ymm0,%ymm12,%ymm0
	vpslld	$12,%ymm0,%ymm14
	vpsrld	$20,%ymm0,%ymm0
	vpor	%ymm0,%ymm14,%ymm0
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm5,%ymm13,%ymm13
	vpxor	%ymm1,%ymm13,%ymm1
	vpslld	$12,%ymm1,%ymm15
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm1,%ymm15,%ymm1
	vpaddd	%ymm0,%ymm8,%ymm8
	vpxor	%ymm4,%ymm8,%ymm4
	vpshufb	%ymm14,%ymm4,%ymm4
	vpaddd	%ymm1,%ymm9,%ymm9
	vpxor	%ymm5,%ymm9,%ymm5
	vpshufb	%ymm14,%ymm5,%ymm5
	vpaddd	%ymm4,%ymm12,%ymm12
	vpxor	%ymm0,%ymm12,%ymm0
	vpslld	$7,%ymm0,%ymm15
	vpsrld	$25,%ymm0,%ymm0
	vpor	%ymm0,%ymm15,%ymm0
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm5,%ymm13,%ymm13
	vpxor	%ymm1,%ymm13,%ymm1
	vpslld	$7,%ymm1,%ymm14
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm1,%ymm14,%ymm1
	vmovdqa	%ymm12,0(%rsp)
	vmovdqa	%ymm13,32(%rsp)
	vmovdqa	64(%rsp),%ymm12
	vmovdqa	96(%rsp),%ymm13
	vpaddd	%ymm2,%ymm10,%ymm10
	vpxor	%ymm6,%ymm10,%ymm6
	vpshufb	%ymm15,%ymm6,%ymm6
	vpaddd	%ymm3,%ymm11,%ymm11
	vpxor	%ymm7,%ymm11,%ymm7
	vpshufb	%ymm15,%ymm7,%ymm7
	vpaddd	%ymm6,%ymm12,%ymm12
	vpxor	%ymm2,%ymm12,%ymm2
	vpslld	$12,%ymm2,%ymm14
	vpsrld	$20,%ymm2,%ymm2
	vpor	%ymm2,%ymm14,%ymm2
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm7,%ymm13,%ymm13
	vpxor	%ymm3,%ymm13,%ymm3
	vpslld	$12,%ymm3,%ymm15
	vpsrld	$20,%ymm3,%ymm3
	vpor	%ymm3,%ymm15,%ymm3
	vpaddd	%ymm2,%ymm10,%ymm10
	vpxor	%ymm6,%ymm10,%ymm6
	vpshufb	%ymm14,%ymm6,%ymm6
	vpaddd	%ymm3,%ymm11,%ymm11
	vpxor	%ymm7,%ymm11,%ymm7
	vpshufb	%ymm14,%ymm7,%ymm7
	vpaddd	%ymm6,%ymm12,%ymm12
	vpxor	%ymm2,%ymm12,%ymm2
	vpslld	$7,%ymm2,%ymm15
	vpsrld	$25,%ymm2,%ymm2
	vpor	%ymm2,%ymm15,%ymm2
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm7,%ymm13,%ymm13
	vpxor	%ymm3,%ymm13,%ymm3
	vpslld	$7,%ymm3,%ymm14
	vpsrld	$25,%ymm3,%ymm3
	vpor	%ymm3,%ymm14,%ymm3
	vpaddd	%ymm1,%ymm8,%ymm8
	vpxor	%ymm7,%ymm8,%ymm7
	vpshufb	%ymm15,%ymm7,%ymm7
	vpaddd	%ymm2,%ymm9,%ymm9
	vpxor	%ymm4,%ymm9,%ymm4
	vpshufb	%ymm15,%ymm4,%ymm4
	vpaddd	%ymm7,%ymm12,%ymm12
	vpxor	%ymm1,%ymm12,%ymm1
	vpslld	$12,%ymm1,%ymm14
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm1,%ymm14,%ymm1
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm4,%ymm13,%ymm13
	vpxor	%ymm2,%ymm13,%ymm2
	vpslld	$12,%ymm2,%ymm15
	vpsrld	$20,%ymm2,%ymm2
	vpor	%ymm2,%ymm15,%ymm2
	vpaddd	%ymm1,%ymm8,%ymm8
	vpxor	%ymm7,%ymm8,%ymm7
	vpshufb	%ymm14,%ymm7,%ymm7
	vpaddd	%ymm2,%ymm9,%ymm9
	vpxor	%ymm4,%ymm9,%ymm4
	vpshufb	%ymm14,%ymm4,%ymm4
	vpaddd	%ymm7,%ymm12,%ymm12
	vpxor	%ymm1,%ymm12,%ymm1
	vpslld	$7,%ymm1,%ymm15
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm1,%ymm15,%ymm1
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm4,%ymm13,%ymm13
	vpxor	%ymm2,%ymm13,%ymm2
	vpslld	$7,%ymm2,%ymm14
	vpsrld	$25,%ymm2,%ymm2
	vpor	%ymm2,%ymm14,%ymm2
	vmovdqa	%ymm12,64(%rsp)
	vmovdqa	%ymm13,96(%rsp)
	vmovdqa	0(%rsp),%ymm12
	vmovdqa	32(%rsp),%ymm13
	vpaddd	%ymm3,%ymm10,%ymm10
	vpxor	%ymm5,%ymm10,%ymm5
	vpshufb	%ymm15,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm11,%ymm11
	vpxor	%ymm6,%ymm11,%ymm6
	vpshufb	%ymm15,%ymm6,%ymm6
	vpaddd	%ymm5,%ymm12,%ymm12
	vpxor	%ymm3,%ymm12,%ymm3
	vpslld	$12,%ymm3,%ymm14
	vpsrld	$20,%ymm3,%ymm3
	vpor	%ymm3,%ymm14,%ymm3
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm6,%ymm13,%ymm13
	vpxor	%ymm0,%ymm13,%ymm0
	vpslld	$12,%ymm0,%ymm15
	vpsrld	$20,%ymm0,%ymm0
	vpor	%ymm0,%ymm15,%ymm0
	vpaddd	%ymm3,%ymm10,%ymm10
	vpxor	%ymm5,%ymm10,%ymm5
	vpshufb	%ymm14,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm11,%ymm11
	vpxor	%ymm6,%ymm11,%ymm6
	vpshufb	%ymm14,%ymm6,%ymm6
	vpaddd	%ymm5,%ymm12,%ymm12
	vpxor	%ymm3,%ymm12,%ymm3
	vpslld	$7,%ymm3,%ymm15
	vpsrld	$25,%ymm3,%ymm3
	vpor	%ymm3,%ymm15,%ymm3
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm6,%ymm13,%ymm13
	vpxor	%ymm0,%ymm13,%ymm0
	vpslld	$7,%ymm0,%ymm14
	vpsrld	$25,%ymm0,%ymm0
	vpor	%ymm0,%ymm14,%ymm0
	decl	%eax
	jnz	.Loop8x

	leaq	512(%rsp),%rax
	vpaddd	128-256(%rcx),%ymm8,%ymm8
	vpaddd	160-256(%rcx),%ymm9,%ymm9
	vpaddd	192-256(%rcx),%ymm10,%ymm10
	vpaddd	224-256(%rcx),%ymm11,%ymm11

	vpunpckldq	%ymm9,%ymm8,%ymm14
	vpunpckldq	%ymm11,%ymm10,%ymm15
	vpunpckhdq	%ymm9,%ymm8,%ymm8
	vpunpckhdq	%ymm11,%ymm10,%ymm10
	vpunpcklqdq	%ymm15,%ymm14,%ymm9
	vpunpckhqdq	%ymm15,%ymm14,%ymm14
	vpunpcklqdq	%ymm10,%ymm8,%ymm11
	vpunpckhqdq	%ymm10,%ymm8,%ymm8
	vpaddd	256-256(%rcx),%ymm0,%ymm0
	vpaddd	288-256(%rcx),%ymm1,%ymm1
	vpaddd	320-256(%rcx),%ymm2,%ymm2
	vpaddd	352-256(%rcx),%ymm3,%ymm3

	vpunpckldq	%ymm1,%ymm0,%ymm10
	vpunpckldq	%ymm3,%ymm2,%ymm15
	vpunpckhdq	%ymm1,%ymm0,%ymm0
	vpunpckhdq	%ymm3,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm10,%ymm1
	vpunpckhqdq	%ymm15,%ymm10,%ymm10
	vpunpcklqdq	%ymm2,%ymm0,%ymm3
	vpunpckhqdq	%ymm2,%ymm0,%ymm0
	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
	vmovdqa	%ymm15,0(%rsp)
	vmovdqa	%ymm9,32(%rsp)
	vmovdqa	64(%rsp),%ymm15
	vmovdqa	96(%rsp),%ymm9

	vpaddd	384-512(%rax),%ymm12,%ymm12
	vpaddd	416-512(%rax),%ymm13,%ymm13
	vpaddd	448-512(%rax),%ymm15,%ymm15
	vpaddd	480-512(%rax),%ymm9,%ymm9

	vpunpckldq	%ymm13,%ymm12,%ymm2
	vpunpckldq	%ymm9,%ymm15,%ymm8
	vpunpckhdq	%ymm13,%ymm12,%ymm12
	vpunpckhdq	%ymm9,%ymm15,%ymm15
	vpunpcklqdq	%ymm8,%ymm2,%ymm13
	vpunpckhqdq	%ymm8,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm12,%ymm9
	vpunpckhqdq	%ymm15,%ymm12,%ymm12
	vpaddd	512-512(%rax),%ymm4,%ymm4
	vpaddd	544-512(%rax),%ymm5,%ymm5
	vpaddd	576-512(%rax),%ymm6,%ymm6
	vpaddd	608-512(%rax),%ymm7,%ymm7

	vpunpckldq	%ymm5,%ymm4,%ymm15
	vpunpckldq	%ymm7,%ymm6,%ymm8
	vpunpckhdq	%ymm5,%ymm4,%ymm4
	vpunpckhdq	%ymm7,%ymm6,%ymm6
	vpunpcklqdq	%ymm8,%ymm15,%ymm5
	vpunpckhqdq	%ymm8,%ymm15,%ymm15
	vpunpcklqdq	%ymm6,%ymm4,%ymm7
	vpunpckhqdq	%ymm6,%ymm4,%ymm4
	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
	vmovdqa	0(%rsp),%ymm6
	vmovdqa	32(%rsp),%ymm12

	cmpq	$512,%rdx
	jb	.Ltail8x

	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm12,%ymm12
	vpxor	32(%rsi),%ymm13,%ymm13
	vpxor	64(%rsi),%ymm10,%ymm10
	vpxor	96(%rsi),%ymm15,%ymm15
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm12,0(%rdi)
	vmovdqu	%ymm13,32(%rdi)
	vmovdqu	%ymm10,64(%rdi)
	vmovdqu	%ymm15,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm14,%ymm14
	vpxor	32(%rsi),%ymm2,%ymm2
	vpxor	64(%rsi),%ymm3,%ymm3
	vpxor	96(%rsi),%ymm7,%ymm7
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm14,0(%rdi)
	vmovdqu	%ymm2,32(%rdi)
	vmovdqu	%ymm3,64(%rdi)
	vmovdqu	%ymm7,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm11,%ymm11
	vpxor	32(%rsi),%ymm9,%ymm9
	vpxor	64(%rsi),%ymm0,%ymm0
	vpxor	96(%rsi),%ymm4,%ymm4
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm11,0(%rdi)
	vmovdqu	%ymm9,32(%rdi)
	vmovdqu	%ymm0,64(%rdi)
	vmovdqu	%ymm4,96(%rdi)
	leaq	128(%rdi),%rdi

	subq	$512,%rdx
	jnz	.Loop_outer8x

	jmp	.Ldone8x

.Ltail8x:
	cmpq	$448,%rdx
	jae	.L448_or_more8x
	cmpq	$384,%rdx
	jae	.L384_or_more8x
	cmpq	$320,%rdx
	jae	.L320_or_more8x
	cmpq	$256,%rdx
	jae	.L256_or_more8x
	cmpq	$192,%rdx
	jae	.L192_or_more8x
	cmpq	$128,%rdx
	jae	.L128_or_more8x
	cmpq	$64,%rdx
	jae	.L64_or_more8x

	xorq	%r10,%r10
	vmovdqa	%ymm6,0(%rsp)
	vmovdqa	%ymm8,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L64_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	je	.Ldone8x

	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm1,0(%rsp)
	leaq	64(%rdi),%rdi
	subq	$64,%rdx
	vmovdqa	%ymm5,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L128_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	je	.Ldone8x

	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm12,0(%rsp)
	leaq	128(%rdi),%rdi
	subq	$128,%rdx
	vmovdqa	%ymm13,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L192_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	je	.Ldone8x

	leaq	192(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm10,0(%rsp)
	leaq	192(%rdi),%rdi
	subq	$192,%rdx
	vmovdqa	%ymm15,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L256_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	je	.Ldone8x

	leaq	256(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm14,0(%rsp)
	leaq	256(%rdi),%rdi
	subq	$256,%rdx
	vmovdqa	%ymm2,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L320_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	je	.Ldone8x

	leaq	320(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm3,0(%rsp)
	leaq	320(%rdi),%rdi
	subq	$320,%rdx
	vmovdqa	%ymm7,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L384_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	je	.Ldone8x

	leaq	384(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm11,0(%rsp)
	leaq	384(%rdi),%rdi
	subq	$384,%rdx
	vmovdqa	%ymm9,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L448_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vpxor	384(%rsi),%ymm11,%ymm11
	vpxor	416(%rsi),%ymm9,%ymm9
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	vmovdqu	%ymm11,384(%rdi)
	vmovdqu	%ymm9,416(%rdi)
	je	.Ldone8x

	leaq	448(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm0,0(%rsp)
	leaq	448(%rdi),%rdi
	subq	$448,%rdx
	vmovdqa	%ymm4,32(%rsp)

.Loop_tail8x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail8x

.Ldone8x:
	vzeroall
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L8x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_8x,.-ChaCha20_8x
	.section ".note.gnu.property", "a"
	.p2align 3
	.long 1f - 0f
	.long 4f - 1f
	.long 5
0:
	# "GNU" encoded with .byte, since .asciz isn't supported
	# on Solaris.
	.byte 0x47
	.byte 0x4e
	.byte 0x55
	.byte 0
1:
	.p2align 3
	.long 0xc0000002
	.long 3f - 2f
2:
	.long 3
3:
	.p2align 3
4: