/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from chacha-x86_64.pl. */
.text

/* Constant pool: block-counter increments, the pshufb masks that implement */
/* the rotate-by-16 and rotate-by-8 steps, and the "expand 32-byte k" sigma. */
.align 64
.Lzero:
.long 0,0,0,0
.Lone:
.long 1,0,0,0
.Linc:
.long 0,1,2,3
.Lfour:
.long 4,4,4,4
.Lincy:
.long 0,2,4,6,1,3,5,7
.Leight:
.long 8,8,8,8,8,8,8,8
.Lrot16:
.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Ltwoy:
.long 2,0,0,0, 2,0,0,0
.align 64
.Lzeroz:
.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.Lsigma:
.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0

/* void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, */
/*                     size_t len, const unsigned int key[8],        */
/*                     const unsigned int counter[4]);               */
/* Integer (non-SIMD) code; dispatches to the SSSE3 code below when  */
/* OPENSSL_ia32cap_P advertises SSSE3.                               */
.globl ChaCha20_ctr32
.type ChaCha20_ctr32,@function
.align 64
ChaCha20_ctr32:
.cfi_startproc
	cmpq $0,%rdx
	je .Lno_data
	movq OPENSSL_ia32cap_P+4(%rip),%r10
	testl $512,%r10d
	jnz .LChaCha20_ssse3

	pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
	pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
	pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
	pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
	pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
	pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
	subq $64+24,%rsp
.cfi_adjust_cfa_offset 64+24
.Lctr32_body:

	movdqu (%rcx),%xmm1
	movdqu 16(%rcx),%xmm2
	movdqu (%r8),%xmm3
	movdqa .Lone(%rip),%xmm4

	movdqa %xmm1,16(%rsp)
	movdqa %xmm2,32(%rsp)
	movdqa %xmm3,48(%rsp)
	movq %rdx,%rbp
	jmp .Loop_outer

.align 32
.Loop_outer:
	movl $0x61707865,%eax
	movl $0x3320646e,%ebx
	movl $0x79622d32,%ecx
	movl $0x6b206574,%edx
	movl 16(%rsp),%r8d
	movl 20(%rsp),%r9d
	movl 24(%rsp),%r10d
	movl 28(%rsp),%r11d
	movd %xmm3,%r12d
	movl 52(%rsp),%r13d
	movl 56(%rsp),%r14d
	movl 60(%rsp),%r15d

	movq %rbp,64+0(%rsp)
	movl $10,%ebp
	movq %rsi,64+8(%rsp)
.byte 102,72,15,126,214
	movq %rdi,64+16(%rsp)
	movq %rsi,%rdi
	shrq $32,%rdi
	jmp .Loop

/* Ten double rounds: four column quarter-rounds, then four diagonal ones; */
/* each quarter-round is add/xor/rotate by 16, 12, 8 and 7.  Two of the    */
/* x8..x11 state words live in %esi/%edi at a time, the other two are      */
/* spilled at 32..44(%rsp) and swapped in as needed.                       */
.align 32
.Loop:
	addl %r8d,%eax
	xorl %eax,%r12d
	roll $16,%r12d
	addl %r9d,%ebx
	xorl %ebx,%r13d
	roll $16,%r13d
	addl %r12d,%esi
	xorl %esi,%r8d
	roll $12,%r8d
	addl %r13d,%edi
	xorl %edi,%r9d
	roll $12,%r9d
	addl %r8d,%eax
	xorl %eax,%r12d
	roll $8,%r12d
	addl %r9d,%ebx
	xorl %ebx,%r13d
	roll $8,%r13d
	addl %r12d,%esi
	xorl %esi,%r8d
	roll $7,%r8d
	addl %r13d,%edi
	xorl %edi,%r9d
	roll $7,%r9d
	movl %esi,32(%rsp)
	movl %edi,36(%rsp)
	movl 40(%rsp),%esi
	movl 44(%rsp),%edi
	addl %r10d,%ecx
	xorl %ecx,%r14d
	roll $16,%r14d
	addl %r11d,%edx
	xorl %edx,%r15d
	roll $16,%r15d
	addl %r14d,%esi
	xorl %esi,%r10d
	roll $12,%r10d
	addl %r15d,%edi
	xorl %edi,%r11d
	roll $12,%r11d
	addl %r10d,%ecx
	xorl %ecx,%r14d
	roll $8,%r14d
	addl %r11d,%edx
	xorl %edx,%r15d
	roll $8,%r15d
	addl %r14d,%esi
	xorl %esi,%r10d
	roll $7,%r10d
	addl %r15d,%edi
	xorl %edi,%r11d
	roll $7,%r11d
	addl %r9d,%eax
	xorl %eax,%r15d
	roll $16,%r15d
	addl %r10d,%ebx
	xorl %ebx,%r12d
	roll $16,%r12d
	addl %r15d,%esi
	xorl %esi,%r9d
	roll $12,%r9d
	addl %r12d,%edi
	xorl %edi,%r10d
	roll $12,%r10d
	addl %r9d,%eax
	xorl %eax,%r15d
	roll $8,%r15d
	addl %r10d,%ebx
	xorl %ebx,%r12d
	roll $8,%r12d
	addl %r15d,%esi
	xorl %esi,%r9d
	roll $7,%r9d
	addl %r12d,%edi
	xorl %edi,%r10d
	roll $7,%r10d
	movl %esi,40(%rsp)
	movl %edi,44(%rsp)
	movl 32(%rsp),%esi
	movl 36(%rsp),%edi
	addl %r11d,%ecx
	xorl %ecx,%r13d
	roll $16,%r13d
	addl %r8d,%edx
	xorl %edx,%r14d
	roll $16,%r14d
	addl %r13d,%esi
	xorl %esi,%r11d
	roll $12,%r11d
	addl %r14d,%edi
	xorl %edi,%r8d
	roll $12,%r8d
	addl %r11d,%ecx
	xorl %ecx,%r13d
	roll $8,%r13d
	addl %r8d,%edx
	xorl %edx,%r14d
	roll $8,%r14d
	addl %r13d,%esi
	xorl %esi,%r11d
	roll $7,%r11d
	addl %r14d,%edi
	xorl %edi,%r8d
	roll $7,%r8d
	decl %ebp
	jnz .Loop
	movl %edi,36(%rsp)
	movl %esi,32(%rsp)
	movq 64(%rsp),%rbp
	movdqa %xmm2,%xmm1
	movq 64+8(%rsp),%rsi
	paddd %xmm4,%xmm3
	movq 64+16(%rsp),%rdi

	addl $0x61707865,%eax
	addl $0x3320646e,%ebx
	addl $0x79622d32,%ecx
	addl $0x6b206574,%edx
	addl 16(%rsp),%r8d
	addl 20(%rsp),%r9d
	addl 24(%rsp),%r10d
	addl 28(%rsp),%r11d
	addl 48(%rsp),%r12d
	addl 52(%rsp),%r13d
	addl 56(%rsp),%r14d
	addl 60(%rsp),%r15d
	paddd 32(%rsp),%xmm1

	cmpq $64,%rbp
	jb .Ltail

	xorl 0(%rsi),%eax
	xorl 4(%rsi),%ebx
	xorl 8(%rsi),%ecx
	xorl 12(%rsi),%edx
	xorl 16(%rsi),%r8d
	xorl 20(%rsi),%r9d
	xorl 24(%rsi),%r10d
	xorl 28(%rsi),%r11d
	movdqu 32(%rsi),%xmm0
	xorl 48(%rsi),%r12d
	xorl 52(%rsi),%r13d
	xorl 56(%rsi),%r14d
	xorl 60(%rsi),%r15d
	leaq 64(%rsi),%rsi
	pxor %xmm1,%xmm0

	movdqa %xmm2,32(%rsp)
	movd %xmm3,48(%rsp)

	movl %eax,0(%rdi)
	movl %ebx,4(%rdi)
	movl %ecx,8(%rdi)
	movl %edx,12(%rdi)
	movl %r8d,16(%rdi)
	movl %r9d,20(%rdi)
	movl %r10d,24(%rdi)
	movl %r11d,28(%rdi)
	movdqu %xmm0,32(%rdi)
	movl %r12d,48(%rdi)
	movl %r13d,52(%rdi)
	movl %r14d,56(%rdi)
	movl %r15d,60(%rdi)
	leaq 64(%rdi),%rdi

	subq $64,%rbp
	jnz .Loop_outer

	jmp .Ldone

/* Fewer than 64 bytes remain: spill the keystream block to the stack */
/* and XOR it into the input byte by byte.                            */
.align 16
.Ltail:
	movl %eax,0(%rsp)
	movl %ebx,4(%rsp)
	xorq %rbx,%rbx
	movl %ecx,8(%rsp)
	movl %edx,12(%rsp)
	movl %r8d,16(%rsp)
	movl %r9d,20(%rsp)
	movl %r10d,24(%rsp)
	movl %r11d,28(%rsp)
	movdqa %xmm1,32(%rsp)
	movl %r12d,48(%rsp)
	movl %r13d,52(%rsp)
	movl %r14d,56(%rsp)
	movl %r15d,60(%rsp)

.Loop_tail:
	movzbl (%rsi,%rbx,1),%eax
	movzbl (%rsp,%rbx,1),%edx
	leaq 1(%rbx),%rbx
	xorl %edx,%eax
	movb %al,-1(%rdi,%rbx,1)
	decq %rbp
	jnz .Loop_tail

.Ldone:
	leaq 64+24+48(%rsp),%rsi
.cfi_def_cfa %rsi,8
	movq -48(%rsi),%r15
.cfi_restore %r15
	movq -40(%rsi),%r14
.cfi_restore %r14
	movq -32(%rsi),%r13
.cfi_restore %r13
	movq -24(%rsi),%r12
.cfi_restore %r12
	movq -16(%rsi),%rbp
.cfi_restore %rbp
	movq -8(%rsi),%rbx
.cfi_restore %rbx
	leaq (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lno_data:
	.byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_ctr32,.-ChaCha20_ctr32

/* 1x SSSE3 code path: one 64-byte block per iteration; pshufb against */
/* .Lrot16/.Lrot24 performs the 16- and 8-bit rotates.  Also the       */
/* dispatch point for the XOP, 4x and 8x code paths.                   */
.type ChaCha20_ssse3,@function
.align 32
ChaCha20_ssse3:
.cfi_startproc
.LChaCha20_ssse3:
	movq %rsp,%r9
.cfi_def_cfa_register %r9
	testl $2048,%r10d
	jnz .LChaCha20_4xop
	cmpq $128,%rdx
	je .LChaCha20_128
	ja .LChaCha20_4x

.Ldo_sse3_after_all:
	subq $64+8,%rsp
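/* Load the input state: the sigma constant, the 256-bit key from (%rcx) */
/* and the counter/nonce block from (%r8); xmm6/xmm7 hold the pshufb     */
/* rotate masks for the rest of this path.                               */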
	movdqa .Lsigma(%rip),%xmm0
	movdqu (%rcx),%xmm1
	movdqu 16(%rcx),%xmm2
	movdqu (%r8),%xmm3
	movdqa .Lrot16(%rip),%xmm6
	movdqa .Lrot24(%rip),%xmm7

	movdqa %xmm0,0(%rsp)
	movdqa %xmm1,16(%rsp)
	movdqa %xmm2,32(%rsp)
	movdqa %xmm3,48(%rsp)
	movq $10,%r8
	jmp .Loop_ssse3

.align 32
.Loop_outer_ssse3:
	movdqa .Lone(%rip),%xmm3
	movdqa 0(%rsp),%xmm0
	movdqa 16(%rsp),%xmm1
	movdqa 32(%rsp),%xmm2
	paddd 48(%rsp),%xmm3
	movq $10,%r8
	movdqa %xmm3,48(%rsp)
	jmp .Loop_ssse3

.align 32
.Loop_ssse3:
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
.byte 102,15,56,0,222
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm4
	psrld $20,%xmm1
	pslld $12,%xmm4
	por %xmm4,%xmm1
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
.byte 102,15,56,0,223
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm4
	psrld $25,%xmm1
	pslld $7,%xmm4
	por %xmm4,%xmm1
	pshufd $78,%xmm2,%xmm2
	pshufd $57,%xmm1,%xmm1
	pshufd $147,%xmm3,%xmm3
	nop
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
.byte 102,15,56,0,222
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm4
	psrld $20,%xmm1
	pslld $12,%xmm4
	por %xmm4,%xmm1
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
.byte 102,15,56,0,223
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm4
	psrld $25,%xmm1
	pslld $7,%xmm4
	por %xmm4,%xmm1
	pshufd $78,%xmm2,%xmm2
	pshufd $147,%xmm1,%xmm1
	pshufd $57,%xmm3,%xmm3
	decq %r8
	jnz .Loop_ssse3
	paddd 0(%rsp),%xmm0
	paddd 16(%rsp),%xmm1
	paddd 32(%rsp),%xmm2
	paddd 48(%rsp),%xmm3

	cmpq $64,%rdx
	jb .Ltail_ssse3

	movdqu 0(%rsi),%xmm4
	movdqu 16(%rsi),%xmm5
	pxor %xmm4,%xmm0
	movdqu 32(%rsi),%xmm4
	pxor %xmm5,%xmm1
	movdqu 48(%rsi),%xmm5
	leaq 64(%rsi),%rsi
	pxor %xmm4,%xmm2
	pxor %xmm5,%xmm3

	movdqu %xmm0,0(%rdi)
	movdqu %xmm1,16(%rdi)
	movdqu %xmm2,32(%rdi)
	movdqu %xmm3,48(%rdi)
	leaq 64(%rdi),%rdi

	subq $64,%rdx
	jnz .Loop_outer_ssse3

	jmp .Ldone_ssse3

.align 16
.Ltail_ssse3:
	movdqa %xmm0,0(%rsp)
	movdqa %xmm1,16(%rsp)
	movdqa %xmm2,32(%rsp)
	movdqa %xmm3,48(%rsp)
	xorq %r8,%r8

.Loop_tail_ssse3:
	movzbl (%rsi,%r8,1),%eax
	movzbl (%rsp,%r8,1),%ecx
	leaq 1(%r8),%r8
	xorl %ecx,%eax
	movb %al,-1(%rdi,%r8,1)
	decq %rdx
	jnz .Loop_tail_ssse3

.Ldone_ssse3:
	leaq (%r9),%rsp
.cfi_def_cfa_register %rsp
.Lssse3_epilogue:
	.byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_ssse3,.-ChaCha20_ssse3

/* Fixed 128-byte code path: exactly two blocks, interleaved in registers. */
.type ChaCha20_128,@function
.align 32
ChaCha20_128:
.cfi_startproc
.LChaCha20_128:
	movq %rsp,%r9
.cfi_def_cfa_register %r9
	subq $64+8,%rsp
	movdqa .Lsigma(%rip),%xmm8
	movdqu (%rcx),%xmm9
	movdqu 16(%rcx),%xmm2
	movdqu (%r8),%xmm3
	movdqa .Lone(%rip),%xmm1
	movdqa .Lrot16(%rip),%xmm6
	movdqa .Lrot24(%rip),%xmm7

	movdqa %xmm8,%xmm10
	movdqa %xmm8,0(%rsp)
	movdqa %xmm9,%xmm11
	movdqa %xmm9,16(%rsp)
	movdqa %xmm2,%xmm0
	movdqa %xmm2,32(%rsp)
	paddd %xmm3,%xmm1
	movdqa %xmm3,48(%rsp)
	movq $10,%r8
	jmp .Loop_128

.align 32
.Loop_128:
	paddd %xmm9,%xmm8
	pxor %xmm8,%xmm3
	paddd %xmm11,%xmm10
	pxor %xmm10,%xmm1
.byte 102,15,56,0,222
.byte 102,15,56,0,206
	paddd %xmm3,%xmm2
	paddd %xmm1,%xmm0
	pxor %xmm2,%xmm9
	pxor %xmm0,%xmm11
	movdqa %xmm9,%xmm4
	psrld $20,%xmm9
	movdqa %xmm11,%xmm5
	pslld $12,%xmm4
	psrld $20,%xmm11
	por %xmm4,%xmm9
	pslld $12,%xmm5
	por %xmm5,%xmm11
	paddd %xmm9,%xmm8
	pxor %xmm8,%xmm3
	paddd %xmm11,%xmm10
	pxor %xmm10,%xmm1
.byte 102,15,56,0,223
.byte 102,15,56,0,207
	paddd %xmm3,%xmm2
	paddd %xmm1,%xmm0
	pxor %xmm2,%xmm9
	pxor %xmm0,%xmm11
	movdqa %xmm9,%xmm4
	psrld $25,%xmm9
	movdqa %xmm11,%xmm5
	pslld $7,%xmm4
	psrld $25,%xmm11
	por %xmm4,%xmm9
	pslld $7,%xmm5
	por %xmm5,%xmm11
	pshufd $78,%xmm2,%xmm2
	pshufd $57,%xmm9,%xmm9
	pshufd $147,%xmm3,%xmm3
	pshufd $78,%xmm0,%xmm0
	pshufd $57,%xmm11,%xmm11
	pshufd $147,%xmm1,%xmm1
	paddd %xmm9,%xmm8
	pxor %xmm8,%xmm3
	paddd %xmm11,%xmm10
	pxor %xmm10,%xmm1
.byte 102,15,56,0,222
.byte 102,15,56,0,206
	paddd %xmm3,%xmm2
	paddd %xmm1,%xmm0
	pxor %xmm2,%xmm9
	pxor %xmm0,%xmm11
	movdqa %xmm9,%xmm4
	psrld $20,%xmm9
	movdqa %xmm11,%xmm5
	pslld $12,%xmm4
	psrld $20,%xmm11
	por %xmm4,%xmm9
	pslld $12,%xmm5
	por %xmm5,%xmm11
	paddd %xmm9,%xmm8
	pxor %xmm8,%xmm3
	paddd %xmm11,%xmm10
	pxor %xmm10,%xmm1
.byte 102,15,56,0,223
.byte 102,15,56,0,207
	paddd %xmm3,%xmm2
	paddd %xmm1,%xmm0
	pxor %xmm2,%xmm9
	pxor %xmm0,%xmm11
	movdqa %xmm9,%xmm4
	psrld $25,%xmm9
	movdqa %xmm11,%xmm5
	pslld $7,%xmm4
	psrld $25,%xmm11
	por %xmm4,%xmm9
	pslld $7,%xmm5
	por %xmm5,%xmm11
	pshufd $78,%xmm2,%xmm2
	pshufd $147,%xmm9,%xmm9
	pshufd $57,%xmm3,%xmm3
	pshufd $78,%xmm0,%xmm0
	pshufd $147,%xmm11,%xmm11
	pshufd $57,%xmm1,%xmm1
	decq %r8
	jnz .Loop_128
	paddd 0(%rsp),%xmm8
	paddd 16(%rsp),%xmm9
	paddd 32(%rsp),%xmm2
	paddd 48(%rsp),%xmm3
	paddd .Lone(%rip),%xmm1
	paddd 0(%rsp),%xmm10
	paddd 16(%rsp),%xmm11
	paddd 32(%rsp),%xmm0
	paddd 48(%rsp),%xmm1

	movdqu 0(%rsi),%xmm4
	movdqu 16(%rsi),%xmm5
	pxor %xmm4,%xmm8
	movdqu 32(%rsi),%xmm4
	pxor %xmm5,%xmm9
	movdqu 48(%rsi),%xmm5
	pxor %xmm4,%xmm2
	movdqu 64(%rsi),%xmm4
	pxor %xmm5,%xmm3
	movdqu 80(%rsi),%xmm5
	pxor %xmm4,%xmm10
	movdqu 96(%rsi),%xmm4
	pxor %xmm5,%xmm11
	movdqu 112(%rsi),%xmm5
	pxor %xmm4,%xmm0
	pxor %xmm5,%xmm1

	movdqu %xmm8,0(%rdi)
	movdqu %xmm9,16(%rdi)
	movdqu %xmm2,32(%rdi)
	movdqu %xmm3,48(%rdi)
	movdqu %xmm10,64(%rdi)
	movdqu %xmm11,80(%rdi)
	movdqu %xmm0,96(%rdi)
	movdqu %xmm1,112(%rdi)
	leaq (%r9),%rsp
.cfi_def_cfa_register %rsp
.L128_epilogue:
	.byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_128,.-ChaCha20_128

/* 4x SSSE3 code path: four blocks in parallel, the state splatted one */
/* word per register; .Linc staggers the four block counters.          */
.type ChaCha20_4x,@function
.align 32
ChaCha20_4x:
.cfi_startproc
.LChaCha20_4x:
	movq %rsp,%r9
.cfi_def_cfa_register %r9
	movq %r10,%r11
	shrq $32,%r10
	testq $32,%r10
	jnz .LChaCha20_8x
	cmpq $192,%rdx
	ja .Lproceed4x

/* Up to 192 bytes on MOVBE-without-XSAVE (Silvermont-class) CPUs are */
/* cheaper on the 1x path.                                            */
	andq $71303168,%r11
	cmpq $4194304,%r11
	je .Ldo_sse3_after_all

.Lproceed4x:
	subq $0x140+8,%rsp
	movdqa .Lsigma(%rip),%xmm11
	movdqu (%rcx),%xmm15
	movdqu 16(%rcx),%xmm7
	movdqu (%r8),%xmm3
	leaq 256(%rsp),%rcx
	leaq .Lrot16(%rip),%r10
	leaq .Lrot24(%rip),%r11

	pshufd $0x00,%xmm11,%xmm8
	pshufd $0x55,%xmm11,%xmm9
	movdqa %xmm8,64(%rsp)
	pshufd $0xaa,%xmm11,%xmm10
	movdqa %xmm9,80(%rsp)
	pshufd $0xff,%xmm11,%xmm11
	movdqa %xmm10,96(%rsp)
	movdqa %xmm11,112(%rsp)

	pshufd $0x00,%xmm15,%xmm12
	pshufd $0x55,%xmm15,%xmm13
	movdqa %xmm12,128-256(%rcx)
	pshufd $0xaa,%xmm15,%xmm14
	movdqa %xmm13,144-256(%rcx)
	pshufd $0xff,%xmm15,%xmm15
	movdqa %xmm14,160-256(%rcx)
	movdqa %xmm15,176-256(%rcx)

	pshufd $0x00,%xmm7,%xmm4
	pshufd $0x55,%xmm7,%xmm5
	movdqa %xmm4,192-256(%rcx)
	pshufd $0xaa,%xmm7,%xmm6
	movdqa %xmm5,208-256(%rcx)
	pshufd $0xff,%xmm7,%xmm7
	movdqa %xmm6,224-256(%rcx)
	movdqa %xmm7,240-256(%rcx)

	pshufd $0x00,%xmm3,%xmm0
	pshufd $0x55,%xmm3,%xmm1
	paddd .Linc(%rip),%xmm0
	pshufd $0xaa,%xmm3,%xmm2
	movdqa %xmm1,272-256(%rcx)
	pshufd $0xff,%xmm3,%xmm3
	movdqa %xmm2,288-256(%rcx)
	movdqa %xmm3,304-256(%rcx)

	jmp .Loop_enter4x

.align 32
.Loop_outer4x:
	movdqa 64(%rsp),%xmm8
	movdqa 80(%rsp),%xmm9
	movdqa 96(%rsp),%xmm10
	movdqa 112(%rsp),%xmm11
	movdqa 128-256(%rcx),%xmm12
	movdqa 144-256(%rcx),%xmm13
	movdqa 160-256(%rcx),%xmm14
	movdqa 176-256(%rcx),%xmm15
	movdqa 192-256(%rcx),%xmm4
	movdqa 208-256(%rcx),%xmm5
	movdqa 224-256(%rcx),%xmm6
	movdqa 240-256(%rcx),%xmm7
	movdqa 256-256(%rcx),%xmm0
	movdqa 272-256(%rcx),%xmm1
	movdqa 288-256(%rcx),%xmm2
	movdqa 304-256(%rcx),%xmm3
	paddd .Lfour(%rip),%xmm0

.Loop_enter4x:
	movdqa %xmm6,32(%rsp)
	movdqa %xmm7,48(%rsp)
	movdqa (%r10),%xmm7
	movl $10,%eax
	movdqa %xmm0,256-256(%rcx)
	jmp .Loop4x

/* Double round over four interleaved blocks; the rot16/rot24 pshufb */
/* masks are kept at (%r10)/(%r11) and reloaded as registers are     */
/* recycled.                                                         */
.align 32
.Loop4x:
	paddd %xmm12,%xmm8
	paddd %xmm13,%xmm9
	pxor %xmm8,%xmm0
	pxor %xmm9,%xmm1
.byte 102,15,56,0,199
.byte 102,15,56,0,207
	paddd %xmm0,%xmm4
	paddd %xmm1,%xmm5
	pxor %xmm4,%xmm12
	pxor %xmm5,%xmm13
	movdqa %xmm12,%xmm6
	pslld $12,%xmm12
	psrld $20,%xmm6
	movdqa %xmm13,%xmm7
	pslld $12,%xmm13
	por %xmm6,%xmm12
	psrld $20,%xmm7
	movdqa (%r11),%xmm6
	por %xmm7,%xmm13
	paddd %xmm12,%xmm8
	paddd %xmm13,%xmm9
	pxor %xmm8,%xmm0
	pxor %xmm9,%xmm1
.byte 102,15,56,0,198
.byte 102,15,56,0,206
	paddd %xmm0,%xmm4
	paddd %xmm1,%xmm5
	pxor %xmm4,%xmm12
	pxor %xmm5,%xmm13
	movdqa %xmm12,%xmm7
	pslld $7,%xmm12
	psrld $25,%xmm7
	movdqa %xmm13,%xmm6
	pslld $7,%xmm13
	por %xmm7,%xmm12
	psrld $25,%xmm6
	movdqa (%r10),%xmm7
	por %xmm6,%xmm13
	movdqa %xmm4,0(%rsp)
	movdqa %xmm5,16(%rsp)
	movdqa 32(%rsp),%xmm4
	movdqa 48(%rsp),%xmm5
	paddd %xmm14,%xmm10
	paddd %xmm15,%xmm11
	pxor %xmm10,%xmm2
	pxor %xmm11,%xmm3
.byte 102,15,56,0,215
.byte 102,15,56,0,223
	paddd %xmm2,%xmm4
	paddd %xmm3,%xmm5
	pxor %xmm4,%xmm14
	pxor %xmm5,%xmm15
	movdqa %xmm14,%xmm6
	pslld $12,%xmm14
	psrld $20,%xmm6
	movdqa %xmm15,%xmm7
	pslld $12,%xmm15
	por %xmm6,%xmm14
	psrld $20,%xmm7
	movdqa (%r11),%xmm6
	por %xmm7,%xmm15
	paddd %xmm14,%xmm10
	paddd %xmm15,%xmm11
	pxor %xmm10,%xmm2
	pxor %xmm11,%xmm3
.byte 102,15,56,0,214
.byte 102,15,56,0,222
	paddd %xmm2,%xmm4
	paddd %xmm3,%xmm5
	pxor %xmm4,%xmm14
	pxor %xmm5,%xmm15
	movdqa %xmm14,%xmm7
	pslld $7,%xmm14
	psrld $25,%xmm7
	movdqa %xmm15,%xmm6
	pslld $7,%xmm15
	por %xmm7,%xmm14
	psrld $25,%xmm6
	movdqa (%r10),%xmm7
	por %xmm6,%xmm15
	paddd %xmm13,%xmm8
	paddd %xmm14,%xmm9
	pxor %xmm8,%xmm3
	pxor %xmm9,%xmm0
.byte 102,15,56,0,223
.byte 102,15,56,0,199
	paddd %xmm3,%xmm4
	paddd %xmm0,%xmm5
	pxor %xmm4,%xmm13
	pxor %xmm5,%xmm14
	movdqa %xmm13,%xmm6
	pslld $12,%xmm13
	psrld $20,%xmm6
	movdqa %xmm14,%xmm7
	pslld $12,%xmm14
	por %xmm6,%xmm13
	psrld $20,%xmm7
	movdqa (%r11),%xmm6
	por %xmm7,%xmm14
	paddd %xmm13,%xmm8
	paddd %xmm14,%xmm9
	pxor %xmm8,%xmm3
	pxor %xmm9,%xmm0
.byte 102,15,56,0,222
.byte 102,15,56,0,198
	paddd %xmm3,%xmm4
	paddd %xmm0,%xmm5
	pxor %xmm4,%xmm13
	pxor %xmm5,%xmm14
	movdqa %xmm13,%xmm7
	pslld $7,%xmm13
	psrld $25,%xmm7
	movdqa %xmm14,%xmm6
	pslld $7,%xmm14
	por %xmm7,%xmm13
	psrld $25,%xmm6
	movdqa (%r10),%xmm7
	por %xmm6,%xmm14
	movdqa %xmm4,32(%rsp)
	movdqa %xmm5,48(%rsp)
	movdqa 0(%rsp),%xmm4
	movdqa 16(%rsp),%xmm5
	paddd %xmm15,%xmm10
	paddd %xmm12,%xmm11
	pxor %xmm10,%xmm1
	pxor %xmm11,%xmm2
.byte 102,15,56,0,207
.byte 102,15,56,0,215
	paddd %xmm1,%xmm4
	paddd %xmm2,%xmm5
	pxor %xmm4,%xmm15
	pxor %xmm5,%xmm12
	movdqa %xmm15,%xmm6
	pslld $12,%xmm15
	psrld $20,%xmm6
	movdqa %xmm12,%xmm7
	pslld $12,%xmm12
	por %xmm6,%xmm15
	psrld $20,%xmm7
	movdqa (%r11),%xmm6
	por %xmm7,%xmm12
	paddd %xmm15,%xmm10
	paddd %xmm12,%xmm11
	pxor %xmm10,%xmm1
	pxor %xmm11,%xmm2
.byte 102,15,56,0,206
.byte 102,15,56,0,214
	paddd %xmm1,%xmm4
	paddd %xmm2,%xmm5
	pxor %xmm4,%xmm15
	pxor %xmm5,%xmm12
	movdqa %xmm15,%xmm7
	pslld $7,%xmm15
	psrld $25,%xmm7
	movdqa %xmm12,%xmm6
	pslld $7,%xmm12
	por %xmm7,%xmm15
	psrld $25,%xmm6
	movdqa (%r10),%xmm7
	por %xmm6,%xmm12
	decl %eax
	jnz .Loop4x

/* Add back the saved input state, then transpose the 4x4 word groups */
/* from word-sliced order into contiguous 64-byte blocks.             */
	paddd 64(%rsp),%xmm8
	paddd 80(%rsp),%xmm9
	paddd 96(%rsp),%xmm10
	paddd 112(%rsp),%xmm11

	movdqa %xmm8,%xmm6
	punpckldq %xmm9,%xmm8
	movdqa %xmm10,%xmm7
	punpckldq %xmm11,%xmm10
	punpckhdq %xmm9,%xmm6
	punpckhdq %xmm11,%xmm7
	movdqa %xmm8,%xmm9
	punpcklqdq %xmm10,%xmm8
	movdqa %xmm6,%xmm11
	punpcklqdq %xmm7,%xmm6
	punpckhqdq %xmm10,%xmm9
	punpckhqdq %xmm7,%xmm11
	paddd 128-256(%rcx),%xmm12
	paddd 144-256(%rcx),%xmm13
	paddd 160-256(%rcx),%xmm14
	paddd 176-256(%rcx),%xmm15

	movdqa %xmm8,0(%rsp)
	movdqa %xmm9,16(%rsp)
	movdqa 32(%rsp),%xmm8
	movdqa 48(%rsp),%xmm9

	movdqa %xmm12,%xmm10
	punpckldq %xmm13,%xmm12
	movdqa %xmm14,%xmm7
	punpckldq %xmm15,%xmm14
	punpckhdq %xmm13,%xmm10
	punpckhdq %xmm15,%xmm7
	movdqa %xmm12,%xmm13
	punpcklqdq %xmm14,%xmm12
	movdqa %xmm10,%xmm15
	punpcklqdq %xmm7,%xmm10
	punpckhqdq %xmm14,%xmm13
	punpckhqdq %xmm7,%xmm15
	paddd 192-256(%rcx),%xmm4
	paddd 208-256(%rcx),%xmm5
	paddd 224-256(%rcx),%xmm8
	paddd 240-256(%rcx),%xmm9

	movdqa %xmm6,32(%rsp)
	movdqa %xmm11,48(%rsp)

	movdqa %xmm4,%xmm14
	punpckldq %xmm5,%xmm4
	movdqa %xmm8,%xmm7
	punpckldq %xmm9,%xmm8
	punpckhdq %xmm5,%xmm14
	punpckhdq %xmm9,%xmm7
	movdqa %xmm4,%xmm5
	punpcklqdq %xmm8,%xmm4
	movdqa %xmm14,%xmm9
	punpcklqdq %xmm7,%xmm14
	punpckhqdq %xmm8,%xmm5
	punpckhqdq %xmm7,%xmm9
	paddd 256-256(%rcx),%xmm0
	paddd 272-256(%rcx),%xmm1
	paddd 288-256(%rcx),%xmm2
	paddd 304-256(%rcx),%xmm3

	movdqa %xmm0,%xmm8
	punpckldq %xmm1,%xmm0
	movdqa %xmm2,%xmm7
	punpckldq %xmm3,%xmm2
	punpckhdq %xmm1,%xmm8
	punpckhdq %xmm3,%xmm7
	movdqa %xmm0,%xmm1
	punpcklqdq %xmm2,%xmm0
	movdqa %xmm8,%xmm3
	punpcklqdq %xmm7,%xmm8
	punpckhqdq %xmm2,%xmm1
	punpckhqdq %xmm7,%xmm3
	cmpq $256,%rdx
	jb .Ltail4x

	movdqu 0(%rsi),%xmm6
	movdqu 16(%rsi),%xmm11
	movdqu 32(%rsi),%xmm2
	movdqu 48(%rsi),%xmm7
	pxor 0(%rsp),%xmm6
	pxor %xmm12,%xmm11
	pxor %xmm4,%xmm2
	pxor %xmm0,%xmm7

	movdqu %xmm6,0(%rdi)
	movdqu 64(%rsi),%xmm6
	movdqu %xmm11,16(%rdi)
	movdqu 80(%rsi),%xmm11
	movdqu %xmm2,32(%rdi)
	movdqu 96(%rsi),%xmm2
	movdqu %xmm7,48(%rdi)
	movdqu 112(%rsi),%xmm7
	leaq 128(%rsi),%rsi
	pxor 16(%rsp),%xmm6
	pxor %xmm13,%xmm11
	pxor %xmm5,%xmm2
	pxor %xmm1,%xmm7

	movdqu %xmm6,64(%rdi)
	movdqu 0(%rsi),%xmm6
	movdqu %xmm11,80(%rdi)
	movdqu 16(%rsi),%xmm11
	movdqu %xmm2,96(%rdi)
	movdqu 32(%rsi),%xmm2
	movdqu %xmm7,112(%rdi)
	leaq 128(%rdi),%rdi
	movdqu 48(%rsi),%xmm7
	pxor 32(%rsp),%xmm6
	pxor %xmm10,%xmm11
	pxor %xmm14,%xmm2
	pxor %xmm8,%xmm7

	movdqu %xmm6,0(%rdi)
	movdqu 64(%rsi),%xmm6
	movdqu %xmm11,16(%rdi)
	movdqu 80(%rsi),%xmm11
	movdqu %xmm2,32(%rdi)
	movdqu 96(%rsi),%xmm2
	movdqu %xmm7,48(%rdi)
	movdqu 112(%rsi),%xmm7
	leaq 128(%rsi),%rsi
	pxor 48(%rsp),%xmm6
	pxor %xmm15,%xmm11
	pxor %xmm9,%xmm2
	pxor %xmm3,%xmm7
	movdqu %xmm6,64(%rdi)
	movdqu %xmm11,80(%rdi)
	movdqu %xmm2,96(%rdi)
	movdqu %xmm7,112(%rdi)
	leaq 128(%rdi),%rdi

	subq $256,%rdx
	jnz .Loop_outer4x

	jmp .Ldone4x

/* 1..255 bytes remain: emit whole 64-byte blocks while possible, then */
/* fall into the bytewise tail loop.                                   */
.Ltail4x:
	cmpq $192,%rdx
	jae .L192_or_more4x
	cmpq $128,%rdx
	jae .L128_or_more4x
	cmpq $64,%rdx
	jae .L64_or_more4x

	xorq %r10,%r10

	movdqa %xmm12,16(%rsp)
	movdqa %xmm4,32(%rsp)
	movdqa %xmm0,48(%rsp)
	jmp .Loop_tail4x

.align 32
.L64_or_more4x:
	movdqu 0(%rsi),%xmm6
	movdqu 16(%rsi),%xmm11
	movdqu 32(%rsi),%xmm2
	movdqu 48(%rsi),%xmm7
	pxor 0(%rsp),%xmm6
	pxor %xmm12,%xmm11
	pxor %xmm4,%xmm2
	pxor %xmm0,%xmm7
	movdqu %xmm6,0(%rdi)
	movdqu %xmm11,16(%rdi)
	movdqu %xmm2,32(%rdi)
	movdqu %xmm7,48(%rdi)
	je .Ldone4x

	movdqa 16(%rsp),%xmm6
	leaq 64(%rsi),%rsi
	xorq %r10,%r10
	movdqa %xmm6,0(%rsp)
	movdqa %xmm13,16(%rsp)
	leaq 64(%rdi),%rdi
	movdqa %xmm5,32(%rsp)
	subq $64,%rdx
	movdqa %xmm1,48(%rsp)
	jmp .Loop_tail4x

.align 32
.L128_or_more4x:
	movdqu 0(%rsi),%xmm6
	movdqu 16(%rsi),%xmm11
	movdqu 32(%rsi),%xmm2
	movdqu 48(%rsi),%xmm7
	pxor 0(%rsp),%xmm6
	pxor %xmm12,%xmm11
	pxor %xmm4,%xmm2
	pxor %xmm0,%xmm7

	movdqu %xmm6,0(%rdi)
	movdqu 64(%rsi),%xmm6
	movdqu %xmm11,16(%rdi)
	movdqu 80(%rsi),%xmm11
	movdqu %xmm2,32(%rdi)
	movdqu 96(%rsi),%xmm2
	movdqu %xmm7,48(%rdi)
	movdqu 112(%rsi),%xmm7
	pxor 16(%rsp),%xmm6
	pxor %xmm13,%xmm11
	pxor %xmm5,%xmm2
	pxor %xmm1,%xmm7
	movdqu %xmm6,64(%rdi)
	movdqu %xmm11,80(%rdi)
	movdqu %xmm2,96(%rdi)
	movdqu %xmm7,112(%rdi)
	je .Ldone4x

	movdqa 32(%rsp),%xmm6
	leaq 128(%rsi),%rsi
	xorq %r10,%r10
	movdqa %xmm6,0(%rsp)
	movdqa %xmm10,16(%rsp)
	leaq 128(%rdi),%rdi
	movdqa %xmm14,32(%rsp)
	subq $128,%rdx
	movdqa %xmm8,48(%rsp)
	jmp .Loop_tail4x

.align 32
.L192_or_more4x:
	movdqu 0(%rsi),%xmm6
	movdqu 16(%rsi),%xmm11
	movdqu 32(%rsi),%xmm2
	movdqu 48(%rsi),%xmm7
	pxor 0(%rsp),%xmm6
	pxor %xmm12,%xmm11
	pxor %xmm4,%xmm2
	pxor %xmm0,%xmm7

	movdqu %xmm6,0(%rdi)
	movdqu 64(%rsi),%xmm6
	movdqu %xmm11,16(%rdi)
	movdqu 80(%rsi),%xmm11
	movdqu %xmm2,32(%rdi)
	movdqu 96(%rsi),%xmm2
	movdqu %xmm7,48(%rdi)
	movdqu 112(%rsi),%xmm7
	leaq 128(%rsi),%rsi
	pxor 16(%rsp),%xmm6
	pxor %xmm13,%xmm11
	pxor %xmm5,%xmm2
	pxor %xmm1,%xmm7

	movdqu %xmm6,64(%rdi)
	movdqu 0(%rsi),%xmm6
	movdqu %xmm11,80(%rdi)
	movdqu 16(%rsi),%xmm11
	movdqu %xmm2,96(%rdi)
	movdqu 32(%rsi),%xmm2
	movdqu %xmm7,112(%rdi)
	leaq 128(%rdi),%rdi
	movdqu 48(%rsi),%xmm7
	pxor 32(%rsp),%xmm6
	pxor %xmm10,%xmm11
	pxor %xmm14,%xmm2
	pxor %xmm8,%xmm7
	movdqu %xmm6,0(%rdi)
	movdqu %xmm11,16(%rdi)
	movdqu %xmm2,32(%rdi)
	movdqu %xmm7,48(%rdi)
	je .Ldone4x

	movdqa 48(%rsp),%xmm6
	leaq 64(%rsi),%rsi
	xorq %r10,%r10
	movdqa %xmm6,0(%rsp)
	movdqa %xmm15,16(%rsp)
	leaq 64(%rdi),%rdi
	movdqa %xmm9,32(%rsp)
	subq $192,%rdx
	movdqa %xmm3,48(%rsp)

.Loop_tail4x:
	movzbl (%rsi,%r10,1),%eax
	movzbl (%rsp,%r10,1),%ecx
	leaq 1(%r10),%r10
	xorl %ecx,%eax
	movb %al,-1(%rdi,%r10,1)
	decq %rdx
	jnz .Loop_tail4x

.Ldone4x:
	leaq (%r9),%rsp
.cfi_def_cfa_register %rsp
.L4x_epilogue:
	.byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_4x,.-ChaCha20_4x

/* AMD XOP code path: the .byte 143,... sequences encode vprotd, a */
/* single-instruction dword rotate, in place of the shift/or pairs. */
.type ChaCha20_4xop,@function
.align 32
ChaCha20_4xop:
.cfi_startproc
.LChaCha20_4xop:
	movq %rsp,%r9
.cfi_def_cfa_register %r9
	subq $0x140+8,%rsp
	vzeroupper

	vmovdqa .Lsigma(%rip),%xmm11
	vmovdqu (%rcx),%xmm3
	vmovdqu 16(%rcx),%xmm15
	vmovdqu (%r8),%xmm7
	leaq 256(%rsp),%rcx

	vpshufd $0x00,%xmm11,%xmm8
	vpshufd $0x55,%xmm11,%xmm9
	vmovdqa %xmm8,64(%rsp)
	vpshufd $0xaa,%xmm11,%xmm10
	vmovdqa %xmm9,80(%rsp)
	vpshufd $0xff,%xmm11,%xmm11
	vmovdqa %xmm10,96(%rsp)
	vmovdqa %xmm11,112(%rsp)

	vpshufd $0x00,%xmm3,%xmm0
	vpshufd $0x55,%xmm3,%xmm1
	vmovdqa %xmm0,128-256(%rcx)
	vpshufd $0xaa,%xmm3,%xmm2
	vmovdqa %xmm1,144-256(%rcx)
	vpshufd $0xff,%xmm3,%xmm3
	vmovdqa %xmm2,160-256(%rcx)
	vmovdqa %xmm3,176-256(%rcx)

	vpshufd $0x00,%xmm15,%xmm12
	vpshufd $0x55,%xmm15,%xmm13
	vmovdqa %xmm12,192-256(%rcx)
	vpshufd $0xaa,%xmm15,%xmm14
	vmovdqa %xmm13,208-256(%rcx)
	vpshufd $0xff,%xmm15,%xmm15
	vmovdqa %xmm14,224-256(%rcx)
	vmovdqa %xmm15,240-256(%rcx)

	vpshufd $0x00,%xmm7,%xmm4
	vpshufd $0x55,%xmm7,%xmm5
	vpaddd .Linc(%rip),%xmm4,%xmm4
	vpshufd $0xaa,%xmm7,%xmm6
	vmovdqa %xmm5,272-256(%rcx)
	vpshufd $0xff,%xmm7,%xmm7
	vmovdqa %xmm6,288-256(%rcx)
	vmovdqa %xmm7,304-256(%rcx)

	jmp .Loop_enter4xop

.align 32
.Loop_outer4xop:
	vmovdqa 64(%rsp),%xmm8
	vmovdqa 80(%rsp),%xmm9
	vmovdqa 96(%rsp),%xmm10
	vmovdqa 112(%rsp),%xmm11
	vmovdqa 128-256(%rcx),%xmm0
	vmovdqa 144-256(%rcx),%xmm1
	vmovdqa 160-256(%rcx),%xmm2
	vmovdqa 176-256(%rcx),%xmm3
	vmovdqa 192-256(%rcx),%xmm12
	vmovdqa 208-256(%rcx),%xmm13
	vmovdqa 224-256(%rcx),%xmm14
	vmovdqa 240-256(%rcx),%xmm15
	vmovdqa 256-256(%rcx),%xmm4
	vmovdqa 272-256(%rcx),%xmm5
	vmovdqa 288-256(%rcx),%xmm6
	vmovdqa 304-256(%rcx),%xmm7
	vpaddd .Lfour(%rip),%xmm4,%xmm4

.Loop_enter4xop:
	movl $10,%eax
	vmovdqa %xmm4,256-256(%rcx)
	jmp .Loop4xop

.align 32
.Loop4xop:
	vpaddd %xmm0,%xmm8,%xmm8
	vpaddd %xmm1,%xmm9,%xmm9
	vpaddd %xmm2,%xmm10,%xmm10
	vpaddd %xmm3,%xmm11,%xmm11
	vpxor %xmm4,%xmm8,%xmm4
	vpxor %xmm5,%xmm9,%xmm5
	vpxor %xmm6,%xmm10,%xmm6
	vpxor %xmm7,%xmm11,%xmm7
.byte 143,232,120,194,228,16
.byte 143,232,120,194,237,16
.byte 143,232,120,194,246,16
.byte 143,232,120,194,255,16
	vpaddd %xmm4,%xmm12,%xmm12
	vpaddd %xmm5,%xmm13,%xmm13
	vpaddd %xmm6,%xmm14,%xmm14
	vpaddd %xmm7,%xmm15,%xmm15
	vpxor %xmm0,%xmm12,%xmm0
	vpxor %xmm1,%xmm13,%xmm1
	vpxor %xmm14,%xmm2,%xmm2
	vpxor %xmm15,%xmm3,%xmm3
.byte 143,232,120,194,192,12
.byte 143,232,120,194,201,12
.byte 143,232,120,194,210,12
.byte 143,232,120,194,219,12
	vpaddd %xmm8,%xmm0,%xmm8
	vpaddd %xmm9,%xmm1,%xmm9
	vpaddd %xmm2,%xmm10,%xmm10
	vpaddd %xmm3,%xmm11,%xmm11
	vpxor %xmm4,%xmm8,%xmm4
	vpxor %xmm5,%xmm9,%xmm5
	vpxor %xmm6,%xmm10,%xmm6
	vpxor %xmm7,%xmm11,%xmm7
.byte 143,232,120,194,228,8
.byte 143,232,120,194,237,8
.byte 143,232,120,194,246,8
.byte 143,232,120,194,255,8
	vpaddd %xmm4,%xmm12,%xmm12
	vpaddd %xmm5,%xmm13,%xmm13
	vpaddd %xmm6,%xmm14,%xmm14
	vpaddd %xmm7,%xmm15,%xmm15
	vpxor %xmm0,%xmm12,%xmm0
	vpxor %xmm1,%xmm13,%xmm1
	vpxor %xmm14,%xmm2,%xmm2
	vpxor %xmm15,%xmm3,%xmm3
.byte 143,232,120,194,192,7
.byte 143,232,120,194,201,7
.byte 143,232,120,194,210,7
.byte 143,232,120,194,219,7
	vpaddd %xmm1,%xmm8,%xmm8
	vpaddd %xmm2,%xmm9,%xmm9
	vpaddd %xmm3,%xmm10,%xmm10
	vpaddd %xmm0,%xmm11,%xmm11
	vpxor %xmm7,%xmm8,%xmm7
	vpxor %xmm4,%xmm9,%xmm4
	vpxor %xmm5,%xmm10,%xmm5
	vpxor %xmm6,%xmm11,%xmm6
.byte 143,232,120,194,255,16
.byte 143,232,120,194,228,16
.byte 143,232,120,194,237,16
.byte 143,232,120,194,246,16
	vpaddd %xmm7,%xmm14,%xmm14
	vpaddd %xmm4,%xmm15,%xmm15
	vpaddd %xmm5,%xmm12,%xmm12
	vpaddd %xmm6,%xmm13,%xmm13
	vpxor %xmm1,%xmm14,%xmm1
	vpxor %xmm2,%xmm15,%xmm2
	vpxor %xmm12,%xmm3,%xmm3
	vpxor %xmm13,%xmm0,%xmm0
.byte 143,232,120,194,201,12
.byte 143,232,120,194,210,12
.byte 143,232,120,194,219,12
.byte 143,232,120,194,192,12
	vpaddd %xmm8,%xmm1,%xmm8
	vpaddd %xmm9,%xmm2,%xmm9
	vpaddd %xmm3,%xmm10,%xmm10
	vpaddd %xmm0,%xmm11,%xmm11
	vpxor %xmm7,%xmm8,%xmm7
	vpxor %xmm4,%xmm9,%xmm4
	vpxor %xmm5,%xmm10,%xmm5
	vpxor %xmm6,%xmm11,%xmm6
.byte 143,232,120,194,255,8
.byte 143,232,120,194,228,8
.byte 143,232,120,194,237,8
.byte 143,232,120,194,246,8
	vpaddd %xmm7,%xmm14,%xmm14
	vpaddd %xmm4,%xmm15,%xmm15
	vpaddd %xmm5,%xmm12,%xmm12
	vpaddd %xmm6,%xmm13,%xmm13
	vpxor %xmm1,%xmm14,%xmm1
	vpxor %xmm2,%xmm15,%xmm2
	vpxor %xmm12,%xmm3,%xmm3
	vpxor %xmm13,%xmm0,%xmm0
.byte 143,232,120,194,201,7
.byte 143,232,120,194,210,7
.byte 143,232,120,194,219,7
.byte 143,232,120,194,192,7
	decl %eax
	jnz .Loop4xop

	vpaddd 64(%rsp),%xmm8,%xmm8
	vpaddd 80(%rsp),%xmm9,%xmm9
	vpaddd 96(%rsp),%xmm10,%xmm10
	vpaddd 112(%rsp),%xmm11,%xmm11

	vmovdqa %xmm14,32(%rsp)
	vmovdqa %xmm15,48(%rsp)

	vpunpckldq %xmm9,%xmm8,%xmm14
	vpunpckldq %xmm11,%xmm10,%xmm15
	vpunpckhdq %xmm9,%xmm8,%xmm8
	vpunpckhdq %xmm11,%xmm10,%xmm10
	vpunpcklqdq %xmm15,%xmm14,%xmm9
	vpunpckhqdq %xmm15,%xmm14,%xmm14
	vpunpcklqdq %xmm10,%xmm8,%xmm11
	vpunpckhqdq %xmm10,%xmm8,%xmm8
	vpaddd 128-256(%rcx),%xmm0,%xmm0
	vpaddd 144-256(%rcx),%xmm1,%xmm1
	vpaddd 160-256(%rcx),%xmm2,%xmm2
	vpaddd 176-256(%rcx),%xmm3,%xmm3

	vmovdqa %xmm9,0(%rsp)
	vmovdqa %xmm14,16(%rsp)
	vmovdqa 32(%rsp),%xmm9
	vmovdqa 48(%rsp),%xmm14
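/* Transpose the remaining word groups from lane order to per-block */
/* order, in the same fashion as the first group above.             */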

	vpunpckldq %xmm1,%xmm0,%xmm10
	vpunpckldq %xmm3,%xmm2,%xmm15
	vpunpckhdq %xmm1,%xmm0,%xmm0
	vpunpckhdq %xmm3,%xmm2,%xmm2
	vpunpcklqdq %xmm15,%xmm10,%xmm1
	vpunpckhqdq %xmm15,%xmm10,%xmm10
	vpunpcklqdq %xmm2,%xmm0,%xmm3
	vpunpckhqdq %xmm2,%xmm0,%xmm0
	vpaddd 192-256(%rcx),%xmm12,%xmm12
	vpaddd 208-256(%rcx),%xmm13,%xmm13
	vpaddd 224-256(%rcx),%xmm9,%xmm9
	vpaddd 240-256(%rcx),%xmm14,%xmm14

	vpunpckldq %xmm13,%xmm12,%xmm2
	vpunpckldq %xmm14,%xmm9,%xmm15
	vpunpckhdq %xmm13,%xmm12,%xmm12
	vpunpckhdq %xmm14,%xmm9,%xmm9
	vpunpcklqdq %xmm15,%xmm2,%xmm13
	vpunpckhqdq %xmm15,%xmm2,%xmm2
	vpunpcklqdq %xmm9,%xmm12,%xmm14
	vpunpckhqdq %xmm9,%xmm12,%xmm12
	vpaddd 256-256(%rcx),%xmm4,%xmm4
	vpaddd 272-256(%rcx),%xmm5,%xmm5
	vpaddd 288-256(%rcx),%xmm6,%xmm6
	vpaddd 304-256(%rcx),%xmm7,%xmm7

	vpunpckldq %xmm5,%xmm4,%xmm9
	vpunpckldq %xmm7,%xmm6,%xmm15
	vpunpckhdq %xmm5,%xmm4,%xmm4
	vpunpckhdq %xmm7,%xmm6,%xmm6
	vpunpcklqdq %xmm15,%xmm9,%xmm5
	vpunpckhqdq %xmm15,%xmm9,%xmm9
	vpunpcklqdq %xmm6,%xmm4,%xmm7
	vpunpckhqdq %xmm6,%xmm4,%xmm4
	vmovdqa 0(%rsp),%xmm6
	vmovdqa 16(%rsp),%xmm15

	cmpq $256,%rdx
	jb .Ltail4xop

	vpxor 0(%rsi),%xmm6,%xmm6
	vpxor 16(%rsi),%xmm1,%xmm1
	vpxor 32(%rsi),%xmm13,%xmm13
	vpxor 48(%rsi),%xmm5,%xmm5
	vpxor 64(%rsi),%xmm15,%xmm15
	vpxor 80(%rsi),%xmm10,%xmm10
	vpxor 96(%rsi),%xmm2,%xmm2
	vpxor 112(%rsi),%xmm9,%xmm9
	leaq 128(%rsi),%rsi
	vpxor 0(%rsi),%xmm11,%xmm11
	vpxor 16(%rsi),%xmm3,%xmm3
	vpxor 32(%rsi),%xmm14,%xmm14
	vpxor 48(%rsi),%xmm7,%xmm7
	vpxor 64(%rsi),%xmm8,%xmm8
	vpxor 80(%rsi),%xmm0,%xmm0
	vpxor 96(%rsi),%xmm12,%xmm12
	vpxor 112(%rsi),%xmm4,%xmm4
	leaq 128(%rsi),%rsi

	vmovdqu %xmm6,0(%rdi)
	vmovdqu %xmm1,16(%rdi)
	vmovdqu %xmm13,32(%rdi)
	vmovdqu %xmm5,48(%rdi)
	vmovdqu %xmm15,64(%rdi)
	vmovdqu %xmm10,80(%rdi)
	vmovdqu %xmm2,96(%rdi)
	vmovdqu %xmm9,112(%rdi)
	leaq 128(%rdi),%rdi
	vmovdqu %xmm11,0(%rdi)
	vmovdqu %xmm3,16(%rdi)
	vmovdqu %xmm14,32(%rdi)
	vmovdqu %xmm7,48(%rdi)
	vmovdqu %xmm8,64(%rdi)
	vmovdqu %xmm0,80(%rdi)
	vmovdqu %xmm12,96(%rdi)
	vmovdqu %xmm4,112(%rdi)
	leaq 128(%rdi),%rdi

	subq $256,%rdx
	jnz .Loop_outer4xop

	jmp .Ldone4xop

.align 32
.Ltail4xop:
	cmpq $192,%rdx
	jae .L192_or_more4xop
	cmpq $128,%rdx
	jae .L128_or_more4xop
	cmpq $64,%rdx
	jae .L64_or_more4xop

	xorq %r10,%r10
	vmovdqa %xmm6,0(%rsp)
	vmovdqa %xmm1,16(%rsp)
	vmovdqa %xmm13,32(%rsp)
	vmovdqa %xmm5,48(%rsp)
	jmp .Loop_tail4xop

.align 32
.L64_or_more4xop:
	vpxor 0(%rsi),%xmm6,%xmm6
	vpxor 16(%rsi),%xmm1,%xmm1
	vpxor 32(%rsi),%xmm13,%xmm13
	vpxor 48(%rsi),%xmm5,%xmm5
	vmovdqu %xmm6,0(%rdi)
	vmovdqu %xmm1,16(%rdi)
	vmovdqu %xmm13,32(%rdi)
	vmovdqu %xmm5,48(%rdi)
	je .Ldone4xop

	leaq 64(%rsi),%rsi
	vmovdqa %xmm15,0(%rsp)
	xorq %r10,%r10
	vmovdqa %xmm10,16(%rsp)
	leaq 64(%rdi),%rdi
	vmovdqa %xmm2,32(%rsp)
	subq $64,%rdx
	vmovdqa %xmm9,48(%rsp)
	jmp .Loop_tail4xop

.align 32
.L128_or_more4xop:
	vpxor 0(%rsi),%xmm6,%xmm6
	vpxor 16(%rsi),%xmm1,%xmm1
	vpxor 32(%rsi),%xmm13,%xmm13
	vpxor 48(%rsi),%xmm5,%xmm5
	vpxor 64(%rsi),%xmm15,%xmm15
	vpxor 80(%rsi),%xmm10,%xmm10
	vpxor 96(%rsi),%xmm2,%xmm2
	vpxor 112(%rsi),%xmm9,%xmm9

	vmovdqu %xmm6,0(%rdi)
	vmovdqu %xmm1,16(%rdi)
	vmovdqu %xmm13,32(%rdi)
	vmovdqu %xmm5,48(%rdi)
	vmovdqu %xmm15,64(%rdi)
	vmovdqu %xmm10,80(%rdi)
	vmovdqu %xmm2,96(%rdi)
	vmovdqu %xmm9,112(%rdi)
	je .Ldone4xop

	leaq 128(%rsi),%rsi
	vmovdqa %xmm11,0(%rsp)
	xorq %r10,%r10
	vmovdqa %xmm3,16(%rsp)
	leaq 128(%rdi),%rdi
	vmovdqa %xmm14,32(%rsp)
	subq $128,%rdx
	vmovdqa %xmm7,48(%rsp)
	jmp .Loop_tail4xop

.align 32
.L192_or_more4xop:
	vpxor 0(%rsi),%xmm6,%xmm6
	vpxor 16(%rsi),%xmm1,%xmm1
	vpxor 32(%rsi),%xmm13,%xmm13
	vpxor 48(%rsi),%xmm5,%xmm5
	vpxor 64(%rsi),%xmm15,%xmm15
	vpxor 80(%rsi),%xmm10,%xmm10
	vpxor 96(%rsi),%xmm2,%xmm2
	vpxor 112(%rsi),%xmm9,%xmm9
	leaq 128(%rsi),%rsi
	vpxor 0(%rsi),%xmm11,%xmm11
	vpxor 16(%rsi),%xmm3,%xmm3
	vpxor 32(%rsi),%xmm14,%xmm14
	vpxor 48(%rsi),%xmm7,%xmm7

	vmovdqu %xmm6,0(%rdi)
	vmovdqu %xmm1,16(%rdi)
	vmovdqu %xmm13,32(%rdi)
	vmovdqu %xmm5,48(%rdi)
	vmovdqu %xmm15,64(%rdi)
	vmovdqu %xmm10,80(%rdi)
	vmovdqu %xmm2,96(%rdi)
	vmovdqu %xmm9,112(%rdi)
	leaq 128(%rdi),%rdi
	vmovdqu %xmm11,0(%rdi)
	vmovdqu %xmm3,16(%rdi)
	vmovdqu %xmm14,32(%rdi)
	vmovdqu %xmm7,48(%rdi)
	je .Ldone4xop

	leaq 64(%rsi),%rsi
	vmovdqa %xmm8,0(%rsp)
	xorq %r10,%r10
	vmovdqa %xmm0,16(%rsp)
	leaq 64(%rdi),%rdi
	vmovdqa %xmm12,32(%rsp)
	subq $192,%rdx
	vmovdqa %xmm4,48(%rsp)

.Loop_tail4xop:
	movzbl (%rsi,%r10,1),%eax
	movzbl (%rsp,%r10,1),%ecx
	leaq 1(%r10),%r10
	xorl %ecx,%eax
	movb %al,-1(%rdi,%r10,1)
	decq %rdx
	jnz .Loop_tail4xop

.Ldone4xop:
	vzeroupper
	leaq (%r9),%rsp
.cfi_def_cfa_register %rsp
.L4xop_epilogue:
	.byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_4xop,.-ChaCha20_4xop

/* 8x AVX2 code path: eight blocks in parallel, one state word per */
/* ymm register, two 128-bit lanes per register.                   */
.type ChaCha20_8x,@function
.align 32
ChaCha20_8x:
.cfi_startproc
.LChaCha20_8x:
	movq %rsp,%r9
.cfi_def_cfa_register %r9
	subq $0x280+8,%rsp
	andq $-32,%rsp
	vzeroupper

	vbroadcasti128 .Lsigma(%rip),%ymm11
	vbroadcasti128 (%rcx),%ymm3
	vbroadcasti128 16(%rcx),%ymm15
	vbroadcasti128 (%r8),%ymm7
	leaq 256(%rsp),%rcx
	leaq 512(%rsp),%rax
	leaq .Lrot16(%rip),%r10
	leaq .Lrot24(%rip),%r11

	vpshufd $0x00,%ymm11,%ymm8
	vpshufd $0x55,%ymm11,%ymm9
	vmovdqa %ymm8,128-256(%rcx)
	vpshufd $0xaa,%ymm11,%ymm10
	vmovdqa %ymm9,160-256(%rcx)
	vpshufd $0xff,%ymm11,%ymm11
	vmovdqa %ymm10,192-256(%rcx)
	vmovdqa %ymm11,224-256(%rcx)

	vpshufd $0x00,%ymm3,%ymm0
	vpshufd $0x55,%ymm3,%ymm1
	vmovdqa %ymm0,256-256(%rcx)
	vpshufd $0xaa,%ymm3,%ymm2
	vmovdqa %ymm1,288-256(%rcx)
	vpshufd $0xff,%ymm3,%ymm3
	vmovdqa %ymm2,320-256(%rcx)
	vmovdqa %ymm3,352-256(%rcx)

	vpshufd $0x00,%ymm15,%ymm12
	vpshufd $0x55,%ymm15,%ymm13
	vmovdqa %ymm12,384-512(%rax)
	vpshufd $0xaa,%ymm15,%ymm14
	vmovdqa %ymm13,416-512(%rax)
	vpshufd $0xff,%ymm15,%ymm15
	vmovdqa %ymm14,448-512(%rax)
	vmovdqa %ymm15,480-512(%rax)

	vpshufd $0x00,%ymm7,%ymm4
	vpshufd $0x55,%ymm7,%ymm5
	vpaddd .Lincy(%rip),%ymm4,%ymm4
	vpshufd $0xaa,%ymm7,%ymm6
	vmovdqa %ymm5,544-512(%rax)
	vpshufd $0xff,%ymm7,%ymm7
	vmovdqa %ymm6,576-512(%rax)
	vmovdqa %ymm7,608-512(%rax)
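/* The counter lanes were staggered via .Lincy; the fully splatted state */
/* is parked on the stack (addressed off %rcx and %rax) and reloaded at  */
/* the top of every outer iteration.                                     */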

	jmp .Loop_enter8x

.align 32
.Loop_outer8x:
	vmovdqa 128-256(%rcx),%ymm8
	vmovdqa 160-256(%rcx),%ymm9
	vmovdqa 192-256(%rcx),%ymm10
	vmovdqa 224-256(%rcx),%ymm11
	vmovdqa 256-256(%rcx),%ymm0
	vmovdqa 288-256(%rcx),%ymm1
	vmovdqa 320-256(%rcx),%ymm2
	vmovdqa 352-256(%rcx),%ymm3
	vmovdqa 384-512(%rax),%ymm12
	vmovdqa 416-512(%rax),%ymm13
	vmovdqa 448-512(%rax),%ymm14
	vmovdqa 480-512(%rax),%ymm15
	vmovdqa 512-512(%rax),%ymm4
	vmovdqa 544-512(%rax),%ymm5
	vmovdqa 576-512(%rax),%ymm6
	vmovdqa 608-512(%rax),%ymm7
	vpaddd .Leight(%rip),%ymm4,%ymm4

.Loop_enter8x:
	vmovdqa %ymm14,64(%rsp)
	vmovdqa %ymm15,96(%rsp)
	vbroadcasti128 (%r10),%ymm15
	vmovdqa %ymm4,512-512(%rax)
	movl $10,%eax
	jmp .Loop8x

/* AVX2 double round; the rot16/rot24 masks are re-broadcast from */
/* (%r10)/(%r11) because all sixteen ymm registers are in use.    */
.align 32
.Loop8x:
	vpaddd %ymm0,%ymm8,%ymm8
	vpxor %ymm4,%ymm8,%ymm4
	vpshufb %ymm15,%ymm4,%ymm4
	vpaddd %ymm1,%ymm9,%ymm9
	vpxor %ymm5,%ymm9,%ymm5
	vpshufb %ymm15,%ymm5,%ymm5
	vpaddd %ymm4,%ymm12,%ymm12
	vpxor %ymm0,%ymm12,%ymm0
	vpslld $12,%ymm0,%ymm14
	vpsrld $20,%ymm0,%ymm0
	vpor %ymm0,%ymm14,%ymm0
	vbroadcasti128 (%r11),%ymm14
	vpaddd %ymm5,%ymm13,%ymm13
	vpxor %ymm1,%ymm13,%ymm1
	vpslld $12,%ymm1,%ymm15
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm1,%ymm15,%ymm1
	vpaddd %ymm0,%ymm8,%ymm8
	vpxor %ymm4,%ymm8,%ymm4
	vpshufb %ymm14,%ymm4,%ymm4
	vpaddd %ymm1,%ymm9,%ymm9
	vpxor %ymm5,%ymm9,%ymm5
	vpshufb %ymm14,%ymm5,%ymm5
	vpaddd %ymm4,%ymm12,%ymm12
	vpxor %ymm0,%ymm12,%ymm0
	vpslld $7,%ymm0,%ymm15
	vpsrld $25,%ymm0,%ymm0
	vpor %ymm0,%ymm15,%ymm0
	vbroadcasti128 (%r10),%ymm15
	vpaddd %ymm5,%ymm13,%ymm13
	vpxor %ymm1,%ymm13,%ymm1
	vpslld $7,%ymm1,%ymm14
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm1,%ymm14,%ymm1
	vmovdqa %ymm12,0(%rsp)
	vmovdqa %ymm13,32(%rsp)
	vmovdqa 64(%rsp),%ymm12
	vmovdqa 96(%rsp),%ymm13
	vpaddd %ymm2,%ymm10,%ymm10
	vpxor %ymm6,%ymm10,%ymm6
	vpshufb %ymm15,%ymm6,%ymm6
	vpaddd %ymm3,%ymm11,%ymm11
	vpxor %ymm7,%ymm11,%ymm7
	vpshufb %ymm15,%ymm7,%ymm7
	vpaddd %ymm6,%ymm12,%ymm12
	vpxor %ymm2,%ymm12,%ymm2
	vpslld $12,%ymm2,%ymm14
	vpsrld $20,%ymm2,%ymm2
	vpor %ymm2,%ymm14,%ymm2
	vbroadcasti128 (%r11),%ymm14
	vpaddd %ymm7,%ymm13,%ymm13
	vpxor %ymm3,%ymm13,%ymm3
	vpslld $12,%ymm3,%ymm15
	vpsrld $20,%ymm3,%ymm3
	vpor %ymm3,%ymm15,%ymm3
	vpaddd %ymm2,%ymm10,%ymm10
	vpxor %ymm6,%ymm10,%ymm6
	vpshufb %ymm14,%ymm6,%ymm6
	vpaddd %ymm3,%ymm11,%ymm11
	vpxor %ymm7,%ymm11,%ymm7
	vpshufb %ymm14,%ymm7,%ymm7
	vpaddd %ymm6,%ymm12,%ymm12
	vpxor %ymm2,%ymm12,%ymm2
	vpslld $7,%ymm2,%ymm15
	vpsrld $25,%ymm2,%ymm2
	vpor %ymm2,%ymm15,%ymm2
	vbroadcasti128 (%r10),%ymm15
	vpaddd %ymm7,%ymm13,%ymm13
	vpxor %ymm3,%ymm13,%ymm3
	vpslld $7,%ymm3,%ymm14
	vpsrld $25,%ymm3,%ymm3
	vpor %ymm3,%ymm14,%ymm3
	vpaddd %ymm1,%ymm8,%ymm8
	vpxor %ymm7,%ymm8,%ymm7
	vpshufb %ymm15,%ymm7,%ymm7
	vpaddd %ymm2,%ymm9,%ymm9
	vpxor %ymm4,%ymm9,%ymm4
	vpshufb %ymm15,%ymm4,%ymm4
	vpaddd %ymm7,%ymm12,%ymm12
	vpxor %ymm1,%ymm12,%ymm1
	vpslld $12,%ymm1,%ymm14
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm1,%ymm14,%ymm1
	vbroadcasti128 (%r11),%ymm14
	vpaddd %ymm4,%ymm13,%ymm13
	vpxor %ymm2,%ymm13,%ymm2
	vpslld $12,%ymm2,%ymm15
	vpsrld $20,%ymm2,%ymm2
	vpor %ymm2,%ymm15,%ymm2
	vpaddd %ymm1,%ymm8,%ymm8
	vpxor %ymm7,%ymm8,%ymm7
	vpshufb %ymm14,%ymm7,%ymm7
	vpaddd %ymm2,%ymm9,%ymm9
	vpxor %ymm4,%ymm9,%ymm4
	vpshufb %ymm14,%ymm4,%ymm4
	vpaddd %ymm7,%ymm12,%ymm12
	vpxor %ymm1,%ymm12,%ymm1
	vpslld $7,%ymm1,%ymm15
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm1,%ymm15,%ymm1
	vbroadcasti128 (%r10),%ymm15
	vpaddd %ymm4,%ymm13,%ymm13
	vpxor %ymm2,%ymm13,%ymm2
	vpslld $7,%ymm2,%ymm14
	vpsrld $25,%ymm2,%ymm2
	vpor %ymm2,%ymm14,%ymm2
	vmovdqa %ymm12,64(%rsp)
	vmovdqa %ymm13,96(%rsp)
	vmovdqa 0(%rsp),%ymm12
	vmovdqa 32(%rsp),%ymm13
	vpaddd %ymm3,%ymm10,%ymm10
	vpxor %ymm5,%ymm10,%ymm5
	vpshufb %ymm15,%ymm5,%ymm5
	vpaddd %ymm0,%ymm11,%ymm11
	vpxor %ymm6,%ymm11,%ymm6
	vpshufb %ymm15,%ymm6,%ymm6
	vpaddd %ymm5,%ymm12,%ymm12
	vpxor %ymm3,%ymm12,%ymm3
	vpslld $12,%ymm3,%ymm14
	vpsrld $20,%ymm3,%ymm3
	vpor %ymm3,%ymm14,%ymm3
	vbroadcasti128 (%r11),%ymm14
	vpaddd %ymm6,%ymm13,%ymm13
	vpxor %ymm0,%ymm13,%ymm0
	vpslld $12,%ymm0,%ymm15
	vpsrld $20,%ymm0,%ymm0
	vpor %ymm0,%ymm15,%ymm0
	vpaddd %ymm3,%ymm10,%ymm10
	vpxor %ymm5,%ymm10,%ymm5
	vpshufb %ymm14,%ymm5,%ymm5
	vpaddd %ymm0,%ymm11,%ymm11
	vpxor %ymm6,%ymm11,%ymm6
	vpshufb %ymm14,%ymm6,%ymm6
	vpaddd %ymm5,%ymm12,%ymm12
	vpxor %ymm3,%ymm12,%ymm3
	vpslld $7,%ymm3,%ymm15
	vpsrld $25,%ymm3,%ymm3
	vpor %ymm3,%ymm15,%ymm3
	vbroadcasti128 (%r10),%ymm15
	vpaddd %ymm6,%ymm13,%ymm13
	vpxor %ymm0,%ymm13,%ymm0
	vpslld $7,%ymm0,%ymm14
	vpsrld $25,%ymm0,%ymm0
	vpor %ymm0,%ymm14,%ymm0
	decl %eax
	jnz .Loop8x

/* Add back the input state and transpose: punpck* interleaves the */
/* dwords, vperm2i128 merges the 128-bit lanes into block order.   */
	leaq 512(%rsp),%rax
	vpaddd 128-256(%rcx),%ymm8,%ymm8
	vpaddd 160-256(%rcx),%ymm9,%ymm9
	vpaddd 192-256(%rcx),%ymm10,%ymm10
	vpaddd 224-256(%rcx),%ymm11,%ymm11

	vpunpckldq %ymm9,%ymm8,%ymm14
	vpunpckldq %ymm11,%ymm10,%ymm15
	vpunpckhdq %ymm9,%ymm8,%ymm8
	vpunpckhdq %ymm11,%ymm10,%ymm10
	vpunpcklqdq %ymm15,%ymm14,%ymm9
	vpunpckhqdq %ymm15,%ymm14,%ymm14
	vpunpcklqdq %ymm10,%ymm8,%ymm11
	vpunpckhqdq %ymm10,%ymm8,%ymm8
	vpaddd 256-256(%rcx),%ymm0,%ymm0
	vpaddd 288-256(%rcx),%ymm1,%ymm1
	vpaddd 320-256(%rcx),%ymm2,%ymm2
	vpaddd 352-256(%rcx),%ymm3,%ymm3

	vpunpckldq %ymm1,%ymm0,%ymm10
	vpunpckldq %ymm3,%ymm2,%ymm15
	vpunpckhdq %ymm1,%ymm0,%ymm0
	vpunpckhdq %ymm3,%ymm2,%ymm2
	vpunpcklqdq %ymm15,%ymm10,%ymm1
	vpunpckhqdq %ymm15,%ymm10,%ymm10
	vpunpcklqdq %ymm2,%ymm0,%ymm3
	vpunpckhqdq %ymm2,%ymm0,%ymm0
	vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
	vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
	vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
	vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
	vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
	vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
	vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
	vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
	vmovdqa %ymm15,0(%rsp)
	vmovdqa %ymm9,32(%rsp)
	vmovdqa 64(%rsp),%ymm15
	vmovdqa 96(%rsp),%ymm9

	vpaddd 384-512(%rax),%ymm12,%ymm12
	vpaddd 416-512(%rax),%ymm13,%ymm13
	vpaddd 448-512(%rax),%ymm15,%ymm15
	vpaddd 480-512(%rax),%ymm9,%ymm9

	vpunpckldq %ymm13,%ymm12,%ymm2
	vpunpckldq %ymm9,%ymm15,%ymm8
	vpunpckhdq %ymm13,%ymm12,%ymm12
	vpunpckhdq %ymm9,%ymm15,%ymm15
	vpunpcklqdq %ymm8,%ymm2,%ymm13
	vpunpckhqdq %ymm8,%ymm2,%ymm2
	vpunpcklqdq %ymm15,%ymm12,%ymm9
	vpunpckhqdq %ymm15,%ymm12,%ymm12
	vpaddd 512-512(%rax),%ymm4,%ymm4
	vpaddd 544-512(%rax),%ymm5,%ymm5
	vpaddd 576-512(%rax),%ymm6,%ymm6
	vpaddd 608-512(%rax),%ymm7,%ymm7

	vpunpckldq %ymm5,%ymm4,%ymm15
	vpunpckldq %ymm7,%ymm6,%ymm8
	vpunpckhdq %ymm5,%ymm4,%ymm4
	vpunpckhdq %ymm7,%ymm6,%ymm6
	vpunpcklqdq %ymm8,%ymm15,%ymm5
	vpunpckhqdq %ymm8,%ymm15,%ymm15
	vpunpcklqdq %ymm6,%ymm4,%ymm7
	vpunpckhqdq %ymm6,%ymm4,%ymm4
	vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
	vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
	vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
	vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
	vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
	vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
	vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
	vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
	vmovdqa 0(%rsp),%ymm6
	vmovdqa 32(%rsp),%ymm12

	cmpq $512,%rdx
	jb .Ltail8x

	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	leaq 128(%rsi),%rsi
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	leaq 128(%rdi),%rdi

	vpxor 0(%rsi),%ymm12,%ymm12
	vpxor 32(%rsi),%ymm13,%ymm13
	vpxor 64(%rsi),%ymm10,%ymm10
	vpxor 96(%rsi),%ymm15,%ymm15
	leaq 128(%rsi),%rsi
	vmovdqu %ymm12,0(%rdi)
	vmovdqu %ymm13,32(%rdi)
	vmovdqu %ymm10,64(%rdi)
	vmovdqu %ymm15,96(%rdi)
	leaq 128(%rdi),%rdi

	vpxor 0(%rsi),%ymm14,%ymm14
	vpxor 32(%rsi),%ymm2,%ymm2
	vpxor 64(%rsi),%ymm3,%ymm3
	vpxor 96(%rsi),%ymm7,%ymm7
	leaq 128(%rsi),%rsi
	vmovdqu %ymm14,0(%rdi)
	vmovdqu %ymm2,32(%rdi)
	vmovdqu %ymm3,64(%rdi)
	vmovdqu %ymm7,96(%rdi)
	leaq 128(%rdi),%rdi

	vpxor 0(%rsi),%ymm11,%ymm11
	vpxor 32(%rsi),%ymm9,%ymm9
	vpxor 64(%rsi),%ymm0,%ymm0
	vpxor 96(%rsi),%ymm4,%ymm4
	leaq 128(%rsi),%rsi
	vmovdqu %ymm11,0(%rdi)
	vmovdqu %ymm9,32(%rdi)
	vmovdqu %ymm0,64(%rdi)
	vmovdqu %ymm4,96(%rdi)
	leaq 128(%rdi),%rdi

	subq $512,%rdx
	jnz .Loop_outer8x

	jmp .Ldone8x

/* 1..511 bytes remain. */
.Ltail8x:
	cmpq $448,%rdx
	jae .L448_or_more8x
	cmpq $384,%rdx
	jae .L384_or_more8x
	cmpq $320,%rdx
	jae .L320_or_more8x
	cmpq $256,%rdx
	jae .L256_or_more8x
	cmpq $192,%rdx
	jae .L192_or_more8x
	cmpq $128,%rdx
	jae .L128_or_more8x
	cmpq $64,%rdx
	jae .L64_or_more8x

	xorq %r10,%r10
	vmovdqa %ymm6,0(%rsp)
	vmovdqa %ymm8,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L64_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	je .Ldone8x

	leaq 64(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm1,0(%rsp)
	leaq 64(%rdi),%rdi
	subq $64,%rdx
	vmovdqa %ymm5,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L128_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	je .Ldone8x

	leaq 128(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm12,0(%rsp)
	leaq 128(%rdi),%rdi
	subq $128,%rdx
	vmovdqa %ymm13,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L192_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vpxor 128(%rsi),%ymm12,%ymm12
	vpxor 160(%rsi),%ymm13,%ymm13
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	vmovdqu %ymm12,128(%rdi)
	vmovdqu %ymm13,160(%rdi)
	je .Ldone8x

	leaq 192(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm10,0(%rsp)
	leaq 192(%rdi),%rdi
	subq $192,%rdx
	vmovdqa %ymm15,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L256_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vpxor 128(%rsi),%ymm12,%ymm12
	vpxor 160(%rsi),%ymm13,%ymm13
	vpxor 192(%rsi),%ymm10,%ymm10
	vpxor 224(%rsi),%ymm15,%ymm15
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	vmovdqu %ymm12,128(%rdi)
	vmovdqu %ymm13,160(%rdi)
	vmovdqu %ymm10,192(%rdi)
	vmovdqu %ymm15,224(%rdi)
	je .Ldone8x

	leaq 256(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm14,0(%rsp)
	leaq 256(%rdi),%rdi
	subq $256,%rdx
	vmovdqa %ymm2,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L320_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vpxor 128(%rsi),%ymm12,%ymm12
	vpxor 160(%rsi),%ymm13,%ymm13
	vpxor 192(%rsi),%ymm10,%ymm10
	vpxor 224(%rsi),%ymm15,%ymm15
	vpxor 256(%rsi),%ymm14,%ymm14
	vpxor 288(%rsi),%ymm2,%ymm2
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	vmovdqu %ymm12,128(%rdi)
	vmovdqu %ymm13,160(%rdi)
	vmovdqu %ymm10,192(%rdi)
	vmovdqu %ymm15,224(%rdi)
	vmovdqu %ymm14,256(%rdi)
	vmovdqu %ymm2,288(%rdi)
	je .Ldone8x

	leaq 320(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm3,0(%rsp)
	leaq 320(%rdi),%rdi
	subq $320,%rdx
	vmovdqa %ymm7,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L384_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vpxor 128(%rsi),%ymm12,%ymm12
	vpxor 160(%rsi),%ymm13,%ymm13
	vpxor 192(%rsi),%ymm10,%ymm10
	vpxor 224(%rsi),%ymm15,%ymm15
	vpxor 256(%rsi),%ymm14,%ymm14
	vpxor 288(%rsi),%ymm2,%ymm2
	vpxor 320(%rsi),%ymm3,%ymm3
	vpxor 352(%rsi),%ymm7,%ymm7
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	vmovdqu %ymm12,128(%rdi)
	vmovdqu %ymm13,160(%rdi)
	vmovdqu %ymm10,192(%rdi)
	vmovdqu %ymm15,224(%rdi)
	vmovdqu %ymm14,256(%rdi)
	vmovdqu %ymm2,288(%rdi)
	vmovdqu %ymm3,320(%rdi)
	vmovdqu %ymm7,352(%rdi)
	je .Ldone8x

	leaq 384(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm11,0(%rsp)
	leaq 384(%rdi),%rdi
	subq $384,%rdx
	vmovdqa %ymm9,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L448_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vpxor 128(%rsi),%ymm12,%ymm12
	vpxor 160(%rsi),%ymm13,%ymm13
	vpxor 192(%rsi),%ymm10,%ymm10
	vpxor 224(%rsi),%ymm15,%ymm15
	vpxor 256(%rsi),%ymm14,%ymm14
	vpxor 288(%rsi),%ymm2,%ymm2
	vpxor 320(%rsi),%ymm3,%ymm3
	vpxor 352(%rsi),%ymm7,%ymm7
	vpxor 384(%rsi),%ymm11,%ymm11
	vpxor 416(%rsi),%ymm9,%ymm9
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	vmovdqu %ymm12,128(%rdi)
	vmovdqu %ymm13,160(%rdi)
	vmovdqu %ymm10,192(%rdi)
	vmovdqu %ymm15,224(%rdi)
	vmovdqu %ymm14,256(%rdi)
	vmovdqu %ymm2,288(%rdi)
	vmovdqu %ymm3,320(%rdi)
	vmovdqu %ymm7,352(%rdi)
	vmovdqu %ymm11,384(%rdi)
	vmovdqu %ymm9,416(%rdi)
	je .Ldone8x

	leaq 448(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm0,0(%rsp)
	leaq 448(%rdi),%rdi
	subq $448,%rdx
	vmovdqa %ymm4,32(%rsp)

.Loop_tail8x:
	movzbl (%rsi,%r10,1),%eax
	movzbl (%rsp,%r10,1),%ecx
	leaq 1(%r10),%r10
	xorl %ecx,%eax
	movb %al,-1(%rdi,%r10,1)
	decq %rdx
	jnz .Loop_tail8x

.Ldone8x:
	vzeroall
	leaq (%r9),%rsp
.cfi_def_cfa_register %rsp
.L8x_epilogue:
	.byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_8x,.-ChaCha20_8x