/* Do not modify. This file is auto-generated from poly1305-x86_64.pl. */
.text

.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

.type	poly1305_init,@function
.align	32
poly1305_init:
.cfi_startproc
	xorq	%rax,%rax
	movq	%rax,0(%rdi)
	movq	%rax,8(%rdi)
	movq	%rax,16(%rdi)

	cmpq	$0,%rsi
	je	.Lno_key

	leaq	poly1305_blocks(%rip),%r10
	leaq	poly1305_emit(%rip),%r11
	movq	OPENSSL_ia32cap_P+4(%rip),%r9
	leaq	poly1305_blocks_avx(%rip),%rax
	leaq	poly1305_emit_avx(%rip),%rcx
	btq	$28,%r9
	cmovcq	%rax,%r10
	cmovcq	%rcx,%r11
	leaq	poly1305_blocks_avx2(%rip),%rax
	btq	$37,%r9
	cmovcq	%rax,%r10
	movq	$0x0ffffffc0fffffff,%rax
	movq	$0x0ffffffc0ffffffc,%rcx
	andq	0(%rsi),%rax
	andq	8(%rsi),%rcx
	movq	%rax,24(%rdi)
	movq	%rcx,32(%rdi)
	movq	%r10,0(%rdx)
	movq	%r11,8(%rdx)
	movl	$1,%eax
.Lno_key:
	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,@function
.align	32
poly1305_blocks:
.cfi_startproc
.Lblocks:
	shrq	$4,%rdx
	jz	.Lno_data

	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lblocks_body:

	movq	%rdx,%r15

	movq	24(%rdi),%r11
	movq	32(%rdi),%r13

	movq	0(%rdi),%r14
	movq	8(%rdi),%rbx
	movq	16(%rdi),%rbp

	movq	%r13,%r12
	shrq	$2,%r13
	movq	%r12,%rax
	addq	%r12,%r13
	jmp	.Loop

.align	32
.Loop:
	addq	0(%rsi),%r14
	adcq	8(%rsi),%rbx
	leaq	16(%rsi),%rsi
	adcq	%rcx,%rbp
	mulq	%r14
	movq	%rax,%r9
	movq	%r11,%rax
	movq	%rdx,%r10

	mulq	%r14
	movq	%rax,%r14
	movq	%r11,%rax
	movq	%rdx,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	%r13,%rax
	adcq	%rdx,%r10

	mulq	%rbx
	movq	%rbp,%rbx
	addq	%rax,%r14
	adcq	%rdx,%r8

	imulq	%r13,%rbx
	addq	%rbx,%r9
	movq	%r8,%rbx
	adcq	$0,%r10

	imulq	%r11,%rbp
	addq	%r9,%rbx
	movq	$-4,%rax
	adcq	%rbp,%r10

	andq	%r10,%rax
	movq	%r10,%rbp
	shrq	$2,%r10
	andq	$3,%rbp
	addq	%r10,%rax
	addq	%rax,%r14
	adcq	$0,%rbx
	adcq	$0,%rbp
	movq	%r12,%rax
	decq	%r15
	jnz	.Loop

	movq	%r14,0(%rdi)
	movq	%rbx,8(%rdi)
	movq	%rbp,16(%rdi)

	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lno_data:
.Lblocks_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,@function
.align	32
poly1305_emit:
.cfi_startproc
.Lemit:
	movq	0(%rdi),%r8
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10

	movq	%r8,%rax
	addq	$5,%r8
	movq	%r9,%rcx
	adcq	$0,%r9
	adcq	$0,%r10
	shrq	$2,%r10
	cmovnzq	%r8,%rax
	cmovnzq	%r9,%rcx

	addq	0(%rdx),%rax
	adcq	8(%rdx),%rcx
	movq	%rax,0(%rsi)
	movq	%rcx,8(%rsi)

	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_emit,.-poly1305_emit
.type	__poly1305_block,@function
.align	32
__poly1305_block:
.cfi_startproc
	mulq	%r14
	movq	%rax,%r9
	movq	%r11,%rax
	movq	%rdx,%r10

	mulq	%r14
	movq	%rax,%r14
	movq	%r11,%rax
	movq	%rdx,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	%r13,%rax
	adcq	%rdx,%r10

	mulq	%rbx
	movq	%rbp,%rbx
	addq	%rax,%r14
	adcq	%rdx,%r8

	imulq	%r13,%rbx
	addq	%rbx,%r9
	movq	%r8,%rbx
	adcq	$0,%r10

	imulq	%r11,%rbp
	addq	%r9,%rbx
	movq	$-4,%rax
	adcq	%rbp,%r10

	andq	%r10,%rax
	movq	%r10,%rbp
	shrq	$2,%r10
	andq	$3,%rbp
	addq	%r10,%rax
	addq	%rax,%r14
	adcq	$0,%rbx
	adcq	$0,%rbp
	.byte	0xf3,0xc3
.cfi_endproc
.size	__poly1305_block,.-__poly1305_block

.type	__poly1305_init_avx,@function
.align	32
__poly1305_init_avx:
.cfi_startproc
	movq	%r11,%r14
	movq	%r12,%rbx
	xorq	%rbp,%rbp

	leaq	48+64(%rdi),%rdi

	movq	%r12,%rax
	call	__poly1305_block

	movl	$0x3ffffff,%eax
	movl	$0x3ffffff,%edx
	movq	%r14,%r8
	andl	%r14d,%eax
	movq	%r11,%r9
	andl	%r11d,%edx
	movl	%eax,-64(%rdi)
	shrq	$26,%r8
	movl	%edx,-60(%rdi)
	shrq	$26,%r9

	movl	$0x3ffffff,%eax
	movl	$0x3ffffff,%edx
	andl	%r8d,%eax
	andl	%r9d,%edx
	movl	%eax,-48(%rdi)
	leal	(%rax,%rax,4),%eax
	movl	%edx,-44(%rdi)
	leal	(%rdx,%rdx,4),%edx
	movl	%eax,-32(%rdi)
	shrq	$26,%r8
	movl	%edx,-28(%rdi)
	shrq	$26,%r9

	movq	%rbx,%rax
	movq	%r12,%rdx
	shlq	$12,%rax
	shlq	$12,%rdx
	orq	%r8,%rax
	orq	%r9,%rdx
	andl	$0x3ffffff,%eax
	andl	$0x3ffffff,%edx
	movl	%eax,-16(%rdi)
	leal	(%rax,%rax,4),%eax
	movl	%edx,-12(%rdi)
	leal	(%rdx,%rdx,4),%edx
	movl	%eax,0(%rdi)
	movq	%rbx,%r8
	movl	%edx,4(%rdi)
	movq	%r12,%r9

	movl	$0x3ffffff,%eax
	movl	$0x3ffffff,%edx
	shrq	$14,%r8
	shrq	$14,%r9
	andl	%r8d,%eax
	andl	%r9d,%edx
	movl	%eax,16(%rdi)
	leal	(%rax,%rax,4),%eax
	movl	%edx,20(%rdi)
	leal	(%rdx,%rdx,4),%edx
	movl	%eax,32(%rdi)
	shrq	$26,%r8
	movl	%edx,36(%rdi)
	shrq	$26,%r9

	movq	%rbp,%rax
	shlq	$24,%rax
	orq	%rax,%r8
	movl	%r8d,48(%rdi)
	leaq	(%r8,%r8,4),%r8
	movl	%r9d,52(%rdi)
	leaq	(%r9,%r9,4),%r9
	movl	%r8d,64(%rdi)
	movl	%r9d,68(%rdi)

	movq	%r12,%rax
	call	__poly1305_block

	movl	$0x3ffffff,%eax
	movq	%r14,%r8
	andl	%r14d,%eax
	shrq	$26,%r8
	movl	%eax,-52(%rdi)

	movl	$0x3ffffff,%edx
	andl	%r8d,%edx
	movl	%edx,-36(%rdi)
	leal	(%rdx,%rdx,4),%edx
	shrq	$26,%r8
	movl	%edx,-20(%rdi)

	movq	%rbx,%rax
	shlq	$12,%rax
	orq	%r8,%rax
	andl	$0x3ffffff,%eax
	movl	%eax,-4(%rdi)
	leal	(%rax,%rax,4),%eax
	movq	%rbx,%r8
	movl	%eax,12(%rdi)

	movl	$0x3ffffff,%edx
	shrq	$14,%r8
	andl	%r8d,%edx
	movl	%edx,28(%rdi)
	leal	(%rdx,%rdx,4),%edx
	shrq	$26,%r8
	movl	%edx,44(%rdi)

	movq	%rbp,%rax
	shlq	$24,%rax
	orq	%rax,%r8
	movl	%r8d,60(%rdi)
	leaq	(%r8,%r8,4),%r8
	movl	%r8d,76(%rdi)

	movq	%r12,%rax
	call	__poly1305_block

	movl	$0x3ffffff,%eax
	movq	%r14,%r8
	andl	%r14d,%eax
	shrq	$26,%r8
	movl	%eax,-56(%rdi)

	movl	$0x3ffffff,%edx
	andl	%r8d,%edx
	movl	%edx,-40(%rdi)
	leal	(%rdx,%rdx,4),%edx
	shrq	$26,%r8
	movl	%edx,-24(%rdi)

	movq	%rbx,%rax
	shlq	$12,%rax
	orq	%r8,%rax
	andl	$0x3ffffff,%eax
	movl	%eax,-8(%rdi)
	leal	(%rax,%rax,4),%eax
	movq	%rbx,%r8
	movl	%eax,8(%rdi)

	movl	$0x3ffffff,%edx
	shrq	$14,%r8
	andl	%r8d,%edx
	movl	%edx,24(%rdi)
	leal	(%rdx,%rdx,4),%edx
	shrq	$26,%r8
	movl	%edx,40(%rdi)

	movq	%rbp,%rax
	shlq	$24,%rax
	orq	%rax,%r8
	movl	%r8d,56(%rdi)
	leaq	(%r8,%r8,4),%r8
	movl	%r8d,72(%rdi)

	leaq	-48-64(%rdi),%rdi
	.byte	0xf3,0xc3
.cfi_endproc
.size	__poly1305_init_avx,.-__poly1305_init_avx

.type	poly1305_blocks_avx,@function
.align	32
poly1305_blocks_avx:
.cfi_startproc
	movl	20(%rdi),%r8d
	cmpq	$128,%rdx
	jae	.Lblocks_avx
	testl	%r8d,%r8d
	jz	.Lblocks

.Lblocks_avx:
	andq	$-16,%rdx
	jz	.Lno_data_avx

	vzeroupper

	testl	%r8d,%r8d
	jz	.Lbase2_64_avx

	testq	$31,%rdx
	jz	.Leven_avx

	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lblocks_avx_body:

	movq	%rdx,%r15

	movq	0(%rdi),%r8
	movq	8(%rdi),%r9
	movl	16(%rdi),%ebp

	movq	24(%rdi),%r11
	movq	32(%rdi),%r13

	movl	%r8d,%r14d
	andq	$-2147483648,%r8
	movq	%r9,%r12
	movl	%r9d,%ebx
	andq	$-2147483648,%r9

	shrq	$6,%r8
	shlq	$52,%r12
	addq	%r8,%r14
	shrq	$12,%rbx
	shrq	$18,%r9
	addq	%r12,%r14
	adcq	%r9,%rbx

	movq	%rbp,%r8
	shlq	$40,%r8
	shrq	$24,%rbp
	addq	%r8,%rbx
	adcq	$0,%rbp

	movq	$-4,%r9
	movq	%rbp,%r8
	andq	%rbp,%r9
	shrq	$2,%r8
	andq	$3,%rbp
	addq	%r9,%r8
	addq	%r8,%r14
	adcq	$0,%rbx
	adcq	$0,%rbp

	movq	%r13,%r12
	movq	%r13,%rax
	shrq	$2,%r13
	addq	%r12,%r13

	addq	0(%rsi),%r14
	adcq	8(%rsi),%rbx
	leaq	16(%rsi),%rsi
	adcq	%rcx,%rbp

	call	__poly1305_block

	testq	%rcx,%rcx
	jz	.Lstore_base2_64_avx

	movq	%r14,%rax
	movq	%r14,%rdx
	shrq	$52,%r14
	movq	%rbx,%r11
	movq	%rbx,%r12
	shrq	$26,%rdx
	andq	$0x3ffffff,%rax
	shlq	$12,%r11
	andq	$0x3ffffff,%rdx
	shrq	$14,%rbx
	orq	%r11,%r14
	shlq	$24,%rbp
	andq	$0x3ffffff,%r14
	shrq	$40,%r12
	andq	$0x3ffffff,%rbx
	orq	%r12,%rbp

	subq	$16,%r15
	jz	.Lstore_base2_26_avx

	vmovd	%eax,%xmm0
	vmovd	%edx,%xmm1
	vmovd	%r14d,%xmm2
	vmovd	%ebx,%xmm3
	vmovd	%ebp,%xmm4
	jmp	.Lproceed_avx

.align	32
.Lstore_base2_64_avx:
	movq	%r14,0(%rdi)
	movq	%rbx,8(%rdi)
	movq	%rbp,16(%rdi)
	jmp	.Ldone_avx

.align	16
.Lstore_base2_26_avx:
	movl	%eax,0(%rdi)
	movl	%edx,4(%rdi)
	movl	%r14d,8(%rdi)
	movl	%ebx,12(%rdi)
	movl	%ebp,16(%rdi)
.align	16
.Ldone_avx:
	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lno_data_avx:
.Lblocks_avx_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc

.align	32
.Lbase2_64_avx:
.cfi_startproc
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lbase2_64_avx_body:

	movq	%rdx,%r15

	movq	24(%rdi),%r11
	movq	32(%rdi),%r13

	movq	0(%rdi),%r14
	movq	8(%rdi),%rbx
	movl	16(%rdi),%ebp

	movq	%r13,%r12
	movq	%r13,%rax
	shrq	$2,%r13
	addq	%r12,%r13

	testq	$31,%rdx
	jz	.Linit_avx

	addq	0(%rsi),%r14
	adcq	8(%rsi),%rbx
	leaq	16(%rsi),%rsi
	adcq	%rcx,%rbp
	subq	$16,%r15

	call	__poly1305_block

.Linit_avx:

	movq	%r14,%rax
	movq	%r14,%rdx
	shrq	$52,%r14
	movq	%rbx,%r8
	movq	%rbx,%r9
	shrq	$26,%rdx
	andq	$0x3ffffff,%rax
	shlq	$12,%r8
	andq	$0x3ffffff,%rdx
	shrq	$14,%rbx
	orq	%r8,%r14
	shlq	$24,%rbp
	andq	$0x3ffffff,%r14
	shrq	$40,%r9
	andq	$0x3ffffff,%rbx
	orq	%r9,%rbp

	vmovd	%eax,%xmm0
	vmovd	%edx,%xmm1
	vmovd	%r14d,%xmm2
	vmovd	%ebx,%xmm3
	vmovd	%ebp,%xmm4
	movl	$1,20(%rdi)

	call	__poly1305_init_avx

.Lproceed_avx:
	movq	%r15,%rdx

	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rax
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lbase2_64_avx_epilogue:
	jmp	.Ldo_avx
.cfi_endproc

.align	32
.Leven_avx:
.cfi_startproc
	vmovd	0(%rdi),%xmm0
	vmovd	4(%rdi),%xmm1
	vmovd	8(%rdi),%xmm2
	vmovd	12(%rdi),%xmm3
	vmovd	16(%rdi),%xmm4

.Ldo_avx:
	leaq	-88(%rsp),%r11
.cfi_def_cfa	%r11,0x60
	subq	$0x178,%rsp
	subq	$64,%rdx
	leaq	-32(%rsi),%rax
	cmovcq	%rax,%rsi

	vmovdqu	48(%rdi),%xmm14
	leaq	112(%rdi),%rdi
	leaq	.Lconst(%rip),%rcx

	vmovdqu	32(%rsi),%xmm5
	vmovdqu	48(%rsi),%xmm6
	vmovdqa	64(%rcx),%xmm15

	vpsrldq	$6,%xmm5,%xmm7
	vpsrldq	$6,%xmm6,%xmm8
	vpunpckhqdq	%xmm6,%xmm5,%xmm9
	vpunpcklqdq	%xmm6,%xmm5,%xmm5
	vpunpcklqdq	%xmm8,%xmm7,%xmm8

	vpsrlq	$40,%xmm9,%xmm9
	vpsrlq	$26,%xmm5,%xmm6
	vpand	%xmm15,%xmm5,%xmm5
	vpsrlq	$4,%xmm8,%xmm7
	vpand	%xmm15,%xmm6,%xmm6
	vpsrlq	$30,%xmm8,%xmm8
	vpand	%xmm15,%xmm7,%xmm7
	vpand	%xmm15,%xmm8,%xmm8
	vpor	32(%rcx),%xmm9,%xmm9

	jbe	.Lskip_loop_avx

	vmovdqu	-48(%rdi),%xmm11
	vmovdqu	-32(%rdi),%xmm12
	vpshufd	$0xEE,%xmm14,%xmm13
	vpshufd	$0x44,%xmm14,%xmm10
	vmovdqa	%xmm13,-144(%r11)
	vmovdqa	%xmm10,0(%rsp)
	vpshufd	$0xEE,%xmm11,%xmm14
	vmovdqu	-16(%rdi),%xmm10
	vpshufd	$0x44,%xmm11,%xmm11
	vmovdqa	%xmm14,-128(%r11)
	vmovdqa	%xmm11,16(%rsp)
	vpshufd	$0xEE,%xmm12,%xmm13
	vmovdqu	0(%rdi),%xmm11
	vpshufd	$0x44,%xmm12,%xmm12
	vmovdqa	%xmm13,-112(%r11)
	vmovdqa	%xmm12,32(%rsp)
	vpshufd	$0xEE,%xmm10,%xmm14
	vmovdqu	16(%rdi),%xmm12
	vpshufd	$0x44,%xmm10,%xmm10
	vmovdqa	%xmm14,-96(%r11)
	vmovdqa	%xmm10,48(%rsp)
	vpshufd	$0xEE,%xmm11,%xmm13
	vmovdqu	32(%rdi),%xmm10
	vpshufd	$0x44,%xmm11,%xmm11
	vmovdqa	%xmm13,-80(%r11)
	vmovdqa	%xmm11,64(%rsp)
	vpshufd	$0xEE,%xmm12,%xmm14
	vmovdqu	48(%rdi),%xmm11
	vpshufd	$0x44,%xmm12,%xmm12
	vmovdqa	%xmm14,-64(%r11)
	vmovdqa	%xmm12,80(%rsp)
	vpshufd	$0xEE,%xmm10,%xmm13
	vmovdqu	64(%rdi),%xmm12
	vpshufd	$0x44,%xmm10,%xmm10
	vmovdqa	%xmm13,-48(%r11)
	vmovdqa	%xmm10,96(%rsp)
	vpshufd	$0xEE,%xmm11,%xmm14
	vpshufd	$0x44,%xmm11,%xmm11
	vmovdqa	%xmm14,-32(%r11)
	vmovdqa	%xmm11,112(%rsp)
	vpshufd	$0xEE,%xmm12,%xmm13
	vmovdqa	0(%rsp),%xmm14
	vpshufd	$0x44,%xmm12,%xmm12
	vmovdqa	%xmm13,-16(%r11)
	vmovdqa	%xmm12,128(%rsp)

	jmp	.Loop_avx

.align	32
.Loop_avx:

	vpmuludq	%xmm5,%xmm14,%xmm10
	vpmuludq	%xmm6,%xmm14,%xmm11
	vmovdqa	%xmm2,32(%r11)
	vpmuludq	%xmm7,%xmm14,%xmm12
	vmovdqa	16(%rsp),%xmm2
	vpmuludq	%xmm8,%xmm14,%xmm13
	vpmuludq	%xmm9,%xmm14,%xmm14

	vmovdqa	%xmm0,0(%r11)
	vpmuludq	32(%rsp),%xmm9,%xmm0
	vmovdqa	%xmm1,16(%r11)
	vpmuludq	%xmm8,%xmm2,%xmm1
	vpaddq	%xmm0,%xmm10,%xmm10
	vpaddq	%xmm1,%xmm14,%xmm14
	vmovdqa	%xmm3,48(%r11)
	vpmuludq	%xmm7,%xmm2,%xmm0
	vpmuludq	%xmm6,%xmm2,%xmm1
	vpaddq	%xmm0,%xmm13,%xmm13
	vmovdqa	48(%rsp),%xmm3
	vpaddq	%xmm1,%xmm12,%xmm12
	vmovdqa	%xmm4,64(%r11)
	vpmuludq	%xmm5,%xmm2,%xmm2
	vpmuludq	%xmm7,%xmm3,%xmm0
	vpaddq	%xmm2,%xmm11,%xmm11

	vmovdqa	64(%rsp),%xmm4
	vpaddq	%xmm0,%xmm14,%xmm14
	vpmuludq	%xmm6,%xmm3,%xmm1
	vpmuludq	%xmm5,%xmm3,%xmm3
	vpaddq	%xmm1,%xmm13,%xmm13
	vmovdqa	80(%rsp),%xmm2
	vpaddq	%xmm3,%xmm12,%xmm12
	vpmuludq	%xmm9,%xmm4,%xmm0
	vpmuludq	%xmm8,%xmm4,%xmm4
	vpaddq	%xmm0,%xmm11,%xmm11
	vmovdqa	96(%rsp),%xmm3
	vpaddq	%xmm4,%xmm10,%xmm10

	vmovdqa	128(%rsp),%xmm4
	vpmuludq	%xmm6,%xmm2,%xmm1
	vpmuludq	%xmm5,%xmm2,%xmm2
	vpaddq	%xmm1,%xmm14,%xmm14
	vpaddq	%xmm2,%xmm13,%xmm13
	vpmuludq	%xmm9,%xmm3,%xmm0
	vpmuludq	%xmm8,%xmm3,%xmm1
	vpaddq	%xmm0,%xmm12,%xmm12
	vmovdqu	0(%rsi),%xmm0
	vpaddq	%xmm1,%xmm11,%xmm11
	vpmuludq	%xmm7,%xmm3,%xmm3
	vpmuludq	%xmm7,%xmm4,%xmm7
	vpaddq	%xmm3,%xmm10,%xmm10

	vmovdqu	16(%rsi),%xmm1
	vpaddq	%xmm7,%xmm11,%xmm11
	vpmuludq	%xmm8,%xmm4,%xmm8
	vpmuludq	%xmm9,%xmm4,%xmm9
	vpsrldq	$6,%xmm0,%xmm2
	vpaddq	%xmm8,%xmm12,%xmm12
	vpaddq	%xmm9,%xmm13,%xmm13
	vpsrldq	$6,%xmm1,%xmm3
	vpmuludq	112(%rsp),%xmm5,%xmm9
	vpmuludq	%xmm6,%xmm4,%xmm5
	vpunpckhqdq	%xmm1,%xmm0,%xmm4
	vpaddq	%xmm9,%xmm14,%xmm14
	vmovdqa	-144(%r11),%xmm9
	vpaddq	%xmm5,%xmm10,%xmm10

	vpunpcklqdq	%xmm1,%xmm0,%xmm0
	vpunpcklqdq	%xmm3,%xmm2,%xmm3

	vpsrldq	$5,%xmm4,%xmm4
	vpsrlq	$26,%xmm0,%xmm1
	vpand	%xmm15,%xmm0,%xmm0
	vpsrlq	$4,%xmm3,%xmm2
	vpand	%xmm15,%xmm1,%xmm1
	vpand	0(%rcx),%xmm4,%xmm4
	vpsrlq	$30,%xmm3,%xmm3
	vpand	%xmm15,%xmm2,%xmm2
	vpand	%xmm15,%xmm3,%xmm3
	vpor	32(%rcx),%xmm4,%xmm4

	vpaddq	0(%r11),%xmm0,%xmm0
	vpaddq	16(%r11),%xmm1,%xmm1
	vpaddq	32(%r11),%xmm2,%xmm2
	vpaddq	48(%r11),%xmm3,%xmm3
	vpaddq	64(%r11),%xmm4,%xmm4

	leaq	32(%rsi),%rax
	leaq	64(%rsi),%rsi
	subq	$64,%rdx
	cmovcq	%rax,%rsi

	vpmuludq	%xmm0,%xmm9,%xmm5
	vpmuludq	%xmm1,%xmm9,%xmm6
	vpaddq	%xmm5,%xmm10,%xmm10
	vpaddq	%xmm6,%xmm11,%xmm11
	vmovdqa	-128(%r11),%xmm7
	vpmuludq	%xmm2,%xmm9,%xmm5
	vpmuludq	%xmm3,%xmm9,%xmm6
	vpaddq	%xmm5,%xmm12,%xmm12
	vpaddq	%xmm6,%xmm13,%xmm13
	vpmuludq	%xmm4,%xmm9,%xmm9
	vpmuludq	-112(%r11),%xmm4,%xmm5
	vpaddq	%xmm9,%xmm14,%xmm14

	vpaddq	%xmm5,%xmm10,%xmm10
	vpmuludq	%xmm2,%xmm7,%xmm6
	vpmuludq	%xmm3,%xmm7,%xmm5
	vpaddq	%xmm6,%xmm13,%xmm13
	vmovdqa	-96(%r11),%xmm8
	vpaddq	%xmm5,%xmm14,%xmm14
	vpmuludq	%xmm1,%xmm7,%xmm6
	vpmuludq	%xmm0,%xmm7,%xmm7
	vpaddq	%xmm6,%xmm12,%xmm12
	vpaddq	%xmm7,%xmm11,%xmm11

	vmovdqa	-80(%r11),%xmm9
	vpmuludq	%xmm2,%xmm8,%xmm5
	vpmuludq	%xmm1,%xmm8,%xmm6
	vpaddq	%xmm5,%xmm14,%xmm14
	vpaddq	%xmm6,%xmm13,%xmm13
	vmovdqa	-64(%r11),%xmm7
	vpmuludq	%xmm0,%xmm8,%xmm8
	vpmuludq	%xmm4,%xmm9,%xmm5
	vpaddq	%xmm8,%xmm12,%xmm12
	vpaddq	%xmm5,%xmm11,%xmm11
	vmovdqa	-48(%r11),%xmm8
	vpmuludq	%xmm3,%xmm9,%xmm9
	vpmuludq	%xmm1,%xmm7,%xmm6
	vpaddq	%xmm9,%xmm10,%xmm10

	vmovdqa	-16(%r11),%xmm9
	vpaddq	%xmm6,%xmm14,%xmm14
	vpmuludq	%xmm0,%xmm7,%xmm7
	vpmuludq	%xmm4,%xmm8,%xmm5
	vpaddq	%xmm7,%xmm13,%xmm13
	vpaddq	%xmm5,%xmm12,%xmm12
	vmovdqu	32(%rsi),%xmm5
	vpmuludq	%xmm3,%xmm8,%xmm7
	vpmuludq	%xmm2,%xmm8,%xmm8
	vpaddq	%xmm7,%xmm11,%xmm11
	vmovdqu	48(%rsi),%xmm6
	vpaddq	%xmm8,%xmm10,%xmm10

	vpmuludq	%xmm2,%xmm9,%xmm2
	vpmuludq	%xmm3,%xmm9,%xmm3
	vpsrldq	$6,%xmm5,%xmm7
	vpaddq	%xmm2,%xmm11,%xmm11
	vpmuludq	%xmm4,%xmm9,%xmm4
	vpsrldq	$6,%xmm6,%xmm8
	vpaddq	%xmm3,%xmm12,%xmm2
	vpaddq	%xmm4,%xmm13,%xmm3
	vpmuludq	-32(%r11),%xmm0,%xmm4
	vpmuludq	%xmm1,%xmm9,%xmm0
	vpunpckhqdq	%xmm6,%xmm5,%xmm9
	vpaddq	%xmm4,%xmm14,%xmm4
	vpaddq	%xmm0,%xmm10,%xmm0

	vpunpcklqdq	%xmm6,%xmm5,%xmm5
	vpunpcklqdq	%xmm8,%xmm7,%xmm8

	vpsrldq	$5,%xmm9,%xmm9
	vpsrlq	$26,%xmm5,%xmm6
	vmovdqa	0(%rsp),%xmm14
	vpand	%xmm15,%xmm5,%xmm5
	vpsrlq	$4,%xmm8,%xmm7
	vpand	%xmm15,%xmm6,%xmm6
	vpand	0(%rcx),%xmm9,%xmm9
	vpsrlq	$30,%xmm8,%xmm8
	vpand	%xmm15,%xmm7,%xmm7
	vpand	%xmm15,%xmm8,%xmm8
	vpor	32(%rcx),%xmm9,%xmm9

	vpsrlq	$26,%xmm3,%xmm13
	vpand	%xmm15,%xmm3,%xmm3
	vpaddq	%xmm13,%xmm4,%xmm4

	vpsrlq	$26,%xmm0,%xmm10
	vpand	%xmm15,%xmm0,%xmm0
	vpaddq	%xmm10,%xmm11,%xmm1

	vpsrlq	$26,%xmm4,%xmm10
	vpand	%xmm15,%xmm4,%xmm4

	vpsrlq	$26,%xmm1,%xmm11
	vpand	%xmm15,%xmm1,%xmm1
	vpaddq	%xmm11,%xmm2,%xmm2

	vpaddq	%xmm10,%xmm0,%xmm0
	vpsllq	$2,%xmm10,%xmm10
	vpaddq	%xmm10,%xmm0,%xmm0

	vpsrlq	$26,%xmm2,%xmm12
	vpand	%xmm15,%xmm2,%xmm2
	vpaddq	%xmm12,%xmm3,%xmm3

	vpsrlq	$26,%xmm0,%xmm10
	vpand	%xmm15,%xmm0,%xmm0
	vpaddq	%xmm10,%xmm1,%xmm1

	vpsrlq	$26,%xmm3,%xmm13
	vpand	%xmm15,%xmm3,%xmm3
	vpaddq	%xmm13,%xmm4,%xmm4

	ja	.Loop_avx

.Lskip_loop_avx:

	vpshufd	$0x10,%xmm14,%xmm14
	addq	$32,%rdx
	jnz	.Long_tail_avx

	vpaddq	%xmm2,%xmm7,%xmm7
	vpaddq	%xmm0,%xmm5,%xmm5
	vpaddq	%xmm1,%xmm6,%xmm6
	vpaddq	%xmm3,%xmm8,%xmm8
	vpaddq	%xmm4,%xmm9,%xmm9

.Long_tail_avx:
	vmovdqa	%xmm2,32(%r11)
	vmovdqa	%xmm0,0(%r11)
	vmovdqa	%xmm1,16(%r11)
	vmovdqa	%xmm3,48(%r11)
	vmovdqa	%xmm4,64(%r11)

	vpmuludq	%xmm7,%xmm14,%xmm12
	vpmuludq	%xmm5,%xmm14,%xmm10
	vpshufd	$0x10,-48(%rdi),%xmm2
	vpmuludq	%xmm6,%xmm14,%xmm11
	vpmuludq	%xmm8,%xmm14,%xmm13
	vpmuludq	%xmm9,%xmm14,%xmm14

	vpmuludq	%xmm8,%xmm2,%xmm0
	vpaddq	%xmm0,%xmm14,%xmm14
	vpshufd	$0x10,-32(%rdi),%xmm3
	vpmuludq	%xmm7,%xmm2,%xmm1
	vpaddq	%xmm1,%xmm13,%xmm13
	vpshufd	$0x10,-16(%rdi),%xmm4
	vpmuludq	%xmm6,%xmm2,%xmm0
	vpaddq	%xmm0,%xmm12,%xmm12
	vpmuludq	%xmm5,%xmm2,%xmm2
	vpaddq	%xmm2,%xmm11,%xmm11
	vpmuludq	%xmm9,%xmm3,%xmm3
	vpaddq	%xmm3,%xmm10,%xmm10

	vpshufd	$0x10,0(%rdi),%xmm2
	vpmuludq	%xmm7,%xmm4,%xmm1
	vpaddq	%xmm1,%xmm14,%xmm14
	vpmuludq	%xmm6,%xmm4,%xmm0
	vpaddq	%xmm0,%xmm13,%xmm13
	vpshufd	$0x10,16(%rdi),%xmm3
	vpmuludq	%xmm5,%xmm4,%xmm4
	vpaddq	%xmm4,%xmm12,%xmm12
	vpmuludq	%xmm9,%xmm2,%xmm1
	vpaddq	%xmm1,%xmm11,%xmm11
	vpshufd	$0x10,32(%rdi),%xmm4
	vpmuludq	%xmm8,%xmm2,%xmm2
	vpaddq	%xmm2,%xmm10,%xmm10

	vpmuludq	%xmm6,%xmm3,%xmm0
	vpaddq	%xmm0,%xmm14,%xmm14
	vpmuludq	%xmm5,%xmm3,%xmm3
	vpaddq	%xmm3,%xmm13,%xmm13
	vpshufd	$0x10,48(%rdi),%xmm2
	vpmuludq	%xmm9,%xmm4,%xmm1
	vpaddq	%xmm1,%xmm12,%xmm12
	vpshufd	$0x10,64(%rdi),%xmm3
	vpmuludq	%xmm8,%xmm4,%xmm0
	vpaddq	%xmm0,%xmm11,%xmm11
	vpmuludq	%xmm7,%xmm4,%xmm4
	vpaddq	%xmm4,%xmm10,%xmm10

	vpmuludq	%xmm5,%xmm2,%xmm2
	vpaddq	%xmm2,%xmm14,%xmm14
	vpmuludq	%xmm9,%xmm3,%xmm1
	vpaddq	%xmm1,%xmm13,%xmm13
	vpmuludq	%xmm8,%xmm3,%xmm0
	vpaddq	%xmm0,%xmm12,%xmm12
	vpmuludq	%xmm7,%xmm3,%xmm1
	vpaddq	%xmm1,%xmm11,%xmm11
	vpmuludq	%xmm6,%xmm3,%xmm3
	vpaddq	%xmm3,%xmm10,%xmm10

	jz	.Lshort_tail_avx

	vmovdqu	0(%rsi),%xmm0
	vmovdqu	16(%rsi),%xmm1

	vpsrldq	$6,%xmm0,%xmm2
	vpsrldq	$6,%xmm1,%xmm3
	vpunpckhqdq	%xmm1,%xmm0,%xmm4
	vpunpcklqdq	%xmm1,%xmm0,%xmm0
	vpunpcklqdq	%xmm3,%xmm2,%xmm3

	vpsrlq	$40,%xmm4,%xmm4
	vpsrlq	$26,%xmm0,%xmm1
	vpand	%xmm15,%xmm0,%xmm0
	vpsrlq	$4,%xmm3,%xmm2
	vpand	%xmm15,%xmm1,%xmm1
	vpsrlq	$30,%xmm3,%xmm3
	vpand	%xmm15,%xmm2,%xmm2
	vpand	%xmm15,%xmm3,%xmm3
	vpor	32(%rcx),%xmm4,%xmm4

	vpshufd	$0x32,-64(%rdi),%xmm9
	vpaddq	0(%r11),%xmm0,%xmm0
	vpaddq	16(%r11),%xmm1,%xmm1
	vpaddq	32(%r11),%xmm2,%xmm2
	vpaddq	48(%r11),%xmm3,%xmm3
	vpaddq	64(%r11),%xmm4,%xmm4

	vpmuludq	%xmm0,%xmm9,%xmm5
	vpaddq	%xmm5,%xmm10,%xmm10
	vpmuludq	%xmm1,%xmm9,%xmm6
	vpaddq	%xmm6,%xmm11,%xmm11
	vpmuludq	%xmm2,%xmm9,%xmm5
	vpaddq	%xmm5,%xmm12,%xmm12
	vpshufd	$0x32,-48(%rdi),%xmm7
	vpmuludq	%xmm3,%xmm9,%xmm6
	vpaddq	%xmm6,%xmm13,%xmm13
	vpmuludq	%xmm4,%xmm9,%xmm9
	vpaddq	%xmm9,%xmm14,%xmm14

	vpmuludq	%xmm3,%xmm7,%xmm5
	vpaddq	%xmm5,%xmm14,%xmm14
	vpshufd	$0x32,-32(%rdi),%xmm8
	vpmuludq	%xmm2,%xmm7,%xmm6
	vpaddq	%xmm6,%xmm13,%xmm13
	vpshufd	$0x32,-16(%rdi),%xmm9
	vpmuludq	%xmm1,%xmm7,%xmm5
	vpaddq	%xmm5,%xmm12,%xmm12
	vpmuludq	%xmm0,%xmm7,%xmm7
	vpaddq	%xmm7,%xmm11,%xmm11
	vpmuludq	%xmm4,%xmm8,%xmm8
	vpaddq	%xmm8,%xmm10,%xmm10

	vpshufd	$0x32,0(%rdi),%xmm7
	vpmuludq	%xmm2,%xmm9,%xmm6
	vpaddq	%xmm6,%xmm14,%xmm14
	vpmuludq	%xmm1,%xmm9,%xmm5
	vpaddq	%xmm5,%xmm13,%xmm13
	vpshufd	$0x32,16(%rdi),%xmm8
	vpmuludq	%xmm0,%xmm9,%xmm9
	vpaddq	%xmm9,%xmm12,%xmm12
	vpmuludq	%xmm4,%xmm7,%xmm6
	vpaddq	%xmm6,%xmm11,%xmm11
	vpshufd	$0x32,32(%rdi),%xmm9
	vpmuludq	%xmm3,%xmm7,%xmm7
	vpaddq	%xmm7,%xmm10,%xmm10

	vpmuludq	%xmm1,%xmm8,%xmm5
	vpaddq	%xmm5,%xmm14,%xmm14
	vpmuludq	%xmm0,%xmm8,%xmm8
	vpaddq	%xmm8,%xmm13,%xmm13
	vpshufd	$0x32,48(%rdi),%xmm7
	vpmuludq	%xmm4,%xmm9,%xmm6
	vpaddq	%xmm6,%xmm12,%xmm12
	vpshufd	$0x32,64(%rdi),%xmm8
	vpmuludq	%xmm3,%xmm9,%xmm5
	vpaddq	%xmm5,%xmm11,%xmm11
	vpmuludq	%xmm2,%xmm9,%xmm9
	vpaddq	%xmm9,%xmm10,%xmm10

	vpmuludq	%xmm0,%xmm7,%xmm7
	vpaddq	%xmm7,%xmm14,%xmm14
	vpmuludq	%xmm4,%xmm8,%xmm6
	vpaddq	%xmm6,%xmm13,%xmm13
	vpmuludq	%xmm3,%xmm8,%xmm5
	vpaddq	%xmm5,%xmm12,%xmm12
	vpmuludq	%xmm2,%xmm8,%xmm6
	vpaddq	%xmm6,%xmm11,%xmm11
	vpmuludq	%xmm1,%xmm8,%xmm8
	vpaddq	%xmm8,%xmm10,%xmm10

.Lshort_tail_avx:

	vpsrldq	$8,%xmm14,%xmm9
	vpsrldq	$8,%xmm13,%xmm8
	vpsrldq	$8,%xmm11,%xmm6
	vpsrldq	$8,%xmm10,%xmm5
	vpsrldq	$8,%xmm12,%xmm7
	vpaddq	%xmm8,%xmm13,%xmm13
	vpaddq	%xmm9,%xmm14,%xmm14
	vpaddq	%xmm5,%xmm10,%xmm10
	vpaddq	%xmm6,%xmm11,%xmm11
	vpaddq	%xmm7,%xmm12,%xmm12

	vpsrlq	$26,%xmm13,%xmm3
	vpand	%xmm15,%xmm13,%xmm13
	vpaddq	%xmm3,%xmm14,%xmm14

	vpsrlq	$26,%xmm10,%xmm0
	vpand	%xmm15,%xmm10,%xmm10
	vpaddq	%xmm0,%xmm11,%xmm11

	vpsrlq	$26,%xmm14,%xmm4
	vpand	%xmm15,%xmm14,%xmm14

	vpsrlq	$26,%xmm11,%xmm1
	vpand	%xmm15,%xmm11,%xmm11
	vpaddq	%xmm1,%xmm12,%xmm12

	vpaddq	%xmm4,%xmm10,%xmm10
	vpsllq	$2,%xmm4,%xmm4
	vpaddq	%xmm4,%xmm10,%xmm10

	vpsrlq	$26,%xmm12,%xmm2
	vpand	%xmm15,%xmm12,%xmm12
	vpaddq	%xmm2,%xmm13,%xmm13

	vpsrlq	$26,%xmm10,%xmm0
	vpand	%xmm15,%xmm10,%xmm10
	vpaddq	%xmm0,%xmm11,%xmm11

	vpsrlq	$26,%xmm13,%xmm3
	vpand	%xmm15,%xmm13,%xmm13
	vpaddq	%xmm3,%xmm14,%xmm14

	vmovd	%xmm10,-112(%rdi)
	vmovd	%xmm11,-108(%rdi)
	vmovd	%xmm12,-104(%rdi)
	vmovd	%xmm13,-100(%rdi)
	vmovd	%xmm14,-96(%rdi)
	leaq	88(%r11),%rsp
.cfi_def_cfa	%rsp,8
	vzeroupper
	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_blocks_avx,.-poly1305_blocks_avx

.type	poly1305_emit_avx,@function
.align	32
poly1305_emit_avx:
.cfi_startproc
	cmpl	$0,20(%rdi)
	je	.Lemit

	movl	0(%rdi),%eax
	movl	4(%rdi),%ecx
	movl	8(%rdi),%r8d
	movl	12(%rdi),%r11d
	movl	16(%rdi),%r10d

	shlq	$26,%rcx
	movq	%r8,%r9
	shlq	$52,%r8
	addq	%rcx,%rax
	shrq	$12,%r9
	addq	%rax,%r8
	adcq	$0,%r9

	shlq	$14,%r11
	movq	%r10,%rax
	shrq	$24,%r10
	addq	%r11,%r9
	shlq	$40,%rax
	addq	%rax,%r9
	adcq	$0,%r10

	movq	%r10,%rax
	movq	%r10,%rcx
	andq	$3,%r10
	shrq	$2,%rax
	andq	$-4,%rcx
	addq	%rcx,%rax
	addq	%rax,%r8
	adcq	$0,%r9
	adcq	$0,%r10

	movq	%r8,%rax
	addq	$5,%r8
	movq	%r9,%rcx
	adcq	$0,%r9
	adcq	$0,%r10
	shrq	$2,%r10
	cmovnzq	%r8,%rax
	cmovnzq	%r9,%rcx

	addq	0(%rdx),%rax
	adcq	8(%rdx),%rcx
	movq	%rax,0(%rsi)
	movq	%rcx,8(%rsi)

	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_emit_avx,.-poly1305_emit_avx
.type	poly1305_blocks_avx2,@function
.align	32
poly1305_blocks_avx2:
.cfi_startproc
	movl	20(%rdi),%r8d
	cmpq	$128,%rdx
	jae	.Lblocks_avx2
	testl	%r8d,%r8d
	jz	.Lblocks

.Lblocks_avx2:
	andq	$-16,%rdx
	jz	.Lno_data_avx2

	vzeroupper

	testl	%r8d,%r8d
	jz	.Lbase2_64_avx2

	testq	$63,%rdx
	jz	.Leven_avx2

	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lblocks_avx2_body:
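/* Length is not a multiple of 64 bytes and the state is already in base 2^26:
   rebuild the 64-bit accumulator from the five 26-bit limbs, absorb leading
   16-byte blocks with the scalar __poly1305_block, then either store the
   result or split it back into 26-bit limbs for the vector code below. */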

	movq	%rdx,%r15

	movq	0(%rdi),%r8
	movq	8(%rdi),%r9
	movl	16(%rdi),%ebp

	movq	24(%rdi),%r11
	movq	32(%rdi),%r13

	movl	%r8d,%r14d
	andq	$-2147483648,%r8
	movq	%r9,%r12
	movl	%r9d,%ebx
	andq	$-2147483648,%r9

	shrq	$6,%r8
	shlq	$52,%r12
	addq	%r8,%r14
	shrq	$12,%rbx
	shrq	$18,%r9
	addq	%r12,%r14
	adcq	%r9,%rbx

	movq	%rbp,%r8
	shlq	$40,%r8
	shrq	$24,%rbp
	addq	%r8,%rbx
	adcq	$0,%rbp

	movq	$-4,%r9
	movq	%rbp,%r8
	andq	%rbp,%r9
	shrq	$2,%r8
	andq	$3,%rbp
	addq	%r9,%r8
	addq	%r8,%r14
	adcq	$0,%rbx
	adcq	$0,%rbp

	movq	%r13,%r12
	movq	%r13,%rax
	shrq	$2,%r13
	addq	%r12,%r13

.Lbase2_26_pre_avx2:
	addq	0(%rsi),%r14
	adcq	8(%rsi),%rbx
	leaq	16(%rsi),%rsi
	adcq	%rcx,%rbp
	subq	$16,%r15

	call	__poly1305_block
	movq	%r12,%rax

	testq	$63,%r15
	jnz	.Lbase2_26_pre_avx2

	testq	%rcx,%rcx
	jz	.Lstore_base2_64_avx2

	movq	%r14,%rax
	movq	%r14,%rdx
	shrq	$52,%r14
	movq	%rbx,%r11
	movq	%rbx,%r12
	shrq	$26,%rdx
	andq	$0x3ffffff,%rax
	shlq	$12,%r11
	andq	$0x3ffffff,%rdx
	shrq	$14,%rbx
	orq	%r11,%r14
	shlq	$24,%rbp
	andq	$0x3ffffff,%r14
	shrq	$40,%r12
	andq	$0x3ffffff,%rbx
	orq	%r12,%rbp

	testq	%r15,%r15
	jz	.Lstore_base2_26_avx2

	vmovd	%eax,%xmm0
	vmovd	%edx,%xmm1
	vmovd	%r14d,%xmm2
	vmovd	%ebx,%xmm3
	vmovd	%ebp,%xmm4
	jmp	.Lproceed_avx2

.align	32
.Lstore_base2_64_avx2:
	movq	%r14,0(%rdi)
	movq	%rbx,8(%rdi)
	movq	%rbp,16(%rdi)
	jmp	.Ldone_avx2

.align	16
.Lstore_base2_26_avx2:
	movl	%eax,0(%rdi)
	movl	%edx,4(%rdi)
	movl	%r14d,8(%rdi)
	movl	%ebx,12(%rdi)
	movl	%ebp,16(%rdi)
.align	16
.Ldone_avx2:
	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lno_data_avx2:
.Lblocks_avx2_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc

.align	32
.Lbase2_64_avx2:
.cfi_startproc
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lbase2_64_avx2_body:

	movq	%rdx,%r15

	movq	24(%rdi),%r11
	movq	32(%rdi),%r13

	movq	0(%rdi),%r14
	movq	8(%rdi),%rbx
	movl	16(%rdi),%ebp

	movq	%r13,%r12
	movq	%r13,%rax
	shrq	$2,%r13
	addq	%r12,%r13

	testq	$63,%rdx
	jz	.Linit_avx2

.Lbase2_64_pre_avx2:
	addq	0(%rsi),%r14
	adcq	8(%rsi),%rbx
	leaq	16(%rsi),%rsi
	adcq	%rcx,%rbp
	subq	$16,%r15

	call	__poly1305_block
	movq	%r12,%rax

	testq	$63,%r15
	jnz	.Lbase2_64_pre_avx2

.Linit_avx2:

	movq	%r14,%rax
	movq	%r14,%rdx
	shrq	$52,%r14
	movq	%rbx,%r8
	movq	%rbx,%r9
	shrq	$26,%rdx
	andq	$0x3ffffff,%rax
	shlq	$12,%r8
	andq	$0x3ffffff,%rdx
	shrq	$14,%rbx
	orq	%r8,%r14
	shlq	$24,%rbp
	andq	$0x3ffffff,%r14
	shrq	$40,%r9
	andq	$0x3ffffff,%rbx
	orq	%r9,%rbp

	vmovd	%eax,%xmm0
	vmovd	%edx,%xmm1
	vmovd	%r14d,%xmm2
	vmovd	%ebx,%xmm3
	vmovd	%ebp,%xmm4
	movl	$1,20(%rdi)

	call	__poly1305_init_avx

.Lproceed_avx2:
	movq	%r15,%rdx
	movl	OPENSSL_ia32cap_P+8(%rip),%r10d
	movl	$3221291008,%r11d

	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rax
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lbase2_64_avx2_epilogue:
	jmp	.Ldo_avx2
.cfi_endproc

.align	32
.Leven_avx2:
.cfi_startproc
	movl	OPENSSL_ia32cap_P+8(%rip),%r10d
	vmovd	0(%rdi),%xmm0
	vmovd	4(%rdi),%xmm1
	vmovd	8(%rdi),%xmm2
	vmovd	12(%rdi),%xmm3
	vmovd	16(%rdi),%xmm4

.Ldo_avx2:
	leaq	-8(%rsp),%r11
.cfi_def_cfa	%r11,16
	subq	$0x128,%rsp
	leaq	.Lconst(%rip),%rcx
	leaq	48+64(%rdi),%rdi
	vmovdqa	96(%rcx),%ymm7

	vmovdqu	-64(%rdi),%xmm9
	andq	$-512,%rsp
	vmovdqu	-48(%rdi),%xmm10
	vmovdqu	-32(%rdi),%xmm6
	vmovdqu	-16(%rdi),%xmm11
	vmovdqu	0(%rdi),%xmm12
	vmovdqu	16(%rdi),%xmm13
	leaq	144(%rsp),%rax
	vmovdqu	32(%rdi),%xmm14
	vpermd	%ymm9,%ymm7,%ymm9
	vmovdqu	48(%rdi),%xmm15
	vpermd	%ymm10,%ymm7,%ymm10
	vmovdqu	64(%rdi),%xmm5
	vpermd	%ymm6,%ymm7,%ymm6
	vmovdqa	%ymm9,0(%rsp)
	vpermd	%ymm11,%ymm7,%ymm11
	vmovdqa	%ymm10,32-144(%rax)
	vpermd	%ymm12,%ymm7,%ymm12
	vmovdqa	%ymm6,64-144(%rax)
	vpermd	%ymm13,%ymm7,%ymm13
	vmovdqa	%ymm11,96-144(%rax)
	vpermd	%ymm14,%ymm7,%ymm14
	vmovdqa	%ymm12,128-144(%rax)
	vpermd	%ymm15,%ymm7,%ymm15
	vmovdqa	%ymm13,160-144(%rax)
	vpermd	%ymm5,%ymm7,%ymm5
	vmovdqa	%ymm14,192-144(%rax)
	vmovdqa	%ymm15,224-144(%rax)
	vmovdqa	%ymm5,256-144(%rax)
	vmovdqa	64(%rcx),%ymm5

	vmovdqu	0(%rsi),%xmm7
	vmovdqu	16(%rsi),%xmm8
	vinserti128	$1,32(%rsi),%ymm7,%ymm7
	vinserti128	$1,48(%rsi),%ymm8,%ymm8
	leaq	64(%rsi),%rsi

	vpsrldq	$6,%ymm7,%ymm9
	vpsrldq	$6,%ymm8,%ymm10
	vpunpckhqdq	%ymm8,%ymm7,%ymm6
	vpunpcklqdq	%ymm10,%ymm9,%ymm9
	vpunpcklqdq	%ymm8,%ymm7,%ymm7

	vpsrlq	$30,%ymm9,%ymm10
	vpsrlq	$4,%ymm9,%ymm9
	vpsrlq	$26,%ymm7,%ymm8
	vpsrlq	$40,%ymm6,%ymm6
	vpand	%ymm5,%ymm9,%ymm9
	vpand	%ymm5,%ymm7,%ymm7
	vpand	%ymm5,%ymm8,%ymm8
	vpand	%ymm5,%ymm10,%ymm10
	vpor	32(%rcx),%ymm6,%ymm6

	vpaddq	%ymm2,%ymm9,%ymm2
	subq	$64,%rdx
	jz	.Ltail_avx2
	jmp	.Loop_avx2

.align	32
.Loop_avx2:

	vpaddq	%ymm0,%ymm7,%ymm0
	vmovdqa	0(%rsp),%ymm7
	vpaddq	%ymm1,%ymm8,%ymm1
	vmovdqa	32(%rsp),%ymm8
	vpaddq	%ymm3,%ymm10,%ymm3
	vmovdqa	96(%rsp),%ymm9
	vpaddq	%ymm4,%ymm6,%ymm4
	vmovdqa	48(%rax),%ymm10
	vmovdqa	112(%rax),%ymm5

	vpmuludq	%ymm2,%ymm7,%ymm13
	vpmuludq	%ymm2,%ymm8,%ymm14
	vpmuludq	%ymm2,%ymm9,%ymm15
	vpmuludq	%ymm2,%ymm10,%ymm11
	vpmuludq	%ymm2,%ymm5,%ymm12

	vpmuludq	%ymm0,%ymm8,%ymm6
	vpmuludq	%ymm1,%ymm8,%ymm2
	vpaddq	%ymm6,%ymm12,%ymm12
	vpaddq	%ymm2,%ymm13,%ymm13
	vpmuludq	%ymm3,%ymm8,%ymm6
	vpmuludq	64(%rsp),%ymm4,%ymm2
	vpaddq	%ymm6,%ymm15,%ymm15
	vpaddq	%ymm2,%ymm11,%ymm11
	vmovdqa	-16(%rax),%ymm8

	vpmuludq	%ymm0,%ymm7,%ymm6
	vpmuludq	%ymm1,%ymm7,%ymm2
	vpaddq	%ymm6,%ymm11,%ymm11
	vpaddq	%ymm2,%ymm12,%ymm12
	vpmuludq	%ymm3,%ymm7,%ymm6
	vpmuludq	%ymm4,%ymm7,%ymm2
	vmovdqu	0(%rsi),%xmm7
	vpaddq	%ymm6,%ymm14,%ymm14
	vpaddq	%ymm2,%ymm15,%ymm15
	vinserti128	$1,32(%rsi),%ymm7,%ymm7

	vpmuludq	%ymm3,%ymm8,%ymm6
	vpmuludq	%ymm4,%ymm8,%ymm2
	vmovdqu	16(%rsi),%xmm8
	vpaddq	%ymm6,%ymm11,%ymm11
	vpaddq	%ymm2,%ymm12,%ymm12
	vmovdqa	16(%rax),%ymm2
	vpmuludq	%ymm1,%ymm9,%ymm6
	vpmuludq	%ymm0,%ymm9,%ymm9
	vpaddq	%ymm6,%ymm14,%ymm14
	vpaddq	%ymm9,%ymm13,%ymm13
	vinserti128	$1,48(%rsi),%ymm8,%ymm8
	leaq	64(%rsi),%rsi

	vpmuludq	%ymm1,%ymm2,%ymm6
	vpmuludq	%ymm0,%ymm2,%ymm2
	vpsrldq	$6,%ymm7,%ymm9
	vpaddq	%ymm6,%ymm15,%ymm15
	vpaddq	%ymm2,%ymm14,%ymm14
	vpmuludq	%ymm3,%ymm10,%ymm6
	vpmuludq	%ymm4,%ymm10,%ymm2
	vpsrldq	$6,%ymm8,%ymm10
	vpaddq	%ymm6,%ymm12,%ymm12
	vpaddq	%ymm2,%ymm13,%ymm13
	vpunpckhqdq	%ymm8,%ymm7,%ymm6

	vpmuludq	%ymm3,%ymm5,%ymm3
	vpmuludq	%ymm4,%ymm5,%ymm4
	vpunpcklqdq	%ymm8,%ymm7,%ymm7
	vpaddq	%ymm3,%ymm13,%ymm2
	vpaddq	%ymm4,%ymm14,%ymm3
	vpunpcklqdq	%ymm10,%ymm9,%ymm10
	vpmuludq	80(%rax),%ymm0,%ymm4
	vpmuludq	%ymm1,%ymm5,%ymm0
	vmovdqa	64(%rcx),%ymm5
	vpaddq	%ymm4,%ymm15,%ymm4
	vpaddq	%ymm0,%ymm11,%ymm0

	vpsrlq	$26,%ymm3,%ymm14
	vpand	%ymm5,%ymm3,%ymm3
	vpaddq	%ymm14,%ymm4,%ymm4

	vpsrlq	$26,%ymm0,%ymm11
	vpand	%ymm5,%ymm0,%ymm0
	vpaddq	%ymm11,%ymm12,%ymm1

	vpsrlq	$26,%ymm4,%ymm15
	vpand	%ymm5,%ymm4,%ymm4

	vpsrlq	$4,%ymm10,%ymm9

	vpsrlq	$26,%ymm1,%ymm12
	vpand	%ymm5,%ymm1,%ymm1
	vpaddq	%ymm12,%ymm2,%ymm2

	vpaddq	%ymm15,%ymm0,%ymm0
	vpsllq	$2,%ymm15,%ymm15
	vpaddq	%ymm15,%ymm0,%ymm0

	vpand	%ymm5,%ymm9,%ymm9
	vpsrlq	$26,%ymm7,%ymm8

	vpsrlq	$26,%ymm2,%ymm13
	vpand	%ymm5,%ymm2,%ymm2
	vpaddq	%ymm13,%ymm3,%ymm3

	vpaddq	%ymm9,%ymm2,%ymm2
	vpsrlq	$30,%ymm10,%ymm10

	vpsrlq	$26,%ymm0,%ymm11
	vpand	%ymm5,%ymm0,%ymm0
	vpaddq	%ymm11,%ymm1,%ymm1

	vpsrlq	$40,%ymm6,%ymm6

	vpsrlq	$26,%ymm3,%ymm14
	vpand	%ymm5,%ymm3,%ymm3
	vpaddq	%ymm14,%ymm4,%ymm4

	vpand	%ymm5,%ymm7,%ymm7
	vpand	%ymm5,%ymm8,%ymm8
	vpand	%ymm5,%ymm10,%ymm10
	vpor	32(%rcx),%ymm6,%ymm6

	subq	$64,%rdx
	jnz	.Loop_avx2

.byte	0x66,0x90
.Ltail_avx2:

	vpaddq	%ymm0,%ymm7,%ymm0
	vmovdqu	4(%rsp),%ymm7
	vpaddq	%ymm1,%ymm8,%ymm1
	vmovdqu	36(%rsp),%ymm8
	vpaddq	%ymm3,%ymm10,%ymm3
	vmovdqu	100(%rsp),%ymm9
	vpaddq	%ymm4,%ymm6,%ymm4
	vmovdqu	52(%rax),%ymm10
	vmovdqu	116(%rax),%ymm5

	vpmuludq	%ymm2,%ymm7,%ymm13
	vpmuludq	%ymm2,%ymm8,%ymm14
	vpmuludq	%ymm2,%ymm9,%ymm15
	vpmuludq	%ymm2,%ymm10,%ymm11
	vpmuludq	%ymm2,%ymm5,%ymm12

	vpmuludq	%ymm0,%ymm8,%ymm6
	vpmuludq	%ymm1,%ymm8,%ymm2
	vpaddq	%ymm6,%ymm12,%ymm12
	vpaddq	%ymm2,%ymm13,%ymm13
	vpmuludq	%ymm3,%ymm8,%ymm6
	vpmuludq	68(%rsp),%ymm4,%ymm2
	vpaddq	%ymm6,%ymm15,%ymm15
	vpaddq	%ymm2,%ymm11,%ymm11
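/* Remaining h*r products of the tail; unlike .Loop_avx2 no fresh input is
   loaded here, and the per-lane sums are folded together by the
   vpsrldq/vpermq block further below before the final carry propagation. */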

	vpmuludq	%ymm0,%ymm7,%ymm6
	vpmuludq	%ymm1,%ymm7,%ymm2
	vpaddq	%ymm6,%ymm11,%ymm11
	vmovdqu	-12(%rax),%ymm8
	vpaddq	%ymm2,%ymm12,%ymm12
	vpmuludq	%ymm3,%ymm7,%ymm6
	vpmuludq	%ymm4,%ymm7,%ymm2
	vpaddq	%ymm6,%ymm14,%ymm14
	vpaddq	%ymm2,%ymm15,%ymm15

	vpmuludq	%ymm3,%ymm8,%ymm6
	vpmuludq	%ymm4,%ymm8,%ymm2
	vpaddq	%ymm6,%ymm11,%ymm11
	vpaddq	%ymm2,%ymm12,%ymm12
	vmovdqu	20(%rax),%ymm2
	vpmuludq	%ymm1,%ymm9,%ymm6
	vpmuludq	%ymm0,%ymm9,%ymm9
	vpaddq	%ymm6,%ymm14,%ymm14
	vpaddq	%ymm9,%ymm13,%ymm13

	vpmuludq	%ymm1,%ymm2,%ymm6
	vpmuludq	%ymm0,%ymm2,%ymm2
	vpaddq	%ymm6,%ymm15,%ymm15
	vpaddq	%ymm2,%ymm14,%ymm14
	vpmuludq	%ymm3,%ymm10,%ymm6
	vpmuludq	%ymm4,%ymm10,%ymm2
	vpaddq	%ymm6,%ymm12,%ymm12
	vpaddq	%ymm2,%ymm13,%ymm13

	vpmuludq	%ymm3,%ymm5,%ymm3
	vpmuludq	%ymm4,%ymm5,%ymm4
	vpaddq	%ymm3,%ymm13,%ymm2
	vpaddq	%ymm4,%ymm14,%ymm3
	vpmuludq	84(%rax),%ymm0,%ymm4
	vpmuludq	%ymm1,%ymm5,%ymm0
	vmovdqa	64(%rcx),%ymm5
	vpaddq	%ymm4,%ymm15,%ymm4
	vpaddq	%ymm0,%ymm11,%ymm0

	vpsrldq	$8,%ymm12,%ymm8
	vpsrldq	$8,%ymm2,%ymm9
	vpsrldq	$8,%ymm3,%ymm10
	vpsrldq	$8,%ymm4,%ymm6
	vpsrldq	$8,%ymm0,%ymm7
	vpaddq	%ymm8,%ymm12,%ymm12
	vpaddq	%ymm9,%ymm2,%ymm2
	vpaddq	%ymm10,%ymm3,%ymm3
	vpaddq	%ymm6,%ymm4,%ymm4
	vpaddq	%ymm7,%ymm0,%ymm0

	vpermq	$0x2,%ymm3,%ymm10
	vpermq	$0x2,%ymm4,%ymm6
	vpermq	$0x2,%ymm0,%ymm7
	vpermq	$0x2,%ymm12,%ymm8
	vpermq	$0x2,%ymm2,%ymm9
	vpaddq	%ymm10,%ymm3,%ymm3
	vpaddq	%ymm6,%ymm4,%ymm4
	vpaddq	%ymm7,%ymm0,%ymm0
	vpaddq	%ymm8,%ymm12,%ymm12
	vpaddq	%ymm9,%ymm2,%ymm2

	vpsrlq	$26,%ymm3,%ymm14
	vpand	%ymm5,%ymm3,%ymm3
	vpaddq	%ymm14,%ymm4,%ymm4

	vpsrlq	$26,%ymm0,%ymm11
	vpand	%ymm5,%ymm0,%ymm0
	vpaddq	%ymm11,%ymm12,%ymm1

	vpsrlq	$26,%ymm4,%ymm15
	vpand	%ymm5,%ymm4,%ymm4

	vpsrlq	$26,%ymm1,%ymm12
	vpand	%ymm5,%ymm1,%ymm1
	vpaddq	%ymm12,%ymm2,%ymm2

	vpaddq	%ymm15,%ymm0,%ymm0
	vpsllq	$2,%ymm15,%ymm15
	vpaddq	%ymm15,%ymm0,%ymm0

	vpsrlq	$26,%ymm2,%ymm13
	vpand	%ymm5,%ymm2,%ymm2
	vpaddq	%ymm13,%ymm3,%ymm3

	vpsrlq	$26,%ymm0,%ymm11
	vpand	%ymm5,%ymm0,%ymm0
	vpaddq	%ymm11,%ymm1,%ymm1

	vpsrlq	$26,%ymm3,%ymm14
	vpand	%ymm5,%ymm3,%ymm3
	vpaddq	%ymm14,%ymm4,%ymm4

	vmovd	%xmm0,-112(%rdi)
	vmovd	%xmm1,-108(%rdi)
	vmovd	%xmm2,-104(%rdi)
	vmovd	%xmm3,-100(%rdi)
	vmovd	%xmm4,-96(%rdi)
	leaq	8(%r11),%rsp
.cfi_def_cfa	%rsp,8
	vzeroupper
	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_blocks_avx2,.-poly1305_blocks_avx2
.align	64
.Lconst:
.Lmask24:
.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long	16777216,0,16777216,0,16777216,0,16777216,0
.Lmask26:
.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long	2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7

.L2_44_inp_permd:
.long	0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad	0,12,24,64
.L2_44_mask:
.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad	44,44,42,64
.L2_44_shift_lft:
.quad	8,8,10,64

.align	64
.Lx_mask44:
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	16
.globl	xor128_encrypt_n_pad
.type	xor128_encrypt_n_pad,@function
.align	16
xor128_encrypt_n_pad:
.cfi_startproc
	subq	%rdx,%rsi
	subq	%rdx,%rdi
	movq	%rcx,%r10
	shrq	$4,%rcx
	jz	.Ltail_enc
	nop
.Loop_enc_xmm:
	movdqu	(%rsi,%rdx,1),%xmm0
	pxor	(%rdx),%xmm0
	movdqu	%xmm0,(%rdi,%rdx,1)
	movdqa	%xmm0,(%rdx)
	leaq	16(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_enc_xmm

	andq	$15,%r10
	jz	.Ldone_enc

.Ltail_enc:
	movq	$16,%rcx
	subq	%r10,%rcx
	xorl	%eax,%eax
.Loop_enc_byte:
	movb	(%rsi,%rdx,1),%al
	xorb	(%rdx),%al
	movb	%al,(%rdi,%rdx,1)
	movb	%al,(%rdx)
	leaq	1(%rdx),%rdx
	decq	%r10
	jnz	.Loop_enc_byte

	xorl	%eax,%eax
.Loop_enc_pad:
	movb	%al,(%rdx)
	leaq	1(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_enc_pad

.Ldone_enc:
	movq	%rdx,%rax
	.byte	0xf3,0xc3
.cfi_endproc
.size	xor128_encrypt_n_pad,.-xor128_encrypt_n_pad

.globl	xor128_decrypt_n_pad
.type	xor128_decrypt_n_pad,@function
.align	16
xor128_decrypt_n_pad:
.cfi_startproc
	subq	%rdx,%rsi
	subq	%rdx,%rdi
	movq	%rcx,%r10
	shrq	$4,%rcx
	jz	.Ltail_dec
	nop
.Loop_dec_xmm:
	movdqu	(%rsi,%rdx,1),%xmm0
	movdqa	(%rdx),%xmm1
	pxor	%xmm0,%xmm1
	movdqu	%xmm1,(%rdi,%rdx,1)
	movdqa	%xmm0,(%rdx)
	leaq	16(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_dec_xmm

	pxor	%xmm1,%xmm1
	andq	$15,%r10
	jz	.Ldone_dec

.Ltail_dec:
	movq	$16,%rcx
	subq	%r10,%rcx
	xorl	%eax,%eax
	xorq	%r11,%r11
.Loop_dec_byte:
	movb	(%rsi,%rdx,1),%r11b
	movb	(%rdx),%al
	xorb	%r11b,%al
	movb	%al,(%rdi,%rdx,1)
	movb	%r11b,(%rdx)
	leaq	1(%rdx),%rdx
	decq	%r10
	jnz	.Loop_dec_byte

	xorl	%eax,%eax
.Loop_dec_pad:
	movb	%al,(%rdx)
	leaq	1(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_dec_pad

.Ldone_dec:
	movq	%rdx,%rax
	.byte	0xf3,0xc3
.cfi_endproc
.size	xor128_decrypt_n_pad,.-xor128_decrypt_n_pad