/* Do not modify. This file is auto-generated from poly1305-x86_64.pl. */
.text

.globl poly1305_init
.hidden poly1305_init
.globl poly1305_blocks
.hidden poly1305_blocks
.globl poly1305_emit
.hidden poly1305_emit

.type poly1305_init,@function
.align 32
poly1305_init:
.cfi_startproc
	xorq %rax,%rax
	movq %rax,0(%rdi)
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)

	cmpq $0,%rsi
	je .Lno_key

	leaq poly1305_blocks(%rip),%r10
	leaq poly1305_emit(%rip),%r11
	movq OPENSSL_ia32cap_P+4(%rip),%r9
	leaq poly1305_blocks_avx(%rip),%rax
	leaq poly1305_emit_avx(%rip),%rcx
	btq $28,%r9
	cmovcq %rax,%r10
	cmovcq %rcx,%r11
	leaq poly1305_blocks_avx2(%rip),%rax
	btq $37,%r9
	cmovcq %rax,%r10
	movq $0x0ffffffc0fffffff,%rax
	movq $0x0ffffffc0ffffffc,%rcx
	andq 0(%rsi),%rax
	andq 8(%rsi),%rcx
	movq %rax,24(%rdi)
	movq %rcx,32(%rdi)
	movq %r10,0(%rdx)
	movq %r11,8(%rdx)
	movl $1,%eax
.Lno_key:
	.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_init,.-poly1305_init

.type poly1305_blocks,@function
.align 32
poly1305_blocks:
.cfi_startproc
.byte 243,15,30,250
.Lblocks:
	shrq $4,%rdx
	jz .Lno_data

	pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
	pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
	pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
	pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
	pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
	pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lblocks_body:

	movq %rdx,%r15

	movq 24(%rdi),%r11
	movq 32(%rdi),%r13

	movq 0(%rdi),%r14
	movq 8(%rdi),%rbx
	movq 16(%rdi),%rbp

	movq %r13,%r12
	shrq $2,%r13
	movq %r12,%rax
	addq %r12,%r13
	jmp .Loop

.align 32
.Loop:
	addq 0(%rsi),%r14
	adcq 8(%rsi),%rbx
	leaq 16(%rsi),%rsi
	adcq %rcx,%rbp
	mulq %r14
	movq %rax,%r9
	movq %r11,%rax
	movq %rdx,%r10

	mulq %r14
	movq %rax,%r14
	movq %r11,%rax
	movq %rdx,%r8

	mulq %rbx
	addq %rax,%r9
	movq %r13,%rax
	adcq %rdx,%r10

	mulq %rbx
	movq %rbp,%rbx
	addq %rax,%r14
	adcq %rdx,%r8

	imulq %r13,%rbx
	addq %rbx,%r9
	movq %r8,%rbx
	adcq $0,%r10

	imulq %r11,%rbp
	addq %r9,%rbx
	movq $-4,%rax
	adcq %rbp,%r10

	andq %r10,%rax
	movq %r10,%rbp
	shrq $2,%r10
	andq $3,%rbp
	addq %r10,%rax
	addq %rax,%r14
	adcq $0,%rbx
	adcq $0,%rbp
	movq %r12,%rax
	decq %r15
	jnz .Loop

	movq %r14,0(%rdi)
	movq %rbx,8(%rdi)
	movq %rbp,16(%rdi)

	movq 0(%rsp),%r15
.cfi_restore %r15
	movq 8(%rsp),%r14
.cfi_restore %r14
	movq 16(%rsp),%r13
.cfi_restore %r13
	movq 24(%rsp),%r12
.cfi_restore %r12
	movq 32(%rsp),%rbp
.cfi_restore %rbp
	movq 40(%rsp),%rbx
.cfi_restore %rbx
	leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lno_data:
.Lblocks_epilogue:
	.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_blocks,.-poly1305_blocks

.type poly1305_emit,@function
.align 32
poly1305_emit:
.cfi_startproc
.byte 243,15,30,250
.Lemit:
	movq 0(%rdi),%r8
	movq 8(%rdi),%r9
	movq 16(%rdi),%r10

	movq %r8,%rax
	addq $5,%r8
	movq %r9,%rcx
	adcq $0,%r9
	adcq $0,%r10
	shrq $2,%r10
	cmovnzq %r8,%rax
	cmovnzq %r9,%rcx

	addq 0(%rdx),%rax
	adcq 8(%rdx),%rcx
	movq %rax,0(%rsi)
	movq %rcx,8(%rsi)

	.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_emit,.-poly1305_emit
.type __poly1305_block,@function
.align 32
__poly1305_block:
.cfi_startproc
	mulq %r14
	movq %rax,%r9
	movq %r11,%rax
	movq %rdx,%r10

	mulq %r14
	movq %rax,%r14
	movq %r11,%rax
	movq %rdx,%r8

	mulq %rbx
	addq %rax,%r9
	movq %r13,%rax
	adcq %rdx,%r10

	mulq %rbx
	movq %rbp,%rbx
	addq %rax,%r14
	adcq %rdx,%r8

	imulq %r13,%rbx
	addq %rbx,%r9
	movq %r8,%rbx
	adcq $0,%r10

	imulq %r11,%rbp
	addq %r9,%rbx
	movq $-4,%rax
	adcq %rbp,%r10

	andq %r10,%rax
	movq %r10,%rbp
	shrq $2,%r10
	andq $3,%rbp
	addq %r10,%rax
	addq %rax,%r14
	adcq $0,%rbx
	adcq $0,%rbp
	.byte 0xf3,0xc3
.cfi_endproc
.size __poly1305_block,.-__poly1305_block

.type __poly1305_init_avx,@function
.align 32
__poly1305_init_avx:
.cfi_startproc
	movq %r11,%r14
	movq %r12,%rbx
	xorq %rbp,%rbp

	leaq 48+64(%rdi),%rdi

	movq %r12,%rax
	call __poly1305_block

	movl $0x3ffffff,%eax
	movl $0x3ffffff,%edx
	movq %r14,%r8
	andl %r14d,%eax
	movq %r11,%r9
	andl %r11d,%edx
	movl %eax,-64(%rdi)
	shrq $26,%r8
	movl %edx,-60(%rdi)
	shrq $26,%r9

	movl $0x3ffffff,%eax
	movl $0x3ffffff,%edx
	andl %r8d,%eax
	andl %r9d,%edx
	movl %eax,-48(%rdi)
	leal (%rax,%rax,4),%eax
	movl %edx,-44(%rdi)
	leal (%rdx,%rdx,4),%edx
	movl %eax,-32(%rdi)
	shrq $26,%r8
	movl %edx,-28(%rdi)
	shrq $26,%r9

	movq %rbx,%rax
	movq %r12,%rdx
	shlq $12,%rax
	shlq $12,%rdx
	orq %r8,%rax
	orq %r9,%rdx
	andl $0x3ffffff,%eax
	andl $0x3ffffff,%edx
	movl %eax,-16(%rdi)
	leal (%rax,%rax,4),%eax
	movl %edx,-12(%rdi)
	leal (%rdx,%rdx,4),%edx
	movl %eax,0(%rdi)
	movq %rbx,%r8
	movl %edx,4(%rdi)
	movq %r12,%r9

	movl $0x3ffffff,%eax
	movl $0x3ffffff,%edx
	shrq $14,%r8
	shrq $14,%r9
	andl %r8d,%eax
	andl %r9d,%edx
	movl %eax,16(%rdi)
	leal (%rax,%rax,4),%eax
	movl %edx,20(%rdi)
	leal (%rdx,%rdx,4),%edx
	movl %eax,32(%rdi)
	shrq $26,%r8
	movl %edx,36(%rdi)
	shrq $26,%r9

	movq %rbp,%rax
	shlq $24,%rax
	orq %rax,%r8
	movl %r8d,48(%rdi)
	leaq (%r8,%r8,4),%r8
	movl %r9d,52(%rdi)
	leaq (%r9,%r9,4),%r9
	movl %r8d,64(%rdi)
	movl %r9d,68(%rdi)

	movq %r12,%rax
	call __poly1305_block

	movl $0x3ffffff,%eax
	movq %r14,%r8
	andl %r14d,%eax
	shrq $26,%r8
	movl %eax,-52(%rdi)

	movl $0x3ffffff,%edx
	andl %r8d,%edx
	movl %edx,-36(%rdi)
	leal (%rdx,%rdx,4),%edx
	shrq $26,%r8
	movl %edx,-20(%rdi)

	movq %rbx,%rax
	shlq $12,%rax
	orq %r8,%rax
	andl $0x3ffffff,%eax
	movl %eax,-4(%rdi)
	leal (%rax,%rax,4),%eax
	movq %rbx,%r8
	movl %eax,12(%rdi)

	movl $0x3ffffff,%edx
	shrq $14,%r8
	andl %r8d,%edx
	movl %edx,28(%rdi)
	leal (%rdx,%rdx,4),%edx
	shrq $26,%r8
	movl %edx,44(%rdi)

	movq %rbp,%rax
	shlq $24,%rax
	orq %rax,%r8
	movl %r8d,60(%rdi)
	leaq (%r8,%r8,4),%r8
	movl %r8d,76(%rdi)

	movq %r12,%rax
	call __poly1305_block

	movl $0x3ffffff,%eax
	movq %r14,%r8
	andl %r14d,%eax
	shrq $26,%r8
	movl %eax,-56(%rdi)

	movl $0x3ffffff,%edx
	andl %r8d,%edx
	movl %edx,-40(%rdi)
	leal (%rdx,%rdx,4),%edx
	shrq $26,%r8
	movl %edx,-24(%rdi)

	movq %rbx,%rax
	shlq $12,%rax
	orq %r8,%rax
	andl $0x3ffffff,%eax
	movl %eax,-8(%rdi)
	leal (%rax,%rax,4),%eax
	movq %rbx,%r8
	movl %eax,8(%rdi)

	movl $0x3ffffff,%edx
	shrq $14,%r8
	andl %r8d,%edx
	movl %edx,24(%rdi)
	leal (%rdx,%rdx,4),%edx
	shrq $26,%r8
	movl %edx,40(%rdi)

	movq %rbp,%rax
	shlq $24,%rax
	orq %rax,%r8
	movl %r8d,56(%rdi)
	leaq (%r8,%r8,4),%r8
	movl %r8d,72(%rdi)

	leaq -48-64(%rdi),%rdi
	.byte 0xf3,0xc3
.cfi_endproc
.size __poly1305_init_avx,.-__poly1305_init_avx

.type poly1305_blocks_avx,@function
.align 32
poly1305_blocks_avx:
.cfi_startproc
.byte 243,15,30,250
	movl 20(%rdi),%r8d
	cmpq $128,%rdx
	jae .Lblocks_avx
	testl %r8d,%r8d
	jz .Lblocks

.Lblocks_avx:
	andq $-16,%rdx
	jz .Lno_data_avx

	vzeroupper

	testl %r8d,%r8d
	jz .Lbase2_64_avx

	testq $31,%rdx
	jz .Leven_avx

	pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
	pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
	pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
	pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
	pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
	pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lblocks_avx_body:

	movq %rdx,%r15

	movq 0(%rdi),%r8
	movq 8(%rdi),%r9
	movl 16(%rdi),%ebp

	movq 24(%rdi),%r11
	movq 32(%rdi),%r13

	movl %r8d,%r14d
	andq $-2147483648,%r8
	movq %r9,%r12
	movl %r9d,%ebx
	andq $-2147483648,%r9

	shrq $6,%r8
	shlq $52,%r12
	addq %r8,%r14
	shrq $12,%rbx
	shrq $18,%r9
	addq %r12,%r14
	adcq %r9,%rbx

	movq %rbp,%r8
	shlq $40,%r8
	shrq $24,%rbp
	addq %r8,%rbx
	adcq $0,%rbp

	movq $-4,%r9
	movq %rbp,%r8
	andq %rbp,%r9
	shrq $2,%r8
	andq $3,%rbp
	addq %r9,%r8
	addq %r8,%r14
	adcq $0,%rbx
	adcq $0,%rbp

	movq %r13,%r12
	movq %r13,%rax
	shrq $2,%r13
	addq %r12,%r13

	addq 0(%rsi),%r14
	adcq 8(%rsi),%rbx
	leaq 16(%rsi),%rsi
	adcq %rcx,%rbp

	call __poly1305_block

	testq %rcx,%rcx
	jz .Lstore_base2_64_avx

	movq %r14,%rax
	movq %r14,%rdx
	shrq $52,%r14
	movq %rbx,%r11
	movq %rbx,%r12
	shrq $26,%rdx
	andq $0x3ffffff,%rax
	shlq $12,%r11
	andq $0x3ffffff,%rdx
	shrq $14,%rbx
	orq %r11,%r14
	shlq $24,%rbp
	andq $0x3ffffff,%r14
	shrq $40,%r12
	andq $0x3ffffff,%rbx
	orq %r12,%rbp

	subq $16,%r15
	jz .Lstore_base2_26_avx

	vmovd %eax,%xmm0
	vmovd %edx,%xmm1
	vmovd %r14d,%xmm2
	vmovd %ebx,%xmm3
	vmovd %ebp,%xmm4
	jmp .Lproceed_avx

.align 32
.Lstore_base2_64_avx:
	movq %r14,0(%rdi)
	movq %rbx,8(%rdi)
	movq %rbp,16(%rdi)
	jmp .Ldone_avx

.align 16
.Lstore_base2_26_avx:
	movl %eax,0(%rdi)
	movl %edx,4(%rdi)
	movl %r14d,8(%rdi)
	movl %ebx,12(%rdi)
	movl %ebp,16(%rdi)
.align 16
.Ldone_avx:
	movq 0(%rsp),%r15
.cfi_restore %r15
	movq 8(%rsp),%r14
.cfi_restore %r14
	movq 16(%rsp),%r13
.cfi_restore %r13
	movq 24(%rsp),%r12
.cfi_restore %r12
	movq 32(%rsp),%rbp
.cfi_restore %rbp
	movq 40(%rsp),%rbx
.cfi_restore %rbx
	leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lno_data_avx:
.Lblocks_avx_epilogue:
	.byte 0xf3,0xc3
.cfi_endproc

.align 32
.Lbase2_64_avx:
.cfi_startproc
	pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
	pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
	pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
	pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
	pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
	pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lbase2_64_avx_body:

	movq %rdx,%r15

	movq 24(%rdi),%r11
	movq 32(%rdi),%r13

	movq 0(%rdi),%r14
	movq 8(%rdi),%rbx
	movl 16(%rdi),%ebp

	movq %r13,%r12
	movq %r13,%rax
	shrq $2,%r13
	addq %r12,%r13

	testq $31,%rdx
	jz .Linit_avx

	addq 0(%rsi),%r14
	adcq 8(%rsi),%rbx
	leaq 16(%rsi),%rsi
	adcq %rcx,%rbp
	subq $16,%r15

	call __poly1305_block

.Linit_avx:

	movq %r14,%rax
	movq %r14,%rdx
	shrq $52,%r14
	movq %rbx,%r8
	movq %rbx,%r9
	shrq $26,%rdx
	andq $0x3ffffff,%rax
	shlq $12,%r8
	andq $0x3ffffff,%rdx
	shrq $14,%rbx
	orq %r8,%r14
	shlq $24,%rbp
	andq $0x3ffffff,%r14
	shrq $40,%r9
	andq $0x3ffffff,%rbx
	orq %r9,%rbp

	vmovd %eax,%xmm0
	vmovd %edx,%xmm1
	vmovd %r14d,%xmm2
	vmovd %ebx,%xmm3
	vmovd %ebp,%xmm4
	movl $1,20(%rdi)

	call __poly1305_init_avx

.Lproceed_avx:
	movq %r15,%rdx

	movq 0(%rsp),%r15
.cfi_restore %r15
	movq 8(%rsp),%r14
.cfi_restore %r14
	movq 16(%rsp),%r13
.cfi_restore %r13
	movq 24(%rsp),%r12
.cfi_restore %r12
	movq 32(%rsp),%rbp
.cfi_restore %rbp
	movq 40(%rsp),%rbx
.cfi_restore %rbx
	leaq 48(%rsp),%rax
	leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lbase2_64_avx_epilogue:
	jmp .Ldo_avx
.cfi_endproc

.align 32
.Leven_avx:
.cfi_startproc
	vmovd 0(%rdi),%xmm0
	vmovd 4(%rdi),%xmm1
	vmovd 8(%rdi),%xmm2
	vmovd 12(%rdi),%xmm3
	vmovd 16(%rdi),%xmm4

.Ldo_avx:
	leaq -88(%rsp),%r11
.cfi_def_cfa %r11,0x60
	subq $0x178,%rsp
	subq $64,%rdx
	leaq -32(%rsi),%rax
	cmovcq %rax,%rsi

	vmovdqu 48(%rdi),%xmm14
	leaq 112(%rdi),%rdi
	leaq .Lconst(%rip),%rcx

	vmovdqu 32(%rsi),%xmm5
	vmovdqu 48(%rsi),%xmm6
	vmovdqa 64(%rcx),%xmm15

	vpsrldq $6,%xmm5,%xmm7
	vpsrldq $6,%xmm6,%xmm8
	vpunpckhqdq %xmm6,%xmm5,%xmm9
	vpunpcklqdq %xmm6,%xmm5,%xmm5
	vpunpcklqdq %xmm8,%xmm7,%xmm8

	vpsrlq $40,%xmm9,%xmm9
	vpsrlq $26,%xmm5,%xmm6
	vpand %xmm15,%xmm5,%xmm5
	vpsrlq $4,%xmm8,%xmm7
	vpand %xmm15,%xmm6,%xmm6
	vpsrlq $30,%xmm8,%xmm8
	vpand %xmm15,%xmm7,%xmm7
	vpand %xmm15,%xmm8,%xmm8
	vpor 32(%rcx),%xmm9,%xmm9

	jbe .Lskip_loop_avx

	vmovdqu -48(%rdi),%xmm11
	vmovdqu -32(%rdi),%xmm12
	vpshufd $0xEE,%xmm14,%xmm13
	vpshufd $0x44,%xmm14,%xmm10
	vmovdqa %xmm13,-144(%r11)
	vmovdqa %xmm10,0(%rsp)
	vpshufd $0xEE,%xmm11,%xmm14
	vmovdqu -16(%rdi),%xmm10
	vpshufd $0x44,%xmm11,%xmm11
	vmovdqa %xmm14,-128(%r11)
	vmovdqa %xmm11,16(%rsp)
	vpshufd $0xEE,%xmm12,%xmm13
	vmovdqu 0(%rdi),%xmm11
	vpshufd $0x44,%xmm12,%xmm12
	vmovdqa %xmm13,-112(%r11)
	vmovdqa %xmm12,32(%rsp)
	vpshufd $0xEE,%xmm10,%xmm14
	vmovdqu 16(%rdi),%xmm12
	vpshufd $0x44,%xmm10,%xmm10
	vmovdqa %xmm14,-96(%r11)
	vmovdqa %xmm10,48(%rsp)
	vpshufd $0xEE,%xmm11,%xmm13
	vmovdqu 32(%rdi),%xmm10
	vpshufd $0x44,%xmm11,%xmm11
	vmovdqa %xmm13,-80(%r11)
	vmovdqa %xmm11,64(%rsp)
	vpshufd $0xEE,%xmm12,%xmm14
	vmovdqu 48(%rdi),%xmm11
	vpshufd $0x44,%xmm12,%xmm12
	vmovdqa %xmm14,-64(%r11)
	vmovdqa %xmm12,80(%rsp)
	vpshufd $0xEE,%xmm10,%xmm13
	vmovdqu 64(%rdi),%xmm12
	vpshufd $0x44,%xmm10,%xmm10
	vmovdqa %xmm13,-48(%r11)
	vmovdqa %xmm10,96(%rsp)
	vpshufd $0xEE,%xmm11,%xmm14
	vpshufd $0x44,%xmm11,%xmm11
	vmovdqa %xmm14,-32(%r11)
	vmovdqa %xmm11,112(%rsp)
	vpshufd $0xEE,%xmm12,%xmm13
	vmovdqa 0(%rsp),%xmm14
	vpshufd $0x44,%xmm12,%xmm12
	vmovdqa %xmm13,-16(%r11)
	vmovdqa %xmm12,128(%rsp)

	jmp .Loop_avx

.align 32
.Loop_avx:

	vpmuludq %xmm5,%xmm14,%xmm10
	vpmuludq %xmm6,%xmm14,%xmm11
	vmovdqa %xmm2,32(%r11)
	vpmuludq %xmm7,%xmm14,%xmm12
	vmovdqa 16(%rsp),%xmm2
	vpmuludq %xmm8,%xmm14,%xmm13
	vpmuludq %xmm9,%xmm14,%xmm14

	vmovdqa %xmm0,0(%r11)
	vpmuludq 32(%rsp),%xmm9,%xmm0
	vmovdqa %xmm1,16(%r11)
	vpmuludq %xmm8,%xmm2,%xmm1
	vpaddq %xmm0,%xmm10,%xmm10
	vpaddq %xmm1,%xmm14,%xmm14
	vmovdqa %xmm3,48(%r11)
	vpmuludq %xmm7,%xmm2,%xmm0
	vpmuludq %xmm6,%xmm2,%xmm1
	vpaddq %xmm0,%xmm13,%xmm13
	vmovdqa 48(%rsp),%xmm3
	vpaddq %xmm1,%xmm12,%xmm12
	vmovdqa %xmm4,64(%r11)
	vpmuludq %xmm5,%xmm2,%xmm2
	vpmuludq %xmm7,%xmm3,%xmm0
	vpaddq %xmm2,%xmm11,%xmm11

	vmovdqa 64(%rsp),%xmm4
	vpaddq %xmm0,%xmm14,%xmm14
	vpmuludq %xmm6,%xmm3,%xmm1
	vpmuludq %xmm5,%xmm3,%xmm3
	vpaddq %xmm1,%xmm13,%xmm13
	vmovdqa 80(%rsp),%xmm2
	vpaddq %xmm3,%xmm12,%xmm12
	vpmuludq %xmm9,%xmm4,%xmm0
	vpmuludq %xmm8,%xmm4,%xmm4
	vpaddq %xmm0,%xmm11,%xmm11
	vmovdqa 96(%rsp),%xmm3
	vpaddq %xmm4,%xmm10,%xmm10

	vmovdqa 128(%rsp),%xmm4
	vpmuludq %xmm6,%xmm2,%xmm1
	vpmuludq %xmm5,%xmm2,%xmm2
	vpaddq %xmm1,%xmm14,%xmm14
	vpaddq %xmm2,%xmm13,%xmm13
	vpmuludq %xmm9,%xmm3,%xmm0
	vpmuludq %xmm8,%xmm3,%xmm1
	vpaddq %xmm0,%xmm12,%xmm12
	vmovdqu 0(%rsi),%xmm0
	vpaddq %xmm1,%xmm11,%xmm11
	vpmuludq %xmm7,%xmm3,%xmm3
	vpmuludq %xmm7,%xmm4,%xmm7
	vpaddq %xmm3,%xmm10,%xmm10

	vmovdqu 16(%rsi),%xmm1
	vpaddq %xmm7,%xmm11,%xmm11
	vpmuludq %xmm8,%xmm4,%xmm8
	vpmuludq %xmm9,%xmm4,%xmm9
	vpsrldq $6,%xmm0,%xmm2
	vpaddq %xmm8,%xmm12,%xmm12
	vpaddq %xmm9,%xmm13,%xmm13
	vpsrldq $6,%xmm1,%xmm3
	vpmuludq 112(%rsp),%xmm5,%xmm9
	vpmuludq %xmm6,%xmm4,%xmm5
	vpunpckhqdq %xmm1,%xmm0,%xmm4
	vpaddq %xmm9,%xmm14,%xmm14
	vmovdqa -144(%r11),%xmm9
	vpaddq %xmm5,%xmm10,%xmm10

	vpunpcklqdq %xmm1,%xmm0,%xmm0
	vpunpcklqdq %xmm3,%xmm2,%xmm3

	vpsrldq $5,%xmm4,%xmm4
	vpsrlq $26,%xmm0,%xmm1
	vpand %xmm15,%xmm0,%xmm0
	vpsrlq $4,%xmm3,%xmm2
	vpand %xmm15,%xmm1,%xmm1
	vpand 0(%rcx),%xmm4,%xmm4
	vpsrlq $30,%xmm3,%xmm3
	vpand %xmm15,%xmm2,%xmm2
	vpand %xmm15,%xmm3,%xmm3
	vpor 32(%rcx),%xmm4,%xmm4

	vpaddq 0(%r11),%xmm0,%xmm0
	vpaddq 16(%r11),%xmm1,%xmm1
	vpaddq 32(%r11),%xmm2,%xmm2
	vpaddq 48(%r11),%xmm3,%xmm3
	vpaddq 64(%r11),%xmm4,%xmm4

	leaq 32(%rsi),%rax
	leaq 64(%rsi),%rsi
	subq $64,%rdx
	cmovcq %rax,%rsi

	vpmuludq %xmm0,%xmm9,%xmm5
	vpmuludq %xmm1,%xmm9,%xmm6
	vpaddq %xmm5,%xmm10,%xmm10
	vpaddq %xmm6,%xmm11,%xmm11
	vmovdqa -128(%r11),%xmm7
	vpmuludq %xmm2,%xmm9,%xmm5
	vpmuludq %xmm3,%xmm9,%xmm6
	vpaddq %xmm5,%xmm12,%xmm12
	vpaddq %xmm6,%xmm13,%xmm13
	vpmuludq %xmm4,%xmm9,%xmm9
	vpmuludq -112(%r11),%xmm4,%xmm5
	vpaddq %xmm9,%xmm14,%xmm14

	vpaddq %xmm5,%xmm10,%xmm10
	vpmuludq %xmm2,%xmm7,%xmm6
	vpmuludq %xmm3,%xmm7,%xmm5
	vpaddq %xmm6,%xmm13,%xmm13
	vmovdqa -96(%r11),%xmm8
	vpaddq %xmm5,%xmm14,%xmm14
	vpmuludq %xmm1,%xmm7,%xmm6
	vpmuludq %xmm0,%xmm7,%xmm7
	vpaddq %xmm6,%xmm12,%xmm12
	vpaddq %xmm7,%xmm11,%xmm11

	vmovdqa -80(%r11),%xmm9
	vpmuludq %xmm2,%xmm8,%xmm5
	vpmuludq %xmm1,%xmm8,%xmm6
	vpaddq %xmm5,%xmm14,%xmm14
	vpaddq %xmm6,%xmm13,%xmm13
	vmovdqa -64(%r11),%xmm7
	vpmuludq %xmm0,%xmm8,%xmm8
	vpmuludq %xmm4,%xmm9,%xmm5
	vpaddq %xmm8,%xmm12,%xmm12
	vpaddq %xmm5,%xmm11,%xmm11
	vmovdqa -48(%r11),%xmm8
	vpmuludq %xmm3,%xmm9,%xmm9
	vpmuludq %xmm1,%xmm7,%xmm6
	vpaddq %xmm9,%xmm10,%xmm10

	vmovdqa -16(%r11),%xmm9
	vpaddq %xmm6,%xmm14,%xmm14
	vpmuludq %xmm0,%xmm7,%xmm7
	vpmuludq %xmm4,%xmm8,%xmm5
	vpaddq %xmm7,%xmm13,%xmm13
	vpaddq %xmm5,%xmm12,%xmm12
	vmovdqu 32(%rsi),%xmm5
	vpmuludq %xmm3,%xmm8,%xmm7
	vpmuludq %xmm2,%xmm8,%xmm8
	vpaddq %xmm7,%xmm11,%xmm11
	vmovdqu 48(%rsi),%xmm6
	vpaddq %xmm8,%xmm10,%xmm10

	vpmuludq %xmm2,%xmm9,%xmm2
	vpmuludq %xmm3,%xmm9,%xmm3
	vpsrldq $6,%xmm5,%xmm7
	vpaddq %xmm2,%xmm11,%xmm11
	vpmuludq %xmm4,%xmm9,%xmm4
	vpsrldq $6,%xmm6,%xmm8
	vpaddq %xmm3,%xmm12,%xmm2
	vpaddq %xmm4,%xmm13,%xmm3
	vpmuludq -32(%r11),%xmm0,%xmm4
	vpmuludq %xmm1,%xmm9,%xmm0
	vpunpckhqdq %xmm6,%xmm5,%xmm9
	vpaddq %xmm4,%xmm14,%xmm4
	vpaddq %xmm0,%xmm10,%xmm0

	vpunpcklqdq %xmm6,%xmm5,%xmm5
	vpunpcklqdq %xmm8,%xmm7,%xmm8

	vpsrldq $5,%xmm9,%xmm9
	vpsrlq $26,%xmm5,%xmm6
	vmovdqa 0(%rsp),%xmm14
	vpand %xmm15,%xmm5,%xmm5
	vpsrlq $4,%xmm8,%xmm7
	vpand %xmm15,%xmm6,%xmm6
	vpand 0(%rcx),%xmm9,%xmm9
	vpsrlq $30,%xmm8,%xmm8
	vpand %xmm15,%xmm7,%xmm7
	vpand %xmm15,%xmm8,%xmm8
	vpor 32(%rcx),%xmm9,%xmm9

	vpsrlq $26,%xmm3,%xmm13
	vpand %xmm15,%xmm3,%xmm3
	vpaddq %xmm13,%xmm4,%xmm4

	vpsrlq $26,%xmm0,%xmm10
	vpand %xmm15,%xmm0,%xmm0
	vpaddq %xmm10,%xmm11,%xmm1

	vpsrlq $26,%xmm4,%xmm10
	vpand %xmm15,%xmm4,%xmm4

	vpsrlq $26,%xmm1,%xmm11
	vpand %xmm15,%xmm1,%xmm1
	vpaddq %xmm11,%xmm2,%xmm2

	vpaddq %xmm10,%xmm0,%xmm0
	vpsllq $2,%xmm10,%xmm10
	vpaddq %xmm10,%xmm0,%xmm0

	vpsrlq $26,%xmm2,%xmm12
	vpand %xmm15,%xmm2,%xmm2
	vpaddq %xmm12,%xmm3,%xmm3

	vpsrlq $26,%xmm0,%xmm10
	vpand %xmm15,%xmm0,%xmm0
	vpaddq %xmm10,%xmm1,%xmm1

	vpsrlq $26,%xmm3,%xmm13
	vpand %xmm15,%xmm3,%xmm3
	vpaddq %xmm13,%xmm4,%xmm4

	ja .Loop_avx

.Lskip_loop_avx:

	vpshufd $0x10,%xmm14,%xmm14
	addq $32,%rdx
	jnz .Long_tail_avx

	vpaddq %xmm2,%xmm7,%xmm7
	vpaddq %xmm0,%xmm5,%xmm5
	vpaddq %xmm1,%xmm6,%xmm6
	vpaddq %xmm3,%xmm8,%xmm8
	vpaddq %xmm4,%xmm9,%xmm9

.Long_tail_avx:
	vmovdqa %xmm2,32(%r11)
	vmovdqa %xmm0,0(%r11)
	vmovdqa %xmm1,16(%r11)
	vmovdqa %xmm3,48(%r11)
	vmovdqa %xmm4,64(%r11)

	vpmuludq %xmm7,%xmm14,%xmm12
	vpmuludq %xmm5,%xmm14,%xmm10
	vpshufd $0x10,-48(%rdi),%xmm2
	vpmuludq %xmm6,%xmm14,%xmm11
	vpmuludq %xmm8,%xmm14,%xmm13
	vpmuludq %xmm9,%xmm14,%xmm14

	vpmuludq %xmm8,%xmm2,%xmm0
	vpaddq %xmm0,%xmm14,%xmm14
	vpshufd $0x10,-32(%rdi),%xmm3
	vpmuludq %xmm7,%xmm2,%xmm1
	vpaddq %xmm1,%xmm13,%xmm13
	vpshufd $0x10,-16(%rdi),%xmm4
	vpmuludq %xmm6,%xmm2,%xmm0
	vpaddq %xmm0,%xmm12,%xmm12
	vpmuludq %xmm5,%xmm2,%xmm2
	vpaddq %xmm2,%xmm11,%xmm11
	vpmuludq %xmm9,%xmm3,%xmm3
	vpaddq %xmm3,%xmm10,%xmm10

	vpshufd $0x10,0(%rdi),%xmm2
	vpmuludq %xmm7,%xmm4,%xmm1
	vpaddq %xmm1,%xmm14,%xmm14
	vpmuludq %xmm6,%xmm4,%xmm0
	vpaddq %xmm0,%xmm13,%xmm13
	vpshufd $0x10,16(%rdi),%xmm3
	vpmuludq %xmm5,%xmm4,%xmm4
	vpaddq %xmm4,%xmm12,%xmm12
	vpmuludq %xmm9,%xmm2,%xmm1
	vpaddq %xmm1,%xmm11,%xmm11
	vpshufd $0x10,32(%rdi),%xmm4
	vpmuludq %xmm8,%xmm2,%xmm2
	vpaddq %xmm2,%xmm10,%xmm10

	vpmuludq %xmm6,%xmm3,%xmm0
	vpaddq %xmm0,%xmm14,%xmm14
	vpmuludq %xmm5,%xmm3,%xmm3
	vpaddq %xmm3,%xmm13,%xmm13
	vpshufd $0x10,48(%rdi),%xmm2
	vpmuludq %xmm9,%xmm4,%xmm1
	vpaddq %xmm1,%xmm12,%xmm12
	vpshufd $0x10,64(%rdi),%xmm3
	vpmuludq %xmm8,%xmm4,%xmm0
	vpaddq %xmm0,%xmm11,%xmm11
	vpmuludq %xmm7,%xmm4,%xmm4
	vpaddq %xmm4,%xmm10,%xmm10

	vpmuludq %xmm5,%xmm2,%xmm2
	vpaddq %xmm2,%xmm14,%xmm14
	vpmuludq %xmm9,%xmm3,%xmm1
	vpaddq %xmm1,%xmm13,%xmm13
	vpmuludq %xmm8,%xmm3,%xmm0
	vpaddq %xmm0,%xmm12,%xmm12
	vpmuludq %xmm7,%xmm3,%xmm1
	vpaddq %xmm1,%xmm11,%xmm11
	vpmuludq %xmm6,%xmm3,%xmm3
	vpaddq %xmm3,%xmm10,%xmm10

	jz .Lshort_tail_avx

	vmovdqu 0(%rsi),%xmm0
	vmovdqu 16(%rsi),%xmm1

	vpsrldq $6,%xmm0,%xmm2
	vpsrldq $6,%xmm1,%xmm3
	vpunpckhqdq %xmm1,%xmm0,%xmm4
	vpunpcklqdq %xmm1,%xmm0,%xmm0
	vpunpcklqdq %xmm3,%xmm2,%xmm3

	vpsrlq $40,%xmm4,%xmm4
	vpsrlq $26,%xmm0,%xmm1
	vpand %xmm15,%xmm0,%xmm0
	vpsrlq $4,%xmm3,%xmm2
	vpand %xmm15,%xmm1,%xmm1
	vpsrlq $30,%xmm3,%xmm3
	vpand %xmm15,%xmm2,%xmm2
	vpand %xmm15,%xmm3,%xmm3
	vpor 32(%rcx),%xmm4,%xmm4

	vpshufd $0x32,-64(%rdi),%xmm9
	vpaddq 0(%r11),%xmm0,%xmm0
	vpaddq 16(%r11),%xmm1,%xmm1
	vpaddq 32(%r11),%xmm2,%xmm2
	vpaddq 48(%r11),%xmm3,%xmm3
	vpaddq 64(%r11),%xmm4,%xmm4

	vpmuludq %xmm0,%xmm9,%xmm5
	vpaddq %xmm5,%xmm10,%xmm10
	vpmuludq %xmm1,%xmm9,%xmm6
	vpaddq %xmm6,%xmm11,%xmm11
	vpmuludq %xmm2,%xmm9,%xmm5
	vpaddq %xmm5,%xmm12,%xmm12
	vpshufd $0x32,-48(%rdi),%xmm7
	vpmuludq %xmm3,%xmm9,%xmm6
	vpaddq %xmm6,%xmm13,%xmm13
	vpmuludq %xmm4,%xmm9,%xmm9
	vpaddq %xmm9,%xmm14,%xmm14

	vpmuludq %xmm3,%xmm7,%xmm5
	vpaddq %xmm5,%xmm14,%xmm14
	vpshufd $0x32,-32(%rdi),%xmm8
	vpmuludq %xmm2,%xmm7,%xmm6
	vpaddq %xmm6,%xmm13,%xmm13
	vpshufd $0x32,-16(%rdi),%xmm9
	vpmuludq %xmm1,%xmm7,%xmm5
	vpaddq %xmm5,%xmm12,%xmm12
	vpmuludq %xmm0,%xmm7,%xmm7
	vpaddq %xmm7,%xmm11,%xmm11
	vpmuludq %xmm4,%xmm8,%xmm8
	vpaddq %xmm8,%xmm10,%xmm10

	vpshufd $0x32,0(%rdi),%xmm7
	vpmuludq %xmm2,%xmm9,%xmm6
	vpaddq %xmm6,%xmm14,%xmm14
	vpmuludq %xmm1,%xmm9,%xmm5
	vpaddq %xmm5,%xmm13,%xmm13
	vpshufd $0x32,16(%rdi),%xmm8
	vpmuludq %xmm0,%xmm9,%xmm9
	vpaddq %xmm9,%xmm12,%xmm12
	vpmuludq %xmm4,%xmm7,%xmm6
	vpaddq %xmm6,%xmm11,%xmm11
	vpshufd $0x32,32(%rdi),%xmm9
	vpmuludq %xmm3,%xmm7,%xmm7
	vpaddq %xmm7,%xmm10,%xmm10

	vpmuludq %xmm1,%xmm8,%xmm5
	vpaddq %xmm5,%xmm14,%xmm14
	vpmuludq %xmm0,%xmm8,%xmm8
	vpaddq %xmm8,%xmm13,%xmm13
	vpshufd $0x32,48(%rdi),%xmm7
	vpmuludq %xmm4,%xmm9,%xmm6
	vpaddq %xmm6,%xmm12,%xmm12
	vpshufd $0x32,64(%rdi),%xmm8
	vpmuludq %xmm3,%xmm9,%xmm5
	vpaddq %xmm5,%xmm11,%xmm11
	vpmuludq %xmm2,%xmm9,%xmm9
	vpaddq %xmm9,%xmm10,%xmm10

	vpmuludq %xmm0,%xmm7,%xmm7
	vpaddq %xmm7,%xmm14,%xmm14
	vpmuludq %xmm4,%xmm8,%xmm6
	vpaddq %xmm6,%xmm13,%xmm13
	vpmuludq %xmm3,%xmm8,%xmm5
	vpaddq %xmm5,%xmm12,%xmm12
	vpmuludq %xmm2,%xmm8,%xmm6
	vpaddq %xmm6,%xmm11,%xmm11
	vpmuludq %xmm1,%xmm8,%xmm8
	vpaddq %xmm8,%xmm10,%xmm10

.Lshort_tail_avx:

	vpsrldq $8,%xmm14,%xmm9
	vpsrldq $8,%xmm13,%xmm8
	vpsrldq $8,%xmm11,%xmm6
	vpsrldq $8,%xmm10,%xmm5
	vpsrldq $8,%xmm12,%xmm7
	vpaddq %xmm8,%xmm13,%xmm13
	vpaddq %xmm9,%xmm14,%xmm14
	vpaddq %xmm5,%xmm10,%xmm10
	vpaddq %xmm6,%xmm11,%xmm11
	vpaddq %xmm7,%xmm12,%xmm12

	vpsrlq $26,%xmm13,%xmm3
	vpand %xmm15,%xmm13,%xmm13
	vpaddq %xmm3,%xmm14,%xmm14

	vpsrlq $26,%xmm10,%xmm0
	vpand %xmm15,%xmm10,%xmm10
	vpaddq %xmm0,%xmm11,%xmm11

	vpsrlq $26,%xmm14,%xmm4
	vpand %xmm15,%xmm14,%xmm14

	vpsrlq $26,%xmm11,%xmm1
	vpand %xmm15,%xmm11,%xmm11
	vpaddq %xmm1,%xmm12,%xmm12

	vpaddq %xmm4,%xmm10,%xmm10
	vpsllq $2,%xmm4,%xmm4
	vpaddq %xmm4,%xmm10,%xmm10

	vpsrlq $26,%xmm12,%xmm2
	vpand %xmm15,%xmm12,%xmm12
	vpaddq %xmm2,%xmm13,%xmm13

	vpsrlq $26,%xmm10,%xmm0
	vpand %xmm15,%xmm10,%xmm10
	vpaddq %xmm0,%xmm11,%xmm11

	vpsrlq $26,%xmm13,%xmm3
	vpand %xmm15,%xmm13,%xmm13
	vpaddq %xmm3,%xmm14,%xmm14

	vmovd %xmm10,-112(%rdi)
	vmovd %xmm11,-108(%rdi)
	vmovd %xmm12,-104(%rdi)
	vmovd %xmm13,-100(%rdi)
	vmovd %xmm14,-96(%rdi)
	leaq 88(%r11),%rsp
.cfi_def_cfa %rsp,8
	vzeroupper
	.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_blocks_avx,.-poly1305_blocks_avx

.type poly1305_emit_avx,@function
.align 32
poly1305_emit_avx:
.cfi_startproc
.byte 243,15,30,250
	cmpl $0,20(%rdi)
	je .Lemit

	movl 0(%rdi),%eax
	movl 4(%rdi),%ecx
	movl 8(%rdi),%r8d
	movl 12(%rdi),%r11d
	movl 16(%rdi),%r10d

	shlq $26,%rcx
	movq %r8,%r9
	shlq $52,%r8
	addq %rcx,%rax
	shrq $12,%r9
	addq %rax,%r8
	adcq $0,%r9

	shlq $14,%r11
	movq %r10,%rax
	shrq $24,%r10
	addq %r11,%r9
	shlq $40,%rax
	addq %rax,%r9
	adcq $0,%r10

	movq %r10,%rax
	movq %r10,%rcx
	andq $3,%r10
	shrq $2,%rax
	andq $-4,%rcx
	addq %rcx,%rax
	addq %rax,%r8
	adcq $0,%r9
	adcq $0,%r10

	movq %r8,%rax
	addq $5,%r8
	movq %r9,%rcx
	adcq $0,%r9
	adcq $0,%r10
	shrq $2,%r10
	cmovnzq %r8,%rax
	cmovnzq %r9,%rcx

	addq 0(%rdx),%rax
	adcq 8(%rdx),%rcx
	movq %rax,0(%rsi)
	movq %rcx,8(%rsi)

	.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_emit_avx,.-poly1305_emit_avx
.type poly1305_blocks_avx2,@function
.align 32
poly1305_blocks_avx2:
.cfi_startproc
.byte 243,15,30,250
	movl 20(%rdi),%r8d
	cmpq $128,%rdx
	jae .Lblocks_avx2
	testl %r8d,%r8d
	jz .Lblocks

.Lblocks_avx2:
	andq $-16,%rdx
	jz .Lno_data_avx2

	vzeroupper

	testl %r8d,%r8d
	jz .Lbase2_64_avx2

	testq $63,%rdx
	jz .Leven_avx2

	pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
	pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
	pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
	pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
	pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
	pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lblocks_avx2_body:

	movq %rdx,%r15

	movq 0(%rdi),%r8
	movq 8(%rdi),%r9
	movl 16(%rdi),%ebp

	movq 24(%rdi),%r11
	movq 32(%rdi),%r13

	movl %r8d,%r14d
	andq $-2147483648,%r8
	movq %r9,%r12
	movl %r9d,%ebx
	andq $-2147483648,%r9

	shrq $6,%r8
	shlq $52,%r12
	addq %r8,%r14
	shrq $12,%rbx
	shrq $18,%r9
	addq %r12,%r14
	adcq %r9,%rbx

	movq %rbp,%r8
	shlq $40,%r8
	shrq $24,%rbp
	addq %r8,%rbx
	adcq $0,%rbp

	movq $-4,%r9
	movq %rbp,%r8
	andq %rbp,%r9
	shrq $2,%r8
	andq $3,%rbp
	addq %r9,%r8
	addq %r8,%r14
	adcq $0,%rbx
	adcq $0,%rbp

	movq %r13,%r12
	movq %r13,%rax
	shrq $2,%r13
	addq %r12,%r13

.Lbase2_26_pre_avx2:
	addq 0(%rsi),%r14
	adcq 8(%rsi),%rbx
	leaq 16(%rsi),%rsi
	adcq %rcx,%rbp
	subq $16,%r15

	call __poly1305_block
	movq %r12,%rax

	testq $63,%r15
	jnz .Lbase2_26_pre_avx2

	testq %rcx,%rcx
	jz .Lstore_base2_64_avx2

	movq %r14,%rax
	movq %r14,%rdx
	shrq $52,%r14
	movq %rbx,%r11
	movq %rbx,%r12
	shrq $26,%rdx
	andq $0x3ffffff,%rax
	shlq $12,%r11
	andq $0x3ffffff,%rdx
	shrq $14,%rbx
	orq %r11,%r14
	shlq $24,%rbp
	andq $0x3ffffff,%r14
	shrq $40,%r12
	andq $0x3ffffff,%rbx
	orq %r12,%rbp

	testq %r15,%r15
	jz .Lstore_base2_26_avx2

	vmovd %eax,%xmm0
	vmovd %edx,%xmm1
	vmovd %r14d,%xmm2
	vmovd %ebx,%xmm3
	vmovd %ebp,%xmm4
	jmp .Lproceed_avx2

.align 32
.Lstore_base2_64_avx2:
	movq %r14,0(%rdi)
	movq %rbx,8(%rdi)
	movq %rbp,16(%rdi)
	jmp .Ldone_avx2

.align 16
.Lstore_base2_26_avx2:
	movl %eax,0(%rdi)
	movl %edx,4(%rdi)
	movl %r14d,8(%rdi)
	movl %ebx,12(%rdi)
	movl %ebp,16(%rdi)
.align 16
.Ldone_avx2:
	movq 0(%rsp),%r15
.cfi_restore %r15
	movq 8(%rsp),%r14
.cfi_restore %r14
	movq 16(%rsp),%r13
.cfi_restore %r13
	movq 24(%rsp),%r12
.cfi_restore %r12
	movq 32(%rsp),%rbp
.cfi_restore %rbp
	movq 40(%rsp),%rbx
.cfi_restore %rbx
	leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lno_data_avx2:
.Lblocks_avx2_epilogue:
	.byte 0xf3,0xc3
.cfi_endproc

.align 32
.Lbase2_64_avx2:
.cfi_startproc
	pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
	pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
	pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
	pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
	pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
	pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lbase2_64_avx2_body:

	movq %rdx,%r15

	movq 24(%rdi),%r11
	movq 32(%rdi),%r13

	movq 0(%rdi),%r14
	movq 8(%rdi),%rbx
	movl 16(%rdi),%ebp

	movq %r13,%r12
	movq %r13,%rax
	shrq $2,%r13
	addq %r12,%r13

	testq $63,%rdx
	jz .Linit_avx2

.Lbase2_64_pre_avx2:
	addq 0(%rsi),%r14
	adcq 8(%rsi),%rbx
	leaq 16(%rsi),%rsi
	adcq %rcx,%rbp
	subq $16,%r15

	call __poly1305_block
	movq %r12,%rax

	testq $63,%r15
	jnz .Lbase2_64_pre_avx2

.Linit_avx2:

	movq %r14,%rax
	movq %r14,%rdx
	shrq $52,%r14
	movq %rbx,%r8
	movq %rbx,%r9
	shrq $26,%rdx
	andq $0x3ffffff,%rax
	shlq $12,%r8
	andq $0x3ffffff,%rdx
	shrq $14,%rbx
	orq %r8,%r14
	shlq $24,%rbp
	andq $0x3ffffff,%r14
	shrq $40,%r9
	andq $0x3ffffff,%rbx
	orq %r9,%rbp

	vmovd %eax,%xmm0
	vmovd %edx,%xmm1
	vmovd %r14d,%xmm2
	vmovd %ebx,%xmm3
	vmovd %ebp,%xmm4
	movl $1,20(%rdi)

	call __poly1305_init_avx

.Lproceed_avx2:
	movq %r15,%rdx
	movl OPENSSL_ia32cap_P+8(%rip),%r10d
	movl $3221291008,%r11d

	movq 0(%rsp),%r15
.cfi_restore %r15
	movq 8(%rsp),%r14
.cfi_restore %r14
	movq 16(%rsp),%r13
.cfi_restore %r13
	movq 24(%rsp),%r12
.cfi_restore %r12
	movq 32(%rsp),%rbp
.cfi_restore %rbp
	movq 40(%rsp),%rbx
.cfi_restore %rbx
	leaq 48(%rsp),%rax
	leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lbase2_64_avx2_epilogue:
	jmp .Ldo_avx2
.cfi_endproc

.align 32
.Leven_avx2:
.cfi_startproc
	movl OPENSSL_ia32cap_P+8(%rip),%r10d
	vmovd 0(%rdi),%xmm0
	vmovd 4(%rdi),%xmm1
	vmovd 8(%rdi),%xmm2
	vmovd 12(%rdi),%xmm3
	vmovd 16(%rdi),%xmm4

.Ldo_avx2:
	leaq -8(%rsp),%r11
.cfi_def_cfa %r11,16
	subq $0x128,%rsp
	leaq .Lconst(%rip),%rcx
	leaq 48+64(%rdi),%rdi
	vmovdqa 96(%rcx),%ymm7

	vmovdqu -64(%rdi),%xmm9
	andq $-512,%rsp
	vmovdqu -48(%rdi),%xmm10
	vmovdqu -32(%rdi),%xmm6
	vmovdqu -16(%rdi),%xmm11
	vmovdqu 0(%rdi),%xmm12
	vmovdqu 16(%rdi),%xmm13
	leaq 144(%rsp),%rax
	vmovdqu 32(%rdi),%xmm14
	vpermd %ymm9,%ymm7,%ymm9
	vmovdqu 48(%rdi),%xmm15
	vpermd %ymm10,%ymm7,%ymm10
	vmovdqu 64(%rdi),%xmm5
	vpermd %ymm6,%ymm7,%ymm6
	vmovdqa %ymm9,0(%rsp)
	vpermd %ymm11,%ymm7,%ymm11
	vmovdqa %ymm10,32-144(%rax)
	vpermd %ymm12,%ymm7,%ymm12
	vmovdqa %ymm6,64-144(%rax)
	vpermd %ymm13,%ymm7,%ymm13
	vmovdqa %ymm11,96-144(%rax)
	vpermd %ymm14,%ymm7,%ymm14
	vmovdqa %ymm12,128-144(%rax)
	vpermd %ymm15,%ymm7,%ymm15
	vmovdqa %ymm13,160-144(%rax)
	vpermd %ymm5,%ymm7,%ymm5
	vmovdqa %ymm14,192-144(%rax)
	vmovdqa %ymm15,224-144(%rax)
	vmovdqa %ymm5,256-144(%rax)
	vmovdqa 64(%rcx),%ymm5

	vmovdqu 0(%rsi),%xmm7
	vmovdqu 16(%rsi),%xmm8
	vinserti128 $1,32(%rsi),%ymm7,%ymm7
	vinserti128 $1,48(%rsi),%ymm8,%ymm8
	leaq 64(%rsi),%rsi

	vpsrldq $6,%ymm7,%ymm9
	vpsrldq $6,%ymm8,%ymm10
	vpunpckhqdq %ymm8,%ymm7,%ymm6
	vpunpcklqdq %ymm10,%ymm9,%ymm9
	vpunpcklqdq %ymm8,%ymm7,%ymm7

	vpsrlq $30,%ymm9,%ymm10
	vpsrlq $4,%ymm9,%ymm9
	vpsrlq $26,%ymm7,%ymm8
	vpsrlq $40,%ymm6,%ymm6
	vpand %ymm5,%ymm9,%ymm9
	vpand %ymm5,%ymm7,%ymm7
	vpand %ymm5,%ymm8,%ymm8
	vpand %ymm5,%ymm10,%ymm10
	vpor 32(%rcx),%ymm6,%ymm6

	vpaddq %ymm2,%ymm9,%ymm2
	subq $64,%rdx
	jz .Ltail_avx2
	jmp .Loop_avx2

.align 32
.Loop_avx2:

	vpaddq %ymm0,%ymm7,%ymm0
	vmovdqa 0(%rsp),%ymm7
	vpaddq %ymm1,%ymm8,%ymm1
	vmovdqa 32(%rsp),%ymm8
	vpaddq %ymm3,%ymm10,%ymm3
	vmovdqa 96(%rsp),%ymm9
	vpaddq %ymm4,%ymm6,%ymm4
	vmovdqa 48(%rax),%ymm10
	vmovdqa 112(%rax),%ymm5

	vpmuludq %ymm2,%ymm7,%ymm13
	vpmuludq %ymm2,%ymm8,%ymm14
	vpmuludq %ymm2,%ymm9,%ymm15
	vpmuludq %ymm2,%ymm10,%ymm11
	vpmuludq %ymm2,%ymm5,%ymm12

	vpmuludq %ymm0,%ymm8,%ymm6
	vpmuludq %ymm1,%ymm8,%ymm2
	vpaddq %ymm6,%ymm12,%ymm12
	vpaddq %ymm2,%ymm13,%ymm13
	vpmuludq %ymm3,%ymm8,%ymm6
	vpmuludq 64(%rsp),%ymm4,%ymm2
	vpaddq %ymm6,%ymm15,%ymm15
	vpaddq %ymm2,%ymm11,%ymm11
	vmovdqa -16(%rax),%ymm8

	vpmuludq %ymm0,%ymm7,%ymm6
	vpmuludq %ymm1,%ymm7,%ymm2
	vpaddq %ymm6,%ymm11,%ymm11
	vpaddq %ymm2,%ymm12,%ymm12
	vpmuludq %ymm3,%ymm7,%ymm6
	vpmuludq %ymm4,%ymm7,%ymm2
	vmovdqu 0(%rsi),%xmm7
	vpaddq %ymm6,%ymm14,%ymm14
	vpaddq %ymm2,%ymm15,%ymm15
	vinserti128 $1,32(%rsi),%ymm7,%ymm7

	vpmuludq %ymm3,%ymm8,%ymm6
	vpmuludq %ymm4,%ymm8,%ymm2
	vmovdqu 16(%rsi),%xmm8
	vpaddq %ymm6,%ymm11,%ymm11
	vpaddq %ymm2,%ymm12,%ymm12
	vmovdqa 16(%rax),%ymm2
	vpmuludq %ymm1,%ymm9,%ymm6
	vpmuludq %ymm0,%ymm9,%ymm9
	vpaddq %ymm6,%ymm14,%ymm14
	vpaddq %ymm9,%ymm13,%ymm13
	vinserti128 $1,48(%rsi),%ymm8,%ymm8
	leaq 64(%rsi),%rsi

	vpmuludq %ymm1,%ymm2,%ymm6
	vpmuludq %ymm0,%ymm2,%ymm2
	vpsrldq $6,%ymm7,%ymm9
	vpaddq %ymm6,%ymm15,%ymm15
	vpaddq %ymm2,%ymm14,%ymm14
	vpmuludq %ymm3,%ymm10,%ymm6
	vpmuludq %ymm4,%ymm10,%ymm2
	vpsrldq $6,%ymm8,%ymm10
	vpaddq %ymm6,%ymm12,%ymm12
	vpaddq %ymm2,%ymm13,%ymm13
	vpunpckhqdq %ymm8,%ymm7,%ymm6

	vpmuludq %ymm3,%ymm5,%ymm3
	vpmuludq %ymm4,%ymm5,%ymm4
	vpunpcklqdq %ymm8,%ymm7,%ymm7
	vpaddq %ymm3,%ymm13,%ymm2
	vpaddq %ymm4,%ymm14,%ymm3
	vpunpcklqdq %ymm10,%ymm9,%ymm10
	vpmuludq 80(%rax),%ymm0,%ymm4
	vpmuludq %ymm1,%ymm5,%ymm0
	vmovdqa 64(%rcx),%ymm5
	vpaddq %ymm4,%ymm15,%ymm4
	vpaddq %ymm0,%ymm11,%ymm0

	vpsrlq $26,%ymm3,%ymm14
	vpand %ymm5,%ymm3,%ymm3
	vpaddq %ymm14,%ymm4,%ymm4

	vpsrlq $26,%ymm0,%ymm11
	vpand %ymm5,%ymm0,%ymm0
	vpaddq %ymm11,%ymm12,%ymm1

	vpsrlq $26,%ymm4,%ymm15
	vpand %ymm5,%ymm4,%ymm4

	vpsrlq $4,%ymm10,%ymm9

	vpsrlq $26,%ymm1,%ymm12
	vpand %ymm5,%ymm1,%ymm1
	vpaddq %ymm12,%ymm2,%ymm2

	vpaddq %ymm15,%ymm0,%ymm0
	vpsllq $2,%ymm15,%ymm15
	vpaddq %ymm15,%ymm0,%ymm0

	vpand %ymm5,%ymm9,%ymm9
	vpsrlq $26,%ymm7,%ymm8

	vpsrlq $26,%ymm2,%ymm13
	vpand %ymm5,%ymm2,%ymm2
	vpaddq %ymm13,%ymm3,%ymm3

	vpaddq %ymm9,%ymm2,%ymm2
	vpsrlq $30,%ymm10,%ymm10

	vpsrlq $26,%ymm0,%ymm11
	vpand %ymm5,%ymm0,%ymm0
	vpaddq %ymm11,%ymm1,%ymm1

	vpsrlq $40,%ymm6,%ymm6

	vpsrlq $26,%ymm3,%ymm14
	vpand %ymm5,%ymm3,%ymm3
	vpaddq %ymm14,%ymm4,%ymm4

	vpand %ymm5,%ymm7,%ymm7
	vpand %ymm5,%ymm8,%ymm8
	vpand %ymm5,%ymm10,%ymm10
	vpor 32(%rcx),%ymm6,%ymm6

	subq $64,%rdx
	jnz .Loop_avx2

.byte 0x66,0x90
.Ltail_avx2:

	vpaddq %ymm0,%ymm7,%ymm0
	vmovdqu 4(%rsp),%ymm7
	vpaddq %ymm1,%ymm8,%ymm1
	vmovdqu 36(%rsp),%ymm8
	vpaddq %ymm3,%ymm10,%ymm3
	vmovdqu 100(%rsp),%ymm9
	vpaddq %ymm4,%ymm6,%ymm4
	vmovdqu 52(%rax),%ymm10
	vmovdqu 116(%rax),%ymm5

	vpmuludq %ymm2,%ymm7,%ymm13
	vpmuludq %ymm2,%ymm8,%ymm14
	vpmuludq %ymm2,%ymm9,%ymm15
	vpmuludq %ymm2,%ymm10,%ymm11
	vpmuludq %ymm2,%ymm5,%ymm12

	vpmuludq %ymm0,%ymm8,%ymm6
	vpmuludq %ymm1,%ymm8,%ymm2
	vpaddq %ymm6,%ymm12,%ymm12
	vpaddq %ymm2,%ymm13,%ymm13
	vpmuludq %ymm3,%ymm8,%ymm6
	vpmuludq 68(%rsp),%ymm4,%ymm2
	vpaddq %ymm6,%ymm15,%ymm15
	vpaddq %ymm2,%ymm11,%ymm11

	vpmuludq %ymm0,%ymm7,%ymm6
	vpmuludq %ymm1,%ymm7,%ymm2
	vpaddq %ymm6,%ymm11,%ymm11
	vmovdqu -12(%rax),%ymm8
	vpaddq %ymm2,%ymm12,%ymm12
	vpmuludq %ymm3,%ymm7,%ymm6
	vpmuludq %ymm4,%ymm7,%ymm2
	vpaddq %ymm6,%ymm14,%ymm14
	vpaddq %ymm2,%ymm15,%ymm15

	vpmuludq %ymm3,%ymm8,%ymm6
	vpmuludq %ymm4,%ymm8,%ymm2
	vpaddq %ymm6,%ymm11,%ymm11
	vpaddq %ymm2,%ymm12,%ymm12
	vmovdqu 20(%rax),%ymm2
	vpmuludq %ymm1,%ymm9,%ymm6
	vpmuludq %ymm0,%ymm9,%ymm9
	vpaddq %ymm6,%ymm14,%ymm14
	vpaddq %ymm9,%ymm13,%ymm13

	vpmuludq %ymm1,%ymm2,%ymm6
	vpmuludq %ymm0,%ymm2,%ymm2
	vpaddq %ymm6,%ymm15,%ymm15
	vpaddq %ymm2,%ymm14,%ymm14
	vpmuludq %ymm3,%ymm10,%ymm6
	vpmuludq %ymm4,%ymm10,%ymm2
	vpaddq %ymm6,%ymm12,%ymm12
	vpaddq %ymm2,%ymm13,%ymm13

	vpmuludq %ymm3,%ymm5,%ymm3
	vpmuludq %ymm4,%ymm5,%ymm4
	vpaddq %ymm3,%ymm13,%ymm2
	vpaddq %ymm4,%ymm14,%ymm3
	vpmuludq 84(%rax),%ymm0,%ymm4
	vpmuludq %ymm1,%ymm5,%ymm0
	vmovdqa 64(%rcx),%ymm5
	vpaddq %ymm4,%ymm15,%ymm4
	vpaddq %ymm0,%ymm11,%ymm0

	vpsrldq $8,%ymm12,%ymm8
	vpsrldq $8,%ymm2,%ymm9
	vpsrldq $8,%ymm3,%ymm10
	vpsrldq $8,%ymm4,%ymm6
	vpsrldq $8,%ymm0,%ymm7
	vpaddq %ymm8,%ymm12,%ymm12
	vpaddq %ymm9,%ymm2,%ymm2
	vpaddq %ymm10,%ymm3,%ymm3
	vpaddq %ymm6,%ymm4,%ymm4
	vpaddq %ymm7,%ymm0,%ymm0

	vpermq $0x2,%ymm3,%ymm10
	vpermq $0x2,%ymm4,%ymm6
	vpermq $0x2,%ymm0,%ymm7
	vpermq $0x2,%ymm12,%ymm8
	vpermq $0x2,%ymm2,%ymm9
	vpaddq %ymm10,%ymm3,%ymm3
	vpaddq %ymm6,%ymm4,%ymm4
	vpaddq %ymm7,%ymm0,%ymm0
	vpaddq %ymm8,%ymm12,%ymm12
	vpaddq %ymm9,%ymm2,%ymm2

	vpsrlq $26,%ymm3,%ymm14
	vpand %ymm5,%ymm3,%ymm3
	vpaddq %ymm14,%ymm4,%ymm4

	vpsrlq $26,%ymm0,%ymm11
	vpand %ymm5,%ymm0,%ymm0
	vpaddq %ymm11,%ymm12,%ymm1

	vpsrlq $26,%ymm4,%ymm15
	vpand %ymm5,%ymm4,%ymm4

	vpsrlq $26,%ymm1,%ymm12
	vpand %ymm5,%ymm1,%ymm1
	vpaddq %ymm12,%ymm2,%ymm2

	vpaddq %ymm15,%ymm0,%ymm0
	vpsllq $2,%ymm15,%ymm15
	vpaddq %ymm15,%ymm0,%ymm0

	vpsrlq $26,%ymm2,%ymm13
	vpand %ymm5,%ymm2,%ymm2
	vpaddq %ymm13,%ymm3,%ymm3

	vpsrlq $26,%ymm0,%ymm11
	vpand %ymm5,%ymm0,%ymm0
	vpaddq %ymm11,%ymm1,%ymm1

	vpsrlq $26,%ymm3,%ymm14
	vpand %ymm5,%ymm3,%ymm3
	vpaddq %ymm14,%ymm4,%ymm4

	vmovd %xmm0,-112(%rdi)
	vmovd %xmm1,-108(%rdi)
	vmovd %xmm2,-104(%rdi)
	vmovd %xmm3,-100(%rdi)
	vmovd %xmm4,-96(%rdi)
	leaq 8(%r11),%rsp
.cfi_def_cfa %rsp,8
	vzeroupper
	.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
.section .rodata
.align 64
.Lconst:
.Lmask24:
.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long 16777216,0,16777216,0,16777216,0,16777216,0
.Lmask26:
.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long 2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7

.L2_44_inp_permd:
.long 0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad 0,12,24,64
.L2_44_mask:
.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad 44,44,42,64
.L2_44_shift_lft:
.quad 8,8,10,64

.align 64
.Lx_mask44:
.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.previous
.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 16
.globl xor128_encrypt_n_pad
.type xor128_encrypt_n_pad,@function
.align 16
xor128_encrypt_n_pad:
.cfi_startproc
	subq %rdx,%rsi
	subq %rdx,%rdi
	movq %rcx,%r10
	shrq $4,%rcx
	jz .Ltail_enc
	nop
.Loop_enc_xmm:
	movdqu (%rsi,%rdx,1),%xmm0
	pxor (%rdx),%xmm0
	movdqu %xmm0,(%rdi,%rdx,1)
	movdqa %xmm0,(%rdx)
	leaq 16(%rdx),%rdx
	decq %rcx
	jnz .Loop_enc_xmm

	andq $15,%r10
	jz .Ldone_enc

.Ltail_enc:
	movq $16,%rcx
	subq %r10,%rcx
	xorl %eax,%eax
.Loop_enc_byte:
	movb (%rsi,%rdx,1),%al
	xorb (%rdx),%al
	movb %al,(%rdi,%rdx,1)
	movb %al,(%rdx)
	leaq 1(%rdx),%rdx
	decq %r10
	jnz .Loop_enc_byte

	xorl %eax,%eax
.Loop_enc_pad:
	movb %al,(%rdx)
	leaq 1(%rdx),%rdx
	decq %rcx
	jnz .Loop_enc_pad

.Ldone_enc:
	movq %rdx,%rax
	.byte 0xf3,0xc3
.cfi_endproc
.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad

.globl xor128_decrypt_n_pad
.type xor128_decrypt_n_pad,@function
.align 16
xor128_decrypt_n_pad:
.cfi_startproc
	subq %rdx,%rsi
	subq %rdx,%rdi
	movq %rcx,%r10
	shrq $4,%rcx
	jz .Ltail_dec
	nop
.Loop_dec_xmm:
	movdqu (%rsi,%rdx,1),%xmm0
	movdqa (%rdx),%xmm1
	pxor %xmm0,%xmm1
	movdqu %xmm1,(%rdi,%rdx,1)
	movdqa %xmm0,(%rdx)
	leaq 16(%rdx),%rdx
	decq %rcx
	jnz .Loop_dec_xmm

	pxor %xmm1,%xmm1
	andq $15,%r10
	jz .Ldone_dec

.Ltail_dec:
	movq $16,%rcx
	subq %r10,%rcx
	xorl %eax,%eax
	xorq %r11,%r11
.Loop_dec_byte:
	movb (%rsi,%rdx,1),%r11b
	movb (%rdx),%al
	xorb %r11b,%al
	movb %al,(%rdi,%rdx,1)
	movb %r11b,(%rdx)
	leaq 1(%rdx),%rdx
	decq %r10
	jnz .Loop_dec_byte

	xorl %eax,%eax
.Loop_dec_pad:
	movb %al,(%rdx)
	leaq 1(%rdx),%rdx
	decq %rcx
	jnz .Loop_dec_pad

.Ldone_dec:
	movq %rdx,%rax
	.byte 0xf3,0xc3
.cfi_endproc
.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
	.section ".note.gnu.property", "a"
	.p2align 3
	.long 1f - 0f
	.long 4f - 1f
	.long 5
0:
	# "GNU" encoded with .byte, since .asciz isn't supported
	# on Solaris.
	.byte 0x47
	.byte 0x4e
	.byte 0x55
	.byte 0
1:
	.p2align 3
	.long 0xc0000002
	.long 3f - 2f
2:
	.long 3
3:
	.p2align 3
4: