/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from poly1305-x86_64.pl. */

# Poly1305 one-time MAC, x86-64, AT&T/GAS syntax, ELF, SysV AMD64 ABI.
#
# Context layout (as used by the code below; base register %rdi):
#    0..23(%rdi)  accumulator h  (h0,h1,h2 in base 2^64, or five 26-bit
#                 limbs at 0,4,8,12,16 once the AVX paths convert it)
#   20(%rdi)      flag: non-zero once h is stored in base 2^26
#                 (set via "movl $1,20(%rdi)" before __poly1305_init_avx)
#   24(%rdi)      r0, 32(%rdi) r1 — clamped key
#   48+64(%rdi)   table of pre-computed powers of r written by
#                 __poly1305_init_avx for the SIMD paths
#
# .Lconst and OPENSSL_ia32cap_P are defined outside this chunk.
.text



.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

# int poly1305_init(void *ctx=%rdi, const u8 key[32]=%rsi, void *func[2]=%rdx)
# Zeroes the accumulator, clamps the low 16 key bytes into r, and selects
# the blocks/emit implementations (scalar/AVX/AVX2) from OPENSSL_ia32cap_P,
# storing the chosen function pointers to func[0]/func[1].
# Returns 1 in %eax when a key was supplied, 0 otherwise.
.type	poly1305_init,@function
.align	32
poly1305_init:
.cfi_startproc
	xorq	%rax,%rax
	movq	%rax,0(%rdi)		# h = 0
	movq	%rax,8(%rdi)
	movq	%rax,16(%rdi)

	cmpq	$0,%rsi			# NULL key -> just clear, return 0
	je	.Lno_key

	leaq	poly1305_blocks(%rip),%r10
	leaq	poly1305_emit(%rip),%r11
	movq	OPENSSL_ia32cap_P+4(%rip),%r9	# CPU feature bits
	leaq	poly1305_blocks_avx(%rip),%rax
	leaq	poly1305_emit_avx(%rip),%rcx
	btq	$28,%r9			# AVX?
	cmovcq	%rax,%r10
	cmovcq	%rcx,%r11
	leaq	poly1305_blocks_avx2(%rip),%rax
	btq	$37,%r9			# AVX2?
	cmovcq	%rax,%r10
	movq	$0x0ffffffc0fffffff,%rax	# RFC 8439 key clamp masks
	movq	$0x0ffffffc0ffffffc,%rcx
	andq	0(%rsi),%rax		# r0
	andq	8(%rsi),%rcx		# r1
	movq	%rax,24(%rdi)
	movq	%rcx,32(%rdi)
	movq	%r10,0(%rdx)		# export chosen blocks function
	movq	%r11,8(%rdx)		# export chosen emit function
	movl	$1,%eax
.Lno_key:
	.byte	0xf3,0xc3		# "rep ret"
.cfi_endproc
.size	poly1305_init,.-poly1305_init

# void poly1305_blocks(void *ctx=%rdi, const u8 *inp=%rsi, size_t len=%rdx,
#                      u64 padbit=%rcx)
# Scalar base-2^64 path: absorbs len/16 blocks into h, h = (h+block)*r mod
# 2^130-5 per block.  Register roles inside the loop:
#   %r14/%rbx/%rbp = h0/h1/h2,  %r11 = r0,  %r12 = r1,
#   %r13 = s1 = r1 + (r1>>2)   (folds the *5 of the modulus into the mul)
.type	poly1305_blocks,@function
.align	32
poly1305_blocks:
.cfi_startproc
.Lblocks:
	shrq	$4,%rdx			# number of 16-byte blocks
	jz	.Lno_data

	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lblocks_body:

	movq	%rdx,%r15		# block counter

	movq	24(%rdi),%r11		# r0
	movq	32(%rdi),%r13		# r1

	movq	0(%rdi),%r14		# h0
	movq	8(%rdi),%rbx		# h1
	movq	16(%rdi),%rbp		# h2

	movq	%r13,%r12
	shrq	$2,%r13
	movq	%r12,%rax		# %rax = r1 for first mulq
	addq	%r12,%r13		# s1 = r1 + (r1>>2)
	jmp	.Loop

.align	32
.Loop:
	addq	0(%rsi),%r14		# h += block
	adcq	8(%rsi),%rbx
	leaq	16(%rsi),%rsi
	adcq	%rcx,%rbp		# + padbit in bit 128

	mulq	%r14			# h0*r1
	movq	%rax,%r9
	movq	%r11,%rax
	movq	%rdx,%r10

	mulq	%r14			# h0*r0
	movq	%rax,%r14
	movq	%r11,%rax
	movq	%rdx,%r8

	mulq	%rbx			# h1*r0
	addq	%rax,%r9
	movq	%r13,%rax
	adcq	%rdx,%r10

	mulq	%rbx			# h1*s1
	movq	%rbp,%rbx
	addq	%rax,%r14
	adcq	%rdx,%r8

	imulq	%r13,%rbx		# h2*s1
	addq	%rbx,%r9
	movq	%r8,%rbx
	adcq	$0,%r10

	imulq	%r11,%rbp		# h2*r0
	addq	%r9,%rbx
	movq	$-4,%rax
	adcq	%rbp,%r10

	# partial reduction: fold bits >=130 back as *5 (= *4 + *1)
	andq	%r10,%rax
	movq	%r10,%rbp
	shrq	$2,%r10
	andq	$3,%rbp			# keep low 2 bits as new h2
	addq	%r10,%rax
	addq	%rax,%r14
	adcq	$0,%rbx
	adcq	$0,%rbp
	movq	%r12,%rax		# reload r1 for next iteration
	decq	%r15
	jnz	.Loop

	movq	%r14,0(%rdi)		# store h back
	movq	%rbx,8(%rdi)
	movq	%rbp,16(%rdi)

	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lno_data:
.Lblocks_epilogue:
	.byte	0xf3,0xc3		# "rep ret"
.cfi_endproc
.size	poly1305_blocks,.-poly1305_blocks

# void poly1305_emit(void *ctx=%rdi, u8 mac[16]=%rsi, const u32 nonce[4]=%rdx)
# Final reduction of base-2^64 h, addition of the nonce, and store of the
# 16-byte tag.  h+5 is computed speculatively; if that carries into bit 130
# (checked via shrq $2 of h2+carry) the reduced value is selected.
.type	poly1305_emit,@function
.align	32
poly1305_emit:
.cfi_startproc
.Lemit:
	movq	0(%rdi),%r8		# h0
	movq	8(%rdi),%r9		# h1
	movq	16(%rdi),%r10		# h2

	movq	%r8,%rax
	addq	$5,%r8			# h + 5
	movq	%r9,%rcx
	adcq	$0,%r9
	adcq	$0,%r10
	shrq	$2,%r10			# did h+5 reach 2^130?
	cmovnzq	%r8,%rax		# then h -= 2^130-5 (take h+5 mod 2^128)
	cmovnzq	%r9,%rcx

	addq	0(%rdx),%rax		# tag = (h + nonce) mod 2^128
	adcq	8(%rdx),%rcx
	movq	%rax,0(%rsi)
	movq	%rcx,8(%rsi)

	.byte	0xf3,0xc3		# "rep ret"
.cfi_endproc
.size	poly1305_emit,.-poly1305_emit

# Internal helper: one h = h*r (mod 2^130-5) step on the base-2^64
# representation.  Same register contract as the .Loop body above:
# in/out %r14/%rbx/%rbp = h0/h1/h2; %r11 = r0, %r13 = s1, %rax = r1 on
# entry; clobbers %r8-%r10, %rax, %rdx, flags.
.type	__poly1305_block,@function
.align	32
__poly1305_block:
.cfi_startproc
	mulq	%r14			# h0*r1
	movq	%rax,%r9
	movq	%r11,%rax
	movq	%rdx,%r10

	mulq	%r14			# h0*r0
	movq	%rax,%r14
	movq	%r11,%rax
	movq	%rdx,%r8

	mulq	%rbx			# h1*r0
	addq	%rax,%r9
	movq	%r13,%rax
	adcq	%rdx,%r10

	mulq	%rbx			# h1*s1
	movq	%rbp,%rbx
	addq	%rax,%r14
	adcq	%rdx,%r8

	imulq	%r13,%rbx		# h2*s1
	addq	%rbx,%r9
	movq	%r8,%rbx
	adcq	$0,%r10

	imulq	%r11,%rbp		# h2*r0
	addq	%r9,%rbx
	movq	$-4,%rax
	adcq	%rbp,%r10

	# partial reduction, as in poly1305_blocks
	andq	%r10,%rax
	movq	%r10,%rbp
	shrq	$2,%r10
	andq	$3,%rbp
	addq	%r10,%rax
	addq	%rax,%r14
	adcq	$0,%rbx
	adcq	$0,%rbp
	.byte	0xf3,0xc3		# "rep ret"
.cfi_endproc
.size	__poly1305_block,.-__poly1305_block

# Internal helper: computes r^2, r^3, r^4 via __poly1305_block and stores
# r^1..r^4 (and their 5* multiples, via leal (x,x,4)) as 26-bit limbs into
# the power table at 48+64(%rdi), interleaved for the SIMD paths.
# On entry %r11/%r12 = r0/r1 (with %rax = r1, %r13 = s1 as __poly1305_block
# expects).  0x3ffffff is the 26-bit limb mask throughout.
.type	__poly1305_init_avx,@function
.align	32
__poly1305_init_avx:
.cfi_startproc
	movq	%r11,%r14		# seed h = r, so h*r = r^2
	movq	%r12,%rbx
	xorq	%rbp,%rbp

	leaq	48+64(%rdi),%rdi	# table base

	movq	%r12,%rax
	call	__poly1305_block	# r^2

	# split r (in %r14:%r11 copies) and r^2 into 26-bit limbs, store
	movl	$0x3ffffff,%eax
	movl	$0x3ffffff,%edx
	movq	%r14,%r8
	andl	%r14d,%eax
	movq	%r11,%r9
	andl	%r11d,%edx
	movl	%eax,-64(%rdi)
	shrq	$26,%r8
	movl	%edx,-60(%rdi)
	shrq	$26,%r9

	movl	$0x3ffffff,%eax
	movl	$0x3ffffff,%edx
	andl	%r8d,%eax
	andl	%r9d,%edx
	movl	%eax,-48(%rdi)
	leal	(%rax,%rax,4),%eax	# 5*limb, for the reduction trick
	movl	%edx,-44(%rdi)
	leal	(%rdx,%rdx,4),%edx
	movl	%eax,-32(%rdi)
	shrq	$26,%r8
	movl	%edx,-28(%rdi)
	shrq	$26,%r9

	movq	%rbx,%rax
	movq	%r12,%rdx
	shlq	$12,%rax
	shlq	$12,%rdx
	orq	%r8,%rax
	orq	%r9,%rdx
	andl	$0x3ffffff,%eax
	andl	$0x3ffffff,%edx
	movl	%eax,-16(%rdi)
	leal	(%rax,%rax,4),%eax
	movl	%edx,-12(%rdi)
	leal	(%rdx,%rdx,4),%edx
	movl	%eax,0(%rdi)
	movq	%rbx,%r8
	movl	%edx,4(%rdi)
	movq	%r12,%r9

	movl	$0x3ffffff,%eax
	movl	$0x3ffffff,%edx
	shrq	$14,%r8
	shrq	$14,%r9
	andl	%r8d,%eax
	andl	%r9d,%edx
	movl	%eax,16(%rdi)
	leal	(%rax,%rax,4),%eax
	movl	%edx,20(%rdi)
	leal	(%rdx,%rdx,4),%edx
	movl	%eax,32(%rdi)
	shrq	$26,%r8
	movl	%edx,36(%rdi)
	shrq	$26,%r9

	movq	%rbp,%rax
	shlq	$24,%rax
	orq	%rax,%r8
	movl	%r8d,48(%rdi)
	leaq	(%r8,%r8,4),%r8
	movl	%r9d,52(%rdi)
	leaq	(%r9,%r9,4),%r9
	movl	%r8d,64(%rdi)
	movl	%r9d,68(%rdi)

	movq	%r12,%rax
	call	__poly1305_block	# r^3

	# store r^3 limbs (odd slots of the interleaved table)
	movl	$0x3ffffff,%eax
	movq	%r14,%r8
	andl	%r14d,%eax
	shrq	$26,%r8
	movl	%eax,-52(%rdi)

	movl	$0x3ffffff,%edx
	andl	%r8d,%edx
	movl	%edx,-36(%rdi)
	leal	(%rdx,%rdx,4),%edx
	shrq	$26,%r8
	movl	%edx,-20(%rdi)

	movq	%rbx,%rax
	shlq	$12,%rax
	orq	%r8,%rax
	andl	$0x3ffffff,%eax
	movl	%eax,-4(%rdi)
	leal	(%rax,%rax,4),%eax
	movq	%rbx,%r8
	movl	%eax,12(%rdi)

	movl	$0x3ffffff,%edx
	shrq	$14,%r8
	andl	%r8d,%edx
	movl	%edx,28(%rdi)
	leal	(%rdx,%rdx,4),%edx
	shrq	$26,%r8
	movl	%edx,44(%rdi)

	movq	%rbp,%rax
	shlq	$24,%rax
	orq	%rax,%r8
	movl	%r8d,60(%rdi)
	leaq	(%r8,%r8,4),%r8
	movl	%r8d,76(%rdi)

	movq	%r12,%rax
	call	__poly1305_block	# r^4

	# store r^4 limbs
	movl	$0x3ffffff,%eax
	movq	%r14,%r8
	andl	%r14d,%eax
	shrq	$26,%r8
	movl	%eax,-56(%rdi)

	movl	$0x3ffffff,%edx
	andl	%r8d,%edx
	movl	%edx,-40(%rdi)
	leal	(%rdx,%rdx,4),%edx
	shrq	$26,%r8
	movl	%edx,-24(%rdi)

	movq	%rbx,%rax
	shlq	$12,%rax
	orq	%r8,%rax
	andl	$0x3ffffff,%eax
	movl	%eax,-8(%rdi)
	leal	(%rax,%rax,4),%eax
	movq	%rbx,%r8
	movl	%eax,8(%rdi)

	movl	$0x3ffffff,%edx
	shrq	$14,%r8
	andl	%r8d,%edx
	movl	%edx,24(%rdi)
	leal	(%rdx,%rdx,4),%edx
	shrq	$26,%r8
	movl	%edx,40(%rdi)

	movq	%rbp,%rax
	shlq	$24,%rax
	orq	%rax,%r8
	movl	%r8d,56(%rdi)
	leaq	(%r8,%r8,4),%r8
	movl	%r8d,72(%rdi)

	leaq	-48-64(%rdi),%rdi	# restore ctx pointer
	.byte	0xf3,0xc3		# "rep ret"
.cfi_endproc
.size	__poly1305_init_avx,.-__poly1305_init_avx

# void poly1305_blocks_avx(ctx=%rdi, inp=%rsi, len=%rdx, padbit=%rcx)
# AVX path: processes the message two blocks per lane-pair in base 2^26
# (five 26-bit limbs), with lazy carry propagation between iterations.
# Falls back to the scalar .Lblocks for short inputs while h is still in
# base 2^64, and converts representations as needed.
.type	poly1305_blocks_avx,@function
.align	32
poly1305_blocks_avx:
.cfi_startproc
	movl	20(%rdi),%r8d		# is h in base 2^26 already?
	cmpq	$128,%rdx
	jae	.Lblocks_avx
	testl	%r8d,%r8d
	jz	.Lblocks		# short + base 2^64 -> scalar path

.Lblocks_avx:
	andq	$-16,%rdx		# round len down to whole blocks
	jz	.Lno_data_avx

	vzeroupper

	testl	%r8d,%r8d
	jz	.Lbase2_64_avx

	testq	$31,%rdx		# odd number of blocks?
	jz	.Leven_avx

	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lblocks_avx_body:

	movq	%rdx,%r15

	movq	0(%rdi),%r8		# load h as base-2^26 limb pairs
	movq	8(%rdi),%r9
	movl	16(%rdi),%ebp

	movq	24(%rdi),%r11		# r0
	movq	32(%rdi),%r13		# r1

	# convert h from base 2^26 (limbs at bits 0 and 32 of each word)
	# back to base 2^64 in %r14/%rbx/%rbp
	movl	%r8d,%r14d
	andq	$-2147483648,%r8
	movq	%r9,%r12
	movl	%r9d,%ebx
	andq	$-2147483648,%r9

	shrq	$6,%r8
	shlq	$52,%r12
	addq	%r8,%r14
	shrq	$12,%rbx
	shrq	$18,%r9
	addq	%r12,%r14
	adcq	%r9,%rbx

	movq	%rbp,%r8
	shlq	$40,%r8
	shrq	$24,%rbp
	addq	%r8,%rbx
	adcq	$0,%rbp

	# fold any stray top bits (lazy carries) back: *5 = *4 + *1
	movq	$-4,%r9
	movq	%rbp,%r8
	andq	%rbp,%r9
	shrq	$2,%r8
	andq	$3,%rbp
	addq	%r9,%r8
	addq	%r8,%r14
	adcq	$0,%rbx
	adcq	$0,%rbp

	movq	%r13,%r12
	movq	%r13,%rax
	shrq	$2,%r13
	addq	%r12,%r13		# s1 = r1 + (r1>>2)

	addq	0(%rsi),%r14		# absorb one block to even the count
	adcq	8(%rsi),%rbx
	leaq	16(%rsi),%rsi
	adcq	%rcx,%rbp

	call	__poly1305_block

	testq	%rcx,%rcx		# padbit==0 means final (store+exit)
	jz	.Lstore_base2_64_avx

	# convert h back to five 26-bit limbs
	movq	%r14,%rax
	movq	%r14,%rdx
	shrq	$52,%r14
	movq	%rbx,%r11
	movq	%rbx,%r12
	shrq	$26,%rdx
	andq	$0x3ffffff,%rax
	shlq	$12,%r11
	andq	$0x3ffffff,%rdx
	shrq	$14,%rbx
	orq	%r11,%r14
	shlq	$24,%rbp
	andq	$0x3ffffff,%r14
	shrq	$40,%r12
	andq	$0x3ffffff,%rbx
	orq	%r12,%rbp

	subq	$16,%r15
	jz	.Lstore_base2_26_avx

	vmovd	%eax,%xmm0		# h limbs -> vector registers
	vmovd	%edx,%xmm1
	vmovd	%r14d,%xmm2
	vmovd	%ebx,%xmm3
	vmovd	%ebp,%xmm4
	jmp	.Lproceed_avx

.align	32
.Lstore_base2_64_avx:
	movq	%r14,0(%rdi)
	movq	%rbx,8(%rdi)
	movq	%rbp,16(%rdi)
	jmp	.Ldone_avx

.align	16
.Lstore_base2_26_avx:
	movl	%eax,0(%rdi)
	movl	%edx,4(%rdi)
	movl	%r14d,8(%rdi)
	movl	%ebx,12(%rdi)
	movl	%ebp,16(%rdi)
.align	16
.Ldone_avx:
	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lno_data_avx:
.Lblocks_avx_epilogue:
	.byte	0xf3,0xc3		# "rep ret"
.cfi_endproc

# Entry taken while h is still in base 2^64: run scalar blocks until the
# remaining length is 32-aligned, build the power table, then continue in
# the vector loop.
.align	32
.Lbase2_64_avx:
.cfi_startproc
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lbase2_64_avx_body:

	movq	%rdx,%r15

	movq	24(%rdi),%r11		# r0
	movq	32(%rdi),%r13		# r1

	movq	0(%rdi),%r14		# h in base 2^64
	movq	8(%rdi),%rbx
	movl	16(%rdi),%ebp

	movq	%r13,%r12
	movq	%r13,%rax
	shrq	$2,%r13
	addq	%r12,%r13		# s1 = r1 + (r1>>2)

	testq	$31,%rdx
	jz	.Linit_avx

	addq	0(%rsi),%r14		# one scalar block to align
	adcq	8(%rsi),%rbx
	leaq	16(%rsi),%rsi
	adcq	%rcx,%rbp
	subq	$16,%r15

	call	__poly1305_block

.Linit_avx:

	# convert h to five 26-bit limbs
	movq	%r14,%rax
	movq	%r14,%rdx
	shrq	$52,%r14
	movq	%rbx,%r8
	movq	%rbx,%r9
	shrq	$26,%rdx
	andq	$0x3ffffff,%rax
	shlq	$12,%r8
	andq	$0x3ffffff,%rdx
	shrq	$14,%rbx
	orq	%r8,%r14
	shlq	$24,%rbp
	andq	$0x3ffffff,%r14
	shrq	$40,%r9
	andq	$0x3ffffff,%rbx
	orq	%r9,%rbp

	vmovd	%eax,%xmm0
	vmovd	%edx,%xmm1
	vmovd	%r14d,%xmm2
	vmovd	%ebx,%xmm3
	vmovd	%ebp,%xmm4
	movl	$1,20(%rdi)		# mark h as base 2^26

	call	__poly1305_init_avx

.Lproceed_avx:
	movq	%r15,%rdx		# remaining length

	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rax
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lbase2_64_avx_epilogue:
	jmp	.Ldo_avx
.cfi_endproc

.align	32
.Leven_avx:
.cfi_startproc
	vmovd	0(%rdi),%xmm0		# load h limbs into vectors
	vmovd	4(%rdi),%xmm1
	vmovd	8(%rdi),%xmm2
	vmovd	12(%rdi),%xmm3
	vmovd	16(%rdi),%xmm4

.Ldo_avx:
	leaq	-88(%rsp),%r11
.cfi_def_cfa	%r11,0x60
	subq	$0x178,%rsp		# frame for spilled limbs + r powers
	subq	$64,%rdx
	leaq	-32(%rsi),%rax
	cmovcq	%rax,%rsi		# <64 bytes left: rewind for tail

	vmovdqu	48(%rdi),%xmm14		# r^2 limb 0
	leaq	112(%rdi),%rdi		# advance into power table
	leaq	.Lconst(%rip),%rcx

	# load and split the next 32 bytes of input into 26-bit limbs;
	# 64(%rcx) is the 0x3ffffff mask, 32(%rcx) sets the 2^128 pad bit
	vmovdqu	32(%rsi),%xmm5
	vmovdqu	48(%rsi),%xmm6
	vmovdqa	64(%rcx),%xmm15

	vpsrldq	$6,%xmm5,%xmm7
	vpsrldq	$6,%xmm6,%xmm8
	vpunpckhqdq	%xmm6,%xmm5,%xmm9
	vpunpcklqdq	%xmm6,%xmm5,%xmm5
	vpunpcklqdq	%xmm8,%xmm7,%xmm8

	vpsrlq	$40,%xmm9,%xmm9
	vpsrlq	$26,%xmm5,%xmm6
	vpand	%xmm15,%xmm5,%xmm5
	vpsrlq	$4,%xmm8,%xmm7
	vpand	%xmm15,%xmm6,%xmm6
	vpsrlq	$30,%xmm8,%xmm8
	vpand	%xmm15,%xmm7,%xmm7
	vpand	%xmm15,%xmm8,%xmm8
	vpor	32(%rcx),%xmm9,%xmm9

	jbe	.Lskip_loop_avx

	# spread r^1..r^4 (and 5* multiples) from the table into the
	# frame: even powers at (%rsp), odd powers at negative %r11 offsets
	vmovdqu	-48(%rdi),%xmm11
	vmovdqu	-32(%rdi),%xmm12
	vpshufd	$0xEE,%xmm14,%xmm13
	vpshufd	$0x44,%xmm14,%xmm10
	vmovdqa	%xmm13,-144(%r11)
	vmovdqa	%xmm10,0(%rsp)
	vpshufd	$0xEE,%xmm11,%xmm14
	vmovdqu	-16(%rdi),%xmm10
	vpshufd	$0x44,%xmm11,%xmm11
	vmovdqa	%xmm14,-128(%r11)
	vmovdqa	%xmm11,16(%rsp)
	vpshufd	$0xEE,%xmm12,%xmm13
	vmovdqu	0(%rdi),%xmm11
	vpshufd	$0x44,%xmm12,%xmm12
	vmovdqa	%xmm13,-112(%r11)
	vmovdqa	%xmm12,32(%rsp)
	vpshufd	$0xEE,%xmm10,%xmm14
	vmovdqu	16(%rdi),%xmm12
	vpshufd	$0x44,%xmm10,%xmm10
	vmovdqa	%xmm14,-96(%r11)
	vmovdqa	%xmm10,48(%rsp)
	vpshufd	$0xEE,%xmm11,%xmm13
	vmovdqu	32(%rdi),%xmm10
	vpshufd	$0x44,%xmm11,%xmm11
	vmovdqa	%xmm13,-80(%r11)
	vmovdqa	%xmm11,64(%rsp)
	vpshufd	$0xEE,%xmm12,%xmm14
	vmovdqu	48(%rdi),%xmm11
	vpshufd	$0x44,%xmm12,%xmm12
	vmovdqa	%xmm14,-64(%r11)
	vmovdqa	%xmm12,80(%rsp)
	vpshufd	$0xEE,%xmm10,%xmm13
	vmovdqu	64(%rdi),%xmm12
	vpshufd	$0x44,%xmm10,%xmm10
	vmovdqa	%xmm13,-48(%r11)
	vmovdqa	%xmm10,96(%rsp)
	vpshufd	$0xEE,%xmm11,%xmm14
	vpshufd	$0x44,%xmm11,%xmm11
	vmovdqa	%xmm14,-32(%r11)
	vmovdqa	%xmm11,112(%rsp)
	vpshufd	$0xEE,%xmm12,%xmm13
	vmovdqa	0(%rsp),%xmm14
	vpshufd	$0x44,%xmm12,%xmm12
	vmovdqa	%xmm13,-16(%r11)
	vmovdqa	%xmm12,128(%rsp)

	jmp	.Loop_avx

.align	32
.Loop_avx:
	# main 2-way loop: multiply the (older) pair of blocks by r^2 while
	# the interleaved code loads/splits the next pair, multiplies it by
	# the lower powers, accumulates into d0..d4 (%xmm10-%xmm14), and
	# performs the lazy carry pass at the bottom.
	vpmuludq	%xmm5,%xmm14,%xmm10
	vpmuludq	%xmm6,%xmm14,%xmm11
	vmovdqa	%xmm2,32(%r11)
	vpmuludq	%xmm7,%xmm14,%xmm12
	vmovdqa	16(%rsp),%xmm2
	vpmuludq	%xmm8,%xmm14,%xmm13
	vpmuludq	%xmm9,%xmm14,%xmm14

	vmovdqa	%xmm0,0(%r11)
	vpmuludq	32(%rsp),%xmm9,%xmm0
	vmovdqa	%xmm1,16(%r11)
	vpmuludq	%xmm8,%xmm2,%xmm1
	vpaddq	%xmm0,%xmm10,%xmm10
	vpaddq	%xmm1,%xmm14,%xmm14
	vmovdqa	%xmm3,48(%r11)
	vpmuludq	%xmm7,%xmm2,%xmm0
	vpmuludq	%xmm6,%xmm2,%xmm1
	vpaddq	%xmm0,%xmm13,%xmm13
	vmovdqa	48(%rsp),%xmm3
	vpaddq	%xmm1,%xmm12,%xmm12
	vmovdqa	%xmm4,64(%r11)
	vpmuludq	%xmm5,%xmm2,%xmm2
	vpmuludq	%xmm7,%xmm3,%xmm0
	vpaddq	%xmm2,%xmm11,%xmm11

	vmovdqa	64(%rsp),%xmm4
	vpaddq	%xmm0,%xmm14,%xmm14
	vpmuludq	%xmm6,%xmm3,%xmm1
	vpmuludq	%xmm5,%xmm3,%xmm3
	vpaddq	%xmm1,%xmm13,%xmm13
	vmovdqa	80(%rsp),%xmm2
	vpaddq	%xmm3,%xmm12,%xmm12
	vpmuludq	%xmm9,%xmm4,%xmm0
	vpmuludq	%xmm8,%xmm4,%xmm4
	vpaddq	%xmm0,%xmm11,%xmm11
	vmovdqa	96(%rsp),%xmm3
	vpaddq	%xmm4,%xmm10,%xmm10

	vmovdqa	128(%rsp),%xmm4
	vpmuludq	%xmm6,%xmm2,%xmm1
	vpmuludq	%xmm5,%xmm2,%xmm2
	vpaddq	%xmm1,%xmm14,%xmm14
	vpaddq	%xmm2,%xmm13,%xmm13
	vpmuludq	%xmm9,%xmm3,%xmm0
	vpmuludq	%xmm8,%xmm3,%xmm1
	vpaddq	%xmm0,%xmm12,%xmm12
	vmovdqu	0(%rsi),%xmm0		# prefetch next input pair
	vpaddq	%xmm1,%xmm11,%xmm11
	vpmuludq	%xmm7,%xmm3,%xmm3
	vpmuludq	%xmm7,%xmm4,%xmm7
	vpaddq	%xmm3,%xmm10,%xmm10

	vmovdqu	16(%rsi),%xmm1
	vpaddq	%xmm7,%xmm11,%xmm11
	vpmuludq	%xmm8,%xmm4,%xmm8
	vpmuludq	%xmm9,%xmm4,%xmm9
	vpsrldq	$6,%xmm0,%xmm2
	vpaddq	%xmm8,%xmm12,%xmm12
	vpaddq	%xmm9,%xmm13,%xmm13
	vpsrldq	$6,%xmm1,%xmm3
	vpmuludq	112(%rsp),%xmm5,%xmm9
	vpmuludq	%xmm6,%xmm4,%xmm5
	vpunpckhqdq	%xmm1,%xmm0,%xmm4
	vpaddq	%xmm9,%xmm14,%xmm14
	vmovdqa	-144(%r11),%xmm9
	vpaddq	%xmm5,%xmm10,%xmm10

	vpunpcklqdq	%xmm1,%xmm0,%xmm0
	vpunpcklqdq	%xmm3,%xmm2,%xmm3

	# split new input into 26-bit limbs, set the pad bit
	vpsrldq	$5,%xmm4,%xmm4
	vpsrlq	$26,%xmm0,%xmm1
	vpand	%xmm15,%xmm0,%xmm0
	vpsrlq	$4,%xmm3,%xmm2
	vpand	%xmm15,%xmm1,%xmm1
	vpand	0(%rcx),%xmm4,%xmm4
	vpsrlq	$30,%xmm3,%xmm3
	vpand	%xmm15,%xmm2,%xmm2
	vpand	%xmm15,%xmm3,%xmm3
	vpor	32(%rcx),%xmm4,%xmm4

	vpaddq	0(%r11),%xmm0,%xmm0	# add spilled h limbs
	vpaddq	16(%r11),%xmm1,%xmm1
	vpaddq	32(%r11),%xmm2,%xmm2
	vpaddq	48(%r11),%xmm3,%xmm3
	vpaddq	64(%r11),%xmm4,%xmm4

	leaq	32(%rsi),%rax
	leaq	64(%rsi),%rsi
	subq	$64,%rdx
	cmovcq	%rax,%rsi		# tail: re-read last 32 bytes

	# multiply (h+new) by r^4/r^3 and accumulate
	vpmuludq	%xmm0,%xmm9,%xmm5
	vpmuludq	%xmm1,%xmm9,%xmm6
	vpaddq	%xmm5,%xmm10,%xmm10
	vpaddq	%xmm6,%xmm11,%xmm11
	vmovdqa	-128(%r11),%xmm7
	vpmuludq	%xmm2,%xmm9,%xmm5
	vpmuludq	%xmm3,%xmm9,%xmm6
	vpaddq	%xmm5,%xmm12,%xmm12
	vpaddq	%xmm6,%xmm13,%xmm13
	vpmuludq	%xmm4,%xmm9,%xmm9
	vpmuludq	-112(%r11),%xmm4,%xmm5
	vpaddq	%xmm9,%xmm14,%xmm14

	vpaddq	%xmm5,%xmm10,%xmm10
	vpmuludq	%xmm2,%xmm7,%xmm6
	vpmuludq	%xmm3,%xmm7,%xmm5
	vpaddq	%xmm6,%xmm13,%xmm13
	vmovdqa	-96(%r11),%xmm8
	vpaddq	%xmm5,%xmm14,%xmm14
	vpmuludq	%xmm1,%xmm7,%xmm6
	vpmuludq	%xmm0,%xmm7,%xmm7
	vpaddq	%xmm6,%xmm12,%xmm12
	vpaddq	%xmm7,%xmm11,%xmm11

	vmovdqa	-80(%r11),%xmm9
	vpmuludq	%xmm2,%xmm8,%xmm5
	vpmuludq	%xmm1,%xmm8,%xmm6
	vpaddq	%xmm5,%xmm14,%xmm14
	vpaddq	%xmm6,%xmm13,%xmm13
	vmovdqa	-64(%r11),%xmm7
	vpmuludq	%xmm0,%xmm8,%xmm8
	vpmuludq	%xmm4,%xmm9,%xmm5
	vpaddq	%xmm8,%xmm12,%xmm12
	vpaddq	%xmm5,%xmm11,%xmm11
	vmovdqa	-48(%r11),%xmm8
	vpmuludq	%xmm3,%xmm9,%xmm9
	vpmuludq	%xmm1,%xmm7,%xmm6
	vpaddq	%xmm9,%xmm10,%xmm10

	vmovdqa	-16(%r11),%xmm9
	vpaddq	%xmm6,%xmm14,%xmm14
	vpmuludq	%xmm0,%xmm7,%xmm7
	vpmuludq	%xmm4,%xmm8,%xmm5
	vpaddq	%xmm7,%xmm13,%xmm13
	vpaddq	%xmm5,%xmm12,%xmm12
	vmovdqu	32(%rsi),%xmm5		# prefetch following pair
	vpmuludq	%xmm3,%xmm8,%xmm7
	vpmuludq	%xmm2,%xmm8,%xmm8
	vpaddq	%xmm7,%xmm11,%xmm11
	vmovdqu	48(%rsi),%xmm6
	vpaddq	%xmm8,%xmm10,%xmm10

	vpmuludq	%xmm2,%xmm9,%xmm2
	vpmuludq	%xmm3,%xmm9,%xmm3
	vpsrldq	$6,%xmm5,%xmm7
	vpaddq	%xmm2,%xmm11,%xmm11
	vpmuludq	%xmm4,%xmm9,%xmm4
	vpsrldq	$6,%xmm6,%xmm8
	vpaddq	%xmm3,%xmm12,%xmm2
	vpaddq	%xmm4,%xmm13,%xmm3
	vpmuludq	-32(%r11),%xmm0,%xmm4
	vpmuludq	%xmm1,%xmm9,%xmm0
	vpunpckhqdq	%xmm6,%xmm5,%xmm9
	vpaddq	%xmm4,%xmm14,%xmm4
	vpaddq	%xmm0,%xmm10,%xmm0

	vpunpcklqdq	%xmm6,%xmm5,%xmm5
	vpunpcklqdq	%xmm8,%xmm7,%xmm8

	# split the prefetched pair into limbs
	vpsrldq	$5,%xmm9,%xmm9
	vpsrlq	$26,%xmm5,%xmm6
	vmovdqa	0(%rsp),%xmm14
	vpand	%xmm15,%xmm5,%xmm5
	vpsrlq	$4,%xmm8,%xmm7
	vpand	%xmm15,%xmm6,%xmm6
	vpand	0(%rcx),%xmm9,%xmm9
	vpsrlq	$30,%xmm8,%xmm8
	vpand	%xmm15,%xmm7,%xmm7
	vpand	%xmm15,%xmm8,%xmm8
	vpor	32(%rcx),%xmm9,%xmm9

	# lazy carry propagation, interleaved order; overflow out of the
	# top limb is folded back into limb 0 as *5 (shift-left-2 + add)
	vpsrlq	$26,%xmm3,%xmm13
	vpand	%xmm15,%xmm3,%xmm3
	vpaddq	%xmm13,%xmm4,%xmm4

	vpsrlq	$26,%xmm0,%xmm10
	vpand	%xmm15,%xmm0,%xmm0
	vpaddq	%xmm10,%xmm11,%xmm1

	vpsrlq	$26,%xmm4,%xmm10
	vpand	%xmm15,%xmm4,%xmm4

	vpsrlq	$26,%xmm1,%xmm11
	vpand	%xmm15,%xmm1,%xmm1
	vpaddq	%xmm11,%xmm2,%xmm2

	vpaddq	%xmm10,%xmm0,%xmm0
	vpsllq	$2,%xmm10,%xmm10
	vpaddq	%xmm10,%xmm0,%xmm0

	vpsrlq	$26,%xmm2,%xmm12
	vpand	%xmm15,%xmm2,%xmm2
	vpaddq	%xmm12,%xmm3,%xmm3

	vpsrlq	$26,%xmm0,%xmm10
	vpand	%xmm15,%xmm0,%xmm0
	vpaddq	%xmm10,%xmm1,%xmm1

	vpsrlq	$26,%xmm3,%xmm13
	vpand	%xmm15,%xmm3,%xmm3
	vpaddq	%xmm13,%xmm4,%xmm4

	ja	.Loop_avx

.Lskip_loop_avx:
	# tail: multiply the remaining one or two block pairs by the
	# appropriate powers of r and fold the lanes together
	vpshufd	$0x10,%xmm14,%xmm14
	addq	$32,%rdx
	jnz	.Long_tail_avx

	vpaddq	%xmm2,%xmm7,%xmm7
	vpaddq	%xmm0,%xmm5,%xmm5
	vpaddq	%xmm1,%xmm6,%xmm6
	vpaddq	%xmm3,%xmm8,%xmm8
	vpaddq	%xmm4,%xmm9,%xmm9

.Long_tail_avx:
	vmovdqa	%xmm2,32(%r11)
	vmovdqa	%xmm0,0(%r11)
	vmovdqa	%xmm1,16(%r11)
	vmovdqa	%xmm3,48(%r11)
	vmovdqa	%xmm4,64(%r11)

	vpmuludq	%xmm7,%xmm14,%xmm12
	vpmuludq	%xmm5,%xmm14,%xmm10
	vpshufd	$0x10,-48(%rdi),%xmm2
	vpmuludq	%xmm6,%xmm14,%xmm11
	vpmuludq	%xmm8,%xmm14,%xmm13
	vpmuludq	%xmm9,%xmm14,%xmm14

	vpmuludq	%xmm8,%xmm2,%xmm0
	vpaddq	%xmm0,%xmm14,%xmm14
	vpshufd	$0x10,-32(%rdi),%xmm3
	vpmuludq	%xmm7,%xmm2,%xmm1
	vpaddq	%xmm1,%xmm13,%xmm13
	vpshufd	$0x10,-16(%rdi),%xmm4
	vpmuludq	%xmm6,%xmm2,%xmm0
	vpaddq	%xmm0,%xmm12,%xmm12
	vpmuludq	%xmm5,%xmm2,%xmm2
	vpaddq	%xmm2,%xmm11,%xmm11
	vpmuludq	%xmm9,%xmm3,%xmm3
	vpaddq	%xmm3,%xmm10,%xmm10

	vpshufd	$0x10,0(%rdi),%xmm2
	vpmuludq	%xmm7,%xmm4,%xmm1
	vpaddq	%xmm1,%xmm14,%xmm14
	vpmuludq	%xmm6,%xmm4,%xmm0
	vpaddq	%xmm0,%xmm13,%xmm13
	vpshufd	$0x10,16(%rdi),%xmm3
	vpmuludq	%xmm5,%xmm4,%xmm4
	vpaddq	%xmm4,%xmm12,%xmm12
	vpmuludq	%xmm9,%xmm2,%xmm1
	vpaddq	%xmm1,%xmm11,%xmm11
	vpshufd	$0x10,32(%rdi),%xmm4
	vpmuludq	%xmm8,%xmm2,%xmm2
	vpaddq	%xmm2,%xmm10,%xmm10

	vpmuludq	%xmm6,%xmm3,%xmm0
	vpaddq	%xmm0,%xmm14,%xmm14
	vpmuludq	%xmm5,%xmm3,%xmm3
	vpaddq	%xmm3,%xmm13,%xmm13
	vpshufd	$0x10,48(%rdi),%xmm2
	vpmuludq	%xmm9,%xmm4,%xmm1
	vpaddq	%xmm1,%xmm12,%xmm12
	vpshufd	$0x10,64(%rdi),%xmm3
	vpmuludq	%xmm8,%xmm4,%xmm0
	vpaddq	%xmm0,%xmm11,%xmm11
	vpmuludq	%xmm7,%xmm4,%xmm4
	vpaddq	%xmm4,%xmm10,%xmm10

	vpmuludq	%xmm5,%xmm2,%xmm2
	vpaddq	%xmm2,%xmm14,%xmm14
	vpmuludq	%xmm9,%xmm3,%xmm1
	vpaddq	%xmm1,%xmm13,%xmm13
	vpmuludq	%xmm8,%xmm3,%xmm0
	vpaddq	%xmm0,%xmm12,%xmm12
	vpmuludq	%xmm7,%xmm3,%xmm1
	vpaddq	%xmm1,%xmm11,%xmm11
	vpmuludq	%xmm6,%xmm3,%xmm3
	vpaddq	%xmm3,%xmm10,%xmm10

	jz	.Lshort_tail_avx

	# one more pair of blocks remains: load, split, accumulate, and
	# multiply by r^2/r (vpshufd $0x32 selects the proper table halves)
	vmovdqu	0(%rsi),%xmm0
	vmovdqu	16(%rsi),%xmm1

	vpsrldq	$6,%xmm0,%xmm2
	vpsrldq	$6,%xmm1,%xmm3
	vpunpckhqdq	%xmm1,%xmm0,%xmm4
	vpunpcklqdq	%xmm1,%xmm0,%xmm0
	vpunpcklqdq	%xmm3,%xmm2,%xmm3

	vpsrlq	$40,%xmm4,%xmm4
	vpsrlq	$26,%xmm0,%xmm1
	vpand	%xmm15,%xmm0,%xmm0
	vpsrlq	$4,%xmm3,%xmm2
	vpand	%xmm15,%xmm1,%xmm1
	vpsrlq	$30,%xmm3,%xmm3
	vpand	%xmm15,%xmm2,%xmm2
	vpand	%xmm15,%xmm3,%xmm3
	vpor	32(%rcx),%xmm4,%xmm4

	vpshufd	$0x32,-64(%rdi),%xmm9
	vpaddq	0(%r11),%xmm0,%xmm0
	vpaddq	16(%r11),%xmm1,%xmm1
	vpaddq	32(%r11),%xmm2,%xmm2
	vpaddq	48(%r11),%xmm3,%xmm3
	vpaddq	64(%r11),%xmm4,%xmm4

	vpmuludq	%xmm0,%xmm9,%xmm5
	vpaddq	%xmm5,%xmm10,%xmm10
	vpmuludq	%xmm1,%xmm9,%xmm6
	vpaddq	%xmm6,%xmm11,%xmm11
	vpmuludq	%xmm2,%xmm9,%xmm5
	vpaddq	%xmm5,%xmm12,%xmm12
	vpshufd	$0x32,-48(%rdi),%xmm7
	vpmuludq	%xmm3,%xmm9,%xmm6
	vpaddq	%xmm6,%xmm13,%xmm13
	vpmuludq	%xmm4,%xmm9,%xmm9
	vpaddq	%xmm9,%xmm14,%xmm14

	vpmuludq	%xmm3,%xmm7,%xmm5
	vpaddq	%xmm5,%xmm14,%xmm14
	vpshufd	$0x32,-32(%rdi),%xmm8
	vpmuludq	%xmm2,%xmm7,%xmm6
	vpaddq	%xmm6,%xmm13,%xmm13
	vpshufd	$0x32,-16(%rdi),%xmm9
	vpmuludq	%xmm1,%xmm7,%xmm5
	vpaddq	%xmm5,%xmm12,%xmm12
	vpmuludq	%xmm0,%xmm7,%xmm7
	vpaddq	%xmm7,%xmm11,%xmm11
	vpmuludq	%xmm4,%xmm8,%xmm8
	vpaddq	%xmm8,%xmm10,%xmm10

	vpshufd	$0x32,0(%rdi),%xmm7
	vpmuludq	%xmm2,%xmm9,%xmm6
	vpaddq	%xmm6,%xmm14,%xmm14
	vpmuludq	%xmm1,%xmm9,%xmm5
	vpaddq	%xmm5,%xmm13,%xmm13
	vpshufd	$0x32,16(%rdi),%xmm8
	vpmuludq	%xmm0,%xmm9,%xmm9
	vpaddq	%xmm9,%xmm12,%xmm12
	vpmuludq	%xmm4,%xmm7,%xmm6
	vpaddq	%xmm6,%xmm11,%xmm11
	vpshufd	$0x32,32(%rdi),%xmm9
	vpmuludq	%xmm3,%xmm7,%xmm7
	vpaddq	%xmm7,%xmm10,%xmm10

	vpmuludq	%xmm1,%xmm8,%xmm5
	vpaddq	%xmm5,%xmm14,%xmm14
	vpmuludq	%xmm0,%xmm8,%xmm8
	vpaddq	%xmm8,%xmm13,%xmm13
	vpshufd	$0x32,48(%rdi),%xmm7
	vpmuludq	%xmm4,%xmm9,%xmm6
	vpaddq	%xmm6,%xmm12,%xmm12
	vpshufd	$0x32,64(%rdi),%xmm8
	vpmuludq	%xmm3,%xmm9,%xmm5
	vpaddq	%xmm5,%xmm11,%xmm11
	vpmuludq	%xmm2,%xmm9,%xmm9
	vpaddq	%xmm9,%xmm10,%xmm10

	vpmuludq	%xmm0,%xmm7,%xmm7
	vpaddq	%xmm7,%xmm14,%xmm14
	vpmuludq	%xmm4,%xmm8,%xmm6
	vpaddq	%xmm6,%xmm13,%xmm13
	vpmuludq	%xmm3,%xmm8,%xmm5
	vpaddq	%xmm5,%xmm12,%xmm12
	vpmuludq	%xmm2,%xmm8,%xmm6
	vpaddq	%xmm6,%xmm11,%xmm11
	vpmuludq	%xmm1,%xmm8,%xmm8
	vpaddq	%xmm8,%xmm10,%xmm10

.Lshort_tail_avx:
	# horizontal add of the two 64-bit lanes of each accumulator
	vpsrldq	$8,%xmm14,%xmm9
	vpsrldq	$8,%xmm13,%xmm8
	vpsrldq	$8,%xmm11,%xmm6
	vpsrldq	$8,%xmm10,%xmm5
	vpsrldq	$8,%xmm12,%xmm7
	vpaddq	%xmm8,%xmm13,%xmm13
	vpaddq	%xmm9,%xmm14,%xmm14
	vpaddq	%xmm5,%xmm10,%xmm10
	vpaddq	%xmm6,%xmm11,%xmm11
	vpaddq	%xmm7,%xmm12,%xmm12

	# final lazy-carry pass over the five limbs
	vpsrlq	$26,%xmm13,%xmm3
	vpand	%xmm15,%xmm13,%xmm13
	vpaddq	%xmm3,%xmm14,%xmm14

	vpsrlq	$26,%xmm10,%xmm0
	vpand	%xmm15,%xmm10,%xmm10
	vpaddq	%xmm0,%xmm11,%xmm11

	vpsrlq	$26,%xmm14,%xmm4
	vpand	%xmm15,%xmm14,%xmm14

	vpsrlq	$26,%xmm11,%xmm1
	vpand	%xmm15,%xmm11,%xmm11
	vpaddq	%xmm1,%xmm12,%xmm12

	vpaddq	%xmm4,%xmm10,%xmm10
	vpsllq	$2,%xmm4,%xmm4
	vpaddq	%xmm4,%xmm10,%xmm10	# top-limb carry *5 into limb 0

	vpsrlq	$26,%xmm12,%xmm2
	vpand	%xmm15,%xmm12,%xmm12
	vpaddq	%xmm2,%xmm13,%xmm13

	vpsrlq	$26,%xmm10,%xmm0
	vpand	%xmm15,%xmm10,%xmm10
	vpaddq	%xmm0,%xmm11,%xmm11

	vpsrlq	$26,%xmm13,%xmm3
	vpand	%xmm15,%xmm13,%xmm13
	vpaddq	%xmm3,%xmm14,%xmm14

	vmovd	%xmm10,-112(%rdi)	# store h limbs (ctx; %rdi advanced)
	vmovd	%xmm11,-108(%rdi)
	vmovd	%xmm12,-104(%rdi)
	vmovd	%xmm13,-100(%rdi)
	vmovd	%xmm14,-96(%rdi)
	leaq	88(%r11),%rsp
.cfi_def_cfa	%rsp,8
	vzeroupper
	.byte	0xf3,0xc3		# "rep ret"
.cfi_endproc
.size	poly1305_blocks_avx,.-poly1305_blocks_avx

# void poly1305_emit_avx(ctx=%rdi, mac=%rsi, nonce=%rdx)
# Like poly1305_emit, but first converts h from base 2^26 back to
# base 2^64 (when 20(%rdi) says the AVX paths were used), including a
# fold of any lazy top-limb carry (*5 = *4 + *1).
.type	poly1305_emit_avx,@function
.align	32
poly1305_emit_avx:
.cfi_startproc
	cmpl	$0,20(%rdi)		# still base 2^64? use scalar emit
	je	.Lemit

	movl	0(%rdi),%eax		# five 26-bit limbs
	movl	4(%rdi),%ecx
	movl	8(%rdi),%r8d
	movl	12(%rdi),%r11d
	movl	16(%rdi),%r10d

	shlq	$26,%rcx		# recombine limbs 0..2 into h0
	movq	%r8,%r9
	shlq	$52,%r8
	addq	%rcx,%rax
	shrq	$12,%r9
	addq	%rax,%r8
	adcq	$0,%r9

	shlq	$14,%r11		# limbs 3..4 into h1/h2
	movq	%r10,%rax
	shrq	$24,%r10
	addq	%r11,%r9
	shlq	$40,%rax
	addq	%rax,%r9
	adcq	$0,%r10

	# fold lazy carry above bit 130 back as *5
	movq	%r10,%rax
	movq	%r10,%rcx
	andq	$3,%r10
	shrq	$2,%rax
	andq	$-4,%rcx
	addq	%rcx,%rax
	addq	%rax,%r8
	adcq	$0,%r9
	adcq	$0,%r10

	movq	%r8,%rax		# same final reduction as .Lemit
	addq	$5,%r8
	movq	%r9,%rcx
	adcq	$0,%r9
	adcq	$0,%r10
	shrq	$2,%r10
	cmovnzq	%r8,%rax
	cmovnzq	%r9,%rcx

	addq	0(%rdx),%rax		# tag = (h + nonce) mod 2^128
	adcq	8(%rdx),%rcx
	movq	%rax,0(%rsi)
	movq	%rcx,8(%rsi)

	.byte	0xf3,0xc3		# "rep ret"
.cfi_endproc
.size	poly1305_emit_avx,.-poly1305_emit_avx

# void poly1305_blocks_avx2(ctx=%rdi, inp=%rsi, len=%rdx, padbit=%rcx)
# AVX2 path: four blocks per iteration in base 2^26 using 256-bit
# registers.  Structure mirrors poly1305_blocks_avx.
# NOTE(review): this function continues beyond the end of this chunk;
# everything from the final dangling "vpaddq" onward lives in the next
# chunk and is intentionally left untouched here.
.type	poly1305_blocks_avx2,@function
.align	32
poly1305_blocks_avx2:
.cfi_startproc
	movl	20(%rdi),%r8d		# is h in base 2^26 already?
	cmpq	$128,%rdx
	jae	.Lblocks_avx2
	testl	%r8d,%r8d
	jz	.Lblocks		# short + base 2^64 -> scalar path

.Lblocks_avx2:
	andq	$-16,%rdx
	jz	.Lno_data_avx2

	vzeroupper

	testl	%r8d,%r8d
	jz	.Lbase2_64_avx2

	testq	$63,%rdx		# multiple of 4 blocks?
	jz	.Leven_avx2

	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lblocks_avx2_body:

	movq	%rdx,%r15

	movq	0(%rdi),%r8
	movq	8(%rdi),%r9
	movl	16(%rdi),%ebp

	movq	24(%rdi),%r11		# r0
	movq	32(%rdi),%r13		# r1

	# convert h from base 2^26 to base 2^64 (as in the AVX path)
	movl	%r8d,%r14d
	andq	$-2147483648,%r8
	movq	%r9,%r12
	movl	%r9d,%ebx
	andq	$-2147483648,%r9

	shrq	$6,%r8
	shlq	$52,%r12
	addq	%r8,%r14
	shrq	$12,%rbx
	shrq	$18,%r9
	addq	%r12,%r14
	adcq	%r9,%rbx

	movq	%rbp,%r8
	shlq	$40,%r8
	shrq	$24,%rbp
	addq	%r8,%rbx
	adcq	$0,%rbp

	movq	$-4,%r9
	movq	%rbp,%r8
	andq	%rbp,%r9
	shrq	$2,%r8
	andq	$3,%rbp
	addq	%r9,%r8
	addq	%r8,%r14
	adcq	$0,%rbx
	adcq	$0,%rbp

	movq	%r13,%r12
	movq	%r13,%rax
	shrq	$2,%r13
	addq	%r12,%r13		# s1 = r1 + (r1>>2)

.Lbase2_26_pre_avx2:
	# scalar blocks until the remaining length is 64-aligned
	addq	0(%rsi),%r14
	adcq	8(%rsi),%rbx
	leaq	16(%rsi),%rsi
	adcq	%rcx,%rbp
	subq	$16,%r15

	call	__poly1305_block
	movq	%r12,%rax

	testq	$63,%r15
	jnz	.Lbase2_26_pre_avx2

	testq	%rcx,%rcx		# padbit==0 means final
	jz	.Lstore_base2_64_avx2

	# convert h back to five 26-bit limbs
	movq	%r14,%rax
	movq	%r14,%rdx
	shrq	$52,%r14
	movq	%rbx,%r11
	movq	%rbx,%r12
	shrq	$26,%rdx
	andq	$0x3ffffff,%rax
	shlq	$12,%r11
	andq	$0x3ffffff,%rdx
	shrq	$14,%rbx
	orq	%r11,%r14
	shlq	$24,%rbp
	andq	$0x3ffffff,%r14
	shrq	$40,%r12
	andq	$0x3ffffff,%rbx
	orq	%r12,%rbp

	testq	%r15,%r15
	jz	.Lstore_base2_26_avx2

	vmovd	%eax,%xmm0
	vmovd	%edx,%xmm1
	vmovd	%r14d,%xmm2
	vmovd	%ebx,%xmm3
	vmovd	%ebp,%xmm4
	jmp	.Lproceed_avx2

.align	32
.Lstore_base2_64_avx2:
	movq	%r14,0(%rdi)
	movq	%rbx,8(%rdi)
	movq	%rbp,16(%rdi)
	jmp	.Ldone_avx2

.align	16
.Lstore_base2_26_avx2:
	movl	%eax,0(%rdi)
	movl	%edx,4(%rdi)
	movl	%r14d,8(%rdi)
	movl	%ebx,12(%rdi)
	movl	%ebp,16(%rdi)
.align	16
.Ldone_avx2:
	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lno_data_avx2:
.Lblocks_avx2_epilogue:
	.byte	0xf3,0xc3		# "rep ret"
.cfi_endproc

.align	32
.Lbase2_64_avx2:
.cfi_startproc
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lbase2_64_avx2_body:

	movq	%rdx,%r15

	movq	24(%rdi),%r11		# r0
	movq	32(%rdi),%r13		# r1

	movq	0(%rdi),%r14		# h in base 2^64
	movq	8(%rdi),%rbx
	movl	16(%rdi),%ebp

	movq	%r13,%r12
	movq	%r13,%rax
	shrq	$2,%r13
	addq	%r12,%r13		# s1 = r1 + (r1>>2)

	testq	$63,%rdx
	jz	.Linit_avx2

.Lbase2_64_pre_avx2:
	addq	0(%rsi),%r14		# scalar blocks until 64-aligned
	adcq	8(%rsi),%rbx
	leaq	16(%rsi),%rsi
	adcq	%rcx,%rbp
	subq	$16,%r15

	call	__poly1305_block
	movq	%r12,%rax

	testq	$63,%r15
	jnz	.Lbase2_64_pre_avx2

.Linit_avx2:

	# convert h to five 26-bit limbs
	movq	%r14,%rax
	movq	%r14,%rdx
	shrq	$52,%r14
	movq	%rbx,%r8
	movq	%rbx,%r9
	shrq	$26,%rdx
	andq	$0x3ffffff,%rax
	shlq	$12,%r8
	andq	$0x3ffffff,%rdx
	shrq	$14,%rbx
	orq	%r8,%r14
	shlq	$24,%rbp
	andq	$0x3ffffff,%r14
	shrq	$40,%r9
	andq	$0x3ffffff,%rbx
	orq	%r9,%rbp

	vmovd	%eax,%xmm0
	vmovd	%edx,%xmm1
	vmovd	%r14d,%xmm2
	vmovd	%ebx,%xmm3
	vmovd	%ebp,%xmm4
	movl	$1,20(%rdi)		# mark h as base 2^26

	call	__poly1305_init_avx

.Lproceed_avx2:
	movq	%r15,%rdx		# remaining length
	movl	OPENSSL_ia32cap_P+8(%rip),%r10d	# feature bits (3rd dword)
	movl	$3221291008,%r11d	# 0xc0020000 mask, used past this chunk

	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rax
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lbase2_64_avx2_epilogue:
	jmp	.Ldo_avx2
.cfi_endproc

.align	32
.Leven_avx2:
.cfi_startproc
	movl	OPENSSL_ia32cap_P+8(%rip),%r10d
	vmovd	0(%rdi),%xmm0		# load h limbs into vectors
	vmovd	4(%rdi),%xmm1
	vmovd	8(%rdi),%xmm2
	vmovd	12(%rdi),%xmm3
	vmovd	16(%rdi),%xmm4

.Ldo_avx2:
	leaq	-8(%rsp),%r11
.cfi_def_cfa	%r11,16
	subq	$0x128,%rsp
	leaq	.Lconst(%rip),%rcx
	leaq	48+64(%rdi),%rdi	# power table base
	vmovdqa	96(%rcx),%ymm7		# permutation for vpermd below

	# broadcast r^1..r^4 limbs into the 512-byte-aligned frame
	vmovdqu	-64(%rdi),%xmm9
	andq	$-512,%rsp
	vmovdqu	-48(%rdi),%xmm10
	vmovdqu	-32(%rdi),%xmm6
	vmovdqu	-16(%rdi),%xmm11
	vmovdqu	0(%rdi),%xmm12
	vmovdqu	16(%rdi),%xmm13
	leaq	144(%rsp),%rax
	vmovdqu	32(%rdi),%xmm14
	vpermd	%ymm9,%ymm7,%ymm9
	vmovdqu	48(%rdi),%xmm15
	vpermd	%ymm10,%ymm7,%ymm10
	vmovdqu	64(%rdi),%xmm5
	vpermd	%ymm6,%ymm7,%ymm6
	vmovdqa	%ymm9,0(%rsp)
	vpermd	%ymm11,%ymm7,%ymm11
	vmovdqa	%ymm10,32-144(%rax)
	vpermd	%ymm12,%ymm7,%ymm12
	vmovdqa	%ymm6,64-144(%rax)
	vpermd	%ymm13,%ymm7,%ymm13
	vmovdqa	%ymm11,96-144(%rax)
	vpermd	%ymm14,%ymm7,%ymm14
	vmovdqa	%ymm12,128-144(%rax)
	vpermd	%ymm15,%ymm7,%ymm15
	vmovdqa	%ymm13,160-144(%rax)
	vpermd	%ymm5,%ymm7,%ymm5
	vmovdqa	%ymm14,192-144(%rax)
	vmovdqa	%ymm15,224-144(%rax)
	vmovdqa	%ymm5,256-144(%rax)
	vmovdqa	64(%rcx),%ymm5		# 0x3ffffff limb mask

	# load first 64 bytes of input and split into 26-bit limbs
	vmovdqu	0(%rsi),%xmm7
	vmovdqu	16(%rsi),%xmm8
	vinserti128	$1,32(%rsi),%ymm7,%ymm7
	vinserti128	$1,48(%rsi),%ymm8,%ymm8
	leaq	64(%rsi),%rsi

	vpsrldq	$6,%ymm7,%ymm9
	vpsrldq	$6,%ymm8,%ymm10
	vpunpckhqdq	%ymm8,%ymm7,%ymm6
	vpunpcklqdq	%ymm10,%ymm9,%ymm9
	vpunpcklqdq	%ymm8,%ymm7,%ymm7

	vpsrlq	$30,%ymm9,%ymm10
	vpsrlq	$4,%ymm9,%ymm9
	vpsrlq	$26,%ymm7,%ymm8
	vpsrlq	$40,%ymm6,%ymm6
	vpand	%ymm5,%ymm9,%ymm9
	vpand	%ymm5,%ymm7,%ymm7
	vpand	%ymm5,%ymm8,%ymm8
	vpand	%ymm5,%ymm10,%ymm10
	vpor	32(%rcx),%ymm6,%ymm6	# set pad bit

	vpaddq	%ymm2,%ymm9,%ymm2	# accumulate into h limb 2
	subq	$64,%rdx
	jz	.Ltail_avx2
	jmp	.Loop_avx2

.align	32
.Loop_avx2:
	# 4-way loop: h += input; d = h * (r^4,r^4,r^4,r^4) lanes via the
	# precomputed table, interleaved with the next 64-byte load/split
	# and a lazy carry pass
	vpaddq	%ymm0,%ymm7,%ymm0
	vmovdqa	0(%rsp),%ymm7
	vpaddq	%ymm1,%ymm8,%ymm1
	vmovdqa	32(%rsp),%ymm8
	vpaddq	%ymm3,%ymm10,%ymm3
	vmovdqa	96(%rsp),%ymm9
	vpaddq	%ymm4,%ymm6,%ymm4
	vmovdqa	48(%rax),%ymm10
	vmovdqa	112(%rax),%ymm5

	vpmuludq	%ymm2,%ymm7,%ymm13
	vpmuludq	%ymm2,%ymm8,%ymm14
	vpmuludq	%ymm2,%ymm9,%ymm15
	vpmuludq	%ymm2,%ymm10,%ymm11
	vpmuludq	%ymm2,%ymm5,%ymm12

	vpmuludq	%ymm0,%ymm8,%ymm6
	vpmuludq	%ymm1,%ymm8,%ymm2
	vpaddq	%ymm6,%ymm12,%ymm12
	vpaddq	%ymm2,%ymm13,%ymm13
	vpmuludq	%ymm3,%ymm8,%ymm6
	vpmuludq	64(%rsp),%ymm4,%ymm2
	vpaddq	%ymm6,%ymm15,%ymm15
	vpaddq	%ymm2,%ymm11,%ymm11
	vmovdqa	-16(%rax),%ymm8

	vpmuludq	%ymm0,%ymm7,%ymm6
	vpmuludq	%ymm1,%ymm7,%ymm2
	vpaddq	%ymm6,%ymm11,%ymm11
	vpaddq	%ymm2,%ymm12,%ymm12
	vpmuludq	%ymm3,%ymm7,%ymm6
	vpmuludq	%ymm4,%ymm7,%ymm2
	vmovdqu	0(%rsi),%xmm7		# prefetch next 64 bytes
	vpaddq	%ymm6,%ymm14,%ymm14
	vpaddq	%ymm2,%ymm15,%ymm15
	vinserti128	$1,32(%rsi),%ymm7,%ymm7

	vpmuludq	%ymm3,%ymm8,%ymm6
	vpmuludq	%ymm4,%ymm8,%ymm2
	vmovdqu	16(%rsi),%xmm8
	vpaddq	%ymm6,%ymm11,%ymm11
	vpaddq	%ymm2,%ymm12,%ymm12
	vmovdqa	16(%rax),%ymm2
	vpmuludq	%ymm1,%ymm9,%ymm6
	vpmuludq	%ymm0,%ymm9,%ymm9
	vpaddq	%ymm6,%ymm14,%ymm14
	vpaddq	%ymm9,%ymm13,%ymm13
	vinserti128	$1,48(%rsi),%ymm8,%ymm8
	leaq	64(%rsi),%rsi

	vpmuludq	%ymm1,%ymm2,%ymm6
	vpmuludq	%ymm0,%ymm2,%ymm2
	vpsrldq	$6,%ymm7,%ymm9
	vpaddq	%ymm6,%ymm15,%ymm15
	vpaddq	%ymm2,%ymm14,%ymm14
	vpmuludq	%ymm3,%ymm10,%ymm6
	vpmuludq	%ymm4,%ymm10,%ymm2
	vpsrldq	$6,%ymm8,%ymm10
	vpaddq	%ymm6,%ymm12,%ymm12
	vpaddq	%ymm2,%ymm13,%ymm13
	vpunpckhqdq	%ymm8,%ymm7,%ymm6

	vpmuludq	%ymm3,%ymm5,%ymm3
	vpmuludq	%ymm4,%ymm5,%ymm4
	vpunpcklqdq	%ymm8,%ymm7,%ymm7
	vpaddq	%ymm3,%ymm13,%ymm2
	vpaddq	%ymm4,%ymm14,%ymm3
	vpunpcklqdq	%ymm10,%ymm9,%ymm10
	vpmuludq	80(%rax),%ymm0,%ymm4
	vpmuludq	%ymm1,%ymm5,%ymm0
	vmovdqa	64(%rcx),%ymm5		# reload limb mask
	vpaddq	%ymm4,%ymm15,%ymm4
	vpaddq	%ymm0,%ymm11,%ymm0

	# lazy carry pass, interleaved with splitting the next input
	vpsrlq	$26,%ymm3,%ymm14
	vpand	%ymm5,%ymm3,%ymm3
	vpaddq	%ymm14,%ymm4,%ymm4

	vpsrlq	$26,%ymm0,%ymm11
	vpand	%ymm5,%ymm0,%ymm0
	vpaddq	%ymm11,%ymm12,%ymm1

	vpsrlq	$26,%ymm4,%ymm15
	vpand	%ymm5,%ymm4,%ymm4

	vpsrlq	$4,%ymm10,%ymm9

	vpsrlq	$26,%ymm1,%ymm12
	vpand	%ymm5,%ymm1,%ymm1
	vpaddq	%ymm12,%ymm2,%ymm2

	vpaddq	%ymm15,%ymm0,%ymm0
	vpsllq	$2,%ymm15,%ymm15
	vpaddq	%ymm15,%ymm0,%ymm0	# top-limb carry *5 into limb 0

	vpand	%ymm5,%ymm9,%ymm9
	vpsrlq	$26,%ymm7,%ymm8

	vpsrlq	$26,%ymm2,%ymm13
	vpand	%ymm5,%ymm2,%ymm2
	vpaddq	%ymm13,%ymm3,%ymm3

	vpaddq
%ymm9,%ymm2,%ymm2 1774 vpsrlq $30,%ymm10,%ymm10 1775 1776 vpsrlq $26,%ymm0,%ymm11 1777 vpand %ymm5,%ymm0,%ymm0 1778 vpaddq %ymm11,%ymm1,%ymm1 1779 1780 vpsrlq $40,%ymm6,%ymm6 1781 1782 vpsrlq $26,%ymm3,%ymm14 1783 vpand %ymm5,%ymm3,%ymm3 1784 vpaddq %ymm14,%ymm4,%ymm4 1785 1786 vpand %ymm5,%ymm7,%ymm7 1787 vpand %ymm5,%ymm8,%ymm8 1788 vpand %ymm5,%ymm10,%ymm10 1789 vpor 32(%rcx),%ymm6,%ymm6 1790 1791 subq $64,%rdx 1792 jnz .Loop_avx2 1793 1794.byte 0x66,0x90 1795.Ltail_avx2: 1796 1797 1798 1799 1800 1801 1802 1803 vpaddq %ymm0,%ymm7,%ymm0 1804 vmovdqu 4(%rsp),%ymm7 1805 vpaddq %ymm1,%ymm8,%ymm1 1806 vmovdqu 36(%rsp),%ymm8 1807 vpaddq %ymm3,%ymm10,%ymm3 1808 vmovdqu 100(%rsp),%ymm9 1809 vpaddq %ymm4,%ymm6,%ymm4 1810 vmovdqu 52(%rax),%ymm10 1811 vmovdqu 116(%rax),%ymm5 1812 1813 vpmuludq %ymm2,%ymm7,%ymm13 1814 vpmuludq %ymm2,%ymm8,%ymm14 1815 vpmuludq %ymm2,%ymm9,%ymm15 1816 vpmuludq %ymm2,%ymm10,%ymm11 1817 vpmuludq %ymm2,%ymm5,%ymm12 1818 1819 vpmuludq %ymm0,%ymm8,%ymm6 1820 vpmuludq %ymm1,%ymm8,%ymm2 1821 vpaddq %ymm6,%ymm12,%ymm12 1822 vpaddq %ymm2,%ymm13,%ymm13 1823 vpmuludq %ymm3,%ymm8,%ymm6 1824 vpmuludq 68(%rsp),%ymm4,%ymm2 1825 vpaddq %ymm6,%ymm15,%ymm15 1826 vpaddq %ymm2,%ymm11,%ymm11 1827 1828 vpmuludq %ymm0,%ymm7,%ymm6 1829 vpmuludq %ymm1,%ymm7,%ymm2 1830 vpaddq %ymm6,%ymm11,%ymm11 1831 vmovdqu -12(%rax),%ymm8 1832 vpaddq %ymm2,%ymm12,%ymm12 1833 vpmuludq %ymm3,%ymm7,%ymm6 1834 vpmuludq %ymm4,%ymm7,%ymm2 1835 vpaddq %ymm6,%ymm14,%ymm14 1836 vpaddq %ymm2,%ymm15,%ymm15 1837 1838 vpmuludq %ymm3,%ymm8,%ymm6 1839 vpmuludq %ymm4,%ymm8,%ymm2 1840 vpaddq %ymm6,%ymm11,%ymm11 1841 vpaddq %ymm2,%ymm12,%ymm12 1842 vmovdqu 20(%rax),%ymm2 1843 vpmuludq %ymm1,%ymm9,%ymm6 1844 vpmuludq %ymm0,%ymm9,%ymm9 1845 vpaddq %ymm6,%ymm14,%ymm14 1846 vpaddq %ymm9,%ymm13,%ymm13 1847 1848 vpmuludq %ymm1,%ymm2,%ymm6 1849 vpmuludq %ymm0,%ymm2,%ymm2 1850 vpaddq %ymm6,%ymm15,%ymm15 1851 vpaddq %ymm2,%ymm14,%ymm14 1852 vpmuludq %ymm3,%ymm10,%ymm6 1853 vpmuludq %ymm4,%ymm10,%ymm2 1854 
vpaddq %ymm6,%ymm12,%ymm12 1855 vpaddq %ymm2,%ymm13,%ymm13 1856 1857 vpmuludq %ymm3,%ymm5,%ymm3 1858 vpmuludq %ymm4,%ymm5,%ymm4 1859 vpaddq %ymm3,%ymm13,%ymm2 1860 vpaddq %ymm4,%ymm14,%ymm3 1861 vpmuludq 84(%rax),%ymm0,%ymm4 1862 vpmuludq %ymm1,%ymm5,%ymm0 1863 vmovdqa 64(%rcx),%ymm5 1864 vpaddq %ymm4,%ymm15,%ymm4 1865 vpaddq %ymm0,%ymm11,%ymm0 1866 1867 1868 1869 1870 vpsrldq $8,%ymm12,%ymm8 1871 vpsrldq $8,%ymm2,%ymm9 1872 vpsrldq $8,%ymm3,%ymm10 1873 vpsrldq $8,%ymm4,%ymm6 1874 vpsrldq $8,%ymm0,%ymm7 1875 vpaddq %ymm8,%ymm12,%ymm12 1876 vpaddq %ymm9,%ymm2,%ymm2 1877 vpaddq %ymm10,%ymm3,%ymm3 1878 vpaddq %ymm6,%ymm4,%ymm4 1879 vpaddq %ymm7,%ymm0,%ymm0 1880 1881 vpermq $0x2,%ymm3,%ymm10 1882 vpermq $0x2,%ymm4,%ymm6 1883 vpermq $0x2,%ymm0,%ymm7 1884 vpermq $0x2,%ymm12,%ymm8 1885 vpermq $0x2,%ymm2,%ymm9 1886 vpaddq %ymm10,%ymm3,%ymm3 1887 vpaddq %ymm6,%ymm4,%ymm4 1888 vpaddq %ymm7,%ymm0,%ymm0 1889 vpaddq %ymm8,%ymm12,%ymm12 1890 vpaddq %ymm9,%ymm2,%ymm2 1891 1892 1893 1894 1895 vpsrlq $26,%ymm3,%ymm14 1896 vpand %ymm5,%ymm3,%ymm3 1897 vpaddq %ymm14,%ymm4,%ymm4 1898 1899 vpsrlq $26,%ymm0,%ymm11 1900 vpand %ymm5,%ymm0,%ymm0 1901 vpaddq %ymm11,%ymm12,%ymm1 1902 1903 vpsrlq $26,%ymm4,%ymm15 1904 vpand %ymm5,%ymm4,%ymm4 1905 1906 vpsrlq $26,%ymm1,%ymm12 1907 vpand %ymm5,%ymm1,%ymm1 1908 vpaddq %ymm12,%ymm2,%ymm2 1909 1910 vpaddq %ymm15,%ymm0,%ymm0 1911 vpsllq $2,%ymm15,%ymm15 1912 vpaddq %ymm15,%ymm0,%ymm0 1913 1914 vpsrlq $26,%ymm2,%ymm13 1915 vpand %ymm5,%ymm2,%ymm2 1916 vpaddq %ymm13,%ymm3,%ymm3 1917 1918 vpsrlq $26,%ymm0,%ymm11 1919 vpand %ymm5,%ymm0,%ymm0 1920 vpaddq %ymm11,%ymm1,%ymm1 1921 1922 vpsrlq $26,%ymm3,%ymm14 1923 vpand %ymm5,%ymm3,%ymm3 1924 vpaddq %ymm14,%ymm4,%ymm4 1925 1926 vmovd %xmm0,-112(%rdi) 1927 vmovd %xmm1,-108(%rdi) 1928 vmovd %xmm2,-104(%rdi) 1929 vmovd %xmm3,-100(%rdi) 1930 vmovd %xmm4,-96(%rdi) 1931 leaq 8(%r11),%rsp 1932.cfi_def_cfa %rsp,8 1933 vzeroupper 1934 .byte 0xf3,0xc3 1935.cfi_endproc 1936.size 
poly1305_blocks_avx2,.-poly1305_blocks_avx2	/* completes the ".size" directive begun on the previous line */

/*
 * Constant pool for the vectorized Poly1305 code paths.  Tables are
 * addressed RIP-relative from .Lconst; the AVX2 code above relies on
 * fixed offsets from it (32 = .L129 via vpor, 64 = .Lmask26 via vpand,
 * 96 = .Lpermd_avx2 via vpermd), so keep order and alignment intact.
 */
.align	64
.Lconst:
.Lmask24:				/* 2^24-1 in the even dwords */
.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:					/* 16777216 = 2^24, OR-ed into the top limb (vpor 32(%rcx)) */
.long	16777216,0,16777216,0,16777216,0,16777216,0
.Lmask26:				/* 2^26-1, base 2^26 limb mask (vpand 64(%rcx)) */
.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:				/* vpermd index vector loaded from 96(%rcx) in the AVX2 path */
.long	2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7

/* NOTE(review): the tables below look like the base 2^44 code path's
 * constants; that code is not visible in this chunk — confirm there. */
.L2_44_inp_permd:
.long	0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad	0,12,24,64
.L2_44_mask:
.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad	44,44,42,64
.L2_44_shift_lft:
.quad	8,8,10,64

.align	64
.Lx_mask44:				/* 2^44-1, broadcast */
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:				/* 2^42-1, broadcast */
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
/* "Poly1305 for x86_64, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated */
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	16
.globl	xor128_encrypt_n_pad
.type	xor128_encrypt_n_pad,@function
.align	16
/*
 * xor128_encrypt_n_pad(out=%rdi, inp=%rsi, otp=%rdx, len=%rcx)
 * SysV AMD64.  XORs len bytes of inp with the buffer at otp, writing
 * the XOR result both to out and back into the otp buffer, then
 * zero-fills the otp buffer up to the next 16-byte boundary (when len
 * is not already a multiple of 16).  Returns in %rax a pointer just
 * past the padded region.  The block loop stores with movdqa, so otp
 * must be 16-byte aligned.  Clobbers %rcx, %r10, %xmm0, flags.
 * NOTE(review): len == 0 would wrap the tail byte counter (%r10) —
 * callers appear expected to pass len > 0; confirm at call sites.
 */
xor128_encrypt_n_pad:
.cfi_startproc
	subq	%rdx,%rsi		/* turn inp into an offset from the moving otp pointer, */
	subq	%rdx,%rdi		/* likewise out: %rdx alone now advances all three streams */
	movq	%rcx,%r10		/* r10 = len, kept for the len%16 tail */
	shrq	$4,%rcx			/* rcx = number of whole 16-byte blocks */
	jz	.Ltail_enc
	nop
.Loop_enc_xmm:
	movdqu	(%rsi,%rdx,1),%xmm0	/* 16 input bytes (unaligned load OK) */
	pxor	(%rdx),%xmm0		/* XOR with the otp block */
	movdqu	%xmm0,(%rdi,%rdx,1)	/* XOR result -> out */
	movdqa	%xmm0,(%rdx)		/* XOR result also overwrites otp (aligned store) */
	leaq	16(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_enc_xmm

	andq	$15,%r10		/* leftover bytes */
	jz	.Ldone_enc

.Ltail_enc:
	movq	$16,%rcx
	subq	%r10,%rcx		/* rcx = pad bytes needed to reach a 16-byte boundary */
	xorl	%eax,%eax
.Loop_enc_byte:
	movb	(%rsi,%rdx,1),%al	/* input byte */
	xorb	(%rdx),%al		/* XOR with otp byte */
	movb	%al,(%rdi,%rdx,1)	/* result -> out */
	movb	%al,(%rdx)		/* result -> otp */
	leaq	1(%rdx),%rdx
	decq	%r10
	jnz	.Loop_enc_byte

	xorl	%eax,%eax
.Loop_enc_pad:
	movb	%al,(%rdx)		/* zero-fill otp up to the 16-byte boundary */
	leaq	1(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_enc_pad

.Ldone_enc:
	movq	%rdx,%rax		/* return pointer past the padded data */
	.byte	0xf3,0xc3		/* rep ret */
.cfi_endproc
.size	xor128_encrypt_n_pad,.-xor128_encrypt_n_pad

.globl	xor128_decrypt_n_pad
.type	xor128_decrypt_n_pad,@function
.align	16
/*
 * xor128_decrypt_n_pad(out=%rdi, inp=%rsi, otp=%rdx, len=%rcx)
 * SysV AMD64.  Counterpart of xor128_encrypt_n_pad: out = inp XOR otp,
 * but the RAW input bytes (not the XOR result) are written back into
 * the otp buffer, which is then zero-filled up to the next 16-byte
 * boundary.  Returns in %rax a pointer just past the padded region.
 * movdqa stores require otp to be 16-byte aligned.  Clobbers %rcx,
 * %r10, %r11, %xmm0, %xmm1, flags.
 * NOTE(review): as with the encrypt variant, len == 0 would wrap the
 * tail byte counter; confirm callers pass len > 0.
 */
xor128_decrypt_n_pad:
.cfi_startproc
	subq	%rdx,%rsi		/* inp/out become offsets from the moving otp pointer */
	subq	%rdx,%rdi
	movq	%rcx,%r10		/* r10 = len, kept for the len%16 tail */
	shrq	$4,%rcx			/* rcx = number of whole 16-byte blocks */
	jz	.Ltail_dec
	nop
.Loop_dec_xmm:
	movdqu	(%rsi,%rdx,1),%xmm0	/* 16 raw input bytes (unaligned load OK) */
	movdqa	(%rdx),%xmm1		/* otp block (aligned) */
	pxor	%xmm0,%xmm1		/* xmm1 = input XOR otp */
	movdqu	%xmm1,(%rdi,%rdx,1)	/* XOR result -> out */
	movdqa	%xmm0,(%rdx)		/* raw input overwrites otp */
	leaq	16(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_dec_xmm

	pxor	%xmm1,%xmm1		/* scrub the last XOR result from the register */
	andq	$15,%r10		/* leftover bytes */
	jz	.Ldone_dec

.Ltail_dec:
	movq	$16,%rcx
	subq	%r10,%rcx		/* rcx = pad bytes needed to reach a 16-byte boundary */
	xorl	%eax,%eax
	xorq	%r11,%r11
.Loop_dec_byte:
	movb	(%rsi,%rdx,1),%r11b	/* raw input byte */
	movb	(%rdx),%al		/* otp byte */
	xorb	%r11b,%al		/* al = input XOR otp */
	movb	%al,(%rdi,%rdx,1)	/* XOR result -> out */
	movb	%r11b,(%rdx)		/* raw input -> otp */
	leaq	1(%rdx),%rdx
	decq	%r10
	jnz	.Loop_dec_byte

	xorl	%eax,%eax
.Loop_dec_pad:
	movb	%al,(%rdx)		/* zero-fill otp up to the 16-byte boundary */
	leaq	1(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_dec_pad

.Ldone_dec:
	movq	%rdx,%rax		/* return pointer past the padded data */
	.byte	0xf3,0xc3		/* rep ret */
.cfi_endproc
.size	xor128_decrypt_n_pad,.-xor128_decrypt_n_pad