/* Do not modify. This file is auto-generated from poly1305-x86.pl. */
/*
 * Syntax: GNU as, AT&T operand order (src, dst), run through the C
 * preprocessor.  Target: 32-bit x86, cdecl (arguments on the stack,
 * %ebp/%ebx/%esi/%edi preserved by every exported entry point).
 *
 * NOTE(review): any change made here (see the length mask in
 * poly1305_blocks) should be mirrored in the generator script, otherwise
 * it is lost on regeneration.
 *
 * Context layout as used by the code below (byte offsets from ctx):
 *     0..19   accumulator: 5 x 32-bit words (base 2^32), or 5 x 26-bit
 *             limbs once the vector code has converted it
 *    20       non-zero once the accumulator is held as 26-bit limbs
 *    24..39   clamped key words
 *    48..191  table written by _poly1305_init_sse2/_avx2
 */
#ifdef PIC
.text
.align	64

/*
 * poly1305_init(ctx, key, func)
 *   20(%esp) after the pushes = ctx, 24(%esp) = key, 28(%esp) = func.
 *   Zeroes ctx[0..23].  If key is NULL returns 0 and does nothing else.
 *   Otherwise stores the addresses of the block and emit routines in
 *   func[0]/func[1], stores the masked key at ctx+24 and returns 1.
 */
.globl	poly1305_init
.type	poly1305_init,@function
.align	16
poly1305_init:
.L_poly1305_init_begin:
#ifdef __CET__
	.byte	243,15,30,251		/* endbr32 */
#endif
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ebp
	xorl	%eax,%eax		/* also the return value on the NULL-key path */
	movl	%eax,(%edi)
	movl	%eax,4(%edi)
	movl	%eax,8(%edi)
	movl	%eax,12(%edi)
	movl	%eax,16(%edi)
	movl	%eax,20(%edi)
	cmpl	$0,%esi
	je	.L000nokey
	/* position-independent address computation: %ebx = address of the label */
	call	.L001pic_point
.L001pic_point:
	popl	%ebx
	leal	poly1305_blocks-.L001pic_point(%ebx),%eax
	leal	poly1305_emit-.L001pic_point(%ebx),%edx
	leal	OPENSSL_ia32cap_P-.L001pic_point(%ebx),%edi
	/* capability word 0: both bit 24 and bit 26 (0x05000000) must be set */
	movl	(%edi),%ecx
	andl	$83886080,%ecx
	cmpl	$83886080,%ecx
	jne	.L002no_sse2
	leal	_poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
	leal	_poly1305_emit_sse2-.L001pic_point(%ebx),%edx
	/* capability word 2, bit 5 selects the AVX2 block routine */
	movl	8(%edi),%ecx
	testl	$32,%ecx
	jz	.L002no_sse2
	leal	_poly1305_blocks_avx2-.L001pic_point(%ebx),%eax
.L002no_sse2:
	movl	20(%esp),%edi		/* %edi was reused above; reload ctx */
	movl	%eax,(%ebp)
	movl	%edx,4(%ebp)
	movl	(%esi),%eax
	movl	4(%esi),%ebx
	movl	8(%esi),%ecx
	movl	12(%esi),%edx
	/* key clamp: 0x0fffffff on word 0, 0x0ffffffc on words 1..3 */
	andl	$268435455,%eax
	andl	$268435452,%ebx
	andl	$268435452,%ecx
	andl	$268435452,%edx
	movl	%eax,24(%edi)
	movl	%ebx,28(%edi)
	movl	%ecx,32(%edi)
	movl	%edx,36(%edi)
	movl	$1,%eax
.L000nokey:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	poly1305_init,.-.L_poly1305_init_begin

/*
 * poly1305_blocks(ctx, inp, len, padbit)  -- scalar path
 *   Processes len/16 blocks of 16 bytes from inp, updating ctx[0..19].
 *   .Lenter_blocks is also a jump target for the vector routines, which
 *   arrive with the same register set-up (%edi=ctx, %esi=inp, %ecx=len)
 *   and the same four registers pushed.
 *
 *   64-byte frame:
 *     0,12,16(%esp)   saved accumulator words 0, 3, 4 of this iteration
 *     20,24,28(%esp)  low three words of the product
 *     36..48(%esp)    key words k0..k3 (copied from ctx+24..36)
 *     52..60(%esp)    k1+(k1>>2), k2+(k2>>2), k3+(k3>>2)
 *     84(%esp)        ctx argument
 *     92(%esp)        end-of-input pointer (overwrites the len argument)
 *     96(%esp)        padbit argument
 */
.globl	poly1305_blocks
.type	poly1305_blocks,@function
.align	16
poly1305_blocks:
.L_poly1305_blocks_begin:
#ifdef __CET__
	.byte	243,15,30,251		/* endbr32 */
#endif
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
.Lenter_blocks:
	/*
	 * Round len down to a whole number of 16-byte blocks.  The mask was
	 * $-15, which leaves bit 0 of len intact: with an odd len the end
	 * pointer computed below can never equal the cursor (which advances
	 * by 16), so the loop would not terminate at the end of the input.
	 * $-16 matches the vector entry points and is identical for every
	 * len that is already a multiple of 16.
	 */
	andl	$-16,%ecx
	jz	.L003nodata
	subl	$64,%esp
	movl	24(%edi),%eax
	movl	28(%edi),%ebx
	leal	(%esi,%ecx,1),%ebp	/* %ebp = inp + len */
	movl	32(%edi),%ecx
	movl	36(%edi),%edx
	movl	%ebp,92(%esp)
	movl	%esi,%ebp		/* %ebp is the input cursor from here on */
	movl	%eax,36(%esp)
	movl	%ebx,%eax
	shrl	$2,%eax
	movl	%ebx,40(%esp)
	addl	%ebx,%eax
	movl	%ecx,%ebx
	shrl	$2,%ebx
	movl	%ecx,44(%esp)
	addl	%ecx,%ebx
	movl	%edx,%ecx
	shrl	$2,%ecx
	movl	%edx,48(%esp)
	addl	%edx,%ecx
	movl	%eax,52(%esp)
	movl	%ebx,56(%esp)
	movl	%ecx,60(%esp)
	/* accumulator words: %eax, %ebx, %ecx, %esi, %edi */
	movl	(%edi),%eax
	movl	4(%edi),%ebx
	movl	8(%edi),%ecx
	movl	12(%edi),%esi
	movl	16(%edi),%edi
	jmp	.L004loop
.align	32
.L004loop:
	/* accumulator += 16 input bytes, padbit added into the fifth word */
	addl	(%ebp),%eax
	adcl	4(%ebp),%ebx
	adcl	8(%ebp),%ecx
	adcl	12(%ebp),%esi
	leal	16(%ebp),%ebp		/* lea: advance cursor without touching CF */
	adcl	96(%esp),%edi
	movl	%eax,(%esp)
	movl	%esi,12(%esp)
	/* product word 0 */
	mull	36(%esp)
	movl	%edi,16(%esp)
	movl	%eax,%edi
	movl	%ebx,%eax
	movl	%edx,%esi
	mull	60(%esp)
	addl	%eax,%edi
	movl	%ecx,%eax
	adcl	%edx,%esi
	mull	56(%esp)
	addl	%eax,%edi
	movl	12(%esp),%eax
	adcl	%edx,%esi
	mull	52(%esp)
	addl	%eax,%edi
	movl	(%esp),%eax
	adcl	%edx,%esi
	/* product word 1 */
	mull	40(%esp)
	movl	%edi,20(%esp)
	xorl	%edi,%edi
	addl	%eax,%esi
	movl	%ebx,%eax
	adcl	%edx,%edi
	mull	36(%esp)
	addl	%eax,%esi
	movl	%ecx,%eax
	adcl	%edx,%edi
	mull	60(%esp)
	addl	%eax,%esi
	movl	12(%esp),%eax
	adcl	%edx,%edi
	mull	56(%esp)
	addl	%eax,%esi
	movl	16(%esp),%eax
	adcl	%edx,%edi
	imull	52(%esp),%eax
	addl	%eax,%esi
	movl	(%esp),%eax
	adcl	$0,%edi
	/* product word 2 */
	mull	44(%esp)
	movl	%esi,24(%esp)
	xorl	%esi,%esi
	addl	%eax,%edi
	movl	%ebx,%eax
	adcl	%edx,%esi
	mull	40(%esp)
	addl	%eax,%edi
	movl	%ecx,%eax
	adcl	%edx,%esi
	mull	36(%esp)
	addl	%eax,%edi
	movl	12(%esp),%eax
	adcl	%edx,%esi
	mull	60(%esp)
	addl	%eax,%edi
	movl	16(%esp),%eax
	adcl	%edx,%esi
	imull	56(%esp),%eax
	addl	%eax,%edi
	movl	(%esp),%eax
	adcl	$0,%esi
	/* product word 3 */
	mull	48(%esp)
	movl	%edi,28(%esp)
	xorl	%edi,%edi
	addl	%eax,%esi
	movl	%ebx,%eax
	adcl	%edx,%edi
	mull	44(%esp)
	addl	%eax,%esi
	movl	%ecx,%eax
	adcl	%edx,%edi
	mull	40(%esp)
	addl	%eax,%esi
	movl	12(%esp),%eax
	adcl	%edx,%edi
	mull	36(%esp)
	addl	%eax,%esi
	movl	16(%esp),%ecx
	adcl	%edx,%edi
	movl	%ecx,%edx
	imull	60(%esp),%ecx
	addl	%ecx,%esi
	movl	20(%esp),%eax
	adcl	$0,%edi
	/* product word 4, then fold everything above bit 1 of it back in times 5 */
	imull	36(%esp),%edx
	addl	%edi,%edx
	movl	24(%esp),%ebx
	movl	28(%esp),%ecx
	movl	%edx,%edi
	shrl	$2,%edx
	andl	$3,%edi
	leal	(%edx,%edx,4),%edx	/* %edx = 5 * (word4 >> 2) */
	addl	%edx,%eax
	adcl	$0,%ebx
	adcl	$0,%ecx
	adcl	$0,%esi
	adcl	$0,%edi
	cmpl	92(%esp),%ebp
	jne	.L004loop
	movl	84(%esp),%edx
	addl	$64,%esp
	movl	%eax,(%edx)
	movl	%ebx,4(%edx)
	movl	%ecx,8(%edx)
	movl	%esi,12(%edx)
	movl	%edi,16(%edx)
.L003nodata:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	poly1305_blocks,.-.L_poly1305_blocks_begin

/*
 * poly1305_emit(ctx, mac, nonce)  -- scalar path
 *   Reads the five accumulator words from ctx, selects between h and h+5
 *   depending on bit 2 of the fifth word of h+5, adds the four nonce
 *   words with carry and stores 16 bytes to mac.
 *   .Lenter_emit is a jump target for _poly1305_emit_sse2, which arrives
 *   with %ebp = ctx and the same four registers pushed.
 */
.globl	poly1305_emit
.type	poly1305_emit,@function
.align	16
poly1305_emit:
.L_poly1305_emit_begin:
#ifdef __CET__
	.byte	243,15,30,251		/* endbr32 */
#endif
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%ebp
.Lenter_emit:
	movl	24(%esp),%edi
	movl	(%ebp),%eax
	movl	4(%ebp),%ebx
	movl	8(%ebp),%ecx
	movl	12(%ebp),%edx
	movl	16(%ebp),%esi
	addl	$5,%eax
	adcl	$0,%ebx
	adcl	$0,%ecx
	adcl	$0,%edx
	adcl	$0,%esi
	shrl	$2,%esi
	negl	%esi			/* mask applied to h+5 below */
	andl	%esi,%eax
	andl	%esi,%ebx
	andl	%esi,%ecx
	andl	%esi,%edx
	movl	%eax,(%edi)		/* mac buffer doubles as scratch */
	movl	%ebx,4(%edi)
	movl	%ecx,8(%edi)
	movl	%edx,12(%edi)
	notl	%esi			/* complementary mask applied to h */
	movl	(%ebp),%eax
	movl	4(%ebp),%ebx
	movl	8(%ebp),%ecx
	movl	12(%ebp),%edx
	movl	28(%esp),%ebp
	andl	%esi,%eax
	andl	%esi,%ebx
	andl	%esi,%ecx
	andl	%esi,%edx
	orl	(%edi),%eax
	orl	4(%edi),%ebx
	orl	8(%edi),%ecx
	orl	12(%edi),%edx
	addl	(%ebp),%eax
	adcl	4(%ebp),%ebx
	adcl	8(%ebp),%ecx
	adcl	12(%ebp),%edx
	movl	%eax,(%edi)
	movl	%ebx,4(%edi)
	movl	%ecx,8(%edi)
	movl	%edx,12(%edi)
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	poly1305_emit,.-.L_poly1305_emit_begin

/*
 * _poly1305_init_sse2  -- internal helper, register calling convention
 *   In:   %edi = ctx, %ebx = constant table (.Lconst_sse2 at the call
 *         sites in this file)
 *   Out:  nine 16-byte table entries written at ctx+48 .. ctx+191;
 *         %edi restored to ctx
 *   Clobbers: %ebp (holds the caller's %esp while a 16-byte aligned
 *         scratch frame is in use), %ecx, %edx, %xmm0-%xmm7, flags
 *   The multiply/reduce body at .L005square is executed twice (%ecx = 2).
 *   NOTE(review): 64(%ebx) is used as the per-limb AND mask; its value
 *   is defined outside this excerpt.
 */
.align	32
.type	_poly1305_init_sse2,@function
.align	16
_poly1305_init_sse2:
#ifdef __CET__
	.byte	243,15,30,251		/* endbr32 */
#endif
	movdqu	24(%edi),%xmm4
	leal	48(%edi),%edi
	movl	%esp,%ebp
	subl	$224,%esp
	andl	$-16,%esp
	movq	64(%ebx),%xmm7
	/* split the 16 key bytes into five limbs at bit offsets 0/26/52/78/104 */
	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	movdqa	%xmm4,%xmm2
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm1
	psrldq	$6,%xmm2
	pand	%xmm7,%xmm1
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	psrldq	$13,%xmm4
	leal	144(%esp),%edx
	movl	$2,%ecx
.L005square:
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	/* 5 * limb for limbs 1..4, kept at 80..128(%esp) */
	movdqa	%xmm1,%xmm6
	movdqa	%xmm2,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm1,%xmm6
	paddd	%xmm2,%xmm5
	movdqa	%xmm6,80(%esp)
	movdqa	%xmm5,96(%esp)
	movdqa	%xmm3,%xmm6
	movdqa	%xmm4,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm3,%xmm6
	paddd	%xmm4,%xmm5
	movdqa	%xmm6,112(%esp)
	movdqa	%xmm5,128(%esp)
	pshufd	$68,%xmm0,%xmm6
	movdqa	%xmm1,%xmm5
	pshufd	$68,%xmm1,%xmm1
	pshufd	$68,%xmm2,%xmm2
	pshufd	$68,%xmm3,%xmm3
	pshufd	$68,%xmm4,%xmm4
	movdqa	%xmm6,(%edx)
	movdqa	%xmm1,16(%edx)
	movdqa	%xmm2,32(%edx)
	movdqa	%xmm3,48(%edx)
	movdqa	%xmm4,64(%edx)
	/* 5x5 limb multiplication, accumulating into %xmm0..%xmm4 */
	pmuludq	%xmm0,%xmm4
	pmuludq	%xmm0,%xmm3
	pmuludq	%xmm0,%xmm2
	pmuludq	%xmm0,%xmm1
	pmuludq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	48(%edx),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	80(%esp),%xmm6
	pmuludq	(%edx),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%edx),%xmm6
	movdqa	32(%esp),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%edx),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%edx),%xmm5
	paddq	%xmm7,%xmm4
	movdqa	96(%esp),%xmm7
	pmuludq	(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%edx),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%edx),%xmm5
	movdqa	48(%esp),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%edx),%xmm6
	paddq	%xmm5,%xmm0
	movdqa	112(%esp),%xmm5
	pmuludq	(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%edx),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%edx),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%edx),%xmm7
	movdqa	64(%esp),%xmm5
	paddq	%xmm6,%xmm1
	movdqa	128(%esp),%xmm6
	pmuludq	(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%edx),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%edx),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* carry propagation between limbs (26-bit shifts, top carry times 5) */
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
	decl	%ecx
	jz	.L006square_break
	/* second pass: pair the new result with the saved limbs */
	punpcklqdq	(%esp),%xmm0
	punpcklqdq	16(%esp),%xmm1
	punpcklqdq	32(%esp),%xmm2
	punpcklqdq	48(%esp),%xmm3
	punpcklqdq	64(%esp),%xmm4
	jmp	.L005square
.L006square_break:
	psllq	$32,%xmm0
	psllq	$32,%xmm1
	psllq	$32,%xmm2
	psllq	$32,%xmm3
	psllq	$32,%xmm4
	por	(%esp),%xmm0
	por	16(%esp),%xmm1
	por	32(%esp),%xmm2
	por	48(%esp),%xmm3
	por	64(%esp),%xmm4
	pshufd	$141,%xmm0,%xmm0
	pshufd	$141,%xmm1,%xmm1
	pshufd	$141,%xmm2,%xmm2
	pshufd	$141,%xmm3,%xmm3
	pshufd	$141,%xmm4,%xmm4
	/* table entries 0..4: the limbs; entries 5..8: 5 * limbs 1..4 */
	movdqu	%xmm0,(%edi)
	movdqu	%xmm1,16(%edi)
	movdqu	%xmm2,32(%edi)
	movdqu	%xmm3,48(%edi)
	movdqu	%xmm4,64(%edi)
	movdqa	%xmm1,%xmm6
	movdqa	%xmm2,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm1,%xmm6
	paddd	%xmm2,%xmm5
	movdqu	%xmm6,80(%edi)
	movdqu	%xmm5,96(%edi)
	movdqa	%xmm3,%xmm6
	movdqa	%xmm4,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm3,%xmm6
	paddd	%xmm4,%xmm5
	movdqu	%xmm6,112(%edi)
	movdqu	%xmm5,128(%edi)
	movl	%ebp,%esp
	leal	-48(%edi),%edi
	ret
.size	_poly1305_init_sse2,.-_poly1305_init_sse2

/*
 * _poly1305_blocks_sse2(ctx, inp, len, padbit)  -- cdecl, SSE2 path
 *   len is rounded down to a multiple of 16; zero returns immediately.
 *   Inputs shorter than 64 bytes are handed to the scalar loop at
 *   .Lenter_blocks as long as ctx[20] is still zero.  Otherwise the
 *   accumulator is converted to (or loaded as) five 26-bit limbs, one
 *   leading block is consumed if len is not a multiple of 32, and the
 *   rest is processed 64 bytes per .L015loop iteration with 32- and
 *   64-byte tails.  The five limbs are written back to ctx[0..19].
 *   %ebp holds the caller's %esp while the aligned 528-byte frame is live.
 *   NOTE(review): (%ebx) and 64(%ebx) come from .Lconst_sse2, which is
 *   defined outside this excerpt; 64(%ebx) is used as the limb AND mask
 *   and (%ebx) is OR-ed into the top limb of each loaded block.
 */
.align	32
.type	_poly1305_blocks_sse2,@function
.align	16
_poly1305_blocks_sse2:
#ifdef __CET__
	.byte	243,15,30,251		/* endbr32 */
#endif
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
	movl	20(%edi),%eax		/* non-zero: accumulator already in 26-bit limbs */
	andl	$-16,%ecx
	jz	.L007nodata
	cmpl	$64,%ecx
	jae	.L008enter_sse2
	testl	%eax,%eax
	jz	.Lenter_blocks
.align	16
.L008enter_sse2:
	call	.L009pic_point
.L009pic_point:
	popl	%ebx
	leal	.Lconst_sse2-.L009pic_point(%ebx),%ebx
	testl	%eax,%eax
	jnz	.L010base2_26
	call	_poly1305_init_sse2
	/* convert the accumulator from 32-bit words to 26-bit limbs */
	movl	(%edi),%eax
	movl	3(%edi),%ecx
	movl	6(%edi),%edx
	movl	9(%edi),%esi
	movl	13(%edi),%ebp
	movl	$1,20(%edi)
	shrl	$2,%ecx
	andl	$67108863,%eax
	shrl	$4,%edx
	andl	$67108863,%ecx
	shrl	$6,%esi
	andl	$67108863,%edx
	movd	%eax,%xmm0
	movd	%ecx,%xmm1
	movd	%edx,%xmm2
	movd	%esi,%xmm3
	movd	%ebp,%xmm4
	movl	24(%esp),%esi		/* reload inp and len, both reused above */
	movl	28(%esp),%ecx
	jmp	.L011base2_32
.align	16
.L010base2_26:
	movd	(%edi),%xmm0
	movd	4(%edi),%xmm1
	movd	8(%edi),%xmm2
	movd	12(%edi),%xmm3
	movd	16(%edi),%xmm4
	movdqa	64(%ebx),%xmm7
.L011base2_32:
	movl	32(%esp),%eax
	movl	%esp,%ebp
	subl	$528,%esp
	andl	$-16,%esp
	leal	48(%edi),%edi
	shll	$24,%eax		/* padbit positioned for the top limb */
	testl	$31,%ecx
	jz	.L012even
	/* len is an odd number of blocks: absorb one block on its own */
	movdqu	(%esi),%xmm6
	leal	16(%esi),%esi
	movdqa	%xmm6,%xmm5
	pand	%xmm7,%xmm6
	paddd	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	psrlq	$26,%xmm5
	psrldq	$6,%xmm6
	pand	%xmm7,%xmm5
	paddd	%xmm5,%xmm1
	movdqa	%xmm6,%xmm5
	psrlq	$4,%xmm6
	pand	%xmm7,%xmm6
	paddd	%xmm6,%xmm2
	movdqa	%xmm5,%xmm6
	psrlq	$30,%xmm5
	pand	%xmm7,%xmm5
	psrldq	$7,%xmm6
	paddd	%xmm5,%xmm3
	movd	%eax,%xmm5
	paddd	%xmm6,%xmm4
	movd	12(%edi),%xmm6
	paddd	%xmm5,%xmm4
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	/* multiply by the fourth dword of each table entry */
	pmuludq	%xmm6,%xmm0
	pmuludq	%xmm6,%xmm1
	pmuludq	%xmm6,%xmm2
	movd	28(%edi),%xmm5
	pmuludq	%xmm6,%xmm3
	pmuludq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	48(%esp),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	movd	92(%edi),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%esp),%xmm6
	movd	44(%edi),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%esp),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%esp),%xmm5
	paddq	%xmm7,%xmm4
	movd	108(%edi),%xmm7
	pmuludq	(%esp),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%esp),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%esp),%xmm5
	movd	60(%edi),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%esp),%xmm6
	paddq	%xmm5,%xmm0
	movd	124(%edi),%xmm5
	pmuludq	(%esp),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%esp),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%esp),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%esp),%xmm7
	movd	76(%edi),%xmm5
	paddq	%xmm6,%xmm1
	movd	140(%edi),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%esp),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%esp),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* carry propagation */
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
	subl	$16,%ecx
	jz	.L013done
.L012even:
	leal	384(%esp),%edx
	leal	-32(%esi),%eax
	subl	$64,%ecx		/* flags stay live until the jbe below */
	/*
	 * Expand the nine table entries onto the stack: low qword pairs go
	 * to 0..128(%edx), high qword pairs to -144..-16(%edx).
	 */
	movdqu	(%edi),%xmm5
	pshufd	$68,%xmm5,%xmm6
	cmovbl	%eax,%esi		/* fewer than 64 bytes left: back the cursor up by 32 */
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,(%edx)
	leal	160(%esp),%eax
	movdqu	16(%edi),%xmm6
	movdqa	%xmm5,-144(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,16(%edx)
	movdqu	32(%edi),%xmm5
	movdqa	%xmm6,-128(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,32(%edx)
	movdqu	48(%edi),%xmm6
	movdqa	%xmm5,-112(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,48(%edx)
	movdqu	64(%edi),%xmm5
	movdqa	%xmm6,-96(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,64(%edx)
	movdqu	80(%edi),%xmm6
	movdqa	%xmm5,-80(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,80(%edx)
	movdqu	96(%edi),%xmm5
	movdqa	%xmm6,-64(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,96(%edx)
	movdqu	112(%edi),%xmm6
	movdqa	%xmm5,-48(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,112(%edx)
	movdqu	128(%edi),%xmm5
	movdqa	%xmm6,-32(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,128(%edx)
	movdqa	%xmm5,-16(%edx)
	/* load two blocks and split them into limbs; park accumulator limbs 2..4 */
	movdqu	32(%esi),%xmm5
	movdqu	48(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,112(%esp)
	movdqa	%xmm3,128(%esp)
	movdqa	%xmm4,144(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	movdqa	%xmm0,80(%esp)
	movdqa	%xmm1,96(%esp)
	jbe	.L014skip_loop		/* still testing the subl $64 above */
	jmp	.L015loop
.align	32
.L015loop:
	/* first half: freshly loaded blocks times the -144..-16(%edx) set */
	movdqa	-144(%edx),%xmm7
	movdqa	%xmm6,16(%eax)
	movdqa	%xmm2,32(%eax)
	movdqa	%xmm3,48(%eax)
	movdqa	%xmm4,64(%eax)
	movdqa	%xmm5,%xmm1
	pmuludq	%xmm7,%xmm5
	movdqa	%xmm6,%xmm0
	pmuludq	%xmm7,%xmm6
	pmuludq	%xmm7,%xmm2
	pmuludq	%xmm7,%xmm3
	pmuludq	%xmm7,%xmm4
	pmuludq	-16(%edx),%xmm0
	movdqa	%xmm1,%xmm7
	pmuludq	-128(%edx),%xmm1
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	-112(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	%xmm5,%xmm6
	pmuludq	-96(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	16(%eax),%xmm7
	pmuludq	-80(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	-128(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	-112(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	32(%eax),%xmm7
	pmuludq	-96(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	-32(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	-16(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	-128(%edx),%xmm6
	paddq	%xmm5,%xmm1
	movdqa	48(%eax),%xmm5
	pmuludq	-112(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	-48(%edx),%xmm5
	paddq	%xmm7,%xmm4
	movdqa	%xmm6,%xmm7
	pmuludq	-32(%edx),%xmm6
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	-16(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	64(%eax),%xmm6
	pmuludq	-128(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	%xmm6,%xmm7
	pmuludq	-16(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	-64(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	-48(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	64(%ebx),%xmm7
	pmuludq	-32(%edx),%xmm6
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* load the other two blocks of this 64-byte group and add the accumulator */
	movdqu	-32(%esi),%xmm5
	movdqu	-16(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	leal	-32(%esi),%eax
	subl	$64,%ecx		/* flags consumed by cmovbl here and ja at loop end */
	paddd	80(%esp),%xmm5
	paddd	96(%esp),%xmm6
	paddd	112(%esp),%xmm2
	paddd	128(%esp),%xmm3
	paddd	144(%esp),%xmm4
	cmovbl	%eax,%esi
	leal	160(%esp),%eax
	/* second half: (accumulator + blocks) times the 0..128(%edx) set */
	movdqa	(%edx),%xmm7
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm6,16(%eax)
	movdqa	%xmm2,32(%eax)
	movdqa	%xmm3,48(%eax)
	movdqa	%xmm4,64(%eax)
	movdqa	%xmm5,%xmm1
	pmuludq	%xmm7,%xmm5
	paddq	%xmm0,%xmm5
	movdqa	%xmm6,%xmm0
	pmuludq	%xmm7,%xmm6
	pmuludq	%xmm7,%xmm2
	pmuludq	%xmm7,%xmm3
	pmuludq	%xmm7,%xmm4
	paddq	16(%esp),%xmm6
	paddq	32(%esp),%xmm2
	paddq	48(%esp),%xmm3
	paddq	64(%esp),%xmm4
	pmuludq	128(%edx),%xmm0
	movdqa	%xmm1,%xmm7
	pmuludq	16(%edx),%xmm1
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	32(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	%xmm5,%xmm6
	pmuludq	48(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	16(%eax),%xmm7
	pmuludq	64(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	16(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	32(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	32(%eax),%xmm7
	pmuludq	48(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	112(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	128(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	16(%edx),%xmm6
	paddq	%xmm5,%xmm1
	movdqa	48(%eax),%xmm5
	pmuludq	32(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	96(%edx),%xmm5
	paddq	%xmm7,%xmm4
	movdqa	%xmm6,%xmm7
	pmuludq	112(%edx),%xmm6
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	128(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	64(%eax),%xmm6
	pmuludq	16(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	%xmm6,%xmm7
	pmuludq	128(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	80(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	96(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	64(%ebx),%xmm7
	pmuludq	112(%edx),%xmm6
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* carry propagation */
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
	/* preload and split the next pair of blocks */
	movdqu	32(%esi),%xmm5
	movdqu	48(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,112(%esp)
	movdqa	%xmm3,128(%esp)
	movdqa	%xmm4,144(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	movdqa	%xmm0,80(%esp)
	movdqa	%xmm1,96(%esp)
	ja	.L015loop		/* still testing the subl $64 in the loop body */
.L014skip_loop:
	pshufd	$16,-144(%edx),%xmm7
	addl	$32,%ecx		/* ZF here is re-tested by jz .L017short_tail */
	jnz	.L016long_tail
	/* exactly 32 bytes left: add the accumulator to the loaded pair now */
	paddd	%xmm0,%xmm5
	paddd	%xmm1,%xmm6
	paddd	112(%esp),%xmm2
	paddd	128(%esp),%xmm3
	paddd	144(%esp),%xmm4
.L016long_tail:
	movdqa	%xmm5,(%eax)
	movdqa	%xmm6,16(%eax)
	movdqa	%xmm2,32(%eax)
	movdqa	%xmm3,48(%eax)
	movdqa	%xmm4,64(%eax)
	pmuludq	%xmm7,%xmm5
	pmuludq	%xmm7,%xmm6
	pmuludq	%xmm7,%xmm2
	movdqa	%xmm5,%xmm0
	pshufd	$16,-128(%edx),%xmm5
	pmuludq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm1
	pmuludq	%xmm7,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	48(%eax),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%eax),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%eax),%xmm7
	paddq	%xmm6,%xmm3
	pshufd	$16,-64(%edx),%xmm6
	pmuludq	(%eax),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%eax),%xmm6
	pshufd	$16,-112(%edx),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%eax),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%eax),%xmm5
	paddq	%xmm7,%xmm4
	pshufd	$16,-48(%edx),%xmm7
	pmuludq	(%eax),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%eax),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%eax),%xmm5
	pshufd	$16,-96(%edx),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%eax),%xmm6
	paddq	%xmm5,%xmm0
	pshufd	$16,-32(%edx),%xmm5
	pmuludq	(%eax),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%eax),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%eax),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%eax),%xmm7
	pshufd	$16,-80(%edx),%xmm5
	paddq	%xmm6,%xmm1
	pshufd	$16,-16(%edx),%xmm6
	pmuludq	(%eax),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%eax),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%eax),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%eax),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%eax),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	jz	.L017short_tail		/* ZF from addl $32; SSE ops leave EFLAGS alone */
	/* 64-byte tail: fold in the remaining pair with the 0..128(%edx) set */
	movdqu	-32(%esi),%xmm5
	movdqu	-16(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	pshufd	$16,(%edx),%xmm7
	paddd	80(%esp),%xmm5
	paddd	96(%esp),%xmm6
	paddd	112(%esp),%xmm2
	paddd	128(%esp),%xmm3
	paddd	144(%esp),%xmm4
	movdqa	%xmm5,(%esp)
	pmuludq	%xmm7,%xmm5
	movdqa	%xmm6,16(%esp)
	pmuludq	%xmm7,%xmm6
	paddq	%xmm5,%xmm0
	movdqa	%xmm2,%xmm5
	pmuludq	%xmm7,%xmm2
	paddq	%xmm6,%xmm1
	movdqa	%xmm3,%xmm6
	pmuludq	%xmm7,%xmm3
	paddq	32(%esp),%xmm2
	movdqa	%xmm5,32(%esp)
	pshufd	$16,16(%edx),%xmm5
	paddq	48(%esp),%xmm3
	movdqa	%xmm6,48(%esp)
	movdqa	%xmm4,%xmm6
	pmuludq	%xmm7,%xmm4
	paddq	64(%esp),%xmm4
	movdqa	%xmm6,64(%esp)
	movdqa	%xmm5,%xmm6
	pmuludq	48(%esp),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	pshufd	$16,80(%edx),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%esp),%xmm6
	pshufd	$16,32(%edx),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%esp),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%esp),%xmm5
	paddq	%xmm7,%xmm4
	pshufd	$16,96(%edx),%xmm7
	pmuludq	(%esp),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%esp),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%esp),%xmm5
	pshufd	$16,48(%edx),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%esp),%xmm6
	paddq	%xmm5,%xmm0
	pshufd	$16,112(%edx),%xmm5
	pmuludq	(%esp),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%esp),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%esp),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%esp),%xmm7
	pshufd	$16,64(%edx),%xmm5
	paddq	%xmm6,%xmm1
	pshufd	$16,128(%edx),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%esp),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%esp),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
.L017short_tail:
	/* add the two 64-bit lanes of every limb together, then reduce */
	pshufd	$78,%xmm4,%xmm6
	pshufd	$78,%xmm3,%xmm5
	paddq	%xmm6,%xmm4
	paddq	%xmm5,%xmm3
	pshufd	$78,%xmm0,%xmm6
	pshufd	$78,%xmm1,%xmm5
	paddq	%xmm6,%xmm0
	paddq	%xmm5,%xmm1
	pshufd	$78,%xmm2,%xmm6
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm6,%xmm2
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
.L013done:
	/* %edi = ctx+48 here, so -48..-32(%edi) is ctx[0..19] */
	movd	%xmm0,-48(%edi)
	movd	%xmm1,-44(%edi)
	movd	%xmm2,-40(%edi)
	movd	%xmm3,-36(%edi)
	movd	%xmm4,-32(%edi)
	movl	%ebp,%esp
.L007nodata:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	_poly1305_blocks_sse2,.-_poly1305_blocks_sse2

/*
 * _poly1305_emit_sse2(ctx, mac, nonce)  -- cdecl
 *   If ctx[20] is zero the accumulator is still in 32-bit words and the
 *   scalar code at .Lenter_emit finishes the job.  Otherwise the five
 *   26-bit limbs are recombined into 32-bit words, the bits above bit 1
 *   of the top word are folded back in times 5, and the same
 *   select/add-nonce/store sequence as poly1305_emit is applied.
 */
.align	32
.type	_poly1305_emit_sse2,@function
.align	16
_poly1305_emit_sse2:
#ifdef __CET__
	.byte	243,15,30,251		/* endbr32 */
#endif
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%ebp
	cmpl	$0,20(%ebp)
	je	.Lenter_emit
	movl	(%ebp),%eax
	movl	4(%ebp),%edi
	movl	8(%ebp),%ecx
	movl	12(%ebp),%edx
	movl	16(%ebp),%esi
	/* limb i contributes at bit 26*i: shift pairs 26/6, 20/12, 14/18, 8/24 */
	movl	%edi,%ebx
	shll	$26,%edi
	shrl	$6,%ebx
	addl	%edi,%eax
	movl	%ecx,%edi
	adcl	$0,%ebx
	shll	$20,%edi
	shrl	$12,%ecx
	addl	%edi,%ebx
	movl	%edx,%edi
	adcl	$0,%ecx
	shll	$14,%edi
	shrl	$18,%edx
	addl	%edi,%ecx
	movl	%esi,%edi
	adcl	$0,%edx
	shll	$8,%edi
	shrl	$24,%esi
	addl	%edi,%edx
	adcl	$0,%esi
	movl	%esi,%edi
	andl	$3,%esi
	shrl	$2,%edi
	leal	(%edi,%edi,4),%ebp	/* %ebp = 5 * (top word >> 2) */
	movl	24(%esp),%edi
	addl	%ebp,%eax
	movl	28(%esp),%ebp
	adcl	$0,%ebx
	adcl	$0,%ecx
	adcl	$0,%edx
	adcl	$0,%esi
	/* keep h in %xmm0..%xmm3 while h+5 is computed in the integer registers */
	movd	%eax,%xmm0
	addl	$5,%eax
	movd	%ebx,%xmm1
	adcl	$0,%ebx
	movd	%ecx,%xmm2
	adcl	$0,%ecx
	movd	%edx,%xmm3
	adcl	$0,%edx
	adcl	$0,%esi
	shrl	$2,%esi
	negl	%esi
	andl	%esi,%eax
	andl	%esi,%ebx
	andl	%esi,%ecx
	andl	%esi,%edx
	movl	%eax,(%edi)
	movd	%xmm0,%eax
	movl	%ebx,4(%edi)
	movd	%xmm1,%ebx
	movl	%ecx,8(%edi)
	movd	%xmm2,%ecx
	movl	%edx,12(%edi)
	movd	%xmm3,%edx
	notl	%esi
	andl	%esi,%eax
	andl	%esi,%ebx
	orl	(%edi),%eax
	andl	%esi,%ecx
	orl	4(%edi),%ebx
	andl	%esi,%edx
	orl	8(%edi),%ecx
	orl	12(%edi),%edx
	addl	(%ebp),%eax
	adcl	4(%ebp),%ebx
	movl	%eax,(%edi)
	adcl	8(%ebp),%ecx
	movl	%ebx,4(%edi)
	adcl	12(%ebp),%edx
	movl	%ecx,8(%edi)
	movl	%edx,12(%edi)
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	_poly1305_emit_sse2,.-_poly1305_emit_sse2

/*
 * _poly1305_init_avx2  -- internal helper, register calling convention
 *   Same contract as _poly1305_init_sse2 (in: %edi = ctx, %ebx = constant
 *   table; out: nine 16-byte entries at ctx+48..ctx+191, %edi restored;
 *   clobbers %ebp, %ecx, %edx, %xmm0-%xmm7, flags), written with
 *   VEX-encoded three-operand instructions on 128-bit registers.
 */
.align	32
.type	_poly1305_init_avx2,@function
.align	16
_poly1305_init_avx2:
#ifdef __CET__
	.byte	243,15,30,251		/* endbr32 */
#endif
	vmovdqu	24(%edi),%xmm4
	leal	48(%edi),%edi
	movl	%esp,%ebp
	subl	$224,%esp
	andl	$-16,%esp
	vmovdqa	64(%ebx),%xmm7
	vpand	%xmm7,%xmm4,%xmm0
	vpsrlq	$26,%xmm4,%xmm1
	vpsrldq	$6,%xmm4,%xmm3
	vpand	%xmm7,%xmm1,%xmm1
	vpsrlq	$4,%xmm3,%xmm2
	vpsrlq	$30,%xmm3,%xmm3
	vpand	%xmm7,%xmm2,%xmm2
	vpand	%xmm7,%xmm3,%xmm3
	vpsrldq	$13,%xmm4,%xmm4
	leal	144(%esp),%edx
	movl	$2,%ecx
.L018square:
	vmovdqa	%xmm0,(%esp)
	vmovdqa	%xmm1,16(%esp)
	vmovdqa	%xmm2,32(%esp)
	vmovdqa	%xmm3,48(%esp)
	vmovdqa	%xmm4,64(%esp)
	/* 5 * limb for limbs 1..4, kept at 80..128(%esp) */
	vpslld	$2,%xmm1,%xmm6
	vpslld	$2,%xmm2,%xmm5
	vpaddd	%xmm1,%xmm6,%xmm6
	vpaddd	%xmm2,%xmm5,%xmm5
	vmovdqa	%xmm6,80(%esp)
	vmovdqa	%xmm5,96(%esp)
	vpslld	$2,%xmm3,%xmm6
	vpslld	$2,%xmm4,%xmm5
	vpaddd	%xmm3,%xmm6,%xmm6
	vpaddd	%xmm4,%xmm5,%xmm5
	vmovdqa	%xmm6,112(%esp)
	vmovdqa	%xmm5,128(%esp)
	vpshufd	$68,%xmm0,%xmm5
	vmovdqa	%xmm1,%xmm6
	vpshufd	$68,%xmm1,%xmm1
	vpshufd	$68,%xmm2,%xmm2
	vpshufd	$68,%xmm3,%xmm3
	vpshufd	$68,%xmm4,%xmm4
	vmovdqa	%xmm5,(%edx)
	vmovdqa	%xmm1,16(%edx)
	vmovdqa	%xmm2,32(%edx)
	vmovdqa	%xmm3,48(%edx)
	vmovdqa	%xmm4,64(%edx)
	/* 5x5 limb multiplication, accumulating into %xmm0..%xmm4 */
	vpmuludq	%xmm0,%xmm4,%xmm4
	vpmuludq	%xmm0,%xmm3,%xmm3
	vpmuludq	%xmm0,%xmm2,%xmm2
	vpmuludq	%xmm0,%xmm1,%xmm1
	vpmuludq	%xmm0,%xmm5,%xmm0
	vpmuludq	48(%edx),%xmm6,%xmm5
	vpaddq	%xmm5,%xmm4,%xmm4
	vpmuludq	32(%edx),%xmm6,%xmm7
	vpaddq	%xmm7,%xmm3,%xmm3
	vpmuludq	16(%edx),%xmm6,%xmm5
	vpaddq	%xmm5,%xmm2,%xmm2
	vmovdqa	80(%esp),%xmm7
	vpmuludq	(%edx),%xmm6,%xmm6
	vpaddq	%xmm6,%xmm1,%xmm1
	vmovdqa	32(%esp),%xmm5
	vpmuludq	64(%edx),%xmm7,%xmm7
	vpaddq	%xmm7,%xmm0,%xmm0
	vpmuludq	32(%edx),%xmm5,%xmm6
	vpaddq	%xmm6,%xmm4,%xmm4
	vpmuludq	16(%edx),%xmm5,%xmm7
	vpaddq	%xmm7,%xmm3,%xmm3
	vmovdqa	96(%esp),%xmm6
	vpmuludq	(%edx),%xmm5,%xmm5
	vpaddq	%xmm5,%xmm2,%xmm2
	vpmuludq	64(%edx),%xmm6,%xmm7
	vpaddq	%xmm7,%xmm1,%xmm1
	vmovdqa	48(%esp),%xmm5
	vpmuludq	48(%edx),%xmm6,%xmm6
	vpaddq	%xmm6,%xmm0,%xmm0
	vpmuludq	16(%edx),%xmm5,%xmm7
	vpaddq	%xmm7,%xmm4,%xmm4
	vmovdqa	112(%esp),%xmm6
	vpmuludq	(%edx),%xmm5,%xmm5
	vpaddq	%xmm5,%xmm3,%xmm3
	vpmuludq	64(%edx),%xmm6,%xmm7
	vpaddq	%xmm7,%xmm2,%xmm2
	vpmuludq	48(%edx),%xmm6,%xmm5
	vpaddq	%xmm5,%xmm1,%xmm1
	vmovdqa	64(%esp),%xmm7
	vpmuludq	32(%edx),%xmm6,%xmm6
	vpaddq	%xmm6,%xmm0,%xmm0
	vmovdqa	128(%esp),%xmm5
	vpmuludq	(%edx),%xmm7,%xmm7
	vpaddq	%xmm7,%xmm4,%xmm4
	vpmuludq	64(%edx),%xmm5,%xmm6
	vpaddq	%xmm6,%xmm3,%xmm3
	vpmuludq	16(%edx),%xmm5,%xmm7
	vpaddq	%xmm7,%xmm0,%xmm0
	vpmuludq	32(%edx),%xmm5,%xmm6
	vpaddq	%xmm6,%xmm1,%xmm1
	vmovdqa	64(%ebx),%xmm7
	vpmuludq	48(%edx),%xmm5,%xmm5
	vpaddq	%xmm5,%xmm2,%xmm2
	/* carry propagation */
	vpsrlq	$26,%xmm3,%xmm5
	vpand	%xmm7,%xmm3,%xmm3
	vpsrlq	$26,%xmm0,%xmm6
	vpand	%xmm7,%xmm0,%xmm0
	vpaddq	%xmm5,%xmm4,%xmm4
	vpaddq	%xmm6,%xmm1,%xmm1
	vpsrlq	$26,%xmm4,%xmm5
	vpand	%xmm7,%xmm4,%xmm4
	vpsrlq	$26,%xmm1,%xmm6
	vpand	%xmm7,%xmm1,%xmm1
	vpaddq	%xmm6,%xmm2,%xmm2
	vpaddd	%xmm5,%xmm0,%xmm0
	vpsllq	$2,%xmm5,%xmm5
	vpsrlq	$26,%xmm2,%xmm6
	vpand	%xmm7,%xmm2,%xmm2
	vpaddd	%xmm5,%xmm0,%xmm0
	vpaddd	%xmm6,%xmm3,%xmm3
	vpsrlq	$26,%xmm3,%xmm6
	vpsrlq	$26,%xmm0,%xmm5
	vpand	%xmm7,%xmm0,%xmm0
	vpand	%xmm7,%xmm3,%xmm3
	vpaddd	%xmm5,%xmm1,%xmm1
	vpaddd	%xmm6,%xmm4,%xmm4
	decl	%ecx
	jz	.L019square_break
	vpunpcklqdq	(%esp),%xmm0,%xmm0
	vpunpcklqdq	16(%esp),%xmm1,%xmm1
	vpunpcklqdq	32(%esp),%xmm2,%xmm2
	vpunpcklqdq	48(%esp),%xmm3,%xmm3
	vpunpcklqdq	64(%esp),%xmm4,%xmm4
	jmp	.L018square
.L019square_break:
	vpsllq	$32,%xmm0,%xmm0
	vpsllq	$32,%xmm1,%xmm1
	vpsllq	$32,%xmm2,%xmm2
	vpsllq	$32,%xmm3,%xmm3
	vpsllq	$32,%xmm4,%xmm4
	vpor	(%esp),%xmm0,%xmm0
	vpor	16(%esp),%xmm1,%xmm1
	vpor	32(%esp),%xmm2,%xmm2
	vpor	48(%esp),%xmm3,%xmm3
	vpor	64(%esp),%xmm4,%xmm4
	vpshufd	$141,%xmm0,%xmm0
	vpshufd	$141,%xmm1,%xmm1
	vpshufd	$141,%xmm2,%xmm2
	vpshufd	$141,%xmm3,%xmm3
	vpshufd	$141,%xmm4,%xmm4
	/* table entries 0..4: the limbs; entries 5..8: 5 * limbs 1..4 */
	vmovdqu	%xmm0,(%edi)
	vmovdqu	%xmm1,16(%edi)
	vmovdqu	%xmm2,32(%edi)
	vmovdqu	%xmm3,48(%edi)
	vmovdqu	%xmm4,64(%edi)
	vpslld	$2,%xmm1,%xmm6
	vpslld	$2,%xmm2,%xmm5
	vpaddd	%xmm1,%xmm6,%xmm6
	vpaddd	%xmm2,%xmm5,%xmm5
	vmovdqu	%xmm6,80(%edi)
	vmovdqu	%xmm5,96(%edi)
	vpslld	$2,%xmm3,%xmm6
	vpslld	$2,%xmm4,%xmm5
	vpaddd	%xmm3,%xmm6,%xmm6
	vpaddd	%xmm4,%xmm5,%xmm5
	vmovdqu	%xmm6,112(%edi)
	vmovdqu	%xmm5,128(%edi)
	movl	%ebp,%esp
	leal	-48(%edi),%edi
	ret
.size	_poly1305_init_avx2,.-_poly1305_init_avx2

/*
 * _poly1305_blocks_avx2(ctx, inp, len, padbit)  -- cdecl, AVX2 path
 *   Entry logic mirrors _poly1305_blocks_sse2: len is rounded down to a
 *   multiple of 16, short inputs with ctx[20] == 0 go to .Lenter_blocks,
 *   and on first use the table is built and the accumulator is rewritten
 *   in place as five 26-bit limbs with ctx[20] set to 1.
 */
.align	32
.type	_poly1305_blocks_avx2,@function
.align	16
_poly1305_blocks_avx2:
#ifdef __CET__
	.byte	243,15,30,251		/* endbr32 */
#endif
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
	movl	20(%edi),%eax
	andl	$-16,%ecx
	jz	.L020nodata
	cmpl	$64,%ecx
	jae	.L021enter_avx2
	testl	%eax,%eax
	jz	.Lenter_blocks
.L021enter_avx2:
	vzeroupper
	call	.L022pic_point
.L022pic_point:
	popl	%ebx
	leal	.Lconst_sse2-.L022pic_point(%ebx),%ebx
	testl	%eax,%eax
	jnz	.L023base2_26
	call	_poly1305_init_avx2
	movl	(%edi),%eax
	movl	3(%edi),%ecx
	movl	6(%edi),%edx
	movl	9(%edi),%esi
	movl	13(%edi),%ebp
	shrl	$2,%ecx
	andl	$67108863,%eax
	shrl	$4,%edx
	andl	$67108863,%ecx
	shrl	$6,%esi
	andl	$67108863,%edx
	movl	%eax,(%edi)
	movl	%ecx,4(%edi)
	movl	%edx,8(%edi)
	movl	%esi,12(%edi)
	movl	%ebp,16(%edi)
	movl	$1,20(%edi)
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
.L023base2_26:
	movl	32(%esp),%eax
	movl	%esp,%ebp
	subl	$448,%esp
	andl	$-512,%esp
	vmovdqu	48(%edi),%xmm0
	leal	288(%esp),%edx
	vmovdqu	64(%edi),%xmm1
	vmovdqu	80(%edi),%xmm2
	vmovdqu	96(%edi),%xmm3
	vmovdqu	112(%edi),%xmm4
	leal	48(%edi),%edi
	vpermq	$64,%ymm0,%ymm0
	vpermq	$64,%ymm1,%ymm1
	vpermq	$64,%ymm2,%ymm2
	vpermq	$64,%ymm3,%ymm3
	vpermq	$64,%ymm4,%ymm4
	vpshufd	$200,%ymm0,%ymm0
	vpshufd	$200,%ymm1,%ymm1
	vpshufd	
$200,%ymm2,%ymm2 1627 vpshufd $200,%ymm3,%ymm3 1628 vpshufd $200,%ymm4,%ymm4 1629 vmovdqa %ymm0,-128(%edx) 1630 vmovdqu 80(%edi),%xmm0 1631 vmovdqa %ymm1,-96(%edx) 1632 vmovdqu 96(%edi),%xmm1 1633 vmovdqa %ymm2,-64(%edx) 1634 vmovdqu 112(%edi),%xmm2 1635 vmovdqa %ymm3,-32(%edx) 1636 vmovdqu 128(%edi),%xmm3 1637 vmovdqa %ymm4,(%edx) 1638 vpermq $64,%ymm0,%ymm0 1639 vpermq $64,%ymm1,%ymm1 1640 vpermq $64,%ymm2,%ymm2 1641 vpermq $64,%ymm3,%ymm3 1642 vpshufd $200,%ymm0,%ymm0 1643 vpshufd $200,%ymm1,%ymm1 1644 vpshufd $200,%ymm2,%ymm2 1645 vpshufd $200,%ymm3,%ymm3 1646 vmovdqa %ymm0,32(%edx) 1647 vmovd -48(%edi),%xmm0 1648 vmovdqa %ymm1,64(%edx) 1649 vmovd -44(%edi),%xmm1 1650 vmovdqa %ymm2,96(%edx) 1651 vmovd -40(%edi),%xmm2 1652 vmovdqa %ymm3,128(%edx) 1653 vmovd -36(%edi),%xmm3 1654 vmovd -32(%edi),%xmm4 1655 vmovdqa 64(%ebx),%ymm7 1656 negl %eax 1657 testl $63,%ecx 1658 jz .L024even 1659 movl %ecx,%edx 1660 andl $-64,%ecx 1661 andl $63,%edx 1662 vmovdqu (%esi),%xmm5 1663 cmpl $32,%edx 1664 jb .L025one 1665 vmovdqu 16(%esi),%xmm6 1666 je .L026two 1667 vinserti128 $1,32(%esi),%ymm5,%ymm5 1668 leal 48(%esi),%esi 1669 leal 8(%ebx),%ebx 1670 leal 296(%esp),%edx 1671 jmp .L027tail 1672.L026two: 1673 leal 32(%esi),%esi 1674 leal 16(%ebx),%ebx 1675 leal 304(%esp),%edx 1676 jmp .L027tail 1677.L025one: 1678 leal 16(%esi),%esi 1679 vpxor %ymm6,%ymm6,%ymm6 1680 leal 32(%ebx,%eax,8),%ebx 1681 leal 312(%esp),%edx 1682 jmp .L027tail 1683.align 32 1684.L024even: 1685 vmovdqu (%esi),%xmm5 1686 vmovdqu 16(%esi),%xmm6 1687 vinserti128 $1,32(%esi),%ymm5,%ymm5 1688 vinserti128 $1,48(%esi),%ymm6,%ymm6 1689 leal 64(%esi),%esi 1690 subl $64,%ecx 1691 jz .L027tail 1692.L028loop: 1693 vmovdqa %ymm2,64(%esp) 1694 vpsrldq $6,%ymm5,%ymm2 1695 vmovdqa %ymm0,(%esp) 1696 vpsrldq $6,%ymm6,%ymm0 1697 vmovdqa %ymm1,32(%esp) 1698 vpunpckhqdq %ymm6,%ymm5,%ymm1 1699 vpunpcklqdq %ymm6,%ymm5,%ymm5 1700 vpunpcklqdq %ymm0,%ymm2,%ymm2 1701 vpsrlq $30,%ymm2,%ymm0 1702 vpsrlq $4,%ymm2,%ymm2 1703 vpsrlq 
$26,%ymm5,%ymm6 1704 vpsrlq $40,%ymm1,%ymm1 1705 vpand %ymm7,%ymm2,%ymm2 1706 vpand %ymm7,%ymm5,%ymm5 1707 vpand %ymm7,%ymm6,%ymm6 1708 vpand %ymm7,%ymm0,%ymm0 1709 vpor (%ebx),%ymm1,%ymm1 1710 vpaddq 64(%esp),%ymm2,%ymm2 1711 vpaddq (%esp),%ymm5,%ymm5 1712 vpaddq 32(%esp),%ymm6,%ymm6 1713 vpaddq %ymm3,%ymm0,%ymm0 1714 vpaddq %ymm4,%ymm1,%ymm1 1715 vpmuludq -96(%edx),%ymm2,%ymm3 1716 vmovdqa %ymm6,32(%esp) 1717 vpmuludq -64(%edx),%ymm2,%ymm4 1718 vmovdqa %ymm0,96(%esp) 1719 vpmuludq 96(%edx),%ymm2,%ymm0 1720 vmovdqa %ymm1,128(%esp) 1721 vpmuludq 128(%edx),%ymm2,%ymm1 1722 vpmuludq -128(%edx),%ymm2,%ymm2 1723 vpmuludq -32(%edx),%ymm5,%ymm7 1724 vpaddq %ymm7,%ymm3,%ymm3 1725 vpmuludq (%edx),%ymm5,%ymm6 1726 vpaddq %ymm6,%ymm4,%ymm4 1727 vpmuludq -128(%edx),%ymm5,%ymm7 1728 vpaddq %ymm7,%ymm0,%ymm0 1729 vmovdqa 32(%esp),%ymm7 1730 vpmuludq -96(%edx),%ymm5,%ymm6 1731 vpaddq %ymm6,%ymm1,%ymm1 1732 vpmuludq -64(%edx),%ymm5,%ymm5 1733 vpaddq %ymm5,%ymm2,%ymm2 1734 vpmuludq -64(%edx),%ymm7,%ymm6 1735 vpaddq %ymm6,%ymm3,%ymm3 1736 vpmuludq -32(%edx),%ymm7,%ymm5 1737 vpaddq %ymm5,%ymm4,%ymm4 1738 vpmuludq 128(%edx),%ymm7,%ymm6 1739 vpaddq %ymm6,%ymm0,%ymm0 1740 vmovdqa 96(%esp),%ymm6 1741 vpmuludq -128(%edx),%ymm7,%ymm5 1742 vpaddq %ymm5,%ymm1,%ymm1 1743 vpmuludq -96(%edx),%ymm7,%ymm7 1744 vpaddq %ymm7,%ymm2,%ymm2 1745 vpmuludq -128(%edx),%ymm6,%ymm5 1746 vpaddq %ymm5,%ymm3,%ymm3 1747 vpmuludq -96(%edx),%ymm6,%ymm7 1748 vpaddq %ymm7,%ymm4,%ymm4 1749 vpmuludq 64(%edx),%ymm6,%ymm5 1750 vpaddq %ymm5,%ymm0,%ymm0 1751 vmovdqa 128(%esp),%ymm5 1752 vpmuludq 96(%edx),%ymm6,%ymm7 1753 vpaddq %ymm7,%ymm1,%ymm1 1754 vpmuludq 128(%edx),%ymm6,%ymm6 1755 vpaddq %ymm6,%ymm2,%ymm2 1756 vpmuludq 128(%edx),%ymm5,%ymm7 1757 vpaddq %ymm7,%ymm3,%ymm3 1758 vpmuludq 32(%edx),%ymm5,%ymm6 1759 vpaddq %ymm6,%ymm0,%ymm0 1760 vpmuludq -128(%edx),%ymm5,%ymm7 1761 vpaddq %ymm7,%ymm4,%ymm4 1762 vmovdqa 64(%ebx),%ymm7 1763 vpmuludq 64(%edx),%ymm5,%ymm6 1764 vpaddq %ymm6,%ymm1,%ymm1 1765 vpmuludq 
96(%edx),%ymm5,%ymm5 1766 vpaddq %ymm5,%ymm2,%ymm2 1767 vpsrlq $26,%ymm3,%ymm5 1768 vpand %ymm7,%ymm3,%ymm3 1769 vpsrlq $26,%ymm0,%ymm6 1770 vpand %ymm7,%ymm0,%ymm0 1771 vpaddq %ymm5,%ymm4,%ymm4 1772 vpaddq %ymm6,%ymm1,%ymm1 1773 vpsrlq $26,%ymm4,%ymm5 1774 vpand %ymm7,%ymm4,%ymm4 1775 vpsrlq $26,%ymm1,%ymm6 1776 vpand %ymm7,%ymm1,%ymm1 1777 vpaddq %ymm6,%ymm2,%ymm2 1778 vpaddq %ymm5,%ymm0,%ymm0 1779 vpsllq $2,%ymm5,%ymm5 1780 vpsrlq $26,%ymm2,%ymm6 1781 vpand %ymm7,%ymm2,%ymm2 1782 vpaddq %ymm5,%ymm0,%ymm0 1783 vpaddq %ymm6,%ymm3,%ymm3 1784 vpsrlq $26,%ymm3,%ymm6 1785 vpsrlq $26,%ymm0,%ymm5 1786 vpand %ymm7,%ymm0,%ymm0 1787 vpand %ymm7,%ymm3,%ymm3 1788 vpaddq %ymm5,%ymm1,%ymm1 1789 vpaddq %ymm6,%ymm4,%ymm4 1790 vmovdqu (%esi),%xmm5 1791 vmovdqu 16(%esi),%xmm6 1792 vinserti128 $1,32(%esi),%ymm5,%ymm5 1793 vinserti128 $1,48(%esi),%ymm6,%ymm6 1794 leal 64(%esi),%esi 1795 subl $64,%ecx 1796 jnz .L028loop 1797.L027tail: 1798 vmovdqa %ymm2,64(%esp) 1799 vpsrldq $6,%ymm5,%ymm2 1800 vmovdqa %ymm0,(%esp) 1801 vpsrldq $6,%ymm6,%ymm0 1802 vmovdqa %ymm1,32(%esp) 1803 vpunpckhqdq %ymm6,%ymm5,%ymm1 1804 vpunpcklqdq %ymm6,%ymm5,%ymm5 1805 vpunpcklqdq %ymm0,%ymm2,%ymm2 1806 vpsrlq $30,%ymm2,%ymm0 1807 vpsrlq $4,%ymm2,%ymm2 1808 vpsrlq $26,%ymm5,%ymm6 1809 vpsrlq $40,%ymm1,%ymm1 1810 vpand %ymm7,%ymm2,%ymm2 1811 vpand %ymm7,%ymm5,%ymm5 1812 vpand %ymm7,%ymm6,%ymm6 1813 vpand %ymm7,%ymm0,%ymm0 1814 vpor (%ebx),%ymm1,%ymm1 1815 andl $-64,%ebx 1816 vpaddq 64(%esp),%ymm2,%ymm2 1817 vpaddq (%esp),%ymm5,%ymm5 1818 vpaddq 32(%esp),%ymm6,%ymm6 1819 vpaddq %ymm3,%ymm0,%ymm0 1820 vpaddq %ymm4,%ymm1,%ymm1 1821 vpmuludq -92(%edx),%ymm2,%ymm3 1822 vmovdqa %ymm6,32(%esp) 1823 vpmuludq -60(%edx),%ymm2,%ymm4 1824 vmovdqa %ymm0,96(%esp) 1825 vpmuludq 100(%edx),%ymm2,%ymm0 1826 vmovdqa %ymm1,128(%esp) 1827 vpmuludq 132(%edx),%ymm2,%ymm1 1828 vpmuludq -124(%edx),%ymm2,%ymm2 1829 vpmuludq -28(%edx),%ymm5,%ymm7 1830 vpaddq %ymm7,%ymm3,%ymm3 1831 vpmuludq 4(%edx),%ymm5,%ymm6 1832 vpaddq 
%ymm6,%ymm4,%ymm4 1833 vpmuludq -124(%edx),%ymm5,%ymm7 1834 vpaddq %ymm7,%ymm0,%ymm0 1835 vmovdqa 32(%esp),%ymm7 1836 vpmuludq -92(%edx),%ymm5,%ymm6 1837 vpaddq %ymm6,%ymm1,%ymm1 1838 vpmuludq -60(%edx),%ymm5,%ymm5 1839 vpaddq %ymm5,%ymm2,%ymm2 1840 vpmuludq -60(%edx),%ymm7,%ymm6 1841 vpaddq %ymm6,%ymm3,%ymm3 1842 vpmuludq -28(%edx),%ymm7,%ymm5 1843 vpaddq %ymm5,%ymm4,%ymm4 1844 vpmuludq 132(%edx),%ymm7,%ymm6 1845 vpaddq %ymm6,%ymm0,%ymm0 1846 vmovdqa 96(%esp),%ymm6 1847 vpmuludq -124(%edx),%ymm7,%ymm5 1848 vpaddq %ymm5,%ymm1,%ymm1 1849 vpmuludq -92(%edx),%ymm7,%ymm7 1850 vpaddq %ymm7,%ymm2,%ymm2 1851 vpmuludq -124(%edx),%ymm6,%ymm5 1852 vpaddq %ymm5,%ymm3,%ymm3 1853 vpmuludq -92(%edx),%ymm6,%ymm7 1854 vpaddq %ymm7,%ymm4,%ymm4 1855 vpmuludq 68(%edx),%ymm6,%ymm5 1856 vpaddq %ymm5,%ymm0,%ymm0 1857 vmovdqa 128(%esp),%ymm5 1858 vpmuludq 100(%edx),%ymm6,%ymm7 1859 vpaddq %ymm7,%ymm1,%ymm1 1860 vpmuludq 132(%edx),%ymm6,%ymm6 1861 vpaddq %ymm6,%ymm2,%ymm2 1862 vpmuludq 132(%edx),%ymm5,%ymm7 1863 vpaddq %ymm7,%ymm3,%ymm3 1864 vpmuludq 36(%edx),%ymm5,%ymm6 1865 vpaddq %ymm6,%ymm0,%ymm0 1866 vpmuludq -124(%edx),%ymm5,%ymm7 1867 vpaddq %ymm7,%ymm4,%ymm4 1868 vmovdqa 64(%ebx),%ymm7 1869 vpmuludq 68(%edx),%ymm5,%ymm6 1870 vpaddq %ymm6,%ymm1,%ymm1 1871 vpmuludq 100(%edx),%ymm5,%ymm5 1872 vpaddq %ymm5,%ymm2,%ymm2 1873 vpsrldq $8,%ymm4,%ymm5 1874 vpsrldq $8,%ymm3,%ymm6 1875 vpaddq %ymm5,%ymm4,%ymm4 1876 vpsrldq $8,%ymm0,%ymm5 1877 vpaddq %ymm6,%ymm3,%ymm3 1878 vpsrldq $8,%ymm1,%ymm6 1879 vpaddq %ymm5,%ymm0,%ymm0 1880 vpsrldq $8,%ymm2,%ymm5 1881 vpaddq %ymm6,%ymm1,%ymm1 1882 vpermq $2,%ymm4,%ymm6 1883 vpaddq %ymm5,%ymm2,%ymm2 1884 vpermq $2,%ymm3,%ymm5 1885 vpaddq %ymm6,%ymm4,%ymm4 1886 vpermq $2,%ymm0,%ymm6 1887 vpaddq %ymm5,%ymm3,%ymm3 1888 vpermq $2,%ymm1,%ymm5 1889 vpaddq %ymm6,%ymm0,%ymm0 1890 vpermq $2,%ymm2,%ymm6 1891 vpaddq %ymm5,%ymm1,%ymm1 1892 vpaddq %ymm6,%ymm2,%ymm2 1893 vpsrlq $26,%ymm3,%ymm5 1894 vpand %ymm7,%ymm3,%ymm3 1895 vpsrlq $26,%ymm0,%ymm6 1896 vpand 
%ymm7,%ymm0,%ymm0 1897 vpaddq %ymm5,%ymm4,%ymm4 1898 vpaddq %ymm6,%ymm1,%ymm1 1899 vpsrlq $26,%ymm4,%ymm5 1900 vpand %ymm7,%ymm4,%ymm4 1901 vpsrlq $26,%ymm1,%ymm6 1902 vpand %ymm7,%ymm1,%ymm1 1903 vpaddq %ymm6,%ymm2,%ymm2 1904 vpaddq %ymm5,%ymm0,%ymm0 1905 vpsllq $2,%ymm5,%ymm5 1906 vpsrlq $26,%ymm2,%ymm6 1907 vpand %ymm7,%ymm2,%ymm2 1908 vpaddq %ymm5,%ymm0,%ymm0 1909 vpaddq %ymm6,%ymm3,%ymm3 1910 vpsrlq $26,%ymm3,%ymm6 1911 vpsrlq $26,%ymm0,%ymm5 1912 vpand %ymm7,%ymm0,%ymm0 1913 vpand %ymm7,%ymm3,%ymm3 1914 vpaddq %ymm5,%ymm1,%ymm1 1915 vpaddq %ymm6,%ymm4,%ymm4 1916 cmpl $0,%ecx 1917 je .L029done 1918 vpshufd $252,%xmm0,%xmm0 1919 leal 288(%esp),%edx 1920 vpshufd $252,%xmm1,%xmm1 1921 vpshufd $252,%xmm2,%xmm2 1922 vpshufd $252,%xmm3,%xmm3 1923 vpshufd $252,%xmm4,%xmm4 1924 jmp .L024even 1925.align 16 1926.L029done: 1927 vmovd %xmm0,-48(%edi) 1928 vmovd %xmm1,-44(%edi) 1929 vmovd %xmm2,-40(%edi) 1930 vmovd %xmm3,-36(%edi) 1931 vmovd %xmm4,-32(%edi) 1932 vzeroupper 1933 movl %ebp,%esp 1934.L020nodata: 1935 popl %edi 1936 popl %esi 1937 popl %ebx 1938 popl %ebp 1939 ret 1940.size _poly1305_blocks_avx2,.-_poly1305_blocks_avx2 1941.align 64 1942.Lconst_sse2: 1943.long 16777216,0,16777216,0,16777216,0,16777216,0 1944.long 0,0,0,0,0,0,0,0 1945.long 67108863,0,67108863,0,67108863,0,67108863,0 1946.long 268435455,268435452,268435452,268435452 1947.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54 1948.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 1949.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 1950.byte 114,103,62,0 1951.align 4 1952.comm OPENSSL_ia32cap_P,16,4 1953 1954 .section ".note.gnu.property", "a" 1955 .p2align 2 1956 .long 1f - 0f 1957 .long 4f - 1f 1958 .long 5 19590: 1960 .asciz "GNU" 19611: 1962 .p2align 2 1963 .long 0xc0000002 1964 .long 3f - 2f 19652: 1966 .long 3 19673: 1968 .p2align 2 19694: 1970#else 1971.text 1972.align 64 1973.globl poly1305_init 1974.type poly1305_init,@function 1975.align 16 1976poly1305_init: 
/*
 * poly1305_init (non-PIC build) -- body.
 *
 * Stack arguments (cdecl, read after the four pushes below):
 *   20(%esp) -> %edi  context block
 *   24(%esp) -> %esi  key pointer (may be NULL)
 *   28(%esp) -> %ebp  two-slot table that receives the blocks/emit
 *                     entry points chosen for this CPU
 * Returns %eax = 0 when the key pointer is NULL, 1 otherwise.
 * The first 24 bytes of the context are always zeroed.
 */
.L_poly1305_init_begin:
	#ifdef __CET__

.byte	243,15,30,251
	#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ebp
	xorl	%eax,%eax
	movl	%eax,(%edi)			/* zero ctx[0..23] */
	movl	%eax,4(%edi)
	movl	%eax,8(%edi)
	movl	%eax,12(%edi)
	movl	%eax,16(%edi)
	movl	%eax,20(%edi)
	cmpl	$0,%esi
	je	.L000nokey			/* no key: return 0 */
	call	.L001pic_point			/* call/pop: %ebx = address of .L001pic_point */
.L001pic_point:
	popl	%ebx
	leal	poly1305_blocks-.L001pic_point(%ebx),%eax	/* default: scalar routines */
	leal	poly1305_emit-.L001pic_point(%ebx),%edx
	leal	OPENSSL_ia32cap_P,%edi
	movl	(%edi),%ecx
	andl	$83886080,%ecx			/* 0x05000000: bits 24 and 26 (presumably FXSR+SSE2) */
	cmpl	$83886080,%ecx
	jne	.L002no_sse2			/* both bits required for the SSE2 code */
	leal	_poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
	leal	_poly1305_emit_sse2-.L001pic_point(%ebx),%edx
	movl	8(%edi),%ecx
	testl	$32,%ecx			/* bit 5 of capability word 2 selects the AVX2 code */
	jz	.L002no_sse2
	leal	_poly1305_blocks_avx2-.L001pic_point(%ebx),%eax
.L002no_sse2:
	movl	20(%esp),%edi			/* reload ctx (%edi was reused above) */
	movl	%eax,(%ebp)			/* table[0] = blocks routine */
	movl	%edx,4(%ebp)			/* table[1] = emit routine   */
	movl	(%esi),%eax			/* first 16 key bytes = r */
	movl	4(%esi),%ebx
	movl	8(%esi),%ecx
	movl	12(%esi),%edx
	andl	$268435455,%eax			/* clamp r: 0x0fffffff */
	andl	$268435452,%ebx			/*          0x0ffffffc */
	andl	$268435452,%ecx
	andl	$268435452,%edx
	movl	%eax,24(%edi)			/* clamped r stored at ctx+24..39 */
	movl	%ebx,28(%edi)
	movl	%ecx,32(%edi)
	movl	%edx,36(%edi)
	movl	$1,%eax
.L000nokey:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	poly1305_init,.-.L_poly1305_init_begin

/*
 * poly1305_blocks -- scalar (base 2^32) block routine.
 *
 * Stack arguments (cdecl):
 *   arg0  context: h at ctx+0..19 (five 32-bit words), r at ctx+24..39
 *   arg1  input pointer
 *   arg2  byte length; only whole 16-byte blocks are consumed
 *   arg3  value added into the top word of h for every block
 *
 * .Lenter_blocks is also a branch target for _poly1305_blocks_sse2 and
 * _poly1305_blocks_avx2, which arrive with %edi/%esi/%ecx already loaded
 * and the same four registers pushed.
 *
 * Frame after "subl $64,%esp":
 *    0(%esp),12(%esp),16(%esp)  h0, h3, h4 spilled for the multiply
 *   20..28(%esp)                product words d0..d2
 *   36..48(%esp)                r0..r3
 *   52..60(%esp)                s1..s3, where s_i = r_i + (r_i >> 2)
 *   84(%esp)                    arg0 (ctx)
 *   92(%esp)                    arg2 slot, overwritten with the end pointer
 *   96(%esp)                    arg3
 */
.globl	poly1305_blocks
.type	poly1305_blocks,@function
.align	16
poly1305_blocks:
.L_poly1305_blocks_begin:
	#ifdef __CET__

.byte	243,15,30,251
	#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
.Lenter_blocks:
	/*
	 * Round the length down to a multiple of 16.  The mask used to be
	 * $-15 (0xfffffff1), which keeps bit 0: an odd length then passed
	 * the zero check and produced an end pointer that the 16-byte
	 * stride below can never hit, so the loop ran past the buffer.
	 * $-16 matches the masks in the SSE2/AVX2 entry points and is
	 * identical for every length that is already a multiple of 16.
	 * NOTE: this file is generated from poly1305-x86.pl; the generator
	 * (and the PIC copy of this routine) need the same change or a
	 * regeneration will bring the old mask back.
	 */
	andl	$-16,%ecx
	jz	.L003nodata
	subl	$64,%esp
	movl	24(%edi),%eax			/* r0 */
	movl	28(%edi),%ebx			/* r1 */
	leal	(%esi,%ecx,1),%ebp		/* end = inp + len */
	movl	32(%edi),%ecx			/* r2 */
	movl	36(%edi),%edx			/* r3 */
	movl	%ebp,92(%esp)			/* keep end pointer in the len slot */
	movl	%esi,%ebp			/* %ebp = running input pointer */
	movl	%eax,36(%esp)
	movl	%ebx,%eax
	shrl	$2,%eax
	movl	%ebx,40(%esp)
	addl	%ebx,%eax			/* s1 = r1 + (r1 >> 2) */
	movl	%ecx,%ebx
	shrl	$2,%ebx
	movl	%ecx,44(%esp)
	addl	%ecx,%ebx			/* s2 = r2 + (r2 >> 2) */
	movl	%edx,%ecx
	shrl	$2,%ecx
	movl	%edx,48(%esp)
	addl	%edx,%ecx			/* s3 = r3 + (r3 >> 2) */
	movl	%eax,52(%esp)
	movl	%ebx,56(%esp)
	movl	%ecx,60(%esp)
	movl	(%edi),%eax			/* h0..h4 -> eax,ebx,ecx,esi,edi */
	movl	4(%edi),%ebx
	movl	8(%edi),%ecx
	movl	12(%edi),%esi
	movl	16(%edi),%edi
	jmp	.L004loop
.align	32
.L004loop:
	/* h += next 16 input bytes, plus arg3 into the top word */
	addl	(%ebp),%eax
	adcl	4(%ebp),%ebx
	adcl	8(%ebp),%ecx
	adcl	12(%ebp),%esi
	leal	16(%ebp),%ebp			/* advance input; lea leaves CF intact */
	adcl	96(%esp),%edi
	movl	%eax,(%esp)			/* spill h0, h3, h4 */
	movl	%esi,12(%esp)
	/* d0 = h0*r0 + h1*s3 + h2*s2 + h3*s1  (accumulated in esi:edi) */
	mull	36(%esp)
	movl	%edi,16(%esp)
	movl	%eax,%edi
	movl	%ebx,%eax
	movl	%edx,%esi
	mull	60(%esp)
	addl	%eax,%edi
	movl	%ecx,%eax
	adcl	%edx,%esi
	mull	56(%esp)
	addl	%eax,%edi
	movl	12(%esp),%eax
	adcl	%edx,%esi
	mull	52(%esp)
	addl	%eax,%edi
	movl	(%esp),%eax
	adcl	%edx,%esi
	/* d1 = h0*r1 + h1*r0 + h2*s3 + h3*s2 + h4*s1  (+ carry from d0) */
	mull	40(%esp)
	movl	%edi,20(%esp)			/* store d0 */
	xorl	%edi,%edi
	addl	%eax,%esi
	movl	%ebx,%eax
	adcl	%edx,%edi
	mull	36(%esp)
	addl	%eax,%esi
	movl	%ecx,%eax
	adcl	%edx,%edi
	mull	60(%esp)
	addl	%eax,%esi
	movl	12(%esp),%eax
	adcl	%edx,%edi
	mull	56(%esp)
	addl	%eax,%esi
	movl	16(%esp),%eax
	adcl	%edx,%edi
	imull	52(%esp),%eax
	addl	%eax,%esi
	movl	(%esp),%eax
	adcl	$0,%edi
	/* d2 = h0*r2 + h1*r1 + h2*r0 + h3*s3 + h4*s2  (+ carry from d1) */
	mull	44(%esp)
	movl	%esi,24(%esp)			/* store d1 */
	xorl	%esi,%esi
	addl	%eax,%edi
	movl	%ebx,%eax
	adcl	%edx,%esi
	mull	40(%esp)
	addl	%eax,%edi
	movl	%ecx,%eax
	adcl	%edx,%esi
	mull	36(%esp)
	addl	%eax,%edi
	movl	12(%esp),%eax
	adcl	%edx,%esi
	mull	60(%esp)
	addl	%eax,%edi
	movl	16(%esp),%eax
	adcl	%edx,%esi
	imull	56(%esp),%eax
	addl	%eax,%edi
	movl	(%esp),%eax
	adcl	$0,%esi
	/* d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s3  (+ carry from d2) */
	mull	48(%esp)
	movl	%edi,28(%esp)			/* store d2 */
	xorl	%edi,%edi
	addl	%eax,%esi
	movl	%ebx,%eax
	adcl	%edx,%edi
	mull	44(%esp)
	addl	%eax,%esi
	movl	%ecx,%eax
	adcl	%edx,%edi
	mull	40(%esp)
	addl	%eax,%esi
	movl	12(%esp),%eax
	adcl	%edx,%edi
	mull	36(%esp)
	addl	%eax,%esi
	movl	16(%esp),%ecx
	adcl	%edx,%edi
	movl	%ecx,%edx
	imull	60(%esp),%ecx
	addl	%ecx,%esi
	movl	20(%esp),%eax			/* reload d0 */
	adcl	$0,%edi
	/* d4 = h4*r0 + carry from d3 */
	imull	36(%esp),%edx
	addl	%edi,%edx
	movl	24(%esp),%ebx			/* reload d1, d2 */
	movl	28(%esp),%ecx
	/* partial reduction: keep the low 2 bits of d4, fold (d4 >> 2) * 5 into d0 */
	movl	%edx,%edi
	shrl	$2,%edx
	andl	$3,%edi
	leal	(%edx,%edx,4),%edx
	addl	%edx,%eax
	adcl	$0,%ebx
	adcl	$0,%ecx
	adcl	$0,%esi
	adcl	$0,%edi
	cmpl	92(%esp),%ebp			/* reached end pointer? */
	jne	.L004loop
	movl	84(%esp),%edx			/* ctx */
	addl	$64,%esp
	movl	%eax,(%edx)			/* write h back */
	movl	%ebx,4(%edx)
	movl	%ecx,8(%edx)
	movl	%esi,12(%edx)
	movl	%edi,16(%edx)
.L003nodata:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	poly1305_blocks,.-.L_poly1305_blocks_begin

/*
 * poly1305_emit -- first half (the routine continues past this point).
 *
 * Stack arguments (cdecl): arg0 = context (h at ctx+0..19),
 * arg1 = 16-byte output buffer, arg2 = pointer loaded into %ebp below
 * once h has been re-read.
 */
.globl	poly1305_emit
.type	poly1305_emit,@function
.align	16
poly1305_emit:
.L_poly1305_emit_begin:
	#ifdef __CET__

.byte	243,15,30,251
	#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%ebp
.Lenter_emit:
	movl	24(%esp),%edi
	movl	(%ebp),%eax
	movl	4(%ebp),%ebx
	movl	8(%ebp),%ecx
	movl	12(%ebp),%edx
	movl	16(%ebp),%esi
	addl	$5,%eax				/* compute h + 5 across all five words */
	adcl	$0,%ebx
	adcl	$0,%ecx
	adcl	$0,%edx
	adcl	$0,%esi
	shrl	$2,%esi				/* bit 2 of the top word -> bit 0 */
	negl	%esi				/* negate: 0 stays 0, 1 becomes all-ones */
	andl	%esi,%eax			/* (h + 5) & mask */
	andl	%esi,%ebx
	andl	%esi,%ecx
	andl	%esi,%edx
	movl	%eax,(%edi)			/* park masked h + 5 in the output buffer */
	movl	%ebx,4(%edi)
	movl	%ecx,8(%edi)
	movl	%edx,12(%edi)
	notl	%esi				/* complementary mask for the original h */
	movl	(%ebp),%eax
	movl	4(%ebp),%ebx
	movl	8(%ebp),%ecx
	movl	12(%ebp),%edx
	movl	28(%esp),%ebp
	andl	%esi,%eax
	andl	%esi,%ebx
andl %esi,%ecx 2258 andl %esi,%edx 2259 orl (%edi),%eax 2260 orl 4(%edi),%ebx 2261 orl 8(%edi),%ecx 2262 orl 12(%edi),%edx 2263 addl (%ebp),%eax 2264 adcl 4(%ebp),%ebx 2265 adcl 8(%ebp),%ecx 2266 adcl 12(%ebp),%edx 2267 movl %eax,(%edi) 2268 movl %ebx,4(%edi) 2269 movl %ecx,8(%edi) 2270 movl %edx,12(%edi) 2271 popl %edi 2272 popl %esi 2273 popl %ebx 2274 popl %ebp 2275 ret 2276.size poly1305_emit,.-.L_poly1305_emit_begin 2277.align 32 2278.type _poly1305_init_sse2,@function 2279.align 16 2280_poly1305_init_sse2: 2281 #ifdef __CET__ 2282 2283.byte 243,15,30,251 2284 #endif 2285 2286 movdqu 24(%edi),%xmm4 2287 leal 48(%edi),%edi 2288 movl %esp,%ebp 2289 subl $224,%esp 2290 andl $-16,%esp 2291 movq 64(%ebx),%xmm7 2292 movdqa %xmm4,%xmm0 2293 movdqa %xmm4,%xmm1 2294 movdqa %xmm4,%xmm2 2295 pand %xmm7,%xmm0 2296 psrlq $26,%xmm1 2297 psrldq $6,%xmm2 2298 pand %xmm7,%xmm1 2299 movdqa %xmm2,%xmm3 2300 psrlq $4,%xmm2 2301 psrlq $30,%xmm3 2302 pand %xmm7,%xmm2 2303 pand %xmm7,%xmm3 2304 psrldq $13,%xmm4 2305 leal 144(%esp),%edx 2306 movl $2,%ecx 2307.L005square: 2308 movdqa %xmm0,(%esp) 2309 movdqa %xmm1,16(%esp) 2310 movdqa %xmm2,32(%esp) 2311 movdqa %xmm3,48(%esp) 2312 movdqa %xmm4,64(%esp) 2313 movdqa %xmm1,%xmm6 2314 movdqa %xmm2,%xmm5 2315 pslld $2,%xmm6 2316 pslld $2,%xmm5 2317 paddd %xmm1,%xmm6 2318 paddd %xmm2,%xmm5 2319 movdqa %xmm6,80(%esp) 2320 movdqa %xmm5,96(%esp) 2321 movdqa %xmm3,%xmm6 2322 movdqa %xmm4,%xmm5 2323 pslld $2,%xmm6 2324 pslld $2,%xmm5 2325 paddd %xmm3,%xmm6 2326 paddd %xmm4,%xmm5 2327 movdqa %xmm6,112(%esp) 2328 movdqa %xmm5,128(%esp) 2329 pshufd $68,%xmm0,%xmm6 2330 movdqa %xmm1,%xmm5 2331 pshufd $68,%xmm1,%xmm1 2332 pshufd $68,%xmm2,%xmm2 2333 pshufd $68,%xmm3,%xmm3 2334 pshufd $68,%xmm4,%xmm4 2335 movdqa %xmm6,(%edx) 2336 movdqa %xmm1,16(%edx) 2337 movdqa %xmm2,32(%edx) 2338 movdqa %xmm3,48(%edx) 2339 movdqa %xmm4,64(%edx) 2340 pmuludq %xmm0,%xmm4 2341 pmuludq %xmm0,%xmm3 2342 pmuludq %xmm0,%xmm2 2343 pmuludq %xmm0,%xmm1 2344 pmuludq 
%xmm6,%xmm0 2345 movdqa %xmm5,%xmm6 2346 pmuludq 48(%edx),%xmm5 2347 movdqa %xmm6,%xmm7 2348 pmuludq 32(%edx),%xmm6 2349 paddq %xmm5,%xmm4 2350 movdqa %xmm7,%xmm5 2351 pmuludq 16(%edx),%xmm7 2352 paddq %xmm6,%xmm3 2353 movdqa 80(%esp),%xmm6 2354 pmuludq (%edx),%xmm5 2355 paddq %xmm7,%xmm2 2356 pmuludq 64(%edx),%xmm6 2357 movdqa 32(%esp),%xmm7 2358 paddq %xmm5,%xmm1 2359 movdqa %xmm7,%xmm5 2360 pmuludq 32(%edx),%xmm7 2361 paddq %xmm6,%xmm0 2362 movdqa %xmm5,%xmm6 2363 pmuludq 16(%edx),%xmm5 2364 paddq %xmm7,%xmm4 2365 movdqa 96(%esp),%xmm7 2366 pmuludq (%edx),%xmm6 2367 paddq %xmm5,%xmm3 2368 movdqa %xmm7,%xmm5 2369 pmuludq 64(%edx),%xmm7 2370 paddq %xmm6,%xmm2 2371 pmuludq 48(%edx),%xmm5 2372 movdqa 48(%esp),%xmm6 2373 paddq %xmm7,%xmm1 2374 movdqa %xmm6,%xmm7 2375 pmuludq 16(%edx),%xmm6 2376 paddq %xmm5,%xmm0 2377 movdqa 112(%esp),%xmm5 2378 pmuludq (%edx),%xmm7 2379 paddq %xmm6,%xmm4 2380 movdqa %xmm5,%xmm6 2381 pmuludq 64(%edx),%xmm5 2382 paddq %xmm7,%xmm3 2383 movdqa %xmm6,%xmm7 2384 pmuludq 48(%edx),%xmm6 2385 paddq %xmm5,%xmm2 2386 pmuludq 32(%edx),%xmm7 2387 movdqa 64(%esp),%xmm5 2388 paddq %xmm6,%xmm1 2389 movdqa 128(%esp),%xmm6 2390 pmuludq (%edx),%xmm5 2391 paddq %xmm7,%xmm0 2392 movdqa %xmm6,%xmm7 2393 pmuludq 64(%edx),%xmm6 2394 paddq %xmm5,%xmm4 2395 movdqa %xmm7,%xmm5 2396 pmuludq 16(%edx),%xmm7 2397 paddq %xmm6,%xmm3 2398 movdqa %xmm5,%xmm6 2399 pmuludq 32(%edx),%xmm5 2400 paddq %xmm7,%xmm0 2401 pmuludq 48(%edx),%xmm6 2402 movdqa 64(%ebx),%xmm7 2403 paddq %xmm5,%xmm1 2404 paddq %xmm6,%xmm2 2405 movdqa %xmm3,%xmm5 2406 pand %xmm7,%xmm3 2407 psrlq $26,%xmm5 2408 paddq %xmm4,%xmm5 2409 movdqa %xmm0,%xmm6 2410 pand %xmm7,%xmm0 2411 psrlq $26,%xmm6 2412 movdqa %xmm5,%xmm4 2413 paddq %xmm1,%xmm6 2414 psrlq $26,%xmm5 2415 pand %xmm7,%xmm4 2416 movdqa %xmm6,%xmm1 2417 psrlq $26,%xmm6 2418 paddd %xmm5,%xmm0 2419 psllq $2,%xmm5 2420 paddq %xmm2,%xmm6 2421 paddq %xmm0,%xmm5 2422 pand %xmm7,%xmm1 2423 movdqa %xmm6,%xmm2 2424 psrlq $26,%xmm6 2425 pand %xmm7,%xmm2 
2426 paddd %xmm3,%xmm6 2427 movdqa %xmm5,%xmm0 2428 psrlq $26,%xmm5 2429 movdqa %xmm6,%xmm3 2430 psrlq $26,%xmm6 2431 pand %xmm7,%xmm0 2432 paddd %xmm5,%xmm1 2433 pand %xmm7,%xmm3 2434 paddd %xmm6,%xmm4 2435 decl %ecx 2436 jz .L006square_break 2437 punpcklqdq (%esp),%xmm0 2438 punpcklqdq 16(%esp),%xmm1 2439 punpcklqdq 32(%esp),%xmm2 2440 punpcklqdq 48(%esp),%xmm3 2441 punpcklqdq 64(%esp),%xmm4 2442 jmp .L005square 2443.L006square_break: 2444 psllq $32,%xmm0 2445 psllq $32,%xmm1 2446 psllq $32,%xmm2 2447 psllq $32,%xmm3 2448 psllq $32,%xmm4 2449 por (%esp),%xmm0 2450 por 16(%esp),%xmm1 2451 por 32(%esp),%xmm2 2452 por 48(%esp),%xmm3 2453 por 64(%esp),%xmm4 2454 pshufd $141,%xmm0,%xmm0 2455 pshufd $141,%xmm1,%xmm1 2456 pshufd $141,%xmm2,%xmm2 2457 pshufd $141,%xmm3,%xmm3 2458 pshufd $141,%xmm4,%xmm4 2459 movdqu %xmm0,(%edi) 2460 movdqu %xmm1,16(%edi) 2461 movdqu %xmm2,32(%edi) 2462 movdqu %xmm3,48(%edi) 2463 movdqu %xmm4,64(%edi) 2464 movdqa %xmm1,%xmm6 2465 movdqa %xmm2,%xmm5 2466 pslld $2,%xmm6 2467 pslld $2,%xmm5 2468 paddd %xmm1,%xmm6 2469 paddd %xmm2,%xmm5 2470 movdqu %xmm6,80(%edi) 2471 movdqu %xmm5,96(%edi) 2472 movdqa %xmm3,%xmm6 2473 movdqa %xmm4,%xmm5 2474 pslld $2,%xmm6 2475 pslld $2,%xmm5 2476 paddd %xmm3,%xmm6 2477 paddd %xmm4,%xmm5 2478 movdqu %xmm6,112(%edi) 2479 movdqu %xmm5,128(%edi) 2480 movl %ebp,%esp 2481 leal -48(%edi),%edi 2482 ret 2483.size _poly1305_init_sse2,.-_poly1305_init_sse2 2484.align 32 2485.type _poly1305_blocks_sse2,@function 2486.align 16 2487_poly1305_blocks_sse2: 2488 #ifdef __CET__ 2489 2490.byte 243,15,30,251 2491 #endif 2492 2493 pushl %ebp 2494 pushl %ebx 2495 pushl %esi 2496 pushl %edi 2497 movl 20(%esp),%edi 2498 movl 24(%esp),%esi 2499 movl 28(%esp),%ecx 2500 movl 20(%edi),%eax 2501 andl $-16,%ecx 2502 jz .L007nodata 2503 cmpl $64,%ecx 2504 jae .L008enter_sse2 2505 testl %eax,%eax 2506 jz .Lenter_blocks 2507.align 16 2508.L008enter_sse2: 2509 call .L009pic_point 2510.L009pic_point: 2511 popl %ebx 2512 leal 
.Lconst_sse2-.L009pic_point(%ebx),%ebx 2513 testl %eax,%eax 2514 jnz .L010base2_26 2515 call _poly1305_init_sse2 2516 movl (%edi),%eax 2517 movl 3(%edi),%ecx 2518 movl 6(%edi),%edx 2519 movl 9(%edi),%esi 2520 movl 13(%edi),%ebp 2521 movl $1,20(%edi) 2522 shrl $2,%ecx 2523 andl $67108863,%eax 2524 shrl $4,%edx 2525 andl $67108863,%ecx 2526 shrl $6,%esi 2527 andl $67108863,%edx 2528 movd %eax,%xmm0 2529 movd %ecx,%xmm1 2530 movd %edx,%xmm2 2531 movd %esi,%xmm3 2532 movd %ebp,%xmm4 2533 movl 24(%esp),%esi 2534 movl 28(%esp),%ecx 2535 jmp .L011base2_32 2536.align 16 2537.L010base2_26: 2538 movd (%edi),%xmm0 2539 movd 4(%edi),%xmm1 2540 movd 8(%edi),%xmm2 2541 movd 12(%edi),%xmm3 2542 movd 16(%edi),%xmm4 2543 movdqa 64(%ebx),%xmm7 2544.L011base2_32: 2545 movl 32(%esp),%eax 2546 movl %esp,%ebp 2547 subl $528,%esp 2548 andl $-16,%esp 2549 leal 48(%edi),%edi 2550 shll $24,%eax 2551 testl $31,%ecx 2552 jz .L012even 2553 movdqu (%esi),%xmm6 2554 leal 16(%esi),%esi 2555 movdqa %xmm6,%xmm5 2556 pand %xmm7,%xmm6 2557 paddd %xmm6,%xmm0 2558 movdqa %xmm5,%xmm6 2559 psrlq $26,%xmm5 2560 psrldq $6,%xmm6 2561 pand %xmm7,%xmm5 2562 paddd %xmm5,%xmm1 2563 movdqa %xmm6,%xmm5 2564 psrlq $4,%xmm6 2565 pand %xmm7,%xmm6 2566 paddd %xmm6,%xmm2 2567 movdqa %xmm5,%xmm6 2568 psrlq $30,%xmm5 2569 pand %xmm7,%xmm5 2570 psrldq $7,%xmm6 2571 paddd %xmm5,%xmm3 2572 movd %eax,%xmm5 2573 paddd %xmm6,%xmm4 2574 movd 12(%edi),%xmm6 2575 paddd %xmm5,%xmm4 2576 movdqa %xmm0,(%esp) 2577 movdqa %xmm1,16(%esp) 2578 movdqa %xmm2,32(%esp) 2579 movdqa %xmm3,48(%esp) 2580 movdqa %xmm4,64(%esp) 2581 pmuludq %xmm6,%xmm0 2582 pmuludq %xmm6,%xmm1 2583 pmuludq %xmm6,%xmm2 2584 movd 28(%edi),%xmm5 2585 pmuludq %xmm6,%xmm3 2586 pmuludq %xmm6,%xmm4 2587 movdqa %xmm5,%xmm6 2588 pmuludq 48(%esp),%xmm5 2589 movdqa %xmm6,%xmm7 2590 pmuludq 32(%esp),%xmm6 2591 paddq %xmm5,%xmm4 2592 movdqa %xmm7,%xmm5 2593 pmuludq 16(%esp),%xmm7 2594 paddq %xmm6,%xmm3 2595 movd 92(%edi),%xmm6 2596 pmuludq (%esp),%xmm5 2597 paddq %xmm7,%xmm2 
2598 pmuludq 64(%esp),%xmm6 2599 movd 44(%edi),%xmm7 2600 paddq %xmm5,%xmm1 2601 movdqa %xmm7,%xmm5 2602 pmuludq 32(%esp),%xmm7 2603 paddq %xmm6,%xmm0 2604 movdqa %xmm5,%xmm6 2605 pmuludq 16(%esp),%xmm5 2606 paddq %xmm7,%xmm4 2607 movd 108(%edi),%xmm7 2608 pmuludq (%esp),%xmm6 2609 paddq %xmm5,%xmm3 2610 movdqa %xmm7,%xmm5 2611 pmuludq 64(%esp),%xmm7 2612 paddq %xmm6,%xmm2 2613 pmuludq 48(%esp),%xmm5 2614 movd 60(%edi),%xmm6 2615 paddq %xmm7,%xmm1 2616 movdqa %xmm6,%xmm7 2617 pmuludq 16(%esp),%xmm6 2618 paddq %xmm5,%xmm0 2619 movd 124(%edi),%xmm5 2620 pmuludq (%esp),%xmm7 2621 paddq %xmm6,%xmm4 2622 movdqa %xmm5,%xmm6 2623 pmuludq 64(%esp),%xmm5 2624 paddq %xmm7,%xmm3 2625 movdqa %xmm6,%xmm7 2626 pmuludq 48(%esp),%xmm6 2627 paddq %xmm5,%xmm2 2628 pmuludq 32(%esp),%xmm7 2629 movd 76(%edi),%xmm5 2630 paddq %xmm6,%xmm1 2631 movd 140(%edi),%xmm6 2632 pmuludq (%esp),%xmm5 2633 paddq %xmm7,%xmm0 2634 movdqa %xmm6,%xmm7 2635 pmuludq 64(%esp),%xmm6 2636 paddq %xmm5,%xmm4 2637 movdqa %xmm7,%xmm5 2638 pmuludq 16(%esp),%xmm7 2639 paddq %xmm6,%xmm3 2640 movdqa %xmm5,%xmm6 2641 pmuludq 32(%esp),%xmm5 2642 paddq %xmm7,%xmm0 2643 pmuludq 48(%esp),%xmm6 2644 movdqa 64(%ebx),%xmm7 2645 paddq %xmm5,%xmm1 2646 paddq %xmm6,%xmm2 2647 movdqa %xmm3,%xmm5 2648 pand %xmm7,%xmm3 2649 psrlq $26,%xmm5 2650 paddq %xmm4,%xmm5 2651 movdqa %xmm0,%xmm6 2652 pand %xmm7,%xmm0 2653 psrlq $26,%xmm6 2654 movdqa %xmm5,%xmm4 2655 paddq %xmm1,%xmm6 2656 psrlq $26,%xmm5 2657 pand %xmm7,%xmm4 2658 movdqa %xmm6,%xmm1 2659 psrlq $26,%xmm6 2660 paddd %xmm5,%xmm0 2661 psllq $2,%xmm5 2662 paddq %xmm2,%xmm6 2663 paddq %xmm0,%xmm5 2664 pand %xmm7,%xmm1 2665 movdqa %xmm6,%xmm2 2666 psrlq $26,%xmm6 2667 pand %xmm7,%xmm2 2668 paddd %xmm3,%xmm6 2669 movdqa %xmm5,%xmm0 2670 psrlq $26,%xmm5 2671 movdqa %xmm6,%xmm3 2672 psrlq $26,%xmm6 2673 pand %xmm7,%xmm0 2674 paddd %xmm5,%xmm1 2675 pand %xmm7,%xmm3 2676 paddd %xmm6,%xmm4 2677 subl $16,%ecx 2678 jz .L013done 2679.L012even: 2680 leal 384(%esp),%edx 2681 leal 
-32(%esi),%eax 2682 subl $64,%ecx 2683 movdqu (%edi),%xmm5 2684 pshufd $68,%xmm5,%xmm6 2685 cmovbl %eax,%esi 2686 pshufd $238,%xmm5,%xmm5 2687 movdqa %xmm6,(%edx) 2688 leal 160(%esp),%eax 2689 movdqu 16(%edi),%xmm6 2690 movdqa %xmm5,-144(%edx) 2691 pshufd $68,%xmm6,%xmm5 2692 pshufd $238,%xmm6,%xmm6 2693 movdqa %xmm5,16(%edx) 2694 movdqu 32(%edi),%xmm5 2695 movdqa %xmm6,-128(%edx) 2696 pshufd $68,%xmm5,%xmm6 2697 pshufd $238,%xmm5,%xmm5 2698 movdqa %xmm6,32(%edx) 2699 movdqu 48(%edi),%xmm6 2700 movdqa %xmm5,-112(%edx) 2701 pshufd $68,%xmm6,%xmm5 2702 pshufd $238,%xmm6,%xmm6 2703 movdqa %xmm5,48(%edx) 2704 movdqu 64(%edi),%xmm5 2705 movdqa %xmm6,-96(%edx) 2706 pshufd $68,%xmm5,%xmm6 2707 pshufd $238,%xmm5,%xmm5 2708 movdqa %xmm6,64(%edx) 2709 movdqu 80(%edi),%xmm6 2710 movdqa %xmm5,-80(%edx) 2711 pshufd $68,%xmm6,%xmm5 2712 pshufd $238,%xmm6,%xmm6 2713 movdqa %xmm5,80(%edx) 2714 movdqu 96(%edi),%xmm5 2715 movdqa %xmm6,-64(%edx) 2716 pshufd $68,%xmm5,%xmm6 2717 pshufd $238,%xmm5,%xmm5 2718 movdqa %xmm6,96(%edx) 2719 movdqu 112(%edi),%xmm6 2720 movdqa %xmm5,-48(%edx) 2721 pshufd $68,%xmm6,%xmm5 2722 pshufd $238,%xmm6,%xmm6 2723 movdqa %xmm5,112(%edx) 2724 movdqu 128(%edi),%xmm5 2725 movdqa %xmm6,-32(%edx) 2726 pshufd $68,%xmm5,%xmm6 2727 pshufd $238,%xmm5,%xmm5 2728 movdqa %xmm6,128(%edx) 2729 movdqa %xmm5,-16(%edx) 2730 movdqu 32(%esi),%xmm5 2731 movdqu 48(%esi),%xmm6 2732 leal 32(%esi),%esi 2733 movdqa %xmm2,112(%esp) 2734 movdqa %xmm3,128(%esp) 2735 movdqa %xmm4,144(%esp) 2736 movdqa %xmm5,%xmm2 2737 movdqa %xmm6,%xmm3 2738 psrldq $6,%xmm2 2739 psrldq $6,%xmm3 2740 movdqa %xmm5,%xmm4 2741 punpcklqdq %xmm3,%xmm2 2742 punpckhqdq %xmm6,%xmm4 2743 punpcklqdq %xmm6,%xmm5 2744 movdqa %xmm2,%xmm3 2745 psrlq $4,%xmm2 2746 psrlq $30,%xmm3 2747 movdqa %xmm5,%xmm6 2748 psrlq $40,%xmm4 2749 psrlq $26,%xmm6 2750 pand %xmm7,%xmm5 2751 pand %xmm7,%xmm6 2752 pand %xmm7,%xmm2 2753 pand %xmm7,%xmm3 2754 por (%ebx),%xmm4 2755 movdqa %xmm0,80(%esp) 2756 movdqa %xmm1,96(%esp) 2757 jbe 
.L014skip_loop 2758 jmp .L015loop 2759.align 32 2760.L015loop: 2761 movdqa -144(%edx),%xmm7 2762 movdqa %xmm6,16(%eax) 2763 movdqa %xmm2,32(%eax) 2764 movdqa %xmm3,48(%eax) 2765 movdqa %xmm4,64(%eax) 2766 movdqa %xmm5,%xmm1 2767 pmuludq %xmm7,%xmm5 2768 movdqa %xmm6,%xmm0 2769 pmuludq %xmm7,%xmm6 2770 pmuludq %xmm7,%xmm2 2771 pmuludq %xmm7,%xmm3 2772 pmuludq %xmm7,%xmm4 2773 pmuludq -16(%edx),%xmm0 2774 movdqa %xmm1,%xmm7 2775 pmuludq -128(%edx),%xmm1 2776 paddq %xmm5,%xmm0 2777 movdqa %xmm7,%xmm5 2778 pmuludq -112(%edx),%xmm7 2779 paddq %xmm6,%xmm1 2780 movdqa %xmm5,%xmm6 2781 pmuludq -96(%edx),%xmm5 2782 paddq %xmm7,%xmm2 2783 movdqa 16(%eax),%xmm7 2784 pmuludq -80(%edx),%xmm6 2785 paddq %xmm5,%xmm3 2786 movdqa %xmm7,%xmm5 2787 pmuludq -128(%edx),%xmm7 2788 paddq %xmm6,%xmm4 2789 movdqa %xmm5,%xmm6 2790 pmuludq -112(%edx),%xmm5 2791 paddq %xmm7,%xmm2 2792 movdqa 32(%eax),%xmm7 2793 pmuludq -96(%edx),%xmm6 2794 paddq %xmm5,%xmm3 2795 movdqa %xmm7,%xmm5 2796 pmuludq -32(%edx),%xmm7 2797 paddq %xmm6,%xmm4 2798 movdqa %xmm5,%xmm6 2799 pmuludq -16(%edx),%xmm5 2800 paddq %xmm7,%xmm0 2801 movdqa %xmm6,%xmm7 2802 pmuludq -128(%edx),%xmm6 2803 paddq %xmm5,%xmm1 2804 movdqa 48(%eax),%xmm5 2805 pmuludq -112(%edx),%xmm7 2806 paddq %xmm6,%xmm3 2807 movdqa %xmm5,%xmm6 2808 pmuludq -48(%edx),%xmm5 2809 paddq %xmm7,%xmm4 2810 movdqa %xmm6,%xmm7 2811 pmuludq -32(%edx),%xmm6 2812 paddq %xmm5,%xmm0 2813 movdqa %xmm7,%xmm5 2814 pmuludq -16(%edx),%xmm7 2815 paddq %xmm6,%xmm1 2816 movdqa 64(%eax),%xmm6 2817 pmuludq -128(%edx),%xmm5 2818 paddq %xmm7,%xmm2 2819 movdqa %xmm6,%xmm7 2820 pmuludq -16(%edx),%xmm6 2821 paddq %xmm5,%xmm4 2822 movdqa %xmm7,%xmm5 2823 pmuludq -64(%edx),%xmm7 2824 paddq %xmm6,%xmm3 2825 movdqa %xmm5,%xmm6 2826 pmuludq -48(%edx),%xmm5 2827 paddq %xmm7,%xmm0 2828 movdqa 64(%ebx),%xmm7 2829 pmuludq -32(%edx),%xmm6 2830 paddq %xmm5,%xmm1 2831 paddq %xmm6,%xmm2 2832 movdqu -32(%esi),%xmm5 2833 movdqu -16(%esi),%xmm6 2834 leal 32(%esi),%esi 2835 movdqa %xmm2,32(%esp) 
2836 movdqa %xmm3,48(%esp) 2837 movdqa %xmm4,64(%esp) 2838 movdqa %xmm5,%xmm2 2839 movdqa %xmm6,%xmm3 2840 psrldq $6,%xmm2 2841 psrldq $6,%xmm3 2842 movdqa %xmm5,%xmm4 2843 punpcklqdq %xmm3,%xmm2 2844 punpckhqdq %xmm6,%xmm4 2845 punpcklqdq %xmm6,%xmm5 2846 movdqa %xmm2,%xmm3 2847 psrlq $4,%xmm2 2848 psrlq $30,%xmm3 2849 movdqa %xmm5,%xmm6 2850 psrlq $40,%xmm4 2851 psrlq $26,%xmm6 2852 pand %xmm7,%xmm5 2853 pand %xmm7,%xmm6 2854 pand %xmm7,%xmm2 2855 pand %xmm7,%xmm3 2856 por (%ebx),%xmm4 2857 leal -32(%esi),%eax 2858 subl $64,%ecx 2859 paddd 80(%esp),%xmm5 2860 paddd 96(%esp),%xmm6 2861 paddd 112(%esp),%xmm2 2862 paddd 128(%esp),%xmm3 2863 paddd 144(%esp),%xmm4 2864 cmovbl %eax,%esi 2865 leal 160(%esp),%eax 2866 movdqa (%edx),%xmm7 2867 movdqa %xmm1,16(%esp) 2868 movdqa %xmm6,16(%eax) 2869 movdqa %xmm2,32(%eax) 2870 movdqa %xmm3,48(%eax) 2871 movdqa %xmm4,64(%eax) 2872 movdqa %xmm5,%xmm1 2873 pmuludq %xmm7,%xmm5 2874 paddq %xmm0,%xmm5 2875 movdqa %xmm6,%xmm0 2876 pmuludq %xmm7,%xmm6 2877 pmuludq %xmm7,%xmm2 2878 pmuludq %xmm7,%xmm3 2879 pmuludq %xmm7,%xmm4 2880 paddq 16(%esp),%xmm6 2881 paddq 32(%esp),%xmm2 2882 paddq 48(%esp),%xmm3 2883 paddq 64(%esp),%xmm4 2884 pmuludq 128(%edx),%xmm0 2885 movdqa %xmm1,%xmm7 2886 pmuludq 16(%edx),%xmm1 2887 paddq %xmm5,%xmm0 2888 movdqa %xmm7,%xmm5 2889 pmuludq 32(%edx),%xmm7 2890 paddq %xmm6,%xmm1 2891 movdqa %xmm5,%xmm6 2892 pmuludq 48(%edx),%xmm5 2893 paddq %xmm7,%xmm2 2894 movdqa 16(%eax),%xmm7 2895 pmuludq 64(%edx),%xmm6 2896 paddq %xmm5,%xmm3 2897 movdqa %xmm7,%xmm5 2898 pmuludq 16(%edx),%xmm7 2899 paddq %xmm6,%xmm4 2900 movdqa %xmm5,%xmm6 2901 pmuludq 32(%edx),%xmm5 2902 paddq %xmm7,%xmm2 2903 movdqa 32(%eax),%xmm7 2904 pmuludq 48(%edx),%xmm6 2905 paddq %xmm5,%xmm3 2906 movdqa %xmm7,%xmm5 2907 pmuludq 112(%edx),%xmm7 2908 paddq %xmm6,%xmm4 2909 movdqa %xmm5,%xmm6 2910 pmuludq 128(%edx),%xmm5 2911 paddq %xmm7,%xmm0 2912 movdqa %xmm6,%xmm7 2913 pmuludq 16(%edx),%xmm6 2914 paddq %xmm5,%xmm1 2915 movdqa 48(%eax),%xmm5 2916 
pmuludq 32(%edx),%xmm7 2917 paddq %xmm6,%xmm3 2918 movdqa %xmm5,%xmm6 2919 pmuludq 96(%edx),%xmm5 2920 paddq %xmm7,%xmm4 2921 movdqa %xmm6,%xmm7 2922 pmuludq 112(%edx),%xmm6 2923 paddq %xmm5,%xmm0 2924 movdqa %xmm7,%xmm5 2925 pmuludq 128(%edx),%xmm7 2926 paddq %xmm6,%xmm1 2927 movdqa 64(%eax),%xmm6 2928 pmuludq 16(%edx),%xmm5 2929 paddq %xmm7,%xmm2 2930 movdqa %xmm6,%xmm7 2931 pmuludq 128(%edx),%xmm6 2932 paddq %xmm5,%xmm4 2933 movdqa %xmm7,%xmm5 2934 pmuludq 80(%edx),%xmm7 2935 paddq %xmm6,%xmm3 2936 movdqa %xmm5,%xmm6 2937 pmuludq 96(%edx),%xmm5 2938 paddq %xmm7,%xmm0 2939 movdqa 64(%ebx),%xmm7 2940 pmuludq 112(%edx),%xmm6 2941 paddq %xmm5,%xmm1 2942 paddq %xmm6,%xmm2 2943 movdqa %xmm3,%xmm5 2944 pand %xmm7,%xmm3 2945 psrlq $26,%xmm5 2946 paddq %xmm4,%xmm5 2947 movdqa %xmm0,%xmm6 2948 pand %xmm7,%xmm0 2949 psrlq $26,%xmm6 2950 movdqa %xmm5,%xmm4 2951 paddq %xmm1,%xmm6 2952 psrlq $26,%xmm5 2953 pand %xmm7,%xmm4 2954 movdqa %xmm6,%xmm1 2955 psrlq $26,%xmm6 2956 paddd %xmm5,%xmm0 2957 psllq $2,%xmm5 2958 paddq %xmm2,%xmm6 2959 paddq %xmm0,%xmm5 2960 pand %xmm7,%xmm1 2961 movdqa %xmm6,%xmm2 2962 psrlq $26,%xmm6 2963 pand %xmm7,%xmm2 2964 paddd %xmm3,%xmm6 2965 movdqa %xmm5,%xmm0 2966 psrlq $26,%xmm5 2967 movdqa %xmm6,%xmm3 2968 psrlq $26,%xmm6 2969 pand %xmm7,%xmm0 2970 paddd %xmm5,%xmm1 2971 pand %xmm7,%xmm3 2972 paddd %xmm6,%xmm4 2973 movdqu 32(%esi),%xmm5 2974 movdqu 48(%esi),%xmm6 2975 leal 32(%esi),%esi 2976 movdqa %xmm2,112(%esp) 2977 movdqa %xmm3,128(%esp) 2978 movdqa %xmm4,144(%esp) 2979 movdqa %xmm5,%xmm2 2980 movdqa %xmm6,%xmm3 2981 psrldq $6,%xmm2 2982 psrldq $6,%xmm3 2983 movdqa %xmm5,%xmm4 2984 punpcklqdq %xmm3,%xmm2 2985 punpckhqdq %xmm6,%xmm4 2986 punpcklqdq %xmm6,%xmm5 2987 movdqa %xmm2,%xmm3 2988 psrlq $4,%xmm2 2989 psrlq $30,%xmm3 2990 movdqa %xmm5,%xmm6 2991 psrlq $40,%xmm4 2992 psrlq $26,%xmm6 2993 pand %xmm7,%xmm5 2994 pand %xmm7,%xmm6 2995 pand %xmm7,%xmm2 2996 pand %xmm7,%xmm3 2997 por (%ebx),%xmm4 2998 movdqa %xmm0,80(%esp) 2999 movdqa 
%xmm1,96(%esp) 3000 ja .L015loop 3001.L014skip_loop: 3002 pshufd $16,-144(%edx),%xmm7 3003 addl $32,%ecx 3004 jnz .L016long_tail 3005 paddd %xmm0,%xmm5 3006 paddd %xmm1,%xmm6 3007 paddd 112(%esp),%xmm2 3008 paddd 128(%esp),%xmm3 3009 paddd 144(%esp),%xmm4 3010.L016long_tail: 3011 movdqa %xmm5,(%eax) 3012 movdqa %xmm6,16(%eax) 3013 movdqa %xmm2,32(%eax) 3014 movdqa %xmm3,48(%eax) 3015 movdqa %xmm4,64(%eax) 3016 pmuludq %xmm7,%xmm5 3017 pmuludq %xmm7,%xmm6 3018 pmuludq %xmm7,%xmm2 3019 movdqa %xmm5,%xmm0 3020 pshufd $16,-128(%edx),%xmm5 3021 pmuludq %xmm7,%xmm3 3022 movdqa %xmm6,%xmm1 3023 pmuludq %xmm7,%xmm4 3024 movdqa %xmm5,%xmm6 3025 pmuludq 48(%eax),%xmm5 3026 movdqa %xmm6,%xmm7 3027 pmuludq 32(%eax),%xmm6 3028 paddq %xmm5,%xmm4 3029 movdqa %xmm7,%xmm5 3030 pmuludq 16(%eax),%xmm7 3031 paddq %xmm6,%xmm3 3032 pshufd $16,-64(%edx),%xmm6 3033 pmuludq (%eax),%xmm5 3034 paddq %xmm7,%xmm2 3035 pmuludq 64(%eax),%xmm6 3036 pshufd $16,-112(%edx),%xmm7 3037 paddq %xmm5,%xmm1 3038 movdqa %xmm7,%xmm5 3039 pmuludq 32(%eax),%xmm7 3040 paddq %xmm6,%xmm0 3041 movdqa %xmm5,%xmm6 3042 pmuludq 16(%eax),%xmm5 3043 paddq %xmm7,%xmm4 3044 pshufd $16,-48(%edx),%xmm7 3045 pmuludq (%eax),%xmm6 3046 paddq %xmm5,%xmm3 3047 movdqa %xmm7,%xmm5 3048 pmuludq 64(%eax),%xmm7 3049 paddq %xmm6,%xmm2 3050 pmuludq 48(%eax),%xmm5 3051 pshufd $16,-96(%edx),%xmm6 3052 paddq %xmm7,%xmm1 3053 movdqa %xmm6,%xmm7 3054 pmuludq 16(%eax),%xmm6 3055 paddq %xmm5,%xmm0 3056 pshufd $16,-32(%edx),%xmm5 3057 pmuludq (%eax),%xmm7 3058 paddq %xmm6,%xmm4 3059 movdqa %xmm5,%xmm6 3060 pmuludq 64(%eax),%xmm5 3061 paddq %xmm7,%xmm3 3062 movdqa %xmm6,%xmm7 3063 pmuludq 48(%eax),%xmm6 3064 paddq %xmm5,%xmm2 3065 pmuludq 32(%eax),%xmm7 3066 pshufd $16,-80(%edx),%xmm5 3067 paddq %xmm6,%xmm1 3068 pshufd $16,-16(%edx),%xmm6 3069 pmuludq (%eax),%xmm5 3070 paddq %xmm7,%xmm0 3071 movdqa %xmm6,%xmm7 3072 pmuludq 64(%eax),%xmm6 3073 paddq %xmm5,%xmm4 3074 movdqa %xmm7,%xmm5 3075 pmuludq 16(%eax),%xmm7 3076 paddq %xmm6,%xmm3 3077 
movdqa %xmm5,%xmm6 3078 pmuludq 32(%eax),%xmm5 3079 paddq %xmm7,%xmm0 3080 pmuludq 48(%eax),%xmm6 3081 movdqa 64(%ebx),%xmm7 3082 paddq %xmm5,%xmm1 3083 paddq %xmm6,%xmm2 3084 jz .L017short_tail 3085 movdqu -32(%esi),%xmm5 3086 movdqu -16(%esi),%xmm6 3087 leal 32(%esi),%esi 3088 movdqa %xmm2,32(%esp) 3089 movdqa %xmm3,48(%esp) 3090 movdqa %xmm4,64(%esp) 3091 movdqa %xmm5,%xmm2 3092 movdqa %xmm6,%xmm3 3093 psrldq $6,%xmm2 3094 psrldq $6,%xmm3 3095 movdqa %xmm5,%xmm4 3096 punpcklqdq %xmm3,%xmm2 3097 punpckhqdq %xmm6,%xmm4 3098 punpcklqdq %xmm6,%xmm5 3099 movdqa %xmm2,%xmm3 3100 psrlq $4,%xmm2 3101 psrlq $30,%xmm3 3102 movdqa %xmm5,%xmm6 3103 psrlq $40,%xmm4 3104 psrlq $26,%xmm6 3105 pand %xmm7,%xmm5 3106 pand %xmm7,%xmm6 3107 pand %xmm7,%xmm2 3108 pand %xmm7,%xmm3 3109 por (%ebx),%xmm4 3110 pshufd $16,(%edx),%xmm7 3111 paddd 80(%esp),%xmm5 3112 paddd 96(%esp),%xmm6 3113 paddd 112(%esp),%xmm2 3114 paddd 128(%esp),%xmm3 3115 paddd 144(%esp),%xmm4 3116 movdqa %xmm5,(%esp) 3117 pmuludq %xmm7,%xmm5 3118 movdqa %xmm6,16(%esp) 3119 pmuludq %xmm7,%xmm6 3120 paddq %xmm5,%xmm0 3121 movdqa %xmm2,%xmm5 3122 pmuludq %xmm7,%xmm2 3123 paddq %xmm6,%xmm1 3124 movdqa %xmm3,%xmm6 3125 pmuludq %xmm7,%xmm3 3126 paddq 32(%esp),%xmm2 3127 movdqa %xmm5,32(%esp) 3128 pshufd $16,16(%edx),%xmm5 3129 paddq 48(%esp),%xmm3 3130 movdqa %xmm6,48(%esp) 3131 movdqa %xmm4,%xmm6 3132 pmuludq %xmm7,%xmm4 3133 paddq 64(%esp),%xmm4 3134 movdqa %xmm6,64(%esp) 3135 movdqa %xmm5,%xmm6 3136 pmuludq 48(%esp),%xmm5 3137 movdqa %xmm6,%xmm7 3138 pmuludq 32(%esp),%xmm6 3139 paddq %xmm5,%xmm4 3140 movdqa %xmm7,%xmm5 3141 pmuludq 16(%esp),%xmm7 3142 paddq %xmm6,%xmm3 3143 pshufd $16,80(%edx),%xmm6 3144 pmuludq (%esp),%xmm5 3145 paddq %xmm7,%xmm2 3146 pmuludq 64(%esp),%xmm6 3147 pshufd $16,32(%edx),%xmm7 3148 paddq %xmm5,%xmm1 3149 movdqa %xmm7,%xmm5 3150 pmuludq 32(%esp),%xmm7 3151 paddq %xmm6,%xmm0 3152 movdqa %xmm5,%xmm6 3153 pmuludq 16(%esp),%xmm5 3154 paddq %xmm7,%xmm4 3155 pshufd $16,96(%edx),%xmm7 3156 pmuludq 
(%esp),%xmm6 3157 paddq %xmm5,%xmm3 3158 movdqa %xmm7,%xmm5 3159 pmuludq 64(%esp),%xmm7 3160 paddq %xmm6,%xmm2 3161 pmuludq 48(%esp),%xmm5 3162 pshufd $16,48(%edx),%xmm6 3163 paddq %xmm7,%xmm1 3164 movdqa %xmm6,%xmm7 3165 pmuludq 16(%esp),%xmm6 3166 paddq %xmm5,%xmm0 3167 pshufd $16,112(%edx),%xmm5 3168 pmuludq (%esp),%xmm7 3169 paddq %xmm6,%xmm4 3170 movdqa %xmm5,%xmm6 3171 pmuludq 64(%esp),%xmm5 3172 paddq %xmm7,%xmm3 3173 movdqa %xmm6,%xmm7 3174 pmuludq 48(%esp),%xmm6 3175 paddq %xmm5,%xmm2 3176 pmuludq 32(%esp),%xmm7 3177 pshufd $16,64(%edx),%xmm5 3178 paddq %xmm6,%xmm1 3179 pshufd $16,128(%edx),%xmm6 3180 pmuludq (%esp),%xmm5 3181 paddq %xmm7,%xmm0 3182 movdqa %xmm6,%xmm7 3183 pmuludq 64(%esp),%xmm6 3184 paddq %xmm5,%xmm4 3185 movdqa %xmm7,%xmm5 3186 pmuludq 16(%esp),%xmm7 3187 paddq %xmm6,%xmm3 3188 movdqa %xmm5,%xmm6 3189 pmuludq 32(%esp),%xmm5 3190 paddq %xmm7,%xmm0 3191 pmuludq 48(%esp),%xmm6 3192 movdqa 64(%ebx),%xmm7 3193 paddq %xmm5,%xmm1 3194 paddq %xmm6,%xmm2 3195.L017short_tail: 3196 pshufd $78,%xmm4,%xmm6 3197 pshufd $78,%xmm3,%xmm5 3198 paddq %xmm6,%xmm4 3199 paddq %xmm5,%xmm3 3200 pshufd $78,%xmm0,%xmm6 3201 pshufd $78,%xmm1,%xmm5 3202 paddq %xmm6,%xmm0 3203 paddq %xmm5,%xmm1 3204 pshufd $78,%xmm2,%xmm6 3205 movdqa %xmm3,%xmm5 3206 pand %xmm7,%xmm3 3207 psrlq $26,%xmm5 3208 paddq %xmm6,%xmm2 3209 paddq %xmm4,%xmm5 3210 movdqa %xmm0,%xmm6 3211 pand %xmm7,%xmm0 3212 psrlq $26,%xmm6 3213 movdqa %xmm5,%xmm4 3214 paddq %xmm1,%xmm6 3215 psrlq $26,%xmm5 3216 pand %xmm7,%xmm4 3217 movdqa %xmm6,%xmm1 3218 psrlq $26,%xmm6 3219 paddd %xmm5,%xmm0 3220 psllq $2,%xmm5 3221 paddq %xmm2,%xmm6 3222 paddq %xmm0,%xmm5 3223 pand %xmm7,%xmm1 3224 movdqa %xmm6,%xmm2 3225 psrlq $26,%xmm6 3226 pand %xmm7,%xmm2 3227 paddd %xmm3,%xmm6 3228 movdqa %xmm5,%xmm0 3229 psrlq $26,%xmm5 3230 movdqa %xmm6,%xmm3 3231 psrlq $26,%xmm6 3232 pand %xmm7,%xmm0 3233 paddd %xmm5,%xmm1 3234 pand %xmm7,%xmm3 3235 paddd %xmm6,%xmm4 3236.L013done: 3237 movd %xmm0,-48(%edi) 3238 movd 
%xmm1,-44(%edi)		/* operands of the movd mnemonic that ends the preceding line */
/*
 * Closing statements of _poly1305_blocks_sse2 (its head precedes this
 * span): the low dwords of xmm2..xmm4 are stored at -40..-32(%edi), %esp
 * is restored from %ebp and the four saved registers are popped.
 */
	movd	%xmm2,-40(%edi)
	movd	%xmm3,-36(%edi)
	movd	%xmm4,-32(%edi)
	movl	%ebp,%esp
.L007nodata:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	_poly1305_blocks_sse2,.-_poly1305_blocks_sse2

/*
 * _poly1305_emit_sse2
 *
 * Stack-argument routine.  After its four pushes it reads:
 *   20(%esp)  state pointer.  Dwords at +0..+16 are read as five
 *             accumulator limbs; the dword at +20 is a flag.
 *   24(%esp)  pointer to the 16-byte output buffer.
 *   28(%esp)  pointer to four dwords that are added to the result.
 *
 * Flag == 0: control transfers to .Lenter_emit (defined elsewhere in this
 * file) with the frame built here.
 * Flag != 0: the limbs are treated as base 2^26 (shift pairs 26/6, 20/12,
 * 14/18, 8/24), recombined into four 32-bit words plus overflow bits,
 * reduced, and written out after adding the four dwords of the third
 * argument with carry propagation.
 *
 * Preserves ebp, ebx, esi, edi.  Clobbers eax, ecx, edx, xmm0-xmm3, flags.
 * The output buffer doubles as scratch before the final stores.
 */
.align	32
.type	_poly1305_emit_sse2,@function
.align	16
_poly1305_emit_sse2:
#ifdef __CET__

.byte	243,15,30,251		/* f3 0f 1e fb = endbr32 */
#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%ebp
	cmpl	$0,20(%ebp)
	je	.Lenter_emit
	/* eax, edi, ecx, edx, esi = limbs 0..4 */
	movl	(%ebp),%eax
	movl	4(%ebp),%edi
	movl	8(%ebp),%ecx
	movl	12(%ebp),%edx
	movl	16(%ebp),%esi
	/* 5 x 26-bit limbs -> eax:ebx:ecx:edx (low 128 bits), esi = bits >= 128 */
	movl	%edi,%ebx
	shll	$26,%edi
	shrl	$6,%ebx
	addl	%edi,%eax
	movl	%ecx,%edi
	adcl	$0,%ebx
	shll	$20,%edi
	shrl	$12,%ecx
	addl	%edi,%ebx
	movl	%edx,%edi
	adcl	$0,%ecx
	shll	$14,%edi
	shrl	$18,%edx
	addl	%edi,%ecx
	movl	%esi,%edi
	adcl	$0,%edx
	shll	$8,%edi
	shrl	$24,%esi
	addl	%edi,%edx
	adcl	$0,%esi
	/* fold everything above bit 130 back in, multiplied by 5 */
	movl	%esi,%edi
	andl	$3,%esi
	shrl	$2,%edi
	leal	(%edi,%edi,4),%ebp
	movl	24(%esp),%edi		/* edi = output pointer */
	addl	%ebp,%eax
	movl	28(%esp),%ebp		/* ebp = pointer to the addend dwords */
	adcl	$0,%ebx
	adcl	$0,%ecx
	adcl	$0,%edx
	adcl	$0,%esi
	/* keep h in xmm0..xmm3 while eax..edx:esi become h + 5 */
	movd	%eax,%xmm0
	addl	$5,%eax
	movd	%ebx,%xmm1
	adcl	$0,%ebx
	movd	%ecx,%xmm2
	adcl	$0,%ecx
	movd	%edx,%xmm3
	adcl	$0,%edx
	adcl	$0,%esi
	/* esi = all-ones if h + 5 reached bit 130, else zero (branch-free select) */
	shrl	$2,%esi
	negl	%esi
	andl	%esi,%eax
	andl	%esi,%ebx
	andl	%esi,%ecx
	andl	%esi,%edx
	movl	%eax,(%edi)		/* masked h + 5 parked in the output buffer */
	movd	%xmm0,%eax
	movl	%ebx,4(%edi)
	movd	%xmm1,%ebx
	movl	%ecx,8(%edi)
	movd	%xmm2,%ecx
	movl	%edx,12(%edi)
	movd	%xmm3,%edx
	notl	%esi
	andl	%esi,%eax
	andl	%esi,%ebx
	orl	(%edi),%eax
	andl	%esi,%ecx
	orl	4(%edi),%ebx
	andl	%esi,%edx
	orl	8(%edi),%ecx
	orl	12(%edi),%edx
	/* add the four dwords at (%ebp); the carry out of the top word is dropped */
	addl	(%ebp),%eax
	adcl	4(%ebp),%ebx
	movl	%eax,(%edi)
	adcl	8(%ebp),%ecx
	movl	%ebx,4(%edi)
	adcl	12(%ebp),%edx
	movl	%ecx,8(%edi)
	movl	%edx,12(%edi)
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	_poly1305_emit_sse2,.-_poly1305_emit_sse2

/*
 * _poly1305_init_avx2
 *
 * File-local helper with a register interface; it reads no stack
 * arguments.
 *   In:   %edi = state pointer (16 bytes are read at +24)
 *         %ebx = constant table pointer (the 26-bit mask is read at +64;
 *                the caller in this file passes .Lconst_sse2)
 *   Out:  144 bytes written at state+48 .. state+191:
 *         five 16-byte rows of limbs followed by four rows holding
 *         5 * limb for limbs 1..4.  %edi is restored to its entry value.
 *   Clobbers: ecx, edx, ebp, xmm0-xmm7, flags.
 *   Stack: 224 bytes of scratch below a 16-byte aligned %esp; the entry
 *         %esp is kept in %ebp and restored before ret.
 *
 * The multiply/reduce body runs twice.  By the lane shuffles below, the
 * stored dwords of each row appear to be the limbs of the 2nd..4th powers
 * of the 128-bit input together with the input itself, highest power
 * first -- NOTE(review): confirm the ordering against poly1305-x86.pl.
 */
.align	32
.type	_poly1305_init_avx2,@function
.align	16
_poly1305_init_avx2:
#ifdef __CET__

.byte	243,15,30,251		/* f3 0f 1e fb = endbr32 */
#endif

	vmovdqu	24(%edi),%xmm4
	leal	48(%edi),%edi
	movl	%esp,%ebp
	subl	$224,%esp
	andl	$-16,%esp
	/* split the 128-bit value into 26-bit limbs at bit 0/26/52/78/104 */
	vmovdqa	64(%ebx),%xmm7
	vpand	%xmm7,%xmm4,%xmm0
	vpsrlq	$26,%xmm4,%xmm1
	vpsrldq	$6,%xmm4,%xmm3
	vpand	%xmm7,%xmm1,%xmm1
	vpsrlq	$4,%xmm3,%xmm2
	vpsrlq	$30,%xmm3,%xmm3
	vpand	%xmm7,%xmm2,%xmm2
	vpand	%xmm7,%xmm3,%xmm3
	vpsrldq	$13,%xmm4,%xmm4
	leal	144(%esp),%edx
	movl	$2,%ecx			/* two passes through .L018square */
.L018square:
	/* 0..64(%esp): current limbs; 80..128(%esp): 5 * limbs 1..4 */
	vmovdqa	%xmm0,(%esp)
	vmovdqa	%xmm1,16(%esp)
	vmovdqa	%xmm2,32(%esp)
	vmovdqa	%xmm3,48(%esp)
	vmovdqa	%xmm4,64(%esp)
	vpslld	$2,%xmm1,%xmm6
	vpslld	$2,%xmm2,%xmm5
	vpaddd	%xmm1,%xmm6,%xmm6
	vpaddd	%xmm2,%xmm5,%xmm5
	vmovdqa	%xmm6,80(%esp)
	vmovdqa	%xmm5,96(%esp)
	vpslld	$2,%xmm3,%xmm6
	vpslld	$2,%xmm4,%xmm5
	vpaddd	%xmm3,%xmm6,%xmm6
	vpaddd	%xmm4,%xmm5,%xmm5
	vmovdqa	%xmm6,112(%esp)
	vmovdqa	%xmm5,128(%esp)
	/* 0..64(%edx): low qword of every limb broadcast to both qwords */
	vpshufd	$68,%xmm0,%xmm5
	vmovdqa	%xmm1,%xmm6
	vpshufd	$68,%xmm1,%xmm1
	vpshufd	$68,%xmm2,%xmm2
	vpshufd	$68,%xmm3,%xmm3
	vpshufd	$68,%xmm4,%xmm4
	vmovdqa	%xmm5,(%edx)
	vmovdqa	%xmm1,16(%edx)
	vmovdqa	%xmm2,32(%edx)
	vmovdqa	%xmm3,48(%edx)
	vmovdqa	%xmm4,64(%edx)
	/* 5x5 limb product, accumulated into xmm0..xmm4 */
	vpmuludq	%xmm0,%xmm4,%xmm4
	vpmuludq	%xmm0,%xmm3,%xmm3
	vpmuludq	%xmm0,%xmm2,%xmm2
	vpmuludq	%xmm0,%xmm1,%xmm1
	vpmuludq	%xmm0,%xmm5,%xmm0
	vpmuludq	48(%edx),%xmm6,%xmm5
	vpaddq	%xmm5,%xmm4,%xmm4
	vpmuludq	32(%edx),%xmm6,%xmm7
	vpaddq	%xmm7,%xmm3,%xmm3
	vpmuludq	16(%edx),%xmm6,%xmm5
	vpaddq	%xmm5,%xmm2,%xmm2
	vmovdqa	80(%esp),%xmm7
	vpmuludq	(%edx),%xmm6,%xmm6
	vpaddq	%xmm6,%xmm1,%xmm1
	vmovdqa	32(%esp),%xmm5
	vpmuludq	64(%edx),%xmm7,%xmm7
	vpaddq	%xmm7,%xmm0,%xmm0
	vpmuludq	32(%edx),%xmm5,%xmm6
	vpaddq	%xmm6,%xmm4,%xmm4
	vpmuludq	16(%edx),%xmm5,%xmm7
	vpaddq	%xmm7,%xmm3,%xmm3
	vmovdqa	96(%esp),%xmm6
	vpmuludq	(%edx),%xmm5,%xmm5
	vpaddq	%xmm5,%xmm2,%xmm2
	vpmuludq	64(%edx),%xmm6,%xmm7
	vpaddq	%xmm7,%xmm1,%xmm1
	vmovdqa	48(%esp),%xmm5
	vpmuludq	48(%edx),%xmm6,%xmm6
	vpaddq	%xmm6,%xmm0,%xmm0
	vpmuludq	16(%edx),%xmm5,%xmm7
	vpaddq	%xmm7,%xmm4,%xmm4
	vmovdqa	112(%esp),%xmm6
	vpmuludq	(%edx),%xmm5,%xmm5
	vpaddq	%xmm5,%xmm3,%xmm3
	vpmuludq	64(%edx),%xmm6,%xmm7
	vpaddq	%xmm7,%xmm2,%xmm2
	vpmuludq	48(%edx),%xmm6,%xmm5
	vpaddq	%xmm5,%xmm1,%xmm1
	vmovdqa	64(%esp),%xmm7
	vpmuludq	32(%edx),%xmm6,%xmm6
	vpaddq	%xmm6,%xmm0,%xmm0
	vmovdqa	128(%esp),%xmm5
	vpmuludq	(%edx),%xmm7,%xmm7
	vpaddq	%xmm7,%xmm4,%xmm4
	vpmuludq	64(%edx),%xmm5,%xmm6
	vpaddq	%xmm6,%xmm3,%xmm3
	vpmuludq	16(%edx),%xmm5,%xmm7
	vpaddq	%xmm7,%xmm0,%xmm0
	vpmuludq	32(%edx),%xmm5,%xmm6
	vpaddq	%xmm6,%xmm1,%xmm1
	vmovdqa	64(%ebx),%xmm7		/* reload the 26-bit mask */
	vpmuludq	48(%edx),%xmm5,%xmm5
	vpaddq	%xmm5,%xmm2,%xmm2
	/* carry propagation; the carry out of limb 4 re-enters limb 0 times 5 (c + 4c) */
	vpsrlq	$26,%xmm3,%xmm5
	vpand	%xmm7,%xmm3,%xmm3
	vpsrlq	$26,%xmm0,%xmm6
	vpand	%xmm7,%xmm0,%xmm0
	vpaddq	%xmm5,%xmm4,%xmm4
	vpaddq	%xmm6,%xmm1,%xmm1
	vpsrlq	$26,%xmm4,%xmm5
	vpand	%xmm7,%xmm4,%xmm4
	vpsrlq	$26,%xmm1,%xmm6
	vpand	%xmm7,%xmm1,%xmm1
	vpaddq	%xmm6,%xmm2,%xmm2
	vpaddd	%xmm5,%xmm0,%xmm0
	vpsllq	$2,%xmm5,%xmm5
	vpsrlq	$26,%xmm2,%xmm6
	vpand	%xmm7,%xmm2,%xmm2
	vpaddd	%xmm5,%xmm0,%xmm0
	vpaddd	%xmm6,%xmm3,%xmm3
	vpsrlq	$26,%xmm3,%xmm6
	vpsrlq	$26,%xmm0,%xmm5
	vpand	%xmm7,%xmm0,%xmm0
	vpand	%xmm7,%xmm3,%xmm3
	vpaddd	%xmm5,%xmm1,%xmm1
	vpaddd	%xmm6,%xmm4,%xmm4
	decl	%ecx
	jz	.L019square_break
	/* low qword = fresh product, high qword = limbs saved at pass entry */
	vpunpcklqdq	(%esp),%xmm0,%xmm0
	vpunpcklqdq	16(%esp),%xmm1,%xmm1
	vpunpcklqdq	32(%esp),%xmm2,%xmm2
	vpunpcklqdq	48(%esp),%xmm3,%xmm3
	vpunpcklqdq	64(%esp),%xmm4,%xmm4
	jmp	.L018square
.L019square_break:
	/* pack second-pass results (high dwords) with saved limbs (low dwords) */
	vpsllq	$32,%xmm0,%xmm0
	vpsllq	$32,%xmm1,%xmm1
	vpsllq	$32,%xmm2,%xmm2
	vpsllq	$32,%xmm3,%xmm3
	vpsllq	$32,%xmm4,%xmm4
	vpor	(%esp),%xmm0,%xmm0
	vpor	16(%esp),%xmm1,%xmm1
	vpor	32(%esp),%xmm2,%xmm2
	vpor	48(%esp),%xmm3,%xmm3
	vpor	64(%esp),%xmm4,%xmm4
	vpshufd	$141,%xmm0,%xmm0	/* dword order 1,3,0,2 */
	vpshufd	$141,%xmm1,%xmm1
	vpshufd	$141,%xmm2,%xmm2
	vpshufd	$141,%xmm3,%xmm3
	vpshufd	$141,%xmm4,%xmm4
	/* %edi was advanced by 48 on entry, so these land at state+48.. */
	vmovdqu	%xmm0,(%edi)
	vmovdqu	%xmm1,16(%edi)
	vmovdqu	%xmm2,32(%edi)
	vmovdqu	%xmm3,48(%edi)
	vmovdqu	%xmm4,64(%edi)
	/* rows of 5 * limb for limbs 1..4 */
	vpslld	$2,%xmm1,%xmm6
	vpslld	$2,%xmm2,%xmm5
	vpaddd	%xmm1,%xmm6,%xmm6
	vpaddd	%xmm2,%xmm5,%xmm5
	vmovdqu	%xmm6,80(%edi)
	vmovdqu	%xmm5,96(%edi)
	vpslld	$2,%xmm3,%xmm6
	vpslld	$2,%xmm4,%xmm5
	vpaddd	%xmm3,%xmm6,%xmm6
	vpaddd	%xmm4,%xmm5,%xmm5
	vmovdqu	%xmm6,112(%edi)
	vmovdqu	%xmm5,128(%edi)
	movl	%ebp,%esp
	leal	-48(%edi),%edi
	ret
.size	_poly1305_init_avx2,.-_poly1305_init_avx2

/*
 * _poly1305_blocks_avx2
 *
 * Stack-argument routine.  After its four pushes it reads:
 *   20(%esp)  state pointer  (%edi)
 *   24(%esp)  input pointer  (%esi)
 *   28(%esp)  byte length    (%ecx); rounded down to a multiple of 16
 *   32(%esp)  fourth argument; negated into %eax and used only to index
 *             the constant table when exactly one leftover block is
 *             handled -- presumably the Poly1305 pad bit, TODO confirm.
 *
 * Dispatch:
 *   - rounded length == 0: return without touching the state.
 *   - rounded length < 64 and state flag at +20 == 0: jump to
 *     .Lenter_blocks inside poly1305_blocks (the frame layout matches).
 *   - otherwise run the AVX2 path below.
 *
 * AVX2 path: if the flag at +20 is zero, _poly1305_init_avx2 fills the
 * table at state+48, the five dwords at state+0..+16 are rewritten as
 * 26-bit limbs and the flag is set to 1.  The table is then expanded into
 * nine 32-byte rows on a 512-byte aligned stack frame, a leading group of
 * 1..3 blocks is consumed when the length is not a multiple of 64, and
 * the remainder is processed 64 bytes per iteration.  The five limbs are
 * written back to state+0..+16.
 *
 * Preserves ebp, ebx, esi, edi.  Clobbers eax, ecx, edx, ymm0-ymm7, flags.
 * vzeroupper is issued on entry to and exit from the AVX2 path.
 */
.align	32
.type	_poly1305_blocks_avx2,@function
.align	16
_poly1305_blocks_avx2:
#ifdef __CET__

.byte	243,15,30,251		/* f3 0f 1e fb = endbr32 */
#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
	movl	20(%edi),%eax		/* eax = base 2^26 flag */
	andl	$-16,%ecx
	jz	.L020nodata
	cmpl	$64,%ecx
	jae	.L021enter_avx2
	testl	%eax,%eax
	jz	.Lenter_blocks
.L021enter_avx2:
	vzeroupper
	/* position-independent address of the constant table */
	call	.L022pic_point
.L022pic_point:
	popl	%ebx
	leal	.Lconst_sse2-.L022pic_point(%ebx),%ebx
	testl	%eax,%eax
	jnz	.L023base2_26
	call	_poly1305_init_avx2
	/* overlapping loads at byte 0/3/6/9/13 + shifts 0/2/4/6/0 = bit 0/26/52/78/104 */
	movl	(%edi),%eax
	movl	3(%edi),%ecx
	movl	6(%edi),%edx
	movl	9(%edi),%esi
	movl	13(%edi),%ebp
	shrl	$2,%ecx
	andl	$67108863,%eax
	shrl	$4,%edx
	andl	$67108863,%ecx
	shrl	$6,%esi
	andl	$67108863,%edx
	movl	%eax,(%edi)
	movl	%ecx,4(%edi)
	movl	%edx,8(%edi)
	movl	%esi,12(%edi)
	movl	%ebp,16(%edi)
	movl	$1,20(%edi)		/* state is now base 2^26 */
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
	/*
	 * The reload above fetches the caller's raw length; re-apply the
	 * entry mask so this path sees the same multiple-of-16 length as
	 * the path that skips the conversion.
	 */
	andl	$-16,%ecx
.L023base2_26:
	movl	32(%esp),%eax
	movl	%esp,%ebp
	subl	$448,%esp
	andl	$-512,%esp
	/*
	 * Frame: 0..159(%esp) scratch, table rows at -128..128(%edx) with
	 * %edx = %esp + 288.  Each 16-byte table row from state+48 is
	 * widened to 32 bytes with vpermq $64 / vpshufd $200.
	 */
	vmovdqu	48(%edi),%xmm0
	leal	288(%esp),%edx
	vmovdqu	64(%edi),%xmm1
	vmovdqu	80(%edi),%xmm2
	vmovdqu	96(%edi),%xmm3
	vmovdqu	112(%edi),%xmm4
	leal	48(%edi),%edi		/* edi = state + 48 from here on */
	vpermq	$64,%ymm0,%ymm0
	vpermq	$64,%ymm1,%ymm1
	vpermq	$64,%ymm2,%ymm2
	vpermq	$64,%ymm3,%ymm3
	vpermq	$64,%ymm4,%ymm4
	vpshufd	$200,%ymm0,%ymm0
	vpshufd	$200,%ymm1,%ymm1
	vpshufd	$200,%ymm2,%ymm2
	vpshufd	$200,%ymm3,%ymm3
	vpshufd	$200,%ymm4,%ymm4
	vmovdqa	%ymm0,-128(%edx)
	vmovdqu	80(%edi),%xmm0
	vmovdqa	%ymm1,-96(%edx)
	vmovdqu	96(%edi),%xmm1
	vmovdqa	%ymm2,-64(%edx)
	vmovdqu	112(%edi),%xmm2
	vmovdqa	%ymm3,-32(%edx)
	vmovdqu	128(%edi),%xmm3
	vmovdqa	%ymm4,(%edx)
	vpermq	$64,%ymm0,%ymm0
	vpermq	$64,%ymm1,%ymm1
	vpermq	$64,%ymm2,%ymm2
	vpermq	$64,%ymm3,%ymm3
	vpshufd	$200,%ymm0,%ymm0
	vpshufd	$200,%ymm1,%ymm1
	vpshufd	$200,%ymm2,%ymm2
	vpshufd	$200,%ymm3,%ymm3
	/* interleaved: finish the table rows, load accumulator limbs into xmm0..xmm4 */
	vmovdqa	%ymm0,32(%edx)
	vmovd	-48(%edi),%xmm0
	vmovdqa	%ymm1,64(%edx)
	vmovd	-44(%edi),%xmm1
	vmovdqa	%ymm2,96(%edx)
	vmovd	-40(%edi),%xmm2
	vmovdqa	%ymm3,128(%edx)
	vmovd	-36(%edi),%xmm3
	vmovd	-32(%edi),%xmm4
	vmovdqa	64(%ebx),%ymm7		/* ymm7 = 26-bit mask in every qword */
	negl	%eax
	testl	$63,%ecx
	jz	.L024even
	/*
	 * Length is not a multiple of 64: consume the 16/32/48 leading
	 * bytes through .L027tail first.  %ebx and %edx are biased so the
	 * tail reads the constant row and the table rows at shifted
	 * offsets; %ecx keeps the remaining multiple of 64.
	 */
	movl	%ecx,%edx
	andl	$-64,%ecx
	andl	$63,%edx
	vmovdqu	(%esi),%xmm5
	cmpl	$32,%edx
	jb	.L025one
	vmovdqu	16(%esi),%xmm6
	je	.L026two
	/* three leading blocks */
	vinserti128	$1,32(%esi),%ymm5,%ymm5
	leal	48(%esi),%esi
	leal	8(%ebx),%ebx
	leal	296(%esp),%edx
	jmp	.L027tail
.L026two:
	leal	32(%esi),%esi
	leal	16(%ebx),%ebx
	leal	304(%esp),%edx
	jmp	.L027tail
.L025one:
	leal	16(%esi),%esi
	vpxor	%ymm6,%ymm6,%ymm6
	leal	32(%ebx,%eax,8),%ebx	/* eax = -(fourth argument) */
	leal	312(%esp),%edx
	jmp	.L027tail
.align	32
.L024even:
	/* ymm5 = blocks 0 and 2, ymm6 = blocks 1 and 3 of the next 64 bytes */
	vmovdqu	(%esi),%xmm5
	vmovdqu	16(%esi),%xmm6
	vinserti128	$1,32(%esi),%ymm5,%ymm5
	vinserti128	$1,48(%esi),%ymm6,%ymm6
	leal	64(%esi),%esi
	subl	$64,%ecx
	jz	.L027tail
.L028loop:
	/* park limbs 0..2; split the loaded blocks into 26-bit limbs */
	vmovdqa	%ymm2,64(%esp)
	vpsrldq	$6,%ymm5,%ymm2
	vmovdqa	%ymm0,(%esp)
	vpsrldq	$6,%ymm6,%ymm0
	vmovdqa	%ymm1,32(%esp)
	vpunpckhqdq	%ymm6,%ymm5,%ymm1
	vpunpcklqdq	%ymm6,%ymm5,%ymm5
	vpunpcklqdq	%ymm0,%ymm2,%ymm2
	vpsrlq	$30,%ymm2,%ymm0
	vpsrlq	$4,%ymm2,%ymm2
	vpsrlq	$26,%ymm5,%ymm6
	vpsrlq	$40,%ymm1,%ymm1
	vpand	%ymm7,%ymm2,%ymm2
	vpand	%ymm7,%ymm5,%ymm5
	vpand	%ymm7,%ymm6,%ymm6
	vpand	%ymm7,%ymm0,%ymm0
	vpor	(%ebx),%ymm1,%ymm1	/* OR the constant row into the top limb */
	/* add the running accumulator to the new limbs */
	vpaddq	64(%esp),%ymm2,%ymm2
	vpaddq	(%esp),%ymm5,%ymm5
	vpaddq	32(%esp),%ymm6,%ymm6
	vpaddq	%ymm3,%ymm0,%ymm0
	vpaddq	%ymm4,%ymm1,%ymm1
	/* limb-by-limb multiply against the table rows */
	vpmuludq	-96(%edx),%ymm2,%ymm3
	vmovdqa	%ymm6,32(%esp)
	vpmuludq	-64(%edx),%ymm2,%ymm4
	vmovdqa	%ymm0,96(%esp)
	vpmuludq	96(%edx),%ymm2,%ymm0
	vmovdqa	%ymm1,128(%esp)
	vpmuludq	128(%edx),%ymm2,%ymm1
	vpmuludq	-128(%edx),%ymm2,%ymm2
	vpmuludq	-32(%edx),%ymm5,%ymm7
	vpaddq	%ymm7,%ymm3,%ymm3
	vpmuludq	(%edx),%ymm5,%ymm6
	vpaddq	%ymm6,%ymm4,%ymm4
	vpmuludq	-128(%edx),%ymm5,%ymm7
	vpaddq	%ymm7,%ymm0,%ymm0
	vmovdqa	32(%esp),%ymm7
	vpmuludq	-96(%edx),%ymm5,%ymm6
	vpaddq	%ymm6,%ymm1,%ymm1
	vpmuludq	-64(%edx),%ymm5,%ymm5
	vpaddq	%ymm5,%ymm2,%ymm2
	vpmuludq	-64(%edx),%ymm7,%ymm6
	vpaddq	%ymm6,%ymm3,%ymm3
	vpmuludq	-32(%edx),%ymm7,%ymm5
	vpaddq	%ymm5,%ymm4,%ymm4
	vpmuludq	128(%edx),%ymm7,%ymm6
	vpaddq	%ymm6,%ymm0,%ymm0
	vmovdqa	96(%esp),%ymm6
	vpmuludq	-128(%edx),%ymm7,%ymm5
	vpaddq	%ymm5,%ymm1,%ymm1
	vpmuludq	-96(%edx),%ymm7,%ymm7
	vpaddq	%ymm7,%ymm2,%ymm2
	vpmuludq	-128(%edx),%ymm6,%ymm5
	vpaddq	%ymm5,%ymm3,%ymm3
	vpmuludq	-96(%edx),%ymm6,%ymm7
	vpaddq	%ymm7,%ymm4,%ymm4
	vpmuludq	64(%edx),%ymm6,%ymm5
	vpaddq	%ymm5,%ymm0,%ymm0
	vmovdqa	128(%esp),%ymm5
	vpmuludq	96(%edx),%ymm6,%ymm7
	vpaddq	%ymm7,%ymm1,%ymm1
	vpmuludq	128(%edx),%ymm6,%ymm6
	vpaddq	%ymm6,%ymm2,%ymm2
	vpmuludq	128(%edx),%ymm5,%ymm7
	vpaddq	%ymm7,%ymm3,%ymm3
	vpmuludq	32(%edx),%ymm5,%ymm6
	vpaddq	%ymm6,%ymm0,%ymm0
	vpmuludq	-128(%edx),%ymm5,%ymm7
	vpaddq	%ymm7,%ymm4,%ymm4
	vmovdqa	64(%ebx),%ymm7		/* reload the 26-bit mask */
	vpmuludq	64(%edx),%ymm5,%ymm6
	vpaddq	%ymm6,%ymm1,%ymm1
	vpmuludq	96(%edx),%ymm5,%ymm5
	vpaddq	%ymm5,%ymm2,%ymm2
	/* carry propagation; the carry out of limb 4 re-enters limb 0 times 5 */
	vpsrlq	$26,%ymm3,%ymm5
	vpand	%ymm7,%ymm3,%ymm3
	vpsrlq	$26,%ymm0,%ymm6
	vpand	%ymm7,%ymm0,%ymm0
	vpaddq	%ymm5,%ymm4,%ymm4
	vpaddq	%ymm6,%ymm1,%ymm1
	vpsrlq	$26,%ymm4,%ymm5
	vpand	%ymm7,%ymm4,%ymm4
	vpsrlq	$26,%ymm1,%ymm6
	vpand	%ymm7,%ymm1,%ymm1
	vpaddq	%ymm6,%ymm2,%ymm2
	vpaddq	%ymm5,%ymm0,%ymm0
	vpsllq	$2,%ymm5,%ymm5
	vpsrlq	$26,%ymm2,%ymm6
	vpand	%ymm7,%ymm2,%ymm2
	vpaddq	%ymm5,%ymm0,%ymm0
	vpaddq	%ymm6,%ymm3,%ymm3
	vpsrlq	$26,%ymm3,%ymm6
	vpsrlq	$26,%ymm0,%ymm5
	vpand	%ymm7,%ymm0,%ymm0
	vpand	%ymm7,%ymm3,%ymm3
	vpaddq	%ymm5,%ymm1,%ymm1
	vpaddq	%ymm6,%ymm4,%ymm4
	/* fetch the next 64 bytes; loop while any remain after them */
	vmovdqu	(%esi),%xmm5
	vmovdqu	16(%esi),%xmm6
	vinserti128	$1,32(%esi),%ymm5,%ymm5
	vinserti128	$1,48(%esi),%ymm6,%ymm6
	leal	64(%esi),%esi
	subl	$64,%ecx
	jnz	.L028loop
.L027tail:
	/*
	 * Same limb split / add / multiply as the loop body, but every
	 * table offset is 4 bytes higher, so vpmuludq picks up the other
	 * dword of each table qword.  A horizontal sum of the four lanes
	 * follows.
	 */
	vmovdqa	%ymm2,64(%esp)
	vpsrldq	$6,%ymm5,%ymm2
	vmovdqa	%ymm0,(%esp)
	vpsrldq	$6,%ymm6,%ymm0
	vmovdqa	%ymm1,32(%esp)
	vpunpckhqdq	%ymm6,%ymm5,%ymm1
	vpunpcklqdq	%ymm6,%ymm5,%ymm5
	vpunpcklqdq	%ymm0,%ymm2,%ymm2
	vpsrlq	$30,%ymm2,%ymm0
	vpsrlq	$4,%ymm2,%ymm2
	vpsrlq	$26,%ymm5,%ymm6
	vpsrlq	$40,%ymm1,%ymm1
	vpand	%ymm7,%ymm2,%ymm2
	vpand	%ymm7,%ymm5,%ymm5
	vpand	%ymm7,%ymm6,%ymm6
	vpand	%ymm7,%ymm0,%ymm0
	vpor	(%ebx),%ymm1,%ymm1
	andl	$-64,%ebx		/* undo any leftover-block bias on %ebx */
	vpaddq	64(%esp),%ymm2,%ymm2
	vpaddq	(%esp),%ymm5,%ymm5
	vpaddq	32(%esp),%ymm6,%ymm6
	vpaddq	%ymm3,%ymm0,%ymm0
	vpaddq	%ymm4,%ymm1,%ymm1
	vpmuludq	-92(%edx),%ymm2,%ymm3
	vmovdqa	%ymm6,32(%esp)
	vpmuludq	-60(%edx),%ymm2,%ymm4
	vmovdqa	%ymm0,96(%esp)
	vpmuludq	100(%edx),%ymm2,%ymm0
	vmovdqa	%ymm1,128(%esp)
	vpmuludq	132(%edx),%ymm2,%ymm1
	vpmuludq	-124(%edx),%ymm2,%ymm2
	vpmuludq	-28(%edx),%ymm5,%ymm7
	vpaddq	%ymm7,%ymm3,%ymm3
	vpmuludq	4(%edx),%ymm5,%ymm6
	vpaddq	%ymm6,%ymm4,%ymm4
	vpmuludq	-124(%edx),%ymm5,%ymm7
	vpaddq	%ymm7,%ymm0,%ymm0
	vmovdqa	32(%esp),%ymm7
	vpmuludq	-92(%edx),%ymm5,%ymm6
	vpaddq	%ymm6,%ymm1,%ymm1
	vpmuludq	-60(%edx),%ymm5,%ymm5
	vpaddq	%ymm5,%ymm2,%ymm2
	vpmuludq	-60(%edx),%ymm7,%ymm6
	vpaddq	%ymm6,%ymm3,%ymm3
	vpmuludq	-28(%edx),%ymm7,%ymm5
	vpaddq	%ymm5,%ymm4,%ymm4
	vpmuludq	132(%edx),%ymm7,%ymm6
	vpaddq	%ymm6,%ymm0,%ymm0
	vmovdqa	96(%esp),%ymm6
	vpmuludq	-124(%edx),%ymm7,%ymm5
	vpaddq	%ymm5,%ymm1,%ymm1
	vpmuludq	-92(%edx),%ymm7,%ymm7
	vpaddq	%ymm7,%ymm2,%ymm2
	vpmuludq	-124(%edx),%ymm6,%ymm5
	vpaddq	%ymm5,%ymm3,%ymm3
	vpmuludq	-92(%edx),%ymm6,%ymm7
	vpaddq	%ymm7,%ymm4,%ymm4
	vpmuludq	68(%edx),%ymm6,%ymm5
	vpaddq	%ymm5,%ymm0,%ymm0
	vmovdqa	128(%esp),%ymm5
	vpmuludq	100(%edx),%ymm6,%ymm7
	vpaddq	%ymm7,%ymm1,%ymm1
	vpmuludq	132(%edx),%ymm6,%ymm6
	vpaddq	%ymm6,%ymm2,%ymm2
	vpmuludq	132(%edx),%ymm5,%ymm7
	vpaddq	%ymm7,%ymm3,%ymm3
	vpmuludq	36(%edx),%ymm5,%ymm6
	vpaddq	%ymm6,%ymm0,%ymm0
	vpmuludq	-124(%edx),%ymm5,%ymm7
	vpaddq	%ymm7,%ymm4,%ymm4
	vmovdqa	64(%ebx),%ymm7		/* reload the 26-bit mask */
	vpmuludq	68(%edx),%ymm5,%ymm6
	vpaddq	%ymm6,%ymm1,%ymm1
	vpmuludq	100(%edx),%ymm5,%ymm5
	vpaddq	%ymm5,%ymm2,%ymm2
	/* horizontal add: fold qword 1 into 0 per 128-bit half, then qword 2 into 0 */
	vpsrldq	$8,%ymm4,%ymm5
	vpsrldq	$8,%ymm3,%ymm6
	vpaddq	%ymm5,%ymm4,%ymm4
	vpsrldq	$8,%ymm0,%ymm5
	vpaddq	%ymm6,%ymm3,%ymm3
	vpsrldq	$8,%ymm1,%ymm6
	vpaddq	%ymm5,%ymm0,%ymm0
	vpsrldq	$8,%ymm2,%ymm5
	vpaddq	%ymm6,%ymm1,%ymm1
	vpermq	$2,%ymm4,%ymm6
	vpaddq	%ymm5,%ymm2,%ymm2
	vpermq	$2,%ymm3,%ymm5
	vpaddq	%ymm6,%ymm4,%ymm4
	vpermq	$2,%ymm0,%ymm6
	vpaddq	%ymm5,%ymm3,%ymm3
	vpermq	$2,%ymm1,%ymm5
	vpaddq	%ymm6,%ymm0,%ymm0
	vpermq	$2,%ymm2,%ymm6
	vpaddq	%ymm5,%ymm1,%ymm1
	vpaddq	%ymm6,%ymm2,%ymm2
	/* carry propagation, same pattern as in the loop */
	vpsrlq	$26,%ymm3,%ymm5
	vpand	%ymm7,%ymm3,%ymm3
	vpsrlq	$26,%ymm0,%ymm6
	vpand	%ymm7,%ymm0,%ymm0
	vpaddq	%ymm5,%ymm4,%ymm4
	vpaddq	%ymm6,%ymm1,%ymm1
	vpsrlq	$26,%ymm4,%ymm5
	vpand	%ymm7,%ymm4,%ymm4
	vpsrlq	$26,%ymm1,%ymm6
	vpand	%ymm7,%ymm1,%ymm1
	vpaddq	%ymm6,%ymm2,%ymm2
	vpaddq	%ymm5,%ymm0,%ymm0
	vpsllq	$2,%ymm5,%ymm5
	vpsrlq	$26,%ymm2,%ymm6
	vpand	%ymm7,%ymm2,%ymm2
	vpaddq	%ymm5,%ymm0,%ymm0
	vpaddq	%ymm6,%ymm3,%ymm3
	vpsrlq	$26,%ymm3,%ymm6
	vpsrlq	$26,%ymm0,%ymm5
	vpand	%ymm7,%ymm0,%ymm0
	vpand	%ymm7,%ymm3,%ymm3
	vpaddq	%ymm5,%ymm1,%ymm1
	vpaddq	%ymm6,%ymm4,%ymm4
	cmpl	$0,%ecx
	je	.L029done
	/*
	 * Whole 64-byte groups remain after a leading partial group:
	 * reset %edx to the unbiased table pointer and re-enter the even
	 * path.  Being 128-bit VEX writes, these shuffles also clear the
	 * upper halves of ymm0..ymm4.
	 */
	vpshufd	$252,%xmm0,%xmm0
	leal	288(%esp),%edx
	vpshufd	$252,%xmm1,%xmm1
	vpshufd	$252,%xmm2,%xmm2
	vpshufd	$252,%xmm3,%xmm3
	vpshufd	$252,%xmm4,%xmm4
	jmp	.L024even
.align	16
.L029done:
	/* %edi is state + 48, so these land on state+0 .. state+16 */
	vmovd	%xmm0,-48(%edi)
	vmovd	%xmm1,-44(%edi)
	vmovd	%xmm2,-40(%edi)
	vmovd	%xmm3,-36(%edi)
	vmovd	%xmm4,-32(%edi)
	vzeroupper
	movl	%ebp,%esp
.L020nodata:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	_poly1305_blocks_avx2,.-_poly1305_blocks_avx2
/*
 * .Lconst_sse2 -- 64-byte aligned constant table addressed through %ebx
 * by the vector routines in this file.
 *   +0    1<<24 in the low dword of each of four qwords
 *   +32   32 zero bytes
 *   +64   2^26-1 in the low dword of each of four qwords (limb mask)
 *   +96   0x0fffffff, 0x0ffffffc, 0x0ffffffc, 0x0ffffffc
 * The .byte rows that follow spell the NUL-terminated ASCII string
 * "Poly1305 for x86, CRYPTOGAMS by <appro@openssl.org>".
 */
.align	64
.Lconst_sse2:
.long	16777216,0,16777216,0,16777216,0,16777216,0
.long	0,0,0,0,0,0,0,0
.long	67108863,0,67108863,0,67108863,0,67108863,0
.long	268435455,268435452,268435452,268435452
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
.byte	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
.byte	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
.byte	114,103,62,0
.align	4
/* 16-byte, 4-byte aligned common symbol; poly1305_init reads dwords 0 and 2 of it */
.comm	OPENSSL_ia32cap_P,16,4

/*
 * ELF property note.  Numeric local labels 0..4 delimit the fields so the
 * name and descriptor sizes are computed by the assembler:
 *   namesz = 1f - 0f, descsz = 4f - 1f, type = 5, name = "GNU",
 *   then one property record: type 0xc0000002, size 3f - 2f, value 3.
 * NOTE(review): per the x86 psABI these are NT_GNU_PROPERTY_TYPE_0 and
 * GNU_PROPERTY_X86_FEATURE_1_AND with IBT|SHSTK set -- confirm against the
 * ABI document.
 */
	.section ".note.gnu.property", "a"
	.p2align 2
	.long 1f - 0f
	.long 4f - 1f
	.long 5
0:
	.asciz "GNU"
1:
	.p2align 2
	.long 0xc0000002
	.long 3f - 2f
2:
	.long 3
3:
	.p2align 2
4:
#endif