/* Do not modify. This file is auto-generated from poly1305-x86.pl. */
#ifdef PIC
.text
.align	64
/*
 * poly1305_init
 *
 * cdecl, three stack arguments (read after four register pushes):
 *   arg1 20(%esp) -> %edi  context pointer
 *   arg2 24(%esp) -> %esi  pointer to 16 bytes of key material, may be zero
 *   arg3 28(%esp) -> %ebp  pointer to a two-entry function pointer table
 *
 * Clears the 24 bytes at offsets 0..20 of the context.  If arg2 is zero
 * the routine returns at once with %eax = 0.  Otherwise it stores the
 * addresses of a "blocks" and an "emit" routine into the table at arg3,
 * stores the masked 16 key bytes at offsets 24..36 of the context and
 * returns %eax = 1.
 *
 * The routines are chosen from OPENSSL_ia32cap_P: when both bits of
 * mask 83886080 (0x05000000) are set in word 0, the _sse2 variants are
 * used; when bit 5 (mask 32) of the word at offset 8 is also set,
 * _poly1305_blocks_avx2 replaces the blocks routine.
 */
.globl	poly1305_init
.type	poly1305_init,@function
.align	16
poly1305_init:
.L_poly1305_init_begin:
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ebp
	/* zero context words 0..5 */
	xorl	%eax,%eax
	movl	%eax,(%edi)
	movl	%eax,4(%edi)
	movl	%eax,8(%edi)
	movl	%eax,12(%edi)
	movl	%eax,16(%edi)
	movl	%eax,20(%edi)
	cmpl	$0,%esi
	je	.L000nokey
	/* call/pop yields the current address for PIC-relative lea */
	call	.L001pic_point
.L001pic_point:
	popl	%ebx
	leal	poly1305_blocks-.L001pic_point(%ebx),%eax
	leal	poly1305_emit-.L001pic_point(%ebx),%edx
	leal	OPENSSL_ia32cap_P-.L001pic_point(%ebx),%edi
	movl	(%edi),%ecx
	andl	$83886080,%ecx
	cmpl	$83886080,%ecx
	jne	.L002no_sse2
	leal	_poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
	leal	_poly1305_emit_sse2-.L001pic_point(%ebx),%edx
	movl	8(%edi),%ecx
	testl	$32,%ecx
	jz	.L002no_sse2
	leal	_poly1305_blocks_avx2-.L001pic_point(%ebx),%eax
.L002no_sse2:
	/* %edi was reused above; reload the context pointer */
	movl	20(%esp),%edi
	movl	%eax,(%ebp)
	movl	%edx,4(%ebp)
	/* load the 16 key bytes and apply the masks 0x0fffffff / 0x0ffffffc */
	movl	(%esi),%eax
	movl	4(%esi),%ebx
	movl	8(%esi),%ecx
	movl	12(%esi),%edx
	andl	$268435455,%eax
	andl	$268435452,%ebx
	andl	$268435452,%ecx
	andl	$268435452,%edx
	movl	%eax,24(%edi)
	movl	%ebx,28(%edi)
	movl	%ecx,32(%edi)
	movl	%edx,36(%edi)
	movl	$1,%eax
.L000nokey:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	poly1305_init,.-.L_poly1305_init_begin
/*
 * poly1305_blocks
 *
 * cdecl, four stack arguments:
 *   arg1 20(%esp) -> %edi  context pointer
 *   arg2 24(%esp) -> %esi  input pointer
 *   arg3 28(%esp) -> %ecx  input length in bytes
 *   arg4 32(%esp)          word added to the top accumulator word for
 *                          every block (read as 96(%esp) inside the loop)
 *
 * Processes the input 16 bytes at a time using 32-bit integer
 * multiplies.  The five accumulator words live at offsets 0..16 of the
 * context and the four key words at offsets 24..36.
 *
 * .Lenter_blocks is also entered from _poly1305_blocks_sse2 with
 * %edi/%esi/%ecx already loaded and the same four registers pushed.
 *
 * Stack frame after "subl $64,%esp":
 *    0..16(%esp)  scratch copies of accumulator words
 *   20..28(%esp)  low three words of the product
 *   36..48(%esp)  key words r0..r3
 *   52..60(%esp)  r1+(r1>>2), r2+(r2>>2), r3+(r3>>2)
 *   84(%esp)      arg1 (context)
 *   92(%esp)      arg3 slot, overwritten with the end-of-input pointer
 */
.globl	poly1305_blocks
.type	poly1305_blocks,@function
.align	16
poly1305_blocks:
.L_poly1305_blocks_begin:
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
.Lenter_blocks:
	/*
	 * Round the length down to whole 16-byte blocks.  The mask used
	 * to be -15, which keeps bit 0 of the length: with an odd length
	 * the end pointer stored at 92(%esp) is never equal to the input
	 * pointer that advances by 16, so the loop below would not stop.
	 * -16 is the mask _poly1305_blocks_sse2 applies as well, and the
	 * result is unchanged for every length that is a multiple of 16.
	 */
	andl	$-16,%ecx
	jz	.L003nodata
	subl	$64,%esp
	movl	24(%edi),%eax
	movl	28(%edi),%ebx
	leal	(%esi,%ecx,1),%ebp
	movl	32(%edi),%ecx
	movl	36(%edi),%edx
	movl	%ebp,92(%esp)
	movl	%esi,%ebp
	movl	%eax,36(%esp)
	movl	%ebx,%eax
	shrl	$2,%eax
	movl	%ebx,40(%esp)
	addl	%ebx,%eax
	movl	%ecx,%ebx
	shrl	$2,%ebx
	movl	%ecx,44(%esp)
	addl	%ecx,%ebx
	movl	%edx,%ecx
	shrl	$2,%ecx
	movl	%edx,48(%esp)
	addl	%edx,%ecx
	movl	%eax,52(%esp)
	movl	%ebx,56(%esp)
	movl	%ecx,60(%esp)
	/* accumulator: %eax,%ebx,%ecx,%esi,%edi = context words 0..4 */
	movl	(%edi),%eax
	movl	4(%edi),%ebx
	movl	8(%edi),%ecx
	movl	12(%edi),%esi
	movl	16(%edi),%edi
	jmp	.L004loop
.align	32
.L004loop:
	/* accumulator += 16 input bytes, arg4 goes into the top word */
	addl	(%ebp),%eax
	adcl	4(%ebp),%ebx
	adcl	8(%ebp),%ecx
	adcl	12(%ebp),%esi
	leal	16(%ebp),%ebp
	adcl	96(%esp),%edi
	movl	%eax,(%esp)
	movl	%esi,12(%esp)
	/* product word 0 -> 20(%esp) */
	mull	36(%esp)
	movl	%edi,16(%esp)
	movl	%eax,%edi
	movl	%ebx,%eax
	movl	%edx,%esi
	mull	60(%esp)
	addl	%eax,%edi
	movl	%ecx,%eax
	adcl	%edx,%esi
	mull	56(%esp)
	addl	%eax,%edi
	movl	12(%esp),%eax
	adcl	%edx,%esi
	mull	52(%esp)
	addl	%eax,%edi
	movl	(%esp),%eax
	adcl	%edx,%esi
	/* product word 1 -> 24(%esp) */
	mull	40(%esp)
	movl	%edi,20(%esp)
	xorl	%edi,%edi
	addl	%eax,%esi
	movl	%ebx,%eax
	adcl	%edx,%edi
	mull	36(%esp)
	addl	%eax,%esi
	movl	%ecx,%eax
	adcl	%edx,%edi
	mull	60(%esp)
	addl	%eax,%esi
	movl	12(%esp),%eax
	adcl	%edx,%edi
	mull	56(%esp)
	addl	%eax,%esi
	movl	16(%esp),%eax
	adcl	%edx,%edi
	imull	52(%esp),%eax
	addl	%eax,%esi
	movl	(%esp),%eax
	adcl	$0,%edi
	/* product word 2 -> 28(%esp) */
	mull	44(%esp)
	movl	%esi,24(%esp)
	xorl	%esi,%esi
	addl	%eax,%edi
	movl	%ebx,%eax
	adcl	%edx,%esi
	mull	40(%esp)
	addl	%eax,%edi
	movl	%ecx,%eax
	adcl	%edx,%esi
	mull	36(%esp)
	addl	%eax,%edi
	movl	12(%esp),%eax
	adcl	%edx,%esi
	mull	60(%esp)
	addl	%eax,%edi
	movl	16(%esp),%eax
	adcl	%edx,%esi
	imull	56(%esp),%eax
	addl	%eax,%edi
	movl	(%esp),%eax
	adcl	$0,%esi
	/* product word 3 stays in %esi, word 4 is built in %edx */
	mull	48(%esp)
	movl	%edi,28(%esp)
	xorl	%edi,%edi
	addl	%eax,%esi
	movl	%ebx,%eax
	adcl	%edx,%edi
	mull	44(%esp)
	addl	%eax,%esi
	movl	%ecx,%eax
	adcl	%edx,%edi
	mull	40(%esp)
	addl	%eax,%esi
	movl	12(%esp),%eax
	adcl	%edx,%edi
	mull	36(%esp)
	addl	%eax,%esi
	movl	16(%esp),%ecx
	adcl	%edx,%edi
	movl	%ecx,%edx
	imull	60(%esp),%ecx
	addl	%ecx,%esi
	movl	20(%esp),%eax
	adcl	$0,%edi
	imull	36(%esp),%edx
	addl	%edi,%edx
	movl	24(%esp),%ebx
	movl	28(%esp),%ecx
	/* keep the low 2 bits of the top word, fold (top>>2)*5 into word 0 */
	movl	%edx,%edi
	shrl	$2,%edx
	andl	$3,%edi
	leal	(%edx,%edx,4),%edx
	addl	%edx,%eax
	adcl	$0,%ebx
	adcl	$0,%ecx
	adcl	$0,%esi
	adcl	$0,%edi
	cmpl	92(%esp),%ebp
	jne	.L004loop
	/* write the accumulator back to the context */
	movl	84(%esp),%edx
	addl	$64,%esp
	movl	%eax,(%edx)
	movl	%ebx,4(%edx)
	movl	%ecx,8(%edx)
	movl	%esi,12(%edx)
	movl	%edi,16(%edx)
.L003nodata:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	poly1305_blocks,.-.L_poly1305_blocks_begin
/*
 * poly1305_emit
 *
 * cdecl, three stack arguments:
 *   arg1 20(%esp) -> %ebp  context pointer (accumulator at offsets 0..16)
 *   arg2 24(%esp) -> %edi  16-byte output buffer
 *   arg3 28(%esp)          pointer to 16 bytes added to the result
 *
 * Computes accumulator+5 and uses the carry above bit 1 of the top word
 * to choose, with masks rather than branches, between accumulator+5 and
 * the unmodified accumulator.  The chosen low 128 bits plus the 16
 * bytes at arg3 are written to arg2.
 *
 * .Lenter_emit is also entered from _poly1305_emit_sse2 with %ebp
 * already loaded and the same four registers pushed.
 */
.globl	poly1305_emit
.type	poly1305_emit,@function
.align	16
poly1305_emit:
.L_poly1305_emit_begin:
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%ebp
.Lenter_emit:
	movl	24(%esp),%edi
	movl	(%ebp),%eax
	movl	4(%ebp),%ebx
	movl	8(%ebp),%ecx
	movl	12(%ebp),%edx
	movl	16(%ebp),%esi
	addl	$5,%eax
	adcl	$0,%ebx
	adcl	$0,%ecx
	adcl	$0,%edx
	adcl	$0,%esi
	/* %esi = 0 or all-ones selection mask */
	shrl	$2,%esi
	negl	%esi
	andl	%esi,%eax
	andl	%esi,%ebx
	andl	%esi,%ecx
	andl	%esi,%edx
	/* the output buffer doubles as scratch for the masked sum */
	movl	%eax,(%edi)
	movl	%ebx,4(%edi)
	movl	%ecx,8(%edi)
	movl	%edx,12(%edi)
	notl	%esi
	movl	(%ebp),%eax
	movl	4(%ebp),%ebx
	movl	8(%ebp),%ecx
	movl	12(%ebp),%edx
	movl	28(%esp),%ebp
	andl	%esi,%eax
	andl	%esi,%ebx
	andl	%esi,%ecx
	andl	%esi,%edx
	orl	(%edi),%eax
	orl	4(%edi),%ebx
	orl	8(%edi),%ecx
	orl	12(%edi),%edx
	/* add the 16 bytes at arg3 */
	addl	(%ebp),%eax
	adcl	4(%ebp),%ebx
	adcl	8(%ebp),%ecx
	adcl	12(%ebp),%edx
	movl	%eax,(%edi)
	movl	%ebx,4(%edi)
	movl	%ecx,8(%edi)
	movl	%edx,12(%edi)
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	poly1305_emit,.-.L_poly1305_emit_begin
/*
 * _poly1305_init_sse2 (file-local helper, register arguments)
 *
 * In:   %edi = context pointer, %ebx = address of .Lconst_sse2
 *       (set up by the caller in _poly1305_blocks_sse2).
 * Out:  144 bytes of table data written at offsets 48..191 of the
 *       context; %edi is restored to the context pointer on return.
 * Uses: %ebp to hold the incoming %esp across a 16-byte aligned
 *       scratch frame, %ecx as a two-pass counter, %edx, %xmm0-%xmm7.
 *
 * The 16 key bytes at 24(%edi) are split into five limbs with shifts
 * of 26/52/78/104 bits and the mask loaded from 64(%ebx) (presumably
 * 2^26-1; the constant table is outside this view).  The .L005square
 * body then runs twice; after the first pass the fresh result is
 * paired with the previous limbs, after the second the lanes are
 * merged, shuffled and stored together with their times-five
 * companions (x<<2 + x).
 */
.align	32
.type	_poly1305_init_sse2,@function
.align	16
_poly1305_init_sse2:
	movdqu	24(%edi),%xmm4
	leal	48(%edi),%edi
	movl	%esp,%ebp
	subl	$224,%esp
	andl	$-16,%esp
	movq	64(%ebx),%xmm7
	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	movdqa	%xmm4,%xmm2
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm1
	psrldq	$6,%xmm2
	pand	%xmm7,%xmm1
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	psrldq	$13,%xmm4
	leal	144(%esp),%edx
	movl	$2,%ecx
.L005square:
	/* save the five limbs and their times-five companions */
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	movdqa	%xmm1,%xmm6
	movdqa	%xmm2,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm1,%xmm6
	paddd	%xmm2,%xmm5
	movdqa	%xmm6,80(%esp)
	movdqa	%xmm5,96(%esp)
	movdqa	%xmm3,%xmm6
	movdqa	%xmm4,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm3,%xmm6
	paddd	%xmm4,%xmm5
	movdqa	%xmm6,112(%esp)
	movdqa	%xmm5,128(%esp)
	/* duplicate the low quadword of each limb into (%edx).. */
	pshufd	$68,%xmm0,%xmm6
	movdqa	%xmm1,%xmm5
	pshufd	$68,%xmm1,%xmm1
	pshufd	$68,%xmm2,%xmm2
	pshufd	$68,%xmm3,%xmm3
	pshufd	$68,%xmm4,%xmm4
	movdqa	%xmm6,(%edx)
	movdqa	%xmm1,16(%edx)
	movdqa	%xmm2,32(%edx)
	movdqa	%xmm3,48(%edx)
	movdqa	%xmm4,64(%edx)
	/* 5x5 limb multiplication, products accumulated in %xmm0-%xmm4 */
	pmuludq	%xmm0,%xmm4
	pmuludq	%xmm0,%xmm3
	pmuludq	%xmm0,%xmm2
	pmuludq	%xmm0,%xmm1
	pmuludq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	48(%edx),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	80(%esp),%xmm6
	pmuludq	(%edx),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%edx),%xmm6
	movdqa	32(%esp),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%edx),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%edx),%xmm5
	paddq	%xmm7,%xmm4
	movdqa	96(%esp),%xmm7
	pmuludq	(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%edx),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%edx),%xmm5
	movdqa	48(%esp),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%edx),%xmm6
	paddq	%xmm5,%xmm0
	movdqa	112(%esp),%xmm5
	pmuludq	(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%edx),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%edx),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%edx),%xmm7
	movdqa	64(%esp),%xmm5
	paddq	%xmm6,%xmm1
	movdqa	128(%esp),%xmm6
	pmuludq	(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%edx),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%edx),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* carry propagation between limbs using the mask in %xmm7 */
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
	decl	%ecx
	jz	.L006square_break
	/* first pass done: pair the new limbs with the saved ones */
	punpcklqdq	(%esp),%xmm0
	punpcklqdq	16(%esp),%xmm1
	punpcklqdq	32(%esp),%xmm2
	punpcklqdq	48(%esp),%xmm3
	punpcklqdq	64(%esp),%xmm4
	jmp	.L005square
.L006square_break:
	/* merge with the saved limbs, reorder lanes, store the table */
	psllq	$32,%xmm0
	psllq	$32,%xmm1
	psllq	$32,%xmm2
	psllq	$32,%xmm3
	psllq	$32,%xmm4
	por	(%esp),%xmm0
	por	16(%esp),%xmm1
	por	32(%esp),%xmm2
	por	48(%esp),%xmm3
	por	64(%esp),%xmm4
	pshufd	$141,%xmm0,%xmm0
	pshufd	$141,%xmm1,%xmm1
	pshufd	$141,%xmm2,%xmm2
	pshufd	$141,%xmm3,%xmm3
	pshufd	$141,%xmm4,%xmm4
	movdqu	%xmm0,(%edi)
	movdqu	%xmm1,16(%edi)
	movdqu	%xmm2,32(%edi)
	movdqu	%xmm3,48(%edi)
	movdqu	%xmm4,64(%edi)
	movdqa	%xmm1,%xmm6
	movdqa	%xmm2,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm1,%xmm6
	paddd	%xmm2,%xmm5
	movdqu	%xmm6,80(%edi)
	movdqu	%xmm5,96(%edi)
	movdqa	%xmm3,%xmm6
	movdqa	%xmm4,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm3,%xmm6
	paddd	%xmm4,%xmm5
	movdqu	%xmm6,112(%edi)
	movdqu	%xmm5,128(%edi)
	movl	%ebp,%esp
	leal	-48(%edi),%edi
	ret
.size	_poly1305_init_sse2,.-_poly1305_init_sse2
/*
 * _poly1305_blocks_sse2
 *
 * Same cdecl arguments as poly1305_blocks:
 *   arg1 20(%esp) -> %edi  context pointer
 *   arg2 24(%esp) -> %esi  input pointer
 *   arg3 28(%esp) -> %ecx  input length in bytes (rounded down to 16)
 *   arg4 32(%esp)          per-block word, shifted left by 24 before it
 *                          is added to the top limb
 *
 * The word at offset 20 of the context is a flag: zero means the
 * accumulator is still held as five 32-bit words, non-zero means five
 * 26-bit limbs.  With the flag clear and fewer than 64 bytes of input
 * the work is handed to the integer code at .Lenter_blocks.  Otherwise,
 * on first use, _poly1305_init_sse2 fills the table at offset 48, the
 * accumulator is converted to limbs and the flag is set to 1.
 *
 * %ebp keeps the incoming %esp while a 16-byte aligned scratch frame
 * is in use; %ebx points at .Lconst_sse2.
 */
.align	32
.type	_poly1305_blocks_sse2,@function
.align	16
_poly1305_blocks_sse2:
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
	movl	20(%edi),%eax
	andl	$-16,%ecx
	jz	.L007nodata
	cmpl	$64,%ecx
	jae	.L008enter_sse2
	testl	%eax,%eax
	jz	.Lenter_blocks
.align	16
.L008enter_sse2:
	call	.L009pic_point
.L009pic_point:
	popl	%ebx
	leal	.Lconst_sse2-.L009pic_point(%ebx),%ebx
	testl	%eax,%eax
	jnz	.L010base2_26
	call	_poly1305_init_sse2
	/* convert the accumulator words to five limbs masked with 2^26-1 */
	movl	(%edi),%eax
	movl	3(%edi),%ecx
	movl	6(%edi),%edx
	movl	9(%edi),%esi
	movl	13(%edi),%ebp
	movl	$1,20(%edi)
	shrl	$2,%ecx
	andl	$67108863,%eax
	shrl	$4,%edx
	andl	$67108863,%ecx
	shrl	$6,%esi
	andl	$67108863,%edx
	movd	%eax,%xmm0
	movd	%ecx,%xmm1
	movd	%edx,%xmm2
	movd	%esi,%xmm3
	movd	%ebp,%xmm4
	/* %esi and %ecx were used as scratch; reload input and length */
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
	jmp	.L011base2_32
.align	16
.L010base2_26:
	movd	(%edi),%xmm0
	movd	4(%edi),%xmm1
	movd	8(%edi),%xmm2
	movd	12(%edi),%xmm3
	movd	16(%edi),%xmm4
	movdqa	64(%ebx),%xmm7
.L011base2_32:
	movl	32(%esp),%eax
	movl	%esp,%ebp
	subl	$528,%esp
	andl	$-16,%esp
	leal	48(%edi),%edi
	shll	$24,%eax
	/* an odd number of 16-byte blocks: process one block on its own */
	testl	$31,%ecx
	jz	.L012even
	movdqu	(%esi),%xmm6
	leal	16(%esi),%esi
	movdqa	%xmm6,%xmm5
	pand	%xmm7,%xmm6
	paddd	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	psrlq	$26,%xmm5
	psrldq	$6,%xmm6
	pand	%xmm7,%xmm5
	paddd	%xmm5,%xmm1
	movdqa	%xmm6,%xmm5
	psrlq	$4,%xmm6
	pand	%xmm7,%xmm6
	paddd	%xmm6,%xmm2
	movdqa	%xmm5,%xmm6
	psrlq	$30,%xmm5
	pand	%xmm7,%xmm5
	psrldq	$7,%xmm6
	paddd	%xmm5,%xmm3
	movd	%eax,%xmm5
	paddd	%xmm6,%xmm4
	movd	12(%edi),%xmm6
	paddd	%xmm5,%xmm4
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	pmuludq	%xmm6,%xmm0
	pmuludq	%xmm6,%xmm1
	pmuludq	%xmm6,%xmm2
	movd	28(%edi),%xmm5
	pmuludq	%xmm6,%xmm3
	pmuludq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	48(%esp),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	movd	92(%edi),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%esp),%xmm6
	movd	44(%edi),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%esp),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%esp),%xmm5
	paddq	%xmm7,%xmm4
	movd	108(%edi),%xmm7
	pmuludq	(%esp),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%esp),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%esp),%xmm5
	movd	60(%edi),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%esp),%xmm6
	paddq	%xmm5,%xmm0
	movd	124(%edi),%xmm5
	pmuludq	(%esp),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%esp),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%esp),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%esp),%xmm7
	movd	76(%edi),%xmm5
	paddq	%xmm6,%xmm1
	movd	140(%edi),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%esp),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%esp),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* carry propagation between limbs */
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
	subl	$16,%ecx
	jz	.L013done
.L012even:
	/*
	 * Expand the nine table entries from the context onto the stack:
	 * the low-quadword copies go to (%edx)..128(%edx), the
	 * high-quadword copies to -144(%edx)..-16(%edx).
	 * The flags of "subl $64,%ecx" are consumed by the cmovbl here
	 * and by the jbe before .L015loop; none of the SSE or lea
	 * instructions in between modifies them.
	 */
	leal	384(%esp),%edx
	leal	-32(%esi),%eax
	subl	$64,%ecx
	movdqu	(%edi),%xmm5
	pshufd	$68,%xmm5,%xmm6
	cmovbl	%eax,%esi
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,(%edx)
	leal	160(%esp),%eax
	movdqu	16(%edi),%xmm6
	movdqa	%xmm5,-144(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,16(%edx)
	movdqu	32(%edi),%xmm5
	movdqa	%xmm6,-128(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,32(%edx)
	movdqu	48(%edi),%xmm6
	movdqa	%xmm5,-112(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,48(%edx)
	movdqu	64(%edi),%xmm5
	movdqa	%xmm6,-96(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,64(%edx)
	movdqu	80(%edi),%xmm6
	movdqa	%xmm5,-80(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,80(%edx)
	movdqu	96(%edi),%xmm5
	movdqa	%xmm6,-64(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,96(%edx)
	movdqu	112(%edi),%xmm6
	movdqa	%xmm5,-48(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,112(%edx)
	movdqu	128(%edi),%xmm5
	movdqa	%xmm6,-32(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,128(%edx)
	movdqa	%xmm5,-16(%edx)
	/* load two input blocks and split them into limbs */
	movdqu	32(%esi),%xmm5
	movdqu	48(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,112(%esp)
	movdqa	%xmm3,128(%esp)
	movdqa	%xmm4,144(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	movdqa	%xmm0,80(%esp)
	movdqa	%xmm1,96(%esp)
	jbe	.L014skip_loop
	jmp	.L015loop
.align	32
.L015loop:
	/* main loop: 64 bytes of input per iteration */
	movdqa	-144(%edx),%xmm7
	movdqa	%xmm6,16(%eax)
	movdqa	%xmm2,32(%eax)
	movdqa	%xmm3,48(%eax)
	movdqa	%xmm4,64(%eax)
	movdqa	%xmm5,%xmm1
	pmuludq	%xmm7,%xmm5
	movdqa	%xmm6,%xmm0
	pmuludq	%xmm7,%xmm6
	pmuludq	%xmm7,%xmm2
	pmuludq	%xmm7,%xmm3
	pmuludq	%xmm7,%xmm4
	pmuludq	-16(%edx),%xmm0
	movdqa	%xmm1,%xmm7
	pmuludq	-128(%edx),%xmm1
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	-112(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	%xmm5,%xmm6
	pmuludq	-96(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	16(%eax),%xmm7
	pmuludq	-80(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	-128(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	-112(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	32(%eax),%xmm7
	pmuludq	-96(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	-32(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	-16(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	-128(%edx),%xmm6
	paddq	%xmm5,%xmm1
	movdqa	48(%eax),%xmm5
	pmuludq	-112(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	-48(%edx),%xmm5
	paddq	%xmm7,%xmm4
	movdqa	%xmm6,%xmm7
	pmuludq	-32(%edx),%xmm6
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	-16(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	64(%eax),%xmm6
	pmuludq	-128(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	%xmm6,%xmm7
	pmuludq	-16(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	-64(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	-48(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	64(%ebx),%xmm7
	pmuludq	-32(%edx),%xmm6
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* next two input blocks, added to the saved accumulator limbs */
	movdqu	-32(%esi),%xmm5
	movdqu	-16(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	leal	-32(%esi),%eax
	/* flags from this subl feed the cmovbl below and the ja at the loop end */
	subl	$64,%ecx
	paddd	80(%esp),%xmm5
	paddd	96(%esp),%xmm6
	paddd	112(%esp),%xmm2
	paddd	128(%esp),%xmm3
	paddd	144(%esp),%xmm4
	cmovbl	%eax,%esi
	leal	160(%esp),%eax
	movdqa	(%edx),%xmm7
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm6,16(%eax)
	movdqa	%xmm2,32(%eax)
	movdqa	%xmm3,48(%eax)
	movdqa	%xmm4,64(%eax)
	movdqa	%xmm5,%xmm1
	pmuludq	%xmm7,%xmm5
	paddq	%xmm0,%xmm5
	movdqa	%xmm6,%xmm0
	pmuludq	%xmm7,%xmm6
	pmuludq	%xmm7,%xmm2
	pmuludq	%xmm7,%xmm3
	pmuludq	%xmm7,%xmm4
	paddq	16(%esp),%xmm6
	paddq	32(%esp),%xmm2
	paddq	48(%esp),%xmm3
	paddq	64(%esp),%xmm4
	pmuludq	128(%edx),%xmm0
	movdqa	%xmm1,%xmm7
	pmuludq	16(%edx),%xmm1
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	32(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	%xmm5,%xmm6
	pmuludq	48(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	16(%eax),%xmm7
	pmuludq	64(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	16(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	32(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	32(%eax),%xmm7
	pmuludq	48(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	112(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	128(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	16(%edx),%xmm6
	paddq	%xmm5,%xmm1
	movdqa	48(%eax),%xmm5
	pmuludq	32(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	96(%edx),%xmm5
	paddq	%xmm7,%xmm4
	movdqa	%xmm6,%xmm7
	pmuludq	112(%edx),%xmm6
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	128(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	64(%eax),%xmm6
	pmuludq	16(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	%xmm6,%xmm7
	pmuludq	128(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	80(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	96(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	64(%ebx),%xmm7
	pmuludq	112(%edx),%xmm6
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* carry propagation between limbs */
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
	/* preload and split the following two input blocks */
	movdqu	32(%esi),%xmm5
	movdqu	48(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,112(%esp)
	movdqa	%xmm3,128(%esp)
	movdqa	%xmm4,144(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	movdqa	%xmm0,80(%esp)
	movdqa	%xmm1,96(%esp)
	ja	.L015loop
.L014skip_loop:
	/*
	 * Tail.  The flags of "addl $32,%ecx" are tested twice: by the
	 * jnz right below and again by "jz .L017short_tail" after the
	 * first multiplication round, which contains only SSE
	 * instructions and therefore leaves the flags untouched.
	 */
	pshufd	$16,-144(%edx),%xmm7
	addl	$32,%ecx
	jnz	.L016long_tail
	paddd	%xmm0,%xmm5
	paddd	%xmm1,%xmm6
	paddd	112(%esp),%xmm2
	paddd	128(%esp),%xmm3
	paddd	144(%esp),%xmm4
.L016long_tail:
	movdqa	%xmm5,(%eax)
	movdqa	%xmm6,16(%eax)
	movdqa	%xmm2,32(%eax)
	movdqa	%xmm3,48(%eax)
	movdqa	%xmm4,64(%eax)
	pmuludq	%xmm7,%xmm5
	pmuludq	%xmm7,%xmm6
	pmuludq	%xmm7,%xmm2
	movdqa	%xmm5,%xmm0
	pshufd	$16,-128(%edx),%xmm5
	pmuludq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm1
	pmuludq	%xmm7,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	48(%eax),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%eax),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%eax),%xmm7
	paddq	%xmm6,%xmm3
	pshufd	$16,-64(%edx),%xmm6
	pmuludq	(%eax),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%eax),%xmm6
	pshufd	$16,-112(%edx),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%eax),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%eax),%xmm5
	paddq	%xmm7,%xmm4
	pshufd	$16,-48(%edx),%xmm7
	pmuludq	(%eax),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%eax),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%eax),%xmm5
	pshufd	$16,-96(%edx),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%eax),%xmm6
	paddq	%xmm5,%xmm0
	pshufd	$16,-32(%edx),%xmm5
	pmuludq	(%eax),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%eax),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%eax),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%eax),%xmm7
	pshufd	$16,-80(%edx),%xmm5
	paddq	%xmm6,%xmm1
	pshufd	$16,-16(%edx),%xmm6
	pmuludq	(%eax),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%eax),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%eax),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%eax),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%eax),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	jz	.L017short_tail
	/* long tail: one more pair of input blocks */
	movdqu	-32(%esi),%xmm5
	movdqu	-16(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	pshufd	$16,(%edx),%xmm7
	paddd	80(%esp),%xmm5
	paddd	96(%esp),%xmm6
	paddd	112(%esp),%xmm2
	paddd	128(%esp),%xmm3
	paddd	144(%esp),%xmm4
	movdqa	%xmm5,(%esp)
	pmuludq	%xmm7,%xmm5
	movdqa	%xmm6,16(%esp)
	pmuludq	%xmm7,%xmm6
	paddq	%xmm5,%xmm0
	movdqa	%xmm2,%xmm5
	pmuludq	%xmm7,%xmm2
	paddq	%xmm6,%xmm1
	movdqa	%xmm3,%xmm6
	pmuludq	%xmm7,%xmm3
	paddq	32(%esp),%xmm2
	movdqa	%xmm5,32(%esp)
	pshufd	$16,16(%edx),%xmm5
	paddq	48(%esp),%xmm3
	movdqa	%xmm6,48(%esp)
	movdqa	%xmm4,%xmm6
	pmuludq	%xmm7,%xmm4
	paddq	64(%esp),%xmm4
	movdqa	%xmm6,64(%esp)
	movdqa	%xmm5,%xmm6
	pmuludq	48(%esp),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	pshufd	$16,80(%edx),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%esp),%xmm6
	pshufd	$16,32(%edx),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%esp),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%esp),%xmm5
	paddq	%xmm7,%xmm4
	pshufd	$16,96(%edx),%xmm7
	pmuludq	(%esp),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%esp),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%esp),%xmm5
	pshufd	$16,48(%edx),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%esp),%xmm6
	paddq	%xmm5,%xmm0
	pshufd	$16,112(%edx),%xmm5
	pmuludq	(%esp),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%esp),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%esp),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%esp),%xmm7
	pshufd	$16,64(%edx),%xmm5
	paddq	%xmm6,%xmm1
	pshufd	$16,128(%edx),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%esp),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%esp),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
.L017short_tail:
	/* add the two 64-bit lanes of every limb, then propagate carries */
	pshufd	$78,%xmm4,%xmm6
	pshufd	$78,%xmm3,%xmm5
	paddq	%xmm6,%xmm4
	paddq	%xmm5,%xmm3
	pshufd	$78,%xmm0,%xmm6
	pshufd	$78,%xmm1,%xmm5
	paddq	%xmm6,%xmm0
	paddq	%xmm5,%xmm1
	pshufd	$78,%xmm2,%xmm6
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm6,%xmm2
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
.L013done:
	/* %edi is context+48 here: store the five limbs at context 0..16 */
	movd	%xmm0,-48(%edi)
	movd	%xmm1,-44(%edi)
	movd	%xmm2,-40(%edi)
	movd	%xmm3,-36(%edi)
	movd	%xmm4,-32(%edi)
	movl	%ebp,%esp
.L007nodata:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	_poly1305_blocks_sse2,.-_poly1305_blocks_sse2
/*
 * _poly1305_emit_sse2
 *
 * Same cdecl arguments as poly1305_emit (context, 16-byte output,
 * pointer to 16 bytes added to the result).
 *
 * If the flag word at offset 20 of the context is zero the accumulator
 * is still in 32-bit words and control passes to .Lenter_emit.
 * Otherwise the five limbs at offsets 0..16 are recombined into 32-bit
 * words with shifts of 26/20/14/8 bits, the bits above bit 1 of the top
 * word are folded back times five, and the same +5 / mask selection as
 * in poly1305_emit is applied.  %xmm0-%xmm3 hold copies of the unmodified
 * words while the general registers compute the +5 variant.
 */
.align	32
.type	_poly1305_emit_sse2,@function
.align	16
_poly1305_emit_sse2:
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%ebp
	cmpl	$0,20(%ebp)
	je	.Lenter_emit
	movl	(%ebp),%eax
	movl	4(%ebp),%edi
	movl	8(%ebp),%ecx
	movl	12(%ebp),%edx
	movl	16(%ebp),%esi
	/* recombine the limbs into 32-bit words */
	movl	%edi,%ebx
	shll	$26,%edi
	shrl	$6,%ebx
	addl	%edi,%eax
	movl	%ecx,%edi
	adcl	$0,%ebx
	shll	$20,%edi
	shrl	$12,%ecx
	addl	%edi,%ebx
	movl	%edx,%edi
	adcl	$0,%ecx
	shll	$14,%edi
	shrl	$18,%edx
	addl	%edi,%ecx
	movl	%esi,%edi
	adcl	$0,%edx
	shll	$8,%edi
	shrl	$24,%esi
	addl	%edi,%edx
	adcl	$0,%esi
	/* keep the low 2 bits of the top word, fold (top>>2)*5 into word 0 */
	movl	%esi,%edi
	andl	$3,%esi
	shrl	$2,%edi
	leal	(%edi,%edi,4),%ebp
	movl	24(%esp),%edi
	addl	%ebp,%eax
	movl	28(%esp),%ebp
	adcl	$0,%ebx
	adcl	$0,%ecx
	adcl	$0,%edx
	adcl	$0,%esi
	/* movd does not touch the flags, so the carry chain stays intact */
	movd	%eax,%xmm0
	addl	$5,%eax
	movd	%ebx,%xmm1
	adcl	$0,%ebx
	movd	%ecx,%xmm2
	adcl	$0,%ecx
	movd	%edx,%xmm3
	adcl	$0,%edx
	adcl	$0,%esi
	shrl	$2,%esi
	negl	%esi
	andl	%esi,%eax
	andl	%esi,%ebx
	andl	%esi,%ecx
	andl	%esi,%edx
	movl	%eax,(%edi)
	movd	%xmm0,%eax
	movl	%ebx,4(%edi)
	movd	%xmm1,%ebx
	movl	%ecx,8(%edi)
	movd	%xmm2,%ecx
	movl	%edx,12(%edi)
	movd	%xmm3,%edx
	notl	%esi
	andl	%esi,%eax
	andl	%esi,%ebx
	orl	(%edi),%eax
	andl	%esi,%ecx
	orl	4(%edi),%ebx
	andl	%esi,%edx
	orl	8(%edi),%ecx
	orl	12(%edi),%edx
	/* add the 16 bytes at arg3; movl leaves the carry flag alone */
	addl	(%ebp),%eax
	adcl	4(%ebp),%ebx
	movl	%eax,(%edi)
	adcl	8(%ebp),%ecx
	movl	%ebx,4(%edi)
	adcl	12(%ebp),%edx
	movl	%ecx,8(%edi)
	movl	%edx,12(%edi)
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	_poly1305_emit_sse2,.-_poly1305_emit_sse2
/*
 * _poly1305_init_avx2 (file-local helper, register arguments)
 *
 * VEX-encoded counterpart of _poly1305_init_sse2 with the same register
 * contract: %edi = context pointer, %ebx = pointer to the constant
 * table whose entry at offset 64 is the limb mask; %ebp keeps the
 * incoming %esp, %ecx counts two passes, %edx and %xmm0-%xmm7 are
 * scratch.  Writes 144 bytes at offsets 48..191 of the context and
 * restores %edi before returning.
 */
.align	32
.type	_poly1305_init_avx2,@function
.align	16
_poly1305_init_avx2:
	vmovdqu	24(%edi),%xmm4
	leal	48(%edi),%edi
	movl	%esp,%ebp
	subl	$224,%esp
	andl	$-16,%esp
	vmovdqa	64(%ebx),%xmm7
	vpand	%xmm7,%xmm4,%xmm0
	vpsrlq	$26,%xmm4,%xmm1
	vpsrldq	$6,%xmm4,%xmm3
	vpand	%xmm7,%xmm1,%xmm1
	vpsrlq	$4,%xmm3,%xmm2
	vpsrlq	$30,%xmm3,%xmm3
	vpand	%xmm7,%xmm2,%xmm2
	vpand	%xmm7,%xmm3,%xmm3
	vpsrldq	$13,%xmm4,%xmm4
	leal	144(%esp),%edx
	movl	$2,%ecx
.L018square:
	/* save the five limbs and their times-five companions */
	vmovdqa	%xmm0,(%esp)
	vmovdqa	%xmm1,16(%esp)
	vmovdqa	%xmm2,32(%esp)
	vmovdqa	%xmm3,48(%esp)
	vmovdqa	%xmm4,64(%esp)
	vpslld	$2,%xmm1,%xmm6
	vpslld	$2,%xmm2,%xmm5
	vpaddd	%xmm1,%xmm6,%xmm6
	vpaddd	%xmm2,%xmm5,%xmm5
	vmovdqa	%xmm6,80(%esp)
	vmovdqa	%xmm5,96(%esp)
	vpslld	$2,%xmm3,%xmm6
	vpslld	$2,%xmm4,%xmm5
	vpaddd	%xmm3,%xmm6,%xmm6
	vpaddd	%xmm4,%xmm5,%xmm5
	vmovdqa	%xmm6,112(%esp)
	vmovdqa	%xmm5,128(%esp)
	vpshufd	$68,%xmm0,%xmm5
	vmovdqa	%xmm1,%xmm6
	vpshufd	$68,%xmm1,%xmm1
	vpshufd	$68,%xmm2,%xmm2
	vpshufd	$68,%xmm3,%xmm3
	vpshufd	$68,%xmm4,%xmm4
	vmovdqa	%xmm5,(%edx)
	vmovdqa	%xmm1,16(%edx)
	vmovdqa	%xmm2,32(%edx)
	vmovdqa	%xmm3,48(%edx)
	vmovdqa	%xmm4,64(%edx)
	/* 5x5 limb multiplication, products accumulated in %xmm0-%xmm4 */
	vpmuludq	%xmm0,%xmm4,%xmm4
	vpmuludq	%xmm0,%xmm3,%xmm3
	vpmuludq	%xmm0,%xmm2,%xmm2
	vpmuludq	%xmm0,%xmm1,%xmm1
	vpmuludq	%xmm0,%xmm5,%xmm0
	vpmuludq	48(%edx),%xmm6,%xmm5
	vpaddq	%xmm5,%xmm4,%xmm4
	vpmuludq	32(%edx),%xmm6,%xmm7
	vpaddq	%xmm7,%xmm3,%xmm3
	vpmuludq	16(%edx),%xmm6,%xmm5
	vpaddq	%xmm5,%xmm2,%xmm2
	vmovdqa	80(%esp),%xmm7
	vpmuludq	(%edx),%xmm6,%xmm6
	vpaddq	%xmm6,%xmm1,%xmm1
	vmovdqa	32(%esp),%xmm5
	vpmuludq	64(%edx),%xmm7,%xmm7
	vpaddq	%xmm7,%xmm0,%xmm0
	vpmuludq	32(%edx),%xmm5,%xmm6
	vpaddq	%xmm6,%xmm4,%xmm4
	vpmuludq	16(%edx),%xmm5,%xmm7
	vpaddq	%xmm7,%xmm3,%xmm3
	vmovdqa	96(%esp),%xmm6
	vpmuludq	(%edx),%xmm5,%xmm5
	vpaddq	%xmm5,%xmm2,%xmm2
	vpmuludq	64(%edx),%xmm6,%xmm7
	vpaddq	%xmm7,%xmm1,%xmm1
	vmovdqa	48(%esp),%xmm5
	vpmuludq	48(%edx),%xmm6,%xmm6
	vpaddq	%xmm6,%xmm0,%xmm0
	vpmuludq	16(%edx),%xmm5,%xmm7
	vpaddq	%xmm7,%xmm4,%xmm4
	vmovdqa	112(%esp),%xmm6
	vpmuludq	(%edx),%xmm5,%xmm5
	vpaddq	%xmm5,%xmm3,%xmm3
	vpmuludq	64(%edx),%xmm6,%xmm7
	vpaddq	%xmm7,%xmm2,%xmm2
	vpmuludq	48(%edx),%xmm6,%xmm5
	vpaddq	%xmm5,%xmm1,%xmm1
	vmovdqa	64(%esp),%xmm7
	vpmuludq	32(%edx),%xmm6,%xmm6
	vpaddq	%xmm6,%xmm0,%xmm0
	vmovdqa	128(%esp),%xmm5
	vpmuludq	(%edx),%xmm7,%xmm7
	vpaddq	%xmm7,%xmm4,%xmm4
	vpmuludq	64(%edx),%xmm5,%xmm6
	vpaddq	%xmm6,%xmm3,%xmm3
	vpmuludq	16(%edx),%xmm5,%xmm7
	vpaddq	%xmm7,%xmm0,%xmm0
	vpmuludq	32(%edx),%xmm5,%xmm6
	vpaddq	%xmm6,%xmm1,%xmm1
	vmovdqa	64(%ebx),%xmm7
	vpmuludq	48(%edx),%xmm5,%xmm5
	vpaddq	%xmm5,%xmm2,%xmm2
	/* carry propagation between limbs using the mask in %xmm7 */
	vpsrlq	$26,%xmm3,%xmm5
	vpand	%xmm7,%xmm3,%xmm3
	vpsrlq	$26,%xmm0,%xmm6
	vpand	%xmm7,%xmm0,%xmm0
	vpaddq	%xmm5,%xmm4,%xmm4
	vpaddq	%xmm6,%xmm1,%xmm1
	vpsrlq	$26,%xmm4,%xmm5
	vpand	%xmm7,%xmm4,%xmm4
	vpsrlq	$26,%xmm1,%xmm6
	vpand	%xmm7,%xmm1,%xmm1
	vpaddq	%xmm6,%xmm2,%xmm2
	vpaddd	%xmm5,%xmm0,%xmm0
	vpsllq	$2,%xmm5,%xmm5
	vpsrlq	$26,%xmm2,%xmm6
	vpand	%xmm7,%xmm2,%xmm2
	vpaddd	%xmm5,%xmm0,%xmm0
	vpaddd	%xmm6,%xmm3,%xmm3
	vpsrlq	$26,%xmm3,%xmm6
	vpsrlq	$26,%xmm0,%xmm5
	vpand	%xmm7,%xmm0,%xmm0
	vpand	%xmm7,%xmm3,%xmm3
	vpaddd	%xmm5,%xmm1,%xmm1
	vpaddd	%xmm6,%xmm4,%xmm4
	decl	%ecx
	jz	.L019square_break
	/* first pass done: pair the new limbs with the saved ones */
	vpunpcklqdq	(%esp),%xmm0,%xmm0
	vpunpcklqdq	16(%esp),%xmm1,%xmm1
	vpunpcklqdq	32(%esp),%xmm2,%xmm2
	vpunpcklqdq	48(%esp),%xmm3,%xmm3
	vpunpcklqdq	64(%esp),%xmm4,%xmm4
	jmp	.L018square
.L019square_break:
	/* merge with the saved limbs, reorder lanes, store the table */
	vpsllq	$32,%xmm0,%xmm0
	vpsllq	$32,%xmm1,%xmm1
	vpsllq	$32,%xmm2,%xmm2
	vpsllq	$32,%xmm3,%xmm3
	vpsllq	$32,%xmm4,%xmm4
	vpor	(%esp),%xmm0,%xmm0
	vpor	16(%esp),%xmm1,%xmm1
	vpor	32(%esp),%xmm2,%xmm2
	vpor	48(%esp),%xmm3,%xmm3
	vpor	64(%esp),%xmm4,%xmm4
	vpshufd	$141,%xmm0,%xmm0
	vpshufd	$141,%xmm1,%xmm1
	vpshufd	$141,%xmm2,%xmm2
	vpshufd	$141,%xmm3,%xmm3
	vpshufd	$141,%xmm4,%xmm4
	vmovdqu	%xmm0,(%edi)
	vmovdqu	%xmm1,16(%edi)
	vmovdqu	%xmm2,32(%edi)
	vmovdqu	%xmm3,48(%edi)
	vmovdqu	%xmm4,64(%edi)
	vpslld	$2,%xmm1,%xmm6
	vpslld	$2,%xmm2,%xmm5
	vpaddd	%xmm1,%xmm6,%xmm6
	vpaddd	%xmm2,%xmm5,%xmm5
	vmovdqu	%xmm6,80(%edi)
	vmovdqu	%xmm5,96(%edi)
	vpslld	$2,%xmm3,%xmm6
	vpslld	$2,%xmm4,%xmm5
	vpaddd	%xmm3,%xmm6,%xmm6
	vpaddd	%xmm4,%xmm5,%xmm5
	vmovdqu	%xmm6,112(%edi)
	vmovdqu	%xmm5,128(%edi)
	movl	%ebp,%esp
	leal	-48(%edi),%edi
	ret
.size	_poly1305_init_avx2,.-_poly1305_init_avx2
.align	32
.type	_poly1305_blocks_avx2,@function
.align	16
/*
 * _poly1305_blocks_avx2(ctx, inp, len, arg4) -- arguments on the stack.
 * After the four pushes: 20(%esp)=ctx -> %edi, 24(%esp)=inp -> %esi,
 * 28(%esp)=len -> %ecx, 32(%esp)=fourth argument (read at .L023base2_26;
 * presumably the pad bit -- confirm against the C caller).
 * 20(%edi) is a state flag: zero means the accumulator at 0..16(%edi) is
 * still in the layout used by poly1305_blocks, non-zero means it is already
 * stored as five 26-bit limbs. len is rounded down to a multiple of 16.
 * With the flag clear and fewer than 64 bytes the work is handed to
 * .Lenter_blocks inside poly1305_blocks, which expects exactly this
 * register and stack state.
 */
1524_poly1305_blocks_avx2: 1525 pushl %ebp 1526 pushl %ebx 1527 pushl %esi 1528 pushl %edi 1529 movl 20(%esp),%edi 1530 movl 24(%esp),%esi 1531 movl 28(%esp),%ecx 1532 movl 20(%edi),%eax 1533 andl $-16,%ecx 1534 jz .L020nodata 1535 cmpl $64,%ecx 1536 jae .L021enter_avx2 1537 testl %eax,%eax 1538 jz .Lenter_blocks
/* Vector path. The call/pop pair obtains the current address so that
 * %ebx can be pointed at .Lconst_sse2 position-independently. */
1539.L021enter_avx2: 1540 vzeroupper 1541 call .L022pic_point 1542.L022pic_point: 1543 popl %ebx 1544 leal .Lconst_sse2-.L022pic_point(%ebx),%ebx 1545 testl %eax,%eax 1546 jnz .L023base2_26 1547 call _poly1305_init_avx2
/* First use of the vector path: build the key table (call above), then
 * convert the accumulator in place to 26-bit limbs using overlapping loads
 * at byte offsets 0/3/6/9/13 shifted by 0/2/4/6/0 bits, and set the flag
 * at 20(%edi). %esi and %ecx are reloaded because they were used as
 * scratch (the helper also clobbers %ecx). */
1548 movl (%edi),%eax 1549 movl 3(%edi),%ecx 1550 movl 6(%edi),%edx 1551 movl 9(%edi),%esi 1552 movl 13(%edi),%ebp 1553 shrl $2,%ecx 1554 andl $67108863,%eax 1555 shrl $4,%edx 1556 andl $67108863,%ecx 1557 shrl $6,%esi 1558 andl $67108863,%edx 1559 movl %eax,(%edi) 1560 movl %ecx,4(%edi) 1561 movl %edx,8(%edi) 1562 movl %esi,12(%edi) 1563 movl %ebp,16(%edi) 1564 movl $1,20(%edi) 1565 movl 24(%esp),%esi 1566 movl 28(%esp),%ecx
/* Frame setup: %eax = fourth argument, %ebp = caller's %esp, %esp moved
 * down by 448 and aligned down to a 512-byte boundary, %edx = %esp+288.
 * The nine table vectors at 48..191 of the context are spread across the
 * ymm lanes (vpermq $64 / vpshufd $200) and stored at -128..128(%edx).
 * %edi is advanced by 48, so the accumulator limbs are then read from
 * -48..-32(%edi) into xmm0..xmm4; ymm7 = limb mask. */
1567.L023base2_26: 1568 movl 32(%esp),%eax 1569 movl %esp,%ebp 1570 subl $448,%esp 1571 andl $-512,%esp 1572 vmovdqu 48(%edi),%xmm0 1573 leal 288(%esp),%edx 1574 vmovdqu 64(%edi),%xmm1 1575 vmovdqu 80(%edi),%xmm2 1576 vmovdqu 96(%edi),%xmm3 1577 vmovdqu 112(%edi),%xmm4 1578 leal 48(%edi),%edi 1579 vpermq $64,%ymm0,%ymm0 1580 vpermq $64,%ymm1,%ymm1 1581 vpermq $64,%ymm2,%ymm2 1582 vpermq $64,%ymm3,%ymm3 1583 vpermq $64,%ymm4,%ymm4 1584 vpshufd $200,%ymm0,%ymm0 1585 vpshufd $200,%ymm1,%ymm1 1586 vpshufd $200,%ymm2,%ymm2 1587 vpshufd $200,%ymm3,%ymm3 1588 vpshufd $200,%ymm4,%ymm4 1589 vmovdqa %ymm0,-128(%edx) 1590 vmovdqu 80(%edi),%xmm0 1591 vmovdqa %ymm1,-96(%edx) 1592 vmovdqu 96(%edi),%xmm1 1593 vmovdqa %ymm2,-64(%edx) 1594 vmovdqu 112(%edi),%xmm2 1595 vmovdqa %ymm3,-32(%edx) 1596 vmovdqu 128(%edi),%xmm3 1597 vmovdqa %ymm4,(%edx) 1598 vpermq $64,%ymm0,%ymm0 1599 vpermq $64,%ymm1,%ymm1 1600 vpermq $64,%ymm2,%ymm2 1601 vpermq $64,%ymm3,%ymm3 1602 vpshufd $200,%ymm0,%ymm0 1603 vpshufd $200,%ymm1,%ymm1 1604 vpshufd 
$200,%ymm2,%ymm2 1605 vpshufd $200,%ymm3,%ymm3 1606 vmovdqa %ymm0,32(%edx) 1607 vmovd -48(%edi),%xmm0 1608 vmovdqa %ymm1,64(%edx) 1609 vmovd -44(%edi),%xmm1 1610 vmovdqa %ymm2,96(%edx) 1611 vmovd -40(%edi),%xmm2 1612 vmovdqa %ymm3,128(%edx) 1613 vmovd -36(%edi),%xmm3 1614 vmovd -32(%edi),%xmm4 1615 vmovdqa 64(%ebx),%ymm7 1616 negl %eax
/* If len is not a multiple of 64, the leading 48, 32 or 16 bytes are
 * loaded first and sent straight to .L027tail. In each case %edx is offset
 * by 8/16/24 bytes into the table copy and %ebx by 8/16/(32 - 8*arg4)
 * bytes into .Lconst_sse2, which changes which table elements and which
 * 2^24 pad constants each lane sees. %ecx keeps the remaining multiple
 * of 64. */
1617 testl $63,%ecx 1618 jz .L024even 1619 movl %ecx,%edx 1620 andl $-64,%ecx 1621 andl $63,%edx 1622 vmovdqu (%esi),%xmm5 1623 cmpl $32,%edx 1624 jb .L025one 1625 vmovdqu 16(%esi),%xmm6 1626 je .L026two 1627 vinserti128 $1,32(%esi),%ymm5,%ymm5 1628 leal 48(%esi),%esi 1629 leal 8(%ebx),%ebx 1630 leal 296(%esp),%edx 1631 jmp .L027tail 1632.L026two: 1633 leal 32(%esi),%esi 1634 leal 16(%ebx),%ebx 1635 leal 304(%esp),%edx 1636 jmp .L027tail 1637.L025one: 1638 leal 16(%esi),%esi 1639 vpxor %ymm6,%ymm6,%ymm6 1640 leal 32(%ebx,%eax,8),%ebx 1641 leal 312(%esp),%edx 1642 jmp .L027tail
/* Load the next 64 input bytes into ymm5/ymm6; if they are the last ones,
 * go directly to the tail. */
1643.align 32 1644.L024even: 1645 vmovdqu (%esi),%xmm5 1646 vmovdqu 16(%esi),%xmm6 1647 vinserti128 $1,32(%esi),%ymm5,%ymm5 1648 vinserti128 $1,48(%esi),%ymm6,%ymm6 1649 leal 64(%esi),%esi 1650 subl $64,%ecx 1651 jz .L027tail
/* Main loop, 64 bytes per iteration: split the input into 26-bit limbs,
 * OR the constant at (%ebx) into the top limb, add the accumulator limbs,
 * multiply by the table rows at -128..128(%edx), propagate carries, then
 * load the next 64 bytes. */
1652.L028loop: 1653 vmovdqa %ymm2,64(%esp) 1654 vpsrldq $6,%ymm5,%ymm2 1655 vmovdqa %ymm0,(%esp) 1656 vpsrldq $6,%ymm6,%ymm0 1657 vmovdqa %ymm1,32(%esp) 1658 vpunpckhqdq %ymm6,%ymm5,%ymm1 1659 vpunpcklqdq %ymm6,%ymm5,%ymm5 1660 vpunpcklqdq %ymm0,%ymm2,%ymm2 1661 vpsrlq $30,%ymm2,%ymm0 1662 vpsrlq $4,%ymm2,%ymm2 1663 vpsrlq $26,%ymm5,%ymm6 1664 vpsrlq $40,%ymm1,%ymm1 1665 vpand %ymm7,%ymm2,%ymm2 1666 vpand %ymm7,%ymm5,%ymm5 1667 vpand %ymm7,%ymm6,%ymm6 1668 vpand %ymm7,%ymm0,%ymm0 1669 vpor (%ebx),%ymm1,%ymm1 1670 vpaddq 64(%esp),%ymm2,%ymm2 1671 vpaddq (%esp),%ymm5,%ymm5 1672 vpaddq 32(%esp),%ymm6,%ymm6 1673 vpaddq %ymm3,%ymm0,%ymm0 1674 vpaddq %ymm4,%ymm1,%ymm1 1675 vpmuludq -96(%edx),%ymm2,%ymm3 1676 vmovdqa %ymm6,32(%esp) 1677 vpmuludq -64(%edx),%ymm2,%ymm4 1678 vmovdqa %ymm0,96(%esp) 1679 vpmuludq 96(%edx),%ymm2,%ymm0 1680 vmovdqa 
%ymm1,128(%esp) 1681 vpmuludq 128(%edx),%ymm2,%ymm1 1682 vpmuludq -128(%edx),%ymm2,%ymm2 1683 vpmuludq -32(%edx),%ymm5,%ymm7 1684 vpaddq %ymm7,%ymm3,%ymm3 1685 vpmuludq (%edx),%ymm5,%ymm6 1686 vpaddq %ymm6,%ymm4,%ymm4 1687 vpmuludq -128(%edx),%ymm5,%ymm7 1688 vpaddq %ymm7,%ymm0,%ymm0 1689 vmovdqa 32(%esp),%ymm7 1690 vpmuludq -96(%edx),%ymm5,%ymm6 1691 vpaddq %ymm6,%ymm1,%ymm1 1692 vpmuludq -64(%edx),%ymm5,%ymm5 1693 vpaddq %ymm5,%ymm2,%ymm2 1694 vpmuludq -64(%edx),%ymm7,%ymm6 1695 vpaddq %ymm6,%ymm3,%ymm3 1696 vpmuludq -32(%edx),%ymm7,%ymm5 1697 vpaddq %ymm5,%ymm4,%ymm4 1698 vpmuludq 128(%edx),%ymm7,%ymm6 1699 vpaddq %ymm6,%ymm0,%ymm0 1700 vmovdqa 96(%esp),%ymm6 1701 vpmuludq -128(%edx),%ymm7,%ymm5 1702 vpaddq %ymm5,%ymm1,%ymm1 1703 vpmuludq -96(%edx),%ymm7,%ymm7 1704 vpaddq %ymm7,%ymm2,%ymm2 1705 vpmuludq -128(%edx),%ymm6,%ymm5 1706 vpaddq %ymm5,%ymm3,%ymm3 1707 vpmuludq -96(%edx),%ymm6,%ymm7 1708 vpaddq %ymm7,%ymm4,%ymm4 1709 vpmuludq 64(%edx),%ymm6,%ymm5 1710 vpaddq %ymm5,%ymm0,%ymm0 1711 vmovdqa 128(%esp),%ymm5 1712 vpmuludq 96(%edx),%ymm6,%ymm7 1713 vpaddq %ymm7,%ymm1,%ymm1 1714 vpmuludq 128(%edx),%ymm6,%ymm6 1715 vpaddq %ymm6,%ymm2,%ymm2 1716 vpmuludq 128(%edx),%ymm5,%ymm7 1717 vpaddq %ymm7,%ymm3,%ymm3 1718 vpmuludq 32(%edx),%ymm5,%ymm6 1719 vpaddq %ymm6,%ymm0,%ymm0 1720 vpmuludq -128(%edx),%ymm5,%ymm7 1721 vpaddq %ymm7,%ymm4,%ymm4 1722 vmovdqa 64(%ebx),%ymm7 1723 vpmuludq 64(%edx),%ymm5,%ymm6 1724 vpaddq %ymm6,%ymm1,%ymm1 1725 vpmuludq 96(%edx),%ymm5,%ymm5 1726 vpaddq %ymm5,%ymm2,%ymm2 1727 vpsrlq $26,%ymm3,%ymm5 1728 vpand %ymm7,%ymm3,%ymm3 1729 vpsrlq $26,%ymm0,%ymm6 1730 vpand %ymm7,%ymm0,%ymm0 1731 vpaddq %ymm5,%ymm4,%ymm4 1732 vpaddq %ymm6,%ymm1,%ymm1 1733 vpsrlq $26,%ymm4,%ymm5 1734 vpand %ymm7,%ymm4,%ymm4 1735 vpsrlq $26,%ymm1,%ymm6 1736 vpand %ymm7,%ymm1,%ymm1 1737 vpaddq %ymm6,%ymm2,%ymm2 1738 vpaddq %ymm5,%ymm0,%ymm0 1739 vpsllq $2,%ymm5,%ymm5 1740 vpsrlq $26,%ymm2,%ymm6 1741 vpand %ymm7,%ymm2,%ymm2 1742 vpaddq %ymm5,%ymm0,%ymm0 1743 vpaddq 
%ymm6,%ymm3,%ymm3 1744 vpsrlq $26,%ymm3,%ymm6 1745 vpsrlq $26,%ymm0,%ymm5 1746 vpand %ymm7,%ymm0,%ymm0 1747 vpand %ymm7,%ymm3,%ymm3 1748 vpaddq %ymm5,%ymm1,%ymm1 1749 vpaddq %ymm6,%ymm4,%ymm4 1750 vmovdqu (%esi),%xmm5 1751 vmovdqu 16(%esi),%xmm6 1752 vinserti128 $1,32(%esi),%ymm5,%ymm5 1753 vinserti128 $1,48(%esi),%ymm6,%ymm6 1754 leal 64(%esi),%esi 1755 subl $64,%ecx 1756 jnz .L028loop
/* Tail: same limb split and multiply as the loop, but every table operand
 * is read 4 bytes further on (-124/-92/-60/-28/4/36/68/100/132 instead of
 * -128..128), i.e. from the neighbouring dword of each table row. %ebx is
 * rounded back down to the 64-byte aligned start of .Lconst_sse2 after the
 * pad constant has been used. */
1757.L027tail: 1758 vmovdqa %ymm2,64(%esp) 1759 vpsrldq $6,%ymm5,%ymm2 1760 vmovdqa %ymm0,(%esp) 1761 vpsrldq $6,%ymm6,%ymm0 1762 vmovdqa %ymm1,32(%esp) 1763 vpunpckhqdq %ymm6,%ymm5,%ymm1 1764 vpunpcklqdq %ymm6,%ymm5,%ymm5 1765 vpunpcklqdq %ymm0,%ymm2,%ymm2 1766 vpsrlq $30,%ymm2,%ymm0 1767 vpsrlq $4,%ymm2,%ymm2 1768 vpsrlq $26,%ymm5,%ymm6 1769 vpsrlq $40,%ymm1,%ymm1 1770 vpand %ymm7,%ymm2,%ymm2 1771 vpand %ymm7,%ymm5,%ymm5 1772 vpand %ymm7,%ymm6,%ymm6 1773 vpand %ymm7,%ymm0,%ymm0 1774 vpor (%ebx),%ymm1,%ymm1 1775 andl $-64,%ebx 1776 vpaddq 64(%esp),%ymm2,%ymm2 1777 vpaddq (%esp),%ymm5,%ymm5 1778 vpaddq 32(%esp),%ymm6,%ymm6 1779 vpaddq %ymm3,%ymm0,%ymm0 1780 vpaddq %ymm4,%ymm1,%ymm1 1781 vpmuludq -92(%edx),%ymm2,%ymm3 1782 vmovdqa %ymm6,32(%esp) 1783 vpmuludq -60(%edx),%ymm2,%ymm4 1784 vmovdqa %ymm0,96(%esp) 1785 vpmuludq 100(%edx),%ymm2,%ymm0 1786 vmovdqa %ymm1,128(%esp) 1787 vpmuludq 132(%edx),%ymm2,%ymm1 1788 vpmuludq -124(%edx),%ymm2,%ymm2 1789 vpmuludq -28(%edx),%ymm5,%ymm7 1790 vpaddq %ymm7,%ymm3,%ymm3 1791 vpmuludq 4(%edx),%ymm5,%ymm6 1792 vpaddq %ymm6,%ymm4,%ymm4 1793 vpmuludq -124(%edx),%ymm5,%ymm7 1794 vpaddq %ymm7,%ymm0,%ymm0 1795 vmovdqa 32(%esp),%ymm7 1796 vpmuludq -92(%edx),%ymm5,%ymm6 1797 vpaddq %ymm6,%ymm1,%ymm1 1798 vpmuludq -60(%edx),%ymm5,%ymm5 1799 vpaddq %ymm5,%ymm2,%ymm2 1800 vpmuludq -60(%edx),%ymm7,%ymm6 1801 vpaddq %ymm6,%ymm3,%ymm3 1802 vpmuludq -28(%edx),%ymm7,%ymm5 1803 vpaddq %ymm5,%ymm4,%ymm4 1804 vpmuludq 132(%edx),%ymm7,%ymm6 1805 vpaddq %ymm6,%ymm0,%ymm0 1806 vmovdqa 96(%esp),%ymm6 1807 vpmuludq -124(%edx),%ymm7,%ymm5 1808 vpaddq %ymm5,%ymm1,%ymm1 
 1809 vpmuludq -92(%edx),%ymm7,%ymm7 1810 vpaddq %ymm7,%ymm2,%ymm2 1811 vpmuludq -124(%edx),%ymm6,%ymm5 1812 vpaddq %ymm5,%ymm3,%ymm3 1813 vpmuludq -92(%edx),%ymm6,%ymm7 1814 vpaddq %ymm7,%ymm4,%ymm4 1815 vpmuludq 68(%edx),%ymm6,%ymm5 1816 vpaddq %ymm5,%ymm0,%ymm0 1817 vmovdqa 128(%esp),%ymm5 1818 vpmuludq 100(%edx),%ymm6,%ymm7 1819 vpaddq %ymm7,%ymm1,%ymm1 1820 vpmuludq 132(%edx),%ymm6,%ymm6 1821 vpaddq %ymm6,%ymm2,%ymm2 1822 vpmuludq 132(%edx),%ymm5,%ymm7 1823 vpaddq %ymm7,%ymm3,%ymm3 1824 vpmuludq 36(%edx),%ymm5,%ymm6 1825 vpaddq %ymm6,%ymm0,%ymm0 1826 vpmuludq -124(%edx),%ymm5,%ymm7 1827 vpaddq %ymm7,%ymm4,%ymm4 1828 vmovdqa 64(%ebx),%ymm7 1829 vpmuludq 68(%edx),%ymm5,%ymm6 1830 vpaddq %ymm6,%ymm1,%ymm1 1831 vpmuludq 100(%edx),%ymm5,%ymm5 1832 vpaddq %ymm5,%ymm2,%ymm2
/* Horizontal sum: add the upper qword of each 128-bit half onto the lower
 * one (vpsrldq $8), then add the upper half onto the lower half
 * (vpermq $2), leaving the total in the low qword of ymm0..ymm4. A carry
 * pass with the same shape as in the loop follows. */
1833 vpsrldq $8,%ymm4,%ymm5 1834 vpsrldq $8,%ymm3,%ymm6 1835 vpaddq %ymm5,%ymm4,%ymm4 1836 vpsrldq $8,%ymm0,%ymm5 1837 vpaddq %ymm6,%ymm3,%ymm3 1838 vpsrldq $8,%ymm1,%ymm6 1839 vpaddq %ymm5,%ymm0,%ymm0 1840 vpsrldq $8,%ymm2,%ymm5 1841 vpaddq %ymm6,%ymm1,%ymm1 1842 vpermq $2,%ymm4,%ymm6 1843 vpaddq %ymm5,%ymm2,%ymm2 1844 vpermq $2,%ymm3,%ymm5 1845 vpaddq %ymm6,%ymm4,%ymm4 1846 vpermq $2,%ymm0,%ymm6 1847 vpaddq %ymm5,%ymm3,%ymm3 1848 vpermq $2,%ymm1,%ymm5 1849 vpaddq %ymm6,%ymm0,%ymm0 1850 vpermq $2,%ymm2,%ymm6 1851 vpaddq %ymm5,%ymm1,%ymm1 1852 vpaddq %ymm6,%ymm2,%ymm2 1853 vpsrlq $26,%ymm3,%ymm5 1854 vpand %ymm7,%ymm3,%ymm3 1855 vpsrlq $26,%ymm0,%ymm6 1856 vpand %ymm7,%ymm0,%ymm0 1857 vpaddq %ymm5,%ymm4,%ymm4 1858 vpaddq %ymm6,%ymm1,%ymm1 1859 vpsrlq $26,%ymm4,%ymm5 1860 vpand %ymm7,%ymm4,%ymm4 1861 vpsrlq $26,%ymm1,%ymm6 1862 vpand %ymm7,%ymm1,%ymm1 1863 vpaddq %ymm6,%ymm2,%ymm2 1864 vpaddq %ymm5,%ymm0,%ymm0 1865 vpsllq $2,%ymm5,%ymm5 1866 vpsrlq $26,%ymm2,%ymm6 1867 vpand %ymm7,%ymm2,%ymm2 1868 vpaddq %ymm5,%ymm0,%ymm0 1869 vpaddq %ymm6,%ymm3,%ymm3 1870 vpsrlq $26,%ymm3,%ymm6 1871 vpsrlq $26,%ymm0,%ymm5 1872 vpand %ymm7,%ymm0,%ymm0 1873 vpand %ymm7,%ymm3,%ymm3 1874 vpaddq %ymm5,%ymm1,%ymm1 1875 
vpaddq %ymm6,%ymm4,%ymm4
/* If the tail was reached through the leading-block path and input
 * remains (%ecx != 0), keep only the low dword of each limb, reset %edx to
 * the unshifted table base and continue with 64-byte chunks. */
1876 cmpl $0,%ecx 1877 je .L029done 1878 vpshufd $252,%xmm0,%xmm0 1879 leal 288(%esp),%edx 1880 vpshufd $252,%xmm1,%xmm1 1881 vpshufd $252,%xmm2,%xmm2 1882 vpshufd $252,%xmm3,%xmm3 1883 vpshufd $252,%xmm4,%xmm4 1884 jmp .L024even
/* Write the five accumulator limbs back to 0..16 of the context (%edi is
 * still ctx+48), clear the upper ymm halves and restore %esp. */
1885.align 16 1886.L029done: 1887 vmovd %xmm0,-48(%edi) 1888 vmovd %xmm1,-44(%edi) 1889 vmovd %xmm2,-40(%edi) 1890 vmovd %xmm3,-36(%edi) 1891 vmovd %xmm4,-32(%edi) 1892 vzeroupper 1893 movl %ebp,%esp 1894.L020nodata: 1895 popl %edi 1896 popl %esi 1897 popl %ebx 1898 popl %ebp 1899 ret 1900.size _poly1305_blocks_avx2,.-_poly1305_blocks_avx2
/*
 * .Lconst_sse2 -- 64-byte aligned constants shared by the vector paths.
 *   +0  : 16777216 (1 << 24) in the low dword of each qword; ORed into the
 *         top limb, which starts at bit 104, so it represents 2^128.
 *   +32 : zeros (the offset pointers used by the AVX2 leading-block path
 *         read across the +0/+32 boundary).
 *   +64 : 67108863 (2^26 - 1), the limb mask.
 *   +96 : 268435455, 268435452 x3 -- the same values as the key masks
 *         applied in poly1305_init.
 * The .byte run that follows is a NUL-terminated ASCII identification
 * string beginning "Poly1305 for x86, CRYPTOGAMS by".
 */
1901.align 64 1902.Lconst_sse2: 1903.long 16777216,0,16777216,0,16777216,0,16777216,0 1904.long 0,0,0,0,0,0,0,0 1905.long 67108863,0,67108863,0,67108863,0,67108863,0 1906.long 268435455,268435452,268435452,268435452 1907.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54 1908.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 1909.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 1910.byte 114,103,62,0 1911.align 4 1912.comm OPENSSL_ia32cap_P,16,4
/* ---- non-PIC build: second copy of the same routines ---- */
1913#else 1914.text 1915.align 64
/*
 * poly1305_init(ctx, key, func) -- arguments on the stack.
 * After the four pushes: 20(%esp)=ctx -> %edi, 24(%esp)=key -> %esi,
 * 28(%esp)=func -> %ebp.
 * Zeroes the 24 bytes at 0..20(ctx). With key == NULL it returns 0 and
 * does nothing else. Otherwise it stores a "blocks" function pointer at
 * (func) and an "emit" function pointer at 4(func), chosen from
 * OPENSSL_ia32cap_P: the scalar pair by default, the _sse2 pair when bits
 * 24 and 26 of word 0 are both set (mask 83886080), and
 * _poly1305_blocks_avx2 when bit 5 of word 2 is also set (presumably the
 * CPUID SSE2/AVX2 feature bits -- confirm). It then stores the four key
 * dwords, masked, at 24..36(ctx) and returns 1.
 * In this non-PIC copy OPENSSL_ia32cap_P is addressed absolutely; the
 * function addresses are still formed relative to .L001pic_point.
 */
1916.globl poly1305_init 1917.type poly1305_init,@function 1918.align 16 1919poly1305_init: 1920.L_poly1305_init_begin: 1921 pushl %ebp 1922 pushl %ebx 1923 pushl %esi 1924 pushl %edi 1925 movl 20(%esp),%edi 1926 movl 24(%esp),%esi 1927 movl 28(%esp),%ebp 1928 xorl %eax,%eax 1929 movl %eax,(%edi) 1930 movl %eax,4(%edi) 1931 movl %eax,8(%edi) 1932 movl %eax,12(%edi) 1933 movl %eax,16(%edi) 1934 movl %eax,20(%edi) 1935 cmpl $0,%esi 1936 je .L000nokey 1937 call .L001pic_point 1938.L001pic_point: 1939 popl %ebx 1940 leal poly1305_blocks-.L001pic_point(%ebx),%eax 1941 leal poly1305_emit-.L001pic_point(%ebx),%edx 1942 leal OPENSSL_ia32cap_P,%edi 1943 movl (%edi),%ecx 1944 andl $83886080,%ecx 1945 cmpl $83886080,%ecx 1946 jne .L002no_sse2 1947 leal _poly1305_blocks_sse2-.L001pic_point(%ebx),%eax 1948 leal _poly1305_emit_sse2-.L001pic_point(%ebx),%edx 1949 movl 8(%edi),%ecx 1950 testl 
$32,%ecx 1951 jz .L002no_sse2 1952 leal _poly1305_blocks_avx2-.L001pic_point(%ebx),%eax
/* Rest of poly1305_init: %eax/%edx hold the selected blocks/emit routine
 * addresses. Store them at (func) and 4(func), then copy the four key
 * dwords to 24..36(ctx) after masking (0x0fffffff for word 0, 0x0ffffffc
 * for words 1..3). Returns 1; the .L000nokey exit is reached with
 * %eax = 0 when key was NULL. */
1953.L002no_sse2: 1954 movl 20(%esp),%edi 1955 movl %eax,(%ebp) 1956 movl %edx,4(%ebp) 1957 movl (%esi),%eax 1958 movl 4(%esi),%ebx 1959 movl 8(%esi),%ecx 1960 movl 12(%esi),%edx 1961 andl $268435455,%eax 1962 andl $268435452,%ebx 1963 andl $268435452,%ecx 1964 andl $268435452,%edx 1965 movl %eax,24(%edi) 1966 movl %ebx,28(%edi) 1967 movl %ecx,32(%edi) 1968 movl %edx,36(%edi) 1969 movl $1,%eax 1970.L000nokey: 1971 popl %edi 1972 popl %esi 1973 popl %ebx 1974 popl %ebp 1975 ret 1976.size poly1305_init,.-.L_poly1305_init_begin
/*
 * poly1305_blocks(ctx, inp, len, arg4) -- scalar path, stack arguments.
 * .Lenter_blocks is also a jump target for _poly1305_blocks_sse2 and
 * _poly1305_blocks_avx2 in this file, which arrive with the same four
 * registers pushed and %edi=ctx, %esi=inp, %ecx=len.
 * Frame after subl $64:
 *   (%esp), 12(%esp), 16(%esp)  saved accumulator words 0, 3 and 4
 *   20..28(%esp)                low three words of the product
 *   36..48(%esp)                the four key words from 24..36(ctx)
 *   52..60(%esp)                key words 1..3, each as k + (k >> 2)
 *   84(%esp)                    the ctx argument (reloaded at the end)
 *   92(%esp)                    end-of-input pointer inp+len; this
 *                               overwrites the len argument slot
 *   96(%esp)                    the fourth argument, added with carry into
 *                               the top accumulator word for every block
 * NOTE(review): the length mask here is -15, whereas both vector entry
 * points in this file use -16 (the PIC copy of this routine uses -15 as
 * well). -15 leaves bit 0 of len set, and the loop only terminates on
 * pointer equality after 16-byte steps. Harmless as long as callers pass
 * a multiple of 16 -- confirm, and correct in the generator if needed.
 */
1977.globl poly1305_blocks 1978.type poly1305_blocks,@function 1979.align 16 1980poly1305_blocks: 1981.L_poly1305_blocks_begin: 1982 pushl %ebp 1983 pushl %ebx 1984 pushl %esi 1985 pushl %edi 1986 movl 20(%esp),%edi 1987 movl 24(%esp),%esi 1988 movl 28(%esp),%ecx 1989.Lenter_blocks: 1990 andl $-15,%ecx 1991 jz .L003nodata 1992 subl $64,%esp 1993 movl 24(%edi),%eax 1994 movl 28(%edi),%ebx 1995 leal (%esi,%ecx,1),%ebp 1996 movl 32(%edi),%ecx 1997 movl 36(%edi),%edx 1998 movl %ebp,92(%esp) 1999 movl %esi,%ebp 2000 movl %eax,36(%esp) 2001 movl %ebx,%eax 2002 shrl $2,%eax 2003 movl %ebx,40(%esp) 2004 addl %ebx,%eax 2005 movl %ecx,%ebx 2006 shrl $2,%ebx 2007 movl %ecx,44(%esp) 2008 addl %ecx,%ebx 2009 movl %edx,%ecx 2010 shrl $2,%ecx 2011 movl %edx,48(%esp) 2012 addl %edx,%ecx 2013 movl %eax,52(%esp) 2014 movl %ebx,56(%esp) 2015 movl %ecx,60(%esp) 2016 movl (%edi),%eax 2017 movl 4(%edi),%ebx 2018 movl 8(%edi),%ecx 2019 movl 12(%edi),%esi 2020 movl 16(%edi),%edi 2021 jmp .L004loop
/* Per 16-byte block: accumulator (eax:ebx:ecx:esi:edi) += input words,
 * with the fourth argument added into the top word; then multiply by the
 * key using 32x32->64 mull steps, accumulating one output word at a time
 * (results parked at 20/24/28(%esp)). %ebp is the running input pointer. */
2022.align 32 2023.L004loop: 2024 addl (%ebp),%eax 2025 adcl 4(%ebp),%ebx 2026 adcl 8(%ebp),%ecx 2027 adcl 12(%ebp),%esi 2028 leal 16(%ebp),%ebp 2029 adcl 96(%esp),%edi 2030 movl %eax,(%esp) 2031 movl %esi,12(%esp) 2032 mull 36(%esp) 2033 movl %edi,16(%esp) 2034 movl %eax,%edi 2035 movl %ebx,%eax 2036 movl %edx,%esi 2037 mull 60(%esp) 2038 addl %eax,%edi 2039 movl %ecx,%eax 2040 adcl 
%edx,%esi 2041 mull 56(%esp) 2042 addl %eax,%edi 2043 movl 12(%esp),%eax 2044 adcl %edx,%esi 2045 mull 52(%esp) 2046 addl %eax,%edi 2047 movl (%esp),%eax 2048 adcl %edx,%esi 2049 mull 40(%esp) 2050 movl %edi,20(%esp) 2051 xorl %edi,%edi 2052 addl %eax,%esi 2053 movl %ebx,%eax 2054 adcl %edx,%edi 2055 mull 36(%esp) 2056 addl %eax,%esi 2057 movl %ecx,%eax 2058 adcl %edx,%edi 2059 mull 60(%esp) 2060 addl %eax,%esi 2061 movl 12(%esp),%eax 2062 adcl %edx,%edi 2063 mull 56(%esp) 2064 addl %eax,%esi 2065 movl 16(%esp),%eax 2066 adcl %edx,%edi 2067 imull 52(%esp),%eax 2068 addl %eax,%esi 2069 movl (%esp),%eax 2070 adcl $0,%edi 2071 mull 44(%esp) 2072 movl %esi,24(%esp) 2073 xorl %esi,%esi 2074 addl %eax,%edi 2075 movl %ebx,%eax 2076 adcl %edx,%esi 2077 mull 40(%esp) 2078 addl %eax,%edi 2079 movl %ecx,%eax 2080 adcl %edx,%esi 2081 mull 36(%esp) 2082 addl %eax,%edi 2083 movl 12(%esp),%eax 2084 adcl %edx,%esi 2085 mull 60(%esp) 2086 addl %eax,%edi 2087 movl 16(%esp),%eax 2088 adcl %edx,%esi 2089 imull 56(%esp),%eax 2090 addl %eax,%edi 2091 movl (%esp),%eax 2092 adcl $0,%esi 2093 mull 48(%esp) 2094 movl %edi,28(%esp) 2095 xorl %edi,%edi 2096 addl %eax,%esi 2097 movl %ebx,%eax 2098 adcl %edx,%edi 2099 mull 44(%esp) 2100 addl %eax,%esi 2101 movl %ecx,%eax 2102 adcl %edx,%edi 2103 mull 40(%esp) 2104 addl %eax,%esi 2105 movl 12(%esp),%eax 2106 adcl %edx,%edi 2107 mull 36(%esp) 2108 addl %eax,%esi 2109 movl 16(%esp),%ecx 2110 adcl %edx,%edi 2111 movl %ecx,%edx 2112 imull 60(%esp),%ecx 2113 addl %ecx,%esi 2114 movl 20(%esp),%eax 2115 adcl $0,%edi 2116 imull 36(%esp),%edx 2117 addl %edi,%edx 2118 movl 24(%esp),%ebx 2119 movl 28(%esp),%ecx
/* Partial reduction of the top word in %edx: keep its low 2 bits as the
 * new top accumulator word and add 5 * (top >> 2) back into word 0,
 * rippling the carry upward. */
2120 movl %edx,%edi 2121 shrl $2,%edx 2122 andl $3,%edi 2123 leal (%edx,%edx,4),%edx 2124 addl %edx,%eax 2125 adcl $0,%ebx 2126 adcl $0,%ecx 2127 adcl $0,%esi 2128 adcl $0,%edi 2129 cmpl 92(%esp),%ebp 2130 jne .L004loop
/* Done: reload ctx from its argument slot, release the frame and store
 * the five accumulator words back to 0..16(ctx). */
2131 movl 84(%esp),%edx 2132 addl $64,%esp 2133 movl %eax,(%edx) 2134 movl %ebx,4(%edx) 2135 movl %ecx,8(%edx) 2136 movl 
%esi,12(%edx) 2137 movl %edi,16(%edx) 2138.L003nodata: 2139 popl %edi 2140 popl %esi 2141 popl %ebx 2142 popl %ebp 2143 ret 2144.size poly1305_blocks,.-.L_poly1305_blocks_begin
/*
 * poly1305_emit(ctx, out, arg3) -- arguments on the stack.
 * After the four pushes: 20(%esp)=ctx -> %ebp, 24(%esp)=out -> %edi,
 * 28(%esp)=arg3 (loaded into %ebp later).
 * Reads the five accumulator words at 0..16(ctx) and computes acc + 5.
 * The top word of that sum, shifted right by 2 and negated, is used as a
 * select mask: the low four words of the sum are kept where the mask is
 * set, the original four low words where it is clear (out is used as
 * scratch for the merge). The four dwords at arg3 are then added with
 * carry and the 16-byte result is written to out.
 */
2145.globl poly1305_emit 2146.type poly1305_emit,@function 2147.align 16 2148poly1305_emit: 2149.L_poly1305_emit_begin: 2150 pushl %ebp 2151 pushl %ebx 2152 pushl %esi 2153 pushl %edi 2154 movl 20(%esp),%ebp 2155.Lenter_emit: 2156 movl 24(%esp),%edi 2157 movl (%ebp),%eax 2158 movl 4(%ebp),%ebx 2159 movl 8(%ebp),%ecx 2160 movl 12(%ebp),%edx 2161 movl 16(%ebp),%esi 2162 addl $5,%eax 2163 adcl $0,%ebx 2164 adcl $0,%ecx 2165 adcl $0,%edx 2166 adcl $0,%esi 2167 shrl $2,%esi 2168 negl %esi 2169 andl %esi,%eax 2170 andl %esi,%ebx 2171 andl %esi,%ecx 2172 andl %esi,%edx 2173 movl %eax,(%edi) 2174 movl %ebx,4(%edi) 2175 movl %ecx,8(%edi) 2176 movl %edx,12(%edi) 2177 notl %esi 2178 movl (%ebp),%eax 2179 movl 4(%ebp),%ebx 2180 movl 8(%ebp),%ecx 2181 movl 12(%ebp),%edx 2182 movl 28(%esp),%ebp 2183 andl %esi,%eax 2184 andl %esi,%ebx 2185 andl %esi,%ecx 2186 andl %esi,%edx 2187 orl (%edi),%eax 2188 orl 4(%edi),%ebx 2189 orl 8(%edi),%ecx 2190 orl 12(%edi),%edx 2191 addl (%ebp),%eax 2192 adcl 4(%ebp),%ebx 2193 adcl 8(%ebp),%ecx 2194 adcl 12(%ebp),%edx 2195 movl %eax,(%edi) 2196 movl %ebx,4(%edi) 2197 movl %ecx,8(%edi) 2198 movl %edx,12(%edi) 2199 popl %edi 2200 popl %esi 2201 popl %ebx 2202 popl %ebp 2203 ret 2204.size poly1305_emit,.-.L_poly1305_emit_begin
/*
 * _poly1305_init_sse2 -- internal helper, called from _poly1305_blocks_sse2.
 * In:  %edi = context pointer (16 key bytes read from 24(%edi)),
 *      %ebx = address of .Lconst_sse2 (64(%ebx) = 2^26-1 limb mask).
 * Out: 144 bytes written at 48..191 of the context (five limb vectors and
 *      four 5*limb vectors).
 * %edi is advanced by 48 here and restored before ret; %ebp holds the
 * caller's %esp while a 16-byte aligned scratch frame (224 bytes reserved)
 * is in use. Clobbers %ebp, %ecx, %edx, %xmm0-%xmm7.
 * The instructions below split the key into 26-bit limbs (bits 0-25,
 * 26-51, 52-77, 78-103, 104-127) in the low qword of xmm0..xmm4 and set
 * the pass counter %ecx to 2.
 */
2205.align 32 2206.type _poly1305_init_sse2,@function 2207.align 16 2208_poly1305_init_sse2: 2209 movdqu 24(%edi),%xmm4 2210 leal 48(%edi),%edi 2211 movl %esp,%ebp 2212 subl $224,%esp 2213 andl $-16,%esp 2214 movq 64(%ebx),%xmm7 2215 movdqa %xmm4,%xmm0 2216 movdqa %xmm4,%xmm1 2217 movdqa %xmm4,%xmm2 2218 pand %xmm7,%xmm0 2219 psrlq $26,%xmm1 2220 psrldq $6,%xmm2 2221 pand %xmm7,%xmm1 2222 movdqa %xmm2,%xmm3 2223 psrlq $4,%xmm2 2224 psrlq $30,%xmm3 2225 pand %xmm7,%xmm2 2226 pand %xmm7,%xmm3 2227 psrldq $13,%xmm4 2228 leal 144(%esp),%edx 2229 movl $2,%ecx 
/* Pass body of _poly1305_init_sse2 (executed twice, %ecx counts down from
 * 2). Save the current limbs at 0..64(%esp), their 5x multiples
 * (x + (x << 2)) for limbs 1..4 at 80..128(%esp), and copies with the low
 * qword duplicated into both lanes (pshufd $68) at 0..64(%edx); then form
 * the 5x5 limb products into the 64-bit accumulators xmm0..xmm4. */
2230.L005square: 2231 movdqa %xmm0,(%esp) 2232 movdqa %xmm1,16(%esp) 2233 movdqa %xmm2,32(%esp) 2234 movdqa %xmm3,48(%esp) 2235 movdqa %xmm4,64(%esp) 2236 movdqa %xmm1,%xmm6 2237 movdqa %xmm2,%xmm5 2238 pslld $2,%xmm6 2239 pslld $2,%xmm5 2240 paddd %xmm1,%xmm6 2241 paddd %xmm2,%xmm5 2242 movdqa %xmm6,80(%esp) 2243 movdqa %xmm5,96(%esp) 2244 movdqa %xmm3,%xmm6 2245 movdqa %xmm4,%xmm5 2246 pslld $2,%xmm6 2247 pslld $2,%xmm5 2248 paddd %xmm3,%xmm6 2249 paddd %xmm4,%xmm5 2250 movdqa %xmm6,112(%esp) 2251 movdqa %xmm5,128(%esp) 2252 pshufd $68,%xmm0,%xmm6 2253 movdqa %xmm1,%xmm5 2254 pshufd $68,%xmm1,%xmm1 2255 pshufd $68,%xmm2,%xmm2 2256 pshufd $68,%xmm3,%xmm3 2257 pshufd $68,%xmm4,%xmm4 2258 movdqa %xmm6,(%edx) 2259 movdqa %xmm1,16(%edx) 2260 movdqa %xmm2,32(%edx) 2261 movdqa %xmm3,48(%edx) 2262 movdqa %xmm4,64(%edx) 2263 pmuludq %xmm0,%xmm4 2264 pmuludq %xmm0,%xmm3 2265 pmuludq %xmm0,%xmm2 2266 pmuludq %xmm0,%xmm1 2267 pmuludq %xmm6,%xmm0 2268 movdqa %xmm5,%xmm6 2269 pmuludq 48(%edx),%xmm5 2270 movdqa %xmm6,%xmm7 2271 pmuludq 32(%edx),%xmm6 2272 paddq %xmm5,%xmm4 2273 movdqa %xmm7,%xmm5 2274 pmuludq 16(%edx),%xmm7 2275 paddq %xmm6,%xmm3 2276 movdqa 80(%esp),%xmm6 2277 pmuludq (%edx),%xmm5 2278 paddq %xmm7,%xmm2 2279 pmuludq 64(%edx),%xmm6 2280 movdqa 32(%esp),%xmm7 2281 paddq %xmm5,%xmm1 2282 movdqa %xmm7,%xmm5 2283 pmuludq 32(%edx),%xmm7 2284 paddq %xmm6,%xmm0 2285 movdqa %xmm5,%xmm6 2286 pmuludq 16(%edx),%xmm5 2287 paddq %xmm7,%xmm4 2288 movdqa 96(%esp),%xmm7 2289 pmuludq (%edx),%xmm6 2290 paddq %xmm5,%xmm3 2291 movdqa %xmm7,%xmm5 2292 pmuludq 64(%edx),%xmm7 2293 paddq %xmm6,%xmm2 2294 pmuludq 48(%edx),%xmm5 2295 movdqa 48(%esp),%xmm6 2296 paddq %xmm7,%xmm1 2297 movdqa %xmm6,%xmm7 2298 pmuludq 16(%edx),%xmm6 2299 paddq %xmm5,%xmm0 2300 movdqa 112(%esp),%xmm5 2301 pmuludq (%edx),%xmm7 2302 paddq %xmm6,%xmm4 2303 movdqa %xmm5,%xmm6 2304 pmuludq 64(%edx),%xmm5 2305 paddq %xmm7,%xmm3 2306 movdqa %xmm6,%xmm7 2307 pmuludq 48(%edx),%xmm6 2308 paddq %xmm5,%xmm2 2309 pmuludq 
32(%edx),%xmm7 2310 movdqa 64(%esp),%xmm5 2311 paddq %xmm6,%xmm1 2312 movdqa 128(%esp),%xmm6 2313 pmuludq (%edx),%xmm5 2314 paddq %xmm7,%xmm0 2315 movdqa %xmm6,%xmm7 2316 pmuludq 64(%edx),%xmm6 2317 paddq %xmm5,%xmm4 2318 movdqa %xmm7,%xmm5 2319 pmuludq 16(%edx),%xmm7 2320 paddq %xmm6,%xmm3 2321 movdqa %xmm5,%xmm6 2322 pmuludq 32(%edx),%xmm5 2323 paddq %xmm7,%xmm0 2324 pmuludq 48(%edx),%xmm6 2325 movdqa 64(%ebx),%xmm7 2326 paddq %xmm5,%xmm1 2327 paddq %xmm6,%xmm2
/* Carry propagation with the 2^26-1 mask in xmm7. The carry out of limb 4
 * reaches limb 0 once as-is (paddd) and once shifted left by 2 (psllq then
 * paddq), i.e. multiplied by 5. */
2328 movdqa %xmm3,%xmm5 2329 pand %xmm7,%xmm3 2330 psrlq $26,%xmm5 2331 paddq %xmm4,%xmm5 2332 movdqa %xmm0,%xmm6 2333 pand %xmm7,%xmm0 2334 psrlq $26,%xmm6 2335 movdqa %xmm5,%xmm4 2336 paddq %xmm1,%xmm6 2337 psrlq $26,%xmm5 2338 pand %xmm7,%xmm4 2339 movdqa %xmm6,%xmm1 2340 psrlq $26,%xmm6 2341 paddd %xmm5,%xmm0 2342 psllq $2,%xmm5 2343 paddq %xmm2,%xmm6 2344 paddq %xmm0,%xmm5 2345 pand %xmm7,%xmm1 2346 movdqa %xmm6,%xmm2 2347 psrlq $26,%xmm6 2348 pand %xmm7,%xmm2 2349 paddd %xmm3,%xmm6 2350 movdqa %xmm5,%xmm0 2351 psrlq $26,%xmm5 2352 movdqa %xmm6,%xmm3 2353 psrlq $26,%xmm6 2354 pand %xmm7,%xmm0 2355 paddd %xmm5,%xmm1 2356 pand %xmm7,%xmm3 2357 paddd %xmm6,%xmm4
/* After the first pass, pair each new limb (low qword) with the limb saved
 * at the top of the pass (high qword) and run the body once more. */
2358 decl %ecx 2359 jz .L006square_break 2360 punpcklqdq (%esp),%xmm0 2361 punpcklqdq 16(%esp),%xmm1 2362 punpcklqdq 32(%esp),%xmm2 2363 punpcklqdq 48(%esp),%xmm3 2364 punpcklqdq 64(%esp),%xmm4 2365 jmp .L005square
/* Second pass done: move the new limbs into the odd dwords, OR in the
 * limbs saved at the top of that pass, reorder with pshufd $141 so the new
 * values occupy dwords 0-1 and the saved ones dwords 2-3, then store the
 * five limb vectors and the four 5x vectors at 0..128(%edi) (context
 * offsets 48..191). */
2366.L006square_break: 2367 psllq $32,%xmm0 2368 psllq $32,%xmm1 2369 psllq $32,%xmm2 2370 psllq $32,%xmm3 2371 psllq $32,%xmm4 2372 por (%esp),%xmm0 2373 por 16(%esp),%xmm1 2374 por 32(%esp),%xmm2 2375 por 48(%esp),%xmm3 2376 por 64(%esp),%xmm4 2377 pshufd $141,%xmm0,%xmm0 2378 pshufd $141,%xmm1,%xmm1 2379 pshufd $141,%xmm2,%xmm2 2380 pshufd $141,%xmm3,%xmm3 2381 pshufd $141,%xmm4,%xmm4 2382 movdqu %xmm0,(%edi) 2383 movdqu %xmm1,16(%edi) 2384 movdqu %xmm2,32(%edi) 2385 movdqu %xmm3,48(%edi) 2386 movdqu %xmm4,64(%edi) 2387 movdqa %xmm1,%xmm6 2388 movdqa %xmm2,%xmm5 2389 pslld $2,%xmm6 2390 pslld $2,%xmm5 2391 paddd %xmm1,%xmm6 2392 
paddd %xmm2,%xmm5 2393 movdqu %xmm6,80(%edi) 2394 movdqu %xmm5,96(%edi) 2395 movdqa %xmm3,%xmm6 2396 movdqa %xmm4,%xmm5 2397 pslld $2,%xmm6 2398 pslld $2,%xmm5 2399 paddd %xmm3,%xmm6 2400 paddd %xmm4,%xmm5 2401 movdqu %xmm6,112(%edi) 2402 movdqu %xmm5,128(%edi)
/* Drop the scratch frame and undo the +48 applied to %edi on entry. */
2403 movl %ebp,%esp 2404 leal -48(%edi),%edi 2405 ret 2406.size _poly1305_init_sse2,.-_poly1305_init_sse2
/*
 * _poly1305_blocks_sse2(ctx, inp, len, arg4) -- arguments on the stack.
 * After the four pushes: 20(%esp)=ctx -> %edi, 24(%esp)=inp -> %esi,
 * 28(%esp)=len -> %ecx (rounded down to a multiple of 16); %eax = flag at
 * 20(ctx). With the flag clear and fewer than 64 bytes, control passes to
 * .Lenter_blocks in poly1305_blocks. Otherwise %ebx is pointed at
 * .Lconst_sse2 and, if the flag is clear, _poly1305_init_sse2 is called,
 * the accumulator at 0..16(ctx) is converted to five 26-bit limbs in
 * xmm0..xmm4 (overlapping loads at byte offsets 0/3/6/9/13) and the flag
 * is set to 1. If the flag was already set, the limbs are loaded directly
 * from 0..16(ctx).
 * NOTE(review): only the beginning of this routine is covered by this
 * comment; the block loop that follows is not described here.
 */
2407.align 32 2408.type _poly1305_blocks_sse2,@function 2409.align 16 2410_poly1305_blocks_sse2: 2411 pushl %ebp 2412 pushl %ebx 2413 pushl %esi 2414 pushl %edi 2415 movl 20(%esp),%edi 2416 movl 24(%esp),%esi 2417 movl 28(%esp),%ecx 2418 movl 20(%edi),%eax 2419 andl $-16,%ecx 2420 jz .L007nodata 2421 cmpl $64,%ecx 2422 jae .L008enter_sse2 2423 testl %eax,%eax 2424 jz .Lenter_blocks 2425.align 16 2426.L008enter_sse2: 2427 call .L009pic_point 2428.L009pic_point: 2429 popl %ebx 2430 leal .Lconst_sse2-.L009pic_point(%ebx),%ebx 2431 testl %eax,%eax 2432 jnz .L010base2_26 2433 call _poly1305_init_sse2 2434 movl (%edi),%eax 2435 movl 3(%edi),%ecx 2436 movl 6(%edi),%edx 2437 movl 9(%edi),%esi 2438 movl 13(%edi),%ebp 2439 movl $1,20(%edi) 2440 shrl $2,%ecx 2441 andl $67108863,%eax 2442 shrl $4,%edx 2443 andl $67108863,%ecx 2444 shrl $6,%esi 2445 andl $67108863,%edx 2446 movd %eax,%xmm0 2447 movd %ecx,%xmm1 2448 movd %edx,%xmm2 2449 movd %esi,%xmm3 2450 movd %ebp,%xmm4 2451 movl 24(%esp),%esi 2452 movl 28(%esp),%ecx 2453 jmp .L011base2_32 2454.align 16 2455.L010base2_26: 2456 movd (%edi),%xmm0 2457 movd 4(%edi),%xmm1 2458 movd 8(%edi),%xmm2 2459 movd 12(%edi),%xmm3 2460 movd 16(%edi),%xmm4 2461 movdqa 64(%ebx),%xmm7
/* Frame setup: %eax = fourth argument shifted left by 24, %ebp = caller's
 * %esp, 528 bytes reserved with %esp aligned down to 16, %edi advanced by
 * 48. When len has bit 4 set (testl $31 after the -16 rounding), one
 * 16-byte block is split into limbs and added to xmm0..xmm4 first. */
2462.L011base2_32: 2463 movl 32(%esp),%eax 2464 movl %esp,%ebp 2465 subl $528,%esp 2466 andl $-16,%esp 2467 leal 48(%edi),%edi 2468 shll $24,%eax 2469 testl $31,%ecx 2470 jz .L012even 2471 movdqu (%esi),%xmm6 2472 leal 16(%esi),%esi 2473 movdqa %xmm6,%xmm5 2474 pand %xmm7,%xmm6 2475 paddd %xmm6,%xmm0 2476 movdqa %xmm5,%xmm6 2477 psrlq $26,%xmm5 2478 psrldq $6,%xmm6 2479 pand %xmm7,%xmm5 2480 paddd 
%xmm5,%xmm1 2481 movdqa %xmm6,%xmm5 2482 psrlq $4,%xmm6 2483 pand %xmm7,%xmm6 2484 paddd %xmm6,%xmm2 2485 movdqa %xmm5,%xmm6 2486 psrlq $30,%xmm5 2487 pand %xmm7,%xmm5 2488 psrldq $7,%xmm6 2489 paddd %xmm5,%xmm3 2490 movd %eax,%xmm5 2491 paddd %xmm6,%xmm4 2492 movd 12(%edi),%xmm6 2493 paddd %xmm5,%xmm4 2494 movdqa %xmm0,(%esp) 2495 movdqa %xmm1,16(%esp) 2496 movdqa %xmm2,32(%esp) 2497 movdqa %xmm3,48(%esp) 2498 movdqa %xmm4,64(%esp) 2499 pmuludq %xmm6,%xmm0 2500 pmuludq %xmm6,%xmm1 2501 pmuludq %xmm6,%xmm2 2502 movd 28(%edi),%xmm5 2503 pmuludq %xmm6,%xmm3 2504 pmuludq %xmm6,%xmm4 2505 movdqa %xmm5,%xmm6 2506 pmuludq 48(%esp),%xmm5 2507 movdqa %xmm6,%xmm7 2508 pmuludq 32(%esp),%xmm6 2509 paddq %xmm5,%xmm4 2510 movdqa %xmm7,%xmm5 2511 pmuludq 16(%esp),%xmm7 2512 paddq %xmm6,%xmm3 2513 movd 92(%edi),%xmm6 2514 pmuludq (%esp),%xmm5 2515 paddq %xmm7,%xmm2 2516 pmuludq 64(%esp),%xmm6 2517 movd 44(%edi),%xmm7 2518 paddq %xmm5,%xmm1 2519 movdqa %xmm7,%xmm5 2520 pmuludq 32(%esp),%xmm7 2521 paddq %xmm6,%xmm0 2522 movdqa %xmm5,%xmm6 2523 pmuludq 16(%esp),%xmm5 2524 paddq %xmm7,%xmm4 2525 movd 108(%edi),%xmm7 2526 pmuludq (%esp),%xmm6 2527 paddq %xmm5,%xmm3 2528 movdqa %xmm7,%xmm5 2529 pmuludq 64(%esp),%xmm7 2530 paddq %xmm6,%xmm2 2531 pmuludq 48(%esp),%xmm5 2532 movd 60(%edi),%xmm6 2533 paddq %xmm7,%xmm1 2534 movdqa %xmm6,%xmm7 2535 pmuludq 16(%esp),%xmm6 2536 paddq %xmm5,%xmm0 2537 movd 124(%edi),%xmm5 2538 pmuludq (%esp),%xmm7 2539 paddq %xmm6,%xmm4 2540 movdqa %xmm5,%xmm6 2541 pmuludq 64(%esp),%xmm5 2542 paddq %xmm7,%xmm3 2543 movdqa %xmm6,%xmm7 2544 pmuludq 48(%esp),%xmm6 2545 paddq %xmm5,%xmm2 2546 pmuludq 32(%esp),%xmm7 2547 movd 76(%edi),%xmm5 2548 paddq %xmm6,%xmm1 2549 movd 140(%edi),%xmm6 2550 pmuludq (%esp),%xmm5 2551 paddq %xmm7,%xmm0 2552 movdqa %xmm6,%xmm7 2553 pmuludq 64(%esp),%xmm6 2554 paddq %xmm5,%xmm4 2555 movdqa %xmm7,%xmm5 2556 pmuludq 16(%esp),%xmm7 2557 paddq %xmm6,%xmm3 2558 movdqa %xmm5,%xmm6 2559 pmuludq 32(%esp),%xmm5 2560 paddq %xmm7,%xmm0 2561 
pmuludq 48(%esp),%xmm6 2562 movdqa 64(%ebx),%xmm7 2563 paddq %xmm5,%xmm1 2564 paddq %xmm6,%xmm2 2565 movdqa %xmm3,%xmm5 2566 pand %xmm7,%xmm3 2567 psrlq $26,%xmm5 2568 paddq %xmm4,%xmm5 2569 movdqa %xmm0,%xmm6 2570 pand %xmm7,%xmm0 2571 psrlq $26,%xmm6 2572 movdqa %xmm5,%xmm4 2573 paddq %xmm1,%xmm6 2574 psrlq $26,%xmm5 2575 pand %xmm7,%xmm4 2576 movdqa %xmm6,%xmm1 2577 psrlq $26,%xmm6 2578 paddd %xmm5,%xmm0 2579 psllq $2,%xmm5 2580 paddq %xmm2,%xmm6 2581 paddq %xmm0,%xmm5 2582 pand %xmm7,%xmm1 2583 movdqa %xmm6,%xmm2 2584 psrlq $26,%xmm6 2585 pand %xmm7,%xmm2 2586 paddd %xmm3,%xmm6 2587 movdqa %xmm5,%xmm0 2588 psrlq $26,%xmm5 2589 movdqa %xmm6,%xmm3 2590 psrlq $26,%xmm6 2591 pand %xmm7,%xmm0 2592 paddd %xmm5,%xmm1 2593 pand %xmm7,%xmm3 2594 paddd %xmm6,%xmm4 2595 subl $16,%ecx 2596 jz .L013done 2597.L012even: 2598 leal 384(%esp),%edx 2599 leal -32(%esi),%eax 2600 subl $64,%ecx 2601 movdqu (%edi),%xmm5 2602 pshufd $68,%xmm5,%xmm6 2603 cmovbl %eax,%esi 2604 pshufd $238,%xmm5,%xmm5 2605 movdqa %xmm6,(%edx) 2606 leal 160(%esp),%eax 2607 movdqu 16(%edi),%xmm6 2608 movdqa %xmm5,-144(%edx) 2609 pshufd $68,%xmm6,%xmm5 2610 pshufd $238,%xmm6,%xmm6 2611 movdqa %xmm5,16(%edx) 2612 movdqu 32(%edi),%xmm5 2613 movdqa %xmm6,-128(%edx) 2614 pshufd $68,%xmm5,%xmm6 2615 pshufd $238,%xmm5,%xmm5 2616 movdqa %xmm6,32(%edx) 2617 movdqu 48(%edi),%xmm6 2618 movdqa %xmm5,-112(%edx) 2619 pshufd $68,%xmm6,%xmm5 2620 pshufd $238,%xmm6,%xmm6 2621 movdqa %xmm5,48(%edx) 2622 movdqu 64(%edi),%xmm5 2623 movdqa %xmm6,-96(%edx) 2624 pshufd $68,%xmm5,%xmm6 2625 pshufd $238,%xmm5,%xmm5 2626 movdqa %xmm6,64(%edx) 2627 movdqu 80(%edi),%xmm6 2628 movdqa %xmm5,-80(%edx) 2629 pshufd $68,%xmm6,%xmm5 2630 pshufd $238,%xmm6,%xmm6 2631 movdqa %xmm5,80(%edx) 2632 movdqu 96(%edi),%xmm5 2633 movdqa %xmm6,-64(%edx) 2634 pshufd $68,%xmm5,%xmm6 2635 pshufd $238,%xmm5,%xmm5 2636 movdqa %xmm6,96(%edx) 2637 movdqu 112(%edi),%xmm6 2638 movdqa %xmm5,-48(%edx) 2639 pshufd $68,%xmm6,%xmm5 2640 pshufd $238,%xmm6,%xmm6 2641 
movdqa %xmm5,112(%edx) 2642 movdqu 128(%edi),%xmm5 2643 movdqa %xmm6,-32(%edx) 2644 pshufd $68,%xmm5,%xmm6 2645 pshufd $238,%xmm5,%xmm5 2646 movdqa %xmm6,128(%edx) 2647 movdqa %xmm5,-16(%edx) 2648 movdqu 32(%esi),%xmm5 2649 movdqu 48(%esi),%xmm6 2650 leal 32(%esi),%esi 2651 movdqa %xmm2,112(%esp) 2652 movdqa %xmm3,128(%esp) 2653 movdqa %xmm4,144(%esp) 2654 movdqa %xmm5,%xmm2 2655 movdqa %xmm6,%xmm3 2656 psrldq $6,%xmm2 2657 psrldq $6,%xmm3 2658 movdqa %xmm5,%xmm4 2659 punpcklqdq %xmm3,%xmm2 2660 punpckhqdq %xmm6,%xmm4 2661 punpcklqdq %xmm6,%xmm5 2662 movdqa %xmm2,%xmm3 2663 psrlq $4,%xmm2 2664 psrlq $30,%xmm3 2665 movdqa %xmm5,%xmm6 2666 psrlq $40,%xmm4 2667 psrlq $26,%xmm6 2668 pand %xmm7,%xmm5 2669 pand %xmm7,%xmm6 2670 pand %xmm7,%xmm2 2671 pand %xmm7,%xmm3 2672 por (%ebx),%xmm4 2673 movdqa %xmm0,80(%esp) 2674 movdqa %xmm1,96(%esp) 2675 jbe .L014skip_loop 2676 jmp .L015loop 2677.align 32 2678.L015loop: 2679 movdqa -144(%edx),%xmm7 2680 movdqa %xmm6,16(%eax) 2681 movdqa %xmm2,32(%eax) 2682 movdqa %xmm3,48(%eax) 2683 movdqa %xmm4,64(%eax) 2684 movdqa %xmm5,%xmm1 2685 pmuludq %xmm7,%xmm5 2686 movdqa %xmm6,%xmm0 2687 pmuludq %xmm7,%xmm6 2688 pmuludq %xmm7,%xmm2 2689 pmuludq %xmm7,%xmm3 2690 pmuludq %xmm7,%xmm4 2691 pmuludq -16(%edx),%xmm0 2692 movdqa %xmm1,%xmm7 2693 pmuludq -128(%edx),%xmm1 2694 paddq %xmm5,%xmm0 2695 movdqa %xmm7,%xmm5 2696 pmuludq -112(%edx),%xmm7 2697 paddq %xmm6,%xmm1 2698 movdqa %xmm5,%xmm6 2699 pmuludq -96(%edx),%xmm5 2700 paddq %xmm7,%xmm2 2701 movdqa 16(%eax),%xmm7 2702 pmuludq -80(%edx),%xmm6 2703 paddq %xmm5,%xmm3 2704 movdqa %xmm7,%xmm5 2705 pmuludq -128(%edx),%xmm7 2706 paddq %xmm6,%xmm4 2707 movdqa %xmm5,%xmm6 2708 pmuludq -112(%edx),%xmm5 2709 paddq %xmm7,%xmm2 2710 movdqa 32(%eax),%xmm7 2711 pmuludq -96(%edx),%xmm6 2712 paddq %xmm5,%xmm3 2713 movdqa %xmm7,%xmm5 2714 pmuludq -32(%edx),%xmm7 2715 paddq %xmm6,%xmm4 2716 movdqa %xmm5,%xmm6 2717 pmuludq -16(%edx),%xmm5 2718 paddq %xmm7,%xmm0 2719 movdqa %xmm6,%xmm7 2720 pmuludq 
-128(%edx),%xmm6 2721 paddq %xmm5,%xmm1 2722 movdqa 48(%eax),%xmm5 2723 pmuludq -112(%edx),%xmm7 2724 paddq %xmm6,%xmm3 2725 movdqa %xmm5,%xmm6 2726 pmuludq -48(%edx),%xmm5 2727 paddq %xmm7,%xmm4 2728 movdqa %xmm6,%xmm7 2729 pmuludq -32(%edx),%xmm6 2730 paddq %xmm5,%xmm0 2731 movdqa %xmm7,%xmm5 2732 pmuludq -16(%edx),%xmm7 2733 paddq %xmm6,%xmm1 2734 movdqa 64(%eax),%xmm6 2735 pmuludq -128(%edx),%xmm5 2736 paddq %xmm7,%xmm2 2737 movdqa %xmm6,%xmm7 2738 pmuludq -16(%edx),%xmm6 2739 paddq %xmm5,%xmm4 2740 movdqa %xmm7,%xmm5 2741 pmuludq -64(%edx),%xmm7 2742 paddq %xmm6,%xmm3 2743 movdqa %xmm5,%xmm6 2744 pmuludq -48(%edx),%xmm5 2745 paddq %xmm7,%xmm0 2746 movdqa 64(%ebx),%xmm7 2747 pmuludq -32(%edx),%xmm6 2748 paddq %xmm5,%xmm1 2749 paddq %xmm6,%xmm2 2750 movdqu -32(%esi),%xmm5 2751 movdqu -16(%esi),%xmm6 2752 leal 32(%esi),%esi 2753 movdqa %xmm2,32(%esp) 2754 movdqa %xmm3,48(%esp) 2755 movdqa %xmm4,64(%esp) 2756 movdqa %xmm5,%xmm2 2757 movdqa %xmm6,%xmm3 2758 psrldq $6,%xmm2 2759 psrldq $6,%xmm3 2760 movdqa %xmm5,%xmm4 2761 punpcklqdq %xmm3,%xmm2 2762 punpckhqdq %xmm6,%xmm4 2763 punpcklqdq %xmm6,%xmm5 2764 movdqa %xmm2,%xmm3 2765 psrlq $4,%xmm2 2766 psrlq $30,%xmm3 2767 movdqa %xmm5,%xmm6 2768 psrlq $40,%xmm4 2769 psrlq $26,%xmm6 2770 pand %xmm7,%xmm5 2771 pand %xmm7,%xmm6 2772 pand %xmm7,%xmm2 2773 pand %xmm7,%xmm3 2774 por (%ebx),%xmm4 2775 leal -32(%esi),%eax 2776 subl $64,%ecx 2777 paddd 80(%esp),%xmm5 2778 paddd 96(%esp),%xmm6 2779 paddd 112(%esp),%xmm2 2780 paddd 128(%esp),%xmm3 2781 paddd 144(%esp),%xmm4 2782 cmovbl %eax,%esi 2783 leal 160(%esp),%eax 2784 movdqa (%edx),%xmm7 2785 movdqa %xmm1,16(%esp) 2786 movdqa %xmm6,16(%eax) 2787 movdqa %xmm2,32(%eax) 2788 movdqa %xmm3,48(%eax) 2789 movdqa %xmm4,64(%eax) 2790 movdqa %xmm5,%xmm1 2791 pmuludq %xmm7,%xmm5 2792 paddq %xmm0,%xmm5 2793 movdqa %xmm6,%xmm0 2794 pmuludq %xmm7,%xmm6 2795 pmuludq %xmm7,%xmm2 2796 pmuludq %xmm7,%xmm3 2797 pmuludq %xmm7,%xmm4 2798 paddq 16(%esp),%xmm6 2799 paddq 32(%esp),%xmm2 2800 
paddq 48(%esp),%xmm3 2801 paddq 64(%esp),%xmm4 2802 pmuludq 128(%edx),%xmm0 2803 movdqa %xmm1,%xmm7 2804 pmuludq 16(%edx),%xmm1 2805 paddq %xmm5,%xmm0 2806 movdqa %xmm7,%xmm5 2807 pmuludq 32(%edx),%xmm7 2808 paddq %xmm6,%xmm1 2809 movdqa %xmm5,%xmm6 2810 pmuludq 48(%edx),%xmm5 2811 paddq %xmm7,%xmm2 2812 movdqa 16(%eax),%xmm7 2813 pmuludq 64(%edx),%xmm6 2814 paddq %xmm5,%xmm3 2815 movdqa %xmm7,%xmm5 2816 pmuludq 16(%edx),%xmm7 2817 paddq %xmm6,%xmm4 2818 movdqa %xmm5,%xmm6 2819 pmuludq 32(%edx),%xmm5 2820 paddq %xmm7,%xmm2 2821 movdqa 32(%eax),%xmm7 2822 pmuludq 48(%edx),%xmm6 2823 paddq %xmm5,%xmm3 2824 movdqa %xmm7,%xmm5 2825 pmuludq 112(%edx),%xmm7 2826 paddq %xmm6,%xmm4 2827 movdqa %xmm5,%xmm6 2828 pmuludq 128(%edx),%xmm5 2829 paddq %xmm7,%xmm0 2830 movdqa %xmm6,%xmm7 2831 pmuludq 16(%edx),%xmm6 2832 paddq %xmm5,%xmm1 2833 movdqa 48(%eax),%xmm5 2834 pmuludq 32(%edx),%xmm7 2835 paddq %xmm6,%xmm3 2836 movdqa %xmm5,%xmm6 2837 pmuludq 96(%edx),%xmm5 2838 paddq %xmm7,%xmm4 2839 movdqa %xmm6,%xmm7 2840 pmuludq 112(%edx),%xmm6 2841 paddq %xmm5,%xmm0 2842 movdqa %xmm7,%xmm5 2843 pmuludq 128(%edx),%xmm7 2844 paddq %xmm6,%xmm1 2845 movdqa 64(%eax),%xmm6 2846 pmuludq 16(%edx),%xmm5 2847 paddq %xmm7,%xmm2 2848 movdqa %xmm6,%xmm7 2849 pmuludq 128(%edx),%xmm6 2850 paddq %xmm5,%xmm4 2851 movdqa %xmm7,%xmm5 2852 pmuludq 80(%edx),%xmm7 2853 paddq %xmm6,%xmm3 2854 movdqa %xmm5,%xmm6 2855 pmuludq 96(%edx),%xmm5 2856 paddq %xmm7,%xmm0 2857 movdqa 64(%ebx),%xmm7 2858 pmuludq 112(%edx),%xmm6 2859 paddq %xmm5,%xmm1 2860 paddq %xmm6,%xmm2 2861 movdqa %xmm3,%xmm5 2862 pand %xmm7,%xmm3 2863 psrlq $26,%xmm5 2864 paddq %xmm4,%xmm5 2865 movdqa %xmm0,%xmm6 2866 pand %xmm7,%xmm0 2867 psrlq $26,%xmm6 2868 movdqa %xmm5,%xmm4 2869 paddq %xmm1,%xmm6 2870 psrlq $26,%xmm5 2871 pand %xmm7,%xmm4 2872 movdqa %xmm6,%xmm1 2873 psrlq $26,%xmm6 2874 paddd %xmm5,%xmm0 2875 psllq $2,%xmm5 2876 paddq %xmm2,%xmm6 2877 paddq %xmm0,%xmm5 2878 pand %xmm7,%xmm1 2879 movdqa %xmm6,%xmm2 2880 psrlq $26,%xmm6 2881 
pand %xmm7,%xmm2 2882 paddd %xmm3,%xmm6 2883 movdqa %xmm5,%xmm0 2884 psrlq $26,%xmm5 2885 movdqa %xmm6,%xmm3 2886 psrlq $26,%xmm6 2887 pand %xmm7,%xmm0 2888 paddd %xmm5,%xmm1 2889 pand %xmm7,%xmm3 2890 paddd %xmm6,%xmm4 2891 movdqu 32(%esi),%xmm5 2892 movdqu 48(%esi),%xmm6 2893 leal 32(%esi),%esi 2894 movdqa %xmm2,112(%esp) 2895 movdqa %xmm3,128(%esp) 2896 movdqa %xmm4,144(%esp) 2897 movdqa %xmm5,%xmm2 2898 movdqa %xmm6,%xmm3 2899 psrldq $6,%xmm2 2900 psrldq $6,%xmm3 2901 movdqa %xmm5,%xmm4 2902 punpcklqdq %xmm3,%xmm2 2903 punpckhqdq %xmm6,%xmm4 2904 punpcklqdq %xmm6,%xmm5 2905 movdqa %xmm2,%xmm3 2906 psrlq $4,%xmm2 2907 psrlq $30,%xmm3 2908 movdqa %xmm5,%xmm6 2909 psrlq $40,%xmm4 2910 psrlq $26,%xmm6 2911 pand %xmm7,%xmm5 2912 pand %xmm7,%xmm6 2913 pand %xmm7,%xmm2 2914 pand %xmm7,%xmm3 2915 por (%ebx),%xmm4 2916 movdqa %xmm0,80(%esp) 2917 movdqa %xmm1,96(%esp) 2918 ja .L015loop 2919.L014skip_loop: 2920 pshufd $16,-144(%edx),%xmm7 2921 addl $32,%ecx 2922 jnz .L016long_tail 2923 paddd %xmm0,%xmm5 2924 paddd %xmm1,%xmm6 2925 paddd 112(%esp),%xmm2 2926 paddd 128(%esp),%xmm3 2927 paddd 144(%esp),%xmm4 2928.L016long_tail: 2929 movdqa %xmm5,(%eax) 2930 movdqa %xmm6,16(%eax) 2931 movdqa %xmm2,32(%eax) 2932 movdqa %xmm3,48(%eax) 2933 movdqa %xmm4,64(%eax) 2934 pmuludq %xmm7,%xmm5 2935 pmuludq %xmm7,%xmm6 2936 pmuludq %xmm7,%xmm2 2937 movdqa %xmm5,%xmm0 2938 pshufd $16,-128(%edx),%xmm5 2939 pmuludq %xmm7,%xmm3 2940 movdqa %xmm6,%xmm1 2941 pmuludq %xmm7,%xmm4 2942 movdqa %xmm5,%xmm6 2943 pmuludq 48(%eax),%xmm5 2944 movdqa %xmm6,%xmm7 2945 pmuludq 32(%eax),%xmm6 2946 paddq %xmm5,%xmm4 2947 movdqa %xmm7,%xmm5 2948 pmuludq 16(%eax),%xmm7 2949 paddq %xmm6,%xmm3 2950 pshufd $16,-64(%edx),%xmm6 2951 pmuludq (%eax),%xmm5 2952 paddq %xmm7,%xmm2 2953 pmuludq 64(%eax),%xmm6 2954 pshufd $16,-112(%edx),%xmm7 2955 paddq %xmm5,%xmm1 2956 movdqa %xmm7,%xmm5 2957 pmuludq 32(%eax),%xmm7 2958 paddq %xmm6,%xmm0 2959 movdqa %xmm5,%xmm6 2960 pmuludq 16(%eax),%xmm5 2961 paddq %xmm7,%xmm4 2962 
pshufd $16,-48(%edx),%xmm7 2963 pmuludq (%eax),%xmm6 2964 paddq %xmm5,%xmm3 2965 movdqa %xmm7,%xmm5 2966 pmuludq 64(%eax),%xmm7 2967 paddq %xmm6,%xmm2 2968 pmuludq 48(%eax),%xmm5 2969 pshufd $16,-96(%edx),%xmm6 2970 paddq %xmm7,%xmm1 2971 movdqa %xmm6,%xmm7 2972 pmuludq 16(%eax),%xmm6 2973 paddq %xmm5,%xmm0 2974 pshufd $16,-32(%edx),%xmm5 2975 pmuludq (%eax),%xmm7 2976 paddq %xmm6,%xmm4 2977 movdqa %xmm5,%xmm6 2978 pmuludq 64(%eax),%xmm5 2979 paddq %xmm7,%xmm3 2980 movdqa %xmm6,%xmm7 2981 pmuludq 48(%eax),%xmm6 2982 paddq %xmm5,%xmm2 2983 pmuludq 32(%eax),%xmm7 2984 pshufd $16,-80(%edx),%xmm5 2985 paddq %xmm6,%xmm1 2986 pshufd $16,-16(%edx),%xmm6 2987 pmuludq (%eax),%xmm5 2988 paddq %xmm7,%xmm0 2989 movdqa %xmm6,%xmm7 2990 pmuludq 64(%eax),%xmm6 2991 paddq %xmm5,%xmm4 2992 movdqa %xmm7,%xmm5 2993 pmuludq 16(%eax),%xmm7 2994 paddq %xmm6,%xmm3 2995 movdqa %xmm5,%xmm6 2996 pmuludq 32(%eax),%xmm5 2997 paddq %xmm7,%xmm0 2998 pmuludq 48(%eax),%xmm6 2999 movdqa 64(%ebx),%xmm7 3000 paddq %xmm5,%xmm1 3001 paddq %xmm6,%xmm2 3002 jz .L017short_tail 3003 movdqu -32(%esi),%xmm5 3004 movdqu -16(%esi),%xmm6 3005 leal 32(%esi),%esi 3006 movdqa %xmm2,32(%esp) 3007 movdqa %xmm3,48(%esp) 3008 movdqa %xmm4,64(%esp) 3009 movdqa %xmm5,%xmm2 3010 movdqa %xmm6,%xmm3 3011 psrldq $6,%xmm2 3012 psrldq $6,%xmm3 3013 movdqa %xmm5,%xmm4 3014 punpcklqdq %xmm3,%xmm2 3015 punpckhqdq %xmm6,%xmm4 3016 punpcklqdq %xmm6,%xmm5 3017 movdqa %xmm2,%xmm3 3018 psrlq $4,%xmm2 3019 psrlq $30,%xmm3 3020 movdqa %xmm5,%xmm6 3021 psrlq $40,%xmm4 3022 psrlq $26,%xmm6 3023 pand %xmm7,%xmm5 3024 pand %xmm7,%xmm6 3025 pand %xmm7,%xmm2 3026 pand %xmm7,%xmm3 3027 por (%ebx),%xmm4 3028 pshufd $16,(%edx),%xmm7 3029 paddd 80(%esp),%xmm5 3030 paddd 96(%esp),%xmm6 3031 paddd 112(%esp),%xmm2 3032 paddd 128(%esp),%xmm3 3033 paddd 144(%esp),%xmm4 3034 movdqa %xmm5,(%esp) 3035 pmuludq %xmm7,%xmm5 3036 movdqa %xmm6,16(%esp) 3037 pmuludq %xmm7,%xmm6 3038 paddq %xmm5,%xmm0 3039 movdqa %xmm2,%xmm5 3040 pmuludq %xmm7,%xmm2 3041 
paddq %xmm6,%xmm1 3042 movdqa %xmm3,%xmm6 3043 pmuludq %xmm7,%xmm3 3044 paddq 32(%esp),%xmm2 3045 movdqa %xmm5,32(%esp) 3046 pshufd $16,16(%edx),%xmm5 3047 paddq 48(%esp),%xmm3 3048 movdqa %xmm6,48(%esp) 3049 movdqa %xmm4,%xmm6 3050 pmuludq %xmm7,%xmm4 3051 paddq 64(%esp),%xmm4 3052 movdqa %xmm6,64(%esp) 3053 movdqa %xmm5,%xmm6 3054 pmuludq 48(%esp),%xmm5 3055 movdqa %xmm6,%xmm7 3056 pmuludq 32(%esp),%xmm6 3057 paddq %xmm5,%xmm4 3058 movdqa %xmm7,%xmm5 3059 pmuludq 16(%esp),%xmm7 3060 paddq %xmm6,%xmm3 3061 pshufd $16,80(%edx),%xmm6 3062 pmuludq (%esp),%xmm5 3063 paddq %xmm7,%xmm2 3064 pmuludq 64(%esp),%xmm6 3065 pshufd $16,32(%edx),%xmm7 3066 paddq %xmm5,%xmm1 3067 movdqa %xmm7,%xmm5 3068 pmuludq 32(%esp),%xmm7 3069 paddq %xmm6,%xmm0 3070 movdqa %xmm5,%xmm6 3071 pmuludq 16(%esp),%xmm5 3072 paddq %xmm7,%xmm4 3073 pshufd $16,96(%edx),%xmm7 3074 pmuludq (%esp),%xmm6 3075 paddq %xmm5,%xmm3 3076 movdqa %xmm7,%xmm5 3077 pmuludq 64(%esp),%xmm7 3078 paddq %xmm6,%xmm2 3079 pmuludq 48(%esp),%xmm5 3080 pshufd $16,48(%edx),%xmm6 3081 paddq %xmm7,%xmm1 3082 movdqa %xmm6,%xmm7 3083 pmuludq 16(%esp),%xmm6 3084 paddq %xmm5,%xmm0 3085 pshufd $16,112(%edx),%xmm5 3086 pmuludq (%esp),%xmm7 3087 paddq %xmm6,%xmm4 3088 movdqa %xmm5,%xmm6 3089 pmuludq 64(%esp),%xmm5 3090 paddq %xmm7,%xmm3 3091 movdqa %xmm6,%xmm7 3092 pmuludq 48(%esp),%xmm6 3093 paddq %xmm5,%xmm2 3094 pmuludq 32(%esp),%xmm7 3095 pshufd $16,64(%edx),%xmm5 3096 paddq %xmm6,%xmm1 3097 pshufd $16,128(%edx),%xmm6 3098 pmuludq (%esp),%xmm5 3099 paddq %xmm7,%xmm0 3100 movdqa %xmm6,%xmm7 3101 pmuludq 64(%esp),%xmm6 3102 paddq %xmm5,%xmm4 3103 movdqa %xmm7,%xmm5 3104 pmuludq 16(%esp),%xmm7 3105 paddq %xmm6,%xmm3 3106 movdqa %xmm5,%xmm6 3107 pmuludq 32(%esp),%xmm5 3108 paddq %xmm7,%xmm0 3109 pmuludq 48(%esp),%xmm6 3110 movdqa 64(%ebx),%xmm7 3111 paddq %xmm5,%xmm1 3112 paddq %xmm6,%xmm2 3113.L017short_tail: 3114 pshufd $78,%xmm4,%xmm6 3115 pshufd $78,%xmm3,%xmm5 3116 paddq %xmm6,%xmm4 3117 paddq %xmm5,%xmm3 3118 pshufd 
$78,%xmm0,%xmm6 /* operands of the pshufd mnemonic that ends the previous physical line */
/*
 * Layout note: the leading decimal numbers are listing line numbers that are
 * fused into the text of this file; they are kept unchanged.
 *
 * The statements up to the first .size directive are the end of
 * _poly1305_blocks_sse2, whose beginning lies earlier in the file.
 * pshufd $78 swaps the two 64-bit halves of a register, so each
 * pshufd/paddq pair leaves the sum of both halves in every half.
 */
3119 pshufd $78,%xmm1,%xmm5
3120 paddq %xmm6,%xmm0
3121 paddq %xmm5,%xmm1
3122 pshufd $78,%xmm2,%xmm6
/*
 * Carry propagation across the five accumulators %xmm0..%xmm4: psrlq $26
 * extracts what lies above bit 25, pand %xmm7 keeps the low part.
 * The carry out of %xmm4 is added to %xmm0 once and once more after a left
 * shift by 2, i.e. five times in total.
 * NOTE(review): %xmm7 is presumably the 2^26-1 mask loaded from 64(%ebx)
 * on the previous physical line - confirm.
 */
3123 movdqa %xmm3,%xmm5
3124 pand %xmm7,%xmm3
3125 psrlq $26,%xmm5
3126 paddq %xmm6,%xmm2
3127 paddq %xmm4,%xmm5
3128 movdqa %xmm0,%xmm6
3129 pand %xmm7,%xmm0
3130 psrlq $26,%xmm6
3131 movdqa %xmm5,%xmm4
3132 paddq %xmm1,%xmm6
3133 psrlq $26,%xmm5
3134 pand %xmm7,%xmm4
3135 movdqa %xmm6,%xmm1
3136 psrlq $26,%xmm6
3137 paddd %xmm5,%xmm0
3138 psllq $2,%xmm5
3139 paddq %xmm2,%xmm6
3140 paddq %xmm0,%xmm5
3141 pand %xmm7,%xmm1
3142 movdqa %xmm6,%xmm2
3143 psrlq $26,%xmm6
3144 pand %xmm7,%xmm2
3145 paddd %xmm3,%xmm6
3146 movdqa %xmm5,%xmm0
3147 psrlq $26,%xmm5
3148 movdqa %xmm6,%xmm3
3149 psrlq $26,%xmm6
3150 pand %xmm7,%xmm0
3151 paddd %xmm5,%xmm1
3152 pand %xmm7,%xmm3
3153 paddd %xmm6,%xmm4
3154.L013done:
/* Write the low dword of each accumulator to five consecutive dwords at -48(%edi). */
3155 movd %xmm0,-48(%edi)
3156 movd %xmm1,-44(%edi)
3157 movd %xmm2,-40(%edi)
3158 movd %xmm3,-36(%edi)
3159 movd %xmm4,-32(%edi)
3160 movl %ebp,%esp /* %ebp is used here as the saved stack pointer */
3161.L007nodata:
3162 popl %edi
3163 popl %esi
3164 popl %ebx
3165 popl %ebp
3166 ret
3167.size _poly1305_blocks_sse2,.-_poly1305_blocks_sse2
3168.align 32
/*
 * _poly1305_emit_sse2 - three stack arguments, read at 20/24/28(%esp) after
 * the four register pushes.
 *   arg1: state pointer. Five dwords at offsets 0..16 and a flag dword at
 *         offset 20 are read.
 *   arg2: output pointer. 16 bytes are written; the buffer is also used as
 *         scratch space before the final store.
 *   arg3: pointer to four dwords that are added to the result.
 * When the flag at offset 20 is zero, control goes to .Lenter_emit, which is
 * defined outside this part of the file (NOTE(review): presumably the scalar
 * emit routine after an identical prologue - confirm).
 * %ebp, %ebx, %esi and %edi are saved and restored; %eax, %ecx, %edx,
 * %xmm0..%xmm3 and the flags are modified. The routine contains no
 * conditional branch after the flag test.
 */
3169.type _poly1305_emit_sse2,@function
3170.align 16
3171_poly1305_emit_sse2:
3172 pushl %ebp
3173 pushl %ebx
3174 pushl %esi
3175 pushl %edi
3176 movl 20(%esp),%ebp /* arg1: state pointer */
3177 cmpl $0,20(%ebp)
3178 je .Lenter_emit
/*
 * Recombine five 26-bit-spaced limbs h0..h4 into a little-endian integer in
 * %eax:%ebx:%ecx:%edx:%esi.  Limb i contributes at bit 26*i, so it is split
 * into a part shifted left (26, 20, 14, 8) that lands in the current 32-bit
 * word and a part shifted right (6, 12, 18, 24) that starts the next word;
 * adcl $0 carries between words.
 */
3179 movl (%ebp),%eax
3180 movl 4(%ebp),%edi
3181 movl 8(%ebp),%ecx
3182 movl 12(%ebp),%edx
3183 movl 16(%ebp),%esi
3184 movl %edi,%ebx
3185 shll $26,%edi
3186 shrl $6,%ebx
3187 addl %edi,%eax
3188 movl %ecx,%edi
3189 adcl $0,%ebx
3190 shll $20,%edi
3191 shrl $12,%ecx
3192 addl %edi,%ebx
3193 movl %edx,%edi
3194 adcl $0,%ecx
3195 shll $14,%edi
3196 shrl $18,%edx
3197 addl %edi,%ecx
3198 movl %esi,%edi
3199 adcl $0,%edx
3200 shll $8,%edi
3201 shrl $24,%esi
3202 addl %edi,%edx
3203 adcl $0,%esi
/*
 * Fold everything above bit 129 back into the low end: the top word keeps
 * its two low bits, the rest (top >> 2) is multiplied by 5 via leal and
 * added to the lowest word with carry propagation.
 */
3204 movl %esi,%edi
3205 andl $3,%esi
3206 shrl $2,%edi
3207 leal (%edi,%edi,4),%ebp
3208 movl 24(%esp),%edi /* arg2: output pointer */
3209 addl %ebp,%eax
3210 movl 28(%esp),%ebp /* arg3: pointer to the four dwords added at the end */
3211 adcl $0,%ebx
3212 adcl $0,%ecx
3213 adcl $0,%edx
3214 adcl $0,%esi
/*
 * Branch-free final selection.  The value h is parked in %xmm0..%xmm3 while
 * h+5 is computed in the integer registers.  Bit 2 of %esi afterwards tells
 * whether h+5 reached 2^130; shrl/negl turn it into an all-zeros or all-ones
 * mask.
 */
3215 movd %eax,%xmm0
3216 addl $5,%eax
3217 movd %ebx,%xmm1
3218 adcl $0,%ebx
3219 movd %ecx,%xmm2
3220 adcl $0,%ecx
3221 movd %edx,%xmm3
3222 adcl $0,%edx
3223 adcl $0,%esi
3224 shrl $2,%esi
3225 negl %esi /* %esi = 0 or 0xffffffff */
3226 andl %esi,%eax
3227 andl %esi,%ebx
3228 andl %esi,%ecx
3229 andl %esi,%edx
/* The masked h+5 words are parked in the output buffer while h is restored. */
3230 movl %eax,(%edi)
3231 movd %xmm0,%eax
3232 movl %ebx,4(%edi)
3233 movd %xmm1,%ebx
3234 movl %ecx,8(%edi)
3235 movd %xmm2,%ecx
3236 movl %edx,12(%edi)
3237 movd %xmm3,%edx
3238 notl %esi /* inverted mask selects the original h */
3239 andl %esi,%eax
3240 andl %esi,%ebx
3241 orl (%edi),%eax
3242 andl %esi,%ecx
3243 orl 4(%edi),%ebx
3244 andl %esi,%edx
3245 orl 8(%edi),%ecx
3246 orl 12(%edi),%edx
/* Add the four dwords at arg3 with carry; the carry out of the top word is dropped. */
3247 addl (%ebp),%eax
3248 adcl 4(%ebp),%ebx
3249 movl %eax,(%edi)
3250 adcl 8(%ebp),%ecx
3251 movl %ebx,4(%edi)
3252 adcl 12(%ebp),%edx
3253 movl %ecx,8(%edi)
3254 movl %edx,12(%edi)
3255 popl %edi
3256 popl %esi
3257 popl %ebx
3258 popl %ebp
3259 ret
3260.size _poly1305_emit_sse2,.-_poly1305_emit_sse2
3261.align 32
/*
 * _poly1305_init_avx2 - internal helper, entered with a plain call and no
 * stack arguments.  Register inputs as used below:
 *   %edi: state pointer (16 bytes are read at offset 24)
 *   %ebx: base of a constant table; 64(%ebx) is used as the limb mask
 * %ebp is overwritten with the incoming %esp and is not preserved;
 * %ecx, %edx and %xmm0..%xmm7 are modified.  The routine continues on the
 * following physical lines.
 */
3262.type _poly1305_init_avx2,@function
3263.align 16
3264_poly1305_init_avx2:
3265 vmovdqu 24(%edi),%xmm4
3266 leal 48(%edi),%edi /* %edi now points 48 bytes into the state */
3267 movl %esp,%ebp
3268 subl $224,%esp
3269 andl $-16,%esp /* 16-byte aligned scratch area for the vmovdqa stores */
3270 vmovdqa 64(%ebx),%xmm7
/*
 * Split the 128-bit value into limbs at bit offsets 0, 26, 52, 78 and 104:
 * vpsrldq $6 moves bit 48 to bit 0, so shifts by 4 and 30 reach bits 52 and
 * 78; vpsrldq $13 moves bit 104 to bit 0.
 */
3271 vpand %xmm7,%xmm4,%xmm0
3272 vpsrlq $26,%xmm4,%xmm1
3273 vpsrldq $6,%xmm4,%xmm3
3274 vpand %xmm7,%xmm1,%xmm1
3275 vpsrlq $4,%xmm3,%xmm2
3276 vpsrlq $30,%xmm3,%xmm3
3277 vpand %xmm7,%xmm2,%xmm2
3278 vpand %xmm7,%xmm3,%xmm3
3279 vpsrldq $13,%xmm4,%xmm4
3280 leal 144(%esp),%edx
3281 movl $2,%ecx /* number of passes through .L018square */
3282.L018square:
/* Save the current limbs and their 5x multiples ((x << 2) + x) on the stack. */
3283 vmovdqa %xmm0,(%esp)
3284 vmovdqa %xmm1,16(%esp)
3285 vmovdqa %xmm2,32(%esp)
3286 vmovdqa %xmm3,48(%esp)
3287 vmovdqa %xmm4,64(%esp)
3288 vpslld $2,%xmm1,%xmm6
3289 vpslld $2,%xmm2,%xmm5
3290 vpaddd %xmm1,%xmm6,%xmm6
3291 vpaddd %xmm2,%xmm5,%xmm5
3292 vmovdqa %xmm6,80(%esp)
3293 vmovdqa %xmm5,96(%esp)
3294 vpslld $2,%xmm3,%xmm6
3295 vpslld $2,%xmm4,%xmm5
3296 vpaddd %xmm3,%xmm6,%xmm6
3297 vpaddd %xmm4,%xmm5,%xmm5
3298 /* listing number of the statement that continues on the next physical line */
vmovdqa %xmm6,112(%esp) /* its listing number ends the previous physical line */
/*
 * Layout note: the leading decimal numbers are listing line numbers that are
 * fused into the text of this file; they are kept unchanged.
 *
 * This is the interior of _poly1305_init_avx2.  Its entry point and the start
 * of the .L018square loop body are on the preceding physical line.
 * State at this point, as set up there:
 *   %xmm0..%xmm4     five limb vectors
 *   %xmm6, %xmm5     5*limb3 and 5*limb4
 *   0..64(%esp)      copies of the five limbs
 *   80(%esp),96(%esp) 5*limb1 and 5*limb2
 *   %edx = %esp+144, %ecx = remaining pass count, %ebp = caller's %esp
 */
3299 vmovdqa %xmm5,128(%esp)
/* vpshufd $68 copies the low 64 bits into both halves; the copies go to 0..64(%edx). */
3300 vpshufd $68,%xmm0,%xmm5
3301 vmovdqa %xmm1,%xmm6
3302 vpshufd $68,%xmm1,%xmm1
3303 vpshufd $68,%xmm2,%xmm2
3304 vpshufd $68,%xmm3,%xmm3
3305 vpshufd $68,%xmm4,%xmm4
3306 vmovdqa %xmm5,(%edx)
3307 vmovdqa %xmm1,16(%edx)
3308 vmovdqa %xmm2,32(%edx)
3309 vmovdqa %xmm3,48(%edx)
3310 vmovdqa %xmm4,64(%edx)
/*
 * 5x5 limb multiplication.  vpmuludq multiplies the low dword of each 64-bit
 * lane; the 25 products are summed with vpaddq into %xmm0..%xmm4.
 * The left-hand factors come from the per-lane copies at 0..64(%esp) and
 * their 5x multiples at 80..128(%esp); the right-hand factors are the
 * broadcast copies at 0..64(%edx).  The 5x multiples are used for the
 * products whose limb indices add up to 5 or more.
 */
3311 vpmuludq %xmm0,%xmm4,%xmm4
3312 vpmuludq %xmm0,%xmm3,%xmm3
3313 vpmuludq %xmm0,%xmm2,%xmm2
3314 vpmuludq %xmm0,%xmm1,%xmm1
3315 vpmuludq %xmm0,%xmm5,%xmm0
3316 vpmuludq 48(%edx),%xmm6,%xmm5
3317 vpaddq %xmm5,%xmm4,%xmm4
3318 vpmuludq 32(%edx),%xmm6,%xmm7
3319 vpaddq %xmm7,%xmm3,%xmm3
3320 vpmuludq 16(%edx),%xmm6,%xmm5
3321 vpaddq %xmm5,%xmm2,%xmm2
3322 vmovdqa 80(%esp),%xmm7
3323 vpmuludq (%edx),%xmm6,%xmm6
3324 vpaddq %xmm6,%xmm1,%xmm1
3325 vmovdqa 32(%esp),%xmm5
3326 vpmuludq 64(%edx),%xmm7,%xmm7
3327 vpaddq %xmm7,%xmm0,%xmm0
3328 vpmuludq 32(%edx),%xmm5,%xmm6
3329 vpaddq %xmm6,%xmm4,%xmm4
3330 vpmuludq 16(%edx),%xmm5,%xmm7
3331 vpaddq %xmm7,%xmm3,%xmm3
3332 vmovdqa 96(%esp),%xmm6
3333 vpmuludq (%edx),%xmm5,%xmm5
3334 vpaddq %xmm5,%xmm2,%xmm2
3335 vpmuludq 64(%edx),%xmm6,%xmm7
3336 vpaddq %xmm7,%xmm1,%xmm1
3337 vmovdqa 48(%esp),%xmm5
3338 vpmuludq 48(%edx),%xmm6,%xmm6
3339 vpaddq %xmm6,%xmm0,%xmm0
3340 vpmuludq 16(%edx),%xmm5,%xmm7
3341 vpaddq %xmm7,%xmm4,%xmm4
3342 vmovdqa 112(%esp),%xmm6
3343 vpmuludq (%edx),%xmm5,%xmm5
3344 vpaddq %xmm5,%xmm3,%xmm3
3345 vpmuludq 64(%edx),%xmm6,%xmm7
3346 vpaddq %xmm7,%xmm2,%xmm2
3347 vpmuludq 48(%edx),%xmm6,%xmm5
3348 vpaddq %xmm5,%xmm1,%xmm1
3349 vmovdqa 64(%esp),%xmm7
3350 vpmuludq 32(%edx),%xmm6,%xmm6
3351 vpaddq %xmm6,%xmm0,%xmm0
3352 vmovdqa 128(%esp),%xmm5
3353 vpmuludq (%edx),%xmm7,%xmm7
3354 vpaddq %xmm7,%xmm4,%xmm4
3355 vpmuludq 64(%edx),%xmm5,%xmm6
3356 vpaddq %xmm6,%xmm3,%xmm3
3357 vpmuludq 16(%edx),%xmm5,%xmm7
3358 vpaddq %xmm7,%xmm0,%xmm0
3359 vpmuludq 32(%edx),%xmm5,%xmm6
3360 vpaddq %xmm6,%xmm1,%xmm1
3361 vmovdqa 64(%ebx),%xmm7 /* reload the mask; %xmm7 was used as a temporary above */
3362 vpmuludq 48(%edx),%xmm5,%xmm5
3363 vpaddq %xmm5,%xmm2,%xmm2
/*
 * Carry propagation: the part of each sum above bit 25 moves to the next
 * limb.  The carry out of limb 4 (%xmm5) is added to limb 0 once and once
 * more after vpsllq $2, i.e. five times in total.
 */
3364 vpsrlq $26,%xmm3,%xmm5
3365 vpand %xmm7,%xmm3,%xmm3
3366 vpsrlq $26,%xmm0,%xmm6
3367 vpand %xmm7,%xmm0,%xmm0
3368 vpaddq %xmm5,%xmm4,%xmm4
3369 vpaddq %xmm6,%xmm1,%xmm1
3370 vpsrlq $26,%xmm4,%xmm5
3371 vpand %xmm7,%xmm4,%xmm4
3372 vpsrlq $26,%xmm1,%xmm6
3373 vpand %xmm7,%xmm1,%xmm1
3374 vpaddq %xmm6,%xmm2,%xmm2
3375 vpaddd %xmm5,%xmm0,%xmm0
3376 vpsllq $2,%xmm5,%xmm5
3377 vpsrlq $26,%xmm2,%xmm6
3378 vpand %xmm7,%xmm2,%xmm2
3379 vpaddd %xmm5,%xmm0,%xmm0
3380 vpaddd %xmm6,%xmm3,%xmm3
3381 vpsrlq $26,%xmm3,%xmm6
3382 vpsrlq $26,%xmm0,%xmm5
3383 vpand %xmm7,%xmm0,%xmm0
3384 vpand %xmm7,%xmm3,%xmm3
3385 vpaddd %xmm5,%xmm1,%xmm1
3386 vpaddd %xmm6,%xmm4,%xmm4
3387 decl %ecx
3388 jz .L019square_break /* leave after the second pass */
/*
 * Prepare the second pass: low half = result just computed, high half = low
 * half of the limb saved at the top of the loop.
 */
3389 vpunpcklqdq (%esp),%xmm0,%xmm0
3390 vpunpcklqdq 16(%esp),%xmm1,%xmm1
3391 vpunpcklqdq 32(%esp),%xmm2,%xmm2
3392 vpunpcklqdq 48(%esp),%xmm3,%xmm3
3393 vpunpcklqdq 64(%esp),%xmm4,%xmm4
3394 jmp .L018square
3395.L019square_break:
/*
 * Pack results: the second-pass values are shifted into the upper dword of
 * each 64-bit lane and ORed with the values saved at the top of that pass;
 * vpshufd $141 then orders the dwords as source elements 1,3,0,2.
 * NOTE(review): by my reading each stored vector then holds one limb of the
 * 4th, 3rd, 2nd and 1st power of the value read at entry, in that dword
 * order - confirm against the generator script.
 */
3396 vpsllq $32,%xmm0,%xmm0
3397 vpsllq $32,%xmm1,%xmm1
3398 vpsllq $32,%xmm2,%xmm2
3399 vpsllq $32,%xmm3,%xmm3
3400 vpsllq $32,%xmm4,%xmm4
3401 vpor (%esp),%xmm0,%xmm0
3402 vpor 16(%esp),%xmm1,%xmm1
3403 vpor 32(%esp),%xmm2,%xmm2
3404 vpor 48(%esp),%xmm3,%xmm3
3405 vpor 64(%esp),%xmm4,%xmm4
3406 vpshufd $141,%xmm0,%xmm0
3407 vpshufd $141,%xmm1,%xmm1
3408 vpshufd $141,%xmm2,%xmm2
3409 vpshufd $141,%xmm3,%xmm3
3410 vpshufd $141,%xmm4,%xmm4
/* Five limb vectors go to 0..64(%edi); %edi was advanced by 48 at entry. */
3411 vmovdqu %xmm0,(%edi)
3412 vmovdqu %xmm1,16(%edi)
3413 vmovdqu %xmm2,32(%edi)
3414 vmovdqu %xmm3,48(%edi)
3415 vmovdqu %xmm4,64(%edi)
/* The 5x multiples of limb vectors 1..4 follow at 80..128(%edi). */
3416 vpslld $2,%xmm1,%xmm6
3417 vpslld $2,%xmm2,%xmm5
3418 vpaddd %xmm1,%xmm6,%xmm6
3419 vpaddd %xmm2,%xmm5,%xmm5
3420 vmovdqu %xmm6,80(%edi)
3421 vmovdqu %xmm5,96(%edi)
3422 vpslld $2,%xmm3,%xmm6
3423 vpslld $2,%xmm4,%xmm5
3424 vpaddd %xmm3,%xmm6,%xmm6
3425 vpaddd %xmm4,%xmm5,%xmm5
3426 vmovdqu %xmm6,112(%edi)
3427 vmovdqu %xmm5,128(%edi)
3428 movl %ebp,%esp /* drop the aligned scratch area */
3429 leal -48(%edi),%edi /* hand the state pointer back unchanged */
3430 ret
3431.size /* the operands of this directive start the next physical line */
_poly1305_init_avx2,.-_poly1305_init_avx2 /* operands of the .size directive that ends the previous physical line */
3432.align 32
/*
 * Layout note: the leading decimal numbers are listing line numbers that are
 * fused into the text of this file; they are kept unchanged.
 *
 * _poly1305_blocks_avx2 - four stack arguments, read after the four pushes.
 *   arg1, 20(%esp): state pointer -> %edi
 *   arg2, 24(%esp): input pointer -> %esi
 *   arg3, 28(%esp): byte count -> %ecx, rounded down to a multiple of 16
 *   arg4, 32(%esp): read at .L023base2_26, negated, and used only on the
 *                   single-block path to select the constant ORed into the
 *                   top limb
 * State fields touched here: five dword limbs at offsets 0..16, a flag dword
 * at offset 20, and the tables at offsets 48..176 that _poly1305_init_avx2
 * writes.
 * Dispatch:
 *   - rounded count zero: return.
 *   - count below 64 and flag zero: jump to .Lenter_blocks (defined earlier
 *     in the file, after an identical prologue).
 *   - otherwise: AVX2 path; when the flag is zero the tables are built and
 *     the five state dwords are re-split into 26-bit limbs first.
 * %ebp, %ebx, %esi, %edi are saved and restored; vzeroupper is executed on
 * entry to and exit from the AVX2 path.
 */
3433.type _poly1305_blocks_avx2,@function
3434.align 16
3435_poly1305_blocks_avx2:
3436 pushl %ebp
3437 pushl %ebx
3438 pushl %esi
3439 pushl %edi
3440 movl 20(%esp),%edi
3441 movl 24(%esp),%esi
3442 movl 28(%esp),%ecx
3443 movl 20(%edi),%eax /* flag dword of the state */
3444 andl $-16,%ecx
3445 jz .L020nodata
3446 cmpl $64,%ecx
3447 jae .L021enter_avx2
3448 testl %eax,%eax
3449 jz .Lenter_blocks
3450.L021enter_avx2:
3451 vzeroupper
/* Position-independent address of .Lconst_sse2: call/pop yields the current address. */
3452 call .L022pic_point
3453.L022pic_point:
3454 popl %ebx
3455 leal .Lconst_sse2-.L022pic_point(%ebx),%ebx
3456 testl %eax,%eax
3457 jnz .L023base2_26
3458 call _poly1305_init_avx2
/*
 * Re-split the state into 26-bit limbs.  Overlapping dword loads at byte
 * offsets 0, 3, 6, 9, 13 combined with right shifts of 0, 2, 4, 6, 0 pick
 * out the bits starting at 0, 26, 52, 78 and 104.  The fourth value needs no
 * mask because shrl $6 already leaves 26 bits; the fifth is stored unmasked.
 */
3459 movl (%edi),%eax
3460 movl 3(%edi),%ecx
3461 movl 6(%edi),%edx
3462 movl 9(%edi),%esi
3463 movl 13(%edi),%ebp
3464 shrl $2,%ecx
3465 andl $67108863,%eax
3466 shrl $4,%edx
3467 andl $67108863,%ecx
3468 shrl $6,%esi
3469 andl $67108863,%edx
3470 movl %eax,(%edi)
3471 movl %ecx,4(%edi)
3472 movl %edx,8(%edi)
3473 movl %esi,12(%edi)
3474 movl %ebp,16(%edi)
3475 movl $1,20(%edi) /* mark the state as converted */
/*
 * Reload the registers clobbered above.
 * NOTE(review): the count is reloaded without repeating the andl $-16 done at
 * entry, so on this first-use path a count that is not a multiple of 16
 * would reach the block-count logic below unrounded.  Presumably callers
 * always pass a multiple of 16 - confirm.
 */
3476 movl 24(%esp),%esi
3477 movl 28(%esp),%ecx
3478.L023base2_26:
3479 movl 32(%esp),%eax /* arg4 */
3480 movl %esp,%ebp /* %ebp = saved stack pointer for the epilogue */
3481 subl $448,%esp
3482 andl $-512,%esp /* 512-byte aligned frame; vmovdqa on %ymm needs 32-byte alignment */
/*
 * Copy the tables from the state into the frame around %edx = %esp+288.
 * vpermq $64 followed by vpshufd $200 spreads the four dwords of each
 * 16-byte table entry so that one dword sits at the bottom of each 64-bit
 * lane, which is where vpmuludq reads its operand.
 *   -128..0(%edx)   entries from state offsets 48..112
 *   32..128(%edx)   entries from state offsets 128..176 (the 5x multiples)
 */
3483 vmovdqu 48(%edi),%xmm0
3484 leal 288(%esp),%edx
3485 vmovdqu 64(%edi),%xmm1
3486 vmovdqu 80(%edi),%xmm2
3487 vmovdqu 96(%edi),%xmm3
3488 vmovdqu 112(%edi),%xmm4
3489 leal 48(%edi),%edi /* from here on the state limbs are at -48..-32(%edi) */
3490 vpermq $64,%ymm0,%ymm0
3491 vpermq $64,%ymm1,%ymm1
3492 vpermq $64,%ymm2,%ymm2
3493 vpermq $64,%ymm3,%ymm3
3494 vpermq $64,%ymm4,%ymm4
3495 vpshufd $200,%ymm0,%ymm0
3496 vpshufd $200,%ymm1,%ymm1
3497 vpshufd $200,%ymm2,%ymm2
3498 vpshufd $200,%ymm3,%ymm3
3499 vpshufd $200,%ymm4,%ymm4
3500 vmovdqa %ymm0,-128(%edx)
3501 vmovdqu 80(%edi),%xmm0
3502 vmovdqa %ymm1,-96(%edx)
3503 vmovdqu 96(%edi),%xmm1
3504 vmovdqa %ymm2,-64(%edx)
3505 vmovdqu 112(%edi),%xmm2
3506 vmovdqa %ymm3,-32(%edx)
3507 vmovdqu 128(%edi),%xmm3
3508 vmovdqa %ymm4,(%edx)
3509 vpermq $64,%ymm0,%ymm0
3510 vpermq $64,%ymm1,%ymm1
3511 vpermq $64,%ymm2,%ymm2
3512 vpermq $64,%ymm3,%ymm3
3513 vpshufd $200,%ymm0,%ymm0
3514 vpshufd $200,%ymm1,%ymm1
3515 vpshufd $200,%ymm2,%ymm2
3516 vpshufd $200,%ymm3,%ymm3
/* Interleaved with the remaining table stores: load the five state limbs into the low dword of %xmm0..%xmm4. */
3517 vmovdqa %ymm0,32(%edx)
3518 vmovd -48(%edi),%xmm0
3519 vmovdqa %ymm1,64(%edx)
3520 vmovd -44(%edi),%xmm1
3521 vmovdqa %ymm2,96(%edx)
3522 vmovd -40(%edi),%xmm2
3523 vmovdqa %ymm3,128(%edx)
3524 vmovd -36(%edi),%xmm3
3525 vmovd -32(%edi),%xmm4
3526 vmovdqa 64(%ebx),%ymm7 /* 2^26-1 in the low dword of every 64-bit lane */
3527 negl %eax
/*
 * A count that is a multiple of 64 goes straight to the four-block path.
 * Otherwise %edx = count mod 64 (one, two or three 16-byte blocks) is
 * handled first through .L027tail and %ecx keeps the multiple-of-64 rest.
 * Each case advances %esi past the blocks it loaded, moves %ebx so that the
 * 32-byte window read through (%ebx) holds the 1<<24 constant in fewer
 * lanes, and displaces %edx (the table base) by 8, 16 or 24 bytes.
 */
3528 testl $63,%ecx
3529 jz .L024even
3530 movl %ecx,%edx
3531 andl $-64,%ecx
3532 andl $63,%edx
3533 vmovdqu (%esi),%xmm5
3534 cmpl $32,%edx
3535 jb .L025one
3536 vmovdqu 16(%esi),%xmm6
3537 je .L026two
/* three blocks: the third goes into the upper half of %ymm5 */
3538 vinserti128 $1,32(%esi),%ymm5,%ymm5
3539 leal 48(%esi),%esi
3540 leal 8(%ebx),%ebx
3541 leal 296(%esp),%edx
3542 jmp .L027tail
3543.L026two:
3544 leal 32(%esi),%esi
3545 leal 16(%ebx),%ebx
3546 leal 304(%esp),%edx
3547 jmp .L027tail
3548.L025one:
3549 leal 16(%esi),%esi
3550 vpxor %ymm6,%ymm6,%ymm6
/* %eax = -arg4: with arg4 = 1 the window starts at offset 24 (one lane carries 1<<24), with arg4 = 0 at offset 32 (all zero). */
3551 leal 32(%ebx,%eax,8),%ebx
3552 leal 312(%esp),%edx
3553 jmp .L027tail
3554.align 32
3555.L024even:
/* Load 64 bytes: blocks 0 and 1 in the low halves of %ymm5/%ymm6, blocks 2 and 3 in the high halves. */
3556 vmovdqu (%esi),%xmm5
3557 vmovdqu 16(%esi),%xmm6
3558 vinserti128 $1,32(%esi),%ymm5,%ymm5
3559 vinserti128 $1,48(%esi),%ymm6,%ymm6
3560 leal 64(%esi),%esi
3561 subl $64,%ecx
3562 jz .L027tail /* these were the last 64 bytes */
3563.L028loop:
/*
 * Loop body, one iteration per 64 input bytes while more input remains.
 * Step 1: spill accumulator limbs 0..2 and split the four loaded blocks into
 * 26-bit limbs (bit offsets 0, 26, 52, 78, 104).  After this step:
 *   limb0 in %ymm5, limb1 in %ymm6, limb2 in %ymm2, limb3 in %ymm0,
 *   limb4 in %ymm1 (ORed with the constant at (%ebx)).
 */
3564 vmovdqa %ymm2,64(%esp)
3565 vpsrldq $6,%ymm5,%ymm2
3566 vmovdqa %ymm0,(%esp)
3567 vpsrldq $6,%ymm6,%ymm0
3568 vmovdqa %ymm1,32(%esp)
3569 vpunpckhqdq %ymm6,%ymm5,%ymm1
3570 vpunpcklqdq %ymm6,%ymm5,%ymm5
3571 vpunpcklqdq %ymm0,%ymm2,%ymm2
3572 vpsrlq $30,%ymm2,%ymm0
3573 vpsrlq $4,%ymm2,%ymm2
3574 vpsrlq $26,%ymm5,%ymm6
3575 vpsrlq $40,%ymm1,%ymm1
3576 vpand %ymm7,%ymm2,%ymm2
3577 vpand %ymm7,%ymm5,%ymm5
3578 vpand %ymm7,%ymm6,%ymm6
3579 vpand %ymm7,%ymm0,%ymm0
3580 vpor (%ebx),%ymm1,%ymm1
/* Step 2: add the running accumulator to the freshly split input. */
3581 vpaddq 64(%esp),%ymm2,%ymm2
3582 vpaddq (%esp),%ymm5,%ymm5
3583 vpaddq 32(%esp),%ymm6,%ymm6
3584 vpaddq %ymm3,%ymm0,%ymm0
3585 vpaddq %ymm4,%ymm1,%ymm1
/*
 * Step 3: multiply by the table at %edx.  25 vpmuludq products are summed
 * into %ymm0..%ymm4; the entries at 32..128(%edx) are the 5x multiples.
 * Limbs 1, 3 and 4 of the left operand are staged in 32/96/128(%esp).
 */
3586 vpmuludq -96(%edx),%ymm2,%ymm3
3587 vmovdqa %ymm6,32(%esp)
3588 vpmuludq -64(%edx),%ymm2,%ymm4
3589 vmovdqa %ymm0,96(%esp)
3590 vpmuludq 96(%edx),%ymm2,%ymm0
3591 vmovdqa %ymm1,128(%esp)
3592 vpmuludq 128(%edx),%ymm2,%ymm1
3593 vpmuludq -128(%edx),%ymm2,%ymm2
3594 vpmuludq -32(%edx),%ymm5,%ymm7
3595 vpaddq %ymm7,%ymm3,%ymm3
3596 vpmuludq (%edx),%ymm5,%ymm6
3597 vpaddq %ymm6,%ymm4,%ymm4
3598 vpmuludq -128(%edx),%ymm5,%ymm7
3599 vpaddq %ymm7,%ymm0,%ymm0
3600 vmovdqa 32(%esp),%ymm7
3601 vpmuludq -96(%edx),%ymm5,%ymm6
3602 vpaddq %ymm6,%ymm1,%ymm1
3603 vpmuludq -64(%edx),%ymm5,%ymm5
3604 vpaddq %ymm5,%ymm2,%ymm2
3605 vpmuludq -64(%edx),%ymm7,%ymm6
3606 vpaddq %ymm6,%ymm3,%ymm3
3607 vpmuludq -32(%edx),%ymm7,%ymm5
3608 vpaddq %ymm5,%ymm4,%ymm4
3609 vpmuludq 128(%edx),%ymm7,%ymm6
3610 vpaddq %ymm6,%ymm0,%ymm0
3611 vmovdqa 96(%esp),%ymm6
3612 vpmuludq -128(%edx),%ymm7,%ymm5
3613 vpaddq %ymm5,%ymm1,%ymm1
3614 vpmuludq -96(%edx),%ymm7,%ymm7
3615 vpaddq %ymm7,%ymm2,%ymm2
3616 vpmuludq -128(%edx),%ymm6,%ymm5
3617 vpaddq %ymm5,%ymm3,%ymm3
3618 vpmuludq -96(%edx),%ymm6,%ymm7
3619 vpaddq %ymm7,%ymm4,%ymm4
3620 vpmuludq 64(%edx),%ymm6,%ymm5
3621 vpaddq %ymm5,%ymm0,%ymm0
3622 vmovdqa 128(%esp),%ymm5
3623 vpmuludq 96(%edx),%ymm6,%ymm7
3624 vpaddq %ymm7,%ymm1,%ymm1
3625 vpmuludq 128(%edx),%ymm6,%ymm6
3626 vpaddq %ymm6,%ymm2,%ymm2
3627 vpmuludq 128(%edx),%ymm5,%ymm7
3628 vpaddq %ymm7,%ymm3,%ymm3
3629 vpmuludq 32(%edx),%ymm5,%ymm6
3630 vpaddq %ymm6,%ymm0,%ymm0
3631 vpmuludq -128(%edx),%ymm5,%ymm7
3632 vpaddq %ymm7,%ymm4,%ymm4
3633 vmovdqa 64(%ebx),%ymm7 /* reload the mask; %ymm7 served as a temporary */
3634 vpmuludq 64(%edx),%ymm5,%ymm6
3635 vpaddq %ymm6,%ymm1,%ymm1
3636 vpmuludq 96(%edx),%ymm5,%ymm5
3637 vpaddq %ymm5,%ymm2,%ymm2
/*
 * Step 4: carry propagation.  Bits above bit 25 move to the next limb; the
 * carry out of limb 4 is added to limb 0 once and once more shifted left by
 * 2, i.e. five times in total.
 */
3638 vpsrlq $26,%ymm3,%ymm5
3639 vpand %ymm7,%ymm3,%ymm3
3640 vpsrlq $26,%ymm0,%ymm6
3641 vpand %ymm7,%ymm0,%ymm0
3642 vpaddq %ymm5,%ymm4,%ymm4
3643 vpaddq %ymm6,%ymm1,%ymm1
3644 vpsrlq $26,%ymm4,%ymm5
3645 vpand %ymm7,%ymm4,%ymm4
3646 vpsrlq $26,%ymm1,%ymm6
3647 vpand %ymm7,%ymm1,%ymm1
3648 vpaddq %ymm6,%ymm2,%ymm2
3649 vpaddq %ymm5,%ymm0,%ymm0
3650 vpsllq $2,%ymm5,%ymm5
3651 vpsrlq $26,%ymm2,%ymm6
3652 vpand %ymm7,%ymm2,%ymm2
3653 vpaddq %ymm5,%ymm0,%ymm0
3654 vpaddq %ymm6,%ymm3,%ymm3
3655 vpsrlq $26,%ymm3,%ymm6
3656 vpsrlq $26,%ymm0,%ymm5
3657 vpand %ymm7,%ymm0,%ymm0
3658 vpand %ymm7,%ymm3,%ymm3
3659 vpaddq %ymm5,%ymm1,%ymm1
3660 vpaddq %ymm6,%ymm4,%ymm4
/* Step 5: fetch the next 64 bytes and loop while input remains after them. */
3661 vmovdqu (%esi),%xmm5
3662 vmovdqu 16(%esi),%xmm6
3663 vinserti128 $1,32(%esi),%ymm5,%ymm5
3664 vinserti128 $1,48(%esi),%ymm6,%ymm6
3665 leal 64(%esi),%esi
3666 subl $64,%ecx
3667 jnz .L028loop
3668.L027tail:
/*
 * Final group.  Same split/add/multiply sequence as .L028loop, except that
 *   - %ebx is rounded back down to the 64-byte aligned table base right
 *     after the (%ebx) constant has been used, and
 *   - every table displacement is 4 bytes larger than in the loop
 *     (NOTE(review): presumably this selects a different power in each lane
 *     so that the lanes can be summed afterwards - confirm against the
 *     generator script).
 */
3669 vmovdqa %ymm2,64(%esp)
3670 vpsrldq $6,%ymm5,%ymm2
3671 vmovdqa %ymm0,(%esp)
3672 vpsrldq $6,%ymm6,%ymm0
3673 vmovdqa %ymm1,32(%esp)
3674 vpunpckhqdq %ymm6,%ymm5,%ymm1
3675 vpunpcklqdq %ymm6,%ymm5,%ymm5
3676 vpunpcklqdq %ymm0,%ymm2,%ymm2
3677 vpsrlq $30,%ymm2,%ymm0
3678 vpsrlq $4,%ymm2,%ymm2
3679 vpsrlq $26,%ymm5,%ymm6
3680 vpsrlq $40,%ymm1,%ymm1
3681 vpand %ymm7,%ymm2,%ymm2
3682 vpand %ymm7,%ymm5,%ymm5
3683 vpand %ymm7,%ymm6,%ymm6
3684 vpand %ymm7,%ymm0,%ymm0
3685 vpor (%ebx),%ymm1,%ymm1
3686 andl $-64,%ebx
3687 vpaddq 64(%esp),%ymm2,%ymm2
3688 vpaddq (%esp),%ymm5,%ymm5
3689 vpaddq 32(%esp),%ymm6,%ymm6
3690 vpaddq %ymm3,%ymm0,%ymm0
3691 vpaddq %ymm4,%ymm1,%ymm1
3692 vpmuludq -92(%edx),%ymm2,%ymm3
3693 vmovdqa %ymm6,32(%esp)
3694 vpmuludq -60(%edx),%ymm2,%ymm4
3695 vmovdqa %ymm0,96(%esp)
3696 vpmuludq 100(%edx),%ymm2,%ymm0
3697 vmovdqa %ymm1,128(%esp)
3698 vpmuludq 132(%edx),%ymm2,%ymm1
3699 vpmuludq -124(%edx),%ymm2,%ymm2
3700 vpmuludq -28(%edx),%ymm5,%ymm7
3701 vpaddq %ymm7,%ymm3,%ymm3
3702 vpmuludq 4(%edx),%ymm5,%ymm6
3703 vpaddq %ymm6,%ymm4,%ymm4
3704 vpmuludq -124(%edx),%ymm5,%ymm7
3705 vpaddq %ymm7,%ymm0,%ymm0
3706 vmovdqa 32(%esp),%ymm7
3707 vpmuludq -92(%edx),%ymm5,%ymm6
3708 vpaddq %ymm6,%ymm1,%ymm1
3709 vpmuludq -60(%edx),%ymm5,%ymm5
3710 vpaddq %ymm5,%ymm2,%ymm2
3711 vpmuludq -60(%edx),%ymm7,%ymm6
3712 vpaddq %ymm6,%ymm3,%ymm3
3713 vpmuludq -28(%edx),%ymm7,%ymm5
3714 vpaddq %ymm5,%ymm4,%ymm4
3715 vpmuludq 132(%edx),%ymm7,%ymm6
3716 vpaddq %ymm6,%ymm0,%ymm0
3717 vmovdqa 96(%esp),%ymm6
3718 vpmuludq -124(%edx),%ymm7,%ymm5
3719 vpaddq %ymm5,%ymm1,%ymm1
3720 vpmuludq -92(%edx),%ymm7,%ymm7
3721 vpaddq %ymm7,%ymm2,%ymm2
3722 vpmuludq -124(%edx),%ymm6,%ymm5
3723 vpaddq %ymm5,%ymm3,%ymm3
3724 vpmuludq -92(%edx),%ymm6,%ymm7
3725 vpaddq %ymm7,%ymm4,%ymm4
3726 vpmuludq 68(%edx),%ymm6,%ymm5
3727 vpaddq %ymm5,%ymm0,%ymm0
3728 vmovdqa 128(%esp),%ymm5
3729 vpmuludq 100(%edx),%ymm6,%ymm7
3730 vpaddq %ymm7,%ymm1,%ymm1
3731 vpmuludq 132(%edx),%ymm6,%ymm6
3732 vpaddq %ymm6,%ymm2,%ymm2
3733 vpmuludq 132(%edx),%ymm5,%ymm7
3734 vpaddq %ymm7,%ymm3,%ymm3
3735 vpmuludq 36(%edx),%ymm5,%ymm6
3736 vpaddq %ymm6,%ymm0,%ymm0
3737 vpmuludq -124(%edx),%ymm5,%ymm7
3738 vpaddq %ymm7,%ymm4,%ymm4
3739 vmovdqa 64(%ebx),%ymm7
3740 vpmuludq 68(%edx),%ymm5,%ymm6
3741 vpaddq %ymm6,%ymm1,%ymm1
3742 vpmuludq 100(%edx),%ymm5,%ymm5
3743 vpaddq %ymm5,%ymm2,%ymm2
/*
 * Horizontal sum of the four 64-bit lanes of each accumulator:
 * vpsrldq $8 adds the upper quadword of each 128-bit half to its lower
 * quadword, then vpermq $2 brings quadword 2 down to quadword 0 for the
 * final add.  Lane 0 ends up holding the total.
 */
3744 vpsrldq $8,%ymm4,%ymm5
3745 vpsrldq $8,%ymm3,%ymm6
3746 vpaddq %ymm5,%ymm4,%ymm4
3747 vpsrldq $8,%ymm0,%ymm5
3748 vpaddq %ymm6,%ymm3,%ymm3
3749 vpsrldq $8,%ymm1,%ymm6
3750 vpaddq %ymm5,%ymm0,%ymm0
3751 vpsrldq $8,%ymm2,%ymm5
3752 vpaddq %ymm6,%ymm1,%ymm1
3753 vpermq $2,%ymm4,%ymm6
3754 vpaddq %ymm5,%ymm2,%ymm2
3755 vpermq $2,%ymm3,%ymm5
3756 vpaddq %ymm6,%ymm4,%ymm4
3757 vpermq $2,%ymm0,%ymm6
3758 vpaddq %ymm5,%ymm3,%ymm3
3759 vpermq $2,%ymm1,%ymm5
3760 vpaddq %ymm6,%ymm0,%ymm0
3761 vpermq $2,%ymm2,%ymm6
3762 vpaddq %ymm5,%ymm1,%ymm1
3763 vpaddq %ymm6,%ymm2,%ymm2
/* Carry propagation, same scheme as in the loop. */
3764 vpsrlq $26,%ymm3,%ymm5
3765 vpand %ymm7,%ymm3,%ymm3
3766 vpsrlq $26,%ymm0,%ymm6
3767 vpand %ymm7,%ymm0,%ymm0
3768 vpaddq %ymm5,%ymm4,%ymm4
3769 vpaddq %ymm6,%ymm1,%ymm1
3770 vpsrlq $26,%ymm4,%ymm5
3771 vpand %ymm7,%ymm4,%ymm4
3772 vpsrlq $26,%ymm1,%ymm6
3773 vpand %ymm7,%ymm1,%ymm1
3774 vpaddq %ymm6,%ymm2,%ymm2
3775 vpaddq %ymm5,%ymm0,%ymm0
3776 vpsllq $2,%ymm5,%ymm5
3777 vpsrlq $26,%ymm2,%ymm6
3778 vpand %ymm7,%ymm2,%ymm2
3779 vpaddq %ymm5,%ymm0,%ymm0
3780 vpaddq %ymm6,%ymm3,%ymm3
3781 vpsrlq $26,%ymm3,%ymm6
3782 vpsrlq $26,%ymm0,%ymm5
3783 vpand %ymm7,%ymm0,%ymm0
3784 vpand %ymm7,%ymm3,%ymm3
3785 vpaddq %ymm5,%ymm1,%ymm1
3786 vpaddq %ymm6,%ymm4,%ymm4
3787 cmpl $0,%ecx
3788 je .L029done
/*
 * More 64-byte groups follow a partial first group: keep dword 0 of each
 * accumulator and fill the other three dwords from dword 3 (expected to be
 * zero at this point - TODO confirm), reset the table base, and continue
 * with the four-block path.
 */
3789 vpshufd $252,%xmm0,%xmm0
3790 leal 288(%esp),%edx
3791 vpshufd $252,%xmm1,%xmm1
3792 vpshufd $252,%xmm2,%xmm2
3793 vpshufd $252,%xmm3,%xmm3
3794 vpshufd $252,%xmm4,%xmm4
3795 jmp .L024even
3796.align 16
3797.L029done:
/* Store the five limbs back to state offsets 0..16 (%edi is state+48 here). */
3798 vmovd %xmm0,-48(%edi)
3799 vmovd %xmm1,-44(%edi)
3800 vmovd %xmm2,-40(%edi)
3801 vmovd %xmm3,-36(%edi)
3802 vmovd %xmm4,-32(%edi)
3803 vzeroupper
3804 movl %ebp,%esp
3805.L020nodata:
3806 popl %edi
3807 popl %esi
3808 popl %ebx
3809 popl %ebp
3810 ret
3811.size _poly1305_blocks_avx2,.-_poly1305_blocks_avx2
/*
 * Constant table, 64-byte aligned (the code relies on this when it masks
 * %ebx with -64).
 *   offset  0: 1<<24 in the low dword of each of four 64-bit lanes
 *   offset 32: 32 zero bytes, so a window starting 8..32 bytes into the
 *              table carries the 1<<24 constant in fewer lanes
 *   offset 64: 2^26-1 in the low dword of each 64-bit lane (limb mask)
 *   offset 96: 0x0fffffff, 0x0ffffffc, 0x0ffffffc, 0x0ffffffc; not
 *              referenced by the code in this part of the file
 * The .byte rows spell the NUL-terminated ASCII string
 * "Poly1305 for x86, CRYPTOGAMS by" followed by an e-mail address in angle
 * brackets.
 */
3812.align 64
3813.Lconst_sse2:
3814.long 16777216,0,16777216,0,16777216,0,16777216,0
3815.long 0,0,0,0,0,0,0,0
3816.long 67108863,0,67108863,0,67108863,0,67108863,0
3817.long 268435455,268435452,268435452,268435452
3818.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
3819.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
3820.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
3821.byte 114,103,62,0
3822.align 4
/* 16-byte, 4-byte aligned common symbol; the init routine at the top of the file reads it. */
3823.comm OPENSSL_ia32cap_P,16,4
/* closes the #ifdef PIC at the top of the file */
3824#endif
3825