/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from poly1305-x86.pl. */
/*
 * Syntax: GNU as, AT&T operand order, 32-bit x86 (i386), arguments on the
 * stack.  Every routine below pushes %ebp/%ebx/%esi/%edi on entry, so the
 * first stack argument is found at 20(%esp).
 *
 * Context layout as used by the code in this file (byte offsets from the
 * first argument):
 *    0..19   accumulator, five 32-bit words
 *   20..23   flag word: zero = accumulator words are plain base 2^32,
 *            non-zero = words are 26-bit limbs (see .L010base2_26)
 *   24..39   four masked key words written by poly1305_init
 *   48..191  table written by _poly1305_init_sse2/_poly1305_init_avx2
 *
 * NOTE(review): this file differs from generator output in two places, both
 * marked "NOTE(review)" below (length masking).  The same change has to be
 * applied to poly1305-x86.pl or regeneration will revert it.
 */
#ifdef PIC
.text
.align	64
/*
 * poly1305_init(arg1, arg2, arg3)
 *   arg1 (20(%esp)): context; its first 24 bytes are zeroed.
 *   arg2 (24(%esp)): 16 key bytes, may be NULL.
 *   arg3 (28(%esp)): two-pointer array that receives the addresses of the
 *                    selected "blocks" and "emit" routines.
 * Returns %eax = 0 when arg2 is NULL (nothing but the zeroing is done),
 * otherwise %eax = 1.
 */
.globl	poly1305_init
.type	poly1305_init,@function
.align	16
poly1305_init:
.L_poly1305_init_begin:
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ebp
	/* Clear accumulator words and the flag word; %eax doubles as the
	 * "no key" return value. */
	xorl	%eax,%eax
	movl	%eax,(%edi)
	movl	%eax,4(%edi)
	movl	%eax,8(%edi)
	movl	%eax,12(%edi)
	movl	%eax,16(%edi)
	movl	%eax,20(%edi)
	cmpl	$0,%esi
	je	.L000nokey
	/* PIC: call/pop yields the address of .L001pic_point in %ebx. */
	call	.L001pic_point
.L001pic_point:
	popl	%ebx
	/* Default to the scalar routines. */
	leal	poly1305_blocks-.L001pic_point(%ebx),%eax
	leal	poly1305_emit-.L001pic_point(%ebx),%edx
	leal	OPENSSL_ia32cap_P-.L001pic_point(%ebx),%edi
	/* 83886080 = 0x05000000: both bit 24 and bit 26 of capability word 0
	 * must be set to take the SSE2 routines (per the label name). */
	movl	(%edi),%ecx
	andl	$83886080,%ecx
	cmpl	$83886080,%ecx
	jne	.L002no_sse2
	leal	_poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
	leal	_poly1305_emit_sse2-.L001pic_point(%ebx),%edx
	/* Bit 5 of capability word 2 additionally selects the AVX2 blocks
	 * routine; the emit routine stays the SSE2 one. */
	movl	8(%edi),%ecx
	testl	$32,%ecx
	jz	.L002no_sse2
	leal	_poly1305_blocks_avx2-.L001pic_point(%ebx),%eax
.L002no_sse2:
	movl	20(%esp),%edi
	movl	%eax,(%ebp)
	movl	%edx,4(%ebp)
	/* Load the 16 key bytes and mask them: 0x0fffffff for word 0 and
	 * 0x0ffffffc for words 1..3 (the Poly1305 "clamp" of r). */
	movl	(%esi),%eax
	movl	4(%esi),%ebx
	movl	8(%esi),%ecx
	movl	12(%esi),%edx
	andl	$268435455,%eax
	andl	$268435452,%ebx
	andl	$268435452,%ecx
	andl	$268435452,%edx
	movl	%eax,24(%edi)
	movl	%ebx,28(%edi)
	movl	%ecx,32(%edi)
	movl	%edx,36(%edi)
	movl	$1,%eax
.L000nokey:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	poly1305_init,.-.L_poly1305_init_begin
/*
 * poly1305_blocks(arg1, arg2, arg3, arg4) - scalar block routine
 *   arg1 (20(%esp)): context.
 *   arg2 (24(%esp)): input pointer.
 *   arg3 (28(%esp)): input length in bytes; only whole 16-byte blocks are
 *                    consumed.
 *   arg4 (32(%esp)): value added into the fifth accumulator word together
 *                    with every block.
 * The argument-3 stack slot is overwritten with the end-of-input pointer.
 *
 * .Lenter_blocks is also a jump target from _poly1305_blocks_sse2, which
 * arrives with the same four registers pushed and %edi/%esi/%ecx loaded.
 *
 * 64-byte frame:  0,12,16  accumulator words 0, 3, 4 for the current block
 *                 20..28   low three product words
 *                 36..48   key words r0..r3
 *                 52..60   r1..r3 each as r + (r >> 2)
 */
.globl	poly1305_blocks
.type	poly1305_blocks,@function
.align	16
poly1305_blocks:
.L_poly1305_blocks_begin:
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
.Lenter_blocks:
	/* NOTE(review): generator output has $-15 here.  That mask keeps
	 * bit 0, so an odd length produced an end pointer the 16-byte-stride
	 * loop below (terminated by jne) could never hit.  -16 truncates to
	 * whole blocks, matching _poly1305_blocks_sse2; multiples of 16 are
	 * unaffected. */
	andl	$-16,%ecx
	jz	.L003nodata
	subl	$64,%esp
	movl	24(%edi),%eax
	movl	28(%edi),%ebx
	leal	(%esi,%ecx,1),%ebp
	movl	32(%edi),%ecx
	movl	36(%edi),%edx
	/* 92(%esp) is the argument-3 slot (28 + 64): park the end pointer. */
	movl	%ebp,92(%esp)
	movl	%esi,%ebp
	movl	%eax,36(%esp)
	/* r + (r >> 2) for r1..r3; poly1305_init cleared their low two bits. */
	movl	%ebx,%eax
	shrl	$2,%eax
	movl	%ebx,40(%esp)
	addl	%ebx,%eax
	movl	%ecx,%ebx
	shrl	$2,%ebx
	movl	%ecx,44(%esp)
	addl	%ecx,%ebx
	movl	%edx,%ecx
	shrl	$2,%ecx
	movl	%edx,48(%esp)
	addl	%edx,%ecx
	movl	%eax,52(%esp)
	movl	%ebx,56(%esp)
	movl	%ecx,60(%esp)
	/* Accumulator lives in %eax,%ebx,%ecx,%esi,%edi across the loop. */
	movl	(%edi),%eax
	movl	4(%edi),%ebx
	movl	8(%edi),%ecx
	movl	12(%edi),%esi
	movl	16(%edi),%edi
	jmp	.L004loop
.align	32
.L004loop:
	/* accumulator += 16 input bytes, arg4 (96(%esp) = 32 + 64) goes into
	 * the fifth word along with the carry. */
	addl	(%ebp),%eax
	adcl	4(%ebp),%ebx
	adcl	8(%ebp),%ecx
	adcl	12(%ebp),%esi
	leal	16(%ebp),%ebp
	adcl	96(%esp),%edi
	movl	%eax,(%esp)
	movl	%esi,12(%esp)
	/* Product word 0. */
	mull	36(%esp)
	movl	%edi,16(%esp)
	movl	%eax,%edi
	movl	%ebx,%eax
	movl	%edx,%esi
	mull	60(%esp)
	addl	%eax,%edi
	movl	%ecx,%eax
	adcl	%edx,%esi
	mull	56(%esp)
	addl	%eax,%edi
	movl	12(%esp),%eax
	adcl	%edx,%esi
	mull	52(%esp)
	addl	%eax,%edi
	movl	(%esp),%eax
	adcl	%edx,%esi
	/* Product word 1. */
	mull	40(%esp)
	movl	%edi,20(%esp)
	xorl	%edi,%edi
	addl	%eax,%esi
	movl	%ebx,%eax
	adcl	%edx,%edi
	mull	36(%esp)
	addl	%eax,%esi
	movl	%ecx,%eax
	adcl	%edx,%edi
	mull	60(%esp)
	addl	%eax,%esi
	movl	12(%esp),%eax
	adcl	%edx,%edi
	mull	56(%esp)
	addl	%eax,%esi
	movl	16(%esp),%eax
	adcl	%edx,%edi
	imull	52(%esp),%eax
	addl	%eax,%esi
	movl	(%esp),%eax
	adcl	$0,%edi
	/* Product word 2. */
	mull	44(%esp)
	movl	%esi,24(%esp)
	xorl	%esi,%esi
	addl	%eax,%edi
	movl	%ebx,%eax
	adcl	%edx,%esi
	mull	40(%esp)
	addl	%eax,%edi
	movl	%ecx,%eax
	adcl	%edx,%esi
	mull	36(%esp)
	addl	%eax,%edi
	movl	12(%esp),%eax
	adcl	%edx,%esi
	mull	60(%esp)
	addl	%eax,%edi
	movl	16(%esp),%eax
	adcl	%edx,%esi
	imull	56(%esp),%eax
	addl	%eax,%edi
	movl	(%esp),%eax
	adcl	$0,%esi
	/* Product words 3 and 4. */
	mull	48(%esp)
	movl	%edi,28(%esp)
	xorl	%edi,%edi
	addl	%eax,%esi
	movl	%ebx,%eax
	adcl	%edx,%edi
	mull	44(%esp)
	addl	%eax,%esi
	movl	%ecx,%eax
	adcl	%edx,%edi
	mull	40(%esp)
	addl	%eax,%esi
	movl	12(%esp),%eax
	adcl	%edx,%edi
	mull	36(%esp)
	addl	%eax,%esi
	movl	16(%esp),%ecx
	adcl	%edx,%edi
	movl	%ecx,%edx
	imull	60(%esp),%ecx
	addl	%ecx,%esi
	movl	20(%esp),%eax
	adcl	$0,%edi
	imull	36(%esp),%edx
	addl	%edi,%edx
	movl	24(%esp),%ebx
	movl	28(%esp),%ecx
	/* Partial reduction: keep the low two bits of the top word, multiply
	 * the rest (top >> 2) by 5 and add it back into word 0. */
	movl	%edx,%edi
	shrl	$2,%edx
	andl	$3,%edi
	leal	(%edx,%edx,4),%edx
	addl	%edx,%eax
	adcl	$0,%ebx
	adcl	$0,%ecx
	adcl	$0,%esi
	adcl	$0,%edi
	cmpl	92(%esp),%ebp
	jne	.L004loop
	/* 84(%esp) is the argument-1 slot (20 + 64): store the accumulator. */
	movl	84(%esp),%edx
	addl	$64,%esp
	movl	%eax,(%edx)
	movl	%ebx,4(%edx)
	movl	%ecx,8(%edx)
	movl	%esi,12(%edx)
	movl	%edi,16(%edx)
.L003nodata:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	poly1305_blocks,.-.L_poly1305_blocks_begin
/*
 * poly1305_emit(arg1, arg2, arg3) - scalar finalisation
 *   arg1 (20(%esp)): context (five accumulator words are read).
 *   arg2 (24(%esp)): 16-byte output buffer.
 *   arg3 (28(%esp)): four 32-bit words added to the result before output.
 *
 * .Lenter_emit is also a jump target from _poly1305_emit_sse2, which
 * arrives with the same four registers pushed and %ebp = arg1.
 */
.globl	poly1305_emit
.type	poly1305_emit,@function
.align	16
poly1305_emit:
.L_poly1305_emit_begin:
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%ebp
.Lenter_emit:
	movl	24(%esp),%edi
	movl	(%ebp),%eax
	movl	4(%ebp),%ebx
	movl	8(%ebp),%ecx
	movl	12(%ebp),%edx
	movl	16(%ebp),%esi
	/* Compute accumulator + 5; bit 2 of the fifth word then tells whether
	 * the sum reached 2^130. */
	addl	$5,%eax
	adcl	$0,%ebx
	adcl	$0,%ecx
	adcl	$0,%edx
	adcl	$0,%esi
	shrl	$2,%esi
	/* %esi = 0 - (word4 >> 2), used as a select mask (all ones when the
	 * shifted value is 1). */
	negl	%esi
	andl	%esi,%eax
	andl	%esi,%ebx
	andl	%esi,%ecx
	andl	%esi,%edx
	/* The output buffer is used as scratch for the masked "+5" value. */
	movl	%eax,(%edi)
	movl	%ebx,4(%edi)
	movl	%ecx,8(%edi)
	movl	%edx,12(%edi)
	notl	%esi
	movl	(%ebp),%eax
	movl	4(%ebp),%ebx
	movl	8(%ebp),%ecx
	movl	12(%ebp),%edx
	movl	28(%esp),%ebp
	andl	%esi,%eax
	andl	%esi,%ebx
	andl	%esi,%ecx
	andl	%esi,%edx
	/* Branch-free select between the original and the "+5" value. */
	orl	(%edi),%eax
	orl	4(%edi),%ebx
	orl	8(%edi),%ecx
	orl	12(%edi),%edx
	/* Add the four words at arg3 with carry and write the 16 bytes. */
	addl	(%ebp),%eax
	adcl	4(%ebp),%ebx
	adcl	8(%ebp),%ecx
	adcl	12(%ebp),%edx
	movl	%eax,(%edi)
	movl	%ebx,4(%edi)
	movl	%ecx,8(%edi)
	movl	%edx,12(%edi)
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	poly1305_emit,.-.L_poly1305_emit_begin
.align	32
/*
 * _poly1305_init_sse2 - file-local helper, register calling convention.
 *   In:   %edi = context, %ebx = base of the constant block
 *         (.Lconst_sse2 in the caller); 64(%ebx) is used as the limb mask.
 *   Out:  nine 16-byte vectors at context+48 .. context+191: five limb
 *         vectors followed by limbs 1..4 multiplied by 5 ((x << 2) + x).
 *         %edi is restored to the context pointer on return.
 *   Clobbers: %ebp (holds the caller's %esp), %ecx, %edx, %xmm0-%xmm7,
 *         flags.  Uses a 16-byte-aligned 224-byte scratch frame.
 * The key words at context+24 are split into five limbs at bit offsets
 * 0, 26, 52, 78 and 104, and the multiply/reduce body below runs twice.
 * NOTE(review): the label names suggest each pass squares the value to
 * build successive powers of r - confirm against the generator script.
 */
.type	_poly1305_init_sse2,@function
.align	16
_poly1305_init_sse2:
	movdqu	24(%edi),%xmm4
	leal	48(%edi),%edi
	movl	%esp,%ebp
	subl	$224,%esp
	andl	$-16,%esp
	movq	64(%ebx),%xmm7
	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	movdqa	%xmm4,%xmm2
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm1
	psrldq	$6,%xmm2
	pand	%xmm7,%xmm1
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	psrldq	$13,%xmm4
	leal	144(%esp),%edx
	movl	$2,%ecx
.L005square:
	/* Save the five limbs at 0..64(%esp) and limbs 1..4 times 5 at
	 * 80..128(%esp). */
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	movdqa	%xmm1,%xmm6
	movdqa	%xmm2,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm1,%xmm6
	paddd	%xmm2,%xmm5
	movdqa	%xmm6,80(%esp)
	movdqa	%xmm5,96(%esp)
	movdqa	%xmm3,%xmm6
	movdqa	%xmm4,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm3,%xmm6
	paddd	%xmm4,%xmm5
	movdqa	%xmm6,112(%esp)
	movdqa	%xmm5,128(%esp)
	/* pshufd $68 replicates the low quadword into both halves; the
	 * replicated limbs are kept at 0..64(%edx) as the second operand. */
	pshufd	$68,%xmm0,%xmm6
	movdqa	%xmm1,%xmm5
	pshufd	$68,%xmm1,%xmm1
	pshufd	$68,%xmm2,%xmm2
	pshufd	$68,%xmm3,%xmm3
	pshufd	$68,%xmm4,%xmm4
	movdqa	%xmm6,(%edx)
	movdqa	%xmm1,16(%edx)
	movdqa	%xmm2,32(%edx)
	movdqa	%xmm3,48(%edx)
	movdqa	%xmm4,64(%edx)
	/* 5x5 limb multiplication, accumulated into %xmm0..%xmm4. */
	pmuludq	%xmm0,%xmm4
	pmuludq	%xmm0,%xmm3
	pmuludq	%xmm0,%xmm2
	pmuludq	%xmm0,%xmm1
	pmuludq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	48(%edx),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	80(%esp),%xmm6
	pmuludq	(%edx),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%edx),%xmm6
	movdqa	32(%esp),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%edx),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%edx),%xmm5
	paddq	%xmm7,%xmm4
	movdqa	96(%esp),%xmm7
	pmuludq	(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%edx),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%edx),%xmm5
	movdqa	48(%esp),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%edx),%xmm6
	paddq	%xmm5,%xmm0
	movdqa	112(%esp),%xmm5
	pmuludq	(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%edx),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%edx),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%edx),%xmm7
	movdqa	64(%esp),%xmm5
	paddq	%xmm6,%xmm1
	movdqa	128(%esp),%xmm6
	pmuludq	(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%edx),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%edx),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* Carry propagation: each limb is masked with %xmm7 and its bits above
	 * 26 are added to the next limb; the carry out of limb 4 is added to
	 * limb 0 five times (c + (c << 2)). */
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
	decl	%ecx
	jz	.L006square_break
	/* Second pass: pair the new result with the saved input limbs. */
	punpcklqdq	(%esp),%xmm0
	punpcklqdq	16(%esp),%xmm1
	punpcklqdq	32(%esp),%xmm2
	punpcklqdq	48(%esp),%xmm3
	punpcklqdq	64(%esp),%xmm4
	jmp	.L005square
.L006square_break:
	/* Interleave the final result with the saved limbs as 32-bit lanes
	 * and write the table. */
	psllq	$32,%xmm0
	psllq	$32,%xmm1
	psllq	$32,%xmm2
	psllq	$32,%xmm3
	psllq	$32,%xmm4
	por	(%esp),%xmm0
	por	16(%esp),%xmm1
	por	32(%esp),%xmm2
	por	48(%esp),%xmm3
	por	64(%esp),%xmm4
	pshufd	$141,%xmm0,%xmm0
	pshufd	$141,%xmm1,%xmm1
	pshufd	$141,%xmm2,%xmm2
	pshufd	$141,%xmm3,%xmm3
	pshufd	$141,%xmm4,%xmm4
	movdqu	%xmm0,(%edi)
	movdqu	%xmm1,16(%edi)
	movdqu	%xmm2,32(%edi)
	movdqu	%xmm3,48(%edi)
	movdqu	%xmm4,64(%edi)
	/* Limbs 1..4 multiplied by 5 follow at table offsets 80..128. */
	movdqa	%xmm1,%xmm6
	movdqa	%xmm2,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm1,%xmm6
	paddd	%xmm2,%xmm5
	movdqu	%xmm6,80(%edi)
	movdqu	%xmm5,96(%edi)
	movdqa	%xmm3,%xmm6
	movdqa	%xmm4,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm3,%xmm6
	paddd	%xmm4,%xmm5
	movdqu	%xmm6,112(%edi)
	movdqu	%xmm5,128(%edi)
	movl	%ebp,%esp
	leal	-48(%edi),%edi
	ret
.size	_poly1305_init_sse2,.-_poly1305_init_sse2
.align	32
/*
 * _poly1305_blocks_sse2(arg1, arg2, arg3, arg4) - SSE2 block routine
 * Same stack arguments as poly1305_blocks; installed by poly1305_init.
 *
 *   - length (masked to whole blocks) == 0: return immediately.
 *   - length < 64 and flag word 20(ctx) == 0: hand over to the scalar
 *     routine at .Lenter_blocks.
 *   - otherwise: on first use (flag == 0) build the table with
 *     _poly1305_init_sse2, convert the accumulator to 26-bit limbs and set
 *     the flag to 1; then process the input with SIMD code.
 *
 * Register roles in the SIMD part: %ebx = .Lconst_sse2 (defined outside
 * this view; (%ebx) is OR-ed into the top limb, 64(%ebx) is the limb
 * mask), %ebp = saved %esp, %edi = context + 48, %esi = input,
 * %ecx = remaining length, %xmm0..%xmm4 = accumulator limbs.
 * Frame: 528 bytes, 16-byte aligned.
 */
.type	_poly1305_blocks_sse2,@function
.align	16
_poly1305_blocks_sse2:
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
	movl	20(%edi),%eax
	andl	$-16,%ecx
	jz	.L007nodata
	cmpl	$64,%ecx
	jae	.L008enter_sse2
	testl	%eax,%eax
	jz	.Lenter_blocks
.align	16
.L008enter_sse2:
	call	.L009pic_point
.L009pic_point:
	popl	%ebx
	leal	.Lconst_sse2-.L009pic_point(%ebx),%ebx
	testl	%eax,%eax
	jnz	.L010base2_26
	call	_poly1305_init_sse2
	/* Convert the accumulator to 26-bit limbs using overlapping unaligned
	 * loads at byte offsets 0, 3, 6, 9 and 13 (67108863 = 0x3ffffff). */
	movl	(%edi),%eax
	movl	3(%edi),%ecx
	movl	6(%edi),%edx
	movl	9(%edi),%esi
	movl	13(%edi),%ebp
	movl	$1,20(%edi)
	shrl	$2,%ecx
	andl	$67108863,%eax
	shrl	$4,%edx
	andl	$67108863,%ecx
	shrl	$6,%esi
	andl	$67108863,%edx
	movd	%eax,%xmm0
	movd	%ecx,%xmm1
	movd	%edx,%xmm2
	movd	%esi,%xmm3
	movd	%ebp,%xmm4
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
	/* NOTE(review): not in generator output.  %ecx was just reloaded
	 * unmasked from the argument slot; truncate it to whole blocks again
	 * so this path sees the same length as the entry check above.  A
	 * no-op for multiples of 16; no later instruction consumes these
	 * flags before they are rewritten. */
	andl	$-16,%ecx
	jmp	.L011base2_32
.align	16
.L010base2_26:
	/* Accumulator is already stored as five 26-bit limbs. */
	movd	(%edi),%xmm0
	movd	4(%edi),%xmm1
	movd	8(%edi),%xmm2
	movd	12(%edi),%xmm3
	movd	16(%edi),%xmm4
	movdqa	64(%ebx),%xmm7
.L011base2_32:
	movl	32(%esp),%eax
	movl	%esp,%ebp
	subl	$528,%esp
	andl	$-16,%esp
	leal	48(%edi),%edi
	/* arg4 << 24 is what gets added to the top limb of a single block. */
	shll	$24,%eax
	/* Length is a multiple of 16 here, so a non-zero low five bits means
	 * an odd number of blocks: process one block on its own first. */
	testl	$31,%ecx
	jz	.L012even
	movdqu	(%esi),%xmm6
	leal	16(%esi),%esi
	movdqa	%xmm6,%xmm5
	pand	%xmm7,%xmm6
	paddd	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	psrlq	$26,%xmm5
	psrldq	$6,%xmm6
	pand	%xmm7,%xmm5
	paddd	%xmm5,%xmm1
	movdqa	%xmm6,%xmm5
	psrlq	$4,%xmm6
	pand	%xmm7,%xmm6
	paddd	%xmm6,%xmm2
	movdqa	%xmm5,%xmm6
	psrlq	$30,%xmm5
	pand	%xmm7,%xmm5
	psrldq	$7,%xmm6
	paddd	%xmm5,%xmm3
	movd	%eax,%xmm5
	paddd	%xmm6,%xmm4
	movd	12(%edi),%xmm6
	paddd	%xmm5,%xmm4
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	/* Multiply by the table entries in dword lane 3 (offset 12 within
	 * each 16-byte table vector). */
	pmuludq	%xmm6,%xmm0
	pmuludq	%xmm6,%xmm1
	pmuludq	%xmm6,%xmm2
	movd	28(%edi),%xmm5
	pmuludq	%xmm6,%xmm3
	pmuludq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	48(%esp),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	movd	92(%edi),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%esp),%xmm6
	movd	44(%edi),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%esp),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%esp),%xmm5
	paddq	%xmm7,%xmm4
	movd	108(%edi),%xmm7
	pmuludq	(%esp),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%esp),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%esp),%xmm5
	movd	60(%edi),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%esp),%xmm6
	paddq	%xmm5,%xmm0
	movd	124(%edi),%xmm5
	pmuludq	(%esp),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%esp),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%esp),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%esp),%xmm7
	movd	76(%edi),%xmm5
	paddq	%xmm6,%xmm1
	movd	140(%edi),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%esp),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%esp),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* Carry propagation (same sequence as in _poly1305_init_sse2). */
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
	subl	$16,%ecx
	jz	.L013done
.L012even:
	/* Expand the nine table vectors onto the stack: low quadword
	 * replicated (pshufd $68) at 0..128(%edx), high quadword replicated
	 * (pshufd $238) at -144..-16(%edx).  When fewer than 64 bytes remain
	 * (borrow from the subl) %esi is moved back by 32 so the loads at
	 * 32/48(%esi) below read the first 32 remaining bytes. */
	leal	384(%esp),%edx
	leal	-32(%esi),%eax
	subl	$64,%ecx
	movdqu	(%edi),%xmm5
	pshufd	$68,%xmm5,%xmm6
	cmovbl	%eax,%esi
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,(%edx)
	leal	160(%esp),%eax
	movdqu	16(%edi),%xmm6
	movdqa	%xmm5,-144(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,16(%edx)
	movdqu	32(%edi),%xmm5
	movdqa	%xmm6,-128(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,32(%edx)
	movdqu	48(%edi),%xmm6
	movdqa	%xmm5,-112(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,48(%edx)
	movdqu	64(%edi),%xmm5
	movdqa	%xmm6,-96(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,64(%edx)
	movdqu	80(%edi),%xmm6
	movdqa	%xmm5,-80(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,80(%edx)
	movdqu	96(%edi),%xmm5
	movdqa	%xmm6,-64(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,96(%edx)
	movdqu	112(%edi),%xmm6
	movdqa	%xmm5,-48(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,112(%edx)
	movdqu	128(%edi),%xmm5
	movdqa	%xmm6,-32(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,128(%edx)
	movdqa	%xmm5,-16(%edx)
	/* Load two 16-byte blocks and split them into five limb vectors
	 * (%xmm5,%xmm6,%xmm2,%xmm3,%xmm4); the accumulator is parked at
	 * 80..144(%esp) meanwhile. */
	movdqu	32(%esi),%xmm5
	movdqu	48(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,112(%esp)
	movdqa	%xmm3,128(%esp)
	movdqa	%xmm4,144(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	movdqa	%xmm0,80(%esp)
	movdqa	%xmm1,96(%esp)
	/* Flags are still those of "subl $64,%ecx" above: none of the SSE,
	 * mov, lea or cmov instructions in between modify them. */
	jbe	.L014skip_loop
	jmp	.L015loop
.align	32
.L015loop:
	/* Main loop, 64 input bytes per iteration.  First half: multiply the
	 * two blocks loaded previously by the table set at -144..-16(%edx). */
	movdqa	-144(%edx),%xmm7
	movdqa	%xmm6,16(%eax)
	movdqa	%xmm2,32(%eax)
	movdqa	%xmm3,48(%eax)
	movdqa	%xmm4,64(%eax)
	movdqa	%xmm5,%xmm1
	pmuludq	%xmm7,%xmm5
	movdqa	%xmm6,%xmm0
	pmuludq	%xmm7,%xmm6
	pmuludq	%xmm7,%xmm2
	pmuludq	%xmm7,%xmm3
	pmuludq	%xmm7,%xmm4
	pmuludq	-16(%edx),%xmm0
	movdqa	%xmm1,%xmm7
	pmuludq	-128(%edx),%xmm1
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	-112(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	%xmm5,%xmm6
	pmuludq	-96(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	16(%eax),%xmm7
	pmuludq	-80(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	-128(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	-112(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	32(%eax),%xmm7
	pmuludq	-96(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	-32(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	-16(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	-128(%edx),%xmm6
	paddq	%xmm5,%xmm1
	movdqa	48(%eax),%xmm5
	pmuludq	-112(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	-48(%edx),%xmm5
	paddq	%xmm7,%xmm4
	movdqa	%xmm6,%xmm7
	pmuludq	-32(%edx),%xmm6
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	-16(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	64(%eax),%xmm6
	pmuludq	-128(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	%xmm6,%xmm7
	pmuludq	-16(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	-64(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	-48(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	64(%ebx),%xmm7
	pmuludq	-32(%edx),%xmm6
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* Load and split the two blocks that precede the ones just used. */
	movdqu	-32(%esi),%xmm5
	movdqu	-16(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	leal	-32(%esi),%eax
	subl	$64,%ecx
	/* Add the parked accumulator to these blocks. */
	paddd	80(%esp),%xmm5
	paddd	96(%esp),%xmm6
	paddd	112(%esp),%xmm2
	paddd	128(%esp),%xmm3
	paddd	144(%esp),%xmm4
	cmovbl	%eax,%esi
	leal	160(%esp),%eax
	/* Second half: multiply by the table set at 0..128(%edx) and add the
	 * first-half products. */
	movdqa	(%edx),%xmm7
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm6,16(%eax)
	movdqa	%xmm2,32(%eax)
	movdqa	%xmm3,48(%eax)
	movdqa	%xmm4,64(%eax)
	movdqa	%xmm5,%xmm1
	pmuludq	%xmm7,%xmm5
	paddq	%xmm0,%xmm5
	movdqa	%xmm6,%xmm0
	pmuludq	%xmm7,%xmm6
	pmuludq	%xmm7,%xmm2
	pmuludq	%xmm7,%xmm3
	pmuludq	%xmm7,%xmm4
	paddq	16(%esp),%xmm6
	paddq	32(%esp),%xmm2
	paddq	48(%esp),%xmm3
	paddq	64(%esp),%xmm4
	pmuludq	128(%edx),%xmm0
	movdqa	%xmm1,%xmm7
	pmuludq	16(%edx),%xmm1
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	32(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	%xmm5,%xmm6
	pmuludq	48(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	16(%eax),%xmm7
	pmuludq	64(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	16(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	32(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	32(%eax),%xmm7
	pmuludq	48(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	112(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	128(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	16(%edx),%xmm6
	paddq	%xmm5,%xmm1
	movdqa	48(%eax),%xmm5
	pmuludq	32(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	96(%edx),%xmm5
	paddq	%xmm7,%xmm4
	movdqa	%xmm6,%xmm7
	pmuludq	112(%edx),%xmm6
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	128(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	64(%eax),%xmm6
	pmuludq	16(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	%xmm6,%xmm7
	pmuludq	128(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	80(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	96(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	64(%ebx),%xmm7
	pmuludq	112(%edx),%xmm6
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* Carry propagation. */
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
	/* Pre-load and split the next two blocks for the following pass. */
	movdqu	32(%esi),%xmm5
	movdqu	48(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,112(%esp)
	movdqa	%xmm3,128(%esp)
	movdqa	%xmm4,144(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	movdqa	%xmm0,80(%esp)
	movdqa	%xmm1,96(%esp)
	/* Flags still come from the "subl $64,%ecx" in this iteration. */
	ja	.L015loop
.L014skip_loop:
	/* Tail.  %ecx + 32 == 0 means exactly 32 bytes are left: the
	 * accumulator is folded into the already-loaded blocks and the short
	 * tail is taken; otherwise a further pair of blocks is processed
	 * first (long tail).  pshufd $16 places dwords 0 and 1 of a table
	 * vector into the two even lanes used by pmuludq. */
	pshufd	$16,-144(%edx),%xmm7
	addl	$32,%ecx
	jnz	.L016long_tail
	paddd	%xmm0,%xmm5
	paddd	%xmm1,%xmm6
	paddd	112(%esp),%xmm2
	paddd	128(%esp),%xmm3
	paddd	144(%esp),%xmm4
.L016long_tail:
	movdqa	%xmm5,(%eax)
	movdqa	%xmm6,16(%eax)
	movdqa	%xmm2,32(%eax)
	movdqa	%xmm3,48(%eax)
	movdqa	%xmm4,64(%eax)
	pmuludq	%xmm7,%xmm5
	pmuludq	%xmm7,%xmm6
	pmuludq	%xmm7,%xmm2
	movdqa	%xmm5,%xmm0
	pshufd	$16,-128(%edx),%xmm5
	pmuludq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm1
	pmuludq	%xmm7,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	48(%eax),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%eax),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%eax),%xmm7
	paddq	%xmm6,%xmm3
	pshufd	$16,-64(%edx),%xmm6
	pmuludq	(%eax),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%eax),%xmm6
	pshufd	$16,-112(%edx),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%eax),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%eax),%xmm5
	paddq	%xmm7,%xmm4
	pshufd	$16,-48(%edx),%xmm7
	pmuludq	(%eax),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%eax),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%eax),%xmm5
	pshufd	$16,-96(%edx),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%eax),%xmm6
	paddq	%xmm5,%xmm0
	pshufd	$16,-32(%edx),%xmm5
	pmuludq	(%eax),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%eax),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%eax),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%eax),%xmm7
	pshufd	$16,-80(%edx),%xmm5
	paddq	%xmm6,%xmm1
	pshufd	$16,-16(%edx),%xmm6
	pmuludq	(%eax),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%eax),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%eax),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%eax),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%eax),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* Flags are still those of "addl $32,%ecx" at .L014skip_loop. */
	jz	.L017short_tail
	/* Long tail: fold in the preceding pair of blocks plus the parked
	 * accumulator, multiplied by the table set at 0..128(%edx). */
	movdqu	-32(%esi),%xmm5
	movdqu	-16(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	pshufd	$16,(%edx),%xmm7
	paddd	80(%esp),%xmm5
	paddd	96(%esp),%xmm6
	paddd	112(%esp),%xmm2
	paddd	128(%esp),%xmm3
	paddd	144(%esp),%xmm4
	movdqa	%xmm5,(%esp)
	pmuludq	%xmm7,%xmm5
	movdqa	%xmm6,16(%esp)
	pmuludq	%xmm7,%xmm6
	paddq	%xmm5,%xmm0
	movdqa	%xmm2,%xmm5
	pmuludq	%xmm7,%xmm2
	paddq	%xmm6,%xmm1
	movdqa	%xmm3,%xmm6
	pmuludq	%xmm7,%xmm3
	paddq	32(%esp),%xmm2
	movdqa	%xmm5,32(%esp)
	pshufd	$16,16(%edx),%xmm5
	paddq	48(%esp),%xmm3
	movdqa	%xmm6,48(%esp)
	movdqa	%xmm4,%xmm6
	pmuludq	%xmm7,%xmm4
	paddq	64(%esp),%xmm4
	movdqa	%xmm6,64(%esp)
	movdqa	%xmm5,%xmm6
	pmuludq	48(%esp),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	pshufd	$16,80(%edx),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%esp),%xmm6
	pshufd	$16,32(%edx),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%esp),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%esp),%xmm5
	paddq	%xmm7,%xmm4
	pshufd	$16,96(%edx),%xmm7
	pmuludq	(%esp),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%esp),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%esp),%xmm5
	pshufd	$16,48(%edx),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%esp),%xmm6
	paddq	%xmm5,%xmm0
	pshufd	$16,112(%edx),%xmm5
	pmuludq	(%esp),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%esp),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%esp),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%esp),%xmm7
	pshufd	$16,64(%edx),%xmm5
	paddq	%xmm6,%xmm1
	pshufd	$16,128(%edx),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%esp),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%esp),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
.L017short_tail:
	/* Horizontal add: pshufd $78 swaps the two quadwords, so each limb
	 * becomes the sum of both lanes; then the usual carry propagation. */
	pshufd	$78,%xmm4,%xmm6
	pshufd	$78,%xmm3,%xmm5
	paddq	%xmm6,%xmm4
	paddq	%xmm5,%xmm3
	pshufd	$78,%xmm0,%xmm6
	pshufd	$78,%xmm1,%xmm5
	paddq	%xmm6,%xmm0
	paddq	%xmm5,%xmm1
	pshufd	$78,%xmm2,%xmm6
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm6,%xmm2
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
.L013done:
	/* %edi is context + 48 here: store the five limbs to context+0..16. */
	movd	%xmm0,-48(%edi)
	movd	%xmm1,-44(%edi)
	movd	%xmm2,-40(%edi)
	movd	%xmm3,-36(%edi)
	movd	%xmm4,-32(%edi)
	movl	%ebp,%esp
.L007nodata:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	_poly1305_blocks_sse2,.-_poly1305_blocks_sse2
.align	32
/*
 * _poly1305_emit_sse2(arg1, arg2, arg3) - finalisation for the SIMD paths
 * Same stack arguments as poly1305_emit.  When the flag word 20(ctx) is
 * zero the accumulator is still in base 2^32 and control transfers to
 * .Lenter_emit; otherwise the five 26-bit limbs are first recombined
 * into 32-bit words.
 */
.type	_poly1305_emit_sse2,@function
.align	16
_poly1305_emit_sse2:
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%ebp
	cmpl	$0,20(%ebp)
	je	.Lenter_emit
	movl	(%ebp),%eax
	movl	4(%ebp),%edi
	movl	8(%ebp),%ecx
	movl	12(%ebp),%edx
	movl	16(%ebp),%esi
	/* Recombine limbs: limb k contributes at bit 26*k, i.e. shifted left
	 * by 26/20/14/8 into one word and right by 6/12/18/24 into the next. */
	movl	%edi,%ebx
	shll	$26,%edi
	shrl	$6,%ebx
	addl	%edi,%eax
	movl	%ecx,%edi
	adcl	$0,%ebx
	shll	$20,%edi
	shrl	$12,%ecx
	addl	%edi,%ebx
	movl	%edx,%edi
	adcl	$0,%ecx
	shll	$14,%edi
	shrl	$18,%edx
	addl	%edi,%ecx
	movl	%esi,%edi
	adcl	$0,%edx
	shll	$8,%edi
	shrl	$24,%esi
	addl	%edi,%edx
	adcl	$0,%esi
	/* Partial reduction: add 5 * (word4 >> 2) to word 0 and keep the low
	 * two bits of word 4. */
	movl	%esi,%edi
	andl	$3,%esi
	shrl	$2,%edi
	leal	(%edi,%edi,4),%ebp
	movl	24(%esp),%edi
	addl	%ebp,%eax
	movl	28(%esp),%ebp
	adcl	$0,%ebx
	adcl	$0,%ecx
	adcl	$0,%edx
	adcl	$0,%esi
	/* Same select-and-add sequence as poly1305_emit; the unmodified value
	 * is kept in %xmm0..%xmm3 while "+5" is computed in the registers. */
	movd	%eax,%xmm0
	addl	$5,%eax
	movd	%ebx,%xmm1
	adcl	$0,%ebx
	movd	%ecx,%xmm2
	adcl	$0,%ecx
	movd	%edx,%xmm3
	adcl	$0,%edx
	adcl	$0,%esi
	shrl	$2,%esi
	negl	%esi
	andl	%esi,%eax
	andl	%esi,%ebx
	andl	%esi,%ecx
	andl	%esi,%edx
	movl	%eax,(%edi)
	movd	%xmm0,%eax
	movl	%ebx,4(%edi)
	movd	%xmm1,%ebx
	movl	%ecx,8(%edi)
	movd	%xmm2,%ecx
	movl	%edx,12(%edi)
	movd	%xmm3,%edx
	notl	%esi
	andl	%esi,%eax
	andl	%esi,%ebx
	orl	(%edi),%eax
	andl	%esi,%ecx
	orl	4(%edi),%ebx
	andl	%esi,%edx
	orl	8(%edi),%ecx
	orl	12(%edi),%edx
	/* Add the four words at arg3 with carry and write the 16 bytes. */
	addl	(%ebp),%eax
	adcl	4(%ebp),%ebx
	movl	%eax,(%edi)
	adcl	8(%ebp),%ecx
	movl	%ebx,4(%edi)
	adcl	12(%ebp),%edx
	movl	%ecx,8(%edi)
	movl	%edx,12(%edi)
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	_poly1305_emit_sse2,.-_poly1305_emit_sse2
.align	32
/*
 * _poly1305_init_avx2 - VEX-encoded counterpart of _poly1305_init_sse2.
 *   In:   %edi = context, %ebx = constant block base (64(%ebx) = limb mask).
 *   Out:  the same nine-vector table at context+48 .. context+191; %edi is
 *         restored to the context pointer on return.
 *   Clobbers: %ebp (holds the caller's %esp), %ecx, %edx, %xmm0-%xmm7,
 *         flags.
 * NOTE(review): no caller is visible in this part of the file; presumably
 * it is called from _poly1305_blocks_avx2 - confirm.
 */
.type	_poly1305_init_avx2,@function
.align	16
_poly1305_init_avx2:
	vmovdqu	24(%edi),%xmm4
	leal	48(%edi),%edi
	movl	%esp,%ebp
	subl	$224,%esp
	andl	$-16,%esp
	vmovdqa	64(%ebx),%xmm7
	/* Split the key words into five limbs at bit offsets 0/26/52/78/104. */
	vpand	%xmm7,%xmm4,%xmm0
	vpsrlq	$26,%xmm4,%xmm1
	vpsrldq	$6,%xmm4,%xmm3
	vpand	%xmm7,%xmm1,%xmm1
	vpsrlq	$4,%xmm3,%xmm2
	vpsrlq	$30,%xmm3,%xmm3
	vpand	%xmm7,%xmm2,%xmm2
	vpand	%xmm7,%xmm3,%xmm3
	vpsrldq	$13,%xmm4,%xmm4
	leal	144(%esp),%edx
	movl	$2,%ecx
.L018square:
	/* Save the limbs at 0..64(%esp) and limbs 1..4 times 5 at
	 * 80..128(%esp). */
	vmovdqa	%xmm0,(%esp)
	vmovdqa	%xmm1,16(%esp)
	vmovdqa	%xmm2,32(%esp)
	vmovdqa	%xmm3,48(%esp)
	vmovdqa	%xmm4,64(%esp)
	vpslld	$2,%xmm1,%xmm6
	vpslld	$2,%xmm2,%xmm5
	vpaddd	%xmm1,%xmm6,%xmm6
	vpaddd	%xmm2,%xmm5,%xmm5
	vmovdqa	%xmm6,80(%esp)
	vmovdqa	%xmm5,96(%esp)
	vpslld	$2,%xmm3,%xmm6
	vpslld	$2,%xmm4,%xmm5
	vpaddd	%xmm3,%xmm6,%xmm6
	vpaddd	%xmm4,%xmm5,%xmm5
	vmovdqa	%xmm6,112(%esp)
	vmovdqa	%xmm5,128(%esp)
	vpshufd	$68,%xmm0,%xmm5
	vmovdqa	%xmm1,%xmm6
	vpshufd	$68,%xmm1,%xmm1
	vpshufd	$68,%xmm2,%xmm2
	vpshufd	$68,%xmm3,%xmm3
	vpshufd	$68,%xmm4,%xmm4
	vmovdqa	%xmm5,(%edx)
	vmovdqa	%xmm1,16(%edx)
	vmovdqa	%xmm2,32(%edx)
	vmovdqa	%xmm3,48(%edx)
	vmovdqa	%xmm4,64(%edx)
	/* 5x5 limb multiplication, accumulated into %xmm0..%xmm4. */
	vpmuludq	%xmm0,%xmm4,%xmm4
	vpmuludq	%xmm0,%xmm3,%xmm3
	vpmuludq	%xmm0,%xmm2,%xmm2
	vpmuludq	%xmm0,%xmm1,%xmm1
	vpmuludq	%xmm0,%xmm5,%xmm0
	vpmuludq	48(%edx),%xmm6,%xmm5
	vpaddq	%xmm5,%xmm4,%xmm4
	vpmuludq	32(%edx),%xmm6,%xmm7
	vpaddq	%xmm7,%xmm3,%xmm3
	vpmuludq	16(%edx),%xmm6,%xmm5
	vpaddq	%xmm5,%xmm2,%xmm2
	vmovdqa	80(%esp),%xmm7
	vpmuludq	(%edx),%xmm6,%xmm6
	vpaddq	%xmm6,%xmm1,%xmm1
	vmovdqa	32(%esp),%xmm5
	vpmuludq	64(%edx),%xmm7,%xmm7
	vpaddq	%xmm7,%xmm0,%xmm0
	vpmuludq	32(%edx),%xmm5,%xmm6
	vpaddq	%xmm6,%xmm4,%xmm4
	vpmuludq	16(%edx),%xmm5,%xmm7
	vpaddq	%xmm7,%xmm3,%xmm3
	vmovdqa	96(%esp),%xmm6
	vpmuludq	(%edx),%xmm5,%xmm5
	vpaddq	%xmm5,%xmm2,%xmm2
	vpmuludq	64(%edx),%xmm6,%xmm7
	vpaddq	%xmm7,%xmm1,%xmm1
	vmovdqa	48(%esp),%xmm5
	vpmuludq	48(%edx),%xmm6,%xmm6
	vpaddq	%xmm6,%xmm0,%xmm0
	vpmuludq	16(%edx),%xmm5,%xmm7
	vpaddq	%xmm7,%xmm4,%xmm4
	vmovdqa	112(%esp),%xmm6
	vpmuludq	(%edx),%xmm5,%xmm5
	vpaddq	%xmm5,%xmm3,%xmm3
	vpmuludq	64(%edx),%xmm6,%xmm7
	vpaddq	%xmm7,%xmm2,%xmm2
	vpmuludq	48(%edx),%xmm6,%xmm5
	vpaddq	%xmm5,%xmm1,%xmm1
	vmovdqa	64(%esp),%xmm7
	vpmuludq	32(%edx),%xmm6,%xmm6
	vpaddq	%xmm6,%xmm0,%xmm0
	vmovdqa	128(%esp),%xmm5
	vpmuludq	(%edx),%xmm7,%xmm7
	vpaddq	%xmm7,%xmm4,%xmm4
	vpmuludq	64(%edx),%xmm5,%xmm6
	vpaddq	%xmm6,%xmm3,%xmm3
	vpmuludq	16(%edx),%xmm5,%xmm7
	vpaddq	%xmm7,%xmm0,%xmm0
	vpmuludq	32(%edx),%xmm5,%xmm6
	vpaddq	%xmm6,%xmm1,%xmm1
	vmovdqa	64(%ebx),%xmm7
	vpmuludq	48(%edx),%xmm5,%xmm5
	vpaddq	%xmm5,%xmm2,%xmm2
	/* Carry propagation; the carry out of limb 4 is added to limb 0 five
	 * times (once directly, once shifted left by 2). */
	vpsrlq	$26,%xmm3,%xmm5
	vpand	%xmm7,%xmm3,%xmm3
	vpsrlq	$26,%xmm0,%xmm6
	vpand	%xmm7,%xmm0,%xmm0
	vpaddq	%xmm5,%xmm4,%xmm4
	vpaddq	%xmm6,%xmm1,%xmm1
	vpsrlq	$26,%xmm4,%xmm5
	vpand	%xmm7,%xmm4,%xmm4
	vpsrlq	$26,%xmm1,%xmm6
	vpand	%xmm7,%xmm1,%xmm1
	vpaddq	%xmm6,%xmm2,%xmm2
	vpaddd	%xmm5,%xmm0,%xmm0
	vpsllq	$2,%xmm5,%xmm5
	vpsrlq	$26,%xmm2,%xmm6
	vpand	%xmm7,%xmm2,%xmm2
	vpaddd	%xmm5,%xmm0,%xmm0
	vpaddd	%xmm6,%xmm3,%xmm3
	vpsrlq	$26,%xmm3,%xmm6
	vpsrlq	$26,%xmm0,%xmm5
	vpand	%xmm7,%xmm0,%xmm0
	vpand	%xmm7,%xmm3,%xmm3
	vpaddd	%xmm5,%xmm1,%xmm1
	vpaddd	%xmm6,%xmm4,%xmm4
	decl	%ecx
	jz	.L019square_break
	/* Second pass: pair the new result with the saved input limbs. */
	vpunpcklqdq	(%esp),%xmm0,%xmm0
	vpunpcklqdq	16(%esp),%xmm1,%xmm1
	vpunpcklqdq	32(%esp),%xmm2,%xmm2
	vpunpcklqdq	48(%esp),%xmm3,%xmm3
	vpunpcklqdq	64(%esp),%xmm4,%xmm4
	jmp	.L018square
.L019square_break:
	/* Interleave with the saved limbs as 32-bit lanes and write the
	 * table, followed by limbs 1..4 multiplied by 5. */
	vpsllq	$32,%xmm0,%xmm0
	vpsllq	$32,%xmm1,%xmm1
	vpsllq	$32,%xmm2,%xmm2
	vpsllq	$32,%xmm3,%xmm3
	vpsllq	$32,%xmm4,%xmm4
	vpor	(%esp),%xmm0,%xmm0
	vpor	16(%esp),%xmm1,%xmm1
	vpor	32(%esp),%xmm2,%xmm2
	vpor	48(%esp),%xmm3,%xmm3
	vpor	64(%esp),%xmm4,%xmm4
	vpshufd	$141,%xmm0,%xmm0
	vpshufd	$141,%xmm1,%xmm1
	vpshufd	$141,%xmm2,%xmm2
	vpshufd	$141,%xmm3,%xmm3
	vpshufd	$141,%xmm4,%xmm4
	vmovdqu	%xmm0,(%edi)
	vmovdqu	%xmm1,16(%edi)
	vmovdqu	%xmm2,32(%edi)
	vmovdqu	%xmm3,48(%edi)
	vmovdqu	%xmm4,64(%edi)
	vpslld	$2,%xmm1,%xmm6
	vpslld	$2,%xmm2,%xmm5
	vpaddd	%xmm1,%xmm6,%xmm6
	vpaddd	%xmm2,%xmm5,%xmm5
	vmovdqu	%xmm6,80(%edi)
	vmovdqu	%xmm5,96(%edi)
	vpslld	$2,%xmm3,%xmm6
	vpslld	$2,%xmm4,%xmm5
	vpaddd	%xmm3,%xmm6,%xmm6
	vpaddd	%xmm4,%xmm5,%xmm5
	vmovdqu	%xmm6,112(%edi)
	vmovdqu	%xmm5,128(%edi)
	movl	%ebp,%esp
	leal	-48(%edi),%edi
	ret
.size	_poly1305_init_avx2,.-_poly1305_init_avx2
.align	32
.type	_poly1305_blocks_avx2,@function
.align	16
/* _poly1305_blocks_avx2: file-local AVX2 block routine. After pushing ebp/ebx/esi/edi it loads the first three stack arguments into %edi, %esi and %ecx (the fourth, 32(%esp), is read later) and the dword at 20(%edi) into %eax. %ecx is rounded down to a multiple of 16; zero returns at once, and a length below 64 with 20(%edi)==0 is handed to .Lenter_blocks in poly1305_blocks. */ 1525_poly1305_blocks_avx2: 1526 pushl %ebp 1527 pushl %ebx 1528 pushl %esi 1529 pushl %edi 1530 movl 20(%esp),%edi 1531 movl 24(%esp),%esi 1532 movl 28(%esp),%ecx 1533 movl 20(%edi),%eax 1534 andl $-16,%ecx 1535 jz .L020nodata 1536 cmpl $64,%ecx 1537 jae .L021enter_avx2 1538 testl %eax,%eax 1539 jz .Lenter_blocks 1540.L021enter_avx2: 1541 vzeroupper /* call/pop yields the current address; %ebx then becomes the address of .Lconst_sse2 */ 1542 call .L022pic_point 1543.L022pic_point: 1544 popl %ebx 1545 leal .Lconst_sse2-.L022pic_point(%ebx),%ebx 1546 testl %eax,%eax 1547 jnz .L023base2_26 /* 20(%edi)==0: build the table with _poly1305_init_avx2, then repack the five dwords at (%edi) from overlapping loads at byte offsets 0,3,6,9,13 (shifts of 2,4,6 on the 2nd-4th, mask 0x3ffffff on the first three) and set 20(%edi)=1 */ 1548 call _poly1305_init_avx2 1549 movl (%edi),%eax 1550 movl 3(%edi),%ecx 1551 movl 6(%edi),%edx 1552 movl 9(%edi),%esi 1553 movl 13(%edi),%ebp 1554 shrl $2,%ecx 1555 andl $67108863,%eax 1556 shrl $4,%edx 1557 andl $67108863,%ecx 1558 shrl $6,%esi 1559 andl $67108863,%edx 1560 movl %eax,(%edi) 1561 movl %ecx,4(%edi) 1562 movl %edx,8(%edi) 1563 movl %esi,12(%edi) 1564 movl %ebp,16(%edi) 1565 movl $1,20(%edi) /* %esi and %ecx were overwritten above; reload them from the stack arguments */ 1566 movl 24(%esp),%esi 1567 movl 28(%esp),%ecx /* %eax = fourth stack argument; frame of at least 448 bytes aligned down to 512, old %esp kept in %ebp; %edx = 288(%esp) is the centre of the table copy */ 1568.L023base2_26: 1569 movl 32(%esp),%eax 1570 movl %esp,%ebp 1571 subl $448,%esp 1572 andl $-512,%esp 1573 vmovdqu 48(%edi),%xmm0 1574 leal 288(%esp),%edx 1575 vmovdqu 64(%edi),%xmm1 1576 vmovdqu 80(%edi),%xmm2 1577 vmovdqu 96(%edi),%xmm3 1578 vmovdqu 112(%edi),%xmm4 1579 leal 48(%edi),%edi /* rearrange each 16-byte table row into a 32-byte row (vpermq $64 then vpshufd $200) and store nine rows at -128(%edx)..128(%edx) */ 1580 vpermq $64,%ymm0,%ymm0 1581 vpermq $64,%ymm1,%ymm1 1582 vpermq $64,%ymm2,%ymm2 1583 vpermq $64,%ymm3,%ymm3 1584 vpermq $64,%ymm4,%ymm4 1585 vpshufd $200,%ymm0,%ymm0 1586 vpshufd $200,%ymm1,%ymm1 1587 vpshufd $200,%ymm2,%ymm2 1588 vpshufd $200,%ymm3,%ymm3 1589 vpshufd $200,%ymm4,%ymm4 1590 vmovdqa %ymm0,-128(%edx) 1591 vmovdqu 80(%edi),%xmm0 1592 vmovdqa %ymm1,-96(%edx) 1593 vmovdqu 96(%edi),%xmm1 1594 vmovdqa %ymm2,-64(%edx) 1595 vmovdqu 112(%edi),%xmm2 1596 vmovdqa %ymm3,-32(%edx) 1597 vmovdqu 128(%edi),%xmm3 1598 vmovdqa %ymm4,(%edx) 1599 vpermq $64,%ymm0,%ymm0 1600 vpermq $64,%ymm1,%ymm1 1601 vpermq $64,%ymm2,%ymm2 1602 vpermq $64,%ymm3,%ymm3 1603 vpshufd $200,%ymm0,%ymm0 1604 vpshufd $200,%ymm1,%ymm1 1605 vpshufd
$200,%ymm2,%ymm2 1606 vpshufd $200,%ymm3,%ymm3 1607 vmovdqa %ymm0,32(%edx) /* the five state dwords: %edi was advanced by 48, so -48..-32(%edi) are offsets 0..16 of the original pointer */ 1608 vmovd -48(%edi),%xmm0 1609 vmovdqa %ymm1,64(%edx) 1610 vmovd -44(%edi),%xmm1 1611 vmovdqa %ymm2,96(%edx) 1612 vmovd -40(%edi),%xmm2 1613 vmovdqa %ymm3,128(%edx) 1614 vmovd -36(%edi),%xmm3 1615 vmovd -32(%edi),%xmm4 1616 vmovdqa 64(%ebx),%ymm7 /* %eax = -(fourth argument). If len is not a multiple of 64, the leftover 16, 32 or 48 bytes are loaded first and control enters .L027tail with %ebx and %edx offset accordingly; %ecx keeps the multiple-of-64 remainder */ 1617 negl %eax 1618 testl $63,%ecx 1619 jz .L024even 1620 movl %ecx,%edx 1621 andl $-64,%ecx 1622 andl $63,%edx 1623 vmovdqu (%esi),%xmm5 1624 cmpl $32,%edx 1625 jb .L025one 1626 vmovdqu 16(%esi),%xmm6 1627 je .L026two 1628 vinserti128 $1,32(%esi),%ymm5,%ymm5 1629 leal 48(%esi),%esi 1630 leal 8(%ebx),%ebx 1631 leal 296(%esp),%edx 1632 jmp .L027tail 1633.L026two: 1634 leal 32(%esi),%esi 1635 leal 16(%ebx),%ebx 1636 leal 304(%esp),%edx 1637 jmp .L027tail 1638.L025one: 1639 leal 16(%esi),%esi 1640 vpxor %ymm6,%ymm6,%ymm6 1641 leal 32(%ebx,%eax,8),%ebx 1642 leal 312(%esp),%edx 1643 jmp .L027tail /* load 64 input bytes into %ymm5/%ymm6; if they are the last 64, go straight to the tail */ 1644.align 32 1645.L024even: 1646 vmovdqu (%esi),%xmm5 1647 vmovdqu 16(%esi),%xmm6 1648 vinserti128 $1,32(%esi),%ymm5,%ymm5 1649 vinserti128 $1,48(%esi),%ymm6,%ymm6 1650 leal 64(%esi),%esi 1651 subl $64,%ecx 1652 jz .L027tail /* main loop, 64 bytes per iteration: split the input into five limbs (bit offsets 0,26,52,78,104; the top limb is ORed with the constant at (%ebx)), add the accumulators, multiply by the table rows around %edx, carry-propagate, then fetch the next 64 bytes */ 1653.L028loop: 1654 vmovdqa %ymm2,64(%esp) 1655 vpsrldq $6,%ymm5,%ymm2 1656 vmovdqa %ymm0,(%esp) 1657 vpsrldq $6,%ymm6,%ymm0 1658 vmovdqa %ymm1,32(%esp) 1659 vpunpckhqdq %ymm6,%ymm5,%ymm1 1660 vpunpcklqdq %ymm6,%ymm5,%ymm5 1661 vpunpcklqdq %ymm0,%ymm2,%ymm2 1662 vpsrlq $30,%ymm2,%ymm0 1663 vpsrlq $4,%ymm2,%ymm2 1664 vpsrlq $26,%ymm5,%ymm6 1665 vpsrlq $40,%ymm1,%ymm1 1666 vpand %ymm7,%ymm2,%ymm2 1667 vpand %ymm7,%ymm5,%ymm5 1668 vpand %ymm7,%ymm6,%ymm6 1669 vpand %ymm7,%ymm0,%ymm0 1670 vpor (%ebx),%ymm1,%ymm1 1671 vpaddq 64(%esp),%ymm2,%ymm2 1672 vpaddq (%esp),%ymm5,%ymm5 1673 vpaddq 32(%esp),%ymm6,%ymm6 1674 vpaddq %ymm3,%ymm0,%ymm0 1675 vpaddq %ymm4,%ymm1,%ymm1 1676 vpmuludq -96(%edx),%ymm2,%ymm3 1677 vmovdqa %ymm6,32(%esp) 1678 vpmuludq -64(%edx),%ymm2,%ymm4 1679 vmovdqa %ymm0,96(%esp) 1680 vpmuludq 96(%edx),%ymm2,%ymm0 1681 vmovdqa
%ymm1,128(%esp) 1682 vpmuludq 128(%edx),%ymm2,%ymm1 1683 vpmuludq -128(%edx),%ymm2,%ymm2 1684 vpmuludq -32(%edx),%ymm5,%ymm7 1685 vpaddq %ymm7,%ymm3,%ymm3 1686 vpmuludq (%edx),%ymm5,%ymm6 1687 vpaddq %ymm6,%ymm4,%ymm4 1688 vpmuludq -128(%edx),%ymm5,%ymm7 1689 vpaddq %ymm7,%ymm0,%ymm0 1690 vmovdqa 32(%esp),%ymm7 1691 vpmuludq -96(%edx),%ymm5,%ymm6 1692 vpaddq %ymm6,%ymm1,%ymm1 1693 vpmuludq -64(%edx),%ymm5,%ymm5 1694 vpaddq %ymm5,%ymm2,%ymm2 1695 vpmuludq -64(%edx),%ymm7,%ymm6 1696 vpaddq %ymm6,%ymm3,%ymm3 1697 vpmuludq -32(%edx),%ymm7,%ymm5 1698 vpaddq %ymm5,%ymm4,%ymm4 1699 vpmuludq 128(%edx),%ymm7,%ymm6 1700 vpaddq %ymm6,%ymm0,%ymm0 1701 vmovdqa 96(%esp),%ymm6 1702 vpmuludq -128(%edx),%ymm7,%ymm5 1703 vpaddq %ymm5,%ymm1,%ymm1 1704 vpmuludq -96(%edx),%ymm7,%ymm7 1705 vpaddq %ymm7,%ymm2,%ymm2 1706 vpmuludq -128(%edx),%ymm6,%ymm5 1707 vpaddq %ymm5,%ymm3,%ymm3 1708 vpmuludq -96(%edx),%ymm6,%ymm7 1709 vpaddq %ymm7,%ymm4,%ymm4 1710 vpmuludq 64(%edx),%ymm6,%ymm5 1711 vpaddq %ymm5,%ymm0,%ymm0 1712 vmovdqa 128(%esp),%ymm5 1713 vpmuludq 96(%edx),%ymm6,%ymm7 1714 vpaddq %ymm7,%ymm1,%ymm1 1715 vpmuludq 128(%edx),%ymm6,%ymm6 1716 vpaddq %ymm6,%ymm2,%ymm2 1717 vpmuludq 128(%edx),%ymm5,%ymm7 1718 vpaddq %ymm7,%ymm3,%ymm3 1719 vpmuludq 32(%edx),%ymm5,%ymm6 1720 vpaddq %ymm6,%ymm0,%ymm0 1721 vpmuludq -128(%edx),%ymm5,%ymm7 1722 vpaddq %ymm7,%ymm4,%ymm4 1723 vmovdqa 64(%ebx),%ymm7 1724 vpmuludq 64(%edx),%ymm5,%ymm6 1725 vpaddq %ymm6,%ymm1,%ymm1 1726 vpmuludq 96(%edx),%ymm5,%ymm5 1727 vpaddq %ymm5,%ymm2,%ymm2 /* carry propagation: shift right by 26 for the carry, mask with %ymm7, add the carry to the next limb; the carry out of limb 4 re-enters limb 0 as c + (c << 2) */ 1728 vpsrlq $26,%ymm3,%ymm5 1729 vpand %ymm7,%ymm3,%ymm3 1730 vpsrlq $26,%ymm0,%ymm6 1731 vpand %ymm7,%ymm0,%ymm0 1732 vpaddq %ymm5,%ymm4,%ymm4 1733 vpaddq %ymm6,%ymm1,%ymm1 1734 vpsrlq $26,%ymm4,%ymm5 1735 vpand %ymm7,%ymm4,%ymm4 1736 vpsrlq $26,%ymm1,%ymm6 1737 vpand %ymm7,%ymm1,%ymm1 1738 vpaddq %ymm6,%ymm2,%ymm2 1739 vpaddq %ymm5,%ymm0,%ymm0 1740 vpsllq $2,%ymm5,%ymm5 1741 vpsrlq $26,%ymm2,%ymm6 1742 vpand %ymm7,%ymm2,%ymm2 1743 vpaddq %ymm5,%ymm0,%ymm0 1744 vpaddq
%ymm6,%ymm3,%ymm3 1745 vpsrlq $26,%ymm3,%ymm6 1746 vpsrlq $26,%ymm0,%ymm5 1747 vpand %ymm7,%ymm0,%ymm0 1748 vpand %ymm7,%ymm3,%ymm3 1749 vpaddq %ymm5,%ymm1,%ymm1 1750 vpaddq %ymm6,%ymm4,%ymm4 /* fetch the next 64 input bytes and repeat while %ecx is non-zero */ 1751 vmovdqu (%esi),%xmm5 1752 vmovdqu 16(%esi),%xmm6 1753 vinserti128 $1,32(%esi),%ymm5,%ymm5 1754 vinserti128 $1,48(%esi),%ymm6,%ymm6 1755 leal 64(%esi),%esi 1756 subl $64,%ecx 1757 jnz .L028loop /* tail: same limb split and multiply schedule as the loop, but every table displacement is 4 bytes higher (-92 instead of -96, and so on), and the lanes are summed horizontally afterwards */ 1758.L027tail: 1759 vmovdqa %ymm2,64(%esp) 1760 vpsrldq $6,%ymm5,%ymm2 1761 vmovdqa %ymm0,(%esp) 1762 vpsrldq $6,%ymm6,%ymm0 1763 vmovdqa %ymm1,32(%esp) 1764 vpunpckhqdq %ymm6,%ymm5,%ymm1 1765 vpunpcklqdq %ymm6,%ymm5,%ymm5 1766 vpunpcklqdq %ymm0,%ymm2,%ymm2 1767 vpsrlq $30,%ymm2,%ymm0 1768 vpsrlq $4,%ymm2,%ymm2 1769 vpsrlq $26,%ymm5,%ymm6 1770 vpsrlq $40,%ymm1,%ymm1 1771 vpand %ymm7,%ymm2,%ymm2 1772 vpand %ymm7,%ymm5,%ymm5 1773 vpand %ymm7,%ymm6,%ymm6 1774 vpand %ymm7,%ymm0,%ymm0 1775 vpor (%ebx),%ymm1,%ymm1 /* clear the low six bits of %ebx: this removes the small offset added before a partial-chunk entry, since .Lconst_sse2 is declared .align 64 */ 1776 andl $-64,%ebx 1777 vpaddq 64(%esp),%ymm2,%ymm2 1778 vpaddq (%esp),%ymm5,%ymm5 1779 vpaddq 32(%esp),%ymm6,%ymm6 1780 vpaddq %ymm3,%ymm0,%ymm0 1781 vpaddq %ymm4,%ymm1,%ymm1 1782 vpmuludq -92(%edx),%ymm2,%ymm3 1783 vmovdqa %ymm6,32(%esp) 1784 vpmuludq -60(%edx),%ymm2,%ymm4 1785 vmovdqa %ymm0,96(%esp) 1786 vpmuludq 100(%edx),%ymm2,%ymm0 1787 vmovdqa %ymm1,128(%esp) 1788 vpmuludq 132(%edx),%ymm2,%ymm1 1789 vpmuludq -124(%edx),%ymm2,%ymm2 1790 vpmuludq -28(%edx),%ymm5,%ymm7 1791 vpaddq %ymm7,%ymm3,%ymm3 1792 vpmuludq 4(%edx),%ymm5,%ymm6 1793 vpaddq %ymm6,%ymm4,%ymm4 1794 vpmuludq -124(%edx),%ymm5,%ymm7 1795 vpaddq %ymm7,%ymm0,%ymm0 1796 vmovdqa 32(%esp),%ymm7 1797 vpmuludq -92(%edx),%ymm5,%ymm6 1798 vpaddq %ymm6,%ymm1,%ymm1 1799 vpmuludq -60(%edx),%ymm5,%ymm5 1800 vpaddq %ymm5,%ymm2,%ymm2 1801 vpmuludq -60(%edx),%ymm7,%ymm6 1802 vpaddq %ymm6,%ymm3,%ymm3 1803 vpmuludq -28(%edx),%ymm7,%ymm5 1804 vpaddq %ymm5,%ymm4,%ymm4 1805 vpmuludq 132(%edx),%ymm7,%ymm6 1806 vpaddq %ymm6,%ymm0,%ymm0 1807 vmovdqa 96(%esp),%ymm6 1808 vpmuludq -124(%edx),%ymm7,%ymm5 1809 vpaddq %ymm5,%ymm1,%ymm1
1810 vpmuludq -92(%edx),%ymm7,%ymm7 1811 vpaddq %ymm7,%ymm2,%ymm2 1812 vpmuludq -124(%edx),%ymm6,%ymm5 1813 vpaddq %ymm5,%ymm3,%ymm3 1814 vpmuludq -92(%edx),%ymm6,%ymm7 1815 vpaddq %ymm7,%ymm4,%ymm4 1816 vpmuludq 68(%edx),%ymm6,%ymm5 1817 vpaddq %ymm5,%ymm0,%ymm0 1818 vmovdqa 128(%esp),%ymm5 1819 vpmuludq 100(%edx),%ymm6,%ymm7 1820 vpaddq %ymm7,%ymm1,%ymm1 1821 vpmuludq 132(%edx),%ymm6,%ymm6 1822 vpaddq %ymm6,%ymm2,%ymm2 1823 vpmuludq 132(%edx),%ymm5,%ymm7 1824 vpaddq %ymm7,%ymm3,%ymm3 1825 vpmuludq 36(%edx),%ymm5,%ymm6 1826 vpaddq %ymm6,%ymm0,%ymm0 1827 vpmuludq -124(%edx),%ymm5,%ymm7 1828 vpaddq %ymm7,%ymm4,%ymm4 1829 vmovdqa 64(%ebx),%ymm7 1830 vpmuludq 68(%edx),%ymm5,%ymm6 1831 vpaddq %ymm6,%ymm1,%ymm1 1832 vpmuludq 100(%edx),%ymm5,%ymm5 1833 vpaddq %ymm5,%ymm2,%ymm2 /* horizontal sum: add the high qword of each 128-bit lane to its low qword (vpsrldq $8), then add the upper lane to the lower (vpermq $2) */ 1834 vpsrldq $8,%ymm4,%ymm5 1835 vpsrldq $8,%ymm3,%ymm6 1836 vpaddq %ymm5,%ymm4,%ymm4 1837 vpsrldq $8,%ymm0,%ymm5 1838 vpaddq %ymm6,%ymm3,%ymm3 1839 vpsrldq $8,%ymm1,%ymm6 1840 vpaddq %ymm5,%ymm0,%ymm0 1841 vpsrldq $8,%ymm2,%ymm5 1842 vpaddq %ymm6,%ymm1,%ymm1 1843 vpermq $2,%ymm4,%ymm6 1844 vpaddq %ymm5,%ymm2,%ymm2 1845 vpermq $2,%ymm3,%ymm5 1846 vpaddq %ymm6,%ymm4,%ymm4 1847 vpermq $2,%ymm0,%ymm6 1848 vpaddq %ymm5,%ymm3,%ymm3 1849 vpermq $2,%ymm1,%ymm5 1850 vpaddq %ymm6,%ymm0,%ymm0 1851 vpermq $2,%ymm2,%ymm6 1852 vpaddq %ymm5,%ymm1,%ymm1 1853 vpaddq %ymm6,%ymm2,%ymm2 /* carry propagation, same sequence as in the loop */ 1854 vpsrlq $26,%ymm3,%ymm5 1855 vpand %ymm7,%ymm3,%ymm3 1856 vpsrlq $26,%ymm0,%ymm6 1857 vpand %ymm7,%ymm0,%ymm0 1858 vpaddq %ymm5,%ymm4,%ymm4 1859 vpaddq %ymm6,%ymm1,%ymm1 1860 vpsrlq $26,%ymm4,%ymm5 1861 vpand %ymm7,%ymm4,%ymm4 1862 vpsrlq $26,%ymm1,%ymm6 1863 vpand %ymm7,%ymm1,%ymm1 1864 vpaddq %ymm6,%ymm2,%ymm2 1865 vpaddq %ymm5,%ymm0,%ymm0 1866 vpsllq $2,%ymm5,%ymm5 1867 vpsrlq $26,%ymm2,%ymm6 1868 vpand %ymm7,%ymm2,%ymm2 1869 vpaddq %ymm5,%ymm0,%ymm0 1870 vpaddq %ymm6,%ymm3,%ymm3 1871 vpsrlq $26,%ymm3,%ymm6 1872 vpsrlq $26,%ymm0,%ymm5 1873 vpand %ymm7,%ymm0,%ymm0 1874 vpand %ymm7,%ymm3,%ymm3 1875 vpaddq %ymm5,%ymm1,%ymm1 1876
vpaddq %ymm6,%ymm4,%ymm4 /* if whole 64-byte chunks remain (%ecx != 0, only possible after a partial-chunk entry), reset %edx to 288(%esp), rearrange the accumulators with vpshufd $252 and continue at .L024even */ 1877 cmpl $0,%ecx 1878 je .L029done 1879 vpshufd $252,%xmm0,%xmm0 1880 leal 288(%esp),%edx 1881 vpshufd $252,%xmm1,%xmm1 1882 vpshufd $252,%xmm2,%xmm2 1883 vpshufd $252,%xmm3,%xmm3 1884 vpshufd $252,%xmm4,%xmm4 1885 jmp .L024even /* write the five state dwords back to offsets 0..16 of the original state pointer, clear the upper ymm halves and release the frame */ 1886.align 16 1887.L029done: 1888 vmovd %xmm0,-48(%edi) 1889 vmovd %xmm1,-44(%edi) 1890 vmovd %xmm2,-40(%edi) 1891 vmovd %xmm3,-36(%edi) 1892 vmovd %xmm4,-32(%edi) 1893 vzeroupper 1894 movl %ebp,%esp 1895.L020nodata: 1896 popl %edi 1897 popl %esi 1898 popl %ebx 1899 popl %ebp 1900 ret 1901.size _poly1305_blocks_avx2,.-_poly1305_blocks_avx2 /* .Lconst_sse2: 64-byte-aligned constant table addressed through %ebx. Offset 0: 1<<24 in every even dword; offset 32: zeros; offset 64: 0x3ffffff in every even dword; offset 96: 0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc; then the NUL-terminated ASCII string "Poly1305 for x86, CRYPTOGAMS by <appro@openssl.org>" */ 1902.align 64 1903.Lconst_sse2: 1904.long 16777216,0,16777216,0,16777216,0,16777216,0 1905.long 0,0,0,0,0,0,0,0 1906.long 67108863,0,67108863,0,67108863,0,67108863,0 1907.long 268435455,268435452,268435452,268435452 1908.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54 1909.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 1910.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 1911.byte 114,103,62,0 1912.align 4 1913.comm OPENSSL_ia32cap_P,16,4 /* end of the #ifdef PIC variant; the build without PIC defined follows */ 1914#else 1915.text 1916.align 64 /* poly1305_init (non-PIC build): clears six dwords at arg0 and returns with %eax=0 if arg1 is NULL. Otherwise it picks the blocks/emit entry points: the scalar pair by default, the SSE2 pair when bits 24 and 26 of OPENSSL_ia32cap_P[0] are both set, and the AVX2 blocks routine when bit 5 of the dword at offset 8 is also set. */ 1917.globl poly1305_init 1918.type poly1305_init,@function 1919.align 16 1920poly1305_init: 1921.L_poly1305_init_begin: 1922 pushl %ebp 1923 pushl %ebx 1924 pushl %esi 1925 pushl %edi 1926 movl 20(%esp),%edi 1927 movl 24(%esp),%esi 1928 movl 28(%esp),%ebp 1929 xorl %eax,%eax 1930 movl %eax,(%edi) 1931 movl %eax,4(%edi) 1932 movl %eax,8(%edi) 1933 movl %eax,12(%edi) 1934 movl %eax,16(%edi) 1935 movl %eax,20(%edi) 1936 cmpl $0,%esi 1937 je .L000nokey 1938 call .L001pic_point 1939.L001pic_point: 1940 popl %ebx 1941 leal poly1305_blocks-.L001pic_point(%ebx),%eax 1942 leal poly1305_emit-.L001pic_point(%ebx),%edx 1943 leal OPENSSL_ia32cap_P,%edi 1944 movl (%edi),%ecx 1945 andl $83886080,%ecx 1946 cmpl $83886080,%ecx 1947 jne .L002no_sse2 1948 leal _poly1305_blocks_sse2-.L001pic_point(%ebx),%eax 1949 leal _poly1305_emit_sse2-.L001pic_point(%ebx),%edx 1950 movl 8(%edi),%ecx 1951 testl
$32,%ecx 1952 jz .L002no_sse2 1953 leal _poly1305_blocks_avx2-.L001pic_point(%ebx),%eax /* tail of poly1305_init: store the selected blocks/emit addresses at 0 and 4 of arg2 (%ebp), copy the four dwords at arg1 to 24..36 of arg0 masked with 0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc, and return 1 */ 1954.L002no_sse2: 1955 movl 20(%esp),%edi 1956 movl %eax,(%ebp) 1957 movl %edx,4(%ebp) 1958 movl (%esi),%eax 1959 movl 4(%esi),%ebx 1960 movl 8(%esi),%ecx 1961 movl 12(%esi),%edx 1962 andl $268435455,%eax 1963 andl $268435452,%ebx 1964 andl $268435452,%ecx 1965 andl $268435452,%edx 1966 movl %eax,24(%edi) 1967 movl %ebx,28(%edi) 1968 movl %ecx,32(%edi) 1969 movl %edx,36(%edi) 1970 movl $1,%eax 1971.L000nokey: 1972 popl %edi 1973 popl %esi 1974 popl %ebx 1975 popl %ebp 1976 ret 1977.size poly1305_init,.-.L_poly1305_init_begin /* poly1305_blocks: scalar block routine (cdecl, four stack arguments). arg0 = state (five accumulator dwords at 0..16, four multiplier dwords at 24..36), arg1 = input pointer, arg2 = length in bytes, arg3 = dword added with carry into the top accumulator word for every block. Consumes 16 bytes per iteration; ebp/ebx/esi/edi are saved and restored, eax/ecx/edx and flags are clobbered. .Lenter_blocks is also a jump target for the SIMD block routines, which arrive with %edi/%esi/%ecx loaded and the same four registers already pushed. */ 1978.globl poly1305_blocks 1979.type poly1305_blocks,@function 1980.align 16 1981poly1305_blocks: 1982.L_poly1305_blocks_begin: 1983 pushl %ebp 1984 pushl %ebx 1985 pushl %esi 1986 pushl %edi 1987 movl 20(%esp),%edi 1988 movl 24(%esp),%esi 1989 movl 28(%esp),%ecx 1990.Lenter_blocks: /* Round the length down to whole 16-byte blocks. The mask is -16 (~15), the same one both SIMD entry points use. It used to be -15, which leaves bit 0 set: with an odd length the end pointer below is never reached by the 16-byte stride and the loop reads past the buffer. Lengths that are multiples of 16 behave exactly as before. NOTE(review): this file is generated; carry the same change into poly1305-x86.pl. */ 1991 andl $-16,%ecx 1992 jz .L003nodata /* 64-byte frame: 0..28 scratch, 36..48 = the four multiplier dwords, 52..60 = x + (x >> 2) of the last three; the end pointer (input + length) is stored at 92(%esp), the slot that held arg2 */ 1993 subl $64,%esp 1994 movl 24(%edi),%eax 1995 movl 28(%edi),%ebx 1996 leal (%esi,%ecx,1),%ebp 1997 movl 32(%edi),%ecx 1998 movl 36(%edi),%edx 1999 movl %ebp,92(%esp) 2000 movl %esi,%ebp 2001 movl %eax,36(%esp) 2002 movl %ebx,%eax 2003 shrl $2,%eax 2004 movl %ebx,40(%esp) 2005 addl %ebx,%eax 2006 movl %ecx,%ebx 2007 shrl $2,%ebx 2008 movl %ecx,44(%esp) 2009 addl %ecx,%ebx 2010 movl %edx,%ecx 2011 shrl $2,%ecx 2012 movl %edx,48(%esp) 2013 addl %edx,%ecx 2014 movl %eax,52(%esp) 2015 movl %ebx,56(%esp) 2016 movl %ecx,60(%esp) /* accumulator words: %eax,%ebx,%ecx,%esi,%edi = state dwords 0..16; %ebp walks the input */ 2017 movl (%edi),%eax 2018 movl 4(%edi),%ebx 2019 movl 8(%edi),%ecx 2020 movl 12(%edi),%esi 2021 movl 16(%edi),%edi 2022 jmp .L004loop /* per block: add 16 input bytes and arg3 (96(%esp)) into the accumulator with carry, then form the product with the multiplier words column by column (results parked at 20, 24, 28(%esp), %esi and %edx) */ 2023.align 32 2024.L004loop: 2025 addl (%ebp),%eax 2026 adcl 4(%ebp),%ebx 2027 adcl 8(%ebp),%ecx 2028 adcl 12(%ebp),%esi 2029 leal 16(%ebp),%ebp 2030 adcl 96(%esp),%edi 2031 movl %eax,(%esp) 2032 movl %esi,12(%esp) 2033 mull 36(%esp) 2034 movl %edi,16(%esp) 2035 movl %eax,%edi 2036 movl %ebx,%eax 2037 movl %edx,%esi 2038 mull 60(%esp) 2039 addl %eax,%edi 2040 movl %ecx,%eax 2041 adcl
%edx,%esi 2042 mull 56(%esp) 2043 addl %eax,%edi 2044 movl 12(%esp),%eax 2045 adcl %edx,%esi 2046 mull 52(%esp) 2047 addl %eax,%edi 2048 movl (%esp),%eax 2049 adcl %edx,%esi 2050 mull 40(%esp) 2051 movl %edi,20(%esp) 2052 xorl %edi,%edi 2053 addl %eax,%esi 2054 movl %ebx,%eax 2055 adcl %edx,%edi 2056 mull 36(%esp) 2057 addl %eax,%esi 2058 movl %ecx,%eax 2059 adcl %edx,%edi 2060 mull 60(%esp) 2061 addl %eax,%esi 2062 movl 12(%esp),%eax 2063 adcl %edx,%edi 2064 mull 56(%esp) 2065 addl %eax,%esi 2066 movl 16(%esp),%eax 2067 adcl %edx,%edi 2068 imull 52(%esp),%eax 2069 addl %eax,%esi 2070 movl (%esp),%eax 2071 adcl $0,%edi 2072 mull 44(%esp) 2073 movl %esi,24(%esp) 2074 xorl %esi,%esi 2075 addl %eax,%edi 2076 movl %ebx,%eax 2077 adcl %edx,%esi 2078 mull 40(%esp) 2079 addl %eax,%edi 2080 movl %ecx,%eax 2081 adcl %edx,%esi 2082 mull 36(%esp) 2083 addl %eax,%edi 2084 movl 12(%esp),%eax 2085 adcl %edx,%esi 2086 mull 60(%esp) 2087 addl %eax,%edi 2088 movl 16(%esp),%eax 2089 adcl %edx,%esi 2090 imull 56(%esp),%eax 2091 addl %eax,%edi 2092 movl (%esp),%eax 2093 adcl $0,%esi 2094 mull 48(%esp) 2095 movl %edi,28(%esp) 2096 xorl %edi,%edi 2097 addl %eax,%esi 2098 movl %ebx,%eax 2099 adcl %edx,%edi 2100 mull 44(%esp) 2101 addl %eax,%esi 2102 movl %ecx,%eax 2103 adcl %edx,%edi 2104 mull 40(%esp) 2105 addl %eax,%esi 2106 movl 12(%esp),%eax 2107 adcl %edx,%edi 2108 mull 36(%esp) 2109 addl %eax,%esi 2110 movl 16(%esp),%ecx 2111 adcl %edx,%edi 2112 movl %ecx,%edx 2113 imull 60(%esp),%ecx 2114 addl %ecx,%esi 2115 movl 20(%esp),%eax 2116 adcl $0,%edi 2117 imull 36(%esp),%edx 2118 addl %edi,%edx 2119 movl 24(%esp),%ebx 2120 movl 28(%esp),%ecx /* partial reduction: keep the low two bits of the top word in %edi and add 5 * (top >> 2) to the bottom word, rippling the carry upward */ 2121 movl %edx,%edi 2122 shrl $2,%edx 2123 andl $3,%edi 2124 leal (%edx,%edx,4),%edx 2125 addl %edx,%eax 2126 adcl $0,%ebx 2127 adcl $0,%ecx 2128 adcl $0,%esi 2129 adcl $0,%edi /* continue until the input pointer equals the stored end pointer */ 2130 cmpl 92(%esp),%ebp 2131 jne .L004loop /* 84(%esp) is arg0 (64-byte frame + 20): write the five accumulator words back */ 2132 movl 84(%esp),%edx 2133 addl $64,%esp 2134 movl %eax,(%edx) 2135 movl %ebx,4(%edx) 2136 movl %ecx,8(%edx) 2137 movl
%esi,12(%edx) 2138 movl %edi,16(%edx) 2139.L003nodata: 2140 popl %edi 2141 popl %esi 2142 popl %ebx 2143 popl %ebp 2144 ret 2145.size poly1305_blocks,.-.L_poly1305_blocks_begin /* poly1305_emit: arg0 = state, arg1 = 16-byte output, arg2 = four dwords added at the end. Computes h+5 over the five state words and forms mask = -((top word + carry) >> 2). Where the mask is set h+5 is kept, otherwise the original h (reloaded from the state) is kept; no branch is taken. The arg2 dwords are then added with carry and four dwords are stored to arg1. */ 2146.globl poly1305_emit 2147.type poly1305_emit,@function 2148.align 16 2149poly1305_emit: 2150.L_poly1305_emit_begin: 2151 pushl %ebp 2152 pushl %ebx 2153 pushl %esi 2154 pushl %edi 2155 movl 20(%esp),%ebp 2156.Lenter_emit: 2157 movl 24(%esp),%edi 2158 movl (%ebp),%eax 2159 movl 4(%ebp),%ebx 2160 movl 8(%ebp),%ecx 2161 movl 12(%ebp),%edx 2162 movl 16(%ebp),%esi 2163 addl $5,%eax 2164 adcl $0,%ebx 2165 adcl $0,%ecx 2166 adcl $0,%edx 2167 adcl $0,%esi 2168 shrl $2,%esi 2169 negl %esi 2170 andl %esi,%eax 2171 andl %esi,%ebx 2172 andl %esi,%ecx 2173 andl %esi,%edx 2174 movl %eax,(%edi) 2175 movl %ebx,4(%edi) 2176 movl %ecx,8(%edi) 2177 movl %edx,12(%edi) 2178 notl %esi 2179 movl (%ebp),%eax 2180 movl 4(%ebp),%ebx 2181 movl 8(%ebp),%ecx 2182 movl 12(%ebp),%edx 2183 movl 28(%esp),%ebp 2184 andl %esi,%eax 2185 andl %esi,%ebx 2186 andl %esi,%ecx 2187 andl %esi,%edx 2188 orl (%edi),%eax 2189 orl 4(%edi),%ebx 2190 orl 8(%edi),%ecx 2191 orl 12(%edi),%edx 2192 addl (%ebp),%eax 2193 adcl 4(%ebp),%ebx 2194 adcl 8(%ebp),%ecx 2195 adcl 12(%ebp),%edx 2196 movl %eax,(%edi) 2197 movl %ebx,4(%edi) 2198 movl %ecx,8(%edi) 2199 movl %edx,12(%edi) 2200 popl %edi 2201 popl %esi 2202 popl %ebx 2203 popl %ebp 2204 ret 2205.size poly1305_emit,.-.L_poly1305_emit_begin /* _poly1305_init_sse2: file-local helper. Loads 16 bytes at 24(%edi), advances %edi by 48, sets up a 16-byte-aligned 224-byte frame (old %esp kept in %ebp), splits the value into limbs at bit offsets 0, 26, 52, 78 and 104 (the first four masked with the constant at 64(%ebx)) and enters the squaring loop with %ecx = 2 */ 2206.align 32 2207.type _poly1305_init_sse2,@function 2208.align 16 2209_poly1305_init_sse2: 2210 movdqu 24(%edi),%xmm4 2211 leal 48(%edi),%edi 2212 movl %esp,%ebp 2213 subl $224,%esp 2214 andl $-16,%esp 2215 movq 64(%ebx),%xmm7 2216 movdqa %xmm4,%xmm0 2217 movdqa %xmm4,%xmm1 2218 movdqa %xmm4,%xmm2 2219 pand %xmm7,%xmm0 2220 psrlq $26,%xmm1 2221 psrldq $6,%xmm2 2222 pand %xmm7,%xmm1 2223 movdqa %xmm2,%xmm3 2224 psrlq $4,%xmm2 2225 psrlq $30,%xmm3 2226 pand %xmm7,%xmm2 2227 pand %xmm7,%xmm3 2228 psrldq $13,%xmm4 2229 leal 144(%esp),%edx 2230 movl $2,%ecx
/* loop body of _poly1305_init_sse2: spill the five limbs in %xmm0-%xmm4 to (%esp)..64(%esp), store x + (x << 2) (5*x) of limbs 1-4 at 80..128(%esp), and keep copies with the low qword duplicated (pshufd $68) at (%edx)..64(%edx) for the multiplies */ 2231.L005square: 2232 movdqa %xmm0,(%esp) 2233 movdqa %xmm1,16(%esp) 2234 movdqa %xmm2,32(%esp) 2235 movdqa %xmm3,48(%esp) 2236 movdqa %xmm4,64(%esp) 2237 movdqa %xmm1,%xmm6 2238 movdqa %xmm2,%xmm5 2239 pslld $2,%xmm6 2240 pslld $2,%xmm5 2241 paddd %xmm1,%xmm6 2242 paddd %xmm2,%xmm5 2243 movdqa %xmm6,80(%esp) 2244 movdqa %xmm5,96(%esp) 2245 movdqa %xmm3,%xmm6 2246 movdqa %xmm4,%xmm5 2247 pslld $2,%xmm6 2248 pslld $2,%xmm5 2249 paddd %xmm3,%xmm6 2250 paddd %xmm4,%xmm5 2251 movdqa %xmm6,112(%esp) 2252 movdqa %xmm5,128(%esp) 2253 pshufd $68,%xmm0,%xmm6 2254 movdqa %xmm1,%xmm5 2255 pshufd $68,%xmm1,%xmm1 2256 pshufd $68,%xmm2,%xmm2 2257 pshufd $68,%xmm3,%xmm3 2258 pshufd $68,%xmm4,%xmm4 2259 movdqa %xmm6,(%edx) 2260 movdqa %xmm1,16(%edx) 2261 movdqa %xmm2,32(%edx) 2262 movdqa %xmm3,48(%edx) 2263 movdqa %xmm4,64(%edx) /* multiply-accumulate: 32x32->64 partial products summed into %xmm0-%xmm4; two-operand SSE2 forms need the extra movdqa copies */ 2264 pmuludq %xmm0,%xmm4 2265 pmuludq %xmm0,%xmm3 2266 pmuludq %xmm0,%xmm2 2267 pmuludq %xmm0,%xmm1 2268 pmuludq %xmm6,%xmm0 2269 movdqa %xmm5,%xmm6 2270 pmuludq 48(%edx),%xmm5 2271 movdqa %xmm6,%xmm7 2272 pmuludq 32(%edx),%xmm6 2273 paddq %xmm5,%xmm4 2274 movdqa %xmm7,%xmm5 2275 pmuludq 16(%edx),%xmm7 2276 paddq %xmm6,%xmm3 2277 movdqa 80(%esp),%xmm6 2278 pmuludq (%edx),%xmm5 2279 paddq %xmm7,%xmm2 2280 pmuludq 64(%edx),%xmm6 2281 movdqa 32(%esp),%xmm7 2282 paddq %xmm5,%xmm1 2283 movdqa %xmm7,%xmm5 2284 pmuludq 32(%edx),%xmm7 2285 paddq %xmm6,%xmm0 2286 movdqa %xmm5,%xmm6 2287 pmuludq 16(%edx),%xmm5 2288 paddq %xmm7,%xmm4 2289 movdqa 96(%esp),%xmm7 2290 pmuludq (%edx),%xmm6 2291 paddq %xmm5,%xmm3 2292 movdqa %xmm7,%xmm5 2293 pmuludq 64(%edx),%xmm7 2294 paddq %xmm6,%xmm2 2295 pmuludq 48(%edx),%xmm5 2296 movdqa 48(%esp),%xmm6 2297 paddq %xmm7,%xmm1 2298 movdqa %xmm6,%xmm7 2299 pmuludq 16(%edx),%xmm6 2300 paddq %xmm5,%xmm0 2301 movdqa 112(%esp),%xmm5 2302 pmuludq (%edx),%xmm7 2303 paddq %xmm6,%xmm4 2304 movdqa %xmm5,%xmm6 2305 pmuludq 64(%edx),%xmm5 2306 paddq %xmm7,%xmm3 2307 movdqa %xmm6,%xmm7 2308 pmuludq 48(%edx),%xmm6 2309 paddq %xmm5,%xmm2 2310 pmuludq
32(%edx),%xmm7 2311 movdqa 64(%esp),%xmm5 2312 paddq %xmm6,%xmm1 2313 movdqa 128(%esp),%xmm6 2314 pmuludq (%edx),%xmm5 2315 paddq %xmm7,%xmm0 2316 movdqa %xmm6,%xmm7 2317 pmuludq 64(%edx),%xmm6 2318 paddq %xmm5,%xmm4 2319 movdqa %xmm7,%xmm5 2320 pmuludq 16(%edx),%xmm7 2321 paddq %xmm6,%xmm3 2322 movdqa %xmm5,%xmm6 2323 pmuludq 32(%edx),%xmm5 2324 paddq %xmm7,%xmm0 2325 pmuludq 48(%edx),%xmm6 2326 movdqa 64(%ebx),%xmm7 2327 paddq %xmm5,%xmm1 2328 paddq %xmm6,%xmm2 /* carry propagation: shift right by 26 for the carry, mask with %xmm7 (reloaded from 64(%ebx)), add the carry to the next limb; the carry out of limb 4 is added to limb 0 as c + (c << 2) */ 2329 movdqa %xmm3,%xmm5 2330 pand %xmm7,%xmm3 2331 psrlq $26,%xmm5 2332 paddq %xmm4,%xmm5 2333 movdqa %xmm0,%xmm6 2334 pand %xmm7,%xmm0 2335 psrlq $26,%xmm6 2336 movdqa %xmm5,%xmm4 2337 paddq %xmm1,%xmm6 2338 psrlq $26,%xmm5 2339 pand %xmm7,%xmm4 2340 movdqa %xmm6,%xmm1 2341 psrlq $26,%xmm6 2342 paddd %xmm5,%xmm0 2343 psllq $2,%xmm5 2344 paddq %xmm2,%xmm6 2345 paddq %xmm0,%xmm5 2346 pand %xmm7,%xmm1 2347 movdqa %xmm6,%xmm2 2348 psrlq $26,%xmm6 2349 pand %xmm7,%xmm2 2350 paddd %xmm3,%xmm6 2351 movdqa %xmm5,%xmm0 2352 psrlq $26,%xmm5 2353 movdqa %xmm6,%xmm3 2354 psrlq $26,%xmm6 2355 pand %xmm7,%xmm0 2356 paddd %xmm5,%xmm1 2357 pand %xmm7,%xmm3 2358 paddd %xmm6,%xmm4 /* pass counter: after the first pass the new values are interleaved with the spilled ones (punpcklqdq) and the body runs once more */ 2359 decl %ecx 2360 jz .L006square_break 2361 punpcklqdq (%esp),%xmm0 2362 punpcklqdq 16(%esp),%xmm1 2363 punpcklqdq 32(%esp),%xmm2 2364 punpcklqdq 48(%esp),%xmm3 2365 punpcklqdq 64(%esp),%xmm4 2366 jmp .L005square /* after the second pass: move the new values into the high dwords, OR in the spilled values, permute with pshufd $141 and store five 16-byte rows at (%edi)..64(%edi), followed by rows 1-4 multiplied by 5 at 80..128(%edi) */ 2367.L006square_break: 2368 psllq $32,%xmm0 2369 psllq $32,%xmm1 2370 psllq $32,%xmm2 2371 psllq $32,%xmm3 2372 psllq $32,%xmm4 2373 por (%esp),%xmm0 2374 por 16(%esp),%xmm1 2375 por 32(%esp),%xmm2 2376 por 48(%esp),%xmm3 2377 por 64(%esp),%xmm4 2378 pshufd $141,%xmm0,%xmm0 2379 pshufd $141,%xmm1,%xmm1 2380 pshufd $141,%xmm2,%xmm2 2381 pshufd $141,%xmm3,%xmm3 2382 pshufd $141,%xmm4,%xmm4 2383 movdqu %xmm0,(%edi) 2384 movdqu %xmm1,16(%edi) 2385 movdqu %xmm2,32(%edi) 2386 movdqu %xmm3,48(%edi) 2387 movdqu %xmm4,64(%edi) 2388 movdqa %xmm1,%xmm6 2389 movdqa %xmm2,%xmm5 2390 pslld $2,%xmm6 2391 pslld $2,%xmm5 2392 paddd %xmm1,%xmm6 2393
paddd %xmm2,%xmm5 2394 movdqu %xmm6,80(%edi) 2395 movdqu %xmm5,96(%edi) 2396 movdqa %xmm3,%xmm6 2397 movdqa %xmm4,%xmm5 2398 pslld $2,%xmm6 2399 pslld $2,%xmm5 2400 paddd %xmm3,%xmm6 2401 paddd %xmm4,%xmm5 2402 movdqu %xmm6,112(%edi) 2403 movdqu %xmm5,128(%edi) /* release the scratch frame (old %esp was kept in %ebp) and undo the +48 applied to %edi on entry */ 2404 movl %ebp,%esp 2405 leal -48(%edi),%edi 2406 ret 2407.size _poly1305_init_sse2,.-_poly1305_init_sse2 /* _poly1305_blocks_sse2 (only its beginning is on these lines): loads the first three stack arguments into %edi/%esi/%ecx and the dword at 20(%edi) into %eax, rounds the length down to a multiple of 16, returns on zero, and hands lengths below 64 with 20(%edi)==0 to .Lenter_blocks in poly1305_blocks */ 2408.align 32 2409.type _poly1305_blocks_sse2,@function 2410.align 16 2411_poly1305_blocks_sse2: 2412 pushl %ebp 2413 pushl %ebx 2414 pushl %esi 2415 pushl %edi 2416 movl 20(%esp),%edi 2417 movl 24(%esp),%esi 2418 movl 28(%esp),%ecx 2419 movl 20(%edi),%eax 2420 andl $-16,%ecx 2421 jz .L007nodata 2422 cmpl $64,%ecx 2423 jae .L008enter_sse2 2424 testl %eax,%eax 2425 jz .Lenter_blocks 2426.align 16 2427.L008enter_sse2: /* call/pop yields the current address; %ebx then becomes the address of .Lconst_sse2 */ 2428 call .L009pic_point 2429.L009pic_point: 2430 popl %ebx 2431 leal .Lconst_sse2-.L009pic_point(%ebx),%ebx 2432 testl %eax,%eax 2433 jnz .L010base2_26 /* 20(%edi)==0: build the table, set 20(%edi)=1, and repack the state from overlapping loads at byte offsets 0,3,6,9,13 (shifts of 2,4,6 on the 2nd-4th, mask 0x3ffffff on the first three) directly into %xmm0-%xmm4; %esi/%ecx are then reloaded from the stack arguments */ 2434 call _poly1305_init_sse2 2435 movl (%edi),%eax 2436 movl 3(%edi),%ecx 2437 movl 6(%edi),%edx 2438 movl 9(%edi),%esi 2439 movl 13(%edi),%ebp 2440 movl $1,20(%edi) 2441 shrl $2,%ecx 2442 andl $67108863,%eax 2443 shrl $4,%edx 2444 andl $67108863,%ecx 2445 shrl $6,%esi 2446 andl $67108863,%edx 2447 movd %eax,%xmm0 2448 movd %ecx,%xmm1 2449 movd %edx,%xmm2 2450 movd %esi,%xmm3 2451 movd %ebp,%xmm4 2452 movl 24(%esp),%esi 2453 movl 28(%esp),%ecx 2454 jmp .L011base2_32 /* 20(%edi)!=0: the five state dwords are loaded as they are, and the mask from 64(%ebx) goes into %xmm7 */ 2455.align 16 2456.L010base2_26: 2457 movd (%edi),%xmm0 2458 movd 4(%edi),%xmm1 2459 movd 8(%edi),%xmm2 2460 movd 12(%edi),%xmm3 2461 movd 16(%edi),%xmm4 2462 movdqa 64(%ebx),%xmm7 /* %eax = fourth stack argument, shifted left by 24 below; 16-byte-aligned frame of at least 528 bytes with the old %esp kept in %ebp; %edi advanced by 48 */ 2463.L011base2_32: 2464 movl 32(%esp),%eax 2465 movl %esp,%ebp 2466 subl $528,%esp 2467 andl $-16,%esp 2468 leal 48(%edi),%edi 2469 shll $24,%eax /* if the length is not a multiple of 32, one 16-byte block is split into limbs and added to %xmm0-%xmm4 before the paired processing at .L012even */ 2470 testl $31,%ecx 2471 jz .L012even 2472 movdqu (%esi),%xmm6 2473 leal 16(%esi),%esi 2474 movdqa %xmm6,%xmm5 2475 pand %xmm7,%xmm6 2476 paddd %xmm6,%xmm0 2477 movdqa %xmm5,%xmm6 2478 psrlq $26,%xmm5 2479 psrldq $6,%xmm6 2480 pand %xmm7,%xmm5 2481 paddd
%xmm5,%xmm1 2482 movdqa %xmm6,%xmm5 2483 psrlq $4,%xmm6 2484 pand %xmm7,%xmm6 2485 paddd %xmm6,%xmm2 2486 movdqa %xmm5,%xmm6 2487 psrlq $30,%xmm5 2488 pand %xmm7,%xmm5 2489 psrldq $7,%xmm6 2490 paddd %xmm5,%xmm3 2491 movd %eax,%xmm5 2492 paddd %xmm6,%xmm4 2493 movd 12(%edi),%xmm6 2494 paddd %xmm5,%xmm4 2495 movdqa %xmm0,(%esp) 2496 movdqa %xmm1,16(%esp) 2497 movdqa %xmm2,32(%esp) 2498 movdqa %xmm3,48(%esp) 2499 movdqa %xmm4,64(%esp) 2500 pmuludq %xmm6,%xmm0 2501 pmuludq %xmm6,%xmm1 2502 pmuludq %xmm6,%xmm2 2503 movd 28(%edi),%xmm5 2504 pmuludq %xmm6,%xmm3 2505 pmuludq %xmm6,%xmm4 2506 movdqa %xmm5,%xmm6 2507 pmuludq 48(%esp),%xmm5 2508 movdqa %xmm6,%xmm7 2509 pmuludq 32(%esp),%xmm6 2510 paddq %xmm5,%xmm4 2511 movdqa %xmm7,%xmm5 2512 pmuludq 16(%esp),%xmm7 2513 paddq %xmm6,%xmm3 2514 movd 92(%edi),%xmm6 2515 pmuludq (%esp),%xmm5 2516 paddq %xmm7,%xmm2 2517 pmuludq 64(%esp),%xmm6 2518 movd 44(%edi),%xmm7 2519 paddq %xmm5,%xmm1 2520 movdqa %xmm7,%xmm5 2521 pmuludq 32(%esp),%xmm7 2522 paddq %xmm6,%xmm0 2523 movdqa %xmm5,%xmm6 2524 pmuludq 16(%esp),%xmm5 2525 paddq %xmm7,%xmm4 2526 movd 108(%edi),%xmm7 2527 pmuludq (%esp),%xmm6 2528 paddq %xmm5,%xmm3 2529 movdqa %xmm7,%xmm5 2530 pmuludq 64(%esp),%xmm7 2531 paddq %xmm6,%xmm2 2532 pmuludq 48(%esp),%xmm5 2533 movd 60(%edi),%xmm6 2534 paddq %xmm7,%xmm1 2535 movdqa %xmm6,%xmm7 2536 pmuludq 16(%esp),%xmm6 2537 paddq %xmm5,%xmm0 2538 movd 124(%edi),%xmm5 2539 pmuludq (%esp),%xmm7 2540 paddq %xmm6,%xmm4 2541 movdqa %xmm5,%xmm6 2542 pmuludq 64(%esp),%xmm5 2543 paddq %xmm7,%xmm3 2544 movdqa %xmm6,%xmm7 2545 pmuludq 48(%esp),%xmm6 2546 paddq %xmm5,%xmm2 2547 pmuludq 32(%esp),%xmm7 2548 movd 76(%edi),%xmm5 2549 paddq %xmm6,%xmm1 2550 movd 140(%edi),%xmm6 2551 pmuludq (%esp),%xmm5 2552 paddq %xmm7,%xmm0 2553 movdqa %xmm6,%xmm7 2554 pmuludq 64(%esp),%xmm6 2555 paddq %xmm5,%xmm4 2556 movdqa %xmm7,%xmm5 2557 pmuludq 16(%esp),%xmm7 2558 paddq %xmm6,%xmm3 2559 movdqa %xmm5,%xmm6 2560 pmuludq 32(%esp),%xmm5 2561 paddq %xmm7,%xmm0 2562 
pmuludq 48(%esp),%xmm6 2563 movdqa 64(%ebx),%xmm7 2564 paddq %xmm5,%xmm1 2565 paddq %xmm6,%xmm2 2566 movdqa %xmm3,%xmm5 2567 pand %xmm7,%xmm3 2568 psrlq $26,%xmm5 2569 paddq %xmm4,%xmm5 2570 movdqa %xmm0,%xmm6 2571 pand %xmm7,%xmm0 2572 psrlq $26,%xmm6 2573 movdqa %xmm5,%xmm4 2574 paddq %xmm1,%xmm6 2575 psrlq $26,%xmm5 2576 pand %xmm7,%xmm4 2577 movdqa %xmm6,%xmm1 2578 psrlq $26,%xmm6 2579 paddd %xmm5,%xmm0 2580 psllq $2,%xmm5 2581 paddq %xmm2,%xmm6 2582 paddq %xmm0,%xmm5 2583 pand %xmm7,%xmm1 2584 movdqa %xmm6,%xmm2 2585 psrlq $26,%xmm6 2586 pand %xmm7,%xmm2 2587 paddd %xmm3,%xmm6 2588 movdqa %xmm5,%xmm0 2589 psrlq $26,%xmm5 2590 movdqa %xmm6,%xmm3 2591 psrlq $26,%xmm6 2592 pand %xmm7,%xmm0 2593 paddd %xmm5,%xmm1 2594 pand %xmm7,%xmm3 2595 paddd %xmm6,%xmm4 2596 subl $16,%ecx 2597 jz .L013done 2598.L012even: 2599 leal 384(%esp),%edx 2600 leal -32(%esi),%eax 2601 subl $64,%ecx 2602 movdqu (%edi),%xmm5 2603 pshufd $68,%xmm5,%xmm6 2604 cmovbl %eax,%esi 2605 pshufd $238,%xmm5,%xmm5 2606 movdqa %xmm6,(%edx) 2607 leal 160(%esp),%eax 2608 movdqu 16(%edi),%xmm6 2609 movdqa %xmm5,-144(%edx) 2610 pshufd $68,%xmm6,%xmm5 2611 pshufd $238,%xmm6,%xmm6 2612 movdqa %xmm5,16(%edx) 2613 movdqu 32(%edi),%xmm5 2614 movdqa %xmm6,-128(%edx) 2615 pshufd $68,%xmm5,%xmm6 2616 pshufd $238,%xmm5,%xmm5 2617 movdqa %xmm6,32(%edx) 2618 movdqu 48(%edi),%xmm6 2619 movdqa %xmm5,-112(%edx) 2620 pshufd $68,%xmm6,%xmm5 2621 pshufd $238,%xmm6,%xmm6 2622 movdqa %xmm5,48(%edx) 2623 movdqu 64(%edi),%xmm5 2624 movdqa %xmm6,-96(%edx) 2625 pshufd $68,%xmm5,%xmm6 2626 pshufd $238,%xmm5,%xmm5 2627 movdqa %xmm6,64(%edx) 2628 movdqu 80(%edi),%xmm6 2629 movdqa %xmm5,-80(%edx) 2630 pshufd $68,%xmm6,%xmm5 2631 pshufd $238,%xmm6,%xmm6 2632 movdqa %xmm5,80(%edx) 2633 movdqu 96(%edi),%xmm5 2634 movdqa %xmm6,-64(%edx) 2635 pshufd $68,%xmm5,%xmm6 2636 pshufd $238,%xmm5,%xmm5 2637 movdqa %xmm6,96(%edx) 2638 movdqu 112(%edi),%xmm6 2639 movdqa %xmm5,-48(%edx) 2640 pshufd $68,%xmm6,%xmm5 2641 pshufd $238,%xmm6,%xmm6 2642 
movdqa %xmm5,112(%edx) 2643 movdqu 128(%edi),%xmm5 2644 movdqa %xmm6,-32(%edx) 2645 pshufd $68,%xmm5,%xmm6 2646 pshufd $238,%xmm5,%xmm5 2647 movdqa %xmm6,128(%edx) 2648 movdqa %xmm5,-16(%edx) 2649 movdqu 32(%esi),%xmm5 2650 movdqu 48(%esi),%xmm6 2651 leal 32(%esi),%esi 2652 movdqa %xmm2,112(%esp) 2653 movdqa %xmm3,128(%esp) 2654 movdqa %xmm4,144(%esp) 2655 movdqa %xmm5,%xmm2 2656 movdqa %xmm6,%xmm3 2657 psrldq $6,%xmm2 2658 psrldq $6,%xmm3 2659 movdqa %xmm5,%xmm4 2660 punpcklqdq %xmm3,%xmm2 2661 punpckhqdq %xmm6,%xmm4 2662 punpcklqdq %xmm6,%xmm5 2663 movdqa %xmm2,%xmm3 2664 psrlq $4,%xmm2 2665 psrlq $30,%xmm3 2666 movdqa %xmm5,%xmm6 2667 psrlq $40,%xmm4 2668 psrlq $26,%xmm6 2669 pand %xmm7,%xmm5 2670 pand %xmm7,%xmm6 2671 pand %xmm7,%xmm2 2672 pand %xmm7,%xmm3 2673 por (%ebx),%xmm4 2674 movdqa %xmm0,80(%esp) 2675 movdqa %xmm1,96(%esp) 2676 jbe .L014skip_loop 2677 jmp .L015loop 2678.align 32 2679.L015loop: 2680 movdqa -144(%edx),%xmm7 2681 movdqa %xmm6,16(%eax) 2682 movdqa %xmm2,32(%eax) 2683 movdqa %xmm3,48(%eax) 2684 movdqa %xmm4,64(%eax) 2685 movdqa %xmm5,%xmm1 2686 pmuludq %xmm7,%xmm5 2687 movdqa %xmm6,%xmm0 2688 pmuludq %xmm7,%xmm6 2689 pmuludq %xmm7,%xmm2 2690 pmuludq %xmm7,%xmm3 2691 pmuludq %xmm7,%xmm4 2692 pmuludq -16(%edx),%xmm0 2693 movdqa %xmm1,%xmm7 2694 pmuludq -128(%edx),%xmm1 2695 paddq %xmm5,%xmm0 2696 movdqa %xmm7,%xmm5 2697 pmuludq -112(%edx),%xmm7 2698 paddq %xmm6,%xmm1 2699 movdqa %xmm5,%xmm6 2700 pmuludq -96(%edx),%xmm5 2701 paddq %xmm7,%xmm2 2702 movdqa 16(%eax),%xmm7 2703 pmuludq -80(%edx),%xmm6 2704 paddq %xmm5,%xmm3 2705 movdqa %xmm7,%xmm5 2706 pmuludq -128(%edx),%xmm7 2707 paddq %xmm6,%xmm4 2708 movdqa %xmm5,%xmm6 2709 pmuludq -112(%edx),%xmm5 2710 paddq %xmm7,%xmm2 2711 movdqa 32(%eax),%xmm7 2712 pmuludq -96(%edx),%xmm6 2713 paddq %xmm5,%xmm3 2714 movdqa %xmm7,%xmm5 2715 pmuludq -32(%edx),%xmm7 2716 paddq %xmm6,%xmm4 2717 movdqa %xmm5,%xmm6 2718 pmuludq -16(%edx),%xmm5 2719 paddq %xmm7,%xmm0 2720 movdqa %xmm6,%xmm7 2721 pmuludq 
-128(%edx),%xmm6 2722 paddq %xmm5,%xmm1 2723 movdqa 48(%eax),%xmm5 2724 pmuludq -112(%edx),%xmm7 2725 paddq %xmm6,%xmm3 2726 movdqa %xmm5,%xmm6 2727 pmuludq -48(%edx),%xmm5 2728 paddq %xmm7,%xmm4 2729 movdqa %xmm6,%xmm7 2730 pmuludq -32(%edx),%xmm6 2731 paddq %xmm5,%xmm0 2732 movdqa %xmm7,%xmm5 2733 pmuludq -16(%edx),%xmm7 2734 paddq %xmm6,%xmm1 2735 movdqa 64(%eax),%xmm6 2736 pmuludq -128(%edx),%xmm5 2737 paddq %xmm7,%xmm2 2738 movdqa %xmm6,%xmm7 2739 pmuludq -16(%edx),%xmm6 2740 paddq %xmm5,%xmm4 2741 movdqa %xmm7,%xmm5 2742 pmuludq -64(%edx),%xmm7 2743 paddq %xmm6,%xmm3 2744 movdqa %xmm5,%xmm6 2745 pmuludq -48(%edx),%xmm5 2746 paddq %xmm7,%xmm0 2747 movdqa 64(%ebx),%xmm7 2748 pmuludq -32(%edx),%xmm6 2749 paddq %xmm5,%xmm1 2750 paddq %xmm6,%xmm2 2751 movdqu -32(%esi),%xmm5 2752 movdqu -16(%esi),%xmm6 2753 leal 32(%esi),%esi 2754 movdqa %xmm2,32(%esp) 2755 movdqa %xmm3,48(%esp) 2756 movdqa %xmm4,64(%esp) 2757 movdqa %xmm5,%xmm2 2758 movdqa %xmm6,%xmm3 2759 psrldq $6,%xmm2 2760 psrldq $6,%xmm3 2761 movdqa %xmm5,%xmm4 2762 punpcklqdq %xmm3,%xmm2 2763 punpckhqdq %xmm6,%xmm4 2764 punpcklqdq %xmm6,%xmm5 2765 movdqa %xmm2,%xmm3 2766 psrlq $4,%xmm2 2767 psrlq $30,%xmm3 2768 movdqa %xmm5,%xmm6 2769 psrlq $40,%xmm4 2770 psrlq $26,%xmm6 2771 pand %xmm7,%xmm5 2772 pand %xmm7,%xmm6 2773 pand %xmm7,%xmm2 2774 pand %xmm7,%xmm3 2775 por (%ebx),%xmm4 2776 leal -32(%esi),%eax 2777 subl $64,%ecx 2778 paddd 80(%esp),%xmm5 2779 paddd 96(%esp),%xmm6 2780 paddd 112(%esp),%xmm2 2781 paddd 128(%esp),%xmm3 2782 paddd 144(%esp),%xmm4 2783 cmovbl %eax,%esi 2784 leal 160(%esp),%eax 2785 movdqa (%edx),%xmm7 2786 movdqa %xmm1,16(%esp) 2787 movdqa %xmm6,16(%eax) 2788 movdqa %xmm2,32(%eax) 2789 movdqa %xmm3,48(%eax) 2790 movdqa %xmm4,64(%eax) 2791 movdqa %xmm5,%xmm1 2792 pmuludq %xmm7,%xmm5 2793 paddq %xmm0,%xmm5 2794 movdqa %xmm6,%xmm0 2795 pmuludq %xmm7,%xmm6 2796 pmuludq %xmm7,%xmm2 2797 pmuludq %xmm7,%xmm3 2798 pmuludq %xmm7,%xmm4 2799 paddq 16(%esp),%xmm6 2800 paddq 32(%esp),%xmm2 2801 
paddq 48(%esp),%xmm3 2802 paddq 64(%esp),%xmm4 2803 pmuludq 128(%edx),%xmm0 2804 movdqa %xmm1,%xmm7 2805 pmuludq 16(%edx),%xmm1 2806 paddq %xmm5,%xmm0 2807 movdqa %xmm7,%xmm5 2808 pmuludq 32(%edx),%xmm7 2809 paddq %xmm6,%xmm1 2810 movdqa %xmm5,%xmm6 2811 pmuludq 48(%edx),%xmm5 2812 paddq %xmm7,%xmm2 2813 movdqa 16(%eax),%xmm7 2814 pmuludq 64(%edx),%xmm6 2815 paddq %xmm5,%xmm3 2816 movdqa %xmm7,%xmm5 2817 pmuludq 16(%edx),%xmm7 2818 paddq %xmm6,%xmm4 2819 movdqa %xmm5,%xmm6 2820 pmuludq 32(%edx),%xmm5 2821 paddq %xmm7,%xmm2 2822 movdqa 32(%eax),%xmm7 2823 pmuludq 48(%edx),%xmm6 2824 paddq %xmm5,%xmm3 2825 movdqa %xmm7,%xmm5 2826 pmuludq 112(%edx),%xmm7 2827 paddq %xmm6,%xmm4 2828 movdqa %xmm5,%xmm6 2829 pmuludq 128(%edx),%xmm5 2830 paddq %xmm7,%xmm0 2831 movdqa %xmm6,%xmm7 2832 pmuludq 16(%edx),%xmm6 2833 paddq %xmm5,%xmm1 2834 movdqa 48(%eax),%xmm5 2835 pmuludq 32(%edx),%xmm7 2836 paddq %xmm6,%xmm3 2837 movdqa %xmm5,%xmm6 2838 pmuludq 96(%edx),%xmm5 2839 paddq %xmm7,%xmm4 2840 movdqa %xmm6,%xmm7 2841 pmuludq 112(%edx),%xmm6 2842 paddq %xmm5,%xmm0 2843 movdqa %xmm7,%xmm5 2844 pmuludq 128(%edx),%xmm7 2845 paddq %xmm6,%xmm1 2846 movdqa 64(%eax),%xmm6 2847 pmuludq 16(%edx),%xmm5 2848 paddq %xmm7,%xmm2 2849 movdqa %xmm6,%xmm7 2850 pmuludq 128(%edx),%xmm6 2851 paddq %xmm5,%xmm4 2852 movdqa %xmm7,%xmm5 2853 pmuludq 80(%edx),%xmm7 2854 paddq %xmm6,%xmm3 2855 movdqa %xmm5,%xmm6 2856 pmuludq 96(%edx),%xmm5 2857 paddq %xmm7,%xmm0 2858 movdqa 64(%ebx),%xmm7 2859 pmuludq 112(%edx),%xmm6 2860 paddq %xmm5,%xmm1 2861 paddq %xmm6,%xmm2 2862 movdqa %xmm3,%xmm5 2863 pand %xmm7,%xmm3 2864 psrlq $26,%xmm5 2865 paddq %xmm4,%xmm5 2866 movdqa %xmm0,%xmm6 2867 pand %xmm7,%xmm0 2868 psrlq $26,%xmm6 2869 movdqa %xmm5,%xmm4 2870 paddq %xmm1,%xmm6 2871 psrlq $26,%xmm5 2872 pand %xmm7,%xmm4 2873 movdqa %xmm6,%xmm1 2874 psrlq $26,%xmm6 2875 paddd %xmm5,%xmm0 2876 psllq $2,%xmm5 2877 paddq %xmm2,%xmm6 2878 paddq %xmm0,%xmm5 2879 pand %xmm7,%xmm1 2880 movdqa %xmm6,%xmm2 2881 psrlq $26,%xmm6 2882 
pand %xmm7,%xmm2 2883 paddd %xmm3,%xmm6 2884 movdqa %xmm5,%xmm0 2885 psrlq $26,%xmm5 2886 movdqa %xmm6,%xmm3 2887 psrlq $26,%xmm6 2888 pand %xmm7,%xmm0 2889 paddd %xmm5,%xmm1 2890 pand %xmm7,%xmm3 2891 paddd %xmm6,%xmm4 2892 movdqu 32(%esi),%xmm5 2893 movdqu 48(%esi),%xmm6 2894 leal 32(%esi),%esi 2895 movdqa %xmm2,112(%esp) 2896 movdqa %xmm3,128(%esp) 2897 movdqa %xmm4,144(%esp) 2898 movdqa %xmm5,%xmm2 2899 movdqa %xmm6,%xmm3 2900 psrldq $6,%xmm2 2901 psrldq $6,%xmm3 2902 movdqa %xmm5,%xmm4 2903 punpcklqdq %xmm3,%xmm2 2904 punpckhqdq %xmm6,%xmm4 2905 punpcklqdq %xmm6,%xmm5 2906 movdqa %xmm2,%xmm3 2907 psrlq $4,%xmm2 2908 psrlq $30,%xmm3 2909 movdqa %xmm5,%xmm6 2910 psrlq $40,%xmm4 2911 psrlq $26,%xmm6 2912 pand %xmm7,%xmm5 2913 pand %xmm7,%xmm6 2914 pand %xmm7,%xmm2 2915 pand %xmm7,%xmm3 2916 por (%ebx),%xmm4 2917 movdqa %xmm0,80(%esp) 2918 movdqa %xmm1,96(%esp) 2919 ja .L015loop 2920.L014skip_loop: 2921 pshufd $16,-144(%edx),%xmm7 2922 addl $32,%ecx 2923 jnz .L016long_tail 2924 paddd %xmm0,%xmm5 2925 paddd %xmm1,%xmm6 2926 paddd 112(%esp),%xmm2 2927 paddd 128(%esp),%xmm3 2928 paddd 144(%esp),%xmm4 2929.L016long_tail: 2930 movdqa %xmm5,(%eax) 2931 movdqa %xmm6,16(%eax) 2932 movdqa %xmm2,32(%eax) 2933 movdqa %xmm3,48(%eax) 2934 movdqa %xmm4,64(%eax) 2935 pmuludq %xmm7,%xmm5 2936 pmuludq %xmm7,%xmm6 2937 pmuludq %xmm7,%xmm2 2938 movdqa %xmm5,%xmm0 2939 pshufd $16,-128(%edx),%xmm5 2940 pmuludq %xmm7,%xmm3 2941 movdqa %xmm6,%xmm1 2942 pmuludq %xmm7,%xmm4 2943 movdqa %xmm5,%xmm6 2944 pmuludq 48(%eax),%xmm5 2945 movdqa %xmm6,%xmm7 2946 pmuludq 32(%eax),%xmm6 2947 paddq %xmm5,%xmm4 2948 movdqa %xmm7,%xmm5 2949 pmuludq 16(%eax),%xmm7 2950 paddq %xmm6,%xmm3 2951 pshufd $16,-64(%edx),%xmm6 2952 pmuludq (%eax),%xmm5 2953 paddq %xmm7,%xmm2 2954 pmuludq 64(%eax),%xmm6 2955 pshufd $16,-112(%edx),%xmm7 2956 paddq %xmm5,%xmm1 2957 movdqa %xmm7,%xmm5 2958 pmuludq 32(%eax),%xmm7 2959 paddq %xmm6,%xmm0 2960 movdqa %xmm5,%xmm6 2961 pmuludq 16(%eax),%xmm5 2962 paddq %xmm7,%xmm4 2963 
pshufd $16,-48(%edx),%xmm7 2964 pmuludq (%eax),%xmm6 2965 paddq %xmm5,%xmm3 2966 movdqa %xmm7,%xmm5 2967 pmuludq 64(%eax),%xmm7 2968 paddq %xmm6,%xmm2 2969 pmuludq 48(%eax),%xmm5 2970 pshufd $16,-96(%edx),%xmm6 2971 paddq %xmm7,%xmm1 2972 movdqa %xmm6,%xmm7 2973 pmuludq 16(%eax),%xmm6 2974 paddq %xmm5,%xmm0 2975 pshufd $16,-32(%edx),%xmm5 2976 pmuludq (%eax),%xmm7 2977 paddq %xmm6,%xmm4 2978 movdqa %xmm5,%xmm6 2979 pmuludq 64(%eax),%xmm5 2980 paddq %xmm7,%xmm3 2981 movdqa %xmm6,%xmm7 2982 pmuludq 48(%eax),%xmm6 2983 paddq %xmm5,%xmm2 2984 pmuludq 32(%eax),%xmm7 2985 pshufd $16,-80(%edx),%xmm5 2986 paddq %xmm6,%xmm1 2987 pshufd $16,-16(%edx),%xmm6 2988 pmuludq (%eax),%xmm5 2989 paddq %xmm7,%xmm0 2990 movdqa %xmm6,%xmm7 2991 pmuludq 64(%eax),%xmm6 2992 paddq %xmm5,%xmm4 2993 movdqa %xmm7,%xmm5 2994 pmuludq 16(%eax),%xmm7 2995 paddq %xmm6,%xmm3 2996 movdqa %xmm5,%xmm6 2997 pmuludq 32(%eax),%xmm5 2998 paddq %xmm7,%xmm0 2999 pmuludq 48(%eax),%xmm6 3000 movdqa 64(%ebx),%xmm7 3001 paddq %xmm5,%xmm1 3002 paddq %xmm6,%xmm2 3003 jz .L017short_tail 3004 movdqu -32(%esi),%xmm5 3005 movdqu -16(%esi),%xmm6 3006 leal 32(%esi),%esi 3007 movdqa %xmm2,32(%esp) 3008 movdqa %xmm3,48(%esp) 3009 movdqa %xmm4,64(%esp) 3010 movdqa %xmm5,%xmm2 3011 movdqa %xmm6,%xmm3 3012 psrldq $6,%xmm2 3013 psrldq $6,%xmm3 3014 movdqa %xmm5,%xmm4 3015 punpcklqdq %xmm3,%xmm2 3016 punpckhqdq %xmm6,%xmm4 3017 punpcklqdq %xmm6,%xmm5 3018 movdqa %xmm2,%xmm3 3019 psrlq $4,%xmm2 3020 psrlq $30,%xmm3 3021 movdqa %xmm5,%xmm6 3022 psrlq $40,%xmm4 3023 psrlq $26,%xmm6 3024 pand %xmm7,%xmm5 3025 pand %xmm7,%xmm6 3026 pand %xmm7,%xmm2 3027 pand %xmm7,%xmm3 3028 por (%ebx),%xmm4 3029 pshufd $16,(%edx),%xmm7 3030 paddd 80(%esp),%xmm5 3031 paddd 96(%esp),%xmm6 3032 paddd 112(%esp),%xmm2 3033 paddd 128(%esp),%xmm3 3034 paddd 144(%esp),%xmm4 3035 movdqa %xmm5,(%esp) 3036 pmuludq %xmm7,%xmm5 3037 movdqa %xmm6,16(%esp) 3038 pmuludq %xmm7,%xmm6 3039 paddq %xmm5,%xmm0 3040 movdqa %xmm2,%xmm5 3041 pmuludq %xmm7,%xmm2 3042 
paddq %xmm6,%xmm1 3043 movdqa %xmm3,%xmm6 3044 pmuludq %xmm7,%xmm3 3045 paddq 32(%esp),%xmm2 3046 movdqa %xmm5,32(%esp) 3047 pshufd $16,16(%edx),%xmm5 3048 paddq 48(%esp),%xmm3 3049 movdqa %xmm6,48(%esp) 3050 movdqa %xmm4,%xmm6 3051 pmuludq %xmm7,%xmm4 3052 paddq 64(%esp),%xmm4 3053 movdqa %xmm6,64(%esp) 3054 movdqa %xmm5,%xmm6 3055 pmuludq 48(%esp),%xmm5 3056 movdqa %xmm6,%xmm7 3057 pmuludq 32(%esp),%xmm6 3058 paddq %xmm5,%xmm4 3059 movdqa %xmm7,%xmm5 3060 pmuludq 16(%esp),%xmm7 3061 paddq %xmm6,%xmm3 3062 pshufd $16,80(%edx),%xmm6 3063 pmuludq (%esp),%xmm5 3064 paddq %xmm7,%xmm2 3065 pmuludq 64(%esp),%xmm6 3066 pshufd $16,32(%edx),%xmm7 3067 paddq %xmm5,%xmm1 3068 movdqa %xmm7,%xmm5 3069 pmuludq 32(%esp),%xmm7 3070 paddq %xmm6,%xmm0 3071 movdqa %xmm5,%xmm6 3072 pmuludq 16(%esp),%xmm5 3073 paddq %xmm7,%xmm4 3074 pshufd $16,96(%edx),%xmm7 3075 pmuludq (%esp),%xmm6 3076 paddq %xmm5,%xmm3 3077 movdqa %xmm7,%xmm5 3078 pmuludq 64(%esp),%xmm7 3079 paddq %xmm6,%xmm2 3080 pmuludq 48(%esp),%xmm5 3081 pshufd $16,48(%edx),%xmm6 3082 paddq %xmm7,%xmm1 3083 movdqa %xmm6,%xmm7 3084 pmuludq 16(%esp),%xmm6 3085 paddq %xmm5,%xmm0 3086 pshufd $16,112(%edx),%xmm5 3087 pmuludq (%esp),%xmm7 3088 paddq %xmm6,%xmm4 3089 movdqa %xmm5,%xmm6 3090 pmuludq 64(%esp),%xmm5 3091 paddq %xmm7,%xmm3 3092 movdqa %xmm6,%xmm7 3093 pmuludq 48(%esp),%xmm6 3094 paddq %xmm5,%xmm2 3095 pmuludq 32(%esp),%xmm7 3096 pshufd $16,64(%edx),%xmm5 3097 paddq %xmm6,%xmm1 3098 pshufd $16,128(%edx),%xmm6 3099 pmuludq (%esp),%xmm5 3100 paddq %xmm7,%xmm0 3101 movdqa %xmm6,%xmm7 3102 pmuludq 64(%esp),%xmm6 3103 paddq %xmm5,%xmm4 3104 movdqa %xmm7,%xmm5 3105 pmuludq 16(%esp),%xmm7 3106 paddq %xmm6,%xmm3 3107 movdqa %xmm5,%xmm6 3108 pmuludq 32(%esp),%xmm5 3109 paddq %xmm7,%xmm0 3110 pmuludq 48(%esp),%xmm6 3111 movdqa 64(%ebx),%xmm7 3112 paddq %xmm5,%xmm1 3113 paddq %xmm6,%xmm2 3114.L017short_tail: 3115 pshufd $78,%xmm4,%xmm6 3116 pshufd $78,%xmm3,%xmm5 3117 paddq %xmm6,%xmm4 3118 paddq %xmm5,%xmm3 3119 pshufd 
$78,%xmm0,%xmm6 3120 pshufd $78,%xmm1,%xmm5 3121 paddq %xmm6,%xmm0 3122 paddq %xmm5,%xmm1 3123 pshufd $78,%xmm2,%xmm6 3124 movdqa %xmm3,%xmm5 3125 pand %xmm7,%xmm3 3126 psrlq $26,%xmm5 3127 paddq %xmm6,%xmm2 3128 paddq %xmm4,%xmm5 3129 movdqa %xmm0,%xmm6 3130 pand %xmm7,%xmm0 3131 psrlq $26,%xmm6 3132 movdqa %xmm5,%xmm4 3133 paddq %xmm1,%xmm6 3134 psrlq $26,%xmm5 3135 pand %xmm7,%xmm4 3136 movdqa %xmm6,%xmm1 3137 psrlq $26,%xmm6 3138 paddd %xmm5,%xmm0 3139 psllq $2,%xmm5 3140 paddq %xmm2,%xmm6 3141 paddq %xmm0,%xmm5 3142 pand %xmm7,%xmm1 3143 movdqa %xmm6,%xmm2 3144 psrlq $26,%xmm6 3145 pand %xmm7,%xmm2 3146 paddd %xmm3,%xmm6 3147 movdqa %xmm5,%xmm0 3148 psrlq $26,%xmm5 3149 movdqa %xmm6,%xmm3 3150 psrlq $26,%xmm6 3151 pand %xmm7,%xmm0 3152 paddd %xmm5,%xmm1 3153 pand %xmm7,%xmm3 3154 paddd %xmm6,%xmm4 /* .L013done: store the low dword of xmm0-xmm4 at -48(%edi)..-32(%edi), then restore esp from ebp */ 3155.L013done: 3156 movd %xmm0,-48(%edi) 3157 movd %xmm1,-44(%edi) 3158 movd %xmm2,-40(%edi) 3159 movd %xmm3,-36(%edi) 3160 movd %xmm4,-32(%edi) 3161 movl %ebp,%esp 3162.L007nodata: 3163 popl %edi 3164 popl %esi 3165 popl %ebx 3166 popl %ebp 3167 ret 3168.size _poly1305_blocks_sse2,.-_poly1305_blocks_sse2 3169.align 32 /* _poly1305_emit_sse2 - finalisation routine whose address poly1305_init stores in the second slot of the table passed as its third argument, on the SSE2 path. cdecl stack arguments (offsets after the four pushes): 20(%esp) = state pointer, 24(%esp) = pointer to the 16-byte output, 28(%esp) = pointer to four 32-bit words that are added to the result. When the dword at 20(state) is zero it branches to .Lenter_emit, a label defined outside this chunk. Otherwise state+0..16 is read as five limbs, repacked into 32-bit words, reduced and written out. ebp, ebx, esi, edi are saved and restored. eax, ecx, edx, xmm0-xmm3 and the flags are clobbered. */ 3170.type _poly1305_emit_sse2,@function 3171.align 16 3172_poly1305_emit_sse2: 3173 pushl %ebp 3174 pushl %ebx 3175 pushl %esi 3176 pushl %edi 3177 movl 20(%esp),%ebp 3178 cmpl $0,20(%ebp) 3179 je .Lenter_emit /* load limbs h0..h4 */ 3180 movl (%ebp),%eax 3181 movl 4(%ebp),%edi 3182 movl 8(%ebp),%ecx 3183 movl 12(%ebp),%edx 3184 movl 16(%ebp),%esi /* repack: limb k contributes at bit 26k, so limb 1 is split 26 left and 6 right, limb 2 20 and 12, limb 3 14 and 18, limb 4 8 and 24, with adcl carrying between the 32-bit words */ 3185 movl %edi,%ebx 3186 shll $26,%edi 3187 shrl $6,%ebx 3188 addl %edi,%eax 3189 movl %ecx,%edi 3190 adcl $0,%ebx 3191 shll $20,%edi 3192 shrl $12,%ecx 3193 addl %edi,%ebx 3194 movl %edx,%edi 3195 adcl $0,%ecx 3196 shll $14,%edi 3197 shrl $18,%edx 3198 addl %edi,%ecx 3199 movl %esi,%edi 3200 adcl $0,%edx 3201 shll $8,%edi 3202 shrl $24,%esi 3203 addl %edi,%edx 3204 adcl $0,%esi /* esi now holds bits 128 and above. Keep bits 128-129 in esi and add 5 times the remaining upper bits (lea edi+edi x 4) to the low word. edi and ebp are reloaded with the output and addend pointers along the way */ 3205 movl %esi,%edi 3206 andl $3,%esi 3207 shrl $2,%edi 3208 leal (%edi,%edi,4),%ebp 3209 movl 24(%esp),%edi 3210 addl %ebp,%eax 3211 movl 28(%esp),%ebp
3212 adcl $0,%ebx 3213 adcl $0,%ecx 3214 adcl $0,%edx 3215 adcl $0,%esi /* h is parked in xmm0-xmm3 while h+5 is formed in the integer registers */ 3216 movd %eax,%xmm0 3217 addl $5,%eax 3218 movd %ebx,%xmm1 3219 adcl $0,%ebx 3220 movd %ecx,%xmm2 3221 adcl $0,%ecx 3222 movd %edx,%xmm3 3223 adcl $0,%edx 3224 adcl $0,%esi /* esi>>2 is bit 130 of h+5, negl turns it into a 0 or all-ones mask */ 3225 shrl $2,%esi 3226 negl %esi /* branch-free select: h+5 is masked and parked in the output buffer, h is masked with the complement, then the two are ORed together */ 3227 andl %esi,%eax 3228 andl %esi,%ebx 3229 andl %esi,%ecx 3230 andl %esi,%edx 3231 movl %eax,(%edi) 3232 movd %xmm0,%eax 3233 movl %ebx,4(%edi) 3234 movd %xmm1,%ebx 3235 movl %ecx,8(%edi) 3236 movd %xmm2,%ecx 3237 movl %edx,12(%edi) 3238 movd %xmm3,%edx 3239 notl %esi 3240 andl %esi,%eax 3241 andl %esi,%ebx 3242 orl (%edi),%eax 3243 andl %esi,%ecx 3244 orl 4(%edi),%ebx 3245 andl %esi,%edx 3246 orl 8(%edi),%ecx 3247 orl 12(%edi),%edx /* add the four words at (%ebp) with carry propagation, the carry out of the top word is discarded, and store the 16-byte result */ 3248 addl (%ebp),%eax 3249 adcl 4(%ebp),%ebx 3250 movl %eax,(%edi) 3251 adcl 8(%ebp),%ecx 3252 movl %ebx,4(%edi) 3253 adcl 12(%ebp),%edx 3254 movl %ecx,8(%edi) 3255 movl %edx,12(%edi) 3256 popl %edi 3257 popl %esi 3258 popl %ebx 3259 popl %ebp 3260 ret 3261.size _poly1305_emit_sse2,.-_poly1305_emit_sse2 3262.align 32 /* _poly1305_init_avx2 - local helper called from _poly1305_blocks_avx2. It takes register inputs rather than stack arguments: edi = state pointer, ebx = address of .Lconst_sse2. It splits the 16 bytes at 24(state), where poly1305_init stores the masked key words, into five 26-bit limbs, runs the .L018square multiply-and-reduce body twice (ecx = 2), and writes nine 16-byte table entries to state+48..state+191. ebp is overwritten (it holds the entry esp), as are ecx, edx and xmm0-xmm7. edi is restored before ret. NOTE(review): the two passes appear to yield limbs of the key raised to powers 1 through 4 - confirm against poly1305-x86.pl. */ 3263.type _poly1305_init_avx2,@function 3264.align 16 3265_poly1305_init_avx2: 3266 vmovdqu 24(%edi),%xmm4 3267 leal 48(%edi),%edi 3268 movl %esp,%ebp 3269 subl $224,%esp 3270 andl $-16,%esp /* xmm7 = 26-bit mask from .Lconst_sse2+64, xmm0..xmm4 = the five limbs */ 3271 vmovdqa 64(%ebx),%xmm7 3272 vpand %xmm7,%xmm4,%xmm0 3273 vpsrlq $26,%xmm4,%xmm1 3274 vpsrldq $6,%xmm4,%xmm3 3275 vpand %xmm7,%xmm1,%xmm1 3276 vpsrlq $4,%xmm3,%xmm2 3277 vpsrlq $30,%xmm3,%xmm3 3278 vpand %xmm7,%xmm2,%xmm2 3279 vpand %xmm7,%xmm3,%xmm3 3280 vpsrldq $13,%xmm4,%xmm4 3281 leal 144(%esp),%edx 3282 movl $2,%ecx /* loop body: spill the limbs to (esp)..64(esp) and 5x limbs 1..4 (x<<2 plus x) to 80..128(esp) */ 3283.L018square: 3284 vmovdqa %xmm0,(%esp) 3285 vmovdqa %xmm1,16(%esp) 3286 vmovdqa %xmm2,32(%esp) 3287 vmovdqa %xmm3,48(%esp) 3288 vmovdqa %xmm4,64(%esp) 3289 vpslld $2,%xmm1,%xmm6 3290 vpslld $2,%xmm2,%xmm5 3291 vpaddd %xmm1,%xmm6,%xmm6 3292 vpaddd %xmm2,%xmm5,%xmm5 3293 vmovdqa %xmm6,80(%esp) 3294 vmovdqa %xmm5,96(%esp) 3295 vpslld $2,%xmm3,%xmm6 3296 vpslld $2,%xmm4,%xmm5 3297 vpaddd %xmm3,%xmm6,%xmm6 3298 vpaddd %xmm4,%xmm5,%xmm5 3299
vmovdqa %xmm6,112(%esp) 3300 vmovdqa %xmm5,128(%esp) /* interior of _poly1305_init_avx2, inside the .L018square body. vpshufd $68 copies the low qword into both qword lanes, and the broadcast limbs are stored at (edx)..64(edx) */ 3301 vpshufd $68,%xmm0,%xmm5 3302 vmovdqa %xmm1,%xmm6 3303 vpshufd $68,%xmm1,%xmm1 3304 vpshufd $68,%xmm2,%xmm2 3305 vpshufd $68,%xmm3,%xmm3 3306 vpshufd $68,%xmm4,%xmm4 3307 vmovdqa %xmm5,(%edx) 3308 vmovdqa %xmm1,16(%edx) 3309 vmovdqa %xmm2,32(%edx) 3310 vmovdqa %xmm3,48(%edx) 3311 vmovdqa %xmm4,64(%edx) /* limb-by-limb product: each of the five result limbs xmm0..xmm4 accumulates five partial products, taking the 5x copies from 80..128(esp) for the terms that wrap past limb 4 */ 3312 vpmuludq %xmm0,%xmm4,%xmm4 3313 vpmuludq %xmm0,%xmm3,%xmm3 3314 vpmuludq %xmm0,%xmm2,%xmm2 3315 vpmuludq %xmm0,%xmm1,%xmm1 3316 vpmuludq %xmm0,%xmm5,%xmm0 3317 vpmuludq 48(%edx),%xmm6,%xmm5 3318 vpaddq %xmm5,%xmm4,%xmm4 3319 vpmuludq 32(%edx),%xmm6,%xmm7 3320 vpaddq %xmm7,%xmm3,%xmm3 3321 vpmuludq 16(%edx),%xmm6,%xmm5 3322 vpaddq %xmm5,%xmm2,%xmm2 3323 vmovdqa 80(%esp),%xmm7 3324 vpmuludq (%edx),%xmm6,%xmm6 3325 vpaddq %xmm6,%xmm1,%xmm1 3326 vmovdqa 32(%esp),%xmm5 3327 vpmuludq 64(%edx),%xmm7,%xmm7 3328 vpaddq %xmm7,%xmm0,%xmm0 3329 vpmuludq 32(%edx),%xmm5,%xmm6 3330 vpaddq %xmm6,%xmm4,%xmm4 3331 vpmuludq 16(%edx),%xmm5,%xmm7 3332 vpaddq %xmm7,%xmm3,%xmm3 3333 vmovdqa 96(%esp),%xmm6 3334 vpmuludq (%edx),%xmm5,%xmm5 3335 vpaddq %xmm5,%xmm2,%xmm2 3336 vpmuludq 64(%edx),%xmm6,%xmm7 3337 vpaddq %xmm7,%xmm1,%xmm1 3338 vmovdqa 48(%esp),%xmm5 3339 vpmuludq 48(%edx),%xmm6,%xmm6 3340 vpaddq %xmm6,%xmm0,%xmm0 3341 vpmuludq 16(%edx),%xmm5,%xmm7 3342 vpaddq %xmm7,%xmm4,%xmm4 3343 vmovdqa 112(%esp),%xmm6 3344 vpmuludq (%edx),%xmm5,%xmm5 3345 vpaddq %xmm5,%xmm3,%xmm3 3346 vpmuludq 64(%edx),%xmm6,%xmm7 3347 vpaddq %xmm7,%xmm2,%xmm2 3348 vpmuludq 48(%edx),%xmm6,%xmm5 3349 vpaddq %xmm5,%xmm1,%xmm1 3350 vmovdqa 64(%esp),%xmm7 3351 vpmuludq 32(%edx),%xmm6,%xmm6 3352 vpaddq %xmm6,%xmm0,%xmm0 3353 vmovdqa 128(%esp),%xmm5 3354 vpmuludq (%edx),%xmm7,%xmm7 3355 vpaddq %xmm7,%xmm4,%xmm4 3356 vpmuludq 64(%edx),%xmm5,%xmm6 3357 vpaddq %xmm6,%xmm3,%xmm3 3358 vpmuludq 16(%edx),%xmm5,%xmm7 3359 vpaddq %xmm7,%xmm0,%xmm0 3360 vpmuludq 32(%edx),%xmm5,%xmm6 3361 vpaddq %xmm6,%xmm1,%xmm1 3362 vmovdqa 64(%ebx),%xmm7 3363 vpmuludq
48(%edx),%xmm5,%xmm5 3364 vpaddq %xmm5,%xmm2,%xmm2 /* carry propagation with the 26-bit mask in xmm7: each limb keeps its low 26 bits and hands the rest to the next limb. The carry out of limb 4 is added to limb 0 once directly and once shifted left by 2, that is five times in total */ 3365 vpsrlq $26,%xmm3,%xmm5 3366 vpand %xmm7,%xmm3,%xmm3 3367 vpsrlq $26,%xmm0,%xmm6 3368 vpand %xmm7,%xmm0,%xmm0 3369 vpaddq %xmm5,%xmm4,%xmm4 3370 vpaddq %xmm6,%xmm1,%xmm1 3371 vpsrlq $26,%xmm4,%xmm5 3372 vpand %xmm7,%xmm4,%xmm4 3373 vpsrlq $26,%xmm1,%xmm6 3374 vpand %xmm7,%xmm1,%xmm1 3375 vpaddq %xmm6,%xmm2,%xmm2 3376 vpaddd %xmm5,%xmm0,%xmm0 3377 vpsllq $2,%xmm5,%xmm5 3378 vpsrlq $26,%xmm2,%xmm6 3379 vpand %xmm7,%xmm2,%xmm2 3380 vpaddd %xmm5,%xmm0,%xmm0 3381 vpaddd %xmm6,%xmm3,%xmm3 3382 vpsrlq $26,%xmm3,%xmm6 3383 vpsrlq $26,%xmm0,%xmm5 3384 vpand %xmm7,%xmm0,%xmm0 3385 vpand %xmm7,%xmm3,%xmm3 3386 vpaddd %xmm5,%xmm1,%xmm1 3387 vpaddd %xmm6,%xmm4,%xmm4 /* first pass only: pair the new limbs (low qword) with the spilled input limbs (high qword) and run the body again */ 3388 decl %ecx 3389 jz .L019square_break 3390 vpunpcklqdq (%esp),%xmm0,%xmm0 3391 vpunpcklqdq 16(%esp),%xmm1,%xmm1 3392 vpunpcklqdq 32(%esp),%xmm2,%xmm2 3393 vpunpcklqdq 48(%esp),%xmm3,%xmm3 3394 vpunpcklqdq 64(%esp),%xmm4,%xmm4 3395 jmp .L018square /* second pass done: move the new limbs into the upper dword of each qword, OR in the spilled limbs, reorder with vpshufd $141, then store the five limb vectors at (edi)..64(edi) and 5x copies of limbs 1..4 at 80..128(edi) */ 3396.L019square_break: 3397 vpsllq $32,%xmm0,%xmm0 3398 vpsllq $32,%xmm1,%xmm1 3399 vpsllq $32,%xmm2,%xmm2 3400 vpsllq $32,%xmm3,%xmm3 3401 vpsllq $32,%xmm4,%xmm4 3402 vpor (%esp),%xmm0,%xmm0 3403 vpor 16(%esp),%xmm1,%xmm1 3404 vpor 32(%esp),%xmm2,%xmm2 3405 vpor 48(%esp),%xmm3,%xmm3 3406 vpor 64(%esp),%xmm4,%xmm4 3407 vpshufd $141,%xmm0,%xmm0 3408 vpshufd $141,%xmm1,%xmm1 3409 vpshufd $141,%xmm2,%xmm2 3410 vpshufd $141,%xmm3,%xmm3 3411 vpshufd $141,%xmm4,%xmm4 3412 vmovdqu %xmm0,(%edi) 3413 vmovdqu %xmm1,16(%edi) 3414 vmovdqu %xmm2,32(%edi) 3415 vmovdqu %xmm3,48(%edi) 3416 vmovdqu %xmm4,64(%edi) 3417 vpslld $2,%xmm1,%xmm6 3418 vpslld $2,%xmm2,%xmm5 3419 vpaddd %xmm1,%xmm6,%xmm6 3420 vpaddd %xmm2,%xmm5,%xmm5 3421 vmovdqu %xmm6,80(%edi) 3422 vmovdqu %xmm5,96(%edi) 3423 vpslld $2,%xmm3,%xmm6 3424 vpslld $2,%xmm4,%xmm5 3425 vpaddd %xmm3,%xmm6,%xmm6 3426 vpaddd %xmm4,%xmm5,%xmm5 3427 vmovdqu %xmm6,112(%edi) 3428 vmovdqu %xmm5,128(%edi) /* drop the scratch frame and undo the +48 bias on edi */ 3429 movl %ebp,%esp 3430 leal -48(%edi),%edi 3431 ret 3432.size
_poly1305_init_avx2,.-_poly1305_init_avx2 3433.align 32 /* _poly1305_blocks_avx2 - block-processing routine whose address poly1305_init stores in the first table slot when bit 5 of the third OPENSSL_ia32cap_P word is set. cdecl stack arguments (offsets after the four pushes): 20(%esp) = state pointer, 24(%esp) = input pointer, 28(%esp) = length in bytes, 32(%esp) = fourth argument, which is negated and used only in the single-block path to choose the pad vector (presumably the padbit flag - confirm). The length is rounded down to a multiple of 16 and a zero result returns at once. Fewer than 64 bytes while the dword at 20(state) is still zero is handed to .Lenter_blocks inside poly1305_blocks. ebp, ebx, esi, edi are saved and restored. */ 3434.type _poly1305_blocks_avx2,@function 3435.align 16 3436_poly1305_blocks_avx2: 3437 pushl %ebp 3438 pushl %ebx 3439 pushl %esi 3440 pushl %edi 3441 movl 20(%esp),%edi 3442 movl 24(%esp),%esi 3443 movl 28(%esp),%ecx 3444 movl 20(%edi),%eax 3445 andl $-16,%ecx 3446 jz .L020nodata 3447 cmpl $64,%ecx 3448 jae .L021enter_avx2 3449 testl %eax,%eax 3450 jz .Lenter_blocks /* vector path: ebx = address of .Lconst_sse2, computed relative to the return address popped after call .L022pic_point */ 3451.L021enter_avx2: 3452 vzeroupper 3453 call .L022pic_point 3454.L022pic_point: 3455 popl %ebx 3456 leal .Lconst_sse2-.L022pic_point(%ebx),%ebx 3457 testl %eax,%eax 3458 jnz .L023base2_26 /* flag still zero: build the table with _poly1305_init_avx2, then re-encode the accumulator at state+0..16 from 32-bit words into 26-bit limbs using unaligned loads at byte offsets 0, 3, 6, 9, 13 with right shifts 0, 2, 4, 6, 0, and set the flag at 20(state) to 1 */ 3459 call _poly1305_init_avx2 3460 movl (%edi),%eax 3461 movl 3(%edi),%ecx 3462 movl 6(%edi),%edx 3463 movl 9(%edi),%esi 3464 movl 13(%edi),%ebp 3465 shrl $2,%ecx 3466 andl $67108863,%eax 3467 shrl $4,%edx 3468 andl $67108863,%ecx 3469 shrl $6,%esi 3470 andl $67108863,%edx 3471 movl %eax,(%edi) 3472 movl %ecx,4(%edi) 3473 movl %edx,8(%edi) 3474 movl %esi,12(%edi) 3475 movl %ebp,16(%edi) 3476 movl $1,20(%edi) /* reload the input pointer and length, both used as scratch above. NOTE(review): this reloads the raw third argument, without the andl $-16 rounding applied at entry - presumably callers always pass a multiple of 16, confirm */ 3477 movl 24(%esp),%esi 3478 movl 28(%esp),%ecx /* eax = fourth argument. Build a 512-byte-aligned frame with the entry esp kept in ebp. edx = frame+288 is the base used for the expanded table */ 3479.L023base2_26: 3480 movl 32(%esp),%eax 3481 movl %esp,%ebp 3482 subl $448,%esp 3483 andl $-512,%esp /* copy the nine 16-byte table entries from state+48 onward into 32-byte slots at -128(edx)..128(edx), rearranged by vpermq $64 and vpshufd $200. edi is advanced to state+48 on the way */ 3484 vmovdqu 48(%edi),%xmm0 3485 leal 288(%esp),%edx 3486 vmovdqu 64(%edi),%xmm1 3487 vmovdqu 80(%edi),%xmm2 3488 vmovdqu 96(%edi),%xmm3 3489 vmovdqu 112(%edi),%xmm4 3490 leal 48(%edi),%edi 3491 vpermq $64,%ymm0,%ymm0 3492 vpermq $64,%ymm1,%ymm1 3493 vpermq $64,%ymm2,%ymm2 3494 vpermq $64,%ymm3,%ymm3 3495 vpermq $64,%ymm4,%ymm4 3496 vpshufd $200,%ymm0,%ymm0 3497 vpshufd $200,%ymm1,%ymm1 3498 vpshufd $200,%ymm2,%ymm2 3499 vpshufd $200,%ymm3,%ymm3 3500 vpshufd $200,%ymm4,%ymm4 3501 vmovdqa %ymm0,-128(%edx) 3502 vmovdqu 80(%edi),%xmm0 3503 vmovdqa %ymm1,-96(%edx) 3504 vmovdqu 96(%edi),%xmm1 3505 vmovdqa %ymm2,-64(%edx) 3506 vmovdqu 112(%edi),%xmm2 3507 vmovdqa %ymm3,-32(%edx) 3508 vmovdqu 128(%edi),%xmm3 3509 vmovdqa %ymm4,(%edx) 3510 vpermq $64,%ymm0,%ymm0 3511 vpermq $64,%ymm1,%ymm1 3512 vpermq
$64,%ymm2,%ymm2 3513 vpermq $64,%ymm3,%ymm3 3514 vpshufd $200,%ymm0,%ymm0 3515 vpshufd $200,%ymm1,%ymm1 3516 vpshufd $200,%ymm2,%ymm2 3517 vpshufd $200,%ymm3,%ymm3 3518 vmovdqa %ymm0,32(%edx) /* interleaved with the remaining table stores: load accumulator limbs h0..h4 from state+0..16, addressed as -48(edi)..-32(edi) */ 3519 vmovd -48(%edi),%xmm0 3520 vmovdqa %ymm1,64(%edx) 3521 vmovd -44(%edi),%xmm1 3522 vmovdqa %ymm2,96(%edx) 3523 vmovd -40(%edi),%xmm2 3524 vmovdqa %ymm3,128(%edx) 3525 vmovd -36(%edi),%xmm3 3526 vmovd -32(%edi),%xmm4 /* ymm7 = 26-bit mask, eax = minus the fourth argument */ 3527 vmovdqa 64(%ebx),%ymm7 3528 negl %eax /* length a multiple of 64: go straight to .L024even. Otherwise the first 1, 2 or 3 blocks (len mod 64 bytes) go through the tail code first, with ebx biased so that only that many lanes pick up the 2^24 constant and edx biased by 24, 16 or 8 bytes so the tail reads a shifted window of the table */ 3529 testl $63,%ecx 3530 jz .L024even 3531 movl %ecx,%edx 3532 andl $-64,%ecx 3533 andl $63,%edx 3534 vmovdqu (%esi),%xmm5 3535 cmpl $32,%edx 3536 jb .L025one 3537 vmovdqu 16(%esi),%xmm6 3538 je .L026two 3539 vinserti128 $1,32(%esi),%ymm5,%ymm5 3540 leal 48(%esi),%esi 3541 leal 8(%ebx),%ebx 3542 leal 296(%esp),%edx 3543 jmp .L027tail 3544.L026two: 3545 leal 32(%esi),%esi 3546 leal 16(%ebx),%ebx 3547 leal 304(%esp),%edx 3548 jmp .L027tail /* single block: ymm6 is zeroed and ebx = const + 32 - 8 x arg4, so the load at (%ebx) starts on the last 2^24 entry only when arg4 is 1 */ 3549.L025one: 3550 leal 16(%esi),%esi 3551 vpxor %ymm6,%ymm6,%ymm6 3552 leal 32(%ebx,%eax,8),%ebx 3553 leal 312(%esp),%edx 3554 jmp .L027tail /* .L024even: load four blocks, blocks 0 and 2 into ymm5 and blocks 1 and 3 into ymm6 */ 3555.align 32 3556.L024even: 3557 vmovdqu (%esi),%xmm5 3558 vmovdqu 16(%esi),%xmm6 3559 vinserti128 $1,32(%esi),%ymm5,%ymm5 3560 vinserti128 $1,48(%esi),%ymm6,%ymm6 3561 leal 64(%esi),%esi 3562 subl $64,%ecx 3563 jz .L027tail /* main loop, 64 bytes per pass: spill h0..h2, split the input into 26-bit limbs, OR the constant at (%ebx) into the top limb, add the accumulator, multiply by the table, propagate carries, load the next 64 bytes */ 3564.L028loop: 3565 vmovdqa %ymm2,64(%esp) 3566 vpsrldq $6,%ymm5,%ymm2 3567 vmovdqa %ymm0,(%esp) 3568 vpsrldq $6,%ymm6,%ymm0 3569 vmovdqa %ymm1,32(%esp) 3570 vpunpckhqdq %ymm6,%ymm5,%ymm1 3571 vpunpcklqdq %ymm6,%ymm5,%ymm5 3572 vpunpcklqdq %ymm0,%ymm2,%ymm2 3573 vpsrlq $30,%ymm2,%ymm0 3574 vpsrlq $4,%ymm2,%ymm2 3575 vpsrlq $26,%ymm5,%ymm6 3576 vpsrlq $40,%ymm1,%ymm1 3577 vpand %ymm7,%ymm2,%ymm2 3578 vpand %ymm7,%ymm5,%ymm5 3579 vpand %ymm7,%ymm6,%ymm6 3580 vpand %ymm7,%ymm0,%ymm0 3581 vpor (%ebx),%ymm1,%ymm1 3582 vpaddq 64(%esp),%ymm2,%ymm2 3583 vpaddq (%esp),%ymm5,%ymm5 3584 vpaddq 32(%esp),%ymm6,%ymm6 3585 vpaddq %ymm3,%ymm0,%ymm0 3586 vpaddq %ymm4,%ymm1,%ymm1 3587 vpmuludq -96(%edx),%ymm2,%ymm3 3588 vmovdqa %ymm6,32(%esp)
3589 vpmuludq -64(%edx),%ymm2,%ymm4 3590 vmovdqa %ymm0,96(%esp) 3591 vpmuludq 96(%edx),%ymm2,%ymm0 3592 vmovdqa %ymm1,128(%esp) 3593 vpmuludq 128(%edx),%ymm2,%ymm1 3594 vpmuludq -128(%edx),%ymm2,%ymm2 3595 vpmuludq -32(%edx),%ymm5,%ymm7 3596 vpaddq %ymm7,%ymm3,%ymm3 3597 vpmuludq (%edx),%ymm5,%ymm6 3598 vpaddq %ymm6,%ymm4,%ymm4 3599 vpmuludq -128(%edx),%ymm5,%ymm7 3600 vpaddq %ymm7,%ymm0,%ymm0 3601 vmovdqa 32(%esp),%ymm7 3602 vpmuludq -96(%edx),%ymm5,%ymm6 3603 vpaddq %ymm6,%ymm1,%ymm1 3604 vpmuludq -64(%edx),%ymm5,%ymm5 3605 vpaddq %ymm5,%ymm2,%ymm2 3606 vpmuludq -64(%edx),%ymm7,%ymm6 3607 vpaddq %ymm6,%ymm3,%ymm3 3608 vpmuludq -32(%edx),%ymm7,%ymm5 3609 vpaddq %ymm5,%ymm4,%ymm4 3610 vpmuludq 128(%edx),%ymm7,%ymm6 3611 vpaddq %ymm6,%ymm0,%ymm0 3612 vmovdqa 96(%esp),%ymm6 3613 vpmuludq -128(%edx),%ymm7,%ymm5 3614 vpaddq %ymm5,%ymm1,%ymm1 3615 vpmuludq -96(%edx),%ymm7,%ymm7 3616 vpaddq %ymm7,%ymm2,%ymm2 3617 vpmuludq -128(%edx),%ymm6,%ymm5 3618 vpaddq %ymm5,%ymm3,%ymm3 3619 vpmuludq -96(%edx),%ymm6,%ymm7 3620 vpaddq %ymm7,%ymm4,%ymm4 3621 vpmuludq 64(%edx),%ymm6,%ymm5 3622 vpaddq %ymm5,%ymm0,%ymm0 3623 vmovdqa 128(%esp),%ymm5 3624 vpmuludq 96(%edx),%ymm6,%ymm7 3625 vpaddq %ymm7,%ymm1,%ymm1 3626 vpmuludq 128(%edx),%ymm6,%ymm6 3627 vpaddq %ymm6,%ymm2,%ymm2 3628 vpmuludq 128(%edx),%ymm5,%ymm7 3629 vpaddq %ymm7,%ymm3,%ymm3 3630 vpmuludq 32(%edx),%ymm5,%ymm6 3631 vpaddq %ymm6,%ymm0,%ymm0 3632 vpmuludq -128(%edx),%ymm5,%ymm7 3633 vpaddq %ymm7,%ymm4,%ymm4 3634 vmovdqa 64(%ebx),%ymm7 3635 vpmuludq 64(%edx),%ymm5,%ymm6 3636 vpaddq %ymm6,%ymm1,%ymm1 3637 vpmuludq 96(%edx),%ymm5,%ymm5 3638 vpaddq %ymm5,%ymm2,%ymm2 /* carry propagation between limbs. The carry out of limb 4 re-enters limb 0 five-fold: added once directly and once shifted left by 2 */ 3639 vpsrlq $26,%ymm3,%ymm5 3640 vpand %ymm7,%ymm3,%ymm3 3641 vpsrlq $26,%ymm0,%ymm6 3642 vpand %ymm7,%ymm0,%ymm0 3643 vpaddq %ymm5,%ymm4,%ymm4 3644 vpaddq %ymm6,%ymm1,%ymm1 3645 vpsrlq $26,%ymm4,%ymm5 3646 vpand %ymm7,%ymm4,%ymm4 3647 vpsrlq $26,%ymm1,%ymm6 3648 vpand %ymm7,%ymm1,%ymm1 3649 vpaddq %ymm6,%ymm2,%ymm2 3650 vpaddq %ymm5,%ymm0,%ymm0 3651 vpsllq
$2,%ymm5,%ymm5 3652 vpsrlq $26,%ymm2,%ymm6 3653 vpand %ymm7,%ymm2,%ymm2 3654 vpaddq %ymm5,%ymm0,%ymm0 3655 vpaddq %ymm6,%ymm3,%ymm3 3656 vpsrlq $26,%ymm3,%ymm6 3657 vpsrlq $26,%ymm0,%ymm5 3658 vpand %ymm7,%ymm0,%ymm0 3659 vpand %ymm7,%ymm3,%ymm3 3660 vpaddq %ymm5,%ymm1,%ymm1 3661 vpaddq %ymm6,%ymm4,%ymm4 /* fetch the next four blocks and loop until the byte count reaches zero */ 3662 vmovdqu (%esi),%xmm5 3663 vmovdqu 16(%esi),%xmm6 3664 vinserti128 $1,32(%esi),%ymm5,%ymm5 3665 vinserti128 $1,48(%esi),%ymm6,%ymm6 3666 leal 64(%esi),%esi 3667 subl $64,%ecx 3668 jnz .L028loop /* .L027tail: same arithmetic as the loop body but every table offset is 4 bytes higher, followed by a sum across the four lanes. Also entered directly for the leading 1-3 blocks */ 3669.L027tail: 3670 vmovdqa %ymm2,64(%esp) 3671 vpsrldq $6,%ymm5,%ymm2 3672 vmovdqa %ymm0,(%esp) 3673 vpsrldq $6,%ymm6,%ymm0 3674 vmovdqa %ymm1,32(%esp) 3675 vpunpckhqdq %ymm6,%ymm5,%ymm1 3676 vpunpcklqdq %ymm6,%ymm5,%ymm5 3677 vpunpcklqdq %ymm0,%ymm2,%ymm2 3678 vpsrlq $30,%ymm2,%ymm0 3679 vpsrlq $4,%ymm2,%ymm2 3680 vpsrlq $26,%ymm5,%ymm6 3681 vpsrlq $40,%ymm1,%ymm1 3682 vpand %ymm7,%ymm2,%ymm2 3683 vpand %ymm7,%ymm5,%ymm5 3684 vpand %ymm7,%ymm6,%ymm6 3685 vpand %ymm7,%ymm0,%ymm0 3686 vpor (%ebx),%ymm1,%ymm1 /* clear any bias so ebx is the 64-byte-aligned .Lconst_sse2 again */ 3687 andl $-64,%ebx 3688 vpaddq 64(%esp),%ymm2,%ymm2 3689 vpaddq (%esp),%ymm5,%ymm5 3690 vpaddq 32(%esp),%ymm6,%ymm6 3691 vpaddq %ymm3,%ymm0,%ymm0 3692 vpaddq %ymm4,%ymm1,%ymm1 3693 vpmuludq -92(%edx),%ymm2,%ymm3 3694 vmovdqa %ymm6,32(%esp) 3695 vpmuludq -60(%edx),%ymm2,%ymm4 3696 vmovdqa %ymm0,96(%esp) 3697 vpmuludq 100(%edx),%ymm2,%ymm0 3698 vmovdqa %ymm1,128(%esp) 3699 vpmuludq 132(%edx),%ymm2,%ymm1 3700 vpmuludq -124(%edx),%ymm2,%ymm2 3701 vpmuludq -28(%edx),%ymm5,%ymm7 3702 vpaddq %ymm7,%ymm3,%ymm3 3703 vpmuludq 4(%edx),%ymm5,%ymm6 3704 vpaddq %ymm6,%ymm4,%ymm4 3705 vpmuludq -124(%edx),%ymm5,%ymm7 3706 vpaddq %ymm7,%ymm0,%ymm0 3707 vmovdqa 32(%esp),%ymm7 3708 vpmuludq -92(%edx),%ymm5,%ymm6 3709 vpaddq %ymm6,%ymm1,%ymm1 3710 vpmuludq -60(%edx),%ymm5,%ymm5 3711 vpaddq %ymm5,%ymm2,%ymm2 3712 vpmuludq -60(%edx),%ymm7,%ymm6 3713 vpaddq %ymm6,%ymm3,%ymm3 3714 vpmuludq -28(%edx),%ymm7,%ymm5 3715 vpaddq %ymm5,%ymm4,%ymm4 3716 vpmuludq 132(%edx),%ymm7,%ymm6 3717 vpaddq
%ymm6,%ymm0,%ymm0 3718 vmovdqa 96(%esp),%ymm6 3719 vpmuludq -124(%edx),%ymm7,%ymm5 3720 vpaddq %ymm5,%ymm1,%ymm1 3721 vpmuludq -92(%edx),%ymm7,%ymm7 3722 vpaddq %ymm7,%ymm2,%ymm2 3723 vpmuludq -124(%edx),%ymm6,%ymm5 3724 vpaddq %ymm5,%ymm3,%ymm3 3725 vpmuludq -92(%edx),%ymm6,%ymm7 3726 vpaddq %ymm7,%ymm4,%ymm4 3727 vpmuludq 68(%edx),%ymm6,%ymm5 3728 vpaddq %ymm5,%ymm0,%ymm0 3729 vmovdqa 128(%esp),%ymm5 3730 vpmuludq 100(%edx),%ymm6,%ymm7 3731 vpaddq %ymm7,%ymm1,%ymm1 3732 vpmuludq 132(%edx),%ymm6,%ymm6 3733 vpaddq %ymm6,%ymm2,%ymm2 3734 vpmuludq 132(%edx),%ymm5,%ymm7 3735 vpaddq %ymm7,%ymm3,%ymm3 3736 vpmuludq 36(%edx),%ymm5,%ymm6 3737 vpaddq %ymm6,%ymm0,%ymm0 3738 vpmuludq -124(%edx),%ymm5,%ymm7 3739 vpaddq %ymm7,%ymm4,%ymm4 3740 vmovdqa 64(%ebx),%ymm7 3741 vpmuludq 68(%edx),%ymm5,%ymm6 3742 vpaddq %ymm6,%ymm1,%ymm1 3743 vpmuludq 100(%edx),%ymm5,%ymm5 3744 vpaddq %ymm5,%ymm2,%ymm2 /* lane sum: vpsrldq $8 folds the high qword of each 128-bit half onto the low one, vpermq $2 then brings qword 2 down to qword 0 */ 3745 vpsrldq $8,%ymm4,%ymm5 3746 vpsrldq $8,%ymm3,%ymm6 3747 vpaddq %ymm5,%ymm4,%ymm4 3748 vpsrldq $8,%ymm0,%ymm5 3749 vpaddq %ymm6,%ymm3,%ymm3 3750 vpsrldq $8,%ymm1,%ymm6 3751 vpaddq %ymm5,%ymm0,%ymm0 3752 vpsrldq $8,%ymm2,%ymm5 3753 vpaddq %ymm6,%ymm1,%ymm1 3754 vpermq $2,%ymm4,%ymm6 3755 vpaddq %ymm5,%ymm2,%ymm2 3756 vpermq $2,%ymm3,%ymm5 3757 vpaddq %ymm6,%ymm4,%ymm4 3758 vpermq $2,%ymm0,%ymm6 3759 vpaddq %ymm5,%ymm3,%ymm3 3760 vpermq $2,%ymm1,%ymm5 3761 vpaddq %ymm6,%ymm0,%ymm0 3762 vpermq $2,%ymm2,%ymm6 3763 vpaddq %ymm5,%ymm1,%ymm1 3764 vpaddq %ymm6,%ymm2,%ymm2 /* carry propagation on the summed limbs */ 3765 vpsrlq $26,%ymm3,%ymm5 3766 vpand %ymm7,%ymm3,%ymm3 3767 vpsrlq $26,%ymm0,%ymm6 3768 vpand %ymm7,%ymm0,%ymm0 3769 vpaddq %ymm5,%ymm4,%ymm4 3770 vpaddq %ymm6,%ymm1,%ymm1 3771 vpsrlq $26,%ymm4,%ymm5 3772 vpand %ymm7,%ymm4,%ymm4 3773 vpsrlq $26,%ymm1,%ymm6 3774 vpand %ymm7,%ymm1,%ymm1 3775 vpaddq %ymm6,%ymm2,%ymm2 3776 vpaddq %ymm5,%ymm0,%ymm0 3777 vpsllq $2,%ymm5,%ymm5 3778 vpsrlq $26,%ymm2,%ymm6 3779 vpand %ymm7,%ymm2,%ymm2 3780 vpaddq %ymm5,%ymm0,%ymm0 3781 vpaddq %ymm6,%ymm3,%ymm3 3782 vpsrlq $26,%ymm3,%ymm6 3783
vpsrlq $26,%ymm0,%ymm5 3784 vpand %ymm7,%ymm0,%ymm0 3785 vpand %ymm7,%ymm3,%ymm3 3786 vpaddq %ymm5,%ymm1,%ymm1 3787 vpaddq %ymm6,%ymm4,%ymm4 /* ecx holds the bytes still to do (a multiple of 64, possibly zero). If nonzero, reshuffle the limb vectors with vpshufd $252, reset edx to the unbiased table base and continue at .L024even */ 3788 cmpl $0,%ecx 3789 je .L029done 3790 vpshufd $252,%xmm0,%xmm0 3791 leal 288(%esp),%edx 3792 vpshufd $252,%xmm1,%xmm1 3793 vpshufd $252,%xmm2,%xmm2 3794 vpshufd $252,%xmm3,%xmm3 3795 vpshufd $252,%xmm4,%xmm4 3796 jmp .L024even /* .L029done: write the low dword of each limb vector back to state+0..16, vzeroupper, release the frame */ 3797.align 16 3798.L029done: 3799 vmovd %xmm0,-48(%edi) 3800 vmovd %xmm1,-44(%edi) 3801 vmovd %xmm2,-40(%edi) 3802 vmovd %xmm3,-36(%edi) 3803 vmovd %xmm4,-32(%edi) 3804 vzeroupper 3805 movl %ebp,%esp 3806.L020nodata: 3807 popl %edi 3808 popl %esi 3809 popl %ebx 3810 popl %ebp 3811 ret 3812.size _poly1305_blocks_avx2,.-_poly1305_blocks_avx2 /* .Lconst_sse2, 64-byte aligned. +0: four qwords of 1<<24. +32: four zero qwords. +64: four qwords of 0x3ffffff, the 26-bit limb mask. +96: 0x0fffffff followed by three 0x0ffffffc, the same values poly1305_init uses to mask the key words. The .byte rows spell an ASCII identification string starting with Poly1305 for x86, CRYPTOGAMS */ 3813.align 64 3814.Lconst_sse2: 3815.long 16777216,0,16777216,0,16777216,0,16777216,0 3816.long 0,0,0,0,0,0,0,0 3817.long 67108863,0,67108863,0,67108863,0,67108863,0 3818.long 268435455,268435452,268435452,268435452 3819.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54 3820.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 3821.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 3822.byte 114,103,62,0 3823.align 4 /* 16-byte common symbol with 4-byte alignment, read by poly1305_init */ 3824.comm OPENSSL_ia32cap_P,16,4 3825#endif 3826