/* Do not modify. This file is auto-generated from rsaz-2k-avxifma.pl. */
.text

# Returns non-zero iff the AVX-IFMA capability bit (bit 23, 0x800000, of
# OPENSSL_ia32cap_P dword 5) is set.
.globl	ossl_rsaz_avxifma_eligible
.type	ossl_rsaz_avxifma_eligible,@function
.align	32
ossl_rsaz_avxifma_eligible:
	movl	OPENSSL_ia32cap_P+20(%rip),%ecx
	xorl	%eax,%eax
	andl	$8388608,%ecx
	cmpl	$8388608,%ecx
	cmovel	%ecx,%eax
	.byte	0xf3,0xc3
.size	ossl_rsaz_avxifma_eligible, .-ossl_rsaz_avxifma_eligible
.text

# Almost Montgomery multiplication with 20x52-bit operands
# (cf. rsaz-2k-avxifma.pl): rdi = res, rsi = a, rdx = b, rcx = m, r8 = k0.
.globl	ossl_rsaz_amm52x20_x1_avxifma256
.type	ossl_rsaz_amm52x20_x1_avxifma256,@function
.align	32
ossl_rsaz_amm52x20_x1_avxifma256:
.cfi_startproc
.byte	243,15,30,250
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lossl_rsaz_amm52x20_x1_avxifma256_body:

	vpxor	%ymm0,%ymm0,%ymm0
	vmovapd	%ymm0,%ymm3
	vmovapd	%ymm0,%ymm5
	vmovapd	%ymm0,%ymm6
	vmovapd	%ymm0,%ymm7
	vmovapd	%ymm0,%ymm8

	xorl	%r9d,%r9d

	movq	%rdx,%r11
	movq	$0xfffffffffffff,%rax

	movl	$5,%ebx

.align	32
.Lloop5:
	movq	0(%r11),%r13

	vpbroadcastq	0(%r11),%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vmovq	%r13,%xmm2
	vpbroadcastq	%xmm2,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	leaq	-168(%rsp),%rsp
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm8

{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm8

# Storing the accumulator and reloading it 8 bytes up shifts it right by
# one 64-bit limb; AVX2 has no valignq.
	vmovdqu	%ymm3,0(%rsp)
	vmovdqu	%ymm5,32(%rsp)
	vmovdqu	%ymm6,64(%rsp)
	vmovdqu	%ymm7,96(%rsp)
	vmovdqu	%ymm8,128(%rsp)
	movq	$0,160(%rsp)

	vmovdqu	8(%rsp),%ymm3
	vmovdqu	40(%rsp),%ymm5
	vmovdqu	72(%rsp),%ymm6
	vmovdqu	104(%rsp),%ymm7
	vmovdqu	136(%rsp),%ymm8

	addq	8(%rsp),%r9

{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm8

{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm8
	leaq	168(%rsp),%rsp
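# The digit step above now repeats, unrolled, for the digits at 8(%r11),
# 16(%r11) and 24(%r11); with %r11 advanced by 32 at the bottom, five
# .Lloop5 passes cover all twenty 64-bit digits of b.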
	movq	8(%r11),%r13

	vpbroadcastq	8(%r11),%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vmovq	%r13,%xmm2
	vpbroadcastq	%xmm2,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	leaq	-168(%rsp),%rsp
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm8

{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm8

	vmovdqu	%ymm3,0(%rsp)
	vmovdqu	%ymm5,32(%rsp)
	vmovdqu	%ymm6,64(%rsp)
	vmovdqu	%ymm7,96(%rsp)
	vmovdqu	%ymm8,128(%rsp)
	movq	$0,160(%rsp)

	vmovdqu	8(%rsp),%ymm3
	vmovdqu	40(%rsp),%ymm5
	vmovdqu	72(%rsp),%ymm6
	vmovdqu	104(%rsp),%ymm7
	vmovdqu	136(%rsp),%ymm8

	addq	8(%rsp),%r9

{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm8

{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm8
	leaq	168(%rsp),%rsp
	movq	16(%r11),%r13

	vpbroadcastq	16(%r11),%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vmovq	%r13,%xmm2
	vpbroadcastq	%xmm2,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	leaq	-168(%rsp),%rsp
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm8

{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm8

	vmovdqu	%ymm3,0(%rsp)
	vmovdqu	%ymm5,32(%rsp)
	vmovdqu	%ymm6,64(%rsp)
	vmovdqu	%ymm7,96(%rsp)
	vmovdqu	%ymm8,128(%rsp)
	movq	$0,160(%rsp)

	vmovdqu	8(%rsp),%ymm3
	vmovdqu	40(%rsp),%ymm5
	vmovdqu	72(%rsp),%ymm6
	vmovdqu	104(%rsp),%ymm7
	vmovdqu	136(%rsp),%ymm8

	addq	8(%rsp),%r9

{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm8

{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm8
	leaq	168(%rsp),%rsp
	movq	24(%r11),%r13

	vpbroadcastq	24(%r11),%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vmovq	%r13,%xmm2
	vpbroadcastq	%xmm2,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	leaq	-168(%rsp),%rsp
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm8

{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm8

	vmovdqu	%ymm3,0(%rsp)
	vmovdqu	%ymm5,32(%rsp)
	vmovdqu	%ymm6,64(%rsp)
	vmovdqu	%ymm7,96(%rsp)
	vmovdqu	%ymm8,128(%rsp)
	movq	$0,160(%rsp)

	vmovdqu	8(%rsp),%ymm3
	vmovdqu	40(%rsp),%ymm5
	vmovdqu	72(%rsp),%ymm6
	vmovdqu	104(%rsp),%ymm7
	vmovdqu	136(%rsp),%ymm8

	addq	8(%rsp),%r9

{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm8

{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm8
	leaq	168(%rsp),%rsp
	leaq	32(%r11),%r11
	decl	%ebx
	jne	.Lloop5
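# Post-loop normalization: blend the scalar carry r9 into limb 0, split
# bits 52 and up out of every limb, move those carries up by one limb
# position (vpermq $144 rotates within a register, vpermq $3 + vblendpd
# carries across registers, .Lhigh64x3 zeroes the nonexistent carry into
# limb 0), then mask each limb to 52 bits and add the carries in.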
	vmovq	%r9,%xmm0
	vpbroadcastq	%xmm0,%ymm0
	vpblendd	$3,%ymm0,%ymm3,%ymm3

	vpsrlq	$52,%ymm3,%ymm0
	vpsrlq	$52,%ymm5,%ymm1
	vpsrlq	$52,%ymm6,%ymm2
	vpsrlq	$52,%ymm7,%ymm13
	vpsrlq	$52,%ymm8,%ymm14

	vpermq	$144,%ymm14,%ymm14
	vpermq	$3,%ymm13,%ymm15
	vblendpd	$1,%ymm15,%ymm14,%ymm14

	vpermq	$144,%ymm13,%ymm13
	vpermq	$3,%ymm2,%ymm15
	vblendpd	$1,%ymm15,%ymm13,%ymm13

	vpermq	$144,%ymm2,%ymm2
	vpermq	$3,%ymm1,%ymm15
	vblendpd	$1,%ymm15,%ymm2,%ymm2

	vpermq	$144,%ymm1,%ymm1
	vpermq	$3,%ymm0,%ymm15
	vblendpd	$1,%ymm15,%ymm1,%ymm1

	vpermq	$144,%ymm0,%ymm0
	vpand	.Lhigh64x3(%rip),%ymm0,%ymm0

	vpand	.Lmask52x4(%rip),%ymm3,%ymm3
	vpand	.Lmask52x4(%rip),%ymm5,%ymm5
	vpand	.Lmask52x4(%rip),%ymm6,%ymm6
	vpand	.Lmask52x4(%rip),%ymm7,%ymm7
	vpand	.Lmask52x4(%rip),%ymm8,%ymm8

	vpaddq	%ymm0,%ymm3,%ymm3
	vpaddq	%ymm1,%ymm5,%ymm5
	vpaddq	%ymm2,%ymm6,%ymm6
	vpaddq	%ymm13,%ymm7,%ymm7
	vpaddq	%ymm14,%ymm8,%ymm8

	vpcmpgtq	.Lmask52x4(%rip),%ymm3,%ymm0
	vpcmpgtq	.Lmask52x4(%rip),%ymm5,%ymm1
	vpcmpgtq	.Lmask52x4(%rip),%ymm6,%ymm2
	vpcmpgtq	.Lmask52x4(%rip),%ymm7,%ymm13
	vpcmpgtq	.Lmask52x4(%rip),%ymm8,%ymm14
	vmovmskpd	%ymm0,%r14d
	vmovmskpd	%ymm1,%r13d
	vmovmskpd	%ymm2,%r12d
	vmovmskpd	%ymm13,%r11d
	vmovmskpd	%ymm14,%r10d

	vpcmpeqq	.Lmask52x4(%rip),%ymm3,%ymm0
	vpcmpeqq	.Lmask52x4(%rip),%ymm5,%ymm1
	vpcmpeqq	.Lmask52x4(%rip),%ymm6,%ymm2
	vpcmpeqq	.Lmask52x4(%rip),%ymm7,%ymm13
	vpcmpeqq	.Lmask52x4(%rip),%ymm8,%ymm14
	vmovmskpd	%ymm0,%r9d
	vmovmskpd	%ymm1,%r8d
	vmovmskpd	%ymm2,%ebx
	vmovmskpd	%ymm13,%ecx
	vmovmskpd	%ymm14,%edx
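# Propagate carries across the 20 limbs using 4-bit lane masks packed into
# bytes: the "limb overflowed" (vpcmpgtq) and "limb is all ones" (vpcmpeqq)
# masks are combined with add/adc chains, and the final xor leaves exactly
# the lanes that receive an incoming carry. Each 4-bit group then indexes
# .Lkmasklut for a 256-bit blend mask, and the flagged lanes are adjusted
# by subtracting 2^52-1 (a +1 modulo 2^52 once the limbs are re-masked).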
	shlb	$4,%r13b
	orb	%r13b,%r14b
	shlb	$4,%r11b
	orb	%r11b,%r12b

	addb	%r14b,%r14b
	adcb	%r12b,%r12b
	adcb	%r10b,%r10b

	shlb	$4,%r8b
	orb	%r8b,%r9b
	shlb	$4,%cl
	orb	%cl,%bl

	addb	%r9b,%r14b
	adcb	%bl,%r12b
	adcb	%dl,%r10b

	xorb	%r9b,%r14b
	xorb	%bl,%r12b
	xorb	%dl,%r10b

	leaq	.Lkmasklut(%rip),%rdx

	movb	%r14b,%r13b
	andq	$0xf,%r14
	vpsubq	.Lmask52x4(%rip),%ymm3,%ymm0
	shlq	$5,%r14
	vmovapd	(%rdx,%r14,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm3,%ymm3

	shrb	$4,%r13b
	andq	$0xf,%r13
	vpsubq	.Lmask52x4(%rip),%ymm5,%ymm0
	shlq	$5,%r13
	vmovapd	(%rdx,%r13,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm5,%ymm5

	movb	%r12b,%r11b
	andq	$0xf,%r12
	vpsubq	.Lmask52x4(%rip),%ymm6,%ymm0
	shlq	$5,%r12
	vmovapd	(%rdx,%r12,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm6,%ymm6

	shrb	$4,%r11b
	andq	$0xf,%r11
	vpsubq	.Lmask52x4(%rip),%ymm7,%ymm0
	shlq	$5,%r11
	vmovapd	(%rdx,%r11,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm7,%ymm7

	andq	$0xf,%r10
	vpsubq	.Lmask52x4(%rip),%ymm8,%ymm0
	shlq	$5,%r10
	vmovapd	(%rdx,%r10,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm8,%ymm8

	vpand	.Lmask52x4(%rip),%ymm3,%ymm3
	vpand	.Lmask52x4(%rip),%ymm5,%ymm5
	vpand	.Lmask52x4(%rip),%ymm6,%ymm6
	vpand	.Lmask52x4(%rip),%ymm7,%ymm7
	vpand	.Lmask52x4(%rip),%ymm8,%ymm8

	vmovdqu	%ymm3,0(%rdi)
	vmovdqu	%ymm5,32(%rdi)
	vmovdqu	%ymm6,64(%rdi)
	vmovdqu	%ymm7,96(%rdi)
	vmovdqu	%ymm8,128(%rdi)

	vzeroupper
	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lossl_rsaz_amm52x20_x1_avxifma256_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ossl_rsaz_amm52x20_x1_avxifma256, .-ossl_rsaz_amm52x20_x1_avxifma256
.section .rodata
.align	32
# 4 x (2^52 - 1)
.Lmask52x4:
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
# Clears lane 0 (no carry into the lowest limb).
.Lhigh64x3:
.quad	0x0
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
# 16 256-bit blend masks, indexed by a 4-bit lane mask.
.Lkmasklut:

.quad	0x0
.quad	0x0
.quad	0x0
.quad	0x0

.quad	0xffffffffffffffff
.quad	0x0
.quad	0x0
.quad	0x0

.quad	0x0
.quad	0xffffffffffffffff
.quad	0x0
.quad	0x0

.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0x0
.quad	0x0

.quad	0x0
.quad	0x0
.quad	0xffffffffffffffff
.quad	0x0

.quad	0xffffffffffffffff
.quad	0x0
.quad	0xffffffffffffffff
.quad	0x0

.quad	0x0
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0x0

.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0x0

.quad	0x0
.quad	0x0
.quad	0x0
.quad	0xffffffffffffffff

.quad	0xffffffffffffffff
.quad	0x0
.quad	0x0
.quad	0xffffffffffffffff

.quad	0x0
.quad	0xffffffffffffffff
.quad	0x0
.quad	0xffffffffffffffff

.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0x0
.quad	0xffffffffffffffff

.quad	0x0
.quad	0x0
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff

.quad	0xffffffffffffffff
.quad	0x0
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff

.quad	0x0
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff

.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.text
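# Two interleaved almost-Montgomery multiplications (cf. rsaz-2k-avxifma.pl):
# rdi = res, rsi = a, rdx = b, rcx = m, each holding a pair of 20x52-bit
# vectors with the second at byte offset 160; r8 points to the pair of k0
# constants.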
.globl	ossl_rsaz_amm52x20_x2_avxifma256
.type	ossl_rsaz_amm52x20_x2_avxifma256,@function
.align	32
ossl_rsaz_amm52x20_x2_avxifma256:
.cfi_startproc
.byte	243,15,30,250
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lossl_rsaz_amm52x20_x2_avxifma256_body:

	vpxor	%ymm0,%ymm0,%ymm0
	vmovapd	%ymm0,%ymm3
	vmovapd	%ymm0,%ymm5
	vmovapd	%ymm0,%ymm6
	vmovapd	%ymm0,%ymm7
	vmovapd	%ymm0,%ymm8
	vmovapd	%ymm0,%ymm4
	vmovapd	%ymm0,%ymm9
	vmovapd	%ymm0,%ymm10
	vmovapd	%ymm0,%ymm11
	vmovapd	%ymm0,%ymm12

	xorl	%r9d,%r9d
	xorl	%r15d,%r15d

	movq	%rdx,%r11
	movq	$0xfffffffffffff,%rax

	movl	$20,%ebx

.align	32
.Lloop20:
	movq	0(%r11),%r13

	vpbroadcastq	0(%r11),%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	(%r8),%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vmovq	%r13,%xmm2
	vpbroadcastq	%xmm2,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	leaq	-168(%rsp),%rsp
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm8

{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm8

	vmovdqu	%ymm3,0(%rsp)
	vmovdqu	%ymm5,32(%rsp)
	vmovdqu	%ymm6,64(%rsp)
	vmovdqu	%ymm7,96(%rsp)
	vmovdqu	%ymm8,128(%rsp)
	movq	$0,160(%rsp)

	vmovdqu	8(%rsp),%ymm3
	vmovdqu	40(%rsp),%ymm5
	vmovdqu	72(%rsp),%ymm6
	vmovdqu	104(%rsp),%ymm7
	vmovdqu	136(%rsp),%ymm8

	addq	8(%rsp),%r9

{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm8

{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm8
	leaq	168(%rsp),%rsp
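# The same digit step for the second multiplication: operands at byte
# offset 160, k0 taken from 8(%r8), accumulators ymm4 and ymm9-ymm12,
# scalar carry in r15.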
	movq	160(%r11),%r13

	vpbroadcastq	160(%r11),%ymm1
	movq	160(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r15
	movq	%r12,%r10
	adcq	$0,%r10

	movq	8(%r8),%r13
	imulq	%r15,%r13
	andq	%rax,%r13

	vmovq	%r13,%xmm2
	vpbroadcastq	%xmm2,%ymm2
	movq	160(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r15
	adcq	%r12,%r10

	shrq	$52,%r15
	salq	$12,%r10
	orq	%r10,%r15

	leaq	-168(%rsp),%rsp
{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm4
{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
{vex} vpmadd52luq 256(%rsi),%ymm1,%ymm11
{vex} vpmadd52luq 288(%rsi),%ymm1,%ymm12

{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm4
{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
{vex} vpmadd52luq 256(%rcx),%ymm2,%ymm11
{vex} vpmadd52luq 288(%rcx),%ymm2,%ymm12

	vmovdqu	%ymm4,0(%rsp)
	vmovdqu	%ymm9,32(%rsp)
	vmovdqu	%ymm10,64(%rsp)
	vmovdqu	%ymm11,96(%rsp)
	vmovdqu	%ymm12,128(%rsp)
	movq	$0,160(%rsp)

	vmovdqu	8(%rsp),%ymm4
	vmovdqu	40(%rsp),%ymm9
	vmovdqu	72(%rsp),%ymm10
	vmovdqu	104(%rsp),%ymm11
	vmovdqu	136(%rsp),%ymm12

	addq	8(%rsp),%r15

{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm4
{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
{vex} vpmadd52huq 256(%rsi),%ymm1,%ymm11
{vex} vpmadd52huq 288(%rsi),%ymm1,%ymm12

{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm4
{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
{vex} vpmadd52huq 256(%rcx),%ymm2,%ymm11
{vex} vpmadd52huq 288(%rcx),%ymm2,%ymm12
	leaq	168(%rsp),%rsp
	leaq	8(%r11),%r11
	decl	%ebx
	jne	.Lloop20
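# From here the normalization and carry-fix sequence of the x1 routine
# runs twice: first on ymm3/ymm5-ymm8 with carry r9, then on
# ymm4/ymm9-ymm12 with carry r15, before both results are stored to rdi.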
	vmovq	%r9,%xmm0
	vpbroadcastq	%xmm0,%ymm0
	vpblendd	$3,%ymm0,%ymm3,%ymm3

	vpsrlq	$52,%ymm3,%ymm0
	vpsrlq	$52,%ymm5,%ymm1
	vpsrlq	$52,%ymm6,%ymm2
	vpsrlq	$52,%ymm7,%ymm13
	vpsrlq	$52,%ymm8,%ymm14

	vpermq	$144,%ymm14,%ymm14
	vpermq	$3,%ymm13,%ymm15
	vblendpd	$1,%ymm15,%ymm14,%ymm14

	vpermq	$144,%ymm13,%ymm13
	vpermq	$3,%ymm2,%ymm15
	vblendpd	$1,%ymm15,%ymm13,%ymm13

	vpermq	$144,%ymm2,%ymm2
	vpermq	$3,%ymm1,%ymm15
	vblendpd	$1,%ymm15,%ymm2,%ymm2

	vpermq	$144,%ymm1,%ymm1
	vpermq	$3,%ymm0,%ymm15
	vblendpd	$1,%ymm15,%ymm1,%ymm1

	vpermq	$144,%ymm0,%ymm0
	vpand	.Lhigh64x3(%rip),%ymm0,%ymm0

	vpand	.Lmask52x4(%rip),%ymm3,%ymm3
	vpand	.Lmask52x4(%rip),%ymm5,%ymm5
	vpand	.Lmask52x4(%rip),%ymm6,%ymm6
	vpand	.Lmask52x4(%rip),%ymm7,%ymm7
	vpand	.Lmask52x4(%rip),%ymm8,%ymm8

	vpaddq	%ymm0,%ymm3,%ymm3
	vpaddq	%ymm1,%ymm5,%ymm5
	vpaddq	%ymm2,%ymm6,%ymm6
	vpaddq	%ymm13,%ymm7,%ymm7
	vpaddq	%ymm14,%ymm8,%ymm8

	vpcmpgtq	.Lmask52x4(%rip),%ymm3,%ymm0
	vpcmpgtq	.Lmask52x4(%rip),%ymm5,%ymm1
	vpcmpgtq	.Lmask52x4(%rip),%ymm6,%ymm2
	vpcmpgtq	.Lmask52x4(%rip),%ymm7,%ymm13
	vpcmpgtq	.Lmask52x4(%rip),%ymm8,%ymm14
	vmovmskpd	%ymm0,%r14d
	vmovmskpd	%ymm1,%r13d
	vmovmskpd	%ymm2,%r12d
	vmovmskpd	%ymm13,%r11d
	vmovmskpd	%ymm14,%r10d

	vpcmpeqq	.Lmask52x4(%rip),%ymm3,%ymm0
	vpcmpeqq	.Lmask52x4(%rip),%ymm5,%ymm1
	vpcmpeqq	.Lmask52x4(%rip),%ymm6,%ymm2
	vpcmpeqq	.Lmask52x4(%rip),%ymm7,%ymm13
	vpcmpeqq	.Lmask52x4(%rip),%ymm8,%ymm14
	vmovmskpd	%ymm0,%r9d
	vmovmskpd	%ymm1,%r8d
	vmovmskpd	%ymm2,%ebx
	vmovmskpd	%ymm13,%ecx
	vmovmskpd	%ymm14,%edx

	shlb	$4,%r13b
	orb	%r13b,%r14b
	shlb	$4,%r11b
	orb	%r11b,%r12b

	addb	%r14b,%r14b
	adcb	%r12b,%r12b
	adcb	%r10b,%r10b

	shlb	$4,%r8b
	orb	%r8b,%r9b
	shlb	$4,%cl
	orb	%cl,%bl

	addb	%r9b,%r14b
	adcb	%bl,%r12b
	adcb	%dl,%r10b

	xorb	%r9b,%r14b
	xorb	%bl,%r12b
	xorb	%dl,%r10b

	leaq	.Lkmasklut(%rip),%rdx

	movb	%r14b,%r13b
	andq	$0xf,%r14
	vpsubq	.Lmask52x4(%rip),%ymm3,%ymm0
	shlq	$5,%r14
	vmovapd	(%rdx,%r14,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm3,%ymm3

	shrb	$4,%r13b
	andq	$0xf,%r13
	vpsubq	.Lmask52x4(%rip),%ymm5,%ymm0
	shlq	$5,%r13
	vmovapd	(%rdx,%r13,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm5,%ymm5

	movb	%r12b,%r11b
	andq	$0xf,%r12
	vpsubq	.Lmask52x4(%rip),%ymm6,%ymm0
	shlq	$5,%r12
	vmovapd	(%rdx,%r12,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm6,%ymm6

	shrb	$4,%r11b
	andq	$0xf,%r11
	vpsubq	.Lmask52x4(%rip),%ymm7,%ymm0
	shlq	$5,%r11
	vmovapd	(%rdx,%r11,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm7,%ymm7

	andq	$0xf,%r10
	vpsubq	.Lmask52x4(%rip),%ymm8,%ymm0
	shlq	$5,%r10
	vmovapd	(%rdx,%r10,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm8,%ymm8

	vpand	.Lmask52x4(%rip),%ymm3,%ymm3
	vpand	.Lmask52x4(%rip),%ymm5,%ymm5
	vpand	.Lmask52x4(%rip),%ymm6,%ymm6
	vpand	.Lmask52x4(%rip),%ymm7,%ymm7
	vpand	.Lmask52x4(%rip),%ymm8,%ymm8

	vmovq	%r15,%xmm0
	vpbroadcastq	%xmm0,%ymm0
	vpblendd	$3,%ymm0,%ymm4,%ymm4

	vpsrlq	$52,%ymm4,%ymm0
	vpsrlq	$52,%ymm9,%ymm1
	vpsrlq	$52,%ymm10,%ymm2
	vpsrlq	$52,%ymm11,%ymm13
	vpsrlq	$52,%ymm12,%ymm14

	vpermq	$144,%ymm14,%ymm14
	vpermq	$3,%ymm13,%ymm15
	vblendpd	$1,%ymm15,%ymm14,%ymm14

	vpermq	$144,%ymm13,%ymm13
	vpermq	$3,%ymm2,%ymm15
	vblendpd	$1,%ymm15,%ymm13,%ymm13

	vpermq	$144,%ymm2,%ymm2
	vpermq	$3,%ymm1,%ymm15
	vblendpd	$1,%ymm15,%ymm2,%ymm2

	vpermq	$144,%ymm1,%ymm1
	vpermq	$3,%ymm0,%ymm15
	vblendpd	$1,%ymm15,%ymm1,%ymm1

	vpermq	$144,%ymm0,%ymm0
	vpand	.Lhigh64x3(%rip),%ymm0,%ymm0

	vpand	.Lmask52x4(%rip),%ymm4,%ymm4
	vpand	.Lmask52x4(%rip),%ymm9,%ymm9
	vpand	.Lmask52x4(%rip),%ymm10,%ymm10
	vpand	.Lmask52x4(%rip),%ymm11,%ymm11
	vpand	.Lmask52x4(%rip),%ymm12,%ymm12

	vpaddq	%ymm0,%ymm4,%ymm4
	vpaddq	%ymm1,%ymm9,%ymm9
	vpaddq	%ymm2,%ymm10,%ymm10
	vpaddq	%ymm13,%ymm11,%ymm11
	vpaddq	%ymm14,%ymm12,%ymm12

	vpcmpgtq	.Lmask52x4(%rip),%ymm4,%ymm0
	vpcmpgtq	.Lmask52x4(%rip),%ymm9,%ymm1
	vpcmpgtq	.Lmask52x4(%rip),%ymm10,%ymm2
	vpcmpgtq	.Lmask52x4(%rip),%ymm11,%ymm13
	vpcmpgtq	.Lmask52x4(%rip),%ymm12,%ymm14
	vmovmskpd	%ymm0,%r14d
	vmovmskpd	%ymm1,%r13d
	vmovmskpd	%ymm2,%r12d
	vmovmskpd	%ymm13,%r11d
	vmovmskpd	%ymm14,%r10d

	vpcmpeqq	.Lmask52x4(%rip),%ymm4,%ymm0
	vpcmpeqq	.Lmask52x4(%rip),%ymm9,%ymm1
	vpcmpeqq	.Lmask52x4(%rip),%ymm10,%ymm2
	vpcmpeqq	.Lmask52x4(%rip),%ymm11,%ymm13
	vpcmpeqq	.Lmask52x4(%rip),%ymm12,%ymm14
	vmovmskpd	%ymm0,%r9d
	vmovmskpd	%ymm1,%r8d
	vmovmskpd	%ymm2,%ebx
	vmovmskpd	%ymm13,%ecx
	vmovmskpd	%ymm14,%edx

	shlb	$4,%r13b
	orb	%r13b,%r14b
	shlb	$4,%r11b
	orb	%r11b,%r12b

	addb	%r14b,%r14b
	adcb	%r12b,%r12b
	adcb	%r10b,%r10b

	shlb	$4,%r8b
	orb	%r8b,%r9b
	shlb	$4,%cl
	orb	%cl,%bl

	addb	%r9b,%r14b
	adcb	%bl,%r12b
	adcb	%dl,%r10b

	xorb	%r9b,%r14b
	xorb	%bl,%r12b
	xorb	%dl,%r10b

	leaq	.Lkmasklut(%rip),%rdx

	movb	%r14b,%r13b
	andq	$0xf,%r14
	vpsubq	.Lmask52x4(%rip),%ymm4,%ymm0
	shlq	$5,%r14
	vmovapd	(%rdx,%r14,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm4,%ymm4

	shrb	$4,%r13b
	andq	$0xf,%r13
	vpsubq	.Lmask52x4(%rip),%ymm9,%ymm0
	shlq	$5,%r13
	vmovapd	(%rdx,%r13,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm9,%ymm9

	movb	%r12b,%r11b
	andq	$0xf,%r12
	vpsubq	.Lmask52x4(%rip),%ymm10,%ymm0
	shlq	$5,%r12
	vmovapd	(%rdx,%r12,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm10,%ymm10

	shrb	$4,%r11b
	andq	$0xf,%r11
	vpsubq	.Lmask52x4(%rip),%ymm11,%ymm0
	shlq	$5,%r11
	vmovapd	(%rdx,%r11,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm11,%ymm11

	andq	$0xf,%r10
	vpsubq	.Lmask52x4(%rip),%ymm12,%ymm0
	shlq	$5,%r10
	vmovapd	(%rdx,%r10,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm12,%ymm12

	vpand	.Lmask52x4(%rip),%ymm4,%ymm4
	vpand	.Lmask52x4(%rip),%ymm9,%ymm9
	vpand	.Lmask52x4(%rip),%ymm10,%ymm10
	vpand	.Lmask52x4(%rip),%ymm11,%ymm11
	vpand	.Lmask52x4(%rip),%ymm12,%ymm12

	vmovdqu	%ymm3,0(%rdi)
	vmovdqu	%ymm5,32(%rdi)
	vmovdqu	%ymm6,64(%rdi)
	vmovdqu	%ymm7,96(%rdi)
	vmovdqu	%ymm8,128(%rdi)

	vmovdqu	%ymm4,160(%rdi)
	vmovdqu	%ymm9,192(%rdi)
	vmovdqu	%ymm10,224(%rdi)
	vmovdqu	%ymm11,256(%rdi)
	vmovdqu	%ymm12,288(%rdi)

	vzeroupper
	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lossl_rsaz_amm52x20_x2_avxifma256_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ossl_rsaz_amm52x20_x2_avxifma256, .-ossl_rsaz_amm52x20_x2_avxifma256
.text
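# Constant-time extraction for a window size of 5: walk all 32 entries of
# the 2x20-limb table at rsi (32 x 320 = 10240 bytes) and blend in the
# entries whose indices equal rdx (first half) and rcx (second half);
# the result goes to rdi.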
.align	32
.globl	ossl_extract_multiplier_2x20_win5_avx
.type	ossl_extract_multiplier_2x20_win5_avx,@function
ossl_extract_multiplier_2x20_win5_avx:
.cfi_startproc
.byte	243,15,30,250
	vmovapd	.Lones(%rip),%ymm14
	vmovq	%rdx,%xmm10
	vpbroadcastq	%xmm10,%ymm12
	vmovq	%rcx,%xmm10
	vpbroadcastq	%xmm10,%ymm13
	leaq	10240(%rsi),%rax

	vpxor	%xmm0,%xmm0,%xmm0
	vmovapd	%ymm0,%ymm11
	vmovapd	%ymm0,%ymm1
	vmovapd	%ymm0,%ymm2
	vmovapd	%ymm0,%ymm3
	vmovapd	%ymm0,%ymm4
	vmovapd	%ymm0,%ymm5
	vmovapd	%ymm0,%ymm6
	vmovapd	%ymm0,%ymm7
	vmovapd	%ymm0,%ymm8
	vmovapd	%ymm0,%ymm9

.align	32
.Lloop:
	vpcmpeqq	%ymm11,%ymm12,%ymm15
	vmovdqu	0(%rsi),%ymm10
	vblendvpd	%ymm15,%ymm10,%ymm0,%ymm0
	vmovdqu	32(%rsi),%ymm10
	vblendvpd	%ymm15,%ymm10,%ymm1,%ymm1
	vmovdqu	64(%rsi),%ymm10
	vblendvpd	%ymm15,%ymm10,%ymm2,%ymm2
	vmovdqu	96(%rsi),%ymm10
	vblendvpd	%ymm15,%ymm10,%ymm3,%ymm3
	vmovdqu	128(%rsi),%ymm10
	vblendvpd	%ymm15,%ymm10,%ymm4,%ymm4
	vpcmpeqq	%ymm11,%ymm13,%ymm15
	vmovdqu	160(%rsi),%ymm10
	vblendvpd	%ymm15,%ymm10,%ymm5,%ymm5
	vmovdqu	192(%rsi),%ymm10
	vblendvpd	%ymm15,%ymm10,%ymm6,%ymm6
	vmovdqu	224(%rsi),%ymm10
	vblendvpd	%ymm15,%ymm10,%ymm7,%ymm7
	vmovdqu	256(%rsi),%ymm10
	vblendvpd	%ymm15,%ymm10,%ymm8,%ymm8
	vmovdqu	288(%rsi),%ymm10
	vblendvpd	%ymm15,%ymm10,%ymm9,%ymm9
	vpaddq	%ymm14,%ymm11,%ymm11
	addq	$320,%rsi
	cmpq	%rsi,%rax
	jne	.Lloop
	vmovdqu	%ymm0,0(%rdi)
	vmovdqu	%ymm1,32(%rdi)
	vmovdqu	%ymm2,64(%rdi)
	vmovdqu	%ymm3,96(%rdi)
	vmovdqu	%ymm4,128(%rdi)
	vmovdqu	%ymm5,160(%rdi)
	vmovdqu	%ymm6,192(%rdi)
	vmovdqu	%ymm7,224(%rdi)
	vmovdqu	%ymm8,256(%rdi)
	vmovdqu	%ymm9,288(%rdi)
	.byte	0xf3,0xc3
.cfi_endproc
.size	ossl_extract_multiplier_2x20_win5_avx, .-ossl_extract_multiplier_2x20_win5_avx
.section .rodata
.align	32
.Lones:
.quad	1,1,1,1
.Lzeros:
.quad	0,0,0,0
	.section ".note.gnu.property", "a"
	.p2align	3
	.long	1f - 0f
	.long	4f - 1f
	.long	5
0:
	# "GNU" encoded with .byte, since .asciz isn't supported
	# on Solaris.
	.byte	0x47
	.byte	0x4e
	.byte	0x55
	.byte	0
1:
	.p2align	3
	.long	0xc0000002
	.long	3f - 2f
2:
	.long	3
3:
	.p2align	3
4: