/* Do not modify. This file is auto-generated from rsaz-avx512.pl. */

# Syntax: GNU as, AT&T operand order (source first, destination last).
# Target: x86-64 ELF; argument registers follow the System V AMD64 order
#         (rdi, rsi, rdx, rcx, r8).
# ISA:    the routines below use EVEX-encoded 256-bit instructions,
#         vpmadd52luq/vpmadd52huq, mask registers with kmovb, and mulx.
# NOTE(review): because this file is generated, the two deliberate
# differences from the generator output (constant tables placed in
# .rodata, vzeroupper in ossl_extract_multiplier_2x20_win5) should be
# mirrored in rsaz-avx512.pl or they will be lost on regeneration.

#-----------------------------------------------------------------------
# ossl_rsaz_avx512ifma_eligible
# Out:   eax = 0x80230000 when bits 16, 17, 21 and 31 of the dword at
#              OPENSSL_ia32cap_P+8 are all set, otherwise 0.
# Clobb: ecx, flags.
# NOTE(review): that dword is presumably the cached CPUID.(EAX=7).EBX
# word (AVX512F, AVX512DQ, AVX512IFMA, AVX512VL) - confirm in the
# generator.
#-----------------------------------------------------------------------
.globl	ossl_rsaz_avx512ifma_eligible
.type	ossl_rsaz_avx512ifma_eligible,@function
.align	32
ossl_rsaz_avx512ifma_eligible:
	movl	OPENSSL_ia32cap_P+8(%rip),%ecx
	xorl	%eax,%eax			# default result: not eligible
	andl	$2149777408,%ecx		# keep bits 31|21|17|16 (0x80230000)
	cmpl	$2149777408,%ecx		# all four present?
	cmovel	%ecx,%eax			# yes: return the non-zero mask
	.byte	0xf3,0xc3			# rep ret
.size	ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible
.text

#-----------------------------------------------------------------------
# ossl_rsaz_amm52x20_x1_256
# In:    rdi = result buffer; 160 bytes (five 32-byte vectors) are stored
#        rsi = operand a, read as 160 bytes
#        rdx = operand b, read as 20 qwords (pointer is copied to r11)
#        rcx = operand m, read as 160 bytes
#        r8  = k0, passed by value
# Out:   (rdi)..159(rdi) hold the result, every qword masked to 52 bits
# Clobb: rax, rcx, rdx, r8-r11, ymm0, ymm1, ymm3, ymm4, ymm16-ymm19,
#        ymm24-ymm28, k1-k5, flags.  rbx, rbp and r12-r15 are saved on
#        the stack and restored before returning.
# Loop:  .Lloop5 runs 5 times and consumes 4 qwords of b per pass.
# NOTE(review): the name suggests an "almost Montgomery multiplication"
# over 20 limbs in radix 2^52 with m as modulus - confirm against the
# generator before relying on that description.
#-----------------------------------------------------------------------
.globl	ossl_rsaz_amm52x20_x1_256
.type	ossl_rsaz_amm52x20_x1_256,@function
.align	32
ossl_rsaz_amm52x20_x1_256:
.cfi_startproc
.byte	243,15,30,250				# endbr64
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lrsaz_amm52x20_x1_256_body:

	# Accumulator vectors ymm1, ymm16-ymm19 start at zero; ymm0 stays
	# zero and is shifted in at the top by valignq.
	vpxord	%ymm0,%ymm0,%ymm0
	vmovdqa64	%ymm0,%ymm1
	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm0,%ymm17
	vmovdqa64	%ymm0,%ymm18
	vmovdqa64	%ymm0,%ymm19

	xorl	%r9d,%r9d			# r9 = scalar low-limb accumulator

	movq	%rdx,%r11			# r11 = b cursor (rdx is needed by mulx)
	movq	$0xfffffffffffff,%rax		# rax = 2^52 - 1

	movl	$5,%ebx				# pass counter

.align	32
.Lloop5:
	# ---- step using the qword at 0(b cursor) ----
	movq	0(%r11),%r13

	vpbroadcastq	%r13,%ymm3		# ymm3 = current b qword in every lane
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12			# r12:r13 = a[0] * b qword
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10				# r10:r9 = accumulator + product

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13			# r13 = (k0 * acc) mod 2^52

	vpbroadcastq	%r13,%ymm4		# ymm4 = that factor in every lane
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12			# r12:r13 = m[0] * factor
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9				# drop the low 52 bits of r10:r9
	salq	$12,%r10
	orq	%r10,%r9

	# Low 52 bits of the lane products, accumulated into ymm1/ymm16-19.
	vpmadd52luq	0(%rsi),%ymm3,%ymm1
	vpmadd52luq	32(%rsi),%ymm3,%ymm16
	vpmadd52luq	64(%rsi),%ymm3,%ymm17
	vpmadd52luq	96(%rsi),%ymm3,%ymm18
	vpmadd52luq	128(%rsi),%ymm3,%ymm19

	vpmadd52luq	0(%rcx),%ymm4,%ymm1
	vpmadd52luq	32(%rcx),%ymm4,%ymm16
	vpmadd52luq	64(%rcx),%ymm4,%ymm17
	vpmadd52luq	96(%rcx),%ymm4,%ymm18
	vpmadd52luq	128(%rcx),%ymm4,%ymm19

	# Shift the 20-qword accumulator down by one qword; zero enters on top.
	valignq	$1,%ymm1,%ymm16,%ymm1
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm1,%r13			# fold the new lowest qword into r9
	addq	%r13,%r9

	# High 52 bits of the same lane products.
	vpmadd52huq	0(%rsi),%ymm3,%ymm1
	vpmadd52huq	32(%rsi),%ymm3,%ymm16
	vpmadd52huq	64(%rsi),%ymm3,%ymm17
	vpmadd52huq	96(%rsi),%ymm3,%ymm18
	vpmadd52huq	128(%rsi),%ymm3,%ymm19

	vpmadd52huq	0(%rcx),%ymm4,%ymm1
	vpmadd52huq	32(%rcx),%ymm4,%ymm16
	vpmadd52huq	64(%rcx),%ymm4,%ymm17
	vpmadd52huq	96(%rcx),%ymm4,%ymm18
	vpmadd52huq	128(%rcx),%ymm4,%ymm19

	# ---- step using the qword at 8(b cursor); same sequence ----
	movq	8(%r11),%r13

	vpbroadcastq	%r13,%ymm3
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm4
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm3,%ymm1
	vpmadd52luq	32(%rsi),%ymm3,%ymm16
	vpmadd52luq	64(%rsi),%ymm3,%ymm17
	vpmadd52luq	96(%rsi),%ymm3,%ymm18
	vpmadd52luq	128(%rsi),%ymm3,%ymm19

	vpmadd52luq	0(%rcx),%ymm4,%ymm1
	vpmadd52luq	32(%rcx),%ymm4,%ymm16
	vpmadd52luq	64(%rcx),%ymm4,%ymm17
	vpmadd52luq	96(%rcx),%ymm4,%ymm18
	vpmadd52luq	128(%rcx),%ymm4,%ymm19

	valignq	$1,%ymm1,%ymm16,%ymm1
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm1,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm3,%ymm1
	vpmadd52huq	32(%rsi),%ymm3,%ymm16
	vpmadd52huq	64(%rsi),%ymm3,%ymm17
	vpmadd52huq	96(%rsi),%ymm3,%ymm18
	vpmadd52huq	128(%rsi),%ymm3,%ymm19

	vpmadd52huq	0(%rcx),%ymm4,%ymm1
	vpmadd52huq	32(%rcx),%ymm4,%ymm16
	vpmadd52huq	64(%rcx),%ymm4,%ymm17
	vpmadd52huq	96(%rcx),%ymm4,%ymm18
	vpmadd52huq	128(%rcx),%ymm4,%ymm19

	# ---- step using the qword at 16(b cursor); same sequence ----
	movq	16(%r11),%r13

	vpbroadcastq	%r13,%ymm3
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm4
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm3,%ymm1
	vpmadd52luq	32(%rsi),%ymm3,%ymm16
	vpmadd52luq	64(%rsi),%ymm3,%ymm17
	vpmadd52luq	96(%rsi),%ymm3,%ymm18
	vpmadd52luq	128(%rsi),%ymm3,%ymm19

	vpmadd52luq	0(%rcx),%ymm4,%ymm1
	vpmadd52luq	32(%rcx),%ymm4,%ymm16
	vpmadd52luq	64(%rcx),%ymm4,%ymm17
	vpmadd52luq	96(%rcx),%ymm4,%ymm18
	vpmadd52luq	128(%rcx),%ymm4,%ymm19

	valignq	$1,%ymm1,%ymm16,%ymm1
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm1,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm3,%ymm1
	vpmadd52huq	32(%rsi),%ymm3,%ymm16
	vpmadd52huq	64(%rsi),%ymm3,%ymm17
	vpmadd52huq	96(%rsi),%ymm3,%ymm18
	vpmadd52huq	128(%rsi),%ymm3,%ymm19

	vpmadd52huq	0(%rcx),%ymm4,%ymm1
	vpmadd52huq	32(%rcx),%ymm4,%ymm16
	vpmadd52huq	64(%rcx),%ymm4,%ymm17
	vpmadd52huq	96(%rcx),%ymm4,%ymm18
	vpmadd52huq	128(%rcx),%ymm4,%ymm19

	# ---- step using the qword at 24(b cursor); same sequence ----
	movq	24(%r11),%r13

	vpbroadcastq	%r13,%ymm3
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm4
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm3,%ymm1
	vpmadd52luq	32(%rsi),%ymm3,%ymm16
	vpmadd52luq	64(%rsi),%ymm3,%ymm17
	vpmadd52luq	96(%rsi),%ymm3,%ymm18
	vpmadd52luq	128(%rsi),%ymm3,%ymm19

	vpmadd52luq	0(%rcx),%ymm4,%ymm1
	vpmadd52luq	32(%rcx),%ymm4,%ymm16
	vpmadd52luq	64(%rcx),%ymm4,%ymm17
	vpmadd52luq	96(%rcx),%ymm4,%ymm18
	vpmadd52luq	128(%rcx),%ymm4,%ymm19

	valignq	$1,%ymm1,%ymm16,%ymm1
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm1,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm3,%ymm1
	vpmadd52huq	32(%rsi),%ymm3,%ymm16
	vpmadd52huq	64(%rsi),%ymm3,%ymm17
	vpmadd52huq	96(%rsi),%ymm3,%ymm18
	vpmadd52huq	128(%rsi),%ymm3,%ymm19

	vpmadd52huq	0(%rcx),%ymm4,%ymm1
	vpmadd52huq	32(%rcx),%ymm4,%ymm16
	vpmadd52huq	64(%rcx),%ymm4,%ymm17
	vpmadd52huq	96(%rcx),%ymm4,%ymm18
	vpmadd52huq	128(%rcx),%ymm4,%ymm19
	leaq	32(%r11),%r11			# advance b cursor by 4 qwords
	decl	%ebx
	jne	.Lloop5

	# ---- normalization: bring every qword back to 52 bits ----
	vmovdqa64	.Lmask52x4(%rip),%ymm4	# ymm4 = 2^52 - 1 in every lane

	vpbroadcastq	%r9,%ymm3
	vpblendd	$3,%ymm3,%ymm1,%ymm1	# lowest qword <- scalar accumulator r9

	# Per-lane carries = bits 52 and above.
	vpsrlq	$52,%ymm1,%ymm24
	vpsrlq	$52,%ymm16,%ymm25
	vpsrlq	$52,%ymm17,%ymm26
	vpsrlq	$52,%ymm18,%ymm27
	vpsrlq	$52,%ymm19,%ymm28

	# Move each carry one qword up (across vector boundaries); zero
	# enters the lowest lane.
	valignq	$3,%ymm27,%ymm28,%ymm28
	valignq	$3,%ymm26,%ymm27,%ymm27
	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm24,%ymm25,%ymm25
	valignq	$3,%ymm0,%ymm24,%ymm24

	# Keep the low 52 bits of every lane.
	vpandq	%ymm4,%ymm1,%ymm1
	vpandq	%ymm4,%ymm16,%ymm16
	vpandq	%ymm4,%ymm17,%ymm17
	vpandq	%ymm4,%ymm18,%ymm18
	vpandq	%ymm4,%ymm19,%ymm19

	# Add the carries from the lane below.
	vpaddq	%ymm24,%ymm1,%ymm1
	vpaddq	%ymm25,%ymm16,%ymm16
	vpaddq	%ymm26,%ymm17,%ymm17
	vpaddq	%ymm27,%ymm18,%ymm18
	vpaddq	%ymm28,%ymm19,%ymm19

	# Predicate 1 (unsigned less-than): lanes where mask < value, i.e.
	# the lane overflowed 52 bits and generates a carry.
	vpcmpuq	$1,%ymm1,%ymm4,%k1
	vpcmpuq	$1,%ymm16,%ymm4,%k2
	vpcmpuq	$1,%ymm17,%ymm4,%k3
	vpcmpuq	$1,%ymm18,%ymm4,%k4
	vpcmpuq	$1,%ymm19,%ymm4,%k5
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	kmovb	%k3,%r12d
	kmovb	%k4,%r11d
	kmovb	%k5,%r10d

	# Predicate 0 (equal): lanes equal to the mask, i.e. lanes that
	# would pass an incoming carry on.
	vpcmpuq	$0,%ymm1,%ymm4,%k1
	vpcmpuq	$0,%ymm16,%ymm4,%k2
	vpcmpuq	$0,%ymm17,%ymm4,%k3
	vpcmpuq	$0,%ymm18,%ymm4,%k4
	vpcmpuq	$0,%ymm19,%ymm4,%k5
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	kmovb	%k3,%ebx
	kmovb	%k4,%ecx
	kmovb	%k5,%edx

	# Pack the 4-bit lane masks into bytes (two vectors per byte) and
	# resolve carry propagation with a byte-wide add/adc chain.
	shlb	$4,%r13b
	orb	%r13b,%r14b
	shlb	$4,%r11b
	orb	%r11b,%r12b

	addb	%r14b,%r14b			# carries move one lane up
	adcb	%r12b,%r12b
	adcb	%r10b,%r10b

	shlb	$4,%r8b
	orb	%r8b,%r9b
	shlb	$4,%cl
	orb	%cl,%bl

	addb	%r9b,%r14b			# ripple through the equal-to-mask lanes
	adcb	%bl,%r12b
	adcb	%dl,%r10b

	xorb	%r9b,%r14b			# bits now mark lanes receiving a carry
	xorb	%bl,%r12b
	xorb	%dl,%r10b

	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r12d,%k3
	shrb	$4,%r12b
	kmovb	%r12d,%k4
	kmovb	%r10d,%k5

	# In the marked lanes subtract 2^52 - 1; after the final 52-bit mask
	# this equals adding 1 modulo 2^52.
	vpsubq	%ymm4,%ymm1,%ymm1{%k1}
	vpsubq	%ymm4,%ymm16,%ymm16{%k2}
	vpsubq	%ymm4,%ymm17,%ymm17{%k3}
	vpsubq	%ymm4,%ymm18,%ymm18{%k4}
	vpsubq	%ymm4,%ymm19,%ymm19{%k5}

	vpandq	%ymm4,%ymm1,%ymm1
	vpandq	%ymm4,%ymm16,%ymm16
	vpandq	%ymm4,%ymm17,%ymm17
	vpandq	%ymm4,%ymm18,%ymm18
	vpandq	%ymm4,%ymm19,%ymm19

	vmovdqu64	%ymm1,(%rdi)
	vmovdqu64	%ymm16,32(%rdi)
	vmovdqu64	%ymm17,64(%rdi)
	vmovdqu64	%ymm18,96(%rdi)
	vmovdqu64	%ymm19,128(%rdi)

	vzeroupper
	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lrsaz_amm52x20_x1_256_epilogue:
	.byte	0xf3,0xc3			# rep ret
.cfi_endproc
.size	ossl_rsaz_amm52x20_x1_256, .-ossl_rsaz_amm52x20_x1_256

# Constant table: 2^52 - 1 in each of four qword lanes.  It is only ever
# loaded, so it is placed in read-only data instead of writable .data.
.section	.rodata
.align	32
.Lmask52x4:
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.text

#-----------------------------------------------------------------------
# ossl_rsaz_amm52x20_x2_256
# Two independent instances of the x1 computation, interleaved.  The
# second instance uses the data 160 bytes after the first in every
# buffer.
# In:    rdi = result buffer; 320 bytes are stored
#        rsi = operand a; 0..159 (first) and 160..319 (second) are read
#        rdx = operand b; 20 qwords at offset 0 and 20 at offset 160
#        rcx = operand m; 0..159 (first) and 160..319 (second) are read
#        r8  = pointer to two k0 qwords: 0(r8) first, 8(r8) second
# Out:   0..159(rdi) first result, 160..319(rdi) second result, every
#        qword masked to 52 bits
# Clobb: rax, rcx, rdx, r8-r11, ymm0-ymm4, ymm16-ymm28, k1-k5, flags.
#        rbx, rbp and r12-r15 are saved on the stack and restored.
# Loop:  .Lloop20 runs 20 times and consumes one qword of b per instance
#        per pass.
# Accumulators: first instance  ymm1, ymm16-ymm19, scalar r9;
#               second instance ymm2, ymm20-ymm23, scalar r15.
#-----------------------------------------------------------------------
.globl	ossl_rsaz_amm52x20_x2_256
.type	ossl_rsaz_amm52x20_x2_256,@function
.align	32
ossl_rsaz_amm52x20_x2_256:
.cfi_startproc
.byte	243,15,30,250				# endbr64
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lrsaz_amm52x20_x2_256_body:

	# Zero both accumulator sets; ymm0 stays zero.
	vpxord	%ymm0,%ymm0,%ymm0
	vmovdqa64	%ymm0,%ymm1
	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm0,%ymm17
	vmovdqa64	%ymm0,%ymm18
	vmovdqa64	%ymm0,%ymm19
	vmovdqa64	%ymm0,%ymm2
	vmovdqa64	%ymm0,%ymm20
	vmovdqa64	%ymm0,%ymm21
	vmovdqa64	%ymm0,%ymm22
	vmovdqa64	%ymm0,%ymm23

	xorl	%r9d,%r9d			# scalar accumulator, first instance
	xorl	%r15d,%r15d			# scalar accumulator, second instance

	movq	%rdx,%r11			# r11 = b cursor (rdx is needed by mulx)
	movq	$0xfffffffffffff,%rax		# rax = 2^52 - 1

	movl	$20,%ebx			# pass counter

.align	32
.Lloop20:
	# ---- first instance: qword at 0(b cursor) ----
	movq	0(%r11),%r13

	vpbroadcastq	%r13,%ymm3
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12			# r12:r13 = a[0] * b qword
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	(%r8),%r13			# first k0 is loaded through the pointer
	imulq	%r9,%r13
	andq	%rax,%r13			# r13 = (k0 * acc) mod 2^52

	vpbroadcastq	%r13,%ymm4
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12			# r12:r13 = m[0] * factor
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9				# drop the low 52 bits of r10:r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm3,%ymm1
	vpmadd52luq	32(%rsi),%ymm3,%ymm16
	vpmadd52luq	64(%rsi),%ymm3,%ymm17
	vpmadd52luq	96(%rsi),%ymm3,%ymm18
	vpmadd52luq	128(%rsi),%ymm3,%ymm19

	vpmadd52luq	0(%rcx),%ymm4,%ymm1
	vpmadd52luq	32(%rcx),%ymm4,%ymm16
	vpmadd52luq	64(%rcx),%ymm4,%ymm17
	vpmadd52luq	96(%rcx),%ymm4,%ymm18
	vpmadd52luq	128(%rcx),%ymm4,%ymm19

	# Shift the accumulator down by one qword; zero enters on top.
	valignq	$1,%ymm1,%ymm16,%ymm1
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm1,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm3,%ymm1
	vpmadd52huq	32(%rsi),%ymm3,%ymm16
	vpmadd52huq	64(%rsi),%ymm3,%ymm17
	vpmadd52huq	96(%rsi),%ymm3,%ymm18
	vpmadd52huq	128(%rsi),%ymm3,%ymm19

	vpmadd52huq	0(%rcx),%ymm4,%ymm1
	vpmadd52huq	32(%rcx),%ymm4,%ymm16
	vpmadd52huq	64(%rcx),%ymm4,%ymm17
	vpmadd52huq	96(%rcx),%ymm4,%ymm18
	vpmadd52huq	128(%rcx),%ymm4,%ymm19

	# ---- second instance: qword at 160(b cursor) ----
	movq	160(%r11),%r13

	vpbroadcastq	%r13,%ymm3
	movq	160(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r15
	movq	%r12,%r10
	adcq	$0,%r10

	movq	8(%r8),%r13			# second k0
	imulq	%r15,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm4
	movq	160(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r15
	adcq	%r12,%r10

	shrq	$52,%r15
	salq	$12,%r10
	orq	%r10,%r15

	vpmadd52luq	160(%rsi),%ymm3,%ymm2
	vpmadd52luq	192(%rsi),%ymm3,%ymm20
	vpmadd52luq	224(%rsi),%ymm3,%ymm21
	vpmadd52luq	256(%rsi),%ymm3,%ymm22
	vpmadd52luq	288(%rsi),%ymm3,%ymm23

	vpmadd52luq	160(%rcx),%ymm4,%ymm2
	vpmadd52luq	192(%rcx),%ymm4,%ymm20
	vpmadd52luq	224(%rcx),%ymm4,%ymm21
	vpmadd52luq	256(%rcx),%ymm4,%ymm22
	vpmadd52luq	288(%rcx),%ymm4,%ymm23

	valignq	$1,%ymm2,%ymm20,%ymm2
	valignq	$1,%ymm20,%ymm21,%ymm20
	valignq	$1,%ymm21,%ymm22,%ymm21
	valignq	$1,%ymm22,%ymm23,%ymm22
	valignq	$1,%ymm23,%ymm0,%ymm23

	vmovq	%xmm2,%r13
	addq	%r13,%r15

	vpmadd52huq	160(%rsi),%ymm3,%ymm2
	vpmadd52huq	192(%rsi),%ymm3,%ymm20
	vpmadd52huq	224(%rsi),%ymm3,%ymm21
	vpmadd52huq	256(%rsi),%ymm3,%ymm22
	vpmadd52huq	288(%rsi),%ymm3,%ymm23

	vpmadd52huq	160(%rcx),%ymm4,%ymm2
	vpmadd52huq	192(%rcx),%ymm4,%ymm20
	vpmadd52huq	224(%rcx),%ymm4,%ymm21
	vpmadd52huq	256(%rcx),%ymm4,%ymm22
	vpmadd52huq	288(%rcx),%ymm4,%ymm23
	leaq	8(%r11),%r11			# advance b cursor by one qword
	decl	%ebx
	jne	.Lloop20

	# ---- normalization, first instance (ymm1, ymm16-ymm19) ----
	# r8, rcx, rdx, rbx and r9-r14 are reused as scratch from here on;
	# r15 still holds the second instance's scalar accumulator.
	vmovdqa64	.Lmask52x4(%rip),%ymm4	# ymm4 = 2^52 - 1 in every lane

	vpbroadcastq	%r9,%ymm3
	vpblendd	$3,%ymm3,%ymm1,%ymm1	# lowest qword <- scalar accumulator r9

	# Per-lane carries = bits 52 and above.
	vpsrlq	$52,%ymm1,%ymm24
	vpsrlq	$52,%ymm16,%ymm25
	vpsrlq	$52,%ymm17,%ymm26
	vpsrlq	$52,%ymm18,%ymm27
	vpsrlq	$52,%ymm19,%ymm28

	# Move each carry one qword up; zero enters the lowest lane.
	valignq	$3,%ymm27,%ymm28,%ymm28
	valignq	$3,%ymm26,%ymm27,%ymm27
	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm24,%ymm25,%ymm25
	valignq	$3,%ymm0,%ymm24,%ymm24

	vpandq	%ymm4,%ymm1,%ymm1
	vpandq	%ymm4,%ymm16,%ymm16
	vpandq	%ymm4,%ymm17,%ymm17
	vpandq	%ymm4,%ymm18,%ymm18
	vpandq	%ymm4,%ymm19,%ymm19

	vpaddq	%ymm24,%ymm1,%ymm1
	vpaddq	%ymm25,%ymm16,%ymm16
	vpaddq	%ymm26,%ymm17,%ymm17
	vpaddq	%ymm27,%ymm18,%ymm18
	vpaddq	%ymm28,%ymm19,%ymm19

	# Lanes where mask < value (carry generated).
	vpcmpuq	$1,%ymm1,%ymm4,%k1
	vpcmpuq	$1,%ymm16,%ymm4,%k2
	vpcmpuq	$1,%ymm17,%ymm4,%k3
	vpcmpuq	$1,%ymm18,%ymm4,%k4
	vpcmpuq	$1,%ymm19,%ymm4,%k5
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	kmovb	%k3,%r12d
	kmovb	%k4,%r11d
	kmovb	%k5,%r10d

	# Lanes equal to the mask (carry passed on).
	vpcmpuq	$0,%ymm1,%ymm4,%k1
	vpcmpuq	$0,%ymm16,%ymm4,%k2
	vpcmpuq	$0,%ymm17,%ymm4,%k3
	vpcmpuq	$0,%ymm18,%ymm4,%k4
	vpcmpuq	$0,%ymm19,%ymm4,%k5
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	kmovb	%k3,%ebx
	kmovb	%k4,%ecx
	kmovb	%k5,%edx

	# Pack lane masks into bytes and resolve propagation with add/adc.
	shlb	$4,%r13b
	orb	%r13b,%r14b
	shlb	$4,%r11b
	orb	%r11b,%r12b

	addb	%r14b,%r14b
	adcb	%r12b,%r12b
	adcb	%r10b,%r10b

	shlb	$4,%r8b
	orb	%r8b,%r9b
	shlb	$4,%cl
	orb	%cl,%bl

	addb	%r9b,%r14b
	adcb	%bl,%r12b
	adcb	%dl,%r10b

	xorb	%r9b,%r14b
	xorb	%bl,%r12b
	xorb	%dl,%r10b

	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r12d,%k3
	shrb	$4,%r12b
	kmovb	%r12d,%k4
	kmovb	%r10d,%k5

	# Marked lanes: subtract 2^52 - 1, then mask (adds 1 modulo 2^52).
	vpsubq	%ymm4,%ymm1,%ymm1{%k1}
	vpsubq	%ymm4,%ymm16,%ymm16{%k2}
	vpsubq	%ymm4,%ymm17,%ymm17{%k3}
	vpsubq	%ymm4,%ymm18,%ymm18{%k4}
	vpsubq	%ymm4,%ymm19,%ymm19{%k5}

	vpandq	%ymm4,%ymm1,%ymm1
	vpandq	%ymm4,%ymm16,%ymm16
	vpandq	%ymm4,%ymm17,%ymm17
	vpandq	%ymm4,%ymm18,%ymm18
	vpandq	%ymm4,%ymm19,%ymm19

	# ---- normalization, second instance (ymm2, ymm20-ymm23) ----
	vpbroadcastq	%r15,%ymm3
	vpblendd	$3,%ymm3,%ymm2,%ymm2	# lowest qword <- scalar accumulator r15

	vpsrlq	$52,%ymm2,%ymm24
	vpsrlq	$52,%ymm20,%ymm25
	vpsrlq	$52,%ymm21,%ymm26
	vpsrlq	$52,%ymm22,%ymm27
	vpsrlq	$52,%ymm23,%ymm28

	valignq	$3,%ymm27,%ymm28,%ymm28
	valignq	$3,%ymm26,%ymm27,%ymm27
	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm24,%ymm25,%ymm25
	valignq	$3,%ymm0,%ymm24,%ymm24

	vpandq	%ymm4,%ymm2,%ymm2
	vpandq	%ymm4,%ymm20,%ymm20
	vpandq	%ymm4,%ymm21,%ymm21
	vpandq	%ymm4,%ymm22,%ymm22
	vpandq	%ymm4,%ymm23,%ymm23

	vpaddq	%ymm24,%ymm2,%ymm2
	vpaddq	%ymm25,%ymm20,%ymm20
	vpaddq	%ymm26,%ymm21,%ymm21
	vpaddq	%ymm27,%ymm22,%ymm22
	vpaddq	%ymm28,%ymm23,%ymm23

	vpcmpuq	$1,%ymm2,%ymm4,%k1
	vpcmpuq	$1,%ymm20,%ymm4,%k2
	vpcmpuq	$1,%ymm21,%ymm4,%k3
	vpcmpuq	$1,%ymm22,%ymm4,%k4
	vpcmpuq	$1,%ymm23,%ymm4,%k5
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	kmovb	%k3,%r12d
	kmovb	%k4,%r11d
	kmovb	%k5,%r10d

	vpcmpuq	$0,%ymm2,%ymm4,%k1
	vpcmpuq	$0,%ymm20,%ymm4,%k2
	vpcmpuq	$0,%ymm21,%ymm4,%k3
	vpcmpuq	$0,%ymm22,%ymm4,%k4
	vpcmpuq	$0,%ymm23,%ymm4,%k5
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	kmovb	%k3,%ebx
	kmovb	%k4,%ecx
	kmovb	%k5,%edx

	shlb	$4,%r13b
	orb	%r13b,%r14b
	shlb	$4,%r11b
	orb	%r11b,%r12b

	addb	%r14b,%r14b
	adcb	%r12b,%r12b
	adcb	%r10b,%r10b

	shlb	$4,%r8b
	orb	%r8b,%r9b
	shlb	$4,%cl
	orb	%cl,%bl

	addb	%r9b,%r14b
	adcb	%bl,%r12b
	adcb	%dl,%r10b

	xorb	%r9b,%r14b
	xorb	%bl,%r12b
	xorb	%dl,%r10b

	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r12d,%k3
	shrb	$4,%r12b
	kmovb	%r12d,%k4
	kmovb	%r10d,%k5

	vpsubq	%ymm4,%ymm2,%ymm2{%k1}
	vpsubq	%ymm4,%ymm20,%ymm20{%k2}
	vpsubq	%ymm4,%ymm21,%ymm21{%k3}
	vpsubq	%ymm4,%ymm22,%ymm22{%k4}
	vpsubq	%ymm4,%ymm23,%ymm23{%k5}

	vpandq	%ymm4,%ymm2,%ymm2
	vpandq	%ymm4,%ymm20,%ymm20
	vpandq	%ymm4,%ymm21,%ymm21
	vpandq	%ymm4,%ymm22,%ymm22
	vpandq	%ymm4,%ymm23,%ymm23

	# Store both results back to back.
	vmovdqu64	%ymm1,(%rdi)
	vmovdqu64	%ymm16,32(%rdi)
	vmovdqu64	%ymm17,64(%rdi)
	vmovdqu64	%ymm18,96(%rdi)
	vmovdqu64	%ymm19,128(%rdi)

	vmovdqu64	%ymm2,160(%rdi)
	vmovdqu64	%ymm20,192(%rdi)
	vmovdqu64	%ymm21,224(%rdi)
	vmovdqu64	%ymm22,256(%rdi)
	vmovdqu64	%ymm23,288(%rdi)

	vzeroupper
	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lrsaz_amm52x20_x2_256_epilogue:
	.byte	0xf3,0xc3			# rep ret
.cfi_endproc
.size	ossl_rsaz_amm52x20_x2_256, .-ossl_rsaz_amm52x20_x2_256
.text

#-----------------------------------------------------------------------
# ossl_extract_multiplier_2x20_win5
# Copies one 160-byte record out of a table while reading every table
# slot, so the memory access pattern does not depend on the index in
# rdx.
# In:    rdi = destination; 160 bytes are stored
#        rsi = table base
#        rdx = index of the slot to extract (compared against 0..31)
#        rcx = record selector inside a slot; the byte offset rcx * 160
#              is added to the table base
# Out:   (rdi)..159(rdi) = selected record; all zero when rdx matches no
#        counter value in 0..31
# Clobb: rax, rsi, ymm0-ymm4, ymm16-ymm23, k1, flags.
# Layout implied by the code: 32 slots with a 320-byte stride
# (10240 = 32 * 320).
#-----------------------------------------------------------------------
.align	32
.globl	ossl_extract_multiplier_2x20_win5
.type	ossl_extract_multiplier_2x20_win5,@function
ossl_extract_multiplier_2x20_win5:
.cfi_startproc
.byte	243,15,30,250				# endbr64
	leaq	(%rcx,%rcx,4),%rax
	salq	$5,%rax				# rax = rcx * 160
	addq	%rax,%rsi			# rsi = first slot's selected record

	vmovdqa64	.Lones(%rip),%ymm23	# ymm23 = 1 in every lane
	vpbroadcastq	%rdx,%ymm22		# ymm22 = wanted index in every lane
	leaq	10240(%rsi),%rax		# rax = end cursor (32 slots ahead)

	vpxor	%xmm4,%xmm4,%xmm4		# result vectors ymm0-ymm4 = 0
	vmovdqa64	%ymm4,%ymm3
	vmovdqa64	%ymm4,%ymm2
	vmovdqa64	%ymm4,%ymm1
	vmovdqa64	%ymm4,%ymm0
	vmovdqa64	%ymm4,%ymm21		# ymm21 = running slot counter, from 0

.align	32
.Lloop:
	vpcmpq	$0,%ymm21,%ymm22,%k1		# k1 = all lanes set when counter == index
	addq	$320,%rsi			# step to the next slot
	vpaddq	%ymm23,%ymm21,%ymm21		# counter += 1
	vmovdqu64	-320(%rsi),%ymm16	# always load the current slot's record
	vmovdqu64	-288(%rsi),%ymm17
	vmovdqu64	-256(%rsi),%ymm18
	vmovdqu64	-224(%rsi),%ymm19
	vmovdqu64	-192(%rsi),%ymm20
	vpblendmq	%ymm16,%ymm0,%ymm0{%k1}	# keep it only on a match
	vpblendmq	%ymm17,%ymm1,%ymm1{%k1}
	vpblendmq	%ymm18,%ymm2,%ymm2{%k1}
	vpblendmq	%ymm19,%ymm3,%ymm3{%k1}
	vpblendmq	%ymm20,%ymm4,%ymm4{%k1}
	cmpq	%rsi,%rax
	jne	.Lloop

	vmovdqu64	%ymm0,(%rdi)
	vmovdqu64	%ymm1,32(%rdi)
	vmovdqu64	%ymm2,64(%rdi)
	vmovdqu64	%ymm3,96(%rdi)
	vmovdqu64	%ymm4,128(%rdi)

	# ymm0-ymm4 were written with 256-bit operations; clear the upper
	# halves before returning, as the two routines above do.
	vzeroupper
	.byte	0xf3,0xc3			# rep ret
.cfi_endproc
.size	ossl_extract_multiplier_2x20_win5, .-ossl_extract_multiplier_2x20_win5

# Constant table: the value 1 in each of four qword lanes (read-only).
.section	.rodata
.align	32
.Lones:
.quad	1,1,1,1

# ELF property note: type 5 (NT_GNU_PROPERTY_TYPE_0), owner "GNU",
# property 0xc0000002 with a 4-byte value of 3.
# NOTE(review): 0xc0000002 / 3 is presumably GNU_PROPERTY_X86_FEATURE_1_AND
# with the IBT and SHSTK bits - confirm against the ELF gABI extension.
	.section ".note.gnu.property", "a"
	.p2align 3
	.long 1f - 0f
	.long 4f - 1f
	.long 5
0:
	# "GNU" encoded with .byte, since .asciz isn't supported
	# on Solaris.
	.byte 0x47
	.byte 0x4e
	.byte 0x55
	.byte 0
1:
	.p2align 3
	.long 0xc0000002
	.long 3f - 2f
2:
	.long 3
3:
	.p2align 3
4: