/* Do not modify. This file is auto-generated from rsaz-2k-avx512.pl. */

# ----------------------------------------------------------------------------
# AVX-512 IFMA Almost Montgomery Multiplication (AMM) for 2048-bit RSA.
# Numbers are held in a redundant radix-2^52 representation: 20 limbs of
# 52 bits each, stored as 64-bit lanes across five 256-bit (ymm) registers.
# ABI: System V AMD64.  GAS/AT&T syntax.
# NOTE(review): generated code — the source of record is rsaz-2k-avx512.pl;
# do not hand-edit the instruction stream.
# ----------------------------------------------------------------------------

# int ossl_rsaz_avx512ifma_eligible(void)
# Returns non-zero iff the CPU advertises all AVX-512 features this code
# needs.  Reads dword 8 of OPENSSL_ia32cap_P (CPUID.(EAX=7,ECX=0).EBX).
# Mask 2149777408 = 0x80230000 = bits 16|17|21|31
#   = AVX512F | AVX512DQ | AVX512IFMA | AVX512VL.
.globl	ossl_rsaz_avx512ifma_eligible
.type	ossl_rsaz_avx512ifma_eligible,@function
.align	32
ossl_rsaz_avx512ifma_eligible:
	movl	OPENSSL_ia32cap_P+8(%rip),%ecx
	xorl	%eax,%eax			# default return: 0 (not eligible)
	andl	$2149777408,%ecx		# keep only the four required feature bits
	cmpl	$2149777408,%ecx		# all present?
	cmovel	%ecx,%eax			# yes -> return the (non-zero) mask
	.byte	0xf3,0xc3			# rep ret
.size	ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible
.text

# ----------------------------------------------------------------------------
# void ossl_rsaz_amm52x20_x1_ifma256(res, a, b, m, k0)
#   rdi = res  : output, 20 x 52-bit limbs (5 ymm-sized chunks)
#   rsi = a    : multiplicand, 20 limbs
#   rdx = b    : multiplier,   20 limbs (moved to r11; rdx is needed by mulx)
#   rcx = m    : modulus,      20 limbs
#   r8  = k0   : -m^{-1} mod 2^64, passed BY VALUE
# One 20-iteration Montgomery multiply-and-reduce, unrolled 4x (loop runs 5
# times, consuming b[0..3] of each 32-byte group per pass).
# Register roles inside the loop:
#   ymm3,ymm16..ymm19 : accumulator limbs 0..19
#   ymm1 = broadcast b[i], ymm2 = broadcast quotient digit
#   r9   = scalar carry/overflow of limb 0, r10 = high-part staging
#   rax  = 0xfffffffffffff (52-bit mask), ebx = loop counter
# ----------------------------------------------------------------------------
.globl	ossl_rsaz_amm52x20_x1_ifma256
.type	ossl_rsaz_amm52x20_x1_ifma256,@function
.align	32
ossl_rsaz_amm52x20_x1_ifma256:
.cfi_startproc
.byte	243,15,30,250			# endbr64 (CET/IBT landing pad)
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lossl_rsaz_amm52x20_x1_ifma256_body:

	# Zero the 20-limb accumulator (ymm3, ymm16..ymm19).
	vpxord	%ymm0,%ymm0,%ymm0
	vmovdqa64	%ymm0,%ymm3
	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm0,%ymm17
	vmovdqa64	%ymm0,%ymm18
	vmovdqa64	%ymm0,%ymm19

	xorl	%r9d,%r9d			# scalar carry = 0

	movq	%rdx,%r11			# r11 = b (free rdx for mulx)
	movq	$0xfffffffffffff,%rax		# rax = 2^52 - 1

	movl	$5,%ebx				# 5 passes x 4 unrolled digits = 20

.align	32
.Lloop5:
	# ---- digit 0 of this group: Bi = b[4*j+0] ----
	movq	0(%r11),%r13			# r13 = Bi

	vpbroadcastq	%r13,%ymm1		# ymm1 = {Bi,Bi,Bi,Bi}
	movq	0(%rsi),%rdx			# rdx = a[0] for mulx
	mulxq	%r13,%r13,%r12			# r12:r13 = a[0] * Bi
	addq	%r13,%r9			# fold low half into scalar acc[0]
	movq	%r12,%r10
	adcq	$0,%r10

	# Quotient digit: Yi = (acc[0] * k0) mod 2^52
	movq	%r8,%r13			# k0 passed by value in r8
	imulq	%r9,%r13
	andq	%rax,%r13			# Yi &= 2^52-1

	vpbroadcastq	%r13,%ymm2		# ymm2 = {Yi,Yi,Yi,Yi}
	movq	0(%rcx),%rdx			# rdx = m[0]
	mulxq	%r13,%r13,%r12			# r12:r13 = m[0] * Yi
	addq	%r13,%r9			# acc[0] += low; low 52 bits now cancel
	adcq	%r12,%r10

	# Carry out of limb 0: (acc[0] >> 52) | (high << 12)
	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	# acc += a * Bi (low 52-bit halves of products)
	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm16
	vpmadd52luq	64(%rsi),%ymm1,%ymm17
	vpmadd52luq	96(%rsi),%ymm1,%ymm18
	vpmadd52luq	128(%rsi),%ymm1,%ymm19

	# acc += m * Yi (low halves) — makes limb 0 divisible by 2^52
	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm16
	vpmadd52luq	64(%rcx),%ymm2,%ymm17
	vpmadd52luq	96(%rcx),%ymm2,%ymm18
	vpmadd52luq	128(%rcx),%ymm2,%ymm19

	# Shift accumulator right one limb (divide by 2^52).
	valignq	$1,%ymm3,%ymm16,%ymm3
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19		# top limb filled with 0

	vmovq	%xmm3,%r13			# fold new limb 0 into scalar carry
	addq	%r13,%r9

	# acc += high 52-bit halves of a*Bi and m*Yi (shifted contribution).
	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm16
	vpmadd52huq	64(%rsi),%ymm1,%ymm17
	vpmadd52huq	96(%rsi),%ymm1,%ymm18
	vpmadd52huq	128(%rsi),%ymm1,%ymm19

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm16
	vpmadd52huq	64(%rcx),%ymm2,%ymm17
	vpmadd52huq	96(%rcx),%ymm2,%ymm18
	vpmadd52huq	128(%rcx),%ymm2,%ymm19

	# ---- digit 1: Bi = b[4*j+1] (same sequence as digit 0) ----
	movq	8(%r11),%r13

	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm16
	vpmadd52luq	64(%rsi),%ymm1,%ymm17
	vpmadd52luq	96(%rsi),%ymm1,%ymm18
	vpmadd52luq	128(%rsi),%ymm1,%ymm19

	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm16
	vpmadd52luq	64(%rcx),%ymm2,%ymm17
	vpmadd52luq	96(%rcx),%ymm2,%ymm18
	vpmadd52luq	128(%rcx),%ymm2,%ymm19

	valignq	$1,%ymm3,%ymm16,%ymm3
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm3,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm16
	vpmadd52huq	64(%rsi),%ymm1,%ymm17
	vpmadd52huq	96(%rsi),%ymm1,%ymm18
	vpmadd52huq	128(%rsi),%ymm1,%ymm19

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm16
	vpmadd52huq	64(%rcx),%ymm2,%ymm17
	vpmadd52huq	96(%rcx),%ymm2,%ymm18
	vpmadd52huq	128(%rcx),%ymm2,%ymm19

	# ---- digit 2: Bi = b[4*j+2] ----
	movq	16(%r11),%r13

	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm16
	vpmadd52luq	64(%rsi),%ymm1,%ymm17
	vpmadd52luq	96(%rsi),%ymm1,%ymm18
	vpmadd52luq	128(%rsi),%ymm1,%ymm19

	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm16
	vpmadd52luq	64(%rcx),%ymm2,%ymm17
	vpmadd52luq	96(%rcx),%ymm2,%ymm18
	vpmadd52luq	128(%rcx),%ymm2,%ymm19

	valignq	$1,%ymm3,%ymm16,%ymm3
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm3,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm16
	vpmadd52huq	64(%rsi),%ymm1,%ymm17
	vpmadd52huq	96(%rsi),%ymm1,%ymm18
	vpmadd52huq	128(%rsi),%ymm1,%ymm19

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm16
	vpmadd52huq	64(%rcx),%ymm2,%ymm17
	vpmadd52huq	96(%rcx),%ymm2,%ymm18
	vpmadd52huq	128(%rcx),%ymm2,%ymm19

	# ---- digit 3: Bi = b[4*j+3] ----
	movq	24(%r11),%r13

	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm16
	vpmadd52luq	64(%rsi),%ymm1,%ymm17
	vpmadd52luq	96(%rsi),%ymm1,%ymm18
	vpmadd52luq	128(%rsi),%ymm1,%ymm19

	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm16
	vpmadd52luq	64(%rcx),%ymm2,%ymm17
	vpmadd52luq	96(%rcx),%ymm2,%ymm18
	vpmadd52luq	128(%rcx),%ymm2,%ymm19

	valignq	$1,%ymm3,%ymm16,%ymm3
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm3,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm16
	vpmadd52huq	64(%rsi),%ymm1,%ymm17
	vpmadd52huq	96(%rsi),%ymm1,%ymm18
	vpmadd52huq	128(%rsi),%ymm1,%ymm19

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm16
	vpmadd52huq	64(%rcx),%ymm2,%ymm17
	vpmadd52huq	96(%rcx),%ymm2,%ymm18
	vpmadd52huq	128(%rcx),%ymm2,%ymm19
	leaq	32(%r11),%r11			# advance b by 4 digits
	decl	%ebx
	jne	.Lloop5

	# ---- Normalization: fold scalar carry back in, then propagate the
	# ---- >52-bit excesses of every lane one limb to the left.
	vpbroadcastq	%r9,%ymm0
	vpblendd	$3,%ymm0,%ymm3,%ymm3	# limb 0 = r9 (lowest qword)

	# Per-lane carries: excess = limb >> 52 ...
	vpsrlq	$52,%ymm3,%ymm0
	vpsrlq	$52,%ymm16,%ymm1
	vpsrlq	$52,%ymm17,%ymm2
	vpsrlq	$52,%ymm18,%ymm25
	vpsrlq	$52,%ymm19,%ymm26

	# ... shifted left by one limb position across the register file.
	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm2,%ymm25,%ymm25
	valignq	$3,%ymm1,%ymm2,%ymm2
	valignq	$3,%ymm0,%ymm1,%ymm1
	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0

	# Mask each limb back to 52 bits ...
	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
	vpandq	.Lmask52x4(%rip),%ymm16,%ymm16
	vpandq	.Lmask52x4(%rip),%ymm17,%ymm17
	vpandq	.Lmask52x4(%rip),%ymm18,%ymm18
	vpandq	.Lmask52x4(%rip),%ymm19,%ymm19

	# ... and add the incoming carries.
	vpaddq	%ymm0,%ymm3,%ymm3
	vpaddq	%ymm1,%ymm16,%ymm16
	vpaddq	%ymm2,%ymm17,%ymm17
	vpaddq	%ymm25,%ymm18,%ymm18
	vpaddq	%ymm26,%ymm19,%ymm19

	# Adding carries may overflow a lane past 2^52 again.  Resolve all
	# secondary carries branchlessly with byte-wide mask arithmetic:
	# "GT mask" = lanes > 2^52-1 (must carry), "EQ mask" = lanes == 2^52-1
	# (propagate an incoming carry).  kadd trick via add/adc on mask bytes.
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm3,%k1	# 6 = "not less or equal" (GT)
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm16,%k2
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm17,%k3
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm18,%k4
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm19,%k5
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	kmovb	%k3,%r12d
	kmovb	%k4,%r11d
	kmovb	%k5,%r10d

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm3,%k1	# 0 = EQ
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm16,%k2
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm17,%k3
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm18,%k4
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm19,%k5
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	kmovb	%k3,%ebx
	kmovb	%k4,%ecx
	kmovb	%k5,%edx

	# Pack 4-bit per-register masks into bytes: r14b/r12b/r10b = GT masks,
	# r9b/bl/dl = EQ masks, 8 limbs per byte.
	shlb	$4,%r13b
	orb	%r13b,%r14b
	shlb	$4,%r11b
	orb	%r11b,%r12b

	# (GT << 1) with cross-byte carry: carries go to the next-higher limb.
	addb	%r14b,%r14b
	adcb	%r12b,%r12b
	adcb	%r10b,%r10b

	shlb	$4,%r8b
	orb	%r8b,%r9b
	shlb	$4,%cl
	orb	%cl,%bl

	# (GT<<1) + EQ : ripple carries through runs of all-ones limbs ...
	addb	%r9b,%r14b
	adcb	%bl,%r12b
	adcb	%dl,%r10b

	# ... XOR with EQ leaves exactly the limbs that must be decremented
	# by 2^52 (i.e. receive a borrow of the carry they emitted/absorbed).
	xorb	%r9b,%r14b
	xorb	%bl,%r12b
	xorb	%dl,%r10b

	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r12d,%k3
	shrb	$4,%r12b
	kmovb	%r12d,%k4
	kmovb	%r10d,%k5

	# Apply: selected limbs -= 2^52-1 (then masked), i.e. +1 mod 2^52.
	vpsubq	.Lmask52x4(%rip),%ymm3,%ymm3{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm16,%ymm16{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm17,%ymm17{%k3}
	vpsubq	.Lmask52x4(%rip),%ymm18,%ymm18{%k4}
	vpsubq	.Lmask52x4(%rip),%ymm19,%ymm19{%k5}

	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
	vpandq	.Lmask52x4(%rip),%ymm16,%ymm16
	vpandq	.Lmask52x4(%rip),%ymm17,%ymm17
	vpandq	.Lmask52x4(%rip),%ymm18,%ymm18
	vpandq	.Lmask52x4(%rip),%ymm19,%ymm19

	# Store the 20 normalized limbs to res.
	vmovdqu64	%ymm3,0(%rdi)
	vmovdqu64	%ymm16,32(%rdi)
	vmovdqu64	%ymm17,64(%rdi)
	vmovdqu64	%ymm18,96(%rdi)
	vmovdqu64	%ymm19,128(%rdi)

	vzeroupper				# required before returning to SSE/C code
	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lossl_rsaz_amm52x20_x1_ifma256_epilogue:
	.byte	0xf3,0xc3			# rep ret
.cfi_endproc
.size	ossl_rsaz_amm52x20_x1_ifma256, .-ossl_rsaz_amm52x20_x1_ifma256

# 4 x (2^52 - 1): per-lane 52-bit limb mask.
.section	.rodata
.align	32
.Lmask52x4:
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.text

# ----------------------------------------------------------------------------
# void ossl_rsaz_amm52x20_x2_ifma256(res, a, b, m, k0)
# Two independent 20-limb AMMs interleaved for throughput (used for the two
# CRT halves).  Operand halves live at byte offsets 0 and 160 of each array.
# Differences from the x1 variant:
#   - r8 = k0 is a POINTER to two 64-bit values ((%r8) and 8(%r8));
#   - 20 non-unrolled iterations, one digit of each half per pass;
#   - second accumulator in ymm4, ymm20..ymm23, scalar carry in r15.
# ----------------------------------------------------------------------------
.globl	ossl_rsaz_amm52x20_x2_ifma256
.type	ossl_rsaz_amm52x20_x2_ifma256,@function
.align	32
ossl_rsaz_amm52x20_x2_ifma256:
.cfi_startproc
.byte	243,15,30,250			# endbr64
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lossl_rsaz_amm52x20_x2_ifma256_body:

	# Zero both 20-limb accumulators.
	vpxord	%ymm0,%ymm0,%ymm0
	vmovdqa64	%ymm0,%ymm3
	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm0,%ymm17
	vmovdqa64	%ymm0,%ymm18
	vmovdqa64	%ymm0,%ymm19
	vmovdqa64	%ymm0,%ymm4
	vmovdqa64	%ymm0,%ymm20
	vmovdqa64	%ymm0,%ymm21
	vmovdqa64	%ymm0,%ymm22
	vmovdqa64	%ymm0,%ymm23

	xorl	%r9d,%r9d			# scalar carry, half 0
	xorl	%r15d,%r15d			# scalar carry, half 1

	movq	%rdx,%r11			# r11 = b
	movq	$0xfffffffffffff,%rax		# 52-bit mask

	movl	$20,%ebx			# 20 digits, one per pass

.align	32
.Lloop20:
	# ---- half 0: same per-digit sequence as in the x1 variant ----
	movq	0(%r11),%r13

	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	(%r8),%r13			# k0[0] (note: loaded via pointer)
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm16
	vpmadd52luq	64(%rsi),%ymm1,%ymm17
	vpmadd52luq	96(%rsi),%ymm1,%ymm18
	vpmadd52luq	128(%rsi),%ymm1,%ymm19

	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm16
	vpmadd52luq	64(%rcx),%ymm2,%ymm17
	vpmadd52luq	96(%rcx),%ymm2,%ymm18
	vpmadd52luq	128(%rcx),%ymm2,%ymm19

	valignq	$1,%ymm3,%ymm16,%ymm3
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm3,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm16
	vpmadd52huq	64(%rsi),%ymm1,%ymm17
	vpmadd52huq	96(%rsi),%ymm1,%ymm18
	vpmadd52huq	128(%rsi),%ymm1,%ymm19

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm16
	vpmadd52huq	64(%rcx),%ymm2,%ymm17
	vpmadd52huq	96(%rcx),%ymm2,%ymm18
	vpmadd52huq	128(%rcx),%ymm2,%ymm19

	# ---- half 1: operands at +160, own carry r15 and k0[1] ----
	movq	160(%r11),%r13

	vpbroadcastq	%r13,%ymm1
	movq	160(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r15
	movq	%r12,%r10
	adcq	$0,%r10

	movq	8(%r8),%r13			# k0[1]
	imulq	%r15,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	160(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r15
	adcq	%r12,%r10

	shrq	$52,%r15
	salq	$12,%r10
	orq	%r10,%r15

	vpmadd52luq	160(%rsi),%ymm1,%ymm4
	vpmadd52luq	192(%rsi),%ymm1,%ymm20
	vpmadd52luq	224(%rsi),%ymm1,%ymm21
	vpmadd52luq	256(%rsi),%ymm1,%ymm22
	vpmadd52luq	288(%rsi),%ymm1,%ymm23

	vpmadd52luq	160(%rcx),%ymm2,%ymm4
	vpmadd52luq	192(%rcx),%ymm2,%ymm20
	vpmadd52luq	224(%rcx),%ymm2,%ymm21
	vpmadd52luq	256(%rcx),%ymm2,%ymm22
	vpmadd52luq	288(%rcx),%ymm2,%ymm23

	valignq	$1,%ymm4,%ymm20,%ymm4
	valignq	$1,%ymm20,%ymm21,%ymm20
	valignq	$1,%ymm21,%ymm22,%ymm21
	valignq	$1,%ymm22,%ymm23,%ymm22
	valignq	$1,%ymm23,%ymm0,%ymm23

	vmovq	%xmm4,%r13
	addq	%r13,%r15

	vpmadd52huq	160(%rsi),%ymm1,%ymm4
	vpmadd52huq	192(%rsi),%ymm1,%ymm20
	vpmadd52huq	224(%rsi),%ymm1,%ymm21
	vpmadd52huq	256(%rsi),%ymm1,%ymm22
	vpmadd52huq	288(%rsi),%ymm1,%ymm23

	vpmadd52huq	160(%rcx),%ymm2,%ymm4
	vpmadd52huq	192(%rcx),%ymm2,%ymm20
	vpmadd52huq	224(%rcx),%ymm2,%ymm21
	vpmadd52huq	256(%rcx),%ymm2,%ymm22
	vpmadd52huq	288(%rcx),%ymm2,%ymm23
	leaq	8(%r11),%r11			# next digit of b
	decl	%ebx
	jne	.Lloop20

	# ---- Normalization of half 0 (identical scheme to the x1 variant) ----
	vpbroadcastq	%r9,%ymm0
	vpblendd	$3,%ymm0,%ymm3,%ymm3

	vpsrlq	$52,%ymm3,%ymm0
	vpsrlq	$52,%ymm16,%ymm1
	vpsrlq	$52,%ymm17,%ymm2
	vpsrlq	$52,%ymm18,%ymm25
	vpsrlq	$52,%ymm19,%ymm26

	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm2,%ymm25,%ymm25
	valignq	$3,%ymm1,%ymm2,%ymm2
	valignq	$3,%ymm0,%ymm1,%ymm1
	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0

	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
	vpandq	.Lmask52x4(%rip),%ymm16,%ymm16
	vpandq	.Lmask52x4(%rip),%ymm17,%ymm17
	vpandq	.Lmask52x4(%rip),%ymm18,%ymm18
	vpandq	.Lmask52x4(%rip),%ymm19,%ymm19

	vpaddq	%ymm0,%ymm3,%ymm3
	vpaddq	%ymm1,%ymm16,%ymm16
	vpaddq	%ymm2,%ymm17,%ymm17
	vpaddq	%ymm25,%ymm18,%ymm18
	vpaddq	%ymm26,%ymm19,%ymm19

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm3,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm16,%k2
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm17,%k3
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm18,%k4
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm19,%k5
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	kmovb	%k3,%r12d
	kmovb	%k4,%r11d
	kmovb	%k5,%r10d

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm3,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm16,%k2
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm17,%k3
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm18,%k4
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm19,%k5
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	kmovb	%k3,%ebx
	kmovb	%k4,%ecx
	kmovb	%k5,%edx

	shlb	$4,%r13b
	orb	%r13b,%r14b
	shlb	$4,%r11b
	orb	%r11b,%r12b

	addb	%r14b,%r14b
	adcb	%r12b,%r12b
	adcb	%r10b,%r10b

	shlb	$4,%r8b
	orb	%r8b,%r9b
	shlb	$4,%cl
	orb	%cl,%bl

	addb	%r9b,%r14b
	adcb	%bl,%r12b
	adcb	%dl,%r10b

	xorb	%r9b,%r14b
	xorb	%bl,%r12b
	xorb	%dl,%r10b

	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r12d,%k3
	shrb	$4,%r12b
	kmovb	%r12d,%k4
	kmovb	%r10d,%k5

	vpsubq	.Lmask52x4(%rip),%ymm3,%ymm3{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm16,%ymm16{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm17,%ymm17{%k3}
	vpsubq	.Lmask52x4(%rip),%ymm18,%ymm18{%k4}
	vpsubq	.Lmask52x4(%rip),%ymm19,%ymm19{%k5}

	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
	vpandq	.Lmask52x4(%rip),%ymm16,%ymm16
	vpandq	.Lmask52x4(%rip),%ymm17,%ymm17
	vpandq	.Lmask52x4(%rip),%ymm18,%ymm18
	vpandq	.Lmask52x4(%rip),%ymm19,%ymm19

	# ---- Normalization of half 1 (carry r15, regs ymm4, ymm20..ymm23) ----
	vpbroadcastq	%r15,%ymm0
	vpblendd	$3,%ymm0,%ymm4,%ymm4

	vpsrlq	$52,%ymm4,%ymm0
	vpsrlq	$52,%ymm20,%ymm1
	vpsrlq	$52,%ymm21,%ymm2
	vpsrlq	$52,%ymm22,%ymm25
	vpsrlq	$52,%ymm23,%ymm26

	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm2,%ymm25,%ymm25
	valignq	$3,%ymm1,%ymm2,%ymm2
	valignq	$3,%ymm0,%ymm1,%ymm1
	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0

	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
	vpandq	.Lmask52x4(%rip),%ymm20,%ymm20
	vpandq	.Lmask52x4(%rip),%ymm21,%ymm21
	vpandq	.Lmask52x4(%rip),%ymm22,%ymm22
	vpandq	.Lmask52x4(%rip),%ymm23,%ymm23

	vpaddq	%ymm0,%ymm4,%ymm4
	vpaddq	%ymm1,%ymm20,%ymm20
	vpaddq	%ymm2,%ymm21,%ymm21
	vpaddq	%ymm25,%ymm22,%ymm22
	vpaddq	%ymm26,%ymm23,%ymm23

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm4,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm20,%k2
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm21,%k3
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm22,%k4
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm23,%k5
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	kmovb	%k3,%r12d
	kmovb	%k4,%r11d
	kmovb	%k5,%r10d

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm4,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm20,%k2
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm21,%k3
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm22,%k4
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm23,%k5
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	kmovb	%k3,%ebx
	kmovb	%k4,%ecx
	kmovb	%k5,%edx

	shlb	$4,%r13b
	orb	%r13b,%r14b
	shlb	$4,%r11b
	orb	%r11b,%r12b

	addb	%r14b,%r14b
	adcb	%r12b,%r12b
	adcb	%r10b,%r10b

	shlb	$4,%r8b
	orb	%r8b,%r9b
	shlb	$4,%cl
	orb	%cl,%bl

	addb	%r9b,%r14b
	adcb	%bl,%r12b
	adcb	%dl,%r10b

	xorb	%r9b,%r14b
	xorb	%bl,%r12b
	xorb	%dl,%r10b

	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r12d,%k3
	shrb	$4,%r12b
	kmovb	%r12d,%k4
	kmovb	%r10d,%k5

	vpsubq	.Lmask52x4(%rip),%ymm4,%ymm4{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm20,%ymm20{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm21,%ymm21{%k3}
	vpsubq	.Lmask52x4(%rip),%ymm22,%ymm22{%k4}
	vpsubq	.Lmask52x4(%rip),%ymm23,%ymm23{%k5}

	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
	vpandq	.Lmask52x4(%rip),%ymm20,%ymm20
	vpandq	.Lmask52x4(%rip),%ymm21,%ymm21
	vpandq	.Lmask52x4(%rip),%ymm22,%ymm22
	vpandq	.Lmask52x4(%rip),%ymm23,%ymm23

	# Store both results: half 0 at res+0, half 1 at res+160.
	vmovdqu64	%ymm3,0(%rdi)
	vmovdqu64	%ymm16,32(%rdi)
	vmovdqu64	%ymm17,64(%rdi)
	vmovdqu64	%ymm18,96(%rdi)
	vmovdqu64	%ymm19,128(%rdi)

	vmovdqu64	%ymm4,160(%rdi)
	vmovdqu64	%ymm20,192(%rdi)
	vmovdqu64	%ymm21,224(%rdi)
	vmovdqu64	%ymm22,256(%rdi)
	vmovdqu64	%ymm23,288(%rdi)

	vzeroupper
	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lossl_rsaz_amm52x20_x2_ifma256_epilogue:
	.byte	0xf3,0xc3			# rep ret
.cfi_endproc
.size	ossl_rsaz_amm52x20_x2_ifma256, .-ossl_rsaz_amm52x20_x2_ifma256
.text

# ----------------------------------------------------------------------------
# void ossl_extract_multiplier_2x20_win5(out, table, idx0, idx1)
#   rdi = out, rsi = table, rdx = idx0, rcx = idx1
# Constant-time extraction of two 20-limb multipliers from a window-5
# precomputation table: the FULL table is always scanned (10240 bytes =
# 32 entries x 320 bytes) and the wanted entries are selected with masked
# blends, so memory access pattern is independent of the secret indices.
# ymm22/ymm23 = broadcast idx0/idx1, ymm21 = running entry counter.
# ----------------------------------------------------------------------------
.align	32
.globl	ossl_extract_multiplier_2x20_win5
.type	ossl_extract_multiplier_2x20_win5,@function
ossl_extract_multiplier_2x20_win5:
.cfi_startproc
.byte	243,15,30,250			# endbr64
	vmovdqa64	.Lones(%rip),%ymm24
	vpbroadcastq	%rdx,%ymm22		# target index for half 0
	vpbroadcastq	%rcx,%ymm23		# target index for half 1
	leaq	10240(%rsi),%rax		# end of table (32 entries x 320 B)

	# Zero the counter and the 10 result registers.
	vpxor	%xmm0,%xmm0,%xmm0
	vmovdqa64	%ymm0,%ymm21
	vmovdqa64	%ymm0,%ymm1
	vmovdqa64	%ymm0,%ymm2
	vmovdqa64	%ymm0,%ymm3
	vmovdqa64	%ymm0,%ymm4
	vmovdqa64	%ymm0,%ymm5
	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm0,%ymm17
	vmovdqa64	%ymm0,%ymm18
	vmovdqa64	%ymm0,%ymm19

.align	32
.Lloop:
	# k1/k2 = all-ones iff current entry == wanted index (half 0 / half 1).
	vpcmpq	$0,%ymm21,%ymm22,%k1
	vpcmpq	$0,%ymm21,%ymm23,%k2
	# Blend entry into the result only under the match mask — every table
	# entry is read regardless, keeping the access pattern secret-free.
	vmovdqu64	0(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm0,%ymm0{%k1}
	vmovdqu64	32(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm1,%ymm1{%k1}
	vmovdqu64	64(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm2,%ymm2{%k1}
	vmovdqu64	96(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm3,%ymm3{%k1}
	vmovdqu64	128(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm4,%ymm4{%k1}
	vmovdqu64	160(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm5,%ymm5{%k2}
	vmovdqu64	192(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm16,%ymm16{%k2}
	vmovdqu64	224(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm17,%ymm17{%k2}
	vmovdqu64	256(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm18,%ymm18{%k2}
	vmovdqu64	288(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm19,%ymm19{%k2}
	vpaddq	%ymm24,%ymm21,%ymm21		# counter += 1 in every lane
	addq	$320,%rsi
	cmpq	%rsi,%rax
	jne	.Lloop
	# Write out both extracted 20-limb values.
	vmovdqu64	%ymm0,0(%rdi)
	vmovdqu64	%ymm1,32(%rdi)
	vmovdqu64	%ymm2,64(%rdi)
	vmovdqu64	%ymm3,96(%rdi)
	vmovdqu64	%ymm4,128(%rdi)
	vmovdqu64	%ymm5,160(%rdi)
	vmovdqu64	%ymm16,192(%rdi)
	vmovdqu64	%ymm17,224(%rdi)
	vmovdqu64	%ymm18,256(%rdi)
	vmovdqu64	%ymm19,288(%rdi)
	.byte	0xf3,0xc3			# rep ret
.cfi_endproc
.size	ossl_extract_multiplier_2x20_win5, .-ossl_extract_multiplier_2x20_win5

# Constants for the counter increment and the carry-shift zero source.
.section	.rodata
.align	32
.Lones:
.quad	1,1,1,1
.Lzeros:
.quad	0,0,0,0

# GNU property note marking the object as CET (IBT + SHSTK) compatible.
	.section ".note.gnu.property", "a"
	.p2align 3
	.long 1f - 0f
	.long 4f - 1f
	.long 5
0:
	# "GNU" encoded with .byte, since .asciz isn't supported
	# on Solaris.
	.byte 0x47
	.byte 0x4e
	.byte 0x55
	.byte 0
1:
	.p2align 3
	.long 0xc0000002
	.long 3f - 2f
2:
	.long 3
3:
	.p2align 3
4: