/* Do not modify. This file is auto-generated from rsaz-3k-avxifma.pl. */
/*
 * NOTE(review): this copy has been reflowed to one statement per line and the
 * unrolled multiply/reduce steps are expressed through assembler macros.  The
 * expanded instruction stream is the same as before; the only addition is CFI
 * bookkeeping around the temporary stack adjustments.  A lasting change
 * belongs in the generator script, because regenerating overwrites this file.
 *
 * Syntax: GNU as, AT&T operand order (source first, destination last).
 */
.text

/*
 * IFMA8 op, base, mult
 *
 * Issues the 52-bit multiply-accumulate instruction `op` (vpmadd52luq or
 * vpmadd52huq) once per accumulator register: ymm3..ymm10 each accumulate
 * the product of `mult` with the 32-byte group at base+0, +32, ... +224.
 */
.macro IFMA8 op, base, mult
{vex}   \op     0(\base),\mult,%ymm3
{vex}   \op     32(\base),\mult,%ymm4
{vex}   \op     64(\base),\mult,%ymm5
{vex}   \op     96(\base),\mult,%ymm6
{vex}   \op     128(\base),\mult,%ymm7
{vex}   \op     160(\base),\mult,%ymm8
{vex}   \op     192(\base),\mult,%ymm9
{vex}   \op     224(\base),\mult,%ymm10
.endm

/*
 * AMM52X30_STEP b_off
 *
 * One multiply/reduce step for the 64-bit word at b_off(%r11).
 *
 * Live registers on entry and exit:
 *   rsi, rcx     bases of two 256-byte operands (read only)
 *   r8           64-bit factor used to derive the reduction multiplier
 *   r11          base of the word currently being consumed
 *   rax          0xfffffffffffff (2^52 - 1)
 *   r9           scalar accumulator for the lowest digit
 *   ymm3..ymm10  32 x 64-bit vector accumulators
 * Scratch: rdx, r10, r12, r13, ymm1, ymm2, flags, 264 bytes of stack.
 *
 * NOTE(review): judging by the symbol name this is one iteration of an
 * "almost Montgomery" multiplication in radix 2^52 (rsi = a, rcx = modulus,
 * r8 = k0, r11 = b) - confirm against the generator script.
 */
.macro AMM52X30_STEP b_off
        /* r12:r13 = word * [rsi];  r10:r9 = r9 + r12:r13 */
        movq    \b_off(%r11),%r13
        vpbroadcastq    \b_off(%r11),%ymm1
        movq    0(%rsi),%rdx
        mulxq   %r13,%r13,%r12
        addq    %r13,%r9
        movq    %r12,%r10
        adcq    $0,%r10

        /* r13 = (r8 * r9) mod 2^52, also broadcast into ymm2 */
        movq    %r8,%r13
        imulq   %r9,%r13
        andq    %rax,%r13

        vmovq   %r13,%xmm2
        vpbroadcastq    %xmm2,%ymm2

        /* r10:r9 += r13 * [rcx] */
        movq    0(%rcx),%rdx
        mulxq   %r13,%r13,%r12
        addq    %r13,%r9
        adcq    %r12,%r10

        /* r9 = (r10:r9) >> 52 */
        shrq    $52,%r9
        salq    $12,%r10
        orq     %r10,%r9

        /* 264-byte scratch area; keep the CFA rule in step with rsp */
        leaq    -264(%rsp),%rsp
        .cfi_adjust_cfa_offset 264

        /* low 52-bit halves of the products */
        IFMA8   vpmadd52luq, %rsi, %ymm1
        IFMA8   vpmadd52luq, %rcx, %ymm2

        /*
         * Move every accumulator down by one 64-bit slot: spill all eight
         * registers, append a zero qword, reload from offset 8.
         */
        vmovdqu %ymm3,0(%rsp)
        vmovdqu %ymm4,32(%rsp)
        vmovdqu %ymm5,64(%rsp)
        vmovdqu %ymm6,96(%rsp)
        vmovdqu %ymm7,128(%rsp)
        vmovdqu %ymm8,160(%rsp)
        vmovdqu %ymm9,192(%rsp)
        vmovdqu %ymm10,224(%rsp)
        movq    $0,256(%rsp)

        vmovdqu 8(%rsp),%ymm3
        vmovdqu 40(%rsp),%ymm4
        vmovdqu 72(%rsp),%ymm5
        vmovdqu 104(%rsp),%ymm6
        vmovdqu 136(%rsp),%ymm7
        vmovdqu 168(%rsp),%ymm8
        vmovdqu 200(%rsp),%ymm9
        vmovdqu 232(%rsp),%ymm10

        /* the slot that became the new lowest one also feeds r9 */
        addq    8(%rsp),%r9

        /* high 52-bit halves land one slot lower thanks to the shift */
        IFMA8   vpmadd52huq, %rsi, %ymm1
        IFMA8   vpmadd52huq, %rcx, %ymm2

        leaq    264(%rsp),%rsp
        .cfi_adjust_cfa_offset -264
.endm

/*
 * CARRY_UP cur, below
 *
 * Shifts the four qwords of `cur` up by one lane and fills lane 0 with
 * lane 3 of `below`.  ymm3 is used as a temporary.
 */
.macro CARRY_UP cur, below
        vpermq  $144,\cur,\cur
        vpermq  $3,\below,%ymm3
        vblendpd        $1,%ymm3,\cur,\cur
.endm

/*
 * CMP_MASK8 insn, v_lo, v_hi, t_lo, t_hi, r_lo, r_hi, b_lo, b_hi
 *
 * Compares two vectors against .Lmask52x4 with `insn` and packs the two
 * 4-bit lane masks into one byte: b_lo = mask(v_lo) | mask(v_hi) << 4.
 * r_lo/r_hi are the 32-bit names and b_lo/b_hi the 8-bit names of the same
 * two general-purpose registers.
 */
.macro CMP_MASK8 insn, v_lo, v_hi, t_lo, t_hi, r_lo, r_hi, b_lo, b_hi
        \insn   .Lmask52x4(%rip),\v_lo,\t_lo
        \insn   .Lmask52x4(%rip),\v_hi,\t_hi
        vmovmskpd       \t_lo,\r_lo
        vmovmskpd       \t_hi,\r_hi
        shlb    $4,\b_hi
        orb     \b_hi,\b_lo
.endm

/*
 * FIXUP_LO vec, sel64, sel8
 *
 * Saves the whole selector byte in r10b for the following FIXUP_HI, then
 * uses its low nibble to pick a row of .Lkmasklut (base in rdx).  In every
 * lane selected by that row, vec is replaced by vec - (2^52 - 1).
 * Clobbers ymm0, ymm2 and the selector register.
 */
.macro FIXUP_LO vec, sel64, sel8
        movb    \sel8,%r10b
        andq    $0xf,\sel64
        vpsubq  .Lmask52x4(%rip),\vec,%ymm0
        shlq    $5,\sel64
        vmovapd (%rdx,\sel64,1),%ymm2
        vblendvpd       %ymm2,%ymm0,\vec,\vec
.endm

/*
 * FIXUP_HI vec
 *
 * Same as FIXUP_LO but driven by the high nibble of the byte that the
 * preceding FIXUP_LO left in r10b.
 */
.macro FIXUP_HI vec
        shrb    $4,%r10b
        andq    $0xf,%r10
        vpsubq  .Lmask52x4(%rip),\vec,%ymm0
        shlq    $5,%r10
        vmovapd (%rdx,%r10,1),%ymm2
        vblendvpd       %ymm2,%ymm0,\vec,\vec
.endm

/*
 * ossl_rsaz_amm52x30_x1_avxifma256
 *
 * Register usage is consistent with the System V AMD64 convention:
 *   rdi  destination, 256 bytes are written
 *   rsi  first operand, 256 bytes are read
 *   rdx  second operand, 30 consecutive qwords are read
 *   rcx  third operand, 256 bytes are read
 *   r8   64-bit scalar, passed by value
 * NOTE(review): the symbol name suggests res/a/b/m/k0 of a 30-digit, radix
 * 2^52 almost-Montgomery multiplication - confirm with the C prototype.
 *
 * The 30 words of the second operand are consumed as 7 loop passes of four
 * steps followed by two more steps; the result is then carry-normalised to
 * 52-bit digits and stored.
 *
 * Callee-saved rbx, rbp, r12-r15 are pushed on entry and reloaded on exit.
 * Clobbers: rax, rcx, rdx, r8-r11, ymm0-ymm15, flags.  No value is
 * deliberately returned (rax holds a stack address at exit).
 * Stack: 48 bytes of saved registers plus at most 264 bytes of scratch.
 * Requires BMI2 (mulx), AVX2 and AVX-IFMA.
 */
.globl  ossl_rsaz_amm52x30_x1_avxifma256
.type   ossl_rsaz_amm52x30_x1_avxifma256,@function
.align  32
ossl_rsaz_amm52x30_x1_avxifma256:
.cfi_startproc
        .byte   243,15,30,250           /* endbr64 */
        pushq   %rbx
.cfi_adjust_cfa_offset  8
.cfi_offset     %rbx,-16
        pushq   %rbp
.cfi_adjust_cfa_offset  8
.cfi_offset     %rbp,-24
        pushq   %r12
.cfi_adjust_cfa_offset  8
.cfi_offset     %r12,-32
        pushq   %r13
.cfi_adjust_cfa_offset  8
.cfi_offset     %r13,-40
        pushq   %r14
.cfi_adjust_cfa_offset  8
.cfi_offset     %r14,-48
        pushq   %r15
.cfi_adjust_cfa_offset  8
.cfi_offset     %r15,-56

        /* clear the eight vector accumulators and the scalar one */
        vpxor   %ymm0,%ymm0,%ymm0
        vmovapd %ymm0,%ymm3
        vmovapd %ymm0,%ymm4
        vmovapd %ymm0,%ymm5
        vmovapd %ymm0,%ymm6
        vmovapd %ymm0,%ymm7
        vmovapd %ymm0,%ymm8
        vmovapd %ymm0,%ymm9
        vmovapd %ymm0,%ymm10

        xorl    %r9d,%r9d

        movq    %rdx,%r11               /* rdx is needed as the implicit mulx source */
        movq    $0xfffffffffffff,%rax   /* 2^52 - 1 */

        /* 7 passes x 4 words, then 2 trailing words = 30 words */
        movl    $7,%ebx

.align  32
.Lloop7:
        AMM52X30_STEP 0
        AMM52X30_STEP 8
        AMM52X30_STEP 16
        AMM52X30_STEP 24
        leaq    32(%r11),%r11
        decl    %ebx
        jne     .Lloop7

        AMM52X30_STEP 0
        AMM52X30_STEP 8

        /* ---- normalisation to 52-bit digits ---- */

        /* put the scalar accumulator back into the lowest qword of ymm3 */
        vmovq   %r9,%xmm0
        vpbroadcastq    %xmm0,%ymm0
        vpblendd        $3,%ymm0,%ymm3,%ymm3

        /* bits 52 and up of every slot are the carry into the next slot */
        vpsrlq  $52,%ymm3,%ymm0
        vpsrlq  $52,%ymm4,%ymm1
        vpsrlq  $52,%ymm5,%ymm2
        vpsrlq  $52,%ymm6,%ymm11
        vpsrlq  $52,%ymm7,%ymm12
        vpsrlq  $52,%ymm8,%ymm13
        vpsrlq  $52,%ymm9,%ymm14
        vpsrlq  $52,%ymm10,%ymm15

        /* ymm3 doubles as the CARRY_UP temporary, so park it on the stack */
        leaq    -32(%rsp),%rsp
        .cfi_adjust_cfa_offset 32
        vmovupd %ymm3,(%rsp)

        /* move the carry vectors up by one slot, highest register first */
        CARRY_UP %ymm15, %ymm14
        CARRY_UP %ymm14, %ymm13
        CARRY_UP %ymm13, %ymm12
        CARRY_UP %ymm12, %ymm11
        CARRY_UP %ymm11, %ymm2
        CARRY_UP %ymm2, %ymm1
        CARRY_UP %ymm1, %ymm0

        /* nothing carries into slot 0: clear lane 0 of the lowest vector */
        vpermq  $144,%ymm0,%ymm0
        vpand   .Lhigh64x3(%rip),%ymm0,%ymm0

        vmovupd (%rsp),%ymm3
        leaq    32(%rsp),%rsp
        .cfi_adjust_cfa_offset -32

        /* keep the low 52 bits of every slot ... */
        vpand   .Lmask52x4(%rip),%ymm3,%ymm3
        vpand   .Lmask52x4(%rip),%ymm4,%ymm4
        vpand   .Lmask52x4(%rip),%ymm5,%ymm5
        vpand   .Lmask52x4(%rip),%ymm6,%ymm6
        vpand   .Lmask52x4(%rip),%ymm7,%ymm7
        vpand   .Lmask52x4(%rip),%ymm8,%ymm8
        vpand   .Lmask52x4(%rip),%ymm9,%ymm9
        vpand   .Lmask52x4(%rip),%ymm10,%ymm10

        /* ... and add the carries that arrived from the slot below */
        vpaddq  %ymm0,%ymm3,%ymm3
        vpaddq  %ymm1,%ymm4,%ymm4
        vpaddq  %ymm2,%ymm5,%ymm5
        vpaddq  %ymm11,%ymm6,%ymm6
        vpaddq  %ymm12,%ymm7,%ymm7
        vpaddq  %ymm13,%ymm8,%ymm8
        vpaddq  %ymm14,%ymm9,%ymm9
        vpaddq  %ymm15,%ymm10,%ymm10

        /* one bit per slot: slot > 2^52 - 1 (bytes r14b, r13b, r12b, r11b) */
        CMP_MASK8 vpcmpgtq, %ymm3, %ymm4, %ymm0, %ymm1, %r14d, %r13d, %r14b, %r13b
        CMP_MASK8 vpcmpgtq, %ymm5, %ymm6, %ymm2, %ymm11, %r13d, %r12d, %r13b, %r12b
        CMP_MASK8 vpcmpgtq, %ymm7, %ymm8, %ymm12, %ymm13, %r12d, %r11d, %r12b, %r11b
        CMP_MASK8 vpcmpgtq, %ymm9, %ymm10, %ymm14, %ymm15, %r11d, %r10d, %r11b, %r10b

        /* shift the 32-bit mask left by one: a carry goes to the next slot */
        addb    %r14b,%r14b
        adcb    %r13b,%r13b
        adcb    %r12b,%r12b
        adcb    %r11b,%r11b

        /* one bit per slot: slot == 2^52 - 1 (bytes r9b, r8b, dl, cl) */
        CMP_MASK8 vpcmpeqq, %ymm3, %ymm4, %ymm0, %ymm1, %r9d, %r8d, %r9b, %r8b
        CMP_MASK8 vpcmpeqq, %ymm5, %ymm6, %ymm2, %ymm11, %r8d, %edx, %r8b, %dl
        CMP_MASK8 vpcmpeqq, %ymm7, %ymm8, %ymm12, %ymm13, %edx, %ecx, %dl, %cl
        CMP_MASK8 vpcmpeqq, %ymm9, %ymm10, %ymm14, %ymm15, %ecx, %ebx, %cl, %bl

        /*
         * Adding the "equal" mask lets a carry ripple through runs of
         * saturated slots; the xor then leaves one bit for every slot that
         * actually receives a carry.
         */
        addb    %r9b,%r14b
        adcb    %r8b,%r13b
        adcb    %dl,%r12b
        adcb    %cl,%r11b

        xorb    %r9b,%r14b
        xorb    %r8b,%r13b
        xorb    %dl,%r12b
        xorb    %cl,%r11b

        leaq    .Lkmasklut(%rip),%rdx

        /* subtract 2^52 - 1 in the flagged slots, four slots per nibble */
        FIXUP_LO %ymm3, %r14, %r14b
        FIXUP_HI %ymm4
        FIXUP_LO %ymm5, %r13, %r13b
        FIXUP_HI %ymm6
        FIXUP_LO %ymm7, %r12, %r12b
        FIXUP_HI %ymm8
        FIXUP_LO %ymm9, %r11, %r11b
        FIXUP_HI %ymm10

        /* drop whatever is left above bit 51 */
        vpand   .Lmask52x4(%rip),%ymm3,%ymm3
        vpand   .Lmask52x4(%rip),%ymm4,%ymm4
        vpand   .Lmask52x4(%rip),%ymm5,%ymm5
        vpand   .Lmask52x4(%rip),%ymm6,%ymm6
        vpand   .Lmask52x4(%rip),%ymm7,%ymm7
        vpand   .Lmask52x4(%rip),%ymm8,%ymm8
        vpand   .Lmask52x4(%rip),%ymm9,%ymm9

        vpand   .Lmask52x4(%rip),%ymm10,%ymm10

        /* write the 32 result slots */
        vmovdqu %ymm3,0(%rdi)
        vmovdqu %ymm4,32(%rdi)
        vmovdqu %ymm5,64(%rdi)
        vmovdqu %ymm6,96(%rdi)
        vmovdqu %ymm7,128(%rdi)
        vmovdqu %ymm8,160(%rdi)
        vmovdqu %ymm9,192(%rdi)
        vmovdqu %ymm10,224(%rdi)

        vzeroupper
        /* reload the saved registers through rax, then release the frame */
        leaq    (%rsp),%rax
.cfi_def_cfa_register   %rax
        movq    0(%rax),%r15
.cfi_restore    %r15
        movq    8(%rax),%r14
.cfi_restore    %r14
        movq    16(%rax),%r13
.cfi_restore    %r13
        movq    24(%rax),%r12
.cfi_restore    %r12
        movq    32(%rax),%rbp
.cfi_restore    %rbp
        movq    40(%rax),%rbx
.cfi_restore    %rbx
        leaq    48(%rax),%rsp
.cfi_def_cfa    %rsp,8
.Lossl_rsaz_amm52x30_x1_avxifma256_epilogue:
        .byte   0xf3,0xc3               /* rep ret */
.cfi_endproc
.size   ossl_rsaz_amm52x30_x1_avxifma256, .-ossl_rsaz_amm52x30_x1_avxifma256
.section        .rodata
.align  32
/* 2^52 - 1 in each of the four lanes */
.Lmask52x4:
.quad   0xfffffffffffff
.quad   0xfffffffffffff
.quad   0xfffffffffffff
.quad   0xfffffffffffff
/* lane 0 clear, lanes 1..3 all ones */
.Lhigh64x3:
.quad   0x0
.quad   0xffffffffffffffff
.quad   0xffffffffffffffff
.quad   0xffffffffffffffff
/*
 * Blend-mask table, one 32-byte row per 4-bit index: lane j of row k is all
 * ones when bit j of k is set.  Rows are fetched with vmovapd, so the table
 * must stay 32-byte aligned.
 */
.Lkmasklut:

/* 0b0000 */
.quad   0x0
.quad   0x0
.quad   0x0
.quad   0x0

/* 0b0001 */
.quad   0xffffffffffffffff
.quad   0x0
.quad   0x0
.quad   0x0

/* 0b0010 */
.quad   0x0
.quad   0xffffffffffffffff
.quad   0x0
.quad   0x0

/* 0b0011 */
.quad   0xffffffffffffffff
.quad   0xffffffffffffffff
.quad   0x0
.quad   0x0

/* 0b0100 */
.quad   0x0
.quad   0x0
.quad   0xffffffffffffffff
.quad   0x0

/* 0b0101 */
.quad   0xffffffffffffffff
.quad   0x0
.quad   0xffffffffffffffff
.quad   0x0

/* 0b0110 */
.quad   0x0
.quad   0xffffffffffffffff
.quad   0xffffffffffffffff
.quad   0x0

/* 0b0111 */
.quad   0xffffffffffffffff
.quad   0xffffffffffffffff
.quad   0xffffffffffffffff
.quad   0x0

/* 0b1000 */
.quad   0x0
.quad   0x0
.quad   0x0
.quad   0xffffffffffffffff

/* 0b1001: first quad only; the rest of this row follows immediately */
.quad   0xffffffffffffffff
868.quad 0x0 869.quad 0x0 870.quad 0xffffffffffffffff 871 872.quad 0x0 873.quad 0xffffffffffffffff 874.quad 0x0 875.quad 0xffffffffffffffff 876 877.quad 0xffffffffffffffff 878.quad 0xffffffffffffffff 879.quad 0x0 880.quad 0xffffffffffffffff 881 882.quad 0x0 883.quad 0x0 884.quad 0xffffffffffffffff 885.quad 0xffffffffffffffff 886 887.quad 0xffffffffffffffff 888.quad 0x0 889.quad 0xffffffffffffffff 890.quad 0xffffffffffffffff 891 892.quad 0x0 893.quad 0xffffffffffffffff 894.quad 0xffffffffffffffff 895.quad 0xffffffffffffffff 896 897.quad 0xffffffffffffffff 898.quad 0xffffffffffffffff 899.quad 0xffffffffffffffff 900.quad 0xffffffffffffffff 901.text 902 903.globl ossl_rsaz_amm52x30_x2_avxifma256 904.type ossl_rsaz_amm52x30_x2_avxifma256,@function 905.align 32 906ossl_rsaz_amm52x30_x2_avxifma256: 907.cfi_startproc 908.byte 243,15,30,250 909 pushq %rbx 910.cfi_adjust_cfa_offset 8 911.cfi_offset %rbx,-16 912 pushq %rbp 913.cfi_adjust_cfa_offset 8 914.cfi_offset %rbp,-24 915 pushq %r12 916.cfi_adjust_cfa_offset 8 917.cfi_offset %r12,-32 918 pushq %r13 919.cfi_adjust_cfa_offset 8 920.cfi_offset %r13,-40 921 pushq %r14 922.cfi_adjust_cfa_offset 8 923.cfi_offset %r14,-48 924 pushq %r15 925.cfi_adjust_cfa_offset 8 926.cfi_offset %r15,-56 927 928 vpxor %ymm0,%ymm0,%ymm0 929 vmovapd %ymm0,%ymm3 930 vmovapd %ymm0,%ymm4 931 vmovapd %ymm0,%ymm5 932 vmovapd %ymm0,%ymm6 933 vmovapd %ymm0,%ymm7 934 vmovapd %ymm0,%ymm8 935 vmovapd %ymm0,%ymm9 936 vmovapd %ymm0,%ymm10 937 938 xorl %r9d,%r9d 939 940 movq %rdx,%r11 941 movq $0xfffffffffffff,%rax 942 943 movl $30,%ebx 944 945.align 32 946.Lloop30: 947 movq 0(%r11),%r13 948 949 vpbroadcastq 0(%r11),%ymm1 950 movq 0(%rsi),%rdx 951 mulxq %r13,%r13,%r12 952 addq %r13,%r9 953 movq %r12,%r10 954 adcq $0,%r10 955 956 movq (%r8),%r13 957 imulq %r9,%r13 958 andq %rax,%r13 959 960 vmovq %r13,%xmm2 961 vpbroadcastq %xmm2,%ymm2 962 movq 0(%rcx),%rdx 963 mulxq %r13,%r13,%r12 964 addq %r13,%r9 965 adcq %r12,%r10 966 967 shrq $52,%r9 968 salq $12,%r10 
969 orq %r10,%r9 970 971 leaq -264(%rsp),%rsp 972 973{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3 974{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4 975{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5 976{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6 977{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7 978{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8 979{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9 980{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10 981 982{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3 983{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4 984{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5 985{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6 986{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7 987{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8 988{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9 989{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10 990 991 992 vmovdqu %ymm3,0(%rsp) 993 vmovdqu %ymm4,32(%rsp) 994 vmovdqu %ymm5,64(%rsp) 995 vmovdqu %ymm6,96(%rsp) 996 vmovdqu %ymm7,128(%rsp) 997 vmovdqu %ymm8,160(%rsp) 998 vmovdqu %ymm9,192(%rsp) 999 vmovdqu %ymm10,224(%rsp) 1000 movq $0,256(%rsp) 1001 1002 vmovdqu 8(%rsp),%ymm3 1003 vmovdqu 40(%rsp),%ymm4 1004 vmovdqu 72(%rsp),%ymm5 1005 vmovdqu 104(%rsp),%ymm6 1006 vmovdqu 136(%rsp),%ymm7 1007 vmovdqu 168(%rsp),%ymm8 1008 vmovdqu 200(%rsp),%ymm9 1009 vmovdqu 232(%rsp),%ymm10 1010 1011 addq 8(%rsp),%r9 1012 1013{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3 1014{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4 1015{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5 1016{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6 1017{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7 1018{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8 1019{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9 1020{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10 1021 1022{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3 1023{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4 1024{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5 1025{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6 1026{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7 1027{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8 1028{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9 1029{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10 1030 1031 leaq 264(%rsp),%rsp 1032 leaq 
8(%r11),%r11 1033 decl %ebx 1034 jne .Lloop30 1035 1036 pushq %r11 1037 pushq %rsi 1038 pushq %rcx 1039 pushq %r8 1040 1041 vmovq %r9,%xmm0 1042 vpbroadcastq %xmm0,%ymm0 1043 vpblendd $3,%ymm0,%ymm3,%ymm3 1044 1045 1046 1047 vpsrlq $52,%ymm3,%ymm0 1048 vpsrlq $52,%ymm4,%ymm1 1049 vpsrlq $52,%ymm5,%ymm2 1050 vpsrlq $52,%ymm6,%ymm11 1051 vpsrlq $52,%ymm7,%ymm12 1052 vpsrlq $52,%ymm8,%ymm13 1053 vpsrlq $52,%ymm9,%ymm14 1054 vpsrlq $52,%ymm10,%ymm15 1055 1056 leaq -32(%rsp),%rsp 1057 vmovupd %ymm3,(%rsp) 1058 1059 1060 vpermq $144,%ymm15,%ymm15 1061 vpermq $3,%ymm14,%ymm3 1062 vblendpd $1,%ymm3,%ymm15,%ymm15 1063 1064 vpermq $144,%ymm14,%ymm14 1065 vpermq $3,%ymm13,%ymm3 1066 vblendpd $1,%ymm3,%ymm14,%ymm14 1067 1068 vpermq $144,%ymm13,%ymm13 1069 vpermq $3,%ymm12,%ymm3 1070 vblendpd $1,%ymm3,%ymm13,%ymm13 1071 1072 vpermq $144,%ymm12,%ymm12 1073 vpermq $3,%ymm11,%ymm3 1074 vblendpd $1,%ymm3,%ymm12,%ymm12 1075 1076 vpermq $144,%ymm11,%ymm11 1077 vpermq $3,%ymm2,%ymm3 1078 vblendpd $1,%ymm3,%ymm11,%ymm11 1079 1080 vpermq $144,%ymm2,%ymm2 1081 vpermq $3,%ymm1,%ymm3 1082 vblendpd $1,%ymm3,%ymm2,%ymm2 1083 1084 vpermq $144,%ymm1,%ymm1 1085 vpermq $3,%ymm0,%ymm3 1086 vblendpd $1,%ymm3,%ymm1,%ymm1 1087 1088 vpermq $144,%ymm0,%ymm0 1089 vpand .Lhigh64x3(%rip),%ymm0,%ymm0 1090 1091 vmovupd (%rsp),%ymm3 1092 leaq 32(%rsp),%rsp 1093 1094 1095 vpand .Lmask52x4(%rip),%ymm3,%ymm3 1096 vpand .Lmask52x4(%rip),%ymm4,%ymm4 1097 vpand .Lmask52x4(%rip),%ymm5,%ymm5 1098 vpand .Lmask52x4(%rip),%ymm6,%ymm6 1099 vpand .Lmask52x4(%rip),%ymm7,%ymm7 1100 vpand .Lmask52x4(%rip),%ymm8,%ymm8 1101 vpand .Lmask52x4(%rip),%ymm9,%ymm9 1102 vpand .Lmask52x4(%rip),%ymm10,%ymm10 1103 1104 1105 vpaddq %ymm0,%ymm3,%ymm3 1106 vpaddq %ymm1,%ymm4,%ymm4 1107 vpaddq %ymm2,%ymm5,%ymm5 1108 vpaddq %ymm11,%ymm6,%ymm6 1109 vpaddq %ymm12,%ymm7,%ymm7 1110 vpaddq %ymm13,%ymm8,%ymm8 1111 vpaddq %ymm14,%ymm9,%ymm9 1112 vpaddq %ymm15,%ymm10,%ymm10 1113 1114 1115 1116 vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm0 1117 vpcmpgtq 
.Lmask52x4(%rip),%ymm4,%ymm1 1118 vmovmskpd %ymm0,%r14d 1119 vmovmskpd %ymm1,%r13d 1120 shlb $4,%r13b 1121 orb %r13b,%r14b 1122 1123 vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm2 1124 vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm11 1125 vmovmskpd %ymm2,%r13d 1126 vmovmskpd %ymm11,%r12d 1127 shlb $4,%r12b 1128 orb %r12b,%r13b 1129 1130 vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm12 1131 vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm13 1132 vmovmskpd %ymm12,%r12d 1133 vmovmskpd %ymm13,%r11d 1134 shlb $4,%r11b 1135 orb %r11b,%r12b 1136 1137 vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm14 1138 vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm15 1139 vmovmskpd %ymm14,%r11d 1140 vmovmskpd %ymm15,%r10d 1141 shlb $4,%r10b 1142 orb %r10b,%r11b 1143 1144 addb %r14b,%r14b 1145 adcb %r13b,%r13b 1146 adcb %r12b,%r12b 1147 adcb %r11b,%r11b 1148 1149 1150 vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm0 1151 vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm1 1152 vmovmskpd %ymm0,%r9d 1153 vmovmskpd %ymm1,%r8d 1154 shlb $4,%r8b 1155 orb %r8b,%r9b 1156 1157 vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm2 1158 vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm11 1159 vmovmskpd %ymm2,%r8d 1160 vmovmskpd %ymm11,%edx 1161 shlb $4,%dl 1162 orb %dl,%r8b 1163 1164 vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm12 1165 vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm13 1166 vmovmskpd %ymm12,%edx 1167 vmovmskpd %ymm13,%ecx 1168 shlb $4,%cl 1169 orb %cl,%dl 1170 1171 vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm14 1172 vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm15 1173 vmovmskpd %ymm14,%ecx 1174 vmovmskpd %ymm15,%ebx 1175 shlb $4,%bl 1176 orb %bl,%cl 1177 1178 addb %r9b,%r14b 1179 adcb %r8b,%r13b 1180 adcb %dl,%r12b 1181 adcb %cl,%r11b 1182 1183 xorb %r9b,%r14b 1184 xorb %r8b,%r13b 1185 xorb %dl,%r12b 1186 xorb %cl,%r11b 1187 1188 leaq .Lkmasklut(%rip),%rdx 1189 1190 movb %r14b,%r10b 1191 andq $0xf,%r14 1192 vpsubq .Lmask52x4(%rip),%ymm3,%ymm0 1193 shlq $5,%r14 1194 vmovapd (%rdx,%r14,1),%ymm2 1195 vblendvpd %ymm2,%ymm0,%ymm3,%ymm3 1196 1197 shrb $4,%r10b 1198 andq $0xf,%r10 1199 vpsubq .Lmask52x4(%rip),%ymm4,%ymm0 1200 shlq 
$5,%r10 1201 vmovapd (%rdx,%r10,1),%ymm2 1202 vblendvpd %ymm2,%ymm0,%ymm4,%ymm4 1203 1204 movb %r13b,%r10b 1205 andq $0xf,%r13 1206 vpsubq .Lmask52x4(%rip),%ymm5,%ymm0 1207 shlq $5,%r13 1208 vmovapd (%rdx,%r13,1),%ymm2 1209 vblendvpd %ymm2,%ymm0,%ymm5,%ymm5 1210 1211 shrb $4,%r10b 1212 andq $0xf,%r10 1213 vpsubq .Lmask52x4(%rip),%ymm6,%ymm0 1214 shlq $5,%r10 1215 vmovapd (%rdx,%r10,1),%ymm2 1216 vblendvpd %ymm2,%ymm0,%ymm6,%ymm6 1217 1218 movb %r12b,%r10b 1219 andq $0xf,%r12 1220 vpsubq .Lmask52x4(%rip),%ymm7,%ymm0 1221 shlq $5,%r12 1222 vmovapd (%rdx,%r12,1),%ymm2 1223 vblendvpd %ymm2,%ymm0,%ymm7,%ymm7 1224 1225 shrb $4,%r10b 1226 andq $0xf,%r10 1227 vpsubq .Lmask52x4(%rip),%ymm8,%ymm0 1228 shlq $5,%r10 1229 vmovapd (%rdx,%r10,1),%ymm2 1230 vblendvpd %ymm2,%ymm0,%ymm8,%ymm8 1231 1232 movb %r11b,%r10b 1233 andq $0xf,%r11 1234 vpsubq .Lmask52x4(%rip),%ymm9,%ymm0 1235 shlq $5,%r11 1236 vmovapd (%rdx,%r11,1),%ymm2 1237 vblendvpd %ymm2,%ymm0,%ymm9,%ymm9 1238 1239 shrb $4,%r10b 1240 andq $0xf,%r10 1241 vpsubq .Lmask52x4(%rip),%ymm10,%ymm0 1242 shlq $5,%r10 1243 vmovapd (%rdx,%r10,1),%ymm2 1244 vblendvpd %ymm2,%ymm0,%ymm10,%ymm10 1245 1246 vpand .Lmask52x4(%rip),%ymm3,%ymm3 1247 vpand .Lmask52x4(%rip),%ymm4,%ymm4 1248 vpand .Lmask52x4(%rip),%ymm5,%ymm5 1249 vpand .Lmask52x4(%rip),%ymm6,%ymm6 1250 vpand .Lmask52x4(%rip),%ymm7,%ymm7 1251 vpand .Lmask52x4(%rip),%ymm8,%ymm8 1252 vpand .Lmask52x4(%rip),%ymm9,%ymm9 1253 1254 vpand .Lmask52x4(%rip),%ymm10,%ymm10 1255 popq %r8 1256 popq %rcx 1257 popq %rsi 1258 popq %r11 1259 1260 vmovdqu %ymm3,0(%rdi) 1261 vmovdqu %ymm4,32(%rdi) 1262 vmovdqu %ymm5,64(%rdi) 1263 vmovdqu %ymm6,96(%rdi) 1264 vmovdqu %ymm7,128(%rdi) 1265 vmovdqu %ymm8,160(%rdi) 1266 vmovdqu %ymm9,192(%rdi) 1267 vmovdqu %ymm10,224(%rdi) 1268 1269 xorl %r15d,%r15d 1270 1271 leaq 16(%r11),%r11 1272 movq $0xfffffffffffff,%rax 1273 1274 movl $30,%ebx 1275 1276 vpxor %ymm0,%ymm0,%ymm0 1277 vmovapd %ymm0,%ymm3 1278 vmovapd %ymm0,%ymm4 1279 vmovapd %ymm0,%ymm5 1280 vmovapd 
%ymm0,%ymm6 1281 vmovapd %ymm0,%ymm7 1282 vmovapd %ymm0,%ymm8 1283 vmovapd %ymm0,%ymm9 1284 vmovapd %ymm0,%ymm10 1285.align 32 1286.Lloop40: 1287 movq 0(%r11),%r13 1288 1289 vpbroadcastq 0(%r11),%ymm1 1290 movq 256(%rsi),%rdx 1291 mulxq %r13,%r13,%r12 1292 addq %r13,%r9 1293 movq %r12,%r10 1294 adcq $0,%r10 1295 1296 movq 8(%r8),%r13 1297 imulq %r9,%r13 1298 andq %rax,%r13 1299 1300 vmovq %r13,%xmm2 1301 vpbroadcastq %xmm2,%ymm2 1302 movq 256(%rcx),%rdx 1303 mulxq %r13,%r13,%r12 1304 addq %r13,%r9 1305 adcq %r12,%r10 1306 1307 shrq $52,%r9 1308 salq $12,%r10 1309 orq %r10,%r9 1310 1311 leaq -264(%rsp),%rsp 1312 1313{vex} vpmadd52luq 256(%rsi),%ymm1,%ymm3 1314{vex} vpmadd52luq 288(%rsi),%ymm1,%ymm4 1315{vex} vpmadd52luq 320(%rsi),%ymm1,%ymm5 1316{vex} vpmadd52luq 352(%rsi),%ymm1,%ymm6 1317{vex} vpmadd52luq 384(%rsi),%ymm1,%ymm7 1318{vex} vpmadd52luq 416(%rsi),%ymm1,%ymm8 1319{vex} vpmadd52luq 448(%rsi),%ymm1,%ymm9 1320{vex} vpmadd52luq 480(%rsi),%ymm1,%ymm10 1321 1322{vex} vpmadd52luq 256(%rcx),%ymm2,%ymm3 1323{vex} vpmadd52luq 288(%rcx),%ymm2,%ymm4 1324{vex} vpmadd52luq 320(%rcx),%ymm2,%ymm5 1325{vex} vpmadd52luq 352(%rcx),%ymm2,%ymm6 1326{vex} vpmadd52luq 384(%rcx),%ymm2,%ymm7 1327{vex} vpmadd52luq 416(%rcx),%ymm2,%ymm8 1328{vex} vpmadd52luq 448(%rcx),%ymm2,%ymm9 1329{vex} vpmadd52luq 480(%rcx),%ymm2,%ymm10 1330 1331 1332 vmovdqu %ymm3,0(%rsp) 1333 vmovdqu %ymm4,32(%rsp) 1334 vmovdqu %ymm5,64(%rsp) 1335 vmovdqu %ymm6,96(%rsp) 1336 vmovdqu %ymm7,128(%rsp) 1337 vmovdqu %ymm8,160(%rsp) 1338 vmovdqu %ymm9,192(%rsp) 1339 vmovdqu %ymm10,224(%rsp) 1340 movq $0,256(%rsp) 1341 1342 vmovdqu 8(%rsp),%ymm3 1343 vmovdqu 40(%rsp),%ymm4 1344 vmovdqu 72(%rsp),%ymm5 1345 vmovdqu 104(%rsp),%ymm6 1346 vmovdqu 136(%rsp),%ymm7 1347 vmovdqu 168(%rsp),%ymm8 1348 vmovdqu 200(%rsp),%ymm9 1349 vmovdqu 232(%rsp),%ymm10 1350 1351 addq 8(%rsp),%r9 1352 1353{vex} vpmadd52huq 256(%rsi),%ymm1,%ymm3 1354{vex} vpmadd52huq 288(%rsi),%ymm1,%ymm4 1355{vex} vpmadd52huq 320(%rsi),%ymm1,%ymm5 1356{vex} 
vpmadd52huq 352(%rsi),%ymm1,%ymm6 1357{vex} vpmadd52huq 384(%rsi),%ymm1,%ymm7 1358{vex} vpmadd52huq 416(%rsi),%ymm1,%ymm8 1359{vex} vpmadd52huq 448(%rsi),%ymm1,%ymm9 1360{vex} vpmadd52huq 480(%rsi),%ymm1,%ymm10 1361 1362{vex} vpmadd52huq 256(%rcx),%ymm2,%ymm3 1363{vex} vpmadd52huq 288(%rcx),%ymm2,%ymm4 1364{vex} vpmadd52huq 320(%rcx),%ymm2,%ymm5 1365{vex} vpmadd52huq 352(%rcx),%ymm2,%ymm6 1366{vex} vpmadd52huq 384(%rcx),%ymm2,%ymm7 1367{vex} vpmadd52huq 416(%rcx),%ymm2,%ymm8 1368{vex} vpmadd52huq 448(%rcx),%ymm2,%ymm9 1369{vex} vpmadd52huq 480(%rcx),%ymm2,%ymm10 1370 1371 leaq 264(%rsp),%rsp 1372 leaq 8(%r11),%r11 1373 decl %ebx 1374 jne .Lloop40 1375 1376 vmovq %r9,%xmm0 1377 vpbroadcastq %xmm0,%ymm0 1378 vpblendd $3,%ymm0,%ymm3,%ymm3 1379 1380 1381 1382 vpsrlq $52,%ymm3,%ymm0 1383 vpsrlq $52,%ymm4,%ymm1 1384 vpsrlq $52,%ymm5,%ymm2 1385 vpsrlq $52,%ymm6,%ymm11 1386 vpsrlq $52,%ymm7,%ymm12 1387 vpsrlq $52,%ymm8,%ymm13 1388 vpsrlq $52,%ymm9,%ymm14 1389 vpsrlq $52,%ymm10,%ymm15 1390 1391 leaq -32(%rsp),%rsp 1392 vmovupd %ymm3,(%rsp) 1393 1394 1395 vpermq $144,%ymm15,%ymm15 1396 vpermq $3,%ymm14,%ymm3 1397 vblendpd $1,%ymm3,%ymm15,%ymm15 1398 1399 vpermq $144,%ymm14,%ymm14 1400 vpermq $3,%ymm13,%ymm3 1401 vblendpd $1,%ymm3,%ymm14,%ymm14 1402 1403 vpermq $144,%ymm13,%ymm13 1404 vpermq $3,%ymm12,%ymm3 1405 vblendpd $1,%ymm3,%ymm13,%ymm13 1406 1407 vpermq $144,%ymm12,%ymm12 1408 vpermq $3,%ymm11,%ymm3 1409 vblendpd $1,%ymm3,%ymm12,%ymm12 1410 1411 vpermq $144,%ymm11,%ymm11 1412 vpermq $3,%ymm2,%ymm3 1413 vblendpd $1,%ymm3,%ymm11,%ymm11 1414 1415 vpermq $144,%ymm2,%ymm2 1416 vpermq $3,%ymm1,%ymm3 1417 vblendpd $1,%ymm3,%ymm2,%ymm2 1418 1419 vpermq $144,%ymm1,%ymm1 1420 vpermq $3,%ymm0,%ymm3 1421 vblendpd $1,%ymm3,%ymm1,%ymm1 1422 1423 vpermq $144,%ymm0,%ymm0 1424 vpand .Lhigh64x3(%rip),%ymm0,%ymm0 1425 1426 vmovupd (%rsp),%ymm3 1427 leaq 32(%rsp),%rsp 1428 1429 1430 vpand .Lmask52x4(%rip),%ymm3,%ymm3 1431 vpand .Lmask52x4(%rip),%ymm4,%ymm4 1432 vpand 
.Lmask52x4(%rip),%ymm5,%ymm5 1433 vpand .Lmask52x4(%rip),%ymm6,%ymm6 1434 vpand .Lmask52x4(%rip),%ymm7,%ymm7 1435 vpand .Lmask52x4(%rip),%ymm8,%ymm8 1436 vpand .Lmask52x4(%rip),%ymm9,%ymm9 1437 vpand .Lmask52x4(%rip),%ymm10,%ymm10 1438 1439 1440 vpaddq %ymm0,%ymm3,%ymm3 1441 vpaddq %ymm1,%ymm4,%ymm4 1442 vpaddq %ymm2,%ymm5,%ymm5 1443 vpaddq %ymm11,%ymm6,%ymm6 1444 vpaddq %ymm12,%ymm7,%ymm7 1445 vpaddq %ymm13,%ymm8,%ymm8 1446 vpaddq %ymm14,%ymm9,%ymm9 1447 vpaddq %ymm15,%ymm10,%ymm10 1448 1449 1450 1451 vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm0 1452 vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm1 1453 vmovmskpd %ymm0,%r14d 1454 vmovmskpd %ymm1,%r13d 1455 shlb $4,%r13b 1456 orb %r13b,%r14b 1457 1458 vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm2 1459 vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm11 1460 vmovmskpd %ymm2,%r13d 1461 vmovmskpd %ymm11,%r12d 1462 shlb $4,%r12b 1463 orb %r12b,%r13b 1464 1465 vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm12 1466 vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm13 1467 vmovmskpd %ymm12,%r12d 1468 vmovmskpd %ymm13,%r11d 1469 shlb $4,%r11b 1470 orb %r11b,%r12b 1471 1472 vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm14 1473 vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm15 1474 vmovmskpd %ymm14,%r11d 1475 vmovmskpd %ymm15,%r10d 1476 shlb $4,%r10b 1477 orb %r10b,%r11b 1478 1479 addb %r14b,%r14b 1480 adcb %r13b,%r13b 1481 adcb %r12b,%r12b 1482 adcb %r11b,%r11b 1483 1484 1485 vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm0 1486 vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm1 1487 vmovmskpd %ymm0,%r9d 1488 vmovmskpd %ymm1,%r8d 1489 shlb $4,%r8b 1490 orb %r8b,%r9b 1491 1492 vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm2 1493 vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm11 1494 vmovmskpd %ymm2,%r8d 1495 vmovmskpd %ymm11,%edx 1496 shlb $4,%dl 1497 orb %dl,%r8b 1498 1499 vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm12 1500 vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm13 1501 vmovmskpd %ymm12,%edx 1502 vmovmskpd %ymm13,%ecx 1503 shlb $4,%cl 1504 orb %cl,%dl 1505 1506 vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm14 1507 vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm15 1508 vmovmskpd 
%ymm14,%ecx 1509 vmovmskpd %ymm15,%ebx 1510 shlb $4,%bl 1511 orb %bl,%cl 1512 1513 addb %r9b,%r14b 1514 adcb %r8b,%r13b 1515 adcb %dl,%r12b 1516 adcb %cl,%r11b 1517 1518 xorb %r9b,%r14b 1519 xorb %r8b,%r13b 1520 xorb %dl,%r12b 1521 xorb %cl,%r11b 1522 1523 leaq .Lkmasklut(%rip),%rdx 1524 1525 movb %r14b,%r10b 1526 andq $0xf,%r14 1527 vpsubq .Lmask52x4(%rip),%ymm3,%ymm0 1528 shlq $5,%r14 1529 vmovapd (%rdx,%r14,1),%ymm2 1530 vblendvpd %ymm2,%ymm0,%ymm3,%ymm3 1531 1532 shrb $4,%r10b 1533 andq $0xf,%r10 1534 vpsubq .Lmask52x4(%rip),%ymm4,%ymm0 1535 shlq $5,%r10 1536 vmovapd (%rdx,%r10,1),%ymm2 1537 vblendvpd %ymm2,%ymm0,%ymm4,%ymm4 1538 1539 movb %r13b,%r10b 1540 andq $0xf,%r13 1541 vpsubq .Lmask52x4(%rip),%ymm5,%ymm0 1542 shlq $5,%r13 1543 vmovapd (%rdx,%r13,1),%ymm2 1544 vblendvpd %ymm2,%ymm0,%ymm5,%ymm5 1545 1546 shrb $4,%r10b 1547 andq $0xf,%r10 1548 vpsubq .Lmask52x4(%rip),%ymm6,%ymm0 1549 shlq $5,%r10 1550 vmovapd (%rdx,%r10,1),%ymm2 1551 vblendvpd %ymm2,%ymm0,%ymm6,%ymm6 1552 1553 movb %r12b,%r10b 1554 andq $0xf,%r12 1555 vpsubq .Lmask52x4(%rip),%ymm7,%ymm0 1556 shlq $5,%r12 1557 vmovapd (%rdx,%r12,1),%ymm2 1558 vblendvpd %ymm2,%ymm0,%ymm7,%ymm7 1559 1560 shrb $4,%r10b 1561 andq $0xf,%r10 1562 vpsubq .Lmask52x4(%rip),%ymm8,%ymm0 1563 shlq $5,%r10 1564 vmovapd (%rdx,%r10,1),%ymm2 1565 vblendvpd %ymm2,%ymm0,%ymm8,%ymm8 1566 1567 movb %r11b,%r10b 1568 andq $0xf,%r11 1569 vpsubq .Lmask52x4(%rip),%ymm9,%ymm0 1570 shlq $5,%r11 1571 vmovapd (%rdx,%r11,1),%ymm2 1572 vblendvpd %ymm2,%ymm0,%ymm9,%ymm9 1573 1574 shrb $4,%r10b 1575 andq $0xf,%r10 1576 vpsubq .Lmask52x4(%rip),%ymm10,%ymm0 1577 shlq $5,%r10 1578 vmovapd (%rdx,%r10,1),%ymm2 1579 vblendvpd %ymm2,%ymm0,%ymm10,%ymm10 1580 1581 vpand .Lmask52x4(%rip),%ymm3,%ymm3 1582 vpand .Lmask52x4(%rip),%ymm4,%ymm4 1583 vpand .Lmask52x4(%rip),%ymm5,%ymm5 1584 vpand .Lmask52x4(%rip),%ymm6,%ymm6 1585 vpand .Lmask52x4(%rip),%ymm7,%ymm7 1586 vpand .Lmask52x4(%rip),%ymm8,%ymm8 1587 vpand .Lmask52x4(%rip),%ymm9,%ymm9 1588 1589 vpand 
.Lmask52x4(%rip),%ymm10,%ymm10
/* The operand line above completes the vpand whose mnemonic ends the preceding line. */

/* Tail of ossl_rsaz_amm52x30_x2_avxifma256: store ymm3..ymm10 at out+256..out+511. */
	vmovdqu	%ymm3,256(%rdi)
	vmovdqu	%ymm4,288(%rdi)
	vmovdqu	%ymm5,320(%rdi)
	vmovdqu	%ymm6,352(%rdi)
	vmovdqu	%ymm7,384(%rdi)
	vmovdqu	%ymm8,416(%rdi)
	vmovdqu	%ymm9,448(%rdi)
	vmovdqu	%ymm10,480(%rdi)

	vzeroupper			# clear upper ymm halves before returning
	leaq	(%rsp),%rax		# rax = base of the six saved-register slots
.cfi_def_cfa_register	%rax
	movq	0(%rax),%r15
.cfi_restore	%r15
	movq	8(%rax),%r14
.cfi_restore	%r14
	movq	16(%rax),%r13
.cfi_restore	%r13
	movq	24(%rax),%r12
.cfi_restore	%r12
	movq	32(%rax),%rbp
.cfi_restore	%rbp
	movq	40(%rax),%rbx
.cfi_restore	%rbx
	leaq	48(%rax),%rsp		# drop the 6 x 8 bytes of saved registers
.cfi_def_cfa	%rsp,8
.Lossl_rsaz_amm52x30_x2_avxifma256_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	ossl_rsaz_amm52x30_x2_avxifma256, .-ossl_rsaz_amm52x30_x2_avxifma256
.text

/*
 * ossl_extract_multiplier_2x30_win5_avx
 *
 * Copies two 256-byte halves out of a table of 32 entries, 512 bytes each,
 * by scanning the whole table twice and blending in the entry whose running
 * counter equals the requested index.  Every entry is loaded on every call
 * and there is no index-dependent branch, so the memory access pattern does
 * not depend on the index values.
 *
 * In (System V AMD64 argument registers):
 *   rdi = destination, 512 bytes are written
 *   rsi = table base, 16384 bytes are read
 *   rdx = index selecting bytes 0..255 of the destination
 *   rcx = index selecting bytes 256..511 of the destination
 * Out:  nothing in registers; a destination half is all zero when its index
 *       matches no counter value in 0..31.
 * Clobbers: rax, rsi, flags, ymm0-ymm13.
 *
 * NOTE(review): the indices are compared as full 64-bit lanes, so the upper
 * 32 bits of rdx/rcx must be clean -- confirm against the C prototype.
 *
 * Change versus the generated original: a vzeroupper is issued before the
 * return, matching ossl_rsaz_amm52x30_x2_avxifma256 above, so callers
 * running legacy SSE code do not see dirty upper ymm state.  The redundant
 * "vmovapd %ymm0,%ymm0" in the second accumulator reset is dropped.  The
 * file header says this file is generated from rsaz-3k-avxifma.pl, so the
 * same change belongs in that generator.
 */
.align	32
.globl	ossl_extract_multiplier_2x30_win5_avx
.type	ossl_extract_multiplier_2x30_win5_avx,@function
ossl_extract_multiplier_2x30_win5_avx:
.cfi_startproc
.byte	243,15,30,250			# endbr64
	vmovapd	.Lones(%rip),%ymm12	# ymm12 = {1,1,1,1}, counter increment
	vmovq	%rdx,%xmm8
	vpbroadcastq	%xmm8,%ymm10	# ymm10 = first index in every lane
	vmovq	%rcx,%xmm8
	vpbroadcastq	%xmm8,%ymm11	# ymm11 = second index in every lane
	leaq	16384(%rsi),%rax	# rax = end of table (32 entries x 512 bytes)

/* Pass 1: accumulators ymm0..ymm7 and counter ymm9 start at zero. */
	vpxor	%xmm0,%xmm0,%xmm0
	vmovapd	%ymm0,%ymm9
	vmovapd	%ymm0,%ymm1
	vmovapd	%ymm0,%ymm2
	vmovapd	%ymm0,%ymm3
	vmovapd	%ymm0,%ymm4
	vmovapd	%ymm0,%ymm5
	vmovapd	%ymm0,%ymm6
	vmovapd	%ymm0,%ymm7

.align	32
.Lloop:
	vpcmpeqq	%ymm9,%ymm10,%ymm13	# mask = (counter == first index)
	vmovdqu	0(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm0,%ymm0	# take table data where mask is set
	vmovdqu	32(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm1,%ymm1
	vmovdqu	64(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm2,%ymm2
	vmovdqu	96(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm3,%ymm3
	vmovdqu	128(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm4,%ymm4
	vmovdqu	160(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm5,%ymm5
	vmovdqu	192(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm6,%ymm6
	vmovdqu	224(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm7,%ymm7
	vpaddq	%ymm12,%ymm9,%ymm9	# counter += 1 in every lane
	addq	$512,%rsi		# advance to the next table entry
	cmpq	%rsi,%rax
	jne	.Lloop

	vmovdqu	%ymm0,0(%rdi)
	vmovdqu	%ymm1,32(%rdi)
	vmovdqu	%ymm2,64(%rdi)
	vmovdqu	%ymm3,96(%rdi)
	vmovdqu	%ymm4,128(%rdi)
	vmovdqu	%ymm5,160(%rdi)
	vmovdqu	%ymm6,192(%rdi)
	vmovdqu	%ymm7,224(%rdi)
	leaq	-16384(%rax),%rsi	# rewind rsi to the table base

/* Pass 2: same scan against the second index, entry bytes 256..511. */
	vpxor	%xmm0,%xmm0,%xmm0
	vmovapd	%ymm0,%ymm9
	vmovapd	%ymm0,%ymm1
	vmovapd	%ymm0,%ymm2
	vmovapd	%ymm0,%ymm3
	vmovapd	%ymm0,%ymm4
	vmovapd	%ymm0,%ymm5
	vmovapd	%ymm0,%ymm6
	vmovapd	%ymm0,%ymm7

.align	32
.Lloop_8_15:
	vpcmpeqq	%ymm9,%ymm11,%ymm13	# mask = (counter == second index)
	vmovdqu	256(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm0,%ymm0
	vmovdqu	288(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm1,%ymm1
	vmovdqu	320(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm2,%ymm2
	vmovdqu	352(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm3,%ymm3
	vmovdqu	384(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm4,%ymm4
	vmovdqu	416(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm5,%ymm5
	vmovdqu	448(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm6,%ymm6
	vmovdqu	480(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm7,%ymm7
	vpaddq	%ymm12,%ymm9,%ymm9
	addq	$512,%rsi
	cmpq	%rsi,%rax
	jne	.Lloop_8_15

	vmovdqu	%ymm0,256(%rdi)
	vmovdqu	%ymm1,288(%rdi)
	vmovdqu	%ymm2,320(%rdi)
	vmovdqu	%ymm3,352(%rdi)
	vmovdqu	%ymm4,384(%rdi)
	vmovdqu	%ymm5,416(%rdi)
	vmovdqu	%ymm6,448(%rdi)
	vmovdqu	%ymm7,480(%rdi)

	vzeroupper			# results are in memory; leave clean upper ymm state
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	ossl_extract_multiplier_2x30_win5_avx, .-ossl_extract_multiplier_2x30_win5_avx

/* Constants; 32-byte aligned because .Lones is loaded with vmovapd. */
.section	.rodata
.align	32
.Lones:
.quad	1,1,1,1
/* .Lzeros is not referenced by the code visible in this part of the file. */
.Lzeros:
.quad	0,0,0,0

/*
 * ELF property note.  Type 5 is NT_GNU_PROPERTY_TYPE_0; property 0xc0000002
 * is GNU_PROPERTY_X86_FEATURE_1_AND; value 3 sets the IBT and SHSTK bits.
 */
	.section ".note.gnu.property", "a"
	.p2align 3
	.long 1f - 0f
	.long 4f - 1f
	.long 5
0:
	# "GNU" encoded with .byte, since .asciz isn't supported
	# on Solaris.
	.byte 0x47
	.byte 0x4e
	.byte 0x55
	.byte 0
1:
	.p2align 3
	.long 0xc0000002
	.long 3f - 2f
2:
	.long 3
3:
	.p2align 3
4: