/* Do not modify. This file is auto-generated from rsaz-4k-avx512.pl. */
/*
 * Syntax: GNU as, AT&T operand order (sources first, destination last).
 *
 * All three entry points read their arguments from %rdi, %rsi, %rdx, %rcx
 * (and %r8 for the two multiply routines).
 *
 * The code uses mulx, vpmadd52luq/vpmadd52huq, EVEX-encoded ymm operations
 * with opmask registers, and kmovb.  NOTE(review): callers are presumably
 * expected to verify CPU support before dispatching here - confirm at the
 * call sites.
 *
 * Data layout used throughout: big numbers are arrays of 64-bit words of
 * which only the low 52 bits are kept after normalization (see .Lmask52x4).
 */
.text

/*
 * ossl_rsaz_amm52x40_x1_ifma256
 *
 * One multiply-and-reduce pass over 40 digits in 52-bit radix.
 * NOTE(review): the name suggests "almost Montgomery multiplication";
 * confirm against the generator script.
 *
 * In:    %rdi = result, 40 qwords written
 *        %rsi = operand a, 40 qwords read
 *        %rdx = operand b, 40 qwords, one consumed per step
 *        %rcx = 40-qword operand that is multiplied by the per-step
 *               reduction factor (presumably the modulus - confirm)
 *        %r8  = 64-bit constant; (%r8 * accumulator) & (2^52 - 1) gives the
 *               per-step reduction factor
 * Saved: %rbx, %rbp, %r12-%r15 are pushed on entry and reloaded on exit.
 * Clobb: %rax, %rcx, %rdx, %r8-%r11, %ymm0-%ymm12, %ymm23-%ymm29, %k1-%k7,
 *        flags.
 *
 * Register roles inside the loop:
 *        %r11 = cursor into b            %ebx = loop counter
 *        %rax = 2^52 - 1                 %r9  = scalar carry of digit 0
 *        %ymm1 = broadcast b digit       %ymm2 = broadcast reduction factor
 *        %ymm3-%ymm12 = 40 accumulator lanes (4 per register)
 *        %ymm0 = zero, shifted into the top lane by valignq
 */
.globl	ossl_rsaz_amm52x40_x1_ifma256
.type	ossl_rsaz_amm52x40_x1_ifma256,@function
.align	32
ossl_rsaz_amm52x40_x1_ifma256:
.cfi_startproc
/* f3 0f 1e fa = endbr64, emitted as bytes */
.byte	243,15,30,250
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56

	/* Clear the zero register and all ten accumulator registers. */
	vpxord	%ymm0,%ymm0,%ymm0
	vmovdqa64	%ymm0,%ymm3
	vmovdqa64	%ymm0,%ymm4
	vmovdqa64	%ymm0,%ymm5
	vmovdqa64	%ymm0,%ymm6
	vmovdqa64	%ymm0,%ymm7
	vmovdqa64	%ymm0,%ymm8
	vmovdqa64	%ymm0,%ymm9
	vmovdqa64	%ymm0,%ymm10
	vmovdqa64	%ymm0,%ymm11
	vmovdqa64	%ymm0,%ymm12

	/* Scalar carry of digit 0 starts at zero. */
	xorl	%r9d,%r9d

	/* %rdx is needed as the implicit mulx operand, so b moves to %r11. */
	movq	%rdx,%r11
	movq	$0xfffffffffffff,%rax

	/* 10 iterations x 4 unrolled steps = 40 digits of b. */
	movl	$10,%ebx

.align	32
.Lloop10:
	/* ---- step 1 of 4: digit at 0(%r11) ---- */
	movq	0(%r11),%r13

	/* %r10:%r9 = %r9 + a[0] * b digit (128-bit) */
	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	/* reduction factor = (%r8 * %r9) & (2^52 - 1) */
	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	/* %r10:%r9 += 0(%rcx) * reduction factor */
	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	/* %r9 = (%r10:%r9) >> 52, the carry into the next digit */
	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	/* lanes += low 52 bits of a[j] * b digit */
	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm4
	vpmadd52luq	64(%rsi),%ymm1,%ymm5
	vpmadd52luq	96(%rsi),%ymm1,%ymm6
	vpmadd52luq	128(%rsi),%ymm1,%ymm7
	vpmadd52luq	160(%rsi),%ymm1,%ymm8
	vpmadd52luq	192(%rsi),%ymm1,%ymm9
	vpmadd52luq	224(%rsi),%ymm1,%ymm10
	vpmadd52luq	256(%rsi),%ymm1,%ymm11
	vpmadd52luq	288(%rsi),%ymm1,%ymm12

	/* lanes += low 52 bits of (%rcx)[j] * reduction factor */
	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm4
	vpmadd52luq	64(%rcx),%ymm2,%ymm5
	vpmadd52luq	96(%rcx),%ymm2,%ymm6
	vpmadd52luq	128(%rcx),%ymm2,%ymm7
	vpmadd52luq	160(%rcx),%ymm2,%ymm8
	vpmadd52luq	192(%rcx),%ymm2,%ymm9
	vpmadd52luq	224(%rcx),%ymm2,%ymm10
	vpmadd52luq	256(%rcx),%ymm2,%ymm11
	vpmadd52luq	288(%rcx),%ymm2,%ymm12

	/* Shift the 40-lane accumulator down by one lane; zero enters on top. */
	valignq	$1,%ymm3,%ymm4,%ymm3
	valignq	$1,%ymm4,%ymm5,%ymm4
	valignq	$1,%ymm5,%ymm6,%ymm5
	valignq	$1,%ymm6,%ymm7,%ymm6
	valignq	$1,%ymm7,%ymm8,%ymm7
	valignq	$1,%ymm8,%ymm9,%ymm8
	valignq	$1,%ymm9,%ymm10,%ymm9
	valignq	$1,%ymm10,%ymm11,%ymm10
	valignq	$1,%ymm11,%ymm12,%ymm11
	valignq	$1,%ymm12,%ymm0,%ymm12

	/* Fold the new lane 0 into the scalar carry. */
	vmovq	%xmm3,%r13
	addq	%r13,%r9

	/* lanes += high 52 bits of a[j] * b digit */
	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm4
	vpmadd52huq	64(%rsi),%ymm1,%ymm5
	vpmadd52huq	96(%rsi),%ymm1,%ymm6
	vpmadd52huq	128(%rsi),%ymm1,%ymm7
	vpmadd52huq	160(%rsi),%ymm1,%ymm8
	vpmadd52huq	192(%rsi),%ymm1,%ymm9
	vpmadd52huq	224(%rsi),%ymm1,%ymm10
	vpmadd52huq	256(%rsi),%ymm1,%ymm11
	vpmadd52huq	288(%rsi),%ymm1,%ymm12

	/* lanes += high 52 bits of (%rcx)[j] * reduction factor */
	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm4
	vpmadd52huq	64(%rcx),%ymm2,%ymm5
	vpmadd52huq	96(%rcx),%ymm2,%ymm6
	vpmadd52huq	128(%rcx),%ymm2,%ymm7
	vpmadd52huq	160(%rcx),%ymm2,%ymm8
	vpmadd52huq	192(%rcx),%ymm2,%ymm9
	vpmadd52huq	224(%rcx),%ymm2,%ymm10
	vpmadd52huq	256(%rcx),%ymm2,%ymm11
	vpmadd52huq	288(%rcx),%ymm2,%ymm12

	/* ---- step 2 of 4: digit at 8(%r11), same sequence as step 1 ---- */
	movq	8(%r11),%r13

	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm4
	vpmadd52luq	64(%rsi),%ymm1,%ymm5
	vpmadd52luq	96(%rsi),%ymm1,%ymm6
	vpmadd52luq	128(%rsi),%ymm1,%ymm7
	vpmadd52luq	160(%rsi),%ymm1,%ymm8
	vpmadd52luq	192(%rsi),%ymm1,%ymm9
	vpmadd52luq	224(%rsi),%ymm1,%ymm10
	vpmadd52luq	256(%rsi),%ymm1,%ymm11
	vpmadd52luq	288(%rsi),%ymm1,%ymm12

	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm4
	vpmadd52luq	64(%rcx),%ymm2,%ymm5
	vpmadd52luq	96(%rcx),%ymm2,%ymm6
	vpmadd52luq	128(%rcx),%ymm2,%ymm7
	vpmadd52luq	160(%rcx),%ymm2,%ymm8
	vpmadd52luq	192(%rcx),%ymm2,%ymm9
	vpmadd52luq	224(%rcx),%ymm2,%ymm10
	vpmadd52luq	256(%rcx),%ymm2,%ymm11
	vpmadd52luq	288(%rcx),%ymm2,%ymm12

	valignq	$1,%ymm3,%ymm4,%ymm3
	valignq	$1,%ymm4,%ymm5,%ymm4
	valignq	$1,%ymm5,%ymm6,%ymm5
	valignq	$1,%ymm6,%ymm7,%ymm6
	valignq	$1,%ymm7,%ymm8,%ymm7
	valignq	$1,%ymm8,%ymm9,%ymm8
	valignq	$1,%ymm9,%ymm10,%ymm9
	valignq	$1,%ymm10,%ymm11,%ymm10
	valignq	$1,%ymm11,%ymm12,%ymm11
	valignq	$1,%ymm12,%ymm0,%ymm12

	vmovq	%xmm3,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm4
	vpmadd52huq	64(%rsi),%ymm1,%ymm5
	vpmadd52huq	96(%rsi),%ymm1,%ymm6
	vpmadd52huq	128(%rsi),%ymm1,%ymm7
	vpmadd52huq	160(%rsi),%ymm1,%ymm8
	vpmadd52huq	192(%rsi),%ymm1,%ymm9
	vpmadd52huq	224(%rsi),%ymm1,%ymm10
	vpmadd52huq	256(%rsi),%ymm1,%ymm11
	vpmadd52huq	288(%rsi),%ymm1,%ymm12

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm4
	vpmadd52huq	64(%rcx),%ymm2,%ymm5
	vpmadd52huq	96(%rcx),%ymm2,%ymm6
	vpmadd52huq	128(%rcx),%ymm2,%ymm7
	vpmadd52huq	160(%rcx),%ymm2,%ymm8
	vpmadd52huq	192(%rcx),%ymm2,%ymm9
	vpmadd52huq	224(%rcx),%ymm2,%ymm10
	vpmadd52huq	256(%rcx),%ymm2,%ymm11
	vpmadd52huq	288(%rcx),%ymm2,%ymm12

	/* ---- step 3 of 4: digit at 16(%r11), same sequence as step 1 ---- */
	movq	16(%r11),%r13

	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm4
	vpmadd52luq	64(%rsi),%ymm1,%ymm5
	vpmadd52luq	96(%rsi),%ymm1,%ymm6
	vpmadd52luq	128(%rsi),%ymm1,%ymm7
	vpmadd52luq	160(%rsi),%ymm1,%ymm8
	vpmadd52luq	192(%rsi),%ymm1,%ymm9
	vpmadd52luq	224(%rsi),%ymm1,%ymm10
	vpmadd52luq	256(%rsi),%ymm1,%ymm11
	vpmadd52luq	288(%rsi),%ymm1,%ymm12

	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm4
	vpmadd52luq	64(%rcx),%ymm2,%ymm5
	vpmadd52luq	96(%rcx),%ymm2,%ymm6
	vpmadd52luq	128(%rcx),%ymm2,%ymm7
	vpmadd52luq	160(%rcx),%ymm2,%ymm8
	vpmadd52luq	192(%rcx),%ymm2,%ymm9
	vpmadd52luq	224(%rcx),%ymm2,%ymm10
	vpmadd52luq	256(%rcx),%ymm2,%ymm11
	vpmadd52luq	288(%rcx),%ymm2,%ymm12

	valignq	$1,%ymm3,%ymm4,%ymm3
	valignq	$1,%ymm4,%ymm5,%ymm4
	valignq	$1,%ymm5,%ymm6,%ymm5
	valignq	$1,%ymm6,%ymm7,%ymm6
	valignq	$1,%ymm7,%ymm8,%ymm7
	valignq	$1,%ymm8,%ymm9,%ymm8
	valignq	$1,%ymm9,%ymm10,%ymm9
	valignq	$1,%ymm10,%ymm11,%ymm10
	valignq	$1,%ymm11,%ymm12,%ymm11
	valignq	$1,%ymm12,%ymm0,%ymm12

	vmovq	%xmm3,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm4
	vpmadd52huq	64(%rsi),%ymm1,%ymm5
	vpmadd52huq	96(%rsi),%ymm1,%ymm6
	vpmadd52huq	128(%rsi),%ymm1,%ymm7
	vpmadd52huq	160(%rsi),%ymm1,%ymm8
	vpmadd52huq	192(%rsi),%ymm1,%ymm9
	vpmadd52huq	224(%rsi),%ymm1,%ymm10
	vpmadd52huq	256(%rsi),%ymm1,%ymm11
	vpmadd52huq	288(%rsi),%ymm1,%ymm12

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm4
	vpmadd52huq	64(%rcx),%ymm2,%ymm5
	vpmadd52huq	96(%rcx),%ymm2,%ymm6
	vpmadd52huq	128(%rcx),%ymm2,%ymm7
	vpmadd52huq	160(%rcx),%ymm2,%ymm8
	vpmadd52huq	192(%rcx),%ymm2,%ymm9
	vpmadd52huq	224(%rcx),%ymm2,%ymm10
	vpmadd52huq	256(%rcx),%ymm2,%ymm11
	vpmadd52huq	288(%rcx),%ymm2,%ymm12

	/* ---- step 4 of 4: digit at 24(%r11), same sequence as step 1 ---- */
	movq	24(%r11),%r13

	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm4
	vpmadd52luq	64(%rsi),%ymm1,%ymm5
	vpmadd52luq	96(%rsi),%ymm1,%ymm6
	vpmadd52luq	128(%rsi),%ymm1,%ymm7
	vpmadd52luq	160(%rsi),%ymm1,%ymm8
	vpmadd52luq	192(%rsi),%ymm1,%ymm9
	vpmadd52luq	224(%rsi),%ymm1,%ymm10
	vpmadd52luq	256(%rsi),%ymm1,%ymm11
	vpmadd52luq	288(%rsi),%ymm1,%ymm12

	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm4
	vpmadd52luq	64(%rcx),%ymm2,%ymm5
	vpmadd52luq	96(%rcx),%ymm2,%ymm6
	vpmadd52luq	128(%rcx),%ymm2,%ymm7
	vpmadd52luq	160(%rcx),%ymm2,%ymm8
	vpmadd52luq	192(%rcx),%ymm2,%ymm9
	vpmadd52luq	224(%rcx),%ymm2,%ymm10
	vpmadd52luq	256(%rcx),%ymm2,%ymm11
	vpmadd52luq	288(%rcx),%ymm2,%ymm12

	valignq	$1,%ymm3,%ymm4,%ymm3
	valignq	$1,%ymm4,%ymm5,%ymm4
	valignq	$1,%ymm5,%ymm6,%ymm5
	valignq	$1,%ymm6,%ymm7,%ymm6
	valignq	$1,%ymm7,%ymm8,%ymm7
	valignq	$1,%ymm8,%ymm9,%ymm8
	valignq	$1,%ymm9,%ymm10,%ymm9
	valignq	$1,%ymm10,%ymm11,%ymm10
	valignq	$1,%ymm11,%ymm12,%ymm11
	valignq	$1,%ymm12,%ymm0,%ymm12

	vmovq	%xmm3,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm4
	vpmadd52huq	64(%rsi),%ymm1,%ymm5
	vpmadd52huq	96(%rsi),%ymm1,%ymm6
	vpmadd52huq	128(%rsi),%ymm1,%ymm7
	vpmadd52huq	160(%rsi),%ymm1,%ymm8
	vpmadd52huq	192(%rsi),%ymm1,%ymm9
	vpmadd52huq	224(%rsi),%ymm1,%ymm10
	vpmadd52huq	256(%rsi),%ymm1,%ymm11
	vpmadd52huq	288(%rsi),%ymm1,%ymm12

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm4
	vpmadd52huq	64(%rcx),%ymm2,%ymm5
	vpmadd52huq	96(%rcx),%ymm2,%ymm6
	vpmadd52huq	128(%rcx),%ymm2,%ymm7
	vpmadd52huq	160(%rcx),%ymm2,%ymm8
	vpmadd52huq	192(%rcx),%ymm2,%ymm9
	vpmadd52huq	224(%rcx),%ymm2,%ymm10
	vpmadd52huq	256(%rcx),%ymm2,%ymm11
	vpmadd52huq	288(%rcx),%ymm2,%ymm12

	/* Advance past the four digits just consumed. */
	leaq	32(%r11),%r11
	decl	%ebx
	jne	.Lloop10

	/* Replace lane 0 (dwords 0 and 1) with the scalar carry from %r9. */
	vpbroadcastq	%r9,%ymm0
	vpblendd	$3,%ymm0,%ymm3,%ymm3

	/* Normalization: extract the bits above bit 51 of every lane. */
	vpsrlq	$52,%ymm3,%ymm0
	vpsrlq	$52,%ymm4,%ymm1
	vpsrlq	$52,%ymm5,%ymm2
	vpsrlq	$52,%ymm6,%ymm23
	vpsrlq	$52,%ymm7,%ymm24
	vpsrlq	$52,%ymm8,%ymm25
	vpsrlq	$52,%ymm9,%ymm26
	vpsrlq	$52,%ymm10,%ymm27
	vpsrlq	$52,%ymm11,%ymm28
	vpsrlq	$52,%ymm12,%ymm29

	/* Move each carry up by one lane; zero enters at lane 0. */
	valignq	$3,%ymm28,%ymm29,%ymm29
	valignq	$3,%ymm27,%ymm28,%ymm28
	valignq	$3,%ymm26,%ymm27,%ymm27
	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm24,%ymm25,%ymm25
	valignq	$3,%ymm23,%ymm24,%ymm24
	valignq	$3,%ymm2,%ymm23,%ymm23
	valignq	$3,%ymm1,%ymm2,%ymm2
	valignq	$3,%ymm0,%ymm1,%ymm1
	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0

	/* Keep the low 52 bits of every lane. */
	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
	vpandq	.Lmask52x4(%rip),%ymm5,%ymm5
	vpandq	.Lmask52x4(%rip),%ymm6,%ymm6
	vpandq	.Lmask52x4(%rip),%ymm7,%ymm7
	vpandq	.Lmask52x4(%rip),%ymm8,%ymm8
	vpandq	.Lmask52x4(%rip),%ymm9,%ymm9
	vpandq	.Lmask52x4(%rip),%ymm10,%ymm10
	vpandq	.Lmask52x4(%rip),%ymm11,%ymm11
	vpandq	.Lmask52x4(%rip),%ymm12,%ymm12

	/* Add the shifted carries. */
	vpaddq	%ymm0,%ymm3,%ymm3
	vpaddq	%ymm1,%ymm4,%ymm4
	vpaddq	%ymm2,%ymm5,%ymm5
	vpaddq	%ymm23,%ymm6,%ymm6
	vpaddq	%ymm24,%ymm7,%ymm7
	vpaddq	%ymm25,%ymm8,%ymm8
	vpaddq	%ymm26,%ymm9,%ymm9
	vpaddq	%ymm27,%ymm10,%ymm10
	vpaddq	%ymm28,%ymm11,%ymm11
	vpaddq	%ymm29,%ymm12,%ymm12

	/*
	 * Predicate 6 = unsigned greater-than: one bit per lane that now
	 * exceeds 2^52 - 1.  Two 4-bit masks are packed into each byte, giving
	 * a 40-bit mask in %r14b, %r13b, %r12b, %r11b, %r10b (low to high).
	 */
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm3,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm4,%k2
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	shlb	$4,%r13b
	orb	%r13b,%r14b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm5,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm6,%k2
	kmovb	%k1,%r13d
	kmovb	%k2,%r12d
	shlb	$4,%r12b
	orb	%r12b,%r13b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm7,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm8,%k2
	kmovb	%k1,%r12d
	kmovb	%k2,%r11d
	shlb	$4,%r11b
	orb	%r11b,%r12b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm9,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm10,%k2
	kmovb	%k1,%r11d
	kmovb	%k2,%r10d
	shlb	$4,%r10b
	orb	%r10b,%r11b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm11,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm12,%k2
	kmovb	%k1,%r10d
	kmovb	%k2,%r9d
	shlb	$4,%r9b
	orb	%r9b,%r10b

	/* Shift the 40-bit mask left by one: a carry lands on the next lane. */
	addb	%r14b,%r14b
	adcb	%r13b,%r13b
	adcb	%r12b,%r12b
	adcb	%r11b,%r11b
	adcb	%r10b,%r10b

	/*
	 * Predicate 0 = equal: one bit per lane that is exactly 2^52 - 1, i.e.
	 * a lane that passes an incoming carry on.  Packed the same way into
	 * %r9b, %r8b, %dl, %cl, %bl.
	 */
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm3,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm4,%k2
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	shlb	$4,%r8b
	orb	%r8b,%r9b

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm5,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm6,%k2
	kmovb	%k1,%r8d
	kmovb	%k2,%edx
	shlb	$4,%dl
	orb	%dl,%r8b

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm7,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm8,%k2
	kmovb	%k1,%edx
	kmovb	%k2,%ecx
	shlb	$4,%cl
	orb	%cl,%dl

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm9,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm10,%k2
	kmovb	%k1,%ecx
	kmovb	%k2,%ebx
	shlb	$4,%bl
	orb	%bl,%cl

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm11,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm12,%k2
	kmovb	%k1,%ebx
	kmovb	%k2,%eax
	shlb	$4,%al
	orb	%al,%bl

	/* Ripple the carries through the equal-lanes with one add chain ... */
	addb	%r9b,%r14b
	adcb	%r8b,%r13b
	adcb	%dl,%r12b
	adcb	%cl,%r11b
	adcb	%bl,%r10b

	/* ... and xor with the equal mask to get the lanes that take a carry. */
	xorb	%r9b,%r14b
	xorb	%r8b,%r13b
	xorb	%dl,%r12b
	xorb	%cl,%r11b
	xorb	%bl,%r10b

	/* Low nibble masks the even register, high nibble the odd one. */
	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r13d,%k3
	shrb	$4,%r13b
	kmovb	%r13d,%k4
	kmovb	%r12d,%k5
	shrb	$4,%r12b
	kmovb	%r12d,%k6
	kmovb	%r11d,%k7

	/* Subtracting 2^52 - 1 and masking to 52 bits adds 1 modulo 2^52. */
	vpsubq	.Lmask52x4(%rip),%ymm3,%ymm3{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm4,%ymm4{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm5,%ymm5{%k3}
	vpsubq	.Lmask52x4(%rip),%ymm6,%ymm6{%k4}
	vpsubq	.Lmask52x4(%rip),%ymm7,%ymm7{%k5}
	vpsubq	.Lmask52x4(%rip),%ymm8,%ymm8{%k6}
	vpsubq	.Lmask52x4(%rip),%ymm9,%ymm9{%k7}

	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
	vpandq	.Lmask52x4(%rip),%ymm5,%ymm5
	vpandq	.Lmask52x4(%rip),%ymm6,%ymm6
	vpandq	.Lmask52x4(%rip),%ymm7,%ymm7
	vpandq	.Lmask52x4(%rip),%ymm8,%ymm8
	vpandq	.Lmask52x4(%rip),%ymm9,%ymm9

	/* Only seven opmask registers exist, so the last three reuse k1-k3. */
	shrb	$4,%r11b
	kmovb	%r11d,%k1
	kmovb	%r10d,%k2
	shrb	$4,%r10b
	kmovb	%r10d,%k3

	vpsubq	.Lmask52x4(%rip),%ymm10,%ymm10{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm11,%ymm11{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm12,%ymm12{%k3}

	vpandq	.Lmask52x4(%rip),%ymm10,%ymm10
	vpandq	.Lmask52x4(%rip),%ymm11,%ymm11
	vpandq	.Lmask52x4(%rip),%ymm12,%ymm12

	/* Store the 40 result digits. */
	vmovdqu64	%ymm3,0(%rdi)
	vmovdqu64	%ymm4,32(%rdi)
	vmovdqu64	%ymm5,64(%rdi)
	vmovdqu64	%ymm6,96(%rdi)
	vmovdqu64	%ymm7,128(%rdi)
	vmovdqu64	%ymm8,160(%rdi)
	vmovdqu64	%ymm9,192(%rdi)
	vmovdqu64	%ymm10,224(%rdi)
	vmovdqu64	%ymm11,256(%rdi)
	vmovdqu64	%ymm12,288(%rdi)

	vzeroupper
	/* Reload the six saved registers in reverse push order, drop 48 bytes. */
	leaq	(%rsp),%rax
.cfi_def_cfa_register	%rax
	movq	0(%rax),%r15
.cfi_restore	%r15
	movq	8(%rax),%r14
.cfi_restore	%r14
	movq	16(%rax),%r13
.cfi_restore	%r13
	movq	24(%rax),%r12
.cfi_restore	%r12
	movq	32(%rax),%rbp
.cfi_restore	%rbp
	movq	40(%rax),%rbx
.cfi_restore	%rbx
	leaq	48(%rax),%rsp
.cfi_def_cfa	%rsp,8
.Lossl_rsaz_amm52x40_x1_ifma256_epilogue:

	/* f3 c3 = rep ret, emitted as bytes */
	.byte	0xf3,0xc3
.cfi_endproc
.size	ossl_rsaz_amm52x40_x1_ifma256, .-ossl_rsaz_amm52x40_x1_ifma256

/* Four lanes of 2^52 - 1; used as AND mask, compare bound and subtrahend. */
.section	.rodata
.align	32
.Lmask52x4:
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.text

/*
 * ossl_rsaz_amm52x40_x2_ifma256
 *
 * Two independent 40-digit multiply-and-reduce passes interleaved in one
 * loop.  The second pass uses the data 320 bytes (40 qwords) after the
 * first in every array.
 *
 * In:    %rdi = result, 80 qwords written (two 40-qword halves)
 *        %rsi = operand a, 80 qwords read
 *        %rdx = operand b, 80 qwords, one per half consumed per iteration
 *        %rcx = 80-qword operand multiplied by the reduction factors
 *               (presumably the two moduli - confirm)
 *        %r8  = pointer to two qwords: (%r8) is the constant for the first
 *               half, 8(%r8) for the second
 * Saved: %rbx, %rbp, %r12-%r15 are pushed on entry and reloaded on exit.
 * Clobb: %rax, %rcx, %rdx, %r8-%r11, %ymm0-%ymm29, %k1-%k7, flags.
 *
 * Register roles inside the loop:
 *        %r11 = cursor into b            %ebx = loop counter
 *        %rax = 2^52 - 1
 *        %r9  = scalar carry, first half  (lanes in %ymm3-%ymm12)
 *        %r15 = scalar carry, second half (lanes in %ymm13-%ymm22)
 *        %ymm1/%ymm2 = broadcast b digit / reduction factor
 *        %ymm0 = zero, shifted into the top lane by valignq
 */
.globl	ossl_rsaz_amm52x40_x2_ifma256
.type	ossl_rsaz_amm52x40_x2_ifma256,@function
.align	32
ossl_rsaz_amm52x40_x2_ifma256:
.cfi_startproc
/* f3 0f 1e fa = endbr64, emitted as bytes */
.byte	243,15,30,250
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56

	/* Clear the zero register and the first-half accumulators. */
	vpxord	%ymm0,%ymm0,%ymm0
	vmovdqa64	%ymm0,%ymm3
	vmovdqa64	%ymm0,%ymm4
	vmovdqa64	%ymm0,%ymm5
	vmovdqa64	%ymm0,%ymm6
	vmovdqa64	%ymm0,%ymm7
	vmovdqa64	%ymm0,%ymm8
	vmovdqa64	%ymm0,%ymm9
	vmovdqa64	%ymm0,%ymm10
	vmovdqa64	%ymm0,%ymm11
	vmovdqa64	%ymm0,%ymm12

	/* Clear the second-half accumulators. */
	vmovdqa64	%ymm0,%ymm13
	vmovdqa64	%ymm0,%ymm14
	vmovdqa64	%ymm0,%ymm15
	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm0,%ymm17
	vmovdqa64	%ymm0,%ymm18
	vmovdqa64	%ymm0,%ymm19
	vmovdqa64	%ymm0,%ymm20
	vmovdqa64	%ymm0,%ymm21
	vmovdqa64	%ymm0,%ymm22

	/* Both scalar carries start at zero. */
	xorl	%r9d,%r9d
	xorl	%r15d,%r15d

	/* %rdx is needed as the implicit mulx operand, so b moves to %r11. */
	movq	%rdx,%r11
	movq	$0xfffffffffffff,%rax

	/* 40 iterations, one digit of each half per iteration. */
	movl	$40,%ebx

.align	32
.Lloop40:
	/* ---- first half: digit at 0(%r11), data at offset 0 ---- */
	movq	0(%r11),%r13

	/* %r10:%r9 = %r9 + a[0] * b digit (128-bit) */
	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	/* reduction factor = ((%r8) * %r9) & (2^52 - 1) */
	movq	(%r8),%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	/* %r10:%r9 += 0(%rcx) * reduction factor */
	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	/* %r9 = (%r10:%r9) >> 52 */
	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	/* lanes += low 52 bits of the two products */
	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm4
	vpmadd52luq	64(%rsi),%ymm1,%ymm5
	vpmadd52luq	96(%rsi),%ymm1,%ymm6
	vpmadd52luq	128(%rsi),%ymm1,%ymm7
	vpmadd52luq	160(%rsi),%ymm1,%ymm8
	vpmadd52luq	192(%rsi),%ymm1,%ymm9
	vpmadd52luq	224(%rsi),%ymm1,%ymm10
	vpmadd52luq	256(%rsi),%ymm1,%ymm11
	vpmadd52luq	288(%rsi),%ymm1,%ymm12

	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm4
	vpmadd52luq	64(%rcx),%ymm2,%ymm5
	vpmadd52luq	96(%rcx),%ymm2,%ymm6
	vpmadd52luq	128(%rcx),%ymm2,%ymm7
	vpmadd52luq	160(%rcx),%ymm2,%ymm8
	vpmadd52luq	192(%rcx),%ymm2,%ymm9
	vpmadd52luq	224(%rcx),%ymm2,%ymm10
	vpmadd52luq	256(%rcx),%ymm2,%ymm11
	vpmadd52luq	288(%rcx),%ymm2,%ymm12

	/* Shift the accumulator down by one lane; zero enters on top. */
	valignq	$1,%ymm3,%ymm4,%ymm3
	valignq	$1,%ymm4,%ymm5,%ymm4
	valignq	$1,%ymm5,%ymm6,%ymm5
	valignq	$1,%ymm6,%ymm7,%ymm6
	valignq	$1,%ymm7,%ymm8,%ymm7
	valignq	$1,%ymm8,%ymm9,%ymm8
	valignq	$1,%ymm9,%ymm10,%ymm9
	valignq	$1,%ymm10,%ymm11,%ymm10
	valignq	$1,%ymm11,%ymm12,%ymm11
	valignq	$1,%ymm12,%ymm0,%ymm12

	/* Fold the new lane 0 into the scalar carry. */
	vmovq	%xmm3,%r13
	addq	%r13,%r9

	/* lanes += high 52 bits of the two products */
	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm4
	vpmadd52huq	64(%rsi),%ymm1,%ymm5
	vpmadd52huq	96(%rsi),%ymm1,%ymm6
	vpmadd52huq	128(%rsi),%ymm1,%ymm7
	vpmadd52huq	160(%rsi),%ymm1,%ymm8
	vpmadd52huq	192(%rsi),%ymm1,%ymm9
	vpmadd52huq	224(%rsi),%ymm1,%ymm10
	vpmadd52huq	256(%rsi),%ymm1,%ymm11
	vpmadd52huq	288(%rsi),%ymm1,%ymm12

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm4
	vpmadd52huq	64(%rcx),%ymm2,%ymm5
	vpmadd52huq	96(%rcx),%ymm2,%ymm6
	vpmadd52huq	128(%rcx),%ymm2,%ymm7
	vpmadd52huq	160(%rcx),%ymm2,%ymm8
	vpmadd52huq	192(%rcx),%ymm2,%ymm9
	vpmadd52huq	224(%rcx),%ymm2,%ymm10
	vpmadd52huq	256(%rcx),%ymm2,%ymm11
	vpmadd52huq	288(%rcx),%ymm2,%ymm12

	/* ---- second half: digit at 320(%r11), data at offset 320 ---- */
	movq	320(%r11),%r13

	vpbroadcastq	%r13,%ymm1
	movq	320(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r15
	movq	%r12,%r10
	adcq	$0,%r10

	/* reduction factor = (8(%r8) * %r15) & (2^52 - 1) */
	movq	8(%r8),%r13
	imulq	%r15,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	320(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r15
	adcq	%r12,%r10

	shrq	$52,%r15
	salq	$12,%r10
	orq	%r10,%r15

	vpmadd52luq	320(%rsi),%ymm1,%ymm13
	vpmadd52luq	352(%rsi),%ymm1,%ymm14
	vpmadd52luq	384(%rsi),%ymm1,%ymm15
	vpmadd52luq	416(%rsi),%ymm1,%ymm16
	vpmadd52luq	448(%rsi),%ymm1,%ymm17
	vpmadd52luq	480(%rsi),%ymm1,%ymm18
	vpmadd52luq	512(%rsi),%ymm1,%ymm19
	vpmadd52luq	544(%rsi),%ymm1,%ymm20
	vpmadd52luq	576(%rsi),%ymm1,%ymm21
	vpmadd52luq	608(%rsi),%ymm1,%ymm22

	vpmadd52luq	320(%rcx),%ymm2,%ymm13
	vpmadd52luq	352(%rcx),%ymm2,%ymm14
	vpmadd52luq	384(%rcx),%ymm2,%ymm15
	vpmadd52luq	416(%rcx),%ymm2,%ymm16
	vpmadd52luq	448(%rcx),%ymm2,%ymm17
	vpmadd52luq	480(%rcx),%ymm2,%ymm18
	vpmadd52luq	512(%rcx),%ymm2,%ymm19
	vpmadd52luq	544(%rcx),%ymm2,%ymm20
	vpmadd52luq	576(%rcx),%ymm2,%ymm21
	vpmadd52luq	608(%rcx),%ymm2,%ymm22

	valignq	$1,%ymm13,%ymm14,%ymm13
	valignq	$1,%ymm14,%ymm15,%ymm14
	valignq	$1,%ymm15,%ymm16,%ymm15
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm20,%ymm19
	valignq	$1,%ymm20,%ymm21,%ymm20
	valignq	$1,%ymm21,%ymm22,%ymm21
	valignq	$1,%ymm22,%ymm0,%ymm22

	vmovq	%xmm13,%r13
	addq	%r13,%r15

	vpmadd52huq	320(%rsi),%ymm1,%ymm13
	vpmadd52huq	352(%rsi),%ymm1,%ymm14
	vpmadd52huq	384(%rsi),%ymm1,%ymm15
	vpmadd52huq	416(%rsi),%ymm1,%ymm16
	vpmadd52huq	448(%rsi),%ymm1,%ymm17
	vpmadd52huq	480(%rsi),%ymm1,%ymm18
	vpmadd52huq	512(%rsi),%ymm1,%ymm19
	vpmadd52huq	544(%rsi),%ymm1,%ymm20
	vpmadd52huq	576(%rsi),%ymm1,%ymm21
	vpmadd52huq	608(%rsi),%ymm1,%ymm22

	vpmadd52huq	320(%rcx),%ymm2,%ymm13
	vpmadd52huq	352(%rcx),%ymm2,%ymm14
	vpmadd52huq	384(%rcx),%ymm2,%ymm15
	vpmadd52huq	416(%rcx),%ymm2,%ymm16
	vpmadd52huq	448(%rcx),%ymm2,%ymm17
	vpmadd52huq	480(%rcx),%ymm2,%ymm18
	vpmadd52huq	512(%rcx),%ymm2,%ymm19
	vpmadd52huq	544(%rcx),%ymm2,%ymm20
	vpmadd52huq	576(%rcx),%ymm2,%ymm21
	vpmadd52huq	608(%rcx),%ymm2,%ymm22

	/* Advance one digit. */
	leaq	8(%r11),%r11
	decl	%ebx
	jne	.Lloop40

	/* ==== normalize the first half (%ymm3-%ymm12, carry in %r9) ==== */

	/* Replace lane 0 (dwords 0 and 1) with the scalar carry. */
	vpbroadcastq	%r9,%ymm0
	vpblendd	$3,%ymm0,%ymm3,%ymm3

	/* Extract the bits above bit 51 of every lane. */
	vpsrlq	$52,%ymm3,%ymm0
	vpsrlq	$52,%ymm4,%ymm1
	vpsrlq	$52,%ymm5,%ymm2
	vpsrlq	$52,%ymm6,%ymm23
	vpsrlq	$52,%ymm7,%ymm24
	vpsrlq	$52,%ymm8,%ymm25
	vpsrlq	$52,%ymm9,%ymm26
	vpsrlq	$52,%ymm10,%ymm27
	vpsrlq	$52,%ymm11,%ymm28
	vpsrlq	$52,%ymm12,%ymm29

	/* Move each carry up by one lane; zero enters at lane 0. */
	valignq	$3,%ymm28,%ymm29,%ymm29
	valignq	$3,%ymm27,%ymm28,%ymm28
	valignq	$3,%ymm26,%ymm27,%ymm27
	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm24,%ymm25,%ymm25
	valignq	$3,%ymm23,%ymm24,%ymm24
	valignq	$3,%ymm2,%ymm23,%ymm23
	valignq	$3,%ymm1,%ymm2,%ymm2
	valignq	$3,%ymm0,%ymm1,%ymm1
	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0

	/* Keep the low 52 bits of every lane. */
	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
	vpandq	.Lmask52x4(%rip),%ymm5,%ymm5
	vpandq	.Lmask52x4(%rip),%ymm6,%ymm6
	vpandq	.Lmask52x4(%rip),%ymm7,%ymm7
	vpandq	.Lmask52x4(%rip),%ymm8,%ymm8
	vpandq	.Lmask52x4(%rip),%ymm9,%ymm9
	vpandq	.Lmask52x4(%rip),%ymm10,%ymm10
	vpandq	.Lmask52x4(%rip),%ymm11,%ymm11
	vpandq	.Lmask52x4(%rip),%ymm12,%ymm12

	/* Add the shifted carries. */
	vpaddq	%ymm0,%ymm3,%ymm3
	vpaddq	%ymm1,%ymm4,%ymm4
	vpaddq	%ymm2,%ymm5,%ymm5
	vpaddq	%ymm23,%ymm6,%ymm6
	vpaddq	%ymm24,%ymm7,%ymm7
	vpaddq	%ymm25,%ymm8,%ymm8
	vpaddq	%ymm26,%ymm9,%ymm9
	vpaddq	%ymm27,%ymm10,%ymm10
	vpaddq	%ymm28,%ymm11,%ymm11
	vpaddq	%ymm29,%ymm12,%ymm12

	/* Lanes greater than 2^52 - 1 (predicate 6), packed 8 lanes per byte. */
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm3,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm4,%k2
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	shlb	$4,%r13b
	orb	%r13b,%r14b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm5,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm6,%k2
	kmovb	%k1,%r13d
	kmovb	%k2,%r12d
	shlb	$4,%r12b
	orb	%r12b,%r13b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm7,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm8,%k2
	kmovb	%k1,%r12d
	kmovb	%k2,%r11d
	shlb	$4,%r11b
	orb	%r11b,%r12b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm9,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm10,%k2
	kmovb	%k1,%r11d
	kmovb	%k2,%r10d
	shlb	$4,%r10b
	orb	%r10b,%r11b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm11,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm12,%k2
	kmovb	%k1,%r10d
	kmovb	%k2,%r9d
	shlb	$4,%r9b
	orb	%r9b,%r10b

	/* Shift the 40-bit mask left by one: a carry lands on the next lane. */
	addb	%r14b,%r14b
	adcb	%r13b,%r13b
	adcb	%r12b,%r12b
	adcb	%r11b,%r11b
	adcb	%r10b,%r10b

	/* Lanes equal to 2^52 - 1 (predicate 0); %r8 is free after the loop. */
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm3,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm4,%k2
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	shlb	$4,%r8b
	orb	%r8b,%r9b

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm5,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm6,%k2
	kmovb	%k1,%r8d
	kmovb	%k2,%edx
	shlb	$4,%dl
	orb	%dl,%r8b

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm7,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm8,%k2
	kmovb	%k1,%edx
	kmovb	%k2,%ecx
	shlb	$4,%cl
	orb	%cl,%dl

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm9,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm10,%k2
	kmovb	%k1,%ecx
	kmovb	%k2,%ebx
	shlb	$4,%bl
	orb	%bl,%cl

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm11,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm12,%k2
	kmovb	%k1,%ebx
	kmovb	%k2,%eax
	shlb	$4,%al
	orb	%al,%bl

	/* Ripple the carries through the equal-lanes, then xor them out. */
	addb	%r9b,%r14b
	adcb	%r8b,%r13b
	adcb	%dl,%r12b
	adcb	%cl,%r11b
	adcb	%bl,%r10b

	xorb	%r9b,%r14b
	xorb	%r8b,%r13b
	xorb	%dl,%r12b
	xorb	%cl,%r11b
	xorb	%bl,%r10b

	/* Low nibble masks the even register, high nibble the odd one. */
	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r13d,%k3
	shrb	$4,%r13b
	kmovb	%r13d,%k4
	kmovb	%r12d,%k5
	shrb	$4,%r12b
	kmovb	%r12d,%k6
	kmovb	%r11d,%k7

	/* Subtracting 2^52 - 1 and masking to 52 bits adds 1 modulo 2^52. */
	vpsubq	.Lmask52x4(%rip),%ymm3,%ymm3{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm4,%ymm4{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm5,%ymm5{%k3}
	vpsubq	.Lmask52x4(%rip),%ymm6,%ymm6{%k4}
	vpsubq	.Lmask52x4(%rip),%ymm7,%ymm7{%k5}
	vpsubq	.Lmask52x4(%rip),%ymm8,%ymm8{%k6}
	vpsubq	.Lmask52x4(%rip),%ymm9,%ymm9{%k7}

	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
	vpandq	.Lmask52x4(%rip),%ymm5,%ymm5
	vpandq	.Lmask52x4(%rip),%ymm6,%ymm6
	vpandq	.Lmask52x4(%rip),%ymm7,%ymm7
	vpandq	.Lmask52x4(%rip),%ymm8,%ymm8
	vpandq	.Lmask52x4(%rip),%ymm9,%ymm9

	shrb	$4,%r11b
	kmovb	%r11d,%k1
	kmovb	%r10d,%k2
	shrb	$4,%r10b
	kmovb	%r10d,%k3

	vpsubq	.Lmask52x4(%rip),%ymm10,%ymm10{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm11,%ymm11{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm12,%ymm12{%k3}

	vpandq	.Lmask52x4(%rip),%ymm10,%ymm10
	vpandq	.Lmask52x4(%rip),%ymm11,%ymm11
	vpandq	.Lmask52x4(%rip),%ymm12,%ymm12

	/* ==== normalize the second half (%ymm13-%ymm22, carry in %r15) ==== */

	vpbroadcastq	%r15,%ymm0
	vpblendd	$3,%ymm0,%ymm13,%ymm13

	vpsrlq	$52,%ymm13,%ymm0
	vpsrlq	$52,%ymm14,%ymm1
	vpsrlq	$52,%ymm15,%ymm2
	vpsrlq	$52,%ymm16,%ymm23
	vpsrlq	$52,%ymm17,%ymm24
	vpsrlq	$52,%ymm18,%ymm25
	vpsrlq	$52,%ymm19,%ymm26
	vpsrlq	$52,%ymm20,%ymm27
	vpsrlq	$52,%ymm21,%ymm28
	vpsrlq	$52,%ymm22,%ymm29

	valignq	$3,%ymm28,%ymm29,%ymm29
	valignq	$3,%ymm27,%ymm28,%ymm28
	valignq	$3,%ymm26,%ymm27,%ymm27
	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm24,%ymm25,%ymm25
	valignq	$3,%ymm23,%ymm24,%ymm24
	valignq	$3,%ymm2,%ymm23,%ymm23
	valignq	$3,%ymm1,%ymm2,%ymm2
	valignq	$3,%ymm0,%ymm1,%ymm1
	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0

	vpandq	.Lmask52x4(%rip),%ymm13,%ymm13
	vpandq	.Lmask52x4(%rip),%ymm14,%ymm14
	vpandq	.Lmask52x4(%rip),%ymm15,%ymm15
	vpandq	.Lmask52x4(%rip),%ymm16,%ymm16
	vpandq	.Lmask52x4(%rip),%ymm17,%ymm17
	vpandq	.Lmask52x4(%rip),%ymm18,%ymm18
	vpandq	.Lmask52x4(%rip),%ymm19,%ymm19
	vpandq	.Lmask52x4(%rip),%ymm20,%ymm20
	vpandq	.Lmask52x4(%rip),%ymm21,%ymm21
	vpandq	.Lmask52x4(%rip),%ymm22,%ymm22

	vpaddq	%ymm0,%ymm13,%ymm13
	vpaddq	%ymm1,%ymm14,%ymm14
	vpaddq	%ymm2,%ymm15,%ymm15
	vpaddq	%ymm23,%ymm16,%ymm16
	vpaddq	%ymm24,%ymm17,%ymm17
	vpaddq	%ymm25,%ymm18,%ymm18
	vpaddq	%ymm26,%ymm19,%ymm19
	vpaddq	%ymm27,%ymm20,%ymm20
	vpaddq	%ymm28,%ymm21,%ymm21
	vpaddq	%ymm29,%ymm22,%ymm22

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm13,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm14,%k2
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	shlb	$4,%r13b
	orb	%r13b,%r14b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm15,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm16,%k2
	kmovb	%k1,%r13d
	kmovb	%k2,%r12d
	shlb	$4,%r12b
	orb	%r12b,%r13b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm17,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm18,%k2
	kmovb	%k1,%r12d
	kmovb	%k2,%r11d
	shlb	$4,%r11b
	orb	%r11b,%r12b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm19,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm20,%k2
	kmovb	%k1,%r11d
	kmovb	%k2,%r10d
	shlb	$4,%r10b
	orb	%r10b,%r11b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm21,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm22,%k2
	kmovb	%k1,%r10d
	kmovb	%k2,%r9d
	shlb	$4,%r9b
	orb	%r9b,%r10b

	addb	%r14b,%r14b
	adcb	%r13b,%r13b
	adcb	%r12b,%r12b
	adcb	%r11b,%r11b
	adcb	%r10b,%r10b

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm13,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm14,%k2
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	shlb	$4,%r8b
	orb	%r8b,%r9b

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm15,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm16,%k2
	kmovb	%k1,%r8d
	kmovb	%k2,%edx
	shlb	$4,%dl
	orb	%dl,%r8b

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm17,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm18,%k2
	kmovb	%k1,%edx
	kmovb	%k2,%ecx
	shlb	$4,%cl
	orb	%cl,%dl

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm19,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm20,%k2
	kmovb	%k1,%ecx
	kmovb	%k2,%ebx
	shlb	$4,%bl
	orb	%bl,%cl

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm21,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm22,%k2
	kmovb	%k1,%ebx
	kmovb	%k2,%eax
	shlb	$4,%al
	orb	%al,%bl

	addb	%r9b,%r14b
	adcb	%r8b,%r13b
	adcb	%dl,%r12b
	adcb	%cl,%r11b
	adcb	%bl,%r10b

	xorb	%r9b,%r14b
	xorb	%r8b,%r13b
	xorb	%dl,%r12b
	xorb	%cl,%r11b
	xorb	%bl,%r10b

	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r13d,%k3
	shrb	$4,%r13b
	kmovb	%r13d,%k4
	kmovb	%r12d,%k5
	shrb	$4,%r12b
	kmovb	%r12d,%k6
	kmovb	%r11d,%k7

	vpsubq	.Lmask52x4(%rip),%ymm13,%ymm13{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm14,%ymm14{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm15,%ymm15{%k3}
	vpsubq	.Lmask52x4(%rip),%ymm16,%ymm16{%k4}
	vpsubq	.Lmask52x4(%rip),%ymm17,%ymm17{%k5}
	vpsubq	.Lmask52x4(%rip),%ymm18,%ymm18{%k6}
	vpsubq	.Lmask52x4(%rip),%ymm19,%ymm19{%k7}

	vpandq	.Lmask52x4(%rip),%ymm13,%ymm13
	vpandq	.Lmask52x4(%rip),%ymm14,%ymm14
	vpandq	.Lmask52x4(%rip),%ymm15,%ymm15
	vpandq	.Lmask52x4(%rip),%ymm16,%ymm16
	vpandq	.Lmask52x4(%rip),%ymm17,%ymm17
	vpandq	.Lmask52x4(%rip),%ymm18,%ymm18
	vpandq	.Lmask52x4(%rip),%ymm19,%ymm19

	shrb	$4,%r11b
	kmovb	%r11d,%k1
	kmovb	%r10d,%k2
	shrb	$4,%r10b
	kmovb	%r10d,%k3

	vpsubq	.Lmask52x4(%rip),%ymm20,%ymm20{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm21,%ymm21{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm22,%ymm22{%k3}

	vpandq	.Lmask52x4(%rip),%ymm20,%ymm20
	vpandq	.Lmask52x4(%rip),%ymm21,%ymm21
	vpandq	.Lmask52x4(%rip),%ymm22,%ymm22

	/* Store the first half ... */
	vmovdqu64	%ymm3,0(%rdi)
	vmovdqu64	%ymm4,32(%rdi)
	vmovdqu64	%ymm5,64(%rdi)
	vmovdqu64	%ymm6,96(%rdi)
	vmovdqu64	%ymm7,128(%rdi)
	vmovdqu64	%ymm8,160(%rdi)
	vmovdqu64	%ymm9,192(%rdi)
	vmovdqu64	%ymm10,224(%rdi)
	vmovdqu64	%ymm11,256(%rdi)
	vmovdqu64	%ymm12,288(%rdi)

	/* ... and the second half right after it. */
	vmovdqu64	%ymm13,320(%rdi)
	vmovdqu64	%ymm14,352(%rdi)
	vmovdqu64	%ymm15,384(%rdi)
	vmovdqu64	%ymm16,416(%rdi)
	vmovdqu64	%ymm17,448(%rdi)
	vmovdqu64	%ymm18,480(%rdi)
	vmovdqu64	%ymm19,512(%rdi)
	vmovdqu64	%ymm20,544(%rdi)
	vmovdqu64	%ymm21,576(%rdi)
	vmovdqu64	%ymm22,608(%rdi)

	vzeroupper
	/* Reload the six saved registers in reverse push order, drop 48 bytes. */
	leaq	(%rsp),%rax
.cfi_def_cfa_register	%rax
	movq	0(%rax),%r15
.cfi_restore	%r15
	movq	8(%rax),%r14
.cfi_restore	%r14
	movq	16(%rax),%r13
.cfi_restore	%r13
	movq	24(%rax),%r12
.cfi_restore	%r12
	movq	32(%rax),%rbp
.cfi_restore	%rbp
	movq	40(%rax),%rbx
.cfi_restore	%rbx
	leaq	48(%rax),%rsp
.cfi_def_cfa	%rsp,8
.Lossl_rsaz_amm52x40_x2_ifma256_epilogue:
	/* f3 c3 = rep ret, emitted as bytes */
	.byte	0xf3,0xc3
.cfi_endproc
.size	ossl_rsaz_amm52x40_x2_ifma256, .-ossl_rsaz_amm52x40_x2_ifma256
.text

/*
 * ossl_extract_multiplier_2x40_win5
 *
 * Copies one 2x40-qword entry out of a 32-entry table (32 * 640 bytes =
 * 20480 bytes).  The two 40-qword halves are selected by separate indices.
 * Every entry is read on every call and the wanted one is kept with a
 * compare mask, so the addresses touched do not depend on the indices
 * (presumably to avoid leaking them through the cache - confirm intent
 * with the generator).
 *
 * In:    %rdi = output, 80 qwords written
 *        %rsi = table base
 *        %rdx = entry index for bytes 0-319 of the output
 *        %rcx = entry index for bytes 320-639 of the output
 *        The full 64-bit %rdx and %rcx are compared against the running
 *        counter, so any bits above the index must be zero.  An index
 *        that matches no counter value (0-31) yields all-zero output.
 * Clobb: %rax, %rsi, %r10, %ymm0-%ymm5, %ymm16-%ymm24, %k1, flags.
 *        No stack is used and no vzeroupper is issued before returning.
 */
.align	32
.globl	ossl_extract_multiplier_2x40_win5
.type	ossl_extract_multiplier_2x40_win5,@function
ossl_extract_multiplier_2x40_win5:
.cfi_startproc
/* f3 0f 1e fa = endbr64, emitted as bytes */
.byte	243,15,30,250
	vmovdqa64	.Lones(%rip),%ymm24
	vpbroadcastq	%rdx,%ymm22
	vpbroadcastq	%rcx,%ymm23
	/* %rax = end of table, the loop bound for both passes */
	leaq	20480(%rsi),%rax

	/* Keep the table base for the second pass. */
	movq	%rsi,%r10

	/* Ten result registers start at zero. */
	vpxor	%xmm0,%xmm0,%xmm0
	vmovdqa64	%ymm0,%ymm1
	vmovdqa64	%ymm0,%ymm2
	vmovdqa64	%ymm0,%ymm3
	vmovdqa64	%ymm0,%ymm4
	vmovdqa64	%ymm0,%ymm5
	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm0,%ymm17
	vmovdqa64	%ymm0,%ymm18
	vmovdqa64	%ymm0,%ymm19
	/* %ymm21 = running entry counter, starts at 0 */
	vpxorq	%ymm21,%ymm21,%ymm21
.align	32
.Lloop_0:
	/* %k1 is all-ones for the lanes when counter == first index */
	vpcmpq	$0,%ymm21,%ymm22,%k1
	vmovdqu64	0(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm0,%ymm0{%k1}
	vmovdqu64	32(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm1,%ymm1{%k1}
	vmovdqu64	64(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm2,%ymm2{%k1}
	vmovdqu64	96(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm3,%ymm3{%k1}
	vmovdqu64	128(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm4,%ymm4{%k1}
	vmovdqu64	160(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm5,%ymm5{%k1}
	vmovdqu64	192(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm16,%ymm16{%k1}
	vmovdqu64	224(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm17,%ymm17{%k1}
	vmovdqu64	256(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm18,%ymm18{%k1}
	vmovdqu64	288(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm19,%ymm19{%k1}
	/* counter += 1, next 640-byte entry */
	vpaddq	%ymm24,%ymm21,%ymm21
	addq	$640,%rsi
	cmpq	%rsi,%rax
	jne	.Lloop_0
	vmovdqu64	%ymm0,0(%rdi)
	vmovdqu64	%ymm1,32(%rdi)
	vmovdqu64	%ymm2,64(%rdi)
	vmovdqu64	%ymm3,96(%rdi)
	vmovdqu64	%ymm4,128(%rdi)
	vmovdqu64	%ymm5,160(%rdi)
	vmovdqu64	%ymm16,192(%rdi)
	vmovdqu64	%ymm17,224(%rdi)
	vmovdqu64	%ymm18,256(%rdi)
	vmovdqu64	%ymm19,288(%rdi)
	/*
	 * Second pass over the upper half of every entry.  The result
	 * registers are not cleared here: they still hold the first-half
	 * selection and are overwritten when the counter matches %rcx.
	 */
	movq	%r10,%rsi
	vpxorq	%ymm21,%ymm21,%ymm21
.align	32
.Lloop_320:
	vpcmpq	$0,%ymm21,%ymm23,%k1
	vmovdqu64	320(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm0,%ymm0{%k1}
	vmovdqu64	352(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm1,%ymm1{%k1}
	vmovdqu64	384(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm2,%ymm2{%k1}
	vmovdqu64	416(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm3,%ymm3{%k1}
	vmovdqu64	448(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm4,%ymm4{%k1}
	vmovdqu64	480(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm5,%ymm5{%k1}
	vmovdqu64	512(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm16,%ymm16{%k1}
	vmovdqu64	544(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm17,%ymm17{%k1}
	vmovdqu64	576(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm18,%ymm18{%k1}
	vmovdqu64	608(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm19,%ymm19{%k1}
	vpaddq	%ymm24,%ymm21,%ymm21
	addq	$640,%rsi
	cmpq	%rsi,%rax
	jne	.Lloop_320
	vmovdqu64	%ymm0,320(%rdi)
	vmovdqu64	%ymm1,352(%rdi)
	vmovdqu64	%ymm2,384(%rdi)
	vmovdqu64	%ymm3,416(%rdi)
	vmovdqu64	%ymm4,448(%rdi)
	vmovdqu64	%ymm5,480(%rdi)
	vmovdqu64	%ymm16,512(%rdi)
	vmovdqu64	%ymm17,544(%rdi)
	vmovdqu64	%ymm18,576(%rdi)
	vmovdqu64	%ymm19,608(%rdi)

	/* f3 c3 = rep ret, emitted as bytes */
	.byte	0xf3,0xc3
.cfi_endproc
.size	ossl_extract_multiplier_2x40_win5, .-ossl_extract_multiplier_2x40_win5

/* Counter increment and the zero vector shifted in during normalization. */
.section	.rodata
.align	32
.Lones:
.quad	1,1,1,1
.Lzeros:
.quad	0,0,0,0

/*
 * GNU property note: name size, descriptor size, type 5, name "GNU", then
 * one property 0xc0000002 whose 4-byte value is 3.  NOTE(review): this
 * matches the x86 feature property used to mark IBT/SHSTK compatibility;
 * confirm against the psABI.
 */
	.section ".note.gnu.property", "a"
	.p2align 3
	.long 1f - 0f
	.long 4f - 1f
	.long 5
0:
	# "GNU" encoded with .byte, since .asciz isn't supported
	# on Solaris.
	.byte 0x47
	.byte 0x4e
	.byte 0x55
	.byte 0
1:
	.p2align 3
	.long 0xc0000002
	.long 3f - 2f
2:
	.long 3
3:
	.p2align 3
4: