# Copyright 2020-2022 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2020, Intel Corporation. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#
# Originally written by Ilya Albrekht, Sergey Kirillov and Andrey Matyukov
# Intel Corporation
#
# December 2020
#
# Initial release.
#
# Implementation utilizes 256-bit (ymm) registers to avoid frequency scaling issues.
#
# IceLake-Client @ 1.3GHz
# |---------+----------------------+--------------+-------------|
# |         | OpenSSL 3.0.0-alpha9 | this         | Unit        |
# |---------+----------------------+--------------+-------------|
# | rsa2048 | 2 127 659            | 1 015 625    | cycles/sign |
# |         | 611                  | 1280 / +109% | sign/s      |
# |---------+----------------------+--------------+-------------|
#

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$avx512ifma=0;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
    =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
    $avx512ifma = ($1>=2.26);
}

if (!$avx512ifma && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
    $avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12);
}

if (!$avx512ifma && `$ENV{CC} -v 2>&1`
    =~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
    my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001
    if ($1) {
        # Apple conditions, they use a different version series, see
        # https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
        # clang 7.0.0 is Apple clang 10.0.1
        $avx512ifma = ($ver>=10.0001);
    } else {
        $avx512ifma = ($ver>=7.0);
    }
}

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

if ($avx512ifma>0) {{{
@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

$code.=<<___;
.extern OPENSSL_ia32cap_P
.globl  ossl_rsaz_avx512ifma_eligible
.type   ossl_rsaz_avx512ifma_eligible,\@abi-omnipotent
.align  32
ossl_rsaz_avx512ifma_eligible:
    mov     OPENSSL_ia32cap_P+8(%rip), %ecx
    xor     %eax,%eax
    and     \$`1<<31|1<<21|1<<17|1<<16`, %ecx   # avx512vl + avx512ifma + avx512dq + avx512f
    cmp     \$`1<<31|1<<21|1<<17|1<<16`, %ecx
    cmove   %ecx,%eax
    ret
.size   ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible
___

###############################################################################
# Almost Montgomery Multiplication (AMM) for 20-digit number in radix 2^52.
#
# AMM is defined as presented in the paper
# "Efficient Software Implementations of Modular Exponentiation" by Shay Gueron.
#
# The input and output are presented in 2^52 radix domain, i.e.
# |res|, |a|, |b|, |m| are arrays of 20 64-bit qwords with 12 high bits zeroed.
# |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64
# (note, the implementation counts only 52 bits from it).
#
# NB: the AMM implementation does not perform the "conditional" subtraction
# step specified in the original algorithm, because according to the paper
# "Enhanced Montgomery Multiplication" by Shay Gueron (see Lemma 1) the result
# is always < 2*2^1024 and can be used as a direct input to the next AMM
# iteration. This post-condition holds provided the correct parameter |s| is
# chosen, i.e. s >= n + 2 * k, which matches our case: 1040 > 1024 + 2 * 1.
#
# void ossl_rsaz_amm52x20_x1_256(BN_ULONG *res,
#                                const BN_ULONG *a,
#                                const BN_ULONG *b,
#                                const BN_ULONG *m,
#                                BN_ULONG k0);
###############################################################################
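# For reference, the vectorized loops below can be checked against a scalar
# model of one AMM pass. The following is a minimal illustrative sketch
# (hypothetical helper, not used by the generator): operands are Math::BigInt
# values, $n is the digit count (20 here) and $k0 is -1/m mod 2^52. Each
# iteration folds in one 52-bit digit of |b|, zeroes the lowest accumulator
# digit by adding a multiple of |m|, and shifts right by one digit; as noted
# above, no final conditional subtraction is performed.
use Math::BigInt;

sub amm52_ref {
    my ($a, $b, $m, $k0, $n) = @_;
    my $mask = Math::BigInt->new(1)->blsft(52)->bsub(1);        # 2^52 - 1
    my $r = Math::BigInt->bzero();
    for my $i (0 .. $n - 1) {
        my $bi = $b->copy->brsft(52 * $i)->band($mask);         # i-th digit of b
        $r->badd($a->copy->bmul($bi));                          # r += a * b[i]
        my $y = $r->copy->band($mask)->bmul($k0)->band($mask);  # y = (r * k0) mod 2^52
        $r->badd($m->copy->bmul($y));                           # low digit becomes zero
        $r->brsft(52);                                          # exact shift by one digit
    }
    return $r;  # r == a * b * 2^(-52*n) (mod m), not fully reduced (see note above)
}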
{
# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8")
my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;

my $mask52     = "%rax";
my $acc0_0     = "%r9";
my $acc0_0_low = "%r9d";
my $acc0_1     = "%r15";
my $acc0_1_low = "%r15d";
my $b_ptr      = "%r11";

my $iter = "%ebx";

my $zero = "%ymm0";
my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0) = ("%ymm1", map("%ymm$_",(16..19)));
my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1) = ("%ymm2", map("%ymm$_",(20..23)));
my $Bi = "%ymm3";
my $Yi = "%ymm4";

# Registers mapping for normalization.
# We can reuse Bi, Yi registers here.
my $TMP      = $Bi;
my $mask52x4 = $Yi;
my ($T0,$T0h,$T1,$T1h,$T2) = map("%ymm$_", (24..28));

sub amm52x20_x1() {
# _data_offset - offset in the |a| or |m| arrays pointing to the beginning
#                of data for corresponding AMM operation;
# _b_offset    - offset in the |b| array pointing to the next qword digit;
my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_k0) = @_;
my $_R0_xmm = $_R0;
$_R0_xmm =~ s/%y/%x/;
$code.=<<___;
    movq    $_b_offset($b_ptr), %r13        # b[i]

    vpbroadcastq    %r13, $Bi               # broadcast b[i]
    movq    $_data_offset($a), %rdx
    mulx    %r13, %r13, %r12                # a[0]*b[i] = (t0,t2)
    addq    %r13, $_acc                     # acc += t0
    movq    %r12, %r10
    adcq    \$0, %r10                       # t2 += CF

    movq    $_k0, %r13
    imulq   $_acc, %r13                     # acc * k0
    andq    $mask52, %r13                   # yi = (acc * k0) & mask52

    vpbroadcastq    %r13, $Yi               # broadcast y[i]
    movq    $_data_offset($m), %rdx
    mulx    %r13, %r13, %r12                # yi * m[0] = (t0,t1)
    addq    %r13, $_acc                     # acc += t0
    adcq    %r12, %r10                      # t2 += (t1 + CF)

    shrq    \$52, $_acc
    salq    \$12, %r10
    or      %r10, $_acc                     # acc = ((acc >> 52) | (t2 << 12))

    vpmadd52luq `$_data_offset+64*0`($a), $Bi, $_R0
    vpmadd52luq `$_data_offset+64*0+32`($a), $Bi, $_R0h
    vpmadd52luq `$_data_offset+64*1`($a), $Bi, $_R1
    vpmadd52luq `$_data_offset+64*1+32`($a), $Bi, $_R1h
    vpmadd52luq `$_data_offset+64*2`($a), $Bi, $_R2

    vpmadd52luq `$_data_offset+64*0`($m), $Yi, $_R0
    vpmadd52luq `$_data_offset+64*0+32`($m), $Yi, $_R0h
    vpmadd52luq `$_data_offset+64*1`($m), $Yi, $_R1
    vpmadd52luq `$_data_offset+64*1+32`($m), $Yi, $_R1h
    vpmadd52luq `$_data_offset+64*2`($m), $Yi, $_R2

    # Shift accumulators right by 1 qword, zero extending the highest one
    valignq \$1, $_R0, $_R0h, $_R0
    valignq \$1, $_R0h, $_R1, $_R0h
    valignq \$1, $_R1, $_R1h, $_R1
    valignq \$1, $_R1h, $_R2, $_R1h
    valignq \$1, $_R2, $zero, $_R2

    vmovq   $_R0_xmm, %r13
    addq    %r13, $_acc                     # acc += R0[0]

    vpmadd52huq `$_data_offset+64*0`($a), $Bi, $_R0
    vpmadd52huq `$_data_offset+64*0+32`($a), $Bi, $_R0h
    vpmadd52huq `$_data_offset+64*1`($a), $Bi, $_R1
    vpmadd52huq `$_data_offset+64*1+32`($a), $Bi, $_R1h
    vpmadd52huq `$_data_offset+64*2`($a), $Bi, $_R2

    vpmadd52huq `$_data_offset+64*0`($m), $Yi, $_R0
    vpmadd52huq `$_data_offset+64*0+32`($m), $Yi, $_R0h
    vpmadd52huq `$_data_offset+64*1`($m), $Yi, $_R1
    vpmadd52huq `$_data_offset+64*1+32`($m), $Yi, $_R1h
    vpmadd52huq `$_data_offset+64*2`($m), $Yi, $_R2
___
}
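# A note on the scalar head of the loop above (explanatory arithmetic, not
# part of the original comments): mulx splits the 104-bit product a[0]*b[i]
# into a 64-bit low half (t0) and a high half (t2). After the low halves and
# carries are accumulated, the accumulator is realigned to the next 52-bit
# digit boundary with
#   acc = (acc >> 52) | (t2 << 12)
# which is exact because t2 << 12 == (t2 << 64) >> 52, i.e. the high half is
# moved from the 2^64 split point down to the 2^52 one.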
# Normalization routine: handles carry bits in R0..R2 QWs and
# gets R0..R2 back to normalized 2^52 representation.
#
# Uses %r8-14,%e[bcd]x
sub amm52x20_x1_norm {
my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2) = @_;
$code.=<<___;
    # Put accumulator to low qword in R0
    vpbroadcastq    $_acc, $TMP
    vpblendd \$3, $TMP, $_R0, $_R0

    # Extract "carries" (12 high bits) from each QW of R0..R2
    # Save them to LSB of QWs in T0..T2
    vpsrlq  \$52, $_R0, $T0
    vpsrlq  \$52, $_R0h, $T0h
    vpsrlq  \$52, $_R1, $T1
    vpsrlq  \$52, $_R1h, $T1h
    vpsrlq  \$52, $_R2, $T2

    # "Shift left" T0..T2 by 1 QW
    valignq \$3, $T1h, $T2, $T2
    valignq \$3, $T1, $T1h, $T1h
    valignq \$3, $T0h, $T1, $T1
    valignq \$3, $T0, $T0h, $T0h
    valignq \$3, $zero, $T0, $T0

    # Drop "carries" from R0..R2 QWs
    vpandq  $mask52x4, $_R0, $_R0
    vpandq  $mask52x4, $_R0h, $_R0h
    vpandq  $mask52x4, $_R1, $_R1
    vpandq  $mask52x4, $_R1h, $_R1h
    vpandq  $mask52x4, $_R2, $_R2

    # Sum R0..R2 with corresponding adjusted carries
    vpaddq  $T0, $_R0, $_R0
    vpaddq  $T0h, $_R0h, $_R0h
    vpaddq  $T1, $_R1, $_R1
    vpaddq  $T1h, $_R1h, $_R1h
    vpaddq  $T2, $_R2, $_R2

    # Now handle carry bits from this addition
    # Get mask of QWs whose 52-bit parts overflow...
    vpcmpuq \$1, $_R0, $mask52x4, %k1       # OP=lt
    vpcmpuq \$1, $_R0h, $mask52x4, %k2
    vpcmpuq \$1, $_R1, $mask52x4, %k3
    vpcmpuq \$1, $_R1h, $mask52x4, %k4
    vpcmpuq \$1, $_R2, $mask52x4, %k5
    kmovb   %k1, %r14d                      # k1
    kmovb   %k2, %r13d                      # k1h
    kmovb   %k3, %r12d                      # k2
    kmovb   %k4, %r11d                      # k2h
    kmovb   %k5, %r10d                      # k3

    # ...or saturated
    vpcmpuq \$0, $_R0, $mask52x4, %k1       # OP=eq
    vpcmpuq \$0, $_R0h, $mask52x4, %k2
    vpcmpuq \$0, $_R1, $mask52x4, %k3
    vpcmpuq \$0, $_R1h, $mask52x4, %k4
    vpcmpuq \$0, $_R2, $mask52x4, %k5
    kmovb   %k1, %r9d                       # k4
    kmovb   %k2, %r8d                       # k4h
    kmovb   %k3, %ebx                       # k5
    kmovb   %k4, %ecx                       # k5h
    kmovb   %k5, %edx                       # k6

    # Get mask of QWs where carries shall be propagated to.
    # Merge 4-bit masks to 8-bit values to use add with carry.
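    # How the propagation below works (explanatory note): let o be the
    # per-QW mask of lanes that overflowed 52 bits, and s the mask of lanes
    # exactly equal to 2^52-1. A carry entering a saturated lane must keep
    # rippling, so the set of lanes to increment is ((o << 1) + s) ^ s:
    # the byte-wise add-with-carry chain lets the CPU adder carry through
    # each run of 1-bits in s, and the final xor recovers exactly the lanes
    # that changed. The increment itself is then done with a masked vpsubq
    # of 2^52-1, since R - (2^52-1) == R + 1 - 2^52, i.e. add one and drop
    # the carry bit in one step (the trailing vpandq masks back to 52 bits).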
    shl     \$4, %r13b
    or      %r13b, %r14b
    shl     \$4, %r11b
    or      %r11b, %r12b

    add     %r14b, %r14b
    adc     %r12b, %r12b
    adc     %r10b, %r10b

    shl     \$4, %r8b
    or      %r8b,%r9b
    shl     \$4, %cl
    or      %cl, %bl

    add     %r9b, %r14b
    adc     %bl, %r12b
    adc     %dl, %r10b

    xor     %r9b, %r14b
    xor     %bl, %r12b
    xor     %dl, %r10b

    kmovb   %r14d, %k1
    shr     \$4, %r14b
    kmovb   %r14d, %k2
    kmovb   %r12d, %k3
    shr     \$4, %r12b
    kmovb   %r12d, %k4
    kmovb   %r10d, %k5

    # Add carries according to the obtained mask
    vpsubq  $mask52x4, $_R0, ${_R0}{%k1}
    vpsubq  $mask52x4, $_R0h, ${_R0h}{%k2}
    vpsubq  $mask52x4, $_R1, ${_R1}{%k3}
    vpsubq  $mask52x4, $_R1h, ${_R1h}{%k4}
    vpsubq  $mask52x4, $_R2, ${_R2}{%k5}

    vpandq  $mask52x4, $_R0, $_R0
    vpandq  $mask52x4, $_R0h, $_R0h
    vpandq  $mask52x4, $_R1, $_R1
    vpandq  $mask52x4, $_R1h, $_R1h
    vpandq  $mask52x4, $_R2, $_R2
___
}

$code.=<<___;
.text

.globl  ossl_rsaz_amm52x20_x1_256
.type   ossl_rsaz_amm52x20_x1_256,\@function,5
.align 32
ossl_rsaz_amm52x20_x1_256:
.cfi_startproc
    endbranch
    push    %rbx
.cfi_push   %rbx
    push    %rbp
.cfi_push   %rbp
    push    %r12
.cfi_push   %r12
    push    %r13
.cfi_push   %r13
    push    %r14
.cfi_push   %r14
    push    %r15
.cfi_push   %r15
.Lrsaz_amm52x20_x1_256_body:

    # Zeroing accumulators
    vpxord      $zero, $zero, $zero
    vmovdqa64   $zero, $R0_0
    vmovdqa64   $zero, $R0_0h
    vmovdqa64   $zero, $R1_0
    vmovdqa64   $zero, $R1_0h
    vmovdqa64   $zero, $R2_0

    xorl    $acc0_0_low, $acc0_0_low

    movq    $b, $b_ptr                      # backup address of b
    movq    \$0xfffffffffffff, $mask52      # 52-bit mask

    # Loop over 20 digits unrolled by 4
    mov     \$5, $iter

.align 32
.Lloop5:
___
    foreach my $idx (0..3) {
        &amm52x20_x1(0,8*$idx,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$k0);
    }
$code.=<<___;
    lea     `4*8`($b_ptr), $b_ptr
    dec     $iter
    jne     .Lloop5

    vmovdqa64   .Lmask52x4(%rip), $mask52x4
___
    &amm52x20_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0);
$code.=<<___;

    vmovdqu64   $R0_0, ($res)
    vmovdqu64   $R0_0h, 32($res)
    vmovdqu64   $R1_0, 64($res)
    vmovdqu64   $R1_0h, 96($res)
    vmovdqu64   $R2_0, 128($res)

    vzeroupper
    mov     0(%rsp),%r15
.cfi_restore    %r15
    mov     8(%rsp),%r14
.cfi_restore    %r14
    mov     16(%rsp),%r13
.cfi_restore    %r13
    mov     24(%rsp),%r12
.cfi_restore    %r12
    mov     32(%rsp),%rbp
.cfi_restore    %rbp
    mov     40(%rsp),%rbx
.cfi_restore    %rbx
    lea     48(%rsp),%rsp
.cfi_adjust_cfa_offset  -48
.Lrsaz_amm52x20_x1_256_epilogue:
    ret
.cfi_endproc
.size   ossl_rsaz_amm52x20_x1_256, .-ossl_rsaz_amm52x20_x1_256
___

$code.=<<___;
.data
.align 32
.Lmask52x4:
    .quad   0xfffffffffffff
    .quad   0xfffffffffffff
    .quad   0xfffffffffffff
    .quad   0xfffffffffffff
___
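# For reference, the vectorized normalization above is equivalent to the
# following sequential sweep (a minimal illustrative sketch, not used by the
# generator; digit values fit native 64-bit integers on 64-bit perls):
sub norm52_ref {
    my ($d, $n) = @_;               # $d: array ref of $n unnormalized digits
    my $mask = (1 << 52) - 1;
    my $carry = 0;
    for my $i (0 .. $n - 1) {
        my $t = ($d->[$i] & $mask) + $carry;        # add carry-in, <= 2^52
        $carry = ($d->[$i] >> 52) + ($t >> 52);     # stored carry + ripple
        $d->[$i] = $t & $mask;
    }
    return $d;                      # normalized: every digit < 2^52
}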
###############################################################################
# Dual Almost Montgomery Multiplication for 20-digit numbers in radix 2^52
#
# See the description of ossl_rsaz_amm52x20_x1_256() above for details about
# the Almost Montgomery Multiplication algorithm and the function parameters.
#
# This function does two AMMs for two independent inputs, hence dual.
#
# void ossl_rsaz_amm52x20_x2_256(BN_ULONG out[2][20],
#                                const BN_ULONG a[2][20],
#                                const BN_ULONG b[2][20],
#                                const BN_ULONG m[2][20],
#                                const BN_ULONG k0[2]);
###############################################################################

$code.=<<___;
.text

.globl  ossl_rsaz_amm52x20_x2_256
.type   ossl_rsaz_amm52x20_x2_256,\@function,5
.align 32
ossl_rsaz_amm52x20_x2_256:
.cfi_startproc
    endbranch
    push    %rbx
.cfi_push   %rbx
    push    %rbp
.cfi_push   %rbp
    push    %r12
.cfi_push   %r12
    push    %r13
.cfi_push   %r13
    push    %r14
.cfi_push   %r14
    push    %r15
.cfi_push   %r15
.Lrsaz_amm52x20_x2_256_body:

    # Zeroing accumulators
    vpxord      $zero, $zero, $zero
    vmovdqa64   $zero, $R0_0
    vmovdqa64   $zero, $R0_0h
    vmovdqa64   $zero, $R1_0
    vmovdqa64   $zero, $R1_0h
    vmovdqa64   $zero, $R2_0
    vmovdqa64   $zero, $R0_1
    vmovdqa64   $zero, $R0_1h
    vmovdqa64   $zero, $R1_1
    vmovdqa64   $zero, $R1_1h
    vmovdqa64   $zero, $R2_1

    xorl    $acc0_0_low, $acc0_0_low
    xorl    $acc0_1_low, $acc0_1_low

    movq    $b, $b_ptr                      # backup address of b
    movq    \$0xfffffffffffff, $mask52      # 52-bit mask

    mov     \$20, $iter

.align 32
.Lloop20:
___
    &amm52x20_x1( 0, 0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,"($k0)");
    # 20*8 = offset of the next dimension in the two-dimensional array
    &amm52x20_x1(20*8,20*8,$acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,"8($k0)");
$code.=<<___;
    lea     8($b_ptr), $b_ptr
    dec     $iter
    jne     .Lloop20

    vmovdqa64   .Lmask52x4(%rip), $mask52x4
___
    &amm52x20_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0);
    &amm52x20_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1);
$code.=<<___;

    vmovdqu64   $R0_0, ($res)
    vmovdqu64   $R0_0h, 32($res)
    vmovdqu64   $R1_0, 64($res)
    vmovdqu64   $R1_0h, 96($res)
    vmovdqu64   $R2_0, 128($res)

    vmovdqu64   $R0_1, 160($res)
    vmovdqu64   $R0_1h, 192($res)
    vmovdqu64   $R1_1, 224($res)
    vmovdqu64   $R1_1h, 256($res)
    vmovdqu64   $R2_1, 288($res)

    vzeroupper
    mov     0(%rsp),%r15
.cfi_restore    %r15
    mov     8(%rsp),%r14
.cfi_restore    %r14
    mov     16(%rsp),%r13
.cfi_restore    %r13
    mov     24(%rsp),%r12
.cfi_restore    %r12
    mov     32(%rsp),%rbp
.cfi_restore    %rbp
    mov     40(%rsp),%rbx
.cfi_restore    %rbx
    lea     48(%rsp),%rsp
.cfi_adjust_cfa_offset  -48
.Lrsaz_amm52x20_x2_256_epilogue:
    ret
.cfi_endproc
.size   ossl_rsaz_amm52x20_x2_256, .-ossl_rsaz_amm52x20_x2_256
___
}

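# In terms of the scalar model given earlier, the dual entry point computes
# (hypothetical usage sketch; amm52_ref is the illustrative helper defined
# above, not part of this file's interface):
#
#   my @out = map { amm52_ref($a[$_], $b[$_], $m[$_], $k0->[$_], 20) } (0, 1);
#
# The interleaving in .Lloop20 is there for throughput: the two independent
# digit loops advance in lockstep, so both accumulator sets stay resident in
# disjoint ymm registers and the IFMA multipliers are kept busy.
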
###############################################################################
# Constant time extraction from the precomputed table of powers base^i, where
#    i = 0..2^EXP_WIN_SIZE-1
#
# The input |red_table| contains precomputations for two independent base
# values, so |tbl_idx| indicates which of the two bases to extract the value
# for. |red_table_idx| is a power index.
#
# The extracted value (output) is a 20-digit number in radix 2^52.
#
# void ossl_extract_multiplier_2x20_win5(BN_ULONG *red_Y,
#                                        const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][20],
#                                        int red_table_idx,
#                                        int tbl_idx);           # 0 or 1
#
# EXP_WIN_SIZE = 5
###############################################################################
{
# input parameters
my ($out,$red_tbl,$red_tbl_idx,$tbl_idx) = @_6_args_universal_ABI;

my ($t0,$t1,$t2,$t3,$t4) = map("%ymm$_", (0..4));
my $t4xmm = $t4;
$t4xmm =~ s/%y/%x/;
my ($tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = map("%ymm$_", (16..20));
my ($cur_idx,$idx,$ones) = map("%ymm$_", (21..23));

$code.=<<___;
.text

.align 32
.globl  ossl_extract_multiplier_2x20_win5
.type   ossl_extract_multiplier_2x20_win5,\@function,4
ossl_extract_multiplier_2x20_win5:
.cfi_startproc
    endbranch
    leaq    ($tbl_idx,$tbl_idx,4), %rax     # tbl_idx * 5
    salq    \$5, %rax                       # (tbl_idx * 5) * 32 = tbl_idx * 20 * 8
    addq    %rax, $red_tbl

    vmovdqa64   .Lones(%rip), $ones         # load vector of ones
    vpbroadcastq    $red_tbl_idx, $idx
    leaq    `(1<<5)*2*20*8`($red_tbl), %rax # holds end of the tbl

    vpxor   $t4xmm, $t4xmm, $t4xmm
    vmovdqa64   $t4, $t3                    # zeroing t0..4, cur_idx
    vmovdqa64   $t4, $t2
    vmovdqa64   $t4, $t1
    vmovdqa64   $t4, $t0
    vmovdqa64   $t4, $cur_idx

.align 32
.Lloop:
    vpcmpq  \$0, $cur_idx, $idx, %k1        # mask of (idx == cur_idx)
    addq    \$320, $red_tbl                 # 320 = 2 * 20 digits * 8 bytes
    vpaddq  $ones, $cur_idx, $cur_idx       # increment cur_idx
    vmovdqu64   -320($red_tbl), $tmp0       # load data from red_tbl
    vmovdqu64   -288($red_tbl), $tmp1
    vmovdqu64   -256($red_tbl), $tmp2
    vmovdqu64   -224($red_tbl), $tmp3
    vmovdqu64   -192($red_tbl), $tmp4
    vpblendmq   $tmp0, $t0, ${t0}{%k1}      # extract data when mask is not zero
    vpblendmq   $tmp1, $t1, ${t1}{%k1}
    vpblendmq   $tmp2, $t2, ${t2}{%k1}
    vpblendmq   $tmp3, $t3, ${t3}{%k1}
    vpblendmq   $tmp4, $t4, ${t4}{%k1}
    cmpq    $red_tbl, %rax
    jne     .Lloop

    vmovdqu64   $t0, ($out)                 # store t0..4
    vmovdqu64   $t1, 32($out)
    vmovdqu64   $t2, 64($out)
    vmovdqu64   $t3, 96($out)
    vmovdqu64   $t4, 128($out)

    ret
.cfi_endproc
.size   ossl_extract_multiplier_2x20_win5, .-ossl_extract_multiplier_2x20_win5
___
$code.=<<___;
.data
.align 32
.Lones:
    .quad   1,1,1,1
___
}

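# A scalar model of the selection above (minimal illustrative sketch,
# hypothetical helper, not used by the generator): every one of the 2^5
# table entries is read unconditionally and folded in under a mask derived
# from the loop counter, so the sequence of memory accesses is independent
# of the secret index.
sub ct_extract_ref {
    my ($tbl, $idx, $n) = @_;       # $tbl: ref to array of entries,
    my @out = (0) x $n;             # each entry a ref to $n digits
    for my $i (0 .. $#{$tbl}) {
        my $mask = ($i == $idx) ? 0xFFFFFFFFFFFFFFFF : 0;   # models vpcmpq
        for my $j (0 .. $n - 1) {
            $out[$j] = ($out[$j] & ~$mask) | ($tbl->[$i][$j] & $mask);
        }
    }
    return \@out;
}
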
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type   rsaz_def_handler,\@abi-omnipotent
.align  16
rsaz_def_handler:
    push    %rsi
    push    %rdi
    push    %rbx
    push    %rbp
    push    %r12
    push    %r13
    push    %r14
    push    %r15
    pushfq
    sub     \$64,%rsp

    mov     120($context),%rax      # pull context->Rax
    mov     248($context),%rbx      # pull context->Rip

    mov     8($disp),%rsi           # disp->ImageBase
    mov     56($disp),%r11          # disp->HandlerData

    mov     0(%r11),%r10d           # HandlerData[0]
    lea     (%rsi,%r10),%r10        # prologue label
    cmp     %r10,%rbx               # context->Rip<.Lprologue
    jb      .Lcommon_seh_tail

    mov     152($context),%rax      # pull context->Rsp

    mov     4(%r11),%r10d           # HandlerData[1]
    lea     (%rsi,%r10),%r10        # epilogue label
    cmp     %r10,%rbx               # context->Rip>=.Lepilogue
    jae     .Lcommon_seh_tail

    lea     48(%rax),%rax

    mov     -8(%rax),%rbx
    mov     -16(%rax),%rbp
    mov     -24(%rax),%r12
    mov     -32(%rax),%r13
    mov     -40(%rax),%r14
    mov     -48(%rax),%r15
    mov     %rbx,144($context)      # restore context->Rbx
    mov     %rbp,160($context)      # restore context->Rbp
    mov     %r12,216($context)      # restore context->R12
    mov     %r13,224($context)      # restore context->R13
    mov     %r14,232($context)      # restore context->R14
    mov     %r15,240($context)      # restore context->R15

.Lcommon_seh_tail:
    mov     8(%rax),%rdi
    mov     16(%rax),%rsi
    mov     %rax,152($context)      # restore context->Rsp
    mov     %rsi,168($context)      # restore context->Rsi
    mov     %rdi,176($context)      # restore context->Rdi

    mov     40($disp),%rdi          # disp->ContextRecord
    mov     $context,%rsi           # context
    mov     \$154,%ecx              # sizeof(CONTEXT)
    .long   0xa548f3fc              # cld; rep movsq

    mov     $disp,%rsi
    xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
    mov     8(%rsi),%rdx            # arg2, disp->ImageBase
    mov     0(%rsi),%r8             # arg3, disp->ControlPc
    mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
    mov     40(%rsi),%r10           # disp->ContextRecord
    lea     56(%rsi),%r11           # &disp->HandlerData
    lea     24(%rsi),%r12           # &disp->EstablisherFrame
    mov     %r10,32(%rsp)           # arg5
    mov     %r11,40(%rsp)           # arg6
    mov     %r12,48(%rsp)           # arg7
    mov     %rcx,56(%rsp)           # arg8, (NULL)
    call    *__imp_RtlVirtualUnwind(%rip)

    mov     \$1,%eax                # ExceptionContinueSearch
    add     \$64,%rsp
    popfq
    pop     %r15
    pop     %r14
    pop     %r13
    pop     %r12
    pop     %rbp
    pop     %rbx
    pop     %rdi
    pop     %rsi
    ret
.size   rsaz_def_handler,.-rsaz_def_handler

.section    .pdata
.align  4
    .rva    .LSEH_begin_ossl_rsaz_amm52x20_x1_256
    .rva    .LSEH_end_ossl_rsaz_amm52x20_x1_256
    .rva    .LSEH_info_ossl_rsaz_amm52x20_x1_256

    .rva    .LSEH_begin_ossl_rsaz_amm52x20_x2_256
    .rva    .LSEH_end_ossl_rsaz_amm52x20_x2_256
    .rva    .LSEH_info_ossl_rsaz_amm52x20_x2_256

    .rva    .LSEH_begin_ossl_extract_multiplier_2x20_win5
    .rva    .LSEH_end_ossl_extract_multiplier_2x20_win5
    .rva    .LSEH_info_ossl_extract_multiplier_2x20_win5

.section    .xdata
.align  8
.LSEH_info_ossl_rsaz_amm52x20_x1_256:
    .byte   9,0,0,0
    .rva    rsaz_def_handler
    .rva    .Lrsaz_amm52x20_x1_256_body,.Lrsaz_amm52x20_x1_256_epilogue
.LSEH_info_ossl_rsaz_amm52x20_x2_256:
    .byte   9,0,0,0
    .rva    rsaz_def_handler
    .rva    .Lrsaz_amm52x20_x2_256_body,.Lrsaz_amm52x20_x2_256_epilogue
.LSEH_info_ossl_extract_multiplier_2x20_win5:
    .byte   9,0,0,0
    .rva    rsaz_def_handler
    .rva    .LSEH_begin_ossl_extract_multiplier_2x20_win5,.LSEH_begin_ossl_extract_multiplier_2x20_win5
___
}
}}} else {{{    # fallback for old assembler
$code.=<<___;
.text

.globl  ossl_rsaz_avx512ifma_eligible
.type   ossl_rsaz_avx512ifma_eligible,\@abi-omnipotent
ossl_rsaz_avx512ifma_eligible:
    xor     %eax,%eax
    ret
.size   ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible

.globl  ossl_rsaz_amm52x20_x1_256
.globl  ossl_rsaz_amm52x20_x2_256
.globl  ossl_extract_multiplier_2x20_win5
.type   ossl_rsaz_amm52x20_x1_256,\@abi-omnipotent
ossl_rsaz_amm52x20_x1_256:
ossl_rsaz_amm52x20_x2_256:
ossl_extract_multiplier_2x20_win5:
    .byte   0x0f,0x0b   # ud2
    ret
.size   ossl_rsaz_amm52x20_x1_256, .-ossl_rsaz_amm52x20_x1_256
___
}}}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";
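
# The script is driven by the build system like other perlasm modules; a
# typical standalone invocation (flavour and output name are examples, the
# actual values depend on the target platform) is:
#
#   perl rsaz-avx512.pl elf rsaz-avx512.s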