1#!/usr/bin/env perl 2# 3# ==================================================================== 4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 5# project. The module is, however, dual licensed under OpenSSL and 6# CRYPTOGAMS licenses depending on where you obtain it. For further 7# details see http://www.openssl.org/~appro/cryptogams/. 8# ==================================================================== 9# 10# This module implements support for Intel AES-NI extension. In 11# OpenSSL context it's used with Intel engine, but can also be used as 12# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for 13# details]. 14# 15# Performance. 16# 17# Given aes(enc|dec) instructions' latency asymptotic performance for 18# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte 19# processed with 128-bit key. And given their throughput asymptotic 20# performance for parallelizable modes is 1.25 cycles per byte. Being 21# asymptotic limit it's not something you commonly achieve in reality, 22# but how close does one get? Below are results collected for 23# different modes and block sized. Pairs of numbers are for en-/ 24# decryption. 25# 26# 16-byte 64-byte 256-byte 1-KB 8-KB 27# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26 28# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26 29# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28 30# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07 31# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38 32# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55 33# 34# ECB, CTR, CBC and CCM results are free from EVP overhead. This means 35# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni 36# [-decrypt]' will exhibit 10-15% worse results for smaller blocks. 
# The results were collected with specially crafted speed.c benchmark
# in order to compare them with results reported in "Intel Advanced
# Encryption Standard (AES) New Instruction Set" White Paper Revision
# 3.0 dated May 2010. All above results are consistently better. This
# module also provides better performance for block sizes smaller than
# 128 bytes in points *not* represented in the above table.
#
# Looking at the results for 8-KB buffer.
#
# CFB and OFB results are far from the limit, because implementation
# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
# single-block aesni_encrypt, which is not the most optimal way to go.
# CBC encrypt result is unexpectedly high and there is no documented
# explanation for it. Seemingly there is a small penalty for feeding
# the result back to AES unit the way it's done in CBC mode. There is
# nothing one can do and the result appears optimal. CCM result is
# identical to CBC, because CBC-MAC is essentially CBC encrypt without
# saving output. CCM CTR "stays invisible," because it's neatly
# interleaved with CBC-MAC. This provides ~30% improvement over
# "straightforward" CCM implementation with CTR and CBC-MAC performed
# disjointly. Parallelizable modes practically achieve the theoretical
# limit.
#
# Looking at how results vary with buffer size.
#
# Curves are practically saturated at 1-KB buffer size. In most cases
# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
# CTR curve doesn't follow this pattern and is "slowest" changing one
# with "256-byte" result being 87% of "8-KB." This is because overhead
# in CTR mode is most computationally intensive. Small-block CCM
# decrypt is slower than encrypt, because first CTR and last CBC-MAC
# iterations can't be interleaved.
#
# Results for 192- and 256-bit keys.
71# 72# EVP-free results were observed to scale perfectly with number of 73# rounds for larger block sizes, i.e. 192-bit result being 10/12 times 74# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences 75# are a tad smaller, because the above mentioned penalty biases all 76# results by same constant value. In similar way function call 77# overhead affects small-block performance, as well as OFB and CFB 78# results. Differences are not large, most common coefficients are 79# 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one 80# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)... 81 82# January 2011 83# 84# While Westmere processor features 6 cycles latency for aes[enc|dec] 85# instructions, which can be scheduled every second cycle, Sandy 86# Bridge spends 8 cycles per instruction, but it can schedule them 87# every cycle. This means that code targeting Westmere would perform 88# suboptimally on Sandy Bridge. Therefore this update. 89# 90# In addition, non-parallelizable CBC encrypt (as well as CCM) is 91# optimized. Relative improvement might appear modest, 8% on Westmere, 92# but in absolute terms it's 3.77 cycles per byte encrypted with 93# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers 94# should be compared to asymptotic limits of 3.75 for Westmere and 95# 5.00 for Sandy Bridge. Actually, the fact that they get this close 96# to asymptotic limits is quite amazing. Indeed, the limit is 97# calculated as latency times number of rounds, 10 for 128-bit key, 98# and divided by 16, the number of bytes in block, or in other words 99# it accounts *solely* for aesenc instructions. But there are extra 100# instructions, and numbers so close to the asymptotic limits mean 101# that it's as if it takes as little as *one* additional cycle to 102# execute all of them. How is it possible? 
It is possible thanks to 103# out-of-order execution logic, which manages to overlap post- 104# processing of previous block, things like saving the output, with 105# actual encryption of current block, as well as pre-processing of 106# current block, things like fetching input and xor-ing it with 107# 0-round element of the key schedule, with actual encryption of 108# previous block. Keep this in mind... 109# 110# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher 111# performance is achieved by interleaving instructions working on 112# independent blocks. In which case asymptotic limit for such modes 113# can be obtained by dividing above mentioned numbers by AES 114# instructions' interleave factor. Westmere can execute at most 3 115# instructions at a time, meaning that optimal interleave factor is 3, 116# and that's where the "magic" number of 1.25 come from. "Optimal 117# interleave factor" means that increase of interleave factor does 118# not improve performance. The formula has proven to reflect reality 119# pretty well on Westmere... Sandy Bridge on the other hand can 120# execute up to 8 AES instructions at a time, so how does varying 121# interleave factor affect the performance? Here is table for ECB 122# (numbers are cycles per byte processed with 128-bit key): 123# 124# instruction interleave factor 3x 6x 8x 125# theoretical asymptotic limit 1.67 0.83 0.625 126# measured performance for 8KB block 1.05 0.86 0.84 127# 128# "as if" interleave factor 4.7x 5.8x 6.0x 129# 130# Further data for other parallelizable modes: 131# 132# CBC decrypt 1.16 0.93 0.74 133# CTR 1.14 0.91 0.74 134# 135# Well, given 3x column it's probably inappropriate to call the limit 136# asymptotic, if it can be surpassed, isn't it? What happens there? 137# Rewind to CBC paragraph for the answer. Yes, out-of-order execution 138# magic is responsible for this. 
# Processor overlaps not only the
# additional instructions with AES ones, but even AES instructions
# processing adjacent triplets of independent blocks. In the 6x case
# additional instructions still claim disproportionally small amount
# of additional cycles, but in 8x case number of instructions must be
# a tad too high for out-of-order logic to cope with, and AES unit
# remains underutilized... As you can see 8x interleave is hardly
# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
# utilizes 6x interleave because of limited register bank capacity.
#
# Higher interleave factors do have negative impact on Westmere
# performance. While for ECB mode it's negligible ~1.5%, other
# parallelizables perform ~5% worse, which is outweighed by ~25%
# improvement on Sandy Bridge. To balance regression on Westmere
# CTR mode was implemented with 6x aesenc interleave factor.

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
# in CTR mode AES instruction interleave factor was chosen to be 6x.

######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#		CBC en-/decrypt	CTR	XTS	ECB
# Westmere	3.77/1.25	1.25	1.25	1.26
# * Bridge	5.07/0.74	0.75	0.90	0.85
# Haswell	4.44/0.63	0.63	0.73	0.63
# Silvermont	5.75/3.54	3.56	4.12	3.87(*)
# Bulldozer	5.77/0.70	0.72	0.90	0.70
#
# (*)	Atom Silvermont ECB result is suboptimal because of penalties
#	incurred by operations on %xmm8-15. As ECB is not considered
#	critical, nothing was done to mitigate the problem.
174 175$PREFIX="aesni"; # if $PREFIX is set to "AES", the script 176 # generates drop-in replacement for 177 # crypto/aes/asm/aes-x86_64.pl:-) 178 179$flavour = shift; 180$output = shift; 181if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 182 183$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 184 185$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 186( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 187( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 188die "can't locate x86_64-xlate.pl"; 189 190open OUT,"| \"$^X\" $xlate $flavour $output"; 191*STDOUT=*OUT; 192 193$movkey = $PREFIX eq "aesni" ? "movups" : "movups"; 194@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order 195 ("%rdi","%rsi","%rdx","%rcx"); # Unix order 196 197$code=".text\n"; 198$code.=".extern OPENSSL_ia32cap_P\n"; 199 200$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!! 201# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ... 202$inp="%rdi"; 203$out="%rsi"; 204$len="%rdx"; 205$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!! 206$ivp="%r8"; # cbc, ctr, ... 207 208$rnds_="%r10d"; # backup copy for $rounds 209$key_="%r11"; # backup copy for $key 210 211# %xmm register layout 212$rndkey0="%xmm0"; $rndkey1="%xmm1"; 213$inout0="%xmm2"; $inout1="%xmm3"; 214$inout2="%xmm4"; $inout3="%xmm5"; 215$inout4="%xmm6"; $inout5="%xmm7"; 216$inout6="%xmm8"; $inout7="%xmm9"; 217 218$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ... 219$in0="%xmm8"; $iv="%xmm9"; 220 221# Inline version of internal aesni_[en|de]crypt1. 222# 223# Why folded loop? Because aes[enc|dec] is slow enough to accommodate 224# cycles which take care of loop variables... 
225{ my $sn; 226sub aesni_generate1 { 227my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); 228++$sn; 229$code.=<<___; 230 $movkey ($key),$rndkey0 231 $movkey 16($key),$rndkey1 232___ 233$code.=<<___ if (defined($ivec)); 234 xorps $rndkey0,$ivec 235 lea 32($key),$key 236 xorps $ivec,$inout 237___ 238$code.=<<___ if (!defined($ivec)); 239 lea 32($key),$key 240 xorps $rndkey0,$inout 241___ 242$code.=<<___; 243.Loop_${p}1_$sn: 244 aes${p} $rndkey1,$inout 245 dec $rounds 246 $movkey ($key),$rndkey1 247 lea 16($key),$key 248 jnz .Loop_${p}1_$sn # loop body is 16 bytes 249 aes${p}last $rndkey1,$inout 250___ 251}} 252# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key); 253# 254{ my ($inp,$out,$key) = @_4args; 255 256$code.=<<___; 257.globl ${PREFIX}_encrypt 258.type ${PREFIX}_encrypt,\@abi-omnipotent 259.align 16 260${PREFIX}_encrypt: 261 movups ($inp),$inout0 # load input 262 mov 240($key),$rounds # key->rounds 263___ 264 &aesni_generate1("enc",$key,$rounds); 265$code.=<<___; 266 pxor $rndkey0,$rndkey0 # clear register bank 267 pxor $rndkey1,$rndkey1 268 movups $inout0,($out) # output 269 pxor $inout0,$inout0 270 ret 271.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt 272 273.globl ${PREFIX}_decrypt 274.type ${PREFIX}_decrypt,\@abi-omnipotent 275.align 16 276${PREFIX}_decrypt: 277 movups ($inp),$inout0 # load input 278 mov 240($key),$rounds # key->rounds 279___ 280 &aesni_generate1("dec",$key,$rounds); 281$code.=<<___; 282 pxor $rndkey0,$rndkey0 # clear register bank 283 pxor $rndkey1,$rndkey1 284 movups $inout0,($out) # output 285 pxor $inout0,$inout0 286 ret 287.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt 288___ 289} 290 291# _aesni_[en|de]cryptN are private interfaces, N denotes interleave 292# factor. Why 3x subroutine were originally used in loops? Even though 293# aes[enc|dec] latency was originally 6, it could be scheduled only 294# every *2nd* cycle. Thus 3x interleave was the one providing optimal 295# utilization, i.e. 
when subroutine's throughput is virtually same as 296# of non-interleaved subroutine [for number of input blocks up to 3]. 297# This is why it originally made no sense to implement 2x subroutine. 298# But times change and it became appropriate to spend extra 192 bytes 299# on 2x subroutine on Atom Silvermont account. For processors that 300# can schedule aes[enc|dec] every cycle optimal interleave factor 301# equals to corresponding instructions latency. 8x is optimal for 302# * Bridge and "super-optimal" for other Intel CPUs... 303 304sub aesni_generate2 { 305my $dir=shift; 306# As already mentioned it takes in $key and $rounds, which are *not* 307# preserved. $inout[0-1] is cipher/clear text... 308$code.=<<___; 309.type _aesni_${dir}rypt2,\@abi-omnipotent 310.align 16 311_aesni_${dir}rypt2: 312 $movkey ($key),$rndkey0 313 shl \$4,$rounds 314 $movkey 16($key),$rndkey1 315 xorps $rndkey0,$inout0 316 xorps $rndkey0,$inout1 317 $movkey 32($key),$rndkey0 318 lea 32($key,$rounds),$key 319 neg %rax # $rounds 320 add \$16,%rax 321 322.L${dir}_loop2: 323 aes${dir} $rndkey1,$inout0 324 aes${dir} $rndkey1,$inout1 325 $movkey ($key,%rax),$rndkey1 326 add \$32,%rax 327 aes${dir} $rndkey0,$inout0 328 aes${dir} $rndkey0,$inout1 329 $movkey -16($key,%rax),$rndkey0 330 jnz .L${dir}_loop2 331 332 aes${dir} $rndkey1,$inout0 333 aes${dir} $rndkey1,$inout1 334 aes${dir}last $rndkey0,$inout0 335 aes${dir}last $rndkey0,$inout1 336 ret 337.size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2 338___ 339} 340sub aesni_generate3 { 341my $dir=shift; 342# As already mentioned it takes in $key and $rounds, which are *not* 343# preserved. $inout[0-2] is cipher/clear text... 
344$code.=<<___; 345.type _aesni_${dir}rypt3,\@abi-omnipotent 346.align 16 347_aesni_${dir}rypt3: 348 $movkey ($key),$rndkey0 349 shl \$4,$rounds 350 $movkey 16($key),$rndkey1 351 xorps $rndkey0,$inout0 352 xorps $rndkey0,$inout1 353 xorps $rndkey0,$inout2 354 $movkey 32($key),$rndkey0 355 lea 32($key,$rounds),$key 356 neg %rax # $rounds 357 add \$16,%rax 358 359.L${dir}_loop3: 360 aes${dir} $rndkey1,$inout0 361 aes${dir} $rndkey1,$inout1 362 aes${dir} $rndkey1,$inout2 363 $movkey ($key,%rax),$rndkey1 364 add \$32,%rax 365 aes${dir} $rndkey0,$inout0 366 aes${dir} $rndkey0,$inout1 367 aes${dir} $rndkey0,$inout2 368 $movkey -16($key,%rax),$rndkey0 369 jnz .L${dir}_loop3 370 371 aes${dir} $rndkey1,$inout0 372 aes${dir} $rndkey1,$inout1 373 aes${dir} $rndkey1,$inout2 374 aes${dir}last $rndkey0,$inout0 375 aes${dir}last $rndkey0,$inout1 376 aes${dir}last $rndkey0,$inout2 377 ret 378.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3 379___ 380} 381# 4x interleave is implemented to improve small block performance, 382# most notably [and naturally] 4 block by ~30%. One can argue that one 383# should have implemented 5x as well, but improvement would be <20%, 384# so it's not worth it... 385sub aesni_generate4 { 386my $dir=shift; 387# As already mentioned it takes in $key and $rounds, which are *not* 388# preserved. $inout[0-3] is cipher/clear text... 
389$code.=<<___; 390.type _aesni_${dir}rypt4,\@abi-omnipotent 391.align 16 392_aesni_${dir}rypt4: 393 $movkey ($key),$rndkey0 394 shl \$4,$rounds 395 $movkey 16($key),$rndkey1 396 xorps $rndkey0,$inout0 397 xorps $rndkey0,$inout1 398 xorps $rndkey0,$inout2 399 xorps $rndkey0,$inout3 400 $movkey 32($key),$rndkey0 401 lea 32($key,$rounds),$key 402 neg %rax # $rounds 403 .byte 0x0f,0x1f,0x00 404 add \$16,%rax 405 406.L${dir}_loop4: 407 aes${dir} $rndkey1,$inout0 408 aes${dir} $rndkey1,$inout1 409 aes${dir} $rndkey1,$inout2 410 aes${dir} $rndkey1,$inout3 411 $movkey ($key,%rax),$rndkey1 412 add \$32,%rax 413 aes${dir} $rndkey0,$inout0 414 aes${dir} $rndkey0,$inout1 415 aes${dir} $rndkey0,$inout2 416 aes${dir} $rndkey0,$inout3 417 $movkey -16($key,%rax),$rndkey0 418 jnz .L${dir}_loop4 419 420 aes${dir} $rndkey1,$inout0 421 aes${dir} $rndkey1,$inout1 422 aes${dir} $rndkey1,$inout2 423 aes${dir} $rndkey1,$inout3 424 aes${dir}last $rndkey0,$inout0 425 aes${dir}last $rndkey0,$inout1 426 aes${dir}last $rndkey0,$inout2 427 aes${dir}last $rndkey0,$inout3 428 ret 429.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4 430___ 431} 432sub aesni_generate6 { 433my $dir=shift; 434# As already mentioned it takes in $key and $rounds, which are *not* 435# preserved. $inout[0-5] is cipher/clear text... 
436$code.=<<___; 437.type _aesni_${dir}rypt6,\@abi-omnipotent 438.align 16 439_aesni_${dir}rypt6: 440 $movkey ($key),$rndkey0 441 shl \$4,$rounds 442 $movkey 16($key),$rndkey1 443 xorps $rndkey0,$inout0 444 pxor $rndkey0,$inout1 445 pxor $rndkey0,$inout2 446 aes${dir} $rndkey1,$inout0 447 lea 32($key,$rounds),$key 448 neg %rax # $rounds 449 aes${dir} $rndkey1,$inout1 450 pxor $rndkey0,$inout3 451 pxor $rndkey0,$inout4 452 aes${dir} $rndkey1,$inout2 453 pxor $rndkey0,$inout5 454 $movkey ($key,%rax),$rndkey0 455 add \$16,%rax 456 jmp .L${dir}_loop6_enter 457.align 16 458.L${dir}_loop6: 459 aes${dir} $rndkey1,$inout0 460 aes${dir} $rndkey1,$inout1 461 aes${dir} $rndkey1,$inout2 462.L${dir}_loop6_enter: 463 aes${dir} $rndkey1,$inout3 464 aes${dir} $rndkey1,$inout4 465 aes${dir} $rndkey1,$inout5 466 $movkey ($key,%rax),$rndkey1 467 add \$32,%rax 468 aes${dir} $rndkey0,$inout0 469 aes${dir} $rndkey0,$inout1 470 aes${dir} $rndkey0,$inout2 471 aes${dir} $rndkey0,$inout3 472 aes${dir} $rndkey0,$inout4 473 aes${dir} $rndkey0,$inout5 474 $movkey -16($key,%rax),$rndkey0 475 jnz .L${dir}_loop6 476 477 aes${dir} $rndkey1,$inout0 478 aes${dir} $rndkey1,$inout1 479 aes${dir} $rndkey1,$inout2 480 aes${dir} $rndkey1,$inout3 481 aes${dir} $rndkey1,$inout4 482 aes${dir} $rndkey1,$inout5 483 aes${dir}last $rndkey0,$inout0 484 aes${dir}last $rndkey0,$inout1 485 aes${dir}last $rndkey0,$inout2 486 aes${dir}last $rndkey0,$inout3 487 aes${dir}last $rndkey0,$inout4 488 aes${dir}last $rndkey0,$inout5 489 ret 490.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6 491___ 492} 493sub aesni_generate8 { 494my $dir=shift; 495# As already mentioned it takes in $key and $rounds, which are *not* 496# preserved. $inout[0-7] is cipher/clear text... 
497$code.=<<___; 498.type _aesni_${dir}rypt8,\@abi-omnipotent 499.align 16 500_aesni_${dir}rypt8: 501 $movkey ($key),$rndkey0 502 shl \$4,$rounds 503 $movkey 16($key),$rndkey1 504 xorps $rndkey0,$inout0 505 xorps $rndkey0,$inout1 506 pxor $rndkey0,$inout2 507 pxor $rndkey0,$inout3 508 pxor $rndkey0,$inout4 509 lea 32($key,$rounds),$key 510 neg %rax # $rounds 511 aes${dir} $rndkey1,$inout0 512 pxor $rndkey0,$inout5 513 pxor $rndkey0,$inout6 514 aes${dir} $rndkey1,$inout1 515 pxor $rndkey0,$inout7 516 $movkey ($key,%rax),$rndkey0 517 add \$16,%rax 518 jmp .L${dir}_loop8_inner 519.align 16 520.L${dir}_loop8: 521 aes${dir} $rndkey1,$inout0 522 aes${dir} $rndkey1,$inout1 523.L${dir}_loop8_inner: 524 aes${dir} $rndkey1,$inout2 525 aes${dir} $rndkey1,$inout3 526 aes${dir} $rndkey1,$inout4 527 aes${dir} $rndkey1,$inout5 528 aes${dir} $rndkey1,$inout6 529 aes${dir} $rndkey1,$inout7 530.L${dir}_loop8_enter: 531 $movkey ($key,%rax),$rndkey1 532 add \$32,%rax 533 aes${dir} $rndkey0,$inout0 534 aes${dir} $rndkey0,$inout1 535 aes${dir} $rndkey0,$inout2 536 aes${dir} $rndkey0,$inout3 537 aes${dir} $rndkey0,$inout4 538 aes${dir} $rndkey0,$inout5 539 aes${dir} $rndkey0,$inout6 540 aes${dir} $rndkey0,$inout7 541 $movkey -16($key,%rax),$rndkey0 542 jnz .L${dir}_loop8 543 544 aes${dir} $rndkey1,$inout0 545 aes${dir} $rndkey1,$inout1 546 aes${dir} $rndkey1,$inout2 547 aes${dir} $rndkey1,$inout3 548 aes${dir} $rndkey1,$inout4 549 aes${dir} $rndkey1,$inout5 550 aes${dir} $rndkey1,$inout6 551 aes${dir} $rndkey1,$inout7 552 aes${dir}last $rndkey0,$inout0 553 aes${dir}last $rndkey0,$inout1 554 aes${dir}last $rndkey0,$inout2 555 aes${dir}last $rndkey0,$inout3 556 aes${dir}last $rndkey0,$inout4 557 aes${dir}last $rndkey0,$inout5 558 aes${dir}last $rndkey0,$inout6 559 aes${dir}last $rndkey0,$inout7 560 ret 561.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8 562___ 563} 564&aesni_generate2("enc") if ($PREFIX eq "aesni"); 565&aesni_generate2("dec"); 566&aesni_generate3("enc") if ($PREFIX eq 
"aesni"); 567&aesni_generate3("dec"); 568&aesni_generate4("enc") if ($PREFIX eq "aesni"); 569&aesni_generate4("dec"); 570&aesni_generate6("enc") if ($PREFIX eq "aesni"); 571&aesni_generate6("dec"); 572&aesni_generate8("enc") if ($PREFIX eq "aesni"); 573&aesni_generate8("dec"); 574 575if ($PREFIX eq "aesni") { 576######################################################################## 577# void aesni_ecb_encrypt (const void *in, void *out, 578# size_t length, const AES_KEY *key, 579# int enc); 580$code.=<<___; 581.globl aesni_ecb_encrypt 582.type aesni_ecb_encrypt,\@function,5 583.align 16 584aesni_ecb_encrypt: 585___ 586$code.=<<___ if ($win64); 587 lea -0x58(%rsp),%rsp 588 movaps %xmm6,(%rsp) # offload $inout4..7 589 movaps %xmm7,0x10(%rsp) 590 movaps %xmm8,0x20(%rsp) 591 movaps %xmm9,0x30(%rsp) 592.Lecb_enc_body: 593___ 594$code.=<<___; 595 and \$-16,$len # if ($len<16) 596 jz .Lecb_ret # return 597 598 mov 240($key),$rounds # key->rounds 599 $movkey ($key),$rndkey0 600 mov $key,$key_ # backup $key 601 mov $rounds,$rnds_ # backup $rounds 602 test %r8d,%r8d # 5th argument 603 jz .Lecb_decrypt 604#--------------------------- ECB ENCRYPT ------------------------------# 605 cmp \$0x80,$len # if ($len<8*16) 606 jb .Lecb_enc_tail # short input 607 608 movdqu ($inp),$inout0 # load 8 input blocks 609 movdqu 0x10($inp),$inout1 610 movdqu 0x20($inp),$inout2 611 movdqu 0x30($inp),$inout3 612 movdqu 0x40($inp),$inout4 613 movdqu 0x50($inp),$inout5 614 movdqu 0x60($inp),$inout6 615 movdqu 0x70($inp),$inout7 616 lea 0x80($inp),$inp # $inp+=8*16 617 sub \$0x80,$len # $len-=8*16 (can be zero) 618 jmp .Lecb_enc_loop8_enter 619.align 16 620.Lecb_enc_loop8: 621 movups $inout0,($out) # store 8 output blocks 622 mov $key_,$key # restore $key 623 movdqu ($inp),$inout0 # load 8 input blocks 624 mov $rnds_,$rounds # restore $rounds 625 movups $inout1,0x10($out) 626 movdqu 0x10($inp),$inout1 627 movups $inout2,0x20($out) 628 movdqu 0x20($inp),$inout2 629 movups $inout3,0x30($out) 630 
movdqu 0x30($inp),$inout3 631 movups $inout4,0x40($out) 632 movdqu 0x40($inp),$inout4 633 movups $inout5,0x50($out) 634 movdqu 0x50($inp),$inout5 635 movups $inout6,0x60($out) 636 movdqu 0x60($inp),$inout6 637 movups $inout7,0x70($out) 638 lea 0x80($out),$out # $out+=8*16 639 movdqu 0x70($inp),$inout7 640 lea 0x80($inp),$inp # $inp+=8*16 641.Lecb_enc_loop8_enter: 642 643 call _aesni_encrypt8 644 645 sub \$0x80,$len 646 jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow 647 648 movups $inout0,($out) # store 8 output blocks 649 mov $key_,$key # restore $key 650 movups $inout1,0x10($out) 651 mov $rnds_,$rounds # restore $rounds 652 movups $inout2,0x20($out) 653 movups $inout3,0x30($out) 654 movups $inout4,0x40($out) 655 movups $inout5,0x50($out) 656 movups $inout6,0x60($out) 657 movups $inout7,0x70($out) 658 lea 0x80($out),$out # $out+=8*16 659 add \$0x80,$len # restore real remaining $len 660 jz .Lecb_ret # done if ($len==0) 661 662.Lecb_enc_tail: # $len is less than 8*16 663 movups ($inp),$inout0 664 cmp \$0x20,$len 665 jb .Lecb_enc_one 666 movups 0x10($inp),$inout1 667 je .Lecb_enc_two 668 movups 0x20($inp),$inout2 669 cmp \$0x40,$len 670 jb .Lecb_enc_three 671 movups 0x30($inp),$inout3 672 je .Lecb_enc_four 673 movups 0x40($inp),$inout4 674 cmp \$0x60,$len 675 jb .Lecb_enc_five 676 movups 0x50($inp),$inout5 677 je .Lecb_enc_six 678 movdqu 0x60($inp),$inout6 679 xorps $inout7,$inout7 680 call _aesni_encrypt8 681 movups $inout0,($out) # store 7 output blocks 682 movups $inout1,0x10($out) 683 movups $inout2,0x20($out) 684 movups $inout3,0x30($out) 685 movups $inout4,0x40($out) 686 movups $inout5,0x50($out) 687 movups $inout6,0x60($out) 688 jmp .Lecb_ret 689.align 16 690.Lecb_enc_one: 691___ 692 &aesni_generate1("enc",$key,$rounds); 693$code.=<<___; 694 movups $inout0,($out) # store one output block 695 jmp .Lecb_ret 696.align 16 697.Lecb_enc_two: 698 call _aesni_encrypt2 699 movups $inout0,($out) # store 2 output blocks 700 movups $inout1,0x10($out) 701 jmp 
.Lecb_ret 702.align 16 703.Lecb_enc_three: 704 call _aesni_encrypt3 705 movups $inout0,($out) # store 3 output blocks 706 movups $inout1,0x10($out) 707 movups $inout2,0x20($out) 708 jmp .Lecb_ret 709.align 16 710.Lecb_enc_four: 711 call _aesni_encrypt4 712 movups $inout0,($out) # store 4 output blocks 713 movups $inout1,0x10($out) 714 movups $inout2,0x20($out) 715 movups $inout3,0x30($out) 716 jmp .Lecb_ret 717.align 16 718.Lecb_enc_five: 719 xorps $inout5,$inout5 720 call _aesni_encrypt6 721 movups $inout0,($out) # store 5 output blocks 722 movups $inout1,0x10($out) 723 movups $inout2,0x20($out) 724 movups $inout3,0x30($out) 725 movups $inout4,0x40($out) 726 jmp .Lecb_ret 727.align 16 728.Lecb_enc_six: 729 call _aesni_encrypt6 730 movups $inout0,($out) # store 6 output blocks 731 movups $inout1,0x10($out) 732 movups $inout2,0x20($out) 733 movups $inout3,0x30($out) 734 movups $inout4,0x40($out) 735 movups $inout5,0x50($out) 736 jmp .Lecb_ret 737#--------------------------- ECB DECRYPT ------------------------------# 738.align 16 739.Lecb_decrypt: 740 cmp \$0x80,$len # if ($len<8*16) 741 jb .Lecb_dec_tail # short input 742 743 movdqu ($inp),$inout0 # load 8 input blocks 744 movdqu 0x10($inp),$inout1 745 movdqu 0x20($inp),$inout2 746 movdqu 0x30($inp),$inout3 747 movdqu 0x40($inp),$inout4 748 movdqu 0x50($inp),$inout5 749 movdqu 0x60($inp),$inout6 750 movdqu 0x70($inp),$inout7 751 lea 0x80($inp),$inp # $inp+=8*16 752 sub \$0x80,$len # $len-=8*16 (can be zero) 753 jmp .Lecb_dec_loop8_enter 754.align 16 755.Lecb_dec_loop8: 756 movups $inout0,($out) # store 8 output blocks 757 mov $key_,$key # restore $key 758 movdqu ($inp),$inout0 # load 8 input blocks 759 mov $rnds_,$rounds # restore $rounds 760 movups $inout1,0x10($out) 761 movdqu 0x10($inp),$inout1 762 movups $inout2,0x20($out) 763 movdqu 0x20($inp),$inout2 764 movups $inout3,0x30($out) 765 movdqu 0x30($inp),$inout3 766 movups $inout4,0x40($out) 767 movdqu 0x40($inp),$inout4 768 movups $inout5,0x50($out) 769 movdqu 
0x50($inp),$inout5 770 movups $inout6,0x60($out) 771 movdqu 0x60($inp),$inout6 772 movups $inout7,0x70($out) 773 lea 0x80($out),$out # $out+=8*16 774 movdqu 0x70($inp),$inout7 775 lea 0x80($inp),$inp # $inp+=8*16 776.Lecb_dec_loop8_enter: 777 778 call _aesni_decrypt8 779 780 $movkey ($key_),$rndkey0 781 sub \$0x80,$len 782 jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow 783 784 movups $inout0,($out) # store 8 output blocks 785 pxor $inout0,$inout0 # clear register bank 786 mov $key_,$key # restore $key 787 movups $inout1,0x10($out) 788 pxor $inout1,$inout1 789 mov $rnds_,$rounds # restore $rounds 790 movups $inout2,0x20($out) 791 pxor $inout2,$inout2 792 movups $inout3,0x30($out) 793 pxor $inout3,$inout3 794 movups $inout4,0x40($out) 795 pxor $inout4,$inout4 796 movups $inout5,0x50($out) 797 pxor $inout5,$inout5 798 movups $inout6,0x60($out) 799 pxor $inout6,$inout6 800 movups $inout7,0x70($out) 801 pxor $inout7,$inout7 802 lea 0x80($out),$out # $out+=8*16 803 add \$0x80,$len # restore real remaining $len 804 jz .Lecb_ret # done if ($len==0) 805 806.Lecb_dec_tail: 807 movups ($inp),$inout0 808 cmp \$0x20,$len 809 jb .Lecb_dec_one 810 movups 0x10($inp),$inout1 811 je .Lecb_dec_two 812 movups 0x20($inp),$inout2 813 cmp \$0x40,$len 814 jb .Lecb_dec_three 815 movups 0x30($inp),$inout3 816 je .Lecb_dec_four 817 movups 0x40($inp),$inout4 818 cmp \$0x60,$len 819 jb .Lecb_dec_five 820 movups 0x50($inp),$inout5 821 je .Lecb_dec_six 822 movups 0x60($inp),$inout6 823 $movkey ($key),$rndkey0 824 xorps $inout7,$inout7 825 call _aesni_decrypt8 826 movups $inout0,($out) # store 7 output blocks 827 pxor $inout0,$inout0 # clear register bank 828 movups $inout1,0x10($out) 829 pxor $inout1,$inout1 830 movups $inout2,0x20($out) 831 pxor $inout2,$inout2 832 movups $inout3,0x30($out) 833 pxor $inout3,$inout3 834 movups $inout4,0x40($out) 835 pxor $inout4,$inout4 836 movups $inout5,0x50($out) 837 pxor $inout5,$inout5 838 movups $inout6,0x60($out) 839 pxor $inout6,$inout6 840 pxor 
	$inout7,$inout7
	jmp	.Lecb_ret
.align	16
.Lecb_dec_one:
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	movups	$inout0,($out)			# store one output block
	pxor	$inout0,$inout0			# clear register bank
	jmp	.Lecb_ret
.align	16
.Lecb_dec_two:
	call	_aesni_decrypt2
	movups	$inout0,($out)			# store 2 output blocks
	pxor	$inout0,$inout0			# clear register bank
	movups	$inout1,0x10($out)
	pxor	$inout1,$inout1
	jmp	.Lecb_ret
.align	16
.Lecb_dec_three:
	call	_aesni_decrypt3
	movups	$inout0,($out)			# store 3 output blocks
	pxor	$inout0,$inout0			# clear register bank
	movups	$inout1,0x10($out)
	pxor	$inout1,$inout1
	movups	$inout2,0x20($out)
	pxor	$inout2,$inout2
	jmp	.Lecb_ret
.align	16
.Lecb_dec_four:
	call	_aesni_decrypt4
	movups	$inout0,($out)			# store 4 output blocks
	pxor	$inout0,$inout0			# clear register bank
	movups	$inout1,0x10($out)
	pxor	$inout1,$inout1
	movups	$inout2,0x20($out)
	pxor	$inout2,$inout2
	movups	$inout3,0x30($out)
	pxor	$inout3,$inout3
	jmp	.Lecb_ret
.align	16
.Lecb_dec_five:
	xorps	$inout5,$inout5			# 6th lane unused: decrypt 5 as 6
	call	_aesni_decrypt6
	movups	$inout0,($out)			# store 5 output blocks
	pxor	$inout0,$inout0			# clear register bank
	movups	$inout1,0x10($out)
	pxor	$inout1,$inout1
	movups	$inout2,0x20($out)
	pxor	$inout2,$inout2
	movups	$inout3,0x30($out)
	pxor	$inout3,$inout3
	movups	$inout4,0x40($out)
	pxor	$inout4,$inout4
	pxor	$inout5,$inout5
	jmp	.Lecb_ret
.align	16
.Lecb_dec_six:
	call	_aesni_decrypt6
	movups	$inout0,($out)			# store 6 output blocks
	pxor	$inout0,$inout0			# clear register bank
	movups	$inout1,0x10($out)
	pxor	$inout1,$inout1
	movups	$inout2,0x20($out)
	pxor	$inout2,$inout2
	movups	$inout3,0x30($out)
	pxor	$inout3,$inout3
	movups	$inout4,0x40($out)
	pxor	$inout4,$inout4
	movups	$inout5,0x50($out)
	pxor	$inout5,$inout5

.Lecb_ret:
	# sanitize remaining xmm registers so no key/data material leaks
	xorps	$rndkey0,$rndkey0		# %xmm0
	pxor	$rndkey1,$rndkey1
___
$code.=<<___ if ($win64);
	# Win64: restore non-volatile xmm6-9 and wipe their save slots
	movaps	(%rsp),%xmm6
	movaps	%xmm0,(%rsp)		# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	lea	0x58(%rsp),%rsp
.Lecb_enc_ret:
___
$code.=<<___;
	ret
.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
___

{
######################################################################
# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec,char *cmac);
#
# Handles only complete blocks, operates on 64-bit counter and
# does not update *ivec!  Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
{
my $cmac="%r9";			# 6th argument

my $increment="%xmm9";
my $iv="%xmm6";
my $bswap_mask="%xmm7";

$code.=<<___;
.globl	aesni_ccm64_encrypt_blocks
.type	aesni_ccm64_encrypt_blocks,\@function,6
.align	16
aesni_ccm64_encrypt_blocks:
___
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)		# $iv
	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
	movaps	%xmm8,0x20(%rsp)	# $in0
	movaps	%xmm9,0x30(%rsp)	# $increment
.Lccm64_enc_body:
___
$code.=<<___;
	mov	240($key),$rounds		# key->rounds
	movdqu	($ivp),$iv
	movdqa	.Lincrement64(%rip),$increment
	movdqa	.Lbswap_mask(%rip),$bswap_mask

	shl	\$4,$rounds
	mov	\$16,$rnds_
	lea	0($key),$key_
	movdqu	($cmac),$inout1
	movdqa	$iv,$inout0
	lea	32($key,$rounds),$key		# end of key schedule
	pshufb	$bswap_mask,$iv
	sub	%rax,%r10			# twisted $rounds
	jmp	.Lccm64_enc_outer
.align	16
.Lccm64_enc_outer:
	# each iteration runs CTR encryption ($inout0) and the CBC-MAC
	# update ($inout1) as one interleaved 2-block AES pass
	$movkey	($key_),$rndkey0
	mov	%r10,%rax
	movups	($inp),$in0			# load inp

	xorps	$rndkey0,$inout0		# counter
	$movkey	16($key_),$rndkey1
	xorps	$in0,$rndkey0
	xorps	$rndkey0,$inout1		# cmac^=inp
	$movkey	32($key_),$rndkey0

.Lccm64_enc2_loop:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Lccm64_enc2_loop
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	paddq	$increment,$iv
	dec	$len				# $len-- ($len is in blocks)
	aesenclast	$rndkey0,$inout0
	aesenclast	$rndkey0,$inout1

	lea	16($inp),$inp
	xorps	$inout0,$in0			# inp ^= E(iv)
	movdqa	$iv,$inout0
	movups	$in0,($out)			# save output
	pshufb	$bswap_mask,$inout0
	lea	16($out),$out			# $out+=16
	jnz	.Lccm64_enc_outer		# loop if ($len!=0)

	pxor	$rndkey0,$rndkey0		# clear register bank
	pxor	$rndkey1,$rndkey1
	pxor	$inout0,$inout0
	movups	$inout1,($cmac)			# store resulting mac
	pxor	$inout1,$inout1
	pxor	$in0,$in0
	pxor	$iv,$iv
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	%xmm0,(%rsp)		# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	lea	0x58(%rsp),%rsp
.Lccm64_enc_ret:
___
$code.=<<___;
	ret
.size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
___
######################################################################
$code.=<<___;
.globl	aesni_ccm64_decrypt_blocks
.type	aesni_ccm64_decrypt_blocks,\@function,6
.align	16
aesni_ccm64_decrypt_blocks:
___
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)		# $iv
	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
	movaps	%xmm8,0x20(%rsp)	# $in8
	movaps	%xmm9,0x30(%rsp)	# $increment
.Lccm64_dec_body:
___
$code.=<<___;
	mov	240($key),$rounds		# key->rounds
	movups	($ivp),$iv
	movdqu	($cmac),$inout1
	movdqa	.Lincrement64(%rip),$increment
	movdqa	.Lbswap_mask(%rip),$bswap_mask

	movaps	$iv,$inout0
	mov	$rounds,$rnds_
	mov	$key,$key_
	pshufb	$bswap_mask,$iv
___
	# first CTR block is computed stand-alone: its CBC-MAC input (the
	# plaintext) only becomes available after this decryption
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	shl	\$4,$rnds_
	mov	\$16,$rounds
	movups	($inp),$in0			# load inp
	paddq	$increment,$iv
	lea	16($inp),$inp			# $inp+=16
	sub	%r10,%rax			# twisted $rounds
	lea	32($key_,$rnds_),$key		# end of key schedule
	mov	%rax,%r10
	jmp	.Lccm64_dec_outer
.align	16
.Lccm64_dec_outer:
	xorps	$inout0,$in0			# inp ^= E(iv)
	movdqa	$iv,$inout0
	movups	$in0,($out)			# save output
	lea	16($out),$out			# $out+=16
	pshufb	$bswap_mask,$inout0

	sub	\$1,$len			# $len-- ($len is in blocks)
	jz	.Lccm64_dec_break		# if ($len==0) break

	$movkey	($key_),$rndkey0
	mov	%r10,%rax
	$movkey	16($key_),$rndkey1
	xorps	$rndkey0,$in0
	xorps	$rndkey0,$inout0
	xorps	$in0,$inout1			# cmac^=out
	$movkey	32($key_),$rndkey0
	jmp	.Lccm64_dec2_loop
.align	16
.Lccm64_dec2_loop:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Lccm64_dec2_loop
	movups	($inp),$in0			# load input
	paddq	$increment,$iv
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenclast	$rndkey0,$inout0
	aesenclast	$rndkey0,$inout1
	lea	16($inp),$inp			# $inp+=16
	jmp	.Lccm64_dec_outer

.align	16
.Lccm64_dec_break:
	# final CBC-MAC iteration; the xor below is folded into
	# aesni_generate1 via its extra argument
	#xorps	$in0,$inout1			# cmac^=out
	mov	240($key_),$rounds
___
	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
$code.=<<___;
	pxor	$rndkey0,$rndkey0		# clear register bank
	pxor	$rndkey1,$rndkey1
	pxor	$inout0,$inout0
	movups	$inout1,($cmac)			# store resulting mac
	pxor	$inout1,$inout1
	pxor	$in0,$in0
	pxor	$iv,$iv
___
$code.=<<___ if ($win64);
	# Win64: restore non-volatile xmm6-9 and wipe their save slots
	movaps	(%rsp),%xmm6
	movaps	%xmm0,(%rsp)		# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	lea	0x58(%rsp),%rsp
.Lccm64_dec_ret:
___
$code.=<<___;
	ret
.size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
___
}
######################################################################
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec!  (see crypto/modes/ctr128.c for details)
#
# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
# Keywords are full unroll and modulo-schedule counter calculations
# with zero-round key xor.
1159{ 1160my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15)); 1161my ($key0,$ctr)=("${key_}d","${ivp}d"); 1162my $frame_size = 0x80 + ($win64?160:0); 1163 1164$code.=<<___; 1165.globl aesni_ctr32_encrypt_blocks 1166.type aesni_ctr32_encrypt_blocks,\@function,5 1167.align 16 1168aesni_ctr32_encrypt_blocks: 1169 cmp \$1,$len 1170 jne .Lctr32_bulk 1171 1172 # handle single block without allocating stack frame, 1173 # useful when handling edges 1174 movups ($ivp),$inout0 1175 movups ($inp),$inout1 1176 mov 240($key),%edx # key->rounds 1177___ 1178 &aesni_generate1("enc",$key,"%edx"); 1179$code.=<<___; 1180 pxor $rndkey0,$rndkey0 # clear register bank 1181 pxor $rndkey1,$rndkey1 1182 xorps $inout1,$inout0 1183 pxor $inout1,$inout1 1184 movups $inout0,($out) 1185 xorps $inout0,$inout0 1186 jmp .Lctr32_epilogue 1187 1188.align 16 1189.Lctr32_bulk: 1190 lea (%rsp),%rax 1191 push %rbp 1192 sub \$$frame_size,%rsp 1193 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 1194___ 1195$code.=<<___ if ($win64); 1196 movaps %xmm6,-0xa8(%rax) # offload everything 1197 movaps %xmm7,-0x98(%rax) 1198 movaps %xmm8,-0x88(%rax) 1199 movaps %xmm9,-0x78(%rax) 1200 movaps %xmm10,-0x68(%rax) 1201 movaps %xmm11,-0x58(%rax) 1202 movaps %xmm12,-0x48(%rax) 1203 movaps %xmm13,-0x38(%rax) 1204 movaps %xmm14,-0x28(%rax) 1205 movaps %xmm15,-0x18(%rax) 1206.Lctr32_body: 1207___ 1208$code.=<<___; 1209 lea -8(%rax),%rbp 1210 1211 # 8 16-byte words on top of stack are counter values 1212 # xor-ed with zero-round key 1213 1214 movdqu ($ivp),$inout0 1215 movdqu ($key),$rndkey0 1216 mov 12($ivp),$ctr # counter LSB 1217 pxor $rndkey0,$inout0 1218 mov 12($key),$key0 # 0-round key LSB 1219 movdqa $inout0,0x00(%rsp) # populate counter block 1220 bswap $ctr 1221 movdqa $inout0,$inout1 1222 movdqa $inout0,$inout2 1223 movdqa $inout0,$inout3 1224 movdqa $inout0,0x40(%rsp) 1225 movdqa $inout0,0x50(%rsp) 1226 movdqa $inout0,0x60(%rsp) 1227 mov %rdx,%r10 # about to borrow %rdx 1228 movdqa 
$inout0,0x70(%rsp) 1229 1230 lea 1($ctr),%rax 1231 lea 2($ctr),%rdx 1232 bswap %eax 1233 bswap %edx 1234 xor $key0,%eax 1235 xor $key0,%edx 1236 pinsrd \$3,%eax,$inout1 1237 lea 3($ctr),%rax 1238 movdqa $inout1,0x10(%rsp) 1239 pinsrd \$3,%edx,$inout2 1240 bswap %eax 1241 mov %r10,%rdx # restore %rdx 1242 lea 4($ctr),%r10 1243 movdqa $inout2,0x20(%rsp) 1244 xor $key0,%eax 1245 bswap %r10d 1246 pinsrd \$3,%eax,$inout3 1247 xor $key0,%r10d 1248 movdqa $inout3,0x30(%rsp) 1249 lea 5($ctr),%r9 1250 mov %r10d,0x40+12(%rsp) 1251 bswap %r9d 1252 lea 6($ctr),%r10 1253 mov 240($key),$rounds # key->rounds 1254 xor $key0,%r9d 1255 bswap %r10d 1256 mov %r9d,0x50+12(%rsp) 1257 xor $key0,%r10d 1258 lea 7($ctr),%r9 1259 mov %r10d,0x60+12(%rsp) 1260 bswap %r9d 1261 mov OPENSSL_ia32cap_P+4(%rip),%r10d 1262 xor $key0,%r9d 1263 and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE 1264 mov %r9d,0x70+12(%rsp) 1265 1266 $movkey 0x10($key),$rndkey1 1267 1268 movdqa 0x40(%rsp),$inout4 1269 movdqa 0x50(%rsp),$inout5 1270 1271 cmp \$8,$len # $len is in blocks 1272 jb .Lctr32_tail # short input if ($len<8) 1273 1274 sub \$6,$len # $len is biased by -6 1275 cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE 1276 je .Lctr32_6x # [which denotes Atom Silvermont] 1277 1278 lea 0x80($key),$key # size optimization 1279 sub \$2,$len # $len is biased by -8 1280 jmp .Lctr32_loop8 1281 1282.align 16 1283.Lctr32_6x: 1284 shl \$4,$rounds 1285 mov \$48,$rnds_ 1286 bswap $key0 1287 lea 32($key,$rounds),$key # end of key schedule 1288 sub %rax,%r10 # twisted $rounds 1289 jmp .Lctr32_loop6 1290 1291.align 16 1292.Lctr32_loop6: 1293 add \$6,$ctr # next counter value 1294 $movkey -48($key,$rnds_),$rndkey0 1295 aesenc $rndkey1,$inout0 1296 mov $ctr,%eax 1297 xor $key0,%eax 1298 aesenc $rndkey1,$inout1 1299 movbe %eax,`0x00+12`(%rsp) # store next counter value 1300 lea 1($ctr),%eax 1301 aesenc $rndkey1,$inout2 1302 xor $key0,%eax 1303 movbe %eax,`0x10+12`(%rsp) 1304 aesenc $rndkey1,$inout3 1305 lea 2($ctr),%eax 1306 
	 xor	$key0,%eax
	aesenc	$rndkey1,$inout4
	 movbe	%eax,`0x20+12`(%rsp)
	 lea	3($ctr),%eax
	aesenc	$rndkey1,$inout5
	$movkey	-32($key,$rnds_),$rndkey1
	 xor	$key0,%eax

	aesenc	$rndkey0,$inout0
	 movbe	%eax,`0x30+12`(%rsp)
	 lea	4($ctr),%eax
	aesenc	$rndkey0,$inout1
	 xor	$key0,%eax
	 movbe	%eax,`0x40+12`(%rsp)
	aesenc	$rndkey0,$inout2
	 lea	5($ctr),%eax
	 xor	$key0,%eax
	aesenc	$rndkey0,$inout3
	 movbe	%eax,`0x50+12`(%rsp)
	 mov	%r10,%rax		# mov	$rnds_,$rounds
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	$movkey	-16($key,$rnds_),$rndkey0

	call	.Lenc_loop6

	movdqu	($inp),$inout6		# load 6 input blocks
	movdqu	0x10($inp),$inout7
	movdqu	0x20($inp),$in0
	movdqu	0x30($inp),$in1
	movdqu	0x40($inp),$in2
	movdqu	0x50($inp),$in3
	lea	0x60($inp),$inp		# $inp+=6*16
	$movkey	-64($key,$rnds_),$rndkey1
	pxor	$inout0,$inout6		# inp^=E(ctr)
	movaps	0x00(%rsp),$inout0	# load next counter [xor-ed with 0 round]
	pxor	$inout1,$inout7
	movaps	0x10(%rsp),$inout1
	pxor	$inout2,$in0
	movaps	0x20(%rsp),$inout2
	pxor	$inout3,$in1
	movaps	0x30(%rsp),$inout3
	pxor	$inout4,$in2
	movaps	0x40(%rsp),$inout4
	pxor	$inout5,$in3
	movaps	0x50(%rsp),$inout5
	movdqu	$inout6,($out)		# store 6 output blocks
	movdqu	$inout7,0x10($out)
	movdqu	$in0,0x20($out)
	movdqu	$in1,0x30($out)
	movdqu	$in2,0x40($out)
	movdqu	$in3,0x50($out)
	lea	0x60($out),$out		# $out+=6*16

	sub	\$6,$len
	jnc	.Lctr32_loop6		# loop if $len-=6 didn't borrow

	add	\$6,$len		# restore real remaining $len
	jz	.Lctr32_done		# done if ($len==0)

	lea	-48($rnds_),$rounds
	lea	-80($key,$rnds_),$key	# restore $key
	neg	$rounds
	shr	\$4,$rounds		# restore $rounds
	jmp	.Lctr32_tail

.align	32
.Lctr32_loop8:
	# fully unrolled 8-wide path; next counter values are folded into
	# the stack blocks while the current 8 blocks go through AES rounds
	 add	\$8,$ctr		# next counter value
	movdqa	0x60(%rsp),$inout6
	aesenc	$rndkey1,$inout0
	 mov	$ctr,%r9d
	movdqa	0x70(%rsp),$inout7
	aesenc	$rndkey1,$inout1
	 bswap	%r9d
	$movkey	0x20-0x80($key),$rndkey0
	aesenc	$rndkey1,$inout2
	 xor	$key0,%r9d
	 nop
	aesenc	$rndkey1,$inout3
	 mov	%r9d,0x00+12(%rsp)	# store next counter value
	 lea	1($ctr),%r9
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	$movkey	0x30-0x80($key),$rndkey1
___
for($i=2;$i<8;$i++) {
my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
$code.=<<___;
	 bswap	%r9d
	aesenc	$rndkeyx,$inout0
	aesenc	$rndkeyx,$inout1
	 xor	$key0,%r9d
	 .byte	0x66,0x90
	aesenc	$rndkeyx,$inout2
	aesenc	$rndkeyx,$inout3
	 mov	%r9d,`0x10*($i-1)`+12(%rsp)
	 lea	$i($ctr),%r9
	aesenc	$rndkeyx,$inout4
	aesenc	$rndkeyx,$inout5
	aesenc	$rndkeyx,$inout6
	aesenc	$rndkeyx,$inout7
	$movkey	`0x20+0x10*$i`-0x80($key),$rndkeyx
___
}
$code.=<<___;
	 bswap	%r9d
	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	 xor	$key0,%r9d
	 movdqu	0x00($inp),$in0		# start loading input
	aesenc	$rndkey0,$inout3
	 mov	%r9d,0x70+12(%rsp)
	 cmp	\$11,$rounds
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	aesenc	$rndkey0,$inout6
	aesenc	$rndkey0,$inout7
	$movkey	0xa0-0x80($key),$rndkey0

	jb	.Lctr32_enc_done

	# extra rounds for 192-bit keys
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	$movkey	0xb0-0x80($key),$rndkey1

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	aesenc	$rndkey0,$inout6
	aesenc	$rndkey0,$inout7
	$movkey	0xc0-0x80($key),$rndkey0
	je	.Lctr32_enc_done

	# extra rounds for 256-bit keys
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	$movkey	0xd0-0x80($key),$rndkey1

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	aesenc	$rndkey0,$inout6
	aesenc	$rndkey0,$inout7
	$movkey	0xe0-0x80($key),$rndkey0
	jmp	.Lctr32_enc_done

.align	16
.Lctr32_enc_done:
	# last-round keys are xor-ed into the input blocks so the final
	# xor can be fused into aesenclast
	movdqu	0x10($inp),$in1
	pxor	$rndkey0,$in0		# input^=round[last]
	movdqu	0x20($inp),$in2
	pxor	$rndkey0,$in1
	movdqu	0x30($inp),$in3
	pxor	$rndkey0,$in2
	movdqu	0x40($inp),$in4
	pxor	$rndkey0,$in3
	movdqu	0x50($inp),$in5
	pxor	$rndkey0,$in4
	pxor	$rndkey0,$in5
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	movdqu	0x60($inp),$rndkey1	# borrow $rndkey1 for inp[6]
	lea	0x80($inp),$inp		# $inp+=8*16

	aesenclast	$in0,$inout0	# $inN is inp[N]^round[last]
	pxor	$rndkey0,$rndkey1	# borrowed $rndkey
	movdqu	0x70-0x80($inp),$in0
	aesenclast	$in1,$inout1
	pxor	$rndkey0,$in0
	movdqa	0x00(%rsp),$in1		# load next counter block
	aesenclast	$in2,$inout2
	aesenclast	$in3,$inout3
	movdqa	0x10(%rsp),$in2
	movdqa	0x20(%rsp),$in3
	aesenclast	$in4,$inout4
	aesenclast	$in5,$inout5
	movdqa	0x30(%rsp),$in4
	movdqa	0x40(%rsp),$in5
	aesenclast	$rndkey1,$inout6
	movdqa	0x50(%rsp),$rndkey0
	$movkey	0x10-0x80($key),$rndkey1	#real 1st-round key
	aesenclast	$in0,$inout7

	movups	$inout0,($out)		# store 8 output blocks
	movdqa	$in1,$inout0
	movups	$inout1,0x10($out)
	movdqa	$in2,$inout1
	movups	$inout2,0x20($out)
	movdqa	$in3,$inout2
	movups	$inout3,0x30($out)
	movdqa	$in4,$inout3
	movups	$inout4,0x40($out)
	movdqa	$in5,$inout4
	movups	$inout5,0x50($out)
	movdqa	$rndkey0,$inout5
	movups	$inout6,0x60($out)
	movups	$inout7,0x70($out)
	lea	0x80($out),$out		# $out+=8*16

	sub	\$8,$len
	jnc	.Lctr32_loop8		# loop if $len-=8 didn't borrow

	add	\$8,$len		# restore real remaining $len
	jz	.Lctr32_done		# done if ($len==0)
	lea	-0x80($key),$key

.Lctr32_tail:
	# note that at this point $inout0..5 are populated with
	# counter values xor-ed with 0-round key
	lea	16($key),$key
	cmp	\$4,$len
	jb	.Lctr32_loop3
	je	.Lctr32_loop4

	# if ($len>4) compute 7 E(counter)
	shl	\$4,$rounds
	movdqa	0x60(%rsp),$inout6
	pxor	$inout7,$inout7

	$movkey	16($key),$rndkey0
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	lea	32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
	neg	%rax
	aesenc	$rndkey1,$inout2
	add	\$16,%rax		# prepare for .Lenc_loop8_enter
	movups	($inp),$in0
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	movups	0x10($inp),$in1		# pre-load input
	movups	0x20($inp),$in2
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6

	call	.Lenc_loop8_enter

	movdqu	0x30($inp),$in3
	pxor	$in0,$inout0
	movdqu	0x40($inp),$in0
	pxor	$in1,$inout1
	movdqu	$inout0,($out)		# store output
	pxor	$in2,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$in3,$inout3
	movdqu	$inout2,0x20($out)
	pxor	$in0,$inout4
	movdqu	$inout3,0x30($out)
	movdqu	$inout4,0x40($out)
	cmp	\$6,$len
	jb	.Lctr32_done		# $len was 5, stop store

	movups	0x50($inp),$in1
	xorps	$in1,$inout5
	movups	$inout5,0x50($out)
	je	.Lctr32_done		# $len was 6, stop store

	movups	0x60($inp),$in2
	xorps	$in2,$inout6
	movups	$inout6,0x60($out)
	jmp	.Lctr32_done		# $len was 7, stop store

.align	32
.Lctr32_loop4:
	aesenc	$rndkey1,$inout0
	lea	16($key),$key
	dec	$rounds
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	$movkey	($key),$rndkey1
	jnz	.Lctr32_loop4
	aesenclast	$rndkey1,$inout0
	aesenclast	$rndkey1,$inout1
	movups	($inp),$in0		# load input
	movups	0x10($inp),$in1
	aesenclast	$rndkey1,$inout2
	aesenclast	$rndkey1,$inout3
	movups	0x20($inp),$in2
	movups	0x30($inp),$in3

	xorps	$in0,$inout0
	movups	$inout0,($out)		# store output
	xorps	$in1,$inout1
	movups	$inout1,0x10($out)
	pxor	$in2,$inout2
	movdqu	$inout2,0x20($out)
	pxor	$in3,$inout3
	movdqu	$inout3,0x30($out)
	jmp	.Lctr32_done		# $len was 4, stop store

.align	32
.Lctr32_loop3:
	aesenc	$rndkey1,$inout0
	lea	16($key),$key
	dec	$rounds
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	$movkey	($key),$rndkey1
	jnz	.Lctr32_loop3
	aesenclast	$rndkey1,$inout0
	aesenclast	$rndkey1,$inout1
	aesenclast	$rndkey1,$inout2

	movups	($inp),$in0		# load input
	xorps	$in0,$inout0
	movups	$inout0,($out)		# store output
	cmp	\$2,$len
	jb	.Lctr32_done		# $len was 1, stop store

	movups	0x10($inp),$in1
	xorps	$in1,$inout1
	movups	$inout1,0x10($out)
	je	.Lctr32_done		# $len was 2, stop store

	movups	0x20($inp),$in2
	xorps	$in2,$inout2
	movups	$inout2,0x20($out)	# $len was 3, stop store

.Lctr32_done:
	xorps	%xmm0,%xmm0		# clear register bank
	xor	$key0,$key0
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	movaps	%xmm0,0x00(%rsp)	# clear stack
	pxor	%xmm8,%xmm8
	movaps	%xmm0,0x10(%rsp)
	pxor	%xmm9,%xmm9
	movaps	%xmm0,0x20(%rsp)
	pxor	%xmm10,%xmm10
	movaps	%xmm0,0x30(%rsp)
	pxor	%xmm11,%xmm11
	movaps	%xmm0,0x40(%rsp)
	pxor	%xmm12,%xmm12
	movaps	%xmm0,0x50(%rsp)
	pxor	%xmm13,%xmm13
	movaps	%xmm0,0x60(%rsp)
	pxor	%xmm14,%xmm14
	movaps	%xmm0,0x70(%rsp)
	pxor	%xmm15,%xmm15
___
$code.=<<___ if ($win64);
	# Win64: restore non-volatile xmm6-15 and wipe both the save area
	# and the counter-block scratch area
	movaps	-0xa0(%rbp),%xmm6
	movaps	%xmm0,-0xa0(%rbp)	# clear stack
	movaps	-0x90(%rbp),%xmm7
	movaps	%xmm0,-0x90(%rbp)
	movaps	-0x80(%rbp),%xmm8
	movaps	%xmm0,-0x80(%rbp)
	movaps	-0x70(%rbp),%xmm9
	movaps	%xmm0,-0x70(%rbp)
	movaps	-0x60(%rbp),%xmm10
	movaps	%xmm0,-0x60(%rbp)
	movaps	-0x50(%rbp),%xmm11
	movaps	%xmm0,-0x50(%rbp)
	movaps	-0x40(%rbp),%xmm12
	movaps	%xmm0,-0x40(%rbp)
	movaps	-0x30(%rbp),%xmm13
	movaps	%xmm0,-0x30(%rbp)
	movaps	-0x20(%rbp),%xmm14
	movaps	%xmm0,-0x20(%rbp)
	movaps	-0x10(%rbp),%xmm15
	movaps	%xmm0,-0x10(%rbp)
	movaps	%xmm0,0x00(%rsp)
	movaps	%xmm0,0x10(%rsp)
	movaps	%xmm0,0x20(%rsp)
	movaps	%xmm0,0x30(%rsp)
	movaps	%xmm0,0x40(%rsp)
	movaps	%xmm0,0x50(%rsp)
	movaps	%xmm0,0x60(%rsp)
	movaps	%xmm0,0x70(%rsp)
___
$code.=<<___;
	lea	(%rbp),%rsp
	pop	%rbp
.Lctr32_epilogue:
	ret
.size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
___
}

######################################################################
# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#			const AES_KEY *key1, const AES_KEY *key2
#			const unsigned char iv[16]);
#
{
my @tweak=map("%xmm$_",(10..15));
my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
my $frame_size = 0x70 + ($win64?160:0);

$code.=<<___;
.globl	aesni_xts_encrypt
.type	aesni_xts_encrypt,\@function,6
.align	16
aesni_xts_encrypt:
	lea	(%rsp),%rax
	push	%rbp
	sub	\$$frame_size,%rsp
	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
	movaps	%xmm6,-0xa8(%rax)		# offload everything
	movaps	%xmm7,-0x98(%rax)
	movaps	%xmm8,-0x88(%rax)
	movaps	%xmm9,-0x78(%rax)
	movaps	%xmm10,-0x68(%rax)
	movaps	%xmm11,-0x58(%rax)
	movaps	%xmm12,-0x48(%rax)
	movaps	%xmm13,-0x38(%rax)
	movaps	%xmm14,-0x28(%rax)
	movaps	%xmm15,-0x18(%rax)
.Lxts_enc_body:
___
$code.=<<___;
	lea	-8(%rax),%rbp
	movups	($ivp),$inout0			# load clear-text tweak
	mov	240(%r8),$rounds		# key2->rounds
	mov	240($key),$rnds_		# key1->rounds
___
	# generate the tweak: encrypt the IV with key2
	&aesni_generate1("enc",$key2,$rounds,$inout0);
$code.=<<___;
	$movkey	($key),$rndkey0		# zero round key
	mov	$key,$key_			# backup $key
	mov	$rnds_,$rounds			# backup $rounds
	shl	\$4,$rnds_
	mov	$len,$len_			# backup $len
	and	\$-16,$len

	$movkey	16($key,$rnds_),$rndkey1	# last round key

	movdqa	.Lxts_magic(%rip),$twmask
	movdqa	$inout0,@tweak[5]
	pshufd	\$0x5f,$inout0,$twres
	pxor	$rndkey0,$rndkey1
___
	# alternative tweak calculation algorithm is based on suggestions
	# by Shay Gueron. psrad doesn't conflict with AES-NI instructions
	# and should help in the future...
    # derive tweak[0..3] from tweak[5] by repeated GF(2^128) doubling,
    # pre-xoring each with the zero-round key
    for ($i=0;$i<4;$i++) {
    $code.=<<___;
	movdqa	$twres,$twtmp
	paddd	$twres,$twres
	movdqa	@tweak[5],@tweak[$i]
	psrad	\$31,$twtmp		# broadcast upper bits
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	pxor	$rndkey0,@tweak[$i]
	pxor	$twtmp,@tweak[5]
___
    }
$code.=<<___;
	movdqa	@tweak[5],@tweak[4]
	psrad	\$31,$twres
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twres
	pxor	$rndkey0,@tweak[4]
	pxor	$twres,@tweak[5]
	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]

	sub	\$16*6,$len
	jc	.Lxts_enc_short			# if $len-=6*16 borrowed

	mov	\$16+96,$rounds
	lea	32($key_,$rnds_),$key		# end of key schedule
	sub	%r10,%rax			# twisted $rounds
	$movkey	16($key_),$rndkey1
	mov	%rax,%r10			# backup twisted $rounds
	lea	.Lxts_magic(%rip),%r8
	jmp	.Lxts_enc_grandloop

.align	32
.Lxts_enc_grandloop:
	# 6-block main loop: next 6 tweaks are computed while the current
	# 6 blocks go through the AES rounds
	movdqu	`16*0`($inp),$inout0	# load input
	movdqa	$rndkey0,$twmask
	movdqu	`16*1`($inp),$inout1
	pxor	@tweak[0],$inout0	# input^=tweak^round[0]
	movdqu	`16*2`($inp),$inout2
	pxor	@tweak[1],$inout1
	 aesenc	$rndkey1,$inout0
	movdqu	`16*3`($inp),$inout3
	pxor	@tweak[2],$inout2
	 aesenc	$rndkey1,$inout1
	movdqu	`16*4`($inp),$inout4
	pxor	@tweak[3],$inout3
	 aesenc	$rndkey1,$inout2
	movdqu	`16*5`($inp),$inout5
	pxor	@tweak[5],$twmask	# round[0]^=tweak[5]
	 movdqa	0x60(%rsp),$twres	# load round[0]^round[last]
	pxor	@tweak[4],$inout4
	 aesenc	$rndkey1,$inout3
	$movkey	32($key_),$rndkey0
	lea	`16*6`($inp),$inp
	pxor	$twmask,$inout5

	 pxor	$twres,@tweak[0]	# calculate tweaks^round[last]
	aesenc	$rndkey1,$inout4
	 pxor	$twres,@tweak[1]
	 movdqa	@tweak[0],`16*0`(%rsp)	# put aside tweaks^round[last]
	aesenc	$rndkey1,$inout5
	$movkey	48($key_),$rndkey1
	 pxor	$twres,@tweak[2]

	aesenc	$rndkey0,$inout0
	 pxor	$twres,@tweak[3]
	 movdqa	@tweak[1],`16*1`(%rsp)
	aesenc	$rndkey0,$inout1
	 pxor	$twres,@tweak[4]
	 movdqa	@tweak[2],`16*2`(%rsp)
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	 pxor	$twres,$twmask
	 movdqa	@tweak[4],`16*4`(%rsp)
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	$movkey	64($key_),$rndkey0
	 movdqa	$twmask,`16*5`(%rsp)
	pshufd	\$0x5f,@tweak[5],$twres
	jmp	.Lxts_enc_loop6
.align	32
.Lxts_enc_loop6:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	$movkey	-64($key,%rax),$rndkey1
	add	\$32,%rax

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	$movkey	-80($key,%rax),$rndkey0
	jnz	.Lxts_enc_loop6

	movdqa	(%r8),$twmask		# start calculating next tweak
	movdqa	$twres,$twtmp
	paddd	$twres,$twres
	 aesenc	$rndkey1,$inout0
	paddq	@tweak[5],@tweak[5]
	psrad	\$31,$twtmp
	 aesenc	$rndkey1,$inout1
	pand	$twmask,$twtmp
	$movkey	($key_),@tweak[0]	# load round[0]
	 aesenc	$rndkey1,$inout2
	 aesenc	$rndkey1,$inout3
	 aesenc	$rndkey1,$inout4
	pxor	$twtmp,@tweak[5]
	movaps	@tweak[0],@tweak[1]	# copy round[0]
	 aesenc	$rndkey1,$inout5
	 $movkey	-64($key),$rndkey1

	movdqa	$twres,$twtmp
	 aesenc	$rndkey0,$inout0
	paddd	$twres,$twres
	pxor	@tweak[5],@tweak[0]
	 aesenc	$rndkey0,$inout1
	psrad	\$31,$twtmp
	paddq	@tweak[5],@tweak[5]
	 aesenc	$rndkey0,$inout2
	 aesenc	$rndkey0,$inout3
	pand	$twmask,$twtmp
	movaps	@tweak[1],@tweak[2]
	 aesenc	$rndkey0,$inout4
	pxor	$twtmp,@tweak[5]
	movdqa	$twres,$twtmp
	 aesenc	$rndkey0,$inout5
	 $movkey	-48($key),$rndkey0

	paddd	$twres,$twres
	 aesenc	$rndkey1,$inout0
	pxor	@tweak[5],@tweak[1]
	psrad	\$31,$twtmp
	 aesenc	$rndkey1,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	 aesenc	$rndkey1,$inout2
	 aesenc	$rndkey1,$inout3
	 movdqa	@tweak[3],`16*3`(%rsp)
	pxor	$twtmp,@tweak[5]
	 aesenc	$rndkey1,$inout4
	movaps	@tweak[2],@tweak[3]
	movdqa	$twres,$twtmp
	 aesenc	$rndkey1,$inout5
	 $movkey	-32($key),$rndkey1

	paddd	$twres,$twres
	 aesenc	$rndkey0,$inout0
	pxor	@tweak[5],@tweak[2]
	psrad	\$31,$twtmp
	 aesenc	$rndkey0,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	 aesenc	$rndkey0,$inout2
	 aesenc	$rndkey0,$inout3
	 aesenc	$rndkey0,$inout4
	pxor	$twtmp,@tweak[5]
	movaps	@tweak[3],@tweak[4]
	 aesenc	$rndkey0,$inout5

	movdqa	$twres,$rndkey0
	paddd	$twres,$twres
	 aesenc	$rndkey1,$inout0
	pxor	@tweak[5],@tweak[3]
	psrad	\$31,$rndkey0
	 aesenc	$rndkey1,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$rndkey0
	 aesenc	$rndkey1,$inout2
	 aesenc	$rndkey1,$inout3
	pxor	$rndkey0,@tweak[5]
	$movkey	($key_),$rndkey0
	 aesenc	$rndkey1,$inout4
	 aesenc	$rndkey1,$inout5
	$movkey	16($key_),$rndkey1

	pxor	@tweak[5],@tweak[4]
	 aesenclast	`16*0`(%rsp),$inout0
	psrad	\$31,$twres
	paddq	@tweak[5],@tweak[5]
	 aesenclast	`16*1`(%rsp),$inout1
	 aesenclast	`16*2`(%rsp),$inout2
	pand	$twmask,$twres
	mov	%r10,%rax		# restore $rounds
	 aesenclast	`16*3`(%rsp),$inout3
	 aesenclast	`16*4`(%rsp),$inout4
	 aesenclast	`16*5`(%rsp),$inout5
	pxor	$twres,@tweak[5]

	lea	`16*6`($out),$out	# $out+=6*16
	movups	$inout0,`-16*6`($out)	# store 6 output blocks
	movups	$inout1,`-16*5`($out)
	movups	$inout2,`-16*4`($out)
	movups	$inout3,`-16*3`($out)
	movups	$inout4,`-16*2`($out)
	movups	$inout5,`-16*1`($out)
	sub	\$16*6,$len
	jnc	.Lxts_enc_grandloop	# loop if $len-=6*16 didn't borrow

	mov	\$16+96,$rounds
	sub	$rnds_,$rounds
	mov	$key_,$key		# restore $key
	shr	\$4,$rounds		# restore original value

.Lxts_enc_short:
	# at the point @tweak[0..5] are populated with tweak values
	mov	$rounds,$rnds_		# backup $rounds
	pxor	$rndkey0,@tweak[0]
	add	\$16*6,$len		# restore real remaining $len
	jz	.Lxts_enc_done		# done if ($len==0)

	pxor	$rndkey0,@tweak[1]
	cmp	\$0x20,$len
	jb	.Lxts_enc_one		# $len is 1*16
	pxor	$rndkey0,@tweak[2]
	je	.Lxts_enc_two		# $len is 2*16

	pxor	$rndkey0,@tweak[3]
	cmp	\$0x40,$len
	jb	.Lxts_enc_three		# $len is 3*16
	pxor	$rndkey0,@tweak[4]
	je	.Lxts_enc_four		# $len is 4*16

	movdqu	($inp),$inout0		# $len is 5*16
	movdqu	16*1($inp),$inout1
	movdqu	16*2($inp),$inout2
	pxor	@tweak[0],$inout0
	movdqu	16*3($inp),$inout3
	pxor	@tweak[1],$inout1
	movdqu	16*4($inp),$inout4
	lea	16*5($inp),$inp		# $inp+=5*16
	pxor	@tweak[2],$inout2
	pxor	@tweak[3],$inout3
	pxor	@tweak[4],$inout4
	pxor	$inout5,$inout5

	call	_aesni_encrypt6

	xorps	@tweak[0],$inout0
	movdqa	@tweak[5],@tweak[0]
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	movdqu	$inout0,($out)		# store 5 output blocks
	xorps	@tweak[3],$inout3
	movdqu	$inout1,16*1($out)
	xorps	@tweak[4],$inout4
	movdqu	$inout2,16*2($out)
	movdqu	$inout3,16*3($out)
	movdqu	$inout4,16*4($out)
	lea	16*5($out),$out		# $out+=5*16
	jmp	.Lxts_enc_done

.align	16
.Lxts_enc_one:
	movups	($inp),$inout0
	lea	16*1($inp),$inp		# inp+=1*16
	xorps	@tweak[0],$inout0
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movdqa	@tweak[1],@tweak[0]
	movups	$inout0,($out)		# store one output block
	lea	16*1($out),$out		# $out+=1*16
	jmp	.Lxts_enc_done

.align	16
.Lxts_enc_two:
	movups	($inp),$inout0
	movups	16($inp),$inout1
	lea	32($inp),$inp		# $inp+=2*16
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1

	call	_aesni_encrypt2

	xorps	@tweak[0],$inout0
	movdqa	@tweak[2],@tweak[0]
	xorps	@tweak[1],$inout1
	movups	$inout0,($out)		# store 2 output blocks
	movups	$inout1,16*1($out)
	lea	16*2($out),$out		# $out+=2*16
	jmp	.Lxts_enc_done

.align	16
.Lxts_enc_three:
	movups	($inp),$inout0
	movups	16*1($inp),$inout1
	movups	16*2($inp),$inout2
	lea	16*3($inp),$inp		# $inp+=3*16
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2

	call	_aesni_encrypt3

	xorps	@tweak[0],$inout0
	movdqa	@tweak[3],@tweak[0]
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	movups	$inout0,($out)		# store 3 output blocks
	movups	$inout1,16*1($out)
	movups	$inout2,16*2($out)
	lea	16*3($out),$out		# $out+=3*16
	jmp	.Lxts_enc_done

.align	16
.Lxts_enc_four:
	movups	($inp),$inout0
	movups	16*1($inp),$inout1
	movups	16*2($inp),$inout2
	xorps	@tweak[0],$inout0
	movups	16*3($inp),$inout3
	lea	16*4($inp),$inp		# $inp+=4*16
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	xorps	@tweak[3],$inout3

	call	_aesni_encrypt4

	pxor	@tweak[0],$inout0
	movdqa	@tweak[4],@tweak[0]
	pxor	@tweak[1],$inout1
	pxor	@tweak[2],$inout2
	movdqu	$inout0,($out)		# store 4 output blocks
	pxor	@tweak[3],$inout3
	movdqu	$inout1,16*1($out)
	movdqu	$inout2,16*2($out)
	movdqu	$inout3,16*3($out)
	lea	16*4($out),$out		# $out+=4*16
	jmp	.Lxts_enc_done

.align	16
.Lxts_enc_done:
	and	\$15,$len_		# see if $len%16 is 0
	jz	.Lxts_enc_ret
	mov	$len_,$len

.Lxts_enc_steal:
	# ciphertext stealing for the trailing partial block
	movzb	($inp),%eax		# borrow $rounds ...
	movzb	-16($out),%ecx		# ... and $key
	lea	1($inp),$inp
	mov	%al,-16($out)
	mov	%cl,0($out)
	lea	1($out),$out
	sub	\$1,$len
	jnz	.Lxts_enc_steal

	sub	$len_,$out		# rewind $out
	mov	$key_,$key		# restore $key
	mov	$rnds_,$rounds		# restore $rounds

	movups	-16($out),$inout0
	xorps	@tweak[0],$inout0
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movups	$inout0,-16($out)

.Lxts_enc_ret:
	xorps	%xmm0,%xmm0		# clear register bank
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	movaps	%xmm0,0x00(%rsp)	# clear stack
	pxor	%xmm8,%xmm8
	movaps	%xmm0,0x10(%rsp)
	pxor	%xmm9,%xmm9
	movaps	%xmm0,0x20(%rsp)
	pxor	%xmm10,%xmm10
	movaps	%xmm0,0x30(%rsp)
	pxor	%xmm11,%xmm11
	movaps	%xmm0,0x40(%rsp)
	pxor	%xmm12,%xmm12
	movaps	%xmm0,0x50(%rsp)
	pxor	%xmm13,%xmm13
	movaps	%xmm0,0x60(%rsp)
	pxor	%xmm14,%xmm14
	pxor	%xmm15,%xmm15
___
$code.=<<___ if ($win64);
	movaps	-0xa0(%rbp),%xmm6
	movaps	%xmm0,-0xa0(%rbp)	# clear stack
	movaps	-0x90(%rbp),%xmm7
	movaps	%xmm0,-0x90(%rbp)
	movaps	-0x80(%rbp),%xmm8
	movaps	%xmm0,-0x80(%rbp)
	movaps	-0x70(%rbp),%xmm9
	movaps	%xmm0,-0x70(%rbp)
	movaps	-0x60(%rbp),%xmm10
	movaps	%xmm0,-0x60(%rbp)
	movaps	-0x50(%rbp),%xmm11
	movaps	%xmm0,-0x50(%rbp)
	movaps	-0x40(%rbp),%xmm12
	movaps	%xmm0,-0x40(%rbp)
	movaps	-0x30(%rbp),%xmm13
	movaps	%xmm0,-0x30(%rbp)
	movaps	-0x20(%rbp),%xmm14
	movaps	%xmm0,-0x20(%rbp)
	movaps	-0x10(%rbp),%xmm15
	movaps	%xmm0,-0x10(%rbp)
	movaps	%xmm0,0x00(%rsp)
	movaps	%xmm0,0x10(%rsp)
	movaps	%xmm0,0x20(%rsp)
	movaps	%xmm0,0x30(%rsp)
	movaps	%xmm0,0x40(%rsp)
	movaps	%xmm0,0x50(%rsp)
	movaps	%xmm0,0x60(%rsp)
___
2200$code.=<<___; 2201 lea (%rbp),%rsp 2202 pop %rbp 2203.Lxts_enc_epilogue: 2204 ret 2205.size aesni_xts_encrypt,.-aesni_xts_encrypt 2206___ 2207 2208$code.=<<___; 2209.globl aesni_xts_decrypt 2210.type aesni_xts_decrypt,\@function,6 2211.align 16 2212aesni_xts_decrypt: 2213 lea (%rsp),%rax 2214 push %rbp 2215 sub \$$frame_size,%rsp 2216 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 2217___ 2218$code.=<<___ if ($win64); 2219 movaps %xmm6,-0xa8(%rax) # offload everything 2220 movaps %xmm7,-0x98(%rax) 2221 movaps %xmm8,-0x88(%rax) 2222 movaps %xmm9,-0x78(%rax) 2223 movaps %xmm10,-0x68(%rax) 2224 movaps %xmm11,-0x58(%rax) 2225 movaps %xmm12,-0x48(%rax) 2226 movaps %xmm13,-0x38(%rax) 2227 movaps %xmm14,-0x28(%rax) 2228 movaps %xmm15,-0x18(%rax) 2229.Lxts_dec_body: 2230___ 2231$code.=<<___; 2232 lea -8(%rax),%rbp 2233 movups ($ivp),$inout0 # load clear-text tweak 2234 mov 240($key2),$rounds # key2->rounds 2235 mov 240($key),$rnds_ # key1->rounds 2236___ 2237 # generate the tweak 2238 &aesni_generate1("enc",$key2,$rounds,$inout0); 2239$code.=<<___; 2240 xor %eax,%eax # if ($len%16) len-=16; 2241 test \$15,$len 2242 setnz %al 2243 shl \$4,%rax 2244 sub %rax,$len 2245 2246 $movkey ($key),$rndkey0 # zero round key 2247 mov $key,$key_ # backup $key 2248 mov $rnds_,$rounds # backup $rounds 2249 shl \$4,$rnds_ 2250 mov $len,$len_ # backup $len 2251 and \$-16,$len 2252 2253 $movkey 16($key,$rnds_),$rndkey1 # last round key 2254 2255 movdqa .Lxts_magic(%rip),$twmask 2256 movdqa $inout0,@tweak[5] 2257 pshufd \$0x5f,$inout0,$twres 2258 pxor $rndkey0,$rndkey1 2259___ 2260 for ($i=0;$i<4;$i++) { 2261 $code.=<<___; 2262 movdqa $twres,$twtmp 2263 paddd $twres,$twres 2264 movdqa @tweak[5],@tweak[$i] 2265 psrad \$31,$twtmp # broadcast upper bits 2266 paddq @tweak[5],@tweak[5] 2267 pand $twmask,$twtmp 2268 pxor $rndkey0,@tweak[$i] 2269 pxor $twtmp,@tweak[5] 2270___ 2271 } 2272$code.=<<___; 2273 movdqa @tweak[5],@tweak[4] 2274 psrad \$31,$twres 2275 paddq 
@tweak[5],@tweak[5] 2276 pand $twmask,$twres 2277 pxor $rndkey0,@tweak[4] 2278 pxor $twres,@tweak[5] 2279 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] 2280 2281 sub \$16*6,$len 2282 jc .Lxts_dec_short # if $len-=6*16 borrowed 2283 2284 mov \$16+96,$rounds 2285 lea 32($key_,$rnds_),$key # end of key schedule 2286 sub %r10,%rax # twisted $rounds 2287 $movkey 16($key_),$rndkey1 2288 mov %rax,%r10 # backup twisted $rounds 2289 lea .Lxts_magic(%rip),%r8 2290 jmp .Lxts_dec_grandloop 2291 2292.align 32 2293.Lxts_dec_grandloop: 2294 movdqu `16*0`($inp),$inout0 # load input 2295 movdqa $rndkey0,$twmask 2296 movdqu `16*1`($inp),$inout1 2297 pxor @tweak[0],$inout0 # intput^=tweak^round[0] 2298 movdqu `16*2`($inp),$inout2 2299 pxor @tweak[1],$inout1 2300 aesdec $rndkey1,$inout0 2301 movdqu `16*3`($inp),$inout3 2302 pxor @tweak[2],$inout2 2303 aesdec $rndkey1,$inout1 2304 movdqu `16*4`($inp),$inout4 2305 pxor @tweak[3],$inout3 2306 aesdec $rndkey1,$inout2 2307 movdqu `16*5`($inp),$inout5 2308 pxor @tweak[5],$twmask # round[0]^=tweak[5] 2309 movdqa 0x60(%rsp),$twres # load round[0]^round[last] 2310 pxor @tweak[4],$inout4 2311 aesdec $rndkey1,$inout3 2312 $movkey 32($key_),$rndkey0 2313 lea `16*6`($inp),$inp 2314 pxor $twmask,$inout5 2315 2316 pxor $twres,@tweak[0] # calclulate tweaks^round[last] 2317 aesdec $rndkey1,$inout4 2318 pxor $twres,@tweak[1] 2319 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key 2320 aesdec $rndkey1,$inout5 2321 $movkey 48($key_),$rndkey1 2322 pxor $twres,@tweak[2] 2323 2324 aesdec $rndkey0,$inout0 2325 pxor $twres,@tweak[3] 2326 movdqa @tweak[1],`16*1`(%rsp) 2327 aesdec $rndkey0,$inout1 2328 pxor $twres,@tweak[4] 2329 movdqa @tweak[2],`16*2`(%rsp) 2330 aesdec $rndkey0,$inout2 2331 aesdec $rndkey0,$inout3 2332 pxor $twres,$twmask 2333 movdqa @tweak[4],`16*4`(%rsp) 2334 aesdec $rndkey0,$inout4 2335 aesdec $rndkey0,$inout5 2336 $movkey 64($key_),$rndkey0 2337 movdqa $twmask,`16*5`(%rsp) 2338 pshufd \$0x5f,@tweak[5],$twres 2339 jmp 
.Lxts_dec_loop6 2340.align 32 2341.Lxts_dec_loop6: 2342 aesdec $rndkey1,$inout0 2343 aesdec $rndkey1,$inout1 2344 aesdec $rndkey1,$inout2 2345 aesdec $rndkey1,$inout3 2346 aesdec $rndkey1,$inout4 2347 aesdec $rndkey1,$inout5 2348 $movkey -64($key,%rax),$rndkey1 2349 add \$32,%rax 2350 2351 aesdec $rndkey0,$inout0 2352 aesdec $rndkey0,$inout1 2353 aesdec $rndkey0,$inout2 2354 aesdec $rndkey0,$inout3 2355 aesdec $rndkey0,$inout4 2356 aesdec $rndkey0,$inout5 2357 $movkey -80($key,%rax),$rndkey0 2358 jnz .Lxts_dec_loop6 2359 2360 movdqa (%r8),$twmask # start calculating next tweak 2361 movdqa $twres,$twtmp 2362 paddd $twres,$twres 2363 aesdec $rndkey1,$inout0 2364 paddq @tweak[5],@tweak[5] 2365 psrad \$31,$twtmp 2366 aesdec $rndkey1,$inout1 2367 pand $twmask,$twtmp 2368 $movkey ($key_),@tweak[0] # load round[0] 2369 aesdec $rndkey1,$inout2 2370 aesdec $rndkey1,$inout3 2371 aesdec $rndkey1,$inout4 2372 pxor $twtmp,@tweak[5] 2373 movaps @tweak[0],@tweak[1] # copy round[0] 2374 aesdec $rndkey1,$inout5 2375 $movkey -64($key),$rndkey1 2376 2377 movdqa $twres,$twtmp 2378 aesdec $rndkey0,$inout0 2379 paddd $twres,$twres 2380 pxor @tweak[5],@tweak[0] 2381 aesdec $rndkey0,$inout1 2382 psrad \$31,$twtmp 2383 paddq @tweak[5],@tweak[5] 2384 aesdec $rndkey0,$inout2 2385 aesdec $rndkey0,$inout3 2386 pand $twmask,$twtmp 2387 movaps @tweak[1],@tweak[2] 2388 aesdec $rndkey0,$inout4 2389 pxor $twtmp,@tweak[5] 2390 movdqa $twres,$twtmp 2391 aesdec $rndkey0,$inout5 2392 $movkey -48($key),$rndkey0 2393 2394 paddd $twres,$twres 2395 aesdec $rndkey1,$inout0 2396 pxor @tweak[5],@tweak[1] 2397 psrad \$31,$twtmp 2398 aesdec $rndkey1,$inout1 2399 paddq @tweak[5],@tweak[5] 2400 pand $twmask,$twtmp 2401 aesdec $rndkey1,$inout2 2402 aesdec $rndkey1,$inout3 2403 movdqa @tweak[3],`16*3`(%rsp) 2404 pxor $twtmp,@tweak[5] 2405 aesdec $rndkey1,$inout4 2406 movaps @tweak[2],@tweak[3] 2407 movdqa $twres,$twtmp 2408 aesdec $rndkey1,$inout5 2409 $movkey -32($key),$rndkey1 2410 2411 paddd $twres,$twres 2412 
aesdec $rndkey0,$inout0 2413 pxor @tweak[5],@tweak[2] 2414 psrad \$31,$twtmp 2415 aesdec $rndkey0,$inout1 2416 paddq @tweak[5],@tweak[5] 2417 pand $twmask,$twtmp 2418 aesdec $rndkey0,$inout2 2419 aesdec $rndkey0,$inout3 2420 aesdec $rndkey0,$inout4 2421 pxor $twtmp,@tweak[5] 2422 movaps @tweak[3],@tweak[4] 2423 aesdec $rndkey0,$inout5 2424 2425 movdqa $twres,$rndkey0 2426 paddd $twres,$twres 2427 aesdec $rndkey1,$inout0 2428 pxor @tweak[5],@tweak[3] 2429 psrad \$31,$rndkey0 2430 aesdec $rndkey1,$inout1 2431 paddq @tweak[5],@tweak[5] 2432 pand $twmask,$rndkey0 2433 aesdec $rndkey1,$inout2 2434 aesdec $rndkey1,$inout3 2435 pxor $rndkey0,@tweak[5] 2436 $movkey ($key_),$rndkey0 2437 aesdec $rndkey1,$inout4 2438 aesdec $rndkey1,$inout5 2439 $movkey 16($key_),$rndkey1 2440 2441 pxor @tweak[5],@tweak[4] 2442 aesdeclast `16*0`(%rsp),$inout0 2443 psrad \$31,$twres 2444 paddq @tweak[5],@tweak[5] 2445 aesdeclast `16*1`(%rsp),$inout1 2446 aesdeclast `16*2`(%rsp),$inout2 2447 pand $twmask,$twres 2448 mov %r10,%rax # restore $rounds 2449 aesdeclast `16*3`(%rsp),$inout3 2450 aesdeclast `16*4`(%rsp),$inout4 2451 aesdeclast `16*5`(%rsp),$inout5 2452 pxor $twres,@tweak[5] 2453 2454 lea `16*6`($out),$out # $out+=6*16 2455 movups $inout0,`-16*6`($out) # store 6 output blocks 2456 movups $inout1,`-16*5`($out) 2457 movups $inout2,`-16*4`($out) 2458 movups $inout3,`-16*3`($out) 2459 movups $inout4,`-16*2`($out) 2460 movups $inout5,`-16*1`($out) 2461 sub \$16*6,$len 2462 jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow 2463 2464 mov \$16+96,$rounds 2465 sub $rnds_,$rounds 2466 mov $key_,$key # restore $key 2467 shr \$4,$rounds # restore original value 2468 2469.Lxts_dec_short: 2470 # at the point @tweak[0..5] are populated with tweak values 2471 mov $rounds,$rnds_ # backup $rounds 2472 pxor $rndkey0,@tweak[0] 2473 pxor $rndkey0,@tweak[1] 2474 add \$16*6,$len # restore real remaining $len 2475 jz .Lxts_dec_done # done if ($len==0) 2476 2477 pxor $rndkey0,@tweak[2] 2478 cmp 
\$0x20,$len 2479 jb .Lxts_dec_one # $len is 1*16 2480 pxor $rndkey0,@tweak[3] 2481 je .Lxts_dec_two # $len is 2*16 2482 2483 pxor $rndkey0,@tweak[4] 2484 cmp \$0x40,$len 2485 jb .Lxts_dec_three # $len is 3*16 2486 je .Lxts_dec_four # $len is 4*16 2487 2488 movdqu ($inp),$inout0 # $len is 5*16 2489 movdqu 16*1($inp),$inout1 2490 movdqu 16*2($inp),$inout2 2491 pxor @tweak[0],$inout0 2492 movdqu 16*3($inp),$inout3 2493 pxor @tweak[1],$inout1 2494 movdqu 16*4($inp),$inout4 2495 lea 16*5($inp),$inp # $inp+=5*16 2496 pxor @tweak[2],$inout2 2497 pxor @tweak[3],$inout3 2498 pxor @tweak[4],$inout4 2499 2500 call _aesni_decrypt6 2501 2502 xorps @tweak[0],$inout0 2503 xorps @tweak[1],$inout1 2504 xorps @tweak[2],$inout2 2505 movdqu $inout0,($out) # store 5 output blocks 2506 xorps @tweak[3],$inout3 2507 movdqu $inout1,16*1($out) 2508 xorps @tweak[4],$inout4 2509 movdqu $inout2,16*2($out) 2510 pxor $twtmp,$twtmp 2511 movdqu $inout3,16*3($out) 2512 pcmpgtd @tweak[5],$twtmp 2513 movdqu $inout4,16*4($out) 2514 lea 16*5($out),$out # $out+=5*16 2515 pshufd \$0x13,$twtmp,@tweak[1] # $twres 2516 and \$15,$len_ 2517 jz .Lxts_dec_ret 2518 2519 movdqa @tweak[5],@tweak[0] 2520 paddq @tweak[5],@tweak[5] # psllq 1,$tweak 2521 pand $twmask,@tweak[1] # isolate carry and residue 2522 pxor @tweak[5],@tweak[1] 2523 jmp .Lxts_dec_done2 2524 2525.align 16 2526.Lxts_dec_one: 2527 movups ($inp),$inout0 2528 lea 16*1($inp),$inp # $inp+=1*16 2529 xorps @tweak[0],$inout0 2530___ 2531 &aesni_generate1("dec",$key,$rounds); 2532$code.=<<___; 2533 xorps @tweak[0],$inout0 2534 movdqa @tweak[1],@tweak[0] 2535 movups $inout0,($out) # store one output block 2536 movdqa @tweak[2],@tweak[1] 2537 lea 16*1($out),$out # $out+=1*16 2538 jmp .Lxts_dec_done 2539 2540.align 16 2541.Lxts_dec_two: 2542 movups ($inp),$inout0 2543 movups 16($inp),$inout1 2544 lea 32($inp),$inp # $inp+=2*16 2545 xorps @tweak[0],$inout0 2546 xorps @tweak[1],$inout1 2547 2548 call _aesni_decrypt2 2549 2550 xorps @tweak[0],$inout0 2551 movdqa 
@tweak[2],@tweak[0] 2552 xorps @tweak[1],$inout1 2553 movdqa @tweak[3],@tweak[1] 2554 movups $inout0,($out) # store 2 output blocks 2555 movups $inout1,16*1($out) 2556 lea 16*2($out),$out # $out+=2*16 2557 jmp .Lxts_dec_done 2558 2559.align 16 2560.Lxts_dec_three: 2561 movups ($inp),$inout0 2562 movups 16*1($inp),$inout1 2563 movups 16*2($inp),$inout2 2564 lea 16*3($inp),$inp # $inp+=3*16 2565 xorps @tweak[0],$inout0 2566 xorps @tweak[1],$inout1 2567 xorps @tweak[2],$inout2 2568 2569 call _aesni_decrypt3 2570 2571 xorps @tweak[0],$inout0 2572 movdqa @tweak[3],@tweak[0] 2573 xorps @tweak[1],$inout1 2574 movdqa @tweak[4],@tweak[1] 2575 xorps @tweak[2],$inout2 2576 movups $inout0,($out) # store 3 output blocks 2577 movups $inout1,16*1($out) 2578 movups $inout2,16*2($out) 2579 lea 16*3($out),$out # $out+=3*16 2580 jmp .Lxts_dec_done 2581 2582.align 16 2583.Lxts_dec_four: 2584 movups ($inp),$inout0 2585 movups 16*1($inp),$inout1 2586 movups 16*2($inp),$inout2 2587 xorps @tweak[0],$inout0 2588 movups 16*3($inp),$inout3 2589 lea 16*4($inp),$inp # $inp+=4*16 2590 xorps @tweak[1],$inout1 2591 xorps @tweak[2],$inout2 2592 xorps @tweak[3],$inout3 2593 2594 call _aesni_decrypt4 2595 2596 pxor @tweak[0],$inout0 2597 movdqa @tweak[4],@tweak[0] 2598 pxor @tweak[1],$inout1 2599 movdqa @tweak[5],@tweak[1] 2600 pxor @tweak[2],$inout2 2601 movdqu $inout0,($out) # store 4 output blocks 2602 pxor @tweak[3],$inout3 2603 movdqu $inout1,16*1($out) 2604 movdqu $inout2,16*2($out) 2605 movdqu $inout3,16*3($out) 2606 lea 16*4($out),$out # $out+=4*16 2607 jmp .Lxts_dec_done 2608 2609.align 16 2610.Lxts_dec_done: 2611 and \$15,$len_ # see if $len%16 is 0 2612 jz .Lxts_dec_ret 2613.Lxts_dec_done2: 2614 mov $len_,$len 2615 mov $key_,$key # restore $key 2616 mov $rnds_,$rounds # restore $rounds 2617 2618 movups ($inp),$inout0 2619 xorps @tweak[1],$inout0 2620___ 2621 &aesni_generate1("dec",$key,$rounds); 2622$code.=<<___; 2623 xorps @tweak[1],$inout0 2624 movups $inout0,($out) 2625 
2626.Lxts_dec_steal: 2627 movzb 16($inp),%eax # borrow $rounds ... 2628 movzb ($out),%ecx # ... and $key 2629 lea 1($inp),$inp 2630 mov %al,($out) 2631 mov %cl,16($out) 2632 lea 1($out),$out 2633 sub \$1,$len 2634 jnz .Lxts_dec_steal 2635 2636 sub $len_,$out # rewind $out 2637 mov $key_,$key # restore $key 2638 mov $rnds_,$rounds # restore $rounds 2639 2640 movups ($out),$inout0 2641 xorps @tweak[0],$inout0 2642___ 2643 &aesni_generate1("dec",$key,$rounds); 2644$code.=<<___; 2645 xorps @tweak[0],$inout0 2646 movups $inout0,($out) 2647 2648.Lxts_dec_ret: 2649 xorps %xmm0,%xmm0 # clear register bank 2650 pxor %xmm1,%xmm1 2651 pxor %xmm2,%xmm2 2652 pxor %xmm3,%xmm3 2653 pxor %xmm4,%xmm4 2654 pxor %xmm5,%xmm5 2655___ 2656$code.=<<___ if (!$win64); 2657 pxor %xmm6,%xmm6 2658 pxor %xmm7,%xmm7 2659 movaps %xmm0,0x00(%rsp) # clear stack 2660 pxor %xmm8,%xmm8 2661 movaps %xmm0,0x10(%rsp) 2662 pxor %xmm9,%xmm9 2663 movaps %xmm0,0x20(%rsp) 2664 pxor %xmm10,%xmm10 2665 movaps %xmm0,0x30(%rsp) 2666 pxor %xmm11,%xmm11 2667 movaps %xmm0,0x40(%rsp) 2668 pxor %xmm12,%xmm12 2669 movaps %xmm0,0x50(%rsp) 2670 pxor %xmm13,%xmm13 2671 movaps %xmm0,0x60(%rsp) 2672 pxor %xmm14,%xmm14 2673 pxor %xmm15,%xmm15 2674___ 2675$code.=<<___ if ($win64); 2676 movaps -0xa0(%rbp),%xmm6 2677 movaps %xmm0,-0xa0(%rbp) # clear stack 2678 movaps -0x90(%rbp),%xmm7 2679 movaps %xmm0,-0x90(%rbp) 2680 movaps -0x80(%rbp),%xmm8 2681 movaps %xmm0,-0x80(%rbp) 2682 movaps -0x70(%rbp),%xmm9 2683 movaps %xmm0,-0x70(%rbp) 2684 movaps -0x60(%rbp),%xmm10 2685 movaps %xmm0,-0x60(%rbp) 2686 movaps -0x50(%rbp),%xmm11 2687 movaps %xmm0,-0x50(%rbp) 2688 movaps -0x40(%rbp),%xmm12 2689 movaps %xmm0,-0x40(%rbp) 2690 movaps -0x30(%rbp),%xmm13 2691 movaps %xmm0,-0x30(%rbp) 2692 movaps -0x20(%rbp),%xmm14 2693 movaps %xmm0,-0x20(%rbp) 2694 movaps -0x10(%rbp),%xmm15 2695 movaps %xmm0,-0x10(%rbp) 2696 movaps %xmm0,0x00(%rsp) 2697 movaps %xmm0,0x10(%rsp) 2698 movaps %xmm0,0x20(%rsp) 2699 movaps %xmm0,0x30(%rsp) 2700 movaps 
%xmm0,0x40(%rsp) 2701 movaps %xmm0,0x50(%rsp) 2702 movaps %xmm0,0x60(%rsp) 2703___ 2704$code.=<<___; 2705 lea (%rbp),%rsp 2706 pop %rbp 2707.Lxts_dec_epilogue: 2708 ret 2709.size aesni_xts_decrypt,.-aesni_xts_decrypt 2710___ 2711} }} 2712 2713######################################################################## 2714# void $PREFIX_cbc_encrypt (const void *inp, void *out, 2715# size_t length, const AES_KEY *key, 2716# unsigned char *ivp,const int enc); 2717{ 2718my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt 2719my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15)); 2720my $inp_=$key_; 2721 2722$code.=<<___; 2723.globl ${PREFIX}_cbc_encrypt 2724.type ${PREFIX}_cbc_encrypt,\@function,6 2725.align 16 2726${PREFIX}_cbc_encrypt: 2727 test $len,$len # check length 2728 jz .Lcbc_ret 2729 2730 mov 240($key),$rnds_ # key->rounds 2731 mov $key,$key_ # backup $key 2732 test %r9d,%r9d # 6th argument 2733 jz .Lcbc_decrypt 2734#--------------------------- CBC ENCRYPT ------------------------------# 2735 movups ($ivp),$inout0 # load iv as initial state 2736 mov $rnds_,$rounds 2737 cmp \$16,$len 2738 jb .Lcbc_enc_tail 2739 sub \$16,$len 2740 jmp .Lcbc_enc_loop 2741.align 16 2742.Lcbc_enc_loop: 2743 movups ($inp),$inout1 # load input 2744 lea 16($inp),$inp 2745 #xorps $inout1,$inout0 2746___ 2747 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1); 2748$code.=<<___; 2749 mov $rnds_,$rounds # restore $rounds 2750 mov $key_,$key # restore $key 2751 movups $inout0,0($out) # store output 2752 lea 16($out),$out 2753 sub \$16,$len 2754 jnc .Lcbc_enc_loop 2755 add \$16,$len 2756 jnz .Lcbc_enc_tail 2757 pxor $rndkey0,$rndkey0 # clear register bank 2758 pxor $rndkey1,$rndkey1 2759 movups $inout0,($ivp) 2760 pxor $inout0,$inout0 2761 pxor $inout1,$inout1 2762 jmp .Lcbc_ret 2763 2764.Lcbc_enc_tail: 2765 mov $len,%rcx # zaps $key 2766 xchg $inp,$out # $inp is %rsi and $out is %rdi now 2767 .long 0x9066A4F3 # rep movsb 2768 mov \$16,%ecx # zero tail 2769 sub $len,%rcx 2770 xor 
%eax,%eax 2771 .long 0x9066AAF3 # rep stosb 2772 lea -16(%rdi),%rdi # rewind $out by 1 block 2773 mov $rnds_,$rounds # restore $rounds 2774 mov %rdi,%rsi # $inp and $out are the same 2775 mov $key_,$key # restore $key 2776 xor $len,$len # len=16 2777 jmp .Lcbc_enc_loop # one more spin 2778#--------------------------- CBC DECRYPT ------------------------------# 2779.align 16 2780.Lcbc_decrypt: 2781 cmp \$16,$len 2782 jne .Lcbc_decrypt_bulk 2783 2784 # handle single block without allocating stack frame, 2785 # useful in ciphertext stealing mode 2786 movdqu ($inp),$inout0 # load input 2787 movdqu ($ivp),$inout1 # load iv 2788 movdqa $inout0,$inout2 # future iv 2789___ 2790 &aesni_generate1("dec",$key,$rnds_); 2791$code.=<<___; 2792 pxor $rndkey0,$rndkey0 # clear register bank 2793 pxor $rndkey1,$rndkey1 2794 movdqu $inout2,($ivp) # store iv 2795 xorps $inout1,$inout0 # ^=iv 2796 pxor $inout1,$inout1 2797 movups $inout0,($out) # store output 2798 pxor $inout0,$inout0 2799 jmp .Lcbc_ret 2800.align 16 2801.Lcbc_decrypt_bulk: 2802 lea (%rsp),%rax 2803 push %rbp 2804 sub \$$frame_size,%rsp 2805 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 2806___ 2807$code.=<<___ if ($win64); 2808 movaps %xmm6,0x10(%rsp) 2809 movaps %xmm7,0x20(%rsp) 2810 movaps %xmm8,0x30(%rsp) 2811 movaps %xmm9,0x40(%rsp) 2812 movaps %xmm10,0x50(%rsp) 2813 movaps %xmm11,0x60(%rsp) 2814 movaps %xmm12,0x70(%rsp) 2815 movaps %xmm13,0x80(%rsp) 2816 movaps %xmm14,0x90(%rsp) 2817 movaps %xmm15,0xa0(%rsp) 2818.Lcbc_decrypt_body: 2819___ 2820$code.=<<___; 2821 lea -8(%rax),%rbp 2822 movups ($ivp),$iv 2823 mov $rnds_,$rounds 2824 cmp \$0x50,$len 2825 jbe .Lcbc_dec_tail 2826 2827 $movkey ($key),$rndkey0 2828 movdqu 0x00($inp),$inout0 # load input 2829 movdqu 0x10($inp),$inout1 2830 movdqa $inout0,$in0 2831 movdqu 0x20($inp),$inout2 2832 movdqa $inout1,$in1 2833 movdqu 0x30($inp),$inout3 2834 movdqa $inout2,$in2 2835 movdqu 0x40($inp),$inout4 2836 movdqa $inout3,$in3 2837 movdqu 0x50($inp),$inout5 
2838 movdqa $inout4,$in4 2839 mov OPENSSL_ia32cap_P+4(%rip),%r9d 2840 cmp \$0x70,$len 2841 jbe .Lcbc_dec_six_or_seven 2842 2843 and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE 2844 sub \$0x50,$len # $len is biased by -5*16 2845 cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE 2846 je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont] 2847 sub \$0x20,$len # $len is biased by -7*16 2848 lea 0x70($key),$key # size optimization 2849 jmp .Lcbc_dec_loop8_enter 2850.align 16 2851.Lcbc_dec_loop8: 2852 movups $inout7,($out) 2853 lea 0x10($out),$out 2854.Lcbc_dec_loop8_enter: 2855 movdqu 0x60($inp),$inout6 2856 pxor $rndkey0,$inout0 2857 movdqu 0x70($inp),$inout7 2858 pxor $rndkey0,$inout1 2859 $movkey 0x10-0x70($key),$rndkey1 2860 pxor $rndkey0,$inout2 2861 xor $inp_,$inp_ 2862 cmp \$0x70,$len # is there at least 0x60 bytes ahead? 2863 pxor $rndkey0,$inout3 2864 pxor $rndkey0,$inout4 2865 pxor $rndkey0,$inout5 2866 pxor $rndkey0,$inout6 2867 2868 aesdec $rndkey1,$inout0 2869 pxor $rndkey0,$inout7 2870 $movkey 0x20-0x70($key),$rndkey0 2871 aesdec $rndkey1,$inout1 2872 aesdec $rndkey1,$inout2 2873 aesdec $rndkey1,$inout3 2874 aesdec $rndkey1,$inout4 2875 aesdec $rndkey1,$inout5 2876 aesdec $rndkey1,$inout6 2877 setnc ${inp_}b 2878 shl \$7,$inp_ 2879 aesdec $rndkey1,$inout7 2880 add $inp,$inp_ 2881 $movkey 0x30-0x70($key),$rndkey1 2882___ 2883for($i=1;$i<12;$i++) { 2884my $rndkeyx = ($i&1)?$rndkey0:$rndkey1; 2885$code.=<<___ if ($i==7); 2886 cmp \$11,$rounds 2887___ 2888$code.=<<___; 2889 aesdec $rndkeyx,$inout0 2890 aesdec $rndkeyx,$inout1 2891 aesdec $rndkeyx,$inout2 2892 aesdec $rndkeyx,$inout3 2893 aesdec $rndkeyx,$inout4 2894 aesdec $rndkeyx,$inout5 2895 aesdec $rndkeyx,$inout6 2896 aesdec $rndkeyx,$inout7 2897 $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx 2898___ 2899$code.=<<___ if ($i<6 || (!($i&1) && $i>7)); 2900 nop 2901___ 2902$code.=<<___ if ($i==7); 2903 jb .Lcbc_dec_done 2904___ 2905$code.=<<___ if ($i==9); 2906 je .Lcbc_dec_done 2907___ 2908$code.=<<___ if 
($i==11); 2909 jmp .Lcbc_dec_done 2910___ 2911} 2912$code.=<<___; 2913.align 16 2914.Lcbc_dec_done: 2915 aesdec $rndkey1,$inout0 2916 aesdec $rndkey1,$inout1 2917 pxor $rndkey0,$iv 2918 pxor $rndkey0,$in0 2919 aesdec $rndkey1,$inout2 2920 aesdec $rndkey1,$inout3 2921 pxor $rndkey0,$in1 2922 pxor $rndkey0,$in2 2923 aesdec $rndkey1,$inout4 2924 aesdec $rndkey1,$inout5 2925 pxor $rndkey0,$in3 2926 pxor $rndkey0,$in4 2927 aesdec $rndkey1,$inout6 2928 aesdec $rndkey1,$inout7 2929 movdqu 0x50($inp),$rndkey1 2930 2931 aesdeclast $iv,$inout0 2932 movdqu 0x60($inp),$iv # borrow $iv 2933 pxor $rndkey0,$rndkey1 2934 aesdeclast $in0,$inout1 2935 pxor $rndkey0,$iv 2936 movdqu 0x70($inp),$rndkey0 # next IV 2937 aesdeclast $in1,$inout2 2938 lea 0x80($inp),$inp 2939 movdqu 0x00($inp_),$in0 2940 aesdeclast $in2,$inout3 2941 aesdeclast $in3,$inout4 2942 movdqu 0x10($inp_),$in1 2943 movdqu 0x20($inp_),$in2 2944 aesdeclast $in4,$inout5 2945 aesdeclast $rndkey1,$inout6 2946 movdqu 0x30($inp_),$in3 2947 movdqu 0x40($inp_),$in4 2948 aesdeclast $iv,$inout7 2949 movdqa $rndkey0,$iv # return $iv 2950 movdqu 0x50($inp_),$rndkey1 2951 $movkey -0x70($key),$rndkey0 2952 2953 movups $inout0,($out) # store output 2954 movdqa $in0,$inout0 2955 movups $inout1,0x10($out) 2956 movdqa $in1,$inout1 2957 movups $inout2,0x20($out) 2958 movdqa $in2,$inout2 2959 movups $inout3,0x30($out) 2960 movdqa $in3,$inout3 2961 movups $inout4,0x40($out) 2962 movdqa $in4,$inout4 2963 movups $inout5,0x50($out) 2964 movdqa $rndkey1,$inout5 2965 movups $inout6,0x60($out) 2966 lea 0x70($out),$out 2967 2968 sub \$0x80,$len 2969 ja .Lcbc_dec_loop8 2970 2971 movaps $inout7,$inout0 2972 lea -0x70($key),$key 2973 add \$0x70,$len 2974 jle .Lcbc_dec_clear_tail_collected 2975 movups $inout7,($out) 2976 lea 0x10($out),$out 2977 cmp \$0x50,$len 2978 jbe .Lcbc_dec_tail 2979 2980 movaps $in0,$inout0 2981.Lcbc_dec_six_or_seven: 2982 cmp \$0x60,$len 2983 ja .Lcbc_dec_seven 2984 2985 movaps $inout5,$inout6 2986 call _aesni_decrypt6 2987 
pxor $iv,$inout0 # ^= IV 2988 movaps $inout6,$iv 2989 pxor $in0,$inout1 2990 movdqu $inout0,($out) 2991 pxor $in1,$inout2 2992 movdqu $inout1,0x10($out) 2993 pxor $inout1,$inout1 # clear register bank 2994 pxor $in2,$inout3 2995 movdqu $inout2,0x20($out) 2996 pxor $inout2,$inout2 2997 pxor $in3,$inout4 2998 movdqu $inout3,0x30($out) 2999 pxor $inout3,$inout3 3000 pxor $in4,$inout5 3001 movdqu $inout4,0x40($out) 3002 pxor $inout4,$inout4 3003 lea 0x50($out),$out 3004 movdqa $inout5,$inout0 3005 pxor $inout5,$inout5 3006 jmp .Lcbc_dec_tail_collected 3007 3008.align 16 3009.Lcbc_dec_seven: 3010 movups 0x60($inp),$inout6 3011 xorps $inout7,$inout7 3012 call _aesni_decrypt8 3013 movups 0x50($inp),$inout7 3014 pxor $iv,$inout0 # ^= IV 3015 movups 0x60($inp),$iv 3016 pxor $in0,$inout1 3017 movdqu $inout0,($out) 3018 pxor $in1,$inout2 3019 movdqu $inout1,0x10($out) 3020 pxor $inout1,$inout1 # clear register bank 3021 pxor $in2,$inout3 3022 movdqu $inout2,0x20($out) 3023 pxor $inout2,$inout2 3024 pxor $in3,$inout4 3025 movdqu $inout3,0x30($out) 3026 pxor $inout3,$inout3 3027 pxor $in4,$inout5 3028 movdqu $inout4,0x40($out) 3029 pxor $inout4,$inout4 3030 pxor $inout7,$inout6 3031 movdqu $inout5,0x50($out) 3032 pxor $inout5,$inout5 3033 lea 0x60($out),$out 3034 movdqa $inout6,$inout0 3035 pxor $inout6,$inout6 3036 pxor $inout7,$inout7 3037 jmp .Lcbc_dec_tail_collected 3038 3039.align 16 3040.Lcbc_dec_loop6: 3041 movups $inout5,($out) 3042 lea 0x10($out),$out 3043 movdqu 0x00($inp),$inout0 # load input 3044 movdqu 0x10($inp),$inout1 3045 movdqa $inout0,$in0 3046 movdqu 0x20($inp),$inout2 3047 movdqa $inout1,$in1 3048 movdqu 0x30($inp),$inout3 3049 movdqa $inout2,$in2 3050 movdqu 0x40($inp),$inout4 3051 movdqa $inout3,$in3 3052 movdqu 0x50($inp),$inout5 3053 movdqa $inout4,$in4 3054.Lcbc_dec_loop6_enter: 3055 lea 0x60($inp),$inp 3056 movdqa $inout5,$inout6 3057 3058 call _aesni_decrypt6 3059 3060 pxor $iv,$inout0 # ^= IV 3061 movdqa $inout6,$iv 3062 pxor $in0,$inout1 3063 
movdqu $inout0,($out) 3064 pxor $in1,$inout2 3065 movdqu $inout1,0x10($out) 3066 pxor $in2,$inout3 3067 movdqu $inout2,0x20($out) 3068 pxor $in3,$inout4 3069 mov $key_,$key 3070 movdqu $inout3,0x30($out) 3071 pxor $in4,$inout5 3072 mov $rnds_,$rounds 3073 movdqu $inout4,0x40($out) 3074 lea 0x50($out),$out 3075 sub \$0x60,$len 3076 ja .Lcbc_dec_loop6 3077 3078 movdqa $inout5,$inout0 3079 add \$0x50,$len 3080 jle .Lcbc_dec_clear_tail_collected 3081 movups $inout5,($out) 3082 lea 0x10($out),$out 3083 3084.Lcbc_dec_tail: 3085 movups ($inp),$inout0 3086 sub \$0x10,$len 3087 jbe .Lcbc_dec_one # $len is 1*16 or less 3088 3089 movups 0x10($inp),$inout1 3090 movaps $inout0,$in0 3091 sub \$0x10,$len 3092 jbe .Lcbc_dec_two # $len is 2*16 or less 3093 3094 movups 0x20($inp),$inout2 3095 movaps $inout1,$in1 3096 sub \$0x10,$len 3097 jbe .Lcbc_dec_three # $len is 3*16 or less 3098 3099 movups 0x30($inp),$inout3 3100 movaps $inout2,$in2 3101 sub \$0x10,$len 3102 jbe .Lcbc_dec_four # $len is 4*16 or less 3103 3104 movups 0x40($inp),$inout4 # $len is 5*16 or less 3105 movaps $inout3,$in3 3106 movaps $inout4,$in4 3107 xorps $inout5,$inout5 3108 call _aesni_decrypt6 3109 pxor $iv,$inout0 3110 movaps $in4,$iv 3111 pxor $in0,$inout1 3112 movdqu $inout0,($out) 3113 pxor $in1,$inout2 3114 movdqu $inout1,0x10($out) 3115 pxor $inout1,$inout1 # clear register bank 3116 pxor $in2,$inout3 3117 movdqu $inout2,0x20($out) 3118 pxor $inout2,$inout2 3119 pxor $in3,$inout4 3120 movdqu $inout3,0x30($out) 3121 pxor $inout3,$inout3 3122 lea 0x40($out),$out 3123 movdqa $inout4,$inout0 3124 pxor $inout4,$inout4 3125 pxor $inout5,$inout5 3126 sub \$0x10,$len 3127 jmp .Lcbc_dec_tail_collected 3128 3129.align 16 3130.Lcbc_dec_one: 3131 movaps $inout0,$in0 3132___ 3133 &aesni_generate1("dec",$key,$rounds); 3134$code.=<<___; 3135 xorps $iv,$inout0 3136 movaps $in0,$iv 3137 jmp .Lcbc_dec_tail_collected 3138.align 16 3139.Lcbc_dec_two: 3140 movaps $inout1,$in1 3141 call _aesni_decrypt2 3142 pxor $iv,$inout0 
3143 movaps $in1,$iv 3144 pxor $in0,$inout1 3145 movdqu $inout0,($out) 3146 movdqa $inout1,$inout0 3147 pxor $inout1,$inout1 # clear register bank 3148 lea 0x10($out),$out 3149 jmp .Lcbc_dec_tail_collected 3150.align 16 3151.Lcbc_dec_three: 3152 movaps $inout2,$in2 3153 call _aesni_decrypt3 3154 pxor $iv,$inout0 3155 movaps $in2,$iv 3156 pxor $in0,$inout1 3157 movdqu $inout0,($out) 3158 pxor $in1,$inout2 3159 movdqu $inout1,0x10($out) 3160 pxor $inout1,$inout1 # clear register bank 3161 movdqa $inout2,$inout0 3162 pxor $inout2,$inout2 3163 lea 0x20($out),$out 3164 jmp .Lcbc_dec_tail_collected 3165.align 16 3166.Lcbc_dec_four: 3167 movaps $inout3,$in3 3168 call _aesni_decrypt4 3169 pxor $iv,$inout0 3170 movaps $in3,$iv 3171 pxor $in0,$inout1 3172 movdqu $inout0,($out) 3173 pxor $in1,$inout2 3174 movdqu $inout1,0x10($out) 3175 pxor $inout1,$inout1 # clear register bank 3176 pxor $in2,$inout3 3177 movdqu $inout2,0x20($out) 3178 pxor $inout2,$inout2 3179 movdqa $inout3,$inout0 3180 pxor $inout3,$inout3 3181 lea 0x30($out),$out 3182 jmp .Lcbc_dec_tail_collected 3183 3184.align 16 3185.Lcbc_dec_clear_tail_collected: 3186 pxor $inout1,$inout1 # clear register bank 3187 pxor $inout2,$inout2 3188 pxor $inout3,$inout3 3189___ 3190$code.=<<___ if (!$win64); 3191 pxor $inout4,$inout4 # %xmm6..9 3192 pxor $inout5,$inout5 3193 pxor $inout6,$inout6 3194 pxor $inout7,$inout7 3195___ 3196$code.=<<___; 3197.Lcbc_dec_tail_collected: 3198 movups $iv,($ivp) 3199 and \$15,$len 3200 jnz .Lcbc_dec_tail_partial 3201 movups $inout0,($out) 3202 pxor $inout0,$inout0 3203 jmp .Lcbc_dec_ret 3204.align 16 3205.Lcbc_dec_tail_partial: 3206 movaps $inout0,(%rsp) 3207 pxor $inout0,$inout0 3208 mov \$16,%rcx 3209 mov $out,%rdi 3210 sub $len,%rcx 3211 lea (%rsp),%rsi 3212 .long 0x9066A4F3 # rep movsb 3213 movdqa $inout0,(%rsp) 3214 3215.Lcbc_dec_ret: 3216 xorps $rndkey0,$rndkey0 # %xmm0 3217 pxor $rndkey1,$rndkey1 3218___ 3219$code.=<<___ if ($win64); 3220 movaps 0x10(%rsp),%xmm6 3221 movaps 
%xmm0,0x10(%rsp)	# clear stack
	movaps	0x20(%rsp),%xmm7
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm8
	movaps	%xmm0,0x30(%rsp)
	movaps	0x40(%rsp),%xmm9
	movaps	%xmm0,0x40(%rsp)
	movaps	0x50(%rsp),%xmm10
	movaps	%xmm0,0x50(%rsp)
	movaps	0x60(%rsp),%xmm11
	movaps	%xmm0,0x60(%rsp)
	movaps	0x70(%rsp),%xmm12
	movaps	%xmm0,0x70(%rsp)
	movaps	0x80(%rsp),%xmm13
	movaps	%xmm0,0x80(%rsp)
	movaps	0x90(%rsp),%xmm14
	movaps	%xmm0,0x90(%rsp)
	movaps	0xa0(%rsp),%xmm15
	movaps	%xmm0,0xa0(%rsp)
___
$code.=<<___;
	lea	(%rbp),%rsp
	pop	%rbp
.Lcbc_ret:
	ret
.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
___
}
# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
#				int bits, AES_KEY *key)
#
# input:	$inp	user-supplied key
#		$bits	$inp length in bits
#		$key	pointer to key schedule
# output:	%eax	0 denoting success, -1 or -2 - failure (see C)
#		*$key	key schedule
#
# Builds the decryption schedule by first expanding the encryption
# schedule (via __aesni_set_encrypt_key), then reversing the order of
# the round keys and applying InvMixColumns (aesimc) to the inner ones,
# as required by the aesdec/aesdeclast "Equivalent Inverse Cipher".
{ my ($inp,$bits,$key) = @_4args;
  $bits =~ s/%r/%e/;	# refer to the 32-bit half of the $bits register

$code.=<<___;
.globl	${PREFIX}_set_decrypt_key
.type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
.align	16
${PREFIX}_set_decrypt_key:
	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
	call	__aesni_set_encrypt_key
	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
	test	%eax,%eax
	jnz	.Ldec_key_ret		# propagate set_encrypt_key failure
	lea	16($key,$bits),$inp	# points at the end of key schedule

	# First swap the outermost pair (round 0 and last round) without
	# aesimc - first and last round keys are used as-is by aesdec flow.
	$movkey	($key),%xmm0		# just swap
	$movkey	($inp),%xmm1
	$movkey	%xmm0,($inp)
	$movkey	%xmm1,($key)
	lea	16($key),$key
	lea	-16($inp),$inp

.Ldec_key_inverse:
	# Walk inwards from both ends: swap each pair of round keys and
	# run both through InvMixColumns.
	$movkey	($key),%xmm0		# swap and inverse
	$movkey	($inp),%xmm1
	aesimc	%xmm0,%xmm0
	aesimc	%xmm1,%xmm1
	lea	16($key),$key
	lea	-16($inp),$inp
	$movkey	%xmm0,16($inp)
	$movkey	%xmm1,-16($key)
	cmp	$key,$inp
	ja	.Ldec_key_inverse

	$movkey	($key),%xmm0		# inverse middle
	aesimc	%xmm0,%xmm0
	pxor	%xmm1,%xmm1		# clear key material from registers
	$movkey	%xmm0,($inp)
	pxor	%xmm0,%xmm0
.Ldec_key_ret:
	add	\$8,%rsp
	ret
.LSEH_end_set_decrypt_key:
.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
___

# This is based on submission by
#
#	Huang Ying <ying.huang@intel.com>
#	Vinodh Gopal <vinodh.gopal@intel.com>
#	Kahraman Akdemir
#
# Aggressively optimized in respect to aeskeygenassist's critical path
# and is contained in %xmm0-5 to meet Win64 ABI requirement.
#
# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
#				int bits, AES_KEY * const key);
#
# input:	$inp	user-supplied key
#		$bits	$inp length in bits
#		$key	pointer to key schedule
# output:	%eax	0 denoting success, -1 or -2 - failure (see C)
#		$bits	rounds-1 (used in aesni_set_decrypt_key)
#		*$key	key schedule
#		$key	pointer to key schedule (used in
#			aesni_set_decrypt_key)
#
# Subroutine is frame-less, which means that only volatile registers
# are used. Note that it's declared "abi-omnipotent", which means that
# amount of volatile registers is smaller on Windows.
#
$code.=<<___;
.globl	${PREFIX}_set_encrypt_key
.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
.align	16
${PREFIX}_set_encrypt_key:
__aesni_set_encrypt_key:
	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
	mov	\$-1,%rax		# preset return value to failure
	test	$inp,$inp
	jz	.Lenc_key_ret		# NULL key pointer
	test	$key,$key
	jz	.Lenc_key_ret		# NULL schedule pointer

	mov	\$`1<<28|1<<11`,%r10d	# AVX and XOP bits
	movups	($inp),%xmm0		# pull first 128 bits of *userKey
	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
	and	OPENSSL_ia32cap_P+4(%rip),%r10d
	lea	16($key),%rax		# %rax is used as modifiable copy of $key
	# Dispatch on key length; anything but 128/192/256 is rejected.
	cmp	\$256,$bits
	je	.L14rounds
	cmp	\$192,$bits
	je	.L12rounds
	cmp	\$128,$bits
	jne	.Lbad_keybits

.L10rounds:
	mov	\$9,$bits			# 10 rounds for 128-bit key
	cmp	\$`1<<28`,%r10d		# AVX, but no XOP
	je	.L10rounds_alt		# alternative path w/o aeskeygenassist

	$movkey	%xmm0,($key)			# round 0
	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
	call		.Lkey_expansion_128_cold
	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 2
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 3
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 4
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 5
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 6
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x40,%xmm0,%xmm1	# round 7
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x80,%xmm0,%xmm1	# round 8
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x1b,%xmm0,%xmm1	# round 9
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x36,%xmm0,%xmm1	# round 10
	call		.Lkey_expansion_128
	$movkey	%xmm0,(%rax)
	mov	$bits,80(%rax)	# 240(%rdx)
	xor	%eax,%eax	# success
	jmp	.Lenc_key_ret

.align	16
.L10rounds_alt:
	# Expansion without aeskeygenassist: RotWord/SubWord is emulated
	# with pshufb + aesenclast (rcon supplied in %xmm4).
	movdqa	.Lkey_rotate(%rip),%xmm5
	mov	\$8,%r10d
	movdqa	.Lkey_rcon1(%rip),%xmm4
	movdqa	%xmm0,%xmm2
	movdqu	%xmm0,($key)
jmp	.Loop_key128

.align	16
.Loop_key128:
	# One 128-bit key-expansion step: rotate+SubBytes via
	# pshufb/aesenclast, then propagate the previous round key
	# through the usual sliding-XOR before mixing in the new word.
	pshufb	%xmm5,%xmm0
	aesenclast	%xmm4,%xmm0
	pslld	\$1,%xmm4		# next rcon
	lea	16(%rax),%rax

	movdqa	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm3,%xmm2

	pxor	%xmm2,%xmm0
	movdqu	%xmm0,-16(%rax)
	movdqa	%xmm0,%xmm2

	dec	%r10d
	jnz	.Loop_key128

	# Rounds 9 and 10 use rcon 0x1b and 0x36, loaded separately.
	movdqa	.Lkey_rcon1b(%rip),%xmm4

	pshufb	%xmm5,%xmm0
	aesenclast	%xmm4,%xmm0
	pslld	\$1,%xmm4

	movdqa	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm3,%xmm2

	pxor	%xmm2,%xmm0
	movdqu	%xmm0,(%rax)

	movdqa	%xmm0,%xmm2
	pshufb	%xmm5,%xmm0
	aesenclast	%xmm4,%xmm0

	movdqa	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm3,%xmm2

	pxor	%xmm2,%xmm0
	movdqu	%xmm0,16(%rax)

	mov	$bits,96(%rax)	# 240($key)
	xor	%eax,%eax	# success
	jmp	.Lenc_key_ret

.align	16
.L12rounds:
	movq	16($inp),%xmm2			# remaining 1/3 of *userKey
	mov	\$11,$bits			# 12 rounds for 192
	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
	je	.L12rounds_alt

	$movkey	%xmm0,($key)			# round 0
	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 1,2
	call		.Lkey_expansion_192a_cold
	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 2,3
	call		.Lkey_expansion_192b
	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 4,5
	call		.Lkey_expansion_192a
	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 5,6
	call		.Lkey_expansion_192b
	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 7,8
	call		.Lkey_expansion_192a
	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 8,9
	call		.Lkey_expansion_192b
	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 10,11
	call		.Lkey_expansion_192a
	aeskeygenassist	\$0x80,%xmm2,%xmm1	# round 11,12
	call		.Lkey_expansion_192b
	$movkey	%xmm0,(%rax)
	mov	$bits,48(%rax)	# 240(%rdx)
	xor	%rax, %rax	# success
	jmp	.Lenc_key_ret

.align	16
.L12rounds_alt:
	# 192-bit expansion without aeskeygenassist; each iteration
	# emits 24 bytes (one and a half round keys).
	movdqa	.Lkey_rotate192(%rip),%xmm5
	movdqa	.Lkey_rcon1(%rip),%xmm4
	mov	\$8,%r10d
	movdqu	%xmm0,($key)
	jmp	.Loop_key192

.align	16
.Loop_key192:
	movq	%xmm2,0(%rax)
	movdqa	%xmm2,%xmm1
	pshufb	%xmm5,%xmm2
	aesenclast	%xmm4,%xmm2
	pslld	\$1, %xmm4		# next rcon
	lea	24(%rax),%rax

	movdqa	%xmm0,%xmm3
	pslldq	\$4,%xmm0
	pxor	%xmm0,%xmm3
	pslldq	\$4,%xmm0
	pxor	%xmm0,%xmm3
	pslldq	\$4,%xmm0
	pxor	%xmm3,%xmm0

	pshufd	\$0xff,%xmm0,%xmm3
	pxor	%xmm1,%xmm3
	pslldq	\$4,%xmm1
	pxor	%xmm1,%xmm3

	pxor	%xmm2,%xmm0
	pxor	%xmm3,%xmm2
	movdqu	%xmm0,-16(%rax)

	dec	%r10d
	jnz	.Loop_key192

	mov	$bits,32(%rax)	# 240($key)
	xor	%eax,%eax	# success
	jmp	.Lenc_key_ret

.align	16
.L14rounds:
	movups	16($inp),%xmm2			# remaining half of *userKey
	mov	\$13,$bits			# 14 rounds for 256
	lea	16(%rax),%rax
	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
	je	.L14rounds_alt

	$movkey	%xmm0,($key)			# round 0
	$movkey	%xmm2,16($key)			# round 1
	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 2
	call		.Lkey_expansion_256a_cold
	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 3
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 4
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 5
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 6
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 7
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 8
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 9
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 10
call		.Lkey_expansion_256a
	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 11
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 12
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 13
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 14
	call		.Lkey_expansion_256a
	$movkey	%xmm0,(%rax)
	mov	$bits,16(%rax)	# 240(%rdx)
	xor	%rax,%rax	# success
	jmp	.Lenc_key_ret

.align	16
.L14rounds_alt:
	# 256-bit expansion without aeskeygenassist; alternates a
	# rotate+SubWord step (with rcon) and a plain SubWord step.
	movdqa	.Lkey_rotate(%rip),%xmm5
	movdqa	.Lkey_rcon1(%rip),%xmm4
	mov	\$7,%r10d
	movdqu	%xmm0,0($key)
	movdqa	%xmm2,%xmm1
	movdqu	%xmm2,16($key)
	jmp	.Loop_key256

.align	16
.Loop_key256:
	pshufb	%xmm5,%xmm2
	aesenclast	%xmm4,%xmm2

	movdqa	%xmm0,%xmm3
	pslldq	\$4,%xmm0
	pxor	%xmm0,%xmm3
	pslldq	\$4,%xmm0
	pxor	%xmm0,%xmm3
	pslldq	\$4,%xmm0
	pxor	%xmm3,%xmm0
	pslld	\$1,%xmm4		# next rcon

	pxor	%xmm2,%xmm0
	movdqu	%xmm0,(%rax)

	dec	%r10d
	jz	.Ldone_key256

	# Odd step: SubWord only (aesenclast with all-zero round key).
	pshufd	\$0xff,%xmm0,%xmm2
	pxor	%xmm3,%xmm3
	aesenclast	%xmm3,%xmm2

	movdqa	%xmm1,%xmm3
	pslldq	\$4,%xmm1
	pxor	%xmm1,%xmm3
	pslldq	\$4,%xmm1
	pxor	%xmm1,%xmm3
	pslldq	\$4,%xmm1
	pxor	%xmm3,%xmm1

	pxor	%xmm1,%xmm2
	movdqu	%xmm2,16(%rax)
	lea	32(%rax),%rax
	movdqa	%xmm2,%xmm1

	jmp	.Loop_key256

.Ldone_key256:
	mov	$bits,16(%rax)	# 240($key)
	xor	%eax,%eax	# success
	jmp	.Lenc_key_ret

.align	16
.Lbad_keybits:
	mov	\$-2,%rax	# unsupported key length
.Lenc_key_ret:
	# Wipe key material from the registers used by the expansion.
	pxor	%xmm0,%xmm0
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	add	\$8,%rsp
	ret
.LSEH_end_set_encrypt_key:

# Shared aeskeygenassist-based expansion tail-calls.  %xmm1 carries the
# aeskeygenassist result, %xmm0/%xmm2 the running key words, %rax the
# output cursor.
.align	16
.Lkey_expansion_128:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_128_cold:
	shufps	\$0b00010000,%xmm0,%xmm4
	xorps	%xmm4, %xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	xorps	%xmm4, %xmm0
	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
	xorps	%xmm1,%xmm0
	ret

.align 16
.Lkey_expansion_192a:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_192a_cold:
	movaps	%xmm2, %xmm5
.Lkey_expansion_192b_warm:
	shufps	\$0b00010000,%xmm0,%xmm4
	movdqa	%xmm2,%xmm3
	xorps	%xmm4,%xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	pslldq	\$4,%xmm3
	xorps	%xmm4,%xmm0
	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
	pxor	%xmm3,%xmm2
	pxor	%xmm1,%xmm0
	pshufd	\$0b11111111,%xmm0,%xmm3
	pxor	%xmm3,%xmm2
	ret

.align 16
.Lkey_expansion_192b:
	movaps	%xmm0,%xmm3
	shufps	\$0b01000100,%xmm0,%xmm5
	$movkey	%xmm5,(%rax)
	shufps	\$0b01001110,%xmm2,%xmm3
	$movkey	%xmm3,16(%rax)
	lea	32(%rax),%rax
	jmp	.Lkey_expansion_192b_warm

.align	16
.Lkey_expansion_256a:
	$movkey	%xmm2,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_256a_cold:
	shufps	\$0b00010000,%xmm0,%xmm4
	xorps	%xmm4,%xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	xorps	%xmm4,%xmm0
	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
	xorps	%xmm1,%xmm0
	ret

.align	16
.Lkey_expansion_256b:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax

	shufps	\$0b00010000,%xmm2,%xmm4
	xorps	%xmm4,%xmm2
	shufps	\$0b10001100,%xmm2,%xmm4
	xorps	%xmm4,%xmm2
	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
	xorps	%xmm1,%xmm2
	ret
.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
___
}

# Constant tables shared by the code above (byte-swap/increment masks,
# XTS tweak constant, and the pshufb/rcon vectors for the _alt paths).
$code.=<<___;
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lincrement32:
	.long	6,6,6,0
.Lincrement64:
	.long	1,0,0,0
.Lxts_magic:
	.long	0x87,0,1,0
.Lincrement1:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Lkey_rotate:
	.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
.Lkey_rotate192:
	.long	0x04070605,0x04070605,0x04070605,0x04070605
.Lkey_rcon1:
	.long	1,1,1,1
.Lkey_rcon1b:
	.long	0x1b,0x1b,0x1b,0x1b

.asciz  "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
___
$code.=<<___ if ($PREFIX eq "aesni");
.type	ecb_ccm64_se_handler,\@abi-omnipotent
.align	16
ecb_ccm64_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	0(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0x58(%rax),%rax		# adjust stack pointer

	jmp	.Lcommon_seh_tail
.size	ecb_ccm64_se_handler,.-ecb_ccm64_se_handler

.type	ctr_xts_se_handler,\@abi-omnipotent
.align	16
ctr_xts_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	160($context),%rax	# pull context->Rbp
	lea	-0xa0(%rax),%rsi	# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_rbp_tail
.size	ctr_xts_se_handler,.-ctr_xts_se_handler
___
$code.=<<___;
# Unwind handler for ${PREFIX}_cbc_encrypt: locates the interrupted
# point relative to the bulk-decrypt prologue/epilogue labels and
# restores the saved %xmm registers and %rbp accordingly.
.type	cbc_se_handler,\@abi-omnipotent
.align	16
cbc_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	152($context),%rax	# pull context->Rsp
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lcbc_decrypt_bulk(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<"prologue" label
	jb	.Lcommon_seh_tail

	lea	.Lcbc_decrypt_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
	jb	.Lrestore_cbc_rax

	lea	.Lcbc_ret(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>="epilogue" label
	jae	.Lcommon_seh_tail

	lea	16(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

.Lcommon_rbp_tail:
	mov	160($context),%rax	# pull context->Rbp
	mov	(%rax),%rbp		# restore saved %rbp
	lea	8(%rax),%rax		# adjust stack pointer
	mov	%rbp,160($context)	# restore context->Rbp
	jmp	.Lcommon_seh_tail

.Lrestore_cbc_rax:
	mov	120($context),%rax

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	cbc_se_handler,.-cbc_se_handler

.section	.pdata
.align	4
___
$code.=<<___ if ($PREFIX eq "aesni");
	.rva	.LSEH_begin_aesni_ecb_encrypt
	.rva	.LSEH_end_aesni_ecb_encrypt
	.rva	.LSEH_info_ecb

	.rva	.LSEH_begin_aesni_ccm64_encrypt_blocks
	.rva	.LSEH_end_aesni_ccm64_encrypt_blocks
	.rva	.LSEH_info_ccm64_enc

	.rva	.LSEH_begin_aesni_ccm64_decrypt_blocks
	.rva	.LSEH_end_aesni_ccm64_decrypt_blocks
	.rva	.LSEH_info_ccm64_dec

	.rva	.LSEH_begin_aesni_ctr32_encrypt_blocks
	.rva	.LSEH_end_aesni_ctr32_encrypt_blocks
	.rva	.LSEH_info_ctr32

	.rva	.LSEH_begin_aesni_xts_encrypt
	.rva	.LSEH_end_aesni_xts_encrypt
	.rva	.LSEH_info_xts_enc

	.rva	.LSEH_begin_aesni_xts_decrypt
	.rva	.LSEH_end_aesni_xts_decrypt
	.rva	.LSEH_info_xts_dec
___
$code.=<<___;
	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
	.rva	.LSEH_info_cbc

	.rva	${PREFIX}_set_decrypt_key
	.rva	.LSEH_end_set_decrypt_key
	.rva	.LSEH_info_key

	.rva	${PREFIX}_set_encrypt_key
	.rva	.LSEH_end_set_encrypt_key
	.rva	.LSEH_info_key
.section	.xdata
.align	8
___
$code.=<<___ if ($PREFIX eq "aesni");
.LSEH_info_ecb:
	.byte	9,0,0,0
	.rva	ecb_ccm64_se_handler
	.rva	.Lecb_enc_body,.Lecb_enc_ret		# HandlerData[]
.LSEH_info_ccm64_enc:
	.byte	9,0,0,0
	.rva	ecb_ccm64_se_handler
	.rva	.Lccm64_enc_body,.Lccm64_enc_ret	# HandlerData[]
.LSEH_info_ccm64_dec:
	.byte	9,0,0,0
	.rva	ecb_ccm64_se_handler
	.rva	.Lccm64_dec_body,.Lccm64_dec_ret	# HandlerData[]
.LSEH_info_ctr32:
	.byte	9,0,0,0
	.rva	ctr_xts_se_handler
	.rva	.Lctr32_body,.Lctr32_epilogue		# HandlerData[]
.LSEH_info_xts_enc:
	.byte	9,0,0,0
	.rva	ctr_xts_se_handler
	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
.LSEH_info_xts_dec:
	.byte	9,0,0,0
	.rva	ctr_xts_se_handler
	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
___
$code.=<<___;
.LSEH_info_cbc:
	.byte	9,0,0,0
	.rva	cbc_se_handler
.LSEH_info_key:
	.byte	0x01,0x04,0x01,0x00
	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
___
}

# Append a REX prefix byte to @opcode (passed by glob ref) when either
# operand register number is >= 8, i.e. %xmm8-%xmm15.
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);	# REX.R
    $rex|=0x01			if($src>=8);	# REX.B
    push @opcode,$rex|0x40	if($rex);
}

# Translate an AES-NI mnemonic line into an equivalent ".byte" sequence
# so the module can be assembled by assemblers that pre-date AES-NI.
# Handles aeskeygenassist (imm8 form), the reg-reg forms of
# aesimc/aesenc/aesenclast/aesdec/aesdeclast, and the %rsp-relative
# memory forms; any other line is returned unmodified.
sub aesni {
  my $line=shift;
  my @opcode=(0x66);	# mandatory prefix for the 0x0f 0x38/0x3a maps

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	rex(\@opcode,$4,$3);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($3&7)|(($4&7)<<3);	# ModR/M
	my $c=$2;
	push @opcode,$c=~/^0/?oct($c):$c;	# oct() also handles 0x.. hex
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return undef if (!defined($opcodelet{$1}));
	rex(\@opcode,$3,$2);
	push @opcode,0x0f,0x38,$opcodelet{$1};
	push @opcode,0xc0|($2&7)|(($3&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return undef if (!defined($opcodelet{$1}));
	my $off = $2;
	push @opcode,0x44 if ($3>=8);	# REX.R for %xmm8-15 destination
	push @opcode,0x0f,0x38,$opcodelet{$1};
	push @opcode,0x44|(($3&7)<<3),0x24;	# ModR/M: disp8(%rsp)
	push @opcode,($off=~/^0/?oct($off):$off)&0xff;
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}

# Encode "movbe %eax,disp8(%rsp)" as raw bytes (for pre-movbe assemblers).
sub movbe {
	".byte	0x0f,0x38,0xf1,0x44,0x24,".shift;
}

# Post-processing: evaluate `...` arithmetic, then rewrite AES-NI and
# movbe mnemonics as .byte sequences before emitting the assembly.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm;	# debugging artefact
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;

print $code;

close STDOUT;