#! /usr/bin/env perl
# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
# details].
#
# Performance.
#
# Given aes(enc|dec) instructions' latency asymptotic performance for
# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
# processed with 128-bit key. And given their throughput asymptotic
# performance for parallelizable modes is 1.25 cycles per byte. Being
# asymptotic limit it's not something you commonly achieve in reality,
# but how close does one get? Below are results collected for
# different modes and block sizes. Pairs of numbers are for en-/
# decryption.
#
#	16-byte     64-byte     256-byte    1-KB        8-KB
# ECB	4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26   1.26/1.26
# CTR	5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
# CBC	4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
# CCM	5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
# OFB	5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
# CFB	5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
#
# ECB, CTR, CBC and CCM results are free from EVP overhead.
# This means
# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
# The results were collected with specially crafted speed.c benchmark
# in order to compare them with results reported in "Intel Advanced
# Encryption Standard (AES) New Instruction Set" White Paper Revision
# 3.0 dated May 2010. All above results are consistently better. This
# module also provides better performance for block sizes smaller than
# 128 bytes in points *not* represented in the above table.
#
# Looking at the results for 8-KB buffer.
#
# CFB and OFB results are far from the limit, because implementation
# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
# single-block aesni_encrypt, which is not the most optimal way to go.
# CBC encrypt result is unexpectedly high and there is no documented
# explanation for it. Seemingly there is a small penalty for feeding
# the result back to AES unit the way it's done in CBC mode. There is
# nothing one can do and the result appears optimal. CCM result is
# identical to CBC, because CBC-MAC is essentially CBC encrypt without
# saving output. CCM CTR "stays invisible," because it's neatly
# interleaved with CBC-MAC. This provides ~30% improvement over
# "straightforward" CCM implementation with CTR and CBC-MAC performed
# disjointly. Parallelizable modes practically achieve the theoretical
# limit.
#
# Looking at how results vary with buffer size.
#
# Curves are practically saturated at 1-KB buffer size. In most cases
# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
# CTR curve doesn't follow this pattern and is "slowest" changing one
# with "256-byte" result being 87% of "8-KB." This is because overhead
# in CTR mode is most computationally intensive.
# Small-block CCM
# decrypt is slower than encrypt, because first CTR and last CBC-MAC
# iterations can't be interleaved.
#
# Results for 192- and 256-bit keys.
#
# EVP-free results were observed to scale perfectly with number of
# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
# are a tad smaller, because the above mentioned penalty biases all
# results by same constant value. In similar way function call
# overhead affects small-block performance, as well as OFB and CFB
# results. Differences are not large, most common coefficients are
# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
# can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...

# January 2011
#
# While Westmere processor features 6 cycles latency for aes[enc|dec]
# instructions, which can be scheduled every second cycle, Sandy
# Bridge spends 8 cycles per instruction, but it can schedule them
# every cycle. This means that code targeting Westmere would perform
# suboptimally on Sandy Bridge. Therefore this update.
#
# In addition, non-parallelizable CBC encrypt (as well as CCM) is
# optimized. Relative improvement might appear modest, 8% on Westmere,
# but in absolute terms it's 3.77 cycles per byte encrypted with
# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
# should be compared to asymptotic limits of 3.75 for Westmere and
# 5.00 for Sandy Bridge. Actually, the fact that they get this close
# to asymptotic limits is quite amazing. Indeed, the limit is
# calculated as latency times number of rounds, 10 for 128-bit key,
# and divided by 16, the number of bytes in block, or in other words
# it accounts *solely* for aesenc instructions.
# But there are extra
# instructions, and numbers so close to the asymptotic limits mean
# that it's as if it takes as little as *one* additional cycle to
# execute all of them. How is it possible? It is possible thanks to
# out-of-order execution logic, which manages to overlap post-
# processing of previous block, things like saving the output, with
# actual encryption of current block, as well as pre-processing of
# current block, things like fetching input and xor-ing it with
# 0-round element of the key schedule, with actual encryption of
# previous block. Keep this in mind...
#
# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
# performance is achieved by interleaving instructions working on
# independent blocks. In which case asymptotic limit for such modes
# can be obtained by dividing above mentioned numbers by AES
# instructions' interleave factor. Westmere can execute at most 3
# instructions at a time, meaning that optimal interleave factor is 3,
# and that's where the "magic" number of 1.25 comes from. "Optimal
# interleave factor" means that increase of interleave factor does
# not improve performance. The formula has proven to reflect reality
# pretty well on Westmere... Sandy Bridge on the other hand can
# execute up to 8 AES instructions at a time, so how does varying
# interleave factor affect the performance? Here is table for ECB
# (numbers are cycles per byte processed with 128-bit key):
#
# instruction interleave factor		3x	6x	8x
# theoretical asymptotic limit		1.67	0.83	0.625
# measured performance for 8KB block	1.05	0.86	0.84
#
# "as if" interleave factor		4.7x	5.8x	6.0x
#
# Further data for other parallelizable modes:
#
# CBC decrypt				1.16	0.93	0.74
# CTR					1.14	0.91	0.74
#
# Well, given 3x column it's probably inappropriate to call the limit
# asymptotic, if it can be surpassed, isn't it? What happens there?
# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
# magic is responsible for this. Processor overlaps not only the
# additional instructions with AES ones, but even AES instructions
# processing adjacent triplets of independent blocks. In the 6x case
# additional instructions still claim disproportionally small amount
# of additional cycles, but in 8x case number of instructions must be
# a tad too high for out-of-order logic to cope with, and AES unit
# remains underutilized... As you can see 8x interleave is hardly
# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
# utilizes 6x interleave because of limited register bank capacity.
#
# Higher interleave factors do have negative impact on Westmere
# performance. While for ECB mode it's negligible ~1.5%, other
# parallelizables perform ~5% worse, which is outweighed by ~25%
# improvement on Sandy Bridge. To balance regression on Westmere
# CTR mode was implemented with 6x aesenc interleave factor.

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
# in CTR mode AES instruction interleave factor was chosen to be 6x.

# November 2015
#
# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
# chosen to be 6x.

######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#		CBC en-/decrypt	CTR	XTS	ECB	OCB
# Westmere	3.77/1.25	1.25	1.25	1.26
# * Bridge	5.07/0.74	0.75	0.90	0.85	0.98
# Haswell	4.44/0.63	0.63	0.73	0.63	0.70
# Skylake	2.62/0.63	0.63	0.63	0.63
# Silvermont	5.75/3.54	3.56	4.12	3.87(*)	4.11
# Knights L	2.54/0.77	0.78	0.85	-	1.50
# Goldmont	3.82/1.26	1.26	1.29	1.29	1.50
# Bulldozer	5.77/0.70	0.72	0.90	0.70	0.95
# Ryzen		2.71/0.35	0.35	0.44	0.38	0.49
#
# (*)	Atom Silvermont ECB result is suboptimal because of penalties
#	incurred by operations on %xmm8-15. As ECB is not considered
#	critical, nothing was done to mitigate the problem.

$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-x86_64.pl:-)

# Command line is "[flavour] output". If the first argument contains a
# dot it is taken to be the output file name and no flavour is set.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 calling convention is selected either by flavour or by an
# output file name ending in ".asm".
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the x86_64-xlate.pl translator next to this script first,
# then in ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator. Failure to start it must be fatal: otherwise the script
# would carry on and report success without producing any output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Instruction used to load round keys. Both arms of the conditional
# currently select the same unaligned move.
$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

# $code accumulates the generated assembly text.
$code=".text\n";
$code.=".extern	OPENSSL_ia32cap_P\n";

$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
$inp="%rdi";
$out="%rsi";
$len="%rdx";
$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
$ivp="%r8";	# cbc, ctr, ...

$rnds_="%r10d";	# backup copy for $rounds
$key_="%r11";	# backup copy for $key

# %xmm register layout
$rndkey0="%xmm0";	$rndkey1="%xmm1";
$inout0="%xmm2";	$inout1="%xmm3";
$inout2="%xmm4";	$inout3="%xmm5";
$inout4="%xmm6";	$inout5="%xmm7";
$inout6="%xmm8";	$inout7="%xmm9";

# Note that $in2/$in1/$in0/$iv alias the same registers as
# $inout4..$inout7 above.
$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
$in0="%xmm8";		$iv="%xmm9";

# Inline version of internal aesni_[en|de]crypt1.
#
# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
# cycles which take care of loop variables...
#
# aesni_generate1($p,$key,$rounds[,$inout[,$ivec]]) appends a
# single-block code sequence to $code:
#   $p      - "enc" or "dec", interpolated into the aes${p} and
#             aes${p}last mnemonics and into the loop label;
#   $key    - register holding the key schedule pointer, advanced by
#             the emitted code;
#   $rounds - register holding the round count, decremented to zero by
#             the emitted code;
#   $inout  - register holding the block, defaults to $inout0;
#   $ivec   - optional register; when given, the round-0 key is xor-ed
#             into $ivec and $ivec is then xor-ed into $inout, instead
#             of xor-ing the round-0 key into $inout directly.
# $sn is bumped on every call so that each expansion gets a unique
# .Loop_${p}1_$sn label.
{ my $sn;
sub aesni_generate1 {
my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
++$sn;
$code.=<<___;
	$movkey	($key),$rndkey0
	$movkey	16($key),$rndkey1
___
$code.=<<___ if (defined($ivec));
	xorps	$rndkey0,$ivec
	lea	32($key),$key
	xorps	$ivec,$inout
___
$code.=<<___ if (!defined($ivec));
	lea	32($key),$key
	xorps	$rndkey0,$inout
___
$code.=<<___;
.Loop_${p}1_$sn:
	aes${p}	$rndkey1,$inout
	dec	$rounds
	$movkey	($key),$rndkey1
	lea	16($key),$key
	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
	aes${p}last	$rndkey1,$inout
___
}}
# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
#
# The three arguments are taken from @_4args, so the lexical $inp, $out
# and $key below follow the platform's argument order and shadow the
# globals for the extent of this block only.
{ my ($inp,$out,$key) = @_4args;

$code.=<<___;
.globl	${PREFIX}_encrypt
.type	${PREFIX}_encrypt,\@abi-omnipotent
.align	16
${PREFIX}_encrypt:
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	pxor	$rndkey0,$rndkey0	# clear register bank
	pxor	$rndkey1,$rndkey1
	movups	$inout0,($out)		# output
	pxor	$inout0,$inout0
	ret
.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt

.globl	${PREFIX}_decrypt
.type	${PREFIX}_decrypt,\@abi-omnipotent
.align	16
${PREFIX}_decrypt:
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	pxor	$rndkey0,$rndkey0	# clear register bank
	pxor	$rndkey1,$rndkey1
	movups	$inout0,($out)		# output
	pxor	$inout0,$inout0
	ret
.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
___
}

# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why 3x subroutine were originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge and "super-optimal" for other Intel CPUs...

# aesni_generate2($dir) appends the private routine _aesni_${dir}rypt2
# to $code; $dir is "enc" or "dec" and is interpolated into the routine
# name, the loop label and the aes${dir}/aes${dir}last mnemonics.
# The emitted code turns $rounds into a byte offset (shl 4), points
# $key past the schedule and walks it with a negative index in %rax,
# the 64-bit register that contains $rounds (%eax); the loop ends when
# that index reaches zero.
sub aesni_generate2 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-1] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt2,\@abi-omnipotent
.align	16
_aesni_${dir}rypt2:
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	add	\$16,%rax

.L${dir}_loop2:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop2

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	ret
.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
___
}
# aesni_generate3($dir) appends the private routine _aesni_${dir}rypt3
# to $code. Same structure as the 2x routine, with a third block in
# $inout2 processed alongside $inout0 and $inout1.
sub aesni_generate3 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-2] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt3,\@abi-omnipotent
.align	16
_aesni_${dir}rypt3:
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	add	\$16,%rax

.L${dir}_loop3:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop3

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	ret
.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
___
}
# 4x interleave is implemented to improve small block performance,
# most notably
# [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
#
# aesni_generate4($dir) appends the private routine _aesni_${dir}rypt4
# to $code. $dir is interpolated into the routine name, the loop label
# and the aes${dir}/aes${dir}last mnemonics, so "enc" yields
# _aesni_encrypt4 built from aesenc/aesenclast.
# The emitted code scales $rounds to a byte offset, points $key past
# the schedule and indexes it with a negative offset in %rax, stepping
# two round keys per loop iteration until the offset reaches zero.
# NOTE(review): the ".byte 0x0f,0x1f,0x00" sequence is a 3-byte NOP
# encoding, presumably there to pad the loop entry - confirm.
sub aesni_generate4 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-3] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt4,\@abi-omnipotent
.align	16
_aesni_${dir}rypt4:
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	xorps	$rndkey0,$inout3
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	.byte	0x0f,0x1f,0x00
	add	\$16,%rax

.L${dir}_loop4:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop4

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	ret
.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
___
}
# aesni_generate6($dir) appends the private routine _aesni_${dir}rypt6
# to $code. Unlike the 2x-4x routines, the prologue already applies the
# first round to $inout0..$inout2 while the round-0 key is still being
# xor-ed into $inout3..$inout5. It then jumps to
# .L${dir}_loop6_enter, which sits in the middle of the loop body, so
# that the first pass applies that round only to the remaining three
# blocks.
sub aesni_generate6 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-5] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt6,\@abi-omnipotent
.align	16
_aesni_${dir}rypt6:
	$movkey		($key),$rndkey0
	shl		\$4,$rounds
	$movkey		16($key),$rndkey1
	xorps		$rndkey0,$inout0
	pxor		$rndkey0,$inout1
	pxor		$rndkey0,$inout2
	aes${dir}	$rndkey1,$inout0
	lea		32($key,$rounds),$key
	neg		%rax			# $rounds
	aes${dir}	$rndkey1,$inout1
	pxor		$rndkey0,$inout3
	pxor		$rndkey0,$inout4
	aes${dir}	$rndkey1,$inout2
	pxor		$rndkey0,$inout5
	$movkey		($key,%rax),$rndkey0
	add		\$16,%rax
	jmp		.L${dir}_loop6_enter
.align	16
.L${dir}_loop6:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
.L${dir}_loop6_enter:
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop6

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	ret
.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
___
}
# aesni_generate8($dir) appends the private routine _aesni_${dir}rypt8
# to $code. The prologue applies the first round to $inout0 and
# $inout1 while the round-0 key is xor-ed into the other blocks, then
# jumps to .L${dir}_loop8_inner so that the first pass applies that
# round only to $inout2..$inout7.
# NOTE(review): .L${dir}_loop8_enter is defined but not referenced
# inside this routine; presumably it is an entry point used by code
# elsewhere in the file - confirm before touching it.
sub aesni_generate8 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-7] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt8,\@abi-omnipotent
.align	16
_aesni_${dir}rypt8:
	$movkey		($key),$rndkey0
	shl		\$4,$rounds
	$movkey		16($key),$rndkey1
	xorps		$rndkey0,$inout0
	xorps		$rndkey0,$inout1
	pxor		$rndkey0,$inout2
	pxor		$rndkey0,$inout3
	pxor		$rndkey0,$inout4
	lea		32($key,$rounds),$key
	neg		%rax			# $rounds
	aes${dir}	$rndkey1,$inout0
	pxor		$rndkey0,$inout5
	pxor		$rndkey0,$inout6
	aes${dir}	$rndkey1,$inout1
	pxor		$rndkey0,$inout7
	$movkey		($key,%rax),$rndkey0
	add		\$16,%rax
	jmp		.L${dir}_loop8_inner
.align	16
.L${dir}_loop8:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
.L${dir}_loop8_inner:
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
.L${dir}_loop8_enter:
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	aes${dir}	$rndkey0,$inout6
	aes${dir}	$rndkey0,$inout7
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop8

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	aes${dir}last	$rndkey0,$inout6
	aes${dir}last	$rndkey0,$inout7
	ret
.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
___
}
# Emit the interleaved routines. Decrypt variants are always generated;
# encrypt variants only when $PREFIX is "aesni".
&aesni_generate2("enc") if ($PREFIX eq "aesni");
&aesni_generate2("dec");
&aesni_generate3("enc") if ($PREFIX eq
"aesni"); 583&aesni_generate3("dec"); 584&aesni_generate4("enc") if ($PREFIX eq "aesni"); 585&aesni_generate4("dec"); 586&aesni_generate6("enc") if ($PREFIX eq "aesni"); 587&aesni_generate6("dec"); 588&aesni_generate8("enc") if ($PREFIX eq "aesni"); 589&aesni_generate8("dec"); 590 591if ($PREFIX eq "aesni") { 592######################################################################## 593# void aesni_ecb_encrypt (const void *in, void *out, 594# size_t length, const AES_KEY *key, 595# int enc); 596$code.=<<___; 597.globl aesni_ecb_encrypt 598.type aesni_ecb_encrypt,\@function,5 599.align 16 600aesni_ecb_encrypt: 601___ 602$code.=<<___ if ($win64); 603 lea -0x58(%rsp),%rsp 604 movaps %xmm6,(%rsp) # offload $inout4..7 605 movaps %xmm7,0x10(%rsp) 606 movaps %xmm8,0x20(%rsp) 607 movaps %xmm9,0x30(%rsp) 608.Lecb_enc_body: 609___ 610$code.=<<___; 611 and \$-16,$len # if ($len<16) 612 jz .Lecb_ret # return 613 614 mov 240($key),$rounds # key->rounds 615 $movkey ($key),$rndkey0 616 mov $key,$key_ # backup $key 617 mov $rounds,$rnds_ # backup $rounds 618 test %r8d,%r8d # 5th argument 619 jz .Lecb_decrypt 620#--------------------------- ECB ENCRYPT ------------------------------# 621 cmp \$0x80,$len # if ($len<8*16) 622 jb .Lecb_enc_tail # short input 623 624 movdqu ($inp),$inout0 # load 8 input blocks 625 movdqu 0x10($inp),$inout1 626 movdqu 0x20($inp),$inout2 627 movdqu 0x30($inp),$inout3 628 movdqu 0x40($inp),$inout4 629 movdqu 0x50($inp),$inout5 630 movdqu 0x60($inp),$inout6 631 movdqu 0x70($inp),$inout7 632 lea 0x80($inp),$inp # $inp+=8*16 633 sub \$0x80,$len # $len-=8*16 (can be zero) 634 jmp .Lecb_enc_loop8_enter 635.align 16 636.Lecb_enc_loop8: 637 movups $inout0,($out) # store 8 output blocks 638 mov $key_,$key # restore $key 639 movdqu ($inp),$inout0 # load 8 input blocks 640 mov $rnds_,$rounds # restore $rounds 641 movups $inout1,0x10($out) 642 movdqu 0x10($inp),$inout1 643 movups $inout2,0x20($out) 644 movdqu 0x20($inp),$inout2 645 movups $inout3,0x30($out) 646 
movdqu 0x30($inp),$inout3 647 movups $inout4,0x40($out) 648 movdqu 0x40($inp),$inout4 649 movups $inout5,0x50($out) 650 movdqu 0x50($inp),$inout5 651 movups $inout6,0x60($out) 652 movdqu 0x60($inp),$inout6 653 movups $inout7,0x70($out) 654 lea 0x80($out),$out # $out+=8*16 655 movdqu 0x70($inp),$inout7 656 lea 0x80($inp),$inp # $inp+=8*16 657.Lecb_enc_loop8_enter: 658 659 call _aesni_encrypt8 660 661 sub \$0x80,$len 662 jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow 663 664 movups $inout0,($out) # store 8 output blocks 665 mov $key_,$key # restore $key 666 movups $inout1,0x10($out) 667 mov $rnds_,$rounds # restore $rounds 668 movups $inout2,0x20($out) 669 movups $inout3,0x30($out) 670 movups $inout4,0x40($out) 671 movups $inout5,0x50($out) 672 movups $inout6,0x60($out) 673 movups $inout7,0x70($out) 674 lea 0x80($out),$out # $out+=8*16 675 add \$0x80,$len # restore real remaining $len 676 jz .Lecb_ret # done if ($len==0) 677 678.Lecb_enc_tail: # $len is less than 8*16 679 movups ($inp),$inout0 680 cmp \$0x20,$len 681 jb .Lecb_enc_one 682 movups 0x10($inp),$inout1 683 je .Lecb_enc_two 684 movups 0x20($inp),$inout2 685 cmp \$0x40,$len 686 jb .Lecb_enc_three 687 movups 0x30($inp),$inout3 688 je .Lecb_enc_four 689 movups 0x40($inp),$inout4 690 cmp \$0x60,$len 691 jb .Lecb_enc_five 692 movups 0x50($inp),$inout5 693 je .Lecb_enc_six 694 movdqu 0x60($inp),$inout6 695 xorps $inout7,$inout7 696 call _aesni_encrypt8 697 movups $inout0,($out) # store 7 output blocks 698 movups $inout1,0x10($out) 699 movups $inout2,0x20($out) 700 movups $inout3,0x30($out) 701 movups $inout4,0x40($out) 702 movups $inout5,0x50($out) 703 movups $inout6,0x60($out) 704 jmp .Lecb_ret 705.align 16 706.Lecb_enc_one: 707___ 708 &aesni_generate1("enc",$key,$rounds); 709$code.=<<___; 710 movups $inout0,($out) # store one output block 711 jmp .Lecb_ret 712.align 16 713.Lecb_enc_two: 714 call _aesni_encrypt2 715 movups $inout0,($out) # store 2 output blocks 716 movups $inout1,0x10($out) 717 jmp 
.Lecb_ret 718.align 16 719.Lecb_enc_three: 720 call _aesni_encrypt3 721 movups $inout0,($out) # store 3 output blocks 722 movups $inout1,0x10($out) 723 movups $inout2,0x20($out) 724 jmp .Lecb_ret 725.align 16 726.Lecb_enc_four: 727 call _aesni_encrypt4 728 movups $inout0,($out) # store 4 output blocks 729 movups $inout1,0x10($out) 730 movups $inout2,0x20($out) 731 movups $inout3,0x30($out) 732 jmp .Lecb_ret 733.align 16 734.Lecb_enc_five: 735 xorps $inout5,$inout5 736 call _aesni_encrypt6 737 movups $inout0,($out) # store 5 output blocks 738 movups $inout1,0x10($out) 739 movups $inout2,0x20($out) 740 movups $inout3,0x30($out) 741 movups $inout4,0x40($out) 742 jmp .Lecb_ret 743.align 16 744.Lecb_enc_six: 745 call _aesni_encrypt6 746 movups $inout0,($out) # store 6 output blocks 747 movups $inout1,0x10($out) 748 movups $inout2,0x20($out) 749 movups $inout3,0x30($out) 750 movups $inout4,0x40($out) 751 movups $inout5,0x50($out) 752 jmp .Lecb_ret 753#--------------------------- ECB DECRYPT ------------------------------# 754.align 16 755.Lecb_decrypt: 756 cmp \$0x80,$len # if ($len<8*16) 757 jb .Lecb_dec_tail # short input 758 759 movdqu ($inp),$inout0 # load 8 input blocks 760 movdqu 0x10($inp),$inout1 761 movdqu 0x20($inp),$inout2 762 movdqu 0x30($inp),$inout3 763 movdqu 0x40($inp),$inout4 764 movdqu 0x50($inp),$inout5 765 movdqu 0x60($inp),$inout6 766 movdqu 0x70($inp),$inout7 767 lea 0x80($inp),$inp # $inp+=8*16 768 sub \$0x80,$len # $len-=8*16 (can be zero) 769 jmp .Lecb_dec_loop8_enter 770.align 16 771.Lecb_dec_loop8: 772 movups $inout0,($out) # store 8 output blocks 773 mov $key_,$key # restore $key 774 movdqu ($inp),$inout0 # load 8 input blocks 775 mov $rnds_,$rounds # restore $rounds 776 movups $inout1,0x10($out) 777 movdqu 0x10($inp),$inout1 778 movups $inout2,0x20($out) 779 movdqu 0x20($inp),$inout2 780 movups $inout3,0x30($out) 781 movdqu 0x30($inp),$inout3 782 movups $inout4,0x40($out) 783 movdqu 0x40($inp),$inout4 784 movups $inout5,0x50($out) 785 movdqu 
0x50($inp),$inout5 786 movups $inout6,0x60($out) 787 movdqu 0x60($inp),$inout6 788 movups $inout7,0x70($out) 789 lea 0x80($out),$out # $out+=8*16 790 movdqu 0x70($inp),$inout7 791 lea 0x80($inp),$inp # $inp+=8*16 792.Lecb_dec_loop8_enter: 793 794 call _aesni_decrypt8 795 796 $movkey ($key_),$rndkey0 797 sub \$0x80,$len 798 jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow 799 800 movups $inout0,($out) # store 8 output blocks 801 pxor $inout0,$inout0 # clear register bank 802 mov $key_,$key # restore $key 803 movups $inout1,0x10($out) 804 pxor $inout1,$inout1 805 mov $rnds_,$rounds # restore $rounds 806 movups $inout2,0x20($out) 807 pxor $inout2,$inout2 808 movups $inout3,0x30($out) 809 pxor $inout3,$inout3 810 movups $inout4,0x40($out) 811 pxor $inout4,$inout4 812 movups $inout5,0x50($out) 813 pxor $inout5,$inout5 814 movups $inout6,0x60($out) 815 pxor $inout6,$inout6 816 movups $inout7,0x70($out) 817 pxor $inout7,$inout7 818 lea 0x80($out),$out # $out+=8*16 819 add \$0x80,$len # restore real remaining $len 820 jz .Lecb_ret # done if ($len==0) 821 822.Lecb_dec_tail: 823 movups ($inp),$inout0 824 cmp \$0x20,$len 825 jb .Lecb_dec_one 826 movups 0x10($inp),$inout1 827 je .Lecb_dec_two 828 movups 0x20($inp),$inout2 829 cmp \$0x40,$len 830 jb .Lecb_dec_three 831 movups 0x30($inp),$inout3 832 je .Lecb_dec_four 833 movups 0x40($inp),$inout4 834 cmp \$0x60,$len 835 jb .Lecb_dec_five 836 movups 0x50($inp),$inout5 837 je .Lecb_dec_six 838 movups 0x60($inp),$inout6 839 $movkey ($key),$rndkey0 840 xorps $inout7,$inout7 841 call _aesni_decrypt8 842 movups $inout0,($out) # store 7 output blocks 843 pxor $inout0,$inout0 # clear register bank 844 movups $inout1,0x10($out) 845 pxor $inout1,$inout1 846 movups $inout2,0x20($out) 847 pxor $inout2,$inout2 848 movups $inout3,0x30($out) 849 pxor $inout3,$inout3 850 movups $inout4,0x40($out) 851 pxor $inout4,$inout4 852 movups $inout5,0x50($out) 853 pxor $inout5,$inout5 854 movups $inout6,0x60($out) 855 pxor $inout6,$inout6 856 pxor 
$inout7,$inout7 857 jmp .Lecb_ret 858.align 16 859.Lecb_dec_one: 860___ 861 &aesni_generate1("dec",$key,$rounds); 862$code.=<<___; 863 movups $inout0,($out) # store one output block 864 pxor $inout0,$inout0 # clear register bank 865 jmp .Lecb_ret 866.align 16 867.Lecb_dec_two: 868 call _aesni_decrypt2 869 movups $inout0,($out) # store 2 output blocks 870 pxor $inout0,$inout0 # clear register bank 871 movups $inout1,0x10($out) 872 pxor $inout1,$inout1 873 jmp .Lecb_ret 874.align 16 875.Lecb_dec_three: 876 call _aesni_decrypt3 877 movups $inout0,($out) # store 3 output blocks 878 pxor $inout0,$inout0 # clear register bank 879 movups $inout1,0x10($out) 880 pxor $inout1,$inout1 881 movups $inout2,0x20($out) 882 pxor $inout2,$inout2 883 jmp .Lecb_ret 884.align 16 885.Lecb_dec_four: 886 call _aesni_decrypt4 887 movups $inout0,($out) # store 4 output blocks 888 pxor $inout0,$inout0 # clear register bank 889 movups $inout1,0x10($out) 890 pxor $inout1,$inout1 891 movups $inout2,0x20($out) 892 pxor $inout2,$inout2 893 movups $inout3,0x30($out) 894 pxor $inout3,$inout3 895 jmp .Lecb_ret 896.align 16 897.Lecb_dec_five: 898 xorps $inout5,$inout5 899 call _aesni_decrypt6 900 movups $inout0,($out) # store 5 output blocks 901 pxor $inout0,$inout0 # clear register bank 902 movups $inout1,0x10($out) 903 pxor $inout1,$inout1 904 movups $inout2,0x20($out) 905 pxor $inout2,$inout2 906 movups $inout3,0x30($out) 907 pxor $inout3,$inout3 908 movups $inout4,0x40($out) 909 pxor $inout4,$inout4 910 pxor $inout5,$inout5 911 jmp .Lecb_ret 912.align 16 913.Lecb_dec_six: 914 call _aesni_decrypt6 915 movups $inout0,($out) # store 6 output blocks 916 pxor $inout0,$inout0 # clear register bank 917 movups $inout1,0x10($out) 918 pxor $inout1,$inout1 919 movups $inout2,0x20($out) 920 pxor $inout2,$inout2 921 movups $inout3,0x30($out) 922 pxor $inout3,$inout3 923 movups $inout4,0x40($out) 924 pxor $inout4,$inout4 925 movups $inout5,0x50($out) 926 pxor $inout5,$inout5 927 928.Lecb_ret: 929 xorps 
$rndkey0,$rndkey0		# %xmm0
	pxor	$rndkey1,$rndkey1
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	%xmm0,(%rsp)			# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	lea	0x58(%rsp),%rsp
.Lecb_enc_ret:
___
$code.=<<___;
	ret
.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
___

{
######################################################################
# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec,char *cmac);
#
# Handles only complete blocks, operates on 64-bit counter and
# does not update *ivec! Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
{
my $cmac="%r9";	# 6th argument

# Registers held for the whole routine: the 64-bit counter increment,
# the counter block, and the byte-swap shuffle mask.
my $increment="%xmm9";
my $iv="%xmm6";
my $bswap_mask="%xmm7";

# ---- aesni_ccm64_encrypt_blocks -------------------------------------
# Per block: out = inp ^ E(counter), while the running CBC-MAC kept in
# $inout1 absorbs inp.  The counter stream ($inout0) and the MAC stream
# ($inout1) are advanced together in .Lccm64_enc2_loop so that they
# share every round-key load.
$code.=<<___;
.globl	aesni_ccm64_encrypt_blocks
.type	aesni_ccm64_encrypt_blocks,\@function,6
.align	16
aesni_ccm64_encrypt_blocks:
___
# Win64 only: reserve 0x58 bytes and spill %xmm6-%xmm9 before use.
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)		# $iv
	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
	movaps	%xmm8,0x20(%rsp)	# $in0
	movaps	%xmm9,0x30(%rsp)	# $increment
.Lccm64_enc_body:
___
$code.=<<___;
	mov	240($key),$rounds		# key->rounds
	movdqu	($ivp),$iv
	movdqa	.Lincrement64(%rip),$increment
	movdqa	.Lbswap_mask(%rip),$bswap_mask

	shl	\$4,$rounds
	mov	\$16,$rnds_
	lea	0($key),$key_
	movdqu	($cmac),$inout1
	movdqa	$iv,$inout0
	lea	32($key,$rounds),$key		# end of key schedule
	pshufb	$bswap_mask,$iv
	sub	%rax,%r10			# twisted $rounds
	jmp	.Lccm64_enc_outer
.align	16
.Lccm64_enc_outer:
	$movkey	($key_),$rndkey0
	mov	%r10,%rax
	movups	($inp),$in0			# load inp

	xorps	$rndkey0,$inout0		# counter
	$movkey	16($key_),$rndkey1
	xorps	$in0,$rndkey0
	xorps	$rndkey0,$inout1		# cmac^=inp
	$movkey	32($key_),$rndkey0

.Lccm64_enc2_loop:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Lccm64_enc2_loop
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	paddq	$increment,$iv
	dec	$len				# $len-- ($len is in blocks)
	aesenclast	$rndkey0,$inout0
	aesenclast	$rndkey0,$inout1

	lea	16($inp),$inp
	xorps	$inout0,$in0			# inp ^= E(iv)
	movdqa	$iv,$inout0
	movups	$in0,($out)			# save output
	pshufb	$bswap_mask,$inout0
	lea	16($out),$out			# $out+=16
	jnz	.Lccm64_enc_outer		# loop if ($len!=0)

	pxor	$rndkey0,$rndkey0		# clear register bank
	pxor	$rndkey1,$rndkey1
	pxor	$inout0,$inout0
	movups	$inout1,($cmac)			# store resulting mac
	pxor	$inout1,$inout1
	pxor	$in0,$in0
	pxor	$iv,$iv
___
# Win64 only: reload %xmm6-%xmm9, overwrite each spill slot with %xmm0
# and release the 0x58-byte area reserved in the prologue.
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	%xmm0,(%rsp)			# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	lea	0x58(%rsp),%rsp
.Lccm64_enc_ret:
___
$code.=<<___;
	ret
.size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
___
######################################################################
# ---- aesni_ccm64_decrypt_blocks -------------------------------------
# The first counter block is encrypted by the aesni_generate1 call
# below, before the loop (the helper is defined outside this part of
# the file; it presumably leaves E(counter) in $inout0).  Each pass of
# .Lccm64_dec_outer then writes out = inp ^ E(counter) and, unless that
# was the last block, folds the output into the CBC-MAC in $inout1
# while the next counter block is being encrypted.
$code.=<<___;
.globl	aesni_ccm64_decrypt_blocks
.type	aesni_ccm64_decrypt_blocks,\@function,6
.align	16
aesni_ccm64_decrypt_blocks:
___
# Win64 only: same spill layout as the encrypt prologue.
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)		# $iv
	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
	movaps	%xmm8,0x20(%rsp)	# $in0
	movaps	%xmm9,0x30(%rsp)	# $increment
.Lccm64_dec_body:
___
$code.=<<___;
	mov	240($key),$rounds		# key->rounds
	movups	($ivp),$iv
	movdqu	($cmac),$inout1
	movdqa	.Lincrement64(%rip),$increment
	movdqa	.Lbswap_mask(%rip),$bswap_mask

	movaps	$iv,$inout0
	mov	$rounds,$rnds_
	mov	$key,$key_
	pshufb	$bswap_mask,$iv
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	shl	\$4,$rnds_
	mov	\$16,$rounds
	movups	($inp),$in0			# load inp
	paddq	$increment,$iv
	lea	16($inp),$inp			# $inp+=16
	sub	%r10,%rax			# twisted $rounds
	lea	32($key_,$rnds_),$key		# end of key schedule
	mov	%rax,%r10
	jmp	.Lccm64_dec_outer
.align	16
.Lccm64_dec_outer:
	xorps	$inout0,$in0			# inp ^= E(iv)
	movdqa	$iv,$inout0
	movups	$in0,($out)			# save output
	lea	16($out),$out			# $out+=16
	pshufb	$bswap_mask,$inout0

	sub	\$1,$len			# $len-- ($len is in blocks)
	jz	.Lccm64_dec_break		# if ($len==0) break

	$movkey	($key_),$rndkey0
	mov	%r10,%rax
	$movkey	16($key_),$rndkey1
	xorps	$rndkey0,$in0
	xorps	$rndkey0,$inout0
	xorps	$in0,$inout1			# cmac^=out
	$movkey	32($key_),$rndkey0
	jmp	.Lccm64_dec2_loop
.align	16
.Lccm64_dec2_loop:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Lccm64_dec2_loop
	movups	($inp),$in0			# load input
	paddq	$increment,$iv
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenclast	$rndkey0,$inout0
	aesenclast	$rndkey0,$inout1
	lea	16($inp),$inp			# $inp+=16
	jmp	.Lccm64_dec_outer

.align	16
.Lccm64_dec_break:
	#xorps	$in0,$inout1			# cmac^=out
	mov	240($key_),$rounds
___
	# Last block: finish the MAC with a single-stream encryption of
	# $inout1.  The extra $in0 argument presumably makes the helper
	# xor the final output block in first (the inline xorps above is
	# commented out); confirm against aesni_generate1's definition.
	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
$code.=<<___;
	pxor	$rndkey0,$rndkey0		# clear register bank
	pxor	$rndkey1,$rndkey1
	pxor	$inout0,$inout0
	movups	$inout1,($cmac)			# store resulting mac
	pxor	$inout1,$inout1
	pxor	$in0,$in0
	pxor	$iv,$iv
___
# Win64 only: reload %xmm6-%xmm9, overwrite each spill slot with %xmm0
# and release the 0x58-byte area reserved in the prologue.
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	%xmm0,(%rsp)			# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	lea	0x58(%rsp),%rsp
.Lccm64_dec_ret:
___
$code.=<<___;
	ret
.size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
___
}
######################################################################
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
# Keywords are full unroll and modulo-schedule counter calculations
# with zero-round key xor.
{
# Extra input/scratch registers used by the 8x and 6x code paths.
my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
# $key0 caches the last 32-bit word of the round-0 key; $ctr holds the
# byte-swapped counter word (presumably the 32-bit name of the $ivp
# register - $ivp itself is defined outside this part of the file).
my ($key0,$ctr)=("%ebp","${ivp}d");
# 0x80 bytes for eight 16-byte counter blocks, plus 160 bytes on Win64
# for the ten saved registers %xmm6-%xmm15.
my $frame_size = 0x80 + ($win64?160:0);

$code.=<<___;
.globl	aesni_ctr32_encrypt_blocks
.type	aesni_ctr32_encrypt_blocks,\@function,5
.align	16
aesni_ctr32_encrypt_blocks:
.cfi_startproc
	cmp	\$1,$len
	jne	.Lctr32_bulk

	# handle single block without allocating stack frame,
	# useful when handling edges
	movups	($ivp),$inout0
	movups	($inp),$inout1
	mov	240($key),%edx			# key->rounds
___
	# Single-block path.  The helper is defined outside this part of
	# the file; the code after it xors the input block ($inout1) into
	# $inout0, so it presumably leaves E(ivec) in $inout0.
	&aesni_generate1("enc",$key,"%edx");
$code.=<<___;
	pxor	$rndkey0,$rndkey0		# clear register bank
	pxor	$rndkey1,$rndkey1
	xorps	$inout1,$inout0
	pxor	$inout1,$inout1
	movups	$inout0,($out)
	xorps	$inout0,$inout0
	jmp	.Lctr32_epilogue

.align	16
.Lctr32_bulk:
	lea	(%rsp),$key_			# use $key_ as frame pointer
.cfi_def_cfa_register	$key_
	push	%rbp
.cfi_push	%rbp
	sub	\$$frame_size,%rsp
	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
___
# Win64 only: save %xmm6-%xmm15 below the frame pointer.
$code.=<<___ if ($win64);
	movaps	%xmm6,-0xa8($key_)		# offload everything
	movaps	%xmm7,-0x98($key_)
	movaps	%xmm8,-0x88($key_)
	movaps	%xmm9,-0x78($key_)
	movaps	%xmm10,-0x68($key_)
	movaps	%xmm11,-0x58($key_)
	movaps	%xmm12,-0x48($key_)
	movaps	%xmm13,-0x38($key_)
	movaps	%xmm14,-0x28($key_)
	movaps	%xmm15,-0x18($key_)
.Lctr32_body:
___
# Bulk setup, then the 6x loop (taken when the capability test below
# sees MOVBE without XSAVE) and the first round of the 8x loop.
$code.=<<___;

	# 8 16-byte words on top of stack are counter values
	# xor-ed with zero-round key

	movdqu	($ivp),$inout0
	movdqu	($key),$rndkey0
	mov	12($ivp),$ctr			# counter LSB
	pxor	$rndkey0,$inout0
	mov	12($key),$key0			# 0-round key LSB
	movdqa	$inout0,0x00(%rsp)		# populate counter block
	bswap	$ctr
	movdqa	$inout0,$inout1
	movdqa	$inout0,$inout2
	movdqa	$inout0,$inout3
	movdqa	$inout0,0x40(%rsp)
	movdqa	$inout0,0x50(%rsp)
	movdqa	$inout0,0x60(%rsp)
	mov	%rdx,%r10			# about to borrow %rdx
	movdqa	$inout0,0x70(%rsp)

	lea	1($ctr),%rax
	lea	2($ctr),%rdx
	bswap	%eax
	bswap	%edx
	xor	$key0,%eax
	xor	$key0,%edx
	pinsrd	\$3,%eax,$inout1
	lea	3($ctr),%rax
	movdqa	$inout1,0x10(%rsp)
	pinsrd	\$3,%edx,$inout2
	bswap	%eax
	mov	%r10,%rdx			# restore %rdx
	lea	4($ctr),%r10
	movdqa	$inout2,0x20(%rsp)
	xor	$key0,%eax
	bswap	%r10d
	pinsrd	\$3,%eax,$inout3
	xor	$key0,%r10d
	movdqa	$inout3,0x30(%rsp)
	lea	5($ctr),%r9
	mov	%r10d,0x40+12(%rsp)
	bswap	%r9d
	lea	6($ctr),%r10
	mov	240($key),$rounds		# key->rounds
	xor	$key0,%r9d
	bswap	%r10d
	mov	%r9d,0x50+12(%rsp)
	xor	$key0,%r10d
	lea	7($ctr),%r9
	mov	%r10d,0x60+12(%rsp)
	bswap	%r9d
	mov	OPENSSL_ia32cap_P+4(%rip),%r10d
	xor	$key0,%r9d
	and	\$`1<<26|1<<22`,%r10d		# isolate XSAVE+MOVBE
	mov	%r9d,0x70+12(%rsp)

	$movkey	0x10($key),$rndkey1

	movdqa	0x40(%rsp),$inout4
	movdqa	0x50(%rsp),$inout5

	cmp	\$8,$len		# $len is in blocks
	jb	.Lctr32_tail		# short input if ($len<8)

	sub	\$6,$len		# $len is biased by -6
	cmp	\$`1<<22`,%r10d		# check for MOVBE without XSAVE
	je	.Lctr32_6x		# [which denotes Atom Silvermont]

	lea	0x80($key),$key		# size optimization
	sub	\$2,$len		# $len is biased by -8
	jmp	.Lctr32_loop8

.align	16
.Lctr32_6x:
	shl	\$4,$rounds
	mov	\$48,$rnds_
	bswap	$key0
	lea	32($key,$rounds),$key	# end of key schedule
	sub	%rax,%r10		# twisted $rounds
	jmp	.Lctr32_loop6

.align	16
.Lctr32_loop6:
	add	\$6,$ctr		# next counter value
	$movkey	-48($key,$rnds_),$rndkey0
	aesenc	$rndkey1,$inout0
	mov	$ctr,%eax
	xor	$key0,%eax
	aesenc	$rndkey1,$inout1
	movbe	%eax,`0x00+12`(%rsp)	# store next counter value
	lea	1($ctr),%eax
	aesenc	$rndkey1,$inout2
	xor	$key0,%eax
	movbe	%eax,`0x10+12`(%rsp)
	aesenc	$rndkey1,$inout3
	lea	2($ctr),%eax
	xor	$key0,%eax
	aesenc	$rndkey1,$inout4
	movbe	%eax,`0x20+12`(%rsp)
	lea	3($ctr),%eax
	aesenc	$rndkey1,$inout5
	$movkey	-32($key,$rnds_),$rndkey1
	xor	$key0,%eax

	aesenc	$rndkey0,$inout0
	movbe	%eax,`0x30+12`(%rsp)
	lea	4($ctr),%eax
	aesenc	$rndkey0,$inout1
	xor	$key0,%eax
	movbe	%eax,`0x40+12`(%rsp)
	aesenc	$rndkey0,$inout2
	lea	5($ctr),%eax
	xor	$key0,%eax
	aesenc	$rndkey0,$inout3
	movbe	%eax,`0x50+12`(%rsp)
	mov	%r10,%rax		# mov $rnds_,$rounds
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	$movkey	-16($key,$rnds_),$rndkey0

	call	.Lenc_loop6

	movdqu	($inp),$inout6		# load 6 input blocks
	movdqu	0x10($inp),$inout7
	movdqu	0x20($inp),$in0
	movdqu	0x30($inp),$in1
	movdqu	0x40($inp),$in2
	movdqu	0x50($inp),$in3
	lea	0x60($inp),$inp		# $inp+=6*16
	$movkey	-64($key,$rnds_),$rndkey1
	pxor	$inout0,$inout6		# inp^=E(ctr)
	movaps	0x00(%rsp),$inout0	# load next counter [xor-ed with 0 round]
	pxor	$inout1,$inout7
	movaps	0x10(%rsp),$inout1
	pxor	$inout2,$in0
	movaps	0x20(%rsp),$inout2
	pxor	$inout3,$in1
	movaps	0x30(%rsp),$inout3
	pxor	$inout4,$in2
	movaps	0x40(%rsp),$inout4
	pxor	$inout5,$in3
	movaps	0x50(%rsp),$inout5
	movdqu	$inout6,($out)		# store 6 output blocks
	movdqu	$inout7,0x10($out)
	movdqu	$in0,0x20($out)
	movdqu	$in1,0x30($out)
	movdqu	$in2,0x40($out)
	movdqu	$in3,0x50($out)
	lea	0x60($out),$out		# $out+=6*16

	sub	\$6,$len
	jnc	.Lctr32_loop6		# loop if $len-=6 didn't borrow

	add	\$6,$len		# restore real remaining $len
	jz	.Lctr32_done		# done if ($len==0)

	lea	-48($rnds_),$rounds
	lea	-80($key,$rnds_),$key	# restore $key
	neg	$rounds
	shr	\$4,$rounds		# restore $rounds
	jmp	.Lctr32_tail

.align	32
.Lctr32_loop8:
	add	\$8,$ctr		# next counter value
	movdqa	0x60(%rsp),$inout6
	aesenc	$rndkey1,$inout0
	mov	$ctr,%r9d
	movdqa	0x70(%rsp),$inout7
	aesenc	$rndkey1,$inout1
	bswap	%r9d
	$movkey	0x20-0x80($key),$rndkey0
	aesenc	$rndkey1,$inout2
	xor	$key0,%r9d
	nop
	aesenc	$rndkey1,$inout3
	mov	%r9d,0x00+12(%rsp)	# store next counter value
	lea	1($ctr),%r9
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	$movkey	0x30-0x80($key),$rndkey1
___
# Six more AES rounds, one per iteration.  Even $i applies $rndkey0 and
# odd $i applies $rndkey1, so the two key registers alternate; each
# iteration also byte-swaps the counter prepared by the previous "lea",
# xors it with $key0 and writes it to stack slot $i-1, then loads the
# next round key into the register it has just finished using.
for($i=2;$i<8;$i++) {
my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
$code.=<<___;
	bswap	%r9d
	aesenc	$rndkeyx,$inout0
	aesenc	$rndkeyx,$inout1
	xor	$key0,%r9d
	.byte	0x66,0x90
	aesenc	$rndkeyx,$inout2
	aesenc	$rndkeyx,$inout3
	mov	%r9d,`0x10*($i-1)`+12(%rsp)
	lea	$i($ctr),%r9
	aesenc	$rndkeyx,$inout4
	aesenc	$rndkeyx,$inout5
	aesenc	$rndkeyx,$inout6
	aesenc	$rndkeyx,$inout7
	$movkey	`0x20+0x10*$i`-0x80($key),$rndkeyx
___
}
# Remaining rounds of the 8x loop (the "cmp 11,rounds" result selects
# how many extra round pairs run), the final round fused with the input
# xor, and the tail code for fewer than eight remaining blocks.
$code.=<<___;
	bswap	%r9d
	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	xor	$key0,%r9d
	movdqu	0x00($inp),$in0		# start loading input
	aesenc	$rndkey0,$inout3
	mov	%r9d,0x70+12(%rsp)
	cmp	\$11,$rounds
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	aesenc	$rndkey0,$inout6
	aesenc	$rndkey0,$inout7
	$movkey	0xa0-0x80($key),$rndkey0

	jb	.Lctr32_enc_done

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	$movkey	0xb0-0x80($key),$rndkey1

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	aesenc	$rndkey0,$inout6
	aesenc	$rndkey0,$inout7
	$movkey	0xc0-0x80($key),$rndkey0
	je	.Lctr32_enc_done

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	$movkey	0xd0-0x80($key),$rndkey1

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	aesenc	$rndkey0,$inout6
	aesenc	$rndkey0,$inout7
	$movkey	0xe0-0x80($key),$rndkey0
	jmp	.Lctr32_enc_done

.align	16
.Lctr32_enc_done:
	movdqu	0x10($inp),$in1
	pxor	$rndkey0,$in0		# input^=round[last]
	movdqu	0x20($inp),$in2
	pxor	$rndkey0,$in1
	movdqu	0x30($inp),$in3
	pxor	$rndkey0,$in2
	movdqu	0x40($inp),$in4
	pxor	$rndkey0,$in3
	movdqu	0x50($inp),$in5
	pxor	$rndkey0,$in4
	pxor	$rndkey0,$in5
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	movdqu	0x60($inp),$rndkey1	# borrow $rndkey1 for inp[6]
	lea	0x80($inp),$inp		# $inp+=8*16

	aesenclast	$in0,$inout0		# $inN is inp[N]^round[last]
	pxor	$rndkey0,$rndkey1	# borrowed $rndkey
	movdqu	0x70-0x80($inp),$in0
	aesenclast	$in1,$inout1
	pxor	$rndkey0,$in0
	movdqa	0x00(%rsp),$in1		# load next counter block
	aesenclast	$in2,$inout2
	aesenclast	$in3,$inout3
	movdqa	0x10(%rsp),$in2
	movdqa	0x20(%rsp),$in3
	aesenclast	$in4,$inout4
	aesenclast	$in5,$inout5
	movdqa	0x30(%rsp),$in4
	movdqa	0x40(%rsp),$in5
	aesenclast	$rndkey1,$inout6
	movdqa	0x50(%rsp),$rndkey0
	$movkey	0x10-0x80($key),$rndkey1#real 1st-round key
	aesenclast	$in0,$inout7

	movups	$inout0,($out)		# store 8 output blocks
	movdqa	$in1,$inout0
	movups	$inout1,0x10($out)
	movdqa	$in2,$inout1
	movups	$inout2,0x20($out)
	movdqa	$in3,$inout2
	movups	$inout3,0x30($out)
	movdqa	$in4,$inout3
	movups	$inout4,0x40($out)
	movdqa	$in5,$inout4
	movups	$inout5,0x50($out)
	movdqa	$rndkey0,$inout5
	movups	$inout6,0x60($out)
	movups	$inout7,0x70($out)
	lea	0x80($out),$out		# $out+=8*16

	sub	\$8,$len
	jnc	.Lctr32_loop8		# loop if $len-=8 didn't borrow

	add	\$8,$len		# restore real remaining $len
	jz	.Lctr32_done		# done if ($len==0)
	lea	-0x80($key),$key

.Lctr32_tail:
	# note that at this point $inout0..5 are populated with
	# counter values xor-ed with 0-round key
	lea	16($key),$key
	cmp	\$4,$len
	jb	.Lctr32_loop3
	je	.Lctr32_loop4

	# if ($len>4) compute 7 E(counter)
	shl	\$4,$rounds
	movdqa	0x60(%rsp),$inout6
	pxor	$inout7,$inout7

	$movkey	16($key),$rndkey0
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	lea	32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
	neg	%rax
	aesenc	$rndkey1,$inout2
	add	\$16,%rax		# prepare for .Lenc_loop8_enter
	movups	($inp),$in0
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	movups	0x10($inp),$in1		# pre-load input
	movups	0x20($inp),$in2
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6

	call	.Lenc_loop8_enter

	movdqu	0x30($inp),$in3
	pxor	$in0,$inout0
	movdqu	0x40($inp),$in0
	pxor	$in1,$inout1
	movdqu	$inout0,($out)		# store output
	pxor	$in2,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$in3,$inout3
	movdqu	$inout2,0x20($out)
	pxor	$in0,$inout4
	movdqu	$inout3,0x30($out)
	movdqu	$inout4,0x40($out)
	cmp	\$6,$len
	jb	.Lctr32_done		# $len was 5, stop store

	movups	0x50($inp),$in1
	xorps	$in1,$inout5
	movups	$inout5,0x50($out)
	je	.Lctr32_done		# $len was 6, stop store

	movups	0x60($inp),$in2
	xorps	$in2,$inout6
	movups	$inout6,0x60($out)
	jmp	.Lctr32_done		# $len was 7, stop store

.align	32
.Lctr32_loop4:
	aesenc	$rndkey1,$inout0
	lea	16($key),$key
	dec	$rounds
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	$movkey	($key),$rndkey1
	jnz	.Lctr32_loop4
	aesenclast	$rndkey1,$inout0
	aesenclast	$rndkey1,$inout1
	movups	($inp),$in0		# load input
	movups	0x10($inp),$in1
	aesenclast	$rndkey1,$inout2
	aesenclast	$rndkey1,$inout3
	movups	0x20($inp),$in2
	movups	0x30($inp),$in3

	xorps	$in0,$inout0
	movups	$inout0,($out)		# store output
	xorps	$in1,$inout1
	movups	$inout1,0x10($out)
	pxor	$in2,$inout2
	movdqu	$inout2,0x20($out)
	pxor	$in3,$inout3
	movdqu	$inout3,0x30($out)
	jmp	.Lctr32_done		# $len was 4, stop store

.align	32
.Lctr32_loop3:
	aesenc	$rndkey1,$inout0
	lea	16($key),$key
	dec	$rounds
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	$movkey	($key),$rndkey1
	jnz	.Lctr32_loop3
	aesenclast	$rndkey1,$inout0
	aesenclast	$rndkey1,$inout1
	aesenclast	$rndkey1,$inout2

	movups	($inp),$in0		# load input
	xorps	$in0,$inout0
	movups	$inout0,($out)		# store output
	cmp	\$2,$len
	jb	.Lctr32_done		# $len was 1, stop store

	movups	0x10($inp),$in1
	xorps	$in1,$inout1
	movups	$inout1,0x10($out)
	je	.Lctr32_done		# $len was 2, stop store

	movups	0x20($inp),$in2
	xorps	$in2,$inout2
	movups	$inout2,0x20($out)	# $len was 3, stop store

.Lctr32_done:
	xorps	%xmm0,%xmm0		# clear register bank
	xor	$key0,$key0
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
# Non-Win64: zero %xmm6-%xmm15 and the eight counter slots on the stack.
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	movaps	%xmm0,0x00(%rsp)		# clear stack
	pxor	%xmm8,%xmm8
	movaps	%xmm0,0x10(%rsp)
	pxor	%xmm9,%xmm9
	movaps	%xmm0,0x20(%rsp)
	pxor	%xmm10,%xmm10
	movaps	%xmm0,0x30(%rsp)
	pxor	%xmm11,%xmm11
	movaps	%xmm0,0x40(%rsp)
	pxor	%xmm12,%xmm12
	movaps	%xmm0,0x50(%rsp)
	pxor	%xmm13,%xmm13
	movaps	%xmm0,0x60(%rsp)
	pxor	%xmm14,%xmm14
	movaps	%xmm0,0x70(%rsp)
	pxor	%xmm15,%xmm15
___
# Win64: reload %xmm6-%xmm15 from the save area, overwriting each save
# slot and each counter slot with the zeroed %xmm0.
$code.=<<___ if ($win64);
	movaps	-0xa8($key_),%xmm6
	movaps	%xmm0,-0xa8($key_)		# clear stack
	movaps	-0x98($key_),%xmm7
	movaps	%xmm0,-0x98($key_)
	movaps	-0x88($key_),%xmm8
	movaps	%xmm0,-0x88($key_)
	movaps	-0x78($key_),%xmm9
	movaps	%xmm0,-0x78($key_)
	movaps	-0x68($key_),%xmm10
	movaps	%xmm0,-0x68($key_)
	movaps	-0x58($key_),%xmm11
	movaps	%xmm0,-0x58($key_)
	movaps	-0x48($key_),%xmm12
	movaps	%xmm0,-0x48($key_)
	movaps	-0x38($key_),%xmm13
	movaps	%xmm0,-0x38($key_)
	movaps	-0x28($key_),%xmm14
	movaps	%xmm0,-0x28($key_)
	movaps	-0x18($key_),%xmm15
	movaps	%xmm0,-0x18($key_)
	movaps	%xmm0,0x00(%rsp)
	movaps	%xmm0,0x10(%rsp)
	movaps	%xmm0,0x20(%rsp)
	movaps	%xmm0,0x30(%rsp)
	movaps	%xmm0,0x40(%rsp)
	movaps	%xmm0,0x50(%rsp)
	movaps	%xmm0,0x60(%rsp)
	movaps	%xmm0,0x70(%rsp)
___
# Common exit: reload %rbp from just below the frame pointer and put the
# frame pointer back into %rsp.  The single-block path joins at
# .Lctr32_epilogue, after the frame teardown it never needed.
$code.=<<___;
	mov	-8($key_),%rbp
.cfi_restore	%rbp
	lea	($key_),%rsp
.cfi_def_cfa_register	%rsp
.Lctr32_epilogue:
	ret
.cfi_endproc
.size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
___
}

######################################################################
# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
{
my @tweak=map("%xmm$_",(10..15));
my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
my $frame_size = 0x70 + ($win64?160:0);
my $key_ =
"%rbp"; # override so that we can use %r11 as FP 1751 1752$code.=<<___; 1753.globl aesni_xts_encrypt 1754.type aesni_xts_encrypt,\@function,6 1755.align 16 1756aesni_xts_encrypt: 1757.cfi_startproc 1758 lea (%rsp),%r11 # frame pointer 1759.cfi_def_cfa_register %r11 1760 push %rbp 1761.cfi_push %rbp 1762 sub \$$frame_size,%rsp 1763 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 1764___ 1765$code.=<<___ if ($win64); 1766 movaps %xmm6,-0xa8(%r11) # offload everything 1767 movaps %xmm7,-0x98(%r11) 1768 movaps %xmm8,-0x88(%r11) 1769 movaps %xmm9,-0x78(%r11) 1770 movaps %xmm10,-0x68(%r11) 1771 movaps %xmm11,-0x58(%r11) 1772 movaps %xmm12,-0x48(%r11) 1773 movaps %xmm13,-0x38(%r11) 1774 movaps %xmm14,-0x28(%r11) 1775 movaps %xmm15,-0x18(%r11) 1776.Lxts_enc_body: 1777___ 1778$code.=<<___; 1779 movups ($ivp),$inout0 # load clear-text tweak 1780 mov 240(%r8),$rounds # key2->rounds 1781 mov 240($key),$rnds_ # key1->rounds 1782___ 1783 # generate the tweak 1784 &aesni_generate1("enc",$key2,$rounds,$inout0); 1785$code.=<<___; 1786 $movkey ($key),$rndkey0 # zero round key 1787 mov $key,$key_ # backup $key 1788 mov $rnds_,$rounds # backup $rounds 1789 shl \$4,$rnds_ 1790 mov $len,$len_ # backup $len 1791 and \$-16,$len 1792 1793 $movkey 16($key,$rnds_),$rndkey1 # last round key 1794 1795 movdqa .Lxts_magic(%rip),$twmask 1796 movdqa $inout0,@tweak[5] 1797 pshufd \$0x5f,$inout0,$twres 1798 pxor $rndkey0,$rndkey1 1799___ 1800 # alternative tweak calculation algorithm is based on suggestions 1801 # by Shay Gueron. psrad doesn't conflict with AES-NI instructions 1802 # and should help in the future... 
1803 for ($i=0;$i<4;$i++) { 1804 $code.=<<___; 1805 movdqa $twres,$twtmp 1806 paddd $twres,$twres 1807 movdqa @tweak[5],@tweak[$i] 1808 psrad \$31,$twtmp # broadcast upper bits 1809 paddq @tweak[5],@tweak[5] 1810 pand $twmask,$twtmp 1811 pxor $rndkey0,@tweak[$i] 1812 pxor $twtmp,@tweak[5] 1813___ 1814 } 1815$code.=<<___; 1816 movdqa @tweak[5],@tweak[4] 1817 psrad \$31,$twres 1818 paddq @tweak[5],@tweak[5] 1819 pand $twmask,$twres 1820 pxor $rndkey0,@tweak[4] 1821 pxor $twres,@tweak[5] 1822 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] 1823 1824 sub \$16*6,$len 1825 jc .Lxts_enc_short # if $len-=6*16 borrowed 1826 1827 mov \$16+96,$rounds 1828 lea 32($key_,$rnds_),$key # end of key schedule 1829 sub %r10,%rax # twisted $rounds 1830 $movkey 16($key_),$rndkey1 1831 mov %rax,%r10 # backup twisted $rounds 1832 lea .Lxts_magic(%rip),%r8 1833 jmp .Lxts_enc_grandloop 1834 1835.align 32 1836.Lxts_enc_grandloop: 1837 movdqu `16*0`($inp),$inout0 # load input 1838 movdqa $rndkey0,$twmask 1839 movdqu `16*1`($inp),$inout1 1840 pxor @tweak[0],$inout0 # input^=tweak^round[0] 1841 movdqu `16*2`($inp),$inout2 1842 pxor @tweak[1],$inout1 1843 aesenc $rndkey1,$inout0 1844 movdqu `16*3`($inp),$inout3 1845 pxor @tweak[2],$inout2 1846 aesenc $rndkey1,$inout1 1847 movdqu `16*4`($inp),$inout4 1848 pxor @tweak[3],$inout3 1849 aesenc $rndkey1,$inout2 1850 movdqu `16*5`($inp),$inout5 1851 pxor @tweak[5],$twmask # round[0]^=tweak[5] 1852 movdqa 0x60(%rsp),$twres # load round[0]^round[last] 1853 pxor @tweak[4],$inout4 1854 aesenc $rndkey1,$inout3 1855 $movkey 32($key_),$rndkey0 1856 lea `16*6`($inp),$inp 1857 pxor $twmask,$inout5 1858 1859 pxor $twres,@tweak[0] # calculate tweaks^round[last] 1860 aesenc $rndkey1,$inout4 1861 pxor $twres,@tweak[1] 1862 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last] 1863 aesenc $rndkey1,$inout5 1864 $movkey 48($key_),$rndkey1 1865 pxor $twres,@tweak[2] 1866 1867 aesenc $rndkey0,$inout0 1868 pxor $twres,@tweak[3] 1869 movdqa 
@tweak[1],`16*1`(%rsp) 1870 aesenc $rndkey0,$inout1 1871 pxor $twres,@tweak[4] 1872 movdqa @tweak[2],`16*2`(%rsp) 1873 aesenc $rndkey0,$inout2 1874 aesenc $rndkey0,$inout3 1875 pxor $twres,$twmask 1876 movdqa @tweak[4],`16*4`(%rsp) 1877 aesenc $rndkey0,$inout4 1878 aesenc $rndkey0,$inout5 1879 $movkey 64($key_),$rndkey0 1880 movdqa $twmask,`16*5`(%rsp) 1881 pshufd \$0x5f,@tweak[5],$twres 1882 jmp .Lxts_enc_loop6 1883.align 32 1884.Lxts_enc_loop6: 1885 aesenc $rndkey1,$inout0 1886 aesenc $rndkey1,$inout1 1887 aesenc $rndkey1,$inout2 1888 aesenc $rndkey1,$inout3 1889 aesenc $rndkey1,$inout4 1890 aesenc $rndkey1,$inout5 1891 $movkey -64($key,%rax),$rndkey1 1892 add \$32,%rax 1893 1894 aesenc $rndkey0,$inout0 1895 aesenc $rndkey0,$inout1 1896 aesenc $rndkey0,$inout2 1897 aesenc $rndkey0,$inout3 1898 aesenc $rndkey0,$inout4 1899 aesenc $rndkey0,$inout5 1900 $movkey -80($key,%rax),$rndkey0 1901 jnz .Lxts_enc_loop6 1902 1903 movdqa (%r8),$twmask # start calculating next tweak 1904 movdqa $twres,$twtmp 1905 paddd $twres,$twres 1906 aesenc $rndkey1,$inout0 1907 paddq @tweak[5],@tweak[5] 1908 psrad \$31,$twtmp 1909 aesenc $rndkey1,$inout1 1910 pand $twmask,$twtmp 1911 $movkey ($key_),@tweak[0] # load round[0] 1912 aesenc $rndkey1,$inout2 1913 aesenc $rndkey1,$inout3 1914 aesenc $rndkey1,$inout4 1915 pxor $twtmp,@tweak[5] 1916 movaps @tweak[0],@tweak[1] # copy round[0] 1917 aesenc $rndkey1,$inout5 1918 $movkey -64($key),$rndkey1 1919 1920 movdqa $twres,$twtmp 1921 aesenc $rndkey0,$inout0 1922 paddd $twres,$twres 1923 pxor @tweak[5],@tweak[0] 1924 aesenc $rndkey0,$inout1 1925 psrad \$31,$twtmp 1926 paddq @tweak[5],@tweak[5] 1927 aesenc $rndkey0,$inout2 1928 aesenc $rndkey0,$inout3 1929 pand $twmask,$twtmp 1930 movaps @tweak[1],@tweak[2] 1931 aesenc $rndkey0,$inout4 1932 pxor $twtmp,@tweak[5] 1933 movdqa $twres,$twtmp 1934 aesenc $rndkey0,$inout5 1935 $movkey -48($key),$rndkey0 1936 1937 paddd $twres,$twres 1938 aesenc $rndkey1,$inout0 1939 pxor @tweak[5],@tweak[1] 1940 psrad 
\$31,$twtmp 1941 aesenc $rndkey1,$inout1 1942 paddq @tweak[5],@tweak[5] 1943 pand $twmask,$twtmp 1944 aesenc $rndkey1,$inout2 1945 aesenc $rndkey1,$inout3 1946 movdqa @tweak[3],`16*3`(%rsp) 1947 pxor $twtmp,@tweak[5] 1948 aesenc $rndkey1,$inout4 1949 movaps @tweak[2],@tweak[3] 1950 movdqa $twres,$twtmp 1951 aesenc $rndkey1,$inout5 1952 $movkey -32($key),$rndkey1 1953 1954 paddd $twres,$twres 1955 aesenc $rndkey0,$inout0 1956 pxor @tweak[5],@tweak[2] 1957 psrad \$31,$twtmp 1958 aesenc $rndkey0,$inout1 1959 paddq @tweak[5],@tweak[5] 1960 pand $twmask,$twtmp 1961 aesenc $rndkey0,$inout2 1962 aesenc $rndkey0,$inout3 1963 aesenc $rndkey0,$inout4 1964 pxor $twtmp,@tweak[5] 1965 movaps @tweak[3],@tweak[4] 1966 aesenc $rndkey0,$inout5 1967 1968 movdqa $twres,$rndkey0 1969 paddd $twres,$twres 1970 aesenc $rndkey1,$inout0 1971 pxor @tweak[5],@tweak[3] 1972 psrad \$31,$rndkey0 1973 aesenc $rndkey1,$inout1 1974 paddq @tweak[5],@tweak[5] 1975 pand $twmask,$rndkey0 1976 aesenc $rndkey1,$inout2 1977 aesenc $rndkey1,$inout3 1978 pxor $rndkey0,@tweak[5] 1979 $movkey ($key_),$rndkey0 1980 aesenc $rndkey1,$inout4 1981 aesenc $rndkey1,$inout5 1982 $movkey 16($key_),$rndkey1 1983 1984 pxor @tweak[5],@tweak[4] 1985 aesenclast `16*0`(%rsp),$inout0 1986 psrad \$31,$twres 1987 paddq @tweak[5],@tweak[5] 1988 aesenclast `16*1`(%rsp),$inout1 1989 aesenclast `16*2`(%rsp),$inout2 1990 pand $twmask,$twres 1991 mov %r10,%rax # restore $rounds 1992 aesenclast `16*3`(%rsp),$inout3 1993 aesenclast `16*4`(%rsp),$inout4 1994 aesenclast `16*5`(%rsp),$inout5 1995 pxor $twres,@tweak[5] 1996 1997 lea `16*6`($out),$out # $out+=6*16 1998 movups $inout0,`-16*6`($out) # store 6 output blocks 1999 movups $inout1,`-16*5`($out) 2000 movups $inout2,`-16*4`($out) 2001 movups $inout3,`-16*3`($out) 2002 movups $inout4,`-16*2`($out) 2003 movups $inout5,`-16*1`($out) 2004 sub \$16*6,$len 2005 jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow 2006 2007 mov \$16+96,$rounds 2008 sub $rnds_,$rounds 2009 mov 
$key_,$key # restore $key 2010 shr \$4,$rounds # restore original value 2011 2012.Lxts_enc_short: 2013 # at the point @tweak[0..5] are populated with tweak values 2014 mov $rounds,$rnds_ # backup $rounds 2015 pxor $rndkey0,@tweak[0] 2016 add \$16*6,$len # restore real remaining $len 2017 jz .Lxts_enc_done # done if ($len==0) 2018 2019 pxor $rndkey0,@tweak[1] 2020 cmp \$0x20,$len 2021 jb .Lxts_enc_one # $len is 1*16 2022 pxor $rndkey0,@tweak[2] 2023 je .Lxts_enc_two # $len is 2*16 2024 2025 pxor $rndkey0,@tweak[3] 2026 cmp \$0x40,$len 2027 jb .Lxts_enc_three # $len is 3*16 2028 pxor $rndkey0,@tweak[4] 2029 je .Lxts_enc_four # $len is 4*16 2030 2031 movdqu ($inp),$inout0 # $len is 5*16 2032 movdqu 16*1($inp),$inout1 2033 movdqu 16*2($inp),$inout2 2034 pxor @tweak[0],$inout0 2035 movdqu 16*3($inp),$inout3 2036 pxor @tweak[1],$inout1 2037 movdqu 16*4($inp),$inout4 2038 lea 16*5($inp),$inp # $inp+=5*16 2039 pxor @tweak[2],$inout2 2040 pxor @tweak[3],$inout3 2041 pxor @tweak[4],$inout4 2042 pxor $inout5,$inout5 2043 2044 call _aesni_encrypt6 2045 2046 xorps @tweak[0],$inout0 2047 movdqa @tweak[5],@tweak[0] 2048 xorps @tweak[1],$inout1 2049 xorps @tweak[2],$inout2 2050 movdqu $inout0,($out) # store 5 output blocks 2051 xorps @tweak[3],$inout3 2052 movdqu $inout1,16*1($out) 2053 xorps @tweak[4],$inout4 2054 movdqu $inout2,16*2($out) 2055 movdqu $inout3,16*3($out) 2056 movdqu $inout4,16*4($out) 2057 lea 16*5($out),$out # $out+=5*16 2058 jmp .Lxts_enc_done 2059 2060.align 16 2061.Lxts_enc_one: 2062 movups ($inp),$inout0 2063 lea 16*1($inp),$inp # inp+=1*16 2064 xorps @tweak[0],$inout0 2065___ 2066 &aesni_generate1("enc",$key,$rounds); 2067$code.=<<___; 2068 xorps @tweak[0],$inout0 2069 movdqa @tweak[1],@tweak[0] 2070 movups $inout0,($out) # store one output block 2071 lea 16*1($out),$out # $out+=1*16 2072 jmp .Lxts_enc_done 2073 2074.align 16 2075.Lxts_enc_two: 2076 movups ($inp),$inout0 2077 movups 16($inp),$inout1 2078 lea 32($inp),$inp # $inp+=2*16 2079 xorps 
@tweak[0],$inout0 2080 xorps @tweak[1],$inout1 2081 2082 call _aesni_encrypt2 2083 2084 xorps @tweak[0],$inout0 2085 movdqa @tweak[2],@tweak[0] 2086 xorps @tweak[1],$inout1 2087 movups $inout0,($out) # store 2 output blocks 2088 movups $inout1,16*1($out) 2089 lea 16*2($out),$out # $out+=2*16 2090 jmp .Lxts_enc_done 2091 2092.align 16 2093.Lxts_enc_three: 2094 movups ($inp),$inout0 2095 movups 16*1($inp),$inout1 2096 movups 16*2($inp),$inout2 2097 lea 16*3($inp),$inp # $inp+=3*16 2098 xorps @tweak[0],$inout0 2099 xorps @tweak[1],$inout1 2100 xorps @tweak[2],$inout2 2101 2102 call _aesni_encrypt3 2103 2104 xorps @tweak[0],$inout0 2105 movdqa @tweak[3],@tweak[0] 2106 xorps @tweak[1],$inout1 2107 xorps @tweak[2],$inout2 2108 movups $inout0,($out) # store 3 output blocks 2109 movups $inout1,16*1($out) 2110 movups $inout2,16*2($out) 2111 lea 16*3($out),$out # $out+=3*16 2112 jmp .Lxts_enc_done 2113 2114.align 16 2115.Lxts_enc_four: 2116 movups ($inp),$inout0 2117 movups 16*1($inp),$inout1 2118 movups 16*2($inp),$inout2 2119 xorps @tweak[0],$inout0 2120 movups 16*3($inp),$inout3 2121 lea 16*4($inp),$inp # $inp+=4*16 2122 xorps @tweak[1],$inout1 2123 xorps @tweak[2],$inout2 2124 xorps @tweak[3],$inout3 2125 2126 call _aesni_encrypt4 2127 2128 pxor @tweak[0],$inout0 2129 movdqa @tweak[4],@tweak[0] 2130 pxor @tweak[1],$inout1 2131 pxor @tweak[2],$inout2 2132 movdqu $inout0,($out) # store 4 output blocks 2133 pxor @tweak[3],$inout3 2134 movdqu $inout1,16*1($out) 2135 movdqu $inout2,16*2($out) 2136 movdqu $inout3,16*3($out) 2137 lea 16*4($out),$out # $out+=4*16 2138 jmp .Lxts_enc_done 2139 2140.align 16 2141.Lxts_enc_done: 2142 and \$15,$len_ # see if $len%16 is 0 2143 jz .Lxts_enc_ret 2144 mov $len_,$len 2145 2146.Lxts_enc_steal: 2147 movzb ($inp),%eax # borrow $rounds ... 2148 movzb -16($out),%ecx # ... 
and $key 2149 lea 1($inp),$inp 2150 mov %al,-16($out) 2151 mov %cl,0($out) 2152 lea 1($out),$out 2153 sub \$1,$len 2154 jnz .Lxts_enc_steal 2155 2156 sub $len_,$out # rewind $out 2157 mov $key_,$key # restore $key 2158 mov $rnds_,$rounds # restore $rounds 2159 2160 movups -16($out),$inout0 2161 xorps @tweak[0],$inout0 2162___ 2163 &aesni_generate1("enc",$key,$rounds); 2164$code.=<<___; 2165 xorps @tweak[0],$inout0 2166 movups $inout0,-16($out) 2167 2168.Lxts_enc_ret: 2169 xorps %xmm0,%xmm0 # clear register bank 2170 pxor %xmm1,%xmm1 2171 pxor %xmm2,%xmm2 2172 pxor %xmm3,%xmm3 2173 pxor %xmm4,%xmm4 2174 pxor %xmm5,%xmm5 2175___ 2176$code.=<<___ if (!$win64); 2177 pxor %xmm6,%xmm6 2178 pxor %xmm7,%xmm7 2179 movaps %xmm0,0x00(%rsp) # clear stack 2180 pxor %xmm8,%xmm8 2181 movaps %xmm0,0x10(%rsp) 2182 pxor %xmm9,%xmm9 2183 movaps %xmm0,0x20(%rsp) 2184 pxor %xmm10,%xmm10 2185 movaps %xmm0,0x30(%rsp) 2186 pxor %xmm11,%xmm11 2187 movaps %xmm0,0x40(%rsp) 2188 pxor %xmm12,%xmm12 2189 movaps %xmm0,0x50(%rsp) 2190 pxor %xmm13,%xmm13 2191 movaps %xmm0,0x60(%rsp) 2192 pxor %xmm14,%xmm14 2193 pxor %xmm15,%xmm15 2194___ 2195$code.=<<___ if ($win64); 2196 movaps -0xa8(%r11),%xmm6 2197 movaps %xmm0,-0xa8(%r11) # clear stack 2198 movaps -0x98(%r11),%xmm7 2199 movaps %xmm0,-0x98(%r11) 2200 movaps -0x88(%r11),%xmm8 2201 movaps %xmm0,-0x88(%r11) 2202 movaps -0x78(%r11),%xmm9 2203 movaps %xmm0,-0x78(%r11) 2204 movaps -0x68(%r11),%xmm10 2205 movaps %xmm0,-0x68(%r11) 2206 movaps -0x58(%r11),%xmm11 2207 movaps %xmm0,-0x58(%r11) 2208 movaps -0x48(%r11),%xmm12 2209 movaps %xmm0,-0x48(%r11) 2210 movaps -0x38(%r11),%xmm13 2211 movaps %xmm0,-0x38(%r11) 2212 movaps -0x28(%r11),%xmm14 2213 movaps %xmm0,-0x28(%r11) 2214 movaps -0x18(%r11),%xmm15 2215 movaps %xmm0,-0x18(%r11) 2216 movaps %xmm0,0x00(%rsp) 2217 movaps %xmm0,0x10(%rsp) 2218 movaps %xmm0,0x20(%rsp) 2219 movaps %xmm0,0x30(%rsp) 2220 movaps %xmm0,0x40(%rsp) 2221 movaps %xmm0,0x50(%rsp) 2222 movaps %xmm0,0x60(%rsp) 2223___ 
# Tail of aesni_xts_encrypt: reload %rbp from the slot just below the
# frame pointer kept in %r11, switch back to the caller's stack pointer
# and return.
$code.=<<___;
	mov	-8(%r11),%rbp
.cfi_restore	%rbp
	lea	(%r11),%rsp
.cfi_def_cfa_register	%rsp
.Lxts_enc_epilogue:
	ret
.cfi_endproc
.size	aesni_xts_encrypt,.-aesni_xts_encrypt
___

# aesni_xts_decrypt: XTS-mode decryption entry point (declared below as a
# six-argument function).  Outline of the emitted code:
#   - the clear-text tweak is loaded from ($ivp) and encrypted with $key2;
#     six consecutive tweak values are then derived from it;
#   - if $len is not a multiple of 16, one full block is held back
#     ("if ($len%16) len-=16") for the ciphertext-stealing tail;
#   - groups of six 16-byte blocks go through .Lxts_dec_grandloop, where
#     the next six tweaks are computed in between the aesdec rounds;
#   - one to five leftover blocks are dispatched from .Lxts_dec_short;
#   - a trailing partial block is handled at .Lxts_dec_done2 and
#     .Lxts_dec_steal;
#   - before returning, the xmm registers and the stack scratch area are
#     overwritten with zeros.
$code.=<<___;
.globl	aesni_xts_decrypt
.type	aesni_xts_decrypt,\@function,6
.align	16
aesni_xts_decrypt:
.cfi_startproc
	lea	(%rsp),%r11			# frame pointer
.cfi_def_cfa_register	%r11
	push	%rbp
.cfi_push	%rbp
	sub	\$$frame_size,%rsp
	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
___
# Win64 only: stash %xmm6-%xmm15 below the frame pointer; they are
# reloaded (and the slots zeroed) right before the epilogue.
$code.=<<___ if ($win64);
	movaps	%xmm6,-0xa8(%r11)		# offload everything
	movaps	%xmm7,-0x98(%r11)
	movaps	%xmm8,-0x88(%r11)
	movaps	%xmm9,-0x78(%r11)
	movaps	%xmm10,-0x68(%r11)
	movaps	%xmm11,-0x58(%r11)
	movaps	%xmm12,-0x48(%r11)
	movaps	%xmm13,-0x38(%r11)
	movaps	%xmm14,-0x28(%r11)
	movaps	%xmm15,-0x18(%r11)
.Lxts_dec_body:
___
$code.=<<___;
	movups	($ivp),$inout0			# load clear-text tweak
	mov	240($key2),$rounds		# key2->rounds
	mov	240($key),$rnds_		# key1->rounds
___
	# generate the tweak
	&aesni_generate1("enc",$key2,$rounds,$inout0);
$code.=<<___;
	xor	%eax,%eax			# if ($len%16) len-=16;
	test	\$15,$len
	setnz	%al
	shl	\$4,%rax
	sub	%rax,$len

	$movkey	($key),$rndkey0			# zero round key
	mov	$key,$key_			# backup $key
	mov	$rnds_,$rounds			# backup $rounds
	shl	\$4,$rnds_
	mov	$len,$len_			# backup $len
	and	\$-16,$len

	$movkey	16($key,$rnds_),$rndkey1	# last round key

	movdqa	.Lxts_magic(%rip),$twmask
	movdqa	$inout0,@tweak[5]
	pshufd	\$0x5f,$inout0,$twres
	pxor	$rndkey0,$rndkey1
___
    # Emit the derivation of @tweak[0..3]: each pass copies the running
    # tweak, pre-xors it with round[0] and advances the running tweak kept
    # in @tweak[5].
    for ($i=0;$i<4;$i++) {
    $code.=<<___;
	movdqa	$twres,$twtmp
	paddd	$twres,$twres
	movdqa	@tweak[5],@tweak[$i]
	psrad	\$31,$twtmp			# broadcast upper bits
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	pxor	$rndkey0,@tweak[$i]
	pxor	$twtmp,@tweak[5]
___
    }
$code.=<<___;
	movdqa	@tweak[5],@tweak[4]
	psrad	\$31,$twres
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twres
	pxor	$rndkey0,@tweak[4]
	pxor	$twres,@tweak[5]
	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]

	sub	\$16*6,$len
	jc	.Lxts_dec_short			# if $len-=6*16 borrowed

	mov	\$16+96,$rounds
	lea	32($key_,$rnds_),$key		# end of key schedule
	sub	%r10,%rax			# twisted $rounds
	$movkey	16($key_),$rndkey1
	mov	%rax,%r10			# backup twisted $rounds
	lea	.Lxts_magic(%rip),%r8
	jmp	.Lxts_dec_grandloop

.align	32
.Lxts_dec_grandloop:
	movdqu	`16*0`($inp),$inout0		# load input
	movdqa	$rndkey0,$twmask
	movdqu	`16*1`($inp),$inout1
	pxor	@tweak[0],$inout0		# input^=tweak^round[0]
	movdqu	`16*2`($inp),$inout2
	pxor	@tweak[1],$inout1
	aesdec	$rndkey1,$inout0
	movdqu	`16*3`($inp),$inout3
	pxor	@tweak[2],$inout2
	aesdec	$rndkey1,$inout1
	movdqu	`16*4`($inp),$inout4
	pxor	@tweak[3],$inout3
	aesdec	$rndkey1,$inout2
	movdqu	`16*5`($inp),$inout5
	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
	movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
	pxor	@tweak[4],$inout4
	aesdec	$rndkey1,$inout3
	$movkey	32($key_),$rndkey0
	lea	`16*6`($inp),$inp
	pxor	$twmask,$inout5

	pxor	$twres,@tweak[0]		# calculate tweaks^round[last]
	aesdec	$rndkey1,$inout4
	pxor	$twres,@tweak[1]
	movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^last round key
	aesdec	$rndkey1,$inout5
	$movkey	48($key_),$rndkey1
	pxor	$twres,@tweak[2]

	aesdec	$rndkey0,$inout0
	pxor	$twres,@tweak[3]
	movdqa	@tweak[1],`16*1`(%rsp)
	aesdec	$rndkey0,$inout1
	pxor	$twres,@tweak[4]
	movdqa	@tweak[2],`16*2`(%rsp)
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	pxor	$twres,$twmask
	movdqa	@tweak[4],`16*4`(%rsp)
	aesdec	$rndkey0,$inout4
	aesdec	$rndkey0,$inout5
	$movkey	64($key_),$rndkey0
	movdqa	$twmask,`16*5`(%rsp)
	pshufd	\$0x5f,@tweak[5],$twres
	jmp	.Lxts_dec_loop6
.align	32
.Lxts_dec_loop6:
	aesdec	$rndkey1,$inout0
	aesdec	$rndkey1,$inout1
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	aesdec	$rndkey1,$inout4
	aesdec	$rndkey1,$inout5
	$movkey	-64($key,%rax),$rndkey1
	add	\$32,%rax

	aesdec	$rndkey0,$inout0
	aesdec	$rndkey0,$inout1
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	aesdec	$rndkey0,$inout4
	aesdec	$rndkey0,$inout5
	$movkey	-80($key,%rax),$rndkey0
	jnz	.Lxts_dec_loop6

	movdqa	(%r8),$twmask			# start calculating next tweak
	movdqa	$twres,$twtmp
	paddd	$twres,$twres
	aesdec	$rndkey1,$inout0
	paddq	@tweak[5],@tweak[5]
	psrad	\$31,$twtmp
	aesdec	$rndkey1,$inout1
	pand	$twmask,$twtmp
	$movkey	($key_),@tweak[0]		# load round[0]
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	aesdec	$rndkey1,$inout4
	pxor	$twtmp,@tweak[5]
	movaps	@tweak[0],@tweak[1]		# copy round[0]
	aesdec	$rndkey1,$inout5
	$movkey	-64($key),$rndkey1

	movdqa	$twres,$twtmp
	aesdec	$rndkey0,$inout0
	paddd	$twres,$twres
	pxor	@tweak[5],@tweak[0]
	aesdec	$rndkey0,$inout1
	psrad	\$31,$twtmp
	paddq	@tweak[5],@tweak[5]
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	pand	$twmask,$twtmp
	movaps	@tweak[1],@tweak[2]
	aesdec	$rndkey0,$inout4
	pxor	$twtmp,@tweak[5]
	movdqa	$twres,$twtmp
	aesdec	$rndkey0,$inout5
	$movkey	-48($key),$rndkey0

	paddd	$twres,$twres
	aesdec	$rndkey1,$inout0
	pxor	@tweak[5],@tweak[1]
	psrad	\$31,$twtmp
	aesdec	$rndkey1,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	movdqa	@tweak[3],`16*3`(%rsp)
	pxor	$twtmp,@tweak[5]
	aesdec	$rndkey1,$inout4
	movaps	@tweak[2],@tweak[3]
	movdqa	$twres,$twtmp
	aesdec	$rndkey1,$inout5
	$movkey	-32($key),$rndkey1

	paddd	$twres,$twres
	aesdec	$rndkey0,$inout0
	pxor	@tweak[5],@tweak[2]
	psrad	\$31,$twtmp
	aesdec	$rndkey0,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	aesdec	$rndkey0,$inout4
	pxor	$twtmp,@tweak[5]
	movaps	@tweak[3],@tweak[4]
	aesdec	$rndkey0,$inout5

	movdqa	$twres,$rndkey0
	paddd	$twres,$twres
	aesdec	$rndkey1,$inout0
	pxor	@tweak[5],@tweak[3]
	psrad	\$31,$rndkey0
	aesdec	$rndkey1,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$rndkey0
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	pxor	$rndkey0,@tweak[5]
	$movkey	($key_),$rndkey0
	aesdec	$rndkey1,$inout4
	aesdec	$rndkey1,$inout5
	$movkey	16($key_),$rndkey1

	pxor	@tweak[5],@tweak[4]
	aesdeclast	`16*0`(%rsp),$inout0
	psrad	\$31,$twres
	paddq	@tweak[5],@tweak[5]
	aesdeclast	`16*1`(%rsp),$inout1
	aesdeclast	`16*2`(%rsp),$inout2
	pand	$twmask,$twres
	mov	%r10,%rax			# restore $rounds
	aesdeclast	`16*3`(%rsp),$inout3
	aesdeclast	`16*4`(%rsp),$inout4
	aesdeclast	`16*5`(%rsp),$inout5
	pxor	$twres,@tweak[5]

	lea	`16*6`($out),$out		# $out+=6*16
	movups	$inout0,`-16*6`($out)		# store 6 output blocks
	movups	$inout1,`-16*5`($out)
	movups	$inout2,`-16*4`($out)
	movups	$inout3,`-16*3`($out)
	movups	$inout4,`-16*2`($out)
	movups	$inout5,`-16*1`($out)
	sub	\$16*6,$len
	jnc	.Lxts_dec_grandloop		# loop if $len-=6*16 didn't borrow

	mov	\$16+96,$rounds
	sub	$rnds_,$rounds
	mov	$key_,$key			# restore $key
	shr	\$4,$rounds			# restore original value

.Lxts_dec_short:
	# at the point @tweak[0..5] are populated with tweak values
	mov	$rounds,$rnds_			# backup $rounds
	pxor	$rndkey0,@tweak[0]
	pxor	$rndkey0,@tweak[1]
	add	\$16*6,$len			# restore real remaining $len
	jz	.Lxts_dec_done			# done if ($len==0)

	pxor	$rndkey0,@tweak[2]
	cmp	\$0x20,$len
	jb	.Lxts_dec_one			# $len is 1*16
	pxor	$rndkey0,@tweak[3]
	je	.Lxts_dec_two			# $len is 2*16

	pxor	$rndkey0,@tweak[4]
	cmp	\$0x40,$len
	jb	.Lxts_dec_three			# $len is 3*16
	je	.Lxts_dec_four			# $len is 4*16

	movdqu	($inp),$inout0			# $len is 5*16
	movdqu	16*1($inp),$inout1
	movdqu	16*2($inp),$inout2
	pxor	@tweak[0],$inout0
	movdqu	16*3($inp),$inout3
	pxor	@tweak[1],$inout1
	movdqu	16*4($inp),$inout4
	lea	16*5($inp),$inp			# $inp+=5*16
	pxor	@tweak[2],$inout2
	pxor	@tweak[3],$inout3
	pxor	@tweak[4],$inout4

	call	_aesni_decrypt6

	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	movdqu	$inout0,($out)			# store 5 output blocks
	xorps	@tweak[3],$inout3
	movdqu	$inout1,16*1($out)
	xorps	@tweak[4],$inout4
	movdqu	$inout2,16*2($out)
	pxor	$twtmp,$twtmp
	movdqu	$inout3,16*3($out)
	pcmpgtd	@tweak[5],$twtmp
	movdqu	$inout4,16*4($out)
	lea	16*5($out),$out			# $out+=5*16
	pshufd	\$0x13,$twtmp,@tweak[1]		# $twres
	and	\$15,$len_
	jz	.Lxts_dec_ret

	movdqa	@tweak[5],@tweak[0]
	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
	pand	$twmask,@tweak[1]		# isolate carry and residue
	pxor	@tweak[5],@tweak[1]
	jmp	.Lxts_dec_done2

.align	16
.Lxts_dec_one:
	movups	($inp),$inout0
	lea	16*1($inp),$inp			# $inp+=1*16
	xorps	@tweak[0],$inout0
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movdqa	@tweak[1],@tweak[0]
	movups	$inout0,($out)			# store one output block
	movdqa	@tweak[2],@tweak[1]
	lea	16*1($out),$out			# $out+=1*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_two:
	movups	($inp),$inout0
	movups	16($inp),$inout1
	lea	32($inp),$inp			# $inp+=2*16
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1

	call	_aesni_decrypt2

	xorps	@tweak[0],$inout0
	movdqa	@tweak[2],@tweak[0]
	xorps	@tweak[1],$inout1
	movdqa	@tweak[3],@tweak[1]
	movups	$inout0,($out)			# store 2 output blocks
	movups	$inout1,16*1($out)
	lea	16*2($out),$out			# $out+=2*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_three:
	movups	($inp),$inout0
	movups	16*1($inp),$inout1
	movups	16*2($inp),$inout2
	lea	16*3($inp),$inp			# $inp+=3*16
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2

	call	_aesni_decrypt3

	xorps	@tweak[0],$inout0
	movdqa	@tweak[3],@tweak[0]
	xorps	@tweak[1],$inout1
	movdqa	@tweak[4],@tweak[1]
	xorps	@tweak[2],$inout2
	movups	$inout0,($out)			# store 3 output blocks
	movups	$inout1,16*1($out)
	movups	$inout2,16*2($out)
	lea	16*3($out),$out			# $out+=3*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_four:
	movups	($inp),$inout0
	movups	16*1($inp),$inout1
	movups	16*2($inp),$inout2
	xorps	@tweak[0],$inout0
	movups	16*3($inp),$inout3
	lea	16*4($inp),$inp			# $inp+=4*16
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	xorps	@tweak[3],$inout3

	call	_aesni_decrypt4

	pxor	@tweak[0],$inout0
	movdqa	@tweak[4],@tweak[0]
	pxor	@tweak[1],$inout1
	movdqa	@tweak[5],@tweak[1]
	pxor	@tweak[2],$inout2
	movdqu	$inout0,($out)			# store 4 output blocks
	pxor	@tweak[3],$inout3
	movdqu	$inout1,16*1($out)
	movdqu	$inout2,16*2($out)
	movdqu	$inout3,16*3($out)
	lea	16*4($out),$out			# $out+=4*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_done:
	and	\$15,$len_			# see if $len%16 is 0
	jz	.Lxts_dec_ret
.Lxts_dec_done2:
	mov	$len_,$len
	mov	$key_,$key			# restore $key
	mov	$rnds_,$rounds			# restore $rounds

	movups	($inp),$inout0
	xorps	@tweak[1],$inout0
___
	# decrypt the last full input block with the second pending tweak
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[1],$inout0
	movups	$inout0,($out)

.Lxts_dec_steal:
	movzb	16($inp),%eax			# borrow $rounds ...
	movzb	($out),%ecx			# ... and $key
	lea	1($inp),$inp
	mov	%al,($out)
	mov	%cl,16($out)
	lea	1($out),$out
	sub	\$1,$len
	jnz	.Lxts_dec_steal

	sub	$len_,$out			# rewind $out
	mov	$key_,$key			# restore $key
	mov	$rnds_,$rounds			# restore $rounds

	movups	($out),$inout0
	xorps	@tweak[0],$inout0
___
	# decrypt the block reassembled by the byte-swapping loop above
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movups	$inout0,($out)

.Lxts_dec_ret:
	xorps	%xmm0,%xmm0			# clear register bank
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
# Non-Win64: zero the remaining xmm registers and the seven 16-byte
# scratch slots at the bottom of the frame.
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	movaps	%xmm0,0x00(%rsp)		# clear stack
	pxor	%xmm8,%xmm8
	movaps	%xmm0,0x10(%rsp)
	pxor	%xmm9,%xmm9
	movaps	%xmm0,0x20(%rsp)
	pxor	%xmm10,%xmm10
	movaps	%xmm0,0x30(%rsp)
	pxor	%xmm11,%xmm11
	movaps	%xmm0,0x40(%rsp)
	pxor	%xmm12,%xmm12
	movaps	%xmm0,0x50(%rsp)
	pxor	%xmm13,%xmm13
	movaps	%xmm0,0x60(%rsp)
	pxor	%xmm14,%xmm14
	pxor	%xmm15,%xmm15
___
# Win64: reload %xmm6-%xmm15 from their save slots, zeroing each slot
# after it is read, then zero the scratch slots as well.
$code.=<<___ if ($win64);
	movaps	-0xa8(%r11),%xmm6
	movaps	%xmm0,-0xa8(%r11)		# clear stack
	movaps	-0x98(%r11),%xmm7
	movaps	%xmm0,-0x98(%r11)
	movaps	-0x88(%r11),%xmm8
	movaps	%xmm0,-0x88(%r11)
	movaps	-0x78(%r11),%xmm9
	movaps	%xmm0,-0x78(%r11)
	movaps	-0x68(%r11),%xmm10
	movaps	%xmm0,-0x68(%r11)
	movaps	-0x58(%r11),%xmm11
	movaps	%xmm0,-0x58(%r11)
	movaps	-0x48(%r11),%xmm12
	movaps	%xmm0,-0x48(%r11)
	movaps	-0x38(%r11),%xmm13
	movaps	%xmm0,-0x38(%r11)
	movaps	-0x28(%r11),%xmm14
	movaps	%xmm0,-0x28(%r11)
	movaps	-0x18(%r11),%xmm15
	movaps	%xmm0,-0x18(%r11)
	movaps	%xmm0,0x00(%rsp)
	movaps	%xmm0,0x10(%rsp)
	movaps	%xmm0,0x20(%rsp)
	movaps	%xmm0,0x30(%rsp)
	movaps	%xmm0,0x40(%rsp)
	movaps	%xmm0,0x50(%rsp)
	movaps	%xmm0,0x60(%rsp)
___
$code.=<<___;
	mov	-8(%r11),%rbp
.cfi_restore	%rbp
	lea	(%r11),%rsp
.cfi_def_cfa_register	%rsp
.Lxts_dec_epilogue:
	ret
.cfi_endproc
.size	aesni_xts_decrypt,.-aesni_xts_decrypt
___
}

######################################################################
# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
#	const AES_KEY *key, unsigned int start_block_num,
#	unsigned char offset_i[16], const unsigned char L_[][16],
#	unsigned char checksum[16]);
#
{
my @offset=map("%xmm$_",(10..15));
my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
my ($block_num,$offset_p)=("%r8","%r9");		# 5th and 6th arguments
my ($L_p,$checksum_p) = ("%rbx","%rbp");
my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
my $seventh_arg = $win64 ?
56 : 8;
my $blocks = $len;

# aesni_ocb_encrypt: OCB encryption entry point (declared below as a
# six-argument function; the 7th and 8th arguments are fetched from the
# caller's stack at $seventh_arg and $seventh_arg+8 relative to the entry
# stack pointer saved in %rax).  Outline of the emitted code:
#   - round[0]^round[last] is kept in $rndkey0l and the running offset in
#     @offset[5] is kept pre-xored with round[last];
#   - if the starting block number is even, one block is processed alone
#     so that the six-block path always starts on an odd block number;
#   - groups of six blocks go through .Locb_enc_grandloop, one to five
#     leftover blocks through .Locb_enc_short;
#   - .Locb_enc_done writes the checksum and the last offset back, then
#     the xmm registers are zeroed.
$code.=<<___;
.globl	aesni_ocb_encrypt
.type	aesni_ocb_encrypt,\@function,6
.align	32
aesni_ocb_encrypt:
.cfi_startproc
	lea	(%rsp),%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
___
# Win64 only: reserve 0xa0 bytes and save %xmm6-%xmm15 there.
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp),%rsp
	movaps	%xmm6,0x00(%rsp)		# offload everything
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,0x60(%rsp)
	movaps	%xmm13,0x70(%rsp)
	movaps	%xmm14,0x80(%rsp)
	movaps	%xmm15,0x90(%rsp)
.Locb_enc_body:
___
$code.=<<___;
	mov	$seventh_arg(%rax),$L_p		# 7th argument
	mov	$seventh_arg+8(%rax),$checksum_p# 8th argument

	mov	240($key),$rnds_
	mov	$key,$key_
	shl	\$4,$rnds_
	$movkey	($key),$rndkey0l		# round[0]
	$movkey	16($key,$rnds_),$rndkey1	# round[last]

	movdqu	($offset_p),@offset[5]		# load last offset_i
	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]

	mov	\$16+32,$rounds
	lea	32($key_,$rnds_),$key
	$movkey	16($key_),$rndkey1		# round[1]
	sub	%r10,%rax			# twisted $rounds
	mov	%rax,%r10			# backup twisted $rounds

	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
	movdqu	($checksum_p),$checksum		# load checksum

	test	\$1,$block_num			# is first block number odd?
	jnz	.Locb_enc_odd

	bsf	$block_num,$i1
	add	\$1,$block_num
	shl	\$4,$i1
	movdqu	($L_p,$i1),$inout5		# borrow
	movdqu	($inp),$inout0
	lea	16($inp),$inp

	call	__ocb_encrypt1

	movdqa	$inout5,@offset[5]
	movups	$inout0,($out)
	lea	16($out),$out
	sub	\$1,$blocks
	jz	.Locb_enc_done

.Locb_enc_odd:
	lea	1($block_num),$i1		# even-numbered blocks
	lea	3($block_num),$i3
	lea	5($block_num),$i5
	lea	6($block_num),$block_num
	bsf	$i1,$i1				# ntz(block)
	bsf	$i3,$i3
	bsf	$i5,$i5
	shl	\$4,$i1				# ntz(block) -> table offset
	shl	\$4,$i3
	shl	\$4,$i5

	sub	\$6,$blocks
	jc	.Locb_enc_short
	jmp	.Locb_enc_grandloop

.align	32
.Locb_enc_grandloop:
	movdqu	`16*0`($inp),$inout0		# load input
	movdqu	`16*1`($inp),$inout1
	movdqu	`16*2`($inp),$inout2
	movdqu	`16*3`($inp),$inout3
	movdqu	`16*4`($inp),$inout4
	movdqu	`16*5`($inp),$inout5
	lea	`16*6`($inp),$inp

	call	__ocb_encrypt6

	movups	$inout0,`16*0`($out)		# store output
	movups	$inout1,`16*1`($out)
	movups	$inout2,`16*2`($out)
	movups	$inout3,`16*3`($out)
	movups	$inout4,`16*4`($out)
	movups	$inout5,`16*5`($out)
	lea	`16*6`($out),$out
	sub	\$6,$blocks
	jnc	.Locb_enc_grandloop

.Locb_enc_short:
	add	\$6,$blocks
	jz	.Locb_enc_done

	movdqu	`16*0`($inp),$inout0
	cmp	\$2,$blocks
	jb	.Locb_enc_one
	movdqu	`16*1`($inp),$inout1
	je	.Locb_enc_two

	movdqu	`16*2`($inp),$inout2
	cmp	\$4,$blocks
	jb	.Locb_enc_three
	movdqu	`16*3`($inp),$inout3
	je	.Locb_enc_four

	movdqu	`16*4`($inp),$inout4
	pxor	$inout5,$inout5

	call	__ocb_encrypt6

	movdqa	@offset[4],@offset[5]
	movups	$inout0,`16*0`($out)
	movups	$inout1,`16*1`($out)
	movups	$inout2,`16*2`($out)
	movups	$inout3,`16*3`($out)
	movups	$inout4,`16*4`($out)

	jmp	.Locb_enc_done

.align	16
.Locb_enc_one:
	movdqa	@offset[0],$inout5		# borrow

	call	__ocb_encrypt1

	movdqa	$inout5,@offset[5]
	movups	$inout0,`16*0`($out)
	jmp	.Locb_enc_done

.align	16
.Locb_enc_two:
	pxor	$inout2,$inout2
	pxor	$inout3,$inout3

	call	__ocb_encrypt4

	movdqa	@offset[1],@offset[5]
	movups	$inout0,`16*0`($out)
	movups	$inout1,`16*1`($out)

	jmp	.Locb_enc_done

.align	16
.Locb_enc_three:
	pxor	$inout3,$inout3

	call	__ocb_encrypt4

	movdqa	@offset[2],@offset[5]
	movups	$inout0,`16*0`($out)
	movups	$inout1,`16*1`($out)
	movups	$inout2,`16*2`($out)

	jmp	.Locb_enc_done

.align	16
.Locb_enc_four:
	call	__ocb_encrypt4

	movdqa	@offset[3],@offset[5]
	movups	$inout0,`16*0`($out)
	movups	$inout1,`16*1`($out)
	movups	$inout2,`16*2`($out)
	movups	$inout3,`16*3`($out)

.Locb_enc_done:
	pxor	$rndkey0,@offset[5]		# "remove" round[last]
	movdqu	$checksum,($checksum_p)		# store checksum
	movdqu	@offset[5],($offset_p)		# store last offset_i

	xorps	%xmm0,%xmm0			# clear register bank
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
# Non-Win64: zero the remaining xmm registers; %rax is pointed back at
# the entry stack pointer (five 8-byte pushes above the current one).
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	pxor	%xmm8,%xmm8
	pxor	%xmm9,%xmm9
	pxor	%xmm10,%xmm10
	pxor	%xmm11,%xmm11
	pxor	%xmm12,%xmm12
	pxor	%xmm13,%xmm13
	pxor	%xmm14,%xmm14
	pxor	%xmm15,%xmm15
	lea	0x28(%rsp),%rax
.cfi_def_cfa	%rax,8
___
# Win64: reload %xmm6-%xmm15 from the save area, zeroing each slot after
# it is read; %rax is pointed past the save area and the five pushes.
$code.=<<___ if ($win64);
	movaps	0x00(%rsp),%xmm6
	movaps	%xmm0,0x00(%rsp)		# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	movaps	0x40(%rsp),%xmm10
	movaps	%xmm0,0x40(%rsp)
	movaps	0x50(%rsp),%xmm11
	movaps	%xmm0,0x50(%rsp)
	movaps	0x60(%rsp),%xmm12
	movaps	%xmm0,0x60(%rsp)
	movaps	0x70(%rsp),%xmm13
	movaps	%xmm0,0x70(%rsp)
	movaps	0x80(%rsp),%xmm14
	movaps	%xmm0,0x80(%rsp)
	movaps	0x90(%rsp),%xmm15
	movaps	%xmm0,0x90(%rsp)
	lea	0xa0+0x28(%rsp),%rax
.Locb_enc_pop:
___
# The heredoc below holds the shared epilogue of aesni_ocb_encrypt, the
# three internal helpers __ocb_encrypt6, __ocb_encrypt4 and __ocb_encrypt1
# (declared abi-omnipotent, i.e. called with the register assignments set
# up above rather than through the normal argument convention), and the
# prologue of aesni_ocb_decrypt.
$code.=<<___;
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Locb_enc_epilogue:
	ret
.cfi_endproc
.size	aesni_ocb_encrypt,.-aesni_ocb_encrypt

# Six-block helper: the caller loads six input blocks beforehand and the
# results are left in the same registers.  Input blocks are folded into
# the checksum, the table offsets for the next group of six are computed
# along the way, and L_0 plus the twisted round counter are reloaded
# before returning.
.type	__ocb_encrypt6,\@abi-omnipotent
.align	32
__ocb_encrypt6:
	pxor	$rndkey0l,@offset[5]		# offset_i ^ round[0]
	movdqu	($L_p,$i1),@offset[1]
	movdqa	@offset[0],@offset[2]
	movdqu	($L_p,$i3),@offset[3]
	movdqa	@offset[0],@offset[4]
	pxor	@offset[5],@offset[0]
	movdqu	($L_p,$i5),@offset[5]
	pxor	@offset[0],@offset[1]
	pxor	$inout0,$checksum		# accumulate checksum
	pxor	@offset[0],$inout0		# input ^ round[0] ^ offset_i
	pxor	@offset[1],@offset[2]
	pxor	$inout1,$checksum
	pxor	@offset[1],$inout1
	pxor	@offset[2],@offset[3]
	pxor	$inout2,$checksum
	pxor	@offset[2],$inout2
	pxor	@offset[3],@offset[4]
	pxor	$inout3,$checksum
	pxor	@offset[3],$inout3
	pxor	@offset[4],@offset[5]
	pxor	$inout4,$checksum
	pxor	@offset[4],$inout4
	pxor	$inout5,$checksum
	pxor	@offset[5],$inout5
	$movkey	32($key_),$rndkey0

	lea	1($block_num),$i1		# even-numbered blocks
	lea	3($block_num),$i3
	lea	5($block_num),$i5
	add	\$6,$block_num
	pxor	$rndkey0l,@offset[0]		# offset_i ^ round[last]
	bsf	$i1,$i1				# ntz(block)
	bsf	$i3,$i3
	bsf	$i5,$i5

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	pxor	$rndkey0l,@offset[1]
	pxor	$rndkey0l,@offset[2]
	aesenc	$rndkey1,$inout4
	pxor	$rndkey0l,@offset[3]
	pxor	$rndkey0l,@offset[4]
	aesenc	$rndkey1,$inout5
	$movkey	48($key_),$rndkey1
	pxor	$rndkey0l,@offset[5]

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	$movkey	64($key_),$rndkey0
	shl	\$4,$i1				# ntz(block) -> table offset
	shl	\$4,$i3
	jmp	.Locb_enc_loop6

.align	32
.Locb_enc_loop6:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Locb_enc_loop6

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	$movkey	16($key_),$rndkey1
	shl	\$4,$i5

	aesenclast	@offset[0],$inout0
	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
	mov	%r10,%rax			# restore twisted rounds
	aesenclast	@offset[1],$inout1
	aesenclast	@offset[2],$inout2
	aesenclast	@offset[3],$inout3
	aesenclast	@offset[4],$inout4
	aesenclast	@offset[5],$inout5
	ret
.size	__ocb_encrypt6,.-__ocb_encrypt6

# Four-block helper: same scheme as the six-block one, but only four
# offsets are derived and no table offsets are precomputed for a
# following group.
.type	__ocb_encrypt4,\@abi-omnipotent
.align	32
__ocb_encrypt4:
	pxor	$rndkey0l,@offset[5]		# offset_i ^ round[0]
	movdqu	($L_p,$i1),@offset[1]
	movdqa	@offset[0],@offset[2]
	movdqu	($L_p,$i3),@offset[3]
	pxor	@offset[5],@offset[0]
	pxor	@offset[0],@offset[1]
	pxor	$inout0,$checksum		# accumulate checksum
	pxor	@offset[0],$inout0		# input ^ round[0] ^ offset_i
	pxor	@offset[1],@offset[2]
	pxor	$inout1,$checksum
	pxor	@offset[1],$inout1
	pxor	@offset[2],@offset[3]
	pxor	$inout2,$checksum
	pxor	@offset[2],$inout2
	pxor	$inout3,$checksum
	pxor	@offset[3],$inout3
	$movkey	32($key_),$rndkey0

	pxor	$rndkey0l,@offset[0]		# offset_i ^ round[last]
	pxor	$rndkey0l,@offset[1]
	pxor	$rndkey0l,@offset[2]
	pxor	$rndkey0l,@offset[3]

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	$movkey	48($key_),$rndkey1

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	$movkey	64($key_),$rndkey0
	jmp	.Locb_enc_loop4

.align	32
.Locb_enc_loop4:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Locb_enc_loop4

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	$movkey	16($key_),$rndkey1
	mov	%r10,%rax			# restore twisted rounds

	aesenclast	@offset[0],$inout0
	aesenclast	@offset[1],$inout1
	aesenclast	@offset[2],$inout2
	aesenclast	@offset[3],$inout3
	ret
.size	__ocb_encrypt4,.-__ocb_encrypt4

# Single-block helper: the caller passes the table entry for this block
# in the borrowed sixth data register; on return that register holds the
# new offset, still pre-xored with round[last].
.type	__ocb_encrypt1,\@abi-omnipotent
.align	32
__ocb_encrypt1:
	pxor	@offset[5],$inout5		# offset_i
	pxor	$rndkey0l,$inout5		# offset_i ^ round[0]
	pxor	$inout0,$checksum		# accumulate checksum
	pxor	$inout5,$inout0			# input ^ round[0] ^ offset_i
	$movkey	32($key_),$rndkey0

	aesenc	$rndkey1,$inout0
	$movkey	48($key_),$rndkey1
	pxor	$rndkey0l,$inout5		# offset_i ^ round[last]

	aesenc	$rndkey0,$inout0
	$movkey	64($key_),$rndkey0
	jmp	.Locb_enc_loop1

.align	32
.Locb_enc_loop1:
	aesenc	$rndkey1,$inout0
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax

	aesenc	$rndkey0,$inout0
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Locb_enc_loop1

	aesenc	$rndkey1,$inout0
	$movkey	16($key_),$rndkey1		# redundant in tail
	mov	%r10,%rax			# restore twisted rounds

	aesenclast	$inout5,$inout0
	ret
.size	__ocb_encrypt1,.-__ocb_encrypt1

.globl	aesni_ocb_decrypt
.type	aesni_ocb_decrypt,\@function,6
.align	32
aesni_ocb_decrypt:
.cfi_startproc
	lea	(%rsp),%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
___
# Win64 only: reserve 0xa0 bytes and save %xmm6-%xmm15 there.
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp),%rsp
	movaps	%xmm6,0x00(%rsp)		# offload everything
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,0x60(%rsp)
	movaps	%xmm13,0x70(%rsp)
	movaps	%xmm14,0x80(%rsp)
	movaps	%xmm15,0x90(%rsp)
.Locb_dec_body:
___
$code.=<<___;
	mov	$seventh_arg(%rax),$L_p		# 7th argument
	mov	$seventh_arg+8(%rax),$checksum_p# 8th argument

	mov	240($key),$rnds_
	mov	$key,$key_
	shl	\$4,$rnds_
	$movkey	($key),$rndkey0l		# round[0]
	$movkey	16($key,$rnds_),$rndkey1	# round[last]

	movdqu	($offset_p),@offset[5]		# load last offset_i
	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]

	mov	\$16+32,$rounds
	lea	32($key_,$rnds_),$key
	$movkey	16($key_),$rndkey1		# round[1]
	sub	%r10,%rax			# twisted $rounds
	mov	%rax,%r10			# backup twisted $rounds

	movdqu
($L_p),@offset[0] # L_0 for all odd-numbered blocks 3273 movdqu ($checksum_p),$checksum # load checksum 3274 3275 test \$1,$block_num # is first block number odd? 3276 jnz .Locb_dec_odd 3277 3278 bsf $block_num,$i1 3279 add \$1,$block_num 3280 shl \$4,$i1 3281 movdqu ($L_p,$i1),$inout5 # borrow 3282 movdqu ($inp),$inout0 3283 lea 16($inp),$inp 3284 3285 call __ocb_decrypt1 3286 3287 movdqa $inout5,@offset[5] 3288 movups $inout0,($out) 3289 xorps $inout0,$checksum # accumulate checksum 3290 lea 16($out),$out 3291 sub \$1,$blocks 3292 jz .Locb_dec_done 3293 3294.Locb_dec_odd: 3295 lea 1($block_num),$i1 # even-numbered blocks 3296 lea 3($block_num),$i3 3297 lea 5($block_num),$i5 3298 lea 6($block_num),$block_num 3299 bsf $i1,$i1 # ntz(block) 3300 bsf $i3,$i3 3301 bsf $i5,$i5 3302 shl \$4,$i1 # ntz(block) -> table offset 3303 shl \$4,$i3 3304 shl \$4,$i5 3305 3306 sub \$6,$blocks 3307 jc .Locb_dec_short 3308 jmp .Locb_dec_grandloop 3309 3310.align 32 3311.Locb_dec_grandloop: 3312 movdqu `16*0`($inp),$inout0 # load input 3313 movdqu `16*1`($inp),$inout1 3314 movdqu `16*2`($inp),$inout2 3315 movdqu `16*3`($inp),$inout3 3316 movdqu `16*4`($inp),$inout4 3317 movdqu `16*5`($inp),$inout5 3318 lea `16*6`($inp),$inp 3319 3320 call __ocb_decrypt6 3321 3322 movups $inout0,`16*0`($out) # store output 3323 pxor $inout0,$checksum # accumulate checksum 3324 movups $inout1,`16*1`($out) 3325 pxor $inout1,$checksum 3326 movups $inout2,`16*2`($out) 3327 pxor $inout2,$checksum 3328 movups $inout3,`16*3`($out) 3329 pxor $inout3,$checksum 3330 movups $inout4,`16*4`($out) 3331 pxor $inout4,$checksum 3332 movups $inout5,`16*5`($out) 3333 pxor $inout5,$checksum 3334 lea `16*6`($out),$out 3335 sub \$6,$blocks 3336 jnc .Locb_dec_grandloop 3337 3338.Locb_dec_short: 3339 add \$6,$blocks 3340 jz .Locb_dec_done 3341 3342 movdqu `16*0`($inp),$inout0 3343 cmp \$2,$blocks 3344 jb .Locb_dec_one 3345 movdqu `16*1`($inp),$inout1 3346 je .Locb_dec_two 3347 3348 movdqu `16*2`($inp),$inout2 3349 cmp 
\$4,$blocks 3350 jb .Locb_dec_three 3351 movdqu `16*3`($inp),$inout3 3352 je .Locb_dec_four 3353 3354 movdqu `16*4`($inp),$inout4 3355 pxor $inout5,$inout5 3356 3357 call __ocb_decrypt6 3358 3359 movdqa @offset[4],@offset[5] 3360 movups $inout0,`16*0`($out) # store output 3361 pxor $inout0,$checksum # accumulate checksum 3362 movups $inout1,`16*1`($out) 3363 pxor $inout1,$checksum 3364 movups $inout2,`16*2`($out) 3365 pxor $inout2,$checksum 3366 movups $inout3,`16*3`($out) 3367 pxor $inout3,$checksum 3368 movups $inout4,`16*4`($out) 3369 pxor $inout4,$checksum 3370 3371 jmp .Locb_dec_done 3372 3373.align 16 3374.Locb_dec_one: 3375 movdqa @offset[0],$inout5 # borrow 3376 3377 call __ocb_decrypt1 3378 3379 movdqa $inout5,@offset[5] 3380 movups $inout0,`16*0`($out) # store output 3381 xorps $inout0,$checksum # accumulate checksum 3382 jmp .Locb_dec_done 3383 3384.align 16 3385.Locb_dec_two: 3386 pxor $inout2,$inout2 3387 pxor $inout3,$inout3 3388 3389 call __ocb_decrypt4 3390 3391 movdqa @offset[1],@offset[5] 3392 movups $inout0,`16*0`($out) # store output 3393 xorps $inout0,$checksum # accumulate checksum 3394 movups $inout1,`16*1`($out) 3395 xorps $inout1,$checksum 3396 3397 jmp .Locb_dec_done 3398 3399.align 16 3400.Locb_dec_three: 3401 pxor $inout3,$inout3 3402 3403 call __ocb_decrypt4 3404 3405 movdqa @offset[2],@offset[5] 3406 movups $inout0,`16*0`($out) # store output 3407 xorps $inout0,$checksum # accumulate checksum 3408 movups $inout1,`16*1`($out) 3409 xorps $inout1,$checksum 3410 movups $inout2,`16*2`($out) 3411 xorps $inout2,$checksum 3412 3413 jmp .Locb_dec_done 3414 3415.align 16 3416.Locb_dec_four: 3417 call __ocb_decrypt4 3418 3419 movdqa @offset[3],@offset[5] 3420 movups $inout0,`16*0`($out) # store output 3421 pxor $inout0,$checksum # accumulate checksum 3422 movups $inout1,`16*1`($out) 3423 pxor $inout1,$checksum 3424 movups $inout2,`16*2`($out) 3425 pxor $inout2,$checksum 3426 movups $inout3,`16*3`($out) 3427 pxor $inout3,$checksum 3428 
.Locb_dec_done:
	# Common exit of the decrypt entry point: publish the running
	# checksum and offset, then clear the register bank.
	pxor	$rndkey0,@offset[5]		# "remove" round[last]
	movdqu	$checksum,($checksum_p)		# store checksum
	movdqu	@offset[5],($offset_p)		# store last offset_i

	xorps	%xmm0,%xmm0			# clear register bank
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
# Outside Win64 the prologue did not save %xmm6-15, so they are zeroed
# like the rest of the register bank.
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	pxor	%xmm8,%xmm8
	pxor	%xmm9,%xmm9
	pxor	%xmm10,%xmm10
	pxor	%xmm11,%xmm11
	pxor	%xmm12,%xmm12
	pxor	%xmm13,%xmm13
	pxor	%xmm14,%xmm14
	pxor	%xmm15,%xmm15
	lea	0x28(%rsp),%rax			# step over the five pushed registers
.cfi_def_cfa	%rax,8
___
# On Win64 %xmm6-15 are reloaded from the save area filled in the
# prologue, and every slot is overwritten with the zeroed %xmm0.
$code.=<<___ if ($win64);
	movaps	0x00(%rsp),%xmm6
	movaps	%xmm0,0x00(%rsp)		# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	movaps	0x40(%rsp),%xmm10
	movaps	%xmm0,0x40(%rsp)
	movaps	0x50(%rsp),%xmm11
	movaps	%xmm0,0x50(%rsp)
	movaps	0x60(%rsp),%xmm12
	movaps	%xmm0,0x60(%rsp)
	movaps	0x70(%rsp),%xmm13
	movaps	%xmm0,0x70(%rsp)
	movaps	0x80(%rsp),%xmm14
	movaps	%xmm0,0x80(%rsp)
	movaps	0x90(%rsp),%xmm15
	movaps	%xmm0,0x90(%rsp)
	lea	0xa0+0x28(%rsp),%rax		# save area plus the five pushed registers
.Locb_dec_pop:
___
$code.=<<___;
	# %rax now points just above the pushed registers; reload them
	# and make %rax the stack pointer again.
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Locb_dec_epilogue:
	ret
.cfi_endproc
.size	aesni_ocb_decrypt,.-aesni_ocb_decrypt

# __ocb_decrypt6 - decrypt the six blocks held in $inout0..$inout5.
# On entry @offset[0] holds L_0, @offset[5] the previous offset with
# round[last] folded in, $i1/$i3/$i5 the L_ table offsets for the
# even-numbered blocks and %rax the twisted round counter.
# Besides decrypting, the routine advances $block_num by 6, recomputes
# $i1/$i3/$i5 for the next call, reloads L_0 into @offset[0] and
# restores %rax from %r10.
.type	__ocb_decrypt6,\@abi-omnipotent
.align	32
__ocb_decrypt6:
	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
	 movdqu		($L_p,$i1),@offset[1]
	 movdqa		@offset[0],@offset[2]
	 movdqu		($L_p,$i3),@offset[3]
	 movdqa		@offset[0],@offset[4]
	 pxor		@offset[5],@offset[0]
	 movdqu		($L_p,$i5),@offset[5]
	 pxor		@offset[0],@offset[1]
	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
	 pxor		@offset[1],@offset[2]
	pxor		@offset[1],$inout1
	 pxor		@offset[2],@offset[3]
	pxor		@offset[2],$inout2
	 pxor		@offset[3],@offset[4]
	pxor		@offset[3],$inout3
	 pxor		@offset[4],@offset[5]
	pxor		@offset[4],$inout4
	pxor		@offset[5],$inout5
	$movkey		32($key_),$rndkey0

	# index bookkeeping for the next six blocks is interleaved with
	# the first rounds
	lea		1($block_num),$i1	# even-numbered blocks
	lea		3($block_num),$i3
	lea		5($block_num),$i5
	add		\$6,$block_num
	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
	bsf		$i1,$i1			# ntz(block)
	bsf		$i3,$i3
	bsf		$i5,$i5

	aesdec		$rndkey1,$inout0
	aesdec		$rndkey1,$inout1
	aesdec		$rndkey1,$inout2
	aesdec		$rndkey1,$inout3
	 pxor		$rndkey0l,@offset[1]
	 pxor		$rndkey0l,@offset[2]
	aesdec		$rndkey1,$inout4
	 pxor		$rndkey0l,@offset[3]
	 pxor		$rndkey0l,@offset[4]
	aesdec		$rndkey1,$inout5
	$movkey		48($key_),$rndkey1
	 pxor		$rndkey0l,@offset[5]

	aesdec		$rndkey0,$inout0
	aesdec		$rndkey0,$inout1
	aesdec		$rndkey0,$inout2
	aesdec		$rndkey0,$inout3
	aesdec		$rndkey0,$inout4
	aesdec		$rndkey0,$inout5
	$movkey		64($key_),$rndkey0
	shl		\$4,$i1			# ntz(block) -> table offset
	shl		\$4,$i3
	jmp		.Locb_dec_loop6

.align	32
.Locb_dec_loop6:
	# two rounds per pass; %rax counts up by 32 and the loop ends
	# when it reaches zero
	aesdec		$rndkey1,$inout0
	aesdec		$rndkey1,$inout1
	aesdec		$rndkey1,$inout2
	aesdec		$rndkey1,$inout3
	aesdec		$rndkey1,$inout4
	aesdec		$rndkey1,$inout5
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax

	aesdec		$rndkey0,$inout0
	aesdec		$rndkey0,$inout1
	aesdec		$rndkey0,$inout2
	aesdec		$rndkey0,$inout3
	aesdec		$rndkey0,$inout4
	aesdec		$rndkey0,$inout5
	$movkey		-16($key,%rax),$rndkey0
	jnz		.Locb_dec_loop6

	aesdec		$rndkey1,$inout0
	aesdec		$rndkey1,$inout1
	aesdec		$rndkey1,$inout2
	aesdec		$rndkey1,$inout3
	aesdec		$rndkey1,$inout4
	aesdec		$rndkey1,$inout5
	$movkey		16($key_),$rndkey1
	shl		\$4,$i5

	aesdeclast	@offset[0],$inout0
	movdqu		($L_p),@offset[0]	# L_0 for all odd-numbered blocks
	mov		%r10,%rax		# restore twisted rounds
	aesdeclast	@offset[1],$inout1
	aesdeclast	@offset[2],$inout2
	aesdeclast	@offset[3],$inout3
	aesdeclast	@offset[4],$inout4
	aesdeclast	@offset[5],$inout5
	ret
.size	__ocb_decrypt6,.-__ocb_decrypt6

# __ocb_decrypt4 - four-block variant, called for the 2-, 3- and
# 4-block tails. Same register contract on entry as the six-block
# routine, but $block_num and the $i1/$i3/$i5 indices are left alone
# and L_0 is not reloaded into @offset[0].
.type	__ocb_decrypt4,\@abi-omnipotent
.align	32
__ocb_decrypt4:
	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
	 movdqu		($L_p,$i1),@offset[1]
	 movdqa		@offset[0],@offset[2]
	 movdqu		($L_p,$i3),@offset[3]
	 pxor		@offset[5],@offset[0]
	 pxor		@offset[0],@offset[1]
	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
	 pxor		@offset[1],@offset[2]
	pxor		@offset[1],$inout1
	 pxor		@offset[2],@offset[3]
	pxor		@offset[2],$inout2
	pxor		@offset[3],$inout3
	$movkey		32($key_),$rndkey0

	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
	 pxor		$rndkey0l,@offset[1]
	 pxor		$rndkey0l,@offset[2]
	 pxor		$rndkey0l,@offset[3]

	aesdec		$rndkey1,$inout0
	aesdec		$rndkey1,$inout1
	aesdec		$rndkey1,$inout2
	aesdec		$rndkey1,$inout3
	$movkey		48($key_),$rndkey1

	aesdec		$rndkey0,$inout0
	aesdec		$rndkey0,$inout1
	aesdec		$rndkey0,$inout2
	aesdec		$rndkey0,$inout3
	$movkey		64($key_),$rndkey0
	jmp		.Locb_dec_loop4

.align	32
.Locb_dec_loop4:
	aesdec		$rndkey1,$inout0
	aesdec		$rndkey1,$inout1
	aesdec		$rndkey1,$inout2
	aesdec		$rndkey1,$inout3
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax

	aesdec		$rndkey0,$inout0
	aesdec		$rndkey0,$inout1
	aesdec		$rndkey0,$inout2
	aesdec		$rndkey0,$inout3
	$movkey		-16($key,%rax),$rndkey0
	jnz		.Locb_dec_loop4

	aesdec		$rndkey1,$inout0
	aesdec		$rndkey1,$inout1
	aesdec		$rndkey1,$inout2
	aesdec		$rndkey1,$inout3
	$movkey		16($key_),$rndkey1
	mov		%r10,%rax		# restore twisted rounds

	aesdeclast	@offset[0],$inout0
	aesdeclast	@offset[1],$inout1
	aesdeclast	@offset[2],$inout2
	aesdeclast	@offset[3],$inout3
	ret
.size	__ocb_decrypt4,.-__ocb_decrypt4

# __ocb_decrypt1 - single-block variant. The caller passes the L_
# table entry for the block in $inout5 ("borrow"); on return $inout5
# holds offset_i ^ round[last], which the callers copy to @offset[5].
.type	__ocb_decrypt1,\@abi-omnipotent
.align	32
__ocb_decrypt1:
	 pxor		@offset[5],$inout5	# offset_i
	 pxor		$rndkey0l,$inout5	# offset_i ^ round[0]
	pxor		$inout5,$inout0		# input ^ round[0] ^ offset_i
	$movkey		32($key_),$rndkey0

	aesdec		$rndkey1,$inout0
	$movkey		48($key_),$rndkey1
	pxor		$rndkey0l,$inout5	# offset_i ^ round[last]

	aesdec		$rndkey0,$inout0
	$movkey		64($key_),$rndkey0
	jmp		.Locb_dec_loop1

.align	32
.Locb_dec_loop1:
	aesdec		$rndkey1,$inout0
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax

	aesdec		$rndkey0,$inout0
	$movkey		-16($key,%rax),$rndkey0
	jnz		.Locb_dec_loop1

	aesdec		$rndkey1,$inout0
	$movkey		16($key_),$rndkey1	# redundant in tail
	mov		%r10,%rax		# restore twisted rounds

	aesdeclast	$inout5,$inout0
	ret
.size	__ocb_decrypt1,.-__ocb_decrypt1
___
} }}

########################################################################
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
#			    size_t length, const AES_KEY *key,
#			    unsigned char *ivp,const int enc);
#
# A zero length returns immediately; a zero 6th argument (enc) selects
# the decrypt path, anything else the encrypt path.
{
my $frame_size = 0x10 + ($win64?0xa0:0);	# used in decrypt
my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));

$code.=<<___;
.globl	${PREFIX}_cbc_encrypt
.type	${PREFIX}_cbc_encrypt,\@function,6
.align	16
${PREFIX}_cbc_encrypt:
.cfi_startproc
	test	$len,$len		# check length
	jz	.Lcbc_ret

	mov	240($key),$rnds_	# key->rounds
	mov	$key,$key_		# backup $key
	test	%r9d,%r9d		# 6th argument
	jz	.Lcbc_decrypt
3715#--------------------------- CBC ENCRYPT ------------------------------# 3716 movups ($ivp),$inout0 # load iv as initial state 3717 mov $rnds_,$rounds 3718 cmp \$16,$len 3719 jb .Lcbc_enc_tail 3720 sub \$16,$len 3721 jmp .Lcbc_enc_loop 3722.align 16 3723.Lcbc_enc_loop: 3724 movups ($inp),$inout1 # load input 3725 lea 16($inp),$inp 3726 #xorps $inout1,$inout0 3727___ 3728 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1); 3729$code.=<<___; 3730 mov $rnds_,$rounds # restore $rounds 3731 mov $key_,$key # restore $key 3732 movups $inout0,0($out) # store output 3733 lea 16($out),$out 3734 sub \$16,$len 3735 jnc .Lcbc_enc_loop 3736 add \$16,$len 3737 jnz .Lcbc_enc_tail 3738 pxor $rndkey0,$rndkey0 # clear register bank 3739 pxor $rndkey1,$rndkey1 3740 movups $inout0,($ivp) 3741 pxor $inout0,$inout0 3742 pxor $inout1,$inout1 3743 jmp .Lcbc_ret 3744 3745.Lcbc_enc_tail: 3746 mov $len,%rcx # zaps $key 3747 xchg $inp,$out # $inp is %rsi and $out is %rdi now 3748 .long 0x9066A4F3 # rep movsb 3749 mov \$16,%ecx # zero tail 3750 sub $len,%rcx 3751 xor %eax,%eax 3752 .long 0x9066AAF3 # rep stosb 3753 lea -16(%rdi),%rdi # rewind $out by 1 block 3754 mov $rnds_,$rounds # restore $rounds 3755 mov %rdi,%rsi # $inp and $out are the same 3756 mov $key_,$key # restore $key 3757 xor $len,$len # len=16 3758 jmp .Lcbc_enc_loop # one more spin 3759#--------------------------- CBC DECRYPT ------------------------------# 3760.align 16 3761.Lcbc_decrypt: 3762 cmp \$16,$len 3763 jne .Lcbc_decrypt_bulk 3764 3765 # handle single block without allocating stack frame, 3766 # useful in ciphertext stealing mode 3767 movdqu ($inp),$inout0 # load input 3768 movdqu ($ivp),$inout1 # load iv 3769 movdqa $inout0,$inout2 # future iv 3770___ 3771 &aesni_generate1("dec",$key,$rnds_); 3772$code.=<<___; 3773 pxor $rndkey0,$rndkey0 # clear register bank 3774 pxor $rndkey1,$rndkey1 3775 movdqu $inout2,($ivp) # store iv 3776 xorps $inout1,$inout0 # ^=iv 3777 pxor $inout1,$inout1 3778 movups $inout0,($out) # 
store output 3779 pxor $inout0,$inout0 3780 jmp .Lcbc_ret 3781.align 16 3782.Lcbc_decrypt_bulk: 3783 lea (%rsp),%r11 # frame pointer 3784.cfi_def_cfa_register %r11 3785 push %rbp 3786.cfi_push %rbp 3787 sub \$$frame_size,%rsp 3788 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 3789___ 3790$code.=<<___ if ($win64); 3791 movaps %xmm6,0x10(%rsp) 3792 movaps %xmm7,0x20(%rsp) 3793 movaps %xmm8,0x30(%rsp) 3794 movaps %xmm9,0x40(%rsp) 3795 movaps %xmm10,0x50(%rsp) 3796 movaps %xmm11,0x60(%rsp) 3797 movaps %xmm12,0x70(%rsp) 3798 movaps %xmm13,0x80(%rsp) 3799 movaps %xmm14,0x90(%rsp) 3800 movaps %xmm15,0xa0(%rsp) 3801.Lcbc_decrypt_body: 3802___ 3803 3804my $inp_=$key_="%rbp"; # reassign $key_ 3805 3806$code.=<<___; 3807 mov $key,$key_ # [re-]backup $key [after reassignment] 3808 movups ($ivp),$iv 3809 mov $rnds_,$rounds 3810 cmp \$0x50,$len 3811 jbe .Lcbc_dec_tail 3812 3813 $movkey ($key),$rndkey0 3814 movdqu 0x00($inp),$inout0 # load input 3815 movdqu 0x10($inp),$inout1 3816 movdqa $inout0,$in0 3817 movdqu 0x20($inp),$inout2 3818 movdqa $inout1,$in1 3819 movdqu 0x30($inp),$inout3 3820 movdqa $inout2,$in2 3821 movdqu 0x40($inp),$inout4 3822 movdqa $inout3,$in3 3823 movdqu 0x50($inp),$inout5 3824 movdqa $inout4,$in4 3825 mov OPENSSL_ia32cap_P+4(%rip),%r9d 3826 cmp \$0x70,$len 3827 jbe .Lcbc_dec_six_or_seven 3828 3829 and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE 3830 sub \$0x50,$len # $len is biased by -5*16 3831 cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE 3832 je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont] 3833 sub \$0x20,$len # $len is biased by -7*16 3834 lea 0x70($key),$key # size optimization 3835 jmp .Lcbc_dec_loop8_enter 3836.align 16 3837.Lcbc_dec_loop8: 3838 movups $inout7,($out) 3839 lea 0x10($out),$out 3840.Lcbc_dec_loop8_enter: 3841 movdqu 0x60($inp),$inout6 3842 pxor $rndkey0,$inout0 3843 movdqu 0x70($inp),$inout7 3844 pxor $rndkey0,$inout1 3845 $movkey 0x10-0x70($key),$rndkey1 3846 pxor $rndkey0,$inout2 3847 mov \$-1,$inp_ 
3848 cmp \$0x70,$len # is there at least 0x60 bytes ahead? 3849 pxor $rndkey0,$inout3 3850 pxor $rndkey0,$inout4 3851 pxor $rndkey0,$inout5 3852 pxor $rndkey0,$inout6 3853 3854 aesdec $rndkey1,$inout0 3855 pxor $rndkey0,$inout7 3856 $movkey 0x20-0x70($key),$rndkey0 3857 aesdec $rndkey1,$inout1 3858 aesdec $rndkey1,$inout2 3859 aesdec $rndkey1,$inout3 3860 aesdec $rndkey1,$inout4 3861 aesdec $rndkey1,$inout5 3862 aesdec $rndkey1,$inout6 3863 adc \$0,$inp_ 3864 and \$128,$inp_ 3865 aesdec $rndkey1,$inout7 3866 add $inp,$inp_ 3867 $movkey 0x30-0x70($key),$rndkey1 3868___ 3869for($i=1;$i<12;$i++) { 3870my $rndkeyx = ($i&1)?$rndkey0:$rndkey1; 3871$code.=<<___ if ($i==7); 3872 cmp \$11,$rounds 3873___ 3874$code.=<<___; 3875 aesdec $rndkeyx,$inout0 3876 aesdec $rndkeyx,$inout1 3877 aesdec $rndkeyx,$inout2 3878 aesdec $rndkeyx,$inout3 3879 aesdec $rndkeyx,$inout4 3880 aesdec $rndkeyx,$inout5 3881 aesdec $rndkeyx,$inout6 3882 aesdec $rndkeyx,$inout7 3883 $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx 3884___ 3885$code.=<<___ if ($i<6 || (!($i&1) && $i>7)); 3886 nop 3887___ 3888$code.=<<___ if ($i==7); 3889 jb .Lcbc_dec_done 3890___ 3891$code.=<<___ if ($i==9); 3892 je .Lcbc_dec_done 3893___ 3894$code.=<<___ if ($i==11); 3895 jmp .Lcbc_dec_done 3896___ 3897} 3898$code.=<<___; 3899.align 16 3900.Lcbc_dec_done: 3901 aesdec $rndkey1,$inout0 3902 aesdec $rndkey1,$inout1 3903 pxor $rndkey0,$iv 3904 pxor $rndkey0,$in0 3905 aesdec $rndkey1,$inout2 3906 aesdec $rndkey1,$inout3 3907 pxor $rndkey0,$in1 3908 pxor $rndkey0,$in2 3909 aesdec $rndkey1,$inout4 3910 aesdec $rndkey1,$inout5 3911 pxor $rndkey0,$in3 3912 pxor $rndkey0,$in4 3913 aesdec $rndkey1,$inout6 3914 aesdec $rndkey1,$inout7 3915 movdqu 0x50($inp),$rndkey1 3916 3917 aesdeclast $iv,$inout0 3918 movdqu 0x60($inp),$iv # borrow $iv 3919 pxor $rndkey0,$rndkey1 3920 aesdeclast $in0,$inout1 3921 pxor $rndkey0,$iv 3922 movdqu 0x70($inp),$rndkey0 # next IV 3923 aesdeclast $in1,$inout2 3924 lea 0x80($inp),$inp 3925 movdqu 
0x00($inp_),$in0 3926 aesdeclast $in2,$inout3 3927 aesdeclast $in3,$inout4 3928 movdqu 0x10($inp_),$in1 3929 movdqu 0x20($inp_),$in2 3930 aesdeclast $in4,$inout5 3931 aesdeclast $rndkey1,$inout6 3932 movdqu 0x30($inp_),$in3 3933 movdqu 0x40($inp_),$in4 3934 aesdeclast $iv,$inout7 3935 movdqa $rndkey0,$iv # return $iv 3936 movdqu 0x50($inp_),$rndkey1 3937 $movkey -0x70($key),$rndkey0 3938 3939 movups $inout0,($out) # store output 3940 movdqa $in0,$inout0 3941 movups $inout1,0x10($out) 3942 movdqa $in1,$inout1 3943 movups $inout2,0x20($out) 3944 movdqa $in2,$inout2 3945 movups $inout3,0x30($out) 3946 movdqa $in3,$inout3 3947 movups $inout4,0x40($out) 3948 movdqa $in4,$inout4 3949 movups $inout5,0x50($out) 3950 movdqa $rndkey1,$inout5 3951 movups $inout6,0x60($out) 3952 lea 0x70($out),$out 3953 3954 sub \$0x80,$len 3955 ja .Lcbc_dec_loop8 3956 3957 movaps $inout7,$inout0 3958 lea -0x70($key),$key 3959 add \$0x70,$len 3960 jle .Lcbc_dec_clear_tail_collected 3961 movups $inout7,($out) 3962 lea 0x10($out),$out 3963 cmp \$0x50,$len 3964 jbe .Lcbc_dec_tail 3965 3966 movaps $in0,$inout0 3967.Lcbc_dec_six_or_seven: 3968 cmp \$0x60,$len 3969 ja .Lcbc_dec_seven 3970 3971 movaps $inout5,$inout6 3972 call _aesni_decrypt6 3973 pxor $iv,$inout0 # ^= IV 3974 movaps $inout6,$iv 3975 pxor $in0,$inout1 3976 movdqu $inout0,($out) 3977 pxor $in1,$inout2 3978 movdqu $inout1,0x10($out) 3979 pxor $inout1,$inout1 # clear register bank 3980 pxor $in2,$inout3 3981 movdqu $inout2,0x20($out) 3982 pxor $inout2,$inout2 3983 pxor $in3,$inout4 3984 movdqu $inout3,0x30($out) 3985 pxor $inout3,$inout3 3986 pxor $in4,$inout5 3987 movdqu $inout4,0x40($out) 3988 pxor $inout4,$inout4 3989 lea 0x50($out),$out 3990 movdqa $inout5,$inout0 3991 pxor $inout5,$inout5 3992 jmp .Lcbc_dec_tail_collected 3993 3994.align 16 3995.Lcbc_dec_seven: 3996 movups 0x60($inp),$inout6 3997 xorps $inout7,$inout7 3998 call _aesni_decrypt8 3999 movups 0x50($inp),$inout7 4000 pxor $iv,$inout0 # ^= IV 4001 movups 0x60($inp),$iv 
4002 pxor $in0,$inout1 4003 movdqu $inout0,($out) 4004 pxor $in1,$inout2 4005 movdqu $inout1,0x10($out) 4006 pxor $inout1,$inout1 # clear register bank 4007 pxor $in2,$inout3 4008 movdqu $inout2,0x20($out) 4009 pxor $inout2,$inout2 4010 pxor $in3,$inout4 4011 movdqu $inout3,0x30($out) 4012 pxor $inout3,$inout3 4013 pxor $in4,$inout5 4014 movdqu $inout4,0x40($out) 4015 pxor $inout4,$inout4 4016 pxor $inout7,$inout6 4017 movdqu $inout5,0x50($out) 4018 pxor $inout5,$inout5 4019 lea 0x60($out),$out 4020 movdqa $inout6,$inout0 4021 pxor $inout6,$inout6 4022 pxor $inout7,$inout7 4023 jmp .Lcbc_dec_tail_collected 4024 4025.align 16 4026.Lcbc_dec_loop6: 4027 movups $inout5,($out) 4028 lea 0x10($out),$out 4029 movdqu 0x00($inp),$inout0 # load input 4030 movdqu 0x10($inp),$inout1 4031 movdqa $inout0,$in0 4032 movdqu 0x20($inp),$inout2 4033 movdqa $inout1,$in1 4034 movdqu 0x30($inp),$inout3 4035 movdqa $inout2,$in2 4036 movdqu 0x40($inp),$inout4 4037 movdqa $inout3,$in3 4038 movdqu 0x50($inp),$inout5 4039 movdqa $inout4,$in4 4040.Lcbc_dec_loop6_enter: 4041 lea 0x60($inp),$inp 4042 movdqa $inout5,$inout6 4043 4044 call _aesni_decrypt6 4045 4046 pxor $iv,$inout0 # ^= IV 4047 movdqa $inout6,$iv 4048 pxor $in0,$inout1 4049 movdqu $inout0,($out) 4050 pxor $in1,$inout2 4051 movdqu $inout1,0x10($out) 4052 pxor $in2,$inout3 4053 movdqu $inout2,0x20($out) 4054 pxor $in3,$inout4 4055 mov $key_,$key 4056 movdqu $inout3,0x30($out) 4057 pxor $in4,$inout5 4058 mov $rnds_,$rounds 4059 movdqu $inout4,0x40($out) 4060 lea 0x50($out),$out 4061 sub \$0x60,$len 4062 ja .Lcbc_dec_loop6 4063 4064 movdqa $inout5,$inout0 4065 add \$0x50,$len 4066 jle .Lcbc_dec_clear_tail_collected 4067 movups $inout5,($out) 4068 lea 0x10($out),$out 4069 4070.Lcbc_dec_tail: 4071 movups ($inp),$inout0 4072 sub \$0x10,$len 4073 jbe .Lcbc_dec_one # $len is 1*16 or less 4074 4075 movups 0x10($inp),$inout1 4076 movaps $inout0,$in0 4077 sub \$0x10,$len 4078 jbe .Lcbc_dec_two # $len is 2*16 or less 4079 4080 movups 
0x20($inp),$inout2 4081 movaps $inout1,$in1 4082 sub \$0x10,$len 4083 jbe .Lcbc_dec_three # $len is 3*16 or less 4084 4085 movups 0x30($inp),$inout3 4086 movaps $inout2,$in2 4087 sub \$0x10,$len 4088 jbe .Lcbc_dec_four # $len is 4*16 or less 4089 4090 movups 0x40($inp),$inout4 # $len is 5*16 or less 4091 movaps $inout3,$in3 4092 movaps $inout4,$in4 4093 xorps $inout5,$inout5 4094 call _aesni_decrypt6 4095 pxor $iv,$inout0 4096 movaps $in4,$iv 4097 pxor $in0,$inout1 4098 movdqu $inout0,($out) 4099 pxor $in1,$inout2 4100 movdqu $inout1,0x10($out) 4101 pxor $inout1,$inout1 # clear register bank 4102 pxor $in2,$inout3 4103 movdqu $inout2,0x20($out) 4104 pxor $inout2,$inout2 4105 pxor $in3,$inout4 4106 movdqu $inout3,0x30($out) 4107 pxor $inout3,$inout3 4108 lea 0x40($out),$out 4109 movdqa $inout4,$inout0 4110 pxor $inout4,$inout4 4111 pxor $inout5,$inout5 4112 sub \$0x10,$len 4113 jmp .Lcbc_dec_tail_collected 4114 4115.align 16 4116.Lcbc_dec_one: 4117 movaps $inout0,$in0 4118___ 4119 &aesni_generate1("dec",$key,$rounds); 4120$code.=<<___; 4121 xorps $iv,$inout0 4122 movaps $in0,$iv 4123 jmp .Lcbc_dec_tail_collected 4124.align 16 4125.Lcbc_dec_two: 4126 movaps $inout1,$in1 4127 call _aesni_decrypt2 4128 pxor $iv,$inout0 4129 movaps $in1,$iv 4130 pxor $in0,$inout1 4131 movdqu $inout0,($out) 4132 movdqa $inout1,$inout0 4133 pxor $inout1,$inout1 # clear register bank 4134 lea 0x10($out),$out 4135 jmp .Lcbc_dec_tail_collected 4136.align 16 4137.Lcbc_dec_three: 4138 movaps $inout2,$in2 4139 call _aesni_decrypt3 4140 pxor $iv,$inout0 4141 movaps $in2,$iv 4142 pxor $in0,$inout1 4143 movdqu $inout0,($out) 4144 pxor $in1,$inout2 4145 movdqu $inout1,0x10($out) 4146 pxor $inout1,$inout1 # clear register bank 4147 movdqa $inout2,$inout0 4148 pxor $inout2,$inout2 4149 lea 0x20($out),$out 4150 jmp .Lcbc_dec_tail_collected 4151.align 16 4152.Lcbc_dec_four: 4153 movaps $inout3,$in3 4154 call _aesni_decrypt4 4155 pxor $iv,$inout0 4156 movaps $in3,$iv 4157 pxor $in0,$inout1 4158 movdqu 
$inout0,($out) 4159 pxor $in1,$inout2 4160 movdqu $inout1,0x10($out) 4161 pxor $inout1,$inout1 # clear register bank 4162 pxor $in2,$inout3 4163 movdqu $inout2,0x20($out) 4164 pxor $inout2,$inout2 4165 movdqa $inout3,$inout0 4166 pxor $inout3,$inout3 4167 lea 0x30($out),$out 4168 jmp .Lcbc_dec_tail_collected 4169 4170.align 16 4171.Lcbc_dec_clear_tail_collected: 4172 pxor $inout1,$inout1 # clear register bank 4173 pxor $inout2,$inout2 4174 pxor $inout3,$inout3 4175___ 4176$code.=<<___ if (!$win64); 4177 pxor $inout4,$inout4 # %xmm6..9 4178 pxor $inout5,$inout5 4179 pxor $inout6,$inout6 4180 pxor $inout7,$inout7 4181___ 4182$code.=<<___; 4183.Lcbc_dec_tail_collected: 4184 movups $iv,($ivp) 4185 and \$15,$len 4186 jnz .Lcbc_dec_tail_partial 4187 movups $inout0,($out) 4188 pxor $inout0,$inout0 4189 jmp .Lcbc_dec_ret 4190.align 16 4191.Lcbc_dec_tail_partial: 4192 movaps $inout0,(%rsp) 4193 pxor $inout0,$inout0 4194 mov \$16,%rcx 4195 mov $out,%rdi 4196 sub $len,%rcx 4197 lea (%rsp),%rsi 4198 .long 0x9066A4F3 # rep movsb 4199 movdqa $inout0,(%rsp) 4200 4201.Lcbc_dec_ret: 4202 xorps $rndkey0,$rndkey0 # %xmm0 4203 pxor $rndkey1,$rndkey1 4204___ 4205$code.=<<___ if ($win64); 4206 movaps 0x10(%rsp),%xmm6 4207 movaps %xmm0,0x10(%rsp) # clear stack 4208 movaps 0x20(%rsp),%xmm7 4209 movaps %xmm0,0x20(%rsp) 4210 movaps 0x30(%rsp),%xmm8 4211 movaps %xmm0,0x30(%rsp) 4212 movaps 0x40(%rsp),%xmm9 4213 movaps %xmm0,0x40(%rsp) 4214 movaps 0x50(%rsp),%xmm10 4215 movaps %xmm0,0x50(%rsp) 4216 movaps 0x60(%rsp),%xmm11 4217 movaps %xmm0,0x60(%rsp) 4218 movaps 0x70(%rsp),%xmm12 4219 movaps %xmm0,0x70(%rsp) 4220 movaps 0x80(%rsp),%xmm13 4221 movaps %xmm0,0x80(%rsp) 4222 movaps 0x90(%rsp),%xmm14 4223 movaps %xmm0,0x90(%rsp) 4224 movaps 0xa0(%rsp),%xmm15 4225 movaps %xmm0,0xa0(%rsp) 4226___ 4227$code.=<<___; 4228 mov -8(%r11),%rbp 4229.cfi_restore %rbp 4230 lea (%r11),%rsp 4231.cfi_def_cfa_register %rsp 4232.Lcbc_ret: 4233 ret 4234.cfi_endproc 4235.size 
${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
___
}
# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
#				int bits, AES_KEY *key)
#
# input:	$inp	user-supplied key
#		$bits	$inp length in bits
#		$key	pointer to key schedule
# output:	%eax	0 denoting success, -1 or -2 - failure (see C)
#		*$key	key schedule
#
# The encryption schedule is first built by __aesni_set_encrypt_key and
# then converted in place: round keys are exchanged end-for-end, and
# every key except the outermost pair is also passed through aesimc.
# A non-zero result from the key setup is returned unchanged.
#
{ my ($inp,$bits,$key) = @_4args;
  $bits =~ s/%r/%e/;

$code.=<<___;
.globl	${PREFIX}_set_decrypt_key
.type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
.align	16
${PREFIX}_set_decrypt_key:
.cfi_startproc
	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
.cfi_adjust_cfa_offset	8
	call	__aesni_set_encrypt_key
	shl	\$4,$bits		# rounds-1 after __aesni_set_encrypt_key, scaled to bytes
	test	%eax,%eax
	jnz	.Ldec_key_ret		# key setup failed, nothing to convert
	lea	16($key,$bits),$inp	# points at the end of key schedule

	$movkey	($key),%xmm0		# just swap
	$movkey	($inp),%xmm1
	$movkey	%xmm0,($inp)
	$movkey	%xmm1,($key)
	lea	16($key),$key
	lea	-16($inp),$inp

.Ldec_key_inverse:
	$movkey	($key),%xmm0		# swap and inverse
	$movkey	($inp),%xmm1
	aesimc	%xmm0,%xmm0
	aesimc	%xmm1,%xmm1
	lea	16($key),$key
	lea	-16($inp),$inp
	$movkey	%xmm0,16($inp)
	$movkey	%xmm1,-16($key)
	cmp	$key,$inp
	ja	.Ldec_key_inverse	# until both pointers reach the middle key

	$movkey	($key),%xmm0		# inverse middle
	aesimc	%xmm0,%xmm0
	pxor	%xmm1,%xmm1		# clear registers that held round keys
	$movkey	%xmm0,($inp)
	pxor	%xmm0,%xmm0
.Ldec_key_ret:
	add	\$8,%rsp
.cfi_adjust_cfa_offset	-8
	ret
.cfi_endproc
.LSEH_end_set_decrypt_key:
.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
___

# This is based on submission from Intel by
#	Huang Ying
#	Vinodh Gopal
#	Kahraman Akdemir
#
# Aggressively optimized in respect to aeskeygenassist's critical path
# and is contained in %xmm0-5 to meet Win64 ABI requirement.
4304# 4305# int ${PREFIX}_set_encrypt_key(const unsigned char *inp, 4306# int bits, AES_KEY * const key); 4307# 4308# input: $inp user-supplied key 4309# $bits $inp length in bits 4310# $key pointer to key schedule 4311# output: %eax 0 denoting success, -1 or -2 - failure (see C) 4312# $bits rounds-1 (used in aesni_set_decrypt_key) 4313# *$key key schedule 4314# $key pointer to key schedule (used in 4315# aesni_set_decrypt_key) 4316# 4317# Subroutine is frame-less, which means that only volatile registers 4318# are used. Note that it's declared "abi-omnipotent", which means that 4319# amount of volatile registers is smaller on Windows. 4320# 4321$code.=<<___; 4322.globl ${PREFIX}_set_encrypt_key 4323.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent 4324.align 16 4325${PREFIX}_set_encrypt_key: 4326__aesni_set_encrypt_key: 4327.cfi_startproc 4328 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 4329.cfi_adjust_cfa_offset 8 4330 mov \$-1,%rax 4331 test $inp,$inp 4332 jz .Lenc_key_ret 4333 test $key,$key 4334 jz .Lenc_key_ret 4335 4336 mov \$`1<<28|1<<11`,%r10d # AVX and XOP bits 4337 movups ($inp),%xmm0 # pull first 128 bits of *userKey 4338 xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0 4339 and OPENSSL_ia32cap_P+4(%rip),%r10d 4340 lea 16($key),%rax # %rax is used as modifiable copy of $key 4341 cmp \$256,$bits 4342 je .L14rounds 4343 cmp \$192,$bits 4344 je .L12rounds 4345 cmp \$128,$bits 4346 jne .Lbad_keybits 4347 4348.L10rounds: 4349 mov \$9,$bits # 10 rounds for 128-bit key 4350 cmp \$`1<<28`,%r10d # AVX, but no XOP 4351 je .L10rounds_alt 4352 4353 $movkey %xmm0,($key) # round 0 4354 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1 4355 call .Lkey_expansion_128_cold 4356 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2 4357 call .Lkey_expansion_128 4358 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3 4359 call .Lkey_expansion_128 4360 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4 4361 call .Lkey_expansion_128 4362 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5 4363 call 
.Lkey_expansion_128 4364 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6 4365 call .Lkey_expansion_128 4366 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7 4367 call .Lkey_expansion_128 4368 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8 4369 call .Lkey_expansion_128 4370 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9 4371 call .Lkey_expansion_128 4372 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10 4373 call .Lkey_expansion_128 4374 $movkey %xmm0,(%rax) 4375 mov $bits,80(%rax) # 240(%rdx) 4376 xor %eax,%eax 4377 jmp .Lenc_key_ret 4378 4379.align 16 4380.L10rounds_alt: 4381 movdqa .Lkey_rotate(%rip),%xmm5 4382 mov \$8,%r10d 4383 movdqa .Lkey_rcon1(%rip),%xmm4 4384 movdqa %xmm0,%xmm2 4385 movdqu %xmm0,($key) 4386 jmp .Loop_key128 4387 4388.align 16 4389.Loop_key128: 4390 pshufb %xmm5,%xmm0 4391 aesenclast %xmm4,%xmm0 4392 pslld \$1,%xmm4 4393 lea 16(%rax),%rax 4394 4395 movdqa %xmm2,%xmm3 4396 pslldq \$4,%xmm2 4397 pxor %xmm2,%xmm3 4398 pslldq \$4,%xmm2 4399 pxor %xmm2,%xmm3 4400 pslldq \$4,%xmm2 4401 pxor %xmm3,%xmm2 4402 4403 pxor %xmm2,%xmm0 4404 movdqu %xmm0,-16(%rax) 4405 movdqa %xmm0,%xmm2 4406 4407 dec %r10d 4408 jnz .Loop_key128 4409 4410 movdqa .Lkey_rcon1b(%rip),%xmm4 4411 4412 pshufb %xmm5,%xmm0 4413 aesenclast %xmm4,%xmm0 4414 pslld \$1,%xmm4 4415 4416 movdqa %xmm2,%xmm3 4417 pslldq \$4,%xmm2 4418 pxor %xmm2,%xmm3 4419 pslldq \$4,%xmm2 4420 pxor %xmm2,%xmm3 4421 pslldq \$4,%xmm2 4422 pxor %xmm3,%xmm2 4423 4424 pxor %xmm2,%xmm0 4425 movdqu %xmm0,(%rax) 4426 4427 movdqa %xmm0,%xmm2 4428 pshufb %xmm5,%xmm0 4429 aesenclast %xmm4,%xmm0 4430 4431 movdqa %xmm2,%xmm3 4432 pslldq \$4,%xmm2 4433 pxor %xmm2,%xmm3 4434 pslldq \$4,%xmm2 4435 pxor %xmm2,%xmm3 4436 pslldq \$4,%xmm2 4437 pxor %xmm3,%xmm2 4438 4439 pxor %xmm2,%xmm0 4440 movdqu %xmm0,16(%rax) 4441 4442 mov $bits,96(%rax) # 240($key) 4443 xor %eax,%eax 4444 jmp .Lenc_key_ret 4445 4446.align 16 4447.L12rounds: 4448 movq 16($inp),%xmm2 # remaining 1/3 of *userKey 4449 mov \$11,$bits # 12 rounds for 192 4450 cmp 
\$`1<<28`,%r10d # AVX, but no XOP 4451 je .L12rounds_alt 4452 4453 $movkey %xmm0,($key) # round 0 4454 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2 4455 call .Lkey_expansion_192a_cold 4456 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3 4457 call .Lkey_expansion_192b 4458 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5 4459 call .Lkey_expansion_192a 4460 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6 4461 call .Lkey_expansion_192b 4462 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8 4463 call .Lkey_expansion_192a 4464 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9 4465 call .Lkey_expansion_192b 4466 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11 4467 call .Lkey_expansion_192a 4468 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12 4469 call .Lkey_expansion_192b 4470 $movkey %xmm0,(%rax) 4471 mov $bits,48(%rax) # 240(%rdx) 4472 xor %rax, %rax 4473 jmp .Lenc_key_ret 4474 4475.align 16 4476.L12rounds_alt: 4477 movdqa .Lkey_rotate192(%rip),%xmm5 4478 movdqa .Lkey_rcon1(%rip),%xmm4 4479 mov \$8,%r10d 4480 movdqu %xmm0,($key) 4481 jmp .Loop_key192 4482 4483.align 16 4484.Loop_key192: 4485 movq %xmm2,0(%rax) 4486 movdqa %xmm2,%xmm1 4487 pshufb %xmm5,%xmm2 4488 aesenclast %xmm4,%xmm2 4489 pslld \$1, %xmm4 4490 lea 24(%rax),%rax 4491 4492 movdqa %xmm0,%xmm3 4493 pslldq \$4,%xmm0 4494 pxor %xmm0,%xmm3 4495 pslldq \$4,%xmm0 4496 pxor %xmm0,%xmm3 4497 pslldq \$4,%xmm0 4498 pxor %xmm3,%xmm0 4499 4500 pshufd \$0xff,%xmm0,%xmm3 4501 pxor %xmm1,%xmm3 4502 pslldq \$4,%xmm1 4503 pxor %xmm1,%xmm3 4504 4505 pxor %xmm2,%xmm0 4506 pxor %xmm3,%xmm2 4507 movdqu %xmm0,-16(%rax) 4508 4509 dec %r10d 4510 jnz .Loop_key192 4511 4512 mov $bits,32(%rax) # 240($key) 4513 xor %eax,%eax 4514 jmp .Lenc_key_ret 4515 4516.align 16 4517.L14rounds: 4518 movups 16($inp),%xmm2 # remaining half of *userKey 4519 mov \$13,$bits # 14 rounds for 256 4520 lea 16(%rax),%rax 4521 cmp \$`1<<28`,%r10d # AVX, but no XOP 4522 je .L14rounds_alt 4523 4524 $movkey %xmm0,($key) # round 0 4525 $movkey %xmm2,16($key) # round 1 
4526 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2 4527 call .Lkey_expansion_256a_cold 4528 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3 4529 call .Lkey_expansion_256b 4530 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4 4531 call .Lkey_expansion_256a 4532 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5 4533 call .Lkey_expansion_256b 4534 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6 4535 call .Lkey_expansion_256a 4536 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7 4537 call .Lkey_expansion_256b 4538 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8 4539 call .Lkey_expansion_256a 4540 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9 4541 call .Lkey_expansion_256b 4542 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10 4543 call .Lkey_expansion_256a 4544 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11 4545 call .Lkey_expansion_256b 4546 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12 4547 call .Lkey_expansion_256a 4548 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13 4549 call .Lkey_expansion_256b 4550 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14 4551 call .Lkey_expansion_256a 4552 $movkey %xmm0,(%rax) 4553 mov $bits,16(%rax) # 240(%rdx) 4554 xor %rax,%rax 4555 jmp .Lenc_key_ret 4556 4557.align 16 4558.L14rounds_alt: 4559 movdqa .Lkey_rotate(%rip),%xmm5 4560 movdqa .Lkey_rcon1(%rip),%xmm4 4561 mov \$7,%r10d 4562 movdqu %xmm0,0($key) 4563 movdqa %xmm2,%xmm1 4564 movdqu %xmm2,16($key) 4565 jmp .Loop_key256 4566 4567.align 16 4568.Loop_key256: 4569 pshufb %xmm5,%xmm2 4570 aesenclast %xmm4,%xmm2 4571 4572 movdqa %xmm0,%xmm3 4573 pslldq \$4,%xmm0 4574 pxor %xmm0,%xmm3 4575 pslldq \$4,%xmm0 4576 pxor %xmm0,%xmm3 4577 pslldq \$4,%xmm0 4578 pxor %xmm3,%xmm0 4579 pslld \$1,%xmm4 4580 4581 pxor %xmm2,%xmm0 4582 movdqu %xmm0,(%rax) 4583 4584 dec %r10d 4585 jz .Ldone_key256 4586 4587 pshufd \$0xff,%xmm0,%xmm2 4588 pxor %xmm3,%xmm3 4589 aesenclast %xmm3,%xmm2 4590 4591 movdqa %xmm1,%xmm3 4592 pslldq \$4,%xmm1 4593 pxor %xmm1,%xmm3 4594 pslldq \$4,%xmm1 4595 pxor %xmm1,%xmm3 4596 pslldq \$4,%xmm1 4597 pxor 
%xmm3,%xmm1 4598 4599 pxor %xmm1,%xmm2 4600 movdqu %xmm2,16(%rax) 4601 lea 32(%rax),%rax 4602 movdqa %xmm2,%xmm1 4603 4604 jmp .Loop_key256 4605 4606.Ldone_key256: 4607 mov $bits,16(%rax) # 240($key) 4608 xor %eax,%eax 4609 jmp .Lenc_key_ret 4610 4611.align 16 4612.Lbad_keybits: 4613 mov \$-2,%rax 4614.Lenc_key_ret: 4615 pxor %xmm0,%xmm0 4616 pxor %xmm1,%xmm1 4617 pxor %xmm2,%xmm2 4618 pxor %xmm3,%xmm3 4619 pxor %xmm4,%xmm4 4620 pxor %xmm5,%xmm5 4621 add \$8,%rsp 4622.cfi_adjust_cfa_offset -8 4623 ret 4624.cfi_endproc 4625.LSEH_end_set_encrypt_key: 4626 4627.align 16 4628.Lkey_expansion_128: 4629 $movkey %xmm0,(%rax) 4630 lea 16(%rax),%rax 4631.Lkey_expansion_128_cold: 4632 shufps \$0b00010000,%xmm0,%xmm4 4633 xorps %xmm4, %xmm0 4634 shufps \$0b10001100,%xmm0,%xmm4 4635 xorps %xmm4, %xmm0 4636 shufps \$0b11111111,%xmm1,%xmm1 # critical path 4637 xorps %xmm1,%xmm0 4638 ret 4639 4640.align 16 4641.Lkey_expansion_192a: 4642 $movkey %xmm0,(%rax) 4643 lea 16(%rax),%rax 4644.Lkey_expansion_192a_cold: 4645 movaps %xmm2, %xmm5 4646.Lkey_expansion_192b_warm: 4647 shufps \$0b00010000,%xmm0,%xmm4 4648 movdqa %xmm2,%xmm3 4649 xorps %xmm4,%xmm0 4650 shufps \$0b10001100,%xmm0,%xmm4 4651 pslldq \$4,%xmm3 4652 xorps %xmm4,%xmm0 4653 pshufd \$0b01010101,%xmm1,%xmm1 # critical path 4654 pxor %xmm3,%xmm2 4655 pxor %xmm1,%xmm0 4656 pshufd \$0b11111111,%xmm0,%xmm3 4657 pxor %xmm3,%xmm2 4658 ret 4659 4660.align 16 4661.Lkey_expansion_192b: 4662 movaps %xmm0,%xmm3 4663 shufps \$0b01000100,%xmm0,%xmm5 4664 $movkey %xmm5,(%rax) 4665 shufps \$0b01001110,%xmm2,%xmm3 4666 $movkey %xmm3,16(%rax) 4667 lea 32(%rax),%rax 4668 jmp .Lkey_expansion_192b_warm 4669 4670.align 16 4671.Lkey_expansion_256a: 4672 $movkey %xmm2,(%rax) 4673 lea 16(%rax),%rax 4674.Lkey_expansion_256a_cold: 4675 shufps \$0b00010000,%xmm0,%xmm4 4676 xorps %xmm4,%xmm0 4677 shufps \$0b10001100,%xmm0,%xmm4 4678 xorps %xmm4,%xmm0 4679 shufps \$0b11111111,%xmm1,%xmm1 # critical path 4680 xorps %xmm1,%xmm0 4681 ret 4682 4683.align 
16 4684.Lkey_expansion_256b: 4685 $movkey %xmm0,(%rax) 4686 lea 16(%rax),%rax 4687 4688 shufps \$0b00010000,%xmm2,%xmm4 4689 xorps %xmm4,%xmm2 4690 shufps \$0b10001100,%xmm2,%xmm4 4691 xorps %xmm4,%xmm2 4692 shufps \$0b10101010,%xmm1,%xmm1 # critical path 4693 xorps %xmm1,%xmm2 4694 ret 4695.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key 4696.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key 4697___ 4698} 4699 4700$code.=<<___; 4701.align 64 4702.Lbswap_mask: 4703 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 4704.Lincrement32: 4705 .long 6,6,6,0 4706.Lincrement64: 4707 .long 1,0,0,0 4708.Lxts_magic: 4709 .long 0x87,0,1,0 4710.Lincrement1: 4711 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 4712.Lkey_rotate: 4713 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d 4714.Lkey_rotate192: 4715 .long 0x04070605,0x04070605,0x04070605,0x04070605 4716.Lkey_rcon1: 4717 .long 1,1,1,1 4718.Lkey_rcon1b: 4719 .long 0x1b,0x1b,0x1b,0x1b 4720 4721.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>" 4722.align 64 4723___ 4724 4725# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 4726# CONTEXT *context,DISPATCHER_CONTEXT *disp) 4727if ($win64) { 4728$rec="%rcx"; 4729$frame="%rdx"; 4730$context="%r8"; 4731$disp="%r9"; 4732 4733$code.=<<___; 4734.extern __imp_RtlVirtualUnwind 4735___ 4736$code.=<<___ if ($PREFIX eq "aesni"); 4737.type ecb_ccm64_se_handler,\@abi-omnipotent 4738.align 16 4739ecb_ccm64_se_handler: 4740 push %rsi 4741 push %rdi 4742 push %rbx 4743 push %rbp 4744 push %r12 4745 push %r13 4746 push %r14 4747 push %r15 4748 pushfq 4749 sub \$64,%rsp 4750 4751 mov 120($context),%rax # pull context->Rax 4752 mov 248($context),%rbx # pull context->Rip 4753 4754 mov 8($disp),%rsi # disp->ImageBase 4755 mov 56($disp),%r11 # disp->HandlerData 4756 4757 mov 0(%r11),%r10d # HandlerData[0] 4758 lea (%rsi,%r10),%r10 # prologue label 4759 cmp %r10,%rbx # context->Rip<prologue label 4760 jb .Lcommon_seh_tail 4761 4762 mov 152($context),%rax # pull 
context->Rsp 4763 4764 mov 4(%r11),%r10d # HandlerData[1] 4765 lea (%rsi,%r10),%r10 # epilogue label 4766 cmp %r10,%rbx # context->Rip>=epilogue label 4767 jae .Lcommon_seh_tail 4768 4769 lea 0(%rax),%rsi # %xmm save area 4770 lea 512($context),%rdi # &context.Xmm6 4771 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) 4772 .long 0xa548f3fc # cld; rep movsq 4773 lea 0x58(%rax),%rax # adjust stack pointer 4774 4775 jmp .Lcommon_seh_tail 4776.size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler 4777 4778.type ctr_xts_se_handler,\@abi-omnipotent 4779.align 16 4780ctr_xts_se_handler: 4781 push %rsi 4782 push %rdi 4783 push %rbx 4784 push %rbp 4785 push %r12 4786 push %r13 4787 push %r14 4788 push %r15 4789 pushfq 4790 sub \$64,%rsp 4791 4792 mov 120($context),%rax # pull context->Rax 4793 mov 248($context),%rbx # pull context->Rip 4794 4795 mov 8($disp),%rsi # disp->ImageBase 4796 mov 56($disp),%r11 # disp->HandlerData 4797 4798 mov 0(%r11),%r10d # HandlerData[0] 4799 lea (%rsi,%r10),%r10 # prologue lable 4800 cmp %r10,%rbx # context->Rip<prologue label 4801 jb .Lcommon_seh_tail 4802 4803 mov 152($context),%rax # pull context->Rsp 4804 4805 mov 4(%r11),%r10d # HandlerData[1] 4806 lea (%rsi,%r10),%r10 # epilogue label 4807 cmp %r10,%rbx # context->Rip>=epilogue label 4808 jae .Lcommon_seh_tail 4809 4810 mov 208($context),%rax # pull context->R11 4811 4812 lea -0xa8(%rax),%rsi # %xmm save area 4813 lea 512($context),%rdi # & context.Xmm6 4814 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4815 .long 0xa548f3fc # cld; rep movsq 4816 4817 mov -8(%rax),%rbp # restore saved %rbp 4818 mov %rbp,160($context) # restore context->Rbp 4819 jmp .Lcommon_seh_tail 4820.size ctr_xts_se_handler,.-ctr_xts_se_handler 4821 4822.type ocb_se_handler,\@abi-omnipotent 4823.align 16 4824ocb_se_handler: 4825 push %rsi 4826 push %rdi 4827 push %rbx 4828 push %rbp 4829 push %r12 4830 push %r13 4831 push %r14 4832 push %r15 4833 pushfq 4834 sub \$64,%rsp 4835 4836 mov 120($context),%rax # pull context->Rax 
4837 mov 248($context),%rbx # pull context->Rip 4838 4839 mov 8($disp),%rsi # disp->ImageBase 4840 mov 56($disp),%r11 # disp->HandlerData 4841 4842 mov 0(%r11),%r10d # HandlerData[0] 4843 lea (%rsi,%r10),%r10 # prologue lable 4844 cmp %r10,%rbx # context->Rip<prologue label 4845 jb .Lcommon_seh_tail 4846 4847 mov 4(%r11),%r10d # HandlerData[1] 4848 lea (%rsi,%r10),%r10 # epilogue label 4849 cmp %r10,%rbx # context->Rip>=epilogue label 4850 jae .Lcommon_seh_tail 4851 4852 mov 8(%r11),%r10d # HandlerData[2] 4853 lea (%rsi,%r10),%r10 4854 cmp %r10,%rbx # context->Rip>=pop label 4855 jae .Locb_no_xmm 4856 4857 mov 152($context),%rax # pull context->Rsp 4858 4859 lea (%rax),%rsi # %xmm save area 4860 lea 512($context),%rdi # & context.Xmm6 4861 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4862 .long 0xa548f3fc # cld; rep movsq 4863 lea 0xa0+0x28(%rax),%rax 4864 4865.Locb_no_xmm: 4866 mov -8(%rax),%rbx 4867 mov -16(%rax),%rbp 4868 mov -24(%rax),%r12 4869 mov -32(%rax),%r13 4870 mov -40(%rax),%r14 4871 4872 mov %rbx,144($context) # restore context->Rbx 4873 mov %rbp,160($context) # restore context->Rbp 4874 mov %r12,216($context) # restore context->R12 4875 mov %r13,224($context) # restore context->R13 4876 mov %r14,232($context) # restore context->R14 4877 4878 jmp .Lcommon_seh_tail 4879.size ocb_se_handler,.-ocb_se_handler 4880___ 4881$code.=<<___; 4882.type cbc_se_handler,\@abi-omnipotent 4883.align 16 4884cbc_se_handler: 4885 push %rsi 4886 push %rdi 4887 push %rbx 4888 push %rbp 4889 push %r12 4890 push %r13 4891 push %r14 4892 push %r15 4893 pushfq 4894 sub \$64,%rsp 4895 4896 mov 152($context),%rax # pull context->Rsp 4897 mov 248($context),%rbx # pull context->Rip 4898 4899 lea .Lcbc_decrypt_bulk(%rip),%r10 4900 cmp %r10,%rbx # context->Rip<"prologue" label 4901 jb .Lcommon_seh_tail 4902 4903 mov 120($context),%rax # pull context->Rax 4904 4905 lea .Lcbc_decrypt_body(%rip),%r10 4906 cmp %r10,%rbx # context->Rip<cbc_decrypt_body 4907 jb .Lcommon_seh_tail 4908 4909 
mov 152($context),%rax # pull context->Rsp 4910 4911 lea .Lcbc_ret(%rip),%r10 4912 cmp %r10,%rbx # context->Rip>="epilogue" label 4913 jae .Lcommon_seh_tail 4914 4915 lea 16(%rax),%rsi # %xmm save area 4916 lea 512($context),%rdi # &context.Xmm6 4917 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4918 .long 0xa548f3fc # cld; rep movsq 4919 4920 mov 208($context),%rax # pull context->R11 4921 4922 mov -8(%rax),%rbp # restore saved %rbp 4923 mov %rbp,160($context) # restore context->Rbp 4924 4925.Lcommon_seh_tail: 4926 mov 8(%rax),%rdi 4927 mov 16(%rax),%rsi 4928 mov %rax,152($context) # restore context->Rsp 4929 mov %rsi,168($context) # restore context->Rsi 4930 mov %rdi,176($context) # restore context->Rdi 4931 4932 mov 40($disp),%rdi # disp->ContextRecord 4933 mov $context,%rsi # context 4934 mov \$154,%ecx # sizeof(CONTEXT) 4935 .long 0xa548f3fc # cld; rep movsq 4936 4937 mov $disp,%rsi 4938 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 4939 mov 8(%rsi),%rdx # arg2, disp->ImageBase 4940 mov 0(%rsi),%r8 # arg3, disp->ControlPc 4941 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 4942 mov 40(%rsi),%r10 # disp->ContextRecord 4943 lea 56(%rsi),%r11 # &disp->HandlerData 4944 lea 24(%rsi),%r12 # &disp->EstablisherFrame 4945 mov %r10,32(%rsp) # arg5 4946 mov %r11,40(%rsp) # arg6 4947 mov %r12,48(%rsp) # arg7 4948 mov %rcx,56(%rsp) # arg8, (NULL) 4949 call *__imp_RtlVirtualUnwind(%rip) 4950 4951 mov \$1,%eax # ExceptionContinueSearch 4952 add \$64,%rsp 4953 popfq 4954 pop %r15 4955 pop %r14 4956 pop %r13 4957 pop %r12 4958 pop %rbp 4959 pop %rbx 4960 pop %rdi 4961 pop %rsi 4962 ret 4963.size cbc_se_handler,.-cbc_se_handler 4964 4965.section .pdata 4966.align 4 4967___ 4968$code.=<<___ if ($PREFIX eq "aesni"); 4969 .rva .LSEH_begin_aesni_ecb_encrypt 4970 .rva .LSEH_end_aesni_ecb_encrypt 4971 .rva .LSEH_info_ecb 4972 4973 .rva .LSEH_begin_aesni_ccm64_encrypt_blocks 4974 .rva .LSEH_end_aesni_ccm64_encrypt_blocks 4975 .rva .LSEH_info_ccm64_enc 4976 4977 .rva 
.LSEH_begin_aesni_ccm64_decrypt_blocks 4978 .rva .LSEH_end_aesni_ccm64_decrypt_blocks 4979 .rva .LSEH_info_ccm64_dec 4980 4981 .rva .LSEH_begin_aesni_ctr32_encrypt_blocks 4982 .rva .LSEH_end_aesni_ctr32_encrypt_blocks 4983 .rva .LSEH_info_ctr32 4984 4985 .rva .LSEH_begin_aesni_xts_encrypt 4986 .rva .LSEH_end_aesni_xts_encrypt 4987 .rva .LSEH_info_xts_enc 4988 4989 .rva .LSEH_begin_aesni_xts_decrypt 4990 .rva .LSEH_end_aesni_xts_decrypt 4991 .rva .LSEH_info_xts_dec 4992 4993 .rva .LSEH_begin_aesni_ocb_encrypt 4994 .rva .LSEH_end_aesni_ocb_encrypt 4995 .rva .LSEH_info_ocb_enc 4996 4997 .rva .LSEH_begin_aesni_ocb_decrypt 4998 .rva .LSEH_end_aesni_ocb_decrypt 4999 .rva .LSEH_info_ocb_dec 5000___ 5001$code.=<<___; 5002 .rva .LSEH_begin_${PREFIX}_cbc_encrypt 5003 .rva .LSEH_end_${PREFIX}_cbc_encrypt 5004 .rva .LSEH_info_cbc 5005 5006 .rva ${PREFIX}_set_decrypt_key 5007 .rva .LSEH_end_set_decrypt_key 5008 .rva .LSEH_info_key 5009 5010 .rva ${PREFIX}_set_encrypt_key 5011 .rva .LSEH_end_set_encrypt_key 5012 .rva .LSEH_info_key 5013.section .xdata 5014.align 8 5015___ 5016$code.=<<___ if ($PREFIX eq "aesni"); 5017.LSEH_info_ecb: 5018 .byte 9,0,0,0 5019 .rva ecb_ccm64_se_handler 5020 .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[] 5021.LSEH_info_ccm64_enc: 5022 .byte 9,0,0,0 5023 .rva ecb_ccm64_se_handler 5024 .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[] 5025.LSEH_info_ccm64_dec: 5026 .byte 9,0,0,0 5027 .rva ecb_ccm64_se_handler 5028 .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[] 5029.LSEH_info_ctr32: 5030 .byte 9,0,0,0 5031 .rva ctr_xts_se_handler 5032 .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[] 5033.LSEH_info_xts_enc: 5034 .byte 9,0,0,0 5035 .rva ctr_xts_se_handler 5036 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] 5037.LSEH_info_xts_dec: 5038 .byte 9,0,0,0 5039 .rva ctr_xts_se_handler 5040 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] 5041.LSEH_info_ocb_enc: 5042 .byte 9,0,0,0 5043 .rva ocb_se_handler 5044 .rva 
.Locb_enc_body,.Locb_enc_epilogue # HandlerData[]
	.rva	.Locb_enc_pop
	.long	0
.LSEH_info_ocb_dec:
	.byte	9,0,0,0
	.rva	ocb_se_handler
	.rva	.Locb_dec_body,.Locb_dec_epilogue	# HandlerData[]
	.rva	.Locb_dec_pop
	.long	0
___
$code.=<<___;
.LSEH_info_cbc:
	.byte	9,0,0,0
	.rva	cbc_se_handler
.LSEH_info_key:
	.byte	0x01,0x04,0x01,0x00
	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
___
}

# rex(\@opcode, $dst, $src)
#
# Appends a REX prefix byte to the opcode list referenced by the first
# argument when either register number is 8 or above: bit 0x04 is set
# for $dst, bit 0x01 for $src, and the result is or-ed with 0x40.
# Nothing is appended when both register numbers are below 8.
sub rex {
    my ($opcode,$dst,$src)=@_;
    my $rex=0;

    $rex|=0x04 if($dst>=8);
    $rex|=0x01 if($src>=8);
    push @$opcode,$rex|0x40 if($rex);
}

# aesni($line)
#
# Hand-assembles one AES-NI instruction into a ".byte" directive so the
# generated file does not depend on the assembler knowing the mnemonics.
# Three operand shapes are recognised:
#
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
#   aes{enc,enclast,dec,declast} off(%rsp),%xmmD
#
# Every encoding starts with the 0x66 prefix; the bytes are joined with
# commas and therefore printed in decimal.
#
# Returns the ".byte" line on success, the input unchanged when no shape
# matches, and undef when the shape matches but the mnemonic is not in
# the opcode table (the caller's substitution then replaces the
# instruction with an empty string).
sub aesni {
    my $line=shift;
    my @opcode=(0x66);

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($imm,$src,$dst)=($2,$3,$4);

	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M, register-direct
	# oct() understands both "0x.." and leading-zero octal immediates.
	push @opcode,$imm=~/^0/?oct($imm):$imm;
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$src,$dst)=($1,$2,$3);
	my %opcodelet = (
	    "aesimc" => 0xdb,
	    "aesenc" => 0xdc,	"aesenclast" => 0xdd,
	    "aesdec" => 0xde,	"aesdeclast" => 0xdf
	);

	return undef if (!defined($opcodelet{$mnemonic}));
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M, register-direct
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$off,$dst)=($1,$2,$3);
	my %opcodelet = (
	    "aesenc" => 0xdc,	"aesenclast" => 0xdd,
	    "aesdec" => 0xde,	"aesdeclast" => 0xdf
	);

	return undef if (!defined($opcodelet{$mnemonic}));
	# The memory operand is always %rsp-based, so only the
	# destination register can require a REX prefix here.
	push @opcode,0x44 if ($dst>=8);
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0x44|(($dst&7)<<3),0x24;	# ModR/M + SIB for disp8(%rsp)
	# Only the low 8 bits of the offset are emitted as displacement.
	push @opcode,($off=~/^0/?oct($off):$off)&0xff;
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}

# movbe($disp)
#
# Returns the ".byte" encoding for "movbe %eax,$disp(%rsp)"; $disp is
# appended verbatim as the final displacement byte.
sub movbe {
    ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift;
}

# Post-process the accumulated assembly text:
#  1. evaluate Perl expressions quoted in backticks (e.g. `1<<28`);
#  2. replace AES-NI instructions with their ".byte" encodings (text
#     after the last %xmm operand on the line, i.e. comments, is dropped);
#  3. replace "movbe %eax,N(%rsp)" with its ".byte" encoding.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;

print $code;

# Buffered write errors (full disk, broken pipe to the translator) are
# only reported when the handle is closed, so fail loudly here rather
# than exiting successfully with truncated output.
close STDOUT or die "error closing STDOUT: $!";