1#! /usr/bin/env perl 2# Copyright 2009-2020 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# This module implements support for Intel AES-NI extension. In 18# OpenSSL context it's used with Intel engine, but can also be used as 19# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for 20# details]. 21# 22# Performance. 23# 24# Given aes(enc|dec) instructions' latency asymptotic performance for 25# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte 26# processed with 128-bit key. And given their throughput asymptotic 27# performance for parallelizable modes is 1.25 cycles per byte. Being 28# asymptotic limit it's not something you commonly achieve in reality, 29# but how close does one get? Below are results collected for 30# different modes and block sizes. Pairs of numbers are for en-/ 31# decryption. 32# 33# 16-byte 64-byte 256-byte 1-KB 8-KB 34# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26 35# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26 36# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28 37# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07 38# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38 39# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55 40# 41# ECB, CTR, CBC and CCM results are free from EVP overhead. 
This means 42# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni 43# [-decrypt]' will exhibit 10-15% worse results for smaller blocks. 44# The results were collected with specially crafted speed.c benchmark 45# in order to compare them with results reported in "Intel Advanced 46# Encryption Standard (AES) New Instruction Set" White Paper Revision 47# 3.0 dated May 2010. All above results are consistently better. This 48# module also provides better performance for block sizes smaller than 49# 128 bytes in points *not* represented in the above table. 50# 51# Looking at the results for 8-KB buffer. 52# 53# CFB and OFB results are far from the limit, because implementation 54# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on 55# single-block aesni_encrypt, which is not the most optimal way to go. 56# CBC encrypt result is unexpectedly high and there is no documented 57# explanation for it. Seemingly there is a small penalty for feeding 58# the result back to AES unit the way it's done in CBC mode. There is 59# nothing one can do and the result appears optimal. CCM result is 60# identical to CBC, because CBC-MAC is essentially CBC encrypt without 61# saving output. CCM CTR "stays invisible," because it's neatly 62# interleaved with CBC-MAC. This provides ~30% improvement over 63# "straightforward" CCM implementation with CTR and CBC-MAC performed 64# disjointly. Parallelizable modes practically achieve the theoretical 65# limit. 66# 67# Looking at how results vary with buffer size. 68# 69# Curves are practically saturated at 1-KB buffer size. In most cases 70# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one. 71# CTR curve doesn't follow this pattern and is "slowest" changing one 72# with "256-byte" result being 87% of "8-KB." This is because overhead 73# in CTR mode is most computationally intensive. 
Small-block CCM 74# decrypt is slower than encrypt, because first CTR and last CBC-MAC 75# iterations can't be interleaved. 76# 77# Results for 192- and 256-bit keys. 78# 79# EVP-free results were observed to scale perfectly with number of 80# rounds for larger block sizes, i.e. 192-bit result being 10/12 times 81# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences 82# are a tad smaller, because the above mentioned penalty biases all 83# results by same constant value. In similar way function call 84# overhead affects small-block performance, as well as OFB and CFB 85# results. Differences are not large, most common coefficients are 86# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one 87# can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)... 88 89# January 2011 90# 91# While Westmere processor features 6 cycles latency for aes[enc|dec] 92# instructions, which can be scheduled every second cycle, Sandy 93# Bridge spends 8 cycles per instruction, but it can schedule them 94# every cycle. This means that code targeting Westmere would perform 95# suboptimally on Sandy Bridge. Therefore this update. 96# 97# In addition, non-parallelizable CBC encrypt (as well as CCM) is 98# optimized. Relative improvement might appear modest, 8% on Westmere, 99# but in absolute terms it's 3.77 cycles per byte encrypted with 100# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers 101# should be compared to asymptotic limits of 3.75 for Westmere and 102# 5.00 for Sandy Bridge. Actually, the fact that they get this close 103# to asymptotic limits is quite amazing. Indeed, the limit is 104# calculated as latency times number of rounds, 10 for 128-bit key, 105# and divided by 16, the number of bytes in block, or in other words 106# it accounts *solely* for aesenc instructions. 
But there are extra 107# instructions, and numbers so close to the asymptotic limits mean 108# that it's as if it takes as little as *one* additional cycle to 109# execute all of them. How is it possible? It is possible thanks to 110# out-of-order execution logic, which manages to overlap post- 111# processing of previous block, things like saving the output, with 112# actual encryption of current block, as well as pre-processing of 113# current block, things like fetching input and xor-ing it with 114# 0-round element of the key schedule, with actual encryption of 115# previous block. Keep this in mind... 116# 117# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher 118# performance is achieved by interleaving instructions working on 119# independent blocks. In which case asymptotic limit for such modes 120# can be obtained by dividing above mentioned numbers by AES 121# instructions' interleave factor. Westmere can execute at most 3 122# instructions at a time, meaning that optimal interleave factor is 3, 123# and that's where the "magic" number of 1.25 comes from. "Optimal 124# interleave factor" means that increase of interleave factor does 125# not improve performance. The formula has proven to reflect reality 126# pretty well on Westmere... Sandy Bridge on the other hand can 127# execute up to 8 AES instructions at a time, so how does varying 128# interleave factor affect the performance? Here is a table for ECB 129# (numbers are cycles per byte processed with 128-bit key): 130# 131# instruction interleave factor 3x 6x 8x 132# theoretical asymptotic limit 1.67 0.83 0.625 133# measured performance for 8KB block 1.05 0.86 0.84 134# 135# "as if" interleave factor 4.7x 5.8x 6.0x 136# 137# Further data for other parallelizable modes: 138# 139# CBC decrypt 1.16 0.93 0.74 140# CTR 1.14 0.91 0.74 141# 142# Well, given 3x column it's probably inappropriate to call the limit 143# asymptotic, if it can be surpassed, isn't it? What happens there? 
144# Rewind to CBC paragraph for the answer. Yes, out-of-order execution 145# magic is responsible for this. Processor overlaps not only the 146# additional instructions with AES ones, but even AES instructions 147# processing adjacent triplets of independent blocks. In the 6x case 148# additional instructions still claim disproportionally small amount 149# of additional cycles, but in 8x case number of instructions must be 150# a tad too high for out-of-order logic to cope with, and AES unit 151# remains underutilized... As you can see 8x interleave is hardly 152# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl 153# utilizes 6x interleave because of limited register bank capacity. 154# 155# Higher interleave factors do have negative impact on Westmere 156# performance. While for ECB mode it's negligible ~1.5%, other 157# parallelizables perform ~5% worse, which is outweighed by ~25% 158# improvement on Sandy Bridge. To balance regression on Westmere 159# CTR mode was implemented with 6x aesenc interleave factor. 160 161# April 2011 162# 163# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing 164# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like 165# in CTR mode AES instruction interleave factor was chosen to be 6x. 166 167# November 2015 168# 169# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was 170# chosen to be 6x. 171 172###################################################################### 173# Current large-block performance in cycles per byte processed with 174# 128-bit key (less is better). 
#
#		CBC en-/decrypt	CTR	XTS	ECB	OCB
# Westmere	3.77/1.25	1.25	1.25	1.26
# * Bridge	5.07/0.74	0.75	0.90	0.85	0.98
# Haswell	4.44/0.63	0.63	0.73	0.63	0.70
# Skylake	2.62/0.63	0.63	0.63	0.63
# Silvermont	5.75/3.54	3.56	4.12	3.87(*)	4.11
# Knights L	2.54/0.77	0.78	0.85	-	1.50
# Goldmont	3.82/1.26	1.26	1.29	1.29	1.50
# Bulldozer	5.77/0.70	0.72	0.90	0.70	0.95
# Ryzen		2.71/0.35	0.35	0.44	0.38	0.49
#
# (*)	Atom Silvermont ECB result is suboptimal because of penalties
#	incurred by operations on %xmm8-15. As ECB is not considered
#	critical, nothing was done to mitigate the problem.

$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-x86_64.pl:-)

# Command line is "[flavour] output". If the first argument contains a
# dot it is taken to be the output file name and $flavour is left
# undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 is selected either by flavour or by an output name ending in .asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate x86_64-xlate.pl next to this script or under ../../perlasm/.
# $dir is the directory part of $0, trailing separator included.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator. Fail loudly if the pipe cannot be set up, rather than
# silently producing no output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Instruction used to load round keys. Both arms of the conditional
# currently select movups.
$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

$code=".text\n";
$code.=".extern OPENSSL_ia32cap_P\n";

$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
$inp="%rdi";
$out="%rsi";
$len="%rdx";
$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
$ivp="%r8";	# cbc, ctr, ...
$rnds_="%r10d";	# backup copy for $rounds
$key_="%r11";	# backup copy for $key

# %xmm register layout
$rndkey0="%xmm0";	$rndkey1="%xmm1";
$inout0="%xmm2";	$inout1="%xmm3";
$inout2="%xmm4";	$inout3="%xmm5";
$inout4="%xmm6";	$inout5="%xmm7";
$inout6="%xmm8";	$inout7="%xmm9";

# Note that these alias $inout4..$inout7 (%xmm6-%xmm9).
$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
$in0="%xmm8";		$iv="%xmm9";

# Inline version of internal aesni_[en|de]crypt1.
#
# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
# cycles which take care of loop variables...
#
# aesni_generate1($p,$key,$rounds[,$inout[,$ivec]]) appends a
# single-block cipher sequence to $code.
#
#   $p       "enc" or "dec" at the call sites in this file; selects
#            aes${p} and aes${p}last.
#   $key     register pointing at the key schedule; the emitted code
#            advances it with lea.
#   $rounds  register holding the loop count; the emitted code
#            decrements it to zero.
#   $inout   register holding the block, defaults to $inout0.
#   $ivec    optional register; when given, round key 0 is xored into
#            $ivec and $ivec into $inout, instead of xoring round key 0
#            into $inout directly.
#
# $sn counts expansions so that every emitted loop label is unique.
{ my $sn;
sub aesni_generate1 {
my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
++$sn;
$code.=<<___;
	$movkey	($key),$rndkey0
	$movkey	16($key),$rndkey1
___
$code.=<<___ if (defined($ivec));
	xorps	$rndkey0,$ivec
	lea	32($key),$key
	xorps	$ivec,$inout
___
$code.=<<___ if (!defined($ivec));
	lea	32($key),$key
	xorps	$rndkey0,$inout
___
# The jnz below consumes the flags set by "dec $rounds"; the $movkey
# and lea in between do not modify flags.
$code.=<<___;
.Loop_${p}1_$sn:
	aes${p}	$rndkey1,$inout
	dec	$rounds
	$movkey	($key),$rndkey1
	lea	16($key),$key
	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
	aes${p}last	$rndkey1,$inout
___
}}
# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
#
# Emits the public single-block entry points. Inside this scope $inp,
# $out and $key are lexicals bound to the first three entries of
# @_4args, shadowing the globals of the same name. Each routine loads
# one block from $inp, reads the round count from offset 240 of the
# key structure, runs the inline aesni_generate1 loop, stores the
# result to $out and zeroes the xmm registers it used.
{ my ($inp,$out,$key) = @_4args;

$code.=<<___;
.globl	${PREFIX}_encrypt
.type	${PREFIX}_encrypt,\@abi-omnipotent
.align	16
${PREFIX}_encrypt:
.cfi_startproc
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	pxor	$rndkey0,$rndkey0	# clear register bank
	pxor	$rndkey1,$rndkey1
	movups	$inout0,($out)		# output
	pxor	$inout0,$inout0
	ret
.cfi_endproc
.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt

.globl	${PREFIX}_decrypt
.type	${PREFIX}_decrypt,\@abi-omnipotent
.align	16
${PREFIX}_decrypt:
.cfi_startproc
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	pxor	$rndkey0,$rndkey0	# clear register bank
	pxor	$rndkey1,$rndkey1
	movups	$inout0,($out)		# output
	pxor	$inout0,$inout0
	ret
.cfi_endproc
.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
___
}

# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why was the 3x subroutine originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge and "super-optimal" for other Intel CPUs...

# aesni_generate2($dir) appends the private routine _aesni_${dir}rypt2
# to $code; $dir is "enc" or "dec" at the call sites in this file.
#
# Shape of the emitted code, shared by the other _aesni_*ryptN
# generators: $rounds is shifted left by 4 and $key is moved to
# 32+$rounds bytes past the start of the schedule, so that %rax, after
# neg and add, is a negative byte offset from that point. Each loop
# iteration applies two round keys and advances %rax by 32; the jnz
# consumes the flags of that add, so the loop exits when %rax reaches
# zero. The lexical $dir here is unrelated to the global $dir used to
# locate x86_64-xlate.pl.
sub aesni_generate2 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-1] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt2,\@abi-omnipotent
.align	16
_aesni_${dir}rypt2:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	add	\$16,%rax

.L${dir}_loop2:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop2

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	ret
.cfi_endproc
.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
___
}
# aesni_generate3($dir) appends the private routine _aesni_${dir}rypt3
# to $code. Same structure as the 2x routine, with a third block in
# $inout2 interleaved into every round.
sub aesni_generate3 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-2] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt3,\@abi-omnipotent
.align	16
_aesni_${dir}rypt3:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	add	\$16,%rax

.L${dir}_loop3:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop3

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	ret
.cfi_endproc
.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
___
}
# 4x interleave is
# implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
#
# aesni_generate4($dir) appends the private routine _aesni_${dir}rypt4
# to $code; $dir is "enc" or "dec" at the call sites in this file. The
# emitted ".byte 0x0f,0x1f,0x00" encodes a 3-byte NOP (presumably there
# for loop alignment - confirm before touching).
sub aesni_generate4 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-3] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt4,\@abi-omnipotent
.align	16
_aesni_${dir}rypt4:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	xorps	$rndkey0,$inout3
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	.byte	0x0f,0x1f,0x00
	add	\$16,%rax

.L${dir}_loop4:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop4

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	ret
.cfi_endproc
.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
___
}
# aesni_generate6($dir) appends the private routine _aesni_${dir}rypt6
# to $code. Unlike the 2x-4x routines, the prologue already issues the
# first aes${dir} round for $inout0..$inout2 while the remaining blocks
# are still being xored with round key 0, and then jumps into the
# middle of the loop body at .L${dir}_loop6_enter to finish that round
# for $inout3..$inout5.
sub aesni_generate6 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-5] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt6,\@abi-omnipotent
.align	16
_aesni_${dir}rypt6:
.cfi_startproc
	$movkey		($key),$rndkey0
	shl		\$4,$rounds
	$movkey		16($key),$rndkey1
	xorps		$rndkey0,$inout0
	pxor		$rndkey0,$inout1
	pxor		$rndkey0,$inout2
	aes${dir}	$rndkey1,$inout0
	lea		32($key,$rounds),$key
	neg		%rax			# $rounds
	aes${dir}	$rndkey1,$inout1
	pxor		$rndkey0,$inout3
	pxor		$rndkey0,$inout4
	aes${dir}	$rndkey1,$inout2
	pxor		$rndkey0,$inout5
	$movkey		($key,%rax),$rndkey0
	add		\$16,%rax
	jmp		.L${dir}_loop6_enter
.align	16
.L${dir}_loop6:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
.L${dir}_loop6_enter:
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop6

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	ret
.cfi_endproc
.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
___
}
# aesni_generate8($dir) appends the private routine _aesni_${dir}rypt8
# to $code. The prologue issues the first aes${dir} round for $inout0
# and $inout1 while the other blocks are being xored with round key 0,
# then jumps to .L${dir}_loop8_inner to finish that round for
# $inout2..$inout7. The label .L${dir}_loop8_enter is not referenced
# from within this routine.
sub aesni_generate8 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-7] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt8,\@abi-omnipotent
.align	16
_aesni_${dir}rypt8:
.cfi_startproc
	$movkey		($key),$rndkey0
	shl		\$4,$rounds
	$movkey		16($key),$rndkey1
	xorps		$rndkey0,$inout0
	xorps		$rndkey0,$inout1
	pxor		$rndkey0,$inout2
	pxor		$rndkey0,$inout3
	pxor		$rndkey0,$inout4
	lea		32($key,$rounds),$key
	neg		%rax			# $rounds
	aes${dir}	$rndkey1,$inout0
	pxor		$rndkey0,$inout5
	pxor		$rndkey0,$inout6
	aes${dir}	$rndkey1,$inout1
	pxor		$rndkey0,$inout7
	$movkey		($key,%rax),$rndkey0
	add		\$16,%rax
	jmp		.L${dir}_loop8_inner
.align	16
.L${dir}_loop8:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
.L${dir}_loop8_inner:
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
.L${dir}_loop8_enter:
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	aes${dir}	$rndkey0,$inout6
	aes${dir}	$rndkey0,$inout7
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop8

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	aes${dir}last	$rndkey0,$inout6
	aes${dir}last	$rndkey0,$inout7
	ret
.cfi_endproc
.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
___
}
# The "enc" variant of the 2x routine is emitted only when $PREFIX is
# "aesni"; the "dec" variant is emitted unconditionally.
&aesni_generate2("enc") if ($PREFIX eq "aesni");
&aesni_generate2("dec");
596&aesni_generate3("enc") if ($PREFIX eq "aesni"); 597&aesni_generate3("dec"); 598&aesni_generate4("enc") if ($PREFIX eq "aesni"); 599&aesni_generate4("dec"); 600&aesni_generate6("enc") if ($PREFIX eq "aesni"); 601&aesni_generate6("dec"); 602&aesni_generate8("enc") if ($PREFIX eq "aesni"); 603&aesni_generate8("dec"); 604 605if ($PREFIX eq "aesni") { 606######################################################################## 607# void aesni_ecb_encrypt (const void *in, void *out, 608# size_t length, const AES_KEY *key, 609# int enc); 610$code.=<<___; 611.globl aesni_ecb_encrypt 612.type aesni_ecb_encrypt,\@function,5 613.align 16 614aesni_ecb_encrypt: 615.cfi_startproc 616___ 617$code.=<<___ if ($win64); 618 lea -0x58(%rsp),%rsp 619 movaps %xmm6,(%rsp) # offload $inout4..7 620 movaps %xmm7,0x10(%rsp) 621 movaps %xmm8,0x20(%rsp) 622 movaps %xmm9,0x30(%rsp) 623.Lecb_enc_body: 624___ 625$code.=<<___; 626 and \$-16,$len # if ($len<16) 627 jz .Lecb_ret # return 628 629 mov 240($key),$rounds # key->rounds 630 $movkey ($key),$rndkey0 631 mov $key,$key_ # backup $key 632 mov $rounds,$rnds_ # backup $rounds 633 test %r8d,%r8d # 5th argument 634 jz .Lecb_decrypt 635#--------------------------- ECB ENCRYPT ------------------------------# 636 cmp \$0x80,$len # if ($len<8*16) 637 jb .Lecb_enc_tail # short input 638 639 movdqu ($inp),$inout0 # load 8 input blocks 640 movdqu 0x10($inp),$inout1 641 movdqu 0x20($inp),$inout2 642 movdqu 0x30($inp),$inout3 643 movdqu 0x40($inp),$inout4 644 movdqu 0x50($inp),$inout5 645 movdqu 0x60($inp),$inout6 646 movdqu 0x70($inp),$inout7 647 lea 0x80($inp),$inp # $inp+=8*16 648 sub \$0x80,$len # $len-=8*16 (can be zero) 649 jmp .Lecb_enc_loop8_enter 650.align 16 651.Lecb_enc_loop8: 652 movups $inout0,($out) # store 8 output blocks 653 mov $key_,$key # restore $key 654 movdqu ($inp),$inout0 # load 8 input blocks 655 mov $rnds_,$rounds # restore $rounds 656 movups $inout1,0x10($out) 657 movdqu 0x10($inp),$inout1 658 movups $inout2,0x20($out) 659 
movdqu 0x20($inp),$inout2 660 movups $inout3,0x30($out) 661 movdqu 0x30($inp),$inout3 662 movups $inout4,0x40($out) 663 movdqu 0x40($inp),$inout4 664 movups $inout5,0x50($out) 665 movdqu 0x50($inp),$inout5 666 movups $inout6,0x60($out) 667 movdqu 0x60($inp),$inout6 668 movups $inout7,0x70($out) 669 lea 0x80($out),$out # $out+=8*16 670 movdqu 0x70($inp),$inout7 671 lea 0x80($inp),$inp # $inp+=8*16 672.Lecb_enc_loop8_enter: 673 674 call _aesni_encrypt8 675 676 sub \$0x80,$len 677 jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow 678 679 movups $inout0,($out) # store 8 output blocks 680 mov $key_,$key # restore $key 681 movups $inout1,0x10($out) 682 mov $rnds_,$rounds # restore $rounds 683 movups $inout2,0x20($out) 684 movups $inout3,0x30($out) 685 movups $inout4,0x40($out) 686 movups $inout5,0x50($out) 687 movups $inout6,0x60($out) 688 movups $inout7,0x70($out) 689 lea 0x80($out),$out # $out+=8*16 690 add \$0x80,$len # restore real remaining $len 691 jz .Lecb_ret # done if ($len==0) 692 693.Lecb_enc_tail: # $len is less than 8*16 694 movups ($inp),$inout0 695 cmp \$0x20,$len 696 jb .Lecb_enc_one 697 movups 0x10($inp),$inout1 698 je .Lecb_enc_two 699 movups 0x20($inp),$inout2 700 cmp \$0x40,$len 701 jb .Lecb_enc_three 702 movups 0x30($inp),$inout3 703 je .Lecb_enc_four 704 movups 0x40($inp),$inout4 705 cmp \$0x60,$len 706 jb .Lecb_enc_five 707 movups 0x50($inp),$inout5 708 je .Lecb_enc_six 709 movdqu 0x60($inp),$inout6 710 xorps $inout7,$inout7 711 call _aesni_encrypt8 712 movups $inout0,($out) # store 7 output blocks 713 movups $inout1,0x10($out) 714 movups $inout2,0x20($out) 715 movups $inout3,0x30($out) 716 movups $inout4,0x40($out) 717 movups $inout5,0x50($out) 718 movups $inout6,0x60($out) 719 jmp .Lecb_ret 720.align 16 721.Lecb_enc_one: 722___ 723 &aesni_generate1("enc",$key,$rounds); 724$code.=<<___; 725 movups $inout0,($out) # store one output block 726 jmp .Lecb_ret 727.align 16 728.Lecb_enc_two: 729 call _aesni_encrypt2 730 movups $inout0,($out) # store 
2 output blocks 731 movups $inout1,0x10($out) 732 jmp .Lecb_ret 733.align 16 734.Lecb_enc_three: 735 call _aesni_encrypt3 736 movups $inout0,($out) # store 3 output blocks 737 movups $inout1,0x10($out) 738 movups $inout2,0x20($out) 739 jmp .Lecb_ret 740.align 16 741.Lecb_enc_four: 742 call _aesni_encrypt4 743 movups $inout0,($out) # store 4 output blocks 744 movups $inout1,0x10($out) 745 movups $inout2,0x20($out) 746 movups $inout3,0x30($out) 747 jmp .Lecb_ret 748.align 16 749.Lecb_enc_five: 750 xorps $inout5,$inout5 751 call _aesni_encrypt6 752 movups $inout0,($out) # store 5 output blocks 753 movups $inout1,0x10($out) 754 movups $inout2,0x20($out) 755 movups $inout3,0x30($out) 756 movups $inout4,0x40($out) 757 jmp .Lecb_ret 758.align 16 759.Lecb_enc_six: 760 call _aesni_encrypt6 761 movups $inout0,($out) # store 6 output blocks 762 movups $inout1,0x10($out) 763 movups $inout2,0x20($out) 764 movups $inout3,0x30($out) 765 movups $inout4,0x40($out) 766 movups $inout5,0x50($out) 767 jmp .Lecb_ret 768#--------------------------- ECB DECRYPT ------------------------------# 769.align 16 770.Lecb_decrypt: 771 cmp \$0x80,$len # if ($len<8*16) 772 jb .Lecb_dec_tail # short input 773 774 movdqu ($inp),$inout0 # load 8 input blocks 775 movdqu 0x10($inp),$inout1 776 movdqu 0x20($inp),$inout2 777 movdqu 0x30($inp),$inout3 778 movdqu 0x40($inp),$inout4 779 movdqu 0x50($inp),$inout5 780 movdqu 0x60($inp),$inout6 781 movdqu 0x70($inp),$inout7 782 lea 0x80($inp),$inp # $inp+=8*16 783 sub \$0x80,$len # $len-=8*16 (can be zero) 784 jmp .Lecb_dec_loop8_enter 785.align 16 786.Lecb_dec_loop8: 787 movups $inout0,($out) # store 8 output blocks 788 mov $key_,$key # restore $key 789 movdqu ($inp),$inout0 # load 8 input blocks 790 mov $rnds_,$rounds # restore $rounds 791 movups $inout1,0x10($out) 792 movdqu 0x10($inp),$inout1 793 movups $inout2,0x20($out) 794 movdqu 0x20($inp),$inout2 795 movups $inout3,0x30($out) 796 movdqu 0x30($inp),$inout3 797 movups $inout4,0x40($out) 798 movdqu 
0x40($inp),$inout4 799 movups $inout5,0x50($out) 800 movdqu 0x50($inp),$inout5 801 movups $inout6,0x60($out) 802 movdqu 0x60($inp),$inout6 803 movups $inout7,0x70($out) 804 lea 0x80($out),$out # $out+=8*16 805 movdqu 0x70($inp),$inout7 806 lea 0x80($inp),$inp # $inp+=8*16 807.Lecb_dec_loop8_enter: 808 809 call _aesni_decrypt8 810 811 $movkey ($key_),$rndkey0 812 sub \$0x80,$len 813 jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow 814 815 movups $inout0,($out) # store 8 output blocks 816 pxor $inout0,$inout0 # clear register bank 817 mov $key_,$key # restore $key 818 movups $inout1,0x10($out) 819 pxor $inout1,$inout1 820 mov $rnds_,$rounds # restore $rounds 821 movups $inout2,0x20($out) 822 pxor $inout2,$inout2 823 movups $inout3,0x30($out) 824 pxor $inout3,$inout3 825 movups $inout4,0x40($out) 826 pxor $inout4,$inout4 827 movups $inout5,0x50($out) 828 pxor $inout5,$inout5 829 movups $inout6,0x60($out) 830 pxor $inout6,$inout6 831 movups $inout7,0x70($out) 832 pxor $inout7,$inout7 833 lea 0x80($out),$out # $out+=8*16 834 add \$0x80,$len # restore real remaining $len 835 jz .Lecb_ret # done if ($len==0) 836 837.Lecb_dec_tail: 838 movups ($inp),$inout0 839 cmp \$0x20,$len 840 jb .Lecb_dec_one 841 movups 0x10($inp),$inout1 842 je .Lecb_dec_two 843 movups 0x20($inp),$inout2 844 cmp \$0x40,$len 845 jb .Lecb_dec_three 846 movups 0x30($inp),$inout3 847 je .Lecb_dec_four 848 movups 0x40($inp),$inout4 849 cmp \$0x60,$len 850 jb .Lecb_dec_five 851 movups 0x50($inp),$inout5 852 je .Lecb_dec_six 853 movups 0x60($inp),$inout6 854 $movkey ($key),$rndkey0 855 xorps $inout7,$inout7 856 call _aesni_decrypt8 857 movups $inout0,($out) # store 7 output blocks 858 pxor $inout0,$inout0 # clear register bank 859 movups $inout1,0x10($out) 860 pxor $inout1,$inout1 861 movups $inout2,0x20($out) 862 pxor $inout2,$inout2 863 movups $inout3,0x30($out) 864 pxor $inout3,$inout3 865 movups $inout4,0x40($out) 866 pxor $inout4,$inout4 867 movups $inout5,0x50($out) 868 pxor $inout5,$inout5 869 
movups $inout6,0x60($out) 870 pxor $inout6,$inout6 871 pxor $inout7,$inout7 872 jmp .Lecb_ret 873.align 16 874.Lecb_dec_one: 875___ 876 &aesni_generate1("dec",$key,$rounds); 877$code.=<<___; 878 movups $inout0,($out) # store one output block 879 pxor $inout0,$inout0 # clear register bank 880 jmp .Lecb_ret 881.align 16 882.Lecb_dec_two: 883 call _aesni_decrypt2 884 movups $inout0,($out) # store 2 output blocks 885 pxor $inout0,$inout0 # clear register bank 886 movups $inout1,0x10($out) 887 pxor $inout1,$inout1 888 jmp .Lecb_ret 889.align 16 890.Lecb_dec_three: 891 call _aesni_decrypt3 892 movups $inout0,($out) # store 3 output blocks 893 pxor $inout0,$inout0 # clear register bank 894 movups $inout1,0x10($out) 895 pxor $inout1,$inout1 896 movups $inout2,0x20($out) 897 pxor $inout2,$inout2 898 jmp .Lecb_ret 899.align 16 900.Lecb_dec_four: 901 call _aesni_decrypt4 902 movups $inout0,($out) # store 4 output blocks 903 pxor $inout0,$inout0 # clear register bank 904 movups $inout1,0x10($out) 905 pxor $inout1,$inout1 906 movups $inout2,0x20($out) 907 pxor $inout2,$inout2 908 movups $inout3,0x30($out) 909 pxor $inout3,$inout3 910 jmp .Lecb_ret 911.align 16 912.Lecb_dec_five: 913 xorps $inout5,$inout5 914 call _aesni_decrypt6 915 movups $inout0,($out) # store 5 output blocks 916 pxor $inout0,$inout0 # clear register bank 917 movups $inout1,0x10($out) 918 pxor $inout1,$inout1 919 movups $inout2,0x20($out) 920 pxor $inout2,$inout2 921 movups $inout3,0x30($out) 922 pxor $inout3,$inout3 923 movups $inout4,0x40($out) 924 pxor $inout4,$inout4 925 pxor $inout5,$inout5 926 jmp .Lecb_ret 927.align 16 928.Lecb_dec_six: 929 call _aesni_decrypt6 930 movups $inout0,($out) # store 6 output blocks 931 pxor $inout0,$inout0 # clear register bank 932 movups $inout1,0x10($out) 933 pxor $inout1,$inout1 934 movups $inout2,0x20($out) 935 pxor $inout2,$inout2 936 movups $inout3,0x30($out) 937 pxor $inout3,$inout3 938 movups $inout4,0x40($out) 939 pxor $inout4,$inout4 940 movups $inout5,0x50($out) 
941 pxor $inout5,$inout5 942 943.Lecb_ret: 944 xorps $rndkey0,$rndkey0 # %xmm0 945 pxor $rndkey1,$rndkey1 946___ 947$code.=<<___ if ($win64); 948 movaps (%rsp),%xmm6 949 movaps %xmm0,(%rsp) # clear stack 950 movaps 0x10(%rsp),%xmm7 951 movaps %xmm0,0x10(%rsp) 952 movaps 0x20(%rsp),%xmm8 953 movaps %xmm0,0x20(%rsp) 954 movaps 0x30(%rsp),%xmm9 955 movaps %xmm0,0x30(%rsp) 956 lea 0x58(%rsp),%rsp 957.Lecb_enc_ret: 958___ 959$code.=<<___; 960 ret 961.cfi_endproc 962.size aesni_ecb_encrypt,.-aesni_ecb_encrypt 963___ 964 965{ 966###################################################################### 967# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, 968# size_t blocks, const AES_KEY *key, 969# const char *ivec,char *cmac); 970# 971# Handles only complete blocks, operates on 64-bit counter and 972# does not update *ivec! Nor does it finalize CMAC value 973# (see engine/eng_aesni.c for details) 974# 975{ 976my $cmac="%r9"; # 6th argument 977 978my $increment="%xmm9"; 979my $iv="%xmm6"; 980my $bswap_mask="%xmm7"; 981 982$code.=<<___; 983.globl aesni_ccm64_encrypt_blocks 984.type aesni_ccm64_encrypt_blocks,\@function,6 985.align 16 986aesni_ccm64_encrypt_blocks: 987.cfi_startproc 988___ 989$code.=<<___ if ($win64); 990 lea -0x58(%rsp),%rsp 991 movaps %xmm6,(%rsp) # $iv 992 movaps %xmm7,0x10(%rsp) # $bswap_mask 993 movaps %xmm8,0x20(%rsp) # $in0 994 movaps %xmm9,0x30(%rsp) # $increment 995.Lccm64_enc_body: 996___ 997$code.=<<___; 998 mov 240($key),$rounds # key->rounds 999 movdqu ($ivp),$iv 1000 movdqa .Lincrement64(%rip),$increment 1001 movdqa .Lbswap_mask(%rip),$bswap_mask 1002 1003 shl \$4,$rounds 1004 mov \$16,$rnds_ 1005 lea 0($key),$key_ 1006 movdqu ($cmac),$inout1 1007 movdqa $iv,$inout0 1008 lea 32($key,$rounds),$key # end of key schedule 1009 pshufb $bswap_mask,$iv 1010 sub %rax,%r10 # twisted $rounds 1011 jmp .Lccm64_enc_outer 1012.align 16 1013.Lccm64_enc_outer: 1014 $movkey ($key_),$rndkey0 1015 mov %r10,%rax 1016 movups ($inp),$in0 # load inp 
1017 1018 xorps $rndkey0,$inout0 # counter 1019 $movkey 16($key_),$rndkey1 1020 xorps $in0,$rndkey0 1021 xorps $rndkey0,$inout1 # cmac^=inp 1022 $movkey 32($key_),$rndkey0 1023 1024.Lccm64_enc2_loop: 1025 aesenc $rndkey1,$inout0 1026 aesenc $rndkey1,$inout1 1027 $movkey ($key,%rax),$rndkey1 1028 add \$32,%rax 1029 aesenc $rndkey0,$inout0 1030 aesenc $rndkey0,$inout1 1031 $movkey -16($key,%rax),$rndkey0 1032 jnz .Lccm64_enc2_loop 1033 aesenc $rndkey1,$inout0 1034 aesenc $rndkey1,$inout1 1035 paddq $increment,$iv 1036 dec $len # $len-- ($len is in blocks) 1037 aesenclast $rndkey0,$inout0 1038 aesenclast $rndkey0,$inout1 1039 1040 lea 16($inp),$inp 1041 xorps $inout0,$in0 # inp ^= E(iv) 1042 movdqa $iv,$inout0 1043 movups $in0,($out) # save output 1044 pshufb $bswap_mask,$inout0 1045 lea 16($out),$out # $out+=16 1046 jnz .Lccm64_enc_outer # loop if ($len!=0) 1047 1048 pxor $rndkey0,$rndkey0 # clear register bank 1049 pxor $rndkey1,$rndkey1 1050 pxor $inout0,$inout0 1051 movups $inout1,($cmac) # store resulting mac 1052 pxor $inout1,$inout1 1053 pxor $in0,$in0 1054 pxor $iv,$iv 1055___ 1056$code.=<<___ if ($win64); 1057 movaps (%rsp),%xmm6 1058 movaps %xmm0,(%rsp) # clear stack 1059 movaps 0x10(%rsp),%xmm7 1060 movaps %xmm0,0x10(%rsp) 1061 movaps 0x20(%rsp),%xmm8 1062 movaps %xmm0,0x20(%rsp) 1063 movaps 0x30(%rsp),%xmm9 1064 movaps %xmm0,0x30(%rsp) 1065 lea 0x58(%rsp),%rsp 1066.Lccm64_enc_ret: 1067___ 1068$code.=<<___; 1069 ret 1070.cfi_endproc 1071.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks 1072___ 1073###################################################################### 1074$code.=<<___; 1075.globl aesni_ccm64_decrypt_blocks 1076.type aesni_ccm64_decrypt_blocks,\@function,6 1077.align 16 1078aesni_ccm64_decrypt_blocks: 1079.cfi_startproc 1080___ 1081$code.=<<___ if ($win64); 1082 lea -0x58(%rsp),%rsp 1083 movaps %xmm6,(%rsp) # $iv 1084 movaps %xmm7,0x10(%rsp) # $bswap_mask 1085 movaps %xmm8,0x20(%rsp) # $in0 1086 movaps %xmm9,0x30(%rsp) # 
$increment 1087.Lccm64_dec_body: 1088___ 1089$code.=<<___; 1090 mov 240($key),$rounds # key->rounds 1091 movups ($ivp),$iv 1092 movdqu ($cmac),$inout1 1093 movdqa .Lincrement64(%rip),$increment 1094 movdqa .Lbswap_mask(%rip),$bswap_mask 1095 1096 movaps $iv,$inout0 1097 mov $rounds,$rnds_ 1098 mov $key,$key_ 1099 pshufb $bswap_mask,$iv 1100___ 1101 &aesni_generate1("enc",$key,$rounds); 1102$code.=<<___; 1103 shl \$4,$rnds_ 1104 mov \$16,$rounds 1105 movups ($inp),$in0 # load inp 1106 paddq $increment,$iv 1107 lea 16($inp),$inp # $inp+=16 1108 sub %r10,%rax # twisted $rounds 1109 lea 32($key_,$rnds_),$key # end of key schedule 1110 mov %rax,%r10 1111 jmp .Lccm64_dec_outer 1112.align 16 1113.Lccm64_dec_outer: 1114 xorps $inout0,$in0 # inp ^= E(iv) 1115 movdqa $iv,$inout0 1116 movups $in0,($out) # save output 1117 lea 16($out),$out # $out+=16 1118 pshufb $bswap_mask,$inout0 1119 1120 sub \$1,$len # $len-- ($len is in blocks) 1121 jz .Lccm64_dec_break # if ($len==0) break 1122 1123 $movkey ($key_),$rndkey0 1124 mov %r10,%rax 1125 $movkey 16($key_),$rndkey1 1126 xorps $rndkey0,$in0 1127 xorps $rndkey0,$inout0 1128 xorps $in0,$inout1 # cmac^=out 1129 $movkey 32($key_),$rndkey0 1130 jmp .Lccm64_dec2_loop 1131.align 16 1132.Lccm64_dec2_loop: 1133 aesenc $rndkey1,$inout0 1134 aesenc $rndkey1,$inout1 1135 $movkey ($key,%rax),$rndkey1 1136 add \$32,%rax 1137 aesenc $rndkey0,$inout0 1138 aesenc $rndkey0,$inout1 1139 $movkey -16($key,%rax),$rndkey0 1140 jnz .Lccm64_dec2_loop 1141 movups ($inp),$in0 # load input 1142 paddq $increment,$iv 1143 aesenc $rndkey1,$inout0 1144 aesenc $rndkey1,$inout1 1145 aesenclast $rndkey0,$inout0 1146 aesenclast $rndkey0,$inout1 1147 lea 16($inp),$inp # $inp+=16 1148 jmp .Lccm64_dec_outer 1149 1150.align 16 1151.Lccm64_dec_break: 1152 #xorps $in0,$inout1 # cmac^=out 1153 mov 240($key_),$rounds 1154___ 1155 &aesni_generate1("enc",$key_,$rounds,$inout1,$in0); 1156$code.=<<___; 1157 pxor $rndkey0,$rndkey0 # clear register bank 1158 pxor 
$rndkey1,$rndkey1 1159 pxor $inout0,$inout0 1160 movups $inout1,($cmac) # store resulting mac 1161 pxor $inout1,$inout1 1162 pxor $in0,$in0 1163 pxor $iv,$iv 1164___ 1165$code.=<<___ if ($win64); 1166 movaps (%rsp),%xmm6 1167 movaps %xmm0,(%rsp) # clear stack 1168 movaps 0x10(%rsp),%xmm7 1169 movaps %xmm0,0x10(%rsp) 1170 movaps 0x20(%rsp),%xmm8 1171 movaps %xmm0,0x20(%rsp) 1172 movaps 0x30(%rsp),%xmm9 1173 movaps %xmm0,0x30(%rsp) 1174 lea 0x58(%rsp),%rsp 1175.Lccm64_dec_ret: 1176___ 1177$code.=<<___; 1178 ret 1179.cfi_endproc 1180.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks 1181___ 1182} 1183###################################################################### 1184# void aesni_ctr32_encrypt_blocks (const void *in, void *out, 1185# size_t blocks, const AES_KEY *key, 1186# const char *ivec); 1187# 1188# Handles only complete blocks, operates on 32-bit counter and 1189# does not update *ivec! (see crypto/modes/ctr128.c for details) 1190# 1191# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov, 1192# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest. 1193# Keywords are full unroll and modulo-schedule counter calculations 1194# with zero-round key xor. 
1195{ 1196my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15)); 1197my ($key0,$ctr)=("%ebp","${ivp}d"); 1198my $frame_size = 0x80 + ($win64?160:0); 1199 1200$code.=<<___; 1201.globl aesni_ctr32_encrypt_blocks 1202.type aesni_ctr32_encrypt_blocks,\@function,5 1203.align 16 1204aesni_ctr32_encrypt_blocks: 1205.cfi_startproc 1206 cmp \$1,$len 1207 jne .Lctr32_bulk 1208 1209 # handle single block without allocating stack frame, 1210 # useful when handling edges 1211 movups ($ivp),$inout0 1212 movups ($inp),$inout1 1213 mov 240($key),%edx # key->rounds 1214___ 1215 &aesni_generate1("enc",$key,"%edx"); 1216$code.=<<___; 1217 pxor $rndkey0,$rndkey0 # clear register bank 1218 pxor $rndkey1,$rndkey1 1219 xorps $inout1,$inout0 1220 pxor $inout1,$inout1 1221 movups $inout0,($out) 1222 xorps $inout0,$inout0 1223 jmp .Lctr32_epilogue 1224 1225.align 16 1226.Lctr32_bulk: 1227 lea (%rsp),$key_ # use $key_ as frame pointer 1228.cfi_def_cfa_register $key_ 1229 push %rbp 1230.cfi_push %rbp 1231 sub \$$frame_size,%rsp 1232 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 1233___ 1234$code.=<<___ if ($win64); 1235 movaps %xmm6,-0xa8($key_) # offload everything 1236 movaps %xmm7,-0x98($key_) 1237 movaps %xmm8,-0x88($key_) 1238 movaps %xmm9,-0x78($key_) 1239 movaps %xmm10,-0x68($key_) 1240 movaps %xmm11,-0x58($key_) 1241 movaps %xmm12,-0x48($key_) 1242 movaps %xmm13,-0x38($key_) 1243 movaps %xmm14,-0x28($key_) 1244 movaps %xmm15,-0x18($key_) 1245.Lctr32_body: 1246___ 1247$code.=<<___; 1248 1249 # 8 16-byte words on top of stack are counter values 1250 # xor-ed with zero-round key 1251 1252 movdqu ($ivp),$inout0 1253 movdqu ($key),$rndkey0 1254 mov 12($ivp),$ctr # counter LSB 1255 pxor $rndkey0,$inout0 1256 mov 12($key),$key0 # 0-round key LSB 1257 movdqa $inout0,0x00(%rsp) # populate counter block 1258 bswap $ctr 1259 movdqa $inout0,$inout1 1260 movdqa $inout0,$inout2 1261 movdqa $inout0,$inout3 1262 movdqa $inout0,0x40(%rsp) 1263 movdqa $inout0,0x50(%rsp) 1264 movdqa 
$inout0,0x60(%rsp) 1265 mov %rdx,%r10 # about to borrow %rdx 1266 movdqa $inout0,0x70(%rsp) 1267 1268 lea 1($ctr),%rax 1269 lea 2($ctr),%rdx 1270 bswap %eax 1271 bswap %edx 1272 xor $key0,%eax 1273 xor $key0,%edx 1274 pinsrd \$3,%eax,$inout1 1275 lea 3($ctr),%rax 1276 movdqa $inout1,0x10(%rsp) 1277 pinsrd \$3,%edx,$inout2 1278 bswap %eax 1279 mov %r10,%rdx # restore %rdx 1280 lea 4($ctr),%r10 1281 movdqa $inout2,0x20(%rsp) 1282 xor $key0,%eax 1283 bswap %r10d 1284 pinsrd \$3,%eax,$inout3 1285 xor $key0,%r10d 1286 movdqa $inout3,0x30(%rsp) 1287 lea 5($ctr),%r9 1288 mov %r10d,0x40+12(%rsp) 1289 bswap %r9d 1290 lea 6($ctr),%r10 1291 mov 240($key),$rounds # key->rounds 1292 xor $key0,%r9d 1293 bswap %r10d 1294 mov %r9d,0x50+12(%rsp) 1295 xor $key0,%r10d 1296 lea 7($ctr),%r9 1297 mov %r10d,0x60+12(%rsp) 1298 bswap %r9d 1299 mov OPENSSL_ia32cap_P+4(%rip),%r10d 1300 xor $key0,%r9d 1301 and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE 1302 mov %r9d,0x70+12(%rsp) 1303 1304 $movkey 0x10($key),$rndkey1 1305 1306 movdqa 0x40(%rsp),$inout4 1307 movdqa 0x50(%rsp),$inout5 1308 1309 cmp \$8,$len # $len is in blocks 1310 jb .Lctr32_tail # short input if ($len<8) 1311 1312 sub \$6,$len # $len is biased by -6 1313 cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE 1314 je .Lctr32_6x # [which denotes Atom Silvermont] 1315 1316 lea 0x80($key),$key # size optimization 1317 sub \$2,$len # $len is biased by -8 1318 jmp .Lctr32_loop8 1319 1320.align 16 1321.Lctr32_6x: 1322 shl \$4,$rounds 1323 mov \$48,$rnds_ 1324 bswap $key0 1325 lea 32($key,$rounds),$key # end of key schedule 1326 sub %rax,%r10 # twisted $rounds 1327 jmp .Lctr32_loop6 1328 1329.align 16 1330.Lctr32_loop6: 1331 add \$6,$ctr # next counter value 1332 $movkey -48($key,$rnds_),$rndkey0 1333 aesenc $rndkey1,$inout0 1334 mov $ctr,%eax 1335 xor $key0,%eax 1336 aesenc $rndkey1,$inout1 1337 movbe %eax,`0x00+12`(%rsp) # store next counter value 1338 lea 1($ctr),%eax 1339 aesenc $rndkey1,$inout2 1340 xor $key0,%eax 1341 movbe 
%eax,`0x10+12`(%rsp) 1342 aesenc $rndkey1,$inout3 1343 lea 2($ctr),%eax 1344 xor $key0,%eax 1345 aesenc $rndkey1,$inout4 1346 movbe %eax,`0x20+12`(%rsp) 1347 lea 3($ctr),%eax 1348 aesenc $rndkey1,$inout5 1349 $movkey -32($key,$rnds_),$rndkey1 1350 xor $key0,%eax 1351 1352 aesenc $rndkey0,$inout0 1353 movbe %eax,`0x30+12`(%rsp) 1354 lea 4($ctr),%eax 1355 aesenc $rndkey0,$inout1 1356 xor $key0,%eax 1357 movbe %eax,`0x40+12`(%rsp) 1358 aesenc $rndkey0,$inout2 1359 lea 5($ctr),%eax 1360 xor $key0,%eax 1361 aesenc $rndkey0,$inout3 1362 movbe %eax,`0x50+12`(%rsp) 1363 mov %r10,%rax # mov $rnds_,$rounds 1364 aesenc $rndkey0,$inout4 1365 aesenc $rndkey0,$inout5 1366 $movkey -16($key,$rnds_),$rndkey0 1367 1368 call .Lenc_loop6 1369 1370 movdqu ($inp),$inout6 # load 6 input blocks 1371 movdqu 0x10($inp),$inout7 1372 movdqu 0x20($inp),$in0 1373 movdqu 0x30($inp),$in1 1374 movdqu 0x40($inp),$in2 1375 movdqu 0x50($inp),$in3 1376 lea 0x60($inp),$inp # $inp+=6*16 1377 $movkey -64($key,$rnds_),$rndkey1 1378 pxor $inout0,$inout6 # inp^=E(ctr) 1379 movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round] 1380 pxor $inout1,$inout7 1381 movaps 0x10(%rsp),$inout1 1382 pxor $inout2,$in0 1383 movaps 0x20(%rsp),$inout2 1384 pxor $inout3,$in1 1385 movaps 0x30(%rsp),$inout3 1386 pxor $inout4,$in2 1387 movaps 0x40(%rsp),$inout4 1388 pxor $inout5,$in3 1389 movaps 0x50(%rsp),$inout5 1390 movdqu $inout6,($out) # store 6 output blocks 1391 movdqu $inout7,0x10($out) 1392 movdqu $in0,0x20($out) 1393 movdqu $in1,0x30($out) 1394 movdqu $in2,0x40($out) 1395 movdqu $in3,0x50($out) 1396 lea 0x60($out),$out # $out+=6*16 1397 1398 sub \$6,$len 1399 jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow 1400 1401 add \$6,$len # restore real remaining $len 1402 jz .Lctr32_done # done if ($len==0) 1403 1404 lea -48($rnds_),$rounds 1405 lea -80($key,$rnds_),$key # restore $key 1406 neg $rounds 1407 shr \$4,$rounds # restore $rounds 1408 jmp .Lctr32_tail 1409 1410.align 32 1411.Lctr32_loop8: 1412 add 
\$8,$ctr # next counter value 1413 movdqa 0x60(%rsp),$inout6 1414 aesenc $rndkey1,$inout0 1415 mov $ctr,%r9d 1416 movdqa 0x70(%rsp),$inout7 1417 aesenc $rndkey1,$inout1 1418 bswap %r9d 1419 $movkey 0x20-0x80($key),$rndkey0 1420 aesenc $rndkey1,$inout2 1421 xor $key0,%r9d 1422 nop 1423 aesenc $rndkey1,$inout3 1424 mov %r9d,0x00+12(%rsp) # store next counter value 1425 lea 1($ctr),%r9 1426 aesenc $rndkey1,$inout4 1427 aesenc $rndkey1,$inout5 1428 aesenc $rndkey1,$inout6 1429 aesenc $rndkey1,$inout7 1430 $movkey 0x30-0x80($key),$rndkey1 1431___ 1432for($i=2;$i<8;$i++) { 1433my $rndkeyx = ($i&1)?$rndkey1:$rndkey0; 1434$code.=<<___; 1435 bswap %r9d 1436 aesenc $rndkeyx,$inout0 1437 aesenc $rndkeyx,$inout1 1438 xor $key0,%r9d 1439 .byte 0x66,0x90 1440 aesenc $rndkeyx,$inout2 1441 aesenc $rndkeyx,$inout3 1442 mov %r9d,`0x10*($i-1)`+12(%rsp) 1443 lea $i($ctr),%r9 1444 aesenc $rndkeyx,$inout4 1445 aesenc $rndkeyx,$inout5 1446 aesenc $rndkeyx,$inout6 1447 aesenc $rndkeyx,$inout7 1448 $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx 1449___ 1450} 1451$code.=<<___; 1452 bswap %r9d 1453 aesenc $rndkey0,$inout0 1454 aesenc $rndkey0,$inout1 1455 aesenc $rndkey0,$inout2 1456 xor $key0,%r9d 1457 movdqu 0x00($inp),$in0 # start loading input 1458 aesenc $rndkey0,$inout3 1459 mov %r9d,0x70+12(%rsp) 1460 cmp \$11,$rounds 1461 aesenc $rndkey0,$inout4 1462 aesenc $rndkey0,$inout5 1463 aesenc $rndkey0,$inout6 1464 aesenc $rndkey0,$inout7 1465 $movkey 0xa0-0x80($key),$rndkey0 1466 1467 jb .Lctr32_enc_done 1468 1469 aesenc $rndkey1,$inout0 1470 aesenc $rndkey1,$inout1 1471 aesenc $rndkey1,$inout2 1472 aesenc $rndkey1,$inout3 1473 aesenc $rndkey1,$inout4 1474 aesenc $rndkey1,$inout5 1475 aesenc $rndkey1,$inout6 1476 aesenc $rndkey1,$inout7 1477 $movkey 0xb0-0x80($key),$rndkey1 1478 1479 aesenc $rndkey0,$inout0 1480 aesenc $rndkey0,$inout1 1481 aesenc $rndkey0,$inout2 1482 aesenc $rndkey0,$inout3 1483 aesenc $rndkey0,$inout4 1484 aesenc $rndkey0,$inout5 1485 aesenc $rndkey0,$inout6 1486 aesenc 
$rndkey0,$inout7 1487 $movkey 0xc0-0x80($key),$rndkey0 1488 je .Lctr32_enc_done 1489 1490 aesenc $rndkey1,$inout0 1491 aesenc $rndkey1,$inout1 1492 aesenc $rndkey1,$inout2 1493 aesenc $rndkey1,$inout3 1494 aesenc $rndkey1,$inout4 1495 aesenc $rndkey1,$inout5 1496 aesenc $rndkey1,$inout6 1497 aesenc $rndkey1,$inout7 1498 $movkey 0xd0-0x80($key),$rndkey1 1499 1500 aesenc $rndkey0,$inout0 1501 aesenc $rndkey0,$inout1 1502 aesenc $rndkey0,$inout2 1503 aesenc $rndkey0,$inout3 1504 aesenc $rndkey0,$inout4 1505 aesenc $rndkey0,$inout5 1506 aesenc $rndkey0,$inout6 1507 aesenc $rndkey0,$inout7 1508 $movkey 0xe0-0x80($key),$rndkey0 1509 jmp .Lctr32_enc_done 1510 1511.align 16 1512.Lctr32_enc_done: 1513 movdqu 0x10($inp),$in1 1514 pxor $rndkey0,$in0 # input^=round[last] 1515 movdqu 0x20($inp),$in2 1516 pxor $rndkey0,$in1 1517 movdqu 0x30($inp),$in3 1518 pxor $rndkey0,$in2 1519 movdqu 0x40($inp),$in4 1520 pxor $rndkey0,$in3 1521 movdqu 0x50($inp),$in5 1522 pxor $rndkey0,$in4 1523 pxor $rndkey0,$in5 1524 aesenc $rndkey1,$inout0 1525 aesenc $rndkey1,$inout1 1526 aesenc $rndkey1,$inout2 1527 aesenc $rndkey1,$inout3 1528 aesenc $rndkey1,$inout4 1529 aesenc $rndkey1,$inout5 1530 aesenc $rndkey1,$inout6 1531 aesenc $rndkey1,$inout7 1532 movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6] 1533 lea 0x80($inp),$inp # $inp+=8*16 1534 1535 aesenclast $in0,$inout0 # $inN is inp[N]^round[last] 1536 pxor $rndkey0,$rndkey1 # borrowed $rndkey 1537 movdqu 0x70-0x80($inp),$in0 1538 aesenclast $in1,$inout1 1539 pxor $rndkey0,$in0 1540 movdqa 0x00(%rsp),$in1 # load next counter block 1541 aesenclast $in2,$inout2 1542 aesenclast $in3,$inout3 1543 movdqa 0x10(%rsp),$in2 1544 movdqa 0x20(%rsp),$in3 1545 aesenclast $in4,$inout4 1546 aesenclast $in5,$inout5 1547 movdqa 0x30(%rsp),$in4 1548 movdqa 0x40(%rsp),$in5 1549 aesenclast $rndkey1,$inout6 1550 movdqa 0x50(%rsp),$rndkey0 1551 $movkey 0x10-0x80($key),$rndkey1#real 1st-round key 1552 aesenclast $in0,$inout7 1553 1554 movups $inout0,($out) # 
store 8 output blocks 1555 movdqa $in1,$inout0 1556 movups $inout1,0x10($out) 1557 movdqa $in2,$inout1 1558 movups $inout2,0x20($out) 1559 movdqa $in3,$inout2 1560 movups $inout3,0x30($out) 1561 movdqa $in4,$inout3 1562 movups $inout4,0x40($out) 1563 movdqa $in5,$inout4 1564 movups $inout5,0x50($out) 1565 movdqa $rndkey0,$inout5 1566 movups $inout6,0x60($out) 1567 movups $inout7,0x70($out) 1568 lea 0x80($out),$out # $out+=8*16 1569 1570 sub \$8,$len 1571 jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow 1572 1573 add \$8,$len # restore real remaining $len 1574 jz .Lctr32_done # done if ($len==0) 1575 lea -0x80($key),$key 1576 1577.Lctr32_tail: 1578 # note that at this point $inout0..5 are populated with 1579 # counter values xor-ed with 0-round key 1580 lea 16($key),$key 1581 cmp \$4,$len 1582 jb .Lctr32_loop3 1583 je .Lctr32_loop4 1584 1585 # if ($len>4) compute 7 E(counter) 1586 shl \$4,$rounds 1587 movdqa 0x60(%rsp),$inout6 1588 pxor $inout7,$inout7 1589 1590 $movkey 16($key),$rndkey0 1591 aesenc $rndkey1,$inout0 1592 aesenc $rndkey1,$inout1 1593 lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter 1594 neg %rax 1595 aesenc $rndkey1,$inout2 1596 add \$16,%rax # prepare for .Lenc_loop8_enter 1597 movups ($inp),$in0 1598 aesenc $rndkey1,$inout3 1599 aesenc $rndkey1,$inout4 1600 movups 0x10($inp),$in1 # pre-load input 1601 movups 0x20($inp),$in2 1602 aesenc $rndkey1,$inout5 1603 aesenc $rndkey1,$inout6 1604 1605 call .Lenc_loop8_enter 1606 1607 movdqu 0x30($inp),$in3 1608 pxor $in0,$inout0 1609 movdqu 0x40($inp),$in0 1610 pxor $in1,$inout1 1611 movdqu $inout0,($out) # store output 1612 pxor $in2,$inout2 1613 movdqu $inout1,0x10($out) 1614 pxor $in3,$inout3 1615 movdqu $inout2,0x20($out) 1616 pxor $in0,$inout4 1617 movdqu $inout3,0x30($out) 1618 movdqu $inout4,0x40($out) 1619 cmp \$6,$len 1620 jb .Lctr32_done # $len was 5, stop store 1621 1622 movups 0x50($inp),$in1 1623 xorps $in1,$inout5 1624 movups $inout5,0x50($out) 1625 je .Lctr32_done # $len was 6, stop 
store 1626 1627 movups 0x60($inp),$in2 1628 xorps $in2,$inout6 1629 movups $inout6,0x60($out) 1630 jmp .Lctr32_done # $len was 7, stop store 1631 1632.align 32 1633.Lctr32_loop4: 1634 aesenc $rndkey1,$inout0 1635 lea 16($key),$key 1636 dec $rounds 1637 aesenc $rndkey1,$inout1 1638 aesenc $rndkey1,$inout2 1639 aesenc $rndkey1,$inout3 1640 $movkey ($key),$rndkey1 1641 jnz .Lctr32_loop4 1642 aesenclast $rndkey1,$inout0 1643 aesenclast $rndkey1,$inout1 1644 movups ($inp),$in0 # load input 1645 movups 0x10($inp),$in1 1646 aesenclast $rndkey1,$inout2 1647 aesenclast $rndkey1,$inout3 1648 movups 0x20($inp),$in2 1649 movups 0x30($inp),$in3 1650 1651 xorps $in0,$inout0 1652 movups $inout0,($out) # store output 1653 xorps $in1,$inout1 1654 movups $inout1,0x10($out) 1655 pxor $in2,$inout2 1656 movdqu $inout2,0x20($out) 1657 pxor $in3,$inout3 1658 movdqu $inout3,0x30($out) 1659 jmp .Lctr32_done # $len was 4, stop store 1660 1661.align 32 1662.Lctr32_loop3: 1663 aesenc $rndkey1,$inout0 1664 lea 16($key),$key 1665 dec $rounds 1666 aesenc $rndkey1,$inout1 1667 aesenc $rndkey1,$inout2 1668 $movkey ($key),$rndkey1 1669 jnz .Lctr32_loop3 1670 aesenclast $rndkey1,$inout0 1671 aesenclast $rndkey1,$inout1 1672 aesenclast $rndkey1,$inout2 1673 1674 movups ($inp),$in0 # load input 1675 xorps $in0,$inout0 1676 movups $inout0,($out) # store output 1677 cmp \$2,$len 1678 jb .Lctr32_done # $len was 1, stop store 1679 1680 movups 0x10($inp),$in1 1681 xorps $in1,$inout1 1682 movups $inout1,0x10($out) 1683 je .Lctr32_done # $len was 2, stop store 1684 1685 movups 0x20($inp),$in2 1686 xorps $in2,$inout2 1687 movups $inout2,0x20($out) # $len was 3, stop store 1688 1689.Lctr32_done: 1690 xorps %xmm0,%xmm0 # clear register bank 1691 xor $key0,$key0 1692 pxor %xmm1,%xmm1 1693 pxor %xmm2,%xmm2 1694 pxor %xmm3,%xmm3 1695 pxor %xmm4,%xmm4 1696 pxor %xmm5,%xmm5 1697___ 1698$code.=<<___ if (!$win64); 1699 pxor %xmm6,%xmm6 1700 pxor %xmm7,%xmm7 1701 movaps %xmm0,0x00(%rsp) # clear stack 1702 pxor 
%xmm8,%xmm8 1703 movaps %xmm0,0x10(%rsp) 1704 pxor %xmm9,%xmm9 1705 movaps %xmm0,0x20(%rsp) 1706 pxor %xmm10,%xmm10 1707 movaps %xmm0,0x30(%rsp) 1708 pxor %xmm11,%xmm11 1709 movaps %xmm0,0x40(%rsp) 1710 pxor %xmm12,%xmm12 1711 movaps %xmm0,0x50(%rsp) 1712 pxor %xmm13,%xmm13 1713 movaps %xmm0,0x60(%rsp) 1714 pxor %xmm14,%xmm14 1715 movaps %xmm0,0x70(%rsp) 1716 pxor %xmm15,%xmm15 1717___ 1718$code.=<<___ if ($win64); 1719 movaps -0xa8($key_),%xmm6 1720 movaps %xmm0,-0xa8($key_) # clear stack 1721 movaps -0x98($key_),%xmm7 1722 movaps %xmm0,-0x98($key_) 1723 movaps -0x88($key_),%xmm8 1724 movaps %xmm0,-0x88($key_) 1725 movaps -0x78($key_),%xmm9 1726 movaps %xmm0,-0x78($key_) 1727 movaps -0x68($key_),%xmm10 1728 movaps %xmm0,-0x68($key_) 1729 movaps -0x58($key_),%xmm11 1730 movaps %xmm0,-0x58($key_) 1731 movaps -0x48($key_),%xmm12 1732 movaps %xmm0,-0x48($key_) 1733 movaps -0x38($key_),%xmm13 1734 movaps %xmm0,-0x38($key_) 1735 movaps -0x28($key_),%xmm14 1736 movaps %xmm0,-0x28($key_) 1737 movaps -0x18($key_),%xmm15 1738 movaps %xmm0,-0x18($key_) 1739 movaps %xmm0,0x00(%rsp) 1740 movaps %xmm0,0x10(%rsp) 1741 movaps %xmm0,0x20(%rsp) 1742 movaps %xmm0,0x30(%rsp) 1743 movaps %xmm0,0x40(%rsp) 1744 movaps %xmm0,0x50(%rsp) 1745 movaps %xmm0,0x60(%rsp) 1746 movaps %xmm0,0x70(%rsp) 1747___ 1748$code.=<<___; 1749 mov -8($key_),%rbp 1750.cfi_restore %rbp 1751 lea ($key_),%rsp 1752.cfi_def_cfa_register %rsp 1753.Lctr32_epilogue: 1754 ret 1755.cfi_endproc 1756.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks 1757___ 1758} 1759 1760###################################################################### 1761# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, 1762# const AES_KEY *key1, const AES_KEY *key2, 1763# const unsigned char iv[16]); 1764# 1765{ 1766my @tweak=map("%xmm$_",(10..15)); 1767my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]); 1768my ($key2,$ivp,$len_)=("%r8","%r9","%r9"); 1769my $frame_size = 0x70 + ($win64?160:0); 1770my $key_ = 
"%rbp"; # override so that we can use %r11 as FP 1771 1772$code.=<<___; 1773.globl aesni_xts_encrypt 1774.type aesni_xts_encrypt,\@function,6 1775.align 16 1776aesni_xts_encrypt: 1777.cfi_startproc 1778 lea (%rsp),%r11 # frame pointer 1779.cfi_def_cfa_register %r11 1780 push %rbp 1781.cfi_push %rbp 1782 sub \$$frame_size,%rsp 1783 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 1784___ 1785$code.=<<___ if ($win64); 1786 movaps %xmm6,-0xa8(%r11) # offload everything 1787 movaps %xmm7,-0x98(%r11) 1788 movaps %xmm8,-0x88(%r11) 1789 movaps %xmm9,-0x78(%r11) 1790 movaps %xmm10,-0x68(%r11) 1791 movaps %xmm11,-0x58(%r11) 1792 movaps %xmm12,-0x48(%r11) 1793 movaps %xmm13,-0x38(%r11) 1794 movaps %xmm14,-0x28(%r11) 1795 movaps %xmm15,-0x18(%r11) 1796.Lxts_enc_body: 1797___ 1798$code.=<<___; 1799 movups ($ivp),$inout0 # load clear-text tweak 1800 mov 240(%r8),$rounds # key2->rounds 1801 mov 240($key),$rnds_ # key1->rounds 1802___ 1803 # generate the tweak 1804 &aesni_generate1("enc",$key2,$rounds,$inout0); 1805$code.=<<___; 1806 $movkey ($key),$rndkey0 # zero round key 1807 mov $key,$key_ # backup $key 1808 mov $rnds_,$rounds # backup $rounds 1809 shl \$4,$rnds_ 1810 mov $len,$len_ # backup $len 1811 and \$-16,$len 1812 1813 $movkey 16($key,$rnds_),$rndkey1 # last round key 1814 1815 movdqa .Lxts_magic(%rip),$twmask 1816 movdqa $inout0,@tweak[5] 1817 pshufd \$0x5f,$inout0,$twres 1818 pxor $rndkey0,$rndkey1 1819___ 1820 # alternative tweak calculation algorithm is based on suggestions 1821 # by Shay Gueron. psrad doesn't conflict with AES-NI instructions 1822 # and should help in the future... 
1823 for ($i=0;$i<4;$i++) { 1824 $code.=<<___; 1825 movdqa $twres,$twtmp 1826 paddd $twres,$twres 1827 movdqa @tweak[5],@tweak[$i] 1828 psrad \$31,$twtmp # broadcast upper bits 1829 paddq @tweak[5],@tweak[5] 1830 pand $twmask,$twtmp 1831 pxor $rndkey0,@tweak[$i] 1832 pxor $twtmp,@tweak[5] 1833___ 1834 } 1835$code.=<<___; 1836 movdqa @tweak[5],@tweak[4] 1837 psrad \$31,$twres 1838 paddq @tweak[5],@tweak[5] 1839 pand $twmask,$twres 1840 pxor $rndkey0,@tweak[4] 1841 pxor $twres,@tweak[5] 1842 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] 1843 1844 sub \$16*6,$len 1845 jc .Lxts_enc_short # if $len-=6*16 borrowed 1846 1847 mov \$16+96,$rounds 1848 lea 32($key_,$rnds_),$key # end of key schedule 1849 sub %r10,%rax # twisted $rounds 1850 $movkey 16($key_),$rndkey1 1851 mov %rax,%r10 # backup twisted $rounds 1852 lea .Lxts_magic(%rip),%r8 1853 jmp .Lxts_enc_grandloop 1854 1855.align 32 1856.Lxts_enc_grandloop: 1857 movdqu `16*0`($inp),$inout0 # load input 1858 movdqa $rndkey0,$twmask 1859 movdqu `16*1`($inp),$inout1 1860 pxor @tweak[0],$inout0 # input^=tweak^round[0] 1861 movdqu `16*2`($inp),$inout2 1862 pxor @tweak[1],$inout1 1863 aesenc $rndkey1,$inout0 1864 movdqu `16*3`($inp),$inout3 1865 pxor @tweak[2],$inout2 1866 aesenc $rndkey1,$inout1 1867 movdqu `16*4`($inp),$inout4 1868 pxor @tweak[3],$inout3 1869 aesenc $rndkey1,$inout2 1870 movdqu `16*5`($inp),$inout5 1871 pxor @tweak[5],$twmask # round[0]^=tweak[5] 1872 movdqa 0x60(%rsp),$twres # load round[0]^round[last] 1873 pxor @tweak[4],$inout4 1874 aesenc $rndkey1,$inout3 1875 $movkey 32($key_),$rndkey0 1876 lea `16*6`($inp),$inp 1877 pxor $twmask,$inout5 1878 1879 pxor $twres,@tweak[0] # calculate tweaks^round[last] 1880 aesenc $rndkey1,$inout4 1881 pxor $twres,@tweak[1] 1882 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last] 1883 aesenc $rndkey1,$inout5 1884 $movkey 48($key_),$rndkey1 1885 pxor $twres,@tweak[2] 1886 1887 aesenc $rndkey0,$inout0 1888 pxor $twres,@tweak[3] 1889 movdqa 
@tweak[1],`16*1`(%rsp) 1890 aesenc $rndkey0,$inout1 1891 pxor $twres,@tweak[4] 1892 movdqa @tweak[2],`16*2`(%rsp) 1893 aesenc $rndkey0,$inout2 1894 aesenc $rndkey0,$inout3 1895 pxor $twres,$twmask 1896 movdqa @tweak[4],`16*4`(%rsp) 1897 aesenc $rndkey0,$inout4 1898 aesenc $rndkey0,$inout5 1899 $movkey 64($key_),$rndkey0 1900 movdqa $twmask,`16*5`(%rsp) 1901 pshufd \$0x5f,@tweak[5],$twres 1902 jmp .Lxts_enc_loop6 1903.align 32 1904.Lxts_enc_loop6: 1905 aesenc $rndkey1,$inout0 1906 aesenc $rndkey1,$inout1 1907 aesenc $rndkey1,$inout2 1908 aesenc $rndkey1,$inout3 1909 aesenc $rndkey1,$inout4 1910 aesenc $rndkey1,$inout5 1911 $movkey -64($key,%rax),$rndkey1 1912 add \$32,%rax 1913 1914 aesenc $rndkey0,$inout0 1915 aesenc $rndkey0,$inout1 1916 aesenc $rndkey0,$inout2 1917 aesenc $rndkey0,$inout3 1918 aesenc $rndkey0,$inout4 1919 aesenc $rndkey0,$inout5 1920 $movkey -80($key,%rax),$rndkey0 1921 jnz .Lxts_enc_loop6 1922 1923 movdqa (%r8),$twmask # start calculating next tweak 1924 movdqa $twres,$twtmp 1925 paddd $twres,$twres 1926 aesenc $rndkey1,$inout0 1927 paddq @tweak[5],@tweak[5] 1928 psrad \$31,$twtmp 1929 aesenc $rndkey1,$inout1 1930 pand $twmask,$twtmp 1931 $movkey ($key_),@tweak[0] # load round[0] 1932 aesenc $rndkey1,$inout2 1933 aesenc $rndkey1,$inout3 1934 aesenc $rndkey1,$inout4 1935 pxor $twtmp,@tweak[5] 1936 movaps @tweak[0],@tweak[1] # copy round[0] 1937 aesenc $rndkey1,$inout5 1938 $movkey -64($key),$rndkey1 1939 1940 movdqa $twres,$twtmp 1941 aesenc $rndkey0,$inout0 1942 paddd $twres,$twres 1943 pxor @tweak[5],@tweak[0] 1944 aesenc $rndkey0,$inout1 1945 psrad \$31,$twtmp 1946 paddq @tweak[5],@tweak[5] 1947 aesenc $rndkey0,$inout2 1948 aesenc $rndkey0,$inout3 1949 pand $twmask,$twtmp 1950 movaps @tweak[1],@tweak[2] 1951 aesenc $rndkey0,$inout4 1952 pxor $twtmp,@tweak[5] 1953 movdqa $twres,$twtmp 1954 aesenc $rndkey0,$inout5 1955 $movkey -48($key),$rndkey0 1956 1957 paddd $twres,$twres 1958 aesenc $rndkey1,$inout0 1959 pxor @tweak[5],@tweak[1] 1960 psrad 
\$31,$twtmp 1961 aesenc $rndkey1,$inout1 1962 paddq @tweak[5],@tweak[5] 1963 pand $twmask,$twtmp 1964 aesenc $rndkey1,$inout2 1965 aesenc $rndkey1,$inout3 1966 movdqa @tweak[3],`16*3`(%rsp) 1967 pxor $twtmp,@tweak[5] 1968 aesenc $rndkey1,$inout4 1969 movaps @tweak[2],@tweak[3] 1970 movdqa $twres,$twtmp 1971 aesenc $rndkey1,$inout5 1972 $movkey -32($key),$rndkey1 1973 1974 paddd $twres,$twres 1975 aesenc $rndkey0,$inout0 1976 pxor @tweak[5],@tweak[2] 1977 psrad \$31,$twtmp 1978 aesenc $rndkey0,$inout1 1979 paddq @tweak[5],@tweak[5] 1980 pand $twmask,$twtmp 1981 aesenc $rndkey0,$inout2 1982 aesenc $rndkey0,$inout3 1983 aesenc $rndkey0,$inout4 1984 pxor $twtmp,@tweak[5] 1985 movaps @tweak[3],@tweak[4] 1986 aesenc $rndkey0,$inout5 1987 1988 movdqa $twres,$rndkey0 1989 paddd $twres,$twres 1990 aesenc $rndkey1,$inout0 1991 pxor @tweak[5],@tweak[3] 1992 psrad \$31,$rndkey0 1993 aesenc $rndkey1,$inout1 1994 paddq @tweak[5],@tweak[5] 1995 pand $twmask,$rndkey0 1996 aesenc $rndkey1,$inout2 1997 aesenc $rndkey1,$inout3 1998 pxor $rndkey0,@tweak[5] 1999 $movkey ($key_),$rndkey0 2000 aesenc $rndkey1,$inout4 2001 aesenc $rndkey1,$inout5 2002 $movkey 16($key_),$rndkey1 2003 2004 pxor @tweak[5],@tweak[4] 2005 aesenclast `16*0`(%rsp),$inout0 2006 psrad \$31,$twres 2007 paddq @tweak[5],@tweak[5] 2008 aesenclast `16*1`(%rsp),$inout1 2009 aesenclast `16*2`(%rsp),$inout2 2010 pand $twmask,$twres 2011 mov %r10,%rax # restore $rounds 2012 aesenclast `16*3`(%rsp),$inout3 2013 aesenclast `16*4`(%rsp),$inout4 2014 aesenclast `16*5`(%rsp),$inout5 2015 pxor $twres,@tweak[5] 2016 2017 lea `16*6`($out),$out # $out+=6*16 2018 movups $inout0,`-16*6`($out) # store 6 output blocks 2019 movups $inout1,`-16*5`($out) 2020 movups $inout2,`-16*4`($out) 2021 movups $inout3,`-16*3`($out) 2022 movups $inout4,`-16*2`($out) 2023 movups $inout5,`-16*1`($out) 2024 sub \$16*6,$len 2025 jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow 2026 2027 mov \$16+96,$rounds 2028 sub $rnds_,$rounds 2029 mov 
$key_,$key # restore $key 2030 shr \$4,$rounds # restore original value 2031 2032.Lxts_enc_short: 2033 # at the point @tweak[0..5] are populated with tweak values 2034 mov $rounds,$rnds_ # backup $rounds 2035 pxor $rndkey0,@tweak[0] 2036 add \$16*6,$len # restore real remaining $len 2037 jz .Lxts_enc_done # done if ($len==0) 2038 2039 pxor $rndkey0,@tweak[1] 2040 cmp \$0x20,$len 2041 jb .Lxts_enc_one # $len is 1*16 2042 pxor $rndkey0,@tweak[2] 2043 je .Lxts_enc_two # $len is 2*16 2044 2045 pxor $rndkey0,@tweak[3] 2046 cmp \$0x40,$len 2047 jb .Lxts_enc_three # $len is 3*16 2048 pxor $rndkey0,@tweak[4] 2049 je .Lxts_enc_four # $len is 4*16 2050 2051 movdqu ($inp),$inout0 # $len is 5*16 2052 movdqu 16*1($inp),$inout1 2053 movdqu 16*2($inp),$inout2 2054 pxor @tweak[0],$inout0 2055 movdqu 16*3($inp),$inout3 2056 pxor @tweak[1],$inout1 2057 movdqu 16*4($inp),$inout4 2058 lea 16*5($inp),$inp # $inp+=5*16 2059 pxor @tweak[2],$inout2 2060 pxor @tweak[3],$inout3 2061 pxor @tweak[4],$inout4 2062 pxor $inout5,$inout5 2063 2064 call _aesni_encrypt6 2065 2066 xorps @tweak[0],$inout0 2067 movdqa @tweak[5],@tweak[0] 2068 xorps @tweak[1],$inout1 2069 xorps @tweak[2],$inout2 2070 movdqu $inout0,($out) # store 5 output blocks 2071 xorps @tweak[3],$inout3 2072 movdqu $inout1,16*1($out) 2073 xorps @tweak[4],$inout4 2074 movdqu $inout2,16*2($out) 2075 movdqu $inout3,16*3($out) 2076 movdqu $inout4,16*4($out) 2077 lea 16*5($out),$out # $out+=5*16 2078 jmp .Lxts_enc_done 2079 2080.align 16 2081.Lxts_enc_one: 2082 movups ($inp),$inout0 2083 lea 16*1($inp),$inp # inp+=1*16 2084 xorps @tweak[0],$inout0 2085___ 2086 &aesni_generate1("enc",$key,$rounds); 2087$code.=<<___; 2088 xorps @tweak[0],$inout0 2089 movdqa @tweak[1],@tweak[0] 2090 movups $inout0,($out) # store one output block 2091 lea 16*1($out),$out # $out+=1*16 2092 jmp .Lxts_enc_done 2093 2094.align 16 2095.Lxts_enc_two: 2096 movups ($inp),$inout0 2097 movups 16($inp),$inout1 2098 lea 32($inp),$inp # $inp+=2*16 2099 xorps 
@tweak[0],$inout0 2100 xorps @tweak[1],$inout1 2101 2102 call _aesni_encrypt2 2103 2104 xorps @tweak[0],$inout0 2105 movdqa @tweak[2],@tweak[0] 2106 xorps @tweak[1],$inout1 2107 movups $inout0,($out) # store 2 output blocks 2108 movups $inout1,16*1($out) 2109 lea 16*2($out),$out # $out+=2*16 2110 jmp .Lxts_enc_done 2111 2112.align 16 2113.Lxts_enc_three: 2114 movups ($inp),$inout0 2115 movups 16*1($inp),$inout1 2116 movups 16*2($inp),$inout2 2117 lea 16*3($inp),$inp # $inp+=3*16 2118 xorps @tweak[0],$inout0 2119 xorps @tweak[1],$inout1 2120 xorps @tweak[2],$inout2 2121 2122 call _aesni_encrypt3 2123 2124 xorps @tweak[0],$inout0 2125 movdqa @tweak[3],@tweak[0] 2126 xorps @tweak[1],$inout1 2127 xorps @tweak[2],$inout2 2128 movups $inout0,($out) # store 3 output blocks 2129 movups $inout1,16*1($out) 2130 movups $inout2,16*2($out) 2131 lea 16*3($out),$out # $out+=3*16 2132 jmp .Lxts_enc_done 2133 2134.align 16 2135.Lxts_enc_four: 2136 movups ($inp),$inout0 2137 movups 16*1($inp),$inout1 2138 movups 16*2($inp),$inout2 2139 xorps @tweak[0],$inout0 2140 movups 16*3($inp),$inout3 2141 lea 16*4($inp),$inp # $inp+=4*16 2142 xorps @tweak[1],$inout1 2143 xorps @tweak[2],$inout2 2144 xorps @tweak[3],$inout3 2145 2146 call _aesni_encrypt4 2147 2148 pxor @tweak[0],$inout0 2149 movdqa @tweak[4],@tweak[0] 2150 pxor @tweak[1],$inout1 2151 pxor @tweak[2],$inout2 2152 movdqu $inout0,($out) # store 4 output blocks 2153 pxor @tweak[3],$inout3 2154 movdqu $inout1,16*1($out) 2155 movdqu $inout2,16*2($out) 2156 movdqu $inout3,16*3($out) 2157 lea 16*4($out),$out # $out+=4*16 2158 jmp .Lxts_enc_done 2159 2160.align 16 2161.Lxts_enc_done: 2162 and \$15,$len_ # see if $len%16 is 0 2163 jz .Lxts_enc_ret 2164 mov $len_,$len 2165 2166.Lxts_enc_steal: 2167 movzb ($inp),%eax # borrow $rounds ... 2168 movzb -16($out),%ecx # ... 
and $key 2169 lea 1($inp),$inp 2170 mov %al,-16($out) 2171 mov %cl,0($out) 2172 lea 1($out),$out 2173 sub \$1,$len 2174 jnz .Lxts_enc_steal 2175 2176 sub $len_,$out # rewind $out 2177 mov $key_,$key # restore $key 2178 mov $rnds_,$rounds # restore $rounds 2179 2180 movups -16($out),$inout0 2181 xorps @tweak[0],$inout0 2182___ 2183 &aesni_generate1("enc",$key,$rounds); 2184$code.=<<___; 2185 xorps @tweak[0],$inout0 2186 movups $inout0,-16($out) 2187 2188.Lxts_enc_ret: 2189 xorps %xmm0,%xmm0 # clear register bank 2190 pxor %xmm1,%xmm1 2191 pxor %xmm2,%xmm2 2192 pxor %xmm3,%xmm3 2193 pxor %xmm4,%xmm4 2194 pxor %xmm5,%xmm5 2195___ 2196$code.=<<___ if (!$win64); 2197 pxor %xmm6,%xmm6 2198 pxor %xmm7,%xmm7 2199 movaps %xmm0,0x00(%rsp) # clear stack 2200 pxor %xmm8,%xmm8 2201 movaps %xmm0,0x10(%rsp) 2202 pxor %xmm9,%xmm9 2203 movaps %xmm0,0x20(%rsp) 2204 pxor %xmm10,%xmm10 2205 movaps %xmm0,0x30(%rsp) 2206 pxor %xmm11,%xmm11 2207 movaps %xmm0,0x40(%rsp) 2208 pxor %xmm12,%xmm12 2209 movaps %xmm0,0x50(%rsp) 2210 pxor %xmm13,%xmm13 2211 movaps %xmm0,0x60(%rsp) 2212 pxor %xmm14,%xmm14 2213 pxor %xmm15,%xmm15 2214___ 2215$code.=<<___ if ($win64); 2216 movaps -0xa8(%r11),%xmm6 2217 movaps %xmm0,-0xa8(%r11) # clear stack 2218 movaps -0x98(%r11),%xmm7 2219 movaps %xmm0,-0x98(%r11) 2220 movaps -0x88(%r11),%xmm8 2221 movaps %xmm0,-0x88(%r11) 2222 movaps -0x78(%r11),%xmm9 2223 movaps %xmm0,-0x78(%r11) 2224 movaps -0x68(%r11),%xmm10 2225 movaps %xmm0,-0x68(%r11) 2226 movaps -0x58(%r11),%xmm11 2227 movaps %xmm0,-0x58(%r11) 2228 movaps -0x48(%r11),%xmm12 2229 movaps %xmm0,-0x48(%r11) 2230 movaps -0x38(%r11),%xmm13 2231 movaps %xmm0,-0x38(%r11) 2232 movaps -0x28(%r11),%xmm14 2233 movaps %xmm0,-0x28(%r11) 2234 movaps -0x18(%r11),%xmm15 2235 movaps %xmm0,-0x18(%r11) 2236 movaps %xmm0,0x00(%rsp) 2237 movaps %xmm0,0x10(%rsp) 2238 movaps %xmm0,0x20(%rsp) 2239 movaps %xmm0,0x30(%rsp) 2240 movaps %xmm0,0x40(%rsp) 2241 movaps %xmm0,0x50(%rsp) 2242 movaps %xmm0,0x60(%rsp) 2243___ 
# Last piece of aesni_xts_encrypt (its earlier parts precede this point):
# reload %rbp from just below the %r11 frame pointer, reset %rsp to %r11
# and return.
$code.=<<___;
	mov	-8(%r11),%rbp
.cfi_restore	%rbp
	lea	(%r11),%rsp
.cfi_def_cfa_register	%rsp
.Lxts_enc_epilogue:
	ret
.cfi_endproc
.size	aesni_xts_encrypt,.-aesni_xts_encrypt
___

# aesni_xts_decrypt, declared to perlasm as a six-argument function.
# The argument registers are the Perl variables set up earlier in the
# file ($inp, $out, $len, $key, $key2, $ivp); the C prototype itself is
# not visible from this part of the file.
#
# Emitted code, in order:
#   * prologue: entry %rsp is kept in %r11, %rbp is pushed, $frame_size
#     bytes are reserved and %rsp is aligned down to 16 bytes;
#   * the tweak block is loaded from ($ivp) and run through the "enc"
#     sequence generated for $key2;
#   * if $len is not a multiple of 16, 16 is subtracted from it up front,
#     so one full block is left for the tail code at .Lxts_dec_done2;
#   * .Lxts_dec_grandloop handles six blocks per iteration, interleaving
#     the aesdec rounds with the computation of the next six tweaks;
#   * .Lxts_dec_short dispatches on the 0..5 whole blocks that remain;
#   * .Lxts_dec_done/.Lxts_dec_done2/.Lxts_dec_steal deal with a trailing
#     partial block ($len_ & 15) using a byte-swapping loop;
#   * .Lxts_dec_ret wipes the XMM registers and the stack scratch area
#     before returning.
$code.=<<___;
.globl	aesni_xts_decrypt
.type	aesni_xts_decrypt,\@function,6
.align	16
aesni_xts_decrypt:
.cfi_startproc
	lea	(%rsp),%r11	# frame pointer
.cfi_def_cfa_register	%r11
	push	%rbp
.cfi_push	%rbp
	sub	\$$frame_size,%rsp
	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
___
# Win64 builds only: store %xmm6-%xmm15 in the area just below %r11.
# They are reloaded from the same slots after .Lxts_dec_ret.
$code.=<<___ if ($win64);
	movaps	%xmm6,-0xa8(%r11)	# offload everything
	movaps	%xmm7,-0x98(%r11)
	movaps	%xmm8,-0x88(%r11)
	movaps	%xmm9,-0x78(%r11)
	movaps	%xmm10,-0x68(%r11)
	movaps	%xmm11,-0x58(%r11)
	movaps	%xmm12,-0x48(%r11)
	movaps	%xmm13,-0x38(%r11)
	movaps	%xmm14,-0x28(%r11)
	movaps	%xmm15,-0x18(%r11)
.Lxts_dec_body:
___
# Load the tweak and both round counts (read from offset 240 of each key).
$code.=<<___;
	movups	($ivp),$inout0	# load clear-text tweak
	mov	240($key2),$rounds	# key2->rounds
	mov	240($key),$rnds_	# key1->rounds
___
	# generate the tweak
	# NOTE(review): aesni_generate1 is defined elsewhere in this file; it
	# is asked here for an "enc" sequence over $key2/$rounds acting on
	# $inout0 -- presumably a single-block encryption, confirm against
	# its definition.
	&aesni_generate1("enc",$key2,$rounds,$inout0);
# Set up for the bulk loop: adjust $len, back up $key/$rounds/$len, load
# round key 0 and the last round key, and seed the tweak registers.
$code.=<<___;
	xor	%eax,%eax	# if ($len%16) len-=16;
	test	\$15,$len
	setnz	%al
	shl	\$4,%rax
	sub	%rax,$len

	$movkey	($key),$rndkey0	# zero round key
	mov	$key,$key_	# backup $key
	mov	$rnds_,$rounds	# backup $rounds
	shl	\$4,$rnds_
	mov	$len,$len_	# backup $len
	and	\$-16,$len

	$movkey	16($key,$rnds_),$rndkey1	# last round key

	movdqa	.Lxts_magic(%rip),$twmask
	movdqa	$inout0,@tweak[5]
	pshufd	\$0x5f,$inout0,$twres
	pxor	$rndkey0,$rndkey1
___
    # Emit the tweak-update sequence four times, once for each of
    # @tweak[0..3].  Each copy snapshots the running tweak held in
    # @tweak[5] into @tweak[$i] (XORed with round key 0 from $rndkey0),
    # then doubles @tweak[5] with paddq and XORs in the carry bits
    # selected by $twmask (loaded from .Lxts_magic above).
    for ($i=0;$i<4;$i++) {
    $code.=<<___;
	movdqa	$twres,$twtmp
	paddd	$twres,$twres
	movdqa	@tweak[5],@tweak[$i]
	psrad	\$31,$twtmp	# broadcast upper bits
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	pxor	$rndkey0,@tweak[$i]
	pxor	$twtmp,@tweak[5]
___
    }
# Fifth tweak (@tweak[4]) and the next running value in @tweak[5], then
# the six-blocks-per-iteration main loop and the 1..5 block short paths.
# This heredoc ends inside .Lxts_dec_one, where a single-block "dec"
# sequence is spliced in.
$code.=<<___;
	movdqa	@tweak[5],@tweak[4]
	psrad	\$31,$twres
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twres
	pxor	$rndkey0,@tweak[4]
	pxor	$twres,@tweak[5]
	movaps	$rndkey1,0x60(%rsp)	# save round[0]^round[last]

	sub	\$16*6,$len
	jc	.Lxts_dec_short	# if $len-=6*16 borrowed

	mov	\$16+96,$rounds
	lea	32($key_,$rnds_),$key	# end of key schedule
	sub	%r10,%rax	# twisted $rounds
	$movkey	16($key_),$rndkey1
	mov	%rax,%r10	# backup twisted $rounds
	lea	.Lxts_magic(%rip),%r8
	jmp	.Lxts_dec_grandloop

.align	32
.Lxts_dec_grandloop:
	movdqu	`16*0`($inp),$inout0	# load input
	movdqa	$rndkey0,$twmask
	movdqu	`16*1`($inp),$inout1
	pxor	@tweak[0],$inout0	# input^=tweak^round[0]
	movdqu	`16*2`($inp),$inout2
	pxor	@tweak[1],$inout1
	aesdec	$rndkey1,$inout0
	movdqu	`16*3`($inp),$inout3
	pxor	@tweak[2],$inout2
	aesdec	$rndkey1,$inout1
	movdqu	`16*4`($inp),$inout4
	pxor	@tweak[3],$inout3
	aesdec	$rndkey1,$inout2
	movdqu	`16*5`($inp),$inout5
	pxor	@tweak[5],$twmask	# round[0]^=tweak[5]
	movdqa	0x60(%rsp),$twres	# load round[0]^round[last]
	pxor	@tweak[4],$inout4
	aesdec	$rndkey1,$inout3
	$movkey	32($key_),$rndkey0
	lea	`16*6`($inp),$inp
	pxor	$twmask,$inout5

	pxor	$twres,@tweak[0]	# calculate tweaks^round[last]
	aesdec	$rndkey1,$inout4
	pxor	$twres,@tweak[1]
	movdqa	@tweak[0],`16*0`(%rsp)	# put aside tweaks^last round key
	aesdec	$rndkey1,$inout5
	$movkey	48($key_),$rndkey1
	pxor	$twres,@tweak[2]

	aesdec	$rndkey0,$inout0
	pxor	$twres,@tweak[3]
	movdqa	@tweak[1],`16*1`(%rsp)
	aesdec	$rndkey0,$inout1
	pxor	$twres,@tweak[4]
	movdqa	@tweak[2],`16*2`(%rsp)
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	pxor	$twres,$twmask
	movdqa	@tweak[4],`16*4`(%rsp)
	aesdec	$rndkey0,$inout4
	aesdec	$rndkey0,$inout5
	$movkey	64($key_),$rndkey0
	movdqa	$twmask,`16*5`(%rsp)
	pshufd	\$0x5f,@tweak[5],$twres
	jmp	.Lxts_dec_loop6
.align	32
.Lxts_dec_loop6:
	aesdec	$rndkey1,$inout0
	aesdec	$rndkey1,$inout1
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	aesdec	$rndkey1,$inout4
	aesdec	$rndkey1,$inout5
	$movkey	-64($key,%rax),$rndkey1
	add	\$32,%rax

	aesdec	$rndkey0,$inout0
	aesdec	$rndkey0,$inout1
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	aesdec	$rndkey0,$inout4
	aesdec	$rndkey0,$inout5
	$movkey	-80($key,%rax),$rndkey0
	jnz	.Lxts_dec_loop6

	movdqa	(%r8),$twmask	# start calculating next tweak
	movdqa	$twres,$twtmp
	paddd	$twres,$twres
	aesdec	$rndkey1,$inout0
	paddq	@tweak[5],@tweak[5]
	psrad	\$31,$twtmp
	aesdec	$rndkey1,$inout1
	pand	$twmask,$twtmp
	$movkey	($key_),@tweak[0]	# load round[0]
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	aesdec	$rndkey1,$inout4
	pxor	$twtmp,@tweak[5]
	movaps	@tweak[0],@tweak[1]	# copy round[0]
	aesdec	$rndkey1,$inout5
	$movkey	-64($key),$rndkey1

	movdqa	$twres,$twtmp
	aesdec	$rndkey0,$inout0
	paddd	$twres,$twres
	pxor	@tweak[5],@tweak[0]
	aesdec	$rndkey0,$inout1
	psrad	\$31,$twtmp
	paddq	@tweak[5],@tweak[5]
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	pand	$twmask,$twtmp
	movaps	@tweak[1],@tweak[2]
	aesdec	$rndkey0,$inout4
	pxor	$twtmp,@tweak[5]
	movdqa	$twres,$twtmp
	aesdec	$rndkey0,$inout5
	$movkey	-48($key),$rndkey0

	paddd	$twres,$twres
	aesdec	$rndkey1,$inout0
	pxor	@tweak[5],@tweak[1]
	psrad	\$31,$twtmp
	aesdec	$rndkey1,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	movdqa	@tweak[3],`16*3`(%rsp)
	pxor	$twtmp,@tweak[5]
	aesdec	$rndkey1,$inout4
	movaps	@tweak[2],@tweak[3]
	movdqa	$twres,$twtmp
	aesdec	$rndkey1,$inout5
	$movkey	-32($key),$rndkey1

	paddd	$twres,$twres
	aesdec	$rndkey0,$inout0
	pxor	@tweak[5],@tweak[2]
	psrad	\$31,$twtmp
	aesdec	$rndkey0,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	aesdec	$rndkey0,$inout4
	pxor	$twtmp,@tweak[5]
	movaps	@tweak[3],@tweak[4]
	aesdec	$rndkey0,$inout5

	movdqa	$twres,$rndkey0
	paddd	$twres,$twres
	aesdec	$rndkey1,$inout0
	pxor	@tweak[5],@tweak[3]
	psrad	\$31,$rndkey0
	aesdec	$rndkey1,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$rndkey0
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	pxor	$rndkey0,@tweak[5]
	$movkey	($key_),$rndkey0
	aesdec	$rndkey1,$inout4
	aesdec	$rndkey1,$inout5
	$movkey	16($key_),$rndkey1

	pxor	@tweak[5],@tweak[4]
	aesdeclast	`16*0`(%rsp),$inout0
	psrad	\$31,$twres
	paddq	@tweak[5],@tweak[5]
	aesdeclast	`16*1`(%rsp),$inout1
	aesdeclast	`16*2`(%rsp),$inout2
	pand	$twmask,$twres
	mov	%r10,%rax	# restore $rounds
	aesdeclast	`16*3`(%rsp),$inout3
	aesdeclast	`16*4`(%rsp),$inout4
	aesdeclast	`16*5`(%rsp),$inout5
	pxor	$twres,@tweak[5]

	lea	`16*6`($out),$out	# $out+=6*16
	movups	$inout0,`-16*6`($out)	# store 6 output blocks
	movups	$inout1,`-16*5`($out)
	movups	$inout2,`-16*4`($out)
	movups	$inout3,`-16*3`($out)
	movups	$inout4,`-16*2`($out)
	movups	$inout5,`-16*1`($out)
	sub	\$16*6,$len
	jnc	.Lxts_dec_grandloop	# loop if $len-=6*16 didn't borrow

	mov	\$16+96,$rounds
	sub	$rnds_,$rounds
	mov	$key_,$key	# restore $key
	shr	\$4,$rounds	# restore original value

.Lxts_dec_short:
	# at the point @tweak[0..5] are populated with tweak values
	mov	$rounds,$rnds_	# backup $rounds
	pxor	$rndkey0,@tweak[0]
	pxor	$rndkey0,@tweak[1]
	add	\$16*6,$len	# restore real remaining $len
	jz	.Lxts_dec_done	# done if ($len==0)

	pxor	$rndkey0,@tweak[2]
	cmp	\$0x20,$len
	jb	.Lxts_dec_one	# $len is 1*16
	pxor	$rndkey0,@tweak[3]
	je	.Lxts_dec_two	# $len is 2*16

	pxor	$rndkey0,@tweak[4]
	cmp	\$0x40,$len
	jb	.Lxts_dec_three	# $len is 3*16
	je	.Lxts_dec_four	# $len is 4*16

	movdqu	($inp),$inout0	# $len is 5*16
	movdqu	16*1($inp),$inout1
	movdqu	16*2($inp),$inout2
	pxor	@tweak[0],$inout0
	movdqu	16*3($inp),$inout3
	pxor	@tweak[1],$inout1
	movdqu	16*4($inp),$inout4
	lea	16*5($inp),$inp	# $inp+=5*16
	pxor	@tweak[2],$inout2
	pxor	@tweak[3],$inout3
	pxor	@tweak[4],$inout4

	call	_aesni_decrypt6

	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	movdqu	$inout0,($out)	# store 5 output blocks
	xorps	@tweak[3],$inout3
	movdqu	$inout1,16*1($out)
	xorps	@tweak[4],$inout4
	movdqu	$inout2,16*2($out)
	pxor	$twtmp,$twtmp
	movdqu	$inout3,16*3($out)
	pcmpgtd	@tweak[5],$twtmp
	movdqu	$inout4,16*4($out)
	lea	16*5($out),$out	# $out+=5*16
	pshufd	\$0x13,$twtmp,@tweak[1]	# $twres
	and	\$15,$len_
	jz	.Lxts_dec_ret

	movdqa	@tweak[5],@tweak[0]
	paddq	@tweak[5],@tweak[5]	# psllq 1,$tweak
	pand	$twmask,@tweak[1]	# isolate carry and residue
	pxor	@tweak[5],@tweak[1]
	jmp	.Lxts_dec_done2

.align	16
.Lxts_dec_one:
	movups	($inp),$inout0
	lea	16*1($inp),$inp	# $inp+=1*16
	xorps	@tweak[0],$inout0
___
	# Single-block "dec" sequence for .Lxts_dec_one.
	# NOTE(review): no data register is passed, so the helper presumably
	# works on $inout0 by default (the surrounding code XORs the tweak
	# into $inout0 before and after) -- confirm against its definition.
	&aesni_generate1("dec",$key,$rounds);
# Rest of .Lxts_dec_one, the two/three/four-block paths, and the start of
# the partial-block tail (.Lxts_dec_done/.Lxts_dec_done2).
$code.=<<___;
	xorps	@tweak[0],$inout0
	movdqa	@tweak[1],@tweak[0]
	movups	$inout0,($out)	# store one output block
	movdqa	@tweak[2],@tweak[1]
	lea	16*1($out),$out	# $out+=1*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_two:
	movups	($inp),$inout0
	movups	16($inp),$inout1
	lea	32($inp),$inp	# $inp+=2*16
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1

	call	_aesni_decrypt2

	xorps	@tweak[0],$inout0
	movdqa	@tweak[2],@tweak[0]
	xorps	@tweak[1],$inout1
	movdqa	@tweak[3],@tweak[1]
	movups	$inout0,($out)	# store 2 output blocks
	movups	$inout1,16*1($out)
	lea	16*2($out),$out	# $out+=2*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_three:
	movups	($inp),$inout0
	movups	16*1($inp),$inout1
	movups	16*2($inp),$inout2
	lea	16*3($inp),$inp	# $inp+=3*16
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2

	call	_aesni_decrypt3

	xorps	@tweak[0],$inout0
	movdqa	@tweak[3],@tweak[0]
	xorps	@tweak[1],$inout1
	movdqa	@tweak[4],@tweak[1]
	xorps	@tweak[2],$inout2
	movups	$inout0,($out)	# store 3 output blocks
	movups	$inout1,16*1($out)
	movups	$inout2,16*2($out)
	lea	16*3($out),$out	# $out+=3*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_four:
	movups	($inp),$inout0
	movups	16*1($inp),$inout1
	movups	16*2($inp),$inout2
	xorps	@tweak[0],$inout0
	movups	16*3($inp),$inout3
	lea	16*4($inp),$inp	# $inp+=4*16
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	xorps	@tweak[3],$inout3

	call	_aesni_decrypt4

	pxor	@tweak[0],$inout0
	movdqa	@tweak[4],@tweak[0]
	pxor	@tweak[1],$inout1
	movdqa	@tweak[5],@tweak[1]
	pxor	@tweak[2],$inout2
	movdqu	$inout0,($out)	# store 4 output blocks
	pxor	@tweak[3],$inout3
	movdqu	$inout1,16*1($out)
	movdqu	$inout2,16*2($out)
	movdqu	$inout3,16*3($out)
	lea	16*4($out),$out	# $out+=4*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_done:
	and	\$15,$len_	# see if $len%16 is 0
	jz	.Lxts_dec_ret
.Lxts_dec_done2:
	mov	$len_,$len
	mov	$key_,$key	# restore $key
	mov	$rnds_,$rounds	# restore $rounds

	movups	($inp),$inout0
	xorps	@tweak[1],$inout0
___
	# Partial-block tail, first pass: the block at ($inp) is processed
	# with @tweak[1] (note: [1], not [0]) and written to ($out).
	&aesni_generate1("dec",$key,$rounds);
# .Lxts_dec_steal: for each of the $len trailing bytes, the byte at
# 16($inp) replaces the byte at ($out) and the old ($out) byte is moved to
# 16($out).  %eax/%ecx are used as byte scratch, hence $key and $rounds
# are restored afterwards.
$code.=<<___;
	xorps	@tweak[1],$inout0
	movups	$inout0,($out)

.Lxts_dec_steal:
	movzb	16($inp),%eax	# borrow $rounds ...
	movzb	($out),%ecx	# ... and $key
	lea	1($inp),$inp
	mov	%al,($out)
	mov	%cl,16($out)
	lea	1($out),$out
	sub	\$1,$len
	jnz	.Lxts_dec_steal

	sub	$len_,$out	# rewind $out
	mov	$key_,$key	# restore $key
	mov	$rnds_,$rounds	# restore $rounds

	movups	($out),$inout0
	xorps	@tweak[0],$inout0
___
	# Partial-block tail, second pass: the reassembled block at ($out)
	# is processed with @tweak[0] and stored back in place.
	&aesni_generate1("dec",$key,$rounds);
# Store the final block, then start wiping: %xmm0-%xmm5 are zeroed here
# on every platform.
$code.=<<___;
	xorps	@tweak[0],$inout0
	movups	$inout0,($out)

.Lxts_dec_ret:
	xorps	%xmm0,%xmm0	# clear register bank
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
# Non-Win64: zero %xmm6-%xmm15 as well, and overwrite the scratch slots
# at 0x00..0x60(%rsp) with the already-zeroed %xmm0.
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	movaps	%xmm0,0x00(%rsp)	# clear stack
	pxor	%xmm8,%xmm8
	movaps	%xmm0,0x10(%rsp)
	pxor	%xmm9,%xmm9
	movaps	%xmm0,0x20(%rsp)
	pxor	%xmm10,%xmm10
	movaps	%xmm0,0x30(%rsp)
	pxor	%xmm11,%xmm11
	movaps	%xmm0,0x40(%rsp)
	pxor	%xmm12,%xmm12
	movaps	%xmm0,0x50(%rsp)
	pxor	%xmm13,%xmm13
	movaps	%xmm0,0x60(%rsp)
	pxor	%xmm14,%xmm14
	pxor	%xmm15,%xmm15
___
# Win64: reload %xmm6-%xmm15 from the slots written in the prologue,
# overwriting each slot with the zeroed %xmm0 right after it is read, then
# wipe the 0x00..0x60(%rsp) scratch slots the same way.
$code.=<<___ if ($win64);
	movaps	-0xa8(%r11),%xmm6
	movaps	%xmm0,-0xa8(%r11)	# clear stack
	movaps	-0x98(%r11),%xmm7
	movaps	%xmm0,-0x98(%r11)
	movaps	-0x88(%r11),%xmm8
	movaps	%xmm0,-0x88(%r11)
	movaps	-0x78(%r11),%xmm9
	movaps	%xmm0,-0x78(%r11)
	movaps	-0x68(%r11),%xmm10
	movaps	%xmm0,-0x68(%r11)
	movaps	-0x58(%r11),%xmm11
	movaps	%xmm0,-0x58(%r11)
	movaps	-0x48(%r11),%xmm12
	movaps	%xmm0,-0x48(%r11)
	movaps	-0x38(%r11),%xmm13
	movaps	%xmm0,-0x38(%r11)
	movaps	-0x28(%r11),%xmm14
	movaps	%xmm0,-0x28(%r11)
	movaps	-0x18(%r11),%xmm15
	movaps	%xmm0,-0x18(%r11)
	movaps	%xmm0,0x00(%rsp)
	movaps	%xmm0,0x10(%rsp)
	movaps	%xmm0,0x20(%rsp)
	movaps	%xmm0,0x30(%rsp)
	movaps	%xmm0,0x40(%rsp)
	movaps	%xmm0,0x50(%rsp)
	movaps	%xmm0,0x60(%rsp)
___
# Common exit: reload %rbp from -8(%r11), reset %rsp to the value saved
# in %r11 at entry, and return.
$code.=<<___;
	mov	-8(%r11),%rbp
.cfi_restore	%rbp
	lea	(%r11),%rsp
.cfi_def_cfa_register	%rsp
.Lxts_dec_epilogue:
	ret
.cfi_endproc
.size	aesni_xts_decrypt,.-aesni_xts_decrypt
___
}

######################################################################
# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
#	const AES_KEY *key, unsigned int start_block_num,
#	unsigned char offset_i[16], const unsigned char L_[][16],
#	unsigned char checksum[16]);
#
{
my @offset=map("%xmm$_",(10..15));
my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
my ($block_num,$offset_p)=("%r8","%r9");		# 5th and 6th arguments
my ($L_p,$checksum_p) = ("%rbx","%rbp");
my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
my $seventh_arg = $win64 ?
56 : 8;	# displacement from the entry %rsp (kept in %rax) to the 7th argument
# The third argument is a block count (see the prototype comment that
# opens this section); $blocks names the same register as $len.
my $blocks = $len;

# aesni_ocb_encrypt, declared to perlasm as a six-argument function; the
# 7th and 8th arguments are fetched from the caller's stack below.
#
# Emitted code, in order:
#   * prologue: entry %rsp is kept in %rax; %rbx, %rbp, %r12-%r14 are
#     pushed;
#   * $L_p and $checksum_p are loaded from the stack, round[0]^round[last]
#     is formed in $rndkey0l, and the stored offset is XORed with
#     round[last];
#   * if the starting block number is even, one block is handled through
#     __ocb_encrypt1 first so that the bulk loop starts on an odd block;
#   * .Locb_enc_grandloop handles six blocks per iteration through
#     __ocb_encrypt6;
#   * .Locb_enc_short handles the remaining 1..5 blocks, zeroing the
#     unused input registers before calling the 4- or 6-block helper;
#   * .Locb_enc_done writes the checksum and the last offset back and
#     wipes the XMM registers.
$code.=<<___;
.globl	aesni_ocb_encrypt
.type	aesni_ocb_encrypt,\@function,6
.align	32
aesni_ocb_encrypt:
.cfi_startproc
	lea	(%rsp),%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
___
# Win64 builds only: reserve 0xa0 bytes and store %xmm6-%xmm15 there.
# They are reloaded from the same slots after .Locb_enc_done.
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp),%rsp
	movaps	%xmm6,0x00(%rsp)	# offload everything
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,0x60(%rsp)
	movaps	%xmm13,0x70(%rsp)
	movaps	%xmm14,0x80(%rsp)
	movaps	%xmm15,0x90(%rsp)
.Locb_enc_body:
___
# Function body proper: argument and key setup, the optional leading
# single block, the six-block main loop, the short tails, and the start
# of the register wipe (%xmm0-%xmm5).
$code.=<<___;
	mov	$seventh_arg(%rax),$L_p	# 7th argument
	mov	$seventh_arg+8(%rax),$checksum_p# 8th argument

	mov	240($key),$rnds_
	mov	$key,$key_
	shl	\$4,$rnds_
	$movkey	($key),$rndkey0l	# round[0]
	$movkey	16($key,$rnds_),$rndkey1	# round[last]

	movdqu	($offset_p),@offset[5]	# load last offset_i
	pxor	$rndkey1,$rndkey0l	# round[0] ^ round[last]
	pxor	$rndkey1,@offset[5]	# offset_i ^ round[last]

	mov	\$16+32,$rounds
	lea	32($key_,$rnds_),$key
	$movkey	16($key_),$rndkey1	# round[1]
	sub	%r10,%rax	# twisted $rounds
	mov	%rax,%r10	# backup twisted $rounds

	movdqu	($L_p),@offset[0]	# L_0 for all odd-numbered blocks
	movdqu	($checksum_p),$checksum	# load checksum

	test	\$1,$block_num	# is first block number odd?
	jnz	.Locb_enc_odd

	bsf	$block_num,$i1
	add	\$1,$block_num
	shl	\$4,$i1
	movdqu	($L_p,$i1),$inout5	# borrow
	movdqu	($inp),$inout0
	lea	16($inp),$inp

	call	__ocb_encrypt1

	movdqa	$inout5,@offset[5]
	movups	$inout0,($out)
	lea	16($out),$out
	sub	\$1,$blocks
	jz	.Locb_enc_done

.Locb_enc_odd:
	lea	1($block_num),$i1	# even-numbered blocks
	lea	3($block_num),$i3
	lea	5($block_num),$i5
	lea	6($block_num),$block_num
	bsf	$i1,$i1	# ntz(block)
	bsf	$i3,$i3
	bsf	$i5,$i5
	shl	\$4,$i1	# ntz(block) -> table offset
	shl	\$4,$i3
	shl	\$4,$i5

	sub	\$6,$blocks
	jc	.Locb_enc_short
	jmp	.Locb_enc_grandloop

.align	32
.Locb_enc_grandloop:
	movdqu	`16*0`($inp),$inout0	# load input
	movdqu	`16*1`($inp),$inout1
	movdqu	`16*2`($inp),$inout2
	movdqu	`16*3`($inp),$inout3
	movdqu	`16*4`($inp),$inout4
	movdqu	`16*5`($inp),$inout5
	lea	`16*6`($inp),$inp

	call	__ocb_encrypt6

	movups	$inout0,`16*0`($out)	# store output
	movups	$inout1,`16*1`($out)
	movups	$inout2,`16*2`($out)
	movups	$inout3,`16*3`($out)
	movups	$inout4,`16*4`($out)
	movups	$inout5,`16*5`($out)
	lea	`16*6`($out),$out
	sub	\$6,$blocks
	jnc	.Locb_enc_grandloop

.Locb_enc_short:
	add	\$6,$blocks
	jz	.Locb_enc_done

	movdqu	`16*0`($inp),$inout0
	cmp	\$2,$blocks
	jb	.Locb_enc_one
	movdqu	`16*1`($inp),$inout1
	je	.Locb_enc_two

	movdqu	`16*2`($inp),$inout2
	cmp	\$4,$blocks
	jb	.Locb_enc_three
	movdqu	`16*3`($inp),$inout3
	je	.Locb_enc_four

	movdqu	`16*4`($inp),$inout4
	pxor	$inout5,$inout5

	call	__ocb_encrypt6

	movdqa	@offset[4],@offset[5]
	movups	$inout0,`16*0`($out)
	movups	$inout1,`16*1`($out)
	movups	$inout2,`16*2`($out)
	movups	$inout3,`16*3`($out)
	movups	$inout4,`16*4`($out)

	jmp	.Locb_enc_done

.align	16
.Locb_enc_one:
	movdqa	@offset[0],$inout5	# borrow

	call	__ocb_encrypt1

	movdqa	$inout5,@offset[5]
	movups	$inout0,`16*0`($out)
	jmp	.Locb_enc_done

.align	16
.Locb_enc_two:
	pxor	$inout2,$inout2
	pxor	$inout3,$inout3

	call	__ocb_encrypt4

	movdqa	@offset[1],@offset[5]
	movups	$inout0,`16*0`($out)
	movups	$inout1,`16*1`($out)

	jmp	.Locb_enc_done

.align	16
.Locb_enc_three:
	pxor	$inout3,$inout3

	call	__ocb_encrypt4

	movdqa	@offset[2],@offset[5]
	movups	$inout0,`16*0`($out)
	movups	$inout1,`16*1`($out)
	movups	$inout2,`16*2`($out)

	jmp	.Locb_enc_done

.align	16
.Locb_enc_four:
	call	__ocb_encrypt4

	movdqa	@offset[3],@offset[5]
	movups	$inout0,`16*0`($out)
	movups	$inout1,`16*1`($out)
	movups	$inout2,`16*2`($out)
	movups	$inout3,`16*3`($out)

.Locb_enc_done:
	pxor	$rndkey0,@offset[5]	# "remove" round[last]
	movdqu	$checksum,($checksum_p)	# store checksum
	movdqu	@offset[5],($offset_p)	# store last offset_i

	xorps	%xmm0,%xmm0	# clear register bank
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
# Non-Win64: zero %xmm6-%xmm15 too, then point %rax just above the five
# pushed registers (0x28 = 5*8 bytes) for the common pop sequence.
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	pxor	%xmm8,%xmm8
	pxor	%xmm9,%xmm9
	pxor	%xmm10,%xmm10
	pxor	%xmm11,%xmm11
	pxor	%xmm12,%xmm12
	pxor	%xmm13,%xmm13
	pxor	%xmm14,%xmm14
	pxor	%xmm15,%xmm15
	lea	0x28(%rsp),%rax
.cfi_def_cfa	%rax,8
___
# Win64: reload %xmm6-%xmm15 from the slots written in the prologue,
# overwriting each slot with the zeroed %xmm0 right after it is read; %rax
# then skips both the 0xa0-byte save area and the five pushed registers.
$code.=<<___ if ($win64);
	movaps	0x00(%rsp),%xmm6
	movaps	%xmm0,0x00(%rsp)	# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	movaps	0x40(%rsp),%xmm10
	movaps	%xmm0,0x40(%rsp)
	movaps	0x50(%rsp),%xmm11
	movaps	%xmm0,0x50(%rsp)
	movaps	0x60(%rsp),%xmm12
	movaps	%xmm0,0x60(%rsp)
	movaps	0x70(%rsp),%xmm13
	movaps	%xmm0,0x70(%rsp)
	movaps	0x80(%rsp),%xmm14
	movaps	%xmm0,0x80(%rsp)
	movaps	0x90(%rsp),%xmm15
	movaps	%xmm0,0x90(%rsp)
	lea	0xa0+0x28(%rsp),%rax
.Locb_enc_pop:
___
# This heredoc carries four things back to back:
#   1. the common exit of aesni_ocb_encrypt: %r14, %r13, %r12, %rbp and
#      %rbx are reloaded from below %rax and %rsp is reset to %rax;
#   2. __ocb_encrypt6, __ocb_encrypt4 and __ocb_encrypt1, internal helpers
#      declared \@abi-omnipotent that work on the registers set up by
#      aesni_ocb_encrypt.  Each one XORs its input blocks into $checksum,
#      derives the per-block offsets, runs the AES rounds, and reloads the
#      twisted round counter from %r10 into %rax before returning;
#        - __ocb_encrypt6 additionally advances $block_num by 6, computes
#          the next $i1/$i3/$i5 table offsets and reloads @offset[0] from
#          ($L_p);
#        - __ocb_encrypt1 takes the L value to apply in $inout5 and leaves
#          the new offset (still XORed with round[last]) in $inout5, which
#          the callers copy into @offset[5];
#   3. the entry sequence of aesni_ocb_decrypt (same register pushes as
#      the encrypt entry point).
$code.=<<___;
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Locb_enc_epilogue:
	ret
.cfi_endproc
.size	aesni_ocb_encrypt,.-aesni_ocb_encrypt

.type	__ocb_encrypt6,\@abi-omnipotent
.align	32
__ocb_encrypt6:
.cfi_startproc
	pxor	$rndkey0l,@offset[5]	# offset_i ^ round[0]
	movdqu	($L_p,$i1),@offset[1]
	movdqa	@offset[0],@offset[2]
	movdqu	($L_p,$i3),@offset[3]
	movdqa	@offset[0],@offset[4]
	pxor	@offset[5],@offset[0]
	movdqu	($L_p,$i5),@offset[5]
	pxor	@offset[0],@offset[1]
	pxor	$inout0,$checksum	# accumulate checksum
	pxor	@offset[0],$inout0	# input ^ round[0] ^ offset_i
	pxor	@offset[1],@offset[2]
	pxor	$inout1,$checksum
	pxor	@offset[1],$inout1
	pxor	@offset[2],@offset[3]
	pxor	$inout2,$checksum
	pxor	@offset[2],$inout2
	pxor	@offset[3],@offset[4]
	pxor	$inout3,$checksum
	pxor	@offset[3],$inout3
	pxor	@offset[4],@offset[5]
	pxor	$inout4,$checksum
	pxor	@offset[4],$inout4
	pxor	$inout5,$checksum
	pxor	@offset[5],$inout5
	$movkey	32($key_),$rndkey0

	lea	1($block_num),$i1	# even-numbered blocks
	lea	3($block_num),$i3
	lea	5($block_num),$i5
	add	\$6,$block_num
	pxor	$rndkey0l,@offset[0]	# offset_i ^ round[last]
	bsf	$i1,$i1	# ntz(block)
	bsf	$i3,$i3
	bsf	$i5,$i5

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	pxor	$rndkey0l,@offset[1]
	pxor	$rndkey0l,@offset[2]
	aesenc	$rndkey1,$inout4
	pxor	$rndkey0l,@offset[3]
	pxor	$rndkey0l,@offset[4]
	aesenc	$rndkey1,$inout5
	$movkey	48($key_),$rndkey1
	pxor	$rndkey0l,@offset[5]

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	$movkey	64($key_),$rndkey0
	shl	\$4,$i1	# ntz(block) -> table offset
	shl	\$4,$i3
	jmp	.Locb_enc_loop6

.align	32
.Locb_enc_loop6:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Locb_enc_loop6

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	$movkey	16($key_),$rndkey1
	shl	\$4,$i5

	aesenclast	@offset[0],$inout0
	movdqu	($L_p),@offset[0]	# L_0 for all odd-numbered blocks
	mov	%r10,%rax	# restore twisted rounds
	aesenclast	@offset[1],$inout1
	aesenclast	@offset[2],$inout2
	aesenclast	@offset[3],$inout3
	aesenclast	@offset[4],$inout4
	aesenclast	@offset[5],$inout5
	ret
.cfi_endproc
.size	__ocb_encrypt6,.-__ocb_encrypt6

.type	__ocb_encrypt4,\@abi-omnipotent
.align	32
__ocb_encrypt4:
.cfi_startproc
	pxor	$rndkey0l,@offset[5]	# offset_i ^ round[0]
	movdqu	($L_p,$i1),@offset[1]
	movdqa	@offset[0],@offset[2]
	movdqu	($L_p,$i3),@offset[3]
	pxor	@offset[5],@offset[0]
	pxor	@offset[0],@offset[1]
	pxor	$inout0,$checksum	# accumulate checksum
	pxor	@offset[0],$inout0	# input ^ round[0] ^ offset_i
	pxor	@offset[1],@offset[2]
	pxor	$inout1,$checksum
	pxor	@offset[1],$inout1
	pxor	@offset[2],@offset[3]
	pxor	$inout2,$checksum
	pxor	@offset[2],$inout2
	pxor	$inout3,$checksum
	pxor	@offset[3],$inout3
	$movkey	32($key_),$rndkey0

	pxor	$rndkey0l,@offset[0]	# offset_i ^ round[last]
	pxor	$rndkey0l,@offset[1]
	pxor	$rndkey0l,@offset[2]
	pxor	$rndkey0l,@offset[3]

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	$movkey	48($key_),$rndkey1

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	$movkey	64($key_),$rndkey0
	jmp	.Locb_enc_loop4

.align	32
.Locb_enc_loop4:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Locb_enc_loop4

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	$movkey	16($key_),$rndkey1
	mov	%r10,%rax	# restore twisted rounds

	aesenclast	@offset[0],$inout0
	aesenclast	@offset[1],$inout1
	aesenclast	@offset[2],$inout2
	aesenclast	@offset[3],$inout3
	ret
.cfi_endproc
.size	__ocb_encrypt4,.-__ocb_encrypt4

.type	__ocb_encrypt1,\@abi-omnipotent
.align	32
__ocb_encrypt1:
.cfi_startproc
	pxor	@offset[5],$inout5	# offset_i
	pxor	$rndkey0l,$inout5	# offset_i ^ round[0]
	pxor	$inout0,$checksum	# accumulate checksum
	pxor	$inout5,$inout0	# input ^ round[0] ^ offset_i
	$movkey	32($key_),$rndkey0

	aesenc	$rndkey1,$inout0
	$movkey	48($key_),$rndkey1
	pxor	$rndkey0l,$inout5	# offset_i ^ round[last]

	aesenc	$rndkey0,$inout0
	$movkey	64($key_),$rndkey0
	jmp	.Locb_enc_loop1

.align	32
.Locb_enc_loop1:
	aesenc	$rndkey1,$inout0
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax

	aesenc	$rndkey0,$inout0
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Locb_enc_loop1

	aesenc	$rndkey1,$inout0
	$movkey	16($key_),$rndkey1	# redundant in tail
	mov	%r10,%rax	# restore twisted rounds

	aesenclast	$inout5,$inout0
	ret
.cfi_endproc
.size	__ocb_encrypt1,.-__ocb_encrypt1

.globl	aesni_ocb_decrypt
.type	aesni_ocb_decrypt,\@function,6
.align	32
aesni_ocb_decrypt:
.cfi_startproc
	lea	(%rsp),%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
___
# Win64 builds only: reserve 0xa0 bytes and store %xmm6-%xmm15 there, the
# same layout as the encrypt entry point uses.
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp),%rsp
	movaps	%xmm6,0x00(%rsp)	# offload everything
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,0x60(%rsp)
	movaps	%xmm13,0x70(%rsp)
	movaps	%xmm14,0x80(%rsp)
	movaps	%xmm15,0x90(%rsp)
.Locb_dec_body:
___
# Start of the aesni_ocb_decrypt body: the same argument and key setup as
# the encrypt side.
$code.=<<___;
	mov	$seventh_arg(%rax),$L_p	# 7th argument
	mov	$seventh_arg+8(%rax),$checksum_p# 8th argument

	mov	240($key),$rnds_
	mov	$key,$key_
	shl	\$4,$rnds_
	$movkey	($key),$rndkey0l	# round[0]
	$movkey	16($key,$rnds_),$rndkey1	# round[last]

	movdqu	($offset_p),@offset[5]	# load last offset_i
	pxor	$rndkey1,$rndkey0l	# round[0] ^ round[last]
	pxor	$rndkey1,@offset[5]	# offset_i ^ round[last]

	mov	\$16+32,$rounds
	lea	32($key_,$rnds_),$key
	$movkey	16($key_),$rndkey1	#
round[1] 3295 sub %r10,%rax # twisted $rounds 3296 mov %rax,%r10 # backup twisted $rounds 3297 3298 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks 3299 movdqu ($checksum_p),$checksum # load checksum 3300 3301 test \$1,$block_num # is first block number odd? 3302 jnz .Locb_dec_odd 3303 3304 bsf $block_num,$i1 3305 add \$1,$block_num 3306 shl \$4,$i1 3307 movdqu ($L_p,$i1),$inout5 # borrow 3308 movdqu ($inp),$inout0 3309 lea 16($inp),$inp 3310 3311 call __ocb_decrypt1 3312 3313 movdqa $inout5,@offset[5] 3314 movups $inout0,($out) 3315 xorps $inout0,$checksum # accumulate checksum 3316 lea 16($out),$out 3317 sub \$1,$blocks 3318 jz .Locb_dec_done 3319 3320.Locb_dec_odd: 3321 lea 1($block_num),$i1 # even-numbered blocks 3322 lea 3($block_num),$i3 3323 lea 5($block_num),$i5 3324 lea 6($block_num),$block_num 3325 bsf $i1,$i1 # ntz(block) 3326 bsf $i3,$i3 3327 bsf $i5,$i5 3328 shl \$4,$i1 # ntz(block) -> table offset 3329 shl \$4,$i3 3330 shl \$4,$i5 3331 3332 sub \$6,$blocks 3333 jc .Locb_dec_short 3334 jmp .Locb_dec_grandloop 3335 3336.align 32 3337.Locb_dec_grandloop: 3338 movdqu `16*0`($inp),$inout0 # load input 3339 movdqu `16*1`($inp),$inout1 3340 movdqu `16*2`($inp),$inout2 3341 movdqu `16*3`($inp),$inout3 3342 movdqu `16*4`($inp),$inout4 3343 movdqu `16*5`($inp),$inout5 3344 lea `16*6`($inp),$inp 3345 3346 call __ocb_decrypt6 3347 3348 movups $inout0,`16*0`($out) # store output 3349 pxor $inout0,$checksum # accumulate checksum 3350 movups $inout1,`16*1`($out) 3351 pxor $inout1,$checksum 3352 movups $inout2,`16*2`($out) 3353 pxor $inout2,$checksum 3354 movups $inout3,`16*3`($out) 3355 pxor $inout3,$checksum 3356 movups $inout4,`16*4`($out) 3357 pxor $inout4,$checksum 3358 movups $inout5,`16*5`($out) 3359 pxor $inout5,$checksum 3360 lea `16*6`($out),$out 3361 sub \$6,$blocks 3362 jnc .Locb_dec_grandloop 3363 3364.Locb_dec_short: 3365 add \$6,$blocks 3366 jz .Locb_dec_done 3367 3368 movdqu `16*0`($inp),$inout0 3369 cmp \$2,$blocks 3370 jb .Locb_dec_one 
3371 movdqu `16*1`($inp),$inout1 3372 je .Locb_dec_two 3373 3374 movdqu `16*2`($inp),$inout2 3375 cmp \$4,$blocks 3376 jb .Locb_dec_three 3377 movdqu `16*3`($inp),$inout3 3378 je .Locb_dec_four 3379 3380 movdqu `16*4`($inp),$inout4 3381 pxor $inout5,$inout5 3382 3383 call __ocb_decrypt6 3384 3385 movdqa @offset[4],@offset[5] 3386 movups $inout0,`16*0`($out) # store output 3387 pxor $inout0,$checksum # accumulate checksum 3388 movups $inout1,`16*1`($out) 3389 pxor $inout1,$checksum 3390 movups $inout2,`16*2`($out) 3391 pxor $inout2,$checksum 3392 movups $inout3,`16*3`($out) 3393 pxor $inout3,$checksum 3394 movups $inout4,`16*4`($out) 3395 pxor $inout4,$checksum 3396 3397 jmp .Locb_dec_done 3398 3399.align 16 3400.Locb_dec_one: 3401 movdqa @offset[0],$inout5 # borrow 3402 3403 call __ocb_decrypt1 3404 3405 movdqa $inout5,@offset[5] 3406 movups $inout0,`16*0`($out) # store output 3407 xorps $inout0,$checksum # accumulate checksum 3408 jmp .Locb_dec_done 3409 3410.align 16 3411.Locb_dec_two: 3412 pxor $inout2,$inout2 3413 pxor $inout3,$inout3 3414 3415 call __ocb_decrypt4 3416 3417 movdqa @offset[1],@offset[5] 3418 movups $inout0,`16*0`($out) # store output 3419 xorps $inout0,$checksum # accumulate checksum 3420 movups $inout1,`16*1`($out) 3421 xorps $inout1,$checksum 3422 3423 jmp .Locb_dec_done 3424 3425.align 16 3426.Locb_dec_three: 3427 pxor $inout3,$inout3 3428 3429 call __ocb_decrypt4 3430 3431 movdqa @offset[2],@offset[5] 3432 movups $inout0,`16*0`($out) # store output 3433 xorps $inout0,$checksum # accumulate checksum 3434 movups $inout1,`16*1`($out) 3435 xorps $inout1,$checksum 3436 movups $inout2,`16*2`($out) 3437 xorps $inout2,$checksum 3438 3439 jmp .Locb_dec_done 3440 3441.align 16 3442.Locb_dec_four: 3443 call __ocb_decrypt4 3444 3445 movdqa @offset[3],@offset[5] 3446 movups $inout0,`16*0`($out) # store output 3447 pxor $inout0,$checksum # accumulate checksum 3448 movups $inout1,`16*1`($out) 3449 pxor $inout1,$checksum 3450 movups $inout2,`16*2`($out) 
3451 pxor $inout2,$checksum 3452 movups $inout3,`16*3`($out) 3453 pxor $inout3,$checksum 3454 3455.Locb_dec_done: 3456 pxor $rndkey0,@offset[5] # "remove" round[last] 3457 movdqu $checksum,($checksum_p) # store checksum 3458 movdqu @offset[5],($offset_p) # store last offset_i 3459 3460 xorps %xmm0,%xmm0 # clear register bank 3461 pxor %xmm1,%xmm1 3462 pxor %xmm2,%xmm2 3463 pxor %xmm3,%xmm3 3464 pxor %xmm4,%xmm4 3465 pxor %xmm5,%xmm5 3466___ 3467$code.=<<___ if (!$win64); 3468 pxor %xmm6,%xmm6 3469 pxor %xmm7,%xmm7 3470 pxor %xmm8,%xmm8 3471 pxor %xmm9,%xmm9 3472 pxor %xmm10,%xmm10 3473 pxor %xmm11,%xmm11 3474 pxor %xmm12,%xmm12 3475 pxor %xmm13,%xmm13 3476 pxor %xmm14,%xmm14 3477 pxor %xmm15,%xmm15 3478 lea 0x28(%rsp),%rax 3479.cfi_def_cfa %rax,8 3480___ 3481$code.=<<___ if ($win64); 3482 movaps 0x00(%rsp),%xmm6 3483 movaps %xmm0,0x00(%rsp) # clear stack 3484 movaps 0x10(%rsp),%xmm7 3485 movaps %xmm0,0x10(%rsp) 3486 movaps 0x20(%rsp),%xmm8 3487 movaps %xmm0,0x20(%rsp) 3488 movaps 0x30(%rsp),%xmm9 3489 movaps %xmm0,0x30(%rsp) 3490 movaps 0x40(%rsp),%xmm10 3491 movaps %xmm0,0x40(%rsp) 3492 movaps 0x50(%rsp),%xmm11 3493 movaps %xmm0,0x50(%rsp) 3494 movaps 0x60(%rsp),%xmm12 3495 movaps %xmm0,0x60(%rsp) 3496 movaps 0x70(%rsp),%xmm13 3497 movaps %xmm0,0x70(%rsp) 3498 movaps 0x80(%rsp),%xmm14 3499 movaps %xmm0,0x80(%rsp) 3500 movaps 0x90(%rsp),%xmm15 3501 movaps %xmm0,0x90(%rsp) 3502 lea 0xa0+0x28(%rsp),%rax 3503.Locb_dec_pop: 3504___ 3505$code.=<<___; 3506 mov -40(%rax),%r14 3507.cfi_restore %r14 3508 mov -32(%rax),%r13 3509.cfi_restore %r13 3510 mov -24(%rax),%r12 3511.cfi_restore %r12 3512 mov -16(%rax),%rbp 3513.cfi_restore %rbp 3514 mov -8(%rax),%rbx 3515.cfi_restore %rbx 3516 lea (%rax),%rsp 3517.cfi_def_cfa_register %rsp 3518.Locb_dec_epilogue: 3519 ret 3520.cfi_endproc 3521.size aesni_ocb_decrypt,.-aesni_ocb_decrypt 3522 3523.type __ocb_decrypt6,\@abi-omnipotent 3524.align 32 3525__ocb_decrypt6: 3526.cfi_startproc 3527 pxor $rndkey0l,@offset[5] # offset_i ^ 
round[0] 3528 movdqu ($L_p,$i1),@offset[1] 3529 movdqa @offset[0],@offset[2] 3530 movdqu ($L_p,$i3),@offset[3] 3531 movdqa @offset[0],@offset[4] 3532 pxor @offset[5],@offset[0] 3533 movdqu ($L_p,$i5),@offset[5] 3534 pxor @offset[0],@offset[1] 3535 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i 3536 pxor @offset[1],@offset[2] 3537 pxor @offset[1],$inout1 3538 pxor @offset[2],@offset[3] 3539 pxor @offset[2],$inout2 3540 pxor @offset[3],@offset[4] 3541 pxor @offset[3],$inout3 3542 pxor @offset[4],@offset[5] 3543 pxor @offset[4],$inout4 3544 pxor @offset[5],$inout5 3545 $movkey 32($key_),$rndkey0 3546 3547 lea 1($block_num),$i1 # even-numbered blocks 3548 lea 3($block_num),$i3 3549 lea 5($block_num),$i5 3550 add \$6,$block_num 3551 pxor $rndkey0l,@offset[0] # offset_i ^ round[last] 3552 bsf $i1,$i1 # ntz(block) 3553 bsf $i3,$i3 3554 bsf $i5,$i5 3555 3556 aesdec $rndkey1,$inout0 3557 aesdec $rndkey1,$inout1 3558 aesdec $rndkey1,$inout2 3559 aesdec $rndkey1,$inout3 3560 pxor $rndkey0l,@offset[1] 3561 pxor $rndkey0l,@offset[2] 3562 aesdec $rndkey1,$inout4 3563 pxor $rndkey0l,@offset[3] 3564 pxor $rndkey0l,@offset[4] 3565 aesdec $rndkey1,$inout5 3566 $movkey 48($key_),$rndkey1 3567 pxor $rndkey0l,@offset[5] 3568 3569 aesdec $rndkey0,$inout0 3570 aesdec $rndkey0,$inout1 3571 aesdec $rndkey0,$inout2 3572 aesdec $rndkey0,$inout3 3573 aesdec $rndkey0,$inout4 3574 aesdec $rndkey0,$inout5 3575 $movkey 64($key_),$rndkey0 3576 shl \$4,$i1 # ntz(block) -> table offset 3577 shl \$4,$i3 3578 jmp .Locb_dec_loop6 3579 3580.align 32 3581.Locb_dec_loop6: 3582 aesdec $rndkey1,$inout0 3583 aesdec $rndkey1,$inout1 3584 aesdec $rndkey1,$inout2 3585 aesdec $rndkey1,$inout3 3586 aesdec $rndkey1,$inout4 3587 aesdec $rndkey1,$inout5 3588 $movkey ($key,%rax),$rndkey1 3589 add \$32,%rax 3590 3591 aesdec $rndkey0,$inout0 3592 aesdec $rndkey0,$inout1 3593 aesdec $rndkey0,$inout2 3594 aesdec $rndkey0,$inout3 3595 aesdec $rndkey0,$inout4 3596 aesdec $rndkey0,$inout5 3597 $movkey 
-16($key,%rax),$rndkey0 3598 jnz .Locb_dec_loop6 3599 3600 aesdec $rndkey1,$inout0 3601 aesdec $rndkey1,$inout1 3602 aesdec $rndkey1,$inout2 3603 aesdec $rndkey1,$inout3 3604 aesdec $rndkey1,$inout4 3605 aesdec $rndkey1,$inout5 3606 $movkey 16($key_),$rndkey1 3607 shl \$4,$i5 3608 3609 aesdeclast @offset[0],$inout0 3610 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks 3611 mov %r10,%rax # restore twisted rounds 3612 aesdeclast @offset[1],$inout1 3613 aesdeclast @offset[2],$inout2 3614 aesdeclast @offset[3],$inout3 3615 aesdeclast @offset[4],$inout4 3616 aesdeclast @offset[5],$inout5 3617 ret 3618.cfi_endproc 3619.size __ocb_decrypt6,.-__ocb_decrypt6 3620 3621.type __ocb_decrypt4,\@abi-omnipotent 3622.align 32 3623__ocb_decrypt4: 3624.cfi_startproc 3625 pxor $rndkey0l,@offset[5] # offset_i ^ round[0] 3626 movdqu ($L_p,$i1),@offset[1] 3627 movdqa @offset[0],@offset[2] 3628 movdqu ($L_p,$i3),@offset[3] 3629 pxor @offset[5],@offset[0] 3630 pxor @offset[0],@offset[1] 3631 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i 3632 pxor @offset[1],@offset[2] 3633 pxor @offset[1],$inout1 3634 pxor @offset[2],@offset[3] 3635 pxor @offset[2],$inout2 3636 pxor @offset[3],$inout3 3637 $movkey 32($key_),$rndkey0 3638 3639 pxor $rndkey0l,@offset[0] # offset_i ^ round[last] 3640 pxor $rndkey0l,@offset[1] 3641 pxor $rndkey0l,@offset[2] 3642 pxor $rndkey0l,@offset[3] 3643 3644 aesdec $rndkey1,$inout0 3645 aesdec $rndkey1,$inout1 3646 aesdec $rndkey1,$inout2 3647 aesdec $rndkey1,$inout3 3648 $movkey 48($key_),$rndkey1 3649 3650 aesdec $rndkey0,$inout0 3651 aesdec $rndkey0,$inout1 3652 aesdec $rndkey0,$inout2 3653 aesdec $rndkey0,$inout3 3654 $movkey 64($key_),$rndkey0 3655 jmp .Locb_dec_loop4 3656 3657.align 32 3658.Locb_dec_loop4: 3659 aesdec $rndkey1,$inout0 3660 aesdec $rndkey1,$inout1 3661 aesdec $rndkey1,$inout2 3662 aesdec $rndkey1,$inout3 3663 $movkey ($key,%rax),$rndkey1 3664 add \$32,%rax 3665 3666 aesdec $rndkey0,$inout0 3667 aesdec $rndkey0,$inout1 3668 aesdec 
$rndkey0,$inout2 3669 aesdec $rndkey0,$inout3 3670 $movkey -16($key,%rax),$rndkey0 3671 jnz .Locb_dec_loop4 3672 3673 aesdec $rndkey1,$inout0 3674 aesdec $rndkey1,$inout1 3675 aesdec $rndkey1,$inout2 3676 aesdec $rndkey1,$inout3 3677 $movkey 16($key_),$rndkey1 3678 mov %r10,%rax # restore twisted rounds 3679 3680 aesdeclast @offset[0],$inout0 3681 aesdeclast @offset[1],$inout1 3682 aesdeclast @offset[2],$inout2 3683 aesdeclast @offset[3],$inout3 3684 ret 3685.cfi_endproc 3686.size __ocb_decrypt4,.-__ocb_decrypt4 3687 3688.type __ocb_decrypt1,\@abi-omnipotent 3689.align 32 3690__ocb_decrypt1: 3691.cfi_startproc 3692 pxor @offset[5],$inout5 # offset_i 3693 pxor $rndkey0l,$inout5 # offset_i ^ round[0] 3694 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i 3695 $movkey 32($key_),$rndkey0 3696 3697 aesdec $rndkey1,$inout0 3698 $movkey 48($key_),$rndkey1 3699 pxor $rndkey0l,$inout5 # offset_i ^ round[last] 3700 3701 aesdec $rndkey0,$inout0 3702 $movkey 64($key_),$rndkey0 3703 jmp .Locb_dec_loop1 3704 3705.align 32 3706.Locb_dec_loop1: 3707 aesdec $rndkey1,$inout0 3708 $movkey ($key,%rax),$rndkey1 3709 add \$32,%rax 3710 3711 aesdec $rndkey0,$inout0 3712 $movkey -16($key,%rax),$rndkey0 3713 jnz .Locb_dec_loop1 3714 3715 aesdec $rndkey1,$inout0 3716 $movkey 16($key_),$rndkey1 # redundant in tail 3717 mov %r10,%rax # restore twisted rounds 3718 3719 aesdeclast $inout5,$inout0 3720 ret 3721.cfi_endproc 3722.size __ocb_decrypt1,.-__ocb_decrypt1 3723___ 3724} }} 3725 3726######################################################################## 3727# void $PREFIX_cbc_encrypt (const void *inp, void *out, 3728# size_t length, const AES_KEY *key, 3729# unsigned char *ivp,const int enc); 3730{ 3731my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt 3732my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15)); 3733 3734$code.=<<___; 3735.globl ${PREFIX}_cbc_encrypt 3736.type ${PREFIX}_cbc_encrypt,\@function,6 3737.align 16 3738${PREFIX}_cbc_encrypt: 3739.cfi_startproc 3740 
test $len,$len # check length 3741 jz .Lcbc_ret 3742 3743 mov 240($key),$rnds_ # key->rounds 3744 mov $key,$key_ # backup $key 3745 test %r9d,%r9d # 6th argument 3746 jz .Lcbc_decrypt 3747#--------------------------- CBC ENCRYPT ------------------------------# 3748 movups ($ivp),$inout0 # load iv as initial state 3749 mov $rnds_,$rounds 3750 cmp \$16,$len 3751 jb .Lcbc_enc_tail 3752 sub \$16,$len 3753 jmp .Lcbc_enc_loop 3754.align 16 3755.Lcbc_enc_loop: 3756 movups ($inp),$inout1 # load input 3757 lea 16($inp),$inp 3758 #xorps $inout1,$inout0 3759___ 3760 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1); 3761$code.=<<___; 3762 mov $rnds_,$rounds # restore $rounds 3763 mov $key_,$key # restore $key 3764 movups $inout0,0($out) # store output 3765 lea 16($out),$out 3766 sub \$16,$len 3767 jnc .Lcbc_enc_loop 3768 add \$16,$len 3769 jnz .Lcbc_enc_tail 3770 pxor $rndkey0,$rndkey0 # clear register bank 3771 pxor $rndkey1,$rndkey1 3772 movups $inout0,($ivp) 3773 pxor $inout0,$inout0 3774 pxor $inout1,$inout1 3775 jmp .Lcbc_ret 3776 3777.Lcbc_enc_tail: 3778 mov $len,%rcx # zaps $key 3779 xchg $inp,$out # $inp is %rsi and $out is %rdi now 3780 .long 0x9066A4F3 # rep movsb 3781 mov \$16,%ecx # zero tail 3782 sub $len,%rcx 3783 xor %eax,%eax 3784 .long 0x9066AAF3 # rep stosb 3785 lea -16(%rdi),%rdi # rewind $out by 1 block 3786 mov $rnds_,$rounds # restore $rounds 3787 mov %rdi,%rsi # $inp and $out are the same 3788 mov $key_,$key # restore $key 3789 xor $len,$len # len=16 3790 jmp .Lcbc_enc_loop # one more spin 3791#--------------------------- CBC DECRYPT ------------------------------# 3792.align 16 3793.Lcbc_decrypt: 3794 cmp \$16,$len 3795 jne .Lcbc_decrypt_bulk 3796 3797 # handle single block without allocating stack frame, 3798 # useful in ciphertext stealing mode 3799 movdqu ($inp),$inout0 # load input 3800 movdqu ($ivp),$inout1 # load iv 3801 movdqa $inout0,$inout2 # future iv 3802___ 3803 &aesni_generate1("dec",$key,$rnds_); 3804$code.=<<___; 3805 pxor 
$rndkey0,$rndkey0 # clear register bank 3806 pxor $rndkey1,$rndkey1 3807 movdqu $inout2,($ivp) # store iv 3808 xorps $inout1,$inout0 # ^=iv 3809 pxor $inout1,$inout1 3810 movups $inout0,($out) # store output 3811 pxor $inout0,$inout0 3812 jmp .Lcbc_ret 3813.align 16 3814.Lcbc_decrypt_bulk: 3815 lea (%rsp),%r11 # frame pointer 3816.cfi_def_cfa_register %r11 3817 push %rbp 3818.cfi_push %rbp 3819 sub \$$frame_size,%rsp 3820 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 3821___ 3822$code.=<<___ if ($win64); 3823 movaps %xmm6,0x10(%rsp) 3824 movaps %xmm7,0x20(%rsp) 3825 movaps %xmm8,0x30(%rsp) 3826 movaps %xmm9,0x40(%rsp) 3827 movaps %xmm10,0x50(%rsp) 3828 movaps %xmm11,0x60(%rsp) 3829 movaps %xmm12,0x70(%rsp) 3830 movaps %xmm13,0x80(%rsp) 3831 movaps %xmm14,0x90(%rsp) 3832 movaps %xmm15,0xa0(%rsp) 3833.Lcbc_decrypt_body: 3834___ 3835 3836my $inp_=$key_="%rbp"; # reassign $key_ 3837 3838$code.=<<___; 3839 mov $key,$key_ # [re-]backup $key [after reassignment] 3840 movups ($ivp),$iv 3841 mov $rnds_,$rounds 3842 cmp \$0x50,$len 3843 jbe .Lcbc_dec_tail 3844 3845 $movkey ($key),$rndkey0 3846 movdqu 0x00($inp),$inout0 # load input 3847 movdqu 0x10($inp),$inout1 3848 movdqa $inout0,$in0 3849 movdqu 0x20($inp),$inout2 3850 movdqa $inout1,$in1 3851 movdqu 0x30($inp),$inout3 3852 movdqa $inout2,$in2 3853 movdqu 0x40($inp),$inout4 3854 movdqa $inout3,$in3 3855 movdqu 0x50($inp),$inout5 3856 movdqa $inout4,$in4 3857 mov OPENSSL_ia32cap_P+4(%rip),%r9d 3858 cmp \$0x70,$len 3859 jbe .Lcbc_dec_six_or_seven 3860 3861 and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE 3862 sub \$0x50,$len # $len is biased by -5*16 3863 cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE 3864 je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont] 3865 sub \$0x20,$len # $len is biased by -7*16 3866 lea 0x70($key),$key # size optimization 3867 jmp .Lcbc_dec_loop8_enter 3868.align 16 3869.Lcbc_dec_loop8: 3870 movups $inout7,($out) 3871 lea 0x10($out),$out 3872.Lcbc_dec_loop8_enter: 3873 
movdqu 0x60($inp),$inout6 3874 pxor $rndkey0,$inout0 3875 movdqu 0x70($inp),$inout7 3876 pxor $rndkey0,$inout1 3877 $movkey 0x10-0x70($key),$rndkey1 3878 pxor $rndkey0,$inout2 3879 mov \$-1,$inp_ 3880 cmp \$0x70,$len # is there at least 0x60 bytes ahead? 3881 pxor $rndkey0,$inout3 3882 pxor $rndkey0,$inout4 3883 pxor $rndkey0,$inout5 3884 pxor $rndkey0,$inout6 3885 3886 aesdec $rndkey1,$inout0 3887 pxor $rndkey0,$inout7 3888 $movkey 0x20-0x70($key),$rndkey0 3889 aesdec $rndkey1,$inout1 3890 aesdec $rndkey1,$inout2 3891 aesdec $rndkey1,$inout3 3892 aesdec $rndkey1,$inout4 3893 aesdec $rndkey1,$inout5 3894 aesdec $rndkey1,$inout6 3895 adc \$0,$inp_ 3896 and \$128,$inp_ 3897 aesdec $rndkey1,$inout7 3898 add $inp,$inp_ 3899 $movkey 0x30-0x70($key),$rndkey1 3900___ 3901for($i=1;$i<12;$i++) { 3902my $rndkeyx = ($i&1)?$rndkey0:$rndkey1; 3903$code.=<<___ if ($i==7); 3904 cmp \$11,$rounds 3905___ 3906$code.=<<___; 3907 aesdec $rndkeyx,$inout0 3908 aesdec $rndkeyx,$inout1 3909 aesdec $rndkeyx,$inout2 3910 aesdec $rndkeyx,$inout3 3911 aesdec $rndkeyx,$inout4 3912 aesdec $rndkeyx,$inout5 3913 aesdec $rndkeyx,$inout6 3914 aesdec $rndkeyx,$inout7 3915 $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx 3916___ 3917$code.=<<___ if ($i<6 || (!($i&1) && $i>7)); 3918 nop 3919___ 3920$code.=<<___ if ($i==7); 3921 jb .Lcbc_dec_done 3922___ 3923$code.=<<___ if ($i==9); 3924 je .Lcbc_dec_done 3925___ 3926$code.=<<___ if ($i==11); 3927 jmp .Lcbc_dec_done 3928___ 3929} 3930$code.=<<___; 3931.align 16 3932.Lcbc_dec_done: 3933 aesdec $rndkey1,$inout0 3934 aesdec $rndkey1,$inout1 3935 pxor $rndkey0,$iv 3936 pxor $rndkey0,$in0 3937 aesdec $rndkey1,$inout2 3938 aesdec $rndkey1,$inout3 3939 pxor $rndkey0,$in1 3940 pxor $rndkey0,$in2 3941 aesdec $rndkey1,$inout4 3942 aesdec $rndkey1,$inout5 3943 pxor $rndkey0,$in3 3944 pxor $rndkey0,$in4 3945 aesdec $rndkey1,$inout6 3946 aesdec $rndkey1,$inout7 3947 movdqu 0x50($inp),$rndkey1 3948 3949 aesdeclast $iv,$inout0 3950 movdqu 0x60($inp),$iv # borrow $iv 3951 
pxor $rndkey0,$rndkey1 3952 aesdeclast $in0,$inout1 3953 pxor $rndkey0,$iv 3954 movdqu 0x70($inp),$rndkey0 # next IV 3955 aesdeclast $in1,$inout2 3956 lea 0x80($inp),$inp 3957 movdqu 0x00($inp_),$in0 3958 aesdeclast $in2,$inout3 3959 aesdeclast $in3,$inout4 3960 movdqu 0x10($inp_),$in1 3961 movdqu 0x20($inp_),$in2 3962 aesdeclast $in4,$inout5 3963 aesdeclast $rndkey1,$inout6 3964 movdqu 0x30($inp_),$in3 3965 movdqu 0x40($inp_),$in4 3966 aesdeclast $iv,$inout7 3967 movdqa $rndkey0,$iv # return $iv 3968 movdqu 0x50($inp_),$rndkey1 3969 $movkey -0x70($key),$rndkey0 3970 3971 movups $inout0,($out) # store output 3972 movdqa $in0,$inout0 3973 movups $inout1,0x10($out) 3974 movdqa $in1,$inout1 3975 movups $inout2,0x20($out) 3976 movdqa $in2,$inout2 3977 movups $inout3,0x30($out) 3978 movdqa $in3,$inout3 3979 movups $inout4,0x40($out) 3980 movdqa $in4,$inout4 3981 movups $inout5,0x50($out) 3982 movdqa $rndkey1,$inout5 3983 movups $inout6,0x60($out) 3984 lea 0x70($out),$out 3985 3986 sub \$0x80,$len 3987 ja .Lcbc_dec_loop8 3988 3989 movaps $inout7,$inout0 3990 lea -0x70($key),$key 3991 add \$0x70,$len 3992 jle .Lcbc_dec_clear_tail_collected 3993 movups $inout7,($out) 3994 lea 0x10($out),$out 3995 cmp \$0x50,$len 3996 jbe .Lcbc_dec_tail 3997 3998 movaps $in0,$inout0 3999.Lcbc_dec_six_or_seven: 4000 cmp \$0x60,$len 4001 ja .Lcbc_dec_seven 4002 4003 movaps $inout5,$inout6 4004 call _aesni_decrypt6 4005 pxor $iv,$inout0 # ^= IV 4006 movaps $inout6,$iv 4007 pxor $in0,$inout1 4008 movdqu $inout0,($out) 4009 pxor $in1,$inout2 4010 movdqu $inout1,0x10($out) 4011 pxor $inout1,$inout1 # clear register bank 4012 pxor $in2,$inout3 4013 movdqu $inout2,0x20($out) 4014 pxor $inout2,$inout2 4015 pxor $in3,$inout4 4016 movdqu $inout3,0x30($out) 4017 pxor $inout3,$inout3 4018 pxor $in4,$inout5 4019 movdqu $inout4,0x40($out) 4020 pxor $inout4,$inout4 4021 lea 0x50($out),$out 4022 movdqa $inout5,$inout0 4023 pxor $inout5,$inout5 4024 jmp .Lcbc_dec_tail_collected 4025 4026.align 16 
4027.Lcbc_dec_seven: 4028 movups 0x60($inp),$inout6 4029 xorps $inout7,$inout7 4030 call _aesni_decrypt8 4031 movups 0x50($inp),$inout7 4032 pxor $iv,$inout0 # ^= IV 4033 movups 0x60($inp),$iv 4034 pxor $in0,$inout1 4035 movdqu $inout0,($out) 4036 pxor $in1,$inout2 4037 movdqu $inout1,0x10($out) 4038 pxor $inout1,$inout1 # clear register bank 4039 pxor $in2,$inout3 4040 movdqu $inout2,0x20($out) 4041 pxor $inout2,$inout2 4042 pxor $in3,$inout4 4043 movdqu $inout3,0x30($out) 4044 pxor $inout3,$inout3 4045 pxor $in4,$inout5 4046 movdqu $inout4,0x40($out) 4047 pxor $inout4,$inout4 4048 pxor $inout7,$inout6 4049 movdqu $inout5,0x50($out) 4050 pxor $inout5,$inout5 4051 lea 0x60($out),$out 4052 movdqa $inout6,$inout0 4053 pxor $inout6,$inout6 4054 pxor $inout7,$inout7 4055 jmp .Lcbc_dec_tail_collected 4056 4057.align 16 4058.Lcbc_dec_loop6: 4059 movups $inout5,($out) 4060 lea 0x10($out),$out 4061 movdqu 0x00($inp),$inout0 # load input 4062 movdqu 0x10($inp),$inout1 4063 movdqa $inout0,$in0 4064 movdqu 0x20($inp),$inout2 4065 movdqa $inout1,$in1 4066 movdqu 0x30($inp),$inout3 4067 movdqa $inout2,$in2 4068 movdqu 0x40($inp),$inout4 4069 movdqa $inout3,$in3 4070 movdqu 0x50($inp),$inout5 4071 movdqa $inout4,$in4 4072.Lcbc_dec_loop6_enter: 4073 lea 0x60($inp),$inp 4074 movdqa $inout5,$inout6 4075 4076 call _aesni_decrypt6 4077 4078 pxor $iv,$inout0 # ^= IV 4079 movdqa $inout6,$iv 4080 pxor $in0,$inout1 4081 movdqu $inout0,($out) 4082 pxor $in1,$inout2 4083 movdqu $inout1,0x10($out) 4084 pxor $in2,$inout3 4085 movdqu $inout2,0x20($out) 4086 pxor $in3,$inout4 4087 mov $key_,$key 4088 movdqu $inout3,0x30($out) 4089 pxor $in4,$inout5 4090 mov $rnds_,$rounds 4091 movdqu $inout4,0x40($out) 4092 lea 0x50($out),$out 4093 sub \$0x60,$len 4094 ja .Lcbc_dec_loop6 4095 4096 movdqa $inout5,$inout0 4097 add \$0x50,$len 4098 jle .Lcbc_dec_clear_tail_collected 4099 movups $inout5,($out) 4100 lea 0x10($out),$out 4101 4102.Lcbc_dec_tail: 4103 movups ($inp),$inout0 4104 sub \$0x10,$len 4105 
jbe .Lcbc_dec_one # $len is 1*16 or less 4106 4107 movups 0x10($inp),$inout1 4108 movaps $inout0,$in0 4109 sub \$0x10,$len 4110 jbe .Lcbc_dec_two # $len is 2*16 or less 4111 4112 movups 0x20($inp),$inout2 4113 movaps $inout1,$in1 4114 sub \$0x10,$len 4115 jbe .Lcbc_dec_three # $len is 3*16 or less 4116 4117 movups 0x30($inp),$inout3 4118 movaps $inout2,$in2 4119 sub \$0x10,$len 4120 jbe .Lcbc_dec_four # $len is 4*16 or less 4121 4122 movups 0x40($inp),$inout4 # $len is 5*16 or less 4123 movaps $inout3,$in3 4124 movaps $inout4,$in4 4125 xorps $inout5,$inout5 4126 call _aesni_decrypt6 4127 pxor $iv,$inout0 4128 movaps $in4,$iv 4129 pxor $in0,$inout1 4130 movdqu $inout0,($out) 4131 pxor $in1,$inout2 4132 movdqu $inout1,0x10($out) 4133 pxor $inout1,$inout1 # clear register bank 4134 pxor $in2,$inout3 4135 movdqu $inout2,0x20($out) 4136 pxor $inout2,$inout2 4137 pxor $in3,$inout4 4138 movdqu $inout3,0x30($out) 4139 pxor $inout3,$inout3 4140 lea 0x40($out),$out 4141 movdqa $inout4,$inout0 4142 pxor $inout4,$inout4 4143 pxor $inout5,$inout5 4144 sub \$0x10,$len 4145 jmp .Lcbc_dec_tail_collected 4146 4147.align 16 4148.Lcbc_dec_one: 4149 movaps $inout0,$in0 4150___ 4151 &aesni_generate1("dec",$key,$rounds); 4152$code.=<<___; 4153 xorps $iv,$inout0 4154 movaps $in0,$iv 4155 jmp .Lcbc_dec_tail_collected 4156.align 16 4157.Lcbc_dec_two: 4158 movaps $inout1,$in1 4159 call _aesni_decrypt2 4160 pxor $iv,$inout0 4161 movaps $in1,$iv 4162 pxor $in0,$inout1 4163 movdqu $inout0,($out) 4164 movdqa $inout1,$inout0 4165 pxor $inout1,$inout1 # clear register bank 4166 lea 0x10($out),$out 4167 jmp .Lcbc_dec_tail_collected 4168.align 16 4169.Lcbc_dec_three: 4170 movaps $inout2,$in2 4171 call _aesni_decrypt3 4172 pxor $iv,$inout0 4173 movaps $in2,$iv 4174 pxor $in0,$inout1 4175 movdqu $inout0,($out) 4176 pxor $in1,$inout2 4177 movdqu $inout1,0x10($out) 4178 pxor $inout1,$inout1 # clear register bank 4179 movdqa $inout2,$inout0 4180 pxor $inout2,$inout2 4181 lea 0x20($out),$out 4182 jmp 
.Lcbc_dec_tail_collected 4183.align 16 4184.Lcbc_dec_four: 4185 movaps $inout3,$in3 4186 call _aesni_decrypt4 4187 pxor $iv,$inout0 4188 movaps $in3,$iv 4189 pxor $in0,$inout1 4190 movdqu $inout0,($out) 4191 pxor $in1,$inout2 4192 movdqu $inout1,0x10($out) 4193 pxor $inout1,$inout1 # clear register bank 4194 pxor $in2,$inout3 4195 movdqu $inout2,0x20($out) 4196 pxor $inout2,$inout2 4197 movdqa $inout3,$inout0 4198 pxor $inout3,$inout3 4199 lea 0x30($out),$out 4200 jmp .Lcbc_dec_tail_collected 4201 4202.align 16 4203.Lcbc_dec_clear_tail_collected: 4204 pxor $inout1,$inout1 # clear register bank 4205 pxor $inout2,$inout2 4206 pxor $inout3,$inout3 4207___ 4208$code.=<<___ if (!$win64); 4209 pxor $inout4,$inout4 # %xmm6..9 4210 pxor $inout5,$inout5 4211 pxor $inout6,$inout6 4212 pxor $inout7,$inout7 4213___ 4214$code.=<<___; 4215.Lcbc_dec_tail_collected: 4216 movups $iv,($ivp) 4217 and \$15,$len 4218 jnz .Lcbc_dec_tail_partial 4219 movups $inout0,($out) 4220 pxor $inout0,$inout0 4221 jmp .Lcbc_dec_ret 4222.align 16 4223.Lcbc_dec_tail_partial: 4224 movaps $inout0,(%rsp) 4225 pxor $inout0,$inout0 4226 mov \$16,%rcx 4227 mov $out,%rdi 4228 sub $len,%rcx 4229 lea (%rsp),%rsi 4230 .long 0x9066A4F3 # rep movsb 4231 movdqa $inout0,(%rsp) 4232 4233.Lcbc_dec_ret: 4234 xorps $rndkey0,$rndkey0 # %xmm0 4235 pxor $rndkey1,$rndkey1 4236___ 4237$code.=<<___ if ($win64); 4238 movaps 0x10(%rsp),%xmm6 4239 movaps %xmm0,0x10(%rsp) # clear stack 4240 movaps 0x20(%rsp),%xmm7 4241 movaps %xmm0,0x20(%rsp) 4242 movaps 0x30(%rsp),%xmm8 4243 movaps %xmm0,0x30(%rsp) 4244 movaps 0x40(%rsp),%xmm9 4245 movaps %xmm0,0x40(%rsp) 4246 movaps 0x50(%rsp),%xmm10 4247 movaps %xmm0,0x50(%rsp) 4248 movaps 0x60(%rsp),%xmm11 4249 movaps %xmm0,0x60(%rsp) 4250 movaps 0x70(%rsp),%xmm12 4251 movaps %xmm0,0x70(%rsp) 4252 movaps 0x80(%rsp),%xmm13 4253 movaps %xmm0,0x80(%rsp) 4254 movaps 0x90(%rsp),%xmm14 4255 movaps %xmm0,0x90(%rsp) 4256 movaps 0xa0(%rsp),%xmm15 4257 movaps %xmm0,0xa0(%rsp) 4258___ 4259$code.=<<___; 
4260 mov -8(%r11),%rbp 4261.cfi_restore %rbp 4262 lea (%r11),%rsp 4263.cfi_def_cfa_register %rsp 4264.Lcbc_ret: 4265 ret 4266.cfi_endproc 4267.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt 4268___ 4269} 4270# int ${PREFIX}_set_decrypt_key(const unsigned char *inp, 4271# int bits, AES_KEY *key) 4272# Builds the decryption key schedule in place: runs the encryption key expansion (__aesni_set_encrypt_key), then swaps the round keys end-for-end, applying aesimc to every round key except the outermost pair. 4273# input: $inp user-supplied key 4274# $bits $inp length in bits 4275# $key pointer to key schedule 4276# output: %eax 0 denoting success, -1 or -2 - failure (see C); the value is passed through unchanged from __aesni_set_encrypt_key 4277# *$key key schedule 4278# 4279{ my ($inp,$bits,$key) = @_4args; 4280 $bits =~ s/%r/%e/; 4281 4282$code.=<<___; 4283.globl ${PREFIX}_set_decrypt_key 4284.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent 4285.align 16 4286${PREFIX}_set_decrypt_key: 4287.cfi_startproc 4288 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 4289.cfi_adjust_cfa_offset 8 4290 call __aesni_set_encrypt_key 4291 shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key 4292 test %eax,%eax 4293 jnz .Ldec_key_ret 4294 lea 16($key,$bits),$inp # points at the end of key schedule 4295 4296 $movkey ($key),%xmm0 # just swap 4297 $movkey ($inp),%xmm1 4298 $movkey %xmm0,($inp) 4299 $movkey %xmm1,($key) 4300 lea 16($key),$key 4301 lea -16($inp),$inp 4302 4303.Ldec_key_inverse: 4304 $movkey ($key),%xmm0 # swap and inverse 4305 $movkey ($inp),%xmm1 4306 aesimc %xmm0,%xmm0 4307 aesimc %xmm1,%xmm1 4308 lea 16($key),$key 4309 lea -16($inp),$inp 4310 $movkey %xmm0,16($inp) 4311 $movkey %xmm1,-16($key) 4312 cmp $key,$inp 4313 ja .Ldec_key_inverse 4314 4315 $movkey ($key),%xmm0 # inverse middle 4316 aesimc %xmm0,%xmm0 4317 pxor %xmm1,%xmm1 4318 $movkey %xmm0,($inp) 4319 pxor %xmm0,%xmm0 4320.Ldec_key_ret: 4321 add \$8,%rsp 4322.cfi_adjust_cfa_offset -8 4323 ret 4324.cfi_endproc 4325.LSEH_end_set_decrypt_key: 4326.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key 4327___ 4328 4329# This is based on submission from Intel by 4330# Huang Ying 4331# Vinodh Gopal 4332# Kahraman Akdemir 4333# 4334# Aggressively optimized with respect to aeskeygenassist's
critical path 4335# and is contained in %xmm0-5 to meet Win64 ABI requirement. 4336# 4337# int ${PREFIX}_set_encrypt_key(const unsigned char *inp, 4338# int bits, AES_KEY * const key); 4339# 4340# input: $inp user-supplied key 4341# $bits $inp length in bits 4342# $key pointer to key schedule 4343# output: %eax 0 denoting success, -1 or -2 - failure (see C) 4344# $bits rounds-1 (used in aesni_set_decrypt_key) 4345# *$key key schedule 4346# $key pointer to key schedule (used in 4347# aesni_set_decrypt_key) 4348# 4349# Subroutine is frame-less, which means that only volatile registers 4350# are used. Note that it's declared "abi-omnipotent", which means that the 4351# number of volatile registers is smaller on Windows. 4352# 4353$code.=<<___; 4354.globl ${PREFIX}_set_encrypt_key 4355.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent 4356.align 16 4357${PREFIX}_set_encrypt_key: 4358__aesni_set_encrypt_key: 4359.cfi_startproc 4360 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 4361.cfi_adjust_cfa_offset 8 4362 mov \$-1,%rax 4363 test $inp,$inp 4364 jz .Lenc_key_ret 4365 test $key,$key 4366 jz .Lenc_key_ret 4367 4368 mov \$`1<<28|1<<11`,%r10d # AVX and XOP bits 4369 movups ($inp),%xmm0 # pull first 128 bits of *userKey 4370 xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0 4371 and OPENSSL_ia32cap_P+4(%rip),%r10d 4372 lea 16($key),%rax # %rax is used as modifiable copy of $key 4373 cmp \$256,$bits 4374 je .L14rounds 4375 cmp \$192,$bits 4376 je .L12rounds 4377 cmp \$128,$bits 4378 jne .Lbad_keybits 4379 4380.L10rounds: 4381 mov \$9,$bits # 10 rounds for 128-bit key 4382 cmp \$`1<<28`,%r10d # AVX, but no XOP 4383 je .L10rounds_alt 4384 4385 $movkey %xmm0,($key) # round 0 4386 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1 4387 call .Lkey_expansion_128_cold 4388 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2 4389 call .Lkey_expansion_128 4390 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3 4391 call .Lkey_expansion_128 4392 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4 4393 call
.Lkey_expansion_128 4394 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5 4395 call .Lkey_expansion_128 4396 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6 4397 call .Lkey_expansion_128 4398 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7 4399 call .Lkey_expansion_128 4400 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8 4401 call .Lkey_expansion_128 4402 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9 4403 call .Lkey_expansion_128 4404 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10 4405 call .Lkey_expansion_128 4406 $movkey %xmm0,(%rax) 4407 mov $bits,80(%rax) # 240(%rdx) 4408 xor %eax,%eax 4409 jmp .Lenc_key_ret 4410 4411.align 16 4412.L10rounds_alt: 4413 movdqa .Lkey_rotate(%rip),%xmm5 4414 mov \$8,%r10d 4415 movdqa .Lkey_rcon1(%rip),%xmm4 4416 movdqa %xmm0,%xmm2 4417 movdqu %xmm0,($key) 4418 jmp .Loop_key128 4419 4420.align 16 4421.Loop_key128: 4422 pshufb %xmm5,%xmm0 4423 aesenclast %xmm4,%xmm0 4424 pslld \$1,%xmm4 4425 lea 16(%rax),%rax 4426 4427 movdqa %xmm2,%xmm3 4428 pslldq \$4,%xmm2 4429 pxor %xmm2,%xmm3 4430 pslldq \$4,%xmm2 4431 pxor %xmm2,%xmm3 4432 pslldq \$4,%xmm2 4433 pxor %xmm3,%xmm2 4434 4435 pxor %xmm2,%xmm0 4436 movdqu %xmm0,-16(%rax) 4437 movdqa %xmm0,%xmm2 4438 4439 dec %r10d 4440 jnz .Loop_key128 4441 4442 movdqa .Lkey_rcon1b(%rip),%xmm4 4443 4444 pshufb %xmm5,%xmm0 4445 aesenclast %xmm4,%xmm0 4446 pslld \$1,%xmm4 4447 4448 movdqa %xmm2,%xmm3 4449 pslldq \$4,%xmm2 4450 pxor %xmm2,%xmm3 4451 pslldq \$4,%xmm2 4452 pxor %xmm2,%xmm3 4453 pslldq \$4,%xmm2 4454 pxor %xmm3,%xmm2 4455 4456 pxor %xmm2,%xmm0 4457 movdqu %xmm0,(%rax) 4458 4459 movdqa %xmm0,%xmm2 4460 pshufb %xmm5,%xmm0 4461 aesenclast %xmm4,%xmm0 4462 4463 movdqa %xmm2,%xmm3 4464 pslldq \$4,%xmm2 4465 pxor %xmm2,%xmm3 4466 pslldq \$4,%xmm2 4467 pxor %xmm2,%xmm3 4468 pslldq \$4,%xmm2 4469 pxor %xmm3,%xmm2 4470 4471 pxor %xmm2,%xmm0 4472 movdqu %xmm0,16(%rax) 4473 4474 mov $bits,96(%rax) # 240($key) 4475 xor %eax,%eax 4476 jmp .Lenc_key_ret 4477 4478.align 16 4479.L12rounds: 4480 movq 16($inp),%xmm2 # 
remaining 1/3 of *userKey 4481 mov \$11,$bits # 12 rounds for 192 4482 cmp \$`1<<28`,%r10d # AVX, but no XOP 4483 je .L12rounds_alt 4484 4485 $movkey %xmm0,($key) # round 0 4486 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2 4487 call .Lkey_expansion_192a_cold 4488 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3 4489 call .Lkey_expansion_192b 4490 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5 4491 call .Lkey_expansion_192a 4492 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6 4493 call .Lkey_expansion_192b 4494 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8 4495 call .Lkey_expansion_192a 4496 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9 4497 call .Lkey_expansion_192b 4498 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11 4499 call .Lkey_expansion_192a 4500 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12 4501 call .Lkey_expansion_192b 4502 $movkey %xmm0,(%rax) 4503 mov $bits,48(%rax) # 240(%rdx) 4504 xor %rax, %rax 4505 jmp .Lenc_key_ret 4506 4507.align 16 4508.L12rounds_alt: 4509 movdqa .Lkey_rotate192(%rip),%xmm5 4510 movdqa .Lkey_rcon1(%rip),%xmm4 4511 mov \$8,%r10d 4512 movdqu %xmm0,($key) 4513 jmp .Loop_key192 4514 4515.align 16 4516.Loop_key192: 4517 movq %xmm2,0(%rax) 4518 movdqa %xmm2,%xmm1 4519 pshufb %xmm5,%xmm2 4520 aesenclast %xmm4,%xmm2 4521 pslld \$1, %xmm4 4522 lea 24(%rax),%rax 4523 4524 movdqa %xmm0,%xmm3 4525 pslldq \$4,%xmm0 4526 pxor %xmm0,%xmm3 4527 pslldq \$4,%xmm0 4528 pxor %xmm0,%xmm3 4529 pslldq \$4,%xmm0 4530 pxor %xmm3,%xmm0 4531 4532 pshufd \$0xff,%xmm0,%xmm3 4533 pxor %xmm1,%xmm3 4534 pslldq \$4,%xmm1 4535 pxor %xmm1,%xmm3 4536 4537 pxor %xmm2,%xmm0 4538 pxor %xmm3,%xmm2 4539 movdqu %xmm0,-16(%rax) 4540 4541 dec %r10d 4542 jnz .Loop_key192 4543 4544 mov $bits,32(%rax) # 240($key) 4545 xor %eax,%eax 4546 jmp .Lenc_key_ret 4547 4548.align 16 4549.L14rounds: 4550 movups 16($inp),%xmm2 # remaining half of *userKey 4551 mov \$13,$bits # 14 rounds for 256 4552 lea 16(%rax),%rax 4553 cmp \$`1<<28`,%r10d # AVX, but no XOP 4554 je .L14rounds_alt 4555 
4556 $movkey %xmm0,($key) # round 0 4557 $movkey %xmm2,16($key) # round 1 4558 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2 4559 call .Lkey_expansion_256a_cold 4560 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3 4561 call .Lkey_expansion_256b 4562 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4 4563 call .Lkey_expansion_256a 4564 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5 4565 call .Lkey_expansion_256b 4566 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6 4567 call .Lkey_expansion_256a 4568 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7 4569 call .Lkey_expansion_256b 4570 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8 4571 call .Lkey_expansion_256a 4572 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9 4573 call .Lkey_expansion_256b 4574 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10 4575 call .Lkey_expansion_256a 4576 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11 4577 call .Lkey_expansion_256b 4578 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12 4579 call .Lkey_expansion_256a 4580 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13 4581 call .Lkey_expansion_256b 4582 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14 4583 call .Lkey_expansion_256a 4584 $movkey %xmm0,(%rax) 4585 mov $bits,16(%rax) # 240(%rdx) 4586 xor %rax,%rax 4587 jmp .Lenc_key_ret 4588 4589.align 16 4590.L14rounds_alt: 4591 movdqa .Lkey_rotate(%rip),%xmm5 4592 movdqa .Lkey_rcon1(%rip),%xmm4 4593 mov \$7,%r10d 4594 movdqu %xmm0,0($key) 4595 movdqa %xmm2,%xmm1 4596 movdqu %xmm2,16($key) 4597 jmp .Loop_key256 4598 4599.align 16 4600.Loop_key256: 4601 pshufb %xmm5,%xmm2 4602 aesenclast %xmm4,%xmm2 4603 4604 movdqa %xmm0,%xmm3 4605 pslldq \$4,%xmm0 4606 pxor %xmm0,%xmm3 4607 pslldq \$4,%xmm0 4608 pxor %xmm0,%xmm3 4609 pslldq \$4,%xmm0 4610 pxor %xmm3,%xmm0 4611 pslld \$1,%xmm4 4612 4613 pxor %xmm2,%xmm0 4614 movdqu %xmm0,(%rax) 4615 4616 dec %r10d 4617 jz .Ldone_key256 4618 4619 pshufd \$0xff,%xmm0,%xmm2 4620 pxor %xmm3,%xmm3 4621 aesenclast %xmm3,%xmm2 4622 4623 movdqa %xmm1,%xmm3 4624 pslldq \$4,%xmm1 4625 pxor %xmm1,%xmm3 4626 pslldq 
\$4,%xmm1 4627 pxor %xmm1,%xmm3 4628 pslldq \$4,%xmm1 4629 pxor %xmm3,%xmm1 4630 4631 pxor %xmm1,%xmm2 4632 movdqu %xmm2,16(%rax) 4633 lea 32(%rax),%rax 4634 movdqa %xmm2,%xmm1 4635 4636 jmp .Loop_key256 4637 4638.Ldone_key256: 4639 mov $bits,16(%rax) # 240($key) 4640 xor %eax,%eax 4641 jmp .Lenc_key_ret 4642 4643.align 16 4644.Lbad_keybits: 4645 mov \$-2,%rax 4646.Lenc_key_ret: 4647 pxor %xmm0,%xmm0 4648 pxor %xmm1,%xmm1 4649 pxor %xmm2,%xmm2 4650 pxor %xmm3,%xmm3 4651 pxor %xmm4,%xmm4 4652 pxor %xmm5,%xmm5 4653 add \$8,%rsp 4654.cfi_adjust_cfa_offset -8 4655 ret 4656.LSEH_end_set_encrypt_key: 4657 4658.align 16 4659.Lkey_expansion_128: 4660 $movkey %xmm0,(%rax) 4661 lea 16(%rax),%rax 4662.Lkey_expansion_128_cold: 4663 shufps \$0b00010000,%xmm0,%xmm4 4664 xorps %xmm4, %xmm0 4665 shufps \$0b10001100,%xmm0,%xmm4 4666 xorps %xmm4, %xmm0 4667 shufps \$0b11111111,%xmm1,%xmm1 # critical path 4668 xorps %xmm1,%xmm0 4669 ret 4670 4671.align 16 4672.Lkey_expansion_192a: 4673 $movkey %xmm0,(%rax) 4674 lea 16(%rax),%rax 4675.Lkey_expansion_192a_cold: 4676 movaps %xmm2, %xmm5 4677.Lkey_expansion_192b_warm: 4678 shufps \$0b00010000,%xmm0,%xmm4 4679 movdqa %xmm2,%xmm3 4680 xorps %xmm4,%xmm0 4681 shufps \$0b10001100,%xmm0,%xmm4 4682 pslldq \$4,%xmm3 4683 xorps %xmm4,%xmm0 4684 pshufd \$0b01010101,%xmm1,%xmm1 # critical path 4685 pxor %xmm3,%xmm2 4686 pxor %xmm1,%xmm0 4687 pshufd \$0b11111111,%xmm0,%xmm3 4688 pxor %xmm3,%xmm2 4689 ret 4690 4691.align 16 4692.Lkey_expansion_192b: 4693 movaps %xmm0,%xmm3 4694 shufps \$0b01000100,%xmm0,%xmm5 4695 $movkey %xmm5,(%rax) 4696 shufps \$0b01001110,%xmm2,%xmm3 4697 $movkey %xmm3,16(%rax) 4698 lea 32(%rax),%rax 4699 jmp .Lkey_expansion_192b_warm 4700 4701.align 16 4702.Lkey_expansion_256a: 4703 $movkey %xmm2,(%rax) 4704 lea 16(%rax),%rax 4705.Lkey_expansion_256a_cold: 4706 shufps \$0b00010000,%xmm0,%xmm4 4707 xorps %xmm4,%xmm0 4708 shufps \$0b10001100,%xmm0,%xmm4 4709 xorps %xmm4,%xmm0 4710 shufps \$0b11111111,%xmm1,%xmm1 # critical path 
4711 xorps %xmm1,%xmm0 4712 ret 4713 4714.align 16 4715.Lkey_expansion_256b: 4716 $movkey %xmm0,(%rax) 4717 lea 16(%rax),%rax 4718 4719 shufps \$0b00010000,%xmm2,%xmm4 4720 xorps %xmm4,%xmm2 4721 shufps \$0b10001100,%xmm2,%xmm4 4722 xorps %xmm4,%xmm2 4723 shufps \$0b10101010,%xmm1,%xmm1 # critical path 4724 xorps %xmm1,%xmm2 4725 ret 4726.cfi_endproc 4727.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key 4728.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key 4729___ 4730} 4731 4732$code.=<<___; 4733.align 64 4734.Lbswap_mask: 4735 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 4736.Lincrement32: 4737 .long 6,6,6,0 4738.Lincrement64: 4739 .long 1,0,0,0 4740.Lxts_magic: 4741 .long 0x87,0,1,0 4742.Lincrement1: 4743 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 4744.Lkey_rotate: 4745 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d 4746.Lkey_rotate192: 4747 .long 0x04070605,0x04070605,0x04070605,0x04070605 4748.Lkey_rcon1: 4749 .long 1,1,1,1 4750.Lkey_rcon1b: 4751 .long 0x1b,0x1b,0x1b,0x1b 4752 4753.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>" 4754.align 64 4755___ 4756 4757# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 4758# CONTEXT *context,DISPATCHER_CONTEXT *disp) 4759if ($win64) { 4760$rec="%rcx"; 4761$frame="%rdx"; 4762$context="%r8"; 4763$disp="%r9"; 4764 4765$code.=<<___; 4766.extern __imp_RtlVirtualUnwind 4767___ 4768$code.=<<___ if ($PREFIX eq "aesni"); 4769.type ecb_ccm64_se_handler,\@abi-omnipotent 4770.align 16 4771ecb_ccm64_se_handler: 4772 push %rsi 4773 push %rdi 4774 push %rbx 4775 push %rbp 4776 push %r12 4777 push %r13 4778 push %r14 4779 push %r15 4780 pushfq 4781 sub \$64,%rsp 4782 4783 mov 120($context),%rax # pull context->Rax 4784 mov 248($context),%rbx # pull context->Rip 4785 4786 mov 8($disp),%rsi # disp->ImageBase 4787 mov 56($disp),%r11 # disp->HandlerData 4788 4789 mov 0(%r11),%r10d # HandlerData[0] 4790 lea (%rsi,%r10),%r10 # prologue label 4791 cmp %r10,%rbx # context->Rip<prologue label 
4792 jb .Lcommon_seh_tail 4793 4794 mov 152($context),%rax # pull context->Rsp 4795 4796 mov 4(%r11),%r10d # HandlerData[1] 4797 lea (%rsi,%r10),%r10 # epilogue label 4798 cmp %r10,%rbx # context->Rip>=epilogue label 4799 jae .Lcommon_seh_tail 4800 4801 lea 0(%rax),%rsi # %xmm save area 4802 lea 512($context),%rdi # &context.Xmm6 4803 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) 4804 .long 0xa548f3fc # cld; rep movsq 4805 lea 0x58(%rax),%rax # adjust stack pointer 4806 4807 jmp .Lcommon_seh_tail 4808.size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler 4809 4810.type ctr_xts_se_handler,\@abi-omnipotent 4811.align 16 4812ctr_xts_se_handler: 4813 push %rsi 4814 push %rdi 4815 push %rbx 4816 push %rbp 4817 push %r12 4818 push %r13 4819 push %r14 4820 push %r15 4821 pushfq 4822 sub \$64,%rsp 4823 4824 mov 120($context),%rax # pull context->Rax 4825 mov 248($context),%rbx # pull context->Rip 4826 4827 mov 8($disp),%rsi # disp->ImageBase 4828 mov 56($disp),%r11 # disp->HandlerData 4829 4830 mov 0(%r11),%r10d # HandlerData[0] 4831 lea (%rsi,%r10),%r10 # prologue label 4832 cmp %r10,%rbx # context->Rip<prologue label 4833 jb .Lcommon_seh_tail 4834 4835 mov 152($context),%rax # pull context->Rsp 4836 4837 mov 4(%r11),%r10d # HandlerData[1] 4838 lea (%rsi,%r10),%r10 # epilogue label 4839 cmp %r10,%rbx # context->Rip>=epilogue label 4840 jae .Lcommon_seh_tail 4841 4842 mov 208($context),%rax # pull context->R11 4843 4844 lea -0xa8(%rax),%rsi # %xmm save area 4845 lea 512($context),%rdi # & context.Xmm6 4846 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4847 .long 0xa548f3fc # cld; rep movsq 4848 4849 mov -8(%rax),%rbp # restore saved %rbp 4850 mov %rbp,160($context) # restore context->Rbp 4851 jmp .Lcommon_seh_tail 4852.size ctr_xts_se_handler,.-ctr_xts_se_handler 4853 4854.type ocb_se_handler,\@abi-omnipotent 4855.align 16 4856ocb_se_handler: 4857 push %rsi 4858 push %rdi 4859 push %rbx 4860 push %rbp 4861 push %r12 4862 push %r13 4863 push %r14 4864 push %r15 4865 pushfq 4866 
sub \$64,%rsp 4867 4868 mov 120($context),%rax # pull context->Rax 4869 mov 248($context),%rbx # pull context->Rip 4870 4871 mov 8($disp),%rsi # disp->ImageBase 4872 mov 56($disp),%r11 # disp->HandlerData 4873 4874 mov 0(%r11),%r10d # HandlerData[0] 4875 lea (%rsi,%r10),%r10 # prologue label 4876 cmp %r10,%rbx # context->Rip<prologue label 4877 jb .Lcommon_seh_tail 4878 4879 mov 4(%r11),%r10d # HandlerData[1] 4880 lea (%rsi,%r10),%r10 # epilogue label 4881 cmp %r10,%rbx # context->Rip>=epilogue label 4882 jae .Lcommon_seh_tail 4883 4884 mov 8(%r11),%r10d # HandlerData[2] 4885 lea (%rsi,%r10),%r10 4886 cmp %r10,%rbx # context->Rip>=pop label 4887 jae .Locb_no_xmm 4888 4889 mov 152($context),%rax # pull context->Rsp 4890 4891 lea (%rax),%rsi # %xmm save area 4892 lea 512($context),%rdi # & context.Xmm6 4893 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4894 .long 0xa548f3fc # cld; rep movsq 4895 lea 0xa0+0x28(%rax),%rax 4896 4897.Locb_no_xmm: 4898 mov -8(%rax),%rbx 4899 mov -16(%rax),%rbp 4900 mov -24(%rax),%r12 4901 mov -32(%rax),%r13 4902 mov -40(%rax),%r14 4903 4904 mov %rbx,144($context) # restore context->Rbx 4905 mov %rbp,160($context) # restore context->Rbp 4906 mov %r12,216($context) # restore context->R12 4907 mov %r13,224($context) # restore context->R13 4908 mov %r14,232($context) # restore context->R14 4909 4910 jmp .Lcommon_seh_tail 4911.size ocb_se_handler,.-ocb_se_handler 4912___ 4913$code.=<<___; 4914.type cbc_se_handler,\@abi-omnipotent 4915.align 16 4916cbc_se_handler: 4917 push %rsi 4918 push %rdi 4919 push %rbx 4920 push %rbp 4921 push %r12 4922 push %r13 4923 push %r14 4924 push %r15 4925 pushfq 4926 sub \$64,%rsp 4927 4928 mov 152($context),%rax # pull context->Rsp 4929 mov 248($context),%rbx # pull context->Rip 4930 4931 lea .Lcbc_decrypt_bulk(%rip),%r10 4932 cmp %r10,%rbx # context->Rip<"prologue" label 4933 jb .Lcommon_seh_tail 4934 4935 mov 120($context),%rax # pull context->Rax 4936 4937 lea .Lcbc_decrypt_body(%rip),%r10 4938 cmp %r10,%rbx 
# context->Rip<cbc_decrypt_body 4939 jb .Lcommon_seh_tail 4940 4941 mov 152($context),%rax # pull context->Rsp 4942 4943 lea .Lcbc_ret(%rip),%r10 4944 cmp %r10,%rbx # context->Rip>="epilogue" label 4945 jae .Lcommon_seh_tail 4946 4947 lea 16(%rax),%rsi # %xmm save area 4948 lea 512($context),%rdi # &context.Xmm6 4949 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4950 .long 0xa548f3fc # cld; rep movsq 4951 4952 mov 208($context),%rax # pull context->R11 4953 4954 mov -8(%rax),%rbp # restore saved %rbp 4955 mov %rbp,160($context) # restore context->Rbp 4956 4957.Lcommon_seh_tail: 4958 mov 8(%rax),%rdi 4959 mov 16(%rax),%rsi 4960 mov %rax,152($context) # restore context->Rsp 4961 mov %rsi,168($context) # restore context->Rsi 4962 mov %rdi,176($context) # restore context->Rdi 4963 4964 mov 40($disp),%rdi # disp->ContextRecord 4965 mov $context,%rsi # context 4966 mov \$154,%ecx # sizeof(CONTEXT) 4967 .long 0xa548f3fc # cld; rep movsq 4968 4969 mov $disp,%rsi 4970 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 4971 mov 8(%rsi),%rdx # arg2, disp->ImageBase 4972 mov 0(%rsi),%r8 # arg3, disp->ControlPc 4973 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 4974 mov 40(%rsi),%r10 # disp->ContextRecord 4975 lea 56(%rsi),%r11 # &disp->HandlerData 4976 lea 24(%rsi),%r12 # &disp->EstablisherFrame 4977 mov %r10,32(%rsp) # arg5 4978 mov %r11,40(%rsp) # arg6 4979 mov %r12,48(%rsp) # arg7 4980 mov %rcx,56(%rsp) # arg8, (NULL) 4981 call *__imp_RtlVirtualUnwind(%rip) 4982 4983 mov \$1,%eax # ExceptionContinueSearch 4984 add \$64,%rsp 4985 popfq 4986 pop %r15 4987 pop %r14 4988 pop %r13 4989 pop %r12 4990 pop %rbp 4991 pop %rbx 4992 pop %rdi 4993 pop %rsi 4994 ret 4995.size cbc_se_handler,.-cbc_se_handler 4996 4997.section .pdata 4998.align 4 4999___ 5000$code.=<<___ if ($PREFIX eq "aesni"); 5001 .rva .LSEH_begin_aesni_ecb_encrypt 5002 .rva .LSEH_end_aesni_ecb_encrypt 5003 .rva .LSEH_info_ecb 5004 5005 .rva .LSEH_begin_aesni_ccm64_encrypt_blocks 5006 .rva .LSEH_end_aesni_ccm64_encrypt_blocks 5007 
.rva .LSEH_info_ccm64_enc 5008 5009 .rva .LSEH_begin_aesni_ccm64_decrypt_blocks 5010 .rva .LSEH_end_aesni_ccm64_decrypt_blocks 5011 .rva .LSEH_info_ccm64_dec 5012 5013 .rva .LSEH_begin_aesni_ctr32_encrypt_blocks 5014 .rva .LSEH_end_aesni_ctr32_encrypt_blocks 5015 .rva .LSEH_info_ctr32 5016 5017 .rva .LSEH_begin_aesni_xts_encrypt 5018 .rva .LSEH_end_aesni_xts_encrypt 5019 .rva .LSEH_info_xts_enc 5020 5021 .rva .LSEH_begin_aesni_xts_decrypt 5022 .rva .LSEH_end_aesni_xts_decrypt 5023 .rva .LSEH_info_xts_dec 5024 5025 .rva .LSEH_begin_aesni_ocb_encrypt 5026 .rva .LSEH_end_aesni_ocb_encrypt 5027 .rva .LSEH_info_ocb_enc 5028 5029 .rva .LSEH_begin_aesni_ocb_decrypt 5030 .rva .LSEH_end_aesni_ocb_decrypt 5031 .rva .LSEH_info_ocb_dec 5032___ 5033$code.=<<___; 5034 .rva .LSEH_begin_${PREFIX}_cbc_encrypt 5035 .rva .LSEH_end_${PREFIX}_cbc_encrypt 5036 .rva .LSEH_info_cbc 5037 5038 .rva ${PREFIX}_set_decrypt_key 5039 .rva .LSEH_end_set_decrypt_key 5040 .rva .LSEH_info_key 5041 5042 .rva ${PREFIX}_set_encrypt_key 5043 .rva .LSEH_end_set_encrypt_key 5044 .rva .LSEH_info_key 5045.section .xdata 5046.align 8 5047___ 5048$code.=<<___ if ($PREFIX eq "aesni"); 5049.LSEH_info_ecb: 5050 .byte 9,0,0,0 5051 .rva ecb_ccm64_se_handler 5052 .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[] 5053.LSEH_info_ccm64_enc: 5054 .byte 9,0,0,0 5055 .rva ecb_ccm64_se_handler 5056 .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[] 5057.LSEH_info_ccm64_dec: 5058 .byte 9,0,0,0 5059 .rva ecb_ccm64_se_handler 5060 .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[] 5061.LSEH_info_ctr32: 5062 .byte 9,0,0,0 5063 .rva ctr_xts_se_handler 5064 .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[] 5065.LSEH_info_xts_enc: 5066 .byte 9,0,0,0 5067 .rva ctr_xts_se_handler 5068 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] 5069.LSEH_info_xts_dec: 5070 .byte 9,0,0,0 5071 .rva ctr_xts_se_handler 5072 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] 5073.LSEH_info_ocb_enc: 5074 .byte 9,0,0,0 5075 .rva 
ocb_se_handler
	.rva	.Locb_enc_body,.Locb_enc_epilogue	# HandlerData[]
	.rva	.Locb_enc_pop
	.long	0
.LSEH_info_ocb_dec:
	.byte	9,0,0,0
	.rva	ocb_se_handler
	.rva	.Locb_dec_body,.Locb_dec_epilogue	# HandlerData[]
	.rva	.Locb_dec_pop
	.long	0
___
$code.=<<___;
.LSEH_info_cbc:
	.byte	9,0,0,0
	.rva	cbc_se_handler
.LSEH_info_key:
	.byte	0x01,0x04,0x01,0x00
	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
___
}

# rex(\@bytes,$reg,$rm)
#
# Appends a REX prefix to the byte list referenced by the first argument
# when either register number is 8 or above: bit 0x04 is set for $reg,
# bit 0x01 for $rm. Nothing is appended when both numbers are below 8.
sub rex {
  my ($bytes,$reg,$rm)=@_;
  my $flags=0;

    $flags|=0x04		if($reg>=8);
    $flags|=0x01		if($rm>=8);
    push @$bytes,0x40|$flags	if($flags);
}

# aesni($line)
#
# Hand-assembles one AES-NI instruction written in AT&T syntax and
# returns it as a ".byte" directive with comma-separated decimal values.
# Three operand shapes are recognised:
#
#	aeskeygenassist	$imm,%xmmN,%xmmM
#	aes*		%xmmN,%xmmM
#	aes*		disp(%rsp),%xmmM	(aesimc not supported here)
#
# A line matching none of the shapes is returned unchanged. A line that
# matches the second or third shape but carries a mnemonic missing from
# that shape's opcode table yields undef.
sub aesni {
  my ($insn)=@_;
  my @bytes=(0x66);			# every encoding starts with 0x66

    # immediate form: 66 [REX] 0F 3A DF /r ib
    if ($insn=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($imm,$src,$dst)=($2,$3,$4);

	rex(\@bytes,$dst,$src);
	push @bytes,0x0f,0x3a,0xdf;
	push @bytes,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	# a leading 0 marks hex/octal/binary notation, which oct() decodes
	push @bytes,($imm=~/^0/?oct($imm):$imm);
	return ".byte\t".join(',',@bytes);
    }

    # register-to-register form: 66 [REX] 0F 38 xx /r
    if ($insn=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$src,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);

	return undef if (!defined($opcodelet{$mnemonic}));
	rex(\@bytes,$dst,$src);
	push @bytes,0x0f,0x38,$opcodelet{$mnemonic};
	push @bytes,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@bytes);
    }

    # stack operand form: 66 [REX.R] 0F 38 xx /r SIB disp8
    if ($insn=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$disp,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);

	return undef if (!defined($opcodelet{$mnemonic}));
	push @bytes,0x44 if ($dst>=8);
	push @bytes,0x0f,0x38,$opcodelet{$mnemonic};
	push @bytes,0x44|(($dst&7)<<3),0x24;	# ModR/M
	# only the low 8 bits of the displacement are emitted
	push @bytes,($disp=~/^0/?oct($disp):$disp)&0xff;
	return ".byte\t".join(',',@bytes);
    }

    return $insn;
}

# movbe($disp)
#
# Returns the ".byte" directive that replaces "movbe %eax,DISP(%rsp)";
# the argument is appended verbatim as the final (displacement) byte.
sub movbe {
  my ($disp)=@_;

    return ".byte 0x0f,0x38,0xf1,0x44,0x24,".$disp;
}

# Post-process the accumulated assembly text before printing it:
# first evaluate every `...` expression as Perl and splice in the result,
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# then hand each AES-NI instruction to aesni() (any trailing text after
# the last %xmm operand on the line, such as a comment, is dropped),
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm;	# debugging artefact
# and finally hand-encode "movbe %eax,N(%rsp)" via movbe().
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;

print $code;

# buffered write errors only surface here, hence the check
close STDOUT or die "error closing STDOUT: $!";