#! /usr/bin/env perl
# Copyright 2009-2019 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
# details].
#
# Performance.
#
# Given aes(enc|dec) instructions' latency asymptotic performance for
# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
# processed with 128-bit key. And given their throughput asymptotic
# performance for parallelizable modes is 1.25 cycles per byte. Being
# asymptotic limit it's not something you commonly achieve in reality,
# but how close does one get? Below are results collected for
# different modes and block sizes. Pairs of numbers are for en-/
# decryption.
#
#       16-byte     64-byte     256-byte    1-KB        8-KB
# ECB   4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26   1.26/1.26
# CTR   5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
# CBC   4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
# CCM   5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
# OFB   5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
# CFB   5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
#
# ECB, CTR, CBC and CCM results are free from EVP overhead.
# This means
# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
# The results were collected with specially crafted speed.c benchmark
# in order to compare them with results reported in "Intel Advanced
# Encryption Standard (AES) New Instruction Set" White Paper Revision
# 3.0 dated May 2010. All above results are consistently better. This
# module also provides better performance for block sizes smaller than
# 128 bytes in points *not* represented in the above table.
#
# Looking at the results for 8-KB buffer.
#
# CFB and OFB results are far from the limit, because implementation
# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
# single-block aesni_encrypt, which is not the most optimal way to go.
# CBC encrypt result is unexpectedly high and there is no documented
# explanation for it. Seemingly there is a small penalty for feeding
# the result back to AES unit the way it's done in CBC mode. There is
# nothing one can do and the result appears optimal. CCM result is
# identical to CBC, because CBC-MAC is essentially CBC encrypt without
# saving output. CCM CTR "stays invisible," because it's neatly
# interleaved with CBC-MAC. This provides ~30% improvement over
# "straightforward" CCM implementation with CTR and CBC-MAC performed
# disjointly. Parallelizable modes practically achieve the theoretical
# limit.
#
# Looking at how results vary with buffer size.
#
# Curves are practically saturated at 1-KB buffer size. In most cases
# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
# CTR curve doesn't follow this pattern and is "slowest" changing one
# with "256-byte" result being 87% of "8-KB." This is because overhead
# in CTR mode is most computationally intensive.
# Small-block CCM
# decrypt is slower than encrypt, because first CTR and last CBC-MAC
# iterations can't be interleaved.
#
# Results for 192- and 256-bit keys.
#
# EVP-free results were observed to scale perfectly with number of
# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
# are a tad smaller, because the above mentioned penalty biases all
# results by same constant value. In similar way function call
# overhead affects small-block performance, as well as OFB and CFB
# results. Differences are not large, most common coefficients are
# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
# can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...

# January 2011
#
# While Westmere processor features 6 cycles latency for aes[enc|dec]
# instructions, which can be scheduled every second cycle, Sandy
# Bridge spends 8 cycles per instruction, but it can schedule them
# every cycle. This means that code targeting Westmere would perform
# suboptimally on Sandy Bridge. Therefore this update.
#
# In addition, non-parallelizable CBC encrypt (as well as CCM) is
# optimized. Relative improvement might appear modest, 8% on Westmere,
# but in absolute terms it's 3.77 cycles per byte encrypted with
# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
# should be compared to asymptotic limits of 3.75 for Westmere and
# 5.00 for Sandy Bridge. Actually, the fact that they get this close
# to asymptotic limits is quite amazing. Indeed, the limit is
# calculated as latency times number of rounds, 10 for 128-bit key,
# and divided by 16, the number of bytes in block, or in other words
# it accounts *solely* for aesenc instructions.
# But there are extra
# instructions, and numbers so close to the asymptotic limits mean
# that it's as if it takes as little as *one* additional cycle to
# execute all of them. How is it possible? It is possible thanks to
# out-of-order execution logic, which manages to overlap post-
# processing of previous block, things like saving the output, with
# actual encryption of current block, as well as pre-processing of
# current block, things like fetching input and xor-ing it with
# 0-round element of the key schedule, with actual encryption of
# previous block. Keep this in mind...
#
# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
# performance is achieved by interleaving instructions working on
# independent blocks. In which case asymptotic limit for such modes
# can be obtained by dividing above mentioned numbers by AES
# instructions' interleave factor. Westmere can execute at most 3
# instructions at a time, meaning that optimal interleave factor is 3,
# and that's where the "magic" number of 1.25 comes from. "Optimal
# interleave factor" means that increase of interleave factor does
# not improve performance. The formula has proven to reflect reality
# pretty well on Westmere... Sandy Bridge on the other hand can
# execute up to 8 AES instructions at a time, so how does varying
# interleave factor affect the performance? Here is table for ECB
# (numbers are cycles per byte processed with 128-bit key):
#
# instruction interleave factor         3x      6x      8x
# theoretical asymptotic limit          1.67    0.83    0.625
# measured performance for 8KB block    1.05    0.86    0.84
#
# "as if" interleave factor             4.7x    5.8x    6.0x
#
# Further data for other parallelizable modes:
#
# CBC decrypt                           1.16    0.93    0.74
# CTR                                   1.14    0.91    0.74
#
# Well, given 3x column it's probably inappropriate to call the limit
# asymptotic, if it can be surpassed, isn't it? What happens there?
# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
# magic is responsible for this. Processor overlaps not only the
# additional instructions with AES ones, but even AES instructions
# processing adjacent triplets of independent blocks. In the 6x case
# additional instructions still claim disproportionally small amount
# of additional cycles, but in 8x case number of instructions must be
# a tad too high for out-of-order logic to cope with, and AES unit
# remains underutilized... As you can see 8x interleave is hardly
# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
# utilizes 6x interleave because of limited register bank capacity.
#
# Higher interleave factors do have negative impact on Westmere
# performance. While for ECB mode it's negligible ~1.5%, other
# parallelizables perform ~5% worse, which is outweighed by ~25%
# improvement on Sandy Bridge. To balance regression on Westmere
# CTR mode was implemented with 6x aesenc interleave factor.

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
# in CTR mode AES instruction interleave factor was chosen to be 6x.

# November 2015
#
# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
# chosen to be 6x.

######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#               CBC en-/decrypt CTR     XTS     ECB     OCB
# Westmere      3.77/1.25       1.25    1.25    1.26
# * Bridge      5.07/0.74       0.75    0.90    0.85    0.98
# Haswell       4.44/0.63       0.63    0.73    0.63    0.70
# Skylake       2.62/0.63       0.63    0.63    0.63
# Silvermont    5.75/3.54       3.56    4.12    3.87(*) 4.11
# Knights L     2.54/0.77       0.78    0.85    -       1.50
# Goldmont      3.82/1.26       1.26    1.29    1.29    1.50
# Bulldozer     5.77/0.70       0.72    0.90    0.70    0.95
# Ryzen         2.71/0.35       0.35    0.44    0.38    0.49
#
# (*)   Atom Silvermont ECB result is suboptimal because of penalties
#       incurred by operations on %xmm8-15. As ECB is not considered
#       critical, nothing was done to mitigate the problem.

$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-x86_64.pl:-)

# Command line: [flavour] output. A first argument containing a dot is
# taken to be the output file name, in which case no flavour is set.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 argument order and %xmm save/restore code are selected either by
# an nasm/masm/mingw64 flavour or by an output name ending in ".asm".
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for x86_64-xlate.pl next to this script first, then in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator. Fail loudly if the pipe cannot be started instead of
# silently producing no output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Both arms currently select movups, so the key schedule is always
# loaded with the unaligned move regardless of $PREFIX.
$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

$code=".text\n";
$code.=".extern	OPENSSL_ia32cap_P\n";

$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
$inp="%rdi";
$out="%rsi";
$len="%rdx";
$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
$ivp="%r8";	# cbc, ctr, ...
$rnds_="%r10d";	# backup copy for $rounds
$key_="%r11";	# backup copy for $key

# %xmm register layout
$rndkey0="%xmm0";	$rndkey1="%xmm1";
$inout0="%xmm2";	$inout1="%xmm3";
$inout2="%xmm4";	$inout3="%xmm5";
$inout4="%xmm6";	$inout5="%xmm7";
$inout6="%xmm8";	$inout7="%xmm9";

$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
$in0="%xmm8";		$iv="%xmm9";

# Inline version of internal aesni_[en|de]crypt1.
#
# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
# cycles which take care of loop variables...
#
# aesni_generate1($p,$key,$rounds[,$inout[,$ivec]]) appends a
# single-block sequence to $code:
#   $p      - "enc" or "dec"; spliced into the aes${p}/aes${p}last
#             mnemonics and into the loop label;
#   $key    - register pointing at the key schedule; the emitted code
#             advances it;
#   $rounds - register holding the round count; the emitted code
#             decrements it down to zero;
#   $inout  - register holding the block, $inout0 when omitted;
#   $ivec   - optional register; when given, round key 0 is xor-ed
#             into it and the result is xor-ed into $inout, otherwise
#             round key 0 is xor-ed into $inout directly.
# $sn counts expansions so that every emitted loop label is unique.
{ my $sn;
sub aesni_generate1 {
my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
++$sn;
$code.=<<___;
	$movkey	($key),$rndkey0
	$movkey	16($key),$rndkey1
___
$code.=<<___ if (defined($ivec));
	xorps	$rndkey0,$ivec
	lea	32($key),$key
	xorps	$ivec,$inout
___
$code.=<<___ if (!defined($ivec));
	lea	32($key),$key
	xorps	$rndkey0,$inout
___
$code.=<<___;
.Loop_${p}1_$sn:
	aes${p}	$rndkey1,$inout
	dec	$rounds
	$movkey	($key),$rndkey1
	lea	16($key),$key
	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
	aes${p}last	$rndkey1,$inout
___
}}
# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
#
# The lexical $inp/$out/$key below take the first three entries of
# @_4args, i.e. the argument registers of the selected calling
# convention, and are visible only inside this scope.
{ my ($inp,$out,$key) = @_4args;

$code.=<<___;
.globl	${PREFIX}_encrypt
.type	${PREFIX}_encrypt,\@abi-omnipotent
.align	16
${PREFIX}_encrypt:
.cfi_startproc
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	pxor	$rndkey0,$rndkey0	# clear register bank
	pxor	$rndkey1,$rndkey1
	movups	$inout0,($out)		# output
	pxor	$inout0,$inout0
	ret
.cfi_endproc
.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt

.globl	${PREFIX}_decrypt
.type	${PREFIX}_decrypt,\@abi-omnipotent
.align	16
${PREFIX}_decrypt:
.cfi_startproc
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	pxor	$rndkey0,$rndkey0	# clear register bank
	pxor	$rndkey1,$rndkey1
	movups	$inout0,($out)		# output
	pxor	$inout0,$inout0
	ret
.cfi_endproc
.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
___
}

# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why 3x subroutine were originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge and "super-optimal" for other Intel CPUs...
#
# Every aesni_generateN($dir) below takes "enc" or "dec" and appends
# the routine _aesni_${dir}ryptN to $code. Register names come from
# the globals $key, $rounds, $rndkey[01] and $inout[0-7]. The emitted
# code turns the round count into a negative byte offset kept in %rax
# and walks the key schedule upwards from its end, alternating between
# $rndkey1 and $rndkey0, until the offset reaches zero.

sub aesni_generate2 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-1] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt2,\@abi-omnipotent
.align	16
_aesni_${dir}rypt2:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	add	\$16,%rax

.L${dir}_loop2:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop2

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	ret
.cfi_endproc
.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
___
}
sub aesni_generate3 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-2] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt3,\@abi-omnipotent
.align	16
_aesni_${dir}rypt3:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	add	\$16,%rax

.L${dir}_loop3:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop3

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	ret
.cfi_endproc
.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
___
}
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
sub aesni_generate4 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-3] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt4,\@abi-omnipotent
.align	16
_aesni_${dir}rypt4:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	xorps	$rndkey0,$inout3
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	.byte	0x0f,0x1f,0x00
	add	\$16,%rax

.L${dir}_loop4:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop4

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	ret
.cfi_endproc
.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
___
}
# The 6x routine starts the first round on $inout0-2 while the
# remaining blocks are still being xor-ed with round key 0, and then
# jumps into the middle of the loop body.
sub aesni_generate6 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-5] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt6,\@abi-omnipotent
.align	16
_aesni_${dir}rypt6:
.cfi_startproc
	$movkey		($key),$rndkey0
	shl		\$4,$rounds
	$movkey		16($key),$rndkey1
	xorps		$rndkey0,$inout0
	pxor		$rndkey0,$inout1
	pxor		$rndkey0,$inout2
	aes${dir}	$rndkey1,$inout0
	lea		32($key,$rounds),$key
	neg		%rax			# $rounds
	aes${dir}	$rndkey1,$inout1
	pxor		$rndkey0,$inout3
	pxor		$rndkey0,$inout4
	aes${dir}	$rndkey1,$inout2
	pxor		$rndkey0,$inout5
	$movkey		($key,%rax),$rndkey0
	add		\$16,%rax
	jmp		.L${dir}_loop6_enter
.align	16
.L${dir}_loop6:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
.L${dir}_loop6_enter:
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop6

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	ret
.cfi_endproc
.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
___
}
# The 8x routine follows the same scheme as the 6x one: the first
# round is started on $inout0-1 during the initial xor-ing and the
# loop is entered at .L${dir}_loop8_inner.
sub aesni_generate8 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-7] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt8,\@abi-omnipotent
.align	16
_aesni_${dir}rypt8:
.cfi_startproc
	$movkey		($key),$rndkey0
	shl		\$4,$rounds
	$movkey		16($key),$rndkey1
	xorps		$rndkey0,$inout0
	xorps		$rndkey0,$inout1
	pxor		$rndkey0,$inout2
	pxor		$rndkey0,$inout3
	pxor		$rndkey0,$inout4
	lea		32($key,$rounds),$key
	neg		%rax			# $rounds
	aes${dir}	$rndkey1,$inout0
	pxor		$rndkey0,$inout5
	pxor		$rndkey0,$inout6
	aes${dir}	$rndkey1,$inout1
	pxor		$rndkey0,$inout7
	$movkey		($key,%rax),$rndkey0
	add		\$16,%rax
	jmp		.L${dir}_loop8_inner
.align	16
.L${dir}_loop8:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
.L${dir}_loop8_inner:
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
.L${dir}_loop8_enter:
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	aes${dir}	$rndkey0,$inout6
	aes${dir}	$rndkey0,$inout7
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop8

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	aes${dir}last	$rndkey0,$inout6
	aes${dir}last	$rndkey0,$inout7
	ret
.cfi_endproc
.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
___
}
# The encrypt flavour of each interleaved routine is emitted only when
# $PREFIX is "aesni"; the decrypt flavour is emitted unconditionally.
&aesni_generate2("enc") if ($PREFIX eq "aesni");
&aesni_generate2("dec");
596&aesni_generate3("enc") if ($PREFIX eq "aesni"); 597&aesni_generate3("dec"); 598&aesni_generate4("enc") if ($PREFIX eq "aesni"); 599&aesni_generate4("dec"); 600&aesni_generate6("enc") if ($PREFIX eq "aesni"); 601&aesni_generate6("dec"); 602&aesni_generate8("enc") if ($PREFIX eq "aesni"); 603&aesni_generate8("dec"); 604 605if ($PREFIX eq "aesni") { 606######################################################################## 607# void aesni_ecb_encrypt (const void *in, void *out, 608# size_t length, const AES_KEY *key, 609# int enc); 610$code.=<<___; 611.globl aesni_ecb_encrypt 612.type aesni_ecb_encrypt,\@function,5 613.align 16 614aesni_ecb_encrypt: 615.cfi_startproc 616___ 617$code.=<<___ if ($win64); 618 lea -0x58(%rsp),%rsp 619 movaps %xmm6,(%rsp) # offload $inout4..7 620 movaps %xmm7,0x10(%rsp) 621 movaps %xmm8,0x20(%rsp) 622 movaps %xmm9,0x30(%rsp) 623.Lecb_enc_body: 624___ 625$code.=<<___; 626 and \$-16,$len # if ($len<16) 627 jz .Lecb_ret # return 628 629 mov 240($key),$rounds # key->rounds 630 $movkey ($key),$rndkey0 631 mov $key,$key_ # backup $key 632 mov $rounds,$rnds_ # backup $rounds 633 test %r8d,%r8d # 5th argument 634 jz .Lecb_decrypt 635#--------------------------- ECB ENCRYPT ------------------------------# 636 cmp \$0x80,$len # if ($len<8*16) 637 jb .Lecb_enc_tail # short input 638 639 movdqu ($inp),$inout0 # load 8 input blocks 640 movdqu 0x10($inp),$inout1 641 movdqu 0x20($inp),$inout2 642 movdqu 0x30($inp),$inout3 643 movdqu 0x40($inp),$inout4 644 movdqu 0x50($inp),$inout5 645 movdqu 0x60($inp),$inout6 646 movdqu 0x70($inp),$inout7 647 lea 0x80($inp),$inp # $inp+=8*16 648 sub \$0x80,$len # $len-=8*16 (can be zero) 649 jmp .Lecb_enc_loop8_enter 650.align 16 651.Lecb_enc_loop8: 652 movups $inout0,($out) # store 8 output blocks 653 mov $key_,$key # restore $key 654 movdqu ($inp),$inout0 # load 8 input blocks 655 mov $rnds_,$rounds # restore $rounds 656 movups $inout1,0x10($out) 657 movdqu 0x10($inp),$inout1 658 movups $inout2,0x20($out) 659 
movdqu 0x20($inp),$inout2 660 movups $inout3,0x30($out) 661 movdqu 0x30($inp),$inout3 662 movups $inout4,0x40($out) 663 movdqu 0x40($inp),$inout4 664 movups $inout5,0x50($out) 665 movdqu 0x50($inp),$inout5 666 movups $inout6,0x60($out) 667 movdqu 0x60($inp),$inout6 668 movups $inout7,0x70($out) 669 lea 0x80($out),$out # $out+=8*16 670 movdqu 0x70($inp),$inout7 671 lea 0x80($inp),$inp # $inp+=8*16 672.Lecb_enc_loop8_enter: 673 674 call _aesni_encrypt8 675 676 sub \$0x80,$len 677 jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow 678 679 movups $inout0,($out) # store 8 output blocks 680 mov $key_,$key # restore $key 681 movups $inout1,0x10($out) 682 mov $rnds_,$rounds # restore $rounds 683 movups $inout2,0x20($out) 684 movups $inout3,0x30($out) 685 movups $inout4,0x40($out) 686 movups $inout5,0x50($out) 687 movups $inout6,0x60($out) 688 movups $inout7,0x70($out) 689 lea 0x80($out),$out # $out+=8*16 690 add \$0x80,$len # restore real remaining $len 691 jz .Lecb_ret # done if ($len==0) 692 693.Lecb_enc_tail: # $len is less than 8*16 694 movups ($inp),$inout0 695 cmp \$0x20,$len 696 jb .Lecb_enc_one 697 movups 0x10($inp),$inout1 698 je .Lecb_enc_two 699 movups 0x20($inp),$inout2 700 cmp \$0x40,$len 701 jb .Lecb_enc_three 702 movups 0x30($inp),$inout3 703 je .Lecb_enc_four 704 movups 0x40($inp),$inout4 705 cmp \$0x60,$len 706 jb .Lecb_enc_five 707 movups 0x50($inp),$inout5 708 je .Lecb_enc_six 709 movdqu 0x60($inp),$inout6 710 xorps $inout7,$inout7 711 call _aesni_encrypt8 712 movups $inout0,($out) # store 7 output blocks 713 movups $inout1,0x10($out) 714 movups $inout2,0x20($out) 715 movups $inout3,0x30($out) 716 movups $inout4,0x40($out) 717 movups $inout5,0x50($out) 718 movups $inout6,0x60($out) 719 jmp .Lecb_ret 720.align 16 721.Lecb_enc_one: 722___ 723 &aesni_generate1("enc",$key,$rounds); 724$code.=<<___; 725 movups $inout0,($out) # store one output block 726 jmp .Lecb_ret 727.align 16 728.Lecb_enc_two: 729 call _aesni_encrypt2 730 movups $inout0,($out) # store 
2 output blocks 731 movups $inout1,0x10($out) 732 jmp .Lecb_ret 733.align 16 734.Lecb_enc_three: 735 call _aesni_encrypt3 736 movups $inout0,($out) # store 3 output blocks 737 movups $inout1,0x10($out) 738 movups $inout2,0x20($out) 739 jmp .Lecb_ret 740.align 16 741.Lecb_enc_four: 742 call _aesni_encrypt4 743 movups $inout0,($out) # store 4 output blocks 744 movups $inout1,0x10($out) 745 movups $inout2,0x20($out) 746 movups $inout3,0x30($out) 747 jmp .Lecb_ret 748.align 16 749.Lecb_enc_five: 750 xorps $inout5,$inout5 751 call _aesni_encrypt6 752 movups $inout0,($out) # store 5 output blocks 753 movups $inout1,0x10($out) 754 movups $inout2,0x20($out) 755 movups $inout3,0x30($out) 756 movups $inout4,0x40($out) 757 jmp .Lecb_ret 758.align 16 759.Lecb_enc_six: 760 call _aesni_encrypt6 761 movups $inout0,($out) # store 6 output blocks 762 movups $inout1,0x10($out) 763 movups $inout2,0x20($out) 764 movups $inout3,0x30($out) 765 movups $inout4,0x40($out) 766 movups $inout5,0x50($out) 767 jmp .Lecb_ret 768#--------------------------- ECB DECRYPT ------------------------------# 769.align 16 770.Lecb_decrypt: 771 cmp \$0x80,$len # if ($len<8*16) 772 jb .Lecb_dec_tail # short input 773 774 movdqu ($inp),$inout0 # load 8 input blocks 775 movdqu 0x10($inp),$inout1 776 movdqu 0x20($inp),$inout2 777 movdqu 0x30($inp),$inout3 778 movdqu 0x40($inp),$inout4 779 movdqu 0x50($inp),$inout5 780 movdqu 0x60($inp),$inout6 781 movdqu 0x70($inp),$inout7 782 lea 0x80($inp),$inp # $inp+=8*16 783 sub \$0x80,$len # $len-=8*16 (can be zero) 784 jmp .Lecb_dec_loop8_enter 785.align 16 786.Lecb_dec_loop8: 787 movups $inout0,($out) # store 8 output blocks 788 mov $key_,$key # restore $key 789 movdqu ($inp),$inout0 # load 8 input blocks 790 mov $rnds_,$rounds # restore $rounds 791 movups $inout1,0x10($out) 792 movdqu 0x10($inp),$inout1 793 movups $inout2,0x20($out) 794 movdqu 0x20($inp),$inout2 795 movups $inout3,0x30($out) 796 movdqu 0x30($inp),$inout3 797 movups $inout4,0x40($out) 798 movdqu 
0x40($inp),$inout4 799 movups $inout5,0x50($out) 800 movdqu 0x50($inp),$inout5 801 movups $inout6,0x60($out) 802 movdqu 0x60($inp),$inout6 803 movups $inout7,0x70($out) 804 lea 0x80($out),$out # $out+=8*16 805 movdqu 0x70($inp),$inout7 806 lea 0x80($inp),$inp # $inp+=8*16 807.Lecb_dec_loop8_enter: 808 809 call _aesni_decrypt8 810 811 $movkey ($key_),$rndkey0 812 sub \$0x80,$len 813 jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow 814 815 movups $inout0,($out) # store 8 output blocks 816 pxor $inout0,$inout0 # clear register bank 817 mov $key_,$key # restore $key 818 movups $inout1,0x10($out) 819 pxor $inout1,$inout1 820 mov $rnds_,$rounds # restore $rounds 821 movups $inout2,0x20($out) 822 pxor $inout2,$inout2 823 movups $inout3,0x30($out) 824 pxor $inout3,$inout3 825 movups $inout4,0x40($out) 826 pxor $inout4,$inout4 827 movups $inout5,0x50($out) 828 pxor $inout5,$inout5 829 movups $inout6,0x60($out) 830 pxor $inout6,$inout6 831 movups $inout7,0x70($out) 832 pxor $inout7,$inout7 833 lea 0x80($out),$out # $out+=8*16 834 add \$0x80,$len # restore real remaining $len 835 jz .Lecb_ret # done if ($len==0) 836 837.Lecb_dec_tail: 838 movups ($inp),$inout0 839 cmp \$0x20,$len 840 jb .Lecb_dec_one 841 movups 0x10($inp),$inout1 842 je .Lecb_dec_two 843 movups 0x20($inp),$inout2 844 cmp \$0x40,$len 845 jb .Lecb_dec_three 846 movups 0x30($inp),$inout3 847 je .Lecb_dec_four 848 movups 0x40($inp),$inout4 849 cmp \$0x60,$len 850 jb .Lecb_dec_five 851 movups 0x50($inp),$inout5 852 je .Lecb_dec_six 853 movups 0x60($inp),$inout6 854 $movkey ($key),$rndkey0 855 xorps $inout7,$inout7 856 call _aesni_decrypt8 857 movups $inout0,($out) # store 7 output blocks 858 pxor $inout0,$inout0 # clear register bank 859 movups $inout1,0x10($out) 860 pxor $inout1,$inout1 861 movups $inout2,0x20($out) 862 pxor $inout2,$inout2 863 movups $inout3,0x30($out) 864 pxor $inout3,$inout3 865 movups $inout4,0x40($out) 866 pxor $inout4,$inout4 867 movups $inout5,0x50($out) 868 pxor $inout5,$inout5 869 
movups $inout6,0x60($out) 870 pxor $inout6,$inout6 871 pxor $inout7,$inout7 872 jmp .Lecb_ret 873.align 16 874.Lecb_dec_one: 875___ 876 &aesni_generate1("dec",$key,$rounds); 877$code.=<<___; 878 movups $inout0,($out) # store one output block 879 pxor $inout0,$inout0 # clear register bank 880 jmp .Lecb_ret 881.align 16 882.Lecb_dec_two: 883 call _aesni_decrypt2 884 movups $inout0,($out) # store 2 output blocks 885 pxor $inout0,$inout0 # clear register bank 886 movups $inout1,0x10($out) 887 pxor $inout1,$inout1 888 jmp .Lecb_ret 889.align 16 890.Lecb_dec_three: 891 call _aesni_decrypt3 892 movups $inout0,($out) # store 3 output blocks 893 pxor $inout0,$inout0 # clear register bank 894 movups $inout1,0x10($out) 895 pxor $inout1,$inout1 896 movups $inout2,0x20($out) 897 pxor $inout2,$inout2 898 jmp .Lecb_ret 899.align 16 900.Lecb_dec_four: 901 call _aesni_decrypt4 902 movups $inout0,($out) # store 4 output blocks 903 pxor $inout0,$inout0 # clear register bank 904 movups $inout1,0x10($out) 905 pxor $inout1,$inout1 906 movups $inout2,0x20($out) 907 pxor $inout2,$inout2 908 movups $inout3,0x30($out) 909 pxor $inout3,$inout3 910 jmp .Lecb_ret 911.align 16 912.Lecb_dec_five: 913 xorps $inout5,$inout5 914 call _aesni_decrypt6 915 movups $inout0,($out) # store 5 output blocks 916 pxor $inout0,$inout0 # clear register bank 917 movups $inout1,0x10($out) 918 pxor $inout1,$inout1 919 movups $inout2,0x20($out) 920 pxor $inout2,$inout2 921 movups $inout3,0x30($out) 922 pxor $inout3,$inout3 923 movups $inout4,0x40($out) 924 pxor $inout4,$inout4 925 pxor $inout5,$inout5 926 jmp .Lecb_ret 927.align 16 928.Lecb_dec_six: 929 call _aesni_decrypt6 930 movups $inout0,($out) # store 6 output blocks 931 pxor $inout0,$inout0 # clear register bank 932 movups $inout1,0x10($out) 933 pxor $inout1,$inout1 934 movups $inout2,0x20($out) 935 pxor $inout2,$inout2 936 movups $inout3,0x30($out) 937 pxor $inout3,$inout3 938 movups $inout4,0x40($out) 939 pxor $inout4,$inout4 940 movups $inout5,0x50($out) 
pxor	$inout5,$inout5

.Lecb_ret:
	xorps	$rndkey0,$rndkey0		# %xmm0
	pxor	$rndkey1,$rndkey1
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	%xmm0,(%rsp)		# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	lea	0x58(%rsp),%rsp
.Lecb_enc_ret:
___
$code.=<<___;
	ret
.cfi_endproc
.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
___

{
######################################################################
# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec,char *cmac);
#
# Handles only complete blocks, operates on 64-bit counter and
# does not update *ivec! Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
# Both routines keep two AES streams in flight at once: $inout0 holds
# the counter block being encrypted and $inout1 holds the running MAC.
# The counter is kept byte-swapped in $iv so that paddq with
# .Lincrement64 can bump it; pshufb with .Lbswap_mask converts it back
# before it is fed to the cipher.
#
# Both functions are bracketed with .cfi_startproc/.cfi_endproc, like
# the other routines in this file.  The only stack adjustment is the
# Win64-only xmm save area, which carries no CFI annotation of its own
# (same pattern as aesni_ecb_encrypt).
#
{
my $cmac="%r9";	# 6th argument

my $increment="%xmm9";
my $iv="%xmm6";
my $bswap_mask="%xmm7";

$code.=<<___;
.globl	aesni_ccm64_encrypt_blocks
.type	aesni_ccm64_encrypt_blocks,\@function,6
.align	16
aesni_ccm64_encrypt_blocks:
.cfi_startproc
___
# Win64 only: %xmm6-%xmm9 are saved in a 0x58-byte frame before use.
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)		# $iv
	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
	movaps	%xmm8,0x20(%rsp)	# $in0
	movaps	%xmm9,0x30(%rsp)	# $increment
.Lccm64_enc_body:
___
# Encrypt body.
#  - Setup: $key_ keeps the start of the key schedule, $key is moved to
#    its end, and %r10 receives a negative byte offset ("twisted
#    $rounds") that .Lccm64_enc2_loop counts up towards zero in %rax.
#    NOTE(review): %rax/%r10 look like the 64-bit views of
#    $rounds/$rnds_, which are defined outside this chunk - confirm.
#  - .Lccm64_enc_outer: one block per iteration.  The round-0 key is
#    xor-ed into the counter, and input^round-0 key is xor-ed into the
#    MAC, so both streams enter the round loop whitened.
#  - .Lccm64_enc2_loop: two rounds per pass; the jnz consumes the flags
#    of the preceding "add \$32,%rax".
#  - After the loop, "dec $len" sets the flags used by the final
#    "jnz .Lccm64_enc_outer"; nothing in between may modify flags.
#  - On exit the MAC is written back to *cmac and the xmm registers
#    that held key or data material are zeroed.
$code.=<<___;
	mov	240($key),$rounds		# key->rounds
	movdqu	($ivp),$iv
	movdqa	.Lincrement64(%rip),$increment
	movdqa	.Lbswap_mask(%rip),$bswap_mask

	shl	\$4,$rounds
	mov	\$16,$rnds_
	lea	0($key),$key_
	movdqu	($cmac),$inout1
	movdqa	$iv,$inout0
	lea	32($key,$rounds),$key		# end of key schedule
	pshufb	$bswap_mask,$iv
	sub	%rax,%r10			# twisted $rounds
	jmp	.Lccm64_enc_outer
.align	16
.Lccm64_enc_outer:
	$movkey	($key_),$rndkey0
	mov	%r10,%rax
	movups	($inp),$in0			# load inp

	xorps	$rndkey0,$inout0		# counter
	$movkey	16($key_),$rndkey1
	xorps	$in0,$rndkey0
	xorps	$rndkey0,$inout1		# cmac^=inp
	$movkey	32($key_),$rndkey0

.Lccm64_enc2_loop:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Lccm64_enc2_loop
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	paddq	$increment,$iv
	dec	$len				# $len-- ($len is in blocks)
	aesenclast	$rndkey0,$inout0
	aesenclast	$rndkey0,$inout1

	lea	16($inp),$inp
	xorps	$inout0,$in0			# inp ^= E(iv)
	movdqa	$iv,$inout0
	movups	$in0,($out)			# save output
	pshufb	$bswap_mask,$inout0
	lea	16($out),$out			# $out+=16
	jnz	.Lccm64_enc_outer		# loop if ($len!=0)

	pxor	$rndkey0,$rndkey0		# clear register bank
	pxor	$rndkey1,$rndkey1
	pxor	$inout0,$inout0
	movups	$inout1,($cmac)			# store resulting mac
	pxor	$inout1,$inout1
	pxor	$in0,$in0
	pxor	$iv,$iv
___
# Win64 only: restore %xmm6-%xmm9 and overwrite each save slot with
# %xmm0, which was zeroed just above (it is $rndkey0).
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	%xmm0,(%rsp)			# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	lea	0x58(%rsp),%rsp
.Lccm64_enc_ret:
___
$code.=<<___;
	ret
.cfi_endproc
.size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
___
######################################################################
$code.=<<___;
.globl	aesni_ccm64_decrypt_blocks
.type	aesni_ccm64_decrypt_blocks,\@function,6
.align	16
aesni_ccm64_decrypt_blocks:
.cfi_startproc
___
# Win64 only: same xmm save area as in the encrypt routine.
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)		# $iv
	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
	movaps	%xmm8,0x20(%rsp)	# $in0
	movaps	%xmm9,0x30(%rsp)	# $increment
.Lccm64_dec_body:
___
# Decrypt prologue: load IV and MAC, keep copies of $key/$rounds in
# $key_/$rnds_, then encrypt the first counter block on its own.  The
# MAC is computed over the *output*, which does not exist yet for
# block 0, so there is nothing to pair the first counter with.
$code.=<<___;
	mov	240($key),$rounds		# key->rounds
	movups	($ivp),$iv
	movdqu	($cmac),$inout1
	movdqa	.Lincrement64(%rip),$increment
	movdqa	.Lbswap_mask(%rip),$bswap_mask

	movaps	$iv,$inout0
	mov	$rounds,$rnds_
	mov	$key,$key_
	pshufb	$bswap_mask,$iv
___
	# aesni_generate1 is defined outside this chunk; presumably it emits
	# a single-block AES pass over $inout0 - confirm against its definition.
	&aesni_generate1("enc",$key,$rounds);
# Decrypt main loop.
#  - .Lccm64_dec_outer: out = inp ^ E(counter); store it, prepare the
#    next counter in $inout0, and leave when $len reaches zero.
#  - Otherwise the fresh output block is folded into the MAC
#    ("cmac^=out") and .Lccm64_dec2_loop runs the MAC update in
#    lock-step with the encryption of the next counter block, using
#    the same twisted-%rax key indexing as the encrypt routine.
#  - .Lccm64_dec_break: the final MAC update has no counter to pair
#    with, so it is done by a stand-alone aesni_generate1 pass.
$code.=<<___;
	shl	\$4,$rnds_
	mov	\$16,$rounds
	movups	($inp),$in0			# load inp
	paddq	$increment,$iv
	lea	16($inp),$inp			# $inp+=16
	sub	%r10,%rax			# twisted $rounds
	lea	32($key_,$rnds_),$key		# end of key schedule
	mov	%rax,%r10
	jmp	.Lccm64_dec_outer
.align	16
.Lccm64_dec_outer:
	xorps	$inout0,$in0			# inp ^= E(iv)
	movdqa	$iv,$inout0
	movups	$in0,($out)			# save output
	lea	16($out),$out			# $out+=16
	pshufb	$bswap_mask,$inout0

	sub	\$1,$len			# $len-- ($len is in blocks)
	jz	.Lccm64_dec_break		# if ($len==0) break

	$movkey	($key_),$rndkey0
	mov	%r10,%rax
	$movkey	16($key_),$rndkey1
	xorps	$rndkey0,$in0
	xorps	$rndkey0,$inout0
	xorps	$in0,$inout1			# cmac^=out
	$movkey	32($key_),$rndkey0
	jmp	.Lccm64_dec2_loop
.align	16
.Lccm64_dec2_loop:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Lccm64_dec2_loop
	movups	($inp),$in0			# load input
	paddq	$increment,$iv
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenclast	$rndkey0,$inout0
	aesenclast	$rndkey0,$inout1
	lea	16($inp),$inp			# $inp+=16
	jmp	.Lccm64_dec_outer

.align	16
.Lccm64_dec_break:
	#xorps	$in0,$inout1			# cmac^=out
	mov	240($key_),$rounds
___
	# NOTE(review): the 5-argument form presumably folds $in0 into
	# $inout1 before encrypting (replacing the commented-out xorps
	# above) - confirm against aesni_generate1.
	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
# Write the MAC back to *cmac and zero the xmm registers used above.
$code.=<<___;
	pxor	$rndkey0,$rndkey0		# clear register bank
	pxor	$rndkey1,$rndkey1
	pxor	$inout0,$inout0
	movups	$inout1,($cmac)			# store resulting mac
	pxor	$inout1,$inout1
	pxor	$in0,$in0
	pxor	$iv,$iv
___
# Win64 only: restore %xmm6-%xmm9 and wipe the save slots with the
# already-zeroed %xmm0.
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	%xmm0,(%rsp)			# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	lea	0x58(%rsp),%rsp
.Lccm64_dec_ret:
___
$code.=<<___;
	ret
.cfi_endproc
.size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
___
}
######################################################################
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
# Keywords are full unroll and modulo-schedule counter calculations
# with zero-round key xor.
{
# Emits aesni_ctr32_encrypt_blocks.
#
# Shape of the generated routine:
#   - $len == 1: single block handled without a stack frame.
#   - otherwise (.Lctr32_bulk): a 16-byte-aligned frame is set up with
#     $key_ as frame pointer; the low 0x80 bytes of it hold eight
#     counter blocks already xor-ed with the round-0 key.
#   - $len >= 8: either .Lctr32_loop6 (6 blocks per iteration, selected
#     when the capability word reports MOVBE without XSAVE) or
#     .Lctr32_loop8 (8 blocks per iteration).
#   - remaining 1..7 blocks: .Lctr32_tail.
#   - .Lctr32_done: registers and the on-stack counter blocks are
#     zeroed before returning.
#
# $in0..$in5 are %xmm10..%xmm15, $key0 (%ebp) caches the last 32 bits
# of the round-0 key, and $ctr is the 32-bit view of $ivp, reused for
# the host-order counter once the IV has been loaded.
my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
my ($key0,$ctr)=("%ebp","${ivp}d");
my $frame_size = 0x80 + ($win64?160:0);	# 8 counter blocks [+ 10 xmm saves on Win64]

$code.=<<___;
.globl	aesni_ctr32_encrypt_blocks
.type	aesni_ctr32_encrypt_blocks,\@function,5
.align	16
aesni_ctr32_encrypt_blocks:
.cfi_startproc
	cmp	\$1,$len
	jne	.Lctr32_bulk

	# handle single block without allocating stack frame,
	# useful when handling edges
	movups	($ivp),$inout0
	movups	($inp),$inout1
	mov	240($key),%edx			# key->rounds
___
	# aesni_generate1 is defined outside this chunk; presumably it emits
	# a single-block AES pass over $inout0 - confirm against its definition.
	&aesni_generate1("enc",$key,"%edx");
# Finish the single-block path (out = inp ^ E(ivec), registers wiped),
# then open the bulk path: remember the incoming %rsp in $key_, save
# %rbp, carve out the frame and align it to 16 bytes.
$code.=<<___;
	pxor	$rndkey0,$rndkey0		# clear register bank
	pxor	$rndkey1,$rndkey1
	xorps	$inout1,$inout0
	pxor	$inout1,$inout1
	movups	$inout0,($out)
	xorps	$inout0,$inout0
	jmp	.Lctr32_epilogue

.align	16
.Lctr32_bulk:
	lea	(%rsp),$key_			# use $key_ as frame pointer
.cfi_def_cfa_register	$key_
	push	%rbp
.cfi_push	%rbp
	sub	\$$frame_size,%rsp
	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
___
# Win64 only: %xmm6-%xmm15 are saved at fixed negative offsets from
# the frame pointer.
$code.=<<___ if ($win64);
	movaps	%xmm6,-0xa8($key_)		# offload everything
	movaps	%xmm7,-0x98($key_)
	movaps	%xmm8,-0x88($key_)
	movaps	%xmm9,-0x78($key_)
	movaps	%xmm10,-0x68($key_)
	movaps	%xmm11,-0x58($key_)
	movaps	%xmm12,-0x48($key_)
	movaps	%xmm13,-0x38($key_)
	movaps	%xmm14,-0x28($key_)
	movaps	%xmm15,-0x18($key_)
.Lctr32_body:
___
# Counter setup, dispatch, the whole 6x loop and the first round of
# the 8x loop.
#  - ivec^round-0 key is replicated into the eight stack slots; the
#    last dword of slots 1..7 is then overwritten with
#    bswap(counter+N) ^ $key0.  %rdx is borrowed as scratch and
#    restored from %r10.
#  - The capability word at OPENSSL_ia32cap_P+4 is masked down to bits
#    26 and 22; equality with bit 22 alone selects .Lctr32_6x.
#  - $len is biased (-6 or -8) so the loops can test for borrow.
#  - .Lctr32_loop6 stores the next counters with movbe while the
#    current blocks are in flight and finishes the rounds via
#    .Lenc_loop6 (defined outside this chunk).  When it falls out with
#    blocks left, $key/$rounds are restored before .Lctr32_tail.
$code.=<<___;

	# 8 16-byte words on top of stack are counter values
	# xor-ed with zero-round key

	movdqu	($ivp),$inout0
	movdqu	($key),$rndkey0
	mov	12($ivp),$ctr			# counter LSB
	pxor	$rndkey0,$inout0
	mov	12($key),$key0			# 0-round key LSB
	movdqa	$inout0,0x00(%rsp)		# populate counter block
	bswap	$ctr
	movdqa	$inout0,$inout1
	movdqa	$inout0,$inout2
	movdqa	$inout0,$inout3
	movdqa	$inout0,0x40(%rsp)
	movdqa	$inout0,0x50(%rsp)
	movdqa	$inout0,0x60(%rsp)
	mov	%rdx,%r10			# about to borrow %rdx
	movdqa	$inout0,0x70(%rsp)

	lea	1($ctr),%rax
	lea	2($ctr),%rdx
	bswap	%eax
	bswap	%edx
	xor	$key0,%eax
	xor	$key0,%edx
	pinsrd	\$3,%eax,$inout1
	lea	3($ctr),%rax
	movdqa	$inout1,0x10(%rsp)
	pinsrd	\$3,%edx,$inout2
	bswap	%eax
	mov	%r10,%rdx			# restore %rdx
	lea	4($ctr),%r10
	movdqa	$inout2,0x20(%rsp)
	xor	$key0,%eax
	bswap	%r10d
	pinsrd	\$3,%eax,$inout3
	xor	$key0,%r10d
	movdqa	$inout3,0x30(%rsp)
	lea	5($ctr),%r9
	mov	%r10d,0x40+12(%rsp)
	bswap	%r9d
	lea	6($ctr),%r10
	mov	240($key),$rounds		# key->rounds
	xor	$key0,%r9d
	bswap	%r10d
	mov	%r9d,0x50+12(%rsp)
	xor	$key0,%r10d
	lea	7($ctr),%r9
	mov	%r10d,0x60+12(%rsp)
	bswap	%r9d
	mov	OPENSSL_ia32cap_P+4(%rip),%r10d
	xor	$key0,%r9d
	and	\$`1<<26|1<<22`,%r10d		# isolate XSAVE+MOVBE
	mov	%r9d,0x70+12(%rsp)

	$movkey	0x10($key),$rndkey1

	movdqa	0x40(%rsp),$inout4
	movdqa	0x50(%rsp),$inout5

	cmp	\$8,$len		# $len is in blocks
	jb	.Lctr32_tail		# short input if ($len<8)

	sub	\$6,$len		# $len is biased by -6
	cmp	\$`1<<22`,%r10d		# check for MOVBE without XSAVE
	je	.Lctr32_6x		# [which denotes Atom Silvermont]

	lea	0x80($key),$key		# size optimization
	sub	\$2,$len		# $len is biased by -8
	jmp	.Lctr32_loop8

.align	16
.Lctr32_6x:
	shl	\$4,$rounds
	mov	\$48,$rnds_
	bswap	$key0
	lea	32($key,$rounds),$key	# end of key schedule
	sub	%rax,%r10		# twisted $rounds
	jmp	.Lctr32_loop6

.align	16
.Lctr32_loop6:
	add	\$6,$ctr		# next counter value
	$movkey	-48($key,$rnds_),$rndkey0
	aesenc	$rndkey1,$inout0
	mov	$ctr,%eax
	xor	$key0,%eax
	aesenc	$rndkey1,$inout1
	movbe	%eax,`0x00+12`(%rsp)	# store next counter value
	lea	1($ctr),%eax
	aesenc	$rndkey1,$inout2
	xor	$key0,%eax
	movbe	%eax,`0x10+12`(%rsp)
	aesenc	$rndkey1,$inout3
	lea	2($ctr),%eax
	xor	$key0,%eax
	aesenc	$rndkey1,$inout4
	movbe	%eax,`0x20+12`(%rsp)
	lea	3($ctr),%eax
	aesenc	$rndkey1,$inout5
	$movkey	-32($key,$rnds_),$rndkey1
	xor	$key0,%eax

	aesenc	$rndkey0,$inout0
	movbe	%eax,`0x30+12`(%rsp)
	lea	4($ctr),%eax
	aesenc	$rndkey0,$inout1
	xor	$key0,%eax
	movbe	%eax,`0x40+12`(%rsp)
	aesenc	$rndkey0,$inout2
	lea	5($ctr),%eax
	xor	$key0,%eax
	aesenc	$rndkey0,$inout3
	movbe	%eax,`0x50+12`(%rsp)
	mov	%r10,%rax		# mov	$rnds_,$rounds
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	$movkey	-16($key,$rnds_),$rndkey0

	call	.Lenc_loop6

	movdqu	($inp),$inout6		# load 6 input blocks
	movdqu	0x10($inp),$inout7
	movdqu	0x20($inp),$in0
	movdqu	0x30($inp),$in1
	movdqu	0x40($inp),$in2
	movdqu	0x50($inp),$in3
	lea	0x60($inp),$inp		# $inp+=6*16
	$movkey	-64($key,$rnds_),$rndkey1
	pxor	$inout0,$inout6		# inp^=E(ctr)
	movaps	0x00(%rsp),$inout0	# load next counter [xor-ed with 0 round]
	pxor	$inout1,$inout7
	movaps	0x10(%rsp),$inout1
	pxor	$inout2,$in0
	movaps	0x20(%rsp),$inout2
	pxor	$inout3,$in1
	movaps	0x30(%rsp),$inout3
	pxor	$inout4,$in2
	movaps	0x40(%rsp),$inout4
	pxor	$inout5,$in3
	movaps	0x50(%rsp),$inout5
	movdqu	$inout6,($out)		# store 6 output blocks
	movdqu	$inout7,0x10($out)
	movdqu	$in0,0x20($out)
	movdqu	$in1,0x30($out)
	movdqu	$in2,0x40($out)
	movdqu	$in3,0x50($out)
	lea	0x60($out),$out		# $out+=6*16

	sub	\$6,$len
	jnc	.Lctr32_loop6		# loop if $len-=6 didn't borrow

	add	\$6,$len		# restore real remaining $len
	jz	.Lctr32_done		# done if ($len==0)

	lea	-48($rnds_),$rounds
	lea	-80($key,$rnds_),$key	# restore $key
	neg	$rounds
	shr	\$4,$rounds		# restore $rounds
	jmp	.Lctr32_tail

.align	32
.Lctr32_loop8:
	add	\$8,$ctr		# next counter value
	movdqa	0x60(%rsp),$inout6
	aesenc	$rndkey1,$inout0
	mov	$ctr,%r9d
	movdqa	0x70(%rsp),$inout7
	aesenc	$rndkey1,$inout1
	bswap	%r9d
	$movkey	0x20-0x80($key),$rndkey0
	aesenc	$rndkey1,$inout2
	xor	$key0,%r9d
	nop
	aesenc	$rndkey1,$inout3
	mov	%r9d,0x00+12(%rsp)	# store next counter value
	lea	1($ctr),%r9
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	$movkey	0x30-0x80($key),$rndkey1
___
# Six more rounds of the 8x loop are unrolled at generation time.
# Iteration $i applies the round key currently held in $rndkeyx
# ($rndkey1 for odd $i, $rndkey0 for even $i) to all eight blocks,
# stores the finished counter word for stack slot $i-1, starts
# computing counter+$i, and reloads $rndkeyx two round keys ahead.
for($i=2;$i<8;$i++) {
my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
$code.=<<___;
	bswap	%r9d
	aesenc	$rndkeyx,$inout0
	aesenc	$rndkeyx,$inout1
	xor	$key0,%r9d
	.byte	0x66,0x90
	aesenc	$rndkeyx,$inout2
	aesenc	$rndkeyx,$inout3
	mov	%r9d,`0x10*($i-1)`+12(%rsp)
	lea	$i($ctr),%r9
	aesenc	$rndkeyx,$inout4
	aesenc	$rndkeyx,$inout5
	aesenc	$rndkeyx,$inout6
	aesenc	$rndkeyx,$inout7
	$movkey	`0x20+0x10*$i`-0x80($key),$rndkeyx
___
}
# Remainder of the 8x loop, the tail paths and the common cleanup.
#  - The flags of "cmp \$11,$rounds" are consumed twice, by the jb and
#    later by the je, selecting how many extra round pairs run before
#    .Lctr32_enc_done; no instruction in between may modify flags.
#  - .Lctr32_enc_done xors the last round key into the *input* blocks
#    so that aesenclast yields inp^E(ctr) directly, while the next
#    eight counter blocks are reloaded from the stack.
#  - .Lctr32_tail: fewer than 4 blocks -> .Lctr32_loop3, exactly 4 ->
#    .Lctr32_loop4, otherwise seven keystream blocks are produced via
#    .Lenc_loop8_enter (defined outside this chunk) and only $len of
#    them are stored.  The cmp/jb/je triples there likewise share one
#    set of flags across the intervening SSE loads/stores.
$code.=<<___;
	bswap	%r9d
	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	xor	$key0,%r9d
	movdqu	0x00($inp),$in0		# start loading input
	aesenc	$rndkey0,$inout3
	mov	%r9d,0x70+12(%rsp)
	cmp	\$11,$rounds
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	aesenc	$rndkey0,$inout6
	aesenc	$rndkey0,$inout7
	$movkey	0xa0-0x80($key),$rndkey0

	jb	.Lctr32_enc_done

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	$movkey	0xb0-0x80($key),$rndkey1

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	aesenc	$rndkey0,$inout6
	aesenc	$rndkey0,$inout7
	$movkey	0xc0-0x80($key),$rndkey0
	je	.Lctr32_enc_done

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	$movkey	0xd0-0x80($key),$rndkey1

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	aesenc	$rndkey0,$inout6
	aesenc	$rndkey0,$inout7
	$movkey	0xe0-0x80($key),$rndkey0
	jmp	.Lctr32_enc_done

.align	16
.Lctr32_enc_done:
	movdqu	0x10($inp),$in1
	pxor	$rndkey0,$in0		# input^=round[last]
	movdqu	0x20($inp),$in2
	pxor	$rndkey0,$in1
	movdqu	0x30($inp),$in3
	pxor	$rndkey0,$in2
	movdqu	0x40($inp),$in4
	pxor	$rndkey0,$in3
	movdqu	0x50($inp),$in5
	pxor	$rndkey0,$in4
	pxor	$rndkey0,$in5
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	movdqu	0x60($inp),$rndkey1	# borrow $rndkey1 for inp[6]
	lea	0x80($inp),$inp		# $inp+=8*16

	aesenclast	$in0,$inout0		# $inN is inp[N]^round[last]
	pxor	$rndkey0,$rndkey1	# borrowed $rndkey
	movdqu	0x70-0x80($inp),$in0
	aesenclast	$in1,$inout1
	pxor	$rndkey0,$in0
	movdqa	0x00(%rsp),$in1		# load next counter block
	aesenclast	$in2,$inout2
	aesenclast	$in3,$inout3
	movdqa	0x10(%rsp),$in2
	movdqa	0x20(%rsp),$in3
	aesenclast	$in4,$inout4
	aesenclast	$in5,$inout5
	movdqa	0x30(%rsp),$in4
	movdqa	0x40(%rsp),$in5
	aesenclast	$rndkey1,$inout6
	movdqa	0x50(%rsp),$rndkey0
	$movkey	0x10-0x80($key),$rndkey1#real 1st-round key
	aesenclast	$in0,$inout7

	movups	$inout0,($out)		# store 8 output blocks
	movdqa	$in1,$inout0
	movups	$inout1,0x10($out)
	movdqa	$in2,$inout1
	movups	$inout2,0x20($out)
	movdqa	$in3,$inout2
	movups	$inout3,0x30($out)
	movdqa	$in4,$inout3
	movups	$inout4,0x40($out)
	movdqa	$in5,$inout4
	movups	$inout5,0x50($out)
	movdqa	$rndkey0,$inout5
	movups	$inout6,0x60($out)
	movups	$inout7,0x70($out)
	lea	0x80($out),$out		# $out+=8*16

	sub	\$8,$len
	jnc	.Lctr32_loop8		# loop if $len-=8 didn't borrow

	add	\$8,$len		# restore real remaining $len
	jz	.Lctr32_done		# done if ($len==0)
	lea	-0x80($key),$key

.Lctr32_tail:
	# note that at this point $inout0..5 are populated with
	# counter values xor-ed with 0-round key
	lea	16($key),$key
	cmp	\$4,$len
	jb	.Lctr32_loop3
	je	.Lctr32_loop4

	# if ($len>4) compute 7 E(counter)
	shl	\$4,$rounds
	movdqa	0x60(%rsp),$inout6
	pxor	$inout7,$inout7

	$movkey	16($key),$rndkey0
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	lea	32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
	neg	%rax
	aesenc	$rndkey1,$inout2
	add	\$16,%rax		# prepare for .Lenc_loop8_enter
	movups	($inp),$in0
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	movups	0x10($inp),$in1		# pre-load input
	movups	0x20($inp),$in2
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6

	call	.Lenc_loop8_enter

	movdqu	0x30($inp),$in3
	pxor	$in0,$inout0
	movdqu	0x40($inp),$in0
	pxor	$in1,$inout1
	movdqu	$inout0,($out)		# store output
	pxor	$in2,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$in3,$inout3
	movdqu	$inout2,0x20($out)
	pxor	$in0,$inout4
	movdqu	$inout3,0x30($out)
	movdqu	$inout4,0x40($out)
	cmp	\$6,$len
	jb	.Lctr32_done		# $len was 5, stop store

	movups	0x50($inp),$in1
	xorps	$in1,$inout5
	movups	$inout5,0x50($out)
	je	.Lctr32_done		# $len was 6, stop store

	movups	0x60($inp),$in2
	xorps	$in2,$inout6
	movups	$inout6,0x60($out)
	jmp	.Lctr32_done		# $len was 7, stop store

.align	32
.Lctr32_loop4:
	aesenc	$rndkey1,$inout0
	lea	16($key),$key
	dec	$rounds
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	$movkey	($key),$rndkey1
	jnz	.Lctr32_loop4
	aesenclast	$rndkey1,$inout0
	aesenclast	$rndkey1,$inout1
	movups	($inp),$in0		# load input
	movups	0x10($inp),$in1
	aesenclast	$rndkey1,$inout2
	aesenclast	$rndkey1,$inout3
	movups	0x20($inp),$in2
	movups	0x30($inp),$in3

	xorps	$in0,$inout0
	movups	$inout0,($out)		# store output
	xorps	$in1,$inout1
	movups	$inout1,0x10($out)
	pxor	$in2,$inout2
	movdqu	$inout2,0x20($out)
	pxor	$in3,$inout3
	movdqu	$inout3,0x30($out)
	jmp	.Lctr32_done		# $len was 4, stop store

.align	32
.Lctr32_loop3:
	aesenc	$rndkey1,$inout0
	lea	16($key),$key
	dec	$rounds
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	$movkey	($key),$rndkey1
	jnz	.Lctr32_loop3
	aesenclast	$rndkey1,$inout0
	aesenclast	$rndkey1,$inout1
	aesenclast	$rndkey1,$inout2

	movups	($inp),$in0		# load input
	xorps	$in0,$inout0
	movups	$inout0,($out)		# store output
	cmp	\$2,$len
	jb	.Lctr32_done		# $len was 1, stop store

	movups	0x10($inp),$in1
	xorps	$in1,$inout1
	movups	$inout1,0x10($out)
	je	.Lctr32_done		# $len was 2, stop store

	movups	0x20($inp),$in2
	xorps	$in2,$inout2
	movups	$inout2,0x20($out)	# $len was 3, stop store

.Lctr32_done:
	xorps	%xmm0,%xmm0		# clear register bank
	xor	$key0,$key0
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
# Non-Win64: no xmm registers were saved, so %xmm6-%xmm15 are simply
# zeroed, interleaved with wiping the eight on-stack counter blocks.
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	movaps	%xmm0,0x00(%rsp)	# clear stack
	pxor	%xmm8,%xmm8
	movaps	%xmm0,0x10(%rsp)
	pxor	%xmm9,%xmm9
	movaps	%xmm0,0x20(%rsp)
	pxor	%xmm10,%xmm10
	movaps	%xmm0,0x30(%rsp)
	pxor	%xmm11,%xmm11
	movaps	%xmm0,0x40(%rsp)
	pxor	%xmm12,%xmm12
	movaps	%xmm0,0x50(%rsp)
	pxor	%xmm13,%xmm13
	movaps	%xmm0,0x60(%rsp)
	pxor	%xmm14,%xmm14
	movaps	%xmm0,0x70(%rsp)
	pxor	%xmm15,%xmm15
___
# Win64: restore %xmm6-%xmm15 from the save area, overwrite each save
# slot with the zeroed %xmm0, then wipe the counter blocks as well.
$code.=<<___ if ($win64);
	movaps	-0xa8($key_),%xmm6
	movaps	%xmm0,-0xa8($key_)	# clear stack
	movaps	-0x98($key_),%xmm7
	movaps	%xmm0,-0x98($key_)
	movaps	-0x88($key_),%xmm8
	movaps	%xmm0,-0x88($key_)
	movaps	-0x78($key_),%xmm9
	movaps	%xmm0,-0x78($key_)
	movaps	-0x68($key_),%xmm10
	movaps	%xmm0,-0x68($key_)
	movaps	-0x58($key_),%xmm11
	movaps	%xmm0,-0x58($key_)
	movaps	-0x48($key_),%xmm12
	movaps	%xmm0,-0x48($key_)
	movaps	-0x38($key_),%xmm13
	movaps	%xmm0,-0x38($key_)
	movaps	-0x28($key_),%xmm14
	movaps	%xmm0,-0x28($key_)
	movaps	-0x18($key_),%xmm15
	movaps	%xmm0,-0x18($key_)
	movaps	%xmm0,0x00(%rsp)
	movaps	%xmm0,0x10(%rsp)
	movaps	%xmm0,0x20(%rsp)
	movaps	%xmm0,0x30(%rsp)
	movaps	%xmm0,0x40(%rsp)
	movaps	%xmm0,0x50(%rsp)
	movaps	%xmm0,0x60(%rsp)
	movaps	%xmm0,0x70(%rsp)
___
# Common epilogue: reload %rbp from just below the frame pointer and
# restore %rsp from it.  The single-block path joins at
# .Lctr32_epilogue, having never built a frame.
$code.=<<___;
	mov	-8($key_),%rbp
.cfi_restore	%rbp
	lea	($key_),%rsp
.cfi_def_cfa_register	%rsp
.Lctr32_epilogue:
	ret
.cfi_endproc
.size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
___
}

######################################################################
# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
{
my @tweak=map("%xmm$_",(10..15));
my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
my $frame_size = 0x70 + ($win64?160:0);
my $key_ =
"%rbp"; # override so that we can use %r11 as FP 1767 1768$code.=<<___; 1769.globl aesni_xts_encrypt 1770.type aesni_xts_encrypt,\@function,6 1771.align 16 1772aesni_xts_encrypt: 1773.cfi_startproc 1774 lea (%rsp),%r11 # frame pointer 1775.cfi_def_cfa_register %r11 1776 push %rbp 1777.cfi_push %rbp 1778 sub \$$frame_size,%rsp 1779 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 1780___ 1781$code.=<<___ if ($win64); 1782 movaps %xmm6,-0xa8(%r11) # offload everything 1783 movaps %xmm7,-0x98(%r11) 1784 movaps %xmm8,-0x88(%r11) 1785 movaps %xmm9,-0x78(%r11) 1786 movaps %xmm10,-0x68(%r11) 1787 movaps %xmm11,-0x58(%r11) 1788 movaps %xmm12,-0x48(%r11) 1789 movaps %xmm13,-0x38(%r11) 1790 movaps %xmm14,-0x28(%r11) 1791 movaps %xmm15,-0x18(%r11) 1792.Lxts_enc_body: 1793___ 1794$code.=<<___; 1795 movups ($ivp),$inout0 # load clear-text tweak 1796 mov 240(%r8),$rounds # key2->rounds 1797 mov 240($key),$rnds_ # key1->rounds 1798___ 1799 # generate the tweak 1800 &aesni_generate1("enc",$key2,$rounds,$inout0); 1801$code.=<<___; 1802 $movkey ($key),$rndkey0 # zero round key 1803 mov $key,$key_ # backup $key 1804 mov $rnds_,$rounds # backup $rounds 1805 shl \$4,$rnds_ 1806 mov $len,$len_ # backup $len 1807 and \$-16,$len 1808 1809 $movkey 16($key,$rnds_),$rndkey1 # last round key 1810 1811 movdqa .Lxts_magic(%rip),$twmask 1812 movdqa $inout0,@tweak[5] 1813 pshufd \$0x5f,$inout0,$twres 1814 pxor $rndkey0,$rndkey1 1815___ 1816 # alternative tweak calculation algorithm is based on suggestions 1817 # by Shay Gueron. psrad doesn't conflict with AES-NI instructions 1818 # and should help in the future... 
1819 for ($i=0;$i<4;$i++) { 1820 $code.=<<___; 1821 movdqa $twres,$twtmp 1822 paddd $twres,$twres 1823 movdqa @tweak[5],@tweak[$i] 1824 psrad \$31,$twtmp # broadcast upper bits 1825 paddq @tweak[5],@tweak[5] 1826 pand $twmask,$twtmp 1827 pxor $rndkey0,@tweak[$i] 1828 pxor $twtmp,@tweak[5] 1829___ 1830 } 1831$code.=<<___; 1832 movdqa @tweak[5],@tweak[4] 1833 psrad \$31,$twres 1834 paddq @tweak[5],@tweak[5] 1835 pand $twmask,$twres 1836 pxor $rndkey0,@tweak[4] 1837 pxor $twres,@tweak[5] 1838 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] 1839 1840 sub \$16*6,$len 1841 jc .Lxts_enc_short # if $len-=6*16 borrowed 1842 1843 mov \$16+96,$rounds 1844 lea 32($key_,$rnds_),$key # end of key schedule 1845 sub %r10,%rax # twisted $rounds 1846 $movkey 16($key_),$rndkey1 1847 mov %rax,%r10 # backup twisted $rounds 1848 lea .Lxts_magic(%rip),%r8 1849 jmp .Lxts_enc_grandloop 1850 1851.align 32 1852.Lxts_enc_grandloop: 1853 movdqu `16*0`($inp),$inout0 # load input 1854 movdqa $rndkey0,$twmask 1855 movdqu `16*1`($inp),$inout1 1856 pxor @tweak[0],$inout0 # input^=tweak^round[0] 1857 movdqu `16*2`($inp),$inout2 1858 pxor @tweak[1],$inout1 1859 aesenc $rndkey1,$inout0 1860 movdqu `16*3`($inp),$inout3 1861 pxor @tweak[2],$inout2 1862 aesenc $rndkey1,$inout1 1863 movdqu `16*4`($inp),$inout4 1864 pxor @tweak[3],$inout3 1865 aesenc $rndkey1,$inout2 1866 movdqu `16*5`($inp),$inout5 1867 pxor @tweak[5],$twmask # round[0]^=tweak[5] 1868 movdqa 0x60(%rsp),$twres # load round[0]^round[last] 1869 pxor @tweak[4],$inout4 1870 aesenc $rndkey1,$inout3 1871 $movkey 32($key_),$rndkey0 1872 lea `16*6`($inp),$inp 1873 pxor $twmask,$inout5 1874 1875 pxor $twres,@tweak[0] # calculate tweaks^round[last] 1876 aesenc $rndkey1,$inout4 1877 pxor $twres,@tweak[1] 1878 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last] 1879 aesenc $rndkey1,$inout5 1880 $movkey 48($key_),$rndkey1 1881 pxor $twres,@tweak[2] 1882 1883 aesenc $rndkey0,$inout0 1884 pxor $twres,@tweak[3] 1885 movdqa 
@tweak[1],`16*1`(%rsp) 1886 aesenc $rndkey0,$inout1 1887 pxor $twres,@tweak[4] 1888 movdqa @tweak[2],`16*2`(%rsp) 1889 aesenc $rndkey0,$inout2 1890 aesenc $rndkey0,$inout3 1891 pxor $twres,$twmask 1892 movdqa @tweak[4],`16*4`(%rsp) 1893 aesenc $rndkey0,$inout4 1894 aesenc $rndkey0,$inout5 1895 $movkey 64($key_),$rndkey0 1896 movdqa $twmask,`16*5`(%rsp) 1897 pshufd \$0x5f,@tweak[5],$twres 1898 jmp .Lxts_enc_loop6 1899.align 32 1900.Lxts_enc_loop6: 1901 aesenc $rndkey1,$inout0 1902 aesenc $rndkey1,$inout1 1903 aesenc $rndkey1,$inout2 1904 aesenc $rndkey1,$inout3 1905 aesenc $rndkey1,$inout4 1906 aesenc $rndkey1,$inout5 1907 $movkey -64($key,%rax),$rndkey1 1908 add \$32,%rax 1909 1910 aesenc $rndkey0,$inout0 1911 aesenc $rndkey0,$inout1 1912 aesenc $rndkey0,$inout2 1913 aesenc $rndkey0,$inout3 1914 aesenc $rndkey0,$inout4 1915 aesenc $rndkey0,$inout5 1916 $movkey -80($key,%rax),$rndkey0 1917 jnz .Lxts_enc_loop6 1918 1919 movdqa (%r8),$twmask # start calculating next tweak 1920 movdqa $twres,$twtmp 1921 paddd $twres,$twres 1922 aesenc $rndkey1,$inout0 1923 paddq @tweak[5],@tweak[5] 1924 psrad \$31,$twtmp 1925 aesenc $rndkey1,$inout1 1926 pand $twmask,$twtmp 1927 $movkey ($key_),@tweak[0] # load round[0] 1928 aesenc $rndkey1,$inout2 1929 aesenc $rndkey1,$inout3 1930 aesenc $rndkey1,$inout4 1931 pxor $twtmp,@tweak[5] 1932 movaps @tweak[0],@tweak[1] # copy round[0] 1933 aesenc $rndkey1,$inout5 1934 $movkey -64($key),$rndkey1 1935 1936 movdqa $twres,$twtmp 1937 aesenc $rndkey0,$inout0 1938 paddd $twres,$twres 1939 pxor @tweak[5],@tweak[0] 1940 aesenc $rndkey0,$inout1 1941 psrad \$31,$twtmp 1942 paddq @tweak[5],@tweak[5] 1943 aesenc $rndkey0,$inout2 1944 aesenc $rndkey0,$inout3 1945 pand $twmask,$twtmp 1946 movaps @tweak[1],@tweak[2] 1947 aesenc $rndkey0,$inout4 1948 pxor $twtmp,@tweak[5] 1949 movdqa $twres,$twtmp 1950 aesenc $rndkey0,$inout5 1951 $movkey -48($key),$rndkey0 1952 1953 paddd $twres,$twres 1954 aesenc $rndkey1,$inout0 1955 pxor @tweak[5],@tweak[1] 1956 psrad 
\$31,$twtmp 1957 aesenc $rndkey1,$inout1 1958 paddq @tweak[5],@tweak[5] 1959 pand $twmask,$twtmp 1960 aesenc $rndkey1,$inout2 1961 aesenc $rndkey1,$inout3 1962 movdqa @tweak[3],`16*3`(%rsp) 1963 pxor $twtmp,@tweak[5] 1964 aesenc $rndkey1,$inout4 1965 movaps @tweak[2],@tweak[3] 1966 movdqa $twres,$twtmp 1967 aesenc $rndkey1,$inout5 1968 $movkey -32($key),$rndkey1 1969 1970 paddd $twres,$twres 1971 aesenc $rndkey0,$inout0 1972 pxor @tweak[5],@tweak[2] 1973 psrad \$31,$twtmp 1974 aesenc $rndkey0,$inout1 1975 paddq @tweak[5],@tweak[5] 1976 pand $twmask,$twtmp 1977 aesenc $rndkey0,$inout2 1978 aesenc $rndkey0,$inout3 1979 aesenc $rndkey0,$inout4 1980 pxor $twtmp,@tweak[5] 1981 movaps @tweak[3],@tweak[4] 1982 aesenc $rndkey0,$inout5 1983 1984 movdqa $twres,$rndkey0 1985 paddd $twres,$twres 1986 aesenc $rndkey1,$inout0 1987 pxor @tweak[5],@tweak[3] 1988 psrad \$31,$rndkey0 1989 aesenc $rndkey1,$inout1 1990 paddq @tweak[5],@tweak[5] 1991 pand $twmask,$rndkey0 1992 aesenc $rndkey1,$inout2 1993 aesenc $rndkey1,$inout3 1994 pxor $rndkey0,@tweak[5] 1995 $movkey ($key_),$rndkey0 1996 aesenc $rndkey1,$inout4 1997 aesenc $rndkey1,$inout5 1998 $movkey 16($key_),$rndkey1 1999 2000 pxor @tweak[5],@tweak[4] 2001 aesenclast `16*0`(%rsp),$inout0 2002 psrad \$31,$twres 2003 paddq @tweak[5],@tweak[5] 2004 aesenclast `16*1`(%rsp),$inout1 2005 aesenclast `16*2`(%rsp),$inout2 2006 pand $twmask,$twres 2007 mov %r10,%rax # restore $rounds 2008 aesenclast `16*3`(%rsp),$inout3 2009 aesenclast `16*4`(%rsp),$inout4 2010 aesenclast `16*5`(%rsp),$inout5 2011 pxor $twres,@tweak[5] 2012 2013 lea `16*6`($out),$out # $out+=6*16 2014 movups $inout0,`-16*6`($out) # store 6 output blocks 2015 movups $inout1,`-16*5`($out) 2016 movups $inout2,`-16*4`($out) 2017 movups $inout3,`-16*3`($out) 2018 movups $inout4,`-16*2`($out) 2019 movups $inout5,`-16*1`($out) 2020 sub \$16*6,$len 2021 jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow 2022 2023 mov \$16+96,$rounds 2024 sub $rnds_,$rounds 2025 mov 
$key_,$key # restore $key 2026 shr \$4,$rounds # restore original value 2027 2028.Lxts_enc_short: 2029 # at the point @tweak[0..5] are populated with tweak values 2030 mov $rounds,$rnds_ # backup $rounds 2031 pxor $rndkey0,@tweak[0] 2032 add \$16*6,$len # restore real remaining $len 2033 jz .Lxts_enc_done # done if ($len==0) 2034 2035 pxor $rndkey0,@tweak[1] 2036 cmp \$0x20,$len 2037 jb .Lxts_enc_one # $len is 1*16 2038 pxor $rndkey0,@tweak[2] 2039 je .Lxts_enc_two # $len is 2*16 2040 2041 pxor $rndkey0,@tweak[3] 2042 cmp \$0x40,$len 2043 jb .Lxts_enc_three # $len is 3*16 2044 pxor $rndkey0,@tweak[4] 2045 je .Lxts_enc_four # $len is 4*16 2046 2047 movdqu ($inp),$inout0 # $len is 5*16 2048 movdqu 16*1($inp),$inout1 2049 movdqu 16*2($inp),$inout2 2050 pxor @tweak[0],$inout0 2051 movdqu 16*3($inp),$inout3 2052 pxor @tweak[1],$inout1 2053 movdqu 16*4($inp),$inout4 2054 lea 16*5($inp),$inp # $inp+=5*16 2055 pxor @tweak[2],$inout2 2056 pxor @tweak[3],$inout3 2057 pxor @tweak[4],$inout4 2058 pxor $inout5,$inout5 2059 2060 call _aesni_encrypt6 2061 2062 xorps @tweak[0],$inout0 2063 movdqa @tweak[5],@tweak[0] 2064 xorps @tweak[1],$inout1 2065 xorps @tweak[2],$inout2 2066 movdqu $inout0,($out) # store 5 output blocks 2067 xorps @tweak[3],$inout3 2068 movdqu $inout1,16*1($out) 2069 xorps @tweak[4],$inout4 2070 movdqu $inout2,16*2($out) 2071 movdqu $inout3,16*3($out) 2072 movdqu $inout4,16*4($out) 2073 lea 16*5($out),$out # $out+=5*16 2074 jmp .Lxts_enc_done 2075 2076.align 16 2077.Lxts_enc_one: 2078 movups ($inp),$inout0 2079 lea 16*1($inp),$inp # inp+=1*16 2080 xorps @tweak[0],$inout0 2081___ 2082 &aesni_generate1("enc",$key,$rounds); 2083$code.=<<___; 2084 xorps @tweak[0],$inout0 2085 movdqa @tweak[1],@tweak[0] 2086 movups $inout0,($out) # store one output block 2087 lea 16*1($out),$out # $out+=1*16 2088 jmp .Lxts_enc_done 2089 2090.align 16 2091.Lxts_enc_two: 2092 movups ($inp),$inout0 2093 movups 16($inp),$inout1 2094 lea 32($inp),$inp # $inp+=2*16 2095 xorps 
@tweak[0],$inout0 2096 xorps @tweak[1],$inout1 2097 2098 call _aesni_encrypt2 2099 2100 xorps @tweak[0],$inout0 2101 movdqa @tweak[2],@tweak[0] 2102 xorps @tweak[1],$inout1 2103 movups $inout0,($out) # store 2 output blocks 2104 movups $inout1,16*1($out) 2105 lea 16*2($out),$out # $out+=2*16 2106 jmp .Lxts_enc_done 2107 2108.align 16 2109.Lxts_enc_three: 2110 movups ($inp),$inout0 2111 movups 16*1($inp),$inout1 2112 movups 16*2($inp),$inout2 2113 lea 16*3($inp),$inp # $inp+=3*16 2114 xorps @tweak[0],$inout0 2115 xorps @tweak[1],$inout1 2116 xorps @tweak[2],$inout2 2117 2118 call _aesni_encrypt3 2119 2120 xorps @tweak[0],$inout0 2121 movdqa @tweak[3],@tweak[0] 2122 xorps @tweak[1],$inout1 2123 xorps @tweak[2],$inout2 2124 movups $inout0,($out) # store 3 output blocks 2125 movups $inout1,16*1($out) 2126 movups $inout2,16*2($out) 2127 lea 16*3($out),$out # $out+=3*16 2128 jmp .Lxts_enc_done 2129 2130.align 16 2131.Lxts_enc_four: 2132 movups ($inp),$inout0 2133 movups 16*1($inp),$inout1 2134 movups 16*2($inp),$inout2 2135 xorps @tweak[0],$inout0 2136 movups 16*3($inp),$inout3 2137 lea 16*4($inp),$inp # $inp+=4*16 2138 xorps @tweak[1],$inout1 2139 xorps @tweak[2],$inout2 2140 xorps @tweak[3],$inout3 2141 2142 call _aesni_encrypt4 2143 2144 pxor @tweak[0],$inout0 2145 movdqa @tweak[4],@tweak[0] 2146 pxor @tweak[1],$inout1 2147 pxor @tweak[2],$inout2 2148 movdqu $inout0,($out) # store 4 output blocks 2149 pxor @tweak[3],$inout3 2150 movdqu $inout1,16*1($out) 2151 movdqu $inout2,16*2($out) 2152 movdqu $inout3,16*3($out) 2153 lea 16*4($out),$out # $out+=4*16 2154 jmp .Lxts_enc_done 2155 2156.align 16 2157.Lxts_enc_done: 2158 and \$15,$len_ # see if $len%16 is 0 2159 jz .Lxts_enc_ret 2160 mov $len_,$len 2161 2162.Lxts_enc_steal: 2163 movzb ($inp),%eax # borrow $rounds ... 2164 movzb -16($out),%ecx # ... 
and $key 2165 lea 1($inp),$inp 2166 mov %al,-16($out) 2167 mov %cl,0($out) 2168 lea 1($out),$out 2169 sub \$1,$len 2170 jnz .Lxts_enc_steal 2171 2172 sub $len_,$out # rewind $out 2173 mov $key_,$key # restore $key 2174 mov $rnds_,$rounds # restore $rounds 2175 2176 movups -16($out),$inout0 2177 xorps @tweak[0],$inout0 2178___ 2179 &aesni_generate1("enc",$key,$rounds); 2180$code.=<<___; 2181 xorps @tweak[0],$inout0 2182 movups $inout0,-16($out) 2183 2184.Lxts_enc_ret: 2185 xorps %xmm0,%xmm0 # clear register bank 2186 pxor %xmm1,%xmm1 2187 pxor %xmm2,%xmm2 2188 pxor %xmm3,%xmm3 2189 pxor %xmm4,%xmm4 2190 pxor %xmm5,%xmm5 2191___ 2192$code.=<<___ if (!$win64); 2193 pxor %xmm6,%xmm6 2194 pxor %xmm7,%xmm7 2195 movaps %xmm0,0x00(%rsp) # clear stack 2196 pxor %xmm8,%xmm8 2197 movaps %xmm0,0x10(%rsp) 2198 pxor %xmm9,%xmm9 2199 movaps %xmm0,0x20(%rsp) 2200 pxor %xmm10,%xmm10 2201 movaps %xmm0,0x30(%rsp) 2202 pxor %xmm11,%xmm11 2203 movaps %xmm0,0x40(%rsp) 2204 pxor %xmm12,%xmm12 2205 movaps %xmm0,0x50(%rsp) 2206 pxor %xmm13,%xmm13 2207 movaps %xmm0,0x60(%rsp) 2208 pxor %xmm14,%xmm14 2209 pxor %xmm15,%xmm15 2210___ 2211$code.=<<___ if ($win64); 2212 movaps -0xa8(%r11),%xmm6 2213 movaps %xmm0,-0xa8(%r11) # clear stack 2214 movaps -0x98(%r11),%xmm7 2215 movaps %xmm0,-0x98(%r11) 2216 movaps -0x88(%r11),%xmm8 2217 movaps %xmm0,-0x88(%r11) 2218 movaps -0x78(%r11),%xmm9 2219 movaps %xmm0,-0x78(%r11) 2220 movaps -0x68(%r11),%xmm10 2221 movaps %xmm0,-0x68(%r11) 2222 movaps -0x58(%r11),%xmm11 2223 movaps %xmm0,-0x58(%r11) 2224 movaps -0x48(%r11),%xmm12 2225 movaps %xmm0,-0x48(%r11) 2226 movaps -0x38(%r11),%xmm13 2227 movaps %xmm0,-0x38(%r11) 2228 movaps -0x28(%r11),%xmm14 2229 movaps %xmm0,-0x28(%r11) 2230 movaps -0x18(%r11),%xmm15 2231 movaps %xmm0,-0x18(%r11) 2232 movaps %xmm0,0x00(%rsp) 2233 movaps %xmm0,0x10(%rsp) 2234 movaps %xmm0,0x20(%rsp) 2235 movaps %xmm0,0x30(%rsp) 2236 movaps %xmm0,0x40(%rsp) 2237 movaps %xmm0,0x50(%rsp) 2238 movaps %xmm0,0x60(%rsp) 2239___ 
$code.=<<___;
	mov	-8(%r11),%rbp
.cfi_restore	%rbp
	lea	(%r11),%rsp
.cfi_def_cfa_register	%rsp
.Lxts_enc_epilogue:
	ret
.cfi_endproc
.size	aesni_xts_encrypt,.-aesni_xts_encrypt
___

# aesni_xts_decrypt
#
# Flow of the code emitted below:
#  1. Set up a 16-byte aligned stack frame that is addressed through %r11;
#     on Win64 xmm6-xmm15 are saved into it.
#  2. Load the clear-text tweak and encrypt it with key2 (aesni_generate1
#     in "enc" mode).
#  3. If the length is not a multiple of 16, hold back one extra block so
#     that the tail is handled by the stealing code at .Lxts_dec_done2.
#  4. Derive tweak[0..5] (each pre-xored with round[0]) and run
#     .Lxts_dec_grandloop: 6 blocks per iteration, with the next six tweaks
#     computed in between the AES rounds.
#  5. .Lxts_dec_short: handle the remaining 1..5 whole blocks.
#  6. .Lxts_dec_done2/.Lxts_dec_steal: handle a trailing partial block.
#  7. .Lxts_dec_ret: zero xmm registers and the stack scratch area, restore
#     saved registers and return.
#
# NOTE(review): $frame_size, $ivp, $key2, .Lxts_magic and the register
# aliases ($rounds, $rnds_, $key_, $len_, @tweak, ...) are defined outside
# this section.  %rax/%r10 are used directly where the comments talk about
# $rounds and its backup -- presumably those are the registers behind
# $rounds/$rnds_; confirm against the declarations.
$code.=<<___;
.globl	aesni_xts_decrypt
.type	aesni_xts_decrypt,\@function,6
.align	16
aesni_xts_decrypt:
.cfi_startproc
	lea	(%rsp),%r11			# frame pointer
.cfi_def_cfa_register	%r11
	push	%rbp
.cfi_push	%rbp
	sub	\$$frame_size,%rsp
	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
	movaps	%xmm6,-0xa8(%r11)		# offload everything
	movaps	%xmm7,-0x98(%r11)
	movaps	%xmm8,-0x88(%r11)
	movaps	%xmm9,-0x78(%r11)
	movaps	%xmm10,-0x68(%r11)
	movaps	%xmm11,-0x58(%r11)
	movaps	%xmm12,-0x48(%r11)
	movaps	%xmm13,-0x38(%r11)
	movaps	%xmm14,-0x28(%r11)
	movaps	%xmm15,-0x18(%r11)
.Lxts_dec_body:
___
$code.=<<___;
	movups	($ivp),$inout0			# load clear-text tweak
	mov	240($key2),$rounds		# key2->rounds
	mov	240($key),$rnds_		# key1->rounds
___
	# generate the tweak
	&aesni_generate1("enc",$key2,$rounds,$inout0);
$code.=<<___;
	xor	%eax,%eax			# if ($len%16) len-=16;
	test	\$15,$len
	setnz	%al
	shl	\$4,%rax
	sub	%rax,$len

	$movkey	($key),$rndkey0			# zero round key
	mov	$key,$key_			# backup $key
	mov	$rnds_,$rounds			# backup $rounds
	shl	\$4,$rnds_
	mov	$len,$len_			# backup $len
	and	\$-16,$len

	$movkey	16($key,$rnds_),$rndkey1	# last round key

	movdqa	.Lxts_magic(%rip),$twmask	# mask applied to the broadcast carry bits
	movdqa	$inout0,@tweak[5]
	pshufd	\$0x5f,$inout0,$twres
	pxor	$rndkey0,$rndkey1
___
    # Emit tweak[0..3]: each is a copy of the running tweak (kept in
    # @tweak[5]) xored with round[0]; the running tweak is then doubled
    # (paddq) and the masked carry bits are xored back in.
    for ($i=0;$i<4;$i++) {
    $code.=<<___;
	movdqa	$twres,$twtmp
	paddd	$twres,$twres
	movdqa	@tweak[5],@tweak[$i]
	psrad	\$31,$twtmp			# broadcast upper bits
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	pxor	$rndkey0,@tweak[$i]
	pxor	$twtmp,@tweak[5]
___
    }
$code.=<<___;
	movdqa	@tweak[5],@tweak[4]
	psrad	\$31,$twres
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twres
	pxor	$rndkey0,@tweak[4]
	pxor	$twres,@tweak[5]
	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]

	sub	\$16*6,$len
	jc	.Lxts_dec_short			# if $len-=6*16 borrowed

	mov	\$16+96,$rounds
	lea	32($key_,$rnds_),$key		# end of key schedule
	sub	%r10,%rax			# twisted $rounds
	$movkey	16($key_),$rndkey1
	mov	%rax,%r10			# backup twisted $rounds
	lea	.Lxts_magic(%rip),%r8
	jmp	.Lxts_dec_grandloop

.align	32
.Lxts_dec_grandloop:
	# 6 blocks per iteration; the first AES rounds are interleaved with
	# loading the input and parking tweak^round[last] on the stack
	movdqu	`16*0`($inp),$inout0		# load input
	movdqa	$rndkey0,$twmask
	movdqu	`16*1`($inp),$inout1
	pxor	@tweak[0],$inout0		# input^=tweak^round[0]
	movdqu	`16*2`($inp),$inout2
	pxor	@tweak[1],$inout1
	aesdec	$rndkey1,$inout0
	movdqu	`16*3`($inp),$inout3
	pxor	@tweak[2],$inout2
	aesdec	$rndkey1,$inout1
	movdqu	`16*4`($inp),$inout4
	pxor	@tweak[3],$inout3
	aesdec	$rndkey1,$inout2
	movdqu	`16*5`($inp),$inout5
	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
	movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
	pxor	@tweak[4],$inout4
	aesdec	$rndkey1,$inout3
	$movkey	32($key_),$rndkey0
	lea	`16*6`($inp),$inp
	pxor	$twmask,$inout5

	pxor	$twres,@tweak[0]		# calculate tweaks^round[last]
	aesdec	$rndkey1,$inout4
	pxor	$twres,@tweak[1]
	movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^last round key
	aesdec	$rndkey1,$inout5
	$movkey	48($key_),$rndkey1
	pxor	$twres,@tweak[2]

	aesdec	$rndkey0,$inout0
	pxor	$twres,@tweak[3]
	movdqa	@tweak[1],`16*1`(%rsp)
	aesdec	$rndkey0,$inout1
	pxor	$twres,@tweak[4]
	movdqa	@tweak[2],`16*2`(%rsp)
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	pxor	$twres,$twmask
	movdqa	@tweak[4],`16*4`(%rsp)
	aesdec	$rndkey0,$inout4
	aesdec	$rndkey0,$inout5
	$movkey	64($key_),$rndkey0
	movdqa	$twmask,`16*5`(%rsp)
	pshufd	\$0x5f,@tweak[5],$twres
	jmp	.Lxts_dec_loop6
.align	32
.Lxts_dec_loop6:
	# two AES rounds per pass; %rax is advanced by 32 and the loop ends
	# when it reaches zero
	aesdec	$rndkey1,$inout0
	aesdec	$rndkey1,$inout1
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	aesdec	$rndkey1,$inout4
	aesdec	$rndkey1,$inout5
	$movkey	-64($key,%rax),$rndkey1
	add	\$32,%rax

	aesdec	$rndkey0,$inout0
	aesdec	$rndkey0,$inout1
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	aesdec	$rndkey0,$inout4
	aesdec	$rndkey0,$inout5
	$movkey	-80($key,%rax),$rndkey0
	jnz	.Lxts_dec_loop6

	movdqa	(%r8),$twmask			# start calculating next tweak
	movdqa	$twres,$twtmp
	paddd	$twres,$twres
	aesdec	$rndkey1,$inout0
	paddq	@tweak[5],@tweak[5]
	psrad	\$31,$twtmp
	aesdec	$rndkey1,$inout1
	pand	$twmask,$twtmp
	$movkey	($key_),@tweak[0]		# load round[0]
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	aesdec	$rndkey1,$inout4
	pxor	$twtmp,@tweak[5]
	movaps	@tweak[0],@tweak[1]		# copy round[0]
	aesdec	$rndkey1,$inout5
	$movkey	-64($key),$rndkey1

	movdqa	$twres,$twtmp
	aesdec	$rndkey0,$inout0
	paddd	$twres,$twres
	pxor	@tweak[5],@tweak[0]
	aesdec	$rndkey0,$inout1
	psrad	\$31,$twtmp
	paddq	@tweak[5],@tweak[5]
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	pand	$twmask,$twtmp
	movaps	@tweak[1],@tweak[2]
	aesdec	$rndkey0,$inout4
	pxor	$twtmp,@tweak[5]
	movdqa	$twres,$twtmp
	aesdec	$rndkey0,$inout5
	$movkey	-48($key),$rndkey0

	paddd	$twres,$twres
	aesdec	$rndkey1,$inout0
	pxor	@tweak[5],@tweak[1]
	psrad	\$31,$twtmp
	aesdec	$rndkey1,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	movdqa	@tweak[3],`16*3`(%rsp)
	pxor	$twtmp,@tweak[5]
	aesdec	$rndkey1,$inout4
	movaps	@tweak[2],@tweak[3]
	movdqa	$twres,$twtmp
	aesdec	$rndkey1,$inout5
	$movkey	-32($key),$rndkey1

	paddd	$twres,$twres
	aesdec	$rndkey0,$inout0
	pxor	@tweak[5],@tweak[2]
	psrad	\$31,$twtmp
	aesdec	$rndkey0,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	aesdec	$rndkey0,$inout4
	pxor	$twtmp,@tweak[5]
	movaps	@tweak[3],@tweak[4]
	aesdec	$rndkey0,$inout5

	movdqa	$twres,$rndkey0
	paddd	$twres,$twres
	aesdec	$rndkey1,$inout0
	pxor	@tweak[5],@tweak[3]
	psrad	\$31,$rndkey0
	aesdec	$rndkey1,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$rndkey0
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	pxor	$rndkey0,@tweak[5]
	$movkey	($key_),$rndkey0
	aesdec	$rndkey1,$inout4
	aesdec	$rndkey1,$inout5
	$movkey	16($key_),$rndkey1

	# last round: the operand is the tweak^round[last] value parked on
	# the stack, so the output tweak is folded into the final round
	pxor	@tweak[5],@tweak[4]
	aesdeclast	`16*0`(%rsp),$inout0
	psrad	\$31,$twres
	paddq	@tweak[5],@tweak[5]
	aesdeclast	`16*1`(%rsp),$inout1
	aesdeclast	`16*2`(%rsp),$inout2
	pand	$twmask,$twres
	mov	%r10,%rax			# restore $rounds
	aesdeclast	`16*3`(%rsp),$inout3
	aesdeclast	`16*4`(%rsp),$inout4
	aesdeclast	`16*5`(%rsp),$inout5
	pxor	$twres,@tweak[5]

	lea	`16*6`($out),$out		# $out+=6*16
	movups	$inout0,`-16*6`($out)		# store 6 output blocks
	movups	$inout1,`-16*5`($out)
	movups	$inout2,`-16*4`($out)
	movups	$inout3,`-16*3`($out)
	movups	$inout4,`-16*2`($out)
	movups	$inout5,`-16*1`($out)
	sub	\$16*6,$len
	jnc	.Lxts_dec_grandloop		# loop if $len-=6*16 didn't borrow

	mov	\$16+96,$rounds
	sub	$rnds_,$rounds
	mov	$key_,$key			# restore $key
	shr	\$4,$rounds			# restore original value

.Lxts_dec_short:
	# at this point @tweak[0..5] are populated with tweak values
	mov	$rounds,$rnds_			# backup $rounds
	pxor	$rndkey0,@tweak[0]
	pxor	$rndkey0,@tweak[1]
	add	\$16*6,$len			# restore real remaining $len
	jz	.Lxts_dec_done			# done if ($len==0)

	pxor	$rndkey0,@tweak[2]
	cmp	\$0x20,$len
	jb	.Lxts_dec_one			# $len is 1*16
	pxor	$rndkey0,@tweak[3]
	je	.Lxts_dec_two			# $len is 2*16

	pxor	$rndkey0,@tweak[4]
	cmp	\$0x40,$len
	jb	.Lxts_dec_three			# $len is 3*16
	je	.Lxts_dec_four			# $len is 4*16

	movdqu	($inp),$inout0			# $len is 5*16
	movdqu	16*1($inp),$inout1
	movdqu	16*2($inp),$inout2
	pxor	@tweak[0],$inout0
	movdqu	16*3($inp),$inout3
	pxor	@tweak[1],$inout1
	movdqu	16*4($inp),$inout4
	lea	16*5($inp),$inp			# $inp+=5*16
	pxor	@tweak[2],$inout2
	pxor	@tweak[3],$inout3
	pxor	@tweak[4],$inout4

	call	_aesni_decrypt6

	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	movdqu	$inout0,($out)			# store 5 output blocks
	xorps	@tweak[3],$inout3
	movdqu	$inout1,16*1($out)
	xorps	@tweak[4],$inout4
	movdqu	$inout2,16*2($out)
	pxor	$twtmp,$twtmp
	movdqu	$inout3,16*3($out)
	pcmpgtd	@tweak[5],$twtmp		# per-dword mask of set top bits in tweak[5]
	movdqu	$inout4,16*4($out)
	lea	16*5($out),$out			# $out+=5*16
	pshufd	\$0x13,$twtmp,@tweak[1]	# $twres
	and	\$15,$len_
	jz	.Lxts_dec_ret

	# a partial block follows: tweak[0] takes the current tweak and
	# tweak[1] the doubled one before entering the stealing code
	movdqa	@tweak[5],@tweak[0]
	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
	pand	$twmask,@tweak[1]		# isolate carry and residue
	pxor	@tweak[5],@tweak[1]
	jmp	.Lxts_dec_done2

.align	16
.Lxts_dec_one:
	movups	($inp),$inout0
	lea	16*1($inp),$inp			# $inp+=1*16
	xorps	@tweak[0],$inout0
___
	# single-block decrypt emitted inline
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movdqa	@tweak[1],@tweak[0]
	movups	$inout0,($out)			# store one output block
	movdqa	@tweak[2],@tweak[1]
	lea	16*1($out),$out			# $out+=1*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_two:
	movups	($inp),$inout0
	movups	16($inp),$inout1
	lea	32($inp),$inp			# $inp+=2*16
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1

	call	_aesni_decrypt2

	xorps	@tweak[0],$inout0
	movdqa	@tweak[2],@tweak[0]
	xorps	@tweak[1],$inout1
	movdqa	@tweak[3],@tweak[1]
	movups	$inout0,($out)			# store 2 output blocks
	movups	$inout1,16*1($out)
	lea	16*2($out),$out			# $out+=2*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_three:
	movups	($inp),$inout0
	movups	16*1($inp),$inout1
	movups	16*2($inp),$inout2
	lea	16*3($inp),$inp			# $inp+=3*16
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2

	call	_aesni_decrypt3

	xorps	@tweak[0],$inout0
	movdqa	@tweak[3],@tweak[0]
	xorps	@tweak[1],$inout1
	movdqa	@tweak[4],@tweak[1]
	xorps	@tweak[2],$inout2
	movups	$inout0,($out)			# store 3 output blocks
	movups	$inout1,16*1($out)
	movups	$inout2,16*2($out)
	lea	16*3($out),$out			# $out+=3*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_four:
	movups	($inp),$inout0
	movups	16*1($inp),$inout1
	movups	16*2($inp),$inout2
	xorps	@tweak[0],$inout0
	movups	16*3($inp),$inout3
	lea	16*4($inp),$inp			# $inp+=4*16
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	xorps	@tweak[3],$inout3

	call	_aesni_decrypt4

	pxor	@tweak[0],$inout0
	movdqa	@tweak[4],@tweak[0]
	pxor	@tweak[1],$inout1
	movdqa	@tweak[5],@tweak[1]
	pxor	@tweak[2],$inout2
	movdqu	$inout0,($out)			# store 4 output blocks
	pxor	@tweak[3],$inout3
	movdqu	$inout1,16*1($out)
	movdqu	$inout2,16*2($out)
	movdqu	$inout3,16*3($out)
	lea	16*4($out),$out			# $out+=4*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_done:
	and	\$15,$len_			# see if $len%16 is 0
	jz	.Lxts_dec_ret
.Lxts_dec_done2:
	mov	$len_,$len
	mov	$key_,$key			# restore $key
	mov	$rnds_,$rounds			# restore $rounds

	movups	($inp),$inout0
	xorps	@tweak[1],$inout0
___
	# decrypt the last whole input block under @tweak[1]
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[1],$inout0
	movups	$inout0,($out)

.Lxts_dec_steal:
	# byte-wise exchange, len%16 times: the input byte 16 past the
	# cursor goes into the block just written, and the byte it replaces
	# is moved 16 bytes further into the output
	movzb	16($inp),%eax			# borrow $rounds ...
	movzb	($out),%ecx			# ... and $key
	lea	1($inp),$inp
	mov	%al,($out)
	mov	%cl,16($out)
	lea	1($out),$out
	sub	\$1,$len
	jnz	.Lxts_dec_steal

	sub	$len_,$out			# rewind $out
	mov	$key_,$key			# restore $key
	mov	$rnds_,$rounds			# restore $rounds

	movups	($out),$inout0
	xorps	@tweak[0],$inout0
___
	# decrypt the block reassembled by the steal loop under @tweak[0]
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movups	$inout0,($out)

.Lxts_dec_ret:
	xorps	%xmm0,%xmm0			# clear register bank
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	movaps	%xmm0,0x00(%rsp)		# clear stack
	pxor	%xmm8,%xmm8
	movaps	%xmm0,0x10(%rsp)
	pxor	%xmm9,%xmm9
	movaps	%xmm0,0x20(%rsp)
	pxor	%xmm10,%xmm10
	movaps	%xmm0,0x30(%rsp)
	pxor	%xmm11,%xmm11
	movaps	%xmm0,0x40(%rsp)
	pxor	%xmm12,%xmm12
	movaps	%xmm0,0x50(%rsp)
	pxor	%xmm13,%xmm13
	movaps	%xmm0,0x60(%rsp)
	pxor	%xmm14,%xmm14
	pxor	%xmm15,%xmm15
___
# Win64: reload the saved xmm6-xmm15 and overwrite each save slot, then the
# 0x00-0x60 scratch area, with the zeroed %xmm0.
$code.=<<___ if ($win64);
	movaps	-0xa8(%r11),%xmm6
	movaps	%xmm0,-0xa8(%r11)		# clear stack
	movaps	-0x98(%r11),%xmm7
	movaps	%xmm0,-0x98(%r11)
	movaps	-0x88(%r11),%xmm8
	movaps	%xmm0,-0x88(%r11)
	movaps	-0x78(%r11),%xmm9
	movaps	%xmm0,-0x78(%r11)
	movaps	-0x68(%r11),%xmm10
	movaps	%xmm0,-0x68(%r11)
	movaps	-0x58(%r11),%xmm11
	movaps	%xmm0,-0x58(%r11)
	movaps	-0x48(%r11),%xmm12
	movaps	%xmm0,-0x48(%r11)
	movaps	-0x38(%r11),%xmm13
	movaps	%xmm0,-0x38(%r11)
	movaps	-0x28(%r11),%xmm14
	movaps	%xmm0,-0x28(%r11)
	movaps	-0x18(%r11),%xmm15
	movaps	%xmm0,-0x18(%r11)
	movaps	%xmm0,0x00(%rsp)
	movaps	%xmm0,0x10(%rsp)
	movaps	%xmm0,0x20(%rsp)
	movaps	%xmm0,0x30(%rsp)
	movaps	%xmm0,0x40(%rsp)
	movaps	%xmm0,0x50(%rsp)
	movaps	%xmm0,0x60(%rsp)
___
$code.=<<___;
	mov	-8(%r11),%rbp
.cfi_restore	%rbp
	lea	(%r11),%rsp
.cfi_def_cfa_register	%rsp
.Lxts_dec_epilogue:
	ret
.cfi_endproc
.size	aesni_xts_decrypt,.-aesni_xts_decrypt
___
}

######################################################################
# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
#	const AES_KEY *key, unsigned int start_block_num,
#	unsigned char offset_i[16], const unsigned char L_[][16],
#	unsigned char checksum[16]);
#
{
my @offset=map("%xmm$_",(10..15));
my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
my ($block_num,$offset_p)=("%r8","%r9");		# 5th and 6th arguments
my ($L_p,$checksum_p) = ("%rbx","%rbp");
my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
my $seventh_arg = $win64 ?
56 : 8;
my $blocks = $len;

# aesni_ocb_encrypt (prototype in the banner comment that opens this
# section).
#
# Flow of the code emitted below:
#  1. Push rbx, rbp, r12-r14 (Win64 additionally saves xmm6-xmm15) and
#     fetch the 7th/8th arguments relative to the entry stack pointer.
#  2. Precompute round[0]^round[last] and offset_i^round[last].
#  3. If the first block number is even, encrypt that single block with
#     __ocb_encrypt1 so that what follows starts on an odd block number.
#  4. .Locb_enc_grandloop: six blocks per iteration via __ocb_encrypt6.
#  5. .Locb_enc_short: 1..5 leftover blocks via __ocb_encrypt1,
#     __ocb_encrypt4 or __ocb_encrypt6 with unused slots zeroed; the offset
#     of the last real block is moved into @offset[5].
#  6. .Locb_enc_done: store checksum and last offset, zero the xmm
#     registers, restore saved registers and return.
#
# NOTE(review): $inp, $out, $len, $key, $key_, $rounds, $rnds_, $movkey and
# the $inoutN/$rndkeyN aliases are defined outside this section.
$code.=<<___;
.globl	aesni_ocb_encrypt
.type	aesni_ocb_encrypt,\@function,6
.align	32
aesni_ocb_encrypt:
.cfi_startproc
	lea	(%rsp),%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp),%rsp
	movaps	%xmm6,0x00(%rsp)		# offload everything
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,0x60(%rsp)
	movaps	%xmm13,0x70(%rsp)
	movaps	%xmm14,0x80(%rsp)
	movaps	%xmm15,0x90(%rsp)
.Locb_enc_body:
___
$code.=<<___;
	mov	$seventh_arg(%rax),$L_p		# 7th argument
	mov	$seventh_arg+8(%rax),$checksum_p# 8th argument

	mov	240($key),$rnds_
	mov	$key,$key_
	shl	\$4,$rnds_
	$movkey	($key),$rndkey0l		# round[0]
	$movkey	16($key,$rnds_),$rndkey1	# round[last]

	movdqu	($offset_p),@offset[5]		# load last offset_i
	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]

	mov	\$16+32,$rounds
	lea	32($key_,$rnds_),$key
	$movkey	16($key_),$rndkey1		# round[1]
	sub	%r10,%rax			# twisted $rounds
	mov	%rax,%r10			# backup twisted $rounds

	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
	movdqu	($checksum_p),$checksum		# load checksum

	test	\$1,$block_num			# is first block number odd?
	jnz	.Locb_enc_odd

	# even first block: handle it alone, table entry picked by bsf
	bsf	$block_num,$i1
	add	\$1,$block_num
	shl	\$4,$i1
	movdqu	($L_p,$i1),$inout5		# borrow
	movdqu	($inp),$inout0
	lea	16($inp),$inp

	call	__ocb_encrypt1

	movdqa	$inout5,@offset[5]
	movups	$inout0,($out)
	lea	16($out),$out
	sub	\$1,$blocks
	jz	.Locb_enc_done

.Locb_enc_odd:
	lea	1($block_num),$i1		# even-numbered blocks
	lea	3($block_num),$i3
	lea	5($block_num),$i5
	lea	6($block_num),$block_num
	bsf	$i1,$i1				# ntz(block)
	bsf	$i3,$i3
	bsf	$i5,$i5
	shl	\$4,$i1				# ntz(block) -> table offset
	shl	\$4,$i3
	shl	\$4,$i5

	sub	\$6,$blocks
	jc	.Locb_enc_short
	jmp	.Locb_enc_grandloop

.align	32
.Locb_enc_grandloop:
	movdqu	`16*0`($inp),$inout0		# load input
	movdqu	`16*1`($inp),$inout1
	movdqu	`16*2`($inp),$inout2
	movdqu	`16*3`($inp),$inout3
	movdqu	`16*4`($inp),$inout4
	movdqu	`16*5`($inp),$inout5
	lea	`16*6`($inp),$inp

	call	__ocb_encrypt6

	movups	$inout0,`16*0`($out)		# store output
	movups	$inout1,`16*1`($out)
	movups	$inout2,`16*2`($out)
	movups	$inout3,`16*3`($out)
	movups	$inout4,`16*4`($out)
	movups	$inout5,`16*5`($out)
	lea	`16*6`($out),$out
	sub	\$6,$blocks
	jnc	.Locb_enc_grandloop

.Locb_enc_short:
	add	\$6,$blocks
	jz	.Locb_enc_done

	# 1..5 blocks left: input loads are interleaved with the dispatch
	movdqu	`16*0`($inp),$inout0
	cmp	\$2,$blocks
	jb	.Locb_enc_one
	movdqu	`16*1`($inp),$inout1
	je	.Locb_enc_two

	movdqu	`16*2`($inp),$inout2
	cmp	\$4,$blocks
	jb	.Locb_enc_three
	movdqu	`16*3`($inp),$inout3
	je	.Locb_enc_four

	movdqu	`16*4`($inp),$inout4
	pxor	$inout5,$inout5			# five blocks: sixth slot is zero

	call	__ocb_encrypt6

	movdqa	@offset[4],@offset[5]		# offset of the fifth block becomes the last offset
	movups	$inout0,`16*0`($out)
	movups	$inout1,`16*1`($out)
	movups	$inout2,`16*2`($out)
	movups	$inout3,`16*3`($out)
	movups	$inout4,`16*4`($out)

	jmp	.Locb_enc_done

.align	16
.Locb_enc_one:
	movdqa	@offset[0],$inout5		# borrow

	call	__ocb_encrypt1

	movdqa	$inout5,@offset[5]
	movups	$inout0,`16*0`($out)
	jmp	.Locb_enc_done

.align	16
.Locb_enc_two:
	pxor	$inout2,$inout2
	pxor	$inout3,$inout3

	call	__ocb_encrypt4

	movdqa	@offset[1],@offset[5]
	movups	$inout0,`16*0`($out)
	movups	$inout1,`16*1`($out)

	jmp	.Locb_enc_done

.align	16
.Locb_enc_three:
	pxor	$inout3,$inout3

	call	__ocb_encrypt4

	movdqa	@offset[2],@offset[5]
	movups	$inout0,`16*0`($out)
	movups	$inout1,`16*1`($out)
	movups	$inout2,`16*2`($out)

	jmp	.Locb_enc_done

.align	16
.Locb_enc_four:
	call	__ocb_encrypt4

	movdqa	@offset[3],@offset[5]
	movups	$inout0,`16*0`($out)
	movups	$inout1,`16*1`($out)
	movups	$inout2,`16*2`($out)
	movups	$inout3,`16*3`($out)

.Locb_enc_done:
	pxor	$rndkey0,@offset[5]		# "remove" round[last]
	movdqu	$checksum,($checksum_p)		# store checksum
	movdqu	@offset[5],($offset_p)		# store last offset_i

	xorps	%xmm0,%xmm0			# clear register bank
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	pxor	%xmm8,%xmm8
	pxor	%xmm9,%xmm9
	pxor	%xmm10,%xmm10
	pxor	%xmm11,%xmm11
	pxor	%xmm12,%xmm12
	pxor	%xmm13,%xmm13
	pxor	%xmm14,%xmm14
	pxor	%xmm15,%xmm15
	lea	0x28(%rsp),%rax
.cfi_def_cfa	%rax,8
___
# Win64: reload the saved xmm6-xmm15 and overwrite each save slot with the
# zeroed %xmm0 before the frame is released.
$code.=<<___ if ($win64);
	movaps	0x00(%rsp),%xmm6
	movaps	%xmm0,0x00(%rsp)		# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	movaps	0x40(%rsp),%xmm10
	movaps	%xmm0,0x40(%rsp)
	movaps	0x50(%rsp),%xmm11
	movaps	%xmm0,0x50(%rsp)
	movaps	0x60(%rsp),%xmm12
	movaps	%xmm0,0x60(%rsp)
	movaps	0x70(%rsp),%xmm13
	movaps	%xmm0,0x70(%rsp)
	movaps	0x80(%rsp),%xmm14
	movaps	%xmm0,0x80(%rsp)
	movaps	0x90(%rsp),%xmm15
	movaps	%xmm0,0x90(%rsp)
	lea	0xa0+0x28(%rsp),%rax
.Locb_enc_pop:
___
$code.=<<___;
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Locb_enc_epilogue:
	ret
.cfi_endproc
.size	aesni_ocb_encrypt,.-aesni_ocb_encrypt

# Helper: encrypt six blocks in place.  Each input block is xored into the
# checksum register and then whitened with its offset.  Offsets are chained
# from the previous one, alternating L_0 with three table entries selected
# by the precomputed i1/i3/i5 indices.  The indices for the next call are
# recomputed here, interleaved with the AES rounds.
.type	__ocb_encrypt6,\@abi-omnipotent
.align	32
__ocb_encrypt6:
	pxor	$rndkey0l,@offset[5]		# offset_i ^ round[0]
	movdqu	($L_p,$i1),@offset[1]
	movdqa	@offset[0],@offset[2]
	movdqu	($L_p,$i3),@offset[3]
	movdqa	@offset[0],@offset[4]
	pxor	@offset[5],@offset[0]
	movdqu	($L_p,$i5),@offset[5]
	pxor	@offset[0],@offset[1]
	pxor	$inout0,$checksum		# accumulate checksum
	pxor	@offset[0],$inout0		# input ^ round[0] ^ offset_i
	pxor	@offset[1],@offset[2]
	pxor	$inout1,$checksum
	pxor	@offset[1],$inout1
	pxor	@offset[2],@offset[3]
	pxor	$inout2,$checksum
	pxor	@offset[2],$inout2
	pxor	@offset[3],@offset[4]
	pxor	$inout3,$checksum
	pxor	@offset[3],$inout3
	pxor	@offset[4],@offset[5]
	pxor	$inout4,$checksum
	pxor	@offset[4],$inout4
	pxor	$inout5,$checksum
	pxor	@offset[5],$inout5
	$movkey	32($key_),$rndkey0

	lea	1($block_num),$i1		# even-numbered blocks
	lea	3($block_num),$i3
	lea	5($block_num),$i5
	add	\$6,$block_num
	pxor	$rndkey0l,@offset[0]		# offset_i ^ round[last]
	bsf	$i1,$i1				# ntz(block)
	bsf	$i3,$i3
	bsf	$i5,$i5

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	pxor	$rndkey0l,@offset[1]
	pxor	$rndkey0l,@offset[2]
	aesenc	$rndkey1,$inout4
	pxor	$rndkey0l,@offset[3]
	pxor	$rndkey0l,@offset[4]
	aesenc	$rndkey1,$inout5
	$movkey	48($key_),$rndkey1
	pxor	$rndkey0l,@offset[5]

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	$movkey	64($key_),$rndkey0
	shl	\$4,$i1				# ntz(block) -> table offset
	shl	\$4,$i3
	jmp	.Locb_enc_loop6

.align	32
.Locb_enc_loop6:
	# two AES rounds per pass; ends when %rax reaches zero
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Locb_enc_loop6

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	$movkey	16($key_),$rndkey1
	shl	\$4,$i5

	# last round: offset_i ^ round[last] is used as the final round key
	aesenclast	@offset[0],$inout0
	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
	mov	%r10,%rax			# restore twisted rounds
	aesenclast	@offset[1],$inout1
	aesenclast	@offset[2],$inout2
	aesenclast	@offset[3],$inout3
	aesenclast	@offset[4],$inout4
	aesenclast	@offset[5],$inout5
	ret
.size	__ocb_encrypt6,.-__ocb_encrypt6

# Helper: four-block variant of the routine above.  It uses only the i1/i3
# table entries and does not recompute the indices or reload L_0.
.type	__ocb_encrypt4,\@abi-omnipotent
.align	32
__ocb_encrypt4:
	pxor	$rndkey0l,@offset[5]		# offset_i ^ round[0]
	movdqu	($L_p,$i1),@offset[1]
	movdqa	@offset[0],@offset[2]
	movdqu	($L_p,$i3),@offset[3]
	pxor	@offset[5],@offset[0]
	pxor	@offset[0],@offset[1]
	pxor	$inout0,$checksum		# accumulate checksum
	pxor	@offset[0],$inout0		# input ^ round[0] ^ offset_i
	pxor	@offset[1],@offset[2]
	pxor	$inout1,$checksum
	pxor	@offset[1],$inout1
	pxor	@offset[2],@offset[3]
	pxor	$inout2,$checksum
	pxor	@offset[2],$inout2
	pxor	$inout3,$checksum
	pxor	@offset[3],$inout3
	$movkey	32($key_),$rndkey0

	pxor	$rndkey0l,@offset[0]		# offset_i ^ round[last]
	pxor	$rndkey0l,@offset[1]
	pxor	$rndkey0l,@offset[2]
	pxor	$rndkey0l,@offset[3]

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	$movkey	48($key_),$rndkey1

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	$movkey	64($key_),$rndkey0
	jmp	.Locb_enc_loop4

.align	32
.Locb_enc_loop4:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Locb_enc_loop4

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	$movkey	16($key_),$rndkey1
	mov	%r10,%rax			# restore twisted rounds

	aesenclast	@offset[0],$inout0
	aesenclast	@offset[1],$inout1
	aesenclast	@offset[2],$inout2
	aesenclast	@offset[3],$inout3
	ret
.size	__ocb_encrypt4,.-__ocb_encrypt4

# Helper: single-block variant.  The caller passes the table entry for this
# block in the sixth in/out register ("borrow" at the call sites); on return
# that register holds the new offset ^ round[last].
.type	__ocb_encrypt1,\@abi-omnipotent
.align	32
__ocb_encrypt1:
	pxor	@offset[5],$inout5		# offset_i
	pxor	$rndkey0l,$inout5		# offset_i ^ round[0]
	pxor	$inout0,$checksum		# accumulate checksum
	pxor	$inout5,$inout0			# input ^ round[0] ^ offset_i
	$movkey	32($key_),$rndkey0

	aesenc	$rndkey1,$inout0
	$movkey	48($key_),$rndkey1
	pxor	$rndkey0l,$inout5		# offset_i ^ round[last]

	aesenc	$rndkey0,$inout0
	$movkey	64($key_),$rndkey0
	jmp	.Locb_enc_loop1

.align	32
.Locb_enc_loop1:
	aesenc	$rndkey1,$inout0
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax

	aesenc	$rndkey0,$inout0
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Locb_enc_loop1

	aesenc	$rndkey1,$inout0
	$movkey	16($key_),$rndkey1		# redundant in tail
	mov	%r10,%rax			# restore twisted rounds

	aesenclast	$inout5,$inout0
	ret
.size	__ocb_encrypt1,.-__ocb_encrypt1

.globl	aesni_ocb_decrypt
.type	aesni_ocb_decrypt,\@function,6
.align	32
aesni_ocb_decrypt:
.cfi_startproc
	lea	(%rsp),%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp),%rsp
	movaps	%xmm6,0x00(%rsp)		# offload everything
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,0x60(%rsp)
	movaps	%xmm13,0x70(%rsp)
	movaps	%xmm14,0x80(%rsp)
	movaps	%xmm15,0x90(%rsp)
.Locb_dec_body:
___
$code.=<<___;
	mov	$seventh_arg(%rax),$L_p		# 7th argument
	mov	$seventh_arg+8(%rax),$checksum_p# 8th argument

	mov	240($key),$rnds_
	mov	$key,$key_
	shl	\$4,$rnds_
	$movkey	($key),$rndkey0l		# round[0]
	$movkey	16($key,$rnds_),$rndkey1	# round[last]

	movdqu	($offset_p),@offset[5]		# load last offset_i
	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]

	mov	\$16+32,$rounds
	lea	32($key_,$rnds_),$key
	$movkey	16($key_),$rndkey1		# round[1]
	sub	%r10,%rax			# twisted $rounds
	mov	%rax,%r10			# backup twisted $rounds

	movdqu
($L_p),@offset[0] # L_0 for all odd-numbered blocks 3289 movdqu ($checksum_p),$checksum # load checksum 3290 3291 test \$1,$block_num # is first block number odd? 3292 jnz .Locb_dec_odd 3293 3294 bsf $block_num,$i1 3295 add \$1,$block_num 3296 shl \$4,$i1 3297 movdqu ($L_p,$i1),$inout5 # borrow 3298 movdqu ($inp),$inout0 3299 lea 16($inp),$inp 3300 3301 call __ocb_decrypt1 3302 3303 movdqa $inout5,@offset[5] 3304 movups $inout0,($out) 3305 xorps $inout0,$checksum # accumulate checksum 3306 lea 16($out),$out 3307 sub \$1,$blocks 3308 jz .Locb_dec_done 3309 3310.Locb_dec_odd: 3311 lea 1($block_num),$i1 # even-numbered blocks 3312 lea 3($block_num),$i3 3313 lea 5($block_num),$i5 3314 lea 6($block_num),$block_num 3315 bsf $i1,$i1 # ntz(block) 3316 bsf $i3,$i3 3317 bsf $i5,$i5 3318 shl \$4,$i1 # ntz(block) -> table offset 3319 shl \$4,$i3 3320 shl \$4,$i5 3321 3322 sub \$6,$blocks 3323 jc .Locb_dec_short 3324 jmp .Locb_dec_grandloop 3325 3326.align 32 3327.Locb_dec_grandloop: 3328 movdqu `16*0`($inp),$inout0 # load input 3329 movdqu `16*1`($inp),$inout1 3330 movdqu `16*2`($inp),$inout2 3331 movdqu `16*3`($inp),$inout3 3332 movdqu `16*4`($inp),$inout4 3333 movdqu `16*5`($inp),$inout5 3334 lea `16*6`($inp),$inp 3335 3336 call __ocb_decrypt6 3337 3338 movups $inout0,`16*0`($out) # store output 3339 pxor $inout0,$checksum # accumulate checksum 3340 movups $inout1,`16*1`($out) 3341 pxor $inout1,$checksum 3342 movups $inout2,`16*2`($out) 3343 pxor $inout2,$checksum 3344 movups $inout3,`16*3`($out) 3345 pxor $inout3,$checksum 3346 movups $inout4,`16*4`($out) 3347 pxor $inout4,$checksum 3348 movups $inout5,`16*5`($out) 3349 pxor $inout5,$checksum 3350 lea `16*6`($out),$out 3351 sub \$6,$blocks 3352 jnc .Locb_dec_grandloop 3353 3354.Locb_dec_short: 3355 add \$6,$blocks 3356 jz .Locb_dec_done 3357 3358 movdqu `16*0`($inp),$inout0 3359 cmp \$2,$blocks 3360 jb .Locb_dec_one 3361 movdqu `16*1`($inp),$inout1 3362 je .Locb_dec_two 3363 3364 movdqu `16*2`($inp),$inout2 3365 cmp 
\$4,$blocks 3366 jb .Locb_dec_three 3367 movdqu `16*3`($inp),$inout3 3368 je .Locb_dec_four 3369 3370 movdqu `16*4`($inp),$inout4 3371 pxor $inout5,$inout5 3372 3373 call __ocb_decrypt6 3374 3375 movdqa @offset[4],@offset[5] 3376 movups $inout0,`16*0`($out) # store output 3377 pxor $inout0,$checksum # accumulate checksum 3378 movups $inout1,`16*1`($out) 3379 pxor $inout1,$checksum 3380 movups $inout2,`16*2`($out) 3381 pxor $inout2,$checksum 3382 movups $inout3,`16*3`($out) 3383 pxor $inout3,$checksum 3384 movups $inout4,`16*4`($out) 3385 pxor $inout4,$checksum 3386 3387 jmp .Locb_dec_done 3388 3389.align 16 3390.Locb_dec_one: 3391 movdqa @offset[0],$inout5 # borrow 3392 3393 call __ocb_decrypt1 3394 3395 movdqa $inout5,@offset[5] 3396 movups $inout0,`16*0`($out) # store output 3397 xorps $inout0,$checksum # accumulate checksum 3398 jmp .Locb_dec_done 3399 3400.align 16 3401.Locb_dec_two: 3402 pxor $inout2,$inout2 3403 pxor $inout3,$inout3 3404 3405 call __ocb_decrypt4 3406 3407 movdqa @offset[1],@offset[5] 3408 movups $inout0,`16*0`($out) # store output 3409 xorps $inout0,$checksum # accumulate checksum 3410 movups $inout1,`16*1`($out) 3411 xorps $inout1,$checksum 3412 3413 jmp .Locb_dec_done 3414 3415.align 16 3416.Locb_dec_three: 3417 pxor $inout3,$inout3 3418 3419 call __ocb_decrypt4 3420 3421 movdqa @offset[2],@offset[5] 3422 movups $inout0,`16*0`($out) # store output 3423 xorps $inout0,$checksum # accumulate checksum 3424 movups $inout1,`16*1`($out) 3425 xorps $inout1,$checksum 3426 movups $inout2,`16*2`($out) 3427 xorps $inout2,$checksum 3428 3429 jmp .Locb_dec_done 3430 3431.align 16 3432.Locb_dec_four: 3433 call __ocb_decrypt4 3434 3435 movdqa @offset[3],@offset[5] 3436 movups $inout0,`16*0`($out) # store output 3437 pxor $inout0,$checksum # accumulate checksum 3438 movups $inout1,`16*1`($out) 3439 pxor $inout1,$checksum 3440 movups $inout2,`16*2`($out) 3441 pxor $inout2,$checksum 3442 movups $inout3,`16*3`($out) 3443 pxor $inout3,$checksum 3444 
.Locb_dec_done:
	pxor	$rndkey0,@offset[5]		# "remove" round[last]
	movdqu	$checksum,($checksum_p)		# store checksum
	movdqu	@offset[5],($offset_p)		# store last offset_i

	xorps	%xmm0,%xmm0			# clear register bank
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
# Non-Win64: %xmm6-%xmm15 are wiped as well; %rax is then set 0x28 bytes
# above %rsp, i.e. past the five 8-byte registers pushed in the prologue.
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	pxor	%xmm8,%xmm8
	pxor	%xmm9,%xmm9
	pxor	%xmm10,%xmm10
	pxor	%xmm11,%xmm11
	pxor	%xmm12,%xmm12
	pxor	%xmm13,%xmm13
	pxor	%xmm14,%xmm14
	pxor	%xmm15,%xmm15
	lea	0x28(%rsp),%rax
.cfi_def_cfa	%rax,8
___
# Win64: reload %xmm6-%xmm15 from the 0xa0-byte save area and overwrite
# each slot with the already-zeroed %xmm0.
$code.=<<___ if ($win64);
	movaps	0x00(%rsp),%xmm6
	movaps	%xmm0,0x00(%rsp)		# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	movaps	0x40(%rsp),%xmm10
	movaps	%xmm0,0x40(%rsp)
	movaps	0x50(%rsp),%xmm11
	movaps	%xmm0,0x50(%rsp)
	movaps	0x60(%rsp),%xmm12
	movaps	%xmm0,0x60(%rsp)
	movaps	0x70(%rsp),%xmm13
	movaps	%xmm0,0x70(%rsp)
	movaps	0x80(%rsp),%xmm14
	movaps	%xmm0,0x80(%rsp)
	movaps	0x90(%rsp),%xmm15
	movaps	%xmm0,0x90(%rsp)
	lea	0xa0+0x28(%rsp),%rax
.Locb_dec_pop:
___
# Common tail: reload the five pushed registers relative to %rax and
# restore %rsp from it.
$code.=<<___;
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Locb_dec_epilogue:
	ret
.cfi_endproc
.size	aesni_ocb_decrypt,.-aesni_ocb_decrypt

.type	__ocb_decrypt6,\@abi-omnipotent
.align	32
__ocb_decrypt6:
	pxor	$rndkey0l,@offset[5]		# offset_i ^ round[0]
	movdqu	($L_p,$i1),@offset[1]
	movdqa	@offset[0],@offset[2]
	movdqu	($L_p,$i3),@offset[3]
	movdqa	@offset[0],@offset[4]
	pxor	@offset[5],@offset[0]
	movdqu	($L_p,$i5),@offset[5]
	pxor	@offset[0],@offset[1]
	pxor	@offset[0],$inout0		# input ^ round[0] ^ offset_i
	pxor	@offset[1],@offset[2]
	pxor	@offset[1],$inout1
	pxor	@offset[2],@offset[3]
	pxor	@offset[2],$inout2
	pxor	@offset[3],@offset[4]
	pxor	@offset[3],$inout3
	pxor	@offset[4],@offset[5]
	pxor	@offset[4],$inout4
	pxor	@offset[5],$inout5
	$movkey	32($key_),$rndkey0

	# table indices for the next group of six are computed here,
	# interleaved with the cipher rounds of the current group
	lea	1($block_num),$i1		# even-numbered blocks
	lea	3($block_num),$i3
	lea	5($block_num),$i5
	add	\$6,$block_num
	pxor	$rndkey0l,@offset[0]		# offset_i ^ round[last]
	bsf	$i1,$i1				# ntz(block)
	bsf	$i3,$i3
	bsf	$i5,$i5

	aesdec	$rndkey1,$inout0
	aesdec	$rndkey1,$inout1
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	pxor	$rndkey0l,@offset[1]
	pxor	$rndkey0l,@offset[2]
	aesdec	$rndkey1,$inout4
	pxor	$rndkey0l,@offset[3]
	pxor	$rndkey0l,@offset[4]
	aesdec	$rndkey1,$inout5
	$movkey	48($key_),$rndkey1
	pxor	$rndkey0l,@offset[5]

	aesdec	$rndkey0,$inout0
	aesdec	$rndkey0,$inout1
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	aesdec	$rndkey0,$inout4
	aesdec	$rndkey0,$inout5
	$movkey	64($key_),$rndkey0
	shl	\$4,$i1				# ntz(block) -> table offset
	shl	\$4,$i3
	jmp	.Locb_dec_loop6

# Two rounds per pass over six blocks; rax is stepped by 32 and the
# loop is left once that addition produces zero.
.align	32
.Locb_dec_loop6:
	aesdec	$rndkey1,$inout0
	aesdec	$rndkey1,$inout1
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	aesdec	$rndkey1,$inout4
	aesdec	$rndkey1,$inout5
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax

	aesdec	$rndkey0,$inout0
	aesdec	$rndkey0,$inout1
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	aesdec	$rndkey0,$inout4
	aesdec	$rndkey0,$inout5
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Locb_dec_loop6

	aesdec	$rndkey1,$inout0
	aesdec	$rndkey1,$inout1
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	aesdec	$rndkey1,$inout4
	aesdec	$rndkey1,$inout5
	$movkey	16($key_),$rndkey1
	shl	\$4,$i5

	aesdeclast	@offset[0],$inout0
	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
	mov	%r10,%rax			# restore twisted rounds
	aesdeclast	@offset[1],$inout1
	aesdeclast	@offset[2],$inout2
	aesdeclast	@offset[3],$inout3
	aesdeclast	@offset[4],$inout4
	aesdeclast	@offset[5],$inout5
	ret
.size	__ocb_decrypt6,.-__ocb_decrypt6

# Four-block variant; unlike the six-block routine it does not
# recompute the table indices or reload L_0 before returning.
.type	__ocb_decrypt4,\@abi-omnipotent
.align	32
__ocb_decrypt4:
	pxor	$rndkey0l,@offset[5]		# offset_i ^ round[0]
	movdqu	($L_p,$i1),@offset[1]
	movdqa	@offset[0],@offset[2]
	movdqu	($L_p,$i3),@offset[3]
	pxor	@offset[5],@offset[0]
	pxor	@offset[0],@offset[1]
	pxor	@offset[0],$inout0		# input ^ round[0] ^ offset_i
	pxor	@offset[1],@offset[2]
	pxor	@offset[1],$inout1
	pxor	@offset[2],@offset[3]
	pxor	@offset[2],$inout2
	pxor	@offset[3],$inout3
	$movkey	32($key_),$rndkey0

	pxor	$rndkey0l,@offset[0]		# offset_i ^ round[last]
	pxor	$rndkey0l,@offset[1]
	pxor	$rndkey0l,@offset[2]
	pxor	$rndkey0l,@offset[3]

	aesdec	$rndkey1,$inout0
	aesdec	$rndkey1,$inout1
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	$movkey	48($key_),$rndkey1

	aesdec	$rndkey0,$inout0
	aesdec	$rndkey0,$inout1
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	$movkey	64($key_),$rndkey0
	jmp	.Locb_dec_loop4

.align	32
.Locb_dec_loop4:
	aesdec	$rndkey1,$inout0
	aesdec	$rndkey1,$inout1
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax

	aesdec	$rndkey0,$inout0
	aesdec	$rndkey0,$inout1
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Locb_dec_loop4

	aesdec	$rndkey1,$inout0
	aesdec	$rndkey1,$inout1
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	$movkey	16($key_),$rndkey1
	mov	%r10,%rax			# restore twisted rounds

	aesdeclast	@offset[0],$inout0
	aesdeclast	@offset[1],$inout1
	aesdeclast	@offset[2],$inout2
	aesdeclast	@offset[3],$inout3
	ret
.size	__ocb_decrypt4,.-__ocb_decrypt4

# One-block variant; inout5 is used as the working copy of the offset.
.type	__ocb_decrypt1,\@abi-omnipotent
.align	32
__ocb_decrypt1:
	pxor	@offset[5],$inout5		# offset_i
	pxor	$rndkey0l,$inout5		# offset_i ^ round[0]
	pxor	$inout5,$inout0			# input ^ round[0] ^ offset_i
	$movkey	32($key_),$rndkey0

	aesdec	$rndkey1,$inout0
	$movkey	48($key_),$rndkey1
	pxor	$rndkey0l,$inout5		# offset_i ^ round[last]

	aesdec	$rndkey0,$inout0
	$movkey	64($key_),$rndkey0
	jmp	.Locb_dec_loop1

.align	32
.Locb_dec_loop1:
	aesdec	$rndkey1,$inout0
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax

	aesdec	$rndkey0,$inout0
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Locb_dec_loop1

	aesdec	$rndkey1,$inout0
	$movkey	16($key_),$rndkey1		# redundant in tail
	mov	%r10,%rax			# restore twisted rounds

	aesdeclast	$inout5,$inout0
	ret
.size	__ocb_decrypt1,.-__ocb_decrypt1
___
} }}

########################################################################
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
#			    size_t length, const AES_KEY *key,
#			    unsigned char *ivp,const int enc);
{
# Stack frame for the bulk decrypt path: 0x10 bytes of scratch, plus
# 0xa0 bytes on Win64 for saving %xmm6-%xmm15.
my $frame_size = 0x10 + ($win64?0xa0:0);	# used in decrypt
# $iv and $in0..$in4 are mapped onto %xmm10..%xmm15.
my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));

$code.=<<___;
.globl	${PREFIX}_cbc_encrypt
.type	${PREFIX}_cbc_encrypt,\@function,6
.align	16
${PREFIX}_cbc_encrypt:
.cfi_startproc
	test	$len,$len		# check length
	jz	.Lcbc_ret

	mov	240($key),$rnds_	# key->rounds
	mov	$key,$key_		# backup $key
	test	%r9d,%r9d		# 6th argument
	jz	.Lcbc_decrypt
#--------------------------- CBC ENCRYPT ------------------------------#
	movups	($ivp),$inout0		# load iv as initial state
	mov	$rnds_,$rounds
	cmp	\$16,$len
	jb	.Lcbc_enc_tail
	sub	\$16,$len
	jmp	.Lcbc_enc_loop
.align	16
.Lcbc_enc_loop:
	movups	($inp),$inout1		# load input
	lea	16($inp),$inp
	#xorps	$inout1,$inout0
___
	# NOTE(review): aesni_generate1 is defined outside this chunk;
	# presumably it emits the single-block round sequence and, given
	# the extra argument, folds $inout1 into $inout0 - confirm.
	&aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
$code.=<<___;
	mov	$rnds_,$rounds		# restore $rounds
	mov	$key_,$key		# restore $key
	movups	$inout0,0($out)		# store output
	lea	16($out),$out
	sub	\$16,$len
	jnc	.Lcbc_enc_loop
	add	\$16,$len
	jnz	.Lcbc_enc_tail
	pxor	$rndkey0,$rndkey0	# clear register bank
	pxor	$rndkey1,$rndkey1
	movups	$inout0,($ivp)
	pxor	$inout0,$inout0
	pxor	$inout1,$inout1
	jmp	.Lcbc_ret

# Partial final block: copy the remaining bytes to the output buffer,
# zero-fill up to 16 bytes, then encrypt that block in place.
.Lcbc_enc_tail:
	mov	$len,%rcx	# zaps $key
	xchg	$inp,$out	# $inp is %rsi and $out is %rdi now
	.long	0x9066A4F3	# rep movsb
	mov	\$16,%ecx	# zero tail
	sub	$len,%rcx
	xor	%eax,%eax
	.long	0x9066AAF3	# rep stosb
	lea	-16(%rdi),%rdi	# rewind $out by 1 block
	mov	$rnds_,$rounds	# restore $rounds
	mov	%rdi,%rsi	# $inp and $out are the same
	mov	$key_,$key	# restore $key
	xor	$len,$len	# len=16
	jmp	.Lcbc_enc_loop	# one more spin
#--------------------------- CBC DECRYPT ------------------------------#
.align	16
.Lcbc_decrypt:
	cmp	\$16,$len
	jne	.Lcbc_decrypt_bulk

	# handle single block without allocating stack frame,
	# useful in ciphertext stealing mode
	movdqu	($inp),$inout0		# load input
	movdqu	($ivp),$inout1		# load iv
	movdqa	$inout0,$inout2		# future iv
___
	&aesni_generate1("dec",$key,$rnds_);
$code.=<<___;
	pxor	$rndkey0,$rndkey0	# clear register bank
	pxor	$rndkey1,$rndkey1
	movdqu	$inout2,($ivp)		# store iv
	xorps	$inout1,$inout0		# ^=iv
	pxor	$inout1,$inout1
	movups	$inout0,($out)		# store output
	pxor	$inout0,$inout0
	jmp	.Lcbc_ret
.align	16
.Lcbc_decrypt_bulk:
	lea	(%rsp),%r11		# frame pointer
.cfi_def_cfa_register	%r11
	push	%rbp
.cfi_push	%rbp
	sub	\$$frame_size,%rsp
	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
___
# Win64 only: save %xmm6-%xmm15 above the 0x10-byte scratch slot.
$code.=<<___ if ($win64);
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)
.Lcbc_decrypt_body:
___

# From here on both $inp_ and $key_ name %rbp (saved by the push above).
my $inp_=$key_="%rbp";			# reassign $key_

$code.=<<___;
	mov	$key,$key_		# [re-]backup $key [after reassignment]
	movups	($ivp),$iv
	mov	$rnds_,$rounds
	cmp	\$0x50,$len
	jbe	.Lcbc_dec_tail

	$movkey	($key),$rndkey0
	movdqu	0x00($inp),$inout0	# load input
	movdqu	0x10($inp),$inout1
	movdqa	$inout0,$in0
	movdqu	0x20($inp),$inout2
	movdqa	$inout1,$in1
	movdqu	0x30($inp),$inout3
	movdqa	$inout2,$in2
	movdqu	0x40($inp),$inout4
	movdqa	$inout3,$in3
	movdqu	0x50($inp),$inout5
	movdqa	$inout4,$in4
	mov	OPENSSL_ia32cap_P+4(%rip),%r9d
	cmp	\$0x70,$len
	jbe	.Lcbc_dec_six_or_seven

	and	\$`1<<26|1<<22`,%r9d	# isolate XSAVE+MOVBE
	sub	\$0x50,$len		# $len is biased by -5*16
	cmp	\$`1<<22`,%r9d		# check for MOVBE without XSAVE
	je	.Lcbc_dec_loop6_enter	# [which denotes Atom Silvermont]
	sub	\$0x20,$len		# $len is biased by -7*16
	lea	0x70($key),$key		# size optimization
	jmp	.Lcbc_dec_loop8_enter
.align	16
.Lcbc_dec_loop8:
	movups	$inout7,($out)
	lea	0x10($out),$out
.Lcbc_dec_loop8_enter:
	movdqu	0x60($inp),$inout6
	pxor	$rndkey0,$inout0
	movdqu	0x70($inp),$inout7
	pxor	$rndkey0,$inout1
	$movkey	0x10-0x70($key),$rndkey1
	pxor	$rndkey0,$inout2
	mov	\$-1,$inp_
	cmp	\$0x70,$len		# is there at least 0x60 bytes ahead?
	pxor	$rndkey0,$inout3
	pxor	$rndkey0,$inout4
	pxor	$rndkey0,$inout5
	pxor	$rndkey0,$inout6

	aesdec	$rndkey1,$inout0
	pxor	$rndkey0,$inout7
	$movkey	0x20-0x70($key),$rndkey0
	aesdec	$rndkey1,$inout1
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	aesdec	$rndkey1,$inout4
	aesdec	$rndkey1,$inout5
	aesdec	$rndkey1,$inout6
	adc	\$0,$inp_
	and	\$128,$inp_
	aesdec	$rndkey1,$inout7
	add	$inp,$inp_
	$movkey	0x30-0x70($key),$rndkey1
___
# Emit eleven further rounds over the eight blocks, alternating between
# $rndkey0 (odd $i) and $rndkey1 (even $i) and loading the following
# round key after each one.  A compare of $rounds against 11 is emitted
# ahead of the $i==7 round; exits to .Lcbc_dec_done are emitted after
# $i==7 (jb), $i==9 (je) and $i==11 (unconditional jmp).
for($i=1;$i<12;$i++) {
my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
$code.=<<___	if ($i==7);
	cmp	\$11,$rounds
___
$code.=<<___;
	aesdec	$rndkeyx,$inout0
	aesdec	$rndkeyx,$inout1
	aesdec	$rndkeyx,$inout2
	aesdec	$rndkeyx,$inout3
	aesdec	$rndkeyx,$inout4
	aesdec	$rndkeyx,$inout5
	aesdec	$rndkeyx,$inout6
	aesdec	$rndkeyx,$inout7
	$movkey	`0x30+0x10*$i`-0x70($key),$rndkeyx
___
$code.=<<___	if ($i<6 || (!($i&1) && $i>7));
	nop
___
$code.=<<___	if ($i==7);
	jb	.Lcbc_dec_done
___
$code.=<<___	if ($i==9);
	je	.Lcbc_dec_done
___
$code.=<<___	if ($i==11);
	jmp	.Lcbc_dec_done
___
}
$code.=<<___;
.align	16
.Lcbc_dec_done:
	aesdec	$rndkey1,$inout0
	aesdec	$rndkey1,$inout1
	pxor	$rndkey0,$iv
	pxor	$rndkey0,$in0
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	pxor	$rndkey0,$in1
	pxor	$rndkey0,$in2
	aesdec	$rndkey1,$inout4
	aesdec	$rndkey1,$inout5
	pxor	$rndkey0,$in3
	pxor	$rndkey0,$in4
	aesdec	$rndkey1,$inout6
	aesdec	$rndkey1,$inout7
	movdqu	0x50($inp),$rndkey1

	aesdeclast	$iv,$inout0
	movdqu	0x60($inp),$iv		# borrow $iv
	pxor	$rndkey0,$rndkey1
	aesdeclast	$in0,$inout1
	pxor	$rndkey0,$iv
	movdqu	0x70($inp),$rndkey0	# next IV
	aesdeclast	$in1,$inout2
	lea	0x80($inp),$inp
	movdqu	0x00($inp_),$in0
	aesdeclast	$in2,$inout3
	aesdeclast	$in3,$inout4
	movdqu	0x10($inp_),$in1
	movdqu	0x20($inp_),$in2
	aesdeclast	$in4,$inout5
	aesdeclast	$rndkey1,$inout6
	movdqu	0x30($inp_),$in3
	movdqu	0x40($inp_),$in4
	aesdeclast	$iv,$inout7
	movdqa	$rndkey0,$iv		# return $iv
	movdqu	0x50($inp_),$rndkey1
	$movkey	-0x70($key),$rndkey0

	movups	$inout0,($out)		# store output
	movdqa	$in0,$inout0
	movups	$inout1,0x10($out)
	movdqa	$in1,$inout1
	movups	$inout2,0x20($out)
	movdqa	$in2,$inout2
	movups	$inout3,0x30($out)
	movdqa	$in3,$inout3
	movups	$inout4,0x40($out)
	movdqa	$in4,$inout4
	movups	$inout5,0x50($out)
	movdqa	$rndkey1,$inout5
	movups	$inout6,0x60($out)
	lea	0x70($out),$out

	sub	\$0x80,$len
	ja	.Lcbc_dec_loop8

	movaps	$inout7,$inout0
	lea	-0x70($key),$key
	add	\$0x70,$len
	jle	.Lcbc_dec_clear_tail_collected
	movups	$inout7,($out)
	lea	0x10($out),$out
	cmp	\$0x50,$len
	jbe	.Lcbc_dec_tail

	movaps	$in0,$inout0
.Lcbc_dec_six_or_seven:
	cmp	\$0x60,$len
	ja	.Lcbc_dec_seven

	movaps	$inout5,$inout6
	call	_aesni_decrypt6
	pxor	$iv,$inout0		# ^= IV
	movaps	$inout6,$iv
	pxor	$in0,$inout1
	movdqu	$inout0,($out)
	pxor	$in1,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$inout1,$inout1		# clear register bank
	pxor	$in2,$inout3
	movdqu	$inout2,0x20($out)
	pxor	$inout2,$inout2
	pxor	$in3,$inout4
	movdqu	$inout3,0x30($out)
	pxor	$inout3,$inout3
	pxor	$in4,$inout5
	movdqu	$inout4,0x40($out)
	pxor	$inout4,$inout4
	lea	0x50($out),$out
	movdqa	$inout5,$inout0
	pxor	$inout5,$inout5
	jmp	.Lcbc_dec_tail_collected

.align	16
.Lcbc_dec_seven:
	movups	0x60($inp),$inout6
	xorps	$inout7,$inout7
	call	_aesni_decrypt8
	movups	0x50($inp),$inout7
	pxor	$iv,$inout0		# ^= IV
	movups	0x60($inp),$iv
	pxor	$in0,$inout1
	movdqu	$inout0,($out)
	pxor	$in1,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$inout1,$inout1		# clear register bank
	pxor	$in2,$inout3
	movdqu	$inout2,0x20($out)
	pxor	$inout2,$inout2
	pxor	$in3,$inout4
	movdqu	$inout3,0x30($out)
	pxor	$inout3,$inout3
	pxor	$in4,$inout5
	movdqu	$inout4,0x40($out)
	pxor	$inout4,$inout4
	pxor	$inout7,$inout6
	movdqu	$inout5,0x50($out)
	pxor	$inout5,$inout5
	lea	0x60($out),$out
	movdqa	$inout6,$inout0
	pxor	$inout6,$inout6
	pxor	$inout7,$inout7
	jmp	.Lcbc_dec_tail_collected

.align	16
.Lcbc_dec_loop6:
	movups	$inout5,($out)
	lea	0x10($out),$out
	movdqu	0x00($inp),$inout0	# load input
	movdqu	0x10($inp),$inout1
	movdqa	$inout0,$in0
	movdqu	0x20($inp),$inout2
	movdqa	$inout1,$in1
	movdqu	0x30($inp),$inout3
	movdqa	$inout2,$in2
	movdqu	0x40($inp),$inout4
	movdqa	$inout3,$in3
	movdqu	0x50($inp),$inout5
	movdqa	$inout4,$in4
.Lcbc_dec_loop6_enter:
	lea	0x60($inp),$inp
	movdqa	$inout5,$inout6

	call	_aesni_decrypt6

	pxor	$iv,$inout0		# ^= IV
	movdqa	$inout6,$iv
	pxor	$in0,$inout1
	movdqu	$inout0,($out)
	pxor	$in1,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$in2,$inout3
	movdqu	$inout2,0x20($out)
	pxor	$in3,$inout4
	mov	$key_,$key
	movdqu	$inout3,0x30($out)
	pxor	$in4,$inout5
	mov	$rnds_,$rounds
	movdqu	$inout4,0x40($out)
	lea	0x50($out),$out
	sub	\$0x60,$len
	ja	.Lcbc_dec_loop6

	movdqa	$inout5,$inout0
	add	\$0x50,$len
	jle	.Lcbc_dec_clear_tail_collected
	movups	$inout5,($out)
	lea	0x10($out),$out

.Lcbc_dec_tail:
	movups	($inp),$inout0
	sub	\$0x10,$len
	jbe	.Lcbc_dec_one		# $len is 1*16 or less

	movups	0x10($inp),$inout1
	movaps	$inout0,$in0
	sub	\$0x10,$len
	jbe	.Lcbc_dec_two		# $len is 2*16 or less

	movups	0x20($inp),$inout2
	movaps	$inout1,$in1
	sub	\$0x10,$len
	jbe	.Lcbc_dec_three		# $len is 3*16 or less

	movups	0x30($inp),$inout3
	movaps	$inout2,$in2
	sub	\$0x10,$len
	jbe	.Lcbc_dec_four		# $len is 4*16 or less

	movups	0x40($inp),$inout4	# $len is 5*16 or less
	movaps	$inout3,$in3
	movaps	$inout4,$in4
	xorps	$inout5,$inout5
	call	_aesni_decrypt6
	pxor	$iv,$inout0
	movaps	$in4,$iv
	pxor	$in0,$inout1
	movdqu	$inout0,($out)
	pxor	$in1,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$inout1,$inout1		# clear register bank
	pxor	$in2,$inout3
	movdqu	$inout2,0x20($out)
	pxor	$inout2,$inout2
	pxor	$in3,$inout4
	movdqu	$inout3,0x30($out)
	pxor	$inout3,$inout3
	lea	0x40($out),$out
	movdqa	$inout4,$inout0
	pxor	$inout4,$inout4
	pxor	$inout5,$inout5
	sub	\$0x10,$len
	jmp	.Lcbc_dec_tail_collected

.align	16
.Lcbc_dec_one:
	movaps	$inout0,$in0
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	$iv,$inout0
	movaps	$in0,$iv
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_two:
	movaps	$inout1,$in1
	call	_aesni_decrypt2
	pxor	$iv,$inout0
	movaps	$in1,$iv
	pxor	$in0,$inout1
	movdqu	$inout0,($out)
	movdqa	$inout1,$inout0
	pxor	$inout1,$inout1		# clear register bank
	lea	0x10($out),$out
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_three:
	movaps	$inout2,$in2
	call	_aesni_decrypt3
	pxor	$iv,$inout0
	movaps	$in2,$iv
	pxor	$in0,$inout1
	movdqu	$inout0,($out)
	pxor	$in1,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$inout1,$inout1		# clear register bank
	movdqa	$inout2,$inout0
	pxor	$inout2,$inout2
	lea	0x20($out),$out
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_four:
	movaps	$inout3,$in3
	call	_aesni_decrypt4
	pxor	$iv,$inout0
	movaps	$in3,$iv
	pxor	$in0,$inout1
	movdqu	$inout0,($out)
	pxor	$in1,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$inout1,$inout1		# clear register bank
	pxor	$in2,$inout3
	movdqu	$inout2,0x20($out)
	pxor	$inout2,$inout2
	movdqa	$inout3,$inout0
	pxor	$inout3,$inout3
	lea	0x30($out),$out
	jmp	.Lcbc_dec_tail_collected

.align	16
.Lcbc_dec_clear_tail_collected:
	pxor	$inout1,$inout1		# clear register bank
	pxor	$inout2,$inout2
	pxor	$inout3,$inout3
___
$code.=<<___ if (!$win64);
	pxor	$inout4,$inout4		# %xmm6..9
	pxor	$inout5,$inout5
	pxor	$inout6,$inout6
	pxor	$inout7,$inout7
___
$code.=<<___;
.Lcbc_dec_tail_collected:
	movups	$iv,($ivp)
	and	\$15,$len
	jnz	.Lcbc_dec_tail_partial
	movups	$inout0,($out)
	pxor	$inout0,$inout0
	jmp	.Lcbc_dec_ret
.align	16
.Lcbc_dec_tail_partial:
	# the last block goes through the stack scratch slot, which is
	# overwritten with zeros after the copy
	movaps	$inout0,(%rsp)
	pxor	$inout0,$inout0
	mov	\$16,%rcx
	mov	$out,%rdi
	sub	$len,%rcx
	lea	(%rsp),%rsi
	.long	0x9066A4F3		# rep movsb
	movdqa	$inout0,(%rsp)

.Lcbc_dec_ret:
	xorps	$rndkey0,$rndkey0	# %xmm0
	pxor	$rndkey1,$rndkey1
___
# Win64: reload %xmm6-%xmm15 and overwrite each save slot with the
# zeroed %xmm0.
$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	%xmm0,0x10(%rsp)	# clear stack
	movaps	0x20(%rsp),%xmm7
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm8
	movaps	%xmm0,0x30(%rsp)
	movaps	0x40(%rsp),%xmm9
	movaps	%xmm0,0x40(%rsp)
	movaps	0x50(%rsp),%xmm10
	movaps	%xmm0,0x50(%rsp)
	movaps	0x60(%rsp),%xmm11
	movaps	%xmm0,0x60(%rsp)
	movaps	0x70(%rsp),%xmm12
	movaps	%xmm0,0x70(%rsp)
	movaps	0x80(%rsp),%xmm13
	movaps	%xmm0,0x80(%rsp)
	movaps	0x90(%rsp),%xmm14
	movaps	%xmm0,0x90(%rsp)
	movaps	0xa0(%rsp),%xmm15
	movaps	%xmm0,0xa0(%rsp)
___
$code.=<<___;
	mov	-8(%r11),%rbp
.cfi_restore	%rbp
	lea	(%r11),%rsp
.cfi_def_cfa_register	%rsp
.Lcbc_ret:
	ret
.cfi_endproc
.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
___
}
# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
#				int bits, AES_KEY *key)
#
# input:	$inp	user-supplied key
#		$bits	$inp length in bits
#		$key	pointer to key schedule
# output:	%eax	0 denoting success, -1 or -2 - failure (see C)
#		*$key	key schedule
#
# The encryption schedule is produced first by calling
# __aesni_set_encrypt_key; on success the round keys are then reversed
# in place, with aesimc applied to every key except the first and last.
#
{ my ($inp,$bits,$key) = @_4args;
  $bits =~ s/%r/%e/;

$code.=<<___;
.globl	${PREFIX}_set_decrypt_key
.type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
.align	16
${PREFIX}_set_decrypt_key:
.cfi_startproc
	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
.cfi_adjust_cfa_offset	8
	call	__aesni_set_encrypt_key
	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
	test	%eax,%eax
	jnz	.Ldec_key_ret
	lea	16($key,$bits),$inp	# points at the end of key schedule

	$movkey	($key),%xmm0		# just swap
	$movkey	($inp),%xmm1
	$movkey	%xmm0,($inp)
	$movkey	%xmm1,($key)
	lea	16($key),$key
	lea	-16($inp),$inp

	# walk inwards from both ends until the two pointers meet
.Ldec_key_inverse:
	$movkey	($key),%xmm0		# swap and inverse
	$movkey	($inp),%xmm1
	aesimc	%xmm0,%xmm0
	aesimc	%xmm1,%xmm1
	lea	16($key),$key
	lea	-16($inp),$inp
	$movkey	%xmm0,16($inp)
	$movkey	%xmm1,-16($key)
	cmp	$key,$inp
	ja	.Ldec_key_inverse

	$movkey	($key),%xmm0		# inverse middle
	aesimc	%xmm0,%xmm0
	pxor	%xmm1,%xmm1
	$movkey	%xmm0,($inp)
	pxor	%xmm0,%xmm0
.Ldec_key_ret:
	add	\$8,%rsp
.cfi_adjust_cfa_offset	-8
	ret
.cfi_endproc
.LSEH_end_set_decrypt_key:
.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
___

# This is based on submission from Intel by
#	Huang Ying
#	Vinodh Gopal
#	Kahraman Akdemir
#
# Aggressively optimized in respect to aeskeygenassist's critical path
# and is contained in %xmm0-5 to meet Win64 ABI requirement.
#
# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
#				int bits, AES_KEY * const key);
#
# input:	$inp	user-supplied key
#		$bits	$inp length in bits
#		$key	pointer to key schedule
# output:	%eax	0 denoting success, -1 or -2 - failure (see C)
#		$bits	rounds-1 (used in aesni_set_decrypt_key)
#		*$key	key schedule
#		$key	pointer to key schedule (used in
#			aesni_set_decrypt_key)
#
# Subroutine is frame-less, which means that only volatile registers
# are used. Note that it's declared "abi-omnipotent", which means that
# amount of volatile registers is smaller on Windows.
#
$code.=<<___;
.globl	${PREFIX}_set_encrypt_key
.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
.align	16
${PREFIX}_set_encrypt_key:
__aesni_set_encrypt_key:
.cfi_startproc
	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
.cfi_adjust_cfa_offset	8
	mov	\$-1,%rax		# -1 is returned for a NULL inp or key
	test	$inp,$inp
	jz	.Lenc_key_ret
	test	$key,$key
	jz	.Lenc_key_ret

	mov	\$`1<<28|1<<11`,%r10d	# AVX and XOP bits
	movups	($inp),%xmm0		# pull first 128 bits of *userKey
	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
	and	OPENSSL_ia32cap_P+4(%rip),%r10d
	lea	16($key),%rax		# %rax is used as modifiable copy of $key
	cmp	\$256,$bits
	je	.L14rounds
	cmp	\$192,$bits
	je	.L12rounds
	cmp	\$128,$bits
	jne	.Lbad_keybits

.L10rounds:
	mov	\$9,$bits			# 10 rounds for 128-bit key
	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
	je	.L10rounds_alt

	$movkey	%xmm0,($key)			# round 0
	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
	call		.Lkey_expansion_128_cold
	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 2
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 3
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 4
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 5
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 6
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x40,%xmm0,%xmm1	# round 7
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x80,%xmm0,%xmm1	# round 8
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x1b,%xmm0,%xmm1	# round 9
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x36,%xmm0,%xmm1	# round 10
	call		.Lkey_expansion_128
	$movkey	%xmm0,(%rax)
	mov	$bits,80(%rax)	# 240(%rdx)
	xor	%eax,%eax
	jmp	.Lenc_key_ret

# Alternative 128-bit schedule, taken when the capability mask above
# shows AVX without XOP: eight loop passes plus two unrolled steps.
.align	16
.L10rounds_alt:
	movdqa	.Lkey_rotate(%rip),%xmm5
	mov	\$8,%r10d
	movdqa	.Lkey_rcon1(%rip),%xmm4
	movdqa	%xmm0,%xmm2
	movdqu	%xmm0,($key)
	jmp	.Loop_key128

.align	16
.Loop_key128:
	pshufb		%xmm5,%xmm0
	aesenclast	%xmm4,%xmm0
	pslld		\$1,%xmm4
	lea		16(%rax),%rax

	movdqa		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm3,%xmm2

	pxor		%xmm2,%xmm0
	movdqu		%xmm0,-16(%rax)
	movdqa		%xmm0,%xmm2

	dec	%r10d
	jnz	.Loop_key128

	movdqa		.Lkey_rcon1b(%rip),%xmm4

	pshufb		%xmm5,%xmm0
	aesenclast	%xmm4,%xmm0
	pslld		\$1,%xmm4

	movdqa		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm3,%xmm2

	pxor		%xmm2,%xmm0
	movdqu		%xmm0,(%rax)

	movdqa		%xmm0,%xmm2
	pshufb		%xmm5,%xmm0
	aesenclast	%xmm4,%xmm0

	movdqa		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm3,%xmm2

	pxor		%xmm2,%xmm0
	movdqu		%xmm0,16(%rax)

	mov	$bits,96(%rax)	# 240($key)
	xor	%eax,%eax
	jmp	.Lenc_key_ret

.align	16
.L12rounds:
	movq	16($inp),%xmm2			# remaining 1/3 of *userKey
	mov	\$11,$bits			# 12 rounds for 192
	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
	je	.L12rounds_alt

	$movkey	%xmm0,($key)			# round 0
	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 1,2
	call		.Lkey_expansion_192a_cold
	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 2,3
	call		.Lkey_expansion_192b
	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 4,5
	call		.Lkey_expansion_192a
	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 5,6
	call		.Lkey_expansion_192b
	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 7,8
	call		.Lkey_expansion_192a
	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 8,9
	call		.Lkey_expansion_192b
	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 10,11
	call		.Lkey_expansion_192a
	aeskeygenassist	\$0x80,%xmm2,%xmm1	# round 11,12
	call		.Lkey_expansion_192b
	$movkey	%xmm0,(%rax)
	mov	$bits,48(%rax)	# 240(%rdx)
	xor	%rax, %rax
	jmp	.Lenc_key_ret

# Alternative 192-bit schedule: eight passes, each advancing the
# output pointer by 24 bytes.
.align	16
.L12rounds_alt:
	movdqa	.Lkey_rotate192(%rip),%xmm5
	movdqa	.Lkey_rcon1(%rip),%xmm4
	mov	\$8,%r10d
	movdqu	%xmm0,($key)
	jmp	.Loop_key192

.align	16
.Loop_key192:
	movq		%xmm2,0(%rax)
	movdqa		%xmm2,%xmm1
	pshufb		%xmm5,%xmm2
	aesenclast	%xmm4,%xmm2
	pslld		\$1, %xmm4
	lea		24(%rax),%rax

	movdqa		%xmm0,%xmm3
	pslldq		\$4,%xmm0
	pxor		%xmm0,%xmm3
	pslldq		\$4,%xmm0
	pxor		%xmm0,%xmm3
	pslldq		\$4,%xmm0
	pxor		%xmm3,%xmm0

	pshufd		\$0xff,%xmm0,%xmm3
	pxor		%xmm1,%xmm3
	pslldq		\$4,%xmm1
	pxor		%xmm1,%xmm3

	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm2
	movdqu		%xmm0,-16(%rax)

	dec	%r10d
	jnz	.Loop_key192

	mov	$bits,32(%rax)	# 240($key)
	xor	%eax,%eax
	jmp	.Lenc_key_ret

.align	16
.L14rounds:
	movups	16($inp),%xmm2			# remaining half of *userKey
	mov	\$13,$bits			# 14 rounds for 256
	lea	16(%rax),%rax
	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
	je	.L14rounds_alt

	$movkey	%xmm0,($key)			# round 0
	$movkey	%xmm2,16($key)			# round 1
4542 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2 4543 call .Lkey_expansion_256a_cold 4544 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3 4545 call .Lkey_expansion_256b 4546 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4 4547 call .Lkey_expansion_256a 4548 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5 4549 call .Lkey_expansion_256b 4550 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6 4551 call .Lkey_expansion_256a 4552 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7 4553 call .Lkey_expansion_256b 4554 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8 4555 call .Lkey_expansion_256a 4556 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9 4557 call .Lkey_expansion_256b 4558 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10 4559 call .Lkey_expansion_256a 4560 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11 4561 call .Lkey_expansion_256b 4562 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12 4563 call .Lkey_expansion_256a 4564 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13 4565 call .Lkey_expansion_256b 4566 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14 4567 call .Lkey_expansion_256a 4568 $movkey %xmm0,(%rax) 4569 mov $bits,16(%rax) # 240(%rdx) 4570 xor %rax,%rax 4571 jmp .Lenc_key_ret 4572 4573.align 16 4574.L14rounds_alt: 4575 movdqa .Lkey_rotate(%rip),%xmm5 4576 movdqa .Lkey_rcon1(%rip),%xmm4 4577 mov \$7,%r10d 4578 movdqu %xmm0,0($key) 4579 movdqa %xmm2,%xmm1 4580 movdqu %xmm2,16($key) 4581 jmp .Loop_key256 4582 4583.align 16 4584.Loop_key256: 4585 pshufb %xmm5,%xmm2 4586 aesenclast %xmm4,%xmm2 4587 4588 movdqa %xmm0,%xmm3 4589 pslldq \$4,%xmm0 4590 pxor %xmm0,%xmm3 4591 pslldq \$4,%xmm0 4592 pxor %xmm0,%xmm3 4593 pslldq \$4,%xmm0 4594 pxor %xmm3,%xmm0 4595 pslld \$1,%xmm4 4596 4597 pxor %xmm2,%xmm0 4598 movdqu %xmm0,(%rax) 4599 4600 dec %r10d 4601 jz .Ldone_key256 4602 4603 pshufd \$0xff,%xmm0,%xmm2 4604 pxor %xmm3,%xmm3 4605 aesenclast %xmm3,%xmm2 4606 4607 movdqa %xmm1,%xmm3 4608 pslldq \$4,%xmm1 4609 pxor %xmm1,%xmm3 4610 pslldq \$4,%xmm1 4611 pxor %xmm1,%xmm3 4612 pslldq \$4,%xmm1 4613 pxor 
%xmm3,%xmm1 4614 4615 pxor %xmm1,%xmm2 4616 movdqu %xmm2,16(%rax) 4617 lea 32(%rax),%rax 4618 movdqa %xmm2,%xmm1 4619 4620 jmp .Loop_key256 4621 4622.Ldone_key256: 4623 mov $bits,16(%rax) # 240($key) 4624 xor %eax,%eax 4625 jmp .Lenc_key_ret 4626 4627.align 16 4628.Lbad_keybits: 4629 mov \$-2,%rax 4630.Lenc_key_ret: 4631 pxor %xmm0,%xmm0 4632 pxor %xmm1,%xmm1 4633 pxor %xmm2,%xmm2 4634 pxor %xmm3,%xmm3 4635 pxor %xmm4,%xmm4 4636 pxor %xmm5,%xmm5 4637 add \$8,%rsp 4638.cfi_adjust_cfa_offset -8 4639 ret 4640.cfi_endproc 4641.LSEH_end_set_encrypt_key: 4642 4643.align 16 4644.Lkey_expansion_128: 4645 $movkey %xmm0,(%rax) 4646 lea 16(%rax),%rax 4647.Lkey_expansion_128_cold: 4648 shufps \$0b00010000,%xmm0,%xmm4 4649 xorps %xmm4, %xmm0 4650 shufps \$0b10001100,%xmm0,%xmm4 4651 xorps %xmm4, %xmm0 4652 shufps \$0b11111111,%xmm1,%xmm1 # critical path 4653 xorps %xmm1,%xmm0 4654 ret 4655 4656.align 16 4657.Lkey_expansion_192a: 4658 $movkey %xmm0,(%rax) 4659 lea 16(%rax),%rax 4660.Lkey_expansion_192a_cold: 4661 movaps %xmm2, %xmm5 4662.Lkey_expansion_192b_warm: 4663 shufps \$0b00010000,%xmm0,%xmm4 4664 movdqa %xmm2,%xmm3 4665 xorps %xmm4,%xmm0 4666 shufps \$0b10001100,%xmm0,%xmm4 4667 pslldq \$4,%xmm3 4668 xorps %xmm4,%xmm0 4669 pshufd \$0b01010101,%xmm1,%xmm1 # critical path 4670 pxor %xmm3,%xmm2 4671 pxor %xmm1,%xmm0 4672 pshufd \$0b11111111,%xmm0,%xmm3 4673 pxor %xmm3,%xmm2 4674 ret 4675 4676.align 16 4677.Lkey_expansion_192b: 4678 movaps %xmm0,%xmm3 4679 shufps \$0b01000100,%xmm0,%xmm5 4680 $movkey %xmm5,(%rax) 4681 shufps \$0b01001110,%xmm2,%xmm3 4682 $movkey %xmm3,16(%rax) 4683 lea 32(%rax),%rax 4684 jmp .Lkey_expansion_192b_warm 4685 4686.align 16 4687.Lkey_expansion_256a: 4688 $movkey %xmm2,(%rax) 4689 lea 16(%rax),%rax 4690.Lkey_expansion_256a_cold: 4691 shufps \$0b00010000,%xmm0,%xmm4 4692 xorps %xmm4,%xmm0 4693 shufps \$0b10001100,%xmm0,%xmm4 4694 xorps %xmm4,%xmm0 4695 shufps \$0b11111111,%xmm1,%xmm1 # critical path 4696 xorps %xmm1,%xmm0 4697 ret 4698 4699.align 
16 4700.Lkey_expansion_256b: 4701 $movkey %xmm0,(%rax) 4702 lea 16(%rax),%rax 4703 4704 shufps \$0b00010000,%xmm2,%xmm4 4705 xorps %xmm4,%xmm2 4706 shufps \$0b10001100,%xmm2,%xmm4 4707 xorps %xmm4,%xmm2 4708 shufps \$0b10101010,%xmm1,%xmm1 # critical path 4709 xorps %xmm1,%xmm2 4710 ret 4711.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key 4712.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key 4713___ 4714} 4715 4716$code.=<<___; 4717.align 64 4718.Lbswap_mask: 4719 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 4720.Lincrement32: 4721 .long 6,6,6,0 4722.Lincrement64: 4723 .long 1,0,0,0 4724.Lxts_magic: 4725 .long 0x87,0,1,0 4726.Lincrement1: 4727 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 4728.Lkey_rotate: 4729 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d 4730.Lkey_rotate192: 4731 .long 0x04070605,0x04070605,0x04070605,0x04070605 4732.Lkey_rcon1: 4733 .long 1,1,1,1 4734.Lkey_rcon1b: 4735 .long 0x1b,0x1b,0x1b,0x1b 4736 4737.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>" 4738.align 64 4739___ 4740 4741# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 4742# CONTEXT *context,DISPATCHER_CONTEXT *disp) 4743if ($win64) { 4744$rec="%rcx"; 4745$frame="%rdx"; 4746$context="%r8"; 4747$disp="%r9"; 4748 4749$code.=<<___; 4750.extern __imp_RtlVirtualUnwind 4751___ 4752$code.=<<___ if ($PREFIX eq "aesni"); 4753.type ecb_ccm64_se_handler,\@abi-omnipotent 4754.align 16 4755ecb_ccm64_se_handler: 4756 push %rsi 4757 push %rdi 4758 push %rbx 4759 push %rbp 4760 push %r12 4761 push %r13 4762 push %r14 4763 push %r15 4764 pushfq 4765 sub \$64,%rsp 4766 4767 mov 120($context),%rax # pull context->Rax 4768 mov 248($context),%rbx # pull context->Rip 4769 4770 mov 8($disp),%rsi # disp->ImageBase 4771 mov 56($disp),%r11 # disp->HandlerData 4772 4773 mov 0(%r11),%r10d # HandlerData[0] 4774 lea (%rsi,%r10),%r10 # prologue label 4775 cmp %r10,%rbx # context->Rip<prologue label 4776 jb .Lcommon_seh_tail 4777 4778 mov 152($context),%rax # pull 
context->Rsp 4779 4780 mov 4(%r11),%r10d # HandlerData[1] 4781 lea (%rsi,%r10),%r10 # epilogue label 4782 cmp %r10,%rbx # context->Rip>=epilogue label 4783 jae .Lcommon_seh_tail 4784 4785 lea 0(%rax),%rsi # %xmm save area 4786 lea 512($context),%rdi # &context.Xmm6 4787 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) 4788 .long 0xa548f3fc # cld; rep movsq 4789 lea 0x58(%rax),%rax # adjust stack pointer 4790 4791 jmp .Lcommon_seh_tail 4792.size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler 4793 4794.type ctr_xts_se_handler,\@abi-omnipotent 4795.align 16 4796ctr_xts_se_handler: 4797 push %rsi 4798 push %rdi 4799 push %rbx 4800 push %rbp 4801 push %r12 4802 push %r13 4803 push %r14 4804 push %r15 4805 pushfq 4806 sub \$64,%rsp 4807 4808 mov 120($context),%rax # pull context->Rax 4809 mov 248($context),%rbx # pull context->Rip 4810 4811 mov 8($disp),%rsi # disp->ImageBase 4812 mov 56($disp),%r11 # disp->HandlerData 4813 4814 mov 0(%r11),%r10d # HandlerData[0] 4815 lea (%rsi,%r10),%r10 # prologue lable 4816 cmp %r10,%rbx # context->Rip<prologue label 4817 jb .Lcommon_seh_tail 4818 4819 mov 152($context),%rax # pull context->Rsp 4820 4821 mov 4(%r11),%r10d # HandlerData[1] 4822 lea (%rsi,%r10),%r10 # epilogue label 4823 cmp %r10,%rbx # context->Rip>=epilogue label 4824 jae .Lcommon_seh_tail 4825 4826 mov 208($context),%rax # pull context->R11 4827 4828 lea -0xa8(%rax),%rsi # %xmm save area 4829 lea 512($context),%rdi # & context.Xmm6 4830 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4831 .long 0xa548f3fc # cld; rep movsq 4832 4833 mov -8(%rax),%rbp # restore saved %rbp 4834 mov %rbp,160($context) # restore context->Rbp 4835 jmp .Lcommon_seh_tail 4836.size ctr_xts_se_handler,.-ctr_xts_se_handler 4837 4838.type ocb_se_handler,\@abi-omnipotent 4839.align 16 4840ocb_se_handler: 4841 push %rsi 4842 push %rdi 4843 push %rbx 4844 push %rbp 4845 push %r12 4846 push %r13 4847 push %r14 4848 push %r15 4849 pushfq 4850 sub \$64,%rsp 4851 4852 mov 120($context),%rax # pull context->Rax 
4853 mov 248($context),%rbx # pull context->Rip 4854 4855 mov 8($disp),%rsi # disp->ImageBase 4856 mov 56($disp),%r11 # disp->HandlerData 4857 4858 mov 0(%r11),%r10d # HandlerData[0] 4859 lea (%rsi,%r10),%r10 # prologue lable 4860 cmp %r10,%rbx # context->Rip<prologue label 4861 jb .Lcommon_seh_tail 4862 4863 mov 4(%r11),%r10d # HandlerData[1] 4864 lea (%rsi,%r10),%r10 # epilogue label 4865 cmp %r10,%rbx # context->Rip>=epilogue label 4866 jae .Lcommon_seh_tail 4867 4868 mov 8(%r11),%r10d # HandlerData[2] 4869 lea (%rsi,%r10),%r10 4870 cmp %r10,%rbx # context->Rip>=pop label 4871 jae .Locb_no_xmm 4872 4873 mov 152($context),%rax # pull context->Rsp 4874 4875 lea (%rax),%rsi # %xmm save area 4876 lea 512($context),%rdi # & context.Xmm6 4877 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4878 .long 0xa548f3fc # cld; rep movsq 4879 lea 0xa0+0x28(%rax),%rax 4880 4881.Locb_no_xmm: 4882 mov -8(%rax),%rbx 4883 mov -16(%rax),%rbp 4884 mov -24(%rax),%r12 4885 mov -32(%rax),%r13 4886 mov -40(%rax),%r14 4887 4888 mov %rbx,144($context) # restore context->Rbx 4889 mov %rbp,160($context) # restore context->Rbp 4890 mov %r12,216($context) # restore context->R12 4891 mov %r13,224($context) # restore context->R13 4892 mov %r14,232($context) # restore context->R14 4893 4894 jmp .Lcommon_seh_tail 4895.size ocb_se_handler,.-ocb_se_handler 4896___ 4897$code.=<<___; 4898.type cbc_se_handler,\@abi-omnipotent 4899.align 16 4900cbc_se_handler: 4901 push %rsi 4902 push %rdi 4903 push %rbx 4904 push %rbp 4905 push %r12 4906 push %r13 4907 push %r14 4908 push %r15 4909 pushfq 4910 sub \$64,%rsp 4911 4912 mov 152($context),%rax # pull context->Rsp 4913 mov 248($context),%rbx # pull context->Rip 4914 4915 lea .Lcbc_decrypt_bulk(%rip),%r10 4916 cmp %r10,%rbx # context->Rip<"prologue" label 4917 jb .Lcommon_seh_tail 4918 4919 mov 120($context),%rax # pull context->Rax 4920 4921 lea .Lcbc_decrypt_body(%rip),%r10 4922 cmp %r10,%rbx # context->Rip<cbc_decrypt_body 4923 jb .Lcommon_seh_tail 4924 4925 
mov 152($context),%rax # pull context->Rsp 4926 4927 lea .Lcbc_ret(%rip),%r10 4928 cmp %r10,%rbx # context->Rip>="epilogue" label 4929 jae .Lcommon_seh_tail 4930 4931 lea 16(%rax),%rsi # %xmm save area 4932 lea 512($context),%rdi # &context.Xmm6 4933 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4934 .long 0xa548f3fc # cld; rep movsq 4935 4936 mov 208($context),%rax # pull context->R11 4937 4938 mov -8(%rax),%rbp # restore saved %rbp 4939 mov %rbp,160($context) # restore context->Rbp 4940 4941.Lcommon_seh_tail: 4942 mov 8(%rax),%rdi 4943 mov 16(%rax),%rsi 4944 mov %rax,152($context) # restore context->Rsp 4945 mov %rsi,168($context) # restore context->Rsi 4946 mov %rdi,176($context) # restore context->Rdi 4947 4948 mov 40($disp),%rdi # disp->ContextRecord 4949 mov $context,%rsi # context 4950 mov \$154,%ecx # sizeof(CONTEXT) 4951 .long 0xa548f3fc # cld; rep movsq 4952 4953 mov $disp,%rsi 4954 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 4955 mov 8(%rsi),%rdx # arg2, disp->ImageBase 4956 mov 0(%rsi),%r8 # arg3, disp->ControlPc 4957 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 4958 mov 40(%rsi),%r10 # disp->ContextRecord 4959 lea 56(%rsi),%r11 # &disp->HandlerData 4960 lea 24(%rsi),%r12 # &disp->EstablisherFrame 4961 mov %r10,32(%rsp) # arg5 4962 mov %r11,40(%rsp) # arg6 4963 mov %r12,48(%rsp) # arg7 4964 mov %rcx,56(%rsp) # arg8, (NULL) 4965 call *__imp_RtlVirtualUnwind(%rip) 4966 4967 mov \$1,%eax # ExceptionContinueSearch 4968 add \$64,%rsp 4969 popfq 4970 pop %r15 4971 pop %r14 4972 pop %r13 4973 pop %r12 4974 pop %rbp 4975 pop %rbx 4976 pop %rdi 4977 pop %rsi 4978 ret 4979.size cbc_se_handler,.-cbc_se_handler 4980 4981.section .pdata 4982.align 4 4983___ 4984$code.=<<___ if ($PREFIX eq "aesni"); 4985 .rva .LSEH_begin_aesni_ecb_encrypt 4986 .rva .LSEH_end_aesni_ecb_encrypt 4987 .rva .LSEH_info_ecb 4988 4989 .rva .LSEH_begin_aesni_ccm64_encrypt_blocks 4990 .rva .LSEH_end_aesni_ccm64_encrypt_blocks 4991 .rva .LSEH_info_ccm64_enc 4992 4993 .rva 
.LSEH_begin_aesni_ccm64_decrypt_blocks 4994 .rva .LSEH_end_aesni_ccm64_decrypt_blocks 4995 .rva .LSEH_info_ccm64_dec 4996 4997 .rva .LSEH_begin_aesni_ctr32_encrypt_blocks 4998 .rva .LSEH_end_aesni_ctr32_encrypt_blocks 4999 .rva .LSEH_info_ctr32 5000 5001 .rva .LSEH_begin_aesni_xts_encrypt 5002 .rva .LSEH_end_aesni_xts_encrypt 5003 .rva .LSEH_info_xts_enc 5004 5005 .rva .LSEH_begin_aesni_xts_decrypt 5006 .rva .LSEH_end_aesni_xts_decrypt 5007 .rva .LSEH_info_xts_dec 5008 5009 .rva .LSEH_begin_aesni_ocb_encrypt 5010 .rva .LSEH_end_aesni_ocb_encrypt 5011 .rva .LSEH_info_ocb_enc 5012 5013 .rva .LSEH_begin_aesni_ocb_decrypt 5014 .rva .LSEH_end_aesni_ocb_decrypt 5015 .rva .LSEH_info_ocb_dec 5016___ 5017$code.=<<___; 5018 .rva .LSEH_begin_${PREFIX}_cbc_encrypt 5019 .rva .LSEH_end_${PREFIX}_cbc_encrypt 5020 .rva .LSEH_info_cbc 5021 5022 .rva ${PREFIX}_set_decrypt_key 5023 .rva .LSEH_end_set_decrypt_key 5024 .rva .LSEH_info_key 5025 5026 .rva ${PREFIX}_set_encrypt_key 5027 .rva .LSEH_end_set_encrypt_key 5028 .rva .LSEH_info_key 5029.section .xdata 5030.align 8 5031___ 5032$code.=<<___ if ($PREFIX eq "aesni"); 5033.LSEH_info_ecb: 5034 .byte 9,0,0,0 5035 .rva ecb_ccm64_se_handler 5036 .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[] 5037.LSEH_info_ccm64_enc: 5038 .byte 9,0,0,0 5039 .rva ecb_ccm64_se_handler 5040 .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[] 5041.LSEH_info_ccm64_dec: 5042 .byte 9,0,0,0 5043 .rva ecb_ccm64_se_handler 5044 .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[] 5045.LSEH_info_ctr32: 5046 .byte 9,0,0,0 5047 .rva ctr_xts_se_handler 5048 .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[] 5049.LSEH_info_xts_enc: 5050 .byte 9,0,0,0 5051 .rva ctr_xts_se_handler 5052 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] 5053.LSEH_info_xts_dec: 5054 .byte 9,0,0,0 5055 .rva ctr_xts_se_handler 5056 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] 5057.LSEH_info_ocb_enc: 5058 .byte 9,0,0,0 5059 .rva ocb_se_handler 5060 .rva 
.Locb_enc_body,.Locb_enc_epilogue	# HandlerData[]
	.rva	.Locb_enc_pop
	.long	0
.LSEH_info_ocb_dec:
	.byte	9,0,0,0
	.rva	ocb_se_handler
	.rva	.Locb_dec_body,.Locb_dec_epilogue	# HandlerData[]
	.rva	.Locb_dec_pop
	.long	0
___
$code.=<<___;
.LSEH_info_cbc:
	.byte	9,0,0,0
	.rva	cbc_se_handler
.LSEH_info_key:
	.byte	0x01,0x04,0x01,0x00
	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
___
}

# rex(\@opcode,$dst,$src)
#
# Appends a REX prefix byte to the opcode list referenced by the first
# argument when either register number is 8 or above: bit 0x04 is set
# for $dst (the register placed in the ModR/M reg field by the callers)
# and bit 0x01 for $src (the one placed in the r/m field).  Nothing is
# appended when both register numbers are below 8.
sub rex {
    my ($opcode,$dst,$src)=@_;
    my $rex=0;

    $rex|=0x04 if ($dst>=8);
    $rex|=0x01 if ($src>=8);
    push @$opcode,$rex|0x40 if ($rex);
}

# aesni($line)
#
# Translates one AES-NI instruction, given in AT&T syntax, into an
# equivalent ".byte" directive so that the generated file does not
# depend on the assembler knowing these mnemonics.  Three operand
# shapes are recognised:
#
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
#   aes{enc,enclast,dec,declast} off(%rsp),%xmmD
#
# Returns the ".byte" line on success.  Anything that is not recognised,
# including an unknown aes* mnemonic, is returned unchanged so that it
# reaches the assembler instead of silently vanishing from the output.
sub aesni {
    my $line=shift;
    my @opcode=(0x66);		# every encoding below starts with 0x66

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	# Copy captures first: the /^0/ match below resets $1..$4.
	my ($imm,$src,$dst)=($2,$3,$4);

	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	# A leading "0" means hex/octal notation; convert it to a number.
	push @opcode,$imm=~/^0/?oct($imm):$imm;
	return ".byte\t".join(',',@opcode);
    }

    if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$src,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);

	# Unknown mnemonic: leave the line for the assembler to judge.
	return $line if (!defined($opcodelet{$mnemonic}));

	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }

    if ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$off,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);

	# Unknown mnemonic: leave the line for the assembler to judge.
	return $line if (!defined($opcodelet{$mnemonic}));

	push @opcode,0x44 if ($dst>=8);			# REX for %xmm8 and up
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0x44|(($dst&7)<<3),0x24;		# ModR/M
	# Only the low 8 bits of the offset are emitted, so the offset
	# has to fit in a single displacement byte.
	push @opcode,($off=~/^0/?oct($off):$off)&0xff;
	return ".byte\t".join(',',@opcode);
    }

    return $line;
}

# movbe($disp)
#
# Returns the ".byte" encoding used in place of "movbe %eax,$disp(%rsp)";
# $disp is the decimal offset captured by the substitution below and is
# appended as the last byte.
sub movbe {
    my $disp=shift;
    return ".byte\t0x0f,0x38,0xf1,0x44,0x24,".$disp;
}

# Post-process the accumulated assembly text:
#  1. evaluate `...` expressions embedded in the templates;
#  2. replace AES-NI instructions with ".byte" sequences;
#  3. replace "movbe %eax,N(%rsp)" with its ".byte" sequence.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
#$code =~ s/\bmovbe\s+%eax/bswap	%eax; mov	%eax/gm;	# debugging artefact
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;

print $code;

# Buffered write errors only surface at close; fail loudly rather than
# leave a truncated assembly file behind with a zero exit status.
close STDOUT or die "error closing STDOUT: $!";