1#! /usr/bin/env perl 2# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16 17# Multi-buffer AES-NI procedures process several independent buffers 18# in parallel by interleaving independent instructions. 19# 20# Cycles per byte for interleave factor 4: 21# 22# asymptotic measured 23# --------------------------- 24# Westmere 5.00/4=1.25 5.13/4=1.28 25# Atom 15.0/4=3.75 ?15.7/4=3.93 26# Sandy Bridge 5.06/4=1.27 5.18/4=1.29 27# Ivy Bridge 5.06/4=1.27 5.14/4=1.29 28# Haswell 4.44/4=1.11 4.44/4=1.11 29# Bulldozer 5.75/4=1.44 5.76/4=1.44 30# 31# Cycles per byte for interleave factor 8 (not implemented for 32# pre-AVX processors, where higher interleave factor incidentally 33# doesn't result in improvement): 34# 35# asymptotic measured 36# --------------------------- 37# Sandy Bridge 5.06/8=0.64 7.10/8=0.89(*) 38# Ivy Bridge 5.06/8=0.64 7.14/8=0.89(*) 39# Haswell 5.00/8=0.63 5.00/8=0.63 40# Bulldozer 5.75/8=0.72 5.77/8=0.72 41# 42# (*) Sandy/Ivy Bridge are known to handle high interleave factors 43# suboptimally; 44 45$flavour = shift; 46$output = shift; 47if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 48 49$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 50 51$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 52( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 53( 
$xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 54die "can't locate x86_64-xlate.pl"; 55 56$avx=0; 57 58if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 59 =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 60 $avx = ($1>=2.19) + ($1>=2.22); 61} 62 63if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && 64 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { 65 $avx = ($1>=2.09) + ($1>=2.10); 66} 67 68if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && 69 `ml64 2>&1` =~ /Version ([0-9]+)\./) { 70 $avx = ($1>=10) + ($1>=11); 71} 72 73if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { 74 $avx = ($2>=3.0) + ($2>3.0); 75} 76 77open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 78*STDOUT=*OUT; 79 80# void aesni_multi_cbc_encrypt ( 81# struct { void *inp,*out; int blocks; double iv[2]; } inp[8]; 82# const AES_KEY *key, 83# int num); /* 1 or 2 */ 84# 85$inp="%rdi"; # 1st arg 86$key="%rsi"; # 2nd arg 87$num="%edx"; 88 89@inptr=map("%r$_",(8..11)); 90@outptr=map("%r$_",(12..15)); 91 92($rndkey0,$rndkey1)=("%xmm0","%xmm1"); 93@out=map("%xmm$_",(2..5)); 94@inp=map("%xmm$_",(6..9)); 95($counters,$mask,$zero)=map("%xmm$_",(10..12)); 96 97($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx"); 98 99$code.=<<___; 100.text 101 102.extern OPENSSL_ia32cap_P 103 104.globl aesni_multi_cbc_encrypt 105.type aesni_multi_cbc_encrypt,\@function,3 106.align 32 107aesni_multi_cbc_encrypt: 108.cfi_startproc 109___ 110$code.=<<___ if ($avx); 111 cmp \$2,$num 112 jb .Lenc_non_avx 113 mov OPENSSL_ia32cap_P+4(%rip),%ecx 114 test \$`1<<28`,%ecx # AVX bit 115 jnz _avx_cbc_enc_shortcut 116 jmp .Lenc_non_avx 117.align 16 118.Lenc_non_avx: 119___ 120$code.=<<___; 121 mov %rsp,%rax 122.cfi_def_cfa_register %rax 123 push %rbx 124.cfi_push %rbx 125 push %rbp 126.cfi_push %rbp 127 push %r12 128.cfi_push %r12 129 push %r13 130.cfi_push %r13 131 push %r14 132.cfi_push %r14 133 push %r15 
134.cfi_push %r15 135___ 136$code.=<<___ if ($win64); 137 lea -0xa8(%rsp),%rsp 138 movaps %xmm6,(%rsp) 139 movaps %xmm7,0x10(%rsp) 140 movaps %xmm8,0x20(%rsp) 141 movaps %xmm9,0x30(%rsp) 142 movaps %xmm10,0x40(%rsp) 143 movaps %xmm11,0x50(%rsp) 144 movaps %xmm12,0x60(%rsp) 145 movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler 146 movaps %xmm14,-0x58(%rax) 147 movaps %xmm15,-0x48(%rax) 148___ 149$code.=<<___; 150 # stack layout 151 # 152 # +0 output sink 153 # +16 input sink [original %rsp and $num] 154 # +32 counters 155 156 sub \$48,%rsp 157 and \$-64,%rsp 158 mov %rax,16(%rsp) # original %rsp 159.cfi_cfa_expression %rsp+16,deref,+8 160 161.Lenc4x_body: 162 movdqu ($key),$zero # 0-round key 163 lea 0x78($key),$key # size optimization 164 lea 40*2($inp),$inp 165 166.Lenc4x_loop_grande: 167 mov $num,24(%rsp) # original $num 168 xor $num,$num 169___ 170for($i=0;$i<4;$i++) { 171 $code.=<<___; 172 mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks 173 mov `40*$i+0-40*2`($inp),@inptr[$i] 174 cmp $num,$one 175 mov `40*$i+8-40*2`($inp),@outptr[$i] 176 cmovg $one,$num # find maximum 177 test $one,$one 178 movdqu `40*$i+24-40*2`($inp),@out[$i] # load IV 179 mov $one,`32+4*$i`(%rsp) # initialize counters 180 cmovle %rsp,@inptr[$i] # cancel input 181___ 182} 183$code.=<<___; 184 test $num,$num 185 jz .Lenc4x_done 186 187 movups 0x10-0x78($key),$rndkey1 188 pxor $zero,@out[0] 189 movups 0x20-0x78($key),$rndkey0 190 pxor $zero,@out[1] 191 mov 0xf0-0x78($key),$rounds 192 pxor $zero,@out[2] 193 movdqu (@inptr[0]),@inp[0] # load inputs 194 pxor $zero,@out[3] 195 movdqu (@inptr[1]),@inp[1] 196 pxor @inp[0],@out[0] 197 movdqu (@inptr[2]),@inp[2] 198 pxor @inp[1],@out[1] 199 movdqu (@inptr[3]),@inp[3] 200 pxor @inp[2],@out[2] 201 pxor @inp[3],@out[3] 202 movdqa 32(%rsp),$counters # load counters 203 xor $offset,$offset 204 jmp .Loop_enc4x 205 206.align 32 207.Loop_enc4x: 208 add \$16,$offset 209 lea 16(%rsp),$sink # sink pointer 210 mov \$1,$one # 
constant of 1 211 sub $offset,$sink 212 213 aesenc $rndkey1,@out[0] 214 prefetcht0 31(@inptr[0],$offset) # prefetch input 215 prefetcht0 31(@inptr[1],$offset) 216 aesenc $rndkey1,@out[1] 217 prefetcht0 31(@inptr[2],$offset) 218 prefetcht0 31(@inptr[2],$offset) 219 aesenc $rndkey1,@out[2] 220 aesenc $rndkey1,@out[3] 221 movups 0x30-0x78($key),$rndkey1 222___ 223for($i=0;$i<4;$i++) { 224my $rndkey = ($i&1) ? $rndkey1 : $rndkey0; 225$code.=<<___; 226 cmp `32+4*$i`(%rsp),$one 227 aesenc $rndkey,@out[0] 228 aesenc $rndkey,@out[1] 229 aesenc $rndkey,@out[2] 230 cmovge $sink,@inptr[$i] # cancel input 231 cmovg $sink,@outptr[$i] # sink output 232 aesenc $rndkey,@out[3] 233 movups `0x40+16*$i-0x78`($key),$rndkey 234___ 235} 236$code.=<<___; 237 movdqa $counters,$mask 238 aesenc $rndkey0,@out[0] 239 prefetcht0 15(@outptr[0],$offset) # prefetch output 240 prefetcht0 15(@outptr[1],$offset) 241 aesenc $rndkey0,@out[1] 242 prefetcht0 15(@outptr[2],$offset) 243 prefetcht0 15(@outptr[3],$offset) 244 aesenc $rndkey0,@out[2] 245 aesenc $rndkey0,@out[3] 246 movups 0x80-0x78($key),$rndkey0 247 pxor $zero,$zero 248 249 aesenc $rndkey1,@out[0] 250 pcmpgtd $zero,$mask 251 movdqu -0x78($key),$zero # reload 0-round key 252 aesenc $rndkey1,@out[1] 253 paddd $mask,$counters # decrement counters 254 movdqa $counters,32(%rsp) # update counters 255 aesenc $rndkey1,@out[2] 256 aesenc $rndkey1,@out[3] 257 movups 0x90-0x78($key),$rndkey1 258 259 cmp \$11,$rounds 260 261 aesenc $rndkey0,@out[0] 262 aesenc $rndkey0,@out[1] 263 aesenc $rndkey0,@out[2] 264 aesenc $rndkey0,@out[3] 265 movups 0xa0-0x78($key),$rndkey0 266 267 jb .Lenc4x_tail 268 269 aesenc $rndkey1,@out[0] 270 aesenc $rndkey1,@out[1] 271 aesenc $rndkey1,@out[2] 272 aesenc $rndkey1,@out[3] 273 movups 0xb0-0x78($key),$rndkey1 274 275 aesenc $rndkey0,@out[0] 276 aesenc $rndkey0,@out[1] 277 aesenc $rndkey0,@out[2] 278 aesenc $rndkey0,@out[3] 279 movups 0xc0-0x78($key),$rndkey0 280 281 je .Lenc4x_tail 282 283 aesenc $rndkey1,@out[0] 284 
aesenc $rndkey1,@out[1] 285 aesenc $rndkey1,@out[2] 286 aesenc $rndkey1,@out[3] 287 movups 0xd0-0x78($key),$rndkey1 288 289 aesenc $rndkey0,@out[0] 290 aesenc $rndkey0,@out[1] 291 aesenc $rndkey0,@out[2] 292 aesenc $rndkey0,@out[3] 293 movups 0xe0-0x78($key),$rndkey0 294 jmp .Lenc4x_tail 295 296.align 32 297.Lenc4x_tail: 298 aesenc $rndkey1,@out[0] 299 aesenc $rndkey1,@out[1] 300 aesenc $rndkey1,@out[2] 301 aesenc $rndkey1,@out[3] 302 movdqu (@inptr[0],$offset),@inp[0] 303 movdqu 0x10-0x78($key),$rndkey1 304 305 aesenclast $rndkey0,@out[0] 306 movdqu (@inptr[1],$offset),@inp[1] 307 pxor $zero,@inp[0] 308 aesenclast $rndkey0,@out[1] 309 movdqu (@inptr[2],$offset),@inp[2] 310 pxor $zero,@inp[1] 311 aesenclast $rndkey0,@out[2] 312 movdqu (@inptr[3],$offset),@inp[3] 313 pxor $zero,@inp[2] 314 aesenclast $rndkey0,@out[3] 315 movdqu 0x20-0x78($key),$rndkey0 316 pxor $zero,@inp[3] 317 318 movups @out[0],-16(@outptr[0],$offset) 319 pxor @inp[0],@out[0] 320 movups @out[1],-16(@outptr[1],$offset) 321 pxor @inp[1],@out[1] 322 movups @out[2],-16(@outptr[2],$offset) 323 pxor @inp[2],@out[2] 324 movups @out[3],-16(@outptr[3],$offset) 325 pxor @inp[3],@out[3] 326 327 dec $num 328 jnz .Loop_enc4x 329 330 mov 16(%rsp),%rax # original %rsp 331.cfi_def_cfa %rax,8 332 mov 24(%rsp),$num 333 334 #pxor @inp[0],@out[0] 335 #pxor @inp[1],@out[1] 336 #movdqu @out[0],`40*0+24-40*2`($inp) # output iv FIX ME! 337 #pxor @inp[2],@out[2] 338 #movdqu @out[1],`40*1+24-40*2`($inp) 339 #pxor @inp[3],@out[3] 340 #movdqu @out[2],`40*2+24-40*2`($inp) # won't fix, let caller 341 #movdqu @out[3],`40*3+24-40*2`($inp) # figure this out... 
342 343 lea `40*4`($inp),$inp 344 dec $num 345 jnz .Lenc4x_loop_grande 346 347.Lenc4x_done: 348___ 349$code.=<<___ if ($win64); 350 movaps -0xd8(%rax),%xmm6 351 movaps -0xc8(%rax),%xmm7 352 movaps -0xb8(%rax),%xmm8 353 movaps -0xa8(%rax),%xmm9 354 movaps -0x98(%rax),%xmm10 355 movaps -0x88(%rax),%xmm11 356 movaps -0x78(%rax),%xmm12 357 #movaps -0x68(%rax),%xmm13 358 #movaps -0x58(%rax),%xmm14 359 #movaps -0x48(%rax),%xmm15 360___ 361$code.=<<___; 362 mov -48(%rax),%r15 363.cfi_restore %r15 364 mov -40(%rax),%r14 365.cfi_restore %r14 366 mov -32(%rax),%r13 367.cfi_restore %r13 368 mov -24(%rax),%r12 369.cfi_restore %r12 370 mov -16(%rax),%rbp 371.cfi_restore %rbp 372 mov -8(%rax),%rbx 373.cfi_restore %rbx 374 lea (%rax),%rsp 375.cfi_def_cfa_register %rsp 376.Lenc4x_epilogue: 377 ret 378.cfi_endproc 379.size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt 380 381.globl aesni_multi_cbc_decrypt 382.type aesni_multi_cbc_decrypt,\@function,3 383.align 32 384aesni_multi_cbc_decrypt: 385.cfi_startproc 386___ 387$code.=<<___ if ($avx); 388 cmp \$2,$num 389 jb .Ldec_non_avx 390 mov OPENSSL_ia32cap_P+4(%rip),%ecx 391 test \$`1<<28`,%ecx # AVX bit 392 jnz _avx_cbc_dec_shortcut 393 jmp .Ldec_non_avx 394.align 16 395.Ldec_non_avx: 396___ 397$code.=<<___; 398 mov %rsp,%rax 399.cfi_def_cfa_register %rax 400 push %rbx 401.cfi_push %rbx 402 push %rbp 403.cfi_push %rbp 404 push %r12 405.cfi_push %r12 406 push %r13 407.cfi_push %r13 408 push %r14 409.cfi_push %r14 410 push %r15 411.cfi_push %r15 412___ 413$code.=<<___ if ($win64); 414 lea -0xa8(%rsp),%rsp 415 movaps %xmm6,(%rsp) 416 movaps %xmm7,0x10(%rsp) 417 movaps %xmm8,0x20(%rsp) 418 movaps %xmm9,0x30(%rsp) 419 movaps %xmm10,0x40(%rsp) 420 movaps %xmm11,0x50(%rsp) 421 movaps %xmm12,0x60(%rsp) 422 movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler 423 movaps %xmm14,-0x58(%rax) 424 movaps %xmm15,-0x48(%rax) 425___ 426$code.=<<___; 427 # stack layout 428 # 429 # +0 output sink 430 # +16 input sink [original %rsp and 
$num] 431 # +32 counters 432 433 sub \$48,%rsp 434 and \$-64,%rsp 435 mov %rax,16(%rsp) # original %rsp 436.cfi_cfa_expression %rsp+16,deref,+8 437 438.Ldec4x_body: 439 movdqu ($key),$zero # 0-round key 440 lea 0x78($key),$key # size optimization 441 lea 40*2($inp),$inp 442 443.Ldec4x_loop_grande: 444 mov $num,24(%rsp) # original $num 445 xor $num,$num 446___ 447for($i=0;$i<4;$i++) { 448 $code.=<<___; 449 mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks 450 mov `40*$i+0-40*2`($inp),@inptr[$i] 451 cmp $num,$one 452 mov `40*$i+8-40*2`($inp),@outptr[$i] 453 cmovg $one,$num # find maximum 454 test $one,$one 455 movdqu `40*$i+24-40*2`($inp),@inp[$i] # load IV 456 mov $one,`32+4*$i`(%rsp) # initialize counters 457 cmovle %rsp,@inptr[$i] # cancel input 458___ 459} 460$code.=<<___; 461 test $num,$num 462 jz .Ldec4x_done 463 464 movups 0x10-0x78($key),$rndkey1 465 movups 0x20-0x78($key),$rndkey0 466 mov 0xf0-0x78($key),$rounds 467 movdqu (@inptr[0]),@out[0] # load inputs 468 movdqu (@inptr[1]),@out[1] 469 pxor $zero,@out[0] 470 movdqu (@inptr[2]),@out[2] 471 pxor $zero,@out[1] 472 movdqu (@inptr[3]),@out[3] 473 pxor $zero,@out[2] 474 pxor $zero,@out[3] 475 movdqa 32(%rsp),$counters # load counters 476 xor $offset,$offset 477 jmp .Loop_dec4x 478 479.align 32 480.Loop_dec4x: 481 add \$16,$offset 482 lea 16(%rsp),$sink # sink pointer 483 mov \$1,$one # constant of 1 484 sub $offset,$sink 485 486 aesdec $rndkey1,@out[0] 487 prefetcht0 31(@inptr[0],$offset) # prefetch input 488 prefetcht0 31(@inptr[1],$offset) 489 aesdec $rndkey1,@out[1] 490 prefetcht0 31(@inptr[2],$offset) 491 prefetcht0 31(@inptr[3],$offset) 492 aesdec $rndkey1,@out[2] 493 aesdec $rndkey1,@out[3] 494 movups 0x30-0x78($key),$rndkey1 495___ 496for($i=0;$i<4;$i++) { 497my $rndkey = ($i&1) ? 
$rndkey1 : $rndkey0; 498$code.=<<___; 499 cmp `32+4*$i`(%rsp),$one 500 aesdec $rndkey,@out[0] 501 aesdec $rndkey,@out[1] 502 aesdec $rndkey,@out[2] 503 cmovge $sink,@inptr[$i] # cancel input 504 cmovg $sink,@outptr[$i] # sink output 505 aesdec $rndkey,@out[3] 506 movups `0x40+16*$i-0x78`($key),$rndkey 507___ 508} 509$code.=<<___; 510 movdqa $counters,$mask 511 aesdec $rndkey0,@out[0] 512 prefetcht0 15(@outptr[0],$offset) # prefetch output 513 prefetcht0 15(@outptr[1],$offset) 514 aesdec $rndkey0,@out[1] 515 prefetcht0 15(@outptr[2],$offset) 516 prefetcht0 15(@outptr[3],$offset) 517 aesdec $rndkey0,@out[2] 518 aesdec $rndkey0,@out[3] 519 movups 0x80-0x78($key),$rndkey0 520 pxor $zero,$zero 521 522 aesdec $rndkey1,@out[0] 523 pcmpgtd $zero,$mask 524 movdqu -0x78($key),$zero # reload 0-round key 525 aesdec $rndkey1,@out[1] 526 paddd $mask,$counters # decrement counters 527 movdqa $counters,32(%rsp) # update counters 528 aesdec $rndkey1,@out[2] 529 aesdec $rndkey1,@out[3] 530 movups 0x90-0x78($key),$rndkey1 531 532 cmp \$11,$rounds 533 534 aesdec $rndkey0,@out[0] 535 aesdec $rndkey0,@out[1] 536 aesdec $rndkey0,@out[2] 537 aesdec $rndkey0,@out[3] 538 movups 0xa0-0x78($key),$rndkey0 539 540 jb .Ldec4x_tail 541 542 aesdec $rndkey1,@out[0] 543 aesdec $rndkey1,@out[1] 544 aesdec $rndkey1,@out[2] 545 aesdec $rndkey1,@out[3] 546 movups 0xb0-0x78($key),$rndkey1 547 548 aesdec $rndkey0,@out[0] 549 aesdec $rndkey0,@out[1] 550 aesdec $rndkey0,@out[2] 551 aesdec $rndkey0,@out[3] 552 movups 0xc0-0x78($key),$rndkey0 553 554 je .Ldec4x_tail 555 556 aesdec $rndkey1,@out[0] 557 aesdec $rndkey1,@out[1] 558 aesdec $rndkey1,@out[2] 559 aesdec $rndkey1,@out[3] 560 movups 0xd0-0x78($key),$rndkey1 561 562 aesdec $rndkey0,@out[0] 563 aesdec $rndkey0,@out[1] 564 aesdec $rndkey0,@out[2] 565 aesdec $rndkey0,@out[3] 566 movups 0xe0-0x78($key),$rndkey0 567 jmp .Ldec4x_tail 568 569.align 32 570.Ldec4x_tail: 571 aesdec $rndkey1,@out[0] 572 aesdec $rndkey1,@out[1] 573 aesdec $rndkey1,@out[2] 574 pxor 
$rndkey0,@inp[0] 575 pxor $rndkey0,@inp[1] 576 aesdec $rndkey1,@out[3] 577 movdqu 0x10-0x78($key),$rndkey1 578 pxor $rndkey0,@inp[2] 579 pxor $rndkey0,@inp[3] 580 movdqu 0x20-0x78($key),$rndkey0 581 582 aesdeclast @inp[0],@out[0] 583 aesdeclast @inp[1],@out[1] 584 movdqu -16(@inptr[0],$offset),@inp[0] # load next IV 585 movdqu -16(@inptr[1],$offset),@inp[1] 586 aesdeclast @inp[2],@out[2] 587 aesdeclast @inp[3],@out[3] 588 movdqu -16(@inptr[2],$offset),@inp[2] 589 movdqu -16(@inptr[3],$offset),@inp[3] 590 591 movups @out[0],-16(@outptr[0],$offset) 592 movdqu (@inptr[0],$offset),@out[0] 593 movups @out[1],-16(@outptr[1],$offset) 594 movdqu (@inptr[1],$offset),@out[1] 595 pxor $zero,@out[0] 596 movups @out[2],-16(@outptr[2],$offset) 597 movdqu (@inptr[2],$offset),@out[2] 598 pxor $zero,@out[1] 599 movups @out[3],-16(@outptr[3],$offset) 600 movdqu (@inptr[3],$offset),@out[3] 601 pxor $zero,@out[2] 602 pxor $zero,@out[3] 603 604 dec $num 605 jnz .Loop_dec4x 606 607 mov 16(%rsp),%rax # original %rsp 608.cfi_def_cfa %rax,8 609 mov 24(%rsp),$num 610 611 lea `40*4`($inp),$inp 612 dec $num 613 jnz .Ldec4x_loop_grande 614 615.Ldec4x_done: 616___ 617$code.=<<___ if ($win64); 618 movaps -0xd8(%rax),%xmm6 619 movaps -0xc8(%rax),%xmm7 620 movaps -0xb8(%rax),%xmm8 621 movaps -0xa8(%rax),%xmm9 622 movaps -0x98(%rax),%xmm10 623 movaps -0x88(%rax),%xmm11 624 movaps -0x78(%rax),%xmm12 625 #movaps -0x68(%rax),%xmm13 626 #movaps -0x58(%rax),%xmm14 627 #movaps -0x48(%rax),%xmm15 628___ 629$code.=<<___; 630 mov -48(%rax),%r15 631.cfi_restore %r15 632 mov -40(%rax),%r14 633.cfi_restore %r14 634 mov -32(%rax),%r13 635.cfi_restore %r13 636 mov -24(%rax),%r12 637.cfi_restore %r12 638 mov -16(%rax),%rbp 639.cfi_restore %rbp 640 mov -8(%rax),%rbx 641.cfi_restore %rbx 642 lea (%rax),%rsp 643.cfi_def_cfa_register %rsp 644.Ldec4x_epilogue: 645 ret 646.cfi_endproc 647.size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt 648___ 649 650 if ($avx) {{{ 651my @ptr=map("%r$_",(8..15)); 652my 
$offload=$sink; 653 654my @out=map("%xmm$_",(2..9)); 655my @inp=map("%xmm$_",(10..13)); 656my ($counters,$zero)=("%xmm14","%xmm15"); 657 658$code.=<<___; 659.type aesni_multi_cbc_encrypt_avx,\@function,3 660.align 32 661aesni_multi_cbc_encrypt_avx: 662.cfi_startproc 663_avx_cbc_enc_shortcut: 664 mov %rsp,%rax 665.cfi_def_cfa_register %rax 666 push %rbx 667.cfi_push %rbx 668 push %rbp 669.cfi_push %rbp 670 push %r12 671.cfi_push %r12 672 push %r13 673.cfi_push %r13 674 push %r14 675.cfi_push %r14 676 push %r15 677.cfi_push %r15 678___ 679$code.=<<___ if ($win64); 680 lea -0xa8(%rsp),%rsp 681 movaps %xmm6,(%rsp) 682 movaps %xmm7,0x10(%rsp) 683 movaps %xmm8,0x20(%rsp) 684 movaps %xmm9,0x30(%rsp) 685 movaps %xmm10,0x40(%rsp) 686 movaps %xmm11,0x50(%rsp) 687 movaps %xmm12,-0x78(%rax) 688 movaps %xmm13,-0x68(%rax) 689 movaps %xmm14,-0x58(%rax) 690 movaps %xmm15,-0x48(%rax) 691___ 692$code.=<<___; 693 # stack layout 694 # 695 # +0 output sink 696 # +16 input sink [original %rsp and $num] 697 # +32 counters 698 # +64 distances between inputs and outputs 699 # +128 off-load area for @inp[0..3] 700 701 sub \$192,%rsp 702 and \$-128,%rsp 703 mov %rax,16(%rsp) # original %rsp 704.cfi_cfa_expression %rsp+16,deref,+8 705 706.Lenc8x_body: 707 vzeroupper 708 vmovdqu ($key),$zero # 0-round key 709 lea 0x78($key),$key # size optimization 710 lea 40*4($inp),$inp 711 shr \$1,$num 712 713.Lenc8x_loop_grande: 714 #mov $num,24(%rsp) # original $num 715 xor $num,$num 716___ 717for($i=0;$i<8;$i++) { 718 my $temp = $i ? 
$offload : $offset; 719 $code.=<<___; 720 mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks 721 mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer 722 cmp $num,$one 723 mov `40*$i+8-40*4`($inp),$temp # output pointer 724 cmovg $one,$num # find maximum 725 test $one,$one 726 vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV 727 mov $one,`32+4*$i`(%rsp) # initialize counters 728 cmovle %rsp,@ptr[$i] # cancel input 729 sub @ptr[$i],$temp # distance between input and output 730 mov $temp,`64+8*$i`(%rsp) # initialize distances 731___ 732} 733$code.=<<___; 734 test $num,$num 735 jz .Lenc8x_done 736 737 vmovups 0x10-0x78($key),$rndkey1 738 vmovups 0x20-0x78($key),$rndkey0 739 mov 0xf0-0x78($key),$rounds 740 741 vpxor (@ptr[0]),$zero,@inp[0] # load inputs and xor with 0-round 742 lea 128(%rsp),$offload # offload area 743 vpxor (@ptr[1]),$zero,@inp[1] 744 vpxor (@ptr[2]),$zero,@inp[2] 745 vpxor (@ptr[3]),$zero,@inp[3] 746 vpxor @inp[0],@out[0],@out[0] 747 vpxor (@ptr[4]),$zero,@inp[0] 748 vpxor @inp[1],@out[1],@out[1] 749 vpxor (@ptr[5]),$zero,@inp[1] 750 vpxor @inp[2],@out[2],@out[2] 751 vpxor (@ptr[6]),$zero,@inp[2] 752 vpxor @inp[3],@out[3],@out[3] 753 vpxor (@ptr[7]),$zero,@inp[3] 754 vpxor @inp[0],@out[4],@out[4] 755 mov \$1,$one # constant of 1 756 vpxor @inp[1],@out[5],@out[5] 757 vpxor @inp[2],@out[6],@out[6] 758 vpxor @inp[3],@out[7],@out[7] 759 jmp .Loop_enc8x 760 761.align 32 762.Loop_enc8x: 763___ 764for($i=0;$i<8;$i++) { 765my $rndkey=($i&1)?$rndkey0:$rndkey1; 766$code.=<<___; 767 vaesenc $rndkey,@out[0],@out[0] 768 cmp 32+4*$i(%rsp),$one 769___ 770$code.=<<___ if ($i); 771 mov 64+8*$i(%rsp),$offset 772___ 773$code.=<<___; 774 vaesenc $rndkey,@out[1],@out[1] 775 prefetcht0 31(@ptr[$i]) # prefetch input 776 vaesenc $rndkey,@out[2],@out[2] 777___ 778$code.=<<___ if ($i>1); 779 prefetcht0 15(@ptr[$i-2]) # prefetch output 780___ 781$code.=<<___; 782 vaesenc $rndkey,@out[3],@out[3] 783 lea (@ptr[$i],$offset),$offset 784 cmovge %rsp,@ptr[$i] # cancel 
input 785 vaesenc $rndkey,@out[4],@out[4] 786 cmovg %rsp,$offset # sink output 787 vaesenc $rndkey,@out[5],@out[5] 788 sub @ptr[$i],$offset 789 vaesenc $rndkey,@out[6],@out[6] 790 vpxor 16(@ptr[$i]),$zero,@inp[$i%4] # load input and xor with 0-round 791 mov $offset,64+8*$i(%rsp) 792 vaesenc $rndkey,@out[7],@out[7] 793 vmovups `16*(3+$i)-0x78`($key),$rndkey 794 lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output 795___ 796$code.=<<___ if ($i<4) 797 vmovdqu @inp[$i%4],`16*$i`($offload) # off-load 798___ 799} 800$code.=<<___; 801 vmovdqu 32(%rsp),$counters 802 prefetcht0 15(@ptr[$i-2]) # prefetch output 803 prefetcht0 15(@ptr[$i-1]) 804 cmp \$11,$rounds 805 jb .Lenc8x_tail 806 807 vaesenc $rndkey1,@out[0],@out[0] 808 vaesenc $rndkey1,@out[1],@out[1] 809 vaesenc $rndkey1,@out[2],@out[2] 810 vaesenc $rndkey1,@out[3],@out[3] 811 vaesenc $rndkey1,@out[4],@out[4] 812 vaesenc $rndkey1,@out[5],@out[5] 813 vaesenc $rndkey1,@out[6],@out[6] 814 vaesenc $rndkey1,@out[7],@out[7] 815 vmovups 0xb0-0x78($key),$rndkey1 816 817 vaesenc $rndkey0,@out[0],@out[0] 818 vaesenc $rndkey0,@out[1],@out[1] 819 vaesenc $rndkey0,@out[2],@out[2] 820 vaesenc $rndkey0,@out[3],@out[3] 821 vaesenc $rndkey0,@out[4],@out[4] 822 vaesenc $rndkey0,@out[5],@out[5] 823 vaesenc $rndkey0,@out[6],@out[6] 824 vaesenc $rndkey0,@out[7],@out[7] 825 vmovups 0xc0-0x78($key),$rndkey0 826 je .Lenc8x_tail 827 828 vaesenc $rndkey1,@out[0],@out[0] 829 vaesenc $rndkey1,@out[1],@out[1] 830 vaesenc $rndkey1,@out[2],@out[2] 831 vaesenc $rndkey1,@out[3],@out[3] 832 vaesenc $rndkey1,@out[4],@out[4] 833 vaesenc $rndkey1,@out[5],@out[5] 834 vaesenc $rndkey1,@out[6],@out[6] 835 vaesenc $rndkey1,@out[7],@out[7] 836 vmovups 0xd0-0x78($key),$rndkey1 837 838 vaesenc $rndkey0,@out[0],@out[0] 839 vaesenc $rndkey0,@out[1],@out[1] 840 vaesenc $rndkey0,@out[2],@out[2] 841 vaesenc $rndkey0,@out[3],@out[3] 842 vaesenc $rndkey0,@out[4],@out[4] 843 vaesenc $rndkey0,@out[5],@out[5] 844 vaesenc $rndkey0,@out[6],@out[6] 845 vaesenc 
$rndkey0,@out[7],@out[7] 846 vmovups 0xe0-0x78($key),$rndkey0 847 848.Lenc8x_tail: 849 vaesenc $rndkey1,@out[0],@out[0] 850 vpxor $zero,$zero,$zero 851 vaesenc $rndkey1,@out[1],@out[1] 852 vaesenc $rndkey1,@out[2],@out[2] 853 vpcmpgtd $zero,$counters,$zero 854 vaesenc $rndkey1,@out[3],@out[3] 855 vaesenc $rndkey1,@out[4],@out[4] 856 vpaddd $counters,$zero,$zero # decrement counters 857 vmovdqu 48(%rsp),$counters 858 vaesenc $rndkey1,@out[5],@out[5] 859 mov 64(%rsp),$offset # pre-load 1st offset 860 vaesenc $rndkey1,@out[6],@out[6] 861 vaesenc $rndkey1,@out[7],@out[7] 862 vmovups 0x10-0x78($key),$rndkey1 863 864 vaesenclast $rndkey0,@out[0],@out[0] 865 vmovdqa $zero,32(%rsp) # update counters 866 vpxor $zero,$zero,$zero 867 vaesenclast $rndkey0,@out[1],@out[1] 868 vaesenclast $rndkey0,@out[2],@out[2] 869 vpcmpgtd $zero,$counters,$zero 870 vaesenclast $rndkey0,@out[3],@out[3] 871 vaesenclast $rndkey0,@out[4],@out[4] 872 vpaddd $zero,$counters,$counters # decrement counters 873 vmovdqu -0x78($key),$zero # 0-round 874 vaesenclast $rndkey0,@out[5],@out[5] 875 vaesenclast $rndkey0,@out[6],@out[6] 876 vmovdqa $counters,48(%rsp) # update counters 877 vaesenclast $rndkey0,@out[7],@out[7] 878 vmovups 0x20-0x78($key),$rndkey0 879 880 vmovups @out[0],-16(@ptr[0]) # write output 881 sub $offset,@ptr[0] # switch to input 882 vpxor 0x00($offload),@out[0],@out[0] 883 vmovups @out[1],-16(@ptr[1]) 884 sub `64+1*8`(%rsp),@ptr[1] 885 vpxor 0x10($offload),@out[1],@out[1] 886 vmovups @out[2],-16(@ptr[2]) 887 sub `64+2*8`(%rsp),@ptr[2] 888 vpxor 0x20($offload),@out[2],@out[2] 889 vmovups @out[3],-16(@ptr[3]) 890 sub `64+3*8`(%rsp),@ptr[3] 891 vpxor 0x30($offload),@out[3],@out[3] 892 vmovups @out[4],-16(@ptr[4]) 893 sub `64+4*8`(%rsp),@ptr[4] 894 vpxor @inp[0],@out[4],@out[4] 895 vmovups @out[5],-16(@ptr[5]) 896 sub `64+5*8`(%rsp),@ptr[5] 897 vpxor @inp[1],@out[5],@out[5] 898 vmovups @out[6],-16(@ptr[6]) 899 sub `64+6*8`(%rsp),@ptr[6] 900 vpxor @inp[2],@out[6],@out[6] 901 vmovups 
@out[7],-16(@ptr[7]) 902 sub `64+7*8`(%rsp),@ptr[7] 903 vpxor @inp[3],@out[7],@out[7] 904 905 dec $num 906 jnz .Loop_enc8x 907 908 mov 16(%rsp),%rax # original %rsp 909.cfi_def_cfa %rax,8 910 #mov 24(%rsp),$num 911 #lea `40*8`($inp),$inp 912 #dec $num 913 #jnz .Lenc8x_loop_grande 914 915.Lenc8x_done: 916 vzeroupper 917___ 918$code.=<<___ if ($win64); 919 movaps -0xd8(%rax),%xmm6 920 movaps -0xc8(%rax),%xmm7 921 movaps -0xb8(%rax),%xmm8 922 movaps -0xa8(%rax),%xmm9 923 movaps -0x98(%rax),%xmm10 924 movaps -0x88(%rax),%xmm11 925 movaps -0x78(%rax),%xmm12 926 movaps -0x68(%rax),%xmm13 927 movaps -0x58(%rax),%xmm14 928 movaps -0x48(%rax),%xmm15 929___ 930$code.=<<___; 931 mov -48(%rax),%r15 932.cfi_restore %r15 933 mov -40(%rax),%r14 934.cfi_restore %r14 935 mov -32(%rax),%r13 936.cfi_restore %r13 937 mov -24(%rax),%r12 938.cfi_restore %r12 939 mov -16(%rax),%rbp 940.cfi_restore %rbp 941 mov -8(%rax),%rbx 942.cfi_restore %rbx 943 lea (%rax),%rsp 944.cfi_def_cfa_register %rsp 945.Lenc8x_epilogue: 946 ret 947.cfi_endproc 948.size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx 949 950.type aesni_multi_cbc_decrypt_avx,\@function,3 951.align 32 952aesni_multi_cbc_decrypt_avx: 953.cfi_startproc 954_avx_cbc_dec_shortcut: 955 mov %rsp,%rax 956.cfi_def_cfa_register %rax 957 push %rbx 958.cfi_push %rbx 959 push %rbp 960.cfi_push %rbp 961 push %r12 962.cfi_push %r12 963 push %r13 964.cfi_push %r13 965 push %r14 966.cfi_push %r14 967 push %r15 968.cfi_push %r15 969___ 970$code.=<<___ if ($win64); 971 lea -0xa8(%rsp),%rsp 972 movaps %xmm6,(%rsp) 973 movaps %xmm7,0x10(%rsp) 974 movaps %xmm8,0x20(%rsp) 975 movaps %xmm9,0x30(%rsp) 976 movaps %xmm10,0x40(%rsp) 977 movaps %xmm11,0x50(%rsp) 978 movaps %xmm12,-0x78(%rax) 979 movaps %xmm13,-0x68(%rax) 980 movaps %xmm14,-0x58(%rax) 981 movaps %xmm15,-0x48(%rax) 982___ 983$code.=<<___; 984 # stack layout 985 # 986 # +0 output sink 987 # +16 input sink [original %rsp and $num] 988 # +32 counters 989 # +64 distances between inputs 
and outputs 990 # +128 off-load area for @inp[0..3] 991 # +192 IV/input offload 992 993 sub \$256,%rsp 994 and \$-256,%rsp 995 sub \$192,%rsp 996 mov %rax,16(%rsp) # original %rsp 997.cfi_cfa_expression %rsp+16,deref,+8 998 999.Ldec8x_body: 1000 vzeroupper 1001 vmovdqu ($key),$zero # 0-round key 1002 lea 0x78($key),$key # size optimization 1003 lea 40*4($inp),$inp 1004 shr \$1,$num 1005 1006.Ldec8x_loop_grande: 1007 #mov $num,24(%rsp) # original $num 1008 xor $num,$num 1009___ 1010for($i=0;$i<8;$i++) { 1011 my $temp = $i ? $offload : $offset; 1012 $code.=<<___; 1013 mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks 1014 mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer 1015 cmp $num,$one 1016 mov `40*$i+8-40*4`($inp),$temp # output pointer 1017 cmovg $one,$num # find maximum 1018 test $one,$one 1019 vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV 1020 mov $one,`32+4*$i`(%rsp) # initialize counters 1021 cmovle %rsp,@ptr[$i] # cancel input 1022 sub @ptr[$i],$temp # distance between input and output 1023 mov $temp,`64+8*$i`(%rsp) # initialize distances 1024 vmovdqu @out[$i],`192+16*$i`(%rsp) # offload IV 1025___ 1026} 1027$code.=<<___; 1028 test $num,$num 1029 jz .Ldec8x_done 1030 1031 vmovups 0x10-0x78($key),$rndkey1 1032 vmovups 0x20-0x78($key),$rndkey0 1033 mov 0xf0-0x78($key),$rounds 1034 lea 192+128(%rsp),$offload # offload area 1035 1036 vmovdqu (@ptr[0]),@out[0] # load inputs 1037 vmovdqu (@ptr[1]),@out[1] 1038 vmovdqu (@ptr[2]),@out[2] 1039 vmovdqu (@ptr[3]),@out[3] 1040 vmovdqu (@ptr[4]),@out[4] 1041 vmovdqu (@ptr[5]),@out[5] 1042 vmovdqu (@ptr[6]),@out[6] 1043 vmovdqu (@ptr[7]),@out[7] 1044 vmovdqu @out[0],0x00($offload) # offload inputs 1045 vpxor $zero,@out[0],@out[0] # xor inputs with 0-round 1046 vmovdqu @out[1],0x10($offload) 1047 vpxor $zero,@out[1],@out[1] 1048 vmovdqu @out[2],0x20($offload) 1049 vpxor $zero,@out[2],@out[2] 1050 vmovdqu @out[3],0x30($offload) 1051 vpxor $zero,@out[3],@out[3] 1052 vmovdqu @out[4],0x40($offload) 1053 
vpxor $zero,@out[4],@out[4] 1054 vmovdqu @out[5],0x50($offload) 1055 vpxor $zero,@out[5],@out[5] 1056 vmovdqu @out[6],0x60($offload) 1057 vpxor $zero,@out[6],@out[6] 1058 vmovdqu @out[7],0x70($offload) 1059 vpxor $zero,@out[7],@out[7] 1060 xor \$0x80,$offload 1061 mov \$1,$one # constant of 1 1062 jmp .Loop_dec8x 1063 1064.align 32 1065.Loop_dec8x: 1066___ 1067for($i=0;$i<8;$i++) { 1068my $rndkey=($i&1)?$rndkey0:$rndkey1; 1069$code.=<<___; 1070 vaesdec $rndkey,@out[0],@out[0] 1071 cmp 32+4*$i(%rsp),$one 1072___ 1073$code.=<<___ if ($i); 1074 mov 64+8*$i(%rsp),$offset 1075___ 1076$code.=<<___; 1077 vaesdec $rndkey,@out[1],@out[1] 1078 prefetcht0 31(@ptr[$i]) # prefetch input 1079 vaesdec $rndkey,@out[2],@out[2] 1080___ 1081$code.=<<___ if ($i>1); 1082 prefetcht0 15(@ptr[$i-2]) # prefetch output 1083___ 1084$code.=<<___; 1085 vaesdec $rndkey,@out[3],@out[3] 1086 lea (@ptr[$i],$offset),$offset 1087 cmovge %rsp,@ptr[$i] # cancel input 1088 vaesdec $rndkey,@out[4],@out[4] 1089 cmovg %rsp,$offset # sink output 1090 vaesdec $rndkey,@out[5],@out[5] 1091 sub @ptr[$i],$offset 1092 vaesdec $rndkey,@out[6],@out[6] 1093 vmovdqu 16(@ptr[$i]),@inp[$i%4] # load input 1094 mov $offset,64+8*$i(%rsp) 1095 vaesdec $rndkey,@out[7],@out[7] 1096 vmovups `16*(3+$i)-0x78`($key),$rndkey 1097 lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output 1098___ 1099$code.=<<___ if ($i<4); 1100 vmovdqu @inp[$i%4],`128+16*$i`(%rsp) # off-load 1101___ 1102} 1103$code.=<<___; 1104 vmovdqu 32(%rsp),$counters 1105 prefetcht0 15(@ptr[$i-2]) # prefetch output 1106 prefetcht0 15(@ptr[$i-1]) 1107 cmp \$11,$rounds 1108 jb .Ldec8x_tail 1109 1110 vaesdec $rndkey1,@out[0],@out[0] 1111 vaesdec $rndkey1,@out[1],@out[1] 1112 vaesdec $rndkey1,@out[2],@out[2] 1113 vaesdec $rndkey1,@out[3],@out[3] 1114 vaesdec $rndkey1,@out[4],@out[4] 1115 vaesdec $rndkey1,@out[5],@out[5] 1116 vaesdec $rndkey1,@out[6],@out[6] 1117 vaesdec $rndkey1,@out[7],@out[7] 1118 vmovups 0xb0-0x78($key),$rndkey1 1119 1120 vaesdec 
$rndkey0,@out[0],@out[0] 1121 vaesdec $rndkey0,@out[1],@out[1] 1122 vaesdec $rndkey0,@out[2],@out[2] 1123 vaesdec $rndkey0,@out[3],@out[3] 1124 vaesdec $rndkey0,@out[4],@out[4] 1125 vaesdec $rndkey0,@out[5],@out[5] 1126 vaesdec $rndkey0,@out[6],@out[6] 1127 vaesdec $rndkey0,@out[7],@out[7] 1128 vmovups 0xc0-0x78($key),$rndkey0 1129 je .Ldec8x_tail 1130 1131 vaesdec $rndkey1,@out[0],@out[0] 1132 vaesdec $rndkey1,@out[1],@out[1] 1133 vaesdec $rndkey1,@out[2],@out[2] 1134 vaesdec $rndkey1,@out[3],@out[3] 1135 vaesdec $rndkey1,@out[4],@out[4] 1136 vaesdec $rndkey1,@out[5],@out[5] 1137 vaesdec $rndkey1,@out[6],@out[6] 1138 vaesdec $rndkey1,@out[7],@out[7] 1139 vmovups 0xd0-0x78($key),$rndkey1 1140 1141 vaesdec $rndkey0,@out[0],@out[0] 1142 vaesdec $rndkey0,@out[1],@out[1] 1143 vaesdec $rndkey0,@out[2],@out[2] 1144 vaesdec $rndkey0,@out[3],@out[3] 1145 vaesdec $rndkey0,@out[4],@out[4] 1146 vaesdec $rndkey0,@out[5],@out[5] 1147 vaesdec $rndkey0,@out[6],@out[6] 1148 vaesdec $rndkey0,@out[7],@out[7] 1149 vmovups 0xe0-0x78($key),$rndkey0 1150 1151.Ldec8x_tail: 1152 vaesdec $rndkey1,@out[0],@out[0] 1153 vpxor $zero,$zero,$zero 1154 vaesdec $rndkey1,@out[1],@out[1] 1155 vaesdec $rndkey1,@out[2],@out[2] 1156 vpcmpgtd $zero,$counters,$zero 1157 vaesdec $rndkey1,@out[3],@out[3] 1158 vaesdec $rndkey1,@out[4],@out[4] 1159 vpaddd $counters,$zero,$zero # decrement counters 1160 vmovdqu 48(%rsp),$counters 1161 vaesdec $rndkey1,@out[5],@out[5] 1162 mov 64(%rsp),$offset # pre-load 1st offset 1163 vaesdec $rndkey1,@out[6],@out[6] 1164 vaesdec $rndkey1,@out[7],@out[7] 1165 vmovups 0x10-0x78($key),$rndkey1 1166 1167 vaesdeclast $rndkey0,@out[0],@out[0] 1168 vmovdqa $zero,32(%rsp) # update counters 1169 vpxor $zero,$zero,$zero 1170 vaesdeclast $rndkey0,@out[1],@out[1] 1171 vpxor 0x00($offload),@out[0],@out[0] # xor with IV 1172 vaesdeclast $rndkey0,@out[2],@out[2] 1173 vpxor 0x10($offload),@out[1],@out[1] 1174 vpcmpgtd $zero,$counters,$zero 1175 vaesdeclast $rndkey0,@out[3],@out[3] 1176 
vpxor 0x20($offload),@out[2],@out[2] 1177 vaesdeclast $rndkey0,@out[4],@out[4] 1178 vpxor 0x30($offload),@out[3],@out[3] 1179 vpaddd $zero,$counters,$counters # decrement counters 1180 vmovdqu -0x78($key),$zero # 0-round 1181 vaesdeclast $rndkey0,@out[5],@out[5] 1182 vpxor 0x40($offload),@out[4],@out[4] 1183 vaesdeclast $rndkey0,@out[6],@out[6] 1184 vpxor 0x50($offload),@out[5],@out[5] 1185 vmovdqa $counters,48(%rsp) # update counters 1186 vaesdeclast $rndkey0,@out[7],@out[7] 1187 vpxor 0x60($offload),@out[6],@out[6] 1188 vmovups 0x20-0x78($key),$rndkey0 1189 1190 vmovups @out[0],-16(@ptr[0]) # write output 1191 sub $offset,@ptr[0] # switch to input 1192 vmovdqu 128+0(%rsp),@out[0] 1193 vpxor 0x70($offload),@out[7],@out[7] 1194 vmovups @out[1],-16(@ptr[1]) 1195 sub `64+1*8`(%rsp),@ptr[1] 1196 vmovdqu @out[0],0x00($offload) 1197 vpxor $zero,@out[0],@out[0] 1198 vmovdqu 128+16(%rsp),@out[1] 1199 vmovups @out[2],-16(@ptr[2]) 1200 sub `64+2*8`(%rsp),@ptr[2] 1201 vmovdqu @out[1],0x10($offload) 1202 vpxor $zero,@out[1],@out[1] 1203 vmovdqu 128+32(%rsp),@out[2] 1204 vmovups @out[3],-16(@ptr[3]) 1205 sub `64+3*8`(%rsp),@ptr[3] 1206 vmovdqu @out[2],0x20($offload) 1207 vpxor $zero,@out[2],@out[2] 1208 vmovdqu 128+48(%rsp),@out[3] 1209 vmovups @out[4],-16(@ptr[4]) 1210 sub `64+4*8`(%rsp),@ptr[4] 1211 vmovdqu @out[3],0x30($offload) 1212 vpxor $zero,@out[3],@out[3] 1213 vmovdqu @inp[0],0x40($offload) 1214 vpxor @inp[0],$zero,@out[4] 1215 vmovups @out[5],-16(@ptr[5]) 1216 sub `64+5*8`(%rsp),@ptr[5] 1217 vmovdqu @inp[1],0x50($offload) 1218 vpxor @inp[1],$zero,@out[5] 1219 vmovups @out[6],-16(@ptr[6]) 1220 sub `64+6*8`(%rsp),@ptr[6] 1221 vmovdqu @inp[2],0x60($offload) 1222 vpxor @inp[2],$zero,@out[6] 1223 vmovups @out[7],-16(@ptr[7]) 1224 sub `64+7*8`(%rsp),@ptr[7] 1225 vmovdqu @inp[3],0x70($offload) 1226 vpxor @inp[3],$zero,@out[7] 1227 1228 xor \$128,$offload 1229 dec $num 1230 jnz .Loop_dec8x 1231 1232 mov 16(%rsp),%rax # original %rsp 1233.cfi_def_cfa %rax,8 1234 #mov 
24(%rsp),$num
	#lea	`40*8`($inp),$inp
	#dec	$num
	#jnz	.Ldec8x_loop_grande

.Ldec8x_done:
	vzeroupper
___
# Win64 builds only: reload %xmm6-%xmm15 from the save area located at
# negative offsets from %rax (at this point %rax holds the original %rsp).
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
# Common epilogue: reload %r15, %r14, %r13, %r12, %rbp and %rbx from the
# six quadwords just below the original %rsp, point %rsp back at it and
# return.  .Ldec8x_epilogue is the label the Win64 unwind data refers to
# as the end of the function body.
$code.=<<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Ldec8x_epilogue:
	ret
.cfi_endproc
.size	aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
___
						}}}

# Everything from here to the matching closing brace is emitted for Win64
# targets only: the structured-exception handler plus its unwind tables.
if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# The four handler arguments arrive in %rcx, %rdx, %r8 and %r9; the
# handler body emitted below only dereferences $context and $disp.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# se_handler, as emitted below:
#  - context->Rip below HandlerData[0] (the "body" label): branch to
#    .Lin_prologue with %rax still holding context->Rax.
#  - context->Rip at or above HandlerData[1] (the "epilogue" label):
#    branch to .Lin_prologue with %rax holding context->Rsp.
#  - otherwise: fetch the saved stack pointer from 16(context->Rsp),
#    copy %rbx, %rbp and %r12-%r15 from just below it into the context
#    record, and copy 20 quadwords starting at -56-10*16 from it to
#    context offset 512 (the Xmm6 slot).
#  - .Lin_prologue: store Rsp/Rsi/Rdi into the context, copy 154
#    quadwords of context into disp->ContextRecord, call
#    RtlVirtualUnwind and return 1 (ExceptionContinueSearch).
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	16(%rax),%rax		# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_aesni_multi_cbc_encrypt
	.rva	.LSEH_end_aesni_multi_cbc_encrypt
	.rva	.LSEH_info_aesni_multi_cbc_encrypt
	.rva
.LSEH_begin_aesni_multi_cbc_decrypt
	.rva	.LSEH_end_aesni_multi_cbc_decrypt
	.rva	.LSEH_info_aesni_multi_cbc_decrypt
___
# .pdata entries (begin/end/info triplets) for the two AVX entry points;
# emitted only when $avx is set.
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_aesni_multi_cbc_encrypt_avx
	.rva	.LSEH_end_aesni_multi_cbc_encrypt_avx
	.rva	.LSEH_info_aesni_multi_cbc_encrypt_avx
	.rva	.LSEH_begin_aesni_multi_cbc_decrypt_avx
	.rva	.LSEH_end_aesni_multi_cbc_decrypt_avx
	.rva	.LSEH_info_aesni_multi_cbc_decrypt_avx
___
# .xdata records: each one names se_handler and supplies the two labels
# (body, epilogue) that the handler reads back as HandlerData[0..1].
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_aesni_multi_cbc_encrypt:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lenc4x_body,.Lenc4x_epilogue		# HandlerData[]
.LSEH_info_aesni_multi_cbc_decrypt:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Ldec4x_body,.Ldec4x_epilogue		# HandlerData[]
___
# Same records for the AVX entry points, again only when $avx is set.
$code.=<<___ if ($avx);
.LSEH_info_aesni_multi_cbc_encrypt_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lenc8x_body,.Lenc8x_epilogue		# HandlerData[]
.LSEH_info_aesni_multi_cbc_decrypt_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Ldec8x_body,.Ldec8x_epilogue		# HandlerData[]
___
}
####################################################################

# rex(\@opcode, $dst, $src)
#
# Appends a REX prefix byte to the array referenced by the first argument
# when either register number is 8 or above: bit 0x04 is set for $dst>=8,
# bit 0x01 for $src>=8, and the result is OR-ed with 0x40.  Nothing is
# appended when both register numbers are below 8.
sub rex {
  my ($opcode,$dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    push @$opcode,$rex|0x40	if($rex);
}

# aesni($line)
#
# Hand-assembles an AES-NI instruction into a ".byte" directive so that
# the output does not depend on assembler support for these mnemonics.
# Three operand shapes are recognised:
#
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
#   aes{enc,enclast,dec,declast} OFF(%rsp),%xmmD
#
# Returns the ".byte\t..." string (byte values joined by commas, printed
# in decimal) for a recognised instruction.  Any other input, including
# an aes* mnemonic that is missing from the opcode table, is returned
# unchanged so that the assembler gets to see it (and complain about it)
# instead of the instruction silently disappearing from the output.
sub aesni {
  my $line=shift;
  my @opcode=(0x66);

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	# Copy the captures right away so later code cannot clobber them.
	my ($imm,$src,$dst)=($2,$3,$4);
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	# A leading "0" means hex/octal notation, which oct() decodes.
	push @opcode,$imm=~/^0/?oct($imm):$imm;
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$src,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	# Unknown mnemonic: pass the text through untouched.
	return $line if (!defined($opcodelet{$mnemonic}));
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$off,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	# Unknown mnemonic: pass the text through untouched.
	return $line if (!defined($opcodelet{$mnemonic}));
	push @opcode,0x44 if ($dst>=8);
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0x44|(($dst&7)<<3),0x24;	# ModR/M
	# NOTE(review): only the low byte of the offset is emitted, so
	# offsets are presumably expected to stay within 0..0x7f - confirm.
	push @opcode,($off=~/^0/?oct($off):$off)&0xff;
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}

# Post-processing of the accumulated assembly text:
#  1. every `...` span is evaluated as a Perl expression and replaced by
#     its value;
#  2. every run that starts at a word boundary with "aes" and reaches a
#     %xmmN operand is handed to aesni(); whatever follows the last %xmmN
#     on that line is dropped by the trailing ".*$".
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

print $code;
# Buffered write errors only surface at close time, hence the check.
close STDOUT or die "error closing STDOUT: $!";