#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# Multi-buffer AES-NI procedures process several independent buffers
# in parallel by interleaving independent instructions.
#
# Cycles per byte for interleave factor 4:
#
#			asymptotic	measured
#			---------------------------
# Westmere		5.00/4=1.25	5.13/4=1.28
# Atom			15.0/4=3.75	?15.7/4=3.93
# Sandy Bridge		5.06/4=1.27	5.18/4=1.29
# Ivy Bridge		5.06/4=1.27	5.14/4=1.29
# Haswell		4.44/4=1.11	4.44/4=1.11
# Bulldozer		5.75/4=1.44	5.76/4=1.44
#
# Cycles per byte for interleave factor 8 (not implemented for
# pre-AVX processors, where higher interleave factor incidentally
# doesn't result in improvement):
#
#			asymptotic	measured
#			---------------------------
# Sandy Bridge		5.06/8=0.64	7.10/8=0.89(*)
# Ivy Bridge		5.06/8=0.64	7.14/8=0.89(*)
# Haswell		5.00/8=0.63	5.00/8=0.63
# Bulldozer		5.75/8=0.72	5.77/8=0.72
#
# (*)	Sandy/Ivy Bridge are known to handle high interleave factors
#	suboptimally.

# Command line: [flavour] [output]. If the first argument contains a
# dot it is taken to be the output file name and no flavour is set.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 code paths (XMM save/restore) are emitted for nasm/masm/mingw64
# flavours or when the output file name ends in .asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator next to this script or in
# ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe the assembler toolchain. Each probe leaves $avx at 0, 1 or 2
# depending on the version it reports; the code visible here only tests
# $avx for truth to decide whether to emit the dispatch to the AVX
# shortcuts.
$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# Everything printed to STDOUT is piped through the translator. The
# translator path is quoted so a directory name with spaces survives the
# shell, and a pipe that cannot be started is fatal instead of silently
# producing no output.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

# void aesni_multi_cbc_encrypt (
#     struct {	void *inp,*out; int blocks; double iv[2]; } inp[8];
#     const AES_KEY *key,
#     int num);		/* 1 or 2 */
#
$inp="%rdi";	# 1st arg
$key="%rsi";	# 2nd arg
$num="%edx";

# Register allocation for the 4x (non-AVX) code path: one input and one
# output pointer per lane, one state and one input register per lane.
@inptr=map("%r$_",(8..11));
@outptr=map("%r$_",(12..15));

($rndkey0,$rndkey1)=("%xmm0","%xmm1");
@out=map("%xmm$_",(2..5));
@inp=map("%xmm$_",(6..9));
($counters,$mask,$zero)=map("%xmm$_",(10..12));

($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx");

# ---- aesni_multi_cbc_encrypt: entry point ----
$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	aesni_multi_cbc_encrypt
.type	aesni_multi_cbc_encrypt,\@function,3
.align	32
aesni_multi_cbc_encrypt:
___
# When the assembler can handle AVX, dispatch at run time: num>=2 and the
# AVX capability bit set jump to the 8x implementation.
$code.=<<___ if ($avx);
	cmp	\$2,$num
	jb	.Lenc_non_avx
	mov	OPENSSL_ia32cap_P+4(%rip),%ecx
	test	\$`1<<28`,%ecx			# AVX bit
	jnz	_avx_cbc_enc_shortcut
	jmp	.Lenc_non_avx
.align	16
.Lenc_non_avx:
___
# Prologue: remember the entry %rsp in %rax and save callee-saved GPRs.
$code.=<<___;
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
# Win64 only: save %xmm6-%xmm15 below the pushed GPRs.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,0x60(%rsp)
	movaps	%xmm13,-0x68(%rax)	# not used, saved to share se_handler
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
# Allocate and align the scratch frame, then start the outer loop over
# groups of four descriptors.
$code.=<<___;
	# stack layout
	#
	# +0	output sink
	# +16	input sink [original %rsp and $num]
	# +32	counters

	sub	\$48,%rsp
	and	\$-64,%rsp
	mov	%rax,16(%rsp)			# original %rsp

.Lenc4x_body:
	movdqu	($key),$zero			# 0-round key
	lea	0x78($key),$key			# size optimization
	lea	40*2($inp),$inp

.Lenc4x_loop_grande:
	mov	$num,24(%rsp)			# original $num
	xor	$num,$num
___
# Per-lane setup. Descriptors are 40 bytes apart; for each lane load the
# block count, the input/output pointers and the IV, keep the largest
# block count in $num, and point the input of lanes with a non-positive
# count at the stack.
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`40*$i+16-40*2`($inp),$one	# borrow $one for number of blocks
	mov	`40*$i+0-40*2`($inp),@inptr[$i]
	cmp	$num,$one
	mov	`40*$i+8-40*2`($inp),@outptr[$i]
	cmovg	$one,$num			# find maximum
	test	$one,$one
	movdqu	`40*$i+24-40*2`($inp),@out[$i]	# load IV
	mov	$one,`32+4*$i`(%rsp)		# initialize counters
	cmovle	%rsp,@inptr[$i]			# cancel input
___
}
# Load the first block of every lane, fold in IV and 0-round key, and
# enter the main loop. The input prefetches cover all four lanes (the
# fourth one used to repeat lane 2, leaving lane 3 unprefetched).
$code.=<<___;
	test	$num,$num
	jz	.Lenc4x_done

	movups	0x10-0x78($key),$rndkey1
	pxor	$zero,@out[0]
	movups	0x20-0x78($key),$rndkey0
	pxor	$zero,@out[1]
	mov	0xf0-0x78($key),$rounds
	pxor	$zero,@out[2]
	movdqu	(@inptr[0]),@inp[0]		# load inputs
	pxor	$zero,@out[3]
	movdqu	(@inptr[1]),@inp[1]
	pxor	@inp[0],@out[0]
	movdqu	(@inptr[2]),@inp[2]
	pxor	@inp[1],@out[1]
	movdqu	(@inptr[3]),@inp[3]
	pxor	@inp[2],@out[2]
	pxor	@inp[3],@out[3]
	movdqa	32(%rsp),$counters		# load counters
	xor	$offset,$offset
	jmp	.Loop_enc4x

.align	32
.Loop_enc4x:
	add	\$16,$offset
	lea	16(%rsp),$sink			# sink pointer
	mov	\$1,$one			# constant of 1
	sub	$offset,$sink

	aesenc	$rndkey1,@out[0]
	prefetcht0	31(@inptr[0],$offset)	# prefetch input
	prefetcht0	31(@inptr[1],$offset)
	aesenc	$rndkey1,@out[1]
	prefetcht0	31(@inptr[2],$offset)
	prefetcht0	31(@inptr[3],$offset)
	aesenc	$rndkey1,@out[2]
	aesenc	$rndkey1,@out[3]
	movups	0x30-0x78($key),$rndkey1
___
# Four more rounds, alternating between the two round-key registers.
# Round $i also compares lane $i's counter with 1 and redirects that
# lane's pointers to the sink once the lane has run out of blocks.
for($i=0;$i<4;$i++) {
my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
$code.=<<___;
	cmp	`32+4*$i`(%rsp),$one
	aesenc	$rndkey,@out[0]
	aesenc	$rndkey,@out[1]
	aesenc	$rndkey,@out[2]
	cmovge	$sink,@inptr[$i]		# cancel input
	cmovg	$sink,@outptr[$i]		# sink output
	aesenc	$rndkey,@out[3]
	movups	`0x40+16*$i-0x78`($key),$rndkey
___
}
# Remaining rounds (the number depends on $rounds), counter update,
# final round, output store and pre-xor of the next input block.
$code.=<<___;
	movdqa	$counters,$mask
	aesenc	$rndkey0,@out[0]
	prefetcht0	15(@outptr[0],$offset)	# prefetch output
	prefetcht0	15(@outptr[1],$offset)
	aesenc	$rndkey0,@out[1]
	prefetcht0	15(@outptr[2],$offset)
	prefetcht0	15(@outptr[3],$offset)
	aesenc	$rndkey0,@out[2]
	aesenc	$rndkey0,@out[3]
	movups	0x80-0x78($key),$rndkey0
	pxor	$zero,$zero

	aesenc	$rndkey1,@out[0]
	pcmpgtd	$zero,$mask
	movdqu	-0x78($key),$zero		# reload 0-round key
	aesenc	$rndkey1,@out[1]
	paddd	$mask,$counters			# decrement counters
	movdqa	$counters,32(%rsp)		# update counters
	aesenc	$rndkey1,@out[2]
	aesenc	$rndkey1,@out[3]
	movups	0x90-0x78($key),$rndkey1

	cmp	\$11,$rounds

	aesenc	$rndkey0,@out[0]
	aesenc	$rndkey0,@out[1]
	aesenc	$rndkey0,@out[2]
	aesenc	$rndkey0,@out[3]
	movups	0xa0-0x78($key),$rndkey0

	jb	.Lenc4x_tail

	aesenc	$rndkey1,@out[0]
	aesenc	$rndkey1,@out[1]
	aesenc	$rndkey1,@out[2]
	aesenc	$rndkey1,@out[3]
	movups	0xb0-0x78($key),$rndkey1

	aesenc	$rndkey0,@out[0]
	aesenc	$rndkey0,@out[1]
	aesenc	$rndkey0,@out[2]
	aesenc	$rndkey0,@out[3]
	movups	0xc0-0x78($key),$rndkey0

	je	.Lenc4x_tail

	aesenc	$rndkey1,@out[0]
	aesenc	$rndkey1,@out[1]
	aesenc	$rndkey1,@out[2]
	aesenc	$rndkey1,@out[3]
	movups	0xd0-0x78($key),$rndkey1

	aesenc	$rndkey0,@out[0]
	aesenc	$rndkey0,@out[1]
	aesenc	$rndkey0,@out[2]
	aesenc	$rndkey0,@out[3]
	movups	0xe0-0x78($key),$rndkey0
	jmp	.Lenc4x_tail

.align	32
.Lenc4x_tail:
	aesenc	$rndkey1,@out[0]
	aesenc	$rndkey1,@out[1]
	aesenc	$rndkey1,@out[2]
	aesenc	$rndkey1,@out[3]
	movdqu	(@inptr[0],$offset),@inp[0]
	movdqu	0x10-0x78($key),$rndkey1

	aesenclast	$rndkey0,@out[0]
	movdqu	(@inptr[1],$offset),@inp[1]
	pxor	$zero,@inp[0]
	aesenclast	$rndkey0,@out[1]
	movdqu	(@inptr[2],$offset),@inp[2]
	pxor	$zero,@inp[1]
	aesenclast	$rndkey0,@out[2]
	movdqu	(@inptr[3],$offset),@inp[3]
	pxor	$zero,@inp[2]
	aesenclast	$rndkey0,@out[3]
	movdqu	0x20-0x78($key),$rndkey0
	pxor	$zero,@inp[3]

	movups	@out[0],-16(@outptr[0],$offset)
	pxor	@inp[0],@out[0]
	movups	@out[1],-16(@outptr[1],$offset)
	pxor	@inp[1],@out[1]
	movups	@out[2],-16(@outptr[2],$offset)
	pxor	@inp[2],@out[2]
	movups	@out[3],-16(@outptr[3],$offset)
	pxor	@inp[3],@out[3]

	dec	$num
	jnz	.Loop_enc4x

	mov	16(%rsp),%rax			# original %rsp
	mov	24(%rsp),$num

	#pxor	@inp[0],@out[0]
	#pxor	@inp[1],@out[1]
	#movdqu	@out[0],`40*0+24-40*2`($inp)	# output iv FIX ME!
	#pxor	@inp[2],@out[2]
	#movdqu	@out[1],`40*1+24-40*2`($inp)
	#pxor	@inp[3],@out[3]
	#movdqu	@out[2],`40*2+24-40*2`($inp)	# won't fix, let caller
	#movdqu	@out[3],`40*3+24-40*2`($inp)	# figure this out...

	lea	`40*4`($inp),$inp
	dec	$num
	jnz	.Lenc4x_loop_grande

.Lenc4x_done:
___
# Win64 only: restore the XMM registers this path actually uses.
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	#movaps	-0x68(%rax),%xmm13
	#movaps	-0x58(%rax),%xmm14
	#movaps	-0x48(%rax),%xmm15
___
# Encrypt epilogue, followed by the entry point of the decrypt routine.
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lenc4x_epilogue:
	ret
.size	aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt

.globl	aesni_multi_cbc_decrypt
.type	aesni_multi_cbc_decrypt,\@function,3
.align	32
aesni_multi_cbc_decrypt:
___
# Same run-time AVX dispatch as on the encrypt side.
$code.=<<___ if ($avx);
	cmp	\$2,$num
	jb	.Ldec_non_avx
	mov	OPENSSL_ia32cap_P+4(%rip),%ecx
	test	\$`1<<28`,%ecx			# AVX bit
	jnz	_avx_cbc_dec_shortcut
	jmp	.Ldec_non_avx
.align	16
.Ldec_non_avx:
___
# Decrypt prologue: identical register save sequence to the encrypt one.
$code.=<<___;
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,0x60(%rsp)
	movaps	%xmm13,-0x68(%rax)	# not used, saved to share se_handler
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
$code.=<<___;
	# stack layout
	#
	# +0	output sink
	# +16	input sink [original %rsp and $num]
	# +32	counters

	sub	\$48,%rsp
	and	\$-64,%rsp
	mov	%rax,16(%rsp)			# original %rsp

.Ldec4x_body:
	movdqu	($key),$zero			# 0-round key
	lea	0x78($key),$key			# size optimization
	lea	40*2($inp),$inp

.Ldec4x_loop_grande:
	mov	$num,24(%rsp)			# original $num
	xor	$num,$num
___
# Per-lane setup for the 4x decrypt path. Descriptors are 40 bytes
# apart; for each lane load the block count, the input/output pointers
# and the IV (kept in @inp[] here), keep the largest block count in $num,
# and point the input of lanes with a non-positive count at the stack.
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`40*$i+16-40*2`($inp),$one	# borrow $one for number of blocks
	mov	`40*$i+0-40*2`($inp),@inptr[$i]
	cmp	$num,$one
	mov	`40*$i+8-40*2`($inp),@outptr[$i]
	cmovg	$one,$num			# find maximum
	test	$one,$one
	movdqu	`40*$i+24-40*2`($inp),@inp[$i]	# load IV
	mov	$one,`32+4*$i`(%rsp)		# initialize counters
	cmovle	%rsp,@inptr[$i]			# cancel input
___
}
# Load the first ciphertext block of every lane, xor it with the 0-round
# key and enter the main loop.
$code.=<<___;
	test	$num,$num
	jz	.Ldec4x_done

	movups	0x10-0x78($key),$rndkey1
	movups	0x20-0x78($key),$rndkey0
	mov	0xf0-0x78($key),$rounds
	movdqu	(@inptr[0]),@out[0]		# load inputs
	movdqu	(@inptr[1]),@out[1]
	pxor	$zero,@out[0]
	movdqu	(@inptr[2]),@out[2]
	pxor	$zero,@out[1]
	movdqu	(@inptr[3]),@out[3]
	pxor	$zero,@out[2]
	pxor	$zero,@out[3]
	movdqa	32(%rsp),$counters		# load counters
	xor	$offset,$offset
	jmp	.Loop_dec4x

.align	32
.Loop_dec4x:
	add	\$16,$offset
	lea	16(%rsp),$sink			# sink pointer
	mov	\$1,$one			# constant of 1
	sub	$offset,$sink

	aesdec	$rndkey1,@out[0]
	prefetcht0	31(@inptr[0],$offset)	# prefetch input
	prefetcht0	31(@inptr[1],$offset)
	aesdec	$rndkey1,@out[1]
	prefetcht0	31(@inptr[2],$offset)
	prefetcht0	31(@inptr[3],$offset)
	aesdec	$rndkey1,@out[2]
	aesdec	$rndkey1,@out[3]
	movups	0x30-0x78($key),$rndkey1
___
# Four more rounds, alternating between the two round-key registers.
# Round $i also compares lane $i's counter with 1 and redirects that
# lane's pointers to the sink once the lane has run out of blocks.
for($i=0;$i<4;$i++) {
my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
$code.=<<___;
	cmp	`32+4*$i`(%rsp),$one
	aesdec	$rndkey,@out[0]
	aesdec	$rndkey,@out[1]
	aesdec	$rndkey,@out[2]
	cmovge	$sink,@inptr[$i]		# cancel input
	cmovg	$sink,@outptr[$i]		# sink output
	aesdec	$rndkey,@out[3]
	movups	`0x40+16*$i-0x78`($key),$rndkey
___
}
# Remaining rounds (the number depends on $rounds), counter update, final
# round with the IV folded into the last round key, output store, and
# load of the next ciphertext block, which also becomes the next IV.
$code.=<<___;
	movdqa	$counters,$mask
	aesdec	$rndkey0,@out[0]
	prefetcht0	15(@outptr[0],$offset)	# prefetch output
	prefetcht0	15(@outptr[1],$offset)
	aesdec	$rndkey0,@out[1]
	prefetcht0	15(@outptr[2],$offset)
	prefetcht0	15(@outptr[3],$offset)
	aesdec	$rndkey0,@out[2]
	aesdec	$rndkey0,@out[3]
	movups	0x80-0x78($key),$rndkey0
	pxor	$zero,$zero

	aesdec	$rndkey1,@out[0]
	pcmpgtd	$zero,$mask
	movdqu	-0x78($key),$zero		# reload 0-round key
	aesdec	$rndkey1,@out[1]
	paddd	$mask,$counters			# decrement counters
	movdqa	$counters,32(%rsp)		# update counters
	aesdec	$rndkey1,@out[2]
	aesdec	$rndkey1,@out[3]
	movups	0x90-0x78($key),$rndkey1

	cmp	\$11,$rounds

	aesdec	$rndkey0,@out[0]
	aesdec	$rndkey0,@out[1]
	aesdec	$rndkey0,@out[2]
	aesdec	$rndkey0,@out[3]
	movups	0xa0-0x78($key),$rndkey0

	jb	.Ldec4x_tail

	aesdec	$rndkey1,@out[0]
	aesdec	$rndkey1,@out[1]
	aesdec	$rndkey1,@out[2]
	aesdec	$rndkey1,@out[3]
	movups	0xb0-0x78($key),$rndkey1

	aesdec	$rndkey0,@out[0]
	aesdec	$rndkey0,@out[1]
	aesdec	$rndkey0,@out[2]
	aesdec	$rndkey0,@out[3]
	movups	0xc0-0x78($key),$rndkey0

	je	.Ldec4x_tail

	aesdec	$rndkey1,@out[0]
	aesdec	$rndkey1,@out[1]
	aesdec	$rndkey1,@out[2]
	aesdec	$rndkey1,@out[3]
	movups	0xd0-0x78($key),$rndkey1

	aesdec	$rndkey0,@out[0]
	aesdec	$rndkey0,@out[1]
	aesdec	$rndkey0,@out[2]
	aesdec	$rndkey0,@out[3]
	movups	0xe0-0x78($key),$rndkey0
	jmp	.Ldec4x_tail

.align	32
.Ldec4x_tail:
	aesdec	$rndkey1,@out[0]
	aesdec	$rndkey1,@out[1]
	aesdec	$rndkey1,@out[2]
	pxor	$rndkey0,@inp[0]
	pxor	$rndkey0,@inp[1]
	aesdec	$rndkey1,@out[3]
	movdqu	0x10-0x78($key),$rndkey1
	pxor	$rndkey0,@inp[2]
	pxor	$rndkey0,@inp[3]
	movdqu	0x20-0x78($key),$rndkey0

	aesdeclast	@inp[0],@out[0]
	aesdeclast	@inp[1],@out[1]
	movdqu	-16(@inptr[0],$offset),@inp[0]	# load next IV
	movdqu	-16(@inptr[1],$offset),@inp[1]
	aesdeclast	@inp[2],@out[2]
	aesdeclast	@inp[3],@out[3]
	movdqu	-16(@inptr[2],$offset),@inp[2]
	movdqu	-16(@inptr[3],$offset),@inp[3]

	movups	@out[0],-16(@outptr[0],$offset)
	movdqu	(@inptr[0],$offset),@out[0]
	movups	@out[1],-16(@outptr[1],$offset)
	movdqu	(@inptr[1],$offset),@out[1]
	pxor	$zero,@out[0]
	movups	@out[2],-16(@outptr[2],$offset)
	movdqu	(@inptr[2],$offset),@out[2]
	pxor	$zero,@out[1]
	movups	@out[3],-16(@outptr[3],$offset)
	movdqu	(@inptr[3],$offset),@out[3]
	pxor	$zero,@out[2]
	pxor	$zero,@out[3]

	dec	$num
	jnz	.Loop_dec4x

	mov	16(%rsp),%rax			# original %rsp
	mov	24(%rsp),$num

	lea	`40*4`($inp),$inp
	dec	$num
	jnz	.Ldec4x_loop_grande

.Ldec4x_done:
___
# Win64 only: restore the XMM registers the 4x path actually uses.
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	#movaps	-0x68(%rax),%xmm13
	#movaps	-0x58(%rax),%xmm14
	#movaps	-0x48(%rax),%xmm15
___
# 4x decrypt epilogue: restore callee-saved GPRs relative to the entry
# %rsp held in %rax and return.
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Ldec4x_epilogue:
	ret
.size	aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
___

# ---- 8x AVX code path, emitted only when the assembler supports it ----
# The lexicals below shadow the 4x register assignment inside this scope:
# eight pointers (one per lane, switched between input and output inside
# the loop), eight state registers and four input registers.
if ($avx) {{{
my @ptr=map("%r$_",(8..15));
my $offload=$sink;

my @out=map("%xmm$_",(2..9));
my @inp=map("%xmm$_",(10..13));
my ($counters,$zero)=("%xmm14","%xmm15");

# Entry point of the 8x encrypt routine; _avx_cbc_enc_shortcut is the
# label the run-time dispatch in aesni_multi_cbc_encrypt jumps to.
$code.=<<___;
.type	aesni_multi_cbc_encrypt_avx,\@function,3
.align	32
aesni_multi_cbc_encrypt_avx:
_avx_cbc_enc_shortcut:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
# Win64 only: save %xmm6-%xmm15 (all of them are used by the 8x path).
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
# Allocate the scratch frame. $num is halved; the outer "grande" loop is
# commented out in the emitted code, so one pass over eight descriptors
# is made.
$code.=<<___;
	# stack layout
	#
	# +0	output sink
	# +16	input sink [original %rsp and $num]
	# +32	counters
	# +64	distances between inputs and outputs
	# +128	off-load area for @inp[0..3]

	sub	\$192,%rsp
	and	\$-128,%rsp
	mov	%rax,16(%rsp)			# original %rsp

.Lenc8x_body:
	vzeroupper
	vmovdqu	($key),$zero			# 0-round key
	lea	0x78($key),$key			# size optimization
	lea	40*4($inp),$inp
	shr	\$1,$num

.Lenc8x_loop_grande:
	#mov	$num,24(%rsp)			# original $num
	xor	$num,$num
___
# Per-lane setup for eight lanes. Instead of a separate output pointer
# the distance (output - input) is stored at 64+8*i(%rsp). Lane 0 uses
# $offset as the temporary, all other lanes use $offload.
for($i=0;$i<8;$i++) {
    my $temp = $i ? $offload : $offset;
    $code.=<<___;
	mov	`40*$i+16-40*4`($inp),$one	# borrow $one for number of blocks
	mov	`40*$i+0-40*4`($inp),@ptr[$i]	# input pointer
	cmp	$num,$one
	mov	`40*$i+8-40*4`($inp),$temp	# output pointer
	cmovg	$one,$num			# find maximum
	test	$one,$one
	vmovdqu	`40*$i+24-40*4`($inp),@out[$i]	# load IV
	mov	$one,`32+4*$i`(%rsp)		# initialize counters
	cmovle	%rsp,@ptr[$i]			# cancel input
	sub	@ptr[$i],$temp			# distance between input and output
	mov	$temp,`64+8*$i`(%rsp)		# initialize distances
___
}
# Load the first block of all eight lanes (four at a time through
# @inp[0..3]), fold in the 0-round key and the IVs, and enter the loop.
$code.=<<___;
	test	$num,$num
	jz	.Lenc8x_done

	vmovups	0x10-0x78($key),$rndkey1
	vmovups	0x20-0x78($key),$rndkey0
	mov	0xf0-0x78($key),$rounds

	vpxor	(@ptr[0]),$zero,@inp[0]		# load inputs and xor with 0-round
	lea	128(%rsp),$offload		# offload area
	vpxor	(@ptr[1]),$zero,@inp[1]
	vpxor	(@ptr[2]),$zero,@inp[2]
	vpxor	(@ptr[3]),$zero,@inp[3]
	vpxor	@inp[0],@out[0],@out[0]
	vpxor	(@ptr[4]),$zero,@inp[0]
	vpxor	@inp[1],@out[1],@out[1]
	vpxor	(@ptr[5]),$zero,@inp[1]
	vpxor	@inp[2],@out[2],@out[2]
	vpxor	(@ptr[6]),$zero,@inp[2]
	vpxor	@inp[3],@out[3],@out[3]
	vpxor	(@ptr[7]),$zero,@inp[3]
	vpxor	@inp[0],@out[4],@out[4]
	mov	\$1,$one			# constant of 1
	vpxor	@inp[1],@out[5],@out[5]
	vpxor	@inp[2],@out[6],@out[6]
	vpxor	@inp[3],@out[7],@out[7]
	jmp	.Loop_enc8x

.align	32
.Loop_enc8x:
___
# Eight rounds, one per lane index. Round $i additionally services lane
# $i: checks its counter, cancels/sinks exhausted lanes, loads that
# lane's next input block and switches @ptr[$i] from input to output.
# Inputs of lanes 0..3 are parked in the off-load area because
# @inp[$i%4] is reused for lanes 4..7.
for($i=0;$i<8;$i++) {
my $rndkey=($i&1)?$rndkey0:$rndkey1;
$code.=<<___;
	vaesenc	$rndkey,@out[0],@out[0]
	cmp	32+4*$i(%rsp),$one
___
$code.=<<___ if ($i);
	mov	64+8*$i(%rsp),$offset
___
$code.=<<___;
	vaesenc	$rndkey,@out[1],@out[1]
	prefetcht0	31(@ptr[$i])		# prefetch input
	vaesenc	$rndkey,@out[2],@out[2]
___
$code.=<<___ if ($i>1);
	prefetcht0	15(@ptr[$i-2])		# prefetch output
___
$code.=<<___;
	vaesenc	$rndkey,@out[3],@out[3]
	lea	(@ptr[$i],$offset),$offset
	cmovge	%rsp,@ptr[$i]			# cancel input
	vaesenc	$rndkey,@out[4],@out[4]
	cmovg	%rsp,$offset			# sink output
	vaesenc	$rndkey,@out[5],@out[5]
	sub	@ptr[$i],$offset
	vaesenc	$rndkey,@out[6],@out[6]
	vpxor	16(@ptr[$i]),$zero,@inp[$i%4]	# load input and xor with 0-round
	mov	$offset,64+8*$i(%rsp)
	vaesenc	$rndkey,@out[7],@out[7]
	vmovups	`16*(3+$i)-0x78`($key),$rndkey
	lea	16(@ptr[$i],$offset),@ptr[$i]	# switch to output
___
$code.=<<___ if ($i<4)
	vmovdqu	@inp[$i%4],`16*$i`($offload)	# off-load
___
}
# $i is 8 here (the loop's exit value), so the two prefetches below
# address @ptr[6] and @ptr[7]. Then: remaining rounds depending on
# $rounds, counter update in two 16-byte halves (32 and 48(%rsp)), final
# round, output store, pointer switch back to input, and xor of the
# result with the next input block.
$code.=<<___;
	vmovdqu	32(%rsp),$counters
	prefetcht0	15(@ptr[$i-2])		# prefetch output
	prefetcht0	15(@ptr[$i-1])
	cmp	\$11,$rounds
	jb	.Lenc8x_tail

	vaesenc	$rndkey1,@out[0],@out[0]
	vaesenc	$rndkey1,@out[1],@out[1]
	vaesenc	$rndkey1,@out[2],@out[2]
	vaesenc	$rndkey1,@out[3],@out[3]
	vaesenc	$rndkey1,@out[4],@out[4]
	vaesenc	$rndkey1,@out[5],@out[5]
	vaesenc	$rndkey1,@out[6],@out[6]
	vaesenc	$rndkey1,@out[7],@out[7]
	vmovups	0xb0-0x78($key),$rndkey1

	vaesenc	$rndkey0,@out[0],@out[0]
	vaesenc	$rndkey0,@out[1],@out[1]
	vaesenc	$rndkey0,@out[2],@out[2]
	vaesenc	$rndkey0,@out[3],@out[3]
	vaesenc	$rndkey0,@out[4],@out[4]
	vaesenc	$rndkey0,@out[5],@out[5]
	vaesenc	$rndkey0,@out[6],@out[6]
	vaesenc	$rndkey0,@out[7],@out[7]
	vmovups	0xc0-0x78($key),$rndkey0
	je	.Lenc8x_tail

	vaesenc	$rndkey1,@out[0],@out[0]
	vaesenc	$rndkey1,@out[1],@out[1]
	vaesenc	$rndkey1,@out[2],@out[2]
	vaesenc	$rndkey1,@out[3],@out[3]
	vaesenc	$rndkey1,@out[4],@out[4]
	vaesenc	$rndkey1,@out[5],@out[5]
	vaesenc	$rndkey1,@out[6],@out[6]
	vaesenc	$rndkey1,@out[7],@out[7]
	vmovups	0xd0-0x78($key),$rndkey1

	vaesenc	$rndkey0,@out[0],@out[0]
	vaesenc	$rndkey0,@out[1],@out[1]
	vaesenc	$rndkey0,@out[2],@out[2]
	vaesenc	$rndkey0,@out[3],@out[3]
	vaesenc	$rndkey0,@out[4],@out[4]
	vaesenc	$rndkey0,@out[5],@out[5]
	vaesenc	$rndkey0,@out[6],@out[6]
	vaesenc	$rndkey0,@out[7],@out[7]
	vmovups	0xe0-0x78($key),$rndkey0

.Lenc8x_tail:
	vaesenc	$rndkey1,@out[0],@out[0]
	vpxor	$zero,$zero,$zero
	vaesenc	$rndkey1,@out[1],@out[1]
	vaesenc	$rndkey1,@out[2],@out[2]
	vpcmpgtd	$zero,$counters,$zero
	vaesenc	$rndkey1,@out[3],@out[3]
	vaesenc	$rndkey1,@out[4],@out[4]
	vpaddd	$counters,$zero,$zero		# decrement counters
	vmovdqu	48(%rsp),$counters
	vaesenc	$rndkey1,@out[5],@out[5]
	mov	64(%rsp),$offset		# pre-load 1st offset
	vaesenc	$rndkey1,@out[6],@out[6]
	vaesenc	$rndkey1,@out[7],@out[7]
	vmovups	0x10-0x78($key),$rndkey1

	vaesenclast	$rndkey0,@out[0],@out[0]
	vmovdqa	$zero,32(%rsp)			# update counters
	vpxor	$zero,$zero,$zero
	vaesenclast	$rndkey0,@out[1],@out[1]
	vaesenclast	$rndkey0,@out[2],@out[2]
	vpcmpgtd	$zero,$counters,$zero
	vaesenclast	$rndkey0,@out[3],@out[3]
	vaesenclast	$rndkey0,@out[4],@out[4]
	vpaddd	$zero,$counters,$counters	# decrement counters
	vmovdqu	-0x78($key),$zero		# 0-round
	vaesenclast	$rndkey0,@out[5],@out[5]
	vaesenclast	$rndkey0,@out[6],@out[6]
	vmovdqa	$counters,48(%rsp)		# update counters
	vaesenclast	$rndkey0,@out[7],@out[7]
	vmovups	0x20-0x78($key),$rndkey0

	vmovups	@out[0],-16(@ptr[0])		# write output
	sub	$offset,@ptr[0]			# switch to input
	vpxor	0x00($offload),@out[0],@out[0]
	vmovups	@out[1],-16(@ptr[1])
	sub	`64+1*8`(%rsp),@ptr[1]
	vpxor	0x10($offload),@out[1],@out[1]
	vmovups	@out[2],-16(@ptr[2])
	sub	`64+2*8`(%rsp),@ptr[2]
	vpxor	0x20($offload),@out[2],@out[2]
	vmovups	@out[3],-16(@ptr[3])
	sub	`64+3*8`(%rsp),@ptr[3]
	vpxor	0x30($offload),@out[3],@out[3]
	vmovups	@out[4],-16(@ptr[4])
	sub	`64+4*8`(%rsp),@ptr[4]
	vpxor	@inp[0],@out[4],@out[4]
	vmovups	@out[5],-16(@ptr[5])
	sub	`64+5*8`(%rsp),@ptr[5]
	vpxor	@inp[1],@out[5],@out[5]
	vmovups	@out[6],-16(@ptr[6])
	sub	`64+6*8`(%rsp),@ptr[6]
	vpxor	@inp[2],@out[6],@out[6]
	vmovups	@out[7],-16(@ptr[7])
	sub	`64+7*8`(%rsp),@ptr[7]
	vpxor	@inp[3],@out[7],@out[7]

	dec	$num
	jnz	.Loop_enc8x

	mov	16(%rsp),%rax			# original %rsp
	#mov	24(%rsp),$num
	#lea	`40*8`($inp),$inp
	#dec	$num
	#jnz	.Lenc8x_loop_grande

.Lenc8x_done:
	vzeroupper
___
# Win64 only: restore %xmm6-%xmm15.
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
# 8x encrypt epilogue, followed by the entry point of the 8x decrypt
# routine (_avx_cbc_dec_shortcut is the dispatch target).
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lenc8x_epilogue:
	ret
.size	aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx

.type	aesni_multi_cbc_decrypt_avx,\@function,3
.align	32
aesni_multi_cbc_decrypt_avx:
_avx_cbc_dec_shortcut:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
# Win64 only: save %xmm6-%xmm15.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
# Allocate the decrypt scratch frame. %rsp is aligned down to 256 and
# then lowered by 192, so 192(%rsp) and 192+128(%rsp) differ only in bit
# 7; this is what lets the loop flip $offload between them with a single
# xor of 0x80.
$code.=<<___;
	# stack layout
	#
	# +0	output sink
	# +16	input sink [original %rsp and $num]
	# +32	counters
	# +64	distances between inputs and outputs
	# +128	off-load area for @inp[0..3]
	# +192	IV/input offload

	sub	\$256,%rsp
	and	\$-256,%rsp
	sub	\$192,%rsp
	mov	%rax,16(%rsp)			# original %rsp

.Ldec8x_body:
	vzeroupper
	vmovdqu	($key),$zero			# 0-round key
	lea	0x78($key),$key			# size optimization
	lea	40*4($inp),$inp
	shr	\$1,$num

.Ldec8x_loop_grande:
	#mov	$num,24(%rsp)			# original $num
	xor	$num,$num
___
# Per-lane setup for eight lanes, as on the encrypt side, plus a copy of
# each lane's IV into the IV/input offload area at 192+16*i(%rsp).
for($i=0;$i<8;$i++) {
    my $temp = $i ? $offload : $offset;
    $code.=<<___;
	mov	`40*$i+16-40*4`($inp),$one	# borrow $one for number of blocks
	mov	`40*$i+0-40*4`($inp),@ptr[$i]	# input pointer
	cmp	$num,$one
	mov	`40*$i+8-40*4`($inp),$temp	# output pointer
	cmovg	$one,$num			# find maximum
	test	$one,$one
	vmovdqu	`40*$i+24-40*4`($inp),@out[$i]	# load IV
	mov	$one,`32+4*$i`(%rsp)		# initialize counters
	cmovle	%rsp,@ptr[$i]			# cancel input
	sub	@ptr[$i],$temp			# distance between input and output
	mov	$temp,`64+8*$i`(%rsp)		# initialize distances
	vmovdqu	@out[$i],`192+16*$i`(%rsp)	# offload IV
___
}
# Load the first ciphertext block of all eight lanes, keep a raw copy in
# the second offload buffer (it is the IV for the following block), xor
# with the 0-round key, then flip $offload back to the buffer holding the
# current IVs.
$code.=<<___;
	test	$num,$num
	jz	.Ldec8x_done

	vmovups	0x10-0x78($key),$rndkey1
	vmovups	0x20-0x78($key),$rndkey0
	mov	0xf0-0x78($key),$rounds
	lea	192+128(%rsp),$offload		# offload area

	vmovdqu	(@ptr[0]),@out[0]		# load inputs
	vmovdqu	(@ptr[1]),@out[1]
	vmovdqu	(@ptr[2]),@out[2]
	vmovdqu	(@ptr[3]),@out[3]
	vmovdqu	(@ptr[4]),@out[4]
	vmovdqu	(@ptr[5]),@out[5]
	vmovdqu	(@ptr[6]),@out[6]
	vmovdqu	(@ptr[7]),@out[7]
	vmovdqu	@out[0],0x00($offload)		# offload inputs
	vpxor	$zero,@out[0],@out[0]		# xor inputs with 0-round
	vmovdqu	@out[1],0x10($offload)
	vpxor	$zero,@out[1],@out[1]
	vmovdqu	@out[2],0x20($offload)
	vpxor	$zero,@out[2],@out[2]
	vmovdqu	@out[3],0x30($offload)
	vpxor	$zero,@out[3],@out[3]
	vmovdqu	@out[4],0x40($offload)
	vpxor	$zero,@out[4],@out[4]
	vmovdqu	@out[5],0x50($offload)
	vpxor	$zero,@out[5],@out[5]
	vmovdqu	@out[6],0x60($offload)
	vpxor	$zero,@out[6],@out[6]
	vmovdqu	@out[7],0x70($offload)
	vpxor	$zero,@out[7],@out[7]
	xor	\$0x80,$offload
	mov	\$1,$one			# constant of 1
	jmp	.Loop_dec8x

.align	32
.Loop_dec8x:
___
# Eight rounds, one per lane index. Round $i additionally services lane
# $i exactly as in the encrypt loop, except that the next input block is
# loaded raw (no xor); lanes 0..3 park it at 128+16*i(%rsp).
for($i=0;$i<8;$i++) {
my $rndkey=($i&1)?$rndkey0:$rndkey1;
$code.=<<___;
	vaesdec	$rndkey,@out[0],@out[0]
	cmp	32+4*$i(%rsp),$one
___
$code.=<<___ if ($i);
	mov	64+8*$i(%rsp),$offset
___
$code.=<<___;
	vaesdec	$rndkey,@out[1],@out[1]
	prefetcht0	31(@ptr[$i])		# prefetch input
	vaesdec	$rndkey,@out[2],@out[2]
___
$code.=<<___ if ($i>1);
	prefetcht0	15(@ptr[$i-2])		# prefetch output
___
$code.=<<___;
	vaesdec	$rndkey,@out[3],@out[3]
	lea	(@ptr[$i],$offset),$offset
	cmovge	%rsp,@ptr[$i]			# cancel input
	vaesdec	$rndkey,@out[4],@out[4]
	cmovg	%rsp,$offset			# sink output
	vaesdec	$rndkey,@out[5],@out[5]
	sub	@ptr[$i],$offset
	vaesdec	$rndkey,@out[6],@out[6]
	vmovdqu	16(@ptr[$i]),@inp[$i%4]		# load input
	mov	$offset,64+8*$i(%rsp)
	vaesdec	$rndkey,@out[7],@out[7]
	vmovups	`16*(3+$i)-0x78`($key),$rndkey
	lea	16(@ptr[$i],$offset),@ptr[$i]	# switch to output
___
$code.=<<___ if ($i<4);
	vmovdqu	@inp[$i%4],`128+16*$i`(%rsp)	# off-load
___
}
# $i is 8 here (the loop's exit value), so the two prefetches below
# address @ptr[6] and @ptr[7]. Then: remaining rounds depending on
# $rounds, counter update, final round, xor with the IVs held at
# $offload, output store, and staging of the next blocks: each is saved
# raw to $offload (overwriting the IV just used) and xored with the
# 0-round key; the closing xor of 128 then flips $offload to the other
# buffer.
$code.=<<___;
	vmovdqu	32(%rsp),$counters
	prefetcht0	15(@ptr[$i-2])		# prefetch output
	prefetcht0	15(@ptr[$i-1])
	cmp	\$11,$rounds
	jb	.Ldec8x_tail

	vaesdec	$rndkey1,@out[0],@out[0]
	vaesdec	$rndkey1,@out[1],@out[1]
	vaesdec	$rndkey1,@out[2],@out[2]
	vaesdec	$rndkey1,@out[3],@out[3]
	vaesdec	$rndkey1,@out[4],@out[4]
	vaesdec	$rndkey1,@out[5],@out[5]
	vaesdec	$rndkey1,@out[6],@out[6]
	vaesdec	$rndkey1,@out[7],@out[7]
	vmovups	0xb0-0x78($key),$rndkey1

	vaesdec	$rndkey0,@out[0],@out[0]
	vaesdec	$rndkey0,@out[1],@out[1]
	vaesdec	$rndkey0,@out[2],@out[2]
	vaesdec	$rndkey0,@out[3],@out[3]
	vaesdec	$rndkey0,@out[4],@out[4]
	vaesdec	$rndkey0,@out[5],@out[5]
	vaesdec	$rndkey0,@out[6],@out[6]
	vaesdec	$rndkey0,@out[7],@out[7]
	vmovups	0xc0-0x78($key),$rndkey0
	je	.Ldec8x_tail

	vaesdec	$rndkey1,@out[0],@out[0]
	vaesdec	$rndkey1,@out[1],@out[1]
	vaesdec	$rndkey1,@out[2],@out[2]
	vaesdec	$rndkey1,@out[3],@out[3]
	vaesdec	$rndkey1,@out[4],@out[4]
	vaesdec	$rndkey1,@out[5],@out[5]
	vaesdec	$rndkey1,@out[6],@out[6]
	vaesdec	$rndkey1,@out[7],@out[7]
	vmovups	0xd0-0x78($key),$rndkey1

	vaesdec	$rndkey0,@out[0],@out[0]
	vaesdec	$rndkey0,@out[1],@out[1]
	vaesdec	$rndkey0,@out[2],@out[2]
	vaesdec	$rndkey0,@out[3],@out[3]
	vaesdec	$rndkey0,@out[4],@out[4]
	vaesdec	$rndkey0,@out[5],@out[5]
	vaesdec	$rndkey0,@out[6],@out[6]
	vaesdec	$rndkey0,@out[7],@out[7]
	vmovups	0xe0-0x78($key),$rndkey0

.Ldec8x_tail:
	vaesdec	$rndkey1,@out[0],@out[0]
	vpxor	$zero,$zero,$zero
	vaesdec	$rndkey1,@out[1],@out[1]
	vaesdec	$rndkey1,@out[2],@out[2]
	vpcmpgtd	$zero,$counters,$zero
	vaesdec	$rndkey1,@out[3],@out[3]
	vaesdec	$rndkey1,@out[4],@out[4]
	vpaddd	$counters,$zero,$zero		# decrement counters
	vmovdqu	48(%rsp),$counters
	vaesdec	$rndkey1,@out[5],@out[5]
	mov	64(%rsp),$offset		# pre-load 1st offset
	vaesdec	$rndkey1,@out[6],@out[6]
	vaesdec	$rndkey1,@out[7],@out[7]
	vmovups	0x10-0x78($key),$rndkey1

	vaesdeclast	$rndkey0,@out[0],@out[0]
	vmovdqa	$zero,32(%rsp)			# update counters
	vpxor	$zero,$zero,$zero
	vaesdeclast	$rndkey0,@out[1],@out[1]
	vpxor	0x00($offload),@out[0],@out[0]	# xor with IV
	vaesdeclast	$rndkey0,@out[2],@out[2]
	vpxor	0x10($offload),@out[1],@out[1]
	vpcmpgtd	$zero,$counters,$zero
	vaesdeclast	$rndkey0,@out[3],@out[3]
	vpxor	0x20($offload),@out[2],@out[2]
	vaesdeclast	$rndkey0,@out[4],@out[4]
	vpxor	0x30($offload),@out[3],@out[3]
	vpaddd	$zero,$counters,$counters	# decrement counters
	vmovdqu	-0x78($key),$zero		# 0-round
	vaesdeclast	$rndkey0,@out[5],@out[5]
	vpxor	0x40($offload),@out[4],@out[4]
	vaesdeclast	$rndkey0,@out[6],@out[6]
	vpxor	0x50($offload),@out[5],@out[5]
	vmovdqa	$counters,48(%rsp)		# update counters
	vaesdeclast	$rndkey0,@out[7],@out[7]
	vpxor	0x60($offload),@out[6],@out[6]
	vmovups	0x20-0x78($key),$rndkey0

	vmovups	@out[0],-16(@ptr[0])		# write output
	sub	$offset,@ptr[0]			# switch to input
	vmovdqu	128+0(%rsp),@out[0]
	vpxor	0x70($offload),@out[7],@out[7]
	vmovups	@out[1],-16(@ptr[1])
	sub	`64+1*8`(%rsp),@ptr[1]
	vmovdqu	@out[0],0x00($offload)
	vpxor	$zero,@out[0],@out[0]
	vmovdqu	128+16(%rsp),@out[1]
	vmovups	@out[2],-16(@ptr[2])
	sub	`64+2*8`(%rsp),@ptr[2]
	vmovdqu	@out[1],0x10($offload)
	vpxor	$zero,@out[1],@out[1]
	vmovdqu	128+32(%rsp),@out[2]
	vmovups	@out[3],-16(@ptr[3])
	sub	`64+3*8`(%rsp),@ptr[3]
	vmovdqu	@out[2],0x20($offload)
	vpxor	$zero,@out[2],@out[2]
	vmovdqu	128+48(%rsp),@out[3]
	vmovups	@out[4],-16(@ptr[4])
	sub	`64+4*8`(%rsp),@ptr[4]
	vmovdqu	@out[3],0x30($offload)
	vpxor	$zero,@out[3],@out[3]
	vmovdqu	@inp[0],0x40($offload)
	vpxor	@inp[0],$zero,@out[4]
	vmovups	@out[5],-16(@ptr[5])
	sub	`64+5*8`(%rsp),@ptr[5]
	vmovdqu	@inp[1],0x50($offload)
	vpxor	@inp[1],$zero,@out[5]
	vmovups	@out[6],-16(@ptr[6])
	sub	`64+6*8`(%rsp),@ptr[6]
	vmovdqu	@inp[2],0x60($offload)
	vpxor	@inp[2],$zero,@out[6]
	vmovups	@out[7],-16(@ptr[7])
	sub	`64+7*8`(%rsp),@ptr[7]
	vmovdqu	@inp[3],0x70($offload)
	vpxor	@inp[3],$zero,@out[7]

	xor	\$128,$offload
	dec	$num
	jnz	.Loop_dec8x

	mov	16(%rsp),%rax			# original %rsp
	#mov	24(%rsp),$num
	#lea	`40*8`($inp),$inp
	#dec	$num
	#jnz	.Ldec8x_loop_grande

.Ldec8x_done:
	vzeroupper
___
# Win64 only: restore %xmm6-%xmm15.
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
# Epilogue of aesni_multi_cbc_decrypt_avx: reload the callee-saved
# general-purpose registers from just below the original stack pointer
# (held in %rax), switch %rsp back to it and return.
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Ldec8x_epilogue:
	ret
.size	aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
___
						}}}

if ($win64) {
# Win64-only: emit the exception handler shared by all entry points
# plus the .pdata/.xdata tables that register it.
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# se_handler compares context->Rip against the two RVAs stored in
# HandlerData[] (body label, epilogue label).  When Rip lies between
# them it picks up the saved stack pointer from 16(context->Rsp),
# writes the saved %rbx,%rbp,%r12-%r15 back into the CONTEXT and copies
# 20 quadwords of saved state to context+512 (&context.Xmm6).  In all
# cases it then restores Rsp/Rsi/Rdi in the CONTEXT, copies the CONTEXT
# to disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	16(%rax),%rax		# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore cotnext->R12
	mov	%r13,224($context)	# restore cotnext->R13
	mov	%r14,232($context)	# restore cotnext->R14
	mov	%r15,240($context)	# restore cotnext->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_aesni_multi_cbc_encrypt
	.rva	.LSEH_end_aesni_multi_cbc_encrypt
	.rva	.LSEH_info_aesni_multi_cbc_encrypt
	.rva	.LSEH_begin_aesni_multi_cbc_decrypt
	.rva	.LSEH_end_aesni_multi_cbc_decrypt
	.rva	.LSEH_info_aesni_multi_cbc_decrypt
___
# The AVX entry points get their own .pdata records only when the
# AVX code path is being generated.
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_aesni_multi_cbc_encrypt_avx
	.rva	.LSEH_end_aesni_multi_cbc_encrypt_avx
	.rva	.LSEH_info_aesni_multi_cbc_encrypt_avx
	.rva	.LSEH_begin_aesni_multi_cbc_decrypt_avx
	.rva	.LSEH_end_aesni_multi_cbc_decrypt_avx
	.rva	.LSEH_info_aesni_multi_cbc_decrypt_avx
___
# Each .xdata record names se_handler and supplies the two labels
# (body, epilogue) that the handler reads as HandlerData[0..1].
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_aesni_multi_cbc_encrypt:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lenc4x_body,.Lenc4x_epilogue		# HandlerData[]
.LSEH_info_aesni_multi_cbc_decrypt:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Ldec4x_body,.Ldec4x_epilogue		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_aesni_multi_cbc_encrypt_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lenc8x_body,.Lenc8x_epilogue		# HandlerData[]
.LSEH_info_aesni_multi_cbc_decrypt_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Ldec8x_body,.Ldec8x_epilogue		# HandlerData[]
___
}
####################################################################

# rex(\@opcode,$dst,$src)
#
# Append a REX prefix byte to the array referenced by the first argument
# when either register number is 8 or above: bit 0x04 is set for $dst,
# bit 0x01 for $src, and the result is OR-ed with 0x40.  Nothing is
# appended when both register numbers are below 8.
sub rex {
    my ($opcode,$dst,$src)=@_;
    my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    push @$opcode,($rex|0x40)	if($rex);
}

# aesni($line)
#
# Translate one AES-NI instruction in AT&T syntax into an equivalent
# ".byte" sequence.  Three operand shapes are recognised:
#
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
#   aes{enc,enclast,dec,declast} disp(%rsp),%xmmD
#
# Returns the ".byte ..." string on success.  Anything that cannot be
# encoded here (unknown mnemonic, unsupported operand form, or an
# %rsp displacement that does not fit in a sign-extended 8-bit field)
# is returned unchanged so that the instruction still reaches the
# assembler instead of silently vanishing from the output or being
# encoded with the wrong displacement.
sub aesni {
    my $line=shift;
    my @opcode=(0x66);		# every encoding emitted here starts with 0x66

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($imm,$src,$dst)=($2,$3,$4);
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	push @opcode,$imm=~/^0/?oct($imm):$imm;		# oct() handles 0x.. and 0.. forms
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$src,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	if (defined($opcodelet{$mnemonic})) {
	    rex(\@opcode,$dst,$src);
	    push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	    push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	    return ".byte\t".join(',',@opcode);
	}
	# unknown mnemonic: fall through and leave the line as is
    }
    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$off,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	# An empty offset string evaluates to displacement 0.
	my $disp8=($off=~/^0/?oct($off):$off)+0;
	# The displacement is emitted as a single sign-extended byte, so
	# only values up to 0x7f can be represented by this encoding.
	if (defined($opcodelet{$mnemonic}) && $disp8<=0x7f) {
	    push @opcode,0x44	if ($dst>=8);		# REX prefix for %xmm8+
	    push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	    push @opcode,0x44|(($dst&7)<<3),0x24;	# ModR/M + SIB
	    push @opcode,$disp8&0xff;
	    return ".byte\t".join(',',@opcode);
	}
	# unknown mnemonic or out-of-range offset: leave the line as is
    }
    return $line;
}

# Post-process the accumulated assembly text:
#  1. replace every `...` span with the result of eval-ing its contents;
#  2. hand every "aes... %xmmN" instruction to aesni(); text following
#     the last %xmm operand on that line is dropped.  The \b anchor
#     keeps v-prefixed mnemonics (vaesenc etc.) out of this rewrite.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

print $code;
# A failed close means the output may be incomplete; report it
# instead of exiting successfully.
close STDOUT or die "error closing STDOUT: $!";