#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# Multi-buffer AES-NI procedures process several independent buffers
# in parallel by interleaving independent instructions.
#
# Cycles per byte for interleave factor 4:
#
#			asymptotic	measured
#			---------------------------
# Westmere		5.00/4=1.25	5.13/4=1.28
# Atom			15.0/4=3.75	?15.7/4=3.93
# Sandy Bridge		5.06/4=1.27	5.18/4=1.29
# Ivy Bridge		5.06/4=1.27	5.14/4=1.29
# Haswell		4.44/4=1.11	4.44/4=1.11
# Bulldozer		5.75/4=1.44	5.76/4=1.44
#
# Cycles per byte for interleave factor 8 (not implemented for
# pre-AVX processors, where higher interleave factor incidentally
# doesn't result in improvement):
#
#			asymptotic	measured
#			---------------------------
# Sandy Bridge		5.06/8=0.64	7.10/8=0.89(*)
# Ivy Bridge		5.06/8=0.64	7.14/8=0.89(*)
# Haswell		5.00/8=0.63	5.00/8=0.63
# Bulldozer		5.75/8=0.72	5.77/8=0.72
#
# (*)	Sandy/Ivy Bridge are known to handle high interleave factors
#	suboptimally;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Win64 prologues/epilogues (xmm6-xmm15 save/restore) are emitted only
# when the flavour or the output file name selects a Windows assembler.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator next to this script or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

push(@INC,"${dir}","${dir}../../perlasm");
require "x86_64-support.pl";

# Pointer width for the selected flavour, as reported by x86_64-support.pl.
$ptr_size=&pointer_size($flavour);

# $avx ends up 0, 1 or 2: each probe below adds one point per version
# threshold the detected assembler/compiler meets.
$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# Everything printed to STDOUT from here on is piped through the translator.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# void aesni_multi_cbc_encrypt (
#     struct {	void *inp,*out; int blocks; double iv[2]; } inp[8];
#     const AES_KEY *key,
#     int num);		/* 1 or 2 */
#
$inp="%rdi";	# 1st arg
$key="%rsi";	# 2nd arg
$num="%edx";

# Size of one descriptor in the inp[] array: two pointers, 8 bytes holding
# the block count, and a 16-byte IV.
$inp_elm_size=2*$ptr_size+8+16;

@inptr=map("%r$_",(8..11));
@outptr=map("%r$_",(12..15));

($rndkey0,$rndkey1)=("%xmm0","%xmm1");
@out=map("%xmm$_",(2..5));
@inp=map("%xmm$_",(6..9));
($counters,$mask,$zero)=map("%xmm$_",(10..12));

($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx");

$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	aesni_multi_cbc_encrypt
.type	aesni_multi_cbc_encrypt,\@function,3
.align	32
aesni_multi_cbc_encrypt:
.cfi_startproc
___
# When the assembler can emit AVX, dispatch to the 8x code path at run time
# if num >= 2 and the CPU reports the AVX capability bit.
$code.=<<___ if ($avx);
	cmp	\$2,$num
	jb	.Lenc_non_avx
	mov	OPENSSL_ia32cap_P+4(%rip),%ecx
	test	\$`1<<28`,%ecx			# AVX bit
	jnz	_avx_cbc_enc_shortcut
	jmp	.Lenc_non_avx
.align	16
.Lenc_non_avx:
___
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,0x60(%rsp)
	movaps	%xmm13,-0x68(%rax)	# not used, saved to share se_handler
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
$code.=<<___;
	# stack layout
	#
	# +0	output sink
	# +16	input sink [original %rsp and $num]
	# +32	counters

	sub	\$48,%rsp
	and	\$-64,%rsp
	mov	%rax,16(%rsp)			# original %rsp
.cfi_cfa_expression	%rsp+16,deref,+8

.Lenc4x_body:
	movdqu	($key),$zero			# 0-round key
	lea	0x78($key),$key			# size optimization
	lea	$inp_elm_size*2($inp),$inp

.Lenc4x_loop_grande:
	mov	$num,24(%rsp)			# original $num
	xor	$num,$num
___
# Per-lane setup: read the block count, input pointer, output pointer and IV
# of each of the four descriptors, keeping the largest block count in $num.
for($i=0;$i<4;$i++) {
    $inptr_reg=&pointer_register($flavour,@inptr[$i]);
    $outptr_reg=&pointer_register($flavour,@outptr[$i]);
    $code.=<<___;
	# borrow $one for number of blocks
	mov	`$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*2`($inp),$one
	mov	`$inp_elm_size*$i+0-$inp_elm_size*2`($inp),$inptr_reg
	cmp	$num,$one
	mov	`$inp_elm_size*$i+$ptr_size-$inp_elm_size*2`($inp),$outptr_reg
	cmovg	$one,$num			# find maximum
	test	$one,$one
	# load IV
	movdqu	`$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*2`($inp),@out[$i]
	mov	$one,`32+4*$i`(%rsp)		# initialize counters
	cmovle	%rsp,@inptr[$i]			# cancel input
___
}
# Each of the four input streams gets its own prefetch in .Loop_enc4x; the
# fourth one references @inptr[3], matching .Loop_dec4x.
$code.=<<___;
	test	$num,$num
	jz	.Lenc4x_done

	movups	0x10-0x78($key),$rndkey1
	pxor	$zero,@out[0]
	movups	0x20-0x78($key),$rndkey0
	pxor	$zero,@out[1]
	mov	0xf0-0x78($key),$rounds
	pxor	$zero,@out[2]
	movdqu	(@inptr[0]),@inp[0]		# load inputs
	pxor	$zero,@out[3]
	movdqu	(@inptr[1]),@inp[1]
	pxor	@inp[0],@out[0]
	movdqu	(@inptr[2]),@inp[2]
	pxor	@inp[1],@out[1]
	movdqu	(@inptr[3]),@inp[3]
	pxor	@inp[2],@out[2]
	pxor	@inp[3],@out[3]
	movdqa	32(%rsp),$counters		# load counters
	xor	$offset,$offset
	jmp	.Loop_enc4x

.align	32
.Loop_enc4x:
	add	\$16,$offset
	lea	16(%rsp),$sink			# sink pointer
	mov	\$1,$one			# constant of 1
	sub	$offset,$sink

	aesenc		$rndkey1,@out[0]
	prefetcht0	31(@inptr[0],$offset)	# prefetch input
	prefetcht0	31(@inptr[1],$offset)
	aesenc		$rndkey1,@out[1]
	prefetcht0	31(@inptr[2],$offset)
	prefetcht0	31(@inptr[3],$offset)
	aesenc		$rndkey1,@out[2]
	aesenc		$rndkey1,@out[3]
	movups		0x30-0x78($key),$rndkey1
___
# Four more rounds, alternating between the two round-key registers; each
# iteration also compares lane $i's counter with 1 and redirects exhausted
# lanes to the on-stack sink.
for($i=0;$i<4;$i++) {
my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
$code.=<<___;
	cmp		`32+4*$i`(%rsp),$one
	aesenc		$rndkey,@out[0]
	aesenc		$rndkey,@out[1]
	aesenc		$rndkey,@out[2]
	cmovge		$sink,@inptr[$i]	# cancel input
	cmovg		$sink,@outptr[$i]	# sink output
	aesenc		$rndkey,@out[3]
	movups		`0x40+16*$i-0x78`($key),$rndkey
___
}
$code.=<<___;
	movdqa		$counters,$mask
	aesenc		$rndkey0,@out[0]
	prefetcht0	15(@outptr[0],$offset)	# prefetch output
	prefetcht0	15(@outptr[1],$offset)
	aesenc		$rndkey0,@out[1]
	prefetcht0	15(@outptr[2],$offset)
	prefetcht0	15(@outptr[3],$offset)
	aesenc		$rndkey0,@out[2]
	aesenc		$rndkey0,@out[3]
	movups		0x80-0x78($key),$rndkey0
	pxor		$zero,$zero

	aesenc		$rndkey1,@out[0]
	pcmpgtd		$zero,$mask
	movdqu		-0x78($key),$zero	# reload 0-round key
	aesenc		$rndkey1,@out[1]
	paddd		$mask,$counters		# decrement counters
	movdqa		$counters,32(%rsp)	# update counters
	aesenc		$rndkey1,@out[2]
	aesenc		$rndkey1,@out[3]
	movups		0x90-0x78($key),$rndkey1

	cmp	\$11,$rounds

	aesenc		$rndkey0,@out[0]
	aesenc		$rndkey0,@out[1]
	aesenc		$rndkey0,@out[2]
	aesenc		$rndkey0,@out[3]
	movups		0xa0-0x78($key),$rndkey0

	jb	.Lenc4x_tail

	aesenc		$rndkey1,@out[0]
	aesenc		$rndkey1,@out[1]
	aesenc		$rndkey1,@out[2]
	aesenc		$rndkey1,@out[3]
	movups		0xb0-0x78($key),$rndkey1

	aesenc		$rndkey0,@out[0]
	aesenc		$rndkey0,@out[1]
	aesenc		$rndkey0,@out[2]
	aesenc		$rndkey0,@out[3]
	movups		0xc0-0x78($key),$rndkey0

	je	.Lenc4x_tail

	aesenc		$rndkey1,@out[0]
	aesenc		$rndkey1,@out[1]
	aesenc		$rndkey1,@out[2]
	aesenc		$rndkey1,@out[3]
	movups		0xd0-0x78($key),$rndkey1

	aesenc		$rndkey0,@out[0]
	aesenc		$rndkey0,@out[1]
	aesenc		$rndkey0,@out[2]
	aesenc		$rndkey0,@out[3]
	movups		0xe0-0x78($key),$rndkey0
	jmp	.Lenc4x_tail

.align	32
.Lenc4x_tail:
	aesenc		$rndkey1,@out[0]
	aesenc		$rndkey1,@out[1]
	aesenc		$rndkey1,@out[2]
	aesenc		$rndkey1,@out[3]
	movdqu		(@inptr[0],$offset),@inp[0]
	movdqu		0x10-0x78($key),$rndkey1

	aesenclast	$rndkey0,@out[0]
	movdqu		(@inptr[1],$offset),@inp[1]
	pxor		$zero,@inp[0]
	aesenclast	$rndkey0,@out[1]
	movdqu		(@inptr[2],$offset),@inp[2]
	pxor		$zero,@inp[1]
	aesenclast	$rndkey0,@out[2]
	movdqu		(@inptr[3],$offset),@inp[3]
	pxor		$zero,@inp[2]
	aesenclast	$rndkey0,@out[3]
	movdqu		0x20-0x78($key),$rndkey0
	pxor		$zero,@inp[3]

	movups		@out[0],-16(@outptr[0],$offset)
	pxor		@inp[0],@out[0]
	movups		@out[1],-16(@outptr[1],$offset)
	pxor		@inp[1],@out[1]
	movups		@out[2],-16(@outptr[2],$offset)
	pxor		@inp[2],@out[2]
	movups		@out[3],-16(@outptr[3],$offset)
	pxor		@inp[3],@out[3]

	dec	$num
	jnz	.Loop_enc4x

	mov	16(%rsp),%rax			# original %rsp
.cfi_def_cfa	%rax,8
	mov	24(%rsp),$num

	#pxor	@inp[0],@out[0]
	#pxor	@inp[1],@out[1]
	# output iv FIX ME!
	#movdqu	@out[0],`$inp_elm_size*0+2*$ptr_size+8-$inp_elm_size*2`($inp)
	#pxor	@inp[2],@out[2]
	#movdqu	@out[1],`$inp_elm_size*1+2*$ptr_size+8-$inp_elm_size*2`($inp)
	#pxor	@inp[3],@out[3]
	#movdqu	@out[2],`$inp_elm_size*2+2*$ptr_size+8-$inp_elm_size*2`($inp)	# won't fix, let caller
	#movdqu	@out[3],`$inp_elm_size*3+2*$ptr_size+8-$inp_elm_size*2`($inp)	# figure this out...

	lea	`$inp_elm_size*4`($inp),$inp
	dec	$num
	jnz	.Lenc4x_loop_grande

.Lenc4x_done:
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	#movaps	-0x68(%rax),%xmm13
	#movaps	-0x58(%rax),%xmm14
	#movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lenc4x_epilogue:
	ret
.cfi_endproc
.size	aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt

.globl	aesni_multi_cbc_decrypt
.type	aesni_multi_cbc_decrypt,\@function,3
.align	32
aesni_multi_cbc_decrypt:
.cfi_startproc
___
# Same run-time AVX dispatch as on the encrypt side.
$code.=<<___ if ($avx);
	cmp	\$2,$num
	jb	.Ldec_non_avx
	mov	OPENSSL_ia32cap_P+4(%rip),%ecx
	test	\$`1<<28`,%ecx			# AVX bit
	jnz	_avx_cbc_dec_shortcut
	jmp	.Ldec_non_avx
.align	16
.Ldec_non_avx:
___
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,0x60(%rsp)
	movaps	%xmm13,-0x68(%rax)	# not used, saved to share se_handler
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
$code.=<<___;
	# stack layout
	#
	# +0	output sink
	# +16	input sink [original %rsp and $num]
	# +32	counters

	sub	\$48,%rsp
	and	\$-64,%rsp
	mov	%rax,16(%rsp)			# original %rsp
.cfi_cfa_expression	%rsp+16,deref,+8

.Ldec4x_body:
	movdqu	($key),$zero			# 0-round key
	lea	0x78($key),$key			# size optimization
	lea	$inp_elm_size*2($inp),$inp

.Ldec4x_loop_grande:
	mov	$num,24(%rsp)			# original $num
	xor	$num,$num
___
# Per-lane setup, as for encryption, except that the IV is loaded into
# @inp[$i] rather than @out[$i].
for($i=0;$i<4;$i++) {
    $inptr_reg=&pointer_register($flavour,@inptr[$i]);
    $outptr_reg=&pointer_register($flavour,@outptr[$i]);
    $code.=<<___;
	# borrow $one for number of blocks
	mov	`$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*2`($inp),$one
	mov	`$inp_elm_size*$i+0-$inp_elm_size*2`($inp),$inptr_reg
	cmp	$num,$one
	mov	`$inp_elm_size*$i+$ptr_size-$inp_elm_size*2`($inp),$outptr_reg
	cmovg	$one,$num			# find maximum
	test	$one,$one
	# load IV
	movdqu	`$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*2`($inp),@inp[$i]
	mov	$one,`32+4*$i`(%rsp)		# initialize counters
	cmovle	%rsp,@inptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldec4x_done

	movups	0x10-0x78($key),$rndkey1
	movups	0x20-0x78($key),$rndkey0
	mov	0xf0-0x78($key),$rounds
	movdqu	(@inptr[0]),@out[0]		# load inputs
	movdqu	(@inptr[1]),@out[1]
	pxor	$zero,@out[0]
	movdqu	(@inptr[2]),@out[2]
	pxor	$zero,@out[1]
	movdqu	(@inptr[3]),@out[3]
	pxor	$zero,@out[2]
	pxor	$zero,@out[3]
	movdqa	32(%rsp),$counters		# load counters
	xor	$offset,$offset
	jmp	.Loop_dec4x

.align	32
.Loop_dec4x:
	add	\$16,$offset
	lea	16(%rsp),$sink			# sink pointer
	mov	\$1,$one			# constant of 1
	sub	$offset,$sink

	aesdec		$rndkey1,@out[0]
	prefetcht0	31(@inptr[0],$offset)	# prefetch input
	prefetcht0	31(@inptr[1],$offset)
	aesdec		$rndkey1,@out[1]
	prefetcht0	31(@inptr[2],$offset)
	prefetcht0	31(@inptr[3],$offset)
	aesdec		$rndkey1,@out[2]
	aesdec		$rndkey1,@out[3]
	movups		0x30-0x78($key),$rndkey1
___
# Decrypt counterpart of the four alternating-key rounds with per-lane
# counter checks.
for($i=0;$i<4;$i++) {
my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
$code.=<<___;
	cmp		`32+4*$i`(%rsp),$one
	aesdec		$rndkey,@out[0]
	aesdec		$rndkey,@out[1]
	aesdec		$rndkey,@out[2]
	cmovge		$sink,@inptr[$i]	# cancel input
	cmovg		$sink,@outptr[$i]	# sink output
	aesdec		$rndkey,@out[3]
	movups		`0x40+16*$i-0x78`($key),$rndkey
___
}
$code.=<<___;
	movdqa		$counters,$mask
	aesdec		$rndkey0,@out[0]
	prefetcht0	15(@outptr[0],$offset)	# prefetch output
	prefetcht0	15(@outptr[1],$offset)
	aesdec		$rndkey0,@out[1]
	prefetcht0	15(@outptr[2],$offset)
	prefetcht0	15(@outptr[3],$offset)
	aesdec		$rndkey0,@out[2]
	aesdec		$rndkey0,@out[3]
	movups		0x80-0x78($key),$rndkey0
	pxor		$zero,$zero

	aesdec		$rndkey1,@out[0]
	pcmpgtd		$zero,$mask
	movdqu		-0x78($key),$zero	# reload 0-round key
	aesdec		$rndkey1,@out[1]
	paddd		$mask,$counters		# decrement counters
	movdqa		$counters,32(%rsp)	# update counters
	aesdec		$rndkey1,@out[2]
	aesdec		$rndkey1,@out[3]
	movups		0x90-0x78($key),$rndkey1

	cmp	\$11,$rounds

	aesdec		$rndkey0,@out[0]
	aesdec		$rndkey0,@out[1]
	aesdec		$rndkey0,@out[2]
	aesdec		$rndkey0,@out[3]
	movups		0xa0-0x78($key),$rndkey0

	jb	.Ldec4x_tail

	aesdec		$rndkey1,@out[0]
	aesdec		$rndkey1,@out[1]
	aesdec		$rndkey1,@out[2]
	aesdec		$rndkey1,@out[3]
	movups		0xb0-0x78($key),$rndkey1

	aesdec		$rndkey0,@out[0]
	aesdec		$rndkey0,@out[1]
	aesdec		$rndkey0,@out[2]
	aesdec		$rndkey0,@out[3]
	movups		0xc0-0x78($key),$rndkey0

	je	.Ldec4x_tail

	aesdec		$rndkey1,@out[0]
	aesdec		$rndkey1,@out[1]
	aesdec		$rndkey1,@out[2]
	aesdec		$rndkey1,@out[3]
	movups		0xd0-0x78($key),$rndkey1

	aesdec		$rndkey0,@out[0]
	aesdec		$rndkey0,@out[1]
	aesdec		$rndkey0,@out[2]
	aesdec		$rndkey0,@out[3]
	movups		0xe0-0x78($key),$rndkey0
	jmp	.Ldec4x_tail

.align	32
.Ldec4x_tail:
	aesdec		$rndkey1,@out[0]
	aesdec		$rndkey1,@out[1]
	aesdec		$rndkey1,@out[2]
	pxor		$rndkey0,@inp[0]
	pxor		$rndkey0,@inp[1]
	aesdec		$rndkey1,@out[3]
	movdqu		0x10-0x78($key),$rndkey1
	pxor		$rndkey0,@inp[2]
	pxor		$rndkey0,@inp[3]
	movdqu		0x20-0x78($key),$rndkey0

	aesdeclast	@inp[0],@out[0]
	aesdeclast	@inp[1],@out[1]
	movdqu		-16(@inptr[0],$offset),@inp[0]	# load next IV
	movdqu		-16(@inptr[1],$offset),@inp[1]
	aesdeclast	@inp[2],@out[2]
	aesdeclast	@inp[3],@out[3]
	movdqu		-16(@inptr[2],$offset),@inp[2]
	movdqu		-16(@inptr[3],$offset),@inp[3]

	movups		@out[0],-16(@outptr[0],$offset)
	movdqu		(@inptr[0],$offset),@out[0]
	movups		@out[1],-16(@outptr[1],$offset)
	movdqu		(@inptr[1],$offset),@out[1]
	pxor		$zero,@out[0]
	movups		@out[2],-16(@outptr[2],$offset)
	movdqu		(@inptr[2],$offset),@out[2]
	pxor		$zero,@out[1]
	movups		@out[3],-16(@outptr[3],$offset)
	movdqu		(@inptr[3],$offset),@out[3]
	pxor		$zero,@out[2]
	pxor		$zero,@out[3]

	dec	$num
	jnz	.Loop_dec4x

	mov	16(%rsp),%rax			# original %rsp
.cfi_def_cfa	%rax,8
	mov	24(%rsp),$num

	lea	`$inp_elm_size*4`($inp),$inp
	dec	$num
	jnz	.Ldec4x_loop_grande

.Ldec4x_done:
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	#movaps	-0x68(%rax),%xmm13
	#movaps	-0x58(%rax),%xmm14
	#movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Ldec4x_epilogue:
	ret
.cfi_endproc
.size	aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
___

# 8x interleaved AVX code path, generated only when the assembler probe
# above found AVX support.
if ($avx) {{{
my @ptr=map("%r$_",(8..15));
my $offload=$sink;

my @out=map("%xmm$_",(2..9));
my @inp=map("%xmm$_",(10..13));
my ($counters,$zero)=("%xmm14","%xmm15");

$code.=<<___;
.type	aesni_multi_cbc_encrypt_avx,\@function,3
.align	32
aesni_multi_cbc_encrypt_avx:
.cfi_startproc
_avx_cbc_enc_shortcut:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
$code.=<<___;
	# stack layout
	#
	# +0	output sink
	# +16	input sink [original %rsp and $num]
	# +32	counters
	# +64	distances between inputs and outputs
	# +128	off-load area for @inp[0..3]

	sub	\$192,%rsp
	and	\$-128,%rsp
	mov	%rax,16(%rsp)			# original %rsp
.cfi_cfa_expression	%rsp+16,deref,+8

.Lenc8x_body:
	vzeroupper
	vmovdqu	($key),$zero			# 0-round key
	lea	0x78($key),$key			# size optimization
	lea	`$inp_elm_size*4`($inp),$inp
	shr	\$1,$num

.Lenc8x_loop_grande:
	#mov	$num,24(%rsp)			# original $num
	xor	$num,$num
___
for($i=0;$i<8;$i++)
{ 7367bded2dbSJung-uk Kim my $temp = $i ? $offload : $offset; 737*b077aed3SPierre Pronchery $ptr_reg=&pointer_register($flavour,@ptr[$i]); 738*b077aed3SPierre Pronchery $temp_reg=&pointer_register($flavour,$temp); 7397bded2dbSJung-uk Kim $code.=<<___; 740*b077aed3SPierre Pronchery # borrow $one for number of blocks 741*b077aed3SPierre Pronchery mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*4`($inp),$one 742*b077aed3SPierre Pronchery # input pointer 743*b077aed3SPierre Pronchery mov `$inp_elm_size*$i+0-$inp_elm_size*4`($inp),$ptr_reg 7447bded2dbSJung-uk Kim cmp $num,$one 745*b077aed3SPierre Pronchery # output pointer 746*b077aed3SPierre Pronchery mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*4`($inp),$temp_reg 7477bded2dbSJung-uk Kim cmovg $one,$num # find maximum 7487bded2dbSJung-uk Kim test $one,$one 749*b077aed3SPierre Pronchery # load IV 750*b077aed3SPierre Pronchery vmovdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*4`($inp),@out[$i] 7517bded2dbSJung-uk Kim mov $one,`32+4*$i`(%rsp) # initialize counters 7527bded2dbSJung-uk Kim cmovle %rsp,@ptr[$i] # cancel input 7537bded2dbSJung-uk Kim sub @ptr[$i],$temp # distance between input and output 7547bded2dbSJung-uk Kim mov $temp,`64+8*$i`(%rsp) # initialize distances 7557bded2dbSJung-uk Kim___ 7567bded2dbSJung-uk Kim} 7577bded2dbSJung-uk Kim$code.=<<___; 7587bded2dbSJung-uk Kim test $num,$num 7597bded2dbSJung-uk Kim jz .Lenc8x_done 7607bded2dbSJung-uk Kim 7617bded2dbSJung-uk Kim vmovups 0x10-0x78($key),$rndkey1 7627bded2dbSJung-uk Kim vmovups 0x20-0x78($key),$rndkey0 7637bded2dbSJung-uk Kim mov 0xf0-0x78($key),$rounds 7647bded2dbSJung-uk Kim 7657bded2dbSJung-uk Kim vpxor (@ptr[0]),$zero,@inp[0] # load inputs and xor with 0-round 7667bded2dbSJung-uk Kim lea 128(%rsp),$offload # offload area 7677bded2dbSJung-uk Kim vpxor (@ptr[1]),$zero,@inp[1] 7687bded2dbSJung-uk Kim vpxor (@ptr[2]),$zero,@inp[2] 7697bded2dbSJung-uk Kim vpxor (@ptr[3]),$zero,@inp[3] 7707bded2dbSJung-uk Kim vpxor @inp[0],@out[0],@out[0] 
7717bded2dbSJung-uk Kim vpxor (@ptr[4]),$zero,@inp[0] 7727bded2dbSJung-uk Kim vpxor @inp[1],@out[1],@out[1] 7737bded2dbSJung-uk Kim vpxor (@ptr[5]),$zero,@inp[1] 7747bded2dbSJung-uk Kim vpxor @inp[2],@out[2],@out[2] 7757bded2dbSJung-uk Kim vpxor (@ptr[6]),$zero,@inp[2] 7767bded2dbSJung-uk Kim vpxor @inp[3],@out[3],@out[3] 7777bded2dbSJung-uk Kim vpxor (@ptr[7]),$zero,@inp[3] 7787bded2dbSJung-uk Kim vpxor @inp[0],@out[4],@out[4] 7797bded2dbSJung-uk Kim mov \$1,$one # constant of 1 7807bded2dbSJung-uk Kim vpxor @inp[1],@out[5],@out[5] 7817bded2dbSJung-uk Kim vpxor @inp[2],@out[6],@out[6] 7827bded2dbSJung-uk Kim vpxor @inp[3],@out[7],@out[7] 7837bded2dbSJung-uk Kim jmp .Loop_enc8x 7847bded2dbSJung-uk Kim 7857bded2dbSJung-uk Kim.align 32 7867bded2dbSJung-uk Kim.Loop_enc8x: 7877bded2dbSJung-uk Kim___ 7887bded2dbSJung-uk Kimfor($i=0;$i<8;$i++) { 7897bded2dbSJung-uk Kimmy $rndkey=($i&1)?$rndkey0:$rndkey1; 7907bded2dbSJung-uk Kim$code.=<<___; 7917bded2dbSJung-uk Kim vaesenc $rndkey,@out[0],@out[0] 7927bded2dbSJung-uk Kim cmp 32+4*$i(%rsp),$one 7937bded2dbSJung-uk Kim___ 7947bded2dbSJung-uk Kim$code.=<<___ if ($i); 7957bded2dbSJung-uk Kim mov 64+8*$i(%rsp),$offset 7967bded2dbSJung-uk Kim___ 7977bded2dbSJung-uk Kim$code.=<<___; 7987bded2dbSJung-uk Kim vaesenc $rndkey,@out[1],@out[1] 7997bded2dbSJung-uk Kim prefetcht0 31(@ptr[$i]) # prefetch input 8007bded2dbSJung-uk Kim vaesenc $rndkey,@out[2],@out[2] 8017bded2dbSJung-uk Kim___ 8027bded2dbSJung-uk Kim$code.=<<___ if ($i>1); 8037bded2dbSJung-uk Kim prefetcht0 15(@ptr[$i-2]) # prefetch output 8047bded2dbSJung-uk Kim___ 8057bded2dbSJung-uk Kim$code.=<<___; 8067bded2dbSJung-uk Kim vaesenc $rndkey,@out[3],@out[3] 8077bded2dbSJung-uk Kim lea (@ptr[$i],$offset),$offset 8087bded2dbSJung-uk Kim cmovge %rsp,@ptr[$i] # cancel input 8097bded2dbSJung-uk Kim vaesenc $rndkey,@out[4],@out[4] 8107bded2dbSJung-uk Kim cmovg %rsp,$offset # sink output 8117bded2dbSJung-uk Kim vaesenc $rndkey,@out[5],@out[5] 8127bded2dbSJung-uk Kim sub 
@ptr[$i],$offset 8137bded2dbSJung-uk Kim vaesenc $rndkey,@out[6],@out[6] 8147bded2dbSJung-uk Kim vpxor 16(@ptr[$i]),$zero,@inp[$i%4] # load input and xor with 0-round 8157bded2dbSJung-uk Kim mov $offset,64+8*$i(%rsp) 8167bded2dbSJung-uk Kim vaesenc $rndkey,@out[7],@out[7] 8177bded2dbSJung-uk Kim vmovups `16*(3+$i)-0x78`($key),$rndkey 8187bded2dbSJung-uk Kim lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output 8197bded2dbSJung-uk Kim___ 8207bded2dbSJung-uk Kim$code.=<<___ if ($i<4) 8217bded2dbSJung-uk Kim vmovdqu @inp[$i%4],`16*$i`($offload) # off-load 8227bded2dbSJung-uk Kim___ 8237bded2dbSJung-uk Kim} 8247bded2dbSJung-uk Kim$code.=<<___; 8257bded2dbSJung-uk Kim vmovdqu 32(%rsp),$counters 8267bded2dbSJung-uk Kim prefetcht0 15(@ptr[$i-2]) # prefetch output 8277bded2dbSJung-uk Kim prefetcht0 15(@ptr[$i-1]) 8287bded2dbSJung-uk Kim cmp \$11,$rounds 8297bded2dbSJung-uk Kim jb .Lenc8x_tail 8307bded2dbSJung-uk Kim 8317bded2dbSJung-uk Kim vaesenc $rndkey1,@out[0],@out[0] 8327bded2dbSJung-uk Kim vaesenc $rndkey1,@out[1],@out[1] 8337bded2dbSJung-uk Kim vaesenc $rndkey1,@out[2],@out[2] 8347bded2dbSJung-uk Kim vaesenc $rndkey1,@out[3],@out[3] 8357bded2dbSJung-uk Kim vaesenc $rndkey1,@out[4],@out[4] 8367bded2dbSJung-uk Kim vaesenc $rndkey1,@out[5],@out[5] 8377bded2dbSJung-uk Kim vaesenc $rndkey1,@out[6],@out[6] 8387bded2dbSJung-uk Kim vaesenc $rndkey1,@out[7],@out[7] 8397bded2dbSJung-uk Kim vmovups 0xb0-0x78($key),$rndkey1 8407bded2dbSJung-uk Kim 8417bded2dbSJung-uk Kim vaesenc $rndkey0,@out[0],@out[0] 8427bded2dbSJung-uk Kim vaesenc $rndkey0,@out[1],@out[1] 8437bded2dbSJung-uk Kim vaesenc $rndkey0,@out[2],@out[2] 8447bded2dbSJung-uk Kim vaesenc $rndkey0,@out[3],@out[3] 8457bded2dbSJung-uk Kim vaesenc $rndkey0,@out[4],@out[4] 8467bded2dbSJung-uk Kim vaesenc $rndkey0,@out[5],@out[5] 8477bded2dbSJung-uk Kim vaesenc $rndkey0,@out[6],@out[6] 8487bded2dbSJung-uk Kim vaesenc $rndkey0,@out[7],@out[7] 8497bded2dbSJung-uk Kim vmovups 0xc0-0x78($key),$rndkey0 8507bded2dbSJung-uk Kim je 
.Lenc8x_tail 8517bded2dbSJung-uk Kim 8527bded2dbSJung-uk Kim vaesenc $rndkey1,@out[0],@out[0] 8537bded2dbSJung-uk Kim vaesenc $rndkey1,@out[1],@out[1] 8547bded2dbSJung-uk Kim vaesenc $rndkey1,@out[2],@out[2] 8557bded2dbSJung-uk Kim vaesenc $rndkey1,@out[3],@out[3] 8567bded2dbSJung-uk Kim vaesenc $rndkey1,@out[4],@out[4] 8577bded2dbSJung-uk Kim vaesenc $rndkey1,@out[5],@out[5] 8587bded2dbSJung-uk Kim vaesenc $rndkey1,@out[6],@out[6] 8597bded2dbSJung-uk Kim vaesenc $rndkey1,@out[7],@out[7] 8607bded2dbSJung-uk Kim vmovups 0xd0-0x78($key),$rndkey1 8617bded2dbSJung-uk Kim 8627bded2dbSJung-uk Kim vaesenc $rndkey0,@out[0],@out[0] 8637bded2dbSJung-uk Kim vaesenc $rndkey0,@out[1],@out[1] 8647bded2dbSJung-uk Kim vaesenc $rndkey0,@out[2],@out[2] 8657bded2dbSJung-uk Kim vaesenc $rndkey0,@out[3],@out[3] 8667bded2dbSJung-uk Kim vaesenc $rndkey0,@out[4],@out[4] 8677bded2dbSJung-uk Kim vaesenc $rndkey0,@out[5],@out[5] 8687bded2dbSJung-uk Kim vaesenc $rndkey0,@out[6],@out[6] 8697bded2dbSJung-uk Kim vaesenc $rndkey0,@out[7],@out[7] 8707bded2dbSJung-uk Kim vmovups 0xe0-0x78($key),$rndkey0 8717bded2dbSJung-uk Kim 8727bded2dbSJung-uk Kim.Lenc8x_tail: 8737bded2dbSJung-uk Kim vaesenc $rndkey1,@out[0],@out[0] 8747bded2dbSJung-uk Kim vpxor $zero,$zero,$zero 8757bded2dbSJung-uk Kim vaesenc $rndkey1,@out[1],@out[1] 8767bded2dbSJung-uk Kim vaesenc $rndkey1,@out[2],@out[2] 8777bded2dbSJung-uk Kim vpcmpgtd $zero,$counters,$zero 8787bded2dbSJung-uk Kim vaesenc $rndkey1,@out[3],@out[3] 8797bded2dbSJung-uk Kim vaesenc $rndkey1,@out[4],@out[4] 8807bded2dbSJung-uk Kim vpaddd $counters,$zero,$zero # decrement counters 8817bded2dbSJung-uk Kim vmovdqu 48(%rsp),$counters 8827bded2dbSJung-uk Kim vaesenc $rndkey1,@out[5],@out[5] 8837bded2dbSJung-uk Kim mov 64(%rsp),$offset # pre-load 1st offset 8847bded2dbSJung-uk Kim vaesenc $rndkey1,@out[6],@out[6] 8857bded2dbSJung-uk Kim vaesenc $rndkey1,@out[7],@out[7] 8867bded2dbSJung-uk Kim vmovups 0x10-0x78($key),$rndkey1 8877bded2dbSJung-uk Kim 8887bded2dbSJung-uk 
Kim vaesenclast $rndkey0,@out[0],@out[0] 8897bded2dbSJung-uk Kim vmovdqa $zero,32(%rsp) # update counters 8907bded2dbSJung-uk Kim vpxor $zero,$zero,$zero 8917bded2dbSJung-uk Kim vaesenclast $rndkey0,@out[1],@out[1] 8927bded2dbSJung-uk Kim vaesenclast $rndkey0,@out[2],@out[2] 8937bded2dbSJung-uk Kim vpcmpgtd $zero,$counters,$zero 8947bded2dbSJung-uk Kim vaesenclast $rndkey0,@out[3],@out[3] 8957bded2dbSJung-uk Kim vaesenclast $rndkey0,@out[4],@out[4] 8967bded2dbSJung-uk Kim vpaddd $zero,$counters,$counters # decrement counters 8977bded2dbSJung-uk Kim vmovdqu -0x78($key),$zero # 0-round 8987bded2dbSJung-uk Kim vaesenclast $rndkey0,@out[5],@out[5] 8997bded2dbSJung-uk Kim vaesenclast $rndkey0,@out[6],@out[6] 9007bded2dbSJung-uk Kim vmovdqa $counters,48(%rsp) # update counters 9017bded2dbSJung-uk Kim vaesenclast $rndkey0,@out[7],@out[7] 9027bded2dbSJung-uk Kim vmovups 0x20-0x78($key),$rndkey0 9037bded2dbSJung-uk Kim 9047bded2dbSJung-uk Kim vmovups @out[0],-16(@ptr[0]) # write output 9057bded2dbSJung-uk Kim sub $offset,@ptr[0] # switch to input 9067bded2dbSJung-uk Kim vpxor 0x00($offload),@out[0],@out[0] 9077bded2dbSJung-uk Kim vmovups @out[1],-16(@ptr[1]) 9087bded2dbSJung-uk Kim sub `64+1*8`(%rsp),@ptr[1] 9097bded2dbSJung-uk Kim vpxor 0x10($offload),@out[1],@out[1] 9107bded2dbSJung-uk Kim vmovups @out[2],-16(@ptr[2]) 9117bded2dbSJung-uk Kim sub `64+2*8`(%rsp),@ptr[2] 9127bded2dbSJung-uk Kim vpxor 0x20($offload),@out[2],@out[2] 9137bded2dbSJung-uk Kim vmovups @out[3],-16(@ptr[3]) 9147bded2dbSJung-uk Kim sub `64+3*8`(%rsp),@ptr[3] 9157bded2dbSJung-uk Kim vpxor 0x30($offload),@out[3],@out[3] 9167bded2dbSJung-uk Kim vmovups @out[4],-16(@ptr[4]) 9177bded2dbSJung-uk Kim sub `64+4*8`(%rsp),@ptr[4] 9187bded2dbSJung-uk Kim vpxor @inp[0],@out[4],@out[4] 9197bded2dbSJung-uk Kim vmovups @out[5],-16(@ptr[5]) 9207bded2dbSJung-uk Kim sub `64+5*8`(%rsp),@ptr[5] 9217bded2dbSJung-uk Kim vpxor @inp[1],@out[5],@out[5] 9227bded2dbSJung-uk Kim vmovups @out[6],-16(@ptr[6]) 9237bded2dbSJung-uk 
Kim sub `64+6*8`(%rsp),@ptr[6] 9247bded2dbSJung-uk Kim vpxor @inp[2],@out[6],@out[6] 9257bded2dbSJung-uk Kim vmovups @out[7],-16(@ptr[7]) 9267bded2dbSJung-uk Kim sub `64+7*8`(%rsp),@ptr[7] 9277bded2dbSJung-uk Kim vpxor @inp[3],@out[7],@out[7] 9287bded2dbSJung-uk Kim 9297bded2dbSJung-uk Kim dec $num 9307bded2dbSJung-uk Kim jnz .Loop_enc8x 9317bded2dbSJung-uk Kim 9327bded2dbSJung-uk Kim mov 16(%rsp),%rax # original %rsp 933e71b7053SJung-uk Kim.cfi_def_cfa %rax,8 9347bded2dbSJung-uk Kim #mov 24(%rsp),$num 935*b077aed3SPierre Pronchery #lea `$inp_elm_size*8`($inp),$inp 9367bded2dbSJung-uk Kim #dec $num 9377bded2dbSJung-uk Kim #jnz .Lenc8x_loop_grande 9387bded2dbSJung-uk Kim 9397bded2dbSJung-uk Kim.Lenc8x_done: 9407bded2dbSJung-uk Kim vzeroupper 9417bded2dbSJung-uk Kim___ 9427bded2dbSJung-uk Kim$code.=<<___ if ($win64); 9437bded2dbSJung-uk Kim movaps -0xd8(%rax),%xmm6 9447bded2dbSJung-uk Kim movaps -0xc8(%rax),%xmm7 9457bded2dbSJung-uk Kim movaps -0xb8(%rax),%xmm8 9467bded2dbSJung-uk Kim movaps -0xa8(%rax),%xmm9 9477bded2dbSJung-uk Kim movaps -0x98(%rax),%xmm10 9487bded2dbSJung-uk Kim movaps -0x88(%rax),%xmm11 9497bded2dbSJung-uk Kim movaps -0x78(%rax),%xmm12 9507bded2dbSJung-uk Kim movaps -0x68(%rax),%xmm13 9517bded2dbSJung-uk Kim movaps -0x58(%rax),%xmm14 9527bded2dbSJung-uk Kim movaps -0x48(%rax),%xmm15 9537bded2dbSJung-uk Kim___ 9547bded2dbSJung-uk Kim$code.=<<___; 9557bded2dbSJung-uk Kim mov -48(%rax),%r15 956e71b7053SJung-uk Kim.cfi_restore %r15 9577bded2dbSJung-uk Kim mov -40(%rax),%r14 958e71b7053SJung-uk Kim.cfi_restore %r14 9597bded2dbSJung-uk Kim mov -32(%rax),%r13 960e71b7053SJung-uk Kim.cfi_restore %r13 9617bded2dbSJung-uk Kim mov -24(%rax),%r12 962e71b7053SJung-uk Kim.cfi_restore %r12 9637bded2dbSJung-uk Kim mov -16(%rax),%rbp 964e71b7053SJung-uk Kim.cfi_restore %rbp 9657bded2dbSJung-uk Kim mov -8(%rax),%rbx 966e71b7053SJung-uk Kim.cfi_restore %rbx 9677bded2dbSJung-uk Kim lea (%rax),%rsp 968e71b7053SJung-uk Kim.cfi_def_cfa_register %rsp 9697bded2dbSJung-uk 
Kim.Lenc8x_epilogue: 9707bded2dbSJung-uk Kim ret 971e71b7053SJung-uk Kim.cfi_endproc 9727bded2dbSJung-uk Kim.size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx 9737bded2dbSJung-uk Kim 9747bded2dbSJung-uk Kim.type aesni_multi_cbc_decrypt_avx,\@function,3 9757bded2dbSJung-uk Kim.align 32 9767bded2dbSJung-uk Kimaesni_multi_cbc_decrypt_avx: 977e71b7053SJung-uk Kim.cfi_startproc 9787bded2dbSJung-uk Kim_avx_cbc_dec_shortcut: 9797bded2dbSJung-uk Kim mov %rsp,%rax 980e71b7053SJung-uk Kim.cfi_def_cfa_register %rax 9817bded2dbSJung-uk Kim push %rbx 982e71b7053SJung-uk Kim.cfi_push %rbx 9837bded2dbSJung-uk Kim push %rbp 984e71b7053SJung-uk Kim.cfi_push %rbp 9857bded2dbSJung-uk Kim push %r12 986e71b7053SJung-uk Kim.cfi_push %r12 9877bded2dbSJung-uk Kim push %r13 988e71b7053SJung-uk Kim.cfi_push %r13 9897bded2dbSJung-uk Kim push %r14 990e71b7053SJung-uk Kim.cfi_push %r14 9917bded2dbSJung-uk Kim push %r15 992e71b7053SJung-uk Kim.cfi_push %r15 9937bded2dbSJung-uk Kim___ 9947bded2dbSJung-uk Kim$code.=<<___ if ($win64); 9957bded2dbSJung-uk Kim lea -0xa8(%rsp),%rsp 9967bded2dbSJung-uk Kim movaps %xmm6,(%rsp) 9977bded2dbSJung-uk Kim movaps %xmm7,0x10(%rsp) 9987bded2dbSJung-uk Kim movaps %xmm8,0x20(%rsp) 9997bded2dbSJung-uk Kim movaps %xmm9,0x30(%rsp) 10007bded2dbSJung-uk Kim movaps %xmm10,0x40(%rsp) 10017bded2dbSJung-uk Kim movaps %xmm11,0x50(%rsp) 10027bded2dbSJung-uk Kim movaps %xmm12,-0x78(%rax) 10037bded2dbSJung-uk Kim movaps %xmm13,-0x68(%rax) 10047bded2dbSJung-uk Kim movaps %xmm14,-0x58(%rax) 10057bded2dbSJung-uk Kim movaps %xmm15,-0x48(%rax) 10067bded2dbSJung-uk Kim___ 10077bded2dbSJung-uk Kim$code.=<<___; 10087bded2dbSJung-uk Kim # stack layout 10097bded2dbSJung-uk Kim # 10107bded2dbSJung-uk Kim # +0 output sink 10117bded2dbSJung-uk Kim # +16 input sink [original %rsp and $num] 10127bded2dbSJung-uk Kim # +32 counters 10137bded2dbSJung-uk Kim # +64 distances between inputs and outputs 10147bded2dbSJung-uk Kim # +128 off-load area for @inp[0..3] 10157bded2dbSJung-uk 
Kim # +192 IV/input offload 10167bded2dbSJung-uk Kim 10177bded2dbSJung-uk Kim sub \$256,%rsp 10187bded2dbSJung-uk Kim and \$-256,%rsp 10197bded2dbSJung-uk Kim sub \$192,%rsp 10207bded2dbSJung-uk Kim mov %rax,16(%rsp) # original %rsp 1021e71b7053SJung-uk Kim.cfi_cfa_expression %rsp+16,deref,+8 10227bded2dbSJung-uk Kim 10237bded2dbSJung-uk Kim.Ldec8x_body: 10247bded2dbSJung-uk Kim vzeroupper 10257bded2dbSJung-uk Kim vmovdqu ($key),$zero # 0-round key 10267bded2dbSJung-uk Kim lea 0x78($key),$key # size optimization 1027*b077aed3SPierre Pronchery lea `$inp_elm_size*4`($inp),$inp 10287bded2dbSJung-uk Kim shr \$1,$num 10297bded2dbSJung-uk Kim 10307bded2dbSJung-uk Kim.Ldec8x_loop_grande: 10317bded2dbSJung-uk Kim #mov $num,24(%rsp) # original $num 10327bded2dbSJung-uk Kim xor $num,$num 10337bded2dbSJung-uk Kim___ 10347bded2dbSJung-uk Kimfor($i=0;$i<8;$i++) { 10357bded2dbSJung-uk Kim my $temp = $i ? $offload : $offset; 1036*b077aed3SPierre Pronchery $ptr_reg=&pointer_register($flavour,@ptr[$i]); 1037*b077aed3SPierre Pronchery $temp_reg=&pointer_register($flavour,$temp); 10387bded2dbSJung-uk Kim $code.=<<___; 1039*b077aed3SPierre Pronchery # borrow $one for number of blocks 1040*b077aed3SPierre Pronchery mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*4`($inp),$one 1041*b077aed3SPierre Pronchery # input pointer 1042*b077aed3SPierre Pronchery mov `$inp_elm_size*$i+0-$inp_elm_size*4`($inp),$ptr_reg 10437bded2dbSJung-uk Kim cmp $num,$one 1044*b077aed3SPierre Pronchery # output pointer 1045*b077aed3SPierre Pronchery mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*4`($inp),$temp_reg 10467bded2dbSJung-uk Kim cmovg $one,$num # find maximum 10477bded2dbSJung-uk Kim test $one,$one 1048*b077aed3SPierre Pronchery # load IV 1049*b077aed3SPierre Pronchery vmovdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*4`($inp),@out[$i] 10507bded2dbSJung-uk Kim mov $one,`32+4*$i`(%rsp) # initialize counters 10517bded2dbSJung-uk Kim cmovle %rsp,@ptr[$i] # cancel input 10527bded2dbSJung-uk Kim sub 
@ptr[$i],$temp # distance between input and output 10537bded2dbSJung-uk Kim mov $temp,`64+8*$i`(%rsp) # initialize distances 10547bded2dbSJung-uk Kim vmovdqu @out[$i],`192+16*$i`(%rsp) # offload IV 10557bded2dbSJung-uk Kim___ 10567bded2dbSJung-uk Kim} 10577bded2dbSJung-uk Kim$code.=<<___; 10587bded2dbSJung-uk Kim test $num,$num 10597bded2dbSJung-uk Kim jz .Ldec8x_done 10607bded2dbSJung-uk Kim 10617bded2dbSJung-uk Kim vmovups 0x10-0x78($key),$rndkey1 10627bded2dbSJung-uk Kim vmovups 0x20-0x78($key),$rndkey0 10637bded2dbSJung-uk Kim mov 0xf0-0x78($key),$rounds 10647bded2dbSJung-uk Kim lea 192+128(%rsp),$offload # offload area 10657bded2dbSJung-uk Kim 10667bded2dbSJung-uk Kim vmovdqu (@ptr[0]),@out[0] # load inputs 10677bded2dbSJung-uk Kim vmovdqu (@ptr[1]),@out[1] 10687bded2dbSJung-uk Kim vmovdqu (@ptr[2]),@out[2] 10697bded2dbSJung-uk Kim vmovdqu (@ptr[3]),@out[3] 10707bded2dbSJung-uk Kim vmovdqu (@ptr[4]),@out[4] 10717bded2dbSJung-uk Kim vmovdqu (@ptr[5]),@out[5] 10727bded2dbSJung-uk Kim vmovdqu (@ptr[6]),@out[6] 10737bded2dbSJung-uk Kim vmovdqu (@ptr[7]),@out[7] 10747bded2dbSJung-uk Kim vmovdqu @out[0],0x00($offload) # offload inputs 10757bded2dbSJung-uk Kim vpxor $zero,@out[0],@out[0] # xor inputs with 0-round 10767bded2dbSJung-uk Kim vmovdqu @out[1],0x10($offload) 10777bded2dbSJung-uk Kim vpxor $zero,@out[1],@out[1] 10787bded2dbSJung-uk Kim vmovdqu @out[2],0x20($offload) 10797bded2dbSJung-uk Kim vpxor $zero,@out[2],@out[2] 10807bded2dbSJung-uk Kim vmovdqu @out[3],0x30($offload) 10817bded2dbSJung-uk Kim vpxor $zero,@out[3],@out[3] 10827bded2dbSJung-uk Kim vmovdqu @out[4],0x40($offload) 10837bded2dbSJung-uk Kim vpxor $zero,@out[4],@out[4] 10847bded2dbSJung-uk Kim vmovdqu @out[5],0x50($offload) 10857bded2dbSJung-uk Kim vpxor $zero,@out[5],@out[5] 10867bded2dbSJung-uk Kim vmovdqu @out[6],0x60($offload) 10877bded2dbSJung-uk Kim vpxor $zero,@out[6],@out[6] 10887bded2dbSJung-uk Kim vmovdqu @out[7],0x70($offload) 10897bded2dbSJung-uk Kim vpxor $zero,@out[7],@out[7] 
10907bded2dbSJung-uk Kim xor \$0x80,$offload 10917bded2dbSJung-uk Kim mov \$1,$one # constant of 1 10927bded2dbSJung-uk Kim jmp .Loop_dec8x 10937bded2dbSJung-uk Kim 10947bded2dbSJung-uk Kim.align 32 10957bded2dbSJung-uk Kim.Loop_dec8x: 10967bded2dbSJung-uk Kim___ 10977bded2dbSJung-uk Kimfor($i=0;$i<8;$i++) { 10987bded2dbSJung-uk Kimmy $rndkey=($i&1)?$rndkey0:$rndkey1; 10997bded2dbSJung-uk Kim$code.=<<___; 11007bded2dbSJung-uk Kim vaesdec $rndkey,@out[0],@out[0] 11017bded2dbSJung-uk Kim cmp 32+4*$i(%rsp),$one 11027bded2dbSJung-uk Kim___ 11037bded2dbSJung-uk Kim$code.=<<___ if ($i); 11047bded2dbSJung-uk Kim mov 64+8*$i(%rsp),$offset 11057bded2dbSJung-uk Kim___ 11067bded2dbSJung-uk Kim$code.=<<___; 11077bded2dbSJung-uk Kim vaesdec $rndkey,@out[1],@out[1] 11087bded2dbSJung-uk Kim prefetcht0 31(@ptr[$i]) # prefetch input 11097bded2dbSJung-uk Kim vaesdec $rndkey,@out[2],@out[2] 11107bded2dbSJung-uk Kim___ 11117bded2dbSJung-uk Kim$code.=<<___ if ($i>1); 11127bded2dbSJung-uk Kim prefetcht0 15(@ptr[$i-2]) # prefetch output 11137bded2dbSJung-uk Kim___ 11147bded2dbSJung-uk Kim$code.=<<___; 11157bded2dbSJung-uk Kim vaesdec $rndkey,@out[3],@out[3] 11167bded2dbSJung-uk Kim lea (@ptr[$i],$offset),$offset 11177bded2dbSJung-uk Kim cmovge %rsp,@ptr[$i] # cancel input 11187bded2dbSJung-uk Kim vaesdec $rndkey,@out[4],@out[4] 11197bded2dbSJung-uk Kim cmovg %rsp,$offset # sink output 11207bded2dbSJung-uk Kim vaesdec $rndkey,@out[5],@out[5] 11217bded2dbSJung-uk Kim sub @ptr[$i],$offset 11227bded2dbSJung-uk Kim vaesdec $rndkey,@out[6],@out[6] 11237bded2dbSJung-uk Kim vmovdqu 16(@ptr[$i]),@inp[$i%4] # load input 11247bded2dbSJung-uk Kim mov $offset,64+8*$i(%rsp) 11257bded2dbSJung-uk Kim vaesdec $rndkey,@out[7],@out[7] 11267bded2dbSJung-uk Kim vmovups `16*(3+$i)-0x78`($key),$rndkey 11277bded2dbSJung-uk Kim lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output 11287bded2dbSJung-uk Kim___ 11297bded2dbSJung-uk Kim$code.=<<___ if ($i<4); 11307bded2dbSJung-uk Kim vmovdqu 
@inp[$i%4],`128+16*$i`(%rsp) # off-load 11317bded2dbSJung-uk Kim___ 11327bded2dbSJung-uk Kim} 11337bded2dbSJung-uk Kim$code.=<<___; 11347bded2dbSJung-uk Kim vmovdqu 32(%rsp),$counters 11357bded2dbSJung-uk Kim prefetcht0 15(@ptr[$i-2]) # prefetch output 11367bded2dbSJung-uk Kim prefetcht0 15(@ptr[$i-1]) 11377bded2dbSJung-uk Kim cmp \$11,$rounds 11387bded2dbSJung-uk Kim jb .Ldec8x_tail 11397bded2dbSJung-uk Kim 11407bded2dbSJung-uk Kim vaesdec $rndkey1,@out[0],@out[0] 11417bded2dbSJung-uk Kim vaesdec $rndkey1,@out[1],@out[1] 11427bded2dbSJung-uk Kim vaesdec $rndkey1,@out[2],@out[2] 11437bded2dbSJung-uk Kim vaesdec $rndkey1,@out[3],@out[3] 11447bded2dbSJung-uk Kim vaesdec $rndkey1,@out[4],@out[4] 11457bded2dbSJung-uk Kim vaesdec $rndkey1,@out[5],@out[5] 11467bded2dbSJung-uk Kim vaesdec $rndkey1,@out[6],@out[6] 11477bded2dbSJung-uk Kim vaesdec $rndkey1,@out[7],@out[7] 11487bded2dbSJung-uk Kim vmovups 0xb0-0x78($key),$rndkey1 11497bded2dbSJung-uk Kim 11507bded2dbSJung-uk Kim vaesdec $rndkey0,@out[0],@out[0] 11517bded2dbSJung-uk Kim vaesdec $rndkey0,@out[1],@out[1] 11527bded2dbSJung-uk Kim vaesdec $rndkey0,@out[2],@out[2] 11537bded2dbSJung-uk Kim vaesdec $rndkey0,@out[3],@out[3] 11547bded2dbSJung-uk Kim vaesdec $rndkey0,@out[4],@out[4] 11557bded2dbSJung-uk Kim vaesdec $rndkey0,@out[5],@out[5] 11567bded2dbSJung-uk Kim vaesdec $rndkey0,@out[6],@out[6] 11577bded2dbSJung-uk Kim vaesdec $rndkey0,@out[7],@out[7] 11587bded2dbSJung-uk Kim vmovups 0xc0-0x78($key),$rndkey0 11597bded2dbSJung-uk Kim je .Ldec8x_tail 11607bded2dbSJung-uk Kim 11617bded2dbSJung-uk Kim vaesdec $rndkey1,@out[0],@out[0] 11627bded2dbSJung-uk Kim vaesdec $rndkey1,@out[1],@out[1] 11637bded2dbSJung-uk Kim vaesdec $rndkey1,@out[2],@out[2] 11647bded2dbSJung-uk Kim vaesdec $rndkey1,@out[3],@out[3] 11657bded2dbSJung-uk Kim vaesdec $rndkey1,@out[4],@out[4] 11667bded2dbSJung-uk Kim vaesdec $rndkey1,@out[5],@out[5] 11677bded2dbSJung-uk Kim vaesdec $rndkey1,@out[6],@out[6] 11687bded2dbSJung-uk Kim vaesdec 
$rndkey1,@out[7],@out[7] 11697bded2dbSJung-uk Kim vmovups 0xd0-0x78($key),$rndkey1 11707bded2dbSJung-uk Kim 11717bded2dbSJung-uk Kim vaesdec $rndkey0,@out[0],@out[0] 11727bded2dbSJung-uk Kim vaesdec $rndkey0,@out[1],@out[1] 11737bded2dbSJung-uk Kim vaesdec $rndkey0,@out[2],@out[2] 11747bded2dbSJung-uk Kim vaesdec $rndkey0,@out[3],@out[3] 11757bded2dbSJung-uk Kim vaesdec $rndkey0,@out[4],@out[4] 11767bded2dbSJung-uk Kim vaesdec $rndkey0,@out[5],@out[5] 11777bded2dbSJung-uk Kim vaesdec $rndkey0,@out[6],@out[6] 11787bded2dbSJung-uk Kim vaesdec $rndkey0,@out[7],@out[7] 11797bded2dbSJung-uk Kim vmovups 0xe0-0x78($key),$rndkey0 11807bded2dbSJung-uk Kim 11817bded2dbSJung-uk Kim.Ldec8x_tail: 11827bded2dbSJung-uk Kim vaesdec $rndkey1,@out[0],@out[0] 11837bded2dbSJung-uk Kim vpxor $zero,$zero,$zero 11847bded2dbSJung-uk Kim vaesdec $rndkey1,@out[1],@out[1] 11857bded2dbSJung-uk Kim vaesdec $rndkey1,@out[2],@out[2] 11867bded2dbSJung-uk Kim vpcmpgtd $zero,$counters,$zero 11877bded2dbSJung-uk Kim vaesdec $rndkey1,@out[3],@out[3] 11887bded2dbSJung-uk Kim vaesdec $rndkey1,@out[4],@out[4] 11897bded2dbSJung-uk Kim vpaddd $counters,$zero,$zero # decrement counters 11907bded2dbSJung-uk Kim vmovdqu 48(%rsp),$counters 11917bded2dbSJung-uk Kim vaesdec $rndkey1,@out[5],@out[5] 11927bded2dbSJung-uk Kim mov 64(%rsp),$offset # pre-load 1st offset 11937bded2dbSJung-uk Kim vaesdec $rndkey1,@out[6],@out[6] 11947bded2dbSJung-uk Kim vaesdec $rndkey1,@out[7],@out[7] 11957bded2dbSJung-uk Kim vmovups 0x10-0x78($key),$rndkey1 11967bded2dbSJung-uk Kim 11977bded2dbSJung-uk Kim vaesdeclast $rndkey0,@out[0],@out[0] 11987bded2dbSJung-uk Kim vmovdqa $zero,32(%rsp) # update counters 11997bded2dbSJung-uk Kim vpxor $zero,$zero,$zero 12007bded2dbSJung-uk Kim vaesdeclast $rndkey0,@out[1],@out[1] 12017bded2dbSJung-uk Kim vpxor 0x00($offload),@out[0],@out[0] # xor with IV 12027bded2dbSJung-uk Kim vaesdeclast $rndkey0,@out[2],@out[2] 12037bded2dbSJung-uk Kim vpxor 0x10($offload),@out[1],@out[1] 12047bded2dbSJung-uk 
Kim vpcmpgtd $zero,$counters,$zero 12057bded2dbSJung-uk Kim vaesdeclast $rndkey0,@out[3],@out[3] 12067bded2dbSJung-uk Kim vpxor 0x20($offload),@out[2],@out[2] 12077bded2dbSJung-uk Kim vaesdeclast $rndkey0,@out[4],@out[4] 12087bded2dbSJung-uk Kim vpxor 0x30($offload),@out[3],@out[3] 12097bded2dbSJung-uk Kim vpaddd $zero,$counters,$counters # decrement counters 12107bded2dbSJung-uk Kim vmovdqu -0x78($key),$zero # 0-round 12117bded2dbSJung-uk Kim vaesdeclast $rndkey0,@out[5],@out[5] 12127bded2dbSJung-uk Kim vpxor 0x40($offload),@out[4],@out[4] 12137bded2dbSJung-uk Kim vaesdeclast $rndkey0,@out[6],@out[6] 12147bded2dbSJung-uk Kim vpxor 0x50($offload),@out[5],@out[5] 12157bded2dbSJung-uk Kim vmovdqa $counters,48(%rsp) # update counters 12167bded2dbSJung-uk Kim vaesdeclast $rndkey0,@out[7],@out[7] 12177bded2dbSJung-uk Kim vpxor 0x60($offload),@out[6],@out[6] 12187bded2dbSJung-uk Kim vmovups 0x20-0x78($key),$rndkey0 12197bded2dbSJung-uk Kim 12207bded2dbSJung-uk Kim vmovups @out[0],-16(@ptr[0]) # write output 12217bded2dbSJung-uk Kim sub $offset,@ptr[0] # switch to input 12227bded2dbSJung-uk Kim vmovdqu 128+0(%rsp),@out[0] 12237bded2dbSJung-uk Kim vpxor 0x70($offload),@out[7],@out[7] 12247bded2dbSJung-uk Kim vmovups @out[1],-16(@ptr[1]) 12257bded2dbSJung-uk Kim sub `64+1*8`(%rsp),@ptr[1] 12267bded2dbSJung-uk Kim vmovdqu @out[0],0x00($offload) 12277bded2dbSJung-uk Kim vpxor $zero,@out[0],@out[0] 12287bded2dbSJung-uk Kim vmovdqu 128+16(%rsp),@out[1] 12297bded2dbSJung-uk Kim vmovups @out[2],-16(@ptr[2]) 12307bded2dbSJung-uk Kim sub `64+2*8`(%rsp),@ptr[2] 12317bded2dbSJung-uk Kim vmovdqu @out[1],0x10($offload) 12327bded2dbSJung-uk Kim vpxor $zero,@out[1],@out[1] 12337bded2dbSJung-uk Kim vmovdqu 128+32(%rsp),@out[2] 12347bded2dbSJung-uk Kim vmovups @out[3],-16(@ptr[3]) 12357bded2dbSJung-uk Kim sub `64+3*8`(%rsp),@ptr[3] 12367bded2dbSJung-uk Kim vmovdqu @out[2],0x20($offload) 12377bded2dbSJung-uk Kim vpxor $zero,@out[2],@out[2] 12387bded2dbSJung-uk Kim vmovdqu 
128+48(%rsp),@out[3] 12397bded2dbSJung-uk Kim vmovups @out[4],-16(@ptr[4]) 12407bded2dbSJung-uk Kim sub `64+4*8`(%rsp),@ptr[4] 12417bded2dbSJung-uk Kim vmovdqu @out[3],0x30($offload) 12427bded2dbSJung-uk Kim vpxor $zero,@out[3],@out[3] 12437bded2dbSJung-uk Kim vmovdqu @inp[0],0x40($offload) 12447bded2dbSJung-uk Kim vpxor @inp[0],$zero,@out[4] 12457bded2dbSJung-uk Kim vmovups @out[5],-16(@ptr[5]) 12467bded2dbSJung-uk Kim sub `64+5*8`(%rsp),@ptr[5] 12477bded2dbSJung-uk Kim vmovdqu @inp[1],0x50($offload) 12487bded2dbSJung-uk Kim vpxor @inp[1],$zero,@out[5] 12497bded2dbSJung-uk Kim vmovups @out[6],-16(@ptr[6]) 12507bded2dbSJung-uk Kim sub `64+6*8`(%rsp),@ptr[6] 12517bded2dbSJung-uk Kim vmovdqu @inp[2],0x60($offload) 12527bded2dbSJung-uk Kim vpxor @inp[2],$zero,@out[6] 12537bded2dbSJung-uk Kim vmovups @out[7],-16(@ptr[7]) 12547bded2dbSJung-uk Kim sub `64+7*8`(%rsp),@ptr[7] 12557bded2dbSJung-uk Kim vmovdqu @inp[3],0x70($offload) 12567bded2dbSJung-uk Kim vpxor @inp[3],$zero,@out[7] 12577bded2dbSJung-uk Kim 12587bded2dbSJung-uk Kim xor \$128,$offload 12597bded2dbSJung-uk Kim dec $num 12607bded2dbSJung-uk Kim jnz .Loop_dec8x 12617bded2dbSJung-uk Kim 12627bded2dbSJung-uk Kim mov 16(%rsp),%rax # original %rsp 1263e71b7053SJung-uk Kim.cfi_def_cfa %rax,8 12647bded2dbSJung-uk Kim #mov 24(%rsp),$num 1265*b077aed3SPierre Pronchery #lea `$inp_elm_size*8`($inp),$inp 12667bded2dbSJung-uk Kim #dec $num 12677bded2dbSJung-uk Kim #jnz .Ldec8x_loop_grande 12687bded2dbSJung-uk Kim 12697bded2dbSJung-uk Kim.Ldec8x_done: 12707bded2dbSJung-uk Kim vzeroupper 12717bded2dbSJung-uk Kim___ 12727bded2dbSJung-uk Kim$code.=<<___ if ($win64); 12737bded2dbSJung-uk Kim movaps -0xd8(%rax),%xmm6 12747bded2dbSJung-uk Kim movaps -0xc8(%rax),%xmm7 12757bded2dbSJung-uk Kim movaps -0xb8(%rax),%xmm8 12767bded2dbSJung-uk Kim movaps -0xa8(%rax),%xmm9 12777bded2dbSJung-uk Kim movaps -0x98(%rax),%xmm10 12787bded2dbSJung-uk Kim movaps -0x88(%rax),%xmm11 12797bded2dbSJung-uk Kim movaps -0x78(%rax),%xmm12 
12807bded2dbSJung-uk Kim movaps -0x68(%rax),%xmm13 12817bded2dbSJung-uk Kim movaps -0x58(%rax),%xmm14 12827bded2dbSJung-uk Kim movaps -0x48(%rax),%xmm15 12837bded2dbSJung-uk Kim___ 12847bded2dbSJung-uk Kim$code.=<<___; 12857bded2dbSJung-uk Kim mov -48(%rax),%r15 1286e71b7053SJung-uk Kim.cfi_restore %r15 12877bded2dbSJung-uk Kim mov -40(%rax),%r14 1288e71b7053SJung-uk Kim.cfi_restore %r14 12897bded2dbSJung-uk Kim mov -32(%rax),%r13 1290e71b7053SJung-uk Kim.cfi_restore %r13 12917bded2dbSJung-uk Kim mov -24(%rax),%r12 1292e71b7053SJung-uk Kim.cfi_restore %r12 12937bded2dbSJung-uk Kim mov -16(%rax),%rbp 1294e71b7053SJung-uk Kim.cfi_restore %rbp 12957bded2dbSJung-uk Kim mov -8(%rax),%rbx 1296e71b7053SJung-uk Kim.cfi_restore %rbx 12977bded2dbSJung-uk Kim lea (%rax),%rsp 1298e71b7053SJung-uk Kim.cfi_def_cfa_register %rsp 12997bded2dbSJung-uk Kim.Ldec8x_epilogue: 13007bded2dbSJung-uk Kim ret 1301e71b7053SJung-uk Kim.cfi_endproc 13027bded2dbSJung-uk Kim.size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx 13037bded2dbSJung-uk Kim___ 13047bded2dbSJung-uk Kim }}} 13057bded2dbSJung-uk Kim 13067bded2dbSJung-uk Kimif ($win64) { 13077bded2dbSJung-uk Kim# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 13087bded2dbSJung-uk Kim# CONTEXT *context,DISPATCHER_CONTEXT *disp) 13097bded2dbSJung-uk Kim$rec="%rcx"; 13107bded2dbSJung-uk Kim$frame="%rdx"; 13117bded2dbSJung-uk Kim$context="%r8"; 13127bded2dbSJung-uk Kim$disp="%r9"; 13137bded2dbSJung-uk Kim 13147bded2dbSJung-uk Kim$code.=<<___; 13157bded2dbSJung-uk Kim.extern __imp_RtlVirtualUnwind 13167bded2dbSJung-uk Kim.type se_handler,\@abi-omnipotent 13177bded2dbSJung-uk Kim.align 16 13187bded2dbSJung-uk Kimse_handler: 13197bded2dbSJung-uk Kim push %rsi 13207bded2dbSJung-uk Kim push %rdi 13217bded2dbSJung-uk Kim push %rbx 13227bded2dbSJung-uk Kim push %rbp 13237bded2dbSJung-uk Kim push %r12 13247bded2dbSJung-uk Kim push %r13 13257bded2dbSJung-uk Kim push %r14 13267bded2dbSJung-uk Kim push %r15 
13277bded2dbSJung-uk Kim pushfq 13287bded2dbSJung-uk Kim sub \$64,%rsp 13297bded2dbSJung-uk Kim 13307bded2dbSJung-uk Kim mov 120($context),%rax # pull context->Rax 13317bded2dbSJung-uk Kim mov 248($context),%rbx # pull context->Rip 13327bded2dbSJung-uk Kim 13337bded2dbSJung-uk Kim mov 8($disp),%rsi # disp->ImageBase 13347bded2dbSJung-uk Kim mov 56($disp),%r11 # disp->HandlerData 13357bded2dbSJung-uk Kim 13367bded2dbSJung-uk Kim mov 0(%r11),%r10d # HandlerData[0] 13377bded2dbSJung-uk Kim lea (%rsi,%r10),%r10 # prologue label 13387bded2dbSJung-uk Kim cmp %r10,%rbx # context->Rip<.Lprologue 13397bded2dbSJung-uk Kim jb .Lin_prologue 13407bded2dbSJung-uk Kim 13417bded2dbSJung-uk Kim mov 152($context),%rax # pull context->Rsp 13427bded2dbSJung-uk Kim 13437bded2dbSJung-uk Kim mov 4(%r11),%r10d # HandlerData[1] 13447bded2dbSJung-uk Kim lea (%rsi,%r10),%r10 # epilogue label 13457bded2dbSJung-uk Kim cmp %r10,%rbx # context->Rip>=.Lepilogue 13467bded2dbSJung-uk Kim jae .Lin_prologue 13477bded2dbSJung-uk Kim 13487bded2dbSJung-uk Kim mov 16(%rax),%rax # pull saved stack pointer 13497bded2dbSJung-uk Kim 13507bded2dbSJung-uk Kim mov -8(%rax),%rbx 13517bded2dbSJung-uk Kim mov -16(%rax),%rbp 13527bded2dbSJung-uk Kim mov -24(%rax),%r12 13537bded2dbSJung-uk Kim mov -32(%rax),%r13 13547bded2dbSJung-uk Kim mov -40(%rax),%r14 13557bded2dbSJung-uk Kim mov -48(%rax),%r15 13567bded2dbSJung-uk Kim mov %rbx,144($context) # restore context->Rbx 13577bded2dbSJung-uk Kim mov %rbp,160($context) # restore context->Rbp 1358e71b7053SJung-uk Kim mov %r12,216($context) # restore context->R12 1359e71b7053SJung-uk Kim mov %r13,224($context) # restore context->R13 1360e71b7053SJung-uk Kim mov %r14,232($context) # restore context->R14 1361e71b7053SJung-uk Kim mov %r15,240($context) # restore context->R15 13627bded2dbSJung-uk Kim 13637bded2dbSJung-uk Kim lea -56-10*16(%rax),%rsi 13647bded2dbSJung-uk Kim lea 512($context),%rdi # &context.Xmm6 13657bded2dbSJung-uk Kim mov \$20,%ecx 13667bded2dbSJung-uk Kim 
.long 0xa548f3fc # cld; rep movsq 13677bded2dbSJung-uk Kim 13687bded2dbSJung-uk Kim.Lin_prologue: 13697bded2dbSJung-uk Kim mov 8(%rax),%rdi 13707bded2dbSJung-uk Kim mov 16(%rax),%rsi 13717bded2dbSJung-uk Kim mov %rax,152($context) # restore context->Rsp 13727bded2dbSJung-uk Kim mov %rsi,168($context) # restore context->Rsi 13737bded2dbSJung-uk Kim mov %rdi,176($context) # restore context->Rdi 13747bded2dbSJung-uk Kim 13757bded2dbSJung-uk Kim mov 40($disp),%rdi # disp->ContextRecord 13767bded2dbSJung-uk Kim mov $context,%rsi # context 13777bded2dbSJung-uk Kim mov \$154,%ecx # sizeof(CONTEXT) 13787bded2dbSJung-uk Kim .long 0xa548f3fc # cld; rep movsq 13797bded2dbSJung-uk Kim 13807bded2dbSJung-uk Kim mov $disp,%rsi 13817bded2dbSJung-uk Kim xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 13827bded2dbSJung-uk Kim mov 8(%rsi),%rdx # arg2, disp->ImageBase 13837bded2dbSJung-uk Kim mov 0(%rsi),%r8 # arg3, disp->ControlPc 13847bded2dbSJung-uk Kim mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 13857bded2dbSJung-uk Kim mov 40(%rsi),%r10 # disp->ContextRecord 13867bded2dbSJung-uk Kim lea 56(%rsi),%r11 # &disp->HandlerData 13877bded2dbSJung-uk Kim lea 24(%rsi),%r12 # &disp->EstablisherFrame 13887bded2dbSJung-uk Kim mov %r10,32(%rsp) # arg5 13897bded2dbSJung-uk Kim mov %r11,40(%rsp) # arg6 13907bded2dbSJung-uk Kim mov %r12,48(%rsp) # arg7 13917bded2dbSJung-uk Kim mov %rcx,56(%rsp) # arg8, (NULL) 13927bded2dbSJung-uk Kim call *__imp_RtlVirtualUnwind(%rip) 13937bded2dbSJung-uk Kim 13947bded2dbSJung-uk Kim mov \$1,%eax # ExceptionContinueSearch 13957bded2dbSJung-uk Kim add \$64,%rsp 13967bded2dbSJung-uk Kim popfq 13977bded2dbSJung-uk Kim pop %r15 13987bded2dbSJung-uk Kim pop %r14 13997bded2dbSJung-uk Kim pop %r13 14007bded2dbSJung-uk Kim pop %r12 14017bded2dbSJung-uk Kim pop %rbp 14027bded2dbSJung-uk Kim pop %rbx 14037bded2dbSJung-uk Kim pop %rdi 14047bded2dbSJung-uk Kim pop %rsi 14057bded2dbSJung-uk Kim ret 14067bded2dbSJung-uk Kim.size se_handler,.-se_handler 14077bded2dbSJung-uk Kim 
14087bded2dbSJung-uk Kim.section .pdata 14097bded2dbSJung-uk Kim.align 4 14107bded2dbSJung-uk Kim .rva .LSEH_begin_aesni_multi_cbc_encrypt 14117bded2dbSJung-uk Kim .rva .LSEH_end_aesni_multi_cbc_encrypt 14127bded2dbSJung-uk Kim .rva .LSEH_info_aesni_multi_cbc_encrypt 14137bded2dbSJung-uk Kim .rva .LSEH_begin_aesni_multi_cbc_decrypt 14147bded2dbSJung-uk Kim .rva .LSEH_end_aesni_multi_cbc_decrypt 14157bded2dbSJung-uk Kim .rva .LSEH_info_aesni_multi_cbc_decrypt 14167bded2dbSJung-uk Kim___ 14177bded2dbSJung-uk Kim$code.=<<___ if ($avx); 14187bded2dbSJung-uk Kim .rva .LSEH_begin_aesni_multi_cbc_encrypt_avx 14197bded2dbSJung-uk Kim .rva .LSEH_end_aesni_multi_cbc_encrypt_avx 14207bded2dbSJung-uk Kim .rva .LSEH_info_aesni_multi_cbc_encrypt_avx 14217bded2dbSJung-uk Kim .rva .LSEH_begin_aesni_multi_cbc_decrypt_avx 14227bded2dbSJung-uk Kim .rva .LSEH_end_aesni_multi_cbc_decrypt_avx 14237bded2dbSJung-uk Kim .rva .LSEH_info_aesni_multi_cbc_decrypt_avx 14247bded2dbSJung-uk Kim___ 14257bded2dbSJung-uk Kim$code.=<<___; 14267bded2dbSJung-uk Kim.section .xdata 14277bded2dbSJung-uk Kim.align 8 14287bded2dbSJung-uk Kim.LSEH_info_aesni_multi_cbc_encrypt: 14297bded2dbSJung-uk Kim .byte 9,0,0,0 14307bded2dbSJung-uk Kim .rva se_handler 14317bded2dbSJung-uk Kim .rva .Lenc4x_body,.Lenc4x_epilogue # HandlerData[] 14327bded2dbSJung-uk Kim.LSEH_info_aesni_multi_cbc_decrypt: 14337bded2dbSJung-uk Kim .byte 9,0,0,0 14347bded2dbSJung-uk Kim .rva se_handler 14357bded2dbSJung-uk Kim .rva .Ldec4x_body,.Ldec4x_epilogue # HandlerData[] 14367bded2dbSJung-uk Kim___ 14377bded2dbSJung-uk Kim$code.=<<___ if ($avx); 14387bded2dbSJung-uk Kim.LSEH_info_aesni_multi_cbc_encrypt_avx: 14397bded2dbSJung-uk Kim .byte 9,0,0,0 14407bded2dbSJung-uk Kim .rva se_handler 14417bded2dbSJung-uk Kim .rva .Lenc8x_body,.Lenc8x_epilogue # HandlerData[] 14427bded2dbSJung-uk Kim.LSEH_info_aesni_multi_cbc_decrypt_avx: 14437bded2dbSJung-uk Kim .byte 9,0,0,0 14447bded2dbSJung-uk Kim .rva se_handler 14457bded2dbSJung-uk Kim .rva 
.Ldec8x_body,.Ldec8x_epilogue		# HandlerData[]
___
}
####################################################################

# rex(\@opcode, $dst, $src)
#
# Append a REX prefix byte to the array referenced by the first argument
# when either register number needs one: $dst >= 8 contributes bit 0x04
# (REX.R, extends the ModR/M reg field) and $src >= 8 contributes bit 0x01
# (REX.B, extends the ModR/M r/m field). Nothing is appended when both
# register numbers are below 8. The return value is not meaningful.
sub rex {
  my ($opcode,$dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    push @$opcode,$rex|0x40	if($rex);
    return;
}

# aesni($line)
#
# Translate one AES-NI instruction written in AT&T syntax into an
# equivalent ".byte" directive (presumably so that assemblers without
# AES-NI support can still build the module). Recognised forms are:
#
#	aeskeygenassist	$imm,%xmmS,%xmmD
#	aes{imc,enc,enclast,dec,declast}	%xmmS,%xmmD
#	aes{imc,enc,enclast,dec,declast}	[off](%rsp),%xmmD
#
# Returns the ".byte" string for a recognised instruction and the input
# line unchanged for everything else. An unknown mnemonic or an offset that
# is not a plain number is deliberately passed through rather than turned
# into undef: the caller is an s///e substitution, where undef would
# silently erase the instruction from the generated code.
sub aesni {
  my $line=shift;
  my @opcode=(0x66);		# mandatory 0x66 prefix shared by all forms
  my %opcodelet = (
	"aesimc" => 0xdb,
	"aesenc" => 0xdc,	"aesenclast" => 0xdd,
	"aesdec" => 0xde,	"aesdeclast" => 0xdf
  );

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($imm,$src,$dst)=($2,$3,$4);

	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	push @opcode,$imm=~/^0/?oct($imm):$imm;		# imm8
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$src,$dst)=($1,$2,$3);

	# unknown mnemonic: leave it for the assembler to accept or reject
	return $line if (!defined($opcodelet{$mnemonic}));

	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$off,$dst)=($1,$2,$3);

	return $line if (!defined($opcodelet{$mnemonic}));
	# the character class above also admits strings such as "abc";
	# only hand-encode offsets that really are numbers
	return $line if ($off!~/^(?:0x[0-9a-fA-F]+|[0-9]*)$/);

	my $disp=0;				# empty offset means 0(%rsp)
	if ($off ne "") {
	    $disp=$off=~/^0/?oct($off):$off+0;
	}

	push @opcode,0x44 if ($dst>=8);		# REX.R; base %rsp needs no REX.B
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	if ($disp<=0x7f) {
	    # disp8 is sign-extended by the CPU, so it only covers 0..0x7f here
	    push @opcode,0x44|(($dst&7)<<3),0x24;	# ModR/M (mod=01), SIB
	    push @opcode,$disp;				# disp8
	} else {
	    push @opcode,0x84|(($dst&7)<<3),0x24;	# ModR/M (mod=10), SIB
	    push @opcode,unpack("C4",pack("V",$disp));	# disp32, little-endian
	}
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}

# expand `...` expressions embedded in the generated code
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# replace every AES-NI instruction, together with the remainder of its line,
# by the output of aesni()
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

print $code;
close STDOUT or die "error closing STDOUT: $!";