xref: /freebsd/crypto/openssl/crypto/aes/asm/aesni-mb-x86_64.pl (revision b077aed33b7b6aefca7b17ddb250cf521f938613)
17bded2dbSJung-uk Kim#! /usr/bin/env perl
217f01e99SJung-uk Kim# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
3e71b7053SJung-uk Kim#
4*b077aed3SPierre Pronchery# Licensed under the Apache License 2.0 (the "License").  You may not use
5e71b7053SJung-uk Kim# this file except in compliance with the License.  You can obtain a copy
6e71b7053SJung-uk Kim# in the file LICENSE in the source distribution or at
7e71b7053SJung-uk Kim# https://www.openssl.org/source/license.html
8e71b7053SJung-uk Kim
97bded2dbSJung-uk Kim
107bded2dbSJung-uk Kim# ====================================================================
117bded2dbSJung-uk Kim# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
127bded2dbSJung-uk Kim# project. The module is, however, dual licensed under OpenSSL and
137bded2dbSJung-uk Kim# CRYPTOGAMS licenses depending on where you obtain it. For further
147bded2dbSJung-uk Kim# details see http://www.openssl.org/~appro/cryptogams/.
157bded2dbSJung-uk Kim# ====================================================================
167bded2dbSJung-uk Kim
177bded2dbSJung-uk Kim# Multi-buffer AES-NI procedures process several independent buffers
187bded2dbSJung-uk Kim# in parallel by interleaving independent instructions.
197bded2dbSJung-uk Kim#
207bded2dbSJung-uk Kim# Cycles per byte for interleave factor 4:
217bded2dbSJung-uk Kim#
227bded2dbSJung-uk Kim#			asymptotic	measured
237bded2dbSJung-uk Kim#			---------------------------
247bded2dbSJung-uk Kim# Westmere		5.00/4=1.25	5.13/4=1.28
257bded2dbSJung-uk Kim# Atom			15.0/4=3.75	?15.7/4=3.93
267bded2dbSJung-uk Kim# Sandy Bridge		5.06/4=1.27	5.18/4=1.29
277bded2dbSJung-uk Kim# Ivy Bridge		5.06/4=1.27	5.14/4=1.29
287bded2dbSJung-uk Kim# Haswell		4.44/4=1.11	4.44/4=1.11
297bded2dbSJung-uk Kim# Bulldozer		5.75/4=1.44	5.76/4=1.44
307bded2dbSJung-uk Kim#
317bded2dbSJung-uk Kim# Cycles per byte for interleave factor 8 (not implemented for
327bded2dbSJung-uk Kim# pre-AVX processors, where higher interleave factor incidentally
337bded2dbSJung-uk Kim# doesn't result in improvement):
347bded2dbSJung-uk Kim#
357bded2dbSJung-uk Kim#			asymptotic	measured
367bded2dbSJung-uk Kim#			---------------------------
377bded2dbSJung-uk Kim# Sandy Bridge		5.06/8=0.64	7.10/8=0.89(*)
387bded2dbSJung-uk Kim# Ivy Bridge		5.06/8=0.64	7.14/8=0.89(*)
397bded2dbSJung-uk Kim# Haswell		5.00/8=0.63	5.00/8=0.63
407bded2dbSJung-uk Kim# Bulldozer		5.75/8=0.72	5.77/8=0.72
417bded2dbSJung-uk Kim#
427bded2dbSJung-uk Kim# (*)	Sandy/Ivy Bridge are known to handle high interleave factors
437bded2dbSJung-uk Kim#	suboptimally.
447bded2dbSJung-uk Kim
45*b077aed3SPierre Pronchery# $output is the last argument if it looks like a file (it has an extension)
46*b077aed3SPierre Pronchery# $flavour is the first argument if it doesn't look like a file
47*b077aed3SPierre Pronchery$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
48*b077aed3SPierre Pronchery$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
497bded2dbSJung-uk Kim
# Win64-specific code is emitted for nasm/masm/mingw64 flavours, or when the
# output file name ends in ".asm".
507bded2dbSJung-uk Kim$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
517bded2dbSJung-uk Kim
# Take the directory part of this script's path ($0) and look for the
# x86_64-xlate.pl translator next to the script, then under ../../perlasm/.
527bded2dbSJung-uk Kim$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
537bded2dbSJung-uk Kim( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
547bded2dbSJung-uk Kim( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
557bded2dbSJung-uk Kimdie "can't locate x86_64-xlate.pl";
567bded2dbSJung-uk Kim
# Make x86_64-support.pl loadable from the same two directories.
57*b077aed3SPierre Proncherypush(@INC,"${dir}","${dir}../../perlasm");
58*b077aed3SPierre Proncheryrequire "x86_64-support.pl";
59*b077aed3SPierre Pronchery
# pointer_size() is not defined in this file (presumably it comes from
# x86_64-support.pl).  Its result is used below as the byte size of the two
# pointer fields in each input descriptor -- TODO confirm against the helper.
60*b077aed3SPierre Pronchery$ptr_size=&pointer_size($flavour);
61*b077aed3SPierre Pronchery
# $avx records what the assembler in use can handle.  Each probe below sets it
# to the sum of two version comparisons, i.e. 0, 1 or 2.  In the part of the
# file shown here it is only tested for truth, to decide whether the AVX
# dispatch stubs and the AVX code path are emitted at all.
627bded2dbSJung-uk Kim$avx=0;
637bded2dbSJung-uk Kim
# Probe 1: ask $CC's assembler for its version banner and look for GNU as.
647bded2dbSJung-uk Kimif (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
657bded2dbSJung-uk Kim		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
667bded2dbSJung-uk Kim	$avx = ($1>=2.19) + ($1>=2.22);
677bded2dbSJung-uk Kim}
687bded2dbSJung-uk Kim
# Probe 2 (only if still 0, Win64, nasm requested via flavour or $ASM).
697bded2dbSJung-uk Kimif (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
707bded2dbSJung-uk Kim	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
717bded2dbSJung-uk Kim	$avx = ($1>=2.09) + ($1>=2.10);
727bded2dbSJung-uk Kim}
737bded2dbSJung-uk Kim
# Probe 3 (only if still 0, Win64, masm flavour or $ASM naming ml64).
747bded2dbSJung-uk Kimif (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
757bded2dbSJung-uk Kim	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
767bded2dbSJung-uk Kim	$avx = ($1>=10) + ($1>=11);
777bded2dbSJung-uk Kim}
787bded2dbSJung-uk Kim
# Probe 4 (only if still 0): $CC reports a clang/LLVM version.  The version
# number is the second capture group, hence $2.
7963c1bb51SJung-uk Kimif (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
807bded2dbSJung-uk Kim	$avx = ($2>=3.0) + ($2>3.0);
817bded2dbSJung-uk Kim}
827bded2dbSJung-uk Kim
# Start x86_64-xlate.pl under the same perl interpreter ($^X), passing it
# $flavour and $output, and alias STDOUT to the pipe so that everything this
# script prints is fed to the translator.
83*b077aed3SPierre Proncheryopen OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
84*b077aed3SPierre Pronchery    or die "can't call $xlate: $!";
857bded2dbSJung-uk Kim*STDOUT=*OUT;
867bded2dbSJung-uk Kim
877bded2dbSJung-uk Kim# void aesni_multi_cbc_encrypt (
887bded2dbSJung-uk Kim#     struct {	void *inp,*out; int blocks; double iv[2]; } inp[8];
897bded2dbSJung-uk Kim#     const AES_KEY *key,
907bded2dbSJung-uk Kim#     int num);		/* 1 or 2 */
917bded2dbSJung-uk Kim#
# Register names for the three arguments of the routines generated below.
927bded2dbSJung-uk Kim$inp="%rdi";	# 1st arg
937bded2dbSJung-uk Kim$key="%rsi";	# 2nd arg
947bded2dbSJung-uk Kim$num="%edx";
957bded2dbSJung-uk Kim
# Byte size of one input descriptor: two pointers, then 8 bytes holding the
# block count (read as a 32-bit value below), then the 16-byte IV.
96*b077aed3SPierre Pronchery$inp_elm_size=2*$ptr_size+8+16;
97*b077aed3SPierre Pronchery
# Per-lane input and output pointers for the 4-lane code (%r8-%r11, %r12-%r15).
987bded2dbSJung-uk Kim@inptr=map("%r$_",(8..11));
997bded2dbSJung-uk Kim@outptr=map("%r$_",(12..15));
1007bded2dbSJung-uk Kim
# XMM allocation: two alternating round-key registers, four cipher states,
# four input/IV blocks, then the lane counters, a scratch mask and the
# round-0 key.
1017bded2dbSJung-uk Kim($rndkey0,$rndkey1)=("%xmm0","%xmm1");
1027bded2dbSJung-uk Kim@out=map("%xmm$_",(2..5));
1037bded2dbSJung-uk Kim@inp=map("%xmm$_",(6..9));
1047bded2dbSJung-uk Kim($counters,$mask,$zero)=map("%xmm$_",(10..12));
1057bded2dbSJung-uk Kim
# Scalar helpers: round count, the constant 1 (also borrowed as a temporary
# for block counts), the sink pointer for cancelled lanes, and the running
# byte offset into the lanes' buffers.
1067bded2dbSJung-uk Kim($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx");
1077bded2dbSJung-uk Kim
# Emit the section header and the entry point of aesni_multi_cbc_encrypt.
1087bded2dbSJung-uk Kim$code.=<<___;
1097bded2dbSJung-uk Kim.text
1107bded2dbSJung-uk Kim
1117bded2dbSJung-uk Kim.extern	OPENSSL_ia32cap_P
1127bded2dbSJung-uk Kim
1137bded2dbSJung-uk Kim.globl	aesni_multi_cbc_encrypt
1147bded2dbSJung-uk Kim.type	aesni_multi_cbc_encrypt,\@function,3
1157bded2dbSJung-uk Kim.align	32
1167bded2dbSJung-uk Kimaesni_multi_cbc_encrypt:
117e71b7053SJung-uk Kim.cfi_startproc
1187bded2dbSJung-uk Kim___
# Only when the assembler can handle AVX: emit a run-time dispatch.  If $num
# is at least 2 and bit 28 of the dword at OPENSSL_ia32cap_P+4 is set, control
# jumps to _avx_cbc_enc_shortcut; otherwise it continues at .Lenc_non_avx.
1197bded2dbSJung-uk Kim$code.=<<___ if ($avx);
1207bded2dbSJung-uk Kim	cmp	\$2,$num
1217bded2dbSJung-uk Kim	jb	.Lenc_non_avx
1227bded2dbSJung-uk Kim	mov	OPENSSL_ia32cap_P+4(%rip),%ecx
1237bded2dbSJung-uk Kim	test	\$`1<<28`,%ecx			# AVX bit
1247bded2dbSJung-uk Kim	jnz	_avx_cbc_enc_shortcut
1257bded2dbSJung-uk Kim	jmp	.Lenc_non_avx
1267bded2dbSJung-uk Kim.align	16
1277bded2dbSJung-uk Kim.Lenc_non_avx:
1287bded2dbSJung-uk Kim___
# Keep the incoming %rsp in %rax and push the six callee-saved general-purpose
# registers; the epilogue reloads them relative to %rax.
1297bded2dbSJung-uk Kim$code.=<<___;
1307bded2dbSJung-uk Kim	mov	%rsp,%rax
131e71b7053SJung-uk Kim.cfi_def_cfa_register	%rax
1327bded2dbSJung-uk Kim	push	%rbx
133e71b7053SJung-uk Kim.cfi_push	%rbx
1347bded2dbSJung-uk Kim	push	%rbp
135e71b7053SJung-uk Kim.cfi_push	%rbp
1367bded2dbSJung-uk Kim	push	%r12
137e71b7053SJung-uk Kim.cfi_push	%r12
1387bded2dbSJung-uk Kim	push	%r13
139e71b7053SJung-uk Kim.cfi_push	%r13
1407bded2dbSJung-uk Kim	push	%r14
141e71b7053SJung-uk Kim.cfi_push	%r14
1427bded2dbSJung-uk Kim	push	%r15
143e71b7053SJung-uk Kim.cfi_push	%r15
1447bded2dbSJung-uk Kim___
# Win64 only: reserve 0xa8 bytes below the pushes and save %xmm6-%xmm15 there.
# The last three stores address the same area relative to %rax.
1457bded2dbSJung-uk Kim$code.=<<___ if ($win64);
1467bded2dbSJung-uk Kim	lea	-0xa8(%rsp),%rsp
1477bded2dbSJung-uk Kim	movaps	%xmm6,(%rsp)
1487bded2dbSJung-uk Kim	movaps	%xmm7,0x10(%rsp)
1497bded2dbSJung-uk Kim	movaps	%xmm8,0x20(%rsp)
1507bded2dbSJung-uk Kim	movaps	%xmm9,0x30(%rsp)
1517bded2dbSJung-uk Kim	movaps	%xmm10,0x40(%rsp)
1527bded2dbSJung-uk Kim	movaps	%xmm11,0x50(%rsp)
1537bded2dbSJung-uk Kim	movaps	%xmm12,0x60(%rsp)
1547bded2dbSJung-uk Kim	movaps	%xmm13,-0x68(%rax)	# not used, saved to share se_handler
1557bded2dbSJung-uk Kim	movaps	%xmm14,-0x58(%rax)
1567bded2dbSJung-uk Kim	movaps	%xmm15,-0x48(%rax)
1577bded2dbSJung-uk Kim___
# Allocate the 64-byte-aligned scratch frame and store the original %rsp in
# it.  Load the round-0 key, then bias $key by 0x78 and $inp by two descriptor
# sizes; later displacements subtract these biases again.  .Lenc4x_loop_grande
# is the head of the outer loop: it saves $num at 24(%rsp) and clears it so
# the descriptor-loading code that follows can accumulate a maximum in it.
1587bded2dbSJung-uk Kim$code.=<<___;
1597bded2dbSJung-uk Kim	# stack layout
1607bded2dbSJung-uk Kim	#
1617bded2dbSJung-uk Kim	# +0	output sink
1627bded2dbSJung-uk Kim	# +16	input sink [original %rsp and $num]
1637bded2dbSJung-uk Kim	# +32	counters
1647bded2dbSJung-uk Kim
1657bded2dbSJung-uk Kim	sub	\$48,%rsp
1667bded2dbSJung-uk Kim	and	\$-64,%rsp
1677bded2dbSJung-uk Kim	mov	%rax,16(%rsp)			# original %rsp
168e71b7053SJung-uk Kim.cfi_cfa_expression	%rsp+16,deref,+8
1697bded2dbSJung-uk Kim
1707bded2dbSJung-uk Kim.Lenc4x_body:
1717bded2dbSJung-uk Kim	movdqu	($key),$zero			# 0-round key
1727bded2dbSJung-uk Kim	lea	0x78($key),$key			# size optimization
173*b077aed3SPierre Pronchery	lea	$inp_elm_size*2($inp),$inp
1747bded2dbSJung-uk Kim
1757bded2dbSJung-uk Kim.Lenc4x_loop_grande:
1767bded2dbSJung-uk Kim	mov	$num,24(%rsp)			# original $num
1777bded2dbSJung-uk Kim	xor	$num,$num
1787bded2dbSJung-uk Kim___
# For each of the four lanes, emit code that reads descriptor $i: the block
# count (into $one), the input pointer and the output pointer.  $num is raised
# to the largest count seen, the IV is loaded into @out[$i], the count is
# stored in the lane's counter slot at 32+4*$i(%rsp), and the input pointer is
# replaced by %rsp when the count is <= 0.
# pointer_register() is defined outside this file; presumably it returns the
# register name at the flavour's pointer width -- TODO confirm.
1797bded2dbSJung-uk Kimfor($i=0;$i<4;$i++) {
180*b077aed3SPierre Pronchery    $inptr_reg=&pointer_register($flavour,@inptr[$i]);
181*b077aed3SPierre Pronchery    $outptr_reg=&pointer_register($flavour,@outptr[$i]);
1827bded2dbSJung-uk Kim    $code.=<<___;
183*b077aed3SPierre Pronchery	# borrow $one for number of blocks
184*b077aed3SPierre Pronchery	mov	`$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*2`($inp),$one
185*b077aed3SPierre Pronchery	mov	`$inp_elm_size*$i+0-$inp_elm_size*2`($inp),$inptr_reg
1867bded2dbSJung-uk Kim	cmp	$num,$one
187*b077aed3SPierre Pronchery	mov	`$inp_elm_size*$i+$ptr_size-$inp_elm_size*2`($inp),$outptr_reg
1887bded2dbSJung-uk Kim	cmovg	$one,$num			# find maximum
1897bded2dbSJung-uk Kim	test	$one,$one
190*b077aed3SPierre Pronchery	# load IV
191*b077aed3SPierre Pronchery	movdqu	`$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*2`($inp),@out[$i]
1927bded2dbSJung-uk Kim	mov	$one,`32+4*$i`(%rsp)		# initialize counters
1937bded2dbSJung-uk Kim	cmovle	%rsp,@inptr[$i]			# cancel input
1947bded2dbSJung-uk Kim___
1957bded2dbSJung-uk Kim}
# Emit the hand-off into the main 4-lane encryption loop.
#  - If no lane has any blocks ($num is zero), go to .Lenc4x_done.
#  - Otherwise load round keys 1 and 2 and the round count (the dword at
#    offset 0xf0 of the key schedule), XOR each lane's IV with the round-0 key
#    and with its first input block, and load the lane counters.
#  - .Loop_enc4x: advance $offset by one block, recompute the sink pointer so
#    that sink+$offset always lands in the scratch frame, run the first AES
#    round and prefetch the next input of each of the four lanes.
1967bded2dbSJung-uk Kim$code.=<<___;
1977bded2dbSJung-uk Kim	test	$num,$num
1987bded2dbSJung-uk Kim	jz	.Lenc4x_done
1997bded2dbSJung-uk Kim
2007bded2dbSJung-uk Kim	movups	0x10-0x78($key),$rndkey1
2017bded2dbSJung-uk Kim	 pxor	$zero,@out[0]
2027bded2dbSJung-uk Kim	movups	0x20-0x78($key),$rndkey0
2037bded2dbSJung-uk Kim	 pxor	$zero,@out[1]
2047bded2dbSJung-uk Kim	mov	0xf0-0x78($key),$rounds
2057bded2dbSJung-uk Kim	 pxor	$zero,@out[2]
2067bded2dbSJung-uk Kim	movdqu	(@inptr[0]),@inp[0]		# load inputs
2077bded2dbSJung-uk Kim	 pxor	$zero,@out[3]
2087bded2dbSJung-uk Kim	movdqu	(@inptr[1]),@inp[1]
2097bded2dbSJung-uk Kim	 pxor	@inp[0],@out[0]
2107bded2dbSJung-uk Kim	movdqu	(@inptr[2]),@inp[2]
2117bded2dbSJung-uk Kim	 pxor	@inp[1],@out[1]
2127bded2dbSJung-uk Kim	movdqu	(@inptr[3]),@inp[3]
2137bded2dbSJung-uk Kim	 pxor	@inp[2],@out[2]
2147bded2dbSJung-uk Kim	 pxor	@inp[3],@out[3]
2157bded2dbSJung-uk Kim	movdqa	32(%rsp),$counters		# load counters
2167bded2dbSJung-uk Kim	xor	$offset,$offset
2177bded2dbSJung-uk Kim	jmp	.Loop_enc4x
2187bded2dbSJung-uk Kim
2197bded2dbSJung-uk Kim.align	32
2207bded2dbSJung-uk Kim.Loop_enc4x:
2217bded2dbSJung-uk Kim	add	\$16,$offset
2227bded2dbSJung-uk Kim	lea	16(%rsp),$sink			# sink pointer
2237bded2dbSJung-uk Kim	mov	\$1,$one			# constant of 1
2247bded2dbSJung-uk Kim	sub	$offset,$sink
2257bded2dbSJung-uk Kim
2267bded2dbSJung-uk Kim	aesenc		$rndkey1,@out[0]
2277bded2dbSJung-uk Kim	prefetcht0	31(@inptr[0],$offset)	# prefetch input
2287bded2dbSJung-uk Kim	prefetcht0	31(@inptr[1],$offset)
2297bded2dbSJung-uk Kim	aesenc		$rndkey1,@out[1]
2307bded2dbSJung-uk Kim	prefetcht0	31(@inptr[2],$offset)
2317bded2dbSJung-uk Kim	prefetcht0	31(@inptr[3],$offset)
2327bded2dbSJung-uk Kim	aesenc		$rndkey1,@out[2]
2337bded2dbSJung-uk Kim	aesenc		$rndkey1,@out[3]
2347bded2dbSJung-uk Kim	movups		0x30-0x78($key),$rndkey1
2357bded2dbSJung-uk Kim___
# Emit four more AES rounds, one per iteration, alternating between $rndkey0
# (even $i) and $rndkey1 (odd $i).  After use, the same register is reloaded
# with the key at 0x40+16*$i.  Interleaved with round $i, lane $i's counter is
# compared with 1: when the counter is <= 1 the input pointer is redirected to
# the sink, and when it is < 1 the output pointer is redirected as well.
2367bded2dbSJung-uk Kimfor($i=0;$i<4;$i++) {
2377bded2dbSJung-uk Kimmy $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
2387bded2dbSJung-uk Kim$code.=<<___;
2397bded2dbSJung-uk Kim	 cmp		`32+4*$i`(%rsp),$one
2407bded2dbSJung-uk Kim	aesenc		$rndkey,@out[0]
2417bded2dbSJung-uk Kim	aesenc		$rndkey,@out[1]
2427bded2dbSJung-uk Kim	aesenc		$rndkey,@out[2]
2437bded2dbSJung-uk Kim	 cmovge		$sink,@inptr[$i]	# cancel input
2447bded2dbSJung-uk Kim	 cmovg		$sink,@outptr[$i]	# sink output
2457bded2dbSJung-uk Kim	aesenc		$rndkey,@out[3]
2467bded2dbSJung-uk Kim	movups		`0x40+16*$i-0x78`($key),$rndkey
2477bded2dbSJung-uk Kim___
2487bded2dbSJung-uk Kim}
# Emit the rest of the inner loop and the close of the outer loop.
#  - Counter update: $mask becomes all-ones in every lane whose counter is
#    > 0 (pcmpgtd against a temporarily zeroed $zero), and adding it to
#    $counters decrements exactly those lanes.  $zero is then reloaded with
#    the round-0 key.
#  - Key-length branch: $rounds is compared with 11 once; "jb" and "je" skip
#    the two or four extra rounds that longer key schedules need.
#  - .Lenc4x_tail: last regular round, aesenclast, store of the four
#    ciphertext blocks at -16(out,$offset), and load of the next input blocks,
#    which are XORed with the round-0 key and into the states.
#  - After $num inner iterations: recover the original %rsp and $num from the
#    frame, advance $inp by four descriptors and repeat the outer loop while
#    the decremented $num is non-zero.
2497bded2dbSJung-uk Kim$code.=<<___;
2507bded2dbSJung-uk Kim	 movdqa		$counters,$mask
2517bded2dbSJung-uk Kim	aesenc		$rndkey0,@out[0]
2527bded2dbSJung-uk Kim	prefetcht0	15(@outptr[0],$offset)	# prefetch output
2537bded2dbSJung-uk Kim	prefetcht0	15(@outptr[1],$offset)
2547bded2dbSJung-uk Kim	aesenc		$rndkey0,@out[1]
2557bded2dbSJung-uk Kim	prefetcht0	15(@outptr[2],$offset)
2567bded2dbSJung-uk Kim	prefetcht0	15(@outptr[3],$offset)
2577bded2dbSJung-uk Kim	aesenc		$rndkey0,@out[2]
2587bded2dbSJung-uk Kim	aesenc		$rndkey0,@out[3]
2597bded2dbSJung-uk Kim	movups		0x80-0x78($key),$rndkey0
2607bded2dbSJung-uk Kim	 pxor		$zero,$zero
2617bded2dbSJung-uk Kim
2627bded2dbSJung-uk Kim	aesenc		$rndkey1,@out[0]
2637bded2dbSJung-uk Kim	 pcmpgtd	$zero,$mask
2647bded2dbSJung-uk Kim	 movdqu		-0x78($key),$zero	# reload 0-round key
2657bded2dbSJung-uk Kim	aesenc		$rndkey1,@out[1]
2667bded2dbSJung-uk Kim	 paddd		$mask,$counters		# decrement counters
2677bded2dbSJung-uk Kim	 movdqa		$counters,32(%rsp)	# update counters
2687bded2dbSJung-uk Kim	aesenc		$rndkey1,@out[2]
2697bded2dbSJung-uk Kim	aesenc		$rndkey1,@out[3]
2707bded2dbSJung-uk Kim	movups		0x90-0x78($key),$rndkey1
2717bded2dbSJung-uk Kim
2727bded2dbSJung-uk Kim	cmp	\$11,$rounds
2737bded2dbSJung-uk Kim
2747bded2dbSJung-uk Kim	aesenc		$rndkey0,@out[0]
2757bded2dbSJung-uk Kim	aesenc		$rndkey0,@out[1]
2767bded2dbSJung-uk Kim	aesenc		$rndkey0,@out[2]
2777bded2dbSJung-uk Kim	aesenc		$rndkey0,@out[3]
2787bded2dbSJung-uk Kim	movups		0xa0-0x78($key),$rndkey0
2797bded2dbSJung-uk Kim
2807bded2dbSJung-uk Kim	jb	.Lenc4x_tail
2817bded2dbSJung-uk Kim
2827bded2dbSJung-uk Kim	aesenc		$rndkey1,@out[0]
2837bded2dbSJung-uk Kim	aesenc		$rndkey1,@out[1]
2847bded2dbSJung-uk Kim	aesenc		$rndkey1,@out[2]
2857bded2dbSJung-uk Kim	aesenc		$rndkey1,@out[3]
2867bded2dbSJung-uk Kim	movups		0xb0-0x78($key),$rndkey1
2877bded2dbSJung-uk Kim
2887bded2dbSJung-uk Kim	aesenc		$rndkey0,@out[0]
2897bded2dbSJung-uk Kim	aesenc		$rndkey0,@out[1]
2907bded2dbSJung-uk Kim	aesenc		$rndkey0,@out[2]
2917bded2dbSJung-uk Kim	aesenc		$rndkey0,@out[3]
2927bded2dbSJung-uk Kim	movups		0xc0-0x78($key),$rndkey0
2937bded2dbSJung-uk Kim
2947bded2dbSJung-uk Kim	je	.Lenc4x_tail
2957bded2dbSJung-uk Kim
2967bded2dbSJung-uk Kim	aesenc		$rndkey1,@out[0]
2977bded2dbSJung-uk Kim	aesenc		$rndkey1,@out[1]
2987bded2dbSJung-uk Kim	aesenc		$rndkey1,@out[2]
2997bded2dbSJung-uk Kim	aesenc		$rndkey1,@out[3]
3007bded2dbSJung-uk Kim	movups		0xd0-0x78($key),$rndkey1
3017bded2dbSJung-uk Kim
3027bded2dbSJung-uk Kim	aesenc		$rndkey0,@out[0]
3037bded2dbSJung-uk Kim	aesenc		$rndkey0,@out[1]
3047bded2dbSJung-uk Kim	aesenc		$rndkey0,@out[2]
3057bded2dbSJung-uk Kim	aesenc		$rndkey0,@out[3]
3067bded2dbSJung-uk Kim	movups		0xe0-0x78($key),$rndkey0
3077bded2dbSJung-uk Kim	jmp	.Lenc4x_tail
3087bded2dbSJung-uk Kim
3097bded2dbSJung-uk Kim.align	32
3107bded2dbSJung-uk Kim.Lenc4x_tail:
3117bded2dbSJung-uk Kim	aesenc		$rndkey1,@out[0]
3127bded2dbSJung-uk Kim	aesenc		$rndkey1,@out[1]
3137bded2dbSJung-uk Kim	aesenc		$rndkey1,@out[2]
3147bded2dbSJung-uk Kim	aesenc		$rndkey1,@out[3]
3157bded2dbSJung-uk Kim	 movdqu		(@inptr[0],$offset),@inp[0]
3167bded2dbSJung-uk Kim	movdqu		0x10-0x78($key),$rndkey1
3177bded2dbSJung-uk Kim
3187bded2dbSJung-uk Kim	aesenclast	$rndkey0,@out[0]
3197bded2dbSJung-uk Kim	 movdqu		(@inptr[1],$offset),@inp[1]
3207bded2dbSJung-uk Kim	 pxor		$zero,@inp[0]
3217bded2dbSJung-uk Kim	aesenclast	$rndkey0,@out[1]
3227bded2dbSJung-uk Kim	 movdqu		(@inptr[2],$offset),@inp[2]
3237bded2dbSJung-uk Kim	 pxor		$zero,@inp[1]
3247bded2dbSJung-uk Kim	aesenclast	$rndkey0,@out[2]
3257bded2dbSJung-uk Kim	 movdqu		(@inptr[3],$offset),@inp[3]
3267bded2dbSJung-uk Kim	 pxor		$zero,@inp[2]
3277bded2dbSJung-uk Kim	aesenclast	$rndkey0,@out[3]
3287bded2dbSJung-uk Kim	movdqu		0x20-0x78($key),$rndkey0
3297bded2dbSJung-uk Kim	 pxor		$zero,@inp[3]
3307bded2dbSJung-uk Kim
3317bded2dbSJung-uk Kim	movups		@out[0],-16(@outptr[0],$offset)
3327bded2dbSJung-uk Kim	 pxor		@inp[0],@out[0]
3337bded2dbSJung-uk Kim	movups		@out[1],-16(@outptr[1],$offset)
3347bded2dbSJung-uk Kim	 pxor		@inp[1],@out[1]
3357bded2dbSJung-uk Kim	movups		@out[2],-16(@outptr[2],$offset)
3367bded2dbSJung-uk Kim	 pxor		@inp[2],@out[2]
3377bded2dbSJung-uk Kim	movups		@out[3],-16(@outptr[3],$offset)
3387bded2dbSJung-uk Kim	 pxor		@inp[3],@out[3]
3397bded2dbSJung-uk Kim
3407bded2dbSJung-uk Kim	dec	$num
3417bded2dbSJung-uk Kim	jnz	.Loop_enc4x
3427bded2dbSJung-uk Kim
3437bded2dbSJung-uk Kim	mov	16(%rsp),%rax			# original %rsp
344e71b7053SJung-uk Kim.cfi_def_cfa	%rax,8
3457bded2dbSJung-uk Kim	mov	24(%rsp),$num
3467bded2dbSJung-uk Kim
3477bded2dbSJung-uk Kim	#pxor	@inp[0],@out[0]
3487bded2dbSJung-uk Kim	#pxor	@inp[1],@out[1]
349*b077aed3SPierre Pronchery	# output iv FIX ME!
350*b077aed3SPierre Pronchery	#movdqu	@out[0],`$inp_elm_size*0+2*$ptr_size+8-$inp_elm_size*2`($inp)
3517bded2dbSJung-uk Kim	#pxor	@inp[2],@out[2]
352*b077aed3SPierre Pronchery	#movdqu	@out[1],`$inp_elm_size*1+2*$ptr_size+8-$inp_elm_size*2`($inp)
3537bded2dbSJung-uk Kim	#pxor	@inp[3],@out[3]
354*b077aed3SPierre Pronchery	#movdqu	@out[2],`$inp_elm_size*2+2*$ptr_size+8-$inp_elm_size*2`($inp)	# won't fix, let caller
355*b077aed3SPierre Pronchery	#movdqu	@out[3],`$inp_elm_size*3+2*$ptr_size+8-$inp_elm_size*2`($inp)	# figure this out...
3567bded2dbSJung-uk Kim
357*b077aed3SPierre Pronchery	lea	`$inp_elm_size*4`($inp),$inp
3587bded2dbSJung-uk Kim	dec	$num
3597bded2dbSJung-uk Kim	jnz	.Lenc4x_loop_grande
3607bded2dbSJung-uk Kim
3617bded2dbSJung-uk Kim.Lenc4x_done:
3627bded2dbSJung-uk Kim___
# Win64 only: reload %xmm6-%xmm12 from the save area, addressed relative to
# the original %rsp in %rax.  The restores of %xmm13-%xmm15 are left commented
# out; the matching prologue marks those registers as not used by this path.
3637bded2dbSJung-uk Kim$code.=<<___ if ($win64);
3647bded2dbSJung-uk Kim	movaps	-0xd8(%rax),%xmm6
3657bded2dbSJung-uk Kim	movaps	-0xc8(%rax),%xmm7
3667bded2dbSJung-uk Kim	movaps	-0xb8(%rax),%xmm8
3677bded2dbSJung-uk Kim	movaps	-0xa8(%rax),%xmm9
3687bded2dbSJung-uk Kim	movaps	-0x98(%rax),%xmm10
3697bded2dbSJung-uk Kim	movaps	-0x88(%rax),%xmm11
3707bded2dbSJung-uk Kim	movaps	-0x78(%rax),%xmm12
3717bded2dbSJung-uk Kim	#movaps	-0x68(%rax),%xmm13
3727bded2dbSJung-uk Kim	#movaps	-0x58(%rax),%xmm14
3737bded2dbSJung-uk Kim	#movaps	-0x48(%rax),%xmm15
3747bded2dbSJung-uk Kim___
# Emit the encrypt epilogue: reload the six callee-saved registers from just
# below the original %rsp (held in %rax), restore %rsp and return.  The same
# heredoc then opens the definition of aesni_multi_cbc_decrypt.
3757bded2dbSJung-uk Kim$code.=<<___;
3767bded2dbSJung-uk Kim	mov	-48(%rax),%r15
377e71b7053SJung-uk Kim.cfi_restore	%r15
3787bded2dbSJung-uk Kim	mov	-40(%rax),%r14
379e71b7053SJung-uk Kim.cfi_restore	%r14
3807bded2dbSJung-uk Kim	mov	-32(%rax),%r13
381e71b7053SJung-uk Kim.cfi_restore	%r13
3827bded2dbSJung-uk Kim	mov	-24(%rax),%r12
383e71b7053SJung-uk Kim.cfi_restore	%r12
3847bded2dbSJung-uk Kim	mov	-16(%rax),%rbp
385e71b7053SJung-uk Kim.cfi_restore	%rbp
3867bded2dbSJung-uk Kim	mov	-8(%rax),%rbx
387e71b7053SJung-uk Kim.cfi_restore	%rbx
3887bded2dbSJung-uk Kim	lea	(%rax),%rsp
389e71b7053SJung-uk Kim.cfi_def_cfa_register	%rsp
3907bded2dbSJung-uk Kim.Lenc4x_epilogue:
3917bded2dbSJung-uk Kim	ret
392e71b7053SJung-uk Kim.cfi_endproc
3937bded2dbSJung-uk Kim.size	aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
3947bded2dbSJung-uk Kim
3957bded2dbSJung-uk Kim.globl	aesni_multi_cbc_decrypt
3967bded2dbSJung-uk Kim.type	aesni_multi_cbc_decrypt,\@function,3
3977bded2dbSJung-uk Kim.align	32
3987bded2dbSJung-uk Kimaesni_multi_cbc_decrypt:
399e71b7053SJung-uk Kim.cfi_startproc
4007bded2dbSJung-uk Kim___
# Only when the assembler can handle AVX: emit the run-time dispatch for
# decryption.  If $num is at least 2 and bit 28 of the dword at
# OPENSSL_ia32cap_P+4 is set, control jumps to _avx_cbc_dec_shortcut;
# otherwise it continues at .Ldec_non_avx.
4017bded2dbSJung-uk Kim$code.=<<___ if ($avx);
4027bded2dbSJung-uk Kim	cmp	\$2,$num
4037bded2dbSJung-uk Kim	jb	.Ldec_non_avx
4047bded2dbSJung-uk Kim	mov	OPENSSL_ia32cap_P+4(%rip),%ecx
4057bded2dbSJung-uk Kim	test	\$`1<<28`,%ecx			# AVX bit
4067bded2dbSJung-uk Kim	jnz	_avx_cbc_dec_shortcut
4077bded2dbSJung-uk Kim	jmp	.Ldec_non_avx
4087bded2dbSJung-uk Kim.align	16
4097bded2dbSJung-uk Kim.Ldec_non_avx:
4107bded2dbSJung-uk Kim___
# Keep the incoming %rsp in %rax and push the six callee-saved general-purpose
# registers; the epilogue reloads them relative to %rax.
4117bded2dbSJung-uk Kim$code.=<<___;
4127bded2dbSJung-uk Kim	mov	%rsp,%rax
413e71b7053SJung-uk Kim.cfi_def_cfa_register	%rax
4147bded2dbSJung-uk Kim	push	%rbx
415e71b7053SJung-uk Kim.cfi_push	%rbx
4167bded2dbSJung-uk Kim	push	%rbp
417e71b7053SJung-uk Kim.cfi_push	%rbp
4187bded2dbSJung-uk Kim	push	%r12
419e71b7053SJung-uk Kim.cfi_push	%r12
4207bded2dbSJung-uk Kim	push	%r13
421e71b7053SJung-uk Kim.cfi_push	%r13
4227bded2dbSJung-uk Kim	push	%r14
423e71b7053SJung-uk Kim.cfi_push	%r14
4247bded2dbSJung-uk Kim	push	%r15
425e71b7053SJung-uk Kim.cfi_push	%r15
4267bded2dbSJung-uk Kim___
# Win64 only: reserve 0xa8 bytes below the pushes and save %xmm6-%xmm15 there.
# The last three stores address the same area relative to %rax.
4277bded2dbSJung-uk Kim$code.=<<___ if ($win64);
4287bded2dbSJung-uk Kim	lea	-0xa8(%rsp),%rsp
4297bded2dbSJung-uk Kim	movaps	%xmm6,(%rsp)
4307bded2dbSJung-uk Kim	movaps	%xmm7,0x10(%rsp)
4317bded2dbSJung-uk Kim	movaps	%xmm8,0x20(%rsp)
4327bded2dbSJung-uk Kim	movaps	%xmm9,0x30(%rsp)
4337bded2dbSJung-uk Kim	movaps	%xmm10,0x40(%rsp)
4347bded2dbSJung-uk Kim	movaps	%xmm11,0x50(%rsp)
4357bded2dbSJung-uk Kim	movaps	%xmm12,0x60(%rsp)
4367bded2dbSJung-uk Kim	movaps	%xmm13,-0x68(%rax)	# not used, saved to share se_handler
4377bded2dbSJung-uk Kim	movaps	%xmm14,-0x58(%rax)
4387bded2dbSJung-uk Kim	movaps	%xmm15,-0x48(%rax)
4397bded2dbSJung-uk Kim___
# Allocate the 64-byte-aligned scratch frame and store the original %rsp in
# it.  Load the round-0 key, then bias $key by 0x78 and $inp by two descriptor
# sizes; later displacements subtract these biases again.  .Ldec4x_loop_grande
# is the head of the outer loop: it saves $num at 24(%rsp) and clears it so
# the descriptor-loading code that follows can accumulate a maximum in it.
4407bded2dbSJung-uk Kim$code.=<<___;
4417bded2dbSJung-uk Kim	# stack layout
4427bded2dbSJung-uk Kim	#
4437bded2dbSJung-uk Kim	# +0	output sink
4447bded2dbSJung-uk Kim	# +16	input sink [original %rsp and $num]
4457bded2dbSJung-uk Kim	# +32	counters
4467bded2dbSJung-uk Kim
4477bded2dbSJung-uk Kim	sub	\$48,%rsp
4487bded2dbSJung-uk Kim	and	\$-64,%rsp
4497bded2dbSJung-uk Kim	mov	%rax,16(%rsp)			# original %rsp
450e71b7053SJung-uk Kim.cfi_cfa_expression	%rsp+16,deref,+8
4517bded2dbSJung-uk Kim
4527bded2dbSJung-uk Kim.Ldec4x_body:
4537bded2dbSJung-uk Kim	movdqu	($key),$zero			# 0-round key
4547bded2dbSJung-uk Kim	lea	0x78($key),$key			# size optimization
455*b077aed3SPierre Pronchery	lea	$inp_elm_size*2($inp),$inp
4567bded2dbSJung-uk Kim
4577bded2dbSJung-uk Kim.Ldec4x_loop_grande:
4587bded2dbSJung-uk Kim	mov	$num,24(%rsp)			# original $num
4597bded2dbSJung-uk Kim	xor	$num,$num
4607bded2dbSJung-uk Kim___
# For each of the four lanes, emit code that reads descriptor $i: the block
# count (into $one), the input pointer and the output pointer.  $num is raised
# to the largest count seen, the IV is loaded into @inp[$i] (decryption keeps
# the chaining value there, not in the state register), the count is stored in
# the lane's counter slot at 32+4*$i(%rsp), and the input pointer is replaced
# by %rsp when the count is <= 0.
# pointer_register() is defined outside this file; presumably it returns the
# register name at the flavour's pointer width -- TODO confirm.
4617bded2dbSJung-uk Kimfor($i=0;$i<4;$i++) {
462*b077aed3SPierre Pronchery    $inptr_reg=&pointer_register($flavour,@inptr[$i]);
463*b077aed3SPierre Pronchery    $outptr_reg=&pointer_register($flavour,@outptr[$i]);
4647bded2dbSJung-uk Kim    $code.=<<___;
465*b077aed3SPierre Pronchery	# borrow $one for number of blocks
466*b077aed3SPierre Pronchery	mov	`$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*2`($inp),$one
467*b077aed3SPierre Pronchery	mov	`$inp_elm_size*$i+0-$inp_elm_size*2`($inp),$inptr_reg
4687bded2dbSJung-uk Kim	cmp	$num,$one
469*b077aed3SPierre Pronchery	mov	`$inp_elm_size*$i+$ptr_size-$inp_elm_size*2`($inp),$outptr_reg
4707bded2dbSJung-uk Kim	cmovg	$one,$num			# find maximum
4717bded2dbSJung-uk Kim	test	$one,$one
472*b077aed3SPierre Pronchery	# load IV
473*b077aed3SPierre Pronchery	movdqu	`$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*2`($inp),@inp[$i]
4747bded2dbSJung-uk Kim	mov	$one,`32+4*$i`(%rsp)		# initialize counters
4757bded2dbSJung-uk Kim	cmovle	%rsp,@inptr[$i]			# cancel input
4767bded2dbSJung-uk Kim___
4777bded2dbSJung-uk Kim}
# Emit the hand-off into the main 4-lane decryption loop.
#  - If no lane has any blocks ($num is zero), go to .Ldec4x_done.
#  - Otherwise load round keys 1 and 2 and the round count (the dword at
#    offset 0xf0 of the key schedule), load each lane's first input block into
#    its state register and XOR it with the round-0 key, then load the lane
#    counters.
#  - .Loop_dec4x: advance $offset by one block, recompute the sink pointer so
#    that sink+$offset always lands in the scratch frame, run the first round
#    and prefetch the next input of each of the four lanes.
4787bded2dbSJung-uk Kim$code.=<<___;
4797bded2dbSJung-uk Kim	test	$num,$num
4807bded2dbSJung-uk Kim	jz	.Ldec4x_done
4817bded2dbSJung-uk Kim
4827bded2dbSJung-uk Kim	movups	0x10-0x78($key),$rndkey1
4837bded2dbSJung-uk Kim	movups	0x20-0x78($key),$rndkey0
4847bded2dbSJung-uk Kim	mov	0xf0-0x78($key),$rounds
4857bded2dbSJung-uk Kim	movdqu	(@inptr[0]),@out[0]		# load inputs
4867bded2dbSJung-uk Kim	movdqu	(@inptr[1]),@out[1]
4877bded2dbSJung-uk Kim	 pxor	$zero,@out[0]
4887bded2dbSJung-uk Kim	movdqu	(@inptr[2]),@out[2]
4897bded2dbSJung-uk Kim	 pxor	$zero,@out[1]
4907bded2dbSJung-uk Kim	movdqu	(@inptr[3]),@out[3]
4917bded2dbSJung-uk Kim	 pxor	$zero,@out[2]
4927bded2dbSJung-uk Kim	 pxor	$zero,@out[3]
4937bded2dbSJung-uk Kim	movdqa	32(%rsp),$counters		# load counters
4947bded2dbSJung-uk Kim	xor	$offset,$offset
4957bded2dbSJung-uk Kim	jmp	.Loop_dec4x
4967bded2dbSJung-uk Kim
4977bded2dbSJung-uk Kim.align	32
4987bded2dbSJung-uk Kim.Loop_dec4x:
4997bded2dbSJung-uk Kim	add	\$16,$offset
5007bded2dbSJung-uk Kim	lea	16(%rsp),$sink			# sink pointer
5017bded2dbSJung-uk Kim	mov	\$1,$one			# constant of 1
5027bded2dbSJung-uk Kim	sub	$offset,$sink
5037bded2dbSJung-uk Kim
5047bded2dbSJung-uk Kim	aesdec		$rndkey1,@out[0]
5057bded2dbSJung-uk Kim	prefetcht0	31(@inptr[0],$offset)	# prefetch input
5067bded2dbSJung-uk Kim	prefetcht0	31(@inptr[1],$offset)
5077bded2dbSJung-uk Kim	aesdec		$rndkey1,@out[1]
5087bded2dbSJung-uk Kim	prefetcht0	31(@inptr[2],$offset)
5097bded2dbSJung-uk Kim	prefetcht0	31(@inptr[3],$offset)
5107bded2dbSJung-uk Kim	aesdec		$rndkey1,@out[2]
5117bded2dbSJung-uk Kim	aesdec		$rndkey1,@out[3]
5127bded2dbSJung-uk Kim	movups		0x30-0x78($key),$rndkey1
5137bded2dbSJung-uk Kim___
# Emit four more AES rounds, one per iteration, alternating between $rndkey0
# (even $i) and $rndkey1 (odd $i).  After use, the same register is reloaded
# with the key at 0x40+16*$i.  Interleaved with round $i, lane $i's counter is
# compared with 1: when the counter is <= 1 the input pointer is redirected to
# the sink, and when it is < 1 the output pointer is redirected as well.
5147bded2dbSJung-uk Kimfor($i=0;$i<4;$i++) {
5157bded2dbSJung-uk Kimmy $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
5167bded2dbSJung-uk Kim$code.=<<___;
5177bded2dbSJung-uk Kim	 cmp		`32+4*$i`(%rsp),$one
5187bded2dbSJung-uk Kim	aesdec		$rndkey,@out[0]
5197bded2dbSJung-uk Kim	aesdec		$rndkey,@out[1]
5207bded2dbSJung-uk Kim	aesdec		$rndkey,@out[2]
5217bded2dbSJung-uk Kim	 cmovge		$sink,@inptr[$i]	# cancel input
5227bded2dbSJung-uk Kim	 cmovg		$sink,@outptr[$i]	# sink output
5237bded2dbSJung-uk Kim	aesdec		$rndkey,@out[3]
5247bded2dbSJung-uk Kim	movups		`0x40+16*$i-0x78`($key),$rndkey
5257bded2dbSJung-uk Kim___
5267bded2dbSJung-uk Kim}
# Emit the rest of the inner loop and the close of the outer loop.
#  - Counter update: $mask becomes all-ones in every lane whose counter is
#    > 0 (pcmpgtd against a temporarily zeroed $zero), and adding it to
#    $counters decrements exactly those lanes.  $zero is then reloaded with
#    the round-0 key.
#  - Key-length branch: $rounds is compared with 11 once; "jb" and "je" skip
#    the two or four extra rounds that longer key schedules need.
#  - .Ldec4x_tail: the last round key ($rndkey0) is XORed into each lane's
#    chaining value in @inp, and aesdeclast uses the result as its key, so the
#    final round and the CBC XOR happen in one instruction.  The block just
#    consumed, at -16(in,$offset), is loaded as the next chaining value, the
#    plaintext is stored at -16(out,$offset), and the next input block is
#    loaded and XORed with the round-0 key.
#  - After $num inner iterations: recover the original %rsp and $num from the
#    frame, advance $inp by four descriptors and repeat the outer loop while
#    the decremented $num is non-zero.
5277bded2dbSJung-uk Kim$code.=<<___;
5287bded2dbSJung-uk Kim	 movdqa		$counters,$mask
5297bded2dbSJung-uk Kim	aesdec		$rndkey0,@out[0]
5307bded2dbSJung-uk Kim	prefetcht0	15(@outptr[0],$offset)	# prefetch output
5317bded2dbSJung-uk Kim	prefetcht0	15(@outptr[1],$offset)
5327bded2dbSJung-uk Kim	aesdec		$rndkey0,@out[1]
5337bded2dbSJung-uk Kim	prefetcht0	15(@outptr[2],$offset)
5347bded2dbSJung-uk Kim	prefetcht0	15(@outptr[3],$offset)
5357bded2dbSJung-uk Kim	aesdec		$rndkey0,@out[2]
5367bded2dbSJung-uk Kim	aesdec		$rndkey0,@out[3]
5377bded2dbSJung-uk Kim	movups		0x80-0x78($key),$rndkey0
5387bded2dbSJung-uk Kim	 pxor		$zero,$zero
5397bded2dbSJung-uk Kim
5407bded2dbSJung-uk Kim	aesdec		$rndkey1,@out[0]
5417bded2dbSJung-uk Kim	 pcmpgtd	$zero,$mask
5427bded2dbSJung-uk Kim	 movdqu		-0x78($key),$zero	# reload 0-round key
5437bded2dbSJung-uk Kim	aesdec		$rndkey1,@out[1]
5447bded2dbSJung-uk Kim	 paddd		$mask,$counters		# decrement counters
5457bded2dbSJung-uk Kim	 movdqa		$counters,32(%rsp)	# update counters
5467bded2dbSJung-uk Kim	aesdec		$rndkey1,@out[2]
5477bded2dbSJung-uk Kim	aesdec		$rndkey1,@out[3]
5487bded2dbSJung-uk Kim	movups		0x90-0x78($key),$rndkey1
5497bded2dbSJung-uk Kim
5507bded2dbSJung-uk Kim	cmp	\$11,$rounds
5517bded2dbSJung-uk Kim
5527bded2dbSJung-uk Kim	aesdec		$rndkey0,@out[0]
5537bded2dbSJung-uk Kim	aesdec		$rndkey0,@out[1]
5547bded2dbSJung-uk Kim	aesdec		$rndkey0,@out[2]
5557bded2dbSJung-uk Kim	aesdec		$rndkey0,@out[3]
5567bded2dbSJung-uk Kim	movups		0xa0-0x78($key),$rndkey0
5577bded2dbSJung-uk Kim
5587bded2dbSJung-uk Kim	jb	.Ldec4x_tail
5597bded2dbSJung-uk Kim
5607bded2dbSJung-uk Kim	aesdec		$rndkey1,@out[0]
5617bded2dbSJung-uk Kim	aesdec		$rndkey1,@out[1]
5627bded2dbSJung-uk Kim	aesdec		$rndkey1,@out[2]
5637bded2dbSJung-uk Kim	aesdec		$rndkey1,@out[3]
5647bded2dbSJung-uk Kim	movups		0xb0-0x78($key),$rndkey1
5657bded2dbSJung-uk Kim
5667bded2dbSJung-uk Kim	aesdec		$rndkey0,@out[0]
5677bded2dbSJung-uk Kim	aesdec		$rndkey0,@out[1]
5687bded2dbSJung-uk Kim	aesdec		$rndkey0,@out[2]
5697bded2dbSJung-uk Kim	aesdec		$rndkey0,@out[3]
5707bded2dbSJung-uk Kim	movups		0xc0-0x78($key),$rndkey0
5717bded2dbSJung-uk Kim
5727bded2dbSJung-uk Kim	je	.Ldec4x_tail
5737bded2dbSJung-uk Kim
5747bded2dbSJung-uk Kim	aesdec		$rndkey1,@out[0]
5757bded2dbSJung-uk Kim	aesdec		$rndkey1,@out[1]
5767bded2dbSJung-uk Kim	aesdec		$rndkey1,@out[2]
5777bded2dbSJung-uk Kim	aesdec		$rndkey1,@out[3]
5787bded2dbSJung-uk Kim	movups		0xd0-0x78($key),$rndkey1
5797bded2dbSJung-uk Kim
5807bded2dbSJung-uk Kim	aesdec		$rndkey0,@out[0]
5817bded2dbSJung-uk Kim	aesdec		$rndkey0,@out[1]
5827bded2dbSJung-uk Kim	aesdec		$rndkey0,@out[2]
5837bded2dbSJung-uk Kim	aesdec		$rndkey0,@out[3]
5847bded2dbSJung-uk Kim	movups		0xe0-0x78($key),$rndkey0
5857bded2dbSJung-uk Kim	jmp	.Ldec4x_tail
5867bded2dbSJung-uk Kim
5877bded2dbSJung-uk Kim.align	32
5887bded2dbSJung-uk Kim.Ldec4x_tail:
5897bded2dbSJung-uk Kim	aesdec		$rndkey1,@out[0]
5907bded2dbSJung-uk Kim	aesdec		$rndkey1,@out[1]
5917bded2dbSJung-uk Kim	aesdec		$rndkey1,@out[2]
5927bded2dbSJung-uk Kim	 pxor		$rndkey0,@inp[0]
5937bded2dbSJung-uk Kim	 pxor		$rndkey0,@inp[1]
5947bded2dbSJung-uk Kim	aesdec		$rndkey1,@out[3]
5957bded2dbSJung-uk Kim	movdqu		0x10-0x78($key),$rndkey1
5967bded2dbSJung-uk Kim	 pxor		$rndkey0,@inp[2]
5977bded2dbSJung-uk Kim	 pxor		$rndkey0,@inp[3]
5987bded2dbSJung-uk Kim	movdqu		0x20-0x78($key),$rndkey0
5997bded2dbSJung-uk Kim
6007bded2dbSJung-uk Kim	aesdeclast	@inp[0],@out[0]
6017bded2dbSJung-uk Kim	aesdeclast	@inp[1],@out[1]
6027bded2dbSJung-uk Kim	 movdqu		-16(@inptr[0],$offset),@inp[0]	# load next IV
6037bded2dbSJung-uk Kim	 movdqu		-16(@inptr[1],$offset),@inp[1]
6047bded2dbSJung-uk Kim	aesdeclast	@inp[2],@out[2]
6057bded2dbSJung-uk Kim	aesdeclast	@inp[3],@out[3]
6067bded2dbSJung-uk Kim	 movdqu		-16(@inptr[2],$offset),@inp[2]
6077bded2dbSJung-uk Kim	 movdqu		-16(@inptr[3],$offset),@inp[3]
6087bded2dbSJung-uk Kim
6097bded2dbSJung-uk Kim	movups		@out[0],-16(@outptr[0],$offset)
6107bded2dbSJung-uk Kim	 movdqu		(@inptr[0],$offset),@out[0]
6117bded2dbSJung-uk Kim	movups		@out[1],-16(@outptr[1],$offset)
6127bded2dbSJung-uk Kim	 movdqu		(@inptr[1],$offset),@out[1]
6137bded2dbSJung-uk Kim	 pxor		$zero,@out[0]
6147bded2dbSJung-uk Kim	movups		@out[2],-16(@outptr[2],$offset)
6157bded2dbSJung-uk Kim	 movdqu		(@inptr[2],$offset),@out[2]
6167bded2dbSJung-uk Kim	 pxor		$zero,@out[1]
6177bded2dbSJung-uk Kim	movups		@out[3],-16(@outptr[3],$offset)
6187bded2dbSJung-uk Kim	 movdqu		(@inptr[3],$offset),@out[3]
6197bded2dbSJung-uk Kim	 pxor		$zero,@out[2]
6207bded2dbSJung-uk Kim	 pxor		$zero,@out[3]
6217bded2dbSJung-uk Kim
6227bded2dbSJung-uk Kim	dec	$num
6237bded2dbSJung-uk Kim	jnz	.Loop_dec4x
6247bded2dbSJung-uk Kim
6257bded2dbSJung-uk Kim	mov	16(%rsp),%rax			# original %rsp
626e71b7053SJung-uk Kim.cfi_def_cfa	%rax,8
6277bded2dbSJung-uk Kim	mov	24(%rsp),$num
6287bded2dbSJung-uk Kim
629*b077aed3SPierre Pronchery	lea	`$inp_elm_size*4`($inp),$inp
6307bded2dbSJung-uk Kim	dec	$num
6317bded2dbSJung-uk Kim	jnz	.Ldec4x_loop_grande
6327bded2dbSJung-uk Kim
6337bded2dbSJung-uk Kim.Ldec4x_done:
6347bded2dbSJung-uk Kim___
# Win64 only: reload %xmm6-%xmm12 from the save area, addressed relative to
# the original %rsp in %rax.  The restores of %xmm13-%xmm15 are left commented
# out; the matching prologue marks those registers as not used by this path.
6357bded2dbSJung-uk Kim$code.=<<___ if ($win64);
6367bded2dbSJung-uk Kim	movaps	-0xd8(%rax),%xmm6
6377bded2dbSJung-uk Kim	movaps	-0xc8(%rax),%xmm7
6387bded2dbSJung-uk Kim	movaps	-0xb8(%rax),%xmm8
6397bded2dbSJung-uk Kim	movaps	-0xa8(%rax),%xmm9
6407bded2dbSJung-uk Kim	movaps	-0x98(%rax),%xmm10
6417bded2dbSJung-uk Kim	movaps	-0x88(%rax),%xmm11
6427bded2dbSJung-uk Kim	movaps	-0x78(%rax),%xmm12
6437bded2dbSJung-uk Kim	#movaps	-0x68(%rax),%xmm13
6447bded2dbSJung-uk Kim	#movaps	-0x58(%rax),%xmm14
6457bded2dbSJung-uk Kim	#movaps	-0x48(%rax),%xmm15
6467bded2dbSJung-uk Kim___
# Emit the decrypt epilogue: reload the six callee-saved registers from just
# below the original %rsp (held in %rax), restore %rsp, return, and close the
# definition of aesni_multi_cbc_decrypt.
6477bded2dbSJung-uk Kim$code.=<<___;
6487bded2dbSJung-uk Kim	mov	-48(%rax),%r15
649e71b7053SJung-uk Kim.cfi_restore	%r15
6507bded2dbSJung-uk Kim	mov	-40(%rax),%r14
651e71b7053SJung-uk Kim.cfi_restore	%r14
6527bded2dbSJung-uk Kim	mov	-32(%rax),%r13
653e71b7053SJung-uk Kim.cfi_restore	%r13
6547bded2dbSJung-uk Kim	mov	-24(%rax),%r12
655e71b7053SJung-uk Kim.cfi_restore	%r12
6567bded2dbSJung-uk Kim	mov	-16(%rax),%rbp
657e71b7053SJung-uk Kim.cfi_restore	%rbp
6587bded2dbSJung-uk Kim	mov	-8(%rax),%rbx
659e71b7053SJung-uk Kim.cfi_restore	%rbx
6607bded2dbSJung-uk Kim	lea	(%rax),%rsp
661e71b7053SJung-uk Kim.cfi_def_cfa_register	%rsp
6627bded2dbSJung-uk Kim.Ldec4x_epilogue:
6637bded2dbSJung-uk Kim	ret
664e71b7053SJung-uk Kim.cfi_endproc
6657bded2dbSJung-uk Kim.size	aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
6667bded2dbSJung-uk Kim___
6677bded2dbSJung-uk Kim
6687bded2dbSJung-uk Kim						if ($avx) {{{
# Register allocation shared by the two 8-lane AVX routines generated below.
#
# @ptr      one general-purpose register (%r8..%r15) per lane. The generated
#           code uses it as the lane's input pointer and, after the
#           "switch to output" lea in the round loop, as its output pointer.
# $offload  alias for $sink (declared outside this section). It serves as a
#           scratch register during lane setup and afterwards holds the
#           address of the stack off-load area.
6697bded2dbSJung-uk Kimmy @ptr=map("%r$_",(8..15));
6707bded2dbSJung-uk Kimmy $offload=$sink;
6717bded2dbSJung-uk Kim
# @out      the eight per-lane cipher states (%xmm2..%xmm9).
# @inp      four staging registers (%xmm10..%xmm13) for the next input
#           blocks; eight lanes share them via the index $i%4.
# $counters holds four per-lane block counters at a time while they are
#           being decremented.
# $zero     holds the round-0 key, except for the short stretches where it is
#           cleared to serve as the all-zero operand of the counter compare.
6727bded2dbSJung-uk Kimmy @out=map("%xmm$_",(2..9));
6737bded2dbSJung-uk Kimmy @inp=map("%xmm$_",(10..13));
6747bded2dbSJung-uk Kimmy ($counters,$zero)=("%xmm14","%xmm15");
6757bded2dbSJung-uk Kim
# Emit the entry of aesni_multi_cbc_encrypt_avx (eight interleaved CBC-encrypt
# lanes). The caller's %rsp is kept in %rax, which becomes the CFA register
# and the base for the epilogue's restores; then %rbx, %rbp and %r12..%r15
# are pushed. _avx_cbc_enc_shortcut is a second label on the same entry.
6767bded2dbSJung-uk Kim$code.=<<___;
6777bded2dbSJung-uk Kim.type	aesni_multi_cbc_encrypt_avx,\@function,3
6787bded2dbSJung-uk Kim.align	32
6797bded2dbSJung-uk Kimaesni_multi_cbc_encrypt_avx:
680e71b7053SJung-uk Kim.cfi_startproc
6817bded2dbSJung-uk Kim_avx_cbc_enc_shortcut:
6827bded2dbSJung-uk Kim	mov	%rsp,%rax
683e71b7053SJung-uk Kim.cfi_def_cfa_register	%rax
6847bded2dbSJung-uk Kim	push	%rbx
685e71b7053SJung-uk Kim.cfi_push	%rbx
6867bded2dbSJung-uk Kim	push	%rbp
687e71b7053SJung-uk Kim.cfi_push	%rbp
6887bded2dbSJung-uk Kim	push	%r12
689e71b7053SJung-uk Kim.cfi_push	%r12
6907bded2dbSJung-uk Kim	push	%r13
691e71b7053SJung-uk Kim.cfi_push	%r13
6927bded2dbSJung-uk Kim	push	%r14
693e71b7053SJung-uk Kim.cfi_push	%r14
6947bded2dbSJung-uk Kim	push	%r15
695e71b7053SJung-uk Kim.cfi_push	%r15
6967bded2dbSJung-uk Kim___
# Win64 builds only: reserve 0xa8 bytes below the pushed registers and save
# %xmm6..%xmm15 there. The first six stores are addressed from the new %rsp
# and the last four from %rax; both forms land in one save area spanning
# -0xd8(%rax) .. -0x48(%rax), which is where the epilogue reads them back.
6977bded2dbSJung-uk Kim$code.=<<___ if ($win64);
6987bded2dbSJung-uk Kim	lea	-0xa8(%rsp),%rsp
6997bded2dbSJung-uk Kim	movaps	%xmm6,(%rsp)
7007bded2dbSJung-uk Kim	movaps	%xmm7,0x10(%rsp)
7017bded2dbSJung-uk Kim	movaps	%xmm8,0x20(%rsp)
7027bded2dbSJung-uk Kim	movaps	%xmm9,0x30(%rsp)
7037bded2dbSJung-uk Kim	movaps	%xmm10,0x40(%rsp)
7047bded2dbSJung-uk Kim	movaps	%xmm11,0x50(%rsp)
7057bded2dbSJung-uk Kim	movaps	%xmm12,-0x78(%rax)
7067bded2dbSJung-uk Kim	movaps	%xmm13,-0x68(%rax)
7077bded2dbSJung-uk Kim	movaps	%xmm14,-0x58(%rax)
7087bded2dbSJung-uk Kim	movaps	%xmm15,-0x48(%rax)
7097bded2dbSJung-uk Kim___
# Frame setup and one-time initialisation: carve out at least 192 bytes of
# scratch space aligned to 128 bytes (layout in the emitted comment), save
# the original %rsp at 16(%rsp), load the round-0 key into $zero, bias $key
# by 0x78 and $inp by four descriptor elements (the lane-setup code
# compensates in its displacements), and halve $num.
# .Lenc8x_loop_grande then clears $num, which the lane-setup code reuses as
# the running maximum block count. Saving the original $num is commented
# out, matching the disabled outer loop at the end of the function.
7107bded2dbSJung-uk Kim$code.=<<___;
7117bded2dbSJung-uk Kim	# stack layout
7127bded2dbSJung-uk Kim	#
7137bded2dbSJung-uk Kim	# +0	output sink
7147bded2dbSJung-uk Kim	# +16	input sink [original %rsp and $num]
7157bded2dbSJung-uk Kim	# +32	counters
7167bded2dbSJung-uk Kim	# +64	distances between inputs and outputs
7177bded2dbSJung-uk Kim	# +128	off-load area for @inp[0..3]
7187bded2dbSJung-uk Kim
7197bded2dbSJung-uk Kim	sub	\$192,%rsp
7207bded2dbSJung-uk Kim	and	\$-128,%rsp
7217bded2dbSJung-uk Kim	mov	%rax,16(%rsp)			# original %rsp
722e71b7053SJung-uk Kim.cfi_cfa_expression	%rsp+16,deref,+8
7237bded2dbSJung-uk Kim
7247bded2dbSJung-uk Kim.Lenc8x_body:
7257bded2dbSJung-uk Kim	vzeroupper
7267bded2dbSJung-uk Kim	vmovdqu	($key),$zero			# 0-round key
7277bded2dbSJung-uk Kim	lea	0x78($key),$key			# size optimization
728*b077aed3SPierre Pronchery	lea	`$inp_elm_size*4`($inp),$inp
7297bded2dbSJung-uk Kim	shr	\$1,$num
7307bded2dbSJung-uk Kim
7317bded2dbSJung-uk Kim.Lenc8x_loop_grande:
7327bded2dbSJung-uk Kim	#mov	$num,24(%rsp)			# original $num
7337bded2dbSJung-uk Kim	xor	$num,$num
7347bded2dbSJung-uk Kim___
# Lane setup for encryption, emitted once per lane ($i = 0..7). For lane $i
# the generated code reads from the lane's descriptor at $inp (every
# displacement subtracts the four-element bias applied to $inp in the
# function prologue):
#   - the block count, borrowing $one as a temporary;
#   - the input pointer, into @ptr[$i];
#   - the output pointer, into $temp;
#   - the IV, into @out[$i].
# It keeps the largest block count seen so far in $num, stores the count at
# 32+4*$i(%rsp), points @ptr[$i] at %rsp when the count is <= 0 so an idle
# lane reads from the frame, and stores the distance (output minus input) at
# 64+8*$i(%rsp). The cmovg/cmovle consume the flags of the preceding cmp/test;
# only mov/vmovdqu instructions are emitted in between.
7357bded2dbSJung-uk Kimfor($i=0;$i<8;$i++) {
  # Lane 0 uses $offset as its scratch register, so lane 0's distance is
  # still in $offset when the main loop is entered; the other lanes share
  # $offload and reload their distance from the stack inside the loop.
7367bded2dbSJung-uk Kim  my $temp = $i ? $offload : $offset;
  # pointer_register() is defined outside this section; presumably it picks
  # the register name whose width matches $flavour's pointer size -- confirm
  # at its definition.
737*b077aed3SPierre Pronchery    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
738*b077aed3SPierre Pronchery    $temp_reg=&pointer_register($flavour,$temp);
7397bded2dbSJung-uk Kim    $code.=<<___;
740*b077aed3SPierre Pronchery	# borrow $one for number of blocks
741*b077aed3SPierre Pronchery	mov	`$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*4`($inp),$one
742*b077aed3SPierre Pronchery	# input pointer
743*b077aed3SPierre Pronchery	mov	`$inp_elm_size*$i+0-$inp_elm_size*4`($inp),$ptr_reg
7447bded2dbSJung-uk Kim	cmp	$num,$one
745*b077aed3SPierre Pronchery	# output pointer
746*b077aed3SPierre Pronchery	mov	`$inp_elm_size*$i+$ptr_size-$inp_elm_size*4`($inp),$temp_reg
7477bded2dbSJung-uk Kim	cmovg	$one,$num			# find maximum
7487bded2dbSJung-uk Kim	test	$one,$one
749*b077aed3SPierre Pronchery	# load IV
750*b077aed3SPierre Pronchery	vmovdqu	`$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*4`($inp),@out[$i]
7517bded2dbSJung-uk Kim	mov	$one,`32+4*$i`(%rsp)		# initialize counters
7527bded2dbSJung-uk Kim	cmovle	%rsp,@ptr[$i]			# cancel input
7537bded2dbSJung-uk Kim	sub	@ptr[$i],$temp			# distance between input and output
7547bded2dbSJung-uk Kim	mov	$temp,`64+8*$i`(%rsp)		# initialize distances
7557bded2dbSJung-uk Kim___
7567bded2dbSJung-uk Kim}
# If no lane has any blocks ($num is still zero) jump to .Lenc8x_done.
# Otherwise prime the pipeline:
#   - load the keys at schedule offsets 0x10 and 0x20 into $rndkey1/$rndkey0
#     and load $rounds from schedule offset 0xf0 ($key is biased by 0x78);
#   - point $offload at the off-load area, 128(%rsp);
#   - for every lane, xor its first input block with the round-0 key and
#     then with the lane's IV, which lane setup left in @out[]. The four
#     @inp registers are used twice: for lanes 0..3, then for lanes 4..7;
#   - set $one to the constant 1 that the main loop compares the lane
#     counters against.
# The heredoc ends with the aligned .Loop_enc8x label; the loop body is
# emitted by the Perl loop that follows.
7577bded2dbSJung-uk Kim$code.=<<___;
7587bded2dbSJung-uk Kim	test	$num,$num
7597bded2dbSJung-uk Kim	jz	.Lenc8x_done
7607bded2dbSJung-uk Kim
7617bded2dbSJung-uk Kim	vmovups	0x10-0x78($key),$rndkey1
7627bded2dbSJung-uk Kim	vmovups	0x20-0x78($key),$rndkey0
7637bded2dbSJung-uk Kim	mov	0xf0-0x78($key),$rounds
7647bded2dbSJung-uk Kim
7657bded2dbSJung-uk Kim	vpxor	(@ptr[0]),$zero,@inp[0]		# load inputs and xor with 0-round
7667bded2dbSJung-uk Kim	 lea	128(%rsp),$offload		# offload area
7677bded2dbSJung-uk Kim	vpxor	(@ptr[1]),$zero,@inp[1]
7687bded2dbSJung-uk Kim	vpxor	(@ptr[2]),$zero,@inp[2]
7697bded2dbSJung-uk Kim	vpxor	(@ptr[3]),$zero,@inp[3]
7707bded2dbSJung-uk Kim	 vpxor	@inp[0],@out[0],@out[0]
7717bded2dbSJung-uk Kim	vpxor	(@ptr[4]),$zero,@inp[0]
7727bded2dbSJung-uk Kim	 vpxor	@inp[1],@out[1],@out[1]
7737bded2dbSJung-uk Kim	vpxor	(@ptr[5]),$zero,@inp[1]
7747bded2dbSJung-uk Kim	 vpxor	@inp[2],@out[2],@out[2]
7757bded2dbSJung-uk Kim	vpxor	(@ptr[6]),$zero,@inp[2]
7767bded2dbSJung-uk Kim	 vpxor	@inp[3],@out[3],@out[3]
7777bded2dbSJung-uk Kim	vpxor	(@ptr[7]),$zero,@inp[3]
7787bded2dbSJung-uk Kim	 vpxor	@inp[0],@out[4],@out[4]
7797bded2dbSJung-uk Kim	mov	\$1,$one			# constant of 1
7807bded2dbSJung-uk Kim	 vpxor	@inp[1],@out[5],@out[5]
7817bded2dbSJung-uk Kim	 vpxor	@inp[2],@out[6],@out[6]
7827bded2dbSJung-uk Kim	 vpxor	@inp[3],@out[7],@out[7]
7837bded2dbSJung-uk Kim	jmp	.Loop_enc8x
7847bded2dbSJung-uk Kim
7857bded2dbSJung-uk Kim.align	32
7867bded2dbSJung-uk Kim.Loop_enc8x:
7877bded2dbSJung-uk Kim___
# Body of .Loop_enc8x: eight unrolled AES rounds. Iteration $i applies one
# round key to all eight states and interleaves the pointer bookkeeping of
# lane $i between the vaesenc instructions:
#   - compare the lane's counter at 32+4*$i(%rsp) with $one; the two cmov
#     instructions further down consume these flags (only vaesenc, prefetch,
#     mov and lea are emitted in between);
#   - lanes 1..7 reload their input/output distance from 64+8*$i(%rsp);
#     lane 0's distance is already in $offset;
#   - compute the output address, redirect the input pointer to %rsp when
#     $one >= counter ("cancel input") and the output address to %rsp when
#     $one > counter ("sink output"), then store the resulting distance back;
#   - load the lane's next input block, pre-xored with the round-0 key, into
#     @inp[$i%4] and advance @ptr[$i] so that it addresses the output;
#   - reload the round-key register just used with the key at schedule
#     offset 16*(3+$i), keeping $rndkey1/$rndkey0 two rounds ahead.
# Lanes 0..3 spill @inp[$i] to the off-load area because the same four
# registers are reused for lanes 4..7.
7887bded2dbSJung-uk Kimfor($i=0;$i<8;$i++) {
7897bded2dbSJung-uk Kimmy $rndkey=($i&1)?$rndkey0:$rndkey1;
7907bded2dbSJung-uk Kim$code.=<<___;
7917bded2dbSJung-uk Kim	vaesenc		$rndkey,@out[0],@out[0]
7927bded2dbSJung-uk Kim	 cmp		32+4*$i(%rsp),$one
7937bded2dbSJung-uk Kim___
7947bded2dbSJung-uk Kim$code.=<<___ if ($i);
7957bded2dbSJung-uk Kim	 mov		64+8*$i(%rsp),$offset
7967bded2dbSJung-uk Kim___
7977bded2dbSJung-uk Kim$code.=<<___;
7987bded2dbSJung-uk Kim	vaesenc		$rndkey,@out[1],@out[1]
7997bded2dbSJung-uk Kim	prefetcht0	31(@ptr[$i])			# prefetch input
8007bded2dbSJung-uk Kim	vaesenc		$rndkey,@out[2],@out[2]
8017bded2dbSJung-uk Kim___
8027bded2dbSJung-uk Kim$code.=<<___ if ($i>1);
8037bded2dbSJung-uk Kim	prefetcht0	15(@ptr[$i-2])			# prefetch output
8047bded2dbSJung-uk Kim___
8057bded2dbSJung-uk Kim$code.=<<___;
8067bded2dbSJung-uk Kim	vaesenc		$rndkey,@out[3],@out[3]
8077bded2dbSJung-uk Kim	 lea		(@ptr[$i],$offset),$offset
8087bded2dbSJung-uk Kim	 cmovge		%rsp,@ptr[$i]			# cancel input
8097bded2dbSJung-uk Kim	vaesenc		$rndkey,@out[4],@out[4]
8107bded2dbSJung-uk Kim	 cmovg		%rsp,$offset			# sink output
8117bded2dbSJung-uk Kim	vaesenc		$rndkey,@out[5],@out[5]
8127bded2dbSJung-uk Kim	 sub		@ptr[$i],$offset
8137bded2dbSJung-uk Kim	vaesenc		$rndkey,@out[6],@out[6]
8147bded2dbSJung-uk Kim	 vpxor		16(@ptr[$i]),$zero,@inp[$i%4]	# load input and xor with 0-round
8157bded2dbSJung-uk Kim	 mov		$offset,64+8*$i(%rsp)
8167bded2dbSJung-uk Kim	vaesenc		$rndkey,@out[7],@out[7]
8177bded2dbSJung-uk Kim	vmovups		`16*(3+$i)-0x78`($key),$rndkey
8187bded2dbSJung-uk Kim	 lea		16(@ptr[$i],$offset),@ptr[$i]	# switch to output
8197bded2dbSJung-uk Kim___
# Terminated with ';' like the matching statement in the decrypt loop, so the
# statement no longer depends on being the last one in the block.
8207bded2dbSJung-uk Kim$code.=<<___ if ($i<4);
8217bded2dbSJung-uk Kim	 vmovdqu	@inp[$i%4],`16*$i`($offload)	# off-load
8227bded2dbSJung-uk Kim___
8237bded2dbSJung-uk Kim}
# Remainder of one .Loop_enc8x pass, emitted after the eight unrolled rounds.
# At this point $rndkey1/$rndkey0 hold the keys from schedule offsets 0x90
# and 0xa0.
#  - Load the counters of lanes 0..3 and prefetch the outputs of the last
#    two lanes. $i is a package variable and equals 8 after the Perl loop,
#    so @ptr[$i-2] and @ptr[$i-1] are lanes 6 and 7.
#  - If $rounds >= 11 run two more rounds, and two more again unless
#    $rounds == 11. The je reuses the flags of "cmp $11,$rounds"; only
#    vaesenc/vmovups are emitted in between.
#  - .Lenc8x_tail: the last vaesenc and the vaesenclast for every lane.
#    Interleaved with them, every counter that is still > 0 is decremented
#    (vpcmpgtd against zero yields -1 in those elements, which vpaddd adds)
#    and written back to 32(%rsp) / 48(%rsp). $zero is borrowed as the
#    all-zero operand and reloaded with the round-0 key afterwards. The keys
#    at schedule offsets 0x10 and 0x20 are reloaded for the next pass, and
#    lane 0's distance is pre-loaded into $offset.
#  - Store each lane's ciphertext at -16(@ptr[i]), subtract the lane's
#    distance to turn @ptr[i] back into an input pointer, and xor the
#    ciphertext with the next input block (already xored with the round-0 key
#    in the unrolled loop; lanes 0..3 read it from the off-load area, lanes
#    4..7 from @inp[]) to form the next CBC state.
#  - Repeat while the decremented $num is non-zero, then recover the original
#    %rsp into %rax.
# The outer "grande" loop is disabled: its instructions are commented out in
# the template, so the function makes a single pass over eight lanes.
# .Lenc8x_done is also the target of the early exit taken when no lane has
# work.
8247bded2dbSJung-uk Kim$code.=<<___;
8257bded2dbSJung-uk Kim	 vmovdqu	32(%rsp),$counters
8267bded2dbSJung-uk Kim	prefetcht0	15(@ptr[$i-2])			# prefetch output
8277bded2dbSJung-uk Kim	prefetcht0	15(@ptr[$i-1])
8287bded2dbSJung-uk Kim	cmp	\$11,$rounds
8297bded2dbSJung-uk Kim	jb	.Lenc8x_tail
8307bded2dbSJung-uk Kim
8317bded2dbSJung-uk Kim	vaesenc		$rndkey1,@out[0],@out[0]
8327bded2dbSJung-uk Kim	vaesenc		$rndkey1,@out[1],@out[1]
8337bded2dbSJung-uk Kim	vaesenc		$rndkey1,@out[2],@out[2]
8347bded2dbSJung-uk Kim	vaesenc		$rndkey1,@out[3],@out[3]
8357bded2dbSJung-uk Kim	vaesenc		$rndkey1,@out[4],@out[4]
8367bded2dbSJung-uk Kim	vaesenc		$rndkey1,@out[5],@out[5]
8377bded2dbSJung-uk Kim	vaesenc		$rndkey1,@out[6],@out[6]
8387bded2dbSJung-uk Kim	vaesenc		$rndkey1,@out[7],@out[7]
8397bded2dbSJung-uk Kim	vmovups		0xb0-0x78($key),$rndkey1
8407bded2dbSJung-uk Kim
8417bded2dbSJung-uk Kim	vaesenc		$rndkey0,@out[0],@out[0]
8427bded2dbSJung-uk Kim	vaesenc		$rndkey0,@out[1],@out[1]
8437bded2dbSJung-uk Kim	vaesenc		$rndkey0,@out[2],@out[2]
8447bded2dbSJung-uk Kim	vaesenc		$rndkey0,@out[3],@out[3]
8457bded2dbSJung-uk Kim	vaesenc		$rndkey0,@out[4],@out[4]
8467bded2dbSJung-uk Kim	vaesenc		$rndkey0,@out[5],@out[5]
8477bded2dbSJung-uk Kim	vaesenc		$rndkey0,@out[6],@out[6]
8487bded2dbSJung-uk Kim	vaesenc		$rndkey0,@out[7],@out[7]
8497bded2dbSJung-uk Kim	vmovups		0xc0-0x78($key),$rndkey0
8507bded2dbSJung-uk Kim	je	.Lenc8x_tail
8517bded2dbSJung-uk Kim
8527bded2dbSJung-uk Kim	vaesenc		$rndkey1,@out[0],@out[0]
8537bded2dbSJung-uk Kim	vaesenc		$rndkey1,@out[1],@out[1]
8547bded2dbSJung-uk Kim	vaesenc		$rndkey1,@out[2],@out[2]
8557bded2dbSJung-uk Kim	vaesenc		$rndkey1,@out[3],@out[3]
8567bded2dbSJung-uk Kim	vaesenc		$rndkey1,@out[4],@out[4]
8577bded2dbSJung-uk Kim	vaesenc		$rndkey1,@out[5],@out[5]
8587bded2dbSJung-uk Kim	vaesenc		$rndkey1,@out[6],@out[6]
8597bded2dbSJung-uk Kim	vaesenc		$rndkey1,@out[7],@out[7]
8607bded2dbSJung-uk Kim	vmovups		0xd0-0x78($key),$rndkey1
8617bded2dbSJung-uk Kim
8627bded2dbSJung-uk Kim	vaesenc		$rndkey0,@out[0],@out[0]
8637bded2dbSJung-uk Kim	vaesenc		$rndkey0,@out[1],@out[1]
8647bded2dbSJung-uk Kim	vaesenc		$rndkey0,@out[2],@out[2]
8657bded2dbSJung-uk Kim	vaesenc		$rndkey0,@out[3],@out[3]
8667bded2dbSJung-uk Kim	vaesenc		$rndkey0,@out[4],@out[4]
8677bded2dbSJung-uk Kim	vaesenc		$rndkey0,@out[5],@out[5]
8687bded2dbSJung-uk Kim	vaesenc		$rndkey0,@out[6],@out[6]
8697bded2dbSJung-uk Kim	vaesenc		$rndkey0,@out[7],@out[7]
8707bded2dbSJung-uk Kim	vmovups		0xe0-0x78($key),$rndkey0
8717bded2dbSJung-uk Kim
8727bded2dbSJung-uk Kim.Lenc8x_tail:
8737bded2dbSJung-uk Kim	vaesenc		$rndkey1,@out[0],@out[0]
8747bded2dbSJung-uk Kim	 vpxor		$zero,$zero,$zero
8757bded2dbSJung-uk Kim	vaesenc		$rndkey1,@out[1],@out[1]
8767bded2dbSJung-uk Kim	vaesenc		$rndkey1,@out[2],@out[2]
8777bded2dbSJung-uk Kim	 vpcmpgtd	$zero,$counters,$zero
8787bded2dbSJung-uk Kim	vaesenc		$rndkey1,@out[3],@out[3]
8797bded2dbSJung-uk Kim	vaesenc		$rndkey1,@out[4],@out[4]
8807bded2dbSJung-uk Kim	 vpaddd		$counters,$zero,$zero		# decrement counters
8817bded2dbSJung-uk Kim	 vmovdqu	48(%rsp),$counters
8827bded2dbSJung-uk Kim	vaesenc		$rndkey1,@out[5],@out[5]
8837bded2dbSJung-uk Kim	 mov		64(%rsp),$offset		# pre-load 1st offset
8847bded2dbSJung-uk Kim	vaesenc		$rndkey1,@out[6],@out[6]
8857bded2dbSJung-uk Kim	vaesenc		$rndkey1,@out[7],@out[7]
8867bded2dbSJung-uk Kim	vmovups		0x10-0x78($key),$rndkey1
8877bded2dbSJung-uk Kim
8887bded2dbSJung-uk Kim	vaesenclast	$rndkey0,@out[0],@out[0]
8897bded2dbSJung-uk Kim	 vmovdqa	$zero,32(%rsp)			# update counters
8907bded2dbSJung-uk Kim	 vpxor		$zero,$zero,$zero
8917bded2dbSJung-uk Kim	vaesenclast	$rndkey0,@out[1],@out[1]
8927bded2dbSJung-uk Kim	vaesenclast	$rndkey0,@out[2],@out[2]
8937bded2dbSJung-uk Kim	 vpcmpgtd	$zero,$counters,$zero
8947bded2dbSJung-uk Kim	vaesenclast	$rndkey0,@out[3],@out[3]
8957bded2dbSJung-uk Kim	vaesenclast	$rndkey0,@out[4],@out[4]
8967bded2dbSJung-uk Kim	 vpaddd		$zero,$counters,$counters	# decrement counters
8977bded2dbSJung-uk Kim	 vmovdqu	-0x78($key),$zero		# 0-round
8987bded2dbSJung-uk Kim	vaesenclast	$rndkey0,@out[5],@out[5]
8997bded2dbSJung-uk Kim	vaesenclast	$rndkey0,@out[6],@out[6]
9007bded2dbSJung-uk Kim	 vmovdqa	$counters,48(%rsp)		# update counters
9017bded2dbSJung-uk Kim	vaesenclast	$rndkey0,@out[7],@out[7]
9027bded2dbSJung-uk Kim	vmovups		0x20-0x78($key),$rndkey0
9037bded2dbSJung-uk Kim
9047bded2dbSJung-uk Kim	vmovups		@out[0],-16(@ptr[0])		# write output
9057bded2dbSJung-uk Kim	 sub		$offset,@ptr[0]			# switch to input
9067bded2dbSJung-uk Kim	 vpxor		0x00($offload),@out[0],@out[0]
9077bded2dbSJung-uk Kim	vmovups		@out[1],-16(@ptr[1])
9087bded2dbSJung-uk Kim	 sub		`64+1*8`(%rsp),@ptr[1]
9097bded2dbSJung-uk Kim	 vpxor		0x10($offload),@out[1],@out[1]
9107bded2dbSJung-uk Kim	vmovups		@out[2],-16(@ptr[2])
9117bded2dbSJung-uk Kim	 sub		`64+2*8`(%rsp),@ptr[2]
9127bded2dbSJung-uk Kim	 vpxor		0x20($offload),@out[2],@out[2]
9137bded2dbSJung-uk Kim	vmovups		@out[3],-16(@ptr[3])
9147bded2dbSJung-uk Kim	 sub		`64+3*8`(%rsp),@ptr[3]
9157bded2dbSJung-uk Kim	 vpxor		0x30($offload),@out[3],@out[3]
9167bded2dbSJung-uk Kim	vmovups		@out[4],-16(@ptr[4])
9177bded2dbSJung-uk Kim	 sub		`64+4*8`(%rsp),@ptr[4]
9187bded2dbSJung-uk Kim	 vpxor		@inp[0],@out[4],@out[4]
9197bded2dbSJung-uk Kim	vmovups		@out[5],-16(@ptr[5])
9207bded2dbSJung-uk Kim	 sub		`64+5*8`(%rsp),@ptr[5]
9217bded2dbSJung-uk Kim	 vpxor		@inp[1],@out[5],@out[5]
9227bded2dbSJung-uk Kim	vmovups		@out[6],-16(@ptr[6])
9237bded2dbSJung-uk Kim	 sub		`64+6*8`(%rsp),@ptr[6]
9247bded2dbSJung-uk Kim	 vpxor		@inp[2],@out[6],@out[6]
9257bded2dbSJung-uk Kim	vmovups		@out[7],-16(@ptr[7])
9267bded2dbSJung-uk Kim	 sub		`64+7*8`(%rsp),@ptr[7]
9277bded2dbSJung-uk Kim	 vpxor		@inp[3],@out[7],@out[7]
9287bded2dbSJung-uk Kim
9297bded2dbSJung-uk Kim	dec	$num
9307bded2dbSJung-uk Kim	jnz	.Loop_enc8x
9317bded2dbSJung-uk Kim
9327bded2dbSJung-uk Kim	mov	16(%rsp),%rax			# original %rsp
933e71b7053SJung-uk Kim.cfi_def_cfa	%rax,8
9347bded2dbSJung-uk Kim	#mov	24(%rsp),$num
935*b077aed3SPierre Pronchery	#lea	`$inp_elm_size*8`($inp),$inp
9367bded2dbSJung-uk Kim	#dec	$num
9377bded2dbSJung-uk Kim	#jnz	.Lenc8x_loop_grande
9387bded2dbSJung-uk Kim
9397bded2dbSJung-uk Kim.Lenc8x_done:
9407bded2dbSJung-uk Kim	vzeroupper
9417bded2dbSJung-uk Kim___
# Win64 builds only: reload %xmm6..%xmm15 from the save area filled in by
# the prologue, addressed from %rax (the original %rsp).
9427bded2dbSJung-uk Kim$code.=<<___ if ($win64);
9437bded2dbSJung-uk Kim	movaps	-0xd8(%rax),%xmm6
9447bded2dbSJung-uk Kim	movaps	-0xc8(%rax),%xmm7
9457bded2dbSJung-uk Kim	movaps	-0xb8(%rax),%xmm8
9467bded2dbSJung-uk Kim	movaps	-0xa8(%rax),%xmm9
9477bded2dbSJung-uk Kim	movaps	-0x98(%rax),%xmm10
9487bded2dbSJung-uk Kim	movaps	-0x88(%rax),%xmm11
9497bded2dbSJung-uk Kim	movaps	-0x78(%rax),%xmm12
9507bded2dbSJung-uk Kim	movaps	-0x68(%rax),%xmm13
9517bded2dbSJung-uk Kim	movaps	-0x58(%rax),%xmm14
9527bded2dbSJung-uk Kim	movaps	-0x48(%rax),%xmm15
9537bded2dbSJung-uk Kim___
# Epilogue of aesni_multi_cbc_encrypt_avx: reload %r15..%rbx from just below
# the original %rsp, restore %rsp from %rax and return.
# The same heredoc then opens aesni_multi_cbc_decrypt_avx, whose entry
# sequence mirrors the encrypt one (keep %rsp in %rax, push %rbx, %rbp and
# %r12..%r15). _avx_cbc_dec_shortcut is a second label on that entry point.
9547bded2dbSJung-uk Kim$code.=<<___;
9557bded2dbSJung-uk Kim	mov	-48(%rax),%r15
956e71b7053SJung-uk Kim.cfi_restore	%r15
9577bded2dbSJung-uk Kim	mov	-40(%rax),%r14
958e71b7053SJung-uk Kim.cfi_restore	%r14
9597bded2dbSJung-uk Kim	mov	-32(%rax),%r13
960e71b7053SJung-uk Kim.cfi_restore	%r13
9617bded2dbSJung-uk Kim	mov	-24(%rax),%r12
962e71b7053SJung-uk Kim.cfi_restore	%r12
9637bded2dbSJung-uk Kim	mov	-16(%rax),%rbp
964e71b7053SJung-uk Kim.cfi_restore	%rbp
9657bded2dbSJung-uk Kim	mov	-8(%rax),%rbx
966e71b7053SJung-uk Kim.cfi_restore	%rbx
9677bded2dbSJung-uk Kim	lea	(%rax),%rsp
968e71b7053SJung-uk Kim.cfi_def_cfa_register	%rsp
9697bded2dbSJung-uk Kim.Lenc8x_epilogue:
9707bded2dbSJung-uk Kim	ret
971e71b7053SJung-uk Kim.cfi_endproc
9727bded2dbSJung-uk Kim.size	aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
9737bded2dbSJung-uk Kim
9747bded2dbSJung-uk Kim.type	aesni_multi_cbc_decrypt_avx,\@function,3
9757bded2dbSJung-uk Kim.align	32
9767bded2dbSJung-uk Kimaesni_multi_cbc_decrypt_avx:
977e71b7053SJung-uk Kim.cfi_startproc
9787bded2dbSJung-uk Kim_avx_cbc_dec_shortcut:
9797bded2dbSJung-uk Kim	mov	%rsp,%rax
980e71b7053SJung-uk Kim.cfi_def_cfa_register	%rax
9817bded2dbSJung-uk Kim	push	%rbx
982e71b7053SJung-uk Kim.cfi_push	%rbx
9837bded2dbSJung-uk Kim	push	%rbp
984e71b7053SJung-uk Kim.cfi_push	%rbp
9857bded2dbSJung-uk Kim	push	%r12
986e71b7053SJung-uk Kim.cfi_push	%r12
9877bded2dbSJung-uk Kim	push	%r13
988e71b7053SJung-uk Kim.cfi_push	%r13
9897bded2dbSJung-uk Kim	push	%r14
990e71b7053SJung-uk Kim.cfi_push	%r14
9917bded2dbSJung-uk Kim	push	%r15
992e71b7053SJung-uk Kim.cfi_push	%r15
9937bded2dbSJung-uk Kim___
# Win64 builds only: reserve 0xa8 bytes below the pushed registers and save
# %xmm6..%xmm15 there. The first six stores are addressed from the new %rsp
# and the last four from %rax; all ten land in one save area spanning
# -0xd8(%rax) .. -0x48(%rax), which is where the epilogue reads them back.
9947bded2dbSJung-uk Kim$code.=<<___ if ($win64);
9957bded2dbSJung-uk Kim	lea	-0xa8(%rsp),%rsp
9967bded2dbSJung-uk Kim	movaps	%xmm6,(%rsp)
9977bded2dbSJung-uk Kim	movaps	%xmm7,0x10(%rsp)
9987bded2dbSJung-uk Kim	movaps	%xmm8,0x20(%rsp)
9997bded2dbSJung-uk Kim	movaps	%xmm9,0x30(%rsp)
10007bded2dbSJung-uk Kim	movaps	%xmm10,0x40(%rsp)
10017bded2dbSJung-uk Kim	movaps	%xmm11,0x50(%rsp)
10027bded2dbSJung-uk Kim	movaps	%xmm12,-0x78(%rax)
10037bded2dbSJung-uk Kim	movaps	%xmm13,-0x68(%rax)
10047bded2dbSJung-uk Kim	movaps	%xmm14,-0x58(%rax)
10057bded2dbSJung-uk Kim	movaps	%xmm15,-0x48(%rax)
10067bded2dbSJung-uk Kim___
# Frame setup for the decrypt path. The frame extends the encrypt layout with
# a two-halved, 2 x 128-byte IV/input off-load area starting at +192. The
# sub/and/sub sequence makes %rsp+192 a multiple of 256, so the two halves
# (%rsp+192 and %rsp+320) differ only in address bit 7 and the generated code
# can switch between them by xoring $offload with 0x80.
# The remaining steps match the encrypt path: save the original %rsp at
# 16(%rsp), load the round-0 key into $zero, bias $key by 0x78 and $inp by
# four descriptor elements, halve $num, and clear $num at
# .Ldec8x_loop_grande so lane setup can reuse it as the running maximum.
10077bded2dbSJung-uk Kim$code.=<<___;
10087bded2dbSJung-uk Kim	# stack layout
10097bded2dbSJung-uk Kim	#
10107bded2dbSJung-uk Kim	# +0	output sink
10117bded2dbSJung-uk Kim	# +16	input sink [original %rsp and $num]
10127bded2dbSJung-uk Kim	# +32	counters
10137bded2dbSJung-uk Kim	# +64	distances between inputs and outputs
10147bded2dbSJung-uk Kim	# +128	off-load area for @inp[0..3]
10157bded2dbSJung-uk Kim	# +192	IV/input offload
10167bded2dbSJung-uk Kim
10177bded2dbSJung-uk Kim	sub	\$256,%rsp
10187bded2dbSJung-uk Kim	and	\$-256,%rsp
10197bded2dbSJung-uk Kim	sub	\$192,%rsp
10207bded2dbSJung-uk Kim	mov	%rax,16(%rsp)			# original %rsp
1021e71b7053SJung-uk Kim.cfi_cfa_expression	%rsp+16,deref,+8
10227bded2dbSJung-uk Kim
10237bded2dbSJung-uk Kim.Ldec8x_body:
10247bded2dbSJung-uk Kim	vzeroupper
10257bded2dbSJung-uk Kim	vmovdqu	($key),$zero			# 0-round key
10267bded2dbSJung-uk Kim	lea	0x78($key),$key			# size optimization
1027*b077aed3SPierre Pronchery	lea	`$inp_elm_size*4`($inp),$inp
10287bded2dbSJung-uk Kim	shr	\$1,$num
10297bded2dbSJung-uk Kim
10307bded2dbSJung-uk Kim.Ldec8x_loop_grande:
10317bded2dbSJung-uk Kim	#mov	$num,24(%rsp)			# original $num
10327bded2dbSJung-uk Kim	xor	$num,$num
10337bded2dbSJung-uk Kim___
# Lane setup for decryption, emitted once per lane ($i = 0..7). It performs
# the same steps as the encrypt lane setup: read the block count (via $one),
# the input pointer (@ptr[$i]), the output pointer ($temp) and the IV
# (@out[$i]) from the lane's descriptor at $inp, with every displacement
# subtracting the four-element bias applied to $inp in the prologue; keep the
# largest count in $num; store the count at 32+4*$i(%rsp); point @ptr[$i] at
# %rsp when the count is <= 0; store the distance (output minus input) at
# 64+8*$i(%rsp).
# In addition the IV is spilled to 192+16*$i(%rsp), the half of the
# IV/input off-load area that the first decrypted block is xored with.
10347bded2dbSJung-uk Kimfor($i=0;$i<8;$i++) {
  # Lane 0 uses $offset as its scratch register, so lane 0's distance is
  # still in $offset when the main loop is entered; the other lanes share
  # $offload and reload their distance from the stack inside the loop.
10357bded2dbSJung-uk Kim  my $temp = $i ? $offload : $offset;
  # pointer_register() is defined outside this section; presumably it picks
  # the register name whose width matches $flavour's pointer size -- confirm
  # at its definition.
1036*b077aed3SPierre Pronchery    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
1037*b077aed3SPierre Pronchery    $temp_reg=&pointer_register($flavour,$temp);
10387bded2dbSJung-uk Kim    $code.=<<___;
1039*b077aed3SPierre Pronchery	# borrow $one for number of blocks
1040*b077aed3SPierre Pronchery	mov	`$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*4`($inp),$one
1041*b077aed3SPierre Pronchery	# input pointer
1042*b077aed3SPierre Pronchery	mov	`$inp_elm_size*$i+0-$inp_elm_size*4`($inp),$ptr_reg
10437bded2dbSJung-uk Kim	cmp	$num,$one
1044*b077aed3SPierre Pronchery	# output pointer
1045*b077aed3SPierre Pronchery	mov	`$inp_elm_size*$i+$ptr_size-$inp_elm_size*4`($inp),$temp_reg
10467bded2dbSJung-uk Kim	cmovg	$one,$num			# find maximum
10477bded2dbSJung-uk Kim	test	$one,$one
1048*b077aed3SPierre Pronchery	# load IV
1049*b077aed3SPierre Pronchery	vmovdqu	`$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*4`($inp),@out[$i]
10507bded2dbSJung-uk Kim	mov	$one,`32+4*$i`(%rsp)		# initialize counters
10517bded2dbSJung-uk Kim	cmovle	%rsp,@ptr[$i]			# cancel input
10527bded2dbSJung-uk Kim	sub	@ptr[$i],$temp			# distance between input and output
10537bded2dbSJung-uk Kim	mov	$temp,`64+8*$i`(%rsp)		# initialize distances
10547bded2dbSJung-uk Kim	vmovdqu	@out[$i],`192+16*$i`(%rsp)	# offload IV
10557bded2dbSJung-uk Kim___
10567bded2dbSJung-uk Kim}
# If no lane has any blocks ($num is still zero) jump to .Ldec8x_done.
# Otherwise prime the pipeline:
#   - load the keys at schedule offsets 0x10 and 0x20 into $rndkey1/$rndkey0
#     and load $rounds from schedule offset 0xf0 ($key is biased by 0x78);
#   - point $offload at the second half of the IV/input off-load area
#     (192+128(%rsp));
#   - load the first input block of every lane, store the unmodified block
#     in that half (it is what the next block's plaintext gets xored with),
#     then xor the register copy with the round-0 key;
#   - flip $offload to the first half, where lane setup stored the IVs, so
#     that the tail of the first pass xors the decrypted blocks with the IVs;
#   - set $one to the constant 1 that the main loop compares the lane
#     counters against.
# The heredoc ends with the aligned .Loop_dec8x label; the loop body is
# emitted by the Perl loop that follows.
10577bded2dbSJung-uk Kim$code.=<<___;
10587bded2dbSJung-uk Kim	test	$num,$num
10597bded2dbSJung-uk Kim	jz	.Ldec8x_done
10607bded2dbSJung-uk Kim
10617bded2dbSJung-uk Kim	vmovups	0x10-0x78($key),$rndkey1
10627bded2dbSJung-uk Kim	vmovups	0x20-0x78($key),$rndkey0
10637bded2dbSJung-uk Kim	mov	0xf0-0x78($key),$rounds
10647bded2dbSJung-uk Kim	 lea	192+128(%rsp),$offload		# offload area
10657bded2dbSJung-uk Kim
10667bded2dbSJung-uk Kim	vmovdqu	(@ptr[0]),@out[0]		# load inputs
10677bded2dbSJung-uk Kim	vmovdqu	(@ptr[1]),@out[1]
10687bded2dbSJung-uk Kim	vmovdqu	(@ptr[2]),@out[2]
10697bded2dbSJung-uk Kim	vmovdqu	(@ptr[3]),@out[3]
10707bded2dbSJung-uk Kim	vmovdqu	(@ptr[4]),@out[4]
10717bded2dbSJung-uk Kim	vmovdqu	(@ptr[5]),@out[5]
10727bded2dbSJung-uk Kim	vmovdqu	(@ptr[6]),@out[6]
10737bded2dbSJung-uk Kim	vmovdqu	(@ptr[7]),@out[7]
10747bded2dbSJung-uk Kim	vmovdqu	@out[0],0x00($offload)		# offload inputs
10757bded2dbSJung-uk Kim	vpxor	$zero,@out[0],@out[0]		# xor inputs with 0-round
10767bded2dbSJung-uk Kim	vmovdqu	@out[1],0x10($offload)
10777bded2dbSJung-uk Kim	vpxor	$zero,@out[1],@out[1]
10787bded2dbSJung-uk Kim	vmovdqu	@out[2],0x20($offload)
10797bded2dbSJung-uk Kim	vpxor	$zero,@out[2],@out[2]
10807bded2dbSJung-uk Kim	vmovdqu	@out[3],0x30($offload)
10817bded2dbSJung-uk Kim	vpxor	$zero,@out[3],@out[3]
10827bded2dbSJung-uk Kim	vmovdqu	@out[4],0x40($offload)
10837bded2dbSJung-uk Kim	vpxor	$zero,@out[4],@out[4]
10847bded2dbSJung-uk Kim	vmovdqu	@out[5],0x50($offload)
10857bded2dbSJung-uk Kim	vpxor	$zero,@out[5],@out[5]
10867bded2dbSJung-uk Kim	vmovdqu	@out[6],0x60($offload)
10877bded2dbSJung-uk Kim	vpxor	$zero,@out[6],@out[6]
10887bded2dbSJung-uk Kim	vmovdqu	@out[7],0x70($offload)
10897bded2dbSJung-uk Kim	vpxor	$zero,@out[7],@out[7]
10907bded2dbSJung-uk Kim	xor	\$0x80,$offload
10917bded2dbSJung-uk Kim	mov	\$1,$one			# constant of 1
10927bded2dbSJung-uk Kim	jmp	.Loop_dec8x
10937bded2dbSJung-uk Kim
10947bded2dbSJung-uk Kim.align	32
10957bded2dbSJung-uk Kim.Loop_dec8x:
10967bded2dbSJung-uk Kim___
# Body of .Loop_dec8x: eight unrolled AES decryption rounds. Iteration $i
# applies one round key to all eight states with vaesdec and interleaves the
# pointer bookkeeping of lane $i between those instructions:
#   - compare the lane's counter at 32+4*$i(%rsp) with $one; the two cmov
#     instructions further down consume these flags;
#   - lanes 1..7 reload their input/output distance from 64+8*$i(%rsp);
#     lane 0's distance is already in $offset;
#   - compute the output address, redirect the input pointer to %rsp when
#     $one >= counter ("cancel input") and the output address to %rsp when
#     $one > counter ("sink output"), then store the resulting distance back;
#   - load the lane's next input block, unmodified, into @inp[$i%4] and
#     advance @ptr[$i] so that it addresses the output;
#   - reload the round-key register just used with the key at schedule
#     offset 16*(3+$i), keeping $rndkey1/$rndkey0 two rounds ahead.
# Lanes 0..3 spill @inp[$i] to 128+16*$i(%rsp) because the same four
# registers are reused for lanes 4..7.
10977bded2dbSJung-uk Kimfor($i=0;$i<8;$i++) {
10987bded2dbSJung-uk Kimmy $rndkey=($i&1)?$rndkey0:$rndkey1;
10997bded2dbSJung-uk Kim$code.=<<___;
11007bded2dbSJung-uk Kim	vaesdec		$rndkey,@out[0],@out[0]
11017bded2dbSJung-uk Kim	 cmp		32+4*$i(%rsp),$one
11027bded2dbSJung-uk Kim___
11037bded2dbSJung-uk Kim$code.=<<___ if ($i);
11047bded2dbSJung-uk Kim	 mov		64+8*$i(%rsp),$offset
11057bded2dbSJung-uk Kim___
11067bded2dbSJung-uk Kim$code.=<<___;
11077bded2dbSJung-uk Kim	vaesdec		$rndkey,@out[1],@out[1]
11087bded2dbSJung-uk Kim	prefetcht0	31(@ptr[$i])			# prefetch input
11097bded2dbSJung-uk Kim	vaesdec		$rndkey,@out[2],@out[2]
11107bded2dbSJung-uk Kim___
11117bded2dbSJung-uk Kim$code.=<<___ if ($i>1);
11127bded2dbSJung-uk Kim	prefetcht0	15(@ptr[$i-2])			# prefetch output
11137bded2dbSJung-uk Kim___
11147bded2dbSJung-uk Kim$code.=<<___;
11157bded2dbSJung-uk Kim	vaesdec		$rndkey,@out[3],@out[3]
11167bded2dbSJung-uk Kim	 lea		(@ptr[$i],$offset),$offset
11177bded2dbSJung-uk Kim	 cmovge		%rsp,@ptr[$i]			# cancel input
11187bded2dbSJung-uk Kim	vaesdec		$rndkey,@out[4],@out[4]
11197bded2dbSJung-uk Kim	 cmovg		%rsp,$offset			# sink output
11207bded2dbSJung-uk Kim	vaesdec		$rndkey,@out[5],@out[5]
11217bded2dbSJung-uk Kim	 sub		@ptr[$i],$offset
11227bded2dbSJung-uk Kim	vaesdec		$rndkey,@out[6],@out[6]
11237bded2dbSJung-uk Kim	 vmovdqu	16(@ptr[$i]),@inp[$i%4]		# load input
11247bded2dbSJung-uk Kim	 mov		$offset,64+8*$i(%rsp)
11257bded2dbSJung-uk Kim	vaesdec		$rndkey,@out[7],@out[7]
11267bded2dbSJung-uk Kim	vmovups		`16*(3+$i)-0x78`($key),$rndkey
11277bded2dbSJung-uk Kim	 lea		16(@ptr[$i],$offset),@ptr[$i]	# switch to output
11287bded2dbSJung-uk Kim___
11297bded2dbSJung-uk Kim$code.=<<___ if ($i<4);
11307bded2dbSJung-uk Kim	 vmovdqu	@inp[$i%4],`128+16*$i`(%rsp)	# off-load
11317bded2dbSJung-uk Kim___
11327bded2dbSJung-uk Kim}
# Remainder of one .Loop_dec8x pass, emitted after the eight unrolled rounds.
# At this point $rndkey1/$rndkey0 hold the keys from schedule offsets 0x90
# and 0xa0.
#  - Load the counters of lanes 0..3 and prefetch the outputs of the last
#    two lanes. $i is a package variable and equals 8 after the Perl loop,
#    so @ptr[$i-2] and @ptr[$i-1] are lanes 6 and 7.
#  - If $rounds >= 11 run two more rounds, and two more again unless
#    $rounds == 11. The je reuses the flags of "cmp $11,$rounds"; only
#    vaesdec/vmovups are emitted in between.
#  - .Ldec8x_tail: the last vaesdec and the vaesdeclast for every lane, each
#    followed by an xor with the block at the lane's slot in the current
#    $offload half (the IV on the first pass, the previous input block
#    afterwards). Interleaved with that, every counter that is still > 0 is
#    decremented (vpcmpgtd against zero yields -1 in those elements, which
#    vpaddd adds) and written back to 32(%rsp) / 48(%rsp). $zero is borrowed
#    as the all-zero operand and reloaded with the round-0 key afterwards;
#    the keys at schedule offsets 0x10 and 0x20 are reloaded for the next
#    pass, and lane 0's distance is pre-loaded into $offset.
#  - Store each lane's plaintext at -16(@ptr[i]) and subtract the lane's
#    distance to turn @ptr[i] back into an input pointer. Then fetch the
#    lane's next input block (lanes 0..3 from 128+16*i(%rsp), lanes 4..7
#    from @inp[]), store it unmodified into the lane's slot of the current
#    $offload half, replacing the block just consumed, and xor it with the
#    round-0 key to form the next state in @out[].
#  - Flip $offload to the other half, which holds the blocks decrypted in
#    this pass and therefore the chaining values for the next pass; repeat
#    while the decremented $num is non-zero, then recover the original %rsp
#    into %rax.
# The outer "grande" loop is disabled: its instructions are commented out in
# the template, so the function makes a single pass over eight lanes.
# .Ldec8x_done is also the target of the early exit taken when no lane has
# work.
11337bded2dbSJung-uk Kim$code.=<<___;
11347bded2dbSJung-uk Kim	 vmovdqu	32(%rsp),$counters
11357bded2dbSJung-uk Kim	prefetcht0	15(@ptr[$i-2])			# prefetch output
11367bded2dbSJung-uk Kim	prefetcht0	15(@ptr[$i-1])
11377bded2dbSJung-uk Kim	cmp	\$11,$rounds
11387bded2dbSJung-uk Kim	jb	.Ldec8x_tail
11397bded2dbSJung-uk Kim
11407bded2dbSJung-uk Kim	vaesdec		$rndkey1,@out[0],@out[0]
11417bded2dbSJung-uk Kim	vaesdec		$rndkey1,@out[1],@out[1]
11427bded2dbSJung-uk Kim	vaesdec		$rndkey1,@out[2],@out[2]
11437bded2dbSJung-uk Kim	vaesdec		$rndkey1,@out[3],@out[3]
11447bded2dbSJung-uk Kim	vaesdec		$rndkey1,@out[4],@out[4]
11457bded2dbSJung-uk Kim	vaesdec		$rndkey1,@out[5],@out[5]
11467bded2dbSJung-uk Kim	vaesdec		$rndkey1,@out[6],@out[6]
11477bded2dbSJung-uk Kim	vaesdec		$rndkey1,@out[7],@out[7]
11487bded2dbSJung-uk Kim	vmovups		0xb0-0x78($key),$rndkey1
11497bded2dbSJung-uk Kim
11507bded2dbSJung-uk Kim	vaesdec		$rndkey0,@out[0],@out[0]
11517bded2dbSJung-uk Kim	vaesdec		$rndkey0,@out[1],@out[1]
11527bded2dbSJung-uk Kim	vaesdec		$rndkey0,@out[2],@out[2]
11537bded2dbSJung-uk Kim	vaesdec		$rndkey0,@out[3],@out[3]
11547bded2dbSJung-uk Kim	vaesdec		$rndkey0,@out[4],@out[4]
11557bded2dbSJung-uk Kim	vaesdec		$rndkey0,@out[5],@out[5]
11567bded2dbSJung-uk Kim	vaesdec		$rndkey0,@out[6],@out[6]
11577bded2dbSJung-uk Kim	vaesdec		$rndkey0,@out[7],@out[7]
11587bded2dbSJung-uk Kim	vmovups		0xc0-0x78($key),$rndkey0
11597bded2dbSJung-uk Kim	je	.Ldec8x_tail
11607bded2dbSJung-uk Kim
11617bded2dbSJung-uk Kim	vaesdec		$rndkey1,@out[0],@out[0]
11627bded2dbSJung-uk Kim	vaesdec		$rndkey1,@out[1],@out[1]
11637bded2dbSJung-uk Kim	vaesdec		$rndkey1,@out[2],@out[2]
11647bded2dbSJung-uk Kim	vaesdec		$rndkey1,@out[3],@out[3]
11657bded2dbSJung-uk Kim	vaesdec		$rndkey1,@out[4],@out[4]
11667bded2dbSJung-uk Kim	vaesdec		$rndkey1,@out[5],@out[5]
11677bded2dbSJung-uk Kim	vaesdec		$rndkey1,@out[6],@out[6]
11687bded2dbSJung-uk Kim	vaesdec		$rndkey1,@out[7],@out[7]
11697bded2dbSJung-uk Kim	vmovups		0xd0-0x78($key),$rndkey1
11707bded2dbSJung-uk Kim
11717bded2dbSJung-uk Kim	vaesdec		$rndkey0,@out[0],@out[0]
11727bded2dbSJung-uk Kim	vaesdec		$rndkey0,@out[1],@out[1]
11737bded2dbSJung-uk Kim	vaesdec		$rndkey0,@out[2],@out[2]
11747bded2dbSJung-uk Kim	vaesdec		$rndkey0,@out[3],@out[3]
11757bded2dbSJung-uk Kim	vaesdec		$rndkey0,@out[4],@out[4]
11767bded2dbSJung-uk Kim	vaesdec		$rndkey0,@out[5],@out[5]
11777bded2dbSJung-uk Kim	vaesdec		$rndkey0,@out[6],@out[6]
11787bded2dbSJung-uk Kim	vaesdec		$rndkey0,@out[7],@out[7]
11797bded2dbSJung-uk Kim	vmovups		0xe0-0x78($key),$rndkey0
11807bded2dbSJung-uk Kim
11817bded2dbSJung-uk Kim.Ldec8x_tail:
11827bded2dbSJung-uk Kim	vaesdec		$rndkey1,@out[0],@out[0]
11837bded2dbSJung-uk Kim	 vpxor		$zero,$zero,$zero
11847bded2dbSJung-uk Kim	vaesdec		$rndkey1,@out[1],@out[1]
11857bded2dbSJung-uk Kim	vaesdec		$rndkey1,@out[2],@out[2]
11867bded2dbSJung-uk Kim	 vpcmpgtd	$zero,$counters,$zero
11877bded2dbSJung-uk Kim	vaesdec		$rndkey1,@out[3],@out[3]
11887bded2dbSJung-uk Kim	vaesdec		$rndkey1,@out[4],@out[4]
11897bded2dbSJung-uk Kim	 vpaddd		$counters,$zero,$zero		# decrement counters
11907bded2dbSJung-uk Kim	 vmovdqu	48(%rsp),$counters
11917bded2dbSJung-uk Kim	vaesdec		$rndkey1,@out[5],@out[5]
11927bded2dbSJung-uk Kim	 mov		64(%rsp),$offset		# pre-load 1st offset
11937bded2dbSJung-uk Kim	vaesdec		$rndkey1,@out[6],@out[6]
11947bded2dbSJung-uk Kim	vaesdec		$rndkey1,@out[7],@out[7]
11957bded2dbSJung-uk Kim	vmovups		0x10-0x78($key),$rndkey1
11967bded2dbSJung-uk Kim
11977bded2dbSJung-uk Kim	vaesdeclast	$rndkey0,@out[0],@out[0]
11987bded2dbSJung-uk Kim	 vmovdqa	$zero,32(%rsp)			# update counters
11997bded2dbSJung-uk Kim	 vpxor		$zero,$zero,$zero
12007bded2dbSJung-uk Kim	vaesdeclast	$rndkey0,@out[1],@out[1]
12017bded2dbSJung-uk Kim	vpxor		0x00($offload),@out[0],@out[0]	# xor with IV
12027bded2dbSJung-uk Kim	vaesdeclast	$rndkey0,@out[2],@out[2]
12037bded2dbSJung-uk Kim	vpxor		0x10($offload),@out[1],@out[1]
12047bded2dbSJung-uk Kim	 vpcmpgtd	$zero,$counters,$zero
12057bded2dbSJung-uk Kim	vaesdeclast	$rndkey0,@out[3],@out[3]
12067bded2dbSJung-uk Kim	vpxor		0x20($offload),@out[2],@out[2]
12077bded2dbSJung-uk Kim	vaesdeclast	$rndkey0,@out[4],@out[4]
12087bded2dbSJung-uk Kim	vpxor		0x30($offload),@out[3],@out[3]
12097bded2dbSJung-uk Kim	 vpaddd		$zero,$counters,$counters	# decrement counters
12107bded2dbSJung-uk Kim	 vmovdqu	-0x78($key),$zero		# 0-round
12117bded2dbSJung-uk Kim	vaesdeclast	$rndkey0,@out[5],@out[5]
12127bded2dbSJung-uk Kim	vpxor		0x40($offload),@out[4],@out[4]
12137bded2dbSJung-uk Kim	vaesdeclast	$rndkey0,@out[6],@out[6]
12147bded2dbSJung-uk Kim	vpxor		0x50($offload),@out[5],@out[5]
12157bded2dbSJung-uk Kim	 vmovdqa	$counters,48(%rsp)		# update counters
12167bded2dbSJung-uk Kim	vaesdeclast	$rndkey0,@out[7],@out[7]
12177bded2dbSJung-uk Kim	vpxor		0x60($offload),@out[6],@out[6]
12187bded2dbSJung-uk Kim	vmovups		0x20-0x78($key),$rndkey0
12197bded2dbSJung-uk Kim
12207bded2dbSJung-uk Kim	vmovups		@out[0],-16(@ptr[0])		# write output
12217bded2dbSJung-uk Kim	 sub		$offset,@ptr[0]			# switch to input
12227bded2dbSJung-uk Kim	 vmovdqu	128+0(%rsp),@out[0]
12237bded2dbSJung-uk Kim	vpxor		0x70($offload),@out[7],@out[7]
12247bded2dbSJung-uk Kim	vmovups		@out[1],-16(@ptr[1])
12257bded2dbSJung-uk Kim	 sub		`64+1*8`(%rsp),@ptr[1]
12267bded2dbSJung-uk Kim	 vmovdqu	@out[0],0x00($offload)
12277bded2dbSJung-uk Kim	 vpxor		$zero,@out[0],@out[0]
12287bded2dbSJung-uk Kim	 vmovdqu	128+16(%rsp),@out[1]
12297bded2dbSJung-uk Kim	vmovups		@out[2],-16(@ptr[2])
12307bded2dbSJung-uk Kim	 sub		`64+2*8`(%rsp),@ptr[2]
12317bded2dbSJung-uk Kim	 vmovdqu	@out[1],0x10($offload)
12327bded2dbSJung-uk Kim	 vpxor		$zero,@out[1],@out[1]
12337bded2dbSJung-uk Kim	 vmovdqu	128+32(%rsp),@out[2]
12347bded2dbSJung-uk Kim	vmovups		@out[3],-16(@ptr[3])
12357bded2dbSJung-uk Kim	 sub		`64+3*8`(%rsp),@ptr[3]
12367bded2dbSJung-uk Kim	 vmovdqu	@out[2],0x20($offload)
12377bded2dbSJung-uk Kim	 vpxor		$zero,@out[2],@out[2]
12387bded2dbSJung-uk Kim	 vmovdqu	128+48(%rsp),@out[3]
12397bded2dbSJung-uk Kim	vmovups		@out[4],-16(@ptr[4])
12407bded2dbSJung-uk Kim	 sub		`64+4*8`(%rsp),@ptr[4]
12417bded2dbSJung-uk Kim	 vmovdqu	@out[3],0x30($offload)
12427bded2dbSJung-uk Kim	 vpxor		$zero,@out[3],@out[3]
12437bded2dbSJung-uk Kim	 vmovdqu	@inp[0],0x40($offload)
12447bded2dbSJung-uk Kim	 vpxor		@inp[0],$zero,@out[4]
12457bded2dbSJung-uk Kim	vmovups		@out[5],-16(@ptr[5])
12467bded2dbSJung-uk Kim	 sub		`64+5*8`(%rsp),@ptr[5]
12477bded2dbSJung-uk Kim	 vmovdqu	@inp[1],0x50($offload)
12487bded2dbSJung-uk Kim	 vpxor		@inp[1],$zero,@out[5]
12497bded2dbSJung-uk Kim	vmovups		@out[6],-16(@ptr[6])
12507bded2dbSJung-uk Kim	 sub		`64+6*8`(%rsp),@ptr[6]
12517bded2dbSJung-uk Kim	 vmovdqu	@inp[2],0x60($offload)
12527bded2dbSJung-uk Kim	 vpxor		@inp[2],$zero,@out[6]
12537bded2dbSJung-uk Kim	vmovups		@out[7],-16(@ptr[7])
12547bded2dbSJung-uk Kim	 sub		`64+7*8`(%rsp),@ptr[7]
12557bded2dbSJung-uk Kim	 vmovdqu	@inp[3],0x70($offload)
12567bded2dbSJung-uk Kim	 vpxor		@inp[3],$zero,@out[7]
12577bded2dbSJung-uk Kim
12587bded2dbSJung-uk Kim	xor	\$128,$offload
12597bded2dbSJung-uk Kim	dec	$num
12607bded2dbSJung-uk Kim	jnz	.Loop_dec8x
12617bded2dbSJung-uk Kim
12627bded2dbSJung-uk Kim	mov	16(%rsp),%rax			# original %rsp
1263e71b7053SJung-uk Kim.cfi_def_cfa	%rax,8
12647bded2dbSJung-uk Kim	#mov	24(%rsp),$num
1265*b077aed3SPierre Pronchery	#lea	`$inp_elm_size*8`($inp),$inp
12667bded2dbSJung-uk Kim	#dec	$num
12677bded2dbSJung-uk Kim	#jnz	.Ldec8x_loop_grande
12687bded2dbSJung-uk Kim
12697bded2dbSJung-uk Kim.Ldec8x_done:
12707bded2dbSJung-uk Kim	vzeroupper
12717bded2dbSJung-uk Kim___
# Win64 builds only: reload %xmm6..%xmm15 from the save area filled in by
# the decrypt prologue, addressed from %rax (the original %rsp).
12727bded2dbSJung-uk Kim$code.=<<___ if ($win64);
12737bded2dbSJung-uk Kim	movaps	-0xd8(%rax),%xmm6
12747bded2dbSJung-uk Kim	movaps	-0xc8(%rax),%xmm7
12757bded2dbSJung-uk Kim	movaps	-0xb8(%rax),%xmm8
12767bded2dbSJung-uk Kim	movaps	-0xa8(%rax),%xmm9
12777bded2dbSJung-uk Kim	movaps	-0x98(%rax),%xmm10
12787bded2dbSJung-uk Kim	movaps	-0x88(%rax),%xmm11
12797bded2dbSJung-uk Kim	movaps	-0x78(%rax),%xmm12
12807bded2dbSJung-uk Kim	movaps	-0x68(%rax),%xmm13
12817bded2dbSJung-uk Kim	movaps	-0x58(%rax),%xmm14
12827bded2dbSJung-uk Kim	movaps	-0x48(%rax),%xmm15
12837bded2dbSJung-uk Kim___
# Epilogue of aesni_multi_cbc_decrypt_avx: reload %r15..%rbx from just below
# the original %rsp, restore %rsp from %rax, return, and close the function
# with its CFI end marker and .size directive.
12847bded2dbSJung-uk Kim$code.=<<___;
12857bded2dbSJung-uk Kim	mov	-48(%rax),%r15
1286e71b7053SJung-uk Kim.cfi_restore	%r15
12877bded2dbSJung-uk Kim	mov	-40(%rax),%r14
1288e71b7053SJung-uk Kim.cfi_restore	%r14
12897bded2dbSJung-uk Kim	mov	-32(%rax),%r13
1290e71b7053SJung-uk Kim.cfi_restore	%r13
12917bded2dbSJung-uk Kim	mov	-24(%rax),%r12
1292e71b7053SJung-uk Kim.cfi_restore	%r12
12937bded2dbSJung-uk Kim	mov	-16(%rax),%rbp
1294e71b7053SJung-uk Kim.cfi_restore	%rbp
12957bded2dbSJung-uk Kim	mov	-8(%rax),%rbx
1296e71b7053SJung-uk Kim.cfi_restore	%rbx
12977bded2dbSJung-uk Kim	lea	(%rax),%rsp
1298e71b7053SJung-uk Kim.cfi_def_cfa_register	%rsp
12997bded2dbSJung-uk Kim.Ldec8x_epilogue:
13007bded2dbSJung-uk Kim	ret
1301e71b7053SJung-uk Kim.cfi_endproc
13027bded2dbSJung-uk Kim.size	aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
13037bded2dbSJung-uk Kim___
13047bded2dbSJung-uk Kim						}}}
13057bded2dbSJung-uk Kim
# Win64 only: append structured-exception-handling support to $code.  The
# text below defines se_handler, lists every multi-buffer entry point in
# .pdata, and emits matching .xdata unwind-info records that name se_handler
# and carry two labels (body, epilogue) as HandlerData[].
13067bded2dbSJung-uk Kimif ($win64) {
13077bded2dbSJung-uk Kim# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
13087bded2dbSJung-uk Kim#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Registers for the four handler arguments, in prototype order.  Only
# $context and $disp are interpolated into the handler text below.
13097bded2dbSJung-uk Kim$rec="%rcx";
13107bded2dbSJung-uk Kim$frame="%rdx";
13117bded2dbSJung-uk Kim$context="%r8";
13127bded2dbSJung-uk Kim$disp="%r9";
13137bded2dbSJung-uk Kim
# se_handler: when context->Rip lies at or after HandlerData[0] and before
# HandlerData[1], it follows the saved stack pointer stored at
# 16(context->Rsp), reloads rbx, rbp and r12-r15 into the context and copies
# 20 quadwords of saved XMM state to &context.Xmm6.  Outside that range the
# register reload is skipped.  In every case it then rewrites Rsp/Rsi/Rdi in
# the context, copies the context to disp->ContextRecord, calls
# RtlVirtualUnwind and returns ExceptionContinueSearch.  The same heredoc
# also opens .pdata with the begin/end/info triples for the non-AVX entry
# points.
13147bded2dbSJung-uk Kim$code.=<<___;
13157bded2dbSJung-uk Kim.extern	__imp_RtlVirtualUnwind
13167bded2dbSJung-uk Kim.type	se_handler,\@abi-omnipotent
13177bded2dbSJung-uk Kim.align	16
13187bded2dbSJung-uk Kimse_handler:
13197bded2dbSJung-uk Kim	push	%rsi
13207bded2dbSJung-uk Kim	push	%rdi
13217bded2dbSJung-uk Kim	push	%rbx
13227bded2dbSJung-uk Kim	push	%rbp
13237bded2dbSJung-uk Kim	push	%r12
13247bded2dbSJung-uk Kim	push	%r13
13257bded2dbSJung-uk Kim	push	%r14
13267bded2dbSJung-uk Kim	push	%r15
13277bded2dbSJung-uk Kim	pushfq
13287bded2dbSJung-uk Kim	sub	\$64,%rsp
13297bded2dbSJung-uk Kim
13307bded2dbSJung-uk Kim	mov	120($context),%rax	# pull context->Rax
13317bded2dbSJung-uk Kim	mov	248($context),%rbx	# pull context->Rip
13327bded2dbSJung-uk Kim
13337bded2dbSJung-uk Kim	mov	8($disp),%rsi		# disp->ImageBase
13347bded2dbSJung-uk Kim	mov	56($disp),%r11		# disp->HandlerData
13357bded2dbSJung-uk Kim
13367bded2dbSJung-uk Kim	mov	0(%r11),%r10d		# HandlerData[0]
13377bded2dbSJung-uk Kim	lea	(%rsi,%r10),%r10	# prologue label
13387bded2dbSJung-uk Kim	cmp	%r10,%rbx		# context->Rip<.Lprologue
13397bded2dbSJung-uk Kim	jb	.Lin_prologue
13407bded2dbSJung-uk Kim
13417bded2dbSJung-uk Kim	mov	152($context),%rax	# pull context->Rsp
13427bded2dbSJung-uk Kim
13437bded2dbSJung-uk Kim	mov	4(%r11),%r10d		# HandlerData[1]
13447bded2dbSJung-uk Kim	lea	(%rsi,%r10),%r10	# epilogue label
13457bded2dbSJung-uk Kim	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
13467bded2dbSJung-uk Kim	jae	.Lin_prologue
13477bded2dbSJung-uk Kim
13487bded2dbSJung-uk Kim	mov	16(%rax),%rax		# pull saved stack pointer
13497bded2dbSJung-uk Kim
13507bded2dbSJung-uk Kim	mov	-8(%rax),%rbx
13517bded2dbSJung-uk Kim	mov	-16(%rax),%rbp
13527bded2dbSJung-uk Kim	mov	-24(%rax),%r12
13537bded2dbSJung-uk Kim	mov	-32(%rax),%r13
13547bded2dbSJung-uk Kim	mov	-40(%rax),%r14
13557bded2dbSJung-uk Kim	mov	-48(%rax),%r15
13567bded2dbSJung-uk Kim	mov	%rbx,144($context)	# restore context->Rbx
13577bded2dbSJung-uk Kim	mov	%rbp,160($context)	# restore context->Rbp
1358e71b7053SJung-uk Kim	mov	%r12,216($context)	# restore context->R12
1359e71b7053SJung-uk Kim	mov	%r13,224($context)	# restore context->R13
1360e71b7053SJung-uk Kim	mov	%r14,232($context)	# restore context->R14
1361e71b7053SJung-uk Kim	mov	%r15,240($context)	# restore context->R15
13627bded2dbSJung-uk Kim
13637bded2dbSJung-uk Kim	lea	-56-10*16(%rax),%rsi
13647bded2dbSJung-uk Kim	lea	512($context),%rdi	# &context.Xmm6
13657bded2dbSJung-uk Kim	mov	\$20,%ecx
13667bded2dbSJung-uk Kim	.long	0xa548f3fc		# cld; rep movsq
13677bded2dbSJung-uk Kim
13687bded2dbSJung-uk Kim.Lin_prologue:
13697bded2dbSJung-uk Kim	mov	8(%rax),%rdi
13707bded2dbSJung-uk Kim	mov	16(%rax),%rsi
13717bded2dbSJung-uk Kim	mov	%rax,152($context)	# restore context->Rsp
13727bded2dbSJung-uk Kim	mov	%rsi,168($context)	# restore context->Rsi
13737bded2dbSJung-uk Kim	mov	%rdi,176($context)	# restore context->Rdi
13747bded2dbSJung-uk Kim
13757bded2dbSJung-uk Kim	mov	40($disp),%rdi		# disp->ContextRecord
13767bded2dbSJung-uk Kim	mov	$context,%rsi		# context
13777bded2dbSJung-uk Kim	mov	\$154,%ecx		# sizeof(CONTEXT)
13787bded2dbSJung-uk Kim	.long	0xa548f3fc		# cld; rep movsq
13797bded2dbSJung-uk Kim
13807bded2dbSJung-uk Kim	mov	$disp,%rsi
13817bded2dbSJung-uk Kim	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
13827bded2dbSJung-uk Kim	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
13837bded2dbSJung-uk Kim	mov	0(%rsi),%r8		# arg3, disp->ControlPc
13847bded2dbSJung-uk Kim	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
13857bded2dbSJung-uk Kim	mov	40(%rsi),%r10		# disp->ContextRecord
13867bded2dbSJung-uk Kim	lea	56(%rsi),%r11		# &disp->HandlerData
13877bded2dbSJung-uk Kim	lea	24(%rsi),%r12		# &disp->EstablisherFrame
13887bded2dbSJung-uk Kim	mov	%r10,32(%rsp)		# arg5
13897bded2dbSJung-uk Kim	mov	%r11,40(%rsp)		# arg6
13907bded2dbSJung-uk Kim	mov	%r12,48(%rsp)		# arg7
13917bded2dbSJung-uk Kim	mov	%rcx,56(%rsp)		# arg8, (NULL)
13927bded2dbSJung-uk Kim	call	*__imp_RtlVirtualUnwind(%rip)
13937bded2dbSJung-uk Kim
13947bded2dbSJung-uk Kim	mov	\$1,%eax		# ExceptionContinueSearch
13957bded2dbSJung-uk Kim	add	\$64,%rsp
13967bded2dbSJung-uk Kim	popfq
13977bded2dbSJung-uk Kim	pop	%r15
13987bded2dbSJung-uk Kim	pop	%r14
13997bded2dbSJung-uk Kim	pop	%r13
14007bded2dbSJung-uk Kim	pop	%r12
14017bded2dbSJung-uk Kim	pop	%rbp
14027bded2dbSJung-uk Kim	pop	%rbx
14037bded2dbSJung-uk Kim	pop	%rdi
14047bded2dbSJung-uk Kim	pop	%rsi
14057bded2dbSJung-uk Kim	ret
14067bded2dbSJung-uk Kim.size	se_handler,.-se_handler
14077bded2dbSJung-uk Kim
14087bded2dbSJung-uk Kim.section	.pdata
14097bded2dbSJung-uk Kim.align	4
14107bded2dbSJung-uk Kim	.rva	.LSEH_begin_aesni_multi_cbc_encrypt
14117bded2dbSJung-uk Kim	.rva	.LSEH_end_aesni_multi_cbc_encrypt
14127bded2dbSJung-uk Kim	.rva	.LSEH_info_aesni_multi_cbc_encrypt
14137bded2dbSJung-uk Kim	.rva	.LSEH_begin_aesni_multi_cbc_decrypt
14147bded2dbSJung-uk Kim	.rva	.LSEH_end_aesni_multi_cbc_decrypt
14157bded2dbSJung-uk Kim	.rva	.LSEH_info_aesni_multi_cbc_decrypt
14167bded2dbSJung-uk Kim___
# Matching .pdata triples for the AVX entry points, emitted only when $avx
# is set.
14177bded2dbSJung-uk Kim$code.=<<___ if ($avx);
14187bded2dbSJung-uk Kim	.rva	.LSEH_begin_aesni_multi_cbc_encrypt_avx
14197bded2dbSJung-uk Kim	.rva	.LSEH_end_aesni_multi_cbc_encrypt_avx
14207bded2dbSJung-uk Kim	.rva	.LSEH_info_aesni_multi_cbc_encrypt_avx
14217bded2dbSJung-uk Kim	.rva	.LSEH_begin_aesni_multi_cbc_decrypt_avx
14227bded2dbSJung-uk Kim	.rva	.LSEH_end_aesni_multi_cbc_decrypt_avx
14237bded2dbSJung-uk Kim	.rva	.LSEH_info_aesni_multi_cbc_decrypt_avx
14247bded2dbSJung-uk Kim___
# .xdata unwind-info records: each one names se_handler and supplies the
# body and epilogue labels that the handler reads as HandlerData[0] and
# HandlerData[1].
14257bded2dbSJung-uk Kim$code.=<<___;
14267bded2dbSJung-uk Kim.section	.xdata
14277bded2dbSJung-uk Kim.align	8
14287bded2dbSJung-uk Kim.LSEH_info_aesni_multi_cbc_encrypt:
14297bded2dbSJung-uk Kim	.byte	9,0,0,0
14307bded2dbSJung-uk Kim	.rva	se_handler
14317bded2dbSJung-uk Kim	.rva	.Lenc4x_body,.Lenc4x_epilogue		# HandlerData[]
14327bded2dbSJung-uk Kim.LSEH_info_aesni_multi_cbc_decrypt:
14337bded2dbSJung-uk Kim	.byte	9,0,0,0
14347bded2dbSJung-uk Kim	.rva	se_handler
14357bded2dbSJung-uk Kim	.rva	.Ldec4x_body,.Ldec4x_epilogue		# HandlerData[]
14367bded2dbSJung-uk Kim___
# Unwind-info records for the AVX entry points, again conditional on $avx.
14377bded2dbSJung-uk Kim$code.=<<___ if ($avx);
14387bded2dbSJung-uk Kim.LSEH_info_aesni_multi_cbc_encrypt_avx:
14397bded2dbSJung-uk Kim	.byte	9,0,0,0
14407bded2dbSJung-uk Kim	.rva	se_handler
14417bded2dbSJung-uk Kim	.rva	.Lenc8x_body,.Lenc8x_epilogue		# HandlerData[]
14427bded2dbSJung-uk Kim.LSEH_info_aesni_multi_cbc_decrypt_avx:
14437bded2dbSJung-uk Kim	.byte	9,0,0,0
14447bded2dbSJung-uk Kim	.rva	se_handler
14457bded2dbSJung-uk Kim	.rva	.Ldec8x_body,.Ldec8x_epilogue		# HandlerData[]
14467bded2dbSJung-uk Kim___
14477bded2dbSJung-uk Kim}
14487bded2dbSJung-uk Kim####################################################################
14497bded2dbSJung-uk Kim
# rex(\@opcode, $dst, $src)
#
# Append a REX prefix byte to the caller's opcode array when either XMM
# register number is 8 or higher.  $dst is the register that goes in the
# ModR/M reg field and $src the one in the r/m field; nothing is pushed when
# both are below 8.
#
# The array is taken as a plain lexical reference.  The earlier
# "local *opcode=shift" typeglob alias temporarily hid every package
# variable named "opcode" and could not compile under "use strict".
sub rex {
  my ($opcode,$dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);	# REX.R extends ModR/M reg
    $rex|=0x01			if($src>=8);	# REX.B extends ModR/M r/m
    push @$opcode,$rex|0x40	if($rex);	# 0x40 is the fixed REX marker
}
14597bded2dbSJung-uk Kim
# aesni($line)
#
# Hand-assemble one AES-NI instruction given as AT&T-syntax text and return
# the equivalent ".byte" directive (presumably so that the generated file
# also assembles with toolchains lacking these mnemonics -- confirm against
# the build requirements).  Three operand shapes are recognised:
#
#   aeskeygenassist \$imm,%xmmA,%xmmB
#   aes{imc,enc,enclast,dec,declast} %xmmA,%xmmB
#   aes{enc,enclast,dec,declast} disp(%rsp),%xmmB
#
# Everything else is returned unchanged: a line matching none of the shapes,
# a mnemonic absent from the opcode table, or an %rsp displacement above
# 0x7f, which the one-byte displacement form used here cannot express.
# Returning the text instead of undef matters because the caller substitutes
# the return value into the output; undef would silently delete the
# instruction.
sub aesni {
  my $line=shift;
  my @opcode=(0x66);			# every encoding starts with 0x66

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	# Save the captures under names before calling anything else.
	my ($imm,$rm,$reg)=($2,$3,$4);
	rex(\@opcode,$reg,$rm);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($rm&7)|(($reg&7)<<3);	# ModR/M
	# A leading 0 means hex/octal notation; oct() handles both.
	push @opcode,$imm=~/^0/?oct($imm):$imm;
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$rm,$reg)=($1,$2,$3);
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	# Unknown mnemonic: leave the text for the assembler to judge.
	return $line if (!defined($opcodelet{$mnemonic}));
	rex(\@opcode,$reg,$rm);
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0xc0|($rm&7)|(($reg&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$off,$reg)=($1,$2,$3);
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return $line if (!defined($opcodelet{$mnemonic}));
	my $disp=$off=~/^0/?oct($off):$off;
	$disp=0 if ($disp eq "");	# bare "(%rsp)" means displacement 0
	# The ModR/M byte below selects a sign-extended 8-bit displacement,
	# so a larger offset would be encoded as a negative one.
	return $line if ($disp>0x7f);
	push @opcode,0x44 if ($reg>=8);		# REX.R for %xmm8-%xmm15
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0x44|(($reg&7)<<3),0x24;	# ModR/M, SIB selecting %rsp
	push @opcode,$disp&0xff;
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}
14997bded2dbSJung-uk Kim
# Post-process the accumulated assembly text, then write it out.
#
# Pass 1: every backquoted span in $code is replaced by the result of
# evaluating its contents as a Perl expression.
15007bded2dbSJung-uk Kim$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Pass 2: on each line, the text from a word-initial "aes" up to the last
# %xmmN operand is handed to aesni() and replaced by its return value.  The
# trailing ".*" means anything after that operand is dropped from the line.
15017bded2dbSJung-uk Kim$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
15027bded2dbSJung-uk Kim
15037bded2dbSJung-uk Kimprint $code;
# Close explicitly and die on failure so a write error reported at close
# time is not missed.
150417f01e99SJung-uk Kimclose STDOUT or die "error closing STDOUT: $!";
1505