/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// Copyright 2025 Google LLC
//
// Author: Eric Biggers <ebiggers@google.com>
//
// This file is dual-licensed, meaning that you can use it under your choice of
// either of the following two licenses:
//
// Licensed under the Apache License 2.0 (the "License").  You may obtain a copy
// of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// or
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
//------------------------------------------------------------------------------
//
// This file contains x86_64 assembly implementations of AES-CTR and AES-XCTR
// using the following sets of CPU features:
//	- AES-NI && AVX
//	- VAES && AVX2
//	- VAES && AVX512BW && AVX512VL && BMI2
//
// See the function definitions at the bottom of the file for more information.

#include <linux/linkage.h>
#include <linux/cfi_types.h>

.section .rodata
.p2align 4

.Lbswap_mask:
	.octa	0x000102030405060708090a0b0c0d0e0f

.Lctr_pattern:
	.quad	0, 0
.Lone:
	.quad	1, 0
.Ltwo:
	.quad	2, 0
	.quad	3, 0

.Lfour:
	.quad	4, 0

.text

// Move a vector between memory and a register.
.macro	_vmovdqu	src, dst
.if VL < 64
	vmovdqu		\src, \dst
.else
	vmovdqu8	\src, \dst
.endif
.endm

// Move a vector between registers.
.macro	_vmovdqa	src, dst
.if VL < 64
	vmovdqa		\src, \dst
.else
	vmovdqa64	\src, \dst
.endif
.endm

// Broadcast a 128-bit value from memory to all 128-bit lanes of a vector
// register.
.macro	_vbroadcast128	src, dst
.if VL == 16
	vmovdqu		\src, \dst
.elseif VL == 32
	vbroadcasti128	\src, \dst
.else
	vbroadcasti32x4	\src, \dst
.endif
.endm

// XOR two vectors together.
.macro	_vpxor	src1, src2, dst
.if VL < 64
	vpxor		\src1, \src2, \dst
.else
	vpxord		\src1, \src2, \dst
.endif
.endm

// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst
// and zeroize any remaining bytes.  Clobbers %rax, %rcx, and \tmp{64,32}.
.macro	_load_partial_block	src, dst, tmp64, tmp32
	sub		$8, %ecx		// LEN - 8
	jle		.Lle8\@

	// Load 9 <= LEN <= 15 bytes.
	vmovq		(\src), \dst		// Load first 8 bytes
	mov		(\src, %rcx), %rax	// Load last 8 bytes
	neg		%ecx
	shl		$3, %ecx
	shr		%cl, %rax		// Discard overlapping bytes
	vpinsrq		$1, %rax, \dst, \dst
	jmp		.Ldone\@

.Lle8\@:
	add		$4, %ecx		// LEN - 4
	jl		.Llt4\@

	// Load 4 <= LEN <= 8 bytes.
	mov		(\src), %eax		// Load first 4 bytes
	mov		(\src, %rcx), \tmp32	// Load last 4 bytes
	jmp		.Lcombine\@

.Llt4\@:
	// Load 1 <= LEN <= 3 bytes.
	add		$2, %ecx		// LEN - 2
	movzbl		(\src), %eax		// Load first byte
	jl		.Lmovq\@
	movzwl		(\src, %rcx), \tmp32	// Load last 2 bytes
.Lcombine\@:
	shl		$3, %ecx
	shl		%cl, \tmp64
	or		\tmp64, %rax		// Combine the two parts
.Lmovq\@:
	vmovq		%rax, \dst
.Ldone\@:
.endm

// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst.
// Clobbers %rax, %rcx, and \tmp{64,32}.
.macro	_store_partial_block	src, dst, tmp64, tmp32
	sub		$8, %ecx		// LEN - 8
	jl		.Llt8\@

	// Store 8 <= LEN <= 15 bytes.
	vpextrq		$1, \src, %rax
	mov		%ecx, \tmp32
	shl		$3, %ecx
	ror		%cl, %rax
	mov		%rax, (\dst, \tmp64)	// Store last LEN - 8 bytes
	vmovq		\src, (\dst)		// Store first 8 bytes
	jmp		.Ldone\@

.Llt8\@:
	add		$4, %ecx		// LEN - 4
	jl		.Llt4\@

	// Store 4 <= LEN <= 7 bytes.
	vpextrd		$1, \src, %eax
	mov		%ecx, \tmp32
	shl		$3, %ecx
	ror		%cl, %eax
	mov		%eax, (\dst, \tmp64)	// Store last LEN - 4 bytes
	vmovd		\src, (\dst)		// Store first 4 bytes
	jmp		.Ldone\@

.Llt4\@:
	// Store 1 <= LEN <= 3 bytes.
	vpextrb		$0, \src, 0(\dst)
	cmp		$-2, %ecx		// LEN - 4 == -2, i.e. LEN == 2?
	jl		.Ldone\@
	vpextrb		$1, \src, 1(\dst)
	je		.Ldone\@
	vpextrb		$2, \src, 2(\dst)
.Ldone\@:
.endm

// Prepare the next two vectors of AES inputs in AESDATA\i0 and AESDATA\i1, and
// XOR each with the zero-th round key.  Also update LE_CTR if !\final.
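// In the XCTR AVX512 path, vpternlogd with immediate 0x96 (the truth table of
// a three-input XOR) combines the XOR with the IV and the XOR with round key 0
// into a single instruction.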
.macro	_prepare_2_ctr_vecs	is_xctr, i0, i1, final=0
.if \is_xctr
  .if USE_AVX512
	vmovdqa64	LE_CTR, AESDATA\i0
	vpternlogd	$0x96, XCTR_IV, RNDKEY0, AESDATA\i0
  .else
	vpxor		XCTR_IV, LE_CTR, AESDATA\i0
	vpxor		RNDKEY0, AESDATA\i0, AESDATA\i0
  .endif
	vpaddq		LE_CTR_INC1, LE_CTR, AESDATA\i1

  .if USE_AVX512
	vpternlogd	$0x96, XCTR_IV, RNDKEY0, AESDATA\i1
  .else
	vpxor		XCTR_IV, AESDATA\i1, AESDATA\i1
	vpxor		RNDKEY0, AESDATA\i1, AESDATA\i1
  .endif
.else
	vpshufb		BSWAP_MASK, LE_CTR, AESDATA\i0
	_vpxor		RNDKEY0, AESDATA\i0, AESDATA\i0
	vpaddq		LE_CTR_INC1, LE_CTR, AESDATA\i1
	vpshufb		BSWAP_MASK, AESDATA\i1, AESDATA\i1
	_vpxor		RNDKEY0, AESDATA\i1, AESDATA\i1
.endif
.if !\final
	vpaddq		LE_CTR_INC2, LE_CTR, LE_CTR
.endif
.endm

// Do all AES rounds on the data in the given AESDATA vectors, excluding the
// zero-th and last rounds.
.macro	_aesenc_loop	vecs:vararg
	mov		KEY, %rax
1:
	_vbroadcast128	(%rax), RNDKEY
.irp i, \vecs
	vaesenc		RNDKEY, AESDATA\i, AESDATA\i
.endr
	add		$16, %rax
	cmp		%rax, RNDKEYLAST_PTR
	jne		1b
.endm

// Finalize the keystream blocks in the given AESDATA vectors by doing the last
// AES round, then XOR those keystream blocks with the corresponding data.
// Reduce latency by doing the XOR before the vaesenclast, utilizing the
// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
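// (That property holds because the last AES round is just SubBytes and
// ShiftRows followed by an XOR with the round key, so the round key enters the
// result only through a final XOR.)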
.macro	_aesenclast_and_xor	vecs:vararg
.irp i, \vecs
	_vpxor		\i*VL(SRC), RNDKEYLAST, RNDKEY
	vaesenclast	RNDKEY, AESDATA\i, AESDATA\i
.endr
.irp i, \vecs
	_vmovdqu	AESDATA\i, \i*VL(DST)
.endr
.endm

// XOR the keystream blocks in the specified AESDATA vectors with the
// corresponding data.
.macro	_xor_data	vecs:vararg
.irp i, \vecs
	_vpxor		\i*VL(SRC), AESDATA\i, AESDATA\i
.endr
.irp i, \vecs
	_vmovdqu	AESDATA\i, \i*VL(DST)
.endr
.endm

.macro	_aes_ctr_crypt		is_xctr

	// Define register aliases V0-V15 that map to the xmm, ymm, or zmm
	// registers according to the selected Vector Length (VL).
.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
  .if VL == 16
	.set	V\i,		%xmm\i
  .elseif VL == 32
	.set	V\i,		%ymm\i
  .elseif VL == 64
	.set	V\i,		%zmm\i
  .else
	.error "Unsupported Vector Length (VL)"
  .endif
.endr

	// Function arguments
	.set	KEY,		%rdi	// Initially points to the start of the
					// crypto_aes_ctx, then is advanced to
					// point to the index 1 round key
	.set	KEY32,		%edi	// Available as temp register after all
					// keystream blocks have been generated
	.set	SRC,		%rsi	// Pointer to next source data
	.set	DST,		%rdx	// Pointer to next destination data
	.set	LEN,		%ecx	// Remaining length in bytes.
					// Note: _load_partial_block relies on
					// this being in %ecx.
	.set	LEN64,		%rcx	// Zero-extend LEN before using!
	.set	LEN8,		%cl
.if \is_xctr
	.set	XCTR_IV_PTR,	%r8	// const u8 iv[AES_BLOCK_SIZE];
	.set	XCTR_CTR,	%r9	// u64 ctr;
.else
	.set	LE_CTR_PTR,	%r8	// const u64 le_ctr[2];
.endif

	// Additional local variables
	.set	RNDKEYLAST_PTR,	%r10
	.set	AESDATA0,	V0
	.set	AESDATA0_XMM,	%xmm0
	.set	AESDATA1,	V1
	.set	AESDATA1_XMM,	%xmm1
	.set	AESDATA2,	V2
	.set	AESDATA3,	V3
	.set	AESDATA4,	V4
	.set	AESDATA5,	V5
	.set	AESDATA6,	V6
	.set	AESDATA7,	V7
.if \is_xctr
	.set	XCTR_IV,	V8
.else
	.set	BSWAP_MASK,	V8
.endif
	.set	LE_CTR,		V9
	.set	LE_CTR_XMM,	%xmm9
	.set	LE_CTR_INC1,	V10
	.set	LE_CTR_INC2,	V11
	.set	RNDKEY0,	V12
	.set	RNDKEYLAST,	V13
	.set	RNDKEY,		V14

	// Create the first vector of counters.
.if \is_xctr
  .if VL == 16
	vmovq		XCTR_CTR, LE_CTR
  .elseif VL == 32
	vmovq		XCTR_CTR, LE_CTR_XMM
	inc		XCTR_CTR
	vmovq		XCTR_CTR, AESDATA0_XMM
	vinserti128	$1, AESDATA0_XMM, LE_CTR, LE_CTR
  .else
	vpbroadcastq	XCTR_CTR, LE_CTR
	vpsrldq		$8, LE_CTR, LE_CTR
	vpaddq		.Lctr_pattern(%rip), LE_CTR, LE_CTR
  .endif
	_vbroadcast128	(XCTR_IV_PTR), XCTR_IV
.else
	_vbroadcast128	(LE_CTR_PTR), LE_CTR
  .if VL > 16
	vpaddq		.Lctr_pattern(%rip), LE_CTR, LE_CTR
  .endif
	_vbroadcast128	.Lbswap_mask(%rip), BSWAP_MASK
.endif

.if VL == 16
	_vbroadcast128	.Lone(%rip), LE_CTR_INC1
.elseif VL == 32
	_vbroadcast128	.Ltwo(%rip), LE_CTR_INC1
.else
	_vbroadcast128	.Lfour(%rip), LE_CTR_INC1
.endif
	vpsllq		$1, LE_CTR_INC1, LE_CTR_INC2

	// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
	movl		480(KEY), %eax

	// Compute the pointer to the last round key.
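	// (A key of klen bytes uses klen/4 + 6 AES rounds, so the last round
	// key is at byte offset 16*(klen/4 + 6) = 6*16 + 4*klen from the start
	// of the key schedule.)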
	lea		6*16(KEY, %rax, 4), RNDKEYLAST_PTR

	// Load the zero-th and last round keys.
	_vbroadcast128	(KEY), RNDKEY0
	_vbroadcast128	(RNDKEYLAST_PTR), RNDKEYLAST

	// Make KEY point to the first round key.
	add		$16, KEY

	// This is the main loop, which encrypts 8 vectors of data at a time.
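	// Note: sub/add of $-8*VL instead of add/sub of $8*VL is presumably
	// used so that the constant still fits in a sign-extended 8-bit
	// immediate when 8*VL == 128.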
	add		$-8*VL, LEN
	jl		.Lloop_8x_done\@
.Lloop_8x\@:
	_prepare_2_ctr_vecs	\is_xctr, 0, 1
	_prepare_2_ctr_vecs	\is_xctr, 2, 3
	_prepare_2_ctr_vecs	\is_xctr, 4, 5
	_prepare_2_ctr_vecs	\is_xctr, 6, 7
	_aesenc_loop	0,1,2,3,4,5,6,7
	_aesenclast_and_xor 0,1,2,3,4,5,6,7
	sub		$-8*VL, SRC
	sub		$-8*VL, DST
	add		$-8*VL, LEN
	jge		.Lloop_8x\@
.Lloop_8x_done\@:
	sub		$-8*VL, LEN
	jz		.Ldone\@

	// 1 <= LEN < 8*VL.  Generate 2, 4, or 8 more vectors of keystream
	// blocks, depending on the remaining LEN.

	_prepare_2_ctr_vecs	\is_xctr, 0, 1
	_prepare_2_ctr_vecs	\is_xctr, 2, 3
	cmp		$4*VL, LEN
	jle		.Lenc_tail_atmost4vecs\@

	// 4*VL < LEN < 8*VL.  Generate 8 vectors of keystream blocks.  Use the
	// first 4 to XOR 4 full vectors of data.  Then XOR the remaining data.
	_prepare_2_ctr_vecs	\is_xctr, 4, 5
	_prepare_2_ctr_vecs	\is_xctr, 6, 7, final=1
	_aesenc_loop	0,1,2,3,4,5,6,7
	_aesenclast_and_xor 0,1,2,3
	vaesenclast	RNDKEYLAST, AESDATA4, AESDATA0
	vaesenclast	RNDKEYLAST, AESDATA5, AESDATA1
	vaesenclast	RNDKEYLAST, AESDATA6, AESDATA2
	vaesenclast	RNDKEYLAST, AESDATA7, AESDATA3
	sub		$-4*VL, SRC
	sub		$-4*VL, DST
	add		$-4*VL, LEN
	cmp		$1*VL-1, LEN
	jle		.Lxor_tail_partial_vec_0\@
	_xor_data	0
	cmp		$2*VL-1, LEN
	jle		.Lxor_tail_partial_vec_1\@
	_xor_data	1
	cmp		$3*VL-1, LEN
	jle		.Lxor_tail_partial_vec_2\@
	_xor_data	2
	cmp		$4*VL-1, LEN
	jle		.Lxor_tail_partial_vec_3\@
	_xor_data	3
	jmp		.Ldone\@

.Lenc_tail_atmost4vecs\@:
	cmp		$2*VL, LEN
	jle		.Lenc_tail_atmost2vecs\@

	// 2*VL < LEN <= 4*VL.  Generate 4 vectors of keystream blocks.  Use the
	// first 2 to XOR 2 full vectors of data.  Then XOR the remaining data.
	_aesenc_loop	0,1,2,3
	_aesenclast_and_xor 0,1
	vaesenclast	RNDKEYLAST, AESDATA2, AESDATA0
	vaesenclast	RNDKEYLAST, AESDATA3, AESDATA1
	sub		$-2*VL, SRC
	sub		$-2*VL, DST
	add		$-2*VL, LEN
	jmp		.Lxor_tail_upto2vecs\@

.Lenc_tail_atmost2vecs\@:
	// 1 <= LEN <= 2*VL.  Generate 2 vectors of keystream blocks.  Then XOR
	// the remaining data.
	_aesenc_loop	0,1
	vaesenclast	RNDKEYLAST, AESDATA0, AESDATA0
	vaesenclast	RNDKEYLAST, AESDATA1, AESDATA1

.Lxor_tail_upto2vecs\@:
	cmp		$1*VL-1, LEN
	jle		.Lxor_tail_partial_vec_0\@
	_xor_data	0
	cmp		$2*VL-1, LEN
	jle		.Lxor_tail_partial_vec_1\@
	_xor_data	1
	jmp		.Ldone\@

.Lxor_tail_partial_vec_1\@:
	add		$-1*VL, LEN
	jz		.Ldone\@
	sub		$-1*VL, SRC
	sub		$-1*VL, DST
	_vmovdqa	AESDATA1, AESDATA0
	jmp		.Lxor_tail_partial_vec_0\@

.Lxor_tail_partial_vec_2\@:
	add		$-2*VL, LEN
	jz		.Ldone\@
	sub		$-2*VL, SRC
	sub		$-2*VL, DST
	_vmovdqa	AESDATA2, AESDATA0
	jmp		.Lxor_tail_partial_vec_0\@

.Lxor_tail_partial_vec_3\@:
	add		$-3*VL, LEN
	jz		.Ldone\@
	sub		$-3*VL, SRC
	sub		$-3*VL, DST
	_vmovdqa	AESDATA3, AESDATA0

.Lxor_tail_partial_vec_0\@:
	// XOR the remaining 1 <= LEN < VL bytes.  It's easy if masked
	// loads/stores are available; otherwise it's a bit harder...
.if USE_AVX512
	mov		$-1, %rax
	bzhi		LEN64, %rax, %rax
	kmovq		%rax, %k1
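	// %k1 now has its low LEN bits set, selecting only the remaining bytes.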
	vmovdqu8	(SRC), AESDATA1{%k1}{z}
	vpxord		AESDATA1, AESDATA0, AESDATA0
	vmovdqu8	AESDATA0, (DST){%k1}
.else
  .if VL == 32
	cmp		$16, LEN
	jl		1f
	vpxor		(SRC), AESDATA0_XMM, AESDATA1_XMM
	vmovdqu		AESDATA1_XMM, (DST)
	add		$16, SRC
	add		$16, DST
	sub		$16, LEN
	jz		.Ldone\@
	vextracti128	$1, AESDATA0, AESDATA0_XMM
1:
  .endif
	mov		LEN, %r10d
	_load_partial_block	SRC, AESDATA1_XMM, KEY, KEY32
	vpxor		AESDATA1_XMM, AESDATA0_XMM, AESDATA0_XMM
	mov		%r10d, %ecx
	_store_partial_block	AESDATA0_XMM, DST, KEY, KEY32
.endif

.Ldone\@:
.if VL > 16
	vzeroupper
.endif
	RET
.endm

// Below are the definitions of the functions generated by the above macro.
// They have the following prototypes:
//
//
// void aes_ctr64_crypt_##suffix(const struct crypto_aes_ctx *key,
//				 const u8 *src, u8 *dst, int len,
//				 const u64 le_ctr[2]);
//
// void aes_xctr_crypt_##suffix(const struct crypto_aes_ctx *key,
//				const u8 *src, u8 *dst, int len,
//				const u8 iv[AES_BLOCK_SIZE], u64 ctr);
//
// Both functions generate |len| bytes of keystream, XOR it with the data from
// |src|, and write the result to |dst|.  On non-final calls, |len| must be a
// multiple of 16.  On the final call, |len| can be any value.
//
// aes_ctr64_crypt_* implement "regular" CTR, where the keystream is generated
// from a 128-bit big endian counter that increments by 1 for each AES block.
// HOWEVER, to keep the assembly code simple, some of the counter management is
// left to the caller.  aes_ctr64_crypt_* take the counter in little endian
// form, only increment the low 64 bits internally, do the conversion to big
// endian internally, and don't write the updated counter back to memory.  The
// caller is responsible for converting the starting IV to the little endian
// le_ctr, detecting the (very rare) case of a carry out of the low 64 bits
// being needed and splitting at that point with a carry done in between, and
// updating le_ctr after each part if the message is multi-part.
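//
// As an illustrative sketch only (this is not taken from the kernel's actual
// callers), a caller could prepare le_ctr from the big endian IV and detect
// the carry case roughly like this:
//
//	u64 le_ctr[2] = {
//		get_unaligned_be64(iv + 8),	// low 64 bits of the counter
//		get_unaligned_be64(iv),		// high 64 bits of the counter
//	};
//	u64 nblocks = DIV_ROUND_UP(len, AES_BLOCK_SIZE);
//	// Blocks until the low 64 bits would wrap; 0 means 2^64, i.e. no wrap
//	// is possible for any realistic message length.
//	u64 nblocks_before_wrap = -le_ctr[0];
//
//	if (nblocks_before_wrap != 0 && nblocks > nblocks_before_wrap) {
//		// Process nblocks_before_wrap blocks, then do
//		// le_ctr[1]++; le_ctr[0] = 0; and process the remainder.
//	}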
//
// aes_xctr_crypt_* implement XCTR as specified in "Length-preserving encryption
// with HCTR2" (https://eprint.iacr.org/2021/1441.pdf).  XCTR is an
// easier-to-implement variant of CTR that uses little endian byte order and
// eliminates carries.  |ctr| is the per-message block counter starting at 1.
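// Concretely, the keystream block for a (64-bit) counter value c is
// AES-Encrypt(key, iv XOR le128(c)).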

.set	VL, 16
.set	USE_AVX512, 0
SYM_TYPED_FUNC_START(aes_ctr64_crypt_aesni_avx)
	_aes_ctr_crypt	0
SYM_FUNC_END(aes_ctr64_crypt_aesni_avx)
SYM_TYPED_FUNC_START(aes_xctr_crypt_aesni_avx)
	_aes_ctr_crypt	1
SYM_FUNC_END(aes_xctr_crypt_aesni_avx)

#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
.set	VL, 32
.set	USE_AVX512, 0
SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx2)
	_aes_ctr_crypt	0
SYM_FUNC_END(aes_ctr64_crypt_vaes_avx2)
SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx2)
	_aes_ctr_crypt	1
SYM_FUNC_END(aes_xctr_crypt_vaes_avx2)

.set	VL, 64
.set	USE_AVX512, 1
SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx512)
	_aes_ctr_crypt	0
SYM_FUNC_END(aes_ctr64_crypt_vaes_avx512)
SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx512)
	_aes_ctr_crypt	1
SYM_FUNC_END(aes_xctr_crypt_vaes_avx512)
#endif // CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ