/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// Copyright 2025 Google LLC
//
// Author: Eric Biggers <ebiggers@google.com>
//
// This file is dual-licensed, meaning that you can use it under your choice of
// either of the following two licenses:
//
// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy
// of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// or
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
//------------------------------------------------------------------------------
//
// This file contains x86_64 assembly implementations of AES-CTR and AES-XCTR
// using the following sets of CPU features:
//	- AES-NI && AVX
//	- VAES && AVX2
//	- VAES && AVX512BW && AVX512VL && BMI2
//
// See the function definitions at the bottom of the file for more information.

#include <linux/linkage.h>
#include <linux/cfi_types.h>

.section .rodata
.p2align 4

.Lbswap_mask:
	.octa	0x000102030405060708090a0b0c0d0e0f

.Lctr_pattern:
	.quad	0, 0
.Lone:
	.quad	1, 0
.Ltwo:
	.quad	2, 0
	.quad	3, 0

.Lfour:
	.quad	4, 0

.text

// Move a vector between memory and a register.
.macro	_vmovdqu	src, dst
.if VL < 64
	vmovdqu		\src, \dst
.else
	vmovdqu8	\src, \dst
.endif
.endm

// Move a vector between registers.
.macro	_vmovdqa	src, dst
.if VL < 64
	vmovdqa		\src, \dst
.else
	vmovdqa64	\src, \dst
.endif
.endm

// Broadcast a 128-bit value from memory to all 128-bit lanes of a vector
// register.
.macro	_vbroadcast128	src, dst
.if VL == 16
	vmovdqu		\src, \dst
.elseif VL == 32
	vbroadcasti128	\src, \dst
.else
	vbroadcasti32x4	\src, \dst
.endif
.endm

// XOR two vectors together.
.macro	_vpxor	src1, src2, dst
.if VL < 64
	vpxor		\src1, \src2, \dst
.else
	vpxord		\src1, \src2, \dst
.endif
.endm

// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst
// and zeroize any remaining bytes.  Clobbers %rax, %rcx, and \tmp{64,32}.
.macro	_load_partial_block	src, dst, tmp64, tmp32
	sub		$8, %ecx		// LEN - 8
	jle		.Lle8\@

	// Load 9 <= LEN <= 15 bytes.
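	// (The two 8-byte loads below overlap; the shift discards the
	// overlapping bytes of the second load.  For example, with LEN = 11
	// the load from (\src, %rcx) covers bytes 3..10, and shifting right
	// by (16 - 11) * 8 = 40 bits leaves only bytes 8..10.)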
	vmovq		(\src), \dst		// Load first 8 bytes
	mov		(\src, %rcx), %rax	// Load last 8 bytes
	neg		%ecx
	shl		$3, %ecx
	shr		%cl, %rax		// Discard overlapping bytes
	vpinsrq		$1, %rax, \dst, \dst
	jmp		.Ldone\@

.Lle8\@:
	add		$4, %ecx		// LEN - 4
	jl		.Llt4\@

	// Load 4 <= LEN <= 8 bytes.
	mov		(\src), %eax		// Load first 4 bytes
	mov		(\src, %rcx), \tmp32	// Load last 4 bytes
	jmp		.Lcombine\@

.Llt4\@:
	// Load 1 <= LEN <= 3 bytes.
	add		$2, %ecx		// LEN - 2
	movzbl		(\src), %eax		// Load first byte
	jl		.Lmovq\@
	movzwl		(\src, %rcx), \tmp32	// Load last 2 bytes
.Lcombine\@:
	shl		$3, %ecx
	shl		%cl, \tmp64
	or		\tmp64, %rax		// Combine the two parts
.Lmovq\@:
	vmovq		%rax, \dst
.Ldone\@:
.endm

// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst.
// Clobbers %rax, %rcx, and \tmp{64,32}.
.macro	_store_partial_block	src, dst, tmp64, tmp32
	sub		$8, %ecx		// LEN - 8
	jl		.Llt8\@

	// Store 8 <= LEN <= 15 bytes.
	vpextrq		$1, \src, %rax
	mov		%ecx, \tmp32
	shl		$3, %ecx
	ror		%cl, %rax
	mov		%rax, (\dst, \tmp64)	// Store last LEN - 8 bytes
	vmovq		\src, (\dst)		// Store first 8 bytes
	jmp		.Ldone\@

.Llt8\@:
	add		$4, %ecx		// LEN - 4
	jl		.Llt4\@

	// Store 4 <= LEN <= 7 bytes.
	vpextrd		$1, \src, %eax
	mov		%ecx, \tmp32
	shl		$3, %ecx
	ror		%cl, %eax
	mov		%eax, (\dst, \tmp64)	// Store last LEN - 4 bytes
	vmovd		\src, (\dst)		// Store first 4 bytes
	jmp		.Ldone\@

.Llt4\@:
	// Store 1 <= LEN <= 3 bytes.
	vpextrb		$0, \src, 0(\dst)
	cmp		$-2, %ecx		// LEN - 4 == -2, i.e. LEN == 2?
	jl		.Ldone\@
	vpextrb		$1, \src, 1(\dst)
	je		.Ldone\@
	vpextrb		$2, \src, 2(\dst)
.Ldone\@:
.endm

// Prepare the next two vectors of AES inputs in AESDATA\i0 and AESDATA\i1, and
// XOR each with the zero-th round key.  Also update LE_CTR if !\final.
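// In the XCTR case on AVX512, vpternlogd with immediate 0x96 (the truth table
// of a three-input XOR) folds the IV and round-key XORs into one instruction.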
.macro	_prepare_2_ctr_vecs	is_xctr, i0, i1, final=0
.if \is_xctr
  .if USE_AVX512
	vmovdqa64	LE_CTR, AESDATA\i0
	vpternlogd	$0x96, XCTR_IV, RNDKEY0, AESDATA\i0
  .else
	vpxor		XCTR_IV, LE_CTR, AESDATA\i0
	vpxor		RNDKEY0, AESDATA\i0, AESDATA\i0
  .endif
	vpaddq		LE_CTR_INC1, LE_CTR, AESDATA\i1

  .if USE_AVX512
	vpternlogd	$0x96, XCTR_IV, RNDKEY0, AESDATA\i1
  .else
	vpxor		XCTR_IV, AESDATA\i1, AESDATA\i1
	vpxor		RNDKEY0, AESDATA\i1, AESDATA\i1
  .endif
.else
	vpshufb		BSWAP_MASK, LE_CTR, AESDATA\i0
	_vpxor		RNDKEY0, AESDATA\i0, AESDATA\i0
	vpaddq		LE_CTR_INC1, LE_CTR, AESDATA\i1
	vpshufb		BSWAP_MASK, AESDATA\i1, AESDATA\i1
	_vpxor		RNDKEY0, AESDATA\i1, AESDATA\i1
.endif
.if !\final
	vpaddq		LE_CTR_INC2, LE_CTR, LE_CTR
.endif
.endm

// Do all AES rounds on the data in the given AESDATA vectors, excluding the
// zero-th and last rounds.
.macro	_aesenc_loop	vecs:vararg
	mov		KEY, %rax
1:
	_vbroadcast128	(%rax), RNDKEY
.irp i, \vecs
	vaesenc		RNDKEY, AESDATA\i, AESDATA\i
.endr
	add		$16, %rax
	cmp		%rax, RNDKEYLAST_PTR
	jne		1b
.endm

// Finalize the keystream blocks in the given AESDATA vectors by doing the last
// AES round, then XOR those keystream blocks with the corresponding data.
// Reduce latency by doing the XOR before the vaesenclast, utilizing the
// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
.macro	_aesenclast_and_xor	vecs:vararg
.irp i, \vecs
	_vpxor		\i*VL(SRC), RNDKEYLAST, RNDKEY
	vaesenclast	RNDKEY, AESDATA\i, AESDATA\i
.endr
.irp i, \vecs
	_vmovdqu	AESDATA\i, \i*VL(DST)
.endr
.endm

// XOR the keystream blocks in the specified AESDATA vectors with the
// corresponding data.
.macro	_xor_data	vecs:vararg
.irp i, \vecs
	_vpxor		\i*VL(SRC), AESDATA\i, AESDATA\i
.endr
.irp i, \vecs
	_vmovdqu	AESDATA\i, \i*VL(DST)
.endr
.endm

.macro	_aes_ctr_crypt	is_xctr

	// Define register aliases V0-V15 that map to the xmm, ymm, or zmm
	// registers according to the selected Vector Length (VL).
.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
	.if VL == 16
		.set	V\i, %xmm\i
	.elseif VL == 32
		.set	V\i, %ymm\i
	.elseif VL == 64
		.set	V\i, %zmm\i
	.else
		.error "Unsupported Vector Length (VL)"
	.endif
.endr

	// Function arguments
	.set	KEY,		%rdi	// Initially points to the start of the
					// crypto_aes_ctx, then is advanced to
					// point to the index 1 round key
	.set	KEY32,		%edi	// Available as temp register after all
					// keystream blocks have been generated
	.set	SRC,		%rsi	// Pointer to next source data
	.set	DST,		%rdx	// Pointer to next destination data
	.set	LEN,		%ecx	// Remaining length in bytes.
					// Note: _load_partial_block relies on
					// this being in %ecx.
	.set	LEN64,		%rcx	// Zero-extend LEN before using!
	.set	LEN8,		%cl
.if \is_xctr
	.set	XCTR_IV_PTR,	%r8	// const u8 iv[AES_BLOCK_SIZE];
	.set	XCTR_CTR,	%r9	// u64 ctr;
.else
	.set	LE_CTR_PTR,	%r8	// const u64 le_ctr[2];
.endif

	// Additional local variables
	.set	RNDKEYLAST_PTR,	%r10
	.set	AESDATA0,	V0
	.set	AESDATA0_XMM,	%xmm0
	.set	AESDATA1,	V1
	.set	AESDATA1_XMM,	%xmm1
	.set	AESDATA2,	V2
	.set	AESDATA3,	V3
	.set	AESDATA4,	V4
	.set	AESDATA5,	V5
	.set	AESDATA6,	V6
	.set	AESDATA7,	V7
.if \is_xctr
	.set	XCTR_IV,	V8
.else
	.set	BSWAP_MASK,	V8
.endif
	.set	LE_CTR,		V9
	.set	LE_CTR_XMM,	%xmm9
	.set	LE_CTR_INC1,	V10
	.set	LE_CTR_INC2,	V11
	.set	RNDKEY0,	V12
	.set	RNDKEYLAST,	V13
	.set	RNDKEY,		V14

	// Create the first vector of counters.
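	// Each 128-bit lane of LE_CTR holds a little-endian block counter, so
	// one vector covers VL/16 consecutive blocks.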
.if \is_xctr
  .if VL == 16
	vmovq		XCTR_CTR, LE_CTR
  .elseif VL == 32
	vmovq		XCTR_CTR, LE_CTR_XMM
	inc		XCTR_CTR
	vmovq		XCTR_CTR, AESDATA0_XMM
	vinserti128	$1, AESDATA0_XMM, LE_CTR, LE_CTR
  .else
	vpbroadcastq	XCTR_CTR, LE_CTR
	vpsrldq		$8, LE_CTR, LE_CTR
	vpaddq		.Lctr_pattern(%rip), LE_CTR, LE_CTR
  .endif
	_vbroadcast128	(XCTR_IV_PTR), XCTR_IV
.else
	_vbroadcast128	(LE_CTR_PTR), LE_CTR
  .if VL > 16
	vpaddq		.Lctr_pattern(%rip), LE_CTR, LE_CTR
  .endif
	_vbroadcast128	.Lbswap_mask(%rip), BSWAP_MASK
.endif

.if VL == 16
	_vbroadcast128	.Lone(%rip), LE_CTR_INC1
.elseif VL == 32
	_vbroadcast128	.Ltwo(%rip), LE_CTR_INC1
.else
	_vbroadcast128	.Lfour(%rip), LE_CTR_INC1
.endif
	vpsllq		$1, LE_CTR_INC1, LE_CTR_INC2

	// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
	movl		480(KEY), %eax

	// Compute the pointer to the last round key.
	lea		6*16(KEY, %rax, 4), RNDKEYLAST_PTR

	// Load the zero-th and last round keys.
	_vbroadcast128	(KEY), RNDKEY0
	_vbroadcast128	(RNDKEYLAST_PTR), RNDKEYLAST

	// Make KEY point to the first round key.
	add		$16, KEY

	// This is the main loop, which encrypts 8 vectors of data at a time.
	add		$-8*VL, LEN
	jl		.Lloop_8x_done\@
.Lloop_8x\@:
	_prepare_2_ctr_vecs	\is_xctr, 0, 1
	_prepare_2_ctr_vecs	\is_xctr, 2, 3
	_prepare_2_ctr_vecs	\is_xctr, 4, 5
	_prepare_2_ctr_vecs	\is_xctr, 6, 7
	_aesenc_loop	0,1,2,3,4,5,6,7
	_aesenclast_and_xor	0,1,2,3,4,5,6,7
	sub		$-8*VL, SRC
	sub		$-8*VL, DST
	add		$-8*VL, LEN
	jge		.Lloop_8x\@
.Lloop_8x_done\@:
	sub		$-8*VL, LEN
	jz		.Ldone\@

	// 1 <= LEN < 8*VL.  Generate 2, 4, or 8 more vectors of keystream
	// blocks, depending on the remaining LEN.

	_prepare_2_ctr_vecs	\is_xctr, 0, 1
	_prepare_2_ctr_vecs	\is_xctr, 2, 3
	cmp		$4*VL, LEN
	jle		.Lenc_tail_atmost4vecs\@

	// 4*VL < LEN < 8*VL.  Generate 8 vectors of keystream blocks.  Use the
	// first 4 to XOR 4 full vectors of data.  Then XOR the remaining data.
	_prepare_2_ctr_vecs	\is_xctr, 4, 5
	_prepare_2_ctr_vecs	\is_xctr, 6, 7, final=1
	_aesenc_loop	0,1,2,3,4,5,6,7
	_aesenclast_and_xor	0,1,2,3
	vaesenclast	RNDKEYLAST, AESDATA4, AESDATA0
	vaesenclast	RNDKEYLAST, AESDATA5, AESDATA1
	vaesenclast	RNDKEYLAST, AESDATA6, AESDATA2
	vaesenclast	RNDKEYLAST, AESDATA7, AESDATA3
	sub		$-4*VL, SRC
	sub		$-4*VL, DST
	add		$-4*VL, LEN
	cmp		$1*VL-1, LEN
	jle		.Lxor_tail_partial_vec_0\@
	_xor_data	0
	cmp		$2*VL-1, LEN
	jle		.Lxor_tail_partial_vec_1\@
	_xor_data	1
	cmp		$3*VL-1, LEN
	jle		.Lxor_tail_partial_vec_2\@
	_xor_data	2
	cmp		$4*VL-1, LEN
	jle		.Lxor_tail_partial_vec_3\@
	_xor_data	3
	jmp		.Ldone\@

.Lenc_tail_atmost4vecs\@:
	cmp		$2*VL, LEN
	jle		.Lenc_tail_atmost2vecs\@

	// 2*VL < LEN <= 4*VL.  Generate 4 vectors of keystream blocks.  Use the
	// first 2 to XOR 2 full vectors of data.  Then XOR the remaining data.
	_aesenc_loop	0,1,2,3
	_aesenclast_and_xor	0,1
	vaesenclast	RNDKEYLAST, AESDATA2, AESDATA0
	vaesenclast	RNDKEYLAST, AESDATA3, AESDATA1
	sub		$-2*VL, SRC
	sub		$-2*VL, DST
	add		$-2*VL, LEN
	jmp		.Lxor_tail_upto2vecs\@

.Lenc_tail_atmost2vecs\@:
	// 1 <= LEN <= 2*VL.  Generate 2 vectors of keystream blocks.  Then XOR
	// the remaining data.
	_aesenc_loop	0,1
	vaesenclast	RNDKEYLAST, AESDATA0, AESDATA0
	vaesenclast	RNDKEYLAST, AESDATA1, AESDATA1

.Lxor_tail_upto2vecs\@:
	cmp		$1*VL-1, LEN
	jle		.Lxor_tail_partial_vec_0\@
	_xor_data	0
	cmp		$2*VL-1, LEN
	jle		.Lxor_tail_partial_vec_1\@
	_xor_data	1
	jmp		.Ldone\@

.Lxor_tail_partial_vec_1\@:
	add		$-1*VL, LEN
	jz		.Ldone\@
	sub		$-1*VL, SRC
	sub		$-1*VL, DST
	_vmovdqa	AESDATA1, AESDATA0
	jmp		.Lxor_tail_partial_vec_0\@

.Lxor_tail_partial_vec_2\@:
	add		$-2*VL, LEN
	jz		.Ldone\@
	sub		$-2*VL, SRC
	sub		$-2*VL, DST
	_vmovdqa	AESDATA2, AESDATA0
	jmp		.Lxor_tail_partial_vec_0\@

.Lxor_tail_partial_vec_3\@:
	add		$-3*VL, LEN
	jz		.Ldone\@
	sub		$-3*VL, SRC
	sub		$-3*VL, DST
	_vmovdqa	AESDATA3, AESDATA0

.Lxor_tail_partial_vec_0\@:
	// XOR the remaining 1 <= LEN < VL bytes.  It's easy if masked
	// loads/stores are available; otherwise it's a bit harder...
.if USE_AVX512
	mov		$-1, %rax
	bzhi		LEN64, %rax, %rax
	kmovq		%rax, %k1
	vmovdqu8	(SRC), AESDATA1{%k1}{z}
	vpxord		AESDATA1, AESDATA0, AESDATA0
	vmovdqu8	AESDATA0, (DST){%k1}
.else
  .if VL == 32
	cmp		$16, LEN
	jl		1f
	vpxor		(SRC), AESDATA0_XMM, AESDATA1_XMM
	vmovdqu		AESDATA1_XMM, (DST)
	add		$16, SRC
	add		$16, DST
	sub		$16, LEN
	jz		.Ldone\@
	vextracti128	$1, AESDATA0, AESDATA0_XMM
1:
  .endif
	mov		LEN, %r10d
	_load_partial_block	SRC, AESDATA1_XMM, KEY, KEY32
	vpxor		AESDATA1_XMM, AESDATA0_XMM, AESDATA0_XMM
	mov		%r10d, %ecx
	_store_partial_block	AESDATA0_XMM, DST, KEY, KEY32
.endif

.Ldone\@:
.if VL > 16
	vzeroupper
.endif
	RET
.endm

// Below are the definitions of the functions generated by the above macro.
// They have the following prototypes:
//
//
// void aes_ctr64_crypt_##suffix(const struct crypto_aes_ctx *key,
//				 const u8 *src, u8 *dst, int len,
//				 const u64 le_ctr[2]);
//
// void aes_xctr_crypt_##suffix(const struct crypto_aes_ctx *key,
//				const u8 *src, u8 *dst, int len,
//				const u8 iv[AES_BLOCK_SIZE], u64 ctr);
//
// Both functions generate |len| bytes of keystream, XOR it with the data from
// |src|, and write the result to |dst|.  On non-final calls, |len| must be a
// multiple of 16.  On the final call, |len| can be any value.
//
// aes_ctr64_crypt_* implement "regular" CTR, where the keystream is generated
// from a 128-bit big endian counter that increments by 1 for each AES block.
// HOWEVER, to keep the assembly code simple, some of the counter management is
// left to the caller.  aes_ctr64_crypt_* take the counter in little endian
// form, only increment the low 64 bits internally, do the conversion to big
// endian internally, and don't write the updated counter back to memory.  The
// caller is responsible for converting the starting IV to the little endian
// le_ctr, detecting the (very rare) case of a carry out of the low 64 bits
// being needed and splitting at that point with a carry done in between, and
// updating le_ctr after each part if the message is multi-part.
//
// aes_xctr_crypt_* implement XCTR as specified in "Length-preserving encryption
// with HCTR2" (https://eprint.iacr.org/2021/1441.pdf).  XCTR is an
// easier-to-implement variant of CTR that uses little endian byte order and
// eliminates carries.  |ctr| is the per-message block counter starting at 1.
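//
// As a rough illustration of the caller-side counter handling described above
// (a hypothetical C sketch only, not part of this file's interface; the real
// glue code may differ), one part of a CTR message could be processed as:
//
//	u64 nblocks = DIV_ROUND_UP(len, AES_BLOCK_SIZE);
//	u64 p1_blocks = 0 - le_ctr[0];	/* blocks until the low half wraps;
//					   0 here means 2^64, i.e. no wrap */
//
//	if (p1_blocks != 0 && p1_blocks < nblocks) {
//		/* Rare case: split where the carry out of the low 64 bits
//		 * occurs, and do the carry in between the two parts. */
//		aes_ctr64_crypt_##suffix(key, src, dst,
//					 p1_blocks * AES_BLOCK_SIZE, le_ctr);
//		src += p1_blocks * AES_BLOCK_SIZE;
//		dst += p1_blocks * AES_BLOCK_SIZE;
//		len -= p1_blocks * AES_BLOCK_SIZE;
//		nblocks -= p1_blocks;
//		le_ctr[0] = 0;
//		le_ctr[1]++;			/* the carry */
//	}
//	aes_ctr64_crypt_##suffix(key, src, dst, len, le_ctr);
//
//	/* For a multi-part message, advance le_ctr for the next part. */
//	le_ctr[0] += nblocks;
//	if (le_ctr[0] < nblocks)
//		le_ctr[1]++;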

.set	VL, 16
.set	USE_AVX512, 0
SYM_TYPED_FUNC_START(aes_ctr64_crypt_aesni_avx)
	_aes_ctr_crypt	0
SYM_FUNC_END(aes_ctr64_crypt_aesni_avx)
SYM_TYPED_FUNC_START(aes_xctr_crypt_aesni_avx)
	_aes_ctr_crypt	1
SYM_FUNC_END(aes_xctr_crypt_aesni_avx)

#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
.set	VL, 32
.set	USE_AVX512, 0
SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx2)
	_aes_ctr_crypt	0
SYM_FUNC_END(aes_ctr64_crypt_vaes_avx2)
SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx2)
	_aes_ctr_crypt	1
SYM_FUNC_END(aes_xctr_crypt_vaes_avx2)

.set	VL, 64
.set	USE_AVX512, 1
SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx512)
	_aes_ctr_crypt	0
SYM_FUNC_END(aes_ctr64_crypt_vaes_avx512)
SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx512)
	_aes_ctr_crypt	1
SYM_FUNC_END(aes_xctr_crypt_vaes_avx512)
#endif // CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ