/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// AES-GCM implementation for x86_64 CPUs that support the following CPU
// features: VAES && VPCLMULQDQ && AVX2
//
// Copyright 2025 Google LLC
//
// Author: Eric Biggers <ebiggers@google.com>
//
//------------------------------------------------------------------------------
//
// This file is dual-licensed, meaning that you can use it under your choice of
// either of the following two licenses:
//
// Licensed under the Apache License 2.0 (the "License").  You may obtain a copy
// of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// or
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// -----------------------------------------------------------------------------
//
// This is similar to aes-gcm-vaes-avx512.S, but it uses AVX2 instead of AVX512.
// This means it can only use 16 vector registers instead of 32, the maximum
// vector length is 32 bytes, and some instructions such as vpternlogd and
// masked loads/stores are unavailable.  However, it is able to run on CPUs that
// have VAES without AVX512, namely AMD Zen 3 (including "Milan" server CPUs),
// various Intel client CPUs such as Alder Lake, and Intel Sierra Forest.
//
// This implementation also uses Karatsuba multiplication instead of schoolbook
// multiplication for GHASH in its main loop.  This does not help much on Intel,
// but it improves performance by ~5% on AMD Zen 3.  Other factors weighing
// slightly in favor of Karatsuba multiplication in this implementation are the
// lower maximum vector length (which means there are fewer key powers, so we
// can cache the halves of each key power XOR'd together and still use less
// memory than the AVX512 implementation), and the unavailability of the
// vpternlogd instruction (which helped schoolbook a bit more than Karatsuba).

#include <linux/linkage.h>

.section .rodata
.p2align 4

	// The below three 16-byte values must be in the order that they are, as
	// they are really two 32-byte tables and a 16-byte value that overlap:
	//
	//   - The first 32-byte table begins at .Lselect_high_bytes_table.
	//     For 0 <= len <= 16, the 16-byte value at
	//     '.Lselect_high_bytes_table + len' selects the high 'len' bytes of
	//     another 16-byte value when AND'ed with it.
	//
	//   - The second 32-byte table begins at .Lrshift_and_bswap_table.
	//     For 0 <= len <= 16, the 16-byte value at
	//     '.Lrshift_and_bswap_table + len' is a vpshufb mask that does the
	//     following operation: right-shift by '16 - len' bytes (shifting in
	//     zeroes), then reflect all 16 bytes.
	//
	//   - The 16-byte value at .Lbswap_mask is a vpshufb mask that reflects
	//     all 16 bytes.
.Lselect_high_bytes_table:
	.octa	0
.Lrshift_and_bswap_table:
	.octa	0xffffffffffffffffffffffffffffffff
.Lbswap_mask:
	.octa	0x000102030405060708090a0b0c0d0e0f

	// Sixteen 0x0f bytes.  By XOR'ing an entry of .Lrshift_and_bswap_table
	// with this, we get a mask that left-shifts by '16 - len' bytes.
.Lfifteens:
	.octa	0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

	// This is the GHASH reducing polynomial without its constant term, i.e.
	// x^128 + x^7 + x^2 + x, represented using the backwards mapping
	// between bits and polynomial coefficients.
	//
	// Alternatively, it can be interpreted as the naturally-ordered
	// representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the
	// "reversed" GHASH reducing polynomial without its x^128 term.
.Lgfpoly:
	.octa	0xc2000000000000000000000000000001

	// Same as above, but with the (1 << 64) bit set.
.Lgfpoly_and_internal_carrybit:
	.octa	0xc2000000000000010000000000000001

	// Values needed to prepare the initial vector of counter blocks.
.Lctr_pattern:
	.octa	0
	.octa	1

	// The number of AES blocks per vector, as a 128-bit value.
.Linc_2blocks:
	.octa	2

// Offsets in struct aes_gcm_key_vaes_avx2
#define OFFSETOF_AESKEYLEN		480
#define OFFSETOF_H_POWERS		512
#define NUM_H_POWERS			8
#define OFFSETOFEND_H_POWERS		(OFFSETOF_H_POWERS + (NUM_H_POWERS * 16))
#define OFFSETOF_H_POWERS_XORED		OFFSETOFEND_H_POWERS

.text

// Do one step of GHASH-multiplying the 128-bit lanes of \a by the 128-bit lanes
// of \b and storing the reduced products in \dst.  Uses schoolbook
// multiplication.
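//
// In outline: the 256-bit carryless product of each pair of 128-bit lanes is
// built up as three 128-bit pieces, LO = a_L * b_L, MI = a_L*b_H + a_H*b_L,
// and HI = a_H * b_H, which together represent LO + MI*x^64 + HI*x^128.  The
// two vpclmulqdq-by-\gfpoly + vpshufd + vpxor sequences below then fold LO
// into MI and MI into HI, leaving the reduced 128-bit product in \dst (in the
// byte-reflected representation that this file uses throughout).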
.macro	_ghash_mul_step	i, a, b, dst, gfpoly, t0, t1, t2
.if \i == 0
	vpclmulqdq	$0x00, \a, \b, \t0	  // LO = a_L * b_L
	vpclmulqdq	$0x01, \a, \b, \t1	  // MI_0 = a_L * b_H
.elseif \i == 1
	vpclmulqdq	$0x10, \a, \b, \t2	  // MI_1 = a_H * b_L
.elseif \i == 2
	vpxor		\t2, \t1, \t1		  // MI = MI_0 + MI_1
.elseif \i == 3
	vpclmulqdq	$0x01, \t0, \gfpoly, \t2  // LO_L*(x^63 + x^62 + x^57)
.elseif \i == 4
	vpshufd		$0x4e, \t0, \t0		  // Swap halves of LO
.elseif \i == 5
	vpxor		\t0, \t1, \t1		  // Fold LO into MI (part 1)
	vpxor		\t2, \t1, \t1		  // Fold LO into MI (part 2)
.elseif \i == 6
	vpclmulqdq	$0x11, \a, \b, \dst	  // HI = a_H * b_H
.elseif \i == 7
	vpclmulqdq	$0x01, \t1, \gfpoly, \t0  // MI_L*(x^63 + x^62 + x^57)
.elseif \i == 8
	vpshufd		$0x4e, \t1, \t1		  // Swap halves of MI
.elseif \i == 9
	vpxor		\t1, \dst, \dst		  // Fold MI into HI (part 1)
	vpxor		\t0, \dst, \dst		  // Fold MI into HI (part 2)
.endif
.endm

// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store
// the reduced products in \dst.  See _ghash_mul_step for full explanation.
.macro	_ghash_mul	a, b, dst, gfpoly, t0, t1, t2
.irp i, 0,1,2,3,4,5,6,7,8,9
	_ghash_mul_step	\i, \a, \b, \dst, \gfpoly, \t0, \t1, \t2
.endr
.endm

// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the
// *unreduced* products to \lo, \mi, and \hi.
.macro	_ghash_mul_noreduce	a, b, lo, mi, hi, t0
	vpclmulqdq	$0x00, \a, \b, \t0	// a_L * b_L
	vpxor		\t0, \lo, \lo
	vpclmulqdq	$0x01, \a, \b, \t0	// a_L * b_H
	vpxor		\t0, \mi, \mi
	vpclmulqdq	$0x10, \a, \b, \t0	// a_H * b_L
	vpxor		\t0, \mi, \mi
	vpclmulqdq	$0x11, \a, \b, \t0	// a_H * b_H
	vpxor		\t0, \hi, \hi
.endm

// Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit
// reduced products in \hi.  See _ghash_mul_step for explanation of reduction.
.macro	_ghash_reduce	lo, mi, hi, gfpoly, t0
	vpclmulqdq	$0x01, \lo, \gfpoly, \t0
	vpshufd		$0x4e, \lo, \lo
	vpxor		\lo, \mi, \mi
	vpxor		\t0, \mi, \mi
	vpclmulqdq	$0x01, \mi, \gfpoly, \t0
	vpshufd		$0x4e, \mi, \mi
	vpxor		\mi, \hi, \hi
	vpxor		\t0, \hi, \hi
.endm

// This is a specialized version of _ghash_mul that computes \a * \a, i.e. it
// squares \a.  It skips computing MI = (a_L * a_H) + (a_H * a_L) = 0.
.macro	_ghash_square	a, dst, gfpoly, t0, t1
	vpclmulqdq	$0x00, \a, \a, \t0	// LO = a_L * a_L
	vpclmulqdq	$0x11, \a, \a, \dst	// HI = a_H * a_H
	vpclmulqdq	$0x01, \t0, \gfpoly, \t1 // LO_L*(x^63 + x^62 + x^57)
	vpshufd		$0x4e, \t0, \t0		// Swap halves of LO
	vpxor		\t0, \t1, \t1		// Fold LO into MI
	vpclmulqdq	$0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57)
	vpshufd		$0x4e, \t1, \t1		// Swap halves of MI
	vpxor		\t1, \dst, \dst		// Fold MI into HI (part 1)
	vpxor		\t0, \dst, \dst		// Fold MI into HI (part 2)
.endm

// void aes_gcm_precompute_vaes_avx2(struct aes_gcm_key_vaes_avx2 *key);
//
// Given the expanded AES key |key->base.aes_key|, derive the GHASH subkey and
// initialize |key->h_powers| and |key->h_powers_xored|.
//
// We use h_powers[0..7] to store H^8 through H^1, and h_powers_xored[0..7] to
// store the 64-bit halves of the key powers XOR'd together (for Karatsuba
// multiplication) in the order 8,6,7,5,4,2,3,1.
SYM_FUNC_START(aes_gcm_precompute_vaes_avx2)

	// Function arguments
	.set	KEY,		%rdi

	// Additional local variables
	.set	POWERS_PTR,	%rsi
	.set	RNDKEYLAST_PTR,	%rdx
	.set	TMP0,		%ymm0
	.set	TMP0_XMM,	%xmm0
	.set	TMP1,		%ymm1
	.set	TMP1_XMM,	%xmm1
	.set	TMP2,		%ymm2
	.set	TMP2_XMM,	%xmm2
	.set	H_CUR,		%ymm3
	.set	H_CUR_XMM,	%xmm3
	.set	H_CUR2,		%ymm4
	.set	H_INC,		%ymm5
	.set	H_INC_XMM,	%xmm5
	.set	GFPOLY,		%ymm6
	.set	GFPOLY_XMM,	%xmm6

	// Encrypt an all-zeroes block to get the raw hash subkey.
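	// (Per the GCM specification, the hash subkey H is the encryption of
	// the all-zeroes block with the AES key, so running the full set of
	// round keys over a zero block below yields the raw value of H.)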
	movl		OFFSETOF_AESKEYLEN(KEY), %eax
	lea		6*16(KEY,%rax,4), RNDKEYLAST_PTR
	vmovdqu		(KEY), H_CUR_XMM	// Zero-th round key XOR all-zeroes block
	lea		16(KEY), %rax
1:
	vaesenc		(%rax), H_CUR_XMM, H_CUR_XMM
	add		$16, %rax
	cmp		%rax, RNDKEYLAST_PTR
	jne		1b
	vaesenclast	(RNDKEYLAST_PTR), H_CUR_XMM, H_CUR_XMM

	// Reflect the bytes of the raw hash subkey.
	vpshufb		.Lbswap_mask(%rip), H_CUR_XMM, H_CUR_XMM

	// Finish preprocessing the byte-reflected hash subkey by multiplying it
	// by x^-1 ("standard" interpretation of polynomial coefficients) or
	// equivalently x^1 (natural interpretation).  This gets the key into a
	// format that avoids having to bit-reflect the data blocks later.
	vpshufd		$0xd3, H_CUR_XMM, TMP0_XMM
	vpsrad		$31, TMP0_XMM, TMP0_XMM
	vpaddq		H_CUR_XMM, H_CUR_XMM, H_CUR_XMM
	vpand		.Lgfpoly_and_internal_carrybit(%rip), TMP0_XMM, TMP0_XMM
	vpxor		TMP0_XMM, H_CUR_XMM, H_CUR_XMM

	// Load the gfpoly constant.
	vbroadcasti128	.Lgfpoly(%rip), GFPOLY

	// Square H^1 to get H^2.
	_ghash_square	H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, TMP0_XMM, TMP1_XMM

	// Create H_CUR = [H^2, H^1] and H_INC = [H^2, H^2].
	vinserti128	$1, H_CUR_XMM, H_INC, H_CUR
	vinserti128	$1, H_INC_XMM, H_INC, H_INC

	// Compute H_CUR2 = [H^4, H^3].
	_ghash_mul	H_INC, H_CUR, H_CUR2, GFPOLY, TMP0, TMP1, TMP2

	// Store [H^2, H^1] and [H^4, H^3].
	vmovdqu		H_CUR, OFFSETOF_H_POWERS+3*32(KEY)
	vmovdqu		H_CUR2, OFFSETOF_H_POWERS+2*32(KEY)

	// For Karatsuba multiplication: compute and store the two 64-bit halves
	// of each key power XOR'd together.  Order is 4,2,3,1.
	vpunpcklqdq	H_CUR, H_CUR2, TMP0
	vpunpckhqdq	H_CUR, H_CUR2, TMP1
	vpxor		TMP1, TMP0, TMP0
	vmovdqu		TMP0, OFFSETOF_H_POWERS_XORED+32(KEY)

	// Compute and store H_CUR = [H^6, H^5] and H_CUR2 = [H^8, H^7].
	_ghash_mul	H_INC, H_CUR2, H_CUR, GFPOLY, TMP0, TMP1, TMP2
	_ghash_mul	H_INC, H_CUR, H_CUR2, GFPOLY, TMP0, TMP1, TMP2
	vmovdqu		H_CUR, OFFSETOF_H_POWERS+1*32(KEY)
	vmovdqu		H_CUR2, OFFSETOF_H_POWERS+0*32(KEY)

	// Again, compute and store the two 64-bit halves of each key power
	// XOR'd together.  Order is 8,6,7,5.
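	// (vpunpcklqdq gathers the low 64-bit halves of the four key powers
	// and vpunpckhqdq the high halves; the vpxor therefore produces
	// h_L XOR h_H for each power, the factor that Karatsuba's middle term
	// (a_L XOR a_H) * (h_L XOR h_H) requires.)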
	vpunpcklqdq	H_CUR, H_CUR2, TMP0
	vpunpckhqdq	H_CUR, H_CUR2, TMP1
	vpxor		TMP1, TMP0, TMP0
	vmovdqu		TMP0, OFFSETOF_H_POWERS_XORED(KEY)

	vzeroupper
	RET
SYM_FUNC_END(aes_gcm_precompute_vaes_avx2)

// Do one step of the GHASH update of four vectors of data blocks.
//   \i: the step to do, 0 through 9
//   \ghashdata_ptr: pointer to the data blocks (ciphertext or AAD)
//   KEY: pointer to struct aes_gcm_key_vaes_avx2
//   BSWAP_MASK: mask for reflecting the bytes of blocks
//   H_POW[2-1]_XORED: cached values from KEY->h_powers_xored
//   TMP[0-2]: temporary registers.  TMP[1-2] must be preserved across steps.
//   LO, MI: working state for this macro that must be preserved across steps
//   GHASH_ACC: the GHASH accumulator (input/output)
.macro	_ghash_step_4x	i, ghashdata_ptr
	.set	HI, GHASH_ACC		# alias
	.set	HI_XMM, GHASH_ACC_XMM
.if \i == 0
	// First vector
	vmovdqu		0*32(\ghashdata_ptr), TMP1
	vpshufb		BSWAP_MASK, TMP1, TMP1
	vmovdqu		OFFSETOF_H_POWERS+0*32(KEY), TMP2
	vpxor		GHASH_ACC, TMP1, TMP1
	vpclmulqdq	$0x00, TMP2, TMP1, LO
	vpclmulqdq	$0x11, TMP2, TMP1, HI
	vpunpckhqdq	TMP1, TMP1, TMP0
	vpxor		TMP1, TMP0, TMP0
	vpclmulqdq	$0x00, H_POW2_XORED, TMP0, MI
.elseif \i == 1
.elseif \i == 2
	// Second vector
	vmovdqu		1*32(\ghashdata_ptr), TMP1
	vpshufb		BSWAP_MASK, TMP1, TMP1
	vmovdqu		OFFSETOF_H_POWERS+1*32(KEY), TMP2
	vpclmulqdq	$0x00, TMP2, TMP1, TMP0
	vpxor		TMP0, LO, LO
	vpclmulqdq	$0x11, TMP2, TMP1, TMP0
	vpxor		TMP0, HI, HI
	vpunpckhqdq	TMP1, TMP1, TMP0
	vpxor		TMP1, TMP0, TMP0
	vpclmulqdq	$0x10, H_POW2_XORED, TMP0, TMP0
	vpxor		TMP0, MI, MI
.elseif \i == 3
	// Third vector
	vmovdqu		2*32(\ghashdata_ptr), TMP1
	vpshufb		BSWAP_MASK, TMP1, TMP1
	vmovdqu		OFFSETOF_H_POWERS+2*32(KEY), TMP2
.elseif \i == 4
	vpclmulqdq	$0x00, TMP2, TMP1, TMP0
	vpxor		TMP0, LO, LO
	vpclmulqdq	$0x11, TMP2, TMP1, TMP0
	vpxor		TMP0, HI, HI
.elseif \i == 5
	vpunpckhqdq	TMP1, TMP1, TMP0
	vpxor		TMP1, TMP0, TMP0
	vpclmulqdq	$0x00, H_POW1_XORED, TMP0, TMP0
	vpxor		TMP0, MI, MI

	// Fourth vector
	vmovdqu		3*32(\ghashdata_ptr), TMP1
	vpshufb		BSWAP_MASK, TMP1, TMP1
.elseif \i == 6
	vmovdqu		OFFSETOF_H_POWERS+3*32(KEY), TMP2
	vpclmulqdq	$0x00, TMP2, TMP1, TMP0
	vpxor		TMP0, LO, LO
	vpclmulqdq	$0x11, TMP2, TMP1, TMP0
	vpxor		TMP0, HI, HI
	vpunpckhqdq	TMP1, TMP1, TMP0
	vpxor		TMP1, TMP0, TMP0
	vpclmulqdq	$0x10, H_POW1_XORED, TMP0, TMP0
	vpxor		TMP0, MI, MI
.elseif \i == 7
	// Finalize 'mi' following Karatsuba multiplication.
	vpxor		LO, MI, MI
	vpxor		HI, MI, MI

	// Fold lo into mi.
	vbroadcasti128	.Lgfpoly(%rip), TMP2
	vpclmulqdq	$0x01, LO, TMP2, TMP0
	vpshufd		$0x4e, LO, LO
	vpxor		LO, MI, MI
	vpxor		TMP0, MI, MI
.elseif \i == 8
	// Fold mi into hi.
	vpclmulqdq	$0x01, MI, TMP2, TMP0
	vpshufd		$0x4e, MI, MI
	vpxor		MI, HI, HI
	vpxor		TMP0, HI, HI
.elseif \i == 9
	vextracti128	$1, HI, TMP0_XMM
	vpxor		TMP0_XMM, HI_XMM, GHASH_ACC_XMM
.endif
.endm

// Update GHASH with four vectors of data blocks.  See _ghash_step_4x for full
// explanation.
.macro	_ghash_4x	ghashdata_ptr
.irp i, 0,1,2,3,4,5,6,7,8,9
	_ghash_step_4x	\i, \ghashdata_ptr
.endr
.endm

// Load 1 <= %ecx <= 16 bytes from the pointer \src into the xmm register \dst
// and zeroize any remaining bytes.  Clobbers %rax, %rcx, and \tmp{64,32}.
.macro	_load_partial_block	src, dst, tmp64, tmp32
	sub		$8, %ecx		// LEN - 8
	jle		.Lle8\@

	// Load 9 <= LEN <= 16 bytes.
	vmovq		(\src), \dst		// Load first 8 bytes
	mov		(\src, %rcx), %rax	// Load last 8 bytes
	neg		%ecx
	shl		$3, %ecx
	shr		%cl, %rax		// Discard overlapping bytes
	vpinsrq		$1, %rax, \dst, \dst
	jmp		.Ldone\@

.Lle8\@:
	add		$4, %ecx		// LEN - 4
	jl		.Llt4\@

	// Load 4 <= LEN <= 8 bytes.
	mov		(\src), %eax		// Load first 4 bytes
	mov		(\src, %rcx), \tmp32	// Load last 4 bytes
	jmp		.Lcombine\@

.Llt4\@:
	// Load 1 <= LEN <= 3 bytes.
	add		$2, %ecx		// LEN - 2
	movzbl		(\src), %eax		// Load first byte
	jl		.Lmovq\@
	movzwl		(\src, %rcx), \tmp32	// Load last 2 bytes
.Lcombine\@:
	shl		$3, %ecx
	shl		%cl, \tmp64
	or		\tmp64, %rax		// Combine the two parts
.Lmovq\@:
	vmovq		%rax, \dst
.Ldone\@:
.endm

// Store 1 <= %ecx <= 16 bytes from the xmm register \src to the pointer \dst.
// Clobbers %rax, %rcx, and \tmp{64,32}.
.macro	_store_partial_block	src, dst, tmp64, tmp32
	sub		$8, %ecx		// LEN - 8
	jl		.Llt8\@

	// Store 8 <= LEN <= 16 bytes.
	vpextrq		$1, \src, %rax
	mov		%ecx, \tmp32
	shl		$3, %ecx
	ror		%cl, %rax
	mov		%rax, (\dst, \tmp64)	// Store last LEN - 8 bytes
	vmovq		\src, (\dst)		// Store first 8 bytes
	jmp		.Ldone\@

.Llt8\@:
	add		$4, %ecx		// LEN - 4
	jl		.Llt4\@

	// Store 4 <= LEN <= 7 bytes.
	vpextrd		$1, \src, %eax
	mov		%ecx, \tmp32
	shl		$3, %ecx
	ror		%cl, %eax
	mov		%eax, (\dst, \tmp64)	// Store last LEN - 4 bytes
	vmovd		\src, (\dst)		// Store first 4 bytes
	jmp		.Ldone\@

.Llt4\@:
	// Store 1 <= LEN <= 3 bytes.
	vpextrb		$0, \src, 0(\dst)
	cmp		$-2, %ecx		// LEN - 4 == -2, i.e. LEN == 2?
	jl		.Ldone\@
	vpextrb		$1, \src, 1(\dst)
	je		.Ldone\@
	vpextrb		$2, \src, 2(\dst)
.Ldone\@:
.endm

// void aes_gcm_aad_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
//				     u8 ghash_acc[16],
//				     const u8 *aad, int aadlen);
//
// This function processes the AAD (Additional Authenticated Data) in GCM.
// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
// data given by |aad| and |aadlen|.  On the first call, |ghash_acc| must be all
// zeroes.  |aadlen| must be a multiple of 16, except on the last call where it
// can be any length.  The caller must do any buffering needed to ensure this.
//
// This handles large amounts of AAD efficiently, while also keeping overhead
// low for small amounts which is the common case.  TLS and IPsec use less than
// one block of AAD, but (uncommonly) other use cases may use much more.
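//
// Internally, the AAD is consumed in 128-byte (four vector) chunks while at
// least 128 bytes remain, then in single 32-byte vectors, and finally any
// partial chunk of 17-31 or 1-16 bytes is handled using overlapping or
// partial-block loads.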
SYM_FUNC_START(aes_gcm_aad_update_vaes_avx2)

	// Function arguments
	.set	KEY,		%rdi
	.set	GHASH_ACC_PTR,	%rsi
	.set	AAD,		%rdx
	.set	AADLEN,		%ecx	// Must be %ecx for _load_partial_block
	.set	AADLEN64,	%rcx	// Zero-extend AADLEN before using!

	// Additional local variables.
	// %rax and %r8 are used as temporary registers.
	.set	TMP0,		%ymm0
	.set	TMP0_XMM,	%xmm0
	.set	TMP1,		%ymm1
	.set	TMP1_XMM,	%xmm1
	.set	TMP2,		%ymm2
	.set	TMP2_XMM,	%xmm2
	.set	LO,		%ymm3
	.set	LO_XMM,		%xmm3
	.set	MI,		%ymm4
	.set	MI_XMM,		%xmm4
	.set	GHASH_ACC,	%ymm5
	.set	GHASH_ACC_XMM,	%xmm5
	.set	BSWAP_MASK,	%ymm6
	.set	BSWAP_MASK_XMM,	%xmm6
	.set	GFPOLY,		%ymm7
	.set	GFPOLY_XMM,	%xmm7
	.set	H_POW2_XORED,	%ymm8
	.set	H_POW1_XORED,	%ymm9

	// Load the bswap_mask and gfpoly constants.  Since AADLEN is usually
	// small, usually only 128-bit vectors will be used.  So as an
	// optimization, don't broadcast these constants to both 128-bit lanes
	// quite yet.
	vmovdqu		.Lbswap_mask(%rip), BSWAP_MASK_XMM
	vmovdqu		.Lgfpoly(%rip), GFPOLY_XMM

	// Load the GHASH accumulator.
	vmovdqu		(GHASH_ACC_PTR), GHASH_ACC_XMM

	// Check for the common case of AADLEN <= 16, as well as AADLEN == 0.
	test		AADLEN, AADLEN
	jz		.Laad_done
	cmp		$16, AADLEN
	jle		.Laad_lastblock

	// AADLEN > 16, so we'll operate on full vectors.  Broadcast bswap_mask
	// and gfpoly to both 128-bit lanes.
	vinserti128	$1, BSWAP_MASK_XMM, BSWAP_MASK, BSWAP_MASK
	vinserti128	$1, GFPOLY_XMM, GFPOLY, GFPOLY

	// If AADLEN >= 128, update GHASH with 128 bytes of AAD at a time.
	add		$-128, AADLEN	// 128 is 4 bytes, -128 is 1 byte
	jl		.Laad_loop_4x_done
	vmovdqu		OFFSETOF_H_POWERS_XORED(KEY), H_POW2_XORED
	vmovdqu		OFFSETOF_H_POWERS_XORED+32(KEY), H_POW1_XORED
.Laad_loop_4x:
	_ghash_4x	AAD
	sub		$-128, AAD
	add		$-128, AADLEN
	jge		.Laad_loop_4x
.Laad_loop_4x_done:

	// If AADLEN >= 32, update GHASH with 32 bytes of AAD at a time.
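	// AADLEN is still biased by -128 from above; adding 96 re-biases it by
	// -32, so the loop below runs while at least 32 bytes of AAD remain.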
	add		$96, AADLEN
	jl		.Laad_loop_1x_done
.Laad_loop_1x:
	vmovdqu		(AAD), TMP0
	vpshufb		BSWAP_MASK, TMP0, TMP0
	vpxor		TMP0, GHASH_ACC, GHASH_ACC
	vmovdqu		OFFSETOFEND_H_POWERS-32(KEY), TMP0
	_ghash_mul	TMP0, GHASH_ACC, GHASH_ACC, GFPOLY, TMP1, TMP2, LO
	vextracti128	$1, GHASH_ACC, TMP0_XMM
	vpxor		TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
	add		$32, AAD
	sub		$32, AADLEN
	jge		.Laad_loop_1x
.Laad_loop_1x_done:
	add		$32, AADLEN
	// Now 0 <= AADLEN < 32.

	jz		.Laad_done
	cmp		$16, AADLEN
	jle		.Laad_lastblock

	// Update GHASH with the remaining 17 <= AADLEN <= 31 bytes of AAD.
	mov		AADLEN, AADLEN	// Zero-extend AADLEN to AADLEN64.
	vmovdqu		(AAD), TMP0_XMM
	vmovdqu		-16(AAD, AADLEN64), TMP1_XMM
	vpshufb		BSWAP_MASK_XMM, TMP0_XMM, TMP0_XMM
	vpxor		TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
	lea		.Lrshift_and_bswap_table(%rip), %rax
	vpshufb		-16(%rax, AADLEN64), TMP1_XMM, TMP1_XMM
	vinserti128	$1, TMP1_XMM, GHASH_ACC, GHASH_ACC
	vmovdqu		OFFSETOFEND_H_POWERS-32(KEY), TMP0
	_ghash_mul	TMP0, GHASH_ACC, GHASH_ACC, GFPOLY, TMP1, TMP2, LO
	vextracti128	$1, GHASH_ACC, TMP0_XMM
	vpxor		TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
	jmp		.Laad_done

.Laad_lastblock:
	// Update GHASH with the remaining 1 <= AADLEN <= 16 bytes of AAD.
	_load_partial_block	AAD, TMP0_XMM, %r8, %r8d
	vpshufb		BSWAP_MASK_XMM, TMP0_XMM, TMP0_XMM
	vpxor		TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
	vmovdqu		OFFSETOFEND_H_POWERS-16(KEY), TMP0_XMM
	_ghash_mul	TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM, GFPOLY_XMM, \
			TMP1_XMM, TMP2_XMM, LO_XMM

.Laad_done:
	// Store the updated GHASH accumulator back to memory.
	vmovdqu		GHASH_ACC_XMM, (GHASH_ACC_PTR)

	vzeroupper
	RET
SYM_FUNC_END(aes_gcm_aad_update_vaes_avx2)

// Do one non-last round of AES encryption on the blocks in the given AESDATA
// vectors using the round key that has been broadcast to all 128-bit lanes of
// \round_key.
.macro	_vaesenc	round_key, vecs:vararg
.irp i, \vecs
	vaesenc		\round_key, AESDATA\i, AESDATA\i
.endr
.endm

// Generate counter blocks in the given AESDATA vectors, then do the zero-th AES
// round on them.  Clobbers TMP0.
.macro	_ctr_begin	vecs:vararg
	vbroadcasti128	.Linc_2blocks(%rip), TMP0
.irp i, \vecs
	vpshufb		BSWAP_MASK, LE_CTR, AESDATA\i
	vpaddd		TMP0, LE_CTR, LE_CTR
.endr
.irp i, \vecs
	vpxor		RNDKEY0, AESDATA\i, AESDATA\i
.endr
.endm

// Generate and encrypt counter blocks in the given AESDATA vectors, excluding
// the last AES round.  Clobbers %rax and TMP0.
.macro	_aesenc_loop	vecs:vararg
	_ctr_begin	\vecs
	lea		16(KEY), %rax
.Laesenc_loop\@:
	vbroadcasti128	(%rax), TMP0
	_vaesenc	TMP0, \vecs
	add		$16, %rax
	cmp		%rax, RNDKEYLAST_PTR
	jne		.Laesenc_loop\@
.endm

// Finalize the keystream blocks in the given AESDATA vectors by doing the last
// AES round, then XOR those keystream blocks with the corresponding data.
// Reduce latency by doing the XOR before the vaesenclast, utilizing the
// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).  Clobbers TMP0.
.macro	_aesenclast_and_xor	vecs:vararg
.irp i, \vecs
	vpxor		\i*32(SRC), RNDKEYLAST, TMP0
	vaesenclast	TMP0, AESDATA\i, AESDATA\i
.endr
.irp i, \vecs
	vmovdqu		AESDATA\i, \i*32(DST)
.endr
.endm

// void aes_gcm_{enc,dec}_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
//					   const u32 le_ctr[4], u8 ghash_acc[16],
//					   const u8 *src, u8 *dst, int datalen);
//
// This macro generates a GCM encryption or decryption update function with the
// above prototype (with \enc selecting which one).  The function computes the
// next portion of the CTR keystream, XOR's it with |datalen| bytes from |src|,
// and writes the resulting encrypted or decrypted data to |dst|.  It also
// updates the GHASH accumulator |ghash_acc| using the next |datalen| ciphertext
// bytes.
//
// |datalen| must be a multiple of 16, except on the last call where it can be
// any length.  The caller must do any buffering needed to ensure this.  Both
// in-place and out-of-place en/decryption are supported.
//
// |le_ctr| must give the current counter in little-endian format.  This
// function loads the counter from |le_ctr| and increments the loaded counter as
// needed, but it does *not* store the updated counter back to |le_ctr|.  The
// caller must update |le_ctr| if any more data segments follow.  Internally,
// only the low 32-bit word of the counter is incremented, following the GCM
// standard.
.macro	_aes_gcm_update	enc

	// Function arguments
	.set	KEY,		%rdi
	.set	LE_CTR_PTR,	%rsi
	.set	LE_CTR_PTR32,	%esi
	.set	GHASH_ACC_PTR,	%rdx
	.set	SRC,		%rcx	// Assumed to be %rcx.
					// See .Ltail_xor_and_ghash_1to16bytes
	.set	DST,		%r8
	.set	DATALEN,	%r9d
	.set	DATALEN64,	%r9	// Zero-extend DATALEN before using!

	// Additional local variables

	// %rax is used as a temporary register.  LE_CTR_PTR is also available
	// as a temporary register after the counter is loaded.

	// AES key length in bytes
	.set	AESKEYLEN,	%r10d
	.set	AESKEYLEN64,	%r10

	// Pointer to the last AES round key for the chosen AES variant
	.set	RNDKEYLAST_PTR,	%r11

	// BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
	// using vpshufb, copied to all 128-bit lanes.
	.set	BSWAP_MASK,	%ymm0
	.set	BSWAP_MASK_XMM,	%xmm0

	// GHASH_ACC is the accumulator variable for GHASH.  When fully reduced,
	// only the lowest 128-bit lane can be nonzero.  When not fully reduced,
	// more than one lane may be used, and they need to be XOR'd together.
	.set	GHASH_ACC,	%ymm1
	.set	GHASH_ACC_XMM,	%xmm1

	// TMP[0-2] are temporary registers.
	.set	TMP0,		%ymm2
	.set	TMP0_XMM,	%xmm2
	.set	TMP1,		%ymm3
	.set	TMP1_XMM,	%xmm3
	.set	TMP2,		%ymm4
	.set	TMP2_XMM,	%xmm4

	// LO and MI are used to accumulate unreduced GHASH products.
	.set	LO,		%ymm5
	.set	LO_XMM,		%xmm5
	.set	MI,		%ymm6
	.set	MI_XMM,		%xmm6

	// H_POW[2-1]_XORED contain cached values from KEY->h_powers_xored.  The
	// descending numbering reflects the order of the key powers.
	.set	H_POW2_XORED,	%ymm7
	.set	H_POW2_XORED_XMM, %xmm7
	.set	H_POW1_XORED,	%ymm8

	// RNDKEY0 caches the zero-th round key, and RNDKEYLAST the last one.
	.set	RNDKEY0,	%ymm9
	.set	RNDKEYLAST,	%ymm10

	// LE_CTR contains the next set of little-endian counter blocks.
	.set	LE_CTR,		%ymm11

	// AESDATA[0-3] hold the counter blocks that are being encrypted by AES.
	.set	AESDATA0,	%ymm12
	.set	AESDATA0_XMM,	%xmm12
	.set	AESDATA1,	%ymm13
	.set	AESDATA1_XMM,	%xmm13
	.set	AESDATA2,	%ymm14
	.set	AESDATA3,	%ymm15

.if \enc
	.set	GHASHDATA_PTR,	DST
.else
	.set	GHASHDATA_PTR,	SRC
.endif

	vbroadcasti128	.Lbswap_mask(%rip), BSWAP_MASK

	// Load the GHASH accumulator and the starting counter.
	vmovdqu		(GHASH_ACC_PTR), GHASH_ACC_XMM
	vbroadcasti128	(LE_CTR_PTR), LE_CTR

	// Load the AES key length in bytes.
	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN

	// Make RNDKEYLAST_PTR point to the last AES round key.  This is the
	// round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256
	// respectively.  Then load the zero-th and last round keys.
	lea		6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR
	vbroadcasti128	(KEY), RNDKEY0
	vbroadcasti128	(RNDKEYLAST_PTR), RNDKEYLAST

	// Finish initializing LE_CTR by adding 1 to the second block.
	vpaddd		.Lctr_pattern(%rip), LE_CTR, LE_CTR

	// If there are at least 128 bytes of data, then continue into the loop
	// that processes 128 bytes of data at a time.  Otherwise skip it.
	add		$-128, DATALEN	// 128 is 4 bytes, -128 is 1 byte
	jl		.Lcrypt_loop_4x_done\@

	vmovdqu		OFFSETOF_H_POWERS_XORED(KEY), H_POW2_XORED
	vmovdqu		OFFSETOF_H_POWERS_XORED+32(KEY), H_POW1_XORED

	// Main loop: en/decrypt and hash 4 vectors (128 bytes) at a time.

.if \enc
	// Encrypt the first 4 vectors of plaintext blocks.
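	// (For encryption, the GHASH input is the ciphertext that this
	// function itself produces.  Encrypting the first 4 vectors up front
	// lets each iteration of the main loop hash the ciphertext written by
	// the previous iteration while encrypting the next one; the final 128
	// bytes of ciphertext are then hashed at .Lghash_last_ciphertext_4x.
	// For decryption, the ciphertext is already available in the source
	// buffer, so no such staggering is needed.)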
	_aesenc_loop	0,1,2,3
	_aesenclast_and_xor	0,1,2,3
	sub		$-128, SRC	// 128 is 4 bytes, -128 is 1 byte
	add		$-128, DATALEN
	jl		.Lghash_last_ciphertext_4x\@
.endif

.align 16
.Lcrypt_loop_4x\@:

	// Start the AES encryption of the counter blocks.
	_ctr_begin	0,1,2,3
	cmp		$24, AESKEYLEN
	jl		128f		// AES-128?
	je		192f		// AES-192?
	// AES-256
	vbroadcasti128	-13*16(RNDKEYLAST_PTR), TMP0
	_vaesenc	TMP0, 0,1,2,3
	vbroadcasti128	-12*16(RNDKEYLAST_PTR), TMP0
	_vaesenc	TMP0, 0,1,2,3
192:
	vbroadcasti128	-11*16(RNDKEYLAST_PTR), TMP0
	_vaesenc	TMP0, 0,1,2,3
	vbroadcasti128	-10*16(RNDKEYLAST_PTR), TMP0
	_vaesenc	TMP0, 0,1,2,3
128:

	// Finish the AES encryption of the counter blocks in AESDATA[0-3],
	// interleaved with the GHASH update of the ciphertext blocks.
.irp i, 9,8,7,6,5,4,3,2,1
	_ghash_step_4x	(9 - \i), GHASHDATA_PTR
	vbroadcasti128	-\i*16(RNDKEYLAST_PTR), TMP0
	_vaesenc	TMP0, 0,1,2,3
.endr
	_ghash_step_4x	9, GHASHDATA_PTR
.if \enc
	sub		$-128, DST	// 128 is 4 bytes, -128 is 1 byte
.endif
	_aesenclast_and_xor	0,1,2,3
	sub		$-128, SRC
.if !\enc
	sub		$-128, DST
.endif
	add		$-128, DATALEN
	jge		.Lcrypt_loop_4x\@

.if \enc
.Lghash_last_ciphertext_4x\@:
	// Update GHASH with the last set of ciphertext blocks.
	_ghash_4x	DST
	sub		$-128, DST
.endif

.Lcrypt_loop_4x_done\@:

	// Undo the extra subtraction by 128 and check whether data remains.
	sub		$-128, DATALEN	// 128 is 4 bytes, -128 is 1 byte
	jz		.Ldone\@

	// The data length isn't a multiple of 128 bytes.  Process the remaining
	// data of length 1 <= DATALEN < 128.
	//
	// Since there are enough key powers available for all remaining data,
	// there is no need to do a GHASH reduction after each iteration.
	// Instead, multiply each remaining block by its own key power, and only
	// do a GHASH reduction at the very end.

	// Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N
	// is the number of blocks that remain.
	.set	POWERS_PTR,	LE_CTR_PTR	// LE_CTR_PTR is free to be reused.
	.set	POWERS_PTR32,	LE_CTR_PTR32
	mov		DATALEN, %eax
	neg		%rax
	and		$~15, %rax	// -round_up(DATALEN, 16)
	lea		OFFSETOFEND_H_POWERS(KEY,%rax), POWERS_PTR

	// Start collecting the unreduced GHASH intermediate value LO, MI, HI.
	.set	HI, H_POW2_XORED	// H_POW2_XORED is free to be reused.
	.set	HI_XMM, H_POW2_XORED_XMM
	vpxor		LO_XMM, LO_XMM, LO_XMM
	vpxor		MI_XMM, MI_XMM, MI_XMM
	vpxor		HI_XMM, HI_XMM, HI_XMM

	// 1 <= DATALEN < 128.  Generate 2 or 4 more vectors of keystream blocks
	// excluding the last AES round, depending on the remaining DATALEN.
	cmp		$64, DATALEN
	jg		.Ltail_gen_4_keystream_vecs\@
	_aesenc_loop	0,1
	cmp		$32, DATALEN
	jge		.Ltail_xor_and_ghash_full_vec_loop\@
	jmp		.Ltail_xor_and_ghash_partial_vec\@
.Ltail_gen_4_keystream_vecs\@:
	_aesenc_loop	0,1,2,3

	// XOR the remaining data and accumulate the unreduced GHASH products
	// for DATALEN >= 32, starting with one full 32-byte vector at a time.
.Ltail_xor_and_ghash_full_vec_loop\@:
.if \enc
	_aesenclast_and_xor	0
	vpshufb		BSWAP_MASK, AESDATA0, AESDATA0
.else
	vmovdqu		(SRC), TMP1
	vpxor		TMP1, RNDKEYLAST, TMP0
	vaesenclast	TMP0, AESDATA0, AESDATA0
	vmovdqu		AESDATA0, (DST)
	vpshufb		BSWAP_MASK, TMP1, AESDATA0
.endif
	// The ciphertext blocks (i.e. GHASH input data) are now in AESDATA0.
	vpxor		GHASH_ACC, AESDATA0, AESDATA0
	vmovdqu		(POWERS_PTR), TMP2
	_ghash_mul_noreduce	TMP2, AESDATA0, LO, MI, HI, TMP0
	vmovdqa		AESDATA1, AESDATA0
	vmovdqa		AESDATA2, AESDATA1
	vmovdqa		AESDATA3, AESDATA2
	vpxor		GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
	add		$32, SRC
	add		$32, DST
	add		$32, POWERS_PTR
	sub		$32, DATALEN
	cmp		$32, DATALEN
	jge		.Ltail_xor_and_ghash_full_vec_loop\@
	test		DATALEN, DATALEN
	jz		.Ltail_ghash_reduce\@

.Ltail_xor_and_ghash_partial_vec\@:
	// XOR the remaining data and accumulate the unreduced GHASH products,
	// for 1 <= DATALEN < 32.
	vaesenclast	RNDKEYLAST, AESDATA0, AESDATA0
	cmp		$16, DATALEN
	jle		.Ltail_xor_and_ghash_1to16bytes\@

	// Handle 17 <= DATALEN < 32.

	// Load a vpshufb mask that will right-shift by '32 - DATALEN' bytes
	// (shifting in zeroes), then reflect all 16 bytes.
	lea		.Lrshift_and_bswap_table(%rip), %rax
	vmovdqu		-16(%rax, DATALEN64), TMP2_XMM

	// Move the second keystream block to its own register and left-align it
	vextracti128	$1, AESDATA0, AESDATA1_XMM
	vpxor		.Lfifteens(%rip), TMP2_XMM, TMP0_XMM
	vpshufb		TMP0_XMM, AESDATA1_XMM, AESDATA1_XMM

	// Using overlapping loads and stores, XOR the source data with the
	// keystream and write the destination data.  Then prepare the GHASH
	// input data: the full ciphertext block and the zero-padded partial
	// ciphertext block, both byte-reflected, in AESDATA0.
.if \enc
	vpxor		-16(SRC, DATALEN64), AESDATA1_XMM, AESDATA1_XMM
	vpxor		(SRC), AESDATA0_XMM, AESDATA0_XMM
	vmovdqu		AESDATA1_XMM, -16(DST, DATALEN64)
	vmovdqu		AESDATA0_XMM, (DST)
	vpshufb		TMP2_XMM, AESDATA1_XMM, AESDATA1_XMM
	vpshufb		BSWAP_MASK_XMM, AESDATA0_XMM, AESDATA0_XMM
.else
	vmovdqu		-16(SRC, DATALEN64), TMP1_XMM
	vmovdqu		(SRC), TMP0_XMM
	vpxor		TMP1_XMM, AESDATA1_XMM, AESDATA1_XMM
	vpxor		TMP0_XMM, AESDATA0_XMM, AESDATA0_XMM
	vmovdqu		AESDATA1_XMM, -16(DST, DATALEN64)
	vmovdqu		AESDATA0_XMM, (DST)
	vpshufb		TMP2_XMM, TMP1_XMM, AESDATA1_XMM
	vpshufb		BSWAP_MASK_XMM, TMP0_XMM, AESDATA0_XMM
.endif
	vpxor		GHASH_ACC_XMM, AESDATA0_XMM, AESDATA0_XMM
	vinserti128	$1, AESDATA1_XMM, AESDATA0, AESDATA0
	vmovdqu		(POWERS_PTR), TMP2
	jmp		.Ltail_ghash_last_vec\@

.Ltail_xor_and_ghash_1to16bytes\@:
	// Handle 1 <= DATALEN <= 16.  Carefully load and store the
	// possibly-partial block, which we mustn't access out of bounds.
	vmovdqu		(POWERS_PTR), TMP2_XMM
	mov		SRC, KEY	// Free up %rcx, assuming SRC == %rcx
	mov		DATALEN, %ecx
	_load_partial_block	KEY, TMP0_XMM, POWERS_PTR, POWERS_PTR32
	vpxor		TMP0_XMM, AESDATA0_XMM, AESDATA0_XMM
	mov		DATALEN, %ecx
	_store_partial_block	AESDATA0_XMM, DST, POWERS_PTR, POWERS_PTR32
.if \enc
	lea		.Lselect_high_bytes_table(%rip), %rax
	vpshufb		BSWAP_MASK_XMM, AESDATA0_XMM, AESDATA0_XMM
	vpand		(%rax, DATALEN64), AESDATA0_XMM, AESDATA0_XMM
.else
	vpshufb		BSWAP_MASK_XMM, TMP0_XMM, AESDATA0_XMM
.endif
	vpxor		GHASH_ACC_XMM, AESDATA0_XMM, AESDATA0_XMM

.Ltail_ghash_last_vec\@:
	// Accumulate the unreduced GHASH products for the last 1-2 blocks.  The
	// GHASH input data is in AESDATA0.  If only one block remains, then the
	// second block in AESDATA0 is zero and does not affect the result.
	_ghash_mul_noreduce	TMP2, AESDATA0, LO, MI, HI, TMP0

.Ltail_ghash_reduce\@:
	// Finally, do the GHASH reduction.
	vbroadcasti128	.Lgfpoly(%rip), TMP0
	_ghash_reduce	LO, MI, HI, TMP0, TMP1
	vextracti128	$1, HI, GHASH_ACC_XMM
	vpxor		HI_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM

.Ldone\@:
	// Store the updated GHASH accumulator back to memory.
	vmovdqu		GHASH_ACC_XMM, (GHASH_ACC_PTR)

	vzeroupper
	RET
.endm

// void aes_gcm_enc_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
//				    const u32 le_ctr[4], u8 ghash_acc[16],
//				    u64 total_aadlen, u64 total_datalen);
// bool aes_gcm_dec_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
//				    const u32 le_ctr[4], const u8 ghash_acc[16],
//				    u64 total_aadlen, u64 total_datalen,
//				    const u8 tag[16], int taglen);
//
// This macro generates one of the above two functions (with \enc selecting
// which one).  Both functions finish computing the GCM authentication tag by
// updating GHASH with the lengths block and encrypting the GHASH accumulator.
// |total_aadlen| and |total_datalen| must be the total length of the additional
// authenticated data and the en/decrypted data in bytes, respectively.
//
// The encryption function then stores the full-length (16-byte) computed
// authentication tag to |ghash_acc|.  The decryption function instead loads the
// The decryption function instead loads the expected authentication tag (the
// one that was transmitted) from the 16-byte buffer |tag|, compares the first
// 4 <= |taglen| <= 16 bytes of it to the computed tag in constant time, and
// returns true if and only if they match.
.macro	_aes_gcm_final	enc

	// Function arguments
	.set	KEY,		%rdi
	.set	LE_CTR_PTR,	%rsi
	.set	GHASH_ACC_PTR,	%rdx
	.set	TOTAL_AADLEN,	%rcx
	.set	TOTAL_DATALEN,	%r8
	.set	TAG,		%r9
	.set	TAGLEN,		%r10d	// Originally at 8(%rsp)
	.set	TAGLEN64,	%r10

	// Additional local variables.
	// %rax and %xmm0-%xmm3 are used as temporary registers.
	.set	AESKEYLEN,	%r11d
	.set	AESKEYLEN64,	%r11
	.set	GFPOLY,		%xmm4
	.set	BSWAP_MASK,	%xmm5
	.set	LE_CTR,		%xmm6
	.set	GHASH_ACC,	%xmm7
	.set	H_POW1,		%xmm8

	// Load some constants.
	vmovdqa		.Lgfpoly(%rip), GFPOLY
	vmovdqa		.Lbswap_mask(%rip), BSWAP_MASK

	// Load the AES key length in bytes.
	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN

	// Set up a counter block with 1 in the low 32-bit word.  This is the
	// counter that produces the ciphertext needed to encrypt the auth tag.
	// GFPOLY has 1 in the low word, so grab the 1 from there using a blend.
	vpblendd	$0xe, (LE_CTR_PTR), GFPOLY, LE_CTR

	// Build the lengths block and XOR it with the GHASH accumulator.
	// Although the lengths block is defined as the AAD length followed by
	// the en/decrypted data length, both in big-endian byte order, a byte
	// reflection of the full block is needed because of the way we compute
	// GHASH (see _ghash_mul_step).  By using little-endian values in the
	// opposite order, we avoid having to reflect any bytes here.
	vmovq		TOTAL_DATALEN, %xmm0
	vpinsrq		$1, TOTAL_AADLEN, %xmm0, %xmm0
	vpsllq		$3, %xmm0, %xmm0	// Bytes to bits
	vpxor		(GHASH_ACC_PTR), %xmm0, GHASH_ACC

	// Load the first hash key power (H^1), which is stored last.
	vmovdqu		OFFSETOFEND_H_POWERS-16(KEY), H_POW1

	// Load TAGLEN if decrypting.
.if !\enc
	movl		8(%rsp), TAGLEN
.endif

	// Make %rax point to the last AES round key for the chosen AES variant.
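	// The round keys are stored at the start of the key struct (round key 0
	// is loaded from (KEY) just below), so the last round key lives at byte
	// offset 16*nrounds.  With nrounds = 6 + AESKEYLEN/4, that offset
	// equals 6*16 + 4*AESKEYLEN, which is what the lea computes:
	//
	//	AES-128 (AESKEYLEN=16): 6*16 + 4*16 = 160 = 10*16
	//	AES-192 (AESKEYLEN=24): 6*16 + 4*24 = 192 = 12*16
	//	AES-256 (AESKEYLEN=32): 6*16 + 4*32 = 224 = 14*16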
	lea		6*16(KEY,AESKEYLEN64,4), %rax

	// Start the AES encryption of the counter block by swapping the counter
	// block to big-endian and XOR-ing it with the zero-th AES round key.
	vpshufb		BSWAP_MASK, LE_CTR, %xmm0
	vpxor		(KEY), %xmm0, %xmm0

	// Complete the AES encryption and multiply GHASH_ACC by H^1.
	// Interleave the AES and GHASH instructions to improve performance.
	cmp		$24, AESKEYLEN
	jl		128f	// AES-128?
	je		192f	// AES-192?
	// AES-256
	vaesenc		-13*16(%rax), %xmm0, %xmm0
	vaesenc		-12*16(%rax), %xmm0, %xmm0
192:
	vaesenc		-11*16(%rax), %xmm0, %xmm0
	vaesenc		-10*16(%rax), %xmm0, %xmm0
128:
.irp i, 0,1,2,3,4,5,6,7,8
	_ghash_mul_step	\i, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
			%xmm1, %xmm2, %xmm3
	vaesenc		(\i-9)*16(%rax), %xmm0, %xmm0
.endr
	_ghash_mul_step	9, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
			%xmm1, %xmm2, %xmm3

	// Undo the byte reflection of the GHASH accumulator.
	vpshufb		BSWAP_MASK, GHASH_ACC, GHASH_ACC

	// Do the last AES round and XOR the resulting keystream block with the
	// GHASH accumulator to produce the full computed authentication tag.
	//
	// Reduce latency by taking advantage of the property vaesenclast(key,
	// a) ^ b == vaesenclast(key ^ b, a).  I.e., XOR GHASH_ACC into the last
	// round key, instead of XOR'ing the final AES output with GHASH_ACC.
	//
	// enc_final then returns the computed auth tag, while dec_final
	// compares it with the transmitted one and returns a bool.  To compare
	// the tags, dec_final XORs them together and uses vptest to check
	// whether the result is all-zeroes.  This should be constant-time.
	// dec_final folds this additional XOR of the expected tag into the same
	// vaesenclast optimization.
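	//
	// For reference, a rough C-level sketch of the dec_final path below
	// (the names here are purely illustrative, not real helpers):
	//
	//	// vaesenclast(k ^ b, a) == vaesenclast(k, a) ^ b, so this yields
	//	// (computed tag) ^ (expected tag):
	//	diff = vaesenclast(last_rndkey ^ ghash_acc ^ expected_tag, state);
	//	mask = select_low_bytes(taglen);   // bswap'd table entry
	//	return (diff & mask) == 0;         // vptest + sete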
.if \enc
	vpxor		(%rax), GHASH_ACC, %xmm1
	vaesenclast	%xmm1, %xmm0, GHASH_ACC
	vmovdqu		GHASH_ACC, (GHASH_ACC_PTR)
.else
	vpxor		(TAG), GHASH_ACC, GHASH_ACC
	vpxor		(%rax), GHASH_ACC, GHASH_ACC
	vaesenclast	GHASH_ACC, %xmm0, %xmm0
	lea		.Lselect_high_bytes_table(%rip), %rax
	vmovdqu		(%rax, TAGLEN64), %xmm1
	vpshufb		BSWAP_MASK, %xmm1, %xmm1	// select low bytes, not high
	xor		%eax, %eax
	vptest		%xmm1, %xmm0
	sete		%al
.endif
	// No need for vzeroupper here, since only xmm (128-bit) registers were
	// used.
	RET
.endm

SYM_FUNC_START(aes_gcm_enc_update_vaes_avx2)
	_aes_gcm_update	1
SYM_FUNC_END(aes_gcm_enc_update_vaes_avx2)
SYM_FUNC_START(aes_gcm_dec_update_vaes_avx2)
	_aes_gcm_update	0
SYM_FUNC_END(aes_gcm_dec_update_vaes_avx2)

SYM_FUNC_START(aes_gcm_enc_final_vaes_avx2)
	_aes_gcm_final	1
SYM_FUNC_END(aes_gcm_enc_final_vaes_avx2)
SYM_FUNC_START(aes_gcm_dec_final_vaes_avx2)
	_aes_gcm_final	0
SYM_FUNC_END(aes_gcm_dec_final_vaes_avx2)