Lines Matching +full:aes +full:- +full:gcm

1 /* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
3 // AES-NI optimized AES-GCM for x86_64
9 //------------------------------------------------------------------------------
11 // This file is dual-licensed, meaning that you can use it under your choice of
17 // http://www.apache.org/licenses/LICENSE-2.0
49 //------------------------------------------------------------------------------
51 // This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
52 // support the original set of AES instructions, i.e. AES-NI. Two
55 // that the AVX implementation takes advantage of VEX-coded instructions in some
57 // implementation does *not* use 256-bit vectors, as AES is not supported on
58 // 256-bit vectors until the VAES feature (which this file doesn't target).
60 // The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1
62 // there are no CPUs that support AES-NI without also supporting PCLMULQDQ and SSE4.1.)
64 // The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is
67 // - The vector length is fixed at 128-bit, i.e. xmm registers. This means
68 // there is only one AES block (and GHASH block) per register.
70 // - Without AVX512 / AVX10, only 16 SIMD registers are available instead of
74 // - Masking is not available either. We work around this by implementing
78 // - The main loop is organized differently due to the different design
79 // constraints. First, with just one AES block per SIMD register, on some
81 // do an 8-register wide loop. Considering that and the fact that we have
82 // just 16 SIMD registers to work with, it's not feasible to cache AES
89 // - We implement the GHASH multiplications in the main loop using Karatsuba
91 // pclmulqdq instruction per block, at the cost of one 64-bit load, one
92 // pshufd, and 0.25 pxors per block. (This is without the three-argument
98 // 0.25 pxors are cheaper than a pclmulqdq. (We assume that the 64-bit
103 // low-power side. On some of these CPUs, pclmulqdq is quite slow, and the
110 // Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%.
115 // saved by using a multiplication-less reduction method. We don't do that
124 // multi-block processing we use Karatsuba multiplication with a regular
125 // reduction. For single-block processing, we use the x^64 optimization.
139 // Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of
211 sub $8, %ecx // LEN - 8
224 add $4, %ecx // LEN - 4
234 add $2, %ecx // LEN - 2
250 sub $8, %ecx // LEN - 8
258 mov %rax, (\dst, %rsi) // Store last LEN - 8 bytes
263 add $4, %ecx // LEN - 4
271 mov %eax, (\dst, %rsi) // Store last LEN - 4 bytes
278 cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2?
286 // Do one step of GHASH-multiplying \a by \b and storing the reduced product in
289 // .Lgfpoly constant, and \t0-\t1 must be temporary registers.
320 // GHASH-multiply \a by \b and store the reduced product in \b.
328 // GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi.
383 // registers LO, MI, and GHASH_ACC a.k.a. HI. It also zero-initializes the
391 // Both macros expect the ciphertext blocks blk[0-7] to be available at DST if
393 // powers H^i and their XOR'd-together halves to be available in the struct
394 // pointed to by KEY. Both macros clobber TMP[0-2].
403 // Load the first ciphertext block and byte-reflect it.
438 // Load the next ciphertext block and byte-reflect it.
491 // Given the expanded AES key, derive the GHASH subkey and initialize the GHASH
499 // %xmm0-%xmm1 and %rax are used as temporaries.
506 // Encrypt an all-zeroes block to get the raw hash subkey.
509 movdqa (KEY), H_POW1 // Zero-th round key XOR all-zeroes block
519 // bit-reflected values directly: reflect its bytes, then multiply it by
520 // x^-1 (using the backwards interpretation of polynomial coefficients
521 // from the GCM spec) or equivalently x^1 (using the alternative,
549 // Compute H^i = H^{i-1} * H^1.
566 // This function processes the AAD (Additional Authenticated Data) in GCM.
581 // %rax, %r10, and %xmm0-%xmm1 are used as temporary registers.
622 // Increment LE_CTR eight times to generate eight little-endian counter blocks,
623 // swap each to big-endian, and store them in AESDATA[0-7]. Also XOR them with
624 // the zero-th AES round key. Clobbers TMP0 and TMP1.
627 movdqa (KEY), TMP1 // zero-th round key
635 // Do a non-last round of AES on AESDATA[0-7] using \round_key.
642 // Do the last round of AES on AESDATA[0-7] using \round_key.
649 // XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and
664 // This macro generates a GCM encryption or decryption update function with the
674 // in-place and out-of-place en/decryption are supported.
676 // |le_ctr| must give the current counter in little-endian format. For a new
681 // 32-bit word of the counter is incremented, following the GCM standard.
691 .set DATALEN64, %r9 // Zero-extend DATALEN before using!
700 .set AESKEYLEN, %r10d // AES key length in bytes
702 .set RNDKEYLAST_PTR, %r11 // Pointer to last AES round key
704 // Put the most frequently used values in %xmm0-%xmm7 to reduce code
705 // size. (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.)
714 .set LE_CTR, %xmm7 // Little-endian counter value
734 // The main loop interleaves AES and GHASH to improve performance on
741 // Code size optimization: Prefer adding or subtracting -8*16 over 8*16,
743 add $-8*16, DATALEN
760 sub $-8*16, SRC
761 add $-8*16, DATALEN
772 // Do a round of AES, and start the GHASH update of 8 ciphertext blocks
779 // Do 7 more rounds of AES, and continue the GHASH update by doing the
790 // Do the remaining AES rounds.
799 // Do the GHASH reduction and the last round of AES.
805 // XOR the data with the AES-CTR keystream blocks.
807 sub $-8*16, DST
810 sub $-8*16, SRC
812 sub $-8*16, DST
814 add $-8*16, DATALEN
828 sub $-8*16, DST
833 sub $-8*16, DATALEN
854 // Set up a block counter %rax to contain 8*(8-n), where n is the number
873 lea -6*16(RNDKEYLAST_PTR), %rsi // Reduce code size
875 jl 128f // AES-128?
876 je 192f // AES-192?
877 // AES-256
878 aesenc -7*16(%rsi), TMP0
879 aesenc -6*16(%rsi), TMP0
881 aesenc -5*16(%rsi), TMP0
882 aesenc -4*16(%rsi), TMP0
884 .irp i, -3,-2,-1,0,1,2,3,4,5
942 // Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC. SRC is
955 // If encrypting, zero-pad the final ciphertext block for GHASH. (If
988 // which one). Both functions finish computing the GCM authentication tag by
993 // The encryption function then stores the full-length (16-byte) computed
995 // expected authentication tag (the one that was transmitted) from the 16-byte
1011 // %rax and %xmm0-%xmm2 are used as temporary registers.
1023 // Set up a counter block with 1 in the low 32-bit word. This is the
1039 // Make %rax point to the 6th from last AES round key. (Using signed
1040 // byte offsets -7*16 through 6*16 decreases code size.)
1043 // AES-encrypt the counter block and also multiply GHASH_ACC by H^1.
1044 // Interleave the AES and GHASH instructions to improve performance.
1048 jl 128f // AES-128?
1049 je 192f // AES-192?
1050 // AES-256
1051 aesenc -7*16(%rax), %xmm0
1052 aesenc -6*16(%rax), %xmm0
1054 aesenc -5*16(%rax), %xmm0
1055 aesenc -4*16(%rax), %xmm0
1058 aesenc (\i-3)*16(%rax), %xmm0