Lines Matching +full:sub +full:- +full:block

1 /* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
3 // AES-NI optimized AES-GCM for x86_64
9 //------------------------------------------------------------------------------
11 // This file is dual-licensed, meaning that you can use it under your choice of
17 // http://www.apache.org/licenses/LICENSE-2.0
49 //------------------------------------------------------------------------------
51 // This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
52 // support the original set of AES instructions, i.e. AES-NI. Two
55 // that the AVX implementation takes advantage of VEX-coded instructions in some
57 // implementation does *not* use 256-bit vectors, as AES is not supported on
58 // 256-bit vectors until the VAES feature (which this file doesn't target).
60 // The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1
62 // there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.)
64 // The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is
67 // - The vector length is fixed at 128-bit, i.e. xmm registers. This means
68 // there is only one AES block (and GHASH block) per register.
70 // - Without AVX512 / AVX10, only 16 SIMD registers are available instead of
74 // - Masking is not available either. We work around this by implementing
75 // partial block loads and stores using overlapping scalar loads and stores
78 // - The main loop is organized differently due to the different design
79 // constraints. First, with just one AES block per SIMD register, on some
81 // do an 8-register wide loop. Considering that and the fact that we have
89 // - We implement the GHASH multiplications in the main loop using Karatsuba
91 // pclmulqdq instruction per block, at the cost of one 64-bit load, one
92 // pshufd, and 0.25 pxors per block. (This is without the three-argument
98 // 0.25 pxors are cheaper than a pclmulqdq. (We assume that the 64-bit
103 // low-power side. On some of these CPUs, pclmulqdq is quite slow, and the
110 // Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%.
114 // An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be
115 // saved by using a multiplication-less reduction method. We don't do that
121 // multiply the low half of the data block by the hash key with the extra
124 // multi-block processing we use Karatsuba multiplication with a regular
125 // reduction. For single-block processing, we use the x^64 optimization.
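For reference, the Karatsuba identity that the notes above rely on can be sketched in C. clmul64() below is a bit-serial stand-in for a single pclmulqdq; the function and variable names are illustrative only and are not taken from the kernel sources.

#include <stdint.h>

/* Carryless (GF(2)) multiply of two 64-bit values, returning the 128-bit
 * product as hi:lo.  One pclmulqdq instruction computes exactly this. */
static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
        uint64_t h = 0, l = 0;

        for (int i = 0; i < 64; i++) {
                if ((b >> i) & 1) {
                        l ^= a << i;
                        if (i)
                                h ^= a >> (64 - i);
                }
        }
        *hi = h;
        *lo = l;
}

/* Karatsuba multiplication of two 128-bit values a = a[1]:a[0] and
 * b = b[1]:b[0]: three 64x64 carryless multiplies instead of the four that
 * schoolbook multiplication needs.  The full 256-bit product is
 * lo ^ (mi << 64) ^ (hi << 128), with all additions being XOR. */
static void clmul128_karatsuba(const uint64_t a[2], const uint64_t b[2],
                               uint64_t lo[2], uint64_t mi[2], uint64_t hi[2])
{
        clmul64(a[0], b[0], &lo[1], &lo[0]);    /* low halves */
        clmul64(a[1], b[1], &hi[1], &hi[0]);    /* high halves */
        clmul64(a[0] ^ a[1], b[0] ^ b[1], &mi[1], &mi[0]);
        mi[0] ^= lo[0] ^ hi[0];                 /* recover the true middle term */
        mi[1] ^= lo[1] ^ hi[1];
}

As the notes above describe, the assembly precomputes the XOR'd-together halves of each hash key power, so the (b[0] ^ b[1]) factor becomes a plain 64-bit load and only the data block's halves need to be XOR'd at run time; the fold that recovers the middle term can also be deferred and amortized across a set of blocks, which is presumably where the quoted 0.25 pxors per block comes from.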
139 // Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of
211 sub $8, %ecx // LEN - 8
224 add $4, %ecx // LEN - 4
234 add $2, %ecx // LEN - 2
250 sub $8, %ecx // LEN - 8
258 mov %rax, (\dst, %rsi) // Store last LEN - 8 bytes
263 add $4, %ecx // LEN - 4
271 mov %eax, (\dst, %rsi) // Store last LEN - 4 bytes
278 cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2?
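The _load_partial_block / _store_partial_block lines above implement the work-around described in the design notes: without AVX512-style masking, a 1-15 byte head or tail is handled with overlapping scalar loads and stores that never touch memory outside the buffer. A C sketch of the load side (hypothetical helper name; assumes a little-endian host, like the x86 code it mirrors):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Load 1 <= len <= 16 bytes from src into a zero-padded 16-byte block using
 * at most two scalar loads whose ranges may overlap but stay inside
 * [src, src + len). */
static void load_partial_block(const uint8_t *src, size_t len, uint8_t out[16])
{
        uint64_t first = 0, last = 0;

        memset(out, 0, 16);
        if (len > 8) {
                /* 9 <= len <= 16: first 8 bytes, plus the last 8 bytes with
                 * the 16 - len overlapping bytes shifted out. */
                memcpy(&first, src, 8);
                memcpy(&last, src + len - 8, 8);
                last >>= 8 * (16 - len);
                memcpy(out, &first, 8);
                memcpy(out + 8, &last, 8);
        } else if (len >= 4) {
                /* 4 <= len <= 8: first 4 bytes OR'd with the last 4 bytes
                 * shifted into place (any overlapping bytes are identical). */
                uint32_t a, b;

                memcpy(&a, src, 4);
                memcpy(&b, src + len - 4, 4);
                first = a | ((uint64_t)b << (8 * (len - 4)));
                memcpy(out, &first, 8);
        } else {
                /* 1 <= len <= 3: one byte load plus an optional 2-byte load. */
                first = src[0];
                if (len >= 2) {
                        uint16_t w;

                        memcpy(&w, src + len - 2, 2);
                        first |= (uint64_t)w << (8 * (len - 2));
                }
                memcpy(out, &first, 8);
        }
}

The store direction works the same way in reverse: the value is rotated so that the "last LEN - 8" (or "last LEN - 4") bytes can be written with one overlapping scalar store before the first bytes are stored, as the comments on the fragments above indicate.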
286 // Do one step of GHASH-multiplying \a by \b and storing the reduced product in
289 // .Lgfpoly constant, and \t0-\t1 must be temporary registers.
320 // GHASH-multiply \a by \b and store the reduced product in \b.
328 // GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi.
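The _ghash_mul* macros referenced above operate on byte-reflected values with pclmulqdq, but the function they compute is ordinary multiplication in GF(2^128) as defined by the GCM spec. As a reference point, here is the textbook bit-serial algorithm (in the style of NIST SP 800-38D) in C; it uses the spec's byte and bit ordering rather than the reflected representation the assembly works in, and all names are illustrative:

#include <stdint.h>

static uint64_t get_be64(const uint8_t *p)
{
        uint64_t v = 0;

        for (int i = 0; i < 8; i++)
                v = (v << 8) | p[i];
        return v;
}

static void put_be64(uint8_t *p, uint64_t v)
{
        for (int i = 0; i < 8; i++)
                p[i] = (uint8_t)(v >> (56 - 8 * i));
}

/* Multiply two GHASH field elements (16-byte blocks) in GF(2^128). */
static void gf128_mul(const uint8_t x[16], const uint8_t y[16], uint8_t out[16])
{
        uint64_t xh = get_be64(x), xl = get_be64(x + 8);
        uint64_t vh = get_be64(y), vl = get_be64(y + 8);
        uint64_t zh = 0, zl = 0;

        for (int i = 0; i < 128; i++) {
                /* Bit i of x, counting from the MSB of byte 0. */
                uint64_t bit = (i < 64 ? xh >> (63 - i) : xl >> (127 - i)) & 1;

                zh ^= vh & -bit;
                zl ^= vl & -bit;

                /* V = V * x: shift right one bit and reduce by the GCM
                 * polynomial (0xE1 || 0...0) if a bit falls off the end. */
                uint64_t carry = vl & 1;

                vl = (vl >> 1) | (vh << 63);
                vh >>= 1;
                vh ^= 0xE100000000000000ULL & -carry;
        }
        put_be64(out, zh);
        put_be64(out + 8, zl);
}

The pclmulqdq-based macros get the same result with a handful of carryless multiplies plus a reduction involving the .Lgfpoly constant, which is far faster; the bit-serial form is only meant as a specification of what they compute.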
383 // registers LO, MI, and GHASH_ACC a.k.a. HI. It also zero-initializes the
384 // inner block counter in %rax, which is a value that counts up by 8 for each
385 // block in the set of 8 and is used later to index by 8*blknum and 16*blknum.
391 // Both macros expect the ciphertext blocks blk[0-7] to be available at DST if
393 // powers H^i and their XOR'd-together halves to be available in the struct
394 // pointed to by KEY. Both macros clobber TMP[0-2].
397 // Initialize the inner block counter.
403 // Load the first ciphertext block and byte-reflect it.
411 // Add the GHASH accumulator to the ciphertext block to get the block
430 // an unreduced multiplication of the next ciphertext block by the next lowest
438 // Load the next ciphertext block and byte-reflect it.
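The begin/continue pair of macros described above relies on the fact that GHASH is a polynomial evaluation, so eight serial accumulator updates can be restructured into eight independent multiplications by precomputed key powers: (((acc ^ c0)*H ^ c1)*H ... ^ c7)*H equals (acc ^ c0)*H^8 ^ c1*H^7 ^ ... ^ c7*H^1. A C sketch of that identity, assuming a gf128_mul() helper like the textbook one sketched earlier (declared, not defined, here) and reducing each per-block product for simplicity, whereas the assembly accumulates them unreduced in LO/MI/HI and reduces once at the end:

#include <stdint.h>
#include <string.h>

void gf128_mul(const uint8_t x[16], const uint8_t y[16], uint8_t out[16]);

/* One 8-block GHASH update: hpow[0] = H^8, ..., hpow[7] = H^1. */
static void ghash_update_8x(uint8_t acc[16], const uint8_t blk[8][16],
                            const uint8_t hpow[8][16])
{
        uint8_t sum[16] = { 0 };

        for (int i = 0; i < 8; i++) {
                uint8_t t[16], prod[16];

                memcpy(t, blk[i], 16);
                if (i == 0)                     /* fold the accumulator into blk[0] */
                        for (int j = 0; j < 16; j++)
                                t[j] ^= acc[j];
                gf128_mul(t, hpow[i], prod);
                for (int j = 0; j < 16; j++)
                        sum[j] ^= prod[j];
        }
        memcpy(acc, sum, 16);
}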
499 // %xmm0-%xmm1 and %rax are used as temporaries.
506 // Encrypt an all-zeroes block to get the raw hash subkey.
509 movdqa (KEY), H_POW1 // Zero-th round key XOR all-zeroes block
519 // bit-reflected values directly: reflect its bytes, then multiply it by
520 // x^-1 (using the backwards interpretation of polynomial coefficients
549 // Compute H^i = H^{i-1} * H^1.
557 sub $8, %eax
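The precompute step above derives the raw hash subkey by encrypting an all-zeroes block, converts it to the byte-reflected form, and then builds the table of key powers with H^i = H^{i-1} * H^1. A C sketch of the power table (again assuming the gf128_mul() reference helper; the ordering and layout here are illustrative, not the kernel struct's actual layout):

#include <stdint.h>
#include <string.h>

void gf128_mul(const uint8_t x[16], const uint8_t y[16], uint8_t out[16]);

/* Fill hpow[0] = H^8, ..., hpow[7] = H^1, plus the XOR of the two 64-bit
 * halves of each power, which feeds the Karatsuba middle multiply. */
static void precompute_powers(const uint8_t h[16], uint8_t hpow[8][16],
                              uint64_t hxor[8])
{
        memcpy(hpow[7], h, 16);
        for (int i = 6; i >= 0; i--)
                gf128_mul(hpow[i + 1], h, hpow[i]);     /* H^i = H^{i-1} * H^1 */

        for (int i = 0; i < 8; i++) {
                uint64_t lo, hi;

                memcpy(&lo, hpow[i], 8);
                memcpy(&hi, hpow[i] + 8, 8);
                hxor[i] = lo ^ hi;              /* the "XOR'd-together halves" */
        }
}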
581 // %rax, %r10, and %xmm0-%xmm1 are used as temporary registers.
594 // Process the AAD one full block at a time.
595 sub $16, AADLEN
603 sub $16, AADLEN
606 // Check whether there is a partial block at the end.
610 // Process a partial block of length 1 <= AADLEN <= 15.
622 // Increment LE_CTR eight times to generate eight little-endian counter blocks,
623 // swap each to big-endian, and store them in AESDATA[0-7]. Also XOR them with
624 // the zero-th AES round key. Clobbers TMP0 and TMP1.
627 movdqa (KEY), TMP1 // zero-th round key
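The counter-block macro above builds the eight AES inputs for a main-loop iteration: byte-reflect the little-endian counter to get the big-endian GCM counter block, XOR it with the zero-th round key (the initial AddRoundKey, done here so the round loop can start at round 1), and bump only the low 32-bit word of the counter, which is what the GCM standard specifies (see the function comment further down). A C sketch with illustrative names; it assumes a little-endian host, like the x86 code it mirrors:

#include <stdint.h>
#include <string.h>

/* Generate 8 counter blocks, already XOR'd with AES round key 0.
 * le_ctr[] holds the byte-reflected counter; le_ctr[0] is the 32-bit word
 * that GCM increments. */
static void ctr_begin_8x(uint32_t le_ctr[4], const uint8_t round_key0[16],
                         uint8_t aesdata[8][16])
{
        for (int i = 0; i < 8; i++) {
                uint8_t le_bytes[16];

                memcpy(le_bytes, le_ctr, 16);
                for (int j = 0; j < 16; j++)    /* byte-reflect, then whiten */
                        aesdata[i][j] = le_bytes[15 - j] ^ round_key0[j];
                le_ctr[0]++;    /* wraps within 32 bits, per the GCM standard */
        }
}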
635 // Do a non-last round of AES on AESDATA[0-7] using \round_key.
642 // Do the last round of AES on AESDATA[0-7] using \round_key.
649 // XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and
674 // in-place and out-of-place en/decryption are supported.
676 // |le_ctr| must give the current counter in little-endian format. For a new
681 // 32-bit word of the counter is incremented, following the GCM standard.
691 .set DATALEN64, %r9 // Zero-extend DATALEN before using!
704 // Put the most frequently used values in %xmm0-%xmm7 to reduce code
705 // size. (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.)
714 .set LE_CTR, %xmm7 // Little-endian counter value
741 // Code size optimization: Prefer adding or subtracting -8*16 over 8*16,
743 add $-8*16, DATALEN
760 sub $-8*16, SRC
761 add $-8*16, DATALEN
773 // by doing the unreduced multiplication for the first ciphertext block.
805 // XOR the data with the AES-CTR keystream blocks.
807 sub $-8*16, DST
810 sub $-8*16, SRC
812 sub $-8*16, DST
814 add $-8*16, DATALEN
828 sub $-8*16, DST
833 sub $-8*16, DATALEN
837 // things simple and keep the code size down by just going one block at
854 // Set up a block counter %rax to contain 8*(8-n), where n is the number
855 // of blocks that remain, counting any partial block. This will be used
863 sub $16, DATALEN
866 // Process the data one full block at a time.
869 // Encrypt the next counter block.
873 lea -6*16(RNDKEYLAST_PTR), %rsi // Reduce code size
875 jl 128f // AES-128?
876 je 192f // AES-192?
877 // AES-256
878 aesenc -7*16(%rsi), TMP0
879 aesenc -6*16(%rsi), TMP0
881 aesenc -5*16(%rsi), TMP0
882 aesenc -4*16(%rsi), TMP0
884 .irp i, -3,-2,-1,0,1,2,3,4,5
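The branch ladder above jumps into the round sequence at a different point depending on key length, since AES-128, AES-192, and AES-256 use 10, 12, and 14 rounds respectively: the AES-256-only rounds come first, then the AES-192 ones, then the rounds all key sizes share. An equivalent written with AES-NI intrinsics (compile with -maes; hypothetical helper, not the kernel's interface):

#include <immintrin.h>

/* round_keys[0] is the whitening key and round_keys[nrounds] the last-round
 * key; nrounds is 10, 12, or 14. */
static __m128i aes_encrypt_block(__m128i block, const __m128i *round_keys,
                                 int nrounds)
{
        block = _mm_xor_si128(block, round_keys[0]);              /* round 0 */
        for (int i = 1; i < nrounds; i++)
                block = _mm_aesenc_si128(block, round_keys[i]);   /* rounds 1..n-1 */
        return _mm_aesenclast_si128(block, round_keys[nrounds]);  /* last round */
}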
893 // XOR the keystream block that was just generated in TMP0 with the next
894 // source data block and store the resulting en/decrypted data to DST.
904 // Update GHASH with the ciphertext block.
918 sub $16, DATALEN
921 // Check whether there is a partial block at the end.
925 // Process a partial block of length 1 <= DATALEN <= 15.
927 // Encrypt a counter block for the last time.
942 // Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC. SRC is
949 // XOR the keystream block that was just generated in LE_CTR with the
950 // source data block and store the resulting en/decrypted data to DST.
955 // If encrypting, zero-pad the final ciphertext block for GHASH. (If
959 sub DATALEN64, %rax
963 // Update GHASH with the final ciphertext block.
989 // updating GHASH with the lengths block and encrypting the GHASH accumulator.
993 // The encryption function then stores the full-length (16-byte) computed
995 // expected authentication tag (the one that was transmitted) from the 16-byte
1011 // %rax and %xmm0-%xmm2 are used as temporary registers.
1023 // Set up a counter block with 1 in the low 32-bit word. This is the
1029 // Build the lengths block and XOR it into the GHASH accumulator.
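GCM finishes GHASH with a "lengths block": the AAD length and the en/decrypted data length, each expressed in bits as a 64-bit value. In the spec's byte order that block is big-endian aadlen || datalen; since this implementation keeps GHASH values byte-reflected, the assembly builds the byte-swapped equivalent before XOR'ing it into the accumulator. A C sketch of the spec-order block (illustrative helper name):

#include <stdint.h>

/* Build len(AAD) || len(C), both in bits, as big-endian 64-bit values. */
static void build_lengths_block(uint64_t aadlen_bytes, uint64_t datalen_bytes,
                                uint8_t out[16])
{
        uint64_t aad_bits = aadlen_bytes * 8;
        uint64_t data_bits = datalen_bytes * 8;

        for (int i = 0; i < 8; i++) {
                out[i] = (uint8_t)(aad_bits >> (56 - 8 * i));
                out[8 + i] = (uint8_t)(data_bits >> (56 - 8 * i));
        }
}

After this block is folded in and the accumulator is multiplied by H one last time, the GHASH accumulator is encrypted using the counter block whose low 32-bit word is 1 (set up a few lines earlier) to produce the authentication tag.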
1040 // byte offsets -7*16 through 6*16 decreases code size.)
1043 // AES-encrypt the counter block and also multiply GHASH_ACC by H^1.
1048 jl 128f // AES-128?
1049 je 192f // AES-192?
1050 // AES-256
1051 aesenc -7*16(%rax), %xmm0
1052 aesenc -6*16(%rax), %xmm0
1054 aesenc -5*16(%rax), %xmm0
1055 aesenc -4*16(%rax), %xmm0
1058 aesenc (\i-3)*16(%rax), %xmm0
1082 sub TAGLEN64, ZEROPAD_MASK_PTR
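The ZEROPAD_MASK_PTR arithmetic above is the trick described in the note near the top of the file (the .Lzeropad_mask constant): a 32-byte table of sixteen 0xff bytes followed by sixteen 0x00 bytes, so that a 16-byte load at offset 16 - len yields len one-bytes followed by 16 - len zero bytes. Here the sub presumably adjusts a pointer initialized to .Lzeropad_mask + 16 down by TAGLEN, giving a mask that truncates a 16-byte value to TAGLEN bytes without a branch per length; the same kind of mask can zero-pad a partial final block for GHASH. A C sketch of the idea (names illustrative):

#include <stddef.h>
#include <stdint.h>

static const uint8_t zeropad_mask[32] = {
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        /* the remaining 16 bytes are implicitly zero */
};

/* Keep the first len bytes of block and zero the rest, 0 <= len <= 16. */
static void mask_to_len(uint8_t block[16], size_t len)
{
        const uint8_t *mask = &zeropad_mask[16 - len];

        for (int i = 0; i < 16; i++)
                block[i] &= mask[i];
}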