Lines matching "sub", "-", "block"
1 /* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
3 // AES-NI optimized AES-GCM for x86_64
9 //------------------------------------------------------------------------------
11 // This file is dual-licensed, meaning that you can use it under your choice of
17 // http://www.apache.org/licenses/LICENSE-2.0
49 //------------------------------------------------------------------------------
51 // This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
52 // support the original set of AES instructions, i.e. AES-NI. Two
55 // that the AVX implementation takes advantage of VEX-coded instructions in some
57 // implementation does *not* use 256-bit vectors, as AES is not supported on
58 // 256-bit vectors until the VAES feature (which this file doesn't target).
60 // The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1
62 // there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.)
64 // The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is
67 // - The vector length is fixed at 128-bit, i.e. xmm registers. This means
68 // there is only one AES block (and GHASH block) per register.
70 // - Without AVX512 / AVX10, only 16 SIMD registers are available instead of
74 // - Masking is not available either. We work around this by implementing
75 // partial block loads and stores using overlapping scalar loads and stores
78 // - The main loop is organized differently due to the different design
79 // constraints. First, with just one AES block per SIMD register, on some
81 // do an 8-register wide loop. Considering that and the fact that we have
89 // - We implement the GHASH multiplications in the main loop using Karatsuba
91 // pclmulqdq instruction per block, at the cost of one 64-bit load, one
92 // pshufd, and 0.25 pxors per block. (This is without the three-argument
98 // 0.25 pxors are cheaper than a pclmulqdq. (We assume that the 64-bit
103 // low-power side. On some of these CPUs, pclmulqdq is quite slow, and the
110 // Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%.
114 // An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be
115 // saved by using a multiplication-less reduction method. We don't do that
121 // multiply the low half of the data block by the hash key with the extra
124 // multi-block processing we use Karatsuba multiplication with a regular
125 // reduction. For single-block processing, we use the x^64 optimization.
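As a concrete illustration of the trade-off described above, here is a hedged sketch in C intrinsics (not the assembly itself) of one unreduced GHASH block multiplication done both ways: schoolbook needs four pclmulqdq per block, while Karatsuba needs three plus a pshufd-style fold and extra XORs. Function and variable names are illustrative only; in the assembly, the hash key halves are pre-XORed and stored in the key struct, so only the data block needs the on-the-fly fold.

    #include <immintrin.h>    /* needs -mpclmul (and -msse2) to compile */

    /* Schoolbook: four carryless multiplies per block.
     * lo = a_lo*b_lo, hi = a_hi*b_hi, mi = a_lo*b_hi ^ a_hi*b_lo. */
    void clmul_schoolbook(__m128i a, __m128i b,
                          __m128i *lo, __m128i *mi, __m128i *hi)
    {
        *lo = _mm_clmulepi64_si128(a, b, 0x00);
        *hi = _mm_clmulepi64_si128(a, b, 0x11);
        *mi = _mm_xor_si128(_mm_clmulepi64_si128(a, b, 0x01),
                            _mm_clmulepi64_si128(a, b, 0x10));
    }

    /* Karatsuba: three carryless multiplies per block, plus folds of the two
     * 64-bit halves and two extra XORs to recover the middle term. */
    void clmul_karatsuba(__m128i a, __m128i b,
                         __m128i *lo, __m128i *mi, __m128i *hi)
    {
        __m128i a_fold = _mm_xor_si128(a, _mm_shuffle_epi32(a, 0x4e));
        __m128i b_fold = _mm_xor_si128(b, _mm_shuffle_epi32(b, 0x4e));

        *lo = _mm_clmulepi64_si128(a, b, 0x00);
        *hi = _mm_clmulepi64_si128(a, b, 0x11);
        *mi = _mm_clmulepi64_si128(a_fold, b_fold, 0x00);
        *mi = _mm_xor_si128(*mi, _mm_xor_si128(*lo, *hi));
    }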
139 // Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of
211 sub $8, %ecx // LEN - 8
224 add $4, %ecx // LEN - 4
234 add $2, %ecx // LEN - 2
250 sub $8, %ecx // LEN - 8
258 mov %rax, (\dst, %rsi) // Store last LEN - 8 bytes
263 add $4, %ecx // LEN - 4
271 mov %eax, (\dst, %rsi) // Store last LEN - 4 bytes
278 cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2?
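The partial-block handling above relies on two tricks that are easier to see in C. The sketch below uses hypothetical helpers on little-endian x86, not the macros themselves: an overlapping-load version of the 9-to-15-byte case, and the sliding '.Lzeropad_mask' table mentioned in the header comment, which substitutes for the missing AVX512-style masking.

    #include <stdint.h>
    #include <string.h>
    #include <immintrin.h>

    /* Overlapping scalar loads for 9 <= len <= 15 bytes: read the first 8
     * bytes and the last 8 bytes (which overlap), then shift the second read
     * so the overlapping bytes are discarded.  The shift also zero-pads the
     * high bytes of the result. */
    static __m128i load_partial_block_9_to_15(const uint8_t *src, unsigned len)
    {
        uint64_t lo, hi;

        memcpy(&lo, src, 8);                 /* bytes 0..7            */
        memcpy(&hi, src + len - 8, 8);       /* bytes len-8..len-1    */
        hi >>= 8 * (16 - len);               /* drop bytes already in lo */
        return _mm_set_epi64x((long long)hi, (long long)lo);
    }

    /* Sliding mask table: an unaligned 16-byte load at offset (16 - len)
     * yields 'len' 0xff bytes followed by zeroes, so ANDing keeps only the
     * first 'len' bytes of a block. */
    static const uint8_t zeropad_mask[32] = {
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        /* the remaining 16 bytes are implicitly zero */
    };

    static __m128i zeropad_block(__m128i block, unsigned len)
    {
        const __m128i mask =
            _mm_loadu_si128((const __m128i *)&zeropad_mask[16 - len]);

        return _mm_and_si128(block, mask);
    }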
286 // Do one step of GHASH-multiplying \a by \b and storing the reduced product in
289 // .Lgfpoly constant, and \t0-\t1 must be temporary registers.
320 // GHASH-multiply \a by \b and store the reduced product in \b.
328 // GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi.
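These macros compute a GF(2^128) multiplication modulo the GHASH polynomial, split into steps so they can be interleaved with AES rounds. As a point of reference for what the pclmulqdq sequences ultimately compute, here is a spec-level C implementation of the multiplication (bit-by-bit, per NIST SP 800-38D). The assembly operates on byte-reflected values and is organized very differently, but the math is the same.

    #include <stdint.h>
    #include <string.h>

    /* Reference GHASH multiplication: out = x * y in GF(2^128) modulo
     * x^128 + x^7 + x^2 + x + 1, using GCM's bit ordering (bit 0 of the field
     * element is the most significant bit of byte 0). */
    void ghash_mul_ref(const uint8_t x[16], const uint8_t y[16], uint8_t out[16])
    {
        uint8_t z[16] = { 0 };
        uint8_t v[16];
        int i, j;

        memcpy(v, y, 16);
        for (i = 0; i < 128; i++) {
            int xbit = (x[i / 8] >> (7 - (i % 8))) & 1;
            int carry = v[15] & 1;        /* bit that falls off below */

            if (xbit)
                for (j = 0; j < 16; j++)
                    z[j] ^= v[j];

            /* v = v * x: shift toward bit 127, then reduce with
             * R = 0xe1 || 0^120 if a bit was shifted out. */
            for (j = 15; j > 0; j--)
                v[j] = (v[j] >> 1) | (uint8_t)(v[j - 1] << 7);
            v[0] >>= 1;
            if (carry)
                v[0] ^= 0xe1;
        }
        memcpy(out, z, 16);
    }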
383 // registers LO, MI, and GHASH_ACC a.k.a. HI. It also zero-initializes the
384 // inner block counter in %rax, which is a value that counts up by 8 for each
385 // block in the set of 8 and is used later to index by 8*blknum and 16*blknum.
391 // Both macros expect the ciphertext blocks blk[0-7] to be available at DST if
393 // powers H^i and their XOR'd-together halves to be available in the struct
394 // pointed to by KEY. Both macros clobber TMP[0-2].
397 // Initialize the inner block counter.
403 // Load the first ciphertext block and byte-reflect it.
411 // Add the GHASH accumulator to the ciphertext block to get the block
430 // an unreduced multiplication of the next ciphertext block by the next lowest
438 // Load the next ciphertext block and byte-reflect it.
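Roughly, the per-8-block GHASH update amounts to the following sketch, which reuses clmul_karatsuba() from the earlier example; ghash_reduce() is an assumed stand-in for the final reduction, and the assembly additionally loads the precomputed XOR'd-together halves of the H powers rather than folding them on the fly.

    #include <immintrin.h>

    __m128i ghash_reduce(__m128i lo, __m128i mi, __m128i hi);  /* assumed helper */
    void clmul_karatsuba(__m128i a, __m128i b,
                         __m128i *lo, __m128i *mi, __m128i *hi);  /* from the earlier sketch */

    /* One 8-block GHASH update: fold the accumulator into the first ciphertext
     * block, multiply block i by H^(8-i) without reducing, accumulate the
     * lo/mi/hi partial products across all 8 blocks, and reduce only once at
     * the end.  h_powers[i] holds H^(i+1); blocks[] are already byte-reflected. */
    static __m128i ghash_update_8_blocks(__m128i acc, const __m128i h_powers[8],
                                         const __m128i blocks[8])
    {
        __m128i lo = _mm_setzero_si128();
        __m128i mi = _mm_setzero_si128();
        __m128i hi = _mm_setzero_si128();
        int i;

        for (i = 0; i < 8; i++) {
            __m128i blk = blocks[i];
            __m128i t_lo, t_mi, t_hi;

            if (i == 0)
                blk = _mm_xor_si128(blk, acc);
            /* The whole sum is (acc ^ blk0)*H^8 ^ blk1*H^7 ^ ... ^ blk7*H^1. */
            clmul_karatsuba(blk, h_powers[7 - i], &t_lo, &t_mi, &t_hi);
            lo = _mm_xor_si128(lo, t_lo);
            mi = _mm_xor_si128(mi, t_mi);
            hi = _mm_xor_si128(hi, t_hi);
        }
        return ghash_reduce(lo, mi, hi);
    }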
499 // %xmm0-%xmm1 and %rax are used as temporaries.
506 // Encrypt an all-zeroes block to get the raw hash subkey.
509 movdqa (KEY), H_POW1 // Zero-th round key XOR all-zeroes block
519 // bit-reflected values directly: reflect its bytes, then multiply it by
520 // x^-1 (using the backwards interpretation of polynomial coefficients
549 // Compute H^i = H^{i-1} * H^1.
557 sub $8, %eax
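Put together, the subkey setup is conceptually the following hedged C sketch (not the assembly): AES-encrypt the all-zeroes block to get H, then chain multiplications for the higher powers. The byte reflection and the multiplication by x^-1 that the assembly applies for its bit-reflected representation are omitted here; ghash_mul_ref() is the reference routine sketched earlier, and the parameter names are illustrative.

    #include <stdint.h>
    #include <immintrin.h>

    void ghash_mul_ref(const uint8_t x[16], const uint8_t y[16], uint8_t out[16]);

    /* round_keys[0..nrounds] are the expanded AES round keys (nrounds is 10,
     * 12, or 14). */
    static void precompute_h_powers(const __m128i *round_keys, int nrounds,
                                    uint8_t h_powers[8][16])
    {
        /* Encrypting the all-zeroes block starts from round key 0 itself,
         * since XORing zeroes with the zero-th round key is a no-op. */
        __m128i h = round_keys[0];
        int i;

        for (i = 1; i < nrounds; i++)
            h = _mm_aesenc_si128(h, round_keys[i]);
        h = _mm_aesenclast_si128(h, round_keys[nrounds]);
        _mm_storeu_si128((__m128i *)h_powers[0], h);          /* H^1 */

        for (i = 1; i < 8; i++)                               /* H^(i+1) = H^i * H^1 */
            ghash_mul_ref(h_powers[i - 1], h_powers[0], h_powers[i]);
    }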
581 // %rax, %r10, and %xmm0-%xmm1 are used as temporary registers.
594 // Process the AAD one full block at a time.
595 sub $16, AADLEN
603 sub $16, AADLEN
606 // Check whether there is a partial block at the end.
610 // Process a partial block of length 1 <= AADLEN <= 15.
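In spec-level terms, the AAD pass does the following (a plain-C sketch with illustrative names; ghash_mul_ref() is the reference routine from above, and 'h' here is the single hash subkey rather than the precomputed powers the assembly actually uses):

    #include <stdint.h>
    #include <string.h>

    void ghash_mul_ref(const uint8_t x[16], const uint8_t y[16], uint8_t out[16]);

    static void ghash_update_aad(uint8_t acc[16], const uint8_t h[16],
                                 const uint8_t *aad, size_t aadlen)
    {
        size_t i;

        /* Full 16-byte blocks. */
        while (aadlen >= 16) {
            for (i = 0; i < 16; i++)
                acc[i] ^= aad[i];
            ghash_mul_ref(acc, h, acc);
            aad += 16;
            aadlen -= 16;
        }

        /* Partial block of length 1 <= aadlen <= 15: zero-pad it first. */
        if (aadlen) {
            uint8_t block[16] = { 0 };

            memcpy(block, aad, aadlen);
            for (i = 0; i < 16; i++)
                acc[i] ^= block[i];
            ghash_mul_ref(acc, h, acc);
        }
    }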
622 // Increment LE_CTR eight times to generate eight little-endian counter blocks,
623 // swap each to big-endian, and store them in AESDATA[0-7]. Also XOR them with
624 // the zero-th AES round key. Clobbers TMP0 and TMP1.
627 movdqa (KEY), TMP1 // zero-th round key
635 // Do a non-last round of AES on AESDATA[0-7] using \round_key.
642 // Do the last round of AES on AESDATA[0-7] using \round_key.
649 // XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and
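The three macros above map roughly onto the following C-intrinsics sketch of one 8-block AES-CTR step (illustrative names; the real code splits these stages across macros so the GHASH work can be interleaved between them):

    #include <immintrin.h>

    static void aes_ctr_8_blocks(const __m128i *round_keys, int nrounds,
                                 __m128i *le_ctr, const __m128i *src, __m128i *dst)
    {
        /* pshufb control that reverses the 16 bytes of a register. */
        const __m128i bswap_mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                                                8, 9, 10, 11, 12, 13, 14, 15);
        const __m128i one = _mm_set_epi32(0, 0, 0, 1);
        __m128i data[8];
        int i, r;

        for (i = 0; i < 8; i++) {
            /* Byte-swap the little-endian counter to big-endian and XOR in
             * round key 0, then increment only the low 32-bit word. */
            data[i] = _mm_xor_si128(_mm_shuffle_epi8(*le_ctr, bswap_mask),
                                    round_keys[0]);
            *le_ctr = _mm_add_epi32(*le_ctr, one);
        }
        for (r = 1; r < nrounds; r++)         /* non-last AES rounds */
            for (i = 0; i < 8; i++)
                data[i] = _mm_aesenc_si128(data[i], round_keys[r]);
        for (i = 0; i < 8; i++) {             /* last round, then XOR with SRC */
            data[i] = _mm_aesenclast_si128(data[i], round_keys[nrounds]);
            dst[i] = _mm_xor_si128(_mm_loadu_si128(&src[i]), data[i]);
        }
    }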
674 // in-place and out-of-place en/decryption are supported.
676 // |le_ctr| must give the current counter in little-endian format. For a new
681 // 32-bit word of the counter is incremented, following the GCM standard.
691 .set DATALEN64, %r9 // Zero-extend DATALEN before using!
704 // Put the most frequently used values in %xmm0-%xmm7 to reduce code
705 // size. (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.)
714 .set LE_CTR, %xmm7 // Little-endian counter value
741 // Code size optimization: Prefer adding or subtracting -8*16 over 8*16,
743 add $-8*16, DATALEN
760 sub $-8*16, SRC
761 add $-8*16, DATALEN
773 // by doing the unreduced multiplication for the first ciphertext block.
805 // XOR the data with the AES-CTR keystream blocks.
807 sub $-8*16, DST
810 sub $-8*16, SRC
812 sub $-8*16, DST
814 add $-8*16, DATALEN
828 sub $-8*16, DST
833 sub $-8*16, DATALEN
837 // things simple and keep the code size down by just going one block at
854 // Set up a block counter %rax to contain 8*(8-n), where n is the number
855 // of blocks that remain, counting any partial block. This will be used
863 sub $16, DATALEN
866 // Process the data one full block at a time.
869 // Encrypt the next counter block.
873 lea -6*16(RNDKEYLAST_PTR), %rsi // Reduce code size
875 jl 128f // AES-128?
876 je 192f // AES-192?
877 // AES-256
878 aesenc -7*16(%rsi), TMP0
879 aesenc -6*16(%rsi), TMP0
881 aesenc -5*16(%rsi), TMP0
882 aesenc -4*16(%rsi), TMP0
884 .irp i, -3,-2,-1,0,1,2,3,4,5
893 // XOR the keystream block that was just generated in TMP0 with the next
894 // source data block and store the resulting en/decrypted data to DST.
904 // Update GHASH with the ciphertext block.
918 sub $16, DATALEN
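The jl/je dispatch above avoids separate code paths per key size by addressing the round keys relative to the last one; conceptually (a C-intrinsics sketch, not the assembly):

    #include <immintrin.h>

    /* Encrypt one block, where nrounds is 10 (AES-128), 12 (AES-192), or 14
     * (AES-256).  Indexing relative to the last round key lets the common tail
     * serve all three key sizes; the larger keys just run a few extra rounds
     * before falling into it, mirroring the jl/je jumps above. */
    static __m128i aes_encrypt_block(const __m128i *round_keys, int nrounds,
                                     __m128i block)
    {
        const __m128i *rk_last = &round_keys[nrounds];
        int i;

        block = _mm_xor_si128(block, round_keys[0]);
        if (nrounds >= 12) {
            if (nrounds == 14) {                        /* AES-256 only */
                block = _mm_aesenc_si128(block, rk_last[-13]);
                block = _mm_aesenc_si128(block, rk_last[-12]);
            }
            block = _mm_aesenc_si128(block, rk_last[-11]);  /* AES-192/256 */
            block = _mm_aesenc_si128(block, rk_last[-10]);
        }
        for (i = -9; i < 0; i++)                        /* common 9-round tail */
            block = _mm_aesenc_si128(block, rk_last[i]);
        return _mm_aesenclast_si128(block, rk_last[0]);
    }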
921 // Check whether there is a partial block at the end.
925 // Process a partial block of length 1 <= DATALEN <= 15.
927 // Encrypt a counter block for the last time.
942 // Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC. SRC is
949 // XOR the keystream block that was just generated in LE_CTR with the
950 // source data block and store the resulting en/decrypted data to DST.
955 // If encrypting, zero-pad the final ciphertext block for GHASH. (If
959 sub DATALEN64, %rax
963 // Update GHASH with the final ciphertext block.
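In plain C, the partial-block finish looks roughly like this (a hypothetical helper; 'keystream' is the final encrypted counter block, and ghash_mul_ref() is the reference routine sketched earlier). The key point is that GHASH always absorbs the zero-padded ciphertext: the just-computed output when encrypting, the zero-padded source when decrypting.

    #include <stdint.h>
    #include <string.h>

    void ghash_mul_ref(const uint8_t x[16], const uint8_t y[16], uint8_t out[16]);

    static void gcm_finish_partial_block(const uint8_t keystream[16],
                                         const uint8_t *src, uint8_t *dst,
                                         size_t len /* 1..15 */, int enc,
                                         uint8_t ghash_acc[16], const uint8_t h[16])
    {
        uint8_t padded_src[16] = { 0 };
        uint8_t out[16];
        size_t i;

        memcpy(padded_src, src, len);        /* load and zero-pad the source */
        for (i = 0; i < 16; i++)
            out[i] = padded_src[i] ^ keystream[i];
        memcpy(dst, out, len);               /* store only 'len' output bytes */

        /* GHASH absorbs the zero-padded ciphertext block: 'out' zero-padded
         * when encrypting, the already zero-padded source when decrypting. */
        for (i = 0; i < len; i++)
            ghash_acc[i] ^= enc ? out[i] : padded_src[i];
        ghash_mul_ref(ghash_acc, h, ghash_acc);
    }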
989 // updating GHASH with the lengths block and encrypting the GHASH accumulator.
993 // The encryption function then stores the full-length (16-byte) computed
995 // expected authentication tag (the one that was transmitted) from the 16-byte
1011 // %rax and %xmm0-%xmm2 are used as temporary registers.
1023 // Set up a counter block with 1 in the low 32-bit word. This is the
1029 // Build the lengths block and XOR it into the GHASH accumulator.
1040 // byte offsets -7*16 through 6*16 decreases code size.)
1043 // AES-encrypt the counter block and also multiply GHASH_ACC by H^1.
1048 jl 128f // AES-128?
1049 je 192f // AES-192?
1050 // AES-256
1051 aesenc -7*16(%rax), %xmm0
1052 aesenc -6*16(%rax), %xmm0
1054 aesenc -5*16(%rax), %xmm0
1055 aesenc -4*16(%rax), %xmm0
1058 aesenc (\i-3)*16(%rax), %xmm0
1082 sub TAGLEN64, ZEROPAD_MASK_PTR
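Finalization, in spec-level C (a sketch with illustrative names; ghash_mul_ref() is the reference routine from above, and j0_keystream is the encryption of the counter block whose low 32-bit word is 1): the GHASH accumulator absorbs a lengths block holding the AAD and data lengths in bits as big-endian 64-bit values, and the result is XORed with that keystream block to produce the tag. On decryption the assembly then compares the first taglen bytes against the transmitted tag in constant time, using the same zero-pad mask table to limit the comparison.

    #include <stdint.h>

    void ghash_mul_ref(const uint8_t x[16], const uint8_t y[16], uint8_t out[16]);

    static void gcm_compute_tag(uint64_t aadlen, uint64_t datalen, /* in bytes */
                                const uint8_t j0_keystream[16],
                                const uint8_t h[16],
                                uint8_t ghash_acc[16], uint8_t tag[16])
    {
        uint64_t aad_bits = aadlen * 8, data_bits = datalen * 8;
        uint8_t lengths[16];
        int i;

        for (i = 0; i < 8; i++) {            /* big-endian 64-bit lengths */
            lengths[i] = (uint8_t)(aad_bits >> (56 - 8 * i));
            lengths[8 + i] = (uint8_t)(data_bits >> (56 - 8 * i));
        }
        for (i = 0; i < 16; i++)
            ghash_acc[i] ^= lengths[i];
        ghash_mul_ref(ghash_acc, h, ghash_acc);   /* absorb the lengths block */

        for (i = 0; i < 16; i++)             /* tag = E_K(J0) ^ GHASH result */
            tag[i] = ghash_acc[i] ^ j0_keystream[i];
    }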