Lines Matching +full:sub +full:- +full:block
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
3 * AES-XTS for modern x86_64 CPUs
11 * This file implements AES-XTS for modern x86_64 CPUs. To handle the
16 * AES-NI + AVX
17 * - 128-bit vectors (1 AES block per vector)
18 * - VEX-coded instructions
19 * - xmm0-xmm15
20 * - This is for older CPUs that lack VAES but do have AVX.
23 * - 256-bit vectors (2 AES blocks per vector)
24 * - VEX-coded instructions
25 * - ymm0-ymm15
26 * - This is for CPUs that have VAES but lack AVX512 or AVX10,
30 * - 256-bit vectors (2 AES blocks per vector)
31 * - EVEX-coded instructions
32 * - ymm0-ymm31
33 * - This is for CPUs that have AVX512 but where using zmm registers causes
35 * - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256.
36 * To avoid confusion with 512-bit, we just write AVX10/256.
39 * - Same as the previous one, but upgrades to 512-bit vectors
40 * (4 AES blocks per vector) in zmm0-zmm31.
41 * - This is for CPUs that have good AVX512 or AVX10/512 support.
43 * This file doesn't have an implementation for AES-NI alone (without AVX), as
51 * The AES-XTS implementations in this file support everything required by the
52 * crypto API, including support for arbitrary input lengths and multi-part
54 * power-of-2 length inputs that are processed in a single part (disk sectors).
73 // on CPUs that don't support AVX10-style masking.
85 // advanced to point to 7th-from-last round key
97 // %r9-r11 are available as temporaries.
112 // Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers
150 // V0-V3 hold the data blocks during the main loop, or temporary values
151 // otherwise. V4-V5 hold temporary values.
153 // V6-V9 hold XTS tweaks. Each 128-bit lane holds one tweak.
161 // V10-V13 are used for computing the next values of TWEAK[0-3].
167 // V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes.
171 // V15 holds the key for AES "round 0", copied to all 128-bit lanes.
175 // If 32 SIMD registers are available, then V16-V29 hold the remaining
176 // AES round keys, copied to all 128-bit lanes.
178 // AES-128, AES-192, and AES-256 use different numbers of round keys.
180 // keys to the *end* of this register range. I.e., AES-128 uses
181 // KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
182 // (All also use KEY0 for the XOR-only "round" at the beginning.)
213 // V30-V31 are currently unused.
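
The round-key register mapping described above (KEY5-KEY14 for AES-128, KEY3-KEY14 for AES-192, KEY1-KEY14 for AES-256) follows directly from the AES round count. A minimal C sketch of that arithmetic, not from the file, assuming the standard rule nrounds = keylen/4 + 6:

#include <stdio.h>

/* Round keys are packed toward the *end* of the KEY1..KEY14 alias range,
 * so the first register used is KEY(15 - nrounds); KEY0 always holds the
 * key for the XOR-only "round 0". */
int main(void)
{
    for (int keylen = 16; keylen <= 32; keylen += 8) {
        int nrounds = keylen / 4 + 6;   /* 10, 12, or 14 */
        printf("AES-%d: KEY%d-KEY14 (plus KEY0)\n", keylen * 8, 15 - nrounds);
    }
    return 0;
}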
225 // Broadcast a 128-bit value into a vector.
248 // vpternlogd with immediate 0x96 is a three-argument XOR.
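
This is easy to confirm: vpternlogd applies an arbitrary 3-input boolean function given as an 8-bit truth table, indexed per bit position by the three operand bits. A self-contained C check (not from the file) that table 0x96 is exactly a ^ b ^ c:

#include <assert.h>

int main(void)
{
    /* Per bit, vpternlogd yields ((imm8 >> ((a << 2) | (b << 1) | c)) & 1). */
    for (int a = 0; a <= 1; a++)
        for (int b = 0; b <= 1; b++)
            for (int c = 0; c <= 1; c++)
                assert(((0x96 >> ((a << 2) | (b << 1) | c)) & 1) == (a ^ b ^ c));
    return 0;
}

Since XOR is symmetric, the operand-to-index ordering does not matter for this particular table.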
256 // Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak
276 vpsrlq $64 - VL/16, \src, \tmp1
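
For reference, the underlying operation is the standard XTS tweak update: multiply the 128-bit tweak by the polynomial x in GF(2^128), reducing modulo x^128 + x^7 + x^2 + x + 1 (the 0x87 constant kept in .Lgf_poly). A plain C sketch of the single-step form, with the tweak held as two little-endian 64-bit halves; the vectorized paths here multiply by x^(VL/16) instead, so each 128-bit lane advances by a whole vector's worth of blocks:

#include <stdint.h>

static void xts_mul_x(uint64_t t[2])
{
    uint64_t carry = t[1] >> 63;             /* bit carried out of x^127 */
    t[1] = (t[1] << 1) | (t[0] >> 63);
    t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0); /* reduce if x^128 was produced */
}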
285 // store them in the vector registers TWEAK0-TWEAK3. Clobbers V0-V5.
296 // Compute the second block of TWEAK0.
308 // Compute TWEAK[1-3] from TWEAK0.
309 vpsrlq $64 - 1*VL/16, TWEAK0, V0
310 vpsrlq $64 - 2*VL/16, TWEAK0, V2
311 vpsrlq $64 - 3*VL/16, TWEAK0, V4
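
Conceptually, each 128-bit lane of TWEAK1-TWEAK3 is the corresponding lane of TWEAK0 multiplied by x^(1*VL/16), x^(2*VL/16), and x^(3*VL/16). A loop-form sketch building on xts_mul_x() above (the assembly instead computes each power in one shot with the vpsrlq/carryless-multiply pattern whose shift counts appear here):

/* Derive lanes of TWEAK1..TWEAK3 from the matching lane of TWEAK0.
 * Vector j starts j*VL/16 blocks later, hence the x^(j*VL/16) factors. */
static void derive_tweak_vectors(uint64_t tweak[4][2], int vl)
{
    for (int j = 1; j <= 3; j++) {
        tweak[j][0] = tweak[0][0];
        tweak[j][1] = tweak[0][1];
        for (int k = 0; k < j * vl / 16; k++)
            xts_mul_x(tweak[j]);
    }
}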
373 // when VL > 16 (which it is here), the needed shift amounts are byte-aligned,
374 // which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts.
377 vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
379 vpsrldq $(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1
381 vpsrldq $(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2
383 vpsrldq $(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3
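
The byte-alignment claim can be checked with a few lines of arithmetic: advancing all four tweak vectors by 4*VL/16 blocks multiplies by x^(4*VL/16), and for VL of 32 or 64 that exponent (8 or 16 bits) is a whole number of bytes, so the carried-out bits can be extracted with vpsrldq. A small check of the byte counts used above:

#include <assert.h>
#include <stdio.h>

int main(void)
{
    for (int vl = 32; vl <= 64; vl *= 2) {
        int shift_bits = 4 * vl / 16;         /* exponent of x: 8 or 16 */
        assert(shift_bits % 8 == 0);          /* byte-aligned, as claimed */
        printf("VL=%d: vpsrldq by %d bytes\n", vl, (128 - shift_bits) / 8);
    }
    return 0;                                 /* prints 15 and 14 */
}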
405 // TWEAK[0-3]. To complete all steps, this is invoked with increasing values of
431 // For AES-128, increment by 3*16, resulting in the 10 round keys (not
432 // counting the zero-th round key which was just loaded into KEY0) being
433 // -2*16(KEY) through 7*16(KEY). For AES-192, increment by 5*16 and use
434 // 12 round keys -4*16(KEY) through 7*16(KEY). For AES-256, increment
435 // by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY).
438 // any round key be in the range [-96, 112], fitting in a signed byte.
439 // This shortens VEX-encoded instructions that access the later round
440 // keys which otherwise would need 4-byte offsets. Second, it makes it
441 // easy to do AES-128 and AES-192 by skipping irrelevant rounds at the
449 lea OFFS-16(KEY, KEYLEN64, 4), KEY
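
The lea adds KEYLEN*4 + OFFS - 16 to KEY (OFFS is treated as 0 in the sketch below). A short C check, not from the file, confirming the offset ranges quoted above and that every per-round displacement fits in a signed byte:

#include <assert.h>
#include <stdio.h>

int main(void)
{
    for (int keylen = 16; keylen <= 32; keylen += 8) {
        int nrounds = keylen / 4 + 6;        /* 10, 12, or 14 */
        int rebase  = keylen * 4 - 16;       /* 3*16, 5*16, or 7*16 */
        /* Round key i (i = 1..nrounds) originally sits at i*16(KEY). */
        int first = 1 * 16 - rebase;
        int last  = nrounds * 16 - rebase;
        assert(last == 7 * 16);              /* always ends at 7*16(KEY) */
        assert(first >= -96 && last <= 112); /* signed 8-bit displacements */
        printf("AES-%d: %d*16(KEY) .. 7*16(KEY)\n", keylen * 8, first / 16);
    }
    return 0;
}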
456 _vbroadcast128 -6*16(KEY), KEY1
457 _vbroadcast128 -5*16(KEY), KEY2
459 _vbroadcast128 -4*16(KEY), KEY3
460 _vbroadcast128 -3*16(KEY), KEY4
462 _vbroadcast128 -2*16(KEY), KEY5
463 _vbroadcast128 -1*16(KEY), KEY6
476 // on the block(s) in \data using the round key(s) in \key. The register length
494 // Do a single round of AES en/decryption on the block(s) in \data, using the
495 // same key for all block(s). The round key is loaded from the appropriate
502 _vaes \enc, \last, (\i-7)*16(KEY), \data
504 _vbroadcast128 (\i-7)*16(KEY), V4
510 // Do a single round of AES en/decryption on the blocks in registers V0-V3,
516 _tweak_step (2*(\i-5))
519 _tweak_step (2*(\i-5) + 1)
523 _vbroadcast128 (\i-7)*16(KEY), V4
524 _tweak_step (2*(\i-5))
527 _tweak_step (2*(\i-5) + 1)
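
A rough C-intrinsics picture of what one such round does, shown with 128-bit blocks (the wider variants do the same with ymm/zmm registers holding 2 or 4 blocks each): the same round key is applied to four independent registers, which keeps the AES units pipelined, and the assembly additionally slips one _tweak_step of the next-tweak computation between the AES instructions. Hypothetical helper, not the macro itself:

#include <immintrin.h>    /* build with -maes (and -mavx for the VEX forms) */

static void aes_round_4x(__m128i b[4], __m128i round_key)
{
    b[0] = _mm_aesenc_si128(b[0], round_key);
    b[1] = _mm_aesenc_si128(b[1], round_key);
    b[2] = _mm_aesenc_si128(b[2], round_key);
    b[3] = _mm_aesenc_si128(b[3], round_key);
}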
534 // then XOR with \tweak again) of the block(s) in \data. To process a single
535 // block, use xmm registers and set \xmm_suffix=_XMM. To process a vector of
566 // block length, exclude the last full block from the main loop by
569 // the last full block and the partial block specially at the end.
570 lea -16(LEN), %eax
575 // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
581 // Compute the first set of tweaks TWEAK[0-3].
584 sub $4*VL, LEN
590 // XOR each source block with its tweak and the zero-th round key.
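
That first XOR folds two steps together: XTS XORs each block with its tweak, and AES "round 0" is itself only an XOR with the first round key, so both can be combined (cf. the three-argument XOR noted earlier). A self-contained C-intrinsics sketch of the whole per-block flow for AES-128, with assumed names; decryption is analogous with aesdec/aesdeclast and the decryption key schedule:

#include <immintrin.h>    /* build with -maes */

/* C = AES-128-Encrypt(P ^ T) ^ T, the core XTS block operation. */
static __m128i xts_encrypt_block(__m128i p, __m128i t, const __m128i rk[11])
{
    __m128i b = _mm_xor_si128(_mm_xor_si128(p, t), rk[0]); /* tweak + round 0 */
    for (int i = 1; i < 10; i++)
        b = _mm_aesenc_si128(b, rk[i]);                    /* rounds 1..9 */
    b = _mm_aesenclast_si128(b, rk[10]);                   /* final round */
    return _mm_xor_si128(b, t);                            /* XOR tweak again */
}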
649 sub $4*VL, LEN
653 // 4*VL. Handle it out-of-line in order to optimize for the common
655 test $4*VL-1, LEN8
669 add $3*VL, LEN // Undo extra sub of 4*VL, then sub VL.
678 sub $VL, LEN
681 add $VL-16, LEN // Undo extra sub of VL, then sub 16.
683 add $4*VL-16, LEN // Undo extra sub of 4*VL, then sub 16.
695 sub $16, LEN
698 add $16, LEN // Undo the extra sub of 16.
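
The adds and subs above are loop bookkeeping: LEN is decremented before each pass and the "undo" adds fall through to progressively smaller step sizes. Ignoring the last-full-block carve-out for ciphertext stealing described earlier, the control flow is equivalent to this C outline (a sketch of the structure only):

#include <stddef.h>

static void process_lengths(size_t len, unsigned vl)
{
    while (len >= 4 * (size_t)vl) { /* main four-vector loop */ len -= 4 * vl; }
    while (len >= vl)             { /* one vector at a time  */ len -= vl; }
    while (len >= 16)             { /* one block at a time   */ len -= 16; }
    /* len is now 0..15; nonzero means the partial-block path below runs */
}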
706 // If encrypting, the main loop already encrypted the last full block to
709 sub $16, SRC
710 sub $16, DST
713 // If decrypting, the main loop didn't decrypt the last full block
715 // Do it now by advancing the tweak and decrypting the last full block.
723 mov $-1, %r9d
727 // Swap the first LEN bytes of the en/decryption of the last full block
728 // with the partial block. Note that to support in-place en/decryption,
729 // the load from the src partial block must happen before the store to
730 // the dst partial block.
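
This is standard XTS ciphertext stealing. For the encryption direction, a C sketch of the swap being described, using hypothetical names and reusing xts_encrypt_block() from the earlier sketch; with a single in-place buffer for the partial block, the read naturally precedes the write, matching the ordering constraint noted here:

#include <immintrin.h>
#include <stdint.h>
#include <string.h>

/* 'last' holds the 16-byte encryption of the last full block (tweak T[n-1]);
 * 'partial' holds the trailing len (1..15) plaintext bytes. On return,
 * 'partial' is the short final ciphertext and 'last' has been re-encrypted
 * with the next tweak 't_next'. */
static void xts_steal_encrypt(uint8_t *last, uint8_t *partial, size_t len,
                              __m128i t_next, const __m128i rk[11])
{
    uint8_t block[16];
    memcpy(block, partial, len);                /* read the partial block... */
    memcpy(block + len, last + len, 16 - len);  /* ...and steal the tail */
    memcpy(partial, last, len);                 /* ...before writing it back */
    __m128i c = xts_encrypt_block(_mm_loadu_si128((const __m128i *)block),
                                  t_next, rk);
    _mm_storeu_si128((__m128i *)last, c);       /* full last block out */
}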
737 // Load the src partial block, left-aligned. Note that to support
738 // in-place en/decryption, this must happen before the store to the dst
739 // partial block.
742 // Shift the first LEN bytes of the en/decryption of the last full block
744 // dst partial block. It also writes to the second part of the dst last
745 // full block, but that part is overwritten later.
749 // Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
750 sub LEN64, %r9
753 // Shift the src partial block to the beginning of its register.
756 // Do a blend to generate the src partial block followed by the second
757 // part of the en/decryption of the last full block.
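
The index vector being built is a pshufb control mask: each result byte copies the source byte named by its index, and an index with bit 7 set (0x80) produces zero. A tiny C sketch of the contents described above (hypothetical helper; the assembly presumably obtains the same 16 bytes by indexing a static permute table, as the pointer arithmetic above suggests):

#include <stdint.h>

/* With these indices, pshufb moves the top 'len' bytes of the source down
 * to positions 0..len-1 and zeroes the rest; this is the "shift the src
 * partial block to the beginning of its register" step. */
static void cts_shift_indices(uint8_t idx[16], unsigned len /* 1..15 */)
{
    for (unsigned i = 0; i < 16; i++)
        idx[i] = (i < len) ? (uint8_t)(16 - len + i) : 0x80;
}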
760 // En/decrypt again and store the last full block.
772 lea -16(%rdi, %rax, 4), %rdi
776 vaesenc -6*16(%rdi), %xmm0, %xmm0
777 vaesenc -5*16(%rdi), %xmm0, %xmm0
779 vaesenc -4*16(%rdi), %xmm0, %xmm0
780 vaesenc -3*16(%rdi), %xmm0, %xmm0
782 vaesenc -2*16(%rdi), %xmm0, %xmm0
783 vaesenc -1*16(%rdi), %xmm0, %xmm0
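
The vaesenc chain above, running over round keys at negative offsets from the rebased key pointer, is a plain single-block AES encryption; in AES-XTS this is how the IV is encrypted with the tweak key to produce the initial tweak. A C-intrinsics sketch of the same operation for AES-128 (assumed names; AES-192/256 use more rounds, and shorter keys skip the earliest entries of the chain, per the rebasing comment earlier):

#include <immintrin.h>    /* build with -maes */

static __m128i aes128_encrypt_block(__m128i b, const __m128i rk[11])
{
    b = _mm_xor_si128(b, rk[0]);            /* round 0: key XOR only */
    for (int i = 1; i < 10; i++)
        b = _mm_aesenc_si128(b, rk[i]);
    return _mm_aesenclast_si128(b, rk[10]); /* last round */
}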
796 // Below are the actual AES-XTS encryption and decryption functions,