Lines Matching +full:sub +full:- +full:block
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
3 * AES-XTS for modern x86_64 CPUs
11 * This file implements AES-XTS for modern x86_64 CPUs. To handle the
16 * AES-NI + AVX
17 * - 128-bit vectors (1 AES block per vector)
18 * - VEX-coded instructions
19 * - xmm0-xmm15
20 * - This is for older CPUs that lack VAES but do have AVX.
23 * - 256-bit vectors (2 AES blocks per vector)
24 * - VEX-coded instructions
25 * - ymm0-ymm15
26 * - This is for CPUs that have VAES but lack AVX512 or AVX10,
30 * - 256-bit vectors (2 AES blocks per vector)
31 * - EVEX-coded instructions
32 * - ymm0-ymm31
33 * - This is for CPUs that have AVX512 but where using zmm registers causes
35 * - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256.
36 * To avoid confusion with 512-bit, we just write AVX10/256.
39 * - Same as the previous one, but upgrades to 512-bit vectors
40 * (4 AES blocks per vector) in zmm0-zmm31.
41 * - This is for CPUs that have good AVX512 or AVX10/512 support.
43 * This file doesn't have an implementation for AES-NI alone (without AVX), as
51 * The AES-XTS implementations in this file support everything required by the
52 * crypto API, including support for arbitrary input lengths and multi-part
54 * power-of-2 length inputs that are processed in a single part (disk sectors).
73 // on CPUs that don't support AVX10-style masking.
85 // advanced to point to 7th-from-last round key
97 // %r9-r11 are available as temporaries.
112 // Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers
150 // V0-V3 hold the data blocks during the main loop, or temporary values
151 // otherwise. V4-V5 hold temporary values.
153 // V6-V9 hold XTS tweaks. Each 128-bit lane holds one tweak.
161 // V10-V13 are used for computing the next values of TWEAK[0-3].
167 // V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes.
171 // V15 holds the key for AES "round 0", copied to all 128-bit lanes.
175 // If 32 SIMD registers are available, then V16-V29 hold the remaining
176 // AES round keys, copied to all 128-bit lanes.
178 // AES-128, AES-192, and AES-256 use different numbers of round keys.
180 // keys to the *end* of this register range. I.e., AES-128 uses
181 // KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
182 // (All also use KEY0 for the XOR-only "round" at the beginning.)
213 // V30-V31 are currently unused.
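
The round-key register mapping described above (KEY5-KEY14 for AES-128, KEY3-KEY14 for AES-192, KEY1-KEY14 for AES-256) follows directly from the AES round count. A minimal C sketch of that arithmetic, not from the file, assuming the standard rule nrounds = keylen/4 + 6:

#include <stdio.h>

/* Round keys are packed toward the *end* of the KEY1..KEY14 alias range,
 * so the first register used is KEY(15 - nrounds); KEY0 always holds the
 * key for the XOR-only "round 0". */
int main(void)
{
    for (int keylen = 16; keylen <= 32; keylen += 8) {
        int nrounds = keylen / 4 + 6;   /* 10, 12, or 14 */
        printf("AES-%d: KEY%d-KEY14 (plus KEY0)\n", keylen * 8, 15 - nrounds);
    }
    return 0;
}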
225 // Broadcast a 128-bit value into a vector.
248 // vpternlogd with immediate 0x96 is a three-argument XOR.
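
This is easy to confirm: vpternlogd applies an arbitrary 3-input boolean function given as an 8-bit truth table, indexed per bit position by the three operand bits. A self-contained C check (not from the file) that table 0x96 is exactly a ^ b ^ c:

#include <assert.h>

int main(void)
{
    /* Per bit, vpternlogd yields ((imm8 >> ((a << 2) | (b << 1) | c)) & 1). */
    for (int a = 0; a <= 1; a++)
        for (int b = 0; b <= 1; b++)
            for (int c = 0; c <= 1; c++)
                assert(((0x96 >> ((a << 2) | (b << 1) | c)) & 1) == (a ^ b ^ c));
    return 0;
}

Since XOR is symmetric, the operand-to-index ordering does not matter for this particular table.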
256 // Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak
276 vpsrlq $64 - VL/16, \src, \tmp1
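
For reference, the underlying operation is the standard XTS tweak update: multiply the 128-bit tweak by the polynomial x in GF(2^128), reducing modulo x^128 + x^7 + x^2 + x + 1 (the 0x87 constant kept in .Lgf_poly). A plain C sketch of the single-step form, with the tweak held as two little-endian 64-bit halves; the vectorized paths here multiply by x^(VL/16) instead, so each 128-bit lane advances by a whole vector's worth of blocks:

#include <stdint.h>

static void xts_mul_x(uint64_t t[2])
{
    uint64_t carry = t[1] >> 63;             /* bit carried out of x^127 */
    t[1] = (t[1] << 1) | (t[0] >> 63);
    t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0); /* reduce if x^128 was produced */
}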
285 // store them in the vector registers TWEAK0-TWEAK3. Clobbers V0-V5.
296 // Compute the second block of TWEAK0.
308 // Compute TWEAK[1-3] from TWEAK0.
309 vpsrlq $64 - 1*VL/16, TWEAK0, V0
310 vpsrlq $64 - 2*VL/16, TWEAK0, V2
311 vpsrlq $64 - 3*VL/16, TWEAK0, V4
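
Conceptually, each 128-bit lane of TWEAK1-TWEAK3 is the corresponding lane of TWEAK0 multiplied by x^(1*VL/16), x^(2*VL/16), and x^(3*VL/16). A loop-form sketch building on xts_mul_x() above (the assembly instead computes each power in one shot with the vpsrlq/carryless-multiply pattern whose shift counts appear here):

/* Derive lanes of TWEAK1..TWEAK3 from the matching lane of TWEAK0.
 * Vector j starts j*VL/16 blocks later, hence the x^(j*VL/16) factors. */
static void derive_tweak_vectors(uint64_t tweak[4][2], int vl)
{
    for (int j = 1; j <= 3; j++) {
        tweak[j][0] = tweak[0][0];
        tweak[j][1] = tweak[0][1];
        for (int k = 0; k < j * vl / 16; k++)
            xts_mul_x(tweak[j]);
    }
}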
373 // when VL > 16 (which it is here), the needed shift amounts are byte-aligned,
374 // which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts.
377 vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
379 vpsrldq $(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1
381 vpsrldq $(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2
383 vpsrldq $(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3
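
The byte-alignment claim can be checked with a few lines of arithmetic: advancing all four tweak vectors by 4*VL/16 blocks multiplies by x^(4*VL/16), and for VL of 32 or 64 that exponent (8 or 16 bits) is a whole number of bytes, so the carried-out bits can be extracted with vpsrldq. A small check of the byte counts used above:

#include <assert.h>
#include <stdio.h>

int main(void)
{
    for (int vl = 32; vl <= 64; vl *= 2) {
        int shift_bits = 4 * vl / 16;         /* exponent of x: 8 or 16 */
        assert(shift_bits % 8 == 0);          /* byte-aligned, as claimed */
        printf("VL=%d: vpsrldq by %d bytes\n", vl, (128 - shift_bits) / 8);
    }
    return 0;                                 /* prints 15 and 14 */
}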
405 // TWEAK[0-3]. To complete all steps, this is invoked with increasing values of
431 // For AES-128, increment by 3*16, resulting in the 10 round keys (not
432 // counting the zero-th round key which was just loaded into KEY0) being
433 // -2*16(KEY) through 7*16(KEY). For AES-192, increment by 5*16 and use
434 // 12 round keys -4*16(KEY) through 7*16(KEY). For AES-256, increment
435 // by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY).
438 // any round key be in the range [-96, 112], fitting in a signed byte.
439 // This shortens VEX-encoded instructions that access the later round
440 // keys which otherwise would need 4-byte offsets. Second, it makes it
441 // easy to do AES-128 and AES-192 by skipping irrelevant rounds at the
449 lea OFFS-16(KEY, KEYLEN64, 4), KEY
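
The lea adds KEYLEN*4 + OFFS - 16 to KEY (OFFS is treated as 0 in the sketch below). A short C check, not from the file, confirming the offset ranges quoted above and that every per-round displacement fits in a signed byte:

#include <assert.h>
#include <stdio.h>

int main(void)
{
    for (int keylen = 16; keylen <= 32; keylen += 8) {
        int nrounds = keylen / 4 + 6;        /* 10, 12, or 14 */
        int rebase  = keylen * 4 - 16;       /* 3*16, 5*16, or 7*16 */
        /* Round key i (i = 1..nrounds) originally sits at i*16(KEY). */
        int first = 1 * 16 - rebase;
        int last  = nrounds * 16 - rebase;
        assert(last == 7 * 16);              /* always ends at 7*16(KEY) */
        assert(first >= -96 && last <= 112); /* signed 8-bit displacements */
        printf("AES-%d: %d*16(KEY) .. 7*16(KEY)\n", keylen * 8, first / 16);
    }
    return 0;
}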
456 _vbroadcast128 -6*16(KEY), KEY1
457 _vbroadcast128 -5*16(KEY), KEY2
459 _vbroadcast128 -4*16(KEY), KEY3
460 _vbroadcast128 -3*16(KEY), KEY4
462 _vbroadcast128 -2*16(KEY), KEY5
463 _vbroadcast128 -1*16(KEY), KEY6
476 // on the block(s) in \data using the round key(s) in \key. The register length
494 // Do a single round of AES en/decryption on the block(s) in \data, using the
495 // same key for all block(s). The round key is loaded from the appropriate
502 _vaes \enc, \last, (\i-7)*16(KEY), \data
504 _vbroadcast128 (\i-7)*16(KEY), V4
510 // Do a single round of AES en/decryption on the blocks in registers V0-V3,
516 _tweak_step (2*(\i-5))
519 _tweak_step (2*(\i-5) + 1)
523 _vbroadcast128 (\i-7)*16(KEY), V4
524 _tweak_step (2*(\i-5))
527 _tweak_step (2*(\i-5) + 1)
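
A rough C-intrinsics picture of what one such round does, shown with 128-bit blocks (the wider variants do the same with ymm/zmm registers holding 2 or 4 blocks each): the same round key is applied to four independent registers, which keeps the AES units pipelined, and the assembly additionally slips one _tweak_step of the next-tweak computation between the AES instructions. Hypothetical helper, not the macro itself:

#include <immintrin.h>    /* build with -maes (and -mavx for the VEX forms) */

static void aes_round_4x(__m128i b[4], __m128i round_key)
{
    b[0] = _mm_aesenc_si128(b[0], round_key);
    b[1] = _mm_aesenc_si128(b[1], round_key);
    b[2] = _mm_aesenc_si128(b[2], round_key);
    b[3] = _mm_aesenc_si128(b[3], round_key);
}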
534 // then XOR with \tweak again) of the block(s) in \data. To process a single
535 // block, use xmm registers and set \xmm_suffix=_XMM. To process a vector of
566 // block length, exclude the last full block from the main loop by
569 // the last full block and the partial block specially at the end.
570 lea -16(LEN), %eax
575 // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
581 // Compute the first set of tweaks TWEAK[0-3].
584 sub $4*VL, LEN
590 // XOR each source block with its tweak and the zero-th round key.
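
That first XOR folds two steps together: XTS XORs each block with its tweak, and AES "round 0" is itself only an XOR with the first round key, so both can be combined (cf. the three-argument XOR noted earlier). A self-contained C-intrinsics sketch of the whole per-block flow for AES-128, with assumed names; decryption is analogous with aesdec/aesdeclast and the decryption key schedule:

#include <immintrin.h>    /* build with -maes */

/* C = AES-128-Encrypt(P ^ T) ^ T, the core XTS block operation. */
static __m128i xts_encrypt_block(__m128i p, __m128i t, const __m128i rk[11])
{
    __m128i b = _mm_xor_si128(_mm_xor_si128(p, t), rk[0]); /* tweak + round 0 */
    for (int i = 1; i < 10; i++)
        b = _mm_aesenc_si128(b, rk[i]);                    /* rounds 1..9 */
    b = _mm_aesenclast_si128(b, rk[10]);                   /* final round */
    return _mm_xor_si128(b, t);                            /* XOR tweak again */
}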
649 sub $4*VL, LEN
653 // 4*VL. Handle it out-of-line in order to optimize for the common
655 test $4*VL-1, LEN8
669 add $3*VL, LEN // Undo extra sub of 4*VL, then sub VL.
678 sub $VL, LEN
681 add $VL-16, LEN // Undo extra sub of VL, then sub 16.
683 add $4*VL-16, LEN // Undo extra sub of 4*VL, then sub 16.
695 sub $16, LEN
698 add $16, LEN // Undo the extra sub of 16.
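
The adds and subs above are loop bookkeeping: LEN is decremented before each pass and the "undo" adds fall through to progressively smaller step sizes. Ignoring the last-full-block carve-out for ciphertext stealing described earlier, the control flow is equivalent to this C outline (a sketch of the structure only):

#include <stddef.h>

static void process_lengths(size_t len, unsigned vl)
{
    while (len >= 4 * (size_t)vl) { /* main four-vector loop */ len -= 4 * vl; }
    while (len >= vl)             { /* one vector at a time  */ len -= vl; }
    while (len >= 16)             { /* one block at a time   */ len -= 16; }
    /* len is now 0..15; nonzero means the partial-block path below runs */
}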
706 // If encrypting, the main loop already encrypted the last full block to
709 sub $16, SRC
710 sub $16, DST
713 // If decrypting, the main loop didn't decrypt the last full block
715 // Do it now by advancing the tweak and decrypting the last full block.
723 mov $-1, %r9d
727 // Swap the first LEN bytes of the en/decryption of the last full block
728 // with the partial block. Note that to support in-place en/decryption,
729 // the load from the src partial block must happen before the store to
730 // the dst partial block.
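
This is standard XTS ciphertext stealing. For the encryption direction, a C sketch of the swap being described, using hypothetical names and reusing xts_encrypt_block() from the earlier sketch; with a single in-place buffer for the partial block, the read naturally precedes the write, matching the ordering constraint noted here:

#include <immintrin.h>
#include <stdint.h>
#include <string.h>

/* 'last' holds the 16-byte encryption of the last full block (tweak T[n-1]);
 * 'partial' holds the trailing len (1..15) plaintext bytes. On return,
 * 'partial' is the short final ciphertext and 'last' has been re-encrypted
 * with the next tweak 't_next'. */
static void xts_steal_encrypt(uint8_t *last, uint8_t *partial, size_t len,
                              __m128i t_next, const __m128i rk[11])
{
    uint8_t block[16];
    memcpy(block, partial, len);                /* read the partial block... */
    memcpy(block + len, last + len, 16 - len);  /* ...and steal the tail */
    memcpy(partial, last, len);                 /* ...before writing it back */
    __m128i c = xts_encrypt_block(_mm_loadu_si128((const __m128i *)block),
                                  t_next, rk);
    _mm_storeu_si128((__m128i *)last, c);       /* full last block out */
}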
737 // Load the src partial block, left-aligned. Note that to support
738 // in-place en/decryption, this must happen before the store to the dst
739 // partial block.
742 // Shift the first LEN bytes of the en/decryption of the last full block
744 // dst partial block. It also writes to the second part of the dst last
745 // full block, but that part is overwritten later.
749 // Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
750 sub LEN64, %r9
753 // Shift the src partial block to the beginning of its register.
756 // Do a blend to generate the src partial block followed by the second
757 // part of the en/decryption of the last full block.
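
The index vector being built is a pshufb control mask: each result byte copies the source byte named by its index, and an index with bit 7 set (0x80) produces zero. A tiny C sketch of the contents described above (hypothetical helper; the assembly presumably obtains the same 16 bytes by indexing a static permute table, as the pointer arithmetic above suggests):

#include <stdint.h>

/* With these indices, pshufb moves the top 'len' bytes of the source down
 * to positions 0..len-1 and zeroes the rest; this is the "shift the src
 * partial block to the beginning of its register" step. */
static void cts_shift_indices(uint8_t idx[16], unsigned len /* 1..15 */)
{
    for (unsigned i = 0; i < 16; i++)
        idx[i] = (i < len) ? (uint8_t)(16 - len + i) : 0x80;
}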
760 // En/decrypt again and store the last full block.
772 lea -16(%rdi, %rax, 4), %rdi
776 vaesenc -6*16(%rdi), %xmm0, %xmm0
777 vaesenc -5*16(%rdi), %xmm0, %xmm0
779 vaesenc -4*16(%rdi), %xmm0, %xmm0
780 vaesenc -3*16(%rdi), %xmm0, %xmm0
782 vaesenc -2*16(%rdi), %xmm0, %xmm0
783 vaesenc -1*16(%rdi), %xmm0, %xmm0
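
The vaesenc chain above, running over round keys at negative offsets from the rebased key pointer, is a plain single-block AES encryption; in AES-XTS this is how the IV is encrypted with the tweak key to produce the initial tweak. A C-intrinsics sketch of the same operation for AES-128 (assumed names; AES-192/256 use more rounds, and shorter keys skip the earliest entries of the chain, per the rebasing comment earlier):

#include <immintrin.h>    /* build with -maes */

static __m128i aes128_encrypt_block(__m128i b, const __m128i rk[11])
{
    b = _mm_xor_si128(b, rk[0]);            /* round 0: key XOR only */
    for (int i = 1; i < 10; i++)
        b = _mm_aesenc_si128(b, rk[i]);
    return _mm_aesenclast_si128(b, rk[10]); /* last round */
}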
796 // Below are the actual AES-XTS encryption and decryption functions,