/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * AES-XTS for modern x86_64 CPUs
 *
 * Copyright 2024 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

/*
 * This file implements AES-XTS for modern x86_64 CPUs.  To handle the
 * complexities of coding for x86 SIMD, e.g. where every vector length needs
 * different code, it uses a macro to generate several implementations that
 * share similar source code but are targeted at different CPUs, listed below:
 *
 * AES-NI + AVX
 *    - 128-bit vectors (1 AES block per vector)
 *    - VEX-coded instructions
 *    - xmm0-xmm15
 *    - This is for older CPUs that lack VAES but do have AVX.
 *
 * VAES + VPCLMULQDQ + AVX2
 *    - 256-bit vectors (2 AES blocks per vector)
 *    - VEX-coded instructions
 *    - ymm0-ymm15
 *    - This is for CPUs that have VAES but lack AVX512 or AVX10,
 *      e.g. Intel's Alder Lake and AMD's Zen 3.
 *
 * VAES + VPCLMULQDQ + AVX10/256 + BMI2
 *    - 256-bit vectors (2 AES blocks per vector)
 *    - EVEX-coded instructions
 *    - ymm0-ymm31
 *    - This is for CPUs that have AVX512 but where using zmm registers causes
 *      downclocking, and for CPUs that have AVX10/256 but not AVX10/512.
 *    - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256.
 *      To avoid confusion with 512-bit, we just write AVX10/256.
 *
 * VAES + VPCLMULQDQ + AVX10/512 + BMI2
 *    - Same as the previous one, but upgrades to 512-bit vectors
 *      (4 AES blocks per vector) in zmm0-zmm31.
 *    - This is for CPUs that have good AVX512 or AVX10/512 support.
 *
 * This file doesn't have an implementation for AES-NI alone (without AVX), as
 * the lack of VEX would make all the assembly code different.
 *
 * When we use VAES, we also use VPCLMULQDQ to parallelize the computation of
 * the XTS tweaks.  This avoids a bottleneck.  Currently there don't seem to be
 * any CPUs that support VAES but not VPCLMULQDQ.  If that changes, we might
 * need to start also providing an implementation using VAES alone.
 *
 * The AES-XTS implementations in this file support everything required by the
 * crypto API, including support for arbitrary input lengths and multi-part
 * processing.  However, they are most heavily optimized for the common case of
 * power-of-2 length inputs that are processed in a single part (disk sectors).
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>

.section .rodata
.p2align 4
.Lgf_poly:
	// The low 64 bits of this value represent the polynomial x^7 + x^2 + x
	// + 1.  It is the value that must be XOR'd into the low 64 bits of the
	// tweak each time a 1 is carried out of the high 64 bits.
	//
	// The high 64 bits of this value hold just the internal carry bit that
	// exists when there's a carry out of the low 64 bits of the tweak.
	.quad	0x87, 1

	// This table contains constants for vpshufb and vpblendvb, used to
	// handle variable byte shifts and blending during ciphertext stealing
	// on CPUs that don't support AVX10-style masking.
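	//
	// Illustrative note: the ciphertext stealing code at the end of
	// _aes_xts_crypt indexes this table in two ways.  A mask loaded from
	// .Lcts_permute_table + LEN moves the first LEN bytes of a register to
	// its end, and a mask loaded from .Lcts_permute_table + 32 - LEN moves
	// the last LEN bytes of a register to its beginning.  Mask bytes of
	// 0x80 make vpshufb produce zeroes and mark the positions where
	// vpblendvb takes bytes from the en/decrypted last full block.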
.Lcts_permute_table:
	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
.text

// Function parameters
.set	KEY,		%rdi	// Initially points to crypto_aes_ctx, then is
				// advanced to point to 7th-from-last round key
.set	SRC,		%rsi	// Pointer to next source data
.set	DST,		%rdx	// Pointer to next destination data
.set	LEN,		%ecx	// Remaining length in bytes
.set	LEN8,		%cl
.set	LEN64,		%rcx
.set	TWEAK,		%r8	// Pointer to next tweak

// %rax holds the AES key length in bytes.
.set	KEYLEN,		%eax
.set	KEYLEN64,	%rax

// %r9-r11 are available as temporaries.

.macro	_define_Vi	i
.if VL == 16
	.set	V\i,		%xmm\i
.elseif VL == 32
	.set	V\i,		%ymm\i
.elseif VL == 64
	.set	V\i,		%zmm\i
.else
	.error "Unsupported Vector Length (VL)"
.endif
.endm

.macro	_define_aliases
	// Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers
	// are available, that map to the xmm, ymm, or zmm registers according
	// to the selected Vector Length (VL).
	_define_Vi	0
	_define_Vi	1
	_define_Vi	2
	_define_Vi	3
	_define_Vi	4
	_define_Vi	5
	_define_Vi	6
	_define_Vi	7
	_define_Vi	8
	_define_Vi	9
	_define_Vi	10
	_define_Vi	11
	_define_Vi	12
	_define_Vi	13
	_define_Vi	14
	_define_Vi	15
.if USE_AVX10
	_define_Vi	16
	_define_Vi	17
	_define_Vi	18
	_define_Vi	19
	_define_Vi	20
	_define_Vi	21
	_define_Vi	22
	_define_Vi	23
	_define_Vi	24
	_define_Vi	25
	_define_Vi	26
	_define_Vi	27
	_define_Vi	28
	_define_Vi	29
	_define_Vi	30
	_define_Vi	31
.endif

	// V0-V3 hold the data blocks during the main loop, or temporary values
	// otherwise.  V4-V5 hold temporary values.

	// V6-V9 hold XTS tweaks.  Each 128-bit lane holds one tweak.
	.set	TWEAK0_XMM,	%xmm6
	.set	TWEAK0,		V6
	.set	TWEAK1_XMM,	%xmm7
	.set	TWEAK1,		V7
	.set	TWEAK2,		V8
	.set	TWEAK3,		V9

	// V10-V13 are used for computing the next values of TWEAK[0-3].
	.set	NEXT_TWEAK0,	V10
	.set	NEXT_TWEAK1,	V11
	.set	NEXT_TWEAK2,	V12
	.set	NEXT_TWEAK3,	V13

	// V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes.
	.set	GF_POLY_XMM,	%xmm14
	.set	GF_POLY,	V14

	// V15 holds the key for AES "round 0", copied to all 128-bit lanes.
	.set	KEY0_XMM,	%xmm15
	.set	KEY0,		V15

	// If 32 SIMD registers are available, then V16-V29 hold the remaining
	// AES round keys, copied to all 128-bit lanes.
	//
	// AES-128, AES-192, and AES-256 use different numbers of round keys.
	// To allow handling all three variants efficiently, we align the round
	// keys to the *end* of this register range.  I.e., AES-128 uses
	// KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
	// (All also use KEY0 for the XOR-only "round" at the beginning.)
.if USE_AVX10
	.set	KEY1_XMM,	%xmm16
	.set	KEY1,		V16
	.set	KEY2_XMM,	%xmm17
	.set	KEY2,		V17
	.set	KEY3_XMM,	%xmm18
	.set	KEY3,		V18
	.set	KEY4_XMM,	%xmm19
	.set	KEY4,		V19
	.set	KEY5_XMM,	%xmm20
	.set	KEY5,		V20
	.set	KEY6_XMM,	%xmm21
	.set	KEY6,		V21
	.set	KEY7_XMM,	%xmm22
	.set	KEY7,		V22
	.set	KEY8_XMM,	%xmm23
	.set	KEY8,		V23
	.set	KEY9_XMM,	%xmm24
	.set	KEY9,		V24
	.set	KEY10_XMM,	%xmm25
	.set	KEY10,		V25
	.set	KEY11_XMM,	%xmm26
	.set	KEY11,		V26
	.set	KEY12_XMM,	%xmm27
	.set	KEY12,		V27
	.set	KEY13_XMM,	%xmm28
	.set	KEY13,		V28
	.set	KEY14_XMM,	%xmm29
	.set	KEY14,		V29
.endif
	// V30-V31 are currently unused.
.endm

// Move a vector between memory and a register.
.macro	_vmovdqu	src, dst
.if VL < 64
	vmovdqu		\src, \dst
.else
	vmovdqu8	\src, \dst
.endif
.endm

// Broadcast a 128-bit value into a vector.
.macro	_vbroadcast128	src, dst
.if VL == 16 && !USE_AVX10
	vmovdqu		\src, \dst
.elseif VL == 32 && !USE_AVX10
	vbroadcasti128	\src, \dst
.else
	vbroadcasti32x4	\src, \dst
.endif
.endm

// XOR two vectors together.
.macro	_vpxor	src1, src2, dst
.if USE_AVX10
	vpxord		\src1, \src2, \dst
.else
	vpxor		\src1, \src2, \dst
.endif
.endm

// XOR three vectors together.
.macro	_xor3	src1, src2, src3_and_dst
.if USE_AVX10
	// vpternlogd with immediate 0x96 is a three-argument XOR.
	vpternlogd	$0x96, \src1, \src2, \src3_and_dst
.else
	vpxor		\src1, \src3_and_dst, \src3_and_dst
	vpxor		\src2, \src3_and_dst, \src3_and_dst
.endif
.endm

// Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak
// (by multiplying by the polynomial 'x') and write it to \dst.
.macro	_next_tweak	src, tmp, dst
	vpshufd		$0x13, \src, \tmp
	vpaddq		\src, \src, \dst
	vpsrad		$31, \tmp, \tmp
	vpand		GF_POLY_XMM, \tmp, \tmp
	vpxor		\tmp, \dst, \dst
.endm

// Given the XTS tweak(s) in the vector \src, compute the next vector of
// tweak(s) (by multiplying by the polynomial 'x^(VL/16)') and write it to \dst.
//
// If VL > 16, then there are multiple tweaks, and we use vpclmulqdq to compute
// all tweaks in the vector in parallel.  If VL=16, we just do the regular
// computation without vpclmulqdq, as it's the faster method for a single tweak.
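//
// Illustrative math: interpreting a 128-bit tweak t as a polynomial over
// GF(2), multiplying by x^N modulo x^128 + x^7 + x^2 + x + 1 is
//
//	t * x^N  =  (t << N)  XOR  ((t >> (128 - N)) * 0x87)
//
// where the multiplication by 0x87 (= x^7 + x^2 + x + 1) is carry-less, since
// x^128 is congruent to 0x87 modulo the reduction polynomial.  The VL > 16
// case below computes this per 128-bit lane with N = VL/16: the vpsrlq and
// vpslldq pair carries the bits shifted out of the low 64-bit half into the
// high half, and the vpclmulqdq by the low qword of GF_POLY (0x87) produces
// the reduction term.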
.macro	_next_tweakvec	src, tmp1, tmp2, dst
.if VL == 16
	_next_tweak	\src, \tmp1, \dst
.else
	vpsrlq		$64 - VL/16, \src, \tmp1
	vpclmulqdq	$0x01, GF_POLY, \tmp1, \tmp2
	vpslldq		$8, \tmp1, \tmp1
	vpsllq		$VL/16, \src, \dst
	_xor3		\tmp1, \tmp2, \dst
.endif
.endm

// Given the first XTS tweak at (TWEAK), compute the first set of tweaks and
// store them in the vector registers TWEAK0-TWEAK3.  Clobbers V0-V5.
.macro	_compute_first_set_of_tweaks
	vmovdqu		(TWEAK), TWEAK0_XMM
	_vbroadcast128	.Lgf_poly(%rip), GF_POLY
.if VL == 16
	// With VL=16, multiplying by x serially is fastest.
	_next_tweak	TWEAK0, %xmm0, TWEAK1
	_next_tweak	TWEAK1, %xmm0, TWEAK2
	_next_tweak	TWEAK2, %xmm0, TWEAK3
.else
.if VL == 32
	// Compute the second block of TWEAK0.
	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
	vinserti128	$1, %xmm1, TWEAK0, TWEAK0
.elseif VL == 64
	// Compute the remaining blocks of TWEAK0.
	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
	_next_tweak	%xmm1, %xmm0, %xmm2
	_next_tweak	%xmm2, %xmm0, %xmm3
	vinserti32x4	$1, %xmm1, TWEAK0, TWEAK0
	vinserti32x4	$2, %xmm2, TWEAK0, TWEAK0
	vinserti32x4	$3, %xmm3, TWEAK0, TWEAK0
.endif
	// Compute TWEAK[1-3] from TWEAK0.
	vpsrlq		$64 - 1*VL/16, TWEAK0, V0
	vpsrlq		$64 - 2*VL/16, TWEAK0, V2
	vpsrlq		$64 - 3*VL/16, TWEAK0, V4
	vpclmulqdq	$0x01, GF_POLY, V0, V1
	vpclmulqdq	$0x01, GF_POLY, V2, V3
	vpclmulqdq	$0x01, GF_POLY, V4, V5
	vpslldq		$8, V0, V0
	vpslldq		$8, V2, V2
	vpslldq		$8, V4, V4
	vpsllq		$1*VL/16, TWEAK0, TWEAK1
	vpsllq		$2*VL/16, TWEAK0, TWEAK2
	vpsllq		$3*VL/16, TWEAK0, TWEAK3
.if USE_AVX10
	vpternlogd	$0x96, V0, V1, TWEAK1
	vpternlogd	$0x96, V2, V3, TWEAK2
	vpternlogd	$0x96, V4, V5, TWEAK3
.else
	vpxor		V0, TWEAK1, TWEAK1
	vpxor		V2, TWEAK2, TWEAK2
	vpxor		V4, TWEAK3, TWEAK3
	vpxor		V1, TWEAK1, TWEAK1
	vpxor		V3, TWEAK2, TWEAK2
	vpxor		V5, TWEAK3, TWEAK3
.endif
.endif
.endm

// Do one step in computing the next set of tweaks using the method of just
// multiplying by x repeatedly (the same method _next_tweak uses).
.macro	_tweak_step_mulx	i
.if \i == 0
	.set PREV_TWEAK, TWEAK3
	.set NEXT_TWEAK, NEXT_TWEAK0
.elseif \i == 5
	.set PREV_TWEAK, NEXT_TWEAK0
	.set NEXT_TWEAK, NEXT_TWEAK1
.elseif \i == 10
	.set PREV_TWEAK, NEXT_TWEAK1
	.set NEXT_TWEAK, NEXT_TWEAK2
.elseif \i == 15
	.set PREV_TWEAK, NEXT_TWEAK2
	.set NEXT_TWEAK, NEXT_TWEAK3
.endif
.if \i >= 0 && \i < 20 && \i % 5 == 0
	vpshufd		$0x13, PREV_TWEAK, V5
.elseif \i >= 0 && \i < 20 && \i % 5 == 1
	vpaddq		PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK
.elseif \i >= 0 && \i < 20 && \i % 5 == 2
	vpsrad		$31, V5, V5
.elseif \i >= 0 && \i < 20 && \i % 5 == 3
	vpand		GF_POLY, V5, V5
.elseif \i >= 0 && \i < 20 && \i % 5 == 4
	vpxor		V5, NEXT_TWEAK, NEXT_TWEAK
.elseif \i == 1000
	vmovdqa		NEXT_TWEAK0, TWEAK0
	vmovdqa		NEXT_TWEAK1, TWEAK1
	vmovdqa		NEXT_TWEAK2, TWEAK2
	vmovdqa		NEXT_TWEAK3, TWEAK3
.endif
.endm

// Do one step in computing the next set of tweaks using the VPCLMULQDQ method
// (the same method _next_tweakvec uses for VL > 16).  This means multiplying
// each tweak by x^(4*VL/16) independently.  Since 4*VL/16 is a multiple of 8
// when VL > 16 (which it is here), the needed shift amounts are byte-aligned,
// which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts.
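//
// Worked numbers (illustrative): with VL=32 each tweak is multiplied by x^8,
// so the byte shifts below are (128 - 8)/8 = 15 and 8/8 = 1; with VL=64 each
// tweak is multiplied by x^16, so the byte shifts are 14 and 2.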
.macro	_tweak_step_pclmul	i
.if \i == 0
	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
.elseif \i == 2
	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1
.elseif \i == 4
	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2
.elseif \i == 6
	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3
.elseif \i == 8
	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0
.elseif \i == 10
	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1
.elseif \i == 12
	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2
.elseif \i == 14
	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3
.elseif \i == 1000
	vpslldq		$(4*VL/16) / 8, TWEAK0, TWEAK0
	vpslldq		$(4*VL/16) / 8, TWEAK1, TWEAK1
	vpslldq		$(4*VL/16) / 8, TWEAK2, TWEAK2
	vpslldq		$(4*VL/16) / 8, TWEAK3, TWEAK3
	_vpxor		NEXT_TWEAK0, TWEAK0, TWEAK0
	_vpxor		NEXT_TWEAK1, TWEAK1, TWEAK1
	_vpxor		NEXT_TWEAK2, TWEAK2, TWEAK2
	_vpxor		NEXT_TWEAK3, TWEAK3, TWEAK3
.endif
.endm

// _tweak_step does one step of the computation of the next set of tweaks from
// TWEAK[0-3].  To complete all steps, this is invoked with increasing values of
// \i that include at least 0 through 19, then 1000 which signals the last step.
//
// This is used to interleave the computation of the next set of tweaks with the
// AES en/decryptions, which increases performance in some cases.
.macro	_tweak_step	i
.if VL == 16
	_tweak_step_mulx	\i
.else
	_tweak_step_pclmul	\i
.endif
.endm

.macro	_setup_round_keys	enc

	// Select either the encryption round keys or the decryption round keys.
.if \enc
	.set	OFFS, 0
.else
	.set	OFFS, 240
.endif

	// Load the round key for "round 0".
	_vbroadcast128	OFFS(KEY), KEY0

	// Increment KEY to make it so that 7*16(KEY) is the last round key.
	// For AES-128, increment by 3*16, resulting in the 10 round keys (not
	// counting the zero-th round key which was just loaded into KEY0) being
	// -2*16(KEY) through 7*16(KEY).  For AES-192, increment by 5*16 and use
	// 12 round keys -4*16(KEY) through 7*16(KEY).  For AES-256, increment
	// by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY).
	//
	// This rebasing provides two benefits.  First, it makes the offset to
	// any round key be in the range [-96, 112], fitting in a signed byte.
	// This shortens VEX-encoded instructions that access the later round
	// keys which otherwise would need 4-byte offsets.  Second, it makes it
	// easy to do AES-128 and AES-192 by skipping irrelevant rounds at the
	// beginning.  Skipping rounds at the end doesn't work as well because
	// the last round needs different instructions.
	//
	// An alternative approach would be to roll up all the round loops.  We
	// don't do that because it isn't compatible with caching the round keys
	// in registers which we do when possible (see below), and also because
	// it seems unwise to rely *too* heavily on the CPU's branch predictor.
	lea		OFFS-16(KEY, KEYLEN64, 4), KEY
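
	// Worked example (illustrative, AES-256 encryption, OFFS=0): KEYLEN64
	// is 32, so the lea above advances KEY by 32*4 - 16 = 7*16 bytes.
	// Round key i, which the key schedule stores at byte offset i*16, is
	// then at (i-7)*16(KEY); round 1 is at -6*16(KEY) and round 14 (the
	// last) is at 7*16(KEY), matching the loads below and the
	// (i-7)*16(KEY) addressing used by _vaes_1x and _vaes_4x.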

	// If all 32 SIMD registers are available, cache all the round keys.
.if USE_AVX10
	cmp		$24, KEYLEN
	jl		.Laes128\@
	je		.Laes192\@
	_vbroadcast128	-6*16(KEY), KEY1
	_vbroadcast128	-5*16(KEY), KEY2
.Laes192\@:
	_vbroadcast128	-4*16(KEY), KEY3
	_vbroadcast128	-3*16(KEY), KEY4
.Laes128\@:
	_vbroadcast128	-2*16(KEY), KEY5
	_vbroadcast128	-1*16(KEY), KEY6
	_vbroadcast128	0*16(KEY), KEY7
	_vbroadcast128	1*16(KEY), KEY8
	_vbroadcast128	2*16(KEY), KEY9
	_vbroadcast128	3*16(KEY), KEY10
	_vbroadcast128	4*16(KEY), KEY11
	_vbroadcast128	5*16(KEY), KEY12
	_vbroadcast128	6*16(KEY), KEY13
	_vbroadcast128	7*16(KEY), KEY14
.endif
.endm

// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0)
// on the block(s) in \data using the round key(s) in \key.  The register length
// determines the number of AES blocks en/decrypted.
.macro	_vaes	enc, last, key, data
.if \enc
.if \last
	vaesenclast	\key, \data, \data
.else
	vaesenc		\key, \data, \data
.endif
.else
.if \last
	vaesdeclast	\key, \data, \data
.else
	vaesdec		\key, \data, \data
.endif
.endif
.endm

// Do a single round of AES en/decryption on the block(s) in \data, using the
// same key for all block(s).  The round key is loaded from the appropriate
// register or memory location for round \i.  May clobber V4.
.macro	_vaes_1x	enc, last, i, xmm_suffix, data
.if USE_AVX10
	_vaes	\enc, \last, KEY\i\xmm_suffix, \data
.else
.ifnb \xmm_suffix
	_vaes	\enc, \last, (\i-7)*16(KEY), \data
.else
	_vbroadcast128	(\i-7)*16(KEY), V4
	_vaes	\enc, \last, V4, \data
.endif
.endif
.endm

// Do a single round of AES en/decryption on the blocks in registers V0-V3,
// using the same key for all blocks.  The round key is loaded from the
// appropriate register or memory location for round \i.  In addition, does two
// steps of the computation of the next set of tweaks.  May clobber V4.
.macro	_vaes_4x	enc, last, i
.if USE_AVX10
	_tweak_step	(2*(\i-5))
	_vaes		\enc, \last, KEY\i, V0
	_vaes		\enc, \last, KEY\i, V1
	_tweak_step	(2*(\i-5) + 1)
	_vaes		\enc, \last, KEY\i, V2
	_vaes		\enc, \last, KEY\i, V3
.else
	_vbroadcast128	(\i-7)*16(KEY), V4
	_tweak_step	(2*(\i-5))
	_vaes		\enc, \last, V4, V0
	_vaes		\enc, \last, V4, V1
	_tweak_step	(2*(\i-5) + 1)
	_vaes		\enc, \last, V4, V2
	_vaes		\enc, \last, V4, V3
.endif
.endm
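
// Note on the tweak-step schedule (illustrative summary): _vaes_4x issues
// tweak steps 2*(\i-5) and 2*(\i-5) + 1, so the calls for rounds 5 through 14
// in the main loop cover steps 0 through 19.  The calls for rounds 1-4 produce
// negative step numbers, which _tweak_step_mulx and _tweak_step_pclmul treat
// as no-ops.  Step 1000 is issued separately by the main loop after the
// destination blocks have been stored.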

// Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt,
// then XOR with \tweak again) of the block(s) in \data.  To process a single
// block, use xmm registers and set \xmm_suffix=_XMM.  To process a vector of
// length VL, use V* registers and leave \xmm_suffix empty.  May clobber V4.
.macro	_aes_crypt	enc, xmm_suffix, tweak, data
	_xor3		KEY0\xmm_suffix, \tweak, \data
	cmp		$24, KEYLEN
	jl		.Laes128\@
	je		.Laes192\@
	_vaes_1x	\enc, 0, 1, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 2, \xmm_suffix, \data
.Laes192\@:
	_vaes_1x	\enc, 0, 3, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 4, \xmm_suffix, \data
.Laes128\@:
	_vaes_1x	\enc, 0, 5, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 6, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 7, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 8, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 9, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 10, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 11, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 12, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 13, \xmm_suffix, \data
	_vaes_1x	\enc, 1, 14, \xmm_suffix, \data
	_vpxor		\tweak, \data, \data
.endm

.macro	_aes_xts_crypt	enc
	_define_aliases

.if !\enc
	// When decrypting a message whose length isn't a multiple of the AES
	// block length, exclude the last full block from the main loop by
	// subtracting 16 from LEN.  This is needed because ciphertext stealing
	// decryption uses the last two tweaks in reverse order.  We'll handle
	// the last full block and the partial block specially at the end.
	lea		-16(LEN), %eax
	test		$15, LEN8
	cmovnz		%eax, LEN
.endif

	// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
	movl		480(KEY), KEYLEN

	// Set up the pointer to the round keys and cache as many as possible.
	_setup_round_keys	\enc

	// Compute the first set of tweaks TWEAK[0-3].
	_compute_first_set_of_tweaks

	sub		$4*VL, LEN
	jl		.Lhandle_remainder\@

.Lmain_loop\@:
	// This is the main loop, en/decrypting 4*VL bytes per iteration.

	// XOR each source block with its tweak and the zero-th round key.
.if USE_AVX10
	vmovdqu8	0*VL(SRC), V0
	vmovdqu8	1*VL(SRC), V1
	vmovdqu8	2*VL(SRC), V2
	vmovdqu8	3*VL(SRC), V3
	vpternlogd	$0x96, TWEAK0, KEY0, V0
	vpternlogd	$0x96, TWEAK1, KEY0, V1
	vpternlogd	$0x96, TWEAK2, KEY0, V2
	vpternlogd	$0x96, TWEAK3, KEY0, V3
.else
	vpxor		0*VL(SRC), KEY0, V0
	vpxor		1*VL(SRC), KEY0, V1
	vpxor		2*VL(SRC), KEY0, V2
	vpxor		3*VL(SRC), KEY0, V3
	vpxor		TWEAK0, V0, V0
	vpxor		TWEAK1, V1, V1
	vpxor		TWEAK2, V2, V2
	vpxor		TWEAK3, V3, V3
.endif
	cmp		$24, KEYLEN
	jl		.Laes128\@
	je		.Laes192\@
	// Do all the AES rounds on the data blocks, interleaved with
	// the computation of the next set of tweaks.
	_vaes_4x	\enc, 0, 1
	_vaes_4x	\enc, 0, 2
.Laes192\@:
	_vaes_4x	\enc, 0, 3
	_vaes_4x	\enc, 0, 4
.Laes128\@:
	_vaes_4x	\enc, 0, 5
	_vaes_4x	\enc, 0, 6
	_vaes_4x	\enc, 0, 7
	_vaes_4x	\enc, 0, 8
	_vaes_4x	\enc, 0, 9
	_vaes_4x	\enc, 0, 10
	_vaes_4x	\enc, 0, 11
	_vaes_4x	\enc, 0, 12
	_vaes_4x	\enc, 0, 13
	_vaes_4x	\enc, 1, 14

	// XOR in the tweaks again.
	_vpxor		TWEAK0, V0, V0
	_vpxor		TWEAK1, V1, V1
	_vpxor		TWEAK2, V2, V2
	_vpxor		TWEAK3, V3, V3

	// Store the destination blocks.
	_vmovdqu	V0, 0*VL(DST)
	_vmovdqu	V1, 1*VL(DST)
	_vmovdqu	V2, 2*VL(DST)
	_vmovdqu	V3, 3*VL(DST)

	// Finish computing the next set of tweaks.
	_tweak_step	1000

	add		$4*VL, SRC
	add		$4*VL, DST
	sub		$4*VL, LEN
	jge		.Lmain_loop\@
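
	// For example, a 4096-byte request with VL=64 runs the loop above 16
	// times (4*VL = 256 bytes per iteration) and leaves the low byte of
	// LEN zero, so the jnz below is not taken and execution falls through.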

	// Check for the uncommon case where the data length isn't a multiple of
	// 4*VL.  Handle it out-of-line in order to optimize for the common
	// case.  In the common case, just fall through to the ret.
	test		$4*VL-1, LEN8
	jnz		.Lhandle_remainder\@
.Ldone\@:
	// Store the next tweak back to *TWEAK to support continuation calls.
	vmovdqu		TWEAK0_XMM, (TWEAK)
.if VL > 16
	vzeroupper
.endif
	RET

.Lhandle_remainder\@:

	// En/decrypt any remaining full blocks, one vector at a time.
.if VL > 16
	add		$3*VL, LEN	// Undo extra sub of 4*VL, then sub VL.
	jl		.Lvec_at_a_time_done\@
.Lvec_at_a_time\@:
	_vmovdqu	(SRC), V0
	_aes_crypt	\enc, , TWEAK0, V0
	_vmovdqu	V0, (DST)
	_next_tweakvec	TWEAK0, V0, V1, TWEAK0
	add		$VL, SRC
	add		$VL, DST
	sub		$VL, LEN
	jge		.Lvec_at_a_time\@
.Lvec_at_a_time_done\@:
	add		$VL-16, LEN	// Undo extra sub of VL, then sub 16.
.else
	add		$4*VL-16, LEN	// Undo extra sub of 4*VL, then sub 16.
.endif

	// En/decrypt any remaining full blocks, one at a time.
	jl		.Lblock_at_a_time_done\@
.Lblock_at_a_time\@:
	vmovdqu		(SRC), %xmm0
	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0
	vmovdqu		%xmm0, (DST)
	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK0_XMM
	add		$16, SRC
	add		$16, DST
	sub		$16, LEN
	jge		.Lblock_at_a_time\@
.Lblock_at_a_time_done\@:
	add		$16, LEN	// Undo the extra sub of 16.
	// Now 0 <= LEN <= 15.  If LEN is zero, we're done.
	jz		.Ldone\@

	// Otherwise 1 <= LEN <= 15, but the real remaining length is 16 + LEN.
	// Do ciphertext stealing to process the last 16 + LEN bytes.

.if \enc
	// If encrypting, the main loop already encrypted the last full block to
	// create the CTS intermediate ciphertext.  Prepare for the rest of CTS
	// by rewinding the pointers and loading the intermediate ciphertext.
	sub		$16, SRC
	sub		$16, DST
	vmovdqu		(DST), %xmm0
.else
	// If decrypting, the main loop didn't decrypt the last full block
	// because CTS decryption uses the last two tweaks in reverse order.
	// Do it now by advancing the tweak and decrypting the last full block.
	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK1_XMM
	vmovdqu		(SRC), %xmm0
	_aes_crypt	\enc, _XMM, TWEAK1_XMM, %xmm0
.endif

.if USE_AVX10
	// Create a mask that has the first LEN bits set.
	mov		$-1, %r9d
	bzhi		LEN, %r9d, %r9d
	kmovd		%r9d, %k1

	// Swap the first LEN bytes of the en/decryption of the last full block
	// with the partial block.  Note that to support in-place en/decryption,
	// the load from the src partial block must happen before the store to
	// the dst partial block.
	vmovdqa		%xmm0, %xmm1
	vmovdqu8	16(SRC), %xmm0{%k1}
	vmovdqu8	%xmm1, 16(DST){%k1}
.else
	lea		.Lcts_permute_table(%rip), %r9

	// Load the src partial block, left-aligned.  Note that to support
	// in-place en/decryption, this must happen before the store to the dst
	// partial block.
	vmovdqu		(SRC, LEN64, 1), %xmm1

	// Shift the first LEN bytes of the en/decryption of the last full block
	// to the end of a register, then store it to DST+LEN.  This stores the
	// dst partial block.  It also writes to the second part of the dst last
	// full block, but that part is overwritten later.
	vpshufb		(%r9, LEN64, 1), %xmm0, %xmm2
	vmovdqu		%xmm2, (DST, LEN64, 1)
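
	// For example (illustrative), if LEN is 5: the mask at
	// .Lcts_permute_table + 5 is [0x80 x 11, 0, 1, 2, 3, 4], so xmm2 holds
	// 11 zero bytes followed by the first 5 bytes of xmm0, and the store
	// above writes those 5 bytes to DST+16 through DST+20 (the dst partial
	// block) and zeroes to DST+5 through DST+15, which the final store to
	// (DST) overwrites.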

	// Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
	sub		LEN64, %r9
	vmovdqu		32(%r9), %xmm3

	// Shift the src partial block to the beginning of its register.
	vpshufb		%xmm3, %xmm1, %xmm1

	// Do a blend to generate the src partial block followed by the second
	// part of the en/decryption of the last full block.
	vpblendvb	%xmm3, %xmm0, %xmm1, %xmm0
.endif
	// En/decrypt again and store the last full block.
	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0
	vmovdqu		%xmm0, (DST)
	jmp		.Ldone\@
.endm

// void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
//			   u8 iv[AES_BLOCK_SIZE]);
SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
	vmovdqu		(%rsi), %xmm0
	vpxor		(%rdi), %xmm0, %xmm0
	movl		480(%rdi), %eax		// AES key length
	lea		-16(%rdi, %rax, 4), %rdi
	cmp		$24, %eax
	jl		.Lencrypt_iv_aes128
	je		.Lencrypt_iv_aes192
	vaesenc		-6*16(%rdi), %xmm0, %xmm0
	vaesenc		-5*16(%rdi), %xmm0, %xmm0
.Lencrypt_iv_aes192:
	vaesenc		-4*16(%rdi), %xmm0, %xmm0
	vaesenc		-3*16(%rdi), %xmm0, %xmm0
.Lencrypt_iv_aes128:
	vaesenc		-2*16(%rdi), %xmm0, %xmm0
	vaesenc		-1*16(%rdi), %xmm0, %xmm0
	vaesenc		0*16(%rdi), %xmm0, %xmm0
	vaesenc		1*16(%rdi), %xmm0, %xmm0
	vaesenc		2*16(%rdi), %xmm0, %xmm0
	vaesenc		3*16(%rdi), %xmm0, %xmm0
	vaesenc		4*16(%rdi), %xmm0, %xmm0
	vaesenc		5*16(%rdi), %xmm0, %xmm0
	vaesenc		6*16(%rdi), %xmm0, %xmm0
	vaesenclast	7*16(%rdi), %xmm0, %xmm0
	vmovdqu		%xmm0, (%rsi)
	RET
SYM_FUNC_END(aes_xts_encrypt_iv)

// Below are the actual AES-XTS encryption and decryption functions,
// instantiated from the above macro.  They all have the following prototype:
//
// void (*xts_asm_func)(const struct crypto_aes_ctx *key,
//			const u8 *src, u8 *dst, unsigned int len,
//			u8 tweak[AES_BLOCK_SIZE]);
//
// |key| is the data key.  |tweak| contains the next tweak; the encryption of
// the original IV with the tweak key was already done.  This function supports
// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
// |len| must be a multiple of 16 except on the last call.  If |len| is a
// multiple of 16, then this function updates |tweak| to contain the next tweak.

.set	VL, 16
.set	USE_AVX10, 0
SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx)
	_aes_xts_crypt	1
SYM_FUNC_END(aes_xts_encrypt_aesni_avx)
SYM_TYPED_FUNC_START(aes_xts_decrypt_aesni_avx)
	_aes_xts_crypt	0
SYM_FUNC_END(aes_xts_decrypt_aesni_avx)

#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
.set	VL, 32
.set	USE_AVX10, 0
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2)
	_aes_xts_crypt	1
SYM_FUNC_END(aes_xts_encrypt_vaes_avx2)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2)
	_aes_xts_crypt	0
SYM_FUNC_END(aes_xts_decrypt_vaes_avx2)

.set	VL, 32
.set	USE_AVX10, 1
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256)
	_aes_xts_crypt	1
SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256)
	_aes_xts_crypt	0
SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256)

.set	VL, 64
.set	USE_AVX10, 1
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512)
	_aes_xts_crypt	1
SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512)
	_aes_xts_crypt	0
SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512)
#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */