/* SPDX-License-Identifier: GPL-2.0-or-later */
//
// AES block cipher using AES-NI instructions
//
// Copyright 2026 Google LLC
//
// The code in this file supports 32-bit and 64-bit CPUs, and it doesn't require
// AVX. It does use up to SSE4.1, which all CPUs with AES-NI have.
#include <linux/linkage.h>

.section .rodata
#ifdef __x86_64__
// On x86_64, static data must be addressed RIP-relative (PIC / kernel layout).
#define RODATA(label)	label(%rip)
#else
#define RODATA(label)	label
#endif

	// A mask for pshufb that extracts the last dword, rotates it right by 8
	// bits, and copies the result to all four dwords.
.p2align 4
.Lmask:
	.byte	13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12

	// The AES round constants, used during key expansion
	// (still 16-byte aligned: .Lmask above is exactly 16 bytes)
.Lrcon:
	.long	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36

.text

// Transform four dwords [a0, a1, a2, a3] in \a into
// [a0, a0^a1, a0^a1^a2, a0^a1^a2^a3]. \tmp is a temporary xmm register.
//
// Note: this could be done in four instructions, shufps + pxor + shufps + pxor,
// if the temporary register were zero-initialized ahead of time. We instead do
// it in an easier-to-understand way that doesn't require zero-initialization
// and avoids the unusual shufps instruction. movdqa is usually "free" anyway.
.macro _prefix_sum a, tmp
	movdqa	\a, \tmp	// [a0, a1, a2, a3]
	pslldq	$4, \a		// [0, a0, a1, a2]
	pxor	\tmp, \a	// [a0, a0^a1, a1^a2, a2^a3]
	movdqa	\a, \tmp
	pslldq	$8, \a		// [0, 0, a0, a0^a1]
	pxor	\tmp, \a	// [a0, a0^a1, a0^a1^a2, a0^a1^a2^a3]
.endm

// Generate the next round key from \a (source of the prefix sum, see below)
// and \b (previous round key), leave it in \a, and store it to (RNDKEYS).
// Expects MASK and RCON to be set up. Clobbers %xmm2 and %xmm3.
.macro _gen_round_key a, b
	// Compute four copies of rcon[i] ^ SubBytes(ror32(w, 8)), where w is
	// the last dword of the previous round key (given in \b).
	//
	// 'aesenclast src, dst' does dst = src XOR SubBytes(ShiftRows(dst)).
	// It is used here solely for the SubBytes and the XOR. The ShiftRows
	// is a no-op because all four columns are the same here.
	//
	// Don't use the 'aeskeygenassist' instruction, since:
	// - On most Intel CPUs it is microcoded, making it have a much higher
	//   latency and use more execution ports than 'aesenclast'.
	// - It cannot be used in a loop, since it requires an immediate.
	// - It doesn't do much more than 'aesenclast' in the first place.
	movdqa		\b, %xmm2
	pshufb		MASK, %xmm2	// rotate last dword, broadcast to all four
	aesenclast	RCON, %xmm2	// SubBytes + XOR of the round constant

	// XOR in the prefix sum of the four dwords of \a, which is the
	// previous round key (AES-128) or the first round key in the previous
	// pair of round keys (AES-256). The result is the next round key.
	_prefix_sum	\a, tmp=%xmm3
	pxor		%xmm2, \a

	// Store the next round key to memory. Also leave it in \a.
	movdqu		\a, (RNDKEYS)
.endm

// Expand an AES-128 or AES-256 key. Writes the standard round keys to
// RNDKEYS, and (if INV_RNDKEYS is non-NULL) the Equivalent Inverse Cipher
// round keys to INV_RNDKEYS. \is_aes128 selects the key size at assembly
// time.
.macro _aes_expandkey_aesni	is_aes128
#ifdef __x86_64__
	// Arguments
	.set	RNDKEYS,	%rdi
	.set	INV_RNDKEYS,	%rsi
	.set	IN_KEY,		%rdx

	// Other local variables
	.set	RCON_PTR,	%rcx
	.set	COUNTER,	%eax
#else
	// Arguments, assuming -mregparm=3
	.set	RNDKEYS,	%eax
	.set	INV_RNDKEYS,	%edx
	.set	IN_KEY,		%ecx

	// Other local variables (callee-saved on i386; saved/restored below)
	.set	RCON_PTR,	%ebx
	.set	COUNTER,	%esi
#endif
	.set	RCON,	%xmm6	// broadcast round constant for _gen_round_key
	.set	MASK,	%xmm7	// pshufb mask (.Lmask) for _gen_round_key

#ifdef __i386__
	push	%ebx
	push	%esi
#endif

.if \is_aes128
	// AES-128: the first round key is simply a copy of the raw key.
	movdqu	(IN_KEY), %xmm0
	movdqu	%xmm0, (RNDKEYS)
.else
	// AES-256: the first two round keys are simply a copy of the raw key.
	movdqu	(IN_KEY), %xmm0
	movdqu	%xmm0, (RNDKEYS)
	movdqu	16(IN_KEY), %xmm1
	movdqu	%xmm1, 16(RNDKEYS)
	add	$32, RNDKEYS
.endif

	// Generate the remaining round keys.
	movdqa	RODATA(.Lmask), MASK
.if \is_aes128
	// AES-128: 10 more round keys, each consuming one round constant.
	lea	RODATA(.Lrcon), RCON_PTR
	mov	$10, COUNTER
.Lgen_next_aes128_round_key:
	add	$16, RNDKEYS
	movd	(RCON_PTR), RCON
	pshufd	$0x00, RCON, RCON	// broadcast rcon[i] to all four dwords
	add	$4, RCON_PTR
	_gen_round_key	%xmm0, %xmm0
	dec	COUNTER
	jnz	.Lgen_next_aes128_round_key
.else
	// AES-256: only the first 7 round constants are needed, so instead of
	// loading each one from memory, just start by loading [1, 1, 1, 1] and
	// then generate the rest by doubling.
	pshufd	$0x00, RODATA(.Lrcon), RCON
	pxor	%xmm5, %xmm5	// All-zeroes
	mov	$7, COUNTER
.Lgen_next_aes256_round_key_pair:
	// Generate the next AES-256 round key: either the first of a pair of
	// two, or the last one.
	_gen_round_key	%xmm0, %xmm1

	dec	COUNTER
	jz	.Lgen_aes256_round_keys_done

	// Generate the second AES-256 round key of the pair. Compared to the
	// first, there's no rotation and no XOR of a round constant.
	pshufd	$0xff, %xmm0, %xmm2	// Get four copies of last dword
	aesenclast %xmm5, %xmm2		// Just does SubBytes
	_prefix_sum %xmm1, tmp=%xmm3
	pxor	%xmm2, %xmm1
	movdqu	%xmm1, 16(RNDKEYS)
	add	$32, RNDKEYS
	paddd	RCON, RCON		// RCON <<= 1
	jmp	.Lgen_next_aes256_round_key_pair
.Lgen_aes256_round_keys_done:
.endif

	// Here RNDKEYS points to the last standard round key.
	//
	// If INV_RNDKEYS is non-NULL, write the round keys for the Equivalent
	// Inverse Cipher to it. To do that, reverse the standard round keys,
	// and apply aesimc (InvMixColumn) to each except the first and last.
	test	INV_RNDKEYS, INV_RNDKEYS
	jz	.Ldone\@
	movdqu	(RNDKEYS), %xmm0		// Last standard round key
	movdqu	%xmm0, (INV_RNDKEYS)		// => First inverse round key
.if \is_aes128
	mov	$9, COUNTER	// 9 middle round keys get aesimc (AES-128)
.else
	mov	$13, COUNTER	// 13 middle round keys get aesimc (AES-256)
.endif
.Lgen_next_inv_round_key\@:
	sub	$16, RNDKEYS
	add	$16, INV_RNDKEYS
	movdqu	(RNDKEYS), %xmm0
	aesimc	%xmm0, %xmm0
	movdqu	%xmm0, (INV_RNDKEYS)
	dec	COUNTER
	jnz	.Lgen_next_inv_round_key\@
	movdqu	-16(RNDKEYS), %xmm0		// First standard round key
	movdqu	%xmm0, 16(INV_RNDKEYS)		// => Last inverse round key

.Ldone\@:
#ifdef __i386__
	pop	%esi
	pop	%ebx
#endif
	RET
.endm

// void aes128_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
//			       const u8 in_key[AES_KEYSIZE_128]);
SYM_FUNC_START(aes128_expandkey_aesni)
	_aes_expandkey_aesni	1
SYM_FUNC_END(aes128_expandkey_aesni)

// void aes256_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
//			       const u8 in_key[AES_KEYSIZE_256]);
SYM_FUNC_START(aes256_expandkey_aesni)
	_aes_expandkey_aesni	0
SYM_FUNC_END(aes256_expandkey_aesni)

// En/decrypt one 16-byte block: the zero-th round (plain XOR of the first
// round key), then NROUNDS-1 full rounds, then the final round. \enc selects
// aesenc/aesenclast vs aesdec/aesdeclast at assembly time.
.macro _aes_crypt_aesni	enc
#ifdef __x86_64__
	.set	RNDKEYS,	%rdi
	.set	NROUNDS,	%esi
	.set	OUT,		%rdx
	.set	IN,		%rcx
#else
	// Assuming -mregparm=3
	.set	RNDKEYS,	%eax
	.set	NROUNDS,	%edx
	.set	OUT,		%ecx
	.set	IN,		%ebx	// Passed on stack
#endif

#ifdef __i386__
	push	%ebx
	mov	8(%esp), %ebx	// 4th arg: above return addr and saved %ebx
#endif

	// Zero-th round: XOR the block with the first round key.
	movdqu	(IN), %xmm0
	movdqu	(RNDKEYS), %xmm1
	pxor	%xmm1, %xmm0

	// Normal rounds: NROUNDS-1 iterations of aesenc/aesdec.
	add	$16, RNDKEYS
	dec	NROUNDS
.Lnext_round\@:
	movdqu	(RNDKEYS), %xmm1
.if \enc
	aesenc	%xmm1, %xmm0
.else
	aesdec	%xmm1, %xmm0
.endif
	add	$16, RNDKEYS
	dec	NROUNDS
	jne	.Lnext_round\@

	// Last round: aesenclast/aesdeclast (no MixColumns/InvMixColumns).
	movdqu	(RNDKEYS), %xmm1
.if \enc
	aesenclast	%xmm1, %xmm0
.else
	aesdeclast	%xmm1, %xmm0
.endif
	movdqu	%xmm0, (OUT)

#ifdef __i386__
	pop	%ebx
#endif
	RET
.endm

// void aes_encrypt_aesni(const u32 rndkeys[], int nrounds,
//			  u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
SYM_FUNC_START(aes_encrypt_aesni)
	_aes_crypt_aesni	1
SYM_FUNC_END(aes_encrypt_aesni)

// void aes_decrypt_aesni(const u32 inv_rndkeys[], int nrounds,
//			  u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
SYM_FUNC_START(aes_decrypt_aesni)
	_aes_crypt_aesni	0
SYM_FUNC_END(aes_decrypt_aesni)