// SPDX-License-Identifier: Brian-Gladman-3-Clause
/*
 * ---------------------------------------------------------------------------
 * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
 *
 * LICENSE TERMS
 *
 * The free distribution and use of this software is allowed (with or without
 * changes) provided that:
 *
 *  1. source code distributions include the above copyright notice, this
 *     list of conditions and the following disclaimer;
 *
 *  2. binary distributions include the above copyright notice, this list
 *     of conditions and the following disclaimer in their documentation;
 *
 *  3. the name of the copyright holder is not used to endorse products
 *     built using this software without specific written permission.
 *
 * DISCLAIMER
 *
 * This software is provided 'as is' with no explicit or implied warranties
 * in respect of its properties, including, but not limited to, correctness
 * and/or fitness for purpose.
 * ---------------------------------------------------------------------------
 * Issue 20/12/2007
 *
 * I am grateful to Dag Arne Osvik for many discussions of the techniques that
 * can be used to optimise AES assembler code on AMD64/EM64T architectures.
 * Some of the techniques used in this implementation are the result of
 * suggestions made by him for which I am most grateful.
 *
 * An AES implementation for AMD64 processors using the YASM assembler.  This
 * implementation provides only encryption and decryption and hence requires
 * key scheduling support in C. It uses 8k bytes of tables but its encryption
 * and decryption performance is very close to that obtained using large tables.
 * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions,
 * which are as follows:
 *               ms windows  gnu/linux/opensolaris os
 *
 *   in_blk          rcx     rdi
 *   out_blk         rdx     rsi
 *   context (cx)     r8     rdx
 *
 *   preserved       rsi      -    + rbx, rbp, rsp, r12, r13, r14 & r15
 *   registers       rdi      -      on both
 *
 *   destroyed        -      rsi   + rax, rcx, rdx, r8, r9, r10 & r11
 *   registers        -      rdi     on both
 *
 * The convention used here is that for gnu/linux/opensolaris os.
 *
 * This code provides the standard AES block size (128 bits, 16 bytes) and the
 * three standard AES key sizes (128, 192 and 256 bits). It has the same call
 * interface as my C implementation.  It uses the Microsoft C AMD64 calling
 * conventions in which the three parameters are placed in  rcx, rdx and r8
 * respectively.  The rbx, rsi, rdi, rbp and r12..r15 registers are preserved.
 *
 * OpenSolaris Note:
 * Modified to use GNU/Linux/Solaris calling conventions.
 * That is parameters are placed in rdi, rsi, rdx, and rcx, respectively.
 *
 *     AES_RETURN aes_encrypt(const unsigned char in_blk[],
 *                   unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
 *
 *     AES_RETURN aes_decrypt(const unsigned char in_blk[],
 *                   unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
 *
 *     AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
 *                                            const aes_encrypt_ctx cx[1]);
 *
 *     AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
 *                                            const aes_decrypt_ctx cx[1]);
 *
 *     AES_RETURN aes_encrypt_key(const unsigned char key[],
 *                           unsigned int len, const aes_encrypt_ctx cx[1]);
 *
 *     AES_RETURN aes_decrypt_key(const unsigned char key[],
 *                           unsigned int len, const aes_decrypt_ctx cx[1]);
 *
 * where <NNN> is 128, 192 or 256.  In the last two calls the length can be in
 * either bits or bytes.
 *
 * Comment in/out the following lines to obtain the desired subroutines. These
 * selections MUST match those in the C header file aesopt.h
 */
#define	AES_REV_DKS	  /* define if key decryption schedule is reversed */

#define	LAST_ROUND_TABLES /* define for the faster version using extra tables */

/*
 * The encryption key schedule has the following in memory layout where N is the
 * number of rounds (10, 12 or 14):
 *
 * lo: | input key (round 0)  |  (each round is four 32-bit words)
 *     | encryption round 1   |
 *     | encryption round 2   |
 *     ....
 *     | encryption round N-1 |
 * hi: | encryption round N   |
 *
 * The decryption key schedule is normally set up so that it has the same
 * layout as above by actually reversing the order of the encryption key
 * schedule in memory (this happens when AES_REV_DKS is set):
 *
 * lo: | decryption round 0   | =              | encryption round N   |
 *     | decryption round 1   | = INV_MIX_COL[ | encryption round N-1 | ]
 *     | decryption round 2   | = INV_MIX_COL[ | encryption round N-2 | ]
 *     ....                       ....
 *     | decryption round N-1 | = INV_MIX_COL[ | encryption round 1   | ]
 * hi: | decryption round N   | =              | input key (round 0)  |
 *
 * with rounds except the first and last modified using inv_mix_column()
 * But if AES_REV_DKS is NOT set the order of keys is left as it is for
 * encryption so that it has to be accessed in reverse when used for
 * decryption (although the inverse mix column modifications are done)
 *
 * lo: | decryption round 0   | =              | input key (round 0)  |
 *     | decryption round 1   | = INV_MIX_COL[ | encryption round 1   | ]
 *     | decryption round 2   | = INV_MIX_COL[ | encryption round 2   | ]
 *     ....                       ....
 *     | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
 * hi: | decryption round N   | =              | encryption round N   |
 *
 * This layout is faster when the assembler key scheduling provided here
 * is used.
 *
 * End of user defines
 */

/*
 * ---------------------------------------------------------------------------
 * OpenSolaris OS modifications
 *
 * This source originates from Brian Gladman file aes_amd64.asm
 * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip
 * with these changes:
 *
 * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and
 * !__GNUC__ ifdefs.  Also removed ENCRYPTION, DECRYPTION,
 * AES_128, AES_192, AES_256, AES_VAR ifdefs.
 *
 * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define
 *
 * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef
 *
 * 4. Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax
 * (operands reversed, literals prefixed with "$", registers prefixed with "%",
 * and "[register+offset]" addressing changed to "offset(register)",
 * parentheses in constant expressions "()" changed to square brackets "[]",
 * "." removed from local (numeric) labels, and other changes.
 * Examples:
 *	Intel/yasm/nasm Syntax		ATT/OpenSolaris Syntax
 *	mov	rax,(4*20h)		mov	$[4*0x20],%rax
 *	mov	rax,[ebx+20h]		mov	0x20(%ebx),%rax
 *	lea	rax,[ebx+ecx]		lea	(%ebx,%ecx),%rax
 *	sub	rax,[ebx+ecx*4-20h]	sub	-0x20(%ebx,%ecx,4),%rax
 *
 * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function
 * definitions for lint.
 *
 * 6. Renamed functions and reordered parameters to match OpenSolaris:
 * Original Gladman interface:
 *	int aes_encrypt(const unsigned char *in,
 *		unsigned char *out, const aes_encrypt_ctx cx[1]);
 *	int aes_decrypt(const unsigned char *in,
 *		unsigned char *out, const aes_decrypt_ctx cx[1]);
 * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t,
 * and a union type, inf., containing inf.l, a uint32_t and
 * inf.b, a 4-element array of uint8_t.  Only b[0] in the array (aka "l") is
 * used and contains the number of rounds * 16, where the number of rounds is
 * 10, 12, or 14.
 *
 * OpenSolaris OS interface:
 *	void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
 *		const uint32_t pt[4], uint32_t ct[4]);
 *	void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
 *		const uint32_t ct[4], uint32_t pt[4]);
 *	typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4];
 *		 uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t;
 * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
 * ct is crypto text, and MAX_AES_NR is 14.
 * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
 */

#if defined(lint) || defined(__lint)

#include <sys/types.h>
void
aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4],
	uint32_t ct[4]) {
	(void) rk, (void) Nr, (void) pt, (void) ct;
}
void
aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4],
	uint32_t pt[4]) {
	(void) rk, (void) Nr, (void) pt, (void) ct;
}


#else

#define	_ASM
#include <sys/asm_linkage.h>

/* Number of 32-bit words in the Gladman context key schedule array */
#define	KS_LENGTH	60

/* 32-bit and 8-bit aliases for the legacy register names */
#define	raxd		eax
#define	rdxd		edx
#define	rcxd		ecx
#define	rbxd		ebx
#define	rsid		esi
#define	rdid		edi

#define	raxb		al
#define	rdxb		dl
#define	rcxb		cl
#define	rbxb		bl
#define	rsib		sil
#define	rdib		dil

// finite field multiplies by {02}, {04} and {08}

#define	f2(x)	((x<<1)^(((x>>7)&1)*0x11b))
#define	f4(x)	((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
#define	f8(x)	((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))

// finite field multiplies required in table generation

#define	f3(x)	((f2(x)) ^ (x))
#define	f9(x)	((f8(x)) ^ (x))
#define	fb(x)	((f8(x)) ^ (f2(x)) ^ (x))
#define	fd(x)	((f8(x)) ^ (f4(x)) ^ (x))
#define	fe(x)	((f8(x)) ^ (f4(x)) ^ (f2(x)))

// macros for expanding S-box data
//
// Every S-box value becomes one 8-byte table entry:
//   u8: forward round entry; byte 1 is the plain S-box value (see tab_f)
//   v8: inverse round entry; byte 7 is the plain inverse S-box value
//       (see tab_i)
//   w8: last round entry; the S-box value in bytes 0 and 4, zero elsewhere

#define	u8(x)	(f2(x)), (x), (x), (f3(x)), (f2(x)), (x), (x), (f3(x))
#define	v8(x)	(fe(x)), (f9(x)), (fd(x)), (fb(x)), (fe(x)), (f9(x)), (fd(x)), (x)
#define	w8(x)	(x), 0, 0, 0, (x), 0, 0, 0

/* AES forward S-box; x is the per-entry expansion macro (u8 or w8) */
#define	enc_vals(x)	\
	.byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \
	.byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \
	.byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \
	.byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \
	.byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \
	.byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \
	.byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \
	.byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \
	.byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \
	.byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \
	.byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \
	.byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \
	.byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \
	.byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \
	.byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \
	.byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \
	.byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \
	.byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \
	.byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \
	.byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \
	.byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \
	.byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \
	.byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \
	.byte x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \
	.byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \
	.byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \
	.byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \
	.byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \
	.byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \
	.byte x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \
	.byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \
	.byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16)

/* AES inverse S-box; x is the per-entry expansion macro (v8 or w8) */
#define	dec_vals(x) \
	.byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \
	.byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \
	.byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \
	.byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \
	.byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \
	.byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \
	.byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \
	.byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \
	.byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \
	.byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \
	.byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \
	.byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \
	.byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \
	.byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \
	.byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \
	.byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \
	.byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \
	.byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \
	.byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \
	.byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \
	.byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \
	.byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \
	.byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \
	.byte x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \
	.byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \
	.byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \
	.byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \
	.byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \
	.byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \
	.byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \
	.byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \
	.byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d)

#define	tptr	%rbp	/* table pointer */
#define	kptr	%r8	/* key schedule pointer */
#define	fofs	128	/* adjust offset in key schedule to keep |disp| < 128 */
#define	fk_ref(x, y)	-16*x+fofs+4*y(kptr)

#ifdef	AES_REV_DKS
#define	rofs		128
#define	ik_ref(x, y)	-16*x+rofs+4*y(kptr)

#else
#define	rofs		-128
#define	ik_ref(x, y)	16*x+rofs+4*y(kptr)
#endif	/* AES_REV_DKS */

/*
 * Table accessors.  x is a 64-bit register holding a zero-extended byte
 * index; entries are 8 bytes wide, so the byte displacement selects which
 * part of the entry is read.
 */
#define	tab_0(x)	(tptr,x,8)
#define	tab_1(x)	3(tptr,x,8)
#define	tab_2(x)	2(tptr,x,8)
#define	tab_3(x)	1(tptr,x,8)
#define	tab_f(x)	1(tptr,x,8)
#define	tab_i(x)	7(tptr,x,8)

/*
 * tab_col: fold one 32-bit state column into the new state using the
 * 32-bit table words.
 *	lo, hi	low/high byte registers of the column (e.g. %al, %ah)
 *	reg	the 32-bit column register itself; destroyed (shifted right 16)
 *	q0..q3	destination registers for bytes 0..3 of the column
 * Clobbers %rsi, %rdi and the flags.
 */
#define	tab_col(lo, hi, reg, q0, q1, q2, q3) \
	movzx	lo, %esi; \
	movzx	hi, %edi; \
	shr	$16, reg; \
	xor	tab_0(%rsi), q0; \
	xor	tab_1(%rdi), q1; \
	movzx	lo, %esi; \
	movzx	hi, %edi; \
	xor	tab_2(%rsi), q2; \
	xor	tab_3(%rdi), q3

/*
 * sbox_col: same job as tab_col, but for last rounds built without
 * LAST_ROUND_TABLES.  Each byte is substituted through the single-byte
 * accessor `tab' (tab_f or tab_i) and rotated into its byte lane.  The byte
 * loads use movzbl so that the memory operand size is explicit.
 * Clobbers %rsi, %rdi and the flags; reg is destroyed.
 */
#define	sbox_col(tab, lo, hi, reg, q0, q1, q2, q3) \
	movzx	lo, %esi; \
	movzx	hi, %edi; \
	shr	$16, reg; \
	movzbl	tab(%rsi), %esi; \
	movzbl	tab(%rdi), %edi; \
	xor	%esi, q0; \
	rol	$8, %edi; \
	xor	%edi, q1; \
	movzx	lo, %esi; \
	movzx	hi, %edi; \
	movzbl	tab(%rsi), %esi; \
	movzbl	tab(%rdi), %edi; \
	rol	$16, %esi; \
	rol	$24, %edi; \
	xor	%esi, q2; \
	xor	%edi, q3

#define	fsb_col(lo, hi, reg, q0, q1, q2, q3) \
	sbox_col(tab_f, lo, hi, reg, q0, q1, q2, q3)
#define	isb_col(lo, hi, reg, q0, q1, q2, q3) \
	sbox_col(tab_i, lo, hi, reg, q0, q1, q2, q3)

/*
 * Apply the column macro `col' to the four state columns held in
 * %eax, %ebx, %ecx and %edx, using the forward (fwd_cols) or inverse
 * (inv_cols) byte-to-output-column assignment.
 */
#define	fwd_cols(col, p1, p2, p3, p4) \
	col(%al, %ah, %eax, p1, p4, p3, p2); \
	col(%bl, %bh, %ebx, p2, p1, p4, p3); \
	col(%cl, %ch, %ecx, p3, p2, p1, p4); \
	col(%dl, %dh, %edx, p4, p3, p2, p1)

#define	inv_cols(col, p1, p2, p3, p4) \
	col(%al, %ah, %eax, p1, p2, p3, p4); \
	col(%bl, %bh, %ebx, p2, p3, p4, p1); \
	col(%cl, %ch, %ecx, p3, p4, p1, p2); \
	col(%dl, %dh, %edx, p4, p1, p2, p3)

/*
 * Round macros.  On entry the state is in %eax, %ebx, %ecx, %edx; p1..p4
 * are four 32-bit scratch registers that are first loaded with the round
 * key and then accumulate the new state.  Normal rounds copy p1..p4 back to
 * %eax..%edx; last rounds leave the result in p1..p4.
 */
#define	ff_rnd(p1, p2, p3, p4, round)	/* normal forward round */ \
	mov	fk_ref(round,0), p1; \
	mov	fk_ref(round,1), p2; \
	mov	fk_ref(round,2), p3; \
	mov	fk_ref(round,3), p4; \
	fwd_cols(tab_col, p1, p2, p3, p4); \
	mov	p1, %eax; \
	mov	p2, %ebx; \
	mov	p3, %ecx; \
	mov	p4, %edx

#ifdef	LAST_ROUND_TABLES

/* 2048 = 256 entries * 8 bytes: step tptr over the first table */
#define	fl_rnd(p1, p2, p3, p4, round)	/* last forward round */ \
	add	$2048, tptr; \
	mov	fk_ref(round,0), p1; \
	mov	fk_ref(round,1), p2; \
	mov	fk_ref(round,2), p3; \
	mov	fk_ref(round,3), p4; \
	fwd_cols(tab_col, p1, p2, p3, p4)

#else

#define	fl_rnd(p1, p2, p3, p4, round)	/* last forward round */ \
	mov	fk_ref(round,0), p1; \
	mov	fk_ref(round,1), p2; \
	mov	fk_ref(round,2), p3; \
	mov	fk_ref(round,3), p4; \
	fwd_cols(fsb_col, p1, p2, p3, p4)

#endif	/* LAST_ROUND_TABLES */

#define	ii_rnd(p1, p2, p3, p4, round)	/* normal inverse round */ \
	mov	ik_ref(round,0), p1; \
	mov	ik_ref(round,1), p2; \
	mov	ik_ref(round,2), p3; \
	mov	ik_ref(round,3), p4; \
	inv_cols(tab_col, p1, p2, p3, p4); \
	mov	p1, %eax; \
	mov	p2, %ebx; \
	mov	p3, %ecx; \
	mov	p4, %edx

#ifdef	LAST_ROUND_TABLES

/* 2048 = 256 entries * 8 bytes: step tptr over the first table */
#define	il_rnd(p1, p2, p3, p4, round)	/* last inverse round */ \
	add	$2048, tptr; \
	mov	ik_ref(round,0), p1; \
	mov	ik_ref(round,1), p2; \
	mov	ik_ref(round,2), p3; \
	mov	ik_ref(round,3), p4; \
	inv_cols(tab_col, p1, p2, p3, p4)

#else

#define	il_rnd(p1, p2, p3, p4, round)	/* last inverse round */ \
	mov	ik_ref(round,0), p1; \
	mov	ik_ref(round,1), p2; \
	mov	ik_ref(round,2), p3; \
	mov	ik_ref(round,3), p4; \
	inv_cols(isb_col, p1, p2, p3, p4)

#endif	/* LAST_ROUND_TABLES */

/*
 * OpenSolaris OS:
 * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
 *	const uint32_t pt[4], uint32_t ct[4]);
 *
 * Original interface:
 * int aes_encrypt(const unsigned char *in,
 *	unsigned char *out, const aes_encrypt_ctx cx[1]);
 *
 * ABI:   gnu/linux/opensolaris (System V AMD64), AT&T syntax
 * In:    rdi = ks, esi = Nr (10, 12 or 14), rdx = pt, rcx = ct
 * Out:   ct[0..3] written; rax = 0.  For any other Nr nothing is written
 *        and rax = -1.
 * Saved: rbx, rbp, r12 (spilled to the stack and restored)
 * Clobb: rax, rcx, rdx, rsi, rdi, r8, r9, r10, r11, flags
 * Stack: 32 bytes (output pointer + the three saved registers)
 */
SECTION_STATIC
.balign	64
enc_tab:
	enc_vals(u8)
#ifdef	LAST_ROUND_TABLES
	// Last Round Tables:
	enc_vals(w8)
#endif


ENTRY_NP(aes_encrypt_amd64)
	ENDBR
#ifdef	GLADMAN_INTERFACE
	// Original interface
	sub	$(4*8), %rsp	// gnu/linux/opensolaris binary interface
	mov	%rsi, (%rsp)	// output pointer (P2)
	mov	%rdx, %r8	// context (P3)

	mov	%rbx, 1*8(%rsp)	// P1: input pointer in rdi
	mov	%rbp, 2*8(%rsp)	// P2: output pointer in (rsp)
	mov	%r12, 3*8(%rsp)	// P3: context in r8
	movzbl	4*KS_LENGTH(kptr), %esi	// Get number of rounds * 16 (inf.b[0])

#else
	// OpenSolaris OS interface
	sub	$(4*8), %rsp	// Make room on stack to save registers
	mov	%rcx, (%rsp)	// Save output pointer (P4) on stack
	mov	%rdi, %r8	// context (P1)
	mov	%rdx, %rdi	// P3: save input pointer
	shl	$4, %esi	// P2: esi = number of rounds * 16

	mov	%rbx, 1*8(%rsp)	// Save registers
	mov	%rbp, 2*8(%rsp)
	mov	%r12, 3*8(%rsp)
	// P1: context in r8
	// P2: number of rounds * 16 in esi
	// P3: input pointer in rdi
	// P4: output pointer in (rsp)
#endif	/* GLADMAN_INTERFACE */

	lea	enc_tab(%rip), tptr
	sub	$fofs, kptr

	// Load input block into registers
	mov	(%rdi), %eax
	mov	1*4(%rdi), %ebx
	mov	2*4(%rdi), %ecx
	mov	3*4(%rdi), %edx

	// Add the round 0 key
	xor	fofs(kptr), %eax
	xor	fofs+4(kptr), %ebx
	xor	fofs+8(kptr), %ecx
	xor	fofs+12(kptr), %edx

	// Point kptr at the last round key (biased by -fofs)
	lea	(kptr,%rsi), kptr
	// Jump based on number of rounds * 16:
	cmp	$(10*16), %esi
	je	3f
	cmp	$(12*16), %esi
	je	2f
	cmp	$(14*16), %esi
	je	1f
	mov	$-1, %rax	// error
	jmp	4f

	// Perform normal forward rounds
1:	ff_rnd(%r9d, %r10d, %r11d, %r12d, 13)
	ff_rnd(%r9d, %r10d, %r11d, %r12d, 12)
2:	ff_rnd(%r9d, %r10d, %r11d, %r12d, 11)
	ff_rnd(%r9d, %r10d, %r11d, %r12d, 10)
3:	ff_rnd(%r9d, %r10d, %r11d, %r12d,  9)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  8)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  7)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  6)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  5)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  4)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  3)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  2)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  1)
	fl_rnd(%r9d, %r10d, %r11d, %r12d,  0)

	// Copy results
	mov	(%rsp), %rbx
	mov	%r9d, (%rbx)
	mov	%r10d, 4(%rbx)
	mov	%r11d, 8(%rbx)
	mov	%r12d, 12(%rbx)
	xor	%eax, %eax	// success: rax = 0
4:	// Restore registers
	mov	1*8(%rsp), %rbx
	mov	2*8(%rsp), %rbp
	mov	3*8(%rsp), %r12
	add	$(4*8), %rsp
	RET

	SET_SIZE(aes_encrypt_amd64)

/*
 * OpenSolaris OS:
 * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
 *	const uint32_t ct[4], uint32_t pt[4]);
 *
 * Original interface:
 * int aes_decrypt(const unsigned char *in,
 *	unsigned char *out, const aes_decrypt_ctx cx[1]);
 *
 * ABI:   gnu/linux/opensolaris (System V AMD64), AT&T syntax
 * In:    rdi = ks (decryption key schedule), esi = Nr (10, 12 or 14),
 *        rdx = ct, rcx = pt
 * Out:   pt[0..3] written; rax = 0.  For any other Nr nothing is written
 *        and rax = -1.
 * Saved: rbx, rbp, r12 (spilled to the stack and restored)
 * Clobb: rax, rcx, rdx, rsi, rdi, r8, r9, r10, r11, flags
 * Stack: 32 bytes (output pointer + the three saved registers)
 */
SECTION_STATIC
.balign	64
dec_tab:
	dec_vals(v8)
#ifdef	LAST_ROUND_TABLES
	// Last Round Tables:
	dec_vals(w8)
#endif


ENTRY_NP(aes_decrypt_amd64)
	ENDBR
#ifdef	GLADMAN_INTERFACE
	// Original interface
	sub	$(4*8), %rsp	// gnu/linux/opensolaris binary interface
	mov	%rsi, (%rsp)	// output pointer (P2)
	mov	%rdx, %r8	// context (P3)

	mov	%rbx, 1*8(%rsp)	// P1: input pointer in rdi
	mov	%rbp, 2*8(%rsp)	// P2: output pointer in (rsp)
	mov	%r12, 3*8(%rsp)	// P3: context in r8
	movzbl	4*KS_LENGTH(kptr), %esi	// Get number of rounds * 16 (inf.b[0])

#else
	// OpenSolaris OS interface
	sub	$(4*8), %rsp	// Make room on stack to save registers
	mov	%rcx, (%rsp)	// Save output pointer (P4) on stack
	mov	%rdi, %r8	// context (P1)
	mov	%rdx, %rdi	// P3: save input pointer
	shl	$4, %esi	// P2: esi = number of rounds * 16

	mov	%rbx, 1*8(%rsp)	// Save registers
	mov	%rbp, 2*8(%rsp)
	mov	%r12, 3*8(%rsp)
	// P1: context in r8
	// P2: number of rounds * 16 in esi
	// P3: input pointer in rdi
	// P4: output pointer in (rsp)
#endif	/* GLADMAN_INTERFACE */

	lea	dec_tab(%rip), tptr
	sub	$rofs, kptr

	// Load input block into registers
	mov	(%rdi), %eax
	mov	1*4(%rdi), %ebx
	mov	2*4(%rdi), %ecx
	mov	3*4(%rdi), %edx

	// rdi = address (biased by -rofs) of the first key to apply
#ifdef AES_REV_DKS
	mov	kptr, %rdi
	lea	(kptr,%rsi), kptr
#else
	lea	(kptr,%rsi), %rdi
#endif

	xor	rofs(%rdi), %eax
	xor	rofs+4(%rdi), %ebx
	xor	rofs+8(%rdi), %ecx
	xor	rofs+12(%rdi), %edx

	// Jump based on number of rounds * 16:
	cmp	$(10*16), %esi
	je	3f
	cmp	$(12*16), %esi
	je	2f
	cmp	$(14*16), %esi
	je	1f
	mov	$-1, %rax	// error
	jmp	4f

	// Perform normal inverse rounds
1:	ii_rnd(%r9d, %r10d, %r11d, %r12d, 13)
	ii_rnd(%r9d, %r10d, %r11d, %r12d, 12)
2:	ii_rnd(%r9d, %r10d, %r11d, %r12d, 11)
	ii_rnd(%r9d, %r10d, %r11d, %r12d, 10)
3:	ii_rnd(%r9d, %r10d, %r11d, %r12d,  9)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  8)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  7)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  6)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  5)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  4)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  3)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  2)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  1)
	il_rnd(%r9d, %r10d, %r11d, %r12d,  0)

	// Copy results
	mov	(%rsp), %rbx
	mov	%r9d, (%rbx)
	mov	%r10d, 4(%rbx)
	mov	%r11d, 8(%rbx)
	mov	%r12d, 12(%rbx)
	xor	%eax, %eax	// success: rax = 0
4:	// Restore registers
	mov	1*8(%rsp), %rbx
	mov	2*8(%rsp), %rbp
	mov	3*8(%rsp), %r12
	add	$(4*8), %rsp
	RET

	SET_SIZE(aes_decrypt_amd64)
#endif	/* lint || __lint */

#ifdef __ELF__
.section .note.GNU-stack,"",%progbits
#endif