/*
 * ---------------------------------------------------------------------------
 * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
 *
 * LICENSE TERMS
 *
 * The free distribution and use of this software is allowed (with or without
 * changes) provided that:
 *
 *  1. source code distributions include the above copyright notice, this
 *     list of conditions and the following disclaimer;
 *
 *  2. binary distributions include the above copyright notice, this list
 *     of conditions and the following disclaimer in their documentation;
 *
 *  3. the name of the copyright holder is not used to endorse products
 *     built using this software without specific written permission.
 *
 * DISCLAIMER
 *
 * This software is provided 'as is' with no explicit or implied warranties
 * in respect of its properties, including, but not limited to, correctness
 * and/or fitness for purpose.
 * ---------------------------------------------------------------------------
 * Issue 20/12/2007
 *
 * I am grateful to Dag Arne Osvik for many discussions of the techniques that
 * can be used to optimise AES assembler code on AMD64/EM64T architectures.
 * Some of the techniques used in this implementation are the result of
 * suggestions made by him for which I am most grateful.
 *
 * An AES implementation for AMD64 processors using the YASM assembler.  This
 * implementation provides only encryption, decryption and hence requires key
 * scheduling support in C. It uses 8k bytes of tables but its encryption and
 * decryption performance is very close to that obtained using large tables.
 * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions,
 * which are as follows:
 *               ms windows  gnu/linux/opensolaris os
 *
 * in_blk          rcx     rdi
 * out_blk         rdx     rsi
 * context (cx)     r8     rdx
 *
 * preserved       rsi      -    + rbx, rbp, rsp, r12, r13, r14 & r15
 * registers       rdi      -      on both
 *
 * destroyed        -      rsi   + rax, rcx, rdx, r8, r9, r10 & r11
 * registers        -      rdi     on both
 *
 * The convention used here is that for gnu/linux/opensolaris os.
 *
 * This code provides the standard AES block size (128 bits, 16 bytes) and the
 * three standard AES key sizes (128, 192 and 256 bits). It has the same call
 * interface as my C implementation.  The original (Gladman) version of this
 * code uses the Microsoft C AMD64 calling conventions in which the three
 * parameters are placed in rcx, rdx and r8 respectively.  The rbx, rsi, rdi,
 * rbp and r12..r15 registers are preserved.
 *
 * OpenSolaris Note:
 * Modified to use GNU/Linux/Solaris calling conventions.
 * That is parameters are placed in rdi, rsi, rdx, and rcx, respectively.
 *
 *     AES_RETURN aes_encrypt(const unsigned char in_blk[],
 *                   unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
 *
 *     AES_RETURN aes_decrypt(const unsigned char in_blk[],
 *                   unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
 *
 *     AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
 *                                            const aes_encrypt_ctx cx[1]);
 *
 *     AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
 *                                            const aes_decrypt_ctx cx[1]);
 *
 *     AES_RETURN aes_encrypt_key(const unsigned char key[],
 *                           unsigned int len, const aes_decrypt_ctx cx[1]);
 *
 *     AES_RETURN aes_decrypt_key(const unsigned char key[],
 *                           unsigned int len, const aes_decrypt_ctx cx[1]);
 *
 * where <NNN> is 128, 192 or 256.  In the last two calls the length can be in
 * either bits or bytes.
 *
 * Comment in/out the following lines to obtain the desired subroutines. These
 * selections MUST match those in the C header file aesopt.h
 */
#define	AES_REV_DKS	  /* define if key decryption schedule is reversed */

#define	LAST_ROUND_TABLES /* define for the faster version using extra tables */

/*
 * The encryption key schedule has the following in memory layout where N is the
 * number of rounds (10, 12 or 14):
 *
 * lo: | input key (round 0)  |  / each round is four 32-bit words
 *     | encryption round 1   |
 *     | encryption round 2   |
 *     ....
 *     | encryption round N-1 |
 * hi: | encryption round N   |
 *
 * The decryption key schedule is normally set up so that it has the same
 * layout as above by actually reversing the order of the encryption key
 * schedule in memory (this happens when AES_REV_DKS is set):
 *
 * lo: | decryption round 0   | =              | encryption round N   |
 *     | decryption round 1   | = INV_MIX_COL[ | encryption round N-1 | ]
 *     | decryption round 2   | = INV_MIX_COL[ | encryption round N-2 | ]
 *     ....                       ....
 *     | decryption round N-1 | = INV_MIX_COL[ | encryption round 1   | ]
 * hi: | decryption round N   | =              | input key (round 0)  |
 *
 * with rounds except the first and last modified using inv_mix_column()
 * But if AES_REV_DKS is NOT set the order of keys is left as it is for
 * encryption so that it has to be accessed in reverse when used for
 * decryption (although the inverse mix column modifications are done)
 *
 * lo: | decryption round 0   | =              | input key (round 0)  |
 *     | decryption round 1   | = INV_MIX_COL[ | encryption round 1   | ]
 *     | decryption round 2   | = INV_MIX_COL[ | encryption round 2   | ]
 *     ....                       ....
 *     | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
 * hi: | decryption round N   | =              | encryption round N   |
 *
 * This layout is faster when the assembler key scheduling provided here
 * is used.
 *
 * End of user defines
 */

/*
 * ---------------------------------------------------------------------------
 * OpenSolaris OS modifications
 *
 * This source originates from Brian Gladman file aes_amd64.asm
 * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip
 * with these changes:
 *
 * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and
 * !__GNUC__ ifdefs.  Also removed ENCRYPTION, DECRYPTION,
 * AES_128, AES_192, AES_256, AES_VAR ifdefs.
 *
 * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define
 *
 * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef
 *
 * 4. Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax
 * (operands reversed, literals prefixed with "$", registers prefixed with "%",
 * and "[register+offset]", addressing changed to "offset(register)",
 * parenthesis in constant expressions "()" changed to square brackets "[]",
 * "." removed from local (numeric) labels, and other changes.
 * Examples:
 *	Intel/yasm/nasm Syntax		ATT/OpenSolaris Syntax
 *	mov	rax,(4*20h)		mov	$[4*0x20],%rax
 *	mov	rax,[ebx+20h]		mov	0x20(%ebx),%rax
 *	lea	rax,[ebx+ecx]		lea	(%ebx,%ecx),%rax
 *	sub	rax,[ebx+ecx*4-20h]	sub	-0x20(%ebx,%ecx,4),%rax
 *
 * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function
 * definitions for lint.
 *
 * 6. Renamed functions and reordered parameters to match OpenSolaris:
 * Original Gladman interface:
 *	int aes_encrypt(const unsigned char *in,
 *		unsigned char *out, const aes_encrypt_ctx cx[1]);
 *	int aes_decrypt(const unsigned char *in,
 *		unsigned char *out, const aes_encrypt_ctx cx[1]);
 * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t,
 * and a union type, inf., containing inf.l, a uint32_t and
 * inf.b, a 4-element array of uint32_t.  Only b[0] in the array (aka "l") is
 * used and contains the key schedule length * 16 where key schedule length is
 * 10, 12, or 14 rounds.
 *
 * OpenSolaris OS interface:
 *	void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
 *		const uint32_t pt[4], uint32_t ct[4]);
 *	void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
 *		const uint32_t ct[4], uint32_t pt[4]);
 *	typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4];
 *		 uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t;
 * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
 * ct is crypto text, and MAX_AES_NR is 14.
 * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
 */

#if defined(lint) || defined(__lint)

#include <sys/types.h>
/* ARGSUSED */
void
aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4],
	uint32_t ct[4]) {
}
/* ARGSUSED */
void
aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4],
	uint32_t pt[4]) {
}


#else

#include <sys/asm_linkage.h>

/* Number of 32-bit words in the Gladman context key schedule array */
#define	KS_LENGTH	60

#define	raxd		eax
#define	rdxd		edx
#define	rcxd		ecx
#define	rbxd		ebx
#define	rsid		esi
#define	rdid		edi

#define	raxb		al
#define	rdxb		dl
#define	rcxb		cl
#define	rbxb		bl
#define	rsib		sil
#define	rdib		dil

/ finite field multiplies by {02}, {04} and {08}

#define	f2(x)	[[x<<1]^[[[x>>7]&1]*0x11b]]
#define	f4(x)	[[x<<2]^[[[x>>6]&1]*0x11b]^[[[x>>6]&2]*0x11b]]
#define	f8(x)	[[x<<3]^[[[x>>5]&1]*0x11b]^[[[x>>5]&2]*0x11b]^[[[x>>5]&4]*0x11b]]

/ finite field multiplies required in table generation

#define	f3(x)	[[f2(x)] ^ [x]]
#define	f9(x)	[[f8(x)] ^ [x]]
#define	fb(x)	[[f8(x)] ^ [f2(x)] ^ [x]]
#define	fd(x)	[[f8(x)] ^ [f4(x)] ^ [x]]
#define	fe(x)	[[f8(x)] ^ [f4(x)] ^ [f2(x)]]

/*
 * Macros for expanding S-box data.  Each S-box value expands to one 8-byte
 * table entry, so a 256-entry table occupies 2048 bytes:
 *	u8 - forward round entry;  byte 1 of the entry is the plain value x
 *	v8 - inverse round entry;  byte 7 of the entry is the plain value x
 *	w8 - last round entry;     x in bytes 0 and 4, zero elsewhere
 * The tab_0..tab_3 references below read 32 bits at byte offsets 0, 3, 2
 * and 1 of an entry; tab_f and tab_i read the single plain byte.
 */

#define	u8(x)	[f2(x)], [x], [x], [f3(x)], [f2(x)], [x], [x], [f3(x)]
#define	v8(x)	[fe(x)], [f9(x)], [fd(x)], [fb(x)], [fe(x)], [f9(x)], [fd(x)], [x]
#define	w8(x)	[x], 0, 0, 0, [x], 0, 0, 0

/* Forward S-box values; x is the per-value expansion macro (u8 or w8) */
#define	enc_vals(x)	\
	.byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \
	.byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \
	.byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \
	.byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \
	.byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \
	.byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \
	.byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \
	.byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \
	.byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \
	.byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \
	.byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \
	.byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \
	.byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \
	.byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \
	.byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \
	.byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \
	.byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \
	.byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \
	.byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \
	.byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \
	.byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \
	.byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \
	.byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \
	.byte x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \
	.byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \
	.byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \
	.byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \
	.byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \
	.byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \
	.byte x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \
	.byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \
	.byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16)

/* Inverse S-box values; x is the per-value expansion macro (v8 or w8) */
#define	dec_vals(x) \
	.byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \
	.byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \
	.byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \
	.byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \
	.byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \
	.byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \
	.byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \
	.byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \
	.byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \
	.byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \
	.byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \
	.byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \
	.byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \
	.byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \
	.byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \
	.byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \
	.byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \
	.byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \
	.byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \
	.byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \
	.byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \
	.byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \
	.byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \
	.byte x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \
	.byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \
	.byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \
	.byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \
	.byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \
	.byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \
	.byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \
	.byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \
	.byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d)

#define	tptr	%rbp	/* table pointer */
#define	kptr	%r8	/* key schedule pointer */
#define	fofs	128	/* adjust offset in key schedule to keep |disp| < 128 */
#define	fk_ref(x, y)	-16*x+fofs+4*y(kptr)

#ifdef	AES_REV_DKS
#define	rofs		128
#define	ik_ref(x, y)	-16*x+rofs+4*y(kptr)

#else
#define	rofs		-128
#define	ik_ref(x, y)	16*x+rofs+4*y(kptr)
#endif	/* AES_REV_DKS */

/* Table references: x is a 64-bit register holding a table index (0..255) */
#define	tab_0(x)	(tptr,x,8)
#define	tab_1(x)	3(tptr,x,8)
#define	tab_2(x)	2(tptr,x,8)
#define	tab_3(x)	1(tptr,x,8)
#define	tab_f(x)	1(tptr,x,8)
#define	tab_i(x)	7(tptr,x,8)

/*
 * Round macros (ff_rnd, fl_rnd, ii_rnd, il_rnd).
 *
 * On entry the four state words are in %eax, %ebx, %ecx and %edx, tptr
 * addresses the lookup table and kptr addresses the key schedule.  Each macro
 * loads the four round key words for "round" into p1..p4 and XORs into them
 * the table values selected by the individual state bytes.  The "normal"
 * rounds (ff_rnd, ii_rnd) then copy p1..p4 back to %eax..%edx; the "last"
 * rounds (fl_rnd, il_rnd) leave the result in p1..p4.
 *
 * All of them overwrite %esi and %edi (table indexes) and shift the state
 * registers.  With LAST_ROUND_TABLES the last-round macros also advance tptr
 * by 2048 bytes to the second table.  Without it they load single bytes
 * through tab_f/tab_i (explicit byte-sized movzbl, since a memory operand
 * alone does not determine the source size) and rotate them into position.
 */

#define	ff_rnd(p1, p2, p3, p4, round)	/* normal forward round */ \
	mov	fk_ref(round,0), p1; \
	mov	fk_ref(round,1), p2; \
	mov	fk_ref(round,2), p3; \
	mov	fk_ref(round,3), p4; \
 \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	shr	$16, %eax; \
	xor	tab_0(%rsi), p1; \
	xor	tab_1(%rdi), p4; \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	xor	tab_2(%rsi), p3; \
	xor	tab_3(%rdi), p2; \
 \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	shr	$16, %ebx; \
	xor	tab_0(%rsi), p2; \
	xor	tab_1(%rdi), p1; \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	xor	tab_2(%rsi), p4; \
	xor	tab_3(%rdi), p3; \
 \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	shr	$16, %ecx; \
	xor	tab_0(%rsi), p3; \
	xor	tab_1(%rdi), p2; \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	xor	tab_2(%rsi), p1; \
	xor	tab_3(%rdi), p4; \
 \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	shr	$16, %edx; \
	xor	tab_0(%rsi), p4; \
	xor	tab_1(%rdi), p3; \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	xor	tab_2(%rsi), p2; \
	xor	tab_3(%rdi), p1; \
 \
	mov	p1, %eax; \
	mov	p2, %ebx; \
	mov	p3, %ecx; \
	mov	p4, %edx

#ifdef	LAST_ROUND_TABLES

#define	fl_rnd(p1, p2, p3, p4, round)	/* last forward round */ \
	add	$2048, tptr; \
	mov	fk_ref(round,0), p1; \
	mov	fk_ref(round,1), p2; \
	mov	fk_ref(round,2), p3; \
	mov	fk_ref(round,3), p4; \
 \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	shr	$16, %eax; \
	xor	tab_0(%rsi), p1; \
	xor	tab_1(%rdi), p4; \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	xor	tab_2(%rsi), p3; \
	xor	tab_3(%rdi), p2; \
 \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	shr	$16, %ebx; \
	xor	tab_0(%rsi), p2; \
	xor	tab_1(%rdi), p1; \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	xor	tab_2(%rsi), p4; \
	xor	tab_3(%rdi), p3; \
 \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	shr	$16, %ecx; \
	xor	tab_0(%rsi), p3; \
	xor	tab_1(%rdi), p2; \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	xor	tab_2(%rsi), p1; \
	xor	tab_3(%rdi), p4; \
 \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	shr	$16, %edx; \
	xor	tab_0(%rsi), p4; \
	xor	tab_1(%rdi), p3; \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	xor	tab_2(%rsi), p2; \
	xor	tab_3(%rdi), p1

#else

#define	fl_rnd(p1, p2, p3, p4, round)	/* last forward round */ \
	mov	fk_ref(round,0), p1; \
	mov	fk_ref(round,1), p2; \
	mov	fk_ref(round,2), p3; \
	mov	fk_ref(round,3), p4; \
 \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	shr	$16, %eax; \
	movzbl	tab_f(%rsi), %esi; \
	movzbl	tab_f(%rdi), %edi; \
	xor	%esi, p1; \
	rol	$8, %edi; \
	xor	%edi, p4; \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	movzbl	tab_f(%rsi), %esi; \
	movzbl	tab_f(%rdi), %edi; \
	rol	$16, %esi; \
	rol	$24, %edi; \
	xor	%esi, p3; \
	xor	%edi, p2; \
 \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	shr	$16, %ebx; \
	movzbl	tab_f(%rsi), %esi; \
	movzbl	tab_f(%rdi), %edi; \
	xor	%esi, p2; \
	rol	$8, %edi; \
	xor	%edi, p1; \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	movzbl	tab_f(%rsi), %esi; \
	movzbl	tab_f(%rdi), %edi; \
	rol	$16, %esi; \
	rol	$24, %edi; \
	xor	%esi, p4; \
	xor	%edi, p3; \
 \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	movzbl	tab_f(%rsi), %esi; \
	movzbl	tab_f(%rdi), %edi; \
	shr	$16, %ecx; \
	xor	%esi, p3; \
	rol	$8, %edi; \
	xor	%edi, p2; \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	movzbl	tab_f(%rsi), %esi; \
	movzbl	tab_f(%rdi), %edi; \
	rol	$16, %esi; \
	rol	$24, %edi; \
	xor	%esi, p1; \
	xor	%edi, p4; \
 \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	movzbl	tab_f(%rsi), %esi; \
	movzbl	tab_f(%rdi), %edi; \
	shr	$16, %edx; \
	xor	%esi, p4; \
	rol	$8, %edi; \
	xor	%edi, p3; \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	movzbl	tab_f(%rsi), %esi; \
	movzbl	tab_f(%rdi), %edi; \
	rol	$16, %esi; \
	rol	$24, %edi; \
	xor	%esi, p2; \
	xor	%edi, p1

#endif	/* LAST_ROUND_TABLES */

#define	ii_rnd(p1, p2, p3, p4, round)	/* normal inverse round */ \
	mov	ik_ref(round,0), p1; \
	mov	ik_ref(round,1), p2; \
	mov	ik_ref(round,2), p3; \
	mov	ik_ref(round,3), p4; \
 \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	shr	$16, %eax; \
	xor	tab_0(%rsi), p1; \
	xor	tab_1(%rdi), p2; \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	xor	tab_2(%rsi), p3; \
	xor	tab_3(%rdi), p4; \
 \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	shr	$16, %ebx; \
	xor	tab_0(%rsi), p2; \
	xor	tab_1(%rdi), p3; \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	xor	tab_2(%rsi), p4; \
	xor	tab_3(%rdi), p1; \
 \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	shr	$16, %ecx; \
	xor	tab_0(%rsi), p3; \
	xor	tab_1(%rdi), p4; \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	xor	tab_2(%rsi), p1; \
	xor	tab_3(%rdi), p2; \
 \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	shr	$16, %edx; \
	xor	tab_0(%rsi), p4; \
	xor	tab_1(%rdi), p1; \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	xor	tab_2(%rsi), p2; \
	xor	tab_3(%rdi), p3; \
 \
	mov	p1, %eax; \
	mov	p2, %ebx; \
	mov	p3, %ecx; \
	mov	p4, %edx

#ifdef	LAST_ROUND_TABLES

#define	il_rnd(p1, p2, p3, p4, round)	/* last inverse round */ \
	add	$2048, tptr; \
	mov	ik_ref(round,0), p1; \
	mov	ik_ref(round,1), p2; \
	mov	ik_ref(round,2), p3; \
	mov	ik_ref(round,3), p4; \
 \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	shr	$16, %eax; \
	xor	tab_0(%rsi), p1; \
	xor	tab_1(%rdi), p2; \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	xor	tab_2(%rsi), p3; \
	xor	tab_3(%rdi), p4; \
 \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	shr	$16, %ebx; \
	xor	tab_0(%rsi), p2; \
	xor	tab_1(%rdi), p3; \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	xor	tab_2(%rsi), p4; \
	xor	tab_3(%rdi), p1; \
 \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	shr	$16, %ecx; \
	xor	tab_0(%rsi), p3; \
	xor	tab_1(%rdi), p4; \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	xor	tab_2(%rsi), p1; \
	xor	tab_3(%rdi), p2; \
 \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	shr	$16, %edx; \
	xor	tab_0(%rsi), p4; \
	xor	tab_1(%rdi), p1; \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	xor	tab_2(%rsi), p2; \
	xor	tab_3(%rdi), p3

#else

#define	il_rnd(p1, p2, p3, p4, round)	/* last inverse round */ \
	mov	ik_ref(round,0), p1; \
	mov	ik_ref(round,1), p2; \
	mov	ik_ref(round,2), p3; \
	mov	ik_ref(round,3), p4; \
 \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	movzbl	tab_i(%rsi), %esi; \
	movzbl	tab_i(%rdi), %edi; \
	shr	$16, %eax; \
	xor	%esi, p1; \
	rol	$8, %edi; \
	xor	%edi, p2; \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	movzbl	tab_i(%rsi), %esi; \
	movzbl	tab_i(%rdi), %edi; \
	rol	$16, %esi; \
	rol	$24, %edi; \
	xor	%esi, p3; \
	xor	%edi, p4; \
 \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	movzbl	tab_i(%rsi), %esi; \
	movzbl	tab_i(%rdi), %edi; \
	shr	$16, %ebx; \
	xor	%esi, p2; \
	rol	$8, %edi; \
	xor	%edi, p3; \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	movzbl	tab_i(%rsi), %esi; \
	movzbl	tab_i(%rdi), %edi; \
	rol	$16, %esi; \
	rol	$24, %edi; \
	xor	%esi, p4; \
	xor	%edi, p1; \
 \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	movzbl	tab_i(%rsi), %esi; \
	movzbl	tab_i(%rdi), %edi; \
	shr	$16, %ecx; \
	xor	%esi, p3; \
	rol	$8, %edi; \
	xor	%edi, p4; \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	movzbl	tab_i(%rsi), %esi; \
	movzbl	tab_i(%rdi), %edi; \
	rol	$16, %esi; \
	rol	$24, %edi; \
	xor	%esi, p1; \
	xor	%edi, p2; \
 \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	movzbl	tab_i(%rsi), %esi; \
	movzbl	tab_i(%rdi), %edi; \
	shr	$16, %edx; \
	xor	%esi, p4; \
	rol	$8, %edi; \
	xor	%edi, p1; \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	movzbl	tab_i(%rsi), %esi; \
	movzbl	tab_i(%rdi), %edi; \
	rol	$16, %esi; \
	rol	$24, %edi; \
	xor	%esi, p2; \
	xor	%edi, p3

#endif	/* LAST_ROUND_TABLES */

/*
 * OpenSolaris OS:
 * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
 *	const uint32_t pt[4], uint32_t ct[4]);
 *
 * Original interface:
 * int aes_encrypt(const unsigned char *in,
 *	unsigned char *out, const aes_encrypt_ctx cx[1]);
 *
 * Register use (OpenSolaris interface):
 *	In:	%rdi = ks, %esi = Nr (10, 12 or 14), %rdx = pt, %rcx = ct
 *	Out:	%rax = 0 after a block has been written to ct, or -1 if Nr
 *		is not 10, 12 or 14 (ct is then left untouched)
 *	State:	%eax, %ebx, %ecx, %edx hold the block; %r9d..%r12d receive
 *		each round result; %rbp is the table pointer (tptr) and %r8
 *		the key schedule pointer (kptr)
 *	Saved:	%rbx, %rbp and %r12 are saved in a 32-byte stack area, which
 *		also holds the output pointer, and restored before return
 *	Overwritten without restore: %rax, %rcx, %rdx, %rsi, %rdi, %r8..%r11
 */
	.align	64
enc_tab:
	enc_vals(u8)
#ifdef	LAST_ROUND_TABLES
	/ Last Round Tables:
	enc_vals(w8)
#endif


	ENTRY_NP(aes_encrypt_amd64)
#ifdef	GLADMAN_INTERFACE
	/ Original interface
	sub	$[4*8], %rsp	/ gnu/linux/opensolaris binary interface
	mov	%rsi, (%rsp)	/ output pointer (P2)
	mov	%rdx, %r8	/ context (P3)

	mov	%rbx, 1*8(%rsp)	/ P1: input pointer in rdi
	mov	%rbp, 2*8(%rsp)	/ P2: output pointer in (rsp)
	mov	%r12, 3*8(%rsp)	/ P3: context in r8
	movzbl	4*KS_LENGTH(kptr), %esi	/ Get byte key length * 16

#else
	/ OpenSolaris OS interface
	sub	$[4*8], %rsp	/ Make room on stack to save registers
	mov	%rcx, (%rsp)	/ Save output pointer (P4) on stack
	mov	%rdi, %r8	/ context (P1)
	mov	%rdx, %rdi	/ P3: save input pointer
	shl	$4, %esi	/ P2: esi byte key length * 16

	mov	%rbx, 1*8(%rsp)	/ Save registers
	mov	%rbp, 2*8(%rsp)
	mov	%r12, 3*8(%rsp)
	/ P1: context in r8
	/ P2: byte key length * 16 in esi
	/ P3: input pointer in rdi
	/ P4: output pointer in (rsp)
#endif	/* GLADMAN_INTERFACE */

	lea	enc_tab(%rip), tptr
	sub	$fofs, kptr	/ bias kptr so fofs+n(kptr) is key schedule byte n

	/ Load input block into registers
	mov	(%rdi), %eax
	mov	1*4(%rdi), %ebx
	mov	2*4(%rdi), %ecx
	mov	3*4(%rdi), %edx

	/ XOR in the first four key schedule words (round 0 key)
	xor	fofs(kptr), %eax
	xor	fofs+4(kptr), %ebx
	xor	fofs+8(kptr), %ecx
	xor	fofs+12(kptr), %edx

	/ Point kptr at the last round key; fk_ref(n,..) is n rounds before it
	lea	(kptr,%rsi), kptr
	/ Jump based on byte key length * 16:
	cmp	$[10*16], %esi
	je	3f
	cmp	$[12*16], %esi
	je	2f
	cmp	$[14*16], %esi
	je	1f
	mov	$-1, %rax	/ error
	jmp	4f

	/ Perform normal forward rounds
1:	ff_rnd(%r9d, %r10d, %r11d, %r12d, 13)
	ff_rnd(%r9d, %r10d, %r11d, %r12d, 12)
2:	ff_rnd(%r9d, %r10d, %r11d, %r12d, 11)
	ff_rnd(%r9d, %r10d, %r11d, %r12d, 10)
3:	ff_rnd(%r9d, %r10d, %r11d, %r12d,  9)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  8)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  7)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  6)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  5)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  4)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  3)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  2)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  1)
	fl_rnd(%r9d, %r10d, %r11d, %r12d,  0)

	/ Copy results
	mov	(%rsp), %rbx	/ rbx = output pointer saved at entry
	mov	%r9d, (%rbx)
	mov	%r10d, 4(%rbx)
	mov	%r11d, 8(%rbx)
	mov	%r12d, 12(%rbx)
	xor	%rax, %rax
4:	/ Restore registers
	mov	1*8(%rsp), %rbx
	mov	2*8(%rsp), %rbp
	mov	3*8(%rsp), %r12
	add	$[4*8], %rsp
	ret

	SET_SIZE(aes_encrypt_amd64)

/*
 * OpenSolaris OS:
 * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
 *	const uint32_t ct[4], uint32_t pt[4]);
 *
 * Original interface:
 * int aes_decrypt(const unsigned char *in,
 *	unsigned char *out, const aes_encrypt_ctx cx[1]);
 *
 * Register use (OpenSolaris interface):
 *	In:	%rdi = ks, %esi = Nr (10, 12 or 14), %rdx = ct, %rcx = pt
 *	Out:	%rax = 0 after a block has been written to pt, or -1 if Nr
 *		is not 10, 12 or 14 (pt is then left untouched)
 *	State:	%eax, %ebx, %ecx, %edx hold the block; %r9d..%r12d receive
 *		each round result; %rbp is the table pointer (tptr) and %r8
 *		the key schedule pointer (kptr)
 *	Saved:	%rbx, %rbp and %r12 are saved in a 32-byte stack area, which
 *		also holds the output pointer, and restored before return
 *	Overwritten without restore: %rax, %rcx, %rdx, %rsi, %rdi, %r8..%r11
 */
	.align	64
dec_tab:
	dec_vals(v8)
#ifdef	LAST_ROUND_TABLES
	/ Last Round Tables:
	dec_vals(w8)
#endif


	ENTRY_NP(aes_decrypt_amd64)
#ifdef	GLADMAN_INTERFACE
	/ Original interface
	sub	$[4*8], %rsp	/ gnu/linux/opensolaris binary interface
	mov	%rsi, (%rsp)	/ output pointer (P2)
	mov	%rdx, %r8	/ context (P3)

	mov	%rbx, 1*8(%rsp)	/ P1: input pointer in rdi
	mov	%rbp, 2*8(%rsp)	/ P2: output pointer in (rsp)
	mov	%r12, 3*8(%rsp)	/ P3: context in r8
	movzbl	4*KS_LENGTH(kptr), %esi	/ Get byte key length * 16

#else
	/ OpenSolaris OS interface
	sub	$[4*8], %rsp	/ Make room on stack to save registers
	mov	%rcx, (%rsp)	/ Save output pointer (P4) on stack
	mov	%rdi, %r8	/ context (P1)
	mov	%rdx, %rdi	/ P3: save input pointer
	shl	$4, %esi	/ P2: esi byte key length * 16

	mov	%rbx, 1*8(%rsp)	/ Save registers
	mov	%rbp, 2*8(%rsp)
	mov	%r12, 3*8(%rsp)
	/ P1: context in r8
	/ P2: byte key length * 16 in esi
	/ P3: input pointer in rdi
	/ P4: output pointer in (rsp)
#endif	/* GLADMAN_INTERFACE */

	lea	dec_tab(%rip), tptr
	sub	$rofs, kptr	/ bias kptr so rofs+n(kptr) is key schedule byte n

	/ Load input block into registers
	mov	(%rdi), %eax
	mov	1*4(%rdi), %ebx
	mov	2*4(%rdi), %ecx
	mov	3*4(%rdi), %edx

	/ rdi = address (biased by rofs) of the first key to apply
#ifdef AES_REV_DKS
	mov	kptr, %rdi
	lea	(kptr,%rsi), kptr
#else
	lea	(kptr,%rsi), %rdi
#endif

	xor	rofs(%rdi), %eax
	xor	rofs+4(%rdi), %ebx
	xor	rofs+8(%rdi), %ecx
	xor	rofs+12(%rdi), %edx

	/ Jump based on byte key length * 16:
	cmp	$[10*16], %esi
	je	3f
	cmp	$[12*16], %esi
	je	2f
	cmp	$[14*16], %esi
	je	1f
	mov	$-1, %rax	/ error
	jmp	4f

	/ Perform normal inverse rounds
1:	ii_rnd(%r9d, %r10d, %r11d, %r12d, 13)
	ii_rnd(%r9d, %r10d, %r11d, %r12d, 12)
2:	ii_rnd(%r9d, %r10d, %r11d, %r12d, 11)
	ii_rnd(%r9d, %r10d, %r11d, %r12d, 10)
3:	ii_rnd(%r9d, %r10d, %r11d, %r12d,  9)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  8)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  7)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  6)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  5)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  4)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  3)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  2)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  1)
	il_rnd(%r9d, %r10d, %r11d, %r12d,  0)

	/ Copy results
	mov	(%rsp), %rbx	/ rbx = output pointer saved at entry
	mov	%r9d, (%rbx)
	mov	%r10d, 4(%rbx)
	mov	%r11d, 8(%rbx)
	mov	%r12d, 12(%rbx)
	xor	%rax, %rax
4:	/ Restore registers
	mov	1*8(%rsp), %rbx
	mov	2*8(%rsp), %rbp
	mov	3*8(%rsp), %r12
	add	$[4*8], %rsp
	ret

	SET_SIZE(aes_decrypt_amd64)
#endif	/* lint || __lint */