1/* 2 * --------------------------------------------------------------------------- 3 * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. 4 * 5 * LICENSE TERMS 6 * 7 * The free distribution and use of this software is allowed (with or without 8 * changes) provided that: 9 * 10 * 1. source code distributions include the above copyright notice, this 11 * list of conditions and the following disclaimer; 12 * 13 * 2. binary distributions include the above copyright notice, this list 14 * of conditions and the following disclaimer in their documentation; 15 * 16 * 3. the name of the copyright holder is not used to endorse products 17 * built using this software without specific written permission. 18 * 19 * DISCLAIMER 20 * 21 * This software is provided 'as is' with no explicit or implied warranties 22 * in respect of its properties, including, but not limited to, correctness 23 * and/or fitness for purpose. 24 * --------------------------------------------------------------------------- 25 * Issue 20/12/2007 26 * 27 * I am grateful to Dag Arne Osvik for many discussions of the techniques that 28 * can be used to optimise AES assembler code on AMD64/EM64T architectures. 29 * Some of the techniques used in this implementation are the result of 30 * suggestions made by him for which I am most grateful. 31 * 32 * An AES implementation for AMD64 processors using the YASM assembler. This 33 * implementation provides only encryption, decryption and hence requires key 34 * scheduling support in C. It uses 8k bytes of tables but its encryption and 35 * decryption performance is very close to that obtained using large tables. 
 * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions,
 * which are as follows:
 *                  ms windows      gnu/linux/opensolaris os
 *
 * in_blk           rcx             rdi
 * out_blk          rdx             rsi
 * context (cx)     r8              rdx
 *
 * preserved        rsi     -       + rbx, rbp, rsp, r12, r13, r14 & r15
 * registers        rdi     -         on both
 *
 * destroyed        -       rsi     + rax, rcx, rdx, r8, r9, r10 & r11
 * registers        -       rdi       on both
 *
 * The convention used here is that for gnu/linux/opensolaris os.
 *
 * This code provides the standard AES block size (128 bits, 16 bytes) and the
 * three standard AES key sizes (128, 192 and 256 bits). It has the same call
 * interface as my C implementation. It uses the Microsoft C AMD64 calling
 * conventions in which the three parameters are placed in rcx, rdx and r8
 * respectively. The rbx, rsi, rdi, rbp and r12..r15 registers are preserved.
 *
 * OpenSolaris Note:
 * Modified to use GNU/Linux/Solaris calling conventions.
 * That is parameters are placed in rdi, rsi, rdx, and rcx, respectively.
 *
 * AES_RETURN aes_encrypt(const unsigned char in_blk[],
 *	unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
 *
 * AES_RETURN aes_decrypt(const unsigned char in_blk[],
 *	unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
 *
 * AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
 *	const aes_encrypt_ctx cx[1]);
 *
 * AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
 *	const aes_decrypt_ctx cx[1]);
 *
 * AES_RETURN aes_encrypt_key(const unsigned char key[],
 *	unsigned int len, const aes_decrypt_ctx cx[1]);
 *
 * AES_RETURN aes_decrypt_key(const unsigned char key[],
 *	unsigned int len, const aes_decrypt_ctx cx[1]);
 *
 * where <NNN> is 128, 192 or 256. In the last two calls the length can be in
 * either bits or bytes.
 *
 * Comment in/out the following lines to obtain the desired subroutines.
* These selections MUST match those in the C header file aesopt.h
 */
#define	AES_REV_DKS	  /* define if key decryption schedule is reversed */

#define	LAST_ROUND_TABLES /* define for the faster version using extra tables */

/*
 * The encryption key schedule has the following in memory layout where N is the
 * number of rounds (10, 12 or 14):
 *
 * lo: | input key (round 0)  |  (each round is four 32-bit words)
 *     | encryption round 1   |
 *     | encryption round 2   |
 *     ....
 *     | encryption round N-1 |
 * hi: | encryption round N   |
 *
 * The decryption key schedule is normally set up so that it has the same
 * layout as above by actually reversing the order of the encryption key
 * schedule in memory (this happens when AES_REV_DKS is set):
 *
 * lo: | decryption round 0   | =              | encryption round N   |
 *     | decryption round 1   | = INV_MIX_COL[ | encryption round N-1 | ]
 *     | decryption round 2   | = INV_MIX_COL[ | encryption round N-2 | ]
 *     ....                       ....
 *     | decryption round N-1 | = INV_MIX_COL[ | encryption round 1   | ]
 * hi: | decryption round N   | =              | input key (round 0)  |
 *
 * with rounds except the first and last modified using inv_mix_column()
 * But if AES_REV_DKS is NOT set the order of keys is left as it is for
 * encryption so that it has to be accessed in reverse when used for
 * decryption (although the inverse mix column modifications are done)
 *
 * lo: | decryption round 0   | =              | input key (round 0)  |
 *     | decryption round 1   | = INV_MIX_COL[ | encryption round 1   | ]
 *     | decryption round 2   | = INV_MIX_COL[ | encryption round 2   | ]
 *     ....                       ....
 *     | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
 * hi: | decryption round N   | =              | encryption round N   |
 *
 * This layout is faster when the assembler key scheduling provided here
 * is used.
126 * 127 * End of user defines 128 */ 129 130/* 131 * --------------------------------------------------------------------------- 132 * OpenSolaris OS modifications 133 * 134 * This source originates from Brian Gladman file aes_amd64.asm 135 * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip 136 * with these changes: 137 * 138 * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and 139 * !__GNUC__ ifdefs. Also removed ENCRYPTION, DECRYPTION, 140 * AES_128, AES_192, AES_256, AES_VAR ifdefs. 141 * 142 * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define 143 * 144 * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef 145 * 146 * 4. Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax 147 * (operands reversed, literals prefixed with "$", registers prefixed with "%", 148 * and "[register+offset]", addressing changed to "offset(register)", 149 * parenthesis in constant expressions "()" changed to square brackets "[]", 150 * "." removed from local (numeric) labels, and other changes. 151 * Examples: 152 * Intel/yasm/nasm Syntax ATT/OpenSolaris Syntax 153 * mov rax,(4*20h) mov $[4*0x20],%rax 154 * mov rax,[ebx+20h] mov 0x20(%ebx),%rax 155 * lea rax,[ebx+ecx] lea (%ebx,%ecx),%rax 156 * sub rax,[ebx+ecx*4-20h] sub -0x20(%ebx,%ecx,4),%rax 157 * 158 * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from 159 * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards. 160 * 161 * 6. Renamed functions and reordered parameters to match OpenSolaris: 162 * Original Gladman interface: 163 * int aes_encrypt(const unsigned char *in, 164 * unsigned char *out, const aes_encrypt_ctx cx[1])/ 165 * int aes_decrypt(const unsigned char *in, 166 * unsigned char *out, const aes_encrypt_ctx cx[1])/ 167 * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t, 168 * and a union type, inf., containing inf.l, a uint32_t and 169 * inf.b, a 4-element array of uint32_t. 
* Only b[0] in the array (aka "l") is used and contains the key schedule
 * length * 16, where key schedule length is 10, 12, or 14 (the number of
 * rounds).
 *
 * OpenSolaris OS interface:
 * void aes_encrypt_impl(const aes_ks_t *ks, int Nr,
 *	const uint32_t pt[4], uint32_t ct[4]);
 * void aes_decrypt_impl(const aes_ks_t *ks, int Nr,
 *	const uint32_t pt[4], uint32_t ct[4]);
 * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4];
 *	uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t;
 * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
 * ct is crypto text, and MAX_AES_NR is 14.
 * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
 */

#if !defined(lint) && !defined(__lint)
	.ident	"%Z%%M% %I% %E% SMI"
#include <sys/asm_linkage.h>

/* Number of 32-bit words in the key schedule array of the Gladman context. */
#define	KS_LENGTH	60

/* 32-bit names of the legacy registers */
#define	raxd		eax
#define	rdxd		edx
#define	rcxd		ecx
#define	rbxd		ebx
#define	rsid		esi
#define	rdid		edi

/* 8-bit (low byte) names of the legacy registers */
#define	raxb		al
#define	rdxb		dl
#define	rcxb		cl
#define	rbxb		bl
#define	rsib		sil
#define	rdib		dil

/ finite field multiplies by {02}, {04} and {08}

#define	f2(x) [[x<<1]^[[[x>>7]&1]*0x11b]]
#define	f4(x) [[x<<2]^[[[x>>6]&1]*0x11b]^[[[x>>6]&2]*0x11b]]
#define	f8(x) [[x<<3]^[[[x>>5]&1]*0x11b]^[[[x>>5]&2]*0x11b]^[[[x>>5]&4]*0x11b]]

/ finite field multiplies required in table generation

#define	f3(x) [[f2(x)] ^ [x]]
#define	f9(x) [[f8(x)] ^ [x]]
#define	fb(x) [[f8(x)] ^ [f2(x)] ^ [x]]
#define	fd(x) [[f8(x)] ^ [f4(x)] ^ [x]]
#define	fe(x) [[f8(x)] ^ [f4(x)] ^ [f2(x)]]

/ macros for expanding S-box data

/*
 * Each macro below expands one S-box value into an 8-byte table entry:
 *	u8 - forward round entry; byte 1 is the plain value (see tab_f)
 *	v8 - inverse round entry; byte 7 is the plain value (see tab_i)
 *	w8 - last round entry; the plain value in bytes 0 and 4, zero elsewhere
 */
#define	u8(x) [f2(x)], [x], [x], [f3(x)], [f2(x)], [x], [x], [f3(x)]
#define	v8(x) [fe(x)], [f9(x)], [fd(x)], [fb(x)], [fe(x)], [f9(x)], [fd(x)], [x]
#define	w8(x) [x], 0, 0, 0, [x], 0, 0, 0

/*
 * enc_vals(x) emits the 256 forward S-box values, each expanded through the
 * entry macro x (u8 or w8), i.e. 256 * 8 = 2048 bytes of table.
 */
#define	enc_vals(x) \
	.byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \
	.byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \
	.byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \
	.byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \
	.byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \
	.byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \
	.byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \
	.byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \
	.byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \
	.byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \
	.byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \
	.byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \
	.byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \
	.byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \
	.byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \
	.byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \
	.byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \
	.byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \
	.byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \
	.byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \
	.byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \
	.byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \
	.byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \
	.byte x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \
	.byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \
	.byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \
	.byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \
	.byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \
	.byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \
	.byte x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \
	.byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \
	.byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16)

/*
 * dec_vals(x) emits the 256 inverse S-box values, each expanded through the
 * entry macro x (v8 or w8), i.e. 256 * 8 = 2048 bytes of table.
 */
#define	dec_vals(x) \
	.byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \
	.byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \
	.byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \
	.byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \
	.byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \
	.byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \
	.byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \
	.byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \
	.byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \
	.byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \
	.byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \
	.byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \
	.byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \
	.byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \
	.byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \
	.byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \
	.byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \
	.byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \
	.byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \
	.byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \
	.byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \
	.byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \
	.byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \
	.byte x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \
	.byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \
	.byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \
	.byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \
	.byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \
	.byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \
	.byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \
	.byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \
	.byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d)

#define	tptr	%rbp	/* table pointer */
#define	kptr	%r8	/* key schedule pointer */
#define	fofs	128	/* adjust offset in key schedule to keep |disp| < 128 */
/* Word y of the forward round key that lies x rounds below the biased kptr. */
#define	fk_ref(x, y)	-16*x+fofs+4*y(kptr)

/*
 * ik_ref(x, y) is the inverse-round counterpart of fk_ref().  With
 * AES_REV_DKS the schedule is walked downwards from kptr exactly like the
 * forward one; without it the displacement grows with the round number.
 */
#ifdef	AES_REV_DKS
#define	rofs		128
#define	ik_ref(x, y)	-16*x+rofs+4*y(kptr)

#else
#define	rofs		-128
#define	ik_ref(x, y)	16*x+rofs+4*y(kptr)
#endif	/* AES_REV_DKS */

/*
 * Table operands.  x is a 64-bit register holding a byte value; entries are
 * 8 bytes wide, hence the scale of 8.  tab_0..tab_3 are used as 32-bit
 * operands at byte offsets 0, 3, 2 and 1 of an entry; tab_f and tab_i are
 * used as single-byte operands (byte 1 of a u8 entry and byte 7 of a v8
 * entry, both of which hold the plain S-box value).
 */
#define	tab_0(x)	(tptr,x,8)
#define	tab_1(x)	3(tptr,x,8)
#define	tab_2(x)	2(tptr,x,8)
#define	tab_3(x)	1(tptr,x,8)
#define	tab_f(x)	1(tptr,x,8)
#define	tab_i(x)	7(tptr,x,8)

	/* EXPORT DELETE START */
/*
 * ff_rnd: one normal forward round.
 *	in:	state words in %eax, %ebx, %ecx, %edx; tptr; kptr
 *	out:	new state in %eax, %ebx, %ecx, %edx (also left in p1..p4)
 *	uses:	p1..p4 (32-bit registers), %rsi and %rdi as table indices, flags
 * The round key words are loaded into p1..p4 first; the four bytes of each
 * state word are then used as table indices and the table words are XORed
 * into p1..p4.
 */
#define	ff_rnd(p1, p2, p3, p4, round)	/* normal forward round */ \
	mov	fk_ref(round,0), p1; \
	mov	fk_ref(round,1), p2; \
	mov	fk_ref(round,2), p3; \
	mov	fk_ref(round,3), p4; \
	\
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	shr	$16, %eax; \
	xor	tab_0(%rsi), p1; \
	xor	tab_1(%rdi), p4; \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	xor	tab_2(%rsi), p3; \
	xor	tab_3(%rdi), p2; \
	\
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	shr	$16, %ebx; \
	xor	tab_0(%rsi), p2; \
	xor	tab_1(%rdi), p1; \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	xor	tab_2(%rsi), p4; \
	xor	tab_3(%rdi), p3; \
	\
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	shr	$16, %ecx; \
	xor	tab_0(%rsi), p3; \
	xor	tab_1(%rdi), p2; \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	xor	tab_2(%rsi), p1; \
	xor	tab_3(%rdi), p4; \
	\
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	shr	$16, %edx; \
	xor	tab_0(%rsi), p4; \
	xor	tab_1(%rdi), p3; \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	xor	tab_2(%rsi), p2; \
	xor	tab_3(%rdi), p1; \
	\
	mov	p1, %eax; \
	mov	p2, %ebx; \
	mov	p3, %ecx; \
	mov	p4, %edx

/*
 * fl_rnd: last forward round.  The result is left in p1..p4 only; the state
 * registers are not reloaded.
 *
 * With LAST_ROUND_TABLES, tptr is first advanced by 2048 bytes so that the
 * same lookups hit the w8 table that follows the 256 * 8 byte round table.
 * Without it, the plain S-box byte is fetched through tab_f with an explicit
 * byte-sized zero-extending load (movzbl) and rotated into position.
 */
#ifdef	LAST_ROUND_TABLES

#define	fl_rnd(p1, p2, p3, p4, round)	/* last forward round */ \
	add	$2048, tptr; \
	mov	fk_ref(round,0), p1; \
	mov	fk_ref(round,1), p2; \
	mov	fk_ref(round,2), p3; \
	mov	fk_ref(round,3), p4; \
	\
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	shr	$16, %eax; \
	xor	tab_0(%rsi), p1; \
	xor	tab_1(%rdi), p4; \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	xor	tab_2(%rsi), p3; \
	xor	tab_3(%rdi), p2; \
	\
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	shr	$16, %ebx; \
	xor	tab_0(%rsi), p2; \
	xor	tab_1(%rdi), p1; \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	xor	tab_2(%rsi), p4; \
	xor	tab_3(%rdi), p3; \
	\
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	shr	$16, %ecx; \
	xor	tab_0(%rsi), p3; \
	xor	tab_1(%rdi), p2; \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	xor	tab_2(%rsi), p1; \
	xor	tab_3(%rdi), p4; \
	\
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	shr	$16, %edx; \
	xor	tab_0(%rsi), p4; \
	xor	tab_1(%rdi), p3; \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	xor	tab_2(%rsi), p2; \
	xor	tab_3(%rdi), p1

#else

#define	fl_rnd(p1, p2, p3, p4, round)	/* last forward round */ \
	mov	fk_ref(round,0), p1; \
	mov	fk_ref(round,1), p2; \
	mov	fk_ref(round,2), p3; \
	mov	fk_ref(round,3), p4; \
	\
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	shr	$16, %eax; \
	movzbl	tab_f(%rsi), %esi; \
	movzbl	tab_f(%rdi), %edi; \
	xor	%esi, p1; \
	rol	$8, %edi; \
	xor	%edi, p4; \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	movzbl	tab_f(%rsi), %esi; \
	movzbl	tab_f(%rdi), %edi; \
	rol	$16, %esi; \
	rol	$24, %edi; \
	xor	%esi, p3; \
	xor	%edi, p2; \
	\
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	shr	$16, %ebx; \
	movzbl	tab_f(%rsi), %esi; \
	movzbl	tab_f(%rdi), %edi; \
	xor	%esi, p2; \
	rol	$8, %edi; \
	xor	%edi, p1; \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	movzbl	tab_f(%rsi), %esi; \
	movzbl	tab_f(%rdi), %edi; \
	rol	$16, %esi; \
	rol	$24, %edi; \
	xor	%esi, p4; \
	xor	%edi, p3; \
	\
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	movzbl	tab_f(%rsi), %esi; \
	movzbl	tab_f(%rdi), %edi; \
	shr	$16, %ecx; \
	xor	%esi, p3; \
	rol	$8, %edi; \
	xor	%edi, p2; \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	movzbl	tab_f(%rsi), %esi; \
	movzbl	tab_f(%rdi), %edi; \
	rol	$16, %esi; \
	rol	$24, %edi; \
	xor	%esi, p1; \
	xor	%edi, p4; \
	\
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	movzbl	tab_f(%rsi), %esi; \
	movzbl	tab_f(%rdi), %edi; \
	shr	$16, %edx; \
	xor	%esi, p4; \
	rol	$8, %edi; \
	xor	%edi, p3; \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	movzbl	tab_f(%rsi), %esi; \
	movzbl	tab_f(%rdi), %edi; \
	rol	$16, %esi; \
	rol	$24, %edi; \
	xor	%esi, p2; \
	xor	%edi, p1

#endif	/* LAST_ROUND_TABLES */

/*
 * ii_rnd: one normal inverse round.  Same register contract as ff_rnd, but
 * the round key is addressed through ik_ref() and the table words are
 * distributed over p1..p4 in the opposite rotation.
 */
#define	ii_rnd(p1, p2, p3, p4, round)	/* normal inverse round */ \
	mov	ik_ref(round,0), p1; \
	mov	ik_ref(round,1), p2; \
	mov	ik_ref(round,2), p3; \
	mov	ik_ref(round,3), p4; \
	\
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	shr	$16, %eax; \
	xor	tab_0(%rsi), p1; \
	xor	tab_1(%rdi), p2; \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	xor	tab_2(%rsi), p3; \
	xor	tab_3(%rdi), p4; \
	\
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	shr	$16, %ebx; \
	xor	tab_0(%rsi), p2; \
	xor	tab_1(%rdi), p3; \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	xor	tab_2(%rsi), p4; \
	xor	tab_3(%rdi), p1; \
	\
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	shr	$16, %ecx; \
	xor	tab_0(%rsi), p3; \
	xor	tab_1(%rdi), p4; \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	xor	tab_2(%rsi), p1; \
	xor	tab_3(%rdi), p2; \
	\
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	shr	$16, %edx; \
	xor	tab_0(%rsi), p4; \
	xor	tab_1(%rdi), p1; \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	xor	tab_2(%rsi), p2; \
	xor	tab_3(%rdi), p3; \
	\
	mov	p1, %eax; \
	mov	p2, %ebx; \
	mov	p3, %ecx; \
	mov	p4, %edx

/*
 * il_rnd: last inverse round.  The result is left in p1..p4 only.
 *
 * With LAST_ROUND_TABLES, tptr is first advanced by 2048 bytes to the w8
 * table.  Without it, the plain inverse S-box byte is fetched through tab_i
 * with an explicit byte-sized zero-extending load (movzbl) and rotated into
 * position.
 */
#ifdef	LAST_ROUND_TABLES

#define	il_rnd(p1, p2, p3, p4, round)	/* last inverse round */ \
	add	$2048, tptr; \
	mov	ik_ref(round,0), p1; \
	mov	ik_ref(round,1), p2; \
	mov	ik_ref(round,2), p3; \
	mov	ik_ref(round,3), p4; \
	\
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	shr	$16, %eax; \
	xor	tab_0(%rsi), p1; \
	xor	tab_1(%rdi), p2; \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	xor	tab_2(%rsi), p3; \
	xor	tab_3(%rdi), p4; \
	\
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	shr	$16, %ebx; \
	xor	tab_0(%rsi), p2; \
	xor	tab_1(%rdi), p3; \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	xor	tab_2(%rsi), p4; \
	xor	tab_3(%rdi), p1; \
	\
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	shr	$16, %ecx; \
	xor	tab_0(%rsi), p3; \
	xor	tab_1(%rdi), p4; \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	xor	tab_2(%rsi), p1; \
	xor	tab_3(%rdi), p2; \
	\
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	shr	$16, %edx; \
	xor	tab_0(%rsi), p4; \
	xor	tab_1(%rdi), p1; \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	xor	tab_2(%rsi), p2; \
	xor	tab_3(%rdi), p3

#else

#define	il_rnd(p1, p2, p3, p4, round)	/* last inverse round */ \
	mov	ik_ref(round,0), p1; \
	mov	ik_ref(round,1), p2; \
	mov	ik_ref(round,2), p3; \
	mov	ik_ref(round,3), p4; \
	\
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	movzbl	tab_i(%rsi), %esi; \
	movzbl	tab_i(%rdi), %edi; \
	shr	$16, %eax; \
	xor	%esi, p1; \
	rol	$8, %edi; \
	xor	%edi, p2; \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	movzbl	tab_i(%rsi), %esi; \
	movzbl	tab_i(%rdi), %edi; \
	rol	$16, %esi; \
	rol	$24, %edi; \
	xor	%esi, p3; \
	xor	%edi, p4; \
	\
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	movzbl	tab_i(%rsi), %esi; \
	movzbl	tab_i(%rdi), %edi; \
	shr	$16, %ebx; \
	xor	%esi, p2; \
	rol	$8, %edi; \
	xor	%edi, p3; \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	movzbl	tab_i(%rsi), %esi; \
	movzbl	tab_i(%rdi), %edi; \
	rol	$16, %esi; \
	rol	$24, %edi; \
	xor	%esi, p4; \
	xor	%edi, p1; \
	\
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	movzbl	tab_i(%rsi), %esi; \
	movzbl	tab_i(%rdi), %edi; \
	shr	$16, %ecx; \
	xor	%esi, p3; \
	rol	$8, %edi; \
	xor	%edi, p4; \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	movzbl	tab_i(%rsi), %esi; \
	movzbl	tab_i(%rdi), %edi; \
	rol	$16, %esi; \
	rol	$24, %edi; \
	xor	%esi, p1; \
	xor	%edi, p2; \
	\
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	movzbl	tab_i(%rsi), %esi; \
	movzbl	tab_i(%rdi), %edi; \
	shr	$16, %edx; \
	xor	%esi, p4; \
	rol	$8, %edi; \
	xor	%edi, p1; \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	movzbl	tab_i(%rsi), %esi; \
	movzbl	tab_i(%rdi), %edi; \
	rol	$16, %esi; \
	rol	$24, %edi; \
	xor	%esi, p2; \
	xor	%edi, p3

#endif	/* LAST_ROUND_TABLES */
	/* EXPORT DELETE END */

/*
 * OpenSolaris OS:
 * void aes_encrypt_impl(const aes_ks_t *ks, int Nr,
 *	const uint32_t pt[4], uint32_t ct[4]);
 *
* Original interface:
 * int aes_encrypt(const unsigned char *in,
 *	unsigned char *out, const aes_encrypt_ctx cx[1]);
 *
 * Register use after the prologue (both interfaces):
 *	%r8  (kptr)	key schedule pointer, biased by -fofs
 *	%rbp (tptr)	address of enc_tab
 *	%esi		Nr * 16; reused as a table index inside the rounds
 *	%rdi		input pointer; reused as a table index inside the rounds
 *	%eax, %ebx, %ecx, %edx	the four 32-bit words of the state
 *	%r9d, %r10d, %r11d, %r12d	accumulators for the next state
 *	(%rsp)		output pointer; 8, 16 and 24(%rsp) hold the caller's
 *			%rbx, %rbp and %r12, which are restored before return
 *
 * On return %rax is 0, or -1 (with nothing written to the output block)
 * when Nr * 16 is not 10*16, 12*16 or 14*16.
 */
	.align	64
enc_tab:
	enc_vals(u8)
#ifdef	LAST_ROUND_TABLES
	/ Last Round Tables:
	enc_vals(w8)
#endif


	ENTRY_NP(aes_encrypt_impl)
	/* EXPORT DELETE START */
#ifdef	GLADMAN_INTERFACE
	/ Original interface
	sub	$[4*8], %rsp	/ gnu/linux/opensolaris binary interface
	mov	%rsi, (%rsp)	/ output pointer (P2)
	mov	%rdx, %r8	/ context (P3)

	mov	%rbx, 1*8(%rsp)	/ P1: input pointer in rdi
	mov	%rbp, 2*8(%rsp)	/ P2: output pointer in (rsp)
	mov	%r12, 3*8(%rsp)	/ P3: context in r8
	/* explicit byte load: the length field is read as a single byte */
	movzbl	4*KS_LENGTH(kptr), %esi	/ Get byte key length * 16

#else
	/ OpenSolaris OS interface
	sub	$[4*8], %rsp	/ Make room on stack to save registers
	mov	%rcx, (%rsp)	/ Save output pointer (P4) on stack
	mov	%rdi, %r8	/ context (P1)
	mov	%rdx, %rdi	/ P3: save input pointer
	shl	$4, %esi	/ P2: esi byte key length * 16

	mov	%rbx, 1*8(%rsp)	/ Save registers
	mov	%rbp, 2*8(%rsp)
	mov	%r12, 3*8(%rsp)
	/ P1: context in r8
	/ P2: byte key length * 16 in esi
	/ P3: input pointer in rdi
	/ P4: output pointer in (rsp)
#endif	/* GLADMAN_INTERFACE */

	lea	enc_tab(%rip), tptr
	sub	$fofs, kptr

	/ Load input block into registers
	mov	(%rdi), %eax
	mov	1*4(%rdi), %ebx
	mov	2*4(%rdi), %ecx
	mov	3*4(%rdi), %edx

	/* XOR in the round 0 key (first four words of the schedule) */
	xor	fofs(kptr), %eax
	xor	fofs+4(kptr), %ebx
	xor	fofs+8(kptr), %ecx
	xor	fofs+12(kptr), %edx

	/*
	 * Advance kptr by Nr * 16 so that fk_ref(n, y) addresses the key
	 * that is n rounds before the last one.
	 */
	lea	(kptr,%rsi), kptr
	/ Jump based on byte key length * 16:
	cmp	$[10*16], %esi
	je	3f
	cmp	$[12*16], %esi
	je	2f
	cmp	$[14*16], %esi
	je	1f
	mov	$-1, %rax	/ error
	jmp	4f

	/ Perform normal forward rounds
1:	ff_rnd(%r9d, %r10d, %r11d, %r12d, 13)
	ff_rnd(%r9d, %r10d, %r11d, %r12d, 12)
2:	ff_rnd(%r9d, %r10d, %r11d, %r12d, 11)
	ff_rnd(%r9d, %r10d, %r11d, %r12d, 10)
3:	ff_rnd(%r9d, %r10d, %r11d, %r12d,  9)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  8)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  7)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  6)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  5)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  4)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  3)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  2)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  1)
	fl_rnd(%r9d, %r10d, %r11d, %r12d,  0)

	/ Copy results
	mov	(%rsp), %rbx
	mov	%r9d, (%rbx)
	mov	%r10d, 4(%rbx)
	mov	%r11d, 8(%rbx)
	mov	%r12d, 12(%rbx)
	xor	%eax, %eax	/* return 0; a 32-bit write clears all of rax */
4:	/ Restore registers
	mov	1*8(%rsp), %rbx
	mov	2*8(%rsp), %rbp
	mov	3*8(%rsp), %r12
	add	$[4*8], %rsp
	/* EXPORT DELETE END */
	ret

	SET_SIZE(aes_encrypt_impl)

/*
 * OpenSolaris OS:
 * void aes_decrypt_impl(const aes_ks_t *ks, int Nr,
 *	const uint32_t pt[4], uint32_t ct[4]);
 *
 * Original interface:
 * int aes_decrypt(const unsigned char *in,
 *	unsigned char *out, const aes_encrypt_ctx cx[1]);
 *
 * Register use after the prologue (both interfaces):
 *	%r8  (kptr)	key schedule pointer, biased by -rofs
 *	%rbp (tptr)	address of dec_tab
 *	%esi		Nr * 16; reused as a table index inside the rounds
 *	%rdi		input pointer, then address used for the initial key
 *			XOR; reused as a table index inside the rounds
 *	%eax, %ebx, %ecx, %edx	the four 32-bit words of the state
 *	%r9d, %r10d, %r11d, %r12d	accumulators for the next state
 *	(%rsp)		output pointer; 8, 16 and 24(%rsp) hold the caller's
 *			%rbx, %rbp and %r12, which are restored before return
 *
 * On return %rax is 0, or -1 (with nothing written to the output block)
 * when Nr * 16 is not 10*16, 12*16 or 14*16.
 */
	.align	64
dec_tab:
	dec_vals(v8)
#ifdef	LAST_ROUND_TABLES
	/ Last Round Tables:
	dec_vals(w8)
#endif


	ENTRY_NP(aes_decrypt_impl)
	/* EXPORT DELETE START */
#ifdef	GLADMAN_INTERFACE
	/ Original interface
	sub	$[4*8], %rsp	/ gnu/linux/opensolaris binary interface
	mov	%rsi, (%rsp)	/ output pointer (P2)
	mov	%rdx, %r8	/ context (P3)

	mov	%rbx, 1*8(%rsp)	/ P1: input pointer in rdi
	mov	%rbp, 2*8(%rsp)	/ P2: output pointer in (rsp)
	mov	%r12, 3*8(%rsp)	/ P3: context in r8
	/* explicit byte load: the length field is read as a single byte */
	movzbl	4*KS_LENGTH(kptr), %esi	/ Get byte key length * 16

#else
	/ OpenSolaris OS interface
	sub	$[4*8], %rsp	/ Make room on stack to save registers
	mov	%rcx, (%rsp)	/ Save output pointer (P4) on stack
	mov	%rdi, %r8	/ context (P1)
	mov	%rdx, %rdi	/ P3: save input pointer
	shl	$4, %esi	/ P2: esi byte key length * 16

	mov	%rbx, 1*8(%rsp)	/ Save registers
	mov	%rbp, 2*8(%rsp)
	mov	%r12, 3*8(%rsp)
	/ P1: context in r8
	/ P2: byte key length * 16 in esi
	/ P3: input pointer in rdi
	/ P4: output pointer in (rsp)
#endif	/* GLADMAN_INTERFACE */

	lea	dec_tab(%rip), tptr
	sub	$rofs, kptr

	/ Load input block into registers
	mov	(%rdi), %eax
	mov	1*4(%rdi), %ebx
	mov	2*4(%rdi), %ecx
	mov	3*4(%rdi), %edx

	/*
	 * Point %rdi at the key used for the initial XOR.  With AES_REV_DKS
	 * that is the start of the schedule and kptr is advanced by Nr * 16
	 * for the rounds; otherwise it is Nr * 16 bytes into the schedule
	 * and kptr stays where it is.
	 */
#ifdef	AES_REV_DKS
	mov	kptr, %rdi
	lea	(kptr,%rsi), kptr
#else
	lea	(kptr,%rsi), %rdi
#endif

	xor	rofs(%rdi), %eax
	xor	rofs+4(%rdi), %ebx
	xor	rofs+8(%rdi), %ecx
	xor	rofs+12(%rdi), %edx

	/ Jump based on byte key length * 16:
	cmp	$[10*16], %esi
	je	3f
	cmp	$[12*16], %esi
	je	2f
	cmp	$[14*16], %esi
	je	1f
	mov	$-1, %rax	/ error
	jmp	4f

	/ Perform normal inverse rounds
1:	ii_rnd(%r9d, %r10d, %r11d, %r12d, 13)
	ii_rnd(%r9d, %r10d, %r11d, %r12d, 12)
2:	ii_rnd(%r9d, %r10d, %r11d, %r12d, 11)
	ii_rnd(%r9d, %r10d, %r11d, %r12d, 10)
3:	ii_rnd(%r9d, %r10d, %r11d, %r12d,  9)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  8)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  7)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  6)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  5)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  4)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  3)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  2)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  1)
	il_rnd(%r9d, %r10d, %r11d, %r12d,  0)

	/ Copy results
	mov	(%rsp), %rbx
	mov	%r9d, (%rbx)
	mov	%r10d, 4(%rbx)
	mov	%r11d, 8(%rbx)
	mov	%r12d, 12(%rbx)
	xor	%eax, %eax	/* return 0; a 32-bit write clears all of rax */
4:	/ Restore registers
	mov	1*8(%rsp), %rbx
	mov	2*8(%rsp), %rbp
	mov	3*8(%rsp), %r12
	add	$[4*8], %rsp
	/* EXPORT DELETE END */
	ret

	SET_SIZE(aes_decrypt_impl)

#else
	/* LINTED */
	/* Nothing to be linted in this file--it's pure assembly source. */
#endif	/* !lint && !__lint */