190bcde94Sda73024/* 290bcde94Sda73024 * --------------------------------------------------------------------------- 390bcde94Sda73024 * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. 490bcde94Sda73024 * 590bcde94Sda73024 * LICENSE TERMS 690bcde94Sda73024 * 790bcde94Sda73024 * The free distribution and use of this software is allowed (with or without 890bcde94Sda73024 * changes) provided that: 990bcde94Sda73024 * 1090bcde94Sda73024 * 1. source code distributions include the above copyright notice, this 1190bcde94Sda73024 * list of conditions and the following disclaimer; 1290bcde94Sda73024 * 1390bcde94Sda73024 * 2. binary distributions include the above copyright notice, this list 1490bcde94Sda73024 * of conditions and the following disclaimer in their documentation; 1590bcde94Sda73024 * 1690bcde94Sda73024 * 3. the name of the copyright holder is not used to endorse products 1790bcde94Sda73024 * built using this software without specific written permission. 1890bcde94Sda73024 * 1990bcde94Sda73024 * DISCLAIMER 2090bcde94Sda73024 * 2190bcde94Sda73024 * This software is provided 'as is' with no explicit or implied warranties 2290bcde94Sda73024 * in respect of its properties, including, but not limited to, correctness 2390bcde94Sda73024 * and/or fitness for purpose. 2490bcde94Sda73024 * --------------------------------------------------------------------------- 2590bcde94Sda73024 * Issue 20/12/2007 2690bcde94Sda73024 * 2790bcde94Sda73024 * I am grateful to Dag Arne Osvik for many discussions of the techniques that 2890bcde94Sda73024 * can be used to optimise AES assembler code on AMD64/EM64T architectures. 2990bcde94Sda73024 * Some of the techniques used in this implementation are the result of 3090bcde94Sda73024 * suggestions made by him for which I am most grateful. 3190bcde94Sda73024 * 3290bcde94Sda73024 * An AES implementation for AMD64 processors using the YASM assembler. 
This 3390bcde94Sda73024 * implementation provides only encryption, decryption and hence requires key 3490bcde94Sda73024 * scheduling support in C. It uses 8k bytes of tables but its encryption and 3590bcde94Sda73024 * decryption performance is very close to that obtained using large tables. 3690bcde94Sda73024 * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions, 3790bcde94Sda73024 * which are as follows: 3890bcde94Sda73024 * ms windows gnu/linux/opensolaris os 3990bcde94Sda73024 * 4090bcde94Sda73024 * in_blk rcx rdi 4190bcde94Sda73024 * out_blk rdx rsi 4290bcde94Sda73024 * context (cx) r8 rdx 4390bcde94Sda73024 * 4490bcde94Sda73024 * preserved rsi - + rbx, rbp, rsp, r12, r13, r14 & r15 4590bcde94Sda73024 * registers rdi - on both 4690bcde94Sda73024 * 4790bcde94Sda73024 * destroyed - rsi + rax, rcx, rdx, r8, r9, r10 & r11 4890bcde94Sda73024 * registers - rdi on both 4990bcde94Sda73024 * 5090bcde94Sda73024 * The convention used here is that for gnu/linux/opensolaris os. 5190bcde94Sda73024 * 5290bcde94Sda73024 * This code provides the standard AES block size (128 bits, 16 bytes) and the 5390bcde94Sda73024 * three standard AES key sizes (128, 192 and 256 bits). It has the same call 5490bcde94Sda73024 * interface as my C implementation. It uses the Microsoft C AMD64 calling 5590bcde94Sda73024 * conventions in which the three parameters are placed in rcx, rdx and r8 5690bcde94Sda73024 * respectively. The rbx, rsi, rdi, rbp and r12..r15 registers are preserved. 5790bcde94Sda73024 * 5890bcde94Sda73024 * OpenSolaris Note: 5990bcde94Sda73024 * Modified to use GNU/Linux/Solaris calling conventions. 6090bcde94Sda73024 * That is parameters are placed in rdi, rsi, rdx, and rcx, respectively. 
6190bcde94Sda73024 * 6290bcde94Sda73024 * AES_RETURN aes_encrypt(const unsigned char in_blk[], 6390bcde94Sda73024 * unsigned char out_blk[], const aes_encrypt_ctx cx[1])/ 6490bcde94Sda73024 * 6590bcde94Sda73024 * AES_RETURN aes_decrypt(const unsigned char in_blk[], 6690bcde94Sda73024 * unsigned char out_blk[], const aes_decrypt_ctx cx[1])/ 6790bcde94Sda73024 * 6890bcde94Sda73024 * AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[], 6990bcde94Sda73024 * const aes_encrypt_ctx cx[1])/ 7090bcde94Sda73024 * 7190bcde94Sda73024 * AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[], 7290bcde94Sda73024 * const aes_decrypt_ctx cx[1])/ 7390bcde94Sda73024 * 7490bcde94Sda73024 * AES_RETURN aes_encrypt_key(const unsigned char key[], 7590bcde94Sda73024 * unsigned int len, const aes_encrypt_ctx cx[1])/ 7690bcde94Sda73024 * 7790bcde94Sda73024 * AES_RETURN aes_decrypt_key(const unsigned char key[], 7890bcde94Sda73024 * unsigned int len, const aes_decrypt_ctx cx[1])/ 7990bcde94Sda73024 * 8090bcde94Sda73024 * where <NNN> is 128, 192 or 256. In the last two calls the length can be in 8190bcde94Sda73024 * either bits or bytes. 8290bcde94Sda73024 * 8390bcde94Sda73024 * Comment in/out the following lines to obtain the desired subroutines. These 8490bcde94Sda73024 * selections MUST match those in the C header file aesopt.h 8590bcde94Sda73024 */ 8690bcde94Sda73024#define AES_REV_DKS /* define if key decryption schedule is reversed */ 8790bcde94Sda73024 8890bcde94Sda73024#define LAST_ROUND_TABLES /* define for the faster version using extra tables */ 8990bcde94Sda73024 9090bcde94Sda73024/* 9190bcde94Sda73024 * The encryption key schedule has the following in-memory layout where N is the 9290bcde94Sda73024 * number of rounds (10, 12 or 14): 9390bcde94Sda73024 * 9490bcde94Sda73024 * lo: | input key (round 0) | / each round is four 32-bit words 9590bcde94Sda73024 * | encryption round 1 | 9690bcde94Sda73024 * | encryption round 2 | 9790bcde94Sda73024 * .... 
9890bcde94Sda73024 * | encryption round N-1 | 9990bcde94Sda73024 * hi: | encryption round N | 10090bcde94Sda73024 * 10190bcde94Sda73024 * The decryption key schedule is normally set up so that it has the same 10290bcde94Sda73024 * layout as above by actually reversing the order of the encryption key 10390bcde94Sda73024 * schedule in memory (this happens when AES_REV_DKS is set): 10490bcde94Sda73024 * 10590bcde94Sda73024 * lo: | decryption round 0 | = | encryption round N | 10690bcde94Sda73024 * | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ] 10790bcde94Sda73024 * | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ] 10890bcde94Sda73024 * .... .... 10990bcde94Sda73024 * | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ] 11090bcde94Sda73024 * hi: | decryption round N | = | input key (round 0) | 11190bcde94Sda73024 * 11290bcde94Sda73024 * with rounds except the first and last modified using inv_mix_column() 11390bcde94Sda73024 * But if AES_REV_DKS is NOT set the order of keys is left as it is for 11490bcde94Sda73024 * encryption so that it has to be accessed in reverse when used for 11590bcde94Sda73024 * decryption (although the inverse mix column modifications are done) 11690bcde94Sda73024 * 11790bcde94Sda73024 * lo: | decryption round 0 | = | input key (round 0) | 11890bcde94Sda73024 * | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ] 11990bcde94Sda73024 * | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ] 12090bcde94Sda73024 * .... .... 12190bcde94Sda73024 * | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ] 12290bcde94Sda73024 * hi: | decryption round N | = | encryption round N | 12390bcde94Sda73024 * 12490bcde94Sda73024 * This layout is faster when the assembler key scheduling provided here 12590bcde94Sda73024 * is used. 
12690bcde94Sda73024 * 12790bcde94Sda73024 * End of user defines 12890bcde94Sda73024 */ 12990bcde94Sda73024 13090bcde94Sda73024/* 13190bcde94Sda73024 * --------------------------------------------------------------------------- 13290bcde94Sda73024 * OpenSolaris OS modifications 13390bcde94Sda73024 * 13490bcde94Sda73024 * This source originates from Brian Gladman's file aes_amd64.asm 13590bcde94Sda73024 * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip 13690bcde94Sda73024 * with these changes: 13790bcde94Sda73024 * 13890bcde94Sda73024 * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and 13990bcde94Sda73024 * !__GNUC__ ifdefs. Also removed ENCRYPTION, DECRYPTION, 14090bcde94Sda73024 * AES_128, AES_192, AES_256, AES_VAR ifdefs. 14190bcde94Sda73024 * 14290bcde94Sda73024 * 2. Translated yasm/nasm %define and .macro definitions to cpp(1) #define 14390bcde94Sda73024 * 14490bcde94Sda73024 * 3. Translated yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef 14590bcde94Sda73024 * 14690bcde94Sda73024 * 4. Translated Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax 14790bcde94Sda73024 * (operands reversed, literals prefixed with "$", registers prefixed with "%", 14890bcde94Sda73024 * "[register+offset]" addressing changed to "offset(register)", 14990bcde94Sda73024 * parentheses in constant expressions "()" changed to square brackets "[]", 15090bcde94Sda73024 * "." removed from local (numeric) labels, and other changes). 15190bcde94Sda73024 * Examples: 15290bcde94Sda73024 * Intel/yasm/nasm Syntax ATT/OpenSolaris Syntax 15390bcde94Sda73024 * mov rax,(4*20h) mov $[4*0x20],%rax 15490bcde94Sda73024 * mov rax,[ebx+20h] mov 0x20(%ebx),%rax 15590bcde94Sda73024 * lea rax,[ebx+ecx] lea (%ebx,%ecx),%rax 15690bcde94Sda73024 * sub rax,[ebx+ecx*4-20h] sub -0x20(%ebx,%ecx,4),%rax 15790bcde94Sda73024 * 15890bcde94Sda73024 * 5. 
Added OpenSolaris ENTRY_NP/SET_SIZE macros from 159*694c35faSJosef 'Jeff' Sipek * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function 160*694c35faSJosef 'Jeff' Sipek * definitions for lint. 16190bcde94Sda73024 * 16290bcde94Sda73024 * 6. Renamed functions and reordered parameters to match OpenSolaris: 16390bcde94Sda73024 * Original Gladman interface: 16490bcde94Sda73024 * int aes_encrypt(const unsigned char *in, 16590bcde94Sda73024 * unsigned char *out, const aes_encrypt_ctx cx[1])/ 16690bcde94Sda73024 * int aes_decrypt(const unsigned char *in, 16790bcde94Sda73024 * unsigned char *out, const aes_decrypt_ctx cx[1])/ 16890bcde94Sda73024 * Note: aes_encrypt_ctx contains ks, a 60-element array of uint32_t, 16990bcde94Sda73024 * and a union, inf, containing inf.l, a uint32_t and 17090bcde94Sda73024 * inf.b, a 4-element array of uint32_t. Only b[0] in the array (aka "l") is 17190bcde94Sda73024 * used and contains the key schedule length * 16 where key schedule length is 17290bcde94Sda73024 * the number of rounds (10, 12, or 14). 17390bcde94Sda73024 * 17490bcde94Sda73024 * OpenSolaris OS interface: 17554034eb2SDan OpenSolaris Anderson * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr, 17690bcde94Sda73024 * const uint32_t pt[4], uint32_t ct[4])/ 17754034eb2SDan OpenSolaris Anderson * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr, 17890bcde94Sda73024 * const uint32_t ct[4], uint32_t pt[4])/ 17990bcde94Sda73024 * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]/ 18090bcde94Sda73024 * uint32_t ks32[(MAX_AES_NR + 1) * 4]/ } aes_ks_t/ 18190bcde94Sda73024 * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text, 18290bcde94Sda73024 * ct is cipher text, and MAX_AES_NR is 14. 18390bcde94Sda73024 * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64. 
18490bcde94Sda73024 */ 18590bcde94Sda73024 18654034eb2SDan OpenSolaris Anderson#if defined(lint) || defined(__lint) 18754034eb2SDan OpenSolaris Anderson 18854034eb2SDan OpenSolaris Anderson#include <sys/types.h> 18954034eb2SDan OpenSolaris Anderson/* ARGSUSED */ 19054034eb2SDan OpenSolaris Andersonvoid 19154034eb2SDan OpenSolaris Andersonaes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4], 19254034eb2SDan OpenSolaris Anderson uint32_t ct[4]) { 19354034eb2SDan OpenSolaris Anderson} 19454034eb2SDan OpenSolaris Anderson/* ARGSUSED */ 19554034eb2SDan OpenSolaris Andersonvoid 19654034eb2SDan OpenSolaris Andersonaes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4], 19754034eb2SDan OpenSolaris Anderson uint32_t pt[4]) { 19854034eb2SDan OpenSolaris Anderson} 19954034eb2SDan OpenSolaris Anderson 20054034eb2SDan OpenSolaris Anderson 20154034eb2SDan OpenSolaris Anderson#else 20254034eb2SDan OpenSolaris Anderson 20390bcde94Sda73024#include <sys/asm_linkage.h> 20490bcde94Sda73024 20590bcde94Sda73024#define KS_LENGTH 60 20690bcde94Sda73024 20790bcde94Sda73024#define raxd eax 20890bcde94Sda73024#define rdxd edx 20990bcde94Sda73024#define rcxd ecx 21090bcde94Sda73024#define rbxd ebx 21190bcde94Sda73024#define rsid esi 21290bcde94Sda73024#define rdid edi 21390bcde94Sda73024 21490bcde94Sda73024#define raxb al 21590bcde94Sda73024#define rdxb dl 21690bcde94Sda73024#define rcxb cl 21790bcde94Sda73024#define rbxb bl 21890bcde94Sda73024#define rsib sil 21990bcde94Sda73024#define rdib dil 22090bcde94Sda73024 22190bcde94Sda73024/ finite field multiplies by {02}, {04} and {08} 22290bcde94Sda73024 22390bcde94Sda73024#define f2(x) [[x<<1]^[[[x>>7]&1]*0x11b]] 22490bcde94Sda73024#define f4(x) [[x<<2]^[[[x>>6]&1]*0x11b]^[[[x>>6]&2]*0x11b]] 22590bcde94Sda73024#define f8(x) [[x<<3]^[[[x>>5]&1]*0x11b]^[[[x>>5]&2]*0x11b]^[[[x>>5]&4]*0x11b]] 22690bcde94Sda73024 22790bcde94Sda73024/ finite field multiplies required in table generation 22890bcde94Sda73024 
22990bcde94Sda73024#define f3(x) [[f2(x)] ^ [x]] 23090bcde94Sda73024#define f9(x) [[f8(x)] ^ [x]] 23190bcde94Sda73024#define fb(x) [[f8(x)] ^ [f2(x)] ^ [x]] 23290bcde94Sda73024#define fd(x) [[f8(x)] ^ [f4(x)] ^ [x]] 23390bcde94Sda73024#define fe(x) [[f8(x)] ^ [f4(x)] ^ [f2(x)]] 23490bcde94Sda73024 23590bcde94Sda73024/ macros for expanding S-box data 23690bcde94Sda73024 23790bcde94Sda73024#define u8(x) [f2(x)], [x], [x], [f3(x)], [f2(x)], [x], [x], [f3(x)] 23890bcde94Sda73024#define v8(x) [fe(x)], [f9(x)], [fd(x)], [fb(x)], [fe(x)], [f9(x)], [fd(x)], [x] 23990bcde94Sda73024#define w8(x) [x], 0, 0, 0, [x], 0, 0, 0 24090bcde94Sda73024 24190bcde94Sda73024#define enc_vals(x) \ 24290bcde94Sda73024 .byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \ 24390bcde94Sda73024 .byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \ 24490bcde94Sda73024 .byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \ 24590bcde94Sda73024 .byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \ 24690bcde94Sda73024 .byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \ 24790bcde94Sda73024 .byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \ 24890bcde94Sda73024 .byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \ 24990bcde94Sda73024 .byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \ 25090bcde94Sda73024 .byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \ 25190bcde94Sda73024 .byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \ 25290bcde94Sda73024 .byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \ 25390bcde94Sda73024 .byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \ 25490bcde94Sda73024 .byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \ 25590bcde94Sda73024 .byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \ 
25690bcde94Sda73024 .byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \ 25790bcde94Sda73024 .byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \ 25890bcde94Sda73024 .byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \ 25990bcde94Sda73024 .byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \ 26090bcde94Sda73024 .byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \ 26190bcde94Sda73024 .byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \ 26290bcde94Sda73024 .byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \ 26390bcde94Sda73024 .byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \ 26490bcde94Sda73024 .byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \ 26590bcde94Sda73024 .byte x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \ 26690bcde94Sda73024 .byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \ 26790bcde94Sda73024 .byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \ 26890bcde94Sda73024 .byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \ 26990bcde94Sda73024 .byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \ 27090bcde94Sda73024 .byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \ 27190bcde94Sda73024 .byte x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \ 27290bcde94Sda73024 .byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \ 27390bcde94Sda73024 .byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16) 27490bcde94Sda73024 27590bcde94Sda73024#define dec_vals(x) \ 27690bcde94Sda73024 .byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \ 27790bcde94Sda73024 .byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \ 27890bcde94Sda73024 .byte 
x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \ 27990bcde94Sda73024 .byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \ 28090bcde94Sda73024 .byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \ 28190bcde94Sda73024 .byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \ 28290bcde94Sda73024 .byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \ 28390bcde94Sda73024 .byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \ 28490bcde94Sda73024 .byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \ 28590bcde94Sda73024 .byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \ 28690bcde94Sda73024 .byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \ 28790bcde94Sda73024 .byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \ 28890bcde94Sda73024 .byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \ 28990bcde94Sda73024 .byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \ 29090bcde94Sda73024 .byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \ 29190bcde94Sda73024 .byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \ 29290bcde94Sda73024 .byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \ 29390bcde94Sda73024 .byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \ 29490bcde94Sda73024 .byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \ 29590bcde94Sda73024 .byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \ 29690bcde94Sda73024 .byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \ 29790bcde94Sda73024 .byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \ 29890bcde94Sda73024 .byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \ 29990bcde94Sda73024 .byte 
x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \ 30090bcde94Sda73024 .byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \ 30190bcde94Sda73024 .byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \ 30290bcde94Sda73024 .byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \ 30390bcde94Sda73024 .byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \ 30490bcde94Sda73024 .byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \ 30590bcde94Sda73024 .byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \ 30690bcde94Sda73024 .byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \ 30790bcde94Sda73024 .byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d) 30890bcde94Sda73024 30990bcde94Sda73024#define tptr %rbp /* table pointer */ 31090bcde94Sda73024#define kptr %r8 /* key schedule pointer */ 31190bcde94Sda73024#define fofs 128 /* adjust offset in key schedule to keep |disp| < 128 */ 31290bcde94Sda73024#define fk_ref(x, y) -16*x+fofs+4*y(kptr) 31390bcde94Sda73024 31490bcde94Sda73024#ifdef AES_REV_DKS 31590bcde94Sda73024#define rofs 128 31690bcde94Sda73024#define ik_ref(x, y) -16*x+rofs+4*y(kptr) 31790bcde94Sda73024 31890bcde94Sda73024#else 31990bcde94Sda73024#define rofs -128 32090bcde94Sda73024#define ik_ref(x, y) 16*x+rofs+4*y(kptr) 32190bcde94Sda73024#endif /* AES_REV_DKS */ 32290bcde94Sda73024 32390bcde94Sda73024#define tab_0(x) (tptr,x,8) 32490bcde94Sda73024#define tab_1(x) 3(tptr,x,8) 32590bcde94Sda73024#define tab_2(x) 2(tptr,x,8) 32690bcde94Sda73024#define tab_3(x) 1(tptr,x,8) 32790bcde94Sda73024#define tab_f(x) 1(tptr,x,8) 32890bcde94Sda73024#define tab_i(x) 7(tptr,x,8) 32990bcde94Sda73024 33090bcde94Sda73024#define ff_rnd(p1, p2, p3, p4, round) /* normal forward round */ \ 33190bcde94Sda73024 mov fk_ref(round,0), p1; \ 33290bcde94Sda73024 mov fk_ref(round,1), p2; \ 33390bcde94Sda73024 mov fk_ref(round,2), p3; \ 
33490bcde94Sda73024 mov fk_ref(round,3), p4; \ 33590bcde94Sda73024 \ 33690bcde94Sda73024 movzx %al, %esi; \ 33790bcde94Sda73024 movzx %ah, %edi; \ 33890bcde94Sda73024 shr $16, %eax; \ 33990bcde94Sda73024 xor tab_0(%rsi), p1; \ 34090bcde94Sda73024 xor tab_1(%rdi), p4; \ 34190bcde94Sda73024 movzx %al, %esi; \ 34290bcde94Sda73024 movzx %ah, %edi; \ 34390bcde94Sda73024 xor tab_2(%rsi), p3; \ 34490bcde94Sda73024 xor tab_3(%rdi), p2; \ 34590bcde94Sda73024 \ 34690bcde94Sda73024 movzx %bl, %esi; \ 34790bcde94Sda73024 movzx %bh, %edi; \ 34890bcde94Sda73024 shr $16, %ebx; \ 34990bcde94Sda73024 xor tab_0(%rsi), p2; \ 35090bcde94Sda73024 xor tab_1(%rdi), p1; \ 35190bcde94Sda73024 movzx %bl, %esi; \ 35290bcde94Sda73024 movzx %bh, %edi; \ 35390bcde94Sda73024 xor tab_2(%rsi), p4; \ 35490bcde94Sda73024 xor tab_3(%rdi), p3; \ 35590bcde94Sda73024 \ 35690bcde94Sda73024 movzx %cl, %esi; \ 35790bcde94Sda73024 movzx %ch, %edi; \ 35890bcde94Sda73024 shr $16, %ecx; \ 35990bcde94Sda73024 xor tab_0(%rsi), p3; \ 36090bcde94Sda73024 xor tab_1(%rdi), p2; \ 36190bcde94Sda73024 movzx %cl, %esi; \ 36290bcde94Sda73024 movzx %ch, %edi; \ 36390bcde94Sda73024 xor tab_2(%rsi), p1; \ 36490bcde94Sda73024 xor tab_3(%rdi), p4; \ 36590bcde94Sda73024 \ 36690bcde94Sda73024 movzx %dl, %esi; \ 36790bcde94Sda73024 movzx %dh, %edi; \ 36890bcde94Sda73024 shr $16, %edx; \ 36990bcde94Sda73024 xor tab_0(%rsi), p4; \ 37090bcde94Sda73024 xor tab_1(%rdi), p3; \ 37190bcde94Sda73024 movzx %dl, %esi; \ 37290bcde94Sda73024 movzx %dh, %edi; \ 37390bcde94Sda73024 xor tab_2(%rsi), p2; \ 37490bcde94Sda73024 xor tab_3(%rdi), p1; \ 37590bcde94Sda73024 \ 37690bcde94Sda73024 mov p1, %eax; \ 37790bcde94Sda73024 mov p2, %ebx; \ 37890bcde94Sda73024 mov p3, %ecx; \ 37990bcde94Sda73024 mov p4, %edx 38090bcde94Sda73024 38190bcde94Sda73024#ifdef LAST_ROUND_TABLES 38290bcde94Sda73024 38390bcde94Sda73024#define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \ 38490bcde94Sda73024 add $2048, tptr; \ 38590bcde94Sda73024 mov 
fk_ref(round,0), p1; \ 38690bcde94Sda73024 mov fk_ref(round,1), p2; \ 38790bcde94Sda73024 mov fk_ref(round,2), p3; \ 38890bcde94Sda73024 mov fk_ref(round,3), p4; \ 38990bcde94Sda73024 \ 39090bcde94Sda73024 movzx %al, %esi; \ 39190bcde94Sda73024 movzx %ah, %edi; \ 39290bcde94Sda73024 shr $16, %eax; \ 39390bcde94Sda73024 xor tab_0(%rsi), p1; \ 39490bcde94Sda73024 xor tab_1(%rdi), p4; \ 39590bcde94Sda73024 movzx %al, %esi; \ 39690bcde94Sda73024 movzx %ah, %edi; \ 39790bcde94Sda73024 xor tab_2(%rsi), p3; \ 39890bcde94Sda73024 xor tab_3(%rdi), p2; \ 39990bcde94Sda73024 \ 40090bcde94Sda73024 movzx %bl, %esi; \ 40190bcde94Sda73024 movzx %bh, %edi; \ 40290bcde94Sda73024 shr $16, %ebx; \ 40390bcde94Sda73024 xor tab_0(%rsi), p2; \ 40490bcde94Sda73024 xor tab_1(%rdi), p1; \ 40590bcde94Sda73024 movzx %bl, %esi; \ 40690bcde94Sda73024 movzx %bh, %edi; \ 40790bcde94Sda73024 xor tab_2(%rsi), p4; \ 40890bcde94Sda73024 xor tab_3(%rdi), p3; \ 40990bcde94Sda73024 \ 41090bcde94Sda73024 movzx %cl, %esi; \ 41190bcde94Sda73024 movzx %ch, %edi; \ 41290bcde94Sda73024 shr $16, %ecx; \ 41390bcde94Sda73024 xor tab_0(%rsi), p3; \ 41490bcde94Sda73024 xor tab_1(%rdi), p2; \ 41590bcde94Sda73024 movzx %cl, %esi; \ 41690bcde94Sda73024 movzx %ch, %edi; \ 41790bcde94Sda73024 xor tab_2(%rsi), p1; \ 41890bcde94Sda73024 xor tab_3(%rdi), p4; \ 41990bcde94Sda73024 \ 42090bcde94Sda73024 movzx %dl, %esi; \ 42190bcde94Sda73024 movzx %dh, %edi; \ 42290bcde94Sda73024 shr $16, %edx; \ 42390bcde94Sda73024 xor tab_0(%rsi), p4; \ 42490bcde94Sda73024 xor tab_1(%rdi), p3; \ 42590bcde94Sda73024 movzx %dl, %esi; \ 42690bcde94Sda73024 movzx %dh, %edi; \ 42790bcde94Sda73024 xor tab_2(%rsi), p2; \ 42890bcde94Sda73024 xor tab_3(%rdi), p1 42990bcde94Sda73024 43090bcde94Sda73024#else 43190bcde94Sda73024 43290bcde94Sda73024#define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \ 43390bcde94Sda73024 mov fk_ref(round,0), p1; \ 43490bcde94Sda73024 mov fk_ref(round,1), p2; \ 43590bcde94Sda73024 mov fk_ref(round,2), p3; \ 
43690bcde94Sda73024 mov fk_ref(round,3), p4; \ 43790bcde94Sda73024 \ 43890bcde94Sda73024 movzx %al, %esi; \ 43990bcde94Sda73024 movzx %ah, %edi; \ 44090bcde94Sda73024 shr $16, %eax; \ 44190bcde94Sda73024 movzx tab_f(%rsi), %esi; \ 44290bcde94Sda73024 movzx tab_f(%rdi), %edi; \ 44390bcde94Sda73024 xor %esi, p1; \ 44490bcde94Sda73024 rol $8, %edi; \ 44590bcde94Sda73024 xor %edi, p4; \ 44690bcde94Sda73024 movzx %al, %esi; \ 44790bcde94Sda73024 movzx %ah, %edi; \ 44890bcde94Sda73024 movzx tab_f(%rsi), %esi; \ 44990bcde94Sda73024 movzx tab_f(%rdi), %edi; \ 45090bcde94Sda73024 rol $16, %esi; \ 45190bcde94Sda73024 rol $24, %edi; \ 45290bcde94Sda73024 xor %esi, p3; \ 45390bcde94Sda73024 xor %edi, p2; \ 45490bcde94Sda73024 \ 45590bcde94Sda73024 movzx %bl, %esi; \ 45690bcde94Sda73024 movzx %bh, %edi; \ 45790bcde94Sda73024 shr $16, %ebx; \ 45890bcde94Sda73024 movzx tab_f(%rsi), %esi; \ 45990bcde94Sda73024 movzx tab_f(%rdi), %edi; \ 46090bcde94Sda73024 xor %esi, p2; \ 46190bcde94Sda73024 rol $8, %edi; \ 46290bcde94Sda73024 xor %edi, p1; \ 46390bcde94Sda73024 movzx %bl, %esi; \ 46490bcde94Sda73024 movzx %bh, %edi; \ 46590bcde94Sda73024 movzx tab_f(%rsi), %esi; \ 46690bcde94Sda73024 movzx tab_f(%rdi), %edi; \ 46790bcde94Sda73024 rol $16, %esi; \ 46890bcde94Sda73024 rol $24, %edi; \ 46990bcde94Sda73024 xor %esi, p4; \ 47090bcde94Sda73024 xor %edi, p3; \ 47190bcde94Sda73024 \ 47290bcde94Sda73024 movzx %cl, %esi; \ 47390bcde94Sda73024 movzx %ch, %edi; \ 47490bcde94Sda73024 movzx tab_f(%rsi), %esi; \ 47590bcde94Sda73024 movzx tab_f(%rdi), %edi; \ 47690bcde94Sda73024 shr $16, %ecx; \ 47790bcde94Sda73024 xor %esi, p3; \ 47890bcde94Sda73024 rol $8, %edi; \ 47990bcde94Sda73024 xor %edi, p2; \ 48090bcde94Sda73024 movzx %cl, %esi; \ 48190bcde94Sda73024 movzx %ch, %edi; \ 48290bcde94Sda73024 movzx tab_f(%rsi), %esi; \ 48390bcde94Sda73024 movzx tab_f(%rdi), %edi; \ 48490bcde94Sda73024 rol $16, %esi; \ 48590bcde94Sda73024 rol $24, %edi; \ 48690bcde94Sda73024 xor %esi, p1; \ 
48790bcde94Sda73024 xor %edi, p4; \ 48890bcde94Sda73024 \ 48990bcde94Sda73024 movzx %dl, %esi; \ 49090bcde94Sda73024 movzx %dh, %edi; \ 49190bcde94Sda73024 movzx tab_f(%rsi), %esi; \ 49290bcde94Sda73024 movzx tab_f(%rdi), %edi; \ 49390bcde94Sda73024 shr $16, %edx; \ 49490bcde94Sda73024 xor %esi, p4; \ 49590bcde94Sda73024 rol $8, %edi; \ 49690bcde94Sda73024 xor %edi, p3; \ 49790bcde94Sda73024 movzx %dl, %esi; \ 49890bcde94Sda73024 movzx %dh, %edi; \ 49990bcde94Sda73024 movzx tab_f(%rsi), %esi; \ 50090bcde94Sda73024 movzx tab_f(%rdi), %edi; \ 50190bcde94Sda73024 rol $16, %esi; \ 50290bcde94Sda73024 rol $24, %edi; \ 50390bcde94Sda73024 xor %esi, p2; \ 50490bcde94Sda73024 xor %edi, p1 50590bcde94Sda73024 50690bcde94Sda73024#endif /* LAST_ROUND_TABLES */ 50790bcde94Sda73024 50890bcde94Sda73024#define ii_rnd(p1, p2, p3, p4, round) /* normal inverse round */ \ 50990bcde94Sda73024 mov ik_ref(round,0), p1; \ 51090bcde94Sda73024 mov ik_ref(round,1), p2; \ 51190bcde94Sda73024 mov ik_ref(round,2), p3; \ 51290bcde94Sda73024 mov ik_ref(round,3), p4; \ 51390bcde94Sda73024 \ 51490bcde94Sda73024 movzx %al, %esi; \ 51590bcde94Sda73024 movzx %ah, %edi; \ 51690bcde94Sda73024 shr $16, %eax; \ 51790bcde94Sda73024 xor tab_0(%rsi), p1; \ 51890bcde94Sda73024 xor tab_1(%rdi), p2; \ 51990bcde94Sda73024 movzx %al, %esi; \ 52090bcde94Sda73024 movzx %ah, %edi; \ 52190bcde94Sda73024 xor tab_2(%rsi), p3; \ 52290bcde94Sda73024 xor tab_3(%rdi), p4; \ 52390bcde94Sda73024 \ 52490bcde94Sda73024 movzx %bl, %esi; \ 52590bcde94Sda73024 movzx %bh, %edi; \ 52690bcde94Sda73024 shr $16, %ebx; \ 52790bcde94Sda73024 xor tab_0(%rsi), p2; \ 52890bcde94Sda73024 xor tab_1(%rdi), p3; \ 52990bcde94Sda73024 movzx %bl, %esi; \ 53090bcde94Sda73024 movzx %bh, %edi; \ 53190bcde94Sda73024 xor tab_2(%rsi), p4; \ 53290bcde94Sda73024 xor tab_3(%rdi), p1; \ 53390bcde94Sda73024 \ 53490bcde94Sda73024 movzx %cl, %esi; \ 53590bcde94Sda73024 movzx %ch, %edi; \ 53690bcde94Sda73024 shr $16, %ecx; \ 53790bcde94Sda73024 xor 
tab_0(%rsi), p3; \ 53890bcde94Sda73024 xor tab_1(%rdi), p4; \ 53990bcde94Sda73024 movzx %cl, %esi; \ 54090bcde94Sda73024 movzx %ch, %edi; \ 54190bcde94Sda73024 xor tab_2(%rsi), p1; \ 54290bcde94Sda73024 xor tab_3(%rdi), p2; \ 54390bcde94Sda73024 \ 54490bcde94Sda73024 movzx %dl, %esi; \ 54590bcde94Sda73024 movzx %dh, %edi; \ 54690bcde94Sda73024 shr $16, %edx; \ 54790bcde94Sda73024 xor tab_0(%rsi), p4; \ 54890bcde94Sda73024 xor tab_1(%rdi), p1; \ 54990bcde94Sda73024 movzx %dl, %esi; \ 55090bcde94Sda73024 movzx %dh, %edi; \ 55190bcde94Sda73024 xor tab_2(%rsi), p2; \ 55290bcde94Sda73024 xor tab_3(%rdi), p3; \ 55390bcde94Sda73024 \ 55490bcde94Sda73024 mov p1, %eax; \ 55590bcde94Sda73024 mov p2, %ebx; \ 55690bcde94Sda73024 mov p3, %ecx; \ 55790bcde94Sda73024 mov p4, %edx 55890bcde94Sda73024 55990bcde94Sda73024#ifdef LAST_ROUND_TABLES 56090bcde94Sda73024 56190bcde94Sda73024#define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \ 56290bcde94Sda73024 add $2048, tptr; \ 56390bcde94Sda73024 mov ik_ref(round,0), p1; \ 56490bcde94Sda73024 mov ik_ref(round,1), p2; \ 56590bcde94Sda73024 mov ik_ref(round,2), p3; \ 56690bcde94Sda73024 mov ik_ref(round,3), p4; \ 56790bcde94Sda73024 \ 56890bcde94Sda73024 movzx %al, %esi; \ 56990bcde94Sda73024 movzx %ah, %edi; \ 57090bcde94Sda73024 shr $16, %eax; \ 57190bcde94Sda73024 xor tab_0(%rsi), p1; \ 57290bcde94Sda73024 xor tab_1(%rdi), p2; \ 57390bcde94Sda73024 movzx %al, %esi; \ 57490bcde94Sda73024 movzx %ah, %edi; \ 57590bcde94Sda73024 xor tab_2(%rsi), p3; \ 57690bcde94Sda73024 xor tab_3(%rdi), p4; \ 57790bcde94Sda73024 \ 57890bcde94Sda73024 movzx %bl, %esi; \ 57990bcde94Sda73024 movzx %bh, %edi; \ 58090bcde94Sda73024 shr $16, %ebx; \ 58190bcde94Sda73024 xor tab_0(%rsi), p2; \ 58290bcde94Sda73024 xor tab_1(%rdi), p3; \ 58390bcde94Sda73024 movzx %bl, %esi; \ 58490bcde94Sda73024 movzx %bh, %edi; \ 58590bcde94Sda73024 xor tab_2(%rsi), p4; \ 58690bcde94Sda73024 xor tab_3(%rdi), p1; \ 58790bcde94Sda73024 \ 58890bcde94Sda73024 movzx %cl, 
%esi; \
58990bcde94Sda73024 movzx %ch, %edi; \
59090bcde94Sda73024 shr $16, %ecx; \
59190bcde94Sda73024 xor tab_0(%rsi), p3; \
59290bcde94Sda73024 xor tab_1(%rdi), p4; \
59390bcde94Sda73024 movzx %cl, %esi; \
59490bcde94Sda73024 movzx %ch, %edi; \
59590bcde94Sda73024 xor tab_2(%rsi), p1; \
59690bcde94Sda73024 xor tab_3(%rdi), p2; \
59790bcde94Sda73024 \
59890bcde94Sda73024 movzx %dl, %esi; \
59990bcde94Sda73024 movzx %dh, %edi; \
60090bcde94Sda73024 shr $16, %edx; \
60190bcde94Sda73024 xor tab_0(%rsi), p4; \
60290bcde94Sda73024 xor tab_1(%rdi), p1; \
60390bcde94Sda73024 movzx %dl, %esi; \
60490bcde94Sda73024 movzx %dh, %edi; \
60590bcde94Sda73024 xor tab_2(%rsi), p2; \
60690bcde94Sda73024 xor tab_3(%rdi), p3
60790bcde94Sda73024
60890bcde94Sda73024#else
60990bcde94Sda73024
/*
 * il_rnd(p1, p2, p3, p4, round) -- last inverse round, single-table variant.
 * This is the #else arm of the conditional that the "#endif" below labels
 * LAST_ROUND_TABLES.
 *
 * Inputs:  the four state words in %eax, %ebx, %ecx and %edx.
 * Outputs: p1..p4. Each is first loaded from ik_ref(round, 0..3) and then
 *          XORed with four substituted state bytes.
 *
 * Every state byte indexes tab_i through a zero-extending load and the
 * result is rotated left by 0, 8, 16 or 24 bits, matching the position
 * (byte 0, 1, 2 or 3) the byte had in its source word, before it is XORed
 * into its destination:
 *
 *     source    byte 0   byte 1   byte 2   byte 3
 *     %eax  ->    p1       p2       p3       p4
 *     %ebx  ->    p2       p3       p4       p1
 *     %ecx  ->    p3       p4       p1       p2
 *     %edx  ->    p4       p1       p2       p3
 *
 * Side effects: %esi and %edi are used as index/scratch registers, and each
 * of %eax..%edx is shifted right by 16, so the input words do not survive.
 *
 * ik_ref and tab_i are defined outside this excerpt.
 * NOTE(review): the movzx loads from tab_i carry no size suffix; the
 * 8/16/24-bit rotations suggest one-byte entries -- confirm against the
 * definition of tab_i.
 */
61090bcde94Sda73024#define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \
61190bcde94Sda73024 mov ik_ref(round,0), p1; /* seed the outputs with ik_ref(round, 0..3) */ \
61290bcde94Sda73024 mov ik_ref(round,1), p2; \
61390bcde94Sda73024 mov ik_ref(round,2), p3; \
61490bcde94Sda73024 mov ik_ref(round,3), p4; \
61590bcde94Sda73024 /* %eax: bytes 0..3 go to p1, p2, p3, p4 */ \
61690bcde94Sda73024 movzx %al, %esi; \
61790bcde94Sda73024 movzx %ah, %edi; \
61890bcde94Sda73024 movzx tab_i(%rsi), %esi; \
61990bcde94Sda73024 movzx tab_i(%rdi), %edi; \
62090bcde94Sda73024 shr $16, %eax; /* expose bytes 2 and 3 as %al / %ah */ \
62190bcde94Sda73024 xor %esi, p1; \
62290bcde94Sda73024 rol $8, %edi; \
62390bcde94Sda73024 xor %edi, p2; \
62490bcde94Sda73024 movzx %al, %esi; \
62590bcde94Sda73024 movzx %ah, %edi; \
62690bcde94Sda73024 movzx tab_i(%rsi), %esi; \
62790bcde94Sda73024 movzx tab_i(%rdi), %edi; \
62890bcde94Sda73024 rol $16, %esi; \
62990bcde94Sda73024 rol $24, %edi; \
63090bcde94Sda73024 xor %esi, p3; \
63190bcde94Sda73024 xor %edi, p4; \
63290bcde94Sda73024 /* %ebx: bytes 0..3 go to p2, p3, p4, p1 */ \
63390bcde94Sda73024 movzx %bl, %esi; \
63490bcde94Sda73024 movzx %bh, %edi; \
63590bcde94Sda73024 movzx tab_i(%rsi), %esi; \
63690bcde94Sda73024 movzx tab_i(%rdi), %edi; \
63790bcde94Sda73024 shr $16, %ebx; \
63890bcde94Sda73024 xor %esi, p2; \
63990bcde94Sda73024 rol $8, %edi; \
64090bcde94Sda73024 xor %edi, p3; \
64190bcde94Sda73024 movzx %bl, %esi; \
64290bcde94Sda73024 movzx %bh, %edi; \
64390bcde94Sda73024 movzx tab_i(%rsi), %esi; \
64490bcde94Sda73024 movzx tab_i(%rdi), %edi; \
64590bcde94Sda73024 rol $16, %esi; \
64690bcde94Sda73024 rol $24, %edi; \
64790bcde94Sda73024 xor %esi, p4; \
64890bcde94Sda73024 xor %edi, p1; \
64990bcde94Sda73024 /* %ecx: bytes 0..3 go to p3, p4, p1, p2 */ \
65090bcde94Sda73024 movzx %cl, %esi; \
65190bcde94Sda73024 movzx %ch, %edi; \
65290bcde94Sda73024 movzx tab_i(%rsi), %esi; \
65390bcde94Sda73024 movzx tab_i(%rdi), %edi; \
65490bcde94Sda73024 shr $16, %ecx; \
65590bcde94Sda73024 xor %esi, p3; \
65690bcde94Sda73024 rol $8, %edi; \
65790bcde94Sda73024 xor %edi, p4; \
65890bcde94Sda73024 movzx %cl, %esi; \
65990bcde94Sda73024 movzx %ch, %edi; \
66090bcde94Sda73024 movzx tab_i(%rsi), %esi; \
66190bcde94Sda73024 movzx tab_i(%rdi), %edi; \
66290bcde94Sda73024 rol $16, %esi; \
66390bcde94Sda73024 rol $24, %edi; \
66490bcde94Sda73024 xor %esi, p1; \
66590bcde94Sda73024 xor %edi, p2; \
66690bcde94Sda73024 /* %edx: bytes 0..3 go to p4, p1, p2, p3 */ \
66790bcde94Sda73024 movzx %dl, %esi; \
66890bcde94Sda73024 movzx %dh, %edi; \
66990bcde94Sda73024 movzx tab_i(%rsi), %esi; \
67090bcde94Sda73024 movzx tab_i(%rdi), %edi; \
67190bcde94Sda73024 shr $16, %edx; \
67290bcde94Sda73024 xor %esi, p4; \
67390bcde94Sda73024 rol $8, %edi; \
67490bcde94Sda73024 xor %edi, p1; \
67590bcde94Sda73024 movzx %dl, %esi; \
67690bcde94Sda73024 movzx %dh, %edi; \
67790bcde94Sda73024 movzx tab_i(%rsi), %esi; \
67890bcde94Sda73024 movzx tab_i(%rdi), %edi; \
67990bcde94Sda73024 rol $16, %esi; \
68090bcde94Sda73024 rol $24, %edi; \
68190bcde94Sda73024 xor %esi, p2; \
68290bcde94Sda73024 xor %edi, p3
68390bcde94Sda73024
68490bcde94Sda73024#endif /* LAST_ROUND_TABLES */
68590bcde94Sda73024
68690bcde94Sda73024/*
68790bcde94Sda73024 * OpenSolaris OS:
68854034eb2SDan OpenSolaris Anderson * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
68990bcde94Sda73024 * const uint32_t pt[4], uint32_t ct[4])/
69090bcde94Sda73024 *
69190bcde94Sda73024 * Original interface:
69290bcde94Sda73024 * int aes_encrypt(const unsigned char *in,
69390bcde94Sda73024 * unsigned char *out, const aes_encrypt_ctx cx[1])/
69490bcde94Sda73024 */
/*
 * enc_tab: 64-byte-aligned table emitted by enc_vals(u8), followed by
 * enc_vals(w8) when LAST_ROUND_TABLES is defined. aes_encrypt_amd64 loads
 * its address into tptr. enc_vals, u8 and w8 are defined outside this
 * excerpt.
 */
69590bcde94Sda73024 .align 64
69690bcde94Sda73024enc_tab:
69790bcde94Sda73024 enc_vals(u8)
69890bcde94Sda73024#ifdef LAST_ROUND_TABLES
69990bcde94Sda73024 / Last Round Tables:
70090bcde94Sda73024 enc_vals(w8)
70190bcde94Sda73024#endif
70290bcde94Sda73024
70390bcde94Sda73024
/*
 * aes_encrypt_amd64 -- encrypt one 16-byte block.
 *
 * Register use on entry, default (OpenSolaris) interface:
 *   %rdi  P1  key schedule / context   copied to %r8
 *   %esi  P2  Nr                       shifted left by 4, i.e. Nr * 16
 *   %rdx  P3  input block pointer      copied to %rdi
 *   %rcx  P4  output block pointer     spilled to 0(%rsp)
 * With GLADMAN_INTERFACE: %rdi = input, %rsi = output (spilled to 0(%rsp)),
 * %rdx = context (copied to %r8), and %esi is loaded from
 * 4*KS_LENGTH(kptr) instead of being passed in.
 *
 * Stack frame (4*8 bytes):
 *   0(%rsp) output pointer, 8(%rsp) %rbx, 16(%rsp) %rbp, 24(%rsp) %r12.
 *
 * Flow: the four input words are XORed with the four words at the start of
 * the key schedule, kptr is advanced by %rsi, and %esi selects the entry
 * point into the unrolled rounds:
 *   14*16 -> label 1 (ff_rnd 13..1, then fl_rnd 0)
 *   12*16 -> label 2 (ff_rnd 11..1, then fl_rnd 0)
 *   10*16 -> label 3 (ff_rnd  9..1, then fl_rnd 0)
 * Any other value sets %rax to -1 and jumps to the register restore at
 * label 4 without writing the output block. On the normal path the result
 * words in %r9d..%r12d are stored to the output block and %rax is zeroed.
 *
 * NOTE(review): the existing "/ byte key length * 16" comments do not match
 * the code -- the value compared is 10, 12 or 14 times 16, which is the
 * number of round macros executed (Nr), not a key length in bytes.
 * NOTE(review): the prototype in the header comment says void, yet the code
 * leaves 0 or -1 in %rax -- confirm what callers rely on.
 * kptr, tptr, fofs, ff_rnd and fl_rnd are defined outside this excerpt;
 * kptr is presumably %r8 (the "context in r8" comments) -- confirm.
 */
70454034eb2SDan OpenSolaris Anderson ENTRY_NP(aes_encrypt_amd64)
70590bcde94Sda73024#ifdef GLADMAN_INTERFACE
70690bcde94Sda73024 / Original interface
70790bcde94Sda73024 sub $[4*8], %rsp / gnu/linux/opensolaris binary interface
70890bcde94Sda73024 mov %rsi, (%rsp) / output pointer (P2)
70990bcde94Sda73024 mov %rdx, %r8 / context (P3)
71090bcde94Sda73024
71190bcde94Sda73024 mov %rbx, 1*8(%rsp) / P1: input pointer in rdi
71290bcde94Sda73024 mov %rbp, 2*8(%rsp) / P2: output pointer in (rsp)
71390bcde94Sda73024 mov %r12, 3*8(%rsp) / P3: context in r8
71490bcde94Sda73024 movzx 4*KS_LENGTH(kptr), %esi / Get byte key length * 16
71590bcde94Sda73024
71690bcde94Sda73024#else
71790bcde94Sda73024 / OpenSolaris OS interface
71890bcde94Sda73024 sub $[4*8], %rsp / Make room on stack to save registers
71990bcde94Sda73024 mov %rcx, (%rsp) / Save output pointer (P4) on stack
72090bcde94Sda73024 mov %rdi, %r8 / context (P1)
72190bcde94Sda73024 mov %rdx, %rdi / P3: save input pointer
72290bcde94Sda73024 shl $4, %esi / P2: esi byte key length * 16
72390bcde94Sda73024
72490bcde94Sda73024 mov %rbx, 1*8(%rsp) / Save registers
72590bcde94Sda73024 mov %rbp, 2*8(%rsp)
72690bcde94Sda73024 mov %r12, 3*8(%rsp)
72790bcde94Sda73024 / P1: context in r8
72890bcde94Sda73024 / P2: byte key length * 16 in esi
72990bcde94Sda73024 / P3: input pointer in rdi
73090bcde94Sda73024 / P4: output pointer in (rsp)
73190bcde94Sda73024#endif /* GLADMAN_INTERFACE */
73290bcde94Sda73024
/* tptr = table base; kptr is biased by -fofs so fofs(kptr) is the schedule start */
73390bcde94Sda73024 lea enc_tab(%rip), tptr
73490bcde94Sda73024 sub $fofs, kptr
73590bcde94Sda73024
73690bcde94Sda73024 / Load input block into registers
73790bcde94Sda73024 mov (%rdi), %eax
73890bcde94Sda73024 mov 1*4(%rdi), %ebx
73990bcde94Sda73024 mov 2*4(%rdi), %ecx
74090bcde94Sda73024 mov 3*4(%rdi), %edx
74190bcde94Sda73024
/* XOR the state with the first four 32-bit words of the key schedule */
74290bcde94Sda73024 xor fofs(kptr), %eax
74390bcde94Sda73024 xor fofs+4(kptr), %ebx
74490bcde94Sda73024 xor fofs+8(kptr), %ecx
74590bcde94Sda73024 xor fofs+12(kptr), %edx
74690bcde94Sda73024
/* advance kptr by %rsi bytes; lea leaves the flags untouched */
74790bcde94Sda73024 lea (kptr,%rsi), kptr
74890bcde94Sda73024 / Jump based on byte key length * 16:
74990bcde94Sda73024 cmp $[10*16], %esi
75090bcde94Sda73024 je 3f
75190bcde94Sda73024 cmp $[12*16], %esi
75290bcde94Sda73024 je 2f
75390bcde94Sda73024 cmp $[14*16], %esi
75490bcde94Sda73024 je 1f
75590bcde94Sda73024 mov $-1, %rax / error
75690bcde94Sda73024 jmp 4f
75790bcde94Sda73024
/* labels 1, 2 and 3 are fall-through entry points into one unrolled sequence */
75890bcde94Sda73024 / Perform normal forward rounds
75990bcde94Sda730241: ff_rnd(%r9d, %r10d, %r11d, %r12d, 13)
76090bcde94Sda73024 ff_rnd(%r9d, %r10d, %r11d, %r12d, 12)
76190bcde94Sda730242: ff_rnd(%r9d, %r10d, %r11d, %r12d, 11)
76290bcde94Sda73024 ff_rnd(%r9d, %r10d, %r11d, %r12d, 10)
76390bcde94Sda730243: ff_rnd(%r9d, %r10d, %r11d, %r12d, 9)
76490bcde94Sda73024 ff_rnd(%r9d, %r10d, %r11d, %r12d, 8)
76590bcde94Sda73024 ff_rnd(%r9d, %r10d, %r11d, %r12d, 7)
76690bcde94Sda73024 ff_rnd(%r9d, %r10d, %r11d, %r12d, 6)
76790bcde94Sda73024 ff_rnd(%r9d, %r10d, %r11d, %r12d, 5)
76890bcde94Sda73024 ff_rnd(%r9d, %r10d, %r11d, %r12d, 4)
76990bcde94Sda73024 ff_rnd(%r9d, %r10d, %r11d, %r12d, 3)
77090bcde94Sda73024 ff_rnd(%r9d, %r10d, %r11d, %r12d, 2)
77190bcde94Sda73024 ff_rnd(%r9d, %r10d, %r11d, %r12d, 1)
77290bcde94Sda73024 fl_rnd(%r9d, %r10d, %r11d, %r12d, 0)
77390bcde94Sda73024
/* %rbx is reused for the output pointer; its saved value is reloaded at label 4 */
77490bcde94Sda73024 / Copy results
77590bcde94Sda73024 mov (%rsp), %rbx
77690bcde94Sda73024 mov %r9d, (%rbx)
77790bcde94Sda73024 mov %r10d, 4(%rbx)
77890bcde94Sda73024 mov %r11d, 8(%rbx)
77990bcde94Sda73024 mov %r12d, 12(%rbx)
78090bcde94Sda73024 xor %rax, %rax
78190bcde94Sda730244: / Restore registers
78290bcde94Sda73024 mov 1*8(%rsp), %rbx
78390bcde94Sda73024 mov 2*8(%rsp), %rbp
78490bcde94Sda73024 mov 3*8(%rsp), %r12
78590bcde94Sda73024 add $[4*8], %rsp
78690bcde94Sda73024 ret
78790bcde94Sda73024
78854034eb2SDan OpenSolaris Anderson SET_SIZE(aes_encrypt_amd64)
78990bcde94Sda73024
79090bcde94Sda73024/*
79190bcde94Sda73024 * OpenSolaris OS:
79254034eb2SDan OpenSolaris Anderson * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
79390bcde94Sda73024 * const uint32_t ct[4], uint32_t pt[4]);
79490bcde94Sda73024 *
79590bcde94Sda73024 * Original interface:
79690bcde94Sda73024 * int aes_decrypt(const unsigned char *in,
79790bcde94Sda73024 * unsigned char *out, const aes_encrypt_ctx cx[1]);
79890bcde94Sda73024 */
/*
 * dec_tab: 64-byte-aligned table emitted by dec_vals(v8), followed by
 * dec_vals(w8) when LAST_ROUND_TABLES is defined. aes_decrypt_amd64 loads
 * its address into tptr. dec_vals, v8 and w8 are defined outside this
 * excerpt.
 */
79990bcde94Sda73024 .align 64
80090bcde94Sda73024dec_tab:
80190bcde94Sda73024 dec_vals(v8)
80290bcde94Sda73024#ifdef LAST_ROUND_TABLES
80390bcde94Sda73024 / Last Round Tables:
80490bcde94Sda73024 dec_vals(w8)
80590bcde94Sda73024#endif
80690bcde94Sda73024
80790bcde94Sda73024
/*
 * aes_decrypt_amd64 -- decrypt one 16-byte block.
 *
 * Register use on entry, default (OpenSolaris) interface:
 *   %rdi  P1  key schedule / context   copied to %r8
 *   %esi  P2  Nr                       shifted left by 4, i.e. Nr * 16
 *   %rdx  P3  input block pointer      copied to %rdi
 *   %rcx  P4  output block pointer     spilled to 0(%rsp)
 * With GLADMAN_INTERFACE: %rdi = input, %rsi = output (spilled to 0(%rsp)),
 * %rdx = context (copied to %r8), and %esi is loaded from
 * 4*KS_LENGTH(kptr) instead of being passed in.
 *
 * Stack frame (4*8 bytes):
 *   0(%rsp) output pointer, 8(%rsp) %rbx, 16(%rsp) %rbp, 24(%rsp) %r12.
 *
 * Flow: the four input words are XORed with four key-schedule words (which
 * end of the schedule depends on AES_REV_DKS, see below), then %esi selects
 * the entry point into the unrolled rounds:
 *   14*16 -> label 1 (ii_rnd 13..1, then il_rnd 0)
 *   12*16 -> label 2 (ii_rnd 11..1, then il_rnd 0)
 *   10*16 -> label 3 (ii_rnd  9..1, then il_rnd 0)
 * Any other value sets %rax to -1 and jumps to the register restore at
 * label 4 without writing the output block. On the normal path the result
 * words in %r9d..%r12d are stored to the output block and %rax is zeroed.
 *
 * NOTE(review): without AES_REV_DKS the key words at schedule + Nr * 16 are
 * read before %esi has been checked against the three accepted values, so
 * an invalid Nr is dereferenced before it is rejected.
 * NOTE(review): the existing "/ byte key length * 16" comments do not match
 * the code -- the value compared is 10, 12 or 14 times 16, which is the
 * number of round macros executed (Nr), not a key length in bytes.
 * NOTE(review): the prototype in the header comment says void, yet the code
 * leaves 0 or -1 in %rax -- confirm what callers rely on.
 * kptr, tptr, rofs and ii_rnd are defined outside this excerpt; kptr is
 * presumably %r8 (the "context in r8" comments) -- confirm.
 */
80854034eb2SDan OpenSolaris Anderson ENTRY_NP(aes_decrypt_amd64)
80990bcde94Sda73024#ifdef GLADMAN_INTERFACE
81090bcde94Sda73024 / Original interface
81190bcde94Sda73024 sub $[4*8], %rsp / gnu/linux/opensolaris binary interface
81290bcde94Sda73024 mov %rsi, (%rsp) / output pointer (P2)
81390bcde94Sda73024 mov %rdx, %r8 / context (P3)
81490bcde94Sda73024
81590bcde94Sda73024 mov %rbx, 1*8(%rsp) / P1: input pointer in rdi
81690bcde94Sda73024 mov %rbp, 2*8(%rsp) / P2: output pointer in (rsp)
81790bcde94Sda73024 mov %r12, 3*8(%rsp) / P3: context in r8
81890bcde94Sda73024 movzx 4*KS_LENGTH(kptr), %esi / Get byte key length * 16
81990bcde94Sda73024
82090bcde94Sda73024#else
82190bcde94Sda73024 / OpenSolaris OS interface
82290bcde94Sda73024 sub $[4*8], %rsp / Make room on stack to save registers
82390bcde94Sda73024 mov %rcx, (%rsp) / Save output pointer (P4) on stack
82490bcde94Sda73024 mov %rdi, %r8 / context (P1)
82590bcde94Sda73024 mov %rdx, %rdi / P3: save input pointer
82690bcde94Sda73024 shl $4, %esi / P2: esi byte key length * 16
82790bcde94Sda73024
82890bcde94Sda73024 mov %rbx, 1*8(%rsp) / Save registers
82990bcde94Sda73024 mov %rbp, 2*8(%rsp)
83090bcde94Sda73024 mov %r12, 3*8(%rsp)
83190bcde94Sda73024 / P1: context in r8
83290bcde94Sda73024 / P2: byte key length * 16 in esi
83390bcde94Sda73024 / P3: input pointer in rdi
83490bcde94Sda73024 / P4: output pointer in (rsp)
83590bcde94Sda73024#endif /* GLADMAN_INTERFACE */
83690bcde94Sda73024
/* tptr = table base; kptr is biased by -rofs so rofs(kptr) is the schedule start */
83790bcde94Sda73024 lea dec_tab(%rip), tptr
83890bcde94Sda73024 sub $rofs, kptr
83990bcde94Sda73024
84090bcde94Sda73024 / Load input block into registers
84190bcde94Sda73024 mov (%rdi), %eax
84290bcde94Sda73024 mov 1*4(%rdi), %ebx
84390bcde94Sda73024 mov 2*4(%rdi), %ecx
84490bcde94Sda73024 mov 3*4(%rdi), %edx
84590bcde94Sda73024
/*
 * The input words are already in registers, so %rdi is reused to address
 * the key words for the XOR below. With AES_REV_DKS they come from the
 * start of the schedule and kptr advances by %rsi; otherwise they come
 * from schedule + %rsi and kptr stays where it is.
 */
84690bcde94Sda73024#ifdef AES_REV_DKS
84790bcde94Sda73024 mov kptr, %rdi
84890bcde94Sda73024 lea (kptr,%rsi), kptr
84990bcde94Sda73024#else
85090bcde94Sda73024 lea (kptr,%rsi), %rdi
85190bcde94Sda73024#endif
85290bcde94Sda73024
85390bcde94Sda73024 xor rofs(%rdi), %eax
85490bcde94Sda73024 xor rofs+4(%rdi), %ebx
85590bcde94Sda73024 xor rofs+8(%rdi), %ecx
85690bcde94Sda73024 xor rofs+12(%rdi), %edx
85790bcde94Sda73024
85890bcde94Sda73024 / Jump based on byte key length * 16:
85990bcde94Sda73024 cmp $[10*16], %esi
86090bcde94Sda73024 je 3f
86190bcde94Sda73024 cmp $[12*16], %esi
86290bcde94Sda73024 je 2f
86390bcde94Sda73024 cmp $[14*16], %esi
86490bcde94Sda73024 je 1f
86590bcde94Sda73024 mov $-1, %rax / error
86690bcde94Sda73024 jmp 4f
86790bcde94Sda73024
/* labels 1, 2 and 3 are fall-through entry points into one unrolled sequence */
86890bcde94Sda73024 / Perform normal inverse rounds
86990bcde94Sda730241: ii_rnd(%r9d, %r10d, %r11d, %r12d, 13)
87090bcde94Sda73024 ii_rnd(%r9d, %r10d, %r11d, %r12d, 12)
87190bcde94Sda730242: ii_rnd(%r9d, %r10d, %r11d, %r12d, 11)
87290bcde94Sda73024 ii_rnd(%r9d, %r10d, %r11d, %r12d, 10)
87390bcde94Sda730243: ii_rnd(%r9d, %r10d, %r11d, %r12d, 9)
87490bcde94Sda73024 ii_rnd(%r9d, %r10d, %r11d, %r12d, 8)
87590bcde94Sda73024 ii_rnd(%r9d, %r10d, %r11d, %r12d, 7)
87690bcde94Sda73024 ii_rnd(%r9d, %r10d, %r11d, %r12d, 6)
87790bcde94Sda73024 ii_rnd(%r9d, %r10d, %r11d, %r12d, 5)
87890bcde94Sda73024 ii_rnd(%r9d, %r10d, %r11d, %r12d, 4)
87990bcde94Sda73024 ii_rnd(%r9d, %r10d, %r11d, %r12d, 3)
88090bcde94Sda73024 ii_rnd(%r9d, %r10d, %r11d, %r12d, 2)
88190bcde94Sda73024 ii_rnd(%r9d, %r10d, %r11d, %r12d, 1)
88290bcde94Sda73024 il_rnd(%r9d, %r10d, %r11d, %r12d, 0)
88390bcde94Sda73024
/* %rbx is reused for the output pointer; its saved value is reloaded at label 4 */
88490bcde94Sda73024 / Copy results
88590bcde94Sda73024 mov (%rsp), %rbx
88690bcde94Sda73024 mov %r9d, (%rbx)
88790bcde94Sda73024 mov %r10d, 4(%rbx)
88890bcde94Sda73024 mov %r11d, 8(%rbx)
88990bcde94Sda73024 mov %r12d, 12(%rbx)
89090bcde94Sda73024 xor %rax, %rax
89190bcde94Sda730244: / Restore registers
89290bcde94Sda73024 mov 1*8(%rsp), %rbx
89390bcde94Sda73024 mov 2*8(%rsp), %rbp
89490bcde94Sda73024 mov 3*8(%rsp), %r12
89590bcde94Sda73024 add $[4*8], %rsp
89690bcde94Sda73024 ret
89790bcde94Sda73024
89854034eb2SDan OpenSolaris Anderson SET_SIZE(aes_decrypt_amd64)
89954034eb2SDan OpenSolaris Anderson#endif /* lint || __lint */
900