xref: /titanic_51/usr/src/common/crypto/aes/amd64/aes_amd64.s (revision 694c35faa87b858ecdadfe4fc592615f4eefbb07)
190bcde94Sda73024/*
290bcde94Sda73024 * ---------------------------------------------------------------------------
390bcde94Sda73024 * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
490bcde94Sda73024 *
590bcde94Sda73024 * LICENSE TERMS
690bcde94Sda73024 *
790bcde94Sda73024 * The free distribution and use of this software is allowed (with or without
890bcde94Sda73024 * changes) provided that:
990bcde94Sda73024 *
1090bcde94Sda73024 *  1. source code distributions include the above copyright notice, this
1190bcde94Sda73024 *     list of conditions and the following disclaimer;
1290bcde94Sda73024 *
1390bcde94Sda73024 *  2. binary distributions include the above copyright notice, this list
1490bcde94Sda73024 *     of conditions and the following disclaimer in their documentation;
1590bcde94Sda73024 *
1690bcde94Sda73024 *  3. the name of the copyright holder is not used to endorse products
1790bcde94Sda73024 *     built using this software without specific written permission.
1890bcde94Sda73024 *
1990bcde94Sda73024 * DISCLAIMER
2090bcde94Sda73024 *
2190bcde94Sda73024 * This software is provided 'as is' with no explicit or implied warranties
2290bcde94Sda73024 * in respect of its properties, including, but not limited to, correctness
2390bcde94Sda73024 * and/or fitness for purpose.
2490bcde94Sda73024 * ---------------------------------------------------------------------------
2590bcde94Sda73024 * Issue 20/12/2007
2690bcde94Sda73024 *
2790bcde94Sda73024 * I am grateful to Dag Arne Osvik for many discussions of the techniques that
2890bcde94Sda73024 * can be used to optimise AES assembler code on AMD64/EM64T architectures.
2990bcde94Sda73024 * Some of the techniques used in this implementation are the result of
3090bcde94Sda73024 * suggestions made by him for which I am most grateful.
3190bcde94Sda73024 *
 * An AES implementation for AMD64 processors using the YASM assembler.  This
 * implementation provides only encryption and decryption, and hence requires
 * key scheduling support in C.  It uses 8k bytes of tables, but its
 * encryption and decryption performance is very close to that obtained using
 * large tables.
3690bcde94Sda73024 * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions,
3790bcde94Sda73024 * which are as follows:
3890bcde94Sda73024 *               ms windows  gnu/linux/opensolaris os
3990bcde94Sda73024 *
4090bcde94Sda73024 *   in_blk          rcx     rdi
4190bcde94Sda73024 *   out_blk         rdx     rsi
4290bcde94Sda73024 *   context (cx)     r8     rdx
4390bcde94Sda73024 *
4490bcde94Sda73024 *   preserved       rsi      -    + rbx, rbp, rsp, r12, r13, r14 & r15
4590bcde94Sda73024 *   registers       rdi      -      on both
4690bcde94Sda73024 *
4790bcde94Sda73024 *   destroyed        -      rsi   + rax, rcx, rdx, r8, r9, r10 & r11
4890bcde94Sda73024 *   registers        -      rdi     on both
4990bcde94Sda73024 *
5090bcde94Sda73024 * The convention used here is that for gnu/linux/opensolaris os.
5190bcde94Sda73024 *
5290bcde94Sda73024 * This code provides the standard AES block size (128 bits, 16 bytes) and the
5390bcde94Sda73024 * three standard AES key sizes (128, 192 and 256 bits). It has the same call
5490bcde94Sda73024 * interface as my C implementation.  It uses the Microsoft C AMD64 calling
5590bcde94Sda73024 * conventions in which the three parameters are placed in  rcx, rdx and r8
5690bcde94Sda73024 * respectively.  The rbx, rsi, rdi, rbp and r12..r15 registers are preserved.
5790bcde94Sda73024 *
5890bcde94Sda73024 * OpenSolaris Note:
5990bcde94Sda73024 * Modified to use GNU/Linux/Solaris calling conventions.
6090bcde94Sda73024 * That is parameters are placed in rdi, rsi, rdx, and rcx, respectively.
6190bcde94Sda73024 *
 *     AES_RETURN aes_encrypt(const unsigned char in_blk[],
 *                   unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
 *
 *     AES_RETURN aes_decrypt(const unsigned char in_blk[],
 *                   unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
 *
 *     AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
 *                                            const aes_encrypt_ctx cx[1]);
 *
 *     AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
 *                                            const aes_decrypt_ctx cx[1]);
 *
 *     AES_RETURN aes_encrypt_key(const unsigned char key[],
 *                           unsigned int len, const aes_encrypt_ctx cx[1]);
 *
 *     AES_RETURN aes_decrypt_key(const unsigned char key[],
 *                           unsigned int len, const aes_decrypt_ctx cx[1]);
 *
 * where <NNN> is 128, 192 or 256.  In the last two calls the length can be in
 * either bits or bytes.
8290bcde94Sda73024 *
8390bcde94Sda73024 * Comment in/out the following lines to obtain the desired subroutines. These
8490bcde94Sda73024 * selections MUST match those in the C header file aesopt.h
8590bcde94Sda73024 */
/*
 * Build options.
 *
 * AES_REV_DKS selects which ik_ref()/rofs definitions are used to address
 * the decryption key schedule.  LAST_ROUND_TABLES selects the fl_rnd()/
 * il_rnd() variants that step the table pointer forward by 2048 bytes
 * instead of doing byte lookups followed by rotates.
 */
#define	AES_REV_DKS	  /* define if key decryption schedule is reversed */

#define	LAST_ROUND_TABLES /* define for the faster version using extra tables */
8990bcde94Sda73024
9090bcde94Sda73024/*
9190bcde94Sda73024 * The encryption key schedule has the following in memory layout where N is the
9290bcde94Sda73024 * number of rounds (10, 12 or 14):
9390bcde94Sda73024 *
9490bcde94Sda73024 * lo: | input key (round 0)  |  / each round is four 32-bit words
9590bcde94Sda73024 *     | encryption round 1   |
9690bcde94Sda73024 *     | encryption round 2   |
9790bcde94Sda73024 *     ....
9890bcde94Sda73024 *     | encryption round N-1 |
9990bcde94Sda73024 * hi: | encryption round N   |
10090bcde94Sda73024 *
10190bcde94Sda73024 * The decryption key schedule is normally set up so that it has the same
10290bcde94Sda73024 * layout as above by actually reversing the order of the encryption key
10390bcde94Sda73024 * schedule in memory (this happens when AES_REV_DKS is set):
10490bcde94Sda73024 *
10590bcde94Sda73024 * lo: | decryption round 0   | =              | encryption round N   |
10690bcde94Sda73024 *     | decryption round 1   | = INV_MIX_COL[ | encryption round N-1 | ]
10790bcde94Sda73024 *     | decryption round 2   | = INV_MIX_COL[ | encryption round N-2 | ]
10890bcde94Sda73024 *     ....                       ....
10990bcde94Sda73024 *     | decryption round N-1 | = INV_MIX_COL[ | encryption round 1   | ]
11090bcde94Sda73024 * hi: | decryption round N   | =              | input key (round 0)  |
11190bcde94Sda73024 *
11290bcde94Sda73024 * with rounds except the first and last modified using inv_mix_column()
11390bcde94Sda73024 * But if AES_REV_DKS is NOT set the order of keys is left as it is for
11490bcde94Sda73024 * encryption so that it has to be accessed in reverse when used for
11590bcde94Sda73024 * decryption (although the inverse mix column modifications are done)
11690bcde94Sda73024 *
11790bcde94Sda73024 * lo: | decryption round 0   | =              | input key (round 0)  |
11890bcde94Sda73024 *     | decryption round 1   | = INV_MIX_COL[ | encryption round 1   | ]
11990bcde94Sda73024 *     | decryption round 2   | = INV_MIX_COL[ | encryption round 2   | ]
12090bcde94Sda73024 *     ....                       ....
12190bcde94Sda73024 *     | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
12290bcde94Sda73024 * hi: | decryption round N   | =              | encryption round N   |
12390bcde94Sda73024 *
12490bcde94Sda73024 * This layout is faster when the assembler key scheduling provided here
12590bcde94Sda73024 * is used.
12690bcde94Sda73024 *
12790bcde94Sda73024 * End of user defines
12890bcde94Sda73024 */
12990bcde94Sda73024
13090bcde94Sda73024/*
13190bcde94Sda73024 * ---------------------------------------------------------------------------
13290bcde94Sda73024 * OpenSolaris OS modifications
13390bcde94Sda73024 *
13490bcde94Sda73024 * This source originates from Brian Gladman file aes_amd64.asm
13590bcde94Sda73024 * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip
13690bcde94Sda73024 * with these changes:
13790bcde94Sda73024 *
13890bcde94Sda73024 * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and
13990bcde94Sda73024 * !__GNUC__ ifdefs.  Also removed ENCRYPTION, DECRYPTION,
14090bcde94Sda73024 * AES_128, AES_192, AES_256, AES_VAR ifdefs.
14190bcde94Sda73024 *
14290bcde94Sda73024 * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define
14390bcde94Sda73024 *
14490bcde94Sda73024 * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef
14590bcde94Sda73024 *
 * 4. Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax
 * (operands reversed, literals prefixed with "$", registers prefixed with "%",
 * "[register+offset]" addressing changed to "offset(register)",
 * parentheses in constant expressions "()" changed to square brackets "[]",
 * "." removed from local (numeric) labels, and other changes).
15190bcde94Sda73024 * Examples:
15290bcde94Sda73024 * Intel/yasm/nasm Syntax	ATT/OpenSolaris Syntax
15390bcde94Sda73024 * mov	rax,(4*20h)		mov	$[4*0x20],%rax
15490bcde94Sda73024 * mov	rax,[ebx+20h]		mov	0x20(%ebx),%rax
15590bcde94Sda73024 * lea	rax,[ebx+ecx]		lea	(%ebx,%ecx),%rax
15690bcde94Sda73024 * sub	rax,[ebx+ecx*4-20h]	sub	-0x20(%ebx,%ecx,4),%rax
15790bcde94Sda73024 *
15890bcde94Sda73024 * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
159*694c35faSJosef 'Jeff' Sipek * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function
160*694c35faSJosef 'Jeff' Sipek * definitions for lint.
16190bcde94Sda73024 *
16290bcde94Sda73024 * 6. Renamed functions and reordered parameters to match OpenSolaris:
 * Original Gladman interface:
 *	int aes_encrypt(const unsigned char *in,
 *		unsigned char *out, const aes_encrypt_ctx cx[1]);
 *	int aes_decrypt(const unsigned char *in,
 *		unsigned char *out, const aes_decrypt_ctx cx[1]);
 * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t,
 * and a union type, inf., containing inf.l, a uint32_t and
 * inf.b, a 4-element array of uint32_t.  Only b[0] in the array (aka "l") is
 * used and contains the number of rounds * 16, where the number of rounds is
 * 10, 12, or 14.
 *
 * OpenSolaris OS interface:
 *	void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
 *		const uint32_t pt[4], uint32_t ct[4]);
 *	void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
 *		const uint32_t ct[4], uint32_t pt[4]);
 *	typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4];
 *		 uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t;
18190bcde94Sda73024 * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
18290bcde94Sda73024 * ct is crypto text, and MAX_AES_NR is 14.
18390bcde94Sda73024 * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
18490bcde94Sda73024 */
18590bcde94Sda73024
18654034eb2SDan OpenSolaris Anderson#if defined(lint) || defined(__lint)
18754034eb2SDan OpenSolaris Anderson
#include <sys/types.h>

/*
 * Empty C definitions of the two entry points.  They sit on the lint side
 * of the surrounding lint/__lint conditional, so they only give lint a
 * prototype to check callers against; they perform no work.
 *
 * rk - key schedule words
 * Nr - number of rounds
 * pt - plain text block (four 32-bit words)
 * ct - crypto text block (four 32-bit words)
 */
/* ARGSUSED */
void
aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4],
	uint32_t ct[4]) {
}
/* ARGSUSED */
void
aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4],
	uint32_t pt[4]) {
}
19954034eb2SDan OpenSolaris Anderson
20054034eb2SDan OpenSolaris Anderson
20154034eb2SDan OpenSolaris Anderson#else
20254034eb2SDan OpenSolaris Anderson
#include <sys/asm_linkage.h>

/*
 * Number of 32-bit words in the key schedule array (the "ks" member of
 * Gladman's aes_encrypt_ctx is described as a 60 element uint32_t array).
 */
#define	KS_LENGTH	60

/*
 * Sub-register aliases carried over from the yasm source: the "d" suffix
 * names the 32-bit view of a 64-bit register ...
 */
#define	raxd		eax
#define	rdxd		edx
#define	rcxd		ecx
#define	rbxd		ebx
#define	rsid		esi
#define	rdid		edi

/* ... and the "b" suffix names its low 8-bit view. */
#define	raxb		al
#define	rdxb		dl
#define	rcxb		cl
#define	rbxb		bl
#define	rsib		sil
#define	rdib		dil
22090bcde94Sda73024
/*
 * Finite field multiplies by {02}, {04} and {08}.
 *
 * Each macro shifts the byte left by 1, 2 or 3 bits and then, for every
 * bit that was shifted out of the low eight, XORs in the matching multiple
 * of 0x11b (the AES field polynomial x^8 + x^4 + x^3 + x + 1) so that the
 * result is reduced back to eight bits.  Square brackets are used for
 * grouping, as in the rest of this file.
 *
 * The argument is substituted textually without extra grouping, so pass a
 * plain constant (as enc_vals()/dec_vals() do).
 */

#define	f2(x) [[x<<1]^[[[x>>7]&1]*0x11b]]
#define	f4(x) [[x<<2]^[[[x>>6]&1]*0x11b]^[[[x>>6]&2]*0x11b]]
#define	f8(x) [[x<<3]^[[[x>>5]&1]*0x11b]^[[[x>>5]&2]*0x11b]^[[[x>>5]&4]*0x11b]]

/*
 * Finite field multiplies required in table generation: {03}, {09}, {0b},
 * {0d} and {0e}, each formed by XORing together the power-of-two
 * multiples above (and x itself where the low bit is set).
 */

#define	f3(x) [[f2(x)] ^ [x]]
#define	f9(x) [[f8(x)] ^ [x]]
#define	fb(x) [[f8(x)] ^ [f2(x)] ^ [x]]
#define	fd(x) [[f8(x)] ^ [f4(x)] ^ [x]]
#define	fe(x) [[f8(x)] ^ [f4(x)] ^ [f2(x)]]
23490bcde94Sda73024
/*
 * Macros for expanding S-box data.
 *
 * Each macro turns one S-box value x into the eight bytes of one table
 * entry (table entries are indexed with a scale of 8 by the tab_N()
 * macros):
 *
 *   u8(x)  {02}x, x, x, {03}x, {02}x, x, x, {03}x
 *   v8(x)  {0e}x, {09}x, {0d}x, {0b}x, {0e}x, {09}x, {0d}x, x
 *   w8(x)  x, 0, 0, 0, x, 0, 0, 0
 *
 * Byte 1 of a u8() entry and byte 7 of a v8() entry hold the plain value
 * x; these are the offsets used by tab_f() and tab_i() respectively.
 * Which expander is applied to which value list is decided where the
 * tables are emitted (not in this part of the file).
 */

#define	u8(x) [f2(x)], [x], [x], [f3(x)], [f2(x)], [x], [x], [f3(x)]
#define	v8(x) [fe(x)], [f9(x)], [fd(x)], [fb(x)], [fe(x)], [f9(x)], [fd(x)], [x]
#define	w8(x) [x], 0, 0, 0, [x], 0, 0, 0
24090bcde94Sda73024
/*
 * enc_vals(x) -- emit the 256 values of the AES forward S-box, in index
 * order, as 32 .byte directives of eight values each.  Every value is
 * passed through the macro named by x (for example one of the 8-byte
 * expanders), so the amount of data emitted per value depends on x.
 */
#define	enc_vals(x)	\
   .byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \
   .byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \
   .byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \
   .byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \
   .byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \
   .byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \
   .byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \
   .byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \
   .byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \
   .byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \
   .byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \
   .byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \
   .byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \
   .byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \
   .byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \
   .byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \
   .byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \
   .byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \
   .byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \
   .byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \
   .byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \
   .byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \
   .byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \
   .byte x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \
   .byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \
   .byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \
   .byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \
   .byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \
   .byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \
   .byte x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \
   .byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \
   .byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16)
27490bcde94Sda73024
/*
 * dec_vals(x) -- emit the 256 values of the AES inverse S-box, in index
 * order, as 32 .byte directives of eight values each.  Every value is
 * passed through the macro named by x (for example one of the 8-byte
 * expanders), so the amount of data emitted per value depends on x.
 */
#define	dec_vals(x) \
   .byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \
   .byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \
   .byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \
   .byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \
   .byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \
   .byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \
   .byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \
   .byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \
   .byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \
   .byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \
   .byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \
   .byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \
   .byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \
   .byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \
   .byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \
   .byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \
   .byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \
   .byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \
   .byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \
   .byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \
   .byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \
   .byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \
   .byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \
   .byte x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \
   .byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \
   .byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \
   .byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \
   .byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \
   .byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \
   .byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \
   .byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \
   .byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d)
30890bcde94Sda73024
/*
 * Register roles and key schedule addressing used by the round macros.
 *
 * fk_ref(x, y) and ik_ref(x, y) expand to a kptr-relative memory operand
 * for 32-bit word y (4 bytes each) of the 16-byte round key selected by x:
 *
 *   fk_ref:                  fofs - 16*x + 4*y
 *   ik_ref (AES_REV_DKS):    rofs - 16*x + 4*y
 *   ik_ref (otherwise):      rofs + 16*x + 4*y
 *
 * x and y are substituted textually with no grouping, so callers must pass
 * plain constants.  How kptr is biased and how x is numbered is set up by
 * the code that uses these macros.
 */
#define	tptr	%rbp	/* table pointer */
#define	kptr	%r8	/* key schedule pointer */
#define	fofs	128	/* adjust offset in key schedule to keep |disp| < 128 */
#define	fk_ref(x, y)	-16*x+fofs+4*y(kptr)

#ifdef	AES_REV_DKS
#define	rofs		128
#define	ik_ref(x, y)	-16*x+rofs+4*y(kptr)

#else
#define	rofs		-128
#define	ik_ref(x, y)	16*x+rofs+4*y(kptr)
#endif	/* AES_REV_DKS */
32290bcde94Sda73024
/*
 * Table operands.  x is a 64-bit index register; entries are 8 bytes apart
 * (scale 8) starting at tptr.  tab_0..tab_3 are used as 32-bit XOR sources
 * by the round macros and address the entry at byte offsets 0, 3, 2 and 1.
 * For entries laid out by u8()/v8() those four windows are byte rotations
 * of one another.  tab_f and tab_i (offsets 1 and 7) are used with movzx in
 * the non-LAST_ROUND_TABLES last-round macros.
 */
#define	tab_0(x)	(tptr,x,8)
#define	tab_1(x)	3(tptr,x,8)
#define	tab_2(x)	2(tptr,x,8)
#define	tab_3(x)	1(tptr,x,8)
#define	tab_f(x)	1(tptr,x,8)
#define	tab_i(x)	7(tptr,x,8)
32990bcde94Sda73024
/*
 * ff_rnd(p1, p2, p3, p4, round) -- one normal forward round.
 *
 * In:       %eax, %ebx, %ecx, %edx  the four 32-bit input words
 *           kptr, tptr              key schedule and table pointers
 * Out:      %eax, %ebx, %ecx, %edx  the four output words (copied from
 *                                   p1..p4 at the end)
 * Clobbers: p1..p4 (32-bit registers chosen by the caller), %rsi, %rdi
 *
 * p1..p4 are first loaded with the four key words fk_ref(round, 0..3).
 * Each input word is then split into its four bytes (low byte, second
 * byte, and the same two again after a 16-bit right shift); every byte
 * indexes the table through tab_0..tab_3 and the looked-up word is XORed
 * into one of p1..p4.
 */
#define	ff_rnd(p1, p2, p3, p4, round)	/* normal forward round */ \
	mov	fk_ref(round,0), p1; \
	mov	fk_ref(round,1), p2; \
	mov	fk_ref(round,2), p3; \
	mov	fk_ref(round,3), p4; \
	/* bytes of %eax feed p1, p4, p3, p2 */ \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	shr	$16, %eax; \
	xor	tab_0(%rsi), p1; \
	xor	tab_1(%rdi), p4; \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	xor	tab_2(%rsi), p3; \
	xor	tab_3(%rdi), p2; \
	/* bytes of %ebx feed p2, p1, p4, p3 */ \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	shr	$16, %ebx; \
	xor	tab_0(%rsi), p2; \
	xor	tab_1(%rdi), p1; \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	xor	tab_2(%rsi), p4; \
	xor	tab_3(%rdi), p3; \
	/* bytes of %ecx feed p3, p2, p1, p4 */ \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	shr	$16, %ecx; \
	xor	tab_0(%rsi), p3; \
	xor	tab_1(%rdi), p2; \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	xor	tab_2(%rsi), p1; \
	xor	tab_3(%rdi), p4; \
	/* bytes of %edx feed p4, p3, p2, p1 */ \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	shr	$16, %edx; \
	xor	tab_0(%rsi), p4; \
	xor	tab_1(%rdi), p3; \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	xor	tab_2(%rsi), p2; \
	xor	tab_3(%rdi), p1; \
	/* hand the result back in the input registers */ \
	mov	p1, %eax; \
	mov	p2, %ebx; \
	mov	p3, %ecx; \
	mov	p4, %edx
38090bcde94Sda73024
/*
 * fl_rnd(p1, p2, p3, p4, round) -- last forward round.
 *
 * In:       %eax, %ebx, %ecx, %edx  the four 32-bit input words
 *           kptr, tptr              key schedule and table pointers
 * Out:      p1, p2, p3, p4          the four output words (NOT copied back
 *                                   to %eax..%edx, unlike ff_rnd)
 * Clobbers: %eax, %ebx, %ecx, %edx (shifted right by 16), %rsi, %rdi
 *
 * The byte-to-output routing is the same as in ff_rnd.  Two variants:
 *
 *  - LAST_ROUND_TABLES: tptr is first advanced by 2048 bytes (256 entries
 *    of 8 bytes) and left there; the lookups are then 32-bit XORs through
 *    tab_0..tab_3.  Presumably a dedicated last-round table sits directly
 *    after the main one -- confirm against the table definitions.
 *
 *  - otherwise: each lookup is a single-byte load through tab_f that is
 *    rotated left by 0, 8, 16 or 24 bits into position before the XOR.
 */
#ifdef	LAST_ROUND_TABLES

#define	fl_rnd(p1, p2, p3, p4, round)	/* last forward round */ \
	add	$2048, tptr; \
	mov	fk_ref(round,0), p1; \
	mov	fk_ref(round,1), p2; \
	mov	fk_ref(round,2), p3; \
	mov	fk_ref(round,3), p4; \
	/* bytes of %eax feed p1, p4, p3, p2 */ \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	shr	$16, %eax; \
	xor	tab_0(%rsi), p1; \
	xor	tab_1(%rdi), p4; \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	xor	tab_2(%rsi), p3; \
	xor	tab_3(%rdi), p2; \
	/* bytes of %ebx feed p2, p1, p4, p3 */ \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	shr	$16, %ebx; \
	xor	tab_0(%rsi), p2; \
	xor	tab_1(%rdi), p1; \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	xor	tab_2(%rsi), p4; \
	xor	tab_3(%rdi), p3; \
	/* bytes of %ecx feed p3, p2, p1, p4 */ \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	shr	$16, %ecx; \
	xor	tab_0(%rsi), p3; \
	xor	tab_1(%rdi), p2; \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	xor	tab_2(%rsi), p1; \
	xor	tab_3(%rdi), p4; \
	/* bytes of %edx feed p4, p3, p2, p1 */ \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	shr	$16, %edx; \
	xor	tab_0(%rsi), p4; \
	xor	tab_1(%rdi), p3; \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	xor	tab_2(%rsi), p2; \
	xor	tab_3(%rdi), p1

#else

#define	fl_rnd(p1, p2, p3, p4, round)	/* last forward round */ \
	mov	fk_ref(round,0), p1; \
	mov	fk_ref(round,1), p2; \
	mov	fk_ref(round,2), p3; \
	mov	fk_ref(round,3), p4; \
	/* bytes of %eax feed p1, p4 (<<8), p3 (<<16), p2 (<<24) */ \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	shr	$16, %eax; \
	movzx	tab_f(%rsi), %esi; \
	movzx	tab_f(%rdi), %edi; \
	xor	%esi, p1; \
	rol	$8, %edi; \
	xor	%edi, p4; \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	movzx	tab_f(%rsi), %esi; \
	movzx	tab_f(%rdi), %edi; \
	rol	$16, %esi; \
	rol	$24, %edi; \
	xor	%esi, p3; \
	xor	%edi, p2; \
	/* bytes of %ebx feed p2, p1 (<<8), p4 (<<16), p3 (<<24) */ \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	shr	$16, %ebx; \
	movzx	tab_f(%rsi), %esi; \
	movzx	tab_f(%rdi), %edi; \
	xor	%esi, p2; \
	rol	$8, %edi; \
	xor	%edi, p1; \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	movzx	tab_f(%rsi), %esi; \
	movzx	tab_f(%rdi), %edi; \
	rol	$16, %esi; \
	rol	$24, %edi; \
	xor	%esi, p4; \
	xor	%edi, p3; \
	/* bytes of %ecx feed p3, p2 (<<8), p1 (<<16), p4 (<<24) */ \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	movzx	tab_f(%rsi), %esi; \
	movzx	tab_f(%rdi), %edi; \
	shr	$16, %ecx; \
	xor	%esi, p3; \
	rol	$8, %edi; \
	xor	%edi, p2; \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	movzx	tab_f(%rsi), %esi; \
	movzx	tab_f(%rdi), %edi; \
	rol	$16, %esi; \
	rol	$24, %edi; \
	xor	%esi, p1; \
	xor	%edi, p4; \
	/* bytes of %edx feed p4, p3 (<<8), p2 (<<16), p1 (<<24) */ \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	movzx	tab_f(%rsi), %esi; \
	movzx	tab_f(%rdi), %edi; \
	shr	$16, %edx; \
	xor	%esi, p4; \
	rol	$8, %edi; \
	xor	%edi, p3; \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	movzx	tab_f(%rsi), %esi; \
	movzx	tab_f(%rdi), %edi; \
	rol	$16, %esi; \
	rol	$24, %edi; \
	xor	%esi, p2; \
	xor	%edi, p1

#endif	/* LAST_ROUND_TABLES */
50790bcde94Sda73024
/*
 * ii_rnd(p1, p2, p3, p4, round) -- one normal inverse round.
 *
 * In:       %eax, %ebx, %ecx, %edx  the four 32-bit input words
 *           kptr, tptr              key schedule and table pointers
 * Out:      %eax, %ebx, %ecx, %edx  the four output words (copied from
 *                                   p1..p4 at the end)
 * Clobbers: p1..p4 (32-bit registers chosen by the caller), %rsi, %rdi
 *
 * Same structure as the forward round, but the key words come from
 * ik_ref(round, 0..3) and the bytes of each input word are routed to the
 * outputs in the opposite rotation direction (compare the per-group notes
 * below with those in ff_rnd).
 */
#define	ii_rnd(p1, p2, p3, p4, round)	/* normal inverse round */ \
	mov	ik_ref(round,0), p1; \
	mov	ik_ref(round,1), p2; \
	mov	ik_ref(round,2), p3; \
	mov	ik_ref(round,3), p4; \
	/* bytes of %eax feed p1, p2, p3, p4 */ \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	shr	$16, %eax; \
	xor	tab_0(%rsi), p1; \
	xor	tab_1(%rdi), p2; \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	xor	tab_2(%rsi), p3; \
	xor	tab_3(%rdi), p4; \
	/* bytes of %ebx feed p2, p3, p4, p1 */ \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	shr	$16, %ebx; \
	xor	tab_0(%rsi), p2; \
	xor	tab_1(%rdi), p3; \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	xor	tab_2(%rsi), p4; \
	xor	tab_3(%rdi), p1; \
	/* bytes of %ecx feed p3, p4, p1, p2 */ \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	shr	$16, %ecx; \
	xor	tab_0(%rsi), p3; \
	xor	tab_1(%rdi), p4; \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	xor	tab_2(%rsi), p1; \
	xor	tab_3(%rdi), p2; \
	/* bytes of %edx feed p4, p1, p2, p3 */ \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	shr	$16, %edx; \
	xor	tab_0(%rsi), p4; \
	xor	tab_1(%rdi), p1; \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	xor	tab_2(%rsi), p2; \
	xor	tab_3(%rdi), p3; \
	/* hand the result back in the input registers */ \
	mov	p1, %eax; \
	mov	p2, %ebx; \
	mov	p3, %ecx; \
	mov	p4, %edx
55890bcde94Sda73024
55990bcde94Sda73024#ifdef	LAST_ROUND_TABLES
56090bcde94Sda73024
/*
 * il_rnd(p1, p2, p3, p4, round) -- last inverse round, table variant.
 *
 * In:       %eax, %ebx, %ecx, %edx  the four 32-bit input words
 *           kptr, tptr              key schedule and table pointers
 * Out:      p1, p2, p3, p4          the four output words (NOT copied back
 *                                   to %eax..%edx, unlike ii_rnd)
 * Clobbers: %eax, %ebx, %ecx, %edx (shifted right by 16), %rsi, %rdi
 *
 * tptr is first advanced by 2048 bytes (256 entries of 8 bytes) and left
 * there.  Presumably a dedicated last-round table sits directly after the
 * main one -- confirm against the table definitions.  The lookups and
 * byte routing are otherwise the same as in ii_rnd.
 */
#define	il_rnd(p1, p2, p3, p4, round)	/* last inverse round */ \
	add	$2048, tptr; \
	mov	ik_ref(round,0), p1; \
	mov	ik_ref(round,1), p2; \
	mov	ik_ref(round,2), p3; \
	mov	ik_ref(round,3), p4; \
	/* bytes of %eax feed p1, p2, p3, p4 */ \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	shr	$16, %eax; \
	xor	tab_0(%rsi), p1; \
	xor	tab_1(%rdi), p2; \
	movzx	%al, %esi; \
	movzx	%ah, %edi; \
	xor	tab_2(%rsi), p3; \
	xor	tab_3(%rdi), p4; \
	/* bytes of %ebx feed p2, p3, p4, p1 */ \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	shr	$16, %ebx; \
	xor	tab_0(%rsi), p2; \
	xor	tab_1(%rdi), p3; \
	movzx	%bl, %esi; \
	movzx	%bh, %edi; \
	xor	tab_2(%rsi), p4; \
	xor	tab_3(%rdi), p1; \
	/* bytes of %ecx feed p3, p4, p1, p2 */ \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	shr	$16, %ecx; \
	xor	tab_0(%rsi), p3; \
	xor	tab_1(%rdi), p4; \
	movzx	%cl, %esi; \
	movzx	%ch, %edi; \
	xor	tab_2(%rsi), p1; \
	xor	tab_3(%rdi), p2; \
	/* bytes of %edx feed p4, p1, p2, p3 */ \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	shr	$16, %edx; \
	xor	tab_0(%rsi), p4; \
	xor	tab_1(%rdi), p1; \
	movzx	%dl, %esi; \
	movzx	%dh, %edi; \
	xor	tab_2(%rsi), p2; \
	xor	tab_3(%rdi), p3
60790bcde94Sda73024
60890bcde94Sda73024#else
60990bcde94Sda73024
61090bcde94Sda73024#define	il_rnd(p1, p2, p3, p4, round)	/* last inverse round */ \
61190bcde94Sda73024	mov	ik_ref(round,0), p1; \
61290bcde94Sda73024	mov	ik_ref(round,1), p2; \
61390bcde94Sda73024	mov	ik_ref(round,2), p3; \
61490bcde94Sda73024	mov	ik_ref(round,3), p4; \
61590bcde94Sda73024 \
61690bcde94Sda73024	movzx	%al, %esi; \
61790bcde94Sda73024	movzx	%ah, %edi; \
61890bcde94Sda73024	movzx	tab_i(%rsi), %esi; \
61990bcde94Sda73024	movzx	tab_i(%rdi), %edi; \
62090bcde94Sda73024	shr	$16, %eax; \
62190bcde94Sda73024	xor	%esi, p1; \
62290bcde94Sda73024	rol	$8, %edi; \
62390bcde94Sda73024	xor	%edi, p2; \
62490bcde94Sda73024	movzx	%al, %esi; \
62590bcde94Sda73024	movzx	%ah, %edi; \
62690bcde94Sda73024	movzx	tab_i(%rsi), %esi; \
62790bcde94Sda73024	movzx	tab_i(%rdi), %edi; \
62890bcde94Sda73024	rol	$16, %esi; \
62990bcde94Sda73024	rol	$24, %edi; \
63090bcde94Sda73024	xor	%esi, p3; \
63190bcde94Sda73024	xor	%edi, p4; \
63290bcde94Sda73024 \
63390bcde94Sda73024	movzx	%bl, %esi; \
63490bcde94Sda73024	movzx	%bh, %edi; \
63590bcde94Sda73024	movzx	tab_i(%rsi), %esi; \
63690bcde94Sda73024	movzx	tab_i(%rdi), %edi; \
63790bcde94Sda73024	shr	$16, %ebx; \
63890bcde94Sda73024	xor	%esi, p2; \
63990bcde94Sda73024	rol	$8, %edi; \
64090bcde94Sda73024	xor	%edi, p3; \
64190bcde94Sda73024	movzx	%bl, %esi; \
64290bcde94Sda73024	movzx	%bh, %edi; \
64390bcde94Sda73024	movzx	tab_i(%rsi), %esi; \
64490bcde94Sda73024	movzx	tab_i(%rdi), %edi; \
64590bcde94Sda73024	rol	$16, %esi; \
64690bcde94Sda73024	rol	$24, %edi; \
64790bcde94Sda73024	xor	%esi, p4; \
64890bcde94Sda73024	xor	%edi, p1; \
64990bcde94Sda73024 \
65090bcde94Sda73024	movzx	%cl, %esi; \
65190bcde94Sda73024	movzx	%ch, %edi; \
65290bcde94Sda73024	movzx	tab_i(%rsi), %esi; \
65390bcde94Sda73024	movzx	tab_i(%rdi), %edi; \
65490bcde94Sda73024	shr	$16, %ecx; \
65590bcde94Sda73024	xor	%esi, p3; \
65690bcde94Sda73024	rol	$8, %edi; \
65790bcde94Sda73024	xor	%edi, p4; \
65890bcde94Sda73024	movzx	%cl, %esi; \
65990bcde94Sda73024	movzx	%ch, %edi; \
66090bcde94Sda73024	movzx	tab_i(%rsi), %esi; \
66190bcde94Sda73024	movzx	tab_i(%rdi), %edi; \
66290bcde94Sda73024	rol	$16, %esi; \
66390bcde94Sda73024	rol	$24, %edi; \
66490bcde94Sda73024	xor	%esi, p1; \
66590bcde94Sda73024	xor	%edi, p2; \
66690bcde94Sda73024 \
66790bcde94Sda73024	movzx	%dl, %esi; \
66890bcde94Sda73024	movzx	%dh, %edi; \
66990bcde94Sda73024	movzx	tab_i(%rsi), %esi; \
67090bcde94Sda73024	movzx	tab_i(%rdi), %edi; \
67190bcde94Sda73024	shr	$16, %edx; \
67290bcde94Sda73024	xor	%esi, p4; \
67390bcde94Sda73024	rol	$8, %edi; \
67490bcde94Sda73024	xor	%edi, p1; \
67590bcde94Sda73024	movzx	%dl, %esi; \
67690bcde94Sda73024	movzx	%dh, %edi; \
67790bcde94Sda73024	movzx	tab_i(%rsi), %esi; \
67890bcde94Sda73024	movzx	tab_i(%rdi), %edi; \
67990bcde94Sda73024	rol	$16, %esi; \
68090bcde94Sda73024	rol	$24, %edi; \
68190bcde94Sda73024	xor	%esi, p2; \
68290bcde94Sda73024	xor	%edi, p3
68390bcde94Sda73024
68490bcde94Sda73024#endif	/* LAST_ROUND_TABLES */
68590bcde94Sda73024
/*
 * OpenSolaris OS:
 * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
 *	const uint32_t pt[4], uint32_t ct[4]);
 *
 * Original interface:
 * int aes_encrypt(const unsigned char *in,
 *	unsigned char *out, const aes_encrypt_ctx cx[1]);
 */
69590bcde94Sda73024	.align	64
69690bcde94Sda73024enc_tab:
69790bcde94Sda73024	enc_vals(u8)
69890bcde94Sda73024#ifdef	LAST_ROUND_TABLES
69990bcde94Sda73024	/ Last Round Tables:
70090bcde94Sda73024	enc_vals(w8)
70190bcde94Sda73024#endif
70290bcde94Sda73024
70390bcde94Sda73024
/*
 * aes_encrypt_amd64 -- encrypt one 16-byte block.
 *
 * OpenSolaris interface (GLADMAN_INTERFACE not defined), as set up below:
 *	%rdi	key schedule / context; copied to %r8
 *	%esi	Nr, the number of rounds; shifted left by 4 to give Nr * 16
 *	%rdx	input block, read as four 32-bit words; copied to %rdi
 *	%rcx	output block, written as four 32-bit words
 * With GLADMAN_INTERFACE the arguments are (in, out, context) in
 * %rdi, %rsi, %rdx and the Nr * 16 value is loaded from the context at
 * offset 4*KS_LENGTH.
 *
 * Result in %rax: 0 on success.  If Nr * 16 is not 10*16, 12*16 or 14*16,
 * %rax is set to -1 and the output block is left unwritten.  (The
 * OpenSolaris prototype is void, so callers presumably ignore %rax.)
 *
 * Stack: a 32-byte frame holds the output pointer at (%rsp) and the saved
 * %rbx, %rbp and %r12 in the next three slots; all three are restored on
 * both the success and the error path.  No calls are made.
 *
 * Working registers: the state lives in %eax, %ebx, %ecx, %edx; the round
 * macros are handed %r9d, %r10d, %r11d, %r12d as their output registers.
 * The context comments below place the key schedule pointer in %r8
 * (kptr is defined elsewhere in this file).
 */
	ENTRY_NP(aes_encrypt_amd64)
#ifdef	GLADMAN_INTERFACE
	/ Original interface
	sub	$[4*8], %rsp	/ gnu/linux/opensolaris binary interface
	mov	%rsi, (%rsp)	/ output pointer (P2)
	mov	%rdx, %r8	/ context (P3)

	mov	%rbx, 1*8(%rsp)	/ P1: input pointer in rdi
	mov	%rbp, 2*8(%rsp)	/ P2: output pointer in (rsp)
	mov	%r12, 3*8(%rsp)	/ P3: context in r8
	movzx	4*KS_LENGTH(kptr), %esi	/ Get rounds * 16 from the context

#else
	/ OpenSolaris OS interface
	sub	$[4*8], %rsp	/ Make room on stack to save registers
	mov	%rcx, (%rsp)	/ Save output pointer (P4) on stack
	mov	%rdi, %r8	/ context (P1)
	mov	%rdx, %rdi	/ P3: save input pointer
	shl	$4, %esi	/ P2: esi = Nr * 16 (rounds * 16)

	mov	%rbx, 1*8(%rsp)	/ Save registers
	mov	%rbp, 2*8(%rsp)
	mov	%r12, 3*8(%rsp)
	/ P1: context in r8
	/ P2: Nr * 16 (rounds * 16) in esi
	/ P3: input pointer in rdi
	/ P4: output pointer in (rsp)
#endif	/* GLADMAN_INTERFACE */

	lea	enc_tab(%rip), tptr
	/*
	 * Bias kptr by -fofs.  The initial key XOR below addresses the key
	 * as fofs(kptr), which cancels the bias; presumably the round-key
	 * references inside the round macros do the same.
	 */
	sub	$fofs, kptr

	/ Load input block into registers
	mov	(%rdi), %eax
	mov	1*4(%rdi), %ebx
	mov	2*4(%rdi), %ecx
	mov	3*4(%rdi), %edx

	/ Initial round key addition (first 16 bytes of the key schedule)
	xor	fofs(kptr), %eax
	xor	fofs+4(kptr), %ebx
	xor	fofs+8(kptr), %ecx
	xor	fofs+12(kptr), %edx

	/*
	 * Advance kptr by Nr * 16.  Writing %esi above zero-extended it,
	 * so %rsi is a valid 64-bit index here.
	 */
	lea	(kptr,%rsi), kptr
	/ Jump based on rounds * 16:
	cmp	$[10*16], %esi
	je	3f		/ 10 rounds: enter at round 9
	cmp	$[12*16], %esi
	je	2f		/ 12 rounds: enter at round 11
	cmp	$[14*16], %esi
	je	1f		/ 14 rounds: enter at round 13
	mov	$-1, %rax	/ error
	jmp	4f

	/ Perform normal forward rounds
1:	ff_rnd(%r9d, %r10d, %r11d, %r12d, 13)
	ff_rnd(%r9d, %r10d, %r11d, %r12d, 12)
2:	ff_rnd(%r9d, %r10d, %r11d, %r12d, 11)
	ff_rnd(%r9d, %r10d, %r11d, %r12d, 10)
3:	ff_rnd(%r9d, %r10d, %r11d, %r12d,  9)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  8)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  7)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  6)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  5)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  4)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  3)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  2)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  1)
	fl_rnd(%r9d, %r10d, %r11d, %r12d,  0)

	/ Copy results
	mov	(%rsp), %rbx	/ output pointer saved at entry
	mov	%r9d, (%rbx)
	mov	%r10d, 4(%rbx)
	mov	%r11d, 8(%rbx)
	mov	%r12d, 12(%rbx)
	xor	%eax, %eax	/ success: rax = 0 (32-bit write clears rax)
4:	/ Restore registers
	mov	1*8(%rsp), %rbx
	mov	2*8(%rsp), %rbp
	mov	3*8(%rsp), %r12
	add	$[4*8], %rsp
	ret

	SET_SIZE(aes_encrypt_amd64)
78990bcde94Sda73024
/*
 * OpenSolaris OS:
 * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
 *	const uint32_t ct[4], uint32_t pt[4]);
 *
 * Original interface:
 * int aes_decrypt(const unsigned char *in,
 *	unsigned char *out, const aes_encrypt_ctx cx[1]);
 */
79990bcde94Sda73024	.align	64
80090bcde94Sda73024dec_tab:
80190bcde94Sda73024	dec_vals(v8)
80290bcde94Sda73024#ifdef	LAST_ROUND_TABLES
80390bcde94Sda73024	/ Last Round Tables:
80490bcde94Sda73024	dec_vals(w8)
80590bcde94Sda73024#endif
80690bcde94Sda73024
80790bcde94Sda73024
/*
 * aes_decrypt_amd64 -- decrypt one 16-byte block.
 *
 * OpenSolaris interface (GLADMAN_INTERFACE not defined), as set up below:
 *	%rdi	key schedule / context; copied to %r8
 *	%esi	Nr, the number of rounds; shifted left by 4 to give Nr * 16
 *	%rdx	input block, read as four 32-bit words; copied to %rdi
 *	%rcx	output block, written as four 32-bit words
 * With GLADMAN_INTERFACE the arguments are (in, out, context) in
 * %rdi, %rsi, %rdx and the Nr * 16 value is loaded from the context at
 * offset 4*KS_LENGTH.
 *
 * Result in %rax: 0 on success.  If Nr * 16 is not 10*16, 12*16 or 14*16,
 * %rax is set to -1 and the output block is left unwritten.  (The
 * OpenSolaris prototype is void, so callers presumably ignore %rax.)
 *
 * Stack: a 32-byte frame holds the output pointer at (%rsp) and the saved
 * %rbx, %rbp and %r12 in the next three slots; all three are restored on
 * both the success and the error path.  No calls are made.
 *
 * Working registers: the state lives in %eax, %ebx, %ecx, %edx; the round
 * macros are handed %r9d, %r10d, %r11d, %r12d as their output registers.
 * The context comments below place the key schedule pointer in %r8
 * (kptr is defined elsewhere in this file).
 */
	ENTRY_NP(aes_decrypt_amd64)
#ifdef	GLADMAN_INTERFACE
	/ Original interface
	sub	$[4*8], %rsp	/ gnu/linux/opensolaris binary interface
	mov	%rsi, (%rsp)	/ output pointer (P2)
	mov	%rdx, %r8	/ context (P3)

	mov	%rbx, 1*8(%rsp)	/ P1: input pointer in rdi
	mov	%rbp, 2*8(%rsp)	/ P2: output pointer in (rsp)
	mov	%r12, 3*8(%rsp)	/ P3: context in r8
	movzx	4*KS_LENGTH(kptr), %esi	/ Get rounds * 16 from the context

#else
	/ OpenSolaris OS interface
	sub	$[4*8], %rsp	/ Make room on stack to save registers
	mov	%rcx, (%rsp)	/ Save output pointer (P4) on stack
	mov	%rdi, %r8	/ context (P1)
	mov	%rdx, %rdi	/ P3: save input pointer
	shl	$4, %esi	/ P2: esi = Nr * 16 (rounds * 16)

	mov	%rbx, 1*8(%rsp)	/ Save registers
	mov	%rbp, 2*8(%rsp)
	mov	%r12, 3*8(%rsp)
	/ P1: context in r8
	/ P2: Nr * 16 (rounds * 16) in esi
	/ P3: input pointer in rdi
	/ P4: output pointer in (rsp)
#endif	/* GLADMAN_INTERFACE */

	lea	dec_tab(%rip), tptr
	/*
	 * Bias kptr by -rofs.  The initial key XOR below addresses the key
	 * as rofs(%rdi), which cancels the bias; presumably the round-key
	 * references inside the round macros do the same.
	 */
	sub	$rofs, kptr

	/ Load input block into registers
	mov	(%rdi), %eax
	mov	1*4(%rdi), %ebx
	mov	2*4(%rdi), %ecx
	mov	3*4(%rdi), %edx

	/*
	 * Pick the address of the first key to apply (%rdi; the input
	 * pointer it held is no longer needed):
	 *   AES_REV_DKS:  start of the schedule; kptr itself is advanced
	 *		   by Nr * 16.
	 *   otherwise:	   schedule + Nr * 16; kptr stays at the start.
	 *
	 * NOTE(review): in the non-AES_REV_DKS case the XORs below read
	 * memory at an offset derived from Nr before Nr has been checked
	 * against 10/12/14, so an invalid Nr can cause an out-of-range
	 * read instead of the -1 return.  Harmless if every caller passes
	 * a valid Nr -- confirm.
	 */
#ifdef AES_REV_DKS
	mov	kptr, %rdi
	lea	(kptr,%rsi), kptr
#else
	lea	(kptr,%rsi), %rdi
#endif

	/ Initial round key addition
	xor	rofs(%rdi), %eax
	xor	rofs+4(%rdi), %ebx
	xor	rofs+8(%rdi), %ecx
	xor	rofs+12(%rdi), %edx

	/ Jump based on rounds * 16:
	cmp	$[10*16], %esi
	je	3f		/ 10 rounds: enter at round 9
	cmp	$[12*16], %esi
	je	2f		/ 12 rounds: enter at round 11
	cmp	$[14*16], %esi
	je	1f		/ 14 rounds: enter at round 13
	mov	$-1, %rax	/ error
	jmp	4f

	/ Perform normal inverse rounds
1:	ii_rnd(%r9d, %r10d, %r11d, %r12d, 13)
	ii_rnd(%r9d, %r10d, %r11d, %r12d, 12)
2:	ii_rnd(%r9d, %r10d, %r11d, %r12d, 11)
	ii_rnd(%r9d, %r10d, %r11d, %r12d, 10)
3:	ii_rnd(%r9d, %r10d, %r11d, %r12d,  9)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  8)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  7)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  6)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  5)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  4)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  3)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  2)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  1)
	il_rnd(%r9d, %r10d, %r11d, %r12d,  0)

	/ Copy results
	mov	(%rsp), %rbx	/ output pointer saved at entry
	mov	%r9d, (%rbx)
	mov	%r10d, 4(%rbx)
	mov	%r11d, 8(%rbx)
	mov	%r12d, 12(%rbx)
	xor	%eax, %eax	/ success: rax = 0 (32-bit write clears rax)
4:	/ Restore registers
	mov	1*8(%rsp), %rbx
	mov	2*8(%rsp), %rbp
	mov	3*8(%rsp), %r12
	add	$[4*8], %rsp
	ret

	SET_SIZE(aes_decrypt_amd64)
89954034eb2SDan OpenSolaris Anderson#endif	/* lint || __lint */
900