xref: /illumos-gate/usr/src/common/crypto/aes/amd64/aes_amd64.S (revision 5d9d9091f564c198a760790b0bfa72c44e17912b)
1/*
2 * ---------------------------------------------------------------------------
3 * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
4 *
5 * LICENSE TERMS
6 *
7 * The free distribution and use of this software is allowed (with or without
8 * changes) provided that:
9 *
10 *  1. source code distributions include the above copyright notice, this
11 *     list of conditions and the following disclaimer;
12 *
13 *  2. binary distributions include the above copyright notice, this list
14 *     of conditions and the following disclaimer in their documentation;
15 *
16 *  3. the name of the copyright holder is not used to endorse products
17 *     built using this software without specific written permission.
18 *
19 * DISCLAIMER
20 *
21 * This software is provided 'as is' with no explicit or implied warranties
22 * in respect of its properties, including, but not limited to, correctness
23 * and/or fitness for purpose.
24 * ---------------------------------------------------------------------------
25 * Issue 20/12/2007
26 *
27 * I am grateful to Dag Arne Osvik for many discussions of the techniques that
28 * can be used to optimise AES assembler code on AMD64/EM64T architectures.
29 * Some of the techniques used in this implementation are the result of
30 * suggestions made by him for which I am most grateful.
31 *
32 * An AES implementation for AMD64 processors using the YASM assembler.  This
33 * implementation provides only encryption and decryption, and hence requires key
34 * scheduling support in C. It uses 8k bytes of tables but its encryption and
35 * decryption performance is very close to that obtained using large tables.
36 * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions,
37 * which are as follows:
38 *               ms windows  gnu/linux/opensolaris os
39 *
40 *   in_blk          rcx     rdi
41 *   out_blk         rdx     rsi
42 *   context (cx)     r8     rdx
43 *
44 *   preserved       rsi      -    + rbx, rbp, rsp, r12, r13, r14 & r15
45 *   registers       rdi      -      on both
46 *
47 *   destroyed        -      rsi   + rax, rcx, rdx, r8, r9, r10 & r11
48 *   registers        -      rdi     on both
49 *
50 * The convention used here is that for gnu/linux/opensolaris os.
51 *
52 * This code provides the standard AES block size (128 bits, 16 bytes) and the
53 * three standard AES key sizes (128, 192 and 256 bits). It has the same call
54 * interface as my C implementation.  It uses the Microsoft C AMD64 calling
55 * conventions in which the three parameters are placed in  rcx, rdx and r8
56 * respectively.  The rbx, rsi, rdi, rbp and r12..r15 registers are preserved.
57 *
58 * OpenSolaris Note:
59 * Modified to use GNU/Linux/Solaris calling conventions.
60 * That is parameters are placed in rdi, rsi, rdx, and rcx, respectively.
61 *
62 *     AES_RETURN aes_encrypt(const unsigned char in_blk[],
63 *                   unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
64 *
65 *     AES_RETURN aes_decrypt(const unsigned char in_blk[],
66 *                   unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
67 *
68 *     AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
69 *                                            const aes_encrypt_ctx cx[1]);
70 *
71 *     AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
72 *                                            const aes_decrypt_ctx cx[1]);
73 *
74 *     AES_RETURN aes_encrypt_key(const unsigned char key[],
75 *                           unsigned int len, const aes_encrypt_ctx cx[1]);
76 *
77 *     AES_RETURN aes_decrypt_key(const unsigned char key[],
78 *                           unsigned int len, const aes_decrypt_ctx cx[1]);
79 *
80 * where <NNN> is 128, 192 or 256.  In the last two calls the length can be in
81 * either bits or bytes.
82 *
83 * Comment in/out the following lines to obtain the desired subroutines. These
84 * selections MUST match those in the C header file aesopt.h
85 */
86#define	AES_REV_DKS	  /* define if key decryption schedule is reversed; selects the rofs = 128 form of ik_ref below */
87
88#define	LAST_ROUND_TABLES /* define for the faster version using extra tables; fl_rnd/il_rnd then add 2048 to the table pointer */
89
90/*
91 * The encryption key schedule has the following in memory layout where N is the
92 * number of rounds (10, 12 or 14):
93 *
94 * lo: | input key (round 0)  |  / each round is four 32-bit words
95 *     | encryption round 1   |
96 *     | encryption round 2   |
97 *     ....
98 *     | encryption round N-1 |
99 * hi: | encryption round N   |
100 *
101 * The decryption key schedule is normally set up so that it has the same
102 * layout as above by actually reversing the order of the encryption key
103 * schedule in memory (this happens when AES_REV_DKS is set):
104 *
105 * lo: | decryption round 0   | =              | encryption round N   |
106 *     | decryption round 1   | = INV_MIX_COL[ | encryption round N-1 | ]
107 *     | decryption round 2   | = INV_MIX_COL[ | encryption round N-2 | ]
108 *     ....                       ....
109 *     | decryption round N-1 | = INV_MIX_COL[ | encryption round 1   | ]
110 * hi: | decryption round N   | =              | input key (round 0)  |
111 *
112 * with rounds except the first and last modified using inv_mix_column()
113 * But if AES_REV_DKS is NOT set the order of keys is left as it is for
114 * encryption so that it has to be accessed in reverse when used for
115 * decryption (although the inverse mix column modifications are done)
116 *
117 * lo: | decryption round 0   | =              | input key (round 0)  |
118 *     | decryption round 1   | = INV_MIX_COL[ | encryption round 1   | ]
119 *     | decryption round 2   | = INV_MIX_COL[ | encryption round 2   | ]
120 *     ....                       ....
121 *     | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
122 * hi: | decryption round N   | =              | encryption round N   |
123 *
124 * This layout is faster when the assembler key scheduling provided here
125 * is used.
126 *
127 * End of user defines
128 */
129
130/*
131 * ---------------------------------------------------------------------------
132 * OpenSolaris OS modifications
133 *
134 * This source originates from Brian Gladman file aes_amd64.asm
135 * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip
136 * with these changes:
137 *
138 * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and
139 * !__GNUC__ ifdefs.  Also removed ENCRYPTION, DECRYPTION,
140 * AES_128, AES_192, AES_256, AES_VAR ifdefs.
141 *
142 * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define
143 *
144 * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef
145 *
146 * 4. Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax
147 * (operands reversed, literals prefixed with "$", registers prefixed with "%",
148 * "[register+offset]" addressing changed to "offset(register)",
149 * parentheses in constant expressions "()" changed to square brackets "[]",
150 * "." removed from local (numeric) labels, and other changes).
151 * Examples:
152 * Intel/yasm/nasm Syntax	ATT/OpenSolaris Syntax
153 * mov	rax,(4*20h)		mov	$[4*0x20],%rax
154 * mov	rax,[ebx+20h]		mov	0x20(%ebx),%rax
155 * lea	rax,[ebx+ecx]		lea	(%ebx,%ecx),%rax
156 * sub	rax,[ebx+ecx*4-20h]	sub	-0x20(%ebx,%ecx,4),%rax
157 *
158 * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
159 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function
160 * definitions for lint.
161 *
162 * 6. Renamed functions and reordered parameters to match OpenSolaris:
163 * Original Gladman interface:
164 *	int aes_encrypt(const unsigned char *in,
165 *		unsigned char *out, const aes_encrypt_ctx cx[1]);
166 *	int aes_decrypt(const unsigned char *in,
167 *		unsigned char *out, const aes_decrypt_ctx cx[1]);
168 * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t,
169 * and a union type, inf., containing inf.l, a uint32_t and
170 * inf.b, a 4-element array of uint32_t.  Only b[0] in the array (aka "l") is
171 * used and contains the key schedule length * 16 where key schedule length is
172 * 10, 12, or 14 (the number of rounds).
173 *
174 * OpenSolaris OS interface:
175 *	void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
176 *		const uint32_t pt[4], uint32_t ct[4]);
177 *	void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
178 *		const uint32_t ct[4], uint32_t pt[4]);
179 *	typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4];
180 *		 uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t;
181 * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
182 * ct is crypto text, and MAX_AES_NR is 14.
183 * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
184 */
185
186#if defined(lint) || defined(__lint)
187
188#include <sys/types.h>
189/* ARGSUSED */
190void
191aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4],
192	uint32_t ct[4]) {	/* lint-only stub with an empty body; the assembly routine is in the #else branch */
193}
194/* ARGSUSED */
195void
196aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4],
197	uint32_t pt[4]) {	/* lint-only stub with an empty body; the assembly routine is in the #else branch */
198}
199
200
201#else
202
203#include <sys/asm_linkage.h>
204
205#define	KS_LENGTH	60	/* 4*KS_LENGTH is the byte offset from which the GLADMAN_INTERFACE entry code loads the length */
206/* 32-bit names for the low halves of rax..rdi; no use of these aliases is visible in this view of the file */
207#define	raxd		eax
208#define	rdxd		edx
209#define	rcxd		ecx
210#define	rbxd		ebx
211#define	rsid		esi
212#define	rdid		edi
213/* 8-bit names for the low bytes of rax..rdi; likewise not referenced in the code visible here */
214#define	raxb		al
215#define	rdxb		dl
216#define	rcxb		cl
217#define	rbxb		bl
218#define	rsib		sil
219#define	rdib		dil
220
221/ finite field multiplies by {02}, {04} and {08}
222/* each shifts x left and XORs in 0x11b, scaled by bit position, once for every bit shifted out past bit 7 */
223#define	f2(x) [[x<<1]^[[[x>>7]&1]*0x11b]]
224#define	f4(x) [[x<<2]^[[[x>>6]&1]*0x11b]^[[[x>>6]&2]*0x11b]]
225#define	f8(x) [[x<<3]^[[[x>>5]&1]*0x11b]^[[[x>>5]&2]*0x11b]^[[[x>>5]&4]*0x11b]]
226
227/ finite field multiplies required in table generation
228/* multiples {03}, {09}, {0b}, {0d} and {0e}, formed by XORing f2/f4/f8 products with each other and with x */
229#define	f3(x) [[f2(x)] ^ [x]]
230#define	f9(x) [[f8(x)] ^ [x]]
231#define	fb(x) [[f8(x)] ^ [f2(x)] ^ [x]]
232#define	fd(x) [[f8(x)] ^ [f4(x)] ^ [x]]
233#define	fe(x) [[f8(x)] ^ [f4(x)] ^ [f2(x)]]
234
235/ macros for expanding S-box data
236/* each macro yields the 8 bytes of one table entry; tab_0..tab_3 read 4 of them at byte offsets 0, 3, 2 and 1 */
237#define	u8(x) [f2(x)], [x], [x], [f3(x)], [f2(x)], [x], [x], [f3(x)]	/* forward entry: 4-byte pattern stored twice */
238#define	v8(x) [fe(x)], [f9(x)], [fd(x)], [fb(x)], [fe(x)], [f9(x)], [fd(x)], [x]	/* inverse entry: byte 7 is plain x (read by tab_i) */
239#define	w8(x) [x], 0, 0, 0, [x], 0, 0, 0	/* last-round entry: x alone, so each offset yields x in one byte lane */
240
241#define	enc_vals(x)	/* 256 byte values for enc_tab, each expanded through macro x; 32 .byte directives of 8 values */ \
242   .byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \
243   .byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \
244   .byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \
245   .byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \
246   .byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \
247   .byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \
248   .byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \
249   .byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \
250   .byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \
251   .byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \
252   .byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \
253   .byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \
254   .byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \
255   .byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \
256   .byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \
257   .byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \
258   .byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \
259   .byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \
260   .byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \
261   .byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \
262   .byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \
263   .byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \
264   .byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \
265   .byte x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \
266   .byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \
267   .byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \
268   .byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \
269   .byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \
270   .byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \
271   .byte x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \
272   .byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \
273   .byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16)
274
275#define	dec_vals(x) /* 256 byte values for dec_tab, each expanded through macro x; 32 .byte directives of 8 values */ \
276   .byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \
277   .byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \
278   .byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \
279   .byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \
280   .byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \
281   .byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \
282   .byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \
283   .byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \
284   .byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \
285   .byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \
286   .byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \
287   .byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \
288   .byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \
289   .byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \
290   .byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \
291   .byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \
292   .byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \
293   .byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \
294   .byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \
295   .byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \
296   .byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \
297   .byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \
298   .byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \
299   .byte x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \
300   .byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \
301   .byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \
302   .byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \
303   .byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \
304   .byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \
305   .byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \
306   .byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \
307   .byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d)
308
309#define	tptr	%rbp	/* table pointer */
310#define	kptr	%r8	/* key schedule pointer */
311#define	fofs	128	/* adjust offset in key schedule to keep |disp| < 128 */
312#define	fk_ref(x, y)	-16*x+fofs+4*y(kptr)	/* 32-bit word y of the round key 16*x bytes below kptr+fofs */
313
314#ifdef	AES_REV_DKS
315#define	rofs		128	/* reversed schedule: same bias as fofs, keys counted downwards */
316#define	ik_ref(x, y)	-16*x+rofs+4*y(kptr)	/* 32-bit word y of the round key 16*x bytes below kptr+rofs */
317
318#else
319#define	rofs		-128	/* non-reversed schedule: negative bias, keys counted upwards */
320#define	ik_ref(x, y)	16*x+rofs+4*y(kptr)	/* 32-bit word y of the round key 16*x bytes above kptr+rofs */
321#endif	/* AES_REV_DKS */
322/* table entries are 8 bytes wide (index scale 8); tab_0..tab_3 address an entry at byte offsets 0, 3, 2 and 1 */
323#define	tab_0(x)	(tptr,x,8)
324#define	tab_1(x)	3(tptr,x,8)
325#define	tab_2(x)	2(tptr,x,8)
326#define	tab_3(x)	1(tptr,x,8)
327#define	tab_f(x)	1(tptr,x,8)	/* operand of the byte loads in the non-LAST_ROUND_TABLES fl_rnd */
328#define	tab_i(x)	7(tptr,x,8)	/* operand of the byte loads in the non-LAST_ROUND_TABLES il_rnd */
329
330#define	ff_rnd(p1, p2, p3, p4, round)	/* normal forward round: state in %eax..%edx, p1..p4 are temporaries seeded with the round key, %esi/%edi are clobbered */ \
331	mov	fk_ref(round,0), p1; \
332	mov	fk_ref(round,1), p2; \
333	mov	fk_ref(round,2), p3; \
334	mov	fk_ref(round,3), p4; \
335 /* %eax: bytes 0..3 index tab_0..tab_3 and are XORed into p1, p4, p3, p2 */ \
336	movzx	%al, %esi; \
337	movzx	%ah, %edi; \
338	shr	$16, %eax; \
339	xor	tab_0(%rsi), p1; \
340	xor	tab_1(%rdi), p4; \
341	movzx	%al, %esi; \
342	movzx	%ah, %edi; \
343	xor	tab_2(%rsi), p3; \
344	xor	tab_3(%rdi), p2; \
345 /* %ebx: bytes 0..3 are XORed into p2, p1, p4, p3 */ \
346	movzx	%bl, %esi; \
347	movzx	%bh, %edi; \
348	shr	$16, %ebx; \
349	xor	tab_0(%rsi), p2; \
350	xor	tab_1(%rdi), p1; \
351	movzx	%bl, %esi; \
352	movzx	%bh, %edi; \
353	xor	tab_2(%rsi), p4; \
354	xor	tab_3(%rdi), p3; \
355 /* %ecx: bytes 0..3 are XORed into p3, p2, p1, p4 */ \
356	movzx	%cl, %esi; \
357	movzx	%ch, %edi; \
358	shr	$16, %ecx; \
359	xor	tab_0(%rsi), p3; \
360	xor	tab_1(%rdi), p2; \
361	movzx	%cl, %esi; \
362	movzx	%ch, %edi; \
363	xor	tab_2(%rsi), p1; \
364	xor	tab_3(%rdi), p4; \
365 /* %edx: bytes 0..3 are XORed into p4, p3, p2, p1 */ \
366	movzx	%dl, %esi; \
367	movzx	%dh, %edi; \
368	shr	$16, %edx; \
369	xor	tab_0(%rsi), p4; \
370	xor	tab_1(%rdi), p3; \
371	movzx	%dl, %esi; \
372	movzx	%dh, %edi; \
373	xor	tab_2(%rsi), p2; \
374	xor	tab_3(%rdi), p1; \
375 /* copy the new state back into %eax..%edx for the next round */ \
376	mov	p1, %eax; \
377	mov	p2, %ebx; \
378	mov	p3, %ecx; \
379	mov	p4, %edx
380
381#ifdef	LAST_ROUND_TABLES
382
383#define	fl_rnd(p1, p2, p3, p4, round)	/* last forward round: advances tptr by 2048 to the second table and leaves the result in p1..p4 (no copy back) */ \
384	add	$2048, tptr; \
385	mov	fk_ref(round,0), p1; \
386	mov	fk_ref(round,1), p2; \
387	mov	fk_ref(round,2), p3; \
388	mov	fk_ref(round,3), p4; \
389 /* %eax: bytes 0..3 are XORed into p1, p4, p3, p2 */ \
390	movzx	%al, %esi; \
391	movzx	%ah, %edi; \
392	shr	$16, %eax; \
393	xor	tab_0(%rsi), p1; \
394	xor	tab_1(%rdi), p4; \
395	movzx	%al, %esi; \
396	movzx	%ah, %edi; \
397	xor	tab_2(%rsi), p3; \
398	xor	tab_3(%rdi), p2; \
399 /* %ebx: bytes 0..3 are XORed into p2, p1, p4, p3 */ \
400	movzx	%bl, %esi; \
401	movzx	%bh, %edi; \
402	shr	$16, %ebx; \
403	xor	tab_0(%rsi), p2; \
404	xor	tab_1(%rdi), p1; \
405	movzx	%bl, %esi; \
406	movzx	%bh, %edi; \
407	xor	tab_2(%rsi), p4; \
408	xor	tab_3(%rdi), p3; \
409 /* %ecx: bytes 0..3 are XORed into p3, p2, p1, p4 */ \
410	movzx	%cl, %esi; \
411	movzx	%ch, %edi; \
412	shr	$16, %ecx; \
413	xor	tab_0(%rsi), p3; \
414	xor	tab_1(%rdi), p2; \
415	movzx	%cl, %esi; \
416	movzx	%ch, %edi; \
417	xor	tab_2(%rsi), p1; \
418	xor	tab_3(%rdi), p4; \
419 /* %edx: bytes 0..3 are XORed into p4, p3, p2, p1 */ \
420	movzx	%dl, %esi; \
421	movzx	%dh, %edi; \
422	shr	$16, %edx; \
423	xor	tab_0(%rsi), p4; \
424	xor	tab_1(%rdi), p3; \
425	movzx	%dl, %esi; \
426	movzx	%dh, %edi; \
427	xor	tab_2(%rsi), p2; \
428	xor	tab_3(%rdi), p1
429
430#else
431
432#define	fl_rnd(p1, p2, p3, p4, round)	/* last forward round without the extra table: loads single bytes via tab_f and rotates them into place. NOTE(review): the memory-source movzx has no size suffix - confirm the assembler loads a byte */ \
433	mov	fk_ref(round,0), p1; \
434	mov	fk_ref(round,1), p2; \
435	mov	fk_ref(round,2), p3; \
436	mov	fk_ref(round,3), p4; \
437 /* %eax: table bytes for state bytes 0..3 go to p1, p4 (rol 8), p3 (rol 16), p2 (rol 24) */ \
438	movzx	%al, %esi; \
439	movzx	%ah, %edi; \
440	shr	$16, %eax; \
441	movzx	tab_f(%rsi), %esi; \
442	movzx	tab_f(%rdi), %edi; \
443	xor	%esi, p1; \
444	rol	$8, %edi; \
445	xor	%edi, p4; \
446	movzx	%al, %esi; \
447	movzx	%ah, %edi; \
448	movzx	tab_f(%rsi), %esi; \
449	movzx	tab_f(%rdi), %edi; \
450	rol	$16, %esi; \
451	rol	$24, %edi; \
452	xor	%esi, p3; \
453	xor	%edi, p2; \
454 /* %ebx: table bytes go to p2, p1 (rol 8), p4 (rol 16), p3 (rol 24) */ \
455	movzx	%bl, %esi; \
456	movzx	%bh, %edi; \
457	shr	$16, %ebx; \
458	movzx	tab_f(%rsi), %esi; \
459	movzx	tab_f(%rdi), %edi; \
460	xor	%esi, p2; \
461	rol	$8, %edi; \
462	xor	%edi, p1; \
463	movzx	%bl, %esi; \
464	movzx	%bh, %edi; \
465	movzx	tab_f(%rsi), %esi; \
466	movzx	tab_f(%rdi), %edi; \
467	rol	$16, %esi; \
468	rol	$24, %edi; \
469	xor	%esi, p4; \
470	xor	%edi, p3; \
471 /* %ecx: table bytes go to p3, p2 (rol 8), p1 (rol 16), p4 (rol 24) */ \
472	movzx	%cl, %esi; \
473	movzx	%ch, %edi; \
474	movzx	tab_f(%rsi), %esi; \
475	movzx	tab_f(%rdi), %edi; \
476	shr	$16, %ecx; \
477	xor	%esi, p3; \
478	rol	$8, %edi; \
479	xor	%edi, p2; \
480	movzx	%cl, %esi; \
481	movzx	%ch, %edi; \
482	movzx	tab_f(%rsi), %esi; \
483	movzx	tab_f(%rdi), %edi; \
484	rol	$16, %esi; \
485	rol	$24, %edi; \
486	xor	%esi, p1; \
487	xor	%edi, p4; \
488 /* %edx: table bytes go to p4, p3 (rol 8), p2 (rol 16), p1 (rol 24) */ \
489	movzx	%dl, %esi; \
490	movzx	%dh, %edi; \
491	movzx	tab_f(%rsi), %esi; \
492	movzx	tab_f(%rdi), %edi; \
493	shr	$16, %edx; \
494	xor	%esi, p4; \
495	rol	$8, %edi; \
496	xor	%edi, p3; \
497	movzx	%dl, %esi; \
498	movzx	%dh, %edi; \
499	movzx	tab_f(%rsi), %esi; \
500	movzx	tab_f(%rdi), %edi; \
501	rol	$16, %esi; \
502	rol	$24, %edi; \
503	xor	%esi, p2; \
504	xor	%edi, p1
505
506#endif	/* LAST_ROUND_TABLES */
507
508#define	ii_rnd(p1, p2, p3, p4, round)	/* normal inverse round: state in %eax..%edx, p1..p4 are temporaries seeded with the round key, %esi/%edi are clobbered */ \
509	mov	ik_ref(round,0), p1; \
510	mov	ik_ref(round,1), p2; \
511	mov	ik_ref(round,2), p3; \
512	mov	ik_ref(round,3), p4; \
513 /* %eax: bytes 0..3 index tab_0..tab_3 and are XORed into p1, p2, p3, p4 */ \
514	movzx	%al, %esi; \
515	movzx	%ah, %edi; \
516	shr	$16, %eax; \
517	xor	tab_0(%rsi), p1; \
518	xor	tab_1(%rdi), p2; \
519	movzx	%al, %esi; \
520	movzx	%ah, %edi; \
521	xor	tab_2(%rsi), p3; \
522	xor	tab_3(%rdi), p4; \
523 /* %ebx: bytes 0..3 are XORed into p2, p3, p4, p1 */ \
524	movzx	%bl, %esi; \
525	movzx	%bh, %edi; \
526	shr	$16, %ebx; \
527	xor	tab_0(%rsi), p2; \
528	xor	tab_1(%rdi), p3; \
529	movzx	%bl, %esi; \
530	movzx	%bh, %edi; \
531	xor	tab_2(%rsi), p4; \
532	xor	tab_3(%rdi), p1; \
533 /* %ecx: bytes 0..3 are XORed into p3, p4, p1, p2 */ \
534	movzx	%cl, %esi; \
535	movzx	%ch, %edi; \
536	shr	$16, %ecx; \
537	xor	tab_0(%rsi), p3; \
538	xor	tab_1(%rdi), p4; \
539	movzx	%cl, %esi; \
540	movzx	%ch, %edi; \
541	xor	tab_2(%rsi), p1; \
542	xor	tab_3(%rdi), p2; \
543 /* %edx: bytes 0..3 are XORed into p4, p1, p2, p3 */ \
544	movzx	%dl, %esi; \
545	movzx	%dh, %edi; \
546	shr	$16, %edx; \
547	xor	tab_0(%rsi), p4; \
548	xor	tab_1(%rdi), p1; \
549	movzx	%dl, %esi; \
550	movzx	%dh, %edi; \
551	xor	tab_2(%rsi), p2; \
552	xor	tab_3(%rdi), p3; \
553 /* copy the new state back into %eax..%edx for the next round */ \
554	mov	p1, %eax; \
555	mov	p2, %ebx; \
556	mov	p3, %ecx; \
557	mov	p4, %edx
558
559#ifdef	LAST_ROUND_TABLES
560
561#define	il_rnd(p1, p2, p3, p4, round)	/* last inverse round: advances tptr by 2048 to the second table and leaves the result in p1..p4 (no copy back) */ \
562	add	$2048, tptr; \
563	mov	ik_ref(round,0), p1; \
564	mov	ik_ref(round,1), p2; \
565	mov	ik_ref(round,2), p3; \
566	mov	ik_ref(round,3), p4; \
567 /* %eax: bytes 0..3 are XORed into p1, p2, p3, p4 */ \
568	movzx	%al, %esi; \
569	movzx	%ah, %edi; \
570	shr	$16, %eax; \
571	xor	tab_0(%rsi), p1; \
572	xor	tab_1(%rdi), p2; \
573	movzx	%al, %esi; \
574	movzx	%ah, %edi; \
575	xor	tab_2(%rsi), p3; \
576	xor	tab_3(%rdi), p4; \
577 /* %ebx: bytes 0..3 are XORed into p2, p3, p4, p1 */ \
578	movzx	%bl, %esi; \
579	movzx	%bh, %edi; \
580	shr	$16, %ebx; \
581	xor	tab_0(%rsi), p2; \
582	xor	tab_1(%rdi), p3; \
583	movzx	%bl, %esi; \
584	movzx	%bh, %edi; \
585	xor	tab_2(%rsi), p4; \
586	xor	tab_3(%rdi), p1; \
587 /* %ecx: bytes 0..3 are XORed into p3, p4, p1, p2 */ \
588	movzx	%cl, %esi; \
589	movzx	%ch, %edi; \
590	shr	$16, %ecx; \
591	xor	tab_0(%rsi), p3; \
592	xor	tab_1(%rdi), p4; \
593	movzx	%cl, %esi; \
594	movzx	%ch, %edi; \
595	xor	tab_2(%rsi), p1; \
596	xor	tab_3(%rdi), p2; \
597 /* %edx: bytes 0..3 are XORed into p4, p1, p2, p3 */ \
598	movzx	%dl, %esi; \
599	movzx	%dh, %edi; \
600	shr	$16, %edx; \
601	xor	tab_0(%rsi), p4; \
602	xor	tab_1(%rdi), p1; \
603	movzx	%dl, %esi; \
604	movzx	%dh, %edi; \
605	xor	tab_2(%rsi), p2; \
606	xor	tab_3(%rdi), p3
607
608#else
609
610#define	il_rnd(p1, p2, p3, p4, round)	/* last inverse round without the extra table: loads single bytes via tab_i and rotates them into place. NOTE(review): the memory-source movzx has no size suffix - confirm the assembler loads a byte */ \
611	mov	ik_ref(round,0), p1; \
612	mov	ik_ref(round,1), p2; \
613	mov	ik_ref(round,2), p3; \
614	mov	ik_ref(round,3), p4; \
615 /* %eax: table bytes for state bytes 0..3 go to p1, p2 (rol 8), p3 (rol 16), p4 (rol 24) */ \
616	movzx	%al, %esi; \
617	movzx	%ah, %edi; \
618	movzx	tab_i(%rsi), %esi; \
619	movzx	tab_i(%rdi), %edi; \
620	shr	$16, %eax; \
621	xor	%esi, p1; \
622	rol	$8, %edi; \
623	xor	%edi, p2; \
624	movzx	%al, %esi; \
625	movzx	%ah, %edi; \
626	movzx	tab_i(%rsi), %esi; \
627	movzx	tab_i(%rdi), %edi; \
628	rol	$16, %esi; \
629	rol	$24, %edi; \
630	xor	%esi, p3; \
631	xor	%edi, p4; \
632 /* %ebx: table bytes go to p2, p3 (rol 8), p4 (rol 16), p1 (rol 24) */ \
633	movzx	%bl, %esi; \
634	movzx	%bh, %edi; \
635	movzx	tab_i(%rsi), %esi; \
636	movzx	tab_i(%rdi), %edi; \
637	shr	$16, %ebx; \
638	xor	%esi, p2; \
639	rol	$8, %edi; \
640	xor	%edi, p3; \
641	movzx	%bl, %esi; \
642	movzx	%bh, %edi; \
643	movzx	tab_i(%rsi), %esi; \
644	movzx	tab_i(%rdi), %edi; \
645	rol	$16, %esi; \
646	rol	$24, %edi; \
647	xor	%esi, p4; \
648	xor	%edi, p1; \
649 /* %ecx: table bytes go to p3, p4 (rol 8), p1 (rol 16), p2 (rol 24) */ \
650	movzx	%cl, %esi; \
651	movzx	%ch, %edi; \
652	movzx	tab_i(%rsi), %esi; \
653	movzx	tab_i(%rdi), %edi; \
654	shr	$16, %ecx; \
655	xor	%esi, p3; \
656	rol	$8, %edi; \
657	xor	%edi, p4; \
658	movzx	%cl, %esi; \
659	movzx	%ch, %edi; \
660	movzx	tab_i(%rsi), %esi; \
661	movzx	tab_i(%rdi), %edi; \
662	rol	$16, %esi; \
663	rol	$24, %edi; \
664	xor	%esi, p1; \
665	xor	%edi, p2; \
666 /* %edx: table bytes go to p4, p1 (rol 8), p2 (rol 16), p3 (rol 24) */ \
667	movzx	%dl, %esi; \
668	movzx	%dh, %edi; \
669	movzx	tab_i(%rsi), %esi; \
670	movzx	tab_i(%rdi), %edi; \
671	shr	$16, %edx; \
672	xor	%esi, p4; \
673	rol	$8, %edi; \
674	xor	%edi, p1; \
675	movzx	%dl, %esi; \
676	movzx	%dh, %edi; \
677	movzx	tab_i(%rsi), %esi; \
678	movzx	tab_i(%rdi), %edi; \
679	rol	$16, %esi; \
680	rol	$24, %edi; \
681	xor	%esi, p2; \
682	xor	%edi, p3
683
684#endif	/* LAST_ROUND_TABLES */
685
686/*
687 * OpenSolaris OS:
688 * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
689 *	const uint32_t pt[4], uint32_t ct[4]);
690 * Arguments: %rdi = ks, %esi = Nr, %rdx = pt, %rcx = ct.  %rax is left as 0, or as -1 (ct unwritten) if Nr is not 10, 12 or 14.
691 * Original interface:
692 * int aes_encrypt(const unsigned char *in,
693 *	unsigned char *out, const aes_encrypt_ctx cx[1]);
694 */
695	.align	64
696enc_tab:
697	enc_vals(u8)
698#ifdef	LAST_ROUND_TABLES
699	/ Last Round Tables: second table, reached by adding 2048 to the table pointer
700	enc_vals(w8)
701#endif
702
703
704	ENTRY_NP(aes_encrypt_amd64)
705#ifdef	GLADMAN_INTERFACE
706	/ Original interface
707	sub	$[4*8], %rsp	/ gnu/linux/opensolaris binary interface
708	mov	%rsi, (%rsp)	/ output pointer (P2)
709	mov	%rdx, %r8	/ context (P3)
710
711	mov	%rbx, 1*8(%rsp)	/ P1: input pointer in rdi
712	mov	%rbp, 2*8(%rsp)	/ P2: output pointer in (rsp)
713	mov	%r12, 3*8(%rsp)	/ P3: context in r8
714	movzx	4*KS_LENGTH(kptr), %esi	/ Get byte key length * 16 (NOTE(review): no size suffix on this memory load; confirm a byte is read)
715
716#else
717	/ OpenSolaris OS interface
718	sub	$[4*8], %rsp	/ Make room on stack to save registers
719	mov	%rcx, (%rsp)	/ Save output pointer (P4) on stack
720	mov	%rdi, %r8	/ context (P1)
721	mov	%rdx, %rdi	/ P3: save input pointer
722	shl	$4, %esi	/ P2: esi = number of rounds * 16
723
724	mov	%rbx, 1*8(%rsp)	/ Save registers
725	mov	%rbp, 2*8(%rsp)	/ rbp is used as the table pointer
726	mov	%r12, 3*8(%rsp)	/ r12d is the fourth round temporary
727	/ P1: context in r8
728	/ P2: byte key length * 16 in esi
729	/ P3: input pointer in rdi
730	/ P4: output pointer in (rsp)
731#endif	/* GLADMAN_INTERFACE */
732
733	lea	enc_tab(%rip), tptr	/ rbp = address of enc_tab
734	sub	$fofs, kptr	/ bias r8 downwards; the round-key references add the bias back
735
736	/ Load input block into registers
737	mov	(%rdi), %eax
738	mov	1*4(%rdi), %ebx
739	mov	2*4(%rdi), %ecx
740	mov	3*4(%rdi), %edx
741	/ XOR the state with the first four key words (lowest address in the schedule)
742	xor	fofs(kptr), %eax
743	xor	fofs+4(kptr), %ebx
744	xor	fofs+8(kptr), %ecx
745	xor	fofs+12(kptr), %edx
746
747	lea	(kptr,%rsi), kptr	/ r8 += 16 * rounds; round keys are addressed backwards from here
748	/ Jump based on byte key length * 16:
749	cmp	$[10*16], %esi
750	je	3f
751	cmp	$[12*16], %esi
752	je	2f
753	cmp	$[14*16], %esi
754	je	1f
755	mov	$-1, %rax	/ error: round count is not 10, 12 or 14; output block is not written
756	jmp	4f
757
758	/ Perform normal forward rounds; labels 1, 2 and 3 are the entry points for 14, 12 and 10 rounds
7591:	ff_rnd(%r9d, %r10d, %r11d, %r12d, 13)
760	ff_rnd(%r9d, %r10d, %r11d, %r12d, 12)
7612:	ff_rnd(%r9d, %r10d, %r11d, %r12d, 11)
762	ff_rnd(%r9d, %r10d, %r11d, %r12d, 10)
7633:	ff_rnd(%r9d, %r10d, %r11d, %r12d,  9)
764	ff_rnd(%r9d, %r10d, %r11d, %r12d,  8)
765	ff_rnd(%r9d, %r10d, %r11d, %r12d,  7)
766	ff_rnd(%r9d, %r10d, %r11d, %r12d,  6)
767	ff_rnd(%r9d, %r10d, %r11d, %r12d,  5)
768	ff_rnd(%r9d, %r10d, %r11d, %r12d,  4)
769	ff_rnd(%r9d, %r10d, %r11d, %r12d,  3)
770	ff_rnd(%r9d, %r10d, %r11d, %r12d,  2)
771	ff_rnd(%r9d, %r10d, %r11d, %r12d,  1)
772	fl_rnd(%r9d, %r10d, %r11d, %r12d,  0)
773
774	/ Copy results from r9d..r12d to the output block whose pointer was saved at (rsp)
775	mov	(%rsp), %rbx
776	mov	%r9d, (%rbx)
777	mov	%r10d, 4(%rbx)
778	mov	%r11d, 8(%rbx)
779	mov	%r12d, 12(%rbx)
780	xor	%rax, %rax	/ rax = 0 on the normal path
7814:	/ Restore registers
782	mov	1*8(%rsp), %rbx
783	mov	2*8(%rsp), %rbp
784	mov	3*8(%rsp), %r12
785	add	$[4*8], %rsp
786	ret
787
788	SET_SIZE(aes_encrypt_amd64)
789
790/*
791 * OpenSolaris OS:
792 * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
793 *	const uint32_t ct[4], uint32_t pt[4]);
794 * Arguments: %rdi = ks, %esi = Nr, %rdx = ct, %rcx = pt.  %rax is left as 0, or as -1 (pt unwritten) if Nr is not 10, 12 or 14.
795 * Original interface:
796 * int aes_decrypt(const unsigned char *in,
797 *	unsigned char *out, const aes_decrypt_ctx cx[1]);
798 */
799	.align	64
800dec_tab:
801	dec_vals(v8)
802#ifdef	LAST_ROUND_TABLES
803	/ Last Round Tables: second table, reached by adding 2048 to the table pointer
804	dec_vals(w8)
805#endif
806
807
808	ENTRY_NP(aes_decrypt_amd64)
809#ifdef	GLADMAN_INTERFACE
810	/ Original interface
811	sub	$[4*8], %rsp	/ gnu/linux/opensolaris binary interface
812	mov	%rsi, (%rsp)	/ output pointer (P2)
813	mov	%rdx, %r8	/ context (P3)
814
815	mov	%rbx, 1*8(%rsp)	/ P1: input pointer in rdi
816	mov	%rbp, 2*8(%rsp)	/ P2: output pointer in (rsp)
817	mov	%r12, 3*8(%rsp)	/ P3: context in r8
818	movzx	4*KS_LENGTH(kptr), %esi	/ Get byte key length * 16 (NOTE(review): no size suffix on this memory load; confirm a byte is read)
819
820#else
821	/ OpenSolaris OS interface
822	sub	$[4*8], %rsp	/ Make room on stack to save registers
823	mov	%rcx, (%rsp)	/ Save output pointer (P4) on stack
824	mov	%rdi, %r8	/ context (P1)
825	mov	%rdx, %rdi	/ P3: save input pointer
826	shl	$4, %esi	/ P2: esi = number of rounds * 16
827
828	mov	%rbx, 1*8(%rsp)	/ Save registers
829	mov	%rbp, 2*8(%rsp)	/ rbp is used as the table pointer
830	mov	%r12, 3*8(%rsp)	/ r12d is the fourth round temporary
831	/ P1: context in r8
832	/ P2: byte key length * 16 in esi
833	/ P3: input pointer in rdi
834	/ P4: output pointer in (rsp)
835#endif	/* GLADMAN_INTERFACE */
836
837	lea	dec_tab(%rip), tptr	/ rbp = address of dec_tab
838	sub	$rofs, kptr	/ bias r8; the round-key references add the bias back
839
840	/ Load input block into registers
841	mov	(%rdi), %eax
842	mov	1*4(%rdi), %ebx
843	mov	2*4(%rdi), %ecx
844	mov	3*4(%rdi), %edx
845	/ rdi is free after the loads above; point it at the first round key to apply
846#ifdef AES_REV_DKS
847	mov	kptr, %rdi	/ reversed schedule: first key is at the low end
848	lea	(kptr,%rsi), kptr	/ r8 += 16 * rounds; round keys are addressed backwards from here
849#else
850	lea	(kptr,%rsi), %rdi	/ non-reversed schedule: first key is 16 * rounds bytes up, r8 stays put
851#endif
852	/ XOR the state with the four words of that first key
853	xor	rofs(%rdi), %eax
854	xor	rofs+4(%rdi), %ebx
855	xor	rofs+8(%rdi), %ecx
856	xor	rofs+12(%rdi), %edx
857
858	/ Jump based on byte key length * 16:
859	cmp	$[10*16], %esi
860	je	3f
861	cmp	$[12*16], %esi
862	je	2f
863	cmp	$[14*16], %esi
864	je	1f
865	mov	$-1, %rax	/ error: round count is not 10, 12 or 14; output block is not written
866	jmp	4f
867
868	/ Perform normal inverse rounds; labels 1, 2 and 3 are the entry points for 14, 12 and 10 rounds
8691:	ii_rnd(%r9d, %r10d, %r11d, %r12d, 13)
870	ii_rnd(%r9d, %r10d, %r11d, %r12d, 12)
8712:	ii_rnd(%r9d, %r10d, %r11d, %r12d, 11)
872	ii_rnd(%r9d, %r10d, %r11d, %r12d, 10)
8733:	ii_rnd(%r9d, %r10d, %r11d, %r12d,  9)
874	ii_rnd(%r9d, %r10d, %r11d, %r12d,  8)
875	ii_rnd(%r9d, %r10d, %r11d, %r12d,  7)
876	ii_rnd(%r9d, %r10d, %r11d, %r12d,  6)
877	ii_rnd(%r9d, %r10d, %r11d, %r12d,  5)
878	ii_rnd(%r9d, %r10d, %r11d, %r12d,  4)
879	ii_rnd(%r9d, %r10d, %r11d, %r12d,  3)
880	ii_rnd(%r9d, %r10d, %r11d, %r12d,  2)
881	ii_rnd(%r9d, %r10d, %r11d, %r12d,  1)
882	il_rnd(%r9d, %r10d, %r11d, %r12d,  0)
883
884	/ Copy results from r9d..r12d to the output block whose pointer was saved at (rsp)
885	mov	(%rsp), %rbx
886	mov	%r9d, (%rbx)
887	mov	%r10d, 4(%rbx)
888	mov	%r11d, 8(%rbx)
889	mov	%r12d, 12(%rbx)
890	xor	%rax, %rax	/ rax = 0 on the normal path
8914:	/ Restore registers
892	mov	1*8(%rsp), %rbx
893	mov	2*8(%rsp), %rbp
894	mov	3*8(%rsp), %r12
895	add	$[4*8], %rsp
896	ret
897
898	SET_SIZE(aes_decrypt_amd64)
899#endif	/* lint || __lint */
900