xref: /freebsd/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aes_amd64.S (revision 61145dc2b94f12f6a47344fb9aac702321880e43)
// SPDX-License-Identifier: Brian-Gladman-3-Clause
/*
 * ---------------------------------------------------------------------------
 * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
 *
 * LICENSE TERMS
 *
 * The free distribution and use of this software is allowed (with or without
 * changes) provided that:
 *
 *  1. source code distributions include the above copyright notice, this
 *     list of conditions and the following disclaimer;
 *
 *  2. binary distributions include the above copyright notice, this list
 *     of conditions and the following disclaimer in their documentation;
 *
 *  3. the name of the copyright holder is not used to endorse products
 *     built using this software without specific written permission.
 *
 * DISCLAIMER
 *
 * This software is provided 'as is' with no explicit or implied warranties
 * in respect of its properties, including, but not limited to, correctness
 * and/or fitness for purpose.
 * ---------------------------------------------------------------------------
 * Issue 20/12/2007
 *
 * I am grateful to Dag Arne Osvik for many discussions of the techniques that
 * can be used to optimise AES assembler code on AMD64/EM64T architectures.
 * Some of the techniques used in this implementation are the result of
 * suggestions made by him for which I am most grateful.
 *
 * An AES implementation for AMD64 processors using the YASM assembler.  This
 * implementation provides only encryption and decryption, and hence requires
 * key scheduling support in C.  It uses 8k bytes of tables but its encryption
 * and decryption performance is very close to that obtained using large
 * tables.  It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling
 * conventions, which are as follows:
 *               ms windows  gnu/linux/opensolaris os
 *
 *   in_blk          rcx     rdi
 *   out_blk         rdx     rsi
 *   context (cx)     r8     rdx
 *
 *   preserved       rsi      -    + rbx, rbp, rsp, r12, r13, r14 & r15
 *   registers       rdi      -      on both
 *
 *   destroyed        -      rsi   + rax, rcx, rdx, r8, r9, r10 & r11
 *   registers        -      rdi     on both
 *
 * The convention used here is that for gnu/linux/opensolaris os.
 *
 * This code provides the standard AES block size (128 bits, 16 bytes) and the
 * three standard AES key sizes (128, 192 and 256 bits). It has the same call
 * interface as my C implementation.  The original version uses the Microsoft
 * C AMD64 calling conventions, in which the three parameters are placed in
 * rcx, rdx and r8 respectively.  The rbx, rsi, rdi, rbp and r12..r15
 * registers are preserved.
 *
 * OpenSolaris Note:
 * Modified to use GNU/Linux/Solaris calling conventions.
 * That is, parameters are placed in rdi, rsi, rdx, and rcx, respectively.
 *
 *     AES_RETURN aes_encrypt(const unsigned char in_blk[],
 *                   unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
 *
 *     AES_RETURN aes_decrypt(const unsigned char in_blk[],
 *                   unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
 *
 *     AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
 *                                            const aes_encrypt_ctx cx[1]);
 *
 *     AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
 *                                            const aes_decrypt_ctx cx[1]);
 *
 *     AES_RETURN aes_encrypt_key(const unsigned char key[],
 *                           unsigned int len, const aes_encrypt_ctx cx[1]);
 *
 *     AES_RETURN aes_decrypt_key(const unsigned char key[],
 *                           unsigned int len, const aes_decrypt_ctx cx[1]);
 *
 * where <NNN> is 128, 192 or 256.  In the last two calls the length can be in
 * either bits or bytes.
 *
 * Comment in/out the following lines to obtain the desired subroutines. These
 * selections MUST match those in the C header file aesopt.h
 */
/*
 * Build-time switches.
 *
 * AES_REV_DKS selects how the decryption round keys are addressed: when it
 * is defined, ik_ref() walks the decryption key schedule with the same
 * descending offsets that fk_ref() uses for encryption.
 */
#define	AES_REV_DKS	  /* define if key decryption schedule is reversed */

/*
 * LAST_ROUND_TABLES appends a second 2048-byte table to enc_tab and dec_tab
 * and selects the fl_rnd/il_rnd variants that use it for the final round.
 */
#define	LAST_ROUND_TABLES /* define for the faster version using extra tables */

/*
 * The encryption key schedule has the following in-memory layout, where N is
 * the number of rounds (10, 12 or 14):
 *
 * lo: | input key (round 0)  |  (each round is four 32-bit words)
 *     | encryption round 1   |
 *     | encryption round 2   |
 *     ....
 *     | encryption round N-1 |
 * hi: | encryption round N   |
 *
 * The decryption key schedule is normally set up so that it has the same
 * layout as above by actually reversing the order of the encryption key
 * schedule in memory (this happens when AES_REV_DKS is set):
 *
 * lo: | decryption round 0   | =              | encryption round N   |
 *     | decryption round 1   | = INV_MIX_COL[ | encryption round N-1 | ]
 *     | decryption round 2   | = INV_MIX_COL[ | encryption round N-2 | ]
 *     ....                       ....
 *     | decryption round N-1 | = INV_MIX_COL[ | encryption round 1   | ]
 * hi: | decryption round N   | =              | input key (round 0)  |
 *
 * with rounds except the first and last modified using inv_mix_column().
 * But if AES_REV_DKS is NOT set the order of keys is left as it is for
 * encryption so that it has to be accessed in reverse when used for
 * decryption (although the inverse mix column modifications are done):
 *
 * lo: | decryption round 0   | =              | input key (round 0)  |
 *     | decryption round 1   | = INV_MIX_COL[ | encryption round 1   | ]
 *     | decryption round 2   | = INV_MIX_COL[ | encryption round 2   | ]
 *     ....                       ....
 *     | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
 * hi: | decryption round N   | =              | encryption round N   |
 *
 * This layout is faster when the assembler key scheduling provided here
 * is used.
 *
 * End of user defines
 */

/*
 * ---------------------------------------------------------------------------
 * OpenSolaris OS modifications
 *
 * This source originates from Brian Gladman file aes_amd64.asm
 * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip
 * with these changes:
 *
 * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and
 * !__GNUC__ ifdefs.  Also removed ENCRYPTION, DECRYPTION,
 * AES_128, AES_192, AES_256, AES_VAR ifdefs.
 *
 * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define
 *
 * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef
 *
 * 4. Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax
 * (operands reversed, literals prefixed with "$", registers prefixed with "%",
 * "[register+offset]" addressing changed to "offset(register)",
 * parentheses in constant expressions "()" changed to square brackets "[]",
 * "." removed from local (numeric) labels, and other changes).
 * Examples:
 * Intel/yasm/nasm Syntax	ATT/OpenSolaris Syntax
 * mov	rax,(4*20h)		mov	$[4*0x20],%rax
 * mov	rax,[ebx+20h]		mov	0x20(%ebx),%rax
 * lea	rax,[ebx+ecx]		lea	(%ebx,%ecx),%rax
 * sub	rax,[ebx+ecx*4-20h]	sub	-0x20(%ebx,%ecx,4),%rax
 *
 * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function
 * definitions for lint.
 *
 * 6. Renamed functions and reordered parameters to match OpenSolaris:
 * Original Gladman interface:
 *	int aes_encrypt(const unsigned char *in,
 *		unsigned char *out, const aes_encrypt_ctx cx[1]);
 *	int aes_decrypt(const unsigned char *in,
 *		unsigned char *out, const aes_decrypt_ctx cx[1]);
 * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t,
 * and a union type, inf., containing inf.l, a uint32_t and
 * inf.b, a 4-element array of uint32_t.  Only b[0] in the array (aka "l") is
 * used and contains the key schedule length * 16, where the key schedule
 * length is the number of rounds: 10, 12, or 14.
 *
 * OpenSolaris OS interface:
 *	void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
 *		const uint32_t pt[4], uint32_t ct[4]);
 *	void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
 *		const uint32_t ct[4], uint32_t pt[4]);
 *	typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4];
 *		 uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t;
 * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
 * ct is crypto text, and MAX_AES_NR is 14.
 * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
 */

187#if defined(lint) || defined(__lint)
188
189#include <sys/types.h>
190void
191aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4],
192       uint32_t ct[4]) {
193		(void) rk, (void) Nr, (void) pt, (void) ct;
194}
195void
196aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4],
197       uint32_t pt[4]) {
198		(void) rk, (void) Nr, (void) pt, (void) ct;
199}
200
201
202#else
203
#define _ASM
#include <sys/asm_linkage.h>

/*
 * Number of 32-bit words in the key schedule array of the original Gladman
 * context; 4*KS_LENGTH is used as the byte offset of the field that follows
 * it (GLADMAN_INTERFACE builds only).
 */
#define	KS_LENGTH	60

/* 32-bit names for the low halves of the 64-bit registers */
#define	raxd		eax
#define	rdxd		edx
#define	rcxd		ecx
#define	rbxd		ebx
#define	rsid		esi
#define	rdid		edi

/* 8-bit names for the low bytes of the 64-bit registers */
#define	raxb		al
#define	rdxb		dl
#define	rcxb		cl
#define	rbxb		bl
#define	rsib		sil
#define	rdib		dil

// finite field multiplies by {02}, {04} and {08}
//
// Each macro shifts x left and XORs in a multiple of 0x11b for every bit
// shifted past bit 7, so for a byte-sized x the result is again a byte.
// The argument is parenthesized so that an expression can be passed safely.

#define	f2(x) (((x)<<1)^((((x)>>7)&1)*0x11b))
#define	f4(x) (((x)<<2)^((((x)>>6)&1)*0x11b)^((((x)>>6)&2)*0x11b))
#define	f8(x) (((x)<<3)^((((x)>>5)&1)*0x11b)^((((x)>>5)&2)*0x11b)^((((x)>>5)&4)*0x11b))

// finite field multiplies required in table generation

#define	f3(x) ((f2(x)) ^ (x))
#define	f9(x) ((f8(x)) ^ (x))
#define	fb(x) ((f8(x)) ^ (f2(x)) ^ (x))
#define	fd(x) ((f8(x)) ^ (f4(x)) ^ (x))
#define	fe(x) ((f8(x)) ^ (f4(x)) ^ (f2(x)))

// macros for expanding S-box data
//
// Each expands one S-box byte into an 8-byte table entry:
//   u8 - forward round entry: the 4-byte column (2x, x, x, 3x) twice
//   v8 - inverse round entry: the column (14x, 9x, 13x, 11x) twice, except
//        that the final byte is the plain value x (read through tab_i)
//   w8 - last round entry: x followed by three zero bytes, twice

#define	u8(x) (f2(x)), (x), (x), (f3(x)), (f2(x)), (x), (x), (f3(x))
#define	v8(x) (fe(x)), (f9(x)), (fd(x)), (fb(x)), (fe(x)), (f9(x)), (fd(x)), (x)
#define	w8(x) (x), 0, 0, 0, (x), 0, 0, 0

/*
 * enc_vals(x): emit the 256 S-box bytes used for encryption, in index order,
 * each passed through the expansion macro x (u8 or w8).  Every value expands
 * to 8 bytes, so one invocation emits a 2048-byte table.
 */
#define	enc_vals(x)	\
	.byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \
	.byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \
	.byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \
	.byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \
	.byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \
	.byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \
	.byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \
	.byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \
	.byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \
	.byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \
	.byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \
	.byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \
	.byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \
	.byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \
	.byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \
	.byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \
	.byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \
	.byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \
	.byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \
	.byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \
	.byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \
	.byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \
	.byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \
	.byte x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \
	.byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \
	.byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \
	.byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \
	.byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \
	.byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \
	.byte x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \
	.byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \
	.byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16)

/*
 * dec_vals(x): emit the 256 S-box bytes used for decryption, in index order,
 * each passed through the expansion macro x (v8 or w8).  Every value expands
 * to 8 bytes, so one invocation emits a 2048-byte table.
 */
#define	dec_vals(x) \
	.byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \
	.byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \
	.byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \
	.byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \
	.byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \
	.byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \
	.byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \
	.byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \
	.byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \
	.byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \
	.byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \
	.byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \
	.byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \
	.byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \
	.byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \
	.byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \
	.byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \
	.byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \
	.byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \
	.byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \
	.byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \
	.byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \
	.byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \
	.byte x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \
	.byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \
	.byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \
	.byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \
	.byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \
	.byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \
	.byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \
	.byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \
	.byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d)

/*
 * Register roles and addressing helpers shared by the round macros.
 *
 * fk_ref(x, y) / ik_ref(x, y) address 32-bit word y of round key x relative
 * to kptr, which the entry points bias by fofs/rofs before use.  With
 * AES_REV_DKS the inverse schedule is walked with the same descending
 * offsets as the forward one; without it the offsets ascend.
 */
#define	tptr	%rbp	/* table pointer */
#define	kptr	%r8	/* key schedule pointer */
#define	fofs	128	/* adjust offset in key schedule to keep |disp| < 128 */
#define	fk_ref(x, y)	-16*x+fofs+4*y(kptr)

#ifdef	AES_REV_DKS
#define	rofs		128
#define	ik_ref(x, y)	-16*x+rofs+4*y(kptr)

#else
#define	rofs		-128
#define	ik_ref(x, y)	16*x+rofs+4*y(kptr)
#endif	/* AES_REV_DKS */

/*
 * Table entries are 8 bytes wide (index scaled by 8) and hold a 4-byte
 * column twice, so a 32-bit load at byte offset 0, 3, 2 or 1 returns that
 * column rotated left by 0, 8, 16 or 24 bits.  tab_f and tab_i address the
 * single byte of a u8/v8 entry that holds the unmultiplied S-box value
 * (byte 1 and byte 7 respectively).
 */
#define	tab_0(x)	(tptr,x,8)
#define	tab_1(x)	3(tptr,x,8)
#define	tab_2(x)	2(tptr,x,8)
#define	tab_3(x)	1(tptr,x,8)
#define	tab_f(x)	1(tptr,x,8)
#define	tab_i(x)	7(tptr,x,8)

/*
 * Forward (encryption) round macros.
 *
 * On entry the state columns are in %eax, %ebx, %ecx, %edx.  p1..p4 are
 * 32-bit scratch registers that receive the new state; %esi/%edi (%rsi/%rdi)
 * are used for table indices and are destroyed.  Flags are destroyed.
 * The input registers are shifted right by 16 as their bytes are consumed.
 */

/* Load the four words of round key `round` into p1..p4. */
#define	fwd_keys(p1, p2, p3, p4, round) \
	mov	fk_ref(round,0), p1; \
	mov	fk_ref(round,1), p2; \
	mov	fk_ref(round,2), p3; \
	mov	fk_ref(round,3), p4

/*
 * Consume one input column: bytes 0..3 of src (lo/hi are its low two byte
 * registers) index the table and are XORed into d0..d3 respectively.
 */
#define	fwd_col(src, lo, hi, d0, d1, d2, d3) \
	movzx	lo, %esi; \
	movzx	hi, %edi; \
	shr	$16, src; \
	xor	tab_0(%rsi), d0; \
	xor	tab_1(%rdi), d1; \
	movzx	lo, %esi; \
	movzx	hi, %edi; \
	xor	tab_2(%rsi), d2; \
	xor	tab_3(%rdi), d3

/* Apply fwd_col to all four columns with the forward byte-to-column map. */
#define	fwd_cols(p1, p2, p3, p4) \
	fwd_col(%eax, %al, %ah, p1, p4, p3, p2); \
	fwd_col(%ebx, %bl, %bh, p2, p1, p4, p3); \
	fwd_col(%ecx, %cl, %ch, p3, p2, p1, p4); \
	fwd_col(%edx, %dl, %dh, p4, p3, p2, p1)

#define	ff_rnd(p1, p2, p3, p4, round)	/* normal forward round */ \
	fwd_keys(p1, p2, p3, p4, round); \
	fwd_cols(p1, p2, p3, p4); \
 \
	mov	p1, %eax; \
	mov	p2, %ebx; \
	mov	p3, %ecx; \
	mov	p4, %edx

#ifdef	LAST_ROUND_TABLES

/*
 * Last round using the second 2048-byte table: tptr is advanced past the
 * main table first (and is left advanced).  The result stays in p1..p4.
 */
#define	fl_rnd(p1, p2, p3, p4, round)	/* last forward round */ \
	add	$2048, tptr; \
	fwd_keys(p1, p2, p3, p4, round); \
	fwd_cols(p1, p2, p3, p4)

#else

/*
 * Last round without extra tables: each S-box byte is fetched on its own
 * through tab_f (an explicit byte load, hence movzbl) and rotated into the
 * byte lane given by its position in the source column.
 */
#define	fwd_last_col(src, lo, hi, d0, d1, d2, d3) \
	movzx	lo, %esi; \
	movzx	hi, %edi; \
	shr	$16, src; \
	movzbl	tab_f(%rsi), %esi; \
	movzbl	tab_f(%rdi), %edi; \
	xor	%esi, d0; \
	rol	$8, %edi; \
	xor	%edi, d1; \
	movzx	lo, %esi; \
	movzx	hi, %edi; \
	movzbl	tab_f(%rsi), %esi; \
	movzbl	tab_f(%rdi), %edi; \
	rol	$16, %esi; \
	rol	$24, %edi; \
	xor	%esi, d2; \
	xor	%edi, d3

#define	fl_rnd(p1, p2, p3, p4, round)	/* last forward round */ \
	fwd_keys(p1, p2, p3, p4, round); \
 \
	fwd_last_col(%eax, %al, %ah, p1, p4, p3, p2); \
	fwd_last_col(%ebx, %bl, %bh, p2, p1, p4, p3); \
	fwd_last_col(%ecx, %cl, %ch, p3, p2, p1, p4); \
	fwd_last_col(%edx, %dl, %dh, p4, p3, p2, p1)

#endif	/* LAST_ROUND_TABLES */

/*
 * Inverse (decryption) round macros.
 *
 * On entry the state columns are in %eax, %ebx, %ecx, %edx.  p1..p4 are
 * 32-bit scratch registers that receive the new state; %esi/%edi (%rsi/%rdi)
 * are used for table indices and are destroyed.  Flags are destroyed.
 * The input registers are shifted right by 16 as their bytes are consumed.
 */

/* Load the four words of inverse round key `round` into p1..p4. */
#define	inv_keys(p1, p2, p3, p4, round) \
	mov	ik_ref(round,0), p1; \
	mov	ik_ref(round,1), p2; \
	mov	ik_ref(round,2), p3; \
	mov	ik_ref(round,3), p4

/*
 * Consume one input column: bytes 0..3 of src (lo/hi are its low two byte
 * registers) index the table and are XORed into d0..d3 respectively.
 */
#define	inv_col(src, lo, hi, d0, d1, d2, d3) \
	movzx	lo, %esi; \
	movzx	hi, %edi; \
	shr	$16, src; \
	xor	tab_0(%rsi), d0; \
	xor	tab_1(%rdi), d1; \
	movzx	lo, %esi; \
	movzx	hi, %edi; \
	xor	tab_2(%rsi), d2; \
	xor	tab_3(%rdi), d3

/* Apply inv_col to all four columns with the inverse byte-to-column map. */
#define	inv_cols(p1, p2, p3, p4) \
	inv_col(%eax, %al, %ah, p1, p2, p3, p4); \
	inv_col(%ebx, %bl, %bh, p2, p3, p4, p1); \
	inv_col(%ecx, %cl, %ch, p3, p4, p1, p2); \
	inv_col(%edx, %dl, %dh, p4, p1, p2, p3)

#define	ii_rnd(p1, p2, p3, p4, round)	/* normal inverse round */ \
	inv_keys(p1, p2, p3, p4, round); \
	inv_cols(p1, p2, p3, p4); \
 \
	mov	p1, %eax; \
	mov	p2, %ebx; \
	mov	p3, %ecx; \
	mov	p4, %edx

#ifdef	LAST_ROUND_TABLES

/*
 * Last round using the second 2048-byte table: tptr is advanced past the
 * main table first (and is left advanced).  The result stays in p1..p4.
 */
#define	il_rnd(p1, p2, p3, p4, round)	/* last inverse round */ \
	add	$2048, tptr; \
	inv_keys(p1, p2, p3, p4, round); \
	inv_cols(p1, p2, p3, p4)

#else

/*
 * Last round without extra tables: each inverse S-box byte is fetched on
 * its own through tab_i (an explicit byte load, hence movzbl) and rotated
 * into the byte lane given by its position in the source column.
 */
#define	inv_last_col(src, lo, hi, d0, d1, d2, d3) \
	movzx	lo, %esi; \
	movzx	hi, %edi; \
	movzbl	tab_i(%rsi), %esi; \
	movzbl	tab_i(%rdi), %edi; \
	shr	$16, src; \
	xor	%esi, d0; \
	rol	$8, %edi; \
	xor	%edi, d1; \
	movzx	lo, %esi; \
	movzx	hi, %edi; \
	movzbl	tab_i(%rsi), %esi; \
	movzbl	tab_i(%rdi), %edi; \
	rol	$16, %esi; \
	rol	$24, %edi; \
	xor	%esi, d2; \
	xor	%edi, d3

#define	il_rnd(p1, p2, p3, p4, round)	/* last inverse round */ \
	inv_keys(p1, p2, p3, p4, round); \
 \
	inv_last_col(%eax, %al, %ah, p1, p2, p3, p4); \
	inv_last_col(%ebx, %bl, %bh, p2, p3, p4, p1); \
	inv_last_col(%ecx, %cl, %ch, p3, p4, p1, p2); \
	inv_last_col(%edx, %dl, %dh, p4, p1, p2, p3)

#endif	/* LAST_ROUND_TABLES */

/*
 * OpenSolaris OS:
 * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
 *	const uint32_t pt[4], uint32_t ct[4]);
 *
 * Original interface:
 * int aes_encrypt(const unsigned char *in,
 *	unsigned char *out, const aes_encrypt_ctx cx[1]);
 *
 * Encrypts one 16-byte block.
 *
 * In (OpenSolaris interface):
 *	rdi = ks, esi = Nr, rdx = pt (input block), rcx = ct (output block)
 * Register use in the body:
 *	r8  (kptr)		key schedule pointer, biased by fofs
 *	rbp (tptr)		enc_tab
 *	eax, ebx, ecx, edx	current state columns
 *	r9d, r10d, r11d, r12d	next state / final result
 *	rsi, rdi		table indices inside the round macros
 * rbx, rbp and r12 are saved on the stack and restored before returning.
 * rax is left as 0 after a block has been written, or -1 (nothing written)
 * when Nr * 16 is not 10*16, 12*16 or 14*16.
 */
SECTION_STATIC
.balign	64
enc_tab:
	enc_vals(u8)
#ifdef	LAST_ROUND_TABLES
	// Last Round Tables:
	enc_vals(w8)
#endif


ENTRY_NP(aes_encrypt_amd64)
	ENDBR
#ifdef	GLADMAN_INTERFACE
	// Original interface
	sub	$(4*8), %rsp	// gnu/linux/opensolaris binary interface
	mov	%rsi, (%rsp)	// output pointer (P2)
	mov	%rdx, %r8	// context (P3)

	mov	%rbx, 1*8(%rsp)	// P1: input pointer in rdi
	mov	%rbp, 2*8(%rsp)	// P2: output pointer in (rsp)
	mov	%r12, 3*8(%rsp)	// P3: context in r8
	movzbl	4*KS_LENGTH(kptr), %esi	// Get number of rounds * 16 (one byte)

#else
	// OpenSolaris OS interface
	sub	$(4*8), %rsp	// Make room on stack to save registers
	mov	%rcx, (%rsp)	// Save output pointer (P4) on stack
	mov	%rdi, %r8	// context (P1)
	mov	%rdx, %rdi	// P3: save input pointer
	shl	$4, %esi	// P2: esi = number of rounds * 16

	mov	%rbx, 1*8(%rsp)	// Save registers
	mov	%rbp, 2*8(%rsp)
	mov	%r12, 3*8(%rsp)
	// P1: context in r8
	// P2: number of rounds * 16 in esi
	// P3: input pointer in rdi
	// P4: output pointer in (rsp)
#endif	/* GLADMAN_INTERFACE */

	lea	enc_tab(%rip), tptr
	sub	$fofs, kptr

	// Load input block into registers
	mov	(%rdi), %eax
	mov	1*4(%rdi), %ebx
	mov	2*4(%rdi), %ecx
	mov	3*4(%rdi), %edx

	// Add round key 0
	xor	fofs(kptr), %eax
	xor	fofs+4(kptr), %ebx
	xor	fofs+8(kptr), %ecx
	xor	fofs+12(kptr), %edx

	// Point kptr at the last round key (less fofs) so that fk_ref(n, y)
	// addresses the key that is n rounds from the end
	lea	(kptr,%rsi), kptr
	// Jump based on number of rounds * 16:
	cmp	$(10*16), %esi
	je	3f
	cmp	$(12*16), %esi
	je	2f
	cmp	$(14*16), %esi
	je	1f
	mov	$-1, %rax	// error
	jmp	4f

	// Perform normal forward rounds
1:	ff_rnd(%r9d, %r10d, %r11d, %r12d, 13)
	ff_rnd(%r9d, %r10d, %r11d, %r12d, 12)
2:	ff_rnd(%r9d, %r10d, %r11d, %r12d, 11)
	ff_rnd(%r9d, %r10d, %r11d, %r12d, 10)
3:	ff_rnd(%r9d, %r10d, %r11d, %r12d,  9)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  8)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  7)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  6)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  5)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  4)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  3)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  2)
	ff_rnd(%r9d, %r10d, %r11d, %r12d,  1)
	fl_rnd(%r9d, %r10d, %r11d, %r12d,  0)

	// Copy results
	mov	(%rsp), %rbx
	mov	%r9d, (%rbx)
	mov	%r10d, 4(%rbx)
	mov	%r11d, 8(%rbx)
	mov	%r12d, 12(%rbx)
	xor	%eax, %eax	// rax = 0 (32-bit write clears the upper half)
4:	// Restore registers
	mov	1*8(%rsp), %rbx
	mov	2*8(%rsp), %rbp
	mov	3*8(%rsp), %r12
	add	$(4*8), %rsp
	RET

	SET_SIZE(aes_encrypt_amd64)

/*
 * OpenSolaris OS:
 * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
 *	const uint32_t ct[4], uint32_t pt[4]);
 *
 * Original interface:
 * int aes_decrypt(const unsigned char *in,
 *	unsigned char *out, const aes_encrypt_ctx cx[1]);
 *
 * Decrypts one 16-byte block.
 *
 * In (OpenSolaris interface):
 *	rdi = ks, esi = Nr, rdx = ct (input block), rcx = pt (output block)
 * Register use in the body:
 *	r8  (kptr)		key schedule pointer, biased by rofs
 *	rbp (tptr)		dec_tab
 *	eax, ebx, ecx, edx	current state columns
 *	r9d, r10d, r11d, r12d	next state / final result
 *	rsi, rdi		table indices inside the round macros
 * rbx, rbp and r12 are saved on the stack and restored before returning.
 * rax is left as 0 after a block has been written, or -1 (nothing written)
 * when Nr * 16 is not 10*16, 12*16 or 14*16.
 */
SECTION_STATIC
.balign	64
dec_tab:
	dec_vals(v8)
#ifdef	LAST_ROUND_TABLES
	// Last Round Tables:
	dec_vals(w8)
#endif


ENTRY_NP(aes_decrypt_amd64)
	ENDBR
#ifdef	GLADMAN_INTERFACE
	// Original interface
	sub	$(4*8), %rsp	// gnu/linux/opensolaris binary interface
	mov	%rsi, (%rsp)	// output pointer (P2)
	mov	%rdx, %r8	// context (P3)

	mov	%rbx, 1*8(%rsp)	// P1: input pointer in rdi
	mov	%rbp, 2*8(%rsp)	// P2: output pointer in (rsp)
	mov	%r12, 3*8(%rsp)	// P3: context in r8
	movzbl	4*KS_LENGTH(kptr), %esi	// Get number of rounds * 16 (one byte)

#else
	// OpenSolaris OS interface
	sub	$(4*8), %rsp	// Make room on stack to save registers
	mov	%rcx, (%rsp)	// Save output pointer (P4) on stack
	mov	%rdi, %r8	// context (P1)
	mov	%rdx, %rdi	// P3: save input pointer
	shl	$4, %esi	// P2: esi = number of rounds * 16

	mov	%rbx, 1*8(%rsp)	// Save registers
	mov	%rbp, 2*8(%rsp)
	mov	%r12, 3*8(%rsp)
	// P1: context in r8
	// P2: number of rounds * 16 in esi
	// P3: input pointer in rdi
	// P4: output pointer in (rsp)
#endif	/* GLADMAN_INTERFACE */

	lea	dec_tab(%rip), tptr
	sub	$rofs, kptr

	// Load input block into registers
	mov	(%rdi), %eax
	mov	1*4(%rdi), %ebx
	mov	2*4(%rdi), %ecx
	mov	3*4(%rdi), %edx

	// rdi = where the first round key to apply lives (less rofs); the
	// input pointer is no longer needed
#ifdef AES_REV_DKS
	mov	kptr, %rdi
	lea	(kptr,%rsi), kptr
#else
	lea	(kptr,%rsi), %rdi
#endif

	// Add the first round key
	xor	rofs(%rdi), %eax
	xor	rofs+4(%rdi), %ebx
	xor	rofs+8(%rdi), %ecx
	xor	rofs+12(%rdi), %edx

	// Jump based on number of rounds * 16:
	cmp	$(10*16), %esi
	je	3f
	cmp	$(12*16), %esi
	je	2f
	cmp	$(14*16), %esi
	je	1f
	mov	$-1, %rax	// error
	jmp	4f

	// Perform normal inverse rounds
1:	ii_rnd(%r9d, %r10d, %r11d, %r12d, 13)
	ii_rnd(%r9d, %r10d, %r11d, %r12d, 12)
2:	ii_rnd(%r9d, %r10d, %r11d, %r12d, 11)
	ii_rnd(%r9d, %r10d, %r11d, %r12d, 10)
3:	ii_rnd(%r9d, %r10d, %r11d, %r12d,  9)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  8)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  7)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  6)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  5)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  4)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  3)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  2)
	ii_rnd(%r9d, %r10d, %r11d, %r12d,  1)
	il_rnd(%r9d, %r10d, %r11d, %r12d,  0)

	// Copy results
	mov	(%rsp), %rbx
	mov	%r9d, (%rbx)
	mov	%r10d, 4(%rbx)
	mov	%r11d, 8(%rbx)
	mov	%r12d, 12(%rbx)
	xor	%eax, %eax	// rax = 0 (32-bit write clears the upper half)
4:	// Restore registers
	mov	1*8(%rsp), %rbx
	mov	2*8(%rsp), %rbp
	mov	3*8(%rsp), %r12
	add	$(4*8), %rsp
	RET

	SET_SIZE(aes_decrypt_amd64)
#endif /* lint || __lint */

/* Empty .note.GNU-stack section, emitted for ELF targets only. */
#ifdef __ELF__
.section .note.GNU-stack,"",%progbits
#endif
