xref: /titanic_52/usr/src/common/crypto/aes/amd64/aes_intel.s (revision 0a0e9771ca0211c15f3ac4466b661c145feeb9e4)
1/*
2 * ====================================================================
3 * Written by Intel Corporation for the OpenSSL project to add support
4 * for Intel AES-NI instructions. Rights for redistribution and usage
5 * in source and binary forms are granted according to the OpenSSL
6 * license.
7 *
8 *   Author: Huang Ying <ying.huang at intel dot com>
9 *           Vinodh Gopal <vinodh.gopal at intel dot com>
10 *           Kahraman Akdemir
11 *
12 * Intel AES-NI is a new set of Single Instruction Multiple Data (SIMD)
13 * instructions that are going to be introduced in the next generation
14 * of Intel processor, as of 2009. These instructions enable fast and
15 * secure data encryption and decryption, using the Advanced Encryption
16 * Standard (AES), defined by FIPS Publication number 197. The
17 * architecture introduces six instructions that offer full hardware
18 * support for AES. Four of them support high performance data
19 * encryption and decryption, and the other two instructions support
20 * the AES key expansion procedure.
21 * ====================================================================
22 */
23
24/*
25 * ====================================================================
26 * Copyright (c) 1998-2008 The OpenSSL Project.  All rights reserved.
27 *
28 * Redistribution and use in source and binary forms, with or without
29 * modification, are permitted provided that the following conditions
30 * are met:
31 *
32 * 1. Redistributions of source code must retain the above copyright
33 *    notice, this list of conditions and the following disclaimer.
34 *
35 * 2. Redistributions in binary form must reproduce the above copyright
36 *    notice, this list of conditions and the following disclaimer in
37 *    the documentation and/or other materials provided with the
38 *    distribution.
39 *
40 * 3. All advertising materials mentioning features or use of this
41 *    software must display the following acknowledgment:
42 *    "This product includes software developed by the OpenSSL Project
43 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
44 *
45 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
46 *    endorse or promote products derived from this software without
47 *    prior written permission. For written permission, please contact
48 *    openssl-core@openssl.org.
49 *
50 * 5. Products derived from this software may not be called "OpenSSL"
51 *    nor may "OpenSSL" appear in their names without prior written
52 *    permission of the OpenSSL Project.
53 *
54 * 6. Redistributions of any form whatsoever must retain the following
55 *    acknowledgment:
56 *    "This product includes software developed by the OpenSSL Project
57 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
58 *
59 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
60 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
62 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
63 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
64 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
65 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
66 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
67 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
68 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
69 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
70 * OF THE POSSIBILITY OF SUCH DAMAGE.
71 * ====================================================================
72 */
73
74/*
75 * ====================================================================
76 * OpenSolaris OS modifications
77 *
78 * This source originates as files aes-intel.S and eng_aesni_asm.pl, in
79 * patches sent Dec. 9, 2008 and Dec. 24, 2008, respectively, by
80 * Huang Ying of Intel to the openssl-dev mailing list under the subject
81 * of "Add support to Intel AES-NI instruction set for x86_64 platform".
82 *
83 * This OpenSolaris version has these major changes from the original source:
84 *
85 * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
86 * /usr/include/sys/asm_linkage.h, lint(1B) guards, EXPORT DELETE START
87 * and EXPORT DELETE END markers, and dummy C function definitions for lint.
88 *
89 * 2. Formatted code, added comments, and added #includes and #defines.
90 *
91 * 3. Replaced aes* and palignr instructions with .byte sequences
92 * (as they are not supported yet by all of the gas, as, and aw assemblers).
93 *
94 * 4. If bit CR0.TS is set, clear and set the TS bit, after and before
95 * calling kpreempt_disable() and kpreempt_enable().
96 * If the TS bit is not set, save and restore %xmm registers at the beginning
97 * and end of function calls (%xmm* registers are not saved and restored
98 * during kernel thread preemption).
99 *
100 * 5. Renamed functions, reordered parameters, and changed return value
101 * to match OpenSolaris:
102 *
103 * OpenSSL interface:
104 *	int intel_AES_set_encrypt_key(const unsigned char *userKey,
105 *		const int bits, AES_KEY *key);
106 *	int intel_AES_set_decrypt_key(const unsigned char *userKey,
107 *		const int bits, AES_KEY *key);
108 *	Return values for above are non-zero on error, 0 on success.
109 *
110 *	void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
111 *		const AES_KEY *key);
112 *	void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
113 *		const AES_KEY *key);
114 *	typedef struct aes_key_st {
115 *		unsigned int	rd_key[4 *(AES_MAXNR + 1)];
116 *		int		rounds;
117 *		unsigned int	pad[3];
118 *	} AES_KEY;
119 * Note: AES_LONG is undefined (that is, Intel uses 32-bit key schedules
120 * (ks32) instead of 64-bit (ks64)).
121 * Number of rounds (aka round count) is at offset 240 of AES_KEY.
122 *
123 * OpenSolaris OS interface (#ifdefs removed for readability):
124 *	int rijndael_key_setup_dec_intel(uint32_t rk[],
125 *		const uint32_t cipherKey[], uint64_t keyBits);
126 *	int rijndael_key_setup_enc_intel(uint32_t rk[],
127 *		const uint32_t cipherKey[], uint64_t keyBits);
128 *	Return values for above are 0 on error, number of rounds on success.
129 *
130 *	void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
131 *		const uint32_t pt[4], uint32_t ct[4]);
132 *	void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
133 *		const uint32_t pt[4], uint32_t ct[4]);
134 *	typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4];
135 *		 uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t;
136 *
137 *	typedef union {
138 *		uint32_t	ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
139 *	} aes_ks_t;
140 *	typedef struct aes_key {
141 *		aes_ks_t	encr_ks, decr_ks;
142 *		long double	align128;
143 *		int		flags, nr, type;
144 *	} aes_key_t;
145 *
146 * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
147 * ct is crypto text, and MAX_AES_NR is 14.
148 * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
149 * Note2: aes_ks_t must be aligned on a 0 mod 128-bit (16-byte) boundary.
150 * ====================================================================
151 */
152
153#if defined(lint) || defined(__lint)
154
155#include <sys/types.h>
156
/*
 * Lint-only stand-in for the assembly routine of the same name.
 * The body is intentionally empty; it only gives lint(1B) a C definition.
 */
/* ARGSUSED */
void
aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4],
    uint32_t ct[4])
{
}
/*
 * Lint-only stand-in for the assembly routine of the same name.
 * The body is intentionally empty; it only gives lint(1B) a C definition.
 */
/* ARGSUSED */
void
aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4],
    uint32_t pt[4])
{
}
/*
 * Lint-only stand-in for the assembly routine of the same name.
 * Always returns 0; it only gives lint(1B) a C definition.
 */
/* ARGSUSED */
int
rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
    uint64_t keyBits)
{
	return (0);
}
/*
 * Lint-only stand-in for the assembly routine of the same name.
 * Always returns 0; it only gives lint(1B) a C definition.
 */
/* ARGSUSED */
int
rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
    uint64_t keyBits)
{
	return (0);
}
179
180
181#else	/* lint */
182
183#include <sys/asm_linkage.h>
184#include <sys/controlregs.h>
185#ifdef _KERNEL
186#include <sys/machprivregs.h>
187#endif
188
189#ifdef _KERNEL
190	/*
191	 * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv.  That is,
192	 * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it
193	 * uses it to pass P2 to syscall.
194	 * This also occurs with the STTS macro, but we don't care if
195	 * P2 (%rsi) is modified just before function exit.
196	 * The CLTS and STTS macros push and pop P1 (%rdi) already.
197	 */
198#ifdef __xpv
199#define	PROTECTED_CLTS \
200	push	%rsi; \
201	CLTS; \
202	pop	%rsi
203#else
204#define	PROTECTED_CLTS \
205	CLTS
206#endif	/* __xpv */
207
/*
 * CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg)
 * Entry sequence for routines that use only %xmm0 and %xmm1.
 *
 * Sets up an %rbp frame and copies %cr0 into tmpreg.  If CR0_TS is clear,
 * the stack pointer is rounded down with -XMM_ALIGN (assumes XMM_ALIGN is
 * a power of two), room for two XMM_SIZE slots is reserved, and %xmm0
 * (at 16(%rsp)) and %xmm1 (at (%rsp)) are saved.  If CR0_TS is set, the
 * registers are not saved and PROTECTED_CLTS is run instead.
 *
 * tmpreg has to stay intact until the matching
 * SET_TS_OR_POP_XMM0_XMM1(tmpreg), which tests it again.
 */
#define	CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg) \
	pushq	%rbp; \
	movq	%rsp, %rbp; \
	movq	%cr0, tmpreg; \
	testq	$CR0_TS, tmpreg; \
	jnz	1f; \
	andq	$-XMM_ALIGN, %rsp; \
	subq	$[XMM_SIZE * 2], %rsp; \
	movaps	%xmm1, (%rsp); \
	movaps	%xmm0, 16(%rsp); \
	jmp	2f; \
1: \
	PROTECTED_CLTS; \
2:
222
223	/*
224	 * If CR0_TS was not set above, pop %xmm0 and %xmm1 off stack,
225	 * otherwise set CR0_TS.
226	 */
227#define	SET_TS_OR_POP_XMM0_XMM1(tmpreg) \
228	testq	$CR0_TS, tmpreg; \
229	jnz	1f; \
230	movaps	(%rsp), %xmm1; \
231	movaps	16(%rsp), %xmm0; \
232	jmp	2f; \
2331: \
234	STTS(tmpreg); \
2352: \
236	mov	%rbp, %rsp; \
237	pop	%rbp
238
239	/*
240	 * If CR0_TS is not set, align stack (with push %rbp) and push
241	 * %xmm0 - %xmm6 on stack, otherwise clear CR0_TS
242	 */
243#define	CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg) \
244	push	%rbp; \
245	mov	%rsp, %rbp; \
246	movq    %cr0, tmpreg; \
247	testq	$CR0_TS, tmpreg; \
248	jnz	1f; \
249	and	$-XMM_ALIGN, %rsp; \
250	sub	$[XMM_SIZE * 7], %rsp; \
251	movaps	%xmm0, 96(%rsp); \
252	movaps	%xmm1, 80(%rsp); \
253	movaps	%xmm2, 64(%rsp); \
254	movaps	%xmm3, 48(%rsp); \
255	movaps	%xmm4, 32(%rsp); \
256	movaps	%xmm5, 16(%rsp); \
257	movaps	%xmm6, (%rsp); \
258	jmp	2f; \
2591: \
260	PROTECTED_CLTS; \
2612:
262
263
264	/*
265	 * If CR0_TS was not set above, pop %xmm0 - %xmm6 off stack,
266	 * otherwise set CR0_TS.
267	 */
268#define	SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg) \
269	testq	$CR0_TS, tmpreg; \
270	jnz	1f; \
271	movaps	(%rsp), %xmm6; \
272	movaps	16(%rsp), %xmm5; \
273	movaps	32(%rsp), %xmm4; \
274	movaps	48(%rsp), %xmm3; \
275	movaps	64(%rsp), %xmm2; \
276	movaps	80(%rsp), %xmm1; \
277	movaps	96(%rsp), %xmm0; \
278	jmp	2f; \
2791: \
280	STTS(tmpreg); \
2812: \
282	mov	%rbp, %rsp; \
283	pop	%rbp
284
285
286#else
/*
 * Not _KERNEL: the CR0.TS / %xmm save-and-restore macros expand to nothing.
 */
#define	PROTECTED_CLTS				/* empty */
#define	CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg)	/* empty */
#define	SET_TS_OR_POP_XMM0_XMM1(tmpreg)		/* empty */
#define	CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg)	/* empty */
#define	SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg)	/* empty */
292#endif	/* _KERNEL */
293
294
295/*
296 * _key_expansion_128(), _key_expansion_192a(), _key_expansion_192b(),
297 * _key_expansion_256a(), _key_expansion_256b()
298 *
299 * Helper functions called by rijndael_key_setup_enc_intel().
300 * Also used indirectly by rijndael_key_setup_dec_intel().
301 *
302 * Input:
303 * %xmm0	User-provided cipher key
304 * %xmm1	Round constant
305 * Output:
306 * (%rcx)	AES key
307 */
308
309	/* EXPORT DELETE START */
/*
 * _key_expansion_128 / _key_expansion_256a
 * Produce one 16-byte round key and append it to the key schedule.
 *
 * In:	%xmm0	previous round key (four 32-bit words)
 *	%xmm1	aeskeygenassist result; only dword 3 is used
 *	%xmm4	scratch whose dword 0 must be zero (the caller zeroes %xmm4
 *		once; these helpers never change dword 0)
 *	%rcx	address at which to store the new round key
 * Out:	%xmm0	new round key, also stored at (%rcx)
 *	%rcx	advanced by 16
 * Clobbers %xmm1, %xmm4 (dwords 1-3), and flags.
 */
.align	16
_key_expansion_128:
_key_expansion_256a:
	/ Fold the previous key so that dword i becomes w0 ^ ... ^ wi.
	shufps	$0x10, %xmm0, %xmm4	/ xmm4 = { 0, 0, w1, w0 }
	pxor	%xmm4, %xmm0
	shufps	$0x8c, %xmm0, %xmm4	/ xmm4 = { 0, w0, w0, w1^w2 }
	pxor	%xmm4, %xmm0
	/ Mix the broadcast keygen word into every dword.
	pshufd	$0xff, %xmm1, %xmm1	/ replicate dword 3 into all four
	pxor	%xmm1, %xmm0
	movaps	%xmm0, (%rcx)		/ store new round key (aligned)
	addq	$0x10, %rcx
	ret
	SET_SIZE(_key_expansion_128)
	SET_SIZE(_key_expansion_256a)
324
/*
 * _key_expansion_192a
 * AES-192 key schedule step that appends two 16-byte round keys.
 *
 * In:	%xmm0	four key words from the previous step
 *	%xmm2	low two dwords: the remaining two key words
 *	%xmm1	aeskeygenassist result of %xmm2; only dword 1 is used
 *	%xmm4	scratch whose dword 0 must be zero
 *	%rcx	address at which to store the new round keys
 * Out:	(%rcx)		{ old %xmm2 dwords 0-1, new %xmm0 dwords 0-1 }
 *	0x10(%rcx)	{ new %xmm0 dwords 2-3, new %xmm2 dwords 0-1 }
 *	%xmm0, %xmm2	updated key words for the next step
 *	%rcx		advanced by 32
 * Clobbers %xmm1, %xmm3, %xmm4 (dwords 1-3), %xmm5, %xmm6, and flags.
 */
.align 16
_key_expansion_192a:
	/ New four-word group: folded previous words ^ keygen word.
	shufps	$0x10, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	shufps	$0x8c, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	pshufd	$0x55, %xmm1, %xmm1	/ replicate dword 1 into all four
	pxor	%xmm1, %xmm0

	/ New two-word group in the low half of xmm2.
	movaps	%xmm2, %xmm6		/ keep the old words for the store
	movaps	%xmm2, %xmm5
	pslldq	$4, %xmm5		/ old words shifted up one dword
	pshufd	$0xff, %xmm0, %xmm3	/ replicate new xmm0 dword 3
	pxor	%xmm5, %xmm2
	pxor	%xmm3, %xmm2

	/ Pack the six old/new words into two 16-byte round keys.
	shufps	$0x44, %xmm0, %xmm6	/ { old x2[0], old x2[1], x0[0], x0[1] }
	movaps	%xmm6, (%rcx)
	movaps	%xmm0, %xmm1
	shufps	$0x4e, %xmm2, %xmm1	/ { x0[2], x0[3], new x2[0], new x2[1] }
	movaps	%xmm1, 0x10(%rcx)
	addq	$0x20, %rcx
	ret
	SET_SIZE(_key_expansion_192a)
349
/*
 * _key_expansion_192b
 * AES-192 key schedule step that appends one 16-byte round key.
 *
 * In:	%xmm0	four key words from the previous step
 *	%xmm2	low two dwords: the remaining two key words
 *	%xmm1	aeskeygenassist result of %xmm2; only dword 1 is used
 *	%xmm4	scratch whose dword 0 must be zero
 *	%rcx	address at which to store the new round key
 * Out:	(%rcx)		new %xmm0
 *	%xmm0, %xmm2	updated key words for the next step
 *	%rcx		advanced by 16
 * Clobbers %xmm1, %xmm3, %xmm4 (dwords 1-3), %xmm5, and flags.
 */
.align 16
_key_expansion_192b:
	/ New four-word group: folded previous words ^ keygen word.
	shufps	$0x10, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	shufps	$0x8c, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	pshufd	$0x55, %xmm1, %xmm1	/ replicate dword 1 into all four
	pxor	%xmm1, %xmm0

	/ New two-word group in the low half of xmm2 (kept for the next step).
	pshufd	$0xff, %xmm0, %xmm3	/ replicate new xmm0 dword 3
	movaps	%xmm2, %xmm5
	pslldq	$4, %xmm5		/ old words shifted up one dword
	pxor	%xmm5, %xmm2
	pxor	%xmm3, %xmm2

	movaps	%xmm0, (%rcx)		/ store new round key (aligned)
	addq	$0x10, %rcx
	ret
	SET_SIZE(_key_expansion_192b)
369
/*
 * _key_expansion_256b
 * AES-256 key schedule step for the second half of each key pair.
 *
 * In:	%xmm2	round key generated two steps earlier
 *	%xmm1	aeskeygenassist result of %xmm0; only dword 2 is used
 *	%xmm4	scratch whose dword 0 must be zero
 *	%rcx	address at which to store the new round key
 * Out:	%xmm2	new round key, also stored at (%rcx)
 *	%rcx	advanced by 16
 * Clobbers %xmm1, %xmm4 (dwords 1-3), and flags.
 */
.align 16
_key_expansion_256b:
	/ Fold the older key so that dword i becomes w0 ^ ... ^ wi.
	shufps	$0x10, %xmm2, %xmm4
	pxor	%xmm4, %xmm2
	shufps	$0x8c, %xmm2, %xmm4
	pxor	%xmm4, %xmm2
	/ Mix the broadcast keygen word into every dword.
	pshufd	$0xaa, %xmm1, %xmm1	/ replicate dword 2 into all four
	pxor	%xmm1, %xmm2
	movaps	%xmm2, (%rcx)		/ store new round key (aligned)
	addq	$0x10, %rcx
	ret
	SET_SIZE(_key_expansion_256b)
382	/* EXPORT DELETE END */
383
384
/*
 * rijndael_key_setup_enc_intel()
 * Expand a 128-, 192-, or 256-bit cipher key into the AES encryption
 * key schedule.
 *
 * Kernel callers are responsible for having called kpreempt_disable(),
 * because %xmm registers are not saved/restored.  If CR0.TS is set on
 * entry it is cleared here and set again on exit; if it is clear,
 * %xmm0 - %xmm6 are saved on the stack and restored on exit.
 *
 * The first 16 bytes of the cipher key are read with an unaligned load;
 * the key schedule is written with aligned stores (movaps).
 *
 * OpenSolaris interface:
 * int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
 *	uint64_t keyBits);
 * Returns the number of rounds (10, 12, or 14) on success, or 0 if a
 * pointer is NULL or keyBits is not 128, 192, or 256.  Only the low
 * 32 bits of keyBits are compared.
 *
 * Original Intel OpenSSL interface:
 * int intel_AES_set_encrypt_key(const unsigned char *userKey,
 *	const int bits, AES_KEY *key);
 * Returns 0 on success, -1 for a NULL pointer, -2 for a bad key size;
 * the round count is stored at offset 240 of the key.
 */

#if defined(OPENSSL_INTERFACE)
#define	rijndael_key_setup_enc_intel	intel_AES_set_encrypt_key
#define	rijndael_key_setup_dec_intel	intel_AES_set_decrypt_key

#define	USERCIPHERKEY		rdi	/* P1, 64 bits: user cipher key */
#define	KEYSIZE32		esi	/* P2, 32 bits: key size in bits */
#define	KEYSIZE64		rsi	/* P2, 64 bits: key size in bits */
#define	AESKEY			rdx	/* P3, 64 bits: key schedule */

#else	/* OpenSolaris Interface */
#define	AESKEY			rdi	/* P1, 64 bits: key schedule */
#define	USERCIPHERKEY		rsi	/* P2, 64 bits: user cipher key */
#define	KEYSIZE32		edx	/* P3, 32 bits: key size in bits */
#define	KEYSIZE64		rdx	/* P3, 64 bits: key size in bits */
#endif	/* OPENSSL_INTERFACE */

/* Registers reused as temporaries once their parameter is consumed */
#define	ROUNDS32		KEYSIZE32	/* temp */
#define	ROUNDS64		KEYSIZE64	/* temp */
#define	ENDAESKEY		USERCIPHERKEY	/* temp */

/*
 * Hand-encoded "aeskeygenassist $rcon, %xmmN, %xmm1" for N = 0 and N = 2
 * (emitted as bytes because not all assemblers know the instruction).
 */
#define	KEYGENASSIST_XMM0(rcon)	.byte	0x66, 0x0f, 0x3a, 0xdf, 0xc8, rcon
#define	KEYGENASSIST_XMM2(rcon)	.byte	0x66, 0x0f, 0x3a, 0xdf, 0xca, rcon


ENTRY_NP(rijndael_key_setup_enc_intel)
	/* EXPORT DELETE START */
	CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(%r10)

	/ Reject NULL pointers
	test	%AESKEY, %AESKEY
	jz	.Lenc_ks_bad_ptr
	test	%USERCIPHERKEY, %USERCIPHERKEY
	jz	.Lenc_ks_bad_ptr

	movups	(%USERCIPHERKEY), %xmm0	/ first 16 bytes of the user key
	movaps	%xmm0, (%AESKEY)	/ stored unchanged as round key 0
	pxor	%xmm4, %xmm4		/ helpers expect xmm4 to start as 0
	lea	0x10(%AESKEY), %rcx	/ rcx = next round key slot

	cmp	$256, %KEYSIZE32
	jne	.Lenc_ks192

	/ AES 256: 14 rounds
#ifdef OPENSSL_INTERFACE
	mov	$14, %ROUNDS32
	movl	%ROUNDS32, 240(%AESKEY)		/ key.rounds = 14
#endif	/* OPENSSL_INTERFACE */

	movups	0x10(%USERCIPHERKEY), %xmm2	/ second 16 bytes of user key
	movaps	%xmm2, (%rcx)			/ stored unchanged as round key 1
	add	$0x10, %rcx

	/ round 1
	KEYGENASSIST_XMM2(0x01)
	call	_key_expansion_256a
	KEYGENASSIST_XMM0(0x01)
	call	_key_expansion_256b
	/ round 2
	KEYGENASSIST_XMM2(0x02)
	call	_key_expansion_256a
	KEYGENASSIST_XMM0(0x02)
	call	_key_expansion_256b
	/ round 3
	KEYGENASSIST_XMM2(0x04)
	call	_key_expansion_256a
	KEYGENASSIST_XMM0(0x04)
	call	_key_expansion_256b
	/ round 4
	KEYGENASSIST_XMM2(0x08)
	call	_key_expansion_256a
	KEYGENASSIST_XMM0(0x08)
	call	_key_expansion_256b
	/ round 5
	KEYGENASSIST_XMM2(0x10)
	call	_key_expansion_256a
	KEYGENASSIST_XMM0(0x10)
	call	_key_expansion_256b
	/ round 6
	KEYGENASSIST_XMM2(0x20)
	call	_key_expansion_256a
	KEYGENASSIST_XMM0(0x20)
	call	_key_expansion_256b
	/ round 7 (first half only)
	KEYGENASSIST_XMM2(0x40)
	call	_key_expansion_256a

	SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
#ifdef	OPENSSL_INTERFACE
	xorq	%rax, %rax			/ return 0 (OK)
#else	/* OpenSolaris Interface */
	movq	$14, %rax			/ return round count = 14
#endif
	ret

.align 4
.Lenc_ks192:
	cmp	$192, %KEYSIZE32
	jne	.Lenc_ks128

	/ AES 192: 12 rounds
#ifdef OPENSSL_INTERFACE
	mov	$12, %ROUNDS32
	movl	%ROUNDS32, 240(%AESKEY)	/ key.rounds = 12
#endif	/* OPENSSL_INTERFACE */

	movq	0x10(%USERCIPHERKEY), %xmm2	/ last 8 bytes of user key
	/ round 1
	KEYGENASSIST_XMM2(0x01)
	call	_key_expansion_192a
	/ round 2
	KEYGENASSIST_XMM2(0x02)
	call	_key_expansion_192b
	/ round 3
	KEYGENASSIST_XMM2(0x04)
	call	_key_expansion_192a
	/ round 4
	KEYGENASSIST_XMM2(0x08)
	call	_key_expansion_192b
	/ round 5
	KEYGENASSIST_XMM2(0x10)
	call	_key_expansion_192a
	/ round 6
	KEYGENASSIST_XMM2(0x20)
	call	_key_expansion_192b
	/ round 7
	KEYGENASSIST_XMM2(0x40)
	call	_key_expansion_192a
	/ round 8
	KEYGENASSIST_XMM2(0x80)
	call	_key_expansion_192b

	SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
#ifdef	OPENSSL_INTERFACE
	xorq	%rax, %rax			/ return 0 (OK)
#else	/* OpenSolaris Interface */
	movq	$12, %rax			/ return round count = 12
#endif
	ret

.align 4
.Lenc_ks128:
	cmp	$128, %KEYSIZE32
	jne	.Lenc_ks_bad_bits

	/ AES 128: 10 rounds
#ifdef OPENSSL_INTERFACE
	mov	$10, %ROUNDS32
	movl	%ROUNDS32, 240(%AESKEY)		/ key.rounds = 10
#endif	/* OPENSSL_INTERFACE */

	/ round 1
	KEYGENASSIST_XMM0(0x01)
	call	_key_expansion_128
	/ round 2
	KEYGENASSIST_XMM0(0x02)
	call	_key_expansion_128
	/ round 3
	KEYGENASSIST_XMM0(0x04)
	call	_key_expansion_128
	/ round 4
	KEYGENASSIST_XMM0(0x08)
	call	_key_expansion_128
	/ round 5
	KEYGENASSIST_XMM0(0x10)
	call	_key_expansion_128
	/ round 6
	KEYGENASSIST_XMM0(0x20)
	call	_key_expansion_128
	/ round 7
	KEYGENASSIST_XMM0(0x40)
	call	_key_expansion_128
	/ round 8
	KEYGENASSIST_XMM0(0x80)
	call	_key_expansion_128
	/ round 9
	KEYGENASSIST_XMM0(0x1b)
	call	_key_expansion_128
	/ round 10
	KEYGENASSIST_XMM0(0x36)
	call	_key_expansion_128

	SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
#ifdef	OPENSSL_INTERFACE
	xorq	%rax, %rax			/ return 0 (OK)
#else	/* OpenSolaris Interface */
	movq	$10, %rax			/ return round count = 10
#endif
	ret

.Lenc_ks_bad_ptr:
#ifdef	OPENSSL_INTERFACE
	SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
	movq	$-1, %rax	/ user key or AES key pointer is NULL
	ret
#else
	/* FALLTHROUGH */
#endif	/* OPENSSL_INTERFACE */

.Lenc_ks_bad_bits:
	SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
#ifdef	OPENSSL_INTERFACE
	movq	$-2, %rax	/ keysize is invalid
#else	/* OpenSolaris Interface */
	xorq	%rax, %rax	/ a key pointer is NULL or invalid keysize
#endif	/* OPENSSL_INTERFACE */

	/* EXPORT DELETE END */
	ret
	SET_SIZE(rijndael_key_setup_enc_intel)
616
617
/*
 * rijndael_key_setup_dec_intel()
 * Expand the cipher key into the decryption key schedule.
 *
 * Works in three steps:
 *  1. call rijndael_key_setup_enc_intel() to build the encryption
 *     schedule (all argument checking happens there);
 *  2. reverse the order of the round keys in place;
 *  3. run aesimc over every round key except the first and the last.
 *
 * Kernel callers are responsible for having called kpreempt_disable(),
 * because %xmm registers are not saved/restored.  If CR0.TS is set on
 * entry it is cleared here and set again on exit; if it is clear,
 * %xmm0 and %xmm1 are saved on the stack and restored on exit.
 *
 * OpenSolaris interface:
 * int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
 *	uint64_t keyBits);
 * Return value is 0 on error, number of rounds on success.
 * P1->P2, P2->P3, P3->P1
 *
 * Original Intel OpenSSL interface:
 * int intel_AES_set_decrypt_key(const unsigned char *userKey,
 *	const int bits, AES_KEY *key);
 * Return value is non-zero on error, 0 on success.
 */
ENTRY_NP(rijndael_key_setup_dec_intel)
	/* EXPORT DELETE START */
	/ Step 1: build the encryption schedule
	call	rijndael_key_setup_enc_intel
	test	%rax, %rax
#ifdef	OPENSSL_INTERFACE
	jnz	.Ldec_ks_done	/ Failed if returned non-0
#else	/* OpenSolaris Interface */
	jz	.Ldec_ks_done	/ Failed if returned 0
#endif	/* OPENSSL_INTERFACE */

	CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)

#ifndef	OPENSSL_INTERFACE		/* OpenSolaris Interface */
	movq	%rax, %ROUNDS64		/ round count (10, 12, or 14)
					/ (already set for OpenSSL)
#endif

	shl	$4, %ROUNDS32		/ byte offset of the last round key
	add	%AESKEY, %ROUNDS64	/ address of the last round key
	mov	%ROUNDS64, %ENDAESKEY	/ remembered as the aesimc end mark
	lea	0x10(%AESKEY), %rcx	/ rcx = second round key

	/ Step 2: swap round keys pairwise, working in from both ends
.align 4
.Ldec_ks_swap:
	movaps	(%ROUNDS64), %xmm1
	movaps	(%AESKEY), %xmm0
	movaps	%xmm1, (%AESKEY)
	movaps	%xmm0, (%ROUNDS64)
	lea	-0x10(%ROUNDS64), %ROUNDS64
	lea	0x10(%AESKEY), %AESKEY
	cmp	%AESKEY, %ROUNDS64
	ja	.Ldec_ks_swap

	/ Step 3: aesimc on each round key between the first and the last
.align 4
.Ldec_ks_imc:
	movaps	(%rcx), %xmm0
	/ aesimc %xmm0, %xmm1
	.byte	0x66, 0x0f, 0x38, 0xdb, 0xc8
	movaps	%xmm1, (%rcx)
	lea	0x10(%rcx), %rcx
	cmp	%ENDAESKEY, %rcx
	jne	.Ldec_ks_imc

	SET_TS_OR_POP_XMM0_XMM1(%r10)

.Ldec_ks_done:
	/ OpenSolaris: rax = round count (10, 12, or 14) or 0 for error
	/ OpenSSL: rax = 0 for OK, or non-zero for error
	/* EXPORT DELETE END */
	ret
	SET_SIZE(rijndael_key_setup_dec_intel)
690
691
/*
 * aes_encrypt_intel()
 * Encrypt a single 16-byte block (in and out can overlap).
 *
 * Kernel callers are responsible for having called kpreempt_disable(),
 * because %xmm registers are not saved/restored.  If CR0.TS is set on
 * entry it is cleared here and set again on exit; if it is clear,
 * %xmm0 and %xmm1 are saved on the stack and restored on exit.
 *
 * The input and output blocks are accessed with unaligned moves; the key
 * schedule is read with aligned loads (movaps).
 *
 * The round count selects the entry point into one straight-line run of
 * rounds: fewer than 12 rounds enters at the last ten, exactly 12 at the
 * last twelve, anything larger runs all fourteen.  The key pointer is
 * biased accordingly so that 0x70 past it is always the final round key.
 *
 * Temporary register usage:
 * %xmm0	State
 * %xmm1	Key
 *
 * Original OpenSolaris Interface:
 * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
 *	const uint32_t pt[4], uint32_t ct[4])
 *
 * Original Intel OpenSSL Interface:
 * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
 *	const AES_KEY *key)
 */

#if defined(OPENSSL_INTERFACE)
#define	aes_encrypt_intel	intel_AES_encrypt
#define	aes_decrypt_intel	intel_AES_decrypt

#define	INP		rdi	/* P1, 64 bits: input block */
#define	OUTP		rsi	/* P2, 64 bits: output block */
#define	KEYP		rdx	/* P3, 64 bits: key schedule */

/* No NROUNDS parameter--offset 240 from KEYP saved in %ecx:  */
#define	NROUNDS32	ecx	/* temporary, 32 bits */
#define	NROUNDS		cl	/* temporary,  8 bits */

#else	/* OpenSolaris Interface */
#define	KEYP		rdi	/* P1, 64 bits: key schedule */
#define	NROUNDS		esi	/* P2, 32 bits: round count */
#define	INP		rdx	/* P3, 64 bits: input block */
#define	OUTP		rcx	/* P4, 64 bits: output block */
#endif	/* OPENSSL_INTERFACE */

#define	STATE		xmm0	/* temporary, 128 bits */
#define	KEY		xmm1	/* temporary, 128 bits */

/*
 * Hand-encoded "aesenc %xmm1, %xmm0" and "aesenclast %xmm1, %xmm0"
 * (emitted as bytes because not all assemblers know the instructions).
 */
#define	AESENC_XMM1_XMM0	.byte	0x66, 0x0f, 0x38, 0xdc, 0xc1
#define	AESENCLAST_XMM1_XMM0	.byte	0x66, 0x0f, 0x38, 0xdd, 0xc1

ENTRY_NP(aes_encrypt_intel)
	/* EXPORT DELETE START */
	CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)

	movaps	(%KEYP), %KEY			/ round key 0
	movups	(%INP), %STATE			/ input block
#ifdef	OPENSSL_INTERFACE
	mov	240(%KEYP), %NROUNDS32		/ round count
#else	/* OpenSolaris Interface */
	/* Round count is already present as P2 in %rsi/%esi */
#endif	/* OPENSSL_INTERFACE */

	pxor	%KEY, %STATE			/ round 0
	lea	0x30(%KEYP), %KEYP		/ bias for the 10-round case
	cmp	$12, %NROUNDS
	jb	.Lenc_last10
	lea	0x20(%KEYP), %KEYP		/ lea leaves the flags alone
	je	.Lenc_last12

	/ AES 256 only: two extra rounds
	lea	0x20(%KEYP), %KEYP
	movaps	-0x60(%KEYP), %KEY
	AESENC_XMM1_XMM0
	movaps	-0x50(%KEYP), %KEY
	AESENC_XMM1_XMM0

.align 4
.Lenc_last12:
	/ AES 192 and 256: two extra rounds
	movaps	-0x40(%KEYP), %KEY
	AESENC_XMM1_XMM0
	movaps	-0x30(%KEYP), %KEY
	AESENC_XMM1_XMM0

.align 4
.Lenc_last10:
	/ AES 128, 192, and 256: final ten rounds
	movaps	-0x20(%KEYP), %KEY
	AESENC_XMM1_XMM0
	movaps	-0x10(%KEYP), %KEY
	AESENC_XMM1_XMM0
	movaps	(%KEYP), %KEY
	AESENC_XMM1_XMM0
	movaps	0x10(%KEYP), %KEY
	AESENC_XMM1_XMM0
	movaps	0x20(%KEYP), %KEY
	AESENC_XMM1_XMM0
	movaps	0x30(%KEYP), %KEY
	AESENC_XMM1_XMM0
	movaps	0x40(%KEYP), %KEY
	AESENC_XMM1_XMM0
	movaps	0x50(%KEYP), %KEY
	AESENC_XMM1_XMM0
	movaps	0x60(%KEYP), %KEY
	AESENC_XMM1_XMM0
	movaps	0x70(%KEYP), %KEY
	AESENCLAST_XMM1_XMM0			/ last round
	movups	%STATE, (%OUTP)			/ output block

	SET_TS_OR_POP_XMM0_XMM1(%r10)
	/* EXPORT DELETE END */
	ret
	SET_SIZE(aes_encrypt_intel)
814
815
/*
 * aes_decrypt_intel()
 * Decrypt a single 16-byte block (in and out can overlap).
 *
 * The key schedule must be a decryption schedule as produced by
 * rijndael_key_setup_dec_intel().
 *
 * For kernel code, caller is responsible for ensuring kpreempt_disable()
 * has been called.  This is because %xmm registers are not saved/restored.
 * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
 * on entry.  Otherwise, if TS is not set, save and restore %xmm0 and %xmm1
 * on the stack.
 *
 * The input and output blocks are accessed with unaligned moves; the key
 * schedule is read with aligned loads (movaps).
 *
 * The round count selects the entry point into one straight-line run of
 * rounds: fewer than 12 rounds enters at the last ten, exactly 12 at the
 * last twelve, anything larger runs all fourteen.  The key pointer is
 * biased accordingly so that 0x70 past it is always the final round key.
 *
 * Temporary register usage:
 * %xmm0	State
 * %xmm1	Key
 *
 * Original OpenSolaris Interface:
 * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
 *	const uint32_t ct[4], uint32_t pt[4])
 *
 * Original Intel OpenSSL Interface:
 * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
 *	const AES_KEY *key);
 */
ENTRY_NP(aes_decrypt_intel)
	/* EXPORT DELETE START */
	CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)

	movups	(%INP), %STATE			/ input
	movaps	(%KEYP), %KEY			/ key
#ifdef	OPENSSL_INTERFACE
	mov	240(%KEYP), %NROUNDS32		/ round count
#else	/* OpenSolaris Interface */
	/* Round count is already present as P2 in %rsi/%esi */
#endif	/* OPENSSL_INTERFACE */

	pxor	%KEY, %STATE			/ round 0
	lea	0x30(%KEYP), %KEYP		/ bias for the 10-round case
	cmp	$12, %NROUNDS
	jb	.Ldec128
	lea	0x20(%KEYP), %KEYP		/ lea leaves the flags alone
	je	.Ldec192

	/ AES 256
	lea	0x20(%KEYP), %KEYP
	movaps	-0x60(%KEYP), %KEY
	/ aesdec %xmm1, %xmm0
	.byte	0x66, 0x0f, 0x38, 0xde, 0xc1
	movaps	-0x50(%KEYP), %KEY
	/ aesdec %xmm1, %xmm0
	.byte	0x66, 0x0f, 0x38, 0xde, 0xc1

.align 4
.Ldec192:
	/ AES 192 and 256
	movaps	-0x40(%KEYP), %KEY
	/ aesdec %xmm1, %xmm0
	.byte	0x66, 0x0f, 0x38, 0xde, 0xc1
	movaps	-0x30(%KEYP), %KEY
	/ aesdec %xmm1, %xmm0
	.byte	0x66, 0x0f, 0x38, 0xde, 0xc1

.align 4
.Ldec128:
	/ AES 128, 192, and 256
	movaps	-0x20(%KEYP), %KEY
	/ aesdec %xmm1, %xmm0
	.byte	0x66, 0x0f, 0x38, 0xde, 0xc1
	movaps	-0x10(%KEYP), %KEY
	/ aesdec %xmm1, %xmm0
	.byte	0x66, 0x0f, 0x38, 0xde, 0xc1
	movaps	(%KEYP), %KEY
	/ aesdec %xmm1, %xmm0
	.byte	0x66, 0x0f, 0x38, 0xde, 0xc1
	movaps	0x10(%KEYP), %KEY
	/ aesdec %xmm1, %xmm0
	.byte	0x66, 0x0f, 0x38, 0xde, 0xc1
	movaps	0x20(%KEYP), %KEY
	/ aesdec %xmm1, %xmm0
	.byte	0x66, 0x0f, 0x38, 0xde, 0xc1
	movaps	0x30(%KEYP), %KEY
	/ aesdec %xmm1, %xmm0
	.byte	0x66, 0x0f, 0x38, 0xde, 0xc1
	movaps	0x40(%KEYP), %KEY
	/ aesdec %xmm1, %xmm0
	.byte	0x66, 0x0f, 0x38, 0xde, 0xc1
	movaps	0x50(%KEYP), %KEY
	/ aesdec %xmm1, %xmm0
	.byte	0x66, 0x0f, 0x38, 0xde, 0xc1
	movaps	0x60(%KEYP), %KEY
	/ aesdec %xmm1, %xmm0
	.byte	0x66, 0x0f, 0x38, 0xde, 0xc1
	movaps	0x70(%KEYP), %KEY
	/ aesdeclast %xmm1, %xmm0		/ last round
	.byte	0x66, 0x0f, 0x38, 0xdf, 0xc1
	movups	%STATE, (%OUTP)			/ output

	SET_TS_OR_POP_XMM0_XMM1(%r10)
	/* EXPORT DELETE END */
	/*
	 * The ret stays outside the EXPORT DELETE region, as in the other
	 * entry points, so that the routine still returns if the marked
	 * region is removed.
	 */
	ret
	SET_SIZE(aes_decrypt_intel)
915
916#endif  /* lint || __lint */
917