xref: /titanic_51/usr/src/common/crypto/aes/amd64/aes_intel.s (revision 46b2e54fef23f79962caefa489542a11b53acd45)
1/*
2 * ====================================================================
3 * Written by Intel Corporation for the OpenSSL project to add support
4 * for Intel AES-NI instructions. Rights for redistribution and usage
5 * in source and binary forms are granted according to the OpenSSL
6 * license.
7 *
8 *   Author: Huang Ying <ying.huang at intel dot com>
9 *           Vinodh Gopal <vinodh.gopal at intel dot com>
10 *           Kahraman Akdemir
11 *
12 * Intel AES-NI is a new set of Single Instruction Multiple Data (SIMD)
13 * instructions that are going to be introduced in the next generation
14 * of Intel processor, as of 2009. These instructions enable fast and
15 * secure data encryption and decryption, using the Advanced Encryption
16 * Standard (AES), defined by FIPS Publication number 197. The
17 * architecture introduces six instructions that offer full hardware
18 * support for AES. Four of them support high performance data
19 * encryption and decryption, and the other two instructions support
20 * the AES key expansion procedure.
21 * ====================================================================
22 */
23
24/*
25 * ====================================================================
26 * Copyright (c) 1998-2008 The OpenSSL Project.  All rights reserved.
27 *
28 * Redistribution and use in source and binary forms, with or without
29 * modification, are permitted provided that the following conditions
30 * are met:
31 *
32 * 1. Redistributions of source code must retain the above copyright
33 *    notice, this list of conditions and the following disclaimer.
34 *
35 * 2. Redistributions in binary form must reproduce the above copyright
36 *    notice, this list of conditions and the following disclaimer in
37 *    the documentation and/or other materials provided with the
38 *    distribution.
39 *
40 * 3. All advertising materials mentioning features or use of this
41 *    software must display the following acknowledgment:
42 *    "This product includes software developed by the OpenSSL Project
43 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
44 *
45 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
46 *    endorse or promote products derived from this software without
47 *    prior written permission. For written permission, please contact
48 *    openssl-core@openssl.org.
49 *
50 * 5. Products derived from this software may not be called "OpenSSL"
51 *    nor may "OpenSSL" appear in their names without prior written
52 *    permission of the OpenSSL Project.
53 *
54 * 6. Redistributions of any form whatsoever must retain the following
55 *    acknowledgment:
56 *    "This product includes software developed by the OpenSSL Project
57 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
58 *
59 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
60 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
62 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
63 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
64 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
65 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
66 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
67 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
68 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
69 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
70 * OF THE POSSIBILITY OF SUCH DAMAGE.
71 * ====================================================================
72 */
73
74/*
75 * ====================================================================
76 * OpenSolaris OS modifications
77 *
78 * This source originates as files aes-intel.S and eng_aesni_asm.pl, in
 * patches sent Dec. 9, 2008 and Dec. 24, 2008, respectively, by
80 * Huang Ying of Intel to the openssl-dev mailing list under the subject
81 * of "Add support to Intel AES-NI instruction set for x86_64 platform".
82 *
83 * This OpenSolaris version has these major changes from the original source:
84 *
85 * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
86 * /usr/include/sys/asm_linkage.h, lint(1B) guards, EXPORT DELETE START
87 * and EXPORT DELETE END markers, and dummy C function definitions for lint.
88 *
89 * 2. Formatted code, added comments, and added #includes and #defines.
90 *
91 * 3. If bit CR0.TS is set, clear and set the TS bit, after and before
92 * calling kpreempt_disable() and kpreempt_enable().
 * If the TS bit is not set, save and restore %xmm registers at the beginning
 * and end of function calls (%xmm* registers are not saved and restored
 * during kernel thread preemption).
96 *
97 * 4. Renamed functions, reordered parameters, and changed return value
98 * to match OpenSolaris:
99 *
100 * OpenSSL interface:
101 *	int intel_AES_set_encrypt_key(const unsigned char *userKey,
102 *		const int bits, AES_KEY *key);
103 *	int intel_AES_set_decrypt_key(const unsigned char *userKey,
104 *		const int bits, AES_KEY *key);
105 *	Return values for above are non-zero on error, 0 on success.
106 *
107 *	void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
108 *		const AES_KEY *key);
109 *	void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
110 *		const AES_KEY *key);
111 *	typedef struct aes_key_st {
112 *		unsigned int	rd_key[4 *(AES_MAXNR + 1)];
113 *		int		rounds;
114 *		unsigned int	pad[3];
115 *	} AES_KEY;
 * Note: AES_LONG is undefined (that is, Intel uses 32-bit key schedules
 * (ks32) instead of 64-bit (ks64)).
118 * Number of rounds (aka round count) is at offset 240 of AES_KEY.
119 *
120 * OpenSolaris OS interface (#ifdefs removed for readability):
121 *	int rijndael_key_setup_dec_intel(uint32_t rk[],
122 *		const uint32_t cipherKey[], uint64_t keyBits);
123 *	int rijndael_key_setup_enc_intel(uint32_t rk[],
124 *		const uint32_t cipherKey[], uint64_t keyBits);
125 *	Return values for above are 0 on error, number of rounds on success.
126 *
127 *	void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
128 *		const uint32_t pt[4], uint32_t ct[4]);
129 *	void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
130 *		const uint32_t pt[4], uint32_t ct[4]);
131 *	typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4];
132 *		 uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t;
133 *
134 *	typedef union {
135 *		uint32_t	ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
136 *	} aes_ks_t;
137 *	typedef struct aes_key {
138 *		aes_ks_t	encr_ks, decr_ks;
139 *		long double	align128;
140 *		int		flags, nr, type;
141 *	} aes_key_t;
142 *
143 * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
144 * ct is crypto text, and MAX_AES_NR is 14.
145 * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
146 *
 * Note2: aes_ks_t must be aligned on a 0 mod 128 bit (16-byte) boundary,
 * because the key schedule is read and written with movaps.
148 *
149 * ====================================================================
150 */
151
152#if defined(lint) || defined(__lint)
153
154#include <sys/types.h>
155
/* ARGSUSED */
void
aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4],
    uint32_t ct[4])
{
	/* Empty on purpose: lint-only stand-in for the assembly routine. */
}
/* ARGSUSED */
void
aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4],
    uint32_t pt[4])
{
	/* Empty on purpose: lint-only stand-in for the assembly routine. */
}
/* ARGSUSED */
int
rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
    uint64_t keyBits)
{
	/* Lint-only stand-in for the assembly routine; always reports 0. */
	return (0);
}
/* ARGSUSED */
int
rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
    uint64_t keyBits)
{
	/* Lint-only stand-in for the assembly routine; always reports 0. */
	return (0);
}
178
179
180#else	/* lint */
181
182#include <sys/asm_linkage.h>
183#include <sys/controlregs.h>
184#ifdef _KERNEL
185#include <sys/machprivregs.h>
186#endif
187
188#ifdef _KERNEL
189	/*
190	 * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv.  That is,
191	 * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it
192	 * uses it to pass P2 to syscall.
193	 * This also occurs with the STTS macro, but we don't care if
194	 * P2 (%rsi) is modified just before function exit.
195	 * The CLTS and STTS macros push and pop P1 (%rdi) already.
196	 */
/*
 * PROTECTED_CLTS: run CLTS without losing P2 (%rsi).  Only the __xpv
 * build needs the push/pop around it; elsewhere it is plain CLTS.
 */
#if defined(__xpv)
#define	PROTECTED_CLTS	\
	push	%rsi;	\
	CLTS;		\
	pop	%rsi
#else	/* !__xpv */
#define	PROTECTED_CLTS	CLTS
#endif	/* __xpv */
206
/*
 * Build a frame (%rbp) and read %cr0 into tmpreg.  If CR0_TS is set,
 * clear it with PROTECTED_CLTS; otherwise align %rsp down to XMM_ALIGN
 * and save %xmm0 and %xmm1 in two XMM_SIZE slots (xmm1 at 0, xmm0 at 16).
 * tmpreg must stay intact until the matching SET_TS_OR_POP_XMM0_XMM1(),
 * which tests the same CR0_TS bit in it to pick the restore path.
 */
#define	CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg)	\
	push	%rbp;				\
	mov	%rsp, %rbp;			\
	movq	%cr0, tmpreg;			\
	testq	$CR0_TS, tmpreg;		\
	jnz	1f;				\
	and	$-XMM_ALIGN, %rsp;		\
	sub	$[XMM_SIZE * 2], %rsp;		\
	movaps	%xmm1, (%rsp);			\
	movaps	%xmm0, 16(%rsp);		\
	jmp	2f;				\
1:						\
	PROTECTED_CLTS;				\
2:
221
222	/*
223	 * If CR0_TS was not set above, pop %xmm0 and %xmm1 off stack,
224	 * otherwise set CR0_TS.
225	 */
/*
 * tmpreg must still hold the %cr0 image captured on entry, and %rsp must
 * be where the entry macro left it; the frame is torn down via %rbp.
 */
#define	SET_TS_OR_POP_XMM0_XMM1(tmpreg)	\
	testq	$CR0_TS, tmpreg;	\
	jnz	1f;			\
	movaps	16(%rsp), %xmm0;	\
	movaps	(%rsp), %xmm1;		\
	jmp	2f;			\
1:					\
	STTS(tmpreg);			\
2:					\
	mov	%rbp, %rsp;		\
	pop	%rbp
237
238	/*
239	 * If CR0_TS is not set, align stack (with push %rbp) and push
240	 * %xmm0 - %xmm6 on stack, otherwise clear CR0_TS
241	 */
/*
 * Save-area layout when CR0_TS is clear (seven XMM_SIZE slots from %rsp):
 * xmm6 at 0, xmm5 at 16, xmm4 at 32, xmm3 at 48, xmm2 at 64, xmm1 at 80,
 * xmm0 at 96.  tmpreg keeps the %cr0 image for the matching
 * SET_TS_OR_POP_XMM0_TO_XMM6().
 */
#define	CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg)	\
	push	%rbp;				\
	mov	%rsp, %rbp;			\
	movq	%cr0, tmpreg;			\
	testq	$CR0_TS, tmpreg;		\
	jnz	1f;				\
	and	$-XMM_ALIGN, %rsp;		\
	sub	$[XMM_SIZE * 7], %rsp;		\
	movaps	%xmm6, (%rsp);			\
	movaps	%xmm5, 16(%rsp);		\
	movaps	%xmm4, 32(%rsp);		\
	movaps	%xmm3, 48(%rsp);		\
	movaps	%xmm2, 64(%rsp);		\
	movaps	%xmm1, 80(%rsp);		\
	movaps	%xmm0, 96(%rsp);		\
	jmp	2f;				\
1:						\
	PROTECTED_CLTS;				\
2:
261
262
263	/*
264	 * If CR0_TS was not set above, pop %xmm0 - %xmm6 off stack,
265	 * otherwise set CR0_TS.
266	 */
/*
 * tmpreg must still hold the %cr0 image captured on entry, and %rsp must
 * be where the entry macro left it; the frame is torn down via %rbp.
 */
#define	SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg)	\
	testq	$CR0_TS, tmpreg;		\
	jnz	1f;				\
	movaps	96(%rsp), %xmm0;		\
	movaps	80(%rsp), %xmm1;		\
	movaps	64(%rsp), %xmm2;		\
	movaps	48(%rsp), %xmm3;		\
	movaps	32(%rsp), %xmm4;		\
	movaps	16(%rsp), %xmm5;		\
	movaps	(%rsp), %xmm6;			\
	jmp	2f;				\
1:						\
	STTS(tmpreg);				\
2:						\
	mov	%rbp, %rsp;			\
	pop	%rbp
283
284
285#else
/* Outside the kernel the TS/%xmm save hooks expand to nothing. */
#define	CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg)
#define	SET_TS_OR_POP_XMM0_XMM1(tmpreg)
#define	CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg)
#define	SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg)
#define	PROTECTED_CLTS
291#endif	/* _KERNEL */
292
293
294/*
 * _key_expansion_128(), _key_expansion_192a(), _key_expansion_192b(),
 * _key_expansion_256a(), _key_expansion_256b()
 *
 * Helper functions called by rijndael_key_setup_enc_intel().
 * Also used indirectly by rijndael_key_setup_dec_intel().
 *
 * Input:
 * %xmm0	User-provided cipher key
 * %xmm1	Result of aeskeygenassist for the current round constant
304 * Output:
305 * (%rcx)	AES key
306 */
307
308	/* EXPORT DELETE START */
.align	16
_key_expansion_128:
_key_expansion_256a:
	/* Replicate dword 3 of the aeskeygenassist result into every lane. */
	pshufd	$0xff, %xmm1, %xmm1
	/*
	 * Fold %xmm0 so each dword becomes the XOR of itself and all lower
	 * dwords.  %xmm4 is scratch: its low dword must be zero on entry
	 * and is still zero afterwards.
	 */
	shufps	$0x10, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	shufps	$0x8c, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	pxor	%xmm1, %xmm0		/* mix in the replicated word */
	movaps	%xmm0, (%rcx)		/* store the new round key */
	add	$0x10, %rcx		/* step to the next 16-byte slot */
	ret
	SET_SIZE(_key_expansion_128)
	SET_SIZE(_key_expansion_256a)
323
.align 16
_key_expansion_192a:
	/* Replicate dword 1 of the aeskeygenassist result into every lane. */
	pshufd	$0x55, %xmm1, %xmm1
	/* Fold %xmm0 (running XOR of lower dwords); %xmm4 low dword is 0. */
	shufps	$0x10, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	shufps	$0x8c, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	pxor	%xmm1, %xmm0

	/* Keep the incoming %xmm2 in %xmm6, then update %xmm2 in place. */
	movaps	%xmm2, %xmm5
	movaps	%xmm2, %xmm6
	pslldq	$4, %xmm5		/* %xmm5 = %xmm2 shifted up one dword */
	pshufd	$0xff, %xmm0, %xmm3	/* replicate dword 3 of new %xmm0 */
	pxor	%xmm3, %xmm2
	pxor	%xmm5, %xmm2

	/*
	 * Emit 32 bytes: { old %xmm2 low half, new %xmm0 low half } and then
	 * { new %xmm0 high half, new %xmm2 low half }.
	 */
	movaps	%xmm0, %xmm1
	shufps	$0x44, %xmm0, %xmm6
	movaps	%xmm6, (%rcx)
	shufps	$0x4e, %xmm2, %xmm1
	movaps	%xmm1, 0x10(%rcx)
	add	$0x20, %rcx		/* two 16-byte slots were written */
	ret
	SET_SIZE(_key_expansion_192a)
348
.align 16
_key_expansion_192b:
	/* Replicate dword 1 of the aeskeygenassist result into every lane. */
	pshufd	$0x55, %xmm1, %xmm1
	/* Fold %xmm0 (running XOR of lower dwords); %xmm4 low dword is 0. */
	shufps	$0x10, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	shufps	$0x8c, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	pxor	%xmm1, %xmm0

	/* Update %xmm2 from its shifted self and dword 3 of the new %xmm0. */
	movaps	%xmm2, %xmm5
	pslldq	$4, %xmm5
	pshufd	$0xff, %xmm0, %xmm3
	pxor	%xmm3, %xmm2
	pxor	%xmm5, %xmm2

	movaps	%xmm0, (%rcx)		/* only %xmm0 is emitted here */
	add	$0x10, %rcx		/* step to the next 16-byte slot */
	ret
	SET_SIZE(_key_expansion_192b)
368
.align 16
_key_expansion_256b:
	/* Replicate dword 2 of the aeskeygenassist result into every lane. */
	pshufd	$0xaa, %xmm1, %xmm1
	/* Same fold as _key_expansion_256a, applied to %xmm2 instead. */
	shufps	$0x10, %xmm2, %xmm4
	pxor	%xmm4, %xmm2
	shufps	$0x8c, %xmm2, %xmm4
	pxor	%xmm4, %xmm2
	pxor	%xmm1, %xmm2		/* mix in the replicated word */
	movaps	%xmm2, (%rcx)		/* store the new round key */
	add	$0x10, %rcx		/* step to the next 16-byte slot */
	ret
	SET_SIZE(_key_expansion_256b)
381	/* EXPORT DELETE END */
382
383
384/*
385 * rijndael_key_setup_enc_intel()
386 * Expand the cipher key into the encryption key schedule.
387 *
388 * For kernel code, caller is responsible for ensuring kpreempt_disable()
389 * has been called.  This is because %xmm registers are not saved/restored.
390 * Clear and set the CR0.TS bit on entry and exit, respectively,  if TS is set
391 * on entry.  Otherwise, if TS is not set, save and restore %xmm registers
392 * on the stack.
393 *
394 * OpenSolaris interface:
395 * int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
396 *	uint64_t keyBits);
397 * Return value is 0 on error, number of rounds on success.
398 *
399 * Original Intel OpenSSL interface:
400 * int intel_AES_set_encrypt_key(const unsigned char *userKey,
401 *	const int bits, AES_KEY *key);
402 * Return value is non-zero on error, 0 on success.
403 */
404
/*
 * Register names for the key-setup entry points.  The OpenSSL and
 * OpenSolaris prototypes order their parameters differently, so each
 * logical name maps to a different argument register per interface.
 */
#if defined(OPENSSL_INTERFACE)
#define	rijndael_key_setup_enc_intel	intel_AES_set_encrypt_key
#define	rijndael_key_setup_dec_intel	intel_AES_set_decrypt_key

#define	USERCIPHERKEY		rdi	/* P1, 64 bits */
#define	KEYSIZE64		rsi	/* P2, 64 bits */
#define	KEYSIZE32		esi	/* P2, 32 bits */
#define	AESKEY			rdx	/* P3, 64 bits */

#else	/* OpenSolaris Interface */
#define	AESKEY			rdi	/* P1, 64 bits */
#define	USERCIPHERKEY		rsi	/* P2, 64 bits */
#define	KEYSIZE64		rdx	/* P3, 64 bits */
#define	KEYSIZE32		edx	/* P3, 32 bits */
#endif	/* OPENSSL_INTERFACE */

/* Temporaries that alias the argument registers named above. */
#define	ROUNDS64		KEYSIZE64	/* temp */
#define	ROUNDS32		KEYSIZE32	/* temp */
#define	ENDAESKEY		USERCIPHERKEY	/* temp */
424
425
ENTRY_NP(rijndael_key_setup_enc_intel)
	/* EXPORT DELETE START */
	/* Kernel builds: clear CR0.TS or spill %xmm0-%xmm6; %r10 keeps %cr0 */
	CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(%r10)

	/* Refuse a NULL user-key pointer or a NULL key-schedule pointer */
	test	%USERCIPHERKEY, %USERCIPHERKEY
	jz	.Lenc_bad_pointer
	test	%AESKEY, %AESKEY
	jz	.Lenc_bad_pointer

	/* Round key 0 is the first 16 bytes of the user key, copied as-is */
	movups	(%USERCIPHERKEY), %xmm0
	movaps	%xmm0, (%AESKEY)
	lea	0x10(%AESKEY), %rcx	/* %rcx = next round-key slot */
	pxor	%xmm4, %xmm4		/* _key_expansion_x assume xmm4 is 0 */

	cmp	$256, %KEYSIZE32
	jnz	.Lenc_try_192

	/* AES 256: 14 rounds in encryption key schedule */
#ifdef OPENSSL_INTERFACE
	mov	$14, %ROUNDS32
	movl	%ROUNDS32, 240(%AESKEY)		/* key.rounds = 14 */
#endif	/* OPENSSL_INTERFACE */

	/* Round key 1 is the second 16 bytes of the user key */
	movups	0x10(%USERCIPHERKEY), %xmm2
	movaps	%xmm2, (%rcx)
	add	$0x10, %rcx

	/* Alternate the 256a/256b helpers, one round key per call */
	aeskeygenassist $0x1, %xmm2, %xmm1
	call	_key_expansion_256a
	aeskeygenassist $0x1, %xmm0, %xmm1
	call	_key_expansion_256b
	aeskeygenassist $0x2, %xmm2, %xmm1
	call	_key_expansion_256a
	aeskeygenassist $0x2, %xmm0, %xmm1
	call	_key_expansion_256b
	aeskeygenassist $0x4, %xmm2, %xmm1
	call	_key_expansion_256a
	aeskeygenassist $0x4, %xmm0, %xmm1
	call	_key_expansion_256b
	aeskeygenassist $0x8, %xmm2, %xmm1
	call	_key_expansion_256a
	aeskeygenassist $0x8, %xmm0, %xmm1
	call	_key_expansion_256b
	aeskeygenassist $0x10, %xmm2, %xmm1
	call	_key_expansion_256a
	aeskeygenassist $0x10, %xmm0, %xmm1
	call	_key_expansion_256b
	aeskeygenassist $0x20, %xmm2, %xmm1
	call	_key_expansion_256a
	aeskeygenassist $0x20, %xmm0, %xmm1
	call	_key_expansion_256b
	aeskeygenassist $0x40, %xmm2, %xmm1
	call	_key_expansion_256a

	SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
#ifdef	OPENSSL_INTERFACE
	xor	%rax, %rax			/* return 0 (OK) */
#else	/* Open Solaris Interface */
	mov	$14, %rax			/* return # rounds = 14 */
#endif
	ret

.align 4
.Lenc_try_192:
	cmp	$192, %KEYSIZE32
	jnz	.Lenc_try_128

	/* AES 192: 12 rounds in encryption key schedule */
#ifdef OPENSSL_INTERFACE
	mov	$12, %ROUNDS32
	movl	%ROUNDS32, 240(%AESKEY)	/* key.rounds = 12 */
#endif	/* OPENSSL_INTERFACE */

	/* The last 8 bytes of the user key go in the low half of %xmm2 */
	movq	0x10(%USERCIPHERKEY), %xmm2
	/* 192a emits 32 bytes per call, 192b emits 16 */
	aeskeygenassist $0x1, %xmm2, %xmm1
	call	_key_expansion_192a
	aeskeygenassist $0x2, %xmm2, %xmm1
	call	_key_expansion_192b
	aeskeygenassist $0x4, %xmm2, %xmm1
	call	_key_expansion_192a
	aeskeygenassist $0x8, %xmm2, %xmm1
	call	_key_expansion_192b
	aeskeygenassist $0x10, %xmm2, %xmm1
	call	_key_expansion_192a
	aeskeygenassist $0x20, %xmm2, %xmm1
	call	_key_expansion_192b
	aeskeygenassist $0x40, %xmm2, %xmm1
	call	_key_expansion_192a
	aeskeygenassist $0x80, %xmm2, %xmm1
	call	_key_expansion_192b

	SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
#ifdef	OPENSSL_INTERFACE
	xor	%rax, %rax			/* return 0 (OK) */
#else	/* OpenSolaris Interface */
	mov	$12, %rax			/* return # rounds = 12 */
#endif
	ret

.align 4
.Lenc_try_128:
	cmp	$128, %KEYSIZE32
	jnz	.Lenc_bad_keybits

	/* AES 128: 10 rounds in encryption key schedule */
#ifdef OPENSSL_INTERFACE
	mov	$10, %ROUNDS32
	movl	%ROUNDS32, 240(%AESKEY)		/* key.rounds = 10 */
#endif	/* OPENSSL_INTERFACE */

	/* One helper call per round key, ten in total */
	aeskeygenassist $0x1, %xmm0, %xmm1
	call	_key_expansion_128
	aeskeygenassist $0x2, %xmm0, %xmm1
	call	_key_expansion_128
	aeskeygenassist $0x4, %xmm0, %xmm1
	call	_key_expansion_128
	aeskeygenassist $0x8, %xmm0, %xmm1
	call	_key_expansion_128
	aeskeygenassist $0x10, %xmm0, %xmm1
	call	_key_expansion_128
	aeskeygenassist $0x20, %xmm0, %xmm1
	call	_key_expansion_128
	aeskeygenassist $0x40, %xmm0, %xmm1
	call	_key_expansion_128
	aeskeygenassist $0x80, %xmm0, %xmm1
	call	_key_expansion_128
	aeskeygenassist $0x1b, %xmm0, %xmm1
	call	_key_expansion_128
	aeskeygenassist $0x36, %xmm0, %xmm1
	call	_key_expansion_128

	SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
#ifdef	OPENSSL_INTERFACE
	xor	%rax, %rax			/* return 0 (OK) */
#else	/* OpenSolaris Interface */
	mov	$10, %rax			/* return # rounds = 10 */
#endif
	ret

.Lenc_bad_pointer:
#ifdef	OPENSSL_INTERFACE
	SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
	mov	$-1, %rax	/* user key or AES key pointer is NULL */
	ret
#else
	/* FALLTHROUGH */
#endif	/* OPENSSL_INTERFACE */

.Lenc_bad_keybits:
	SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
#ifdef	OPENSSL_INTERFACE
	mov	$-2, %rax	/* keysize is invalid */
#else	/* Open Solaris Interface */
	xor	%rax, %rax	/* a key pointer is NULL or invalid keysize */
#endif	/* OPENSSL_INTERFACE */

	/* EXPORT DELETE END */
	ret
	SET_SIZE(rijndael_key_setup_enc_intel)
586
587
588/*
589 * rijndael_key_setup_dec_intel()
590 * Expand the cipher key into the decryption key schedule.
591 *
592 * For kernel code, caller is responsible for ensuring kpreempt_disable()
593 * has been called.  This is because %xmm registers are not saved/restored.
594 * Clear and set the CR0.TS bit on entry and exit, respectively,  if TS is set
595 * on entry.  Otherwise, if TS is not set, save and restore %xmm registers
596 * on the stack.
597 *
598 * OpenSolaris interface:
599 * int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
600 *	uint64_t keyBits);
601 * Return value is 0 on error, number of rounds on success.
602 * P1->P2, P2->P3, P3->P1
603 *
604 * Original Intel OpenSSL interface:
605 * int intel_AES_set_decrypt_key(const unsigned char *userKey,
606 *	const int bits, AES_KEY *key);
607 * Return value is non-zero on error, 0 on success.
608 */
ENTRY_NP(rijndael_key_setup_dec_intel)
	/* EXPORT DELETE START */
	/* Start from the encryption key schedule */
	call	rijndael_key_setup_enc_intel
	test	%rax, %rax
#ifdef	OPENSSL_INTERFACE
	jnz	.Ldec_done	/* Failed if returned non-0 */
#else	/* OpenSolaris Interface */
	jz	.Ldec_done	/* Failed if returned 0 */
#endif	/* OPENSSL_INTERFACE */

	CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)

	/*
	 * Turn the encryption schedule into a decryption schedule:
	 * reverse the order of the round keys, then run every key except
	 * the first and last through aesimc.
	 */
#ifndef	OPENSSL_INTERFACE		/* OpenSolaris Interface */
	mov	%rax, %ROUNDS64		/* set # rounds (10, 12, or 14) */
					/* (already set for OpenSSL) */
#endif

	lea	0x10(%AESKEY), %rcx	/* %rcx = address of round key 1 */
	shl	$4, %ROUNDS32		/* rounds * 16 = offset of last key */
	add	%AESKEY, %ROUNDS64	/* ROUNDS64 = address of last key */
	mov	%ROUNDS64, %ENDAESKEY	/* remembered as the aesimc bound */

.align 4
.Ldec_swap_ends:
	/* Swap the keys at the two cursors, then move both inward */
	movaps	(%AESKEY), %xmm0
	movaps	(%ROUNDS64), %xmm1
	movaps	%xmm0, (%ROUNDS64)
	movaps	%xmm1, (%AESKEY)
	lea	0x10(%AESKEY), %AESKEY
	lea	-0x10(%ROUNDS64), %ROUNDS64
	cmp	%AESKEY, %ROUNDS64
	ja	.Ldec_swap_ends		/* until the cursors meet or cross */

.align 4
.Ldec_imc_loop:
	/*
	 * Convert an encryption round key to a form usable for decryption
	 * with the "AES Inverse Mix Columns" instruction
	 */
	movaps	(%rcx), %xmm0
	aesimc	%xmm0, %xmm1
	movaps	%xmm1, (%rcx)
	lea	0x10(%rcx), %rcx
	cmp	%ENDAESKEY, %rcx
	jnz	.Ldec_imc_loop		/* stop before the last round key */

	SET_TS_OR_POP_XMM0_XMM1(%r10)

.Ldec_done:
	/* OpenSolaris: rax = # rounds (10, 12, or 14) or 0 for error */
	/* OpenSSL: rax = 0 for OK, or non-zero for error */
	/* EXPORT DELETE END */
	ret
	SET_SIZE(rijndael_key_setup_dec_intel)
666
667
668/*
669 * aes_encrypt_intel()
670 * Encrypt a single block (in and out can overlap).
671 *
672 * For kernel code, caller is responsible for ensuring kpreempt_disable()
673 * has been called.  This is because %xmm registers are not saved/restored.
674 * Clear and set the CR0.TS bit on entry and exit, respectively,  if TS is set
675 * on entry.  Otherwise, if TS is not set, save and restore %xmm registers
676 * on the stack.
677 *
678 * Temporary register usage:
679 * %xmm0	State
680 * %xmm1	Key
681 *
682 * Original OpenSolaris Interface:
683 * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
684 *	const uint32_t pt[4], uint32_t ct[4])
685 *
686 * Original Intel OpenSSL Interface:
687 * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
688 *	const AES_KEY *key)
689 */
690
/*
 * Register names shared by aes_encrypt_intel() and aes_decrypt_intel().
 * The two interfaces pass their arguments in different positions.
 */
#if defined(OPENSSL_INTERFACE)
#define	aes_encrypt_intel	intel_AES_encrypt
#define	aes_decrypt_intel	intel_AES_decrypt

#define	INP		rdi	/* P1, 64 bits */
#define	OUTP		rsi	/* P2, 64 bits */
#define	KEYP		rdx	/* P3, 64 bits */

/* No NROUNDS parameter--offset 240 from KEYP saved in %ecx:  */
#define	NROUNDS32	ecx	/* temporary, 32 bits */
#define	NROUNDS		cl	/* temporary,  8 bits */

#else	/* OpenSolaris Interface */
#define	KEYP		rdi	/* P1, 64 bits */
#define	NROUNDS		esi	/* P2, 32 bits */
#define	INP		rdx	/* P3, 64 bits */
#define	OUTP		rcx	/* P4, 64 bits */
#endif	/* OPENSSL_INTERFACE */

/* Working %xmm registers for the cipher rounds. */
#define	STATE		xmm0	/* temporary, 128 bits */
#define	KEY		xmm1	/* temporary, 128 bits */
712
ENTRY_NP(aes_encrypt_intel)
	/* EXPORT DELETE START */
	CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)

	movups	(%INP), %STATE			/* input block (may be unaligned) */
	movaps	(%KEYP), %KEY			/* round key 0 */
#ifdef	OPENSSL_INTERFACE
	mov	240(%KEYP), %NROUNDS32		/* round count */
#else	/* OpenSolaris Interface */
	/* Round count is already present as P2 in %rsi/%esi */
#endif	/* OPENSSL_INTERFACE */

	pxor	%KEY, %STATE			/* round 0 */
	/*
	 * Bias KEYP so the shared tail can reach the final ten round keys
	 * at the fixed offsets -0x20 .. 0x70, whatever the key size.
	 */
	lea	0x30(%KEYP), %KEYP
	cmp	$12, %NROUNDS
	jb	.Lenc_tail_128
	lea	0x20(%KEYP), %KEYP		/* lea keeps the cmp flags */
	je	.Lenc_tail_192

	/* AES 256 only: two extra leading rounds */
	lea	0x20(%KEYP), %KEYP
	movaps	-0x60(%KEYP), %KEY
	aesenc	%KEY, %STATE
	movaps	-0x50(%KEYP), %KEY
	aesenc	%KEY, %STATE

.align 4
.Lenc_tail_192:
	/* AES 192 and 256: two more rounds */
	movaps	-0x40(%KEYP), %KEY
	aesenc	%KEY, %STATE
	movaps	-0x30(%KEYP), %KEY
	aesenc	%KEY, %STATE

.align 4
.Lenc_tail_128:
	/* AES 128, 192, and 256: the last ten rounds */
	movaps	-0x20(%KEYP), %KEY
	aesenc	%KEY, %STATE
	movaps	-0x10(%KEYP), %KEY
	aesenc	%KEY, %STATE
	movaps	(%KEYP), %KEY
	aesenc	%KEY, %STATE
	movaps	0x10(%KEYP), %KEY
	aesenc	%KEY, %STATE
	movaps	0x20(%KEYP), %KEY
	aesenc	%KEY, %STATE
	movaps	0x30(%KEYP), %KEY
	aesenc	%KEY, %STATE
	movaps	0x40(%KEYP), %KEY
	aesenc	%KEY, %STATE
	movaps	0x50(%KEYP), %KEY
	aesenc	%KEY, %STATE
	movaps	0x60(%KEYP), %KEY
	aesenc	%KEY, %STATE
	movaps	0x70(%KEYP), %KEY
	aesenclast	%KEY, %STATE		/* last round */
	movups	%STATE, (%OUTP)			/* output block */

	SET_TS_OR_POP_XMM0_XMM1(%r10)
	/* EXPORT DELETE END */
	ret
	SET_SIZE(aes_encrypt_intel)
776
777
/*
 * aes_decrypt_intel()
 * Decrypt a single block (in and out can overlap).
 *
 * For kernel code, caller is responsible for ensuring kpreempt_disable()
 * has been called.  This is because %xmm registers are not saved/restored.
 * Clear and set the CR0.TS bit on entry and exit, respectively,  if TS is set
 * on entry.  Otherwise, if TS is not set, save and restore %xmm registers
 * on the stack.
 *
 * Temporary register usage:
 * %xmm0	State
 * %xmm1	Key
 *
 * Original OpenSolaris Interface:
 * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
 *	const uint32_t ct[4], uint32_t pt[4])
 *
 * Original Intel OpenSSL Interface:
 * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
 *	const AES_KEY *key);
 */
ENTRY_NP(aes_decrypt_intel)
	/* EXPORT DELETE START */
	CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)

	movups	(%INP), %STATE			/* input block (may be unaligned) */
	movaps	(%KEYP), %KEY			/* round key 0 */
#ifdef	OPENSSL_INTERFACE
	mov	240(%KEYP), %NROUNDS32		/* round count */
#else	/* OpenSolaris Interface */
	/* Round count is already present as P2 in %rsi/%esi */
#endif	/* OPENSSL_INTERFACE */

	pxor	%KEY, %STATE			/* round 0 */
	/*
	 * Bias KEYP so the shared tail can reach the final ten round keys
	 * at the fixed offsets -0x20 .. 0x70, whatever the key size.
	 */
	lea	0x30(%KEYP), %KEYP
	cmp	$12, %NROUNDS
	jb	.Ldec128
	lea	0x20(%KEYP), %KEYP		/* lea keeps the cmp flags */
	je	.Ldec192

	/* AES 256 only: two extra leading rounds */
	lea	0x20(%KEYP), %KEYP
	movaps	-0x60(%KEYP), %KEY
	aesdec	%KEY, %STATE
	movaps	-0x50(%KEYP), %KEY
	aesdec	%KEY, %STATE

.align 4
.Ldec192:
	/* AES 192 and 256: two more rounds */
	movaps	-0x40(%KEYP), %KEY
	aesdec	%KEY, %STATE
	movaps	-0x30(%KEYP), %KEY
	aesdec	%KEY, %STATE

.align 4
.Ldec128:
	/* AES 128, 192, and 256: the last ten rounds */
	movaps	-0x20(%KEYP), %KEY
	aesdec	%KEY, %STATE
	movaps	-0x10(%KEYP), %KEY
	aesdec	%KEY, %STATE
	movaps	(%KEYP), %KEY
	aesdec	%KEY, %STATE
	movaps	0x10(%KEYP), %KEY
	aesdec	%KEY, %STATE
	movaps	0x20(%KEYP), %KEY
	aesdec	%KEY, %STATE
	movaps	0x30(%KEYP), %KEY
	aesdec	%KEY, %STATE
	movaps	0x40(%KEYP), %KEY
	aesdec	%KEY, %STATE
	movaps	0x50(%KEYP), %KEY
	aesdec	%KEY, %STATE
	movaps	0x60(%KEYP), %KEY
	aesdec	%KEY, %STATE
	movaps	0x70(%KEYP), %KEY
	aesdeclast	%KEY, %STATE		/* last round */
	movups	%STATE, (%OUTP)			/* output block */

	SET_TS_OR_POP_XMM0_XMM1(%r10)
	/*
	 * The ret stays outside the EXPORT DELETE region, as in the other
	 * entry points, so a stripped build still returns to its caller.
	 */
	/* EXPORT DELETE END */
	ret
	SET_SIZE(aes_decrypt_intel)
863
864#endif	/* lint || __lint */
865