/* xref: /linux/arch/x86/crypto/aesni-intel_asm.S (revision c532de5a67a70f8533d495f8f2aaa9a0491c3ad0) */
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>
#include <asm/frame.h>

/*
 * XMM register aliases.
 *
 * STATE1..STATE4 carry the AES block state(s); STATE is STATE1.
 * IN1..IN4 hold input blocks; IN is IN1.
 * KEY holds the current round key and is also the scratch register of
 * _aesni_gf128mul_x_ble.  IV holds the IV / counter / tweak block.
 *
 * IN3, IN4, BSWAP_MASK, CTR and INC name %xmm8 and above; in this file
 * they are only referenced from code guarded by __x86_64__.
 */
#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

/* Shares %xmm7 with IN2: only one of the two can be live at a time. */
#define GF128MUL_MASK %xmm7

/*
 * General-purpose register aliases.
 *
 * Aliasing to keep in mind: UKEYP is OUTP and TKEYP is T1 on both targets;
 * on 32-bit OUTP is additionally AREG (%eax).  T2/TCTR_LOW exist only on
 * x86_64.  On x86_64 KEYP, OUTP, INP, LEN and IVP are the first five
 * System V integer argument registers; the 32-bit entry points load the
 * same aliases from the stack instead.
 */
#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif

/*
 * _key_expansion_256a / _key_expansion_128:	internal ABI
 *	Derive one 16-byte round key from the previous one and append it to
 *	the key schedule.
 * input:
 *	%xmm0:	previous round key
 *	%xmm1:	AESKEYGENASSIST result supplied by the caller
 *	%xmm4:	low dword must be zero (aesni_set_key clears %xmm4 first)
 *	TKEYP:	address the new round key is stored to
 * output:
 *	%xmm0:	new round key, also stored at (TKEYP)
 *	TKEYP:	advanced by 0x10
 * changed:
 *	%xmm1, %xmm4 (its low dword is still zero on return)
 */
SYM_FUNC_START_LOCAL(_key_expansion_256a)
	pshufd $0b11111111, %xmm1, %xmm1	/* broadcast dword 3 of the assist value */
	/*
	 * The two shufps/pxor pairs leave every dword of %xmm0 XORed with all
	 * of the dwords below it.
	 */
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0			/* mix in the assist value */
	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	RET
SYM_FUNC_END(_key_expansion_256a)
SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)

/*
 * _key_expansion_192a:	internal ABI
 *	AES-192 key schedule step that emits 32 bytes of round-key material.
 * input:
 *	%xmm0:	previous four key words
 *	%xmm2:	previous two key words in its low 8 bytes
 *	%xmm1:	AESKEYGENASSIST result supplied by the caller
 *	%xmm4:	low dword must be zero
 *	TKEYP:	address the output is stored to
 * output:
 *	%xmm0, %xmm2:	next key words
 *	(TKEYP):	{ old %xmm2 low 8 bytes, new %xmm0 low 8 bytes }
 *	0x10(TKEYP):	{ new %xmm0 high 8 bytes, new %xmm2 low 8 bytes }
 *	TKEYP:	advanced by 0x20
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5, %xmm6
 */
SYM_FUNC_START_LOCAL(_key_expansion_192a)
	pshufd $0b01010101, %xmm1, %xmm1	/* broadcast dword 1 of the assist value */
	/* XOR every dword of %xmm0 with all of the dwords below it. */
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	movaps %xmm2, %xmm6			/* keep the old %xmm2 for the first store */
	pslldq $4, %xmm5			/* old %xmm2 shifted up by one dword */
	pshufd $0b11111111, %xmm0, %xmm3	/* broadcast dword 3 of the new %xmm0 */
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, %xmm1
	shufps $0b01000100, %xmm0, %xmm6	/* { old xmm2[0..1], new xmm0[0..1] } */
	movaps %xmm6, (TKEYP)
	shufps $0b01001110, %xmm2, %xmm1	/* { new xmm0[2..3], new xmm2[0..1] } */
	movaps %xmm1, 0x10(TKEYP)
	add $0x20, TKEYP
	RET
SYM_FUNC_END(_key_expansion_192a)

/*
 * _key_expansion_192b:	internal ABI
 *	AES-192 key schedule step that emits 16 bytes of round-key material.
 * input:
 *	%xmm0:	previous four key words
 *	%xmm2:	previous two key words in its low 8 bytes
 *	%xmm1:	AESKEYGENASSIST result supplied by the caller
 *	%xmm4:	low dword must be zero
 *	TKEYP:	address the output is stored to
 * output:
 *	%xmm0, %xmm2:	next key words
 *	(TKEYP):	new %xmm0
 *	TKEYP:	advanced by 0x10
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5
 */
SYM_FUNC_START_LOCAL(_key_expansion_192b)
	pshufd $0b01010101, %xmm1, %xmm1	/* broadcast dword 1 of the assist value */
	/* XOR every dword of %xmm0 with all of the dwords below it. */
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	pslldq $4, %xmm5			/* old %xmm2 shifted up by one dword */
	pshufd $0b11111111, %xmm0, %xmm3	/* broadcast dword 3 of the new %xmm0 */
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	RET
SYM_FUNC_END(_key_expansion_192b)

/*
 * _key_expansion_256b:	internal ABI
 *	Same step as _key_expansion_256a, but it updates %xmm2 instead of
 *	%xmm0 and uses dword 2 of the assist value.
 * input:
 *	%xmm2:	round key to update
 *	%xmm1:	AESKEYGENASSIST result supplied by the caller
 *	%xmm4:	low dword must be zero
 *	TKEYP:	address the new round key is stored to
 * output:
 *	%xmm2:	new round key, also stored at (TKEYP)
 *	TKEYP:	advanced by 0x10
 * changed:
 *	%xmm1, %xmm4 (its low dword is still zero on return)
 */
SYM_FUNC_START_LOCAL(_key_expansion_256b)
	pshufd $0b10101010, %xmm1, %xmm1	/* broadcast dword 2 of the assist value */
	/* XOR every dword of %xmm2 with all of the dwords below it. */
	shufps $0b00010000, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	shufps $0b10001100, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	pxor %xmm1, %xmm2
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	RET
SYM_FUNC_END(_key_expansion_256b)

/*
 * void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                    unsigned int key_len)
 *
 * Expand in_key into the context at ctx.  As written by this routine the
 * context holds:
 *	offset   0:	encryption round keys
 *	offset 240:	decryption round keys (encryption keys in reverse
 *			order, the inner ones passed through AESIMC)
 *	offset 480:	key_len
 * The low byte of key_len is compared with 24: below selects the 128-bit
 * path, equal the 192-bit path, above the 256-bit path.
 * ctx must be 16-byte aligned (movaps is used on it); in_key is read with
 * unaligned loads.
 */
SYM_FUNC_START(aesni_set_key)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
#endif
	movups (UKEYP), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (KEYP)
	lea 0x10(KEYP), TKEYP		# key addr
	movl %edx, 480(KEYP)
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192
	/* 256-bit key: the second 16 bytes are round key 1. */
	movups 0x10(UKEYP), %xmm2	# other user key
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
	call _key_expansion_256a
	aeskeygenassist $0x1, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
	call _key_expansion_256a
	aeskeygenassist $0x2, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
	call _key_expansion_256a
	aeskeygenassist $0x4, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
	call _key_expansion_256a
	aeskeygenassist $0x8, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
	call _key_expansion_256a
	aeskeygenassist $0x10, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
	call _key_expansion_256a
	aeskeygenassist $0x20, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	/* 192-bit key: only 8 more key bytes are loaded. */
	movq 0x10(UKEYP), %xmm2		# other user key
	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
	call _key_expansion_192a
	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
	call _key_expansion_192b
	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
	call _key_expansion_192a
	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
	call _key_expansion_192b
	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
	call _key_expansion_192a
	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
	call _key_expansion_192b
	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
	call _key_expansion_192a
	aeskeygenassist $0x80, %xmm2, %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	aeskeygenassist $0x1, %xmm0, %xmm1	# round 1
	call _key_expansion_128
	aeskeygenassist $0x2, %xmm0, %xmm1	# round 2
	call _key_expansion_128
	aeskeygenassist $0x4, %xmm0, %xmm1	# round 3
	call _key_expansion_128
	aeskeygenassist $0x8, %xmm0, %xmm1	# round 4
	call _key_expansion_128
	aeskeygenassist $0x10, %xmm0, %xmm1	# round 5
	call _key_expansion_128
	aeskeygenassist $0x20, %xmm0, %xmm1	# round 6
	call _key_expansion_128
	aeskeygenassist $0x40, %xmm0, %xmm1	# round 7
	call _key_expansion_128
	aeskeygenassist $0x80, %xmm0, %xmm1	# round 8
	call _key_expansion_128
	aeskeygenassist $0x1b, %xmm0, %xmm1	# round 9
	call _key_expansion_128
	aeskeygenassist $0x36, %xmm0, %xmm1	# round 10
	call _key_expansion_128
.Ldec_key:
	/*
	 * Build the decryption schedule at ctx + 240.  TKEYP is stepped back
	 * onto the last encryption round key.  The first and last round keys
	 * swap places unchanged; the keys in between are copied in reverse
	 * order through AESIMC.
	 */
	sub $0x10, TKEYP
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)
	movaps %xmm1, 240(KEYP)
	add $0x10, KEYP
	lea 240-16(TKEYP), UKEYP	/* UKEYP is reused as the write cursor */
.align 4
.Ldec_key_loop:
	movaps (KEYP), %xmm0
	aesimc %xmm0, %xmm1
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
#ifndef __x86_64__
	popl KEYP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_set_key)

/*
 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
 *
 * Encrypt the single 16-byte block at src into dst.  src and dst are
 * accessed with unaligned moves.  The key length is read from offset 480
 * of ctx, where aesni_set_key stores it.
 */
SYM_FUNC_START(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	/* Two registers were pushed, so the arguments start at FRAME_OFFSET+12. */
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movl 480(KEYP), KLEN		# key length
	movups (INP), STATE		# input
	call _aesni_enc1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_enc)

/*
 * _aesni_enc1:		internal ABI
 *	Encrypt one block.
 * input:
 *	KEYP:		pointer to the encryption round keys
 *	KLEN:		key length in bytes (compared with 24 to pick the
 *			128/192/256-bit path)
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * TKEYP is biased to KEYP + 0x30 / 0x50 / 0x70 for 128 / 192 / 256-bit
 * keys so that the shared tail at .Lenc128 uses the same displacements for
 * every key size.
 */
SYM_FUNC_START_LOCAL(_aesni_enc1)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP		/* lea keeps the flags of the cmp for the je below */
	je .Lenc192
	add $0x20, TKEYP
	/* 256-bit keys only: two extra rounds. */
	movaps -0x60(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x50(TKEYP), KEY
	aesenc KEY, STATE
.align 4
.Lenc192:
	/* 192- and 256-bit keys: two extra rounds. */
	movaps -0x40(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x30(TKEYP), KEY
	aesenc KEY, STATE
.align 4
.Lenc128:
	/* Common tail: nine AESENC rounds followed by AESENCLAST. */
	movaps -0x20(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x10(TKEYP), KEY
	aesenc KEY, STATE
	movaps (TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x10(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x20(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x30(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x40(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x50(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x60(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x70(TKEYP), KEY
	aesenclast KEY, STATE
	RET
SYM_FUNC_END(_aesni_enc1)

/*
 * _aesni_enc4:	internal ABI
 *	Encrypt four independent blocks with the same key, with the rounds
 *	of the four states interleaved.
 * input:
 *	KEYP:		pointer to the encryption round keys
 *	KLEN:		key length in bytes (compared with 24 to pick the
 *			128/192/256-bit path)
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * TKEYP is biased exactly as in _aesni_enc1.
 */
SYM_FUNC_START_LOCAL(_aesni_enc4)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP		/* lea keeps the flags of the cmp for the je below */
	je .L4enc192
	add $0x20, TKEYP
	/* 256-bit keys only: two extra rounds. */
	movaps -0x60(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x50(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
#.align 4
.L4enc192:
	/* 192- and 256-bit keys: two extra rounds. */
	movaps -0x40(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x30(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
#.align 4
.L4enc128:
	/* Common tail: nine AESENC rounds followed by AESENCLAST. */
	movaps -0x20(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x10(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps (TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x10(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x20(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x30(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x40(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x50(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x60(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x70(TKEYP), KEY
	aesenclast KEY, STATE1		# last round
	aesenclast KEY, STATE2
	aesenclast KEY, STATE3
	aesenclast KEY, STATE4
	RET
SYM_FUNC_END(_aesni_enc4)

/*
 * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
 *
 * Decrypt the single 16-byte block at src into dst.  The key length is
 * read from offset 480 of ctx, and KEYP is then advanced by 240 to the
 * decryption round keys that aesni_set_key builds there.
 */
SYM_FUNC_START(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	/* Two registers were pushed, so the arguments start at FRAME_OFFSET+12. */
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	mov 480(KEYP), KLEN		# key length
	add $240, KEYP			/* switch to the decryption round keys */
	movups (INP), STATE		# input
	call _aesni_dec1
	movups STATE, (OUTP)		#output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_dec)

/*
 * _aesni_dec1:		internal ABI
 *	Decrypt one block.
 * input:
 *	KEYP:		pointer to the decryption round keys
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * TKEYP is biased to KEYP + 0x30 / 0x50 / 0x70 for 128 / 192 / 256-bit
 * keys so that the shared tail at .Ldec128 uses the same displacements for
 * every key size.
 */
SYM_FUNC_START_LOCAL(_aesni_dec1)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP		/* lea keeps the flags of the cmp for the je below */
	je .Ldec192
	add $0x20, TKEYP
	/* 256-bit keys only: two extra rounds. */
	movaps -0x60(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE
.align 4
.Ldec192:
	/* 192- and 256-bit keys: two extra rounds. */
	movaps -0x40(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE
.align 4
.Ldec128:
	/* Common tail: nine AESDEC rounds followed by AESDECLAST. */
	movaps -0x20(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps (TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE
	RET
SYM_FUNC_END(_aesni_dec1)

/*
 * _aesni_dec4:	internal ABI
 *	Decrypt four independent blocks with the same key, with the rounds
 *	of the four states interleaved.
 * input:
 *	KEYP:		pointer to the decryption round keys
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * TKEYP is biased exactly as in _aesni_dec1.
 */
SYM_FUNC_START_LOCAL(_aesni_dec4)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP		/* lea keeps the flags of the cmp for the je below */
	je .L4dec192
	add $0x20, TKEYP
	/* 256-bit keys only: two extra rounds. */
	movaps -0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec192:
	/* 192- and 256-bit keys: two extra rounds. */
	movaps -0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec128:
	/* Common tail: nine AESDEC rounds followed by AESDECLAST. */
	movaps -0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps (TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE1		# last round
	aesdeclast KEY, STATE2
	aesdeclast KEY, STATE3
	aesdeclast KEY, STATE4
	RET
SYM_FUNC_END(_aesni_dec4)

/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * ECB-encrypt len bytes from src to dst, four blocks at a time while at
 * least 64 bytes remain and one block at a time afterwards.  A trailing
 * partial block (fewer than 16 bytes) is neither read nor written.
 * len is unsigned, so every length test uses an unsigned condition.
 */
SYM_FUNC_START(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	/* Three registers were pushed, so the arguments start at FRAME_OFFSET+16. */
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN		# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN
	jb .Lecb_enc_ret
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_enc_loop4	/* unsigned: at least 64 bytes left */
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_enc_loop1	/* unsigned: at least one full block left */
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_enc)

/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * ECB-decrypt len bytes from src to dst using the decryption round keys at
 * ctx + 240, four blocks at a time while at least 64 bytes remain and one
 * block at a time afterwards.  A trailing partial block (fewer than 16
 * bytes) is neither read nor written.
 * len is unsigned, so every length test uses an unsigned condition.
 */
SYM_FUNC_START(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	/* Three registers were pushed, so the arguments start at FRAME_OFFSET+16. */
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP		/* switch to the decryption round keys */
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_dec_loop4	/* unsigned: at least 64 bytes left */
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_dec_loop1	/* unsigned: at least one full block left */
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_dec)

/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-encrypt the full 16-byte blocks of src into dst, one block at a
 * time.  The chaining value is read from iv and, if at least one block was
 * processed, the last ciphertext block is written back to iv.  With fewer
 * than 16 bytes nothing is read or written.
 * len is unsigned, so every length test uses an unsigned condition.
 */
SYM_FUNC_START(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	/* Four registers were pushed, so the arguments start at FRAME_OFFSET+20. */
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	/* STATE still holds the previous ciphertext block (or the IV). */
	movups (INP), IN	# load input
	pxor IN, STATE
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_enc_loop	/* unsigned: at least one full block left */
	movups STATE, (IVP)
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_enc)

/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-decrypt the full 16-byte blocks of src into dst, four blocks at a
 * time while at least 64 bytes remain and one block at a time afterwards.
 * The chaining value is read from iv and, if at least one block was
 * processed, the last ciphertext block is written back to iv.
 * len is unsigned, so every length test uses an unsigned condition.
 *
 * The 32-bit build has no IN3/IN4, so IN1/IN2 are loaded with blocks 2 and
 * 3 before the call and the first two ciphertext blocks are re-read from
 * INP afterwards.  That re-read happens before any store to OUTP.
 */
SYM_FUNC_START(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	/* Four registers were pushed, so the arguments start at FRAME_OFFSET+20. */
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN
	add $240, KEYP		/* switch to the decryption round keys */
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	/* Each plaintext block is D(C[i]) ^ C[i-1]; the first one uses IV. */
	pxor IV, STATE1
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV
#else
	pxor IN1, STATE4	/* IN1 holds ciphertext block 2 here */
	movaps IN2, IV		/* IN2 holds ciphertext block 3 here */
	movups (INP), IN1
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lcbc_dec_loop4	/* unsigned: at least 64 bytes left */
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV		/* this ciphertext block chains into the next one */
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_dec_loop1	/* unsigned: at least one full block left */
.Lcbc_dec_ret:
	movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_dec)

/*
 * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			  size_t len, u8 *iv)
 *
 * Encrypt the final two blocks of a CBC message with ciphertext stealing.
 * Exactly two AES operations are performed; iv is read but not written
 * back (IVP is reused as a table pointer).
 * NOTE(review): no length check is done here; the code appears to assume
 * 16 < len <= 32 -- confirm against the C caller.
 *
 * With t = len - 16, two 16-byte windows of .Lcts_permute_table are used
 * as PSHUFB masks: the one at table + t (%xmm4) and the one at
 * table + 32 - t (%xmm5).
 */
SYM_FUNC_START(aesni_cts_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	/* Four registers were pushed, so the arguments start at FRAME_OFFSET+20. */
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	mov 480(KEYP), KLEN
	movups (IVP), STATE
	sub $16, LEN			/* LEN = t, the size of the tail */
	mov T1, IVP
	add $32, IVP
	add LEN, T1			/* T1  = table + t */
	sub LEN, IVP			/* IVP = table + 32 - t */
	/* Both masks are loaded now because _aesni_enc1 clobbers T1 (TKEYP). */
	movups (T1), %xmm4
	movups (IVP), %xmm5

	movups (INP), IN1		/* first block */
	add LEN, INP
	movups (INP), IN2		/* last 16 bytes of the input */

	pxor IN1, STATE
	call _aesni_enc1

	pshufb %xmm5, IN2		/* tail bytes moved to the front, rest zeroed */
	pxor STATE, IN2
	pshufb %xmm4, STATE
	add OUTP, LEN
	/*
	 * This 16-byte store at dst + t puts the first t bytes of the first
	 * ciphertext block at dst + 16.  The bytes it writes below dst + 16 are
	 * overwritten by the store to (OUTP) further down.
	 */
	movups STATE, (LEN)

	movaps IN2, STATE
	call _aesni_enc1
	movups STATE, (OUTP)

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cts_cbc_enc)

/*
 * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			  size_t len, u8 *iv)
 *
 * Decrypt the final two blocks of a CBC message that was encrypted with
 * ciphertext stealing.  Exactly two AES operations are performed; iv is
 * read but not written back (IVP is reused as a table pointer).
 * NOTE(review): no length check is done here; the code appears to assume
 * 16 < len <= 32 -- confirm against the C caller.
 *
 * With t = len - 16, the 16-byte windows of .Lcts_permute_table at
 * table + t and at table + 32 - t are used as PSHUFB / PBLENDVB masks.
 */
SYM_FUNC_START(aesni_cts_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	/* Four registers were pushed, so the arguments start at FRAME_OFFSET+20. */
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	mov 480(KEYP), KLEN
	add $240, KEYP			/* switch to the decryption round keys */
	movups (IVP), IV
	sub $16, LEN			/* LEN = t, the size of the tail */
	mov T1, IVP
	add $32, IVP
	add LEN, T1			/* T1  = table + t */
	sub LEN, IVP			/* IVP = table + 32 - t */
	/* Loaded before the call because _aesni_dec1 clobbers T1 (TKEYP). */
	movups (T1), %xmm4

	movups (INP), STATE		/* first ciphertext block */
	add LEN, INP
	movups (INP), IN1		/* last 16 bytes of the input */

	call _aesni_dec1
	movaps STATE, IN2		/* keep the raw decryption for the blend below */
	pshufb %xmm4, STATE
	pxor IN1, STATE

	add OUTP, LEN
	/*
	 * This 16-byte store at dst + t puts the t tail bytes of plaintext at
	 * dst + 16.  The bytes it writes below dst + 16 are overwritten by the
	 * store to (OUTP) further down.
	 */
	movups STATE, (LEN)

	movups (IVP), %xmm0		/* %xmm0 is the implicit mask of pblendvb */
	pshufb %xmm0, IN1		/* tail bytes moved to the front, rest zeroed */
	pblendvb IN2, IN1		/* remaining bytes are taken from IN2 */
	movaps IN1, STATE
	call _aesni_dec1

	pxor IV, STATE
	movups STATE, (OUTP)

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cts_cbc_dec)

.pushsection .rodata
.align 16
/*
 * 48-byte PSHUFB control table for the ciphertext-stealing code: 16 bytes
 * of 0x80 (PSHUFB writes zero for such a control byte), the identity
 * permutation 0..15, then 16 more bytes of 0x80.  Callers load unaligned
 * 16-byte windows at table + n and table + 32 - n.
 */
.Lcts_permute_table:
	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
#ifdef __x86_64__
/* PSHUFB control that reverses the 16 bytes of a register (used by CTR mode). */
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
#endif
.popsection

#ifdef __x86_64__
/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
SYM_FUNC_START_LOCAL(_aesni_inc_init)
	movaps .Lbswap_mask(%rip), BSWAP_MASK
	movaps IV, CTR
	pshufb BSWAP_MASK, CTR		/* byte-reverse the IV into CTR */
	mov $1, TCTR_LOW
	movq TCTR_LOW, INC		/* INC = 1 in the low qword, high qword zero */
	movq CTR, TCTR_LOW		/* integer copy of the low qword of CTR */
	RET
SYM_FUNC_END(_aesni_inc_init)

/*
 * _aesni_inc:		internal ABI
 *	Increase IV by 1, IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	Increase by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
SYM_FUNC_START_LOCAL(_aesni_inc)
	paddq INC, CTR
	/*
	 * paddq does not carry between qwords, so the same increment is
	 * repeated on the integer copy to detect a wrap of the low qword.
	 */
	add $1, TCTR_LOW
	jnc .Linc_low
	/* Low qword wrapped: add 1 to the high qword, then restore INC. */
	pslldq $8, INC
	paddq INC, CTR
	psrldq $8, INC
.Linc_low:
	movaps CTR, IV
	pshufb BSWAP_MASK, IV		/* back to big endian */
	RET
SYM_FUNC_END(_aesni_inc)

/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR-mode en/decrypt the full 16-byte blocks of src into dst (x86_64
 * only).  Each block is XORed with the encryption of the current counter.
 * The counter is read from iv and, if at least one block was processed,
 * the next counter value is written back to iv.  A trailing partial block
 * is not processed.
 * len is unsigned, so every length test uses an unsigned condition.
 */
SYM_FUNC_START(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	/* Snapshot four consecutive counter values, interleaved with the loads. */
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lctr_enc_loop4	/* unsigned: at least 64 bytes left */
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lctr_enc_loop1	/* unsigned: at least one full block left */
.Lctr_enc_ret:
	movups IV, (IVP)
.Lctr_enc_just_ret:
	FRAME_END
	RET
SYM_FUNC_END(aesni_ctr_enc)

#endif

/*
 * Constant for _aesni_gf128mul_x_ble: 0x87 in the low qword and 0x01 in the
 * high qword.
 */
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.previous

/*
 * _aesni_gf128mul_x_ble: Multiply in GF(2^128) for XTS IVs
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	KEY:	== temporary value
 *
 * paddq doubles each 64-bit half of IV separately, so the two bits that
 * are shifted out are patched back in: bit 63 becomes bit 64 (the 0x01 in
 * the mask) and bit 127 folds back as 0x87 into the low byte.
 */
.macro _aesni_gf128mul_x_ble
	pshufd $0x13, IV, KEY		/* dword 3 -> lane 0, dword 1 -> lane 2 */
	paddq IV, IV
	psrad $31, KEY			/* each lane becomes all ones if its top bit was set */
	pand GF128MUL_MASK, KEY
	pxor KEY, IV
.endm

/*
 * _aesni_xts_crypt enc: body shared by aesni_xts_enc (enc=1) and
 * aesni_xts_dec (enc=0).
 *
 * Register use:
 *	IV		current XTS tweak, advanced with _aesni_gf128mul_x_ble
 *	GF128MUL_MASK	tweak constant; it shares %xmm7 with IN2, which is
 *			only overwritten in the ciphertext-stealing tail,
 *			after the last tweak update
 *	LEN		bytes left; it is allowed to go negative, hence the
 *			signed jl tests
 *
 * Flow:
 *	.Lxts_loop4	four blocks per iteration while at least 64 bytes remain
 *	.Lxts_loop1	one block per iteration
 *	.Lxts_cts*	ciphertext stealing when len is not a multiple of 16
 *
 * The tweak is written back to (IVP) only on the .Lxts_ret_iv path.  The
 * ciphertext-stealing path returns through .Lxts_ret, because it reuses
 * IVP as a table pointer.
 */
.macro	_aesni_xts_crypt	enc
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
#else
	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
#endif
	movups (IVP), IV

	mov 480(KEYP), KLEN
.if !\enc
	add $240, KEYP			/* switch to the decryption round keys */

	/*
	 * Decryption with a partial final block: hold one full block back
	 * from the bulk loops, it is handled in the stealing tail.
	 */
	test $15, LEN
	jz .Lxts_loop4\@
	sub $16, LEN
.endif

.Lxts_loop4\@:
	sub $64, LEN
	jl .Lxts_1x\@

	/*
	 * For each of the four blocks: STATEn = input ^ tweak.  The tweak is
	 * parked in the output buffer so that it can be XORed in again after
	 * the cipher call.
	 */
	movdqa IV, STATE1
	movdqu 0x00(INP), IN
	pxor IN, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE2
	movdqu 0x10(INP), IN
	pxor IN, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE3
	movdqu 0x20(INP), IN
	pxor IN, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE4
	movdqu 0x30(INP), IN
	pxor IN, STATE4
	movdqu IV, 0x30(OUTP)

.if \enc
	call _aesni_enc4
.else
	call _aesni_dec4
.endif

	/* Read each parked tweak back, XOR it in and store the result. */
	movdqu 0x00(OUTP), IN
	pxor IN, STATE1
	movdqu STATE1, 0x00(OUTP)

	movdqu 0x10(OUTP), IN
	pxor IN, STATE2
	movdqu STATE2, 0x10(OUTP)

	movdqu 0x20(OUTP), IN
	pxor IN, STATE3
	movdqu STATE3, 0x20(OUTP)

	movdqu 0x30(OUTP), IN
	pxor IN, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble		/* tweak for the block after this group */

	add $64, INP
	add $64, OUTP
	test LEN, LEN
	jnz .Lxts_loop4\@

.Lxts_ret_iv\@:
	movups IV, (IVP)

.Lxts_ret\@:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET

.Lxts_1x\@:
	add $64, LEN			/* undo the subtraction done by the 4x loop */
	jz .Lxts_ret_iv\@
.if \enc
	/*
	 * Fewer than 16 bytes left on entry: the block to steal from is
	 * STATE4, the last block written by the 4x loop.
	 */
	sub $16, LEN
	jl .Lxts_cts4\@
.endif

.Lxts_loop1\@:
	movdqu (INP), STATE
.if \enc
	pxor IV, STATE
	call _aesni_enc1
.else
	add $16, INP
	sub $16, LEN
	/* LEN went negative: this is the held-back block before a partial tail. */
	jl .Lxts_cts1\@
	pxor IV, STATE
	call _aesni_dec1
.endif
	pxor IV, STATE
	_aesni_gf128mul_x_ble

	test LEN, LEN
	jz .Lxts_out\@

.if \enc
	add $16, INP
	sub $16, LEN
	/* Partial tail follows: STATE has not been stored yet. */
	jl .Lxts_cts1\@
.endif

	movdqu STATE, (OUTP)
	add $16, OUTP
	jmp .Lxts_loop1\@

.Lxts_out\@:
	movdqu STATE, (OUTP)
	jmp .Lxts_ret_iv\@

.if \enc
.Lxts_cts4\@:
	movdqa STATE4, STATE
	sub $16, OUTP			/* back to where STATE4 was stored */
.Lxts_cts1\@:
.else
.Lxts_cts1\@:
	/*
	 * Decrypt the held-back block with the next tweak; the current tweak
	 * is saved in STATE4 for the final (stolen) block.
	 */
	movdqa IV, STATE4
	_aesni_gf128mul_x_ble

	pxor IV, STATE
	call _aesni_dec1
	pxor IV, STATE
.endif
	/*
	 * Ciphertext stealing.  On entry STATE holds the processed last full
	 * block and LEN is the tail size minus 16.
	 */
#ifndef __x86_64__
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	add LEN, INP		/* rewind input pointer */
	add $16, LEN		/* # bytes in final block */
	movups (INP), IN1

	mov T1, IVP
	add $32, IVP
	add LEN, T1			/* T1  = table + tail size */
	sub LEN, IVP			/* IVP = table + 32 - tail size */
	add OUTP, LEN			/* LEN = OUTP + tail size */

	movups (T1), %xmm4
	movaps STATE, IN2
	pshufb %xmm4, STATE
	/*
	 * This 16-byte store puts the leading bytes of STATE at OUTP + 16.
	 * The bytes it writes below OUTP + 16 are overwritten by the final
	 * store to (OUTP).
	 */
	movups STATE, (LEN)

	movups (IVP), %xmm0		/* %xmm0 is the implicit mask of pblendvb */
	pshufb %xmm0, IN1		/* tail bytes moved to the front, rest zeroed */
	pblendvb IN2, IN1		/* remaining bytes are taken from IN2 */
	movaps IN1, STATE

.if \enc
	pxor IV, STATE
	call _aesni_enc1
	pxor IV, STATE
.else
	pxor STATE4, STATE
	call _aesni_dec1
	pxor STATE4, STATE
.endif

	movups STATE, (OUTP)
	jmp .Lxts_ret\@
.endm

/*
 * void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *dst,
 *		      const u8 *src, unsigned int len, le128 *iv)
 *
 * XTS encryption; the whole body is the _aesni_xts_crypt macro expanded
 * with enc=1.
 */
SYM_FUNC_START(aesni_xts_enc)
	_aesni_xts_crypt	1
SYM_FUNC_END(aesni_xts_enc)

/*
 * void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *dst,
 *		      const u8 *src, unsigned int len, le128 *iv)
 *
 * XTS decryption; the whole body is the _aesni_xts_crypt macro expanded
 * with enc=0.
 */
SYM_FUNC_START(aesni_xts_dec)
	_aesni_xts_crypt	0
SYM_FUNC_END(aesni_xts_dec)
