xref: /linux/arch/x86/crypto/aesni-intel_asm.S (revision 6f7e6393d1ce636bb7ec77a7fe7b77458fddf701)
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * Implement AES algorithm in Intel AES-NI instructions.
4 *
5 * The white paper of AES-NI instructions can be downloaded from:
6 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
7 *
8 * Copyright (C) 2008, Intel Corp.
9 *    Author: Huang Ying <ying.huang@intel.com>
10 *            Vinodh Gopal <vinodh.gopal@intel.com>
11 *            Kahraman Akdemir
12 *
13 * Copyright (c) 2010, Intel Corporation.
14 *
15 * Ported x86_64 version to x86:
16 *    Author: Mathias Krause <minipli@googlemail.com>
17 */
18
19#include <linux/linkage.h>
20#include <linux/objtool.h>
21#include <asm/frame.h>
22
/*
 * Register aliases used by every routine in this file.
 */

/* Data blocks being processed; STATE is the single-block name for STATE1. */
#define STATE1	%xmm0
#define STATE	STATE1
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6

/*
 * Input blocks; IN is the single-block name for IN1.  IN3 and IN4 live in
 * %xmm8/%xmm9 and are only referenced from code built for x86_64.
 */
#define IN1	%xmm1
#define IN	IN1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9

/* Round key currently being applied, and the IV / counter / tweak value. */
#define KEY	%xmm2
#define IV	%xmm3

/* Counter bookkeeping for _aesni_inc_init and _aesni_inc. */
#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

/* Tweak-doubling mask for XTS.  NOTE: this is the same register as IN2. */
#define GF128MUL_MASK %xmm7

#ifdef __x86_64__
/* ctx, dst, src, len, iv arrive in the first five integer argument registers */
#define KEYP	%rdi
#define OUTP	%rsi
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define AREG	%rax
#define UKEYP	OUTP
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
/* 32-bit: every entry point loads its arguments from the stack itself */
#define AREG	%eax
#define OUTP	AREG
#define UKEYP	OUTP
#define KEYP	%edi
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif
67
/*
 * _key_expansion_256a (alias: _key_expansion_128):	internal ABI
 *	Compute one round key from the previous one and append it to the
 *	key schedule.
 * input:
 *	%xmm0:		previous round key
 *	%xmm1:		aeskeygenassist output for this round
 *	%xmm4:		scratch; aesni_set_key zeroes it before the first call
 *	TKEYP:		slot that receives the new round key
 * output:
 *	%xmm0:		new round key (also stored at the old TKEYP)
 *	TKEYP:		advanced by 0x10
 * changed:
 *	%xmm1, %xmm4
 */
SYM_FUNC_START_LOCAL(_key_expansion_256a)
	shufps $0x10, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0x8c, %xmm0, %xmm4
	pxor %xmm4, %xmm0		# with dword 0 of xmm4 == 0: prefix XOR of the key dwords
	pshufd $0xff, %xmm1, %xmm1	# broadcast dword 3 of the assist value
	pxor %xmm1, %xmm0
	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	RET
SYM_FUNC_END(_key_expansion_256a)
SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)
80
/*
 * _key_expansion_192a:	internal ABI
 *	192-bit key schedule step that emits 0x20 bytes of round-key material.
 * input:
 *	%xmm0:		first four dwords of the running 192-bit key state
 *	%xmm2:		remaining two dwords of the state (low qword)
 *	%xmm1:		aeskeygenassist output for this round
 *	%xmm4:		scratch; aesni_set_key zeroes it before the first call
 *	TKEYP:		next free slot in the key schedule
 * output:
 *	%xmm0, %xmm2:	updated key state
 *	TKEYP:		advanced by 0x20
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5, %xmm6
 */
SYM_FUNC_START_LOCAL(_key_expansion_192a)
	movaps %xmm2, %xmm6		# old tail, needed for the first store
	movaps %xmm2, %xmm5

	shufps $0x10, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0x8c, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pshufd $0x55, %xmm1, %xmm1	# broadcast dword 1 of the assist value
	pxor %xmm1, %xmm0		# xmm0 = next four key dwords

	pslldq $4, %xmm5
	pshufd $0xff, %xmm0, %xmm3	# broadcast the newest dword
	pxor %xmm5, %xmm2
	pxor %xmm3, %xmm2		# xmm2 = next two key dwords

	shufps $0x44, %xmm0, %xmm6	# old tail (2 dwords) followed by new dwords 0-1
	movaps %xmm6, (TKEYP)
	movaps %xmm0, %xmm1
	shufps $0x4e, %xmm2, %xmm1	# new dwords 2-3 followed by new tail (2 dwords)
	movaps %xmm1, 0x10(TKEYP)
	add $0x20, TKEYP
	RET
SYM_FUNC_END(_key_expansion_192a)
104
/*
 * _key_expansion_192b:	internal ABI
 *	192-bit key schedule step that emits 0x10 bytes of round-key material.
 * input:
 *	%xmm0:		first four dwords of the running 192-bit key state
 *	%xmm2:		remaining two dwords of the state (low qword)
 *	%xmm1:		aeskeygenassist output for this round
 *	%xmm4:		scratch; aesni_set_key zeroes it before the first call
 *	TKEYP:		next free slot in the key schedule
 * output:
 *	%xmm0, %xmm2:	updated key state (new %xmm0 is stored at the old TKEYP)
 *	TKEYP:		advanced by 0x10
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5
 */
SYM_FUNC_START_LOCAL(_key_expansion_192b)
	movaps %xmm2, %xmm5

	shufps $0x10, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0x8c, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pshufd $0x55, %xmm1, %xmm1	# broadcast dword 1 of the assist value
	pxor %xmm1, %xmm0		# xmm0 = next four key dwords
	movaps %xmm0, (TKEYP)

	pslldq $4, %xmm5
	pshufd $0xff, %xmm0, %xmm3	# broadcast the newest dword
	pxor %xmm5, %xmm2
	pxor %xmm3, %xmm2		# xmm2 = next two key dwords

	add $0x10, TKEYP
	RET
SYM_FUNC_END(_key_expansion_192b)
123
/*
 * _key_expansion_256b:	internal ABI
 *	Second half of a 256-bit key schedule round; same dataflow as
 *	_key_expansion_256a but operating on %xmm2 and using dword 2 of the
 *	assist value.
 * input:
 *	%xmm2:		round key two slots back
 *	%xmm1:		aeskeygenassist output for this round
 *	%xmm4:		scratch; aesni_set_key zeroes it before the first call
 *	TKEYP:		slot that receives the new round key
 * output:
 *	%xmm2:		new round key (also stored at the old TKEYP)
 *	TKEYP:		advanced by 0x10
 * changed:
 *	%xmm1, %xmm4
 */
SYM_FUNC_START_LOCAL(_key_expansion_256b)
	shufps $0x10, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	shufps $0x8c, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	pshufd $0xaa, %xmm1, %xmm1	# broadcast dword 2 of the assist value
	pxor %xmm1, %xmm2
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	RET
SYM_FUNC_END(_key_expansion_256b)
135
/*
 * void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                    unsigned int key_len)
 *
 * Expand in_key into ctx.  As written by this routine, ctx holds:
 *	offset 0:	encryption round keys, in order of use
 *	offset 240:	decryption round keys (encryption keys in reverse order,
 *			the inner ones passed through aesimc)
 *	offset 480:	key_len
 * key_len is dispatched on its low byte: below 24 selects the 128-bit
 * schedule, exactly 24 the 192-bit one, anything else the 256-bit one.
 * No validation of key_len happens here.
 */
SYM_FUNC_START(aesni_set_key)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
#endif
	movups (UKEYP), %xmm0		# first 16 key bytes are round key 0
	movaps %xmm0, (KEYP)
	lea 0x10(KEYP), TKEYP		# TKEYP = next free round key slot
	movl %edx, 480(KEYP)		# remember key_len for the cipher routines
	pxor %xmm4, %xmm4		# the _key_expansion_* helpers expect xmm4 == 0
	cmp $24, %dl
	jb .Lsetkey_128
	je .Lsetkey_192

	/* 256-bit key: the second 16 key bytes are round key 1 */
	movups 0x10(UKEYP), %xmm2
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	.irp rcon, 0x1, 0x2, 0x4, 0x8, 0x10, 0x20
	aeskeygenassist $\rcon, %xmm2, %xmm1
	call _key_expansion_256a
	aeskeygenassist $\rcon, %xmm0, %xmm1
	call _key_expansion_256b
	.endr
	aeskeygenassist $0x40, %xmm2, %xmm1	# last round needs only the first half
	call _key_expansion_256a
	jmp .Lsetkey_dec

.Lsetkey_192:
	movq 0x10(UKEYP), %xmm2		# remaining 8 key bytes
	.irp rcon, 0x1, 0x4, 0x10, 0x40
	aeskeygenassist $\rcon, %xmm2, %xmm1
	call _key_expansion_192a
	aeskeygenassist $(\rcon * 2), %xmm2, %xmm1
	call _key_expansion_192b
	.endr
	jmp .Lsetkey_dec

.Lsetkey_128:
	.irp rcon, 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
	aeskeygenassist $\rcon, %xmm0, %xmm1
	call _key_expansion_128
	.endr

.Lsetkey_dec:
	/*
	 * Build the decryption schedule at ctx + 240.  The first and last
	 * encryption round keys are copied across swapped; every key in
	 * between goes through aesimc and is stored in reverse order.
	 */
	sub $0x10, TKEYP		# TKEYP = last encryption round key
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)
	movaps %xmm1, 240(KEYP)
	add $0x10, KEYP			# source cursor, walks upwards
	lea 240-16(TKEYP), UKEYP	# destination cursor, walks downwards
.align 4
.Lsetkey_dec_loop:
	movaps (KEYP), %xmm0
	aesimc %xmm0, %xmm1
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Lsetkey_dec_loop
#ifndef __x86_64__
	popl KEYP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_set_key)
249
/*
 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
 *
 * Encrypt the single 16-byte block at src and write the result to dst.
 * Neither buffer has to be aligned (both are accessed with movups).
 */
SYM_FUNC_START(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movups (INP), STATE		# block to encrypt
	movl 480(KEYP), KLEN		# key length, as stored by aesni_set_key
	call _aesni_enc1
	movups STATE, (OUTP)		# encrypted block
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_enc)
273
/*
 * _aesni_enc1:		internal ABI
 *	Encrypt the one block held in STATE.
 * input:
 *	KEYP:		pointer to the encryption round keys
 *	KLEN:		key length in bytes (compared against 24 to select
 *			the 128-, 192- or 256-bit round sequence)
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * TKEYP is biased so that the last ten round keys always sit at
 * -0x20(TKEYP) .. 0x70(TKEYP); longer keys enter the same tail after
 * running their extra rounds first.
 */
SYM_FUNC_START_LOCAL(_aesni_enc1)
	movaps (KEYP), KEY		# round key 0
	mov KEYP, TKEYP
	pxor KEY, STATE			# round 0
	add $0x30, TKEYP		# bias for the 128-bit schedule
	cmp $24, KLEN
	jb .Lenc1_128
	lea 0x20(TKEYP), TKEYP		# lea keeps the flags of the cmp for the je
	je .Lenc1_192
	add $0x20, TKEYP
	.irp off, 0x60, 0x50		# two extra rounds for 256-bit keys
	movaps -\off(TKEYP), KEY
	aesenc KEY, STATE
	.endr
.align 4
.Lenc1_192:
	.irp off, 0x40, 0x30		# two extra rounds for 192-bit keys and up
	movaps -\off(TKEYP), KEY
	aesenc KEY, STATE
	.endr
.align 4
.Lenc1_128:
	.irp off, 0x20, 0x10
	movaps -\off(TKEYP), KEY
	aesenc KEY, STATE
	.endr
	.irp off, 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60
	movaps \off(TKEYP), KEY
	aesenc KEY, STATE
	.endr
	movaps 0x70(TKEYP), KEY
	aesenclast KEY, STATE		# last round
	RET
SYM_FUNC_END(_aesni_enc1)
330
/*
 * _aesni_enc4:	internal ABI
 *	Encrypt the four blocks held in STATE1..STATE4 with the same key.
 * input:
 *	KEYP:		pointer to the encryption round keys
 *	KLEN:		key length in bytes (compared against 24 to select
 *			the 128-, 192- or 256-bit round sequence)
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * Same TKEYP biasing as _aesni_enc1: the last ten round keys are always
 * at -0x20(TKEYP) .. 0x70(TKEYP).
 */
SYM_FUNC_START_LOCAL(_aesni_enc4)
	movaps (KEYP), KEY		# round key 0
	mov KEYP, TKEYP
	.irp blk, STATE1, STATE2, STATE3, STATE4
	pxor KEY, \blk			# round 0
	.endr
	add $0x30, TKEYP		# bias for the 128-bit schedule
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP		# lea keeps the flags of the cmp for the je
	je .L4enc192
	add $0x20, TKEYP
	.irp off, 0x60, 0x50		# two extra rounds for 256-bit keys
	movaps -\off(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	.endr
	/* no alignment directive in front of this label (disabled in the original) */
.L4enc192:
	.irp off, 0x40, 0x30		# two extra rounds for 192-bit keys and up
	movaps -\off(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	.endr
	/* no alignment directive in front of this label (disabled in the original) */
.L4enc128:
	.irp off, 0x20, 0x10
	movaps -\off(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	.endr
	.irp off, 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60
	movaps \off(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	.endr
	movaps 0x70(TKEYP), KEY
	aesenclast KEY, STATE1		# last round
	aesenclast KEY, STATE2
	aesenclast KEY, STATE3
	aesenclast KEY, STATE4
	RET
SYM_FUNC_END(_aesni_enc4)
438
/*
 * _aesni_dec1:		internal ABI
 *	Decrypt the one block held in STATE.
 * input:
 *	KEYP:		pointer to the decryption round keys (the callers in
 *			this file pass ctx + 240)
 *	KLEN:		key length in bytes (compared against 24 to select
 *			the 128-, 192- or 256-bit round sequence)
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * TKEYP is biased so that the last ten round keys always sit at
 * -0x20(TKEYP) .. 0x70(TKEYP); longer keys enter the same tail after
 * running their extra rounds first.
 */
SYM_FUNC_START_LOCAL(_aesni_dec1)
	movaps (KEYP), KEY		# round key 0
	mov KEYP, TKEYP
	pxor KEY, STATE			# round 0
	add $0x30, TKEYP		# bias for the 128-bit schedule
	cmp $24, KLEN
	jb .Ldec1_128
	lea 0x20(TKEYP), TKEYP		# lea keeps the flags of the cmp for the je
	je .Ldec1_192
	add $0x20, TKEYP
	.irp off, 0x60, 0x50		# two extra rounds for 256-bit keys
	movaps -\off(TKEYP), KEY
	aesdec KEY, STATE
	.endr
.align 4
.Ldec1_192:
	.irp off, 0x40, 0x30		# two extra rounds for 192-bit keys and up
	movaps -\off(TKEYP), KEY
	aesdec KEY, STATE
	.endr
.align 4
.Ldec1_128:
	.irp off, 0x20, 0x10
	movaps -\off(TKEYP), KEY
	aesdec KEY, STATE
	.endr
	.irp off, 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60
	movaps \off(TKEYP), KEY
	aesdec KEY, STATE
	.endr
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE		# last round
	RET
SYM_FUNC_END(_aesni_dec1)
495
/*
 * _aesni_dec4:	internal ABI
 *	Decrypt the four blocks held in STATE1..STATE4 with the same key.
 * input:
 *	KEYP:		pointer to the decryption round keys (the callers in
 *			this file pass ctx + 240)
 *	KLEN:		key length in bytes (compared against 24 to select
 *			the 128-, 192- or 256-bit round sequence)
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * Same TKEYP biasing as _aesni_dec1: the last ten round keys are always
 * at -0x20(TKEYP) .. 0x70(TKEYP).
 */
SYM_FUNC_START_LOCAL(_aesni_dec4)
	movaps (KEYP), KEY		# round key 0
	mov KEYP, TKEYP
	.irp blk, STATE1, STATE2, STATE3, STATE4
	pxor KEY, \blk			# round 0
	.endr
	add $0x30, TKEYP		# bias for the 128-bit schedule
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP		# lea keeps the flags of the cmp for the je
	je .L4dec192
	add $0x20, TKEYP
	.irp off, 0x60, 0x50		# two extra rounds for 256-bit keys
	movaps -\off(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	.endr
.align 4
.L4dec192:
	.irp off, 0x40, 0x30		# two extra rounds for 192-bit keys and up
	movaps -\off(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	.endr
.align 4
.L4dec128:
	.irp off, 0x20, 0x10
	movaps -\off(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	.endr
	.irp off, 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60
	movaps \off(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	.endr
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE1		# last round
	aesdeclast KEY, STATE2
	aesdeclast KEY, STATE3
	aesdeclast KEY, STATE4
	RET
SYM_FUNC_END(_aesni_dec4)
603
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * ECB-encrypt every complete 16-byte block of src into dst, four blocks
 * at a time while at least 64 bytes remain and one at a time afterwards.
 * A trailing partial block (len % 16 bytes) is neither read nor written;
 * len < 16 is a no-op.
 *
 * len is a size_t, so all length comparisons are unsigned.
 */
SYM_FUNC_START(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	cmp $16, LEN			# nothing to do below one block (covers len == 0)
	jb .Lecb_enc_ret
	mov 480(KEYP), KLEN		# key length, as stored by aesni_set_key
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_enc_loop4		# unsigned: LEN is a byte count
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_enc_loop1		# unsigned: LEN is a byte count
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_enc)
663
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * ECB-decrypt every complete 16-byte block of src into dst, four blocks
 * at a time while at least 64 bytes remain and one at a time afterwards.
 * A trailing partial block (len % 16 bytes) is neither read nor written;
 * len < 16 is a no-op.
 *
 * len is a size_t, so all length comparisons are unsigned.
 */
SYM_FUNC_START(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	cmp $16, LEN			# nothing to do below one block (covers len == 0)
	jb .Lecb_dec_ret
	mov 480(KEYP), KLEN		# key length, as stored by aesni_set_key
	add $240, KEYP			# switch to the decryption round keys
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_dec_loop4		# unsigned: LEN is a byte count
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_dec_loop1		# unsigned: LEN is a byte count
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_dec)
724
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-encrypt every complete 16-byte block of src into dst.  The chaining
 * value is read from iv and, once at least one block has been processed,
 * the last ciphertext block is written back to iv.  len < 16 is a no-op
 * that leaves iv untouched; a trailing partial block is ignored.
 *
 * len is a size_t, so all length comparisons are unsigned.
 */
SYM_FUNC_START(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN		# key length, as stored by aesni_set_key
	movups (IVP), STATE		# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN		# load input
	pxor IN, STATE			# chain: plaintext ^ previous ciphertext
	call _aesni_enc1
	movups STATE, (OUTP)		# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_enc_loop		# unsigned: LEN is a byte count
	movups STATE, (IVP)		# last ciphertext block is the next iv
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_enc)
768
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-decrypt every complete 16-byte block of src into dst, four blocks
 * at a time while at least 64 bytes remain and one at a time afterwards.
 * The chaining value is read from iv and, once at least one block has been
 * processed, the last ciphertext block is written back to iv.  len < 16
 * is a no-op that leaves iv untouched; a trailing partial block is
 * ignored.
 *
 * Every ciphertext block is loaded before the matching output block is
 * stored, and the 32-bit path reloads only blocks of the current group
 * before any store of that group.
 *
 * len is a size_t, so all length comparisons are unsigned.
 */
SYM_FUNC_START(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN		# key length, as stored by aesni_set_key
	add $240, KEYP			# switch to the decryption round keys
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	/* only two IN registers on 32-bit: reuse them for blocks 3 and 4 */
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1			# block 1 chains to the incoming iv
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV			# ciphertext block 4 is the next iv
#else
	pxor IN1, STATE4		# IN1 holds ciphertext block 3 here
	movaps IN2, IV			# IN2 holds ciphertext block 4 here
	movups (INP), IN1		# reload ciphertext blocks 1 and 2
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lcbc_dec_loop4		# unsigned: LEN is a byte count
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV			# this ciphertext block chains into the next one
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_dec_loop1		# unsigned: LEN is a byte count
.Lcbc_dec_ret:
	movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_dec)
861
/*
 * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
 *			  size_t len, u8 *iv)
 *
 * Encrypt the final two blocks of a CBC message with ciphertext stealing.
 * Exactly two block encryptions are performed; 16 bytes are read at src
 * and at src + len - 16, and 16 bytes are written at dst + len - 16 and
 * then at dst.  Nothing is written back to iv.
 *
 * NOTE(review): len is not validated here; the table offsets below only
 * stay inside .Lcts_permute_table for 16 <= len <= 32 - confirm against
 * the caller.
 */
SYM_FUNC_START(aesni_cts_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	movups (IVP), STATE		# fetch the iv before IVP becomes scratch
	mov 480(KEYP), KLEN
	sub $16, LEN			# LEN = size of the final (possibly short) block
	lea 32(T1), IVP
	add LEN, T1			# T1  = table + LEN
	sub LEN, IVP			# IVP = table + 32 - LEN
	movups (T1), %xmm4		# moves the low LEN bytes to the top, zero below
	movups (IVP), %xmm5		# moves the top LEN bytes to the bottom, zero above

	movups (INP), IN1		# first block
	add LEN, INP
	movups (INP), IN2		# last 16 bytes of the input

	pxor IN1, STATE
	call _aesni_enc1		# STATE = ciphertext of the first block

	pshufb %xmm5, IN2		# final block, zero padded to 16 bytes
	pxor STATE, IN2			# chain it to the first ciphertext block
	pshufb %xmm4, STATE
	add OUTP, LEN
	movups STATE, (LEN)		# truncated first ciphertext lands at dst + 16

	movaps IN2, STATE
	call _aesni_enc1
	movups STATE, (OUTP)		# overwrites the zero bytes of the store above

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cts_cbc_enc)
918
/*
 * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
 *			  size_t len, u8 *iv)
 *
 * Decrypt the final two blocks of a CBC message that was encrypted with
 * ciphertext stealing.  Exactly two block decryptions are performed;
 * 16 bytes are read at src and at src + len - 16, and 16 bytes are written
 * at dst + len - 16 and then at dst.  Nothing is written back to iv.
 *
 * NOTE(review): len is not validated here; the table offsets below only
 * stay inside .Lcts_permute_table for 16 <= len <= 32 - confirm against
 * the caller.
 */
SYM_FUNC_START(aesni_cts_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	movups (IVP), IV		# fetch the iv before IVP becomes scratch
	mov 480(KEYP), KLEN
	add $240, KEYP			# switch to the decryption round keys
	sub $16, LEN			# LEN = size of the final (possibly short) block
	lea 32(T1), IVP
	add LEN, T1			# T1  = table + LEN
	sub LEN, IVP			# IVP = table + 32 - LEN, used after the first call
	movups (T1), %xmm4		# moves the low LEN bytes to the top, zero below

	movups (INP), STATE		# first ciphertext block
	add LEN, INP
	movups (INP), IN1		# last 16 bytes of the input

	call _aesni_dec1
	movaps STATE, IN2		# keep the raw decryption for the blend below
	pshufb %xmm4, STATE
	pxor IN1, STATE

	add OUTP, LEN
	movups STATE, (LEN)		# final plaintext bytes land at dst + 16

	movups (IVP), %xmm0		# doubles as pshufb control and pblendvb selector
	pshufb %xmm0, IN1		# short ciphertext block moved to the bottom
	pblendvb IN2, IN1		# remaining bytes come from the raw decryption
	movaps IN1, STATE
	call _aesni_dec1

	pxor IV, STATE
	movups STATE, (OUTP)		# overwrites the leading bytes of the store above

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cts_cbc_dec)
979
.pushsection .rodata
.align 16
/*
 * pshufb control bytes for the ciphertext stealing code: sixteen 0x80
 * bytes, the identity permutation 0..15, then sixteen 0x80 bytes again.
 * A 16-byte window read at offset n (0 <= n <= 32) therefore shifts a
 * block by 16 - n byte positions; pshufb writes zero wherever the control
 * byte has bit 7 set.
 */
.Lcts_permute_table:
	.fill		16, 1, 0x80
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.fill		16, 1, 0x80
#ifdef __x86_64__
/* pshufb control that reverses the byte order of a 16-byte value */
.Lbswap_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
#endif
.popsection
994
995#ifdef __x86_64__
/*
 * _aesni_inc_init:	internal ABI
 *	Prepare the registers that _aesni_inc works on.
 * input:
 *	IV:		counter block, big endian
 * output:
 *	CTR:		IV with its 16 bytes reversed (little endian)
 *	TCTR_LOW:	low qword of CTR
 *	INC:		the constant 1 in the low qword, upper qword zero
 *	BSWAP_MASK:	byte-reversal mask loaded from .Lbswap_mask
 */
SYM_FUNC_START_LOCAL(_aesni_inc_init)
	mov $1, TCTR_LOW
	movq TCTR_LOW, INC		# movq clears the upper qword of INC
	movaps .Lbswap_mask(%rip), BSWAP_MASK
	movaps IV, CTR
	pshufb BSWAP_MASK, CTR
	movq CTR, TCTR_LOW		# integer shadow of the low counter qword
	RET
SYM_FUNC_END(_aesni_inc_init)
1016
/*
 * _aesni_inc:		internal ABI
 *	Add 1 to the 128-bit big endian counter in IV.
 * input:
 *	CTR:		current counter, little endian
 *	TCTR_LOW:	low qword of CTR
 *	INC:		1 in the low qword, as set up by _aesni_inc_init
 *	BSWAP_MASK:	byte-reversal mask
 * output:
 *	IV:		incremented counter, big endian
 * changed:
 *	CTR:		incremented counter, little endian
 *	TCTR_LOW:	low qword of CTR
 *
 * paddq adds per 64-bit lane with no carry between lanes, so the carry
 * out of the low qword is detected on the integer copy in TCTR_LOW and
 * applied to the high qword by hand.
 */
SYM_FUNC_START_LOCAL(_aesni_inc)
	add $1, TCTR_LOW		# CF set when the low qword wraps to zero
	paddq INC, CTR			# SSE arithmetic leaves the flags alone
	jnc .Linc_no_carry
	pslldq $8, INC			# move the 1 into the high qword ...
	paddq INC, CTR			# ... propagate the carry ...
	psrldq $8, INC			# ... and put INC back the way it was
.Linc_no_carry:
	movaps CTR, IV
	pshufb BSWAP_MASK, IV		# back to big endian
	RET
SYM_FUNC_END(_aesni_inc)
1044
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR-mode en/decrypt every complete 16-byte block of src into dst, four
 * blocks at a time while at least 64 bytes remain and one at a time
 * afterwards.  iv is the big endian counter block; once at least one
 * block has been processed, the counter for the next block is written
 * back to it.  len < 16 is a no-op that leaves iv untouched; a trailing
 * partial block is ignored.
 *
 * Only built for x86_64.  len is a size_t, so all length comparisons are
 * unsigned.
 */
SYM_FUNC_START(aesni_ctr_enc)
	ANNOTATE_NOENDBR
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN		# key length, as stored by aesni_set_key
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	/* each STATEn takes the current counter, then the counter advances */
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4		# STATEn = keystream block n
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lctr_enc_loop4		# unsigned: LEN is a byte count
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1		# STATE = keystream block
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lctr_enc_loop1		# unsigned: LEN is a byte count
.Lctr_enc_ret:
	movups IV, (IVP)		# hand the next counter value back
.Lctr_enc_just_ret:
	FRAME_END
	RET
SYM_FUNC_END(aesni_ctr_enc)
1108
1109#endif
1110
/*
 * Mask for _aesni_gf128mul_x_ble: 0x87 in the low qword and 0x01 in the
 * high qword.
 */
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.quad 0x0000000000000087, 0x0000000000000001
.previous
1116
/*
 * _aesni_gf128mul_x_ble: Multiply in GF(2^128) for XTS IVs
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	KEY:	== temporary value
 *
 * paddq doubles the two 64-bit halves of IV independently.  KEY is built
 * from the old top bit of each half: bit 63 becomes the carry into the
 * high half (the 0x01 in the mask) and bit 127 becomes the 0x87 that is
 * folded into the low half.
 */
.macro _aesni_gf128mul_x_ble
	pshufd $0x13, IV, KEY		# dword 0 <- old dword 3, dword 2 <- old dword 1
	psrad $31, KEY			# every dword becomes a copy of its sign bit
	pand GF128MUL_MASK, KEY		# keep 0x87 / 0x01 where that bit was set
	paddq IV, IV			# shift both halves left by one bit
	pxor KEY, IV
.endm
1134
/*
 * _aesni_xts_crypt enc
 *	Function body shared by aesni_xts_enc (enc = 1) and aesni_xts_dec
 *	(enc = 0); arguments are ctx, dst, src, len, iv.
 *
 * Whole blocks are handled four at a time, then one at a time.  A trailing
 * partial block is handled with ciphertext stealing.  The updated tweak is
 * written back through iv except on the ciphertext stealing path, which
 * reuses IVP as a scratch pointer and returns via .Lxts_ret.
 *
 * LEN is deliberately allowed to go negative, so its loop tests are
 * signed.
 */
.macro	_aesni_xts_crypt	enc
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
#else
	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
#endif
	mov 480(KEYP), KLEN		# key length, as stored by aesni_set_key
	movups (IVP), IV		# IV = current tweak

.if !\enc
	add $240, KEYP			# switch to the decryption round keys

	/*
	 * Decryption with a partial tail: hold back the last full block as
	 * well, because it has to be processed out of tweak order.
	 */
	test $15, LEN
	jz .Lxts_loop4\@
	sub $16, LEN
.endif

.Lxts_loop4\@:
	sub $64, LEN
	jl .Lxts_1x\@			# fewer than four blocks left

	/*
	 * STATEn = input block n ^ tweak n.  Each tweak is parked in the
	 * output slot of its block; the input block is loaded before that
	 * slot is written.
	 */
	movdqa IV, STATE1
	movdqu (INP), IN
	pxor IN, STATE1
	movdqu IV, (OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE2
	movdqu 0x10(INP), IN
	pxor IN, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE3
	movdqu 0x20(INP), IN
	pxor IN, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE4
	movdqu 0x30(INP), IN
	pxor IN, STATE4
	movdqu IV, 0x30(OUTP)

.if \enc
	call _aesni_enc4
.else
	call _aesni_dec4
.endif

	/* fetch each parked tweak, apply it and store the finished block */
	movdqu (OUTP), IN
	pxor IN, STATE1
	movdqu STATE1, (OUTP)

	movdqu 0x10(OUTP), IN
	pxor IN, STATE2
	movdqu STATE2, 0x10(OUTP)

	movdqu 0x20(OUTP), IN
	pxor IN, STATE3
	movdqu STATE3, 0x20(OUTP)

	movdqu 0x30(OUTP), IN
	pxor IN, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble		# tweak for the block after this group

	add $64, INP
	add $64, OUTP
	test LEN, LEN
	jnz .Lxts_loop4\@

.Lxts_ret_iv\@:
	movups IV, (IVP)		# hand the next tweak back

.Lxts_ret\@:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET

.Lxts_1x\@:
	add $64, LEN			# undo the speculative subtraction
	jz .Lxts_ret_iv\@
.if \enc
	sub $16, LEN
	jl .Lxts_cts4\@			# only a partial block follows the 4x loop
.endif

.Lxts_loop1\@:
	movdqu (INP), STATE
.if \enc
	pxor IV, STATE
	call _aesni_enc1
.else
	add $16, INP
	sub $16, LEN
	jl .Lxts_cts1\@			# this is the held-back last full block
	pxor IV, STATE
	call _aesni_dec1
.endif
	pxor IV, STATE
	_aesni_gf128mul_x_ble

	test LEN, LEN
	jz .Lxts_out\@

.if \enc
	add $16, INP
	sub $16, LEN
	jl .Lxts_cts1\@			# partial tail: STATE is not stored yet
.endif

	movdqu STATE, (OUTP)
	add $16, OUTP
	jmp .Lxts_loop1\@

.Lxts_out\@:
	movdqu STATE, (OUTP)
	jmp .Lxts_ret_iv\@

.if \enc
.Lxts_cts4\@:
	movdqa STATE4, STATE		# last block produced by the 4x loop
	sub $16, OUTP			# point back at its output slot
.Lxts_cts1\@:
.else
.Lxts_cts1\@:
	/* last full block uses the next tweak; STATE4 keeps the current one */
	movdqa IV, STATE4
	_aesni_gf128mul_x_ble

	pxor IV, STATE
	call _aesni_dec1
	pxor IV, STATE
.endif
	/*
	 * Ciphertext stealing.  Here STATE is the processed last full block,
	 * OUTP points at its output slot and LEN = (tail bytes) - 16.
	 */
#ifndef __x86_64__
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	add LEN, INP		/* rewind input pointer */
	add $16, LEN		/* # bytes in final block */
	movups (INP), IN1		# last 16 bytes of the input

	lea 32(T1), IVP			# IVP is scratch from here on
	add LEN, T1			# T1  = table + tail bytes
	sub LEN, IVP			# IVP = table + 32 - tail bytes
	add OUTP, LEN			# LEN = address of the partial output block

	movups (T1), %xmm4
	movaps STATE, IN2		# IN2 aliases GF128MUL_MASK, which is dead by now
	pshufb %xmm4, STATE
	movups STATE, (LEN)		# emit the partial output block

	movups (IVP), %xmm0		# doubles as pshufb control and pblendvb selector
	pshufb %xmm0, IN1		# tail bytes moved to the bottom of the block
	pblendvb IN2, IN1		# remaining bytes taken from the processed block
	movaps IN1, STATE

.if \enc
	pxor IV, STATE
	call _aesni_enc1
	pxor IV, STATE
.else
	pxor STATE4, STATE		# tweak saved before the extra doubling
	call _aesni_dec1
	pxor STATE4, STATE
.endif

	movups STATE, (OUTP)
	jmp .Lxts_ret\@
.endm
1323
/*
 * void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *dst,
 *		      const u8 *src, unsigned int len, le128 *iv)
 *
 * XTS encryption; the whole body, including frame setup and return, is
 * the enc = 1 expansion of _aesni_xts_crypt.
 */
SYM_FUNC_START(aesni_xts_enc)
	_aesni_xts_crypt	enc=1
SYM_FUNC_END(aesni_xts_enc)
1331
/*
 * void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *dst,
 *		      const u8 *src, unsigned int len, le128 *iv)
 *
 * XTS decryption; the whole body, including frame setup and return, is
 * the enc = 0 expansion of _aesni_xts_crypt.
 */
SYM_FUNC_START(aesni_xts_dec)
	_aesni_xts_crypt	enc=0
SYM_FUNC_END(aesni_xts_dec)
1339