1/*
2 * Implement AES algorithm in Intel AES-NI instructions.
3 *
4 * The white paper of AES-NI instructions can be downloaded from:
5 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
6 *
7 * Copyright (C) 2008, Intel Corp.
8 *    Author: Huang Ying <ying.huang@intel.com>
9 *            Vinodh Gopal <vinodh.gopal@intel.com>
10 *            Kahraman Akdemir
11 *
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
16 *             Adrian Hoban <adrian.hoban@intel.com>
17 *             James Guilford (james.guilford@intel.com)
18 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
19 *             Tadeusz Struk (tadeusz.struk@intel.com)
20 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
21 *    Copyright (c) 2010, Intel Corporation.
22 *
23 * Ported x86_64 version to x86:
24 *    Author: Mathias Krause <minipli@googlemail.com>
25 *
26 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
30 */
31
32#include <linux/linkage.h>
33#include <asm/inst.h>
34#include <asm/frame.h>
35#include <asm/nospec-branch.h>
36
37/*
38 * The following macros are used to move an (un)aligned 16 byte value to/from
39 * an XMM register.  This can be done for either FP or integer values; for FP
40 * use movaps (move aligned packed single) and for integer use movdqa (move
41 * double quadword aligned).  There has been no performance difference between
42 * the two since Nehalem (the original Core i7).  However, movaps is one byte
43 * shorter, so that is the one we use for now (likewise for the unaligned forms).
44 */
45#define MOVADQ	movaps
46#define MOVUDQ	movups
47
48#ifdef __x86_64__
49
50# constants in mergeable sections, linker can reorder and merge
51.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
52.align 16
53.Lgf128mul_x_ble_mask:
54	.octa 0x00000000000000010000000000000087
55.section	.rodata.cst16.POLY, "aM", @progbits, 16
56.align 16
57POLY:   .octa 0xC2000000000000000000000000000001
58.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
59.align 16
60TWOONE: .octa 0x00000001000000000000000000000001
61
62.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
63.align 16
64SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
65.section	.rodata.cst16.MASK1, "aM", @progbits, 16
66.align 16
67MASK1:      .octa 0x0000000000000000ffffffffffffffff
68.section	.rodata.cst16.MASK2, "aM", @progbits, 16
69.align 16
70MASK2:      .octa 0xffffffffffffffff0000000000000000
71.section	.rodata.cst16.ONE, "aM", @progbits, 16
72.align 16
73ONE:        .octa 0x00000000000000000000000000000001
74.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
75.align 16
76F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
77.section	.rodata.cst16.dec, "aM", @progbits, 16
78.align 16
79dec:        .octa 0x1
80.section	.rodata.cst16.enc, "aM", @progbits, 16
81.align 16
82enc:        .octa 0x2
83
84# order of these constants should not change.
85# more specifically, ALL_F should follow SHIFT_MASK,
86# and zero should follow ALL_F
87.section	.rodata, "a", @progbits
88.align 16
89SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
90ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
91            .octa 0x00000000000000000000000000000000
92
93.section .rodata
94.align 16
95.type aad_shift_arr, @object
96.size aad_shift_arr, 272
97aad_shift_arr:
98        .octa     0xffffffffffffffffffffffffffffffff
99        .octa     0xffffffffffffffffffffffffffffff0C
100        .octa     0xffffffffffffffffffffffffffff0D0C
101        .octa     0xffffffffffffffffffffffffff0E0D0C
102        .octa     0xffffffffffffffffffffffff0F0E0D0C
103        .octa     0xffffffffffffffffffffff0C0B0A0908
104        .octa     0xffffffffffffffffffff0D0C0B0A0908
105        .octa     0xffffffffffffffffff0E0D0C0B0A0908
106        .octa     0xffffffffffffffff0F0E0D0C0B0A0908
107        .octa     0xffffffffffffff0C0B0A090807060504
108        .octa     0xffffffffffff0D0C0B0A090807060504
109        .octa     0xffffffffff0E0D0C0B0A090807060504
110        .octa     0xffffffff0F0E0D0C0B0A090807060504
111        .octa     0xffffff0C0B0A09080706050403020100
112        .octa     0xffff0D0C0B0A09080706050403020100
113        .octa     0xff0E0D0C0B0A09080706050403020100
114        .octa     0x0F0E0D0C0B0A09080706050403020100
115
116
117.text
118
119
120#define	STACK_OFFSET    8*3
121#define	HashKey		16*0	// store HashKey <<1 mod poly here
122#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
123#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
124#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
125#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
126				// bits of  HashKey <<1 mod poly here
127				//(for Karatsuba purposes)
128#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
129				// bits of  HashKey^2 <<1 mod poly here
130				// (for Karatsuba purposes)
131#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
132				// bits of  HashKey^3 <<1 mod poly here
133				// (for Karatsuba purposes)
134#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
135				// bits of  HashKey^4 <<1 mod poly here
136				// (for Karatsuba purposes)
137#define	VARIABLE_OFFSET	16*8
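
/*
 * For reference, the %rsp scratch area laid out by the offsets above can be
 * pictured as the following C struct (illustrative only; the code never
 * declares such a type):
 *
 *	struct gcm_hash_scratch {
 *		u8 hashkey[16];		// HashKey<<1 mod poly
 *		u8 hashkey_2[16];	// HashKey^2<<1 mod poly
 *		u8 hashkey_3[16];	// HashKey^3<<1 mod poly
 *		u8 hashkey_4[16];	// HashKey^4<<1 mod poly
 *		u8 hashkey_k[16];	// high64 ^ low64 of HashKey (Karatsuba)
 *		u8 hashkey_2_k[16];	// same for HashKey^2
 *		u8 hashkey_3_k[16];	// same for HashKey^3
 *		u8 hashkey_4_k[16];	// same for HashKey^4
 *	};				// sizeof == VARIABLE_OFFSET == 128
 */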
138
139#define arg1 rdi
140#define arg2 rsi
141#define arg3 rdx
142#define arg4 rcx
143#define arg5 r8
144#define arg6 r9
145#define arg7 STACK_OFFSET+8(%r14)
146#define arg8 STACK_OFFSET+16(%r14)
147#define arg9 STACK_OFFSET+24(%r14)
148#define arg10 STACK_OFFSET+32(%r14)
149#define keysize 2*15*16(%arg1)
150#endif
151
152
153#define STATE1	%xmm0
154#define STATE2	%xmm4
155#define STATE3	%xmm5
156#define STATE4	%xmm6
157#define STATE	STATE1
158#define IN1	%xmm1
159#define IN2	%xmm7
160#define IN3	%xmm8
161#define IN4	%xmm9
162#define IN	IN1
163#define KEY	%xmm2
164#define IV	%xmm3
165
166#define BSWAP_MASK %xmm10
167#define CTR	%xmm11
168#define INC	%xmm12
169
170#define GF128MUL_MASK %xmm10
171
172#ifdef __x86_64__
173#define AREG	%rax
174#define KEYP	%rdi
175#define OUTP	%rsi
176#define UKEYP	OUTP
177#define INP	%rdx
178#define LEN	%rcx
179#define IVP	%r8
180#define KLEN	%r9d
181#define T1	%r10
182#define TKEYP	T1
183#define T2	%r11
184#define TCTR_LOW T2
185#else
186#define AREG	%eax
187#define KEYP	%edi
188#define OUTP	AREG
189#define UKEYP	OUTP
190#define INP	%edx
191#define LEN	%esi
192#define IVP	%ebp
193#define KLEN	%ebx
194#define T1	%ecx
195#define TKEYP	T1
196#endif
197
198
199#ifdef __x86_64__
200/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
201*
202*
203* Input: A and B (128-bits each, bit-reflected)
204* Output: C = A*B*x mod poly, (i.e. >>1 )
205* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
206* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
207*
208*/
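/*
 * For reference only (not used by this file): the product the macro computes
 * is multiplication in GF(2^128) as defined for GHASH.  A plain C sketch of
 * that multiplication, written with kernel-style u64 on non-reflected data
 * and with the GCM polynomial x^128 + x^7 + x^2 + x + 1 (the polynomial above
 * is its bit-reflected form), is:
 *
 *	struct be128 { u64 hi, lo; };	// 128-bit value, leftmost bits in 'hi'
 *
 *	static struct be128 gf128_mul(struct be128 x, struct be128 y)
 *	{
 *		struct be128 z = { 0, 0 }, v = y;
 *		int i;
 *
 *		for (i = 0; i < 128; i++) {
 *			// bit i of x, most-significant (leftmost) bit first
 *			u64 xi = (i < 64) ? (x.hi >> (63 - i)) & 1
 *					  : (x.lo >> (127 - i)) & 1;
 *			if (xi) {
 *				z.hi ^= v.hi;
 *				z.lo ^= v.lo;
 *			}
 *			// v = v * x: shift right one bit in GCM bit order,
 *			// folding in R = 0xE1 || 0^120 on carry-out
 *			u64 lsb = v.lo & 1;
 *			v.lo = (v.lo >> 1) | (v.hi << 63);
 *			v.hi >>= 1;
 *			if (lsb)
 *				v.hi ^= 0xE100000000000000ULL;
 *		}
 *		return z;
 *	}
 *
 * The macro below gets the same result far faster by splitting each operand
 * into two 64-bit halves, doing three PCLMULQDQ carry-less multiplies
 * (Karatsuba), and then reducing the 256-bit product with shifts and XORs.
 */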
209.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
210	movdqa	  \GH, \TMP1
211	pshufd	  $78, \GH, \TMP2
212	pshufd	  $78, \HK, \TMP3
213	pxor	  \GH, \TMP2            # TMP2 = a1+a0
214	pxor	  \HK, \TMP3            # TMP3 = b1+b0
215	PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
216	PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
217	PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
218	pxor	  \GH, \TMP2
219	pxor	  \TMP1, \TMP2          # TMP2 = a1*b0 + a0*b1 (middle terms)
220	movdqa	  \TMP2, \TMP3
221	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
222	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
223	pxor	  \TMP3, \GH
224	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
225
226        # first phase of the reduction
227
228	movdqa    \GH, \TMP2
229	movdqa    \GH, \TMP3
230	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
231					# in order to perform
232					# independent shifts
233	pslld     $31, \TMP2            # packed left shift by 31
234	pslld     $30, \TMP3            # packed left shift by 30
235	pslld     $25, \TMP4            # packed left shift by 25
236	pxor      \TMP3, \TMP2          # xor the shifted versions
237	pxor      \TMP4, \TMP2
238	movdqa    \TMP2, \TMP5
239	psrldq    $4, \TMP5             # right shift TMP5 1 DW
240	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
241	pxor      \TMP2, \GH
242
243        # second phase of the reduction
244
245	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
246					# in order to perform
247					# independent shifts
248	movdqa    \GH,\TMP3
249	movdqa    \GH,\TMP4
250	psrld     $1,\TMP2              # packed right shift by 1
251	psrld     $2,\TMP3              # packed right shift by 2
252	psrld     $7,\TMP4              # packed right shift by 7
253	pxor      \TMP3,\TMP2		# xor the shifted versions
254	pxor      \TMP4,\TMP2
255	pxor      \TMP5, \TMP2
256	pxor      \TMP2, \GH
257	pxor      \TMP1, \GH            # result is in GH
258.endm
259
260/*
261* if a = number of total plaintext bytes
262* b = floor(a/16)
263* num_initial_blocks = b mod 4
264* encrypt the initial num_initial_blocks blocks and apply ghash on
265* the ciphertext
266* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
267* are clobbered
268* %arg1, %arg2, %arg3 and %r14 are used as pointers only, not modified
269*/
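/*
 * Worked example (illustrative): for a 100-byte payload,
 *
 *	full_blocks        = 100 / 16 = 6;
 *	num_initial_blocks = full_blocks % 4 = 2;
 *
 * so two blocks are handled by this macro, the remaining four full blocks by
 * GHASH_4_ENCRYPT_4_PARALLEL_DEC, and the last 4 bytes as the partial tail.
 */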
270
271
272.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
273XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
274        MOVADQ     SHUF_MASK(%rip), %xmm14
275	mov	   arg7, %r10           # %r10 = AAD
276	mov	   arg8, %r12           # %r12 = aadLen
277	mov	   %r12, %r11
278	pxor	   %xmm\i, %xmm\i
279	pxor       \XMM2, \XMM2
280
281	cmp	   $16, %r11
282	jl	   _get_AAD_rest8\num_initial_blocks\operation
283_get_AAD_blocks\num_initial_blocks\operation:
284	movdqu	   (%r10), %xmm\i
285	PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
286	pxor	   %xmm\i, \XMM2
287	GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
288	add	   $16, %r10
289	sub	   $16, %r12
290	sub	   $16, %r11
291	cmp	   $16, %r11
292	jge	   _get_AAD_blocks\num_initial_blocks\operation
293
294	movdqu	   \XMM2, %xmm\i
295	cmp	   $0, %r11
296	je	   _get_AAD_done\num_initial_blocks\operation
297
298	pxor	   %xmm\i,%xmm\i
299
300	/* read the last <16B of AAD. since we have at least 4B of
301	data right after the AAD (the ICV, and maybe some CT), we can
302	read 4B/8B blocks safely, and then get rid of the extra stuff */
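	/* Worked example (illustrative): for aadLen = 12, the 8-byte and
	4-byte reads below leave the 12 AAD bytes in the upper lanes of
	%xmm\i with 4 zero bytes beneath them; the aad_shift_arr entry for
	length 12 is the pshufb mask that moves those 12 bytes down to the
	bottom of the register and zeroes the remainder. */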
303_get_AAD_rest8\num_initial_blocks\operation:
304	cmp	   $4, %r11
305	jle	   _get_AAD_rest4\num_initial_blocks\operation
306	movq	   (%r10), \TMP1
307	add	   $8, %r10
308	sub	   $8, %r11
309	pslldq	   $8, \TMP1
310	psrldq	   $8, %xmm\i
311	pxor	   \TMP1, %xmm\i
312	jmp	   _get_AAD_rest8\num_initial_blocks\operation
313_get_AAD_rest4\num_initial_blocks\operation:
314	cmp	   $0, %r11
315	jle	   _get_AAD_rest0\num_initial_blocks\operation
316	mov	   (%r10), %eax
317	movq	   %rax, \TMP1
318	add	   $4, %r10
319	sub	   $4, %r10
320	pslldq	   $12, \TMP1
321	psrldq	   $4, %xmm\i
322	pxor	   \TMP1, %xmm\i
323_get_AAD_rest0\num_initial_blocks\operation:
324	/* finalize: shift out the extra bytes we read, and align
325	left. Since pslldq can only shift by an immediate, we use
326	pshufb and an array of shuffle masks */
327	movq	   %r12, %r11
328	salq	   $4, %r11
329	movdqu	   aad_shift_arr(%r11), \TMP1
330	PSHUFB_XMM \TMP1, %xmm\i
331_get_AAD_rest_final\num_initial_blocks\operation:
332	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
333	pxor	   \XMM2, %xmm\i
334	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
335
336_get_AAD_done\num_initial_blocks\operation:
337	xor	   %r11, %r11 # initialise the data pointer offset as zero
338	# start AES for num_initial_blocks blocks
339
340	mov	   %arg5, %rax                      # %rax = *Y0
341	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
342	PSHUFB_XMM   %xmm14, \XMM0
343
344.if (\i == 5) || (\i == 6) || (\i == 7)
345	MOVADQ		ONE(%RIP),\TMP1
346	MOVADQ		(%arg1),\TMP2
347.irpc index, \i_seq
348	paddd	   \TMP1, \XMM0                 # INCR Y0
349	movdqa	   \XMM0, %xmm\index
350	PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
351	pxor	   \TMP2, %xmm\index
352.endr
353	lea	0x10(%arg1),%r10
354	mov	keysize,%eax
355	shr	$2,%eax				# 128->4, 192->6, 256->8
356	add	$5,%eax			      # 128->9, 192->11, 256->13
357
358aes_loop_initial_dec\num_initial_blocks:
359	MOVADQ	(%r10),\TMP1
360.irpc	index, \i_seq
361	AESENC	\TMP1, %xmm\index
362.endr
363	add	$16,%r10
364	sub	$1,%eax
365	jnz	aes_loop_initial_dec\num_initial_blocks
366
367	MOVADQ	(%r10), \TMP1
368.irpc index, \i_seq
369	AESENCLAST \TMP1, %xmm\index         # Last Round
370.endr
371.irpc index, \i_seq
372	movdqu	   (%arg3 , %r11, 1), \TMP1
373	pxor	   \TMP1, %xmm\index
374	movdqu	   %xmm\index, (%arg2 , %r11, 1)
375	# write back plaintext/ciphertext for num_initial_blocks
376	add	   $16, %r11
377
378	movdqa     \TMP1, %xmm\index
379	PSHUFB_XMM	   %xmm14, %xmm\index
380                # prepare plaintext/ciphertext for GHASH computation
381.endr
382.endif
383
384        # apply GHASH on num_initial_blocks blocks
385
386.if \i == 5
387        pxor       %xmm5, %xmm6
388	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
389        pxor       %xmm6, %xmm7
390	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
391        pxor       %xmm7, %xmm8
392	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
393.elseif \i == 6
394        pxor       %xmm6, %xmm7
395	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
396        pxor       %xmm7, %xmm8
397	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
398.elseif \i == 7
399        pxor       %xmm7, %xmm8
400	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
401.endif
402	cmp	   $64, %r13
403	jl	_initial_blocks_done\num_initial_blocks\operation
404	# no need for precomputed values
405/*
406*
407* Precomputations for HashKey parallel with encryption of first 4 blocks.
408* HashKey_i_k holds the XOR of the low and high 64 bits of HashKey_i
409*/
410	MOVADQ	   ONE(%rip), \TMP1
411	paddd	   \TMP1, \XMM0              # INCR Y0
412	MOVADQ	   \XMM0, \XMM1
413	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
414
415	paddd	   \TMP1, \XMM0              # INCR Y0
416	MOVADQ	   \XMM0, \XMM2
417	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
418
419	paddd	   \TMP1, \XMM0              # INCR Y0
420	MOVADQ	   \XMM0, \XMM3
421	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
422
423	paddd	   \TMP1, \XMM0              # INCR Y0
424	MOVADQ	   \XMM0, \XMM4
425	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
426
427	MOVADQ	   0(%arg1),\TMP1
428	pxor	   \TMP1, \XMM1
429	pxor	   \TMP1, \XMM2
430	pxor	   \TMP1, \XMM3
431	pxor	   \TMP1, \XMM4
432	movdqa	   \TMP3, \TMP5
433	pshufd	   $78, \TMP3, \TMP1
434	pxor	   \TMP3, \TMP1
435	movdqa	   \TMP1, HashKey_k(%rsp)
436	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
437# TMP5 = HashKey^2<<1 (mod poly)
438	movdqa	   \TMP5, HashKey_2(%rsp)
439# HashKey_2 = HashKey^2<<1 (mod poly)
440	pshufd	   $78, \TMP5, \TMP1
441	pxor	   \TMP5, \TMP1
442	movdqa	   \TMP1, HashKey_2_k(%rsp)
443.irpc index, 1234 # do 4 rounds
444	movaps 0x10*\index(%arg1), \TMP1
445	AESENC	   \TMP1, \XMM1
446	AESENC	   \TMP1, \XMM2
447	AESENC	   \TMP1, \XMM3
448	AESENC	   \TMP1, \XMM4
449.endr
450	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
451# TMP5 = HashKey^3<<1 (mod poly)
452	movdqa	   \TMP5, HashKey_3(%rsp)
453	pshufd	   $78, \TMP5, \TMP1
454	pxor	   \TMP5, \TMP1
455	movdqa	   \TMP1, HashKey_3_k(%rsp)
456.irpc index, 56789 # do next 5 rounds
457	movaps 0x10*\index(%arg1), \TMP1
458	AESENC	   \TMP1, \XMM1
459	AESENC	   \TMP1, \XMM2
460	AESENC	   \TMP1, \XMM3
461	AESENC	   \TMP1, \XMM4
462.endr
463	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
464# TMP5 = HashKey^4<<1 (mod poly)
465	movdqa	   \TMP5, HashKey_4(%rsp)
466	pshufd	   $78, \TMP5, \TMP1
467	pxor	   \TMP5, \TMP1
468	movdqa	   \TMP1, HashKey_4_k(%rsp)
469	lea	   0xa0(%arg1),%r10
470	mov	   keysize,%eax
471	shr	   $2,%eax			# 128->4, 192->6, 256->8
472	sub	   $4,%eax			# 128->0, 192->2, 256->4
473	jz	   aes_loop_pre_dec_done\num_initial_blocks
474
475aes_loop_pre_dec\num_initial_blocks:
476	MOVADQ	   (%r10),\TMP2
477.irpc	index, 1234
478	AESENC	   \TMP2, %xmm\index
479.endr
480	add	   $16,%r10
481	sub	   $1,%eax
482	jnz	   aes_loop_pre_dec\num_initial_blocks
483
484aes_loop_pre_dec_done\num_initial_blocks:
485	MOVADQ	   (%r10), \TMP2
486	AESENCLAST \TMP2, \XMM1
487	AESENCLAST \TMP2, \XMM2
488	AESENCLAST \TMP2, \XMM3
489	AESENCLAST \TMP2, \XMM4
490	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
491	pxor	   \TMP1, \XMM1
492	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
493	movdqa     \TMP1, \XMM1
494	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
495	pxor	   \TMP1, \XMM2
496	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
497	movdqa     \TMP1, \XMM2
498	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
499	pxor	   \TMP1, \XMM3
500	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
501	movdqa     \TMP1, \XMM3
502	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
503	pxor	   \TMP1, \XMM4
504	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
505	movdqa     \TMP1, \XMM4
506	add	   $64, %r11
507	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
508	pxor	   \XMMDst, \XMM1
509# combine GHASHed value with the corresponding ciphertext
510	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
511	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
512	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
513
514_initial_blocks_done\num_initial_blocks\operation:
515
516.endm
517
518
519/*
520* if a = number of total plaintext bytes
521* b = floor(a/16)
522* num_initial_blocks = b mod 4
523* encrypt the initial num_initial_blocks blocks and apply ghash on
524* the ciphertext
525* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
526* are clobbered
527* %arg1, %arg2, %arg3 and %r14 are used as pointers only, not modified
528*/
529
530
531.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
532XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
533        MOVADQ     SHUF_MASK(%rip), %xmm14
534	mov	   arg7, %r10           # %r10 = AAD
535	mov	   arg8, %r12           # %r12 = aadLen
536	mov	   %r12, %r11
537	pxor	   %xmm\i, %xmm\i
538	pxor	   \XMM2, \XMM2
539
540	cmp	   $16, %r11
541	jl	   _get_AAD_rest8\num_initial_blocks\operation
542_get_AAD_blocks\num_initial_blocks\operation:
543	movdqu	   (%r10), %xmm\i
544	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
545	pxor	   %xmm\i, \XMM2
546	GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
547	add	   $16, %r10
548	sub	   $16, %r12
549	sub	   $16, %r11
550	cmp	   $16, %r11
551	jge	   _get_AAD_blocks\num_initial_blocks\operation
552
553	movdqu	   \XMM2, %xmm\i
554	cmp	   $0, %r11
555	je	   _get_AAD_done\num_initial_blocks\operation
556
557	pxor	   %xmm\i,%xmm\i
558
559	/* read the last <16B of AAD. since we have at least 4B of
560	data right after the AAD (the ICV, and maybe some PT), we can
561	read 4B/8B blocks safely, and then get rid of the extra stuff */
562_get_AAD_rest8\num_initial_blocks\operation:
563	cmp	   $4, %r11
564	jle	   _get_AAD_rest4\num_initial_blocks\operation
565	movq	   (%r10), \TMP1
566	add	   $8, %r10
567	sub	   $8, %r11
568	pslldq	   $8, \TMP1
569	psrldq	   $8, %xmm\i
570	pxor	   \TMP1, %xmm\i
571	jmp	   _get_AAD_rest8\num_initial_blocks\operation
572_get_AAD_rest4\num_initial_blocks\operation:
573	cmp	   $0, %r11
574	jle	   _get_AAD_rest0\num_initial_blocks\operation
575	mov	   (%r10), %eax
576	movq	   %rax, \TMP1
577	add	   $4, %r10
578	sub	   $4, %r10
579	pslldq	   $12, \TMP1
580	psrldq	   $4, %xmm\i
581	pxor	   \TMP1, %xmm\i
582_get_AAD_rest0\num_initial_blocks\operation:
583	/* finalize: shift out the extra bytes we read, and align
584	left. Since pslldq can only shift by an immediate, we use
585	pshufb and an array of shuffle masks */
586	movq	   %r12, %r11
587	salq	   $4, %r11
588	movdqu	   aad_shift_arr(%r11), \TMP1
589	PSHUFB_XMM \TMP1, %xmm\i
590_get_AAD_rest_final\num_initial_blocks\operation:
591	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
592	pxor	   \XMM2, %xmm\i
593	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
594
595_get_AAD_done\num_initial_blocks\operation:
596	xor	   %r11, %r11 # initialise the data pointer offset as zero
597	# start AES for num_initial_blocks blocks
598
599	mov	   %arg5, %rax                      # %rax = *Y0
600	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
601	PSHUFB_XMM   %xmm14, \XMM0
602
603.if (\i == 5) || (\i == 6) || (\i == 7)
604
605	MOVADQ		ONE(%RIP),\TMP1
606	MOVADQ		0(%arg1),\TMP2
607.irpc index, \i_seq
608	paddd		\TMP1, \XMM0                 # INCR Y0
609	MOVADQ		\XMM0, %xmm\index
610	PSHUFB_XMM	%xmm14, %xmm\index      # perform a 16 byte swap
611	pxor		\TMP2, %xmm\index
612.endr
613	lea	0x10(%arg1),%r10
614	mov	keysize,%eax
615	shr	$2,%eax				# 128->4, 192->6, 256->8
616	add	$5,%eax			      # 128->9, 192->11, 256->13
617
618aes_loop_initial_enc\num_initial_blocks:
619	MOVADQ	(%r10),\TMP1
620.irpc	index, \i_seq
621	AESENC	\TMP1, %xmm\index
622.endr
623	add	$16,%r10
624	sub	$1,%eax
625	jnz	aes_loop_initial_enc\num_initial_blocks
626
627	MOVADQ	(%r10), \TMP1
628.irpc index, \i_seq
629	AESENCLAST \TMP1, %xmm\index         # Last Round
630.endr
631.irpc index, \i_seq
632	movdqu	   (%arg3 , %r11, 1), \TMP1
633	pxor	   \TMP1, %xmm\index
634	movdqu	   %xmm\index, (%arg2 , %r11, 1)
635	# write back plaintext/ciphertext for num_initial_blocks
636	add	   $16, %r11
637	PSHUFB_XMM	   %xmm14, %xmm\index
638
639		# prepare plaintext/ciphertext for GHASH computation
640.endr
641.endif
642
643        # apply GHASH on num_initial_blocks blocks
644
645.if \i == 5
646        pxor       %xmm5, %xmm6
647	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
648        pxor       %xmm6, %xmm7
649	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
650        pxor       %xmm7, %xmm8
651	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
652.elseif \i == 6
653        pxor       %xmm6, %xmm7
654	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
655        pxor       %xmm7, %xmm8
656	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
657.elseif \i == 7
658        pxor       %xmm7, %xmm8
659	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
660.endif
661	cmp	   $64, %r13
662	jl	_initial_blocks_done\num_initial_blocks\operation
663	# no need for precomputed values
664/*
665*
666* Precomputations for HashKey parallel with encryption of first 4 blocks.
667* HashKey_i_k holds the XOR of the low and high 64 bits of HashKey_i
668*/
669	MOVADQ	   ONE(%RIP),\TMP1
670	paddd	   \TMP1, \XMM0              # INCR Y0
671	MOVADQ	   \XMM0, \XMM1
672	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
673
674	paddd	   \TMP1, \XMM0              # INCR Y0
675	MOVADQ	   \XMM0, \XMM2
676	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
677
678	paddd	   \TMP1, \XMM0              # INCR Y0
679	MOVADQ	   \XMM0, \XMM3
680	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
681
682	paddd	   \TMP1, \XMM0              # INCR Y0
683	MOVADQ	   \XMM0, \XMM4
684	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
685
686	MOVADQ	   0(%arg1),\TMP1
687	pxor	   \TMP1, \XMM1
688	pxor	   \TMP1, \XMM2
689	pxor	   \TMP1, \XMM3
690	pxor	   \TMP1, \XMM4
691	movdqa	   \TMP3, \TMP5
692	pshufd	   $78, \TMP3, \TMP1
693	pxor	   \TMP3, \TMP1
694	movdqa	   \TMP1, HashKey_k(%rsp)
695	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
696# TMP5 = HashKey^2<<1 (mod poly)
697	movdqa	   \TMP5, HashKey_2(%rsp)
698# HashKey_2 = HashKey^2<<1 (mod poly)
699	pshufd	   $78, \TMP5, \TMP1
700	pxor	   \TMP5, \TMP1
701	movdqa	   \TMP1, HashKey_2_k(%rsp)
702.irpc index, 1234 # do 4 rounds
703	movaps 0x10*\index(%arg1), \TMP1
704	AESENC	   \TMP1, \XMM1
705	AESENC	   \TMP1, \XMM2
706	AESENC	   \TMP1, \XMM3
707	AESENC	   \TMP1, \XMM4
708.endr
709	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
710# TMP5 = HashKey^3<<1 (mod poly)
711	movdqa	   \TMP5, HashKey_3(%rsp)
712	pshufd	   $78, \TMP5, \TMP1
713	pxor	   \TMP5, \TMP1
714	movdqa	   \TMP1, HashKey_3_k(%rsp)
715.irpc index, 56789 # do next 5 rounds
716	movaps 0x10*\index(%arg1), \TMP1
717	AESENC	   \TMP1, \XMM1
718	AESENC	   \TMP1, \XMM2
719	AESENC	   \TMP1, \XMM3
720	AESENC	   \TMP1, \XMM4
721.endr
722	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
723# TMP5 = HashKey^4<<1 (mod poly)
724	movdqa	   \TMP5, HashKey_4(%rsp)
725	pshufd	   $78, \TMP5, \TMP1
726	pxor	   \TMP5, \TMP1
727	movdqa	   \TMP1, HashKey_4_k(%rsp)
728	lea	   0xa0(%arg1),%r10
729	mov	   keysize,%eax
730	shr	   $2,%eax			# 128->4, 192->6, 256->8
731	sub	   $4,%eax			# 128->0, 192->2, 256->4
732	jz	   aes_loop_pre_enc_done\num_initial_blocks
733
734aes_loop_pre_enc\num_initial_blocks:
735	MOVADQ	   (%r10),\TMP2
736.irpc	index, 1234
737	AESENC	   \TMP2, %xmm\index
738.endr
739	add	   $16,%r10
740	sub	   $1,%eax
741	jnz	   aes_loop_pre_enc\num_initial_blocks
742
743aes_loop_pre_enc_done\num_initial_blocks:
744	MOVADQ	   (%r10), \TMP2
745	AESENCLAST \TMP2, \XMM1
746	AESENCLAST \TMP2, \XMM2
747	AESENCLAST \TMP2, \XMM3
748	AESENCLAST \TMP2, \XMM4
749	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
750	pxor	   \TMP1, \XMM1
751	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
752	pxor	   \TMP1, \XMM2
753	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
754	pxor	   \TMP1, \XMM3
755	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
756	pxor	   \TMP1, \XMM4
757	movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
758	movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
759	movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
760	movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
761
762	add	   $64, %r11
763	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
764	pxor	   \XMMDst, \XMM1
765# combine GHASHed value with the corresponding ciphertext
766	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
767	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
768	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
769
770_initial_blocks_done\num_initial_blocks\operation:
771
772.endm
773
774/*
775* encrypt 4 blocks at a time
776* ghash the 4 previously encrypted ciphertext blocks
777* arg1, %arg2, %arg3 are used as pointers only, not modified
778* %r11 is the data offset value
779*/
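/*
 * Shape of the software pipelining, as a hedged C sketch; aes_encrypt_block(),
 * ghash_block() and xor_block() are hypothetical helpers, not code in this
 * file:
 *
 *	while (remaining >= 64) {
 *		u8 ks[4][16];
 *		int i;
 *
 *		// counter encryption for this iteration is interleaved with
 *		// the GHASH of the previous iteration's four ciphertext
 *		// blocks, so the AESENC and PCLMULQDQ units overlap
 *		for (i = 0; i < 4; i++)
 *			aes_encrypt_block(key, ++ctr, ks[i]);
 *		for (i = 0; i < 4; i++)
 *			ghash_block(&hash, prev_ct + 16 * i);
 *		for (i = 0; i < 4; i++)
 *			xor_block(out + 16 * i, in + 16 * i, ks[i]);
 *		memcpy(prev_ct, out, 64);
 *		in += 64; out += 64; remaining -= 64;
 *	}
 */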
780.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
781TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
782
783	movdqa	  \XMM1, \XMM5
784	movdqa	  \XMM2, \XMM6
785	movdqa	  \XMM3, \XMM7
786	movdqa	  \XMM4, \XMM8
787
788        movdqa    SHUF_MASK(%rip), %xmm15
789        # multiply TMP5 * HashKey using karatsuba
790
791	movdqa	  \XMM5, \TMP4
792	pshufd	  $78, \XMM5, \TMP6
793	pxor	  \XMM5, \TMP6
794	paddd     ONE(%rip), \XMM0		# INCR CNT
795	movdqa	  HashKey_4(%rsp), \TMP5
796	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
797	movdqa    \XMM0, \XMM1
798	paddd     ONE(%rip), \XMM0		# INCR CNT
799	movdqa    \XMM0, \XMM2
800	paddd     ONE(%rip), \XMM0		# INCR CNT
801	movdqa    \XMM0, \XMM3
802	paddd     ONE(%rip), \XMM0		# INCR CNT
803	movdqa    \XMM0, \XMM4
804	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
805	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
806	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
807	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
808	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
809
810	pxor	  (%arg1), \XMM1
811	pxor	  (%arg1), \XMM2
812	pxor	  (%arg1), \XMM3
813	pxor	  (%arg1), \XMM4
814	movdqa	  HashKey_4_k(%rsp), \TMP5
815	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
816	movaps 0x10(%arg1), \TMP1
817	AESENC	  \TMP1, \XMM1              # Round 1
818	AESENC	  \TMP1, \XMM2
819	AESENC	  \TMP1, \XMM3
820	AESENC	  \TMP1, \XMM4
821	movaps 0x20(%arg1), \TMP1
822	AESENC	  \TMP1, \XMM1              # Round 2
823	AESENC	  \TMP1, \XMM2
824	AESENC	  \TMP1, \XMM3
825	AESENC	  \TMP1, \XMM4
826	movdqa	  \XMM6, \TMP1
827	pshufd	  $78, \XMM6, \TMP2
828	pxor	  \XMM6, \TMP2
829	movdqa	  HashKey_3(%rsp), \TMP5
830	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
831	movaps 0x30(%arg1), \TMP3
832	AESENC    \TMP3, \XMM1              # Round 3
833	AESENC    \TMP3, \XMM2
834	AESENC    \TMP3, \XMM3
835	AESENC    \TMP3, \XMM4
836	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
837	movaps 0x40(%arg1), \TMP3
838	AESENC	  \TMP3, \XMM1              # Round 4
839	AESENC	  \TMP3, \XMM2
840	AESENC	  \TMP3, \XMM3
841	AESENC	  \TMP3, \XMM4
842	movdqa	  HashKey_3_k(%rsp), \TMP5
843	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
844	movaps 0x50(%arg1), \TMP3
845	AESENC	  \TMP3, \XMM1              # Round 5
846	AESENC	  \TMP3, \XMM2
847	AESENC	  \TMP3, \XMM3
848	AESENC	  \TMP3, \XMM4
849	pxor	  \TMP1, \TMP4
850# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
851	pxor	  \XMM6, \XMM5
852	pxor	  \TMP2, \TMP6
853	movdqa	  \XMM7, \TMP1
854	pshufd	  $78, \XMM7, \TMP2
855	pxor	  \XMM7, \TMP2
856	movdqa	  HashKey_2(%rsp ), \TMP5
857
858        # Multiply TMP5 * HashKey using karatsuba
859
860	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
861	movaps 0x60(%arg1), \TMP3
862	AESENC	  \TMP3, \XMM1              # Round 6
863	AESENC	  \TMP3, \XMM2
864	AESENC	  \TMP3, \XMM3
865	AESENC	  \TMP3, \XMM4
866	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
867	movaps 0x70(%arg1), \TMP3
868	AESENC	  \TMP3, \XMM1             # Round 7
869	AESENC	  \TMP3, \XMM2
870	AESENC	  \TMP3, \XMM3
871	AESENC	  \TMP3, \XMM4
872	movdqa	  HashKey_2_k(%rsp), \TMP5
873	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
874	movaps 0x80(%arg1), \TMP3
875	AESENC	  \TMP3, \XMM1             # Round 8
876	AESENC	  \TMP3, \XMM2
877	AESENC	  \TMP3, \XMM3
878	AESENC	  \TMP3, \XMM4
879	pxor	  \TMP1, \TMP4
880# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
881	pxor	  \XMM7, \XMM5
882	pxor	  \TMP2, \TMP6
883
884        # Multiply XMM8 * HashKey
885        # XMM8 and TMP5 hold the values for the two operands
886
887	movdqa	  \XMM8, \TMP1
888	pshufd	  $78, \XMM8, \TMP2
889	pxor	  \XMM8, \TMP2
890	movdqa	  HashKey(%rsp), \TMP5
891	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
892	movaps 0x90(%arg1), \TMP3
893	AESENC	  \TMP3, \XMM1            # Round 9
894	AESENC	  \TMP3, \XMM2
895	AESENC	  \TMP3, \XMM3
896	AESENC	  \TMP3, \XMM4
897	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
898	lea	  0xa0(%arg1),%r10
899	mov	  keysize,%eax
900	shr	  $2,%eax			# 128->4, 192->6, 256->8
901	sub	  $4,%eax			# 128->0, 192->2, 256->4
902	jz	  aes_loop_par_enc_done
903
904aes_loop_par_enc:
905	MOVADQ	  (%r10),\TMP3
906.irpc	index, 1234
907	AESENC	  \TMP3, %xmm\index
908.endr
909	add	  $16,%r10
910	sub	  $1,%eax
911	jnz	  aes_loop_par_enc
912
913aes_loop_par_enc_done:
914	MOVADQ	  (%r10), \TMP3
915	AESENCLAST \TMP3, \XMM1           # Round 10
916	AESENCLAST \TMP3, \XMM2
917	AESENCLAST \TMP3, \XMM3
918	AESENCLAST \TMP3, \XMM4
919	movdqa    HashKey_k(%rsp), \TMP5
920	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
921	movdqu	  (%arg3,%r11,1), \TMP3
922	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
923	movdqu	  16(%arg3,%r11,1), \TMP3
924	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
925	movdqu	  32(%arg3,%r11,1), \TMP3
926	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
927	movdqu	  48(%arg3,%r11,1), \TMP3
928	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
929        movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
930        movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
931        movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
932        movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
933	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
934	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
935	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
936	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
937
938	pxor	  \TMP4, \TMP1
939	pxor	  \XMM8, \XMM5
940	pxor	  \TMP6, \TMP2
941	pxor	  \TMP1, \TMP2
942	pxor	  \XMM5, \TMP2
943	movdqa	  \TMP2, \TMP3
944	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
945	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
946	pxor	  \TMP3, \XMM5
947	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
948
949        # first phase of reduction
950
951	movdqa    \XMM5, \TMP2
952	movdqa    \XMM5, \TMP3
953	movdqa    \XMM5, \TMP4
954# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
955	pslld     $31, \TMP2                   # packed left shift by 31
956	pslld     $30, \TMP3                   # packed left shift by 30
957	pslld     $25, \TMP4                   # packed left shift by 25
958	pxor      \TMP3, \TMP2	               # xor the shifted versions
959	pxor      \TMP4, \TMP2
960	movdqa    \TMP2, \TMP5
961	psrldq    $4, \TMP5                    # right shift T5 1 DW
962	pslldq    $12, \TMP2                   # left shift T2 3 DWs
963	pxor      \TMP2, \XMM5
964
965        # second phase of reduction
966
967	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
968	movdqa    \XMM5,\TMP3
969	movdqa    \XMM5,\TMP4
970	psrld     $1, \TMP2                    # packed right shift by 1
971	psrld     $2, \TMP3                    # packed right shift by 2
972	psrld     $7, \TMP4                    # packed right shift by 7
973	pxor      \TMP3,\TMP2		       # xor the shifted versions
974	pxor      \TMP4,\TMP2
975	pxor      \TMP5, \TMP2
976	pxor      \TMP2, \XMM5
977	pxor      \TMP1, \XMM5                 # result is in XMM5
978
979	pxor	  \XMM5, \XMM1
980.endm
981
982/*
983* decrypt 4 blocks at a time
984* ghash the 4 previously decrypted ciphertext blocks
985* arg1, %arg2, %arg3 are used as pointers only, not modified
986* %r11 is the data offset value
987*/
988.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
989TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
990
991	movdqa	  \XMM1, \XMM5
992	movdqa	  \XMM2, \XMM6
993	movdqa	  \XMM3, \XMM7
994	movdqa	  \XMM4, \XMM8
995
996        movdqa    SHUF_MASK(%rip), %xmm15
997        # multiply TMP5 * HashKey using karatsuba
998
999	movdqa	  \XMM5, \TMP4
1000	pshufd	  $78, \XMM5, \TMP6
1001	pxor	  \XMM5, \TMP6
1002	paddd     ONE(%rip), \XMM0		# INCR CNT
1003	movdqa	  HashKey_4(%rsp), \TMP5
1004	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
1005	movdqa    \XMM0, \XMM1
1006	paddd     ONE(%rip), \XMM0		# INCR CNT
1007	movdqa    \XMM0, \XMM2
1008	paddd     ONE(%rip), \XMM0		# INCR CNT
1009	movdqa    \XMM0, \XMM3
1010	paddd     ONE(%rip), \XMM0		# INCR CNT
1011	movdqa    \XMM0, \XMM4
1012	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
1013	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1014	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
1015	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
1016	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
1017
1018	pxor	  (%arg1), \XMM1
1019	pxor	  (%arg1), \XMM2
1020	pxor	  (%arg1), \XMM3
1021	pxor	  (%arg1), \XMM4
1022	movdqa	  HashKey_4_k(%rsp), \TMP5
1023	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
1024	movaps 0x10(%arg1), \TMP1
1025	AESENC	  \TMP1, \XMM1              # Round 1
1026	AESENC	  \TMP1, \XMM2
1027	AESENC	  \TMP1, \XMM3
1028	AESENC	  \TMP1, \XMM4
1029	movaps 0x20(%arg1), \TMP1
1030	AESENC	  \TMP1, \XMM1              # Round 2
1031	AESENC	  \TMP1, \XMM2
1032	AESENC	  \TMP1, \XMM3
1033	AESENC	  \TMP1, \XMM4
1034	movdqa	  \XMM6, \TMP1
1035	pshufd	  $78, \XMM6, \TMP2
1036	pxor	  \XMM6, \TMP2
1037	movdqa	  HashKey_3(%rsp), \TMP5
1038	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
1039	movaps 0x30(%arg1), \TMP3
1040	AESENC    \TMP3, \XMM1              # Round 3
1041	AESENC    \TMP3, \XMM2
1042	AESENC    \TMP3, \XMM3
1043	AESENC    \TMP3, \XMM4
1044	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
1045	movaps 0x40(%arg1), \TMP3
1046	AESENC	  \TMP3, \XMM1              # Round 4
1047	AESENC	  \TMP3, \XMM2
1048	AESENC	  \TMP3, \XMM3
1049	AESENC	  \TMP3, \XMM4
1050	movdqa	  HashKey_3_k(%rsp), \TMP5
1051	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1052	movaps 0x50(%arg1), \TMP3
1053	AESENC	  \TMP3, \XMM1              # Round 5
1054	AESENC	  \TMP3, \XMM2
1055	AESENC	  \TMP3, \XMM3
1056	AESENC	  \TMP3, \XMM4
1057	pxor	  \TMP1, \TMP4
1058# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1059	pxor	  \XMM6, \XMM5
1060	pxor	  \TMP2, \TMP6
1061	movdqa	  \XMM7, \TMP1
1062	pshufd	  $78, \XMM7, \TMP2
1063	pxor	  \XMM7, \TMP2
1064	movdqa	  HashKey_2(%rsp ), \TMP5
1065
1066        # Multiply TMP5 * HashKey using karatsuba
1067
1068	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
1069	movaps 0x60(%arg1), \TMP3
1070	AESENC	  \TMP3, \XMM1              # Round 6
1071	AESENC	  \TMP3, \XMM2
1072	AESENC	  \TMP3, \XMM3
1073	AESENC	  \TMP3, \XMM4
1074	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
1075	movaps 0x70(%arg1), \TMP3
1076	AESENC	  \TMP3, \XMM1             # Round 7
1077	AESENC	  \TMP3, \XMM2
1078	AESENC	  \TMP3, \XMM3
1079	AESENC	  \TMP3, \XMM4
1080	movdqa	  HashKey_2_k(%rsp), \TMP5
1081	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1082	movaps 0x80(%arg1), \TMP3
1083	AESENC	  \TMP3, \XMM1             # Round 8
1084	AESENC	  \TMP3, \XMM2
1085	AESENC	  \TMP3, \XMM3
1086	AESENC	  \TMP3, \XMM4
1087	pxor	  \TMP1, \TMP4
1088# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1089	pxor	  \XMM7, \XMM5
1090	pxor	  \TMP2, \TMP6
1091
1092        # Multiply XMM8 * HashKey
1093        # XMM8 and TMP5 hold the values for the two operands
1094
1095	movdqa	  \XMM8, \TMP1
1096	pshufd	  $78, \XMM8, \TMP2
1097	pxor	  \XMM8, \TMP2
1098	movdqa	  HashKey(%rsp), \TMP5
1099	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
1100	movaps 0x90(%arg1), \TMP3
1101	AESENC	  \TMP3, \XMM1            # Round 9
1102	AESENC	  \TMP3, \XMM2
1103	AESENC	  \TMP3, \XMM3
1104	AESENC	  \TMP3, \XMM4
1105	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
1106	lea	  0xa0(%arg1),%r10
1107	mov	  keysize,%eax
1108	shr	  $2,%eax		        # 128->4, 192->6, 256->8
1109	sub	  $4,%eax			# 128->0, 192->2, 256->4
1110	jz	  aes_loop_par_dec_done
1111
1112aes_loop_par_dec:
1113	MOVADQ	  (%r10),\TMP3
1114.irpc	index, 1234
1115	AESENC	  \TMP3, %xmm\index
1116.endr
1117	add	  $16,%r10
1118	sub	  $1,%eax
1119	jnz	  aes_loop_par_dec
1120
1121aes_loop_par_dec_done:
1122	MOVADQ	  (%r10), \TMP3
1123	AESENCLAST \TMP3, \XMM1           # last round
1124	AESENCLAST \TMP3, \XMM2
1125	AESENCLAST \TMP3, \XMM3
1126	AESENCLAST \TMP3, \XMM4
1127	movdqa    HashKey_k(%rsp), \TMP5
1128	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1129	movdqu	  (%arg3,%r11,1), \TMP3
1130	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1131	movdqu	  \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
1132	movdqa    \TMP3, \XMM1
1133	movdqu	  16(%arg3,%r11,1), \TMP3
1134	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1135	movdqu	  \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
1136	movdqa    \TMP3, \XMM2
1137	movdqu	  32(%arg3,%r11,1), \TMP3
1138	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1139	movdqu	  \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
1140	movdqa    \TMP3, \XMM3
1141	movdqu	  48(%arg3,%r11,1), \TMP3
1142	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1143	movdqu	  \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
1144	movdqa    \TMP3, \XMM4
1145	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1146	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
1147	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
1148	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
1149
1150	pxor	  \TMP4, \TMP1
1151	pxor	  \XMM8, \XMM5
1152	pxor	  \TMP6, \TMP2
1153	pxor	  \TMP1, \TMP2
1154	pxor	  \XMM5, \TMP2
1155	movdqa	  \TMP2, \TMP3
1156	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1157	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1158	pxor	  \TMP3, \XMM5
1159	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1160
1161        # first phase of reduction
1162
1163	movdqa    \XMM5, \TMP2
1164	movdqa    \XMM5, \TMP3
1165	movdqa    \XMM5, \TMP4
1166# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1167	pslld     $31, \TMP2                   # packed left shift by 31
1168	pslld     $30, \TMP3                   # packed left shift by 30
1169	pslld     $25, \TMP4                   # packed left shift by 25
1170	pxor      \TMP3, \TMP2	               # xor the shifted versions
1171	pxor      \TMP4, \TMP2
1172	movdqa    \TMP2, \TMP5
1173	psrldq    $4, \TMP5                    # right shift T5 1 DW
1174	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1175	pxor      \TMP2, \XMM5
1176
1177        # second phase of reduction
1178
1179	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1180	movdqa    \XMM5,\TMP3
1181	movdqa    \XMM5,\TMP4
1182	psrld     $1, \TMP2                    # packed right shift by 1
1183	psrld     $2, \TMP3                    # packed right shift by 2
1184	psrld     $7, \TMP4                    # packed right shift by 7
1185	pxor      \TMP3,\TMP2		       # xor the shifted versions
1186	pxor      \TMP4,\TMP2
1187	pxor      \TMP5, \TMP2
1188	pxor      \TMP2, \XMM5
1189	pxor      \TMP1, \XMM5                 # result is in XMM5
1190
1191	pxor	  \XMM5, \XMM1
1192.endm
1193
1194/* GHASH the last 4 ciphertext blocks. */
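/*
 * Each per-block product below uses the Karatsuba identity on the 64-bit
 * halves a = a1:a0, b = b1:b0 (all multiplications carry-less):
 *
 *	a * b = a1*b1 * x^128
 *	      + ((a1 ^ a0)*(b1 ^ b0) ^ a1*b1 ^ a0*b0) * x^64
 *	      + a0*b0
 *
 * which is why the XORed halves (HashKey_i_k) are precomputed alongside the
 * HashKey_i values.  The three partial results are accumulated across the
 * four blocks in TMP6 (high), XMM1 (middle) and XMMDst (low) before the
 * final reduction.
 */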
1195.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1196TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1197
1198        # Multiply TMP6 * HashKey (using Karatsuba)
1199
1200	movdqa	  \XMM1, \TMP6
1201	pshufd	  $78, \XMM1, \TMP2
1202	pxor	  \XMM1, \TMP2
1203	movdqa	  HashKey_4(%rsp), \TMP5
1204	PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1205	PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1206	movdqa	  HashKey_4_k(%rsp), \TMP4
1207	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1208	movdqa	  \XMM1, \XMMDst
1209	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1210
1211        # Multiply TMP1 * HashKey (using Karatsuba)
1212
1213	movdqa	  \XMM2, \TMP1
1214	pshufd	  $78, \XMM2, \TMP2
1215	pxor	  \XMM2, \TMP2
1216	movdqa	  HashKey_3(%rsp), \TMP5
1217	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1218	PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1219	movdqa	  HashKey_3_k(%rsp), \TMP4
1220	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1221	pxor	  \TMP1, \TMP6
1222	pxor	  \XMM2, \XMMDst
1223	pxor	  \TMP2, \XMM1
1224# results accumulated in TMP6, XMMDst, XMM1
1225
1226        # Multiply TMP1 * HashKey (using Karatsuba)
1227
1228	movdqa	  \XMM3, \TMP1
1229	pshufd	  $78, \XMM3, \TMP2
1230	pxor	  \XMM3, \TMP2
1231	movdqa	  HashKey_2(%rsp), \TMP5
1232	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1233	PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1234	movdqa	  HashKey_2_k(%rsp), \TMP4
1235	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1236	pxor	  \TMP1, \TMP6
1237	pxor	  \XMM3, \XMMDst
1238	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1239
1240        # Multiply TMP1 * HashKey (using Karatsuba)
1241	movdqa	  \XMM4, \TMP1
1242	pshufd	  $78, \XMM4, \TMP2
1243	pxor	  \XMM4, \TMP2
1244	movdqa	  HashKey(%rsp), \TMP5
1245	PCLMULQDQ 0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
1246	PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1247	movdqa	  HashKey_k(%rsp), \TMP4
1248	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1249	pxor	  \TMP1, \TMP6
1250	pxor	  \XMM4, \XMMDst
1251	pxor	  \XMM1, \TMP2
1252	pxor	  \TMP6, \TMP2
1253	pxor	  \XMMDst, \TMP2
1254	# middle section of the temp results combined as in karatsuba algorithm
1255	movdqa	  \TMP2, \TMP4
1256	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
1257	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
1258	pxor	  \TMP4, \XMMDst
1259	pxor	  \TMP2, \TMP6
1260# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1261	# first phase of the reduction
1262	movdqa    \XMMDst, \TMP2
1263	movdqa    \XMMDst, \TMP3
1264	movdqa    \XMMDst, \TMP4
1265# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1266	pslld     $31, \TMP2                # packed left shifting by 31
1267	pslld     $30, \TMP3                # packed left shifting by 30
1268	pslld     $25, \TMP4                # packed left shifting by 25
1269	pxor      \TMP3, \TMP2              # xor the shifted versions
1270	pxor      \TMP4, \TMP2
1271	movdqa    \TMP2, \TMP7
1272	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1273	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1274	pxor      \TMP2, \XMMDst
1275
1276        # second phase of the reduction
1277	movdqa    \XMMDst, \TMP2
1278	# make 3 copies of XMMDst for doing 3 shift operations
1279	movdqa    \XMMDst, \TMP3
1280	movdqa    \XMMDst, \TMP4
1281	psrld     $1, \TMP2                 # packed right shift by 1
1282	psrld     $2, \TMP3                 # packed right shift by 2
1283	psrld     $7, \TMP4                 # packed right shift by 7
1284	pxor      \TMP3, \TMP2              # xor the shifted versions
1285	pxor      \TMP4, \TMP2
1286	pxor      \TMP7, \TMP2
1287	pxor      \TMP2, \XMMDst
1288	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1289.endm
1290
1291
1292/* Encryption of a single block
1293* uses eax & r10
1294*/
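/*
 * keysize (defined above as an offset into the aes_ctx key schedule) holds
 * the key length in bytes, so the round count computed below is
 * keysize/4 + 5:
 *
 *	16 bytes (AES-128) -> 9 AESENC rounds before the final AESENCLAST
 *	24 bytes (AES-192) -> 11 rounds
 *	32 bytes (AES-256) -> 13 rounds
 */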
1295
1296.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1297
1298	pxor		(%arg1), \XMM0
1299	mov		keysize,%eax
1300	shr		$2,%eax			# 128->4, 192->6, 256->8
1301	add		$5,%eax			# 128->9, 192->11, 256->13
1302	lea		16(%arg1), %r10	  # get first expanded key address
1303
1304_esb_loop_\@:
1305	MOVADQ		(%r10),\TMP1
1306	AESENC		\TMP1,\XMM0
1307	add		$16,%r10
1308	sub		$1,%eax
1309	jnz		_esb_loop_\@
1310
1311	MOVADQ		(%r10),\TMP1
1312	AESENCLAST	\TMP1,\XMM0
1313.endm
1314/*****************************************************************************
1315* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1316*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
1317*                   const u8 *in,      // Ciphertext input
1318*                   u64 plaintext_len, // Length of data in bytes for decryption.
1319*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1320*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1321*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1322*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1323*                   const u8 *aad,     // Additional Authentication Data (AAD)
1324*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1325*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1326*                                      // given authentication tag and only return the plaintext if they match.
1327*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1328*                                      // (most likely), 12 or 8.
1329*
1330* Assumptions:
1331*
1332* keys:
1333*       keys are pre-expanded and aligned to 16 bytes. we are using the first
1334*       set of 11 keys in the data structure void *aes_ctx
1335*
1336* iv:
1337*       0                   1                   2                   3
1338*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1339*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1340*       |                             Salt  (From the SA)               |
1341*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1342*       |                     Initialization Vector                     |
1343*       |         (This is the sequence number from IPSec header)       |
1344*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1345*       |                              0x1                              |
1346*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1347*
1348*
1349*
1350* AAD:
1351*       AAD padded to 128 bits with 0
1352*       for example, assume AAD is a u32 vector
1353*
1354*       if AAD is 8 bytes:
1355*       AAD[3] = {A0, A1};
1356*       padded AAD in xmm register = {A1 A0 0 0}
1357*
1358*       0                   1                   2                   3
1359*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1360*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1361*       |                               SPI (A1)                        |
1362*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1363*       |                     32-bit Sequence Number (A0)               |
1364*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1365*       |                              0x0                              |
1366*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1367*
1368*                                       AAD Format with 32-bit Sequence Number
1369*
1370*       if AAD is 12 bytes:
1371*       AAD[3] = {A0, A1, A2};
1372*       padded AAD in xmm register = {A2 A1 A0 0}
1373*
1374*       0                   1                   2                   3
1375*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1376*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1379*       |                               SPI (A2)                        |
1380*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1381*       |                 64-bit Extended Sequence Number {A1,A0}       |
1382*       |                                                               |
1383*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1384*       |                              0x0                              |
1385*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1386*
1387*                        AAD Format with 64-bit Extended Sequence Number
1388*
1389* aadLen:
1390*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
1391*       The code also supports an aadLen of 16; any other size will fail.
1392*
1393* TLen:
1394*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1395*       For other sizes, the code will fail.
1396*
1397* poly = x^128 + x^127 + x^126 + x^121 + 1
1398*
1399*****************************************************************************/
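/*
 * Illustrative call from C (hedged sketch; the buffer names are placeholders
 * and the real callers live in the kernel's C glue code, not here):
 *
 *	u8 tag[16];
 *
 *	aesni_gcm_dec(aes_ctx,			// expanded key schedule
 *		      plaintext, ciphertext,	// out, in (may alias)
 *		      ciphertext_len,
 *		      iv,			// salt || IV || 0x00000001
 *		      hash_subkey,		// H, the hash subkey
 *		      aad, 8,			// e.g. SPI || 32-bit seq number
 *		      tag, 16);
 *
 *	// the caller then compares 'tag' against the received ICV and
 *	// discards the plaintext on mismatch
 */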
1400ENTRY(aesni_gcm_dec)
1401	push	%r12
1402	push	%r13
1403	push	%r14
1404	mov	%rsp, %r14
1405/*
1406* The states of %xmm registers %xmm6 through %xmm15 are not saved;
1407* all %xmm registers are clobbered.
1408*/
1409	sub	$VARIABLE_OFFSET, %rsp
1410	and	$~63, %rsp                        # align rsp to 64 bytes
1411	mov	%arg6, %r12
1412	movdqu	(%r12), %xmm13			  # %xmm13 = HashKey
1413        movdqa  SHUF_MASK(%rip), %xmm2
1414	PSHUFB_XMM %xmm2, %xmm13
1415
1416
1417# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1418
1419	movdqa	%xmm13, %xmm2
1420	psllq	$1, %xmm13
1421	psrlq	$63, %xmm2
1422	movdqa	%xmm2, %xmm1
1423	pslldq	$8, %xmm2
1424	psrldq	$8, %xmm1
1425	por	%xmm2, %xmm13
1426
1427        # Reduction
1428
1429	pshufd	$0x24, %xmm1, %xmm2
1430	pcmpeqd TWOONE(%rip), %xmm2
1431	pand	POLY(%rip), %xmm2
1432	pxor	%xmm2, %xmm13     # %xmm13 holds the HashKey<<1 (mod poly)
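
# The block above is, in effect (hedged C sketch):
#	carry = hashkey >> 127;		// bit shifted out by the doubling
#	hashkey <<= 1;
#	if (carry)
#		hashkey ^= POLY;	// 0xC2000...0001, the reflected poly
# i.e. a multiply by x in the bit-reflected representation, giving
# HashKey<<1 (mod poly) without a full GHASH_MUL.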
1433
1434
1435        # Decrypt first few blocks
1436
1437	movdqa %xmm13, HashKey(%rsp)           # store HashKey<<1 (mod poly)
1438	mov %arg4, %r13    # save the number of bytes of plaintext/ciphertext
1439	and $-16, %r13                      # %r13 = %r13 - (%r13 mod 16)
1440	mov %r13, %r12
1441	and $(3<<4), %r12
1442	jz _initial_num_blocks_is_0_decrypt
1443	cmp $(2<<4), %r12
1444	jb _initial_num_blocks_is_1_decrypt
1445	je _initial_num_blocks_is_2_decrypt
1446_initial_num_blocks_is_3_decrypt:
1447	INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1448%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1449	sub	$48, %r13
1450	jmp	_initial_blocks_decrypted
1451_initial_num_blocks_is_2_decrypt:
1452	INITIAL_BLOCKS_DEC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1453%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1454	sub	$32, %r13
1455	jmp	_initial_blocks_decrypted
1456_initial_num_blocks_is_1_decrypt:
1457	INITIAL_BLOCKS_DEC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1458%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1459	sub	$16, %r13
1460	jmp	_initial_blocks_decrypted
1461_initial_num_blocks_is_0_decrypt:
1462	INITIAL_BLOCKS_DEC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1463%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1464_initial_blocks_decrypted:
1465	cmp	$0, %r13
1466	je	_zero_cipher_left_decrypt
1467	sub	$64, %r13
1468	je	_four_cipher_left_decrypt
1469_decrypt_by_4:
1470	GHASH_4_ENCRYPT_4_PARALLEL_DEC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1471%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1472	add	$64, %r11
1473	sub	$64, %r13
1474	jne	_decrypt_by_4
1475_four_cipher_left_decrypt:
1476	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1477%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1478_zero_cipher_left_decrypt:
1479	mov	%arg4, %r13
1480	and	$15, %r13				# %r13 = arg4 (mod 16)
1481	je	_multiple_of_16_bytes_decrypt
1482
1483        # Handle the last <16 byte block separately
1484
1485	paddd ONE(%rip), %xmm0         # increment CNT to get Yn
1486        movdqa SHUF_MASK(%rip), %xmm10
1487	PSHUFB_XMM %xmm10, %xmm0
1488
1489	ENCRYPT_SINGLE_BLOCK  %xmm0, %xmm1    # E(K, Yn)
1490	sub $16, %r11
1491	add %r13, %r11
1492	movdqu (%arg3,%r11,1), %xmm1   # receive the last <16 byte block
1493	lea SHIFT_MASK+16(%rip), %r12
1494	sub %r13, %r12
1495# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1496# (%r13 is the number of bytes in plaintext mod 16)
1497	movdqu (%r12), %xmm2           # get the appropriate shuffle mask
1498	PSHUFB_XMM %xmm2, %xmm1            # right shift 16-%r13 bytes
1499
1500	movdqa  %xmm1, %xmm2
1501	pxor %xmm1, %xmm0            # Ciphertext XOR E(K, Yn)
1502	movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1503	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1504	pand %xmm1, %xmm0            # mask out top 16-%r13 bytes of %xmm0
1505	pand    %xmm1, %xmm2
1506        movdqa SHUF_MASK(%rip), %xmm10
1507	PSHUFB_XMM %xmm10 ,%xmm2
1508
1509	pxor %xmm2, %xmm8
1510	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1511	          # GHASH computation for the last <16 byte block
1512	sub %r13, %r11
1513	add $16, %r11
1514
1515        # output %r13 bytes
1516	MOVQ_R64_XMM	%xmm0, %rax
1517	cmp	$8, %r13
1518	jle	_less_than_8_bytes_left_decrypt
1519	mov	%rax, (%arg2 , %r11, 1)
1520	add	$8, %r11
1521	psrldq	$8, %xmm0
1522	MOVQ_R64_XMM	%xmm0, %rax
1523	sub	$8, %r13
1524_less_than_8_bytes_left_decrypt:
1525	mov	%al,  (%arg2, %r11, 1)
1526	add	$1, %r11
1527	shr	$8, %rax
1528	sub	$1, %r13
1529	jne	_less_than_8_bytes_left_decrypt
1530_multiple_of_16_bytes_decrypt:
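	# Per the GCM spec, the value folded into the hash here is the final
	# 128-bit block  len(A) || len(C)  (both lengths in bits), and the tag
	# is
	#	T = MSB_tlen( GHASH(H, A, C) XOR E(K, Y0) )
	# which is what the code below assembles before _return_T_decrypt.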
1531	mov	arg8, %r12		  # %r12 = aadLen (number of bytes)
1532	shl	$3, %r12		  # convert into number of bits
1533	movd	%r12d, %xmm15		  # len(A) in %xmm15
1534	shl	$3, %arg4		  # len(C) in bits (*8)
1535	MOVQ_R64_XMM	%arg4, %xmm1
1536	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
1537	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
1538	pxor	%xmm15, %xmm8
1539	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1540	         # final GHASH computation
1541        movdqa SHUF_MASK(%rip), %xmm10
1542	PSHUFB_XMM %xmm10, %xmm8
1543
1544	mov	%arg5, %rax		  # %rax = *Y0
1545	movdqu	(%rax), %xmm0		  # %xmm0 = Y0
1546	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
1547	pxor	%xmm8, %xmm0
1548_return_T_decrypt:
1549	mov	arg9, %r10                # %r10 = authTag
1550	mov	arg10, %r11               # %r11 = auth_tag_len
1551	cmp	$16, %r11
1552	je	_T_16_decrypt
1553	cmp	$8, %r11
1554	jl	_T_4_decrypt
1555_T_8_decrypt:
1556	MOVQ_R64_XMM	%xmm0, %rax
1557	mov	%rax, (%r10)
1558	add	$8, %r10
1559	sub	$8, %r11
1560	psrldq	$8, %xmm0
1561	cmp	$0, %r11
1562	je	_return_T_done_decrypt
1563_T_4_decrypt:
1564	movd	%xmm0, %eax
1565	mov	%eax, (%r10)
1566	add	$4, %r10
1567	sub	$4, %r11
1568	psrldq	$4, %xmm0
1569	cmp	$0, %r11
1570	je	_return_T_done_decrypt
1571_T_123_decrypt:
1572	movd	%xmm0, %eax
1573	cmp	$2, %r11
1574	jl	_T_1_decrypt
1575	mov	%ax, (%r10)
1576	cmp	$2, %r11
1577	je	_return_T_done_decrypt
1578	add	$2, %r10
1579	sar	$16, %eax
1580_T_1_decrypt:
1581	mov	%al, (%r10)
1582	jmp	_return_T_done_decrypt
1583_T_16_decrypt:
1584	movdqu	%xmm0, (%r10)
1585_return_T_done_decrypt:
1586	mov	%r14, %rsp
1587	pop	%r14
1588	pop	%r13
1589	pop	%r12
1590	ret
1591ENDPROC(aesni_gcm_dec)
1592
1593
1594/*****************************************************************************
1595* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1596*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1597*                    const u8 *in,       // Plaintext input
1598*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1599*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1600*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1601*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1602*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1603*                    const u8 *aad,      // Additional Authentication Data (AAD)
1604*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1605*                    u8 *auth_tag,       // Authenticated Tag output.
1606*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1607*                                        // 12 or 8.
1608*
1609* Assumptions:
1610*
1611* keys:
1612*       Keys are pre-expanded and aligned to 16 bytes. We are using the
1613*       first set of 11 keys in the data structure void *aes_ctx.
1614*
1615*
1616* iv:
1617*       0                   1                   2                   3
1618*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1619*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1620*       |                             Salt  (From the SA)               |
1621*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1622*       |                     Initialization Vector                     |
1623*       |         (This is the sequence number from IPSec header)       |
1624*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1625*       |                              0x1                              |
1626*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1627*
1628*
1629*
1630* AAD:
1631*       AAD padded to 128 bits with 0
1632*       for example, assume AAD is a u32 vector
1633*
1634*       if AAD is 8 bytes:
1635*       AAD[2] = {A0, A1};
1636*       padded AAD in xmm register = {A1 A0 0 0}
1637*
1638*       0                   1                   2                   3
1639*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1640*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1641*       |                               SPI (A1)                        |
1642*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1643*       |                     32-bit Sequence Number (A0)               |
1644*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1645*       |                              0x0                              |
1646*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1647*
1648*                                 AAD Format with 32-bit Sequence Number
1649*
1650*       if AAD is 12 bytes:
1651*       AAD[3] = {A0, A1, A2};
1652*       padded AAD in xmm register = {A2 A1 A0 0}
1653*
1654*       0                   1                   2                   3
1655*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1656*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1657*       |                               SPI (A2)                        |
1658*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1659*       |                 64-bit Extended Sequence Number {A1,A0}       |
1660*       |                                                               |
1661*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1662*       |                              0x0                              |
1663*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1664*
1665*                         AAD Format with 64-bit Extended Sequence Number
1666*
1667* aadLen:
1668*       per the RFC4106 spec, aadLen can only be 8 or 12 bytes. The code
1669*       also supports an aadLen of 16, but will fail for any other size.
1670*
1671* TLen:
1672*       per the spec, TLen can only be 8, 12 or 16 bytes. The code will
1673*       fail for any other size.
1674*
1675* poly = x^128 + x^127 + x^126 + x^121 + 1
1676***************************************************************************/
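
/*
 * For reference only: a minimal C sketch (not part of this file) of how a
 * caller could lay out the 16-byte pre-counter block j0 described above
 * (4-byte salt || 8-byte IV || 0x00000001). The helper name
 * build_rfc4106_j0 and its arguments are hypothetical.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void build_rfc4106_j0(uint8_t j0[16], const uint8_t salt[4],
 *				     const uint8_t esp_iv[8])
 *	{
 *		memcpy(j0, salt, 4);		// salt from the SA
 *		memcpy(j0 + 4, esp_iv, 8);	// IV from the ESP payload
 *		j0[12] = 0x00;			// trailing big-endian 0x00000001
 *		j0[13] = 0x00;
 *		j0[14] = 0x00;
 *		j0[15] = 0x01;
 *	}
 */
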
1677ENTRY(aesni_gcm_enc)
1678	push	%r12
1679	push	%r13
1680	push	%r14
1681	mov	%rsp, %r14
1682#
1683# states of %xmm registers %xmm6:%xmm15 not saved
1684# all %xmm registers are clobbered
1685#
1686	sub	$VARIABLE_OFFSET, %rsp
1687	and	$~63, %rsp
1688	mov	%arg6, %r12
1689	movdqu	(%r12), %xmm13
1690        movdqa  SHUF_MASK(%rip), %xmm2
1691	PSHUFB_XMM %xmm2, %xmm13
1692
1693
1694# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1695
1696	movdqa	%xmm13, %xmm2
1697	psllq	$1, %xmm13
1698	psrlq	$63, %xmm2
1699	movdqa	%xmm2, %xmm1
1700	pslldq	$8, %xmm2
1701	psrldq	$8, %xmm1
1702	por	%xmm2, %xmm13
1703
1704        # reduce HashKey<<1
1705
1706	pshufd	$0x24, %xmm1, %xmm2
1707	pcmpeqd TWOONE(%rip), %xmm2
1708	pand	POLY(%rip), %xmm2
1709	pxor	%xmm2, %xmm13
1710	movdqa	%xmm13, HashKey(%rsp)
1711	mov	%arg4, %r13            # %xmm13 holds HashKey<<1 (mod poly)
1712	and	$-16, %r13
1713	mov	%r13, %r12
1714
1715        # Encrypt first few blocks
1716
1717	and	$(3<<4), %r12
1718	jz	_initial_num_blocks_is_0_encrypt
1719	cmp	$(2<<4), %r12
1720	jb	_initial_num_blocks_is_1_encrypt
1721	je	_initial_num_blocks_is_2_encrypt
1722_initial_num_blocks_is_3_encrypt:
1723	INITIAL_BLOCKS_ENC	3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1724%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1725	sub	$48, %r13
1726	jmp	_initial_blocks_encrypted
1727_initial_num_blocks_is_2_encrypt:
1728	INITIAL_BLOCKS_ENC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1729%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1730	sub	$32, %r13
1731	jmp	_initial_blocks_encrypted
1732_initial_num_blocks_is_1_encrypt:
1733	INITIAL_BLOCKS_ENC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1734%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1735	sub	$16, %r13
1736	jmp	_initial_blocks_encrypted
1737_initial_num_blocks_is_0_encrypt:
1738	INITIAL_BLOCKS_ENC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1739%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1740_initial_blocks_encrypted:
1741
1742        # Main loop - Encrypt remaining blocks
1743
1744	cmp	$0, %r13
1745	je	_zero_cipher_left_encrypt
1746	sub	$64, %r13
1747	je	_four_cipher_left_encrypt
1748_encrypt_by_4_encrypt:
1749	GHASH_4_ENCRYPT_4_PARALLEL_ENC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1750%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1751	add	$64, %r11
1752	sub	$64, %r13
1753	jne	_encrypt_by_4_encrypt
1754_four_cipher_left_encrypt:
1755	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1756%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1757_zero_cipher_left_encrypt:
1758	mov	%arg4, %r13
1759	and	$15, %r13			# %r13 = arg4 (mod 16)
1760	je	_multiple_of_16_bytes_encrypt
1761
1762         # Handle the last <16 Byte block separately
1763	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
1764        movdqa SHUF_MASK(%rip), %xmm10
1765	PSHUFB_XMM %xmm10, %xmm0
1766
1767
1768	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
1769	sub $16, %r11
1770	add %r13, %r11
1771	movdqu (%arg3,%r11,1), %xmm1     # receive the last <16 byte block
1772	lea SHIFT_MASK+16(%rip), %r12
1773	sub %r13, %r12
1774	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1775	# (%r13 is the number of bytes in plaintext mod 16)
1776	movdqu	(%r12), %xmm2           # get the appropriate shuffle mask
1777	PSHUFB_XMM	%xmm2, %xmm1            # shift right 16-r13 bytes
1778	pxor	%xmm1, %xmm0            # Plaintext XOR Encrypt(K, Yn)
1779	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
1780	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
1781	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
1782        movdqa SHUF_MASK(%rip), %xmm10
1783	PSHUFB_XMM %xmm10,%xmm0
1784
1785	pxor	%xmm0, %xmm8
1786	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1787	# GHASH computation for the last <16 byte block
1788	sub	%r13, %r11
1789	add	$16, %r11
1790
1791	movdqa SHUF_MASK(%rip), %xmm10
1792	PSHUFB_XMM %xmm10, %xmm0
1793
1794	# shuffle xmm0 back to output as ciphertext
1795
1796        # Output %r13 bytes
1797	MOVQ_R64_XMM %xmm0, %rax
1798	cmp $8, %r13
1799	jle _less_than_8_bytes_left_encrypt
1800	mov %rax, (%arg2 , %r11, 1)
1801	add $8, %r11
1802	psrldq $8, %xmm0
1803	MOVQ_R64_XMM %xmm0, %rax
1804	sub $8, %r13
1805_less_than_8_bytes_left_encrypt:
1806	mov %al,  (%arg2, %r11, 1)
1807	add $1, %r11
1808	shr $8, %rax
1809	sub $1, %r13
1810	jne _less_than_8_bytes_left_encrypt
1811_multiple_of_16_bytes_encrypt:
1812	mov	arg8, %r12    # %r12 = aadLen (number of bytes)
1813	shl	$3, %r12
1814	movd	%r12d, %xmm15       # len(A) in %xmm15
1815	shl	$3, %arg4               # len(C) in bits (*8)
1816	MOVQ_R64_XMM	%arg4, %xmm1
1817	pslldq	$8, %xmm15          # %xmm15 = len(A)||0x0000000000000000
1818	pxor	%xmm1, %xmm15       # %xmm15 = len(A)||len(C)
1819	pxor	%xmm15, %xmm8
1820	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1821	# final GHASH computation
1822        movdqa SHUF_MASK(%rip), %xmm10
1823	PSHUFB_XMM %xmm10, %xmm8         # perform a 16 byte swap
1824
1825	mov	%arg5, %rax		       # %rax  = *Y0
1826	movdqu	(%rax), %xmm0		       # %xmm0 = Y0
1827	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm15         # Encrypt(K, Y0)
1828	pxor	%xmm8, %xmm0
1829_return_T_encrypt:
1830	mov	arg9, %r10                     # %r10 = authTag
1831	mov	arg10, %r11                    # %r11 = auth_tag_len
1832	cmp	$16, %r11
1833	je	_T_16_encrypt
1834	cmp	$8, %r11
1835	jl	_T_4_encrypt
1836_T_8_encrypt:
1837	MOVQ_R64_XMM	%xmm0, %rax
1838	mov	%rax, (%r10)
1839	add	$8, %r10
1840	sub	$8, %r11
1841	psrldq	$8, %xmm0
1842	cmp	$0, %r11
1843	je	_return_T_done_encrypt
1844_T_4_encrypt:
1845	movd	%xmm0, %eax
1846	mov	%eax, (%r10)
1847	add	$4, %r10
1848	sub	$4, %r11
1849	psrldq	$4, %xmm0
1850	cmp	$0, %r11
1851	je	_return_T_done_encrypt
1852_T_123_encrypt:
1853	movd	%xmm0, %eax
1854	cmp	$2, %r11
1855	jl	_T_1_encrypt
1856	mov	%ax, (%r10)
1857	cmp	$2, %r11
1858	je	_return_T_done_encrypt
1859	add	$2, %r10
1860	sar	$16, %eax
1861_T_1_encrypt:
1862	mov	%al, (%r10)
1863	jmp	_return_T_done_encrypt
1864_T_16_encrypt:
1865	movdqu	%xmm0, (%r10)
1866_return_T_done_encrypt:
1867	mov	%r14, %rsp
1868	pop	%r14
1869	pop	%r13
1870	pop	%r12
1871	ret
1872ENDPROC(aesni_gcm_enc)
1873
1874#endif
1875
1876
1877.align 4
1878_key_expansion_128:
1879_key_expansion_256a:
1880	pshufd $0b11111111, %xmm1, %xmm1
1881	shufps $0b00010000, %xmm0, %xmm4
1882	pxor %xmm4, %xmm0
1883	shufps $0b10001100, %xmm0, %xmm4
1884	pxor %xmm4, %xmm0
1885	pxor %xmm1, %xmm0
1886	movaps %xmm0, (TKEYP)
1887	add $0x10, TKEYP
1888	ret
1889ENDPROC(_key_expansion_128)
1890ENDPROC(_key_expansion_256a)
1891
1892.align 4
1893_key_expansion_192a:
1894	pshufd $0b01010101, %xmm1, %xmm1
1895	shufps $0b00010000, %xmm0, %xmm4
1896	pxor %xmm4, %xmm0
1897	shufps $0b10001100, %xmm0, %xmm4
1898	pxor %xmm4, %xmm0
1899	pxor %xmm1, %xmm0
1900
1901	movaps %xmm2, %xmm5
1902	movaps %xmm2, %xmm6
1903	pslldq $4, %xmm5
1904	pshufd $0b11111111, %xmm0, %xmm3
1905	pxor %xmm3, %xmm2
1906	pxor %xmm5, %xmm2
1907
1908	movaps %xmm0, %xmm1
1909	shufps $0b01000100, %xmm0, %xmm6
1910	movaps %xmm6, (TKEYP)
1911	shufps $0b01001110, %xmm2, %xmm1
1912	movaps %xmm1, 0x10(TKEYP)
1913	add $0x20, TKEYP
1914	ret
1915ENDPROC(_key_expansion_192a)
1916
1917.align 4
1918_key_expansion_192b:
1919	pshufd $0b01010101, %xmm1, %xmm1
1920	shufps $0b00010000, %xmm0, %xmm4
1921	pxor %xmm4, %xmm0
1922	shufps $0b10001100, %xmm0, %xmm4
1923	pxor %xmm4, %xmm0
1924	pxor %xmm1, %xmm0
1925
1926	movaps %xmm2, %xmm5
1927	pslldq $4, %xmm5
1928	pshufd $0b11111111, %xmm0, %xmm3
1929	pxor %xmm3, %xmm2
1930	pxor %xmm5, %xmm2
1931
1932	movaps %xmm0, (TKEYP)
1933	add $0x10, TKEYP
1934	ret
1935ENDPROC(_key_expansion_192b)
1936
1937.align 4
1938_key_expansion_256b:
1939	pshufd $0b10101010, %xmm1, %xmm1
1940	shufps $0b00010000, %xmm2, %xmm4
1941	pxor %xmm4, %xmm2
1942	shufps $0b10001100, %xmm2, %xmm4
1943	pxor %xmm4, %xmm2
1944	pxor %xmm1, %xmm2
1945	movaps %xmm2, (TKEYP)
1946	add $0x10, TKEYP
1947	ret
1948ENDPROC(_key_expansion_256b)
1949
1950/*
1951 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1952 *                   unsigned int key_len)
1953 */
1954ENTRY(aesni_set_key)
1955	FRAME_BEGIN
1956#ifndef __x86_64__
1957	pushl KEYP
1958	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
1959	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
1960	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
1961#endif
1962	movups (UKEYP), %xmm0		# user key (first 16 bytes)
1963	movaps %xmm0, (KEYP)
1964	lea 0x10(KEYP), TKEYP		# key addr
1965	movl %edx, 480(KEYP)
1966	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
1967	cmp $24, %dl
1968	jb .Lenc_key128
1969	je .Lenc_key192
1970	movups 0x10(UKEYP), %xmm2	# other user key
1971	movaps %xmm2, (TKEYP)
1972	add $0x10, TKEYP
1973	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
1974	call _key_expansion_256a
1975	AESKEYGENASSIST 0x1 %xmm0 %xmm1
1976	call _key_expansion_256b
1977	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
1978	call _key_expansion_256a
1979	AESKEYGENASSIST 0x2 %xmm0 %xmm1
1980	call _key_expansion_256b
1981	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
1982	call _key_expansion_256a
1983	AESKEYGENASSIST 0x4 %xmm0 %xmm1
1984	call _key_expansion_256b
1985	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
1986	call _key_expansion_256a
1987	AESKEYGENASSIST 0x8 %xmm0 %xmm1
1988	call _key_expansion_256b
1989	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
1990	call _key_expansion_256a
1991	AESKEYGENASSIST 0x10 %xmm0 %xmm1
1992	call _key_expansion_256b
1993	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
1994	call _key_expansion_256a
1995	AESKEYGENASSIST 0x20 %xmm0 %xmm1
1996	call _key_expansion_256b
1997	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
1998	call _key_expansion_256a
1999	jmp .Ldec_key
2000.Lenc_key192:
2001	movq 0x10(UKEYP), %xmm2		# other user key
2002	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
2003	call _key_expansion_192a
2004	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
2005	call _key_expansion_192b
2006	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
2007	call _key_expansion_192a
2008	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
2009	call _key_expansion_192b
2010	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
2011	call _key_expansion_192a
2012	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
2013	call _key_expansion_192b
2014	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
2015	call _key_expansion_192a
2016	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
2017	call _key_expansion_192b
2018	jmp .Ldec_key
2019.Lenc_key128:
2020	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
2021	call _key_expansion_128
2022	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
2023	call _key_expansion_128
2024	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
2025	call _key_expansion_128
2026	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
2027	call _key_expansion_128
2028	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
2029	call _key_expansion_128
2030	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
2031	call _key_expansion_128
2032	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
2033	call _key_expansion_128
2034	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
2035	call _key_expansion_128
2036	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
2037	call _key_expansion_128
2038	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
2039	call _key_expansion_128
2040.Ldec_key:
2041	sub $0x10, TKEYP
2042	movaps (KEYP), %xmm0
2043	movaps (TKEYP), %xmm1
2044	movaps %xmm0, 240(TKEYP)
2045	movaps %xmm1, 240(KEYP)
2046	add $0x10, KEYP
2047	lea 240-16(TKEYP), UKEYP
2048.align 4
2049.Ldec_key_loop:
2050	movaps (KEYP), %xmm0
2051	AESIMC %xmm0 %xmm1
2052	movaps %xmm1, (UKEYP)
2053	add $0x10, KEYP
2054	sub $0x10, UKEYP
2055	cmp TKEYP, KEYP
2056	jb .Ldec_key_loop
2057	xor AREG, AREG
2058#ifndef __x86_64__
2059	popl KEYP
2060#endif
2061	FRAME_END
2062	ret
2063ENDPROC(aesni_set_key)
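
/*
 * For reference only: a minimal C intrinsics sketch (not part of this file,
 * names are hypothetical) of the per-round AES-128 expansion step performed
 * by AESKEYGENASSIST plus _key_expansion_128 above.
 *
 *	#include <wmmintrin.h>		// AES-NI / SSE2 intrinsics
 *
 *	static __m128i aes128_expand_step(__m128i key, __m128i kga)
 *	{
 *		// broadcast the word produced by AESKEYGENASSIST
 *		kga = _mm_shuffle_epi32(kga, 0xff);
 *		// net effect: key ^= key<<32 ^ key<<64 ^ key<<96
 *		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
 *		key = _mm_xor_si128(key, _mm_slli_si128(key, 8));
 *		return _mm_xor_si128(key, kga);
 *	}
 *
 * Usage, e.g. for round 1 (rcon 0x1, matching the first AESKEYGENASSIST):
 *	rk[1] = aes128_expand_step(rk[0], _mm_aeskeygenassist_si128(rk[0], 0x1));
 */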
2064
2065/*
2066 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2067 */
2068ENTRY(aesni_enc)
2069	FRAME_BEGIN
2070#ifndef __x86_64__
2071	pushl KEYP
2072	pushl KLEN
2073	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
2074	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
2075	movl (FRAME_OFFSET+20)(%esp), INP	# src
2076#endif
2077	movl 480(KEYP), KLEN		# key length
2078	movups (INP), STATE		# input
2079	call _aesni_enc1
2080	movups STATE, (OUTP)		# output
2081#ifndef __x86_64__
2082	popl KLEN
2083	popl KEYP
2084#endif
2085	FRAME_END
2086	ret
2087ENDPROC(aesni_enc)
2088
2089/*
2090 * _aesni_enc1:		internal ABI
2091 * input:
2092 *	KEYP:		key struct pointer
2093 *	KLEN:		key length
2094 *	STATE:		initial state (input)
2095 * output:
2096 *	STATE:		final state (output)
2097 * changed:
2098 *	KEY
2099 *	TKEYP (T1)
2100 */
2101.align 4
2102_aesni_enc1:
2103	movaps (KEYP), KEY		# key
2104	mov KEYP, TKEYP
2105	pxor KEY, STATE		# round 0
2106	add $0x30, TKEYP
2107	cmp $24, KLEN
2108	jb .Lenc128
2109	lea 0x20(TKEYP), TKEYP
2110	je .Lenc192
2111	add $0x20, TKEYP
2112	movaps -0x60(TKEYP), KEY
2113	AESENC KEY STATE
2114	movaps -0x50(TKEYP), KEY
2115	AESENC KEY STATE
2116.align 4
2117.Lenc192:
2118	movaps -0x40(TKEYP), KEY
2119	AESENC KEY STATE
2120	movaps -0x30(TKEYP), KEY
2121	AESENC KEY STATE
2122.align 4
2123.Lenc128:
2124	movaps -0x20(TKEYP), KEY
2125	AESENC KEY STATE
2126	movaps -0x10(TKEYP), KEY
2127	AESENC KEY STATE
2128	movaps (TKEYP), KEY
2129	AESENC KEY STATE
2130	movaps 0x10(TKEYP), KEY
2131	AESENC KEY STATE
2132	movaps 0x20(TKEYP), KEY
2133	AESENC KEY STATE
2134	movaps 0x30(TKEYP), KEY
2135	AESENC KEY STATE
2136	movaps 0x40(TKEYP), KEY
2137	AESENC KEY STATE
2138	movaps 0x50(TKEYP), KEY
2139	AESENC KEY STATE
2140	movaps 0x60(TKEYP), KEY
2141	AESENC KEY STATE
2142	movaps 0x70(TKEYP), KEY
2143	AESENCLAST KEY STATE
2144	ret
2145ENDPROC(_aesni_enc1)
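
/*
 * For reference only: a minimal C intrinsics sketch (not part of this file)
 * of the same single-block flow as _aesni_enc1 - whiten with round key 0,
 * run the middle AESENC rounds, finish with AESENCLAST. nr is 10/12/14 for
 * 128/192/256-bit keys, matching the KLEN dispatch above.
 *
 *	#include <wmmintrin.h>
 *
 *	static __m128i aesni_enc_block(__m128i in, const __m128i *rk, int nr)
 *	{
 *		__m128i state = _mm_xor_si128(in, rk[0]);	// round 0
 *		for (int i = 1; i < nr; i++)
 *			state = _mm_aesenc_si128(state, rk[i]);
 *		return _mm_aesenclast_si128(state, rk[nr]);	// last round
 *	}
 */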
2146
2147/*
2148 * _aesni_enc4:	internal ABI
2149 * input:
2150 *	KEYP:		key struct pointer
2151 *	KLEN:		key length
2152 *	STATE1:		initial state (input)
2153 *	STATE2
2154 *	STATE3
2155 *	STATE4
2156 * output:
2157 *	STATE1:		final state (output)
2158 *	STATE2
2159 *	STATE3
2160 *	STATE4
2161 * changed:
2162 *	KEY
2163 *	TKEYP (T1)
2164 */
2165.align 4
2166_aesni_enc4:
2167	movaps (KEYP), KEY		# key
2168	mov KEYP, TKEYP
2169	pxor KEY, STATE1		# round 0
2170	pxor KEY, STATE2
2171	pxor KEY, STATE3
2172	pxor KEY, STATE4
2173	add $0x30, TKEYP
2174	cmp $24, KLEN
2175	jb .L4enc128
2176	lea 0x20(TKEYP), TKEYP
2177	je .L4enc192
2178	add $0x20, TKEYP
2179	movaps -0x60(TKEYP), KEY
2180	AESENC KEY STATE1
2181	AESENC KEY STATE2
2182	AESENC KEY STATE3
2183	AESENC KEY STATE4
2184	movaps -0x50(TKEYP), KEY
2185	AESENC KEY STATE1
2186	AESENC KEY STATE2
2187	AESENC KEY STATE3
2188	AESENC KEY STATE4
2189#.align 4
2190.L4enc192:
2191	movaps -0x40(TKEYP), KEY
2192	AESENC KEY STATE1
2193	AESENC KEY STATE2
2194	AESENC KEY STATE3
2195	AESENC KEY STATE4
2196	movaps -0x30(TKEYP), KEY
2197	AESENC KEY STATE1
2198	AESENC KEY STATE2
2199	AESENC KEY STATE3
2200	AESENC KEY STATE4
2201#.align 4
2202.L4enc128:
2203	movaps -0x20(TKEYP), KEY
2204	AESENC KEY STATE1
2205	AESENC KEY STATE2
2206	AESENC KEY STATE3
2207	AESENC KEY STATE4
2208	movaps -0x10(TKEYP), KEY
2209	AESENC KEY STATE1
2210	AESENC KEY STATE2
2211	AESENC KEY STATE3
2212	AESENC KEY STATE4
2213	movaps (TKEYP), KEY
2214	AESENC KEY STATE1
2215	AESENC KEY STATE2
2216	AESENC KEY STATE3
2217	AESENC KEY STATE4
2218	movaps 0x10(TKEYP), KEY
2219	AESENC KEY STATE1
2220	AESENC KEY STATE2
2221	AESENC KEY STATE3
2222	AESENC KEY STATE4
2223	movaps 0x20(TKEYP), KEY
2224	AESENC KEY STATE1
2225	AESENC KEY STATE2
2226	AESENC KEY STATE3
2227	AESENC KEY STATE4
2228	movaps 0x30(TKEYP), KEY
2229	AESENC KEY STATE1
2230	AESENC KEY STATE2
2231	AESENC KEY STATE3
2232	AESENC KEY STATE4
2233	movaps 0x40(TKEYP), KEY
2234	AESENC KEY STATE1
2235	AESENC KEY STATE2
2236	AESENC KEY STATE3
2237	AESENC KEY STATE4
2238	movaps 0x50(TKEYP), KEY
2239	AESENC KEY STATE1
2240	AESENC KEY STATE2
2241	AESENC KEY STATE3
2242	AESENC KEY STATE4
2243	movaps 0x60(TKEYP), KEY
2244	AESENC KEY STATE1
2245	AESENC KEY STATE2
2246	AESENC KEY STATE3
2247	AESENC KEY STATE4
2248	movaps 0x70(TKEYP), KEY
2249	AESENCLAST KEY STATE1		# last round
2250	AESENCLAST KEY STATE2
2251	AESENCLAST KEY STATE3
2252	AESENCLAST KEY STATE4
2253	ret
2254ENDPROC(_aesni_enc4)
2255
2256/*
2257 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2258 */
2259ENTRY(aesni_dec)
2260	FRAME_BEGIN
2261#ifndef __x86_64__
2262	pushl KEYP
2263	pushl KLEN
2264	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
2265	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
2266	movl (FRAME_OFFSET+20)(%esp), INP	# src
2267#endif
2268	mov 480(KEYP), KLEN		# key length
2269	add $240, KEYP
2270	movups (INP), STATE		# input
2271	call _aesni_dec1
2272	movups STATE, (OUTP)		#output
2273#ifndef __x86_64__
2274	popl KLEN
2275	popl KEYP
2276#endif
2277	FRAME_END
2278	ret
2279ENDPROC(aesni_dec)
2280
2281/*
2282 * _aesni_dec1:		internal ABI
2283 * input:
2284 *	KEYP:		key struct pointer
2285 *	KLEN:		key length
2286 *	STATE:		initial state (input)
2287 * output:
2288 *	STATE:		final state (output)
2289 * changed:
2290 *	KEY
2291 *	TKEYP (T1)
2292 */
2293.align 4
2294_aesni_dec1:
2295	movaps (KEYP), KEY		# key
2296	mov KEYP, TKEYP
2297	pxor KEY, STATE		# round 0
2298	add $0x30, TKEYP
2299	cmp $24, KLEN
2300	jb .Ldec128
2301	lea 0x20(TKEYP), TKEYP
2302	je .Ldec192
2303	add $0x20, TKEYP
2304	movaps -0x60(TKEYP), KEY
2305	AESDEC KEY STATE
2306	movaps -0x50(TKEYP), KEY
2307	AESDEC KEY STATE
2308.align 4
2309.Ldec192:
2310	movaps -0x40(TKEYP), KEY
2311	AESDEC KEY STATE
2312	movaps -0x30(TKEYP), KEY
2313	AESDEC KEY STATE
2314.align 4
2315.Ldec128:
2316	movaps -0x20(TKEYP), KEY
2317	AESDEC KEY STATE
2318	movaps -0x10(TKEYP), KEY
2319	AESDEC KEY STATE
2320	movaps (TKEYP), KEY
2321	AESDEC KEY STATE
2322	movaps 0x10(TKEYP), KEY
2323	AESDEC KEY STATE
2324	movaps 0x20(TKEYP), KEY
2325	AESDEC KEY STATE
2326	movaps 0x30(TKEYP), KEY
2327	AESDEC KEY STATE
2328	movaps 0x40(TKEYP), KEY
2329	AESDEC KEY STATE
2330	movaps 0x50(TKEYP), KEY
2331	AESDEC KEY STATE
2332	movaps 0x60(TKEYP), KEY
2333	AESDEC KEY STATE
2334	movaps 0x70(TKEYP), KEY
2335	AESDECLAST KEY STATE
2336	ret
2337ENDPROC(_aesni_dec1)
2338
2339/*
2340 * _aesni_dec4:	internal ABI
2341 * input:
2342 *	KEYP:		key struct pointer
2343 *	KLEN:		key length
2344 *	STATE1:		initial state (input)
2345 *	STATE2
2346 *	STATE3
2347 *	STATE4
2348 * output:
2349 *	STATE1:		final state (output)
2350 *	STATE2
2351 *	STATE3
2352 *	STATE4
2353 * changed:
2354 *	KEY
2355 *	TKEYP (T1)
2356 */
2357.align 4
2358_aesni_dec4:
2359	movaps (KEYP), KEY		# key
2360	mov KEYP, TKEYP
2361	pxor KEY, STATE1		# round 0
2362	pxor KEY, STATE2
2363	pxor KEY, STATE3
2364	pxor KEY, STATE4
2365	add $0x30, TKEYP
2366	cmp $24, KLEN
2367	jb .L4dec128
2368	lea 0x20(TKEYP), TKEYP
2369	je .L4dec192
2370	add $0x20, TKEYP
2371	movaps -0x60(TKEYP), KEY
2372	AESDEC KEY STATE1
2373	AESDEC KEY STATE2
2374	AESDEC KEY STATE3
2375	AESDEC KEY STATE4
2376	movaps -0x50(TKEYP), KEY
2377	AESDEC KEY STATE1
2378	AESDEC KEY STATE2
2379	AESDEC KEY STATE3
2380	AESDEC KEY STATE4
2381.align 4
2382.L4dec192:
2383	movaps -0x40(TKEYP), KEY
2384	AESDEC KEY STATE1
2385	AESDEC KEY STATE2
2386	AESDEC KEY STATE3
2387	AESDEC KEY STATE4
2388	movaps -0x30(TKEYP), KEY
2389	AESDEC KEY STATE1
2390	AESDEC KEY STATE2
2391	AESDEC KEY STATE3
2392	AESDEC KEY STATE4
2393.align 4
2394.L4dec128:
2395	movaps -0x20(TKEYP), KEY
2396	AESDEC KEY STATE1
2397	AESDEC KEY STATE2
2398	AESDEC KEY STATE3
2399	AESDEC KEY STATE4
2400	movaps -0x10(TKEYP), KEY
2401	AESDEC KEY STATE1
2402	AESDEC KEY STATE2
2403	AESDEC KEY STATE3
2404	AESDEC KEY STATE4
2405	movaps (TKEYP), KEY
2406	AESDEC KEY STATE1
2407	AESDEC KEY STATE2
2408	AESDEC KEY STATE3
2409	AESDEC KEY STATE4
2410	movaps 0x10(TKEYP), KEY
2411	AESDEC KEY STATE1
2412	AESDEC KEY STATE2
2413	AESDEC KEY STATE3
2414	AESDEC KEY STATE4
2415	movaps 0x20(TKEYP), KEY
2416	AESDEC KEY STATE1
2417	AESDEC KEY STATE2
2418	AESDEC KEY STATE3
2419	AESDEC KEY STATE4
2420	movaps 0x30(TKEYP), KEY
2421	AESDEC KEY STATE1
2422	AESDEC KEY STATE2
2423	AESDEC KEY STATE3
2424	AESDEC KEY STATE4
2425	movaps 0x40(TKEYP), KEY
2426	AESDEC KEY STATE1
2427	AESDEC KEY STATE2
2428	AESDEC KEY STATE3
2429	AESDEC KEY STATE4
2430	movaps 0x50(TKEYP), KEY
2431	AESDEC KEY STATE1
2432	AESDEC KEY STATE2
2433	AESDEC KEY STATE3
2434	AESDEC KEY STATE4
2435	movaps 0x60(TKEYP), KEY
2436	AESDEC KEY STATE1
2437	AESDEC KEY STATE2
2438	AESDEC KEY STATE3
2439	AESDEC KEY STATE4
2440	movaps 0x70(TKEYP), KEY
2441	AESDECLAST KEY STATE1		# last round
2442	AESDECLAST KEY STATE2
2443	AESDECLAST KEY STATE3
2444	AESDECLAST KEY STATE4
2445	ret
2446ENDPROC(_aesni_dec4)
2447
2448/*
2449 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2450 *		      size_t len)
2451 */
2452ENTRY(aesni_ecb_enc)
2453	FRAME_BEGIN
2454#ifndef __x86_64__
2455	pushl LEN
2456	pushl KEYP
2457	pushl KLEN
2458	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
2459	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
2460	movl (FRAME_OFFSET+24)(%esp), INP	# src
2461	movl (FRAME_OFFSET+28)(%esp), LEN	# len
2462#endif
2463	test LEN, LEN		# check length
2464	jz .Lecb_enc_ret
2465	mov 480(KEYP), KLEN
2466	cmp $16, LEN
2467	jb .Lecb_enc_ret
2468	cmp $64, LEN
2469	jb .Lecb_enc_loop1
2470.align 4
2471.Lecb_enc_loop4:
2472	movups (INP), STATE1
2473	movups 0x10(INP), STATE2
2474	movups 0x20(INP), STATE3
2475	movups 0x30(INP), STATE4
2476	call _aesni_enc4
2477	movups STATE1, (OUTP)
2478	movups STATE2, 0x10(OUTP)
2479	movups STATE3, 0x20(OUTP)
2480	movups STATE4, 0x30(OUTP)
2481	sub $64, LEN
2482	add $64, INP
2483	add $64, OUTP
2484	cmp $64, LEN
2485	jge .Lecb_enc_loop4
2486	cmp $16, LEN
2487	jb .Lecb_enc_ret
2488.align 4
2489.Lecb_enc_loop1:
2490	movups (INP), STATE1
2491	call _aesni_enc1
2492	movups STATE1, (OUTP)
2493	sub $16, LEN
2494	add $16, INP
2495	add $16, OUTP
2496	cmp $16, LEN
2497	jge .Lecb_enc_loop1
2498.Lecb_enc_ret:
2499#ifndef __x86_64__
2500	popl KLEN
2501	popl KEYP
2502	popl LEN
2503#endif
2504	FRAME_END
2505	ret
2506ENDPROC(aesni_ecb_enc)
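
/*
 * For reference only: the block-dispatch strategy of aesni_ecb_enc in plain
 * C (not part of this file) - a four-block fast path while at least 64 bytes
 * remain, then a single-block tail. encrypt_block() is a hypothetical
 * stand-in for _aesni_enc4/_aesni_enc1.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	extern void encrypt_block(uint8_t *dst, const uint8_t *src);
 *
 *	static void ecb_encrypt(uint8_t *dst, const uint8_t *src, size_t len)
 *	{
 *		while (len >= 64) {			// 4 blocks per iteration
 *			for (int i = 0; i < 4; i++)
 *				encrypt_block(dst + 16 * i, src + 16 * i);
 *			dst += 64; src += 64; len -= 64;
 *		}
 *		while (len >= 16) {			// remaining single blocks
 *			encrypt_block(dst, src);
 *			dst += 16; src += 16; len -= 16;
 *		}
 *	}
 */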
2507
2508/*
2509 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2510 *		      size_t len);
2511 */
2512ENTRY(aesni_ecb_dec)
2513	FRAME_BEGIN
2514#ifndef __x86_64__
2515	pushl LEN
2516	pushl KEYP
2517	pushl KLEN
2518	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
2519	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
2520	movl (FRAME_OFFSET+24)(%esp), INP	# src
2521	movl (FRAME_OFFSET+28)(%esp), LEN	# len
2522#endif
2523	test LEN, LEN
2524	jz .Lecb_dec_ret
2525	mov 480(KEYP), KLEN
2526	add $240, KEYP
2527	cmp $16, LEN
2528	jb .Lecb_dec_ret
2529	cmp $64, LEN
2530	jb .Lecb_dec_loop1
2531.align 4
2532.Lecb_dec_loop4:
2533	movups (INP), STATE1
2534	movups 0x10(INP), STATE2
2535	movups 0x20(INP), STATE3
2536	movups 0x30(INP), STATE4
2537	call _aesni_dec4
2538	movups STATE1, (OUTP)
2539	movups STATE2, 0x10(OUTP)
2540	movups STATE3, 0x20(OUTP)
2541	movups STATE4, 0x30(OUTP)
2542	sub $64, LEN
2543	add $64, INP
2544	add $64, OUTP
2545	cmp $64, LEN
2546	jge .Lecb_dec_loop4
2547	cmp $16, LEN
2548	jb .Lecb_dec_ret
2549.align 4
2550.Lecb_dec_loop1:
2551	movups (INP), STATE1
2552	call _aesni_dec1
2553	movups STATE1, (OUTP)
2554	sub $16, LEN
2555	add $16, INP
2556	add $16, OUTP
2557	cmp $16, LEN
2558	jge .Lecb_dec_loop1
2559.Lecb_dec_ret:
2560#ifndef __x86_64__
2561	popl KLEN
2562	popl KEYP
2563	popl LEN
2564#endif
2565	FRAME_END
2566	ret
2567ENDPROC(aesni_ecb_dec)
2568
2569/*
2570 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2571 *		      size_t len, u8 *iv)
2572 */
2573ENTRY(aesni_cbc_enc)
2574	FRAME_BEGIN
2575#ifndef __x86_64__
2576	pushl IVP
2577	pushl LEN
2578	pushl KEYP
2579	pushl KLEN
2580	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2581	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2582	movl (FRAME_OFFSET+28)(%esp), INP	# src
2583	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2584	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2585#endif
2586	cmp $16, LEN
2587	jb .Lcbc_enc_ret
2588	mov 480(KEYP), KLEN
2589	movups (IVP), STATE	# load iv as initial state
2590.align 4
2591.Lcbc_enc_loop:
2592	movups (INP), IN	# load input
2593	pxor IN, STATE
2594	call _aesni_enc1
2595	movups STATE, (OUTP)	# store output
2596	sub $16, LEN
2597	add $16, INP
2598	add $16, OUTP
2599	cmp $16, LEN
2600	jge .Lcbc_enc_loop
2601	movups STATE, (IVP)
2602.Lcbc_enc_ret:
2603#ifndef __x86_64__
2604	popl KLEN
2605	popl KEYP
2606	popl LEN
2607	popl IVP
2608#endif
2609	FRAME_END
2610	ret
2611ENDPROC(aesni_cbc_enc)
2612
2613/*
2614 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2615 *		      size_t len, u8 *iv)
2616 */
2617ENTRY(aesni_cbc_dec)
2618	FRAME_BEGIN
2619#ifndef __x86_64__
2620	pushl IVP
2621	pushl LEN
2622	pushl KEYP
2623	pushl KLEN
2624	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2625	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2626	movl (FRAME_OFFSET+28)(%esp), INP	# src
2627	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2628	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2629#endif
2630	cmp $16, LEN
2631	jb .Lcbc_dec_just_ret
2632	mov 480(KEYP), KLEN
2633	add $240, KEYP
2634	movups (IVP), IV
2635	cmp $64, LEN
2636	jb .Lcbc_dec_loop1
2637.align 4
2638.Lcbc_dec_loop4:
2639	movups (INP), IN1
2640	movaps IN1, STATE1
2641	movups 0x10(INP), IN2
2642	movaps IN2, STATE2
2643#ifdef __x86_64__
2644	movups 0x20(INP), IN3
2645	movaps IN3, STATE3
2646	movups 0x30(INP), IN4
2647	movaps IN4, STATE4
2648#else
2649	movups 0x20(INP), IN1
2650	movaps IN1, STATE3
2651	movups 0x30(INP), IN2
2652	movaps IN2, STATE4
2653#endif
2654	call _aesni_dec4
2655	pxor IV, STATE1
2656#ifdef __x86_64__
2657	pxor IN1, STATE2
2658	pxor IN2, STATE3
2659	pxor IN3, STATE4
2660	movaps IN4, IV
2661#else
2662	pxor IN1, STATE4
2663	movaps IN2, IV
2664	movups (INP), IN1
2665	pxor IN1, STATE2
2666	movups 0x10(INP), IN2
2667	pxor IN2, STATE3
2668#endif
2669	movups STATE1, (OUTP)
2670	movups STATE2, 0x10(OUTP)
2671	movups STATE3, 0x20(OUTP)
2672	movups STATE4, 0x30(OUTP)
2673	sub $64, LEN
2674	add $64, INP
2675	add $64, OUTP
2676	cmp $64, LEN
2677	jge .Lcbc_dec_loop4
2678	cmp $16, LEN
2679	jb .Lcbc_dec_ret
2680.align 4
2681.Lcbc_dec_loop1:
2682	movups (INP), IN
2683	movaps IN, STATE
2684	call _aesni_dec1
2685	pxor IV, STATE
2686	movups STATE, (OUTP)
2687	movaps IN, IV
2688	sub $16, LEN
2689	add $16, INP
2690	add $16, OUTP
2691	cmp $16, LEN
2692	jge .Lcbc_dec_loop1
2693.Lcbc_dec_ret:
2694	movups IV, (IVP)
2695.Lcbc_dec_just_ret:
2696#ifndef __x86_64__
2697	popl KLEN
2698	popl KEYP
2699	popl LEN
2700	popl IVP
2701#endif
2702	FRAME_END
2703	ret
2704ENDPROC(aesni_cbc_dec)
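
/*
 * For reference only: the CBC-decrypt chaining done above, in plain C (not
 * part of this file). Each plaintext block is D(K, C_i) XOR C_{i-1} (with
 * C_0 = IV), and the last ciphertext block becomes the next IV - which is
 * why the code keeps the input blocks (IN1..IN4) around after _aesni_dec4.
 * decrypt_block() is a hypothetical stand-in for _aesni_dec1.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	extern void decrypt_block(uint8_t *dst, const uint8_t *src);
 *
 *	static void cbc_decrypt(uint8_t *dst, const uint8_t *src, size_t len,
 *				uint8_t iv[16])
 *	{
 *		uint8_t prev[16], cur[16];
 *
 *		memcpy(prev, iv, 16);
 *		for (size_t off = 0; off + 16 <= len; off += 16) {
 *			memcpy(cur, src + off, 16);	// keep C_i (in-place safe)
 *			decrypt_block(dst + off, cur);
 *			for (int i = 0; i < 16; i++)
 *				dst[off + i] ^= prev[i];
 *			memcpy(prev, cur, 16);
 *		}
 *		memcpy(iv, prev, 16);		// last ciphertext block -> IV
 *	}
 */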
2705
2706#ifdef __x86_64__
2707.pushsection .rodata
2708.align 16
2709.Lbswap_mask:
2710	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2711.popsection
2712
2713/*
2714 * _aesni_inc_init:	internal ABI
2715 *	setup registers used by _aesni_inc
2716 * input:
2717 *	IV
2718 * output:
2719 *	CTR:	== IV, in little endian
2720 *	TCTR_LOW: == lower qword of CTR
2721 *	INC:	== 1, in little endian
2722 *	BSWAP_MASK == endian swapping mask
2723 */
2724.align 4
2725_aesni_inc_init:
2726	movaps .Lbswap_mask, BSWAP_MASK
2727	movaps IV, CTR
2728	PSHUFB_XMM BSWAP_MASK CTR
2729	mov $1, TCTR_LOW
2730	MOVQ_R64_XMM TCTR_LOW INC
2731	MOVQ_R64_XMM CTR TCTR_LOW
2732	ret
2733ENDPROC(_aesni_inc_init)
2734
2735/*
2736 * _aesni_inc:		internal ABI
2737 *	Increase IV by 1, IV is in big endian
2738 * input:
2739 *	IV
2740 *	CTR:	== IV, in little endian
2741 *	TCTR_LOW: == lower qword of CTR
2742 *	INC:	== 1, in little endian
2743 *	BSWAP_MASK == endian swapping mask
2744 * output:
2745 *	IV:	increased by 1
2746 * changed:
2747 *	CTR:	== output IV, in little endian
2748 *	TCTR_LOW: == lower qword of CTR
2749 */
2750.align 4
2751_aesni_inc:
2752	paddq INC, CTR
2753	add $1, TCTR_LOW
2754	jnc .Linc_low
2755	pslldq $8, INC
2756	paddq INC, CTR
2757	psrldq $8, INC
2758.Linc_low:
2759	movaps CTR, IV
2760	PSHUFB_XMM BSWAP_MASK IV
2761	ret
2762ENDPROC(_aesni_inc)
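
/*
 * For reference only: a simplified C equivalent (not part of this file) of
 * the big-endian counter increment above. The assembly keeps the counter in
 * little-endian form and only folds a carry out of the low qword into the
 * high qword (the jnc .Linc_low fast path); byte-wise it amounts to:
 *
 *	#include <stdint.h>
 *
 *	static void ctr128_inc_be(uint8_t ctr[16])
 *	{
 *		// increment the big-endian value; stop once a byte doesn't wrap
 *		for (int i = 15; i >= 0; i--)
 *			if (++ctr[i] != 0)
 *				break;
 *	}
 */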
2763
2764/*
2765 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2766 *		      size_t len, u8 *iv)
2767 */
2768ENTRY(aesni_ctr_enc)
2769	FRAME_BEGIN
2770	cmp $16, LEN
2771	jb .Lctr_enc_just_ret
2772	mov 480(KEYP), KLEN
2773	movups (IVP), IV
2774	call _aesni_inc_init
2775	cmp $64, LEN
2776	jb .Lctr_enc_loop1
2777.align 4
2778.Lctr_enc_loop4:
2779	movaps IV, STATE1
2780	call _aesni_inc
2781	movups (INP), IN1
2782	movaps IV, STATE2
2783	call _aesni_inc
2784	movups 0x10(INP), IN2
2785	movaps IV, STATE3
2786	call _aesni_inc
2787	movups 0x20(INP), IN3
2788	movaps IV, STATE4
2789	call _aesni_inc
2790	movups 0x30(INP), IN4
2791	call _aesni_enc4
2792	pxor IN1, STATE1
2793	movups STATE1, (OUTP)
2794	pxor IN2, STATE2
2795	movups STATE2, 0x10(OUTP)
2796	pxor IN3, STATE3
2797	movups STATE3, 0x20(OUTP)
2798	pxor IN4, STATE4
2799	movups STATE4, 0x30(OUTP)
2800	sub $64, LEN
2801	add $64, INP
2802	add $64, OUTP
2803	cmp $64, LEN
2804	jge .Lctr_enc_loop4
2805	cmp $16, LEN
2806	jb .Lctr_enc_ret
2807.align 4
2808.Lctr_enc_loop1:
2809	movaps IV, STATE
2810	call _aesni_inc
2811	movups (INP), IN
2812	call _aesni_enc1
2813	pxor IN, STATE
2814	movups STATE, (OUTP)
2815	sub $16, LEN
2816	add $16, INP
2817	add $16, OUTP
2818	cmp $16, LEN
2819	jge .Lctr_enc_loop1
2820.Lctr_enc_ret:
2821	movups IV, (IVP)
2822.Lctr_enc_just_ret:
2823	FRAME_END
2824	ret
2825ENDPROC(aesni_ctr_enc)
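
/*
 * For reference only: the CTR-mode structure of aesni_ctr_enc in plain C
 * (not part of this file) - encrypt the current counter block, XOR the
 * keystream into the data, then bump the counter, as the code above does
 * per block. encrypt_block() and ctr128_inc_be() are hypothetical stand-ins
 * for _aesni_enc1/_aesni_enc4 and _aesni_inc.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	extern void encrypt_block(uint8_t *dst, const uint8_t *src);
 *	extern void ctr128_inc_be(uint8_t ctr[16]);
 *
 *	static void ctr_crypt(uint8_t *dst, const uint8_t *src, size_t len,
 *			      uint8_t ctr[16])
 *	{
 *		uint8_t ks[16];
 *
 *		for (size_t off = 0; off + 16 <= len; off += 16) {
 *			encrypt_block(ks, ctr);		// keystream = E(K, counter)
 *			ctr128_inc_be(ctr);
 *			for (int i = 0; i < 16; i++)
 *				dst[off + i] = src[off + i] ^ ks[i];
 *		}
 *	}
 */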
2826
2827/*
2828 * _aesni_gf128mul_x_ble:		internal ABI
2829 *	Multiply in GF(2^128) for XTS IVs
2830 * input:
2831 *	IV:	current IV
2832 *	GF128MUL_MASK == mask with 0x87 and 0x01
2833 * output:
2834 *	IV:	next IV
2835 * changed:
2836 *	CTR:	== temporary value
2837 */
2838#define _aesni_gf128mul_x_ble() \
2839	pshufd $0x13, IV, CTR; \
2840	paddq IV, IV; \
2841	psrad $31, CTR; \
2842	pand GF128MUL_MASK, CTR; \
2843	pxor CTR, IV;
2844
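/*
 * For reference only: the same GF(2^128) doubling of an XTS tweak in plain C
 * (not part of this file), on two little-endian 64-bit halves. The 0x87
 * reduction constant corresponds to the gf128mul_x_ble mask loaded into
 * GF128MUL_MASK.
 *
 *	#include <stdint.h>
 *
 *	static void gf128mul_x_ble(uint64_t t[2])	// t[0] = low, t[1] = high
 *	{
 *		uint64_t carry_out = t[1] >> 63;	// bit leaving the block
 *		uint64_t carry_mid = t[0] >> 63;	// bit moving low -> high
 *
 *		t[1] = (t[1] << 1) | carry_mid;
 *		t[0] = (t[0] << 1) ^ (carry_out ? 0x87 : 0);	// fold mod poly
 *	}
 */
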
2845/*
2846 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2847 *			 bool enc, u8 *iv)
2848 */
2849ENTRY(aesni_xts_crypt8)
2850	FRAME_BEGIN
2851	cmpb $0, %cl
2852	movl $0, %ecx
2853	movl $240, %r10d
2854	leaq _aesni_enc4, %r11
2855	leaq _aesni_dec4, %rax
2856	cmovel %r10d, %ecx
2857	cmoveq %rax, %r11
2858
2859	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2860	movups (IVP), IV
2861
2862	mov 480(KEYP), KLEN
2863	addq %rcx, KEYP
2864
2865	movdqa IV, STATE1
2866	movdqu 0x00(INP), INC
2867	pxor INC, STATE1
2868	movdqu IV, 0x00(OUTP)
2869
2870	_aesni_gf128mul_x_ble()
2871	movdqa IV, STATE2
2872	movdqu 0x10(INP), INC
2873	pxor INC, STATE2
2874	movdqu IV, 0x10(OUTP)
2875
2876	_aesni_gf128mul_x_ble()
2877	movdqa IV, STATE3
2878	movdqu 0x20(INP), INC
2879	pxor INC, STATE3
2880	movdqu IV, 0x20(OUTP)
2881
2882	_aesni_gf128mul_x_ble()
2883	movdqa IV, STATE4
2884	movdqu 0x30(INP), INC
2885	pxor INC, STATE4
2886	movdqu IV, 0x30(OUTP)
2887
2888	CALL_NOSPEC %r11
2889
2890	movdqu 0x00(OUTP), INC
2891	pxor INC, STATE1
2892	movdqu STATE1, 0x00(OUTP)
2893
2894	_aesni_gf128mul_x_ble()
2895	movdqa IV, STATE1
2896	movdqu 0x40(INP), INC
2897	pxor INC, STATE1
2898	movdqu IV, 0x40(OUTP)
2899
2900	movdqu 0x10(OUTP), INC
2901	pxor INC, STATE2
2902	movdqu STATE2, 0x10(OUTP)
2903
2904	_aesni_gf128mul_x_ble()
2905	movdqa IV, STATE2
2906	movdqu 0x50(INP), INC
2907	pxor INC, STATE2
2908	movdqu IV, 0x50(OUTP)
2909
2910	movdqu 0x20(OUTP), INC
2911	pxor INC, STATE3
2912	movdqu STATE3, 0x20(OUTP)
2913
2914	_aesni_gf128mul_x_ble()
2915	movdqa IV, STATE3
2916	movdqu 0x60(INP), INC
2917	pxor INC, STATE3
2918	movdqu IV, 0x60(OUTP)
2919
2920	movdqu 0x30(OUTP), INC
2921	pxor INC, STATE4
2922	movdqu STATE4, 0x30(OUTP)
2923
2924	_aesni_gf128mul_x_ble()
2925	movdqa IV, STATE4
2926	movdqu 0x70(INP), INC
2927	pxor INC, STATE4
2928	movdqu IV, 0x70(OUTP)
2929
2930	_aesni_gf128mul_x_ble()
2931	movups IV, (IVP)
2932
2933	CALL_NOSPEC %r11
2934
2935	movdqu 0x40(OUTP), INC
2936	pxor INC, STATE1
2937	movdqu STATE1, 0x40(OUTP)
2938
2939	movdqu 0x50(OUTP), INC
2940	pxor INC, STATE2
2941	movdqu STATE2, 0x50(OUTP)
2942
2943	movdqu 0x60(OUTP), INC
2944	pxor INC, STATE3
2945	movdqu STATE3, 0x60(OUTP)
2946
2947	movdqu 0x70(OUTP), INC
2948	pxor INC, STATE4
2949	movdqu STATE4, 0x70(OUTP)
2950
2951	FRAME_END
2952	ret
2953ENDPROC(aesni_xts_crypt8)
2954
2955#endif
2956