1/*
2 * Implement AES algorithm in Intel AES-NI instructions.
3 *
4 * The white paper of AES-NI instructions can be downloaded from:
5 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
6 *
7 * Copyright (C) 2008, Intel Corp.
8 *    Author: Huang Ying <ying.huang@intel.com>
9 *            Vinodh Gopal <vinodh.gopal@intel.com>
10 *            Kahraman Akdemir
11 *
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
16 *             Adrian Hoban <adrian.hoban@intel.com>
17 *             James Guilford (james.guilford@intel.com)
18 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
19 *             Tadeusz Struk (tadeusz.struk@intel.com)
20 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
21 *    Copyright (c) 2010, Intel Corporation.
22 *
23 * Ported x86_64 version to x86:
24 *    Author: Mathias Krause <minipli@googlemail.com>
25 *
26 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
30 */
31
32#include <linux/linkage.h>
33#include <asm/inst.h>
34#include <asm/frame.h>
35
36/*
37 * The following macros are used to move an (un)aligned 16 byte value to/from
38 * an XMM register.  This can be done for either FP or integer values; for FP
39 * use movaps (move aligned packed single) and for integer use movdqa (move
40 * double quadword aligned).  There has been no performance difference between
41 * the two since Nehalem (the original Core i7).  However, movaps is one byte
42 * shorter, so that is the one we use for now (same for the unaligned variants).
43 */
44#define MOVADQ	movaps
45#define MOVUDQ	movups
46
47#ifdef __x86_64__
48
49# constants in mergeable sections, linker can reorder and merge
50.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
51.align 16
52.Lgf128mul_x_ble_mask:
53	.octa 0x00000000000000010000000000000087
54.section	.rodata.cst16.POLY, "aM", @progbits, 16
55.align 16
56POLY:   .octa 0xC2000000000000000000000000000001
57.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
58.align 16
59TWOONE: .octa 0x00000001000000000000000000000001
60
61.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
62.align 16
63SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
64.section	.rodata.cst16.MASK1, "aM", @progbits, 16
65.align 16
66MASK1:      .octa 0x0000000000000000ffffffffffffffff
67.section	.rodata.cst16.MASK2, "aM", @progbits, 16
68.align 16
69MASK2:      .octa 0xffffffffffffffff0000000000000000
70.section	.rodata.cst16.ONE, "aM", @progbits, 16
71.align 16
72ONE:        .octa 0x00000000000000000000000000000001
73.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
74.align 16
75F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
76.section	.rodata.cst16.dec, "aM", @progbits, 16
77.align 16
78dec:        .octa 0x1
79.section	.rodata.cst16.enc, "aM", @progbits, 16
80.align 16
81enc:        .octa 0x2
82
83# The order of these constants must not change:
84# ALL_F must immediately follow SHIFT_MASK,
85# and the all-zero block must immediately follow ALL_F.
86.section	.rodata, "a", @progbits
87.align 16
88SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
89ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
90            .octa 0x00000000000000000000000000000000
91
92
93.text
94
95
96#define	STACK_OFFSET    8*3
97#define	HashKey		16*0	// store HashKey <<1 mod poly here
98#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
99#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
100#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
101#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
102				// bits of  HashKey <<1 mod poly here
103				//(for Karatsuba purposes)
104#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
105				// bits of  HashKey^2 <<1 mod poly here
106				// (for Karatsuba purposes)
107#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
108				// bits of  HashKey^3 <<1 mod poly here
109				// (for Karatsuba purposes)
110#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
111				// bits of  HashKey^4 <<1 mod poly here
112				// (for Karatsuba purposes)
113#define	VARIABLE_OFFSET	16*8
114
115#define arg1 rdi
116#define arg2 rsi
117#define arg3 rdx
118#define arg4 rcx
119#define arg5 r8
120#define arg6 r9
121#define arg7 STACK_OFFSET+8(%r14)
122#define arg8 STACK_OFFSET+16(%r14)
123#define arg9 STACK_OFFSET+24(%r14)
124#define arg10 STACK_OFFSET+32(%r14)
125#define keysize 2*15*16(%arg1)
126#endif
127
128
129#define STATE1	%xmm0
130#define STATE2	%xmm4
131#define STATE3	%xmm5
132#define STATE4	%xmm6
133#define STATE	STATE1
134#define IN1	%xmm1
135#define IN2	%xmm7
136#define IN3	%xmm8
137#define IN4	%xmm9
138#define IN	IN1
139#define KEY	%xmm2
140#define IV	%xmm3
141
142#define BSWAP_MASK %xmm10
143#define CTR	%xmm11
144#define INC	%xmm12
145
146#define GF128MUL_MASK %xmm10
147
148#ifdef __x86_64__
149#define AREG	%rax
150#define KEYP	%rdi
151#define OUTP	%rsi
152#define UKEYP	OUTP
153#define INP	%rdx
154#define LEN	%rcx
155#define IVP	%r8
156#define KLEN	%r9d
157#define T1	%r10
158#define TKEYP	T1
159#define T2	%r11
160#define TCTR_LOW T2
161#else
162#define AREG	%eax
163#define KEYP	%edi
164#define OUTP	AREG
165#define UKEYP	OUTP
166#define INP	%edx
167#define LEN	%esi
168#define IVP	%ebp
169#define KLEN	%ebx
170#define T1	%ecx
171#define TKEYP	T1
172#endif
173
174
175#ifdef __x86_64__
176/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
177*
178*
179* Input: A and B (128-bits each, bit-reflected)
180* Output: C = A*B*x mod poly, i.e. the product shifted right by one bit
181* To compute GH = GH*HashKey mod poly, pass HK = HashKey<<1 mod poly as input;
182* then GH = GH * HK * x mod poly, which is equivalent to GH*HashKey mod poly.
183*
184*/
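/*
* For reference, a rough C model of the Karatsuba split this macro performs
* (an illustrative sketch only, not part of the build; clmul64() stands in
* for a single PCLMULQDQ and all helper names here are made up):
*
*	#include <stdint.h>
*
*	typedef struct { uint64_t lo, hi; } u128;
*
*	// 64x64 -> 128-bit carry-less multiply, i.e. one PCLMULQDQ
*	static u128 clmul64(uint64_t a, uint64_t b)
*	{
*		u128 r = { 0, 0 };
*		int i;
*
*		for (i = 0; i < 64; i++) {
*			if ((b >> i) & 1) {
*				r.lo ^= a << i;
*				if (i)
*					r.hi ^= a >> (64 - i);
*			}
*		}
*		return r;
*	}
*
*	// 128x128 -> 256-bit carry-less product from three 64x64 multiplies:
*	// lo = a0*b0, hi = a1*b1, mid = (a0^a1)*(b0^b1) ^ lo ^ hi
*	static void clmul128(u128 a, u128 b, u128 *lo, u128 *hi)
*	{
*		u128 t0 = clmul64(a.lo, b.lo);
*		u128 t1 = clmul64(a.hi, b.hi);
*		u128 tm = clmul64(a.lo ^ a.hi, b.lo ^ b.hi);
*
*		tm.lo ^= t0.lo ^ t1.lo;
*		tm.hi ^= t0.hi ^ t1.hi;
*		lo->lo = t0.lo;
*		lo->hi = t0.hi ^ tm.lo;
*		hi->lo = t1.lo ^ tm.hi;
*		hi->hi = t1.hi;
*	}
*
* The 256-bit product is then reduced modulo x^128 + x^127 + x^126 + x^121 + 1
* in the two shift-and-xor phases marked in the macro body below.
*/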
185.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
186	movdqa	  \GH, \TMP1
187	pshufd	  $78, \GH, \TMP2
188	pshufd	  $78, \HK, \TMP3
189	pxor	  \GH, \TMP2            # TMP2 = a1+a0
190	pxor	  \HK, \TMP3            # TMP3 = b1+b0
191	PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
192	PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
193	PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
194	pxor	  \GH, \TMP2
195	pxor	  \TMP1, \TMP2          # TMP2 = (a1*b0)+(a0*b1)
196	movdqa	  \TMP2, \TMP3
197	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
198	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
199	pxor	  \TMP3, \GH
200	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
201
202        # first phase of the reduction
203
204	movdqa    \GH, \TMP2
205	movdqa    \GH, \TMP3
206	movdqa    \GH, \TMP4            # copy GH into TMP2, TMP3 and TMP4
207					# in order to perform
208					# independent shifts
209	pslld     $31, \TMP2            # packed left shift <<31
210	pslld     $30, \TMP3            # packed left shift <<30
211	pslld     $25, \TMP4            # packed left shift <<25
212	pxor      \TMP3, \TMP2          # xor the shifted versions
213	pxor      \TMP4, \TMP2
214	movdqa    \TMP2, \TMP5
215	psrldq    $4, \TMP5             # right shift TMP5 1 DW
216	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
217	pxor      \TMP2, \GH
218
219        # second phase of the reduction
220
221	movdqa    \GH,\TMP2             # copy GH into TMP2, TMP3 and TMP4
222					# in order to perform
223					# independent shifts
224	movdqa    \GH,\TMP3
225	movdqa    \GH,\TMP4
226	psrld     $1,\TMP2              # packed right shift >>1
227	psrld     $2,\TMP3              # packed right shift >>2
228	psrld     $7,\TMP4              # packed right shift >>7
229	pxor      \TMP3,\TMP2		# xor the shifted versions
230	pxor      \TMP4,\TMP2
231	pxor      \TMP5, \TMP2
232	pxor      \TMP2, \GH
233	pxor      \TMP1, \GH            # result is in GH
234.endm
235
236/*
237* if a = total number of plaintext bytes
238* and b = floor(a/16),
239* then num_initial_blocks = b mod 4;
240* decrypt the initial num_initial_blocks blocks and apply GHASH on
241* the ciphertext
242* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
243* are clobbered
244* %arg1, %arg2, %arg3 and %r14 are used as pointers only, not modified
245*/
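/*
* Worked example: for a 100-byte payload, a = 100 and b = floor(100/16) = 6,
* so num_initial_blocks = 6 mod 4 = 2.  This macro handles those two blocks
* (plus, when at least 64 bytes then remain, the first four blocks of the
* main stream in parallel with the HashKey precomputations); the trailing
* 4 bytes are handled later as the partial block.
*/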
246
247
248.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
249XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
250        MOVADQ     SHUF_MASK(%rip), %xmm14
251	mov	   arg7, %r10           # %r10 = AAD
252	mov	   arg8, %r12           # %r12 = aadLen
253	mov	   %r12, %r11
254	pxor	   %xmm\i, %xmm\i
255
256_get_AAD_loop\num_initial_blocks\operation:
257	movd	   (%r10), \TMP1
258	pslldq	   $12, \TMP1
259	psrldq	   $4, %xmm\i
260	pxor	   \TMP1, %xmm\i
261	add	   $4, %r10
262	sub	   $4, %r12
263	jne	   _get_AAD_loop\num_initial_blocks\operation
264
265	cmp	   $16, %r11
266	je	   _get_AAD_loop2_done\num_initial_blocks\operation
267
268	mov	   $16, %r12
269_get_AAD_loop2\num_initial_blocks\operation:
270	psrldq	   $4, %xmm\i
271	sub	   $4, %r12
272	cmp	   %r11, %r12
273	jne	   _get_AAD_loop2\num_initial_blocks\operation
274
275_get_AAD_loop2_done\num_initial_blocks\operation:
276	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
277
278	xor	   %r11, %r11 # initialise the data pointer offset as zero
279
280        # start AES for num_initial_blocks blocks
281
282	mov	   %arg5, %rax                      # %rax = *Y0
283	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
284	PSHUFB_XMM   %xmm14, \XMM0
285
286.if (\i == 5) || (\i == 6) || (\i == 7)
287	MOVADQ		ONE(%RIP),\TMP1
288	MOVADQ		(%arg1),\TMP2
289.irpc index, \i_seq
290	paddd	   \TMP1, \XMM0                 # INCR Y0
291	movdqa	   \XMM0, %xmm\index
292	PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
293	pxor	   \TMP2, %xmm\index
294.endr
295	lea	0x10(%arg1),%r10
296	mov	keysize,%eax
297	shr	$2,%eax				# 128->4, 192->6, 256->8
298	add	$5,%eax			      # 128->9, 192->11, 256->13
299
300aes_loop_initial_dec\num_initial_blocks:
301	MOVADQ	(%r10),\TMP1
302.irpc	index, \i_seq
303	AESENC	\TMP1, %xmm\index
304.endr
305	add	$16,%r10
306	sub	$1,%eax
307	jnz	aes_loop_initial_dec\num_initial_blocks
308
309	MOVADQ	(%r10), \TMP1
310.irpc index, \i_seq
311	AESENCLAST \TMP1, %xmm\index         # Last Round
312.endr
313.irpc index, \i_seq
314	movdqu	   (%arg3 , %r11, 1), \TMP1
315	pxor	   \TMP1, %xmm\index
316	movdqu	   %xmm\index, (%arg2 , %r11, 1)
317	# write back plaintext/ciphertext for num_initial_blocks
318	add	   $16, %r11
319
320	movdqa     \TMP1, %xmm\index
321	PSHUFB_XMM	   %xmm14, %xmm\index
322                # prepare plaintext/ciphertext for GHASH computation
323.endr
324.endif
325	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
326        # apply GHASH on num_initial_blocks blocks
327
328.if \i == 5
329        pxor       %xmm5, %xmm6
330	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
331        pxor       %xmm6, %xmm7
332	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
333        pxor       %xmm7, %xmm8
334	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
335.elseif \i == 6
336        pxor       %xmm6, %xmm7
337	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
338        pxor       %xmm7, %xmm8
339	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
340.elseif \i == 7
341        pxor       %xmm7, %xmm8
342	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
343.endif
344	cmp	   $64, %r13
345	jl	_initial_blocks_done\num_initial_blocks\operation
346	# no need for precomputed values
347/*
348*
349* Precomputations for HashKey parallel with encryption of first 4 blocks.
350* HashKey_i_k holds the XOR of the low and high 64-bit halves of HashKey_i
351*/
352	MOVADQ	   ONE(%rip), \TMP1
353	paddd	   \TMP1, \XMM0              # INCR Y0
354	MOVADQ	   \XMM0, \XMM1
355	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
356
357	paddd	   \TMP1, \XMM0              # INCR Y0
358	MOVADQ	   \XMM0, \XMM2
359	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
360
361	paddd	   \TMP1, \XMM0              # INCR Y0
362	MOVADQ	   \XMM0, \XMM3
363	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
364
365	paddd	   \TMP1, \XMM0              # INCR Y0
366	MOVADQ	   \XMM0, \XMM4
367	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
368
369	MOVADQ	   0(%arg1),\TMP1
370	pxor	   \TMP1, \XMM1
371	pxor	   \TMP1, \XMM2
372	pxor	   \TMP1, \XMM3
373	pxor	   \TMP1, \XMM4
374	movdqa	   \TMP3, \TMP5
375	pshufd	   $78, \TMP3, \TMP1
376	pxor	   \TMP3, \TMP1
377	movdqa	   \TMP1, HashKey_k(%rsp)
378	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
379# TMP5 = HashKey^2<<1 (mod poly)
380	movdqa	   \TMP5, HashKey_2(%rsp)
381# HashKey_2 = HashKey^2<<1 (mod poly)
382	pshufd	   $78, \TMP5, \TMP1
383	pxor	   \TMP5, \TMP1
384	movdqa	   \TMP1, HashKey_2_k(%rsp)
385.irpc index, 1234 # do 4 rounds
386	movaps 0x10*\index(%arg1), \TMP1
387	AESENC	   \TMP1, \XMM1
388	AESENC	   \TMP1, \XMM2
389	AESENC	   \TMP1, \XMM3
390	AESENC	   \TMP1, \XMM4
391.endr
392	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
393# TMP5 = HashKey^3<<1 (mod poly)
394	movdqa	   \TMP5, HashKey_3(%rsp)
395	pshufd	   $78, \TMP5, \TMP1
396	pxor	   \TMP5, \TMP1
397	movdqa	   \TMP1, HashKey_3_k(%rsp)
398.irpc index, 56789 # do next 5 rounds
399	movaps 0x10*\index(%arg1), \TMP1
400	AESENC	   \TMP1, \XMM1
401	AESENC	   \TMP1, \XMM2
402	AESENC	   \TMP1, \XMM3
403	AESENC	   \TMP1, \XMM4
404.endr
405	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
406# TMP5 = HashKey^4<<1 (mod poly)
407	movdqa	   \TMP5, HashKey_4(%rsp)
408	pshufd	   $78, \TMP5, \TMP1
409	pxor	   \TMP5, \TMP1
410	movdqa	   \TMP1, HashKey_4_k(%rsp)
411	lea	   0xa0(%arg1),%r10
412	mov	   keysize,%eax
413	shr	   $2,%eax			# 128->4, 192->6, 256->8
414	sub	   $4,%eax			# 128->0, 192->2, 256->4
415	jz	   aes_loop_pre_dec_done\num_initial_blocks
416
417aes_loop_pre_dec\num_initial_blocks:
418	MOVADQ	   (%r10),\TMP2
419.irpc	index, 1234
420	AESENC	   \TMP2, %xmm\index
421.endr
422	add	   $16,%r10
423	sub	   $1,%eax
424	jnz	   aes_loop_pre_dec\num_initial_blocks
425
426aes_loop_pre_dec_done\num_initial_blocks:
427	MOVADQ	   (%r10), \TMP2
428	AESENCLAST \TMP2, \XMM1
429	AESENCLAST \TMP2, \XMM2
430	AESENCLAST \TMP2, \XMM3
431	AESENCLAST \TMP2, \XMM4
432	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
433	pxor	   \TMP1, \XMM1
434	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
435	movdqa     \TMP1, \XMM1
436	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
437	pxor	   \TMP1, \XMM2
438	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
439	movdqa     \TMP1, \XMM2
440	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
441	pxor	   \TMP1, \XMM3
442	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
443	movdqa     \TMP1, \XMM3
444	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
445	pxor	   \TMP1, \XMM4
446	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
447	movdqa     \TMP1, \XMM4
448	add	   $64, %r11
449	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
450	pxor	   \XMMDst, \XMM1
451# combine GHASHed value with the corresponding ciphertext
452	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
453	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
454	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
455
456_initial_blocks_done\num_initial_blocks\operation:
457
458.endm
459
460
461/*
462* if a = total number of plaintext bytes
463* and b = floor(a/16),
464* then num_initial_blocks = b mod 4;
465* encrypt the initial num_initial_blocks blocks and apply GHASH on
466* the ciphertext
467* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
468* are clobbered
469* %arg1, %arg2, %arg3 and %r14 are used as pointers only, not modified
470*/
471
472
473.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
474XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
475        MOVADQ     SHUF_MASK(%rip), %xmm14
476	mov	   arg7, %r10           # %r10 = AAD
477	mov	   arg8, %r12           # %r12 = aadLen
478	mov	   %r12, %r11
479	pxor	   %xmm\i, %xmm\i
480_get_AAD_loop\num_initial_blocks\operation:
481	movd	   (%r10), \TMP1
482	pslldq	   $12, \TMP1
483	psrldq	   $4, %xmm\i
484	pxor	   \TMP1, %xmm\i
485	add	   $4, %r10
486	sub	   $4, %r12
487	jne	   _get_AAD_loop\num_initial_blocks\operation
488	cmp	   $16, %r11
489	je	   _get_AAD_loop2_done\num_initial_blocks\operation
490	mov	   $16, %r12
491_get_AAD_loop2\num_initial_blocks\operation:
492	psrldq	   $4, %xmm\i
493	sub	   $4, %r12
494	cmp	   %r11, %r12
495	jne	   _get_AAD_loop2\num_initial_blocks\operation
496_get_AAD_loop2_done\num_initial_blocks\operation:
497	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
498
499	xor	   %r11, %r11 # initialise the data pointer offset as zero
500
501        # start AES for num_initial_blocks blocks
502
503	mov	   %arg5, %rax                      # %rax = *Y0
504	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
505	PSHUFB_XMM   %xmm14, \XMM0
506
507.if (\i == 5) || (\i == 6) || (\i == 7)
508
509	MOVADQ		ONE(%RIP),\TMP1
510	MOVADQ		0(%arg1),\TMP2
511.irpc index, \i_seq
512	paddd		\TMP1, \XMM0                 # INCR Y0
513	MOVADQ		\XMM0, %xmm\index
514	PSHUFB_XMM	%xmm14, %xmm\index      # perform a 16 byte swap
515	pxor		\TMP2, %xmm\index
516.endr
517	lea	0x10(%arg1),%r10
518	mov	keysize,%eax
519	shr	$2,%eax				# 128->4, 192->6, 256->8
520	add	$5,%eax			      # 128->9, 192->11, 256->13
521
522aes_loop_initial_enc\num_initial_blocks:
523	MOVADQ	(%r10),\TMP1
524.irpc	index, \i_seq
525	AESENC	\TMP1, %xmm\index
526.endr
527	add	$16,%r10
528	sub	$1,%eax
529	jnz	aes_loop_initial_enc\num_initial_blocks
530
531	MOVADQ	(%r10), \TMP1
532.irpc index, \i_seq
533	AESENCLAST \TMP1, %xmm\index         # Last Round
534.endr
535.irpc index, \i_seq
536	movdqu	   (%arg3 , %r11, 1), \TMP1
537	pxor	   \TMP1, %xmm\index
538	movdqu	   %xmm\index, (%arg2 , %r11, 1)
539	# write back plaintext/ciphertext for num_initial_blocks
540	add	   $16, %r11
541	PSHUFB_XMM	   %xmm14, %xmm\index
542
543		# prepare plaintext/ciphertext for GHASH computation
544.endr
545.endif
546	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
547        # apply GHASH on num_initial_blocks blocks
548
549.if \i == 5
550        pxor       %xmm5, %xmm6
551	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
552        pxor       %xmm6, %xmm7
553	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
554        pxor       %xmm7, %xmm8
555	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
556.elseif \i == 6
557        pxor       %xmm6, %xmm7
558	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
559        pxor       %xmm7, %xmm8
560	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
561.elseif \i == 7
562        pxor       %xmm7, %xmm8
563	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
564.endif
565	cmp	   $64, %r13
566	jl	_initial_blocks_done\num_initial_blocks\operation
567	# no need for precomputed values
568/*
569*
570* Precomputations for HashKey parallel with encryption of first 4 blocks.
571* HashKey_i_k holds the XOR of the low and high 64-bit halves of HashKey_i
572*/
573	MOVADQ	   ONE(%RIP),\TMP1
574	paddd	   \TMP1, \XMM0              # INCR Y0
575	MOVADQ	   \XMM0, \XMM1
576	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
577
578	paddd	   \TMP1, \XMM0              # INCR Y0
579	MOVADQ	   \XMM0, \XMM2
580	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
581
582	paddd	   \TMP1, \XMM0              # INCR Y0
583	MOVADQ	   \XMM0, \XMM3
584	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
585
586	paddd	   \TMP1, \XMM0              # INCR Y0
587	MOVADQ	   \XMM0, \XMM4
588	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
589
590	MOVADQ	   0(%arg1),\TMP1
591	pxor	   \TMP1, \XMM1
592	pxor	   \TMP1, \XMM2
593	pxor	   \TMP1, \XMM3
594	pxor	   \TMP1, \XMM4
595	movdqa	   \TMP3, \TMP5
596	pshufd	   $78, \TMP3, \TMP1
597	pxor	   \TMP3, \TMP1
598	movdqa	   \TMP1, HashKey_k(%rsp)
599	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
600# TMP5 = HashKey^2<<1 (mod poly)
601	movdqa	   \TMP5, HashKey_2(%rsp)
602# HashKey_2 = HashKey^2<<1 (mod poly)
603	pshufd	   $78, \TMP5, \TMP1
604	pxor	   \TMP5, \TMP1
605	movdqa	   \TMP1, HashKey_2_k(%rsp)
606.irpc index, 1234 # do 4 rounds
607	movaps 0x10*\index(%arg1), \TMP1
608	AESENC	   \TMP1, \XMM1
609	AESENC	   \TMP1, \XMM2
610	AESENC	   \TMP1, \XMM3
611	AESENC	   \TMP1, \XMM4
612.endr
613	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
614# TMP5 = HashKey^3<<1 (mod poly)
615	movdqa	   \TMP5, HashKey_3(%rsp)
616	pshufd	   $78, \TMP5, \TMP1
617	pxor	   \TMP5, \TMP1
618	movdqa	   \TMP1, HashKey_3_k(%rsp)
619.irpc index, 56789 # do next 5 rounds
620	movaps 0x10*\index(%arg1), \TMP1
621	AESENC	   \TMP1, \XMM1
622	AESENC	   \TMP1, \XMM2
623	AESENC	   \TMP1, \XMM3
624	AESENC	   \TMP1, \XMM4
625.endr
626	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
627# TMP5 = HashKey^4<<1 (mod poly)
628	movdqa	   \TMP5, HashKey_4(%rsp)
629	pshufd	   $78, \TMP5, \TMP1
630	pxor	   \TMP5, \TMP1
631	movdqa	   \TMP1, HashKey_4_k(%rsp)
632	lea	   0xa0(%arg1),%r10
633	mov	   keysize,%eax
634	shr	   $2,%eax			# 128->4, 192->6, 256->8
635	sub	   $4,%eax			# 128->0, 192->2, 256->4
636	jz	   aes_loop_pre_enc_done\num_initial_blocks
637
638aes_loop_pre_enc\num_initial_blocks:
639	MOVADQ	   (%r10),\TMP2
640.irpc	index, 1234
641	AESENC	   \TMP2, %xmm\index
642.endr
643	add	   $16,%r10
644	sub	   $1,%eax
645	jnz	   aes_loop_pre_enc\num_initial_blocks
646
647aes_loop_pre_enc_done\num_initial_blocks:
648	MOVADQ	   (%r10), \TMP2
649	AESENCLAST \TMP2, \XMM1
650	AESENCLAST \TMP2, \XMM2
651	AESENCLAST \TMP2, \XMM3
652	AESENCLAST \TMP2, \XMM4
653	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
654	pxor	   \TMP1, \XMM1
655	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
656	pxor	   \TMP1, \XMM2
657	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
658	pxor	   \TMP1, \XMM3
659	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
660	pxor	   \TMP1, \XMM4
661	movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
662	movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
663	movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
664	movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
665
666	add	   $64, %r11
667	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
668	pxor	   \XMMDst, \XMM1
669# combine GHASHed value with the corresponding ciphertext
670	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
671	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
672	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
673
674_initial_blocks_done\num_initial_blocks\operation:
675
676.endm
677
678/*
679* encrypt 4 blocks at a time
680* GHASH the 4 previously encrypted ciphertext blocks
681* %arg1, %arg2, %arg3 are used as pointers only, not modified
682* %r11 is the data offset value
683*/
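/*
* Note: each iteration folds the four saved ciphertext blocks into the GHASH
* state using the precomputed powers of the hash subkey H (with all
* operations in GF(2^128), C1..C4 the previous four ciphertext blocks and
* C1 already XORed with the running GHASH value):
*
*	acc' = C1*H^4 ^ C2*H^3 ^ C3*H^2 ^ C4*H
*
* which is equivalent to applying GHASH_MUL to each block in sequence, but
* lets the multiplications interleave with the AES rounds on the counters.
*/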
684.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
685TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
686
687	movdqa	  \XMM1, \XMM5
688	movdqa	  \XMM2, \XMM6
689	movdqa	  \XMM3, \XMM7
690	movdqa	  \XMM4, \XMM8
691
692        movdqa    SHUF_MASK(%rip), %xmm15
693        # multiply XMM5 * HashKey_4 using Karatsuba
694
695	movdqa	  \XMM5, \TMP4
696	pshufd	  $78, \XMM5, \TMP6
697	pxor	  \XMM5, \TMP6
698	paddd     ONE(%rip), \XMM0		# INCR CNT
699	movdqa	  HashKey_4(%rsp), \TMP5
700	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
701	movdqa    \XMM0, \XMM1
702	paddd     ONE(%rip), \XMM0		# INCR CNT
703	movdqa    \XMM0, \XMM2
704	paddd     ONE(%rip), \XMM0		# INCR CNT
705	movdqa    \XMM0, \XMM3
706	paddd     ONE(%rip), \XMM0		# INCR CNT
707	movdqa    \XMM0, \XMM4
708	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
709	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
710	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
711	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
712	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
713
714	pxor	  (%arg1), \XMM1
715	pxor	  (%arg1), \XMM2
716	pxor	  (%arg1), \XMM3
717	pxor	  (%arg1), \XMM4
718	movdqa	  HashKey_4_k(%rsp), \TMP5
719	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
720	movaps 0x10(%arg1), \TMP1
721	AESENC	  \TMP1, \XMM1              # Round 1
722	AESENC	  \TMP1, \XMM2
723	AESENC	  \TMP1, \XMM3
724	AESENC	  \TMP1, \XMM4
725	movaps 0x20(%arg1), \TMP1
726	AESENC	  \TMP1, \XMM1              # Round 2
727	AESENC	  \TMP1, \XMM2
728	AESENC	  \TMP1, \XMM3
729	AESENC	  \TMP1, \XMM4
730	movdqa	  \XMM6, \TMP1
731	pshufd	  $78, \XMM6, \TMP2
732	pxor	  \XMM6, \TMP2
733	movdqa	  HashKey_3(%rsp), \TMP5
734	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
735	movaps 0x30(%arg1), \TMP3
736	AESENC    \TMP3, \XMM1              # Round 3
737	AESENC    \TMP3, \XMM2
738	AESENC    \TMP3, \XMM3
739	AESENC    \TMP3, \XMM4
740	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
741	movaps 0x40(%arg1), \TMP3
742	AESENC	  \TMP3, \XMM1              # Round 4
743	AESENC	  \TMP3, \XMM2
744	AESENC	  \TMP3, \XMM3
745	AESENC	  \TMP3, \XMM4
746	movdqa	  HashKey_3_k(%rsp), \TMP5
747	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
748	movaps 0x50(%arg1), \TMP3
749	AESENC	  \TMP3, \XMM1              # Round 5
750	AESENC	  \TMP3, \XMM2
751	AESENC	  \TMP3, \XMM3
752	AESENC	  \TMP3, \XMM4
753	pxor	  \TMP1, \TMP4
754# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
755	pxor	  \XMM6, \XMM5
756	pxor	  \TMP2, \TMP6
757	movdqa	  \XMM7, \TMP1
758	pshufd	  $78, \XMM7, \TMP2
759	pxor	  \XMM7, \TMP2
760	movdqa	  HashKey_2(%rsp ), \TMP5
761
762        # Multiply XMM7 * HashKey_2 using Karatsuba
763
764	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
765	movaps 0x60(%arg1), \TMP3
766	AESENC	  \TMP3, \XMM1              # Round 6
767	AESENC	  \TMP3, \XMM2
768	AESENC	  \TMP3, \XMM3
769	AESENC	  \TMP3, \XMM4
770	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
771	movaps 0x70(%arg1), \TMP3
772	AESENC	  \TMP3, \XMM1             # Round 7
773	AESENC	  \TMP3, \XMM2
774	AESENC	  \TMP3, \XMM3
775	AESENC	  \TMP3, \XMM4
776	movdqa	  HashKey_2_k(%rsp), \TMP5
777	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
778	movaps 0x80(%arg1), \TMP3
779	AESENC	  \TMP3, \XMM1             # Round 8
780	AESENC	  \TMP3, \XMM2
781	AESENC	  \TMP3, \XMM3
782	AESENC	  \TMP3, \XMM4
783	pxor	  \TMP1, \TMP4
784# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
785	pxor	  \XMM7, \XMM5
786	pxor	  \TMP2, \TMP6
787
788        # Multiply XMM8 * HashKey
789        # XMM8 and TMP5 hold the values for the two operands
790
791	movdqa	  \XMM8, \TMP1
792	pshufd	  $78, \XMM8, \TMP2
793	pxor	  \XMM8, \TMP2
794	movdqa	  HashKey(%rsp), \TMP5
795	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
796	movaps 0x90(%arg1), \TMP3
797	AESENC	  \TMP3, \XMM1            # Round 9
798	AESENC	  \TMP3, \XMM2
799	AESENC	  \TMP3, \XMM3
800	AESENC	  \TMP3, \XMM4
801	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
802	lea	  0xa0(%arg1),%r10
803	mov	  keysize,%eax
804	shr	  $2,%eax			# 128->4, 192->6, 256->8
805	sub	  $4,%eax			# 128->0, 192->2, 256->4
806	jz	  aes_loop_par_enc_done
807
808aes_loop_par_enc:
809	MOVADQ	  (%r10),\TMP3
810.irpc	index, 1234
811	AESENC	  \TMP3, %xmm\index
812.endr
813	add	  $16,%r10
814	sub	  $1,%eax
815	jnz	  aes_loop_par_enc
816
817aes_loop_par_enc_done:
818	MOVADQ	  (%r10), \TMP3
819	AESENCLAST \TMP3, \XMM1           # last round
820	AESENCLAST \TMP3, \XMM2
821	AESENCLAST \TMP3, \XMM3
822	AESENCLAST \TMP3, \XMM4
823	movdqa    HashKey_k(%rsp), \TMP5
824	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
825	movdqu	  (%arg3,%r11,1), \TMP3
826	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
827	movdqu	  16(%arg3,%r11,1), \TMP3
828	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
829	movdqu	  32(%arg3,%r11,1), \TMP3
830	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
831	movdqu	  48(%arg3,%r11,1), \TMP3
832	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
833        movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
834        movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
835        movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
836        movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
837	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
838	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
839	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
840	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
841
842	pxor	  \TMP4, \TMP1
843	pxor	  \XMM8, \XMM5
844	pxor	  \TMP6, \TMP2
845	pxor	  \TMP1, \TMP2
846	pxor	  \XMM5, \TMP2
847	movdqa	  \TMP2, \TMP3
848	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
849	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
850	pxor	  \TMP3, \XMM5
851	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
852
853        # first phase of reduction
854
855	movdqa    \XMM5, \TMP2
856	movdqa    \XMM5, \TMP3
857	movdqa    \XMM5, \TMP4
858# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
859	pslld     $31, \TMP2                   # packed left shift << 31
860	pslld     $30, \TMP3                   # packed left shift << 30
861	pslld     $25, \TMP4                   # packed left shift << 25
862	pxor      \TMP3, \TMP2	               # xor the shifted versions
863	pxor      \TMP4, \TMP2
864	movdqa    \TMP2, \TMP5
865	psrldq    $4, \TMP5                    # right shift T5 1 DW
866	pslldq    $12, \TMP2                   # left shift T2 3 DWs
867	pxor      \TMP2, \XMM5
868
869        # second phase of reduction
870
871	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
872	movdqa    \XMM5,\TMP3
873	movdqa    \XMM5,\TMP4
874	psrld     $1, \TMP2                    # packed right shift >>1
875	psrld     $2, \TMP3                    # packed right shift >>2
876	psrld     $7, \TMP4                    # packed right shift >>7
877	pxor      \TMP3,\TMP2		       # xor the shifted versions
878	pxor      \TMP4,\TMP2
879	pxor      \TMP5, \TMP2
880	pxor      \TMP2, \XMM5
881	pxor      \TMP1, \XMM5                 # result is in XMM5
882
883	pxor	  \XMM5, \XMM1
884.endm
885
886/*
887* decrypt 4 blocks at a time
888* GHASH the 4 ciphertext blocks decrypted in the previous iteration
889* %arg1, %arg2, %arg3 are used as pointers only, not modified
890* %r11 is the data offset value
891*/
892.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
893TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
894
895	movdqa	  \XMM1, \XMM5
896	movdqa	  \XMM2, \XMM6
897	movdqa	  \XMM3, \XMM7
898	movdqa	  \XMM4, \XMM8
899
900        movdqa    SHUF_MASK(%rip), %xmm15
901        # multiply XMM5 * HashKey_4 using Karatsuba
902
903	movdqa	  \XMM5, \TMP4
904	pshufd	  $78, \XMM5, \TMP6
905	pxor	  \XMM5, \TMP6
906	paddd     ONE(%rip), \XMM0		# INCR CNT
907	movdqa	  HashKey_4(%rsp), \TMP5
908	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
909	movdqa    \XMM0, \XMM1
910	paddd     ONE(%rip), \XMM0		# INCR CNT
911	movdqa    \XMM0, \XMM2
912	paddd     ONE(%rip), \XMM0		# INCR CNT
913	movdqa    \XMM0, \XMM3
914	paddd     ONE(%rip), \XMM0		# INCR CNT
915	movdqa    \XMM0, \XMM4
916	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
917	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
918	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
919	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
920	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
921
922	pxor	  (%arg1), \XMM1
923	pxor	  (%arg1), \XMM2
924	pxor	  (%arg1), \XMM3
925	pxor	  (%arg1), \XMM4
926	movdqa	  HashKey_4_k(%rsp), \TMP5
927	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
928	movaps 0x10(%arg1), \TMP1
929	AESENC	  \TMP1, \XMM1              # Round 1
930	AESENC	  \TMP1, \XMM2
931	AESENC	  \TMP1, \XMM3
932	AESENC	  \TMP1, \XMM4
933	movaps 0x20(%arg1), \TMP1
934	AESENC	  \TMP1, \XMM1              # Round 2
935	AESENC	  \TMP1, \XMM2
936	AESENC	  \TMP1, \XMM3
937	AESENC	  \TMP1, \XMM4
938	movdqa	  \XMM6, \TMP1
939	pshufd	  $78, \XMM6, \TMP2
940	pxor	  \XMM6, \TMP2
941	movdqa	  HashKey_3(%rsp), \TMP5
942	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
943	movaps 0x30(%arg1), \TMP3
944	AESENC    \TMP3, \XMM1              # Round 3
945	AESENC    \TMP3, \XMM2
946	AESENC    \TMP3, \XMM3
947	AESENC    \TMP3, \XMM4
948	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
949	movaps 0x40(%arg1), \TMP3
950	AESENC	  \TMP3, \XMM1              # Round 4
951	AESENC	  \TMP3, \XMM2
952	AESENC	  \TMP3, \XMM3
953	AESENC	  \TMP3, \XMM4
954	movdqa	  HashKey_3_k(%rsp), \TMP5
955	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
956	movaps 0x50(%arg1), \TMP3
957	AESENC	  \TMP3, \XMM1              # Round 5
958	AESENC	  \TMP3, \XMM2
959	AESENC	  \TMP3, \XMM3
960	AESENC	  \TMP3, \XMM4
961	pxor	  \TMP1, \TMP4
962# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
963	pxor	  \XMM6, \XMM5
964	pxor	  \TMP2, \TMP6
965	movdqa	  \XMM7, \TMP1
966	pshufd	  $78, \XMM7, \TMP2
967	pxor	  \XMM7, \TMP2
968	movdqa	  HashKey_2(%rsp ), \TMP5
969
970        # Multiply XMM7 * HashKey_2 using Karatsuba
971
972	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
973	movaps 0x60(%arg1), \TMP3
974	AESENC	  \TMP3, \XMM1              # Round 6
975	AESENC	  \TMP3, \XMM2
976	AESENC	  \TMP3, \XMM3
977	AESENC	  \TMP3, \XMM4
978	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
979	movaps 0x70(%arg1), \TMP3
980	AESENC	  \TMP3, \XMM1             # Round 7
981	AESENC	  \TMP3, \XMM2
982	AESENC	  \TMP3, \XMM3
983	AESENC	  \TMP3, \XMM4
984	movdqa	  HashKey_2_k(%rsp), \TMP5
985	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
986	movaps 0x80(%arg1), \TMP3
987	AESENC	  \TMP3, \XMM1             # Round 8
988	AESENC	  \TMP3, \XMM2
989	AESENC	  \TMP3, \XMM3
990	AESENC	  \TMP3, \XMM4
991	pxor	  \TMP1, \TMP4
992# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
993	pxor	  \XMM7, \XMM5
994	pxor	  \TMP2, \TMP6
995
996        # Multiply XMM8 * HashKey
997        # XMM8 and TMP5 hold the values for the two operands
998
999	movdqa	  \XMM8, \TMP1
1000	pshufd	  $78, \XMM8, \TMP2
1001	pxor	  \XMM8, \TMP2
1002	movdqa	  HashKey(%rsp), \TMP5
1003	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
1004	movaps 0x90(%arg1), \TMP3
1005	AESENC	  \TMP3, \XMM1            # Round 9
1006	AESENC	  \TMP3, \XMM2
1007	AESENC	  \TMP3, \XMM3
1008	AESENC	  \TMP3, \XMM4
1009	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
1010	lea	  0xa0(%arg1),%r10
1011	mov	  keysize,%eax
1012	shr	  $2,%eax		        # 128->4, 192->6, 256->8
1013	sub	  $4,%eax			# 128->0, 192->2, 256->4
1014	jz	  aes_loop_par_dec_done
1015
1016aes_loop_par_dec:
1017	MOVADQ	  (%r10),\TMP3
1018.irpc	index, 1234
1019	AESENC	  \TMP3, %xmm\index
1020.endr
1021	add	  $16,%r10
1022	sub	  $1,%eax
1023	jnz	  aes_loop_par_dec
1024
1025aes_loop_par_dec_done:
1026	MOVADQ	  (%r10), \TMP3
1027	AESENCLAST \TMP3, \XMM1           # last round
1028	AESENCLAST \TMP3, \XMM2
1029	AESENCLAST \TMP3, \XMM3
1030	AESENCLAST \TMP3, \XMM4
1031	movdqa    HashKey_k(%rsp), \TMP5
1032	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1033	movdqu	  (%arg3,%r11,1), \TMP3
1034	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1035	movdqu	  \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
1036	movdqa    \TMP3, \XMM1
1037	movdqu	  16(%arg3,%r11,1), \TMP3
1038	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1039	movdqu	  \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
1040	movdqa    \TMP3, \XMM2
1041	movdqu	  32(%arg3,%r11,1), \TMP3
1042	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1043	movdqu	  \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
1044	movdqa    \TMP3, \XMM3
1045	movdqu	  48(%arg3,%r11,1), \TMP3
1046	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1047	movdqu	  \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
1048	movdqa    \TMP3, \XMM4
1049	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1050	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
1051	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
1052	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
1053
1054	pxor	  \TMP4, \TMP1
1055	pxor	  \XMM8, \XMM5
1056	pxor	  \TMP6, \TMP2
1057	pxor	  \TMP1, \TMP2
1058	pxor	  \XMM5, \TMP2
1059	movdqa	  \TMP2, \TMP3
1060	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1061	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1062	pxor	  \TMP3, \XMM5
1063	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1064
1065        # first phase of reduction
1066
1067	movdqa    \XMM5, \TMP2
1068	movdqa    \XMM5, \TMP3
1069	movdqa    \XMM5, \TMP4
1070# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1071	pslld     $31, \TMP2                   # packed left shift << 31
1072	pslld     $30, \TMP3                   # packed left shift << 30
1073	pslld     $25, \TMP4                   # packed left shift << 25
1074	pxor      \TMP3, \TMP2	               # xor the shifted versions
1075	pxor      \TMP4, \TMP2
1076	movdqa    \TMP2, \TMP5
1077	psrldq    $4, \TMP5                    # right shift T5 1 DW
1078	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1079	pxor      \TMP2, \XMM5
1080
1081        # second phase of reduction
1082
1083	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1084	movdqa    \XMM5,\TMP3
1085	movdqa    \XMM5,\TMP4
1086	psrld     $1, \TMP2                    # packed right shift >>1
1087	psrld     $2, \TMP3                    # packed right shift >>2
1088	psrld     $7, \TMP4                    # packed right shift >>7
1089	pxor      \TMP3,\TMP2		       # xor the shifted versions
1090	pxor      \TMP4,\TMP2
1091	pxor      \TMP5, \TMP2
1092	pxor      \TMP2, \XMM5
1093	pxor      \TMP1, \XMM5                 # result is in XMM5
1094
1095	pxor	  \XMM5, \XMM1
1096.endm
1097
1098/* GHASH the last 4 ciphertext blocks. */
1099.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1100TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1101
1102        # Multiply XMM1 * HashKey_4 (using Karatsuba)
1103
1104	movdqa	  \XMM1, \TMP6
1105	pshufd	  $78, \XMM1, \TMP2
1106	pxor	  \XMM1, \TMP2
1107	movdqa	  HashKey_4(%rsp), \TMP5
1108	PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1109	PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1110	movdqa	  HashKey_4_k(%rsp), \TMP4
1111	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1112	movdqa	  \XMM1, \XMMDst
1113	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1114
1115        # Multiply XMM2 * HashKey_3 (using Karatsuba)
1116
1117	movdqa	  \XMM2, \TMP1
1118	pshufd	  $78, \XMM2, \TMP2
1119	pxor	  \XMM2, \TMP2
1120	movdqa	  HashKey_3(%rsp), \TMP5
1121	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1122	PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1123	movdqa	  HashKey_3_k(%rsp), \TMP4
1124	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1125	pxor	  \TMP1, \TMP6
1126	pxor	  \XMM2, \XMMDst
1127	pxor	  \TMP2, \XMM1
1128# results accumulated in TMP6, XMMDst, XMM1
1129
1130        # Multiply XMM3 * HashKey_2 (using Karatsuba)
1131
1132	movdqa	  \XMM3, \TMP1
1133	pshufd	  $78, \XMM3, \TMP2
1134	pxor	  \XMM3, \TMP2
1135	movdqa	  HashKey_2(%rsp), \TMP5
1136	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1137	PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1138	movdqa	  HashKey_2_k(%rsp), \TMP4
1139	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1140	pxor	  \TMP1, \TMP6
1141	pxor	  \XMM3, \XMMDst
1142	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1143
1144        # Multiply XMM4 * HashKey (using Karatsuba)
1145	movdqa	  \XMM4, \TMP1
1146	pshufd	  $78, \XMM4, \TMP2
1147	pxor	  \XMM4, \TMP2
1148	movdqa	  HashKey(%rsp), \TMP5
1149	PCLMULQDQ 0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
1150	PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1151	movdqa	  HashKey_k(%rsp), \TMP4
1152	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1153	pxor	  \TMP1, \TMP6
1154	pxor	  \XMM4, \XMMDst
1155	pxor	  \XMM1, \TMP2
1156	pxor	  \TMP6, \TMP2
1157	pxor	  \XMMDst, \TMP2
1158	# middle section of the temp results combined as in karatsuba algorithm
1159	movdqa	  \TMP2, \TMP4
1160	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
1161	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
1162	pxor	  \TMP4, \XMMDst
1163	pxor	  \TMP2, \TMP6
1164# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1165	# first phase of the reduction
1166	movdqa    \XMMDst, \TMP2
1167	movdqa    \XMMDst, \TMP3
1168	movdqa    \XMMDst, \TMP4
1169# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1170	pslld     $31, \TMP2                # packed left shift << 31
1171	pslld     $30, \TMP3                # packed left shift << 30
1172	pslld     $25, \TMP4                # packed left shift << 25
1173	pxor      \TMP3, \TMP2              # xor the shifted versions
1174	pxor      \TMP4, \TMP2
1175	movdqa    \TMP2, \TMP7
1176	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1177	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1178	pxor      \TMP2, \XMMDst
1179
1180        # second phase of the reduction
1181	movdqa    \XMMDst, \TMP2
1182	# make 3 copies of XMMDst for doing 3 shift operations
1183	movdqa    \XMMDst, \TMP3
1184	movdqa    \XMMDst, \TMP4
1185	psrld     $1, \TMP2                 # packed right shift >> 1
1186	psrld     $2, \TMP3                 # packed right shift >> 2
1187	psrld     $7, \TMP4                 # packed right shift >> 7
1188	pxor      \TMP3, \TMP2              # xor the shifted versions
1189	pxor      \TMP4, \TMP2
1190	pxor      \TMP7, \TMP2
1191	pxor      \TMP2, \XMMDst
1192	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1193.endm
1194
1195
1196/* Encryption of a single block
1197* uses %eax and %r10
1198*/
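/*
* The round count below is derived from the key length stored in the key
* schedule: keysize is the key length in bytes (16/24/32), so keysize/4 + 5
* gives 9, 11 or 13 AESENC rounds for AES-128/192/256, followed by one
* AESENCLAST.  As a hedged C equivalent:
*
*	nrounds = keysize / 4 + 5;	// 16 -> 9, 24 -> 11, 32 -> 13
*/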
1199
1200.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1201
1202	pxor		(%arg1), \XMM0
1203	mov		keysize,%eax
1204	shr		$2,%eax			# 128->4, 192->6, 256->8
1205	add		$5,%eax			# 128->9, 192->11, 256->13
1206	lea		16(%arg1), %r10	  # get first expanded key address
1207
1208_esb_loop_\@:
1209	MOVADQ		(%r10),\TMP1
1210	AESENC		\TMP1,\XMM0
1211	add		$16,%r10
1212	sub		$1,%eax
1213	jnz		_esb_loop_\@
1214
1215	MOVADQ		(%r10),\TMP1
1216	AESENCLAST	\TMP1,\XMM0
1217.endm
1218/*****************************************************************************
1219* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1220*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
1221*                   const u8 *in,      // Ciphertext input
1222*                   u64 plaintext_len, // Length of data in bytes for decryption.
1223*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1224*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1225*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1226*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1227*                   const u8 *aad,     // Additional Authentication Data (AAD)
1228*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1229*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1230*                                      // given authentication tag and only return the plaintext if they match.
1231*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1232*                                      // (most likely), 12 or 8.
1233*
1234* Assumptions:
1235*
1236* keys:
1237*       keys are pre-expanded and aligned to 16 bytes. We use the first
1238*       set of 11 round keys in the data structure void *aes_ctx
1239*
1240* iv:
1241*       0                   1                   2                   3
1242*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1243*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1244*       |                             Salt  (From the SA)               |
1245*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1246*       |                     Initialization Vector                     |
1247*       |         (This is the sequence number from IPSec header)       |
1248*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1249*       |                              0x1                              |
1250*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1251*
1252*
1253*
1254* AAD:
1255*       AAD padded to 128 bits with 0
1256*       for example, assume AAD is a u32 vector
1257*
1258*       if AAD is 8 bytes:
1259*       AAD[2] = {A0, A1};
1260*       padded AAD in xmm register = {A1 A0 0 0}
1261*
1262*       0                   1                   2                   3
1263*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1264*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1265*       |                               SPI (A1)                        |
1266*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1267*       |                     32-bit Sequence Number (A0)               |
1268*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1269*       |                              0x0                              |
1270*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1271*
1272*                                       AAD Format with 32-bit Sequence Number
1273*
1274*       if AAD is 12 bytes:
1275*       AAD[3] = {A0, A1, A2};
1276*       padded AAD in xmm register = {A2 A1 A0 0}
1277*
1278*       0                   1                   2                   3
1279*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1280*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1283*       |                               SPI (A2)                        |
1284*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1285*       |                 64-bit Extended Sequence Number {A1,A0}       |
1286*       |                                                               |
1287*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1288*       |                              0x0                              |
1289*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1290*
1291*                        AAD Format with 64-bit Extended Sequence Number
1292*
1293* aadLen:
1294*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
1295*       The code also supports 16; for any other size the code will fail.
1296*
1297* TLen:
1298*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1299*       For other sizes, the code will fail.
1300*
1301* poly = x^128 + x^127 + x^126 + x^121 + 1
1302*
1303*****************************************************************************/
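/*
* Hedged usage sketch from the caller's side (illustrative only; buffer,
* key-schedule and hash-subkey setup are assumed to have been done by the
* glue code, and the variable names here are made up):
*
*	u8 auth_tag[16];
*
*	aesni_gcm_dec(aes_ctx, plaintext, ciphertext, plaintext_len,
*		      iv, hash_subkey, aad, aad_len, auth_tag, 16);
*	// the caller compares auth_tag with the tag received on the wire
*	// and discards the plaintext on mismatch
*/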
1304ENTRY(aesni_gcm_dec)
1305	push	%r12
1306	push	%r13
1307	push	%r14
1308	mov	%rsp, %r14
1309/*
1310* states of %xmm registers %xmm6:%xmm15 not saved
1311* all %xmm registers are clobbered
1312*/
1313	sub	$VARIABLE_OFFSET, %rsp
1314	and	$~63, %rsp                        # align rsp to 64 bytes
1315	mov	%arg6, %r12
1316	movdqu	(%r12), %xmm13			  # %xmm13 = HashKey
1317        movdqa  SHUF_MASK(%rip), %xmm2
1318	PSHUFB_XMM %xmm2, %xmm13
1319
1320
1321# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1322
1323	movdqa	%xmm13, %xmm2
1324	psllq	$1, %xmm13
1325	psrlq	$63, %xmm2
1326	movdqa	%xmm2, %xmm1
1327	pslldq	$8, %xmm2
1328	psrldq	$8, %xmm1
1329	por	%xmm2, %xmm13
1330
1331        # Reduction
1332
1333	pshufd	$0x24, %xmm1, %xmm2
1334	pcmpeqd TWOONE(%rip), %xmm2
1335	pand	POLY(%rip), %xmm2
1336	pxor	%xmm2, %xmm13     # %xmm13 holds the HashKey<<1 (mod poly)
1337
1338
1339        # Decrypt first few blocks
1340
1341	movdqa %xmm13, HashKey(%rsp)           # store HashKey<<1 (mod poly)
1342	mov %arg4, %r13    # save the number of bytes of plaintext/ciphertext
1343	and $-16, %r13                      # %r13 = %r13 - (%r13 mod 16)
1344	mov %r13, %r12
1345	and $(3<<4), %r12
1346	jz _initial_num_blocks_is_0_decrypt
1347	cmp $(2<<4), %r12
1348	jb _initial_num_blocks_is_1_decrypt
1349	je _initial_num_blocks_is_2_decrypt
1350_initial_num_blocks_is_3_decrypt:
1351	INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1352%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1353	sub	$48, %r13
1354	jmp	_initial_blocks_decrypted
1355_initial_num_blocks_is_2_decrypt:
1356	INITIAL_BLOCKS_DEC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1357%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1358	sub	$32, %r13
1359	jmp	_initial_blocks_decrypted
1360_initial_num_blocks_is_1_decrypt:
1361	INITIAL_BLOCKS_DEC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1362%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1363	sub	$16, %r13
1364	jmp	_initial_blocks_decrypted
1365_initial_num_blocks_is_0_decrypt:
1366	INITIAL_BLOCKS_DEC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1367%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1368_initial_blocks_decrypted:
1369	cmp	$0, %r13
1370	je	_zero_cipher_left_decrypt
1371	sub	$64, %r13
1372	je	_four_cipher_left_decrypt
1373_decrypt_by_4:
1374	GHASH_4_ENCRYPT_4_PARALLEL_DEC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1375%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1376	add	$64, %r11
1377	sub	$64, %r13
1378	jne	_decrypt_by_4
1379_four_cipher_left_decrypt:
1380	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1381%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1382_zero_cipher_left_decrypt:
1383	mov	%arg4, %r13
1384	and	$15, %r13				# %r13 = arg4 (mod 16)
1385	je	_multiple_of_16_bytes_decrypt
1386
1387        # Handle the last <16 byte block separately
1388
1389	paddd ONE(%rip), %xmm0         # increment CNT to get Yn
1390        movdqa SHUF_MASK(%rip), %xmm10
1391	PSHUFB_XMM %xmm10, %xmm0
1392
1393	ENCRYPT_SINGLE_BLOCK  %xmm0, %xmm1    # E(K, Yn)
1394	sub $16, %r11
1395	add %r13, %r11
1396	movdqu (%arg3,%r11,1), %xmm1   # receive the last <16 byte block
1397	lea SHIFT_MASK+16(%rip), %r12
1398	sub %r13, %r12
1399# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1400# (%r13 is the number of bytes in plaintext mod 16)
1401	movdqu (%r12), %xmm2           # get the appropriate shuffle mask
1402	PSHUFB_XMM %xmm2, %xmm1            # right shift 16-%r13 bytes
1403
1404	movdqa  %xmm1, %xmm2
1405	pxor %xmm1, %xmm0            # Ciphertext XOR E(K, Yn)
1406	movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1407	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1408	pand %xmm1, %xmm0            # mask out top 16-%r13 bytes of %xmm0
1409	pand    %xmm1, %xmm2
1410        movdqa SHUF_MASK(%rip), %xmm10
1411	PSHUFB_XMM %xmm10 ,%xmm2
1412
1413	pxor %xmm2, %xmm8
1414	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1415	          # GHASH computation for the last <16 byte block
1416	sub %r13, %r11
1417	add $16, %r11
1418
1419        # output %r13 bytes
1420	MOVQ_R64_XMM	%xmm0, %rax
1421	cmp	$8, %r13
1422	jle	_less_than_8_bytes_left_decrypt
1423	mov	%rax, (%arg2 , %r11, 1)
1424	add	$8, %r11
1425	psrldq	$8, %xmm0
1426	MOVQ_R64_XMM	%xmm0, %rax
1427	sub	$8, %r13
1428_less_than_8_bytes_left_decrypt:
1429	mov	%al,  (%arg2, %r11, 1)
1430	add	$1, %r11
1431	shr	$8, %rax
1432	sub	$1, %r13
1433	jne	_less_than_8_bytes_left_decrypt
1434_multiple_of_16_bytes_decrypt:
1435	mov	arg8, %r12		  # %r12 = aadLen (number of bytes)
1436	shl	$3, %r12		  # convert into number of bits
1437	movd	%r12d, %xmm15		  # len(A) in %xmm15
1438	shl	$3, %arg4		  # len(C) in bits (*8)
1439	MOVQ_R64_XMM	%arg4, %xmm1
1440	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
1441	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
1442	pxor	%xmm15, %xmm8
1443	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1444	         # final GHASH computation
1445        movdqa SHUF_MASK(%rip), %xmm10
1446	PSHUFB_XMM %xmm10, %xmm8
1447
1448	mov	%arg5, %rax		  # %rax = *Y0
1449	movdqu	(%rax), %xmm0		  # %xmm0 = Y0
1450	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
1451	pxor	%xmm8, %xmm0
1452_return_T_decrypt:
1453	mov	arg9, %r10                # %r10 = authTag
1454	mov	arg10, %r11               # %r11 = auth_tag_len
1455	cmp	$16, %r11
1456	je	_T_16_decrypt
1457	cmp	$12, %r11
1458	je	_T_12_decrypt
1459_T_8_decrypt:
1460	MOVQ_R64_XMM	%xmm0, %rax
1461	mov	%rax, (%r10)
1462	jmp	_return_T_done_decrypt
1463_T_12_decrypt:
1464	MOVQ_R64_XMM	%xmm0, %rax
1465	mov	%rax, (%r10)
1466	psrldq	$8, %xmm0
1467	movd	%xmm0, %eax
1468	mov	%eax, 8(%r10)
1469	jmp	_return_T_done_decrypt
1470_T_16_decrypt:
1471	movdqu	%xmm0, (%r10)
1472_return_T_done_decrypt:
1473	mov	%r14, %rsp
1474	pop	%r14
1475	pop	%r13
1476	pop	%r12
1477	ret
1478ENDPROC(aesni_gcm_dec)
1479
1480
1481/*****************************************************************************
1482* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1483*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1484*                    const u8 *in,       // Plaintext input
1485*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1486*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1487*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1488*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1489*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1490*                    const u8 *aad,      // Additional Authentication Data (AAD)
1491*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1492*                    u8 *auth_tag,       // Authenticated Tag output.
1493*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1494*                                        // 12 or 8.
1495*
1496* Assumptions:
1497*
1498*       keys are pre-expanded and aligned to 16 bytes. We use the
1499*       first set of 11 round keys in the data structure void *aes_ctx
1500*       first set of 11 keys in the data structure void *aes_ctx
1501*
1502*
1503* iv:
1504*       0                   1                   2                   3
1505*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1506*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1507*       |                             Salt  (From the SA)               |
1508*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1509*       |                     Initialization Vector                     |
1510*       |         (This is the sequence number from IPSec header)       |
1511*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1512*       |                              0x1                              |
1513*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1514*
1515*
1516*
1517* AAD:
1518*       AAD padded to 128 bits with 0
1519*       for example, assume AAD is a u32 vector
1520*
1521*       if AAD is 8 bytes:
1522*       AAD[2] = {A0, A1};
1523*       padded AAD in xmm register = {A1 A0 0 0}
1524*
1525*       0                   1                   2                   3
1526*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1527*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1528*       |                               SPI (A1)                        |
1529*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1530*       |                     32-bit Sequence Number (A0)               |
1531*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1532*       |                              0x0                              |
1533*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1534*
1535*                                 AAD Format with 32-bit Sequence Number
1536*
1537*       if AAD is 12 bytes:
1538*       AAD[3] = {A0, A1, A2};
1539*       padded AAD in xmm register = {A2 A1 A0 0}
1540*
1541*       0                   1                   2                   3
1542*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1543*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1544*       |                               SPI (A2)                        |
1545*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1546*       |                 64-bit Extended Sequence Number {A1,A0}       |
1547*       |                                                               |
1548*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1549*       |                              0x0                              |
1550*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1551*
1552*                         AAD Format with 64-bit Extended Sequence Number
1553*
1554* aadLen:
1555*       per the RFC 4106 specification, aadLen can only be 8 or 12 bytes.
1556*       The code also supports an aadLen of 16; for any other size it will fail.
1557*
1558* TLen:
1559*       per the RFC 4106 specification, TLen can only be 8, 12 or 16 bytes.
1560*       For any other size, the code will fail.
1561*
1562* poly = x^128 + x^127 + x^126 + x^121 + 1
1563***************************************************************************/
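/*
 * For illustration only: a minimal C sketch of how a hypothetical RFC4106
 * caller could assemble the pre-counter block (j0) and the zero-padded AAD
 * block in the formats documented above.  The helper names are assumptions
 * made for this sketch and are not part of this file's interface.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	// j0 = 4-byte salt || 8-byte IV || 0x00000001 (big endian)
 *	static void build_j0(uint8_t j0[16], const uint8_t salt[4],
 *			     const uint8_t iv[8])
 *	{
 *		memcpy(j0, salt, 4);
 *		memcpy(j0 + 4, iv, 8);
 *		j0[12] = 0x00;
 *		j0[13] = 0x00;
 *		j0[14] = 0x00;
 *		j0[15] = 0x01;
 *	}
 *
 *	// pad the 8- or 12-byte AAD with zeroes to one 16-byte block
 *	static void pad_aad(uint8_t padded[16], const uint8_t *aad,
 *			    size_t aad_len)
 *	{
 *		memset(padded, 0, 16);
 *		memcpy(padded, aad, aad_len);
 *	}
 */
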
1564ENTRY(aesni_gcm_enc)
1565	push	%r12
1566	push	%r13
1567	push	%r14
1568	mov	%rsp, %r14
1569#
1570# states of %xmm registers %xmm6:%xmm15 not saved
1571# all %xmm registers are clobbered
1572#
1573	sub	$VARIABLE_OFFSET, %rsp
1574	and	$~63, %rsp
1575	mov	%arg6, %r12
1576	movdqu	(%r12), %xmm13
1577        movdqa  SHUF_MASK(%rip), %xmm2
1578	PSHUFB_XMM %xmm2, %xmm13
1579
1580
1581# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1582
1583	movdqa	%xmm13, %xmm2
1584	psllq	$1, %xmm13
1585	psrlq	$63, %xmm2
1586	movdqa	%xmm2, %xmm1
1587	pslldq	$8, %xmm2
1588	psrldq	$8, %xmm1
1589	por	%xmm2, %xmm13
1590
1591        # reduce HashKey<<1
1592
1593	pshufd	$0x24, %xmm1, %xmm2
1594	pcmpeqd TWOONE(%rip), %xmm2
1595	pand	POLY(%rip), %xmm2
1596	pxor	%xmm2, %xmm13
1597	movdqa	%xmm13, HashKey(%rsp)
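
/*
 * The sequence above doubles the byte-swapped hash subkey H in GF(2^128)
 * and conditionally XORs in POLY when a bit falls off the top, yielding
 * HashKey<<1 (mod poly).  A rough C sketch of the same step, treating the
 * 128-bit value as two little-endian 64-bit halves (illustrative only,
 * not part of this file's interface):
 *
 *	// h[0] = low qword, h[1] = high qword of the byte-swapped H
 *	static void ghash_key_shift_left(uint64_t h[2])
 *	{
 *		uint64_t carry_out = h[1] >> 63;   // bit shifted off the top
 *
 *		h[1] = (h[1] << 1) | (h[0] >> 63); // propagate low->high carry
 *		h[0] <<= 1;
 *		if (carry_out) {                   // conditional reduction (POLY)
 *			h[0] ^= 0x0000000000000001ULL;
 *			h[1] ^= 0xC200000000000000ULL;
 *		}
 *	}
 */
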
1598	mov	%arg4, %r13            # %xmm13 holds HashKey<<1 (mod poly)
1599	and	$-16, %r13
1600	mov	%r13, %r12
1601
1602        # Encrypt first few blocks
1603
1604	and	$(3<<4), %r12
1605	jz	_initial_num_blocks_is_0_encrypt
1606	cmp	$(2<<4), %r12
1607	jb	_initial_num_blocks_is_1_encrypt
1608	je	_initial_num_blocks_is_2_encrypt
1609_initial_num_blocks_is_3_encrypt:
1610	INITIAL_BLOCKS_ENC	3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1611%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1612	sub	$48, %r13
1613	jmp	_initial_blocks_encrypted
1614_initial_num_blocks_is_2_encrypt:
1615	INITIAL_BLOCKS_ENC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1616%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1617	sub	$32, %r13
1618	jmp	_initial_blocks_encrypted
1619_initial_num_blocks_is_1_encrypt:
1620	INITIAL_BLOCKS_ENC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1621%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1622	sub	$16, %r13
1623	jmp	_initial_blocks_encrypted
1624_initial_num_blocks_is_0_encrypt:
1625	INITIAL_BLOCKS_ENC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1626%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1627_initial_blocks_encrypted:
1628
1629        # Main loop - Encrypt remaining blocks
1630
1631	cmp	$0, %r13
1632	je	_zero_cipher_left_encrypt
1633	sub	$64, %r13
1634	je	_four_cipher_left_encrypt
1635_encrypt_by_4_encrypt:
1636	GHASH_4_ENCRYPT_4_PARALLEL_ENC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1637%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1638	add	$64, %r11
1639	sub	$64, %r13
1640	jne	_encrypt_by_4_encrypt
1641_four_cipher_left_encrypt:
1642	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1643%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1644_zero_cipher_left_encrypt:
1645	mov	%arg4, %r13
1646	and	$15, %r13			# %r13 = arg4 (mod 16)
1647	je	_multiple_of_16_bytes_encrypt
1648
1649         # Handle the last <16 Byte block separately
1650	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
1651        movdqa SHUF_MASK(%rip), %xmm10
1652	PSHUFB_XMM %xmm10, %xmm0
1653
1654
1655	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
1656	sub $16, %r11
1657	add %r13, %r11
1658	movdqu (%arg3,%r11,1), %xmm1     # receive the last <16 byte blocks
1659	lea SHIFT_MASK+16(%rip), %r12
1660	sub %r13, %r12
1661	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1662	# (%r13 is the number of bytes in plaintext mod 16)
1663	movdqu	(%r12), %xmm2           # get the appropriate shuffle mask
1664	PSHUFB_XMM	%xmm2, %xmm1            # shift right 16-r13 bytes
1665	pxor	%xmm1, %xmm0            # Plaintext XOR Encrypt(K, Yn)
1666	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
1667	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
1668	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
1669        movdqa SHUF_MASK(%rip), %xmm10
1670	PSHUFB_XMM %xmm10,%xmm0
1671
1672	pxor	%xmm0, %xmm8
1673	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1674	# GHASH computation for the last <16 byte block
1675	sub	%r13, %r11
1676	add	$16, %r11
1677
1678	movdqa SHUF_MASK(%rip), %xmm10
1679	PSHUFB_XMM %xmm10, %xmm0
1680
1681	# shuffle xmm0 back to output as ciphertext
1682
1683        # Output %r13 bytes
1684	MOVQ_R64_XMM %xmm0, %rax
1685	cmp $8, %r13
1686	jle _less_than_8_bytes_left_encrypt
1687	mov %rax, (%arg2 , %r11, 1)
1688	add $8, %r11
1689	psrldq $8, %xmm0
1690	MOVQ_R64_XMM %xmm0, %rax
1691	sub $8, %r13
1692_less_than_8_bytes_left_encrypt:
1693	mov %al,  (%arg2, %r11, 1)
1694	add $1, %r11
1695	shr $8, %rax
1696	sub $1, %r13
1697	jne _less_than_8_bytes_left_encrypt
1698_multiple_of_16_bytes_encrypt:
1699	mov	arg8, %r12    # %r12 = aadLen (number of bytes)
1700	shl	$3, %r12
1701	movd	%r12d, %xmm15       # len(A) in %xmm15
1702	shl	$3, %arg4               # len(C) in bits (*8)
1703	MOVQ_R64_XMM	%arg4, %xmm1
1704	pslldq	$8, %xmm15          # %xmm15 = len(A)||0x0000000000000000
1705	pxor	%xmm1, %xmm15       # %xmm15 = len(A)||len(C)
1706	pxor	%xmm15, %xmm8
1707	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1708	# final GHASH computation
1709        movdqa SHUF_MASK(%rip), %xmm10
1710	PSHUFB_XMM %xmm10, %xmm8         # perform a 16 byte swap
1711
1712	mov	%arg5, %rax		       # %rax  = *Y0
1713	movdqu	(%rax), %xmm0		       # %xmm0 = Y0
1714	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm15         # Encrypt(K, Y0)
1715	pxor	%xmm8, %xmm0
1716_return_T_encrypt:
1717	mov	arg9, %r10                     # %r10 = authTag
1718	mov	arg10, %r11                    # %r11 = auth_tag_len
1719	cmp	$16, %r11
1720	je	_T_16_encrypt
1721	cmp	$12, %r11
1722	je	_T_12_encrypt
1723_T_8_encrypt:
1724	MOVQ_R64_XMM	%xmm0, %rax
1725	mov	%rax, (%r10)
1726	jmp	_return_T_done_encrypt
1727_T_12_encrypt:
1728	MOVQ_R64_XMM	%xmm0, %rax
1729	mov	%rax, (%r10)
1730	psrldq	$8, %xmm0
1731	movd	%xmm0, %eax
1732	mov	%eax, 8(%r10)
1733	jmp	_return_T_done_encrypt
1734_T_16_encrypt:
1735	movdqu	%xmm0, (%r10)
1736_return_T_done_encrypt:
1737	mov	%r14, %rsp
1738	pop	%r14
1739	pop	%r13
1740	pop	%r12
1741	ret
1742ENDPROC(aesni_gcm_enc)
1743
1744#endif
1745
1746
1747.align 4
1748_key_expansion_128:
1749_key_expansion_256a:
1750	pshufd $0b11111111, %xmm1, %xmm1
1751	shufps $0b00010000, %xmm0, %xmm4
1752	pxor %xmm4, %xmm0
1753	shufps $0b10001100, %xmm0, %xmm4
1754	pxor %xmm4, %xmm0
1755	pxor %xmm1, %xmm0
1756	movaps %xmm0, (TKEYP)
1757	add $0x10, TKEYP
1758	ret
1759ENDPROC(_key_expansion_128)
1760ENDPROC(_key_expansion_256a)
1761
1762.align 4
1763_key_expansion_192a:
1764	pshufd $0b01010101, %xmm1, %xmm1
1765	shufps $0b00010000, %xmm0, %xmm4
1766	pxor %xmm4, %xmm0
1767	shufps $0b10001100, %xmm0, %xmm4
1768	pxor %xmm4, %xmm0
1769	pxor %xmm1, %xmm0
1770
1771	movaps %xmm2, %xmm5
1772	movaps %xmm2, %xmm6
1773	pslldq $4, %xmm5
1774	pshufd $0b11111111, %xmm0, %xmm3
1775	pxor %xmm3, %xmm2
1776	pxor %xmm5, %xmm2
1777
1778	movaps %xmm0, %xmm1
1779	shufps $0b01000100, %xmm0, %xmm6
1780	movaps %xmm6, (TKEYP)
1781	shufps $0b01001110, %xmm2, %xmm1
1782	movaps %xmm1, 0x10(TKEYP)
1783	add $0x20, TKEYP
1784	ret
1785ENDPROC(_key_expansion_192a)
1786
1787.align 4
1788_key_expansion_192b:
1789	pshufd $0b01010101, %xmm1, %xmm1
1790	shufps $0b00010000, %xmm0, %xmm4
1791	pxor %xmm4, %xmm0
1792	shufps $0b10001100, %xmm0, %xmm4
1793	pxor %xmm4, %xmm0
1794	pxor %xmm1, %xmm0
1795
1796	movaps %xmm2, %xmm5
1797	pslldq $4, %xmm5
1798	pshufd $0b11111111, %xmm0, %xmm3
1799	pxor %xmm3, %xmm2
1800	pxor %xmm5, %xmm2
1801
1802	movaps %xmm0, (TKEYP)
1803	add $0x10, TKEYP
1804	ret
1805ENDPROC(_key_expansion_192b)
1806
1807.align 4
1808_key_expansion_256b:
1809	pshufd $0b10101010, %xmm1, %xmm1
1810	shufps $0b00010000, %xmm2, %xmm4
1811	pxor %xmm4, %xmm2
1812	shufps $0b10001100, %xmm2, %xmm4
1813	pxor %xmm4, %xmm2
1814	pxor %xmm1, %xmm2
1815	movaps %xmm2, (TKEYP)
1816	add $0x10, TKEYP
1817	ret
1818ENDPROC(_key_expansion_256b)
1819
1820/*
1821 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1822 *                   unsigned int key_len)
1823 */
1824ENTRY(aesni_set_key)
1825	FRAME_BEGIN
1826#ifndef __x86_64__
1827	pushl KEYP
1828	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
1829	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
1830	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
1831#endif
1832	movups (UKEYP), %xmm0		# user key (first 16 bytes)
1833	movaps %xmm0, (KEYP)
1834	lea 0x10(KEYP), TKEYP		# key addr
1835	movl %edx, 480(KEYP)
1836	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
1837	cmp $24, %dl
1838	jb .Lenc_key128
1839	je .Lenc_key192
1840	movups 0x10(UKEYP), %xmm2	# other user key
1841	movaps %xmm2, (TKEYP)
1842	add $0x10, TKEYP
1843	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
1844	call _key_expansion_256a
1845	AESKEYGENASSIST 0x1 %xmm0 %xmm1
1846	call _key_expansion_256b
1847	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
1848	call _key_expansion_256a
1849	AESKEYGENASSIST 0x2 %xmm0 %xmm1
1850	call _key_expansion_256b
1851	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
1852	call _key_expansion_256a
1853	AESKEYGENASSIST 0x4 %xmm0 %xmm1
1854	call _key_expansion_256b
1855	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
1856	call _key_expansion_256a
1857	AESKEYGENASSIST 0x8 %xmm0 %xmm1
1858	call _key_expansion_256b
1859	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
1860	call _key_expansion_256a
1861	AESKEYGENASSIST 0x10 %xmm0 %xmm1
1862	call _key_expansion_256b
1863	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
1864	call _key_expansion_256a
1865	AESKEYGENASSIST 0x20 %xmm0 %xmm1
1866	call _key_expansion_256b
1867	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
1868	call _key_expansion_256a
1869	jmp .Ldec_key
1870.Lenc_key192:
1871	movq 0x10(UKEYP), %xmm2		# other user key
1872	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
1873	call _key_expansion_192a
1874	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
1875	call _key_expansion_192b
1876	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
1877	call _key_expansion_192a
1878	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
1879	call _key_expansion_192b
1880	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
1881	call _key_expansion_192a
1882	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
1883	call _key_expansion_192b
1884	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
1885	call _key_expansion_192a
1886	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
1887	call _key_expansion_192b
1888	jmp .Ldec_key
1889.Lenc_key128:
1890	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
1891	call _key_expansion_128
1892	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
1893	call _key_expansion_128
1894	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
1895	call _key_expansion_128
1896	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
1897	call _key_expansion_128
1898	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
1899	call _key_expansion_128
1900	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
1901	call _key_expansion_128
1902	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
1903	call _key_expansion_128
1904	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
1905	call _key_expansion_128
1906	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
1907	call _key_expansion_128
1908	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
1909	call _key_expansion_128
1910.Ldec_key:
1911	sub $0x10, TKEYP
1912	movaps (KEYP), %xmm0
1913	movaps (TKEYP), %xmm1
1914	movaps %xmm0, 240(TKEYP)
1915	movaps %xmm1, 240(KEYP)
1916	add $0x10, KEYP
1917	lea 240-16(TKEYP), UKEYP
1918.align 4
1919.Ldec_key_loop:
1920	movaps (KEYP), %xmm0
1921	AESIMC %xmm0 %xmm1
1922	movaps %xmm1, (UKEYP)
1923	add $0x10, KEYP
1924	sub $0x10, UKEYP
1925	cmp TKEYP, KEYP
1926	jb .Ldec_key_loop
1927	xor AREG, AREG
1928#ifndef __x86_64__
1929	popl KEYP
1930#endif
1931	FRAME_END
1932	ret
1933ENDPROC(aesni_set_key)
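
/*
 * The fixed offsets used above (0x10 bytes per round key, the inverse
 * schedule built in the region starting at byte 240, and the key length
 * stored at byte 480) assume a context laid out roughly as sketched
 * below.  This is an illustrative assumption for the reader, not an
 * authoritative definition of struct crypto_aes_ctx:
 *
 *	#include <stdint.h>
 *
 *	struct aes_ctx_layout {              // assumed layout behind KEYP
 *		uint32_t key_enc[60];        // up to 15 16-byte round keys
 *		uint32_t key_dec[60];        // AESIMC'd keys, offset 240
 *		uint32_t key_length;         // 16, 24 or 32, offset 480
 *	};
 */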
1934
1935/*
1936 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1937 */
1938ENTRY(aesni_enc)
1939	FRAME_BEGIN
1940#ifndef __x86_64__
1941	pushl KEYP
1942	pushl KLEN
1943	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
1944	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
1945	movl (FRAME_OFFSET+20)(%esp), INP	# src
1946#endif
1947	movl 480(KEYP), KLEN		# key length
1948	movups (INP), STATE		# input
1949	call _aesni_enc1
1950	movups STATE, (OUTP)		# output
1951#ifndef __x86_64__
1952	popl KLEN
1953	popl KEYP
1954#endif
1955	FRAME_END
1956	ret
1957ENDPROC(aesni_enc)
1958
1959/*
1960 * _aesni_enc1:		internal ABI
1961 * input:
1962 *	KEYP:		key struct pointer
1963 *	KLEN:		key length
1964 *	STATE:		initial state (input)
1965 * output:
1966 *	STATE:		final state (output)
1967 * changed:
1968 *	KEY
1969 *	TKEYP (T1)
1970 */
1971.align 4
1972_aesni_enc1:
1973	movaps (KEYP), KEY		# key
1974	mov KEYP, TKEYP
1975	pxor KEY, STATE		# round 0
1976	add $0x30, TKEYP
1977	cmp $24, KLEN
1978	jb .Lenc128
1979	lea 0x20(TKEYP), TKEYP
1980	je .Lenc192
1981	add $0x20, TKEYP
1982	movaps -0x60(TKEYP), KEY
1983	AESENC KEY STATE
1984	movaps -0x50(TKEYP), KEY
1985	AESENC KEY STATE
1986.align 4
1987.Lenc192:
1988	movaps -0x40(TKEYP), KEY
1989	AESENC KEY STATE
1990	movaps -0x30(TKEYP), KEY
1991	AESENC KEY STATE
1992.align 4
1993.Lenc128:
1994	movaps -0x20(TKEYP), KEY
1995	AESENC KEY STATE
1996	movaps -0x10(TKEYP), KEY
1997	AESENC KEY STATE
1998	movaps (TKEYP), KEY
1999	AESENC KEY STATE
2000	movaps 0x10(TKEYP), KEY
2001	AESENC KEY STATE
2002	movaps 0x20(TKEYP), KEY
2003	AESENC KEY STATE
2004	movaps 0x30(TKEYP), KEY
2005	AESENC KEY STATE
2006	movaps 0x40(TKEYP), KEY
2007	AESENC KEY STATE
2008	movaps 0x50(TKEYP), KEY
2009	AESENC KEY STATE
2010	movaps 0x60(TKEYP), KEY
2011	AESENC KEY STATE
2012	movaps 0x70(TKEYP), KEY
2013	AESENCLAST KEY STATE
2014	ret
2015ENDPROC(_aesni_enc1)
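
/*
 * In _aesni_enc1 above, the cmp $24/jb/je dispatch maps the key length to
 * the round count: 16-byte keys jump straight to .Lenc128 (10 rounds),
 * 24-byte keys enter at .Lenc192 (12 rounds), and 32-byte keys run two
 * extra rounds before falling through (14 rounds), with AESENCLAST for
 * the final round.  A compact C model of that structure, where
 * aes_round() and aes_last_round() are hypothetical stand-ins for
 * AESENC/AESENCLAST (illustrative only):
 *
 *	#include <stdint.h>
 *
 *	typedef struct { uint8_t b[16]; } aes_block;
 *
 *	aes_block aes_round(aes_block s, aes_block rk);      // AESENC
 *	aes_block aes_last_round(aes_block s, aes_block rk); // AESENCLAST
 *
 *	static aes_block xor_block(aes_block a, aes_block b)
 *	{
 *		for (int i = 0; i < 16; i++)
 *			a.b[i] ^= b.b[i];
 *		return a;
 *	}
 *
 *	static aes_block encrypt_block(aes_block s, const aes_block *rk,
 *				       int key_len)
 *	{
 *		int nr = key_len == 16 ? 10 : key_len == 24 ? 12 : 14;
 *
 *		s = xor_block(s, rk[0]);           // round 0 whitening
 *		for (int i = 1; i < nr; i++)
 *			s = aes_round(s, rk[i]);   // AESENC
 *		return aes_last_round(s, rk[nr]);  // AESENCLAST
 *	}
 */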
2016
2017/*
2018 * _aesni_enc4:	internal ABI
2019 * input:
2020 *	KEYP:		key struct pointer
2021 *	KLEN:		key length
2022 *	STATE1:		initial state (input)
2023 *	STATE2
2024 *	STATE3
2025 *	STATE4
2026 * output:
2027 *	STATE1:		final state (output)
2028 *	STATE2
2029 *	STATE3
2030 *	STATE4
2031 * changed:
2032 *	KEY
2033 *	TKEYP (T1)
2034 */
2035.align 4
2036_aesni_enc4:
2037	movaps (KEYP), KEY		# key
2038	mov KEYP, TKEYP
2039	pxor KEY, STATE1		# round 0
2040	pxor KEY, STATE2
2041	pxor KEY, STATE3
2042	pxor KEY, STATE4
2043	add $0x30, TKEYP
2044	cmp $24, KLEN
2045	jb .L4enc128
2046	lea 0x20(TKEYP), TKEYP
2047	je .L4enc192
2048	add $0x20, TKEYP
2049	movaps -0x60(TKEYP), KEY
2050	AESENC KEY STATE1
2051	AESENC KEY STATE2
2052	AESENC KEY STATE3
2053	AESENC KEY STATE4
2054	movaps -0x50(TKEYP), KEY
2055	AESENC KEY STATE1
2056	AESENC KEY STATE2
2057	AESENC KEY STATE3
2058	AESENC KEY STATE4
2059#.align 4
2060.L4enc192:
2061	movaps -0x40(TKEYP), KEY
2062	AESENC KEY STATE1
2063	AESENC KEY STATE2
2064	AESENC KEY STATE3
2065	AESENC KEY STATE4
2066	movaps -0x30(TKEYP), KEY
2067	AESENC KEY STATE1
2068	AESENC KEY STATE2
2069	AESENC KEY STATE3
2070	AESENC KEY STATE4
2071#.align 4
2072.L4enc128:
2073	movaps -0x20(TKEYP), KEY
2074	AESENC KEY STATE1
2075	AESENC KEY STATE2
2076	AESENC KEY STATE3
2077	AESENC KEY STATE4
2078	movaps -0x10(TKEYP), KEY
2079	AESENC KEY STATE1
2080	AESENC KEY STATE2
2081	AESENC KEY STATE3
2082	AESENC KEY STATE4
2083	movaps (TKEYP), KEY
2084	AESENC KEY STATE1
2085	AESENC KEY STATE2
2086	AESENC KEY STATE3
2087	AESENC KEY STATE4
2088	movaps 0x10(TKEYP), KEY
2089	AESENC KEY STATE1
2090	AESENC KEY STATE2
2091	AESENC KEY STATE3
2092	AESENC KEY STATE4
2093	movaps 0x20(TKEYP), KEY
2094	AESENC KEY STATE1
2095	AESENC KEY STATE2
2096	AESENC KEY STATE3
2097	AESENC KEY STATE4
2098	movaps 0x30(TKEYP), KEY
2099	AESENC KEY STATE1
2100	AESENC KEY STATE2
2101	AESENC KEY STATE3
2102	AESENC KEY STATE4
2103	movaps 0x40(TKEYP), KEY
2104	AESENC KEY STATE1
2105	AESENC KEY STATE2
2106	AESENC KEY STATE3
2107	AESENC KEY STATE4
2108	movaps 0x50(TKEYP), KEY
2109	AESENC KEY STATE1
2110	AESENC KEY STATE2
2111	AESENC KEY STATE3
2112	AESENC KEY STATE4
2113	movaps 0x60(TKEYP), KEY
2114	AESENC KEY STATE1
2115	AESENC KEY STATE2
2116	AESENC KEY STATE3
2117	AESENC KEY STATE4
2118	movaps 0x70(TKEYP), KEY
2119	AESENCLAST KEY STATE1		# last round
2120	AESENCLAST KEY STATE2
2121	AESENCLAST KEY STATE3
2122	AESENCLAST KEY STATE4
2123	ret
2124ENDPROC(_aesni_enc4)
2125
2126/*
2127 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2128 */
2129ENTRY(aesni_dec)
2130	FRAME_BEGIN
2131#ifndef __x86_64__
2132	pushl KEYP
2133	pushl KLEN
2134	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
2135	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
2136	movl (FRAME_OFFSET+20)(%esp), INP	# src
2137#endif
2138	mov 480(KEYP), KLEN		# key length
2139	add $240, KEYP
2140	movups (INP), STATE		# input
2141	call _aesni_dec1
2142	movups STATE, (OUTP)	# output
2143#ifndef __x86_64__
2144	popl KLEN
2145	popl KEYP
2146#endif
2147	FRAME_END
2148	ret
2149ENDPROC(aesni_dec)
2150
2151/*
2152 * _aesni_dec1:		internal ABI
2153 * input:
2154 *	KEYP:		key struct pointer
2155 *	KLEN:		key length
2156 *	STATE:		initial state (input)
2157 * output:
2158 *	STATE:		final state (output)
2159 * changed:
2160 *	KEY
2161 *	TKEYP (T1)
2162 */
2163.align 4
2164_aesni_dec1:
2165	movaps (KEYP), KEY		# key
2166	mov KEYP, TKEYP
2167	pxor KEY, STATE		# round 0
2168	add $0x30, TKEYP
2169	cmp $24, KLEN
2170	jb .Ldec128
2171	lea 0x20(TKEYP), TKEYP
2172	je .Ldec192
2173	add $0x20, TKEYP
2174	movaps -0x60(TKEYP), KEY
2175	AESDEC KEY STATE
2176	movaps -0x50(TKEYP), KEY
2177	AESDEC KEY STATE
2178.align 4
2179.Ldec192:
2180	movaps -0x40(TKEYP), KEY
2181	AESDEC KEY STATE
2182	movaps -0x30(TKEYP), KEY
2183	AESDEC KEY STATE
2184.align 4
2185.Ldec128:
2186	movaps -0x20(TKEYP), KEY
2187	AESDEC KEY STATE
2188	movaps -0x10(TKEYP), KEY
2189	AESDEC KEY STATE
2190	movaps (TKEYP), KEY
2191	AESDEC KEY STATE
2192	movaps 0x10(TKEYP), KEY
2193	AESDEC KEY STATE
2194	movaps 0x20(TKEYP), KEY
2195	AESDEC KEY STATE
2196	movaps 0x30(TKEYP), KEY
2197	AESDEC KEY STATE
2198	movaps 0x40(TKEYP), KEY
2199	AESDEC KEY STATE
2200	movaps 0x50(TKEYP), KEY
2201	AESDEC KEY STATE
2202	movaps 0x60(TKEYP), KEY
2203	AESDEC KEY STATE
2204	movaps 0x70(TKEYP), KEY
2205	AESDECLAST KEY STATE
2206	ret
2207ENDPROC(_aesni_dec1)
2208
2209/*
2210 * _aesni_dec4:	internal ABI
2211 * input:
2212 *	KEYP:		key struct pointer
2213 *	KLEN:		key length
2214 *	STATE1:		initial state (input)
2215 *	STATE2
2216 *	STATE3
2217 *	STATE4
2218 * output:
2219 *	STATE1:		final state (output)
2220 *	STATE2
2221 *	STATE3
2222 *	STATE4
2223 * changed:
2224 *	KEY
2225 *	TKEYP (T1)
2226 */
2227.align 4
2228_aesni_dec4:
2229	movaps (KEYP), KEY		# key
2230	mov KEYP, TKEYP
2231	pxor KEY, STATE1		# round 0
2232	pxor KEY, STATE2
2233	pxor KEY, STATE3
2234	pxor KEY, STATE4
2235	add $0x30, TKEYP
2236	cmp $24, KLEN
2237	jb .L4dec128
2238	lea 0x20(TKEYP), TKEYP
2239	je .L4dec192
2240	add $0x20, TKEYP
2241	movaps -0x60(TKEYP), KEY
2242	AESDEC KEY STATE1
2243	AESDEC KEY STATE2
2244	AESDEC KEY STATE3
2245	AESDEC KEY STATE4
2246	movaps -0x50(TKEYP), KEY
2247	AESDEC KEY STATE1
2248	AESDEC KEY STATE2
2249	AESDEC KEY STATE3
2250	AESDEC KEY STATE4
2251.align 4
2252.L4dec192:
2253	movaps -0x40(TKEYP), KEY
2254	AESDEC KEY STATE1
2255	AESDEC KEY STATE2
2256	AESDEC KEY STATE3
2257	AESDEC KEY STATE4
2258	movaps -0x30(TKEYP), KEY
2259	AESDEC KEY STATE1
2260	AESDEC KEY STATE2
2261	AESDEC KEY STATE3
2262	AESDEC KEY STATE4
2263.align 4
2264.L4dec128:
2265	movaps -0x20(TKEYP), KEY
2266	AESDEC KEY STATE1
2267	AESDEC KEY STATE2
2268	AESDEC KEY STATE3
2269	AESDEC KEY STATE4
2270	movaps -0x10(TKEYP), KEY
2271	AESDEC KEY STATE1
2272	AESDEC KEY STATE2
2273	AESDEC KEY STATE3
2274	AESDEC KEY STATE4
2275	movaps (TKEYP), KEY
2276	AESDEC KEY STATE1
2277	AESDEC KEY STATE2
2278	AESDEC KEY STATE3
2279	AESDEC KEY STATE4
2280	movaps 0x10(TKEYP), KEY
2281	AESDEC KEY STATE1
2282	AESDEC KEY STATE2
2283	AESDEC KEY STATE3
2284	AESDEC KEY STATE4
2285	movaps 0x20(TKEYP), KEY
2286	AESDEC KEY STATE1
2287	AESDEC KEY STATE2
2288	AESDEC KEY STATE3
2289	AESDEC KEY STATE4
2290	movaps 0x30(TKEYP), KEY
2291	AESDEC KEY STATE1
2292	AESDEC KEY STATE2
2293	AESDEC KEY STATE3
2294	AESDEC KEY STATE4
2295	movaps 0x40(TKEYP), KEY
2296	AESDEC KEY STATE1
2297	AESDEC KEY STATE2
2298	AESDEC KEY STATE3
2299	AESDEC KEY STATE4
2300	movaps 0x50(TKEYP), KEY
2301	AESDEC KEY STATE1
2302	AESDEC KEY STATE2
2303	AESDEC KEY STATE3
2304	AESDEC KEY STATE4
2305	movaps 0x60(TKEYP), KEY
2306	AESDEC KEY STATE1
2307	AESDEC KEY STATE2
2308	AESDEC KEY STATE3
2309	AESDEC KEY STATE4
2310	movaps 0x70(TKEYP), KEY
2311	AESDECLAST KEY STATE1		# last round
2312	AESDECLAST KEY STATE2
2313	AESDECLAST KEY STATE3
2314	AESDECLAST KEY STATE4
2315	ret
2316ENDPROC(_aesni_dec4)
2317
2318/*
2319 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2320 *		      size_t len)
2321 */
2322ENTRY(aesni_ecb_enc)
2323	FRAME_BEGIN
2324#ifndef __x86_64__
2325	pushl LEN
2326	pushl KEYP
2327	pushl KLEN
2328	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
2329	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
2330	movl (FRAME_OFFSET+24)(%esp), INP	# src
2331	movl (FRAME_OFFSET+28)(%esp), LEN	# len
2332#endif
2333	test LEN, LEN		# check length
2334	jz .Lecb_enc_ret
2335	mov 480(KEYP), KLEN
2336	cmp $16, LEN
2337	jb .Lecb_enc_ret
2338	cmp $64, LEN
2339	jb .Lecb_enc_loop1
2340.align 4
2341.Lecb_enc_loop4:
2342	movups (INP), STATE1
2343	movups 0x10(INP), STATE2
2344	movups 0x20(INP), STATE3
2345	movups 0x30(INP), STATE4
2346	call _aesni_enc4
2347	movups STATE1, (OUTP)
2348	movups STATE2, 0x10(OUTP)
2349	movups STATE3, 0x20(OUTP)
2350	movups STATE4, 0x30(OUTP)
2351	sub $64, LEN
2352	add $64, INP
2353	add $64, OUTP
2354	cmp $64, LEN
2355	jge .Lecb_enc_loop4
2356	cmp $16, LEN
2357	jb .Lecb_enc_ret
2358.align 4
2359.Lecb_enc_loop1:
2360	movups (INP), STATE1
2361	call _aesni_enc1
2362	movups STATE1, (OUTP)
2363	sub $16, LEN
2364	add $16, INP
2365	add $16, OUTP
2366	cmp $16, LEN
2367	jge .Lecb_enc_loop1
2368.Lecb_enc_ret:
2369#ifndef __x86_64__
2370	popl KLEN
2371	popl KEYP
2372	popl LEN
2373#endif
2374	FRAME_END
2375	ret
2376ENDPROC(aesni_ecb_enc)
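
/*
 * The ECB loop above batches four blocks through _aesni_enc4 while at
 * least 64 bytes remain, then finishes single blocks through _aesni_enc1;
 * a trailing partial block (len mod 16) is ignored.  A minimal C sketch
 * of that batching, where enc1()/enc4() are hypothetical stand-ins for
 * the one- and four-block primitives:
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	void enc1(const void *ctx, uint8_t *dst, const uint8_t *src);
 *	void enc4(const void *ctx, uint8_t *dst, const uint8_t *src);
 *
 *	static void ecb_encrypt(const void *ctx, uint8_t *dst,
 *				const uint8_t *src, size_t len)
 *	{
 *		while (len >= 64) {                // .Lecb_enc_loop4
 *			enc4(ctx, dst, src);
 *			src += 64; dst += 64; len -= 64;
 *		}
 *		while (len >= 16) {                // .Lecb_enc_loop1
 *			enc1(ctx, dst, src);
 *			src += 16; dst += 16; len -= 16;
 *		}
 *	}
 */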
2377
2378/*
2379 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2380 *		      size_t len);
2381 */
2382ENTRY(aesni_ecb_dec)
2383	FRAME_BEGIN
2384#ifndef __x86_64__
2385	pushl LEN
2386	pushl KEYP
2387	pushl KLEN
2388	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
2389	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
2390	movl (FRAME_OFFSET+24)(%esp), INP	# src
2391	movl (FRAME_OFFSET+28)(%esp), LEN	# len
2392#endif
2393	test LEN, LEN
2394	jz .Lecb_dec_ret
2395	mov 480(KEYP), KLEN
2396	add $240, KEYP
2397	cmp $16, LEN
2398	jb .Lecb_dec_ret
2399	cmp $64, LEN
2400	jb .Lecb_dec_loop1
2401.align 4
2402.Lecb_dec_loop4:
2403	movups (INP), STATE1
2404	movups 0x10(INP), STATE2
2405	movups 0x20(INP), STATE3
2406	movups 0x30(INP), STATE4
2407	call _aesni_dec4
2408	movups STATE1, (OUTP)
2409	movups STATE2, 0x10(OUTP)
2410	movups STATE3, 0x20(OUTP)
2411	movups STATE4, 0x30(OUTP)
2412	sub $64, LEN
2413	add $64, INP
2414	add $64, OUTP
2415	cmp $64, LEN
2416	jge .Lecb_dec_loop4
2417	cmp $16, LEN
2418	jb .Lecb_dec_ret
2419.align 4
2420.Lecb_dec_loop1:
2421	movups (INP), STATE1
2422	call _aesni_dec1
2423	movups STATE1, (OUTP)
2424	sub $16, LEN
2425	add $16, INP
2426	add $16, OUTP
2427	cmp $16, LEN
2428	jge .Lecb_dec_loop1
2429.Lecb_dec_ret:
2430#ifndef __x86_64__
2431	popl KLEN
2432	popl KEYP
2433	popl LEN
2434#endif
2435	FRAME_END
2436	ret
2437ENDPROC(aesni_ecb_dec)
2438
2439/*
2440 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2441 *		      size_t len, u8 *iv)
2442 */
2443ENTRY(aesni_cbc_enc)
2444	FRAME_BEGIN
2445#ifndef __x86_64__
2446	pushl IVP
2447	pushl LEN
2448	pushl KEYP
2449	pushl KLEN
2450	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2451	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2452	movl (FRAME_OFFSET+28)(%esp), INP	# src
2453	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2454	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2455#endif
2456	cmp $16, LEN
2457	jb .Lcbc_enc_ret
2458	mov 480(KEYP), KLEN
2459	movups (IVP), STATE	# load iv as initial state
2460.align 4
2461.Lcbc_enc_loop:
2462	movups (INP), IN	# load input
2463	pxor IN, STATE
2464	call _aesni_enc1
2465	movups STATE, (OUTP)	# store output
2466	sub $16, LEN
2467	add $16, INP
2468	add $16, OUTP
2469	cmp $16, LEN
2470	jge .Lcbc_enc_loop
2471	movups STATE, (IVP)
2472.Lcbc_enc_ret:
2473#ifndef __x86_64__
2474	popl KLEN
2475	popl KEYP
2476	popl LEN
2477	popl IVP
2478#endif
2479	FRAME_END
2480	ret
2481ENDPROC(aesni_cbc_enc)
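
/*
 * CBC encryption above chains each ciphertext block into the next
 * plaintext block and writes the final ciphertext block back through IVP
 * so the caller can continue the chain.  A hedged C model of that flow,
 * where enc1() is the same hypothetical one-block helper as in the ECB
 * sketch:
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	void enc1(const void *ctx, uint8_t *dst, const uint8_t *src);
 *
 *	// C[i] = E(P[i] ^ C[i-1]), with C[-1] = IV
 *	static void cbc_encrypt(const void *ctx, uint8_t *dst,
 *				const uint8_t *src, size_t len, uint8_t iv[16])
 *	{
 *		uint8_t state[16];
 *
 *		memcpy(state, iv, 16);             // load iv as initial state
 *		while (len >= 16) {
 *			for (int i = 0; i < 16; i++)
 *				state[i] ^= src[i];        // pxor IN, STATE
 *			enc1(ctx, state, state);
 *			memcpy(dst, state, 16);
 *			src += 16; dst += 16; len -= 16;
 *		}
 *		memcpy(iv, state, 16);             // hand the chain back via IVP
 *	}
 */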
2482
2483/*
2484 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2485 *		      size_t len, u8 *iv)
2486 */
2487ENTRY(aesni_cbc_dec)
2488	FRAME_BEGIN
2489#ifndef __x86_64__
2490	pushl IVP
2491	pushl LEN
2492	pushl KEYP
2493	pushl KLEN
2494	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2495	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2496	movl (FRAME_OFFSET+28)(%esp), INP	# src
2497	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2498	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2499#endif
2500	cmp $16, LEN
2501	jb .Lcbc_dec_just_ret
2502	mov 480(KEYP), KLEN
2503	add $240, KEYP
2504	movups (IVP), IV
2505	cmp $64, LEN
2506	jb .Lcbc_dec_loop1
2507.align 4
2508.Lcbc_dec_loop4:
2509	movups (INP), IN1
2510	movaps IN1, STATE1
2511	movups 0x10(INP), IN2
2512	movaps IN2, STATE2
2513#ifdef __x86_64__
2514	movups 0x20(INP), IN3
2515	movaps IN3, STATE3
2516	movups 0x30(INP), IN4
2517	movaps IN4, STATE4
2518#else
2519	movups 0x20(INP), IN1
2520	movaps IN1, STATE3
2521	movups 0x30(INP), IN2
2522	movaps IN2, STATE4
2523#endif
2524	call _aesni_dec4
2525	pxor IV, STATE1
2526#ifdef __x86_64__
2527	pxor IN1, STATE2
2528	pxor IN2, STATE3
2529	pxor IN3, STATE4
2530	movaps IN4, IV
2531#else
2532	pxor IN1, STATE4
2533	movaps IN2, IV
2534	movups (INP), IN1
2535	pxor IN1, STATE2
2536	movups 0x10(INP), IN2
2537	pxor IN2, STATE3
2538#endif
2539	movups STATE1, (OUTP)
2540	movups STATE2, 0x10(OUTP)
2541	movups STATE3, 0x20(OUTP)
2542	movups STATE4, 0x30(OUTP)
2543	sub $64, LEN
2544	add $64, INP
2545	add $64, OUTP
2546	cmp $64, LEN
2547	jge .Lcbc_dec_loop4
2548	cmp $16, LEN
2549	jb .Lcbc_dec_ret
2550.align 4
2551.Lcbc_dec_loop1:
2552	movups (INP), IN
2553	movaps IN, STATE
2554	call _aesni_dec1
2555	pxor IV, STATE
2556	movups STATE, (OUTP)
2557	movaps IN, IV
2558	sub $16, LEN
2559	add $16, INP
2560	add $16, OUTP
2561	cmp $16, LEN
2562	jge .Lcbc_dec_loop1
2563.Lcbc_dec_ret:
2564	movups IV, (IVP)
2565.Lcbc_dec_just_ret:
2566#ifndef __x86_64__
2567	popl KLEN
2568	popl KEYP
2569	popl LEN
2570	popl IVP
2571#endif
2572	FRAME_END
2573	ret
2574ENDPROC(aesni_cbc_dec)
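
/*
 * The decrypt side keeps copies of the ciphertext blocks so that, even
 * for in-place operation, each plaintext is produced as D(C[i]) ^ C[i-1]
 * and the last ciphertext block is written back through IVP as the next
 * IV.  A short C model, where dec1() is a hypothetical one-block decrypt
 * helper (illustrative only):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	void dec1(const void *ctx, uint8_t *dst, const uint8_t *src);
 *
 *	static void cbc_decrypt(const void *ctx, uint8_t *dst,
 *				const uint8_t *src, size_t len, uint8_t iv[16])
 *	{
 *		uint8_t in[16], out[16];
 *
 *		while (len >= 16) {
 *			memcpy(in, src, 16);       // keep C[i] for the next IV
 *			dec1(ctx, out, in);
 *			for (int i = 0; i < 16; i++)
 *				out[i] ^= iv[i];   // pxor IV, STATE
 *			memcpy(dst, out, 16);
 *			memcpy(iv, in, 16);        // movaps IN, IV
 *			src += 16; dst += 16; len -= 16;
 *		}
 *	}
 */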
2575
2576#ifdef __x86_64__
2577.pushsection .rodata
2578.align 16
2579.Lbswap_mask:
2580	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2581.popsection
2582
2583/*
2584 * _aesni_inc_init:	internal ABI
2585 *	setup registers used by _aesni_inc
2586 * input:
2587 *	IV
2588 * output:
2589 *	CTR:	== IV, in little endian
2590 *	TCTR_LOW: == lower qword of CTR
2591 *	INC:	== 1, in little endian
2592 *	BSWAP_MASK == endian swapping mask
2593 */
2594.align 4
2595_aesni_inc_init:
2596	movaps .Lbswap_mask, BSWAP_MASK
2597	movaps IV, CTR
2598	PSHUFB_XMM BSWAP_MASK CTR
2599	mov $1, TCTR_LOW
2600	MOVQ_R64_XMM TCTR_LOW INC
2601	MOVQ_R64_XMM CTR TCTR_LOW
2602	ret
2603ENDPROC(_aesni_inc_init)
2604
2605/*
2606 * _aesni_inc:		internal ABI
2607 *	Increase IV by 1, IV is in big endian
2608 * input:
2609 *	IV
2610 *	CTR:	== IV, in little endian
2611 *	TCTR_LOW: == lower qword of CTR
2612 *	INC:	== 1, in little endian
2613 *	BSWAP_MASK == endian swapping mask
2614 * output:
2615 *	IV:	increased by 1
2616 * changed:
2617 *	CTR:	== output IV, in little endian
2618 *	TCTR_LOW: == lower qword of CTR
2619 */
2620.align 4
2621_aesni_inc:
2622	paddq INC, CTR
2623	add $1, TCTR_LOW
2624	jnc .Linc_low
2625	pslldq $8, INC
2626	paddq INC, CTR
2627	psrldq $8, INC
2628.Linc_low:
2629	movaps CTR, IV
2630	PSHUFB_XMM BSWAP_MASK IV
2631	ret
2632ENDPROC(_aesni_inc)
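
/*
 * _aesni_inc_init/_aesni_inc above keep the counter byte-swapped in CTR
 * (little-endian lanes) and mirror its low qword in TCTR_LOW, so the
 * common case is a single paddq plus a flags check; only when the low 64
 * bits wrap is a carry added into the high qword before swapping back
 * into IV.  An illustrative C sketch of the same bookkeeping:
 *
 *	#include <stdint.h>
 *
 *	struct ctr_state {
 *		uint64_t ctr_lo, ctr_hi;   // CTR, little-endian lanes
 *		uint64_t tctr_low;         // TCTR_LOW mirror of ctr_lo
 *	};
 *
 *	static void ctr_inc(struct ctr_state *s, uint8_t iv_be[16])
 *	{
 *		s->ctr_lo += 1;            // paddq INC, CTR
 *		s->tctr_low += 1;          // add $1, TCTR_LOW
 *		if (s->tctr_low == 0)      // carry out of the low qword
 *			s->ctr_hi += 1;    // paddq with INC shifted high
 *
 *		// byte swap back to the big-endian IV (PSHUFB BSWAP_MASK)
 *		for (int i = 0; i < 8; i++) {
 *			iv_be[15 - i] = (uint8_t)(s->ctr_lo >> (8 * i));
 *			iv_be[7 - i]  = (uint8_t)(s->ctr_hi >> (8 * i));
 *		}
 *	}
 */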
2633
2634/*
2635 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2636 *		      size_t len, u8 *iv)
2637 */
2638ENTRY(aesni_ctr_enc)
2639	FRAME_BEGIN
2640	cmp $16, LEN
2641	jb .Lctr_enc_just_ret
2642	mov 480(KEYP), KLEN
2643	movups (IVP), IV
2644	call _aesni_inc_init
2645	cmp $64, LEN
2646	jb .Lctr_enc_loop1
2647.align 4
2648.Lctr_enc_loop4:
2649	movaps IV, STATE1
2650	call _aesni_inc
2651	movups (INP), IN1
2652	movaps IV, STATE2
2653	call _aesni_inc
2654	movups 0x10(INP), IN2
2655	movaps IV, STATE3
2656	call _aesni_inc
2657	movups 0x20(INP), IN3
2658	movaps IV, STATE4
2659	call _aesni_inc
2660	movups 0x30(INP), IN4
2661	call _aesni_enc4
2662	pxor IN1, STATE1
2663	movups STATE1, (OUTP)
2664	pxor IN2, STATE2
2665	movups STATE2, 0x10(OUTP)
2666	pxor IN3, STATE3
2667	movups STATE3, 0x20(OUTP)
2668	pxor IN4, STATE4
2669	movups STATE4, 0x30(OUTP)
2670	sub $64, LEN
2671	add $64, INP
2672	add $64, OUTP
2673	cmp $64, LEN
2674	jge .Lctr_enc_loop4
2675	cmp $16, LEN
2676	jb .Lctr_enc_ret
2677.align 4
2678.Lctr_enc_loop1:
2679	movaps IV, STATE
2680	call _aesni_inc
2681	movups (INP), IN
2682	call _aesni_enc1
2683	pxor IN, STATE
2684	movups STATE, (OUTP)
2685	sub $16, LEN
2686	add $16, INP
2687	add $16, OUTP
2688	cmp $16, LEN
2689	jge .Lctr_enc_loop1
2690.Lctr_enc_ret:
2691	movups IV, (IVP)
2692.Lctr_enc_just_ret:
2693	FRAME_END
2694	ret
2695ENDPROC(aesni_ctr_enc)
2696
2697/*
2698 * _aesni_gf128mul_x_ble:		internal ABI
2699 *	Multiply in GF(2^128) for XTS IVs
2700 * input:
2701 *	IV:	current IV
2702 *	GF128MUL_MASK == mask with 0x87 and 0x01
2703 * output:
2704 *	IV:	next IV
2705 * changed:
2706 *	CTR:	== temporary value
2707 */
2708#define _aesni_gf128mul_x_ble() \
2709	pshufd $0x13, IV, CTR; \
2710	paddq IV, IV; \
2711	psrad $31, CTR; \
2712	pand GF128MUL_MASK, CTR; \
2713	pxor CTR, IV;
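
/*
 * The macro above multiplies the 128-bit XTS tweak by x in GF(2^128):
 * both 64-bit halves are doubled, the bit that falls out of the low half
 * is carried into the high half, and if a bit falls off the top the low
 * half is reduced with 0x87 (x^7 + x^2 + x + 1).  An equivalent C sketch
 * on two little-endian 64-bit halves (illustrative only):
 *
 *	#include <stdint.h>
 *
 *	// t[0] = low 64 bits, t[1] = high 64 bits of the tweak
 *	static void gf128mul_x_ble(uint64_t t[2])
 *	{
 *		uint64_t carry_lo = t[0] >> 63;  // moves into the high half
 *		uint64_t carry_hi = t[1] >> 63;  // falls off the top -> reduce
 *
 *		t[1] = (t[1] << 1) | carry_lo;
 *		t[0] = (t[0] << 1) ^ (carry_hi ? 0x87 : 0);
 *	}
 */
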
2714
2715/*
2716 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2717 *			 bool enc, u8 *iv)
2718 */
2719ENTRY(aesni_xts_crypt8)
2720	FRAME_BEGIN
2721	cmpb $0, %cl
2722	movl $0, %ecx
2723	movl $240, %r10d
2724	leaq _aesni_enc4, %r11
2725	leaq _aesni_dec4, %rax
2726	cmovel %r10d, %ecx
2727	cmoveq %rax, %r11
2728
2729	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2730	movups (IVP), IV
2731
2732	mov 480(KEYP), KLEN
2733	addq %rcx, KEYP
2734
2735	movdqa IV, STATE1
2736	movdqu 0x00(INP), INC
2737	pxor INC, STATE1
2738	movdqu IV, 0x00(OUTP)
2739
2740	_aesni_gf128mul_x_ble()
2741	movdqa IV, STATE2
2742	movdqu 0x10(INP), INC
2743	pxor INC, STATE2
2744	movdqu IV, 0x10(OUTP)
2745
2746	_aesni_gf128mul_x_ble()
2747	movdqa IV, STATE3
2748	movdqu 0x20(INP), INC
2749	pxor INC, STATE3
2750	movdqu IV, 0x20(OUTP)
2751
2752	_aesni_gf128mul_x_ble()
2753	movdqa IV, STATE4
2754	movdqu 0x30(INP), INC
2755	pxor INC, STATE4
2756	movdqu IV, 0x30(OUTP)
2757
2758	call *%r11
2759
2760	movdqu 0x00(OUTP), INC
2761	pxor INC, STATE1
2762	movdqu STATE1, 0x00(OUTP)
2763
2764	_aesni_gf128mul_x_ble()
2765	movdqa IV, STATE1
2766	movdqu 0x40(INP), INC
2767	pxor INC, STATE1
2768	movdqu IV, 0x40(OUTP)
2769
2770	movdqu 0x10(OUTP), INC
2771	pxor INC, STATE2
2772	movdqu STATE2, 0x10(OUTP)
2773
2774	_aesni_gf128mul_x_ble()
2775	movdqa IV, STATE2
2776	movdqu 0x50(INP), INC
2777	pxor INC, STATE2
2778	movdqu IV, 0x50(OUTP)
2779
2780	movdqu 0x20(OUTP), INC
2781	pxor INC, STATE3
2782	movdqu STATE3, 0x20(OUTP)
2783
2784	_aesni_gf128mul_x_ble()
2785	movdqa IV, STATE3
2786	movdqu 0x60(INP), INC
2787	pxor INC, STATE3
2788	movdqu IV, 0x60(OUTP)
2789
2790	movdqu 0x30(OUTP), INC
2791	pxor INC, STATE4
2792	movdqu STATE4, 0x30(OUTP)
2793
2794	_aesni_gf128mul_x_ble()
2795	movdqa IV, STATE4
2796	movdqu 0x70(INP), INC
2797	pxor INC, STATE4
2798	movdqu IV, 0x70(OUTP)
2799
2800	_aesni_gf128mul_x_ble()
2801	movups IV, (IVP)
2802
2803	call *%r11
2804
2805	movdqu 0x40(OUTP), INC
2806	pxor INC, STATE1
2807	movdqu STATE1, 0x40(OUTP)
2808
2809	movdqu 0x50(OUTP), INC
2810	pxor INC, STATE2
2811	movdqu STATE2, 0x50(OUTP)
2812
2813	movdqu 0x60(OUTP), INC
2814	pxor INC, STATE3
2815	movdqu STATE3, 0x60(OUTP)
2816
2817	movdqu 0x70(OUTP), INC
2818	pxor INC, STATE4
2819	movdqu STATE4, 0x70(OUTP)
2820
2821	FRAME_END
2822	ret
2823ENDPROC(aesni_xts_crypt8)
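
/*
 * Each of the eight blocks above follows the usual XTS pattern: the tweak
 * is stashed in the output buffer, the block is whitened with it, run
 * through the four-block AES primitive, then whitened again, while the
 * tweak itself is advanced with _aesni_gf128mul_x_ble.  A per-block C
 * sketch, reusing gf128mul_x_ble() from the previous comment and a
 * hypothetical enc1() one-block helper; it assumes a little-endian host
 * when viewing the tweak as two 64-bit halves:
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	void enc1(const void *ctx, uint8_t *dst, const uint8_t *src);
 *
 *	static void xts_one_block(const void *ctx, uint8_t *dst,
 *				  const uint8_t *src, uint64_t tweak[2])
 *	{
 *		uint8_t t[16], buf[16];
 *
 *		memcpy(t, tweak, 16);              // current tweak as bytes
 *		for (int i = 0; i < 16; i++)
 *			buf[i] = src[i] ^ t[i];    // pre-whitening XOR
 *		enc1(ctx, buf, buf);               // AES on the block
 *		for (int i = 0; i < 16; i++)
 *			dst[i] = buf[i] ^ t[i];    // post-whitening XOR
 *		gf128mul_x_ble(tweak);             // next tweak
 *	}
 */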
2824
2825#endif
2826