xref: /linux/arch/x86/crypto/aesni-intel_asm.S (revision ca55b2fef3a9373fcfc30f82fd26bc7fccbda732)
1/*
2 * Implement AES algorithm in Intel AES-NI instructions.
3 *
4 * The white paper of AES-NI instructions can be downloaded from:
5 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
6 *
7 * Copyright (C) 2008, Intel Corp.
8 *    Author: Huang Ying <ying.huang@intel.com>
9 *            Vinodh Gopal <vinodh.gopal@intel.com>
10 *            Kahraman Akdemir
11 *
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
16 *             Adrian Hoban <adrian.hoban@intel.com>
17 *             James Guilford (james.guilford@intel.com)
18 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
19 *             Tadeusz Struk (tadeusz.struk@intel.com)
20 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
21 *    Copyright (c) 2010, Intel Corporation.
22 *
23 * Ported x86_64 version to x86:
24 *    Author: Mathias Krause <minipli@googlemail.com>
25 *
26 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
30 */
31
32#include <linux/linkage.h>
33#include <asm/inst.h>
34
35/*
36 * The following macros are used to move an (un)aligned 16 byte value to/from
37 * an XMM register.  This can be done for either FP or integer values: for FP
38 * use movaps (move aligned packed single), for integer use movdqa (move double
39 * quad aligned).  It doesn't make a performance difference which instruction is
40 * used since Nehalem (original Core i7) was released.  However, movaps is a byte
41 * shorter, so that is the one we'll use for now (same for the unaligned form).
42 */
43#define MOVADQ	movaps
44#define MOVUDQ	movups
45
46#ifdef __x86_64__
47
48.data
49.align 16
50.Lgf128mul_x_ble_mask:
51	.octa 0x00000000000000010000000000000087
/*
 * POLY has bits 127, 126, 121 and 0 set, which matches the
 * (128,127,126,121,0) polynomial named in the GHASH_MUL comment below.
 */
52POLY:   .octa 0xC2000000000000000000000000000001
53TWOONE: .octa 0x00000001000000000000000000000001
54
55# order of these constants should not change.
56# more specifically, ALL_F should follow SHIFT_MASK,
57# and ZERO should follow ALL_F
58
/*
 * SHUF_MASK is the pshufb control used by the GCM macros below for their
 * "16 byte swap": destination byte i is taken from source byte 15-i.
 */
59SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
60MASK1:      .octa 0x0000000000000000ffffffffffffffff
61MASK2:      .octa 0xffffffffffffffff0000000000000000
62SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
63ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
64ZERO:       .octa 0x00000000000000000000000000000000
/* ONE is added with paddd to step the counter block ("INCR Y0"/"INCR CNT"). */
65ONE:        .octa 0x00000000000000000000000000000001
/*
 * NOTE(review): the F_MIN_MASK literal has only 31 hex digits, so it is
 * assembled as 0x0f1f2f3f4f5f6f7f8f9fafbfcfdfeff0.  This looks like a typo;
 * confirm the intended value (and whether the constant is used at all)
 * before relying on it.
 */
66F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
67dec:        .octa 0x1
68enc:        .octa 0x2
69
70
71.text
72
73
/*
 * STACK_OFFSET is the bias applied to the %r14-relative stack arguments
 * (arg7..arg10 below).  Presumably it is the size of three 8-byte registers
 * saved by the function prologue -- TODO confirm against the prologue.
 *
 * HashKey .. HashKey_4_k are byte offsets of 16-byte slots addressed off
 * %rsp by the GCM macros; VARIABLE_OFFSET equals the size of those eight
 * slots.
 */
74#define	STACK_OFFSET    8*3
75#define	HashKey		16*0	// store HashKey <<1 mod poly here
76#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
77#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
78#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
79#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
80				// bits of  HashKey <<1 mod poly here
81				//(for Karatsuba purposes)
82#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
83				// bits of  HashKey^2 <<1 mod poly here
84				// (for Karatsuba purposes)
85#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
86				// bits of  HashKey^3 <<1 mod poly here
87				// (for Karatsuba purposes)
88#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
89				// bits of  HashKey^4 <<1 mod poly here
90				// (for Karatsuba purposes)
91#define	VARIABLE_OFFSET	16*8
92
/*
 * arg1..arg6 are register names without the % sign (written as %arg1 etc.
 * at the point of use); arg7..arg10 are complete memory operands relative
 * to %r14.
 *
 * keysize is a memory operand at byte offset 2*15*16 from arg1.  The macros
 * below load it as a 32-bit value and treat it as the AES key length in
 * bytes (16/24/32, giving 4/6/8 after the shift by 2).
 */
93#define arg1 rdi
94#define arg2 rsi
95#define arg3 rdx
96#define arg4 rcx
97#define arg5 r8
98#define arg6 r9
99#define arg7 STACK_OFFSET+8(%r14)
100#define arg8 STACK_OFFSET+16(%r14)
101#define arg9 STACK_OFFSET+24(%r14)
102#define arg10 STACK_OFFSET+32(%r14)
103#define keysize 2*15*16(%arg1)
104#endif
105
106
/*
 * Symbolic register names.  STATE and IN are shorthands for STATE1 and IN1.
 */
106#define STATE1	%xmm0
107#define STATE2	%xmm4
108#define STATE3	%xmm5
109#define STATE4	%xmm6
110#define STATE	STATE1
111#define IN1	%xmm1
112#define IN2	%xmm7
113#define IN3	%xmm8
114#define IN4	%xmm9
115#define IN	IN1
116#define KEY	%xmm2
117#define IV	%xmm3
118
119#define BSWAP_MASK %xmm10
120#define CTR	%xmm11
121#define INC	%xmm12
122
/*
 * GF128MUL_MASK names the same register as BSWAP_MASK (%xmm10), so the two
 * cannot be live at the same time.
 */
123#define GF128MUL_MASK %xmm10
124
/*
 * General purpose register aliases.  On x86_64, KEYP/OUTP/INP/LEN/IVP are
 * %rdi/%rsi/%rdx/%rcx/%r8, the same registers as arg1..arg5 above.  On
 * 32-bit, OUTP is an alias of AREG (%eax) and no T2/TCTR_LOW is defined.
 * In both variants UKEYP aliases OUTP and TKEYP aliases T1.
 */
125#ifdef __x86_64__
126#define AREG	%rax
127#define KEYP	%rdi
128#define OUTP	%rsi
129#define UKEYP	OUTP
130#define INP	%rdx
131#define LEN	%rcx
132#define IVP	%r8
133#define KLEN	%r9d
134#define T1	%r10
135#define TKEYP	T1
136#define T2	%r11
137#define TCTR_LOW T2
138#else
139#define AREG	%eax
140#define KEYP	%edi
141#define OUTP	AREG
142#define UKEYP	OUTP
143#define INP	%edx
144#define LEN	%esi
145#define IVP	%ebp
146#define KLEN	%ebx
147#define T1	%ecx
148#define TKEYP	T1
149#endif
151
152
153#ifdef __x86_64__
154/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
155*
156*
157* Input: A and B (128-bits each, bit-reflected)
158* Output: C = A*B*x mod poly, (i.e. >>1 )
159* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
160* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
161*
162*/
/*
 * Register usage:
 *   GH         in/out: first operand, overwritten with the reduced product
 *   HK         in:     second operand, only read
 *   TMP1-TMP5  scratch, all overwritten
 *
 * The 128x128 carry-less product is built from three PCLMULQDQ results
 * (Karatsuba: a1*b1, a0*b0 and (a1+a0)*(b1+b0)), leaving the 256-bit value
 * in TMP1:GH, which is then reduced in two phases.
 */
163.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
164	movdqa	  \GH, \TMP1
165	pshufd	  $78, \GH, \TMP2
166	pshufd	  $78, \HK, \TMP3
167	pxor	  \GH, \TMP2            # TMP2 = a1+a0
168	pxor	  \HK, \TMP3            # TMP3 = b1+b0
169	PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
170	PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
171	PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
172	pxor	  \GH, \TMP2
173	pxor	  \TMP1, \TMP2          # TMP2 = (a0*b1)+(a1*b0)
174	movdqa	  \TMP2, \TMP3
175	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
176	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
177	pxor	  \TMP3, \GH
178	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
179
180        # first phase of the reduction
181
182	movdqa    \GH, \TMP2
183	movdqa    \GH, \TMP3
184	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
185					# in order to perform
186					# independent shifts
187	pslld     $31, \TMP2            # packed left shift <<31
188	pslld     $30, \TMP3            # packed left shift <<30
189	pslld     $25, \TMP4            # packed left shift <<25
190	pxor      \TMP3, \TMP2          # xor the shifted versions
191	pxor      \TMP4, \TMP2
192	movdqa    \TMP2, \TMP5
193	psrldq    $4, \TMP5             # right shift TMP5 1 DW
194	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
195	pxor      \TMP2, \GH
196
197        # second phase of the reduction
198
199	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
200					# in order to perform
201					# independent shifts
202	movdqa    \GH,\TMP3
203	movdqa    \GH,\TMP4
204	psrld     $1,\TMP2              # packed right shift >>1
205	psrld     $2,\TMP3              # packed right shift >>2
206	psrld     $7,\TMP4              # packed right shift >>7
207	pxor      \TMP3,\TMP2		# xor the shifted versions
208	pxor      \TMP4,\TMP2
209	pxor      \TMP5, \TMP2
210	pxor      \TMP2, \GH
211	pxor      \TMP1, \GH            # result is in GH
212.endm
213
214/*
215* if a = number of total plaintext bytes
216* b = floor(a/16)
217* num_initial_blocks = b mod 4
218* encrypt the initial num_initial_blocks blocks and apply ghash on
219* the ciphertext
220* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
221* are clobbered
222* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
223*/
224
225
/*
 * Notes derived from the macro body (decrypt variant):
 *
 * - The i parameter is the number of the XMM register that accumulates the
 *   AAD; i_seq lists, one digit per block, the XMM registers used for the
 *   initial blocks.  Only i = 5, 6 or 7 processes initial blocks (3, 2 or 1
 *   of them); any other value skips that stage.  The hash chain is
 *   hard-wired to end in %xmm8.
 * - TMP3 must hold the hash key (the HK operand of GHASH_MUL) on entry; it
 *   is only read here.
 * - XMM0 is loaded from the address in arg5, byte-swapped, and incremented
 *   once per counter block generated.
 * - XMM1..XMM4 must be %xmm1..%xmm4: the variable-round loop addresses
 *   those registers by number.
 * - The AAD is read 4 bytes at a time and the read loop exits only when the
 *   remaining count is exactly zero; the padding loop steps down from 16 in
 *   units of 4 until it equals aadLen.  aadLen (arg8) therefore has to be
 *   4, 8, 12 or 16.
 * - %r13 is read (signed compare against 64) but not modified; when it is
 *   below 64 the four-block stage and the HashKey power precomputation are
 *   skipped.
 * - %xmm14 is overwritten with SHUF_MASK in addition to the registers
 *   listed above.
 * - On exit %r11 is the byte offset of the data consumed so far.
 * - Stack slots HashKey_2, HashKey_3, HashKey_4 and the four *_k slots are
 *   written; the HashKey slot itself is not written here.
 * - GHASH is computed over the input blocks: each input block is kept in
 *   TMP1 and copied into the state register after the output is stored.
 */
226.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
227XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
228        MOVADQ     SHUF_MASK(%rip), %xmm14
229	mov	   arg7, %r10           # %r10 = AAD
230	mov	   arg8, %r12           # %r12 = aadLen
231	mov	   %r12, %r11
232	pxor	   %xmm\i, %xmm\i
233
234_get_AAD_loop\num_initial_blocks\operation:
235	movd	   (%r10), \TMP1
236	pslldq	   $12, \TMP1
237	psrldq	   $4, %xmm\i
238	pxor	   \TMP1, %xmm\i
239	add	   $4, %r10
240	sub	   $4, %r12
241	jne	   _get_AAD_loop\num_initial_blocks\operation
242
243	cmp	   $16, %r11
244	je	   _get_AAD_loop2_done\num_initial_blocks\operation
245
246	mov	   $16, %r12
247_get_AAD_loop2\num_initial_blocks\operation:
248	psrldq	   $4, %xmm\i
249	sub	   $4, %r12
250	cmp	   %r11, %r12
251	jne	   _get_AAD_loop2\num_initial_blocks\operation
252
253_get_AAD_loop2_done\num_initial_blocks\operation:
254	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
255
256	xor	   %r11, %r11 # initialise the data pointer offset as zero
257
258        # start AES for num_initial_blocks blocks
259
260	mov	   %arg5, %rax                      # %rax = *Y0
261	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
262	PSHUFB_XMM   %xmm14, \XMM0
263
264.if (\i == 5) || (\i == 6) || (\i == 7)
265	MOVADQ		ONE(%RIP),\TMP1
266	MOVADQ		(%arg1),\TMP2
267.irpc index, \i_seq
268	paddd	   \TMP1, \XMM0                 # INCR Y0
269	movdqa	   \XMM0, %xmm\index
270	PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
271	pxor	   \TMP2, %xmm\index
272.endr
273	lea	0x10(%arg1),%r10
274	mov	keysize,%eax
275	shr	$2,%eax				# 128->4, 192->6, 256->8
276	add	$5,%eax			      # 128->9, 192->11, 256->13
277
278aes_loop_initial_dec\num_initial_blocks:
279	MOVADQ	(%r10),\TMP1
280.irpc	index, \i_seq
281	AESENC	\TMP1, %xmm\index
282.endr
283	add	$16,%r10
284	sub	$1,%eax
285	jnz	aes_loop_initial_dec\num_initial_blocks
286
287	MOVADQ	(%r10), \TMP1
288.irpc index, \i_seq
289	AESENCLAST \TMP1, %xmm\index         # Last Round
290.endr
291.irpc index, \i_seq
292	movdqu	   (%arg3 , %r11, 1), \TMP1
293	pxor	   \TMP1, %xmm\index
294	movdqu	   %xmm\index, (%arg2 , %r11, 1)
295	# write back plaintext/ciphertext for num_initial_blocks
296	add	   $16, %r11
297
	# the input block saved in TMP1 is what gets hashed on this path
298	movdqa     \TMP1, %xmm\index
299	PSHUFB_XMM	   %xmm14, %xmm\index
300                # prepare plaintext/ciphertext for GHASH computation
301.endr
302.endif
303	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
304        # apply GHASH on num_initial_blocks blocks
305
306.if \i == 5
307        pxor       %xmm5, %xmm6
308	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
309        pxor       %xmm6, %xmm7
310	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
311        pxor       %xmm7, %xmm8
312	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
313.elseif \i == 6
314        pxor       %xmm6, %xmm7
315	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
316        pxor       %xmm7, %xmm8
317	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
318.elseif \i == 7
319        pxor       %xmm7, %xmm8
320	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
321.endif
322	cmp	   $64, %r13
323	jl	_initial_blocks_done\num_initial_blocks\operation
324	# no need for precomputed values
325/*
326*
327* Precomputations for HashKey parallel with encryption of first 4 blocks.
328* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
329*/
330	MOVADQ	   ONE(%rip), \TMP1
331	paddd	   \TMP1, \XMM0              # INCR Y0
332	MOVADQ	   \XMM0, \XMM1
333	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
334
335	paddd	   \TMP1, \XMM0              # INCR Y0
336	MOVADQ	   \XMM0, \XMM2
337	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
338
339	paddd	   \TMP1, \XMM0              # INCR Y0
340	MOVADQ	   \XMM0, \XMM3
341	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
342
343	paddd	   \TMP1, \XMM0              # INCR Y0
344	MOVADQ	   \XMM0, \XMM4
345	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
346
347	MOVADQ	   0(%arg1),\TMP1
348	pxor	   \TMP1, \XMM1
349	pxor	   \TMP1, \XMM2
350	pxor	   \TMP1, \XMM3
351	pxor	   \TMP1, \XMM4
352	movdqa	   \TMP3, \TMP5
353	pshufd	   $78, \TMP3, \TMP1
354	pxor	   \TMP3, \TMP1
355	movdqa	   \TMP1, HashKey_k(%rsp)
356	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
357# TMP5 = HashKey^2<<1 (mod poly)
358	movdqa	   \TMP5, HashKey_2(%rsp)
359# HashKey_2 = HashKey^2<<1 (mod poly)
360	pshufd	   $78, \TMP5, \TMP1
361	pxor	   \TMP5, \TMP1
362	movdqa	   \TMP1, HashKey_2_k(%rsp)
363.irpc index, 1234 # do 4 rounds
364	movaps 0x10*\index(%arg1), \TMP1
365	AESENC	   \TMP1, \XMM1
366	AESENC	   \TMP1, \XMM2
367	AESENC	   \TMP1, \XMM3
368	AESENC	   \TMP1, \XMM4
369.endr
370	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
371# TMP5 = HashKey^3<<1 (mod poly)
372	movdqa	   \TMP5, HashKey_3(%rsp)
373	pshufd	   $78, \TMP5, \TMP1
374	pxor	   \TMP5, \TMP1
375	movdqa	   \TMP1, HashKey_3_k(%rsp)
376.irpc index, 56789 # do next 5 rounds
377	movaps 0x10*\index(%arg1), \TMP1
378	AESENC	   \TMP1, \XMM1
379	AESENC	   \TMP1, \XMM2
380	AESENC	   \TMP1, \XMM3
381	AESENC	   \TMP1, \XMM4
382.endr
383	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
384# TMP5 = HashKey^4<<1 (mod poly)
385	movdqa	   \TMP5, HashKey_4(%rsp)
386	pshufd	   $78, \TMP5, \TMP1
387	pxor	   \TMP5, \TMP1
388	movdqa	   \TMP1, HashKey_4_k(%rsp)
	# nine rounds are done; run the remaining key-size dependent rounds
389	lea	   0xa0(%arg1),%r10
390	mov	   keysize,%eax
391	shr	   $2,%eax			# 128->4, 192->6, 256->8
392	sub	   $4,%eax			# 128->0, 192->2, 256->4
393	jz	   aes_loop_pre_dec_done\num_initial_blocks
394
395aes_loop_pre_dec\num_initial_blocks:
396	MOVADQ	   (%r10),\TMP2
397.irpc	index, 1234
398	AESENC	   \TMP2, %xmm\index
399.endr
400	add	   $16,%r10
401	sub	   $1,%eax
402	jnz	   aes_loop_pre_dec\num_initial_blocks
403
404aes_loop_pre_dec_done\num_initial_blocks:
405	MOVADQ	   (%r10), \TMP2
406	AESENCLAST \TMP2, \XMM1
407	AESENCLAST \TMP2, \XMM2
408	AESENCLAST \TMP2, \XMM3
409	AESENCLAST \TMP2, \XMM4
	# store each output block, then keep the matching input block for GHASH
410	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
411	pxor	   \TMP1, \XMM1
412	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
413	movdqa     \TMP1, \XMM1
414	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
415	pxor	   \TMP1, \XMM2
416	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
417	movdqa     \TMP1, \XMM2
418	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
419	pxor	   \TMP1, \XMM3
420	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
421	movdqa     \TMP1, \XMM3
422	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
423	pxor	   \TMP1, \XMM4
424	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
425	movdqa     \TMP1, \XMM4
426	add	   $64, %r11
427	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
428	pxor	   \XMMDst, \XMM1
429# combine GHASHed value with the corresponding ciphertext
430	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
431	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
432	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
433
434_initial_blocks_done\num_initial_blocks\operation:
435
436.endm
437
438
439/*
440* if a = number of total plaintext bytes
441* b = floor(a/16)
442* num_initial_blocks = b mod 4
443* encrypt the initial num_initial_blocks blocks and apply ghash on
444* the ciphertext
445* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
446* are clobbered
447* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
448*/
449
450
/*
 * Notes derived from the macro body (encrypt variant):
 *
 * - The i parameter is the number of the XMM register that accumulates the
 *   AAD; i_seq lists, one digit per block, the XMM registers used for the
 *   initial blocks.  Only i = 5, 6 or 7 processes initial blocks (3, 2 or 1
 *   of them); any other value skips that stage.  The hash chain is
 *   hard-wired to end in %xmm8.
 * - TMP3 must hold the hash key (the HK operand of GHASH_MUL) on entry; it
 *   is only read here.
 * - XMM0 is loaded from the address in arg5, byte-swapped, and incremented
 *   once per counter block generated.
 * - XMM1..XMM4 must be %xmm1..%xmm4: the variable-round loop addresses
 *   those registers by number.
 * - The AAD is read 4 bytes at a time and the read loop exits only when the
 *   remaining count is exactly zero; the padding loop steps down from 16 in
 *   units of 4 until it equals aadLen.  aadLen (arg8) therefore has to be
 *   4, 8, 12 or 16.
 * - %r13 is read (signed compare against 64) but not modified; when it is
 *   below 64 the four-block stage and the HashKey power precomputation are
 *   skipped.
 * - %xmm14 is overwritten with SHUF_MASK in addition to the registers
 *   listed above.
 * - On exit %r11 is the byte offset of the data consumed so far.
 * - Stack slots HashKey_2, HashKey_3, HashKey_4 and the four *_k slots are
 *   written; the HashKey slot itself is not written here.
 * - GHASH is computed over the output blocks: the state registers are
 *   byte-swapped in place after being stored.
 */
451.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
452XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
453        MOVADQ     SHUF_MASK(%rip), %xmm14
454	mov	   arg7, %r10           # %r10 = AAD
455	mov	   arg8, %r12           # %r12 = aadLen
456	mov	   %r12, %r11
457	pxor	   %xmm\i, %xmm\i
458_get_AAD_loop\num_initial_blocks\operation:
459	movd	   (%r10), \TMP1
460	pslldq	   $12, \TMP1
461	psrldq	   $4, %xmm\i
462	pxor	   \TMP1, %xmm\i
463	add	   $4, %r10
464	sub	   $4, %r12
465	jne	   _get_AAD_loop\num_initial_blocks\operation
466	cmp	   $16, %r11
467	je	   _get_AAD_loop2_done\num_initial_blocks\operation
468	mov	   $16, %r12
469_get_AAD_loop2\num_initial_blocks\operation:
470	psrldq	   $4, %xmm\i
471	sub	   $4, %r12
472	cmp	   %r11, %r12
473	jne	   _get_AAD_loop2\num_initial_blocks\operation
474_get_AAD_loop2_done\num_initial_blocks\operation:
475	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
476
477	xor	   %r11, %r11 # initialise the data pointer offset as zero
478
479        # start AES for num_initial_blocks blocks
480
481	mov	   %arg5, %rax                      # %rax = *Y0
482	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
483	PSHUFB_XMM   %xmm14, \XMM0
484
485.if (\i == 5) || (\i == 6) || (\i == 7)
486
487	MOVADQ		ONE(%RIP),\TMP1
488	MOVADQ		0(%arg1),\TMP2
489.irpc index, \i_seq
490	paddd		\TMP1, \XMM0                 # INCR Y0
491	MOVADQ		\XMM0, %xmm\index
492	PSHUFB_XMM	%xmm14, %xmm\index      # perform a 16 byte swap
493	pxor		\TMP2, %xmm\index
494.endr
495	lea	0x10(%arg1),%r10
496	mov	keysize,%eax
497	shr	$2,%eax				# 128->4, 192->6, 256->8
498	add	$5,%eax			      # 128->9, 192->11, 256->13
499
500aes_loop_initial_enc\num_initial_blocks:
501	MOVADQ	(%r10),\TMP1
502.irpc	index, \i_seq
503	AESENC	\TMP1, %xmm\index
504.endr
505	add	$16,%r10
506	sub	$1,%eax
507	jnz	aes_loop_initial_enc\num_initial_blocks
508
509	MOVADQ	(%r10), \TMP1
510.irpc index, \i_seq
511	AESENCLAST \TMP1, %xmm\index         # Last Round
512.endr
513.irpc index, \i_seq
514	movdqu	   (%arg3 , %r11, 1), \TMP1
515	pxor	   \TMP1, %xmm\index
516	movdqu	   %xmm\index, (%arg2 , %r11, 1)
517	# write back plaintext/ciphertext for num_initial_blocks
518	add	   $16, %r11
519	PSHUFB_XMM	   %xmm14, %xmm\index
520
521		# prepare plaintext/ciphertext for GHASH computation
522.endr
523.endif
524	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
525        # apply GHASH on num_initial_blocks blocks
526
527.if \i == 5
528        pxor       %xmm5, %xmm6
529	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
530        pxor       %xmm6, %xmm7
531	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
532        pxor       %xmm7, %xmm8
533	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
534.elseif \i == 6
535        pxor       %xmm6, %xmm7
536	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
537        pxor       %xmm7, %xmm8
538	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
539.elseif \i == 7
540        pxor       %xmm7, %xmm8
541	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
542.endif
543	cmp	   $64, %r13
544	jl	_initial_blocks_done\num_initial_blocks\operation
545	# no need for precomputed values
546/*
547*
548* Precomputations for HashKey parallel with encryption of first 4 blocks.
549* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
550*/
551	MOVADQ	   ONE(%RIP),\TMP1
552	paddd	   \TMP1, \XMM0              # INCR Y0
553	MOVADQ	   \XMM0, \XMM1
554	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
555
556	paddd	   \TMP1, \XMM0              # INCR Y0
557	MOVADQ	   \XMM0, \XMM2
558	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
559
560	paddd	   \TMP1, \XMM0              # INCR Y0
561	MOVADQ	   \XMM0, \XMM3
562	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
563
564	paddd	   \TMP1, \XMM0              # INCR Y0
565	MOVADQ	   \XMM0, \XMM4
566	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
567
568	MOVADQ	   0(%arg1),\TMP1
569	pxor	   \TMP1, \XMM1
570	pxor	   \TMP1, \XMM2
571	pxor	   \TMP1, \XMM3
572	pxor	   \TMP1, \XMM4
573	movdqa	   \TMP3, \TMP5
574	pshufd	   $78, \TMP3, \TMP1
575	pxor	   \TMP3, \TMP1
576	movdqa	   \TMP1, HashKey_k(%rsp)
577	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
578# TMP5 = HashKey^2<<1 (mod poly)
579	movdqa	   \TMP5, HashKey_2(%rsp)
580# HashKey_2 = HashKey^2<<1 (mod poly)
581	pshufd	   $78, \TMP5, \TMP1
582	pxor	   \TMP5, \TMP1
583	movdqa	   \TMP1, HashKey_2_k(%rsp)
584.irpc index, 1234 # do 4 rounds
585	movaps 0x10*\index(%arg1), \TMP1
586	AESENC	   \TMP1, \XMM1
587	AESENC	   \TMP1, \XMM2
588	AESENC	   \TMP1, \XMM3
589	AESENC	   \TMP1, \XMM4
590.endr
591	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
592# TMP5 = HashKey^3<<1 (mod poly)
593	movdqa	   \TMP5, HashKey_3(%rsp)
594	pshufd	   $78, \TMP5, \TMP1
595	pxor	   \TMP5, \TMP1
596	movdqa	   \TMP1, HashKey_3_k(%rsp)
597.irpc index, 56789 # do next 5 rounds
598	movaps 0x10*\index(%arg1), \TMP1
599	AESENC	   \TMP1, \XMM1
600	AESENC	   \TMP1, \XMM2
601	AESENC	   \TMP1, \XMM3
602	AESENC	   \TMP1, \XMM4
603.endr
604	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
605# TMP5 = HashKey^4<<1 (mod poly)
606	movdqa	   \TMP5, HashKey_4(%rsp)
607	pshufd	   $78, \TMP5, \TMP1
608	pxor	   \TMP5, \TMP1
609	movdqa	   \TMP1, HashKey_4_k(%rsp)
	# nine rounds are done; run the remaining key-size dependent rounds
610	lea	   0xa0(%arg1),%r10
611	mov	   keysize,%eax
612	shr	   $2,%eax			# 128->4, 192->6, 256->8
613	sub	   $4,%eax			# 128->0, 192->2, 256->4
614	jz	   aes_loop_pre_enc_done\num_initial_blocks
615
616aes_loop_pre_enc\num_initial_blocks:
617	MOVADQ	   (%r10),\TMP2
618.irpc	index, 1234
619	AESENC	   \TMP2, %xmm\index
620.endr
621	add	   $16,%r10
622	sub	   $1,%eax
623	jnz	   aes_loop_pre_enc\num_initial_blocks
624
625aes_loop_pre_enc_done\num_initial_blocks:
626	MOVADQ	   (%r10), \TMP2
627	AESENCLAST \TMP2, \XMM1
628	AESENCLAST \TMP2, \XMM2
629	AESENCLAST \TMP2, \XMM3
630	AESENCLAST \TMP2, \XMM4
631	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
632	pxor	   \TMP1, \XMM1
633	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
634	pxor	   \TMP1, \XMM2
635	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
636	pxor	   \TMP1, \XMM3
637	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
638	pxor	   \TMP1, \XMM4
639	movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
640	movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
641	movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
642	movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
643
644	add	   $64, %r11
645	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
646	pxor	   \XMMDst, \XMM1
647# combine GHASHed value with the corresponding ciphertext
648	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
649	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
650	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
651
652_initial_blocks_done\num_initial_blocks\operation:
653
654.endm
655
656/*
657* encrypt 4 blocks at a time
658* ghash the 4 previously encrypted ciphertext blocks
659* arg1, %arg2, %arg3 are used as pointers only, not modified
660* %r11 is the data offset value
661*/
/*
 * Notes derived from the macro body:
 *
 * - On entry XMM1..XMM4 hold the four blocks to be hashed; they are copied
 *   to XMM5..XMM8 and multiplied by the HashKey_4, HashKey_3, HashKey_2 and
 *   HashKey stack slots respectively, interleaved with the AES rounds for
 *   the next four counter blocks.
 * - XMM0 is the counter; it is advanced by four.
 * - On exit XMM1..XMM4 hold the byte-swapped new output blocks, with the
 *   reduced hash value XORed into XMM1.
 * - XMM1..XMM4 must be %xmm1..%xmm4: the variable-round loop addresses
 *   those registers by number.
 * - %xmm15 (loaded with SHUF_MASK), %r10, %eax, TMP1..TMP6 and XMM5..XMM8
 *   are overwritten.  %r11 is read but not advanced here.
 * - The labels aes_loop_par_enc and aes_loop_par_enc_done carry no macro
 *   parameter, so the macro can be expanded only once per assembly unit.
 * - The operation parameter is not referenced in the body.
 */
662.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
663TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
664
665	movdqa	  \XMM1, \XMM5
666	movdqa	  \XMM2, \XMM6
667	movdqa	  \XMM3, \XMM7
668	movdqa	  \XMM4, \XMM8
669
670        movdqa    SHUF_MASK(%rip), %xmm15
671        # multiply XMM5 * HashKey^4 using karatsuba
672
673	movdqa	  \XMM5, \TMP4
674	pshufd	  $78, \XMM5, \TMP6
675	pxor	  \XMM5, \TMP6
676	paddd     ONE(%rip), \XMM0		# INCR CNT
677	movdqa	  HashKey_4(%rsp), \TMP5
678	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
679	movdqa    \XMM0, \XMM1
680	paddd     ONE(%rip), \XMM0		# INCR CNT
681	movdqa    \XMM0, \XMM2
682	paddd     ONE(%rip), \XMM0		# INCR CNT
683	movdqa    \XMM0, \XMM3
684	paddd     ONE(%rip), \XMM0		# INCR CNT
685	movdqa    \XMM0, \XMM4
686	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
687	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
688	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
689	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
690	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
691
692	pxor	  (%arg1), \XMM1
693	pxor	  (%arg1), \XMM2
694	pxor	  (%arg1), \XMM3
695	pxor	  (%arg1), \XMM4
696	movdqa	  HashKey_4_k(%rsp), \TMP5
697	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
698	movaps 0x10(%arg1), \TMP1
699	AESENC	  \TMP1, \XMM1              # Round 1
700	AESENC	  \TMP1, \XMM2
701	AESENC	  \TMP1, \XMM3
702	AESENC	  \TMP1, \XMM4
703	movaps 0x20(%arg1), \TMP1
704	AESENC	  \TMP1, \XMM1              # Round 2
705	AESENC	  \TMP1, \XMM2
706	AESENC	  \TMP1, \XMM3
707	AESENC	  \TMP1, \XMM4
	# multiply XMM6 * HashKey^3 using karatsuba
708	movdqa	  \XMM6, \TMP1
709	pshufd	  $78, \XMM6, \TMP2
710	pxor	  \XMM6, \TMP2
711	movdqa	  HashKey_3(%rsp), \TMP5
712	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
713	movaps 0x30(%arg1), \TMP3
714	AESENC    \TMP3, \XMM1              # Round 3
715	AESENC    \TMP3, \XMM2
716	AESENC    \TMP3, \XMM3
717	AESENC    \TMP3, \XMM4
718	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
719	movaps 0x40(%arg1), \TMP3
720	AESENC	  \TMP3, \XMM1              # Round 4
721	AESENC	  \TMP3, \XMM2
722	AESENC	  \TMP3, \XMM3
723	AESENC	  \TMP3, \XMM4
724	movdqa	  HashKey_3_k(%rsp), \TMP5
725	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
726	movaps 0x50(%arg1), \TMP3
727	AESENC	  \TMP3, \XMM1              # Round 5
728	AESENC	  \TMP3, \XMM2
729	AESENC	  \TMP3, \XMM3
730	AESENC	  \TMP3, \XMM4
731	pxor	  \TMP1, \TMP4
732# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
733	pxor	  \XMM6, \XMM5
734	pxor	  \TMP2, \TMP6
735	movdqa	  \XMM7, \TMP1
736	pshufd	  $78, \XMM7, \TMP2
737	pxor	  \XMM7, \TMP2
738	movdqa	  HashKey_2(%rsp ), \TMP5
739
740        # Multiply XMM7 * HashKey^2 using karatsuba
741
742	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
743	movaps 0x60(%arg1), \TMP3
744	AESENC	  \TMP3, \XMM1              # Round 6
745	AESENC	  \TMP3, \XMM2
746	AESENC	  \TMP3, \XMM3
747	AESENC	  \TMP3, \XMM4
748	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
749	movaps 0x70(%arg1), \TMP3
750	AESENC	  \TMP3, \XMM1             # Round 7
751	AESENC	  \TMP3, \XMM2
752	AESENC	  \TMP3, \XMM3
753	AESENC	  \TMP3, \XMM4
754	movdqa	  HashKey_2_k(%rsp), \TMP5
755	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
756	movaps 0x80(%arg1), \TMP3
757	AESENC	  \TMP3, \XMM1             # Round 8
758	AESENC	  \TMP3, \XMM2
759	AESENC	  \TMP3, \XMM3
760	AESENC	  \TMP3, \XMM4
761	pxor	  \TMP1, \TMP4
762# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
763	pxor	  \XMM7, \XMM5
764	pxor	  \TMP2, \TMP6
765
766        # Multiply XMM8 * HashKey
767        # XMM8 and TMP5 hold the values for the two operands
768
769	movdqa	  \XMM8, \TMP1
770	pshufd	  $78, \XMM8, \TMP2
771	pxor	  \XMM8, \TMP2
772	movdqa	  HashKey(%rsp), \TMP5
773	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
774	movaps 0x90(%arg1), \TMP3
775	AESENC	  \TMP3, \XMM1            # Round 9
776	AESENC	  \TMP3, \XMM2
777	AESENC	  \TMP3, \XMM3
778	AESENC	  \TMP3, \XMM4
779	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
	# run the remaining key-size dependent rounds (none for 128-bit keys)
780	lea	  0xa0(%arg1),%r10
781	mov	  keysize,%eax
782	shr	  $2,%eax			# 128->4, 192->6, 256->8
783	sub	  $4,%eax			# 128->0, 192->2, 256->4
784	jz	  aes_loop_par_enc_done
785
786aes_loop_par_enc:
787	MOVADQ	  (%r10),\TMP3
788.irpc	index, 1234
789	AESENC	  \TMP3, %xmm\index
790.endr
791	add	  $16,%r10
792	sub	  $1,%eax
793	jnz	  aes_loop_par_enc
794
795aes_loop_par_enc_done:
796	MOVADQ	  (%r10), \TMP3
797	AESENCLAST \TMP3, \XMM1           # Last round
798	AESENCLAST \TMP3, \XMM2
799	AESENCLAST \TMP3, \XMM3
800	AESENCLAST \TMP3, \XMM4
801	movdqa    HashKey_k(%rsp), \TMP5
802	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
803	movdqu	  (%arg3,%r11,1), \TMP3
804	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
805	movdqu	  16(%arg3,%r11,1), \TMP3
806	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
807	movdqu	  32(%arg3,%r11,1), \TMP3
808	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
809	movdqu	  48(%arg3,%r11,1), \TMP3
810	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
811        movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
812        movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
813        movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
814        movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
815	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
816	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
817	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
818	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
819
820	pxor	  \TMP4, \TMP1
821	pxor	  \XMM8, \XMM5
822	pxor	  \TMP6, \TMP2
823	pxor	  \TMP1, \TMP2
824	pxor	  \XMM5, \TMP2
825	movdqa	  \TMP2, \TMP3
826	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
827	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
828	pxor	  \TMP3, \XMM5
829	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
830
831        # first phase of reduction
832
833	movdqa    \XMM5, \TMP2
834	movdqa    \XMM5, \TMP3
835	movdqa    \XMM5, \TMP4
836# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
837	pslld     $31, \TMP2                   # packed left shift << 31
838	pslld     $30, \TMP3                   # packed left shift << 30
839	pslld     $25, \TMP4                   # packed left shift << 25
840	pxor      \TMP3, \TMP2	               # xor the shifted versions
841	pxor      \TMP4, \TMP2
842	movdqa    \TMP2, \TMP5
843	psrldq    $4, \TMP5                    # right shift T5 1 DW
844	pslldq    $12, \TMP2                   # left shift T2 3 DWs
845	pxor      \TMP2, \XMM5
846
847        # second phase of reduction
848
849	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
850	movdqa    \XMM5,\TMP3
851	movdqa    \XMM5,\TMP4
852	psrld     $1, \TMP2                    # packed right shift >>1
853	psrld     $2, \TMP3                    # packed right shift >>2
854	psrld     $7, \TMP4                    # packed right shift >>7
855	pxor      \TMP3,\TMP2		       # xor the shifted versions
856	pxor      \TMP4,\TMP2
857	pxor      \TMP5, \TMP2
858	pxor      \TMP2, \XMM5
859	pxor      \TMP1, \XMM5                 # result is in XMM5
860
	# fold the reduced hash into the first new block for the next pass
861	pxor	  \XMM5, \XMM1
862.endm
863
864/*
865* decrypt 4 blocks at a time
866* ghash the 4 previously decrypted ciphertext blocks
867* arg1, %arg2, %arg3 are used as pointers only, not modified
868* %r11 is the data offset value
869*/
870.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
871TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
872
873	movdqa	  \XMM1, \XMM5
874	movdqa	  \XMM2, \XMM6
875	movdqa	  \XMM3, \XMM7
876	movdqa	  \XMM4, \XMM8
877
878        movdqa    SHUF_MASK(%rip), %xmm15
879        # multiply TMP5 * HashKey using karatsuba
880
881	movdqa	  \XMM5, \TMP4
882	pshufd	  $78, \XMM5, \TMP6
883	pxor	  \XMM5, \TMP6
884	paddd     ONE(%rip), \XMM0		# INCR CNT
885	movdqa	  HashKey_4(%rsp), \TMP5
886	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
887	movdqa    \XMM0, \XMM1
888	paddd     ONE(%rip), \XMM0		# INCR CNT
889	movdqa    \XMM0, \XMM2
890	paddd     ONE(%rip), \XMM0		# INCR CNT
891	movdqa    \XMM0, \XMM3
892	paddd     ONE(%rip), \XMM0		# INCR CNT
893	movdqa    \XMM0, \XMM4
894	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
895	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
896	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
897	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
898	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
899
900	pxor	  (%arg1), \XMM1
901	pxor	  (%arg1), \XMM2
902	pxor	  (%arg1), \XMM3
903	pxor	  (%arg1), \XMM4
904	movdqa	  HashKey_4_k(%rsp), \TMP5
905	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
906	movaps 0x10(%arg1), \TMP1
907	AESENC	  \TMP1, \XMM1              # Round 1
908	AESENC	  \TMP1, \XMM2
909	AESENC	  \TMP1, \XMM3
910	AESENC	  \TMP1, \XMM4
911	movaps 0x20(%arg1), \TMP1
912	AESENC	  \TMP1, \XMM1              # Round 2
913	AESENC	  \TMP1, \XMM2
914	AESENC	  \TMP1, \XMM3
915	AESENC	  \TMP1, \XMM4
916	movdqa	  \XMM6, \TMP1
917	pshufd	  $78, \XMM6, \TMP2
918	pxor	  \XMM6, \TMP2
919	movdqa	  HashKey_3(%rsp), \TMP5
920	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
921	movaps 0x30(%arg1), \TMP3
922	AESENC    \TMP3, \XMM1              # Round 3
923	AESENC    \TMP3, \XMM2
924	AESENC    \TMP3, \XMM3
925	AESENC    \TMP3, \XMM4
926	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
927	movaps 0x40(%arg1), \TMP3
928	AESENC	  \TMP3, \XMM1              # Round 4
929	AESENC	  \TMP3, \XMM2
930	AESENC	  \TMP3, \XMM3
931	AESENC	  \TMP3, \XMM4
932	movdqa	  HashKey_3_k(%rsp), \TMP5
933	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
934	movaps 0x50(%arg1), \TMP3
935	AESENC	  \TMP3, \XMM1              # Round 5
936	AESENC	  \TMP3, \XMM2
937	AESENC	  \TMP3, \XMM3
938	AESENC	  \TMP3, \XMM4
939	pxor	  \TMP1, \TMP4
940# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
941	pxor	  \XMM6, \XMM5
942	pxor	  \TMP2, \TMP6
943	movdqa	  \XMM7, \TMP1
944	pshufd	  $78, \XMM7, \TMP2
945	pxor	  \XMM7, \TMP2
946	movdqa	  HashKey_2(%rsp ), \TMP5
947
948        # Multiply TMP5 * HashKey using karatsuba
949
950	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
951	movaps 0x60(%arg1), \TMP3
952	AESENC	  \TMP3, \XMM1              # Round 6
953	AESENC	  \TMP3, \XMM2
954	AESENC	  \TMP3, \XMM3
955	AESENC	  \TMP3, \XMM4
956	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
957	movaps 0x70(%arg1), \TMP3
958	AESENC	  \TMP3, \XMM1             # Round 7
959	AESENC	  \TMP3, \XMM2
960	AESENC	  \TMP3, \XMM3
961	AESENC	  \TMP3, \XMM4
962	movdqa	  HashKey_2_k(%rsp), \TMP5
963	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
964	movaps 0x80(%arg1), \TMP3
965	AESENC	  \TMP3, \XMM1             # Round 8
966	AESENC	  \TMP3, \XMM2
967	AESENC	  \TMP3, \XMM3
968	AESENC	  \TMP3, \XMM4
969	pxor	  \TMP1, \TMP4
970# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
971	pxor	  \XMM7, \XMM5
972	pxor	  \TMP2, \TMP6
973
974        # Multiply XMM8 * HashKey
975        # XMM8 and TMP5 hold the values for the two operands
976
977	movdqa	  \XMM8, \TMP1
978	pshufd	  $78, \XMM8, \TMP2
979	pxor	  \XMM8, \TMP2
980	movdqa	  HashKey(%rsp), \TMP5
981	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
982	movaps 0x90(%arg1), \TMP3
983	AESENC	  \TMP3, \XMM1            # Round 9
984	AESENC	  \TMP3, \XMM2
985	AESENC	  \TMP3, \XMM3
986	AESENC	  \TMP3, \XMM4
987	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
988	lea	  0xa0(%arg1),%r10
989	mov	  keysize,%eax
990	shr	  $2,%eax		        # 128->4, 192->6, 256->8
991	sub	  $4,%eax			# 128->0, 192->2, 256->4
992	jz	  aes_loop_par_dec_done
993
994aes_loop_par_dec:
995	MOVADQ	  (%r10),\TMP3
996.irpc	index, 1234
997	AESENC	  \TMP3, %xmm\index
998.endr
999	add	  $16,%r10
1000	sub	  $1,%eax
1001	jnz	  aes_loop_par_dec
1002
1003aes_loop_par_dec_done:
1004	MOVADQ	  (%r10), \TMP3
1005	AESENCLAST \TMP3, \XMM1           # last round
1006	AESENCLAST \TMP3, \XMM2
1007	AESENCLAST \TMP3, \XMM3
1008	AESENCLAST \TMP3, \XMM4
1009	movdqa    HashKey_k(%rsp), \TMP5
1010	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1011	movdqu	  (%arg3,%r11,1), \TMP3
1012	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1013	movdqu	  \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
1014	movdqa    \TMP3, \XMM1
1015	movdqu	  16(%arg3,%r11,1), \TMP3
1016	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1017	movdqu	  \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
1018	movdqa    \TMP3, \XMM2
1019	movdqu	  32(%arg3,%r11,1), \TMP3
1020	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1021	movdqu	  \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
1022	movdqa    \TMP3, \XMM3
1023	movdqu	  48(%arg3,%r11,1), \TMP3
1024	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1025	movdqu	  \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
1026	movdqa    \TMP3, \XMM4
1027	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1028	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
1029	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
1030	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
1031
1032	pxor	  \TMP4, \TMP1
1033	pxor	  \XMM8, \XMM5
1034	pxor	  \TMP6, \TMP2
1035	pxor	  \TMP1, \TMP2
1036	pxor	  \XMM5, \TMP2
1037	movdqa	  \TMP2, \TMP3
1038	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1039	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1040	pxor	  \TMP3, \XMM5
1041	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1042
1043        # first phase of reduction
1044
1045	movdqa    \XMM5, \TMP2
1046	movdqa    \XMM5, \TMP3
1047	movdqa    \XMM5, \TMP4
1048# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1049	pslld     $31, \TMP2                   # packed right shift << 31
1050	pslld     $30, \TMP3                   # packed right shift << 30
1051	pslld     $25, \TMP4                   # packed right shift << 25
1052	pxor      \TMP3, \TMP2	               # xor the shifted versions
1053	pxor      \TMP4, \TMP2
1054	movdqa    \TMP2, \TMP5
1055	psrldq    $4, \TMP5                    # right shift T5 1 DW
1056	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1057	pxor      \TMP2, \XMM5
1058
1059        # second phase of reduction
1060
1061	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1062	movdqa    \XMM5,\TMP3
1063	movdqa    \XMM5,\TMP4
1064	psrld     $1, \TMP2                    # packed left shift >>1
1065	psrld     $2, \TMP3                    # packed left shift >>2
1066	psrld     $7, \TMP4                    # packed left shift >>7
1067	pxor      \TMP3,\TMP2		       # xor the shifted versions
1068	pxor      \TMP4,\TMP2
1069	pxor      \TMP5, \TMP2
1070	pxor      \TMP2, \XMM5
1071	pxor      \TMP1, \XMM5                 # result is in TMP1
1072
1073	pxor	  \XMM5, \XMM1
1074.endm
1075
1076/* GHASH the last 4 ciphertext blocks.
 *
 * Inputs:
 *   XMM1..XMM4   the four blocks to hash. XMM1 is multiplied by HashKey_4,
 *                XMM2 by HashKey_3, XMM3 by HashKey_2 and XMM4 by HashKey,
 *                all loaded from the frame at %rsp (presumably successive
 *                powers of the hash key prepared by the INITIAL_BLOCKS
 *                macros, which are not part of this macro - TODO confirm).
 *   HashKey_*_k  frame slots supplying the second operand of the Karatsuba
 *                middle product, i.e. the (b1+b0) term of the comments below.
 * Output:
 *   XMMDst       XOR of the four carry-less products, reduced to 128 bits.
 *                Its previous content is overwritten, not accumulated.
 * Clobbers:
 *   TMP1-TMP7 and XMM1-XMM4.
 */
1077.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1078TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1079
1080        # Multiply XMM1 * HashKey_4 (using Karatsuba)
1081
1082	movdqa	  \XMM1, \TMP6
1083	pshufd	  $78, \XMM1, \TMP2        # swap the 64-bit halves of XMM1
1084	pxor	  \XMM1, \TMP2             # TMP2 = a1+a0 in both halves
1085	movdqa	  HashKey_4(%rsp), \TMP5
1086	PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1087	PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1088	movdqa	  HashKey_4_k(%rsp), \TMP4
1089	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1090	movdqa	  \XMM1, \XMMDst
1091	movdqa	  \TMP2, \XMM1              # high in TMP6, low in XMMDst, middle in XMM1
1092
1093        # Multiply XMM2 * HashKey_3 (using Karatsuba)
1094
1095	movdqa	  \XMM2, \TMP1
1096	pshufd	  $78, \XMM2, \TMP2
1097	pxor	  \XMM2, \TMP2
1098	movdqa	  HashKey_3(%rsp), \TMP5
1099	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1100	PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1101	movdqa	  HashKey_3_k(%rsp), \TMP4
1102	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1103	pxor	  \TMP1, \TMP6
1104	pxor	  \XMM2, \XMMDst
1105	pxor	  \TMP2, \XMM1
1106# results accumulated in TMP6 (high), XMMDst (low), XMM1 (middle)
1107
1108        # Multiply XMM3 * HashKey_2 (using Karatsuba)
1109
1110	movdqa	  \XMM3, \TMP1
1111	pshufd	  $78, \XMM3, \TMP2
1112	pxor	  \XMM3, \TMP2
1113	movdqa	  HashKey_2(%rsp), \TMP5
1114	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1115	PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1116	movdqa	  HashKey_2_k(%rsp), \TMP4
1117	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1118	pxor	  \TMP1, \TMP6
1119	pxor	  \XMM3, \XMMDst
1120	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1121
1122        # Multiply XMM4 * HashKey (using Karatsuba)
1123	movdqa	  \XMM4, \TMP1
1124	pshufd	  $78, \XMM4, \TMP2
1125	pxor	  \XMM4, \TMP2
1126	movdqa	  HashKey(%rsp), \TMP5
1127	PCLMULQDQ 0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
1128	PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1129	movdqa	  HashKey_k(%rsp), \TMP4
1130	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1131	pxor	  \TMP1, \TMP6
1132	pxor	  \XMM4, \XMMDst
1133	pxor	  \XMM1, \TMP2               # TMP2 = sum of all four middle products
1134	pxor	  \TMP6, \TMP2
1135	pxor	  \XMMDst, \TMP2
1136	# middle section of the temp results combined as in karatsuba algorithm
1137	movdqa	  \TMP2, \TMP4
1138	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
1139	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
1140	pxor	  \TMP4, \XMMDst
1141	pxor	  \TMP2, \TMP6
1142# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1143	# first phase of the reduction
1144	movdqa    \XMMDst, \TMP2
1145	movdqa    \XMMDst, \TMP3
1146	movdqa    \XMMDst, \TMP4
1147# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1148	pslld     $31, \TMP2                # packed left shift << 31
1149	pslld     $30, \TMP3                # packed left shift << 30
1150	pslld     $25, \TMP4                # packed left shift << 25
1151	pxor      \TMP3, \TMP2              # xor the shifted versions
1152	pxor      \TMP4, \TMP2
1153	movdqa    \TMP2, \TMP7
1154	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1155	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1156	pxor      \TMP2, \XMMDst
1157
1158        # second phase of the reduction
1159	movdqa    \XMMDst, \TMP2
1160	# make 3 copies of XMMDst for doing 3 shift operations
1161	movdqa    \XMMDst, \TMP3
1162	movdqa    \XMMDst, \TMP4
1163	psrld     $1, \TMP2                 # packed right shift >> 1
1164	psrld     $2, \TMP3                 # packed right shift >> 2
1165	psrld     $7, \TMP4                 # packed right shift >> 7
1166	pxor      \TMP3, \TMP2              # xor the shifted versions
1167	pxor      \TMP4, \TMP2
1168	pxor      \TMP7, \TMP2              # fold in the DW saved by the first phase
1169	pxor      \TMP2, \XMMDst
1170	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1171.endm
1172
1173
/*
 * ENCRYPT_SINGLE_BLOCK XMM0 TMP1
 *
 * AES-encrypt the one block held in XMM0, in place.
 *
 *   XMM0   in: block to encrypt, out: encrypted block
 *   TMP1   scratch xmm register, receives each round key in turn
 *
 * The key schedule is read through %arg1: round key 0 is XORed into the
 * block, then keysize/4 + 5 AESENC rounds and one AESENCLAST follow, each
 * taking the next 16-byte round key.  keysize is an operand defined elsewhere
 * in this file (presumably the key length in bytes: 16, 24 or 32).
 *
 * Clobbers %eax (round counter), %r10 (round key pointer), TMP1 and flags.
 */
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	lea		16(%arg1), %r10		# %r10 -> round key 1
	mov		keysize, %eax
	shr		$2, %eax		# 128->4, 192->6, 256->8
	add		$5, %eax		# 128->9, 192->11, 256->13 AESENC rounds
	pxor		(%arg1), \XMM0		# round 0: XOR with round key 0

_esb_loop_\@:
	MOVADQ		(%r10), \TMP1
	add		$16, %r10		# step to the next round key
	AESENC		\TMP1, \XMM0
	sub		$1, %eax
	jnz		_esb_loop_\@

	MOVADQ		(%r10), \TMP1		# final round key
	AESENCLAST	\TMP1, \XMM0
.endm
1196/*****************************************************************************
1197* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1198*                   u8 *out,           // Plaintext output. Decrypt in-place is allowed.
1199*                   const u8 *in,      // Ciphertext input
1200*                   u64 plaintext_len, // Length of data in bytes for decryption.
1201*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1202*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1203*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1204*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1205*                   const u8 *aad,     // Additional Authentication Data (AAD)
1206*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1207*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1208*                                      // given authentication tag and only return the plaintext if they match.
1209*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1210*                                      // (most likely), 12 or 8.
1211*
1212* Assumptions:
1213*
1214* keys:
1215*       keys are pre-expanded and aligned to 16 bytes. The round keys are
1216*       read from the start of void *aes_ctx; the number of rounds follows
*       keysize (11, 13 or 15 round keys, see ENCRYPT_SINGLE_BLOCK).
1217*
1218* iv:
1219*       0                   1                   2                   3
1220*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1221*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1222*       |                             Salt  (From the SA)               |
1223*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1224*       |                     Initialization Vector                     |
1225*       |         (This is the sequence number from IPSec header)       |
1226*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1227*       |                              0x1                              |
1228*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1229*
1230*
1231*
1232* AAD:
1233*       AAD padded to 128 bits with 0
1234*       for example, assume AAD is a u32 vector
1235*
1236*       if AAD is 8 bytes:
1237*       AAD[3] = {A0, A1};
1238*       padded AAD in xmm register = {A1 A0 0 0}
1239*
1240*       0                   1                   2                   3
1241*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1242*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1243*       |                               SPI (A1)                        |
1244*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1245*       |                     32-bit Sequence Number (A0)               |
1246*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1247*       |                              0x0                              |
1248*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1249*
1250*                                       AAD Format with 32-bit Sequence Number
1251*
1252*       if AAD is 12 bytes:
1253*       AAD[3] = {A0, A1, A2};
1254*       padded AAD in xmm register = {A2 A1 A0 0}
1255*
1256*       0                   1                   2                   3
1257*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1258*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1261*       |                               SPI (A2)                        |
1262*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1263*       |                 64-bit Extended Sequence Number {A1,A0}       |
1264*       |                                                               |
1265*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1266*       |                              0x0                              |
1267*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1268*
1269*                        AAD Format with 64-bit Extended Sequence Number
1270*
1271* aadLen:
1272*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
1273*       The code supports 16 too but for other sizes, the code will fail.
1274*
1275* TLen:
1276*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1277*       For other sizes, the code will fail.
1278*
1279* poly = x^128 + x^127 + x^126 + x^121 + 1
1280*
* Register usage in the body (as far as this function shows):
*       %r12-%r14  saved on entry and restored on exit
*       %r14       entry %rsp, restored in the epilogue (the frame is realigned)
*       %r13       bytes of whole blocks still to process, later len mod 16
*       %r11       byte offset into in/out (not initialised in this body,
*                  presumably by INITIAL_BLOCKS_DEC - TODO confirm)
*       %xmm0      counter block
*       %xmm8      running GHASH value
*       %xmm13     HashKey<<1 (mod poly)
*
1281*****************************************************************************/
1282ENTRY(aesni_gcm_dec)
1283	push	%r12
1284	push	%r13
1285	push	%r14
1286	mov	%rsp, %r14			  # %r14 = entry %rsp, restored before the pops
1287/*
1288* states of %xmm registers %xmm6:%xmm15 not saved
1289* all %xmm registers are clobbered
1290*/
1291	sub	$VARIABLE_OFFSET, %rsp
1292	and	$~63, %rsp                        # align rsp to 64 bytes
1293	mov	%arg6, %r12			  # %r12 = hash_subkey pointer
1294	movdqu	(%r12), %xmm13			  # %xmm13 = HashKey
1295        movdqa  SHUF_MASK(%rip), %xmm2
1296	PSHUFB_XMM %xmm2, %xmm13		  # byte-shuffle HashKey with SHUF_MASK
1297
1298
1299# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1300
1301	movdqa	%xmm13, %xmm2
1302	psllq	$1, %xmm13			  # shift each 64-bit half left by 1
1303	psrlq	$63, %xmm2			  # %xmm2 = top bit of each half
1304	movdqa	%xmm2, %xmm1
1305	pslldq	$8, %xmm2			  # carry of the low half moves into the high half
1306	psrldq	$8, %xmm1			  # %xmm1 = bit shifted out of the high half
1307	por	%xmm2, %xmm13			  # 128-bit value shifted left by 1
1308
1309        # Reduction
1310
/* A mask derived from the shifted-out bit selects POLY; TWOONE and POLY are
 * constants defined elsewhere in the file. */
1311	pshufd	$0x24, %xmm1, %xmm2
1312	pcmpeqd TWOONE(%rip), %xmm2
1313	pand	POLY(%rip), %xmm2
1314	pxor	%xmm2, %xmm13     # %xmm13 holds the HashKey<<1 (mod poly)
1315
1316
1317        # Decrypt first few blocks
1318
1319	movdqa %xmm13, HashKey(%rsp)           # store HashKey<<1 (mod poly)
1320	mov %arg4, %r13    # save the number of bytes of plaintext/ciphertext
1321	and $-16, %r13                      # %r13 = %r13 - (%r13 mod 16)
1322	mov %r13, %r12
1323	and $(3<<4), %r12                   # %r12 = 16 * (whole blocks mod 4)
1324	jz _initial_num_blocks_is_0_decrypt
1325	cmp $(2<<4), %r12
1326	jb _initial_num_blocks_is_1_decrypt
1327	je _initial_num_blocks_is_2_decrypt
1328_initial_num_blocks_is_3_decrypt:
1329	INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1330%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1331	sub	$48, %r13
1332	jmp	_initial_blocks_decrypted
1333_initial_num_blocks_is_2_decrypt:
1334	INITIAL_BLOCKS_DEC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1335%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1336	sub	$32, %r13
1337	jmp	_initial_blocks_decrypted
1338_initial_num_blocks_is_1_decrypt:
1339	INITIAL_BLOCKS_DEC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1340%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1341	sub	$16, %r13
1342	jmp	_initial_blocks_decrypted
1343_initial_num_blocks_is_0_decrypt:
1344	INITIAL_BLOCKS_DEC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1345%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1346_initial_blocks_decrypted:
/* %r13 is now a multiple of 64.  When exactly 64 bytes remain only
 * GHASH_LAST_4 runs on %xmm1-%xmm4, so those four blocks were presumably
 * already decrypted by INITIAL_BLOCKS_DEC - TODO confirm against the macro. */
1347	cmp	$0, %r13
1348	je	_zero_cipher_left_decrypt
1349	sub	$64, %r13
1350	je	_four_cipher_left_decrypt
1351_decrypt_by_4:
1352	GHASH_4_ENCRYPT_4_PARALLEL_DEC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1353%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1354	add	$64, %r11			# advance the data offset by four blocks
1355	sub	$64, %r13
1356	jne	_decrypt_by_4
1357_four_cipher_left_decrypt:
1358	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1359%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1360_zero_cipher_left_decrypt:
1361	mov	%arg4, %r13
1362	and	$15, %r13				# %r13 = arg4 (mod 16)
1363	je	_multiple_of_16_bytes_decrypt
1364
1365        # Handle the last <16 byte block separately
1366
1367	paddd ONE(%rip), %xmm0         # increment CNT to get Yn
1368        movdqa SHUF_MASK(%rip), %xmm10
1369	PSHUFB_XMM %xmm10, %xmm0
1370
1371	ENCRYPT_SINGLE_BLOCK  %xmm0, %xmm1    # E(K, Yn)
/* The 16-byte load below ends at the last input byte, so it starts
 * 16 - %r13 bytes before the partial block.
 * NOTE(review): if no whole block precedes the partial one (total length
 * below 16, %r11 presumably still 0) the load starts before `in`; confirm
 * that every caller guarantees readable bytes there. */
1372	sub $16, %r11
1373	add %r13, %r11
1374	movdqu (%arg3,%r11,1), %xmm1   # receive the last <16 byte block
1375	lea SHIFT_MASK+16(%rip), %r12
1376	sub %r13, %r12
1377# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1378# (%r13 is the number of bytes in plaintext mod 16)
1379	movdqu (%r12), %xmm2           # get the appropriate shuffle mask
1380	PSHUFB_XMM %xmm2, %xmm1            # right shift 16-%r13 bytes
1381
1382	movdqa  %xmm1, %xmm2         # keep the ciphertext: it is what gets hashed
1383	pxor %xmm1, %xmm0            # Ciphertext XOR E(K, Yn)
1384	movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1385	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1386	pand %xmm1, %xmm0            # mask out top 16-%r13 bytes of %xmm0
1387	pand    %xmm1, %xmm2         # same mask on the ciphertext copy
1388        movdqa SHUF_MASK(%rip), %xmm10
1389	PSHUFB_XMM %xmm10 ,%xmm2
1390
1391	pxor %xmm2, %xmm8
1392	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1393	          # GHASH computation for the last <16 byte block
1394	sub %r13, %r11               # undo the adjustment made before the load:
1395	add $16, %r11                # %r11 = offset of the partial block again
1396
1397        # output %r13 bytes
1398	MOVQ_R64_XMM	%xmm0, %rax
1399	cmp	$8, %r13
1400	jle	_less_than_8_bytes_left_decrypt
1401	mov	%rax, (%arg2 , %r11, 1)   # more than 8 bytes: store the low 8 at once
1402	add	$8, %r11
1403	psrldq	$8, %xmm0
1404	MOVQ_R64_XMM	%xmm0, %rax
1405	sub	$8, %r13
1406_less_than_8_bytes_left_decrypt:
1407	mov	%al,  (%arg2, %r11, 1)    # store the remaining 1..8 bytes one by one
1408	add	$1, %r11
1409	shr	$8, %rax
1410	sub	$1, %r13
1411	jne	_less_than_8_bytes_left_decrypt
1412_multiple_of_16_bytes_decrypt:
1413	mov	arg8, %r12		  # %r12 = aadLen (number of bytes)
1414	shl	$3, %r12		  # convert into number of bits
1415	movd	%r12d, %xmm15		  # len(A) in %xmm15 (low 32 bits of %r12 only)
1416	shl	$3, %arg4		  # len(C) in bits (bytes * 8)
1417	MOVQ_R64_XMM	%arg4, %xmm1
1418	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
1419	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
1420	pxor	%xmm15, %xmm8
1421	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1422	         # final GHASH computation
1423        movdqa SHUF_MASK(%rip), %xmm10
1424	PSHUFB_XMM %xmm10, %xmm8
1425
1426	mov	%arg5, %rax		  # %rax = pointer to Y0 (the iv argument)
1427	movdqu	(%rax), %xmm0		  # %xmm0 = Y0
1428	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
1429	pxor	%xmm8, %xmm0		  # tag = E(K, Y0) XOR GHASH
1430_return_T_decrypt:
1431	mov	arg9, %r10                # %r10 = authTag
1432	mov	arg10, %r11               # %r11 = auth_tag_len
1433	cmp	$16, %r11
1434	je	_T_16_decrypt
1435	cmp	$12, %r11
1436	je	_T_12_decrypt
/* Any length other than 16 or 12 falls through and is written as 8 bytes. */
1437_T_8_decrypt:
1438	MOVQ_R64_XMM	%xmm0, %rax
1439	mov	%rax, (%r10)
1440	jmp	_return_T_done_decrypt
1441_T_12_decrypt:
1442	MOVQ_R64_XMM	%xmm0, %rax
1443	mov	%rax, (%r10)              # tag bytes 0..7
1444	psrldq	$8, %xmm0
1445	movd	%xmm0, %eax
1446	mov	%eax, 8(%r10)             # tag bytes 8..11
1447	jmp	_return_T_done_decrypt
1448_T_16_decrypt:
1449	movdqu	%xmm0, (%r10)
1450_return_T_done_decrypt:
1451	mov	%r14, %rsp                # drop the aligned frame
1452	pop	%r14
1453	pop	%r13
1454	pop	%r12
1455	ret
1456ENDPROC(aesni_gcm_dec)
1457
1458
1459/*****************************************************************************
1460* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1461*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1462*                    const u8 *in,       // Plaintext input
1463*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1464*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1465*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1466*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1467*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1468*                    const u8 *aad,      // Additional Authentication Data (AAD)
1469*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1470*                    u8 *auth_tag,       // Authenticated Tag output.
1471*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1472*                                        // 12 or 8.
1473*
1474* Assumptions:
1475*
1476* keys:
1477*       keys are pre-expanded and aligned to 16 bytes. The round keys are
1478*       read from the start of void *aes_ctx; the number of rounds follows
*       keysize (11, 13 or 15 round keys, see ENCRYPT_SINGLE_BLOCK).
1479*
1480*
1481* iv:
1482*       0                   1                   2                   3
1483*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1484*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1485*       |                             Salt  (From the SA)               |
1486*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1487*       |                     Initialization Vector                     |
1488*       |         (This is the sequence number from IPSec header)       |
1489*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1490*       |                              0x1                              |
1491*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1492*
1493*
1494*
1495* AAD:
1496*       AAD padded to 128 bits with 0
1497*       for example, assume AAD is a u32 vector
1498*
1499*       if AAD is 8 bytes:
1500*       AAD[3] = {A0, A1};
1501*       padded AAD in xmm register = {A1 A0 0 0}
1502*
1503*       0                   1                   2                   3
1504*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1505*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1506*       |                               SPI (A1)                        |
1507*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1508*       |                     32-bit Sequence Number (A0)               |
1509*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1510*       |                              0x0                              |
1511*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1512*
1513*                                 AAD Format with 32-bit Sequence Number
1514*
1515*       if AAD is 12 bytes:
1516*       AAD[3] = {A0, A1, A2};
1517*       padded AAD in xmm register = {A2 A1 A0 0}
1518*
1519*       0                   1                   2                   3
1520*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1521*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1522*       |                               SPI (A2)                        |
1523*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1524*       |                 64-bit Extended Sequence Number {A1,A0}       |
1525*       |                                                               |
1526*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1527*       |                              0x0                              |
1528*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1529*
1530*                         AAD Format with 64-bit Extended Sequence Number
1531*
1532* aadLen:
1533*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
1534*       The code supports 16 too but for other sizes, the code will fail.
1535*
1536* TLen:
1537*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1538*       For other sizes, the code will fail.
1539*
1540* poly = x^128 + x^127 + x^126 + x^121 + 1
*
* Register usage in the body (as far as this function shows):
*       %r12-%r14  saved on entry and restored on exit
*       %r14       entry %rsp, restored in the epilogue (the frame is realigned)
*       %r13       bytes of whole blocks still to process, later len mod 16
*       %r11       byte offset into in/out (not initialised in this body,
*                  presumably by INITIAL_BLOCKS_ENC - TODO confirm)
*       %xmm0      counter block
*       %xmm8      running GHASH value
*       %xmm13     HashKey<<1 (mod poly)
*
1541***************************************************************************/
1542ENTRY(aesni_gcm_enc)
1543	push	%r12
1544	push	%r13
1545	push	%r14
1546	mov	%rsp, %r14		# %r14 = entry %rsp, restored before the pops
1547#
1548# states of %xmm registers %xmm6:%xmm15 not saved
1549# all %xmm registers are clobbered
1550#
1551	sub	$VARIABLE_OFFSET, %rsp
1552	and	$~63, %rsp		# align rsp to 64 bytes
1553	mov	%arg6, %r12		# %r12 = hash_subkey pointer
1554	movdqu	(%r12), %xmm13		# %xmm13 = HashKey
1555        movdqa  SHUF_MASK(%rip), %xmm2
1556	PSHUFB_XMM %xmm2, %xmm13	# byte-shuffle HashKey with SHUF_MASK
1557
1558
1559# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1560
1561	movdqa	%xmm13, %xmm2
1562	psllq	$1, %xmm13		# shift each 64-bit half left by 1
1563	psrlq	$63, %xmm2		# %xmm2 = top bit of each half
1564	movdqa	%xmm2, %xmm1
1565	pslldq	$8, %xmm2		# carry of the low half moves into the high half
1566	psrldq	$8, %xmm1		# %xmm1 = bit shifted out of the high half
1567	por	%xmm2, %xmm13		# 128-bit value shifted left by 1
1568
1569        # reduce HashKey<<1
1570
/* A mask derived from the shifted-out bit selects POLY; TWOONE and POLY are
 * constants defined elsewhere in the file. */
1571	pshufd	$0x24, %xmm1, %xmm2
1572	pcmpeqd TWOONE(%rip), %xmm2
1573	pand	POLY(%rip), %xmm2
1574	pxor	%xmm2, %xmm13
1575	movdqa	%xmm13, HashKey(%rsp)  # %xmm13 holds HashKey<<1 (mod poly)
1576	mov	%arg4, %r13            # %r13 = number of bytes of plaintext
1577	and	$-16, %r13             # %r13 = %r13 - (%r13 mod 16)
1578	mov	%r13, %r12
1579
1580        # Encrypt first few blocks
1581
1582	and	$(3<<4), %r12          # %r12 = 16 * (whole blocks mod 4)
1583	jz	_initial_num_blocks_is_0_encrypt
1584	cmp	$(2<<4), %r12
1585	jb	_initial_num_blocks_is_1_encrypt
1586	je	_initial_num_blocks_is_2_encrypt
1587_initial_num_blocks_is_3_encrypt:
1588	INITIAL_BLOCKS_ENC	3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1589%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1590	sub	$48, %r13
1591	jmp	_initial_blocks_encrypted
1592_initial_num_blocks_is_2_encrypt:
1593	INITIAL_BLOCKS_ENC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1594%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1595	sub	$32, %r13
1596	jmp	_initial_blocks_encrypted
1597_initial_num_blocks_is_1_encrypt:
1598	INITIAL_BLOCKS_ENC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1599%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1600	sub	$16, %r13
1601	jmp	_initial_blocks_encrypted
1602_initial_num_blocks_is_0_encrypt:
1603	INITIAL_BLOCKS_ENC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1604%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1605_initial_blocks_encrypted:
1606
1607        # Main loop - Encrypt remaining blocks
1608
/* %r13 is now a multiple of 64.  When exactly 64 bytes remain only
 * GHASH_LAST_4 runs on %xmm1-%xmm4, so those four blocks were presumably
 * already encrypted by INITIAL_BLOCKS_ENC - TODO confirm against the macro. */
1609	cmp	$0, %r13
1610	je	_zero_cipher_left_encrypt
1611	sub	$64, %r13
1612	je	_four_cipher_left_encrypt
1613_encrypt_by_4_encrypt:
1614	GHASH_4_ENCRYPT_4_PARALLEL_ENC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1615%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1616	add	$64, %r11		# advance the data offset by four blocks
1617	sub	$64, %r13
1618	jne	_encrypt_by_4_encrypt
1619_four_cipher_left_encrypt:
1620	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1621%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1622_zero_cipher_left_encrypt:
1623	mov	%arg4, %r13
1624	and	$15, %r13			# %r13 = arg4 (mod 16)
1625	je	_multiple_of_16_bytes_encrypt
1626
1627         # Handle the last <16 Byte block separately
1628	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
1629        movdqa SHUF_MASK(%rip), %xmm10
1630	PSHUFB_XMM %xmm10, %xmm0
1631
1632
1633	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
/* The 16-byte load below ends at the last input byte, so it starts
 * 16 - %r13 bytes before the partial block.
 * NOTE(review): if no whole block precedes the partial one (total length
 * below 16, %r11 presumably still 0) the load starts before `in`; confirm
 * that every caller guarantees readable bytes there. */
1634	sub $16, %r11
1635	add %r13, %r11
1636	movdqu (%arg3,%r11,1), %xmm1     # receive the last <16 byte blocks
1637	lea SHIFT_MASK+16(%rip), %r12
1638	sub %r13, %r12
1639	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1640	# (%r13 is the number of bytes in plaintext mod 16)
1641	movdqu	(%r12), %xmm2           # get the appropriate shuffle mask
1642	PSHUFB_XMM	%xmm2, %xmm1            # shift right 16-r13 bytes
1643	pxor	%xmm1, %xmm0            # Plaintext XOR Encrypt(K, Yn)
1644	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
1645	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
1646	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
1647        movdqa SHUF_MASK(%rip), %xmm10
1648	PSHUFB_XMM %xmm10,%xmm0         # byte-shuffle the ciphertext for GHASH
1649
1650	pxor	%xmm0, %xmm8
1651	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1652	# GHASH computation for the last <16 byte block
1653	sub	%r13, %r11              # undo the adjustment made before the load:
1654	add	$16, %r11               # %r11 = offset of the partial block again
1655
	# shuffle xmm0 back to output as ciphertext
1656	movdqa SHUF_MASK(%rip), %xmm10
1657	PSHUFB_XMM %xmm10, %xmm0
1658
1659
1660
1661        # Output %r13 bytes
1662	MOVQ_R64_XMM %xmm0, %rax
1663	cmp $8, %r13
1664	jle _less_than_8_bytes_left_encrypt
1665	mov %rax, (%arg2 , %r11, 1)     # more than 8 bytes: store the low 8 at once
1666	add $8, %r11
1667	psrldq $8, %xmm0
1668	MOVQ_R64_XMM %xmm0, %rax
1669	sub $8, %r13
1670_less_than_8_bytes_left_encrypt:
1671	mov %al,  (%arg2, %r11, 1)      # store the remaining 1..8 bytes one by one
1672	add $1, %r11
1673	shr $8, %rax
1674	sub $1, %r13
1675	jne _less_than_8_bytes_left_encrypt
1676_multiple_of_16_bytes_encrypt:
1677	mov	arg8, %r12    # %r12 = aadLen (number of bytes)
1678	shl	$3, %r12      # convert into number of bits
1679	movd	%r12d, %xmm15       # len(A) in %xmm15 (low 32 bits of %r12 only)
1680	shl	$3, %arg4               # len(C) in bits (bytes * 8)
1681	MOVQ_R64_XMM	%arg4, %xmm1
1682	pslldq	$8, %xmm15          # %xmm15 = len(A)||0x0000000000000000
1683	pxor	%xmm1, %xmm15       # %xmm15 = len(A)||len(C)
1684	pxor	%xmm15, %xmm8
1685	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1686	# final GHASH computation
1687        movdqa SHUF_MASK(%rip), %xmm10
1688	PSHUFB_XMM %xmm10, %xmm8         # perform a 16 byte swap
1689
1690	mov	%arg5, %rax		       # %rax  = pointer to Y0 (the iv argument)
1691	movdqu	(%rax), %xmm0		       # %xmm0 = Y0
1692	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm15         # Encrypt(K, Y0)
1693	pxor	%xmm8, %xmm0		       # tag = Encrypt(K, Y0) XOR GHASH
1694_return_T_encrypt:
1695	mov	arg9, %r10                     # %r10 = authTag
1696	mov	arg10, %r11                    # %r11 = auth_tag_len
1697	cmp	$16, %r11
1698	je	_T_16_encrypt
1699	cmp	$12, %r11
1700	je	_T_12_encrypt
/* Any length other than 16 or 12 falls through and is written as 8 bytes. */
1701_T_8_encrypt:
1702	MOVQ_R64_XMM	%xmm0, %rax
1703	mov	%rax, (%r10)
1704	jmp	_return_T_done_encrypt
1705_T_12_encrypt:
1706	MOVQ_R64_XMM	%xmm0, %rax
1707	mov	%rax, (%r10)                   # tag bytes 0..7
1708	psrldq	$8, %xmm0
1709	movd	%xmm0, %eax
1710	mov	%eax, 8(%r10)                  # tag bytes 8..11
1711	jmp	_return_T_done_encrypt
1712_T_16_encrypt:
1713	movdqu	%xmm0, (%r10)
1714_return_T_done_encrypt:
1715	mov	%r14, %rsp                     # drop the aligned frame
1716	pop	%r14
1717	pop	%r13
1718	pop	%r12
1719	ret
1720ENDPROC(aesni_gcm_enc)
1721
1722#endif
1723
1724
/*
 * _key_expansion_128 / _key_expansion_256a
 *
 * Produce the next 16-byte round key and append it to the schedule.
 *
 * In:   %xmm0  previous round key
 *       %xmm1  AESKEYGENASSIST result supplied by the caller
 *       %xmm4  scratch; its low dword must be 0 on entry (the caller clears
 *              the register once) and both shuffles keep that dword intact
 *       TKEYP  address at which the new round key is stored
 * Out:  %xmm0  new round key, also written to (TKEYP)
 *       TKEYP  advanced by 16 bytes
 * Clobbers %xmm1, %xmm4 and flags.
 */
.align 4
_key_expansion_128:
_key_expansion_256a:
	shufps	$0x10, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	shufps	$0x8c, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	pshufd	$0xff, %xmm1, %xmm1	# broadcast dword 3 of the assist value
	pxor	%xmm1, %xmm0
	movaps	%xmm0, (TKEYP)
	add	$0x10, TKEYP
	ret
ENDPROC(_key_expansion_128)
ENDPROC(_key_expansion_256a)
1739
/*
 * _key_expansion_192a
 *
 * Key-schedule step that updates both %xmm0 and %xmm2 and stores 32 bytes
 * (two round keys) assembled from the old and new values.
 *
 * In:   %xmm0  previous key material (four dwords)
 *       %xmm2  previous key material whose low two dwords are stored
 *       %xmm1  AESKEYGENASSIST result supplied by the caller
 *       %xmm4  scratch; low dword must be 0 on entry
 *       TKEYP  store address
 * Out:  (TKEYP)      = { old %xmm2 low half, new %xmm0 low half }
 *       0x10(TKEYP)  = { new %xmm0 high half, new %xmm2 low half }
 *       %xmm0, %xmm2 updated; TKEYP advanced by 32 bytes
 * Clobbers %xmm1, %xmm3, %xmm4, %xmm5, %xmm6 and flags.
 */
.align 4
_key_expansion_192a:
	/* copies of the incoming %xmm2, taken before anything changes it */
	movaps	%xmm2, %xmm6
	movaps	%xmm2, %xmm5
	pslldq	$4, %xmm5

	/* update %xmm0 */
	shufps	$0x10, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	shufps	$0x8c, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	pshufd	$0x55, %xmm1, %xmm1	# broadcast dword 1 of the assist value
	pxor	%xmm1, %xmm0

	/* update %xmm2 from the new %xmm0 and the shifted copy */
	pshufd	$0xff, %xmm0, %xmm3
	pxor	%xmm5, %xmm2
	pxor	%xmm3, %xmm2

	/* assemble and store the two output round keys */
	movaps	%xmm0, %xmm1
	shufps	$0x44, %xmm0, %xmm6
	shufps	$0x4e, %xmm2, %xmm1
	movaps	%xmm6, (TKEYP)
	movaps	%xmm1, 0x10(TKEYP)
	add	$0x20, TKEYP
	ret
ENDPROC(_key_expansion_192a)
1764
/*
 * _key_expansion_192b:	internal ABI
 *	192-bit key expansion step that emits 16 bytes of key schedule.
 * input:
 *	%xmm0:		previous four key dwords
 *	%xmm2:		previous two key dwords in its low qword
 *	%xmm1:		AESKEYGENASSIST result for this round
 *	%xmm4:		scratch whose low dword must be 0 on entry
 *	TKEYP:		address the output is written to
 * output:
 *	%xmm0, %xmm2:	next six key dwords (four in %xmm0, two in %xmm2)
 *	(TKEYP):	new %xmm0
 *	TKEYP:		advanced by 0x10
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5
 */
1765.align 4
1766_key_expansion_192b:
1767	pshufd $0b01010101, %xmm1, %xmm1	# broadcast dword 1 of the assist result
1768	shufps $0b00010000, %xmm0, %xmm4	# running xor over the dwords of %xmm0
1769	pxor %xmm4, %xmm0
1770	shufps $0b10001100, %xmm0, %xmm4
1771	pxor %xmm4, %xmm0
1772	pxor %xmm1, %xmm0			# mix in the broadcast assist word
1773
1774	movaps %xmm2, %xmm5
1775	pslldq $4, %xmm5			# old %xmm2 shifted up by one dword
1776	pshufd $0b11111111, %xmm0, %xmm3	# broadcast dword 3 of the new %xmm0
1777	pxor %xmm3, %xmm2
1778	pxor %xmm5, %xmm2			# low qword of %xmm2 = next two key dwords
1779
1780	movaps %xmm0, (TKEYP)			# only %xmm0 is stored by this variant
1781	add $0x10, TKEYP
1782	ret
1783ENDPROC(_key_expansion_192b)
1784
/*
 * _key_expansion_256b:	internal ABI
 *	Same shape as _key_expansion_256a, but it updates %xmm2 and uses
 *	dword 2 of the AESKEYGENASSIST result instead of dword 3.
 * input:
 *	%xmm2:		round key to be updated
 *	%xmm1:		AESKEYGENASSIST result
 *	%xmm4:		scratch whose low dword must be 0 on entry
 *	TKEYP:		address the new round key is written to
 * output:
 *	%xmm2:		new round key, also stored at (TKEYP)
 *	TKEYP:		advanced by 0x10
 * changed:
 *	%xmm1, %xmm4
 */
1785.align 4
1786_key_expansion_256b:
1787	pshufd $0b10101010, %xmm1, %xmm1	# broadcast dword 2 of the assist result
1788	shufps $0b00010000, %xmm2, %xmm4	# running xor over the dwords of %xmm2
1789	pxor %xmm4, %xmm2
1790	shufps $0b10001100, %xmm2, %xmm4
1791	pxor %xmm4, %xmm2
1792	pxor %xmm1, %xmm2			# mix in the broadcast assist word
1793	movaps %xmm2, (TKEYP)			# append the round key to the schedule
1794	add $0x10, TKEYP
1795	ret
1796ENDPROC(_key_expansion_256b)
1797
1798/*
1799 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1800 *                   unsigned int key_len)
 *
 * Expand the user key into the encryption round keys starting at (KEYP),
 * store key_len at offset 480 of ctx and derive the decryption round keys
 * starting at offset 240 of ctx.  The key size is selected by comparing the
 * low byte of key_len with 24: below takes the 128-bit path, equal takes the
 * 192-bit path and anything else takes the 256-bit path.  key_len is not
 * validated here.  The return value (AREG) is always 0.
1801 */
1802ENTRY(aesni_set_key)
1803#ifndef __x86_64__
1804	pushl KEYP
1805	movl 8(%esp), KEYP		# ctx
1806	movl 12(%esp), UKEYP		# in_key
1807	movl 16(%esp), %edx		# key_len
1808#endif
1809	movups (UKEYP), %xmm0		# user key (first 16 bytes)
1810	movaps %xmm0, (KEYP)
1811	lea 0x10(KEYP), TKEYP		# key addr
1812	movl %edx, 480(KEYP)
1813	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
1814	cmp $24, %dl
1815	jb .Lenc_key128
1816	je .Lenc_key192
	/* 256-bit key: the second 16 key bytes form the second round key */
1817	movups 0x10(UKEYP), %xmm2	# other user key
1818	movaps %xmm2, (TKEYP)
1819	add $0x10, TKEYP
1820	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
1821	call _key_expansion_256a
1822	AESKEYGENASSIST 0x1 %xmm0 %xmm1
1823	call _key_expansion_256b
1824	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
1825	call _key_expansion_256a
1826	AESKEYGENASSIST 0x2 %xmm0 %xmm1
1827	call _key_expansion_256b
1828	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
1829	call _key_expansion_256a
1830	AESKEYGENASSIST 0x4 %xmm0 %xmm1
1831	call _key_expansion_256b
1832	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
1833	call _key_expansion_256a
1834	AESKEYGENASSIST 0x8 %xmm0 %xmm1
1835	call _key_expansion_256b
1836	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
1837	call _key_expansion_256a
1838	AESKEYGENASSIST 0x10 %xmm0 %xmm1
1839	call _key_expansion_256b
1840	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
1841	call _key_expansion_256a
1842	AESKEYGENASSIST 0x20 %xmm0 %xmm1
1843	call _key_expansion_256b
1844	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
1845	call _key_expansion_256a
1846	jmp .Ldec_key
1847.Lenc_key192:
	/* 192-bit key: the remaining 8 key bytes go to the low qword of xmm2 */
1848	movq 0x10(UKEYP), %xmm2		# other user key
1849	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
1850	call _key_expansion_192a
1851	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
1852	call _key_expansion_192b
1853	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
1854	call _key_expansion_192a
1855	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
1856	call _key_expansion_192b
1857	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
1858	call _key_expansion_192a
1859	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
1860	call _key_expansion_192b
1861	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
1862	call _key_expansion_192a
1863	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
1864	call _key_expansion_192b
1865	jmp .Ldec_key
1866.Lenc_key128:
1867	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
1868	call _key_expansion_128
1869	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
1870	call _key_expansion_128
1871	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
1872	call _key_expansion_128
1873	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
1874	call _key_expansion_128
1875	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
1876	call _key_expansion_128
1877	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
1878	call _key_expansion_128
1879	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
1880	call _key_expansion_128
1881	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
1882	call _key_expansion_128
1883	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
1884	call _key_expansion_128
1885	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
1886	call _key_expansion_128
1887.Ldec_key:
	/*
	 * Build the decryption schedule 240 bytes above the encryption one.
	 * It holds the encryption round keys in reverse order: the first and
	 * last keys are copied unchanged, the ones in between are passed
	 * through AESIMC by the loop below.
	 */
1888	sub $0x10, TKEYP		# TKEYP = last encryption round key
1889	movaps (KEYP), %xmm0
1890	movaps (TKEYP), %xmm1
1891	movaps %xmm0, 240(TKEYP)	# first enc key becomes last dec key
1892	movaps %xmm1, 240(KEYP)		# last enc key becomes first dec key
1893	add $0x10, KEYP
1894	lea 240-16(TKEYP), UKEYP	# UKEYP walks the dec schedule downwards
1895.align 4
1896.Ldec_key_loop:
1897	movaps (KEYP), %xmm0
1898	AESIMC %xmm0 %xmm1
1899	movaps %xmm1, (UKEYP)
1900	add $0x10, KEYP
1901	sub $0x10, UKEYP
1902	cmp TKEYP, KEYP
1903	jb .Ldec_key_loop		# until KEYP reaches the last enc key
1904	xor AREG, AREG			# return 0
1905#ifndef __x86_64__
1906	popl KEYP
1907#endif
1908	ret
1909ENDPROC(aesni_set_key)
1910
1911/*
1912 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Encrypt the single 16-byte block at src into dst.  Neither pointer needs
 * to be 16-byte aligned (movups is used for both accesses).
1913 */
1914ENTRY(aesni_enc)
1915#ifndef __x86_64__
1916	pushl KEYP
1917	pushl KLEN
1918	movl 12(%esp), KEYP		# ctx (two pushes + return address = 12)
1919	movl 16(%esp), OUTP		# dst
1920	movl 20(%esp), INP		# src
1921#endif
1922	movl 480(KEYP), KLEN		# key length
1923	movups (INP), STATE		# input
1924	call _aesni_enc1
1925	movups STATE, (OUTP)		# output
1926#ifndef __x86_64__
1927	popl KLEN
1928	popl KEYP
1929#endif
1930	ret
1931ENDPROC(aesni_enc)
1932
1933/*
1934 * _aesni_enc1:		internal ABI
 *	Encrypt the one block held in STATE with the round keys at (KEYP).
1935 * input:
1936 *	KEYP:		key struct pointer
1937 *	KLEN:		key length in bytes (compared with 24 to pick the rounds)
1938 *	STATE:		initial state (input)
1939 * output:
1940 *	STATE:		final state (output)
1941 * changed:
1942 *	KEY
1943 *	TKEYP (T1)
1944 */
1945.align 4
1946_aesni_enc1:
1947	movaps (KEYP), KEY		# key
1948	mov KEYP, TKEYP
1949	pxor KEY, STATE		# round 0
1950	add $0x30, TKEYP		# bias so that .Lenc128 starts at KEYP + 0x10
1951	cmp $24, KLEN
1952	jb .Lenc128			# key length below 24: shortest schedule
1953	lea 0x20(TKEYP), TKEYP		# lea keeps the cmp flags for the je below
1954	je .Lenc192			# key length 24: two extra rounds
1955	add $0x20, TKEYP		# longest schedule: four extra rounds
1956	movaps -0x60(TKEYP), KEY
1957	AESENC KEY STATE
1958	movaps -0x50(TKEYP), KEY
1959	AESENC KEY STATE
1960.align 4
1961.Lenc192:
1962	movaps -0x40(TKEYP), KEY
1963	AESENC KEY STATE
1964	movaps -0x30(TKEYP), KEY
1965	AESENC KEY STATE
1966.align 4
1967.Lenc128:
1968	movaps -0x20(TKEYP), KEY
1969	AESENC KEY STATE
1970	movaps -0x10(TKEYP), KEY
1971	AESENC KEY STATE
1972	movaps (TKEYP), KEY
1973	AESENC KEY STATE
1974	movaps 0x10(TKEYP), KEY
1975	AESENC KEY STATE
1976	movaps 0x20(TKEYP), KEY
1977	AESENC KEY STATE
1978	movaps 0x30(TKEYP), KEY
1979	AESENC KEY STATE
1980	movaps 0x40(TKEYP), KEY
1981	AESENC KEY STATE
1982	movaps 0x50(TKEYP), KEY
1983	AESENC KEY STATE
1984	movaps 0x60(TKEYP), KEY
1985	AESENC KEY STATE
1986	movaps 0x70(TKEYP), KEY
1987	AESENCLAST KEY STATE		# last round
1988	ret
1989ENDPROC(_aesni_enc1)
1990
1991/*
1992 * _aesni_enc4:	internal ABI
 *	Encrypt the four blocks held in STATE1..STATE4 with the round keys at
 *	(KEYP); each round key is loaded once and applied to all four states.
1993 * input:
1994 *	KEYP:		key struct pointer
1995 *	KLEN:		key length in bytes (compared with 24 to pick the rounds)
1996 *	STATE1:		initial state (input)
1997 *	STATE2
1998 *	STATE3
1999 *	STATE4
2000 * output:
2001 *	STATE1:		final state (output)
2002 *	STATE2
2003 *	STATE3
2004 *	STATE4
2005 * changed:
2006 *	KEY
2007 *	TKEYP (T1)
2008 */
2009.align 4
2010_aesni_enc4:
2011	movaps (KEYP), KEY		# key
2012	mov KEYP, TKEYP
2013	pxor KEY, STATE1		# round 0
2014	pxor KEY, STATE2
2015	pxor KEY, STATE3
2016	pxor KEY, STATE4
2017	add $0x30, TKEYP		# bias so that .L4enc128 starts at KEYP + 0x10
2018	cmp $24, KLEN
2019	jb .L4enc128			# key length below 24: shortest schedule
2020	lea 0x20(TKEYP), TKEYP		# lea keeps the cmp flags for the je below
2021	je .L4enc192			# key length 24: two extra rounds
2022	add $0x20, TKEYP		# longest schedule: four extra rounds
2023	movaps -0x60(TKEYP), KEY
2024	AESENC KEY STATE1
2025	AESENC KEY STATE2
2026	AESENC KEY STATE3
2027	AESENC KEY STATE4
2028	movaps -0x50(TKEYP), KEY
2029	AESENC KEY STATE1
2030	AESENC KEY STATE2
2031	AESENC KEY STATE3
2032	AESENC KEY STATE4
2033#.align 4
2034.L4enc192:
2035	movaps -0x40(TKEYP), KEY
2036	AESENC KEY STATE1
2037	AESENC KEY STATE2
2038	AESENC KEY STATE3
2039	AESENC KEY STATE4
2040	movaps -0x30(TKEYP), KEY
2041	AESENC KEY STATE1
2042	AESENC KEY STATE2
2043	AESENC KEY STATE3
2044	AESENC KEY STATE4
2045#.align 4
2046.L4enc128:
2047	movaps -0x20(TKEYP), KEY
2048	AESENC KEY STATE1
2049	AESENC KEY STATE2
2050	AESENC KEY STATE3
2051	AESENC KEY STATE4
2052	movaps -0x10(TKEYP), KEY
2053	AESENC KEY STATE1
2054	AESENC KEY STATE2
2055	AESENC KEY STATE3
2056	AESENC KEY STATE4
2057	movaps (TKEYP), KEY
2058	AESENC KEY STATE1
2059	AESENC KEY STATE2
2060	AESENC KEY STATE3
2061	AESENC KEY STATE4
2062	movaps 0x10(TKEYP), KEY
2063	AESENC KEY STATE1
2064	AESENC KEY STATE2
2065	AESENC KEY STATE3
2066	AESENC KEY STATE4
2067	movaps 0x20(TKEYP), KEY
2068	AESENC KEY STATE1
2069	AESENC KEY STATE2
2070	AESENC KEY STATE3
2071	AESENC KEY STATE4
2072	movaps 0x30(TKEYP), KEY
2073	AESENC KEY STATE1
2074	AESENC KEY STATE2
2075	AESENC KEY STATE3
2076	AESENC KEY STATE4
2077	movaps 0x40(TKEYP), KEY
2078	AESENC KEY STATE1
2079	AESENC KEY STATE2
2080	AESENC KEY STATE3
2081	AESENC KEY STATE4
2082	movaps 0x50(TKEYP), KEY
2083	AESENC KEY STATE1
2084	AESENC KEY STATE2
2085	AESENC KEY STATE3
2086	AESENC KEY STATE4
2087	movaps 0x60(TKEYP), KEY
2088	AESENC KEY STATE1
2089	AESENC KEY STATE2
2090	AESENC KEY STATE3
2091	AESENC KEY STATE4
2092	movaps 0x70(TKEYP), KEY
2093	AESENCLAST KEY STATE1		# last round
2094	AESENCLAST KEY STATE2
2095	AESENCLAST KEY STATE3
2096	AESENCLAST KEY STATE4
2097	ret
2098ENDPROC(_aesni_enc4)
2099
2100/*
2101 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Decrypt the single 16-byte block at src into dst, using the decryption
 * round keys that aesni_set_key stores 240 bytes into ctx.
2102 */
2103ENTRY(aesni_dec)
2104#ifndef __x86_64__
2105	pushl KEYP
2106	pushl KLEN
2107	movl 12(%esp), KEYP		# ctx (two pushes + return address = 12)
2108	movl 16(%esp), OUTP		# dst
2109	movl 20(%esp), INP		# src
2110#endif
2111	mov 480(KEYP), KLEN		# key length
2112	add $240, KEYP			# switch to the decryption round keys
2113	movups (INP), STATE		# input
2114	call _aesni_dec1
2115	movups STATE, (OUTP)		#output
2116#ifndef __x86_64__
2117	popl KLEN
2118	popl KEYP
2119#endif
2120	ret
2121ENDPROC(aesni_dec)
2122
2123/*
2124 * _aesni_dec1:		internal ABI
 *	Decrypt the one block held in STATE with the round keys at (KEYP).
 *	The callers in this file add 240 to the ctx pointer first, so KEYP
 *	addresses the decryption round keys.
2125 * input:
2126 *	KEYP:		key struct pointer
2127 *	KLEN:		key length
2128 *	STATE:		initial state (input)
2129 * output:
2130 *	STATE:		final state (output)
2131 * changed:
2132 *	KEY
2133 *	TKEYP (T1)
2134 */
2135.align 4
2136_aesni_dec1:
2137	movaps (KEYP), KEY		# key
2138	mov KEYP, TKEYP
2139	pxor KEY, STATE		# round 0
2140	add $0x30, TKEYP		# bias so that .Ldec128 starts at KEYP + 0x10
2141	cmp $24, KLEN
2142	jb .Ldec128			# key length below 24: shortest schedule
2143	lea 0x20(TKEYP), TKEYP		# lea keeps the cmp flags for the je below
2144	je .Ldec192			# key length 24: two extra rounds
2145	add $0x20, TKEYP		# longest schedule: four extra rounds
2146	movaps -0x60(TKEYP), KEY
2147	AESDEC KEY STATE
2148	movaps -0x50(TKEYP), KEY
2149	AESDEC KEY STATE
2150.align 4
2151.Ldec192:
2152	movaps -0x40(TKEYP), KEY
2153	AESDEC KEY STATE
2154	movaps -0x30(TKEYP), KEY
2155	AESDEC KEY STATE
2156.align 4
2157.Ldec128:
2158	movaps -0x20(TKEYP), KEY
2159	AESDEC KEY STATE
2160	movaps -0x10(TKEYP), KEY
2161	AESDEC KEY STATE
2162	movaps (TKEYP), KEY
2163	AESDEC KEY STATE
2164	movaps 0x10(TKEYP), KEY
2165	AESDEC KEY STATE
2166	movaps 0x20(TKEYP), KEY
2167	AESDEC KEY STATE
2168	movaps 0x30(TKEYP), KEY
2169	AESDEC KEY STATE
2170	movaps 0x40(TKEYP), KEY
2171	AESDEC KEY STATE
2172	movaps 0x50(TKEYP), KEY
2173	AESDEC KEY STATE
2174	movaps 0x60(TKEYP), KEY
2175	AESDEC KEY STATE
2176	movaps 0x70(TKEYP), KEY
2177	AESDECLAST KEY STATE		# last round
2178	ret
2179ENDPROC(_aesni_dec1)
2180
2181/*
2182 * _aesni_dec4:	internal ABI
 *	Decrypt the four blocks held in STATE1..STATE4 with the round keys at
 *	(KEYP); each round key is loaded once and applied to all four states.
 *	The callers in this file add 240 to the ctx pointer first, so KEYP
 *	addresses the decryption round keys.
2183 * input:
2184 *	KEYP:		key struct pointer
2185 *	KLEN:		key length
2186 *	STATE1:		initial state (input)
2187 *	STATE2
2188 *	STATE3
2189 *	STATE4
2190 * output:
2191 *	STATE1:		final state (output)
2192 *	STATE2
2193 *	STATE3
2194 *	STATE4
2195 * changed:
2196 *	KEY
2197 *	TKEYP (T1)
2198 */
2199.align 4
2200_aesni_dec4:
2201	movaps (KEYP), KEY		# key
2202	mov KEYP, TKEYP
2203	pxor KEY, STATE1		# round 0
2204	pxor KEY, STATE2
2205	pxor KEY, STATE3
2206	pxor KEY, STATE4
2207	add $0x30, TKEYP		# bias so that .L4dec128 starts at KEYP + 0x10
2208	cmp $24, KLEN
2209	jb .L4dec128			# key length below 24: shortest schedule
2210	lea 0x20(TKEYP), TKEYP		# lea keeps the cmp flags for the je below
2211	je .L4dec192			# key length 24: two extra rounds
2212	add $0x20, TKEYP		# longest schedule: four extra rounds
2213	movaps -0x60(TKEYP), KEY
2214	AESDEC KEY STATE1
2215	AESDEC KEY STATE2
2216	AESDEC KEY STATE3
2217	AESDEC KEY STATE4
2218	movaps -0x50(TKEYP), KEY
2219	AESDEC KEY STATE1
2220	AESDEC KEY STATE2
2221	AESDEC KEY STATE3
2222	AESDEC KEY STATE4
2223.align 4
2224.L4dec192:
2225	movaps -0x40(TKEYP), KEY
2226	AESDEC KEY STATE1
2227	AESDEC KEY STATE2
2228	AESDEC KEY STATE3
2229	AESDEC KEY STATE4
2230	movaps -0x30(TKEYP), KEY
2231	AESDEC KEY STATE1
2232	AESDEC KEY STATE2
2233	AESDEC KEY STATE3
2234	AESDEC KEY STATE4
2235.align 4
2236.L4dec128:
2237	movaps -0x20(TKEYP), KEY
2238	AESDEC KEY STATE1
2239	AESDEC KEY STATE2
2240	AESDEC KEY STATE3
2241	AESDEC KEY STATE4
2242	movaps -0x10(TKEYP), KEY
2243	AESDEC KEY STATE1
2244	AESDEC KEY STATE2
2245	AESDEC KEY STATE3
2246	AESDEC KEY STATE4
2247	movaps (TKEYP), KEY
2248	AESDEC KEY STATE1
2249	AESDEC KEY STATE2
2250	AESDEC KEY STATE3
2251	AESDEC KEY STATE4
2252	movaps 0x10(TKEYP), KEY
2253	AESDEC KEY STATE1
2254	AESDEC KEY STATE2
2255	AESDEC KEY STATE3
2256	AESDEC KEY STATE4
2257	movaps 0x20(TKEYP), KEY
2258	AESDEC KEY STATE1
2259	AESDEC KEY STATE2
2260	AESDEC KEY STATE3
2261	AESDEC KEY STATE4
2262	movaps 0x30(TKEYP), KEY
2263	AESDEC KEY STATE1
2264	AESDEC KEY STATE2
2265	AESDEC KEY STATE3
2266	AESDEC KEY STATE4
2267	movaps 0x40(TKEYP), KEY
2268	AESDEC KEY STATE1
2269	AESDEC KEY STATE2
2270	AESDEC KEY STATE3
2271	AESDEC KEY STATE4
2272	movaps 0x50(TKEYP), KEY
2273	AESDEC KEY STATE1
2274	AESDEC KEY STATE2
2275	AESDEC KEY STATE3
2276	AESDEC KEY STATE4
2277	movaps 0x60(TKEYP), KEY
2278	AESDEC KEY STATE1
2279	AESDEC KEY STATE2
2280	AESDEC KEY STATE3
2281	AESDEC KEY STATE4
2282	movaps 0x70(TKEYP), KEY
2283	AESDECLAST KEY STATE1		# last round
2284	AESDECLAST KEY STATE2
2285	AESDECLAST KEY STATE3
2286	AESDECLAST KEY STATE4
2287	ret
2288ENDPROC(_aesni_dec4)
2289
2290/*
2291 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2292 *		      size_t len)
 *
 * ECB-encrypt the whole 16-byte blocks found in the first len bytes of src
 * into dst.  Four blocks are processed per iteration while at least 64 bytes
 * remain, then one block per iteration.  Trailing bytes that do not fill a
 * block are left untouched.  LEN is an unsigned size_t, so every length
 * comparison below uses the unsigned condition codes (jb/jae).
2293 */
2294ENTRY(aesni_ecb_enc)
2295#ifndef __x86_64__
2296	pushl LEN
2297	pushl KEYP
2298	pushl KLEN
2299	movl 16(%esp), KEYP		# ctx (three pushes + return address = 16)
2300	movl 20(%esp), OUTP		# dst
2301	movl 24(%esp), INP		# src
2302	movl 28(%esp), LEN		# len
2303#endif
2304	test LEN, LEN		# check length
2305	jz .Lecb_enc_ret
2306	mov 480(KEYP), KLEN
2307	cmp $16, LEN
2308	jb .Lecb_enc_ret
2309	cmp $64, LEN
2310	jb .Lecb_enc_loop1
2311.align 4
2312.Lecb_enc_loop4:
2313	movups (INP), STATE1
2314	movups 0x10(INP), STATE2
2315	movups 0x20(INP), STATE3
2316	movups 0x30(INP), STATE4
2317	call _aesni_enc4
2318	movups STATE1, (OUTP)
2319	movups STATE2, 0x10(OUTP)
2320	movups STATE3, 0x20(OUTP)
2321	movups STATE4, 0x30(OUTP)
2322	sub $64, LEN
2323	add $64, INP
2324	add $64, OUTP
2325	cmp $64, LEN
2326	jae .Lecb_enc_loop4		# unsigned compare: LEN is a size_t
2327	cmp $16, LEN
2328	jb .Lecb_enc_ret
2329.align 4
2330.Lecb_enc_loop1:
2331	movups (INP), STATE1
2332	call _aesni_enc1
2333	movups STATE1, (OUTP)
2334	sub $16, LEN
2335	add $16, INP
2336	add $16, OUTP
2337	cmp $16, LEN
2338	jae .Lecb_enc_loop1		# unsigned compare: LEN is a size_t
2339.Lecb_enc_ret:
2340#ifndef __x86_64__
2341	popl KLEN
2342	popl KEYP
2343	popl LEN
2344#endif
2345	ret
2346ENDPROC(aesni_ecb_enc)
2347
2348/*
2349 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2350 *		      size_t len);
 *
 * ECB-decrypt the whole 16-byte blocks found in the first len bytes of src
 * into dst, using the decryption round keys 240 bytes into ctx.  Four blocks
 * are processed per iteration while at least 64 bytes remain, then one block
 * per iteration.  Trailing bytes that do not fill a block are left untouched.
 * LEN is an unsigned size_t, so every length comparison below uses the
 * unsigned condition codes (jb/jae).
2351 */
2352ENTRY(aesni_ecb_dec)
2353#ifndef __x86_64__
2354	pushl LEN
2355	pushl KEYP
2356	pushl KLEN
2357	movl 16(%esp), KEYP		# ctx (three pushes + return address = 16)
2358	movl 20(%esp), OUTP		# dst
2359	movl 24(%esp), INP		# src
2360	movl 28(%esp), LEN		# len
2361#endif
2362	test LEN, LEN
2363	jz .Lecb_dec_ret
2364	mov 480(KEYP), KLEN
2365	add $240, KEYP			# switch to the decryption round keys
2366	cmp $16, LEN
2367	jb .Lecb_dec_ret
2368	cmp $64, LEN
2369	jb .Lecb_dec_loop1
2370.align 4
2371.Lecb_dec_loop4:
2372	movups (INP), STATE1
2373	movups 0x10(INP), STATE2
2374	movups 0x20(INP), STATE3
2375	movups 0x30(INP), STATE4
2376	call _aesni_dec4
2377	movups STATE1, (OUTP)
2378	movups STATE2, 0x10(OUTP)
2379	movups STATE3, 0x20(OUTP)
2380	movups STATE4, 0x30(OUTP)
2381	sub $64, LEN
2382	add $64, INP
2383	add $64, OUTP
2384	cmp $64, LEN
2385	jae .Lecb_dec_loop4		# unsigned compare: LEN is a size_t
2386	cmp $16, LEN
2387	jb .Lecb_dec_ret
2388.align 4
2389.Lecb_dec_loop1:
2390	movups (INP), STATE1
2391	call _aesni_dec1
2392	movups STATE1, (OUTP)
2393	sub $16, LEN
2394	add $16, INP
2395	add $16, OUTP
2396	cmp $16, LEN
2397	jae .Lecb_dec_loop1		# unsigned compare: LEN is a size_t
2398.Lecb_dec_ret:
2399#ifndef __x86_64__
2400	popl KLEN
2401	popl KEYP
2402	popl LEN
2403#endif
2404	ret
2405ENDPROC(aesni_ecb_dec)
2406
2407/*
2408 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2409 *		      size_t len, u8 *iv)
 *
 * CBC-encrypt the whole 16-byte blocks found in the first len bytes of src
 * into dst.  The chaining value is read from iv and the last ciphertext
 * block is written back to iv before returning.  If len is below 16 nothing
 * is processed and iv is left unchanged.  LEN is an unsigned size_t, so the
 * length comparisons below use the unsigned condition codes (jb/jae).
2410 */
2411ENTRY(aesni_cbc_enc)
2412#ifndef __x86_64__
2413	pushl IVP
2414	pushl LEN
2415	pushl KEYP
2416	pushl KLEN
2417	movl 20(%esp), KEYP		# ctx (four pushes + return address = 20)
2418	movl 24(%esp), OUTP		# dst
2419	movl 28(%esp), INP		# src
2420	movl 32(%esp), LEN		# len
2421	movl 36(%esp), IVP		# iv
2422#endif
2423	cmp $16, LEN
2424	jb .Lcbc_enc_ret
2425	mov 480(KEYP), KLEN
2426	movups (IVP), STATE	# load iv as initial state
2427.align 4
2428.Lcbc_enc_loop:
2429	movups (INP), IN	# load input
2430	pxor IN, STATE		# chain: plaintext xor previous ciphertext (or iv)
2431	call _aesni_enc1
2432	movups STATE, (OUTP)	# store output
2433	sub $16, LEN
2434	add $16, INP
2435	add $16, OUTP
2436	cmp $16, LEN
2437	jae .Lcbc_enc_loop	# unsigned compare: LEN is a size_t
2438	movups STATE, (IVP)	# last ciphertext block is the next iv
2439.Lcbc_enc_ret:
2440#ifndef __x86_64__
2441	popl KLEN
2442	popl KEYP
2443	popl LEN
2444	popl IVP
2445#endif
2446	ret
2447ENDPROC(aesni_cbc_enc)
2448
2449/*
2450 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2451 *		      size_t len, u8 *iv)
 *
 * CBC-decrypt the whole 16-byte blocks found in the first len bytes of src
 * into dst, using the decryption round keys 240 bytes into ctx.  The
 * chaining value is read from iv and the last ciphertext block consumed is
 * written back to iv.  If len is below 16 nothing is processed and iv is
 * left unchanged.  Four blocks are processed per iteration while at least
 * 64 bytes remain, then one block per iteration.  LEN is an unsigned size_t,
 * so the length comparisons below use the unsigned condition codes (jb/jae).
2452 */
2453ENTRY(aesni_cbc_dec)
2454#ifndef __x86_64__
2455	pushl IVP
2456	pushl LEN
2457	pushl KEYP
2458	pushl KLEN
2459	movl 20(%esp), KEYP		# ctx (four pushes + return address = 20)
2460	movl 24(%esp), OUTP		# dst
2461	movl 28(%esp), INP		# src
2462	movl 32(%esp), LEN		# len
2463	movl 36(%esp), IVP		# iv
2464#endif
2465	cmp $16, LEN
2466	jb .Lcbc_dec_just_ret
2467	mov 480(KEYP), KLEN
2468	add $240, KEYP			# switch to the decryption round keys
2469	movups (IVP), IV
2470	cmp $64, LEN
2471	jb .Lcbc_dec_loop1
2472.align 4
2473.Lcbc_dec_loop4:
2474	movups (INP), IN1
2475	movaps IN1, STATE1
2476	movups 0x10(INP), IN2
2477	movaps IN2, STATE2
2478#ifdef __x86_64__
2479	movups 0x20(INP), IN3
2480	movaps IN3, STATE3
2481	movups 0x30(INP), IN4
2482	movaps IN4, STATE4
2483#else
	/*
	 * This path only uses IN1/IN2: they are reused for blocks 3 and 4
	 * here, and blocks 1 and 2 are reloaded from INP after decryption.
	 */
2484	movups 0x20(INP), IN1
2485	movaps IN1, STATE3
2486	movups 0x30(INP), IN2
2487	movaps IN2, STATE4
2488#endif
2489	call _aesni_dec4
2490	pxor IV, STATE1			# block 1 chains with the incoming iv
2491#ifdef __x86_64__
2492	pxor IN1, STATE2
2493	pxor IN2, STATE3
2494	pxor IN3, STATE4
2495	movaps IN4, IV			# ciphertext block 4 is the next iv
2496#else
2497	pxor IN1, STATE4		# IN1 holds ciphertext block 3 here
2498	movaps IN2, IV			# IN2 holds ciphertext block 4: next iv
2499	movups (INP), IN1		# reload ciphertext block 1
2500	pxor IN1, STATE2
2501	movups 0x10(INP), IN2		# reload ciphertext block 2
2502	pxor IN2, STATE3
2503#endif
2504	movups STATE1, (OUTP)
2505	movups STATE2, 0x10(OUTP)
2506	movups STATE3, 0x20(OUTP)
2507	movups STATE4, 0x30(OUTP)
2508	sub $64, LEN
2509	add $64, INP
2510	add $64, OUTP
2511	cmp $64, LEN
2512	jae .Lcbc_dec_loop4		# unsigned compare: LEN is a size_t
2513	cmp $16, LEN
2514	jb .Lcbc_dec_ret
2515.align 4
2516.Lcbc_dec_loop1:
2517	movups (INP), IN
2518	movaps IN, STATE
2519	call _aesni_dec1
2520	pxor IV, STATE
2521	movups STATE, (OUTP)
2522	movaps IN, IV			# this ciphertext block is the next iv
2523	sub $16, LEN
2524	add $16, INP
2525	add $16, OUTP
2526	cmp $16, LEN
2527	jae .Lcbc_dec_loop1		# unsigned compare: LEN is a size_t
2528.Lcbc_dec_ret:
2529	movups IV, (IVP)
2530.Lcbc_dec_just_ret:
2531#ifndef __x86_64__
2532	popl KLEN
2533	popl KEYP
2534	popl LEN
2535	popl IVP
2536#endif
2537	ret
2538ENDPROC(aesni_cbc_dec)
2539
2540#ifdef __x86_64__
/*
 * PSHUFB control mask that reverses the order of the 16 bytes of an XMM
 * register (result byte i is taken from source byte 15 - i).  It is aligned
 * to 16 bytes because _aesni_inc_init loads it with movaps.
 */
2541.align 16
2542.Lbswap_mask:
2543	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2544
2545/*
2546 * _aesni_inc_init:	internal ABI
2547 *	setup registers used by _aesni_inc
2548 * input:
2549 *	IV
2550 * output:
2551 *	CTR:	== IV, in little endian
2552 *	TCTR_LOW: == lower qword of CTR
2553 *	INC:	== 1, in little endian
2554 *	BSWAP_MASK == endian swapping mask
 *
 * The mask is addressed relative to %rip (this code is only built for
 * x86_64), so the load does not need an absolute address.
2555 */
2556.align 4
2557_aesni_inc_init:
2558	movaps .Lbswap_mask(%rip), BSWAP_MASK
2559	movaps IV, CTR
2560	PSHUFB_XMM BSWAP_MASK CTR	# byte-reverse the IV into CTR
2561	mov $1, TCTR_LOW
2562	MOVQ_R64_XMM TCTR_LOW INC	# INC = 1
2563	MOVQ_R64_XMM CTR TCTR_LOW	# TCTR_LOW = low qword of CTR
2564	ret
2565ENDPROC(_aesni_inc_init)
2566
2567/*
2568 * _aesni_inc:		internal ABI
2569 *	Increase IV by 1, IV is in big endian
2570 * input:
2571 *	IV
2572 *	CTR:	== IV, in little endian
2573 *	TCTR_LOW: == lower qword of CTR
2574 *	INC:	== 1, in little endian
2575 *	BSWAP_MASK == endian swapping mask
2576 * output:
2577 *	IV:	Increase by 1
2578 * changed:
2579 *	CTR:	== output IV, in little endian
2580 *	TCTR_LOW: == lower qword of CTR
 *
 * paddq adds the two qwords independently, so a carry out of the low qword
 * is detected with the scalar copy in TCTR_LOW and then added to the high
 * qword by temporarily shifting INC up by 8 bytes.
2581 */
2582.align 4
2583_aesni_inc:
2584	paddq INC, CTR			# low qword of CTR += 1
2585	add $1, TCTR_LOW		# same increment on the scalar copy sets CF
2586	jnc .Linc_low			# no carry out of the low qword
2587	pslldq $8, INC			# move the 1 into the high qword
2588	paddq INC, CTR			# propagate the carry
2589	psrldq $8, INC			# restore INC == 1
2590.Linc_low:
2591	movaps CTR, IV
2592	PSHUFB_XMM BSWAP_MASK IV	# back to big endian
2593	ret
2594ENDPROC(_aesni_inc)
2595
2596/*
2597 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2598 *		      size_t len, u8 *iv)
 *
 * CTR-mode processing of the whole 16-byte blocks found in the first len
 * bytes of src into dst: each block is xored with the encryption of the
 * current counter, which _aesni_inc then advances by one.  The counter that
 * follows the last processed block is written back to iv.  If len is below
 * 16 nothing is processed and iv is left unchanged.  Four blocks are
 * processed per iteration while at least 64 bytes remain, then one block
 * per iteration.  LEN is an unsigned size_t, so the length comparisons
 * below use the unsigned condition codes (jb/jae).
2599 */
2600ENTRY(aesni_ctr_enc)
2601	cmp $16, LEN
2602	jb .Lctr_enc_just_ret
2603	mov 480(KEYP), KLEN
2604	movups (IVP), IV
2605	call _aesni_inc_init
2606	cmp $64, LEN
2607	jb .Lctr_enc_loop1
2608.align 4
2609.Lctr_enc_loop4:
2610	movaps IV, STATE1		# counter for block 1
2611	call _aesni_inc
2612	movups (INP), IN1
2613	movaps IV, STATE2		# counter for block 2
2614	call _aesni_inc
2615	movups 0x10(INP), IN2
2616	movaps IV, STATE3		# counter for block 3
2617	call _aesni_inc
2618	movups 0x20(INP), IN3
2619	movaps IV, STATE4		# counter for block 4
2620	call _aesni_inc
2621	movups 0x30(INP), IN4
2622	call _aesni_enc4
2623	pxor IN1, STATE1
2624	movups STATE1, (OUTP)
2625	pxor IN2, STATE2
2626	movups STATE2, 0x10(OUTP)
2627	pxor IN3, STATE3
2628	movups STATE3, 0x20(OUTP)
2629	pxor IN4, STATE4
2630	movups STATE4, 0x30(OUTP)
2631	sub $64, LEN
2632	add $64, INP
2633	add $64, OUTP
2634	cmp $64, LEN
2635	jae .Lctr_enc_loop4		# unsigned compare: LEN is a size_t
2636	cmp $16, LEN
2637	jb .Lctr_enc_ret
2638.align 4
2639.Lctr_enc_loop1:
2640	movaps IV, STATE
2641	call _aesni_inc
2642	movups (INP), IN
2643	call _aesni_enc1
2644	pxor IN, STATE
2645	movups STATE, (OUTP)
2646	sub $16, LEN
2647	add $16, INP
2648	add $16, OUTP
2649	cmp $16, LEN
2650	jae .Lctr_enc_loop1		# unsigned compare: LEN is a size_t
2651.Lctr_enc_ret:
2652	movups IV, (IVP)		# hand the next counter back to the caller
2653.Lctr_enc_just_ret:
2654	ret
2655ENDPROC(aesni_ctr_enc)
2656
2657/*
2658 * _aesni_gf128mul_x_ble:		internal ABI
2659 *	Multiply in GF(2^128) for XTS IVs
2660 * input:
2661 *	IV:	current IV
2662 *	GF128MUL_MASK == mask with 0x87 and 0x01
2663 * output:
2664 *	IV:	next IV
2665 * changed:
2666 *	CTR:	== temporary value
 *
 * How the five instructions work:
 *	pshufd $0x13 puts dword 3 of IV into dword 0 of CTR and dword 1 of IV
 *	into dword 2 of CTR; paddq doubles each qword of IV, i.e. shifts each
 *	half left by one bit; psrad $31 turns every CTR dword into all-ones
 *	or zero according to its top bit, which is the bit shifted out of the
 *	corresponding qword; pand/pxor then fold the selected constants of
 *	GF128MUL_MASK back into IV.  The mask is defined outside this part of
 *	the file; presumably 0x87 sits in dword 0 and 0x01 in dword 2.
2667 */
2668#define _aesni_gf128mul_x_ble() \
2669	pshufd $0x13, IV, CTR; \
2670	paddq IV, IV; \
2671	psrad $31, CTR; \
2672	pand GF128MUL_MASK, CTR; \
2673	pxor CTR, IV;
2674
2675/*
2676 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2677 *			 bool enc, u8 *iv)
 *
 * XTS-process exactly eight 16-byte blocks (0x80 bytes) from src into dst.
 * enc != 0 selects _aesni_enc4 with the keys at ctx, enc == 0 selects
 * _aesni_dec4 with the keys 240 bytes into ctx.  For every block the current
 * tweak (IV) is xored into the input and parked in the matching dst slot; it
 * is read back from there and xored into the cipher output afterwards.  The
 * tweak following the eighth block is written back to iv.  INC is only used
 * as a scratch register here.  Symbols are addressed relative to %rip (this
 * code is only built for x86_64).
2678 */
2679ENTRY(aesni_xts_crypt8)
2680	cmpb $0, %cl			# enc == 0 ?
2681	movl $0, %ecx			# mov (not xor) so the cmpb flags survive
2682	movl $240, %r10d
2683	leaq _aesni_enc4(%rip), %r11
2684	leaq _aesni_dec4(%rip), %rax
2685	cmovel %r10d, %ecx		# decrypt: key offset 240
2686	cmoveq %rax, %r11		# decrypt: call _aesni_dec4
2687
2688	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
2689	movups (IVP), IV
2690
2691	mov 480(KEYP), KLEN
2692	addq %rcx, KEYP
2693
2694	movdqa IV, STATE1
2695	movdqu 0x00(INP), INC
2696	pxor INC, STATE1
2697	movdqu IV, 0x00(OUTP)		# park the tweak of block 0 in dst
2698
2699	_aesni_gf128mul_x_ble()
2700	movdqa IV, STATE2
2701	movdqu 0x10(INP), INC
2702	pxor INC, STATE2
2703	movdqu IV, 0x10(OUTP)
2704
2705	_aesni_gf128mul_x_ble()
2706	movdqa IV, STATE3
2707	movdqu 0x20(INP), INC
2708	pxor INC, STATE3
2709	movdqu IV, 0x20(OUTP)
2710
2711	_aesni_gf128mul_x_ble()
2712	movdqa IV, STATE4
2713	movdqu 0x30(INP), INC
2714	pxor INC, STATE4
2715	movdqu IV, 0x30(OUTP)
2716
2717	call *%r11			# blocks 0-3
2718
2719	movdqu 0x00(OUTP), INC		# fetch the parked tweak of block 0
2720	pxor INC, STATE1
2721	movdqu STATE1, 0x00(OUTP)
2722
2723	_aesni_gf128mul_x_ble()
2724	movdqa IV, STATE1
2725	movdqu 0x40(INP), INC
2726	pxor INC, STATE1
2727	movdqu IV, 0x40(OUTP)
2728
2729	movdqu 0x10(OUTP), INC
2730	pxor INC, STATE2
2731	movdqu STATE2, 0x10(OUTP)
2732
2733	_aesni_gf128mul_x_ble()
2734	movdqa IV, STATE2
2735	movdqu 0x50(INP), INC
2736	pxor INC, STATE2
2737	movdqu IV, 0x50(OUTP)
2738
2739	movdqu 0x20(OUTP), INC
2740	pxor INC, STATE3
2741	movdqu STATE3, 0x20(OUTP)
2742
2743	_aesni_gf128mul_x_ble()
2744	movdqa IV, STATE3
2745	movdqu 0x60(INP), INC
2746	pxor INC, STATE3
2747	movdqu IV, 0x60(OUTP)
2748
2749	movdqu 0x30(OUTP), INC
2750	pxor INC, STATE4
2751	movdqu STATE4, 0x30(OUTP)
2752
2753	_aesni_gf128mul_x_ble()
2754	movdqa IV, STATE4
2755	movdqu 0x70(INP), INC
2756	pxor INC, STATE4
2757	movdqu IV, 0x70(OUTP)
2758
2759	_aesni_gf128mul_x_ble()
2760	movups IV, (IVP)		# tweak for the block after these eight
2761
2762	call *%r11			# blocks 4-7
2763
2764	movdqu 0x40(OUTP), INC
2765	pxor INC, STATE1
2766	movdqu STATE1, 0x40(OUTP)
2767
2768	movdqu 0x50(OUTP), INC
2769	pxor INC, STATE2
2770	movdqu STATE2, 0x50(OUTP)
2771
2772	movdqu 0x60(OUTP), INC
2773	pxor INC, STATE3
2774	movdqu STATE3, 0x60(OUTP)
2775
2776	movdqu 0x70(OUTP), INC
2777	pxor INC, STATE4
2778	movdqu STATE4, 0x70(OUTP)
2779
2780	ret
2781ENDPROC(aesni_xts_crypt8)
2782
2783#endif
2784