xref: /linux/arch/x86/crypto/aesni-intel_asm.S (revision 0883c2c06fb5bcf5b9e008270827e63c09a88c1e)
1/*
2 * Implement AES algorithm in Intel AES-NI instructions.
3 *
4 * The white paper of AES-NI instructions can be downloaded from:
5 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
6 *
7 * Copyright (C) 2008, Intel Corp.
8 *    Author: Huang Ying <ying.huang@intel.com>
9 *            Vinodh Gopal <vinodh.gopal@intel.com>
10 *            Kahraman Akdemir
11 *
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
16 *             Adrian Hoban <adrian.hoban@intel.com>
17 *             James Guilford (james.guilford@intel.com)
18 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
19 *             Tadeusz Struk (tadeusz.struk@intel.com)
20 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
21 *    Copyright (c) 2010, Intel Corporation.
22 *
23 * Ported x86_64 version to x86:
24 *    Author: Mathias Krause <minipli@googlemail.com>
25 *
26 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
30 */
31
32#include <linux/linkage.h>
33#include <asm/inst.h>
34#include <asm/frame.h>
35
36/*
37 * The following macros are used to move an (un)aligned 16 byte value to/from
38 * an XMM register.  This can be done for either FP or integer values: for FP use
39 * movaps (move aligned packed single) or for integer use movdqa (move double quad
40 * aligned).  It doesn't make a performance difference which instruction is used
41 * since Nehalem (original Core i7) was released.  However, the movaps is a byte
42 * shorter, so that is the one we'll use for now. (same for unaligned).
43 */
44#define MOVADQ	movaps	/* aligned 16-byte move; the memory operand must be 16-byte aligned */
45#define MOVUDQ	movups	/* unaligned 16-byte move */
46
47#ifdef __x86_64__
48
49.data
50.align 16
/*
 * 16-byte constants.  Each one is loaded RIP-relative by the code below;
 * the .align above keeps the whole table 16-byte aligned, which the aligned
 * loads (MOVADQ / movdqa / memory-operand paddd) rely on.
 * NOTE(review): only SHUF_MASK and ONE are referenced in the part of the
 * file reviewed here; the remarks on the other constants are inferred from
 * their names and values and should be confirmed at their use sites.
 */
51.Lgf128mul_x_ble_mask:
52	.octa 0x00000000000000010000000000000087
53POLY:   .octa 0xC2000000000000000000000000000001	/* presumably the GHASH reduction polynomial - confirm */
54TWOONE: .octa 0x00000001000000000000000000000001
55
56# order of these constants should not change.
57# more specifically, ALL_F should follow SHIFT_MASK,
58# and ZERO should follow ALL_F
59
60SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F	/* PSHUFB control that reverses the 16 bytes of a register */
61MASK1:      .octa 0x0000000000000000ffffffffffffffff	/* low 64 bits set */
62MASK2:      .octa 0xffffffffffffffff0000000000000000	/* high 64 bits set */
63SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100	/* identity PSHUFB control (byte n selects byte n) */
64ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
65ZERO:       .octa 0x00000000000000000000000000000000
66ONE:        .octa 0x00000000000000000000000000000001	/* added with paddd to step the counter block */
67F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0	/* NOTE(review): literal has only 31 hex digits, so the top nibble is zero - confirm the intended value */
68dec:        .octa 0x1
69enc:        .octa 0x2
70
71
72.text
73
74
/*
 * Stack layout and argument names used by the GCM macros below.
 *
 * HashKey .. HashKey_4_k are byte offsets from %rsp of eight 16-byte slots.
 * The macros access them with movdqa, so %rsp must be 16-byte aligned while
 * they are in use.  VARIABLE_OFFSET is the combined size of the eight slots
 * (presumably the amount the callers reserve on the stack - confirm).
 *
 * arg1..arg6 are bare register names (written as %arg1 etc. at the use
 * site); arg7..arg10 are memory operands relative to %r14.
 */
75#define	STACK_OFFSET    8*3	/* bias of the stack arguments relative to %r14; presumably three 8-byte pushes precede the copy of %rsp into %r14 - confirm in the callers */
76#define	HashKey		16*0	// store HashKey <<1 mod poly here
77#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
78#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
79#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
80#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
81				// bits of  HashKey <<1 mod poly here
82				//(for Karatsuba purposes)
83#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
84				// bits of  HashKey^2 <<1 mod poly here
85				// (for Karatsuba purposes)
86#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
87				// bits of  HashKey^3 <<1 mod poly here
88				// (for Karatsuba purposes)
89#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
90				// bits of  HashKey^4 <<1 mod poly here
91				// (for Karatsuba purposes)
92#define	VARIABLE_OFFSET	16*8
93
94#define arg1 rdi	/* base of the AES round keys (round key n is read from 16*n(%arg1)) */
95#define arg2 rsi	/* output buffer */
96#define arg3 rdx	/* input buffer */
97#define arg4 rcx
98#define arg5 r8	/* pointer to the initial counter block Y0 */
99#define arg6 r9
100#define arg7 STACK_OFFSET+8(%r14)	/* AAD pointer */
101#define arg8 STACK_OFFSET+16(%r14)	/* AAD length in bytes */
102#define arg9 STACK_OFFSET+24(%r14)
103#define arg10 STACK_OFFSET+32(%r14)
104#define keysize 2*15*16(%arg1)	/* 32-bit field at byte offset 480 from arg1; used below as the key length in bytes (16, 24 or 32) */
105#endif
106
107
/*
 * Symbolic register names.  None of them is referenced by the GCM macros in
 * the part of the file reviewed here, so the roles are only what the names
 * suggest.
 * NOTE(review): BSWAP_MASK and GF128MUL_MASK are both %xmm10, so the two
 * must never be live at the same time - confirm at the use sites.
 * NOTE(review): IN3, IN4, BSWAP_MASK, CTR, INC and GF128MUL_MASK name
 * %xmm8 and above, which exist only in 64-bit mode, although the defines are
 * not guarded by __x86_64__.
 */
108#define STATE1	%xmm0
109#define STATE2	%xmm4
110#define STATE3	%xmm5
111#define STATE4	%xmm6
112#define STATE	STATE1
113#define IN1	%xmm1
114#define IN2	%xmm7
115#define IN3	%xmm8
116#define IN4	%xmm9
117#define IN	IN1
118#define KEY	%xmm2
119#define IV	%xmm3
120
121#define BSWAP_MASK %xmm10
122#define CTR	%xmm11
123#define INC	%xmm12
124
125#define GF128MUL_MASK %xmm10
126
/*
 * General purpose register names.  On 64-bit, KEYP/OUTP/INP/LEN/IVP/KLEN are
 * the first six integer argument registers of the System V AMD64 calling
 * convention, in order.  The 32-bit branch defines no T2/TCTR_LOW, and there
 * OUTP/UKEYP share a register with AREG.
 */
127#ifdef __x86_64__
128#define AREG	%rax
129#define KEYP	%rdi
130#define OUTP	%rsi
131#define UKEYP	OUTP
132#define INP	%rdx
133#define LEN	%rcx
134#define IVP	%r8
135#define KLEN	%r9d
136#define T1	%r10
137#define TKEYP	T1
138#define T2	%r11
139#define TCTR_LOW T2
140#else
141#define AREG	%eax
142#define KEYP	%edi
143#define OUTP	AREG
144#define UKEYP	OUTP
145#define INP	%edx
146#define LEN	%esi
147#define IVP	%ebp
148#define KLEN	%ebx
149#define T1	%ecx
150#define TKEYP	T1
151#endif
152
153
154#ifdef __x86_64__
155/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
156*
157*
158* Input: A and B (128-bits each, bit-reflected)
159* Output: C = A*B*x mod poly, (i.e. >>1 )
160* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
161* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
162*
* Operands (all XMM registers):
*   GH         in/out  multiplicand on entry, reduced 128-bit product on exit
*   HK         in      multiplier; only read, never written
*   TMP1-TMP5  scratch, overwritten
*
* The 128x128 carry-less multiply is done with three PCLMULQDQs (Karatsuba:
* a1*b1, a0*b0 and (a1+a0)*(b1+b0)), leaving a 256-bit product in TMP1:GH,
* which is then folded back to 128 bits by a two-phase shift/xor reduction.
163*/
164.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
165	movdqa	  \GH, \TMP1
166	pshufd	  $78, \GH, \TMP2       # TMP2 = GH with its 64-bit halves swapped
167	pshufd	  $78, \HK, \TMP3       # TMP3 = HK with its 64-bit halves swapped
168	pxor	  \GH, \TMP2            # TMP2 = a1+a0
169	pxor	  \HK, \TMP3            # TMP3 = b1+b0
170	PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
171	PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
172	PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
173	pxor	  \GH, \TMP2
174	pxor	  \TMP1, \TMP2          # TMP2 = (a0*b1)+(a1*b0)
175	movdqa	  \TMP2, \TMP3
176	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
177	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
178	pxor	  \TMP3, \GH
179	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
180
181        # first phase of the reduction
182
183	movdqa    \GH, \TMP2
184	movdqa    \GH, \TMP3
185	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
186					# in order to perform
187					# independent shifts
188	pslld     $31, \TMP2            # packed left shift <<31
189	pslld     $30, \TMP3            # packed left shift <<30
190	pslld     $25, \TMP4            # packed left shift <<25
191	pxor      \TMP3, \TMP2          # xor the shifted versions
192	pxor      \TMP4, \TMP2
193	movdqa    \TMP2, \TMP5
194	psrldq    $4, \TMP5             # right shift TMP5 1 DW
195	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
196	pxor      \TMP2, \GH
197
198        # second phase of the reduction
199
200	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
201					# in order to perform
202					# independent shifts
203	movdqa    \GH,\TMP3
204	movdqa    \GH,\TMP4
205	psrld     $1,\TMP2              # packed right shift >>1
206	psrld     $2,\TMP3              # packed right shift >>2
207	psrld     $7,\TMP4              # packed right shift >>7
208	pxor      \TMP3,\TMP2		# xor the shifted versions
209	pxor      \TMP4,\TMP2
210	pxor      \TMP5, \TMP2          # fold in the part saved by the first phase
211	pxor      \TMP2, \GH
212	pxor      \TMP1, \GH            # add the high half; result is in GH
213.endm
214
215/*
216* if a = number of total plaintext bytes
217* b = floor(a/16)
218* num_initial_blocks = b mod 4
219* decrypt the initial num_initial_blocks blocks and apply ghash on
220* the ciphertext
221* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
222* are clobbered
223* %arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
*
* Additional notes:
* - num_initial_blocks and operation are only used to build unique label names.
* - i is the number of the XMM register that accumulates the AAD hash;
*   i_seq is a string of XMM register digits, one per initial block.
* - TMP3 is read as the hash key operand of every GHASH_MUL, so it must hold
*   the hash key on entry; it is not modified.
* - XMM0 receives the byte-swapped counter block loaded through %arg5 and is
*   incremented once per block; XMM1-XMM4 are also used as scratch.
* - %r13 is read (compared with 64) but never written here; presumably it is
*   the remaining byte count set up by the caller - confirm.
* - %xmm14 is overwritten with SHUF_MASK.
* - When at least 64 bytes remain, HashKey^2..^4 and the *_k values are
*   computed and stored to the (%rsp) slots, and four more blocks are
*   processed.  The HashKey slot itself is not written by this macro.
* - On exit %r11 is the byte offset of the first unprocessed block.
* - XMMDst is xored into XMM1 at the end; the hash chains below finish in
*   %xmm8, so presumably callers pass %xmm8 - confirm.
224*/
225
226
227.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
228XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
229        MOVADQ     SHUF_MASK(%rip), %xmm14
230	mov	   arg7, %r10           # %r10 = AAD
231	mov	   arg8, %r12           # %r12 = aadLen
232	mov	   %r12, %r11
233	pxor	   %xmm\i, %xmm\i
234
	/*
	 * Gather the AAD four bytes at a time into the top of the accumulator.
	 * NOTE(review): this loop exits only when %r12 reaches exactly zero, so
	 * aadLen must be a non-zero multiple of 4, and the padding loop further
	 * down additionally requires aadLen <= 16.
	 */
235_get_AAD_loop\num_initial_blocks\operation:
236	movd	   (%r10), \TMP1        # next 4 AAD bytes
237	pslldq	   $12, \TMP1           # move them to the top dword
238	psrldq	   $4, %xmm\i           # shift earlier AAD down one dword
239	pxor	   \TMP1, %xmm\i
240	add	   $4, %r10
241	sub	   $4, %r12
242	jne	   _get_AAD_loop\num_initial_blocks\operation
243
244	cmp	   $16, %r11            # a full 16-byte AAD needs no padding shift
245	je	   _get_AAD_loop2_done\num_initial_blocks\operation
246
247	mov	   $16, %r12
	/* shift right one dword per missing AAD dword so the data ends up in the low bytes */
248_get_AAD_loop2\num_initial_blocks\operation:
249	psrldq	   $4, %xmm\i
250	sub	   $4, %r12
251	cmp	   %r11, %r12
252	jne	   _get_AAD_loop2\num_initial_blocks\operation
253
254_get_AAD_loop2_done\num_initial_blocks\operation:
255	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
256
257	xor	   %r11, %r11 # initialise the data pointer offset as zero
258
259        # start AES for num_initial_blocks blocks
260
261	mov	   %arg5, %rax                      # %rax = *Y0
262	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
263	PSHUFB_XMM   %xmm14, \XMM0
264
265.if (\i == 5) || (\i == 6) || (\i == 7)
266	MOVADQ		ONE(%RIP),\TMP1
267	MOVADQ		(%arg1),\TMP2                # TMP2 = round key 0
268.irpc index, \i_seq
269	paddd	   \TMP1, \XMM0                 # INCR Y0
270	movdqa	   \XMM0, %xmm\index
271	PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
272	pxor	   \TMP2, %xmm\index            # round 0: xor with round key 0
273.endr
274	lea	0x10(%arg1),%r10                # %r10 = address of round key 1
275	mov	keysize,%eax
276	shr	$2,%eax				# 128->4, 192->6, 256->8
277	add	$5,%eax			      # 128->9, 192->11, 256->13
278
	/* %eax = number of AESENC rounds still to run, %r10 = next round key */
279aes_loop_initial_dec\num_initial_blocks:
280	MOVADQ	(%r10),\TMP1
281.irpc	index, \i_seq
282	AESENC	\TMP1, %xmm\index
283.endr
284	add	$16,%r10
285	sub	$1,%eax
286	jnz	aes_loop_initial_dec\num_initial_blocks
287
288	MOVADQ	(%r10), \TMP1
289.irpc index, \i_seq
290	AESENCLAST \TMP1, %xmm\index         # Last Round
291.endr
292.irpc index, \i_seq
293	movdqu	   (%arg3 , %r11, 1), \TMP1     # TMP1 = input (ciphertext) block
294	pxor	   \TMP1, %xmm\index
295	movdqu	   %xmm\index, (%arg2 , %r11, 1)
296	# write back plaintext/ciphertext for num_initial_blocks
297	add	   $16, %r11
298
299	movdqa     \TMP1, %xmm\index            # keep the input block: GHASH runs over the ciphertext
300	PSHUFB_XMM	   %xmm14, %xmm\index
301                # prepare plaintext/ciphertext for GHASH computation
302.endr
303.endif
	/* XMM1 only serves as the fifth scratch register of GHASH_MUL in this part */
304	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
305        # apply GHASH on num_initial_blocks blocks
306
	/* chain the hash through the initial blocks; each chain ends in %xmm8 */
307.if \i == 5
308        pxor       %xmm5, %xmm6
309	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
310        pxor       %xmm6, %xmm7
311	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
312        pxor       %xmm7, %xmm8
313	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
314.elseif \i == 6
315        pxor       %xmm6, %xmm7
316	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
317        pxor       %xmm7, %xmm8
318	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
319.elseif \i == 7
320        pxor       %xmm7, %xmm8
321	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
322.endif
323	cmp	   $64, %r13            # signed compare; %r13 is set by the caller
324	jl	_initial_blocks_done\num_initial_blocks\operation
325	# no need for precomputed values
326/*
327*
328* Precomputations for HashKey parallel with encryption of first 4 blocks.
329* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
330*/
331	MOVADQ	   ONE(%rip), \TMP1
332	paddd	   \TMP1, \XMM0              # INCR Y0
333	MOVADQ	   \XMM0, \XMM1
334	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
335
336	paddd	   \TMP1, \XMM0              # INCR Y0
337	MOVADQ	   \XMM0, \XMM2
338	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
339
340	paddd	   \TMP1, \XMM0              # INCR Y0
341	MOVADQ	   \XMM0, \XMM3
342	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
343
344	paddd	   \TMP1, \XMM0              # INCR Y0
345	MOVADQ	   \XMM0, \XMM4
346	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
347
348	MOVADQ	   0(%arg1),\TMP1            # round 0: xor with round key 0
349	pxor	   \TMP1, \XMM1
350	pxor	   \TMP1, \XMM2
351	pxor	   \TMP1, \XMM3
352	pxor	   \TMP1, \XMM4
353	movdqa	   \TMP3, \TMP5              # TMP5 = hash key, squared in place below
354	pshufd	   $78, \TMP3, \TMP1
355	pxor	   \TMP3, \TMP1              # TMP1 = high half xor low half
356	movdqa	   \TMP1, HashKey_k(%rsp)
357	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
358# TMP5 = HashKey^2<<1 (mod poly)
359	movdqa	   \TMP5, HashKey_2(%rsp)
360# HashKey_2 = HashKey^2<<1 (mod poly)
361	pshufd	   $78, \TMP5, \TMP1
362	pxor	   \TMP5, \TMP1
363	movdqa	   \TMP1, HashKey_2_k(%rsp)
364.irpc index, 1234 # do 4 rounds
365	movaps 0x10*\index(%arg1), \TMP1
366	AESENC	   \TMP1, \XMM1
367	AESENC	   \TMP1, \XMM2
368	AESENC	   \TMP1, \XMM3
369	AESENC	   \TMP1, \XMM4
370.endr
371	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
372# TMP5 = HashKey^3<<1 (mod poly)
373	movdqa	   \TMP5, HashKey_3(%rsp)
374	pshufd	   $78, \TMP5, \TMP1
375	pxor	   \TMP5, \TMP1
376	movdqa	   \TMP1, HashKey_3_k(%rsp)
377.irpc index, 56789 # do next 5 rounds
378	movaps 0x10*\index(%arg1), \TMP1
379	AESENC	   \TMP1, \XMM1
380	AESENC	   \TMP1, \XMM2
381	AESENC	   \TMP1, \XMM3
382	AESENC	   \TMP1, \XMM4
383.endr
384	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
385# TMP5 = HashKey^4<<1 (mod poly)
386	movdqa	   \TMP5, HashKey_4(%rsp)
387	pshufd	   $78, \TMP5, \TMP1
388	pxor	   \TMP5, \TMP1
389	movdqa	   \TMP1, HashKey_4_k(%rsp)
390	lea	   0xa0(%arg1),%r10             # %r10 = address of round key 10
391	mov	   keysize,%eax
392	shr	   $2,%eax			# 128->4, 192->6, 256->8
393	sub	   $4,%eax			# 128->0, 192->2, 256->4
394	jz	   aes_loop_pre_dec_done\num_initial_blocks
395
	/*
	 * Extra AESENC rounds for keys longer than 128 bits.
	 * NOTE(review): this loop names %xmm1-%xmm4 directly instead of using
	 * the XMM1-XMM4 macro arguments, so it is only correct when the caller
	 * passes exactly those registers.
	 */
396aes_loop_pre_dec\num_initial_blocks:
397	MOVADQ	   (%r10),\TMP2
398.irpc	index, 1234
399	AESENC	   \TMP2, %xmm\index
400.endr
401	add	   $16,%r10
402	sub	   $1,%eax
403	jnz	   aes_loop_pre_dec\num_initial_blocks
404
405aes_loop_pre_dec_done\num_initial_blocks:
406	MOVADQ	   (%r10), \TMP2
407	AESENCLAST \TMP2, \XMM1
408	AESENCLAST \TMP2, \XMM2
409	AESENCLAST \TMP2, \XMM3
410	AESENCLAST \TMP2, \XMM4
	/* per block: xor keystream with the input, store the result, keep the input block for GHASH */
411	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
412	pxor	   \TMP1, \XMM1
413	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
414	movdqa     \TMP1, \XMM1
415	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
416	pxor	   \TMP1, \XMM2
417	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
418	movdqa     \TMP1, \XMM2
419	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
420	pxor	   \TMP1, \XMM3
421	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
422	movdqa     \TMP1, \XMM3
423	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
424	pxor	   \TMP1, \XMM4
425	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
426	movdqa     \TMP1, \XMM4
427	add	   $64, %r11
428	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
429	pxor	   \XMMDst, \XMM1
430# combine GHASHed value with the corresponding ciphertext
431	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
432	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
433	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
434
435_initial_blocks_done\num_initial_blocks\operation:
436
437.endm
438
439
440/*
441* if a = number of total plaintext bytes
442* b = floor(a/16)
443* num_initial_blocks = b mod 4
444* encrypt the initial num_initial_blocks blocks and apply ghash on
445* the ciphertext
446* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
447* are clobbered
448* %arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
*
* Additional notes:
* - num_initial_blocks and operation are only used to build unique label names.
* - i is the number of the XMM register that accumulates the AAD hash;
*   i_seq is a string of XMM register digits, one per initial block.
* - TMP3 is read as the hash key operand of every GHASH_MUL, so it must hold
*   the hash key on entry; it is not modified.
* - XMM0 receives the byte-swapped counter block loaded through %arg5 and is
*   incremented once per block; XMM1-XMM4 are also used as scratch.
* - %r13 is read (compared with 64) but never written here; presumably it is
*   the remaining byte count set up by the caller - confirm.
* - %xmm14 is overwritten with SHUF_MASK.
* - When at least 64 bytes remain, HashKey^2..^4 and the *_k values are
*   computed and stored to the (%rsp) slots, and four more blocks are
*   processed.  The HashKey slot itself is not written by this macro.
* - On exit %r11 is the byte offset of the first unprocessed block.
* - XMMDst is xored into XMM1 at the end; the hash chains below finish in
*   %xmm8, so presumably callers pass %xmm8 - confirm.
* - Unlike the DEC variant, the blocks fed to GHASH are the output blocks.
449*/
450
451
452.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
453XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
454        MOVADQ     SHUF_MASK(%rip), %xmm14
455	mov	   arg7, %r10           # %r10 = AAD
456	mov	   arg8, %r12           # %r12 = aadLen
457	mov	   %r12, %r11
458	pxor	   %xmm\i, %xmm\i
	/*
	 * Gather the AAD four bytes at a time into the top of the accumulator.
	 * NOTE(review): this loop exits only when %r12 reaches exactly zero, so
	 * aadLen must be a non-zero multiple of 4, and the padding loop further
	 * down additionally requires aadLen <= 16.
	 */
459_get_AAD_loop\num_initial_blocks\operation:
460	movd	   (%r10), \TMP1        # next 4 AAD bytes
461	pslldq	   $12, \TMP1           # move them to the top dword
462	psrldq	   $4, %xmm\i           # shift earlier AAD down one dword
463	pxor	   \TMP1, %xmm\i
464	add	   $4, %r10
465	sub	   $4, %r12
466	jne	   _get_AAD_loop\num_initial_blocks\operation
467	cmp	   $16, %r11            # a full 16-byte AAD needs no padding shift
468	je	   _get_AAD_loop2_done\num_initial_blocks\operation
469	mov	   $16, %r12
	/* shift right one dword per missing AAD dword so the data ends up in the low bytes */
470_get_AAD_loop2\num_initial_blocks\operation:
471	psrldq	   $4, %xmm\i
472	sub	   $4, %r12
473	cmp	   %r11, %r12
474	jne	   _get_AAD_loop2\num_initial_blocks\operation
475_get_AAD_loop2_done\num_initial_blocks\operation:
476	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
477
478	xor	   %r11, %r11 # initialise the data pointer offset as zero
479
480        # start AES for num_initial_blocks blocks
481
482	mov	   %arg5, %rax                      # %rax = *Y0
483	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
484	PSHUFB_XMM   %xmm14, \XMM0
485
486.if (\i == 5) || (\i == 6) || (\i == 7)
487
488	MOVADQ		ONE(%RIP),\TMP1
489	MOVADQ		0(%arg1),\TMP2               # TMP2 = round key 0
490.irpc index, \i_seq
491	paddd		\TMP1, \XMM0                 # INCR Y0
492	MOVADQ		\XMM0, %xmm\index
493	PSHUFB_XMM	%xmm14, %xmm\index      # perform a 16 byte swap
494	pxor		\TMP2, %xmm\index            # round 0: xor with round key 0
495.endr
496	lea	0x10(%arg1),%r10                # %r10 = address of round key 1
497	mov	keysize,%eax
498	shr	$2,%eax				# 128->4, 192->6, 256->8
499	add	$5,%eax			      # 128->9, 192->11, 256->13
500
	/* %eax = number of AESENC rounds still to run, %r10 = next round key */
501aes_loop_initial_enc\num_initial_blocks:
502	MOVADQ	(%r10),\TMP1
503.irpc	index, \i_seq
504	AESENC	\TMP1, %xmm\index
505.endr
506	add	$16,%r10
507	sub	$1,%eax
508	jnz	aes_loop_initial_enc\num_initial_blocks
509
510	MOVADQ	(%r10), \TMP1
511.irpc index, \i_seq
512	AESENCLAST \TMP1, %xmm\index         # Last Round
513.endr
514.irpc index, \i_seq
515	movdqu	   (%arg3 , %r11, 1), \TMP1     # TMP1 = input (plaintext) block
516	pxor	   \TMP1, %xmm\index
517	movdqu	   %xmm\index, (%arg2 , %r11, 1)
518	# write back plaintext/ciphertext for num_initial_blocks
519	add	   $16, %r11
520	PSHUFB_XMM	   %xmm14, %xmm\index   # the output block stays in the register for GHASH
521
522		# prepare plaintext/ciphertext for GHASH computation
523.endr
524.endif
	/* XMM1 only serves as the fifth scratch register of GHASH_MUL in this part */
525	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
526        # apply GHASH on num_initial_blocks blocks
527
	/* chain the hash through the initial blocks; each chain ends in %xmm8 */
528.if \i == 5
529        pxor       %xmm5, %xmm6
530	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
531        pxor       %xmm6, %xmm7
532	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
533        pxor       %xmm7, %xmm8
534	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
535.elseif \i == 6
536        pxor       %xmm6, %xmm7
537	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
538        pxor       %xmm7, %xmm8
539	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
540.elseif \i == 7
541        pxor       %xmm7, %xmm8
542	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
543.endif
544	cmp	   $64, %r13            # signed compare; %r13 is set by the caller
545	jl	_initial_blocks_done\num_initial_blocks\operation
546	# no need for precomputed values
547/*
548*
549* Precomputations for HashKey parallel with encryption of first 4 blocks.
550* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
551*/
552	MOVADQ	   ONE(%RIP),\TMP1
553	paddd	   \TMP1, \XMM0              # INCR Y0
554	MOVADQ	   \XMM0, \XMM1
555	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
556
557	paddd	   \TMP1, \XMM0              # INCR Y0
558	MOVADQ	   \XMM0, \XMM2
559	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
560
561	paddd	   \TMP1, \XMM0              # INCR Y0
562	MOVADQ	   \XMM0, \XMM3
563	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
564
565	paddd	   \TMP1, \XMM0              # INCR Y0
566	MOVADQ	   \XMM0, \XMM4
567	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
568
569	MOVADQ	   0(%arg1),\TMP1            # round 0: xor with round key 0
570	pxor	   \TMP1, \XMM1
571	pxor	   \TMP1, \XMM2
572	pxor	   \TMP1, \XMM3
573	pxor	   \TMP1, \XMM4
574	movdqa	   \TMP3, \TMP5              # TMP5 = hash key, squared in place below
575	pshufd	   $78, \TMP3, \TMP1
576	pxor	   \TMP3, \TMP1              # TMP1 = high half xor low half
577	movdqa	   \TMP1, HashKey_k(%rsp)
578	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
579# TMP5 = HashKey^2<<1 (mod poly)
580	movdqa	   \TMP5, HashKey_2(%rsp)
581# HashKey_2 = HashKey^2<<1 (mod poly)
582	pshufd	   $78, \TMP5, \TMP1
583	pxor	   \TMP5, \TMP1
584	movdqa	   \TMP1, HashKey_2_k(%rsp)
585.irpc index, 1234 # do 4 rounds
586	movaps 0x10*\index(%arg1), \TMP1
587	AESENC	   \TMP1, \XMM1
588	AESENC	   \TMP1, \XMM2
589	AESENC	   \TMP1, \XMM3
590	AESENC	   \TMP1, \XMM4
591.endr
592	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
593# TMP5 = HashKey^3<<1 (mod poly)
594	movdqa	   \TMP5, HashKey_3(%rsp)
595	pshufd	   $78, \TMP5, \TMP1
596	pxor	   \TMP5, \TMP1
597	movdqa	   \TMP1, HashKey_3_k(%rsp)
598.irpc index, 56789 # do next 5 rounds
599	movaps 0x10*\index(%arg1), \TMP1
600	AESENC	   \TMP1, \XMM1
601	AESENC	   \TMP1, \XMM2
602	AESENC	   \TMP1, \XMM3
603	AESENC	   \TMP1, \XMM4
604.endr
605	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
606# TMP5 = HashKey^4<<1 (mod poly)
607	movdqa	   \TMP5, HashKey_4(%rsp)
608	pshufd	   $78, \TMP5, \TMP1
609	pxor	   \TMP5, \TMP1
610	movdqa	   \TMP1, HashKey_4_k(%rsp)
611	lea	   0xa0(%arg1),%r10             # %r10 = address of round key 10
612	mov	   keysize,%eax
613	shr	   $2,%eax			# 128->4, 192->6, 256->8
614	sub	   $4,%eax			# 128->0, 192->2, 256->4
615	jz	   aes_loop_pre_enc_done\num_initial_blocks
616
	/*
	 * Extra AESENC rounds for keys longer than 128 bits.
	 * NOTE(review): this loop names %xmm1-%xmm4 directly instead of using
	 * the XMM1-XMM4 macro arguments, so it is only correct when the caller
	 * passes exactly those registers.
	 */
617aes_loop_pre_enc\num_initial_blocks:
618	MOVADQ	   (%r10),\TMP2
619.irpc	index, 1234
620	AESENC	   \TMP2, %xmm\index
621.endr
622	add	   $16,%r10
623	sub	   $1,%eax
624	jnz	   aes_loop_pre_enc\num_initial_blocks
625
626aes_loop_pre_enc_done\num_initial_blocks:
627	MOVADQ	   (%r10), \TMP2
628	AESENCLAST \TMP2, \XMM1
629	AESENCLAST \TMP2, \XMM2
630	AESENCLAST \TMP2, \XMM3
631	AESENCLAST \TMP2, \XMM4
	/* xor the keystream with four input blocks, then store the four output blocks */
632	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
633	pxor	   \TMP1, \XMM1
634	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
635	pxor	   \TMP1, \XMM2
636	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
637	pxor	   \TMP1, \XMM3
638	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
639	pxor	   \TMP1, \XMM4
640	movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
641	movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
642	movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
643	movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
644
645	add	   $64, %r11
646	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
647	pxor	   \XMMDst, \XMM1
648# combine GHASHed value with the corresponding ciphertext
649	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
650	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
651	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
652
653_initial_blocks_done\num_initial_blocks\operation:
654
655.endm
656
657/*
658* encrypt 4 blocks at a time
659* ghash the 4 previously encrypted ciphertext blocks
660* %arg1, %arg2, %arg3 are used as pointers only, not modified
661* %r11 is the data offset value
*
* Additional notes:
* - On entry XMM1-XMM4 hold the four previous byte-reflected ciphertext
*   blocks (XMM1 already xored with the running hash) and XMM0 the counter.
*   On exit XMM1-XMM4 hold the four new byte-reflected ciphertext blocks,
*   with the updated hash xored into XMM1.
* - HashKey .. HashKey_4 and HashKey_k .. HashKey_4_k are read from the
*   (%rsp) slots with movdqa.
* - The AES rounds are interleaved with the PCLMULQDQ work of the hash, so
*   the instruction order is deliberate.
* - %r11 is read but not advanced here.  %r10, %eax, %xmm15, TMP1-TMP6 and
*   XMM5-XMM8 are clobbered.
* - The operation argument is not referenced in the macro body.
* - NOTE(review): the labels aes_loop_par_enc and aes_loop_par_enc_done are
*   fixed names, so this macro can be expanded only once per object file.
* - NOTE(review): the extra-rounds loop names %xmm1-%xmm4 directly instead
*   of using XMM1-XMM4, so callers must pass exactly those registers.
662*/
663.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
664TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
665
666	movdqa	  \XMM1, \XMM5
667	movdqa	  \XMM2, \XMM6
668	movdqa	  \XMM3, \XMM7
669	movdqa	  \XMM4, \XMM8
670
671        movdqa    SHUF_MASK(%rip), %xmm15
672        # multiply XMM5 * HashKey^4 using karatsuba
673
674	movdqa	  \XMM5, \TMP4
675	pshufd	  $78, \XMM5, \TMP6
676	pxor	  \XMM5, \TMP6
677	paddd     ONE(%rip), \XMM0		# INCR CNT
678	movdqa	  HashKey_4(%rsp), \TMP5
679	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
680	movdqa    \XMM0, \XMM1
681	paddd     ONE(%rip), \XMM0		# INCR CNT
682	movdqa    \XMM0, \XMM2
683	paddd     ONE(%rip), \XMM0		# INCR CNT
684	movdqa    \XMM0, \XMM3
685	paddd     ONE(%rip), \XMM0		# INCR CNT
686	movdqa    \XMM0, \XMM4
687	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
688	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
689	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
690	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
691	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
692
693	pxor	  (%arg1), \XMM1                # round 0: xor with round key 0
694	pxor	  (%arg1), \XMM2
695	pxor	  (%arg1), \XMM3
696	pxor	  (%arg1), \XMM4
697	movdqa	  HashKey_4_k(%rsp), \TMP5
698	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
699	movaps 0x10(%arg1), \TMP1
700	AESENC	  \TMP1, \XMM1              # Round 1
701	AESENC	  \TMP1, \XMM2
702	AESENC	  \TMP1, \XMM3
703	AESENC	  \TMP1, \XMM4
704	movaps 0x20(%arg1), \TMP1
705	AESENC	  \TMP1, \XMM1              # Round 2
706	AESENC	  \TMP1, \XMM2
707	AESENC	  \TMP1, \XMM3
708	AESENC	  \TMP1, \XMM4
709	movdqa	  \XMM6, \TMP1                 # multiply XMM6 * HashKey^3 using karatsuba
710	pshufd	  $78, \XMM6, \TMP2
711	pxor	  \XMM6, \TMP2
712	movdqa	  HashKey_3(%rsp), \TMP5
713	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
714	movaps 0x30(%arg1), \TMP3
715	AESENC    \TMP3, \XMM1              # Round 3
716	AESENC    \TMP3, \XMM2
717	AESENC    \TMP3, \XMM3
718	AESENC    \TMP3, \XMM4
719	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
720	movaps 0x40(%arg1), \TMP3
721	AESENC	  \TMP3, \XMM1              # Round 4
722	AESENC	  \TMP3, \XMM2
723	AESENC	  \TMP3, \XMM3
724	AESENC	  \TMP3, \XMM4
725	movdqa	  HashKey_3_k(%rsp), \TMP5
726	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
727	movaps 0x50(%arg1), \TMP3
728	AESENC	  \TMP3, \XMM1              # Round 5
729	AESENC	  \TMP3, \XMM2
730	AESENC	  \TMP3, \XMM3
731	AESENC	  \TMP3, \XMM4
732	pxor	  \TMP1, \TMP4
733# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
734	pxor	  \XMM6, \XMM5
735	pxor	  \TMP2, \TMP6
736	movdqa	  \XMM7, \TMP1
737	pshufd	  $78, \XMM7, \TMP2
738	pxor	  \XMM7, \TMP2
739	movdqa	  HashKey_2(%rsp ), \TMP5
740
741        # Multiply XMM7 * HashKey^2 using karatsuba
742
743	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
744	movaps 0x60(%arg1), \TMP3
745	AESENC	  \TMP3, \XMM1              # Round 6
746	AESENC	  \TMP3, \XMM2
747	AESENC	  \TMP3, \XMM3
748	AESENC	  \TMP3, \XMM4
749	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
750	movaps 0x70(%arg1), \TMP3
751	AESENC	  \TMP3, \XMM1             # Round 7
752	AESENC	  \TMP3, \XMM2
753	AESENC	  \TMP3, \XMM3
754	AESENC	  \TMP3, \XMM4
755	movdqa	  HashKey_2_k(%rsp), \TMP5
756	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
757	movaps 0x80(%arg1), \TMP3
758	AESENC	  \TMP3, \XMM1             # Round 8
759	AESENC	  \TMP3, \XMM2
760	AESENC	  \TMP3, \XMM3
761	AESENC	  \TMP3, \XMM4
762	pxor	  \TMP1, \TMP4
763# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
764	pxor	  \XMM7, \XMM5
765	pxor	  \TMP2, \TMP6
766
767        # Multiply XMM8 * HashKey
768        # XMM8 and TMP5 hold the values for the two operands
769
770	movdqa	  \XMM8, \TMP1
771	pshufd	  $78, \XMM8, \TMP2
772	pxor	  \XMM8, \TMP2
773	movdqa	  HashKey(%rsp), \TMP5
774	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
775	movaps 0x90(%arg1), \TMP3
776	AESENC	  \TMP3, \XMM1            # Round 9
777	AESENC	  \TMP3, \XMM2
778	AESENC	  \TMP3, \XMM3
779	AESENC	  \TMP3, \XMM4
780	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
781	lea	  0xa0(%arg1),%r10              # %r10 = address of round key 10
782	mov	  keysize,%eax
783	shr	  $2,%eax			# 128->4, 192->6, 256->8
784	sub	  $4,%eax			# 128->0, 192->2, 256->4
785	jz	  aes_loop_par_enc_done
786
	/* extra AESENC rounds for keys longer than 128 bits; see the NOTE(review) items in the header */
787aes_loop_par_enc:
788	MOVADQ	  (%r10),\TMP3
789.irpc	index, 1234
790	AESENC	  \TMP3, %xmm\index
791.endr
792	add	  $16,%r10
793	sub	  $1,%eax
794	jnz	  aes_loop_par_enc
795
796aes_loop_par_enc_done:
797	MOVADQ	  (%r10), \TMP3
798	AESENCLAST \TMP3, \XMM1           # Last round (round 10 for 128-bit keys)
799	AESENCLAST \TMP3, \XMM2
800	AESENCLAST \TMP3, \XMM3
801	AESENCLAST \TMP3, \XMM4
802	movdqa    HashKey_k(%rsp), \TMP5
803	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
804	movdqu	  (%arg3,%r11,1), \TMP3
805	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
806	movdqu	  16(%arg3,%r11,1), \TMP3
807	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
808	movdqu	  32(%arg3,%r11,1), \TMP3
809	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
810	movdqu	  48(%arg3,%r11,1), \TMP3
811	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
812        movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
813        movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
814        movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
815        movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
816	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
817	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
818	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
819	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
820
821	pxor	  \TMP4, \TMP1                 # TMP1 = sum of the four high products
822	pxor	  \XMM8, \XMM5                 # XMM5 = sum of the four low products
823	pxor	  \TMP6, \TMP2                 # TMP2 = sum of the four (a1+a0)*(b1+b0) products
824	pxor	  \TMP1, \TMP2
825	pxor	  \XMM5, \TMP2                 # TMP2 = middle term of the combined product
826	movdqa	  \TMP2, \TMP3
827	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
828	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
829	pxor	  \TMP3, \XMM5
830	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
831
832        # first phase of reduction
833
834	movdqa    \XMM5, \TMP2
835	movdqa    \XMM5, \TMP3
836	movdqa    \XMM5, \TMP4
837# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
838	pslld     $31, \TMP2                   # packed left shift << 31
839	pslld     $30, \TMP3                   # packed left shift << 30
840	pslld     $25, \TMP4                   # packed left shift << 25
841	pxor      \TMP3, \TMP2	               # xor the shifted versions
842	pxor      \TMP4, \TMP2
843	movdqa    \TMP2, \TMP5
844	psrldq    $4, \TMP5                    # right shift T5 1 DW
845	pslldq    $12, \TMP2                   # left shift T2 3 DWs
846	pxor      \TMP2, \XMM5
847
848        # second phase of reduction
849
850	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
851	movdqa    \XMM5,\TMP3
852	movdqa    \XMM5,\TMP4
853	psrld     $1, \TMP2                    # packed right shift >>1
854	psrld     $2, \TMP3                    # packed right shift >>2
855	psrld     $7, \TMP4                    # packed right shift >>7
856	pxor      \TMP3,\TMP2		       # xor the shifted versions
857	pxor      \TMP4,\TMP2
858	pxor      \TMP5, \TMP2
859	pxor      \TMP2, \XMM5
860	pxor      \TMP1, \XMM5                 # result is in XMM5
861
862	pxor	  \XMM5, \XMM1                 # fold the updated hash into the first new block
863.endm
864
865/*
866* decrypt 4 blocks at a time
867* ghash the 4 previously decrypted ciphertext blocks
868* arg1, %arg2, %arg3 are used as pointers only, not modified
869* %r11 is the data offset value
870*/
871.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
872TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
873
874	movdqa	  \XMM1, \XMM5
875	movdqa	  \XMM2, \XMM6
876	movdqa	  \XMM3, \XMM7
877	movdqa	  \XMM4, \XMM8
878
879        movdqa    SHUF_MASK(%rip), %xmm15
880        # multiply TMP5 * HashKey using karatsuba
881
882	movdqa	  \XMM5, \TMP4
883	pshufd	  $78, \XMM5, \TMP6
884	pxor	  \XMM5, \TMP6
885	paddd     ONE(%rip), \XMM0		# INCR CNT
886	movdqa	  HashKey_4(%rsp), \TMP5
887	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
888	movdqa    \XMM0, \XMM1
889	paddd     ONE(%rip), \XMM0		# INCR CNT
890	movdqa    \XMM0, \XMM2
891	paddd     ONE(%rip), \XMM0		# INCR CNT
892	movdqa    \XMM0, \XMM3
893	paddd     ONE(%rip), \XMM0		# INCR CNT
894	movdqa    \XMM0, \XMM4
895	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
896	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
897	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
898	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
899	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
900
901	pxor	  (%arg1), \XMM1
902	pxor	  (%arg1), \XMM2
903	pxor	  (%arg1), \XMM3
904	pxor	  (%arg1), \XMM4
905	movdqa	  HashKey_4_k(%rsp), \TMP5
906	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
907	movaps 0x10(%arg1), \TMP1
908	AESENC	  \TMP1, \XMM1              # Round 1
909	AESENC	  \TMP1, \XMM2
910	AESENC	  \TMP1, \XMM3
911	AESENC	  \TMP1, \XMM4
912	movaps 0x20(%arg1), \TMP1
913	AESENC	  \TMP1, \XMM1              # Round 2
914	AESENC	  \TMP1, \XMM2
915	AESENC	  \TMP1, \XMM3
916	AESENC	  \TMP1, \XMM4
917	movdqa	  \XMM6, \TMP1
918	pshufd	  $78, \XMM6, \TMP2
919	pxor	  \XMM6, \TMP2
920	movdqa	  HashKey_3(%rsp), \TMP5
921	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
922	movaps 0x30(%arg1), \TMP3
923	AESENC    \TMP3, \XMM1              # Round 3
924	AESENC    \TMP3, \XMM2
925	AESENC    \TMP3, \XMM3
926	AESENC    \TMP3, \XMM4
927	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
928	movaps 0x40(%arg1), \TMP3
929	AESENC	  \TMP3, \XMM1              # Round 4
930	AESENC	  \TMP3, \XMM2
931	AESENC	  \TMP3, \XMM3
932	AESENC	  \TMP3, \XMM4
933	movdqa	  HashKey_3_k(%rsp), \TMP5
934	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
935	movaps 0x50(%arg1), \TMP3
936	AESENC	  \TMP3, \XMM1              # Round 5
937	AESENC	  \TMP3, \XMM2
938	AESENC	  \TMP3, \XMM3
939	AESENC	  \TMP3, \XMM4
940	pxor	  \TMP1, \TMP4
941# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
942	pxor	  \XMM6, \XMM5
943	pxor	  \TMP2, \TMP6
944	movdqa	  \XMM7, \TMP1
945	pshufd	  $78, \XMM7, \TMP2
946	pxor	  \XMM7, \TMP2
947	movdqa	  HashKey_2(%rsp ), \TMP5
948
949        # Multiply TMP5 * HashKey using karatsuba
950
951	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
952	movaps 0x60(%arg1), \TMP3
953	AESENC	  \TMP3, \XMM1              # Round 6
954	AESENC	  \TMP3, \XMM2
955	AESENC	  \TMP3, \XMM3
956	AESENC	  \TMP3, \XMM4
957	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
958	movaps 0x70(%arg1), \TMP3
959	AESENC	  \TMP3, \XMM1             # Round 7
960	AESENC	  \TMP3, \XMM2
961	AESENC	  \TMP3, \XMM3
962	AESENC	  \TMP3, \XMM4
963	movdqa	  HashKey_2_k(%rsp), \TMP5
964	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
965	movaps 0x80(%arg1), \TMP3
966	AESENC	  \TMP3, \XMM1             # Round 8
967	AESENC	  \TMP3, \XMM2
968	AESENC	  \TMP3, \XMM3
969	AESENC	  \TMP3, \XMM4
970	pxor	  \TMP1, \TMP4
971# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
972	pxor	  \XMM7, \XMM5
973	pxor	  \TMP2, \TMP6
974
975        # Multiply XMM8 * HashKey
976        # XMM8 and TMP5 hold the values for the two operands
977
978	movdqa	  \XMM8, \TMP1
979	pshufd	  $78, \XMM8, \TMP2
980	pxor	  \XMM8, \TMP2
981	movdqa	  HashKey(%rsp), \TMP5
982	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
983	movaps 0x90(%arg1), \TMP3
984	AESENC	  \TMP3, \XMM1            # Round 9
985	AESENC	  \TMP3, \XMM2
986	AESENC	  \TMP3, \XMM3
987	AESENC	  \TMP3, \XMM4
988	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
989	lea	  0xa0(%arg1),%r10
990	mov	  keysize,%eax
991	shr	  $2,%eax		        # 128->4, 192->6, 256->8
992	sub	  $4,%eax			# 128->0, 192->2, 256->4
993	jz	  aes_loop_par_dec_done
994
995aes_loop_par_dec:
996	MOVADQ	  (%r10),\TMP3
997.irpc	index, 1234
998	AESENC	  \TMP3, %xmm\index
999.endr
1000	add	  $16,%r10
1001	sub	  $1,%eax
1002	jnz	  aes_loop_par_dec
1003
1004aes_loop_par_dec_done:
1005	MOVADQ	  (%r10), \TMP3
1006	AESENCLAST \TMP3, \XMM1           # last round
1007	AESENCLAST \TMP3, \XMM2
1008	AESENCLAST \TMP3, \XMM3
1009	AESENCLAST \TMP3, \XMM4
1010	movdqa    HashKey_k(%rsp), \TMP5
1011	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1012	movdqu	  (%arg3,%r11,1), \TMP3
1013	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1014	movdqu	  \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
1015	movdqa    \TMP3, \XMM1
1016	movdqu	  16(%arg3,%r11,1), \TMP3
1017	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1018	movdqu	  \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
1019	movdqa    \TMP3, \XMM2
1020	movdqu	  32(%arg3,%r11,1), \TMP3
1021	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1022	movdqu	  \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
1023	movdqa    \TMP3, \XMM3
1024	movdqu	  48(%arg3,%r11,1), \TMP3
1025	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1026	movdqu	  \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
1027	movdqa    \TMP3, \XMM4
1028	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1029	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
1030	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
1031	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
1032
1033	pxor	  \TMP4, \TMP1
1034	pxor	  \XMM8, \XMM5
1035	pxor	  \TMP6, \TMP2
1036	pxor	  \TMP1, \TMP2
1037	pxor	  \XMM5, \TMP2
1038	movdqa	  \TMP2, \TMP3
1039	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1040	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1041	pxor	  \TMP3, \XMM5
1042	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1043
1044        # first phase of reduction
1045
1046	movdqa    \XMM5, \TMP2
1047	movdqa    \XMM5, \TMP3
1048	movdqa    \XMM5, \TMP4
1049# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1050	pslld     $31, \TMP2                   # packed right shift << 31
1051	pslld     $30, \TMP3                   # packed right shift << 30
1052	pslld     $25, \TMP4                   # packed right shift << 25
1053	pxor      \TMP3, \TMP2	               # xor the shifted versions
1054	pxor      \TMP4, \TMP2
1055	movdqa    \TMP2, \TMP5
1056	psrldq    $4, \TMP5                    # right shift T5 1 DW
1057	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1058	pxor      \TMP2, \XMM5
1059
1060        # second phase of reduction
1061
1062	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1063	movdqa    \XMM5,\TMP3
1064	movdqa    \XMM5,\TMP4
1065	psrld     $1, \TMP2                    # packed left shift >>1
1066	psrld     $2, \TMP3                    # packed left shift >>2
1067	psrld     $7, \TMP4                    # packed left shift >>7
1068	pxor      \TMP3,\TMP2		       # xor the shifted versions
1069	pxor      \TMP4,\TMP2
1070	pxor      \TMP5, \TMP2
1071	pxor      \TMP2, \XMM5
1072	pxor      \TMP1, \XMM5                 # result is in TMP1
1073
1074	pxor	  \XMM5, \XMM1
1075.endm
1076
1077/*
 * GHASH_LAST_4: GHASH the last 4 ciphertext blocks.
 *
 * XMM1..XMM4 hold the four blocks on entry and are overwritten.  They are
 * multiplied by HashKey_4, HashKey_3, HashKey_2 and HashKey respectively
 * (all read from the stack frame at %rsp), the four products are XORed
 * together and the reduced result is left in XMMDst.
 * TMP1..TMP7 are scratch registers; TMP3 and TMP7 are only used during
 * the reduction.
 *
 * Each multiplication is a Karatsuba carry-less multiply: a1/a0 are the
 * high/low qwords of the block, b1/b0 those of the hash key power.
 * NOTE(review): the HashKey_<n>_k slots are filled in outside this macro;
 * going by the existing comments their low qword holds b1+b0 - confirm.
 */
1078.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1079TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1080
1081        # Multiply XMM1 * HashKey_4 (using Karatsuba)
1082
1083	movdqa	  \XMM1, \TMP6
1084	pshufd	  $78, \XMM1, \TMP2         # TMP2 = XMM1 with its qwords swapped
1085	pxor	  \XMM1, \TMP2              # both qwords of TMP2 = a1+a0
1086	movdqa	  HashKey_4(%rsp), \TMP5
1087	PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1088	PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1089	movdqa	  HashKey_4_k(%rsp), \TMP4
1090	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1091	movdqa	  \XMM1, \XMMDst
1092	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1093
1094        # Multiply XMM2 * HashKey_3 (using Karatsuba)
1095
1096	movdqa	  \XMM2, \TMP1
1097	pshufd	  $78, \XMM2, \TMP2
1098	pxor	  \XMM2, \TMP2
1099	movdqa	  HashKey_3(%rsp), \TMP5
1100	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1101	PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1102	movdqa	  HashKey_3_k(%rsp), \TMP4
1103	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1104	pxor	  \TMP1, \TMP6
1105	pxor	  \XMM2, \XMMDst
1106	pxor	  \TMP2, \XMM1
1107# results accumulated in TMP6 (high), XMMDst (low), XMM1 (middle)
1108
1109        # Multiply XMM3 * HashKey_2 (using Karatsuba)
1110
1111	movdqa	  \XMM3, \TMP1
1112	pshufd	  $78, \XMM3, \TMP2
1113	pxor	  \XMM3, \TMP2
1114	movdqa	  HashKey_2(%rsp), \TMP5
1115	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1116	PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1117	movdqa	  HashKey_2_k(%rsp), \TMP4
1118	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1119	pxor	  \TMP1, \TMP6
1120	pxor	  \XMM3, \XMMDst
1121	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1122
1123        # Multiply XMM4 * HashKey (using Karatsuba)
1124	movdqa	  \XMM4, \TMP1
1125	pshufd	  $78, \XMM4, \TMP2
1126	pxor	  \XMM4, \TMP2
1127	movdqa	  HashKey(%rsp), \TMP5
1128	PCLMULQDQ 0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
1129	PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1130	movdqa	  HashKey_k(%rsp), \TMP4
1131	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1132	pxor	  \TMP1, \TMP6
1133	pxor	  \XMM4, \XMMDst
1134	pxor	  \XMM1, \TMP2              # TMP2 = sum of all four middle products
1135	pxor	  \TMP6, \TMP2              # ... plus the accumulated high part
1136	pxor	  \XMMDst, \TMP2            # ... plus the accumulated low part
1137	# middle section of the temp results combined as in karatsuba algorithm
1138	movdqa	  \TMP2, \TMP4
1139	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
1140	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
1141	pxor	  \TMP4, \XMMDst
1142	pxor	  \TMP2, \TMP6
1143# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1144	# first phase of the reduction
1145	movdqa    \XMMDst, \TMP2
1146	movdqa    \XMMDst, \TMP3
1147	movdqa    \XMMDst, \TMP4
1148# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1149	pslld     $31, \TMP2                # packed (per dword) left shift by 31
1150	pslld     $30, \TMP3                # packed (per dword) left shift by 30
1151	pslld     $25, \TMP4                # packed (per dword) left shift by 25
1152	pxor      \TMP3, \TMP2              # xor the shifted versions
1153	pxor      \TMP4, \TMP2
1154	movdqa    \TMP2, \TMP7              # keep a copy for the second phase
1155	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1156	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1157	pxor      \TMP2, \XMMDst
1158
1159        # second phase of the reduction
1160	movdqa    \XMMDst, \TMP2
1161	# make 3 copies of XMMDst for doing 3 shift operations
1162	movdqa    \XMMDst, \TMP3
1163	movdqa    \XMMDst, \TMP4
1164	psrld     $1, \TMP2                 # packed (per dword) right shift by 1
1165	psrld     $2, \TMP3                 # packed (per dword) right shift by 2
1166	psrld     $7, \TMP4                 # packed (per dword) right shift by 7
1167	pxor      \TMP3, \TMP2              # xor the shifted versions
1168	pxor      \TMP4, \TMP2
1169	pxor      \TMP7, \TMP2              # fold in the bits saved in phase one
1170	pxor      \TMP2, \XMMDst
1171	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1172.endm
1173
1174
1175/* Encryption of a single block
1176* uses eax & r10
 *
 * XMM0: the block to encrypt; replaced by the encrypted block
 * TMP1: scratch register that receives each round key
 *
 * The key schedule is read from (%arg1): round key 0 is XORed in first,
 * then 9, 11 or 13 AESENC rounds follow (the count is derived from
 * keysize, which per the comments below is 16, 24 or 32), and the next
 * round key is used for AESENCLAST.
 * Flags are clobbered as well.  The loop label uses the assembler macro
 * counter, so the macro can be expanded several times in one file.
1177*/
1178
1179.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1180
1181	pxor		(%arg1), \XMM0		# initial whitening with round key 0
1182	mov		keysize,%eax
1183	shr		$2,%eax			# 128->4, 192->6, 256->8
1184	add		$5,%eax			# 128->9, 192->11, 256->13
1185	lea		16(%arg1), %r10	  # get first expanded key address
1186
1187_esb_loop_\@:
1188	MOVADQ		(%r10),\TMP1		# next round key
1189	AESENC		\TMP1,\XMM0
1190	add		$16,%r10
1191	sub		$1,%eax			# one round done
1192	jnz		_esb_loop_\@
1193
1194	MOVADQ		(%r10),\TMP1		# final round key
1195	AESENCLAST	\TMP1,\XMM0
1196.endm
1197/*****************************************************************************
1198* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1199*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
1200*                   const u8 *in,      // Ciphertext input
1201*                   u64 plaintext_len, // Length of data in bytes for decryption.
1202*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1203*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1204*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1205*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1206*                   const u8 *aad,     // Additional Authentication Data (AAD)
1207*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1208*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1209*                                      // given authentication tag and only return the plaintext if they match.
1210*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1211*                                      // (most likely), 12 or 8.
1212*
1213* Assumptions:
1214*
1215* keys:
1216*       keys are pre-expanded and aligned to 16 bytes. we are using the first
1217*       set of 11, 13 or 15 round keys (128/192/256-bit key) in void *aes_ctx
1218*
1219* iv:
1220*       0                   1                   2                   3
1221*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1222*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1223*       |                             Salt  (From the SA)               |
1224*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1225*       |                     Initialization Vector                     |
1226*       |         (This is the sequence number from IPSec header)       |
1227*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1228*       |                              0x1                              |
1229*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1230*
1231*
1232*
1233* AAD:
1234*       AAD padded to 128 bits with 0
1235*       for example, assume AAD is a u32 vector
1236*
1237*       if AAD is 8 bytes:
1238*       AAD[3] = {A0, A1};
1239*       padded AAD in xmm register = {A1 A0 0 0}
1240*
1241*       0                   1                   2                   3
1242*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1243*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1244*       |                               SPI (A1)                        |
1245*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1246*       |                     32-bit Sequence Number (A0)               |
1247*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1248*       |                              0x0                              |
1249*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1250*
1251*                                       AAD Format with 32-bit Sequence Number
1252*
1253*       if AAD is 12 bytes:
1254*       AAD[3] = {A0, A1, A2};
1255*       padded AAD in xmm register = {A2 A1 A0 0}
1256*
1257*       0                   1                   2                   3
1258*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1261*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1262*       |                               SPI (A2)                        |
1263*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1264*       |                 64-bit Extended Sequence Number {A1,A0}       |
1265*       |                                                               |
1266*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1267*       |                              0x0                              |
1268*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1269*
1270*                        AAD Format with 64-bit Extended Sequence Number
1271*
1272* aadLen:
1273*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
1274*       The code supports 16 too but for other sizes, the code will fail.
1275*
1276* TLen:
1277*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1278*       For other sizes, the code will fail.
1279*
1280* poly = x^128 + x^127 + x^126 + x^121 + 1
1281*
1282*****************************************************************************/
ENTRY(aesni_gcm_dec)
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14
/*
* states of %xmm registers %xmm6:%xmm15 not saved
* all %xmm registers are clobbered
*
* General purpose register usage in this function:
*   %r14       caller stack pointer; %rsp is realigned below and restored
*              from %r14 before returning
*   %r13       number of bytes in whole 16 byte blocks still to process,
*              later plaintext_len mod 16
*   %r12       scratch
*   %r11       byte offset of the next block in the in/out buffers
*              NOTE(review): initialised inside INITIAL_BLOCKS_DEC, which is
*              not part of this hunk - presumably starts at 0, confirm
*   %rax, %r10 scratch (ENCRYPT_SINGLE_BLOCK clobbers both)
*/
	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp                        # align rsp to 64 bytes
	mov	%arg6, %r12
	movdqu	(%r12), %xmm13			  # %xmm13 = HashKey
        movdqa  SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13


# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13
	psrlq	$63, %xmm2
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2
	psrldq	$8, %xmm1
	por	%xmm2, %xmm13

        # Reduction

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2
	pxor	%xmm2, %xmm13     # %xmm13 holds the HashKey<<1 (mod poly)


        # Decrypt first few blocks

	movdqa %xmm13, HashKey(%rsp)           # store HashKey<<1 (mod poly)
	mov %arg4, %r13    # save the number of bytes of plaintext/ciphertext
	and $-16, %r13                      # %r13 = %r13 - (%r13 mod 16)
	mov %r13, %r12
	and $(3<<4), %r12                   # %r12 = 16 * (number of blocks mod 4)
	jz _initial_num_blocks_is_0_decrypt
	cmp $(2<<4), %r12
	jb _initial_num_blocks_is_1_decrypt
	je _initial_num_blocks_is_2_decrypt
_initial_num_blocks_is_3_decrypt:
	INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
	sub	$48, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_2_decrypt:
	INITIAL_BLOCKS_DEC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
	sub	$32, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_1_decrypt:
	INITIAL_BLOCKS_DEC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
	sub	$16, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_0_decrypt:
	INITIAL_BLOCKS_DEC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
_initial_blocks_decrypted:
	cmp	$0, %r13
	je	_zero_cipher_left_decrypt
	sub	$64, %r13
	je	_four_cipher_left_decrypt
_decrypt_by_4:
	GHASH_4_ENCRYPT_4_PARALLEL_DEC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
	add	$64, %r11
	sub	$64, %r13
	jne	_decrypt_by_4
_four_cipher_left_decrypt:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_decrypt:
	mov	%arg4, %r13
	and	$15, %r13				# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_decrypt

        # Handle the last <16 byte block separately

	paddd ONE(%rip), %xmm0         # increment CNT to get Yn
        movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK  %xmm0, %xmm1    # E(K, Yn)

	/*
	 * Read exactly %r13 (1..15) bytes of the last block into the low
	 * bytes of %xmm1, leaving the upper bytes zero.  A single 16 byte
	 * load ending at the end of the input would begin before the start
	 * of the input buffer whenever the whole message is shorter than
	 * 16 bytes, so use one 8 byte load (when possible) plus a byte loop.
	 */
	lea	(%arg3,%r11,1), %r10	# %r10 = address of the last block
	mov	%r13, %r12		# %r12 = bytes still to be read
	cmp	$8, %r12
	jb	_read_lt8_decrypt
	mov	(%r10), %rax		# bytes 0..7 with one load
	MOVQ_R64_XMM	%rax, %xmm1
	sub	$8, %r12
	jz	_read_partial_done_decrypt
	xor	%eax, %eax
_read_next_byte_decrypt:		# bytes 8..%r13-1, highest first
	shl	$8, %rax
	mov	7(%r10,%r12,1), %al
	sub	$1, %r12
	jnz	_read_next_byte_decrypt
	MOVQ_R64_XMM	%rax, %xmm2
	pslldq	$8, %xmm2		# place them in the upper qword
	por	%xmm2, %xmm1
	jmp	_read_partial_done_decrypt
_read_lt8_decrypt:
	xor	%eax, %eax
_read_next_byte_lt8_decrypt:		# bytes 0..%r13-1, highest first
	shl	$8, %rax
	mov	-1(%r10,%r12,1), %al
	sub	$1, %r12
	jnz	_read_next_byte_lt8_decrypt
	MOVQ_R64_XMM	%rax, %xmm1
_read_partial_done_decrypt:
	lea ALL_F+16(%rip), %r12
	sub %r13, %r12
# %r12 = ALL_F + 16 - %r13, the same mask address the 16 byte load variant used
# (%r13 is the number of bytes in plaintext mod 16)

	movdqa  %xmm1, %xmm2           # keep the ciphertext for GHASH
	pxor %xmm1, %xmm0            # Ciphertext XOR E(K, Yn)
	movdqu (%r12), %xmm1
	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
	pand %xmm1, %xmm0            # mask out top 16-%r13 bytes of %xmm0
	pand    %xmm1, %xmm2
        movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10 ,%xmm2

	pxor %xmm2, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	          # GHASH computation for the last <16 byte block

        # output %r13 bytes
	MOVQ_R64_XMM	%xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_decrypt
	mov	%rax, (%arg2 , %r11, 1)
	add	$8, %r11
	psrldq	$8, %xmm0
	MOVQ_R64_XMM	%xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_decrypt:
	mov	%al,  (%arg2, %r11, 1)
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_decrypt
_multiple_of_16_bytes_decrypt:
	mov	arg8, %r12		  # %r12 = aadLen (number of bytes)
	shl	$3, %r12		  # convert into number of bits
	movd	%r12d, %xmm15		  # len(A) in %xmm15
	shl	$3, %arg4		  # len(C) in bits (bytes * 8)
	MOVQ_R64_XMM	%arg4, %xmm1
	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	         # final GHASH computation
        movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

	mov	%arg5, %rax		  # %rax = *Y0
	movdqu	(%rax), %xmm0		  # %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_decrypt:
	mov	arg9, %r10                # %r10 = authTag
	mov	arg10, %r11               # %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_decrypt
	cmp	$12, %r11
	je	_T_12_decrypt
_T_8_decrypt:				  # any other length writes 8 bytes
	MOVQ_R64_XMM	%xmm0, %rax
	mov	%rax, (%r10)
	jmp	_return_T_done_decrypt
_T_12_decrypt:
	MOVQ_R64_XMM	%xmm0, %rax
	mov	%rax, (%r10)
	psrldq	$8, %xmm0
	movd	%xmm0, %eax
	mov	%eax, 8(%r10)
	jmp	_return_T_done_decrypt
_T_16_decrypt:
	movdqu	%xmm0, (%r10)
_return_T_done_decrypt:
	mov	%r14, %rsp
	pop	%r14
	pop	%r13
	pop	%r12
	ret
ENDPROC(aesni_gcm_dec)
1458
1459
1460/*****************************************************************************
1461* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1462*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1463*                    const u8 *in,       // Plaintext input
1464*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1465*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1466*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1467*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1468*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1469*                    const u8 *aad,      // Additional Authentication Data (AAD)
1470*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1471*                    u8 *auth_tag,       // Authenticated Tag output.
1472*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1473*                                        // 12 or 8.
1474*
1475* Assumptions:
1476*
1477* keys:
1478*       keys are pre-expanded and aligned to 16 bytes. we are using the
1479*       first set of 11, 13 or 15 round keys (128/192/256-bit key) in void *aes_ctx
1480*
1481*
1482* iv:
1483*       0                   1                   2                   3
1484*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1485*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1486*       |                             Salt  (From the SA)               |
1487*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1488*       |                     Initialization Vector                     |
1489*       |         (This is the sequence number from IPSec header)       |
1490*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1491*       |                              0x1                              |
1492*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1493*
1494*
1495*
1496* AAD:
1497*       AAD padded to 128 bits with 0
1498*       for example, assume AAD is a u32 vector
1499*
1500*       if AAD is 8 bytes:
1501*       AAD[3] = {A0, A1};
1502*       padded AAD in xmm register = {A1 A0 0 0}
1503*
1504*       0                   1                   2                   3
1505*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1506*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1507*       |                               SPI (A1)                        |
1508*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1509*       |                     32-bit Sequence Number (A0)               |
1510*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1511*       |                              0x0                              |
1512*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1513*
1514*                                 AAD Format with 32-bit Sequence Number
1515*
1516*       if AAD is 12 bytes:
1517*       AAD[3] = {A0, A1, A2};
1518*       padded AAD in xmm register = {A2 A1 A0 0}
1519*
1520*       0                   1                   2                   3
1521*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1522*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1523*       |                               SPI (A2)                        |
1524*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1525*       |                 64-bit Extended Sequence Number {A1,A0}       |
1526*       |                                                               |
1527*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1528*       |                              0x0                              |
1529*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1530*
1531*                         AAD Format with 64-bit Extended Sequence Number
1532*
1533* aadLen:
1534*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
1535*       The code supports 16 too but for other sizes, the code will fail.
1536*
1537* TLen:
1538*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1539*       For other sizes, the code will fail.
1540*
1541* poly = x^128 + x^127 + x^126 + x^121 + 1
1542***************************************************************************/
ENTRY(aesni_gcm_enc)
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14
/*
* states of %xmm registers %xmm6:%xmm15 not saved
* all %xmm registers are clobbered
*
* General purpose register usage in this function:
*   %r14       caller stack pointer; %rsp is realigned below and restored
*              from %r14 before returning
*   %r13       number of bytes in whole 16 byte blocks still to process,
*              later plaintext_len mod 16
*   %r12       scratch
*   %r11       byte offset of the next block in the in/out buffers
*              NOTE(review): initialised inside INITIAL_BLOCKS_ENC, which is
*              not part of this hunk - presumably starts at 0, confirm
*   %rax, %r10 scratch (ENCRYPT_SINGLE_BLOCK clobbers both)
*/
	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp                        # align rsp to 64 bytes
	mov	%arg6, %r12
	movdqu	(%r12), %xmm13			  # %xmm13 = HashKey
        movdqa  SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13


# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13
	psrlq	$63, %xmm2
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2
	psrldq	$8, %xmm1
	por	%xmm2, %xmm13

        # reduce HashKey<<1

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2
	pxor	%xmm2, %xmm13          # %xmm13 holds HashKey<<1 (mod poly)
	movdqa	%xmm13, HashKey(%rsp)  # store HashKey<<1 (mod poly)
	mov	%arg4, %r13            # number of bytes of plaintext
	and	$-16, %r13             # %r13 = %r13 - (%r13 mod 16)
	mov	%r13, %r12

        # Encrypt first few blocks

	and	$(3<<4), %r12          # %r12 = 16 * (number of blocks mod 4)
	jz	_initial_num_blocks_is_0_encrypt
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_encrypt
	je	_initial_num_blocks_is_2_encrypt
_initial_num_blocks_is_3_encrypt:
	INITIAL_BLOCKS_ENC	3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
	sub	$48, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_2_encrypt:
	INITIAL_BLOCKS_ENC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
	sub	$32, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_1_encrypt:
	INITIAL_BLOCKS_ENC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
	sub	$16, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_0_encrypt:
	INITIAL_BLOCKS_ENC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
_initial_blocks_encrypted:

        # Main loop - Encrypt remaining blocks

	cmp	$0, %r13
	je	_zero_cipher_left_encrypt
	sub	$64, %r13
	je	_four_cipher_left_encrypt
_encrypt_by_4_encrypt:
	GHASH_4_ENCRYPT_4_PARALLEL_ENC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_encrypt_by_4_encrypt
_four_cipher_left_encrypt:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_encrypt:
	mov	%arg4, %r13
	and	$15, %r13			# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_encrypt

         # Handle the last <16 Byte block separately
	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
        movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0


	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)

	/*
	 * Read exactly %r13 (1..15) bytes of the last block into the low
	 * bytes of %xmm1, leaving the upper bytes zero.  A single 16 byte
	 * load ending at the end of the input would begin before the start
	 * of the input buffer whenever the whole message is shorter than
	 * 16 bytes, so use one 8 byte load (when possible) plus a byte loop.
	 */
	lea	(%arg3,%r11,1), %r10	# %r10 = address of the last block
	mov	%r13, %r12		# %r12 = bytes still to be read
	cmp	$8, %r12
	jb	_read_lt8_encrypt
	mov	(%r10), %rax		# bytes 0..7 with one load
	MOVQ_R64_XMM	%rax, %xmm1
	sub	$8, %r12
	jz	_read_partial_done_encrypt
	xor	%eax, %eax
_read_next_byte_encrypt:		# bytes 8..%r13-1, highest first
	shl	$8, %rax
	mov	7(%r10,%r12,1), %al
	sub	$1, %r12
	jnz	_read_next_byte_encrypt
	MOVQ_R64_XMM	%rax, %xmm2
	pslldq	$8, %xmm2		# place them in the upper qword
	por	%xmm2, %xmm1
	jmp	_read_partial_done_encrypt
_read_lt8_encrypt:
	xor	%eax, %eax
_read_next_byte_lt8_encrypt:		# bytes 0..%r13-1, highest first
	shl	$8, %rax
	mov	-1(%r10,%r12,1), %al
	sub	$1, %r12
	jnz	_read_next_byte_lt8_encrypt
	MOVQ_R64_XMM	%rax, %xmm1
_read_partial_done_encrypt:
	lea ALL_F+16(%rip), %r12
	sub %r13, %r12
	# %r12 = ALL_F + 16 - %r13, the same mask address the 16 byte load
	# variant used (%r13 is the number of bytes in plaintext mod 16)
	pxor	%xmm1, %xmm0            # Plaintext XOR Encrypt(K, Yn)
	movdqu	(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
        movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10,%xmm0

	pxor	%xmm0, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# GHASH computation for the last <16 byte block

	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	# shuffle xmm0 back to output as ciphertext

        # Output %r13 bytes
	MOVQ_R64_XMM %xmm0, %rax
	cmp $8, %r13
	jle _less_than_8_bytes_left_encrypt
	mov %rax, (%arg2 , %r11, 1)
	add $8, %r11
	psrldq $8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub $8, %r13
_less_than_8_bytes_left_encrypt:
	mov %al,  (%arg2, %r11, 1)
	add $1, %r11
	shr $8, %rax
	sub $1, %r13
	jne _less_than_8_bytes_left_encrypt
_multiple_of_16_bytes_encrypt:
	mov	arg8, %r12    # %r12 = aadLen (number of bytes)
	shl	$3, %r12            # convert into number of bits
	movd	%r12d, %xmm15       # len(A) in %xmm15
	shl	$3, %arg4               # len(C) in bits (bytes * 8)
	MOVQ_R64_XMM	%arg4, %xmm1
	pslldq	$8, %xmm15          # %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15       # %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
        movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8         # perform a 16 byte swap

	mov	%arg5, %rax		       # %rax  = *Y0
	movdqu	(%rax), %xmm0		       # %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm15         # Encrypt(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_encrypt:
	mov	arg9, %r10                     # %r10 = authTag
	mov	arg10, %r11                    # %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_encrypt
	cmp	$12, %r11
	je	_T_12_encrypt
_T_8_encrypt:				       # any other length writes 8 bytes
	MOVQ_R64_XMM	%xmm0, %rax
	mov	%rax, (%r10)
	jmp	_return_T_done_encrypt
_T_12_encrypt:
	MOVQ_R64_XMM	%xmm0, %rax
	mov	%rax, (%r10)
	psrldq	$8, %xmm0
	movd	%xmm0, %eax
	mov	%eax, 8(%r10)
	jmp	_return_T_done_encrypt
_T_16_encrypt:
	movdqu	%xmm0, (%r10)
_return_T_done_encrypt:
	mov	%r14, %rsp
	pop	%r14
	pop	%r13
	pop	%r12
	ret
ENDPROC(aesni_gcm_enc)
1722
1723#endif
1724
1725
1726.align 4
/*
 * _key_expansion_128 / _key_expansion_256a (two labels, one body)
 *
 * In:   %xmm0 = round key to expand from
 *       %xmm1 = key generation assist result; only dword 3 is used
 *               (aesni_set_key runs AESKEYGENASSIST right before the call)
 *       %xmm4 = low dword must be zero (aesni_set_key clears %xmm4)
 *       TKEYP = where to store the new round key; the store is movaps, so
 *               the address has to be 16 byte aligned
 * Out:  %xmm0 = new round key, also written to (TKEYP); TKEYP += 16
 * Clobbers %xmm1, %xmm4 (its low dword stays zero) and the flags.
 */
1727_key_expansion_128:
1728_key_expansion_256a:
1729	pshufd $0b11111111, %xmm1, %xmm1	# broadcast dword 3 of %xmm1
1730	shufps $0b00010000, %xmm0, %xmm4	# %xmm4 = {0, 0, k1, k0} (dwords 0..3)
1731	pxor %xmm4, %xmm0
1732	shufps $0b10001100, %xmm0, %xmm4	# %xmm4 = {0, old dword 3, new k0, new k2}
1733	pxor %xmm4, %xmm0		# dword i of %xmm0 is now k0 ^ ... ^ ki
1734	pxor %xmm1, %xmm0		# mix in the broadcast assist dword
1735	movaps %xmm0, (TKEYP)
1736	add $0x10, TKEYP
1737	ret
1738ENDPROC(_key_expansion_128)
1739ENDPROC(_key_expansion_256a)
1740
1741.align 4
/*
 * _key_expansion_192a
 *
 * In:   %xmm0 = four key dwords to expand from
 *       %xmm2 = two further key dwords in its low qword
 *       %xmm1 = key generation assist result; only dword 1 is used
 *       %xmm4 = low dword must be zero
 *       TKEYP = 16 byte aligned store address
 * Out:  %xmm0 and %xmm2 are updated to the next key material and 32 bytes
 *       are written: (TKEYP) = {old %xmm2 low qword, new %xmm0 low qword},
 *       0x10(TKEYP) = {new %xmm0 high qword, new %xmm2 low qword};
 *       TKEYP += 0x20
 * Clobbers %xmm1, %xmm3, %xmm4, %xmm5, %xmm6 and the flags.
 */
1742_key_expansion_192a:
1743	pshufd $0b01010101, %xmm1, %xmm1	# broadcast dword 1 of %xmm1
1744	shufps $0b00010000, %xmm0, %xmm4
1745	pxor %xmm4, %xmm0
1746	shufps $0b10001100, %xmm0, %xmm4
1747	pxor %xmm4, %xmm0		# dword i of %xmm0 = k0 ^ ... ^ ki
1748	pxor %xmm1, %xmm0		# mix in the broadcast assist dword
1749
1750	movaps %xmm2, %xmm5
1751	movaps %xmm2, %xmm6		# %xmm6 keeps the old %xmm2 for the store
1752	pslldq $4, %xmm5		# shift the copy up by one dword
1753	pshufd $0b11111111, %xmm0, %xmm3	# broadcast dword 3 of the new %xmm0
1754	pxor %xmm3, %xmm2
1755	pxor %xmm5, %xmm2
1756
1757	movaps %xmm0, %xmm1
1758	shufps $0b01000100, %xmm0, %xmm6	# {%xmm6 low qword, %xmm0 low qword}
1759	movaps %xmm6, (TKEYP)
1760	shufps $0b01001110, %xmm2, %xmm1	# {%xmm0 high qword, %xmm2 low qword}
1761	movaps %xmm1, 0x10(TKEYP)
1762	add $0x20, TKEYP
1763	ret
1764ENDPROC(_key_expansion_192a)
1765
1766.align 4
/*
 * _key_expansion_192b:	helper of aesni_set_key (192-bit keys)
 *	Same state update as _key_expansion_192a, but only the new %xmm0 is
 *	stored (16 bytes); the new %xmm2 stays in the register for the next
 *	_key_expansion_192a call.
 * input:
 *	%xmm0, %xmm2:	current key state
 *	%xmm1:		AESKEYGENASSIST result prepared by the caller
 *	%xmm4:		dword 0 must be zero
 *	TKEYP:		address where the new %xmm0 is stored
 * output:
 *	%xmm0, %xmm2:	updated key state
 *	TKEYP:		advanced by 16 bytes
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5
 */
1767_key_expansion_192b:
1768	pshufd $0b01010101, %xmm1, %xmm1	# broadcast dword 1 of the assist value
1769	shufps $0b00010000, %xmm0, %xmm4	# running XOR of the dwords of %xmm0
1770	pxor %xmm4, %xmm0
1771	shufps $0b10001100, %xmm0, %xmm4
1772	pxor %xmm4, %xmm0
1773	pxor %xmm1, %xmm0
1774
1775	movaps %xmm2, %xmm5
1776	pslldq $4, %xmm5			# old %xmm2 shifted up by one dword
1777	pshufd $0b11111111, %xmm0, %xmm3	# broadcast dword 3 of the new %xmm0
1778	pxor %xmm3, %xmm2
1779	pxor %xmm5, %xmm2			# new %xmm2 = old ^ (old << 32) ^ broadcast
1780
1781	movaps %xmm0, (TKEYP)
1782	add $0x10, TKEYP
1783	ret
1784ENDPROC(_key_expansion_192b)
1785
1786.align 4
/*
 * _key_expansion_256b:	helper of aesni_set_key (256-bit keys)
 *	Counterpart of _key_expansion_256a that extends the second key half
 *	kept in %xmm2 and appends it at TKEYP.
 * input:
 *	%xmm2:	round key being extended
 *	%xmm1:	AESKEYGENASSIST result prepared by the caller
 *	%xmm4:	dword 0 must be zero
 *	TKEYP:	address where the new round key is stored
 * output:
 *	%xmm2:	new round key, also written to (TKEYP)
 *	TKEYP:	advanced by 16 bytes
 * changed:
 *	%xmm1, %xmm4
 */
1787_key_expansion_256b:
1788	pshufd $0b10101010, %xmm1, %xmm1	# broadcast dword 2 of the assist value
1789	shufps $0b00010000, %xmm2, %xmm4	# running XOR of the dwords of %xmm2
1790	pxor %xmm4, %xmm2
1791	shufps $0b10001100, %xmm2, %xmm4
1792	pxor %xmm4, %xmm2
1793	pxor %xmm1, %xmm2
1794	movaps %xmm2, (TKEYP)
1795	add $0x10, TKEYP
1796	ret
1797ENDPROC(_key_expansion_256b)
1798
1799/*
1800 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1801 *                   unsigned int key_len)
 *
 * Expands the user key into the encryption key schedule starting at ctx + 0,
 * stores key_len at ctx + 480 and derives the decryption key schedule
 * starting at ctx + 240.  AREG is zeroed before returning.
 * On 32-bit builds the arguments are read from the stack; otherwise they are
 * expected in KEYP, UKEYP and %edx.
1802 */
1803ENTRY(aesni_set_key)
1804	FRAME_BEGIN
1805#ifndef __x86_64__
1806	pushl KEYP
1807	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
1808	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
1809	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
1810#endif
1811	movups (UKEYP), %xmm0		# user key (first 16 bytes)
1812	movaps %xmm0, (KEYP)
1813	lea 0x10(KEYP), TKEYP		# key addr
1814	movl %edx, 480(KEYP)
1815	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
1816	cmp $24, %dl			# only the low byte of key_len is compared
1817	jb .Lenc_key128			# below 24: 128-bit schedule
1818	je .Lenc_key192			# exactly 24: 192-bit schedule
	/* anything above 24 falls through to the 256-bit schedule */
1819	movups 0x10(UKEYP), %xmm2	# other user key
1820	movaps %xmm2, (TKEYP)
1821	add $0x10, TKEYP
1822	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
1823	call _key_expansion_256a
1824	AESKEYGENASSIST 0x1 %xmm0 %xmm1
1825	call _key_expansion_256b
1826	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
1827	call _key_expansion_256a
1828	AESKEYGENASSIST 0x2 %xmm0 %xmm1
1829	call _key_expansion_256b
1830	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
1831	call _key_expansion_256a
1832	AESKEYGENASSIST 0x4 %xmm0 %xmm1
1833	call _key_expansion_256b
1834	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
1835	call _key_expansion_256a
1836	AESKEYGENASSIST 0x8 %xmm0 %xmm1
1837	call _key_expansion_256b
1838	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
1839	call _key_expansion_256a
1840	AESKEYGENASSIST 0x10 %xmm0 %xmm1
1841	call _key_expansion_256b
1842	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
1843	call _key_expansion_256a
1844	AESKEYGENASSIST 0x20 %xmm0 %xmm1
1845	call _key_expansion_256b
1846	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
1847	call _key_expansion_256a
1848	jmp .Ldec_key
1849.Lenc_key192:
1850	movq 0x10(UKEYP), %xmm2		# other user key
1851	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
1852	call _key_expansion_192a
1853	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
1854	call _key_expansion_192b
1855	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
1856	call _key_expansion_192a
1857	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
1858	call _key_expansion_192b
1859	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
1860	call _key_expansion_192a
1861	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
1862	call _key_expansion_192b
1863	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
1864	call _key_expansion_192a
1865	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
1866	call _key_expansion_192b
1867	jmp .Ldec_key
1868.Lenc_key128:
1869	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
1870	call _key_expansion_128
1871	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
1872	call _key_expansion_128
1873	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
1874	call _key_expansion_128
1875	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
1876	call _key_expansion_128
1877	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
1878	call _key_expansion_128
1879	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
1880	call _key_expansion_128
1881	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
1882	call _key_expansion_128
1883	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
1884	call _key_expansion_128
1885	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
1886	call _key_expansion_128
1887	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
1888	call _key_expansion_128
	/*
	 * Build the decryption schedule at ctx + 240: the encryption round
	 * keys in reverse order, with AESIMC applied to every key except the
	 * first and the last one.
	 */
1889.Ldec_key:
1890	sub $0x10, TKEYP		# TKEYP -> last encryption round key
1891	movaps (KEYP), %xmm0		# first encryption round key
1892	movaps (TKEYP), %xmm1		# last encryption round key
1893	movaps %xmm0, 240(TKEYP)	# ... becomes the last decryption key
1894	movaps %xmm1, 240(KEYP)		# ... becomes the first decryption key
1895	add $0x10, KEYP			# KEYP walks up through the enc keys
1896	lea 240-16(TKEYP), UKEYP	# UKEYP (reused as scratch) walks down the dec slots
1897.align 4
1898.Ldec_key_loop:
1899	movaps (KEYP), %xmm0
1900	AESIMC %xmm0 %xmm1
1901	movaps %xmm1, (UKEYP)
1902	add $0x10, KEYP
1903	sub $0x10, UKEYP
1904	cmp TKEYP, KEYP
1905	jb .Ldec_key_loop		# stop once KEYP reaches the last enc key
1906	xor AREG, AREG			# zero AREG (presumably the int return value)
1907#ifndef __x86_64__
1908	popl KEYP
1909#endif
1910	FRAME_END
1911	ret
1912ENDPROC(aesni_set_key)
1913
1914/*
1915 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Encrypts the single 16-byte block at src with the encryption key schedule
 * at the start of ctx and writes the result to dst.  Neither buffer needs to
 * be aligned (movups is used for both).
1916 */
1917ENTRY(aesni_enc)
1918	FRAME_BEGIN
1919#ifndef __x86_64__
1920	pushl KEYP
1921	pushl KLEN
1922	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
1923	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
1924	movl (FRAME_OFFSET+20)(%esp), INP	# src
1925#endif
1926	movl 480(KEYP), KLEN		# key length
1927	movups (INP), STATE		# input
1928	call _aesni_enc1
1929	movups STATE, (OUTP)		# output
1930#ifndef __x86_64__
1931	popl KLEN
1932	popl KEYP
1933#endif
1934	FRAME_END
1935	ret
1936ENDPROC(aesni_enc)
1937
1938/*
1939 * _aesni_enc1:		internal ABI
 *	Encrypt one block.  The number of rounds is selected by comparing
 *	KLEN with 24: below takes the 128-bit path, equal the 192-bit path,
 *	above the 256-bit path; the longer paths fall through into the
 *	shorter ones.
1940 * input:
1941 *	KEYP:		key struct pointer
1942 *	KLEN:		key length
1943 *	STATE:		initial state (input)
1944 * output:
1945 *	STATE:		final state (output)
1946 * changed:
1947 *	KEY
1948 *	TKEYP (T1)
1949 */
1950.align 4
1951_aesni_enc1:
1952	movaps (KEYP), KEY		# key
1953	mov KEYP, TKEYP
1954	pxor KEY, STATE		# round 0
1955	add $0x30, TKEYP
1956	cmp $24, KLEN
1957	jb .Lenc128
1958	lea 0x20(TKEYP), TKEYP		# lea keeps the flags of the cmp for the je
1959	je .Lenc192
1960	add $0x20, TKEYP
1961	movaps -0x60(TKEYP), KEY
1962	AESENC KEY STATE
1963	movaps -0x50(TKEYP), KEY
1964	AESENC KEY STATE
1965.align 4
1966.Lenc192:
1967	movaps -0x40(TKEYP), KEY
1968	AESENC KEY STATE
1969	movaps -0x30(TKEYP), KEY
1970	AESENC KEY STATE
1971.align 4
1972.Lenc128:
1973	movaps -0x20(TKEYP), KEY
1974	AESENC KEY STATE
1975	movaps -0x10(TKEYP), KEY
1976	AESENC KEY STATE
1977	movaps (TKEYP), KEY
1978	AESENC KEY STATE
1979	movaps 0x10(TKEYP), KEY
1980	AESENC KEY STATE
1981	movaps 0x20(TKEYP), KEY
1982	AESENC KEY STATE
1983	movaps 0x30(TKEYP), KEY
1984	AESENC KEY STATE
1985	movaps 0x40(TKEYP), KEY
1986	AESENC KEY STATE
1987	movaps 0x50(TKEYP), KEY
1988	AESENC KEY STATE
1989	movaps 0x60(TKEYP), KEY
1990	AESENC KEY STATE
1991	movaps 0x70(TKEYP), KEY
1992	AESENCLAST KEY STATE		# last round
1993	ret
1994ENDPROC(_aesni_enc1)
1995
1996/*
1997 * _aesni_enc4:	internal ABI
 *	Encrypt four independent blocks with the same key schedule.  Each
 *	round key is loaded once and applied to all four states.  The round
 *	count is selected from KLEN exactly as in _aesni_enc1.
1998 * input:
1999 *	KEYP:		key struct pointer
2000 *	KLEN:		key length
2001 *	STATE1:		initial state (input)
2002 *	STATE2
2003 *	STATE3
2004 *	STATE4
2005 * output:
2006 *	STATE1:		final state (output)
2007 *	STATE2
2008 *	STATE3
2009 *	STATE4
2010 * changed:
2011 *	KEY
2012 *	TKEYP (T1)
2013 */
2014.align 4
2015_aesni_enc4:
2016	movaps (KEYP), KEY		# key
2017	mov KEYP, TKEYP
2018	pxor KEY, STATE1		# round 0
2019	pxor KEY, STATE2
2020	pxor KEY, STATE3
2021	pxor KEY, STATE4
2022	add $0x30, TKEYP
2023	cmp $24, KLEN
2024	jb .L4enc128
2025	lea 0x20(TKEYP), TKEYP		# lea keeps the flags of the cmp for the je
2026	je .L4enc192
2027	add $0x20, TKEYP
2028	movaps -0x60(TKEYP), KEY
2029	AESENC KEY STATE1
2030	AESENC KEY STATE2
2031	AESENC KEY STATE3
2032	AESENC KEY STATE4
2033	movaps -0x50(TKEYP), KEY
2034	AESENC KEY STATE1
2035	AESENC KEY STATE2
2036	AESENC KEY STATE3
2037	AESENC KEY STATE4
2038#.align 4
2039.L4enc192:
2040	movaps -0x40(TKEYP), KEY
2041	AESENC KEY STATE1
2042	AESENC KEY STATE2
2043	AESENC KEY STATE3
2044	AESENC KEY STATE4
2045	movaps -0x30(TKEYP), KEY
2046	AESENC KEY STATE1
2047	AESENC KEY STATE2
2048	AESENC KEY STATE3
2049	AESENC KEY STATE4
2050#.align 4
2051.L4enc128:
2052	movaps -0x20(TKEYP), KEY
2053	AESENC KEY STATE1
2054	AESENC KEY STATE2
2055	AESENC KEY STATE3
2056	AESENC KEY STATE4
2057	movaps -0x10(TKEYP), KEY
2058	AESENC KEY STATE1
2059	AESENC KEY STATE2
2060	AESENC KEY STATE3
2061	AESENC KEY STATE4
2062	movaps (TKEYP), KEY
2063	AESENC KEY STATE1
2064	AESENC KEY STATE2
2065	AESENC KEY STATE3
2066	AESENC KEY STATE4
2067	movaps 0x10(TKEYP), KEY
2068	AESENC KEY STATE1
2069	AESENC KEY STATE2
2070	AESENC KEY STATE3
2071	AESENC KEY STATE4
2072	movaps 0x20(TKEYP), KEY
2073	AESENC KEY STATE1
2074	AESENC KEY STATE2
2075	AESENC KEY STATE3
2076	AESENC KEY STATE4
2077	movaps 0x30(TKEYP), KEY
2078	AESENC KEY STATE1
2079	AESENC KEY STATE2
2080	AESENC KEY STATE3
2081	AESENC KEY STATE4
2082	movaps 0x40(TKEYP), KEY
2083	AESENC KEY STATE1
2084	AESENC KEY STATE2
2085	AESENC KEY STATE3
2086	AESENC KEY STATE4
2087	movaps 0x50(TKEYP), KEY
2088	AESENC KEY STATE1
2089	AESENC KEY STATE2
2090	AESENC KEY STATE3
2091	AESENC KEY STATE4
2092	movaps 0x60(TKEYP), KEY
2093	AESENC KEY STATE1
2094	AESENC KEY STATE2
2095	AESENC KEY STATE3
2096	AESENC KEY STATE4
2097	movaps 0x70(TKEYP), KEY
2098	AESENCLAST KEY STATE1		# last round
2099	AESENCLAST KEY STATE2
2100	AESENCLAST KEY STATE3
2101	AESENCLAST KEY STATE4
2102	ret
2103ENDPROC(_aesni_enc4)
2104
2105/*
2106 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Decrypts the single 16-byte block at src and writes the result to dst.
 * The key length is read from ctx + 480 before KEYP is moved to the
 * decryption key schedule at ctx + 240.
2107 */
2108ENTRY(aesni_dec)
2109	FRAME_BEGIN
2110#ifndef __x86_64__
2111	pushl KEYP
2112	pushl KLEN
2113	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
2114	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
2115	movl (FRAME_OFFSET+20)(%esp), INP	# src
2116#endif
2117	mov 480(KEYP), KLEN		# key length
2118	add $240, KEYP			# KEYP -> decryption key schedule
2119	movups (INP), STATE		# input
2120	call _aesni_dec1
2121	movups STATE, (OUTP)		#output
2122#ifndef __x86_64__
2123	popl KLEN
2124	popl KEYP
2125#endif
2126	FRAME_END
2127	ret
2128ENDPROC(aesni_dec)
2129
2130/*
2131 * _aesni_dec1:		internal ABI
 *	Decrypt one block.  Mirror image of _aesni_enc1 using AESDEC and
 *	AESDECLAST; callers in this file point KEYP at the decryption key
 *	schedule (ctx + 240) before calling.
2132 * input:
2133 *	KEYP:		key struct pointer
2134 *	KLEN:		key length
2135 *	STATE:		initial state (input)
2136 * output:
2137 *	STATE:		final state (output)
2138 * changed:
2139 *	KEY
2140 *	TKEYP (T1)
2141 */
2142.align 4
2143_aesni_dec1:
2144	movaps (KEYP), KEY		# key
2145	mov KEYP, TKEYP
2146	pxor KEY, STATE		# round 0
2147	add $0x30, TKEYP
2148	cmp $24, KLEN
2149	jb .Ldec128
2150	lea 0x20(TKEYP), TKEYP		# lea keeps the flags of the cmp for the je
2151	je .Ldec192
2152	add $0x20, TKEYP
2153	movaps -0x60(TKEYP), KEY
2154	AESDEC KEY STATE
2155	movaps -0x50(TKEYP), KEY
2156	AESDEC KEY STATE
2157.align 4
2158.Ldec192:
2159	movaps -0x40(TKEYP), KEY
2160	AESDEC KEY STATE
2161	movaps -0x30(TKEYP), KEY
2162	AESDEC KEY STATE
2163.align 4
2164.Ldec128:
2165	movaps -0x20(TKEYP), KEY
2166	AESDEC KEY STATE
2167	movaps -0x10(TKEYP), KEY
2168	AESDEC KEY STATE
2169	movaps (TKEYP), KEY
2170	AESDEC KEY STATE
2171	movaps 0x10(TKEYP), KEY
2172	AESDEC KEY STATE
2173	movaps 0x20(TKEYP), KEY
2174	AESDEC KEY STATE
2175	movaps 0x30(TKEYP), KEY
2176	AESDEC KEY STATE
2177	movaps 0x40(TKEYP), KEY
2178	AESDEC KEY STATE
2179	movaps 0x50(TKEYP), KEY
2180	AESDEC KEY STATE
2181	movaps 0x60(TKEYP), KEY
2182	AESDEC KEY STATE
2183	movaps 0x70(TKEYP), KEY
2184	AESDECLAST KEY STATE		# last round
2185	ret
2186ENDPROC(_aesni_dec1)
2187
2188/*
2189 * _aesni_dec4:	internal ABI
 *	Decrypt four independent blocks with the same key schedule.  Each
 *	round key is loaded once and applied to all four states.  The round
 *	count is selected from KLEN exactly as in _aesni_dec1.
2190 * input:
2191 *	KEYP:		key struct pointer
2192 *	KLEN:		key length
2193 *	STATE1:		initial state (input)
2194 *	STATE2
2195 *	STATE3
2196 *	STATE4
2197 * output:
2198 *	STATE1:		final state (output)
2199 *	STATE2
2200 *	STATE3
2201 *	STATE4
2202 * changed:
2203 *	KEY
2204 *	TKEYP (T1)
2205 */
2206.align 4
2207_aesni_dec4:
2208	movaps (KEYP), KEY		# key
2209	mov KEYP, TKEYP
2210	pxor KEY, STATE1		# round 0
2211	pxor KEY, STATE2
2212	pxor KEY, STATE3
2213	pxor KEY, STATE4
2214	add $0x30, TKEYP
2215	cmp $24, KLEN
2216	jb .L4dec128
2217	lea 0x20(TKEYP), TKEYP		# lea keeps the flags of the cmp for the je
2218	je .L4dec192
2219	add $0x20, TKEYP
2220	movaps -0x60(TKEYP), KEY
2221	AESDEC KEY STATE1
2222	AESDEC KEY STATE2
2223	AESDEC KEY STATE3
2224	AESDEC KEY STATE4
2225	movaps -0x50(TKEYP), KEY
2226	AESDEC KEY STATE1
2227	AESDEC KEY STATE2
2228	AESDEC KEY STATE3
2229	AESDEC KEY STATE4
2230.align 4
2231.L4dec192:
2232	movaps -0x40(TKEYP), KEY
2233	AESDEC KEY STATE1
2234	AESDEC KEY STATE2
2235	AESDEC KEY STATE3
2236	AESDEC KEY STATE4
2237	movaps -0x30(TKEYP), KEY
2238	AESDEC KEY STATE1
2239	AESDEC KEY STATE2
2240	AESDEC KEY STATE3
2241	AESDEC KEY STATE4
2242.align 4
2243.L4dec128:
2244	movaps -0x20(TKEYP), KEY
2245	AESDEC KEY STATE1
2246	AESDEC KEY STATE2
2247	AESDEC KEY STATE3
2248	AESDEC KEY STATE4
2249	movaps -0x10(TKEYP), KEY
2250	AESDEC KEY STATE1
2251	AESDEC KEY STATE2
2252	AESDEC KEY STATE3
2253	AESDEC KEY STATE4
2254	movaps (TKEYP), KEY
2255	AESDEC KEY STATE1
2256	AESDEC KEY STATE2
2257	AESDEC KEY STATE3
2258	AESDEC KEY STATE4
2259	movaps 0x10(TKEYP), KEY
2260	AESDEC KEY STATE1
2261	AESDEC KEY STATE2
2262	AESDEC KEY STATE3
2263	AESDEC KEY STATE4
2264	movaps 0x20(TKEYP), KEY
2265	AESDEC KEY STATE1
2266	AESDEC KEY STATE2
2267	AESDEC KEY STATE3
2268	AESDEC KEY STATE4
2269	movaps 0x30(TKEYP), KEY
2270	AESDEC KEY STATE1
2271	AESDEC KEY STATE2
2272	AESDEC KEY STATE3
2273	AESDEC KEY STATE4
2274	movaps 0x40(TKEYP), KEY
2275	AESDEC KEY STATE1
2276	AESDEC KEY STATE2
2277	AESDEC KEY STATE3
2278	AESDEC KEY STATE4
2279	movaps 0x50(TKEYP), KEY
2280	AESDEC KEY STATE1
2281	AESDEC KEY STATE2
2282	AESDEC KEY STATE3
2283	AESDEC KEY STATE4
2284	movaps 0x60(TKEYP), KEY
2285	AESDEC KEY STATE1
2286	AESDEC KEY STATE2
2287	AESDEC KEY STATE3
2288	AESDEC KEY STATE4
2289	movaps 0x70(TKEYP), KEY
2290	AESDECLAST KEY STATE1		# last round
2291	AESDECLAST KEY STATE2
2292	AESDECLAST KEY STATE3
2293	AESDECLAST KEY STATE4
2294	ret
2295ENDPROC(_aesni_dec4)
2296
2297/*
2298 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2299 *		      size_t len)
 *
 * ECB-encrypts every whole 16-byte block of src into dst.  Blocks are
 * processed four at a time while at least 64 bytes remain, then one at a
 * time; a trailing partial block (len % 16) is left untouched.
 * LEN is a byte count, so every length test uses unsigned conditions.
2300 */
2301ENTRY(aesni_ecb_enc)
2302	FRAME_BEGIN
2303#ifndef __x86_64__
2304	pushl LEN
2305	pushl KEYP
2306	pushl KLEN
2307	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
2308	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
2309	movl (FRAME_OFFSET+24)(%esp), INP	# src
2310	movl (FRAME_OFFSET+28)(%esp), LEN	# len
2311#endif
2312	test LEN, LEN		# check length
2313	jz .Lecb_enc_ret
2314	mov 480(KEYP), KLEN		# key length
2315	cmp $16, LEN
2316	jb .Lecb_enc_ret		# less than one block: nothing to do
2317	cmp $64, LEN
2318	jb .Lecb_enc_loop1		# fewer than four blocks: single-block loop
2319.align 4
2320.Lecb_enc_loop4:
2321	movups (INP), STATE1
2322	movups 0x10(INP), STATE2
2323	movups 0x20(INP), STATE3
2324	movups 0x30(INP), STATE4
2325	call _aesni_enc4
2326	movups STATE1, (OUTP)
2327	movups STATE2, 0x10(OUTP)
2328	movups STATE3, 0x20(OUTP)
2329	movups STATE4, 0x30(OUTP)
2330	sub $64, LEN
2331	add $64, INP
2332	add $64, OUTP
2333	cmp $64, LEN
2334	jae .Lecb_enc_loop4		# unsigned: at least four blocks left
2335	cmp $16, LEN
2336	jb .Lecb_enc_ret
2337.align 4
2338.Lecb_enc_loop1:
	/* _aesni_enc1 works on STATE; this relies on STATE and STATE1 naming
	 * the same register (defined outside this part of the file). */
2339	movups (INP), STATE1
2340	call _aesni_enc1
2341	movups STATE1, (OUTP)
2342	sub $16, LEN
2343	add $16, INP
2344	add $16, OUTP
2345	cmp $16, LEN
2346	jae .Lecb_enc_loop1		# unsigned: at least one block left
2347.Lecb_enc_ret:
2348#ifndef __x86_64__
2349	popl KLEN
2350	popl KEYP
2351	popl LEN
2352#endif
2353	FRAME_END
2354	ret
2355ENDPROC(aesni_ecb_enc)
2356
2357/*
2358 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2359 *		      size_t len);
 *
 * ECB-decrypts every whole 16-byte block of src into dst using the
 * decryption key schedule at ctx + 240.  Blocks are processed four at a
 * time while at least 64 bytes remain, then one at a time; a trailing
 * partial block (len % 16) is left untouched.
 * LEN is a byte count, so every length test uses unsigned conditions.
2360 */
2361ENTRY(aesni_ecb_dec)
2362	FRAME_BEGIN
2363#ifndef __x86_64__
2364	pushl LEN
2365	pushl KEYP
2366	pushl KLEN
2367	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
2368	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
2369	movl (FRAME_OFFSET+24)(%esp), INP	# src
2370	movl (FRAME_OFFSET+28)(%esp), LEN	# len
2371#endif
2372	test LEN, LEN
2373	jz .Lecb_dec_ret
2374	mov 480(KEYP), KLEN		# key length
2375	add $240, KEYP			# KEYP -> decryption key schedule
2376	cmp $16, LEN
2377	jb .Lecb_dec_ret		# less than one block: nothing to do
2378	cmp $64, LEN
2379	jb .Lecb_dec_loop1		# fewer than four blocks: single-block loop
2380.align 4
2381.Lecb_dec_loop4:
2382	movups (INP), STATE1
2383	movups 0x10(INP), STATE2
2384	movups 0x20(INP), STATE3
2385	movups 0x30(INP), STATE4
2386	call _aesni_dec4
2387	movups STATE1, (OUTP)
2388	movups STATE2, 0x10(OUTP)
2389	movups STATE3, 0x20(OUTP)
2390	movups STATE4, 0x30(OUTP)
2391	sub $64, LEN
2392	add $64, INP
2393	add $64, OUTP
2394	cmp $64, LEN
2395	jae .Lecb_dec_loop4		# unsigned: at least four blocks left
2396	cmp $16, LEN
2397	jb .Lecb_dec_ret
2398.align 4
2399.Lecb_dec_loop1:
	/* _aesni_dec1 works on STATE; this relies on STATE and STATE1 naming
	 * the same register (defined outside this part of the file). */
2400	movups (INP), STATE1
2401	call _aesni_dec1
2402	movups STATE1, (OUTP)
2403	sub $16, LEN
2404	add $16, INP
2405	add $16, OUTP
2406	cmp $16, LEN
2407	jae .Lecb_dec_loop1		# unsigned: at least one block left
2408.Lecb_dec_ret:
2409#ifndef __x86_64__
2410	popl KLEN
2411	popl KEYP
2412	popl LEN
2413#endif
2414	FRAME_END
2415	ret
2416ENDPROC(aesni_ecb_dec)
2417
2418/*
2419 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2420 *		      size_t len, u8 *iv)
 *
 * CBC-encrypts every whole 16-byte block of src into dst.  The chaining
 * value starts as the 16 bytes at iv; after the last block the final
 * ciphertext block is written back to iv.  When len is below 16 nothing is
 * read or written, iv included.
 * LEN is a byte count, so every length test uses unsigned conditions.
2421 */
2422ENTRY(aesni_cbc_enc)
2423	FRAME_BEGIN
2424#ifndef __x86_64__
2425	pushl IVP
2426	pushl LEN
2427	pushl KEYP
2428	pushl KLEN
2429	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2430	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2431	movl (FRAME_OFFSET+28)(%esp), INP	# src
2432	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2433	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2434#endif
2435	cmp $16, LEN
2436	jb .Lcbc_enc_ret
2437	mov 480(KEYP), KLEN		# key length
2438	movups (IVP), STATE	# load iv as initial state
2439.align 4
2440.Lcbc_enc_loop:
2441	movups (INP), IN	# load input
2442	pxor IN, STATE		# chain: plaintext ^ previous ciphertext (or iv)
2443	call _aesni_enc1
2444	movups STATE, (OUTP)	# store output
2445	sub $16, LEN
2446	add $16, INP
2447	add $16, OUTP
2448	cmp $16, LEN
2449	jae .Lcbc_enc_loop	# unsigned: at least one block left
2450	movups STATE, (IVP)	# last ciphertext block is the next iv
2451.Lcbc_enc_ret:
2452#ifndef __x86_64__
2453	popl KLEN
2454	popl KEYP
2455	popl LEN
2456	popl IVP
2457#endif
2458	FRAME_END
2459	ret
2460ENDPROC(aesni_cbc_enc)
2461
2462/*
2463 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2464 *		      size_t len, u8 *iv)
 *
 * CBC-decrypts every whole 16-byte block of src into dst using the
 * decryption key schedule at ctx + 240.  Four blocks are decrypted per
 * iteration while at least 64 bytes remain, then one block at a time.  The
 * last ciphertext block processed is written back to iv.  When len is below
 * 16 nothing is read or written, iv included.
 * LEN is a byte count, so every length test uses unsigned conditions.
2465 */
2466ENTRY(aesni_cbc_dec)
2467	FRAME_BEGIN
2468#ifndef __x86_64__
2469	pushl IVP
2470	pushl LEN
2471	pushl KEYP
2472	pushl KLEN
2473	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2474	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2475	movl (FRAME_OFFSET+28)(%esp), INP	# src
2476	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2477	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2478#endif
2479	cmp $16, LEN
2480	jb .Lcbc_dec_just_ret
2481	mov 480(KEYP), KLEN		# key length
2482	add $240, KEYP			# KEYP -> decryption key schedule
2483	movups (IVP), IV
2484	cmp $64, LEN
2485	jb .Lcbc_dec_loop1
2486.align 4
2487.Lcbc_dec_loop4:
2488	movups (INP), IN1
2489	movaps IN1, STATE1
2490	movups 0x10(INP), IN2
2491	movaps IN2, STATE2
2492#ifdef __x86_64__
2493	movups 0x20(INP), IN3
2494	movaps IN3, STATE3
2495	movups 0x30(INP), IN4
2496	movaps IN4, STATE4
2497#else
	/* 32-bit build: only IN1/IN2 are used, so they are recycled for the
	 * third and fourth ciphertext blocks. */
2498	movups 0x20(INP), IN1
2499	movaps IN1, STATE3
2500	movups 0x30(INP), IN2
2501	movaps IN2, STATE4
2502#endif
2503	call _aesni_dec4
2504	pxor IV, STATE1			# block 0 is chained with the incoming iv
2505#ifdef __x86_64__
2506	pxor IN1, STATE2
2507	pxor IN2, STATE3
2508	pxor IN3, STATE4
2509	movaps IN4, IV			# last ciphertext block chains onward
2510#else
2511	pxor IN1, STATE4		# IN1 still holds ciphertext block 2
2512	movaps IN2, IV			# IN2 holds ciphertext block 3
2513	movups (INP), IN1		# reload ciphertext blocks 0 and 1; this
2514	pxor IN1, STATE2		# happens before anything is stored to OUTP
2515	movups 0x10(INP), IN2
2516	pxor IN2, STATE3
2517#endif
2518	movups STATE1, (OUTP)
2519	movups STATE2, 0x10(OUTP)
2520	movups STATE3, 0x20(OUTP)
2521	movups STATE4, 0x30(OUTP)
2522	sub $64, LEN
2523	add $64, INP
2524	add $64, OUTP
2525	cmp $64, LEN
2526	jae .Lcbc_dec_loop4		# unsigned: at least four blocks left
2527	cmp $16, LEN
2528	jb .Lcbc_dec_ret
2529.align 4
2530.Lcbc_dec_loop1:
2531	movups (INP), IN
2532	movaps IN, STATE
2533	call _aesni_dec1
2534	pxor IV, STATE
2535	movups STATE, (OUTP)
2536	movaps IN, IV			# this ciphertext block chains onward
2537	sub $16, LEN
2538	add $16, INP
2539	add $16, OUTP
2540	cmp $16, LEN
2541	jae .Lcbc_dec_loop1		# unsigned: at least one block left
2542.Lcbc_dec_ret:
2543	movups IV, (IVP)
2544.Lcbc_dec_just_ret:
2545#ifndef __x86_64__
2546	popl KLEN
2547	popl KEYP
2548	popl LEN
2549	popl IVP
2550#endif
2551	FRAME_END
2552	ret
2553ENDPROC(aesni_cbc_dec)
2554
2555#ifdef __x86_64__
/*
 * 16-byte shuffle control listing the byte indices 15 down to 0.  Used as
 * the PSHUFB mask (BSWAP_MASK) by _aesni_inc_init and _aesni_inc to reverse
 * the byte order of a whole XMM register.  Aligned to 16 bytes because it is
 * loaded with movaps.
 */
2556.pushsection .rodata
2557.align 16
2558.Lbswap_mask:
2559	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2560.popsection
2561
2562/*
2563 * _aesni_inc_init:	internal ABI
2564 *	setup registers used by _aesni_inc
2565 * input:
2566 *	IV
2567 * output:
2568 *	CTR:	== IV, in little endian
2569 *	TCTR_LOW: == lower qword of CTR
2570 *	INC:	== 1, in little endian
2571 *	BSWAP_MASK == endian swapping mask
 *
 * This code is only built for x86_64, so the mask is addressed
 * RIP-relative instead of through an absolute address.
2572 */
2573.align 4
2574_aesni_inc_init:
2575	movaps .Lbswap_mask(%rip), BSWAP_MASK
2576	movaps IV, CTR
2577	PSHUFB_XMM BSWAP_MASK CTR	# byte-reverse the IV into CTR
2578	mov $1, TCTR_LOW
2579	MOVQ_R64_XMM TCTR_LOW INC	# INC = 1 in the low qword
2580	MOVQ_R64_XMM CTR TCTR_LOW	# shadow copy of the low qword of CTR
2581	ret
2582ENDPROC(_aesni_inc_init)
2583
2584/*
2585 * _aesni_inc:		internal ABI
2586 *	Increase IV by 1, IV is in big endian
 *	The addition is done on the byte-reversed copy in CTR.  paddq does not
 *	carry between qwords, so the low qword is mirrored in the general
 *	purpose register TCTR_LOW and its carry flag decides whether the high
 *	qword of CTR must be incremented as well.
2587 * input:
2588 *	IV
2589 *	CTR:	== IV, in little endian
2590 *	TCTR_LOW: == lower qword of CTR
2591 *	INC:	== 1, in little endian
2592 *	BSWAP_MASK == endian swapping mask
2593 * output:
2594 *	IV:	Increase by 1
2595 * changed:
2596 *	CTR:	== output IV, in little endian
2597 *	TCTR_LOW: == lower qword of CTR
2598 */
2599.align 4
2600_aesni_inc:
2601	paddq INC, CTR			# low qword of CTR += 1
2602	add $1, TCTR_LOW		# same increment on the shadow copy, sets CF
2603	jnc .Linc_low			# no wrap-around: high qword is unchanged
2604	pslldq $8, INC			# move the 1 into the high qword of INC
2605	paddq INC, CTR			# propagate the carry into the high qword
2606	psrldq $8, INC			# restore INC to 1 in the low qword
2607.Linc_low:
2608	movaps CTR, IV
2609	PSHUFB_XMM BSWAP_MASK IV	# byte-reverse back into IV
2610	ret
2611ENDPROC(_aesni_inc)
2612
2613/*
2614 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2615 *		      size_t len, u8 *iv)
 *
 * CTR mode: for every whole 16-byte block the current counter block is
 * encrypted and XORed with the input.  The counter is advanced with
 * _aesni_inc after each block and the next unused counter value is written
 * back to iv.  Four blocks are handled per iteration while at least 64 bytes
 * remain, then one block at a time.  When len is below 16 nothing is read or
 * written, iv included.
 * LEN is a byte count, so every length test uses unsigned conditions.
2616 */
2617ENTRY(aesni_ctr_enc)
2618	FRAME_BEGIN
2619	cmp $16, LEN
2620	jb .Lctr_enc_just_ret
2621	mov 480(KEYP), KLEN		# key length
2622	movups (IVP), IV
2623	call _aesni_inc_init
2624	cmp $64, LEN
2625	jb .Lctr_enc_loop1
2626.align 4
2627.Lctr_enc_loop4:
	/* capture four consecutive counter blocks, interleaved with the
	 * input loads */
2628	movaps IV, STATE1
2629	call _aesni_inc
2630	movups (INP), IN1
2631	movaps IV, STATE2
2632	call _aesni_inc
2633	movups 0x10(INP), IN2
2634	movaps IV, STATE3
2635	call _aesni_inc
2636	movups 0x20(INP), IN3
2637	movaps IV, STATE4
2638	call _aesni_inc
2639	movups 0x30(INP), IN4
2640	call _aesni_enc4
2641	pxor IN1, STATE1		# keystream ^ input
2642	movups STATE1, (OUTP)
2643	pxor IN2, STATE2
2644	movups STATE2, 0x10(OUTP)
2645	pxor IN3, STATE3
2646	movups STATE3, 0x20(OUTP)
2647	pxor IN4, STATE4
2648	movups STATE4, 0x30(OUTP)
2649	sub $64, LEN
2650	add $64, INP
2651	add $64, OUTP
2652	cmp $64, LEN
2653	jae .Lctr_enc_loop4		# unsigned: at least four blocks left
2654	cmp $16, LEN
2655	jb .Lctr_enc_ret
2656.align 4
2657.Lctr_enc_loop1:
2658	movaps IV, STATE
2659	call _aesni_inc
2660	movups (INP), IN
2661	call _aesni_enc1
2662	pxor IN, STATE			# keystream ^ input
2663	movups STATE, (OUTP)
2664	sub $16, LEN
2665	add $16, INP
2666	add $16, OUTP
2667	cmp $16, LEN
2668	jae .Lctr_enc_loop1		# unsigned: at least one block left
2669.Lctr_enc_ret:
2670	movups IV, (IVP)		# next counter value for the caller
2671.Lctr_enc_just_ret:
2672	FRAME_END
2673	ret
2674ENDPROC(aesni_ctr_enc)
2675
2676/*
2677 * _aesni_gf128mul_x_ble:		internal ABI
2678 *	Multiply in GF(2^128) for XTS IVs
 *	Steps, in order:
 *	  - pshufd $0x13 copies IV dwords (3, 0, 1, 0) into CTR dwords 0..3
 *	  - paddq doubles each 64-bit half of IV (left shift by one bit,
 *	    without carry between the halves)
 *	  - psrad $31 turns every CTR dword into all-ones or all-zeroes
 *	    according to the top bit of the original IV dword
 *	  - pand/pxor fold the masked constants back into IV
 *	NOTE(review): this presumably relies on GF128MUL_MASK holding 0x87 in
 *	dword 0 and 0x01 in dword 2 (zero elsewhere); the mask is defined
 *	outside this part of the file -- confirm there.
2679 * input:
2680 *	IV:	current IV
2681 *	GF128MUL_MASK == mask with 0x87 and 0x01
2682 * output:
2683 *	IV:	next IV
2684 * changed:
2685 *	CTR:	== temporary value
2686 */
2687#define _aesni_gf128mul_x_ble() \
2688	pshufd $0x13, IV, CTR; \
2689	paddq IV, IV; \
2690	psrad $31, CTR; \
2691	pand GF128MUL_MASK, CTR; \
2692	pxor CTR, IV;
2693
2694/*
2695 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2696 *			 bool enc, u8 *iv)
 *
 * Processes exactly eight 16-byte blocks (0x80 bytes) of src into dst in two
 * batches of four.  For each block the current tweak (IV) is XORed into the
 * input, the tweak is parked in the matching dst slot, the batch is run
 * through _aesni_enc4 or _aesni_dec4, and the parked tweak is then read back
 * and XORed into the result.  The tweak is advanced with
 * _aesni_gf128mul_x_ble() between blocks and the tweak following the eighth
 * block is written back to iv.
 *
 * enc == 0 selects _aesni_dec4 and the key schedule at ctx + 240; any other
 * value selects _aesni_enc4 and the schedule at ctx + 0.
 *
 * INC is used as a plain scratch register here; CTR is clobbered by the
 * tweak macro.  This code is only built for x86_64, so symbols are
 * addressed RIP-relative.
2697 */
2698ENTRY(aesni_xts_crypt8)
2699	FRAME_BEGIN
2700	cmpb $0, %cl			# enc == false ?
2701	movl $0, %ecx			# key offset 0; mov (not xor) keeps the flags
2702	movl $240, %r10d
2703	leaq _aesni_enc4(%rip), %r11	# default: encrypt
2704	leaq _aesni_dec4(%rip), %rax
2705	cmovel %r10d, %ecx		# decrypt: key schedule at ctx + 240
2706	cmoveq %rax, %r11		# decrypt: call _aesni_dec4
2707
2708	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
2709	movups (IVP), IV
2710
2711	mov 480(KEYP), KLEN		# key length, read before KEYP is moved
2712	addq %rcx, KEYP
2713
	/* first batch: blocks 0-3 */
2714	movdqa IV, STATE1
2715	movdqu 0x00(INP), INC
2716	pxor INC, STATE1
2717	movdqu IV, 0x00(OUTP)		# park tweak 0 in the output slot
2718
2719	_aesni_gf128mul_x_ble()
2720	movdqa IV, STATE2
2721	movdqu 0x10(INP), INC
2722	pxor INC, STATE2
2723	movdqu IV, 0x10(OUTP)
2724
2725	_aesni_gf128mul_x_ble()
2726	movdqa IV, STATE3
2727	movdqu 0x20(INP), INC
2728	pxor INC, STATE3
2729	movdqu IV, 0x20(OUTP)
2730
2731	_aesni_gf128mul_x_ble()
2732	movdqa IV, STATE4
2733	movdqu 0x30(INP), INC
2734	pxor INC, STATE4
2735	movdqu IV, 0x30(OUTP)
2736
2737	call *%r11
2738
	/* finish blocks 0-3 while setting up blocks 4-7 in the freed states */
2739	movdqu 0x00(OUTP), INC		# parked tweak 0
2740	pxor INC, STATE1
2741	movdqu STATE1, 0x00(OUTP)
2742
2743	_aesni_gf128mul_x_ble()
2744	movdqa IV, STATE1
2745	movdqu 0x40(INP), INC
2746	pxor INC, STATE1
2747	movdqu IV, 0x40(OUTP)
2748
2749	movdqu 0x10(OUTP), INC
2750	pxor INC, STATE2
2751	movdqu STATE2, 0x10(OUTP)
2752
2753	_aesni_gf128mul_x_ble()
2754	movdqa IV, STATE2
2755	movdqu 0x50(INP), INC
2756	pxor INC, STATE2
2757	movdqu IV, 0x50(OUTP)
2758
2759	movdqu 0x20(OUTP), INC
2760	pxor INC, STATE3
2761	movdqu STATE3, 0x20(OUTP)
2762
2763	_aesni_gf128mul_x_ble()
2764	movdqa IV, STATE3
2765	movdqu 0x60(INP), INC
2766	pxor INC, STATE3
2767	movdqu IV, 0x60(OUTP)
2768
2769	movdqu 0x30(OUTP), INC
2770	pxor INC, STATE4
2771	movdqu STATE4, 0x30(OUTP)
2772
2773	_aesni_gf128mul_x_ble()
2774	movdqa IV, STATE4
2775	movdqu 0x70(INP), INC
2776	pxor INC, STATE4
2777	movdqu IV, 0x70(OUTP)
2778
2779	_aesni_gf128mul_x_ble()
2780	movups IV, (IVP)		# tweak for the block after these eight
2781
2782	call *%r11
2783
	/* finish blocks 4-7 */
2784	movdqu 0x40(OUTP), INC
2785	pxor INC, STATE1
2786	movdqu STATE1, 0x40(OUTP)
2787
2788	movdqu 0x50(OUTP), INC
2789	pxor INC, STATE2
2790	movdqu STATE2, 0x50(OUTP)
2791
2792	movdqu 0x60(OUTP), INC
2793	pxor INC, STATE3
2794	movdqu STATE3, 0x60(OUTP)
2795
2796	movdqu 0x70(OUTP), INC
2797	pxor INC, STATE4
2798	movdqu STATE4, 0x70(OUTP)
2799
2800	FRAME_END
2801	ret
2802ENDPROC(aesni_xts_crypt8)
2803
2804#endif
2805