xref: /linux/arch/x86/crypto/aesni-intel_asm.S (revision 90d32e92011eaae8e70a9169b4e7acf4ca8f9d3a)
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * Implement AES algorithm in Intel AES-NI instructions.
4 *
5 * The white paper of AES-NI instructions can be downloaded from:
6 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
7 *
8 * Copyright (C) 2008, Intel Corp.
9 *    Author: Huang Ying <ying.huang@intel.com>
10 *            Vinodh Gopal <vinodh.gopal@intel.com>
11 *            Kahraman Akdemir
12 *
13 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
14 * interface for 64-bit kernels.
15 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
16 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
17 *             Adrian Hoban <adrian.hoban@intel.com>
18 *             James Guilford (james.guilford@intel.com)
19 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
20 *             Tadeusz Struk (tadeusz.struk@intel.com)
21 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
22 *    Copyright (c) 2010, Intel Corporation.
23 *
24 * Ported x86_64 version to x86:
25 *    Author: Mathias Krause <minipli@googlemail.com>
26 */
27
28#include <linux/linkage.h>
29#include <asm/frame.h>
30#include <asm/nospec-branch.h>
31
32/*
33 * The following macros are used to move an (un)aligned 16 byte value to/from
34 * an XMM register.  This can be done for either FP or integer values, for FP use
35 * movaps (move aligned packed single) or integer use movdqa (move double quad
36 * aligned).  It doesn't make a performance difference which instruction is used
37 * since Nehalem (original Core i7) was released.  However, the movaps is a byte
38 * shorter, so that is the one we'll use for now. (same for unaligned).
 *
 * MOVADQ expands to the aligned move (movaps) and MOVUDQ to the unaligned
 * move (movups).
39 */
40#define MOVADQ	movaps
41#define MOVUDQ	movups
42
43#ifdef __x86_64__
44
45# constants in mergeable sections, linker can reorder and merge
/* POLY and TWOONE are used together by PRECOMPUTE when reducing HashKey<<1. */
46.section	.rodata.cst16.POLY, "aM", @progbits, 16
47.align 16
48POLY:   .octa 0xC2000000000000000000000000000001
49.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
50.align 16
51TWOONE: .octa 0x00000001000000000000000000000001
52
/* SHUF_MASK: pshufb control that reverses the 16 bytes of a register. */
53.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
54.align 16
55SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
/* MASK1 / MASK2: low-qword and high-qword all-ones masks. */
56.section	.rodata.cst16.MASK1, "aM", @progbits, 16
57.align 16
58MASK1:      .octa 0x0000000000000000ffffffffffffffff
59.section	.rodata.cst16.MASK2, "aM", @progbits, 16
60.align 16
61MASK2:      .octa 0xffffffffffffffff0000000000000000
/* ONE: added to the counter block with paddd to step the counter. */
62.section	.rodata.cst16.ONE, "aM", @progbits, 16
63.align 16
64ONE:        .octa 0x00000000000000000000000000000001
/*
 * NOTE(review): the F_MIN_MASK literal below has 31 hex digits, so its top
 * nibble is zero.  Confirm whether that is intended.
 */
65.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
66.align 16
67F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
68.section	.rodata.cst16.dec, "aM", @progbits, 16
69.align 16
70dec:        .octa 0x1
71.section	.rodata.cst16.enc, "aM", @progbits, 16
72.align 16
73enc:        .octa 0x2
74
75# order of these constants should not change.
76# more specifically, ALL_F should follow SHIFT_MASK,
77# and zero should follow ALL_F
# The GCM macros address across these three values with byte offsets
# (SHIFT_MASK+16-n, ALL_F+16-n, ALL_F-SHIFT_MASK(reg)), which is why they
# live together in a plain, non-mergeable .rodata section.
78.section	.rodata, "a", @progbits
79.align 16
80SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
81ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
82            .octa 0x00000000000000000000000000000000
83
84.text
85
/*
 * Byte offsets of the per-request GCM state, always addressed relative to
 * %arg2 by the macros below.  InLen shares the 16-byte slot of AadLen
 * (offset 16+8).
 */
86#define AadHash 16*0
87#define AadLen 16*1
88#define InLen (16*1)+8
89#define PBlockEncKey 16*2
90#define OrigIV 16*3
91#define CurCount 16*4
92#define PBlockLen 16*5
93#define	HashKey		16*6	// store HashKey <<1 mod poly here
94#define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
95#define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
96#define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
97#define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
98				// bits of  HashKey <<1 mod poly here
99				//(for Karatsuba purposes)
100#define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
101				// bits of  HashKey^2 <<1 mod poly here
102				// (for Karatsuba purposes)
103#define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
104				// bits of  HashKey^3 <<1 mod poly here
105				// (for Karatsuba purposes)
106#define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
107				// bits of  HashKey^4 <<1 mod poly here
108				// (for Karatsuba purposes)
109
/*
 * Argument register names (written without the leading %, so uses read
 * %arg1 ... %arg6).  keysize is a memory operand at byte offset 2*15*16
 * from %arg1; the macros load it, shift it right by 2 and adjust it to get
 * an AES round count.
 */
110#define arg1 rdi
111#define arg2 rsi
112#define arg3 rdx
113#define arg4 rcx
114#define arg5 r8
115#define arg6 r9
116#define keysize 2*15*16(%arg1)
117#endif
118
119
/*
 * Symbolic names for the XMM registers.  STATE and IN are aliases of STATE1
 * and IN1.  GF128MUL_MASK and IN2 both name %xmm7.
 */
120#define STATE1	%xmm0
121#define STATE2	%xmm4
122#define STATE3	%xmm5
123#define STATE4	%xmm6
124#define STATE	STATE1
125#define IN1	%xmm1
126#define IN2	%xmm7
127#define IN3	%xmm8
128#define IN4	%xmm9
129#define IN	IN1
130#define KEY	%xmm2
131#define IV	%xmm3
132
133#define BSWAP_MASK %xmm10
134#define CTR	%xmm11
135#define INC	%xmm12
136
137#define GF128MUL_MASK %xmm7
138
/*
 * Symbolic names for the general-purpose registers.  The 64-bit set uses
 * %rdi, %rsi, %rdx, %rcx, %r8 and %r9d plus %r10/%r11 as temporaries; the
 * 32-bit set has no T2/TCTR_LOW and maps OUTP (and so UKEYP) onto AREG.
 */
139#ifdef __x86_64__
140#define AREG	%rax
141#define KEYP	%rdi
142#define OUTP	%rsi
143#define UKEYP	OUTP
144#define INP	%rdx
145#define LEN	%rcx
146#define IVP	%r8
147#define KLEN	%r9d
148#define T1	%r10
149#define TKEYP	T1
150#define T2	%r11
151#define TCTR_LOW T2
152#else
153#define AREG	%eax
154#define KEYP	%edi
155#define OUTP	AREG
156#define UKEYP	OUTP
157#define INP	%edx
158#define LEN	%esi
159#define IVP	%ebp
160#define KLEN	%ebx
161#define T1	%ecx
162#define TKEYP	T1
163#endif
164
# FUNC_SAVE: push %r12, %r13 and %r14, which the GCM macros use as scratch.
# Must be paired with FUNC_RESTORE, which pops them in the reverse order.
165.macro FUNC_SAVE
166	push	%r12
167	push	%r13
168	push	%r14
169#
170# states of %xmm registers %xmm6:%xmm15 not saved
171# all %xmm registers are clobbered
172#
173.endm
174
175
# FUNC_RESTORE: pop %r14, %r13 and %r12, undoing the pushes of FUNC_SAVE.
176.macro FUNC_RESTORE
177	pop	%r14
178	pop	%r13
179	pop	%r12
180.endm
181
182# Precompute hashkeys.
183# Input: Hash subkey.
184# Output: HashKeys stored in gcm_context_data.  Only needs to be called
185# once per key.
186# clobbers r12, and tmp xmm registers.
#
# SUBKEY is an operand holding the address of the 16-byte hash subkey.
# Results are stored relative to arg2: HashKey, HashKey_2, HashKey_3 and
# HashKey_4, plus the matching _k slots (high qword xor low qword of each
# power, the Karatsuba middle operand).  All seven TMP registers are
# written, either directly or as GHASH_MUL scratch.
187.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
188	mov	\SUBKEY, %r12
189	movdqu	(%r12), \TMP3
190	movdqa	SHUF_MASK(%rip), \TMP2
191	pshufb	\TMP2, \TMP3		# byte-reverse the subkey
192
193	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
194
	# 128-bit shift left by one: shift both qwords, then carry bit 63 of
	# the low qword into bit 0 of the high qword.  TMP1 ends up holding
	# the bit shifted out of the high qword (in its lowest dword).
195	movdqa	\TMP3, \TMP2
196	psllq	$1, \TMP3
197	psrlq	$63, \TMP2
198	movdqa	\TMP2, \TMP1
199	pslldq	$8, \TMP2
200	psrldq	$8, \TMP1
201	por	\TMP2, \TMP3
202
203	# reduce HashKey<<1
204
	# Build a mask that selects all of POLY when the shifted-out bit was
	# 1 and nothing otherwise, then xor the selected value in.
205	pshufd	$0x24, \TMP1, \TMP2
206	pcmpeqd TWOONE(%rip), \TMP2
207	pand	POLY(%rip), \TMP2
208	pxor	\TMP2, \TMP3
209	movdqu	\TMP3, HashKey(%arg2)
210
211	movdqa	   \TMP3, \TMP5
212	pshufd	   $78, \TMP3, \TMP1	# swap the two qwords
213	pxor	   \TMP3, \TMP1		# both qwords = high xor low
214	movdqu	   \TMP1, HashKey_k(%arg2)
215
216	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
217# TMP5 = HashKey^2<<1 (mod poly)
218	movdqu	   \TMP5, HashKey_2(%arg2)
219# HashKey_2 = HashKey^2<<1 (mod poly)
220	pshufd	   $78, \TMP5, \TMP1
221	pxor	   \TMP5, \TMP1
222	movdqu	   \TMP1, HashKey_2_k(%arg2)
223
224	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
225# TMP5 = HashKey^3<<1 (mod poly)
226	movdqu	   \TMP5, HashKey_3(%arg2)
227	pshufd	   $78, \TMP5, \TMP1
228	pxor	   \TMP5, \TMP1
229	movdqu	   \TMP1, HashKey_3_k(%arg2)
230
231	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
232# TMP5 = HashKey^4<<1 (mod poly)
233	movdqu	   \TMP5, HashKey_4(%arg2)
234	pshufd	   $78, \TMP5, \TMP1
235	pxor	   \TMP5, \TMP1
236	movdqu	   \TMP1, HashKey_4_k(%arg2)
237.endm
238
239# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
240# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
# Also written here: xmm7 (passed to PRECOMPUTE as a temporary) and xmm14
# (loaded with SHUF_MASK inside CALC_AAD_HASH).
#
# Iv, SUBKEY, AAD and AADLEN are operands; Iv, SUBKEY and AAD hold
# addresses, AADLEN holds the AAD length in bytes.  The context is
# addressed through arg2.
241.macro GCM_INIT Iv SUBKEY AAD AADLEN
242	mov \AADLEN, %r11
243	mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
244	xor %r11d, %r11d
245	mov %r11, InLen(%arg2) # ctx_data.in_length = 0
246	mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
247	mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0 (8-byte store)
248	mov \Iv, %rax
249	movdqu (%rax), %xmm0
250	movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
251
252	movdqa  SHUF_MASK(%rip), %xmm2
253	pshufb %xmm2, %xmm0
254	movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = byte-reversed iv
255
256	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
257	movdqu HashKey(%arg2), %xmm13
258
	# Hash the AAD with the key in xmm13; the macro stores the result
	# to AadHash in the context.
259	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
260	%xmm4, %xmm5, %xmm6
261.endm
262
263# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
264# struct has been initialized by GCM_INIT.
265# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
266# Clobbers rax, r10-r13, and xmm0-xmm15
#
# Register roles visible in the body: arg2 = context, arg3 = output buffer,
# arg4 = input buffer, arg5 = length in bytes, r11 = running data offset
# into both buffers.  arg1 is used by the nested macros to reach the round
# keys.  The operation argument is compared against the literal strings
# enc and dec.
267.macro GCM_ENC_DEC operation
268	movdqu AadHash(%arg2), %xmm8
269	movdqu HashKey(%arg2), %xmm13
270	add %arg5, InLen(%arg2)
271
272	xor %r11d, %r11d # initialise the data pointer offset as zero
	# Finish a partial block left over from an earlier call, if any;
	# this advances r11 by the number of bytes it consumed.
273	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
274
275	sub %r11, %arg5		# sub partial block data used
276	mov %arg5, %r13		# save the number of bytes
277
278	and $-16, %r13		# %r13 = %r13 - (%r13 mod 16)
279	mov %r13, %r12
280	# Encrypt/Decrypt first few blocks
281
	# r12 = (number of whole blocks mod 4) * 16 selects how many blocks
	# are handled singly before the 4-blocks-at-a-time loop.
282	and	$(3<<4), %r12
283	jz	.L_initial_num_blocks_is_0_\@
284	cmp	$(2<<4), %r12
285	jb	.L_initial_num_blocks_is_1_\@
286	je	.L_initial_num_blocks_is_2_\@
287.L_initial_num_blocks_is_3_\@:
288	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
289%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
290	sub	$48, %r13
291	jmp	.L_initial_blocks_\@
292.L_initial_num_blocks_is_2_\@:
293	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
294%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
295	sub	$32, %r13
296	jmp	.L_initial_blocks_\@
297.L_initial_num_blocks_is_1_\@:
298	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
299%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
300	sub	$16, %r13
301	jmp	.L_initial_blocks_\@
302.L_initial_num_blocks_is_0_\@:
303	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
304%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
305.L_initial_blocks_\@:
306
307	# Main loop - Encrypt/Decrypt remaining blocks
308
	# r13 is now a multiple of 64.  When it is non-zero, the first four
	# blocks were already ciphered by INITIAL_BLOCKS_ENC_DEC and are
	# still waiting to be hashed.
309	test	%r13, %r13
310	je	.L_zero_cipher_left_\@
311	sub	$64, %r13
312	je	.L_four_cipher_left_\@
313.L_crypt_by_4_\@:
	# NOTE(review): the last macro argument is the literal enc for both
	# variants; the variant is chosen by the macro name suffix.
314	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
315	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
316	%xmm7, %xmm8, enc
317	add	$64, %r11
318	sub	$64, %r13
319	jne	.L_crypt_by_4_\@
320.L_four_cipher_left_\@:
321	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
322%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
323.L_zero_cipher_left_\@:
	# Save the running hash and the counter back into the context.
324	movdqu %xmm8, AadHash(%arg2)
325	movdqu %xmm0, CurCount(%arg2)
326
327	mov	%arg5, %r13
328	and	$15, %r13			# %r13 = arg5 (mod 16)
329	je	.L_multiple_of_16_bytes_\@
330
	# Remember the trailing byte count so a later call (PARTIAL_BLOCK)
	# or GCM_COMPLETE can pick the partial block up.
331	mov %r13, PBlockLen(%arg2)
332
333	# Handle the last <16 Byte block separately
334	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
335	movdqu %xmm0, CurCount(%arg2)
336	movdqa SHUF_MASK(%rip), %xmm10
337	pshufb %xmm10, %xmm0
338
339	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
	# Keep the keystream block for the rest of this partial block.
340	movdqu %xmm0, PBlockEncKey(%arg2)
341
	# With at least 16 bytes handled in this call, the tail is fetched
	# with one 16-byte load that ends at the end of the input; shorter
	# input is read piecewise by READ_PARTIAL_BLOCK instead.
342	cmp	$16, %arg5
343	jge	.L_large_enough_update_\@
344
345	lea (%arg4,%r11,1), %r10
346	mov %r13, %r12
347	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
348	jmp	.L_data_read_\@
349
350.L_large_enough_update_\@:
	# Temporarily move the offset to (end of input - 16).
351	sub	$16, %r11
352	add	%r13, %r11
353
354	# receive the last <16 Byte block
355	movdqu	(%arg4, %r11, 1), %xmm1
356
	# Restore the offset to the start of the trailing bytes.
357	sub	%r13, %r11
358	add	$16, %r11
359
360	lea	SHIFT_MASK+16(%rip), %r12
361	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
362	# (r13 is the number of bytes in plaintext mod 16)
363	sub	%r13, %r12
364	# get the appropriate shuffle mask
365	movdqu	(%r12), %xmm2
366	# shift right 16-r13 bytes
367	pshufb  %xmm2, %xmm1
368
369.L_data_read_\@:
	# r12 -> 16-byte mask whose low r13 bytes are 0xff and the rest 0
	# (it straddles ALL_F and the zero block that follows it).
370	lea ALL_F+16(%rip), %r12
371	sub %r13, %r12
372
373.ifc \operation, dec
	# Keep a copy of the ciphertext: the hash is taken over ciphertext.
374	movdqa  %xmm1, %xmm2
375.endif
376	pxor	%xmm1, %xmm0            # XOR Encrypt(K, Yn)
377	movdqu	(%r12), %xmm1
378	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
379	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
380.ifc \operation, dec
381	pand    %xmm1, %xmm2
382	movdqa SHUF_MASK(%rip), %xmm10
383	pshufb %xmm10 ,%xmm2
384
385	pxor %xmm2, %xmm8
386.else
387	movdqa SHUF_MASK(%rip), %xmm10
388	pshufb %xmm10,%xmm0
389
390	pxor	%xmm0, %xmm8
391.endif
392
	# The partial block is only xor-ed into the hash here; the GHASH
	# multiply for it is done later (PARTIAL_BLOCK or GCM_COMPLETE).
393	movdqu %xmm8, AadHash(%arg2)
394.ifc \operation, enc
395	# undo the byte reversal applied for the hash above
396	movdqa SHUF_MASK(%rip), %xmm10
397	# shuffle xmm0 back to output as ciphertext
398	pshufb %xmm10, %xmm0
399.endif
400
401	# Output %r13 bytes
	# More than 8 bytes: store the low qword in one go, then continue
	# byte by byte with the high qword.
402	movq %xmm0, %rax
403	cmp $8, %r13
404	jle .L_less_than_8_bytes_left_\@
405	mov %rax, (%arg3 , %r11, 1)
406	add $8, %r11
407	psrldq $8, %xmm0
408	movq %xmm0, %rax
409	sub $8, %r13
410.L_less_than_8_bytes_left_\@:
411	mov %al,  (%arg3, %r11, 1)
412	add $1, %r11
413	shr $8, %rax
414	sub $1, %r13
415	jne .L_less_than_8_bytes_left_\@
416.L_multiple_of_16_bytes_\@:
417.endm
418
419# GCM_COMPLETE Finishes update of tag of last partial block
420# Output: Authorization Tag (AUTH_TAG)
421# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
#
# AUTHTAG is an operand holding the address the tag is written to and
# AUTHTAGLEN an operand holding the number of tag bytes to store.  The
# context is addressed through arg2.
#
# Steps: finish the GHASH multiply for a pending partial block (when
# PBlockLen is non-zero), fold in the len(A)||len(C) block, byte-reverse the
# hash and xor it with E(K, Y0).
#
# Tag store paths: 16 bytes in one store; otherwise 8 bytes when at least 8
# remain, then 4 bytes, then a 1-3 byte tail.  The 4-byte step is not
# guarded, so a requested length below 4 still stores 4 bytes.
422.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
423	movdqu AadHash(%arg2), %xmm8
424	movdqu HashKey(%arg2), %xmm13
425
426	mov PBlockLen(%arg2), %r12
427
428	test %r12, %r12
429	je .L_partial_done\@
430
	# A partial block was xor-ed into the hash but not yet multiplied.
431	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
432
433.L_partial_done\@:
434	mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
435	shl	$3, %r12		  # convert into number of bits
	# Move all 64 bits: AadLen is kept as a 64-bit field and a 32-bit
	# move would silently drop the high half of the bit count.
436	movq	%r12, %xmm15		  # len(A) in %xmm15
437	mov InLen(%arg2), %r12
438	shl     $3, %r12                  # len(C) in bits (*128)
439	movq    %r12, %xmm1
440
441	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
442	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
443	pxor	%xmm15, %xmm8
444	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
445	# final GHASH computation
446	movdqa SHUF_MASK(%rip), %xmm10
447	pshufb %xmm10, %xmm8
448
449	movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
450	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
451	pxor	%xmm8, %xmm0
452.L_return_T_\@:
453	mov	\AUTHTAG, %r10                     # %r10 = authTag
454	mov	\AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
455	cmp	$16, %r11
456	je	.L_T_16_\@
457	cmp	$8, %r11
458	jl	.L_T_4_\@
459.L_T_8_\@:
	# store 8 tag bytes, then shift the remaining bytes down
460	movq	%xmm0, %rax
461	mov	%rax, (%r10)
462	add	$8, %r10
463	sub	$8, %r11
464	psrldq	$8, %xmm0
465	test	%r11, %r11
466	je	.L_return_T_done_\@
467.L_T_4_\@:
	# store 4 tag bytes, then shift the remaining bytes down
468	movd	%xmm0, %eax
469	mov	%eax, (%r10)
470	add	$4, %r10
471	sub	$4, %r11
472	psrldq	$4, %xmm0
473	test	%r11, %r11
474	je	.L_return_T_done_\@
475.L_T_123_\@:
	# 1 to 3 bytes left: store 2 when at least 2 remain, then the last one
476	movd	%xmm0, %eax
477	cmp	$2, %r11
478	jl	.L_T_1_\@
479	mov	%ax, (%r10)
480	cmp	$2, %r11
481	je	.L_return_T_done_\@
482	add	$2, %r10
483	sar	$16, %eax
484.L_T_1_\@:
485	mov	%al, (%r10)
486	jmp	.L_return_T_done_\@
487.L_T_16_\@:
488	movdqu	%xmm0, (%r10)
489.L_return_T_done_\@:
490.endm
491
492#ifdef __x86_64__
493/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
494*
495*
496* Input: A and B (128-bits each, bit-reflected)
497* Output: C = A*B*x mod poly, (i.e. >>1 )
498* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
499* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
500*
* GH is both the first operand and the destination of the result.
* HK is only read.  TMP1-TMP5 are scratch and are overwritten.
501*/
502.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
503	movdqa	  \GH, \TMP1
504	pshufd	  $78, \GH, \TMP2
505	pshufd	  $78, \HK, \TMP3
506	pxor	  \GH, \TMP2            # TMP2 = a1+a0
507	pxor	  \HK, \TMP3            # TMP3 = b1+b0
508	pclmulqdq $0x11, \HK, \TMP1     # TMP1 = a1*b1
509	pclmulqdq $0x00, \HK, \GH       # GH = a0*b0
510	pclmulqdq $0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
511	pxor	  \GH, \TMP2
512	pxor	  \TMP1, \TMP2          # TMP2 = (a0*b1)+(a1*b0)
513	movdqa	  \TMP2, \TMP3
514	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
515	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
516	pxor	  \TMP3, \GH
517	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
518
519        # first phase of the reduction
520
521	movdqa    \GH, \TMP2
522	movdqa    \GH, \TMP3
523	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
524					# in order to perform
525					# independent shifts
526	pslld     $31, \TMP2            # packed left shift <<31
527	pslld     $30, \TMP3            # packed left shift <<30
528	pslld     $25, \TMP4            # packed left shift <<25
529	pxor      \TMP3, \TMP2          # xor the shifted versions
530	pxor      \TMP4, \TMP2
531	movdqa    \TMP2, \TMP5
532	psrldq    $4, \TMP5             # right shift TMP5 1 DW
533	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
534	pxor      \TMP2, \GH
535
536        # second phase of the reduction
537
538	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
539					# in order to perform
540					# independent shifts
541	movdqa    \GH,\TMP3
542	movdqa    \GH,\TMP4
543	psrld     $1,\TMP2              # packed right shift >>1
544	psrld     $2,\TMP3              # packed right shift >>2
545	psrld     $7,\TMP4              # packed right shift >>7
546	pxor      \TMP3,\TMP2		# xor the shifted versions
547	pxor      \TMP4,\TMP2
548	pxor      \TMP5, \TMP2
549	pxor      \TMP2, \GH
550	pxor      \TMP1, \GH            # result is in GH
551.endm
552
553# Reads DLEN bytes starting at DPTR and stores in XMMDst
554# where 0 < DLEN < 16
555# Clobbers %rax, DLEN and XMM1
#
# Only bytes inside [DPTR, DPTR+DLEN) are loaded: one 8-byte load when
# DLEN >= 8, every other byte individually, starting from the last byte so
# that the shifts leave the first byte in the lowest position.  The unused
# high bytes of XMMDst are zero.  DLEN is zero on exit, except when DLEN
# was exactly 8, where it is zero as well (sub sets it).  XMM1 is written
# only on the path with more than 8 bytes.
556.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
557        cmp $8, \DLEN
558        jl .L_read_lt8_\@
559        mov (\DPTR), %rax		# first 8 bytes in one load
560        movq %rax, \XMMDst
561        sub $8, \DLEN
562        jz .L_done_read_partial_block_\@
563	xor %eax, %eax
564.L_read_next_byte_\@:
	# collect bytes 8 .. 8+DLEN-1, highest address first
565        shl $8, %rax
566        mov 7(\DPTR, \DLEN, 1), %al
567        dec \DLEN
568        jnz .L_read_next_byte_\@
569        movq %rax, \XMM1
570	pslldq $8, \XMM1		# move them into the high qword
571        por \XMM1, \XMMDst
572	jmp .L_done_read_partial_block_\@
573.L_read_lt8_\@:
574	xor %eax, %eax
575.L_read_next_byte_lt8_\@:
	# collect bytes 0 .. DLEN-1, highest address first
576        shl $8, %rax
577        mov -1(\DPTR, \DLEN, 1), %al
578        dec \DLEN
579        jnz .L_read_next_byte_lt8_\@
580        movq %rax, \XMMDst
581.L_done_read_partial_block_\@:
582.endm
583
584# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
585# clobbers r10-11, xmm14
# Also overwritten: rax (inside READ_PARTIAL_BLOCK) and TMP1-TMP7.
#
# HASHKEY is the xmm register holding the hash key and is only read.
# AAD and AADLEN are operands holding the AAD address and its length in
# bytes.  The final hash is stored to AadHash relative to arg2.
586.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
587	TMP6 TMP7
588	MOVADQ	   SHUF_MASK(%rip), %xmm14
589	mov	   \AAD, %r10		# %r10 = AAD
590	mov	   \AADLEN, %r11		# %r11 = aadLen
591	pxor	   \TMP7, \TMP7
592	pxor	   \TMP6, \TMP6		# TMP6 = running hash, starts at 0
593
594	cmp	   $16, %r11
595	jl	   .L_get_AAD_rest\@
596.L_get_AAD_blocks\@:
	# one full 16-byte block per iteration
597	movdqu	   (%r10), \TMP7
598	pshufb	   %xmm14, \TMP7 # byte-reflect the AAD data
599	pxor	   \TMP7, \TMP6
600	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
601	add	   $16, %r10
602	sub	   $16, %r11
603	cmp	   $16, %r11
604	jge	   .L_get_AAD_blocks\@
605
606	movdqu	   \TMP6, \TMP7
607
608	/* read the last <16B of AAD */
609.L_get_AAD_rest\@:
610	test	   %r11, %r11
611	je	   .L_get_AAD_done\@
612
	# READ_PARTIAL_BLOCK zero-fills the unused high bytes of TMP7.
613	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
614	pshufb	   %xmm14, \TMP7 # byte-reflect the AAD data
615	pxor	   \TMP6, \TMP7
616	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
617	movdqu \TMP7, \TMP6
618
619.L_get_AAD_done\@:
620	movdqu \TMP6, AadHash(%arg2)
621.endm
622
623# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
624# between update calls.
625# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
626# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
627# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
#
# CYPH_PLAIN_OUT / PLAIN_CYPH_IN are the output and input pointer registers,
# PLAIN_CYPH_LEN the input length register, DATA_OFFSET the register that
# is advanced by the number of bytes written, AAD_HASH the xmm register
# holding the running hash.  Nothing is done when PBlockLen in the context
# is zero.
628.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
629	AAD_HASH operation
630	mov 	PBlockLen(%arg2), %r13
631	test	%r13, %r13
632	je	.L_partial_block_done_\@	# Leave Macro if no partial blocks
633	# Read in input data without over reading
634	cmp	$16, \PLAIN_CYPH_LEN
635	jl	.L_fewer_than_16_bytes_\@
636	movups	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
637	jmp	.L_data_read_\@
638
639.L_fewer_than_16_bytes_\@:
640	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
641	mov	\PLAIN_CYPH_LEN, %r12
642	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
643
644	mov PBlockLen(%arg2), %r13
645
646.L_data_read_\@:				# Finished reading in data
647
	# xmm9 = keystream block saved in the context for this partial block
648	movdqu	PBlockEncKey(%arg2), %xmm9
649	movdqu	HashKey(%arg2), %xmm13
650
651	lea	SHIFT_MASK(%rip), %r12
652
653	# adjust the shuffle mask pointer to be able to shift r13 bytes
654	# (r13 = PBlockLen, the bytes of this block consumed by earlier calls)
655	add	%r13, %r12
656	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
657	pshufb	%xmm2, %xmm9		# shift right r13 bytes
658
659.ifc \operation, dec
660	movdqa	%xmm1, %xmm3		# keep the ciphertext for hashing
661	pxor	%xmm1, %xmm9		# Ciphertext XOR E(K, Yn)
662
663	mov	\PLAIN_CYPH_LEN, %r10
664	add	%r13, %r10
665	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
666	sub	$16, %r10
667	# Determine if partial block is not being filled and
668	# shift mask accordingly
669	jge	.L_no_extra_mask_1_\@
670	sub	%r10, %r12
671.L_no_extra_mask_1_\@:
672
	# r12 still points into SHIFT_MASK; the displacement moves the same
	# offset over to ALL_F and the zero block behind it.
673	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
674	# get the appropriate mask to mask out bottom r13 bytes of xmm9
675	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9
676
677	pand	%xmm1, %xmm3
678	movdqa	SHUF_MASK(%rip), %xmm10
679	pshufb	%xmm10, %xmm3
680	pshufb	%xmm2, %xmm3
681	pxor	%xmm3, \AAD_HASH
682
683	test	%r10, %r10
684	jl	.L_partial_incomplete_1_\@
685
686	# GHASH computation for the last <16 Byte block
687	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
688	xor	%eax, %eax
689
	# block complete: no partial block is pending any more
690	mov	%rax, PBlockLen(%arg2)
691	jmp	.L_dec_done_\@
692.L_partial_incomplete_1_\@:
	# block still short: only account for the bytes added by this call
693	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
694.L_dec_done_\@:
695	movdqu	\AAD_HASH, AadHash(%arg2)
696.else
697	pxor	%xmm1, %xmm9			# Plaintext XOR E(K, Yn)
698
699	mov	\PLAIN_CYPH_LEN, %r10
700	add	%r13, %r10
701	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
702	sub	$16, %r10
703	# Determine if partial block is not being filled and
704	# shift mask accordingly
705	jge	.L_no_extra_mask_2_\@
706	sub	%r10, %r12
707.L_no_extra_mask_2_\@:
708
709	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
710	# get the appropriate mask to mask out bottom r13 bytes of xmm9
711	pand	%xmm1, %xmm9
712
713	movdqa	SHUF_MASK(%rip), %xmm1
714	pshufb	%xmm1, %xmm9
715	pshufb	%xmm2, %xmm9
716	pxor	%xmm9, \AAD_HASH
717
718	test	%r10, %r10
719	jl	.L_partial_incomplete_2_\@
720
721	# GHASH computation for the last <16 Byte block
722	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
723	xor	%eax, %eax
724
	# block complete: no partial block is pending any more
725	mov	%rax, PBlockLen(%arg2)
726	jmp	.L_encode_done_\@
727.L_partial_incomplete_2_\@:
	# block still short: only account for the bytes added by this call
728	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
729.L_encode_done_\@:
730	movdqu	\AAD_HASH, AadHash(%arg2)
731
732	movdqa	SHUF_MASK(%rip), %xmm10
733	# shuffle xmm9 back to output as ciphertext
734	pshufb	%xmm10, %xmm9
735	pshufb	%xmm2, %xmm9
736.endif
737	# output encrypted Bytes
	# r13 = bytes to write: 16 - PBlockLen when the block was completed
	# (r10 >= 0), otherwise every input byte of this call.
738	test	%r10, %r10
739	jl	.L_partial_fill_\@
740	mov	%r13, %r12
741	mov	$16, %r13
742	# Set r13 to be the number of bytes to write out
743	sub	%r12, %r13
744	jmp	.L_count_set_\@
745.L_partial_fill_\@:
746	mov	\PLAIN_CYPH_LEN, %r13
747.L_count_set_\@:
748	movdqa	%xmm9, %xmm0
749	movq	%xmm0, %rax
750	cmp	$8, %r13
751	jle	.L_less_than_8_bytes_left_\@
752
	# more than 8 bytes: store the low qword at once
753	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
754	add	$8, \DATA_OFFSET
755	psrldq	$8, %xmm0
756	movq	%xmm0, %rax
757	sub	$8, %r13
758.L_less_than_8_bytes_left_\@:
	# store the remaining bytes one at a time
759	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
760	add	$1, \DATA_OFFSET
761	shr	$8, %rax
762	sub	$1, %r13
763	jne	.L_less_than_8_bytes_left_\@
764.L_partial_block_done_\@:
765.endm # PARTIAL_BLOCK
766
767/*
768* if a = number of total plaintext bytes
769* b = floor(a/16)
770* num_initial_blocks = b mod 4
771* encrypt the initial num_initial_blocks blocks and apply ghash on
772* the ciphertext
773* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
774* are clobbered
775* arg1, %arg2, %arg3 are used as a pointer only, not modified
*
* %arg4 is read as the input pointer and %r11 is the data offset, advanced
* by 16 per initial block and by 64 for the four pre-ciphered blocks.
*
* i     - number of the xmm register that receives AadHash (5, 6, 7 or 8)
* i_seq - digits naming the xmm registers of the initial blocks (678, 78,
*         8), or 0 together with i == 8 when there are none
*
* The GHASH chain and one AES loop name %xmm1-%xmm8 directly, so the macro
* relies on XMM1-XMM4 being %xmm1-%xmm4 and on the running hash ending up
* in %xmm8, as at the call sites in GCM_ENC_DEC.
*
* %r13 is read but not modified: when it is at least 64, four further
* counter blocks are ciphered here and left byte-reflected in XMM1-XMM4
* for the caller to hash.
776*/
777
778
779.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
780	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
781	MOVADQ		SHUF_MASK(%rip), %xmm14
782
783	movdqu AadHash(%arg2), %xmm\i		    # xmm<i> = current hash value
784
785	# start AES for num_initial_blocks blocks
786
787	movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0
788
789.if (\i == 5) || (\i == 6) || (\i == 7)
790
791	MOVADQ		ONE(%RIP),\TMP1
792	MOVADQ		0(%arg1),\TMP2
793.irpc index, \i_seq
794	paddd		\TMP1, \XMM0                 # INCR Y0
795.ifc \operation, dec
796        movdqa     \XMM0, %xmm\index
797.else
798	MOVADQ		\XMM0, %xmm\index
799.endif
800	pshufb	%xmm14, %xmm\index      # perform a 16 byte swap
801	pxor		\TMP2, %xmm\index	# xor with the round key at offset 0
802.endr
803	lea	0x10(%arg1),%r10
804	mov	keysize,%eax
805	shr	$2,%eax				# 128->4, 192->6, 256->8
806	add	$5,%eax			      # 128->9, 192->11, 256->13
807
808.Laes_loop_initial_\@:
	# one aesenc round per iteration on every initial block
809	MOVADQ	(%r10),\TMP1
810.irpc	index, \i_seq
811	aesenc	\TMP1, %xmm\index
812.endr
813	add	$16,%r10
814	sub	$1,%eax
815	jnz	.Laes_loop_initial_\@
816
817	MOVADQ	(%r10), \TMP1
818.irpc index, \i_seq
819	aesenclast \TMP1, %xmm\index         # Last Round
820.endr
821.irpc index, \i_seq
822	movdqu	   (%arg4 , %r11, 1), \TMP1
823	pxor	   \TMP1, %xmm\index
824	movdqu	   %xmm\index, (%arg3 , %r11, 1)
825	# write back plaintext/ciphertext for num_initial_blocks
826	add	   $16, %r11
827
828.ifc \operation, dec
	# decrypting: the hash input is the ciphertext that was just read
829	movdqa     \TMP1, %xmm\index
830.endif
831	pshufb	   %xmm14, %xmm\index
832
833		# prepare plaintext/ciphertext for GHASH computation
834.endr
835.endif
836
837        # apply GHASH on num_initial_blocks blocks
838
	# Chain: xor the running hash into the next block, multiply by the
	# hash key in TMP3, repeat.  Every variant finishes in xmm8.
839.if \i == 5
840        pxor       %xmm5, %xmm6
841	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
842        pxor       %xmm6, %xmm7
843	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
844        pxor       %xmm7, %xmm8
845	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
846.elseif \i == 6
847        pxor       %xmm6, %xmm7
848	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
849        pxor       %xmm7, %xmm8
850	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
851.elseif \i == 7
852        pxor       %xmm7, %xmm8
853	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
854.endif
	# Fewer than 64 bytes in r13: nothing to pre-cipher for the
	# 4-block loop.
855	cmp	   $64, %r13
856	jl	.L_initial_blocks_done\@
857	# no need for precomputed values
858/*
859*
860* Precomputations for HashKey parallel with encryption of first 4 blocks.
861* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
862*/
863	MOVADQ	   ONE(%RIP),\TMP1
864	paddd	   \TMP1, \XMM0              # INCR Y0
865	MOVADQ	   \XMM0, \XMM1
866	pshufb  %xmm14, \XMM1        # perform a 16 byte swap
867
868	paddd	   \TMP1, \XMM0              # INCR Y0
869	MOVADQ	   \XMM0, \XMM2
870	pshufb  %xmm14, \XMM2        # perform a 16 byte swap
871
872	paddd	   \TMP1, \XMM0              # INCR Y0
873	MOVADQ	   \XMM0, \XMM3
874	pshufb %xmm14, \XMM3        # perform a 16 byte swap
875
876	paddd	   \TMP1, \XMM0              # INCR Y0
877	MOVADQ	   \XMM0, \XMM4
878	pshufb %xmm14, \XMM4        # perform a 16 byte swap
879
880	MOVADQ	   0(%arg1),\TMP1
881	pxor	   \TMP1, \XMM1
882	pxor	   \TMP1, \XMM2
883	pxor	   \TMP1, \XMM3
884	pxor	   \TMP1, \XMM4
885.irpc index, 1234 # do 4 rounds
886	movaps 0x10*\index(%arg1), \TMP1
887	aesenc	   \TMP1, \XMM1
888	aesenc	   \TMP1, \XMM2
889	aesenc	   \TMP1, \XMM3
890	aesenc	   \TMP1, \XMM4
891.endr
892.irpc index, 56789 # do next 5 rounds
893	movaps 0x10*\index(%arg1), \TMP1
894	aesenc	   \TMP1, \XMM1
895	aesenc	   \TMP1, \XMM2
896	aesenc	   \TMP1, \XMM3
897	aesenc	   \TMP1, \XMM4
898.endr
	# Rounds beyond the ninth exist only for the longer key sizes.
899	lea	   0xa0(%arg1),%r10
900	mov	   keysize,%eax
901	shr	   $2,%eax			# 128->4, 192->6, 256->8
902	sub	   $4,%eax			# 128->0, 192->2, 256->4
903	jz	   .Laes_loop_pre_done\@
904
905.Laes_loop_pre_\@:
906	MOVADQ	   (%r10),\TMP2
	# addresses xmm1-xmm4 by number rather than through XMM1-XMM4
907.irpc	index, 1234
908	aesenc	   \TMP2, %xmm\index
909.endr
910	add	   $16,%r10
911	sub	   $1,%eax
912	jnz	   .Laes_loop_pre_\@
913
914.Laes_loop_pre_done\@:
915	MOVADQ	   (%r10), \TMP2
916	aesenclast \TMP2, \XMM1
917	aesenclast \TMP2, \XMM2
918	aesenclast \TMP2, \XMM3
919	aesenclast \TMP2, \XMM4
	# xor the keystream with four input blocks.  When decrypting, each
	# result is stored at once and the register is reloaded with the
	# ciphertext, because that is what gets hashed.
920	movdqu	   16*0(%arg4 , %r11 , 1), \TMP1
921	pxor	   \TMP1, \XMM1
922.ifc \operation, dec
923	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
924	movdqa     \TMP1, \XMM1
925.endif
926	movdqu	   16*1(%arg4 , %r11 , 1), \TMP1
927	pxor	   \TMP1, \XMM2
928.ifc \operation, dec
929	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
930	movdqa     \TMP1, \XMM2
931.endif
932	movdqu	   16*2(%arg4 , %r11 , 1), \TMP1
933	pxor	   \TMP1, \XMM3
934.ifc \operation, dec
935	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
936	movdqa     \TMP1, \XMM3
937.endif
938	movdqu	   16*3(%arg4 , %r11 , 1), \TMP1
939	pxor	   \TMP1, \XMM4
940.ifc \operation, dec
941	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
942	movdqa     \TMP1, \XMM4
943.else
944	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
945	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
946	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
947	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
948.endif
949
950	add	   $64, %r11
951	pshufb %xmm14, \XMM1 # perform a 16 byte swap
952	pxor	   \XMMDst, \XMM1
953# combine GHASHed value with the corresponding ciphertext
954	pshufb %xmm14, \XMM2 # perform a 16 byte swap
955	pshufb %xmm14, \XMM3 # perform a 16 byte swap
956	pshufb %xmm14, \XMM4 # perform a 16 byte swap
957
958.L_initial_blocks_done\@:
959
960.endm
961
962/*
963* encrypt 4 blocks at a time
964* ghash the 4 previously encrypted ciphertext blocks
965* arg1, %arg3, %arg4 are used as pointers only, not modified
966* %r11 is the data offset value
967*/
968.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
969TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
970
971	movdqa	  \XMM1, \XMM5
972	movdqa	  \XMM2, \XMM6
973	movdqa	  \XMM3, \XMM7
974	movdqa	  \XMM4, \XMM8
975
976        movdqa    SHUF_MASK(%rip), %xmm15
977        # multiply TMP5 * HashKey using karatsuba
978
979	movdqa	  \XMM5, \TMP4
980	pshufd	  $78, \XMM5, \TMP6
981	pxor	  \XMM5, \TMP6
982	paddd     ONE(%rip), \XMM0		# INCR CNT
983	movdqu	  HashKey_4(%arg2), \TMP5
984	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
985	movdqa    \XMM0, \XMM1
986	paddd     ONE(%rip), \XMM0		# INCR CNT
987	movdqa    \XMM0, \XMM2
988	paddd     ONE(%rip), \XMM0		# INCR CNT
989	movdqa    \XMM0, \XMM3
990	paddd     ONE(%rip), \XMM0		# INCR CNT
991	movdqa    \XMM0, \XMM4
992	pshufb %xmm15, \XMM1	# perform a 16 byte swap
993	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
994	pshufb %xmm15, \XMM2	# perform a 16 byte swap
995	pshufb %xmm15, \XMM3	# perform a 16 byte swap
996	pshufb %xmm15, \XMM4	# perform a 16 byte swap
997
998	pxor	  (%arg1), \XMM1
999	pxor	  (%arg1), \XMM2
1000	pxor	  (%arg1), \XMM3
1001	pxor	  (%arg1), \XMM4
1002	movdqu	  HashKey_4_k(%arg2), \TMP5
1003	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1004	movaps 0x10(%arg1), \TMP1
1005	aesenc	  \TMP1, \XMM1              # Round 1
1006	aesenc	  \TMP1, \XMM2
1007	aesenc	  \TMP1, \XMM3
1008	aesenc	  \TMP1, \XMM4
1009	movaps 0x20(%arg1), \TMP1
1010	aesenc	  \TMP1, \XMM1              # Round 2
1011	aesenc	  \TMP1, \XMM2
1012	aesenc	  \TMP1, \XMM3
1013	aesenc	  \TMP1, \XMM4
1014	movdqa	  \XMM6, \TMP1
1015	pshufd	  $78, \XMM6, \TMP2
1016	pxor	  \XMM6, \TMP2
1017	movdqu	  HashKey_3(%arg2), \TMP5
1018	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1019	movaps 0x30(%arg1), \TMP3
1020	aesenc    \TMP3, \XMM1              # Round 3
1021	aesenc    \TMP3, \XMM2
1022	aesenc    \TMP3, \XMM3
1023	aesenc    \TMP3, \XMM4
1024	pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1025	movaps 0x40(%arg1), \TMP3
1026	aesenc	  \TMP3, \XMM1              # Round 4
1027	aesenc	  \TMP3, \XMM2
1028	aesenc	  \TMP3, \XMM3
1029	aesenc	  \TMP3, \XMM4
1030	movdqu	  HashKey_3_k(%arg2), \TMP5
1031	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1032	movaps 0x50(%arg1), \TMP3
1033	aesenc	  \TMP3, \XMM1              # Round 5
1034	aesenc	  \TMP3, \XMM2
1035	aesenc	  \TMP3, \XMM3
1036	aesenc	  \TMP3, \XMM4
1037	pxor	  \TMP1, \TMP4
1038# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1039	pxor	  \XMM6, \XMM5
1040	pxor	  \TMP2, \TMP6
1041	movdqa	  \XMM7, \TMP1
1042	pshufd	  $78, \XMM7, \TMP2
1043	pxor	  \XMM7, \TMP2
1044	movdqu	  HashKey_2(%arg2), \TMP5
1045
1046        # Multiply TMP5 * HashKey using karatsuba
1047
1048	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1049	movaps 0x60(%arg1), \TMP3
1050	aesenc	  \TMP3, \XMM1              # Round 6
1051	aesenc	  \TMP3, \XMM2
1052	aesenc	  \TMP3, \XMM3
1053	aesenc	  \TMP3, \XMM4
1054	pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1055	movaps 0x70(%arg1), \TMP3
1056	aesenc	  \TMP3, \XMM1              # Round 7
1057	aesenc	  \TMP3, \XMM2
1058	aesenc	  \TMP3, \XMM3
1059	aesenc	  \TMP3, \XMM4
1060	movdqu	  HashKey_2_k(%arg2), \TMP5
1061	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1062	movaps 0x80(%arg1), \TMP3
1063	aesenc	  \TMP3, \XMM1              # Round 8
1064	aesenc	  \TMP3, \XMM2
1065	aesenc	  \TMP3, \XMM3
1066	aesenc	  \TMP3, \XMM4
1067	pxor	  \TMP1, \TMP4
1068# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1069	pxor	  \XMM7, \XMM5
1070	pxor	  \TMP2, \TMP6
1071
1072        # Multiply XMM8 * HashKey
1073        # XMM8 and TMP5 hold the values for the two operands
1074
1075	movdqa	  \XMM8, \TMP1
1076	pshufd	  $78, \XMM8, \TMP2
1077	pxor	  \XMM8, \TMP2
1078	movdqu	  HashKey(%arg2), \TMP5
1079	pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1080	movaps 0x90(%arg1), \TMP3
1081	aesenc	  \TMP3, \XMM1             # Round 9
1082	aesenc	  \TMP3, \XMM2
1083	aesenc	  \TMP3, \XMM3
1084	aesenc	  \TMP3, \XMM4
1085	pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1086	lea	  0xa0(%arg1),%r10
1087	mov	  keysize,%eax
1088	shr	  $2,%eax			# 128->4, 192->6, 256->8
1089	sub	  $4,%eax			# 128->0, 192->2, 256->4
1090	jz	  .Laes_loop_par_enc_done\@
1091
1092.Laes_loop_par_enc\@:
1093	MOVADQ	  (%r10),\TMP3
1094.irpc	index, 1234
1095	aesenc	  \TMP3, %xmm\index
1096.endr
1097	add	  $16,%r10
1098	sub	  $1,%eax
1099	jnz	  .Laes_loop_par_enc\@
1100
1101.Laes_loop_par_enc_done\@:
1102	MOVADQ	  (%r10), \TMP3
1103	aesenclast \TMP3, \XMM1           # Round 10
1104	aesenclast \TMP3, \XMM2
1105	aesenclast \TMP3, \XMM3
1106	aesenclast \TMP3, \XMM4
1107	movdqu    HashKey_k(%arg2), \TMP5
1108	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1109	movdqu	  (%arg4,%r11,1), \TMP3
1110	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1111	movdqu	  16(%arg4,%r11,1), \TMP3
1112	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1113	movdqu	  32(%arg4,%r11,1), \TMP3
1114	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1115	movdqu	  48(%arg4,%r11,1), \TMP3
1116	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1117        movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
1118        movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
1119        movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
1120        movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
1121	pshufb %xmm15, \XMM1        # perform a 16 byte swap
1122	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1123	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1124	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1125
1126	pxor	  \TMP4, \TMP1
1127	pxor	  \XMM8, \XMM5
1128	pxor	  \TMP6, \TMP2
1129	pxor	  \TMP1, \TMP2
1130	pxor	  \XMM5, \TMP2
1131	movdqa	  \TMP2, \TMP3
1132	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1133	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1134	pxor	  \TMP3, \XMM5
1135	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1136
1137        # first phase of reduction
1138
1139	movdqa    \XMM5, \TMP2
1140	movdqa    \XMM5, \TMP3
1141	movdqa    \XMM5, \TMP4
1142# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1143	pslld     $31, \TMP2                   # packed right shift << 31
1144	pslld     $30, \TMP3                   # packed right shift << 30
1145	pslld     $25, \TMP4                   # packed right shift << 25
1146	pxor      \TMP3, \TMP2	               # xor the shifted versions
1147	pxor      \TMP4, \TMP2
1148	movdqa    \TMP2, \TMP5
1149	psrldq    $4, \TMP5                    # right shift T5 1 DW
1150	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1151	pxor      \TMP2, \XMM5
1152
1153        # second phase of reduction
1154
1155	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1156	movdqa    \XMM5,\TMP3
1157	movdqa    \XMM5,\TMP4
1158	psrld     $1, \TMP2                    # packed left shift >>1
1159	psrld     $2, \TMP3                    # packed left shift >>2
1160	psrld     $7, \TMP4                    # packed left shift >>7
1161	pxor      \TMP3,\TMP2		       # xor the shifted versions
1162	pxor      \TMP4,\TMP2
1163	pxor      \TMP5, \TMP2
1164	pxor      \TMP2, \XMM5
1165	pxor      \TMP1, \XMM5                 # result is in TMP1
1166
1167	pxor	  \XMM5, \XMM1
1168.endm
1169
1170/*
1171* decrypt 4 blocks at a time
1172* ghash the 4 previously decrypted ciphertext blocks
1173* arg1, %arg3, %arg4 are used as pointers only, not modified
1174* %r11 is the data offset value
*
* On entry XMM1-XMM4 hold the four blocks to be hashed and XMM0 holds the
* counter block.  The macro copies XMM1-XMM4 to XMM5-XMM8, builds four new
* counter blocks in XMM1-XMM4 and interleaves their AES rounds with the
* Karatsuba multiplications of XMM5-XMM8 by HashKey_4, HashKey_3, HashKey_2
* and HashKey.  The 64 bytes read at arg4 + r11 are XORed with the keystream
* and written to arg3 + r11; the bytes that were read are then kept
* (byte-swapped) in XMM1-XMM4 so the next pass can hash them, and the reduced
* hash is XORed into XMM1.
*
* Round keys are read through arg1 and the hash key table through arg2.
* Clobbers %eax, %r10 and %xmm15 in addition to the macro arguments.
* XMM1-XMM4 have to be %xmm1-%xmm4 because the variable-round loop names those
* registers directly.  The "operation" argument is not referenced in the body.
1175*/
1176.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
1177TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1178
1179	movdqa	  \XMM1, \XMM5		# keep the blocks to hash; XMM1-XMM4 are reused below
1180	movdqa	  \XMM2, \XMM6
1181	movdqa	  \XMM3, \XMM7
1182	movdqa	  \XMM4, \XMM8
1183
1184        movdqa    SHUF_MASK(%rip), %xmm15
1185        # multiply XMM5 * HashKey_4 using karatsuba
1186
1187	movdqa	  \XMM5, \TMP4
1188	pshufd	  $78, \XMM5, \TMP6		# swap the two 64-bit halves
1189	pxor	  \XMM5, \TMP6			# TMP6 = a1+a0 in each half
1190	paddd     ONE(%rip), \XMM0		# INCR CNT
1191	movdqu	  HashKey_4(%arg2), \TMP5
1192	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
1193	movdqa    \XMM0, \XMM1
1194	paddd     ONE(%rip), \XMM0		# INCR CNT
1195	movdqa    \XMM0, \XMM2
1196	paddd     ONE(%rip), \XMM0		# INCR CNT
1197	movdqa    \XMM0, \XMM3
1198	paddd     ONE(%rip), \XMM0		# INCR CNT
1199	movdqa    \XMM0, \XMM4
1200	pshufb %xmm15, \XMM1	# perform a 16 byte swap
1201	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1202	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1203	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1204	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1205
1206	pxor	  (%arg1), \XMM1		# Round 0: XOR with the first round key
1207	pxor	  (%arg1), \XMM2
1208	pxor	  (%arg1), \XMM3
1209	pxor	  (%arg1), \XMM4
1210	movdqu	  HashKey_4_k(%arg2), \TMP5
1211	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1212	movaps 0x10(%arg1), \TMP1
1213	aesenc	  \TMP1, \XMM1              # Round 1
1214	aesenc	  \TMP1, \XMM2
1215	aesenc	  \TMP1, \XMM3
1216	aesenc	  \TMP1, \XMM4
1217	movaps 0x20(%arg1), \TMP1
1218	aesenc	  \TMP1, \XMM1              # Round 2
1219	aesenc	  \TMP1, \XMM2
1220	aesenc	  \TMP1, \XMM3
1221	aesenc	  \TMP1, \XMM4
	# multiply XMM6 * HashKey_3 using karatsuba
1222	movdqa	  \XMM6, \TMP1
1223	pshufd	  $78, \XMM6, \TMP2
1224	pxor	  \XMM6, \TMP2
1225	movdqu	  HashKey_3(%arg2), \TMP5
1226	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1227	movaps 0x30(%arg1), \TMP3
1228	aesenc    \TMP3, \XMM1              # Round 3
1229	aesenc    \TMP3, \XMM2
1230	aesenc    \TMP3, \XMM3
1231	aesenc    \TMP3, \XMM4
1232	pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1233	movaps 0x40(%arg1), \TMP3
1234	aesenc	  \TMP3, \XMM1              # Round 4
1235	aesenc	  \TMP3, \XMM2
1236	aesenc	  \TMP3, \XMM3
1237	aesenc	  \TMP3, \XMM4
1238	movdqu	  HashKey_3_k(%arg2), \TMP5
1239	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1240	movaps 0x50(%arg1), \TMP3
1241	aesenc	  \TMP3, \XMM1              # Round 5
1242	aesenc	  \TMP3, \XMM2
1243	aesenc	  \TMP3, \XMM3
1244	aesenc	  \TMP3, \XMM4
1245	pxor	  \TMP1, \TMP4
1246# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1247	pxor	  \XMM6, \XMM5
1248	pxor	  \TMP2, \TMP6
1249	movdqa	  \XMM7, \TMP1
1250	pshufd	  $78, \XMM7, \TMP2
1251	pxor	  \XMM7, \TMP2
1252	movdqu	  HashKey_2(%arg2), \TMP5
1253
1254        # Multiply XMM7 * HashKey_2 using karatsuba
1255
1256	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1257	movaps 0x60(%arg1), \TMP3
1258	aesenc	  \TMP3, \XMM1              # Round 6
1259	aesenc	  \TMP3, \XMM2
1260	aesenc	  \TMP3, \XMM3
1261	aesenc	  \TMP3, \XMM4
1262	pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1263	movaps 0x70(%arg1), \TMP3
1264	aesenc	  \TMP3, \XMM1              # Round 7
1265	aesenc	  \TMP3, \XMM2
1266	aesenc	  \TMP3, \XMM3
1267	aesenc	  \TMP3, \XMM4
1268	movdqu	  HashKey_2_k(%arg2), \TMP5
1269	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1270	movaps 0x80(%arg1), \TMP3
1271	aesenc	  \TMP3, \XMM1              # Round 8
1272	aesenc	  \TMP3, \XMM2
1273	aesenc	  \TMP3, \XMM3
1274	aesenc	  \TMP3, \XMM4
1275	pxor	  \TMP1, \TMP4
1276# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1277	pxor	  \XMM7, \XMM5
1278	pxor	  \TMP2, \TMP6
1279
1280        # Multiply XMM8 * HashKey
1281        # XMM8 and TMP5 hold the values for the two operands
1282
1283	movdqa	  \XMM8, \TMP1
1284	pshufd	  $78, \XMM8, \TMP2
1285	pxor	  \XMM8, \TMP2
1286	movdqu	  HashKey(%arg2), \TMP5
1287	pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1288	movaps 0x90(%arg1), \TMP3
1289	aesenc	  \TMP3, \XMM1             # Round 9
1290	aesenc	  \TMP3, \XMM2
1291	aesenc	  \TMP3, \XMM3
1292	aesenc	  \TMP3, \XMM4
1293	pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1294	lea	  0xa0(%arg1),%r10		# r10 = address of the round key after round 9
1295	mov	  keysize,%eax
1296	shr	  $2,%eax		        # 128->4, 192->6, 256->8
1297	sub	  $4,%eax			# 128->0, 192->2, 256->4
1298	jz	  .Laes_loop_par_dec_done\@	# no extra rounds for a 128-bit key
1299
	# extra aesenc rounds for 192/256-bit keys; eax counts them down
1300.Laes_loop_par_dec\@:
1301	MOVADQ	  (%r10),\TMP3
1302.irpc	index, 1234
1303	aesenc	  \TMP3, %xmm\index
1304.endr
1305	add	  $16,%r10
1306	sub	  $1,%eax
1307	jnz	  .Laes_loop_par_dec\@
1308
1309.Laes_loop_par_dec_done\@:
1310	MOVADQ	  (%r10), \TMP3
1311	aesenclast \TMP3, \XMM1           # last round
1312	aesenclast \TMP3, \XMM2
1313	aesenclast \TMP3, \XMM3
1314	aesenclast \TMP3, \XMM4
1315	movdqu    HashKey_k(%arg2), \TMP5
1316	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1317	movdqu	  (%arg4,%r11,1), \TMP3
1318	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1319	movdqu	  \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
1320	movdqa    \TMP3, \XMM1		       # keep the input block for the next GHASH pass
1321	movdqu	  16(%arg4,%r11,1), \TMP3
1322	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1323	movdqu	  \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
1324	movdqa    \TMP3, \XMM2
1325	movdqu	  32(%arg4,%r11,1), \TMP3
1326	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1327	movdqu	  \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
1328	movdqa    \TMP3, \XMM3
1329	movdqu	  48(%arg4,%r11,1), \TMP3
1330	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1331	movdqu	  \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
1332	movdqa    \TMP3, \XMM4
1333	pshufb %xmm15, \XMM1        # perform a 16 byte swap
1334	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1335	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1336	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1337
	# combine the four partial products: TMP1 = high, XMM5 = low, TMP2 = middle
1338	pxor	  \TMP4, \TMP1
1339	pxor	  \XMM8, \XMM5
1340	pxor	  \TMP6, \TMP2
1341	pxor	  \TMP1, \TMP2
1342	pxor	  \XMM5, \TMP2
1343	movdqa	  \TMP2, \TMP3
1344	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1345	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1346	pxor	  \TMP3, \XMM5
1347	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1348
1349        # first phase of reduction
1350
1351	movdqa    \XMM5, \TMP2
1352	movdqa    \XMM5, \TMP3
1353	movdqa    \XMM5, \TMP4
1354# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1355	pslld     $31, \TMP2                   # packed left shift << 31
1356	pslld     $30, \TMP3                   # packed left shift << 30
1357	pslld     $25, \TMP4                   # packed left shift << 25
1358	pxor      \TMP3, \TMP2	               # xor the shifted versions
1359	pxor      \TMP4, \TMP2
1360	movdqa    \TMP2, \TMP5
1361	psrldq    $4, \TMP5                    # right shift T5 1 DW
1362	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1363	pxor      \TMP2, \XMM5
1364
1365        # second phase of reduction
1366
1367	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1368	movdqa    \XMM5,\TMP3
1369	movdqa    \XMM5,\TMP4
1370	psrld     $1, \TMP2                    # packed right shift >>1
1371	psrld     $2, \TMP3                    # packed right shift >>2
1372	psrld     $7, \TMP4                    # packed right shift >>7
1373	pxor      \TMP3,\TMP2		       # xor the shifted versions
1374	pxor      \TMP4,\TMP2
1375	pxor      \TMP5, \TMP2
1376	pxor      \TMP2, \XMM5
1377	pxor      \TMP1, \XMM5                 # result is in XMM5
1378
1379	pxor	  \XMM5, \XMM1		       # fold the hash into the first block kept for the next pass
1380.endm
1381
1382/* GHASH the last 4 ciphertext blocks.
 *
 * XMM1-XMM4 are multiplied by HashKey_4, HashKey_3, HashKey_2 and HashKey
 * (read through arg2) with Karatsuba multiplication, the products are summed
 * and reduced, and the reduced result is left in XMMDst.
 * XMM1-XMM4 and TMP1-TMP7 are overwritten.
 */
1383.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1384TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1385
1386        # Multiply XMM1 * HashKey_4 (using Karatsuba)
1387
1388	movdqa	  \XMM1, \TMP6
1389	pshufd	  $78, \XMM1, \TMP2         # swap the two 64-bit halves
1390	pxor	  \XMM1, \TMP2              # TMP2 = a1+a0 in each half
1391	movdqu	  HashKey_4(%arg2), \TMP5
1392	pclmulqdq $0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1393	pclmulqdq $0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1394	movdqu	  HashKey_4_k(%arg2), \TMP4
1395	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1396	movdqa	  \XMM1, \XMMDst
1397	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1398
1399        # Multiply XMM2 * HashKey_3 (using Karatsuba)
1400
1401	movdqa	  \XMM2, \TMP1
1402	pshufd	  $78, \XMM2, \TMP2
1403	pxor	  \XMM2, \TMP2
1404	movdqu	  HashKey_3(%arg2), \TMP5
1405	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1406	pclmulqdq $0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1407	movdqu	  HashKey_3_k(%arg2), \TMP4
1408	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1409	pxor	  \TMP1, \TMP6
1410	pxor	  \XMM2, \XMMDst
1411	pxor	  \TMP2, \XMM1
1412# results accumulated in TMP6, XMMDst, XMM1
1413
1414        # Multiply XMM3 * HashKey_2 (using Karatsuba)
1415
1416	movdqa	  \XMM3, \TMP1
1417	pshufd	  $78, \XMM3, \TMP2
1418	pxor	  \XMM3, \TMP2
1419	movdqu	  HashKey_2(%arg2), \TMP5
1420	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1421	pclmulqdq $0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1422	movdqu	  HashKey_2_k(%arg2), \TMP4
1423	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1424	pxor	  \TMP1, \TMP6
1425	pxor	  \XMM3, \XMMDst
1426	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1427
1428        # Multiply XMM4 * HashKey (using Karatsuba)
1429	movdqa	  \XMM4, \TMP1
1430	pshufd	  $78, \XMM4, \TMP2
1431	pxor	  \XMM4, \TMP2
1432	movdqu	  HashKey(%arg2), \TMP5
1433	pclmulqdq $0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
1434	pclmulqdq $0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1435	movdqu	  HashKey_k(%arg2), \TMP4
1436	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1437	pxor	  \TMP1, \TMP6
1438	pxor	  \XMM4, \XMMDst
1439	pxor	  \XMM1, \TMP2
1440	pxor	  \TMP6, \TMP2
1441	pxor	  \XMMDst, \TMP2
1442	# middle section of the temp results combined as in karatsuba algorithm
1443	movdqa	  \TMP2, \TMP4
1444	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
1445	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
1446	pxor	  \TMP4, \XMMDst
1447	pxor	  \TMP2, \TMP6
1448# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1449	# first phase of the reduction
1450	movdqa    \XMMDst, \TMP2
1451	movdqa    \XMMDst, \TMP3
1452	movdqa    \XMMDst, \TMP4
1453# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1454	pslld     $31, \TMP2                # packed left shifting << 31
1455	pslld     $30, \TMP3                # packed left shifting << 30
1456	pslld     $25, \TMP4                # packed left shifting << 25
1457	pxor      \TMP3, \TMP2              # xor the shifted versions
1458	pxor      \TMP4, \TMP2
1459	movdqa    \TMP2, \TMP7
1460	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1461	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1462	pxor      \TMP2, \XMMDst
1463
1464        # second phase of the reduction
1465	movdqa    \XMMDst, \TMP2
1466	# make 3 copies of XMMDst for doing 3 shift operations
1467	movdqa    \XMMDst, \TMP3
1468	movdqa    \XMMDst, \TMP4
1469	psrld     $1, \TMP2                 # packed right shift >> 1
1470	psrld     $2, \TMP3                 # packed right shift >> 2
1471	psrld     $7, \TMP4                 # packed right shift >> 7
1472	pxor      \TMP3, \TMP2              # xor the shifted versions
1473	pxor      \TMP4, \TMP2
1474	pxor      \TMP7, \TMP2
1475	pxor      \TMP2, \XMMDst
1476	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1477.endm
1478
1479
1480/* Encryption of a single block
1481* uses eax & r10
*
* XMM0: block to encrypt on entry, encrypted block on exit
* TMP1: scratch register that receives each round key
* The key schedule is read through arg1.  The block is XORed with the first
* round key, then keysize/4 + 5 aesenc rounds run in a loop (9, 11 or 13),
* followed by one aesenclast with the next round key.
1482*/
1483
1484.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1485
1486	pxor		(%arg1), \XMM0
1487	mov		keysize,%eax
1488	shr		$2,%eax			# 128->4, 192->6, 256->8
1489	add		$5,%eax			# 128->9, 192->11, 256->13
1490	lea		16(%arg1), %r10	  # get first expanded key address
1491
1492_esb_loop_\@:
1493	MOVADQ		(%r10),\TMP1
1494	aesenc		\TMP1,\XMM0
1495	add		$16,%r10
1496	sub		$1,%eax			# eax = aesenc rounds still to run
1497	jnz		_esb_loop_\@
1498
1499	MOVADQ		(%r10),\TMP1		# r10 now points at the last round key
1500	aesenclast	\TMP1,\XMM0
1501.endm
1502
1503/*****************************************************************************
1504* void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1505*                     struct gcm_context_data *data,
1506*                                         // context data
1507*                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1508*                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1509*                                         // concatenated with 0x00000001. 16-byte aligned pointer.
1510*                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1511*                     const u8 *aad,      // Additional Authentication Data (AAD)
1512*                     u64 aad_len)        // Length of AAD in bytes.
*
* Thin wrapper: the work is done by the GCM_INIT macro, which is handed
* iv, hash_subkey, aad and aad_len (arguments 3 to 6), bracketed by
* FUNC_SAVE / FUNC_RESTORE.
1513*/
1514SYM_FUNC_START(aesni_gcm_init)
1515	FUNC_SAVE
1516	GCM_INIT %arg3, %arg4,%arg5, %arg6
1517	FUNC_RESTORE
1518	RET
1519SYM_FUNC_END(aesni_gcm_init)
1520
1521/*****************************************************************************
1522* void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1523*                    struct gcm_context_data *data,
1524*                                        // context data
1525*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1526*                    const u8 *in,       // Plaintext input
1527*                    u64 plaintext_len)  // Length of data in bytes for encryption.
*
* Thin wrapper: expands GCM_ENC_DEC in its "enc" variant between
* FUNC_SAVE and FUNC_RESTORE.
1528*/
1529SYM_FUNC_START(aesni_gcm_enc_update)
1530	FUNC_SAVE
1531	GCM_ENC_DEC enc
1532	FUNC_RESTORE
1533	RET
1534SYM_FUNC_END(aesni_gcm_enc_update)
1535
1536/*****************************************************************************
1537* void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1538*                    struct gcm_context_data *data,
1539*                                        // context data
1540*                    u8 *out,            // Plaintext output. Decrypt in-place is allowed.
1541*                    const u8 *in,       // Ciphertext input
1542*                    u64 plaintext_len)  // Length of data in bytes for decryption.
*
* Thin wrapper: expands GCM_ENC_DEC in its "dec" variant between
* FUNC_SAVE and FUNC_RESTORE.
1543*/
1544SYM_FUNC_START(aesni_gcm_dec_update)
1545	FUNC_SAVE
1546	GCM_ENC_DEC dec
1547	FUNC_RESTORE
1548	RET
1549SYM_FUNC_END(aesni_gcm_dec_update)
1550
1551/*****************************************************************************
1552* void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1553*                    struct gcm_context_data *data,
1554*                                        // context data
1555*                    u8 *auth_tag,       // Authenticated Tag output.
1556*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1557*                                        // 12 or 8.
*
* Thin wrapper: the work is done by the GCM_COMPLETE macro, which is handed
* auth_tag and auth_tag_len (arguments 3 and 4), bracketed by
* FUNC_SAVE / FUNC_RESTORE.
1558*/
1559SYM_FUNC_START(aesni_gcm_finalize)
1560	FUNC_SAVE
1561	GCM_COMPLETE %arg3 %arg4
1562	FUNC_RESTORE
1563	RET
1564SYM_FUNC_END(aesni_gcm_finalize)
1565
1566#endif
1567
/*
 * _key_expansion_256a / _key_expansion_128:	internal helper of aesni_set_key
 * input:
 *	%xmm0:		round key to expand
 *	%xmm1:		aeskeygenassist result; dword 3 is broadcast and used
 *	%xmm4:		scratch whose low dword must be zero (the caller
 *			clears %xmm4 before the first call)
 *	TKEYP:		where the new round key is stored
 * output:
 *	%xmm0:		new round key, also stored at (TKEYP)
 *	TKEYP:		advanced by 0x10
 * changed:
 *	%xmm1, %xmm4
 */
1568SYM_FUNC_START_LOCAL(_key_expansion_256a)
1569	pshufd $0b11111111, %xmm1, %xmm1	# broadcast dword 3 of xmm1
1570	shufps $0b00010000, %xmm0, %xmm4
1571	pxor %xmm4, %xmm0
1572	shufps $0b10001100, %xmm0, %xmm4
1573	pxor %xmm4, %xmm0			# xmm0 dwords are now running XORs of the old ones
1574	pxor %xmm1, %xmm0
1575	movaps %xmm0, (TKEYP)
1576	add $0x10, TKEYP
1577	RET
1578SYM_FUNC_END(_key_expansion_256a)
1579SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)
1580
/*
 * _key_expansion_192a:	internal helper of aesni_set_key (192-bit keys)
 * input:
 *	%xmm0, %xmm2:	current key material
 *	%xmm1:		aeskeygenassist result; dword 1 is broadcast and used
 *	%xmm4:		scratch whose low dword must be zero
 *	TKEYP:		where the output is stored
 * output:
 *	%xmm0, %xmm2:	updated key material
 *	32 bytes stored at (TKEYP) and 0x10(TKEYP); TKEYP advanced by 0x20
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5, %xmm6
 */
1581SYM_FUNC_START_LOCAL(_key_expansion_192a)
1582	pshufd $0b01010101, %xmm1, %xmm1	# broadcast dword 1 of xmm1
1583	shufps $0b00010000, %xmm0, %xmm4
1584	pxor %xmm4, %xmm0
1585	shufps $0b10001100, %xmm0, %xmm4
1586	pxor %xmm4, %xmm0
1587	pxor %xmm1, %xmm0
1588
1589	movaps %xmm2, %xmm5
1590	movaps %xmm2, %xmm6			# xmm6 keeps the old xmm2 for the first store
1591	pslldq $4, %xmm5
1592	pshufd $0b11111111, %xmm0, %xmm3	# broadcast dword 3 of the new xmm0
1593	pxor %xmm3, %xmm2
1594	pxor %xmm5, %xmm2
1595
1596	movaps %xmm0, %xmm1
1597	shufps $0b01000100, %xmm0, %xmm6	# xmm6 = low half of old xmm2 : low half of new xmm0
1598	movaps %xmm6, (TKEYP)
1599	shufps $0b01001110, %xmm2, %xmm1	# xmm1 = high half of new xmm0 : low half of new xmm2
1600	movaps %xmm1, 0x10(TKEYP)
1601	add $0x20, TKEYP
1602	RET
1603SYM_FUNC_END(_key_expansion_192a)
1604
/*
 * _key_expansion_192b:	internal helper of aesni_set_key (192-bit keys)
 * input:
 *	%xmm0, %xmm2:	current key material
 *	%xmm1:		aeskeygenassist result; dword 1 is broadcast and used
 *	%xmm4:		scratch whose low dword must be zero
 *	TKEYP:		where the output is stored
 * output:
 *	%xmm0, %xmm2:	updated key material
 *	16 bytes (the new %xmm0) stored at (TKEYP); TKEYP advanced by 0x10
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5
 */
1605SYM_FUNC_START_LOCAL(_key_expansion_192b)
1606	pshufd $0b01010101, %xmm1, %xmm1	# broadcast dword 1 of xmm1
1607	shufps $0b00010000, %xmm0, %xmm4
1608	pxor %xmm4, %xmm0
1609	shufps $0b10001100, %xmm0, %xmm4
1610	pxor %xmm4, %xmm0
1611	pxor %xmm1, %xmm0
1612
1613	movaps %xmm2, %xmm5
1614	pslldq $4, %xmm5
1615	pshufd $0b11111111, %xmm0, %xmm3	# broadcast dword 3 of the new xmm0
1616	pxor %xmm3, %xmm2
1617	pxor %xmm5, %xmm2
1618
1619	movaps %xmm0, (TKEYP)
1620	add $0x10, TKEYP
1621	RET
1622SYM_FUNC_END(_key_expansion_192b)
1623
/*
 * _key_expansion_256b:	internal helper of aesni_set_key (256-bit keys)
 * input:
 *	%xmm2:		round key to expand
 *	%xmm1:		aeskeygenassist result; dword 2 is broadcast and used
 *	%xmm4:		scratch whose low dword must be zero
 *	TKEYP:		where the new round key is stored
 * output:
 *	%xmm2:		new round key, also stored at (TKEYP)
 *	TKEYP:		advanced by 0x10
 * changed:
 *	%xmm1, %xmm4
 */
1624SYM_FUNC_START_LOCAL(_key_expansion_256b)
1625	pshufd $0b10101010, %xmm1, %xmm1	# broadcast dword 2 of xmm1
1626	shufps $0b00010000, %xmm2, %xmm4
1627	pxor %xmm4, %xmm2
1628	shufps $0b10001100, %xmm2, %xmm4
1629	pxor %xmm4, %xmm2
1630	pxor %xmm1, %xmm2
1631	movaps %xmm2, (TKEYP)
1632	add $0x10, TKEYP
1633	RET
1634SYM_FUNC_END(_key_expansion_256b)
1635
1636/*
1637 * void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1638 *                    unsigned int key_len)
 *
 * Expands in_key into ctx.  As written by this function, ctx holds the
 * encryption round keys from offset 0, the decryption round keys from
 * offset 240 and key_len at offset 480.  Only the low byte of key_len is
 * inspected: below 24 takes the 128-bit path, exactly 24 the 192-bit path,
 * anything larger the 256-bit path.  ctx is written with aligned stores
 * (movaps); in_key is read with unaligned loads.
1639 */
1640SYM_FUNC_START(aesni_set_key)
1641	FRAME_BEGIN
1642#ifndef __x86_64__
1643	pushl KEYP
1644	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
1645	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
1646	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
1647#endif
1648	movups (UKEYP), %xmm0		# user key (first 16 bytes)
1649	movaps %xmm0, (KEYP)
1650	lea 0x10(KEYP), TKEYP		# key addr
1651	movl %edx, 480(KEYP)		# save key_len in the context
1652	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
1653	cmp $24, %dl
1654	jb .Lenc_key128
1655	je .Lenc_key192
	/* 256-bit key: the second 16 bytes of the user key are the second round key */
1656	movups 0x10(UKEYP), %xmm2	# other user key
1657	movaps %xmm2, (TKEYP)
1658	add $0x10, TKEYP
1659	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
1660	call _key_expansion_256a
1661	aeskeygenassist $0x1, %xmm0, %xmm1
1662	call _key_expansion_256b
1663	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
1664	call _key_expansion_256a
1665	aeskeygenassist $0x2, %xmm0, %xmm1
1666	call _key_expansion_256b
1667	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
1668	call _key_expansion_256a
1669	aeskeygenassist $0x4, %xmm0, %xmm1
1670	call _key_expansion_256b
1671	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
1672	call _key_expansion_256a
1673	aeskeygenassist $0x8, %xmm0, %xmm1
1674	call _key_expansion_256b
1675	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
1676	call _key_expansion_256a
1677	aeskeygenassist $0x10, %xmm0, %xmm1
1678	call _key_expansion_256b
1679	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
1680	call _key_expansion_256a
1681	aeskeygenassist $0x20, %xmm0, %xmm1
1682	call _key_expansion_256b
1683	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
1684	call _key_expansion_256a
1685	jmp .Ldec_key
1686.Lenc_key192:
1687	movq 0x10(UKEYP), %xmm2		# other user key
1688	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
1689	call _key_expansion_192a
1690	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
1691	call _key_expansion_192b
1692	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
1693	call _key_expansion_192a
1694	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
1695	call _key_expansion_192b
1696	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
1697	call _key_expansion_192a
1698	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
1699	call _key_expansion_192b
1700	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
1701	call _key_expansion_192a
1702	aeskeygenassist $0x80, %xmm2, %xmm1	# round 8
1703	call _key_expansion_192b
1704	jmp .Ldec_key
1705.Lenc_key128:
1706	aeskeygenassist $0x1, %xmm0, %xmm1	# round 1
1707	call _key_expansion_128
1708	aeskeygenassist $0x2, %xmm0, %xmm1	# round 2
1709	call _key_expansion_128
1710	aeskeygenassist $0x4, %xmm0, %xmm1	# round 3
1711	call _key_expansion_128
1712	aeskeygenassist $0x8, %xmm0, %xmm1	# round 4
1713	call _key_expansion_128
1714	aeskeygenassist $0x10, %xmm0, %xmm1	# round 5
1715	call _key_expansion_128
1716	aeskeygenassist $0x20, %xmm0, %xmm1	# round 6
1717	call _key_expansion_128
1718	aeskeygenassist $0x40, %xmm0, %xmm1	# round 7
1719	call _key_expansion_128
1720	aeskeygenassist $0x80, %xmm0, %xmm1	# round 8
1721	call _key_expansion_128
1722	aeskeygenassist $0x1b, %xmm0, %xmm1	# round 9
1723	call _key_expansion_128
1724	aeskeygenassist $0x36, %xmm0, %xmm1	# round 10
1725	call _key_expansion_128
1726.Ldec_key:
	/*
	 * Build the decryption schedule at offset 240: the encryption round
	 * keys in reverse order, with aesimc applied to every key except the
	 * first and the last.
	 */
1727	sub $0x10, TKEYP		# TKEYP = last encryption round key
1728	movaps (KEYP), %xmm0
1729	movaps (TKEYP), %xmm1
1730	movaps %xmm0, 240(TKEYP)	# first enc key becomes the last dec key
1731	movaps %xmm1, 240(KEYP)		# last enc key becomes the first dec key
1732	add $0x10, KEYP
1733	lea 240-16(TKEYP), UKEYP	# UKEYP is reused as the descending store pointer
1734.align 4
1735.Ldec_key_loop:
1736	movaps (KEYP), %xmm0
1737	aesimc %xmm0, %xmm1
1738	movaps %xmm1, (UKEYP)
1739	add $0x10, KEYP
1740	sub $0x10, UKEYP
1741	cmp TKEYP, KEYP
1742	jb .Ldec_key_loop		# until KEYP reaches the last encryption round key
1743#ifndef __x86_64__
1744	popl KEYP
1745#endif
1746	FRAME_END
1747	RET
1748SYM_FUNC_END(aesni_set_key)
1749
1750/*
1751 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
 *
 * Encrypts the 16 bytes at src into dst.  The key length is read from
 * offset 480 of ctx and the rounds are run by _aesni_enc1.  src and dst
 * are accessed with unaligned moves.
1752 */
1753SYM_FUNC_START(aesni_enc)
1754	FRAME_BEGIN
1755#ifndef __x86_64__
1756	pushl KEYP
1757	pushl KLEN
1758	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
1759	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
1760	movl (FRAME_OFFSET+20)(%esp), INP	# src
1761#endif
1762	movl 480(KEYP), KLEN		# key length
1763	movups (INP), STATE		# input
1764	call _aesni_enc1
1765	movups STATE, (OUTP)		# output
1766#ifndef __x86_64__
1767	popl KLEN
1768	popl KEYP
1769#endif
1770	FRAME_END
1771	RET
1772SYM_FUNC_END(aesni_enc)
1773
1774/*
1775 * _aesni_enc1:		internal ABI
1776 * input:
1777 *	KEYP:		key struct pointer
1778 *	KLEN:		key length in bytes (compared against 24 below)
1779 *	STATE:		initial state (input)
1780 * output:
1781 *	STATE:		final state (output)
1782 * changed:
1783 *	KEY
1784 *	TKEYP (T1)
 *
 * TKEYP is biased so that the last round key is always at 0x70(TKEYP):
 * KEYP + 0x30 for 128-bit keys, + 0x50 for 192-bit, + 0x70 for 256-bit.
 * Longer keys enter the shared round sequence earlier and fall through.
1785 */
1786SYM_FUNC_START_LOCAL(_aesni_enc1)
1787	movaps (KEYP), KEY		# key
1788	mov KEYP, TKEYP
1789	pxor KEY, STATE		# round 0
1790	add $0x30, TKEYP
1791	cmp $24, KLEN
1792	jb .Lenc128
1793	lea 0x20(TKEYP), TKEYP		# lea keeps the flags from cmp for the je below
1794	je .Lenc192
1795	add $0x20, TKEYP
1796	movaps -0x60(TKEYP), KEY
1797	aesenc KEY, STATE
1798	movaps -0x50(TKEYP), KEY
1799	aesenc KEY, STATE
1800.align 4
1801.Lenc192:
1802	movaps -0x40(TKEYP), KEY
1803	aesenc KEY, STATE
1804	movaps -0x30(TKEYP), KEY
1805	aesenc KEY, STATE
1806.align 4
1807.Lenc128:
1808	movaps -0x20(TKEYP), KEY
1809	aesenc KEY, STATE
1810	movaps -0x10(TKEYP), KEY
1811	aesenc KEY, STATE
1812	movaps (TKEYP), KEY
1813	aesenc KEY, STATE
1814	movaps 0x10(TKEYP), KEY
1815	aesenc KEY, STATE
1816	movaps 0x20(TKEYP), KEY
1817	aesenc KEY, STATE
1818	movaps 0x30(TKEYP), KEY
1819	aesenc KEY, STATE
1820	movaps 0x40(TKEYP), KEY
1821	aesenc KEY, STATE
1822	movaps 0x50(TKEYP), KEY
1823	aesenc KEY, STATE
1824	movaps 0x60(TKEYP), KEY
1825	aesenc KEY, STATE
1826	movaps 0x70(TKEYP), KEY
1827	aesenclast KEY, STATE		# last round
1828	RET
1829SYM_FUNC_END(_aesni_enc1)
1830
1831/*
1832 * _aesni_enc4:	internal ABI
1833 * input:
1834 *	KEYP:		key struct pointer
1835 *	KLEN:		key length in bytes (compared against 24 below)
1836 *	STATE1:		initial state (input)
1837 *	STATE2
1838 *	STATE3
1839 *	STATE4
1840 * output:
1841 *	STATE1:		final state (output)
1842 *	STATE2
1843 *	STATE3
1844 *	STATE4
1845 * changed:
1846 *	KEY
1847 *	TKEYP (T1)
 *
 * Four-block version of the single-block round sequence: each round key is
 * loaded once into KEY and applied to STATE1-STATE4.  TKEYP is biased so that
 * the last round key is always at 0x70(TKEYP).
1848 */
1849SYM_FUNC_START_LOCAL(_aesni_enc4)
1850	movaps (KEYP), KEY		# key
1851	mov KEYP, TKEYP
1852	pxor KEY, STATE1		# round 0
1853	pxor KEY, STATE2
1854	pxor KEY, STATE3
1855	pxor KEY, STATE4
1856	add $0x30, TKEYP
1857	cmp $24, KLEN
1858	jb .L4enc128
1859	lea 0x20(TKEYP), TKEYP		# lea keeps the flags from cmp for the je below
1860	je .L4enc192
1861	add $0x20, TKEYP
1862	movaps -0x60(TKEYP), KEY
1863	aesenc KEY, STATE1
1864	aesenc KEY, STATE2
1865	aesenc KEY, STATE3
1866	aesenc KEY, STATE4
1867	movaps -0x50(TKEYP), KEY
1868	aesenc KEY, STATE1
1869	aesenc KEY, STATE2
1870	aesenc KEY, STATE3
1871	aesenc KEY, STATE4
1872#.align 4
1873.L4enc192:
1874	movaps -0x40(TKEYP), KEY
1875	aesenc KEY, STATE1
1876	aesenc KEY, STATE2
1877	aesenc KEY, STATE3
1878	aesenc KEY, STATE4
1879	movaps -0x30(TKEYP), KEY
1880	aesenc KEY, STATE1
1881	aesenc KEY, STATE2
1882	aesenc KEY, STATE3
1883	aesenc KEY, STATE4
1884#.align 4
1885.L4enc128:
1886	movaps -0x20(TKEYP), KEY
1887	aesenc KEY, STATE1
1888	aesenc KEY, STATE2
1889	aesenc KEY, STATE3
1890	aesenc KEY, STATE4
1891	movaps -0x10(TKEYP), KEY
1892	aesenc KEY, STATE1
1893	aesenc KEY, STATE2
1894	aesenc KEY, STATE3
1895	aesenc KEY, STATE4
1896	movaps (TKEYP), KEY
1897	aesenc KEY, STATE1
1898	aesenc KEY, STATE2
1899	aesenc KEY, STATE3
1900	aesenc KEY, STATE4
1901	movaps 0x10(TKEYP), KEY
1902	aesenc KEY, STATE1
1903	aesenc KEY, STATE2
1904	aesenc KEY, STATE3
1905	aesenc KEY, STATE4
1906	movaps 0x20(TKEYP), KEY
1907	aesenc KEY, STATE1
1908	aesenc KEY, STATE2
1909	aesenc KEY, STATE3
1910	aesenc KEY, STATE4
1911	movaps 0x30(TKEYP), KEY
1912	aesenc KEY, STATE1
1913	aesenc KEY, STATE2
1914	aesenc KEY, STATE3
1915	aesenc KEY, STATE4
1916	movaps 0x40(TKEYP), KEY
1917	aesenc KEY, STATE1
1918	aesenc KEY, STATE2
1919	aesenc KEY, STATE3
1920	aesenc KEY, STATE4
1921	movaps 0x50(TKEYP), KEY
1922	aesenc KEY, STATE1
1923	aesenc KEY, STATE2
1924	aesenc KEY, STATE3
1925	aesenc KEY, STATE4
1926	movaps 0x60(TKEYP), KEY
1927	aesenc KEY, STATE1
1928	aesenc KEY, STATE2
1929	aesenc KEY, STATE3
1930	aesenc KEY, STATE4
1931	movaps 0x70(TKEYP), KEY
1932	aesenclast KEY, STATE1		# last round
1933	aesenclast KEY, STATE2
1934	aesenclast KEY, STATE3
1935	aesenclast KEY, STATE4
1936	RET
1937SYM_FUNC_END(_aesni_enc4)
1938
1939/*
1940 * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
 *
 * Decrypts the 16 bytes at src into dst.  The key length is read from
 * offset 480 of ctx; KEYP is then moved to offset 240, where aesni_set_key
 * stores the decryption round keys, and the rounds are run by _aesni_dec1.
 * src and dst are accessed with unaligned moves.
1941 */
1942SYM_FUNC_START(aesni_dec)
1943	FRAME_BEGIN
1944#ifndef __x86_64__
1945	pushl KEYP
1946	pushl KLEN
1947	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
1948	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
1949	movl (FRAME_OFFSET+20)(%esp), INP	# src
1950#endif
1951	mov 480(KEYP), KLEN		# key length
1952	add $240, KEYP			# switch to the decryption round keys
1953	movups (INP), STATE		# input
1954	call _aesni_dec1
1955	movups STATE, (OUTP)		#output
1956#ifndef __x86_64__
1957	popl KLEN
1958	popl KEYP
1959#endif
1960	FRAME_END
1961	RET
1962SYM_FUNC_END(aesni_dec)
1963
1964/*
1965 * _aesni_dec1:		internal ABI
1966 * input:
1967 *	KEYP:		key struct pointer
1968 *	KLEN:		key length
1969 *	STATE:		initial state (input)
1970 * output:
1971 *	STATE:		final state (output)
1972 * changed:
1973 *	KEY
1974 *	TKEYP (T1)
 *
 * Round keys are read starting at (KEYP), so the caller passes a KEYP that
 * already points at the decryption round keys.  TKEYP is biased so that the
 * last round key is always at 0x70(TKEYP): KEYP + 0x30 for 128-bit keys,
 * + 0x50 for 192-bit, + 0x70 for 256-bit.
1975 */
1976SYM_FUNC_START_LOCAL(_aesni_dec1)
1977	movaps (KEYP), KEY		# key
1978	mov KEYP, TKEYP
1979	pxor KEY, STATE		# round 0
1980	add $0x30, TKEYP
1981	cmp $24, KLEN
1982	jb .Ldec128
1983	lea 0x20(TKEYP), TKEYP		# lea keeps the flags from cmp for the je below
1984	je .Ldec192
1985	add $0x20, TKEYP
1986	movaps -0x60(TKEYP), KEY
1987	aesdec KEY, STATE
1988	movaps -0x50(TKEYP), KEY
1989	aesdec KEY, STATE
1990.align 4
1991.Ldec192:
1992	movaps -0x40(TKEYP), KEY
1993	aesdec KEY, STATE
1994	movaps -0x30(TKEYP), KEY
1995	aesdec KEY, STATE
1996.align 4
1997.Ldec128:
1998	movaps -0x20(TKEYP), KEY
1999	aesdec KEY, STATE
2000	movaps -0x10(TKEYP), KEY
2001	aesdec KEY, STATE
2002	movaps (TKEYP), KEY
2003	aesdec KEY, STATE
2004	movaps 0x10(TKEYP), KEY
2005	aesdec KEY, STATE
2006	movaps 0x20(TKEYP), KEY
2007	aesdec KEY, STATE
2008	movaps 0x30(TKEYP), KEY
2009	aesdec KEY, STATE
2010	movaps 0x40(TKEYP), KEY
2011	aesdec KEY, STATE
2012	movaps 0x50(TKEYP), KEY
2013	aesdec KEY, STATE
2014	movaps 0x60(TKEYP), KEY
2015	aesdec KEY, STATE
2016	movaps 0x70(TKEYP), KEY
2017	aesdeclast KEY, STATE		# last round
2018	RET
2019SYM_FUNC_END(_aesni_dec1)
2020
/*
 * One AES decryption round applied to STATE1..STATE4 with the round key
 * stored at \off(TKEYP).  Clobbers KEY.  Helper for _aesni_dec4 only.
 */
.macro _aesni_dec4_round off
	movaps \off(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.endm

/*
 * _aesni_dec4:	internal ABI
 *	Decrypt four independent blocks with the same key schedule.
 * input:
 *	KEYP:		base of the round-key array to use (the callers in
 *			this file pass ctx + 240); read with movaps, so it
 *			has to be 16-byte aligned
 *	KLEN:		key length; a value below 24 selects 10 rounds,
 *			exactly 24 selects 12 rounds, anything larger 14
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * TKEYP is biased so that TKEYP + 0x70 addresses the last round key for
 * every key size; the longer keys only prepend extra rounds.
 */
SYM_FUNC_START_LOCAL(_aesni_dec4)
	movaps (KEYP), KEY		# round key 0
	lea 0x30(KEYP), TKEYP		# bias for the 10-round case
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	cmp $24, KLEN
	jb .Ldec4_128
	lea 0x20(TKEYP), TKEYP		# lea: flags of the cmp stay valid
	je .Ldec4_192
	add $0x20, TKEYP		# 14 rounds: two more leading rounds
	_aesni_dec4_round -0x60
	_aesni_dec4_round -0x50
.align 4
.Ldec4_192:
	_aesni_dec4_round -0x40
	_aesni_dec4_round -0x30
.align 4
.Ldec4_128:
	_aesni_dec4_round -0x20
	_aesni_dec4_round -0x10
	_aesni_dec4_round 0x00
	_aesni_dec4_round 0x10
	_aesni_dec4_round 0x20
	_aesni_dec4_round 0x30
	_aesni_dec4_round 0x40
	_aesni_dec4_round 0x50
	_aesni_dec4_round 0x60
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE1		# last round
	aesdeclast KEY, STATE2
	aesdeclast KEY, STATE3
	aesdeclast KEY, STATE4
	RET
SYM_FUNC_END(_aesni_dec4)
2128
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * ECB-encrypt the whole 16-byte blocks contained in the first len bytes of
 * src and write them to dst.  A trailing partial block (len % 16 bytes) is
 * left untouched; len < 16 returns without doing anything.
 *
 * Blocks are handled four at a time while at least 64 bytes remain, then
 * one at a time.  len is unsigned, so every length test uses the unsigned
 * condition codes (jb/jae).
 */
SYM_FUNC_START(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	/* 32-bit: arguments live on the stack; preserve the work registers */
	pushl LEN
	pushl KEYP
	pushl KLEN
	/* +16 = the three registers saved above plus the return address */
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	cmp $16, LEN
	jb .Lecb_enc_ret		# no full block (covers len == 0)
	mov 480(KEYP), KLEN		# key length
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_enc_loop4		# unsigned: at least 4 blocks left
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_enc_loop1		# unsigned: at least 1 block left
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_enc)
2188
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * ECB-decrypt the whole 16-byte blocks contained in the first len bytes of
 * src and write them to dst.  A trailing partial block (len % 16 bytes) is
 * left untouched; len < 16 returns without doing anything.
 *
 * The round keys are taken from ctx + 240 onwards.
 * NOTE(review): presumably the decryption schedule of the context
 * structure - confirm in its header.
 *
 * Blocks are handled four at a time while at least 64 bytes remain, then
 * one at a time.  len is unsigned, so every length test uses the unsigned
 * condition codes (jb/jae).
 */
SYM_FUNC_START(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	/* 32-bit: arguments live on the stack; preserve the work registers */
	pushl LEN
	pushl KEYP
	pushl KLEN
	/* +16 = the three registers saved above plus the return address */
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	cmp $16, LEN
	jb .Lecb_dec_ret		# no full block (covers len == 0)
	mov 480(KEYP), KLEN		# key length
	add $240, KEYP			# round keys used for decryption
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_dec_loop4		# unsigned: at least 4 blocks left
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_dec_loop1		# unsigned: at least 1 block left
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_dec)
2249
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-encrypt the whole 16-byte blocks contained in the first len bytes of
 * src into dst.  The chaining value is read from iv and the last ciphertext
 * block is written back to iv so a later call can continue the chain.
 * len < 16 returns without touching dst or iv; a trailing partial block is
 * ignored.
 *
 * CBC encryption is inherently serial (each block depends on the previous
 * ciphertext), hence the single-block loop.  len is unsigned, so the length
 * tests use the unsigned condition codes (jb/jae).
 */
SYM_FUNC_START(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	/* 32-bit: arguments live on the stack; preserve the work registers */
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	/* +20 = the four registers saved above plus the return address */
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret		# no full block
	mov 480(KEYP), KLEN		# key length
	movups (IVP), STATE		# chaining value starts as the iv
.align 4
.Lcbc_enc_loop:
	movups (INP), IN		# plaintext block
	pxor IN, STATE			# P ^ previous ciphertext (or iv)
	call _aesni_enc1
	movups STATE, (OUTP)		# ciphertext block
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_enc_loop		# unsigned: at least 1 block left
	movups STATE, (IVP)		# hand the chaining value back
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_enc)
2293
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-decrypt the whole 16-byte blocks contained in the first len bytes of
 * src into dst.  The chaining value is read from iv and the last ciphertext
 * block consumed is written back to iv.  len < 16 returns without touching
 * dst or iv; a trailing partial block is ignored.
 *
 * The round keys are taken from ctx + 240 onwards.
 * NOTE(review): presumably the decryption schedule of the context
 * structure - confirm in its header.
 *
 * Four blocks are decrypted per iteration while at least 64 bytes remain.
 * Every ciphertext block needed for the XOR step is read (or re-read)
 * before the first store of the iteration, so the code does not depend on
 * src still being intact after dst is written.
 * len is unsigned, so the length tests use the unsigned condition codes
 * (jb/jae).
 */
SYM_FUNC_START(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	/* 32-bit: arguments live on the stack; preserve the work registers */
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	/* +20 = the four registers saved above plus the return address */
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret		# no full block: iv stays untouched
	mov 480(KEYP), KLEN		# key length
	add $240, KEYP			# round keys used for decryption
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	/* keep a copy of each ciphertext block: it is the next XOR mask */
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	/*
	 * Only IN1/IN2 are used on 32-bit (presumably for lack of XMM
	 * registers): they now hold blocks 2 and 3, blocks 0 and 1 are
	 * re-read from memory after the decryption.
	 */
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1			# P0 = D(C0) ^ iv
#ifdef __x86_64__
	pxor IN1, STATE2		# P1 = D(C1) ^ C0
	pxor IN2, STATE3		# P2 = D(C2) ^ C1
	pxor IN3, STATE4		# P3 = D(C3) ^ C2
	movaps IN4, IV			# C3 chains into the next iteration
#else
	pxor IN1, STATE4		# P3 = D(C3) ^ C2
	movaps IN2, IV			# C3 chains into the next iteration
	movups (INP), IN1		# re-read C0
	pxor IN1, STATE2		# P1 = D(C1) ^ C0
	movups 0x10(INP), IN2		# re-read C1
	pxor IN2, STATE3		# P2 = D(C2) ^ C1
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lcbc_dec_loop4		# unsigned: at least 4 blocks left
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE			# P = D(C) ^ previous ciphertext
	movups STATE, (OUTP)
	movaps IN, IV			# this ciphertext is the next mask
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_dec_loop1		# unsigned: at least 1 block left
.Lcbc_dec_ret:
	movups IV, (IVP)		# hand the chaining value back
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_dec)
2386
/*
 * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			  size_t len, u8 *iv)
 *
 * Encrypt the final two blocks of a CBC message with ciphertext stealing:
 * one full block followed by a tail of n = len - 16 bytes.
 * NOTE(review): the code performs no length check; it appears to expect
 * 16 < len <= 32 - confirm against the callers.
 *
 * Output layout: dst[0..16) receives the encryption of the zero-padded
 * tail XORed with E(P0 ^ iv); dst[16..16+n) receives the first n bytes of
 * E(P0 ^ iv).  iv is only read, nothing is written back.
 *
 * IVP and LEN are reused as scratch pointers once their argument value is
 * no longer needed.
 */
SYM_FUNC_START(aesni_cts_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	/* 32-bit: arguments live on the stack; preserve the work registers */
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	/* +20 = the four registers saved above plus the return address */
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	mov 480(KEYP), KLEN		# key length
	movups (IVP), STATE		# chaining value; IVP is free from here
	sub $16, LEN			# LEN = n, size of the tail
	lea 32(T1), IVP
	add LEN, T1			# T1  = table + n
	sub LEN, IVP			# IVP = table + 32 - n
	movups (T1), %xmm4		# mask: bytes 0..n-1 -> top n bytes
	movups (IVP), %xmm5		# mask: top n bytes -> bytes 0..n-1

	movups (INP), IN1		# full block P0
	add LEN, INP
	movups (INP), IN2		# last 16 input bytes; tail is on top

	pxor IN1, STATE
	call _aesni_enc1		# STATE = E(P0 ^ iv)

	pshufb %xmm5, IN2		# tail moved down, rest zeroed
	pxor STATE, IN2			# input of the second encryption
	pshufb %xmm4, STATE		# first n bytes moved to the top
	add OUTP, LEN			# LEN = dst + n
	/* lands the stolen bytes at dst[16..16+n); dst[n..16) is rewritten below */
	movups STATE, (LEN)

	movaps IN2, STATE
	call _aesni_enc1
	movups STATE, (OUTP)		# first output block

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cts_cbc_enc)
2443
/*
 * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			  size_t len, u8 *iv)
 *
 * Decrypt the final two blocks of a CBC message that was encrypted with
 * ciphertext stealing: one full block followed by a tail of n = len - 16
 * bytes.
 * NOTE(review): the code performs no length check; it appears to expect
 * 16 < len <= 32 - confirm against the callers.
 *
 * The round keys are taken from ctx + 240 onwards.
 * NOTE(review): presumably the decryption schedule of the context
 * structure - confirm in its header.
 *
 * iv is only read, nothing is written back.  IVP and LEN are reused as
 * scratch pointers once their argument value is no longer needed.
 */
SYM_FUNC_START(aesni_cts_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	/* 32-bit: arguments live on the stack; preserve the work registers */
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	/* +20 = the four registers saved above plus the return address */
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	mov 480(KEYP), KLEN		# key length
	add $240, KEYP			# round keys used for decryption
	movups (IVP), IV		# chaining value; IVP is free from here
	sub $16, LEN			# LEN = n, size of the tail
	lea 32(T1), IVP
	add LEN, T1			# T1  = table + n
	sub LEN, IVP			# IVP = table + 32 - n
	movups (T1), %xmm4		# mask: bytes 0..n-1 -> top n bytes

	movups (INP), STATE		# first (full) ciphertext block
	add LEN, INP
	movups (INP), IN1		# last 16 input bytes; tail is on top

	call _aesni_dec1
	movaps STATE, IN2		# keep the decrypted block for the blend
	pshufb %xmm4, STATE		# first n bytes moved to the top
	pxor IN1, STATE			# top n bytes = plaintext tail

	add OUTP, LEN			# LEN = dst + n
	/* lands the tail at dst[16..16+n); dst[n..16) is rewritten below */
	movups STATE, (LEN)

	/*
	 * Non-VEX pblendvb takes its byte mask implicitly from %xmm0, which
	 * is why the mask is loaded into that register by name.
	 * NOTE(review): this overwrites whichever register macro aliases
	 * %xmm0 (presumably STATE, already stored above) - confirm against
	 * the register defines.
	 */
	movups (IVP), %xmm0		# mask: top n bytes -> bytes 0..n-1
	pshufb %xmm0, IN1		# ciphertext tail moved down
	pblendvb IN2, IN1		# bytes n..15 come from IN2
	movaps IN1, STATE
	call _aesni_dec1

	pxor IV, STATE
	movups STATE, (OUTP)		# first output block

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cts_cbc_dec)
2504
.pushsection .rodata
.align 16
/*
 * pshufb masks for ciphertext stealing: an identity permutation framed by
 * 16 bytes of 0x80 on either side (a mask byte with the top bit set makes
 * pshufb emit zero).  For a tail of n bytes, a 16-byte load at
 *   table + n       moves source bytes 0..n-1 to the top n bytes,
 *   table + 32 - n  moves the top n source bytes down to bytes 0..n-1,
 * and clears everything else.  The second form also serves as a pblendvb
 * mask, because exactly the cleared positions have the top bit set.
 */
.Lcts_permute_table:
	.fill		16, 1, 0x80
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.fill		16, 1, 0x80
#ifdef __x86_64__
/*
 * pshufb mask that reverses the byte order of a 16-byte value.  It is read
 * with movaps and therefore must stay 16-byte aligned, which holds because
 * the 48-byte table above starts on a 16-byte boundary.
 */
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
#endif
.popsection
2519
2520#ifdef __x86_64__
/*
 * _aesni_inc_init:	internal ABI
 *	Prepare the registers that _aesni_inc works with.
 * input:
 *	IV:	counter block, big endian
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
SYM_FUNC_START_LOCAL(_aesni_inc_init)
	mov $1, TCTR_LOW
	movq TCTR_LOW, INC		# INC = 1; movq clears the upper qword
	movaps .Lbswap_mask(%rip), BSWAP_MASK
	movaps IV, CTR
	pshufb BSWAP_MASK, CTR		# byte-reverse: big -> little endian
	movq CTR, TCTR_LOW		# integer copy, used to detect carries
	RET
SYM_FUNC_END(_aesni_inc_init)
2541
/*
 * _aesni_inc:		internal ABI
 *	Increase IV by 1, IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	Increase by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *
 * paddq adds per 64-bit lane and does not carry between lanes, so the
 * integer copy in TCTR_LOW is incremented alongside and its carry flag
 * decides whether the upper qword of CTR needs a +1 as well.
 */
SYM_FUNC_START_LOCAL(_aesni_inc)
	add $1, TCTR_LOW		# sets CF when the low qword wraps
	paddq INC, CTR			# SSE add, leaves the flags alone
	jnc .Linc_no_carry
	pslldq $8, INC			# move the 1 into the upper qword
	paddq INC, CTR			# propagate the carry
	psrldq $8, INC			# restore INC == 1
.Linc_no_carry:
	movaps CTR, IV
	pshufb BSWAP_MASK, IV		# back to big endian
	RET
SYM_FUNC_END(_aesni_inc)
2569
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR-mode en/decryption of the whole 16-byte blocks contained in the first
 * len bytes of src.  The counter block is read from iv (big endian),
 * incremented once per block, and the next unused counter value is written
 * back to iv.  len < 16 returns without touching dst or iv; a trailing
 * partial block is ignored.
 *
 * Built for x86_64 only (enclosed in an __x86_64__ conditional).
 * Four key-stream blocks are generated per iteration while at least
 * 64 bytes remain, then one at a time.  len is unsigned, so the length
 * tests use the unsigned condition codes (jb/jae).
 */
SYM_FUNC_START(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret		# no full block: iv stays untouched
	mov 480(KEYP), KLEN		# key length
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	/* snapshot four consecutive counters, loading input in between */
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4		# STATE1..4 = key stream
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lctr_enc_loop4		# unsigned: at least 4 blocks left
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1		# STATE = key stream block
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lctr_enc_loop1		# unsigned: at least 1 block left
.Lctr_enc_ret:
	movups IV, (IVP)		# next unused counter value
.Lctr_enc_just_ret:
	FRAME_END
	RET
SYM_FUNC_END(aesni_ctr_enc)
2632
2633#endif
2634
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
/*
 * Mask for _aesni_gf128mul_x_ble: 0x87 in the low qword (folded in when
 * bit 127 of the tweak is shifted out) and 0x01 in the high qword (the
 * carry from bit 63 into bit 64).
 */
.Lgf128mul_x_ble_mask:
	.quad 0x0000000000000087, 0x0000000000000001
.previous
2640
/*
 * _aesni_gf128mul_x_ble: Multiply in GF(2^128) for XTS IVs
 *	Shifts the 128-bit IV left by one bit and folds the bit that drops
 *	out of the top back in as 0x87.
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	KEY:	== temporary value
 *
 * paddq doubles each qword separately, so the two bits it loses (63 and
 * 127) are recovered from a copy taken beforehand: pshufd places dword 3
 * in lane 0 and dword 1 in lane 2, psrad turns each lane into a copy of
 * its sign bit, and pand keeps 0x87 / 0x01 only where that bit was set.
 */
.macro _aesni_gf128mul_x_ble
	pshufd $0x13, IV, KEY		# lanes: d3, d0, d1, d0 of the old IV
	psrad $31, KEY			# every lane = its own sign bit
	paddq IV, IV			# both qwords << 1
	pand GF128MUL_MASK, KEY		# keep the two correction terms
	pxor KEY, IV
.endm
2658
/*
 * _aesni_xts_crypt enc: body shared by aesni_xts_enc (enc == 1) and
 * aesni_xts_dec (enc == 0).
 *
 * Arguments of the generated function: ctx, dst, src, len, iv.
 * iv holds the current tweak.  Data is processed four blocks at a time,
 * then block by block; a trailing partial block is handled with ciphertext
 * stealing.  When len is a multiple of 16 the next tweak is written back
 * to iv; the ciphertext-stealing exit leaves iv untouched.
 *
 * Decryption takes its round keys from ctx + 240 onwards.
 * NOTE(review): presumably the decryption schedule of the context
 * structure - confirm in its header.
 * NOTE(review): the encryption path entered with len < 16 would use an
 * uninitialised STATE4; callers presumably guarantee len >= 16 - confirm.
 *
 * LEN is treated as a signed remainder: it is allowed to go negative and
 * the ciphertext-stealing tail uses that negative value to rewind INP.
 */
.macro	_aesni_xts_crypt	enc
	FRAME_BEGIN
#ifdef __x86_64__
	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
#else
	/* 32-bit: arguments live on the stack; preserve the work registers */
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	/* +20 = the four registers saved above plus the return address */
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
#endif
	movups (IVP), IV		# current tweak

	mov 480(KEYP), KLEN		# key length
.if !\enc
	add $240, KEYP			# round keys used for decryption

	/*
	 * With a partial final block, hold back the last full block: the
	 * ciphertext-stealing tail decrypts it with the later tweak.
	 */
	test $15, LEN
	jz .Lxts_loop4\@
	sub $16, LEN
.endif

.Lxts_loop4\@:
	sub $64, LEN
	jl .Lxts_1x\@			# fewer than four blocks left

	/*
	 * For each of the four blocks: STATEn = input ^ tweak, and the tweak
	 * itself is parked in the matching output slot until after the
	 * cipher call.  Each input block is read before its slot is written.
	 */
	movdqa IV, STATE1
	movdqu 0x00(INP), IN
	pxor IN, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE2
	movdqu 0x10(INP), IN
	pxor IN, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE3
	movdqu 0x20(INP), IN
	pxor IN, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE4
	movdqu 0x30(INP), IN
	pxor IN, STATE4
	movdqu IV, 0x30(OUTP)

.if \enc
	call _aesni_enc4
.else
	call _aesni_dec4
.endif

	/* second tweak XOR: fetch each parked tweak and overwrite it */
	movdqu 0x00(OUTP), IN
	pxor IN, STATE1
	movdqu STATE1, 0x00(OUTP)

	movdqu 0x10(OUTP), IN
	pxor IN, STATE2
	movdqu STATE2, 0x10(OUTP)

	movdqu 0x20(OUTP), IN
	pxor IN, STATE3
	movdqu STATE3, 0x20(OUTP)

	movdqu 0x30(OUTP), IN
	pxor IN, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble		# tweak for the next block

	add $64, INP
	add $64, OUTP
	test LEN, LEN
	jnz .Lxts_loop4\@

.Lxts_ret_iv\@:
	movups IV, (IVP)		# hand the next tweak back

.Lxts_ret\@:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET

.Lxts_1x\@:
	add $64, LEN			# undo the speculative subtraction
	jz .Lxts_ret_iv\@
.if \enc
	/* under 16 bytes left: steal from the last block of the 4-way batch */
	sub $16, LEN
	jl .Lxts_cts4\@
.endif

.Lxts_loop1\@:
	movdqu (INP), STATE
.if \enc
	pxor IV, STATE
	call _aesni_enc1
.else
	add $16, INP
	sub $16, LEN
	jl .Lxts_cts1\@			# this is the held-back block
	pxor IV, STATE
	call _aesni_dec1
.endif
	pxor IV, STATE
	_aesni_gf128mul_x_ble

	test LEN, LEN
	jz .Lxts_out\@

.if \enc
	add $16, INP
	sub $16, LEN
	jl .Lxts_cts1\@			# only a partial block follows
.endif

	movdqu STATE, (OUTP)
	add $16, OUTP
	jmp .Lxts_loop1\@

.Lxts_out\@:
	movdqu STATE, (OUTP)
	jmp .Lxts_ret_iv\@

.if \enc
.Lxts_cts4\@:
	movdqa STATE4, STATE		# last ciphertext block of the batch
	sub $16, OUTP			# and the slot it was stored in
.Lxts_cts1\@:
.else
.Lxts_cts1\@:
	/* decrypt the held-back block with the later tweak, keep the earlier */
	movdqa IV, STATE4
	_aesni_gf128mul_x_ble

	pxor IV, STATE
	call _aesni_dec1
	pxor IV, STATE
.endif
#ifndef __x86_64__
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	add LEN, INP		/* rewind input pointer */
	add $16, LEN		/* # bytes in final block */
	movups (INP), IN1	/* last 16 input bytes; tail is on top */

	lea 32(T1), IVP		/* IVP is scratch from here on */
	add LEN, T1		/* T1  = table + tail size */
	sub LEN, IVP		/* IVP = table + 32 - tail size */
	add OUTP, LEN		/* LEN = OUTP + tail size */

	movups (T1), %xmm4
	movaps STATE, IN2	/* keep the full block for the blend */
	pshufb %xmm4, STATE	/* leading bytes moved to the top */
	movups STATE, (LEN)	/* places the final partial block */

	/*
	 * Non-VEX pblendvb takes its byte mask implicitly from %xmm0, which
	 * is why the mask is loaded into that register by name.
	 * NOTE(review): this overwrites whichever register macro aliases
	 * %xmm0 (presumably STATE, already stored above) - confirm against
	 * the register defines.
	 */
	movups (IVP), %xmm0
	pshufb %xmm0, IN1	/* input tail moved down */
	pblendvb IN2, IN1	/* remaining bytes come from IN2 */
	movaps IN1, STATE

.if \enc
	pxor IV, STATE
	call _aesni_enc1
	pxor IV, STATE
.else
	pxor STATE4, STATE	/* STATE4 = tweak saved before the step */
	call _aesni_dec1
	pxor STATE4, STATE
.endif

	movups STATE, (OUTP)
	jmp .Lxts_ret\@		/* iv is not written back on this path */
.endm
2847
/*
 * void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *dst,
 *		      const u8 *src, unsigned int len, le128 *iv)
 *
 * XTS encryption; the whole body comes from _aesni_xts_crypt, expanded with
 * enc == 1 (which also supplies the frame handling and the return).
 */
SYM_FUNC_START(aesni_xts_enc)
	_aesni_xts_crypt	1
SYM_FUNC_END(aesni_xts_enc)
2855
/*
 * void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *dst,
 *		      const u8 *src, unsigned int len, le128 *iv)
 *
 * XTS decryption; the whole body comes from _aesni_xts_crypt, expanded with
 * enc == 0 (which also supplies the frame handling and the return).
 */
SYM_FUNC_START(aesni_xts_dec)
	_aesni_xts_crypt	0
SYM_FUNC_END(aesni_xts_dec)
2863