1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * Implement AES algorithm in Intel AES-NI instructions.
4 *
5 * The white paper of AES-NI instructions can be downloaded from:
6 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
7 *
8 * Copyright (C) 2008, Intel Corp.
9 *    Author: Huang Ying <ying.huang@intel.com>
10 *            Vinodh Gopal <vinodh.gopal@intel.com>
11 *            Kahraman Akdemir
12 *
13 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
14 * interface for 64-bit kernels.
15 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
16 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
17 *             Adrian Hoban <adrian.hoban@intel.com>
18 *             James Guilford (james.guilford@intel.com)
19 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
20 *             Tadeusz Struk (tadeusz.struk@intel.com)
21 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
22 *    Copyright (c) 2010, Intel Corporation.
23 *
24 * Ported x86_64 version to x86:
25 *    Author: Mathias Krause <minipli@googlemail.com>
26 */
27
28#include <linux/linkage.h>
29#include <asm/inst.h>
30#include <asm/frame.h>
31#include <asm/nospec-branch.h>
32
33/*
34 * The following macros are used to move an (un)aligned 16 byte value to/from
35 * an XMM register.  This can be done for either FP or integer values; for FP
36 * use movaps (move aligned packed single) and for integer use movdqa (move
37 * double quad aligned).  There has been no performance difference between the
38 * two since Nehalem (the original Core i7), but movaps is a byte shorter, so
39 * that is the one we'll use for now (same for the unaligned variants).
40 */
41#define MOVADQ	movaps
42#define MOVUDQ	movups
43
44#ifdef __x86_64__
45
46# constants in mergeable sections, linker can reorder and merge
47.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
48.align 16
49.Lgf128mul_x_ble_mask:
50	.octa 0x00000000000000010000000000000087
51.section	.rodata.cst16.POLY, "aM", @progbits, 16
52.align 16
53POLY:   .octa 0xC2000000000000000000000000000001
54.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
55.align 16
56TWOONE: .octa 0x00000001000000000000000000000001
57
58.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
59.align 16
60SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
61.section	.rodata.cst16.MASK1, "aM", @progbits, 16
62.align 16
63MASK1:      .octa 0x0000000000000000ffffffffffffffff
64.section	.rodata.cst16.MASK2, "aM", @progbits, 16
65.align 16
66MASK2:      .octa 0xffffffffffffffff0000000000000000
67.section	.rodata.cst16.ONE, "aM", @progbits, 16
68.align 16
69ONE:        .octa 0x00000000000000000000000000000001
70.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
71.align 16
72F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
73.section	.rodata.cst16.dec, "aM", @progbits, 16
74.align 16
75dec:        .octa 0x1
76.section	.rodata.cst16.enc, "aM", @progbits, 16
77.align 16
78enc:        .octa 0x2
79
80# order of these constants should not change.
81# more specifically, ALL_F should follow SHIFT_MASK,
82# and zero should follow ALL_F
83.section	.rodata, "a", @progbits
84.align 16
85SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
86ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
87            .octa 0x00000000000000000000000000000000
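# (The code relies on this layout: loads such as ALL_F-SHIFT_MASK(%r12) and
# ALL_F+16(%rip) index across these three constants to build the byte masks
# used when handling sub-16-byte blocks.)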
88
89.text
90
91
92#define	STACK_OFFSET    8*3
93
94#define AadHash 16*0
95#define AadLen 16*1
96#define InLen (16*1)+8
97#define PBlockEncKey 16*2
98#define OrigIV 16*3
99#define CurCount 16*4
100#define PBlockLen 16*5
101#define	HashKey		16*6	// store HashKey <<1 mod poly here
102#define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
103#define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
104#define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
105#define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
106				// bits of  HashKey <<1 mod poly here
107				//(for Karatsuba purposes)
108#define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
109				// bits of  HashKey^2 <<1 mod poly here
110				// (for Karatsuba purposes)
111#define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
112				// bits of  HashKey^3 <<1 mod poly here
113				// (for Karatsuba purposes)
114#define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
115				// bits of  HashKey^4 <<1 mod poly here
116				// (for Karatsuba purposes)
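# A rough sketch of the gcm_context_data layout implied by the offsets above
# (field names here are illustrative; the authoritative definition is
# struct gcm_context_data in the C glue code):
#
#	0x00  AadHash        current GHASH accumulator
#	0x10  AadLen         AAD length in bytes
#	0x18  InLen          running total of data processed
#	0x20  PBlockEncKey   E(K, Yn) saved for an unfinished partial block
#	0x30  OrigIV         original counter block J0
#	0x40  CurCount       current counter block
#	0x50  PBlockLen      number of bytes in the unfinished partial block
#	0x60  HashKey..HashKey_4      H^1..H^4, each <<1 mod poly
#	0xa0  HashKey_k..HashKey_4_k  Karatsuba helpers (XOR of high/low halves)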
117
118#define arg1 rdi
119#define arg2 rsi
120#define arg3 rdx
121#define arg4 rcx
122#define arg5 r8
123#define arg6 r9
124#define arg7 STACK_OFFSET+8(%rsp)
125#define arg8 STACK_OFFSET+16(%rsp)
126#define arg9 STACK_OFFSET+24(%rsp)
127#define arg10 STACK_OFFSET+32(%rsp)
128#define arg11 STACK_OFFSET+40(%rsp)
129#define keysize 2*15*16(%arg1)
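# arg1-arg6 are simply the x86_64 SysV argument registers; arg7 and up are
# read from the caller's stack, with STACK_OFFSET (8*3) accounting for the
# three registers pushed by FUNC_SAVE before any stack argument is accessed.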
130#endif
131
132
133#define STATE1	%xmm0
134#define STATE2	%xmm4
135#define STATE3	%xmm5
136#define STATE4	%xmm6
137#define STATE	STATE1
138#define IN1	%xmm1
139#define IN2	%xmm7
140#define IN3	%xmm8
141#define IN4	%xmm9
142#define IN	IN1
143#define KEY	%xmm2
144#define IV	%xmm3
145
146#define BSWAP_MASK %xmm10
147#define CTR	%xmm11
148#define INC	%xmm12
149
150#define GF128MUL_MASK %xmm10
151
152#ifdef __x86_64__
153#define AREG	%rax
154#define KEYP	%rdi
155#define OUTP	%rsi
156#define UKEYP	OUTP
157#define INP	%rdx
158#define LEN	%rcx
159#define IVP	%r8
160#define KLEN	%r9d
161#define T1	%r10
162#define TKEYP	T1
163#define T2	%r11
164#define TCTR_LOW T2
165#else
166#define AREG	%eax
167#define KEYP	%edi
168#define OUTP	AREG
169#define UKEYP	OUTP
170#define INP	%edx
171#define LEN	%esi
172#define IVP	%ebp
173#define KLEN	%ebx
174#define T1	%ecx
175#define TKEYP	T1
176#endif
177
178.macro FUNC_SAVE
179	push	%r12
180	push	%r13
181	push	%r14
182#
183# states of %xmm registers %xmm6:%xmm15 not saved
184# all %xmm registers are clobbered
185#
186.endm
187
188
189.macro FUNC_RESTORE
190	pop	%r14
191	pop	%r13
192	pop	%r12
193.endm
194
195# Precompute hashkeys.
196# Input: Hash subkey.
197# Output: HashKeys stored in gcm_context_data.  Only needs to be called
198# once per key.
199# clobbers r12, and tmp xmm registers.
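# Note: besides HashKey<<1 mod poly itself, this stores HashKey^2..HashKey^4
# (each <<1 mod poly) so that the 4-wide loops can GHASH four blocks with
# independent multiplies, plus the HashKey*_k values (XOR of the high and low
# 64-bit halves) consumed by the Karatsuba multiplications.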
200.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
201	mov	\SUBKEY, %r12
202	movdqu	(%r12), \TMP3
203	movdqa	SHUF_MASK(%rip), \TMP2
204	PSHUFB_XMM \TMP2, \TMP3
205
206	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
207
208	movdqa	\TMP3, \TMP2
209	psllq	$1, \TMP3
210	psrlq	$63, \TMP2
211	movdqa	\TMP2, \TMP1
212	pslldq	$8, \TMP2
213	psrldq	$8, \TMP1
214	por	\TMP2, \TMP3
215
216	# reduce HashKey<<1
217
218	pshufd	$0x24, \TMP1, \TMP2
219	pcmpeqd TWOONE(%rip), \TMP2
220	pand	POLY(%rip), \TMP2
221	pxor	\TMP2, \TMP3
222	movdqu	\TMP3, HashKey(%arg2)
223
224	movdqa	   \TMP3, \TMP5
225	pshufd	   $78, \TMP3, \TMP1
226	pxor	   \TMP3, \TMP1
227	movdqu	   \TMP1, HashKey_k(%arg2)
228
229	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
230# TMP5 = HashKey^2<<1 (mod poly)
231	movdqu	   \TMP5, HashKey_2(%arg2)
232# HashKey_2 = HashKey^2<<1 (mod poly)
233	pshufd	   $78, \TMP5, \TMP1
234	pxor	   \TMP5, \TMP1
235	movdqu	   \TMP1, HashKey_2_k(%arg2)
236
237	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
238# TMP5 = HashKey^3<<1 (mod poly)
239	movdqu	   \TMP5, HashKey_3(%arg2)
240	pshufd	   $78, \TMP5, \TMP1
241	pxor	   \TMP5, \TMP1
242	movdqu	   \TMP1, HashKey_3_k(%arg2)
243
244	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
245# TMP5 = HashKey^4<<1 (mod poly)
246	movdqu	   \TMP5, HashKey_4(%arg2)
247	pshufd	   $78, \TMP5, \TMP1
248	pxor	   \TMP5, \TMP1
249	movdqu	   \TMP1, HashKey_4_k(%arg2)
250.endm
251
252# GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
253# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
254.macro GCM_INIT Iv SUBKEY AAD AADLEN
255	mov \AADLEN, %r11
256	mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
257	xor %r11d, %r11d
258	mov %r11, InLen(%arg2) # ctx_data.in_length = 0
259	mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
260	mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
261	mov \Iv, %rax
262	movdqu (%rax), %xmm0
263	movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
264
265	movdqa  SHUF_MASK(%rip), %xmm2
266	PSHUFB_XMM %xmm2, %xmm0
267	movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
268
269	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
270	movdqu HashKey(%arg2), %xmm13
271
272	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
273	%xmm4, %xmm5, %xmm6
274.endm
275
276# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data
277# struct has been initialized by GCM_INIT.
278# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
279# Clobbers rax, r10-r13, and xmm0-xmm15
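# Rough flow: any partial block left over from a previous update call is
# completed first (PARTIAL_BLOCK); then 0-3 blocks are processed so the
# remaining full-block count becomes a multiple of 4 (INITIAL_BLOCKS_ENC_DEC);
# the bulk is handled four blocks at a time (GHASH_4_ENCRYPT_4_PARALLEL_*);
# finally a sub-16-byte tail, if any, is encrypted by masking one extra
# counter block.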
280.macro GCM_ENC_DEC operation
281	movdqu AadHash(%arg2), %xmm8
282	movdqu HashKey(%arg2), %xmm13
283	add %arg5, InLen(%arg2)
284
285	xor %r11d, %r11d # initialise the data pointer offset as zero
286	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
287
288	sub %r11, %arg5		# sub partial block data used
289	mov %arg5, %r13		# save the number of bytes
290
291	and $-16, %r13		# %r13 = %r13 - (%r13 mod 16)
292	mov %r13, %r12
293	# Encrypt/Decrypt first few blocks
294
295	and	$(3<<4), %r12
296	jz	_initial_num_blocks_is_0_\@
297	cmp	$(2<<4), %r12
298	jb	_initial_num_blocks_is_1_\@
299	je	_initial_num_blocks_is_2_\@
300_initial_num_blocks_is_3_\@:
301	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
302%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
303	sub	$48, %r13
304	jmp	_initial_blocks_\@
305_initial_num_blocks_is_2_\@:
306	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
307%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
308	sub	$32, %r13
309	jmp	_initial_blocks_\@
310_initial_num_blocks_is_1_\@:
311	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
312%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
313	sub	$16, %r13
314	jmp	_initial_blocks_\@
315_initial_num_blocks_is_0_\@:
316	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
317%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
318_initial_blocks_\@:
319
320	# Main loop - Encrypt/Decrypt remaining blocks
321
322	cmp	$0, %r13
323	je	_zero_cipher_left_\@
324	sub	$64, %r13
325	je	_four_cipher_left_\@
326_crypt_by_4_\@:
327	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
328	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
329	%xmm7, %xmm8, enc
330	add	$64, %r11
331	sub	$64, %r13
332	jne	_crypt_by_4_\@
333_four_cipher_left_\@:
334	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
335%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
336_zero_cipher_left_\@:
337	movdqu %xmm8, AadHash(%arg2)
338	movdqu %xmm0, CurCount(%arg2)
339
340	mov	%arg5, %r13
341	and	$15, %r13			# %r13 = arg5 (mod 16)
342	je	_multiple_of_16_bytes_\@
343
344	mov %r13, PBlockLen(%arg2)
345
346	# Handle the last <16 Byte block separately
347	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
348	movdqu %xmm0, CurCount(%arg2)
349	movdqa SHUF_MASK(%rip), %xmm10
350	PSHUFB_XMM %xmm10, %xmm0
351
352	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
353	movdqu %xmm0, PBlockEncKey(%arg2)
354
355	cmp	$16, %arg5
356	jge _large_enough_update_\@
357
358	lea (%arg4,%r11,1), %r10
359	mov %r13, %r12
360	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
361	jmp _data_read_\@
362
363_large_enough_update_\@:
364	sub	$16, %r11
365	add	%r13, %r11
366
367	# receive the last <16 Byte block
368	movdqu	(%arg4, %r11, 1), %xmm1
369
370	sub	%r13, %r11
371	add	$16, %r11
372
373	lea	SHIFT_MASK+16(%rip), %r12
374	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
375	# (r13 is the number of bytes in plaintext mod 16)
376	sub	%r13, %r12
377	# get the appropriate shuffle mask
378	movdqu	(%r12), %xmm2
379	# shift right 16-r13 bytes
380	PSHUFB_XMM  %xmm2, %xmm1
381
382_data_read_\@:
383	lea ALL_F+16(%rip), %r12
384	sub %r13, %r12
385
386.ifc \operation, dec
387	movdqa  %xmm1, %xmm2
388.endif
389	pxor	%xmm1, %xmm0            # XOR Encrypt(K, Yn)
390	movdqu	(%r12), %xmm1
391	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
392	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
393.ifc \operation, dec
394	pand    %xmm1, %xmm2
395	movdqa SHUF_MASK(%rip), %xmm10
396	PSHUFB_XMM %xmm10 ,%xmm2
397
398	pxor %xmm2, %xmm8
399.else
400	movdqa SHUF_MASK(%rip), %xmm10
401	PSHUFB_XMM %xmm10,%xmm0
402
403	pxor	%xmm0, %xmm8
404.endif
405
406	movdqu %xmm8, AadHash(%arg2)
407.ifc \operation, enc
408	# GHASH computation for the last <16 byte block
409	movdqa SHUF_MASK(%rip), %xmm10
410	# shuffle xmm0 back to output as ciphertext
411	PSHUFB_XMM %xmm10, %xmm0
412.endif
413
414	# Output %r13 bytes
415	MOVQ_R64_XMM %xmm0, %rax
416	cmp $8, %r13
417	jle _less_than_8_bytes_left_\@
418	mov %rax, (%arg3 , %r11, 1)
419	add $8, %r11
420	psrldq $8, %xmm0
421	MOVQ_R64_XMM %xmm0, %rax
422	sub $8, %r13
423_less_than_8_bytes_left_\@:
424	mov %al,  (%arg3, %r11, 1)
425	add $1, %r11
426	shr $8, %rax
427	sub $1, %r13
428	jne _less_than_8_bytes_left_\@
429_multiple_of_16_bytes_\@:
430.endm
431
432# GCM_COMPLETE finishes the tag computation, including any pending partial block
433# Output: Authentication Tag (AUTH_TAG)
434# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
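# Note: the final tag is GHASH(AAD, C, len(A)||len(C)) XORed with E(K, Y0),
# where Y0 is the original counter block saved in OrigIV; only the first
# AUTHTAGLEN bytes (16, 12 or 8) are written out.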
435.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
436	movdqu AadHash(%arg2), %xmm8
437	movdqu HashKey(%arg2), %xmm13
438
439	mov PBlockLen(%arg2), %r12
440
441	cmp $0, %r12
442	je _partial_done\@
443
444	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
445
446_partial_done\@:
447	mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
448	shl	$3, %r12		  # convert into number of bits
449	movd	%r12d, %xmm15		  # len(A) in %xmm15
450	mov InLen(%arg2), %r12
451	shl     $3, %r12                  # len(C) in bits (*8)
452	MOVQ_R64_XMM    %r12, %xmm1
453
454	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
455	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
456	pxor	%xmm15, %xmm8
457	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
458	# final GHASH computation
459	movdqa SHUF_MASK(%rip), %xmm10
460	PSHUFB_XMM %xmm10, %xmm8
461
462	movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
463	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
464	pxor	%xmm8, %xmm0
465_return_T_\@:
466	mov	\AUTHTAG, %r10                     # %r10 = authTag
467	mov	\AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
468	cmp	$16, %r11
469	je	_T_16_\@
470	cmp	$8, %r11
471	jl	_T_4_\@
472_T_8_\@:
473	MOVQ_R64_XMM	%xmm0, %rax
474	mov	%rax, (%r10)
475	add	$8, %r10
476	sub	$8, %r11
477	psrldq	$8, %xmm0
478	cmp	$0, %r11
479	je	_return_T_done_\@
480_T_4_\@:
481	movd	%xmm0, %eax
482	mov	%eax, (%r10)
483	add	$4, %r10
484	sub	$4, %r11
485	psrldq	$4, %xmm0
486	cmp	$0, %r11
487	je	_return_T_done_\@
488_T_123_\@:
489	movd	%xmm0, %eax
490	cmp	$2, %r11
491	jl	_T_1_\@
492	mov	%ax, (%r10)
493	cmp	$2, %r11
494	je	_return_T_done_\@
495	add	$2, %r10
496	sar	$16, %eax
497_T_1_\@:
498	mov	%al, (%r10)
499	jmp	_return_T_done_\@
500_T_16_\@:
501	movdqu	%xmm0, (%r10)
502_return_T_done_\@:
503.endm
504
505#ifdef __x86_64__
506/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
507*
508*
509* Input: A and B (128-bits each, bit-reflected)
510* Output: C = A*B*x mod poly, (i.e. >>1 )
511* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
512* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
513*
514*/
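/*
* Note: the extra factor of x comes from the bit-reflected representation GCM
* uses for field elements; a single carry-less multiply of two reflected
* operands is off by one bit position.  PRECOMPUTE therefore stores
* HashKey<<1 mod poly, so the multiply below lands exactly on
* GH * HashKey mod poly, as described in the Intel AES-NI white paper cited
* in the header of this file.
*/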
515.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
516	movdqa	  \GH, \TMP1
517	pshufd	  $78, \GH, \TMP2
518	pshufd	  $78, \HK, \TMP3
519	pxor	  \GH, \TMP2            # TMP2 = a1+a0
520	pxor	  \HK, \TMP3            # TMP3 = b1+b0
521	PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
522	PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
523	PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
524	pxor	  \GH, \TMP2
525	pxor	  \TMP1, \TMP2          # TMP2 = (a0*b0)+(a1*b0)
526	movdqa	  \TMP2, \TMP3
527	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
528	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
529	pxor	  \TMP3, \GH
530	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
531
532        # first phase of the reduction
533
534	movdqa    \GH, \TMP2
535	movdqa    \GH, \TMP3
536	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
537					# in order to perform
538					# independent shifts
539	pslld     $31, \TMP2            # packed right shift <<31
540	pslld     $30, \TMP3            # packed right shift <<30
541	pslld     $25, \TMP4            # packed right shift <<25
542	pxor      \TMP3, \TMP2          # xor the shifted versions
543	pxor      \TMP4, \TMP2
544	movdqa    \TMP2, \TMP5
545	psrldq    $4, \TMP5             # right shift TMP5 1 DW
546	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
547	pxor      \TMP2, \GH
548
549        # second phase of the reduction
550
551	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
552					# in order to perform
553					# independent shifts
554	movdqa    \GH,\TMP3
555	movdqa    \GH,\TMP4
556	psrld     $1,\TMP2              # packed left shift >>1
557	psrld     $2,\TMP3              # packed left shift >>2
558	psrld     $7,\TMP4              # packed left shift >>7
559	pxor      \TMP3,\TMP2		# xor the shifted versions
560	pxor      \TMP4,\TMP2
561	pxor      \TMP5, \TMP2
562	pxor      \TMP2, \GH
563	pxor      \TMP1, \GH            # result is in GH
564.endm
565
566# Reads DLEN bytes starting at DPTR and stores in XMMDst
567# where 0 < DLEN < 16
568# Clobbers %rax, DLEN and XMM1
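# Example: with DLEN = 11 the first 8 bytes are read straight into the low
# quadword of XMMDst; the byte loop then gathers the remaining 3 bytes into
# %rax, shifts them up via XMM1 and ORs them into XMMDst, leaving the 11
# input bytes in the low 11 bytes of XMMDst and zeroes above, without ever
# reading past DPTR + DLEN.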
569.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
570        cmp $8, \DLEN
571        jl _read_lt8_\@
572        mov (\DPTR), %rax
573        MOVQ_R64_XMM %rax, \XMMDst
574        sub $8, \DLEN
575        jz _done_read_partial_block_\@
576	xor %eax, %eax
577_read_next_byte_\@:
578        shl $8, %rax
579        mov 7(\DPTR, \DLEN, 1), %al
580        dec \DLEN
581        jnz _read_next_byte_\@
582        MOVQ_R64_XMM %rax, \XMM1
583	pslldq $8, \XMM1
584        por \XMM1, \XMMDst
585	jmp _done_read_partial_block_\@
586_read_lt8_\@:
587	xor %eax, %eax
588_read_next_byte_lt8_\@:
589        shl $8, %rax
590        mov -1(\DPTR, \DLEN, 1), %al
591        dec \DLEN
592        jnz _read_next_byte_lt8_\@
593        MOVQ_R64_XMM %rax, \XMMDst
594_done_read_partial_block_\@:
595.endm
596
597# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
598# clobbers r10-11, xmm14
599.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
600	TMP6 TMP7
601	MOVADQ	   SHUF_MASK(%rip), %xmm14
602	mov	   \AAD, %r10		# %r10 = AAD
603	mov	   \AADLEN, %r11		# %r11 = aadLen
604	pxor	   \TMP7, \TMP7
605	pxor	   \TMP6, \TMP6
606
607	cmp	   $16, %r11
608	jl	   _get_AAD_rest\@
609_get_AAD_blocks\@:
610	movdqu	   (%r10), \TMP7
611	PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
612	pxor	   \TMP7, \TMP6
613	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
614	add	   $16, %r10
615	sub	   $16, %r11
616	cmp	   $16, %r11
617	jge	   _get_AAD_blocks\@
618
619	movdqu	   \TMP6, \TMP7
620
621	/* read the last <16B of AAD */
622_get_AAD_rest\@:
623	cmp	   $0, %r11
624	je	   _get_AAD_done\@
625
626	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
627	PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
628	pxor	   \TMP6, \TMP7
629	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
630	movdqu \TMP7, \TMP6
631
632_get_AAD_done\@:
633	movdqu \TMP6, AadHash(%arg2)
634.endm
635
636# PARTIAL_BLOCK: Handles the encryption/decryption and hashing of partial
637# blocks carried over between update calls.
638# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
639# Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
640# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
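# Example: if a previous call left 5 bytes in the partial block (PBlockLen=5)
# and this call supplies 20 more bytes, the first 11 bytes complete the block
# here (XORed with the saved E(K, Yn), masked, and folded into AAD_HASH),
# PBlockLen is reset to 0, DATA_OFFSET advances by 11, and the remaining
# 9 bytes are left for the normal GCM_ENC_DEC path.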
641.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
642	AAD_HASH operation
643	mov 	PBlockLen(%arg2), %r13
644	cmp	$0, %r13
645	je	_partial_block_done_\@	# Leave Macro if no partial blocks
646	# Read in input data without over reading
647	cmp	$16, \PLAIN_CYPH_LEN
648	jl	_fewer_than_16_bytes_\@
649	movups	(\PLAIN_CYPH_IN), %xmm1	# If at least 16 bytes, just fill xmm
650	jmp	_data_read_\@
651
652_fewer_than_16_bytes_\@:
653	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
654	mov	\PLAIN_CYPH_LEN, %r12
655	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
656
657	mov PBlockLen(%arg2), %r13
658
659_data_read_\@:				# Finished reading in data
660
661	movdqu	PBlockEncKey(%arg2), %xmm9
662	movdqu	HashKey(%arg2), %xmm13
663
664	lea	SHIFT_MASK(%rip), %r12
665
666	# adjust the shuffle mask pointer to be able to shift r13 bytes
667	# (r13 is the number of bytes already in the current partial block)
668	add	%r13, %r12
669	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
670	PSHUFB_XMM %xmm2, %xmm9		# shift right r13 bytes
671
672.ifc \operation, dec
673	movdqa	%xmm1, %xmm3
674	pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)
675
676	mov	\PLAIN_CYPH_LEN, %r10
677	add	%r13, %r10
678	# Set r10 to be the amount of data left in PLAIN_CYPH_IN after filling
679	sub	$16, %r10
680	# Determine if the partial block is not being filled and
681	# shift the mask accordingly
682	jge	_no_extra_mask_1_\@
683	sub	%r10, %r12
684_no_extra_mask_1_\@:
685
686	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
687	# get the appropriate mask to mask out bottom r13 bytes of xmm9
688	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9
689
690	pand	%xmm1, %xmm3
691	movdqa	SHUF_MASK(%rip), %xmm10
692	PSHUFB_XMM	%xmm10, %xmm3
693	PSHUFB_XMM	%xmm2, %xmm3
694	pxor	%xmm3, \AAD_HASH
695
696	cmp	$0, %r10
697	jl	_partial_incomplete_1_\@
698
699	# GHASH computation for the last <16 Byte block
700	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
701	xor	%eax, %eax
702
703	mov	%rax, PBlockLen(%arg2)
704	jmp	_dec_done_\@
705_partial_incomplete_1_\@:
706	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
707_dec_done_\@:
708	movdqu	\AAD_HASH, AadHash(%arg2)
709.else
710	pxor	%xmm1, %xmm9			# Plaintext XOR E(K, Yn)
711
712	mov	\PLAIN_CYPH_LEN, %r10
713	add	%r13, %r10
714	# Set r10 to be the amount of data left in PLAIN_CYPH_IN after filling
715	sub	$16, %r10
716	# Determine if the partial block is not being filled and
717	# shift the mask accordingly
718	jge	_no_extra_mask_2_\@
719	sub	%r10, %r12
720_no_extra_mask_2_\@:
721
722	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
723	# get the appropriate mask to mask out bottom r13 bytes of xmm9
724	pand	%xmm1, %xmm9
725
726	movdqa	SHUF_MASK(%rip), %xmm1
727	PSHUFB_XMM %xmm1, %xmm9
728	PSHUFB_XMM %xmm2, %xmm9
729	pxor	%xmm9, \AAD_HASH
730
731	cmp	$0, %r10
732	jl	_partial_incomplete_2_\@
733
734	# GHASH computation for the last <16 Byte block
735	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
736	xor	%eax, %eax
737
738	mov	%rax, PBlockLen(%arg2)
739	jmp	_encode_done_\@
740_partial_incomplete_2_\@:
741	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
742_encode_done_\@:
743	movdqu	\AAD_HASH, AadHash(%arg2)
744
745	movdqa	SHUF_MASK(%rip), %xmm10
746	# shuffle xmm9 back to output as ciphertext
747	PSHUFB_XMM	%xmm10, %xmm9
748	PSHUFB_XMM	%xmm2, %xmm9
749.endif
750	# output encrypted Bytes
751	cmp	$0, %r10
752	jl	_partial_fill_\@
753	mov	%r13, %r12
754	mov	$16, %r13
755	# Set r13 to be the number of bytes to write out
756	sub	%r12, %r13
757	jmp	_count_set_\@
758_partial_fill_\@:
759	mov	\PLAIN_CYPH_LEN, %r13
760_count_set_\@:
761	movdqa	%xmm9, %xmm0
762	MOVQ_R64_XMM	%xmm0, %rax
763	cmp	$8, %r13
764	jle	_less_than_8_bytes_left_\@
765
766	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
767	add	$8, \DATA_OFFSET
768	psrldq	$8, %xmm0
769	MOVQ_R64_XMM	%xmm0, %rax
770	sub	$8, %r13
771_less_than_8_bytes_left_\@:
772	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
773	add	$1, \DATA_OFFSET
774	shr	$8, %rax
775	sub	$1, %r13
776	jne	_less_than_8_bytes_left_\@
777_partial_block_done_\@:
778.endm # PARTIAL_BLOCK
779
780/*
781* if a = number of total plaintext bytes
782* b = floor(a/16)
783* num_initial_blocks = b mod 4
784* encrypt the initial num_initial_blocks blocks and apply ghash on
785* the ciphertext
786* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
787* are clobbered
788* %arg1, %arg2, %arg3 are used as pointers only, not modified
789*/
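# Example: for a 100-byte payload a = 100, b = floor(100/16) = 6, so
# num_initial_blocks = 6 mod 4 = 2 and GCM_ENC_DEC enters through
# _initial_num_blocks_is_2; the trailing 4 bytes (100 mod 16) are handled
# later by the <16-byte tail code in GCM_ENC_DEC.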
790
791
792.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
793	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
794	MOVADQ		SHUF_MASK(%rip), %xmm14
795
796	movdqu AadHash(%arg2), %xmm\i		    # load the current hash value
797
798	# start AES for num_initial_blocks blocks
799
800	movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0
801
802.if (\i == 5) || (\i == 6) || (\i == 7)
803
804	MOVADQ		ONE(%RIP),\TMP1
805	MOVADQ		0(%arg1),\TMP2
806.irpc index, \i_seq
807	paddd		\TMP1, \XMM0                 # INCR Y0
808.ifc \operation, dec
809        movdqa     \XMM0, %xmm\index
810.else
811	MOVADQ		\XMM0, %xmm\index
812.endif
813	PSHUFB_XMM	%xmm14, %xmm\index      # perform a 16 byte swap
814	pxor		\TMP2, %xmm\index
815.endr
816	lea	0x10(%arg1),%r10
817	mov	keysize,%eax
818	shr	$2,%eax				# 128->4, 192->6, 256->8
819	add	$5,%eax			      # 128->9, 192->11, 256->13
820
821aes_loop_initial_\@:
822	MOVADQ	(%r10),\TMP1
823.irpc	index, \i_seq
824	AESENC	\TMP1, %xmm\index
825.endr
826	add	$16,%r10
827	sub	$1,%eax
828	jnz	aes_loop_initial_\@
829
830	MOVADQ	(%r10), \TMP1
831.irpc index, \i_seq
832	AESENCLAST \TMP1, %xmm\index         # Last Round
833.endr
834.irpc index, \i_seq
835	movdqu	   (%arg4 , %r11, 1), \TMP1
836	pxor	   \TMP1, %xmm\index
837	movdqu	   %xmm\index, (%arg3 , %r11, 1)
838	# write back plaintext/ciphertext for num_initial_blocks
839	add	   $16, %r11
840
841.ifc \operation, dec
842	movdqa     \TMP1, %xmm\index
843.endif
844	PSHUFB_XMM	   %xmm14, %xmm\index
845
846		# prepare plaintext/ciphertext for GHASH computation
847.endr
848.endif
849
850        # apply GHASH on num_initial_blocks blocks
851
852.if \i == 5
853        pxor       %xmm5, %xmm6
854	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
855        pxor       %xmm6, %xmm7
856	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
857        pxor       %xmm7, %xmm8
858	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
859.elseif \i == 6
860        pxor       %xmm6, %xmm7
861	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
862        pxor       %xmm7, %xmm8
863	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
864.elseif \i == 7
865        pxor       %xmm7, %xmm8
866	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
867.endif
868	cmp	   $64, %r13
869	jl	_initial_blocks_done\@
870	# no need for precomputed values
871/*
872*
873* Precomputations for HashKey parallel with encryption of first 4 blocks.
874* HashKey_i_k holds XORed values of the low and high parts of HashKey_i
875*/
876	MOVADQ	   ONE(%RIP),\TMP1
877	paddd	   \TMP1, \XMM0              # INCR Y0
878	MOVADQ	   \XMM0, \XMM1
879	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
880
881	paddd	   \TMP1, \XMM0              # INCR Y0
882	MOVADQ	   \XMM0, \XMM2
883	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
884
885	paddd	   \TMP1, \XMM0              # INCR Y0
886	MOVADQ	   \XMM0, \XMM3
887	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
888
889	paddd	   \TMP1, \XMM0              # INCR Y0
890	MOVADQ	   \XMM0, \XMM4
891	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
892
893	MOVADQ	   0(%arg1),\TMP1
894	pxor	   \TMP1, \XMM1
895	pxor	   \TMP1, \XMM2
896	pxor	   \TMP1, \XMM3
897	pxor	   \TMP1, \XMM4
898.irpc index, 1234 # do 4 rounds
899	movaps 0x10*\index(%arg1), \TMP1
900	AESENC	   \TMP1, \XMM1
901	AESENC	   \TMP1, \XMM2
902	AESENC	   \TMP1, \XMM3
903	AESENC	   \TMP1, \XMM4
904.endr
905.irpc index, 56789 # do next 5 rounds
906	movaps 0x10*\index(%arg1), \TMP1
907	AESENC	   \TMP1, \XMM1
908	AESENC	   \TMP1, \XMM2
909	AESENC	   \TMP1, \XMM3
910	AESENC	   \TMP1, \XMM4
911.endr
912	lea	   0xa0(%arg1),%r10
913	mov	   keysize,%eax
914	shr	   $2,%eax			# 128->4, 192->6, 256->8
915	sub	   $4,%eax			# 128->0, 192->2, 256->4
916	jz	   aes_loop_pre_done\@
917
918aes_loop_pre_\@:
919	MOVADQ	   (%r10),\TMP2
920.irpc	index, 1234
921	AESENC	   \TMP2, %xmm\index
922.endr
923	add	   $16,%r10
924	sub	   $1,%eax
925	jnz	   aes_loop_pre_\@
926
927aes_loop_pre_done\@:
928	MOVADQ	   (%r10), \TMP2
929	AESENCLAST \TMP2, \XMM1
930	AESENCLAST \TMP2, \XMM2
931	AESENCLAST \TMP2, \XMM3
932	AESENCLAST \TMP2, \XMM4
933	movdqu	   16*0(%arg4 , %r11 , 1), \TMP1
934	pxor	   \TMP1, \XMM1
935.ifc \operation, dec
936	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
937	movdqa     \TMP1, \XMM1
938.endif
939	movdqu	   16*1(%arg4 , %r11 , 1), \TMP1
940	pxor	   \TMP1, \XMM2
941.ifc \operation, dec
942	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
943	movdqa     \TMP1, \XMM2
944.endif
945	movdqu	   16*2(%arg4 , %r11 , 1), \TMP1
946	pxor	   \TMP1, \XMM3
947.ifc \operation, dec
948	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
949	movdqa     \TMP1, \XMM3
950.endif
951	movdqu	   16*3(%arg4 , %r11 , 1), \TMP1
952	pxor	   \TMP1, \XMM4
953.ifc \operation, dec
954	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
955	movdqa     \TMP1, \XMM4
956.else
957	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
958	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
959	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
960	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
961.endif
962
963	add	   $64, %r11
964	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
965	pxor	   \XMMDst, \XMM1
966# combine GHASHed value with the corresponding ciphertext
967	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
968	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
969	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
970
971_initial_blocks_done\@:
972
973.endm
974
975/*
976* encrypt 4 blocks at a time
977* ghash the 4 previously encrypted ciphertext blocks
978* arg1, %arg3, %arg4 are used as pointers only, not modified
979* %r11 is the data offset value
980*/
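# Note: each iteration interleaves the AES rounds for the next four counter
# blocks with the Karatsuba GHASH of the four ciphertext blocks produced by
# the previous iteration, so the PCLMULQDQ and AESENC units can work in
# parallel; hashing of the blocks encrypted here is deferred to the next
# iteration (or to GHASH_LAST_4 for the final group).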
981.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
982TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
983
984	movdqa	  \XMM1, \XMM5
985	movdqa	  \XMM2, \XMM6
986	movdqa	  \XMM3, \XMM7
987	movdqa	  \XMM4, \XMM8
988
989        movdqa    SHUF_MASK(%rip), %xmm15
990        # multiply TMP5 * HashKey using karatsuba
991
992	movdqa	  \XMM5, \TMP4
993	pshufd	  $78, \XMM5, \TMP6
994	pxor	  \XMM5, \TMP6
995	paddd     ONE(%rip), \XMM0		# INCR CNT
996	movdqu	  HashKey_4(%arg2), \TMP5
997	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
998	movdqa    \XMM0, \XMM1
999	paddd     ONE(%rip), \XMM0		# INCR CNT
1000	movdqa    \XMM0, \XMM2
1001	paddd     ONE(%rip), \XMM0		# INCR CNT
1002	movdqa    \XMM0, \XMM3
1003	paddd     ONE(%rip), \XMM0		# INCR CNT
1004	movdqa    \XMM0, \XMM4
1005	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
1006	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1007	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
1008	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
1009	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
1010
1011	pxor	  (%arg1), \XMM1
1012	pxor	  (%arg1), \XMM2
1013	pxor	  (%arg1), \XMM3
1014	pxor	  (%arg1), \XMM4
1015	movdqu	  HashKey_4_k(%arg2), \TMP5
1016	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
1017	movaps 0x10(%arg1), \TMP1
1018	AESENC	  \TMP1, \XMM1              # Round 1
1019	AESENC	  \TMP1, \XMM2
1020	AESENC	  \TMP1, \XMM3
1021	AESENC	  \TMP1, \XMM4
1022	movaps 0x20(%arg1), \TMP1
1023	AESENC	  \TMP1, \XMM1              # Round 2
1024	AESENC	  \TMP1, \XMM2
1025	AESENC	  \TMP1, \XMM3
1026	AESENC	  \TMP1, \XMM4
1027	movdqa	  \XMM6, \TMP1
1028	pshufd	  $78, \XMM6, \TMP2
1029	pxor	  \XMM6, \TMP2
1030	movdqu	  HashKey_3(%arg2), \TMP5
1031	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
1032	movaps 0x30(%arg1), \TMP3
1033	AESENC    \TMP3, \XMM1              # Round 3
1034	AESENC    \TMP3, \XMM2
1035	AESENC    \TMP3, \XMM3
1036	AESENC    \TMP3, \XMM4
1037	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
1038	movaps 0x40(%arg1), \TMP3
1039	AESENC	  \TMP3, \XMM1              # Round 4
1040	AESENC	  \TMP3, \XMM2
1041	AESENC	  \TMP3, \XMM3
1042	AESENC	  \TMP3, \XMM4
1043	movdqu	  HashKey_3_k(%arg2), \TMP5
1044	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1045	movaps 0x50(%arg1), \TMP3
1046	AESENC	  \TMP3, \XMM1              # Round 5
1047	AESENC	  \TMP3, \XMM2
1048	AESENC	  \TMP3, \XMM3
1049	AESENC	  \TMP3, \XMM4
1050	pxor	  \TMP1, \TMP4
1051# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1052	pxor	  \XMM6, \XMM5
1053	pxor	  \TMP2, \TMP6
1054	movdqa	  \XMM7, \TMP1
1055	pshufd	  $78, \XMM7, \TMP2
1056	pxor	  \XMM7, \TMP2
1057	movdqu	  HashKey_2(%arg2), \TMP5
1058
1059        # Multiply TMP5 * HashKey using karatsuba
1060
1061	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
1062	movaps 0x60(%arg1), \TMP3
1063	AESENC	  \TMP3, \XMM1              # Round 6
1064	AESENC	  \TMP3, \XMM2
1065	AESENC	  \TMP3, \XMM3
1066	AESENC	  \TMP3, \XMM4
1067	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
1068	movaps 0x70(%arg1), \TMP3
1069	AESENC	  \TMP3, \XMM1             # Round 7
1070	AESENC	  \TMP3, \XMM2
1071	AESENC	  \TMP3, \XMM3
1072	AESENC	  \TMP3, \XMM4
1073	movdqu	  HashKey_2_k(%arg2), \TMP5
1074	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1075	movaps 0x80(%arg1), \TMP3
1076	AESENC	  \TMP3, \XMM1             # Round 8
1077	AESENC	  \TMP3, \XMM2
1078	AESENC	  \TMP3, \XMM3
1079	AESENC	  \TMP3, \XMM4
1080	pxor	  \TMP1, \TMP4
1081# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1082	pxor	  \XMM7, \XMM5
1083	pxor	  \TMP2, \TMP6
1084
1085        # Multiply XMM8 * HashKey
1086        # XMM8 and TMP5 hold the values for the two operands
1087
1088	movdqa	  \XMM8, \TMP1
1089	pshufd	  $78, \XMM8, \TMP2
1090	pxor	  \XMM8, \TMP2
1091	movdqu	  HashKey(%arg2), \TMP5
1092	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
1093	movaps 0x90(%arg1), \TMP3
1094	AESENC	  \TMP3, \XMM1            # Round 9
1095	AESENC	  \TMP3, \XMM2
1096	AESENC	  \TMP3, \XMM3
1097	AESENC	  \TMP3, \XMM4
1098	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
1099	lea	  0xa0(%arg1),%r10
1100	mov	  keysize,%eax
1101	shr	  $2,%eax			# 128->4, 192->6, 256->8
1102	sub	  $4,%eax			# 128->0, 192->2, 256->4
1103	jz	  aes_loop_par_enc_done\@
1104
1105aes_loop_par_enc\@:
1106	MOVADQ	  (%r10),\TMP3
1107.irpc	index, 1234
1108	AESENC	  \TMP3, %xmm\index
1109.endr
1110	add	  $16,%r10
1111	sub	  $1,%eax
1112	jnz	  aes_loop_par_enc\@
1113
1114aes_loop_par_enc_done\@:
1115	MOVADQ	  (%r10), \TMP3
1116	AESENCLAST \TMP3, \XMM1           # Round 10
1117	AESENCLAST \TMP3, \XMM2
1118	AESENCLAST \TMP3, \XMM3
1119	AESENCLAST \TMP3, \XMM4
1120	movdqu    HashKey_k(%arg2), \TMP5
1121	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1122	movdqu	  (%arg4,%r11,1), \TMP3
1123	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1124	movdqu	  16(%arg4,%r11,1), \TMP3
1125	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1126	movdqu	  32(%arg4,%r11,1), \TMP3
1127	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1128	movdqu	  48(%arg4,%r11,1), \TMP3
1129	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1130        movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
1131        movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
1132        movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
1133        movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
1134	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1135	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
1136	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
1137	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
1138
1139	pxor	  \TMP4, \TMP1
1140	pxor	  \XMM8, \XMM5
1141	pxor	  \TMP6, \TMP2
1142	pxor	  \TMP1, \TMP2
1143	pxor	  \XMM5, \TMP2
1144	movdqa	  \TMP2, \TMP3
1145	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1146	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1147	pxor	  \TMP3, \XMM5
1148	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1149
1150        # first phase of reduction
1151
1152	movdqa    \XMM5, \TMP2
1153	movdqa    \XMM5, \TMP3
1154	movdqa    \XMM5, \TMP4
1155# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1156	pslld     $31, \TMP2                   # packed right shift << 31
1157	pslld     $30, \TMP3                   # packed right shift << 30
1158	pslld     $25, \TMP4                   # packed right shift << 25
1159	pxor      \TMP3, \TMP2	               # xor the shifted versions
1160	pxor      \TMP4, \TMP2
1161	movdqa    \TMP2, \TMP5
1162	psrldq    $4, \TMP5                    # right shift T5 1 DW
1163	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1164	pxor      \TMP2, \XMM5
1165
1166        # second phase of reduction
1167
1168	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1169	movdqa    \XMM5,\TMP3
1170	movdqa    \XMM5,\TMP4
1171	psrld     $1, \TMP2                    # packed left shift >>1
1172	psrld     $2, \TMP3                    # packed left shift >>2
1173	psrld     $7, \TMP4                    # packed left shift >>7
1174	pxor      \TMP3,\TMP2		       # xor the shifted versions
1175	pxor      \TMP4,\TMP2
1176	pxor      \TMP5, \TMP2
1177	pxor      \TMP2, \XMM5
1178	pxor      \TMP1, \XMM5                 # result is in XMM5
1179
1180	pxor	  \XMM5, \XMM1
1181.endm
1182
1183/*
1184* decrypt 4 blocks at a time
1185* ghash the 4 previously decrypted ciphertext blocks
1186* arg1, %arg3, %arg4 are used as pointers only, not modified
1187* %r11 is the data offset value
1188*/
1189.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
1190TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1191
1192	movdqa	  \XMM1, \XMM5
1193	movdqa	  \XMM2, \XMM6
1194	movdqa	  \XMM3, \XMM7
1195	movdqa	  \XMM4, \XMM8
1196
1197        movdqa    SHUF_MASK(%rip), %xmm15
1198        # multiply TMP5 * HashKey using karatsuba
1199
1200	movdqa	  \XMM5, \TMP4
1201	pshufd	  $78, \XMM5, \TMP6
1202	pxor	  \XMM5, \TMP6
1203	paddd     ONE(%rip), \XMM0		# INCR CNT
1204	movdqu	  HashKey_4(%arg2), \TMP5
1205	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
1206	movdqa    \XMM0, \XMM1
1207	paddd     ONE(%rip), \XMM0		# INCR CNT
1208	movdqa    \XMM0, \XMM2
1209	paddd     ONE(%rip), \XMM0		# INCR CNT
1210	movdqa    \XMM0, \XMM3
1211	paddd     ONE(%rip), \XMM0		# INCR CNT
1212	movdqa    \XMM0, \XMM4
1213	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
1214	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1215	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
1216	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
1217	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
1218
1219	pxor	  (%arg1), \XMM1
1220	pxor	  (%arg1), \XMM2
1221	pxor	  (%arg1), \XMM3
1222	pxor	  (%arg1), \XMM4
1223	movdqu	  HashKey_4_k(%arg2), \TMP5
1224	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
1225	movaps 0x10(%arg1), \TMP1
1226	AESENC	  \TMP1, \XMM1              # Round 1
1227	AESENC	  \TMP1, \XMM2
1228	AESENC	  \TMP1, \XMM3
1229	AESENC	  \TMP1, \XMM4
1230	movaps 0x20(%arg1), \TMP1
1231	AESENC	  \TMP1, \XMM1              # Round 2
1232	AESENC	  \TMP1, \XMM2
1233	AESENC	  \TMP1, \XMM3
1234	AESENC	  \TMP1, \XMM4
1235	movdqa	  \XMM6, \TMP1
1236	pshufd	  $78, \XMM6, \TMP2
1237	pxor	  \XMM6, \TMP2
1238	movdqu	  HashKey_3(%arg2), \TMP5
1239	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
1240	movaps 0x30(%arg1), \TMP3
1241	AESENC    \TMP3, \XMM1              # Round 3
1242	AESENC    \TMP3, \XMM2
1243	AESENC    \TMP3, \XMM3
1244	AESENC    \TMP3, \XMM4
1245	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
1246	movaps 0x40(%arg1), \TMP3
1247	AESENC	  \TMP3, \XMM1              # Round 4
1248	AESENC	  \TMP3, \XMM2
1249	AESENC	  \TMP3, \XMM3
1250	AESENC	  \TMP3, \XMM4
1251	movdqu	  HashKey_3_k(%arg2), \TMP5
1252	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1253	movaps 0x50(%arg1), \TMP3
1254	AESENC	  \TMP3, \XMM1              # Round 5
1255	AESENC	  \TMP3, \XMM2
1256	AESENC	  \TMP3, \XMM3
1257	AESENC	  \TMP3, \XMM4
1258	pxor	  \TMP1, \TMP4
1259# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1260	pxor	  \XMM6, \XMM5
1261	pxor	  \TMP2, \TMP6
1262	movdqa	  \XMM7, \TMP1
1263	pshufd	  $78, \XMM7, \TMP2
1264	pxor	  \XMM7, \TMP2
1265	movdqu	  HashKey_2(%arg2), \TMP5
1266
1267        # Multiply TMP5 * HashKey using karatsuba
1268
1269	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
1270	movaps 0x60(%arg1), \TMP3
1271	AESENC	  \TMP3, \XMM1              # Round 6
1272	AESENC	  \TMP3, \XMM2
1273	AESENC	  \TMP3, \XMM3
1274	AESENC	  \TMP3, \XMM4
1275	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
1276	movaps 0x70(%arg1), \TMP3
1277	AESENC	  \TMP3, \XMM1             # Round 7
1278	AESENC	  \TMP3, \XMM2
1279	AESENC	  \TMP3, \XMM3
1280	AESENC	  \TMP3, \XMM4
1281	movdqu	  HashKey_2_k(%arg2), \TMP5
1282	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1283	movaps 0x80(%arg1), \TMP3
1284	AESENC	  \TMP3, \XMM1             # Round 8
1285	AESENC	  \TMP3, \XMM2
1286	AESENC	  \TMP3, \XMM3
1287	AESENC	  \TMP3, \XMM4
1288	pxor	  \TMP1, \TMP4
1289# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1290	pxor	  \XMM7, \XMM5
1291	pxor	  \TMP2, \TMP6
1292
1293        # Multiply XMM8 * HashKey
1294        # XMM8 and TMP5 hold the values for the two operands
1295
1296	movdqa	  \XMM8, \TMP1
1297	pshufd	  $78, \XMM8, \TMP2
1298	pxor	  \XMM8, \TMP2
1299	movdqu	  HashKey(%arg2), \TMP5
1300	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
1301	movaps 0x90(%arg1), \TMP3
1302	AESENC	  \TMP3, \XMM1            # Round 9
1303	AESENC	  \TMP3, \XMM2
1304	AESENC	  \TMP3, \XMM3
1305	AESENC	  \TMP3, \XMM4
1306	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
1307	lea	  0xa0(%arg1),%r10
1308	mov	  keysize,%eax
1309	shr	  $2,%eax		        # 128->4, 192->6, 256->8
1310	sub	  $4,%eax			# 128->0, 192->2, 256->4
1311	jz	  aes_loop_par_dec_done\@
1312
1313aes_loop_par_dec\@:
1314	MOVADQ	  (%r10),\TMP3
1315.irpc	index, 1234
1316	AESENC	  \TMP3, %xmm\index
1317.endr
1318	add	  $16,%r10
1319	sub	  $1,%eax
1320	jnz	  aes_loop_par_dec\@
1321
1322aes_loop_par_dec_done\@:
1323	MOVADQ	  (%r10), \TMP3
1324	AESENCLAST \TMP3, \XMM1           # last round
1325	AESENCLAST \TMP3, \XMM2
1326	AESENCLAST \TMP3, \XMM3
1327	AESENCLAST \TMP3, \XMM4
1328	movdqu    HashKey_k(%arg2), \TMP5
1329	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1330	movdqu	  (%arg4,%r11,1), \TMP3
1331	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1332	movdqu	  \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
1333	movdqa    \TMP3, \XMM1
1334	movdqu	  16(%arg4,%r11,1), \TMP3
1335	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1336	movdqu	  \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
1337	movdqa    \TMP3, \XMM2
1338	movdqu	  32(%arg4,%r11,1), \TMP3
1339	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1340	movdqu	  \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
1341	movdqa    \TMP3, \XMM3
1342	movdqu	  48(%arg4,%r11,1), \TMP3
1343	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1344	movdqu	  \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
1345	movdqa    \TMP3, \XMM4
1346	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1347	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
1348	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
1349	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
1350
1351	pxor	  \TMP4, \TMP1
1352	pxor	  \XMM8, \XMM5
1353	pxor	  \TMP6, \TMP2
1354	pxor	  \TMP1, \TMP2
1355	pxor	  \XMM5, \TMP2
1356	movdqa	  \TMP2, \TMP3
1357	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1358	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1359	pxor	  \TMP3, \XMM5
1360	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1361
1362        # first phase of reduction
1363
1364	movdqa    \XMM5, \TMP2
1365	movdqa    \XMM5, \TMP3
1366	movdqa    \XMM5, \TMP4
1367# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1368	pslld     $31, \TMP2                   # packed right shift << 31
1369	pslld     $30, \TMP3                   # packed right shift << 30
1370	pslld     $25, \TMP4                   # packed right shift << 25
1371	pxor      \TMP3, \TMP2	               # xor the shifted versions
1372	pxor      \TMP4, \TMP2
1373	movdqa    \TMP2, \TMP5
1374	psrldq    $4, \TMP5                    # right shift T5 1 DW
1375	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1376	pxor      \TMP2, \XMM5
1377
1378        # second phase of reduction
1379
1380	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1381	movdqa    \XMM5,\TMP3
1382	movdqa    \XMM5,\TMP4
1383	psrld     $1, \TMP2                    # packed left shift >>1
1384	psrld     $2, \TMP3                    # packed left shift >>2
1385	psrld     $7, \TMP4                    # packed left shift >>7
1386	pxor      \TMP3,\TMP2		       # xor the shifted versions
1387	pxor      \TMP4,\TMP2
1388	pxor      \TMP5, \TMP2
1389	pxor      \TMP2, \XMM5
1390	pxor      \TMP1, \XMM5                 # result is in XMM5
1391
1392	pxor	  \XMM5, \XMM1
1393.endm
1394
1395/* GHASH the last 4 ciphertext blocks. */
1396.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1397TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1398
1399        # Multiply XMM1 * HashKey_4 (using Karatsuba)
1400
1401	movdqa	  \XMM1, \TMP6
1402	pshufd	  $78, \XMM1, \TMP2
1403	pxor	  \XMM1, \TMP2
1404	movdqu	  HashKey_4(%arg2), \TMP5
1405	PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1406	PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1407	movdqu	  HashKey_4_k(%arg2), \TMP4
1408	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1409	movdqa	  \XMM1, \XMMDst
1410	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1411
1412        # Multiply XMM2 * HashKey_3 (using Karatsuba)
1413
1414	movdqa	  \XMM2, \TMP1
1415	pshufd	  $78, \XMM2, \TMP2
1416	pxor	  \XMM2, \TMP2
1417	movdqu	  HashKey_3(%arg2), \TMP5
1418	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1419	PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1420	movdqu	  HashKey_3_k(%arg2), \TMP4
1421	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1422	pxor	  \TMP1, \TMP6
1423	pxor	  \XMM2, \XMMDst
1424	pxor	  \TMP2, \XMM1
1425# results accumulated in TMP6, XMMDst, XMM1
1426
1427        # Multiply XMM3 * HashKey_2 (using Karatsuba)
1428
1429	movdqa	  \XMM3, \TMP1
1430	pshufd	  $78, \XMM3, \TMP2
1431	pxor	  \XMM3, \TMP2
1432	movdqu	  HashKey_2(%arg2), \TMP5
1433	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1434	PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1435	movdqu	  HashKey_2_k(%arg2), \TMP4
1436	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1437	pxor	  \TMP1, \TMP6
1438	pxor	  \XMM3, \XMMDst
1439	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1440
1441        # Multiply XMM4 * HashKey (using Karatsuba)
1442	movdqa	  \XMM4, \TMP1
1443	pshufd	  $78, \XMM4, \TMP2
1444	pxor	  \XMM4, \TMP2
1445	movdqu	  HashKey(%arg2), \TMP5
1446	PCLMULQDQ 0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
1447	PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1448	movdqu	  HashKey_k(%arg2), \TMP4
1449	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1450	pxor	  \TMP1, \TMP6
1451	pxor	  \XMM4, \XMMDst
1452	pxor	  \XMM1, \TMP2
1453	pxor	  \TMP6, \TMP2
1454	pxor	  \XMMDst, \TMP2
1455	# middle section of the temp results combined as in karatsuba algorithm
1456	movdqa	  \TMP2, \TMP4
1457	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
1458	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
1459	pxor	  \TMP4, \XMMDst
1460	pxor	  \TMP2, \TMP6
1461# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1462	# first phase of the reduction
1463	movdqa    \XMMDst, \TMP2
1464	movdqa    \XMMDst, \TMP3
1465	movdqa    \XMMDst, \TMP4
1466# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1467	pslld     $31, \TMP2                # packed right shifting << 31
1468	pslld     $30, \TMP3                # packed right shifting << 30
1469	pslld     $25, \TMP4                # packed right shifting << 25
1470	pxor      \TMP3, \TMP2              # xor the shifted versions
1471	pxor      \TMP4, \TMP2
1472	movdqa    \TMP2, \TMP7
1473	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1474	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1475	pxor      \TMP2, \XMMDst
1476
1477        # second phase of the reduction
1478	movdqa    \XMMDst, \TMP2
1479	# make 3 copies of XMMDst for doing 3 shift operations
1480	movdqa    \XMMDst, \TMP3
1481	movdqa    \XMMDst, \TMP4
1482	psrld     $1, \TMP2                 # packed left shift >> 1
1483	psrld     $2, \TMP3                 # packed left shift >> 2
1484	psrld     $7, \TMP4                 # packed left shift >> 7
1485	pxor      \TMP3, \TMP2              # xor the shifted versions
1486	pxor      \TMP4, \TMP2
1487	pxor      \TMP7, \TMP2
1488	pxor      \TMP2, \XMMDst
1489	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1490.endm
1491
1492
1493/* Encryption of a single block
1494* uses eax & r10
1495*/
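# Note: keysize evaluates to the key length in bytes (16, 24 or 32) stored in
# the expanded-key context, so keysize/4 + 5 yields 9, 11 or 13 inner AESENC
# rounds; together with the initial whitening XOR and the final AESENCLAST
# this matches the 10/12/14 rounds of AES-128/192/256.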
1496
1497.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1498
1499	pxor		(%arg1), \XMM0
1500	mov		keysize,%eax
1501	shr		$2,%eax			# 128->4, 192->6, 256->8
1502	add		$5,%eax			# 128->9, 192->11, 256->13
1503	lea		16(%arg1), %r10	  # get first expanded key address
1504
1505_esb_loop_\@:
1506	MOVADQ		(%r10),\TMP1
1507	AESENC		\TMP1,\XMM0
1508	add		$16,%r10
1509	sub		$1,%eax
1510	jnz		_esb_loop_\@
1511
1512	MOVADQ		(%r10),\TMP1
1513	AESENCLAST	\TMP1,\XMM0
1514.endm
1515/*****************************************************************************
1516* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1517*                   struct gcm_context_data *data
1518*                                      // Context data
1519*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
1520*                   const u8 *in,      // Ciphertext input
1521*                   u64 plaintext_len, // Length of data in bytes for decryption.
1522*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1523*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1524*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1525*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1526*                   const u8 *aad,     // Additional Authentication Data (AAD)
1527*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1528*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1529*                                      // given authentication tag and only return the plaintext if they match.
1530*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1531*                                      // (most likely), 12 or 8.
1532*
1533* Assumptions:
1534*
1535* keys:
1536*       keys are pre-expanded and aligned to 16 bytes. we are using the first
1537*       set of 11 keys in the data structure void *aes_ctx
1538*
1539* iv:
1540*       0                   1                   2                   3
1541*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1542*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1543*       |                             Salt  (From the SA)               |
1544*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1545*       |                     Initialization Vector                     |
1546*       |         (This is the sequence number from IPSec header)       |
1547*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1548*       |                              0x1                              |
1549*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1550*
1551*
1552*
1553* AAD:
1554*       AAD padded to 128 bits with 0
1555*       for example, assume AAD is a u32 vector
1556*
1557*       if AAD is 8 bytes:
1558*       AAD[3] = {A0, A1};
1559*       padded AAD in xmm register = {A1 A0 0 0}
1560*
1561*       0                   1                   2                   3
1562*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1563*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1564*       |                               SPI (A1)                        |
1565*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1566*       |                     32-bit Sequence Number (A0)               |
1567*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1568*       |                              0x0                              |
1569*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1570*
1571*                                       AAD Format with 32-bit Sequence Number
1572*
1573*       if AAD is 12 bytes:
1574*       AAD[3] = {A0, A1, A2};
1575*       padded AAD in xmm register = {A2 A1 A0 0}
1576*
1577*       0                   1                   2                   3
1578*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1579*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1582*       |                               SPI (A2)                        |
1583*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1584*       |                 64-bit Extended Sequence Number {A1,A0}       |
1585*       |                                                               |
1586*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1587*       |                              0x0                              |
1588*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1589*
1590*                        AAD Format with 64-bit Extended Sequence Number
1591*
1592* poly = x^128 + x^127 + x^126 + x^121 + 1
1593*
1594*****************************************************************************/
1595SYM_FUNC_START(aesni_gcm_dec)
1596	FUNC_SAVE
1597
1598	GCM_INIT %arg6, arg7, arg8, arg9
1599	GCM_ENC_DEC dec
1600	GCM_COMPLETE arg10, arg11
1601	FUNC_RESTORE
1602	ret
1603SYM_FUNC_END(aesni_gcm_dec)
1604
1605
1606/*****************************************************************************
1607* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1608*                    struct gcm_context_data *data
1609*                                        // Context data
1610*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1611*                    const u8 *in,       // Plaintext input
1612*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1613*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1614*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1615*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1616*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1617*                    const u8 *aad,      // Additional Authentication Data (AAD)
1618*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1619*                    u8 *auth_tag,       // Authenticated Tag output.
1620*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1621*                                        // 12 or 8.
1622*
1623* Assumptions:
1624*
1625* keys:
1626*       keys are pre-expanded and aligned to 16 bytes. we are using the
1627*       first set of 11 keys in the data structure void *aes_ctx
1628*
1629*
1630* iv:
1631*       0                   1                   2                   3
1632*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1633*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1634*       |                             Salt  (From the SA)               |
1635*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1636*       |                     Initialization Vector                     |
1637*       |         (This is the sequence number from IPSec header)       |
1638*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1639*       |                              0x1                              |
1640*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1641*
1642*
1643*
1644* AAD:
1645*       AAD padded to 128 bits with 0
1646*       for example, assume AAD is a u32 vector
1647*
1648*       if AAD is 8 bytes:
1649*       AAD[3] = {A0, A1};
1650*       padded AAD in xmm register = {A1 A0 0 0}
1651*
1652*       0                   1                   2                   3
1653*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1654*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1655*       |                               SPI (A1)                        |
1656*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1657*       |                     32-bit Sequence Number (A0)               |
1658*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1659*       |                              0x0                              |
1660*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1661*
1662*                                 AAD Format with 32-bit Sequence Number
1663*
1664*       if AAD is 12 bytes:
1665*       AAD[3] = {A0, A1, A2};
1666*       padded AAD in xmm register = {A2 A1 A0 0}
1667*
1668*       0                   1                   2                   3
1669*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1670*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1671*       |                               SPI (A2)                        |
1672*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1673*       |                 64-bit Extended Sequence Number {A1,A0}       |
1674*       |                                                               |
1675*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1676*       |                              0x0                              |
1677*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1678*
1679*                         AAD Format with 64-bit Extended Sequence Number
1680*
1681* poly = x^128 + x^127 + x^126 + x^121 + 1
1682***************************************************************************/
1683SYM_FUNC_START(aesni_gcm_enc)
1684	FUNC_SAVE
1685
1686	GCM_INIT %arg6, arg7, arg8, arg9
1687	GCM_ENC_DEC enc
1688
1689	GCM_COMPLETE arg10, arg11
1690	FUNC_RESTORE
1691	ret
1692SYM_FUNC_END(aesni_gcm_enc)
1693
1694/*****************************************************************************
1695* void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1696*                     struct gcm_context_data *data,
1697*                                         // context data
1698*                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1699*                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1700*                                         // concatenated with 0x00000001. 16-byte aligned pointer.
1701*                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1702*                     const u8 *aad,      // Additional Authentication Data (AAD)
1703*                     u64 aad_len)        // Length of AAD in bytes.
1704*/
1705SYM_FUNC_START(aesni_gcm_init)
1706	FUNC_SAVE
1707	GCM_INIT %arg3, %arg4, %arg5, %arg6
1708	FUNC_RESTORE
1709	ret
1710SYM_FUNC_END(aesni_gcm_init)
1711
1712/*****************************************************************************
1713* void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1714*                    struct gcm_context_data *data,
1715*                                        // context data
1716*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1717*                    const u8 *in,       // Plaintext input
1718*                    u64 plaintext_len); // Length of data in bytes for encryption.
1719*/
1720SYM_FUNC_START(aesni_gcm_enc_update)
1721	FUNC_SAVE
1722	GCM_ENC_DEC enc
1723	FUNC_RESTORE
1724	ret
1725SYM_FUNC_END(aesni_gcm_enc_update)
1726
1727/*****************************************************************************
1728* void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1729*                    struct gcm_context_data *data,
1730*                                        // context data
1731*                    u8 *out,            // Plaintext output. Decrypt in-place is allowed.
1732*                    const u8 *in,       // Ciphertext input
1733*                    u64 plaintext_len); // Length of data in bytes for decryption.
1734*/
1735SYM_FUNC_START(aesni_gcm_dec_update)
1736	FUNC_SAVE
1737	GCM_ENC_DEC dec
1738	FUNC_RESTORE
1739	ret
1740SYM_FUNC_END(aesni_gcm_dec_update)
1741
1742/*****************************************************************************
1743* void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1744*                    struct gcm_context_data *data,
1745*                                        // context data
1746*                    u8 *auth_tag,       // Authenticated Tag output.
1747*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1748*                                        // 12 or 8.
1749*/
1750SYM_FUNC_START(aesni_gcm_finalize)
1751	FUNC_SAVE
1752	GCM_COMPLETE %arg3, %arg4
1753	FUNC_RESTORE
1754	ret
1755SYM_FUNC_END(aesni_gcm_finalize)
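
/*
 * Typical call sequence for the split init/update/finalize interface above,
 * as a minimal C sketch (variable names are illustrative, not taken from the
 * glue code; the caller is assumed to own the FPU via kernel_fpu_begin()/
 * kernel_fpu_end() since these routines clobber XMM state):
 *
 *	kernel_fpu_begin();
 *	aesni_gcm_init(aes_ctx, &gdata, iv, hash_subkey, aad, aad_len);
 *	aesni_gcm_enc_update(aes_ctx, &gdata, dst, src, len);	// may be repeated
 *	aesni_gcm_finalize(aes_ctx, &gdata, auth_tag, auth_tag_len);
 *	kernel_fpu_end();
 *
 * aesni_gcm_enc()/aesni_gcm_dec() above perform the same three steps
 * (GCM_INIT, GCM_ENC_DEC, GCM_COMPLETE) in a single call.
 */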
1756
1757#endif
1758
1759
1760SYM_FUNC_START_LOCAL_ALIAS(_key_expansion_128)
1761SYM_FUNC_START_LOCAL(_key_expansion_256a)
1762	pshufd $0b11111111, %xmm1, %xmm1
1763	shufps $0b00010000, %xmm0, %xmm4
1764	pxor %xmm4, %xmm0
1765	shufps $0b10001100, %xmm0, %xmm4
1766	pxor %xmm4, %xmm0
1767	pxor %xmm1, %xmm0
1768	movaps %xmm0, (TKEYP)
1769	add $0x10, TKEYP
1770	ret
1771SYM_FUNC_END(_key_expansion_256a)
1772SYM_FUNC_END_ALIAS(_key_expansion_128)
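
/*
 * For reference, the key-schedule round computed by _key_expansion_128 /
 * _key_expansion_256a, as a C sketch: w0..w3 are the previous round key
 * words (in %xmm0) and t is the SubWord(RotWord())^rcon value that
 * AESKEYGENASSIST leaves in dword 3 of %xmm1 (broadcast by the pshufd):
 *
 *	w4 = w0 ^ t;
 *	w5 = w1 ^ w4;
 *	w6 = w2 ^ w5;
 *	w7 = w3 ^ w6;
 *
 * The two shufps/pxor pairs build the running XOR {w0, w0^w1, w0^w1^w2,
 * w0^w1^w2^w3} in %xmm0 (only dword 0 of %xmm4 needs to stay zero for this),
 * and the final pxor with the broadcast t produces the next round key.
 */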
1773
1774SYM_FUNC_START_LOCAL(_key_expansion_192a)
1775	pshufd $0b01010101, %xmm1, %xmm1
1776	shufps $0b00010000, %xmm0, %xmm4
1777	pxor %xmm4, %xmm0
1778	shufps $0b10001100, %xmm0, %xmm4
1779	pxor %xmm4, %xmm0
1780	pxor %xmm1, %xmm0
1781
1782	movaps %xmm2, %xmm5
1783	movaps %xmm2, %xmm6
1784	pslldq $4, %xmm5
1785	pshufd $0b11111111, %xmm0, %xmm3
1786	pxor %xmm3, %xmm2
1787	pxor %xmm5, %xmm2
1788
1789	movaps %xmm0, %xmm1
1790	shufps $0b01000100, %xmm0, %xmm6
1791	movaps %xmm6, (TKEYP)
1792	shufps $0b01001110, %xmm2, %xmm1
1793	movaps %xmm1, 0x10(TKEYP)
1794	add $0x20, TKEYP
1795	ret
1796SYM_FUNC_END(_key_expansion_192a)
1797
1798SYM_FUNC_START_LOCAL(_key_expansion_192b)
1799	pshufd $0b01010101, %xmm1, %xmm1
1800	shufps $0b00010000, %xmm0, %xmm4
1801	pxor %xmm4, %xmm0
1802	shufps $0b10001100, %xmm0, %xmm4
1803	pxor %xmm4, %xmm0
1804	pxor %xmm1, %xmm0
1805
1806	movaps %xmm2, %xmm5
1807	pslldq $4, %xmm5
1808	pshufd $0b11111111, %xmm0, %xmm3
1809	pxor %xmm3, %xmm2
1810	pxor %xmm5, %xmm2
1811
1812	movaps %xmm0, (TKEYP)
1813	add $0x10, TKEYP
1814	ret
1815SYM_FUNC_END(_key_expansion_192b)
1816
1817SYM_FUNC_START_LOCAL(_key_expansion_256b)
1818	pshufd $0b10101010, %xmm1, %xmm1
1819	shufps $0b00010000, %xmm2, %xmm4
1820	pxor %xmm4, %xmm2
1821	shufps $0b10001100, %xmm2, %xmm4
1822	pxor %xmm4, %xmm2
1823	pxor %xmm1, %xmm2
1824	movaps %xmm2, (TKEYP)
1825	add $0x10, TKEYP
1826	ret
1827SYM_FUNC_END(_key_expansion_256b)
1828
1829/*
1830 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1831 *                   unsigned int key_len)
1832 */
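/*
 * The 240 and 480 byte offsets used below match the layout of struct
 * crypto_aes_ctx from <crypto/aes.h>, roughly (sketch; 60 u32s come from
 * AES_MAX_KEYLENGTH = 240 bytes):
 *
 *	struct crypto_aes_ctx {
 *		u32 key_enc[60];	// encryption round keys, offset 0
 *		u32 key_dec[60];	// decryption round keys, offset 240
 *		u32 key_length;		// key length in bytes, offset 480
 *	};
 */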
1833SYM_FUNC_START(aesni_set_key)
1834	FRAME_BEGIN
1835#ifndef __x86_64__
1836	pushl KEYP
1837	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
1838	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
1839	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
1840#endif
1841	movups (UKEYP), %xmm0		# user key (first 16 bytes)
1842	movaps %xmm0, (KEYP)
1843	lea 0x10(KEYP), TKEYP		# key addr
1844	movl %edx, 480(KEYP)
1845	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
1846	cmp $24, %dl
1847	jb .Lenc_key128
1848	je .Lenc_key192
1849	movups 0x10(UKEYP), %xmm2	# other user key
1850	movaps %xmm2, (TKEYP)
1851	add $0x10, TKEYP
1852	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
1853	call _key_expansion_256a
1854	AESKEYGENASSIST 0x1 %xmm0 %xmm1
1855	call _key_expansion_256b
1856	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
1857	call _key_expansion_256a
1858	AESKEYGENASSIST 0x2 %xmm0 %xmm1
1859	call _key_expansion_256b
1860	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
1861	call _key_expansion_256a
1862	AESKEYGENASSIST 0x4 %xmm0 %xmm1
1863	call _key_expansion_256b
1864	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
1865	call _key_expansion_256a
1866	AESKEYGENASSIST 0x8 %xmm0 %xmm1
1867	call _key_expansion_256b
1868	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
1869	call _key_expansion_256a
1870	AESKEYGENASSIST 0x10 %xmm0 %xmm1
1871	call _key_expansion_256b
1872	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
1873	call _key_expansion_256a
1874	AESKEYGENASSIST 0x20 %xmm0 %xmm1
1875	call _key_expansion_256b
1876	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
1877	call _key_expansion_256a
1878	jmp .Ldec_key
1879.Lenc_key192:
1880	movq 0x10(UKEYP), %xmm2		# other user key
1881	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
1882	call _key_expansion_192a
1883	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
1884	call _key_expansion_192b
1885	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
1886	call _key_expansion_192a
1887	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
1888	call _key_expansion_192b
1889	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
1890	call _key_expansion_192a
1891	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
1892	call _key_expansion_192b
1893	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
1894	call _key_expansion_192a
1895	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
1896	call _key_expansion_192b
1897	jmp .Ldec_key
1898.Lenc_key128:
1899	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
1900	call _key_expansion_128
1901	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
1902	call _key_expansion_128
1903	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
1904	call _key_expansion_128
1905	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
1906	call _key_expansion_128
1907	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
1908	call _key_expansion_128
1909	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
1910	call _key_expansion_128
1911	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
1912	call _key_expansion_128
1913	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
1914	call _key_expansion_128
1915	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
1916	call _key_expansion_128
1917	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
1918	call _key_expansion_128
1919.Ldec_key:
1920	sub $0x10, TKEYP
1921	movaps (KEYP), %xmm0
1922	movaps (TKEYP), %xmm1
1923	movaps %xmm0, 240(TKEYP)
1924	movaps %xmm1, 240(KEYP)
1925	add $0x10, KEYP
1926	lea 240-16(TKEYP), UKEYP
1927.align 4
1928.Ldec_key_loop:
1929	movaps (KEYP), %xmm0
1930	AESIMC %xmm0 %xmm1
1931	movaps %xmm1, (UKEYP)
1932	add $0x10, KEYP
1933	sub $0x10, UKEYP
1934	cmp TKEYP, KEYP
1935	jb .Ldec_key_loop
1936	xor AREG, AREG
1937#ifndef __x86_64__
1938	popl KEYP
1939#endif
1940	FRAME_END
1941	ret
1942SYM_FUNC_END(aesni_set_key)
1943
1944/*
1945 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
1946 */
1947SYM_FUNC_START(aesni_enc)
1948	FRAME_BEGIN
1949#ifndef __x86_64__
1950	pushl KEYP
1951	pushl KLEN
1952	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
1953	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
1954	movl (FRAME_OFFSET+20)(%esp), INP	# src
1955#endif
1956	movl 480(KEYP), KLEN		# key length
1957	movups (INP), STATE		# input
1958	call _aesni_enc1
1959	movups STATE, (OUTP)		# output
1960#ifndef __x86_64__
1961	popl KLEN
1962	popl KEYP
1963#endif
1964	FRAME_END
1965	ret
1966SYM_FUNC_END(aesni_enc)
1967
1968/*
1969 * _aesni_enc1:		internal ABI
1970 * input:
1971 *	KEYP:		key struct pointer
1972 *	KLEN:		key length
1973 *	STATE:		initial state (input)
1974 * output:
1975 *	STATE:		final state (output)
1976 * changed:
1977 *	KEY
1978 *	TKEYP (T1)
1979 */
1980SYM_FUNC_START_LOCAL(_aesni_enc1)
1981	movaps (KEYP), KEY		# key
1982	mov KEYP, TKEYP
1983	pxor KEY, STATE		# round 0
1984	add $0x30, TKEYP
1985	cmp $24, KLEN
1986	jb .Lenc128
1987	lea 0x20(TKEYP), TKEYP
1988	je .Lenc192
1989	add $0x20, TKEYP
1990	movaps -0x60(TKEYP), KEY
1991	AESENC KEY STATE
1992	movaps -0x50(TKEYP), KEY
1993	AESENC KEY STATE
1994.align 4
1995.Lenc192:
1996	movaps -0x40(TKEYP), KEY
1997	AESENC KEY STATE
1998	movaps -0x30(TKEYP), KEY
1999	AESENC KEY STATE
2000.align 4
2001.Lenc128:
2002	movaps -0x20(TKEYP), KEY
2003	AESENC KEY STATE
2004	movaps -0x10(TKEYP), KEY
2005	AESENC KEY STATE
2006	movaps (TKEYP), KEY
2007	AESENC KEY STATE
2008	movaps 0x10(TKEYP), KEY
2009	AESENC KEY STATE
2010	movaps 0x20(TKEYP), KEY
2011	AESENC KEY STATE
2012	movaps 0x30(TKEYP), KEY
2013	AESENC KEY STATE
2014	movaps 0x40(TKEYP), KEY
2015	AESENC KEY STATE
2016	movaps 0x50(TKEYP), KEY
2017	AESENC KEY STATE
2018	movaps 0x60(TKEYP), KEY
2019	AESENC KEY STATE
2020	movaps 0x70(TKEYP), KEY
2021	AESENCLAST KEY STATE
2022	ret
2023SYM_FUNC_END(_aesni_enc1)
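
/*
 * Note on the round dispatch above: KLEN holds the key length in bytes, so
 * the usual AES round count follows as in this C sketch:
 *
 *	rounds = key_len / 4 + 6;	// 16 -> 10, 24 -> 12, 32 -> 14
 *
 * The cmp $24/jb/je sequence simply skips the two (192-bit) or four (256-bit)
 * extra leading rounds and falls through to the common ten-round tail.
 */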
2024
2025/*
2026 * _aesni_enc4:	internal ABI
2027 * input:
2028 *	KEYP:		key struct pointer
2029 *	KLEN:		key length
2030 *	STATE1:		initial state (input)
2031 *	STATE2
2032 *	STATE3
2033 *	STATE4
2034 * output:
2035 *	STATE1:		final state (output)
2036 *	STATE2
2037 *	STATE3
2038 *	STATE4
2039 * changed:
2040 *	KEY
2041 *	TKEYP (T1)
2042 */
2043SYM_FUNC_START_LOCAL(_aesni_enc4)
2044	movaps (KEYP), KEY		# key
2045	mov KEYP, TKEYP
2046	pxor KEY, STATE1		# round 0
2047	pxor KEY, STATE2
2048	pxor KEY, STATE3
2049	pxor KEY, STATE4
2050	add $0x30, TKEYP
2051	cmp $24, KLEN
2052	jb .L4enc128
2053	lea 0x20(TKEYP), TKEYP
2054	je .L4enc192
2055	add $0x20, TKEYP
2056	movaps -0x60(TKEYP), KEY
2057	AESENC KEY STATE1
2058	AESENC KEY STATE2
2059	AESENC KEY STATE3
2060	AESENC KEY STATE4
2061	movaps -0x50(TKEYP), KEY
2062	AESENC KEY STATE1
2063	AESENC KEY STATE2
2064	AESENC KEY STATE3
2065	AESENC KEY STATE4
2066#.align 4
2067.L4enc192:
2068	movaps -0x40(TKEYP), KEY
2069	AESENC KEY STATE1
2070	AESENC KEY STATE2
2071	AESENC KEY STATE3
2072	AESENC KEY STATE4
2073	movaps -0x30(TKEYP), KEY
2074	AESENC KEY STATE1
2075	AESENC KEY STATE2
2076	AESENC KEY STATE3
2077	AESENC KEY STATE4
2078#.align 4
2079.L4enc128:
2080	movaps -0x20(TKEYP), KEY
2081	AESENC KEY STATE1
2082	AESENC KEY STATE2
2083	AESENC KEY STATE3
2084	AESENC KEY STATE4
2085	movaps -0x10(TKEYP), KEY
2086	AESENC KEY STATE1
2087	AESENC KEY STATE2
2088	AESENC KEY STATE3
2089	AESENC KEY STATE4
2090	movaps (TKEYP), KEY
2091	AESENC KEY STATE1
2092	AESENC KEY STATE2
2093	AESENC KEY STATE3
2094	AESENC KEY STATE4
2095	movaps 0x10(TKEYP), KEY
2096	AESENC KEY STATE1
2097	AESENC KEY STATE2
2098	AESENC KEY STATE3
2099	AESENC KEY STATE4
2100	movaps 0x20(TKEYP), KEY
2101	AESENC KEY STATE1
2102	AESENC KEY STATE2
2103	AESENC KEY STATE3
2104	AESENC KEY STATE4
2105	movaps 0x30(TKEYP), KEY
2106	AESENC KEY STATE1
2107	AESENC KEY STATE2
2108	AESENC KEY STATE3
2109	AESENC KEY STATE4
2110	movaps 0x40(TKEYP), KEY
2111	AESENC KEY STATE1
2112	AESENC KEY STATE2
2113	AESENC KEY STATE3
2114	AESENC KEY STATE4
2115	movaps 0x50(TKEYP), KEY
2116	AESENC KEY STATE1
2117	AESENC KEY STATE2
2118	AESENC KEY STATE3
2119	AESENC KEY STATE4
2120	movaps 0x60(TKEYP), KEY
2121	AESENC KEY STATE1
2122	AESENC KEY STATE2
2123	AESENC KEY STATE3
2124	AESENC KEY STATE4
2125	movaps 0x70(TKEYP), KEY
2126	AESENCLAST KEY STATE1		# last round
2127	AESENCLAST KEY STATE2
2128	AESENCLAST KEY STATE3
2129	AESENCLAST KEY STATE4
2130	ret
2131SYM_FUNC_END(_aesni_enc4)
2132
2133/*
2134 * void aesni_dec(const void *ctx, u8 *dst, const u8 *src)
2135 */
2136SYM_FUNC_START(aesni_dec)
2137	FRAME_BEGIN
2138#ifndef __x86_64__
2139	pushl KEYP
2140	pushl KLEN
2141	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
2142	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
2143	movl (FRAME_OFFSET+20)(%esp), INP	# src
2144#endif
2145	mov 480(KEYP), KLEN		# key length
2146	add $240, KEYP
2147	movups (INP), STATE		# input
2148	call _aesni_dec1
2149	movups STATE, (OUTP)		# output
2150#ifndef __x86_64__
2151	popl KLEN
2152	popl KEYP
2153#endif
2154	FRAME_END
2155	ret
2156SYM_FUNC_END(aesni_dec)
2157
2158/*
2159 * _aesni_dec1:		internal ABI
2160 * input:
2161 *	KEYP:		key struct pointer
2162 *	KLEN:		key length
2163 *	STATE:		initial state (input)
2164 * output:
2165 *	STATE:		final state (output)
2166 * changed:
2167 *	KEY
2168 *	TKEYP (T1)
2169 */
2170SYM_FUNC_START_LOCAL(_aesni_dec1)
2171	movaps (KEYP), KEY		# key
2172	mov KEYP, TKEYP
2173	pxor KEY, STATE		# round 0
2174	add $0x30, TKEYP
2175	cmp $24, KLEN
2176	jb .Ldec128
2177	lea 0x20(TKEYP), TKEYP
2178	je .Ldec192
2179	add $0x20, TKEYP
2180	movaps -0x60(TKEYP), KEY
2181	AESDEC KEY STATE
2182	movaps -0x50(TKEYP), KEY
2183	AESDEC KEY STATE
2184.align 4
2185.Ldec192:
2186	movaps -0x40(TKEYP), KEY
2187	AESDEC KEY STATE
2188	movaps -0x30(TKEYP), KEY
2189	AESDEC KEY STATE
2190.align 4
2191.Ldec128:
2192	movaps -0x20(TKEYP), KEY
2193	AESDEC KEY STATE
2194	movaps -0x10(TKEYP), KEY
2195	AESDEC KEY STATE
2196	movaps (TKEYP), KEY
2197	AESDEC KEY STATE
2198	movaps 0x10(TKEYP), KEY
2199	AESDEC KEY STATE
2200	movaps 0x20(TKEYP), KEY
2201	AESDEC KEY STATE
2202	movaps 0x30(TKEYP), KEY
2203	AESDEC KEY STATE
2204	movaps 0x40(TKEYP), KEY
2205	AESDEC KEY STATE
2206	movaps 0x50(TKEYP), KEY
2207	AESDEC KEY STATE
2208	movaps 0x60(TKEYP), KEY
2209	AESDEC KEY STATE
2210	movaps 0x70(TKEYP), KEY
2211	AESDECLAST KEY STATE
2212	ret
2213SYM_FUNC_END(_aesni_dec1)
2214
2215/*
2216 * _aesni_dec4:	internal ABI
2217 * input:
2218 *	KEYP:		key struct pointer
2219 *	KLEN:		key length
2220 *	STATE1:		initial state (input)
2221 *	STATE2
2222 *	STATE3
2223 *	STATE4
2224 * output:
2225 *	STATE1:		final state (output)
2226 *	STATE2
2227 *	STATE3
2228 *	STATE4
2229 * changed:
2230 *	KEY
2231 *	TKEYP (T1)
2232 */
2233SYM_FUNC_START_LOCAL(_aesni_dec4)
2234	movaps (KEYP), KEY		# key
2235	mov KEYP, TKEYP
2236	pxor KEY, STATE1		# round 0
2237	pxor KEY, STATE2
2238	pxor KEY, STATE3
2239	pxor KEY, STATE4
2240	add $0x30, TKEYP
2241	cmp $24, KLEN
2242	jb .L4dec128
2243	lea 0x20(TKEYP), TKEYP
2244	je .L4dec192
2245	add $0x20, TKEYP
2246	movaps -0x60(TKEYP), KEY
2247	AESDEC KEY STATE1
2248	AESDEC KEY STATE2
2249	AESDEC KEY STATE3
2250	AESDEC KEY STATE4
2251	movaps -0x50(TKEYP), KEY
2252	AESDEC KEY STATE1
2253	AESDEC KEY STATE2
2254	AESDEC KEY STATE3
2255	AESDEC KEY STATE4
2256.align 4
2257.L4dec192:
2258	movaps -0x40(TKEYP), KEY
2259	AESDEC KEY STATE1
2260	AESDEC KEY STATE2
2261	AESDEC KEY STATE3
2262	AESDEC KEY STATE4
2263	movaps -0x30(TKEYP), KEY
2264	AESDEC KEY STATE1
2265	AESDEC KEY STATE2
2266	AESDEC KEY STATE3
2267	AESDEC KEY STATE4
2268.align 4
2269.L4dec128:
2270	movaps -0x20(TKEYP), KEY
2271	AESDEC KEY STATE1
2272	AESDEC KEY STATE2
2273	AESDEC KEY STATE3
2274	AESDEC KEY STATE4
2275	movaps -0x10(TKEYP), KEY
2276	AESDEC KEY STATE1
2277	AESDEC KEY STATE2
2278	AESDEC KEY STATE3
2279	AESDEC KEY STATE4
2280	movaps (TKEYP), KEY
2281	AESDEC KEY STATE1
2282	AESDEC KEY STATE2
2283	AESDEC KEY STATE3
2284	AESDEC KEY STATE4
2285	movaps 0x10(TKEYP), KEY
2286	AESDEC KEY STATE1
2287	AESDEC KEY STATE2
2288	AESDEC KEY STATE3
2289	AESDEC KEY STATE4
2290	movaps 0x20(TKEYP), KEY
2291	AESDEC KEY STATE1
2292	AESDEC KEY STATE2
2293	AESDEC KEY STATE3
2294	AESDEC KEY STATE4
2295	movaps 0x30(TKEYP), KEY
2296	AESDEC KEY STATE1
2297	AESDEC KEY STATE2
2298	AESDEC KEY STATE3
2299	AESDEC KEY STATE4
2300	movaps 0x40(TKEYP), KEY
2301	AESDEC KEY STATE1
2302	AESDEC KEY STATE2
2303	AESDEC KEY STATE3
2304	AESDEC KEY STATE4
2305	movaps 0x50(TKEYP), KEY
2306	AESDEC KEY STATE1
2307	AESDEC KEY STATE2
2308	AESDEC KEY STATE3
2309	AESDEC KEY STATE4
2310	movaps 0x60(TKEYP), KEY
2311	AESDEC KEY STATE1
2312	AESDEC KEY STATE2
2313	AESDEC KEY STATE3
2314	AESDEC KEY STATE4
2315	movaps 0x70(TKEYP), KEY
2316	AESDECLAST KEY STATE1		# last round
2317	AESDECLAST KEY STATE2
2318	AESDECLAST KEY STATE3
2319	AESDECLAST KEY STATE4
2320	ret
2321SYM_FUNC_END(_aesni_dec4)
2322
2323/*
2324 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2325 *		      size_t len)
2326 */
2327SYM_FUNC_START(aesni_ecb_enc)
2328	FRAME_BEGIN
2329#ifndef __x86_64__
2330	pushl LEN
2331	pushl KEYP
2332	pushl KLEN
2333	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
2334	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
2335	movl (FRAME_OFFSET+24)(%esp), INP	# src
2336	movl (FRAME_OFFSET+28)(%esp), LEN	# len
2337#endif
2338	test LEN, LEN		# check length
2339	jz .Lecb_enc_ret
2340	mov 480(KEYP), KLEN
2341	cmp $16, LEN
2342	jb .Lecb_enc_ret
2343	cmp $64, LEN
2344	jb .Lecb_enc_loop1
2345.align 4
2346.Lecb_enc_loop4:
2347	movups (INP), STATE1
2348	movups 0x10(INP), STATE2
2349	movups 0x20(INP), STATE3
2350	movups 0x30(INP), STATE4
2351	call _aesni_enc4
2352	movups STATE1, (OUTP)
2353	movups STATE2, 0x10(OUTP)
2354	movups STATE3, 0x20(OUTP)
2355	movups STATE4, 0x30(OUTP)
2356	sub $64, LEN
2357	add $64, INP
2358	add $64, OUTP
2359	cmp $64, LEN
2360	jge .Lecb_enc_loop4
2361	cmp $16, LEN
2362	jb .Lecb_enc_ret
2363.align 4
2364.Lecb_enc_loop1:
2365	movups (INP), STATE1
2366	call _aesni_enc1
2367	movups STATE1, (OUTP)
2368	sub $16, LEN
2369	add $16, INP
2370	add $16, OUTP
2371	cmp $16, LEN
2372	jge .Lecb_enc_loop1
2373.Lecb_enc_ret:
2374#ifndef __x86_64__
2375	popl KLEN
2376	popl KEYP
2377	popl LEN
2378#endif
2379	FRAME_END
2380	ret
2381SYM_FUNC_END(aesni_ecb_enc)
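
/*
 * Loop structure used by aesni_ecb_enc (and aesni_ecb_dec below), as a C
 * sketch; enc4()/enc1() stand in for the _aesni_enc4/_aesni_enc1 helpers,
 * and any trailing partial block (< 16 bytes) is left untouched, matching
 * the cmp $16 checks above:
 *
 *	while (len >= 64) {			// four blocks in parallel
 *		enc4(out, in);
 *		in += 64; out += 64; len -= 64;
 *	}
 *	while (len >= 16) {			// one block at a time
 *		enc1(out, in);
 *		in += 16; out += 16; len -= 16;
 *	}
 */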
2382
2383/*
2384 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2385 *		      size_t len);
2386 */
2387SYM_FUNC_START(aesni_ecb_dec)
2388	FRAME_BEGIN
2389#ifndef __x86_64__
2390	pushl LEN
2391	pushl KEYP
2392	pushl KLEN
2393	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
2394	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
2395	movl (FRAME_OFFSET+24)(%esp), INP	# src
2396	movl (FRAME_OFFSET+28)(%esp), LEN	# len
2397#endif
2398	test LEN, LEN
2399	jz .Lecb_dec_ret
2400	mov 480(KEYP), KLEN
2401	add $240, KEYP
2402	cmp $16, LEN
2403	jb .Lecb_dec_ret
2404	cmp $64, LEN
2405	jb .Lecb_dec_loop1
2406.align 4
2407.Lecb_dec_loop4:
2408	movups (INP), STATE1
2409	movups 0x10(INP), STATE2
2410	movups 0x20(INP), STATE3
2411	movups 0x30(INP), STATE4
2412	call _aesni_dec4
2413	movups STATE1, (OUTP)
2414	movups STATE2, 0x10(OUTP)
2415	movups STATE3, 0x20(OUTP)
2416	movups STATE4, 0x30(OUTP)
2417	sub $64, LEN
2418	add $64, INP
2419	add $64, OUTP
2420	cmp $64, LEN
2421	jge .Lecb_dec_loop4
2422	cmp $16, LEN
2423	jb .Lecb_dec_ret
2424.align 4
2425.Lecb_dec_loop1:
2426	movups (INP), STATE1
2427	call _aesni_dec1
2428	movups STATE1, (OUTP)
2429	sub $16, LEN
2430	add $16, INP
2431	add $16, OUTP
2432	cmp $16, LEN
2433	jge .Lecb_dec_loop1
2434.Lecb_dec_ret:
2435#ifndef __x86_64__
2436	popl KLEN
2437	popl KEYP
2438	popl LEN
2439#endif
2440	FRAME_END
2441	ret
2442SYM_FUNC_END(aesni_ecb_dec)
2443
2444/*
2445 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2446 *		      size_t len, u8 *iv)
2447 */
2448SYM_FUNC_START(aesni_cbc_enc)
2449	FRAME_BEGIN
2450#ifndef __x86_64__
2451	pushl IVP
2452	pushl LEN
2453	pushl KEYP
2454	pushl KLEN
2455	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2456	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2457	movl (FRAME_OFFSET+28)(%esp), INP	# src
2458	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2459	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2460#endif
2461	cmp $16, LEN
2462	jb .Lcbc_enc_ret
2463	mov 480(KEYP), KLEN
2464	movups (IVP), STATE	# load iv as initial state
2465.align 4
2466.Lcbc_enc_loop:
2467	movups (INP), IN	# load input
2468	pxor IN, STATE
2469	call _aesni_enc1
2470	movups STATE, (OUTP)	# store output
2471	sub $16, LEN
2472	add $16, INP
2473	add $16, OUTP
2474	cmp $16, LEN
2475	jge .Lcbc_enc_loop
2476	movups STATE, (IVP)
2477.Lcbc_enc_ret:
2478#ifndef __x86_64__
2479	popl KLEN
2480	popl KEYP
2481	popl LEN
2482	popl IVP
2483#endif
2484	FRAME_END
2485	ret
2486SYM_FUNC_END(aesni_cbc_enc)
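
/*
 * CBC encryption as implemented above, in C sketch form; the chaining value
 * starts as the IV, and the final ciphertext block is written back to IVP:
 *
 *	for (i = 0; i < nblocks; i++)
 *		iv = C[i] = AES_encrypt(P[i] ^ iv);
 */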
2487
2488/*
2489 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2490 *		      size_t len, u8 *iv)
2491 */
2492SYM_FUNC_START(aesni_cbc_dec)
2493	FRAME_BEGIN
2494#ifndef __x86_64__
2495	pushl IVP
2496	pushl LEN
2497	pushl KEYP
2498	pushl KLEN
2499	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2500	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2501	movl (FRAME_OFFSET+28)(%esp), INP	# src
2502	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2503	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2504#endif
2505	cmp $16, LEN
2506	jb .Lcbc_dec_just_ret
2507	mov 480(KEYP), KLEN
2508	add $240, KEYP
2509	movups (IVP), IV
2510	cmp $64, LEN
2511	jb .Lcbc_dec_loop1
2512.align 4
2513.Lcbc_dec_loop4:
2514	movups (INP), IN1
2515	movaps IN1, STATE1
2516	movups 0x10(INP), IN2
2517	movaps IN2, STATE2
2518#ifdef __x86_64__
2519	movups 0x20(INP), IN3
2520	movaps IN3, STATE3
2521	movups 0x30(INP), IN4
2522	movaps IN4, STATE4
2523#else
2524	movups 0x20(INP), IN1
2525	movaps IN1, STATE3
2526	movups 0x30(INP), IN2
2527	movaps IN2, STATE4
2528#endif
2529	call _aesni_dec4
2530	pxor IV, STATE1
2531#ifdef __x86_64__
2532	pxor IN1, STATE2
2533	pxor IN2, STATE3
2534	pxor IN3, STATE4
2535	movaps IN4, IV
2536#else
2537	pxor IN1, STATE4
2538	movaps IN2, IV
2539	movups (INP), IN1
2540	pxor IN1, STATE2
2541	movups 0x10(INP), IN2
2542	pxor IN2, STATE3
2543#endif
2544	movups STATE1, (OUTP)
2545	movups STATE2, 0x10(OUTP)
2546	movups STATE3, 0x20(OUTP)
2547	movups STATE4, 0x30(OUTP)
2548	sub $64, LEN
2549	add $64, INP
2550	add $64, OUTP
2551	cmp $64, LEN
2552	jge .Lcbc_dec_loop4
2553	cmp $16, LEN
2554	jb .Lcbc_dec_ret
2555.align 4
2556.Lcbc_dec_loop1:
2557	movups (INP), IN
2558	movaps IN, STATE
2559	call _aesni_dec1
2560	pxor IV, STATE
2561	movups STATE, (OUTP)
2562	movaps IN, IV
2563	sub $16, LEN
2564	add $16, INP
2565	add $16, OUTP
2566	cmp $16, LEN
2567	jge .Lcbc_dec_loop1
2568.Lcbc_dec_ret:
2569	movups IV, (IVP)
2570.Lcbc_dec_just_ret:
2571#ifndef __x86_64__
2572	popl KLEN
2573	popl KEYP
2574	popl LEN
2575	popl IVP
2576#endif
2577	FRAME_END
2578	ret
2579SYM_FUNC_END(aesni_cbc_dec)
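
/*
 * CBC decryption as implemented above, in C sketch form. Unlike encryption,
 * the block decryptions are independent, which is what allows the four-wide
 * _aesni_dec4 path; each plaintext still needs the previous ciphertext
 * block, so the last input block of every chunk is kept as the next
 * chaining value:
 *
 *	for (i = 0; i < nblocks; i++) {
 *		P[i] = AES_decrypt(C[i]) ^ iv;
 *		iv = C[i];
 *	}
 */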
2580
2581#ifdef __x86_64__
2582.pushsection .rodata
2583.align 16
2584.Lbswap_mask:
2585	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2586.popsection
2587
2588/*
2589 * _aesni_inc_init:	internal ABI
2590 *	setup registers used by _aesni_inc
2591 * input:
2592 *	IV
2593 * output:
2594 *	CTR:	== IV, in little endian
2595 *	TCTR_LOW: == lower qword of CTR
2596 *	INC:	== 1, in little endian
2597 *	BSWAP_MASK == endian swapping mask
2598 */
2599SYM_FUNC_START_LOCAL(_aesni_inc_init)
2600	movaps .Lbswap_mask, BSWAP_MASK
2601	movaps IV, CTR
2602	PSHUFB_XMM BSWAP_MASK CTR
2603	mov $1, TCTR_LOW
2604	MOVQ_R64_XMM TCTR_LOW INC
2605	MOVQ_R64_XMM CTR TCTR_LOW
2606	ret
2607SYM_FUNC_END(_aesni_inc_init)
2608
2609/*
2610 * _aesni_inc:		internal ABI
2611 *	Increase IV by 1, IV is in big endian
2612 * input:
2613 *	IV
2614 *	CTR:	== IV, in little endian
2615 *	TCTR_LOW: == lower qword of CTR
2616 *	INC:	== 1, in little endian
2617 *	BSWAP_MASK == endian swapping mask
2618 * output:
2619 *	IV:	incremented by 1
2620 * changed:
2621 *	CTR:	== output IV, in little endian
2622 *	TCTR_LOW: == lower qword of CTR
2623 */
2624SYM_FUNC_START_LOCAL(_aesni_inc)
2625	paddq INC, CTR
2626	add $1, TCTR_LOW
2627	jnc .Linc_low
2628	pslldq $8, INC
2629	paddq INC, CTR
2630	psrldq $8, INC
2631.Linc_low:
2632	movaps CTR, IV
2633	PSHUFB_XMM BSWAP_MASK IV
2634	ret
2635SYM_FUNC_END(_aesni_inc)
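
/*
 * The two helpers above keep the counter in little-endian form (CTR) and
 * only byte swap when producing IV. The increment is a plain 128-bit carry
 * chain, roughly (C sketch, lo/hi being the two 64-bit halves of the
 * counter):
 *
 *	if (++lo == 0)
 *		hi++;		// carry into the upper qword
 *
 * TCTR_LOW mirrors lo in a general purpose register so the carry test is a
 * cheap add $1/jnc rather than an SSE comparison.
 */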
2636
2637/*
2638 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2639 *		      size_t len, u8 *iv)
2640 */
2641SYM_FUNC_START(aesni_ctr_enc)
2642	FRAME_BEGIN
2643	cmp $16, LEN
2644	jb .Lctr_enc_just_ret
2645	mov 480(KEYP), KLEN
2646	movups (IVP), IV
2647	call _aesni_inc_init
2648	cmp $64, LEN
2649	jb .Lctr_enc_loop1
2650.align 4
2651.Lctr_enc_loop4:
2652	movaps IV, STATE1
2653	call _aesni_inc
2654	movups (INP), IN1
2655	movaps IV, STATE2
2656	call _aesni_inc
2657	movups 0x10(INP), IN2
2658	movaps IV, STATE3
2659	call _aesni_inc
2660	movups 0x20(INP), IN3
2661	movaps IV, STATE4
2662	call _aesni_inc
2663	movups 0x30(INP), IN4
2664	call _aesni_enc4
2665	pxor IN1, STATE1
2666	movups STATE1, (OUTP)
2667	pxor IN2, STATE2
2668	movups STATE2, 0x10(OUTP)
2669	pxor IN3, STATE3
2670	movups STATE3, 0x20(OUTP)
2671	pxor IN4, STATE4
2672	movups STATE4, 0x30(OUTP)
2673	sub $64, LEN
2674	add $64, INP
2675	add $64, OUTP
2676	cmp $64, LEN
2677	jge .Lctr_enc_loop4
2678	cmp $16, LEN
2679	jb .Lctr_enc_ret
2680.align 4
2681.Lctr_enc_loop1:
2682	movaps IV, STATE
2683	call _aesni_inc
2684	movups (INP), IN
2685	call _aesni_enc1
2686	pxor IN, STATE
2687	movups STATE, (OUTP)
2688	sub $16, LEN
2689	add $16, INP
2690	add $16, OUTP
2691	cmp $16, LEN
2692	jge .Lctr_enc_loop1
2693.Lctr_enc_ret:
2694	movups IV, (IVP)
2695.Lctr_enc_just_ret:
2696	FRAME_END
2697	ret
2698SYM_FUNC_END(aesni_ctr_enc)
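
/*
 * CTR mode as implemented above, in C sketch form (ctr_inc() stands in for
 * _aesni_inc); the keystream is the encrypted counter, so encryption and
 * decryption are the same operation:
 *
 *	for (i = 0; i < nblocks; i++) {
 *		C[i] = P[i] ^ AES_encrypt(counter);
 *		counter = ctr_inc(counter);
 *	}
 */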
2699
2700/*
2701 * _aesni_gf128mul_x_ble:		internal ABI
2702 *	Multiply in GF(2^128) for XTS IVs
2703 * input:
2704 *	IV:	current IV
2705 *	GF128MUL_MASK == mask with 0x87 and 0x01
2706 * output:
2707 *	IV:	next IV
2708 * changed:
2709 *	CTR:	== temporary value
2710 */
2711#define _aesni_gf128mul_x_ble() \
2712	pshufd $0x13, IV, CTR; \
2713	paddq IV, IV; \
2714	psrad $31, CTR; \
2715	pand GF128MUL_MASK, CTR; \
2716	pxor CTR, IV;
2717
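/*
 * The macro above is the XTS tweak update T <- T*x in GF(2^128), reduced by
 * x^128 + x^7 + x^2 + x + 1 and operating on the tweak in little-endian
 * byte order. As a C sketch, with lo/hi the two 64-bit halves of the tweak:
 *
 *	carry = hi >> 63;
 *	hi    = (hi << 1) | (lo >> 63);
 *	lo    = (lo << 1) ^ (carry ? 0x87 : 0);
 *
 * The pshufd/psrad/pand steps materialise exactly those two carry terms
 * (0x87 into the low qword, 1 into the high qword) so that a single pxor
 * can apply them after the paddq doubling.
 */
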
2718/*
2719 * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst,
2720 *			 const u8 *src, bool enc, le128 *iv)
2721 */
2722SYM_FUNC_START(aesni_xts_crypt8)
2723	FRAME_BEGIN
2724	cmpb $0, %cl
2725	movl $0, %ecx
2726	movl $240, %r10d
2727	leaq _aesni_enc4, %r11
2728	leaq _aesni_dec4, %rax
2729	cmovel %r10d, %ecx
2730	cmoveq %rax, %r11
2731
2732	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2733	movups (IVP), IV
2734
2735	mov 480(KEYP), KLEN
2736	addq %rcx, KEYP
2737
2738	movdqa IV, STATE1
2739	movdqu 0x00(INP), INC
2740	pxor INC, STATE1
2741	movdqu IV, 0x00(OUTP)
2742
2743	_aesni_gf128mul_x_ble()
2744	movdqa IV, STATE2
2745	movdqu 0x10(INP), INC
2746	pxor INC, STATE2
2747	movdqu IV, 0x10(OUTP)
2748
2749	_aesni_gf128mul_x_ble()
2750	movdqa IV, STATE3
2751	movdqu 0x20(INP), INC
2752	pxor INC, STATE3
2753	movdqu IV, 0x20(OUTP)
2754
2755	_aesni_gf128mul_x_ble()
2756	movdqa IV, STATE4
2757	movdqu 0x30(INP), INC
2758	pxor INC, STATE4
2759	movdqu IV, 0x30(OUTP)
2760
2761	CALL_NOSPEC r11
2762
2763	movdqu 0x00(OUTP), INC
2764	pxor INC, STATE1
2765	movdqu STATE1, 0x00(OUTP)
2766
2767	_aesni_gf128mul_x_ble()
2768	movdqa IV, STATE1
2769	movdqu 0x40(INP), INC
2770	pxor INC, STATE1
2771	movdqu IV, 0x40(OUTP)
2772
2773	movdqu 0x10(OUTP), INC
2774	pxor INC, STATE2
2775	movdqu STATE2, 0x10(OUTP)
2776
2777	_aesni_gf128mul_x_ble()
2778	movdqa IV, STATE2
2779	movdqu 0x50(INP), INC
2780	pxor INC, STATE2
2781	movdqu IV, 0x50(OUTP)
2782
2783	movdqu 0x20(OUTP), INC
2784	pxor INC, STATE3
2785	movdqu STATE3, 0x20(OUTP)
2786
2787	_aesni_gf128mul_x_ble()
2788	movdqa IV, STATE3
2789	movdqu 0x60(INP), INC
2790	pxor INC, STATE3
2791	movdqu IV, 0x60(OUTP)
2792
2793	movdqu 0x30(OUTP), INC
2794	pxor INC, STATE4
2795	movdqu STATE4, 0x30(OUTP)
2796
2797	_aesni_gf128mul_x_ble()
2798	movdqa IV, STATE4
2799	movdqu 0x70(INP), INC
2800	pxor INC, STATE4
2801	movdqu IV, 0x70(OUTP)
2802
2803	_aesni_gf128mul_x_ble()
2804	movups IV, (IVP)
2805
2806	CALL_NOSPEC r11
2807
2808	movdqu 0x40(OUTP), INC
2809	pxor INC, STATE1
2810	movdqu STATE1, 0x40(OUTP)
2811
2812	movdqu 0x50(OUTP), INC
2813	pxor INC, STATE2
2814	movdqu STATE2, 0x50(OUTP)
2815
2816	movdqu 0x60(OUTP), INC
2817	pxor INC, STATE3
2818	movdqu STATE3, 0x60(OUTP)
2819
2820	movdqu 0x70(OUTP), INC
2821	pxor INC, STATE4
2822	movdqu STATE4, 0x70(OUTP)
2823
2824	FRAME_END
2825	ret
2826SYM_FUNC_END(aesni_xts_crypt8)
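
/*
 * Per block, aesni_xts_crypt8 computes the usual XTS data path, as a C
 * sketch (T is the caller-supplied tweak, assumed to have been produced by
 * the caller with the tweak key; gf128mul_x() is the doubling macro above,
 * and AES_crypt() is either encryption or decryption, selected by the
 * 'enc' argument via the cmov sequence at the top):
 *
 *	for (i = 0; i < 8; i++) {
 *		out[i] = AES_crypt(in[i] ^ T) ^ T;
 *		T = gf128mul_x(T);
 *	}
 *
 * The updated tweak is written back to IVP so the caller can continue with
 * the next eight blocks.
 */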
2827
2828#endif
2829