/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 16-way parallel algorithm (AVX)
 *
 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
 *
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/frame.h>

/* struct aria_ctx: */
#define enc_key 0
#define dec_key 272
#define rounds 544

/* register macros */
#define CTX %rdi


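/*
 * BV8() packs eight bit values into one byte (a0 becomes bit 0, a7 bit 7),
 * e.g. BV8(1, 0, 0, 0, 0, 0, 0, 1) == 0x81.  BM8X8() packs eight such rows
 * into a 64-bit bit-matrix with row l0 in the most significant byte, the
 * operand layout used by the vgf2p8affine*qb instructions below.
 */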
#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)		\
	( (((a0) & 1) << 0) |				\
	  (((a1) & 1) << 1) |				\
	  (((a2) & 1) << 2) |				\
	  (((a3) & 1) << 3) |				\
	  (((a4) & 1) << 4) |				\
	  (((a5) & 1) << 5) |				\
	  (((a6) & 1) << 6) |				\
	  (((a7) & 1) << 7) )

#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)		\
	( ((l7) << (0 * 8)) |				\
	  ((l6) << (1 * 8)) |				\
	  ((l5) << (2 * 8)) |				\
	  ((l4) << (3 * 8)) |				\
	  ((l3) << (4 * 8)) |				\
	  ((l2) << (5 * 8)) |				\
	  ((l1) << (6 * 8)) |				\
	  ((l0) << (7 * 8)) )

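/*
 * 128-bit little-endian increment for the CTR counter.  minus_one must hold
 * -1 in its low qword and 0 in its high qword (see the CTR code below);
 * vpsubq then adds 1 to the low half and the vpcmpeqq/vpslldq pair
 * propagates the carry when the low half wraps.  Roughly, as a sketch:
 * carry = (lo == ~0ull); lo += 1; hi += carry;
 */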
#define inc_le128(x, minus_one, tmp)			\
	vpcmpeqq minus_one, x, tmp;			\
	vpsubq minus_one, x, x;				\
	vpslldq $8, tmp, tmp;				\
	vpsubq tmp, x, x;

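/*
 * Apply an arbitrary 8-bit affine transform to every byte of x with two
 * 16-entry vpshufb lookups: lo_t maps the low nibble, hi_t the high nibble,
 * and the two results are XORed.  mask4bit must hold 0x0f in every byte
 * (.L0f0f0f0f below).
 */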
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)	\
	vpand x, mask4bit, tmp0;			\
	vpandn x, mask4bit, x;				\
	vpsrld $4, x, x;				\
							\
	vpshufb tmp0, lo_t, tmp0;			\
	vpshufb x, hi_t, x;				\
	vpxor tmp0, x, x;

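/* transpose a 4x4 matrix of 32-bit words held in x0..x3; t1 and t2 are clobbered */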
#define transpose_4x4(x0, x1, x2, x3, t1, t2)		\
	vpunpckhdq x1, x0, t2;				\
	vpunpckldq x1, x0, x0;				\
							\
	vpunpckldq x3, x2, t1;				\
	vpunpckhdq x3, x2, x2;				\
							\
	vpunpckhqdq t1, x0, x1;				\
	vpunpcklqdq t1, x0, x0;				\
							\
	vpunpckhqdq x2, t2, x3;				\
	vpunpcklqdq x2, t2, x2;

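/*
 * Byteslice 16 blocks: transpose the 16x16 byte matrix formed by the 16
 * input registers so that each output register gathers one byte position
 * from every block.  st0 and st1 are 16-byte scratch slots in memory.
 */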
#define byteslice_16x16b(a0, b0, c0, d0,		\
			 a1, b1, c1, d1,		\
			 a2, b2, c2, d2,		\
			 a3, b3, c3, d3,		\
			 st0, st1)			\
	vmovdqu d2, st0;				\
	vmovdqu d3, st1;				\
	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu a0, st0;				\
	vmovdqu a1, st1;				\
	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
							\
	vmovdqu .Lshufb_16x16b, a0;			\
	vmovdqu st1, a1;				\
	vpshufb a0, a2, a2;				\
	vpshufb a0, a3, a3;				\
	vpshufb a0, b0, b0;				\
	vpshufb a0, b1, b1;				\
	vpshufb a0, b2, b2;				\
	vpshufb a0, b3, b3;				\
	vpshufb a0, a1, a1;				\
	vpshufb a0, c0, c0;				\
	vpshufb a0, c1, c1;				\
	vpshufb a0, c2, c2;				\
	vpshufb a0, c3, c3;				\
	vpshufb a0, d0, d0;				\
	vpshufb a0, d1, d1;				\
	vpshufb a0, d2, d2;				\
	vpshufb a0, d3, d3;				\
	vmovdqu d3, st1;				\
	vmovdqu st0, d3;				\
	vpshufb a0, d3, a0;				\
	vmovdqu d2, st0;				\
							\
	transpose_4x4(a0, b0, c0, d0, d2, d3);		\
	transpose_4x4(a1, b1, c1, d1, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu b0, st0;				\
	vmovdqu b1, st1;				\
	transpose_4x4(a2, b2, c2, d2, b0, b1);		\
	transpose_4x4(a3, b3, c3, d3, b0, b1);		\
	vmovdqu st0, b0;				\
	vmovdqu st1, b1;				\
	/* does not adjust output bytes inside vectors */

#define debyteslice_16x16b(a0, b0, c0, d0,		\
			   a1, b1, c1, d1,		\
			   a2, b2, c2, d2,		\
			   a3, b3, c3, d3,		\
			   st0, st1)			\
	vmovdqu d2, st0;				\
	vmovdqu d3, st1;				\
	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu a0, st0;				\
	vmovdqu a1, st1;				\
	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
							\
	vmovdqu .Lshufb_16x16b, a0;			\
	vmovdqu st1, a1;				\
	vpshufb a0, a2, a2;				\
	vpshufb a0, a3, a3;				\
	vpshufb a0, b0, b0;				\
	vpshufb a0, b1, b1;				\
	vpshufb a0, b2, b2;				\
	vpshufb a0, b3, b3;				\
	vpshufb a0, a1, a1;				\
	vpshufb a0, c0, c0;				\
	vpshufb a0, c1, c1;				\
	vpshufb a0, c2, c2;				\
	vpshufb a0, c3, c3;				\
	vpshufb a0, d0, d0;				\
	vpshufb a0, d1, d1;				\
	vpshufb a0, d2, d2;				\
	vpshufb a0, d3, d3;				\
	vmovdqu d3, st1;				\
	vmovdqu st0, d3;				\
	vpshufb a0, d3, a0;				\
	vmovdqu d2, st0;				\
							\
	transpose_4x4(c0, d0, a0, b0, d2, d3);		\
	transpose_4x4(c1, d1, a1, b1, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu b0, st0;				\
	vmovdqu b1, st1;				\
	transpose_4x4(c2, d2, a2, b2, b0, b1);		\
	transpose_4x4(c3, d3, a3, b3, b0, b1);		\
	vmovdqu st0, b0;				\
	vmovdqu st1, b1;				\
	/* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     rio)				\
	vmovdqu (0 * 16)(rio), x0;			\
	vmovdqu (1 * 16)(rio), x1;			\
	vmovdqu (2 * 16)(rio), x2;			\
	vmovdqu (3 * 16)(rio), x3;			\
	vmovdqu (4 * 16)(rio), x4;			\
	vmovdqu (5 * 16)(rio), x5;			\
	vmovdqu (6 * 16)(rio), x6;			\
	vmovdqu (7 * 16)(rio), x7;			\
	vmovdqu (8 * 16)(rio), y0;			\
	vmovdqu (9 * 16)(rio), y1;			\
	vmovdqu (10 * 16)(rio), y2;			\
	vmovdqu (11 * 16)(rio), y3;			\
	vmovdqu (12 * 16)(rio), y4;			\
	vmovdqu (13 * 16)(rio), y5;			\
	vmovdqu (14 * 16)(rio), y6;			\
	vmovdqu (15 * 16)(rio), y7;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3,			\
		      x4, x5, x6, x7,			\
		      y0, y1, y2, y3,			\
		      y4, y5, y6, y7,			\
		      mem_ab, mem_cd)			\
	byteslice_16x16b(x0, x1, x2, x3,		\
			 x4, x5, x6, x7,		\
			 y0, y1, y2, y3,		\
			 y4, y5, y6, y7,		\
			 (mem_ab), (mem_cd));		\
							\
	vmovdqu x0, 0 * 16(mem_ab);			\
	vmovdqu x1, 1 * 16(mem_ab);			\
	vmovdqu x2, 2 * 16(mem_ab);			\
	vmovdqu x3, 3 * 16(mem_ab);			\
	vmovdqu x4, 4 * 16(mem_ab);			\
	vmovdqu x5, 5 * 16(mem_ab);			\
	vmovdqu x6, 6 * 16(mem_ab);			\
	vmovdqu x7, 7 * 16(mem_ab);			\
	vmovdqu y0, 0 * 16(mem_cd);			\
	vmovdqu y1, 1 * 16(mem_cd);			\
	vmovdqu y2, 2 * 16(mem_cd);			\
	vmovdqu y3, 3 * 16(mem_cd);			\
	vmovdqu y4, 4 * 16(mem_cd);			\
	vmovdqu y5, 5 * 16(mem_cd);			\
	vmovdqu y6, 6 * 16(mem_cd);			\
	vmovdqu y7, 7 * 16(mem_cd);

#define write_output(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem)				\
	vmovdqu x0, 0 * 16(mem);			\
	vmovdqu x1, 1 * 16(mem);			\
	vmovdqu x2, 2 * 16(mem);			\
	vmovdqu x3, 3 * 16(mem);			\
	vmovdqu x4, 4 * 16(mem);			\
	vmovdqu x5, 5 * 16(mem);			\
	vmovdqu x6, 6 * 16(mem);			\
	vmovdqu x7, 7 * 16(mem);			\
	vmovdqu y0, 8 * 16(mem);			\
	vmovdqu y1, 9 * 16(mem);			\
	vmovdqu y2, 10 * 16(mem);			\
	vmovdqu y3, 11 * 16(mem);			\
	vmovdqu y4, 12 * 16(mem);			\
	vmovdqu y5, 13 * 16(mem);			\
	vmovdqu y6, 14 * 16(mem);			\
	vmovdqu y7, 15 * 16(mem);			\

#define aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, idx)		\
	vmovdqu x0, ((idx + 0) * 16)(mem_tmp);		\
	vmovdqu x1, ((idx + 1) * 16)(mem_tmp);		\
	vmovdqu x2, ((idx + 2) * 16)(mem_tmp);		\
	vmovdqu x3, ((idx + 3) * 16)(mem_tmp);		\
	vmovdqu x4, ((idx + 4) * 16)(mem_tmp);		\
	vmovdqu x5, ((idx + 5) * 16)(mem_tmp);		\
	vmovdqu x6, ((idx + 6) * 16)(mem_tmp);		\
	vmovdqu x7, ((idx + 7) * 16)(mem_tmp);

#define aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, idx)		\
	vmovdqu ((idx + 0) * 16)(mem_tmp), x0;		\
	vmovdqu ((idx + 1) * 16)(mem_tmp), x1;		\
	vmovdqu ((idx + 2) * 16)(mem_tmp), x2;		\
	vmovdqu ((idx + 3) * 16)(mem_tmp), x3;		\
	vmovdqu ((idx + 4) * 16)(mem_tmp), x4;		\
	vmovdqu ((idx + 5) * 16)(mem_tmp), x5;		\
	vmovdqu ((idx + 6) * 16)(mem_tmp), x6;		\
	vmovdqu ((idx + 7) * 16)(mem_tmp), x7;

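/*
 * AddRoundKey for eight byte-sliced registers: the 16-byte round key for
 * 'round' lives at rk + round * 16, and the key byte matching each byte
 * position is broadcast to all lanes before the XOR.
 */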
#define aria_ark_8way(x0, x1, x2, x3,			\
		      x4, x5, x6, x7,			\
		      t0, rk, idx, round)		\
	/* AddRoundKey */				\
	vpbroadcastb ((round * 16) + idx + 3)(rk), t0;	\
	vpxor t0, x0, x0;				\
	vpbroadcastb ((round * 16) + idx + 2)(rk), t0;	\
	vpxor t0, x1, x1;				\
	vpbroadcastb ((round * 16) + idx + 1)(rk), t0;	\
	vpxor t0, x2, x2;				\
	vpbroadcastb ((round * 16) + idx + 0)(rk), t0;	\
	vpxor t0, x3, x3;				\
	vpbroadcastb ((round * 16) + idx + 7)(rk), t0;	\
	vpxor t0, x4, x4;				\
	vpbroadcastb ((round * 16) + idx + 6)(rk), t0;	\
	vpxor t0, x5, x5;				\
	vpbroadcastb ((round * 16) + idx + 5)(rk), t0;	\
	vpxor t0, x6, x6;				\
	vpbroadcastb ((round * 16) + idx + 4)(rk), t0;	\
	vpxor t0, x7, x7;

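/*
 * ARIA substitution layer (S1, S2 and their inverses) for eight byte-sliced
 * registers using GFNI: each S-box is a GF(2^8) inversion wrapped in the
 * affine transforms encoded by the bit matrices and constants defined below.
 */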
#define aria_sbox_8way_gfni(x0, x1, x2, x3,		\
			    x4, x5, x6, x7,		\
			    t0, t1, t2, t3,		\
			    t4, t5, t6, t7)		\
	vpbroadcastq .Ltf_s2_bitmatrix, t0;		\
	vpbroadcastq .Ltf_inv_bitmatrix, t1;		\
	vpbroadcastq .Ltf_id_bitmatrix, t2;		\
	vpbroadcastq .Ltf_aff_bitmatrix, t3;		\
	vpbroadcastq .Ltf_x2_bitmatrix, t4;		\
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6;	\
	vgf2p8affineinvqb $0, t2, x2, x2;		\
	vgf2p8affineinvqb $0, t2, x6, x6;		\
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;	\
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;	\
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3;	\
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7;	\
	vgf2p8affineinvqb $0, t2, x3, x3;		\
	vgf2p8affineinvqb $0, t2, x7, x7

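/*
 * Same substitution layer using AES-NI: vaesenclast/vaesdeclast with a zero
 * round key expose the AES SubBytes/InvSubBytes step, the vpshufb with
 * .Linv_shift_row/.Lshift_row compensates for the ShiftRows those
 * instructions also perform, and filter_8bit() applies the extra affine
 * maps that turn the AES S-boxes into ARIA's S2 and X2.
 */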
#define aria_sbox_8way(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       t0, t1, t2, t3,			\
		       t4, t5, t6, t7)			\
	vpxor t7, t7, t7;				\
	vmovdqa .Linv_shift_row, t0;			\
	vmovdqa .Lshift_row, t1;			\
	vpbroadcastd .L0f0f0f0f, t6;			\
	vmovdqa .Ltf_lo__inv_aff__and__s2, t2;		\
	vmovdqa .Ltf_hi__inv_aff__and__s2, t3;		\
	vmovdqa .Ltf_lo__x2__and__fwd_aff, t4;		\
	vmovdqa .Ltf_hi__x2__and__fwd_aff, t5;		\
							\
	vaesenclast t7, x0, x0;				\
	vaesenclast t7, x4, x4;				\
	vaesenclast t7, x1, x1;				\
	vaesenclast t7, x5, x5;				\
	vaesdeclast t7, x2, x2;				\
	vaesdeclast t7, x6, x6;				\
							\
	/* AES inverse shift rows */			\
	vpshufb t0, x0, x0;				\
	vpshufb t0, x4, x4;				\
	vpshufb t0, x1, x1;				\
	vpshufb t0, x5, x5;				\
	vpshufb t1, x3, x3;				\
	vpshufb t1, x7, x7;				\
	vpshufb t1, x2, x2;				\
	vpshufb t1, x6, x6;				\
							\
	/* affine transformation for S2 */		\
	filter_8bit(x1, t2, t3, t6, t0);		\
	/* affine transformation for S2 */		\
	filter_8bit(x5, t2, t3, t6, t0);		\
							\
	/* affine transformation for X2 */		\
	filter_8bit(x3, t4, t5, t6, t0);		\
	/* affine transformation for X2 */		\
	filter_8bit(x7, t4, t5, t6, t0);		\
	vaesdeclast t7, x3, x3;				\
	vaesdeclast t7, x7, x7;

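/*
 * Byte diffusion within each 32-bit word: on byte-sliced data a rotr32 by 8
 * or 16 bits is just a renaming of the four registers, so the rotate-and-XOR
 * steps in the comments below reduce to the plain XOR chain implemented here.
 */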
#define aria_diff_m(x0, x1, x2, x3,			\
		    t0, t1, t2, t3)			\
	/* T = rotr32(X, 8); */				\
	/* X ^= T */					\
	vpxor x0, x3, t0;				\
	vpxor x1, x0, t1;				\
	vpxor x2, x1, t2;				\
	vpxor x3, x2, t3;				\
	/* X = T ^ rotr(X, 16); */			\
	vpxor t2, x0, x0;				\
	vpxor x1, t3, t3;				\
	vpxor t0, x2, x2;				\
	vpxor t1, x3, x1;				\
	vmovdqu t3, x3;

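/*
 * Word diffusion: XOR the four byte-sliced word groups into one another
 * (t0..t3 in the comments are x0-x3, x4-x7, y0-y3 and y4-y7).  Combined with
 * aria_diff_m() and the aria_diff_byte() permutations applied at the call
 * sites, this builds ARIA's diffusion layer.
 */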
#define aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7)			\
	/* t1 ^= t2; */					\
	vpxor y0, x4, x4;				\
	vpxor y1, x5, x5;				\
	vpxor y2, x6, x6;				\
	vpxor y3, x7, x7;				\
							\
	/* t2 ^= t3; */					\
	vpxor y4, y0, y0;				\
	vpxor y5, y1, y1;				\
	vpxor y6, y2, y2;				\
	vpxor y7, y3, y3;				\
							\
	/* t0 ^= t1; */					\
	vpxor x4, x0, x0;				\
	vpxor x5, x1, x1;				\
	vpxor x6, x2, x2;				\
	vpxor x7, x3, x3;				\
							\
	/* t3 ^= t1; */					\
	vpxor x4, y4, y4;				\
	vpxor x5, y5, y5;				\
	vpxor x6, y6, y6;				\
	vpxor x7, y7, y7;				\
							\
	/* t2 ^= t0; */					\
	vpxor x0, y0, y0;				\
	vpxor x1, y1, y1;				\
	vpxor x2, y2, y2;				\
	vpxor x3, y3, y3;				\
							\
	/* t1 ^= t2; */					\
	vpxor y0, x4, x4;				\
	vpxor y1, x5, x5;				\
	vpxor y2, x6, x6;				\
	vpxor y3, x7, x7;

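/*
 * Round macros over all 16 byte-sliced blocks: aria_fo() is the odd-round
 * function and aria_fe() the even-round function (they differ only in which
 * S-box is applied to which byte position), while aria_ff() is the final
 * round, which adds two round keys and skips the diffusion layer.  Each
 * macro processes eight registers at a time and parks the other half of the
 * state in mem_tmp.
 */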
#define aria_fe(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round)			\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T3 = ABCD -> BADC				\
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
	 * T0 = ABCD -> CDAB				\
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
	 * T1 = ABCD -> DCBA				\
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
	 */						\
	aria_diff_word(x2, x3, x0, x1,			\
		       x7, x6, x5, x4,			\
		       y0, y1, y2, y3,			\
		       y5, y4, y7, y6);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);

#define aria_fo(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round)			\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T1 = ABCD -> BADC				\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB				\
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
	 * T3 = ABCD -> DCBA				\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
	 */						\
	aria_diff_word(x0, x1, x2, x3,			\
		       x5, x4, x7, x6,			\
		       y2, y3, y0, y1,			\
		       y7, y6, y5, y4);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);

#define aria_ff(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round, last_round)		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, last_round);		\
							\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, last_round);		\
							\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);

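/*
 * GFNI variants of the round macros above: same structure, but the
 * substitution layer uses aria_sbox_8way_gfni() instead of the AES-NI path.
 */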
#define aria_fe_gfni(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem_tmp, rk, round)		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T3 = ABCD -> BADC				\
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
	 * T0 = ABCD -> CDAB				\
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
	 * T1 = ABCD -> DCBA				\
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
	 */						\
	aria_diff_word(x2, x3, x0, x1,			\
		       x7, x6, x5, x4,			\
		       y0, y1, y2, y3,			\
		       y5, y4, y7, y6);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);

#define aria_fo_gfni(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem_tmp, rk, round)		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way_gfni(x0, x1, x2, x3,		\
			    x4, x5, x6, x7,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way_gfni(x0, x1, x2, x3,		\
			    x4, x5, x6, x7,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T1 = ABCD -> BADC				\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB				\
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
	 * T3 = ABCD -> DCBA				\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
	 */						\
	aria_diff_word(x0, x1, x2, x3,			\
		       x5, x4, x7, x6,			\
		       y2, y3, y0, y1,			\
		       y7, y6, y5, y4);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);

#define aria_ff_gfni(x0, x1, x2, x3,			\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round, last_round)		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, last_round);		\
							\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, last_round);		\
							\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);

/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section	.rodata.cst16, "aM", @progbits, 16
.align 16

#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
.Lshift_row:
	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

/* AES inverse affine and S2 combined:
 *      1 1 0 0 0 0 0 1     x0     0
 *      0 1 0 0 1 0 0 0     x1     0
 *      1 1 0 0 1 1 1 1     x2     0
 *      0 1 1 0 1 0 0 1     x3     1
 *      0 1 0 0 1 1 0 0  *  x4  +  0
 *      0 1 0 1 1 0 0 0     x5     0
 *      0 0 0 0 0 1 0 1     x6     0
 *      1 1 1 0 0 1 1 1     x7     1
 */
.Ltf_lo__inv_aff__and__s2:
	.octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
	.octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
 *      1 0 1 1 0 0 0 1     x0     0
 *      0 1 1 1 1 0 1 1     x1     0
 *      0 0 0 1 1 0 1 0     x2     1
 *      0 1 0 0 0 1 0 0     x3     0
 *      0 0 1 1 1 0 1 1  *  x4  +  0
 *      0 1 0 0 1 0 0 0     x5     0
 *      1 1 0 1 0 0 1 1     x6     0
 *      0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
	.octa 0x3F893781E95FE1576CDA64D2BA0CB204

.section	.rodata.cst8, "aM", @progbits, 8
.align 8
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))

/* 4-bit mask */
.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
	/* input:
	*      %r9: rk
	*      %rsi: dst
	*      %rdx: src
	*      %xmm0..%xmm15: 16 byte-sliced blocks
	*/

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 16(%rax), %r8;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %r8);
	aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 0);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 1);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 2);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 3);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 4);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 5);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 6);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 7);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 8);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 9);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 10);
	cmpl $12, rounds(CTX);
	jne .Laria_192;
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 11, 12);
	jmp .Laria_end;
.Laria_192:
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 11);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 12);
	cmpl $14, rounds(CTX);
	jne .Laria_256;
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 13, 14);
	jmp .Laria_end;
.Laria_256:
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 13);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 14);
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 15, 16);
.Laria_end:
	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
			   %xmm9, %xmm13, %xmm0, %xmm5,
			   %xmm10, %xmm14, %xmm3, %xmm6,
			   %xmm11, %xmm15, %xmm2, %xmm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_crypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
	/* input:
	*      %rdi: ctx, CTX
	*      %rsi: dst
	*      %rdx: src
	*/

	FRAME_BEGIN

	leaq enc_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_encrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
	/* input:
	*      %rdi: ctx, CTX
	*      %rsi: dst
	*      %rdx: src
	*/

	FRAME_BEGIN

	leaq dec_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_decrypt_16way)

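/*
 * Build 16 CTR counter blocks from the big-endian IV at *%r8: blocks 0-7 are
 * staged in the keystream buffer and reloaded into %xmm0..%xmm7, blocks 8-15
 * are left in %xmm8..%xmm15, and the incremented IV (+16) is written back to
 * *%r8.  The counter arithmetic is done in little-endian form via
 * inc_le128() and byte-swapped with .Lbswap128_mask.
 */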
SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
	/* input:
	*      %rdi: ctx
	*      %rsi: dst
	*      %rdx: src
	*      %rcx: keystream
	*      %r8: iv (big endian, 128bit)
	*/

	FRAME_BEGIN
	/* load IV and byteswap */
	vmovdqu (%r8), %xmm8;

	vmovdqa .Lbswap128_mask (%rip), %xmm1;
	vpshufb %xmm1, %xmm8, %xmm3; /* be => le */

	vpcmpeqd %xmm0, %xmm0, %xmm0;
	vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */

	/* construct IVs */
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm9;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm10;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm11;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm12;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm13;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm14;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm15;
	vmovdqu %xmm8, (0 * 16)(%rcx);
	vmovdqu %xmm9, (1 * 16)(%rcx);
	vmovdqu %xmm10, (2 * 16)(%rcx);
	vmovdqu %xmm11, (3 * 16)(%rcx);
	vmovdqu %xmm12, (4 * 16)(%rcx);
	vmovdqu %xmm13, (5 * 16)(%rcx);
	vmovdqu %xmm14, (6 * 16)(%rcx);
	vmovdqu %xmm15, (7 * 16)(%rcx);

	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm8;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm9;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm10;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm11;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm12;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm13;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm14;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm15;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm4;
	vmovdqu %xmm4, (%r8);

	vmovdqu (0 * 16)(%rcx), %xmm0;
	vmovdqu (1 * 16)(%rcx), %xmm1;
	vmovdqu (2 * 16)(%rcx), %xmm2;
	vmovdqu (3 * 16)(%rcx), %xmm3;
	vmovdqu (4 * 16)(%rcx), %xmm4;
	vmovdqu (5 * 16)(%rcx), %xmm5;
	vmovdqu (6 * 16)(%rcx), %xmm6;
	vmovdqu (7 * 16)(%rcx), %xmm7;

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)

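/*
 * CTR mode: generate the counter blocks, run them through the block
 * encryption path (dst/src are preserved in %r10/%r11 while the keystream
 * buffer is used as scratch), then XOR the resulting keystream with the
 * source and write to dst.
 */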
1043*c67b553aSEric BiggersSYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
1044ba3579e6STaehee Yoo	/* input:
1045ba3579e6STaehee Yoo	*      %rdi: ctx
1046ba3579e6STaehee Yoo	*      %rsi: dst
1047ba3579e6STaehee Yoo	*      %rdx: src
1048ba3579e6STaehee Yoo	*      %rcx: keystream
1049ba3579e6STaehee Yoo	*      %r8: iv (big endian, 128bit)
1050ba3579e6STaehee Yoo	*/
1051ba3579e6STaehee Yoo	FRAME_BEGIN
1052ba3579e6STaehee Yoo
1053ba3579e6STaehee Yoo	call __aria_aesni_avx_ctr_gen_keystream_16way;
1054ba3579e6STaehee Yoo
1055ba3579e6STaehee Yoo	leaq (%rsi), %r10;
1056ba3579e6STaehee Yoo	leaq (%rdx), %r11;
1057ba3579e6STaehee Yoo	leaq (%rcx), %rsi;
1058ba3579e6STaehee Yoo	leaq (%rcx), %rdx;
1059ba3579e6STaehee Yoo	leaq enc_key(CTX), %r9;
1060ba3579e6STaehee Yoo
1061ba3579e6STaehee Yoo	call __aria_aesni_avx_crypt_16way;
1062ba3579e6STaehee Yoo
1063ba3579e6STaehee Yoo	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
1064ba3579e6STaehee Yoo	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
1065ba3579e6STaehee Yoo	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
1066ba3579e6STaehee Yoo	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
1067ba3579e6STaehee Yoo	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
1068ba3579e6STaehee Yoo	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
1069ba3579e6STaehee Yoo	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
1070ba3579e6STaehee Yoo	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
1071ba3579e6STaehee Yoo	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
1072ba3579e6STaehee Yoo	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
1073ba3579e6STaehee Yoo	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
1074ba3579e6STaehee Yoo	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
1075ba3579e6STaehee Yoo	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
1076ba3579e6STaehee Yoo	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
1077ba3579e6STaehee Yoo	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
1078ba3579e6STaehee Yoo	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
1079ba3579e6STaehee Yoo	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1080ba3579e6STaehee Yoo		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1081ba3579e6STaehee Yoo		     %xmm15, %r10);
1082ba3579e6STaehee Yoo
1083ba3579e6STaehee Yoo	FRAME_END
1084ba3579e6STaehee Yoo	RET;
1085ba3579e6STaehee YooSYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)
1086ba3579e6STaehee Yoo
1087ba3579e6STaehee YooSYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
1088ba3579e6STaehee Yoo	/* input:
1089ba3579e6STaehee Yoo	*      %r9: rk
1090ba3579e6STaehee Yoo	*      %rsi: dst
1091ba3579e6STaehee Yoo	*      %rdx: src
1092ba3579e6STaehee Yoo	*      %xmm0..%xmm15: 16 byte-sliced blocks
1093ba3579e6STaehee Yoo	*/
1094ba3579e6STaehee Yoo
1095ba3579e6STaehee Yoo	FRAME_BEGIN
1096ba3579e6STaehee Yoo
1097ba3579e6STaehee Yoo	movq %rsi, %rax;
1098ba3579e6STaehee Yoo	leaq 8 * 16(%rax), %r8;
1099ba3579e6STaehee Yoo
1100ba3579e6STaehee Yoo	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
1101ba3579e6STaehee Yoo		      %xmm4, %xmm5, %xmm6, %xmm7,
1102ba3579e6STaehee Yoo		      %xmm8, %xmm9, %xmm10, %xmm11,
1103ba3579e6STaehee Yoo		      %xmm12, %xmm13, %xmm14,
1104ba3579e6STaehee Yoo		      %xmm15, %rax, %r8);
1105ba3579e6STaehee Yoo	aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
1106ba3579e6STaehee Yoo		     %xmm12, %xmm13, %xmm14, %xmm15,
1107ba3579e6STaehee Yoo		     %xmm0, %xmm1, %xmm2, %xmm3,
1108ba3579e6STaehee Yoo		     %xmm4, %xmm5, %xmm6, %xmm7,
1109ba3579e6STaehee Yoo		     %rax, %r9, 0);
1110ba3579e6STaehee Yoo	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1111ba3579e6STaehee Yoo		     %xmm4, %xmm5, %xmm6, %xmm7,
1112ba3579e6STaehee Yoo		     %xmm8, %xmm9, %xmm10, %xmm11,
1113ba3579e6STaehee Yoo		     %xmm12, %xmm13, %xmm14,
1114ba3579e6STaehee Yoo		     %xmm15, %rax, %r9, 1);
1115ba3579e6STaehee Yoo	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1116ba3579e6STaehee Yoo		     %xmm12, %xmm13, %xmm14, %xmm15,
1117ba3579e6STaehee Yoo		     %xmm0, %xmm1, %xmm2, %xmm3,
1118ba3579e6STaehee Yoo		     %xmm4, %xmm5, %xmm6, %xmm7,
1119ba3579e6STaehee Yoo		     %rax, %r9, 2);
1120ba3579e6STaehee Yoo	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1121ba3579e6STaehee Yoo		     %xmm4, %xmm5, %xmm6, %xmm7,
1122ba3579e6STaehee Yoo		     %xmm8, %xmm9, %xmm10, %xmm11,
1123ba3579e6STaehee Yoo		     %xmm12, %xmm13, %xmm14,
1124ba3579e6STaehee Yoo		     %xmm15, %rax, %r9, 3);
1125ba3579e6STaehee Yoo	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1126ba3579e6STaehee Yoo		     %xmm12, %xmm13, %xmm14, %xmm15,
1127ba3579e6STaehee Yoo		     %xmm0, %xmm1, %xmm2, %xmm3,
1128ba3579e6STaehee Yoo		     %xmm4, %xmm5, %xmm6, %xmm7,
1129ba3579e6STaehee Yoo		     %rax, %r9, 4);
1130ba3579e6STaehee Yoo	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1131ba3579e6STaehee Yoo		     %xmm4, %xmm5, %xmm6, %xmm7,
1132ba3579e6STaehee Yoo		     %xmm8, %xmm9, %xmm10, %xmm11,
1133ba3579e6STaehee Yoo		     %xmm12, %xmm13, %xmm14,
1134ba3579e6STaehee Yoo		     %xmm15, %rax, %r9, 5);
1135ba3579e6STaehee Yoo	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1136ba3579e6STaehee Yoo		     %xmm12, %xmm13, %xmm14, %xmm15,
1137ba3579e6STaehee Yoo		     %xmm0, %xmm1, %xmm2, %xmm3,
1138ba3579e6STaehee Yoo		     %xmm4, %xmm5, %xmm6, %xmm7,
1139ba3579e6STaehee Yoo		     %rax, %r9, 6);
1140ba3579e6STaehee Yoo	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1141ba3579e6STaehee Yoo		     %xmm4, %xmm5, %xmm6, %xmm7,
1142ba3579e6STaehee Yoo		     %xmm8, %xmm9, %xmm10, %xmm11,
1143ba3579e6STaehee Yoo		     %xmm12, %xmm13, %xmm14,
1144ba3579e6STaehee Yoo		     %xmm15, %rax, %r9, 7);
1145ba3579e6STaehee Yoo	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1146ba3579e6STaehee Yoo		     %xmm12, %xmm13, %xmm14, %xmm15,
1147ba3579e6STaehee Yoo		     %xmm0, %xmm1, %xmm2, %xmm3,
1148ba3579e6STaehee Yoo		     %xmm4, %xmm5, %xmm6, %xmm7,
1149ba3579e6STaehee Yoo		     %rax, %r9, 8);
1150ba3579e6STaehee Yoo	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1151ba3579e6STaehee Yoo		     %xmm4, %xmm5, %xmm6, %xmm7,
1152ba3579e6STaehee Yoo		     %xmm8, %xmm9, %xmm10, %xmm11,
1153ba3579e6STaehee Yoo		     %xmm12, %xmm13, %xmm14,
1154ba3579e6STaehee Yoo		     %xmm15, %rax, %r9, 9);
1155ba3579e6STaehee Yoo	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1156ba3579e6STaehee Yoo		     %xmm12, %xmm13, %xmm14, %xmm15,
1157ba3579e6STaehee Yoo		     %xmm0, %xmm1, %xmm2, %xmm3,
1158ba3579e6STaehee Yoo		     %xmm4, %xmm5, %xmm6, %xmm7,
1159ba3579e6STaehee Yoo		     %rax, %r9, 10);
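	/*
	 * Key-length dispatch: rounds(CTX) is 12 for ARIA-128, 14 for
	 * ARIA-192 and 16 for ARIA-256.  The rounds above consumed round
	 * keys 0..10; run zero, one or two more fe/fo pairs depending on
	 * the key size and finish with aria_ff_gfni, which applies the
	 * last round together with the final whitening key.
	 */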
1160ba3579e6STaehee Yoo	cmpl $12, rounds(CTX);
1161ba3579e6STaehee Yoo	jne .Laria_gfni_192;
1162ba3579e6STaehee Yoo	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1163ba3579e6STaehee Yoo		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1164ba3579e6STaehee Yoo		%xmm15, %rax, %r9, 11, 12);
1165ba3579e6STaehee Yoo	jmp .Laria_gfni_end;
1166ba3579e6STaehee Yoo.Laria_gfni_192:
1167ba3579e6STaehee Yoo	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1168ba3579e6STaehee Yoo		     %xmm4, %xmm5, %xmm6, %xmm7,
1169ba3579e6STaehee Yoo		     %xmm8, %xmm9, %xmm10, %xmm11,
1170ba3579e6STaehee Yoo		     %xmm12, %xmm13, %xmm14,
1171ba3579e6STaehee Yoo		     %xmm15, %rax, %r9, 11);
1172ba3579e6STaehee Yoo	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1173ba3579e6STaehee Yoo		     %xmm12, %xmm13, %xmm14, %xmm15,
1174ba3579e6STaehee Yoo		     %xmm0, %xmm1, %xmm2, %xmm3,
1175ba3579e6STaehee Yoo		     %xmm4, %xmm5, %xmm6, %xmm7,
1176ba3579e6STaehee Yoo		     %rax, %r9, 12);
1177ba3579e6STaehee Yoo	cmpl $14, rounds(CTX);
1178ba3579e6STaehee Yoo	jne .Laria_gfni_256;
1179ba3579e6STaehee Yoo	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1180ba3579e6STaehee Yoo		     %xmm4, %xmm5, %xmm6, %xmm7,
1181ba3579e6STaehee Yoo		     %xmm8, %xmm9, %xmm10, %xmm11,
1182ba3579e6STaehee Yoo		     %xmm12, %xmm13, %xmm14,
1183ba3579e6STaehee Yoo		     %xmm15, %rax, %r9, 13, 14);
1184ba3579e6STaehee Yoo	jmp .Laria_gfni_end;
1185ba3579e6STaehee Yoo.Laria_gfni_256:
1186ba3579e6STaehee Yoo	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1187ba3579e6STaehee Yoo		     %xmm4, %xmm5, %xmm6, %xmm7,
1188ba3579e6STaehee Yoo		     %xmm8, %xmm9, %xmm10, %xmm11,
1189ba3579e6STaehee Yoo		     %xmm12, %xmm13, %xmm14,
1190ba3579e6STaehee Yoo		     %xmm15, %rax, %r9, 13);
1191ba3579e6STaehee Yoo	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1192ba3579e6STaehee Yoo		     %xmm12, %xmm13, %xmm14, %xmm15,
1193ba3579e6STaehee Yoo		     %xmm0, %xmm1, %xmm2, %xmm3,
1194ba3579e6STaehee Yoo		     %xmm4, %xmm5, %xmm6, %xmm7,
1195ba3579e6STaehee Yoo		     %rax, %r9, 14);
1196ba3579e6STaehee Yoo	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1197ba3579e6STaehee Yoo		     %xmm4, %xmm5, %xmm6, %xmm7,
1198ba3579e6STaehee Yoo		     %xmm8, %xmm9, %xmm10, %xmm11,
1199ba3579e6STaehee Yoo		     %xmm12, %xmm13, %xmm14,
1200ba3579e6STaehee Yoo		     %xmm15, %rax, %r9, 15, 16);
1201ba3579e6STaehee Yoo.Laria_gfni_end:
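	/*
	 * Convert the state back from byte-sliced order into 16 normal
	 * blocks; the caller writes them out (or XORs them with src in the
	 * CTR path).
	 */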
1202ba3579e6STaehee Yoo	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
1203ba3579e6STaehee Yoo			   %xmm9, %xmm13, %xmm0, %xmm5,
1204ba3579e6STaehee Yoo			   %xmm10, %xmm14, %xmm3, %xmm6,
1205ba3579e6STaehee Yoo			   %xmm11, %xmm15, %xmm2, %xmm7,
1206ba3579e6STaehee Yoo			   (%rax), (%r8));
1207ba3579e6STaehee Yoo
1208ba3579e6STaehee Yoo	FRAME_END
1209ba3579e6STaehee Yoo	RET;
1210ba3579e6STaehee YooSYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)
1211ba3579e6STaehee Yoo
1212*c67b553aSEric BiggersSYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
1213ba3579e6STaehee Yoo	/* input:
1214ba3579e6STaehee Yoo	*      %rdi: ctx, CTX
1215ba3579e6STaehee Yoo	*      %rsi: dst
1216ba3579e6STaehee Yoo	*      %rdx: src
1217ba3579e6STaehee Yoo	*/
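	/*
	 * CFI-typed entry point; the glue code is expected to declare it
	 * roughly as (a sketch, not copied from the glue file):
	 *
	 *   asmlinkage void aria_aesni_avx_gfni_encrypt_16way(const void *ctx,
	 *					u8 *dst, const u8 *src);
	 */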
1218ba3579e6STaehee Yoo
1219ba3579e6STaehee Yoo	FRAME_BEGIN
1220ba3579e6STaehee Yoo
1221ba3579e6STaehee Yoo	leaq enc_key(CTX), %r9;
1222ba3579e6STaehee Yoo
1223ba3579e6STaehee Yoo	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1224ba3579e6STaehee Yoo		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1225ba3579e6STaehee Yoo		     %xmm15, %rdx);
1226ba3579e6STaehee Yoo
1227ba3579e6STaehee Yoo	call __aria_aesni_avx_gfni_crypt_16way;
1228ba3579e6STaehee Yoo
1229ba3579e6STaehee Yoo	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1230ba3579e6STaehee Yoo		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1231ba3579e6STaehee Yoo		     %xmm15, %rax);
1232ba3579e6STaehee Yoo
1233ba3579e6STaehee Yoo	FRAME_END
1234ba3579e6STaehee Yoo	RET;
1235ba3579e6STaehee YooSYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)
1236ba3579e6STaehee Yoo
1237*c67b553aSEric BiggersSYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
1238ba3579e6STaehee Yoo	/* input:
1239ba3579e6STaehee Yoo	*      %rdi: ctx, CTX
1240ba3579e6STaehee Yoo	*      %rsi: dst
1241ba3579e6STaehee Yoo	*      %rdx: src
1242ba3579e6STaehee Yoo	*/
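	/*
	 * Identical to the encrypt entry point above except that the
	 * decryption round keys (dec_key) are used.
	 */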
1243ba3579e6STaehee Yoo
1244ba3579e6STaehee Yoo	FRAME_BEGIN
1245ba3579e6STaehee Yoo
1246ba3579e6STaehee Yoo	leaq dec_key(CTX), %r9;
1247ba3579e6STaehee Yoo
1248ba3579e6STaehee Yoo	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1249ba3579e6STaehee Yoo		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1250ba3579e6STaehee Yoo		     %xmm15, %rdx);
1251ba3579e6STaehee Yoo
1252ba3579e6STaehee Yoo	call __aria_aesni_avx_gfni_crypt_16way;
1253ba3579e6STaehee Yoo
1254ba3579e6STaehee Yoo	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1255ba3579e6STaehee Yoo		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1256ba3579e6STaehee Yoo		     %xmm15, %rax);
1257ba3579e6STaehee Yoo
1258ba3579e6STaehee Yoo	FRAME_END
1259ba3579e6STaehee Yoo	RET;
1260ba3579e6STaehee YooSYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)
1261ba3579e6STaehee Yoo
1262*c67b553aSEric BiggersSYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
1263ba3579e6STaehee Yoo	/* input:
1264ba3579e6STaehee Yoo	*      %rdi: ctx
1265ba3579e6STaehee Yoo	*      %rsi: dst
1266ba3579e6STaehee Yoo	*      %rdx: src
1267ba3579e6STaehee Yoo	*      %rcx: keystream
1268ba3579e6STaehee Yoo	*      %r8: iv (big endian, 128bit)
1269ba3579e6STaehee Yoo	*/
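	/*
	 * GFNI counterpart of aria_aesni_avx_ctr_crypt_16way above: same
	 * register contract and the same keystream-generation helper; only
	 * the crypt core differs.
	 */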
1270ba3579e6STaehee Yoo	FRAME_BEGIN
1271ba3579e6STaehee Yoo
1272ba3579e6STaehee Yoo	call __aria_aesni_avx_ctr_gen_keystream_16way;
1273ba3579e6STaehee Yoo
1274ba3579e6STaehee Yoo	leaq (%rsi), %r10;
1275ba3579e6STaehee Yoo	leaq (%rdx), %r11;
1276ba3579e6STaehee Yoo	leaq (%rcx), %rsi;
1277ba3579e6STaehee Yoo	leaq (%rcx), %rdx;
1278ba3579e6STaehee Yoo	leaq enc_key(CTX), %r9;
1279ba3579e6STaehee Yoo
1280ba3579e6STaehee Yoo	call __aria_aesni_avx_gfni_crypt_16way;
1281ba3579e6STaehee Yoo
1282ba3579e6STaehee Yoo	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
1283ba3579e6STaehee Yoo	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
1284ba3579e6STaehee Yoo	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
1285ba3579e6STaehee Yoo	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
1286ba3579e6STaehee Yoo	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
1287ba3579e6STaehee Yoo	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
1288ba3579e6STaehee Yoo	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
1289ba3579e6STaehee Yoo	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
1290ba3579e6STaehee Yoo	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
1291ba3579e6STaehee Yoo	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
1292ba3579e6STaehee Yoo	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
1293ba3579e6STaehee Yoo	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
1294ba3579e6STaehee Yoo	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
1295ba3579e6STaehee Yoo	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
1296ba3579e6STaehee Yoo	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
1297ba3579e6STaehee Yoo	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
1298ba3579e6STaehee Yoo	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1299ba3579e6STaehee Yoo		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1300ba3579e6STaehee Yoo		     %xmm15, %r10);
1301ba3579e6STaehee Yoo
1302ba3579e6STaehee Yoo	FRAME_END
1303ba3579e6STaehee Yoo	RET;
1304ba3579e6STaehee YooSYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)