xref: /linux/arch/x86/crypto/aria-aesni-avx2-asm_64.S (revision c532de5a67a70f8533d495f8f2aaa9a0491c3ad0)
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * ARIA Cipher 32-way parallel algorithm (AVX2)
4 *
5 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
6 *
7 */
8
9#include <linux/linkage.h>
10#include <asm/frame.h>
11#include <asm/asm-offsets.h>
12#include <linux/cfi_types.h>
13
14/* register macros */
/*
 * CTX names %rdi, which the exported functions below document as their ctx
 * argument.  Each ymmN_x define expands to the matching xmmN name, so a
 * macro can paste "##_x" onto a %ymmN argument to address the low 128 bits
 * of that register (see aria_sbox_8way).
 */
15#define CTX %rdi
16
17#define ymm0_x xmm0
18#define ymm1_x xmm1
19#define ymm2_x xmm2
20#define ymm3_x xmm3
21#define ymm4_x xmm4
22#define ymm5_x xmm5
23#define ymm6_x xmm6
24#define ymm7_x xmm7
25#define ymm8_x xmm8
26#define ymm9_x xmm9
27#define ymm10_x xmm10
28#define ymm11_x xmm11
29#define ymm12_x xmm12
30#define ymm13_x xmm13
31#define ymm14_x xmm14
32#define ymm15_x xmm15
33
/*
 * BV8(a0..a7): pack eight bit values into one byte.  Each argument is masked
 * with "& 1"; a0 lands in bit 0 (least significant) and a7 in bit 7.
 */
34#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)		\
35	( (((a0) & 1) << 0) |				\
36	  (((a1) & 1) << 1) |				\
37	  (((a2) & 1) << 2) |				\
38	  (((a3) & 1) << 3) |				\
39	  (((a4) & 1) << 4) |				\
40	  (((a5) & 1) << 5) |				\
41	  (((a6) & 1) << 6) |				\
42	  (((a7) & 1) << 7) )
43
/*
 * BM8X8(l0..l7): pack eight byte-sized rows into one 64-bit value.  l0 is
 * placed in the most significant byte (shift by 7 * 8) and l7 in the least
 * significant byte (shift by 0).  The arguments are not masked here.
 */
44#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)		\
45	( ((l7) << (0 * 8)) |				\
46	  ((l6) << (1 * 8)) |				\
47	  ((l5) << (2 * 8)) |				\
48	  ((l4) << (3 * 8)) |				\
49	  ((l3) << (4 * 8)) |				\
50	  ((l2) << (5 * 8)) |				\
51	  ((l1) << (6 * 8)) |				\
52	  ((l0) << (7 * 8)) )
53
/*
 * inc_le128(x, minus_one, tmp):
 *   tmp = per-qword compare mask (x == minus_one), taken before x changes;
 *   x  -= minus_one (per qword);
 *   tmp is shifted up by 8 bytes inside each 128-bit lane, so the mask of
 *   the low qword now sits over the high qword;
 *   x  -= tmp (per qword).
 * With minus_one holding -1 in the low qword and 0 in the high qword of each
 * lane, this adds 1 to a little-endian 128-bit value and carries into the
 * high qword when the low qword was all ones.
 * NOTE(review): that minus_one layout is assumed from the parameter name;
 * confirm against the caller.  tmp is clobbered.
 */
54#define inc_le128(x, minus_one, tmp)			\
55	vpcmpeqq minus_one, x, tmp;			\
56	vpsubq minus_one, x, x;				\
57	vpslldq $8, tmp, tmp;				\
58	vpsubq tmp, x, x;
59
/*
 * filter_8bit(x, lo_t, hi_t, mask4bit, tmp0): nibble-split byte lookup.
 *   tmp0 = x & mask4bit            (low nibbles)
 *   x    = (x & ~mask4bit) >> 4    (high nibbles, moved down)
 *   x    = vpshufb(lo_t, tmp0) ^ vpshufb(hi_t, x)
 * The shift is a 32-bit vpsrld; it is safe per byte because the bits that
 * would cross a byte boundary were cleared by the vpandn just before.
 * The caller in this file passes a mask of 0x0f bytes (.L0f0f0f0f).
 * tmp0 is clobbered.
 */
60#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)	\
61	vpand x, mask4bit, tmp0;			\
62	vpandn x, mask4bit, x;				\
63	vpsrld $4, x, x;				\
64							\
65	vpshufb tmp0, lo_t, tmp0;			\
66	vpshufb x, hi_t, x;				\
67	vpxor tmp0, x, x;
68
/*
 * transpose_4x4(x0, x1, x2, x3, t1, t2): treat x0..x3 as the rows of a 4x4
 * matrix of 32-bit elements and transpose it in place, using dword unpacks
 * followed by qword unpacks.  The unpack instructions operate within each
 * 128-bit lane, so both lanes of the ymm registers are transposed
 * independently.  t1 and t2 are scratch registers and are clobbered.
 */
69#define transpose_4x4(x0, x1, x2, x3, t1, t2)		\
70	vpunpckhdq x1, x0, t2;				\
71	vpunpckldq x1, x0, x0;				\
72							\
73	vpunpckldq x3, x2, t1;				\
74	vpunpckhdq x3, x2, x2;				\
75							\
76	vpunpckhqdq t1, x0, x1;				\
77	vpunpcklqdq t1, x0, x0;				\
78							\
79	vpunpckhqdq x2, t2, x3;				\
80	vpunpcklqdq x2, t2, x2;
81
/*
 * byteslice_16x16b(a0..d3, st0, st1): rearrange sixteen vectors in three
 * stages:
 *   1. 4x4 dword transposes over (a0..a3), (b0..b3), (c0..c3), (d0..d3);
 *   2. a vpshufb of every register with the .Lshufb_16x16b pattern;
 *   3. 4x4 dword transposes over (a0,b0,c0,d0), (a1,b1,c1,d1),
 *      (a2,b2,c2,d2), (a3,b3,c3,d3).
 * All sixteen arguments are live throughout, so two registers at a time are
 * saved to st0/st1 and borrowed as transpose scratch or as the shuffle mask,
 * then restored.  st0 and st1 are therefore written and read back several
 * times; callers in this file pass memory operands for them.
 * The statement order is load-bearing: each spill/restore pair brackets the
 * exact transposes that borrow those registers.
 */
82#define byteslice_16x16b(a0, b0, c0, d0,		\
83			 a1, b1, c1, d1,		\
84			 a2, b2, c2, d2,		\
85			 a3, b3, c3, d3,		\
86			 st0, st1)			\
87	vmovdqu d2, st0;				\
88	vmovdqu d3, st1;				\
89	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
90	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
91	vmovdqu st0, d2;				\
92	vmovdqu st1, d3;				\
93							\
94	vmovdqu a0, st0;				\
95	vmovdqu a1, st1;				\
96	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
97	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
98							\
99	vbroadcasti128 .Lshufb_16x16b(%rip), a0;	\
100	vmovdqu st1, a1;				\
101	vpshufb a0, a2, a2;				\
102	vpshufb a0, a3, a3;				\
103	vpshufb a0, b0, b0;				\
104	vpshufb a0, b1, b1;				\
105	vpshufb a0, b2, b2;				\
106	vpshufb a0, b3, b3;				\
107	vpshufb a0, a1, a1;				\
108	vpshufb a0, c0, c0;				\
109	vpshufb a0, c1, c1;				\
110	vpshufb a0, c2, c2;				\
111	vpshufb a0, c3, c3;				\
112	vpshufb a0, d0, d0;				\
113	vpshufb a0, d1, d1;				\
114	vpshufb a0, d2, d2;				\
115	vpshufb a0, d3, d3;				\
116	vmovdqu d3, st1;				\
117	vmovdqu st0, d3;				\
118	vpshufb a0, d3, a0;				\
119	vmovdqu d2, st0;				\
120							\
121	transpose_4x4(a0, b0, c0, d0, d2, d3);		\
122	transpose_4x4(a1, b1, c1, d1, d2, d3);		\
123	vmovdqu st0, d2;				\
124	vmovdqu st1, d3;				\
125							\
126	vmovdqu b0, st0;				\
127	vmovdqu b1, st1;				\
128	transpose_4x4(a2, b2, c2, d2, b0, b1);		\
129	transpose_4x4(a3, b3, c3, d3, b0, b1);		\
130	vmovdqu st0, b0;				\
131	vmovdqu st1, b1;				\
132	/* does not adjust output bytes inside vectors */
133
/*
 * debyteslice_16x16b(a0..d3, st0, st1): same three-stage structure as
 * byteslice_16x16b (4x4 dword transposes, vpshufb by .Lshufb_16x16b, 4x4
 * dword transposes), with one difference: the final transposes take their
 * registers in the order (c, d, a, b) instead of (a, b, c, d).
 * st0/st1 are spill slots that are written and read back while registers
 * are borrowed as scratch; callers in this file pass memory operands.
 * The statement order is load-bearing for the same reason as in
 * byteslice_16x16b.
 */
134#define debyteslice_16x16b(a0, b0, c0, d0,		\
135			   a1, b1, c1, d1,		\
136			   a2, b2, c2, d2,		\
137			   a3, b3, c3, d3,		\
138			   st0, st1)			\
139	vmovdqu d2, st0;				\
140	vmovdqu d3, st1;				\
141	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
142	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
143	vmovdqu st0, d2;				\
144	vmovdqu st1, d3;				\
145							\
146	vmovdqu a0, st0;				\
147	vmovdqu a1, st1;				\
148	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
149	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
150							\
151	vbroadcasti128 .Lshufb_16x16b(%rip), a0;	\
152	vmovdqu st1, a1;				\
153	vpshufb a0, a2, a2;				\
154	vpshufb a0, a3, a3;				\
155	vpshufb a0, b0, b0;				\
156	vpshufb a0, b1, b1;				\
157	vpshufb a0, b2, b2;				\
158	vpshufb a0, b3, b3;				\
159	vpshufb a0, a1, a1;				\
160	vpshufb a0, c0, c0;				\
161	vpshufb a0, c1, c1;				\
162	vpshufb a0, c2, c2;				\
163	vpshufb a0, c3, c3;				\
164	vpshufb a0, d0, d0;				\
165	vpshufb a0, d1, d1;				\
166	vpshufb a0, d2, d2;				\
167	vpshufb a0, d3, d3;				\
168	vmovdqu d3, st1;				\
169	vmovdqu st0, d3;				\
170	vpshufb a0, d3, a0;				\
171	vmovdqu d2, st0;				\
172							\
173	transpose_4x4(c0, d0, a0, b0, d2, d3);		\
174	transpose_4x4(c1, d1, a1, b1, d2, d3);		\
175	vmovdqu st0, d2;				\
176	vmovdqu st1, d3;				\
177							\
178	vmovdqu b0, st0;				\
179	vmovdqu b1, st1;				\
180	transpose_4x4(c2, d2, a2, b2, b0, b1);		\
181	transpose_4x4(c3, d3, a3, b3, b0, b1);		\
182	vmovdqu st0, b0;				\
183	vmovdqu st1, b1;				\
184	/* does not adjust output bytes inside vectors */
185
186/* load blocks to registers */
/*
 * inpack16_pre(x0..x7, y0..y7, rio): sixteen unaligned 32-byte loads from
 * rio at offsets 0 * 32 .. 15 * 32, into x0..x7 then y0..y7 (512 bytes in
 * total).  The macro consists of plain loads only; no key material is mixed
 * in at this point.
 */
187#define inpack16_pre(x0, x1, x2, x3,			\
188		     x4, x5, x6, x7,			\
189		     y0, y1, y2, y3,			\
190		     y4, y5, y6, y7,			\
191		     rio)				\
192	vmovdqu (0 * 32)(rio), x0;			\
193	vmovdqu (1 * 32)(rio), x1;			\
194	vmovdqu (2 * 32)(rio), x2;			\
195	vmovdqu (3 * 32)(rio), x3;			\
196	vmovdqu (4 * 32)(rio), x4;			\
197	vmovdqu (5 * 32)(rio), x5;			\
198	vmovdqu (6 * 32)(rio), x6;			\
199	vmovdqu (7 * 32)(rio), x7;			\
200	vmovdqu (8 * 32)(rio), y0;			\
201	vmovdqu (9 * 32)(rio), y1;			\
202	vmovdqu (10 * 32)(rio), y2;			\
203	vmovdqu (11 * 32)(rio), y3;			\
204	vmovdqu (12 * 32)(rio), y4;			\
205	vmovdqu (13 * 32)(rio), y5;			\
206	vmovdqu (14 * 32)(rio), y6;			\
207	vmovdqu (15 * 32)(rio), y7;
208
209/* byteslice the loaded blocks and store them to temporary memory */
/*
 * inpack16_post(x0..x7, y0..y7, mem_ab, mem_cd): run byteslice_16x16b over
 * the sixteen registers, using the first 32 bytes at mem_ab and at mem_cd as
 * its two spill slots, then store x0..x7 to mem_ab + 0 * 32 .. 7 * 32 and
 * y0..y7 to mem_cd + 0 * 32 .. 7 * 32.  The registers keep their byte-sliced
 * contents after the stores.
 */
210#define inpack16_post(x0, x1, x2, x3,			\
211		      x4, x5, x6, x7,			\
212		      y0, y1, y2, y3,			\
213		      y4, y5, y6, y7,			\
214		      mem_ab, mem_cd)			\
215	byteslice_16x16b(x0, x1, x2, x3,		\
216			 x4, x5, x6, x7,		\
217			 y0, y1, y2, y3,		\
218			 y4, y5, y6, y7,		\
219			 (mem_ab), (mem_cd));		\
220							\
221	vmovdqu x0, 0 * 32(mem_ab);			\
222	vmovdqu x1, 1 * 32(mem_ab);			\
223	vmovdqu x2, 2 * 32(mem_ab);			\
224	vmovdqu x3, 3 * 32(mem_ab);			\
225	vmovdqu x4, 4 * 32(mem_ab);			\
226	vmovdqu x5, 5 * 32(mem_ab);			\
227	vmovdqu x6, 6 * 32(mem_ab);			\
228	vmovdqu x7, 7 * 32(mem_ab);			\
229	vmovdqu y0, 0 * 32(mem_cd);			\
230	vmovdqu y1, 1 * 32(mem_cd);			\
231	vmovdqu y2, 2 * 32(mem_cd);			\
232	vmovdqu y3, 3 * 32(mem_cd);			\
233	vmovdqu y4, 4 * 32(mem_cd);			\
234	vmovdqu y5, 5 * 32(mem_cd);			\
235	vmovdqu y6, 6 * 32(mem_cd);			\
236	vmovdqu y7, 7 * 32(mem_cd);
237
/*
 * write_output(x0..x7, y0..y7, mem): sixteen unaligned 32-byte stores to mem
 * at offsets 0 * 32 .. 15 * 32, in argument order (x0..x7 then y0..y7).
 * NOTE(review): the last store line ends with a line-continuation backslash,
 * so the line that follows it is joined into this macro.  That line is
 * blank in the source, which makes it harmless; keep it blank (or drop the
 * backslash) when editing around here.
 */
238#define write_output(x0, x1, x2, x3,			\
239		     x4, x5, x6, x7,			\
240		     y0, y1, y2, y3,			\
241		     y4, y5, y6, y7,			\
242		     mem)				\
243	vmovdqu x0, 0 * 32(mem);			\
244	vmovdqu x1, 1 * 32(mem);			\
245	vmovdqu x2, 2 * 32(mem);			\
246	vmovdqu x3, 3 * 32(mem);			\
247	vmovdqu x4, 4 * 32(mem);			\
248	vmovdqu x5, 5 * 32(mem);			\
249	vmovdqu x6, 6 * 32(mem);			\
250	vmovdqu x7, 7 * 32(mem);			\
251	vmovdqu y0, 8 * 32(mem);			\
252	vmovdqu y1, 9 * 32(mem);			\
253	vmovdqu y2, 10 * 32(mem);			\
254	vmovdqu y3, 11 * 32(mem);			\
255	vmovdqu y4, 12 * 32(mem);			\
256	vmovdqu y5, 13 * 32(mem);			\
257	vmovdqu y6, 14 * 32(mem);			\
258	vmovdqu y7, 15 * 32(mem);			\
259
/*
 * aria_store_state_8way(x0..x7, mem_tmp, idx): store eight vectors to
 * mem_tmp at 32-byte slots idx + 0 .. idx + 7.  The round macros use idx 0
 * and idx 8 to address the two halves of a 16-slot scratch area.
 */
260#define aria_store_state_8way(x0, x1, x2, x3,		\
261			      x4, x5, x6, x7,		\
262			      mem_tmp, idx)		\
263	vmovdqu x0, ((idx + 0) * 32)(mem_tmp);		\
264	vmovdqu x1, ((idx + 1) * 32)(mem_tmp);		\
265	vmovdqu x2, ((idx + 2) * 32)(mem_tmp);		\
266	vmovdqu x3, ((idx + 3) * 32)(mem_tmp);		\
267	vmovdqu x4, ((idx + 4) * 32)(mem_tmp);		\
268	vmovdqu x5, ((idx + 5) * 32)(mem_tmp);		\
269	vmovdqu x6, ((idx + 6) * 32)(mem_tmp);		\
270	vmovdqu x7, ((idx + 7) * 32)(mem_tmp);
271
/*
 * aria_load_state_8way(x0..x7, mem_tmp, idx): load eight vectors from
 * mem_tmp at 32-byte slots idx + 0 .. idx + 7; the counterpart of
 * aria_store_state_8way.
 */
272#define aria_load_state_8way(x0, x1, x2, x3,		\
273			     x4, x5, x6, x7,		\
274			     mem_tmp, idx)		\
275	vmovdqu ((idx + 0) * 32)(mem_tmp), x0;		\
276	vmovdqu ((idx + 1) * 32)(mem_tmp), x1;		\
277	vmovdqu ((idx + 2) * 32)(mem_tmp), x2;		\
278	vmovdqu ((idx + 3) * 32)(mem_tmp), x3;		\
279	vmovdqu ((idx + 4) * 32)(mem_tmp), x4;		\
280	vmovdqu ((idx + 5) * 32)(mem_tmp), x5;		\
281	vmovdqu ((idx + 6) * 32)(mem_tmp), x6;		\
282	vmovdqu ((idx + 7) * 32)(mem_tmp), x7;
283
/*
 * aria_ark_8way(x0..x7, t0, rk, idx, round): XOR one round-key byte into
 * each of eight registers.  For every register a single byte is read from
 * rk at offset round * 16 + idx + k and broadcast to all byte positions
 * (vpbroadcastb) before the XOR.  The byte index k runs 3, 2, 1, 0 for
 * x0..x3 and 7, 6, 5, 4 for x4..x7, i.e. the bytes of each 32-bit key word
 * are consumed in reverse order.  idx selects which 8-byte half of the
 * 16-byte round key is used (callers pass 0 or 8).  t0 is clobbered.
 */
284#define aria_ark_8way(x0, x1, x2, x3,			\
285		      x4, x5, x6, x7,			\
286		      t0, rk, idx, round)		\
287	/* AddRoundKey */                               \
288	vpbroadcastb ((round * 16) + idx + 3)(rk), t0;	\
289	vpxor t0, x0, x0;				\
290	vpbroadcastb ((round * 16) + idx + 2)(rk), t0;	\
291	vpxor t0, x1, x1;				\
292	vpbroadcastb ((round * 16) + idx + 1)(rk), t0;	\
293	vpxor t0, x2, x2;				\
294	vpbroadcastb ((round * 16) + idx + 0)(rk), t0;	\
295	vpxor t0, x3, x3;				\
296	vpbroadcastb ((round * 16) + idx + 7)(rk), t0;	\
297	vpxor t0, x4, x4;				\
298	vpbroadcastb ((round * 16) + idx + 6)(rk), t0;	\
299	vpxor t0, x5, x5;				\
300	vpbroadcastb ((round * 16) + idx + 5)(rk), t0;	\
301	vpxor t0, x6, x6;				\
302	vpbroadcastb ((round * 16) + idx + 4)(rk), t0;	\
303	vpxor t0, x7, x7;
304
305#ifdef CONFIG_AS_GFNI
/*
 * aria_sbox_8way_gfni(x0..x7, t0..t7): substitution layer built on the GFNI
 * affine instructions.  Five 8x8 bit matrices are broadcast into t0..t4 and
 * applied pairwise:
 *   x1, x5: vgf2p8affineinvqb with the S2 matrix and tf_s2_const;
 *   x2, x6: vgf2p8affineqb with the AES inverse-affine matrix and
 *           tf_inv_const, then vgf2p8affineinvqb with the identity matrix
 *           and constant 0;
 *   x0, x4: vgf2p8affineinvqb with the AES affine matrix and tf_aff_const;
 *   x3, x7: vgf2p8affineqb with the X2 matrix and tf_x2_const, then
 *           vgf2p8affineinvqb with the identity matrix and constant 0.
 * t0..t4 are clobbered; t5..t7 are accepted but not used by this variant.
 * The last instruction carries no trailing ';', so callers supply it.
 */
306#define aria_sbox_8way_gfni(x0, x1, x2, x3,		\
307			    x4, x5, x6, x7,		\
308			    t0, t1, t2, t3,		\
309			    t4, t5, t6, t7)		\
310	vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0;	\
311	vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1;	\
312	vpbroadcastq .Ltf_id_bitmatrix(%rip), t2;	\
313	vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3;	\
314	vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4;	\
315	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
316	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
317	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
318	vgf2p8affineqb $(tf_inv_const), t1, x6, x6;	\
319	vgf2p8affineinvqb $0, t2, x2, x2;		\
320	vgf2p8affineinvqb $0, t2, x6, x6;		\
321	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;	\
322	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;	\
323	vgf2p8affineqb $(tf_x2_const), t4, x3, x3;	\
324	vgf2p8affineqb $(tf_x2_const), t4, x7, x7;	\
325	vgf2p8affineinvqb $0, t2, x3, x3;		\
326	vgf2p8affineinvqb $0, t2, x7, x7
327
328#endif /* CONFIG_AS_GFNI */
/*
 * aria_sbox_8way(x0..x7, t0..t7): substitution layer built on AES-NI.
 * The AES instructions are used in their 128-bit form, so every 256-bit
 * register is handled as two halves: the high half is extracted into
 * t6##_x, both halves go through the AES instruction with the all-zero
 * round key held in t7, and the high half is inserted back.
 *   x0, x4, x1, x5: vaesenclast, then vpshufb with .Linv_shift_row (t0);
 *   x2, x6:         vaesdeclast, then vpshufb with .Lshift_row (t1);
 *   x1, x5:         additionally filter_8bit with the "inv_aff and s2"
 *                   nibble tables (t2/t3);
 *   x3, x7:         vpshufb with .Lshift_row (t1), filter_8bit with the
 *                   "x2 and fwd_aff" nibble tables (t4/t5), then
 *                   vaesdeclast.
 * t6 is reused: first as the high-half scratch, then as the 0x0f nibble
 * mask for filter_8bit, then as the high-half scratch again.  t0 is reused
 * as the filter_8bit temporary after the shuffles.  All of t0..t7 are
 * clobbered.
 */
329#define aria_sbox_8way(x0, x1, x2, x3,			\
330		       x4, x5, x6, x7,			\
331		       t0, t1, t2, t3,			\
332		       t4, t5, t6, t7)			\
333	vpxor t7, t7, t7;				\
334	vpxor t6, t6, t6;				\
335	vbroadcasti128 .Linv_shift_row(%rip), t0;	\
336	vbroadcasti128 .Lshift_row(%rip), t1;		\
337	vbroadcasti128 .Ltf_lo__inv_aff__and__s2(%rip), t2; \
338	vbroadcasti128 .Ltf_hi__inv_aff__and__s2(%rip), t3; \
339	vbroadcasti128 .Ltf_lo__x2__and__fwd_aff(%rip), t4; \
340	vbroadcasti128 .Ltf_hi__x2__and__fwd_aff(%rip), t5; \
341							\
342	vextracti128 $1, x0, t6##_x;			\
343	vaesenclast t7##_x, x0##_x, x0##_x;		\
344	vaesenclast t7##_x, t6##_x, t6##_x;		\
345	vinserti128 $1, t6##_x, x0, x0;			\
346							\
347	vextracti128 $1, x4, t6##_x;			\
348	vaesenclast t7##_x, x4##_x, x4##_x;		\
349	vaesenclast t7##_x, t6##_x, t6##_x;		\
350	vinserti128 $1, t6##_x, x4, x4;			\
351							\
352	vextracti128 $1, x1, t6##_x;			\
353	vaesenclast t7##_x, x1##_x, x1##_x;		\
354	vaesenclast t7##_x, t6##_x, t6##_x;		\
355	vinserti128 $1, t6##_x, x1, x1;			\
356							\
357	vextracti128 $1, x5, t6##_x;			\
358	vaesenclast t7##_x, x5##_x, x5##_x;		\
359	vaesenclast t7##_x, t6##_x, t6##_x;		\
360	vinserti128 $1, t6##_x, x5, x5;			\
361							\
362	vextracti128 $1, x2, t6##_x;			\
363	vaesdeclast t7##_x, x2##_x, x2##_x;		\
364	vaesdeclast t7##_x, t6##_x, t6##_x;		\
365	vinserti128 $1, t6##_x, x2, x2;			\
366							\
367	vextracti128 $1, x6, t6##_x;			\
368	vaesdeclast t7##_x, x6##_x, x6##_x;		\
369	vaesdeclast t7##_x, t6##_x, t6##_x;		\
370	vinserti128 $1, t6##_x, x6, x6;			\
371							\
372	vpbroadcastd .L0f0f0f0f(%rip), t6;		\
373							\
374	/* AES inverse shift rows */			\
375	vpshufb t0, x0, x0;				\
376	vpshufb t0, x4, x4;				\
377	vpshufb t0, x1, x1;				\
378	vpshufb t0, x5, x5;				\
379	vpshufb t1, x3, x3;				\
380	vpshufb t1, x7, x7;				\
381	vpshufb t1, x2, x2;				\
382	vpshufb t1, x6, x6;				\
383							\
384	/* affine transformation for S2 */		\
385	filter_8bit(x1, t2, t3, t6, t0);		\
386	/* affine transformation for S2 */		\
387	filter_8bit(x5, t2, t3, t6, t0);		\
388							\
389	/* affine transformation for X2 */		\
390	filter_8bit(x3, t4, t5, t6, t0);		\
391	/* affine transformation for X2 */		\
392	filter_8bit(x7, t4, t5, t6, t0);		\
393							\
394	vpxor t6, t6, t6;				\
395	vextracti128 $1, x3, t6##_x;			\
396	vaesdeclast t7##_x, x3##_x, x3##_x;		\
397	vaesdeclast t7##_x, t6##_x, t6##_x;		\
398	vinserti128 $1, t6##_x, x3, x3;			\
399							\
400	vextracti128 $1, x7, t6##_x;			\
401	vaesdeclast t7##_x, x7##_x, x7##_x;		\
402	vaesdeclast t7##_x, t6##_x, t6##_x;		\
403	vinserti128 $1, t6##_x, x7, x7;			\
404
/*
 * aria_diff_m(x0, x1, x2, x3, t0, t1, t2, t3): in-place XOR mix of four
 * registers.  Writing the inputs as a, b, c, d (= x0, x1, x2, x3), the
 * results are:
 *   x0 = a ^ b ^ c
 *   x1 = a ^ b ^ d
 *   x2 = a ^ c ^ d
 *   x3 = b ^ c ^ d
 * t0..t3 first receive the pairwise XORs d^a, a^b, b^c, c^d and are
 * clobbered.  The final vmovdqu is needed because x3 is still read as an
 * input while the x3 result is being accumulated in t3.
 */
405#define aria_diff_m(x0, x1, x2, x3,			\
406		    t0, t1, t2, t3)			\
407	/* T = rotr32(X, 8); */				\
408	/* X ^= T */					\
409	vpxor x0, x3, t0;				\
410	vpxor x1, x0, t1;				\
411	vpxor x2, x1, t2;				\
412	vpxor x3, x2, t3;				\
413	/* X = T ^ rotr(X, 16); */			\
414	vpxor t2, x0, x0;				\
415	vpxor x1, t3, t3;				\
416	vpxor t0, x2, x2;				\
417	vpxor t1, x3, x1;				\
418	vmovdqu t3, x3;
419
/*
 * aria_diff_word(x0..x7, y0..y7): XOR mix across four groups of four
 * registers.  In the inline comments t0 = x0..x3, t1 = x4..x7, t2 = y0..y3
 * and t3 = y4..y7; each step XORs the four registers of one group into the
 * matching registers of another, in this order:
 *   t1 ^= t2; t2 ^= t3; t0 ^= t1; t3 ^= t1; t2 ^= t0; t1 ^= t2.
 * Every step reads results of earlier steps, so the order is significant.
 * No scratch registers are used.
 */
420#define aria_diff_word(x0, x1, x2, x3,			\
421		       x4, x5, x6, x7,			\
422		       y0, y1, y2, y3,			\
423		       y4, y5, y6, y7)			\
424	/* t1 ^= t2; */					\
425	vpxor y0, x4, x4;				\
426	vpxor y1, x5, x5;				\
427	vpxor y2, x6, x6;				\
428	vpxor y3, x7, x7;				\
429							\
430	/* t2 ^= t3; */					\
431	vpxor y4, y0, y0;				\
432	vpxor y5, y1, y1;				\
433	vpxor y6, y2, y2;				\
434	vpxor y7, y3, y3;				\
435							\
436	/* t0 ^= t1; */					\
437	vpxor x4, x0, x0;				\
438	vpxor x5, x1, x1;				\
439	vpxor x6, x2, x2;				\
440	vpxor x7, x3, x3;				\
441							\
442	/* t3 ^= t1; */					\
443	vpxor x4, y4, y4;				\
444	vpxor x5, y5, y5;				\
445	vpxor x6, y6, y6;				\
446	vpxor x7, y7, y7;				\
447							\
448	/* t2 ^= t0; */					\
449	vpxor x0, y0, y0;				\
450	vpxor x1, y1, y1;				\
451	vpxor x2, y2, y2;				\
452	vpxor x3, y3, y3;				\
453							\
454	/* t1 ^= t2; */					\
455	vpxor y0, x4, x4;				\
456	vpxor y1, x5, x5;				\
457	vpxor y2, x6, x6;				\
458	vpxor y3, x7, x7;
459
/*
 * aria_fe(x0..x7, y0..y7, mem_tmp, rk, round): one full round, AES-NI
 * substitution variant, with the S-box inputs permuted (x2, x3, x0, x1,
 * x6, x7, x4, x5).
 *
 * On entry x0..x7 hold the eight state vectors that take round-key bytes
 * 8..15, and mem_tmp slots 0..7 hold the eight that take bytes 0..7.
 * y0..y7 are scratch until the second half has been processed.
 *
 * Steps:
 *   1. key-add (bytes 8..15), substitute, aria_diff_m on both groups of
 *      four, store to mem_tmp slots 8..15;
 *   2. load slots 0..7, key-add (bytes 0..7), substitute, aria_diff_m,
 *      store back to slots 0..7;
 *   3. reload slots 8..15 into y0..y7 and run aria_diff_word;
 *   4. run aria_diff_word again with permuted register arguments (the
 *      byte-level permutation described in the inline comment is done by
 *      renaming registers, not by moving data);
 *   5. store x3, x2, x1, x0, x6, x7, x4, x5 to mem_tmp slots 0..7.
 * The sixteen registers still hold the round output afterwards; callers
 * account for the register permutation in the argument order of the next
 * round macro.
 */
460#define aria_fe(x0, x1, x2, x3,				\
461		x4, x5, x6, x7,				\
462		y0, y1, y2, y3,				\
463		y4, y5, y6, y7,				\
464		mem_tmp, rk, round)			\
465	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
466		      y0, rk, 8, round);		\
467							\
468	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
469		       y0, y1, y2, y3, y4, y5, y6, y7);	\
470							\
471	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
472	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
473	aria_store_state_8way(x0, x1, x2, x3,		\
474			      x4, x5, x6, x7,		\
475			      mem_tmp, 8);		\
476							\
477	aria_load_state_8way(x0, x1, x2, x3,		\
478			     x4, x5, x6, x7,		\
479			     mem_tmp, 0);		\
480	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
481		      y0, rk, 0, round);		\
482							\
483	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
484		       y0, y1, y2, y3, y4, y5, y6, y7);	\
485							\
486	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
487	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
488	aria_store_state_8way(x0, x1, x2, x3,		\
489			      x4, x5, x6, x7,		\
490			      mem_tmp, 0);		\
491	aria_load_state_8way(y0, y1, y2, y3,		\
492			     y4, y5, y6, y7,		\
493			     mem_tmp, 8);		\
494	aria_diff_word(x0, x1, x2, x3,			\
495		       x4, x5, x6, x7,			\
496		       y0, y1, y2, y3,			\
497		       y4, y5, y6, y7);			\
498	/* aria_diff_byte()				\
499	 * T3 = ABCD -> BADC				\
500	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
501	 * T0 = ABCD -> CDAB				\
502	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
503	 * T1 = ABCD -> DCBA				\
504	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
505	 */						\
506	aria_diff_word(x2, x3, x0, x1,			\
507		       x7, x6, x5, x4,			\
508		       y0, y1, y2, y3,			\
509		       y5, y4, y7, y6);			\
510	aria_store_state_8way(x3, x2, x1, x0,		\
511			      x6, x7, x4, x5,		\
512			      mem_tmp, 0);
513
/*
 * aria_fo(x0..x7, y0..y7, mem_tmp, rk, round): one full round, AES-NI
 * substitution variant, with the S-box inputs in natural order (x0..x7).
 *
 * On entry x0..x7 hold the eight state vectors that take round-key bytes
 * 8..15, and mem_tmp slots 0..7 hold the eight that take bytes 0..7.
 * y0..y7 are scratch until the second half has been processed.
 *
 * Steps:
 *   1. key-add (bytes 8..15), substitute, aria_diff_m on both groups of
 *      four, store to mem_tmp slots 8..15;
 *   2. load slots 0..7, key-add (bytes 0..7), substitute, aria_diff_m,
 *      store back to slots 0..7;
 *   3. reload slots 8..15 into y0..y7 and run aria_diff_word;
 *   4. run aria_diff_word again with permuted register arguments (the
 *      byte-level permutation described in the inline comment is done by
 *      renaming registers, not by moving data);
 *   5. store x3, x2, x1, x0, x6, x7, x4, x5 to mem_tmp slots 0..7.
 * The sixteen registers still hold the round output afterwards; callers
 * account for the register permutation in the argument order of the next
 * round macro.
 */
514#define aria_fo(x0, x1, x2, x3,				\
515		x4, x5, x6, x7,				\
516		y0, y1, y2, y3,				\
517		y4, y5, y6, y7,				\
518		mem_tmp, rk, round)			\
519	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
520		      y0, rk, 8, round);		\
521							\
522	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
523		       y0, y1, y2, y3, y4, y5, y6, y7);	\
524							\
525	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
526	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
527	aria_store_state_8way(x0, x1, x2, x3,		\
528			      x4, x5, x6, x7,		\
529			      mem_tmp, 8);		\
530							\
531	aria_load_state_8way(x0, x1, x2, x3,		\
532			     x4, x5, x6, x7,		\
533			     mem_tmp, 0);		\
534	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
535		      y0, rk, 0, round);		\
536							\
537	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
538		       y0, y1, y2, y3, y4, y5, y6, y7);	\
539							\
540	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
541	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
542	aria_store_state_8way(x0, x1, x2, x3,		\
543			      x4, x5, x6, x7,		\
544			      mem_tmp, 0);		\
545	aria_load_state_8way(y0, y1, y2, y3,		\
546			     y4, y5, y6, y7,		\
547			     mem_tmp, 8);		\
548	aria_diff_word(x0, x1, x2, x3,			\
549		       x4, x5, x6, x7,			\
550		       y0, y1, y2, y3,			\
551		       y4, y5, y6, y7);			\
552	/* aria_diff_byte()				\
553	 * T1 = ABCD -> BADC				\
554	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
555	 * T2 = ABCD -> CDAB				\
556	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
557	 * T3 = ABCD -> DCBA				\
558	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
559	 */						\
560	aria_diff_word(x0, x1, x2, x3,			\
561		       x5, x4, x7, x6,			\
562		       y2, y3, y0, y1,			\
563		       y7, y6, y5, y4);			\
564	aria_store_state_8way(x3, x2, x1, x0,		\
565			      x6, x7, x4, x5,		\
566			      mem_tmp, 0);
567
/*
 * aria_ff(x0..x7, y0..y7, mem_tmp, rk, round, last_round): final round,
 * AES-NI substitution variant.  It has no diffusion step: each half of the
 * state gets key-add with `round`, substitution (inputs permuted as in
 * aria_fe), then a second key-add with `last_round`.
 *
 * On entry x0..x7 hold the eight state vectors that take round-key bytes
 * 8..15 and mem_tmp slots 0..7 hold the other eight.  The first half
 * processed is parked in mem_tmp slots 8..15 and reloaded into y0..y7 at
 * the end, so on exit x0..x7 hold the result for key bytes 0..7 and
 * y0..y7 the result for key bytes 8..15.
 */
568#define aria_ff(x0, x1, x2, x3,				\
569		x4, x5, x6, x7,				\
570		y0, y1, y2, y3,				\
571		y4, y5, y6, y7,				\
572		mem_tmp, rk, round, last_round)		\
573	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
574		      y0, rk, 8, round);		\
575							\
576	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
577		       y0, y1, y2, y3, y4, y5, y6, y7);	\
578							\
579	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
580		      y0, rk, 8, last_round);		\
581							\
582	aria_store_state_8way(x0, x1, x2, x3,		\
583			      x4, x5, x6, x7,		\
584			      mem_tmp, 8);		\
585							\
586	aria_load_state_8way(x0, x1, x2, x3,		\
587			     x4, x5, x6, x7,		\
588			     mem_tmp, 0);		\
589	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
590		      y0, rk, 0, round);		\
591							\
592	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
593		       y0, y1, y2, y3, y4, y5, y6, y7);	\
594							\
595	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
596		      y0, rk, 0, last_round);		\
597							\
598	aria_load_state_8way(y0, y1, y2, y3,		\
599			     y4, y5, y6, y7,		\
600			     mem_tmp, 8);
601#ifdef CONFIG_AS_GFNI
/*
 * aria_fe_gfni(x0..x7, y0..y7, mem_tmp, rk, round): same sequence as
 * aria_fe, but the substitution step is aria_sbox_8way_gfni instead of
 * aria_sbox_8way (inputs permuted as x2, x3, x0, x1, x6, x7, x4, x5).
 * Entry/exit conventions for the registers and the mem_tmp slots match
 * aria_fe: x0..x7 carry the half keyed with bytes 8..15 on entry, slots
 * 0..7 the other half, and slots 0..7 are rewritten at the end in the order
 * x3, x2, x1, x0, x6, x7, x4, x5.
 */
602#define aria_fe_gfni(x0, x1, x2, x3,			\
603		     x4, x5, x6, x7,			\
604		     y0, y1, y2, y3,			\
605		     y4, y5, y6, y7,			\
606		     mem_tmp, rk, round)		\
607	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
608		      y0, rk, 8, round);		\
609							\
610	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
611			    x6, x7, x4, x5,		\
612			    y0, y1, y2, y3,		\
613			    y4, y5, y6, y7);		\
614							\
615	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
616	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
617	aria_store_state_8way(x0, x1, x2, x3,		\
618			      x4, x5, x6, x7,		\
619			      mem_tmp, 8);		\
620							\
621	aria_load_state_8way(x0, x1, x2, x3,		\
622			     x4, x5, x6, x7,		\
623			     mem_tmp, 0);		\
624	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
625		      y0, rk, 0, round);		\
626							\
627	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
628			    x6, x7, x4, x5,		\
629			    y0, y1, y2, y3,		\
630			    y4, y5, y6, y7);		\
631							\
632	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
633	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
634	aria_store_state_8way(x0, x1, x2, x3,		\
635			      x4, x5, x6, x7,		\
636			      mem_tmp, 0);		\
637	aria_load_state_8way(y0, y1, y2, y3,		\
638			     y4, y5, y6, y7,		\
639			     mem_tmp, 8);		\
640	aria_diff_word(x0, x1, x2, x3,			\
641		       x4, x5, x6, x7,			\
642		       y0, y1, y2, y3,			\
643		       y4, y5, y6, y7);			\
644	/* aria_diff_byte()				\
645	 * T3 = ABCD -> BADC				\
646	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
647	 * T0 = ABCD -> CDAB				\
648	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
649	 * T1 = ABCD -> DCBA				\
650	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
651	 */						\
652	aria_diff_word(x2, x3, x0, x1,			\
653		       x7, x6, x5, x4,			\
654		       y0, y1, y2, y3,			\
655		       y5, y4, y7, y6);			\
656	aria_store_state_8way(x3, x2, x1, x0,		\
657			      x6, x7, x4, x5,		\
658			      mem_tmp, 0);
659
/*
 * aria_fo_gfni(x0..x7, y0..y7, mem_tmp, rk, round): same sequence as
 * aria_fo, but the substitution step is aria_sbox_8way_gfni instead of
 * aria_sbox_8way (inputs in natural order x0..x7).  Entry/exit conventions
 * for the registers and the mem_tmp slots match aria_fo: x0..x7 carry the
 * half keyed with bytes 8..15 on entry, slots 0..7 the other half, and
 * slots 0..7 are rewritten at the end in the order x3, x2, x1, x0, x6, x7,
 * x4, x5.
 */
660#define aria_fo_gfni(x0, x1, x2, x3,			\
661		     x4, x5, x6, x7,			\
662		     y0, y1, y2, y3,			\
663		     y4, y5, y6, y7,			\
664		     mem_tmp, rk, round)		\
665	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
666		      y0, rk, 8, round);		\
667							\
668	aria_sbox_8way_gfni(x0, x1, x2, x3,		\
669			    x4, x5, x6, x7,		\
670			    y0, y1, y2, y3,		\
671			    y4, y5, y6, y7);		\
672							\
673	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
674	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
675	aria_store_state_8way(x0, x1, x2, x3,		\
676			      x4, x5, x6, x7,		\
677			      mem_tmp, 8);		\
678							\
679	aria_load_state_8way(x0, x1, x2, x3,		\
680			     x4, x5, x6, x7,		\
681			     mem_tmp, 0);		\
682	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
683		      y0, rk, 0, round);		\
684							\
685	aria_sbox_8way_gfni(x0, x1, x2, x3,		\
686			    x4, x5, x6, x7,		\
687			    y0, y1, y2, y3,		\
688			    y4, y5, y6, y7);		\
689							\
690	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
691	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
692	aria_store_state_8way(x0, x1, x2, x3,		\
693			      x4, x5, x6, x7,		\
694			      mem_tmp, 0);		\
695	aria_load_state_8way(y0, y1, y2, y3,		\
696			     y4, y5, y6, y7,		\
697			     mem_tmp, 8);		\
698	aria_diff_word(x0, x1, x2, x3,			\
699		       x4, x5, x6, x7,			\
700		       y0, y1, y2, y3,			\
701		       y4, y5, y6, y7);			\
702	/* aria_diff_byte()				\
703	 * T1 = ABCD -> BADC				\
704	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
705	 * T2 = ABCD -> CDAB				\
706	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
707	 * T3 = ABCD -> DCBA				\
708	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
709	 */						\
710	aria_diff_word(x0, x1, x2, x3,			\
711		       x5, x4, x7, x6,			\
712		       y2, y3, y0, y1,			\
713		       y7, y6, y5, y4);			\
714	aria_store_state_8way(x3, x2, x1, x0,		\
715			      x6, x7, x4, x5,		\
716			      mem_tmp, 0);
717
/*
 * aria_ff_gfni(x0..x7, y0..y7, mem_tmp, rk, round, last_round): same
 * sequence as aria_ff, but the substitution step is aria_sbox_8way_gfni
 * (inputs permuted as x2, x3, x0, x1, x6, x7, x4, x5).  Each half gets
 * key-add with `round`, substitution, key-add with `last_round`; there is
 * no diffusion step.  On exit x0..x7 hold the result for key bytes 0..7 and
 * y0..y7 (reloaded from mem_tmp slots 8..15) the result for key bytes
 * 8..15.
 */
718#define aria_ff_gfni(x0, x1, x2, x3,			\
719		x4, x5, x6, x7,				\
720		y0, y1, y2, y3,				\
721		y4, y5, y6, y7,				\
722		mem_tmp, rk, round, last_round)		\
723	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
724		      y0, rk, 8, round);		\
725							\
726	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
727			    x6, x7, x4, x5,		\
728			    y0, y1, y2, y3,		\
729			    y4, y5, y6, y7);		\
730							\
731	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
732		      y0, rk, 8, last_round);		\
733							\
734	aria_store_state_8way(x0, x1, x2, x3,		\
735			      x4, x5, x6, x7,		\
736			      mem_tmp, 8);		\
737							\
738	aria_load_state_8way(x0, x1, x2, x3,		\
739			     x4, x5, x6, x7,		\
740			     mem_tmp, 0);		\
741	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
742		      y0, rk, 0, round);		\
743							\
744	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
745			    x6, x7, x4, x5,		\
746			    y0, y1, y2, y3,		\
747			    y4, y5, y6, y7);		\
748							\
749	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
750		      y0, rk, 0, last_round);		\
751							\
752	aria_load_state_8way(y0, y1, y2, y3,		\
753			     y4, y5, y6, y7,		\
754			     mem_tmp, 8);
755#endif /* CONFIG_AS_GFNI */
756
/*
 * Byte shuffle pattern for byteslice_16x16b / debyteslice_16x16b.
 * SHUFB_BYTES(idx) expands to the four indices idx, 4 + idx, 8 + idx,
 * 12 + idx, so one 16-byte row is 0,4,8,12, 1,5,9,13, 2,6,10,14,
 * 3,7,11,15: it gathers byte `idx` of each of the four dwords in a 128-bit
 * lane.  The row is emitted twice (32 bytes); the macros above load it with
 * vbroadcasti128, which reads 16 bytes.
 */
757.section        .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
758.align 32
759#define SHUFB_BYTES(idx) \
760	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
761.Lshufb_16x16b:
762	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
763	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
764
/* 16-byte vpshufb masks; each is loaded with vbroadcasti128. */
765.section	.rodata.cst16, "aM", @progbits, 16
766.align 16
767/* For isolating SubBytes from AESENCLAST, inverse shift row */
768.Linv_shift_row:
769	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
770	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
/*
 * Shift-row mask: aria_sbox_8way applies it to the registers that go
 * through VAESDECLAST (after the instruction for x2/x6, before it for
 * x3/x7).
 */
771.Lshift_row:
772	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
773	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
774/* For CTR-mode IV byteswap: reverses the 16 bytes of a 128-bit lane */
775.Lbswap128_mask:
776	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
777	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
778
/*
 * Nibble lookup tables for filter_8bit.  Each transform is stored as a pair
 * of 16-byte tables: the "lo" table is indexed by the low nibble of the
 * input byte, the "hi" table by the high nibble, and filter_8bit XORs the
 * two lookups.  aria_sbox_8way loads them with vbroadcasti128.
 */
779/* AES inverse affine and S2 combined:
780 *      1 1 0 0 0 0 0 1     x0     0
781 *      0 1 0 0 1 0 0 0     x1     0
782 *      1 1 0 0 1 1 1 1     x2     0
783 *      0 1 1 0 1 0 0 1     x3     1
784 *      0 1 0 0 1 1 0 0  *  x4  +  0
785 *      0 1 0 1 1 0 0 0     x5     0
786 *      0 0 0 0 0 1 0 1     x6     0
787 *      1 1 1 0 0 1 1 1     x7     1
788 */
789.Ltf_lo__inv_aff__and__s2:
790	.octa 0x92172DA81A9FA520B2370D883ABF8500
791.Ltf_hi__inv_aff__and__s2:
792	.octa 0x2B15FFC1AF917B45E6D8320C625CB688
793
794/* X2 and AES forward affine combined:
795 *      1 0 1 1 0 0 0 1     x0     0
796 *      0 1 1 1 1 0 1 1     x1     0
797 *      0 0 0 1 1 0 1 0     x2     1
798 *      0 1 0 0 0 1 0 0     x3     0
799 *      0 0 1 1 1 0 1 1  *  x4  +  0
800 *      0 1 0 0 1 0 0 0     x5     0
801 *      1 1 0 1 0 0 1 1     x6     0
802 *      0 1 0 0 1 0 1 0     x7     0
803 */
804.Ltf_lo__x2__and__fwd_aff:
805	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
806.Ltf_hi__x2__and__fwd_aff:
807	.octa 0x3F893781E95FE1576CDA64D2BA0CB204
808
809#ifdef CONFIG_AS_GFNI
/*
 * Operands for aria_sbox_8way_gfni.  Each .quad is an 8x8 bit matrix built
 * from eight BV8() rows packed by BM8X8() (first row in the most significant
 * byte); it is loaded with vpbroadcastq.  Each tf_*_const is the 8-bit
 * immediate passed alongside its matrix to vgf2p8affineqb /
 * vgf2p8affineinvqb.  The identity matrix is used with immediate 0.
 */
810.section	.rodata.cst8, "aM", @progbits, 8
811.align 8
812/* AES affine: */
813#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
814.Ltf_aff_bitmatrix:
815	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
816		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
817		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
818		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
819		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
820		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
821		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
822		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
823
824/* AES inverse affine: */
825#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
826.Ltf_inv_bitmatrix:
827	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
828		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
829		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
830		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
831		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
832		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
833		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
834		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
835
836/* S2: */
837#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
838.Ltf_s2_bitmatrix:
839	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
840		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
841		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
842		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
843		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
844		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
845		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
846		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
847
848/* X2: */
849#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
850.Ltf_x2_bitmatrix:
851	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
852		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
853		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
854		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
855		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
856		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
857		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
858		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
859
860/* Identity matrix: */
861.Ltf_id_bitmatrix:
862	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
863		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
864		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
865		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
866		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
867		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
868		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
869		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
870
871#endif /* CONFIG_AS_GFNI */
872
873/* 4-bit mask: 0x0f in every byte; broadcast with vpbroadcastd and passed to filter_8bit as mask4bit */
874.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
875.align 4
876.L0f0f0f0f:
877	.long 0x0f0f0f0f
878
/* Everything below is code. */
879.text
880
/*
 * Core ARIA transform shared by the encrypt and decrypt entry points
 * (sixteen 32-byte vectors, i.e. the "32-way" of the file header).
 *
 * The function byte-slices the sixteen input registers itself
 * (inpack16_post) and uses the 16 * 32 bytes at dst as scratch for the
 * byte-sliced state and for the per-round spills (mem_tmp = %rax, slots
 * 0..15; %r8 = dst + 8 * 32).  It then runs alternating aria_fo / aria_fe
 * rounds and picks the final round from ARIA_CTX_rounds(CTX):
 *   12 rounds: aria_ff with round keys 11 and 12;
 *   14 rounds: two more rounds, then aria_ff with round keys 13 and 14
 *              (label .Laria_192 is the "not 12" path);
 *   otherwise: four more rounds, then aria_ff with round keys 15 and 16
 *              (label .Laria_256 is the "not 14" path).
 * debyteslice_16x16b finally undoes the byte-slicing, again spilling through
 * (%rax) and (%r8).
 *
 * Output: the result is left in the ymm registers, not written to memory;
 * %rax holds dst.  The callers in this file store the registers in the
 * order ymm1, ymm0, ymm3, ymm2, ymm4..ymm15.
 * Clobbers: %rax, %r8, all of %ymm0..%ymm15, and the 512 bytes at dst.
 * %rdx is listed as an input below but is not referenced in this function.
 */
881SYM_FUNC_START_LOCAL(__aria_aesni_avx2_crypt_32way)
882	/* input:
883	 *      %r9: rk
884	 *      %rsi: dst
885	 *      %rdx: src
886	 *      %ymm0..%ymm15: input vectors (byte-sliced by inpack16_post below);
	 *      %rdi (CTX) is also read, for ARIA_CTX_rounds
887	 */
888
889	FRAME_BEGIN
890
891	movq %rsi, %rax;
892	leaq 8 * 32(%rax), %r8;
893
894	inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
895		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
896		      %ymm15, %rax, %r8);
897	aria_fo(%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
898		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
899		%rax, %r9, 0);
900	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
901		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
902		%ymm15, %rax, %r9, 1);
903	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
904		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
905		%rax, %r9, 2);
906	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
907		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
908		%ymm15, %rax, %r9, 3);
909	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
910		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
911		%rax, %r9, 4);
912	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
913		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
914		%ymm15, %rax, %r9, 5);
915	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
916		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
917		%rax, %r9, 6);
918	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
919		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
920		%ymm15, %rax, %r9, 7);
921	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
922		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
923		%rax, %r9, 8);
924	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
925		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
926		%ymm15, %rax, %r9, 9);
927	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
928		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
929		%rax, %r9, 10);
930	cmpl $12, ARIA_CTX_rounds(CTX);
931	jne .Laria_192;
932	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
933		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
934		%ymm15, %rax, %r9, 11, 12);
935	jmp .Laria_end;
936.Laria_192:
937	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
938		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
939		%ymm15, %rax, %r9, 11);
940	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
941		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
942		%rax, %r9, 12);
943	cmpl $14, ARIA_CTX_rounds(CTX);
944	jne .Laria_256;
945	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
946		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
947		%ymm15, %rax, %r9, 13, 14);
948	jmp .Laria_end;
949.Laria_256:
950	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
951		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
952		%ymm15, %rax, %r9, 13);
953	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
954		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
955		%rax, %r9, 14);
956	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
957		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
958		%ymm15, %rax, %r9, 15, 16);
959.Laria_end:
960	debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
961			   %ymm9, %ymm13, %ymm0, %ymm5,
962			   %ymm10, %ymm14, %ymm3, %ymm6,
963			   %ymm11, %ymm15, %ymm2, %ymm7,
964			   (%rax), (%r8));
965
966	FRAME_END
967	RET;
968SYM_FUNC_END(__aria_aesni_avx2_crypt_32way)
969
/*
 * aria_aesni_avx2_encrypt_32way - encryption entry point of the 32-way AVX2
 * implementation (16 ymm registers x 32 bytes = 32 blocks of 16 bytes).
 *
 * Selects the encryption round keys, loads the source data into
 * %ymm0..%ymm15 through inpack16_pre, runs the shared core routine
 * __aria_aesni_avx2_crypt_32way and hands the registers to write_output.
 */
970SYM_TYPED_FUNC_START(aria_aesni_avx2_encrypt_32way)
971	/* input:
972	 *      %rdi: ctx, CTX
973	 *      %rsi: dst
974	 *      %rdx: src
975	 */
976
977	FRAME_BEGIN
978
	/* %r9 = address of the encryption round keys inside the context */
979	leaq ARIA_CTX_enc_key(CTX), %r9;
980
	/* load the input addressed by %rdx (src) into %ymm0..%ymm15 */
981	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
982		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
983		     %ymm15, %rdx);
984
985	call __aria_aesni_avx2_crypt_32way;
986
	/*
	 * Note the register order: %ymm1/%ymm0 and %ymm3/%ymm2 are swapped
	 * relative to the load above.
	 * NOTE(review): %rax is presumably left pointing at dst by the core
	 * routine (its prologue is not in this part of the file) - confirm.
	 */
987	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
988		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
989		     %ymm15, %rax);
990
991	FRAME_END
992	RET;
993SYM_FUNC_END(aria_aesni_avx2_encrypt_32way)
994
/*
 * aria_aesni_avx2_decrypt_32way - decryption entry point of the 32-way AVX2
 * implementation.
 *
 * Instruction-for-instruction the same as the encrypt entry point except
 * that %r9 is pointed at the decryption round keys (ARIA_CTX_dec_key); the
 * same core routine __aria_aesni_avx2_crypt_32way is used.
 */
995SYM_TYPED_FUNC_START(aria_aesni_avx2_decrypt_32way)
996	/* input:
997	 *      %rdi: ctx, CTX
998	 *      %rsi: dst
999	 *      %rdx: src
1000	 */
1001
1002	FRAME_BEGIN
1003
	/* %r9 = address of the decryption round keys inside the context */
1004	leaq ARIA_CTX_dec_key(CTX), %r9;
1005
	/* load the input addressed by %rdx (src) into %ymm0..%ymm15 */
1006	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
1007		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1008		     %ymm15, %rdx);
1009
1010	call __aria_aesni_avx2_crypt_32way;
1011
	/*
	 * Register order has %ymm1/%ymm0 and %ymm3/%ymm2 swapped relative to
	 * the load above.
	 * NOTE(review): %rax is presumably left pointing at dst by the core
	 * routine (its prologue is not in this part of the file) - confirm.
	 */
1012	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
1013		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1014		     %ymm15, %rax);
1015
1016	FRAME_END
1017	RET;
1018SYM_FUNC_END(aria_aesni_avx2_decrypt_32way)
1019
/*
 * __aria_aesni_avx2_ctr_gen_keystream_32way - build 32 consecutive CTR
 * counter blocks (IV+0 .. IV+31) and advance the IV by 32.
 *
 * On return:
 *   - counter blocks +0..+15 are stored in the 8 * 32 bytes at (%rcx) and
 *     are also loaded in %ymm0..%ymm7,
 *   - counter blocks +16..+31 are in %ymm8..%ymm15,
 *   - the 16 bytes at (%r8) hold IV+32, in the same byte order as the input.
 *
 * %rdi, %rsi and %rdx are not touched here. %r11 and all of
 * %ymm0..%ymm15 are overwritten.
 *
 * Two code paths produce the same layout: a fast path using plain 64-bit
 * adds when the low qword cannot wrap, and a slow path that propagates the
 * carry into the high qword with inc_le128.
 */
1020SYM_FUNC_START_LOCAL(__aria_aesni_avx2_ctr_gen_keystream_32way)
1021	/* input:
1022	 *      %rdi: ctx
1023	 *      %rsi: dst
1024	 *      %rdx: src
1025	 *      %rcx: keystream
1026	 *      %r8: iv (big endian, 128bit)
1027	 */
1028
1029	FRAME_BEGIN
	/* %r11 = last 8 bytes of the IV, byte-swapped to native order */
1030	movq 8(%r8), %r11;
1031	bswapq %r11;
1032
	/*
	 * %ymm6 = .Lbswap128_mask replicated into both 128-bit lanes; used
	 * with vpshufb to convert between the stored byte order and the
	 * order the arithmetic below works on.
	 * %ymm0 = per lane {low qword -1, high qword 0}: the "minus one"
	 * operand of inc_le128.
	 * %ymm5 = %ymm0 + %ymm0: subtracting it adds 2 to each low qword.
	 */
1033	vbroadcasti128 .Lbswap128_mask (%rip), %ymm6;
1034	vpcmpeqd %ymm0, %ymm0, %ymm0;
1035	vpsrldq $8, %ymm0, %ymm0;   /* ab: -1:0 ; cd: -1:0 */
1036	vpaddq %ymm0, %ymm0, %ymm5; /* ab: -2:0 ; cd: -2:0 */
1037
1038	/* load IV and byteswap */
1039	vmovdqu (%r8), %xmm7;
1040	vpshufb %xmm6, %xmm7, %xmm7;
1041	vmovdqa %xmm7, %xmm3;
	/*
	 * Build %ymm3 = {high lane: IV+1, low lane: IV+0}. The +1 is done
	 * with the carry-aware increment, so this pair is valid on both
	 * paths. %ymm8 is the pair converted back to the stored byte order.
	 */
1042	inc_le128(%xmm7, %xmm0, %xmm4);
1043	vinserti128 $1, %xmm7, %ymm3, %ymm3;
1044	vpshufb %ymm6, %ymm3, %ymm8; /* +1 ; +0 */
1045
1046	/* check need for handling 64-bit overflow and carry */
	/* taken when low qword > 2^64 - 1 - 32, i.e. adding 32 would wrap */
1047	cmpq $(0xffffffffffffffff - 32), %r11;
1048	ja .Lhandle_ctr_carry;
1049
1050	/* construct IVs */
	/*
	 * Fast path: every step adds 2 to the low qword of both lanes; no
	 * carry into the high qword is possible here.
	 */
1051	vpsubq %ymm5, %ymm3, %ymm3; /* +3 ; +2 */
1052	vpshufb %ymm6, %ymm3, %ymm9;
1053	vpsubq %ymm5, %ymm3, %ymm3; /* +5 ; +4 */
1054	vpshufb %ymm6, %ymm3, %ymm10;
1055	vpsubq %ymm5, %ymm3, %ymm3; /* +7 ; +6 */
1056	vpshufb %ymm6, %ymm3, %ymm11;
1057	vpsubq %ymm5, %ymm3, %ymm3; /* +9 ; +8 */
1058	vpshufb %ymm6, %ymm3, %ymm12;
1059	vpsubq %ymm5, %ymm3, %ymm3; /* +11 ; +10 */
1060	vpshufb %ymm6, %ymm3, %ymm13;
1061	vpsubq %ymm5, %ymm3, %ymm3; /* +13 ; +12 */
1062	vpshufb %ymm6, %ymm3, %ymm14;
1063	vpsubq %ymm5, %ymm3, %ymm3; /* +15 ; +14 */
1064	vpshufb %ymm6, %ymm3, %ymm15;
	/*
	 * Park counters +0..+15 in the keystream buffer so %ymm8..%ymm15
	 * can be reused for counters +16..+31.
	 */
1065	vmovdqu %ymm8, (0 * 32)(%rcx);
1066	vmovdqu %ymm9, (1 * 32)(%rcx);
1067	vmovdqu %ymm10, (2 * 32)(%rcx);
1068	vmovdqu %ymm11, (3 * 32)(%rcx);
1069	vmovdqu %ymm12, (4 * 32)(%rcx);
1070	vmovdqu %ymm13, (5 * 32)(%rcx);
1071	vmovdqu %ymm14, (6 * 32)(%rcx);
1072	vmovdqu %ymm15, (7 * 32)(%rcx);
1073
1074	vpsubq %ymm5, %ymm3, %ymm3; /* +17 ; +16 */
1075	vpshufb %ymm6, %ymm3, %ymm8;
1076	vpsubq %ymm5, %ymm3, %ymm3; /* +19 ; +18 */
1077	vpshufb %ymm6, %ymm3, %ymm9;
1078	vpsubq %ymm5, %ymm3, %ymm3; /* +21 ; +20 */
1079	vpshufb %ymm6, %ymm3, %ymm10;
1080	vpsubq %ymm5, %ymm3, %ymm3; /* +23 ; +22 */
1081	vpshufb %ymm6, %ymm3, %ymm11;
1082	vpsubq %ymm5, %ymm3, %ymm3; /* +25 ; +24 */
1083	vpshufb %ymm6, %ymm3, %ymm12;
1084	vpsubq %ymm5, %ymm3, %ymm3; /* +27 ; +26 */
1085	vpshufb %ymm6, %ymm3, %ymm13;
1086	vpsubq %ymm5, %ymm3, %ymm3; /* +29 ; +28 */
1087	vpshufb %ymm6, %ymm3, %ymm14;
1088	vpsubq %ymm5, %ymm3, %ymm3; /* +31 ; +30 */
1089	vpshufb %ymm6, %ymm3, %ymm15;
	/*
	 * One more +2 step leaves +32 in the low lane; only that lane
	 * (%xmm3) is byte-swapped and written back as the next IV.
	 */
1090	vpsubq %ymm5, %ymm3, %ymm3; /* +32 */
1091	vpshufb %xmm6, %xmm3, %xmm3;
1092	vmovdqu %xmm3, (%r8);
	/* reload counters +0..+15 into %ymm0..%ymm7 */
1093	vmovdqu (0 * 32)(%rcx), %ymm0;
1094	vmovdqu (1 * 32)(%rcx), %ymm1;
1095	vmovdqu (2 * 32)(%rcx), %ymm2;
1096	vmovdqu (3 * 32)(%rcx), %ymm3;
1097	vmovdqu (4 * 32)(%rcx), %ymm4;
1098	vmovdqu (5 * 32)(%rcx), %ymm5;
1099	vmovdqu (6 * 32)(%rcx), %ymm6;
1100	vmovdqu (7 * 32)(%rcx), %ymm7;
1101	jmp .Lctr_carry_done;
1102
1103	.Lhandle_ctr_carry:
1104	/* construct IVs */
	/*
	 * Slow path: each inc_le128 adds 1 to both lanes and carries from
	 * the low into the high qword when needed, so two invocations are
	 * used per +2 step. %ymm4 is the macro's scratch register.
	 */
1105	inc_le128(%ymm3, %ymm0, %ymm4);
1106	inc_le128(%ymm3, %ymm0, %ymm4);
1107	vpshufb %ymm6, %ymm3, %ymm9; /* +3 ; +2 */
1108	inc_le128(%ymm3, %ymm0, %ymm4);
1109	inc_le128(%ymm3, %ymm0, %ymm4);
1110	vpshufb %ymm6, %ymm3, %ymm10; /* +5 ; +4 */
1111	inc_le128(%ymm3, %ymm0, %ymm4);
1112	inc_le128(%ymm3, %ymm0, %ymm4);
1113	vpshufb %ymm6, %ymm3, %ymm11; /* +7 ; +6 */
1114	inc_le128(%ymm3, %ymm0, %ymm4);
1115	inc_le128(%ymm3, %ymm0, %ymm4);
1116	vpshufb %ymm6, %ymm3, %ymm12; /* +9 ; +8 */
1117	inc_le128(%ymm3, %ymm0, %ymm4);
1118	inc_le128(%ymm3, %ymm0, %ymm4);
1119	vpshufb %ymm6, %ymm3, %ymm13; /* +11 ; +10 */
1120	inc_le128(%ymm3, %ymm0, %ymm4);
1121	inc_le128(%ymm3, %ymm0, %ymm4);
1122	vpshufb %ymm6, %ymm3, %ymm14; /* +13 ; +12 */
1123	inc_le128(%ymm3, %ymm0, %ymm4);
1124	inc_le128(%ymm3, %ymm0, %ymm4);
1125	vpshufb %ymm6, %ymm3, %ymm15; /* +15 ; +14 */
	/* park counters +0..+15 in the keystream buffer, as on the fast path */
1126	vmovdqu %ymm8, (0 * 32)(%rcx);
1127	vmovdqu %ymm9, (1 * 32)(%rcx);
1128	vmovdqu %ymm10, (2 * 32)(%rcx);
1129	vmovdqu %ymm11, (3 * 32)(%rcx);
1130	vmovdqu %ymm12, (4 * 32)(%rcx);
1131	vmovdqu %ymm13, (5 * 32)(%rcx);
1132	vmovdqu %ymm14, (6 * 32)(%rcx);
1133	vmovdqu %ymm15, (7 * 32)(%rcx);
1134
1135	inc_le128(%ymm3, %ymm0, %ymm4);
1136	inc_le128(%ymm3, %ymm0, %ymm4);
1137	vpshufb %ymm6, %ymm3, %ymm8; /* +17 ; +16 */
1138	inc_le128(%ymm3, %ymm0, %ymm4);
1139	inc_le128(%ymm3, %ymm0, %ymm4);
1140	vpshufb %ymm6, %ymm3, %ymm9; /* +19 ; +18 */
1141	inc_le128(%ymm3, %ymm0, %ymm4);
1142	inc_le128(%ymm3, %ymm0, %ymm4);
1143	vpshufb %ymm6, %ymm3, %ymm10; /* +21 ; +20 */
1144	inc_le128(%ymm3, %ymm0, %ymm4);
1145	inc_le128(%ymm3, %ymm0, %ymm4);
1146	vpshufb %ymm6, %ymm3, %ymm11; /* +23 ; +22 */
1147	inc_le128(%ymm3, %ymm0, %ymm4);
1148	inc_le128(%ymm3, %ymm0, %ymm4);
1149	vpshufb %ymm6, %ymm3, %ymm12; /* +25 ; +24 */
1150	inc_le128(%ymm3, %ymm0, %ymm4);
1151	inc_le128(%ymm3, %ymm0, %ymm4);
1152	vpshufb %ymm6, %ymm3, %ymm13; /* +27 ; +26 */
1153	inc_le128(%ymm3, %ymm0, %ymm4);
1154	inc_le128(%ymm3, %ymm0, %ymm4);
1155	vpshufb %ymm6, %ymm3, %ymm14; /* +29 ; +28 */
1156	inc_le128(%ymm3, %ymm0, %ymm4);
1157	inc_le128(%ymm3, %ymm0, %ymm4);
1158	vpshufb %ymm6, %ymm3, %ymm15; /* +31 ; +30 */
	/*
	 * A single increment turns {+31 ; +30} into {+32 ; +31}, so the
	 * next IV is taken from the high lane here (the fast path takes it
	 * from the low lane).
	 */
1159	inc_le128(%ymm3, %ymm0, %ymm4);
1160	vextracti128 $1, %ymm3, %xmm3;
1161	vpshufb %xmm6, %xmm3, %xmm3; /* +32 */
1162	vmovdqu %xmm3, (%r8);
	/* reload counters +0..+15 into %ymm0..%ymm7 */
1163	vmovdqu (0 * 32)(%rcx), %ymm0;
1164	vmovdqu (1 * 32)(%rcx), %ymm1;
1165	vmovdqu (2 * 32)(%rcx), %ymm2;
1166	vmovdqu (3 * 32)(%rcx), %ymm3;
1167	vmovdqu (4 * 32)(%rcx), %ymm4;
1168	vmovdqu (5 * 32)(%rcx), %ymm5;
1169	vmovdqu (6 * 32)(%rcx), %ymm6;
1170	vmovdqu (7 * 32)(%rcx), %ymm7;
1171
1172	.Lctr_carry_done:
1173
1174	FRAME_END
1175	RET;
1176SYM_FUNC_END(__aria_aesni_avx2_ctr_gen_keystream_32way)
1177
/*
 * aria_aesni_avx2_ctr_crypt_32way - CTR-mode entry point of the 32-way
 * AVX2 implementation.
 *
 * Generates 32 counter blocks from the IV (which is advanced by 32 in
 * place), runs them through the core routine with the encryption round
 * keys, XORs the result with 16 * 32 bytes read from src and passes the
 * registers to write_output with the original dst pointer.
 */
1178SYM_TYPED_FUNC_START(aria_aesni_avx2_ctr_crypt_32way)
1179	/* input:
1180	 *      %rdi: ctx
1181	 *      %rsi: dst
1182	 *      %rdx: src
1183	 *      %rcx: keystream
1184	 *      %r8: iv (big endian, 128bit)
1185	 */
1186	FRAME_BEGIN
1187
	/* counter blocks -> %ymm0..%ymm15, IV at (%r8) advanced by 32 */
1188	call __aria_aesni_avx2_ctr_gen_keystream_32way;
1189
	/*
	 * Keep the caller's dst/src in %r10/%r11 and point both %rsi and
	 * %rdx at the keystream buffer for the core routine.
	 * NOTE(review): this relies on the core routine leaving %r10 and
	 * %r11 intact; its macros are defined elsewhere - confirm.
	 */
1190	leaq (%rsi), %r10;
1191	leaq (%rdx), %r11;
1192	leaq (%rcx), %rsi;
1193	leaq (%rcx), %rdx;
	/* the encryption round keys are the ones selected for CTR mode */
1194	leaq ARIA_CTX_enc_key(CTX), %r9;
1195
1196	call __aria_aesni_avx2_crypt_32way;
1197
	/*
	 * XOR with the source data. The register order (%ymm1, %ymm0,
	 * %ymm3, %ymm2, %ymm4..%ymm15) is the same one handed to
	 * write_output below.
	 */
1198	vpxor (0 * 32)(%r11), %ymm1, %ymm1;
1199	vpxor (1 * 32)(%r11), %ymm0, %ymm0;
1200	vpxor (2 * 32)(%r11), %ymm3, %ymm3;
1201	vpxor (3 * 32)(%r11), %ymm2, %ymm2;
1202	vpxor (4 * 32)(%r11), %ymm4, %ymm4;
1203	vpxor (5 * 32)(%r11), %ymm5, %ymm5;
1204	vpxor (6 * 32)(%r11), %ymm6, %ymm6;
1205	vpxor (7 * 32)(%r11), %ymm7, %ymm7;
1206	vpxor (8 * 32)(%r11), %ymm8, %ymm8;
1207	vpxor (9 * 32)(%r11), %ymm9, %ymm9;
1208	vpxor (10 * 32)(%r11), %ymm10, %ymm10;
1209	vpxor (11 * 32)(%r11), %ymm11, %ymm11;
1210	vpxor (12 * 32)(%r11), %ymm12, %ymm12;
1211	vpxor (13 * 32)(%r11), %ymm13, %ymm13;
1212	vpxor (14 * 32)(%r11), %ymm14, %ymm14;
1213	vpxor (15 * 32)(%r11), %ymm15, %ymm15;
	/* %r10 = the dst pointer saved above */
1214	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
1215		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1216		     %ymm15, %r10);
1217
1218	FRAME_END
1219	RET;
1220SYM_FUNC_END(aria_aesni_avx2_ctr_crypt_32way)
1221
1222#ifdef CONFIG_AS_GFNI
/*
 * __aria_aesni_avx2_gfni_crypt_32way - shared round sequence for the GFNI
 * variants of the 32-way entry points.
 *
 * Besides the registers listed in the input comment below, the routine
 * reads the round count from the context through CTX (%rdi). %rdx is not
 * referenced directly here.
 *
 * Sequence: inpack16_post, then alternating aria_fo_gfni / aria_fe_gfni
 * invocations with round indices 0..10, followed by a tail chosen by
 * ARIA_CTX_rounds(CTX):
 *   12      -> aria_ff_gfni(11, 12)
 *   14      -> fe(11), fo(12), aria_ff_gfni(13, 14)
 *   other   -> fe(11), fo(12), fe(13), fo(14), aria_ff_gfni(15, 16)
 * and finally debyteslice_16x16b.
 *
 * %rax is set to dst and %r8 to dst + 8 * 32; both are passed to the
 * macros as memory operands and are left with those values on return.
 */
1223SYM_FUNC_START_LOCAL(__aria_aesni_avx2_gfni_crypt_32way)
1224	/* input:
1225	 *      %r9: rk
1226	 *      %rsi: dst
1227	 *      %rdx: src
1228	 *      %ymm0..%ymm15: 16 byte-sliced blocks
1229	 */
1230
1231	FRAME_BEGIN
1232
	/* %rax = dst, %r8 = dst + 256 bytes */
1233	movq %rsi, %rax;
1234	leaq 8 * 32(%rax), %r8;
1235
1236	inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3,
1237		      %ymm4, %ymm5, %ymm6, %ymm7,
1238		      %ymm8, %ymm9, %ymm10, %ymm11,
1239		      %ymm12, %ymm13, %ymm14,
1240		      %ymm15, %rax, %r8);
	/*
	 * The first invocation passes %ymm8..%ymm11 in natural order; all
	 * later ones pass the pairs swapped (%ymm9, %ymm8, %ymm11, %ymm10 /
	 * %ymm1, %ymm0, %ymm3, %ymm2). The last macro argument is the round
	 * index.
	 */
1241	aria_fo_gfni(%ymm8, %ymm9, %ymm10, %ymm11,
1242		     %ymm12, %ymm13, %ymm14, %ymm15,
1243		     %ymm0, %ymm1, %ymm2, %ymm3,
1244		     %ymm4, %ymm5, %ymm6, %ymm7,
1245		     %rax, %r9, 0);
1246	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
1247		     %ymm4, %ymm5, %ymm6, %ymm7,
1248		     %ymm8, %ymm9, %ymm10, %ymm11,
1249		     %ymm12, %ymm13, %ymm14,
1250		     %ymm15, %rax, %r9, 1);
1251	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
1252		     %ymm12, %ymm13, %ymm14, %ymm15,
1253		     %ymm0, %ymm1, %ymm2, %ymm3,
1254		     %ymm4, %ymm5, %ymm6, %ymm7,
1255		     %rax, %r9, 2);
1256	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
1257		     %ymm4, %ymm5, %ymm6, %ymm7,
1258		     %ymm8, %ymm9, %ymm10, %ymm11,
1259		     %ymm12, %ymm13, %ymm14,
1260		     %ymm15, %rax, %r9, 3);
1261	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
1262		     %ymm12, %ymm13, %ymm14, %ymm15,
1263		     %ymm0, %ymm1, %ymm2, %ymm3,
1264		     %ymm4, %ymm5, %ymm6, %ymm7,
1265		     %rax, %r9, 4);
1266	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
1267		     %ymm4, %ymm5, %ymm6, %ymm7,
1268		     %ymm8, %ymm9, %ymm10, %ymm11,
1269		     %ymm12, %ymm13, %ymm14,
1270		     %ymm15, %rax, %r9, 5);
1271	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
1272		     %ymm12, %ymm13, %ymm14, %ymm15,
1273		     %ymm0, %ymm1, %ymm2, %ymm3,
1274		     %ymm4, %ymm5, %ymm6, %ymm7,
1275		     %rax, %r9, 6);
1276	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
1277		     %ymm4, %ymm5, %ymm6, %ymm7,
1278		     %ymm8, %ymm9, %ymm10, %ymm11,
1279		     %ymm12, %ymm13, %ymm14,
1280		     %ymm15, %rax, %r9, 7);
1281	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
1282		     %ymm12, %ymm13, %ymm14, %ymm15,
1283		     %ymm0, %ymm1, %ymm2, %ymm3,
1284		     %ymm4, %ymm5, %ymm6, %ymm7,
1285		     %rax, %r9, 8);
1286	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
1287		     %ymm4, %ymm5, %ymm6, %ymm7,
1288		     %ymm8, %ymm9, %ymm10, %ymm11,
1289		     %ymm12, %ymm13, %ymm14,
1290		     %ymm15, %rax, %r9, 9);
1291	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
1292		     %ymm12, %ymm13, %ymm14, %ymm15,
1293		     %ymm0, %ymm1, %ymm2, %ymm3,
1294		     %ymm4, %ymm5, %ymm6, %ymm7,
1295		     %rax, %r9, 10);
	/* 12 rounds: finish with round keys 11 and 12 */
1296	cmpl $12, ARIA_CTX_rounds(CTX);
1297	jne .Laria_gfni_192;
1298	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
1299		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1300		%ymm15, %rax, %r9, 11, 12);
1301	jmp .Laria_gfni_end;
1302.Laria_gfni_192:
1303	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
1304		     %ymm4, %ymm5, %ymm6, %ymm7,
1305		     %ymm8, %ymm9, %ymm10, %ymm11,
1306		     %ymm12, %ymm13, %ymm14,
1307		     %ymm15, %rax, %r9, 11);
1308	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
1309		     %ymm12, %ymm13, %ymm14, %ymm15,
1310		     %ymm0, %ymm1, %ymm2, %ymm3,
1311		     %ymm4, %ymm5, %ymm6, %ymm7,
1312		     %rax, %r9, 12);
	/* 14 rounds: finish with round keys 13 and 14 */
1313	cmpl $14, ARIA_CTX_rounds(CTX);
1314	jne .Laria_gfni_256;
1315	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
1316		     %ymm4, %ymm5, %ymm6, %ymm7,
1317		     %ymm8, %ymm9, %ymm10, %ymm11,
1318		     %ymm12, %ymm13, %ymm14,
1319		     %ymm15, %rax, %r9, 13, 14);
1320	jmp .Laria_gfni_end;
	/* any other round count ends up here and finishes with keys 15, 16 */
1321.Laria_gfni_256:
1322	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
1323		     %ymm4, %ymm5, %ymm6, %ymm7,
1324		     %ymm8, %ymm9, %ymm10, %ymm11,
1325		     %ymm12, %ymm13, %ymm14,
1326		     %ymm15, %rax, %r9, 13);
1327	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
1328		     %ymm12, %ymm13, %ymm14, %ymm15,
1329		     %ymm0, %ymm1, %ymm2, %ymm3,
1330		     %ymm4, %ymm5, %ymm6, %ymm7,
1331		     %rax, %r9, 14);
1332	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
1333		     %ymm4, %ymm5, %ymm6, %ymm7,
1334		     %ymm8, %ymm9, %ymm10, %ymm11,
1335		     %ymm12, %ymm13, %ymm14,
1336		     %ymm15, %rax, %r9, 15, 16);
1337.Laria_gfni_end:
	/* uses (%rax) and (%r8), i.e. dst and dst + 256, as memory operands */
1338	debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
1339			   %ymm9, %ymm13, %ymm0, %ymm5,
1340			   %ymm10, %ymm14, %ymm3, %ymm6,
1341			   %ymm11, %ymm15, %ymm2, %ymm7,
1342			   (%rax), (%r8));
1343
1344	FRAME_END
1345	RET;
1346SYM_FUNC_END(__aria_aesni_avx2_gfni_crypt_32way)
1347
/*
 * aria_aesni_avx2_gfni_encrypt_32way - encryption entry point of the GFNI
 * variant (only assembled when CONFIG_AS_GFNI is defined).
 *
 * Same structure as the non-GFNI encrypt entry point, but the round
 * sequence is run by __aria_aesni_avx2_gfni_crypt_32way.
 */
1347SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_encrypt_32way)
1348	/* input:
1349	 *      %rdi: ctx, CTX
1350	 *      %rsi: dst
1351	 *      %rdx: src
1352	 */
1353
1354	FRAME_BEGIN
1355
	/* %r9 = address of the encryption round keys inside the context */
1356	leaq ARIA_CTX_enc_key(CTX), %r9;
1357
	/* load the input addressed by %rdx (src) into %ymm0..%ymm15 */
1358	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
1359		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1360		     %ymm15, %rdx);
1361
1362	call __aria_aesni_avx2_gfni_crypt_32way;
1363
	/* the core routine sets %rax = %rsi (dst) before running the rounds */
1364	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
1365		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1366		     %ymm15, %rax);
1367
1368	FRAME_END
1369	RET;
1370SYM_FUNC_END(aria_aesni_avx2_gfni_encrypt_32way)
1372
/*
 * aria_aesni_avx2_gfni_decrypt_32way - decryption entry point of the GFNI
 * variant (only assembled when CONFIG_AS_GFNI is defined).
 *
 * Differs from the GFNI encrypt entry point only in pointing %r9 at the
 * decryption round keys (ARIA_CTX_dec_key).
 */
1372SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_decrypt_32way)
1373	/* input:
1374	 *      %rdi: ctx, CTX
1375	 *      %rsi: dst
1376	 *      %rdx: src
1377	 */
1378
1379	FRAME_BEGIN
1380
	/* %r9 = address of the decryption round keys inside the context */
1381	leaq ARIA_CTX_dec_key(CTX), %r9;
1382
	/* load the input addressed by %rdx (src) into %ymm0..%ymm15 */
1383	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
1384		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1385		     %ymm15, %rdx);
1386
1387	call __aria_aesni_avx2_gfni_crypt_32way;
1388
	/* the core routine sets %rax = %rsi (dst) before running the rounds */
1389	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
1390		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1391		     %ymm15, %rax);
1392
1393	FRAME_END
1394	RET;
1395SYM_FUNC_END(aria_aesni_avx2_gfni_decrypt_32way)
1397
/*
 * aria_aesni_avx2_gfni_ctr_crypt_32way - CTR-mode entry point of the GFNI
 * variant (only assembled when CONFIG_AS_GFNI is defined).
 *
 * Uses the same counter-block generator as the non-GFNI CTR entry point,
 * runs the counter blocks through __aria_aesni_avx2_gfni_crypt_32way with
 * the encryption round keys, XORs the result with 16 * 32 bytes read from
 * src and passes the registers to write_output with the original dst.
 */
1397SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_ctr_crypt_32way)
1398	/* input:
1399	 *      %rdi: ctx
1400	 *      %rsi: dst
1401	 *      %rdx: src
1402	 *      %rcx: keystream
1403	 *      %r8: iv (big endian, 128bit)
1404	 */
1405	FRAME_BEGIN
1406
	/* counter blocks -> %ymm0..%ymm15, IV at (%r8) advanced by 32 */
1407	call __aria_aesni_avx2_ctr_gen_keystream_32way
1408
	/*
	 * Keep the caller's dst/src in %r10/%r11 and point both %rsi and
	 * %rdx at the keystream buffer for the core routine.
	 * NOTE(review): this relies on the macros used by the core routine
	 * leaving %r10 and %r11 intact; they are defined elsewhere - confirm.
	 */
1409	leaq (%rsi), %r10;
1410	leaq (%rdx), %r11;
1411	leaq (%rcx), %rsi;
1412	leaq (%rcx), %rdx;
	/* the encryption round keys are the ones selected for CTR mode */
1413	leaq ARIA_CTX_enc_key(CTX), %r9;
1414
1415	call __aria_aesni_avx2_gfni_crypt_32way;
1416
	/*
	 * XOR with the source data, in the same register order that is
	 * handed to write_output below (%ymm1, %ymm0, %ymm3, %ymm2, ...).
	 */
1417	vpxor (0 * 32)(%r11), %ymm1, %ymm1;
1418	vpxor (1 * 32)(%r11), %ymm0, %ymm0;
1419	vpxor (2 * 32)(%r11), %ymm3, %ymm3;
1420	vpxor (3 * 32)(%r11), %ymm2, %ymm2;
1421	vpxor (4 * 32)(%r11), %ymm4, %ymm4;
1422	vpxor (5 * 32)(%r11), %ymm5, %ymm5;
1423	vpxor (6 * 32)(%r11), %ymm6, %ymm6;
1424	vpxor (7 * 32)(%r11), %ymm7, %ymm7;
1425	vpxor (8 * 32)(%r11), %ymm8, %ymm8;
1426	vpxor (9 * 32)(%r11), %ymm9, %ymm9;
1427	vpxor (10 * 32)(%r11), %ymm10, %ymm10;
1428	vpxor (11 * 32)(%r11), %ymm11, %ymm11;
1429	vpxor (12 * 32)(%r11), %ymm12, %ymm12;
1430	vpxor (13 * 32)(%r11), %ymm13, %ymm13;
1431	vpxor (14 * 32)(%r11), %ymm14, %ymm14;
1432	vpxor (15 * 32)(%r11), %ymm15, %ymm15;
	/* %r10 = the dst pointer saved above */
1433	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
1434		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1435		     %ymm15, %r10);
1436
1437	FRAME_END
1438	RET;
1439SYM_FUNC_END(aria_aesni_avx2_gfni_ctr_crypt_32way)
1441#endif /* CONFIG_AS_GFNI */
1442