/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 32-way parallel algorithm (AVX2)
 *
 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/asm-offsets.h>
#include <linux/cfi_types.h>

/* register macros */
#define CTX %rdi

#define ymm0_x xmm0
#define ymm1_x xmm1
#define ymm2_x xmm2
#define ymm3_x xmm3
#define ymm4_x xmm4
#define ymm5_x xmm5
#define ymm6_x xmm6
#define ymm7_x xmm7
#define ymm8_x xmm8
#define ymm9_x xmm9
#define ymm10_x xmm10
#define ymm11_x xmm11
#define ymm12_x xmm12
#define ymm13_x xmm13
#define ymm14_x xmm14
#define ymm15_x xmm15

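/* BV8() packs eight bit values into one byte, LSB first: a0 lands in bit 0
 * and a7 in bit 7, so BV8(1, 0, 0, 0, 1, 1, 1, 1) == 0xf1.  BM8X8() packs
 * eight such row bytes into the 64-bit bit-matrix layout consumed by the
 * GFNI affine instructions below, with the first row (l0) in the most
 * significant byte.
 */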
#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)		\
	( (((a0) & 1) << 0) |				\
	  (((a1) & 1) << 1) |				\
	  (((a2) & 1) << 2) |				\
	  (((a3) & 1) << 3) |				\
	  (((a4) & 1) << 4) |				\
	  (((a5) & 1) << 5) |				\
	  (((a6) & 1) << 6) |				\
	  (((a7) & 1) << 7) )

#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)		\
	( ((l7) << (0 * 8)) |				\
	  ((l6) << (1 * 8)) |				\
	  ((l5) << (2 * 8)) |				\
	  ((l4) << (3 * 8)) |				\
	  ((l3) << (4 * 8)) |				\
	  ((l2) << (5 * 8)) |				\
	  ((l1) << (6 * 8)) |				\
	  ((l0) << (7 * 8)) )

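/* inc_le128() adds 1 to a 128-bit little-endian counter held in x.
 * minus_one is expected to hold -1 in the low 64 bits and 0 in the high
 * 64 bits of each lane: the vpsubq then increments only the low qword,
 * and the vpcmpeqq/vpslldq pair turns a wrap of the low qword into a +1
 * carry on the high qword.
 */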
#define inc_le128(x, minus_one, tmp)			\
	vpcmpeqq minus_one, x, tmp;			\
	vpsubq minus_one, x, x;				\
	vpslldq $8, tmp, tmp;				\
	vpsubq tmp, x, x;

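/* filter_8bit() evaluates a byte-wise affine function as two 4-bit table
 * lookups: mask4bit (0x0f in every byte) splits each byte into its low and
 * high nibble, each nibble indexes a 16-entry vpshufb table (lo_t/hi_t),
 * and the two lookups are XORed to give the transformed byte.
 */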
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)	\
	vpand x, mask4bit, tmp0;			\
	vpandn x, mask4bit, x;				\
	vpsrld $4, x, x;				\
							\
	vpshufb tmp0, lo_t, tmp0;			\
	vpshufb x, hi_t, x;				\
	vpxor tmp0, x, x;

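/* transpose_4x4() transposes a 4x4 matrix of 32-bit words spread over
 * x0..x3, independently within each 128-bit lane.
 */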
#define transpose_4x4(x0, x1, x2, x3, t1, t2)		\
	vpunpckhdq x1, x0, t2;				\
	vpunpckldq x1, x0, x0;				\
							\
	vpunpckldq x3, x2, t1;				\
	vpunpckhdq x3, x2, x2;				\
							\
	vpunpckhqdq t1, x0, x1;				\
	vpunpcklqdq t1, x0, x0;				\
							\
	vpunpckhqdq x2, t2, x3;				\
	vpunpcklqdq x2, t2, x2;

#define byteslice_16x16b(a0, b0, c0, d0,		\
			 a1, b1, c1, d1,		\
			 a2, b2, c2, d2,		\
			 a3, b3, c3, d3,		\
			 st0, st1)			\
	vmovdqu d2, st0;				\
	vmovdqu d3, st1;				\
	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu a0, st0;				\
	vmovdqu a1, st1;				\
	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
							\
	vbroadcasti128 .Lshufb_16x16b(%rip), a0;	\
	vmovdqu st1, a1;				\
	vpshufb a0, a2, a2;				\
	vpshufb a0, a3, a3;				\
	vpshufb a0, b0, b0;				\
	vpshufb a0, b1, b1;				\
	vpshufb a0, b2, b2;				\
	vpshufb a0, b3, b3;				\
	vpshufb a0, a1, a1;				\
	vpshufb a0, c0, c0;				\
	vpshufb a0, c1, c1;				\
	vpshufb a0, c2, c2;				\
	vpshufb a0, c3, c3;				\
	vpshufb a0, d0, d0;				\
	vpshufb a0, d1, d1;				\
	vpshufb a0, d2, d2;				\
	vpshufb a0, d3, d3;				\
	vmovdqu d3, st1;				\
	vmovdqu st0, d3;				\
	vpshufb a0, d3, a0;				\
	vmovdqu d2, st0;				\
							\
	transpose_4x4(a0, b0, c0, d0, d2, d3);		\
	transpose_4x4(a1, b1, c1, d1, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu b0, st0;				\
	vmovdqu b1, st1;				\
	transpose_4x4(a2, b2, c2, d2, b0, b1);		\
	transpose_4x4(a3, b3, c3, d3, b0, b1);		\
	vmovdqu st0, b0;				\
	vmovdqu st1, b1;				\
	/* does not adjust output bytes inside vectors */

#define debyteslice_16x16b(a0, b0, c0, d0,		\
			   a1, b1, c1, d1,		\
			   a2, b2, c2, d2,		\
			   a3, b3, c3, d3,		\
			   st0, st1)			\
	vmovdqu d2, st0;				\
	vmovdqu d3, st1;				\
	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu a0, st0;				\
	vmovdqu a1, st1;				\
	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
							\
	vbroadcasti128 .Lshufb_16x16b(%rip), a0;	\
	vmovdqu st1, a1;				\
	vpshufb a0, a2, a2;				\
	vpshufb a0, a3, a3;				\
	vpshufb a0, b0, b0;				\
	vpshufb a0, b1, b1;				\
	vpshufb a0, b2, b2;				\
	vpshufb a0, b3, b3;				\
	vpshufb a0, a1, a1;				\
	vpshufb a0, c0, c0;				\
	vpshufb a0, c1, c1;				\
	vpshufb a0, c2, c2;				\
	vpshufb a0, c3, c3;				\
	vpshufb a0, d0, d0;				\
	vpshufb a0, d1, d1;				\
	vpshufb a0, d2, d2;				\
	vpshufb a0, d3, d3;				\
	vmovdqu d3, st1;				\
	vmovdqu st0, d3;				\
	vpshufb a0, d3, a0;				\
	vmovdqu d2, st0;				\
							\
	transpose_4x4(c0, d0, a0, b0, d2, d3);		\
	transpose_4x4(c1, d1, a1, b1, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu b0, st0;				\
	vmovdqu b1, st1;				\
	transpose_4x4(c2, d2, a2, b2, b0, b1);		\
	transpose_4x4(c3, d3, a3, b3, b0, b1);		\
	vmovdqu st0, b0;				\
	vmovdqu st1, b1;				\
	/* does not adjust output bytes inside vectors */

/* load blocks to registers */
#define inpack16_pre(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     rio)				\
	vmovdqu (0 * 32)(rio), x0;			\
	vmovdqu (1 * 32)(rio), x1;			\
	vmovdqu (2 * 32)(rio), x2;			\
	vmovdqu (3 * 32)(rio), x3;			\
	vmovdqu (4 * 32)(rio), x4;			\
	vmovdqu (5 * 32)(rio), x5;			\
	vmovdqu (6 * 32)(rio), x6;			\
	vmovdqu (7 * 32)(rio), x7;			\
	vmovdqu (8 * 32)(rio), y0;			\
	vmovdqu (9 * 32)(rio), y1;			\
	vmovdqu (10 * 32)(rio), y2;			\
	vmovdqu (11 * 32)(rio), y3;			\
	vmovdqu (12 * 32)(rio), y4;			\
	vmovdqu (13 * 32)(rio), y5;			\
	vmovdqu (14 * 32)(rio), y6;			\
	vmovdqu (15 * 32)(rio), y7;

/* byteslice input blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3,			\
		      x4, x5, x6, x7,			\
		      y0, y1, y2, y3,			\
		      y4, y5, y6, y7,			\
		      mem_ab, mem_cd)			\
	byteslice_16x16b(x0, x1, x2, x3,		\
			 x4, x5, x6, x7,		\
			 y0, y1, y2, y3,		\
			 y4, y5, y6, y7,		\
			 (mem_ab), (mem_cd));		\
							\
	vmovdqu x0, 0 * 32(mem_ab);			\
	vmovdqu x1, 1 * 32(mem_ab);			\
	vmovdqu x2, 2 * 32(mem_ab);			\
	vmovdqu x3, 3 * 32(mem_ab);			\
	vmovdqu x4, 4 * 32(mem_ab);			\
	vmovdqu x5, 5 * 32(mem_ab);			\
	vmovdqu x6, 6 * 32(mem_ab);			\
	vmovdqu x7, 7 * 32(mem_ab);			\
	vmovdqu y0, 0 * 32(mem_cd);			\
	vmovdqu y1, 1 * 32(mem_cd);			\
	vmovdqu y2, 2 * 32(mem_cd);			\
	vmovdqu y3, 3 * 32(mem_cd);			\
	vmovdqu y4, 4 * 32(mem_cd);			\
	vmovdqu y5, 5 * 32(mem_cd);			\
	vmovdqu y6, 6 * 32(mem_cd);			\
	vmovdqu y7, 7 * 32(mem_cd);

#define write_output(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem)				\
	vmovdqu x0, 0 * 32(mem);			\
	vmovdqu x1, 1 * 32(mem);			\
	vmovdqu x2, 2 * 32(mem);			\
	vmovdqu x3, 3 * 32(mem);			\
	vmovdqu x4, 4 * 32(mem);			\
	vmovdqu x5, 5 * 32(mem);			\
	vmovdqu x6, 6 * 32(mem);			\
	vmovdqu x7, 7 * 32(mem);			\
	vmovdqu y0, 8 * 32(mem);			\
	vmovdqu y1, 9 * 32(mem);			\
	vmovdqu y2, 10 * 32(mem);			\
	vmovdqu y3, 11 * 32(mem);			\
	vmovdqu y4, 12 * 32(mem);			\
	vmovdqu y5, 13 * 32(mem);			\
	vmovdqu y6, 14 * 32(mem);			\
	vmovdqu y7, 15 * 32(mem);			\

#define aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, idx)		\
	vmovdqu x0, ((idx + 0) * 32)(mem_tmp);		\
	vmovdqu x1, ((idx + 1) * 32)(mem_tmp);		\
	vmovdqu x2, ((idx + 2) * 32)(mem_tmp);		\
	vmovdqu x3, ((idx + 3) * 32)(mem_tmp);		\
	vmovdqu x4, ((idx + 4) * 32)(mem_tmp);		\
	vmovdqu x5, ((idx + 5) * 32)(mem_tmp);		\
	vmovdqu x6, ((idx + 6) * 32)(mem_tmp);		\
	vmovdqu x7, ((idx + 7) * 32)(mem_tmp);

#define aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, idx)		\
	vmovdqu ((idx + 0) * 32)(mem_tmp), x0;		\
	vmovdqu ((idx + 1) * 32)(mem_tmp), x1;		\
	vmovdqu ((idx + 2) * 32)(mem_tmp), x2;		\
	vmovdqu ((idx + 3) * 32)(mem_tmp), x3;		\
	vmovdqu ((idx + 4) * 32)(mem_tmp), x4;		\
	vmovdqu ((idx + 5) * 32)(mem_tmp), x5;		\
	vmovdqu ((idx + 6) * 32)(mem_tmp), x6;		\
	vmovdqu ((idx + 7) * 32)(mem_tmp), x7;

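/* aria_ark_8way(): AddRoundKey on byte-sliced state.  After byteslicing,
 * each xN register holds one byte position of all 32 blocks, so the round
 * key is added by broadcasting a single key byte with vpbroadcastb and
 * XORing it into the matching register; the 3,2,1,0 / 7,6,5,4 offsets pick
 * the round-key bytes in the order that matches the register assignment of
 * the byte-sliced state.
 */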
#define aria_ark_8way(x0, x1, x2, x3,			\
		      x4, x5, x6, x7,			\
		      t0, rk, idx, round)		\
	/* AddRoundKey */                               \
	vpbroadcastb ((round * 16) + idx + 3)(rk), t0;	\
	vpxor t0, x0, x0;				\
	vpbroadcastb ((round * 16) + idx + 2)(rk), t0;	\
	vpxor t0, x1, x1;				\
	vpbroadcastb ((round * 16) + idx + 1)(rk), t0;	\
	vpxor t0, x2, x2;				\
	vpbroadcastb ((round * 16) + idx + 0)(rk), t0;	\
	vpxor t0, x3, x3;				\
	vpbroadcastb ((round * 16) + idx + 7)(rk), t0;	\
	vpxor t0, x4, x4;				\
	vpbroadcastb ((round * 16) + idx + 6)(rk), t0;	\
	vpxor t0, x5, x5;				\
	vpbroadcastb ((round * 16) + idx + 5)(rk), t0;	\
	vpxor t0, x6, x6;				\
	vpbroadcastb ((round * 16) + idx + 4)(rk), t0;	\
	vpxor t0, x7, x7;

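/* aria_sbox_8way_gfni(): ARIA's S-box layer via GFNI.  vgf2p8affineinvqb
 * computes affine(inverse(x)), which yields S1 (the AES S-box affine) and
 * S2 directly; the inverse S-boxes X1/X2 are built as inverse(affine(x))
 * by applying vgf2p8affineqb first and then vgf2p8affineinvqb with the
 * identity bit-matrix.
 */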
#define aria_sbox_8way_gfni(x0, x1, x2, x3,		\
			    x4, x5, x6, x7,		\
			    t0, t1, t2, t3,		\
			    t4, t5, t6, t7)		\
	vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0;	\
	vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1;	\
	vpbroadcastq .Ltf_id_bitmatrix(%rip), t2;	\
	vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3;	\
	vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6;	\
	vgf2p8affineinvqb $0, t2, x2, x2;		\
	vgf2p8affineinvqb $0, t2, x6, x6;		\
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;	\
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;	\
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3;	\
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7;	\
	vgf2p8affineinvqb $0, t2, x3, x3;		\
	vgf2p8affineinvqb $0, t2, x7, x7

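/* aria_sbox_8way(): the same S-box layer built from AES-NI.  vaesenclast
 * with an all-zero round key applies only ShiftRows+SubBytes, and
 * vaesdeclast applies only InvShiftRows+InvSubBytes, so the paired vpshufb
 * with the (inverse) shift-row masks cancels the row shuffling and leaves
 * pure (Inv)SubBytes.  The vextracti128/vinserti128 pairs are needed
 * because the AES instructions used here operate on 128-bit registers
 * only.  S2 and X2 are then derived from the AES boxes by the extra
 * combined affine transforms applied with filter_8bit().
 */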
#define aria_sbox_8way(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       t0, t1, t2, t3,			\
		       t4, t5, t6, t7)			\
	vpxor t7, t7, t7;				\
	vpxor t6, t6, t6;				\
	vbroadcasti128 .Linv_shift_row(%rip), t0;	\
	vbroadcasti128 .Lshift_row(%rip), t1;		\
	vbroadcasti128 .Ltf_lo__inv_aff__and__s2(%rip), t2; \
	vbroadcasti128 .Ltf_hi__inv_aff__and__s2(%rip), t3; \
	vbroadcasti128 .Ltf_lo__x2__and__fwd_aff(%rip), t4; \
	vbroadcasti128 .Ltf_hi__x2__and__fwd_aff(%rip), t5; \
							\
	vextracti128 $1, x0, t6##_x;			\
	vaesenclast t7##_x, x0##_x, x0##_x;		\
	vaesenclast t7##_x, t6##_x, t6##_x;		\
	vinserti128 $1, t6##_x, x0, x0;			\
							\
	vextracti128 $1, x4, t6##_x;			\
	vaesenclast t7##_x, x4##_x, x4##_x;		\
	vaesenclast t7##_x, t6##_x, t6##_x;		\
	vinserti128 $1, t6##_x, x4, x4;			\
							\
	vextracti128 $1, x1, t6##_x;			\
	vaesenclast t7##_x, x1##_x, x1##_x;		\
	vaesenclast t7##_x, t6##_x, t6##_x;		\
	vinserti128 $1, t6##_x, x1, x1;			\
							\
	vextracti128 $1, x5, t6##_x;			\
	vaesenclast t7##_x, x5##_x, x5##_x;		\
	vaesenclast t7##_x, t6##_x, t6##_x;		\
	vinserti128 $1, t6##_x, x5, x5;			\
							\
	vextracti128 $1, x2, t6##_x;			\
	vaesdeclast t7##_x, x2##_x, x2##_x;		\
	vaesdeclast t7##_x, t6##_x, t6##_x;		\
	vinserti128 $1, t6##_x, x2, x2;			\
							\
	vextracti128 $1, x6, t6##_x;			\
	vaesdeclast t7##_x, x6##_x, x6##_x;		\
	vaesdeclast t7##_x, t6##_x, t6##_x;		\
	vinserti128 $1, t6##_x, x6, x6;			\
							\
	vpbroadcastd .L0f0f0f0f(%rip), t6;		\
							\
	/* AES inverse shift rows */			\
	vpshufb t0, x0, x0;				\
	vpshufb t0, x4, x4;				\
	vpshufb t0, x1, x1;				\
	vpshufb t0, x5, x5;				\
	vpshufb t1, x3, x3;				\
	vpshufb t1, x7, x7;				\
	vpshufb t1, x2, x2;				\
	vpshufb t1, x6, x6;				\
							\
	/* affine transformation for S2 */		\
	filter_8bit(x1, t2, t3, t6, t0);		\
	/* affine transformation for S2 */		\
	filter_8bit(x5, t2, t3, t6, t0);		\
							\
	/* affine transformation for X2 */		\
	filter_8bit(x3, t4, t5, t6, t0);		\
	/* affine transformation for X2 */		\
	filter_8bit(x7, t4, t5, t6, t0);		\
							\
	vpxor t6, t6, t6;				\
	vextracti128 $1, x3, t6##_x;			\
	vaesdeclast t7##_x, x3##_x, x3##_x;		\
	vaesdeclast t7##_x, t6##_x, t6##_x;		\
	vinserti128 $1, t6##_x, x3, x3;			\
							\
	vextracti128 $1, x7, t6##_x;			\
	vaesdeclast t7##_x, x7##_x, x7##_x;		\
	vaesdeclast t7##_x, t6##_x, t6##_x;		\
	vinserti128 $1, t6##_x, x7, x7;			\

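/* aria_diff_m(): the per-word byte mixing used to build ARIA's diffusion
 * layer.  In byte-sliced form a rotr32 by 8 or 16 bits is just a renaming
 * of which register holds which byte, so the whole step reduces to the XOR
 * network below.
 */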
#define aria_diff_m(x0, x1, x2, x3,			\
		    t0, t1, t2, t3)			\
	/* T = rotr32(X, 8); */				\
	/* X ^= T */					\
	vpxor x0, x3, t0;				\
	vpxor x1, x0, t1;				\
	vpxor x2, x1, t2;				\
	vpxor x3, x2, t3;				\
	/* X = T ^ rotr(X, 16); */			\
	vpxor t2, x0, x0;				\
	vpxor x1, t3, t3;				\
	vpxor t0, x2, x2;				\
	vpxor t1, x3, x1;				\
	vmovdqu t3, x3;

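/* aria_diff_word(): word-level part of ARIA's diffusion layer, mixing the
 * four 32-bit word groups T0..T3 (each held in four byte-sliced registers)
 * with the XOR sequence annotated below.
 */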
#define aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7)			\
	/* t1 ^= t2; */					\
	vpxor y0, x4, x4;				\
	vpxor y1, x5, x5;				\
	vpxor y2, x6, x6;				\
	vpxor y3, x7, x7;				\
							\
	/* t2 ^= t3; */					\
	vpxor y4, y0, y0;				\
	vpxor y5, y1, y1;				\
	vpxor y6, y2, y2;				\
	vpxor y7, y3, y3;				\
							\
	/* t0 ^= t1; */					\
	vpxor x4, x0, x0;				\
	vpxor x5, x1, x1;				\
	vpxor x6, x2, x2;				\
	vpxor x7, x3, x3;				\
							\
	/* t3 ^= t1; */					\
	vpxor x4, y4, y4;				\
	vpxor x5, y5, y5;				\
	vpxor x6, y6, y6;				\
	vpxor x7, y7, y7;				\
							\
	/* t2 ^= t0; */					\
	vpxor x0, y0, y0;				\
	vpxor x1, y1, y1;				\
	vpxor x2, y2, y2;				\
	vpxor x3, y3, y3;				\
							\
	/* t1 ^= t2; */					\
	vpxor y0, x4, x4;				\
	vpxor y1, x5, x5;				\
	vpxor y2, x6, x6;				\
	vpxor y3, x7, x7;

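/* aria_fo/aria_fe/aria_ff (and their _gfni twins) implement ARIA's odd,
 * even and final round functions.  The 16 byte positions of the state are
 * processed as two batches of eight registers, spilled to mem_tmp slots 0
 * and 8 in between: AddRoundKey, the S-box layer and aria_diff_m run per
 * batch, then aria_diff_word plus the rotations noted in the
 * aria_diff_byte() comments combine the two halves.
 */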
#define aria_fe(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round)			\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T3 = ABCD -> BADC				\
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
	 * T0 = ABCD -> CDAB				\
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
	 * T1 = ABCD -> DCBA				\
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
	 */						\
	aria_diff_word(x2, x3, x0, x1,			\
		       x7, x6, x5, x4,			\
		       y0, y1, y2, y3,			\
		       y5, y4, y7, y6);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);

#define aria_fo(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round)			\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T1 = ABCD -> BADC				\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB				\
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
	 * T3 = ABCD -> DCBA				\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
	 */						\
	aria_diff_word(x0, x1, x2, x3,			\
		       x5, x4, x7, x6,			\
		       y2, y3, y0, y1,			\
		       y7, y6, y5, y4);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);

#define aria_ff(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round, last_round)		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, last_round);		\
							\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, last_round);		\
							\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);

#define aria_fe_gfni(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem_tmp, rk, round)		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T3 = ABCD -> BADC				\
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
	 * T0 = ABCD -> CDAB				\
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
	 * T1 = ABCD -> DCBA				\
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
	 */						\
	aria_diff_word(x2, x3, x0, x1,			\
		       x7, x6, x5, x4,			\
		       y0, y1, y2, y3,			\
		       y5, y4, y7, y6);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);

#define aria_fo_gfni(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem_tmp, rk, round)		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way_gfni(x0, x1, x2, x3,		\
			    x4, x5, x6, x7,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way_gfni(x0, x1, x2, x3,		\
			    x4, x5, x6, x7,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T1 = ABCD -> BADC				\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB				\
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
	 * T3 = ABCD -> DCBA				\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
	 */						\
	aria_diff_word(x0, x1, x2, x3,			\
		       x5, x4, x7, x6,			\
		       y2, y3, y0, y1,			\
		       y7, y6, y5, y4);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);

#define aria_ff_gfni(x0, x1, x2, x3,			\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round, last_round)		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, last_round);		\
							\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, last_round);		\
							\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);

.section        .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.section	.rodata.cst16, "aM", @progbits, 16
.align 16
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
.Lshift_row:
	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

/* AES inverse affine and S2 combined:
 *      1 1 0 0 0 0 0 1     x0     0
 *      0 1 0 0 1 0 0 0     x1     0
 *      1 1 0 0 1 1 1 1     x2     0
 *      0 1 1 0 1 0 0 1     x3     1
 *      0 1 0 0 1 1 0 0  *  x4  +  0
 *      0 1 0 1 1 0 0 0     x5     0
 *      0 0 0 0 0 1 0 1     x6     0
 *      1 1 1 0 0 1 1 1     x7     1
 */
.Ltf_lo__inv_aff__and__s2:
	.octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
	.octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
 *      1 0 1 1 0 0 0 1     x0     0
 *      0 1 1 1 1 0 1 1     x1     0
 *      0 0 0 1 1 0 1 0     x2     1
 *      0 1 0 0 0 1 0 0     x3     0
 *      0 0 1 1 1 0 1 1  *  x4  +  0
 *      0 1 0 0 1 0 0 0     x5     0
 *      1 1 0 1 0 0 1 1     x6     0
 *      0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
	.octa 0x3F893781E95FE1576CDA64D2BA0CB204
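
/* Each .Ltf_lo/.Ltf_hi pair above is a 16-entry vpshufb lookup table for
 * the low and high nibble of the combined affine transform shown in the
 * matrix comment, consumed by filter_8bit().
 */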

.section	.rodata.cst8, "aM", @progbits, 8
.align 8
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))

/* 4-bit mask */
.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

SYM_FUNC_START_LOCAL(__aria_aesni_avx2_crypt_32way)
	/* input:
	 *      %r9: rk
	 *      %rsi: dst
	 *      %rdx: src
	 *      %ymm0..%ymm15: byte-sliced blocks
	 */
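	/* dst doubles as the 32-block scratch area: %rax and %r8 point at
	 * its lower and upper 8 * 32-byte halves and are passed to the
	 * byteslice and round macros as mem_ab/mem_cd/mem_tmp.
	 */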

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 32(%rax), %r8;

	inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %r8);
	aria_fo(%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 0);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 1);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 2);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 3);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 4);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 5);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 6);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 7);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 8);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 9);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 10);
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_192;
	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 11, 12);
	jmp .Laria_end;
.Laria_192:
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 11);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_256;
	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 13, 14);
	jmp .Laria_end;
.Laria_256:
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 13);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 14);
	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 15, 16);
.Laria_end:
	debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
			   %ymm9, %ymm13, %ymm0, %ymm5,
			   %ymm10, %ymm14, %ymm3, %ymm6,
			   %ymm11, %ymm15, %ymm2, %ymm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_crypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_encrypt_32way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_crypt_32way;

	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_encrypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_decrypt_32way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_crypt_32way;

	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_decrypt_32way)

SYM_FUNC_START_LOCAL(__aria_aesni_avx2_ctr_gen_keystream_32way)
	/* input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */
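	/* Writes the next 32 big-endian counter blocks to the keystream
	 * buffer and stores the incremented IV (+32) back to (%r8).
	 */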

	FRAME_BEGIN
	movq 8(%r8), %r11;
	bswapq %r11;

	vbroadcasti128 .Lbswap128_mask (%rip), %ymm6;
	vpcmpeqd %ymm0, %ymm0, %ymm0;
	vpsrldq $8, %ymm0, %ymm0;   /* ab: -1:0 ; cd: -1:0 */
	vpaddq %ymm0, %ymm0, %ymm5; /* ab: -2:0 ; cd: -2:0 */

	/* load IV and byteswap */
	vmovdqu (%r8), %xmm7;
	vpshufb %xmm6, %xmm7, %xmm7;
	vmovdqa %xmm7, %xmm3;
	inc_le128(%xmm7, %xmm0, %xmm4);
	vinserti128 $1, %xmm7, %ymm3, %ymm3;
	vpshufb %ymm6, %ymm3, %ymm8; /* +1 ; +0 */

	/* check need for handling 64-bit overflow and carry */
	cmpq $(0xffffffffffffffff - 32), %r11;
	ja .Lhandle_ctr_carry;

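	/* Fast path: adding 32 cannot wrap the low 64 bits of the counter,
	 * so each step below advances both 128-bit lanes by 2 with a single
	 * 64-bit vpsubq of -2 (the two lanes already differ by 1).
	 */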
	/* construct IVs */
	vpsubq %ymm5, %ymm3, %ymm3; /* +3 ; +2 */
	vpshufb %ymm6, %ymm3, %ymm9;
	vpsubq %ymm5, %ymm3, %ymm3; /* +5 ; +4 */
	vpshufb %ymm6, %ymm3, %ymm10;
	vpsubq %ymm5, %ymm3, %ymm3; /* +7 ; +6 */
	vpshufb %ymm6, %ymm3, %ymm11;
	vpsubq %ymm5, %ymm3, %ymm3; /* +9 ; +8 */
	vpshufb %ymm6, %ymm3, %ymm12;
	vpsubq %ymm5, %ymm3, %ymm3; /* +11 ; +10 */
	vpshufb %ymm6, %ymm3, %ymm13;
	vpsubq %ymm5, %ymm3, %ymm3; /* +13 ; +12 */
	vpshufb %ymm6, %ymm3, %ymm14;
	vpsubq %ymm5, %ymm3, %ymm3; /* +15 ; +14 */
	vpshufb %ymm6, %ymm3, %ymm15;
	vmovdqu %ymm8, (0 * 32)(%rcx);
	vmovdqu %ymm9, (1 * 32)(%rcx);
	vmovdqu %ymm10, (2 * 32)(%rcx);
	vmovdqu %ymm11, (3 * 32)(%rcx);
	vmovdqu %ymm12, (4 * 32)(%rcx);
	vmovdqu %ymm13, (5 * 32)(%rcx);
	vmovdqu %ymm14, (6 * 32)(%rcx);
	vmovdqu %ymm15, (7 * 32)(%rcx);

	vpsubq %ymm5, %ymm3, %ymm3; /* +17 ; +16 */
	vpshufb %ymm6, %ymm3, %ymm8;
	vpsubq %ymm5, %ymm3, %ymm3; /* +19 ; +18 */
	vpshufb %ymm6, %ymm3, %ymm9;
	vpsubq %ymm5, %ymm3, %ymm3; /* +21 ; +20 */
	vpshufb %ymm6, %ymm3, %ymm10;
	vpsubq %ymm5, %ymm3, %ymm3; /* +23 ; +22 */
	vpshufb %ymm6, %ymm3, %ymm11;
	vpsubq %ymm5, %ymm3, %ymm3; /* +25 ; +24 */
	vpshufb %ymm6, %ymm3, %ymm12;
	vpsubq %ymm5, %ymm3, %ymm3; /* +27 ; +26 */
	vpshufb %ymm6, %ymm3, %ymm13;
	vpsubq %ymm5, %ymm3, %ymm3; /* +29 ; +28 */
	vpshufb %ymm6, %ymm3, %ymm14;
	vpsubq %ymm5, %ymm3, %ymm3; /* +31 ; +30 */
	vpshufb %ymm6, %ymm3, %ymm15;
	vpsubq %ymm5, %ymm3, %ymm3; /* +32 */
	vpshufb %xmm6, %xmm3, %xmm3;
	vmovdqu %xmm3, (%r8);
	vmovdqu (0 * 32)(%rcx), %ymm0;
	vmovdqu (1 * 32)(%rcx), %ymm1;
	vmovdqu (2 * 32)(%rcx), %ymm2;
	vmovdqu (3 * 32)(%rcx), %ymm3;
	vmovdqu (4 * 32)(%rcx), %ymm4;
	vmovdqu (5 * 32)(%rcx), %ymm5;
	vmovdqu (6 * 32)(%rcx), %ymm6;
	vmovdqu (7 * 32)(%rcx), %ymm7;
	jmp .Lctr_carry_done;

	.Lhandle_ctr_carry:
	/* construct IVs */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm9; /* +3 ; +2 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm10; /* +5 ; +4 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm11; /* +7 ; +6 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm12; /* +9 ; +8 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm13; /* +11 ; +10 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm14; /* +13 ; +12 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm15; /* +15 ; +14 */
	vmovdqu %ymm8, (0 * 32)(%rcx);
	vmovdqu %ymm9, (1 * 32)(%rcx);
	vmovdqu %ymm10, (2 * 32)(%rcx);
	vmovdqu %ymm11, (3 * 32)(%rcx);
	vmovdqu %ymm12, (4 * 32)(%rcx);
	vmovdqu %ymm13, (5 * 32)(%rcx);
	vmovdqu %ymm14, (6 * 32)(%rcx);
	vmovdqu %ymm15, (7 * 32)(%rcx);

	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm8; /* +17 ; +16 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm9; /* +19 ; +18 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm10; /* +21 ; +20 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm11; /* +23 ; +22 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm12; /* +25 ; +24 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm13; /* +27 ; +26 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm14; /* +29 ; +28 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm15; /* +31 ; +30 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	vextracti128 $1, %ymm3, %xmm3;
	vpshufb %xmm6, %xmm3, %xmm3; /* +32 */
	vmovdqu %xmm3, (%r8);
	vmovdqu (0 * 32)(%rcx), %ymm0;
	vmovdqu (1 * 32)(%rcx), %ymm1;
	vmovdqu (2 * 32)(%rcx), %ymm2;
	vmovdqu (3 * 32)(%rcx), %ymm3;
	vmovdqu (4 * 32)(%rcx), %ymm4;
	vmovdqu (5 * 32)(%rcx), %ymm5;
	vmovdqu (6 * 32)(%rcx), %ymm6;
	vmovdqu (7 * 32)(%rcx), %ymm7;

	.Lctr_carry_done:

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_ctr_gen_keystream_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_ctr_crypt_32way)
	/* input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx2_ctr_gen_keystream_32way;

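	/* Keep the original dst/src in %r10/%r11 and run the block cipher
	 * over the keystream buffer in place; the encrypted counters are
	 * XORed with the saved src below.
	 */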
	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx2_crypt_32way;

	vpxor (0 * 32)(%r11), %ymm1, %ymm1;
	vpxor (1 * 32)(%r11), %ymm0, %ymm0;
	vpxor (2 * 32)(%r11), %ymm3, %ymm3;
	vpxor (3 * 32)(%r11), %ymm2, %ymm2;
	vpxor (4 * 32)(%r11), %ymm4, %ymm4;
	vpxor (5 * 32)(%r11), %ymm5, %ymm5;
	vpxor (6 * 32)(%r11), %ymm6, %ymm6;
	vpxor (7 * 32)(%r11), %ymm7, %ymm7;
	vpxor (8 * 32)(%r11), %ymm8, %ymm8;
	vpxor (9 * 32)(%r11), %ymm9, %ymm9;
	vpxor (10 * 32)(%r11), %ymm10, %ymm10;
	vpxor (11 * 32)(%r11), %ymm11, %ymm11;
	vpxor (12 * 32)(%r11), %ymm12, %ymm12;
	vpxor (13 * 32)(%r11), %ymm13, %ymm13;
	vpxor (14 * 32)(%r11), %ymm14, %ymm14;
	vpxor (15 * 32)(%r11), %ymm15, %ymm15;
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_ctr_crypt_32way)

SYM_FUNC_START_LOCAL(__aria_aesni_avx2_gfni_crypt_32way)
	/* input:
	 *      %r9: rk
	 *      %rsi: dst
	 *      %rdx: src
	 *      %ymm0..%ymm15: 16 byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 32(%rax), %r8;

	inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3,
		      %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11,
		      %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %r8);
	aria_fo_gfni(%ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 0);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 1);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 2);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 3);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 4);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 5);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 6);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 7);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 8);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 9);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 10);
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_192;
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 11, 12);
	jmp .Laria_gfni_end;
.Laria_gfni_192:
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 11);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_256;
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 13, 14);
	jmp .Laria_gfni_end;
.Laria_gfni_256:
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 13);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 14);
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 15, 16);
.Laria_gfni_end:
	debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
			   %ymm9, %ymm13, %ymm0, %ymm5,
			   %ymm10, %ymm14, %ymm3, %ymm6,
			   %ymm11, %ymm15, %ymm2, %ymm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_gfni_crypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_encrypt_32way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_gfni_crypt_32way;

	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_encrypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_decrypt_32way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_gfni_crypt_32way;

	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_decrypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_ctr_crypt_32way)
	/* input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx2_ctr_gen_keystream_32way

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx2_gfni_crypt_32way;

	vpxor (0 * 32)(%r11), %ymm1, %ymm1;
	vpxor (1 * 32)(%r11), %ymm0, %ymm0;
	vpxor (2 * 32)(%r11), %ymm3, %ymm3;
	vpxor (3 * 32)(%r11), %ymm2, %ymm2;
	vpxor (4 * 32)(%r11), %ymm4, %ymm4;
	vpxor (5 * 32)(%r11), %ymm5, %ymm5;
	vpxor (6 * 32)(%r11), %ymm6, %ymm6;
	vpxor (7 * 32)(%r11), %ymm7, %ymm7;
	vpxor (8 * 32)(%r11), %ymm8, %ymm8;
	vpxor (9 * 32)(%r11), %ymm9, %ymm9;
	vpxor (10 * 32)(%r11), %ymm10, %ymm10;
	vpxor (11 * 32)(%r11), %ymm11, %ymm11;
	vpxor (12 * 32)(%r11), %ymm12, %ymm12;
	vpxor (13 * 32)(%r11), %ymm13, %ymm13;
	vpxor (14 * 32)(%r11), %ymm14, %ymm14;
	vpxor (15 * 32)(%r11), %ymm15, %ymm15;
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_ctr_crypt_32way)