1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * ARIA Cipher 16-way parallel algorithm (AVX)
4 *
5 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
6 *
7 */
8
9#include <linux/linkage.h>
10#include <linux/cfi_types.h>
11#include <asm/asm-offsets.h>
12#include <asm/frame.h>
13
14/* register macros */
15#define CTX %rdi
16
17
18#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)		\
19	( (((a0) & 1) << 0) |				\
20	  (((a1) & 1) << 1) |				\
21	  (((a2) & 1) << 2) |				\
22	  (((a3) & 1) << 3) |				\
23	  (((a4) & 1) << 4) |				\
24	  (((a5) & 1) << 5) |				\
25	  (((a6) & 1) << 6) |				\
26	  (((a7) & 1) << 7) )
27
28#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)		\
29	( ((l7) << (0 * 8)) |				\
30	  ((l6) << (1 * 8)) |				\
31	  ((l5) << (2 * 8)) |				\
32	  ((l4) << (3 * 8)) |				\
33	  ((l3) << (4 * 8)) |				\
34	  ((l2) << (5 * 8)) |				\
35	  ((l1) << (6 * 8)) |				\
36	  ((l0) << (7 * 8)) )
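/* BV8() packs eight bit values into one byte with a0 as bit 0 (LSB),
 * e.g. BV8(1, 0, 0, 0, 0, 0, 0, 0) == 0x01 and
 * BV8(0, 0, 0, 0, 0, 0, 0, 1) == 0x80.  BM8X8() packs eight such row
 * bytes into a 64-bit bit-matrix (l0 ends up in the most significant
 * byte), the operand layout used by the vgf2p8affineqb /
 * vgf2p8affineinvqb bit-matrices defined below (each matrix is emitted
 * twice to fill a 16-byte .rodata.cst16 slot).
 */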
37
38#define inc_le128(x, minus_one, tmp)			\
39	vpcmpeqq minus_one, x, tmp;			\
40	vpsubq minus_one, x, x;				\
41	vpslldq $8, tmp, tmp;				\
42	vpsubq tmp, x, x;
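/* Add 1 to a 128-bit little-endian counter held in one xmm register:
 * minus_one must contain { -1, 0 } (low/high qword, see the
 * vpcmpeqd/vpsrldq setup in the CTR code).  vpsubq adds 1 to the low
 * qword, vpcmpeqq flags a low qword that was all-ones (about to wrap),
 * and the vpslldq/vpsubq pair turns that flag into a carry into the
 * high qword.
 */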
43
44#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)	\
45	vpand x, mask4bit, tmp0;			\
46	vpandn x, mask4bit, x;				\
47	vpsrld $4, x, x;				\
48							\
49	vpshufb tmp0, lo_t, tmp0;			\
50	vpshufb x, hi_t, x;				\
51	vpxor tmp0, x, x;
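/* Apply an 8-bit transform as two 4-bit table lookups:
 * tmp0 = lo_t[x & 0x0f], x = hi_t[x >> 4], result = tmp0 ^ x.
 * mask4bit must hold 0x0f in every byte (.L0f0f0f0f); lo_t/hi_t are
 * the 16-entry vpshufb tables for the low/high nibble contributions.
 */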
52
53#define transpose_4x4(x0, x1, x2, x3, t1, t2)		\
54	vpunpckhdq x1, x0, t2;				\
55	vpunpckldq x1, x0, x0;				\
56							\
57	vpunpckldq x3, x2, t1;				\
58	vpunpckhdq x3, x2, x2;				\
59							\
60	vpunpckhqdq t1, x0, x1;				\
61	vpunpcklqdq t1, x0, x0;				\
62							\
63	vpunpckhqdq x2, t2, x3;				\
64	vpunpcklqdq x2, t2, x2;
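/* 4x4 transpose of 32-bit words across four registers; t1/t2 are
 * clobbered as temporaries.  Building block of the 16x16 byte-slicing
 * below.
 */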
65
66#define byteslice_16x16b(a0, b0, c0, d0,		\
67			 a1, b1, c1, d1,		\
68			 a2, b2, c2, d2,		\
69			 a3, b3, c3, d3,		\
70			 st0, st1)			\
71	vmovdqu d2, st0;				\
72	vmovdqu d3, st1;				\
73	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
74	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
75	vmovdqu st0, d2;				\
76	vmovdqu st1, d3;				\
77							\
78	vmovdqu a0, st0;				\
79	vmovdqu a1, st1;				\
80	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
81	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
82							\
83	vmovdqu .Lshufb_16x16b(%rip), a0;		\
84	vmovdqu st1, a1;				\
85	vpshufb a0, a2, a2;				\
86	vpshufb a0, a3, a3;				\
87	vpshufb a0, b0, b0;				\
88	vpshufb a0, b1, b1;				\
89	vpshufb a0, b2, b2;				\
90	vpshufb a0, b3, b3;				\
91	vpshufb a0, a1, a1;				\
92	vpshufb a0, c0, c0;				\
93	vpshufb a0, c1, c1;				\
94	vpshufb a0, c2, c2;				\
95	vpshufb a0, c3, c3;				\
96	vpshufb a0, d0, d0;				\
97	vpshufb a0, d1, d1;				\
98	vpshufb a0, d2, d2;				\
99	vpshufb a0, d3, d3;				\
100	vmovdqu d3, st1;				\
101	vmovdqu st0, d3;				\
102	vpshufb a0, d3, a0;				\
103	vmovdqu d2, st0;				\
104							\
105	transpose_4x4(a0, b0, c0, d0, d2, d3);		\
106	transpose_4x4(a1, b1, c1, d1, d2, d3);		\
107	vmovdqu st0, d2;				\
108	vmovdqu st1, d3;				\
109							\
110	vmovdqu b0, st0;				\
111	vmovdqu b1, st1;				\
112	transpose_4x4(a2, b2, c2, d2, b0, b1);		\
113	transpose_4x4(a3, b3, c3, d3, b0, b1);		\
114	vmovdqu st0, b0;				\
115	vmovdqu st1, b1;				\
116	/* does not adjust output bytes inside vectors */
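/* In effect this transposes the 16x16 byte matrix formed by the
 * sixteen input registers: afterwards each register holds the same
 * byte position of all 16 blocks (one block per byte lane), so a
 * single 16-byte-wide instruction processes that state byte of all 16
 * blocks at once.  st0/st1 are memory scratch slots since all 16 xmm
 * registers are live.
 */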
117
118#define debyteslice_16x16b(a0, b0, c0, d0,		\
119			   a1, b1, c1, d1,		\
120			   a2, b2, c2, d2,		\
121			   a3, b3, c3, d3,		\
122			   st0, st1)			\
123	vmovdqu d2, st0;				\
124	vmovdqu d3, st1;				\
125	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
126	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
127	vmovdqu st0, d2;				\
128	vmovdqu st1, d3;				\
129							\
130	vmovdqu a0, st0;				\
131	vmovdqu a1, st1;				\
132	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
133	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
134							\
135	vmovdqu .Lshufb_16x16b(%rip), a0;		\
136	vmovdqu st1, a1;				\
137	vpshufb a0, a2, a2;				\
138	vpshufb a0, a3, a3;				\
139	vpshufb a0, b0, b0;				\
140	vpshufb a0, b1, b1;				\
141	vpshufb a0, b2, b2;				\
142	vpshufb a0, b3, b3;				\
143	vpshufb a0, a1, a1;				\
144	vpshufb a0, c0, c0;				\
145	vpshufb a0, c1, c1;				\
146	vpshufb a0, c2, c2;				\
147	vpshufb a0, c3, c3;				\
148	vpshufb a0, d0, d0;				\
149	vpshufb a0, d1, d1;				\
150	vpshufb a0, d2, d2;				\
151	vpshufb a0, d3, d3;				\
152	vmovdqu d3, st1;				\
153	vmovdqu st0, d3;				\
154	vpshufb a0, d3, a0;				\
155	vmovdqu d2, st0;				\
156							\
157	transpose_4x4(c0, d0, a0, b0, d2, d3);		\
158	transpose_4x4(c1, d1, a1, b1, d2, d3);		\
159	vmovdqu st0, d2;				\
160	vmovdqu st1, d3;				\
161							\
162	vmovdqu b0, st0;				\
163	vmovdqu b1, st1;				\
164	transpose_4x4(c2, d2, a2, b2, b0, b1);		\
165	transpose_4x4(c3, d3, a3, b3, b0, b1);		\
166	vmovdqu st0, b0;				\
167	vmovdqu st1, b1;				\
168	/* does not adjust output bytes inside vectors */
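/* Same transposition network as byteslice_16x16b; only the register
 * order of the second group of transpose_4x4 calls differs, which
 * undoes the byte-slicing and restores per-block byte order for
 * output.
 */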
169
/* load blocks to registers (the round keys are applied later, in aria_ark_8way) */
171#define inpack16_pre(x0, x1, x2, x3,			\
172		     x4, x5, x6, x7,			\
173		     y0, y1, y2, y3,			\
174		     y4, y5, y6, y7,			\
175		     rio)				\
176	vmovdqu (0 * 16)(rio), x0;			\
177	vmovdqu (1 * 16)(rio), x1;			\
178	vmovdqu (2 * 16)(rio), x2;			\
179	vmovdqu (3 * 16)(rio), x3;			\
180	vmovdqu (4 * 16)(rio), x4;			\
181	vmovdqu (5 * 16)(rio), x5;			\
182	vmovdqu (6 * 16)(rio), x6;			\
183	vmovdqu (7 * 16)(rio), x7;			\
184	vmovdqu (8 * 16)(rio), y0;			\
185	vmovdqu (9 * 16)(rio), y1;			\
186	vmovdqu (10 * 16)(rio), y2;			\
187	vmovdqu (11 * 16)(rio), y3;			\
188	vmovdqu (12 * 16)(rio), y4;			\
189	vmovdqu (13 * 16)(rio), y5;			\
190	vmovdqu (14 * 16)(rio), y6;			\
191	vmovdqu (15 * 16)(rio), y7;
192
/* byteslice input blocks and store to temporary memory */
194#define inpack16_post(x0, x1, x2, x3,			\
195		      x4, x5, x6, x7,			\
196		      y0, y1, y2, y3,			\
197		      y4, y5, y6, y7,			\
198		      mem_ab, mem_cd)			\
199	byteslice_16x16b(x0, x1, x2, x3,		\
200			 x4, x5, x6, x7,		\
201			 y0, y1, y2, y3,		\
202			 y4, y5, y6, y7,		\
203			 (mem_ab), (mem_cd));		\
204							\
205	vmovdqu x0, 0 * 16(mem_ab);			\
206	vmovdqu x1, 1 * 16(mem_ab);			\
207	vmovdqu x2, 2 * 16(mem_ab);			\
208	vmovdqu x3, 3 * 16(mem_ab);			\
209	vmovdqu x4, 4 * 16(mem_ab);			\
210	vmovdqu x5, 5 * 16(mem_ab);			\
211	vmovdqu x6, 6 * 16(mem_ab);			\
212	vmovdqu x7, 7 * 16(mem_ab);			\
213	vmovdqu y0, 0 * 16(mem_cd);			\
214	vmovdqu y1, 1 * 16(mem_cd);			\
215	vmovdqu y2, 2 * 16(mem_cd);			\
216	vmovdqu y3, 3 * 16(mem_cd);			\
217	vmovdqu y4, 4 * 16(mem_cd);			\
218	vmovdqu y5, 5 * 16(mem_cd);			\
219	vmovdqu y6, 6 * 16(mem_cd);			\
220	vmovdqu y7, 7 * 16(mem_cd);
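/* After byte-slicing, the x registers are spilled to mem_ab and the y
 * registers to mem_cd; together they form the 256-byte scratch area
 * (mem_tmp, idx 0/8) that the round macros below spill to and reload
 * one 8-register half at a time.
 */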
221
222#define write_output(x0, x1, x2, x3,			\
223		     x4, x5, x6, x7,			\
224		     y0, y1, y2, y3,			\
225		     y4, y5, y6, y7,			\
226		     mem)				\
227	vmovdqu x0, 0 * 16(mem);			\
228	vmovdqu x1, 1 * 16(mem);			\
229	vmovdqu x2, 2 * 16(mem);			\
230	vmovdqu x3, 3 * 16(mem);			\
231	vmovdqu x4, 4 * 16(mem);			\
232	vmovdqu x5, 5 * 16(mem);			\
233	vmovdqu x6, 6 * 16(mem);			\
234	vmovdqu x7, 7 * 16(mem);			\
235	vmovdqu y0, 8 * 16(mem);			\
236	vmovdqu y1, 9 * 16(mem);			\
237	vmovdqu y2, 10 * 16(mem);			\
238	vmovdqu y3, 11 * 16(mem);			\
239	vmovdqu y4, 12 * 16(mem);			\
240	vmovdqu y5, 13 * 16(mem);			\
241	vmovdqu y6, 14 * 16(mem);			\
	vmovdqu y7, 15 * 16(mem);
243
244#define aria_store_state_8way(x0, x1, x2, x3,		\
245			      x4, x5, x6, x7,		\
246			      mem_tmp, idx)		\
247	vmovdqu x0, ((idx + 0) * 16)(mem_tmp);		\
248	vmovdqu x1, ((idx + 1) * 16)(mem_tmp);		\
249	vmovdqu x2, ((idx + 2) * 16)(mem_tmp);		\
250	vmovdqu x3, ((idx + 3) * 16)(mem_tmp);		\
251	vmovdqu x4, ((idx + 4) * 16)(mem_tmp);		\
252	vmovdqu x5, ((idx + 5) * 16)(mem_tmp);		\
253	vmovdqu x6, ((idx + 6) * 16)(mem_tmp);		\
254	vmovdqu x7, ((idx + 7) * 16)(mem_tmp);
255
256#define aria_load_state_8way(x0, x1, x2, x3,		\
257			     x4, x5, x6, x7,		\
258			     mem_tmp, idx)		\
259	vmovdqu ((idx + 0) * 16)(mem_tmp), x0;		\
260	vmovdqu ((idx + 1) * 16)(mem_tmp), x1;		\
261	vmovdqu ((idx + 2) * 16)(mem_tmp), x2;		\
262	vmovdqu ((idx + 3) * 16)(mem_tmp), x3;		\
263	vmovdqu ((idx + 4) * 16)(mem_tmp), x4;		\
264	vmovdqu ((idx + 5) * 16)(mem_tmp), x5;		\
265	vmovdqu ((idx + 6) * 16)(mem_tmp), x6;		\
266	vmovdqu ((idx + 7) * 16)(mem_tmp), x7;
267
268#define aria_ark_8way(x0, x1, x2, x3,			\
269		      x4, x5, x6, x7,			\
270		      t0, t1, t2, rk,			\
271		      idx, round)			\
272	/* AddRoundKey */                               \
273	vbroadcastss ((round * 16) + idx + 0)(rk), t0;	\
274	vpsrld $24, t0, t2;				\
275	vpshufb t1, t2, t2;				\
276	vpxor t2, x0, x0;				\
277	vpsrld $16, t0, t2;				\
278	vpshufb t1, t2, t2;				\
279	vpxor t2, x1, x1;				\
280	vpsrld $8, t0, t2;				\
281	vpshufb t1, t2, t2;				\
282	vpxor t2, x2, x2;				\
283	vpshufb t1, t0, t2;				\
284	vpxor t2, x3, x3;				\
285	vbroadcastss ((round * 16) + idx + 4)(rk), t0;	\
286	vpsrld $24, t0, t2;				\
287	vpshufb t1, t2, t2;				\
288	vpxor t2, x4, x4;				\
289	vpsrld $16, t0, t2;				\
290	vpshufb t1, t2, t2;				\
291	vpxor t2, x5, x5;				\
292	vpsrld $8, t0, t2;				\
293	vpshufb t1, t2, t2;				\
294	vpxor t2, x6, x6;				\
295	vpshufb t1, t0, t2;				\
296	vpxor t2, x7, x7;
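/* Byte-sliced AddRoundKey: vbroadcastss fetches one 32-bit round-key
 * word, each shift isolates one of its bytes in the low byte of every
 * dword, and vpshufb with t1 as an all-zero index vector splats that
 * byte across the whole register before it is XORed into the matching
 * state register.  t1 must be zero on entry (the callers pass a
 * cleared y7).
 */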
297
298#define aria_sbox_8way_gfni(x0, x1, x2, x3,		\
299			    x4, x5, x6, x7,		\
300			    t0, t1, t2, t3,		\
301			    t4, t5, t6, t7)		\
302	vmovdqa .Ltf_s2_bitmatrix(%rip), t0;		\
303	vmovdqa .Ltf_inv_bitmatrix(%rip), t1;		\
304	vmovdqa .Ltf_id_bitmatrix(%rip), t2;		\
305	vmovdqa .Ltf_aff_bitmatrix(%rip), t3;		\
306	vmovdqa .Ltf_x2_bitmatrix(%rip), t4;		\
307	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
308	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
309	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
310	vgf2p8affineqb $(tf_inv_const), t1, x6, x6;	\
311	vgf2p8affineinvqb $0, t2, x2, x2;		\
312	vgf2p8affineinvqb $0, t2, x6, x6;		\
313	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;	\
314	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;	\
315	vgf2p8affineqb $(tf_x2_const), t4, x3, x3;	\
316	vgf2p8affineqb $(tf_x2_const), t4, x7, x7;	\
317	vgf2p8affineinvqb $0, t2, x3, x3;		\
318	vgf2p8affineinvqb $0, t2, x7, x7
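/* GFNI substitution layer: vgf2p8affineinvqb inverts each byte in
 * GF(2^8) and then applies the given 8x8 bit-matrix and constant,
 * while vgf2p8affineqb applies only the affine step.  Roughly, this
 * yields ARIA's S1 (the AES S-box) on x0/x4, S2 on x1/x5, and their
 * inverses X1/X2 on x2/x6 and x3/x7; the identity-matrix passes supply
 * the field inversion for the inverse S-boxes.
 */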
319
320#define aria_sbox_8way(x0, x1, x2, x3,            	\
321		       x4, x5, x6, x7,			\
322		       t0, t1, t2, t3,			\
323		       t4, t5, t6, t7)			\
324	vmovdqa .Linv_shift_row(%rip), t0;		\
325	vmovdqa .Lshift_row(%rip), t1;			\
326	vbroadcastss .L0f0f0f0f(%rip), t6;		\
327	vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2;	\
328	vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3;	\
329	vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4;	\
330	vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5;	\
331							\
332	vaesenclast t7, x0, x0;				\
333	vaesenclast t7, x4, x4;				\
334	vaesenclast t7, x1, x1;				\
335	vaesenclast t7, x5, x5;				\
336	vaesdeclast t7, x2, x2;				\
337	vaesdeclast t7, x6, x6;				\
338							\
339	/* AES inverse shift rows */			\
340	vpshufb t0, x0, x0;				\
341	vpshufb t0, x4, x4;				\
342	vpshufb t0, x1, x1;				\
343	vpshufb t0, x5, x5;				\
344	vpshufb t1, x3, x3;				\
345	vpshufb t1, x7, x7;				\
346	vpshufb t1, x2, x2;				\
347	vpshufb t1, x6, x6;				\
348							\
349	/* affine transformation for S2 */		\
350	filter_8bit(x1, t2, t3, t6, t0);		\
351	/* affine transformation for S2 */		\
352	filter_8bit(x5, t2, t3, t6, t0);		\
353							\
354	/* affine transformation for X2 */		\
355	filter_8bit(x3, t4, t5, t6, t0);		\
356	/* affine transformation for X2 */		\
357	filter_8bit(x7, t4, t5, t6, t0);		\
358	vaesdeclast t7, x3, x3;				\
359	vaesdeclast t7, x7, x7;
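/* AES-NI substitution layer: t7 must be zero, so vaesenclast /
 * vaesdeclast reduce to (Inv)SubBytes plus (Inv)ShiftRows, and the
 * .Linv_shift_row/.Lshift_row shuffles cancel the row shifts, leaving
 * the plain AES S-box (ARIA S1) and inverse S-box (X1).  The
 * filter_8bit() affine transforms, using the combined matrices below,
 * turn those into ARIA's S2 (x1/x5) and X2 (x3/x7).
 */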
360
361#define aria_diff_m(x0, x1, x2, x3,			\
362		    t0, t1, t2, t3)			\
363	/* T = rotr32(X, 8); */				\
364	/* X ^= T */					\
365	vpxor x0, x3, t0;				\
366	vpxor x1, x0, t1;				\
367	vpxor x2, x1, t2;				\
368	vpxor x3, x2, t3;				\
369	/* X = T ^ rotr(X, 16); */			\
370	vpxor t2, x0, x0;				\
371	vpxor x1, t3, t3;				\
372	vpxor t0, x2, x2;				\
373	vpxor t1, x3, x1;				\
374	vmovdqu t3, x3;
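/* Byte-sliced form of the rotate-and-XOR formula in the comments
 * above: with x0..x3 holding the four bytes of each 32-bit word, every
 * output byte becomes the XOR of the other three bytes of its word, so
 * the transform collapses into plain register-to-register XORs.
 */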
375
376#define aria_diff_word(x0, x1, x2, x3,			\
377		       x4, x5, x6, x7,			\
378		       y0, y1, y2, y3,			\
379		       y4, y5, y6, y7)			\
380	/* t1 ^= t2; */					\
381	vpxor y0, x4, x4;				\
382	vpxor y1, x5, x5;				\
383	vpxor y2, x6, x6;				\
384	vpxor y3, x7, x7;				\
385							\
386	/* t2 ^= t3; */					\
387	vpxor y4, y0, y0;				\
388	vpxor y5, y1, y1;				\
389	vpxor y6, y2, y2;				\
390	vpxor y7, y3, y3;				\
391							\
392	/* t0 ^= t1; */					\
393	vpxor x4, x0, x0;				\
394	vpxor x5, x1, x1;				\
395	vpxor x6, x2, x2;				\
396	vpxor x7, x3, x3;				\
397							\
398	/* t3 ^= t1; */					\
399	vpxor x4, y4, y4;				\
400	vpxor x5, y5, y5;				\
401	vpxor x6, y6, y6;				\
402	vpxor x7, y7, y7;				\
403							\
404	/* t2 ^= t0; */					\
405	vpxor x0, y0, y0;				\
406	vpxor x1, y1, y1;				\
407	vpxor x2, y2, y2;				\
408	vpxor x3, y3, y3;				\
409							\
410	/* t1 ^= t2; */					\
411	vpxor y0, x4, x4;				\
412	vpxor y1, x5, x5;				\
413	vpxor y2, x6, x6;				\
414	vpxor y3, x7, x7;
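/* Word-level diffusion: XORs between the four register groups that the
 * inline comments call t0..t3.  The byte permutation that the
 * reference code performs in aria_diff_byte() needs no data movement
 * here; the callers simply pass the registers in permuted order (see
 * the comment blocks in aria_fe/aria_fo below).
 */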
415
416#define aria_fe(x0, x1, x2, x3,				\
417		x4, x5, x6, x7,				\
418		y0, y1, y2, y3,				\
419		y4, y5, y6, y7,				\
420		mem_tmp, rk, round)			\
421	vpxor y7, y7, y7;				\
422	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
423		      y0, y7, y2, rk, 8, round);	\
424							\
425	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
426		       y0, y1, y2, y3, y4, y5, y6, y7);	\
427							\
428	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
429	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
430	aria_store_state_8way(x0, x1, x2, x3,		\
431			      x4, x5, x6, x7,		\
432			      mem_tmp, 8);		\
433							\
434	aria_load_state_8way(x0, x1, x2, x3,		\
435			     x4, x5, x6, x7,		\
436			     mem_tmp, 0);		\
437	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
438		      y0, y7, y2, rk, 0, round);	\
439							\
440	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
441		       y0, y1, y2, y3, y4, y5, y6, y7);	\
442							\
443	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
444	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
445	aria_store_state_8way(x0, x1, x2, x3,		\
446			      x4, x5, x6, x7,		\
447			      mem_tmp, 0);		\
448	aria_load_state_8way(y0, y1, y2, y3,		\
449			     y4, y5, y6, y7,		\
450			     mem_tmp, 8);		\
451	aria_diff_word(x0, x1, x2, x3,			\
452		       x4, x5, x6, x7,			\
453		       y0, y1, y2, y3,			\
454		       y4, y5, y6, y7);			\
455	/* aria_diff_byte() 				\
456	 * T3 = ABCD -> BADC 				\
457	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 	\
458	 * T0 = ABCD -> CDAB 				\
459	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 	\
460	 * T1 = ABCD -> DCBA 				\
461	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
462	 */						\
463	aria_diff_word(x2, x3, x0, x1,			\
464		       x7, x6, x5, x4,			\
465		       y0, y1, y2, y3,			\
466		       y5, y4, y7, y6);			\
467	aria_store_state_8way(x3, x2, x1, x0,		\
468			      x6, x7, x4, x5,		\
469			      mem_tmp, 0);
470
471#define aria_fo(x0, x1, x2, x3,				\
472		x4, x5, x6, x7,				\
473		y0, y1, y2, y3,				\
474		y4, y5, y6, y7,				\
475		mem_tmp, rk, round)			\
476	vpxor y7, y7, y7;				\
477	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
478		      y0, y7, y2, rk, 8, round);	\
479							\
480	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
481		       y0, y1, y2, y3, y4, y5, y6, y7);	\
482							\
483	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
484	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
485	aria_store_state_8way(x0, x1, x2, x3,		\
486			      x4, x5, x6, x7,		\
487			      mem_tmp, 8);		\
488							\
489	aria_load_state_8way(x0, x1, x2, x3,		\
490			     x4, x5, x6, x7,		\
491			     mem_tmp, 0);		\
492	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
493		      y0, y7, y2, rk, 0, round);	\
494							\
495	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
496		       y0, y1, y2, y3, y4, y5, y6, y7);	\
497							\
498	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
499	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
500	aria_store_state_8way(x0, x1, x2, x3,		\
501			      x4, x5, x6, x7,		\
502			      mem_tmp, 0);		\
503	aria_load_state_8way(y0, y1, y2, y3,		\
504			     y4, y5, y6, y7,		\
505			     mem_tmp, 8);		\
506	aria_diff_word(x0, x1, x2, x3,			\
507		       x4, x5, x6, x7,			\
508		       y0, y1, y2, y3,			\
509		       y4, y5, y6, y7);			\
510	/* aria_diff_byte() 				\
511	 * T1 = ABCD -> BADC 				\
512	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
513	 * T2 = ABCD -> CDAB 				\
514	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 	\
515	 * T3 = ABCD -> DCBA 				\
516	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 	\
517	 */						\
518	aria_diff_word(x0, x1, x2, x3,			\
519		       x5, x4, x7, x6,			\
520		       y2, y3, y0, y1,			\
521		       y7, y6, y5, y4);			\
522	aria_store_state_8way(x3, x2, x1, x0,		\
523			      x6, x7, x4, x5,		\
524			      mem_tmp, 0);
525
526#define aria_ff(x0, x1, x2, x3,				\
527		x4, x5, x6, x7,				\
528		y0, y1, y2, y3,				\
529		y4, y5, y6, y7,				\
530		mem_tmp, rk, round, last_round)		\
531	vpxor y7, y7, y7;				\
532	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
533		      y0, y7, y2, rk, 8, round);	\
534							\
535	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
536		       y0, y1, y2, y3, y4, y5, y6, y7);	\
537							\
538	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
539		      y0, y7, y2, rk, 8, last_round);	\
540							\
541	aria_store_state_8way(x0, x1, x2, x3,		\
542			      x4, x5, x6, x7,		\
543			      mem_tmp, 8);		\
544							\
545	aria_load_state_8way(x0, x1, x2, x3,		\
546			     x4, x5, x6, x7,		\
547			     mem_tmp, 0);		\
548	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
549		      y0, y7, y2, rk, 0, round);	\
550							\
551	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
552		       y0, y1, y2, y3, y4, y5, y6, y7);	\
553							\
554	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
555		      y0, y7, y2, rk, 0, last_round);	\
556							\
557	aria_load_state_8way(y0, y1, y2, y3,		\
558			     y4, y5, y6, y7,		\
559			     mem_tmp, 8);
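/* aria_fo/aria_fe are the odd/even round functions: AddRoundKey,
 * substitution layer (the two S-box orderings are selected by how the
 * x registers are passed to aria_sbox_8way), and the diffusion layer
 * split into aria_diff_m/aria_diff_word plus argument permutation.
 * aria_ff is the final round: AddRoundKey, substitution, and a second
 * AddRoundKey with the last round key, with no diffusion layer.
 */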
560
561#define aria_fe_gfni(x0, x1, x2, x3,			\
562		     x4, x5, x6, x7,			\
563		     y0, y1, y2, y3,			\
564		     y4, y5, y6, y7,			\
565		     mem_tmp, rk, round)		\
566	vpxor y7, y7, y7;				\
567	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
568		      y0, y7, y2, rk, 8, round);	\
569							\
570	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
571			    x6, x7, x4, x5,		\
572			    y0, y1, y2, y3, 		\
573			    y4, y5, y6, y7);		\
574							\
575	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
576	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
577	aria_store_state_8way(x0, x1, x2, x3,		\
578			      x4, x5, x6, x7,		\
579			      mem_tmp, 8);		\
580							\
581	aria_load_state_8way(x0, x1, x2, x3,		\
582			     x4, x5, x6, x7,		\
583			     mem_tmp, 0);		\
584	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
585		      y0, y7, y2, rk, 0, round);	\
586							\
587	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
588			    x6, x7, x4, x5,		\
589			    y0, y1, y2, y3, 		\
590			    y4, y5, y6, y7);		\
591							\
592	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
593	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
594	aria_store_state_8way(x0, x1, x2, x3,		\
595			      x4, x5, x6, x7,		\
596			      mem_tmp, 0);		\
597	aria_load_state_8way(y0, y1, y2, y3,		\
598			     y4, y5, y6, y7,		\
599			     mem_tmp, 8);		\
600	aria_diff_word(x0, x1, x2, x3,			\
601		       x4, x5, x6, x7,			\
602		       y0, y1, y2, y3,			\
603		       y4, y5, y6, y7);			\
604	/* aria_diff_byte() 				\
605	 * T3 = ABCD -> BADC 				\
606	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 	\
607	 * T0 = ABCD -> CDAB 				\
608	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 	\
609	 * T1 = ABCD -> DCBA 				\
610	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
611	 */						\
612	aria_diff_word(x2, x3, x0, x1,			\
613		       x7, x6, x5, x4,			\
614		       y0, y1, y2, y3,			\
615		       y5, y4, y7, y6);			\
616	aria_store_state_8way(x3, x2, x1, x0,		\
617			      x6, x7, x4, x5,		\
618			      mem_tmp, 0);
619
620#define aria_fo_gfni(x0, x1, x2, x3,			\
621		     x4, x5, x6, x7,			\
622		     y0, y1, y2, y3,			\
623		     y4, y5, y6, y7,			\
624		     mem_tmp, rk, round)		\
625	vpxor y7, y7, y7;				\
626	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
627		      y0, y7, y2, rk, 8, round);	\
628							\
629	aria_sbox_8way_gfni(x0, x1, x2, x3, 		\
630			    x4, x5, x6, x7,		\
631			    y0, y1, y2, y3, 		\
632			    y4, y5, y6, y7);		\
633							\
634	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
635	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
636	aria_store_state_8way(x0, x1, x2, x3,		\
637			      x4, x5, x6, x7,		\
638			      mem_tmp, 8);		\
639							\
640	aria_load_state_8way(x0, x1, x2, x3,		\
641			     x4, x5, x6, x7,		\
642			     mem_tmp, 0);		\
643	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
644		      y0, y7, y2, rk, 0, round);	\
645							\
646	aria_sbox_8way_gfni(x0, x1, x2, x3, 		\
647			    x4, x5, x6, x7,		\
648			    y0, y1, y2, y3, 		\
649			    y4, y5, y6, y7);		\
650							\
651	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
652	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
653	aria_store_state_8way(x0, x1, x2, x3,		\
654			      x4, x5, x6, x7,		\
655			      mem_tmp, 0);		\
656	aria_load_state_8way(y0, y1, y2, y3,		\
657			     y4, y5, y6, y7,		\
658			     mem_tmp, 8);		\
659	aria_diff_word(x0, x1, x2, x3,			\
660		       x4, x5, x6, x7,			\
661		       y0, y1, y2, y3,			\
662		       y4, y5, y6, y7);			\
663	/* aria_diff_byte() 				\
664	 * T1 = ABCD -> BADC 				\
665	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
666	 * T2 = ABCD -> CDAB 				\
667	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 	\
668	 * T3 = ABCD -> DCBA 				\
669	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 	\
670	 */						\
671	aria_diff_word(x0, x1, x2, x3,			\
672		       x5, x4, x7, x6,			\
673		       y2, y3, y0, y1,			\
674		       y7, y6, y5, y4);			\
675	aria_store_state_8way(x3, x2, x1, x0,		\
676			      x6, x7, x4, x5,		\
677			      mem_tmp, 0);
678
679#define aria_ff_gfni(x0, x1, x2, x3,			\
680		x4, x5, x6, x7,				\
681		y0, y1, y2, y3,				\
682		y4, y5, y6, y7,				\
683		mem_tmp, rk, round, last_round)		\
684	vpxor y7, y7, y7;				\
685	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
686		      y0, y7, y2, rk, 8, round);	\
687							\
688	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
689			    x6, x7, x4, x5,		\
690			    y0, y1, y2, y3, 		\
691			    y4, y5, y6, y7);		\
692							\
693	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
694		      y0, y7, y2, rk, 8, last_round);	\
695							\
696	aria_store_state_8way(x0, x1, x2, x3,		\
697			      x4, x5, x6, x7,		\
698			      mem_tmp, 8);		\
699							\
700	aria_load_state_8way(x0, x1, x2, x3,		\
701			     x4, x5, x6, x7,		\
702			     mem_tmp, 0);		\
703	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
704		      y0, y7, y2, rk, 0, round);	\
705							\
706	aria_sbox_8way_gfni(x2, x3, x0, x1, 		\
707			    x6, x7, x4, x5,		\
708			    y0, y1, y2, y3, 		\
709			    y4, y5, y6, y7);		\
710							\
711	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
712		      y0, y7, y2, rk, 0, last_round);	\
713							\
714	aria_load_state_8way(y0, y1, y2, y3,		\
715			     y4, y5, y6, y7,		\
716			     mem_tmp, 8);
717
718/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
719.section	.rodata.cst16, "aM", @progbits, 16
720.align 16
721
722#define SHUFB_BYTES(idx) \
723	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
724
725.Lshufb_16x16b:
726	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
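/* SHUFB_BYTES(idx) selects byte idx of each 32-bit word, so this mask
 * groups equal byte positions of the four words next to each other as
 * part of the byte-slicing transposition above.
 */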
727/* For isolating SubBytes from AESENCLAST, inverse shift row */
728.Linv_shift_row:
729	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
730	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
731.Lshift_row:
732	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
733	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
734/* For CTR-mode IV byteswap */
735.Lbswap128_mask:
736	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
737	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
738
739/* AES inverse affine and S2 combined:
740 *      1 1 0 0 0 0 0 1     x0     0
741 *      0 1 0 0 1 0 0 0     x1     0
742 *      1 1 0 0 1 1 1 1     x2     0
743 *      0 1 1 0 1 0 0 1     x3     1
744 *      0 1 0 0 1 1 0 0  *  x4  +  0
745 *      0 1 0 1 1 0 0 0     x5     0
746 *      0 0 0 0 0 1 0 1     x6     0
747 *      1 1 1 0 0 1 1 1     x7     1
748 */
749.Ltf_lo__inv_aff__and__s2:
750	.octa 0x92172DA81A9FA520B2370D883ABF8500
751.Ltf_hi__inv_aff__and__s2:
752	.octa 0x2B15FFC1AF917B45E6D8320C625CB688
753
754/* X2 and AES forward affine combined:
755 *      1 0 1 1 0 0 0 1     x0     0
756 *      0 1 1 1 1 0 1 1     x1     0
757 *      0 0 0 1 1 0 1 0     x2     1
758 *      0 1 0 0 0 1 0 0     x3     0
759 *      0 0 1 1 1 0 1 1  *  x4  +  0
760 *      0 1 0 0 1 0 0 0     x5     0
761 *      1 1 0 1 0 0 1 1     x6     0
762 *      0 1 0 0 1 0 1 0     x7     0
763 */
764.Ltf_lo__x2__and__fwd_aff:
765	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
766.Ltf_hi__x2__and__fwd_aff:
767	.octa 0x3F893781E95FE1576CDA64D2BA0CB204
768
769/* AES affine: */
770#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
771.Ltf_aff_bitmatrix:
772	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
773		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
774		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
775		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
776		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
777		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
778		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
779		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
780	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
781		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
782		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
783		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
784		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
785		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
786		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
787		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
788
789/* AES inverse affine: */
790#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
791.Ltf_inv_bitmatrix:
792	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
793		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
794		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
795		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
796		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
797		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
798		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
799		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
800	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
801		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
802		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
803		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
804		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
805		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
806		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
807		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
808
809/* S2: */
810#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
811.Ltf_s2_bitmatrix:
812	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
813		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
814		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
815		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
816		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
817		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
818		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
819		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
820	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
821		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
822		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
823		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
824		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
825		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
826		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
827		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
828
829/* X2: */
830#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
831.Ltf_x2_bitmatrix:
832	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
833		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
834		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
835		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
836		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
837		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
838		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
839		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
840	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
841		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
842		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
843		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
844		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
845		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
846		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
847		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
848
849/* Identity matrix: */
850.Ltf_id_bitmatrix:
851	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
852		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
853		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
854		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
855		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
856		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
857		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
858		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
859	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
860		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
861		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
862		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
863		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
864		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
865		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
866		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
867
868/* 4-bit mask */
869.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
870.align 4
871.L0f0f0f0f:
872	.long 0x0f0f0f0f
873
874.text
875
876SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
877	/* input:
878	*      %r9: rk
879	*      %rsi: dst
880	*      %rdx: src
881	*      %xmm0..%xmm15: 16 byte-sliced blocks
882	*/
883
884	FRAME_BEGIN
885
886	movq %rsi, %rax;
887	leaq 8 * 16(%rax), %r8;
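	/* The 256 bytes at dst (%rax: bytes 0..7, %r8: bytes 8..15)
	 * double as the byte-slice scratch area used by the round macros.
	 */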
888
889	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
890		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
891		      %xmm15, %rax, %r8);
892	aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
893		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
894		%rax, %r9, 0);
895	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
896		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
897		%xmm15, %rax, %r9, 1);
898	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
899		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
900		%rax, %r9, 2);
901	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
902		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
903		%xmm15, %rax, %r9, 3);
904	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
905		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
906		%rax, %r9, 4);
907	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
908		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
909		%xmm15, %rax, %r9, 5);
910	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
911		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
912		%rax, %r9, 6);
913	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
914		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
915		%xmm15, %rax, %r9, 7);
916	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
917		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
918		%rax, %r9, 8);
919	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
920		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
921		%xmm15, %rax, %r9, 9);
922	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
923		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
924		%rax, %r9, 10);
925	cmpl $12, ARIA_CTX_rounds(CTX);
926	jne .Laria_192;
927	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
928		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
929		%xmm15, %rax, %r9, 11, 12);
930	jmp .Laria_end;
931.Laria_192:
932	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
933		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
934		%xmm15, %rax, %r9, 11);
935	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
936		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
937		%rax, %r9, 12);
938	cmpl $14, ARIA_CTX_rounds(CTX);
939	jne .Laria_256;
940	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
941		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
942		%xmm15, %rax, %r9, 13, 14);
943	jmp .Laria_end;
944.Laria_256:
945	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
946		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
947		%xmm15, %rax, %r9, 13);
948	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
949		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
950		%rax, %r9, 14);
951	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
952		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
953		%xmm15, %rax, %r9, 15, 16);
954.Laria_end:
955	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
956			   %xmm9, %xmm13, %xmm0, %xmm5,
957			   %xmm10, %xmm14, %xmm3, %xmm6,
958			   %xmm11, %xmm15, %xmm2, %xmm7,
959			   (%rax), (%r8));
960
961	FRAME_END
962	RET;
963SYM_FUNC_END(__aria_aesni_avx_crypt_16way)
964
965SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
966	/* input:
967	*      %rdi: ctx, CTX
968	*      %rsi: dst
969	*      %rdx: src
970	*/
971
972	FRAME_BEGIN
973
974	leaq ARIA_CTX_enc_key(CTX), %r9;
975
976	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
977		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
978		     %xmm15, %rdx);
979
980	call __aria_aesni_avx_crypt_16way;
981
982	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
983		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
984		     %xmm15, %rax);
985
986	FRAME_END
987	RET;
988SYM_FUNC_END(aria_aesni_avx_encrypt_16way)
989
990SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
991	/* input:
992	*      %rdi: ctx, CTX
993	*      %rsi: dst
994	*      %rdx: src
995	*/
996
997	FRAME_BEGIN
998
999	leaq ARIA_CTX_dec_key(CTX), %r9;
1000
1001	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1002		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1003		     %xmm15, %rdx);
1004
1005	call __aria_aesni_avx_crypt_16way;
1006
1007	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1008		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1009		     %xmm15, %rax);
1010
1011	FRAME_END
1012	RET;
1013SYM_FUNC_END(aria_aesni_avx_decrypt_16way)
1014
1015SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
1016	/* input:
1017	*      %rdi: ctx
1018	*      %rsi: dst
1019	*      %rdx: src
1020	*      %rcx: keystream
1021	*      %r8: iv (big endian, 128bit)
1022	*/
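	/* Generates 16 consecutive big-endian counter blocks: the result
	 * ends up in %xmm0..%xmm15 (the first eight are staged through the
	 * keystream buffer because all 16 registers are needed), and the
	 * IV incremented by 16 is stored back to (%r8).
	 */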
1023
1024	FRAME_BEGIN
1025	/* load IV and byteswap */
1026	vmovdqu (%r8), %xmm8;
1027
1028	vmovdqa .Lbswap128_mask (%rip), %xmm1;
1029	vpshufb %xmm1, %xmm8, %xmm3; /* be => le */
1030
1031	vpcmpeqd %xmm0, %xmm0, %xmm0;
1032	vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */
1033
1034	/* construct IVs */
1035	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1036	vpshufb %xmm1, %xmm3, %xmm9;
1037	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1038	vpshufb %xmm1, %xmm3, %xmm10;
1039	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1040	vpshufb %xmm1, %xmm3, %xmm11;
1041	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1042	vpshufb %xmm1, %xmm3, %xmm12;
1043	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1044	vpshufb %xmm1, %xmm3, %xmm13;
1045	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1046	vpshufb %xmm1, %xmm3, %xmm14;
1047	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1048	vpshufb %xmm1, %xmm3, %xmm15;
1049	vmovdqu %xmm8, (0 * 16)(%rcx);
1050	vmovdqu %xmm9, (1 * 16)(%rcx);
1051	vmovdqu %xmm10, (2 * 16)(%rcx);
1052	vmovdqu %xmm11, (3 * 16)(%rcx);
1053	vmovdqu %xmm12, (4 * 16)(%rcx);
1054	vmovdqu %xmm13, (5 * 16)(%rcx);
1055	vmovdqu %xmm14, (6 * 16)(%rcx);
1056	vmovdqu %xmm15, (7 * 16)(%rcx);
1057
1058	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1059	vpshufb %xmm1, %xmm3, %xmm8;
1060	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1061	vpshufb %xmm1, %xmm3, %xmm9;
1062	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1063	vpshufb %xmm1, %xmm3, %xmm10;
1064	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1065	vpshufb %xmm1, %xmm3, %xmm11;
1066	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1067	vpshufb %xmm1, %xmm3, %xmm12;
1068	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1069	vpshufb %xmm1, %xmm3, %xmm13;
1070	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1071	vpshufb %xmm1, %xmm3, %xmm14;
1072	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1073	vpshufb %xmm1, %xmm3, %xmm15;
1074	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1075	vpshufb %xmm1, %xmm3, %xmm4;
1076	vmovdqu %xmm4, (%r8);
1077
1078	vmovdqu (0 * 16)(%rcx), %xmm0;
1079	vmovdqu (1 * 16)(%rcx), %xmm1;
1080	vmovdqu (2 * 16)(%rcx), %xmm2;
1081	vmovdqu (3 * 16)(%rcx), %xmm3;
1082	vmovdqu (4 * 16)(%rcx), %xmm4;
1083	vmovdqu (5 * 16)(%rcx), %xmm5;
1084	vmovdqu (6 * 16)(%rcx), %xmm6;
1085	vmovdqu (7 * 16)(%rcx), %xmm7;
1086
1087	FRAME_END
1088	RET;
1089SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)
1090
1091SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
1092	/* input:
1093	*      %rdi: ctx
1094	*      %rsi: dst
1095	*      %rdx: src
1096	*      %rcx: keystream
1097	*      %r8: iv (big endian, 128bit)
1098	*/
1099	FRAME_BEGIN
1100
1101	call __aria_aesni_avx_ctr_gen_keystream_16way;
1102
1103	leaq (%rsi), %r10;
1104	leaq (%rdx), %r11;
1105	leaq (%rcx), %rsi;
1106	leaq (%rcx), %rdx;
1107	leaq ARIA_CTX_enc_key(CTX), %r9;
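	/* dst/src are preserved in %r10/%r11 while %rsi/%rdx are pointed at
	 * the keystream buffer, which __aria_aesni_avx_crypt_16way uses as
	 * its byte-slice scratch area; the encrypted counter blocks are then
	 * XORed with src and written to the real dst below.
	 */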
1108
1109	call __aria_aesni_avx_crypt_16way;
1110
1111	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
1112	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
1113	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
1114	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
1115	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
1116	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
1117	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
1118	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
1119	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
1120	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
1121	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
1122	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
1123	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
1124	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
1125	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
1126	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
1127	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1128		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1129		     %xmm15, %r10);
1130
1131	FRAME_END
1132	RET;
1133SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)
1134
1135SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
1136	/* input:
1137	*      %r9: rk
1138	*      %rsi: dst
1139	*      %rdx: src
1140	*      %xmm0..%xmm15: 16 byte-sliced blocks
1141	*/
1142
1143	FRAME_BEGIN
1144
1145	movq %rsi, %rax;
1146	leaq 8 * 16(%rax), %r8;
1147
1148	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
1149		      %xmm4, %xmm5, %xmm6, %xmm7,
1150		      %xmm8, %xmm9, %xmm10, %xmm11,
1151		      %xmm12, %xmm13, %xmm14,
1152		      %xmm15, %rax, %r8);
1153	aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
1154		     %xmm12, %xmm13, %xmm14, %xmm15,
1155		     %xmm0, %xmm1, %xmm2, %xmm3,
1156		     %xmm4, %xmm5, %xmm6, %xmm7,
1157		     %rax, %r9, 0);
1158	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1159		     %xmm4, %xmm5, %xmm6, %xmm7,
1160		     %xmm8, %xmm9, %xmm10, %xmm11,
1161		     %xmm12, %xmm13, %xmm14,
1162		     %xmm15, %rax, %r9, 1);
1163	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1164		     %xmm12, %xmm13, %xmm14, %xmm15,
1165		     %xmm0, %xmm1, %xmm2, %xmm3,
1166		     %xmm4, %xmm5, %xmm6, %xmm7,
1167		     %rax, %r9, 2);
1168	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1169		     %xmm4, %xmm5, %xmm6, %xmm7,
1170		     %xmm8, %xmm9, %xmm10, %xmm11,
1171		     %xmm12, %xmm13, %xmm14,
1172		     %xmm15, %rax, %r9, 3);
1173	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1174		     %xmm12, %xmm13, %xmm14, %xmm15,
1175		     %xmm0, %xmm1, %xmm2, %xmm3,
1176		     %xmm4, %xmm5, %xmm6, %xmm7,
1177		     %rax, %r9, 4);
1178	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1179		     %xmm4, %xmm5, %xmm6, %xmm7,
1180		     %xmm8, %xmm9, %xmm10, %xmm11,
1181		     %xmm12, %xmm13, %xmm14,
1182		     %xmm15, %rax, %r9, 5);
1183	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1184		     %xmm12, %xmm13, %xmm14, %xmm15,
1185		     %xmm0, %xmm1, %xmm2, %xmm3,
1186		     %xmm4, %xmm5, %xmm6, %xmm7,
1187		     %rax, %r9, 6);
1188	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1189		     %xmm4, %xmm5, %xmm6, %xmm7,
1190		     %xmm8, %xmm9, %xmm10, %xmm11,
1191		     %xmm12, %xmm13, %xmm14,
1192		     %xmm15, %rax, %r9, 7);
1193	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1194		     %xmm12, %xmm13, %xmm14, %xmm15,
1195		     %xmm0, %xmm1, %xmm2, %xmm3,
1196		     %xmm4, %xmm5, %xmm6, %xmm7,
1197		     %rax, %r9, 8);
1198	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1199		     %xmm4, %xmm5, %xmm6, %xmm7,
1200		     %xmm8, %xmm9, %xmm10, %xmm11,
1201		     %xmm12, %xmm13, %xmm14,
1202		     %xmm15, %rax, %r9, 9);
1203	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1204		     %xmm12, %xmm13, %xmm14, %xmm15,
1205		     %xmm0, %xmm1, %xmm2, %xmm3,
1206		     %xmm4, %xmm5, %xmm6, %xmm7,
1207		     %rax, %r9, 10);
1208	cmpl $12, ARIA_CTX_rounds(CTX);
1209	jne .Laria_gfni_192;
1210	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1211		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1212		%xmm15, %rax, %r9, 11, 12);
1213	jmp .Laria_gfni_end;
1214.Laria_gfni_192:
1215	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1216		     %xmm4, %xmm5, %xmm6, %xmm7,
1217		     %xmm8, %xmm9, %xmm10, %xmm11,
1218		     %xmm12, %xmm13, %xmm14,
1219		     %xmm15, %rax, %r9, 11);
1220	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1221		     %xmm12, %xmm13, %xmm14, %xmm15,
1222		     %xmm0, %xmm1, %xmm2, %xmm3,
1223		     %xmm4, %xmm5, %xmm6, %xmm7,
1224		     %rax, %r9, 12);
1225	cmpl $14, ARIA_CTX_rounds(CTX);
1226	jne .Laria_gfni_256;
1227	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1228		     %xmm4, %xmm5, %xmm6, %xmm7,
1229		     %xmm8, %xmm9, %xmm10, %xmm11,
1230		     %xmm12, %xmm13, %xmm14,
1231		     %xmm15, %rax, %r9, 13, 14);
1232	jmp .Laria_gfni_end;
1233.Laria_gfni_256:
1234	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1235		     %xmm4, %xmm5, %xmm6, %xmm7,
1236		     %xmm8, %xmm9, %xmm10, %xmm11,
1237		     %xmm12, %xmm13, %xmm14,
1238		     %xmm15, %rax, %r9, 13);
1239	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1240		     %xmm12, %xmm13, %xmm14, %xmm15,
1241		     %xmm0, %xmm1, %xmm2, %xmm3,
1242		     %xmm4, %xmm5, %xmm6, %xmm7,
1243		     %rax, %r9, 14);
1244	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1245		     %xmm4, %xmm5, %xmm6, %xmm7,
1246		     %xmm8, %xmm9, %xmm10, %xmm11,
1247		     %xmm12, %xmm13, %xmm14,
1248		     %xmm15, %rax, %r9, 15, 16);
1249.Laria_gfni_end:
1250	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
1251			   %xmm9, %xmm13, %xmm0, %xmm5,
1252			   %xmm10, %xmm14, %xmm3, %xmm6,
1253			   %xmm11, %xmm15, %xmm2, %xmm7,
1254			   (%rax), (%r8));
1255
1256	FRAME_END
1257	RET;
1258SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)
1259
1260SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
1261	/* input:
1262	*      %rdi: ctx, CTX
1263	*      %rsi: dst
1264	*      %rdx: src
1265	*/
1266
1267	FRAME_BEGIN
1268
1269	leaq ARIA_CTX_enc_key(CTX), %r9;
1270
1271	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1272		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1273		     %xmm15, %rdx);
1274
1275	call __aria_aesni_avx_gfni_crypt_16way;
1276
1277	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1278		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1279		     %xmm15, %rax);
1280
1281	FRAME_END
1282	RET;
1283SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)
1284
1285SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
1286	/* input:
1287	*      %rdi: ctx, CTX
1288	*      %rsi: dst
1289	*      %rdx: src
1290	*/
1291
1292	FRAME_BEGIN
1293
1294	leaq ARIA_CTX_dec_key(CTX), %r9;
1295
1296	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1297		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1298		     %xmm15, %rdx);
1299
1300	call __aria_aesni_avx_gfni_crypt_16way;
1301
1302	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1303		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1304		     %xmm15, %rax);
1305
1306	FRAME_END
1307	RET;
1308SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)
1309
1310SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
1311	/* input:
1312	*      %rdi: ctx
1313	*      %rsi: dst
1314	*      %rdx: src
1315	*      %rcx: keystream
1316	*      %r8: iv (big endian, 128bit)
1317	*/
1318	FRAME_BEGIN
1319
1320	call __aria_aesni_avx_ctr_gen_keystream_16way
1321
1322	leaq (%rsi), %r10;
1323	leaq (%rdx), %r11;
1324	leaq (%rcx), %rsi;
1325	leaq (%rcx), %rdx;
1326	leaq ARIA_CTX_enc_key(CTX), %r9;
1327
1328	call __aria_aesni_avx_gfni_crypt_16way;
1329
1330	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
1331	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
1332	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
1333	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
1334	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
1335	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
1336	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
1337	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
1338	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
1339	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
1340	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
1341	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
1342	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
1343	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
1344	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
1345	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
1346	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1347		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1348		     %xmm15, %r10);
1349
1350	FRAME_END
1351	RET;
1352SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)
1353