xref: /linux/arch/x86/crypto/aria-aesni-avx-asm_64.S (revision 27dfc44e1ba30d2d49675e21918bf4b3b3b59fa6)
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * ARIA Cipher 16-way parallel algorithm (AVX)
4 *
5 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
6 *
7 */
8
9#include <linux/linkage.h>
10#include <asm/frame.h>
11
/*
 * Byte offsets into struct aria_ctx for the fields this file touches.
 * enc_key/dec_key are used as the base of the round keys, rounds is
 * compared against 12 and 14 to pick the number of rounds to run.
 */
#define enc_key	0
#define dec_key	272
#define rounds	544

/* register macros: CTX is the register holding the ctx pointer */
#define CTX	%rdi
19
20
/*
 * BV8 - pack eight single-bit values into one byte.
 * a0 lands in bit 0 (least significant), a7 in bit 7.  Only the lowest
 * bit of each argument is used.
 */
#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)		\
	( (((a7) & 1) << 7) |				\
	  (((a6) & 1) << 6) |				\
	  (((a5) & 1) << 5) |				\
	  (((a4) & 1) << 4) |				\
	  (((a3) & 1) << 3) |				\
	  (((a2) & 1) << 2) |				\
	  (((a1) & 1) << 1) |				\
	  (((a0) & 1) << 0) )
30
/*
 * BM8X8 - pack eight row bytes into one 64-bit value.
 * l0 is placed in the most significant byte, l7 in the least
 * significant byte.
 */
#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)		\
	( ((l0) << 56) |				\
	  ((l1) << 48) |				\
	  ((l2) << 40) |				\
	  ((l3) << 32) |				\
	  ((l4) << 24) |				\
	  ((l5) << 16) |				\
	  ((l6) << 8) |					\
	  ((l7) << 0) )
40
/*
 * inc_le128 - add 1 to a 128-bit little-endian counter.
 *   ctr:     counter register, updated in place
 *   low_m1:  register with the low qword all-ones and the high qword zero
 *   carry:   scratch register
 */
#define inc_le128(ctr, low_m1, carry)			\
	/* carry.lo = all-ones iff ctr.lo is about to wrap */ \
	vpcmpeqq low_m1, ctr, carry;			\
	/* move that flag up so it lines up with ctr.hi */ \
	vpslldq $8, carry, carry;			\
	/* ctr.lo -= -1, ctr.hi -= 0 */			\
	vpsubq low_m1, ctr, ctr;			\
	/* ctr.hi -= -1 when the low half wrapped */	\
	vpsubq carry, ctr, ctr;
46
/*
 * filter_8bit - per-byte table transform built from two 4-bit lookups.
 *   v:        bytes to transform, replaced by the result
 *   tbl_lo:   16-entry table indexed by the low nibble of each byte
 *   tbl_hi:   16-entry table indexed by the high nibble of each byte
 *   nib_mask: mask selecting the low nibble of every byte
 *   lo:       scratch register
 * Result: v = tbl_lo[v & nib_mask] ^ tbl_hi[v >> 4], byte by byte.
 */
#define filter_8bit(v, tbl_lo, tbl_hi, nib_mask, lo)	\
	/* split each byte into its two nibbles */	\
	vpand v, nib_mask, lo;				\
	vpandn v, nib_mask, v;				\
	/* low-nibble lookup */				\
	vpshufb lo, tbl_lo, lo;				\
	/* low nibbles are already cleared, so the dword shift cannot	\
	 * leak bits across byte boundaries */		\
	vpsrld $4, v, v;				\
	/* high-nibble lookup, then combine */		\
	vpshufb v, tbl_hi, v;				\
	vpxor lo, v, v;
55
/*
 * transpose_4x4 - transpose a 4x4 matrix of 32-bit elements held in
 * r0..r3 (one row per register), in place.  ta and tb are scratch.
 */
#define transpose_4x4(r0, r1, r2, r3, ta, tb)		\
	/* interleave dwords of rows 2/3 */		\
	vpunpckldq r3, r2, ta;				\
	vpunpckhdq r3, r2, r2;				\
	/* interleave dwords of rows 0/1 */		\
	vpunpckhdq r1, r0, tb;				\
	vpunpckldq r1, r0, r0;				\
							\
	/* combine the high halves into rows 3 and 2 */	\
	vpunpckhqdq r2, tb, r3;				\
	vpunpcklqdq r2, tb, r2;				\
	/* combine the low halves into rows 1 and 0 */	\
	vpunpckhqdq ta, r0, r1;				\
	vpunpcklqdq ta, r0, r0;
68
/*
 * byteslice_16x16b - rearrange sixteen 16-byte vectors using 4x4 dword
 * transposes, a byte shuffle with .Lshufb_16x16b, and a second round of
 * 4x4 dword transposes.
 *
 *   a0..d3:   the sixteen vector registers, transformed in place
 *   st0, st1: two 16-byte memory operands used as spill slots, because
 *             all sixteen registers are live and transpose_4x4 needs two
 *             scratch registers
 *
 * The shuffle mask is loaded RIP-relative so the code needs no absolute
 * relocation.
 */
#define byteslice_16x16b(a0, b0, c0, d0,		\
			 a1, b1, c1, d1,		\
			 a2, b2, c2, d2,		\
			 a3, b3, c3, d3,		\
			 st0, st1)			\
	/* spill d2/d3, use them as scratch */		\
	vmovdqu d2, st0;				\
	vmovdqu d3, st1;				\
	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	/* spill a0/a1, use them as scratch */		\
	vmovdqu a0, st0;				\
	vmovdqu a1, st1;				\
	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
							\
	/* a0 holds the shuffle mask; the real a0 stays in st0 for now */ \
	vmovdqu .Lshufb_16x16b(%rip), a0;		\
	vmovdqu st1, a1;				\
	vpshufb a0, a2, a2;				\
	vpshufb a0, a3, a3;				\
	vpshufb a0, b0, b0;				\
	vpshufb a0, b1, b1;				\
	vpshufb a0, b2, b2;				\
	vpshufb a0, b3, b3;				\
	vpshufb a0, a1, a1;				\
	vpshufb a0, c0, c0;				\
	vpshufb a0, c1, c1;				\
	vpshufb a0, c2, c2;				\
	vpshufb a0, c3, c3;				\
	vpshufb a0, d0, d0;				\
	vpshufb a0, d1, d1;				\
	vpshufb a0, d2, d2;				\
	vpshufb a0, d3, d3;				\
	/* park d3, fetch the saved a0 and shuffle it last (this	\
	 * overwrites the mask), then park d2 */	\
	vmovdqu d3, st1;				\
	vmovdqu st0, d3;				\
	vpshufb a0, d3, a0;				\
	vmovdqu d2, st0;				\
							\
	transpose_4x4(a0, b0, c0, d0, d2, d3);		\
	transpose_4x4(a1, b1, c1, d1, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	/* spill b0/b1, use them as scratch */		\
	vmovdqu b0, st0;				\
	vmovdqu b1, st1;				\
	transpose_4x4(a2, b2, c2, d2, b0, b1);		\
	transpose_4x4(a3, b3, c3, d3, b0, b1);		\
	vmovdqu st0, b0;				\
	vmovdqu st1, b1;				\
	/* does not adjust output bytes inside vectors */
120
/*
 * debyteslice_16x16b - counterpart of byteslice_16x16b used on the way
 * out.  Same structure (4x4 dword transposes, byte shuffle with
 * .Lshufb_16x16b, 4x4 dword transposes); only the register grouping of
 * the final transposes differs (c, d, a, b instead of a, b, c, d).
 *
 *   a0..d3:   the sixteen vector registers, transformed in place
 *   st0, st1: two 16-byte memory operands used as spill slots
 *
 * The shuffle mask is loaded RIP-relative so the code needs no absolute
 * relocation.
 */
#define debyteslice_16x16b(a0, b0, c0, d0,		\
			   a1, b1, c1, d1,		\
			   a2, b2, c2, d2,		\
			   a3, b3, c3, d3,		\
			   st0, st1)			\
	/* spill d2/d3, use them as scratch */		\
	vmovdqu d2, st0;				\
	vmovdqu d3, st1;				\
	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	/* spill a0/a1, use them as scratch */		\
	vmovdqu a0, st0;				\
	vmovdqu a1, st1;				\
	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
							\
	/* a0 holds the shuffle mask; the real a0 stays in st0 for now */ \
	vmovdqu .Lshufb_16x16b(%rip), a0;		\
	vmovdqu st1, a1;				\
	vpshufb a0, a2, a2;				\
	vpshufb a0, a3, a3;				\
	vpshufb a0, b0, b0;				\
	vpshufb a0, b1, b1;				\
	vpshufb a0, b2, b2;				\
	vpshufb a0, b3, b3;				\
	vpshufb a0, a1, a1;				\
	vpshufb a0, c0, c0;				\
	vpshufb a0, c1, c1;				\
	vpshufb a0, c2, c2;				\
	vpshufb a0, c3, c3;				\
	vpshufb a0, d0, d0;				\
	vpshufb a0, d1, d1;				\
	vpshufb a0, d2, d2;				\
	vpshufb a0, d3, d3;				\
	/* park d3, fetch the saved a0 and shuffle it last (this	\
	 * overwrites the mask), then park d2 */	\
	vmovdqu d3, st1;				\
	vmovdqu st0, d3;				\
	vpshufb a0, d3, a0;				\
	vmovdqu d2, st0;				\
							\
	transpose_4x4(c0, d0, a0, b0, d2, d3);		\
	transpose_4x4(c1, d1, a1, b1, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	/* spill b0/b1, use them as scratch */		\
	vmovdqu b0, st0;				\
	vmovdqu b1, st1;				\
	transpose_4x4(c2, d2, a2, b2, b0, b1);		\
	transpose_4x4(c3, d3, a3, b3, b0, b1);		\
	vmovdqu st0, b0;				\
	vmovdqu st1, b1;				\
	/* does not adjust output bytes inside vectors */
172
/*
 * inpack16_pre - load sixteen consecutive 16-byte blocks from src into
 * r0..r15 (unaligned loads).  No key material is applied here.
 */
#define inpack16_pre(r0, r1, r2, r3,			\
		     r4, r5, r6, r7,			\
		     r8, r9, r10, r11,			\
		     r12, r13, r14, r15,		\
		     src)				\
	vmovdqu 0 * 16(src), r0;			\
	vmovdqu 1 * 16(src), r1;			\
	vmovdqu 2 * 16(src), r2;			\
	vmovdqu 3 * 16(src), r3;			\
	vmovdqu 4 * 16(src), r4;			\
	vmovdqu 5 * 16(src), r5;			\
	vmovdqu 6 * 16(src), r6;			\
	vmovdqu 7 * 16(src), r7;			\
	vmovdqu 8 * 16(src), r8;			\
	vmovdqu 9 * 16(src), r9;			\
	vmovdqu 10 * 16(src), r10;			\
	vmovdqu 11 * 16(src), r11;			\
	vmovdqu 12 * 16(src), r12;			\
	vmovdqu 13 * 16(src), r13;			\
	vmovdqu 14 * 16(src), r14;			\
	vmovdqu 15 * 16(src), r15;
195
/*
 * inpack16_post - byteslice the sixteen loaded blocks, then store them
 * to temporary memory: x0..x7 go to the eight slots at mem_ab, y0..y7
 * to the eight slots at mem_cd.  The first slot of each area doubles as
 * the spill slot for byteslice_16x16b before it is overwritten.
 */
#define inpack16_post(x0, x1, x2, x3,			\
		      x4, x5, x6, x7,			\
		      y0, y1, y2, y3,			\
		      y4, y5, y6, y7,			\
		      mem_ab, mem_cd)			\
	byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, \
			 y0, y1, y2, y3, y4, y5, y6, y7, \
			 (mem_ab), (mem_cd));		\
							\
	vmovdqu x0, (0 * 16)(mem_ab);			\
	vmovdqu x1, (1 * 16)(mem_ab);			\
	vmovdqu x2, (2 * 16)(mem_ab);			\
	vmovdqu x3, (3 * 16)(mem_ab);			\
	vmovdqu x4, (4 * 16)(mem_ab);			\
	vmovdqu x5, (5 * 16)(mem_ab);			\
	vmovdqu x6, (6 * 16)(mem_ab);			\
	vmovdqu x7, (7 * 16)(mem_ab);			\
	vmovdqu y0, (0 * 16)(mem_cd);			\
	vmovdqu y1, (1 * 16)(mem_cd);			\
	vmovdqu y2, (2 * 16)(mem_cd);			\
	vmovdqu y3, (3 * 16)(mem_cd);			\
	vmovdqu y4, (4 * 16)(mem_cd);			\
	vmovdqu y5, (5 * 16)(mem_cd);			\
	vmovdqu y6, (6 * 16)(mem_cd);			\
	vmovdqu y7, (7 * 16)(mem_cd);
224
/*
 * write_output - store sixteen result vectors to sixteen consecutive
 * 16-byte slots starting at mem (unaligned stores): x0..x7 fill slots
 * 0..7, y0..y7 fill slots 8..15.
 *
 * The final line deliberately has no line continuation, so the macro
 * ends here and cannot absorb whatever line follows it.
 */
#define write_output(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem)				\
	vmovdqu x0, 0 * 16(mem);			\
	vmovdqu x1, 1 * 16(mem);			\
	vmovdqu x2, 2 * 16(mem);			\
	vmovdqu x3, 3 * 16(mem);			\
	vmovdqu x4, 4 * 16(mem);			\
	vmovdqu x5, 5 * 16(mem);			\
	vmovdqu x6, 6 * 16(mem);			\
	vmovdqu x7, 7 * 16(mem);			\
	vmovdqu y0, 8 * 16(mem);			\
	vmovdqu y1, 9 * 16(mem);			\
	vmovdqu y2, 10 * 16(mem);			\
	vmovdqu y3, 11 * 16(mem);			\
	vmovdqu y4, 12 * 16(mem);			\
	vmovdqu y5, 13 * 16(mem);			\
	vmovdqu y6, 14 * 16(mem);			\
	vmovdqu y7, 15 * 16(mem);
246
/*
 * aria_store_state_8way - store eight vectors r0..r7 to the 16-byte
 * slots slot .. slot+7 of the buffer at base.
 */
#define aria_store_state_8way(r0, r1, r2, r3,		\
			      r4, r5, r6, r7,		\
			      base, slot)		\
	vmovdqu r0, (((slot) + 0) * 16)(base);		\
	vmovdqu r1, (((slot) + 1) * 16)(base);		\
	vmovdqu r2, (((slot) + 2) * 16)(base);		\
	vmovdqu r3, (((slot) + 3) * 16)(base);		\
	vmovdqu r4, (((slot) + 4) * 16)(base);		\
	vmovdqu r5, (((slot) + 5) * 16)(base);		\
	vmovdqu r6, (((slot) + 6) * 16)(base);		\
	vmovdqu r7, (((slot) + 7) * 16)(base);
258
/*
 * aria_load_state_8way - load eight vectors r0..r7 from the 16-byte
 * slots slot .. slot+7 of the buffer at base.
 */
#define aria_load_state_8way(r0, r1, r2, r3,		\
			     r4, r5, r6, r7,		\
			     base, slot)		\
	vmovdqu (((slot) + 0) * 16)(base), r0;		\
	vmovdqu (((slot) + 1) * 16)(base), r1;		\
	vmovdqu (((slot) + 2) * 16)(base), r2;		\
	vmovdqu (((slot) + 3) * 16)(base), r3;		\
	vmovdqu (((slot) + 4) * 16)(base), r4;		\
	vmovdqu (((slot) + 5) * 16)(base), r5;		\
	vmovdqu (((slot) + 6) * 16)(base), r6;		\
	vmovdqu (((slot) + 7) * 16)(base), r7;
270
/*
 * aria_ark_xor_byte - XOR one round-key byte into every byte of x.
 *   x:   state register, updated in place
 *   t0:  scratch register
 *   rk:  round-key base register
 *   off: byte offset of the key byte from rk
 *
 * Uses AVX-only instructions: the byte is inserted into lane 0 of t0
 * (the other lanes are don't-care), doubled to a word, doubled to a
 * dword, and the dword is then replicated across the register.
 * vpbroadcastb would do this in one step but needs AVX2.
 */
#define aria_ark_xor_byte(x, t0, rk, off)		\
	vpinsrb $0, (off)(rk), t0, t0;			\
	vpunpcklbw t0, t0, t0;				\
	vpunpcklwd t0, t0, t0;				\
	vpshufd $0, t0, t0;				\
	vpxor t0, x, x;

/*
 * aria_ark_8way - AddRoundKey for eight byte-sliced state registers.
 *   x0..x7: state registers, updated in place
 *   t0:     scratch register
 *   rk:     round-key base register
 *   idx:    byte offset inside the 16-byte round key (callers use 0 or 8)
 *   round:  round number; the key for a round starts at rk + round * 16
 *
 * Within each group of four key bytes the order is reversed:
 * x0..x3 take key bytes idx+3..idx+0 and x4..x7 take idx+7..idx+4.
 */
#define aria_ark_8way(x0, x1, x2, x3,			\
		      x4, x5, x6, x7,			\
		      t0, rk, idx, round)		\
	/* AddRoundKey */				\
	aria_ark_xor_byte(x0, t0, rk, ((round) * 16) + (idx) + 3); \
	aria_ark_xor_byte(x1, t0, rk, ((round) * 16) + (idx) + 2); \
	aria_ark_xor_byte(x2, t0, rk, ((round) * 16) + (idx) + 1); \
	aria_ark_xor_byte(x3, t0, rk, ((round) * 16) + (idx) + 0); \
	aria_ark_xor_byte(x4, t0, rk, ((round) * 16) + (idx) + 7); \
	aria_ark_xor_byte(x5, t0, rk, ((round) * 16) + (idx) + 6); \
	aria_ark_xor_byte(x6, t0, rk, ((round) * 16) + (idx) + 5); \
	aria_ark_xor_byte(x7, t0, rk, ((round) * 16) + (idx) + 4);
291
/*
 * aria_sbox_8way_gfni - substitution layer for eight byte-sliced state
 * registers using the GFNI affine instructions.
 *   x0..x7: state registers, updated in place
 *   t0..t4: scratch registers holding the bit matrices
 *   t5..t7: unused; kept so the argument list matches aria_sbox_8way
 *
 * Per register pair:
 *   x0, x4: vgf2p8affineinvqb with the AES affine matrix/constant
 *   x1, x5: vgf2p8affineinvqb with the S2 matrix/constant
 *   x2, x6: vgf2p8affineqb with the AES inverse affine matrix/constant,
 *           then vgf2p8affineinvqb with the identity matrix
 *   x3, x7: vgf2p8affineqb with the X2 matrix/constant, then
 *           vgf2p8affineinvqb with the identity matrix
 *
 * Each 8-byte matrix is replicated into both qwords with vmovddup, which
 * is available with plain AVX (vpbroadcastq would need AVX2).  Constants
 * are addressed RIP-relative.
 */
#define aria_sbox_8way_gfni(x0, x1, x2, x3,		\
			    x4, x5, x6, x7,		\
			    t0, t1, t2, t3,		\
			    t4, t5, t6, t7)		\
	vmovddup .Ltf_s2_bitmatrix(%rip), t0;		\
	vmovddup .Ltf_inv_bitmatrix(%rip), t1;		\
	vmovddup .Ltf_id_bitmatrix(%rip), t2;		\
	vmovddup .Ltf_aff_bitmatrix(%rip), t3;		\
	vmovddup .Ltf_x2_bitmatrix(%rip), t4;		\
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6;	\
	vgf2p8affineinvqb $0, t2, x2, x2;		\
	vgf2p8affineinvqb $0, t2, x6, x6;		\
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;	\
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;	\
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3;	\
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7;	\
	vgf2p8affineinvqb $0, t2, x3, x3;		\
	vgf2p8affineinvqb $0, t2, x7, x7;
313
/*
 * aria_sbox_8way - substitution layer for eight byte-sliced state
 * registers, built from AES-NI last-round instructions plus nibble
 * table lookups.
 *   x0..x7: state registers, updated in place
 *   t0..t7: scratch registers
 *
 * vaesenclast/vaesdeclast are run with an all-zero round key (t7).  The
 * row shift they also perform is cancelled with a vpshufb.
 *   x0, x4: vaesenclast, then .Linv_shift_row
 *   x1, x5: vaesenclast, .Linv_shift_row, then filter_8bit with the
 *           "inv_aff and s2" tables
 *   x2, x6: vaesdeclast, then .Lshift_row
 *   x3, x7: .Lshift_row, filter_8bit with the "x2 and fwd_aff" tables,
 *           then vaesdeclast
 *
 * The 4-bit mask is replicated with vbroadcastss, which is available
 * with plain AVX (vpbroadcastd would need AVX2).  Constants are
 * addressed RIP-relative.
 */
#define aria_sbox_8way(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       t0, t1, t2, t3,			\
		       t4, t5, t6, t7)			\
	vpxor t7, t7, t7;				\
	vmovdqa .Linv_shift_row(%rip), t0;		\
	vmovdqa .Lshift_row(%rip), t1;			\
	vbroadcastss .L0f0f0f0f(%rip), t6;		\
	vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2;	\
	vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3;	\
	vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4;	\
	vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5;	\
							\
	vaesenclast t7, x0, x0;				\
	vaesenclast t7, x4, x4;				\
	vaesenclast t7, x1, x1;				\
	vaesenclast t7, x5, x5;				\
	vaesdeclast t7, x2, x2;				\
	vaesdeclast t7, x6, x6;				\
							\
	/* undo the row shift done by vaesenclast */	\
	vpshufb t0, x0, x0;				\
	vpshufb t0, x4, x4;				\
	vpshufb t0, x1, x1;				\
	vpshufb t0, x5, x5;				\
	/* x3/x7: applied ahead of the vaesdeclast below;		\
	 * x2/x6: undo the row shift done by vaesdeclast */		\
	vpshufb t1, x3, x3;				\
	vpshufb t1, x7, x7;				\
	vpshufb t1, x2, x2;				\
	vpshufb t1, x6, x6;				\
							\
	/* affine transformation for S2 */		\
	filter_8bit(x1, t2, t3, t6, t0);		\
	filter_8bit(x5, t2, t3, t6, t0);		\
							\
	/* affine transformation for X2 */		\
	filter_8bit(x3, t4, t5, t6, t0);		\
	filter_8bit(x7, t4, t5, t6, t0);		\
	vaesdeclast t7, x3, x3;				\
	vaesdeclast t7, x7, x7;
355
/*
 * aria_diff_m - mix four byte-sliced registers of one 32-bit word.
 * With the original values X0..X3, the results are:
 *   x0 = X0 ^ X1 ^ X2
 *   x1 = X0 ^ X1 ^ X3
 *   x2 = X0 ^ X2 ^ X3
 *   x3 = X1 ^ X2 ^ X3
 * t0..t3 are scratch.
 */
#define aria_diff_m(x0, x1, x2, x3,			\
		    t0, t1, t2, t3)			\
	/* T = rotr32(X, 8); */				\
	/* X ^= T */					\
	vpxor x3, x0, t0;				\
	vpxor x0, x1, t1;				\
	vpxor x1, x2, t2;				\
	vpxor x2, x3, t3;				\
	/* X = T ^ rotr(X, 16); */			\
	vpxor t3, x1, t3;				\
	vpxor x0, t2, x0;				\
	vpxor x2, t0, x2;				\
	vpxor x3, t1, x1;				\
	/* x3 is written last: its old value was still needed above */ \
	vmovdqu t3, x3;
370
/*
 * aria_diff_word - word-level XOR mixing of the four state words.
 * Each word is held as four byte-sliced registers:
 *   T0 = a0..a3, T1 = b0..b3, T2 = c0..c3, T3 = d0..d3
 * The steps run in this order, all in place:
 *   T1 ^= T2; T2 ^= T3; T0 ^= T1; T3 ^= T1; T2 ^= T0; T1 ^= T2
 */
#define aria_diff_word(a0, a1, a2, a3,			\
		       b0, b1, b2, b3,			\
		       c0, c1, c2, c3,			\
		       d0, d1, d2, d3)			\
	/* T1 ^= T2 */					\
	vpxor c0, b0, b0;				\
	vpxor c1, b1, b1;				\
	vpxor c2, b2, b2;				\
	vpxor c3, b3, b3;				\
							\
	/* T2 ^= T3 */					\
	vpxor d0, c0, c0;				\
	vpxor d1, c1, c1;				\
	vpxor d2, c2, c2;				\
	vpxor d3, c3, c3;				\
							\
	/* T0 ^= T1 */					\
	vpxor b0, a0, a0;				\
	vpxor b1, a1, a1;				\
	vpxor b2, a2, a2;				\
	vpxor b3, a3, a3;				\
							\
	/* T3 ^= T1 */					\
	vpxor b0, d0, d0;				\
	vpxor b1, d1, d1;				\
	vpxor b2, d2, d2;				\
	vpxor b3, d3, d3;				\
							\
	/* T2 ^= T0 */					\
	vpxor a0, c0, c0;				\
	vpxor a1, c1, c1;				\
	vpxor a2, c2, c2;				\
	vpxor a3, c3, c3;				\
							\
	/* T1 ^= T2 */					\
	vpxor c0, b0, b0;				\
	vpxor c1, b1, b1;				\
	vpxor c2, b2, b2;				\
	vpxor c3, b3, b3;
410
/*
 * aria_fe - one ARIA round of the "fe" kind (AES-NI S-box path).
 *   x0..x7:  on entry hold the half of the state that belongs to key
 *            bytes 8..15; also the working registers
 *   y0..y7:  scratch during the S-box, later the other half of the state
 *   mem_tmp: 16-slot temporary buffer holding the byte-sliced state
 *   rk:      round-key base register
 *   round:   round number
 *
 * Each half gets AddRoundKey, the S-box layer (applied with the register
 * pairs swapped: x2, x3, x0, x1, ...) and aria_diff_m; the halves are
 * then combined by aria_diff_word / byte permutation / aria_diff_word.
 * Slots 0..7 of mem_tmp are rewritten at the end.
 */
#define aria_fe(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round)			\
	/* half held in registers: key bytes 8..15 */	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
			      mem_tmp, 8);		\
							\
	/* half kept in slots 0..7: key bytes 0..7 */	\
	aria_load_state_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
			      mem_tmp, 0);		\
							\
	/* bring the first half back into y0..y7 and mix the words */ \
	aria_load_state_8way(y0, y1, y2, y3, y4, y5, y6, y7, \
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
	/* aria_diff_byte()				\
	 * T3 = ABCD -> BADC				\
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
	 * T0 = ABCD -> CDAB				\
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
	 * T1 = ABCD -> DCBA				\
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
	 */						\
	aria_diff_word(x2, x3, x0, x1, x7, x6, x5, x4,	\
		       y0, y1, y2, y3, y5, y4, y7, y6);	\
	aria_store_state_8way(x3, x2, x1, x0, x6, x7, x4, x5, \
			      mem_tmp, 0);
464
/*
 * aria_fo - one ARIA round of the "fo" kind (AES-NI S-box path).
 *   x0..x7:  on entry hold the half of the state that belongs to key
 *            bytes 8..15; also the working registers
 *   y0..y7:  scratch during the S-box, later the other half of the state
 *   mem_tmp: 16-slot temporary buffer holding the byte-sliced state
 *   rk:      round-key base register
 *   round:   round number
 *
 * Same structure as aria_fe, but the S-box layer is applied with the
 * registers in natural order and the byte permutation before the second
 * aria_diff_word differs.  Slots 0..7 of mem_tmp are rewritten at the
 * end.
 */
#define aria_fo(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round)			\
	/* half held in registers: key bytes 8..15 */	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
			      mem_tmp, 8);		\
							\
	/* half kept in slots 0..7: key bytes 0..7 */	\
	aria_load_state_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
			      mem_tmp, 0);		\
							\
	/* bring the first half back into y0..y7 and mix the words */ \
	aria_load_state_8way(y0, y1, y2, y3, y4, y5, y6, y7, \
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
	/* aria_diff_byte()				\
	 * T1 = ABCD -> BADC				\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB				\
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
	 * T3 = ABCD -> DCBA				\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
	 */						\
	aria_diff_word(x0, x1, x2, x3, x5, x4, x7, x6,	\
		       y2, y3, y0, y1, y7, y6, y5, y4);	\
	aria_store_state_8way(x3, x2, x1, x0, x6, x7, x4, x5, \
			      mem_tmp, 0);
518
/*
 * aria_ff - final ARIA round (AES-NI S-box path).
 *   x0..x7:     on entry hold the half of the state that belongs to key
 *               bytes 8..15; on exit hold the half for key bytes 0..7
 *   y0..y7:     scratch during the S-box; on exit hold the other half
 *   mem_tmp:    16-slot temporary buffer holding the byte-sliced state
 *   rk:         round-key base register
 *   round:      round number of the key added before the S-box
 *   last_round: round number of the key added after the S-box
 *
 * No diffusion step is performed here: each half gets AddRoundKey,
 * the S-box layer and a second AddRoundKey.
 */
#define aria_ff(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round, last_round)		\
	/* half held in registers: key bytes 8..15 */	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, last_round);		\
	aria_store_state_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
			      mem_tmp, 8);		\
							\
	/* half kept in slots 0..7: key bytes 0..7 */	\
	aria_load_state_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, last_round);		\
							\
	/* reload the first half so all sixteen registers are live */ \
	aria_load_state_8way(y0, y1, y2, y3, y4, y5, y6, y7, \
			     mem_tmp, 8);
552
/*
 * aria_fe_gfni - one ARIA round of the "fe" kind, GFNI S-box path.
 * Identical to aria_fe except that the substitution layer is
 * aria_sbox_8way_gfni.
 *   x0..x7:  on entry hold the half of the state that belongs to key
 *            bytes 8..15; also the working registers
 *   y0..y7:  scratch during the S-box, later the other half of the state
 *   mem_tmp: 16-slot temporary buffer holding the byte-sliced state
 *   rk:      round-key base register
 *   round:   round number
 */
#define aria_fe_gfni(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem_tmp, rk, round)		\
	/* half held in registers: key bytes 8..15 */	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
	aria_sbox_8way_gfni(x2, x3, x0, x1, x6, x7, x4, x5, \
			    y0, y1, y2, y3, y4, y5, y6, y7); \
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
			      mem_tmp, 8);		\
							\
	/* half kept in slots 0..7: key bytes 0..7 */	\
	aria_load_state_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
	aria_sbox_8way_gfni(x2, x3, x0, x1, x6, x7, x4, x5, \
			    y0, y1, y2, y3, y4, y5, y6, y7); \
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
			      mem_tmp, 0);		\
							\
	/* bring the first half back into y0..y7 and mix the words */ \
	aria_load_state_8way(y0, y1, y2, y3, y4, y5, y6, y7, \
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
	/* aria_diff_byte()				\
	 * T3 = ABCD -> BADC				\
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
	 * T0 = ABCD -> CDAB				\
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
	 * T1 = ABCD -> DCBA				\
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
	 */						\
	aria_diff_word(x2, x3, x0, x1, x7, x6, x5, x4,	\
		       y0, y1, y2, y3, y5, y4, y7, y6);	\
	aria_store_state_8way(x3, x2, x1, x0, x6, x7, x4, x5, \
			      mem_tmp, 0);
610
/*
 * aria_fo_gfni - one ARIA round of the "fo" kind, GFNI S-box path.
 * Identical to aria_fo except that the substitution layer is
 * aria_sbox_8way_gfni.
 *   x0..x7:  on entry hold the half of the state that belongs to key
 *            bytes 8..15; also the working registers
 *   y0..y7:  scratch during the S-box, later the other half of the state
 *   mem_tmp: 16-slot temporary buffer holding the byte-sliced state
 *   rk:      round-key base register
 *   round:   round number
 */
#define aria_fo_gfni(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem_tmp, rk, round)		\
	/* half held in registers: key bytes 8..15 */	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
	aria_sbox_8way_gfni(x0, x1, x2, x3, x4, x5, x6, x7, \
			    y0, y1, y2, y3, y4, y5, y6, y7); \
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
			      mem_tmp, 8);		\
							\
	/* half kept in slots 0..7: key bytes 0..7 */	\
	aria_load_state_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
	aria_sbox_8way_gfni(x0, x1, x2, x3, x4, x5, x6, x7, \
			    y0, y1, y2, y3, y4, y5, y6, y7); \
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
			      mem_tmp, 0);		\
							\
	/* bring the first half back into y0..y7 and mix the words */ \
	aria_load_state_8way(y0, y1, y2, y3, y4, y5, y6, y7, \
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
	/* aria_diff_byte()				\
	 * T1 = ABCD -> BADC				\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB				\
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
	 * T3 = ABCD -> DCBA				\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
	 */						\
	aria_diff_word(x0, x1, x2, x3, x5, x4, x7, x6,	\
		       y2, y3, y0, y1, y7, y6, y5, y4);	\
	aria_store_state_8way(x3, x2, x1, x0, x6, x7, x4, x5, \
			      mem_tmp, 0);
668
/*
 * aria_ff_gfni - final ARIA round, GFNI S-box path.
 * Identical to aria_ff except that the substitution layer is
 * aria_sbox_8way_gfni.
 *   x0..x7:     on entry hold the half of the state that belongs to key
 *               bytes 8..15; on exit hold the half for key bytes 0..7
 *   y0..y7:     scratch during the S-box; on exit hold the other half
 *   mem_tmp:    16-slot temporary buffer holding the byte-sliced state
 *   rk:         round-key base register
 *   round:      round number of the key added before the S-box
 *   last_round: round number of the key added after the S-box
 */
#define aria_ff_gfni(x0, x1, x2, x3,			\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round, last_round)		\
	/* half held in registers: key bytes 8..15 */	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
	aria_sbox_8way_gfni(x2, x3, x0, x1, x6, x7, x4, x5, \
			    y0, y1, y2, y3, y4, y5, y6, y7); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, last_round);		\
	aria_store_state_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
			      mem_tmp, 8);		\
							\
	/* half kept in slots 0..7: key bytes 0..7 */	\
	aria_load_state_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
	aria_sbox_8way_gfni(x2, x3, x0, x1, x6, x7, x4, x5, \
			    y0, y1, y2, y3, y4, y5, y6, y7); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, last_round);		\
							\
	/* reload the first half so all sixteen registers are live */ \
	aria_load_state_8way(y0, y1, y2, y3, y4, y5, y6, y7, \
			     mem_tmp, 8);
706
/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section	.rodata.cst16, "aM", @progbits, 16
.align 16

/* Four byte indices with stride 4, starting at idx */
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

/*
 * vpshufb mask used by byteslice_16x16b/debyteslice_16x16b:
 * picks bytes 0,4,8,12, then 1,5,9,13, then 2,6,10,14, then 3,7,11,15.
 */
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0)
	.byte SHUFB_BYTES(1)
	.byte SHUFB_BYTES(2)
	.byte SHUFB_BYTES(3)
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07
	.byte 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f
	.byte 0x0c, 0x09, 0x06, 0x03
/* Shift-row permutation, used with the AESDECLAST based lookups */
.Lshift_row:
	.byte 0x00, 0x05, 0x0a, 0x0f
	.byte 0x04, 0x09, 0x0e, 0x03
	.byte 0x08, 0x0d, 0x02, 0x07
	.byte 0x0c, 0x01, 0x06, 0x0b
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c
	.byte 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04
	.byte 0x03, 0x02, 0x01, 0x00
727
/*
 * AES inverse affine and S2 combined, as a pair of 16-entry nibble
 * tables for filter_8bit (lo: indexed by the low nibble, hi: indexed by
 * the high nibble).  The combined transform is:
 *
 *      1 1 0 0 0 0 0 1     x0     0
 *      0 1 0 0 1 0 0 0     x1     0
 *      1 1 0 0 1 1 1 1     x2     0
 *      0 1 1 0 1 0 0 1     x3     1
 *      0 1 0 0 1 1 0 0  *  x4  +  0
 *      0 1 0 1 1 0 0 0     x5     0
 *      0 0 0 0 0 1 0 1     x6     0
 *      1 1 1 0 0 1 1 1     x7     1
 */
.Ltf_lo__inv_aff__and__s2:
	.octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
	.octa 0x2B15FFC1AF917B45E6D8320C625CB688
742
/*
 * X2 and AES forward affine combined, as a pair of 16-entry nibble
 * tables for filter_8bit (lo: indexed by the low nibble, hi: indexed by
 * the high nibble).  The combined transform is:
 *
 *      1 0 1 1 0 0 0 1     x0     0
 *      0 1 1 1 1 0 1 1     x1     0
 *      0 0 0 1 1 0 1 0     x2     1
 *      0 1 0 0 0 1 0 0     x3     0
 *      0 0 1 1 1 0 1 1  *  x4  +  0
 *      0 1 0 0 1 0 0 0     x5     0
 *      1 1 0 1 0 0 1 1     x6     0
 *      0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
	.octa 0x3F893781E95FE1576CDA64D2BA0CB204
757
/* 8-byte mergeable constants: bit matrices for the GFNI S-box path */
.section	.rodata.cst8, "aM", @progbits, 8
.align 8

/*
 * AES affine: additive constant and 8x8 bit matrix.
 * Rows are listed top to bottom; BM8X8 puts the first row in the most
 * significant byte of the quad.
 */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
771
/* AES inverse affine: additive constant and 8x8 bit matrix */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
783
/* S2: additive constant and 8x8 bit matrix */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
795
/* X2: additive constant and 8x8 bit matrix */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
807
/*
 * Identity matrix: used with vgf2p8affineinvqb and a zero constant in
 * aria_sbox_8way_gfni.
 */
.Ltf_id_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
818
/* 4-bit mask: low nibble of every byte, replicated by aria_sbox_8way */
.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f
824
825.text
826
SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
	/*
	 * Round sequence shared by the 16-way encrypt and decrypt entry
	 * points.
	 *
	 * input:
	 *      %rdi: ctx, CTX (only rounds(CTX) is read here)
	 *      %r9: rk
	 *      %rsi: dst, also used as temporary storage between rounds
	 *      %rdx: src (not referenced here)
	 *      %xmm0..%xmm15: 16 loaded blocks, byte-sliced below
	 * output:
	 *      %rax: copy of dst
	 *      %xmm0..%xmm15: 16 result blocks, in the register order the
	 *                     callers hand to write_output
	 */

	FRAME_BEGIN

	/* %rax = dst (slots 0..7), %r8 = dst + 8 * 16 (slots 8..15) */
	leaq (8 * 16)(%rsi), %r8
	movq %rsi, %rax

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
		      %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11,
		      %xmm12, %xmm13, %xmm14, %xmm15,
		      %rax, %r8);

	/* rounds 0..10 are common to every key size */
	aria_fo(%xmm8, %xmm9, %xmm10, %xmm11,
		%xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3,
		%xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 0);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2,
		%xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11,
		%xmm12, %xmm13, %xmm14, %xmm15,
		%rax, %r9, 1);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10,
		%xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3,
		%xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 2);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2,
		%xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11,
		%xmm12, %xmm13, %xmm14, %xmm15,
		%rax, %r9, 3);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10,
		%xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3,
		%xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 4);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2,
		%xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11,
		%xmm12, %xmm13, %xmm14, %xmm15,
		%rax, %r9, 5);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10,
		%xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3,
		%xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 6);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2,
		%xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11,
		%xmm12, %xmm13, %xmm14, %xmm15,
		%rax, %r9, 7);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10,
		%xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3,
		%xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 8);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2,
		%xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11,
		%xmm12, %xmm13, %xmm14, %xmm15,
		%rax, %r9, 9);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10,
		%xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3,
		%xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 10);

	/* 12 rounds: finish with keys 11 and 12 */
	cmpl $12, rounds(CTX)
	jne .Laria_192
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2,
		%xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11,
		%xmm12, %xmm13, %xmm14, %xmm15,
		%rax, %r9, 11, 12);
	jmp .Laria_end
.Laria_192:
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2,
		%xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11,
		%xmm12, %xmm13, %xmm14, %xmm15,
		%rax, %r9, 11);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10,
		%xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3,
		%xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 12);

	/* 14 rounds: finish with keys 13 and 14 */
	cmpl $14, rounds(CTX)
	jne .Laria_256
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2,
		%xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11,
		%xmm12, %xmm13, %xmm14, %xmm15,
		%rax, %r9, 13, 14);
	jmp .Laria_end
.Laria_256:
	/* any other round count: run through keys 15 and 16 */
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2,
		%xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11,
		%xmm12, %xmm13, %xmm14, %xmm15,
		%rax, %r9, 13);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10,
		%xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3,
		%xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 14);
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2,
		%xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11,
		%xmm12, %xmm13, %xmm14, %xmm15,
		%rax, %r9, 15, 16);
.Laria_end:
	/* leave the byte-sliced layout; (%rax) and (%r8) are spill slots */
	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
			   %xmm9, %xmm13, %xmm0, %xmm5,
			   %xmm10, %xmm14, %xmm3, %xmm6,
			   %xmm11, %xmm15, %xmm2, %xmm7,
			   (%rax), (%r8));

	FRAME_END
	RET
SYM_FUNC_END(__aria_aesni_avx_crypt_16way)
915
SYM_FUNC_START(aria_aesni_avx_encrypt_16way)
	/*
	 * Encrypt 16 blocks (16 * 16 bytes) from src to dst.
	 *
	 * input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	/* load the 16 source blocks */
	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rdx);

	/* round keys: the encryption schedule */
	leaq enc_key(CTX), %r9

	call __aria_aesni_avx_crypt_16way

	/* the helper returns dst in %rax */
	write_output(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rax);

	FRAME_END
	RET
SYM_FUNC_END(aria_aesni_avx_encrypt_16way)
940
SYM_FUNC_START(aria_aesni_avx_decrypt_16way)
	/*
	 * Decrypt 16 blocks (16 * 16 bytes) from src to dst.
	 *
	 * input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	/* load the 16 source blocks */
	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rdx);

	/* round keys: the decryption schedule */
	leaq dec_key(CTX), %r9

	call __aria_aesni_avx_crypt_16way

	/* the helper returns dst in %rax */
	write_output(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rax);

	FRAME_END
	RET
SYM_FUNC_END(aria_aesni_avx_decrypt_16way)
965
/*
 * __aria_aesni_avx_ctr_gen_keystream_16way - build 16 consecutive CTR
 * counter blocks from the IV.
 *
 * The big-endian IV at (%r8) is byte-swapped through .Lbswap128_mask so it
 * can be incremented as a little-endian 128-bit value in %xmm3; every
 * counter value is swapped back with the same mask before it is kept.
 *
 * State on return, as established by the instructions below:
 *      %xmm0..%xmm7:   counters IV+0 .. IV+7 (reloaded from the keystream
 *                      buffer at %rcx, which still holds a copy of them)
 *      %xmm8..%xmm15:  counters IV+8 .. IV+15
 *      (%r8):          IV+16, i.e. the first counter of the next batch
 *
 * All sixteen %xmm registers are overwritten. Of the documented inputs only
 * %rcx and %r8 are referenced by the instructions in this routine; the
 * callers in this file keep using %rdi, %rsi and %rdx after the call.
 */
966SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
967	/* input:
968	*      %rdi: ctx
969	*      %rsi: dst
970	*      %rdx: src
971	*      %rcx: keystream
972	*      %r8: iv (big endian, 128bit)
973	*/
974
975	FRAME_BEGIN
976	/* load IV and byteswap */
	/* %xmm8 keeps the IV exactly as stored: it is counter block 0 */
977	vmovdqu (%r8), %xmm8;
978
979	vmovdqa .Lbswap128_mask (%rip), %xmm1;
980	vpshufb %xmm1, %xmm8, %xmm3; /* be => le */
981
	/*
	 * Constant for inc_le128: all-ones shifted right by 8 bytes. The macro
	 * subtracts it (low qword += 1) and, when the low qword was all-ones
	 * beforehand, also subtracts the shifted compare result so the carry
	 * reaches the high qword.
	 */
982	vpcmpeqd %xmm0, %xmm0, %xmm0;
983	vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */
984
985	/* construct IVs */
	/* counters 1..7: increment %xmm3, swap back to big endian */
986	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
987	vpshufb %xmm1, %xmm3, %xmm9;
988	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
989	vpshufb %xmm1, %xmm3, %xmm10;
990	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
991	vpshufb %xmm1, %xmm3, %xmm11;
992	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
993	vpshufb %xmm1, %xmm3, %xmm12;
994	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
995	vpshufb %xmm1, %xmm3, %xmm13;
996	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
997	vpshufb %xmm1, %xmm3, %xmm14;
998	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
999	vpshufb %xmm1, %xmm3, %xmm15;
	/* park counters 0..7 in the keystream buffer; %xmm8..%xmm15 are reused next */
1000	vmovdqu %xmm8, (0 * 16)(%rcx);
1001	vmovdqu %xmm9, (1 * 16)(%rcx);
1002	vmovdqu %xmm10, (2 * 16)(%rcx);
1003	vmovdqu %xmm11, (3 * 16)(%rcx);
1004	vmovdqu %xmm12, (4 * 16)(%rcx);
1005	vmovdqu %xmm13, (5 * 16)(%rcx);
1006	vmovdqu %xmm14, (6 * 16)(%rcx);
1007	vmovdqu %xmm15, (7 * 16)(%rcx);
1008
	/* counters 8..15 stay in %xmm8..%xmm15 */
1009	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1010	vpshufb %xmm1, %xmm3, %xmm8;
1011	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1012	vpshufb %xmm1, %xmm3, %xmm9;
1013	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1014	vpshufb %xmm1, %xmm3, %xmm10;
1015	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1016	vpshufb %xmm1, %xmm3, %xmm11;
1017	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1018	vpshufb %xmm1, %xmm3, %xmm12;
1019	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1020	vpshufb %xmm1, %xmm3, %xmm13;
1021	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1022	vpshufb %xmm1, %xmm3, %xmm14;
1023	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1024	vpshufb %xmm1, %xmm3, %xmm15;
	/* one more step: IV+16 is written back over the caller's IV */
1025	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1026	vpshufb %xmm1, %xmm3, %xmm4;
1027	vmovdqu %xmm4, (%r8);
1028
	/* bring counters 0..7 back from the keystream buffer into %xmm0..%xmm7 */
1029	vmovdqu (0 * 16)(%rcx), %xmm0;
1030	vmovdqu (1 * 16)(%rcx), %xmm1;
1031	vmovdqu (2 * 16)(%rcx), %xmm2;
1032	vmovdqu (3 * 16)(%rcx), %xmm3;
1033	vmovdqu (4 * 16)(%rcx), %xmm4;
1034	vmovdqu (5 * 16)(%rcx), %xmm5;
1035	vmovdqu (6 * 16)(%rcx), %xmm6;
1036	vmovdqu (7 * 16)(%rcx), %xmm7;
1037
1038	FRAME_END
1039	RET;
1040SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)
1041
/*
 * aria_aesni_avx_ctr_crypt_16way - AES-NI/AVX CTR-mode processing of one
 * batch of 16 blocks.
 *
 * Steps performed below:
 *   1. generate 16 counter blocks in %xmm0..%xmm15 (this also advances the
 *      IV stored at %r8);
 *   2. run them through __aria_aesni_avx_crypt_16way with the encryption
 *      keys, with the helper's dst/src pointers redirected to the keystream
 *      buffer;
 *   3. XOR each resulting register with the matching 16-byte source block;
 *   4. pass the registers to write_output together with the saved dst.
 *
 * NOTE(review): the saved pointers in %r10/%r11 must survive the call to
 * __aria_aesni_avx_crypt_16way; its full body is not visible here - confirm.
 */
1042SYM_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
1043	/* input:
1044	*      %rdi: ctx
1045	*      %rsi: dst
1046	*      %rdx: src
1047	*      %rcx: keystream
1048	*      %r8: iv (big endian, 128bit)
1049	*/
1050	FRAME_BEGIN
1051
1052	call __aria_aesni_avx_ctr_gen_keystream_16way;
1053
	/*
	 * leaq (%reg), %dst simply copies the register: keep the real dst/src
	 * in %r10/%r11 and make both helper pointers refer to the keystream
	 * buffer instead.
	 */
1054	leaq (%rsi), %r10;
1055	leaq (%rdx), %r11;
1056	leaq (%rcx), %rsi;
1057	leaq (%rcx), %rdx;
	/* the counter blocks are processed with the encryption keys */
1058	leaq enc_key(CTX), %r9;
1059
1060	call __aria_aesni_avx_crypt_16way;
1061
	/*
	 * XOR with the source data. Source block i is paired with the i-th
	 * register in the order later given to write_output, hence
	 * %xmm1, %xmm0, %xmm3, %xmm2 for blocks 0..3.
	 */
1062	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
1063	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
1064	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
1065	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
1066	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
1067	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
1068	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
1069	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
1070	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
1071	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
1072	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
1073	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
1074	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
1075	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
1076	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
1077	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
	/* %r10 is the dst pointer saved before the helper call */
1078	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1079		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1080		     %xmm15, %r10);
1081
1082	FRAME_END
1083	RET;
1084SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)
1085
/*
 * __aria_aesni_avx_gfni_crypt_16way - shared round sequence of the GFNI
 * variant, operating on 16 blocks held in %xmm0..%xmm15.
 *
 * Besides the registers listed in the input comment below, this routine
 * also reads %rdi (CTX): the round count is fetched from rounds(CTX) to
 * choose between the 12-, 14- and 16-round tails.
 *
 * On return %rax holds the dst pointer copied from %rsi (unless one of the
 * macros modifies it - they are defined outside this excerpt), and the
 * callers in this file consume the result registers in the order
 * %xmm1, %xmm0, %xmm3, %xmm2, %xmm4..%xmm15.
 *
 * NOTE(review): inpack16_post, aria_fo_gfni, aria_fe_gfni, aria_ff_gfni and
 * debyteslice_16x16b are defined elsewhere. The trailing integer arguments
 * are presumably round-key indices relative to %r9 - confirm at the macro
 * definitions. %rdx is not referenced by name in this body.
 */
1086SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
1087	/* input:
1088	*      %r9: rk
1089	*      %rsi: dst
1090	*      %rdx: src
1091	*      %xmm0..%xmm15: 16 byte-sliced blocks
1092	*/
1093
1094	FRAME_BEGIN
1095
	/* %rax = dst, %r8 = dst + 128 bytes (second half of the 16 blocks) */
1096	movq %rsi, %rax;
1097	leaq 8 * 16(%rax), %r8;
1098
	/* both memory areas are handed to the macros below as operands */
1099	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
1100		      %xmm4, %xmm5, %xmm6, %xmm7,
1101		      %xmm8, %xmm9, %xmm10, %xmm11,
1102		      %xmm12, %xmm13, %xmm14,
1103		      %xmm15, %rax, %r8);
	/*
	 * Indices 0..10: aria_fo_gfni and aria_fe_gfni alternate, starting
	 * with aria_fo_gfni. The register argument order differs between the
	 * first call and the later ones and must be kept exactly as written.
	 */
1104	aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
1105		     %xmm12, %xmm13, %xmm14, %xmm15,
1106		     %xmm0, %xmm1, %xmm2, %xmm3,
1107		     %xmm4, %xmm5, %xmm6, %xmm7,
1108		     %rax, %r9, 0);
1109	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1110		     %xmm4, %xmm5, %xmm6, %xmm7,
1111		     %xmm8, %xmm9, %xmm10, %xmm11,
1112		     %xmm12, %xmm13, %xmm14,
1113		     %xmm15, %rax, %r9, 1);
1114	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1115		     %xmm12, %xmm13, %xmm14, %xmm15,
1116		     %xmm0, %xmm1, %xmm2, %xmm3,
1117		     %xmm4, %xmm5, %xmm6, %xmm7,
1118		     %rax, %r9, 2);
1119	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1120		     %xmm4, %xmm5, %xmm6, %xmm7,
1121		     %xmm8, %xmm9, %xmm10, %xmm11,
1122		     %xmm12, %xmm13, %xmm14,
1123		     %xmm15, %rax, %r9, 3);
1124	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1125		     %xmm12, %xmm13, %xmm14, %xmm15,
1126		     %xmm0, %xmm1, %xmm2, %xmm3,
1127		     %xmm4, %xmm5, %xmm6, %xmm7,
1128		     %rax, %r9, 4);
1129	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1130		     %xmm4, %xmm5, %xmm6, %xmm7,
1131		     %xmm8, %xmm9, %xmm10, %xmm11,
1132		     %xmm12, %xmm13, %xmm14,
1133		     %xmm15, %rax, %r9, 5);
1134	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1135		     %xmm12, %xmm13, %xmm14, %xmm15,
1136		     %xmm0, %xmm1, %xmm2, %xmm3,
1137		     %xmm4, %xmm5, %xmm6, %xmm7,
1138		     %rax, %r9, 6);
1139	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1140		     %xmm4, %xmm5, %xmm6, %xmm7,
1141		     %xmm8, %xmm9, %xmm10, %xmm11,
1142		     %xmm12, %xmm13, %xmm14,
1143		     %xmm15, %rax, %r9, 7);
1144	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1145		     %xmm12, %xmm13, %xmm14, %xmm15,
1146		     %xmm0, %xmm1, %xmm2, %xmm3,
1147		     %xmm4, %xmm5, %xmm6, %xmm7,
1148		     %rax, %r9, 8);
1149	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1150		     %xmm4, %xmm5, %xmm6, %xmm7,
1151		     %xmm8, %xmm9, %xmm10, %xmm11,
1152		     %xmm12, %xmm13, %xmm14,
1153		     %xmm15, %rax, %r9, 9);
1154	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1155		     %xmm12, %xmm13, %xmm14, %xmm15,
1156		     %xmm0, %xmm1, %xmm2, %xmm3,
1157		     %xmm4, %xmm5, %xmm6, %xmm7,
1158		     %rax, %r9, 10);
	/*
	 * rounds == 12: finish with aria_ff_gfni on indices 11 and 12.
	 * (ARIA specifies 12/14/16 rounds for 128/192/256-bit keys.)
	 */
1159	cmpl $12, rounds(CTX);
1160	jne .Laria_gfni_192;
1161	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1162		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1163		%xmm15, %rax, %r9, 11, 12);
1164	jmp .Laria_gfni_end;
	/* rounds != 12: two more alternating calls, indices 11 and 12 */
1165.Laria_gfni_192:
1166	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1167		     %xmm4, %xmm5, %xmm6, %xmm7,
1168		     %xmm8, %xmm9, %xmm10, %xmm11,
1169		     %xmm12, %xmm13, %xmm14,
1170		     %xmm15, %rax, %r9, 11);
1171	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1172		     %xmm12, %xmm13, %xmm14, %xmm15,
1173		     %xmm0, %xmm1, %xmm2, %xmm3,
1174		     %xmm4, %xmm5, %xmm6, %xmm7,
1175		     %rax, %r9, 12);
	/* rounds == 14: finish with aria_ff_gfni on indices 13 and 14 */
1176	cmpl $14, rounds(CTX);
1177	jne .Laria_gfni_256;
1178	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1179		     %xmm4, %xmm5, %xmm6, %xmm7,
1180		     %xmm8, %xmm9, %xmm10, %xmm11,
1181		     %xmm12, %xmm13, %xmm14,
1182		     %xmm15, %rax, %r9, 13, 14);
1183	jmp .Laria_gfni_end;
	/*
	 * Any round count other than 12 or 14 ends up here and is treated as
	 * the 16-round case: indices 13, 14, then aria_ff_gfni on 15 and 16.
	 */
1184.Laria_gfni_256:
1185	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1186		     %xmm4, %xmm5, %xmm6, %xmm7,
1187		     %xmm8, %xmm9, %xmm10, %xmm11,
1188		     %xmm12, %xmm13, %xmm14,
1189		     %xmm15, %rax, %r9, 13);
1190	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1191		     %xmm12, %xmm13, %xmm14, %xmm15,
1192		     %xmm0, %xmm1, %xmm2, %xmm3,
1193		     %xmm4, %xmm5, %xmm6, %xmm7,
1194		     %rax, %r9, 14);
1195	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1196		     %xmm4, %xmm5, %xmm6, %xmm7,
1197		     %xmm8, %xmm9, %xmm10, %xmm11,
1198		     %xmm12, %xmm13, %xmm14,
1199		     %xmm15, %rax, %r9, 15, 16);
	/*
	 * Common exit: undo the byte-slicing. (%rax) and (%r8) are passed as
	 * memory operands - presumably spill slots, as the st0/st1 arguments
	 * of byteslice_16x16b are - so the buffer behind %rsi gets written.
	 */
1200.Laria_gfni_end:
1201	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
1202			   %xmm9, %xmm13, %xmm0, %xmm5,
1203			   %xmm10, %xmm14, %xmm3, %xmm6,
1204			   %xmm11, %xmm15, %xmm2, %xmm7,
1205			   (%rax), (%r8));
1206
1207	FRAME_END
1208	RET;
1209SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)
1210
/*
 * aria_aesni_avx_gfni_encrypt_16way - GFNI entry point that encrypts one
 * batch of 16 blocks.
 *
 * Points %r9 at the encryption key area of the context (ctx + enc_key),
 * invokes inpack16_pre on the source pointer, runs
 * __aria_aesni_avx_gfni_crypt_16way and finally invokes write_output.
 *
 * NOTE(review): inpack16_pre and write_output are defined outside this
 * excerpt; what they do with their arguments is inferred from their names
 * and should be confirmed at their definitions.
 */
1211SYM_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
1212	/* input:
1213	*      %rdi: ctx, CTX
1214	*      %rsi: dst
1215	*      %rdx: src
1216	*/
1217
1218	FRAME_BEGIN
1219
	/* %r9 = address of the encryption keys inside the context */
1220	leaq enc_key(CTX), %r9;
1221
	/* presumably loads the 16 source blocks at %rdx into %xmm0..%xmm15 */
1222	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1223		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1224		     %xmm15, %rdx);
1225
	/* the helper also reads rounds(CTX), so %rdi must still hold ctx */
1226	call __aria_aesni_avx_gfni_crypt_16way;
1227
	/*
	 * Registers are handed over in the permuted order %xmm1, %xmm0, %xmm3,
	 * %xmm2, then %xmm4..%xmm15. %rax is not written here; the helper
	 * copies dst (%rsi) into it at its start.
	 */
1228	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1229		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1230		     %xmm15, %rax);
1231
1232	FRAME_END
1233	RET;
1234SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)
1235
/*
 * aria_aesni_avx_gfni_decrypt_16way - GFNI entry point that decrypts one
 * batch of 16 blocks.
 *
 * Same instruction sequence as aria_aesni_avx_gfni_encrypt_16way; the only
 * difference is that %r9 is pointed at the decryption key area of the
 * context (ctx + dec_key) before the shared helper is called.
 *
 * NOTE(review): inpack16_pre and write_output are defined outside this
 * excerpt; what they do with their arguments is inferred from their names
 * and should be confirmed at their definitions.
 */
1236SYM_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
1237	/* input:
1238	*      %rdi: ctx, CTX
1239	*      %rsi: dst
1240	*      %rdx: src
1241	*/
1242
1243	FRAME_BEGIN
1244
	/* %r9 = address of the decryption keys inside the context */
1245	leaq dec_key(CTX), %r9;
1246
	/* presumably loads the 16 source blocks at %rdx into %xmm0..%xmm15 */
1247	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1248		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1249		     %xmm15, %rdx);
1250
	/* the helper also reads rounds(CTX), so %rdi must still hold ctx */
1251	call __aria_aesni_avx_gfni_crypt_16way;
1252
	/*
	 * Registers are handed over in the permuted order %xmm1, %xmm0, %xmm3,
	 * %xmm2, then %xmm4..%xmm15. %rax is not written here; the helper
	 * copies dst (%rsi) into it at its start.
	 */
1253	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1254		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1255		     %xmm15, %rax);
1256
1257	FRAME_END
1258	RET;
1259SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)
1260
/*
 * aria_aesni_avx_gfni_ctr_crypt_16way - GFNI CTR-mode processing of one
 * batch of 16 blocks.
 *
 * Steps performed below:
 *   1. generate 16 counter blocks in %xmm0..%xmm15 (this also advances the
 *      IV stored at %r8);
 *   2. run them through __aria_aesni_avx_gfni_crypt_16way with the
 *      encryption keys, with the helper's dst/src pointers redirected to
 *      the keystream buffer;
 *   3. XOR each resulting register with the matching 16-byte source block;
 *   4. pass the registers to write_output together with the saved dst.
 *
 * NOTE(review): the saved pointers in %r10/%r11 must survive the helper
 * call. The helper's own instructions do not name them, but the macros it
 * expands are defined outside this excerpt - confirm.
 */
1261SYM_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
1262	/* input:
1263	*      %rdi: ctx
1264	*      %rsi: dst
1265	*      %rdx: src
1266	*      %rcx: keystream
1267	*      %r8: iv (big endian, 128bit)
1268	*/
1269	FRAME_BEGIN
1270
1271	call __aria_aesni_avx_ctr_gen_keystream_16way
1272
	/*
	 * leaq (%reg), %dst simply copies the register: keep the real dst/src
	 * in %r10/%r11 and make both helper pointers refer to the keystream
	 * buffer instead.
	 */
1273	leaq (%rsi), %r10;
1274	leaq (%rdx), %r11;
1275	leaq (%rcx), %rsi;
1276	leaq (%rcx), %rdx;
	/* the counter blocks are processed with the encryption keys */
1277	leaq enc_key(CTX), %r9;
1278
1279	call __aria_aesni_avx_gfni_crypt_16way;
1280
	/*
	 * XOR with the source data. Source block i is paired with the i-th
	 * register in the order later given to write_output, hence
	 * %xmm1, %xmm0, %xmm3, %xmm2 for blocks 0..3.
	 */
1281	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
1282	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
1283	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
1284	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
1285	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
1286	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
1287	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
1288	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
1289	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
1290	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
1291	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
1292	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
1293	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
1294	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
1295	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
1296	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
	/* %r10 is the dst pointer saved before the helper call */
1297	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1298		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1299		     %xmm15, %r10);
1300
1301	FRAME_END
1302	RET;
1303SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)
1304