1/*
2 * x86_64/AVX/AES-NI assembler implementation of Camellia
3 *
4 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 */
12
13/*
14 * Version licensed under 2-clause BSD License is available at:
15 *	http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz
16 */
17
18#define CAMELLIA_TABLE_BYTE_LEN 272
19
20/* struct camellia_ctx: */
21#define key_table 0
22#define key_length CAMELLIA_TABLE_BYTE_LEN
23
24/* register macros */
25#define CTX %rdi
26
27/**********************************************************************
28  16-way camellia
29 **********************************************************************/
30#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
31	vpand x, mask4bit, tmp0; \
32	vpandn x, mask4bit, x; \
33	vpsrld $4, x, x; \
34	\
35	vpshufb tmp0, lo_t, tmp0; \
36	vpshufb x, hi_t, x; \
37	vpxor tmp0, x, x;
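/*
 * Reference sketch (illustrative C, not part of the build): per byte,
 * filter_8bit performs an 8-bit table lookup split into two 4-bit vpshufb
 * lookups; lo_t/hi_t hold the 16-entry nibble tables (helper name is
 * illustrative):
 *
 *	uint8_t filter_8bit_ref(uint8_t x, const uint8_t lo_t[16],
 *				const uint8_t hi_t[16])
 *	{
 *		return lo_t[x & 0x0f] ^ hi_t[x >> 4];
 *	}
 */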
38
/*
 * IN:
 *   x0..x7: byte-sliced AB state
 *   mem_cd: register pointer storing CD state
 *   key: memory operand with the 64-bit round subkey
 * OUT:
 *   x0..x7: new byte-sliced CD state
 */
47#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
48		  t7, mem_cd, key) \
49	/* \
50	 * S-function with AES subbytes \
51	 */ \
52	vmovdqa .Linv_shift_row, t4; \
53	vbroadcastss .L0f0f0f0f, t7; \
54	vmovdqa .Lpre_tf_lo_s1, t0; \
55	vmovdqa .Lpre_tf_hi_s1, t1; \
56	\
57	/* AES inverse shift rows */ \
58	vpshufb t4, x0, x0; \
59	vpshufb t4, x7, x7; \
60	vpshufb t4, x1, x1; \
61	vpshufb t4, x4, x4; \
62	vpshufb t4, x2, x2; \
63	vpshufb t4, x5, x5; \
64	vpshufb t4, x3, x3; \
65	vpshufb t4, x6, x6; \
66	\
67	/* prefilter sboxes 1, 2 and 3 */ \
68	vmovdqa .Lpre_tf_lo_s4, t2; \
69	vmovdqa .Lpre_tf_hi_s4, t3; \
70	filter_8bit(x0, t0, t1, t7, t6); \
71	filter_8bit(x7, t0, t1, t7, t6); \
72	filter_8bit(x1, t0, t1, t7, t6); \
73	filter_8bit(x4, t0, t1, t7, t6); \
74	filter_8bit(x2, t0, t1, t7, t6); \
75	filter_8bit(x5, t0, t1, t7, t6); \
76	\
77	/* prefilter sbox 4 */ \
78	vpxor t4, t4, t4; \
79	filter_8bit(x3, t2, t3, t7, t6); \
80	filter_8bit(x6, t2, t3, t7, t6); \
81	\
82	/* AES subbytes + AES shift rows */ \
83	vmovdqa .Lpost_tf_lo_s1, t0; \
84	vmovdqa .Lpost_tf_hi_s1, t1; \
85	vaesenclast t4, x0, x0; \
86	vaesenclast t4, x7, x7; \
87	vaesenclast t4, x1, x1; \
88	vaesenclast t4, x4, x4; \
89	vaesenclast t4, x2, x2; \
90	vaesenclast t4, x5, x5; \
91	vaesenclast t4, x3, x3; \
92	vaesenclast t4, x6, x6; \
93	\
94	/* postfilter sboxes 1 and 4 */ \
95	vmovdqa .Lpost_tf_lo_s3, t2; \
96	vmovdqa .Lpost_tf_hi_s3, t3; \
97	filter_8bit(x0, t0, t1, t7, t6); \
98	filter_8bit(x7, t0, t1, t7, t6); \
99	filter_8bit(x3, t0, t1, t7, t6); \
100	filter_8bit(x6, t0, t1, t7, t6); \
101	\
102	/* postfilter sbox 3 */ \
103	vmovdqa .Lpost_tf_lo_s2, t4; \
104	vmovdqa .Lpost_tf_hi_s2, t5; \
105	filter_8bit(x2, t2, t3, t7, t6); \
106	filter_8bit(x5, t2, t3, t7, t6); \
107	\
108	vpxor t6, t6, t6; \
109	vmovq key, t0; \
110	\
111	/* postfilter sbox 2 */ \
112	filter_8bit(x1, t4, t5, t7, t2); \
113	filter_8bit(x4, t4, t5, t7, t2); \
114	\
115	vpsrldq $5, t0, t5; \
116	vpsrldq $1, t0, t1; \
117	vpsrldq $2, t0, t2; \
118	vpsrldq $3, t0, t3; \
119	vpsrldq $4, t0, t4; \
120	vpshufb t6, t0, t0; \
121	vpshufb t6, t1, t1; \
122	vpshufb t6, t2, t2; \
123	vpshufb t6, t3, t3; \
124	vpshufb t6, t4, t4; \
125	vpsrldq $2, t5, t7; \
126	vpshufb t6, t7, t7; \
127	\
128	/* \
129	 * P-function \
130	 */ \
131	vpxor x5, x0, x0; \
132	vpxor x6, x1, x1; \
133	vpxor x7, x2, x2; \
134	vpxor x4, x3, x3; \
135	\
136	vpxor x2, x4, x4; \
137	vpxor x3, x5, x5; \
138	vpxor x0, x6, x6; \
139	vpxor x1, x7, x7; \
140	\
141	vpxor x7, x0, x0; \
142	vpxor x4, x1, x1; \
143	vpxor x5, x2, x2; \
144	vpxor x6, x3, x3; \
145	\
146	vpxor x3, x4, x4; \
147	vpxor x0, x5, x5; \
148	vpxor x1, x6, x6; \
149	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
150	\
151	/* \
152	 * Add key material and result to CD (x becomes new CD) \
153	 */ \
154	\
155	vpxor t3, x4, x4; \
156	vpxor 0 * 16(mem_cd), x4, x4; \
157	\
158	vpxor t2, x5, x5; \
159	vpxor 1 * 16(mem_cd), x5, x5; \
160	\
161	vpsrldq $1, t5, t3; \
162	vpshufb t6, t5, t5; \
163	vpshufb t6, t3, t6; \
164	\
165	vpxor t1, x6, x6; \
166	vpxor 2 * 16(mem_cd), x6, x6; \
167	\
168	vpxor t0, x7, x7; \
169	vpxor 3 * 16(mem_cd), x7, x7; \
170	\
171	vpxor t7, x0, x0; \
172	vpxor 4 * 16(mem_cd), x0, x0; \
173	\
174	vpxor t6, x1, x1; \
175	vpxor 5 * 16(mem_cd), x1, x1; \
176	\
177	vpxor t5, x2, x2; \
178	vpxor 6 * 16(mem_cd), x2, x2; \
179	\
180	vpxor t4, x3, x3; \
181	vpxor 7 * 16(mem_cd), x3, x3;
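/*
 * Reference sketch (illustrative, not part of the build): for each byte-slice
 * lane the macro above computes roughly
 *
 *	x = camellia_P(camellia_S(x)) ^ round_key_bytes ^ cd;
 *
 * where camellia_S is the s-box layer built from AES SubBytes plus the
 * pre/post affine filters, camellia_P is the byte-diffusion function, cd is
 * the state loaded from mem_cd, and the vmovq/vpsrldq/vpshufb sequence splats
 * each byte of the 64-bit subkey at 'key' into its own 16-byte vector
 * (the names used here are illustrative only).
 */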
182
/*
 * Size optimization: with roundsm16 inlined, the binary would be over 5
 * times larger and only about 0.5% faster (on Sandy Bridge).
 */
187.align 8
188roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
189	roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
190		  %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
191		  %rcx, (%r9));
192	ret;
193
194.align 8
195roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
196	roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
197		  %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
198		  %rax, (%r9));
199	ret;
200
/*
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 */
207#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
208		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
209	leaq (key_table + (i) * 8)(CTX), %r9; \
210	call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
211	\
212	vmovdqu x4, 0 * 16(mem_cd); \
213	vmovdqu x5, 1 * 16(mem_cd); \
214	vmovdqu x6, 2 * 16(mem_cd); \
215	vmovdqu x7, 3 * 16(mem_cd); \
216	vmovdqu x0, 4 * 16(mem_cd); \
217	vmovdqu x1, 5 * 16(mem_cd); \
218	vmovdqu x2, 6 * 16(mem_cd); \
219	vmovdqu x3, 7 * 16(mem_cd); \
220	\
221	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
222	call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
223	\
224	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
225
226#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
227
228#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
229	/* Store new AB state */ \
230	vmovdqu x0, 0 * 16(mem_ab); \
231	vmovdqu x1, 1 * 16(mem_ab); \
232	vmovdqu x2, 2 * 16(mem_ab); \
233	vmovdqu x3, 3 * 16(mem_ab); \
234	vmovdqu x4, 4 * 16(mem_ab); \
235	vmovdqu x5, 5 * 16(mem_ab); \
236	vmovdqu x6, 6 * 16(mem_ab); \
237	vmovdqu x7, 7 * 16(mem_ab);
238
239#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
240		      y6, y7, mem_ab, mem_cd, i) \
241	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
242		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
243	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
244		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
245	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
246		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
247
248#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
249		      y6, y7, mem_ab, mem_cd, i) \
250	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
251		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
252	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
253		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
254	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
255		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
256
257/*
258 * IN:
259 *  v0..3: byte-sliced 32-bit integers
260 * OUT:
261 *  v0..3: (IN <<< 1)
262 */
263#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
264	vpcmpgtb v0, zero, t0; \
265	vpaddb v0, v0, v0; \
266	vpabsb t0, t0; \
267	\
268	vpcmpgtb v1, zero, t1; \
269	vpaddb v1, v1, v1; \
270	vpabsb t1, t1; \
271	\
272	vpcmpgtb v2, zero, t2; \
273	vpaddb v2, v2, v2; \
274	vpabsb t2, t2; \
275	\
276	vpor t0, v1, v1; \
277	\
278	vpcmpgtb v3, zero, t0; \
279	vpaddb v3, v3, v3; \
280	vpabsb t0, t0; \
281	\
282	vpor t1, v2, v2; \
283	vpor t2, v3, v3; \
284	vpor t0, v0, v0;
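/*
 * Reference sketch (illustrative C, not part of the build): each of the 16
 * byte-sliced 32-bit lanes is rotated left by one bit; the vpcmpgtb/vpabsb
 * pairs extract each byte's MSB so it can be carried into the next slice
 * (v0 -> v1 -> v2 -> v3 -> v0):
 *
 *	uint32_t rol32_1_ref(uint32_t x)
 *	{
 *		return (x << 1) | (x >> 31);
 *	}
 */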
285
/*
 * IN:
 *   l0..l7: byte-sliced AB state in registers
 *   l: memory to receive the updated AB state
 *   r: byte-sliced CD state in memory (read and updated in place)
 *   kll, klr, krl, krr: 32-bit subkey words
 * OUT:
 *   l0..l7 and memory at l, r: state with the FL/FL^-1 layer applied
 */
293#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
294	      tt1, tt2, tt3, kll, klr, krl, krr) \
295	/* \
296	 * t0 = kll; \
297	 * t0 &= ll; \
298	 * lr ^= rol32(t0, 1); \
299	 */ \
300	vpxor tt0, tt0, tt0; \
301	vmovd kll, t0; \
302	vpshufb tt0, t0, t3; \
303	vpsrldq $1, t0, t0; \
304	vpshufb tt0, t0, t2; \
305	vpsrldq $1, t0, t0; \
306	vpshufb tt0, t0, t1; \
307	vpsrldq $1, t0, t0; \
308	vpshufb tt0, t0, t0; \
309	\
310	vpand l0, t0, t0; \
311	vpand l1, t1, t1; \
312	vpand l2, t2, t2; \
313	vpand l3, t3, t3; \
314	\
315	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
316	\
317	vpxor l4, t0, l4; \
318	vmovdqu l4, 4 * 16(l); \
319	vpxor l5, t1, l5; \
320	vmovdqu l5, 5 * 16(l); \
321	vpxor l6, t2, l6; \
322	vmovdqu l6, 6 * 16(l); \
323	vpxor l7, t3, l7; \
324	vmovdqu l7, 7 * 16(l); \
325	\
326	/* \
327	 * t2 = krr; \
328	 * t2 |= rr; \
329	 * rl ^= t2; \
330	 */ \
331	\
332	vmovd krr, t0; \
333	vpshufb tt0, t0, t3; \
334	vpsrldq $1, t0, t0; \
335	vpshufb tt0, t0, t2; \
336	vpsrldq $1, t0, t0; \
337	vpshufb tt0, t0, t1; \
338	vpsrldq $1, t0, t0; \
339	vpshufb tt0, t0, t0; \
340	\
341	vpor 4 * 16(r), t0, t0; \
342	vpor 5 * 16(r), t1, t1; \
343	vpor 6 * 16(r), t2, t2; \
344	vpor 7 * 16(r), t3, t3; \
345	\
346	vpxor 0 * 16(r), t0, t0; \
347	vpxor 1 * 16(r), t1, t1; \
348	vpxor 2 * 16(r), t2, t2; \
349	vpxor 3 * 16(r), t3, t3; \
350	vmovdqu t0, 0 * 16(r); \
351	vmovdqu t1, 1 * 16(r); \
352	vmovdqu t2, 2 * 16(r); \
353	vmovdqu t3, 3 * 16(r); \
354	\
355	/* \
356	 * t2 = krl; \
357	 * t2 &= rl; \
358	 * rr ^= rol32(t2, 1); \
359	 */ \
360	vmovd krl, t0; \
361	vpshufb tt0, t0, t3; \
362	vpsrldq $1, t0, t0; \
363	vpshufb tt0, t0, t2; \
364	vpsrldq $1, t0, t0; \
365	vpshufb tt0, t0, t1; \
366	vpsrldq $1, t0, t0; \
367	vpshufb tt0, t0, t0; \
368	\
369	vpand 0 * 16(r), t0, t0; \
370	vpand 1 * 16(r), t1, t1; \
371	vpand 2 * 16(r), t2, t2; \
372	vpand 3 * 16(r), t3, t3; \
373	\
374	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
375	\
376	vpxor 4 * 16(r), t0, t0; \
377	vpxor 5 * 16(r), t1, t1; \
378	vpxor 6 * 16(r), t2, t2; \
379	vpxor 7 * 16(r), t3, t3; \
380	vmovdqu t0, 4 * 16(r); \
381	vmovdqu t1, 5 * 16(r); \
382	vmovdqu t2, 6 * 16(r); \
383	vmovdqu t3, 7 * 16(r); \
384	\
385	/* \
386	 * t0 = klr; \
387	 * t0 |= lr; \
388	 * ll ^= t0; \
389	 */ \
390	\
391	vmovd klr, t0; \
392	vpshufb tt0, t0, t3; \
393	vpsrldq $1, t0, t0; \
394	vpshufb tt0, t0, t2; \
395	vpsrldq $1, t0, t0; \
396	vpshufb tt0, t0, t1; \
397	vpsrldq $1, t0, t0; \
398	vpshufb tt0, t0, t0; \
399	\
400	vpor l4, t0, t0; \
401	vpor l5, t1, t1; \
402	vpor l6, t2, t2; \
403	vpor l7, t3, t3; \
404	\
405	vpxor l0, t0, l0; \
406	vmovdqu l0, 0 * 16(l); \
407	vpxor l1, t1, l1; \
408	vmovdqu l1, 1 * 16(l); \
409	vpxor l2, t2, l2; \
410	vmovdqu l2, 2 * 16(l); \
411	vpxor l3, t3, l3; \
412	vmovdqu l3, 3 * 16(l);
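/*
 * Reference sketch (illustrative, not part of the build): with the l half
 * viewed as (ll || lr) and the r half as (rl || rr), the macro above applies
 * Camellia's FL to l and FL^-1 to r using the given subkey words:
 *
 *	lr ^= rol32(ll & kll, 1);
 *	ll ^= (lr | klr);
 *	rl ^= (rr | krr);
 *	rr ^= rol32(rl & krl, 1);
 */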
413
414#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
415	vpunpckhdq x1, x0, t2; \
416	vpunpckldq x1, x0, x0; \
417	\
418	vpunpckldq x3, x2, t1; \
419	vpunpckhdq x3, x2, x2; \
420	\
421	vpunpckhqdq t1, x0, x1; \
422	vpunpcklqdq t1, x0, x0; \
423	\
424	vpunpckhqdq x2, t2, x3; \
425	vpunpcklqdq x2, t2, x2;
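/*
 * Reference sketch (illustrative C, not part of the build): transpose_4x4
 * transposes a 4x4 matrix of 32-bit elements spread across four vectors:
 *
 *	for (int i = 0; i < 4; i++)
 *		for (int j = 0; j < 4; j++)
 *			out[i][j] = in[j][i];
 */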
426
427#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
428			 b3, c3, d3, st0, st1) \
429	vmovdqu d2, st0; \
430	vmovdqu d3, st1; \
431	transpose_4x4(a0, a1, a2, a3, d2, d3); \
432	transpose_4x4(b0, b1, b2, b3, d2, d3); \
433	vmovdqu st0, d2; \
434	vmovdqu st1, d3; \
435	\
436	vmovdqu a0, st0; \
437	vmovdqu a1, st1; \
438	transpose_4x4(c0, c1, c2, c3, a0, a1); \
439	transpose_4x4(d0, d1, d2, d3, a0, a1); \
440	\
441	vmovdqu .Lshufb_16x16b, a0; \
442	vmovdqu st1, a1; \
443	vpshufb a0, a2, a2; \
444	vpshufb a0, a3, a3; \
445	vpshufb a0, b0, b0; \
446	vpshufb a0, b1, b1; \
447	vpshufb a0, b2, b2; \
448	vpshufb a0, b3, b3; \
449	vpshufb a0, a1, a1; \
450	vpshufb a0, c0, c0; \
451	vpshufb a0, c1, c1; \
452	vpshufb a0, c2, c2; \
453	vpshufb a0, c3, c3; \
454	vpshufb a0, d0, d0; \
455	vpshufb a0, d1, d1; \
456	vpshufb a0, d2, d2; \
457	vpshufb a0, d3, d3; \
458	vmovdqu d3, st1; \
459	vmovdqu st0, d3; \
460	vpshufb a0, d3, a0; \
461	vmovdqu d2, st0; \
462	\
463	transpose_4x4(a0, b0, c0, d0, d2, d3); \
464	transpose_4x4(a1, b1, c1, d1, d2, d3); \
465	vmovdqu st0, d2; \
466	vmovdqu st1, d3; \
467	\
468	vmovdqu b0, st0; \
469	vmovdqu b1, st1; \
470	transpose_4x4(a2, b2, c2, d2, b0, b1); \
471	transpose_4x4(a3, b3, c3, d3, b0, b1); \
472	vmovdqu st0, b0; \
473	vmovdqu st1, b1; \
474	/* does not adjust output bytes inside vectors */
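/*
 * Conceptual sketch (illustrative; the exact lane ordering is set by
 * .Lshufb_16x16b and the transposes above): byte-slicing regroups 16 blocks
 * so that equal byte positions share one vector, roughly
 *
 *	sliced[byte_pos][block] = blocks[block][byte_pos];
 */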
475
476/* load blocks to registers and apply pre-whitening */
477#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
478		     y6, y7, rio, key) \
479	vmovq key, x0; \
480	vpshufb .Lpack_bswap, x0, x0; \
481	\
482	vpxor 0 * 16(rio), x0, y7; \
483	vpxor 1 * 16(rio), x0, y6; \
484	vpxor 2 * 16(rio), x0, y5; \
485	vpxor 3 * 16(rio), x0, y4; \
486	vpxor 4 * 16(rio), x0, y3; \
487	vpxor 5 * 16(rio), x0, y2; \
488	vpxor 6 * 16(rio), x0, y1; \
489	vpxor 7 * 16(rio), x0, y0; \
490	vpxor 8 * 16(rio), x0, x7; \
491	vpxor 9 * 16(rio), x0, x6; \
492	vpxor 10 * 16(rio), x0, x5; \
493	vpxor 11 * 16(rio), x0, x4; \
494	vpxor 12 * 16(rio), x0, x3; \
495	vpxor 13 * 16(rio), x0, x2; \
496	vpxor 14 * 16(rio), x0, x1; \
497	vpxor 15 * 16(rio), x0, x0;
498
499/* byteslice pre-whitened blocks and store to temporary memory */
500#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
501		      y6, y7, mem_ab, mem_cd) \
502	byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
503			 y5, y6, y7, (mem_ab), (mem_cd)); \
504	\
505	vmovdqu x0, 0 * 16(mem_ab); \
506	vmovdqu x1, 1 * 16(mem_ab); \
507	vmovdqu x2, 2 * 16(mem_ab); \
508	vmovdqu x3, 3 * 16(mem_ab); \
509	vmovdqu x4, 4 * 16(mem_ab); \
510	vmovdqu x5, 5 * 16(mem_ab); \
511	vmovdqu x6, 6 * 16(mem_ab); \
512	vmovdqu x7, 7 * 16(mem_ab); \
513	vmovdqu y0, 0 * 16(mem_cd); \
514	vmovdqu y1, 1 * 16(mem_cd); \
515	vmovdqu y2, 2 * 16(mem_cd); \
516	vmovdqu y3, 3 * 16(mem_cd); \
517	vmovdqu y4, 4 * 16(mem_cd); \
518	vmovdqu y5, 5 * 16(mem_cd); \
519	vmovdqu y6, 6 * 16(mem_cd); \
520	vmovdqu y7, 7 * 16(mem_cd);
521
522/* de-byteslice, apply post-whitening and store blocks */
523#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
524		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
525	byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
526			 y7, x3, x7, stack_tmp0, stack_tmp1); \
527	\
528	vmovdqu x0, stack_tmp0; \
529	\
530	vmovq key, x0; \
531	vpshufb .Lpack_bswap, x0, x0; \
532	\
533	vpxor x0, y7, y7; \
534	vpxor x0, y6, y6; \
535	vpxor x0, y5, y5; \
536	vpxor x0, y4, y4; \
537	vpxor x0, y3, y3; \
538	vpxor x0, y2, y2; \
539	vpxor x0, y1, y1; \
540	vpxor x0, y0, y0; \
541	vpxor x0, x7, x7; \
542	vpxor x0, x6, x6; \
543	vpxor x0, x5, x5; \
544	vpxor x0, x4, x4; \
545	vpxor x0, x3, x3; \
546	vpxor x0, x2, x2; \
547	vpxor x0, x1, x1; \
548	vpxor stack_tmp0, x0, x0;
549
550#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
551		     y6, y7, rio) \
552	vmovdqu x0, 0 * 16(rio); \
553	vmovdqu x1, 1 * 16(rio); \
554	vmovdqu x2, 2 * 16(rio); \
555	vmovdqu x3, 3 * 16(rio); \
556	vmovdqu x4, 4 * 16(rio); \
557	vmovdqu x5, 5 * 16(rio); \
558	vmovdqu x6, 6 * 16(rio); \
559	vmovdqu x7, 7 * 16(rio); \
560	vmovdqu y0, 8 * 16(rio); \
561	vmovdqu y1, 9 * 16(rio); \
562	vmovdqu y2, 10 * 16(rio); \
563	vmovdqu y3, 11 * 16(rio); \
564	vmovdqu y4, 12 * 16(rio); \
565	vmovdqu y5, 13 * 16(rio); \
566	vmovdqu y6, 14 * 16(rio); \
567	vmovdqu y7, 15 * 16(rio);
568
569.data
570.align 16
571
572#define SHUFB_BYTES(idx) \
573	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
574
575.Lshufb_16x16b:
576	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
577
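/*
 * vpshufb control used when loading whitening key material: byte-swaps each
 * 32-bit word of the low 64 bits and zeroes the high 64 bits (0x80 entries).
 */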
578.Lpack_bswap:
579	.long 0x00010203
580	.long 0x04050607
581	.long 0x80808080
582	.long 0x80808080
583
584/* For CTR-mode IV byteswap */
585.Lbswap128_mask:
586	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
587
588/*
589 * pre-SubByte transform
590 *
591 * pre-lookup for sbox1, sbox2, sbox3:
592 *   swap_bitendianness(
593 *       isom_map_camellia_to_aes(
594 *           camellia_f(
 *               swap_bitendianness(in)
596 *           )
597 *       )
598 *   )
599 *
600 * (note: '⊕ 0xc5' inside camellia_f())
601 */
602.Lpre_tf_lo_s1:
603	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
604	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
605.Lpre_tf_hi_s1:
606	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
607	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
608
609/*
610 * pre-SubByte transform
611 *
612 * pre-lookup for sbox4:
613 *   swap_bitendianness(
614 *       isom_map_camellia_to_aes(
615 *           camellia_f(
 *               swap_bitendianness(in <<< 1)
617 *           )
618 *       )
619 *   )
620 *
621 * (note: '⊕ 0xc5' inside camellia_f())
622 */
623.Lpre_tf_lo_s4:
624	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
625	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
626.Lpre_tf_hi_s4:
627	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
628	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
629
630/*
631 * post-SubByte transform
632 *
633 * post-lookup for sbox1, sbox4:
634 *  swap_bitendianness(
635 *      camellia_h(
636 *          isom_map_aes_to_camellia(
637 *              swap_bitendianness(
638 *                  aes_inverse_affine_transform(in)
639 *              )
640 *          )
641 *      )
642 *  )
643 *
644 * (note: '⊕ 0x6e' inside camellia_h())
645 */
646.Lpost_tf_lo_s1:
647	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
648	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
649.Lpost_tf_hi_s1:
650	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
651	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
652
653/*
654 * post-SubByte transform
655 *
656 * post-lookup for sbox2:
657 *  swap_bitendianness(
658 *      camellia_h(
659 *          isom_map_aes_to_camellia(
660 *              swap_bitendianness(
661 *                  aes_inverse_affine_transform(in)
662 *              )
663 *          )
664 *      )
665 *  ) <<< 1
666 *
667 * (note: '⊕ 0x6e' inside camellia_h())
668 */
669.Lpost_tf_lo_s2:
670	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
671	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
672.Lpost_tf_hi_s2:
673	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
674	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
675
676/*
677 * post-SubByte transform
678 *
679 * post-lookup for sbox3:
680 *  swap_bitendianness(
681 *      camellia_h(
682 *          isom_map_aes_to_camellia(
683 *              swap_bitendianness(
684 *                  aes_inverse_affine_transform(in)
685 *              )
686 *          )
687 *      )
688 *  ) >>> 1
689 *
690 * (note: '⊕ 0x6e' inside camellia_h())
691 */
692.Lpost_tf_lo_s3:
693	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
694	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
695.Lpost_tf_hi_s3:
696	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
697	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
698
699/* For isolating SubBytes from AESENCLAST, inverse shift row */
700.Linv_shift_row:
701	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
702	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
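/*
 * With an all-zero round key, AESENCLAST reduces to ShiftRows + SubBytes,
 * so applying .Linv_shift_row first makes the vaesenclast in roundsm16
 * compute plain SubBytes on every byte.
 */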
703
704/* 4-bit mask */
705.align 4
706.L0f0f0f0f:
707	.long 0x0f0f0f0f
708
709.text
710
711.align 8
712.type   __camellia_enc_blk16,@function;
713
714__camellia_enc_blk16:
715	/* input:
716	 *	%rdi: ctx, CTX
717	 *	%rax: temporary storage, 256 bytes
718	 *	%xmm0..%xmm15: 16 plaintext blocks
719	 * output:
720	 *	%xmm0..%xmm15: 16 encrypted blocks, order swapped:
	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
722	 */
723
724	leaq 8 * 16(%rax), %rcx;
725
726	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
727		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
728		      %xmm15, %rax, %rcx);
729
730	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
731		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
732		     %xmm15, %rax, %rcx, 0);
733
734	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
735	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
736	      %xmm15,
737	      ((key_table + (8) * 8) + 0)(CTX),
738	      ((key_table + (8) * 8) + 4)(CTX),
739	      ((key_table + (8) * 8) + 8)(CTX),
740	      ((key_table + (8) * 8) + 12)(CTX));
741
742	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
743		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
744		     %xmm15, %rax, %rcx, 8);
745
746	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
747	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
748	      %xmm15,
749	      ((key_table + (16) * 8) + 0)(CTX),
750	      ((key_table + (16) * 8) + 4)(CTX),
751	      ((key_table + (16) * 8) + 8)(CTX),
752	      ((key_table + (16) * 8) + 12)(CTX));
753
754	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
755		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
756		     %xmm15, %rax, %rcx, 16);
757
758	movl $24, %r8d;
759	cmpl $16, key_length(CTX);
760	jne .Lenc_max32;
761
762.Lenc_done:
763	/* load CD for output */
764	vmovdqu 0 * 16(%rcx), %xmm8;
765	vmovdqu 1 * 16(%rcx), %xmm9;
766	vmovdqu 2 * 16(%rcx), %xmm10;
767	vmovdqu 3 * 16(%rcx), %xmm11;
768	vmovdqu 4 * 16(%rcx), %xmm12;
769	vmovdqu 5 * 16(%rcx), %xmm13;
770	vmovdqu 6 * 16(%rcx), %xmm14;
771	vmovdqu 7 * 16(%rcx), %xmm15;
772
773	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
774		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
775		    %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));
776
777	ret;
778
779.align 8
780.Lenc_max32:
781	movl $32, %r8d;
782
783	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
784	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
785	      %xmm15,
786	      ((key_table + (24) * 8) + 0)(CTX),
787	      ((key_table + (24) * 8) + 4)(CTX),
788	      ((key_table + (24) * 8) + 8)(CTX),
789	      ((key_table + (24) * 8) + 12)(CTX));
790
791	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
792		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
793		     %xmm15, %rax, %rcx, 24);
794
795	jmp .Lenc_done;
796
797.align 8
798.type   __camellia_dec_blk16,@function;
799
800__camellia_dec_blk16:
801	/* input:
802	 *	%rdi: ctx, CTX
803	 *	%rax: temporary storage, 256 bytes
804	 *	%r8d: 24 for 16 byte key, 32 for larger
805	 *	%xmm0..%xmm15: 16 encrypted blocks
806	 * output:
807	 *	%xmm0..%xmm15: 16 plaintext blocks, order swapped:
	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
809	 */
810
811	leaq 8 * 16(%rax), %rcx;
812
813	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
814		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
815		      %xmm15, %rax, %rcx);
816
817	cmpl $32, %r8d;
818	je .Ldec_max32;
819
820.Ldec_max24:
821	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
822		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
823		     %xmm15, %rax, %rcx, 16);
824
825	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
826	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
827	      %xmm15,
828	      ((key_table + (16) * 8) + 8)(CTX),
829	      ((key_table + (16) * 8) + 12)(CTX),
830	      ((key_table + (16) * 8) + 0)(CTX),
831	      ((key_table + (16) * 8) + 4)(CTX));
832
833	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
834		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
835		     %xmm15, %rax, %rcx, 8);
836
837	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
838	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
839	      %xmm15,
840	      ((key_table + (8) * 8) + 8)(CTX),
841	      ((key_table + (8) * 8) + 12)(CTX),
842	      ((key_table + (8) * 8) + 0)(CTX),
843	      ((key_table + (8) * 8) + 4)(CTX));
844
845	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
846		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
847		     %xmm15, %rax, %rcx, 0);
848
849	/* load CD for output */
850	vmovdqu 0 * 16(%rcx), %xmm8;
851	vmovdqu 1 * 16(%rcx), %xmm9;
852	vmovdqu 2 * 16(%rcx), %xmm10;
853	vmovdqu 3 * 16(%rcx), %xmm11;
854	vmovdqu 4 * 16(%rcx), %xmm12;
855	vmovdqu 5 * 16(%rcx), %xmm13;
856	vmovdqu 6 * 16(%rcx), %xmm14;
857	vmovdqu 7 * 16(%rcx), %xmm15;
858
859	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
860		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
861		    %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));
862
863	ret;
864
865.align 8
866.Ldec_max32:
867	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
868		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
869		     %xmm15, %rax, %rcx, 24);
870
871	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
872	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
873	      %xmm15,
874	      ((key_table + (24) * 8) + 8)(CTX),
875	      ((key_table + (24) * 8) + 12)(CTX),
876	      ((key_table + (24) * 8) + 0)(CTX),
877	      ((key_table + (24) * 8) + 4)(CTX));
878
879	jmp .Ldec_max24;
880
881.align 8
882.global camellia_ecb_enc_16way
883.type   camellia_ecb_enc_16way,@function;
884
885camellia_ecb_enc_16way:
886	/* input:
887	 *	%rdi: ctx, CTX
888	 *	%rsi: dst (16 blocks)
889	 *	%rdx: src (16 blocks)
890	 */
891
892	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
893		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
894		     %xmm15, %rdx, (key_table)(CTX));
895
896	/* now dst can be used as temporary buffer (even in src == dst case) */
897	movq	%rsi, %rax;
898
899	call __camellia_enc_blk16;
900
901	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
902		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
903		     %xmm8, %rsi);
904
905	ret;
906
907.align 8
908.global camellia_ecb_dec_16way
909.type   camellia_ecb_dec_16way,@function;
910
911camellia_ecb_dec_16way:
912	/* input:
913	 *	%rdi: ctx, CTX
914	 *	%rsi: dst (16 blocks)
915	 *	%rdx: src (16 blocks)
916	 */
917
918	cmpl $16, key_length(CTX);
919	movl $32, %r8d;
920	movl $24, %eax;
	cmovel %eax, %r8d; /* max (24 for 128-bit key, otherwise 32) */
922
923	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
924		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
925		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));
926
927	/* now dst can be used as temporary buffer (even in src == dst case) */
928	movq	%rsi, %rax;
929
930	call __camellia_dec_blk16;
931
932	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
933		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
934		     %xmm8, %rsi);
935
936	ret;
937
938.align 8
939.global camellia_cbc_dec_16way
940.type   camellia_cbc_dec_16way,@function;
941
942camellia_cbc_dec_16way:
943	/* input:
944	 *	%rdi: ctx, CTX
945	 *	%rsi: dst (16 blocks)
946	 *	%rdx: src (16 blocks)
947	 */
948
949	cmpl $16, key_length(CTX);
950	movl $32, %r8d;
951	movl $24, %eax;
	cmovel %eax, %r8d; /* max (24 for 128-bit key, otherwise 32) */
953
954	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
955		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
956		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));
957
958	/*
959	 * dst might still be in-use (in case dst == src), so use stack for
960	 * temporary storage.
961	 */
962	subq $(16 * 16), %rsp;
963	movq %rsp, %rax;
964
965	call __camellia_dec_blk16;
966
967	addq $(16 * 16), %rsp;
968
969	vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
970	vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
971	vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
972	vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
973	vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
974	vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
975	vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
976	vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
977	vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
978	vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
979	vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
980	vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
981	vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
982	vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
983	vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
984	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
985		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
986		     %xmm8, %rsi);
987
988	ret;
989
990#define inc_le128(x, minus_one, tmp) \
991	vpcmpeqq minus_one, x, tmp; \
992	vpsubq minus_one, x, x; \
993	vpslldq $8, tmp, tmp; \
994	vpsubq tmp, x, x;
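/*
 * Reference sketch (illustrative C, not part of the build): inc_le128
 * increments a 128-bit little-endian counter held as two 64-bit lanes,
 * propagating the carry out of the low qword:
 *
 *	lo += 1;
 *	if (lo == 0)
 *		hi += 1;
 */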
995
996.align 8
997.global camellia_ctr_16way
998.type   camellia_ctr_16way,@function;
999
1000camellia_ctr_16way:
1001	/* input:
1002	 *	%rdi: ctx, CTX
1003	 *	%rsi: dst (16 blocks)
1004	 *	%rdx: src (16 blocks)
1005	 *	%rcx: iv (little endian, 128bit)
1006	 */
1007
1008	subq $(16 * 16), %rsp;
1009	movq %rsp, %rax;
1010
1011	vmovdqa .Lbswap128_mask, %xmm14;
1012
1013	/* load IV and byteswap */
1014	vmovdqu (%rcx), %xmm0;
1015	vpshufb %xmm14, %xmm0, %xmm15;
1016	vmovdqu %xmm15, 15 * 16(%rax);
1017
1018	vpcmpeqd %xmm15, %xmm15, %xmm15;
1019	vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */
1020
1021	/* construct IVs */
1022	inc_le128(%xmm0, %xmm15, %xmm13);
1023	vpshufb %xmm14, %xmm0, %xmm13;
1024	vmovdqu %xmm13, 14 * 16(%rax);
1025	inc_le128(%xmm0, %xmm15, %xmm13);
1026	vpshufb %xmm14, %xmm0, %xmm13;
1027	vmovdqu %xmm13, 13 * 16(%rax);
1028	inc_le128(%xmm0, %xmm15, %xmm13);
1029	vpshufb %xmm14, %xmm0, %xmm12;
1030	inc_le128(%xmm0, %xmm15, %xmm13);
1031	vpshufb %xmm14, %xmm0, %xmm11;
1032	inc_le128(%xmm0, %xmm15, %xmm13);
1033	vpshufb %xmm14, %xmm0, %xmm10;
1034	inc_le128(%xmm0, %xmm15, %xmm13);
1035	vpshufb %xmm14, %xmm0, %xmm9;
1036	inc_le128(%xmm0, %xmm15, %xmm13);
1037	vpshufb %xmm14, %xmm0, %xmm8;
1038	inc_le128(%xmm0, %xmm15, %xmm13);
1039	vpshufb %xmm14, %xmm0, %xmm7;
1040	inc_le128(%xmm0, %xmm15, %xmm13);
1041	vpshufb %xmm14, %xmm0, %xmm6;
1042	inc_le128(%xmm0, %xmm15, %xmm13);
1043	vpshufb %xmm14, %xmm0, %xmm5;
1044	inc_le128(%xmm0, %xmm15, %xmm13);
1045	vpshufb %xmm14, %xmm0, %xmm4;
1046	inc_le128(%xmm0, %xmm15, %xmm13);
1047	vpshufb %xmm14, %xmm0, %xmm3;
1048	inc_le128(%xmm0, %xmm15, %xmm13);
1049	vpshufb %xmm14, %xmm0, %xmm2;
1050	inc_le128(%xmm0, %xmm15, %xmm13);
1051	vpshufb %xmm14, %xmm0, %xmm1;
1052	inc_le128(%xmm0, %xmm15, %xmm13);
1053	vmovdqa %xmm0, %xmm13;
1054	vpshufb %xmm14, %xmm0, %xmm0;
1055	inc_le128(%xmm13, %xmm15, %xmm14);
1056	vmovdqu %xmm13, (%rcx);
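	/*
	 * Reference sketch (illustrative; helper names are not real): the
	 * sequence above materializes the 16 big-endian counter blocks
	 * (spilling three of them to the stack at %rax), assigning counter
	 * value iv + i to source/destination block i, and writes iv + 16
	 * back to the caller's IV:
	 *
	 *	for (int i = 0; i < 16; i++)
	 *		ctrblk[i] = bswap128(iv + i);
	 *	*ivp = iv + 16;
	 */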
1057
1058	/* inpack16_pre: */
1059	vmovq (key_table)(CTX), %xmm15;
1060	vpshufb .Lpack_bswap, %xmm15, %xmm15;
1061	vpxor %xmm0, %xmm15, %xmm0;
1062	vpxor %xmm1, %xmm15, %xmm1;
1063	vpxor %xmm2, %xmm15, %xmm2;
1064	vpxor %xmm3, %xmm15, %xmm3;
1065	vpxor %xmm4, %xmm15, %xmm4;
1066	vpxor %xmm5, %xmm15, %xmm5;
1067	vpxor %xmm6, %xmm15, %xmm6;
1068	vpxor %xmm7, %xmm15, %xmm7;
1069	vpxor %xmm8, %xmm15, %xmm8;
1070	vpxor %xmm9, %xmm15, %xmm9;
1071	vpxor %xmm10, %xmm15, %xmm10;
1072	vpxor %xmm11, %xmm15, %xmm11;
1073	vpxor %xmm12, %xmm15, %xmm12;
1074	vpxor 13 * 16(%rax), %xmm15, %xmm13;
1075	vpxor 14 * 16(%rax), %xmm15, %xmm14;
1076	vpxor 15 * 16(%rax), %xmm15, %xmm15;
1077
1078	call __camellia_enc_blk16;
1079
1080	addq $(16 * 16), %rsp;
1081
1082	vpxor 0 * 16(%rdx), %xmm7, %xmm7;
1083	vpxor 1 * 16(%rdx), %xmm6, %xmm6;
1084	vpxor 2 * 16(%rdx), %xmm5, %xmm5;
1085	vpxor 3 * 16(%rdx), %xmm4, %xmm4;
1086	vpxor 4 * 16(%rdx), %xmm3, %xmm3;
1087	vpxor 5 * 16(%rdx), %xmm2, %xmm2;
1088	vpxor 6 * 16(%rdx), %xmm1, %xmm1;
1089	vpxor 7 * 16(%rdx), %xmm0, %xmm0;
1090	vpxor 8 * 16(%rdx), %xmm15, %xmm15;
1091	vpxor 9 * 16(%rdx), %xmm14, %xmm14;
1092	vpxor 10 * 16(%rdx), %xmm13, %xmm13;
1093	vpxor 11 * 16(%rdx), %xmm12, %xmm12;
1094	vpxor 12 * 16(%rdx), %xmm11, %xmm11;
1095	vpxor 13 * 16(%rdx), %xmm10, %xmm10;
1096	vpxor 14 * 16(%rdx), %xmm9, %xmm9;
1097	vpxor 15 * 16(%rdx), %xmm8, %xmm8;
1098	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
1099		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
1100		     %xmm8, %rsi);
1101
1102	ret;
1103