xref: /linux/arch/x86/crypto/camellia-x86_64-asm_64.S (revision fa79e55d467366a2c52c68a261a0d6ea5f8a6534)
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Camellia Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */
7
8#include <linux/linkage.h>
9#include <linux/cfi_types.h>
10
11.file "camellia-x86_64-asm_64.S"
12.text
13
/*
 * Lookup tables used by the round macros. They are declared .extern
 * (defined outside this file) and given shorter aliases; each pair below
 * is the declaration followed by its alias.
 */
.extern camellia_sp10011110
#define sp10011110 camellia_sp10011110

.extern camellia_sp22000222
#define sp22000222 camellia_sp22000222

.extern camellia_sp03303033
#define sp03303033 camellia_sp03303033

.extern camellia_sp00444404
#define sp00444404 camellia_sp00444404

.extern camellia_sp02220222
#define sp02220222 camellia_sp02220222

.extern camellia_sp30333033
#define sp30333033 camellia_sp30333033

.extern camellia_sp44044404
#define sp44044404 camellia_sp44044404

.extern camellia_sp11101110
#define sp11101110 camellia_sp11101110
31
/* Size in bytes of the key table that sits at the start of the context. */
#define CAMELLIA_TABLE_BYTE_LEN 272

/*
 * struct camellia_ctx: byte offsets used as displacements off CTX.
 * key_length immediately follows the key table.
 */
#define key_table 0
#define key_length (key_table + CAMELLIA_TABLE_BYTE_LEN)
37
/*
 * register macros
 *
 * Suffixes: none = 64-bit register, "d" = low 32 bits, "bl" = low byte,
 * "bh" = second byte. Grouped per physical register.
 */

/* context pointer and current input/output pointer */
#define CTX %rdi
#define RIO %rsi
#define RIOd %esi

/* block state: index 0 is the first block, index 1 the second (2-way) */
#define RAB0 %rax
#define RAB0d %eax
#define RAB0bl %al
#define RAB0bh %ah

#define RCD0 %rcx
#define RCD0d %ecx
#define RCD0bl %cl
#define RCD0bh %ch

#define RAB1 %rbx
#define RAB1d %ebx
#define RAB1bl %bl
#define RAB1bh %bh

#define RCD1 %rdx
#define RCD1d %edx
#define RCD1bl %dl
#define RCD1bh %dh

/* temporaries; note that RT0 is the same register as RIO (%rsi) */
#define RT0 %rsi
#define RT0d %esi

#define RT1 %r12
#define RT1d %r12d

#define RT2 %r8
#define RT2d %r8d
#define RT2bl %r8b

/*
 * RXOR holds the "xor" argument in the encrypt functions and is reused
 * as a scratch/save register in the decrypt functions.
 */
#define RXOR %r9
#define RXORd %r9d
#define RXORbl %r9b

/* copy of the caller's %r12 (clobbered as RT1) and of the dst pointer */
#define RR12 %r10
#define RDST %r11
79
/*
 * xor2ror16: XOR two table entries into dst, then leave ab rotated right
 * by 16 bits.
 *
 * The low byte of ab (ab ## bl) indexes table T0 and the second byte
 * (ab ## bh) indexes table T1; both tables are indexed with a scale of 8.
 * The tables are addressed RIP-relative, so their base is loaded with leaq
 * first. tmp1 and tmp2 are clobbered.
 */
#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
	movzbl ab ## bl,		tmp2 ## d; \
	leaq T0(%rip),			tmp1; \
	xorq (tmp1, tmp2, 8),		dst; \
	movzbl ab ## bh,		tmp1 ## d; \
	leaq T1(%rip),			tmp2; \
	rorq $16,			ab; \
	xorq (tmp2, tmp1, 8),		dst;
88
/**********************************************************************
  1-way camellia
 **********************************************************************/
/*
 * roundsm: one round on a single block.
 *
 * The 64-bit subkey number `subkey` is loaded into RT2. The eight bytes of
 * ab ## 0 are then consumed two at a time by xor2ror16; four rotations by
 * 16 bring ab ## 0 back to its original value. Lookups are accumulated
 * alternately into cd ## 0 and RT2, and RT2 is folded into cd ## 0 last.
 *
 * Clobbers RT0, RT1 and RT2.
 */
#define roundsm(ab, subkey, cd) \
	movq (key_table + (subkey) * 8)(CTX),		RT2; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
	xorq RT2,					cd ## 0;
101
/*
 * Helper for fls: x ^= rol32(lo32(x) & lo32(key_table[k]), 1) << 32.
 * The movl zero-extends into tmp, so only the upper half of x changes.
 * Clobbers tmp.
 */
#define fls_and_rol(x, k, tmp) \
	movl (key_table + ((k) * 2) * 4)(CTX),		tmp ## d; \
	andl x ## d,					tmp ## d; \
	roll $1,					tmp ## d; \
	shlq $32,					tmp; \
	xorq tmp,					x;

/*
 * Helper for fls: x ^= (x | key_table[k]) >> 32.
 * Only the lower half of x changes. Clobbers tmp.
 */
#define fls_or_shr(x, k, tmp) \
	movq (key_table + ((k) * 2) * 4)(CTX),		tmp; \
	orq x,						tmp; \
	shrq $32,					tmp; \
	xorq tmp,					x;

/*
 * fls: apply the and/rotate and or/shift steps to the l ## 0 / r ## 0
 * register pair, using subkey kl for l and subkey kr for r.
 *
 * Step order matters: each register's second step reads the half that its
 * first step just updated. Clobbers RT0, RT1 and RT2.
 */
#define fls(l, r, kl, kr) \
	fls_and_rol(l ## 0, kl, RT0) \
	fls_or_shr(r ## 0, kr, RT1) \
	\
	fls_or_shr(l ## 0, kl, RT2) \
	fls_and_rol(r ## 0, kr, RT0)
122
/*
 * enc_rounds: six rounds using subkeys i+2 .. i+7 in ascending order,
 * alternating which of RAB/RCD is the input and which is updated.
 */
#define enc_rounds(i) \
	roundsm(RAB, (i) + 2, RCD); \
	roundsm(RCD, (i) + 3, RAB); \
	roundsm(RAB, (i) + 4, RCD); \
	roundsm(RCD, (i) + 5, RAB); \
	roundsm(RAB, (i) + 6, RCD); \
	roundsm(RCD, (i) + 7, RAB);

/* enc_fls: fls step with subkey i for RAB and subkey i+1 for RCD. */
#define enc_fls(i) \
	fls(RAB, RCD, (i) + 0, (i) + 1);
133
/*
 * enc_inpack: load the 16-byte input block at (RIO) into RAB0/RCD0,
 * byte-swapping each 64-bit half and rotating it by 32 bits, then XOR
 * key_table[0] into RAB0.
 */
#define enc_inpack() \
	movq (RIO),			RAB0; \
	movq 8(RIO),			RCD0; \
	bswapq				RAB0; \
	bswapq				RCD0; \
	rolq $32,			RAB0; \
	rorq $32,			RCD0; \
	xorq key_table(CTX),		RAB0;

/*
 * enc_outunpack: XOR key_table[max] into RCD0, undo the rotate/byte-swap
 * on both halves, and write them to (RIO) with RCD0 first and RAB0 second.
 * `op` is mov (overwrite) or xor (combine with the bytes already there).
 */
#define enc_outunpack(op, max) \
	xorq key_table(CTX, max, 8),	RCD0; \
	rorq $32,			RCD0; \
	rolq $32,			RAB0; \
	bswapq				RCD0; \
	bswapq				RAB0; \
	op ## q RCD0,			(RIO); \
	op ## q RAB0,			8(RIO);
151
/*
 * dec_rounds: six rounds using subkeys i+7 .. i+2 in descending order,
 * i.e. the subkey order of enc_rounds reversed.
 */
#define dec_rounds(i) \
	roundsm(RAB, (i) + 7, RCD); \
	roundsm(RCD, (i) + 6, RAB); \
	roundsm(RAB, (i) + 5, RCD); \
	roundsm(RCD, (i) + 4, RAB); \
	roundsm(RAB, (i) + 3, RCD); \
	roundsm(RCD, (i) + 2, RAB);

/* dec_fls: fls step with subkey i+1 for RAB and subkey i for RCD. */
#define dec_fls(i) \
	fls(RAB, RCD, (i) + 1, (i) + 0);
162
/*
 * dec_inpack: load the 16-byte input block at (RIO) into RAB0/RCD0,
 * byte-swapping each 64-bit half and rotating it by 32 bits, then XOR
 * key_table[max] into RAB0.
 */
#define dec_inpack(max) \
	movq (RIO),			RAB0; \
	movq 8(RIO),			RCD0; \
	bswapq				RAB0; \
	bswapq				RCD0; \
	rolq $32,			RAB0; \
	rorq $32,			RCD0; \
	xorq key_table(CTX, max, 8),	RAB0;

/*
 * dec_outunpack: XOR key_table[0] into RCD0, undo the rotate/byte-swap on
 * both halves, and store them to (RIO) with RCD0 first and RAB0 second.
 */
#define dec_outunpack() \
	xorq key_table(CTX),		RCD0; \
	rorq $32,			RCD0; \
	rolq $32,			RAB0; \
	bswapq				RCD0; \
	bswapq				RAB0; \
	movq RCD0,			(RIO); \
	movq RAB0,			8(RIO);
180
SYM_TYPED_FUNC_START(__camellia_enc_blk)
	/*
	 * Encrypt one 16-byte block.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor (non-zero: XOR the result into dst instead of
	 *	      overwriting it; only the low byte is tested)
	 */
	movq %r12, RR12;	/* %r12 is used as RT1; restored before RET */

	movq %rcx, RXOR;	/* %rcx is about to become RCD0 */
	movq %rsi, RDST;	/* %rsi doubles as RIO and RT0 */
	movq %rdx, RIO;

	enc_inpack();

	enc_rounds(0);
	enc_fls(8);
	enc_rounds(8);
	enc_fls(16);
	enc_rounds(16);
	movl $24, RT1d; /* max */

	/*
	 * key_length is read as a 32-bit value, the same way
	 * camellia_dec_blk reads it; a key length of 16 takes the short path.
	 */
	cmpl $16, key_length(CTX);
	je .L__enc_done;

	enc_fls(24);
	enc_rounds(24);
	movl $32, RT1d; /* max */

.L__enc_done:
	testb RXORbl, RXORbl;
	movq RDST, RIO;		/* movq leaves the flags from testb intact */

	jnz .L__enc_xor;

	enc_outunpack(mov, RT1);

	movq RR12, %r12;
	RET;

.L__enc_xor:
	enc_outunpack(xor, RT1);

	movq RR12, %r12;
	RET;
SYM_FUNC_END(__camellia_enc_blk)
227
SYM_TYPED_FUNC_START(camellia_dec_blk)
	/*
	 * Decrypt one 16-byte block.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	/* max = (key_length == 16) ? 24 : 32; RXOR is only scratch here */
	movl $24, RXORd
	movl $32, RT2d
	cmpl $16, key_length(CTX)
	cmovel RXORd, RT2d

	/* dst must be copied out of %rsi before %rsi is reloaded as RIO */
	movq %rsi, RDST
	movq %rdx, RIO
	movq %r12, RR12		/* %r12 is used as RT1; restored before RET */

	dec_inpack(RT2)

	/* a max of 24 means the rounds using subkeys 24.. are skipped */
	cmpb $24, RT2bl
	je .L__dec_from_rounds16

	dec_rounds(24)
	dec_fls(24)

.L__dec_from_rounds16:
	dec_rounds(16)
	dec_fls(16)
	dec_rounds(8)
	dec_fls(8)
	dec_rounds(0)

	movq RDST, RIO

	dec_outunpack()

	movq RR12, %r12
	RET
SYM_FUNC_END(camellia_dec_blk)
265
/**********************************************************************
  2-way camellia
 **********************************************************************/
/*
 * roundsm2: one round on two blocks at once.
 *
 * The subkey is loaded into RT2 and immediately XORed into cd ## 1, so all
 * four lookups for the second block (ab ## 1) can accumulate straight into
 * cd ## 1. The first block (ab ## 0) accumulates alternately into cd ## 0
 * and RT2, as in the 1-way round; RT2 is folded into cd ## 0 in between
 * the second block's lookups.
 *
 * Clobbers RT0, RT1 and RT2.
 */
#define roundsm2(ab, subkey, cd) \
	movq (key_table + (subkey) * 8)(CTX),		RT2; \
	xorq RT2,					cd ## 1; \
	\
	/* first block */ \
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
		/* second block */ \
		xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
		xorq RT2,					cd ## 0; \
		xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);
283
/*
 * Helper for fls2: x ^= rol32(lo32(x) & lo32(key_table[k]), 1) << 32.
 * The movl zero-extends into tmp, so only the upper half of x changes.
 * Clobbers tmp.
 */
#define fls2_and_rol(x, k, tmp) \
	movl (key_table + ((k) * 2) * 4)(CTX),		tmp ## d; \
	andl x ## d,					tmp ## d; \
	roll $1,					tmp ## d; \
	shlq $32,					tmp; \
	xorq tmp,					x;

/*
 * Helper for fls2: x ^= (x | key_table[k]) >> 32.
 * Only the lower half of x changes. Clobbers tmp.
 */
#define fls2_or_shr(x, k, tmp) \
	movq (key_table + ((k) * 2) * 4)(CTX),		tmp; \
	orq x,						tmp; \
	shrq $32,					tmp; \
	xorq tmp,					x;

/*
 * fls2: the and/rotate and or/shift steps applied to both blocks, with the
 * first block (l ## 0, r ## 0) and the second block (l ## 1, r ## 1)
 * interleaved. Subkey kl is used for l and subkey kr for r. The temporaries
 * rotate between RT0, RT1 and RT2, all of which are clobbered.
 */
#define fls2(l, r, kl, kr) \
	fls2_and_rol(l ## 0, kl, RT0) \
	fls2_or_shr(r ## 0, kr, RT1) \
	\
		fls2_and_rol(l ## 1, kl, RT2) \
		fls2_or_shr(r ## 1, kr, RT0) \
	\
	fls2_or_shr(l ## 0, kl, RT1) \
	fls2_and_rol(r ## 0, kr, RT2) \
	\
		fls2_or_shr(l ## 1, kl, RT0) \
		fls2_and_rol(r ## 1, kr, RT1)
324
/*
 * enc_rounds2: six two-block rounds using subkeys i+2 .. i+7 in ascending
 * order, alternating which of RAB/RCD is the input and which is updated.
 */
#define enc_rounds2(i) \
	roundsm2(RAB, (i) + 2, RCD); \
	roundsm2(RCD, (i) + 3, RAB); \
	roundsm2(RAB, (i) + 4, RCD); \
	roundsm2(RCD, (i) + 5, RAB); \
	roundsm2(RAB, (i) + 6, RCD); \
	roundsm2(RCD, (i) + 7, RAB);

/* enc_fls2: fls2 step with subkey i for RAB and subkey i+1 for RCD. */
#define enc_fls2(i) \
	fls2(RAB, RCD, (i) + 0, (i) + 1);
335
/*
 * enc_inpack2: load two consecutive 16-byte blocks from (RIO) into
 * RAB0/RCD0 and RAB1/RCD1. Each 64-bit half is byte-swapped and rotated by
 * 32 bits, and key_table[0] is XORed into RAB0 and RAB1.
 */
#define enc_inpack2() \
	movq (RIO),			RAB0; \
	movq 8(RIO),			RCD0; \
	bswapq				RAB0; \
	bswapq				RCD0; \
	rorq $32,			RAB0; \
	rolq $32,			RCD0; \
	xorq key_table(CTX),		RAB0; \
	\
		movq 16(RIO),			RAB1; \
		movq 24(RIO),			RCD1; \
		bswapq				RAB1; \
		bswapq				RCD1; \
		rorq $32,			RAB1; \
		rolq $32,			RCD1; \
		xorq key_table(CTX),		RAB1;

/*
 * enc_outunpack2: for each block, XOR key_table[max] into RCD, undo the
 * rotate/byte-swap on both halves, and write RCD then RAB to (RIO).
 * The first block goes to offsets 0/8, the second to 16/24.
 * `op` is mov (overwrite) or xor (combine with the bytes already there).
 */
#define enc_outunpack2(op, max) \
	xorq key_table(CTX, max, 8),	RCD0; \
	rolq $32,			RCD0; \
	rorq $32,			RAB0; \
	bswapq				RCD0; \
	bswapq				RAB0; \
	op ## q RCD0,			(RIO); \
	op ## q RAB0,			8(RIO); \
	\
		xorq key_table(CTX, max, 8),	RCD1; \
		rolq $32,			RCD1; \
		rorq $32,			RAB1; \
		bswapq				RCD1; \
		bswapq				RAB1; \
		op ## q RCD1,			16(RIO); \
		op ## q RAB1,			24(RIO);
369
/*
 * dec_rounds2: six two-block rounds using subkeys i+7 .. i+2 in descending
 * order, i.e. the subkey order of enc_rounds2 reversed.
 */
#define dec_rounds2(i) \
	roundsm2(RAB, (i) + 7, RCD); \
	roundsm2(RCD, (i) + 6, RAB); \
	roundsm2(RAB, (i) + 5, RCD); \
	roundsm2(RCD, (i) + 4, RAB); \
	roundsm2(RAB, (i) + 3, RCD); \
	roundsm2(RCD, (i) + 2, RAB);

/* dec_fls2: fls2 step with subkey i+1 for RAB and subkey i for RCD. */
#define dec_fls2(i) \
	fls2(RAB, RCD, (i) + 1, (i) + 0);
380
/*
 * dec_inpack2: load two consecutive 16-byte blocks from (RIO) into
 * RAB0/RCD0 and RAB1/RCD1. Each 64-bit half is byte-swapped and rotated by
 * 32 bits, and key_table[max] is XORed into RAB0 and RAB1.
 */
#define dec_inpack2(max) \
	movq (RIO),			RAB0; \
	movq 8(RIO),			RCD0; \
	bswapq				RAB0; \
	bswapq				RCD0; \
	rorq $32,			RAB0; \
	rolq $32,			RCD0; \
	xorq key_table(CTX, max, 8),	RAB0; \
	\
		movq 16(RIO),			RAB1; \
		movq 24(RIO),			RCD1; \
		bswapq				RAB1; \
		bswapq				RCD1; \
		rorq $32,			RAB1; \
		rolq $32,			RCD1; \
		xorq key_table(CTX, max, 8),	RAB1;

/*
 * dec_outunpack2: for each block, XOR key_table[0] into RCD, undo the
 * rotate/byte-swap on both halves, and store RCD then RAB to (RIO).
 * The first block goes to offsets 0/8, the second to 16/24.
 */
#define dec_outunpack2() \
	xorq key_table(CTX),		RCD0; \
	rolq $32,			RCD0; \
	rorq $32,			RAB0; \
	bswapq				RCD0; \
	bswapq				RAB0; \
	movq RCD0,			(RIO); \
	movq RAB0,			8(RIO); \
	\
		xorq key_table(CTX),		RCD1; \
		rolq $32,			RCD1; \
		rorq $32,			RAB1; \
		bswapq				RCD1; \
		bswapq				RAB1; \
		movq RCD1,			16(RIO); \
		movq RAB1,			24(RIO);
414
SYM_TYPED_FUNC_START(__camellia_enc_blk_2way)
	/*
	 * Encrypt two consecutive 16-byte blocks.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor (non-zero: XOR the result into dst instead of
	 *	      overwriting it; only the low byte is tested)
	 */
	pushq %rbx;		/* %rbx is used as RAB1; popped before RET */

	movq %r12, RR12;	/* %r12 is used as RT1; restored before RET */
	movq %rcx, RXOR;	/* %rcx is about to become RCD0 */
	movq %rsi, RDST;	/* %rsi doubles as RIO and RT0 */
	movq %rdx, RIO;		/* src is consumed before %rdx becomes RCD1 */

	enc_inpack2();

	enc_rounds2(0);
	enc_fls2(8);
	enc_rounds2(8);
	enc_fls2(16);
	enc_rounds2(16);
	movl $24, RT2d; /* max */

	/*
	 * key_length is read as a 32-bit value, the same way
	 * camellia_dec_blk_2way reads it; a key length of 16 takes the
	 * short path.
	 */
	cmpl $16, key_length(CTX);
	je .L__enc2_done;

	enc_fls2(24);
	enc_rounds2(24);
	movl $32, RT2d; /* max */

.L__enc2_done:
	testb RXORbl, RXORbl;
	movq RDST, RIO;		/* movq leaves the flags from testb intact */
	jnz .L__enc2_xor;

	enc_outunpack2(mov, RT2);

	movq RR12, %r12;
	popq %rbx;
	RET;

.L__enc2_xor:
	enc_outunpack2(xor, RT2);

	movq RR12, %r12;
	popq %rbx;
	RET;
SYM_FUNC_END(__camellia_enc_blk_2way)
463
SYM_TYPED_FUNC_START(camellia_dec_blk_2way)
	/*
	 * Decrypt two consecutive 16-byte blocks.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	/* max = (key_length == 16) ? 24 : 32; RXOR is scratch for the 24 */
	movl $24, RXORd
	movl $32, RT2d
	cmpl $16, key_length(CTX)
	cmovel RXORd, RT2d

	/*
	 * From here on RXOR keeps the caller's %rbx (used as RAB1) and RR12
	 * keeps the caller's %r12 (used as RT1). dst must be copied out of
	 * %rsi before %rsi is reloaded as RIO.
	 */
	movq %rbx, RXOR
	movq %rsi, RDST
	movq %rdx, RIO
	movq %r12, RR12

	dec_inpack2(RT2)

	/* a max of 24 means the rounds using subkeys 24.. are skipped */
	cmpb $24, RT2bl
	je .L__dec2_from_rounds16

	dec_rounds2(24)
	dec_fls2(24)

.L__dec2_from_rounds16:
	dec_rounds2(16)
	dec_fls2(16)
	dec_rounds2(8)
	dec_fls2(8)
	dec_rounds2(0)

	movq RDST, RIO

	dec_outunpack2()

	movq RR12, %r12
	movq RXOR, %rbx
	RET
SYM_FUNC_END(camellia_dec_blk_2way)
503