xref: /linux/arch/x86/crypto/blowfish-x86_64-asm_64.S (revision 320fefa9e2edc67011e235ea1d50f0d00ddfe004)
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * Blowfish Cipher Algorithm (x86_64)
4 *
5 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
6 */
7
8#include <linux/linkage.h>
9#include <linux/cfi_types.h>
10
11.file "blowfish-x86_64-asm.S"
12.text
13
/*
 * Byte offsets into the crypto context.
 *
 * The layout used by this file is (16 + 2) 32-bit subkey words starting at
 * offset 0 (p), followed by four consecutive lookup tables (s0..s3) of 256
 * 32-bit words each.
 */
#define BF_WORD_BYTES	4
#define BF_SUBKEY_WORDS	(16 + 2)
#define BF_TABLE_WORDS	256
#define BF_TABLE_OFF(i)	((BF_SUBKEY_WORDS + ((i) * BF_TABLE_WORDS)) * BF_WORD_BYTES)

#define p	0
#define s0	BF_TABLE_OFF(0)
#define s1	BF_TABLE_OFF(1)
#define s2	BF_TABLE_OFF(2)
#define s3	BF_TABLE_OFF(3)
20
/* register macros */

/*
 * CTX: base register for every p/s0..s3 access. Each entry point saves the
 *      incoming %r12, copies %rdi (ctx) into it and restores %r12 before
 *      returning.
 * RIO: pointer used by the read/write/xor block macros. It is the same
 *      register as RT1, so its value does not survive F()/F4(); the entry
 *      points keep the dst pointer in another register meanwhile.
 */
#define CTX %r12
#define RIO %rsi

/* 64-bit block registers (one block each; the 1-way code only uses RX0) */
#define RX0 %rax
#define RX1 %rbx
#define RX2 %rcx
#define RX3 %rdx

/* 32-bit views of the block registers */
#define RX0d %eax
#define RX1d %ebx
#define RX2d %ecx
#define RX3d %edx

/* bits 7-0 of the block registers */
#define RX0bl %al
#define RX1bl %bl
#define RX2bl %cl
#define RX3bl %dl

/* bits 15-8 of the block registers */
#define RX0bh %ah
#define RX1bh %bh
#define RX2bh %ch
#define RX3bh %dh

/*
 * Temporaries used for table indices and the F-function result.
 *
 * F() and F4() only ever move a high-byte register (RXnbh) into RT0d or RT1d;
 * RT2d/RT3d are only loaded from the low-byte registers.
 * NOTE(review): presumably because %ah..%dh cannot be encoded together with
 * %r8d/%r9d - keep this pairing when touching F()/F4().
 */
#define RT0 %rdi
#define RT1 %rsi
#define RT2 %r8
#define RT3 %r9

#define RT0d %edi
#define RT1d %esi
#define RT2d %r8d
#define RT3d %r9d

/*
 * RKEY: preloaded round-key pair for the 4-way code. The 1-way entry points
 * do not use RKEY and keep the dst pointer in %r10 instead.
 */
#define RKEY %r10
56
57/***********************************************************************
58 * 1-way blowfish
59 ***********************************************************************/
/*
 * F(): one Feistel step on the 64-bit value in RX0.
 *
 * With lo = RX0[31:0] and hi = RX0[63:32] on entry, and the bytes of lo named
 * a (bits 31-24), b (23-16), c (15-8) and d (7-0):
 *
 *	f   = ((s0[a] + s1[b]) ^ s2[c]) + s3[d]		(32-bit arithmetic)
 *	RX0 = lo:(hi ^ f)		(halves swapped, f folded into the low half)
 *
 * The rorq/rolq $16 pair temporarily brings a and b into the high/low byte
 * registers and then restores RX0; the rolq $32 performs the half swap.
 * RT0d is only written with 32-bit operations, so the final xorq leaves
 * RX0[63:32] unchanged.
 *
 * Clobbers RT0, RT1 (the same register as RIO) and RT2.
 */
#define F() \
	rorq $16,		RX0; \
	movzbl RX0bh,		RT0d; \
	movzbl RX0bl,		RT1d; \
	rolq $16,		RX0; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT1,4),	RT0d; \
	movzbl RX0bh,		RT1d; \
	movzbl RX0bl,		RT2d; \
	rolq $32,		RX0; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT2,4),	RT0d; \
	xorq RT0,		RX0;
73
/*
 * add_roundkey_enc(n): XOR subkey words n and n+1 into RX0 with a single
 * 64-bit memory operand: word n is applied to RX0[31:0] and word n+1 to
 * RX0[63:32].
 */
#define add_roundkey_enc(n) \
	xorq p+4*(n)(CTX),	RX0;

/* round_enc(n): key addition with words n/n+1, then two F() steps. */
#define round_enc(n) \
	add_roundkey_enc(n); \
	\
	F(); \
	F();

/*
 * add_roundkey_dec(n): like add_roundkey_enc() but with the word pair
 * exchanged by the rorq: word n is applied to RX0[31:0] and word n-1 to
 * RX0[63:32]. The argument is parenthesized so that expression arguments
 * expand as intended. Clobbers RT0.
 */
#define add_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RT0; \
	rorq $32,		RT0; \
	xorq RT0,		RX0;

/*
 * round_dec(n): key addition with words n/n-1, then two F() steps.
 * The last line deliberately carries no continuation backslash, so the
 * macro body ends here regardless of what follows the definition.
 */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(); \
	F();
93
/*
 * read_block(): load the 8-byte block at (RIO) into RX0. After the rotate
 * and byte swap, the first four bytes of the block sit in RX0[31:0] and the
 * last four in RX0[63:32], each interpreted as a big-endian 32-bit word.
 */
#define read_block() \
	movq (RIO), 		RX0; \
	rorq $32, 		RX0; \
	bswapq 			RX0;

/*
 * write_block(): store RX0 to the 8 bytes at (RIO). There is no rotate here,
 * so RX0[63:32] ends up big-endian in the first four bytes and RX0[31:0]
 * big-endian in the last four. RX0 is left byte-swapped.
 */
#define write_block() \
	bswapq 			RX0; \
	movq RX0, 		(RIO);

/*
 * xor_block(): same byte order as write_block(), but the result is XORed
 * into the 8 bytes at (RIO) instead of overwriting them.
 */
#define xor_block() \
	bswapq 			RX0; \
	xorq RX0, 		(RIO);
106
/*
 * __blowfish_enc_blk: encrypt one 8-byte block.
 *
 * Arguments:
 *	%rdi - ctx
 *	%rsi - dst
 *	%rdx - src
 *	%rcx - bool; when its low byte is non-zero the result is XORed into
 *	       dst instead of being stored there
 *
 * dst is parked in %r10 because F() overwrites %rsi, and the caller's %r12
 * is parked in %r11 while %r12 serves as CTX. %rcx is not written by the
 * 1-way macros, so the flag is still intact when it is tested at the end.
 */
SYM_FUNC_START(__blowfish_enc_blk)
	movq %rsi, %r10;
	movq %rdx, RIO;
	movq %r12, %r11;
	movq %rdi, CTX;

	read_block();

	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	/* CTX is no longer needed: give %r12 back and point RIO at dst */
	movq %r10, RIO;
	movq %r11, %r12;

	testb %cl, %cl;
	jnz .L__enc_blk_xor;

	write_block();
	RET;

.L__enc_blk_xor:
	xor_block();
	RET;
SYM_FUNC_END(__blowfish_enc_blk)
144
/*
 * blowfish_dec_blk: decrypt one 8-byte block.
 *
 * Arguments:
 *	%rdi - ctx
 *	%rsi - dst
 *	%rdx - src
 *
 * The subkey words are applied in descending order (17 down to 0). dst is
 * parked in %r10 because F() overwrites %rsi, and the caller's %r12 is
 * parked in %r11 while %r12 serves as CTX.
 */
SYM_TYPED_FUNC_START(blowfish_dec_blk)
	movq %rsi, %r10;
	movq %rdx, RIO;
	movq %r12, %r11;
	movq %rdi, CTX;

	read_block();

	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	/* CTX is no longer needed: give %r12 back and point RIO at dst */
	movq %r11, %r12;
	movq %r10, RIO;

	write_block();
	RET;
SYM_FUNC_END(blowfish_dec_blk)
176
/**********************************************************************
 * 4-way blowfish, four blocks in parallel
 **********************************************************************/
180
/* F() for 4-way. Slower when used alone/1-way, but faster when used
 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * F4(x) performs the same step as F() on block register x: with
 * lo = x[31:0], hi = x[63:32] and the bytes of lo named a (bits 31-24),
 * b (23-16), c (15-8) and d (7-0):
 *
 *	f = ((s0[a] + s1[b]) ^ s2[c]) + s3[d]		(32-bit arithmetic)
 *	x = lo:(hi ^ f)
 *
 * All four table indices are extracted before the first table access; the
 * two rorq $16 together rotate x by 32 bits, which is the half swap.
 * x must be one of RX0..RX3 because the macro pastes "bh"/"bl" onto its
 * name. Clobbers RT0, RT1 (the same register as RIO), RT2 and RT3.
 */
#define F4(x) \
	movzbl x ## bh,		RT1d; \
	movzbl x ## bl,		RT3d; \
	rorq $16,		x; \
	movzbl x ## bh,		RT0d; \
	movzbl x ## bl,		RT2d; \
	rorq $16,		x; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT2,4),	RT0d; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT3,4),	RT0d; \
	xorq RT0,		x;
196
/* XOR the round-key pair currently held in RKEY into all four blocks. */
#define add_preloaded_roundkey4() \
	xorq RKEY,		RX0; \
	xorq RKEY,		RX1; \
	xorq RKEY,		RX2; \
	xorq RKEY,		RX3;

/* One F4() step on each block register, in the order RX0..RX3. */
#define F4_each_block() \
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/*
 * Encryption direction.
 *
 * preload_roundkey_enc(n) fetches subkey words n (low half) and n+1 (high
 * half) into RKEY. add_roundkey_enc4(n) applies whatever pair is already in
 * RKEY - the caller must have preloaded the pair for n - and then fetches
 * the pair for n + 2, ready for the next round.
 */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX),	RKEY;

#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc(n + 2);

/* round_enc4(n): key addition, then two F4() passes over all four blocks. */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4_each_block(); \
	\
	F4_each_block();

/*
 * Decryption direction.
 *
 * preload_roundkey_dec(n) fetches subkey words n-1 and n and rotates them so
 * that word n sits in the low half of RKEY and word n-1 in the high half.
 * add_roundkey_dec4(n) applies the pair already in RKEY and then fetches the
 * pair for n - 2.
 */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RKEY; \
	rorq $32,		RKEY;

#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec(n - 2);

/* round_dec4(n): key addition, then two F4() passes over all four blocks. */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4_each_block(); \
	\
	F4_each_block();
243
/*
 * Per-block helpers; "mem" is a complete memory operand such as 8(RIO).
 *
 * read_block_to():    load 8 bytes; the first four end up big-endian in
 *                     x[31:0], the last four big-endian in x[63:32].
 * write_block_from(): byte-swap x and store it (x[63:32] comes out first).
 * xor_block_from():   byte-swap x and XOR it into memory.
 */
#define read_block_to(mem, x) \
	movq mem,		x; \
	rorq $32,		x; \
	bswapq			x;

#define write_block_from(x, mem) \
	bswapq			x; \
	movq x,			mem;

#define xor_block_from(x, mem) \
	bswapq			x; \
	xorq x,			mem;

/* Load the four consecutive blocks at RIO into RX0..RX3. */
#define read_block4() \
	read_block_to((RIO), RX0); \
	\
	read_block_to(8(RIO), RX1); \
	\
	read_block_to(16(RIO), RX2); \
	\
	read_block_to(24(RIO), RX3);

/* Store RX0..RX3 to the four consecutive blocks at RIO. */
#define write_block4() \
	write_block_from(RX0, (RIO)); \
	\
	write_block_from(RX1, 8(RIO)); \
	\
	write_block_from(RX2, 16(RIO)); \
	\
	write_block_from(RX3, 24(RIO));

/* XOR RX0..RX3 into the four consecutive blocks at RIO. */
#define xor_block4() \
	xor_block_from(RX0, (RIO)); \
	\
	xor_block_from(RX1, 8(RIO)); \
	\
	xor_block_from(RX2, 16(RIO)); \
	\
	xor_block_from(RX3, 24(RIO));
286
/*
 * __blowfish_enc_blk_4way: encrypt four consecutive 8-byte blocks.
 *
 * Arguments:
 *	%rdi - ctx
 *	%rsi - dst
 *	%rdx - src
 *	%rcx - bool; when its low byte is non-zero the results are XORed into
 *	       dst instead of being stored there
 *
 * %r12 (CTX) and %rbx (RX1) are saved on the stack. %rcx doubles as RX2, so
 * the flag is pushed as well and popped back once the rounds are done.
 * dst is parked in %r11 because F4() overwrites %rsi.
 */
SYM_FUNC_START(__blowfish_enc_blk_4way)
	pushq %r12;
	pushq %rbx;
	pushq %rcx;

	movq %rsi, %r11;
	movq %rdx, RIO;
	movq %rdi, CTX;

	preload_roundkey_enc(0);

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	add_preloaded_roundkey4();

	/* CTX is dead from here on: reuse %r12 for the saved xor flag */
	movq %r11, RIO;
	popq %r12;

	testb %r12b, %r12b;
	jnz .L__enc_blk4_xor;

	write_block4();

	popq %rbx;
	popq %r12;
	RET;

.L__enc_blk4_xor:
	xor_block4();

	popq %rbx;
	popq %r12;
	RET;
SYM_FUNC_END(__blowfish_enc_blk_4way)
335
/*
 * blowfish_dec_blk_4way: decrypt four consecutive 8-byte blocks.
 *
 * Arguments:
 *	%rdi - ctx
 *	%rsi - dst
 *	%rdx - src
 *
 * %r12 (CTX) and %rbx (RX1) are saved on the stack; dst is parked in %r11
 * because F4() overwrites %rsi. The subkey pairs are applied in descending
 * order, each one preloaded into RKEY one round ahead.
 */
SYM_TYPED_FUNC_START(blowfish_dec_blk_4way)
	pushq %r12;
	pushq %rbx;

	movq %rsi, %r11;
	movq %rdx, RIO;
	movq %rdi, CTX;

	preload_roundkey_dec(17);

	read_block4();

	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	add_preloaded_roundkey4();

	movq %r11, RIO;
	write_block4();

	popq %rbx;
	popq %r12;
	RET;
SYM_FUNC_END(blowfish_dec_blk_4way)
370