xref: /linux/arch/powerpc/crypto/sha1-spe-asm.S (revision 2a52ca7c98960aafb0eca9ef96b2d0c932171357)
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Fast SHA-1 implementation for SPE instruction set (PPC)
 *
 * This code makes use of the SPE SIMD instruction set as defined in
 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
 * Implementation is based on optimization guide notes from
 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
 *
 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
 */
12
13#include <asm/ppc_asm.h>
14#include <asm/asm-offsets.h>
15
/*
 * Register roles.
 *
 * r3 and r4 are read as the hash-state and input pointers. r5 is moved to
 * CTR by the entry code and is reused as rKP afterwards. r14-r23 are the
 * registers that INITIALIZE spills and FINALIZE reloads; the remaining
 * aliases (r0, r6-r12) are used without being saved.
 */
#define rHP	r3	/* pointer to hash value			*/
#define rWP	r4	/* pointer to input				*/
#define rKP	r5	/* pointer to constants				*/

#define rW0	r14	/* 64 bit round words				*/
#define rW1	r15
#define rW2	r16
#define rW3	r17
#define rW4	r18
#define rW5	r19
#define rW6	r20
#define rW7	r21

#define rH0	r6	/* 32 bit hash values				*/
#define rH1	r7
#define rH2	r8
#define rH3	r9
#define rH4	r10

#define rT0	r22	/* 64 bit temporary				*/
#define rT1	r0	/* 32 bit temporaries				*/
#define rT2	r11
#define rT3	r12

#define rK	r23	/* 64 bit constant in non-volatile register	*/
41
/*
 * LOAD_K<k>1: refresh rK with a round constant.
 *
 * The round macros build the macro name from their k argument by token
 * pasting, so k = 0 expands to nothing (rK is kept) and k = 1..4 loads the
 * 32 bit word at rKP + 4 * (k - 1) and replicates it into both halves of rK.
 */
#define LOAD_K01

#define LOAD_K11 \
	evlwwsplat	rK,0(rKP);

#define LOAD_K21 \
	evlwwsplat	rK,4(rKP);

#define LOAD_K31 \
	evlwwsplat	rK,8(rKP);

#define LOAD_K41 \
	evlwwsplat	rK,12(rKP);
55
/*
 * INITIALIZE: open a 128 byte stack frame and spill r14-r23.
 *
 * Each register is stored as a full 64 bit value (evstdw) into its own
 * 8 byte slot, 8(r1) up to 80(r1), so the upper SPE halves are preserved
 * together with the ordinary 32 bit contents. FINALIZE is the counterpart.
 */
#define INITIALIZE \
	stwu		r1,-128(r1);	/* create stack frame		*/ \
	evstdw		r14,8(r1);	/* We must save non volatile	*/ \
	evstdw		r15,16(r1);	/* registers. Take the chance	*/ \
	evstdw		r16,24(r1);	/* and save the SPE part too	*/ \
	evstdw		r17,32(r1);					   \
	evstdw		r18,40(r1);					   \
	evstdw		r19,48(r1);					   \
	evstdw		r20,56(r1);					   \
	evstdw		r21,64(r1);					   \
	evstdw		r22,72(r1);					   \
	evstdw		r23,80(r1);
68
69
/*
 * FINALIZE: reload r14-r23 from the slots written by INITIALIZE, wipe the
 * first word of every slot and release the 128 byte frame.
 *
 * Only one 32 bit word per 8 byte slot is cleared (offsets 8, 16, ... 80);
 * the word at offset + 4 is left as stored. r0 is zero on exit.
 */
#define FINALIZE \
	evldw		r14,8(r1);	/* restore SPE registers	*/ \
	evldw		r15,16(r1);					   \
	evldw		r16,24(r1);					   \
	evldw		r17,32(r1);					   \
	evldw		r18,40(r1);					   \
	evldw		r19,48(r1);					   \
	evldw		r20,56(r1);					   \
	evldw		r21,64(r1);					   \
	evldw		r22,72(r1);					   \
	evldw		r23,80(r1);					   \
	li		r0,0;		/* zero without reading old r0	*/ \
	stw		r0,8(r1);	/* Delete sensitive data	*/ \
	stw		r0,16(r1);	/* that we might have pushed	*/ \
	stw		r0,24(r1);	/* from other context that runs	*/ \
	stw		r0,32(r1);	/* the same code. Assume that	*/ \
	stw		r0,40(r1);	/* the lower part of the GPRs	*/ \
	stw		r0,48(r1);	/* were already overwritten on	*/ \
	stw		r0,56(r1);	/* the way down to here		*/ \
	stw		r0,64(r1);					   \
	stw		r0,72(r1);					   \
	stw		r0,80(r1);					   \
	addi		r1,r1,128;	/* cleanup stack frame		*/
93
/*
 * LOAD_DATA(reg, off): fetch one 32 bit message word into reg.
 * NEXT_BLOCK: step rWP to the following 64 byte block.
 *
 * Big endian:    plain load from off(rWP); rWP only moves in NEXT_BLOCK.
 * Little endian: byte-reversed load from rWP, which is advanced by 4 after
 *                every word. off is ignored here, so the loads must be
 *                issued in ascending word order, and NEXT_BLOCK is empty
 *                because sixteen loads have already moved rWP by 64.
 */
#ifdef __BIG_ENDIAN__
#define LOAD_DATA(reg, off) \
	lwz		reg,off(rWP);	/* load data			*/
#define NEXT_BLOCK \
	addi		rWP,rWP,64;	/* increment per block		*/
#else
#define LOAD_DATA(reg, off) \
	lwbrx		reg,0,rWP;	/* load data			*/ \
	addi		rWP,rWP,4;	/* increment per word		*/
#define NEXT_BLOCK			/* nothing to do		*/
#endif
105
/*
 * R_00_15: two consecutive rounds out of 0..15, message words taken
 * straight from the input. F = (B and C) or (~B and D).
 *
 * a..e  working variables A..E on entry. The first round accumulates into
 *       e and rotates b; the second accumulates into d and rotates a. The
 *       next invocation therefore has to be called with (d, e, a, b, c).
 *       Comments tagged "2:" name the variables by their role in the
 *       second round, not by macro argument.
 * w0    scratch for the first message word (32 bit).
 * w1    receives the second message word; on exit its upper half holds the
 *       second word and its lower half the first one (evmergelo).
 * k     selects the LOAD_K<k>1 expansion (0 = keep rK).
 * off   byte offset of the first word, handed to LOAD_DATA.
 *
 * Clobbers rT0, rT1, rT2. The two rounds are interleaved on purpose; keep
 * the instruction order when editing.
 */
#define	R_00_15(a, b, c, d, e, w0, w1, k, off) \
	LOAD_DATA(w0, off)		/* 1: W				*/ \
	and		rT2,b,c;	/* 1: F' = B and C		*/ \
	LOAD_K##k##1							   \
	andc		rT1,d,b;	/* 1: F" = ~B and D		*/ \
	rotrwi		rT0,a,27;	/* 1: A' = A rotl 5		*/ \
	or		rT2,rT2,rT1;	/* 1: F = F' or F"		*/ \
	add		e,e,rT0;	/* 1: E = E + A'		*/ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
	add		e,e,w0;		/* 1: E = E + W			*/ \
	LOAD_DATA(w1, off+4)		/* 2: W				*/ \
	add		e,e,rT2;	/* 1: E = E + F			*/ \
	and		rT1,a,b;	/* 2: F' = B and C		*/ \
	add		e,e,rK;		/* 1: E = E + K			*/ \
	andc		rT2,c,a;	/* 2: F" = ~B and D		*/ \
	add		d,d,rK;		/* 2: E = E + K			*/ \
	or		rT2,rT2,rT1;	/* 2: F = F' or F"		*/ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
	add		d,d,w1;		/* 2: E = E + W			*/ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
	add		d,d,rT0;	/* 2: E = E + A'		*/ \
	evmergelo	w1,w1,w0;	/*    mix W[0]/W[1]		*/ \
	add		d,d,rT2		/* 2: E = E + F			*/
129
/*
 * R_16_19: two consecutive rounds out of 16..19. Same F as rounds 0..15,
 * F = (B and C) or (~B and D), but the two message words are produced by
 * the schedule W = (W[-16] xor W[-3] xor W[-8] xor W[-14]) rotl 1, computed
 * for both words at once in the 64 bit SPE registers.
 *
 * a..e    working variables, used as in R_00_15 (first round adds into e
 *         and rotates b, second adds into d and rotates a).
 * w0      in: word pair W[-16]; out: the newly computed word pair.
 * w1      word pair W[-14].
 * w4      word pair W[-8].
 * w6, w7  neighbouring pairs from which evmergelohi assembles W[-3].
 * k       selects LOAD_K<k>1. The load is placed after W + K has been
 *         formed, so a new constant first takes effect in the following
 *         macro invocation.
 *
 * The lower half of W + K feeds the first round, the upper half (moved down
 * by evmergehi) the second. Clobbers rT0, rT1, rT2. Instruction order is
 * hand-scheduled; keep it when editing.
 */
#define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	and		rT2,b,c;	/* 1: F' = B and C		*/ \
	evmergelohi	rT0,w7,w6;	/*    W[-3]			*/ \
	andc		rT1,d,b;	/* 1: F" = ~B and D		*/ \
	evxor		w0,w0,rT0;	/*    W = W[-16] xor W[-3]	*/ \
	or		rT1,rT1,rT2;	/* 1: F = F' or F"		*/ \
	evxor		w0,w0,w4;	/*    W = W xor W[-8]		*/ \
	add		e,e,rT1;	/* 1: E = E + F			*/ \
	evxor		w0,w0,w1;	/*    W = W xor W[-14]		*/ \
	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
	evrlwi		w0,w0,1;	/*    W = W rotl 1		*/ \
	add		e,e,rT2;	/* 1: E = E + A'		*/ \
	evaddw		rT0,w0,rK;	/*    WK = W + K		*/ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
	LOAD_K##k##1							   \
	evmergehi	rT1,rT1,rT0;	/*    WK1/WK2			*/ \
	add		e,e,rT0;	/* 1: E = E + WK		*/ \
	add		d,d,rT1;	/* 2: E = E + WK		*/ \
	and		rT2,a,b;	/* 2: F' = B and C		*/ \
	andc		rT1,c,a;	/* 2: F" = ~B and D		*/ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
	or		rT1,rT1,rT2;	/* 2: F = F' or F"		*/ \
	add		d,d,rT0;	/* 2: E = E + A'		*/ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
	add		d,d,rT1		/* 2: E = E + F			*/
155
/*
 * R_20_39: two consecutive rounds out of 20..39, F = B xor C xor D.
 *
 * The message schedule and the argument meaning are the same as in R_16_19:
 * a..e are the working variables (first round adds into e and rotates b,
 * second adds into d and rotates a), w0 is replaced by the new word pair
 * built from w0 (W[-16]), w1 (W[-14]), w4 (W[-8]) and w6/w7 (W[-3]), and k
 * selects the LOAD_K<k>1 expansion that runs after W + K has been formed.
 *
 * Clobbers rT0, rT1, rT2. Instruction order is hand-scheduled; keep it when
 * editing.
 */
#define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	evmergelohi	rT0,w7,w6;	/*    W[-3]			*/ \
	xor		rT2,b,c;	/* 1: F' = B xor C		*/ \
	evxor		w0,w0,rT0;	/*    W = W[-16] xor W[-3]	*/ \
	xor		rT2,rT2,d;	/* 1: F = F' xor D		*/ \
	evxor		w0,w0,w4;	/*    W = W xor W[-8]		*/ \
	add		e,e,rT2;	/* 1: E = E + F			*/ \
	evxor		w0,w0,w1;	/*    W = W xor W[-14]		*/ \
	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
	evrlwi		w0,w0,1;	/*    W = W rotl 1		*/ \
	add		e,e,rT2;	/* 1: E = E + A'		*/ \
	evaddw		rT0,w0,rK;	/*    WK = W + K		*/ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
	LOAD_K##k##1							   \
	evmergehi	rT1,rT1,rT0;	/*    WK1/WK2			*/ \
	add		e,e,rT0;	/* 1: E = E + WK		*/ \
	xor		rT2,a,b;	/* 2: F' = B xor C		*/ \
	add		d,d,rT1;	/* 2: E = E + WK		*/ \
	xor		rT2,rT2,c;	/* 2: F = F' xor D		*/ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
	add		d,d,rT2;	/* 2: E = E + F			*/ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
	add		d,d,rT0		/* 2: E = E + A'		*/
179
/*
 * R_40_59: two consecutive rounds out of 40..59,
 * F = (B and C) or ((B or C) and D).
 *
 * The message schedule and the argument meaning are the same as in R_16_19:
 * a..e are the working variables (first round adds into e and rotates b,
 * second adds into d and rotates a), w0 is replaced by the new word pair
 * built from w0 (W[-16]), w1 (W[-14]), w4 (W[-8]) and w6/w7 (W[-3]), and k
 * selects the LOAD_K<k>1 expansion that runs after W + K has been formed.
 *
 * In the second round rT0 doubles as the F" temporary once its W + K
 * content has been consumed. Clobbers rT0, rT1, rT2. Instruction order is
 * hand-scheduled; keep it when editing.
 */
#define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	and		rT2,b,c;	/* 1: F' = B and C		*/ \
	evmergelohi	rT0,w7,w6;	/*    W[-3]			*/ \
	or		rT1,b,c;	/* 1: F" = B or C		*/ \
	evxor		w0,w0,rT0;	/*    W = W[-16] xor W[-3]	*/ \
	and		rT1,d,rT1;	/* 1: F" = F" and D		*/ \
	evxor		w0,w0,w4;	/*    W = W xor W[-8]		*/ \
	or		rT2,rT2,rT1;	/* 1: F = F' or F"		*/ \
	evxor		w0,w0,w1;	/*    W = W xor W[-14]		*/ \
	add		e,e,rT2;	/* 1: E = E + F			*/ \
	evrlwi		w0,w0,1;	/*    W = W rotl 1		*/ \
	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
	evaddw		rT0,w0,rK;	/*    WK = W + K		*/ \
	add		e,e,rT2;	/* 1: E = E + A'		*/ \
	LOAD_K##k##1							   \
	evmergehi	rT1,rT1,rT0;	/*    WK1/WK2			*/ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
	add		e,e,rT0;	/* 1: E = E + WK		*/ \
	and		rT2,a,b;	/* 2: F' = B and C		*/ \
	or		rT0,a,b;	/* 2: F" = B or C		*/ \
	add		d,d,rT1;	/* 2: E = E + WK		*/ \
	and		rT0,c,rT0;	/* 2: F" = F" and D		*/ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
	or		rT2,rT2,rT0;	/* 2: F = F' or F"		*/ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
	add		d,d,rT2;	/* 2: E = E + F			*/ \
	add		d,d,rT0		/* 2: E = E + A'		*/
207
/*
 * R_60_79: rounds 60..79 reuse the B xor C xor D code of rounds 20..39
 * unchanged; only the content of rK differs at that point.
 */
#define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k)
210
/*
 * ppc_spe_sha1_transform: run the SHA-1 compression over consecutive
 * 64 byte blocks and update the hash state in place.
 *
 * In:   r3 (rHP)  pointer to the five 32 bit state words (offsets 0..16)
 *       r4 (rWP)  pointer to the input, 64 bytes are consumed per block
 *       r5        number of blocks; 0 returns immediately
 * Out:  state at r3 updated after every block
 * Uses: r0, r4-r12, cr0, CTR; r14-r23 are preserved via INITIALIZE/FINALIZE
 *
 * Per block: 8 x R_00_15 load the sixteen message words (rT3 serves as the
 * single-word scratch of the last two pairs), followed by 2 x R_16_19,
 * 10 x R_20_39, 10 x R_40_59 and 10 x R_60_79. Every macro performs two
 * rounds, so the rH arguments rotate by two positions from line to line.
 * The k argument is non-zero on the first R_00_15 and on the last macro of
 * each 20-round group, which is where rK is reloaded.
 */
_GLOBAL(ppc_spe_sha1_transform)
	cmpwi		r5,0		/* CTR = 0 would make bdnz wrap	*/
	beqlr				/* and loop 2^32 times		*/

	INITIALIZE

	lwz		rH0,0(rHP)
	lwz		rH1,4(rHP)
	mtctr		r5		/* block count; r5 is rKP below	*/
	lwz		rH2,8(rHP)
	lis		rKP,PPC_SPE_SHA1_K@h
	lwz		rH3,12(rHP)
	ori		rKP,rKP,PPC_SPE_SHA1_K@l
	lwz		rH4,16(rHP)

.Lsha1_main:
	R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0)
	R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8)
	R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16)
	R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24)
	R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32)
	R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40)
	R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48)
	R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56)

	R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0)
	R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2)

	R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0)
	R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0)
	R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0)
	R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0)
	R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0)
	R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0)
	R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0)
	R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0)
	R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0)
	R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3)

	R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0)
	R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0)
	R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0)
	R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0)
	R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0)
	R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0)
	R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0)
	R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0)
	R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0)
	R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4)

	/*
	 * The previous state is reloaded from memory in between the last
	 * rounds, each load into a register the remaining rounds no longer
	 * touch (rT3, then rW1..rW4).
	 */
	R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0)
	R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0)
	R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0)
	R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0)
	R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0)
	R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0)
	R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0)
	lwz		rT3,0(rHP)
	R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0)
	lwz		rW1,4(rHP)
	R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0)
	lwz		rW2,8(rHP)
	R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0)
	lwz		rW3,12(rHP)
	NEXT_BLOCK
	lwz		rW4,16(rHP)

	/* new state = previous state + working variables; write it back */
	add		rH0,rH0,rT3
	stw		rH0,0(rHP)
	add		rH1,rH1,rW1
	stw		rH1,4(rHP)
	add		rH2,rH2,rW2
	stw		rH2,8(rHP)
	add		rH3,rH3,rW3
	stw		rH3,12(rHP)
	add		rH4,rH4,rW4
	stw		rH4,16(rHP)

	bdnz		.Lsha1_main

	FINALIZE
	blr
290
/*
 * SHA-1 round constants, one 32 bit word per group of 20 rounds. They are
 * fetched through rKP at byte offsets 0, 4, 8 and 12 and never written,
 * so they live in read-only data. .p2align 4 requests 16 byte alignment.
 */
.section .rodata
.p2align 4
PPC_SPE_SHA1_K:
	.long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6
295