/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Fast SHA-256 implementation for SPE instruction set (PPC)
 *
 * This code makes use of the SPE SIMD instruction set as defined in
 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
 * Implementation is based on optimization guide notes from
 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
 *
 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
 */
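
/*
 * The expected C prototype (an assumption based on the usual SPE
 * glue code, not defined in this file) is:
 *
 *	void ppc_spe_sha256_transform(u32 *state, const u8 *src,
 *				      u32 blocks);
 *
 * r3 (rHP) points to the eight 32 bit hash words, r4 (rWP) to the
 * input data and r5 holds the number of 64 byte blocks to process.
 * Two rounds are interleaved per macro so that the 16 schedule
 * words fit into eight 64 bit SPE registers.
 */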

#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>

#define rHP	r3	/* pointer to hash values in memory		*/
#define rKP	r24	/* pointer to round constants			*/
#define rWP	r4	/* pointer to input data			*/

#define rH0	r5	/* 8 32 bit hash values in 8 registers		*/
#define rH1	r6
#define rH2	r7
#define rH3	r8
#define rH4	r9
#define rH5	r10
#define rH6	r11
#define rH7	r12

#define rW0	r14	/* 64 bit registers. 16 words in 8 registers	*/
#define rW1	r15
#define rW2	r16
#define rW3	r17
#define rW4	r18
#define rW5	r19
#define rW6	r20
#define rW7	r21

#define rT0	r22	/* 64 bit temporaries				*/
#define rT1	r23
#define rT2	r0	/* 32 bit temporaries				*/
#define rT3	r25

#define CMP_KN_LOOP
#define CMP_KC_LOOP \
	cmpwi		rT1,0;
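
/*
 * Loop control for the 16-round passes below: CMP_KN_LOOP ("no
 * check") expands to nothing, CMP_KC_LOOP ("check") compares the
 * GPR half of the round constant pair that evldw just fetched. The
 * round loop runs three times (rounds 16..63); of the constants
 * checked at the end of each pass (K[31], K[47], K[63]) only the
 * last one, 0xc67178f2, is negative as a signed word, so the
 * "bt gt" at the end of the loop falls through after the final pass.
 */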

#define INITIALIZE \
	stwu		r1,-128(r1);	/* create stack frame		*/ \
	evstdw		r14,8(r1);	/* We must save non volatile	*/ \
	evstdw		r15,16(r1);	/* registers. Take the chance	*/ \
	evstdw		r16,24(r1);	/* and save the SPE part too	*/ \
	evstdw		r17,32(r1);					   \
	evstdw		r18,40(r1);					   \
	evstdw		r19,48(r1);					   \
	evstdw		r20,56(r1);					   \
	evstdw		r21,64(r1);					   \
	evstdw		r22,72(r1);					   \
	evstdw		r23,80(r1);					   \
	stw		r24,88(r1);	/* save normal registers	*/ \
	stw		r25,92(r1);


#define FINALIZE \
	evldw		r14,8(r1);	/* restore SPE registers	*/ \
	evldw		r15,16(r1);					   \
	evldw		r16,24(r1);					   \
	evldw		r17,32(r1);					   \
	evldw		r18,40(r1);					   \
	evldw		r19,48(r1);					   \
	evldw		r20,56(r1);					   \
	evldw		r21,64(r1);					   \
	evldw		r22,72(r1);					   \
	evldw		r23,80(r1);					   \
	lwz		r24,88(r1);	/* restore normal registers	*/ \
	lwz		r25,92(r1);					   \
	xor		r0,r0,r0;					   \
	stw		r0,8(r1);	/* Delete sensitive data	*/ \
	stw		r0,16(r1);	/* that we might have pushed	*/ \
	stw		r0,24(r1);	/* from another context that	*/ \
	stw		r0,32(r1);	/* runs the same code. Assume	*/ \
	stw		r0,40(r1);	/* the lower part of the GPRs	*/ \
	stw		r0,48(r1);	/* was already overwritten on	*/ \
	stw		r0,56(r1);	/* the way down to here		*/ \
	stw		r0,64(r1);					   \
	stw		r0,72(r1);					   \
	stw		r0,80(r1);					   \
	addi		r1,r1,128;	/* cleanup stack frame		*/

#ifdef __BIG_ENDIAN__
#define LOAD_DATA(reg, off) \
	lwz		reg,off(rWP);	/* load data			*/
#define NEXT_BLOCK \
	addi		rWP,rWP,64;	/* increment per block		*/
#else
#define LOAD_DATA(reg, off) \
	lwbrx		reg,0,rWP;	/* load data			*/ \
	addi		rWP,rWP,4;	/* increment per word		*/
#define NEXT_BLOCK			/* nothing to do		*/
#endif

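/*
 * R_LOAD_W performs two of the first 16 SHA-256 rounds. The message
 * words for rounds "1:" and "2:" are loaded from the input block and
 * packed into the 64 bit SPE register w (via evmergelo), so that the
 * schedule extension below can work on word pairs. The comments use
 * the FIPS-180 notation: S0/S1 are the big sigma functions, ch and
 * maj the choice and majority functions.
 */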
#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \
	LOAD_DATA(w, off)		/* 1: W				*/ \
	rotrwi		rT0,e,6;	/* 1: S1 = e rotr 6		*/ \
	rotrwi		rT1,e,11;	/* 1: S1' = e rotr 11		*/ \
	rotrwi		rT2,e,25;	/* 1: S1" = e rotr 25		*/ \
	xor		rT0,rT0,rT1;	/* 1: S1 = S1 xor S1'		*/ \
	and		rT3,e,f;	/* 1: ch = e and f		*/ \
	xor		rT0,rT0,rT2;	/* 1: S1 = S1 xor S1"		*/ \
	andc		rT1,g,e;	/* 1: ch' = ~e and g		*/ \
	lwz		rT2,off(rKP);	/* 1: K				*/ \
	xor		rT3,rT3,rT1;	/* 1: ch = ch xor ch'		*/ \
	add		h,h,rT0;	/* 1: temp1 = h + S1		*/ \
	add		rT3,rT3,w;	/* 1: temp1' = ch + w		*/ \
	rotrwi		rT0,a,2;	/* 1: S0 = a rotr 2		*/ \
	add		h,h,rT3;	/* 1: temp1 = temp1 + temp1'	*/ \
	rotrwi		rT1,a,13;	/* 1: S0' = a rotr 13		*/ \
	add		h,h,rT2;	/* 1: temp1 = temp1 + K		*/ \
	rotrwi		rT3,a,22;	/* 1: S0" = a rotr 22		*/ \
	xor		rT0,rT0,rT1;	/* 1: S0 = S0 xor S0'		*/ \
	add		d,d,h;		/* 1: d = d + temp1		*/ \
	xor		rT3,rT0,rT3;	/* 1: S0 = S0 xor S0"		*/ \
	evmergelo	w,w,w;		/*    shift W			*/ \
	or		rT2,a,b;	/* 1: maj = a or b		*/ \
	and		rT1,a,b;	/* 1: maj' = a and b		*/ \
	and		rT2,rT2,c;	/* 1: maj = maj and c		*/ \
	LOAD_DATA(w, off+4)		/* 2: W				*/ \
	or		rT2,rT1,rT2;	/* 1: maj = maj or maj'		*/ \
	rotrwi		rT0,d,6;	/* 2: S1 = e rotr 6		*/ \
	add		rT3,rT3,rT2;	/* 1: temp2 = S0 + maj		*/ \
	rotrwi		rT1,d,11;	/* 2: S1' = e rotr 11		*/ \
	add		h,h,rT3;	/* 1: h = temp1 + temp2		*/ \
	rotrwi		rT2,d,25;	/* 2: S1" = e rotr 25		*/ \
	xor		rT0,rT0,rT1;	/* 2: S1 = S1 xor S1'		*/ \
	and		rT3,d,e;	/* 2: ch = e and f		*/ \
	xor		rT0,rT0,rT2;	/* 2: S1 = S1 xor S1"		*/ \
	andc		rT1,f,d;	/* 2: ch' = ~e and g		*/ \
	lwz		rT2,off+4(rKP);	/* 2: K				*/ \
	xor		rT3,rT3,rT1;	/* 2: ch = ch xor ch'		*/ \
	add		g,g,rT0;	/* 2: temp1 = h + S1		*/ \
	add		rT3,rT3,w;	/* 2: temp1' = ch + w		*/ \
	rotrwi		rT0,h,2;	/* 2: S0 = a rotr 2		*/ \
	add		g,g,rT3;	/* 2: temp1 = temp1 + temp1'	*/ \
	rotrwi		rT1,h,13;	/* 2: S0' = a rotr 13		*/ \
	add		g,g,rT2;	/* 2: temp1 = temp1 + K		*/ \
	rotrwi		rT3,h,22;	/* 2: S0" = a rotr 22		*/ \
	xor		rT0,rT0,rT1;	/* 2: S0 = S0 xor S0'		*/ \
	or		rT2,h,a;	/* 2: maj = a or b		*/ \
	xor		rT3,rT0,rT3;	/* 2: S0 = S0 xor S0"		*/ \
	and		rT1,h,a;	/* 2: maj' = a and b		*/ \
	and		rT2,rT2,b;	/* 2: maj = maj and c		*/ \
	add		c,c,g;		/* 2: d = d + temp1		*/ \
	or		rT2,rT1,rT2;	/* 2: maj = maj or maj'		*/ \
	add		rT3,rT3,rT2;	/* 2: temp2 = S0 + maj		*/ \
	add		g,g,rT3		/* 2: h = temp1 + temp2		*/

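/*
 * R_CALC_W performs two rounds (rounds 16..63) and, interleaved with
 * them, extends the message schedule for one word pair using the SPE
 * vector instructions:
 *
 *	w[i] = w[i-16] + s0(w[i-15]) + w[i-7] + s1(w[i-2])
 *
 * The evrlwi rotates implement the small sigma functions (rotr 7/18
 * and rotr 17/19 written as left rotates), evldw fetches the next
 * pair of round constants and CMP_K##k##_LOOP hooks in the loop
 * check described above.
 */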
#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \
	rotrwi		rT2,e,6;	/* 1: S1 = e rotr 6		*/ \
	evmergelohi	rT0,w0,w1;	/*    w[-15]			*/ \
	rotrwi		rT3,e,11;	/* 1: S1' = e rotr 11		*/ \
	evsrwiu		rT1,rT0,3;	/*    s0 = w[-15] >> 3		*/ \
	xor		rT2,rT2,rT3;	/* 1: S1 = S1 xor S1'		*/ \
	evrlwi		rT0,rT0,25;	/*    s0' = w[-15] rotr 7	*/ \
	rotrwi		rT3,e,25;	/* 1: S1" = e rotr 25		*/ \
	evxor		rT1,rT1,rT0;	/*    s0 = s0 xor s0'		*/ \
	xor		rT2,rT2,rT3;	/* 1: S1 = S1 xor S1"		*/ \
	evrlwi		rT0,rT0,21;	/*    s0' = w[-15] rotr 18	*/ \
	add		h,h,rT2;	/* 1: temp1 = h + S1		*/ \
	evxor		rT0,rT0,rT1;	/*    s0 = s0 xor s0'		*/ \
	and		rT2,e,f;	/* 1: ch = e and f		*/ \
	evaddw		w0,w0,rT0;	/*    w = w[-16] + s0		*/ \
	andc		rT3,g,e;	/* 1: ch' = ~e and g		*/ \
	evsrwiu		rT0,w7,10;	/*    s1 = w[-2] >> 10		*/ \
	xor		rT2,rT2,rT3;	/* 1: ch = ch xor ch'		*/ \
	evrlwi		rT1,w7,15;	/*    s1' = w[-2] rotr 17	*/ \
	add		h,h,rT2;	/* 1: temp1 = temp1 + ch	*/ \
	evxor		rT0,rT0,rT1;	/*    s1 = s1 xor s1'		*/ \
	rotrwi		rT2,a,2;	/* 1: S0 = a rotr 2		*/ \
	evrlwi		rT1,w7,13;	/*    s1' = w[-2] rotr 19	*/ \
	rotrwi		rT3,a,13;	/* 1: S0' = a rotr 13		*/ \
	evxor		rT0,rT0,rT1;	/*    s1 = s1 xor s1'		*/ \
	xor		rT2,rT2,rT3;	/* 1: S0 = S0 xor S0'		*/ \
	evldw		rT1,off(rKP);	/*    k				*/ \
	rotrwi		rT3,a,22;	/* 1: S0" = a rotr 22		*/ \
	evaddw		w0,w0,rT0;	/*    w = w + s1		*/ \
	xor		rT2,rT2,rT3;	/* 1: S0 = S0 xor S0"		*/ \
	evmergelohi	rT0,w4,w5;	/*    w[-7]			*/ \
	and		rT3,a,b;	/* 1: maj = a and b		*/ \
	evaddw		w0,w0,rT0;	/*    w = w + w[-7]		*/ \
	CMP_K##k##_LOOP							   \
	add		rT2,rT2,rT3;	/* 1: temp2 = S0 + maj		*/ \
	evaddw		rT1,rT1,w0;	/*    wk = w + k		*/ \
	xor		rT3,a,b;	/* 1: maj = a xor b		*/ \
	evmergehi	rT0,rT1,rT1;	/*    wk1/wk2			*/ \
	and		rT3,rT3,c;	/* 1: maj = maj and c		*/ \
	add		h,h,rT0;	/* 1: temp1 = temp1 + wk	*/ \
	add		rT2,rT2,rT3;	/* 1: temp2 = temp2 + maj	*/ \
	add		g,g,rT1;	/* 2: temp1 = temp1 + wk	*/ \
	add		d,d,h;		/* 1: d = d + temp1		*/ \
	rotrwi		rT0,d,6;	/* 2: S1 = e rotr 6		*/ \
	add		h,h,rT2;	/* 1: h = temp1 + temp2		*/ \
	rotrwi		rT1,d,11;	/* 2: S1' = e rotr 11		*/ \
	rotrwi		rT2,d,25;	/* 2: S1" = e rotr 25		*/ \
	xor		rT0,rT0,rT1;	/* 2: S1 = S1 xor S1'		*/ \
	and		rT3,d,e;	/* 2: ch = e and f		*/ \
	xor		rT0,rT0,rT2;	/* 2: S1 = S1 xor S1"		*/ \
	andc		rT1,f,d;	/* 2: ch' = ~e and g		*/ \
	add		g,g,rT0;	/* 2: temp1 = h + S1		*/ \
	xor		rT3,rT3,rT1;	/* 2: ch = ch xor ch'		*/ \
	rotrwi		rT0,h,2;	/* 2: S0 = a rotr 2		*/ \
	add		g,g,rT3;	/* 2: temp1 = temp1 + ch	*/ \
	rotrwi		rT1,h,13;	/* 2: S0' = a rotr 13		*/ \
	rotrwi		rT3,h,22;	/* 2: S0" = a rotr 22		*/ \
	xor		rT0,rT0,rT1;	/* 2: S0 = S0 xor S0'		*/ \
	or		rT2,h,a;	/* 2: maj = a or b		*/ \
	and		rT1,h,a;	/* 2: maj' = a and b		*/ \
	and		rT2,rT2,b;	/* 2: maj = maj and c		*/ \
	xor		rT3,rT0,rT3;	/* 2: S0 = S0 xor S0"		*/ \
	or		rT2,rT1,rT2;	/* 2: maj = maj or maj'		*/ \
	add		c,c,g;		/* 2: d = d + temp1		*/ \
	add		rT3,rT3,rT2;	/* 2: temp2 = S0 + maj		*/ \
	add		g,g,rT3		/* 2: h = temp1 + temp2		*/

_GLOBAL(ppc_spe_sha256_transform)
	INITIALIZE

	mtctr		r5
	lwz		rH0,0(rHP)
	lwz		rH1,4(rHP)
	lwz		rH2,8(rHP)
	lwz		rH3,12(rHP)
	lwz		rH4,16(rHP)
	lwz		rH5,20(rHP)
	lwz		rH6,24(rHP)
	lwz		rH7,28(rHP)

ppc_spe_sha256_main:
	lis		rKP,PPC_SPE_SHA256_K@ha
	addi		rKP,rKP,PPC_SPE_SHA256_K@l

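	/*
	 * The working variables a..h live in rH0..rH7 and are not
	 * rotated at run time. Instead each macro invocation below
	 * shifts the register roles by two positions (two rounds per
	 * macro), which is why the argument order changes from call
	 * to call.
	 */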
	R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0)
	R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8)
	R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16)
	R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24)
	R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32)
	R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40)
	R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48)
	R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56)
ppc_spe_sha256_16_rounds:
	addi		rKP,rKP,64
	R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
		 rW0, rW1, rW4, rW5, rW7, N, 0)
	R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
		 rW1, rW2, rW5, rW6, rW0, N, 8)
	R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
		 rW2, rW3, rW6, rW7, rW1, N, 16)
	R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
		 rW3, rW4, rW7, rW0, rW2, N, 24)
	R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
		 rW4, rW5, rW0, rW1, rW3, N, 32)
	R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
		 rW5, rW6, rW1, rW2, rW4, N, 40)
	R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
		 rW6, rW7, rW2, rW3, rW5, N, 48)
	R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
		 rW7, rW0, rW3, rW4, rW6, C, 56)
	bt		gt,ppc_spe_sha256_16_rounds

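	/*
	 * End of the 64 rounds: reuse the schedule registers to fetch
	 * the previous intermediate hash, add it to the working
	 * variables (the usual feed-forward) and store the result.
	 * NEXT_BLOCK advances rWP to the next 64 byte block on big
	 * endian; on little endian the pointer was already advanced
	 * word by word in LOAD_DATA.
	 */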
	lwz		rW0,0(rHP)
	NEXT_BLOCK
	lwz		rW1,4(rHP)
	lwz		rW2,8(rHP)
	lwz		rW3,12(rHP)
	lwz		rW4,16(rHP)
	lwz		rW5,20(rHP)
	lwz		rW6,24(rHP)
	lwz		rW7,28(rHP)

	add		rH0,rH0,rW0
	stw		rH0,0(rHP)
	add		rH1,rH1,rW1
	stw		rH1,4(rHP)
	add		rH2,rH2,rW2
	stw		rH2,8(rHP)
	add		rH3,rH3,rW3
	stw		rH3,12(rHP)
	add		rH4,rH4,rW4
	stw		rH4,16(rHP)
	add		rH5,rH5,rW5
	stw		rH5,20(rHP)
	add		rH6,rH6,rW6
	stw		rH6,24(rHP)
	add		rH7,rH7,rW7
	stw		rH7,28(rHP)

	bdnz		ppc_spe_sha256_main

	FINALIZE
	blr

.data
.align 5
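/* the 64 SHA-256 round constants K[0] .. K[63] */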
PPC_SPE_SHA256_K:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2