/* SPDX-License-Identifier: GPL-2.0-or-later */
/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
 *
 * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

.syntax unified
.fpu neon

.text


/* Context structure */

#define state_h0 0
#define state_h1 4
#define state_h2 8
#define state_h3 12
#define state_h4 16


/* Constants */

#define K1  0x5A827999
#define K2  0x6ED9EBA1
#define K3  0x8F1BBCDC
#define K4  0xCA62C1D6
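
/*
 * SHA-1 round constants, used for rounds 0-19, 20-39, 40-59 and 60-79
 * respectively.  They are replicated below across all four 32-bit lanes
 * of a vector register, so that the precalc code can add W + K for four
 * message words with a single vadd.u32.
 */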
.align 4
.LK_VEC:
.LK1:	.long K1, K1, K1, K1
.LK2:	.long K2, K2, K2, K2
.LK3:	.long K3, K3, K3, K3
.LK4:	.long K4, K4, K4, K4


/* Register macros */

#define RSTATE r0
#define RDATA r1
#define RNBLKS r2
#define ROLDSTACK r3
#define RWK lr

#define _a r4
#define _b r5
#define _c r6
#define _d r7
#define _e r8

#define RT0 r9
#define RT1 r10
#define RT2 r11
#define RT3 r12

#define W0 q0
#define W1 q7
#define W2 q2
#define W3 q3
#define W4 q4
#define W5 q6
#define W6 q5
#define W7 q1

#define tmp0 q8
#define tmp1 q9
#define tmp2 q10
#define tmp3 q11

#define qK1 q12
#define qK2 q13
#define qK3 q14
#define qK4 q15

#ifdef CONFIG_CPU_BIG_ENDIAN
#define ARM_LE(code...)
#else
#define ARM_LE(code...)		code
#endif

/* Round function macros. */

#define WK_offs(i) (((i) & 15) * 4)

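/*
 * Each round macro performs one SHA-1 round on the scalar side:
 *
 *	e += rol32(a, 5) + F(b, c, d) + (W[i] + K);  b = rol32(b, 30);
 *
 * where W[i] + K is read back from the stack slot filled in by the NEON
 * precalc code (see WK_offs above), and F depends on the round group:
 *
 *	F1(b,c,d) = (b & c) | (~b & d)			rounds  0-19
 *	F2(b,c,d) = b ^ c ^ d				rounds 20-39
 *	F3(b,c,d) = (b & c) | (b & d) | (c & d)		rounds 40-59
 *	F4(b,c,d) = b ^ c ^ d				rounds 60-79
 *
 * F4 is the same function as F2, so _R_F4 simply expands to _R_F2.  The
 * pre1/pre2/pre3 arguments interleave NEON message-expansion steps with
 * the scalar round instructions.
 */
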
#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	bic RT0, d, b; \
	add e, e, a, ror #(32 - 5); \
	and RT1, c, b; \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add RT0, RT0, RT3; \
	add e, e, RT1; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT0;

#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	eor RT0, d, b; \
	add e, e, a, ror #(32 - 5); \
	eor RT0, RT0, c; \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT3; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT0; \

#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	eor RT0, b, c; \
	and RT1, b, c; \
	add e, e, a, ror #(32 - 5); \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	and RT0, RT0, d; \
	add RT1, RT1, RT3; \
	add e, e, RT0; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT1;

#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	_R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
           W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	_R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	       W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define R(a,b,c,d,e,f,i) \
	_R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
	       W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define dummy(...)
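/* dummy() swallows the precalc arguments, so a round can be emitted with no
 * interleaved NEON work (see R() above). */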


/* Input expansion macros. */
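/*
 * The message schedule follows the usual SHA-1 recurrence
 *
 *	W[i] = rol32(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1)
 *
 * evaluated four words at a time in NEON registers.  Rounds 16-31 use the
 * recurrence directly; the W[i-3] input of the last word in each group is
 * produced within the same vector, so that lane is first computed with zero
 * in its place and then patched up.  From round 32 onwards the equivalent
 * form
 *
 *	W[i] = rol32(W[i-6] ^ W[i-16] ^ W[i-28] ^ W[i-32], 2)
 *
 * is used, which has no dependency inside a group of four words.  Each group
 * of W values is added to the current round constant (curK) and stored to
 * the stack slots that the round macros read back.
 */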

/********* Precalc macros for rounds 0-15 *************************************/

#define W_PRECALC_00_15() \
	add       RWK, sp, #(WK_offs(0));			\
	\
	vld1.32   {W0, W7}, [RDATA]!;				\
 ARM_LE(vrev32.8  W0, W0;	)	/* big => little */	\
	vld1.32   {W6, W5}, [RDATA]!;				\
	vadd.u32  tmp0, W0, curK;				\
 ARM_LE(vrev32.8  W7, W7;	)	/* big => little */	\
 ARM_LE(vrev32.8  W6, W6;	)	/* big => little */	\
	vadd.u32  tmp1, W7, curK;				\
 ARM_LE(vrev32.8  W5, W5;	)	/* big => little */	\
	vadd.u32  tmp2, W6, curK;				\
	vst1.32   {tmp0, tmp1}, [RWK]!;				\
	vadd.u32  tmp3, W5, curK;				\
	vst1.32   {tmp2, tmp3}, [RWK];				\

#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vld1.32   {W0, W7}, [RDATA]!;				\

#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add       RWK, sp, #(WK_offs(0));			\

#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W0, W0;	)	/* big => little */	\

#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vld1.32   {W6, W5}, [RDATA]!;				\

#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp0, W0, curK;				\

#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W7, W7;	)	/* big => little */	\

#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W6, W6;	)	/* big => little */	\

#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp1, W7, curK;				\

#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W5, W5;	)	/* big => little */	\

#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp2, W6, curK;				\

#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32   {tmp0, tmp1}, [RWK]!;				\

#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp3, W5, curK;				\

#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32   {tmp2, tmp3}, [RWK];				\


/********* Precalc macros for rounds 16-31 ************************************/

#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp0, tmp0;			\
	vext.8    W, W_m16, W_m12, #8;		\

#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add       RWK, sp, #(WK_offs(i));	\
	vext.8    tmp0, W_m04, tmp0, #4;	\

#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp0, tmp0, W_m16;		\
	veor.32   W, W, W_m08;			\

#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp1, tmp1;			\
	veor      W, W, tmp0;			\

#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32  tmp0, W, #1;			\

#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vext.8    tmp1, tmp1, W, #(16-12);	\
	vshr.u32  W, W, #31;			\

#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vorr      tmp0, tmp0, W;		\
	vshr.u32  W, tmp1, #30;			\

#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32  tmp1, tmp1, #2;		\

#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp0, tmp0, W;		\

#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      W, tmp0, tmp1;		\

#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp0, W, curK;		\

#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32   {tmp0}, [RWK];


/********* Precalc macros for rounds 32-79 ************************************/

#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor W, W_m28; \

#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vext.8 tmp0, W_m08, W_m04, #8; \

#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor W, W_m16; \

#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor W, tmp0; \

#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add RWK, sp, #(WK_offs(i&~3)); \

#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32 tmp1, W, #2; \

#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshr.u32 tmp0, W, #30; \

#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vorr W, tmp0, tmp1; \

#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32 tmp0, W, curK; \

#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32 {tmp0}, [RWK];


/*
 * Transform nblocks*64 bytes (nblocks*16 32-bit words) at DATA.
 *
 * void sha1_transform_neon(struct sha1_block_state *state,
 *			    const u8 *data, size_t nblocks);
 */
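/*
 * Illustrative C-side call (a sketch only: the member name .h is an
 * assumption; the five 32-bit words and their order follow the state_h*
 * offsets above, initialized here to the standard SHA-1 IV):
 *
 *	struct sha1_block_state state = { .h = { 0x67452301, 0xEFCDAB89,
 *						 0x98BADCFE, 0x10325476,
 *						 0xC3D2E1F0 } };
 *	// data must point to a whole number of 64-byte blocks
 *	sha1_transform_neon(&state, data, nblocks);
 */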
.align 3
ENTRY(sha1_transform_neon)
  /* input:
   *	r0: state
   *	r1: data (64*nblocks bytes)
   *	r2: nblocks
   */

  cmp RNBLKS, #0;
  beq .Ldo_nothing;

  push {r4-r12, lr};
  /*vpush {q4-q7};*/

  adr RT3, .LK_VEC;

  mov ROLDSTACK, sp;

  /* Align stack. */
  sub RT0, sp, #(16*4);
  and RT0, #(~(16-1));
  mov sp, RT0;
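  /* The 64 bytes reserved above hold the 16-entry ring buffer of W[i] + K
   * words (see WK_offs); rounding sp down to a 16-byte boundary keeps the
   * vector stores into that buffer aligned. */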

  vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */

  /* Get the values of the chaining variables. */
  ldm RSTATE, {_a-_e};

  vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */

#undef curK
#define curK qK1
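  /* curK names the q register holding the round constant added by the
   * interleaved precalc steps; it is redefined each time the precalc
   * moves on to a range of rounds that needs the next constant. */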
  /* Precalc 0-15. */
  W_PRECALC_00_15();

.Loop:
  /* Transform 0-15 + Precalc 16-31. */
  _R( _a, _b, _c, _d, _e, F1,  0,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1,  1,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _d, _e, _a, _b, _c, F1,  2,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _c, _d, _e, _a, _b, F1,  3,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
      W4, W5, W6, W7, W0, _, _, _ );

#undef curK
#define curK qK2
  _R( _b, _c, _d, _e, _a, F1,  4,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1,  5,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1,  6,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _d, _e, _a, _b, _c, F1,  7,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
      W3, W4, W5, W6, W7, _, _, _ );

  _R( _c, _d, _e, _a, _b, F1,  8,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _b, _c, _d, _e, _a, F1,  9,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1, 10,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1, 11,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
      W2, W3, W4, W5, W6, _, _, _ );

  _R( _d, _e, _a, _b, _c, F1, 12,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _c, _d, _e, _a, _b, F1, 13,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _b, _c, _d, _e, _a, F1, 14,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1, 15,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
      W1, W2, W3, W4, W5, _, _, _ );

  /* Transform 16-63 + Precalc 32-79. */
  _R( _e, _a, _b, _c, _d, F1, 16,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _d, _e, _a, _b, _c, F1, 17,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _c, _d, _e, _a, _b, F1, 18,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _b, _c, _d, _e, _a, F1, 19,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);

  _R( _a, _b, _c, _d, _e, F2, 20,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _e, _a, _b, _c, _d, F2, 21,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _d, _e, _a, _b, _c, F2, 22,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _c, _d, _e, _a, _b, F2, 23,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);

#undef curK
#define curK qK3
  _R( _b, _c, _d, _e, _a, F2, 24,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _a, _b, _c, _d, _e, F2, 25,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _e, _a, _b, _c, _d, F2, 26,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _d, _e, _a, _b, _c, F2, 27,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);

  _R( _c, _d, _e, _a, _b, F2, 28,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _b, _c, _d, _e, _a, F2, 29,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _a, _b, _c, _d, _e, F2, 30,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _e, _a, _b, _c, _d, F2, 31,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);

  _R( _d, _e, _a, _b, _c, F2, 32,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _c, _d, _e, _a, _b, F2, 33,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _b, _c, _d, _e, _a, F2, 34,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _a, _b, _c, _d, _e, F2, 35,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);

  _R( _e, _a, _b, _c, _d, F2, 36,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _d, _e, _a, _b, _c, F2, 37,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _c, _d, _e, _a, _b, F2, 38,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _b, _c, _d, _e, _a, F2, 39,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);

  _R( _a, _b, _c, _d, _e, F3, 40,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _e, _a, _b, _c, _d, F3, 41,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _d, _e, _a, _b, _c, F3, 42,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _c, _d, _e, _a, _b, F3, 43,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);

#undef curK
#define curK qK4
  _R( _b, _c, _d, _e, _a, F3, 44,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _a, _b, _c, _d, _e, F3, 45,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _e, _a, _b, _c, _d, F3, 46,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _d, _e, _a, _b, _c, F3, 47,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);

  _R( _c, _d, _e, _a, _b, F3, 48,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _b, _c, _d, _e, _a, F3, 49,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _a, _b, _c, _d, _e, F3, 50,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _e, _a, _b, _c, _d, F3, 51,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);

  _R( _d, _e, _a, _b, _c, F3, 52,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _c, _d, _e, _a, _b, F3, 53,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _b, _c, _d, _e, _a, F3, 54,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _a, _b, _c, _d, _e, F3, 55,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);

  _R( _e, _a, _b, _c, _d, F3, 56,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _d, _e, _a, _b, _c, F3, 57,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _c, _d, _e, _a, _b, F3, 58,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _b, _c, _d, _e, _a, F3, 59,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);

  subs RNBLKS, #1;

  _R( _a, _b, _c, _d, _e, F4, 60,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _e, _a, _b, _c, _d, F4, 61,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _d, _e, _a, _b, _c, F4, 62,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _c, _d, _e, _a, _b, F4, 63,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);

  beq .Lend;

  /* Transform 64-79 + Precalc 0-15 of next block. */
#undef curK
#define curK qK1
  _R( _b, _c, _d, _e, _a, F4, 64,
      WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 65,
      WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _e, _a, _b, _c, _d, F4, 66,
      WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _d, _e, _a, _b, _c, F4, 67,
      WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );

  _R( _c, _d, _e, _a, _b, F4, 68,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 69,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 70,
      WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _e, _a, _b, _c, _d, F4, 71,
      WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );

  _R( _d, _e, _a, _b, _c, F4, 72,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _c, _d, _e, _a, _b, F4, 73,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 74,
      WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 75,
      WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );

  _R( _e, _a, _b, _c, _d, F4, 76,
      WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _d, _e, _a, _b, _c, F4, 77,
      WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _c, _d, _e, _a, _b, F4, 78,
      WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 79,
      WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );

  /* Update the chaining variables. */
  ldm RSTATE, {RT0-RT3};
  add _a, RT0;
  ldr RT0, [RSTATE, #state_h4];
  add _b, RT1;
  add _c, RT2;
  add _d, RT3;
  add _e, RT0;
  stm RSTATE, {_a-_e};

  b .Loop;

.Lend:
  /* Transform 64-79 */
  R( _b, _c, _d, _e, _a, F4, 64 );
  R( _a, _b, _c, _d, _e, F4, 65 );
  R( _e, _a, _b, _c, _d, F4, 66 );
  R( _d, _e, _a, _b, _c, F4, 67 );
  R( _c, _d, _e, _a, _b, F4, 68 );
  R( _b, _c, _d, _e, _a, F4, 69 );
  R( _a, _b, _c, _d, _e, F4, 70 );
  R( _e, _a, _b, _c, _d, F4, 71 );
  R( _d, _e, _a, _b, _c, F4, 72 );
  R( _c, _d, _e, _a, _b, F4, 73 );
  R( _b, _c, _d, _e, _a, F4, 74 );
  R( _a, _b, _c, _d, _e, F4, 75 );
  R( _e, _a, _b, _c, _d, F4, 76 );
  R( _d, _e, _a, _b, _c, F4, 77 );
  R( _c, _d, _e, _a, _b, F4, 78 );
  R( _b, _c, _d, _e, _a, F4, 79 );

  mov sp, ROLDSTACK;

  /* Update the chaining variables. */
  ldm RSTATE, {RT0-RT3};
  add _a, RT0;
  ldr RT0, [RSTATE, #state_h4];
  add _b, RT1;
  add _c, RT2;
  add _d, RT3;
  /*vpop {q4-q7};*/
  add _e, RT0;
  stm RSTATE, {_a-_e};

  pop {r4-r12, pc};

.Ldo_nothing:
  bx lr
ENDPROC(sha1_transform_neon)