xref: /linux/arch/arm64/crypto/sm3-neon-core.S (revision c8bfe3fad4f86a029da7157bae9699c816f0c309)
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * sm3-neon-core.S - SM3 secure hash using NEON instructions
4 *
5 * Linux/arm64 port of the libgcrypt SM3 implementation for AArch64
6 *
7 * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
8 * Copyright (c) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
9 */
10
11#include <linux/linkage.h>
12#include <linux/cfi_types.h>
13#include <asm/assembler.h>
14
/*
 * Context structure: byte offsets of the eight consecutive 32-bit
 * chaining words h0..h7.
 */

#define state_h0 (0 * 4)
#define state_h1 (1 * 4)
#define state_h2 (2 * 4)
#define state_h3 (3 * 4)
#define state_h4 (4 * 4)
#define state_h5 (5 * 4)
#define state_h6 (6 * 4)
#define state_h7 (7 * 4)
25
/*
 * Stack structure: a single scratch area for message words, placed at the
 * very bottom of the frame (offset 0 from sp).
 */

#define STACK_W             (0)
#define STACK_W_SIZE        (3 * 2 * 32)

#define STACK_SIZE          (STACK_W_SIZE + STACK_W)
32
/* Register macros */

/* Function arguments and long-lived pointers. */
#define RSTATE x0	/* chaining state (first argument) */
#define RDATA  x1	/* input pointer, post-incremented by the block loads */
/*
 * The block count is a C 'int': only the low 32 bits of the argument register
 * are defined by the AAPCS64, so the counter must be the w view of x2.
 */
#define RNBLKS w2
#define RKPTR  x28	/* base of the round constant table */
#define RFRAME x29	/* sp to restore on exit */

/* The eight 32-bit working variables. */
#define ra w3
#define rb w4
#define rc w5
#define rd w6
#define re w7
#define rf w8
#define rg w9
#define rh w10

/* Scratch registers of the round function. */
#define t0 w11
#define t1 w12
#define t2 w13
#define t3 w14
#define t4 w15
#define t5 w16
#define t6 w17

/* Round constants for an even round and the odd round that follows it. */
#define k_even w19
#define k_odd w20

/* Cursors into the on-stack message word area. */
#define addr0 x21
#define addr1 x22

/* Previous chaining words, read back for the feed-forward XOR. */
#define s0 w23
#define s1 w24
#define s2 w25
#define s3 w26

/* Message schedule window; three words are used per vector register. */
#define W0 v0
#define W1 v1
#define W2 v2
#define W3 v3
#define W4 v4
#define W5 v5

/* Vector temporaries for block loading and message expansion. */
#define XTMP0 v6
#define XTMP1 v7
#define XTMP2 v16
#define XTMP3 v17
#define XTMP4 v18
#define XTMP5 v19
#define XTMP6 v20
83
/* Helper macros. */

/* Swallows its arguments; passed where a macro slot has nothing to emit. */
#define _(...)

/* Zero every bit of the 128-bit vector register 'vreg'. */
#define clear_vec(vreg) \
	movi	vreg.8h, #0;

/* dst = src rotated left by 'bits' (32-bit), expressed as a right rotate. */
#define rolw(dst, src, bits) \
	ror	dst, src, #(32 - bits);
93
/*
 * Round function macros.
 *
 * Every boolean function is cut into three pieces (_1, _2, _3) so that R()
 * can place other instructions between them.  x, y, z are the inputs, 'res'
 * receives the result and 'tmp' is a scratch register.
 *
 *   FF1 = GG1 = x ^ y ^ z        (the pieces are merely scheduled differently)
 *   GG2       = (z & ~x) ^ (y & x)
 *   FF2       = ((x ^ y) & z) ^ (x & y)
 */

/* GG1: result complete after piece 2. */
#define GG1_1(x, y, z, res, tmp) \
	eor	res, x, y;
#define GG1_2(x, y, z, res, tmp) \
	eor	res, res, z;
#define GG1_3(x, y, z, res, tmp)

/* FF1: same computation as GG1, second XOR deferred to piece 3. */
#define FF1_1(x, y, z, res, tmp) \
	eor	res, x, y;
#define FF1_2(x, y, z, res, tmp)
#define FF1_3(x, y, z, res, tmp) \
	eor	res, res, z;

/* GG2 */
#define GG2_1(x, y, z, res, tmp) \
	bic	res, z, x;
#define GG2_2(x, y, z, res, tmp) \
	and	tmp, y, x;
#define GG2_3(x, y, z, res, tmp) \
	eor	res, res, tmp;

/* FF2 */
#define FF2_1(x, y, z, res, tmp) \
	eor	res, x, y;
#define FF2_2(x, y, z, res, tmp) \
	and	tmp, x, y; \
	and	res, res, z;
#define FF2_3(x, y, z, res, tmp) \
	eor	res, res, tmp;
120
/*
 * One compression round.
 *
 *  i            1 or 2: selects the FF1/GG1 or the FF2/GG2 pieces
 *  a..h         registers currently holding the eight working variables
 *  k            register holding this round's constant; clobbered
 *  K_LOAD       expanded as K_LOAD(round) first: KL to load k_even/k_odd,
 *               _ to emit nothing
 *  round, widx  select the stack slots read for the two message words
 *  wtype        IW or XW, pasted in front of _W1_ADDR / _W1W2_ADDR
 *  IOP          expanded as IOP(1, iop_param) .. IOP(8, iop_param) between
 *               the scalar instructions; _ to emit nothing
 *
 * On exit d and h hold the two newly computed values, and b and f have been
 * rotated in place (by 9 and 19).  No registers are moved: the invocations in
 * the function body permute the a..h arguments from round to round instead.
 * Clobbers t0-t6.
 */
#define R(i, a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
	K_LOAD(round);                                                        \
	ldr	t5, [sp, #(wtype##_W1_ADDR(round, widx))];                    \
	rolw(t0, a, 12);                              /* rol(a, 12) => t0 */  \
      IOP(1, iop_param);                                                      \
	FF##i##_1(a, b, c, t1, t2);                                           \
	ldr	t6, [sp, #(wtype##_W1W2_ADDR(round, widx))];                  \
	add	k, k, e;                                                      \
      IOP(2, iop_param);                                                      \
	GG##i##_1(e, f, g, t3, t4);                                           \
	FF##i##_2(a, b, c, t1, t2);                                           \
      IOP(3, iop_param);                                                      \
	add	k, k, t0;                                                     \
	add	h, h, t5;                                                     \
	add	d, d, t6;                     /* w1w2 + d => d */             \
      IOP(4, iop_param);                                                      \
	rolw(k, k, 7);                        /* rol (t0 + e + t), 7) => k */ \
	GG##i##_2(e, f, g, t3, t4);                                           \
	add	h, h, k;                      /* h + w1 + k => h */           \
      IOP(5, iop_param);                                                      \
	FF##i##_3(a, b, c, t1, t2);                                           \
	eor	t0, t0, k;                    /* k ^ t0 => t0 */              \
	GG##i##_3(e, f, g, t3, t4);                                           \
	add	d, d, t1;                     /* FF(a,b,c) + d => d */        \
      IOP(6, iop_param);                                                      \
	add	t3, t3, h;                    /* GG(e,f,g) + h => t3 */       \
	rolw(b, b, 9);                        /* rol(b, 9) => b */            \
	eor	h, t3, t3, ror #(32-9);                                       \
      IOP(7, iop_param);                                                      \
	add	d, d, t0;                     /* t0 + d => d */               \
	rolw(f, f, 19);                       /* rol(f, 19) => f */           \
      IOP(8, iop_param);                                                      \
	eor	h, h, t3, ror #(32-17);       /* P0(t3) => h */
154
/* Round built from the FF1/GG1 pieces; the function uses it for rounds 0-15. */
#define R1(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
	R(1, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)

/* Round built from the FF2/GG2 pieces; the function uses it for rounds 16-63. */
#define R2(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
	R(2, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)

/*
 * Load two consecutive 32-bit table entries, those for 'round' and for
 * 'round' + 1, from the table at RKPTR into k_even and k_odd.
 */
#define KL(round) \
	ldp	k_even, k_odd, [RKPTR, #(4*(round))];
163
/*
 * Input expansion macros.
 *
 * The *_ADDR macros yield the sp-relative byte offset of 32-bit word 'word'
 * of the vector that serves round 'rnd'; 'base' selects the vector inside
 * its 64-byte slot.
 */

/* Byte-swapped input address: one 64-byte slot per group of four rounds. */
#define IW_W_ADDR(rnd, word, base) \
	(STACK_W + (base) + ((word) * 4) + ((rnd) / 4) * 64)

/*
 * Expanded input address: successive groups of three rounds alternate
 * between the first two 64-byte slots.
 */
#define XW_W_ADDR(rnd, word, base) \
	(STACK_W + (base) + ((word) * 4) + ((((rnd) / 3) - 4) % 2) * 64)

/* Rounds 0-11, byte-swapped input block: words at +32, XORed words at +48. */
#define IW_W1_ADDR(rnd, word)   IW_W_ADDR(rnd, word, 32)
#define IW_W1W2_ADDR(rnd, word) IW_W_ADDR(rnd, word, 48)

/* Rounds 12-63, expanded input block: words at +0, XORed words at +16. */
#define XW_W1_ADDR(rnd, word)   XW_W_ADDR(rnd, word, 0)
#define XW_W1W2_ADDR(rnd, word) XW_W_ADDR(rnd, word, 16)
181
/*
 * Input block loading.
 * Interleaving within round function needed for in-order CPUs.
 *
 * The work is cut into 3 x 8 single steps, LOAD_W_VEC_<part>_<step>, sized to
 * fit the eight IOP slots of R().  Run in order (1_1 .. 3_8) they:
 *  - read 64 bytes from RDATA (post-incrementing it) into W0-W3 and byte-swap
 *    every 32-bit word into XTMP0-XTMP3;
 *  - form XTMP4/5/6 as the XOR of neighbouring swapped vectors;
 *  - store (XTMP0, XTMP4), (XTMP1, XTMP5) and (XTMP2, XTMP6) as pairs at the
 *    IW_W1_ADDR slots of rounds 0, 4 and 8;
 *  - leave W0-W5 holding the sixteen input words, three per register, as
 *    noted on the individual steps (xx marks a lane that is not meaningful).
 * Clobbers addr0 and addr1.
 */
#define LOAD_W_VEC_1_1() \
	add	addr0, sp, #IW_W1_ADDR(0, 0);
#define LOAD_W_VEC_1_2() \
	add	addr1, sp, #IW_W1_ADDR(4, 0);
#define LOAD_W_VEC_1_3() \
	ld1	{W0.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_4() \
	ld1	{W1.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_5() \
	ld1	{W2.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_6() \
	ld1	{W3.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_7() \
	rev32	XTMP0.16b, W0.16b;
#define LOAD_W_VEC_1_8() \
	rev32	XTMP1.16b, W1.16b;
#define LOAD_W_VEC_2_1() \
	rev32	XTMP2.16b, W2.16b;
#define LOAD_W_VEC_2_2() \
	rev32	XTMP3.16b, W3.16b;
#define LOAD_W_VEC_2_3() \
	eor	XTMP4.16b, XTMP1.16b, XTMP0.16b;
#define LOAD_W_VEC_2_4() \
	eor	XTMP5.16b, XTMP2.16b, XTMP1.16b;
#define LOAD_W_VEC_2_5() \
	st1	{XTMP0.16b}, [addr0], #16;
#define LOAD_W_VEC_2_6() \
	st1	{XTMP4.16b}, [addr0]; \
	add	addr0, sp, #IW_W1_ADDR(8, 0);
#define LOAD_W_VEC_2_7() \
	eor	XTMP6.16b, XTMP3.16b, XTMP2.16b;
#define LOAD_W_VEC_2_8() \
	ext	W0.16b, XTMP0.16b, XTMP0.16b, #8;  /* W0: xx, w0, xx, xx */
#define LOAD_W_VEC_3_1() \
	mov	W2.16b, XTMP1.16b;                 /* W2: xx, w6, w5, w4 */
#define LOAD_W_VEC_3_2() \
	st1	{XTMP1.16b}, [addr1], #16;
#define LOAD_W_VEC_3_3() \
	st1	{XTMP5.16b}, [addr1]; \
	ext	W1.16b, XTMP0.16b, XTMP0.16b, #4;  /* W1: xx, w3, w2, w1 */
#define LOAD_W_VEC_3_4() \
	ext	W3.16b, XTMP1.16b, XTMP2.16b, #12; /* W3: xx, w9, w8, w7 */
#define LOAD_W_VEC_3_5() \
	ext	W4.16b, XTMP2.16b, XTMP3.16b, #8;  /* W4: xx, w12, w11, w10 */
#define LOAD_W_VEC_3_6() \
	st1	{XTMP2.16b}, [addr0], #16;
#define LOAD_W_VEC_3_7() \
	st1	{XTMP6.16b}, [addr0];
#define LOAD_W_VEC_3_8() \
	ext	W5.16b, XTMP3.16b, XTMP3.16b, #4;  /* W5: xx, w15, w14, w13 */

/*
 * IOP-compatible entry points: expand to step 'iop_num' of the given part;
 * any further argument is ignored.
 */
#define LOAD_W_VEC_1(iop_num, ...) \
	LOAD_W_VEC_1_##iop_num()
#define LOAD_W_VEC_2(iop_num, ...) \
	LOAD_W_VEC_2_##iop_num()
#define LOAD_W_VEC_3(iop_num, ...) \
	LOAD_W_VEC_3_##iop_num()
241
/*
 * Message scheduling. Note: 3 words per vector register.
 * Interleaving within round function needed for in-order CPUs.
 *
 * w0..w5 name the six schedule registers from oldest to newest.  Run in order,
 * the steps SCHED_W_1_1 .. SCHED_W_3_8 compute
 *
 *     P1(w[i-16] ^ w[i-9] ^ rol(w[i-3], 15)) ^ rol(w[i-13], 7) ^ w[i-6]
 *
 * with P1(x) = x ^ rol(x, 15) ^ rol(x, 23), per 32-bit lane, and leave the
 * result in w0, overwriting the oldest words.  Finally two vectors are stored
 * back to back at XW_W1_ADDR(round, 0): XTMP2, gathered from the top word of
 * w4 followed by the words of w5, and XTMP3 = XTMP2 ^ (new w0).
 * Clobbers XTMP0-XTMP5 and addr0.
 */
#define SCHED_W_1_1(round, w0, w1, w2, w3, w4, w5) \
	/* Load (w[i - 16]) => XTMP0 */            \
	/* Load (w[i - 13]) => XTMP5 */            \
	ext	XTMP0.16b, w0.16b, w0.16b, #12;    /* XTMP0: w0, xx, xx, xx */
#define SCHED_W_1_2(round, w0, w1, w2, w3, w4, w5) \
	ext	XTMP5.16b, w1.16b, w1.16b, #12;
#define SCHED_W_1_3(round, w0, w1, w2, w3, w4, w5) \
	ext	XTMP0.16b, XTMP0.16b, w1.16b, #12; /* XTMP0: xx, w2, w1, w0 */
#define SCHED_W_1_4(round, w0, w1, w2, w3, w4, w5) \
	ext	XTMP5.16b, XTMP5.16b, w2.16b, #12;
#define SCHED_W_1_5(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 9] == w3 */                       \
	/* W3 ^ XTMP0 => XTMP0 */                  \
	eor	XTMP0.16b, XTMP0.16b, w3.16b;
#define SCHED_W_1_6(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 3] == w5 */                       \
	/* rol(w5, 15) ^ XTMP0 => XTMP0 */         \
	/* rol(XTMP5, 7) => XTMP1 */               \
	add	addr0, sp, #XW_W1_ADDR((round), 0); \
	shl	XTMP2.4s, w5.4s, #15;
#define SCHED_W_1_7(round, w0, w1, w2, w3, w4, w5) \
	shl	XTMP1.4s, XTMP5.4s, #7;
#define SCHED_W_1_8(round, w0, w1, w2, w3, w4, w5) \
	sri	XTMP2.4s, w5.4s, #(32-15);
#define SCHED_W_2_1(round, w0, w1, w2, w3, w4, w5) \
	sri	XTMP1.4s, XTMP5.4s, #(32-7);
#define SCHED_W_2_2(round, w0, w1, w2, w3, w4, w5) \
	eor	XTMP0.16b, XTMP0.16b, XTMP2.16b;
#define SCHED_W_2_3(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 6] == W4 */                       \
	/* W4 ^ XTMP1 => XTMP1 */                  \
	eor	XTMP1.16b, XTMP1.16b, w4.16b;
#define SCHED_W_2_4(round, w0, w1, w2, w3, w4, w5) \
	/* P1(XTMP0) ^ XTMP1 => W0 */              \
	shl	XTMP3.4s, XTMP0.4s, #15;
#define SCHED_W_2_5(round, w0, w1, w2, w3, w4, w5) \
	shl	XTMP4.4s, XTMP0.4s, #23;
#define SCHED_W_2_6(round, w0, w1, w2, w3, w4, w5) \
	eor	w0.16b, XTMP1.16b, XTMP0.16b;
#define SCHED_W_2_7(round, w0, w1, w2, w3, w4, w5) \
	sri	XTMP3.4s, XTMP0.4s, #(32-15);
#define SCHED_W_2_8(round, w0, w1, w2, w3, w4, w5) \
	sri	XTMP4.4s, XTMP0.4s, #(32-23);
#define SCHED_W_3_1(round, w0, w1, w2, w3, w4, w5) \
	eor	w0.16b, w0.16b, XTMP3.16b;
#define SCHED_W_3_2(round, w0, w1, w2, w3, w4, w5) \
	/* Gather top word of w4 + words of w5 => XTMP2 */ \
	ext	XTMP2.16b, w4.16b, w4.16b, #12;
#define SCHED_W_3_3(round, w0, w1, w2, w3, w4, w5) \
	eor	w0.16b, w0.16b, XTMP4.16b;
#define SCHED_W_3_4(round, w0, w1, w2, w3, w4, w5) \
	ext	XTMP2.16b, XTMP2.16b, w5.16b, #12;
#define SCHED_W_3_5(round, w0, w1, w2, w3, w4, w5) \
	/* XTMP2 ^ new w0 => XTMP3 */              \
	eor	XTMP3.16b, XTMP2.16b, w0.16b;
#define SCHED_W_3_6(round, w0, w1, w2, w3, w4, w5)
#define SCHED_W_3_7(round, w0, w1, w2, w3, w4, w5) \
	st1	{XTMP2.16b-XTMP3.16b}, [addr0];
#define SCHED_W_3_8(round, w0, w1, w2, w3, w4, w5)
303
/*
 * IOP-compatible wrappers around SCHED_W_<part>_<step>.  The macro name
 * spells out which physical register plays the role of w0..w5; 'step' is the
 * IOP slot (1-8) and 'rnd' is handed through as the round argument.
 */

/* Window starts at W0. */
#define SCHED_W_W0W1W2W3W4W5_1(step, rnd) SCHED_W_1_##step(rnd, W0, W1, W2, W3, W4, W5)
#define SCHED_W_W0W1W2W3W4W5_2(step, rnd) SCHED_W_2_##step(rnd, W0, W1, W2, W3, W4, W5)
#define SCHED_W_W0W1W2W3W4W5_3(step, rnd) SCHED_W_3_##step(rnd, W0, W1, W2, W3, W4, W5)

/* Window starts at W1. */
#define SCHED_W_W1W2W3W4W5W0_1(step, rnd) SCHED_W_1_##step(rnd, W1, W2, W3, W4, W5, W0)
#define SCHED_W_W1W2W3W4W5W0_2(step, rnd) SCHED_W_2_##step(rnd, W1, W2, W3, W4, W5, W0)
#define SCHED_W_W1W2W3W4W5W0_3(step, rnd) SCHED_W_3_##step(rnd, W1, W2, W3, W4, W5, W0)

/* Window starts at W2. */
#define SCHED_W_W2W3W4W5W0W1_1(step, rnd) SCHED_W_1_##step(rnd, W2, W3, W4, W5, W0, W1)
#define SCHED_W_W2W3W4W5W0W1_2(step, rnd) SCHED_W_2_##step(rnd, W2, W3, W4, W5, W0, W1)
#define SCHED_W_W2W3W4W5W0W1_3(step, rnd) SCHED_W_3_##step(rnd, W2, W3, W4, W5, W0, W1)

/* Window starts at W3. */
#define SCHED_W_W3W4W5W0W1W2_1(step, rnd) SCHED_W_1_##step(rnd, W3, W4, W5, W0, W1, W2)
#define SCHED_W_W3W4W5W0W1W2_2(step, rnd) SCHED_W_2_##step(rnd, W3, W4, W5, W0, W1, W2)
#define SCHED_W_W3W4W5W0W1W2_3(step, rnd) SCHED_W_3_##step(rnd, W3, W4, W5, W0, W1, W2)

/* Window starts at W4. */
#define SCHED_W_W4W5W0W1W2W3_1(step, rnd) SCHED_W_1_##step(rnd, W4, W5, W0, W1, W2, W3)
#define SCHED_W_W4W5W0W1W2W3_2(step, rnd) SCHED_W_2_##step(rnd, W4, W5, W0, W1, W2, W3)
#define SCHED_W_W4W5W0W1W2W3_3(step, rnd) SCHED_W_3_##step(rnd, W4, W5, W0, W1, W2, W3)

/* Window starts at W5. */
#define SCHED_W_W5W0W1W2W3W4_1(step, rnd) SCHED_W_1_##step(rnd, W5, W0, W1, W2, W3, W4)
#define SCHED_W_W5W0W1W2W3W4_2(step, rnd) SCHED_W_2_##step(rnd, W5, W0, W1, W2, W3, W4)
#define SCHED_W_W5W0W1W2W3W4_3(step, rnd) SCHED_W_3_##step(rnd, W5, W0, W1, W2, W3, W4)
345
346
347	/*
348	 * Transform blocks*64 bytes (blocks*16 32-bit words) at 'src'.
349	 *
350	 * void sm3_neon_transform(struct sm3_state *sst, u8 const *src,
351	 *                         int blocks)
352	 */
353	.text
354.align 3
355SYM_TYPED_FUNC_START(sm3_neon_transform)
356	ldp		ra, rb, [RSTATE, #0]
357	ldp		rc, rd, [RSTATE, #8]
358	ldp		re, rf, [RSTATE, #16]
359	ldp		rg, rh, [RSTATE, #24]
360
361	stp		x28, x29, [sp, #-16]!
362	stp		x19, x20, [sp, #-16]!
363	stp		x21, x22, [sp, #-16]!
364	stp		x23, x24, [sp, #-16]!
365	stp		x25, x26, [sp, #-16]!
366	mov		RFRAME, sp
367
368	sub		addr0, sp, #STACK_SIZE
369	adr_l		RKPTR, .LKtable
370	and		sp, addr0, #(~63)
371
372	/* Preload first block. */
373	LOAD_W_VEC_1(1, 0)
374	LOAD_W_VEC_1(2, 0)
375	LOAD_W_VEC_1(3, 0)
376	LOAD_W_VEC_1(4, 0)
377	LOAD_W_VEC_1(5, 0)
378	LOAD_W_VEC_1(6, 0)
379	LOAD_W_VEC_1(7, 0)
380	LOAD_W_VEC_1(8, 0)
381	LOAD_W_VEC_2(1, 0)
382	LOAD_W_VEC_2(2, 0)
383	LOAD_W_VEC_2(3, 0)
384	LOAD_W_VEC_2(4, 0)
385	LOAD_W_VEC_2(5, 0)
386	LOAD_W_VEC_2(6, 0)
387	LOAD_W_VEC_2(7, 0)
388	LOAD_W_VEC_2(8, 0)
389	LOAD_W_VEC_3(1, 0)
390	LOAD_W_VEC_3(2, 0)
391	LOAD_W_VEC_3(3, 0)
392	LOAD_W_VEC_3(4, 0)
393	LOAD_W_VEC_3(5, 0)
394	LOAD_W_VEC_3(6, 0)
395	LOAD_W_VEC_3(7, 0)
396	LOAD_W_VEC_3(8, 0)
397
398.balign 16
399.Loop:
400	/* Transform 0-3 */
401	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 0, 0, IW, _, 0)
402	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  1, 1, IW, _, 0)
403	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 2, 2, IW, _, 0)
404	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  3, 3, IW, _, 0)
405
406	/* Transform 4-7 + Precalc 12-14 */
407	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 4, 0, IW, _, 0)
408	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  5, 1, IW, _, 0)
409	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 6, 2, IW, SCHED_W_W0W1W2W3W4W5_1, 12)
410	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  7, 3, IW, SCHED_W_W0W1W2W3W4W5_2, 12)
411
412	/* Transform 8-11 + Precalc 12-17 */
413	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 8, 0, IW, SCHED_W_W0W1W2W3W4W5_3, 12)
414	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  9, 1, IW, SCHED_W_W1W2W3W4W5W0_1, 15)
415	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 10, 2, IW, SCHED_W_W1W2W3W4W5W0_2, 15)
416	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  11, 3, IW, SCHED_W_W1W2W3W4W5W0_3, 15)
417
418	/* Transform 12-14 + Precalc 18-20 */
419	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 12, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 18)
420	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  13, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 18)
421	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 14, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 18)
422
423	/* Transform 15-17 + Precalc 21-23 */
424	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  15, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 21)
425	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 16, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 21)
426	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  17, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 21)
427
428	/* Transform 18-20 + Precalc 24-26 */
429	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 18, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 24)
430	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  19, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 24)
431	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 20, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 24)
432
433	/* Transform 21-23 + Precalc 27-29 */
434	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  21, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 27)
435	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 22, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 27)
436	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  23, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 27)
437
438	/* Transform 24-26 + Precalc 30-32 */
439	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 24, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 30)
440	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  25, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 30)
441	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 26, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 30)
442
443	/* Transform 27-29 + Precalc 33-35 */
444	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  27, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 33)
445	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 28, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 33)
446	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  29, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 33)
447
448	/* Transform 30-32 + Precalc 36-38 */
449	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 30, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 36)
450	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  31, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 36)
451	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 32, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 36)
452
453	/* Transform 33-35 + Precalc 39-41 */
454	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  33, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 39)
455	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 34, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 39)
456	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  35, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 39)
457
458	/* Transform 36-38 + Precalc 42-44 */
459	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 36, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 42)
460	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  37, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 42)
461	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 38, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 42)
462
463	/* Transform 39-41 + Precalc 45-47 */
464	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  39, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 45)
465	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 40, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 45)
466	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  41, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 45)
467
468	/* Transform 42-44 + Precalc 48-50 */
469	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 42, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 48)
470	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  43, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 48)
471	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 44, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 48)
472
473	/* Transform 45-47 + Precalc 51-53 */
474	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  45, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 51)
475	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 46, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 51)
476	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  47, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 51)
477
478	/* Transform 48-50 + Precalc 54-56 */
479	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 48, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 54)
480	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  49, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 54)
481	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 50, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 54)
482
483	/* Transform 51-53 + Precalc 57-59 */
484	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  51, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 57)
485	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 52, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 57)
486	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  53, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 57)
487
488	/* Transform 54-56 + Precalc 60-62 */
489	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 54, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 60)
490	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  55, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 60)
491	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 56, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 60)
492
493	/* Transform 57-59 + Precalc 63 */
494	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  57, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 63)
495	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 58, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 63)
496	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  59, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 63)
497
498	/* Transform 60 */
499	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 60, 0, XW, _, _)
500	subs		RNBLKS, RNBLKS, #1
501	b.eq		.Lend
502
503	/* Transform 61-63 + Preload next block */
504	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  61, 1, XW, LOAD_W_VEC_1, _)
505	ldp		s0, s1, [RSTATE, #0]
506	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, LOAD_W_VEC_2, _)
507	ldp		s2, s3, [RSTATE, #8]
508	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  63, 0, XW, LOAD_W_VEC_3, _)
509
510	/* Update the chaining variables. */
511	eor		ra, ra, s0
512	eor		rb, rb, s1
513	ldp		s0, s1, [RSTATE, #16]
514	eor		rc, rc, s2
515	ldp		k_even, k_odd, [RSTATE, #24]
516	eor		rd, rd, s3
517	eor		re, re, s0
518	stp		ra, rb, [RSTATE, #0]
519	eor		rf, rf, s1
520	stp		rc, rd, [RSTATE, #8]
521	eor		rg, rg, k_even
522	stp		re, rf, [RSTATE, #16]
523	eor		rh, rh, k_odd
524	stp		rg, rh, [RSTATE, #24]
525	b		.Loop
526
527.Lend:
528	/* Transform 61-63 */
529	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  61, 1, XW, _, _)
530	ldp		s0, s1, [RSTATE, #0]
531	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, _, _)
532	ldp		s2, s3, [RSTATE, #8]
533	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  63, 0, XW, _, _)
534
535	/* Update the chaining variables. */
536	eor		ra, ra, s0
537	clear_vec(W0)
538	eor		rb, rb, s1
539	clear_vec(W1)
540	ldp		s0, s1, [RSTATE, #16]
541	clear_vec(W2)
542	eor		rc, rc, s2
543	clear_vec(W3)
544	ldp		k_even, k_odd, [RSTATE, #24]
545	clear_vec(W4)
546	eor		rd, rd, s3
547	clear_vec(W5)
548	eor		re, re, s0
549	clear_vec(XTMP0)
550	stp		ra, rb, [RSTATE, #0]
551	clear_vec(XTMP1)
552	eor		rf, rf, s1
553	clear_vec(XTMP2)
554	stp		rc, rd, [RSTATE, #8]
555	clear_vec(XTMP3)
556	eor		rg, rg, k_even
557	clear_vec(XTMP4)
558	stp		re, rf, [RSTATE, #16]
559	clear_vec(XTMP5)
560	eor		rh, rh, k_odd
561	clear_vec(XTMP6)
562	stp		rg, rh, [RSTATE, #24]
563
564	/* Clear message expansion area */
565	add		addr0, sp, #STACK_W
566	st1		{W0.16b-W3.16b}, [addr0], #64
567	st1		{W0.16b-W3.16b}, [addr0], #64
568	st1		{W0.16b-W3.16b}, [addr0]
569
570	mov		sp, RFRAME
571
572	ldp		x25, x26, [sp], #16
573	ldp		x23, x24, [sp], #16
574	ldp		x21, x22, [sp], #16
575	ldp		x19, x20, [sp], #16
576	ldp		x28, x29, [sp], #16
577
578	ret
579SYM_FUNC_END(sm3_neon_transform)
580
581
582	.section	".rodata", "a"
583
584	.align 4
585.LKtable:
586	.long 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb
587	.long 0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc
588	.long 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce
589	.long 0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6
590	.long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
591	.long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
592	.long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
593	.long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
594	.long 0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53
595	.long 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d
596	.long 0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4
597	.long 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43
598	.long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
599	.long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
600	.long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
601	.long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
602