xref: /linux/lib/crypto/arm64/sm3-neon-core.S (revision 370c3883195566ee3e7d79e0146c3d735a406573)
1*9f69f52bSEric Biggers// SPDX-License-Identifier: GPL-2.0-or-later
2*9f69f52bSEric Biggers/*
3*9f69f52bSEric Biggers * sm3-neon-core.S - SM3 secure hash using NEON instructions
4*9f69f52bSEric Biggers *
5*9f69f52bSEric Biggers * Linux/arm64 port of the libgcrypt SM3 implementation for AArch64
6*9f69f52bSEric Biggers *
7*9f69f52bSEric Biggers * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
8*9f69f52bSEric Biggers * Copyright (c) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
9*9f69f52bSEric Biggers */
10*9f69f52bSEric Biggers
11*9f69f52bSEric Biggers#include <linux/linkage.h>
12*9f69f52bSEric Biggers#include <asm/assembler.h>
13*9f69f52bSEric Biggers
14*9f69f52bSEric Biggers/* Context structure */
15*9f69f52bSEric Biggers
/*
 * Byte offsets of eight consecutive 4-byte state words, h0 at offset 0.
 * NOTE(review): the visible part of sm3_neon_transform addresses the state
 * through RSTATE with literal offsets (#0, #8, #16, #24) rather than these
 * names - confirm whether they are used further down the file.
 */
16*9f69f52bSEric Biggers#define state_h0 0
17*9f69f52bSEric Biggers#define state_h1 4
18*9f69f52bSEric Biggers#define state_h2 8
19*9f69f52bSEric Biggers#define state_h3 12
20*9f69f52bSEric Biggers#define state_h4 16
21*9f69f52bSEric Biggers#define state_h5 20
22*9f69f52bSEric Biggers#define state_h6 24
23*9f69f52bSEric Biggers#define state_h7 28
24*9f69f52bSEric Biggers
25*9f69f52bSEric Biggers/* Stack structure */
26*9f69f52bSEric Biggers
/*
 * Scratch area for message words, addressed relative to sp.
 * STACK_W_SIZE is 192 bytes, i.e. three 64-byte slots: IW_W_ADDR and
 * XW_W_ADDR both step through the area in multiples of 64.
 * STACK_W is the offset of the area from sp; STACK_SIZE is the total amount
 * subtracted from sp before it is aligned down to 64 bytes.
 */
27*9f69f52bSEric Biggers#define STACK_W_SIZE        (32 * 2 * 3)
28*9f69f52bSEric Biggers
29*9f69f52bSEric Biggers#define STACK_W             (0)
30*9f69f52bSEric Biggers#define STACK_SIZE          (STACK_W + STACK_W_SIZE)
31*9f69f52bSEric Biggers
32*9f69f52bSEric Biggers/* Register macros */
33*9f69f52bSEric Biggers
/*
 * Function arguments (x0-x2, in the order of the sm3_neon_transform
 * prototype: state, data, nblocks), the round-constant table pointer and
 * the register that keeps the caller's sp while sp is realigned.
 */
34*9f69f52bSEric Biggers#define RSTATE x0
35*9f69f52bSEric Biggers#define RDATA  x1
36*9f69f52bSEric Biggers#define RNBLKS x2
37*9f69f52bSEric Biggers#define RKPTR  x28
38*9f69f52bSEric Biggers#define RFRAME x29
39*9f69f52bSEric Biggers
/* The eight 32-bit working variables, loaded from [RSTATE] on entry. */
40*9f69f52bSEric Biggers#define ra w3
41*9f69f52bSEric Biggers#define rb w4
42*9f69f52bSEric Biggers#define rc w5
43*9f69f52bSEric Biggers#define rd w6
44*9f69f52bSEric Biggers#define re w7
45*9f69f52bSEric Biggers#define rf w8
46*9f69f52bSEric Biggers#define rg w9
47*9f69f52bSEric Biggers#define rh w10
48*9f69f52bSEric Biggers
/* Scalar scratch registers used inside R(). */
49*9f69f52bSEric Biggers#define t0 w11
50*9f69f52bSEric Biggers#define t1 w12
51*9f69f52bSEric Biggers#define t2 w13
52*9f69f52bSEric Biggers#define t3 w14
53*9f69f52bSEric Biggers#define t4 w15
54*9f69f52bSEric Biggers#define t5 w16
55*9f69f52bSEric Biggers#define t6 w17
56*9f69f52bSEric Biggers
/* Pair of round constants fetched by KL(); also reused as temporaries
 * when the chaining variables are updated. */
57*9f69f52bSEric Biggers#define k_even w19
58*9f69f52bSEric Biggers#define k_odd w20
59*9f69f52bSEric Biggers
/* Pointers into the stack scratch area used by the vector stores. */
60*9f69f52bSEric Biggers#define addr0 x21
61*9f69f52bSEric Biggers#define addr1 x22
62*9f69f52bSEric Biggers
/* Hold previous state words while the chaining variables are updated. */
63*9f69f52bSEric Biggers#define s0 w23
64*9f69f52bSEric Biggers#define s1 w24
65*9f69f52bSEric Biggers#define s2 w25
66*9f69f52bSEric Biggers#define s3 w26
67*9f69f52bSEric Biggers
/* Message-word vectors rotated through by the SCHED_W_* macros. */
68*9f69f52bSEric Biggers#define W0 v0
69*9f69f52bSEric Biggers#define W1 v1
70*9f69f52bSEric Biggers#define W2 v2
71*9f69f52bSEric Biggers#define W3 v3
72*9f69f52bSEric Biggers#define W4 v4
73*9f69f52bSEric Biggers#define W5 v5
74*9f69f52bSEric Biggers
/*
 * Vector temporaries. XTMP2/XTMP3 must stay on consecutive registers
 * (v16/v17): SCHED_W_3_7 stores them with a single register-list st1.
 */
75*9f69f52bSEric Biggers#define XTMP0 v6
76*9f69f52bSEric Biggers#define XTMP1 v7
77*9f69f52bSEric Biggers#define XTMP2 v16
78*9f69f52bSEric Biggers#define XTMP3 v17
79*9f69f52bSEric Biggers#define XTMP4 v18
80*9f69f52bSEric Biggers#define XTMP5 v19
81*9f69f52bSEric Biggers#define XTMP6 v20
82*9f69f52bSEric Biggers
83*9f69f52bSEric Biggers/* Helper macros. */
84*9f69f52bSEric Biggers
/* _() swallows its arguments: passed as a no-op K_LOAD or IOP hook to R(). */
85*9f69f52bSEric Biggers#define _(...) /*_*/
86*9f69f52bSEric Biggers
/* Zero all 128 bits of NEON register x. */
87*9f69f52bSEric Biggers#define clear_vec(x) \
88*9f69f52bSEric Biggers	movi	x.8h, #0;
89*9f69f52bSEric Biggers
/* o = a rotated left by n bits, encoded as a rotate right by (32 - n). */
90*9f69f52bSEric Biggers#define rolw(o, a, n) \
91*9f69f52bSEric Biggers	ror	o, a, #(32 - n);
92*9f69f52bSEric Biggers
93*9f69f52bSEric Biggers/* Round function macros. */
94*9f69f52bSEric Biggers
/*
 * Boolean functions of the round, each split into three steps (_1, _2, _3)
 * so that R() can spread them between other instructions. All take
 * (x, y, z, o, t): inputs x, y, z, result register o, scratch register t.
 *
 *   GG1 / FF1:  o = x ^ y ^ z          (t unused; one step is empty)
 *   GG2:        o = (z & ~x) ^ (y & x)
 *   FF2:        o = ((x ^ y) & z) ^ (x & y)
 *
 * FF1 reuses the GG1 steps but performs the second XOR in step 3 instead of
 * step 2.
 */
95*9f69f52bSEric Biggers#define GG1_1(x, y, z, o, t) \
96*9f69f52bSEric Biggers	eor	o, x, y;
97*9f69f52bSEric Biggers#define GG1_2(x, y, z, o, t) \
98*9f69f52bSEric Biggers	eor	o, o, z;
99*9f69f52bSEric Biggers#define GG1_3(x, y, z, o, t)
100*9f69f52bSEric Biggers
101*9f69f52bSEric Biggers#define FF1_1(x, y, z, o, t) GG1_1(x, y, z, o, t)
102*9f69f52bSEric Biggers#define FF1_2(x, y, z, o, t)
103*9f69f52bSEric Biggers#define FF1_3(x, y, z, o, t) GG1_2(x, y, z, o, t)
104*9f69f52bSEric Biggers
105*9f69f52bSEric Biggers#define GG2_1(x, y, z, o, t) \
106*9f69f52bSEric Biggers	bic	o, z, x;
107*9f69f52bSEric Biggers#define GG2_2(x, y, z, o, t) \
108*9f69f52bSEric Biggers	and	t, y, x;
109*9f69f52bSEric Biggers#define GG2_3(x, y, z, o, t) \
110*9f69f52bSEric Biggers	eor	o, o, t;
111*9f69f52bSEric Biggers
112*9f69f52bSEric Biggers#define FF2_1(x, y, z, o, t) \
113*9f69f52bSEric Biggers	eor	o, x, y;
114*9f69f52bSEric Biggers#define FF2_2(x, y, z, o, t) \
115*9f69f52bSEric Biggers	and	t, x, y; \
116*9f69f52bSEric Biggers	and	o, o, z;
117*9f69f52bSEric Biggers#define FF2_3(x, y, z, o, t) \
118*9f69f52bSEric Biggers	eor	o, o, t;
119*9f69f52bSEric Biggers
/*
 * R() - one round of the compression function.
 *
 *   i                  selects the FF<i>_n / GG<i>_n boolean-function steps
 *   a..h               32-bit working registers as seen by this round
 *   k                  register holding this round's constant; clobbered
 *   K_LOAD(round)      expanded first; KL loads a constant pair, _ does nothing
 *   round, widx, wtype select the two stack words loaded into t5 and t6
 *                      through wtype##_W1_ADDR and wtype##_W1W2_ADDR
 *   IOP(n, iop_param)  expanded at eight points (n = 1..8) so that unrelated
 *                      work can be interleaved with the scalar round
 *
 * Updates b, d, f and h in place and uses t0-t6 as scratch.
 */
120*9f69f52bSEric Biggers#define R(i, a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
121*9f69f52bSEric Biggers	K_LOAD(round);                                                        \
122*9f69f52bSEric Biggers	ldr	t5, [sp, #(wtype##_W1_ADDR(round, widx))];                    \
123*9f69f52bSEric Biggers	rolw(t0, a, 12);                              /* rol(a, 12) => t0 */  \
124*9f69f52bSEric Biggers      IOP(1, iop_param);                                                      \
125*9f69f52bSEric Biggers	FF##i##_1(a, b, c, t1, t2);                                           \
126*9f69f52bSEric Biggers	ldr	t6, [sp, #(wtype##_W1W2_ADDR(round, widx))];                  \
127*9f69f52bSEric Biggers	add	k, k, e;                                                      \
128*9f69f52bSEric Biggers      IOP(2, iop_param);                                                      \
129*9f69f52bSEric Biggers	GG##i##_1(e, f, g, t3, t4);                                           \
130*9f69f52bSEric Biggers	FF##i##_2(a, b, c, t1, t2);                                           \
131*9f69f52bSEric Biggers      IOP(3, iop_param);                                                      \
132*9f69f52bSEric Biggers	add	k, k, t0;                                                     \
133*9f69f52bSEric Biggers	add	h, h, t5;                                                     \
134*9f69f52bSEric Biggers	add	d, d, t6;                     /* w1w2 + d => d */             \
135*9f69f52bSEric Biggers      IOP(4, iop_param);                                                      \
136*9f69f52bSEric Biggers	rolw(k, k, 7);                        /* rol((t0 + e + k), 7) => k */ \
137*9f69f52bSEric Biggers	GG##i##_2(e, f, g, t3, t4);                                           \
138*9f69f52bSEric Biggers	add	h, h, k;                      /* h + w1 + k => h */           \
139*9f69f52bSEric Biggers      IOP(5, iop_param);                                                      \
140*9f69f52bSEric Biggers	FF##i##_3(a, b, c, t1, t2);                                           \
141*9f69f52bSEric Biggers	eor	t0, t0, k;                    /* k ^ t0 => t0 */              \
142*9f69f52bSEric Biggers	GG##i##_3(e, f, g, t3, t4);                                           \
143*9f69f52bSEric Biggers	add	d, d, t1;                     /* FF(a,b,c) + d => d */        \
144*9f69f52bSEric Biggers      IOP(6, iop_param);                                                      \
145*9f69f52bSEric Biggers	add	t3, t3, h;                    /* GG(e,f,g) + h => t3 */       \
146*9f69f52bSEric Biggers	rolw(b, b, 9);                        /* rol(b, 9) => b */            \
147*9f69f52bSEric Biggers	eor	h, t3, t3, ror #(32-9);                                       \
148*9f69f52bSEric Biggers      IOP(7, iop_param);                                                      \
149*9f69f52bSEric Biggers	add	d, d, t0;                     /* t0 + d => d */               \
150*9f69f52bSEric Biggers	rolw(f, f, 19);                       /* rol(f, 19) => f */           \
151*9f69f52bSEric Biggers      IOP(8, iop_param);                                                      \
152*9f69f52bSEric Biggers	eor	h, h, t3, ror #(32-17);       /* P0(t3) => h */
153*9f69f52bSEric Biggers
/* R1: round using the FF1/GG1 functions (sm3_neon_transform uses it for
 * rounds 0-15). */
154*9f69f52bSEric Biggers#define R1(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
155*9f69f52bSEric Biggers	R(1, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)
156*9f69f52bSEric Biggers
/* R2: round using the FF2/GG2 functions (sm3_neon_transform uses it for
 * rounds 16-63). */
157*9f69f52bSEric Biggers#define R2(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
158*9f69f52bSEric Biggers	R(2, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)
159*9f69f52bSEric Biggers
/*
 * KL(round): load the 32-bit table entries for 'round' and 'round + 1' from
 * RKPTR into k_even and k_odd. Callers invoke it on even rounds and pass _
 * as K_LOAD on the following odd round, which consumes k_odd.
 */
160*9f69f52bSEric Biggers#define KL(round) \
161*9f69f52bSEric Biggers	ldp	k_even, k_odd, [RKPTR, #(4*(round))];
162*9f69f52bSEric Biggers
163*9f69f52bSEric Biggers/* Input expansion macros. */
164*9f69f52bSEric Biggers
/*
 * Byte-swapped input address.
 * Each group of four rounds owns one 64-byte slot; 'offs' selects the
 * 16-byte vector inside the slot and 'widx' the 32-bit lane inside it.
 */
166*9f69f52bSEric Biggers#define IW_W_ADDR(round, widx, offs) \
167*9f69f52bSEric Biggers	(STACK_W + ((round) / 4) * 64 + (offs) + ((widx) * 4))
168*9f69f52bSEric Biggers
/*
 * Expanded input address.
 * Each group of three rounds alternates between the slots at offsets 0 and
 * 64, starting with offset 0 for rounds 12-14.
 */
170*9f69f52bSEric Biggers#define XW_W_ADDR(round, widx, offs) \
171*9f69f52bSEric Biggers	(STACK_W + ((((round) / 3) - 4) % 2) * 64 + (offs) + ((widx) * 4))
172*9f69f52bSEric Biggers
173*9f69f52bSEric Biggers/* Rounds 0-11, byte-swapped input block addresses. */
174*9f69f52bSEric Biggers#define IW_W1_ADDR(round, widx)   IW_W_ADDR(round, widx, 32)
175*9f69f52bSEric Biggers#define IW_W1W2_ADDR(round, widx) IW_W_ADDR(round, widx, 48)
176*9f69f52bSEric Biggers
177*9f69f52bSEric Biggers/* Rounds 12-63, expanded input block addresses. */
178*9f69f52bSEric Biggers#define XW_W1_ADDR(round, widx)   XW_W_ADDR(round, widx, 0)
179*9f69f52bSEric Biggers#define XW_W1W2_ADDR(round, widx) XW_W_ADDR(round, widx, 16)
180*9f69f52bSEric Biggers
181*9f69f52bSEric Biggers/* Input block loading.
182*9f69f52bSEric Biggers * Interleaving within round function needed for in-order CPUs. */
/*
 * The work is cut into three parts of eight single steps,
 * LOAD_W_VEC_<part>_<step>, so that each part fits the eight IOP hooks of
 * one R() invocation. Taken together the steps:
 *   - read 64 bytes from RDATA (post-incremented) into W0-W3;
 *   - byte-swap every 32-bit word into XTMP0-XTMP3;
 *   - form XTMP4/5/6 as the XOR of adjacent swapped vectors;
 *   - store {XTMP0, XTMP4}, {XTMP1, XTMP5}, {XTMP2, XTMP6} to the stack slots
 *     starting at IW_W1_ADDR(0, 0), IW_W1_ADDR(4, 0) and IW_W1_ADDR(8, 0);
 *   - rebuild W0-W5 with three message words each for the SCHED_W_* macros.
 * addr0 and addr1 are clobbered.
 */
183*9f69f52bSEric Biggers#define LOAD_W_VEC_1_1() \
184*9f69f52bSEric Biggers	add	addr0, sp, #IW_W1_ADDR(0, 0);
185*9f69f52bSEric Biggers#define LOAD_W_VEC_1_2() \
186*9f69f52bSEric Biggers	add	addr1, sp, #IW_W1_ADDR(4, 0);
187*9f69f52bSEric Biggers#define LOAD_W_VEC_1_3() \
188*9f69f52bSEric Biggers	ld1	{W0.16b}, [RDATA], #16;
189*9f69f52bSEric Biggers#define LOAD_W_VEC_1_4() \
190*9f69f52bSEric Biggers	ld1	{W1.16b}, [RDATA], #16;
191*9f69f52bSEric Biggers#define LOAD_W_VEC_1_5() \
192*9f69f52bSEric Biggers	ld1	{W2.16b}, [RDATA], #16;
193*9f69f52bSEric Biggers#define LOAD_W_VEC_1_6() \
194*9f69f52bSEric Biggers	ld1	{W3.16b}, [RDATA], #16;
195*9f69f52bSEric Biggers#define LOAD_W_VEC_1_7() \
196*9f69f52bSEric Biggers	rev32	XTMP0.16b, W0.16b;
197*9f69f52bSEric Biggers#define LOAD_W_VEC_1_8() \
198*9f69f52bSEric Biggers	rev32	XTMP1.16b, W1.16b;
199*9f69f52bSEric Biggers#define LOAD_W_VEC_2_1() \
200*9f69f52bSEric Biggers	rev32	XTMP2.16b, W2.16b;
201*9f69f52bSEric Biggers#define LOAD_W_VEC_2_2() \
202*9f69f52bSEric Biggers	rev32	XTMP3.16b, W3.16b;
203*9f69f52bSEric Biggers#define LOAD_W_VEC_2_3() \
204*9f69f52bSEric Biggers	eor	XTMP4.16b, XTMP1.16b, XTMP0.16b;
205*9f69f52bSEric Biggers#define LOAD_W_VEC_2_4() \
206*9f69f52bSEric Biggers	eor	XTMP5.16b, XTMP2.16b, XTMP1.16b;
207*9f69f52bSEric Biggers#define LOAD_W_VEC_2_5() \
208*9f69f52bSEric Biggers	st1	{XTMP0.16b}, [addr0], #16;
209*9f69f52bSEric Biggers#define LOAD_W_VEC_2_6() \
210*9f69f52bSEric Biggers	st1	{XTMP4.16b}, [addr0]; \
211*9f69f52bSEric Biggers	add	addr0, sp, #IW_W1_ADDR(8, 0);
212*9f69f52bSEric Biggers#define LOAD_W_VEC_2_7() \
213*9f69f52bSEric Biggers	eor	XTMP6.16b, XTMP3.16b, XTMP2.16b;
214*9f69f52bSEric Biggers#define LOAD_W_VEC_2_8() \
215*9f69f52bSEric Biggers	ext	W0.16b, XTMP0.16b, XTMP0.16b, #8;  /* W0: xx, w0, xx, xx */
216*9f69f52bSEric Biggers#define LOAD_W_VEC_3_1() \
217*9f69f52bSEric Biggers	mov	W2.16b, XTMP1.16b;                 /* W2: xx, w6, w5, w4 */
218*9f69f52bSEric Biggers#define LOAD_W_VEC_3_2() \
219*9f69f52bSEric Biggers	st1	{XTMP1.16b}, [addr1], #16;
220*9f69f52bSEric Biggers#define LOAD_W_VEC_3_3() \
221*9f69f52bSEric Biggers	st1	{XTMP5.16b}, [addr1]; \
222*9f69f52bSEric Biggers	ext	W1.16b, XTMP0.16b, XTMP0.16b, #4;  /* W1: xx, w3, w2, w1 */
223*9f69f52bSEric Biggers#define LOAD_W_VEC_3_4() \
224*9f69f52bSEric Biggers	ext	W3.16b, XTMP1.16b, XTMP2.16b, #12; /* W3: xx, w9, w8, w7 */
225*9f69f52bSEric Biggers#define LOAD_W_VEC_3_5() \
226*9f69f52bSEric Biggers	ext	W4.16b, XTMP2.16b, XTMP3.16b, #8;  /* W4: xx, w12, w11, w10 */
227*9f69f52bSEric Biggers#define LOAD_W_VEC_3_6() \
228*9f69f52bSEric Biggers	st1	{XTMP2.16b}, [addr0], #16;
229*9f69f52bSEric Biggers#define LOAD_W_VEC_3_7() \
230*9f69f52bSEric Biggers	st1	{XTMP6.16b}, [addr0];
231*9f69f52bSEric Biggers#define LOAD_W_VEC_3_8() \
232*9f69f52bSEric Biggers	ext	W5.16b, XTMP3.16b, XTMP3.16b, #4;  /* W5: xx, w15, w14, w13 */
233*9f69f52bSEric Biggers
/* IOP-compatible dispatchers: pick step iop_num of a part; the second
 * (iop_param) argument is accepted and ignored. */
234*9f69f52bSEric Biggers#define LOAD_W_VEC_1(iop_num, ...) \
235*9f69f52bSEric Biggers	LOAD_W_VEC_1_##iop_num()
236*9f69f52bSEric Biggers#define LOAD_W_VEC_2(iop_num, ...) \
237*9f69f52bSEric Biggers	LOAD_W_VEC_2_##iop_num()
238*9f69f52bSEric Biggers#define LOAD_W_VEC_3(iop_num, ...) \
239*9f69f52bSEric Biggers	LOAD_W_VEC_3_##iop_num()
240*9f69f52bSEric Biggers
241*9f69f52bSEric Biggers/* Message scheduling. Note: 3 words per vector register.
242*9f69f52bSEric Biggers * Interleaving within round function needed for in-order CPUs. */
/*
 * SCHED_W_<part>_<step>(round, w0..w5) is one step of a three-part
 * computation; each part supplies the eight IOP hooks of one R() invocation.
 * w0..w5 name the six message vectors in their current rotation; the new
 * words are written to w0. Part 1 step 6 points addr0 at
 * XW_W1_ADDR(round, 0), and part 3 step 7 stores XTMP2 and XTMP3 there as
 * one 32-byte pair. Clobbers XTMP0-XTMP5 and addr0.
 */
243*9f69f52bSEric Biggers#define SCHED_W_1_1(round, w0, w1, w2, w3, w4, w5) \
244*9f69f52bSEric Biggers	/* Load (w[i - 16]) => XTMP0 */            \
245*9f69f52bSEric Biggers	/* Load (w[i - 13]) => XTMP5 */            \
246*9f69f52bSEric Biggers	ext	XTMP0.16b, w0.16b, w0.16b, #12;    /* XTMP0: w0, xx, xx, xx */
247*9f69f52bSEric Biggers#define SCHED_W_1_2(round, w0, w1, w2, w3, w4, w5) \
248*9f69f52bSEric Biggers	ext	XTMP5.16b, w1.16b, w1.16b, #12;
249*9f69f52bSEric Biggers#define SCHED_W_1_3(round, w0, w1, w2, w3, w4, w5) \
250*9f69f52bSEric Biggers	ext	XTMP0.16b, XTMP0.16b, w1.16b, #12; /* XTMP0: xx, w2, w1, w0 */
251*9f69f52bSEric Biggers#define SCHED_W_1_4(round, w0, w1, w2, w3, w4, w5) \
252*9f69f52bSEric Biggers	ext	XTMP5.16b, XTMP5.16b, w2.16b, #12;
253*9f69f52bSEric Biggers#define SCHED_W_1_5(round, w0, w1, w2, w3, w4, w5) \
254*9f69f52bSEric Biggers	/* w[i - 9] == w3 */                       \
255*9f69f52bSEric Biggers	/* W3 ^ XTMP0 => XTMP0 */                  \
256*9f69f52bSEric Biggers	eor	XTMP0.16b, XTMP0.16b, w3.16b;
257*9f69f52bSEric Biggers#define SCHED_W_1_6(round, w0, w1, w2, w3, w4, w5) \
258*9f69f52bSEric Biggers	/* w[i - 3] == w5 */                       \
259*9f69f52bSEric Biggers	/* rol(w5, 15) ^ XTMP0 => XTMP0 */         \
260*9f69f52bSEric Biggers	/* rol(XTMP5, 7) => XTMP1 */               \
261*9f69f52bSEric Biggers	add	addr0, sp, #XW_W1_ADDR((round), 0); \
262*9f69f52bSEric Biggers	shl	XTMP2.4s, w5.4s, #15;
263*9f69f52bSEric Biggers#define SCHED_W_1_7(round, w0, w1, w2, w3, w4, w5) \
264*9f69f52bSEric Biggers	shl	XTMP1.4s, XTMP5.4s, #7;
265*9f69f52bSEric Biggers#define SCHED_W_1_8(round, w0, w1, w2, w3, w4, w5) \
266*9f69f52bSEric Biggers	sri	XTMP2.4s, w5.4s, #(32-15);
267*9f69f52bSEric Biggers#define SCHED_W_2_1(round, w0, w1, w2, w3, w4, w5) \
268*9f69f52bSEric Biggers	sri	XTMP1.4s, XTMP5.4s, #(32-7);
269*9f69f52bSEric Biggers#define SCHED_W_2_2(round, w0, w1, w2, w3, w4, w5) \
270*9f69f52bSEric Biggers	eor	XTMP0.16b, XTMP0.16b, XTMP2.16b;
271*9f69f52bSEric Biggers#define SCHED_W_2_3(round, w0, w1, w2, w3, w4, w5) \
272*9f69f52bSEric Biggers	/* w[i - 6] == W4 */                       \
273*9f69f52bSEric Biggers	/* W4 ^ XTMP1 => XTMP1 */                  \
274*9f69f52bSEric Biggers	eor	XTMP1.16b, XTMP1.16b, w4.16b;
275*9f69f52bSEric Biggers#define SCHED_W_2_4(round, w0, w1, w2, w3, w4, w5) \
276*9f69f52bSEric Biggers	/* P1(XTMP0) ^ XTMP1 => W0 */              \
277*9f69f52bSEric Biggers	shl	XTMP3.4s, XTMP0.4s, #15;
278*9f69f52bSEric Biggers#define SCHED_W_2_5(round, w0, w1, w2, w3, w4, w5) \
279*9f69f52bSEric Biggers	shl	XTMP4.4s, XTMP0.4s, #23;
280*9f69f52bSEric Biggers#define SCHED_W_2_6(round, w0, w1, w2, w3, w4, w5) \
281*9f69f52bSEric Biggers	eor	w0.16b, XTMP1.16b, XTMP0.16b;
282*9f69f52bSEric Biggers#define SCHED_W_2_7(round, w0, w1, w2, w3, w4, w5) \
283*9f69f52bSEric Biggers	sri	XTMP3.4s, XTMP0.4s, #(32-15);
284*9f69f52bSEric Biggers#define SCHED_W_2_8(round, w0, w1, w2, w3, w4, w5) \
285*9f69f52bSEric Biggers	sri	XTMP4.4s, XTMP0.4s, #(32-23);
286*9f69f52bSEric Biggers#define SCHED_W_3_1(round, w0, w1, w2, w3, w4, w5) \
287*9f69f52bSEric Biggers	eor	w0.16b, w0.16b, XTMP3.16b;
288*9f69f52bSEric Biggers#define SCHED_W_3_2(round, w0, w1, w2, w3, w4, w5) \
289*9f69f52bSEric Biggers	/* Load (w[i - 3]) => XTMP2 */             \
290*9f69f52bSEric Biggers	ext	XTMP2.16b, w4.16b, w4.16b, #12;
291*9f69f52bSEric Biggers#define SCHED_W_3_3(round, w0, w1, w2, w3, w4, w5) \
292*9f69f52bSEric Biggers	eor	w0.16b, w0.16b, XTMP4.16b;
293*9f69f52bSEric Biggers#define SCHED_W_3_4(round, w0, w1, w2, w3, w4, w5) \
294*9f69f52bSEric Biggers	ext	XTMP2.16b, XTMP2.16b, w5.16b, #12;
295*9f69f52bSEric Biggers#define SCHED_W_3_5(round, w0, w1, w2, w3, w4, w5) \
296*9f69f52bSEric Biggers	/* W1 ^ W2 => XTMP3 */                     \
297*9f69f52bSEric Biggers	eor	XTMP3.16b, XTMP2.16b, w0.16b;
298*9f69f52bSEric Biggers#define SCHED_W_3_6(round, w0, w1, w2, w3, w4, w5)
299*9f69f52bSEric Biggers#define SCHED_W_3_7(round, w0, w1, w2, w3, w4, w5) \
300*9f69f52bSEric Biggers	st1	{XTMP2.16b-XTMP3.16b}, [addr0];
301*9f69f52bSEric Biggers#define SCHED_W_3_8(round, w0, w1, w2, w3, w4, w5)
302*9f69f52bSEric Biggers
/*
 * IOP-compatible wrappers, SCHED_W_<register order>_<part>(iop_num, round):
 * each binds one of the six rotations of W0-W5 to (w0..w5) and forwards to
 * step iop_num of the named part. R() supplies iop_num and passes its
 * iop_param through as 'round'.
 */
303*9f69f52bSEric Biggers#define SCHED_W_W0W1W2W3W4W5_1(iop_num, round) \
304*9f69f52bSEric Biggers	SCHED_W_1_##iop_num(round, W0, W1, W2, W3, W4, W5)
305*9f69f52bSEric Biggers#define SCHED_W_W0W1W2W3W4W5_2(iop_num, round) \
306*9f69f52bSEric Biggers	SCHED_W_2_##iop_num(round, W0, W1, W2, W3, W4, W5)
307*9f69f52bSEric Biggers#define SCHED_W_W0W1W2W3W4W5_3(iop_num, round) \
308*9f69f52bSEric Biggers	SCHED_W_3_##iop_num(round, W0, W1, W2, W3, W4, W5)
309*9f69f52bSEric Biggers
310*9f69f52bSEric Biggers#define SCHED_W_W1W2W3W4W5W0_1(iop_num, round) \
311*9f69f52bSEric Biggers	SCHED_W_1_##iop_num(round, W1, W2, W3, W4, W5, W0)
312*9f69f52bSEric Biggers#define SCHED_W_W1W2W3W4W5W0_2(iop_num, round) \
313*9f69f52bSEric Biggers	SCHED_W_2_##iop_num(round, W1, W2, W3, W4, W5, W0)
314*9f69f52bSEric Biggers#define SCHED_W_W1W2W3W4W5W0_3(iop_num, round) \
315*9f69f52bSEric Biggers	SCHED_W_3_##iop_num(round, W1, W2, W3, W4, W5, W0)
316*9f69f52bSEric Biggers
317*9f69f52bSEric Biggers#define SCHED_W_W2W3W4W5W0W1_1(iop_num, round) \
318*9f69f52bSEric Biggers	SCHED_W_1_##iop_num(round, W2, W3, W4, W5, W0, W1)
319*9f69f52bSEric Biggers#define SCHED_W_W2W3W4W5W0W1_2(iop_num, round) \
320*9f69f52bSEric Biggers	SCHED_W_2_##iop_num(round, W2, W3, W4, W5, W0, W1)
321*9f69f52bSEric Biggers#define SCHED_W_W2W3W4W5W0W1_3(iop_num, round) \
322*9f69f52bSEric Biggers	SCHED_W_3_##iop_num(round, W2, W3, W4, W5, W0, W1)
323*9f69f52bSEric Biggers
324*9f69f52bSEric Biggers#define SCHED_W_W3W4W5W0W1W2_1(iop_num, round) \
325*9f69f52bSEric Biggers	SCHED_W_1_##iop_num(round, W3, W4, W5, W0, W1, W2)
326*9f69f52bSEric Biggers#define SCHED_W_W3W4W5W0W1W2_2(iop_num, round) \
327*9f69f52bSEric Biggers	SCHED_W_2_##iop_num(round, W3, W4, W5, W0, W1, W2)
328*9f69f52bSEric Biggers#define SCHED_W_W3W4W5W0W1W2_3(iop_num, round) \
329*9f69f52bSEric Biggers	SCHED_W_3_##iop_num(round, W3, W4, W5, W0, W1, W2)
330*9f69f52bSEric Biggers
331*9f69f52bSEric Biggers#define SCHED_W_W4W5W0W1W2W3_1(iop_num, round) \
332*9f69f52bSEric Biggers	SCHED_W_1_##iop_num(round, W4, W5, W0, W1, W2, W3)
333*9f69f52bSEric Biggers#define SCHED_W_W4W5W0W1W2W3_2(iop_num, round) \
334*9f69f52bSEric Biggers	SCHED_W_2_##iop_num(round, W4, W5, W0, W1, W2, W3)
335*9f69f52bSEric Biggers#define SCHED_W_W4W5W0W1W2W3_3(iop_num, round) \
336*9f69f52bSEric Biggers	SCHED_W_3_##iop_num(round, W4, W5, W0, W1, W2, W3)
337*9f69f52bSEric Biggers
338*9f69f52bSEric Biggers#define SCHED_W_W5W0W1W2W3W4_1(iop_num, round) \
339*9f69f52bSEric Biggers	SCHED_W_1_##iop_num(round, W5, W0, W1, W2, W3, W4)
340*9f69f52bSEric Biggers#define SCHED_W_W5W0W1W2W3W4_2(iop_num, round) \
341*9f69f52bSEric Biggers	SCHED_W_2_##iop_num(round, W5, W0, W1, W2, W3, W4)
342*9f69f52bSEric Biggers#define SCHED_W_W5W0W1W2W3W4_3(iop_num, round) \
343*9f69f52bSEric Biggers	SCHED_W_3_##iop_num(round, W5, W0, W1, W2, W3, W4)
344*9f69f52bSEric Biggers
345*9f69f52bSEric Biggers
346*9f69f52bSEric Biggers	/*
347*9f69f52bSEric Biggers	 * Transform nblocks*64 bytes (nblocks*16 32-bit words) at 'data'.
348*9f69f52bSEric Biggers	 *
349*9f69f52bSEric Biggers	 * void sm3_neon_transform(struct sm3_block_state *state,
350*9f69f52bSEric Biggers	 *			   const u8 *data, size_t nblocks)
351*9f69f52bSEric Biggers	 */
352*9f69f52bSEric Biggers	.text
353*9f69f52bSEric Biggers.align 3
354*9f69f52bSEric BiggersSYM_FUNC_START(sm3_neon_transform)
355*9f69f52bSEric Biggers	ldp		ra, rb, [RSTATE, #0]
356*9f69f52bSEric Biggers	ldp		rc, rd, [RSTATE, #8]
357*9f69f52bSEric Biggers	ldp		re, rf, [RSTATE, #16]
358*9f69f52bSEric Biggers	ldp		rg, rh, [RSTATE, #24]
359*9f69f52bSEric Biggers
360*9f69f52bSEric Biggers	stp		x28, x29, [sp, #-16]!
361*9f69f52bSEric Biggers	stp		x19, x20, [sp, #-16]!
362*9f69f52bSEric Biggers	stp		x21, x22, [sp, #-16]!
363*9f69f52bSEric Biggers	stp		x23, x24, [sp, #-16]!
364*9f69f52bSEric Biggers	stp		x25, x26, [sp, #-16]!
365*9f69f52bSEric Biggers	mov		RFRAME, sp
366*9f69f52bSEric Biggers
367*9f69f52bSEric Biggers	sub		addr0, sp, #STACK_SIZE
368*9f69f52bSEric Biggers	adr_l		RKPTR, .LKtable
369*9f69f52bSEric Biggers	and		sp, addr0, #(~63)
370*9f69f52bSEric Biggers
371*9f69f52bSEric Biggers	/* Preload first block. */
372*9f69f52bSEric Biggers	LOAD_W_VEC_1(1, 0)
373*9f69f52bSEric Biggers	LOAD_W_VEC_1(2, 0)
374*9f69f52bSEric Biggers	LOAD_W_VEC_1(3, 0)
375*9f69f52bSEric Biggers	LOAD_W_VEC_1(4, 0)
376*9f69f52bSEric Biggers	LOAD_W_VEC_1(5, 0)
377*9f69f52bSEric Biggers	LOAD_W_VEC_1(6, 0)
378*9f69f52bSEric Biggers	LOAD_W_VEC_1(7, 0)
379*9f69f52bSEric Biggers	LOAD_W_VEC_1(8, 0)
380*9f69f52bSEric Biggers	LOAD_W_VEC_2(1, 0)
381*9f69f52bSEric Biggers	LOAD_W_VEC_2(2, 0)
382*9f69f52bSEric Biggers	LOAD_W_VEC_2(3, 0)
383*9f69f52bSEric Biggers	LOAD_W_VEC_2(4, 0)
384*9f69f52bSEric Biggers	LOAD_W_VEC_2(5, 0)
385*9f69f52bSEric Biggers	LOAD_W_VEC_2(6, 0)
386*9f69f52bSEric Biggers	LOAD_W_VEC_2(7, 0)
387*9f69f52bSEric Biggers	LOAD_W_VEC_2(8, 0)
388*9f69f52bSEric Biggers	LOAD_W_VEC_3(1, 0)
389*9f69f52bSEric Biggers	LOAD_W_VEC_3(2, 0)
390*9f69f52bSEric Biggers	LOAD_W_VEC_3(3, 0)
391*9f69f52bSEric Biggers	LOAD_W_VEC_3(4, 0)
392*9f69f52bSEric Biggers	LOAD_W_VEC_3(5, 0)
393*9f69f52bSEric Biggers	LOAD_W_VEC_3(6, 0)
394*9f69f52bSEric Biggers	LOAD_W_VEC_3(7, 0)
395*9f69f52bSEric Biggers	LOAD_W_VEC_3(8, 0)
396*9f69f52bSEric Biggers
397*9f69f52bSEric Biggers.balign 16
398*9f69f52bSEric Biggers.Loop:
399*9f69f52bSEric Biggers	/* Transform 0-3 */
400*9f69f52bSEric Biggers	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 0, 0, IW, _, 0)
401*9f69f52bSEric Biggers	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  1, 1, IW, _, 0)
402*9f69f52bSEric Biggers	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 2, 2, IW, _, 0)
403*9f69f52bSEric Biggers	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  3, 3, IW, _, 0)
404*9f69f52bSEric Biggers
405*9f69f52bSEric Biggers	/* Transform 4-7 + Precalc 12-14 */
406*9f69f52bSEric Biggers	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 4, 0, IW, _, 0)
407*9f69f52bSEric Biggers	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  5, 1, IW, _, 0)
408*9f69f52bSEric Biggers	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 6, 2, IW, SCHED_W_W0W1W2W3W4W5_1, 12)
409*9f69f52bSEric Biggers	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  7, 3, IW, SCHED_W_W0W1W2W3W4W5_2, 12)
410*9f69f52bSEric Biggers
411*9f69f52bSEric Biggers	/* Transform 8-11 + Precalc 12-17 */
412*9f69f52bSEric Biggers	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 8, 0, IW, SCHED_W_W0W1W2W3W4W5_3, 12)
413*9f69f52bSEric Biggers	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  9, 1, IW, SCHED_W_W1W2W3W4W5W0_1, 15)
414*9f69f52bSEric Biggers	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 10, 2, IW, SCHED_W_W1W2W3W4W5W0_2, 15)
415*9f69f52bSEric Biggers	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  11, 3, IW, SCHED_W_W1W2W3W4W5W0_3, 15)
416*9f69f52bSEric Biggers
417*9f69f52bSEric Biggers	/* Transform 12-14 + Precalc 18-20 */
418*9f69f52bSEric Biggers	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 12, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 18)
419*9f69f52bSEric Biggers	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  13, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 18)
420*9f69f52bSEric Biggers	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 14, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 18)
421*9f69f52bSEric Biggers
422*9f69f52bSEric Biggers	/* Transform 15-17 + Precalc 21-23 */
423*9f69f52bSEric Biggers	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  15, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 21)
424*9f69f52bSEric Biggers	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 16, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 21)
425*9f69f52bSEric Biggers	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  17, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 21)
426*9f69f52bSEric Biggers
427*9f69f52bSEric Biggers	/* Transform 18-20 + Precalc 24-26 */
428*9f69f52bSEric Biggers	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 18, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 24)
429*9f69f52bSEric Biggers	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  19, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 24)
430*9f69f52bSEric Biggers	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 20, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 24)
431*9f69f52bSEric Biggers
432*9f69f52bSEric Biggers	/* Transform 21-23 + Precalc 27-29 */
433*9f69f52bSEric Biggers	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  21, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 27)
434*9f69f52bSEric Biggers	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 22, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 27)
435*9f69f52bSEric Biggers	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  23, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 27)
436*9f69f52bSEric Biggers
437*9f69f52bSEric Biggers	/* Transform 24-26 + Precalc 30-32 */
438*9f69f52bSEric Biggers	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 24, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 30)
439*9f69f52bSEric Biggers	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  25, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 30)
440*9f69f52bSEric Biggers	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 26, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 30)
441*9f69f52bSEric Biggers
442*9f69f52bSEric Biggers	/* Transform 27-29 + Precalc 33-35 */
443*9f69f52bSEric Biggers	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  27, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 33)
444*9f69f52bSEric Biggers	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 28, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 33)
445*9f69f52bSEric Biggers	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  29, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 33)
446*9f69f52bSEric Biggers
447*9f69f52bSEric Biggers	/* Transform 30-32 + Precalc 36-38 */
448*9f69f52bSEric Biggers	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 30, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 36)
449*9f69f52bSEric Biggers	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  31, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 36)
450*9f69f52bSEric Biggers	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 32, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 36)
451*9f69f52bSEric Biggers
452*9f69f52bSEric Biggers	/* Transform 33-35 + Precalc 39-41 */
453*9f69f52bSEric Biggers	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  33, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 39)
454*9f69f52bSEric Biggers	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 34, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 39)
455*9f69f52bSEric Biggers	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  35, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 39)
456*9f69f52bSEric Biggers
457*9f69f52bSEric Biggers	/* Transform 36-38 + Precalc 42-44 */
458*9f69f52bSEric Biggers	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 36, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 42)
459*9f69f52bSEric Biggers	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  37, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 42)
460*9f69f52bSEric Biggers	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 38, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 42)
461*9f69f52bSEric Biggers
462*9f69f52bSEric Biggers	/* Transform 39-41 + Precalc 45-47 */
463*9f69f52bSEric Biggers	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  39, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 45)
464*9f69f52bSEric Biggers	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 40, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 45)
465*9f69f52bSEric Biggers	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  41, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 45)
466*9f69f52bSEric Biggers
467*9f69f52bSEric Biggers	/* Transform 42-44 + Precalc 48-50 */
468*9f69f52bSEric Biggers	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 42, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 48)
469*9f69f52bSEric Biggers	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  43, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 48)
470*9f69f52bSEric Biggers	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 44, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 48)
471*9f69f52bSEric Biggers
472*9f69f52bSEric Biggers	/* Transform 45-47 + Precalc 51-53 */
473*9f69f52bSEric Biggers	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  45, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 51)
474*9f69f52bSEric Biggers	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 46, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 51)
475*9f69f52bSEric Biggers	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  47, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 51)
476*9f69f52bSEric Biggers
477*9f69f52bSEric Biggers	/* Transform 48-50 + Precalc 54-56 */
478*9f69f52bSEric Biggers	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 48, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 54)
479*9f69f52bSEric Biggers	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  49, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 54)
480*9f69f52bSEric Biggers	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 50, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 54)
481*9f69f52bSEric Biggers
482*9f69f52bSEric Biggers	/* Transform 51-53 + Precalc 57-59 */
483*9f69f52bSEric Biggers	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  51, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 57)
484*9f69f52bSEric Biggers	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 52, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 57)
485*9f69f52bSEric Biggers	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  53, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 57)
486*9f69f52bSEric Biggers
487*9f69f52bSEric Biggers	/* Transform 54-56 + Precalc 60-62 */
488*9f69f52bSEric Biggers	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 54, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 60)
489*9f69f52bSEric Biggers	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  55, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 60)
490*9f69f52bSEric Biggers	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 56, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 60)
491*9f69f52bSEric Biggers
492*9f69f52bSEric Biggers	/* Transform 57-59 + Precalc 63 */
493*9f69f52bSEric Biggers	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  57, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 63)
494*9f69f52bSEric Biggers	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 58, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 63)
495*9f69f52bSEric Biggers	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  59, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 63)
496*9f69f52bSEric Biggers
497*9f69f52bSEric Biggers	/* Transform 60 */
498*9f69f52bSEric Biggers	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 60, 0, XW, _, _)
499*9f69f52bSEric Biggers	subs		RNBLKS, RNBLKS, #1
500*9f69f52bSEric Biggers	b.eq		.Lend
501*9f69f52bSEric Biggers
502*9f69f52bSEric Biggers	/* Transform 61-63 + Preload next block */
503*9f69f52bSEric Biggers	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  61, 1, XW, LOAD_W_VEC_1, _)
504*9f69f52bSEric Biggers	ldp		s0, s1, [RSTATE, #0]
505*9f69f52bSEric Biggers	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, LOAD_W_VEC_2, _)
506*9f69f52bSEric Biggers	ldp		s2, s3, [RSTATE, #8]
507*9f69f52bSEric Biggers	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  63, 0, XW, LOAD_W_VEC_3, _)
508*9f69f52bSEric Biggers
509*9f69f52bSEric Biggers	/* Update the chaining variables. */
510*9f69f52bSEric Biggers	eor		ra, ra, s0
511*9f69f52bSEric Biggers	eor		rb, rb, s1
512*9f69f52bSEric Biggers	ldp		s0, s1, [RSTATE, #16]
513*9f69f52bSEric Biggers	eor		rc, rc, s2
514*9f69f52bSEric Biggers	ldp		k_even, k_odd, [RSTATE, #24]
515*9f69f52bSEric Biggers	eor		rd, rd, s3
516*9f69f52bSEric Biggers	eor		re, re, s0
517*9f69f52bSEric Biggers	stp		ra, rb, [RSTATE, #0]
518*9f69f52bSEric Biggers	eor		rf, rf, s1
519*9f69f52bSEric Biggers	stp		rc, rd, [RSTATE, #8]
520*9f69f52bSEric Biggers	eor		rg, rg, k_even
521*9f69f52bSEric Biggers	stp		re, rf, [RSTATE, #16]
522*9f69f52bSEric Biggers	eor		rh, rh, k_odd
523*9f69f52bSEric Biggers	stp		rg, rh, [RSTATE, #24]
524*9f69f52bSEric Biggers	b		.Loop
525*9f69f52bSEric Biggers
526*9f69f52bSEric Biggers.Lend:
527*9f69f52bSEric Biggers	/* Transform 61-63 */
528*9f69f52bSEric Biggers	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  61, 1, XW, _, _)
529*9f69f52bSEric Biggers	ldp		s0, s1, [RSTATE, #0]
530*9f69f52bSEric Biggers	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, _, _)
531*9f69f52bSEric Biggers	ldp		s2, s3, [RSTATE, #8]
532*9f69f52bSEric Biggers	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  63, 0, XW, _, _)
533*9f69f52bSEric Biggers
534*9f69f52bSEric Biggers	/* Update the chaining variables. */
535*9f69f52bSEric Biggers	eor		ra, ra, s0
536*9f69f52bSEric Biggers	clear_vec(W0)
537*9f69f52bSEric Biggers	eor		rb, rb, s1
538*9f69f52bSEric Biggers	clear_vec(W1)
539*9f69f52bSEric Biggers	ldp		s0, s1, [RSTATE, #16]
540*9f69f52bSEric Biggers	clear_vec(W2)
541*9f69f52bSEric Biggers	eor		rc, rc, s2
542*9f69f52bSEric Biggers	clear_vec(W3)
543*9f69f52bSEric Biggers	ldp		k_even, k_odd, [RSTATE, #24]
544*9f69f52bSEric Biggers	clear_vec(W4)
545*9f69f52bSEric Biggers	eor		rd, rd, s3
546*9f69f52bSEric Biggers	clear_vec(W5)
547*9f69f52bSEric Biggers	eor		re, re, s0
548*9f69f52bSEric Biggers	clear_vec(XTMP0)
549*9f69f52bSEric Biggers	stp		ra, rb, [RSTATE, #0]
550*9f69f52bSEric Biggers	clear_vec(XTMP1)
551*9f69f52bSEric Biggers	eor		rf, rf, s1
552*9f69f52bSEric Biggers	clear_vec(XTMP2)
553*9f69f52bSEric Biggers	stp		rc, rd, [RSTATE, #8]
554*9f69f52bSEric Biggers	clear_vec(XTMP3)
555*9f69f52bSEric Biggers	eor		rg, rg, k_even
556*9f69f52bSEric Biggers	clear_vec(XTMP4)
557*9f69f52bSEric Biggers	stp		re, rf, [RSTATE, #16]
558*9f69f52bSEric Biggers	clear_vec(XTMP5)
559*9f69f52bSEric Biggers	eor		rh, rh, k_odd
560*9f69f52bSEric Biggers	clear_vec(XTMP6)
561*9f69f52bSEric Biggers	stp		rg, rh, [RSTATE, #24]
562*9f69f52bSEric Biggers
563*9f69f52bSEric Biggers	/* Clear message expansion area */
564*9f69f52bSEric Biggers	add		addr0, sp, #STACK_W
565*9f69f52bSEric Biggers	st1		{W0.16b-W3.16b}, [addr0], #64
566*9f69f52bSEric Biggers	st1		{W0.16b-W3.16b}, [addr0], #64
567*9f69f52bSEric Biggers	st1		{W0.16b-W3.16b}, [addr0]
568*9f69f52bSEric Biggers
569*9f69f52bSEric Biggers	mov		sp, RFRAME
570*9f69f52bSEric Biggers
571*9f69f52bSEric Biggers	ldp		x25, x26, [sp], #16
572*9f69f52bSEric Biggers	ldp		x23, x24, [sp], #16
573*9f69f52bSEric Biggers	ldp		x21, x22, [sp], #16
574*9f69f52bSEric Biggers	ldp		x19, x20, [sp], #16
575*9f69f52bSEric Biggers	ldp		x28, x29, [sp], #16
576*9f69f52bSEric Biggers
577*9f69f52bSEric Biggers	ret
578*9f69f52bSEric BiggersSYM_FUNC_END(sm3_neon_transform)
579*9f69f52bSEric Biggers
580*9f69f52bSEric Biggers
/*
 * SM3 round constants K[0..63], stored as 64 consecutive 32-bit words.
 *
 *   K[j] = rol32(T_j, j % 32)
 *   T_j  = 0x79cc4519 for  0 <= j <= 15
 *   T_j  = 0x7a879d8a for 16 <= j <= 63
 *
 * The rotate count wraps at 32, so K[48..63] repeat K[16..31].
 * The table is read-only and is therefore placed in .rodata.
 */
	.section	".rodata", "a"

	.align 4		/* arm64 gas: power-of-two form, 2^4 = 16 bytes */
.LKtable:
	.long 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb	/* K[0..3]   */
	.long 0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc	/* K[4..7]   */
	.long 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce	/* K[8..11]  */
	.long 0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6	/* K[12..15] */
	.long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c	/* K[16..19] */
	.long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce	/* K[20..23] */
	.long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec	/* K[24..27] */
	.long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5	/* K[28..31] */
	.long 0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53	/* K[32..35] */
	.long 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d	/* K[36..39] */
	.long 0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4	/* K[40..43] */
	.long 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43	/* K[44..47] */
	.long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c	/* K[48..51] */
	.long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce	/* K[52..55] */
	.long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec	/* K[56..59] */
	.long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5	/* K[60..63] */
601