/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 NEON
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Register macros */

#define RTMP0	v8
#define RTMP1	v9
#define RTMP2	v10
#define RTMP3	v11

#define RTMP4	v12
#define RTMP5	v13
#define RTMP6	v14
#define RTMP7	v15

#define RX0	v12
#define RX1	v13
#define RKEY	v14
#define RIV	v15
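/*
 * Note: RTMP4-RTMP7 share v12-v15 with RX0/RX1/RKEY/RIV.  Code that still
 * needs RIV must therefore avoid the RTMP4-RTMP7 helpers (see the "Avoid
 * overwriting the RIV register" comment in sm4_neon_cbc_dec).
 */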

/* Helper macros. */

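/*
 * SM4_PREPARE loads the 256-byte SM4 S-box (crypto_sm4_sbox) into v16-v31,
 * from which the round macros do the byte substitution with tbl/tbx lookups.
 */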
#define SM4_PREPARE()                                           \
	adr_l		x5, crypto_sm4_sbox;                    \
	ld1		{v16.16b-v19.16b}, [x5], #64;           \
	ld1		{v20.16b-v23.16b}, [x5], #64;           \
	ld1		{v24.16b-v27.16b}, [x5], #64;           \
	ld1		{v28.16b-v31.16b}, [x5];

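/*
 * Transpose a 4x4 matrix of 32-bit words held in s0-s3 (lane j of si ends
 * up holding what lane i of sj held before).
 */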
#define transpose_4x4(s0, s1, s2, s3)                           \
	zip1		RTMP0.4s, s0.4s, s1.4s;                 \
	zip1		RTMP1.4s, s2.4s, s3.4s;                 \
	zip2		RTMP2.4s, s0.4s, s1.4s;                 \
	zip2		RTMP3.4s, s2.4s, s3.4s;                 \
	zip1		s0.2d, RTMP0.2d, RTMP1.2d;              \
	zip2		s1.2d, RTMP0.2d, RTMP1.2d;              \
	zip1		s2.2d, RTMP2.2d, RTMP3.2d;              \
	zip2		s3.2d, RTMP2.2d, RTMP3.2d;

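/*
 * Same as transpose_4x4, but transposes two independent 4x4 word matrices
 * (s0-s3 and s4-s7) in one go.
 */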
#define transpose_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7)        \
	zip1		RTMP0.4s, s0.4s, s1.4s;                 \
	zip1		RTMP1.4s, s2.4s, s3.4s;                 \
	zip2		RTMP2.4s, s0.4s, s1.4s;                 \
	zip2		RTMP3.4s, s2.4s, s3.4s;                 \
	zip1		RTMP4.4s, s4.4s, s5.4s;                 \
	zip1		RTMP5.4s, s6.4s, s7.4s;                 \
	zip2		RTMP6.4s, s4.4s, s5.4s;                 \
	zip2		RTMP7.4s, s6.4s, s7.4s;                 \
	zip1		s0.2d, RTMP0.2d, RTMP1.2d;              \
	zip2		s1.2d, RTMP0.2d, RTMP1.2d;              \
	zip1		s2.2d, RTMP2.2d, RTMP3.2d;              \
	zip2		s3.2d, RTMP2.2d, RTMP3.2d;              \
	zip1		s4.2d, RTMP4.2d, RTMP5.2d;              \
	zip2		s5.2d, RTMP4.2d, RTMP5.2d;              \
	zip1		s6.2d, RTMP6.2d, RTMP7.2d;              \
	zip2		s7.2d, RTMP6.2d, RTMP7.2d;

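/*
 * Rotate the 4x4 word matrix in s0-s3 by 90 degrees clockwise, so that
 * afterwards s0 = { s3[0], s2[0], s1[0], s0[0] } and so on.  Used after the
 * 32 rounds to turn the word-sliced state back into one block per register
 * while also applying SM4's final reverse word order (X35, X34, X33, X32).
 */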
#define rotate_clockwise_4x4(s0, s1, s2, s3)                    \
	zip1		RTMP0.4s, s1.4s, s0.4s;                 \
	zip2		RTMP1.4s, s1.4s, s0.4s;                 \
	zip1		RTMP2.4s, s3.4s, s2.4s;                 \
	zip2		RTMP3.4s, s3.4s, s2.4s;                 \
	zip1		s0.2d, RTMP2.2d, RTMP0.2d;              \
	zip2		s1.2d, RTMP2.2d, RTMP0.2d;              \
	zip1		s2.2d, RTMP3.2d, RTMP1.2d;              \
	zip2		s3.2d, RTMP3.2d, RTMP1.2d;

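/* Two-matrix variant of rotate_clockwise_4x4, covering s0-s3 and s4-s7. */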
#define rotate_clockwise_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7) \
	zip1		RTMP0.4s, s1.4s, s0.4s;                 \
	zip1		RTMP2.4s, s3.4s, s2.4s;                 \
	zip2		RTMP1.4s, s1.4s, s0.4s;                 \
	zip2		RTMP3.4s, s3.4s, s2.4s;                 \
	zip1		RTMP4.4s, s5.4s, s4.4s;                 \
	zip1		RTMP6.4s, s7.4s, s6.4s;                 \
	zip2		RTMP5.4s, s5.4s, s4.4s;                 \
	zip2		RTMP7.4s, s7.4s, s6.4s;                 \
	zip1		s0.2d, RTMP2.2d, RTMP0.2d;              \
	zip2		s1.2d, RTMP2.2d, RTMP0.2d;              \
	zip1		s2.2d, RTMP3.2d, RTMP1.2d;              \
	zip2		s3.2d, RTMP3.2d, RTMP1.2d;              \
	zip1		s4.2d, RTMP6.2d, RTMP4.2d;              \
	zip2		s5.2d, RTMP6.2d, RTMP4.2d;              \
	zip1		s6.2d, RTMP7.2d, RTMP5.2d;              \
	zip2		s7.2d, RTMP7.2d, RTMP5.2d;

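/*
 * One SM4 round on four word-sliced blocks:
 *   s0 ^= T(s1 ^ s2 ^ s3 ^ rk),  rk = RKEY.s[round]
 * The S-box lookup walks the 256-byte table in v16-v31 with one tbl and
 * three tbx steps, subtracting 64 from the index before each step.  The
 * linear transform is computed as
 *   L(x) = x ^ rol32(x, 24) ^ rol32(x ^ rol32(x, 8) ^ rol32(x, 16), 2)
 * which equals x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24).
 */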
#define ROUND4(round, s0, s1, s2, s3)                           \
	dup		RX0.4s, RKEY.s[round];                  \
	/* rk ^ s1 ^ s2 ^ s3 */                                 \
	eor		RTMP1.16b, s2.16b, s3.16b;              \
	eor		RX0.16b, RX0.16b, s1.16b;               \
	eor		RX0.16b, RX0.16b, RTMP1.16b;            \
                                                                \
	/* sbox, non-linear part */                             \
	movi		RTMP3.16b, #64;  /* sizeof(sbox) / 4 */ \
	tbl		RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;  \
                                                                \
	/* linear part */                                       \
	shl		RTMP1.4s, RTMP0.4s, #8;                 \
	shl		RTMP2.4s, RTMP0.4s, #16;                \
	shl		RTMP3.4s, RTMP0.4s, #24;                \
	sri		RTMP1.4s, RTMP0.4s, #(32-8);            \
	sri		RTMP2.4s, RTMP0.4s, #(32-16);           \
	sri		RTMP3.4s, RTMP0.4s, #(32-24);           \
	/* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */            \
	eor		RTMP1.16b, RTMP1.16b, RTMP0.16b;        \
	eor		RTMP1.16b, RTMP1.16b, RTMP2.16b;        \
	/* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */        \
	eor		RTMP3.16b, RTMP3.16b, RTMP0.16b;        \
	shl		RTMP2.4s, RTMP1.4s, #2;                 \
	sri		RTMP2.4s, RTMP1.4s, #(32-2);            \
	eor		RTMP3.16b, RTMP3.16b, RTMP2.16b;        \
	/* s0 ^= RTMP3 */                                       \
	eor		s0.16b, s0.16b, RTMP3.16b;

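/*
 * Run the 32 SM4 rounds (8 loop iterations of 4 rounds) on four word-sliced
 * blocks whose words have already been rev32'd to native order, then swap
 * the bytes back, rotate the state so that each register holds one output
 * block, and rewind x0 to the start of the 128-byte round key array.
 */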
#define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3)                       \
	mov		x6, #8;                                 \
4:                                                              \
	ld1		{RKEY.4s}, [x0], #16;                   \
	subs		x6, x6, #1;                             \
                                                                \
	ROUND4(0, b0, b1, b2, b3);                              \
	ROUND4(1, b1, b2, b3, b0);                              \
	ROUND4(2, b2, b3, b0, b1);                              \
	ROUND4(3, b3, b0, b1, b2);                              \
                                                                \
	bne		4b;                                     \
                                                                \
	rev32		b0.16b, b0.16b;                         \
	rev32		b1.16b, b1.16b;                         \
	rev32		b2.16b, b2.16b;                         \
	rev32		b3.16b, b3.16b;                         \
                                                                \
	rotate_clockwise_4x4(b0, b1, b2, b3);                   \
                                                                \
	/* rewind x0 back to the round key array */             \
	sub		x0, x0, #128;

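/* As SM4_CRYPT_BLK4_BE, but rev32 the freshly loaded input words first. */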
#define SM4_CRYPT_BLK4(b0, b1, b2, b3)                          \
	rev32		b0.16b, b0.16b;                         \
	rev32		b1.16b, b1.16b;                         \
	rev32		b2.16b, b2.16b;                         \
	rev32		b3.16b, b3.16b;                         \
	SM4_CRYPT_BLK4_BE(b0, b1, b2, b3);

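/*
 * Same round computation as ROUND4, but two copies run in parallel on eight
 * word-sliced blocks (s0-s3 and t0-t3) to keep more independent tbl/tbx and
 * eor operations in flight.
 */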
#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3)           \
	/* rk ^ s1 ^ s2 ^ s3 */                                 \
	dup		RX0.4s, RKEY.s[round];                  \
	eor		RTMP0.16b, s2.16b, s3.16b;              \
	mov		RX1.16b, RX0.16b;                       \
	eor		RTMP1.16b, t2.16b, t3.16b;              \
	eor		RX0.16b, RX0.16b, s1.16b;               \
	eor		RX1.16b, RX1.16b, t1.16b;               \
	eor		RX0.16b, RX0.16b, RTMP0.16b;            \
	eor		RX1.16b, RX1.16b, RTMP1.16b;            \
                                                                \
	/* sbox, non-linear part */                             \
	movi		RTMP3.16b, #64;  /* sizeof(sbox) / 4 */ \
	tbl		RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;  \
	tbl		RTMP1.16b, {v16.16b-v19.16b}, RX1.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	sub		RX1.16b, RX1.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;  \
	tbx		RTMP1.16b, {v20.16b-v23.16b}, RX1.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	sub		RX1.16b, RX1.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;  \
	tbx		RTMP1.16b, {v24.16b-v27.16b}, RX1.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	sub		RX1.16b, RX1.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;  \
	tbx		RTMP1.16b, {v28.16b-v31.16b}, RX1.16b;  \
                                                                \
	/* linear part */                                       \
	shl		RX0.4s, RTMP0.4s, #8;                   \
	shl		RX1.4s, RTMP1.4s, #8;                   \
	shl		RTMP2.4s, RTMP0.4s, #16;                \
	shl		RTMP3.4s, RTMP1.4s, #16;                \
	sri		RX0.4s, RTMP0.4s, #(32 - 8);            \
	sri		RX1.4s, RTMP1.4s, #(32 - 8);            \
	sri		RTMP2.4s, RTMP0.4s, #(32 - 16);         \
	sri		RTMP3.4s, RTMP1.4s, #(32 - 16);         \
	/* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */               \
	eor		RX0.16b, RX0.16b, RTMP0.16b;            \
	eor		RX1.16b, RX1.16b, RTMP1.16b;            \
	eor		RX0.16b, RX0.16b, RTMP2.16b;            \
	eor		RX1.16b, RX1.16b, RTMP3.16b;            \
	/* RTMP0/1 ^= x ^ rol32(x, 24) ^ rol32(RX, 2) */        \
	shl		RTMP2.4s, RTMP0.4s, #24;                \
	shl		RTMP3.4s, RTMP1.4s, #24;                \
	sri		RTMP2.4s, RTMP0.4s, #(32 - 24);         \
	sri		RTMP3.4s, RTMP1.4s, #(32 - 24);         \
	eor		RTMP0.16b, RTMP0.16b, RTMP2.16b;        \
	eor		RTMP1.16b, RTMP1.16b, RTMP3.16b;        \
	shl		RTMP2.4s, RX0.4s, #2;                   \
	shl		RTMP3.4s, RX1.4s, #2;                   \
	sri		RTMP2.4s, RX0.4s, #(32 - 2);            \
	sri		RTMP3.4s, RX1.4s, #(32 - 2);            \
	eor		RTMP0.16b, RTMP0.16b, RTMP2.16b;        \
	eor		RTMP1.16b, RTMP1.16b, RTMP3.16b;        \
	/* s0/t0 ^= RTMP0/1 */                                  \
	eor		s0.16b, s0.16b, RTMP0.16b;              \
	eor		t0.16b, t0.16b, RTMP1.16b;

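/*
 * 32 SM4 rounds on eight word-sliced blocks.  "_norotate" means the state
 * is left word-sliced (only byte-swapped back); the caller rearranges it,
 * e.g. sm4_neon_cbc_dec rotates two 4x4 matrices separately so that RIV
 * (which aliases RTMP7) is not clobbered.
 */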
#define SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7) \
	rev32		b0.16b, b0.16b;                         \
	rev32		b1.16b, b1.16b;                         \
	rev32		b2.16b, b2.16b;                         \
	rev32		b3.16b, b3.16b;                         \
	rev32		b4.16b, b4.16b;                         \
	rev32		b5.16b, b5.16b;                         \
	rev32		b6.16b, b6.16b;                         \
	rev32		b7.16b, b7.16b;                         \
                                                                \
	mov		x6, #8;                                 \
8:                                                              \
	ld1		{RKEY.4s}, [x0], #16;                   \
	subs		x6, x6, #1;                             \
                                                                \
	ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7);              \
	ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4);              \
	ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5);              \
	ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6);              \
                                                                \
	bne		8b;                                     \
                                                                \
	rev32		b0.16b, b0.16b;                         \
	rev32		b1.16b, b1.16b;                         \
	rev32		b2.16b, b2.16b;                         \
	rev32		b3.16b, b3.16b;                         \
	rev32		b4.16b, b4.16b;                         \
	rev32		b5.16b, b5.16b;                         \
	rev32		b6.16b, b6.16b;                         \
	rev32		b7.16b, b7.16b;                         \
                                                                \
	/* rewind x0 back to the round key array */             \
	sub		x0, x0, #128;

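/* SM4_CRYPT_BLK8_norotate plus the rotate: each register ends up holding
 * one complete output block.
 */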
#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7)			\
	SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7);	\
	rotate_clockwise_4x4_2x(b0, b1, b2, b3, b4, b5, b6, b7);


.align 3
SYM_FUNC_START(sm4_neon_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 */
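	/*
	 * Process eight blocks per iteration while possible, then one group
	 * of four, and finally the remaining 1-3 blocks through the 4-block
	 * path with the unused registers treated as don't-care.
	 */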
	SM4_PREPARE()

.Lcrypt_loop_8x:
	sub		w3, w3, #8
	tbnz		w3, #31, .Lcrypt_4x

	ld4		{v0.4s-v3.4s}, [x2], #64
	ld4		{v4.4s-v7.4s}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w3, .Lcrypt_end
	b		.Lcrypt_loop_8x

.Lcrypt_4x:
	add		w3, w3, #8
	cmp		w3, #4
	blt		.Lcrypt_tail

	sub		w3, w3, #4

	ld4		{v0.4s-v3.4s}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w3, .Lcrypt_end

.Lcrypt_tail:
	cmp		w3, #2
	ld1		{v0.16b}, [x2], #16
	blt		.Lcrypt_tail_load_done
	ld1		{v1.16b}, [x2], #16
	beq		.Lcrypt_tail_load_done
	ld1		{v2.16b}, [x2], #16

.Lcrypt_tail_load_done:
	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	cmp		w3, #2
	st1		{v0.16b}, [x1], #16
	blt		.Lcrypt_end
	st1		{v1.16b}, [x1], #16
	beq		.Lcrypt_end
	st1		{v2.16b}, [x1], #16

.Lcrypt_end:
	ret
SYM_FUNC_END(sm4_neon_crypt)

.align 3
SYM_FUNC_START(sm4_neon_cbc_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
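	/*
	 * CBC decryption: decrypt a group of ciphertext blocks, XOR the
	 * first result with the incoming IV and the rest with the preceding
	 * ciphertext blocks (reloaded from src), then keep the last
	 * ciphertext block as the next IV.
	 */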
	SM4_PREPARE()

	ld1		{RIV.16b}, [x3]

.Lcbc_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcbc_dec_4x

	ld4		{v0.4s-v3.4s}, [x2], #64
	ld4		{v4.4s-v7.4s}, [x2]

	SM4_CRYPT_BLK8_norotate(v0, v1, v2, v3, v4, v5, v6, v7)

	/* Avoid overwriting the RIV register */
	rotate_clockwise_4x4(v0, v1, v2, v3)
	rotate_clockwise_4x4(v4, v5, v6, v7)

	sub		x2, x2, #64

	eor		v0.16b, v0.16b, RIV.16b

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1		{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor		v1.16b, v1.16b, RTMP0.16b
	eor		v2.16b, v2.16b, RTMP1.16b
	eor		v3.16b, v3.16b, RTMP2.16b
	eor		v4.16b, v4.16b, RTMP3.16b
	eor		v5.16b, v5.16b, RTMP4.16b
	eor		v6.16b, v6.16b, RTMP5.16b
	eor		v7.16b, v7.16b, RTMP6.16b

	mov		RIV.16b, RTMP7.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lcbc_dec_end
	b		.Lcbc_dec_loop_8x

.Lcbc_dec_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcbc_dec_tail

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rev32		v4.16b, v0.16b
	rev32		v5.16b, v1.16b
	rev32		v6.16b, v2.16b
	rev32		v7.16b, v3.16b

	transpose_4x4(v4, v5, v6, v7)

	SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

	eor		v4.16b, v4.16b, RIV.16b
	eor		v5.16b, v5.16b, v0.16b
	eor		v6.16b, v6.16b, v1.16b
	eor		v7.16b, v7.16b, v2.16b

	mov		RIV.16b, v3.16b

	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lcbc_dec_end

.Lcbc_dec_tail:
	cmp		w4, #2
	ld1		{v0.16b}, [x2], #16
	blt		.Lcbc_dec_tail_load_done
	ld1		{v1.16b}, [x2], #16
	beq		.Lcbc_dec_tail_load_done
	ld1		{v2.16b}, [x2], #16

.Lcbc_dec_tail_load_done:
	rev32		v4.16b, v0.16b
	rev32		v5.16b, v1.16b
	rev32		v6.16b, v2.16b

	transpose_4x4(v4, v5, v6, v7)

	SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

	cmp		w4, #2
	eor		v4.16b, v4.16b, RIV.16b
	mov		RIV.16b, v0.16b
	st1		{v4.16b}, [x1], #16
	blt		.Lcbc_dec_end

	eor		v5.16b, v5.16b, v0.16b
	mov		RIV.16b, v1.16b
	st1		{v5.16b}, [x1], #16
	beq		.Lcbc_dec_end

	eor		v6.16b, v6.16b, v1.16b
	mov		RIV.16b, v2.16b
	st1		{v6.16b}, [x1], #16

.Lcbc_dec_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_neon_cbc_dec)

.align 3
SYM_FUNC_START(sm4_neon_ctr_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 */
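	/*
	 * The 128-bit big-endian counter at [x3] is kept in x7 (high 64
	 * bits) and x8 (low 64 bits) as native integers while blocks are
	 * processed, and is written back in big-endian form at the end.
	 */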
	SM4_PREPARE()

	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

.Lctr_crypt_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lctr_crypt_4x

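/*
 * Emit the current counter value into vctr as a 128-bit big-endian block
 * (rev64 puts each 64-bit half into big-endian byte order) and advance the
 * counter in x8:x7 by one.  Note: adds/adc clobber the condition flags.
 */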
#define inc_le128(vctr)                             \
		mov		vctr.d[1], x8;      \
		mov		vctr.d[0], x7;      \
		adds		x8, x8, #1;         \
		rev64		vctr.16b, vctr.16b; \
		adc		x7, x7, xzr;

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	transpose_4x4_2x(v0, v1, v2, v3, v4, v5, v6, v7)

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1		{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	eor		v4.16b, v4.16b, RTMP4.16b
	eor		v5.16b, v5.16b, RTMP5.16b
	eor		v6.16b, v6.16b, RTMP6.16b
	eor		v7.16b, v7.16b, RTMP7.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lctr_crypt_end
	b		.Lctr_crypt_loop_8x

.Lctr_crypt_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lctr_crypt_tail

	sub		w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1		{v4.16b-v7.16b}, [x2], #64

	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lctr_crypt_end

.Lctr_crypt_tail:
	/* inc_le128 clobbers the condition flags, hence the cmp after each one */
	ld1		{v4.16b}, [x2], #16
	inc_le128(v0)
	cmp		w4, #2
	blt		.Lctr_crypt_tail_load_done

	ld1		{v5.16b}, [x2], #16
	inc_le128(v1)
	cmp		w4, #2
	beq		.Lctr_crypt_tail_load_done

	ld1		{v6.16b}, [x2], #16
	inc_le128(v2)

.Lctr_crypt_tail_load_done:
	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	cmp		w4, #2

	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x1], #16
	blt		.Lctr_crypt_end

	eor		v1.16b, v1.16b, v5.16b
	st1		{v1.16b}, [x1], #16
	beq		.Lctr_crypt_end

	eor		v2.16b, v2.16b, v6.16b
	st1		{v2.16b}, [x1], #16

.Lctr_crypt_end:
	/* store new CTR */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

	ret
SYM_FUNC_END(sm4_neon_ctr_crypt)