/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch	armv8-a+crypto

.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
		20, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

.macro sm4ekey, vd, vn, vm
	.inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
.endm
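
/*
 * SM4E and SM4EKEY are emitted as raw .inst words so that this file still
 * assembles with toolchains whose assembler does not support the SM4
 * Crypto Extension.  The .Lv<n>.4s symbols defined above supply the
 * register numbers for the Vd/Vn/Vm fields of those encodings.
 */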

/* Register macros */

#define RTMP0	v16
#define RTMP1	v17
#define RTMP2	v18
#define RTMP3	v19

#define RIV	v20
#define RMAC	v20
#define RMASK	v21
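
/*
 * Throughout this file v0-v15 carry data blocks, while v24-v31 hold the 32
 * round keys (four per vector) loaded by SM4_PREPARE() from sm4-ce-asm.h.
 * RIV and RMAC alias v20: a routine uses it either as the CBC/CTS chaining
 * value or as the running MAC digest, never both.  RMASK is only used by
 * the XTS code, where it holds the GF(2^128) reduction constants.
 */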


.align 3
SYM_FUNC_START(sm4_ce_expand_key)
	/* input:
	 *   x0: 128-bit key
	 *   x1: rkey_enc
	 *   x2: rkey_dec
	 *   x3: fk array
	 *   x4: ck array
	 */
	ld1		{v0.16b}, [x0];
	rev32		v0.16b, v0.16b;
	ld1		{v1.16b}, [x3];
	/* load ck */
	ld1		{v24.16b-v27.16b}, [x4], #64;
	ld1		{v28.16b-v31.16b}, [x4];

	/* input ^ fk */
	eor		v0.16b, v0.16b, v1.16b;

	sm4ekey		v0.4s, v0.4s, v24.4s;
	sm4ekey		v1.4s, v0.4s, v25.4s;
	sm4ekey		v2.4s, v1.4s, v26.4s;
	sm4ekey		v3.4s, v2.4s, v27.4s;
	sm4ekey		v4.4s, v3.4s, v28.4s;
	sm4ekey		v5.4s, v4.4s, v29.4s;
	sm4ekey		v6.4s, v5.4s, v30.4s;
	sm4ekey		v7.4s, v6.4s, v31.4s;

	adr_l		x5, .Lbswap128_mask
	ld1		{v24.16b}, [x5]

	st1		{v0.16b-v3.16b}, [x1], #64;
	st1		{v4.16b-v7.16b}, [x1];

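	/*
	 * Decryption uses the encryption round keys in reverse order.
	 * .Lbswap128_mask reverses the four 32-bit words within each vector,
	 * and storing v7 down to v0 reverses the order across vectors, so
	 * rkey_dec[i] = rkey_enc[31 - i].
	 */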
	tbl		v16.16b, {v7.16b}, v24.16b
	tbl		v17.16b, {v6.16b}, v24.16b
	tbl		v18.16b, {v5.16b}, v24.16b
	tbl		v19.16b, {v4.16b}, v24.16b
	tbl		v20.16b, {v3.16b}, v24.16b
	tbl		v21.16b, {v2.16b}, v24.16b
	tbl		v22.16b, {v1.16b}, v24.16b
	tbl		v23.16b, {v0.16b}, v24.16b

	st1		{v16.16b-v19.16b}, [x2], #64
	st1		{v20.16b-v23.16b}, [x2]

	ret;
SYM_FUNC_END(sm4_ce_expand_key)

.align 3
SYM_FUNC_START(sm4_ce_crypt_block)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 */
	SM4_PREPARE(x0)

	ld1		{v0.16b}, [x2];
	SM4_CRYPT_BLK(v0);
	st1		{v0.16b}, [x1];

	ret;
SYM_FUNC_END(sm4_ce_crypt_block)

.align 3
SYM_FUNC_START(sm4_ce_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 */
	SM4_PREPARE(x0)

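	/*
	 * Bulk path: consume eight blocks per iteration.  The sub/tbnz #31
	 * pair branches to the tail handling as soon as the count would go
	 * negative, i.e. when fewer than eight blocks remain; the tail then
	 * falls through to 4-block and single-block paths.  The other bulk
	 * routines below follow the same pattern.
	 */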
.Lcrypt_loop_blk:
	sub		w3, w3, #8;
	tbnz		w3, #31, .Lcrypt_tail8;

	ld1		{v0.16b-v3.16b}, [x2], #64;
	ld1		{v4.16b-v7.16b}, [x2], #64;

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	st1		{v0.16b-v3.16b}, [x1], #64;
	st1		{v4.16b-v7.16b}, [x1], #64;

	cbz		w3, .Lcrypt_end;
	b		.Lcrypt_loop_blk;

.Lcrypt_tail8:
	add		w3, w3, #8;
	cmp		w3, #4;
	blt		.Lcrypt_tail4;

	sub		w3, w3, #4;

	ld1		{v0.16b-v3.16b}, [x2], #64;
	SM4_CRYPT_BLK4(v0, v1, v2, v3);
	st1		{v0.16b-v3.16b}, [x1], #64;

	cbz		w3, .Lcrypt_end;

.Lcrypt_tail4:
	sub		w3, w3, #1;

	ld1		{v0.16b}, [x2], #16;
	SM4_CRYPT_BLK(v0);
	st1		{v0.16b}, [x1], #16;

	cbnz		w3, .Lcrypt_tail4;

.Lcrypt_end:
	ret;
SYM_FUNC_END(sm4_ce_crypt)

.align 3
SYM_FUNC_START(sm4_ce_cbc_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

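	/*
	 * CBC encryption is inherently serial: each ciphertext block is the
	 * IV of the next, so even the 4x path encrypts one block at a time
	 * and only batches the loads and stores.
	 */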
.Lcbc_enc_loop_4x:
	cmp		w4, #4
	blt		.Lcbc_enc_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	eor		v0.16b, v0.16b, RIV.16b
	SM4_CRYPT_BLK(v0)
	eor		v1.16b, v1.16b, v0.16b
	SM4_CRYPT_BLK(v1)
	eor		v2.16b, v2.16b, v1.16b
	SM4_CRYPT_BLK(v2)
	eor		v3.16b, v3.16b, v2.16b
	SM4_CRYPT_BLK(v3)

	st1		{v0.16b-v3.16b}, [x1], #64
	mov		RIV.16b, v3.16b

	cbz		w4, .Lcbc_enc_end
	b		.Lcbc_enc_loop_4x

.Lcbc_enc_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	eor		RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	st1		{RIV.16b}, [x1], #16

	cbnz		w4, .Lcbc_enc_loop_1x

.Lcbc_enc_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

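	/*
	 * CBC decryption parallelises across blocks.  The ciphertexts stay
	 * in v0-v7 for the chaining XOR, while byte-swapped copies in
	 * v8-v15 go through the _BE crypt helpers, which skip the input
	 * rev32 done here.
	 */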
.Lcbc_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcbc_dec_4x

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64

	rev32		v8.16b, v0.16b
	rev32		v9.16b, v1.16b
	rev32		v10.16b, v2.16b
	rev32		v11.16b, v3.16b
	rev32		v12.16b, v4.16b
	rev32		v13.16b, v5.16b
	rev32		v14.16b, v6.16b
	rev32		v15.16b, v7.16b

	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

	eor		v8.16b, v8.16b, RIV.16b
	eor		v9.16b, v9.16b, v0.16b
	eor		v10.16b, v10.16b, v1.16b
	eor		v11.16b, v11.16b, v2.16b
	eor		v12.16b, v12.16b, v3.16b
	eor		v13.16b, v13.16b, v4.16b
	eor		v14.16b, v14.16b, v5.16b
	eor		v15.16b, v15.16b, v6.16b

	st1		{v8.16b-v11.16b}, [x1], #64
	st1		{v12.16b-v15.16b}, [x1], #64

	mov		RIV.16b, v7.16b

	cbz		w4, .Lcbc_dec_end
	b		.Lcbc_dec_loop_8x

.Lcbc_dec_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcbc_dec_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rev32		v8.16b, v0.16b
	rev32		v9.16b, v1.16b
	rev32		v10.16b, v2.16b
	rev32		v11.16b, v3.16b

	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

	eor		v8.16b, v8.16b, RIV.16b
	eor		v9.16b, v9.16b, v0.16b
	eor		v10.16b, v10.16b, v1.16b
	eor		v11.16b, v11.16b, v2.16b

	st1		{v8.16b-v11.16b}, [x1], #64

	mov		RIV.16b, v3.16b

	cbz		w4, .Lcbc_dec_end

.Lcbc_dec_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	rev32		v8.16b, v0.16b

	SM4_CRYPT_BLK_BE(v8)

	eor		v8.16b, v8.16b, RIV.16b
	st1		{v8.16b}, [x1], #16

	mov		RIV.16b, v0.16b

	cbnz		w4, .Lcbc_dec_loop_1x

.Lcbc_dec_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_dec)

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nbytes
	 */
	SM4_PREPARE(x0)

	sub		w5, w4, #16
	uxtw		x5, w5
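
	/*
	 * Ciphertext stealing: this routine is expected to be handed only
	 * the final 16 < nbytes <= 32 bytes, so x5 = nbytes - 16 is the
	 * length of the trailing partial block Pn.
	 */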

	ld1		{RIV.16b}, [x3]

	ld1		{v0.16b}, [x2]
	eor		RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v0.16b, {RIV.16b}, v3.16b
	/* padding Pn with zeros */
	tbl		v1.16b, {v1.16b}, v4.16b

	eor		v1.16b, v1.16b, RIV.16b
	SM4_CRYPT_BLK(v1)

	/* overlapping stores */
	add		x5, x1, x5
	st1		{v0.16b}, [x5]
	st1		{v1.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nbytes
	 */
	SM4_PREPARE(x0)

	sub		w5, w4, #16
	uxtw		x5, w5

	ld1		{RIV.16b}, [x3]

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	ld1		{v0.16b}, [x2], x5
	ld1		{v1.16b}, [x2]

	SM4_CRYPT_BLK(v0)
	/* select the first Ln bytes of Xn to create Pn */
	tbl		v2.16b, {v0.16b}, v3.16b
	eor		v2.16b, v2.16b, v1.16b

	/* overwrite the first Ln bytes with Cn to create En-1 */
	tbx		v0.16b, {v1.16b}, v4.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, RIV.16b

	/* overlapping stores */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_dec)

.align 3
SYM_FUNC_START(sm4_ce_ctr_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

.Lctr_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lctr_4x

#define inc_le128(vctr)					\
		mov		vctr.d[1], x8;		\
		mov		vctr.d[0], x7;		\
		adds		x8, x8, #1;		\
		rev64		vctr.16b, vctr.16b;	\
		adc		x7, x7, xzr;
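	/*
	 * The 128-bit counter is kept as native integers in x7 (high half)
	 * and x8 (low half), hence the rev on entry and exit.  inc_le128()
	 * copies the current value into a vector, bumps x8/x7 with a carry,
	 * and rev64 restores the big-endian byte order within each half.
	 */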

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	ld1		{v8.16b-v11.16b}, [x2], #64
	ld1		{v12.16b-v15.16b}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lctr_end
	b		.Lctr_loop_8x

.Lctr_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lctr_loop_1x

	sub		w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1		{v8.16b-v11.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lctr_end

.Lctr_loop_1x:
	sub		w4, w4, #1

	/* construct CTRs */
	inc_le128(v0)

	ld1		{v8.16b}, [x2], #16

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	cbnz		w4, .Lctr_loop_1x

.Lctr_end:
	/* store new CTR */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

	ret
SYM_FUNC_END(sm4_ce_ctr_enc)


#define tweak_next(vt, vin, RTMP)					\
		sshr		RTMP.2d, vin.2d, #63;			\
		and		RTMP.16b, RTMP.16b, RMASK.16b;		\
		add		vt.2d, vin.2d, vin.2d;			\
		ext		RTMP.16b, RTMP.16b, RTMP.16b, #8;	\
		eor		vt.16b, vt.16b, RTMP.16b;
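
/*
 * tweak_next() multiplies the XTS tweak by x in GF(2^128) with the
 * polynomial x^128 + x^7 + x^2 + x + 1: the 128-bit value is shifted left
 * by one bit, the carry out of the low 64-bit half is propagated into the
 * high half, and 0x87 is folded back into the low byte when the top bit
 * falls off.  RMASK holds the {1, 0x87} constants, built by the movi/uzp1
 * sequence at the start of the XTS routines.
 */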

.align 3
SYM_FUNC_START(sm4_ce_xts_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: tweak (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: round key array for IV
	 */
	ld1		{v8.16b}, [x3]

	cbz		x5, .Lxts_enc_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_enc_nofirst:
	SM4_PREPARE(x0)

	ands		w5, w4, #15
	lsr		w4, w4, #4
	sub		w6, w4, #1
	csel		w4, w4, w6, eq
	uxtw		x5, w5
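
	/*
	 * Split nbytes into full blocks (w4) and the partial tail length
	 * (w5/x5).  When there is a tail, the last full block is held back
	 * for ciphertext stealing, hence the csel dropping w4 by one.
	 */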

	movi		RMASK.2s, #0x1
	movi		RTMP0.2s, #0x87
	uzp1		RMASK.4s, RMASK.4s, RTMP0.4s

	cbz		w4, .Lxts_enc_cts

.Lxts_enc_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lxts_enc_4x

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64
	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b
	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	tweak_next(v8, v15, RTMP3)

	cbz		w4, .Lxts_enc_cts
	b		.Lxts_enc_loop_8x

.Lxts_enc_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lxts_enc_loop_1x

	sub		w4, w4, #4

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz		w4, .Lxts_enc_cts

.Lxts_enc_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16
	eor		v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz		w4, .Lxts_enc_loop_1x

.Lxts_enc_cts:
	cbz		x5, .Lxts_enc_end

	/* cipher text stealing */
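	/*
	 * The last full plaintext block is encrypted with the current tweak
	 * (v8); the first Ln bytes of that ciphertext become the short final
	 * block Cn, while its remaining bytes pad Pn up to a full block,
	 * which is encrypted with the next tweak (v9) to produce Cn-1.  The
	 * overlapping stores below write Cn-1 followed by Cn.
	 */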

	tweak_next(v9, v8, RTMP0)
	ld1		{v0.16b}, [x2]
	eor		v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v8.16b

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx		v0.16b, {v1.16b}, v4.16b

	eor		v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v9.16b


	/* overlapping stores */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	b		.Lxts_enc_ret

.Lxts_enc_end:
	/* store new tweak */
	st1		{v8.16b}, [x3]

.Lxts_enc_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_enc)

.align 3
SYM_FUNC_START(sm4_ce_xts_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: tweak (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: round key array for IV
	 */
	ld1		{v8.16b}, [x3]

	cbz		x5, .Lxts_dec_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_dec_nofirst:
	SM4_PREPARE(x0)

	ands		w5, w4, #15
	lsr		w4, w4, #4
	sub		w6, w4, #1
	csel		w4, w4, w6, eq
	uxtw		x5, w5

	movi		RMASK.2s, #0x1
	movi		RTMP0.2s, #0x87
	uzp1		RMASK.4s, RMASK.4s, RTMP0.4s

	cbz		w4, .Lxts_dec_cts

.Lxts_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lxts_dec_4x

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64
	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b
	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	tweak_next(v8, v15, RTMP3)

	cbz		w4, .Lxts_dec_cts
	b		.Lxts_dec_loop_8x

.Lxts_dec_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lxts_dec_loop_1x

	sub		w4, w4, #4

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz		w4, .Lxts_dec_cts

.Lxts_dec_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16
	eor		v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz		w4, .Lxts_dec_loop_1x

.Lxts_dec_cts:
	cbz		x5, .Lxts_dec_end

	/* cipher text stealing */
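	/*
	 * Decryption swaps the tweak order: the last full ciphertext block
	 * is decrypted with the next tweak (v9) to recover Pn plus the
	 * stolen bytes, and the block rebuilt from Cn is then decrypted
	 * with the current tweak (v8).
	 */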

	tweak_next(v9, v8, RTMP0)
	ld1		{v0.16b}, [x2]
	eor		v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v9.16b

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx		v0.16b, {v1.16b}, v4.16b

	eor		v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v8.16b


	/* overlapping stores */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	b		.Lxts_dec_ret

.Lxts_dec_end:
	/* store new tweak */
	st1		{v8.16b}, [x3]

.Lxts_dec_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_dec)

.align 3
SYM_FUNC_START(sm4_ce_mac_update)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: digest
	 *   x2: src
	 *   w3: nblocks
	 *   w4: enc_before
	 *   w5: enc_after
	 */
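	/*
	 * CBC-MAC style update: digest = SM4(digest ^ block) for each of
	 * the nblocks input blocks.  A non-zero enc_before encrypts the
	 * incoming digest first.  When enc_after is zero the final block is
	 * only XORed into the digest and left unencrypted, so the C glue
	 * can finish it itself (e.g. after mixing in a CMAC/XCBC subkey).
	 */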
	SM4_PREPARE(x0)

	ld1		{RMAC.16b}, [x1]

	cbz		w4, .Lmac_update

	SM4_CRYPT_BLK(RMAC)

.Lmac_update:
	cbz		w3, .Lmac_ret

	sub		w6, w3, #1
	cmp		w5, wzr
	csel		w3, w3, w6, ne

	cbz		w3, .Lmac_end

.Lmac_loop_4x:
	cmp		w3, #4
	blt		.Lmac_loop_1x

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v3.16b
	SM4_CRYPT_BLK(RMAC)

	cbz		w3, .Lmac_end
	b		.Lmac_loop_4x

.Lmac_loop_1x:
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16

	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)

	cbnz		w3, .Lmac_loop_1x


.Lmac_end:
	cbnz		w5, .Lmac_ret

	ld1		{v0.16b}, [x2], #16
	eor		RMAC.16b, RMAC.16b, v0.16b

.Lmac_ret:
	st1		{RMAC.16b}, [x1]
	ret
SYM_FUNC_END(sm4_ce_mac_update)


	.section	".rodata", "a"
	.align 4
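/* tbl mask that reverses the order of the four 32-bit words in a vector */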
.Lbswap128_mask:
	.byte		0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
	.byte		0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03

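/*
 * Loading 16 bytes at .Lcts_permute_table + n or + 32 - n (0 < n < 16)
 * gives the index vectors used for the CTS shuffles above: 0xff entries
 * select zero in tbl and leave the destination byte untouched in tbx.
 */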
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
