/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XL		.req	v5
	XM		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	.text
	.arch		armv8-a+crypto

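	// 64x64 -> 128 bit carry-less multiply of the low (__pmull_p64) or
	// high (__pmull2_p64) 64-bit lanes of rn and rm, using the Crypto
	// Extensions form of PMULL/PMULL2.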
	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

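	// Fallback versions of the above for CPUs that only implement the
	// baseline 8x8 -> 16 bit form of PMULL: the 64x64 -> 128 bit
	// carry-less multiply is synthesised from partial products of
	// byte-rotated copies of the operands (A1-A3 and B1-B4 below), which
	// are then masked, shifted into place and xor'ed together.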
	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

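	// Core of the 8-bit polynomial multiply: \ad and \bd are the
	// operands, \b1-\b4 the precomputed byte rotations of \bd (sh1-sh4
	// or ss1-ss4), \t selects the pmull2 variant for the upper halves
	// and \nb the element arrangement.  The k00_16/k32_48 masks and the
	// zip/ext sequence below shift the partial products into place
	// before they are folded into \rq.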
	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

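	// Set up MASK with the reduction constant used by
	// __pmull_reduce_p64: each 64-bit lane ends up holding
	// 0xc200000000000000 (the low seven bits of 0xe1 shifted up by 57).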
	.macro		__pmull_pre_p64
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

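	// Besides the masks and permutation vectors, precompute the
	// byte-rotated copies of SHASH and SHASH2 (sh1-sh4, ss1-ss4) that
	// __pmull_p8_tail consumes, so they do not have to be recomputed for
	// every block.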
	.macro		__pmull_pre_p8
	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
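	// On entry, XL and XH hold the low and high halves of the unreduced
	// product and XM/T1 the middle (Karatsuba) contribution; the result
	// is folded back to 128 bits modulo the GHASH polynomial
	// x^128 + x^7 + x^2 + x + 1, with the caller applying the two eor
	// instructions that follow the macro invocation.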
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
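	// Same reduction as above, but with the multiplications by the
	// reduction constant replaced by shifts of 1, 2 and 7 (and their
	// 64-bit complements 63, 62 and 57), corresponding to the x, x^2
	// and x^7 terms of the polynomial.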
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

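	// Shared body of the pmull_ghash_update_p64/_p8 entry points below:
	// fold 'blocks' 16-byte blocks from 'src' (x2) into the digest at
	// 'dg' (x1) by multiplying with the hash key at 'k' (x3) in
	// GF(2^128), handling an optional 'head' block (x4) first.  The
	// conditional NEON yield macros allow the loop to be preempted
	// between blocks when processing long inputs.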
	.macro		__pmull_ghash, pn
	frame_push	5

	mov		x19, x0
	mov		x20, x1
	mov		x21, x2
	mov		x22, x3
	mov		x23, x4

0:	ld1		{SHASH.2d}, [x22]
	ld1		{XL.2d}, [x20]
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x23, 1f
	ld1		{T1.2d}, [x23]
	mov		x23, xzr
	b		2f

1:	ld1		{T1.2d}, [x21], #16
	sub		w19, w19, #1

2:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn 	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w19, 3f

	if_will_cond_yield_neon
	st1		{XL.2d}, [x20]
	do_cond_yield_neon
	b		0b
	endif_yield_neon

	b		1b

3:	st1		{XL.2d}, [x20]
	frame_pop
	ret
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
ENTRY(pmull_ghash_update_p64)
	__pmull_ghash	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	__pmull_ghash	p8
ENDPROC(pmull_ghash_update_p8)
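
	// The two entry points above share the same body and differ only in
	// which __pmull_* macro family they instantiate; the C glue code is
	// expected to call the _p64 version on CPUs that implement the
	// 64-bit form of PMULL and to fall back to _p8 otherwise.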

	KS		.req	v8
	CTR		.req	v9
	INP		.req	v10

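	// Load the AES round keys for \rounds rounds into v17-v31: AES-256
	// (14 rounds) uses all of v17-v31, AES-192 (12 rounds) starts at
	// v19 and AES-128 (10 rounds) at v21, so enc_block below can use a
	// fixed register numbering for the tail of the key schedule.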
	.macro		load_round_keys, rounds, rk
	cmp		\rounds, #12
	blo		2222f		/* 128 bits */
	beq		1111f		/* 192 bits */
	ld1		{v17.4s-v18.4s}, [\rk], #32
1111:	ld1		{v19.4s-v20.4s}, [\rk], #32
2222:	ld1		{v21.4s-v24.4s}, [\rk], #64
	ld1		{v25.4s-v28.4s}, [\rk], #64
	ld1		{v29.4s-v31.4s}, [\rk]
	.endm

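	// enc_round performs one inner AES round: aese does AddRoundKey,
	// SubBytes and ShiftRows, aesmc does MixColumns.  enc_block runs the
	// key schedule loaded by load_round_keys, ending with aese against
	// v30 (the final round has no MixColumns) and an eor with the last
	// round key in v31.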
	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_block, state, rounds
	cmp		\rounds, #12
	b.lo		2222f		/* 128 bits */
	b.eq		1111f		/* 192 bits */
	enc_round	\state, v17
	enc_round	\state, v18
1111:	enc_round	\state, v19
	enc_round	\state, v20
2222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
	enc_round	\state, \key
	.endr
	aese		\state\().16b, v30.16b
	eor		\state\().16b, \state\().16b, v31.16b
	.endm

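	// Combined GCM bulk en/decryption: for each 16-byte block, the AES
	// rounds that produce the next CTR keystream block are interleaved
	// with the PMULL/GHASH arithmetic so that the two dependency chains
	// overlap.  For \enc == 1, the input is xor'ed with the keystream
	// block of the previous iteration (initially loaded from ks[])
	// before being folded into the GHASH state, and the freshly
	// generated keystream block is written back to ks[] when the loop
	// terminates.  For \enc == 0, the ciphertext input is hashed first
	// and then decrypted with the keystream produced in the same
	// iteration.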
	.macro		pmull_gcm_do_crypt, enc
	ld1		{SHASH.2d}, [x4]
	ld1		{XL.2d}, [x1]
	ldr		x8, [x5, #8]			// load lower counter

	load_round_keys	w7, x6

	movi		MASK.16b, #0xe1
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
CPU_LE(	rev		x8, x8		)
	shl		MASK.2d, MASK.2d, #57
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	.if		\enc == 1
	ldr		x10, [sp]
	ld1		{KS.16b}, [x10]
	.endif

0:	ld1		{CTR.8b}, [x5]			// load upper counter
	ld1		{INP.16b}, [x3], #16
	rev		x9, x8
	add		x8, x8, #1
	sub		w0, w0, #1
	ins		CTR.d[1], x9			// set lower counter

	.if		\enc == 1
	eor		INP.16b, INP.16b, KS.16b	// encrypt input
	st1		{INP.16b}, [x2], #16
	.endif

	rev64		T1.16b, INP.16b

	cmp		w7, #12
	b.ge		2f				// AES-192/256?

1:	enc_round	CTR, v21

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8

	enc_round	CTR, v22

	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	enc_round	CTR, v23

	pmull2		XH.1q, SHASH.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b

	enc_round	CTR, v24

	pmull		XL.1q, SHASH.1d, XL.1d		// a0 * b0
	pmull		XM.1q, SHASH2.1d, T1.1d		// (a1 + a0)(b1 + b0)

	enc_round	CTR, v25

	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, XL.16b, XH.16b
	eor		XM.16b, XM.16b, T1.16b

	enc_round	CTR, v26

	eor		XM.16b, XM.16b, T2.16b
	pmull		T2.1q, XL.1d, MASK.1d

	enc_round	CTR, v27

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	enc_round	CTR, v28

	eor		XL.16b, XM.16b, T2.16b

	enc_round	CTR, v29

	ext		T2.16b, XL.16b, XL.16b, #8

	aese		CTR.16b, v30.16b

	pmull		XL.1q, XL.1d, MASK.1d
	eor		T2.16b, T2.16b, XH.16b

	eor		KS.16b, CTR.16b, v31.16b

	eor		XL.16b, XL.16b, T2.16b

	.if		\enc == 0
	eor		INP.16b, INP.16b, KS.16b
	st1		{INP.16b}, [x2], #16
	.endif

	cbnz		w0, 0b

CPU_LE(	rev		x8, x8		)
	st1		{XL.2d}, [x1]
	str		x8, [x5, #8]			// store lower counter

	.if		\enc == 1
	st1		{KS.16b}, [x10]
	.endif

	ret

2:	b.eq		3f				// AES-192?
	enc_round	CTR, v17
	enc_round	CTR, v18
3:	enc_round	CTR, v19
	enc_round	CTR, v20
	b		1b
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  u32 const rk[], int rounds, u8 ks[])
	 */
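	// Note: ks[] does not fit in the eight AAPCS64 argument registers
	// x0-x7 (which carry blocks .. rounds), so it is passed on the
	// stack and fetched above with 'ldr x10, [sp]'.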
ENTRY(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
ENDPROC(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  u32 const rk[], int rounds)
	 */
ENTRY(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
ENDPROC(pmull_gcm_decrypt)

	/*
	 * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
	 */
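	// Encrypt a single block.  If rk is NULL, the round keys that are
	// already loaded in v17-v31 (e.g. by a preceding pmull_gcm_encrypt
	// call) are reused instead of being reloaded.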
ENTRY(pmull_gcm_encrypt_block)
	cbz		x2, 0f
	load_round_keys	w3, x2
0:	ld1		{v0.16b}, [x1]
	enc_block	v0, w3
	st1		{v0.16b}, [x0]
	ret
ENDPROC(pmull_gcm_encrypt_block)