xref: /linux/arch/arm/crypto/ghash-ce-core.S (revision 2c97b5ae83dca56718774e7b4bf9640f05d11867)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */
7
#include <linux/linkage.h>
#include <asm/assembler.h>

	/*
	 * Select the target architecture and FPU before any NEON register is
	 * named. An assembler that validates register names at the point a
	 * .req alias is defined (Clang's integrated assembler does) would
	 * otherwise reject the d16-d31 aliases below with "register name
	 * expected", because no FPU providing those registers has been
	 * selected yet.
	 */
	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8

	/*
	 * Register aliases. Several names share one register; each such case
	 * is pointed out next to the second definition.
	 */

	/* quadword working registers */
	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	/* IN1 is the same register as XH */
	IN1		.req	q4

	/* doubleword halves of q0-q3, and the low half of q4 */
	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	/* temporaries of __pmull_p8, doubleword view */
	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	/* the same temporaries, quadword view */
	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	/* T2 is the same register as t4q */
	T2		.req	q9

	/* byte-rotated copies of SHASH_L/SHASH_H, filled in by the p8 entry */
	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	/* MASK (p64 entry) and SHASH2_p8 (p8 entry) share d28 */
	MASK		.req	d28
	SHASH2_p8	.req	d28

	/* masks filled in by the p8 entry; SHASH2_p64 shares d31 with k48 */
	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31

	/*
	 * Extra key material loaded by the p64 entry (presumably higher
	 * powers of the hash key - confirm against the C glue code).
	 * q10-q13 are d20-d27, i.e. the registers named s1l-s4h above.
	 */
	HH		.req	q10
	HH3		.req	q11
	HH4		.req	q12
	HH34		.req	q13

	HH_L		.req	d20
	HH_H		.req	d21
	HH3_L		.req	d22
	HH3_H		.req	d23
	HH4_L		.req	d24
	HH4_H		.req	d25
	HH34_L		.req	d26
	HH34_H		.req	d27
	/* SHASH2_H is the same register as k16 */
	SHASH2_H	.req	d29

	/*
	 * Second set of accumulators used by the 4-way p64 loop.
	 * q5-q8 are the registers named t0q-t3q above.
	 */
	XL2		.req	q5
	XM2		.req	q6
	XH2		.req	q7
	T3		.req	q8

	XL2_L		.req	d10
	XL2_H		.req	d11
	XM2_L		.req	d12
	XM2_H		.req	d13
	T3_L		.req	d16
	T3_H		.req	d17

	.text
93
94	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
95	vmull.p64	\rd, \rn, \rm
96	.endm
97
98	/*
99	 * This implementation of 64x64 -> 128 bit polynomial multiplication
100	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
101	 * "Fast Software Polynomial Multiplication on ARM Processors Using
102	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
103	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
104	 *
105	 * It has been slightly tweaked for in-order performance, and to allow
106	 * 'rq' to overlap with 'ad' or 'bd'.
107	 */
108	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
109	vext.8		t0l, \ad, \ad, #1	@ A1
110	.ifc		\b1, t4l
111	vext.8		t4l, \bd, \bd, #1	@ B1
112	.endif
113	vmull.p8	t0q, t0l, \bd		@ F = A1*B
114	vext.8		t1l, \ad, \ad, #2	@ A2
115	vmull.p8	t4q, \ad, \b1		@ E = A*B1
116	.ifc		\b2, t3l
117	vext.8		t3l, \bd, \bd, #2	@ B2
118	.endif
119	vmull.p8	t1q, t1l, \bd		@ H = A2*B
120	vext.8		t2l, \ad, \ad, #3	@ A3
121	vmull.p8	t3q, \ad, \b2		@ G = A*B2
122	veor		t0q, t0q, t4q		@ L = E + F
123	.ifc		\b3, t4l
124	vext.8		t4l, \bd, \bd, #3	@ B3
125	.endif
126	vmull.p8	t2q, t2l, \bd		@ J = A3*B
127	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
128	veor		t1q, t1q, t3q		@ M = G + H
129	.ifc		\b4, t3l
130	vext.8		t3l, \bd, \bd, #4	@ B4
131	.endif
132	vmull.p8	t4q, \ad, \b3		@ I = A*B3
133	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
134	vmull.p8	t3q, \ad, \b4		@ K = A*B4
135	vand		t0h, t0h, k48
136	vand		t1h, t1h, k32
137	veor		t2q, t2q, t4q		@ N = I + J
138	veor		t0l, t0l, t0h
139	veor		t1l, t1l, t1h
140	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
141	vand		t2h, t2h, k16
142	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
143	vmov.i64	t3h, #0
144	vext.8		t0q, t0q, t0q, #15
145	veor		t2l, t2l, t2h
146	vext.8		t1q, t1q, t1q, #14
147	vmull.p8	\rq, \ad, \bd		@ D = A*B
148	vext.8		t2q, t2q, t2q, #13
149	vext.8		t3q, t3q, t3q, #12
150	veor		t0q, t0q, t1q
151	veor		t2q, t2q, t3q
152	veor		\rq, \rq, t0q
153	veor		\rq, \rq, t2q
154	.endm
155
156	//
157	// PMULL (64x64->128) based reduction for CPUs that can do
158	// it in a single instruction.
159	//
160	.macro		__pmull_reduce_p64
161	vmull.p64	T1, XL_L, MASK
162
163	veor		XH_L, XH_L, XM_H
164	vext.8		T1, T1, T1, #8
165	veor		XL_H, XL_H, XM_L
166	veor		T1, T1, XL
167
168	vmull.p64	XL, T1_H, MASK
169	.endm
170
171	//
172	// Alternative reduction for CPUs that lack support for the
173	// 64x64->128 PMULL instruction
174	//
175	.macro		__pmull_reduce_p8
176	veor		XL_H, XL_H, XM_L
177	veor		XH_L, XH_L, XM_H
178
179	vshl.i64	T1, XL, #57
180	vshl.i64	T2, XL, #62
181	veor		T1, T1, T2
182	vshl.i64	T2, XL, #63
183	veor		T1, T1, T2
184	veor		XL_H, XL_H, T1_L
185	veor		XH_L, XH_L, T1_H
186
187	vshr.u64	T1, XL, #1
188	veor		XH, XH, XL
189	veor		XL, XL, T1
190	vshr.u64	T1, T1, #6
191	vshr.u64	XL, XL, #1
192	.endm
193
194	.macro		ghash_update, pn
195	vld1.64		{XL}, [r1]
196
197	/* do the head block first, if supplied */
198	ldr		ip, [sp]
199	teq		ip, #0
200	beq		0f
201	vld1.64		{T1}, [ip]
202	teq		r0, #0
203	b		3f
204
2050:	.ifc		\pn, p64
206	tst		r0, #3			// skip until #blocks is a
207	bne		2f			// round multiple of 4
208
209	vld1.8		{XL2-XM2}, [r2]!
2101:	vld1.8		{T3-T2}, [r2]!
211	vrev64.8	XL2, XL2
212	vrev64.8	XM2, XM2
213
214	subs		r0, r0, #4
215
216	vext.8		T1, XL2, XL2, #8
217	veor		XL2_H, XL2_H, XL_L
218	veor		XL, XL, T1
219
220	vrev64.8	T3, T3
221	vrev64.8	T1, T2
222
223	vmull.p64	XH, HH4_H, XL_H			// a1 * b1
224	veor		XL2_H, XL2_H, XL_H
225	vmull.p64	XL, HH4_L, XL_L			// a0 * b0
226	vmull.p64	XM, HH34_H, XL2_H		// (a1 + a0)(b1 + b0)
227
228	vmull.p64	XH2, HH3_H, XM2_L		// a1 * b1
229	veor		XM2_L, XM2_L, XM2_H
230	vmull.p64	XL2, HH3_L, XM2_H		// a0 * b0
231	vmull.p64	XM2, HH34_L, XM2_L		// (a1 + a0)(b1 + b0)
232
233	veor		XH, XH, XH2
234	veor		XL, XL, XL2
235	veor		XM, XM, XM2
236
237	vmull.p64	XH2, HH_H, T3_L			// a1 * b1
238	veor		T3_L, T3_L, T3_H
239	vmull.p64	XL2, HH_L, T3_H			// a0 * b0
240	vmull.p64	XM2, SHASH2_H, T3_L		// (a1 + a0)(b1 + b0)
241
242	veor		XH, XH, XH2
243	veor		XL, XL, XL2
244	veor		XM, XM, XM2
245
246	vmull.p64	XH2, SHASH_H, T1_L		// a1 * b1
247	veor		T1_L, T1_L, T1_H
248	vmull.p64	XL2, SHASH_L, T1_H		// a0 * b0
249	vmull.p64	XM2, SHASH2_p64, T1_L		// (a1 + a0)(b1 + b0)
250
251	veor		XH, XH, XH2
252	veor		XL, XL, XL2
253	veor		XM, XM, XM2
254
255	beq		4f
256
257	vld1.8		{XL2-XM2}, [r2]!
258
259	veor		T1, XL, XH
260	veor		XM, XM, T1
261
262	__pmull_reduce_p64
263
264	veor		T1, T1, XH
265	veor		XL, XL, T1
266
267	b		1b
268	.endif
269
2702:	vld1.64		{T1}, [r2]!
271	subs		r0, r0, #1
272
2733:	/* multiply XL by SHASH in GF(2^128) */
274#ifndef CONFIG_CPU_BIG_ENDIAN
275	vrev64.8	T1, T1
276#endif
277	vext.8		IN1, T1, T1, #8
278	veor		T1_L, T1_L, XL_H
279	veor		XL, XL, IN1
280
281	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
282	veor		T1, T1, XL
283	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
284	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)
285
2864:	veor		T1, XL, XH
287	veor		XM, XM, T1
288
289	__pmull_reduce_\pn
290
291	veor		T1, T1, XH
292	veor		XL, XL, T1
293
294	bne		0b
295
296	vst1.64		{XL}, [r1]
297	bx		lr
298	.endm
299
300	/*
301	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
302	 *			   struct ghash_key const *k, const char *head)
303	 */
304ENTRY(pmull_ghash_update_p64)
305	vld1.64		{SHASH}, [r3]!
306	vld1.64		{HH}, [r3]!
307	vld1.64		{HH3-HH4}, [r3]
308
309	veor		SHASH2_p64, SHASH_L, SHASH_H
310	veor		SHASH2_H, HH_L, HH_H
311	veor		HH34_L, HH3_L, HH3_H
312	veor		HH34_H, HH4_L, HH4_H
313
314	vmov.i8		MASK, #0xe1
315	vshl.u64	MASK, MASK, #57
316
317	ghash_update	p64
318ENDPROC(pmull_ghash_update_p64)
319
320ENTRY(pmull_ghash_update_p8)
321	vld1.64		{SHASH}, [r3]
322	veor		SHASH2_p8, SHASH_L, SHASH_H
323
324	vext.8		s1l, SHASH_L, SHASH_L, #1
325	vext.8		s2l, SHASH_L, SHASH_L, #2
326	vext.8		s3l, SHASH_L, SHASH_L, #3
327	vext.8		s4l, SHASH_L, SHASH_L, #4
328	vext.8		s1h, SHASH_H, SHASH_H, #1
329	vext.8		s2h, SHASH_H, SHASH_H, #2
330	vext.8		s3h, SHASH_H, SHASH_H, #3
331	vext.8		s4h, SHASH_H, SHASH_H, #4
332
333	vmov.i64	k16, #0xffff
334	vmov.i64	k32, #0xffffffff
335	vmov.i64	k48, #0xffffffffffff
336
337	ghash_update	p8
338ENDPROC(pmull_ghash_update_p8)
339