xref: /linux/lib/crypto/arm64/ghash-neon-core.S (revision 370c3883195566ee3e7d79e0146c3d735a406573)
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Accelerated GHASH implementation with ARMv8 ASIMD instructions.
4 *
5 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
6 */
7
8#include <linux/linkage.h>
9#include <asm/assembler.h>
10
11	SHASH		.req	v0		// hash key, loaded from [x3]
12	SHASH2		.req	v1		// both halves = SHASH.hi ^ SHASH.lo (see __pmull_pre_p8)
13	T1		.req	v2		// temporary
14	T2		.req	v3		// temporary
15	XM		.req	v5		// product (a1 + a0)(b1 + b0)
16	XL		.req	v6		// accumulator (loaded from / stored to [x1]); product a0 * b0
17	XH		.req	v7		// product a1 * b1
18	IN1		.req	v7		// input block with halves swapped; shares v7 with XH
19
	/*
	 * Masks and scratch registers used by __pmull_p8_tail.
	 *
	 * NOTE(review): k00_16, k32_48 and t3-t8 live in v8-v15, whose low
	 * 64 bits are callee-saved under AAPCS64. Nothing in this file saves
	 * or restores them; presumably every caller runs inside a kernel-mode
	 * SIMD section that preserves the whole FP/SIMD state - confirm.
	 */
20	k00_16		.req	v8		// 0x0000000000000000_000000000000ffff
21	k32_48		.req	v9		// 0x00000000ffffffff_0000ffffffffffff
22
23	t3		.req	v10		// A1 on entry to the tail macro, then L
24	t4		.req	v11		// E, then scratch
25	t5		.req	v12		// A2 on entry to the tail macro, then M
26	t6		.req	v13		// G, then scratch
27	t7		.req	v14		// A3 on entry to the tail macro, then N
28	t8		.req	v15		// I
29	t9		.req	v16		// K
30
	/* tbl index vectors rotating each 64-bit half by 1, 2 and 3 bytes */
31	perm1		.req	v17
32	perm2		.req	v18
33	perm3		.req	v19
34
	/* SHASH with each 64-bit half byte-rotated by 1, 2, 3 and 4 bytes */
35	sh1		.req	v20
36	sh2		.req	v21
37	sh3		.req	v22
38	sh4		.req	v23
39
	/* low 64 bits of SHASH2 byte-rotated by 1, 2, 3 and 4 bytes */
40	ss1		.req	v24
41	ss2		.req	v25
42	ss3		.req	v26
43	ss4		.req	v27
44
45	.text
46
47	.macro		__pmull_p8, rq, ad, bd
	/*
	 * Polynomial multiply of the low 64 bits of ad by the low 64 bits of
	 * bd, built from 8-bit pmull operations; the result is left in rq.
	 *
	 *   rq: destination vector register name (no arrangement suffix)
	 *   ad: first operand register name; rq may be the same register
	 *   bd: SHASH or SHASH2 - the name is pasted into the helper macro
	 *       name, and those are the only two low-half helpers defined in
	 *       this file
	 *
	 * Prepares A1..A3 (low half of ad rotated by 1, 2 and 3 bytes) in
	 * t3, t5 and t7, then hands over to the operand-specific helper.
	 * Clobbers t3-t9.
	 */
48	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
49	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
50	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3
51
52	__pmull_p8_\bd	\rq, \ad
53	.endm
54
55	.macro		__pmull2_p8, rq, ad, bd
	/*
	 * Upper-half counterpart of __pmull_p8: multiplies the high 64 bits
	 * of ad by the high 64 bits of bd and leaves the result in rq.
	 *
	 *   rq: destination vector register name (no arrangement suffix)
	 *   ad: first operand register name
	 *   bd: must be SHASH - __pmull2_p8_SHASH is the only upper-half
	 *       helper defined in this file
	 *
	 * A1..A3 are produced with tbl and the perm1..perm3 index vectors
	 * (set up by __pmull_pre_p8), which rotate each 64-bit half of ad
	 * independently so the upper half is rotated within itself.
	 * Clobbers t3-t9.
	 */
56	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
57	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
58	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3
59
60	__pmull2_p8_\bd	\rq, \ad
61	.endm
62
63	.macro		__pmull_p8_SHASH, rq, ad
	/*
	 * Low-half multiply by SHASH: 8b arrangement, plain pmull (empty t
	 * argument), with sh1..sh4 as the pre-rotated copies of SHASH.
	 */
64	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
65	.endm
66
67	.macro		__pmull_p8_SHASH2, rq, ad
	/*
	 * Low-half multiply by SHASH2: 8b arrangement, plain pmull (empty t
	 * argument), with ss1..ss4 as the pre-rotated copies of SHASH2.
	 */
68	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
69	.endm
70
71	.macro		__pmull2_p8_SHASH, rq, ad
	/*
	 * Upper-half multiply by SHASH: 16b arrangement and t = 2, so the
	 * tail macro emits pmull2; sh1..sh4 are the pre-rotated copies of
	 * SHASH.
	 */
72	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
73	.endm
74
75	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	/*
	 * Common body of the 8-bit pmull based 64x64 polynomial multiply.
	 *
	 *   rq:     destination register name (no arrangement suffix)
	 *   ad:     operand A, including its arrangement suffix
	 *   bd:     operand B, including its arrangement suffix
	 *   nb:     arrangement used for the helper operands (8b or 16b)
	 *   t:      empty for pmull (low halves) or 2 for pmull2 (high halves)
	 *   b1..b4: register names holding B rotated by 1, 2, 3 and 4 bytes
	 *
	 * On entry t3, t5 and t7 must hold A1, A2 and A3 (A rotated by 1, 2
	 * and 3 bytes). Clobbers t3-t9.
	 *
	 * D = A*B is computed last among the multiplies, after every other
	 * read of ad, so rq is allowed to name the same register as ad.
	 */
76	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
77	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
78	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
79	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
80	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
81	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
82	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
83	pmull\t		\rq\().8h, \ad, \bd			// D = A*B
84
85	eor		t3.16b, t3.16b, t4.16b			// L = E + F
86	eor		t5.16b, t5.16b, t6.16b			// M = G + H
87	eor		t7.16b, t7.16b, t8.16b			// N = I + J
88
	// De-interleave so that two partial products are handled per
	// instruction: t4 = {L.lo, M.lo}, t3 = {L.hi, M.hi},
	// t6 = {N.lo, K.lo}, t7 = {N.hi, K.hi}
89	uzp1		t4.2d, t3.2d, t5.2d
90	uzp2		t3.2d, t3.2d, t5.2d
91	uzp1		t6.2d, t7.2d, t9.2d
92	uzp2		t7.2d, t7.2d, t9.2d
93
94	// t3 = (L) (P0 + P1) << 8
95	// t5 = (M) (P2 + P3) << 16
	// k32_48 keeps the low 48 bits of L.hi and the low 32 bits of M.hi
96	eor		t4.16b, t4.16b, t3.16b
97	and		t3.16b, t3.16b, k32_48.16b
98
99	// t7 = (N) (P4 + P5) << 24
100	// t9 = (K) (P6 + P7) << 32
	// k00_16 keeps the low 16 bits of N.hi and clears K.hi entirely
101	eor		t6.16b, t6.16b, t7.16b
102	and		t7.16b, t7.16b, k00_16.16b
103
104	eor		t4.16b, t4.16b, t3.16b
105	eor		t6.16b, t6.16b, t7.16b
106
	// Re-interleave: t3 = L, t5 = M, t7 = N, t9 = K (lo/hi pairs again)
107	zip2		t5.2d, t4.2d, t3.2d
108	zip1		t3.2d, t4.2d, t3.2d
109	zip2		t9.2d, t6.2d, t7.2d
110	zip1		t7.2d, t6.2d, t7.2d
111
	// ext by 15/14/13/12 with both sources equal rotates the 128-bit
	// value left by 1/2/3/4 bytes: the << 8 .. << 32 noted above
112	ext		t3.16b, t3.16b, t3.16b, #15
113	ext		t5.16b, t5.16b, t5.16b, #14
114	ext		t7.16b, t7.16b, t7.16b, #13
115	ext		t9.16b, t9.16b, t9.16b, #12
116
	// Accumulate the four shifted partial products into D
117	eor		t3.16b, t3.16b, t5.16b
118	eor		t7.16b, t7.16b, t9.16b
119	eor		\rq\().16b, \rq\().16b, t3.16b
120	eor		\rq\().16b, \rq\().16b, t7.16b
121	.endm
122
123	.macro		__pmull_pre_p8
	/*
	 * One-time setup for the p8 multiply macros. SHASH must already be
	 * loaded. Produces SHASH2, the masks k00_16/k32_48, the tbl index
	 * vectors perm1..perm3 and the pre-rotated key copies sh1..sh4 and
	 * ss1..ss4. Clobbers x5 and T1.
	 *
	 * mov_q is not defined in this file (presumably it comes from
	 * <asm/assembler.h>).
	 */
	// SHASH2 = SHASH with halves swapped, xor SHASH: both 64-bit halves
	// end up holding SHASH.hi ^ SHASH.lo
124	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
125	eor		SHASH2.16b, SHASH2.16b, SHASH.16b
126
127	// k00_16 := 0x0000000000000000_000000000000ffff
128	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
129	movi		k32_48.2d, #0xffffffff
130	mov		k32_48.h[2], k32_48.h[0]
131	ushr		k00_16.2d, k32_48.2d, #32
132
133	// prepare the permutation vectors
	// x5 lists the byte indices 9..15,8 (upper half rotated by one byte);
	// xor-ing 8 into the low 8 bytes only turns the lower copy into
	// 1..7,0, so perm1 rotates each 64-bit half by one byte.
134	mov_q		x5, 0x080f0e0d0c0b0a09
135	movi		T1.8b, #8
136	dup		perm1.2d, x5
137	eor		perm1.16b, perm1.16b, T1.16b
	// ushr + sli rotate every 64-bit lane of perm1 right by 8, 16 and 24
	// bits, giving index vectors that rotate by 2, 3 and 4 bytes. The
	// 4-byte one stays in T1 because it is only needed for sh4 below.
138	ushr		perm2.2d, perm1.2d, #8
139	ushr		perm3.2d, perm1.2d, #16
140	ushr		T1.2d, perm1.2d, #24
141	sli		perm2.2d, perm1.2d, #56
142	sli		perm3.2d, perm1.2d, #48
143	sli		T1.2d, perm1.2d, #40
144
145	// precompute loop invariants
	// sh1..sh4: both halves of SHASH rotated by 1..4 bytes (B1..B4 for
	// the pmull and pmull2 variants); ss1..ss4: low half of SHASH2
	// rotated by 1..4 bytes (only a low-half variant exists for SHASH2)
146	tbl		sh1.16b, {SHASH.16b}, perm1.16b
147	tbl		sh2.16b, {SHASH.16b}, perm2.16b
148	tbl		sh3.16b, {SHASH.16b}, perm3.16b
149	tbl		sh4.16b, {SHASH.16b}, T1.16b
150	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
151	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
152	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
153	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
154	.endm
155
156	.macro		__pmull_reduce_p8
	/*
	 * Reduction step applied to the three partial products XL, XM, XH.
	 *
	 * Live-in:  XL, XM, XH, and T1 = ext(XL, XH, #8), which the caller
	 *           must compute before invoking this macro.
	 * Live-out: XL, XH and T2. The reduction is not finished here: the
	 *           caller still has to do T2 ^= XH and then XL ^= T2.
	 * Clobbers: T1, T2, XM.
	 *
	 * NOTE(review): the shift counts 57, 62 and 63 equal 64-7, 64-2 and
	 * 64-1, which suggests reduction modulo x^128 + x^7 + x^2 + x + 1;
	 * the source does not name the polynomial - confirm.
	 */
157	eor		XM.16b, XM.16b, T1.16b
158
	// fold the middle product into the adjoining halves of XL and XH
159	mov		XL.d[1], XM.d[0]
160	mov		XH.d[0], XM.d[1]
161
	// T2 = (XL << 57) ^ (XL << 62) ^ (XL << 63) ^ ext(XL, XH, #8),
	// shifts applied per 64-bit lane
162	shl		T1.2d, XL.2d, #57
163	shl		T2.2d, XL.2d, #62
164	eor		T2.16b, T2.16b, T1.16b
165	shl		T1.2d, XL.2d, #63
166	eor		T2.16b, T2.16b, T1.16b
167	ext		T1.16b, XL.16b, XH.16b, #8
168	eor		T2.16b, T2.16b, T1.16b
169
170	mov		XL.d[1], T2.d[0]
171	mov		XH.d[0], T2.d[1]
172
	// T2 ends up as the updated XL shifted right by 7 in total (1, then
	// 6), per lane; XL itself ends up as (XL ^ (XL >> 1)) >> 1
173	ushr		T2.2d, XL.2d, #1
174	eor		XH.16b, XH.16b, XL.16b
175	eor		XL.16b, XL.16b, T2.16b
176	ushr		T2.2d, T2.2d, #6
177	ushr		XL.2d, XL.2d, #1
178	.endm
179
180	/*
181	 * void pmull_ghash_update_p8(size_t blocks, struct polyval_elem *dg,
182	 *			      const u8 *src,
183	 *			      const struct polyval_elem *h)
184	 */
SYM_FUNC_START(pmull_ghash_update_p8)
	/*
	 * Absorb a number of 16-byte blocks into the accumulator: for each
	 * block, xor it into XL and multiply the result by SHASH in
	 * GF(2^128).
	 *
	 * In:       x0 = blocks, x1 = dg, x2 = src, x3 = h
	 * Out:      the 16 bytes at [x1] are updated in place
	 * Clobbers: x0, x2, x5, v0-v3, v5-v27
	 *
	 * A block count of zero returns immediately and leaves dg untouched.
	 * Without this guard the loop body would run once on data past src
	 * and then x0 would wrap around to 2^64 - 1.
	 *
	 * NOTE(review): v8-v15 are callee-saved (low 64 bits) under AAPCS64
	 * and are not preserved here; presumably callers run this inside a
	 * kernel-mode SIMD section that saves the FP/SIMD state - confirm.
	 */
	cbz		x0, .Lghash_p8_out		// blocks == 0: nothing to do

	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_p8

0:	ld1		{T1.2d}, [x2], #16		// next block, src += 16
	sub		x0, x0, #1

	/* multiply XL by SHASH in GF(2^128) */
	rev64		T1.16b, T1.16b

	// XL ^= block with halves swapped; T1 = new XL with halves swapped.
	// IN1 shares v7 with XH and is dead before XH is first written.
	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_p8	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b		// both halves = a1 + a0
	__pmull_p8	XL, XL, SHASH			// a0 * b0
	__pmull_p8	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

	// XM ^= XL ^ XH; T1 = ext(XL, XH, #8) is the live-in expected by
	// __pmull_reduce_p8
	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p8

	// final step of the reduction, using T2 as left by the macro
	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		x0, 0b

	st1		{XL.2d}, [x1]
.Lghash_p8_out:
	ret
SYM_FUNC_END(pmull_ghash_update_p8)
221