xref: /linux/arch/arm64/crypto/ghash-ce-core.S (revision c532de5a67a70f8533d495f8f2aaa9a0491c3ad0)
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
4 *
5 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
6 */
7
8#include <linux/linkage.h>
9#include <linux/cfi_types.h>
10#include <asm/assembler.h>
11
	/*
	 * Register aliases for the GHASH update routines.
	 *
	 * XH and IN1 both name v7. The third group (XL2 ... HH34) reuses
	 * v8-v19, i.e. the same physical registers as k00_16 ... perm3, so the
	 * state of the 8-bit (p8) helpers and of the 4-blocks-at-a-time PMULL
	 * code cannot be live at the same time.
	 */
12	SHASH		.req	v0	// first 16 bytes of the key at [x3]
13	SHASH2		.req	v1	// xor-folded key halves, for (a1 + a0)(b1 + b0)
14	T1		.req	v2	// scratch
15	T2		.req	v3	// scratch
16	MASK		.req	v4	// reduction constant, used by the p64 reduction only
17	XM		.req	v5	// middle partial product
18	XL		.req	v6	// low partial product; also carries the digest
19	XH		.req	v7	// high partial product
20	IN1		.req	v7	// byte-swapped input block (same register as XH)
21
	// masks consumed by __pmull_p8_tail, built in __pmull_pre_p8
22	k00_16		.req	v8
23	k32_48		.req	v9
24
	// scratch registers of the p8 multiply
25	t3		.req	v10
26	t4		.req	v11
27	t5		.req	v12
28	t6		.req	v13
29	t7		.req	v14
30	t8		.req	v15
31	t9		.req	v16
32
	// tbl index vectors built in __pmull_pre_p8
33	perm1		.req	v17
34	perm2		.req	v18
35	perm3		.req	v19
36
	// tbl-permuted copies of SHASH (see __pmull_pre_p8)
37	sh1		.req	v20
38	sh2		.req	v21
39	sh3		.req	v22
40	sh4		.req	v23
41
	// low half of SHASH2 rotated by 1..4 bytes
42	ss1		.req	v24
43	ss2		.req	v25
44	ss3		.req	v26
45	ss4		.req	v27
46
	// Accumulators and temporaries of the code that handles four blocks
	// per iteration. HH, HH3 and HH4 receive the three 16-byte key entries
	// stored after SHASH (presumably higher powers of the hash key --
	// TODO confirm against the C glue code); HH34 holds the folded halves
	// of HH3 and HH4.
47	XL2		.req	v8
48	XM2		.req	v9
49	XH2		.req	v10
50	XL3		.req	v11
51	XM3		.req	v12
52	XH3		.req	v13
53	TT3		.req	v14
54	TT4		.req	v15
55	HH		.req	v16
56	HH3		.req	v17
57	HH4		.req	v18
58	HH34		.req	v19
59
60	.text
61	.arch		armv8-a+crypto		// the code below uses pmull (64-bit) and aese/aesmc
62
	// 64x64 -> 128 bit carry-less multiply of the low doublewords of
	// lhs and rhs; the product is written to dst.
63	.macro		__pmull_p64, dst, lhs, rhs
64	pmull		\dst\().1q, \lhs\().1d, \rhs\().1d
65	.endm
66
	// 64x64 -> 128 bit carry-less multiply of the upper doublewords of
	// lhs and rhs (pmull2); the product is written to dst.
67	.macro		__pmull2_p64, dst, lhs, rhs
68	pmull2		\dst\().1q, \lhs\().2d, \rhs\().2d
69	.endm
70
	//
	// Carry-less multiply of the low 64 bits of ad and bd, assembled from
	// 8-bit pmull instructions (for CPUs without the 64-bit pmull).
	//
	//   rq - destination register
	//   ad - first operand (register name without arrangement)
	//   bd - second operand; must be the literal name SHASH or SHASH2,
	//        because it is pasted into the name of the helper macro
	//        invoked on the last line
	//
	// Writes t3, t5, t7 here; the helper goes on to use t3-t9 as scratch.
	//
71	.macro		__pmull_p8, rq, ad, bd
72	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1: A rotated by 1 byte
73	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2: A rotated by 2 bytes
74	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3: A rotated by 3 bytes
75
76	__pmull_p8_\bd	\rq, \ad
77	.endm
78
	//
	// Upper-half counterpart of __pmull_p8. The permuted copies of A are
	// produced with tbl and perm1-perm3 (set up by __pmull_pre_p8) across
	// all 16 bytes, and the helper then multiplies with pmull2.
	//
	// Only a SHASH variant of the helper is defined in this file.
	// Writes t3, t5, t7 here; the helper uses t3-t9 as scratch.
	//
79	.macro		__pmull2_p8, rq, ad, bd
80	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
81	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
82	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3
83
84	__pmull2_p8_\bd	\rq, \ad
85	.endm
86
	// Low-half p8 multiply by SHASH; sh1-sh4 are its permuted copies.
87	.macro		__pmull_p8_SHASH, dst, lhs
88	__pmull_p8_tail	\dst, \lhs\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
89	.endm
90
	// Low-half p8 multiply by SHASH2; ss1-ss4 are its rotated copies.
91	.macro		__pmull_p8_SHASH2, dst, lhs
92	__pmull_p8_tail	\dst, \lhs\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
93	.endm
94
	// Upper-half p8 multiply by SHASH: 16b arrangement, pmull2 selected
	// through the "2" suffix argument.
95	.macro		__pmull2_p8_SHASH, dst, lhs
96	__pmull_p8_tail	\dst, \lhs\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
97	.endm
98
	//
	// Core of the 8-bit polynomial multiply.
	//
	//   rq     - destination (register name without arrangement)
	//   ad, bd - operands A and B, including their arrangement
	//   nb     - that arrangement on its own (8b or 16b)
	//   t      - empty to use pmull, 2 to use pmull2
	//   b1-b4  - the permuted copies of B (sh1-sh4 or ss1-ss4)
	//
	// Expects t3, t5 and t7 to hold the permuted copies A1, A2 and A3 of A,
	// and k32_48 / k00_16 to hold the masks built by __pmull_pre_p8.
	// Clobbers t3-t9.
	//
99	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
100	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
101	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
102	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
103	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
104	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
105	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
106	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
107	pmull\t		\rq\().8h, \ad, \bd			// D = A*B
108
109	eor		t3.16b, t3.16b, t4.16b			// L = E + F
110	eor		t5.16b, t5.16b, t6.16b			// M = G + H
111	eor		t7.16b, t7.16b, t8.16b			// N = I + J
112
	// Regroup: t4 = low halves of L and M, t3 = their high halves;
	// t6 = low halves of N and K, t7 = their high halves. This lets one
	// "and" apply a different mask to each of the two terms.
113	uzp1		t4.2d, t3.2d, t5.2d
114	uzp2		t3.2d, t3.2d, t5.2d
115	uzp1		t6.2d, t7.2d, t9.2d
116	uzp2		t7.2d, t7.2d, t9.2d
117
118	// t3 = (L) (P0 + P1) << 8
119	// t5 = (M) (P2 + P3) << 16
120	eor		t4.16b, t4.16b, t3.16b
121	and		t3.16b, t3.16b, k32_48.16b
122
123	// t7 = (N) (P4 + P5) << 24
124	// t9 = (K) (P6 + P7) << 32
125	eor		t6.16b, t6.16b, t7.16b
126	and		t7.16b, t7.16b, k00_16.16b
127
128	eor		t4.16b, t4.16b, t3.16b
129	eor		t6.16b, t6.16b, t7.16b
130
	// undo the regrouping: t3 = L, t5 = M, t7 = N, t9 = K
131	zip2		t5.2d, t4.2d, t3.2d
132	zip1		t3.2d, t4.2d, t3.2d
133	zip2		t9.2d, t6.2d, t7.2d
134	zip1		t7.2d, t6.2d, t7.2d
135
	// ext by 15..12 rotates the 128-bit value up by 1..4 bytes, which
	// yields the << 8 ... << 32 named in the comments above
136	ext		t3.16b, t3.16b, t3.16b, #15
137	ext		t5.16b, t5.16b, t5.16b, #14
138	ext		t7.16b, t7.16b, t7.16b, #13
139	ext		t9.16b, t9.16b, t9.16b, #12
140
	// fold the four shifted terms into D
141	eor		t3.16b, t3.16b, t5.16b
142	eor		t7.16b, t7.16b, t9.16b
143	eor		\rq\().16b, \rq\().16b, t3.16b
144	eor		\rq\().16b, \rq\().16b, t7.16b
145	.endm
146
	//
	// Loop-invariant setup for the 64-bit PMULL code path.
	// Expects SHASH to be loaded and x3 to point at the key.
	// Clobbers x8 and T1.
	//
147	.macro		__pmull_pre_p64
148	add		x8, x3, #16
149	ld1		{HH.2d-HH4.2d}, [x8]		// HH, HH3, HH4 = the 48 bytes after SHASH
150
	// SHASH2 = { SHASH.lo ^ SHASH.hi, HH.lo ^ HH.hi }
151	trn1		SHASH2.2d, SHASH.2d, HH.2d
152	trn2		T1.2d, SHASH.2d, HH.2d
153	eor		SHASH2.16b, SHASH2.16b, T1.16b
154
	// HH34 = { HH3.lo ^ HH3.hi, HH4.lo ^ HH4.hi }
155	trn1		HH34.2d, HH3.2d, HH4.2d
156	trn2		T1.2d, HH3.2d, HH4.2d
157	eor		HH34.16b, HH34.16b, T1.16b
158
	// every byte 0xe1, then each doubleword shifted left by 57: both
	// lanes end up as 0xc200000000000000
159	movi		MASK.16b, #0xe1
160	shl		MASK.2d, MASK.2d, #57
161	.endm
162
	//
	// Loop-invariant setup for the 8-bit pmull code path: folded key,
	// masks, tbl vectors and the permuted copies of SHASH and SHASH2.
	// Expects SHASH to be loaded. Clobbers x5 and T1; T1 is left holding
	// the fourth tbl vector, for which there is no perm alias.
	//
163	.macro		__pmull_pre_p8
	// both halves of SHASH2 become SHASH.lo ^ SHASH.hi
164	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
165	eor		SHASH2.16b, SHASH2.16b, SHASH.16b
166
167	// k00_16 := 0x0000000000000000_000000000000ffff
168	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
169	movi		k32_48.2d, #0xffffffff
170	mov		k32_48.h[2], k32_48.h[0]
171	ushr		k00_16.2d, k32_48.2d, #32
172
173	// prepare the permutation vectors
	// The dup/eor pair gives perm1 the byte indices 1,2,...,7,0 in the low
	// half and 9,10,...,15,8 in the high half, so a tbl with perm1 rotates
	// each 64-bit half of its source by one byte. The ushr/sli pairs
	// rotate those index lists further: by 2 (perm2), 3 (perm3) and
	// 4 (T1) bytes.
174	mov_q		x5, 0x080f0e0d0c0b0a09
175	movi		T1.8b, #8
176	dup		perm1.2d, x5
177	eor		perm1.16b, perm1.16b, T1.16b
178	ushr		perm2.2d, perm1.2d, #8
179	ushr		perm3.2d, perm1.2d, #16
180	ushr		T1.2d, perm1.2d, #24
181	sli		perm2.2d, perm1.2d, #56
182	sli		perm3.2d, perm1.2d, #48
183	sli		T1.2d, perm1.2d, #40
184
185	// precompute loop invariants
	// sh1-sh4: SHASH permuted with perm1, perm2, perm3 and T1
	// ss1-ss4: low half of SHASH2 rotated by 1..4 bytes
186	tbl		sh1.16b, {SHASH.16b}, perm1.16b
187	tbl		sh2.16b, {SHASH.16b}, perm2.16b
188	tbl		sh3.16b, {SHASH.16b}, perm3.16b
189	tbl		sh4.16b, {SHASH.16b}, T1.16b
190	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
191	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
192	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
193	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
194	.endm
195
196	//
197	// PMULL (64x64->128) based reduction for CPUs that can do
198	// it in a single instruction.
	//
	// Every user in this file sets up, immediately before the macro:
	//   T2 = XL ^ XH, T1 = ext(XL, XH, #8), XM ^= T2
	// and finishes immediately after it with:
	//   T2 ^= XH, XL ^= T2
	// MASK must hold the constant built by "movi #0xe1; shl #57".
	// Modifies XL, XM, XH.d[0] and T2.
199	//
200	.macro		__pmull_reduce_p64
201	pmull		T2.1q, XL.1d, MASK.1d
202	eor		XM.16b, XM.16b, T1.16b
203
204	mov		XH.d[0], XM.d[1]
205	mov		XM.d[1], XL.d[0]
206
207	eor		XL.16b, XM.16b, T2.16b
208	ext		T2.16b, XL.16b, XL.16b, #8
209	pmull		XL.1q, XL.1d, MASK.1d
210	.endm
211
212	//
213	// Alternative reduction for CPUs that lack support for the
214	// 64x64->128 PMULL instruction
	//
	// Same calling pattern as __pmull_reduce_p64 (T1 = ext(XL, XH, #8) and
	// XM ^= XL ^ XH beforehand; T2 ^= XH and XL ^= T2 afterwards), but it
	// uses shifts instead of multiplying by MASK. The left shifts by 57,
	// 62 and 63 match the bits set in the MASK constant of the p64 path
	// (0xc200000000000000).
	// Modifies XL, XM, XH, T1 and T2.
215	//
216	.macro		__pmull_reduce_p8
217	eor		XM.16b, XM.16b, T1.16b
218
219	mov		XL.d[1], XM.d[0]
220	mov		XH.d[0], XM.d[1]
221
222	shl		T1.2d, XL.2d, #57
223	shl		T2.2d, XL.2d, #62
224	eor		T2.16b, T2.16b, T1.16b
225	shl		T1.2d, XL.2d, #63
226	eor		T2.16b, T2.16b, T1.16b
227	ext		T1.16b, XL.16b, XH.16b, #8
228	eor		T2.16b, T2.16b, T1.16b
229
230	mov		XL.d[1], T2.d[0]
231	mov		XH.d[0], T2.d[1]
232
233	ushr		T2.2d, XL.2d, #1
234	eor		XH.16b, XH.16b, XL.16b
235	eor		XL.16b, XL.16b, T2.16b
236	ushr		T2.2d, T2.2d, #6
237	ushr		XL.2d, XL.2d, #1
238	.endm
239
	/*
	 * Body of pmull_ghash_update_p64 / pmull_ghash_update_p8; pn selects
	 * the p64 or p8 flavour of the helper macros.
	 *
	 * Registers, as used below:
	 *   w0 - number of 16-byte blocks to read from x2
	 *   x1 - digest, loaded into XL on entry and stored back at label 5
	 *   x2 - source, advanced as blocks are consumed
	 *   x3 - key
	 *   x4 - optional extra block that is hashed first (NULL if absent)
	 *
	 * Flow: the optional head block and single blocks go through label 3.
	 * The p64 flavour additionally has a loop at label 1 that consumes
	 * four blocks per iteration once the block count is a multiple of 4.
	 *
	 * NOTE(review): with x4 == NULL and w0 == 0 the code still loads and
	 * hashes data and w0 wraps around; callers presumably never do that.
	 */
240	.macro		__pmull_ghash, pn
241	ld1		{SHASH.2d}, [x3]
242	ld1		{XL.2d}, [x1]
243
244	__pmull_pre_\pn
245
246	/* do the head block first, if supplied */
247	cbz		x4, 0f
248	ld1		{T1.2d}, [x4]
249	mov		x4, xzr				// head is consumed only once
250	b		3f
251
2520:	.ifc		\pn, p64
253	tbnz		w0, #0, 2f		// skip until #blocks is a
254	tbnz		w0, #1, 2f		// round multiple of 4
255
	// four blocks land in v12-v15: XM3, XH3, TT3, TT4
2561:	ld1		{XM3.16b-TT4.16b}, [x2], #64
257
258	sub		w0, w0, #4
259
	// The data was loaded as bytes (.16b), so these rev64 are not wrapped
	// in CPU_LE(), unlike the one at label 3 where the block is loaded as
	// two doublewords.
260	rev64		T1.16b, XM3.16b			// T1  = 1st block
261	rev64		T2.16b, XH3.16b			// T2  = 2nd block
262	rev64		TT4.16b, TT4.16b		// TT4 = 4th block
263	rev64		TT3.16b, TT3.16b		// TT3 = 3rd block
264
265	ext		IN1.16b, TT4.16b, TT4.16b, #8
266	ext		XL3.16b, TT3.16b, TT3.16b, #8
267
	// 4th block times SHASH
268	eor		TT4.16b, TT4.16b, IN1.16b
269	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
270	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
271	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)
272
	// 3rd block times HH
273	eor		TT3.16b, TT3.16b, XL3.16b
274	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
275	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
276	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)
277
278	ext		IN1.16b, T2.16b, T2.16b, #8
279	eor		XL2.16b, XL2.16b, XL3.16b
280	eor		XH2.16b, XH2.16b, XH3.16b
281	eor		XM2.16b, XM2.16b, XM3.16b
282
	// 2nd block times HH3
283	eor		T2.16b, T2.16b, IN1.16b
284	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
285	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
286	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)
287
288	eor		XL2.16b, XL2.16b, XL3.16b
289	eor		XH2.16b, XH2.16b, XH3.16b
290	eor		XM2.16b, XM2.16b, XM3.16b
291
	// 1st block, combined with the current digest, times HH4
292	ext		IN1.16b, T1.16b, T1.16b, #8
293	ext		TT3.16b, XL.16b, XL.16b, #8
294	eor		XL.16b, XL.16b, IN1.16b
295	eor		T1.16b, T1.16b, TT3.16b
296
297	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
298	eor		T1.16b, T1.16b, XL.16b
299	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
300	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)
301
	// add the three accumulated products
302	eor		XL.16b, XL.16b, XL2.16b
303	eor		XH.16b, XH.16b, XH2.16b
304	eor		XM.16b, XM.16b, XM2.16b
305
306	eor		T2.16b, XL.16b, XH.16b
307	ext		T1.16b, XL.16b, XH.16b, #8
308	eor		XM.16b, XM.16b, T2.16b
309
310	__pmull_reduce_p64
311
312	eor		T2.16b, T2.16b, XH.16b
313	eor		XL.16b, XL.16b, T2.16b
314
315	cbz		w0, 5f
316	b		1b
317	.endif
318
3192:	ld1		{T1.2d}, [x2], #16
320	sub		w0, w0, #1
321
3223:	/* multiply XL by SHASH in GF(2^128) */
323CPU_LE(	rev64		T1.16b, T1.16b	)
324
	// fold the input block into the digest, swapping the 64-bit halves
325	ext		T2.16b, XL.16b, XL.16b, #8
326	ext		IN1.16b, T1.16b, T1.16b, #8
327	eor		T1.16b, T1.16b, T2.16b
328	eor		XL.16b, XL.16b, IN1.16b
329
330	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
331	eor		T1.16b, T1.16b, XL.16b
332	__pmull_\pn 	XL, XL, SHASH			// a0 * b0
333	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)
334
3354:	eor		T2.16b, XL.16b, XH.16b
336	ext		T1.16b, XL.16b, XH.16b, #8
337	eor		XM.16b, XM.16b, T2.16b
338
339	__pmull_reduce_\pn
340
341	eor		T2.16b, T2.16b, XH.16b
342	eor		XL.16b, XL.16b, T2.16b
343
344	cbnz		w0, 0b				// more blocks to do
345
3465:	st1		{XL.2d}, [x1]
347	ret
348	.endm
349
350	/*
351	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
352	 *			   struct ghash_key const *k, const char *head)
	 *
	 * The arguments arrive in w0, x1, x2, x3 and x4 respectively; see
	 * __pmull_ghash for how each one is used. This entry point
	 * instantiates the variant that uses the 64-bit PMULL instruction.
353	 */
354SYM_TYPED_FUNC_START(pmull_ghash_update_p64)
355	__pmull_ghash	p64
356SYM_FUNC_END(pmull_ghash_update_p64)
357
	/*
	 * Same interface as pmull_ghash_update_p64, instantiated with the
	 * helpers that only need the 8-bit pmull instruction.
	 */
358SYM_TYPED_FUNC_START(pmull_ghash_update_p8)
359	__pmull_ghash	p8
360SYM_FUNC_END(pmull_ghash_update_p8)
361
	/*
	 * Register aliases for the combined AES-GCM routines.
	 *
	 * Several of them share physical registers with the GHASH aliases:
	 *   KS0-KS3 = v8-v11  (XL2, XM2, XH2, XL3)
	 *   K4, K5  = v12,v13 (XM3, XH3)
	 *   K6, K7  = v4, v5  (MASK, XM)
	 *   K8, K9  = v14,v15 (TT3, TT4)
	 * which is why pmull_gcm_ghash_4x rebuilds MASK on every call and why
	 * K6-K9 are reloaded from memory each time a block is encrypted.
	 */
	// keystream blocks: counter values that get encrypted in place
362	KS0		.req	v8
363	KS1		.req	v9
364	KS2		.req	v10
365	KS3		.req	v11
366
	// up to four blocks of input, transformed in place into output
367	INP0		.req	v21
368	INP1		.req	v22
369	INP2		.req	v23
370	INP3		.req	v24
371
	// AES round keys; KK, KL and KM hold the last three of the schedule
372	K0		.req	v25
373	K1		.req	v26
374	K2		.req	v27
375	K3		.req	v28
376	K4		.req	v12
377	K5		.req	v13
378	K6		.req	v4
379	K7		.req	v5
380	K8		.req	v14
381	K9		.req	v15
382	KK		.req	v29
383	KL		.req	v30
384	KM		.req	v31
385
	//
	// Load the round keys that stay resident for the whole call:
	// K0-K5 from the start of the schedule at keys, and KK/KL/KM from
	// keys + 16 * nrounds - 32, i.e. the last three round keys.
	// scratch is clobbered.
	//
386	.macro		load_round_keys, nrounds, keys, scratch
387	add		\scratch, \keys, #64
388	ld1		{K0.4s-K3.4s}, [\keys]
389	ld1		{K4.4s-K5.4s}, [\scratch]
390	add		\scratch, \keys, \nrounds, lsl #4
391	sub		\scratch, \scratch, #32
392	ld1		{KK.4s-KM.4s}, [\scratch]
393	.endm
394
	// One full AES round on blk with round key rkey:
	// aese followed by aesmc.
395	.macro		enc_round, blk, rkey
396	aese		\blk\().16b, \rkey\().16b
397	aesmc		\blk\().16b, \blk\().16b
398	.endm
399
	// Apply one AES round with the same round key to four blocks.
400	.macro		enc_qround, blk0, blk1, blk2, blk3, rkey
401	.irp		blk, \blk0, \blk1, \blk2, \blk3
402	enc_round	\blk, \rkey
403	.endr
404	// expands to enc_round on blk0, blk1, blk2, blk3, in that order
405	.endm
406
	/*
	 * Encrypt the single AES block held in state.
	 *
	 *   state  - vector register alias holding the block (updated in place)
	 *   rounds - register holding the number of AES rounds; only bits 1
	 *            and 2 are tested (presumably the value is 10, 12 or 14)
	 *   rk     - pointer to the round key schedule
	 *   tmp    - scratch general purpose register
	 *
	 * K0-K5 and KK/KL/KM must already be loaded (see load_round_keys).
	 * K6/K7, and K8/K9 when bit 2 of rounds is set, are loaded here from
	 * rk + 96 onwards.
	 *
	 * The path taken when bit 2 is set lives in subsection 1, so the
	 * fall-through path stays contiguous. Labels carry a unique numeric
	 * suffix per expansion, which allows the macro to be used repeatedly.
	 */
407	.macro		enc_block, state, rounds, rk, tmp
408	add		\tmp, \rk, #96
409	ld1		{K6.4s-K7.4s}, [\tmp], #32
	// comma-separated list, matching the .irp in pmull_gcm_enc_4x
410	.irp		key, K0, K1, K2, K3, K4, K5
411	enc_round	\state, \key
412	.endr
413
414	tbnz		\rounds, #2, .Lnot128_\@
415.Lout256_\@:
416	enc_round	\state, K6
417	enc_round	\state, K7
418
419.Lout192_\@:
420	enc_round	\state, KK
	// final round: no aesmc, then add the last round key
421	aese		\state\().16b, KL.16b
422	eor		\state\().16b, \state\().16b, KM.16b
423
424	.subsection	1
425.Lnot128_\@:
426	ld1		{K8.4s-K9.4s}, [\tmp], #32
427	enc_round	\state, K6
428	enc_round	\state, K7
	// K6/K7 are reused for the next two keys, needed only if bit 1 is set
429	ld1		{K6.4s-K7.4s}, [\tmp]
430	enc_round	\state, K8
431	enc_round	\state, K9
432	tbz		\rounds, #1, .Lout192_\@
433	b		.Lout256_\@
434	.previous
435	.endm
436
	/*
	 * Common body of pmull_gcm_encrypt (enc == 1) and pmull_gcm_decrypt
	 * (enc == 0).
	 *
	 * Registers on entry, as read by the code below:
	 *   x0 - number of bytes to process (0: only finalise)
	 *   x1 - destination, x2 - source
	 *   x3 - GHASH key: SHASH followed by HH, HH3 and HH4
	 *   x4 - digest, loaded into XL and stored back at label 5
	 *   x5 - 16-byte counter block; its last word is the block counter
	 *   x6 - AES round keys, x7 - number of AES rounds
	 *   stack - a pointer that is NULL when no tag is to be produced;
	 *           otherwise it supplies the lengths block and, when
	 *           encrypting, receives the tag. The decrypt flavour also
	 *           reads a tag pointer and an authsize from the stack.
	 *
	 * Kept across the loop: w8 = block counter in CPU byte order,
	 * w9 = number of blocks handled in the current round.
	 */
437	.align		6
438	.macro		pmull_gcm_do_crypt, enc
	// presumably sets up the frame record and saves x19, which is used
	// below -- the macro comes from asm/assembler.h and is not shown here
439	frame_push	1
440
441	load_round_keys	x7, x6, x8
442
443	ld1		{SHASH.2d}, [x3], #16
444	ld1		{HH.2d-HH4.2d}, [x3]
445
	// SHASH2 = { SHASH.lo ^ SHASH.hi, HH.lo ^ HH.hi }
446	trn1		SHASH2.2d, SHASH.2d, HH.2d
447	trn2		T1.2d, SHASH.2d, HH.2d
448	eor		SHASH2.16b, SHASH2.16b, T1.16b
449
	// HH34 = { HH3.lo ^ HH3.hi, HH4.lo ^ HH4.hi }
450	trn1		HH34.2d, HH3.2d, HH4.2d
451	trn2		T1.2d, HH3.2d, HH4.2d
452	eor		HH34.16b, HH34.16b, T1.16b
453
454	ld1		{XL.2d}, [x4]
455
456	cbz		x0, 3f				// tag only?
457
458	ldr		w8, [x5, #12]			// load lower counter
459CPU_LE(	rev		w8, w8		)
460
	// Start of one round of up to four blocks.
4610:	mov		w9, #4				// max blocks per round
462	add		x10, x0, #0xf
463	lsr		x10, x10, #4			// remaining blocks
464
	// x0 becomes the byte count left after this round; a negative value
	// means this is the last round and holds fewer than 64 bytes
465	subs		x0, x0, #64
466	csel		w9, w10, w9, mi
	// The counter is advanced up front; pmull_gcm_enc_4x derives the
	// counter of each block by subtracting from w8.
467	add		w8, w8, w9
468
469	bmi		1f
470	ld1		{INP0.16b-INP3.16b}, [x2], #64
471	.subsection	1
472	/*
473	 * Populate the four input registers right to left with up to 63 bytes
474	 * of data, using overlapping loads to avoid branches.
475	 *
476	 *                INP0     INP1     INP2     INP3
477	 *  1 byte     |        |        |        |x       |
478	 * 16 bytes    |        |        |        |xxxxxxxx|
479	 * 17 bytes    |        |        |xxxxxxxx|x       |
480	 * 47 bytes    |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
481	 * etc etc
482	 *
483	 * Note that this code may read up to 15 bytes before the start of
484	 * the input. It is up to the calling code to ensure this is safe if
485	 * this happens in the first iteration of the loop (i.e., when the
486	 * input size is < 16 bytes)
	 *
	 * Values set up here and used again by the store path at label 6:
	 *   x19           - number of bytes in the last block (1..16)
	 *   x14, x15, x16 - post-increments for the INP0, INP1, INP2 accesses
	 *   x17           - .Lpermute_table + x19
487	 */
4881:	mov		x15, #16
489	ands		x19, x0, #0xf
490	csel		x19, x19, x15, ne
491	adr_l		x17, .Lpermute_table + 16
492
493	sub		x11, x15, x19			// x11 = 16 - x19
494	add		x12, x17, x11
495	sub		x17, x17, x11
	// T1: tbl vector that moves the top x19 bytes of a register to the
	// bottom and zeroes the remaining bytes
496	ld1		{T1.16b}, [x12]
497	sub		x10, x1, x11
498	sub		x11, x2, x11
499
	// x0 is (bytes - 64) here, so the three compares test whether more
	// than 48, 32 and 16 bytes are left
500	cmp		x0, #-16
501	csel		x14, x15, xzr, gt
502	cmp		x0, #-32
503	csel		x15, x15, xzr, gt
504	cmp		x0, #-48
505	csel		x16, x19, xzr, gt
	// 16 bytes or fewer: back both pointers up so that the single
	// 16-byte access ends exactly at the end of the data
506	csel		x1, x1, x10, gt
507	csel		x2, x2, x11, gt
508
509	ld1		{INP0.16b}, [x2], x14
510	ld1		{INP1.16b}, [x2], x15
511	ld1		{INP2.16b}, [x2], x16
512	ld1		{INP3.16b}, [x2]
513	tbl		INP3.16b, {INP3.16b}, T1.16b
514	b		2f
515	.previous
516
	// decryption hashes the ciphertext before it is overwritten
5172:	.if		\enc == 0
518	bl		pmull_gcm_ghash_4x
519	.endif
520
521	bl		pmull_gcm_enc_4x
522
523	tbnz		x0, #63, 6f			// final partial round
524	st1		{INP0.16b-INP3.16b}, [x1], #64
	// encryption hashes the ciphertext it has just produced
525	.if		\enc == 1
526	bl		pmull_gcm_ghash_4x
527	.endif
	// The flags are still those of the subs at label 0: the two helpers
	// called in between contain no flag-setting instructions.
528	bne		0b
529
	// Finalisation; the stack slot read here is the first stack argument
	// if frame_push lays the frame out as assumed -- TODO confirm.
5303:	ldr		x10, [sp, #.Lframe_local_offset]
531	cbz		x10, 5f				// output tag?
532
533	ld1		{INP3.16b}, [x10]		// load lengths[]
534	mov		w9, #1
535	bl		pmull_gcm_ghash_4x
536
	// encrypt the counter block with its last word set to big-endian 1
537	mov		w11, #(0x1 << 24)		// BE '1U'
538	ld1		{KS0.16b}, [x5]
539	mov		KS0.s[3], w11
540
541	enc_block	KS0, x7, x6, x12
542
	// ext #8 plus rev64 reverses all 16 bytes of the digest
543	ext		XL.16b, XL.16b, XL.16b, #8
544	rev64		XL.16b, XL.16b
545	eor		XL.16b, XL.16b, KS0.16b
546
547	.if		\enc == 1
548	st1		{XL.16b}, [x10]			// store tag
549	.else
550	ldp		x11, x12, [sp, #40]		// load tag pointer and authsize
551	adr_l		x17, .Lpermute_table
552	ld1		{KS0.16b}, [x11]		// load supplied tag
553	add		x17, x17, x12
554	ld1		{KS1.16b}, [x17]		// load permute vector
555
556	cmeq		XL.16b, XL.16b, KS0.16b		// compare tags
557	mvn		XL.16b, XL.16b			// -1 for fail, 0 for pass
558	tbl		XL.16b, {XL.16b}, KS1.16b	// keep authsize bytes only
	// b0 is the low byte of v0, so SHASH is overwritten here
559	sminv		b0, XL.16b			// signed minimum across XL
560	smov		w0, v0.b[0]			// return b0
561	.endif
562
5634:	frame_pop
564	ret
565
	// No tag requested: write the counter and digest back.
	// NOTE(review): w0 is not set explicitly on this path.
5665:
567CPU_LE(	rev		w8, w8		)
568	str		w8, [x5, #12]			// store lower counter
569	st1		{XL.2d}, [x4]
570	b		4b
571
	// Store path of the final partial round.
	// T1 moves the bottom x19 bytes of a register to the top; as a tbl
	// vector it zeroes the rest, as a tbx vector it leaves it untouched.
	// T2 moves the top (16 - x19) bytes to the bottom.
5726:	ld1		{T1.16b-T2.16b}, [x17], #32	// permute vectors
	// x17 = .Lpermute_table + 32 - x19, used below when enc == 1
573	sub		x17, x17, x19, lsl #1
574
575	cmp		w9, #1
576	beq		7f
577	.subsection	1
	// Single block: merge the output bytes into the 16 bytes currently
	// at [x1], so that the bytes in front of the output are written back
	// unchanged.
5787:	ld1		{INP2.16b}, [x1]
579	tbx		INP2.16b, {INP3.16b}, T1.16b
580	mov		INP3.16b, INP2.16b
581	b		8f
582	.previous
583
	// Several blocks: the last store overlaps the previous block, so
	// INP3 is rebuilt as the tail of INP2 followed by the final bytes.
584	st1		{INP0.16b}, [x1], x14
585	st1		{INP1.16b}, [x1], x15
586	st1		{INP2.16b}, [x1], x16
587	tbl		INP3.16b, {INP3.16b}, T1.16b
588	tbx		INP3.16b, {INP2.16b}, T2.16b
5898:	st1		{INP3.16b}, [x1]
590
591	.if		\enc == 1
592	ld1		{T1.16b}, [x17]
593	tbl		INP3.16b, {INP3.16b}, T1.16b	// clear non-data bits
594	bl		pmull_gcm_ghash_4x
595	.endif
596	b		3b
597	.endm
598
599	/*
600	 * void pmull_gcm_encrypt(int bytes, u8 dst[], const u8 src[],
601	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
602	 *			  u32 const rk[], int rounds, u8 tag[])
	 *
	 * The argument list follows the register use in pmull_gcm_do_crypt:
	 * x0 is a byte count, x6 points at the round keys, x7 holds the number
	 * of rounds and the tag pointer is read from the stack.
	 * NOTE(review): the exact C types are not visible from this file;
	 * confirm against the declaration in the C glue code.
603	 */
604SYM_FUNC_START(pmull_gcm_encrypt)
605	pmull_gcm_do_crypt	1
606SYM_FUNC_END(pmull_gcm_encrypt)
607
608	/*
609	 * int pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[],
610	 *			 struct ghash_key const *k, u64 dg[], u8 ctr[],
611	 *			 u32 const rk[], int rounds, const u8 l[],
	 *			 const u8 tag[], u64 authsize)
	 *
	 * The argument list follows the register use in pmull_gcm_do_crypt:
	 * x0 is a byte count, x6 points at the round keys, x7 holds the number
	 * of rounds, and the lengths pointer, tag pointer and authsize are read
	 * from the stack. When the lengths pointer is not NULL, w0 returns 0 if
	 * the first authsize bytes of the computed tag match the supplied one
	 * and -1 otherwise.
	 * NOTE(review): the exact C types are not visible from this file;
	 * confirm against the declaration in the C glue code.
612	 */
613SYM_FUNC_START(pmull_gcm_decrypt)
614	pmull_gcm_do_crypt	0
615SYM_FUNC_END(pmull_gcm_decrypt)
616
	/*
	 * Fold up to four blocks into the GHASH digest.
	 *
	 * In:  w9 = number of blocks (1..4), held in the last w9 registers of
	 *      INP0..INP3; XL = digest; SHASH, HH, HH3, HH4, SHASH2 and HH34
	 *      as set up by pmull_gcm_do_crypt.
	 * Out: XL = updated digest.
	 * Clobbers MASK, T1, T2, TT3, TT4, XM, XH/IN1, XL2, XM2 and XH2. The
	 * last three are the same registers as KS0-KS2.
	 */
617SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
	// MASK shares v4 with round key K6, so it is rebuilt on every call
618	movi		MASK.16b, #0xe1
619	shl		MASK.2d, MASK.2d, #57
620
621	rev64		T1.16b, INP0.16b
622	rev64		T2.16b, INP1.16b
623	rev64		TT3.16b, INP2.16b
624	rev64		TT4.16b, INP3.16b
625
626	ext		XL.16b, XL.16b, XL.16b, #8
627
628	tbz		w9, #2, 0f			// <4 blocks?
629	.subsection	1
	// Fewer than four blocks: start from zeroed accumulators, fold the
	// digest into the first block that is present and enter the chain
	// below at the matching step.
6300:	movi		XH2.16b, #0
631	movi		XM2.16b, #0
632	movi		XL2.16b, #0
633
634	tbz		w9, #0, 1f			// 2 blocks?
635	tbz		w9, #1, 2f			// 1 block?
636
	// 3 blocks
637	eor		T2.16b, T2.16b, XL.16b
638	ext		T1.16b, T2.16b, T2.16b, #8
639	b		.Lgh3
640
6411:	eor		TT3.16b, TT3.16b, XL.16b
642	ext		T2.16b, TT3.16b, TT3.16b, #8
643	b		.Lgh2
644
6452:	eor		TT4.16b, TT4.16b, XL.16b
646	ext		IN1.16b, TT4.16b, TT4.16b, #8
647	b		.Lgh1
648	.previous
649
	// INP0 block, combined with the digest, times HH4
650	eor		T1.16b, T1.16b, XL.16b
651	ext		IN1.16b, T1.16b, T1.16b, #8
652
653	pmull2		XH2.1q, HH4.2d, IN1.2d		// a1 * b1
654	eor		T1.16b, T1.16b, IN1.16b
655	pmull		XL2.1q, HH4.1d, IN1.1d		// a0 * b0
656	pmull2		XM2.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)
657
	// INP1 block times HH3
658	ext		T1.16b, T2.16b, T2.16b, #8
659.Lgh3:	eor		T2.16b, T2.16b, T1.16b
660	pmull2		XH.1q, HH3.2d, T1.2d		// a1 * b1
661	pmull		XL.1q, HH3.1d, T1.1d		// a0 * b0
662	pmull		XM.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)
663
664	eor		XH2.16b, XH2.16b, XH.16b
665	eor		XL2.16b, XL2.16b, XL.16b
666	eor		XM2.16b, XM2.16b, XM.16b
667
	// INP2 block times HH
668	ext		T2.16b, TT3.16b, TT3.16b, #8
669.Lgh2:	eor		TT3.16b, TT3.16b, T2.16b
670	pmull2		XH.1q, HH.2d, T2.2d		// a1 * b1
671	pmull		XL.1q, HH.1d, T2.1d		// a0 * b0
672	pmull2		XM.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)
673
674	eor		XH2.16b, XH2.16b, XH.16b
675	eor		XL2.16b, XL2.16b, XL.16b
676	eor		XM2.16b, XM2.16b, XM.16b
677
	// INP3 block times SHASH
678	ext		IN1.16b, TT4.16b, TT4.16b, #8
679.Lgh1:	eor		TT4.16b, TT4.16b, IN1.16b
	// XL is computed before XH because IN1 and XH are the same register
680	pmull		XL.1q, SHASH.1d, IN1.1d		// a0 * b0
681	pmull2		XH.1q, SHASH.2d, IN1.2d		// a1 * b1
682	pmull		XM.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)
683
684	eor		XH.16b, XH.16b, XH2.16b
685	eor		XL.16b, XL.16b, XL2.16b
686	eor		XM.16b, XM.16b, XM2.16b
687
688	eor		T2.16b, XL.16b, XH.16b
689	ext		T1.16b, XL.16b, XH.16b, #8
690	eor		XM.16b, XM.16b, T2.16b
691
692	__pmull_reduce_p64
693
694	eor		T2.16b, T2.16b, XH.16b
695	eor		XL.16b, XL.16b, T2.16b
696
697	ret
698SYM_FUNC_END(pmull_gcm_ghash_4x)
699
	/*
	 * Generate four blocks of AES-CTR keystream and xor them into
	 * INP0-INP3.
	 *
	 * In:  x5 = counter block, x6 = round keys, x7 = number of rounds,
	 *      w8 = counter value that follows the last block of this round,
	 *      so the four blocks use w8 - 4 .. w8 - 1. In a round with fewer
	 *      than four blocks only the last ones carry data, which matches
	 *      the right-to-left filling of INP0-INP3 by the caller.
	 *      K0-K5 and KK/KL/KM must be loaded (see load_round_keys).
	 * Out: INP0-INP3 xored with the keystream.
	 * Clobbers x10-x13, KS0-KS3 and K6-K7, plus K8-K9 when bit 2 of x7
	 * is set.
	 */
700SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
701	ld1		{KS0.16b}, [x5]			// load upper counter
702	sub		w10, w8, #4
703	sub		w11, w8, #3
704	sub		w12, w8, #2
705	sub		w13, w8, #1
	// the counter word is stored byte-reversed in the block
706	rev		w10, w10
707	rev		w11, w11
708	rev		w12, w12
709	rev		w13, w13
710	mov		KS1.16b, KS0.16b
711	mov		KS2.16b, KS0.16b
712	mov		KS3.16b, KS0.16b
713	ins		KS0.s[3], w10			// set lower counter
714	ins		KS1.s[3], w11
715	ins		KS2.s[3], w12
716	ins		KS3.s[3], w13
717
	// K6/K7 share registers with MASK and XM, so they are loaded afresh
718	add		x10, x6, #96			// round key pointer
719	ld1		{K6.4s-K7.4s}, [x10], #32
720	.irp		key, K0, K1, K2, K3, K4, K5
721	enc_qround	KS0, KS1, KS2, KS3, \key
722	.endr
723
	// bit 2 of the round count clear: fall through to .Lout256, which
	// applies K6 and K7 and then the three final keys
724	tbnz		x7, #2, .Lnot128
725	.subsection	1
726.Lnot128:
727	ld1		{K8.4s-K9.4s}, [x10], #32
728	.irp		key, K6, K7
729	enc_qround	KS0, KS1, KS2, KS3, \key
730	.endr
	// K6/K7 are reused for the next two keys, needed only if bit 1 is set
731	ld1		{K6.4s-K7.4s}, [x10]
732	.irp		key, K8, K9
733	enc_qround	KS0, KS1, KS2, KS3, \key
734	.endr
735	tbz		x7, #1, .Lout192
736	b		.Lout256
737	.previous
738
739.Lout256:
740	.irp		key, K6, K7
741	enc_qround	KS0, KS1, KS2, KS3, \key
742	.endr
743
744.Lout192:
745	enc_qround	KS0, KS1, KS2, KS3, KK
746
	// final round: no aesmc, then add the last round key
747	aese		KS0.16b, KL.16b
748	aese		KS1.16b, KL.16b
749	aese		KS2.16b, KL.16b
750	aese		KS3.16b, KL.16b
751
752	eor		KS0.16b, KS0.16b, KM.16b
753	eor		KS1.16b, KS1.16b, KM.16b
754	eor		KS2.16b, KS2.16b, KM.16b
755	eor		KS3.16b, KS3.16b, KM.16b
756
	// apply the keystream to the data
757	eor		INP0.16b, INP0.16b, KS0.16b
758	eor		INP1.16b, INP1.16b, KS1.16b
759	eor		INP2.16b, INP2.16b, KS2.16b
760	eor		INP3.16b, INP3.16b, KS3.16b
761
762	ret
763SYM_FUNC_END(pmull_gcm_enc_4x)
764
	/*
	 * Source of tbl/tbx index vectors: 16 bytes of 0xff followed by the
	 * indices 0..15, twice over. A 16-byte window read at a variable
	 * offset into the table gives a vector that shifts a register by a
	 * number of bytes. The 0xff entries are out of range, so tbl writes
	 * zero for them and tbx leaves the destination byte unchanged.
	 */
765	.section	".rodata", "a"
766	.align		6
767.Lpermute_table:
768	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
769	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
770	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
771	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
772	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
773	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
774	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
775	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
776	.previous
777