xref: /linux/arch/arm64/crypto/ghash-ce-core.S (revision 3381df0954199458fa3993db72fb427f0ed1e43b)
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
4 *
5 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
6 */
7
8#include <linux/linkage.h>
9#include <asm/assembler.h>
10
/*
 * Symbolic names for the NEON registers used by the GHASH routines.
 *
 * Several names refer to the same physical register: XH and IN1 are both
 * v7, and the names k00_16 ... perm3 (only referenced by the p8 macros)
 * occupy v8-v19, the same registers as XL2 ... HH34 (referenced by the
 * p64 and GCM code).
 */
11	SHASH		.req	v0
12	SHASH2		.req	v1
13	T1		.req	v2
14	T2		.req	v3
15	MASK		.req	v4
16	XM		.req	v5
17	XL		.req	v6
18	XH		.req	v7
19	IN1		.req	v7
20
	/* masks built by __pmull_pre_p8 and used by __pmull_p8_tail */
21	k00_16		.req	v8
22	k32_48		.req	v9
23
	/* scratch registers of the 8-bit polynomial multiply */
24	t3		.req	v10
25	t4		.req	v11
26	t5		.req	v12
27	t6		.req	v13
28	t7		.req	v14
29	t8		.req	v15
30	t9		.req	v16
31
	/* tbl index vectors built by __pmull_pre_p8 */
32	perm1		.req	v17
33	perm2		.req	v18
34	perm3		.req	v19
35
	/* byte-rotated copies of SHASH, precomputed by __pmull_pre_p8 */
36	sh1		.req	v20
37	sh2		.req	v21
38	sh3		.req	v22
39	sh4		.req	v23
40
	/* byte-rotated copies of SHASH2, precomputed by __pmull_pre_p8 */
41	ss1		.req	v24
42	ss2		.req	v25
43	ss3		.req	v26
44	ss4		.req	v27
45
	/* names used by the 4-way p64 code; they overlay v8-v19 above */
46	XL2		.req	v8
47	XM2		.req	v9
48	XH2		.req	v10
49	XL3		.req	v11
50	XM3		.req	v12
51	XH3		.req	v13
52	TT3		.req	v14
53	TT4		.req	v15
54	HH		.req	v16
55	HH3		.req	v17
56	HH4		.req	v18
57	HH34		.req	v19
58
59	.text
60	.arch		armv8-a+crypto
61
/*
 * __pmull_p64 rd, rn, rm
 *
 * Polynomial multiply of the low 64-bit lanes of \rn and \rm, with the
 * 128-bit product written to \rd. Arguments are register names without
 * an arrangement suffix.
 */
62	.macro		__pmull_p64, rd, rn, rm
63	pmull		\rd\().1q, \rn\().1d, \rm\().1d
64	.endm
65
/*
 * __pmull2_p64 rd, rn, rm
 *
 * Same as __pmull_p64 but uses pmull2, i.e. multiplies the high 64-bit
 * lanes of \rn and \rm.
 */
66	.macro		__pmull2_p64, rd, rn, rm
67	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
68	.endm
69
/*
 * __pmull_p8 rq, ad, bd
 *
 * 8-bit PMULL based counterpart of __pmull_p64 (low halves). Places the
 * low 8 bytes of \ad rotated by 1, 2 and 3 bytes in t3, t5 and t7, then
 * hands over to __pmull_p8_\bd, so \bd must be SHASH or SHASH2.
 */
70	.macro		__pmull_p8, rq, ad, bd
71	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
72	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
73	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3
74
75	__pmull_p8_\bd	\rq, \ad
76	.endm
77
/*
 * __pmull2_p8 rq, ad, bd
 *
 * 8-bit PMULL based counterpart of __pmull2_p64 (high halves). The
 * rotated copies of \ad are produced with tbl and the perm1-perm3 index
 * vectors, which __pmull_pre_p8 must have set up. Dispatches to
 * __pmull2_p8_\bd; only a SHASH variant of that macro exists in this file.
 */
78	.macro		__pmull2_p8, rq, ad, bd
79	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
80	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
81	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3
82
83	__pmull2_p8_\bd	\rq, \ad
84	.endm
85
/* Low-half multiply of \ad by SHASH, using the precomputed sh1-sh4. */
86	.macro		__pmull_p8_SHASH, rq, ad
87	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
88	.endm
89
/* Low-half multiply of \ad by SHASH2, using the precomputed ss1-ss4. */
90	.macro		__pmull_p8_SHASH2, rq, ad
91	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
92	.endm
93
/*
 * High-half multiply of \ad by SHASH: passes 16b arrangements and t=2 so
 * that __pmull_p8_tail emits pmull2.
 */
94	.macro		__pmull2_p8_SHASH, rq, ad
95	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
96	.endm
97
/*
 * __pmull_p8_tail rq, ad, bd, nb, t, b1, b2, b3, b4
 *
 * Common body of the 8-bit PMULL based 64x64 polynomial multiply.
 *   rq      - destination register (name only)
 *   ad, bd  - operands A and B, including their arrangement suffix
 *   nb      - arrangement of the multiplicands (8b or 16b)
 *   t       - empty to emit pmull, 2 to emit pmull2
 *   b1..b4  - registers holding B rotated by 1..4 bytes (names only)
 *
 * On entry t3, t5 and t7 must hold A rotated by 1, 2 and 3 bytes (set up
 * by __pmull_p8 / __pmull2_p8). Clobbers t3-t9 and reads the k32_48 and
 * k00_16 masks prepared by __pmull_pre_p8.
 */
98	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	// eight 8x8->16 bit partial product vectors
99	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
100	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
101	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
102	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
103	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
104	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
105	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
106	pmull\t		\rq\().8h, \ad, \bd			// D = A*B
107
108	eor		t3.16b, t3.16b, t4.16b			// L = E + F
109	eor		t5.16b, t5.16b, t6.16b			// M = G + H
110	eor		t7.16b, t7.16b, t8.16b			// N = I + J
111
	// gather the low 64-bit halves of L/M (N/K) in t4 (t6) and the
	// high halves in t3 (t7)
112	uzp1		t4.2d, t3.2d, t5.2d
113	uzp2		t3.2d, t3.2d, t5.2d
114	uzp1		t6.2d, t7.2d, t9.2d
115	uzp2		t7.2d, t7.2d, t9.2d
116
117	// t3 = (L) (P0 + P1) << 8
118	// t5 = (M) (P2 + P3) << 16
119	eor		t4.16b, t4.16b, t3.16b
120	and		t3.16b, t3.16b, k32_48.16b
121
122	// t7 = (N) (P4 + P5) << 24
123	// t9 = (K) (P6 + P7) << 32
124	eor		t6.16b, t6.16b, t7.16b
125	and		t7.16b, t7.16b, k00_16.16b
126
127	eor		t4.16b, t4.16b, t3.16b
128	eor		t6.16b, t6.16b, t7.16b
129
	// undo the uzp1/uzp2 interleave: back to one term per register
130	zip2		t5.2d, t4.2d, t3.2d
131	zip1		t3.2d, t4.2d, t3.2d
132	zip2		t9.2d, t6.2d, t7.2d
133	zip1		t7.2d, t6.2d, t7.2d
134
	// rotate each term by 15/14/13/12 bytes, matching the << 8/16/24/32
	// annotations above
135	ext		t3.16b, t3.16b, t3.16b, #15
136	ext		t5.16b, t5.16b, t5.16b, #14
137	ext		t7.16b, t7.16b, t7.16b, #13
138	ext		t9.16b, t9.16b, t9.16b, #12
139
	// accumulate all terms into the result
140	eor		t3.16b, t3.16b, t5.16b
141	eor		t7.16b, t7.16b, t9.16b
142	eor		\rq\().16b, \rq\().16b, t3.16b
143	eor		\rq\().16b, \rq\().16b, t7.16b
144	.endm
145
/*
 * __pmull_pre_p64
 *
 * One-time setup for the p64 flavour of __pmull_ghash. Expects SHASH to
 * be loaded already and x3 to point at the key; loads HH, HH3 and HH4
 * from x3 + 16.
 * NOTE(review): HH/HH3/HH4 are presumably further powers of the hash
 * key stored after it in struct ghash_key - confirm against the C side.
 *
 * Produces:
 *   SHASH2 - { SHASH.lo ^ SHASH.hi, HH.lo ^ HH.hi }
 *   HH34   - { HH3.lo ^ HH3.hi, HH4.lo ^ HH4.hi }
 *   MASK   - 0xc200000000000000 in both 64-bit lanes
 * Clobbers x8 and T1.
 */
146	.macro		__pmull_pre_p64
147	add		x8, x3, #16
148	ld1		{HH.2d-HH4.2d}, [x8]
149
150	trn1		SHASH2.2d, SHASH.2d, HH.2d
151	trn2		T1.2d, SHASH.2d, HH.2d
152	eor		SHASH2.16b, SHASH2.16b, T1.16b
153
154	trn1		HH34.2d, HH3.2d, HH4.2d
155	trn2		T1.2d, HH3.2d, HH4.2d
156	eor		HH34.16b, HH34.16b, T1.16b
157
	// 0xe1 in every byte, shifted left by 57 within each 64-bit lane
158	movi		MASK.16b, #0xe1
159	shl		MASK.2d, MASK.2d, #57
160	.endm
161
/*
 * __pmull_pre_p8
 *
 * One-time setup for the p8 flavour of __pmull_ghash. Expects SHASH to
 * be loaded already.
 *
 * Produces:
 *   SHASH2       - SHASH.lo ^ SHASH.hi in both 64-bit lanes
 *   k00_16,
 *   k32_48       - the masks spelled out below
 *   perm1-perm3  - tbl index vectors that rotate each 64-bit half of a
 *                  vector by 1, 2 and 3 bytes
 *   sh1-sh4      - SHASH with each 64-bit half rotated by 1..4 bytes
 *   ss1-ss4      - low half of SHASH2 rotated by 1..4 bytes
 * Clobbers x5 and T1.
 */
162	.macro		__pmull_pre_p8
163	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
164	eor		SHASH2.16b, SHASH2.16b, SHASH.16b
165
166	// k00_16 := 0x0000000000000000_000000000000ffff
167	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
168	movi		k32_48.2d, #0xffffffff
169	mov		k32_48.h[2], k32_48.h[0]
170	ushr		k00_16.2d, k32_48.2d, #32
171
172	// prepare the permutation vectors
	// perm1 starts out as 0x080f0e0d0c0b0a09 in both lanes; the eor
	// with 8 in the low eight bytes turns the low lane into indices
	// 1..7,0 while the high lane keeps 9..15,8
173	mov_q		x5, 0x080f0e0d0c0b0a09
174	movi		T1.8b, #8
175	dup		perm1.2d, x5
176	eor		perm1.16b, perm1.16b, T1.16b
	// each ushr/sli pair rotates the 64-bit lanes of perm1 right by
	// 8, 16 and 24 bits, giving the 2, 3 and 4 byte rotations
177	ushr		perm2.2d, perm1.2d, #8
178	ushr		perm3.2d, perm1.2d, #16
179	ushr		T1.2d, perm1.2d, #24
180	sli		perm2.2d, perm1.2d, #56
181	sli		perm3.2d, perm1.2d, #48
182	sli		T1.2d, perm1.2d, #40
183
184	// precompute loop invariants
185	tbl		sh1.16b, {SHASH.16b}, perm1.16b
186	tbl		sh2.16b, {SHASH.16b}, perm2.16b
187	tbl		sh3.16b, {SHASH.16b}, perm3.16b
188	tbl		sh4.16b, {SHASH.16b}, T1.16b
189	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
190	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
191	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
192	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
193	.endm
194
195	//
196	// PMULL (64x64->128) based reduction for CPUs that can do
197	// it in a single instruction.
	//
	// Operates on the product held in XL, XM and XH. Reads T1, which the
	// callers in this file set with "ext T1, XL, XH, #8", and MASK.
	// Clobbers T2. The callers complete the reduction afterwards by
	// folding T2 and XH into XL with two eor instructions.
198	//
199	.macro		__pmull_reduce_p64
200	pmull		T2.1q, XL.1d, MASK.1d
201	eor		XM.16b, XM.16b, T1.16b
202
203	mov		XH.d[0], XM.d[1]
204	mov		XM.d[1], XL.d[0]
205
206	eor		XL.16b, XM.16b, T2.16b
207	ext		T2.16b, XL.16b, XL.16b, #8
208	pmull		XL.1q, XL.1d, MASK.1d
209	.endm
210
211	//
212	// Alternative reduction for CPUs that lack support for the
213	// 64x64->128 PMULL instruction
	//
	// Same register interface as __pmull_reduce_p64, but built from
	// shifts and eor only and without using MASK: operates on XL, XM and
	// XH, reads T1 as set by the caller, clobbers T1 and T2, and leaves
	// the final two eor instructions to the caller.
214	//
215	.macro		__pmull_reduce_p8
216	eor		XM.16b, XM.16b, T1.16b
217
218	mov		XL.d[1], XM.d[0]
219	mov		XH.d[0], XM.d[1]
220
221	shl		T1.2d, XL.2d, #57
222	shl		T2.2d, XL.2d, #62
223	eor		T2.16b, T2.16b, T1.16b
224	shl		T1.2d, XL.2d, #63
225	eor		T2.16b, T2.16b, T1.16b
226	ext		T1.16b, XL.16b, XH.16b, #8
227	eor		T2.16b, T2.16b, T1.16b
228
229	mov		XL.d[1], T2.d[0]
230	mov		XH.d[0], T2.d[1]
231
232	ushr		T2.2d, XL.2d, #1
233	eor		XH.16b, XH.16b, XL.16b
234	eor		XL.16b, XL.16b, T2.16b
235	ushr		T2.2d, T2.2d, #6
236	ushr		XL.2d, XL.2d, #1
237	.endm
238
/*
 * __pmull_ghash pn
 *
 * Body of pmull_ghash_update_{p64,p8}; \pn selects which of the
 * __pmull*_\pn helper macros are used.
 *
 * Register use, following the pmull_ghash_update() prototype:
 *   w0 - number of blocks in src
 *   x1 - dg[]: digest, loaded into XL on entry and stored back on exit
 *   x2 - src, advanced as blocks are consumed
 *   x3 - key: SHASH is loaded from [x3]
 *   x4 - head: optional extra block processed before src (may be NULL)
 *
 * If no head block is supplied and w0 is zero there is nothing to hash:
 * the digest is written back unchanged instead of entering the block
 * loops with a zero count (which would run the counter negative and read
 * far beyond src).
 *
 * The p64 flavour processes single blocks until the remaining count is a
 * multiple of four and then switches to a 4-blocks-per-iteration loop.
 */
239	.macro		__pmull_ghash, pn
240	ld1		{SHASH.2d}, [x3]
241	ld1		{XL.2d}, [x1]
242
243	__pmull_pre_\pn
244
245	/* do the head block first, if supplied */
246	cbz		x4, 6f
247	ld1		{T1.2d}, [x4]
248	mov		x4, xzr
249	b		3f
250
	/* no head block: bail out early if there are no blocks either */
6:	cbz		w0, 5f

2510:	.ifc		\pn, p64
252	tbnz		w0, #0, 2f		// skip until #blocks is a
253	tbnz		w0, #1, 2f		// round multiple of 4
254
	/*
	 * 4-way loop. The blocks are loaded as bytes, so rev64 is applied
	 * unconditionally here. T1/T2/TT3/TT4 hold the first/second/third/
	 * fourth block in memory order; they are multiplied by HH4/HH3/HH/
	 * SHASH respectively, with the running digest XL folded into the
	 * first block.
	 */
2551:	ld1		{XM3.16b-TT4.16b}, [x2], #64
256
257	sub		w0, w0, #4
258
259	rev64		T1.16b, XM3.16b
260	rev64		T2.16b, XH3.16b
261	rev64		TT4.16b, TT4.16b
262	rev64		TT3.16b, TT3.16b
263
264	ext		IN1.16b, TT4.16b, TT4.16b, #8
265	ext		XL3.16b, TT3.16b, TT3.16b, #8
266
	// fourth block * SHASH
267	eor		TT4.16b, TT4.16b, IN1.16b
268	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
269	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
270	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)
271
	// third block * HH
272	eor		TT3.16b, TT3.16b, XL3.16b
273	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
274	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
275	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)
276
277	ext		IN1.16b, T2.16b, T2.16b, #8
278	eor		XL2.16b, XL2.16b, XL3.16b
279	eor		XH2.16b, XH2.16b, XH3.16b
280	eor		XM2.16b, XM2.16b, XM3.16b
281
	// second block * HH3
282	eor		T2.16b, T2.16b, IN1.16b
283	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
284	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
285	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)
286
287	eor		XL2.16b, XL2.16b, XL3.16b
288	eor		XH2.16b, XH2.16b, XH3.16b
289	eor		XM2.16b, XM2.16b, XM3.16b
290
	// (first block ^ digest) * HH4
291	ext		IN1.16b, T1.16b, T1.16b, #8
292	ext		TT3.16b, XL.16b, XL.16b, #8
293	eor		XL.16b, XL.16b, IN1.16b
294	eor		T1.16b, T1.16b, TT3.16b
295
296	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
297	eor		T1.16b, T1.16b, XL.16b
298	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
299	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)
300
	// add the three accumulated products
301	eor		XL.16b, XL.16b, XL2.16b
302	eor		XH.16b, XH.16b, XH2.16b
303	eor		XM.16b, XM.16b, XM2.16b
304
305	eor		T2.16b, XL.16b, XH.16b
306	ext		T1.16b, XL.16b, XH.16b, #8
307	eor		XM.16b, XM.16b, T2.16b
308
309	__pmull_reduce_p64
310
311	eor		T2.16b, T2.16b, XH.16b
312	eor		XL.16b, XL.16b, T2.16b
313
314	cbz		w0, 5f
315	b		1b
316	.endif
317
	/* single block path: loaded as .2d, so only LE needs the rev64 */
3182:	ld1		{T1.2d}, [x2], #16
319	sub		w0, w0, #1
320
3213:	/* multiply XL by SHASH in GF(2^128) */
322CPU_LE(	rev64		T1.16b, T1.16b	)
323
324	ext		T2.16b, XL.16b, XL.16b, #8
325	ext		IN1.16b, T1.16b, T1.16b, #8
326	eor		T1.16b, T1.16b, T2.16b
327	eor		XL.16b, XL.16b, IN1.16b
328
329	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
330	eor		T1.16b, T1.16b, XL.16b
331	__pmull_\pn 	XL, XL, SHASH			// a0 * b0
332	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)
333
	/* label 4 is not referenced from within this macro */
3344:	eor		T2.16b, XL.16b, XH.16b
335	ext		T1.16b, XL.16b, XH.16b, #8
336	eor		XM.16b, XM.16b, T2.16b
337
338	__pmull_reduce_\pn
339
340	eor		T2.16b, T2.16b, XH.16b
341	eor		XL.16b, XL.16b, T2.16b
342
343	cbnz		w0, 0b
344
	/* write the updated digest back */
3455:	st1		{XL.2d}, [x1]
346	ret
347	.endm
348
349	/*
350	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
351	 *			   struct ghash_key const *k, const char *head)
	 *
	 * Arguments arrive in w0 (blocks), x1 (dg), x2 (src), x3 (k) and
	 * x4 (head). This is the variant built on the 64x64->128 PMULL
	 * instruction; it is an expansion of __pmull_ghash with pn = p64.
352	 */
353SYM_FUNC_START(pmull_ghash_update_p64)
354	__pmull_ghash	p64
355SYM_FUNC_END(pmull_ghash_update_p64)
356
/*
 * Variant of pmull_ghash_update() that only relies on the 8-bit PMULL
 * form; an expansion of __pmull_ghash with pn = p8. Takes its arguments
 * in the same registers as pmull_ghash_update_p64.
 */
357SYM_FUNC_START(pmull_ghash_update_p8)
358	__pmull_ghash	p8
359SYM_FUNC_END(pmull_ghash_update_p8)
360
/*
 * Register names used by the GCM code.
 *
 * These overlay registers that already carry GHASH names: KS0-KS3 are
 * v8-v11 (XL2, XM2, XH2, XL3), K4/K5 are v12/v13 (XM3, XH3), K6/K7 are
 * v4/v5 (MASK, XM) and K8/K9 are v14/v15 (TT3, TT4). Correspondingly,
 * pmull_gcm_ghash_4x rebuilds MASK on entry, and pmull_gcm_enc_4x and
 * enc_block load K6-K9 each time they run.
 */
	/* keystream / counter blocks */
361	KS0		.req	v8
362	KS1		.req	v9
363	KS2		.req	v10
364	KS3		.req	v11
365
	/* data blocks being encrypted or decrypted */
366	INP0		.req	v21
367	INP1		.req	v22
368	INP2		.req	v23
369	INP3		.req	v24
370
	/* AES round keys; KK, KL and KM hold the last three round keys */
371	K0		.req	v25
372	K1		.req	v26
373	K2		.req	v27
374	K3		.req	v28
375	K4		.req	v12
376	K5		.req	v13
377	K6		.req	v4
378	K7		.req	v5
379	K8		.req	v14
380	K9		.req	v15
381	KK		.req	v29
382	KL		.req	v30
383	KM		.req	v31
384
/*
 * load_round_keys rounds, rk, tmp
 *
 * Load the round keys that stay resident across the GCM routines:
 *   K0-K3    from \rk
 *   K4-K5    from \rk + 64
 *   KK,KL,KM from \rk + 16 * \rounds - 32, i.e. the three 16-byte
 *            entries ending with entry number \rounds
 * \rounds and \rk are 64-bit registers; \tmp is a scratch register that
 * is clobbered.
 */
385	.macro		load_round_keys, rounds, rk, tmp
386	add		\tmp, \rk, #64
387	ld1		{K0.4s-K3.4s}, [\rk]
388	ld1		{K4.4s-K5.4s}, [\tmp]
389	add		\tmp, \rk, \rounds, lsl #4
390	sub		\tmp, \tmp, #32
391	ld1		{KK.4s-KM.4s}, [\tmp]
392	.endm
393
/*
 * enc_round state, key
 *
 * One full AES encryption round on \state using round key \key: aese
 * followed by aesmc. Both arguments are register names without an
 * arrangement suffix.
 */
394	.macro		enc_round, state, key
395	aese		\state\().16b, \key\().16b
396	aesmc		\state\().16b, \state\().16b
397	.endm
398
/*
 * enc_qround s0, s1, s2, s3, key
 *
 * Run enc_round with the same round key \key on each of the four state
 * registers \s0 ... \s3, in that order.
 */
399	.macro		enc_qround, s0, s1, s2, s3, key
	.irp		reg, \s0, \s1, \s2, \s3
	enc_round	\reg, \key
	.endr
404	.endm
405
/*
 * enc_block state, rounds, rk, tmp
 *
 * AES-encrypt the single block in \state.
 *   state  - register name (no arrangement suffix), encrypted in place
 *   rounds - register holding the number of rounds; bit 2 and bit 1 are
 *            tested to choose between the three code paths below
 *            (NOTE(review): this matches rounds being 10, 12 or 14 -
 *            confirm callers never pass anything else)
 *   rk     - register pointing at the round keys
 *   tmp    - scratch register, clobbered
 *
 * K0-K5 and KK/KL/KM must already be loaded (see load_round_keys);
 * K6-K9 are loaded here from \rk + 96 onwards.
 */
406	.macro		enc_block, state, rounds, rk, tmp
407	add		\tmp, \rk, #96
408	ld1		{K6.4s-K7.4s}, [\tmp], #32
409	.irp		key, K0, K1, K2, K3, K4, K5
410	enc_round	\state, \key
411	.endr
412
413	tbnz		\rounds, #2, .Lnot128_\@
	/* fall through when bit 2 of \rounds is clear */
414.Lout256_\@:
415	enc_round	\state, K6
416	enc_round	\state, K7
417
	/* last three round keys: full round, then aese without aesmc + eor */
418.Lout192_\@:
419	enc_round	\state, KK
420	aese		\state\().16b, KL.16b
421	eor		\state\().16b, \state\().16b, KM.16b
422
	/* out of line: extra rounds taken when bit 2 of \rounds is set */
423	.subsection	1
424.Lnot128_\@:
425	ld1		{K8.4s-K9.4s}, [\tmp], #32
426	enc_round	\state, K6
427	enc_round	\state, K7
428	ld1		{K6.4s-K7.4s}, [\tmp]
429	enc_round	\state, K8
430	enc_round	\state, K9
	/* bit 1 clear: done with the extra rounds; set: two more via K6/K7 */
431	tbz		\rounds, #1, .Lout192_\@
432	b		.Lout256_\@
433	.previous
434	.endm
435
/*
 * pmull_gcm_do_crypt enc
 *
 * Body of pmull_gcm_encrypt (enc = 1) and pmull_gcm_decrypt (enc = 0).
 *
 * Inputs as consumed by the code below:
 *   x0      - number of bytes to process; zero skips straight to the tag
 *   x1      - dst
 *   x2      - src
 *   x3      - hash key: SHASH at [x3], HH/HH3/HH4 in the following 48 bytes
 *   x4      - dg[]: GHASH digest, loaded into XL
 *   x5      - ctr: 16-byte counter block, low counter word at offset 12
 *   x6      - AES round keys
 *   x7      - number of AES rounds
 *   [sp]    - on entry, the doubleword at the caller's sp is read as the
 *             tag pointer (NOTE(review): i.e. a ninth, stack-passed
 *             argument - confirm against the C prototype)
 *
 * If the tag pointer is NULL, the updated counter word and digest are
 * written back to ctr/dg. Otherwise the block at the tag pointer (the
 * lengths) is hashed in and the final tag is stored there; ctr and dg
 * are not written in that case.
 *
 * The ciphertext is what gets hashed: when encrypting, GHASH runs after
 * the keystream XOR, when decrypting it runs before.
 *
 * Clobbers x0-x3 and x8-x17; x19 is saved and restored on the stack.
 */
436	.align		6
437	.macro		pmull_gcm_do_crypt, enc
438	stp		x29, x30, [sp, #-32]!
439	mov		x29, sp
440	str		x19, [sp, #24]
441
442	load_round_keys	x7, x6, x8
443
444	ld1		{SHASH.2d}, [x3], #16
445	ld1		{HH.2d-HH4.2d}, [x3]
446
	// SHASH2 = { SHASH.lo ^ SHASH.hi, HH.lo ^ HH.hi }
447	trn1		SHASH2.2d, SHASH.2d, HH.2d
448	trn2		T1.2d, SHASH.2d, HH.2d
449	eor		SHASH2.16b, SHASH2.16b, T1.16b
450
	// HH34 = { HH3.lo ^ HH3.hi, HH4.lo ^ HH4.hi }
451	trn1		HH34.2d, HH3.2d, HH4.2d
452	trn2		T1.2d, HH3.2d, HH4.2d
453	eor		HH34.16b, HH34.16b, T1.16b
454
455	ld1		{XL.2d}, [x4]
456
457	cbz		x0, 3f				// tag only?
458
459	ldr		w8, [x5, #12]			// load lower counter
460CPU_LE(	rev		w8, w8		)
461
	/*
	 * Main loop: up to four blocks per iteration. w9 is the number of
	 * blocks handled in this iteration, and w8 is advanced by that
	 * amount before the keystream is generated.
	 */
4620:	mov		w9, #4				// max blocks per round
463	add		x10, x0, #0xf
464	lsr		x10, x10, #4			// remaining blocks
465
466	subs		x0, x0, #64
467	csel		w9, w10, w9, mi
468	add		w8, w8, w9
469
470	bmi		1f
471	ld1		{INP0.16b-INP3.16b}, [x2], #64
472	.subsection	1
473	/*
474	 * Populate the four input registers right to left with up to 63 bytes
475	 * of data, using overlapping loads to avoid branches.
476	 *
477	 *                INP0     INP1     INP2     INP3
478	 *  1 byte     |        |        |        |x       |
479	 * 16 bytes    |        |        |        |xxxxxxxx|
480	 * 17 bytes    |        |        |xxxxxxxx|x       |
481	 * 47 bytes    |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
482	 * etc etc
483	 *
484	 * Note that this code may read up to 15 bytes before the start of
485	 * the input. It is up to the calling code to ensure this is safe if
486	 * this happens in the first iteration of the loop (i.e., when the
487	 * input size is < 16 bytes)
	 *
	 * x19 is the number of bytes in the final block (1..16). x14, x15
	 * and x16 are the post-increments applied after loading INP0, INP1
	 * and INP2; each is zero when the corresponding block is absent.
	 * When at most one block remains, x1 and x2 are moved back by
	 * 16 - x19 so that the final 16-byte access ends at the end of the
	 * data.
488	 */
4891:	mov		x15, #16
490	ands		x19, x0, #0xf
491	csel		x19, x19, x15, ne
492	adr_l		x17, .Lpermute_table + 16
493
494	sub		x11, x15, x19
495	add		x12, x17, x11
496	sub		x17, x17, x11
497	ld1		{T1.16b}, [x12]
498	sub		x10, x1, x11
499	sub		x11, x2, x11
500
501	cmp		x0, #-16
502	csel		x14, x15, xzr, gt
503	cmp		x0, #-32
504	csel		x15, x15, xzr, gt
505	cmp		x0, #-48
506	csel		x16, x19, xzr, gt
507	csel		x1, x1, x10, gt
508	csel		x2, x2, x11, gt
509
510	ld1		{INP0.16b}, [x2], x14
511	ld1		{INP1.16b}, [x2], x15
512	ld1		{INP2.16b}, [x2], x16
513	ld1		{INP3.16b}, [x2]
514	tbl		INP3.16b, {INP3.16b}, T1.16b
515	b		2f
516	.previous
517
5182:	.if		\enc == 0
519	bl		pmull_gcm_ghash_4x
520	.endif
521
522	bl		pmull_gcm_enc_4x
523
	// x0 negative: this was a partial round, finish at label 6
524	tbnz		x0, #63, 6f
525	st1		{INP0.16b-INP3.16b}, [x1], #64
526	.if		\enc == 1
527	bl		pmull_gcm_ghash_4x
528	.endif
	// still using the flags of the subs above; neither helper routine
	// contains a flag-setting instruction
529	bne		0b
530
	// restore x19 and fetch the doubleword just above this function's
	// 32-byte frame into x10 (the tag pointer)
5313:	ldp		x19, x10, [sp, #24]
532	cbz		x10, 5f				// output tag?
533
534	ld1		{INP3.16b}, [x10]		// load lengths[]
535	mov		w9, #1
536	bl		pmull_gcm_ghash_4x
537
	// encrypt the counter block with its last word set to 1
538	mov		w11, #(0x1 << 24)		// BE '1U'
539	ld1		{KS0.16b}, [x5]
540	mov		KS0.s[3], w11
541
542	enc_block	KS0, x7, x6, x12
543
544	ext		XL.16b, XL.16b, XL.16b, #8
545	rev64		XL.16b, XL.16b
546	eor		XL.16b, XL.16b, KS0.16b
547	st1		{XL.16b}, [x10]			// store tag
548
5494:	ldp		x29, x30, [sp], #32
550	ret
551
	// no tag requested: write back the counter and the digest
5525:
553CPU_LE(	rev		w8, w8		)
554	str		w8, [x5, #12]			// store lower counter
555	st1		{XL.2d}, [x4]
556	b		4b
557
	// partial round: store the output right-aligned, mirroring the
	// overlapping loads done at label 1
5586:	ld1		{T1.16b-T2.16b}, [x17], #32	// permute vectors
559	sub		x17, x17, x19, lsl #1
560
561	cmp		w9, #1
562	beq		7f
563	.subsection	1
	// single block: merge the new bytes into the 16 bytes currently at
	// [x1] so that whatever precedes the output is stored back as read
5647:	ld1		{INP2.16b}, [x1]
565	tbx		INP2.16b, {INP3.16b}, T1.16b
566	mov		INP3.16b, INP2.16b
567	b		8f
568	.previous
569
570	st1		{INP0.16b}, [x1], x14
571	st1		{INP1.16b}, [x1], x15
572	st1		{INP2.16b}, [x1], x16
573	tbl		INP3.16b, {INP3.16b}, T1.16b
574	tbx		INP3.16b, {INP2.16b}, T2.16b
5758:	st1		{INP3.16b}, [x1]
576
577	.if		\enc == 1
578	ld1		{T1.16b}, [x17]
579	tbl		INP3.16b, {INP3.16b}, T1.16b	// clear non-data bits
580	bl		pmull_gcm_ghash_4x
581	.endif
582	b		3b
583	.endm
584
585	/*
586	 * void pmull_gcm_encrypt(int bytes, u8 dst[], const u8 src[],
587	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
588	 *			  u32 const rk[], int rounds, u8 tag[])
	 *
	 * As consumed by pmull_gcm_do_crypt: x0 is a byte count (not a
	 * block count), x6 points at the AES round keys, x7 holds the
	 * number of rounds, and the tag pointer is read from the stack.
	 * A NULL tag pointer makes the routine write back ctr and dg instead
	 * of producing the tag.
	 * NOTE(review): the C types shown for bytes, rk and tag are inferred
	 * from how the registers are used here - confirm against the C
	 * declaration.
589	 */
590SYM_FUNC_START(pmull_gcm_encrypt)
591	pmull_gcm_do_crypt	1
592SYM_FUNC_END(pmull_gcm_encrypt)
593
594	/*
595	 * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
596	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
597	 *			  int rounds, u8 tag)
598	 */
599SYM_FUNC_START(pmull_gcm_decrypt)
600	pmull_gcm_do_crypt	0
601SYM_FUNC_END(pmull_gcm_decrypt)
602
/*
 * pmull_gcm_ghash_4x
 *
 * Fold up to four blocks into the GHASH digest held in XL.
 *   w9         - number of blocks (expected to be 1..4)
 *   INP0-INP3  - the blocks; with fewer than four, only the last w9
 *                registers are used (INP3 for one block, INP2-INP3 for
 *                two, INP1-INP3 for three)
 *   XL         - digest, updated in place
 * Uses SHASH, HH, HH3, HH4, SHASH2 and HH34 as prepared by
 * pmull_gcm_do_crypt, and rebuilds MASK on entry. The digest is always
 * folded into the first block that is processed, and that block is
 * multiplied by the key power matching the block count.
 * Clobbers T1, T2, TT3, TT4, XL2, XM2, XH2, XM and XH/IN1. Contains no
 * flag-setting instructions.
 */
602SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
603	movi		MASK.16b, #0xe1
604	shl		MASK.2d, MASK.2d, #57
605
606	rev64		T1.16b, INP0.16b
607	rev64		T2.16b, INP1.16b
608	rev64		TT3.16b, INP2.16b
609	rev64		TT4.16b, INP3.16b
610
611	ext		XL.16b, XL.16b, XL.16b, #8
612
613	tbz		w9, #2, 0f			// <4 blocks?
614	.subsection	1
	// fewer than four blocks: clear the accumulators and jump into the
	// middle of the 4-block sequence
6150:	movi		XH2.16b, #0
616	movi		XM2.16b, #0
617	movi		XL2.16b, #0
618
619	tbz		w9, #0, 1f			// 2 blocks?
620	tbz		w9, #1, 2f			// 1 block?
621
	// three blocks: digest goes into the INP1 block
622	eor		T2.16b, T2.16b, XL.16b
623	ext		T1.16b, T2.16b, T2.16b, #8
624	b		.Lgh3
625
6261:	eor		TT3.16b, TT3.16b, XL.16b
627	ext		T2.16b, TT3.16b, TT3.16b, #8
628	b		.Lgh2
629
6302:	eor		TT4.16b, TT4.16b, XL.16b
631	ext		IN1.16b, TT4.16b, TT4.16b, #8
632	b		.Lgh1
633	.previous
634
	// four blocks: (INP0 block ^ digest) * HH4
635	eor		T1.16b, T1.16b, XL.16b
636	ext		IN1.16b, T1.16b, T1.16b, #8
637
638	pmull2		XH2.1q, HH4.2d, IN1.2d		// a1 * b1
639	eor		T1.16b, T1.16b, IN1.16b
640	pmull		XL2.1q, HH4.1d, IN1.1d		// a0 * b0
641	pmull2		XM2.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)
642
	// INP1 block * HH3
643	ext		T1.16b, T2.16b, T2.16b, #8
644.Lgh3:	eor		T2.16b, T2.16b, T1.16b
645	pmull2		XH.1q, HH3.2d, T1.2d		// a1 * b1
646	pmull		XL.1q, HH3.1d, T1.1d		// a0 * b0
647	pmull		XM.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)
648
649	eor		XH2.16b, XH2.16b, XH.16b
650	eor		XL2.16b, XL2.16b, XL.16b
651	eor		XM2.16b, XM2.16b, XM.16b
652
	// INP2 block * HH
653	ext		T2.16b, TT3.16b, TT3.16b, #8
654.Lgh2:	eor		TT3.16b, TT3.16b, T2.16b
655	pmull2		XH.1q, HH.2d, T2.2d		// a1 * b1
656	pmull		XL.1q, HH.1d, T2.1d		// a0 * b0
657	pmull2		XM.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)
658
659	eor		XH2.16b, XH2.16b, XH.16b
660	eor		XL2.16b, XL2.16b, XL.16b
661	eor		XM2.16b, XM2.16b, XM.16b
662
	// INP3 block * SHASH. IN1 and XH are the same register, so the
	// pmull2 that writes XH has to be the last one to read IN1.
663	ext		IN1.16b, TT4.16b, TT4.16b, #8
664.Lgh1:	eor		TT4.16b, TT4.16b, IN1.16b
665	pmull		XL.1q, SHASH.1d, IN1.1d		// a0 * b0
666	pmull2		XH.1q, SHASH.2d, IN1.2d		// a1 * b1
667	pmull		XM.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)
668
669	eor		XH.16b, XH.16b, XH2.16b
670	eor		XL.16b, XL.16b, XL2.16b
671	eor		XM.16b, XM.16b, XM2.16b
672
673	eor		T2.16b, XL.16b, XH.16b
674	ext		T1.16b, XL.16b, XH.16b, #8
675	eor		XM.16b, XM.16b, T2.16b
676
677	__pmull_reduce_p64
678
679	eor		T2.16b, T2.16b, XH.16b
680	eor		XL.16b, XL.16b, T2.16b
681
682	ret
683SYM_FUNC_END(pmull_gcm_ghash_4x)
685
/*
 * pmull_gcm_enc_4x
 *
 * Generate four blocks of AES-CTR keystream and XOR them into INP0-INP3.
 *   x5  - counter block; its last 32-bit word is replaced per block
 *   w8  - counter value following the last block of this round: the four
 *         blocks use w8-4 ... w8-1, byte-reversed before insertion
 *   x6  - AES round keys
 *   x7  - number of rounds; bits 2 and 1 select the code path
 * K0-K5 and KK/KL/KM must have been loaded by load_round_keys; K6-K9 are
 * loaded here. Clobbers x10-x13, KS0-KS3 and K6-K9. Contains no
 * flag-setting instructions.
 */
685SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
686	ld1		{KS0.16b}, [x5]			// load upper counter
687	sub		w10, w8, #4
688	sub		w11, w8, #3
689	sub		w12, w8, #2
690	sub		w13, w8, #1
691	rev		w10, w10
692	rev		w11, w11
693	rev		w12, w12
694	rev		w13, w13
695	mov		KS1.16b, KS0.16b
696	mov		KS2.16b, KS0.16b
697	mov		KS3.16b, KS0.16b
698	ins		KS0.s[3], w10			// set lower counter
699	ins		KS1.s[3], w11
700	ins		KS2.s[3], w12
701	ins		KS3.s[3], w13
702
703	add		x10, x6, #96			// round key pointer
704	ld1		{K6.4s-K7.4s}, [x10], #32
705	.irp		key, K0, K1, K2, K3, K4, K5
706	enc_qround	KS0, KS1, KS2, KS3, \key
707	.endr
708
709	tbnz		x7, #2, .Lnot128
710	.subsection	1
	// out of line: extra rounds taken when bit 2 of x7 is set
711.Lnot128:
712	ld1		{K8.4s-K9.4s}, [x10], #32
713	.irp		key, K6, K7
714	enc_qround	KS0, KS1, KS2, KS3, \key
715	.endr
716	ld1		{K6.4s-K7.4s}, [x10]
717	.irp		key, K8, K9
718	enc_qround	KS0, KS1, KS2, KS3, \key
719	.endr
720	tbz		x7, #1, .Lout192
721	b		.Lout256
722	.previous
723
724.Lout256:
725	.irp		key, K6, K7
726	enc_qround	KS0, KS1, KS2, KS3, \key
727	.endr
728
	// last three round keys: full round, then aese without aesmc + eor
729.Lout192:
730	enc_qround	KS0, KS1, KS2, KS3, KK
731
732	aese		KS0.16b, KL.16b
733	aese		KS1.16b, KL.16b
734	aese		KS2.16b, KL.16b
735	aese		KS3.16b, KL.16b
736
737	eor		KS0.16b, KS0.16b, KM.16b
738	eor		KS1.16b, KS1.16b, KM.16b
739	eor		KS2.16b, KS2.16b, KM.16b
740	eor		KS3.16b, KS3.16b, KM.16b
741
	// apply the keystream to the data
742	eor		INP0.16b, INP0.16b, KS0.16b
743	eor		INP1.16b, INP1.16b, KS1.16b
744	eor		INP2.16b, INP2.16b, KS2.16b
745	eor		INP3.16b, INP3.16b, KS3.16b
746
747	ret
748SYM_FUNC_END(pmull_gcm_enc_4x)
750
/*
 * Source of the tbl/tbx index vectors used by pmull_gcm_do_crypt for the
 * final, partial block: 16 bytes of 0xff followed by the indices 0..15,
 * repeated twice. The code loads 16-byte windows at byte offsets derived
 * from the size of the last block, so a window contains a run of 0xff
 * entries next to a run of consecutive indices.
 */
750	.section	".rodata", "a"
751	.align		6
752.Lpermute_table:
753	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
754	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
755	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
756	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
757	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
758	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
759	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
760	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
761	.previous
763