xref: /linux/lib/crypto/arm/ghash-neon-core.S (revision 0fc8f6200d2313278fbf4539bbab74677c685531)
1*71e59795SEric Biggers/* SPDX-License-Identifier: GPL-2.0-only */
2*71e59795SEric Biggers/*
3*71e59795SEric Biggers * Accelerated GHASH implementation with NEON vmull.p8 instructions.
4*71e59795SEric Biggers *
5*71e59795SEric Biggers * Copyright (C) 2015 - 2017 Linaro Ltd.
6*71e59795SEric Biggers * Copyright (C) 2023 Google LLC. <ardb@google.com>
7*71e59795SEric Biggers */
8*71e59795SEric Biggers
9*71e59795SEric Biggers#include <linux/linkage.h>
10*71e59795SEric Biggers#include <asm/assembler.h>
11*71e59795SEric Biggers
12*71e59795SEric Biggers	.fpu		neon
13*71e59795SEric Biggers
	/*
	 * Register aliases used throughout this file. A q register is the
	 * pair of d registers (d2n, d2n+1), so every _L/_H (or l/h) name
	 * below is the low/high 64-bit half of the q alias with the same stem.
	 */
14*71e59795SEric Biggers	SHASH		.req	q0	@ hash key, loaded from [r3]
15*71e59795SEric Biggers	T1		.req	q1	@ input block, then scratch
16*71e59795SEric Biggers	XL		.req	q2	@ accumulator from [r1] / low product
17*71e59795SEric Biggers	XM		.req	q3	@ middle product
18*71e59795SEric Biggers	XH		.req	q4	@ high product
19*71e59795SEric Biggers	IN1		.req	q4	@ same register as XH
20*71e59795SEric Biggers
21*71e59795SEric Biggers	SHASH_L		.req	d0
22*71e59795SEric Biggers	SHASH_H		.req	d1
23*71e59795SEric Biggers	T1_L		.req	d2
24*71e59795SEric Biggers	T1_H		.req	d3
25*71e59795SEric Biggers	XL_L		.req	d4
26*71e59795SEric Biggers	XL_H		.req	d5
27*71e59795SEric Biggers	XM_L		.req	d6
28*71e59795SEric Biggers	XM_H		.req	d7
29*71e59795SEric Biggers	XH_L		.req	d8
30*71e59795SEric Biggers
	/* Scratch registers overwritten by every __pmull_p8 expansion. */
31*71e59795SEric Biggers	t0l		.req	d10
32*71e59795SEric Biggers	t0h		.req	d11
33*71e59795SEric Biggers	t1l		.req	d12
34*71e59795SEric Biggers	t1h		.req	d13
35*71e59795SEric Biggers	t2l		.req	d14
36*71e59795SEric Biggers	t2h		.req	d15
37*71e59795SEric Biggers	t3l		.req	d16
38*71e59795SEric Biggers	t3h		.req	d17
39*71e59795SEric Biggers	t4l		.req	d18
40*71e59795SEric Biggers	t4h		.req	d19
41*71e59795SEric Biggers
42*71e59795SEric Biggers	t0q		.req	q5
43*71e59795SEric Biggers	t1q		.req	q6
44*71e59795SEric Biggers	t2q		.req	q7
45*71e59795SEric Biggers	t3q		.req	q8
46*71e59795SEric Biggers	t4q		.req	q9
47*71e59795SEric Biggers
	/*
	 * sNl / sNh hold SHASH_L / SHASH_H passed through vext.8 with a byte
	 * offset of N; they are filled in once by pmull_ghash_update_p8.
	 */
48*71e59795SEric Biggers	s1l		.req	d20
49*71e59795SEric Biggers	s1h		.req	d21
50*71e59795SEric Biggers	s2l		.req	d22
51*71e59795SEric Biggers	s2h		.req	d23
52*71e59795SEric Biggers	s3l		.req	d24
53*71e59795SEric Biggers	s3h		.req	d25
54*71e59795SEric Biggers	s4l		.req	d26
55*71e59795SEric Biggers	s4h		.req	d27
56*71e59795SEric Biggers
57*71e59795SEric Biggers	SHASH2_p8	.req	d28	@ SHASH_L ^ SHASH_H
58*71e59795SEric Biggers
	/* Constant masks read by __pmull_p8; set by pmull_ghash_update_p8. */
59*71e59795SEric Biggers	k16		.req	d29	@ 0x000000000000ffff
60*71e59795SEric Biggers	k32		.req	d30	@ 0x00000000ffffffff
61*71e59795SEric Biggers	k48		.req	d31	@ 0x0000ffffffffffff
62*71e59795SEric Biggers
63*71e59795SEric Biggers	T2		.req	q7	@ same register as t2q
64*71e59795SEric Biggers
65*71e59795SEric Biggers	.text
66*71e59795SEric Biggers
67*71e59795SEric Biggers	/*
68*71e59795SEric Biggers	 * This implementation of 64x64 -> 128 bit polynomial multiplication
69*71e59795SEric Biggers	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
70*71e59795SEric Biggers	 * "Fast Software Polynomial Multiplication on ARM Processors Using
71*71e59795SEric Biggers	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
72*71e59795SEric Biggers	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
73*71e59795SEric Biggers	 *
74*71e59795SEric Biggers	 * It has been slightly tweaked for in-order performance, and to allow
75*71e59795SEric Biggers	 * 'rq' to overlap with 'ad' or 'bd'.
	 *
	 * Operands:
	 *   rq     - q register that receives the product
	 *   ad, bd - d registers holding the two 64-bit multiplicands (A, B)
	 *   b1..b4 - d registers holding B1..B4, i.e. the result of
	 *            vext.8 bd, bd, #n for n = 1..4. When an operand is left
	 *            at its default (t4l or t3l) the value is computed here;
	 *            any other register name is used as is and must already
	 *            contain that value.
	 *
	 * The macro reads the k16, k32 and k48 masks and overwrites t0q-t4q.
	 * The multiply that writes 'rq' is the last instruction to read 'ad'
	 * and 'bd', which is what makes the overlap mentioned above safe.
76*71e59795SEric Biggers	 */
77*71e59795SEric Biggers	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
78*71e59795SEric Biggers	vext.8		t0l, \ad, \ad, #1	@ A1
79*71e59795SEric Biggers	.ifc		\b1, t4l
80*71e59795SEric Biggers	vext.8		t4l, \bd, \bd, #1	@ B1
81*71e59795SEric Biggers	.endif
82*71e59795SEric Biggers	vmull.p8	t0q, t0l, \bd		@ F = A1*B
83*71e59795SEric Biggers	vext.8		t1l, \ad, \ad, #2	@ A2
84*71e59795SEric Biggers	vmull.p8	t4q, \ad, \b1		@ E = A*B1
85*71e59795SEric Biggers	.ifc		\b2, t3l
86*71e59795SEric Biggers	vext.8		t3l, \bd, \bd, #2	@ B2
87*71e59795SEric Biggers	.endif
88*71e59795SEric Biggers	vmull.p8	t1q, t1l, \bd		@ H = A2*B
89*71e59795SEric Biggers	vext.8		t2l, \ad, \ad, #3	@ A3
90*71e59795SEric Biggers	vmull.p8	t3q, \ad, \b2		@ G = A*B2
91*71e59795SEric Biggers	veor		t0q, t0q, t4q		@ L = E + F
92*71e59795SEric Biggers	.ifc		\b3, t4l
93*71e59795SEric Biggers	vext.8		t4l, \bd, \bd, #3	@ B3
94*71e59795SEric Biggers	.endif
95*71e59795SEric Biggers	vmull.p8	t2q, t2l, \bd		@ J = A3*B
96*71e59795SEric Biggers	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
97*71e59795SEric Biggers	veor		t1q, t1q, t3q		@ M = G + H
98*71e59795SEric Biggers	.ifc		\b4, t3l
99*71e59795SEric Biggers	vext.8		t3l, \bd, \bd, #4	@ B4
100*71e59795SEric Biggers	.endif
101*71e59795SEric Biggers	vmull.p8	t4q, \ad, \b3		@ I = A*B3
102*71e59795SEric Biggers	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
103*71e59795SEric Biggers	vmull.p8	t3q, \ad, \b4		@ K = A*B4
104*71e59795SEric Biggers	vand		t0h, t0h, k48		@ keep the low 48 bits of t0h
105*71e59795SEric Biggers	vand		t1h, t1h, k32		@ keep the low 32 bits of t1h
106*71e59795SEric Biggers	veor		t2q, t2q, t4q		@ N = I + J
107*71e59795SEric Biggers	veor		t0l, t0l, t0h
108*71e59795SEric Biggers	veor		t1l, t1l, t1h
109*71e59795SEric Biggers	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
110*71e59795SEric Biggers	vand		t2h, t2h, k16		@ keep the low 16 bits of t2h
111*71e59795SEric Biggers	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
112*71e59795SEric Biggers	vmov.i64	t3h, #0			@ only the folded low half of K is kept
113*71e59795SEric Biggers	vext.8		t0q, t0q, t0q, #15	@ byte-rotate t0 into place (the << 8)
114*71e59795SEric Biggers	veor		t2l, t2l, t2h
115*71e59795SEric Biggers	vext.8		t1q, t1q, t1q, #14	@ byte-rotate t1 into place (the << 16)
116*71e59795SEric Biggers	vmull.p8	\rq, \ad, \bd		@ D = A*B, last read of ad and bd
117*71e59795SEric Biggers	vext.8		t2q, t2q, t2q, #13	@ byte-rotate t2 into place (the << 24)
118*71e59795SEric Biggers	vext.8		t3q, t3q, t3q, #12	@ byte-rotate t3 into place (the << 32)
119*71e59795SEric Biggers	veor		t0q, t0q, t1q
120*71e59795SEric Biggers	veor		t2q, t2q, t3q
121*71e59795SEric Biggers	veor		\rq, \rq, t0q		@ rq = D + t0 + t1 + t2 + t3
122*71e59795SEric Biggers	veor		\rq, \rq, t2q
123*71e59795SEric Biggers	.endm
124*71e59795SEric Biggers
125*71e59795SEric Biggers	.macro		__pmull_reduce_p8
	/*
	 * Reduction step applied to the XL (low), XM (middle) and XH (high)
	 * values that ghash_update prepares before expanding this macro.
	 * All shifts operate on each 64-bit lane separately.
	 *
	 * Overwrites T1 and T2 (T2 is the same register as t2q) and updates
	 * XL and XH in place. The result is not complete when the macro
	 * ends: the caller still has to XOR XH and then T1 into XL.
	 *
	 * NOTE(review): the shift pairs 57/7, 62/2 and 63/1 look like the
	 * x^7, x^2 and x terms of the GHASH reduction polynomial - confirm.
	 */
126*71e59795SEric Biggers	veor		XL_H, XL_H, XM_L	@ fold XM into the middle
127*71e59795SEric Biggers	veor		XH_L, XH_L, XM_H	@ 128 bits of XH:XL
128*71e59795SEric Biggers
129*71e59795SEric Biggers	vshl.i64	T1, XL, #57
130*71e59795SEric Biggers	vshl.i64	T2, XL, #62
131*71e59795SEric Biggers	veor		T1, T1, T2
132*71e59795SEric Biggers	vshl.i64	T2, XL, #63
133*71e59795SEric Biggers	veor		T1, T1, T2		@ T1 = XL<<57 ^ XL<<62 ^ XL<<63
134*71e59795SEric Biggers	veor		XL_H, XL_H, T1_L
135*71e59795SEric Biggers	veor		XH_L, XH_L, T1_H
136*71e59795SEric Biggers
137*71e59795SEric Biggers	vshr.u64	T1, XL, #1		@ T1 = XL >> 1
138*71e59795SEric Biggers	veor		XH, XH, XL
139*71e59795SEric Biggers	veor		XL, XL, T1
140*71e59795SEric Biggers	vshr.u64	T1, T1, #6		@ T1 = (old XL) >> 7
141*71e59795SEric Biggers	vshr.u64	XL, XL, #1		@ XL = (old XL)>>1 ^ (old XL)>>2
142*71e59795SEric Biggers	.endm
143*71e59795SEric Biggers
144*71e59795SEric Biggers	.macro		vrev64_if_be	a
	/*
	 * When CONFIG_CPU_BIG_ENDIAN is defined, reverse the byte order
	 * within each 64-bit element of register 'a' in place; otherwise
	 * this macro expands to nothing.
	 */
145*71e59795SEric Biggers#ifdef CONFIG_CPU_BIG_ENDIAN
146*71e59795SEric Biggers	vrev64.8	\a, \a
147*71e59795SEric Biggers#endif
148*71e59795SEric Biggers	.endm
149*71e59795SEric Biggers
150*71e59795SEric Biggers	.macro		ghash_update
	/*
	 * Block loop. Register roles as used below:
	 *   r0 - number of blocks still to process
	 *   r1 - address the 16-byte accumulator XL is loaded from
	 *   r2 - input pointer, advanced by 16 bytes per iteration
	 * SHASH, SHASH2_p8, s1l..s4h and k16/k32/k48 must be set up before
	 * the macro is expanded. The final XL is left in the register; the
	 * macro does not store it.
	 *
	 * r0 is decremented before it is tested, so the body always runs
	 * at least once; an initial count of zero is not a no-op.
	 */
151*71e59795SEric Biggers	vld1.64		{XL}, [r1]
152*71e59795SEric Biggers	vrev64_if_be	XL
153*71e59795SEric Biggers
154*71e59795SEric Biggers0:
155*71e59795SEric Biggers	vld1.8		{T1}, [r2]!		@ next 16 input bytes, r2 += 16
156*71e59795SEric Biggers	subs		r0, r0, #1		@ flags are consumed by the bne below
157*71e59795SEric Biggers
158*71e59795SEric Biggers	/* multiply XL by SHASH in GF(2^128) */
159*71e59795SEric Biggers	vrev64.8	T1, T1			@ byte-reverse each 64-bit half
160*71e59795SEric Biggers
161*71e59795SEric Biggers	vext.8		IN1, T1, T1, #8		@ IN1 = T1 with its halves swapped
162*71e59795SEric Biggers	veor		T1_L, T1_L, XL_H
163*71e59795SEric Biggers	veor		XL, XL, IN1		@ XL ^= input block
164*71e59795SEric Biggers
	/*
	 * Karatsuba: three 64x64 multiplies. The first one overwrites XH,
	 * which is the same register as IN1; IN1 is no longer needed.
	 */
165*71e59795SEric Biggers	__pmull_p8	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
166*71e59795SEric Biggers	veor		T1, T1, XL		@ T1_L = XL_L ^ XL_H (a1 + a0)
167*71e59795SEric Biggers	__pmull_p8	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
168*71e59795SEric Biggers	__pmull_p8	XM, T1_L, SHASH2_p8			@ (a1+a0)(b1+b0)
169*71e59795SEric Biggers
170*71e59795SEric Biggers	veor		T1, XL, XH
171*71e59795SEric Biggers	veor		XM, XM, T1		@ XM ^= XL ^ XH
172*71e59795SEric Biggers
173*71e59795SEric Biggers	__pmull_reduce_p8
174*71e59795SEric Biggers
	/* Complete the reduction: fold XH and the macro's T1 into XL. */
175*71e59795SEric Biggers	veor		T1, T1, XH
176*71e59795SEric Biggers	veor		XL, XL, T1
177*71e59795SEric Biggers
178*71e59795SEric Biggers	bne		0b			@ loop until r0 has reached zero
179*71e59795SEric Biggers	.endm
180*71e59795SEric Biggers
181*71e59795SEric Biggers	/*
182*71e59795SEric Biggers	 * void pmull_ghash_update_p8(size_t blocks, struct polyval_elem *dg,
183*71e59795SEric Biggers	 *			      const u8 *src,
184*71e59795SEric Biggers	 *			      const struct polyval_elem *h)
	 *
	 * Register usage, in prototype order:
	 *   r0 - blocks: number of 16-byte blocks to consume from src
	 *   r1 - dg:     16-byte accumulator, read on entry and written back
	 *   r2 - src:    input data, advanced by the loop
	 *   r3 - h:      16-byte hash key, only read
	 *
	 * The loop in ghash_update always executes at least once, so blocks
	 * is expected to be non-zero.
	 *
	 * NOTE(review): d8-d15 (q4-q7) are overwritten and nothing in this
	 * file saves or restores them; presumably the caller is responsible
	 * for preserving NEON state around the call - confirm.
185*71e59795SEric Biggers	 */
186*71e59795SEric BiggersENTRY(pmull_ghash_update_p8)
187*71e59795SEric Biggers	vld1.64		{SHASH}, [r3]
188*71e59795SEric Biggers	vrev64_if_be	SHASH
189*71e59795SEric Biggers	veor		SHASH2_p8, SHASH_L, SHASH_H	@ b1 + b0 for the middle product
190*71e59795SEric Biggers
	/*
	 * Precompute the B1..B4 values of both key halves once, so that
	 * __pmull_p8 does not have to recompute them for every block.
	 */
191*71e59795SEric Biggers	vext.8		s1l, SHASH_L, SHASH_L, #1
192*71e59795SEric Biggers	vext.8		s2l, SHASH_L, SHASH_L, #2
193*71e59795SEric Biggers	vext.8		s3l, SHASH_L, SHASH_L, #3
194*71e59795SEric Biggers	vext.8		s4l, SHASH_L, SHASH_L, #4
195*71e59795SEric Biggers	vext.8		s1h, SHASH_H, SHASH_H, #1
196*71e59795SEric Biggers	vext.8		s2h, SHASH_H, SHASH_H, #2
197*71e59795SEric Biggers	vext.8		s3h, SHASH_H, SHASH_H, #3
198*71e59795SEric Biggers	vext.8		s4h, SHASH_H, SHASH_H, #4
199*71e59795SEric Biggers
	/* Masks read by __pmull_p8. */
200*71e59795SEric Biggers	vmov.i64	k16, #0xffff
201*71e59795SEric Biggers	vmov.i64	k32, #0xffffffff
202*71e59795SEric Biggers	vmov.i64	k48, #0xffffffffffff
203*71e59795SEric Biggers
204*71e59795SEric Biggers	ghash_update
205*71e59795SEric Biggers	vrev64_if_be	XL
206*71e59795SEric Biggers	vst1.64		{XL}, [r1]		@ write the updated accumulator back
207*71e59795SEric Biggers
208*71e59795SEric Biggers	bx		lr
209*71e59795SEric BiggersENDPROC(pmull_ghash_update_p8)
210