/*
 * Origin: FreeBSD sys/crypto/openssl/aarch64/ghashv8-armx.S
 * (revision 575878a533823aa3e5bab715928d9cdffbc4dcbc).
 */
/* Do not modify. This file is auto-generated from ghashv8-armx.pl. */
#include "arm_arch.h"

/* Everything below is compiled only when __ARM_MAX_ARCH__ is at least 7. */
#if __ARM_MAX_ARCH__>=7
.arch	armv8-a+crypto
.text
//----------------------------------------------------------------------
// gcm_init_v8: build the GHASH key table from the hash key H.
//
// In:    x0 = Htable, destination buffer; six 16-byte entries are written
//        x1 = pointer to the 16-byte input H
// Out:   Htable[0] = twisted H
//        Htable[1] = packed Karatsuba pre-processed halves of H and H^2
//        Htable[2] = H^2
//        Htable[3] = H^3
//        Htable[4] = packed Karatsuba pre-processed halves of H^3 and H^4
//        Htable[5] = H^4
// Clobb: x0 (left pointing at Htable[3]), v0-v7, v16-v22
// Stack: none (leaf routine)
//
// NOTE(review): the C prototype is presumably
//   void gcm_init_v8(u128 Htable[16], const u64 H[2]);
// confirm against the C callers.
//----------------------------------------------------------------------
.globl	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	AARCH64_VALID_CALL_TARGET		//macro from arm_arch.h
	ld1	{v17.2d},[x1]		//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57		//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
	eor	v20.16b,v3.16b,v16.16b		//twisted H
	st1	{v20.2d},[x0],#16		//store Htable[0]

	//calculate H^2
	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b

	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
	//calculate H^3 and H^4
	//the two products are interleaved: v0/v1/v2 carry H*H^2,
	//v5/v6/v7 carry H^2*H^2
	pmull	v0.1q,v20.1d, v22.1d
	pmull	v5.1q,v22.1d,v22.1d
	pmull2	v2.1q,v20.2d, v22.2d
	pmull2	v7.1q,v22.2d,v22.2d
	pmull	v1.1q,v16.1d,v17.1d
	pmull	v6.1q,v17.1d,v17.1d

	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	ext	v17.16b,v5.16b,v7.16b,#8
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v16.16b
	eor	v4.16b,v5.16b,v7.16b
	eor	v6.16b,v6.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase
	eor	v6.16b,v6.16b,v4.16b
	pmull	v4.1q,v5.1d,v19.1d

	ins	v2.d[0],v1.d[1]
	ins	v7.d[0],v6.d[1]
	ins	v1.d[1],v0.d[0]
	ins	v6.d[1],v5.d[0]
	eor	v0.16b,v1.16b,v18.16b
	eor	v5.16b,v6.16b,v4.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
	ext	v4.16b,v5.16b,v5.16b,#8
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v5.1q,v5.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v4.16b,v4.16b,v7.16b
	eor	v20.16b, v0.16b,v18.16b		//H^3
	eor	v22.16b,v5.16b,v4.16b		//H^4

	ext	v16.16b,v20.16b, v20.16b,#8		//Karatsuba pre-processing
	ext	v17.16b,v22.16b,v22.16b,#8
	eor	v16.16b,v16.16b,v20.16b
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v20.2d,v21.2d,v22.2d},[x0]		//store Htable[3..5]
	ret
.size	gcm_init_v8,.-gcm_init_v8
//----------------------------------------------------------------------
// gcm_gmult_v8: multiply the 16-byte value Xi by H, in place.
//
// In:    x0 = Xi, 16 bytes, read on entry and overwritten with the result
//        x1 = Htable; only the first two entries are read
//             (twisted H and the packed Karatsuba entry)
// Out:   Xi updated in memory
// Clobb: v0-v3, v17-v21
// Stack: none (leaf routine)
//
// Unless __AARCH64EB__ is defined, Xi is byte-reversed within each
// 64-bit half after loading and again before storing.
//
// NOTE(review): the C prototype is presumably
//   void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
// confirm against the C callers.
//----------------------------------------------------------------------
.globl	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	AARCH64_VALID_CALL_TARGET		//macro from arm_arch.h
	ld1	{v17.2d},[x0]		//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
#ifndef __AARCH64EB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8
//----------------------------------------------------------------------
// gcm_ghash_v8: fold a run of 16-byte input blocks into Xi.
//
// In:    x0 = Xi, 16 bytes, read on entry and overwritten with the result
//        x1 = Htable as produced by gcm_init_v8
//        x2 = input pointer
//        x3 = input length in bytes
// Out:   Xi updated in memory
// Clobb: x1, x2, x3, x12, flags, v0-v7, v16-v22
//        (the 4x path additionally uses v23-v31)
// Stack: none (leaf routine)
//
// Dispatch: lengths of 64 bytes or more branch to .Lgcm_ghash_v8_4x.
// Shorter inputs run here: a modulo-scheduled loop consuming two blocks
// per iteration, then .Lodd_tail_v8 for a single leftover block.
//
// NOTE(review): x3 is presumably a non-zero multiple of 16 - the first
// block is loaded before the length is examined and only whole blocks
// are consumed. Confirm against the C callers.
// NOTE(review): the C prototype is presumably
//   void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16],
//                     const u8 *inp, size_t len);
//----------------------------------------------------------------------
.globl	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
	AARCH64_VALID_CALL_TARGET		//macro from arm_arch.h
	cmp	x3,#64
	b.hs	.Lgcm_ghash_v8_4x
	ld1	{v0.2d},[x0]		//load [rotated] Xi
						//"[rotated]" means that
						//loaded value would have
						//to be rotated in order to
						//make it appear as in
						//algorithm specification
	subs	x3,x3,#32		//see if x3 is 32 or larger
	mov	x12,#16		//x12 is used as post-
						//increment for input pointer;
						//as loop is modulo-scheduled
						//x12 is zeroed just in time
						//to preclude overstepping
						//inp[len], which means that
						//last block[s] are actually
						//loaded twice, but last
						//copy is not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
#ifndef __AARCH64EB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
	//flags are still those of the subs above; nothing in between sets them
	b.lo	.Lodd_tail_v8		//x3 was less than 32
	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
#ifndef __AARCH64EB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	.Loop_mod2x_v8		//jump over alignment padding

.align	4
.Loop_mod2x_v8:
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32		//is there more data?
	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo			//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b		//accumulate
	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
#ifndef __AARCH64EB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

#ifndef __AARCH64EB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	//flags are still those of the subs at the top of the loop
	b.hs	.Loop_mod2x_v8		//there was at least 32 more bytes

	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
	adds	x3,x3,#32		//re-construct x3
	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
	b.eq	.Ldone_v8		//is x3 zero?
.Lodd_tail_v8:
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

.Ldone_v8:
#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8
//----------------------------------------------------------------------
// gcm_ghash_v8_4x: four-blocks-per-iteration GHASH path.
//
// There is no .globl for this symbol. The only entry visible in this
// file is the direct branch to .Lgcm_ghash_v8_4x taken by gcm_ghash_v8
// when the length is at least 64, so the register contract is that of
// gcm_ghash_v8 at entry:
//
// In:    x0 = Xi, 16 bytes, read on entry and overwritten with the result
//        x1 = Htable; entries 0..5 are read
//        x2 = input pointer
//        x3 = input length in bytes, at least 64 on this path
// Out:   Xi updated in memory
// Clobb: x1, x2, x3, flags, v0-v7, v16-v31
// Stack: none (leaf routine)
//
// Register roles:
//        v19          reduction constant (0xc2.0)
//        v20,v21,v22  twisted H, packed Karatsuba entry, H^2
//        v26,v27,v28  H^3, packed Karatsuba entry, H^4
//        v29,v30,v31  running lo/mid/hi sums of the look-ahead products
//
// Flow: the first 64 bytes are loaded up front; .Loop4x runs while at
// least 64 further bytes remain after the pending group; .Ltail4x folds
// the pending group and dispatches on the 0, 16, 32 or 48 bytes left.
//
// NOTE(review): x3 is presumably a multiple of 16; confirm with callers.
//----------------------------------------------------------------------
.type	gcm_ghash_v8_4x,%function
.align	4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
	ld1	{v0.2d},[x0]		//load [rotated] Xi
	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant

	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v7.16b,v7.16b
	rev64	v4.16b,v4.16b
#endif
	ext	v25.16b,v7.16b,v7.16b,#8
	ext	v24.16b,v6.16b,v6.16b,#8
	ext	v23.16b,v5.16b,v5.16b,#8

	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
	eor	v7.16b,v7.16b,v25.16b
	pmull2	v31.1q,v20.2d,v25.2d
	pmull	v30.1q,v21.1d,v7.1d

	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
	eor	v6.16b,v6.16b,v24.16b
	pmull2	v24.1q,v22.2d,v24.2d
	pmull2	v6.1q,v21.2d,v6.2d

	eor	v29.16b,v29.16b,v16.16b
	eor	v31.16b,v31.16b,v24.16b
	eor	v30.16b,v30.16b,v6.16b

	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	pmull2	v23.1q,v26.2d,v23.2d
	pmull	v5.1q,v27.1d,v5.1d

	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	eor	v30.16b,v30.16b,v5.16b

	subs	x3,x3,#128		//fewer than 128 bytes in total?
	b.lo	.Ltail4x

	b	.Loop4x		//jump over alignment padding

.align	4
.Loop4x:
	eor	v16.16b,v4.16b,v0.16b
	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
	ext	v3.16b,v16.16b,v16.16b,#8
#ifndef __AARCH64EB__
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v7.16b,v7.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v28.2d,v3.2d
	ext	v25.16b,v7.16b,v7.16b,#8
	pmull2	v1.1q,v27.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	ext	v24.16b,v6.16b,v6.16b,#8
	eor	v1.16b,v1.16b,v30.16b
	ext	v23.16b,v5.16b,v5.16b,#8

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
	eor	v7.16b,v7.16b,v25.16b
	eor	v1.16b,v1.16b,v17.16b
	pmull2	v31.1q,v20.2d,v25.2d
	eor	v1.16b,v1.16b,v18.16b
	pmull	v30.1q,v21.1d,v7.1d

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
	eor	v6.16b,v6.16b,v24.16b
	pmull2	v24.1q,v22.2d,v24.2d
	eor	v0.16b,v1.16b,v18.16b
	pmull2	v6.1q,v21.2d,v6.2d

	eor	v29.16b,v29.16b,v16.16b
	eor	v31.16b,v31.16b,v24.16b
	eor	v30.16b,v30.16b,v6.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	eor	v18.16b,v18.16b,v2.16b
	pmull2	v23.1q,v26.2d,v23.2d
	pmull	v5.1q,v27.1d,v5.1d

	eor	v0.16b,v0.16b,v18.16b
	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	ext	v0.16b,v0.16b,v0.16b,#8
	eor	v30.16b,v30.16b,v5.16b

	subs	x3,x3,#64		//another full group of 64 bytes?
	b.hs	.Loop4x

.Ltail4x:
	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v28.2d,v3.2d
	pmull2	v1.1q,v27.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b

	adds	x3,x3,#64		//x3 = bytes still unread
	b.eq	.Ldone4x

	cmp	x3,#32
	b.lo	.Lone		//one block left
	b.eq	.Ltwo		//two blocks left
.Lthree:				//otherwise three blocks left
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d,v5.2d,v6.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v24.16b,v6.16b,v6.16b,#8
	ext	v23.16b,v5.16b,v5.16b,#8
	eor	v0.16b,v1.16b,v18.16b

	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
	eor	v6.16b,v6.16b,v24.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	pmull2	v31.1q,v20.2d,v24.2d
	pmull	v30.1q,v21.1d,v6.1d
	eor	v0.16b,v0.16b,v18.16b
	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	pmull2	v23.1q,v22.2d,v23.2d
	eor	v16.16b,v4.16b,v0.16b
	pmull2	v5.1q,v21.2d,v5.2d
	ext	v3.16b,v16.16b,v16.16b,#8

	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	eor	v30.16b,v30.16b,v5.16b

	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v26.2d,v3.2d
	pmull	v1.1q,v27.1d,v16.1d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b
	b	.Ldone4x

.align	4
.Ltwo:
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d,v5.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
	rev64	v5.16b,v5.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v23.16b,v5.16b,v5.16b,#8
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
	eor	v5.16b,v5.16b,v23.16b

	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull2	v31.1q,v20.2d,v23.2d
	pmull	v30.1q,v21.1d,v5.1d

	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v22.2d,v3.2d
	pmull2	v1.1q,v21.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b
	b	.Ldone4x

.align	4
.Lone:
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v20.2d,v3.2d
	pmull	v1.1q,v21.1d,v16.1d

.Ldone4x:
	//final reduction of the v0/v1/v2 product, then write back
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
#endif
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
//Identification string, NUL-terminated ASCII:
//"GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
#endif