xref: /freebsd/sys/crypto/openssl/aarch64/ghashv8-armx.S (revision ac77b2621508c6a50ab01d07fe8d43795d908f05)
1/* Do not modify. This file is auto-generated from ghashv8-armx.pl. */
2#include "arm_arch.h"
3
4#if __ARM_MAX_ARCH__>=7
5.arch	armv8-a+crypto
6.text
//
// gcm_init_v8 -- build the GHASH key table from the hash key H.
//
// In:
//   x0 = destination table; six 16-byte entries (96 bytes) are written
//   x1 = address of the 16-byte hash key H
// Table layout, as produced by the three stores below:
//   [0] twisted H
//   [1] packed Karatsuba terms: low 64 bits = H.lo^H.hi,
//       high 64 bits = H^2.lo^H^2.hi
//   [2] H^2
//   [3] H^3
//   [4] packed Karatsuba terms for H^3 (low) and H^4 (high)
//   [5] H^4
// Writes: x0 (left advanced by 48), v0-v7, v16-v22. No stack is used.
// NOTE(review): presumably called from C as gcm_init_v8(Htable, H);
// confirm the exact prototype against the C declaration.
//
7.globl	gcm_init_v8
8.type	gcm_init_v8,%function
9.align	4
10gcm_init_v8:
11	AARCH64_VALID_CALL_TARGET
12	ld1	{v17.2d},[x1]		//load input H
13	movi	v19.16b,#0xe1
14	shl	v19.2d,v19.2d,#57		//0xc2.0
15	ext	v3.16b,v17.16b,v17.16b,#8
16	ushr	v18.2d,v19.2d,#63
17	dup	v17.4s,v17.s[1]
18	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
19	ushr	v18.2d,v3.2d,#63
20	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
21	and	v18.16b,v18.16b,v16.16b
22	shl	v3.2d,v3.2d,#1
23	ext	v18.16b,v18.16b,v18.16b,#8
24	and	v16.16b,v16.16b,v17.16b
25	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
26	eor	v20.16b,v3.16b,v16.16b		//twisted H
27	st1	{v20.2d},[x0],#16		//store Htable[0]
28
29	//calculate H^2
	//v0 = lo*lo, v2 = hi*hi, v1 = (lo^hi)*(lo^hi) of twisted H (v20)
30	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
31	pmull	v0.1q,v20.1d,v20.1d
32	eor	v16.16b,v16.16b,v20.16b
33	pmull2	v2.1q,v20.2d,v20.2d
34	pmull	v1.1q,v16.1d,v16.1d
35
36	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
37	eor	v18.16b,v0.16b,v2.16b
38	eor	v1.16b,v1.16b,v17.16b
39	eor	v1.16b,v1.16b,v18.16b
	//two-phase reduction, each phase multiplies by the v19 constant
40	pmull	v18.1q,v0.1d,v19.1d		//1st phase
41
42	ins	v2.d[0],v1.d[1]
43	ins	v1.d[1],v0.d[0]
44	eor	v0.16b,v1.16b,v18.16b
45
46	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
47	pmull	v0.1q,v0.1d,v19.1d
48	eor	v18.16b,v18.16b,v2.16b
49	eor	v22.16b,v0.16b,v18.16b
50
51	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
52	eor	v17.16b,v17.16b,v22.16b
53	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
54	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
55	//calculate H^3 and H^4
	//two products are interleaved: v0/v1/v2 carry H*H^2 (v20*v22),
	//v5/v6/v7 carry H^2*H^2 (v22*v22)
56	pmull	v0.1q,v20.1d, v22.1d
57	pmull	v5.1q,v22.1d,v22.1d
58	pmull2	v2.1q,v20.2d, v22.2d
59	pmull2	v7.1q,v22.2d,v22.2d
60	pmull	v1.1q,v16.1d,v17.1d
61	pmull	v6.1q,v17.1d,v17.1d
62
63	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
64	ext	v17.16b,v5.16b,v7.16b,#8
65	eor	v18.16b,v0.16b,v2.16b
66	eor	v1.16b,v1.16b,v16.16b
67	eor	v4.16b,v5.16b,v7.16b
68	eor	v6.16b,v6.16b,v17.16b
69	eor	v1.16b,v1.16b,v18.16b
70	pmull	v18.1q,v0.1d,v19.1d		//1st phase
71	eor	v6.16b,v6.16b,v4.16b
72	pmull	v4.1q,v5.1d,v19.1d
73
74	ins	v2.d[0],v1.d[1]
75	ins	v7.d[0],v6.d[1]
76	ins	v1.d[1],v0.d[0]
77	ins	v6.d[1],v5.d[0]
78	eor	v0.16b,v1.16b,v18.16b
79	eor	v5.16b,v6.16b,v4.16b
80
81	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
82	ext	v4.16b,v5.16b,v5.16b,#8
83	pmull	v0.1q,v0.1d,v19.1d
84	pmull	v5.1q,v5.1d,v19.1d
85	eor	v18.16b,v18.16b,v2.16b
86	eor	v4.16b,v4.16b,v7.16b
87	eor	v20.16b, v0.16b,v18.16b		//H^3
88	eor	v22.16b,v5.16b,v4.16b		//H^4
89
90	ext	v16.16b,v20.16b, v20.16b,#8		//Karatsuba pre-processing
91	ext	v17.16b,v22.16b,v22.16b,#8
92	eor	v16.16b,v16.16b,v20.16b
93	eor	v17.16b,v17.16b,v22.16b
94	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
95	st1	{v20.2d,v21.2d,v22.2d},[x0]		//store Htable[3..5]
96	ret
97.size	gcm_init_v8,.-gcm_init_v8
//
// gcm_gmult_v8 -- multiply the 16-byte value Xi by H in place.
//
// In:
//   x0 = address of the 16-byte Xi; read, then overwritten with the result
//   x1 = address of the key table; only the first 32 bytes are read
//        (v20 = twisted H, v21 = packed Karatsuba terms)
// Unless __AARCH64EB__ is defined, Xi is byte-reversed within each 64-bit
// lane on load and again before the store.
// Writes: v0-v3, v17-v21. x0 and x1 are not modified. No stack is used.
// NOTE(review): the table is presumably the one written by gcm_init_v8;
// confirm against the caller.
//
98.globl	gcm_gmult_v8
99.type	gcm_gmult_v8,%function
100.align	4
101gcm_gmult_v8:
102	AARCH64_VALID_CALL_TARGET
103	ld1	{v17.2d},[x0]		//load Xi
104	movi	v19.16b,#0xe1
105	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
106	shl	v19.2d,v19.2d,#57		//each 64-bit lane becomes 0xc2 followed by zeros
107#ifndef __AARCH64EB__
108	rev64	v17.16b,v17.16b
109#endif
110	ext	v3.16b,v17.16b,v17.16b,#8		//v3 = Xi with its 64-bit halves swapped
111
112	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
113	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
114	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
115	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
116
117	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
118	eor	v18.16b,v0.16b,v2.16b
119	eor	v1.16b,v1.16b,v17.16b
120	eor	v1.16b,v1.16b,v18.16b
121	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
122
123	ins	v2.d[0],v1.d[1]
124	ins	v1.d[1],v0.d[0]
125	eor	v0.16b,v1.16b,v18.16b
126
127	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
128	pmull	v0.1q,v0.1d,v19.1d
129	eor	v18.16b,v18.16b,v2.16b
130	eor	v0.16b,v0.16b,v18.16b
131
132#ifndef __AARCH64EB__
133	rev64	v0.16b,v0.16b
134#endif
135	ext	v0.16b,v0.16b,v0.16b,#8		//swap the 64-bit halves back before storing
136	st1	{v0.2d},[x0]		//write out Xi
137
138	ret
139.size	gcm_gmult_v8,.-gcm_gmult_v8
//
// gcm_ghash_v8 -- fold a run of 16-byte input blocks into Xi.
//
// In:
//   x0 = address of the 16-byte Xi; read, then overwritten with the result
//   x1 = address of the key table (twisted H, packed terms, H^2, ...)
//   x2 = address of the input data
//   x3 = input length in bytes
// When x3 >= 64 control is transferred with a plain branch to
// .Lgcm_ghash_v8_4x, which stores Xi and returns to the caller itself.
// Otherwise the code below handles two blocks per loop iteration and a
// single trailing block in .Lodd_tail_v8.
// Writes on this path: x1, x2, x3, x12, v0-v7, v16-v22, condition flags.
// No stack is used.
// NOTE(review): the length handling looks like it assumes x3 is a
// non-zero multiple of 16 -- confirm against the callers.
//
140.globl	gcm_ghash_v8
141.type	gcm_ghash_v8,%function
142.align	4
143gcm_ghash_v8:
144	AARCH64_VALID_CALL_TARGET
145	cmp	x3,#64
146	b.hs	.Lgcm_ghash_v8_4x
147	ld1	{v0.2d},[x0]		//load [rotated] Xi
148						//"[rotated]" means that
149						//loaded value would have
150						//to be rotated in order to
151						//make it appear as in
152						//algorithm specification
153	subs	x3,x3,#32		//see if x3 is 32 or larger
154	mov	x12,#16		//x12 is used as post-
155						//increment for input pointer;
156						//as loop is modulo-scheduled
157						//x12 is zeroed just in time
158						//to preclude overstepping
159						//inp[len], which means that
160						//last block[s] are actually
161						//loaded twice, but last
162						//copy is not processed
163	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
164	movi	v19.16b,#0xe1
165	ld1	{v22.2d},[x1]
166	csel	x12,xzr,x12,eq			//is it time to zero x12?
167	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
168	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
169	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
170#ifndef __AARCH64EB__
171	rev64	v16.16b,v16.16b
172	rev64	v0.16b,v0.16b
173#endif
174	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
	//the flags tested here are still those of the subs above; no
	//instruction in between writes the condition flags
175	b.lo	.Lodd_tail_v8		//x3 was less than 32
176	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
177#ifndef __AARCH64EB__
178	rev64	v17.16b,v17.16b
179#endif
180	ext	v7.16b,v17.16b,v17.16b,#8
181	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
182	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
183	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
184	pmull2	v6.1q,v20.2d,v7.2d
185	b	.Loop_mod2x_v8
186
187.align	4
	//each iteration multiplies (Xi^I[i]) by H^2 and I[i+1] by H, sums the
	//two products, reduces, and preloads the next two blocks
188.Loop_mod2x_v8:
189	ext	v18.16b,v3.16b,v3.16b,#8
190	subs	x3,x3,#32		//is there more data?
191	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
192	csel	x12,xzr,x12,lo			//is it time to zero x12?
193
194	pmull	v5.1q,v21.1d,v17.1d
195	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
196	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
197	eor	v0.16b,v0.16b,v4.16b		//accumulate
198	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
199	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]
200
201	eor	v2.16b,v2.16b,v6.16b
202	csel	x12,xzr,x12,eq			//is it time to zero x12?
203	eor	v1.16b,v1.16b,v5.16b
204
205	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
206	eor	v18.16b,v0.16b,v2.16b
207	eor	v1.16b,v1.16b,v17.16b
208	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
209#ifndef __AARCH64EB__
210	rev64	v16.16b,v16.16b
211#endif
212	eor	v1.16b,v1.16b,v18.16b
213	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
214
215#ifndef __AARCH64EB__
216	rev64	v17.16b,v17.16b
217#endif
218	ins	v2.d[0],v1.d[1]
219	ins	v1.d[1],v0.d[0]
220	ext	v7.16b,v17.16b,v17.16b,#8
221	ext	v3.16b,v16.16b,v16.16b,#8
222	eor	v0.16b,v1.16b,v18.16b
223	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
224	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early
225
226	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
227	pmull	v0.1q,v0.1d,v19.1d
228	eor	v3.16b,v3.16b,v18.16b
229	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
230	eor	v3.16b,v3.16b,v0.16b
231	pmull2	v6.1q,v20.2d,v7.2d
	//this branch still consumes the flags of the subs at the loop top
232	b.hs	.Loop_mod2x_v8		//there was at least 32 more bytes
233
	//loop exit: undo the early accumulation so that v0 holds the reduced
	//Xi and v3 holds only the last loaded block
234	eor	v2.16b,v2.16b,v18.16b
235	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
236	adds	x3,x3,#32		//re-construct x3
237	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
238	b.eq	.Ldone_v8		//is x3 zero?
	//single remaining block: Xi = (Xi ^ inp) * H
239.Lodd_tail_v8:
240	ext	v18.16b,v0.16b,v0.16b,#8
241	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
242	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi
243
244	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
245	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
246	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
247	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
248
249	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
250	eor	v18.16b,v0.16b,v2.16b
251	eor	v1.16b,v1.16b,v17.16b
252	eor	v1.16b,v1.16b,v18.16b
253	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
254
255	ins	v2.d[0],v1.d[1]
256	ins	v1.d[1],v0.d[0]
257	eor	v0.16b,v1.16b,v18.16b
258
259	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
260	pmull	v0.1q,v0.1d,v19.1d
261	eor	v18.16b,v18.16b,v2.16b
262	eor	v0.16b,v0.16b,v18.16b
263
264.Ldone_v8:
265#ifndef __AARCH64EB__
266	rev64	v0.16b,v0.16b
267#endif
268	ext	v0.16b,v0.16b,v0.16b,#8
269	st1	{v0.2d},[x0]		//write out Xi
270
271	ret
272.size	gcm_ghash_v8,.-gcm_ghash_v8
//
// gcm_ghash_v8_4x -- four-blocks-per-iteration GHASH path.
//
// No .globl directive for this symbol appears in the visible source; it
// is entered through the .Lgcm_ghash_v8_4x branch in gcm_ghash_v8, which
// is taken when x3 >= 64.
//
// In (same registers as gcm_ghash_v8):
//   x0 = address of the 16-byte Xi; read, then overwritten with the result
//   x1 = address of the key table; 96 bytes are read
//        (v20/v21/v22 = H, packed terms, H^2; v26/v27/v28 = H^3, packed
//        terms, H^4)
//   x2 = address of the input data
//   x3 = input length in bytes
// Writes: x1, x2, x3, v0-v7, v16-v31, condition flags. No stack is used.
// NOTE(review): the tail dispatch only distinguishes 16, 32 and 48
// leftover bytes, so x3 is presumably a multiple of 16 -- confirm.
//
273.type	gcm_ghash_v8_4x,%function
274.align	4
275gcm_ghash_v8_4x:
276.Lgcm_ghash_v8_4x:
277	ld1	{v0.2d},[x0]		//load [rotated] Xi
278	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
279	movi	v19.16b,#0xe1
280	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
281	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
282
283	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
284#ifndef __AARCH64EB__
285	rev64	v0.16b,v0.16b
286	rev64	v5.16b,v5.16b
287	rev64	v6.16b,v6.16b
288	rev64	v7.16b,v7.16b
289	rev64	v4.16b,v4.16b
290#endif
291	ext	v25.16b,v7.16b,v7.16b,#8
292	ext	v24.16b,v6.16b,v6.16b,#8
293	ext	v23.16b,v5.16b,v5.16b,#8
294
	//products of the last three blocks of the group are summed into
	//v29 (lo), v31 (hi) and v30 (middle); the first block (v4) is
	//combined with Xi later, in .Loop4x or .Ltail4x
295	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
296	eor	v7.16b,v7.16b,v25.16b
297	pmull2	v31.1q,v20.2d,v25.2d
298	pmull	v30.1q,v21.1d,v7.1d
299
300	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
301	eor	v6.16b,v6.16b,v24.16b
302	pmull2	v24.1q,v22.2d,v24.2d
303	pmull2	v6.1q,v21.2d,v6.2d
304
305	eor	v29.16b,v29.16b,v16.16b
306	eor	v31.16b,v31.16b,v24.16b
307	eor	v30.16b,v30.16b,v6.16b
308
309	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
310	eor	v5.16b,v5.16b,v23.16b
311	pmull2	v23.1q,v26.2d,v23.2d
312	pmull	v5.1q,v27.1d,v5.1d
313
314	eor	v29.16b,v29.16b,v7.16b
315	eor	v31.16b,v31.16b,v23.16b
316	eor	v30.16b,v30.16b,v5.16b
317
	//64 bytes are already loaded; enter the loop only if at least 64
	//more bytes follow (x3 was >= 128)
318	subs	x3,x3,#128
319	b.lo	.Ltail4x
320
321	b	.Loop4x
322
323.align	4
	//each iteration finishes the pending group (Xi^I[0] times H^4 plus
	//the pre-summed products), reduces it, and starts the next group
324.Loop4x:
325	eor	v16.16b,v4.16b,v0.16b
326	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
327	ext	v3.16b,v16.16b,v16.16b,#8
328#ifndef __AARCH64EB__
329	rev64	v5.16b,v5.16b
330	rev64	v6.16b,v6.16b
331	rev64	v7.16b,v7.16b
332	rev64	v4.16b,v4.16b
333#endif
334
335	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
336	eor	v16.16b,v16.16b,v3.16b
337	pmull2	v2.1q,v28.2d,v3.2d
338	ext	v25.16b,v7.16b,v7.16b,#8
339	pmull2	v1.1q,v27.2d,v16.2d
340
341	eor	v0.16b,v0.16b,v29.16b
342	eor	v2.16b,v2.16b,v31.16b
343	ext	v24.16b,v6.16b,v6.16b,#8
344	eor	v1.16b,v1.16b,v30.16b
345	ext	v23.16b,v5.16b,v5.16b,#8
346
347	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
348	eor	v18.16b,v0.16b,v2.16b
349	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
350	eor	v7.16b,v7.16b,v25.16b
351	eor	v1.16b,v1.16b,v17.16b
352	pmull2	v31.1q,v20.2d,v25.2d
353	eor	v1.16b,v1.16b,v18.16b
354	pmull	v30.1q,v21.1d,v7.1d
355
356	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
357	ins	v2.d[0],v1.d[1]
358	ins	v1.d[1],v0.d[0]
359	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
360	eor	v6.16b,v6.16b,v24.16b
361	pmull2	v24.1q,v22.2d,v24.2d
362	eor	v0.16b,v1.16b,v18.16b
363	pmull2	v6.1q,v21.2d,v6.2d
364
365	eor	v29.16b,v29.16b,v16.16b
366	eor	v31.16b,v31.16b,v24.16b
367	eor	v30.16b,v30.16b,v6.16b
368
369	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
370	pmull	v0.1q,v0.1d,v19.1d
371	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
372	eor	v5.16b,v5.16b,v23.16b
373	eor	v18.16b,v18.16b,v2.16b
374	pmull2	v23.1q,v26.2d,v23.2d
375	pmull	v5.1q,v27.1d,v5.1d
376
377	eor	v0.16b,v0.16b,v18.16b
378	eor	v29.16b,v29.16b,v7.16b
379	eor	v31.16b,v31.16b,v23.16b
380	ext	v0.16b,v0.16b,v0.16b,#8
381	eor	v30.16b,v30.16b,v5.16b
382
383	subs	x3,x3,#64
384	b.hs	.Loop4x
385
	//finish the group that is still pending; its unreduced product is
	//left in v0/v1/v2 for the code that follows
386.Ltail4x:
387	eor	v16.16b,v4.16b,v0.16b
388	ext	v3.16b,v16.16b,v16.16b,#8
389
390	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
391	eor	v16.16b,v16.16b,v3.16b
392	pmull2	v2.1q,v28.2d,v3.2d
393	pmull2	v1.1q,v27.2d,v16.2d
394
395	eor	v0.16b,v0.16b,v29.16b
396	eor	v2.16b,v2.16b,v31.16b
397	eor	v1.16b,v1.16b,v30.16b
398
	//x3 becomes the number of bytes not yet loaded: zero goes straight
	//to .Ldone4x, below 32 to .Lone, exactly 32 to .Ltwo, otherwise
	//execution falls through to .Lthree
399	adds	x3,x3,#64
400	b.eq	.Ldone4x
401
402	cmp	x3,#32
403	b.lo	.Lone
404	b.eq	.Ltwo
	//three more blocks: reduce the pending product, then form
	//H^3*(Xi^I[0]) + H^2*I[1] + H*I[2], left unreduced for .Ldone4x
405.Lthree:
406	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
407	eor	v18.16b,v0.16b,v2.16b
408	eor	v1.16b,v1.16b,v17.16b
409	ld1	{v4.2d,v5.2d,v6.2d},[x2]
410	eor	v1.16b,v1.16b,v18.16b
411#ifndef	__AARCH64EB__
412	rev64	v5.16b,v5.16b
413	rev64	v6.16b,v6.16b
414	rev64	v4.16b,v4.16b
415#endif
416
417	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
418	ins	v2.d[0],v1.d[1]
419	ins	v1.d[1],v0.d[0]
420	ext	v24.16b,v6.16b,v6.16b,#8
421	ext	v23.16b,v5.16b,v5.16b,#8
422	eor	v0.16b,v1.16b,v18.16b
423
424	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
425	eor	v6.16b,v6.16b,v24.16b
426
427	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
428	pmull	v0.1q,v0.1d,v19.1d
429	eor	v18.16b,v18.16b,v2.16b
430	pmull2	v31.1q,v20.2d,v24.2d
431	pmull	v30.1q,v21.1d,v6.1d
432	eor	v0.16b,v0.16b,v18.16b
433	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
434	eor	v5.16b,v5.16b,v23.16b
435	ext	v0.16b,v0.16b,v0.16b,#8
436
437	pmull2	v23.1q,v22.2d,v23.2d
438	eor	v16.16b,v4.16b,v0.16b
439	pmull2	v5.1q,v21.2d,v5.2d
440	ext	v3.16b,v16.16b,v16.16b,#8
441
442	eor	v29.16b,v29.16b,v7.16b
443	eor	v31.16b,v31.16b,v23.16b
444	eor	v30.16b,v30.16b,v5.16b
445
446	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
447	eor	v16.16b,v16.16b,v3.16b
448	pmull2	v2.1q,v26.2d,v3.2d
449	pmull	v1.1q,v27.1d,v16.1d
450
451	eor	v0.16b,v0.16b,v29.16b
452	eor	v2.16b,v2.16b,v31.16b
453	eor	v1.16b,v1.16b,v30.16b
454	b	.Ldone4x
455
456.align	4
	//two more blocks: reduce the pending product, then form
	//H^2*(Xi^I[0]) + H*I[1], left unreduced for .Ldone4x
457.Ltwo:
458	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
459	eor	v18.16b,v0.16b,v2.16b
460	eor	v1.16b,v1.16b,v17.16b
461	ld1	{v4.2d,v5.2d},[x2]
462	eor	v1.16b,v1.16b,v18.16b
463#ifndef	__AARCH64EB__
464	rev64	v5.16b,v5.16b
465	rev64	v4.16b,v4.16b
466#endif
467
468	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
469	ins	v2.d[0],v1.d[1]
470	ins	v1.d[1],v0.d[0]
471	ext	v23.16b,v5.16b,v5.16b,#8
472	eor	v0.16b,v1.16b,v18.16b
473
474	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
475	pmull	v0.1q,v0.1d,v19.1d
476	eor	v18.16b,v18.16b,v2.16b
477	eor	v0.16b,v0.16b,v18.16b
478	ext	v0.16b,v0.16b,v0.16b,#8
479
480	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
481	eor	v5.16b,v5.16b,v23.16b
482
483	eor	v16.16b,v4.16b,v0.16b
484	ext	v3.16b,v16.16b,v16.16b,#8
485
486	pmull2	v31.1q,v20.2d,v23.2d
487	pmull	v30.1q,v21.1d,v5.1d
488
489	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
490	eor	v16.16b,v16.16b,v3.16b
491	pmull2	v2.1q,v22.2d,v3.2d
492	pmull2	v1.1q,v21.2d,v16.2d
493
494	eor	v0.16b,v0.16b,v29.16b
495	eor	v2.16b,v2.16b,v31.16b
496	eor	v1.16b,v1.16b,v30.16b
497	b	.Ldone4x
498
499.align	4
	//one more block: reduce the pending product, then form
	//H*(Xi^I[0]) and fall through to .Ldone4x
500.Lone:
501	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
502	eor	v18.16b,v0.16b,v2.16b
503	eor	v1.16b,v1.16b,v17.16b
504	ld1	{v4.2d},[x2]
505	eor	v1.16b,v1.16b,v18.16b
506#ifndef	__AARCH64EB__
507	rev64	v4.16b,v4.16b
508#endif
509
510	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
511	ins	v2.d[0],v1.d[1]
512	ins	v1.d[1],v0.d[0]
513	eor	v0.16b,v1.16b,v18.16b
514
515	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
516	pmull	v0.1q,v0.1d,v19.1d
517	eor	v18.16b,v18.16b,v2.16b
518	eor	v0.16b,v0.16b,v18.16b
519	ext	v0.16b,v0.16b,v0.16b,#8
520
521	eor	v16.16b,v4.16b,v0.16b
522	ext	v3.16b,v16.16b,v16.16b,#8
523
524	pmull	v0.1q,v20.1d,v3.1d
525	eor	v16.16b,v16.16b,v3.16b
526	pmull2	v2.1q,v20.2d,v3.2d
527	pmull	v1.1q,v21.1d,v16.1d
528
	//common exit: final Karatsuba fix-up and reduction of v0/v1/v2, then
	//store the result back to Xi
529.Ldone4x:
530	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
531	eor	v18.16b,v0.16b,v2.16b
532	eor	v1.16b,v1.16b,v17.16b
533	eor	v1.16b,v1.16b,v18.16b
534
535	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
536	ins	v2.d[0],v1.d[1]
537	ins	v1.d[1],v0.d[0]
538	eor	v0.16b,v1.16b,v18.16b
539
540	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
541	pmull	v0.1q,v0.1d,v19.1d
542	eor	v18.16b,v18.16b,v2.16b
543	eor	v0.16b,v0.16b,v18.16b
544	ext	v0.16b,v0.16b,v0.16b,#8
545
546#ifndef __AARCH64EB__
547	rev64	v0.16b,v0.16b
548#endif
549	st1	{v0.2d},[x0]		//write out Xi
550
551	ret
552.size	gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
// NUL-terminated ASCII identification string; the bytes below spell
//   "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
553.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
554.align	2
555.align	2
556#endif
557