xref: /freebsd/sys/crypto/openssl/aarch64/ghashv8-armx.S (revision 4fbb9c43aa44d9145151bb5f77d302ba01fb7551)
1/* Do not modify. This file is auto-generated from ghashv8-armx.pl. */
2#include "arm_arch.h"
3
4#if __ARM_MAX_ARCH__>=7
5.arch	armv8-a+crypto
6.text
//
// gcm_init_v8 -- build the GHASH key table from the hash key H.
//
// In:   x0 = Htable (destination); six 16-byte entries, 96 bytes, are stored
//       x1 = H (source); 16 bytes are loaded
// Out:  Htable[0] = twisted H
//       Htable[1] = packed Karatsuba values: low half  = H.lo^H.hi,
//                                            high half = H^2.lo^H^2.hi
//       Htable[2] = H^2
//       Htable[3] = H^3
//       Htable[4] = packed Karatsuba values for H^3 (low) and H^4 (high)
//       Htable[5] = H^4
// Clobbers: x0 (advanced by 48), v0-v7, v16-v22.  v8-v15 are not touched.
//
7.globl	gcm_init_v8
8.type	gcm_init_v8,%function
9.align	4
10gcm_init_v8:
11	ld1	{v17.2d},[x1]		//load input H
	// 0xe1 in every byte, shifted left by 57 within each 64-bit lane,
	// leaves 0xc200000000000000 in both lanes of v19.
12	movi	v19.16b,#0xe1
13	shl	v19.2d,v19.2d,#57		//0xc2.0
14	ext	v3.16b,v17.16b,v17.16b,#8		//v3 = H with its 64-bit halves swapped
15	ushr	v18.2d,v19.2d,#63		//1 in each lane
16	dup	v17.4s,v17.s[1]
17	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
18	ushr	v18.2d,v3.2d,#63		//top bit of each half of v3
19	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
20	and	v18.16b,v18.16b,v16.16b		//keep only the bit leaving the low half
21	shl	v3.2d,v3.2d,#1
22	ext	v18.16b,v18.16b,v18.16b,#8		//move that bit to the high half
23	and	v16.16b,v16.16b,v17.16b		//t0 if the broadcast bit was set, else 0
24	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
25	eor	v20.16b,v3.16b,v16.16b		//twisted H
26	st1	{v20.2d},[x0],#16		//store Htable[0]
27
28	//calculate H^2
	// Karatsuba squaring: v0 = lo·lo, v2 = hi·hi, v1 = (lo^hi)·(lo^hi).
	// v16 ends up with H.lo^H.hi in both halves.
29	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
30	pmull	v0.1q,v20.1d,v20.1d
31	eor	v16.16b,v16.16b,v20.16b
32	pmull2	v2.1q,v20.2d,v20.2d
33	pmull	v1.1q,v16.1d,v16.1d
34
35	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
36	eor	v18.16b,v0.16b,v2.16b
37	eor	v1.16b,v1.16b,v17.16b
38	eor	v1.16b,v1.16b,v18.16b
	// Two-phase reduction of the 256-bit product (v2:v0 with the middle
	// term v1 folded in) to 128 bits, using the 0xc2.0 constant in v19.
39	pmull	v18.1q,v0.1d,v19.1d		//1st phase
40
41	ins	v2.d[0],v1.d[1]
42	ins	v1.d[1],v0.d[0]
43	eor	v0.16b,v1.16b,v18.16b
44
45	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
46	pmull	v0.1q,v0.1d,v19.1d
47	eor	v18.16b,v18.16b,v2.16b
48	eor	v22.16b,v0.16b,v18.16b		//H^2
49
50	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
51	eor	v17.16b,v17.16b,v22.16b		//H^2.lo^H^2.hi in both halves
52	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
53	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
54	//calculate H^3 and H^4
	// Two independent multiplications are interleaved:
	//   H^3 = H·H^2   uses v0 (lo), v1 (mid), v2 (hi), v16/v18 as temporaries
	//   H^4 = H^2·H^2 uses v5 (lo), v6 (mid), v7 (hi), v17/v4  as temporaries
55	pmull	v0.1q,v20.1d, v22.1d
56	pmull	v5.1q,v22.1d,v22.1d
57	pmull2	v2.1q,v20.2d, v22.2d
58	pmull2	v7.1q,v22.2d,v22.2d
59	pmull	v1.1q,v16.1d,v17.1d
60	pmull	v6.1q,v17.1d,v17.1d
61
62	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
63	ext	v17.16b,v5.16b,v7.16b,#8
64	eor	v18.16b,v0.16b,v2.16b
65	eor	v1.16b,v1.16b,v16.16b
66	eor	v4.16b,v5.16b,v7.16b
67	eor	v6.16b,v6.16b,v17.16b
68	eor	v1.16b,v1.16b,v18.16b
69	pmull	v18.1q,v0.1d,v19.1d		//1st phase
70	eor	v6.16b,v6.16b,v4.16b
71	pmull	v4.1q,v5.1d,v19.1d
72
73	ins	v2.d[0],v1.d[1]
74	ins	v7.d[0],v6.d[1]
75	ins	v1.d[1],v0.d[0]
76	ins	v6.d[1],v5.d[0]
77	eor	v0.16b,v1.16b,v18.16b
78	eor	v5.16b,v6.16b,v4.16b
79
80	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
81	ext	v4.16b,v5.16b,v5.16b,#8
82	pmull	v0.1q,v0.1d,v19.1d
83	pmull	v5.1q,v5.1d,v19.1d
84	eor	v18.16b,v18.16b,v2.16b
85	eor	v4.16b,v4.16b,v7.16b
86	eor	v20.16b, v0.16b,v18.16b		//H^3
87	eor	v22.16b,v5.16b,v4.16b		//H^4
88
	// Same packing as for Htable[1]: low half of v21 belongs to H^3,
	// high half to H^4.
89	ext	v16.16b,v20.16b, v20.16b,#8		//Karatsuba pre-processing
90	ext	v17.16b,v22.16b,v22.16b,#8
91	eor	v16.16b,v16.16b,v20.16b
92	eor	v17.16b,v17.16b,v22.16b
93	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
94	st1	{v20.2d,v21.2d,v22.2d},[x0]		//store Htable[3..5]
95	ret
96.size	gcm_init_v8,.-gcm_init_v8
//
// gcm_gmult_v8 -- multiply Xi by H in place (a single GHASH multiplication).
//
// In:   x0 = Xi; 16 bytes are loaded and the 16-byte result is stored back
//       x1 = Htable; the first 32 bytes are loaded (twisted H in v20 and the
//            packed Karatsuba entry in v21)
// Clobbers: v0-v3, v17-v21.  No general-purpose register is modified.
//
// On little-endian builds (no __ARMEB__) Xi is byte-reversed within each
// 64-bit half on load and again before the store.
//
97.globl	gcm_gmult_v8
98.type	gcm_gmult_v8,%function
99.align	4
100gcm_gmult_v8:
101	ld1	{v17.2d},[x0]		//load Xi
102	movi	v19.16b,#0xe1
103	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
104	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
105#ifndef __ARMEB__
106	rev64	v17.16b,v17.16b
107#endif
108	ext	v3.16b,v17.16b,v17.16b,#8		//swap halves of Xi
109
	// Karatsuba multiplication: three 64x64 carry-less products.
110	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
111	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
112	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
113	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
114
115	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
116	eor	v18.16b,v0.16b,v2.16b
117	eor	v1.16b,v1.16b,v17.16b
118	eor	v1.16b,v1.16b,v18.16b
119	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
120
121	ins	v2.d[0],v1.d[1]
122	ins	v1.d[1],v0.d[0]
123	eor	v0.16b,v1.16b,v18.16b
124
125	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
126	pmull	v0.1q,v0.1d,v19.1d
127	eor	v18.16b,v18.16b,v2.16b
128	eor	v0.16b,v0.16b,v18.16b
129
	// Undo the load-time byte reversal and half swap before storing.
130#ifndef __ARMEB__
131	rev64	v0.16b,v0.16b
132#endif
133	ext	v0.16b,v0.16b,v0.16b,#8
134	st1	{v0.2d},[x0]		//write out Xi
135
136	ret
137.size	gcm_gmult_v8,.-gcm_gmult_v8
//
// gcm_ghash_v8 -- fold a run of 16-byte input blocks into the hash value Xi.
//
// In:   x0 = Xi; 16 bytes loaded, result stored back to the same address
//       x1 = Htable; H, the packed Karatsuba entry and H^2 are loaded here
//            (48 bytes); the 4x path additionally reads the next 48 bytes
//       x2 = inp, pointer to the input data
//       x3 = len, byte count
// Clobbers: x1, x2, x3, x12, condition flags, v0-v7, v16-v22.
//
// Lengths of 64 bytes or more are handed to .Lgcm_ghash_v8_4x.  Shorter input
// is handled here: a loop that consumes two blocks (32 bytes) per iteration
// using H^2 and H, followed by a single-block tail using H.
//
// NOTE(review): the length is presumably a nonzero multiple of 16 -- confirm
// against the callers.  With x3 == 0 the first block is still loaded and
// processed, because the load of I[0] is unconditional and b.lo falls into
// .Lodd_tail_v8.
//
138.globl	gcm_ghash_v8
139.type	gcm_ghash_v8,%function
140.align	4
141gcm_ghash_v8:
142	cmp	x3,#64
143	b.hs	.Lgcm_ghash_v8_4x		//64 bytes or more: use the 4-block path
144	ld1	{v0.2d},[x0]		//load [rotated] Xi
145						//"[rotated]" means that
146						//loaded value would have
147						//to be rotated in order to
148						//make it appear as in
149						//algorithm specification
150	subs	x3,x3,#32		//see if x3 is 32 or larger
151	mov	x12,#16		//x12 is used as post-
152						//increment for input pointer;
153						//as loop is modulo-scheduled
154						//x12 is zeroed just in time
155						//to preclude overstepping
156						//inp[len], which means that
157						//last block[s] are actually
158						//loaded twice, but last
159						//copy is not processed
160	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
161	movi	v19.16b,#0xe1
162	ld1	{v22.2d},[x1]
163	csel	x12,xzr,x12,eq			//is it time to zero x12?
164	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
165	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
166	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
167#ifndef __ARMEB__
168	rev64	v16.16b,v16.16b
169	rev64	v0.16b,v0.16b
170#endif
171	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
	// Flags still come from the subs above: none of the loads, vector
	// operations or the csel in between write the condition flags.
172	b.lo	.Lodd_tail_v8		//x3 was less than 32
173	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
174#ifndef __ARMEB__
175	rev64	v17.16b,v17.16b
176#endif
	// Loop prologue: start the H·I[1] product (v4 = lo, v6 = hi, v17 is
	// prepared for the middle term) before entering the loop.
177	ext	v7.16b,v17.16b,v17.16b,#8
178	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
179	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
180	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
181	pmull2	v6.1q,v20.2d,v7.2d
182	b	.Loop_mod2x_v8
183
184.align	4
	// Each iteration finishes H^2·(Xi^I[i]) + H·I[i+1], reduces it, and at
	// the same time loads the next two blocks and starts H·I[i+3].
185.Loop_mod2x_v8:
186	ext	v18.16b,v3.16b,v3.16b,#8
187	subs	x3,x3,#32		//is there more data?
188	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
189	csel	x12,xzr,x12,lo			//is it time to zero x12?
190
191	pmull	v5.1q,v21.1d,v17.1d		//middle product for H·Ii+1
192	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
193	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
194	eor	v0.16b,v0.16b,v4.16b		//accumulate
195	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
196	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]
197
198	eor	v2.16b,v2.16b,v6.16b
199	csel	x12,xzr,x12,eq			//is it time to zero x12?
200	eor	v1.16b,v1.16b,v5.16b
201
202	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
203	eor	v18.16b,v0.16b,v2.16b
204	eor	v1.16b,v1.16b,v17.16b
205	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
206#ifndef __ARMEB__
207	rev64	v16.16b,v16.16b
208#endif
209	eor	v1.16b,v1.16b,v18.16b
210	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
211
212#ifndef __ARMEB__
213	rev64	v17.16b,v17.16b
214#endif
215	ins	v2.d[0],v1.d[1]
216	ins	v1.d[1],v0.d[0]
217	ext	v7.16b,v17.16b,v17.16b,#8
218	ext	v3.16b,v16.16b,v16.16b,#8
219	eor	v0.16b,v1.16b,v18.16b
220	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
221	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early
222
223	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
224	pmull	v0.1q,v0.1d,v19.1d
225	eor	v3.16b,v3.16b,v18.16b
226	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
227	eor	v3.16b,v3.16b,v0.16b
228	pmull2	v6.1q,v20.2d,v7.2d
229	b.hs	.Loop_mod2x_v8		//there was at least 32 more bytes
230
	// Loop exit: the reduced Xi was folded straight into v3 for the next
	// iteration, so rebuild v0 (Xi) and v3 (rotated I[i]) separately.
231	eor	v2.16b,v2.16b,v18.16b
232	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
233	adds	x3,x3,#32		//re-construct x3
234	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
235	b.eq	.Ldone_v8		//is x3 zero?
	// One block left: Xi = (Xi ^ I)·H, same sequence as gcm_gmult_v8.
236.Lodd_tail_v8:
237	ext	v18.16b,v0.16b,v0.16b,#8
238	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
239	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi
240
241	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
242	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
243	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
244	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
245
246	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
247	eor	v18.16b,v0.16b,v2.16b
248	eor	v1.16b,v1.16b,v17.16b
249	eor	v1.16b,v1.16b,v18.16b
250	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
251
252	ins	v2.d[0],v1.d[1]
253	ins	v1.d[1],v0.d[0]
254	eor	v0.16b,v1.16b,v18.16b
255
256	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
257	pmull	v0.1q,v0.1d,v19.1d
258	eor	v18.16b,v18.16b,v2.16b
259	eor	v0.16b,v0.16b,v18.16b
260
	// Convert Xi back to its in-memory form and store it.
261.Ldone_v8:
262#ifndef __ARMEB__
263	rev64	v0.16b,v0.16b
264#endif
265	ext	v0.16b,v0.16b,v0.16b,#8
266	st1	{v0.2d},[x0]		//write out Xi
267
268	ret
269.size	gcm_ghash_v8,.-gcm_ghash_v8
//
// gcm_ghash_v8_4x -- four-blocks-per-iteration body of gcm_ghash_v8.
//
// No .globl directive is emitted for this symbol here; it is reached by the
// branch to .Lgcm_ghash_v8_4x at the top of gcm_ghash_v8 when x3 >= 64, and
// returns directly to the caller of gcm_ghash_v8.
//
// In:   x0 = Xi (in/out), x1 = Htable, x2 = inp, x3 = len in bytes (>= 64 on
//       the path from gcm_ghash_v8)
// Clobbers: x1 (advanced by 48), x2, x3, condition flags, v0-v7, v16-v31.
//       v8-v15 are not touched.
//
// Register roles:
//   v20, v21, v22 = H, packed Karatsuba entry for H/H^2, H^2
//   v26, v27, v28 = H^3, packed Karatsuba entry for H^3/H^4, H^4
//   v19           = 0xc2.0 reduction constant
//   v4..v7        = the four input blocks of the current group
//   v29, v30, v31 = accumulated low / middle / high partial products of the
//                   last three blocks of the group (times H^3, H^2, H)
//   v0, v1, v2    = low / middle / high product being completed and reduced
//
270.type	gcm_ghash_v8_4x,%function
271.align	4
272gcm_ghash_v8_4x:
273.Lgcm_ghash_v8_4x:
274	ld1	{v0.2d},[x0]		//load [rotated] Xi
275	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
276	movi	v19.16b,#0xe1
277	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
278	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
279
280	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64	//load first four blocks
281#ifndef __ARMEB__
282	rev64	v0.16b,v0.16b
283	rev64	v5.16b,v5.16b
284	rev64	v6.16b,v6.16b
285	rev64	v7.16b,v7.16b
286	rev64	v4.16b,v4.16b
287#endif
288	ext	v25.16b,v7.16b,v7.16b,#8
289	ext	v24.16b,v6.16b,v6.16b,#8
290	ext	v23.16b,v5.16b,v5.16b,#8
291
	// Prologue: partial products of blocks 1..3 of the first group, which
	// do not depend on Xi.  v4 (block 0) is kept for the Xi-dependent term.
292	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
293	eor	v7.16b,v7.16b,v25.16b
294	pmull2	v31.1q,v20.2d,v25.2d
295	pmull	v30.1q,v21.1d,v7.1d
296
297	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
298	eor	v6.16b,v6.16b,v24.16b
299	pmull2	v24.1q,v22.2d,v24.2d
300	pmull2	v6.1q,v21.2d,v6.2d
301
302	eor	v29.16b,v29.16b,v16.16b
303	eor	v31.16b,v31.16b,v24.16b
304	eor	v30.16b,v30.16b,v6.16b
305
306	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
307	eor	v5.16b,v5.16b,v23.16b
308	pmull2	v23.1q,v26.2d,v23.2d
309	pmull	v5.1q,v27.1d,v5.1d
310
311	eor	v29.16b,v29.16b,v7.16b
312	eor	v31.16b,v31.16b,v23.16b
313	eor	v30.16b,v30.16b,v5.16b
314
	// The loop loads a further 64 bytes per pass, so it is entered only
	// when at least 128 bytes were supplied; otherwise go finish this group.
315	subs	x3,x3,#128
316	b.lo	.Ltail4x
317
318	b	.Loop4x
319
320.align	4
	// Each pass completes the current group (H^4·(Xi^I) plus v29..v31),
	// reduces it to the new Xi, and interleaves the Xi-independent
	// partial products of the next group, which is loaded at the top.
321.Loop4x:
322	eor	v16.16b,v4.16b,v0.16b		//Xi ^ first block of the group
323	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
324	ext	v3.16b,v16.16b,v16.16b,#8
325#ifndef __ARMEB__
326	rev64	v5.16b,v5.16b
327	rev64	v6.16b,v6.16b
328	rev64	v7.16b,v7.16b
329	rev64	v4.16b,v4.16b
330#endif
331
332	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
333	eor	v16.16b,v16.16b,v3.16b
334	pmull2	v2.1q,v28.2d,v3.2d
335	ext	v25.16b,v7.16b,v7.16b,#8
336	pmull2	v1.1q,v27.2d,v16.2d
337
338	eor	v0.16b,v0.16b,v29.16b
339	eor	v2.16b,v2.16b,v31.16b
340	ext	v24.16b,v6.16b,v6.16b,#8
341	eor	v1.16b,v1.16b,v30.16b
342	ext	v23.16b,v5.16b,v5.16b,#8
343
344	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
345	eor	v18.16b,v0.16b,v2.16b
346	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
347	eor	v7.16b,v7.16b,v25.16b
348	eor	v1.16b,v1.16b,v17.16b
349	pmull2	v31.1q,v20.2d,v25.2d
350	eor	v1.16b,v1.16b,v18.16b
351	pmull	v30.1q,v21.1d,v7.1d
352
353	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
354	ins	v2.d[0],v1.d[1]
355	ins	v1.d[1],v0.d[0]
356	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
357	eor	v6.16b,v6.16b,v24.16b
358	pmull2	v24.1q,v22.2d,v24.2d
359	eor	v0.16b,v1.16b,v18.16b
360	pmull2	v6.1q,v21.2d,v6.2d
361
362	eor	v29.16b,v29.16b,v16.16b
363	eor	v31.16b,v31.16b,v24.16b
364	eor	v30.16b,v30.16b,v6.16b
365
366	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
367	pmull	v0.1q,v0.1d,v19.1d
368	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
369	eor	v5.16b,v5.16b,v23.16b
370	eor	v18.16b,v18.16b,v2.16b
371	pmull2	v23.1q,v26.2d,v23.2d
372	pmull	v5.1q,v27.1d,v5.1d
373
374	eor	v0.16b,v0.16b,v18.16b
375	eor	v29.16b,v29.16b,v7.16b
376	eor	v31.16b,v31.16b,v23.16b
377	ext	v0.16b,v0.16b,v0.16b,#8
378	eor	v30.16b,v30.16b,v5.16b
379
380	subs	x3,x3,#64
381	b.hs	.Loop4x		//at least 64 more bytes after this group
382
	// Finish the last full group: v0/v1/v2 receive the unreduced
	// low/middle/high product.  The reduction is deferred so that it can
	// be shared with, or interleaved into, the handling of leftover blocks.
383.Ltail4x:
384	eor	v16.16b,v4.16b,v0.16b
385	ext	v3.16b,v16.16b,v16.16b,#8
386
387	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
388	eor	v16.16b,v16.16b,v3.16b
389	pmull2	v2.1q,v28.2d,v3.2d
390	pmull2	v1.1q,v27.2d,v16.2d
391
392	eor	v0.16b,v0.16b,v29.16b
393	eor	v2.16b,v2.16b,v31.16b
394	eor	v1.16b,v1.16b,v30.16b
395
	// x3 is now the leftover byte count (0, 16, 32 or 48 for lengths that
	// are a multiple of 16): 0 -> .Ldone4x, below 32 -> .Lone,
	// equal to 32 -> .Ltwo, otherwise fall through to .Lthree.
396	adds	x3,x3,#64
397	b.eq	.Ldone4x
398
399	cmp	x3,#32
400	b.lo	.Lone
401	b.eq	.Ltwo
	// Three leftover blocks: reduce the pending product to Xi, then form
	// H^3·(Xi^I0) + H^2·I1 + H·I2 unreduced and let .Ldone4x reduce it.
402.Lthree:
403	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
404	eor	v18.16b,v0.16b,v2.16b
405	eor	v1.16b,v1.16b,v17.16b
406	ld1	{v4.2d,v5.2d,v6.2d},[x2]
407	eor	v1.16b,v1.16b,v18.16b
408#ifndef	__ARMEB__
409	rev64	v5.16b,v5.16b
410	rev64	v6.16b,v6.16b
411	rev64	v4.16b,v4.16b
412#endif
413
414	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
415	ins	v2.d[0],v1.d[1]
416	ins	v1.d[1],v0.d[0]
417	ext	v24.16b,v6.16b,v6.16b,#8
418	ext	v23.16b,v5.16b,v5.16b,#8
419	eor	v0.16b,v1.16b,v18.16b
420
421	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
422	eor	v6.16b,v6.16b,v24.16b
423
424	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
425	pmull	v0.1q,v0.1d,v19.1d
426	eor	v18.16b,v18.16b,v2.16b
427	pmull2	v31.1q,v20.2d,v24.2d
428	pmull	v30.1q,v21.1d,v6.1d
429	eor	v0.16b,v0.16b,v18.16b
430	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
431	eor	v5.16b,v5.16b,v23.16b
432	ext	v0.16b,v0.16b,v0.16b,#8
433
434	pmull2	v23.1q,v22.2d,v23.2d
435	eor	v16.16b,v4.16b,v0.16b
436	pmull2	v5.1q,v21.2d,v5.2d
437	ext	v3.16b,v16.16b,v16.16b,#8
438
439	eor	v29.16b,v29.16b,v7.16b
440	eor	v31.16b,v31.16b,v23.16b
441	eor	v30.16b,v30.16b,v5.16b
442
443	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
444	eor	v16.16b,v16.16b,v3.16b
445	pmull2	v2.1q,v26.2d,v3.2d
446	pmull	v1.1q,v27.1d,v16.1d
447
448	eor	v0.16b,v0.16b,v29.16b
449	eor	v2.16b,v2.16b,v31.16b
450	eor	v1.16b,v1.16b,v30.16b
451	b	.Ldone4x
452
453.align	4
	// Two leftover blocks: reduce the pending product to Xi, then form
	// H^2·(Xi^I0) + H·I1 unreduced and let .Ldone4x reduce it.
454.Ltwo:
455	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
456	eor	v18.16b,v0.16b,v2.16b
457	eor	v1.16b,v1.16b,v17.16b
458	ld1	{v4.2d,v5.2d},[x2]
459	eor	v1.16b,v1.16b,v18.16b
460#ifndef	__ARMEB__
461	rev64	v5.16b,v5.16b
462	rev64	v4.16b,v4.16b
463#endif
464
465	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
466	ins	v2.d[0],v1.d[1]
467	ins	v1.d[1],v0.d[0]
468	ext	v23.16b,v5.16b,v5.16b,#8
469	eor	v0.16b,v1.16b,v18.16b
470
471	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
472	pmull	v0.1q,v0.1d,v19.1d
473	eor	v18.16b,v18.16b,v2.16b
474	eor	v0.16b,v0.16b,v18.16b
475	ext	v0.16b,v0.16b,v0.16b,#8
476
477	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
478	eor	v5.16b,v5.16b,v23.16b
479
480	eor	v16.16b,v4.16b,v0.16b
481	ext	v3.16b,v16.16b,v16.16b,#8
482
483	pmull2	v31.1q,v20.2d,v23.2d
484	pmull	v30.1q,v21.1d,v5.1d
485
486	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
487	eor	v16.16b,v16.16b,v3.16b
488	pmull2	v2.1q,v22.2d,v3.2d
489	pmull2	v1.1q,v21.2d,v16.2d
490
491	eor	v0.16b,v0.16b,v29.16b
492	eor	v2.16b,v2.16b,v31.16b
493	eor	v1.16b,v1.16b,v30.16b
494	b	.Ldone4x
495
496.align	4
	// One leftover block: reduce the pending product to Xi, then form
	// H·(Xi^I0) unreduced and fall through to .Ldone4x.
497.Lone:
498	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
499	eor	v18.16b,v0.16b,v2.16b
500	eor	v1.16b,v1.16b,v17.16b
501	ld1	{v4.2d},[x2]
502	eor	v1.16b,v1.16b,v18.16b
503#ifndef	__ARMEB__
504	rev64	v4.16b,v4.16b
505#endif
506
507	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
508	ins	v2.d[0],v1.d[1]
509	ins	v1.d[1],v0.d[0]
510	eor	v0.16b,v1.16b,v18.16b
511
512	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
513	pmull	v0.1q,v0.1d,v19.1d
514	eor	v18.16b,v18.16b,v2.16b
515	eor	v0.16b,v0.16b,v18.16b
516	ext	v0.16b,v0.16b,v0.16b,#8
517
518	eor	v16.16b,v4.16b,v0.16b
519	ext	v3.16b,v16.16b,v16.16b,#8
520
521	pmull	v0.1q,v20.1d,v3.1d		//H·(Xi+Ii)
522	eor	v16.16b,v16.16b,v3.16b
523	pmull2	v2.1q,v20.2d,v3.2d
524	pmull	v1.1q,v21.1d,v16.1d
525
	// Common exit: final Karatsuba post-processing and reduction of
	// v0/v1/v2, then convert Xi back to its in-memory form and store it.
526.Ldone4x:
527	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
528	eor	v18.16b,v0.16b,v2.16b
529	eor	v1.16b,v1.16b,v17.16b
530	eor	v1.16b,v1.16b,v18.16b
531
532	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
533	ins	v2.d[0],v1.d[1]
534	ins	v1.d[1],v0.d[0]
535	eor	v0.16b,v1.16b,v18.16b
536
537	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
538	pmull	v0.1q,v0.1d,v19.1d
539	eor	v18.16b,v18.16b,v2.16b
540	eor	v0.16b,v0.16b,v18.16b
541	ext	v0.16b,v0.16b,v0.16b,#8
542
543#ifndef __ARMEB__
544	rev64	v0.16b,v0.16b
545#endif
546	st1	{v0.2d},[x0]		//write out Xi
547
548	ret
549.size	gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
550.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
551.align	2
552.align	2
553#endif
554