xref: /freebsd/sys/crypto/openssl/aarch64/ghashv8-armx.S (revision 4757b351ea9d59d71d4a38b82506d2d16fcd560d)
1/* Do not modify. This file is auto-generated from ghashv8-armx.pl. */
2#include "arm_arch.h"
3
4#if __ARM_MAX_ARCH__>=7
5.arch	armv8-a+crypto
6.text
// gcm_init_v8: precompute the GHASH key table used by the routines below.
//   x0 = pointer to the output table (Htable); twelve 16-byte entries are
//        written and x0 is advanced by the post-indexed stores
//   x1 = pointer to the 16-byte input H
// Table layout, as stored by this routine:
//   [0] twisted H    [1] packed (lo^hi) Karatsuba terms of H and H^2    [2] H^2
//   [3] H^3          [4] packed (lo^hi) Karatsuba terms of H^3 and H^4  [5] H^4
//   [6] H^5          [7] packed (lo^hi) Karatsuba terms of H^5 and H^6  [8] H^6
//   [9] H^7          [10] packed (lo^hi) Karatsuba terms of H^7 and H^8 [11] H^8
// v19 holds the reduction constant (0xc2 in the top byte of each 64-bit lane)
// for the whole routine. Only v0-v7 and v16-v31 are written.
7.globl	gcm_init_v8
8.type	gcm_init_v8,%function
9.align	4
10gcm_init_v8:
11	AARCH64_VALID_CALL_TARGET
12	ld1	{v17.2d},[x1]		//load input H
13	movi	v19.16b,#0xe1
14	shl	v19.2d,v19.2d,#57		//each 64-bit lane becomes 0xc2<<56 (0xc2.0)
15	ext	v3.16b,v17.16b,v17.16b,#8
16	ushr	v18.2d,v19.2d,#63
17	dup	v17.4s,v17.s[1]
18	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
19	ushr	v18.2d,v3.2d,#63
20	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
21	and	v18.16b,v18.16b,v16.16b
22	shl	v3.2d,v3.2d,#1
23	ext	v18.16b,v18.16b,v18.16b,#8
24	and	v16.16b,v16.16b,v17.16b
25	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
26	eor	v20.16b,v3.16b,v16.16b		//twisted H
27	st1	{v20.2d},[x0],#16		//store Htable[0]
28
29	//calculate H^2 = H·H
30	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
31	pmull	v0.1q,v20.1d,v20.1d
32	eor	v16.16b,v16.16b,v20.16b
33	pmull2	v2.1q,v20.2d,v20.2d
34	pmull	v1.1q,v16.1d,v16.1d
35
36	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
37	eor	v18.16b,v0.16b,v2.16b
38	eor	v1.16b,v1.16b,v17.16b
39	eor	v1.16b,v1.16b,v18.16b
40	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
41
42	ins	v2.d[0],v1.d[1]
43	ins	v1.d[1],v0.d[0]
44	eor	v0.16b,v1.16b,v18.16b
45
46	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
47	pmull	v0.1q,v0.1d,v19.1d
48	eor	v18.16b,v18.16b,v2.16b
49	eor	v22.16b,v0.16b,v18.16b		//H^2
50
51	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
52	eor	v17.16b,v17.16b,v22.16b
53	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
54	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
55	//calculate H^3 = H·H^2 and H^4 = H^2·H^2 (two interleaved multiplications)
56	pmull	v0.1q,v20.1d, v22.1d
57	pmull	v5.1q,v22.1d,v22.1d
58	pmull2	v2.1q,v20.2d, v22.2d
59	pmull2	v7.1q,v22.2d,v22.2d
60	pmull	v1.1q,v16.1d,v17.1d
61	pmull	v6.1q,v17.1d,v17.1d
62
63	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
64	ext	v17.16b,v5.16b,v7.16b,#8
65	eor	v18.16b,v0.16b,v2.16b
66	eor	v1.16b,v1.16b,v16.16b
67	eor	v4.16b,v5.16b,v7.16b
68	eor	v6.16b,v6.16b,v17.16b
69	eor	v1.16b,v1.16b,v18.16b
70	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
71	eor	v6.16b,v6.16b,v4.16b
72	pmull	v4.1q,v5.1d,v19.1d
73
74	ins	v2.d[0],v1.d[1]
75	ins	v7.d[0],v6.d[1]
76	ins	v1.d[1],v0.d[0]
77	ins	v6.d[1],v5.d[0]
78	eor	v0.16b,v1.16b,v18.16b
79	eor	v5.16b,v6.16b,v4.16b
80
81	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
82	ext	v4.16b,v5.16b,v5.16b,#8
83	pmull	v0.1q,v0.1d,v19.1d
84	pmull	v5.1q,v5.1d,v19.1d
85	eor	v18.16b,v18.16b,v2.16b
86	eor	v4.16b,v4.16b,v7.16b
87	eor	v23.16b, v0.16b,v18.16b		//H^3
88	eor	v25.16b,v5.16b,v4.16b		//H^4
89
90	ext	v16.16b,v23.16b, v23.16b,#8		//Karatsuba pre-processing
91	ext	v17.16b,v25.16b,v25.16b,#8
92	ext	v18.16b,v22.16b,v22.16b,#8
93	eor	v16.16b,v16.16b,v23.16b
94	eor	v17.16b,v17.16b,v25.16b
95	eor	v18.16b,v18.16b,v22.16b		//v18 = lo^hi term of H^2, used as a multiplicand below
96	ext	v24.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
97	st1	{v23.2d,v24.2d,v25.2d},[x0],#48		//store Htable[3..5]
98
99	//calculate H^5 = H^2·H^3 and H^6 = H^3·H^3
100	pmull	v0.1q,v22.1d, v23.1d
101	pmull	v5.1q,v23.1d,v23.1d
102	pmull2	v2.1q,v22.2d, v23.2d
103	pmull2	v7.1q,v23.2d,v23.2d
104	pmull	v1.1q,v16.1d,v18.1d
105	pmull	v6.1q,v16.1d,v16.1d
106
107	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
108	ext	v17.16b,v5.16b,v7.16b,#8
109	eor	v18.16b,v0.16b,v2.16b
110	eor	v1.16b,v1.16b,v16.16b
111	eor	v4.16b,v5.16b,v7.16b
112	eor	v6.16b,v6.16b,v17.16b
113	eor	v1.16b,v1.16b,v18.16b
114	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
115	eor	v6.16b,v6.16b,v4.16b
116	pmull	v4.1q,v5.1d,v19.1d
117
118	ins	v2.d[0],v1.d[1]
119	ins	v7.d[0],v6.d[1]
120	ins	v1.d[1],v0.d[0]
121	ins	v6.d[1],v5.d[0]
122	eor	v0.16b,v1.16b,v18.16b
123	eor	v5.16b,v6.16b,v4.16b
124
125	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
126	ext	v4.16b,v5.16b,v5.16b,#8
127	pmull	v0.1q,v0.1d,v19.1d
128	pmull	v5.1q,v5.1d,v19.1d
129	eor	v18.16b,v18.16b,v2.16b
130	eor	v4.16b,v4.16b,v7.16b
131	eor	v26.16b,v0.16b,v18.16b		//H^5
132	eor	v28.16b,v5.16b,v4.16b		//H^6
133
134	ext	v16.16b,v26.16b, v26.16b,#8		//Karatsuba pre-processing
135	ext	v17.16b,v28.16b,v28.16b,#8
136	ext	v18.16b,v22.16b,v22.16b,#8
137	eor	v16.16b,v16.16b,v26.16b
138	eor	v17.16b,v17.16b,v28.16b
139	eor	v18.16b,v18.16b,v22.16b		//v18 = lo^hi term of H^2 again (v18 was reused as scratch above)
140	ext	v27.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
141	st1	{v26.2d,v27.2d,v28.2d},[x0],#48		//store Htable[6..8]
142
143	//calculate H^7 = H^2·H^5 and H^8 = H^2·H^6
144	pmull	v0.1q,v22.1d,v26.1d
145	pmull	v5.1q,v22.1d,v28.1d
146	pmull2	v2.1q,v22.2d,v26.2d
147	pmull2	v7.1q,v22.2d,v28.2d
148	pmull	v1.1q,v16.1d,v18.1d
149	pmull	v6.1q,v17.1d,v18.1d
150
151	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
152	ext	v17.16b,v5.16b,v7.16b,#8
153	eor	v18.16b,v0.16b,v2.16b
154	eor	v1.16b,v1.16b,v16.16b
155	eor	v4.16b,v5.16b,v7.16b
156	eor	v6.16b,v6.16b,v17.16b
157	eor	v1.16b,v1.16b,v18.16b
158	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
159	eor	v6.16b,v6.16b,v4.16b
160	pmull	v4.1q,v5.1d,v19.1d
161
162	ins	v2.d[0],v1.d[1]
163	ins	v7.d[0],v6.d[1]
164	ins	v1.d[1],v0.d[0]
165	ins	v6.d[1],v5.d[0]
166	eor	v0.16b,v1.16b,v18.16b
167	eor	v5.16b,v6.16b,v4.16b
168
169	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
170	ext	v4.16b,v5.16b,v5.16b,#8
171	pmull	v0.1q,v0.1d,v19.1d
172	pmull	v5.1q,v5.1d,v19.1d
173	eor	v18.16b,v18.16b,v2.16b
174	eor	v4.16b,v4.16b,v7.16b
175	eor	v29.16b,v0.16b,v18.16b		//H^7
176	eor	v31.16b,v5.16b,v4.16b		//H^8
177
178	ext	v16.16b,v29.16b,v29.16b,#8		//Karatsuba pre-processing
179	ext	v17.16b,v31.16b,v31.16b,#8
180	eor	v16.16b,v16.16b,v29.16b
181	eor	v17.16b,v17.16b,v31.16b
182	ext	v30.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
183	st1	{v29.2d,v30.2d,v31.2d},[x0]		//store Htable[9..11]
184	ret
185.size	gcm_init_v8,.-gcm_init_v8
// gcm_gmult_v8: multiply the running hash value Xi by H, in place.
//   x0 = pointer to the 16-byte Xi (loaded, then overwritten with the result)
//   x1 = pointer to the key table; only the first two 16-byte entries are
//        loaded (twisted H into v20, packed Karatsuba term into v21)
// On little-endian builds Xi is byte-reversed per 64-bit lane on load and on
// store. Only v0-v3 and v17-v21 are written; x0 and x1 are not modified.
186.globl	gcm_gmult_v8
187.type	gcm_gmult_v8,%function
188.align	4
189gcm_gmult_v8:
190	AARCH64_VALID_CALL_TARGET
191	ld1	{v17.2d},[x0]		//load Xi
192	movi	v19.16b,#0xe1
193	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
194	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
195#ifndef __AARCH64EB__
196	rev64	v17.16b,v17.16b
197#endif
198	ext	v3.16b,v17.16b,v17.16b,#8		//swap the two 64-bit halves of Xi
199
200	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
201	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
202	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
203	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
204
205	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
206	eor	v18.16b,v0.16b,v2.16b
207	eor	v1.16b,v1.16b,v17.16b
208	eor	v1.16b,v1.16b,v18.16b
209	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
210
211	ins	v2.d[0],v1.d[1]
212	ins	v1.d[1],v0.d[0]
213	eor	v0.16b,v1.16b,v18.16b
214
215	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
216	pmull	v0.1q,v0.1d,v19.1d
217	eor	v18.16b,v18.16b,v2.16b
218	eor	v0.16b,v0.16b,v18.16b
219
220#ifndef __AARCH64EB__
221	rev64	v0.16b,v0.16b
222#endif
223	ext	v0.16b,v0.16b,v0.16b,#8		//swap halves back before storing
224	st1	{v0.2d},[x0]		//write out Xi
225
226	ret
227.size	gcm_gmult_v8,.-gcm_gmult_v8
// gcm_ghash_v8: fold a run of 16-byte input blocks into the hash value Xi.
//   x0 = pointer to the 16-byte Xi (loaded, updated, written back)
//   x1 = pointer to the key table (entries for H, the packed term and H^2 are
//        loaded on this path)
//   x2 = pointer to the input data
//   x3 = input length in bytes
// Inputs of 64 bytes or more branch to .Lgcm_ghash_v8_4x. Shorter inputs are
// handled here: two blocks per pass of .Loop_mod2x_v8, with .Lodd_tail_v8
// taking a single remaining block. x12 is the post-increment applied to x2
// and is zeroed near the end so loads do not step past the input.
// NOTE(review): the length handling looks like it assumes x3 is a non-zero
// multiple of 16 - confirm against the callers.
228.globl	gcm_ghash_v8
229.type	gcm_ghash_v8,%function
230.align	4
231gcm_ghash_v8:
232	AARCH64_VALID_CALL_TARGET
233	cmp	x3,#64
234	b.hs	.Lgcm_ghash_v8_4x		//64 bytes or more: use the 4-block code path
235	ld1	{v0.2d},[x0]		//load [rotated] Xi
236						//"[rotated]" means that
237						//loaded value would have
238						//to be rotated in order to
239						//make it appear as in
240						//algorithm specification
241	subs	x3,x3,#32		//see if x3 is 32 or larger
242	mov	x12,#16		//x12 is used as post-
243						//increment for input pointer;
244						//as loop is modulo-scheduled
245						//x12 is zeroed just in time
246						//to preclude overstepping
247						//inp[len], which means that
248						//last block[s] are actually
249						//loaded twice, but last
250						//copy is not processed
251	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
252	movi	v19.16b,#0xe1
253	ld1	{v22.2d},[x1]
254	csel	x12,xzr,x12,eq			//is it time to zero x12?
255	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
256	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
257	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
258#ifndef __AARCH64EB__
259	rev64	v16.16b,v16.16b
260	rev64	v0.16b,v0.16b
261#endif
262	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
263	b.lo	.Lodd_tail_v8		//x3 was less than 32
264	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
265#ifndef __AARCH64EB__
266	rev64	v17.16b,v17.16b
267#endif
268	ext	v7.16b,v17.16b,v17.16b,#8
269	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
270	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
271	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
272	pmull2	v6.1q,v20.2d,v7.2d
273	b	.Loop_mod2x_v8
274
275.align	4
276.Loop_mod2x_v8:
277	ext	v18.16b,v3.16b,v3.16b,#8
278	subs	x3,x3,#32		//is there more data?
279	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
280	csel	x12,xzr,x12,lo			//is it time to zero x12?
281
282	pmull	v5.1q,v21.1d,v17.1d
283	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
284	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
285	eor	v0.16b,v0.16b,v4.16b		//accumulate
286	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
287	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]
288
289	eor	v2.16b,v2.16b,v6.16b
290	csel	x12,xzr,x12,eq			//is it time to zero x12?
291	eor	v1.16b,v1.16b,v5.16b
292
293	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
294	eor	v18.16b,v0.16b,v2.16b
295	eor	v1.16b,v1.16b,v17.16b
296	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
297#ifndef __AARCH64EB__
298	rev64	v16.16b,v16.16b
299#endif
300	eor	v1.16b,v1.16b,v18.16b
301	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
302
303#ifndef __AARCH64EB__
304	rev64	v17.16b,v17.16b
305#endif
306	ins	v2.d[0],v1.d[1]
307	ins	v1.d[1],v0.d[0]
308	ext	v7.16b,v17.16b,v17.16b,#8
309	ext	v3.16b,v16.16b,v16.16b,#8
310	eor	v0.16b,v1.16b,v18.16b
311	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
312	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early
313
314	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
315	pmull	v0.1q,v0.1d,v19.1d
316	eor	v3.16b,v3.16b,v18.16b
317	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
318	eor	v3.16b,v3.16b,v0.16b
319	pmull2	v6.1q,v20.2d,v7.2d
320	b.hs	.Loop_mod2x_v8		//there was at least 32 more bytes
321
322	eor	v2.16b,v2.16b,v18.16b
323	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
324	adds	x3,x3,#32		//re-construct x3
325	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
326	b.eq	.Ldone_v8		//is x3 zero?
327.Lodd_tail_v8:
328	ext	v18.16b,v0.16b,v0.16b,#8
329	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
330	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi
331
332	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
333	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
334	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
335	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
336
337	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
338	eor	v18.16b,v0.16b,v2.16b
339	eor	v1.16b,v1.16b,v17.16b
340	eor	v1.16b,v1.16b,v18.16b
341	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
342
343	ins	v2.d[0],v1.d[1]
344	ins	v1.d[1],v0.d[0]
345	eor	v0.16b,v1.16b,v18.16b
346
347	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
348	pmull	v0.1q,v0.1d,v19.1d
349	eor	v18.16b,v18.16b,v2.16b
350	eor	v0.16b,v0.16b,v18.16b
351
352.Ldone_v8:
353#ifndef __AARCH64EB__
354	rev64	v0.16b,v0.16b
355#endif
356	ext	v0.16b,v0.16b,v0.16b,#8
357	st1	{v0.2d},[x0]		//write out Xi
358
359	ret
360.size	gcm_ghash_v8,.-gcm_ghash_v8
// gcm_ghash_v8_4x: four-blocks-per-pass GHASH path. It is not declared
// .globl; within this file it is entered only through the branch at the top
// of gcm_ghash_v8, so it sees the same register arguments:
//   x0 = Xi pointer, x1 = key table pointer, x2 = input pointer,
//   x3 = length in bytes (64 or more when arriving from that branch)
// Table entries for H, H^2, H^3 and H^4 (plus their packed Karatsuba terms)
// are kept in v20-v22 and v26-v28. v29/v30/v31 accumulate the low/middle/high
// partial products of the three trailing blocks of each group, and v19 holds
// the reduction constant. After the last full group, .Lone/.Ltwo/.Lthree
// handle one, two or three remaining blocks and .Ldone4x performs the final
// reduction and stores Xi. Only v0-v7 and v16-v31 are written.
361.type	gcm_ghash_v8_4x,%function
362.align	4
363gcm_ghash_v8_4x:
364.Lgcm_ghash_v8_4x:
365	ld1	{v0.2d},[x0]		//load [rotated] Xi
366	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
367	movi	v19.16b,#0xe1
368	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
369	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
370
371	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64	//load first group of four blocks
372#ifndef __AARCH64EB__
373	rev64	v0.16b,v0.16b
374	rev64	v5.16b,v5.16b
375	rev64	v6.16b,v6.16b
376	rev64	v7.16b,v7.16b
377	rev64	v4.16b,v4.16b
378#endif
379	ext	v25.16b,v7.16b,v7.16b,#8
380	ext	v24.16b,v6.16b,v6.16b,#8
381	ext	v23.16b,v5.16b,v5.16b,#8
382
383	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
384	eor	v7.16b,v7.16b,v25.16b
385	pmull2	v31.1q,v20.2d,v25.2d
386	pmull	v30.1q,v21.1d,v7.1d
387
388	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
389	eor	v6.16b,v6.16b,v24.16b
390	pmull2	v24.1q,v22.2d,v24.2d
391	pmull2	v6.1q,v21.2d,v6.2d
392
393	eor	v29.16b,v29.16b,v16.16b
394	eor	v31.16b,v31.16b,v24.16b
395	eor	v30.16b,v30.16b,v6.16b
396
397	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
398	eor	v5.16b,v5.16b,v23.16b
399	pmull2	v23.1q,v26.2d,v23.2d
400	pmull	v5.1q,v27.1d,v5.1d
401
402	eor	v29.16b,v29.16b,v7.16b
403	eor	v31.16b,v31.16b,v23.16b
404	eor	v30.16b,v30.16b,v5.16b
405
406	subs	x3,x3,#128		//is there another full 64-byte group after the one just loaded?
407	b.lo	.Ltail4x		//no: finish the loaded group in the tail
408
409	b	.Loop4x
410
411.align	4
412.Loop4x:
413	eor	v16.16b,v4.16b,v0.16b		//Xi ^ first block of the group already loaded
414	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64	//load the next group of four blocks
415	ext	v3.16b,v16.16b,v16.16b,#8
416#ifndef __AARCH64EB__
417	rev64	v5.16b,v5.16b
418	rev64	v6.16b,v6.16b
419	rev64	v7.16b,v7.16b
420	rev64	v4.16b,v4.16b
421#endif
422
423	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
424	eor	v16.16b,v16.16b,v3.16b
425	pmull2	v2.1q,v28.2d,v3.2d
426	ext	v25.16b,v7.16b,v7.16b,#8
427	pmull2	v1.1q,v27.2d,v16.2d
428
429	eor	v0.16b,v0.16b,v29.16b		//add the partial products accumulated for the previous group
430	eor	v2.16b,v2.16b,v31.16b
431	ext	v24.16b,v6.16b,v6.16b,#8
432	eor	v1.16b,v1.16b,v30.16b
433	ext	v23.16b,v5.16b,v5.16b,#8
434
435	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
436	eor	v18.16b,v0.16b,v2.16b
437	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
438	eor	v7.16b,v7.16b,v25.16b
439	eor	v1.16b,v1.16b,v17.16b
440	pmull2	v31.1q,v20.2d,v25.2d
441	eor	v1.16b,v1.16b,v18.16b
442	pmull	v30.1q,v21.1d,v7.1d
443
444	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
445	ins	v2.d[0],v1.d[1]
446	ins	v1.d[1],v0.d[0]
447	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
448	eor	v6.16b,v6.16b,v24.16b
449	pmull2	v24.1q,v22.2d,v24.2d
450	eor	v0.16b,v1.16b,v18.16b
451	pmull2	v6.1q,v21.2d,v6.2d
452
453	eor	v29.16b,v29.16b,v16.16b
454	eor	v31.16b,v31.16b,v24.16b
455	eor	v30.16b,v30.16b,v6.16b
456
457	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
458	pmull	v0.1q,v0.1d,v19.1d
459	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
460	eor	v5.16b,v5.16b,v23.16b
461	eor	v18.16b,v18.16b,v2.16b
462	pmull2	v23.1q,v26.2d,v23.2d
463	pmull	v5.1q,v27.1d,v5.1d
464
465	eor	v0.16b,v0.16b,v18.16b
466	eor	v29.16b,v29.16b,v7.16b
467	eor	v31.16b,v31.16b,v23.16b
468	ext	v0.16b,v0.16b,v0.16b,#8
469	eor	v30.16b,v30.16b,v5.16b
470
471	subs	x3,x3,#64		//is there yet another full group to load?
472	b.hs	.Loop4x
473
474.Ltail4x:
475	eor	v16.16b,v4.16b,v0.16b		//Xi ^ first block of the last full group
476	ext	v3.16b,v16.16b,v16.16b,#8
477
478	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
479	eor	v16.16b,v16.16b,v3.16b
480	pmull2	v2.1q,v28.2d,v3.2d
481	pmull2	v1.1q,v27.2d,v16.2d
482
483	eor	v0.16b,v0.16b,v29.16b		//unreduced product is left in v0/v1/v2
484	eor	v2.16b,v2.16b,v31.16b
485	eor	v1.16b,v1.16b,v30.16b
486
487	adds	x3,x3,#64		//re-construct x3: bytes left after the last full group
488	b.eq	.Ldone4x		//nothing left, just reduce and store
489
490	cmp	x3,#32
491	b.lo	.Lone		//fewer than 32 bytes left: one block
492	b.eq	.Ltwo		//exactly 32 bytes left: two blocks
493.Lthree:				//more than 32 bytes left: three blocks are loaded
494	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
495	eor	v18.16b,v0.16b,v2.16b
496	eor	v1.16b,v1.16b,v17.16b
497	ld1	{v4.2d,v5.2d,v6.2d},[x2]
498	eor	v1.16b,v1.16b,v18.16b
499#ifndef	__AARCH64EB__
500	rev64	v5.16b,v5.16b
501	rev64	v6.16b,v6.16b
502	rev64	v4.16b,v4.16b
503#endif
504
505	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
506	ins	v2.d[0],v1.d[1]
507	ins	v1.d[1],v0.d[0]
508	ext	v24.16b,v6.16b,v6.16b,#8
509	ext	v23.16b,v5.16b,v5.16b,#8
510	eor	v0.16b,v1.16b,v18.16b
511
512	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
513	eor	v6.16b,v6.16b,v24.16b
514
515	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
516	pmull	v0.1q,v0.1d,v19.1d
517	eor	v18.16b,v18.16b,v2.16b
518	pmull2	v31.1q,v20.2d,v24.2d
519	pmull	v30.1q,v21.1d,v6.1d
520	eor	v0.16b,v0.16b,v18.16b
521	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
522	eor	v5.16b,v5.16b,v23.16b
523	ext	v0.16b,v0.16b,v0.16b,#8
524
525	pmull2	v23.1q,v22.2d,v23.2d
526	eor	v16.16b,v4.16b,v0.16b
527	pmull2	v5.1q,v21.2d,v5.2d
528	ext	v3.16b,v16.16b,v16.16b,#8
529
530	eor	v29.16b,v29.16b,v7.16b
531	eor	v31.16b,v31.16b,v23.16b
532	eor	v30.16b,v30.16b,v5.16b
533
534	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
535	eor	v16.16b,v16.16b,v3.16b
536	pmull2	v2.1q,v26.2d,v3.2d
537	pmull	v1.1q,v27.1d,v16.1d
538
539	eor	v0.16b,v0.16b,v29.16b
540	eor	v2.16b,v2.16b,v31.16b
541	eor	v1.16b,v1.16b,v30.16b
542	b	.Ldone4x
543
544.align	4
545.Ltwo:
546	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
547	eor	v18.16b,v0.16b,v2.16b
548	eor	v1.16b,v1.16b,v17.16b
549	ld1	{v4.2d,v5.2d},[x2]
550	eor	v1.16b,v1.16b,v18.16b
551#ifndef	__AARCH64EB__
552	rev64	v5.16b,v5.16b
553	rev64	v4.16b,v4.16b
554#endif
555
556	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
557	ins	v2.d[0],v1.d[1]
558	ins	v1.d[1],v0.d[0]
559	ext	v23.16b,v5.16b,v5.16b,#8
560	eor	v0.16b,v1.16b,v18.16b
561
562	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
563	pmull	v0.1q,v0.1d,v19.1d
564	eor	v18.16b,v18.16b,v2.16b
565	eor	v0.16b,v0.16b,v18.16b
566	ext	v0.16b,v0.16b,v0.16b,#8
567
568	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
569	eor	v5.16b,v5.16b,v23.16b
570
571	eor	v16.16b,v4.16b,v0.16b
572	ext	v3.16b,v16.16b,v16.16b,#8
573
574	pmull2	v31.1q,v20.2d,v23.2d
575	pmull	v30.1q,v21.1d,v5.1d
576
577	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
578	eor	v16.16b,v16.16b,v3.16b
579	pmull2	v2.1q,v22.2d,v3.2d
580	pmull2	v1.1q,v21.2d,v16.2d
581
582	eor	v0.16b,v0.16b,v29.16b
583	eor	v2.16b,v2.16b,v31.16b
584	eor	v1.16b,v1.16b,v30.16b
585	b	.Ldone4x
586
587.align	4
588.Lone:
589	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
590	eor	v18.16b,v0.16b,v2.16b
591	eor	v1.16b,v1.16b,v17.16b
592	ld1	{v4.2d},[x2]
593	eor	v1.16b,v1.16b,v18.16b
594#ifndef	__AARCH64EB__
595	rev64	v4.16b,v4.16b
596#endif
597
598	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
599	ins	v2.d[0],v1.d[1]
600	ins	v1.d[1],v0.d[0]
601	eor	v0.16b,v1.16b,v18.16b
602
603	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
604	pmull	v0.1q,v0.1d,v19.1d
605	eor	v18.16b,v18.16b,v2.16b
606	eor	v0.16b,v0.16b,v18.16b
607	ext	v0.16b,v0.16b,v0.16b,#8
608
609	eor	v16.16b,v4.16b,v0.16b
610	ext	v3.16b,v16.16b,v16.16b,#8
611
612	pmull	v0.1q,v20.1d,v3.1d		//H·(Xi+Ii)
613	eor	v16.16b,v16.16b,v3.16b
614	pmull2	v2.1q,v20.2d,v3.2d
615	pmull	v1.1q,v21.1d,v16.1d
616
617.Ldone4x:				//final reduction of the product in v0/v1/v2, then store Xi
618	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
619	eor	v18.16b,v0.16b,v2.16b
620	eor	v1.16b,v1.16b,v17.16b
621	eor	v1.16b,v1.16b,v18.16b
622
623	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
624	ins	v2.d[0],v1.d[1]
625	ins	v1.d[1],v0.d[0]
626	eor	v0.16b,v1.16b,v18.16b
627
628	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
629	pmull	v0.1q,v0.1d,v19.1d
630	eor	v18.16b,v18.16b,v2.16b
631	eor	v0.16b,v0.16b,v18.16b
632	ext	v0.16b,v0.16b,v0.16b,#8
633
634#ifndef __AARCH64EB__
635	rev64	v0.16b,v0.16b
636#endif
637	st1	{v0.2d},[x0]		//write out Xi
638
639	ret
640.size	gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
// Read-only identification string. The bytes below are the NUL-terminated
// ASCII text: GHASH for ARMv8, CRYPTOGAMS by appro@openssl.org
// (in the data the e-mail address is enclosed in angle brackets).
// No label is attached to it and no code in this file references it.
641.section	.rodata
642.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
643.align	2
644.align	2
645#endif
646