xref: /freebsd/sys/crypto/openssl/aarch64/aes-gcm-armv8-unroll8_64.S (revision 4757b351ea9d59d71d4a38b82506d2d16fcd560d)
1/* Do not modify. This file is auto-generated from aes-gcm-armv8-unroll8_64.pl. */
2#include "arm_arch.h"
3
4#if __ARM_MAX_ARCH__>=8
5.arch	armv8-a+crypto
6.text
7.globl	unroll8_eor3_aes_gcm_enc_128_kernel
8.type	unroll8_eor3_aes_gcm_enc_128_kernel,%function
9.align	4
10unroll8_eor3_aes_gcm_enc_128_kernel:
11	AARCH64_VALID_CALL_TARGET
12	cbz	x1, .L128_enc_ret
13	stp	d8, d9, [sp, #-80]!
14	lsr	x9, x1, #3
15	mov	x16, x4
16	mov	x8, x5
17	stp	d10, d11, [sp, #16]
18	stp	d12, d13, [sp, #32]
19	stp	d14, d15, [sp, #48]
20	mov	x5, #0xc200000000000000
21	stp	x5, xzr, [sp, #64]
22	add	x10, sp, #64
23
24	mov	x15, #0x100000000				//set up counter increment
25	movi	v31.16b, #0x0
26	mov	v31.d[1], x15
27	mov	x5, x9
28	ld1	{ v0.16b}, [x16]					//CTR block 0
29
30	sub	x5, x5, #1	 	//byte_len - 1
31
32	and	x5, x5, #0xffffffffffffff80		//number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
33
34	rev32	v30.16b, v0.16b				//set up reversed counter
35
36	add	v30.4s, v30.4s, v31.4s		//CTR block 0
37
38	rev32	v1.16b, v30.16b				//CTR block 1
39	add	v30.4s, v30.4s, v31.4s		//CTR block 1
40
41	rev32	v2.16b, v30.16b				//CTR block 2
42	add	v30.4s, v30.4s, v31.4s		//CTR block 2
43
44	rev32	v3.16b, v30.16b				//CTR block 3
45	add	v30.4s, v30.4s, v31.4s		//CTR block 3
46
47	rev32	v4.16b, v30.16b				//CTR block 4
48	add	v30.4s, v30.4s, v31.4s		//CTR block 4
49
50	rev32	v5.16b, v30.16b				//CTR block 5
51	add	v30.4s, v30.4s, v31.4s		//CTR block 5
52	ldp	q26, q27, [x8, #0]				  	//load rk0, rk1
53
54	rev32	v6.16b, v30.16b				//CTR block 6
55	add	v30.4s, v30.4s, v31.4s		//CTR block 6
56
57	rev32	v7.16b, v30.16b				//CTR block 7
58	add	v30.4s, v30.4s, v31.4s		//CTR block 7
59
60	aese	v4.16b, v26.16b
61	aesmc	v4.16b, v4.16b			//AES block 4 - round 0
62	aese	v6.16b, v26.16b
63	aesmc	v6.16b, v6.16b			//AES block 6 - round 0
64	aese	v3.16b, v26.16b
65	aesmc	v3.16b, v3.16b			//AES block 3 - round 0
66
67	aese	v0.16b, v26.16b
68	aesmc	v0.16b, v0.16b			//AES block 0 - round 0
69	aese	v1.16b, v26.16b
70	aesmc	v1.16b, v1.16b			//AES block 1 - round 0
71	aese	v2.16b, v26.16b
72	aesmc	v2.16b, v2.16b			//AES block 2 - round 0
73
74	aese	v7.16b, v26.16b
75	aesmc	v7.16b, v7.16b			//AES block 7 - round 0
76	aese	v5.16b, v26.16b
77	aesmc	v5.16b, v5.16b			//AES block 5 - round 0
78	ldp	q28, q26, [x8, #32]				//load rk2, rk3
79
80	aese	v3.16b, v27.16b
81	aesmc	v3.16b, v3.16b			//AES block 3 - round 1
82
83	aese	v7.16b, v27.16b
84	aesmc	v7.16b, v7.16b			//AES block 7 - round 1
85	aese	v5.16b, v27.16b
86	aesmc	v5.16b, v5.16b			//AES block 5 - round 1
87	aese	v4.16b, v27.16b
88	aesmc	v4.16b, v4.16b			//AES block 4 - round 1
89
90	aese	v2.16b, v27.16b
91	aesmc	v2.16b, v2.16b			//AES block 2 - round 1
92	aese	v6.16b, v27.16b
93	aesmc	v6.16b, v6.16b			//AES block 6 - round 1
94	aese	v0.16b, v27.16b
95	aesmc	v0.16b, v0.16b			//AES block 0 - round 1
96
97	aese	v5.16b, v28.16b
98	aesmc	v5.16b, v5.16b			//AES block 5 - round 2
99	aese	v1.16b, v27.16b
100	aesmc	v1.16b, v1.16b			//AES block 1 - round 1
101	aese	v0.16b, v28.16b
102	aesmc	v0.16b, v0.16b			//AES block 0 - round 2
103
104	aese	v2.16b, v28.16b
105	aesmc	v2.16b, v2.16b			//AES block 2 - round 2
106	aese	v3.16b, v28.16b
107	aesmc	v3.16b, v3.16b			//AES block 3 - round 2
108	aese	v7.16b, v28.16b
109	aesmc	v7.16b, v7.16b			//AES block 7 - round 2
110
111	aese	v1.16b, v28.16b
112	aesmc	v1.16b, v1.16b			//AES block 1 - round 2
113	aese	v6.16b, v28.16b
114	aesmc	v6.16b, v6.16b			//AES block 6 - round 2
115	aese	v4.16b, v28.16b
116	aesmc	v4.16b, v4.16b			//AES block 4 - round 2
117
118	aese	v2.16b, v26.16b
119	aesmc	v2.16b, v2.16b			//AES block 2 - round 3
120
121	ldp	q27, q28, [x8, #64]				//load rk4, rk5
122	aese	v5.16b, v26.16b
123	aesmc	v5.16b, v5.16b			//AES block 5 - round 3
124	aese	v0.16b, v26.16b
125	aesmc	v0.16b, v0.16b			//AES block 0 - round 3
126
127	aese	v4.16b, v26.16b
128	aesmc	v4.16b, v4.16b			//AES block 4 - round 3
129	aese	v3.16b, v26.16b
130	aesmc	v3.16b, v3.16b			//AES block 3 - round 3
131	aese	v6.16b, v26.16b
132	aesmc	v6.16b, v6.16b			//AES block 6 - round 3
133
134	aese	v7.16b, v26.16b
135	aesmc	v7.16b, v7.16b			//AES block 7 - round 3
136
137	aese	v6.16b, v27.16b
138	aesmc	v6.16b, v6.16b			//AES block 6 - round 4
139	aese	v1.16b, v26.16b
140	aesmc	v1.16b, v1.16b			//AES block 1 - round 3
141	aese	v5.16b, v27.16b
142	aesmc	v5.16b, v5.16b			//AES block 5 - round 4
143
144	aese	v7.16b, v27.16b
145	aesmc	v7.16b, v7.16b			//AES block 7 - round 4
146	aese	v4.16b, v27.16b
147	aesmc	v4.16b, v4.16b			//AES block 4 - round 4
148	aese	v0.16b, v27.16b
149	aesmc	v0.16b, v0.16b			//AES block 0 - round 4
150
151	aese	v1.16b, v27.16b
152	aesmc	v1.16b, v1.16b			//AES block 1 - round 4
153	aese	v2.16b, v27.16b
154	aesmc	v2.16b, v2.16b			//AES block 2 - round 4
155	aese	v3.16b, v27.16b
156	aesmc	v3.16b, v3.16b			//AES block 3 - round 4
157
158	aese	v7.16b, v28.16b
159	aesmc	v7.16b, v7.16b			//AES block 7 - round 5
160	aese	v0.16b, v28.16b
161	aesmc	v0.16b, v0.16b			//AES block 0 - round 5
162	ldp	q26, q27, [x8, #96]				//load rk6, rk7
163
164	aese	v1.16b, v28.16b
165	aesmc	v1.16b, v1.16b			//AES block 1 - round 5
166	aese	v3.16b, v28.16b
167	aesmc	v3.16b, v3.16b			//AES block 3 - round 5
168	aese	v2.16b, v28.16b
169	aesmc	v2.16b, v2.16b			//AES block 2 - round 5
170
171	aese	v4.16b, v28.16b
172	aesmc	v4.16b, v4.16b			//AES block 4 - round 5
173	aese	v5.16b, v28.16b
174	aesmc	v5.16b, v5.16b			//AES block 5 - round 5
175	aese	v6.16b, v28.16b
176	aesmc	v6.16b, v6.16b			//AES block 6 - round 5
177
178	aese	v4.16b, v26.16b
179	aesmc	v4.16b, v4.16b			//AES block 4 - round 6
180	aese	v3.16b, v26.16b
181	aesmc	v3.16b, v3.16b			//AES block 3 - round 6
182	aese	v2.16b, v26.16b
183	aesmc	v2.16b, v2.16b			//AES block 2 - round 6
184
185	aese	v7.16b, v26.16b
186	aesmc	v7.16b, v7.16b			//AES block 7 - round 6
187	aese	v6.16b, v26.16b
188	aesmc	v6.16b, v6.16b			//AES block 6 - round 6
189	aese	v5.16b, v26.16b
190	aesmc	v5.16b, v5.16b			//AES block 5 - round 6
191
192	aese	v0.16b, v26.16b
193	aesmc	v0.16b, v0.16b			//AES block 0 - round 6
194	aese	v1.16b, v26.16b
195	aesmc	v1.16b, v1.16b			//AES block 1 - round 6
196	ldp	q28, q26, [x8, #128]				//load rk8, rk9
197
198	aese	v5.16b, v27.16b
199	aesmc	v5.16b, v5.16b			//AES block 5 - round 7
200
201	ld1	{ v19.16b}, [x3]
202	ext	v19.16b, v19.16b, v19.16b, #8
203	rev64	v19.16b, v19.16b
204
205	aese	v7.16b, v27.16b
206	aesmc	v7.16b, v7.16b			//AES block 7 - round 7
207
208	aese	v4.16b, v27.16b
209	aesmc	v4.16b, v4.16b			//AES block 4 - round 7
210	aese	v3.16b, v27.16b
211	aesmc	v3.16b, v3.16b			//AES block 3 - round 7
212	aese	v6.16b, v27.16b
213	aesmc	v6.16b, v6.16b			//AES block 6 - round 7
214
215	aese	v1.16b, v27.16b
216	aesmc	v1.16b, v1.16b			//AES block 1 - round 7
217	aese	v2.16b, v27.16b
218	aesmc	v2.16b, v2.16b			//AES block 2 - round 7
219	aese	v0.16b, v27.16b
220	aesmc	v0.16b, v0.16b			//AES block 0 - round 7
221
222	aese	v3.16b, v28.16b
223	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
224	aese	v6.16b, v28.16b
225	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
226	aese	v2.16b, v28.16b
227	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
228
229	aese	v7.16b, v28.16b
230	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
231	aese	v0.16b, v28.16b
232	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
233	ldr	q27, [x8, #160]					//load rk10
234
235	aese	v3.16b, v26.16b						//AES block 8k+11 - round 9
236	aese	v4.16b, v28.16b
237	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
238	aese	v2.16b, v26.16b						//AES block 8k+10 - round 9
239
240	aese	v5.16b, v28.16b
241	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
242	aese	v1.16b, v28.16b
243	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
244	aese	v6.16b, v26.16b						//AES block 8k+14 - round 9
245
246	aese	v4.16b, v26.16b						//AES block 8k+12 - round 9
247	add	x5, x5, x0
248	aese	v0.16b, v26.16b						//AES block 8k+8 - round 9
249
250	aese	v7.16b, v26.16b						//AES block 8k+15 - round 9
251	aese	v5.16b, v26.16b						//AES block 8k+13 - round 9
252	aese	v1.16b, v26.16b						//AES block 8k+9 - round 9
253
254	add	x4, x0, x1, lsr #3		//end_input_ptr
255	cmp	x0, x5				//check if we have <= 8 blocks
256	b.ge	.L128_enc_tail						//handle tail
257
258	ldp	q8, q9, [x0], #32			//AES block 0, 1 - load plaintext
259
260	ldp	q10, q11, [x0], #32			//AES block 2, 3 - load plaintext
261
262	ldp	q12, q13, [x0], #32			//AES block 4, 5 - load plaintext
263
264	ldp	q14, q15, [x0], #32			//AES block 6, 7 - load plaintext
265	cmp	x0, x5				//check if we have <= 8 blocks
266
267.inst	0xce006d08	//eor3 v8.16b, v8.16b, v0.16b, v27.16b				//AES block 0 - result
268	rev32	v0.16b, v30.16b				//CTR block 8
269	add	v30.4s, v30.4s, v31.4s		//CTR block 8
270
271.inst	0xce016d29	//eor3 v9.16b, v9.16b, v1.16b, v27.16b				//AES block 1 - result
272	stp	q8, q9, [x2], #32			//AES block 0, 1 - store result
273
274	rev32	v1.16b, v30.16b				//CTR block 9
275.inst	0xce056dad	//eor3 v13.16b, v13.16b, v5.16b, v27.16b				//AES block 5 - result
276	add	v30.4s, v30.4s, v31.4s		//CTR block 9
277
278.inst	0xce026d4a	//eor3 v10.16b, v10.16b, v2.16b, v27.16b				//AES block 2 - result
279.inst	0xce066dce	//eor3 v14.16b, v14.16b, v6.16b, v27.16b				//AES block 6 - result
280.inst	0xce046d8c	//eor3 v12.16b, v12.16b, v4.16b, v27.16b				//AES block 4 - result
281
282	rev32	v2.16b, v30.16b				//CTR block 10
283	add	v30.4s, v30.4s, v31.4s		//CTR block 10
284
285.inst	0xce036d6b	//eor3 v11.16b, v11.16b, v3.16b, v27.16b				//AES block 3 - result
286.inst	0xce076def	//eor3 v15.16b, v15.16b, v7.16b,v27.16b				//AES block 7 - result
287	stp	q10, q11, [x2], #32			//AES block 2, 3 - store result
288
289	rev32	v3.16b, v30.16b				//CTR block 11
290	add	v30.4s, v30.4s, v31.4s		//CTR block 11
291	stp	q12, q13, [x2], #32			//AES block 4, 5 - store result
292
293	stp	q14, q15, [x2], #32			//AES block 6, 7 - store result
294
295	rev32	v4.16b, v30.16b				//CTR block 12
296	add	v30.4s, v30.4s, v31.4s		//CTR block 12
297	b.ge	.L128_enc_prepretail					//do prepretail
298
299.L128_enc_main_loop:	//main	loop start
300	rev32	v5.16b, v30.16b				//CTR block 8k+13
301	ldr	q20, [x3, #128]				//load h5l | h5h
302	ext	v20.16b, v20.16b, v20.16b, #8
303	ldr	q22, [x3, #160]				//load h6l | h6h
304	ext	v22.16b, v22.16b, v22.16b, #8
305	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
306
307	rev64	v9.16b, v9.16b						//GHASH block 8k+1
308	rev64	v8.16b, v8.16b						//GHASH block 8k
309	ldr	q23, [x3, #176]				//load h7l | h7h
310	ext	v23.16b, v23.16b, v23.16b, #8
311	ldr	q25, [x3, #208]				//load h8l | h8h
312	ext	v25.16b, v25.16b, v25.16b, #8
313
314	rev32	v6.16b, v30.16b				//CTR block 8k+14
315	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
316	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
317
318	ldr	q21, [x3, #144]				//load h6k | h5k
319	ldr	q24, [x3, #192]				//load h8k | h7k
320	rev64	v13.16b, v13.16b						//GHASH block 8k+5 (t0, t1, t2 and t3 free)
321	rev64	v11.16b, v11.16b						//GHASH block 8k+3
322
323	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
324	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1
325	rev32	v7.16b, v30.16b				//CTR block 8k+15
326
327	rev64	v15.16b, v15.16b						//GHASH block 8k+7 (t0, t1, t2 and t3 free)
328
329	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
330	rev64	v10.16b, v10.16b						//GHASH block 8k+2
331	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
332
333	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
334	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
335	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
336
337	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
338	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
339	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
340
341	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
342	ldr	q23, [x3, #80]				//load h3l | h3h
343	ext	v23.16b, v23.16b, v23.16b, #8
344	ldr	q25, [x3, #112]				//load h4l | h4h
345	ext	v25.16b, v25.16b, v25.16b, #8
346	aese	v5.16b, v26.16b
347	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
348
349	aese	v1.16b, v26.16b
350	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
351	aese	v4.16b, v26.16b
352	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
353	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
354
355	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
356	aese	v2.16b, v26.16b
357	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
358	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
359
360	aese	v6.16b, v26.16b
361	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
362	aese	v1.16b, v27.16b
363	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
364	aese	v0.16b, v26.16b
365	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
366
367	aese	v2.16b, v27.16b
368	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
369	aese	v3.16b, v26.16b
370	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
371	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
372
373	aese	v5.16b, v27.16b
374	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
375	aese	v7.16b, v26.16b
376	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
377	aese	v0.16b, v27.16b
378	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
379
380.inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b,v9.16b			//GHASH block 8k+2, 8k+3 - high
381	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
382	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
383
384	ldp	q28, q26, [x8, #32]				//load rk2, rk3
385	aese	v4.16b, v27.16b
386	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
387	aese	v3.16b, v27.16b
388	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
389
390	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
391	aese	v7.16b, v27.16b
392	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
393	aese	v6.16b, v27.16b
394	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
395
396	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
397	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
398	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
399
400	rev64	v14.16b, v14.16b						//GHASH block 8k+6 (t0, t1, and t2 free)
401.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
402
403	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
404	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
405	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
406
407	aese	v5.16b, v28.16b
408	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
409	aese	v4.16b, v28.16b
410	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
411	aese	v2.16b, v28.16b
412	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
413
414	aese	v1.16b, v28.16b
415	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
416.inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
417	aese	v6.16b, v28.16b
418	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
419
420	aese	v0.16b, v28.16b
421	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
422	aese	v3.16b, v28.16b
423	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
424	aese	v7.16b, v28.16b
425	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
426
427	aese	v6.16b, v26.16b
428	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
429	ldr	q21, [x3, #48]				//load h2k | h1k
430	ldr	q24, [x3, #96]				//load h4k | h3k
431	rev64	v12.16b, v12.16b						//GHASH block 8k+4 (t0, t1, and t2 free)
432
433	ldp	q27, q28, [x8, #64]				//load rk4, rk5
434	aese	v2.16b, v26.16b
435	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
436	aese	v1.16b, v26.16b
437	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
438
439	ldr	q20, [x3, #32]				//load h1l | h1h
440	ext	v20.16b, v20.16b, v20.16b, #8
441	ldr	q22, [x3, #64]				//load h2l | h2h
442	ext	v22.16b, v22.16b, v22.16b, #8
443	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
444	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
445
446	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
447	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
448
449	aese	v0.16b, v26.16b
450	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
451	aese	v3.16b, v26.16b
452	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
453
454	aese	v7.16b, v26.16b
455	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
456	aese	v4.16b, v26.16b
457	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
458
459	aese	v5.16b, v26.16b
460	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
461	aese	v0.16b, v27.16b
462	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
463
464	aese	v7.16b, v27.16b
465	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
466	aese	v3.16b, v27.16b
467	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
468	aese	v4.16b, v27.16b
469	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
470
471	aese	v5.16b, v27.16b
472	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
473	aese	v6.16b, v27.16b
474	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
475	aese	v1.16b, v27.16b
476	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
477
478	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
479	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
480	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
481
482	aese	v2.16b, v27.16b
483	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
484	ldp	q26, q27, [x8, #96]				//load rk6, rk7
485	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
486
487	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
488	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
489	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
490
491	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
492	aese	v2.16b, v28.16b
493	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
494	aese	v5.16b, v28.16b
495	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
496
497	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
498.inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
499	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
500
501.inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
502	aese	v6.16b, v28.16b
503	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
504
505	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
506	aese	v7.16b, v28.16b
507	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
508	aese	v1.16b, v28.16b
509	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
510
511	aese	v3.16b, v28.16b
512	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
513	aese	v4.16b, v28.16b
514	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
515	aese	v0.16b, v28.16b
516	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
517
518.inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
519	ldr	d16, [x10]			//MODULO - load modulo constant
520	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
521
522	aese	v7.16b, v26.16b
523	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
524	aese	v5.16b, v26.16b
525	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
526
527	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
528	aese	v1.16b, v26.16b
529	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
530	aese	v2.16b, v26.16b
531	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
532
533	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
534.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
535	ldp	q8, q9, [x0], #32			//AES block 8k+8, 8k+9 - load plaintext
536
537	aese	v3.16b, v26.16b
538	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
539	rev32	v20.16b, v30.16b					//CTR block 8k+16
540	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+16
541
542	aese	v4.16b, v26.16b
543	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
544	aese	v0.16b, v26.16b
545	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
546	aese	v6.16b, v26.16b
547	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
548
549.inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
550	ldp	q28, q26, [x8, #128]				//load rk8, rk9
551.inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
552
553	aese	v2.16b, v27.16b
554	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
555	aese	v7.16b, v27.16b
556	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
557	ldp	q10, q11, [x0], #32			//AES block 8k+10, 8k+11 - load plaintext
558
559	aese	v5.16b, v27.16b
560	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
561	aese	v6.16b, v27.16b
562	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
563	aese	v1.16b, v27.16b
564	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
565
566	pmull	v21.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
567	aese	v0.16b, v27.16b
568	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
569	aese	v4.16b, v27.16b
570	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
571
572	rev32	v22.16b, v30.16b					//CTR block 8k+17
573	aese	v3.16b, v27.16b
574	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
575
576	aese	v5.16b, v28.16b
577	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
578	ldp	q12, q13, [x0], #32			//AES block 8k+12, 8k+13 - load plaintext
579	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+17
580
581	aese	v2.16b, v28.16b
582	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
583	aese	v1.16b, v28.16b
584	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
585	aese	v7.16b, v28.16b
586	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
587
588	aese	v4.16b, v28.16b
589	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
590.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
591	ldr	q27, [x8, #160]					//load rk10
592
593	ext	v29.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment
594	rev32	v23.16b, v30.16b					//CTR block 8k+18
595	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+18
596	aese	v3.16b, v28.16b
597	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
598
599	aese	v0.16b, v28.16b
600	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
601.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
602	aese	v6.16b, v28.16b
603	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
604
605	aese	v2.16b, v26.16b						//AES block 8k+10 - round 9
606	aese	v4.16b, v26.16b						//AES block 8k+12 - round 9
607	aese	v1.16b, v26.16b						//AES block 8k+9 - round 9
608
609	ldp	q14, q15, [x0], #32			//AES block 8k+14, 8k+15 - load plaintext
610	rev32	v25.16b, v30.16b					//CTR block 8k+19
611	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+19
612
613	cmp	x0, x5				//.LOOP CONTROL
614.inst	0xce046d8c	//eor3 v12.16b, v12.16b, v4.16b, v27.16b				//AES block 4 - result
615	aese	v7.16b, v26.16b						//AES block 8k+15 - round 9
616
617	aese	v6.16b, v26.16b						//AES block 8k+14 - round 9
618	aese	v3.16b, v26.16b						//AES block 8k+11 - round 9
619
620.inst	0xce026d4a	//eor3 v10.16b, v10.16b, v2.16b, v27.16b				//AES block 8k+10 - result
621
622	mov	v2.16b, v23.16b					//CTR block 8k+18
623	aese	v0.16b, v26.16b						//AES block 8k+8 - round 9
624
625	rev32	v4.16b, v30.16b				//CTR block 8k+20
626	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+20
627
628.inst	0xce076def	//eor3 v15.16b, v15.16b, v7.16b, v27.16b				//AES block 7 - result
629	aese	v5.16b, v26.16b						//AES block 8k+13 - round 9
630	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
631
632.inst	0xce016d29	//eor3 v9.16b, v9.16b, v1.16b, v27.16b				//AES block 8k+9 - result
633.inst	0xce036d6b	//eor3 v11.16b, v11.16b, v3.16b, v27.16b				//AES block 8k+11 - result
634	mov	v3.16b, v25.16b					//CTR block 8k+19
635
636	ext	v21.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment
637.inst	0xce056dad	//eor3 v13.16b, v13.16b, v5.16b, v27.16b				//AES block 5 - result
638	mov	v1.16b, v22.16b					//CTR block 8k+17
639
640.inst	0xce006d08	//eor3 v8.16b, v8.16b, v0.16b, v27.16b				//AES block 8k+8 - result
641	mov	v0.16b, v20.16b					//CTR block 8k+16
642	stp	q8, q9, [x2], #32			//AES block 8k+8, 8k+9 - store result
643
644	stp	q10, q11, [x2], #32			//AES block 8k+10, 8k+11 - store result
645.inst	0xce066dce	//eor3 v14.16b, v14.16b, v6.16b, v27.16b				//AES block 6 - result
646
647	stp	q12, q13, [x2], #32			//AES block 8k+12, 8k+13 - store result
648.inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low
649
650	stp	q14, q15, [x2], #32			//AES block 8k+14, 8k+15 - store result
651	b.lt	.L128_enc_main_loop
652
653.L128_enc_prepretail:	//PREPRETAIL
654	rev32	v5.16b, v30.16b				//CTR block 8k+13
655	ldr	q23, [x3, #176]				//load h7l | h7h
656	ext	v23.16b, v23.16b, v23.16b, #8
657	ldr	q25, [x3, #208]				//load h8l | h8h
658	ext	v25.16b, v25.16b, v25.16b, #8
659	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
660
661	ldr	q20, [x3, #128]				//load h5l | h5h
662	ext	v20.16b, v20.16b, v20.16b, #8
663	ldr	q22, [x3, #160]				//load h6l | h6h
664	ext	v22.16b, v22.16b, v22.16b, #8
665	rev64	v8.16b, v8.16b						//GHASH block 8k
666	rev64	v9.16b, v9.16b						//GHASH block 8k+1
667
668	ldr	q21, [x3, #144]				//load h6k | h5k
669	ldr	q24, [x3, #192]				//load h8k | h7k
670	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
671	rev64	v11.16b, v11.16b						//GHASH block 8k+3
672
673	rev64	v10.16b, v10.16b						//GHASH block 8k+2
674	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1
675
676	rev32	v6.16b, v30.16b				//CTR block 8k+14
677
678	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
679	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
680	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
681
682	rev64	v13.16b, v13.16b						//GHASH block 8k+5 (t0, t1, t2 and t3 free)
683	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
684
685	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
686	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
687	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
688
689	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
690	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
691
692	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
693	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
694
695	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
696	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
697
698	rev64	v12.16b, v12.16b						//GHASH block 8k+4 (t0, t1, and t2 free)
699	rev64	v15.16b, v15.16b						//GHASH block 8k+7 (t0, t1, t2 and t3 free)
700
701	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
702
703	rev32	v7.16b, v30.16b				//CTR block 8k+15
704
705	rev64	v14.16b, v14.16b						//GHASH block 8k+6 (t0, t1, and t2 free)
706
707	aese	v2.16b, v26.16b
708	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
709
710	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
711	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
712
713	aese	v6.16b, v26.16b
714	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
715	aese	v3.16b, v26.16b
716	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
717
718	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
719	aese	v1.16b, v26.16b
720	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
721
722.inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
723	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
724	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
725
726	aese	v5.16b, v26.16b
727	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
728	aese	v7.16b, v26.16b
729	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
730
731	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
732	aese	v4.16b, v26.16b
733	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
734	aese	v0.16b, v26.16b
735	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
736
737	aese	v3.16b, v27.16b
738	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
739	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
740
741	ldr	q23, [x3, #80]				//load h3l | h3h
742	ext	v23.16b, v23.16b, v23.16b, #8
743	ldr	q25, [x3, #112]				//load h4l | h4h
744	ext	v25.16b, v25.16b, v25.16b, #8
745
746	ldp	q28, q26, [x8, #32]				//load rk2, rk3
747	aese	v5.16b, v27.16b
748	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
749	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
750
751.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
752	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
753
754	aese	v1.16b, v27.16b
755	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
756	aese	v0.16b, v27.16b
757	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
758
759.inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
760	ldr	q21, [x3, #48]				//load h2k | h1k
761	ldr	q24, [x3, #96]				//load h4k | h3k
762	aese	v2.16b, v27.16b
763	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
764
765	aese	v4.16b, v27.16b
766	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
767	aese	v7.16b, v27.16b
768	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
769
770	aese	v5.16b, v28.16b
771	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
772	aese	v2.16b, v28.16b
773	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
774	aese	v3.16b, v28.16b
775	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
776
777	aese	v1.16b, v28.16b
778	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
779	aese	v6.16b, v27.16b
780	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
781	aese	v4.16b, v28.16b
782	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
783
784	aese	v5.16b, v26.16b
785	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
786	aese	v0.16b, v28.16b
787	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
788
789	aese	v6.16b, v28.16b
790	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
791	aese	v7.16b, v28.16b
792	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
793	ldp	q27, q28, [x8, #64]				//load rk4, rk5
794
795	ldr	q20, [x3, #32]				//load h1l | h1h
796	ext	v20.16b, v20.16b, v20.16b, #8
797	ldr	q22, [x3, #64]				//load h2l | h2h
798	ext	v22.16b, v22.16b, v22.16b, #8
799	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
800	aese	v0.16b, v26.16b
801	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
802
803	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
804	aese	v6.16b, v26.16b
805	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
806	aese	v3.16b, v26.16b
807	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
808
809	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
810	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
811	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
812
813	aese	v2.16b, v26.16b
814	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
815	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
816
817	aese	v7.16b, v26.16b
818	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
819	aese	v1.16b, v26.16b
820	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
821	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
822
823	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
824	aese	v4.16b, v26.16b
825	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
826	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
827
828	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
829	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
830	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
831
832	aese	v1.16b, v27.16b
833	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
834	aese	v3.16b, v27.16b
835	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
836.inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
837
838.inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
839	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
840	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
841
842	aese	v1.16b, v28.16b
843	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
844	aese	v6.16b, v27.16b
845	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
846	aese	v0.16b, v27.16b
847	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
848
849	aese	v7.16b, v27.16b
850	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
851	aese	v2.16b, v27.16b
852	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
853
854	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
855	aese	v4.16b, v27.16b
856	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
857	aese	v5.16b, v27.16b
858	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
859
860	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
861	ldp	q26, q27, [x8, #96]				//load rk6, rk7
862	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
863
864.inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
865	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
866	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
867
868	aese	v0.16b, v28.16b
869	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
870	aese	v7.16b, v28.16b
871	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
872	ldr	d16, [x10]			//MODULO - load modulo constant
873
874	aese	v2.16b, v28.16b
875	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
876	aese	v4.16b, v28.16b
877	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
878
879.inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
880	aese	v5.16b, v28.16b
881	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
882	aese	v6.16b, v28.16b
883	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
884
885	aese	v3.16b, v28.16b
886	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
887	aese	v4.16b, v26.16b
888	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
889
890	aese	v5.16b, v26.16b
891	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
892	aese	v2.16b, v26.16b
893	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
894	aese	v0.16b, v26.16b
895	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
896
897	aese	v3.16b, v26.16b
898	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
899.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
900.inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
901
902	aese	v6.16b, v26.16b
903	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
904	aese	v1.16b, v26.16b
905	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
906	aese	v7.16b, v26.16b
907	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
908
909	pmull	v21.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
910.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
911	ldp	q28, q26, [x8, #128]				//load rk8, rk9
912
913	aese	v3.16b, v27.16b
914	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
915	aese	v6.16b, v27.16b
916	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
917	aese	v1.16b, v27.16b
918	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
919	ext	v29.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment
920
921	aese	v5.16b, v27.16b
922	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
923	aese	v0.16b, v27.16b
924	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
925.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
926
927	aese	v2.16b, v27.16b
928	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
929	aese	v7.16b, v27.16b
930	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
931
932	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
933	aese	v4.16b, v27.16b
934	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
935
936	aese	v7.16b, v28.16b
937	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
938	aese	v2.16b, v28.16b
939	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
940	aese	v1.16b, v28.16b
941	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
942	ext	v18.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment
943
944	aese	v6.16b, v28.16b
945	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
946.inst	0xce114a73	//eor3 v19.16b, v19.16b, v17.16b, v18.16b		 	//MODULO - fold into low
947	aese	v4.16b, v28.16b
948	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
949
950	aese	v3.16b, v28.16b
951	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
952	aese	v0.16b, v28.16b
953	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
954	aese	v5.16b, v28.16b
955	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
956
957	ldr	q27, [x8, #160]					//load rk10
958	aese	v6.16b, v26.16b						//AES block 8k+14 - round 9
959	aese	v2.16b, v26.16b						//AES block 8k+10 - round 9
960
961	aese	v0.16b, v26.16b						//AES block 8k+8 - round 9
962	aese	v1.16b, v26.16b						//AES block 8k+9 - round 9
963
964	aese	v3.16b, v26.16b						//AES block 8k+11 - round 9
965	aese	v5.16b, v26.16b						//AES block 8k+13 - round 9
966
967	aese	v4.16b, v26.16b						//AES block 8k+12 - round 9
968	aese	v7.16b, v26.16b						//AES block 8k+15 - round 9
969.L128_enc_tail:	//TAIL
	//Tail entry: produce the first tail ciphertext block in v9, load the hash-key
	//values used by the longest tail case, then dispatch on the byte count in x5
	//to the matching .L128_enc_blocks_more_than_N label.
	//For every case that is skipped:
	//  - the pending keystream blocks are moved between v1..v7 so that the register
	//    each later label xors with (v2, v3, ... v7) holds the next unused block;
	//  - the counter in v30 is wound back by one increment (v31).
	//v17/v18/v19 (GHASH high/mid/low accumulators) are zeroed once the 8-block case
	//is skipped, because only that case writes them by assignment; the later cases
	//accumulate into them with eor.
970
971	sub	x5, x4, x0 	//x5 = x4 - x0: number of bytes left to process
972	ldr	q8, [x0], #16				//AES block 8k+8 - load plaintext
973
974	mov	v29.16b, v27.16b	//keep a copy of v27 in v29 (rk10 on the fall-through path); v27 is reused as GHASH scratch in the per-block code
975	ldp	q20, q21, [x3, #128]			//load h5l | h5h (q20) and the following 16 bytes at #144 (q21)
976	ext	v20.16b, v20.16b, v20.16b, #8
977
978.inst	0xce007509	//eor3 v9.16b, v8.16b, v0.16b, v29.16b			//AES block 8k+8 - result
979	ext	v16.16b, v19.16b, v19.16b, #8				//prepare final partial tag (v19 with its 64-bit halves swapped)
980	ldp	q22, q23, [x3, #160]			//load h6l | h6h (q22) and the following 16 bytes at #176 (q23)
981	ext	v22.16b, v22.16b, v22.16b, #8
982	ext	v23.16b, v23.16b, v23.16b, #8
983
984	ldp	q24, q25, [x3, #192]			//load h8k | h7k (q24) and the following 16 bytes at #208 (q25)
985	ext	v25.16b, v25.16b, v25.16b, #8
986	cmp	x5, #112	//more than 7 blocks (112 bytes) left?
987	b.gt	.L128_enc_blocks_more_than_7
988
989	mov	v7.16b, v6.16b
990	mov	v6.16b, v5.16b
991	movi	v17.8b, #0	//clear GHASH high accumulator
992
993	cmp	x5, #96	//flags are consumed by the b.gt below; the vector ops in between do not touch them
994	sub	v30.4s, v30.4s, v31.4s	//wind the counter back by one increment
995	mov	v5.16b, v4.16b
996
997	mov	v4.16b, v3.16b
998	mov	v3.16b, v2.16b
999	mov	v2.16b, v1.16b
1000
1001	movi	v19.8b, #0	//clear GHASH low accumulator (its previous value was captured in v16 above)
1002	movi	v18.8b, #0	//clear GHASH mid accumulator
1003	b.gt	.L128_enc_blocks_more_than_6
1004
1005	mov	v7.16b, v6.16b
1006	cmp	x5, #80
1007
1008	sub	v30.4s, v30.4s, v31.4s
1009	mov	v6.16b, v5.16b
1010	mov	v5.16b, v4.16b
1011
1012	mov	v4.16b, v3.16b
1013	mov	v3.16b, v1.16b
1014	b.gt	.L128_enc_blocks_more_than_5
1015
1016	cmp	x5, #64
1017	sub	v30.4s, v30.4s, v31.4s
1018
1019	mov	v7.16b, v6.16b
1020	mov	v6.16b, v5.16b
1021
1022	mov	v5.16b, v4.16b
1023	mov	v4.16b, v1.16b
1024	b.gt	.L128_enc_blocks_more_than_4
1025
1026	mov	v7.16b, v6.16b
1027	sub	v30.4s, v30.4s, v31.4s
1028	mov	v6.16b, v5.16b
1029
1030	mov	v5.16b, v1.16b
1031	cmp	x5, #48
1032	b.gt	.L128_enc_blocks_more_than_3
1033
1034	sub	v30.4s, v30.4s, v31.4s
1035	mov	v7.16b, v6.16b
1036	mov	v6.16b, v1.16b
1037
1038	cmp	x5, #32
1039	ldr	q24, [x3, #96]					//load h4k | h3k (the copy of this load in .L128_enc_blocks_more_than_3 is skipped when branching to more_than_2)
1040	b.gt	.L128_enc_blocks_more_than_2
1041
1042	cmp	x5, #16
1043
1044	sub	v30.4s, v30.4s, v31.4s
1045	mov	v7.16b, v1.16b
1046	b.gt	.L128_enc_blocks_more_than_1
1047
1048	ldr	q21, [x3, #48]					//load h2k | h1k (the copy of this load in .L128_enc_blocks_more_than_1 is skipped on this path)
1049	sub	v30.4s, v30.4s, v31.4s
1050	b	.L128_enc_blocks_less_than_1
1051.L128_enc_blocks_more_than_7:	//blocks	left >  7
	//Entered from the tail dispatcher when x5 > 112.
	//Stores the ciphertext block in v9 to [x2], byte-reverses it into v8 and folds
	//in the running tag (v16). This is the only tail case that writes the GHASH
	//accumulators by assignment instead of eor: v17 = high, v18 = mid, v19 = low.
	//The multiplier is v25 and, for the middle term, the high half of v24.
	//v9 is then recomputed as the next ciphertext block from keystream v1.
1052	st1	{ v9.16b}, [x2], #16				//AES final-7 block  - store result
1053
1054	rev64	v8.16b, v9.16b						//GHASH final-7 block
1055	ldr	q9, [x0], #16				//AES final-6 block - load plaintext
1056
1057	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
1058
1059	ins	v27.d[0], v8.d[1]					//GHASH final-7 block - mid
1060
1061	pmull2	v17.1q, v8.2d, v25.2d				//GHASH final-7 block - high
1062
1063	ins	v18.d[0], v24.d[1]					//GHASH final-7 block - mid
1064
1065	eor	v27.8b, v27.8b, v8.8b				//GHASH final-7 block - mid
1066	movi	v16.8b, #0						//suppress further partial tag feed in
1067
1068.inst	0xce017529	//eor3 v9.16b, v9.16b, v1.16b, v29.16b			//AES final-6 block - result
1069
1070	pmull	v18.1q, v27.1d, v18.1d				//GHASH final-7 block - mid
1071	pmull	v19.1q, v8.1d, v25.1d				//GHASH final-7 block - low
1072.L128_enc_blocks_more_than_6:	//blocks	left >  6
	//Entered by fall-through or from the tail dispatcher (x5 > 96).
	//Stores ciphertext block v9, GHASHes it into the temporaries v26 (low),
	//v27 (mid) and v28 (high) using v23 and the low half of v24, and
	//eor-accumulates them into v19/v18/v17.
	//The next ciphertext block is formed in v9 from keystream v2.
1073
1074	st1	{ v9.16b}, [x2], #16				//AES final-6 block - store result
1075
1076	rev64	v8.16b, v9.16b						//GHASH final-6 block
1077	ldr	q9, [x0], #16				//AES final-5 block - load plaintext
1078
1079	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
1080
1081	ins	v27.d[0], v8.d[1]					//GHASH final-6 block - mid
1082
1083.inst	0xce027529	//eor3 v9.16b, v9.16b, v2.16b, v29.16b			//AES final-5 block - result
1084	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-6 block - low
1085
1086	eor	v27.8b, v27.8b, v8.8b				//GHASH final-6 block - mid
1087	movi	v16.8b, #0						//suppress further partial tag feed in
1088
1089	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-6 block - mid
1090	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-6 block - high
1091
1092	eor	v19.16b, v19.16b, v26.16b					//GHASH final-6 block - low
1093
1094	eor	v18.16b, v18.16b, v27.16b				//GHASH final-6 block - mid
1095	eor	v17.16b, v17.16b, v28.16b					//GHASH final-6 block - high
1096.L128_enc_blocks_more_than_5:	//blocks	left >  5
	//Entered by fall-through or from the tail dispatcher (x5 > 80).
	//Stores ciphertext block v9 and eor-accumulates its GHASH products into
	//v17 (high), v18 (mid), v19 (low) using v22 and the high half of v21.
	//The middle operand is duplicated into v27.d[1] so that pmull2 can pair it
	//with the high half of v21.
	//The next ciphertext block is formed in v9 from keystream v3.
1097
1098	st1	{ v9.16b}, [x2], #16				//AES final-5 block - store result
1099
1100	rev64	v8.16b, v9.16b						//GHASH final-5 block
1101
1102	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
1103
1104	ins	v27.d[0], v8.d[1]					//GHASH final-5 block - mid
1105	ldr	q9, [x0], #16				//AES final-4 block - load plaintext
1106	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-5 block - high
1107
1108	eor	v17.16b, v17.16b, v28.16b					//GHASH final-5 block - high
1109
1110	eor	v27.8b, v27.8b, v8.8b				//GHASH final-5 block - mid
1111
1112	ins	v27.d[1], v27.d[0]					//GHASH final-5 block - mid
1113
1114.inst	0xce037529	//eor3 v9.16b, v9.16b, v3.16b, v29.16b			//AES final-4 block - result
1115	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-5 block - low
1116	movi	v16.8b, #0						//suppress further partial tag feed in
1117
1118	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-5 block - mid
1119	eor	v19.16b, v19.16b, v26.16b					//GHASH final-5 block - low
1120
1121	eor	v18.16b, v18.16b, v27.16b				//GHASH final-5 block - mid
1122.L128_enc_blocks_more_than_4:	//blocks	left >  4
	//Entered by fall-through or from the tail dispatcher (x5 > 64).
	//Stores ciphertext block v9 and eor-accumulates its GHASH products into
	//v17 (high), v18 (mid), v19 (low) using v20 and the low half of v21.
	//The next ciphertext block is formed in v9 from keystream v4.
1123
1124	st1	{ v9.16b}, [x2], #16			  	//AES final-4 block - store result
1125
1126	rev64	v8.16b, v9.16b						//GHASH final-4 block
1127
1128	ldr	q9, [x0], #16				//AES final-3 block - load plaintext
1129
1130	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
1131
1132	ins	v27.d[0], v8.d[1]					//GHASH final-4 block - mid
1133	movi	v16.8b, #0						//suppress further partial tag feed in
1134	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final-4 block - high
1135
1136	eor	v27.8b, v27.8b, v8.8b				//GHASH final-4 block - mid
1137
1138	pmull	v26.1q, v8.1d, v20.1d				//GHASH final-4 block - low
1139
1140	eor	v17.16b, v17.16b, v28.16b					//GHASH final-4 block - high
1141	pmull	v27.1q, v27.1d, v21.1d				//GHASH final-4 block - mid
1142
1143	eor	v19.16b, v19.16b, v26.16b					//GHASH final-4 block - low
1144
1145.inst	0xce047529	//eor3 v9.16b, v9.16b, v4.16b, v29.16b			//AES final-3 block - result
1146	eor	v18.16b, v18.16b, v27.16b				//GHASH final-4 block - mid
1147.L128_enc_blocks_more_than_3:	//blocks	left >  3
	//Entered by fall-through or from the tail dispatcher (x5 > 48).
	//Reloads v25 and v24 with the hash-key values at [x3, #112] and [x3, #96],
	//replacing the values held there for the longer tail cases.
	//Stores ciphertext block v9 and eor-accumulates its GHASH products into
	//v17 (high), v18 (mid), v19 (low) using v25 and the high half of v24.
	//The next ciphertext block is formed in v9 from keystream v5.
1148
1149	st1	{ v9.16b}, [x2], #16			  	//AES final-3 block - store result
1150
1151	ldr	q25, [x3, #112]				//load h4l | h4h
1152	ext	v25.16b, v25.16b, v25.16b, #8
1153
1154	rev64	v8.16b, v9.16b						//GHASH final-3 block
1155
1156	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
1157	movi	v16.8b, #0						//suppress further partial tag feed in
1158
1159	ins	v27.d[0], v8.d[1]					//GHASH final-3 block - mid
1160	ldr	q24, [x3, #96]				//load h4k | h3k
1161	pmull	v26.1q, v8.1d, v25.1d				//GHASH final-3 block - low
1162
1163	ldr	q9, [x0], #16				//AES final-2 block - load plaintext
1164
1165	eor	v27.8b, v27.8b, v8.8b				//GHASH final-3 block - mid
1166
1167	ins	v27.d[1], v27.d[0]					//GHASH final-3 block - mid
1168	eor	v19.16b, v19.16b, v26.16b					//GHASH final-3 block - low
1169
1170.inst	0xce057529	//eor3 v9.16b, v9.16b, v5.16b, v29.16b			//AES final-2 block - result
1171
1172	pmull2	v27.1q, v27.2d, v24.2d				//GHASH final-3 block - mid
1173	pmull2	v28.1q, v8.2d, v25.2d				//GHASH final-3 block - high
1174
1175	eor	v18.16b, v18.16b, v27.16b				//GHASH final-3 block - mid
1176	eor	v17.16b, v17.16b, v28.16b					//GHASH final-3 block - high
1177.L128_enc_blocks_more_than_2:	//blocks	left >  2
	//Entered by fall-through or from the tail dispatcher (x5 > 32); in the latter
	//case v24 was loaded from [x3, #96] by the dispatcher.
	//Reloads v23 from [x3, #80], stores ciphertext block v9 and eor-accumulates
	//its GHASH products into v17 (high), v18 (mid), v19 (low) using v23 and the
	//low half of v24.
	//The next ciphertext block is formed in v9 from keystream v6.
1178
1179	st1	{ v9.16b}, [x2], #16			  	//AES final-2 block - store result
1180
1181	rev64	v8.16b, v9.16b						//GHASH final-2 block
1182
1183	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
1184
1185	ldr	q9, [x0], #16				//AES final-1 block - load plaintext
1186
1187	ins	v27.d[0], v8.d[1]					//GHASH final-2 block - mid
1188	ldr	q23, [x3, #80]				//load h3l | h3h
1189	ext	v23.16b, v23.16b, v23.16b, #8
1190	movi	v16.8b, #0						//suppress further partial tag feed in
1191
1192	eor	v27.8b, v27.8b, v8.8b				//GHASH final-2 block - mid
1193.inst	0xce067529	//eor3 v9.16b, v9.16b, v6.16b, v29.16b			//AES final-1 block - result
1194
1195	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-2 block - high
1196
1197	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-2 block - low
1198	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-2 block - mid
1199
1200	eor	v17.16b, v17.16b, v28.16b					//GHASH final-2 block - high
1201
1202	eor	v18.16b, v18.16b, v27.16b				//GHASH final-2 block - mid
1203	eor	v19.16b, v19.16b, v26.16b					//GHASH final-2 block - low
1204.L128_enc_blocks_more_than_1:	//blocks	left >  1
	//Entered by fall-through or from the tail dispatcher (x5 > 16).
	//Reloads v22 from [x3, #64] and v21 from [x3, #48], stores ciphertext block
	//v9 and eor-accumulates its GHASH products into v17 (high), v18 (mid),
	//v19 (low) using v22 and the high half of v21.
	//The final ciphertext block is formed in v9 from keystream v7; it is not
	//stored here but masked and stored by the code that follows.
1205
1206	st1	{ v9.16b}, [x2], #16			  	//AES final-1 block - store result
1207
1208	ldr	q22, [x3, #64]				//load h2l | h2h
1209	ext	v22.16b, v22.16b, v22.16b, #8
1210	rev64	v8.16b, v9.16b						//GHASH final-1 block
1211	ldr	q9, [x0], #16				//AES final block - load plaintext
1212
1213	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
1214
1215	movi	v16.8b, #0						//suppress further partial tag feed in
1216	ins	v27.d[0], v8.d[1]					//GHASH final-1 block - mid
1217.inst	0xce077529	//eor3 v9.16b, v9.16b, v7.16b, v29.16b			//AES final block - result
1218
1219	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-1 block - high
1220
1221	eor	v27.8b, v27.8b, v8.8b				//GHASH final-1 block - mid
1222
1223	ldr	q21, [x3, #48]				//load h2k | h1k
1224
1225	ins	v27.d[1], v27.d[0]					//GHASH final-1 block - mid
1226
1227	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-1 block - low
1228	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-1 block - mid
1229
1230	eor	v17.16b, v17.16b, v28.16b					//GHASH final-1 block - high
1231
1232	eor	v18.16b, v18.16b, v27.16b				//GHASH final-1 block - mid
1233	eor	v19.16b, v19.16b, v26.16b					//GHASH final-1 block - low
1234.L128_enc_blocks_less_than_1:	//blocks	left <= 1
	//Final (possibly partial) block and wrap-up. On entry v9 holds the last
	//ciphertext block, not yet stored, and v21 holds the value from [x3, #48].
	//  1. Store the updated counter (v30, byte-swapped per 32-bit word) to [x16].
	//  2. Build a mask in v0 from the bit length in x1, clear the unused bits of
	//     v9, and merge in the bytes already present at [x2] before storing 16 bytes.
	//  3. GHASH the masked block, run the MODULO reduction steps with the constant
	//     loaded from [x10], and store the resulting tag to [x3].
	//  4. Return x9 and restore d8-d15 together with the 80-byte stack frame.
1235
1236	rev32	v30.16b, v30.16b
1237	str	q30, [x16]					//store the updated counter
1238	and	x1, x1, #127			 	//bit_length %= 128
1239
1240	sub	x1, x1, #128			 	//bit_length -= 128
1241
1242	neg	x1, x1				//bit_length = 128 - #bits in input (in range [1,128])
1243
1244	mvn	x6, xzr						//temp0_x = 0xffffffffffffffff
1245	ld1	{ v26.16b}, [x2]					//load existing bytes where the possibly partial last block is to be stored
1246	and	x1, x1, #127			 	//bit_length %= 128 (maps 128 to 0, so a full block gets an all-ones mask)
1247
1248	lsr	x6, x6, x1				//temp0_x is mask for top 64b of last block (register shift uses x1 modulo 64)
1249	mvn	x7, xzr						//temp1_x = 0xffffffffffffffff
1250	cmp	x1, #64
1251
1252	csel	x13, x7, x6, lt	//x1 < 64: low 64 bits fully kept; otherwise low 64 bits take the shifted mask
1253	csel	x14, x6, xzr, lt	//x1 < 64: high 64 bits take the shifted mask; otherwise high 64 bits are cleared
1254
1255	mov	v0.d[1], x14
1256	mov	v0.d[0], x13					//ctr0b is mask for last block
1257
1258	and	v9.16b, v9.16b, v0.16b					//possibly partial last block has zeroes in highest bits
1259
1260	rev64	v8.16b, v9.16b						//GHASH final block
1261
1262	bif	v9.16b, v26.16b, v0.16b					//insert existing bytes in top end of result before storing
1263	st1	{ v9.16b}, [x2]				//store all 16B
1264
1265	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
1266
1267	ins	v16.d[0], v8.d[1]					//GHASH final block - mid
1268
1269	eor	v16.8b, v16.8b, v8.8b				//GHASH final block - mid
1270	ldr	q20, [x3, #32]				//load h1l | h1h
1271	ext	v20.16b, v20.16b, v20.16b, #8
1272
1273	pmull	v16.1q, v16.1d, v21.1d				//GHASH final block - mid
1274
1275	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final block - high
1276	eor	v18.16b, v18.16b, v16.16b				//GHASH final block - mid
1277	ldr	d16, [x10]			//MODULO - load modulo constant
1278
1279	pmull	v26.1q, v8.1d, v20.1d				//GHASH final block - low
1280
1281	eor	v17.16b, v17.16b, v28.16b					//GHASH final block - high
1282
1283	eor	v19.16b, v19.16b, v26.16b					//GHASH final block - low
1284
1285	ext	v21.16b, v17.16b, v17.16b, #8			 	//MODULO - other top alignment
1286	pmull	v29.1q, v17.1d, v16.1d		  	//MODULO - top 64b align with mid
1287
1288.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		  	//MODULO - karatsuba tidy up
1289
1290.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b		 	//MODULO - fold into mid
1291
1292	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
1293	ext	v21.16b, v18.16b, v18.16b, #8			  	//MODULO - other mid alignment
1294
1295.inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		  	//MODULO - fold into low
1296	ext	v19.16b, v19.16b, v19.16b, #8	//swap the 64-bit halves of the reduced result
1297	rev64	v19.16b, v19.16b	//byte-reverse each half before storing
1298	st1	{ v19.16b }, [x3]	//write the updated tag back to [x3]
1299	mov	x0, x9	//return value: x9 (set to x1 >> 3 at function entry)
1300
1301	ldp	d10, d11, [sp, #16]	//restore the d8-d15 registers saved at function entry
1302	ldp	d12, d13, [sp, #32]
1303	ldp	d14, d15, [sp, #48]
1304	ldp	d8, d9, [sp], #80	//restore d8/d9 and release the 80-byte frame
1305	ret
1306
1307.L128_enc_ret:
	//Early exit taken by the cbz x1 check at function entry, before the stack
	//frame is created: nothing to restore, return 0.
1308	mov	w0, #0x0
1309	ret
1310.size	unroll8_eor3_aes_gcm_enc_128_kernel,.-unroll8_eor3_aes_gcm_enc_128_kernel
1311.globl	unroll8_eor3_aes_gcm_dec_128_kernel
1312.type	unroll8_eor3_aes_gcm_dec_128_kernel,%function
1313.align	4
1314unroll8_eor3_aes_gcm_dec_128_kernel:
1315	AARCH64_VALID_CALL_TARGET
1316	cbz	x1, .L128_dec_ret
1317	stp	d8, d9, [sp, #-80]!
1318	lsr	x9, x1, #3
1319	mov	x16, x4
1320	mov	x8, x5
1321	stp	d10, d11, [sp, #16]
1322	stp	d12, d13, [sp, #32]
1323	stp	d14, d15, [sp, #48]
1324	mov	x5, #0xc200000000000000
1325	stp	x5, xzr, [sp, #64]
1326	add	x10, sp, #64
1327
1328	mov	x5, x9
1329	ld1	{ v0.16b}, [x16]					//CTR block 0
1330
1331	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
1332	sub	x5, x5, #1		//byte_len - 1
1333
1334	mov	x15, #0x100000000				//set up counter increment
1335	movi	v31.16b, #0x0
1336	mov	v31.d[1], x15
1337	ld1	{ v19.16b}, [x3]
1338	ext	v19.16b, v19.16b, v19.16b, #8
1339	rev64	v19.16b, v19.16b
1340
1341	rev32	v30.16b, v0.16b				//set up reversed counter
1342
1343	aese	v0.16b, v26.16b
1344	aesmc	v0.16b, v0.16b			//AES block 0 - round 0
1345
1346	add	v30.4s, v30.4s, v31.4s		//CTR block 0
1347
1348	rev32	v1.16b, v30.16b				//CTR block 1
1349	add	v30.4s, v30.4s, v31.4s		//CTR block 1
1350
1351	and	x5, x5, #0xffffffffffffff80	//number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
1352
1353	rev32	v2.16b, v30.16b				//CTR block 2
1354	add	v30.4s, v30.4s, v31.4s		//CTR block 2
1355	aese	v1.16b, v26.16b
1356	aesmc	v1.16b, v1.16b			//AES block 1 - round 0
1357
1358	rev32	v3.16b, v30.16b				//CTR block 3
1359	add	v30.4s, v30.4s, v31.4s		//CTR block 3
1360
1361	aese	v0.16b, v27.16b
1362	aesmc	v0.16b, v0.16b			//AES block 0 - round 1
1363	aese	v1.16b, v27.16b
1364	aesmc	v1.16b, v1.16b			//AES block 1 - round 1
1365
1366	rev32	v4.16b, v30.16b				//CTR block 4
1367	add	v30.4s, v30.4s, v31.4s		//CTR block 4
1368
1369	rev32	v5.16b, v30.16b				//CTR block 5
1370	add	v30.4s, v30.4s, v31.4s		//CTR block 5
1371
1372	aese	v2.16b, v26.16b
1373	aesmc	v2.16b, v2.16b			//AES block 2 - round 0
1374
1375	rev32	v6.16b, v30.16b				//CTR block 6
1376	add	v30.4s, v30.4s, v31.4s		//CTR block 6
1377	aese	v5.16b, v26.16b
1378	aesmc	v5.16b, v5.16b			//AES block 5 - round 0
1379
1380	aese	v3.16b, v26.16b
1381	aesmc	v3.16b, v3.16b			//AES block 3 - round 0
1382	aese	v4.16b, v26.16b
1383	aesmc	v4.16b, v4.16b			//AES block 4 - round 0
1384
1385	rev32	v7.16b, v30.16b				//CTR block 7
1386
1387	aese	v6.16b, v26.16b
1388	aesmc	v6.16b, v6.16b			//AES block 6 - round 0
1389	aese	v2.16b, v27.16b
1390	aesmc	v2.16b, v2.16b			//AES block 2 - round 1
1391
1392	aese	v7.16b, v26.16b
1393	aesmc	v7.16b, v7.16b			//AES block 7 - round 0
1394
1395	ldp	q28, q26, [x8, #32]				//load rk2, rk3
1396
1397	aese	v6.16b, v27.16b
1398	aesmc	v6.16b, v6.16b			//AES block 6 - round 1
1399	aese	v5.16b, v27.16b
1400	aesmc	v5.16b, v5.16b			//AES block 5 - round 1
1401
1402	aese	v4.16b, v27.16b
1403	aesmc	v4.16b, v4.16b			//AES block 4 - round 1
1404	aese	v7.16b, v27.16b
1405	aesmc	v7.16b, v7.16b			//AES block 7 - round 1
1406
1407	aese	v7.16b, v28.16b
1408	aesmc	v7.16b, v7.16b			//AES block 7 - round 2
1409	aese	v0.16b, v28.16b
1410	aesmc	v0.16b, v0.16b			//AES block 0 - round 2
1411	aese	v3.16b, v27.16b
1412	aesmc	v3.16b, v3.16b			//AES block 3 - round 1
1413
1414	aese	v6.16b, v28.16b
1415	aesmc	v6.16b, v6.16b			//AES block 6 - round 2
1416	aese	v2.16b, v28.16b
1417	aesmc	v2.16b, v2.16b			//AES block 2 - round 2
1418	aese	v5.16b, v28.16b
1419	aesmc	v5.16b, v5.16b			//AES block 5 - round 2
1420
1421	aese	v4.16b, v28.16b
1422	aesmc	v4.16b, v4.16b			//AES block 4 - round 2
1423	aese	v3.16b, v28.16b
1424	aesmc	v3.16b, v3.16b			//AES block 3 - round 2
1425	aese	v1.16b, v28.16b
1426	aesmc	v1.16b, v1.16b			//AES block 1 - round 2
1427
1428	aese	v6.16b, v26.16b
1429	aesmc	v6.16b, v6.16b			//AES block 6 - round 3
1430	aese	v2.16b, v26.16b
1431	aesmc	v2.16b, v2.16b			//AES block 2 - round 3
1432
1433	ldp	q27, q28, [x8, #64]				//load rk4, rk5
1434	aese	v5.16b, v26.16b
1435	aesmc	v5.16b, v5.16b			//AES block 5 - round 3
1436
1437	aese	v0.16b, v26.16b
1438	aesmc	v0.16b, v0.16b			//AES block 0 - round 3
1439	aese	v7.16b, v26.16b
1440	aesmc	v7.16b, v7.16b			//AES block 7 - round 3
1441
1442	aese	v3.16b, v26.16b
1443	aesmc	v3.16b, v3.16b			//AES block 3 - round 3
1444	aese	v1.16b, v26.16b
1445	aesmc	v1.16b, v1.16b			//AES block 1 - round 3
1446
1447	aese	v0.16b, v27.16b
1448	aesmc	v0.16b, v0.16b			//AES block 0 - round 4
1449	aese	v7.16b, v27.16b
1450	aesmc	v7.16b, v7.16b			//AES block 7 - round 4
1451	aese	v4.16b, v26.16b
1452	aesmc	v4.16b, v4.16b			//AES block 4 - round 3
1453
1454	aese	v6.16b, v27.16b
1455	aesmc	v6.16b, v6.16b			//AES block 6 - round 4
1456	aese	v1.16b, v27.16b
1457	aesmc	v1.16b, v1.16b			//AES block 1 - round 4
1458	aese	v3.16b, v27.16b
1459	aesmc	v3.16b, v3.16b			//AES block 3 - round 4
1460
1461	aese	v5.16b, v27.16b
1462	aesmc	v5.16b, v5.16b			//AES block 5 - round 4
1463	aese	v4.16b, v27.16b
1464	aesmc	v4.16b, v4.16b			//AES block 4 - round 4
1465	aese	v2.16b, v27.16b
1466	aesmc	v2.16b, v2.16b			//AES block 2 - round 4
1467
1468	ldp	q26, q27, [x8, #96]				//load rk6, rk7
1469	aese	v2.16b, v28.16b
1470	aesmc	v2.16b, v2.16b			//AES block 2 - round 5
1471	aese	v3.16b, v28.16b
1472	aesmc	v3.16b, v3.16b			//AES block 3 - round 5
1473
1474	aese	v6.16b, v28.16b
1475	aesmc	v6.16b, v6.16b			//AES block 6 - round 5
1476	aese	v1.16b, v28.16b
1477	aesmc	v1.16b, v1.16b			//AES block 1 - round 5
1478
1479	aese	v7.16b, v28.16b
1480	aesmc	v7.16b, v7.16b			//AES block 7 - round 5
1481	aese	v5.16b, v28.16b
1482	aesmc	v5.16b, v5.16b			//AES block 5 - round 5
1483
1484	aese	v4.16b, v28.16b
1485	aesmc	v4.16b, v4.16b			//AES block 4 - round 5
1486
1487	aese	v3.16b, v26.16b
1488	aesmc	v3.16b, v3.16b			//AES block 3 - round 6
1489	aese	v2.16b, v26.16b
1490	aesmc	v2.16b, v2.16b			//AES block 2 - round 6
1491	aese	v0.16b, v28.16b
1492	aesmc	v0.16b, v0.16b			//AES block 0 - round 5
1493
1494	aese	v5.16b, v26.16b
1495	aesmc	v5.16b, v5.16b			//AES block 5 - round 6
1496	aese	v4.16b, v26.16b
1497	aesmc	v4.16b, v4.16b			//AES block 4 - round 6
1498	aese	v1.16b, v26.16b
1499	aesmc	v1.16b, v1.16b			//AES block 1 - round 6
1500
1501	aese	v0.16b, v26.16b
1502	aesmc	v0.16b, v0.16b			//AES block 0 - round 6
1503	aese	v7.16b, v26.16b
1504	aesmc	v7.16b, v7.16b			//AES block 7 - round 6
1505	aese	v6.16b, v26.16b
1506	aesmc	v6.16b, v6.16b			//AES block 6 - round 6
1507
1508	aese	v3.16b, v27.16b
1509	aesmc	v3.16b, v3.16b			//AES block 3 - round 7
1510	aese	v4.16b, v27.16b
1511	aesmc	v4.16b, v4.16b			//AES block 4 - round 7
1512	aese	v1.16b, v27.16b
1513	aesmc	v1.16b, v1.16b			//AES block 1 - round 7
1514
1515	aese	v7.16b, v27.16b
1516	aesmc	v7.16b, v7.16b			//AES block 7 - round 7
1517	aese	v5.16b, v27.16b
1518	aesmc	v5.16b, v5.16b			//AES block 5 - round 7
1519	ldp	q28, q26, [x8, #128]				//load rk8, rk9
1520
1521	aese	v6.16b, v27.16b
1522	aesmc	v6.16b, v6.16b			//AES block 6 - round 7
1523	aese	v2.16b, v27.16b
1524	aesmc	v2.16b, v2.16b			//AES block 2 - round 7
1525	aese	v0.16b, v27.16b
1526	aesmc	v0.16b, v0.16b			//AES block 0 - round 7
1527
1528	add	x5, x5, x0
1529	add	v30.4s, v30.4s, v31.4s		//CTR block 7
1530
1531	aese	v6.16b, v28.16b
1532	aesmc	v6.16b, v6.16b			//AES block 6 - round 8
1533	aese	v0.16b, v28.16b
1534	aesmc	v0.16b, v0.16b			//AES block 0 - round 8
1535
1536	aese	v1.16b, v28.16b
1537	aesmc	v1.16b, v1.16b			//AES block 1 - round 8
1538	aese	v7.16b, v28.16b
1539	aesmc	v7.16b, v7.16b			//AES block 7 - round 8
1540	aese	v3.16b, v28.16b
1541	aesmc	v3.16b, v3.16b			//AES block 3 - round 8
1542
1543	aese	v5.16b, v28.16b
1544	aesmc	v5.16b, v5.16b			//AES block 5 - round 8
1545	aese	v2.16b, v28.16b
1546	aesmc	v2.16b, v2.16b			//AES block 2 - round 8
1547	aese	v4.16b, v28.16b
1548	aesmc	v4.16b, v4.16b			//AES block 4 - round 8
1549
1550	aese	v0.16b, v26.16b						//AES block 0 - round 9
1551	aese	v1.16b, v26.16b						//AES block 1 - round 9
1552	aese	v6.16b, v26.16b						//AES block 6 - round 9
1553
1554	ldr	q27, [x8, #160]					//load rk10
1555	aese	v4.16b, v26.16b						//AES block 4 - round 9
1556	aese	v3.16b, v26.16b						//AES block 3 - round 9
1557
1558	aese	v2.16b, v26.16b						//AES block 2 - round 9
1559	aese	v5.16b, v26.16b						//AES block 5 - round 9
1560	aese	v7.16b, v26.16b						//AES block 7 - round 9
1561
1562	add	x4, x0, x1, lsr #3		//end_input_ptr
1563	cmp	x0, x5				//check if we have <= 8 blocks
1564	b.ge	.L128_dec_tail						//handle tail
1565
1566	ldp	q8, q9, [x0], #32			//AES block 0, 1 - load ciphertext
1567
1568.inst	0xce006d00	//eor3 v0.16b, v8.16b, v0.16b, v27.16b				//AES block 0 - result
1569.inst	0xce016d21	//eor3 v1.16b, v9.16b, v1.16b, v27.16b				//AES block 1 - result
1570	stp	q0, q1, [x2], #32			//AES block 0, 1 - store result
1571
1572	rev32	v0.16b, v30.16b				//CTR block 8
1573	add	v30.4s, v30.4s, v31.4s		//CTR block 8
1574	ldp	q10, q11, [x0], #32			//AES block 2, 3 - load ciphertext
1575
1576	ldp	q12, q13, [x0], #32			//AES block 4, 5 - load ciphertext
1577
1578	rev32	v1.16b, v30.16b				//CTR block 9
1579	add	v30.4s, v30.4s, v31.4s		//CTR block 9
1580	ldp	q14, q15, [x0], #32			//AES block 6, 7 - load ciphertext
1581
1582.inst	0xce036d63	//eor3 v3.16b, v11.16b, v3.16b, v27.16b				//AES block 3 - result
1583.inst	0xce026d42	//eor3 v2.16b, v10.16b, v2.16b, v27.16b				//AES block 2 - result
1584	stp	q2, q3, [x2], #32			//AES block 2, 3 - store result
1585
1586	rev32	v2.16b, v30.16b				//CTR block 10
1587	add	v30.4s, v30.4s, v31.4s		//CTR block 10
1588
1589.inst	0xce066dc6	//eor3 v6.16b, v14.16b, v6.16b, v27.16b				//AES block 6 - result
1590
1591	rev32	v3.16b, v30.16b				//CTR block 11
1592	add	v30.4s, v30.4s, v31.4s		//CTR block 11
1593
1594.inst	0xce046d84	//eor3 v4.16b, v12.16b, v4.16b, v27.16b				//AES block 4 - result
1595.inst	0xce056da5	//eor3 v5.16b, v13.16b, v5.16b, v27.16b				//AES block 5 - result
1596	stp	q4, q5, [x2], #32			//AES block 4, 5 - store result
1597
1598.inst	0xce076de7	//eor3 v7.16b, v15.16b, v7.16b, v27.16b				//AES block 7 - result
1599	stp	q6, q7, [x2], #32			//AES block 6, 7 - store result
1600	rev32	v4.16b, v30.16b				//CTR block 12
1601
1602	cmp	x0, x5				//check if we have <= 8 blocks
1603	add	v30.4s, v30.4s, v31.4s		//CTR block 12
1604	b.ge	.L128_dec_prepretail					//do prepretail
1605
1606.L128_dec_main_loop:	//main	loop start
1607	ldr	q23, [x3, #176]				//load h7l | h7h
1608	ext	v23.16b, v23.16b, v23.16b, #8
1609	ldr	q25, [x3, #208]				//load h8l | h8h
1610	ext	v25.16b, v25.16b, v25.16b, #8
1611
1612	rev64	v9.16b, v9.16b						//GHASH block 8k+1
1613	rev64	v8.16b, v8.16b						//GHASH block 8k
1614	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
1615
1616	rev64	v14.16b, v14.16b						//GHASH block 8k+6
1617	ldr	q20, [x3, #128]				//load h5l | h5h
1618	ext	v20.16b, v20.16b, v20.16b, #8
1619	ldr	q22, [x3, #160]				//load h6l | h6h
1620	ext	v22.16b, v22.16b, v22.16b, #8
1621
1622	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1
1623	rev32	v5.16b, v30.16b				//CTR block 8k+13
1624	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
1625
1626	rev64	v10.16b, v10.16b						//GHASH block 8k+2
1627	rev64	v12.16b, v12.16b						//GHASH block 8k+4
1628	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
1629
1630	rev32	v6.16b, v30.16b				//CTR block 8k+14
1631	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
1632	ldr	q21, [x3, #144]				//load h6k | h5k
1633	ldr	q24, [x3, #192]				//load h8k | h7k
1634
1635	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
1636	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
1637	rev64	v11.16b, v11.16b						//GHASH block 8k+3
1638
1639	rev32	v7.16b, v30.16b				//CTR block 8k+15
1640	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
1641	rev64	v13.16b, v13.16b						//GHASH block 8k+5
1642
1643	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
1644	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
1645	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
1646
1647	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
1648	aese	v4.16b, v26.16b
1649	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
1650	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
1651
1652	aese	v6.16b, v26.16b
1653	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
1654	aese	v5.16b, v26.16b
1655	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
1656	aese	v7.16b, v26.16b
1657	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
1658
1659	aese	v3.16b, v26.16b
1660	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
1661	aese	v2.16b, v26.16b
1662	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
1663	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
1664
1665	aese	v1.16b, v26.16b
1666	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
1667	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
1668	aese	v0.16b, v26.16b
1669	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
1670
1671	aese	v2.16b, v27.16b
1672	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
1673	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
1674.inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
1675
1676	ldp	q28, q26, [x8, #32]				//load rk2, rk3
1677	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
1678	aese	v7.16b, v27.16b
1679	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
1680
1681	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
1682	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
1683	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
1684
1685	ldr	q23, [x3, #80]				//load h3l | h3h
1686	ext	v23.16b, v23.16b, v23.16b, #8
1687	ldr	q25, [x3, #112]				//load h4l | h4h
1688	ext	v25.16b, v25.16b, v25.16b, #8
1689	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
1690	aese	v6.16b, v27.16b
1691	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
1692
1693	aese	v4.16b, v27.16b
1694	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
1695	aese	v5.16b, v27.16b
1696	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
1697	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
1698
1699	aese	v3.16b, v27.16b
1700	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
1701	aese	v0.16b, v27.16b
1702	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
1703	aese	v1.16b, v27.16b
1704	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
1705
1706	aese	v7.16b, v28.16b
1707	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
1708	aese	v2.16b, v28.16b
1709	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
1710.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
1711
1712	aese	v4.16b, v28.16b
1713	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
1714	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
1715	ldr	q20, [x3, #32]				//load h1l | h1h
1716	ext	v20.16b, v20.16b, v20.16b, #8
1717	ldr	q22, [x3, #64]				//load h2l | h2h
1718	ext	v22.16b, v22.16b, v22.16b, #8
1719
1720	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
1721	aese	v1.16b, v28.16b
1722	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
1723	aese	v3.16b, v28.16b
1724	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
1725
1726	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
1727	aese	v5.16b, v28.16b
1728	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
1729	aese	v0.16b, v28.16b
1730	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
1731
1732	aese	v6.16b, v28.16b
1733	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
1734	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
1735	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
1736
1737	aese	v7.16b, v26.16b
1738	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
1739	rev64	v15.16b, v15.16b						//GHASH block 8k+7
1740	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
1741
1742	ldp	q27, q28, [x8, #64]				//load rk4, rk5
1743	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
1744.inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
1745
1746	ldr	q21, [x3, #48]				//load h2k | h1k
1747	ldr	q24, [x3, #96]				//load h4k | h3k
1748	aese	v2.16b, v26.16b
1749	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
1750	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
1751
1752	aese	v4.16b, v26.16b
1753	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
1754	aese	v3.16b, v26.16b
1755	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
1756	aese	v1.16b, v26.16b
1757	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
1758
1759	aese	v0.16b, v26.16b
1760	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
1761	aese	v6.16b, v26.16b
1762	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
1763	aese	v5.16b, v26.16b
1764	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
1765
1766	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
1767	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
1768	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
1769
1770	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
1771	aese	v0.16b, v27.16b
1772	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
1773	aese	v7.16b, v27.16b
1774	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
1775
1776	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
1777	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
1778	aese	v3.16b, v27.16b
1779	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
1780
1781	aese	v1.16b, v27.16b
1782	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
1783	aese	v5.16b, v27.16b
1784	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
1785	aese	v6.16b, v27.16b
1786	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
1787
1788	aese	v2.16b, v27.16b
1789	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
1790	aese	v4.16b, v27.16b
1791	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
1792	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
1793
1794	ldp	q26, q27, [x8, #96]				//load rk6, rk7
1795	aese	v0.16b, v28.16b
1796	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
1797	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
1798
1799	aese	v2.16b, v28.16b
1800	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
1801	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
1802	aese	v1.16b, v28.16b
1803	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
1804
1805	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
1806	aese	v6.16b, v28.16b
1807	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
1808	aese	v7.16b, v28.16b
1809	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
1810
1811	aese	v3.16b, v28.16b
1812	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
1813	aese	v5.16b, v28.16b
1814	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
1815	aese	v4.16b, v28.16b
1816	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
1817
1818	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
1819.inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b 			//GHASH block 8k+4, 8k+5 - mid
1820.inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
1821
1822	aese	v3.16b, v26.16b
1823	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
1824.inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
1825	aese	v7.16b, v26.16b
1826	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
1827
1828	aese	v1.16b, v26.16b
1829	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
1830	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
1831	aese	v6.16b, v26.16b
1832	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
1833
1834	aese	v2.16b, v26.16b
1835	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
1836	aese	v5.16b, v26.16b
1837	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
1838	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
1839
1840	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
1841	aese	v0.16b, v26.16b
1842	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
1843	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
1844
1845.inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
1846	aese	v4.16b, v26.16b
1847	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
1848	ldp	q28, q26, [x8, #128]				//load rk8, rk9
1849
1850	ldr	d16, [x10]			//MODULO - load modulo constant
1851.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
1852	aese	v5.16b, v27.16b
1853	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
1854
1855	rev32	v20.16b, v30.16b					//CTR block 8k+16
1856.inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
1857	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+16
1858
1859	aese	v6.16b, v27.16b
1860	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
1861	aese	v3.16b, v27.16b
1862	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
1863	aese	v7.16b, v27.16b
1864	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
1865
1866	aese	v2.16b, v27.16b
1867	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
1868	aese	v1.16b, v27.16b
1869	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
1870	rev32	v22.16b, v30.16b					//CTR block 8k+17
1871
1872	aese	v4.16b, v27.16b
1873	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
1874	ext	v21.16b, v17.16b, v17.16b, #8			 	//MODULO - other top alignment
1875	pmull	v29.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
1876
1877.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
1878	aese	v0.16b, v27.16b
1879	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
1880	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+17
1881
1882	aese	v5.16b, v28.16b
1883	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
1884	aese	v1.16b, v28.16b
1885	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
1886	ldp	q8, q9, [x0], #32			//AES block 8k+8, 8k+9 - load ciphertext
1887
1888	ldp	q10, q11, [x0], #32			//AES block 8k+10, 8k+11 - load ciphertext
1889	aese	v0.16b, v28.16b
1890	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
1891	rev32	v23.16b, v30.16b					//CTR block 8k+18
1892
1893	ldp	q12, q13, [x0], #32			//AES block 8k+12, 8k+13 - load ciphertext
1894	aese	v4.16b, v28.16b
1895	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
1896.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
1897
1898	ldp	q14, q15, [x0], #32			//AES block 8k+14, 8k+15 - load ciphertext
1899	aese	v3.16b, v28.16b
1900	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
1901	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+18
1902
1903	aese	v7.16b, v28.16b
1904	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
1905	aese	v2.16b, v28.16b
1906	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
1907	aese	v6.16b, v28.16b
1908	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
1909
1910	aese	v0.16b, v26.16b						//AES block 8k+8 - round 9
1911	aese	v1.16b, v26.16b						//AES block 8k+9 - round 9
1912	ldr	q27, [x8, #160]					//load rk10
1913
1914	aese	v6.16b, v26.16b						//AES block 8k+14 - round 9
1915	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
1916	aese	v2.16b, v26.16b						//AES block 8k+10 - round 9
1917
1918	aese	v7.16b, v26.16b						//AES block 8k+15 - round 9
1919	aese	v4.16b, v26.16b						//AES block 8k+12 - round 9
1920	ext	v21.16b, v18.16b, v18.16b, #8			 	//MODULO - other mid alignment
1921
1922	rev32	v25.16b, v30.16b					//CTR block 8k+19
1923	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+19
1924
1925	aese	v3.16b, v26.16b						//AES block 8k+11 - round 9
1926	aese	v5.16b, v26.16b						//AES block 8k+13 - round 9
1927.inst	0xce016d21	//eor3 v1.16b, v9.16b, v1.16b, v27.16b				//AES block 8k+9 - result
1928
1929.inst	0xce006d00	//eor3 v0.16b, v8.16b, v0.16b, v27.16b				//AES block 8k+8 - result
1930.inst	0xce076de7	//eor3 v7.16b, v15.16b, v7.16b, v27.16b				//AES block 8k+15 - result
1931.inst	0xce066dc6	//eor3 v6.16b, v14.16b, v6.16b, v27.16b				//AES block 8k+14 - result
1932
1933.inst	0xce026d42	//eor3 v2.16b, v10.16b, v2.16b, v27.16b				//AES block 8k+10 - result
1934	stp	q0, q1, [x2], #32			//AES block 8k+8, 8k+9 - store result
1935	mov	v1.16b, v22.16b					//CTR block 8k+17
1936
1937.inst	0xce046d84	//eor3 v4.16b, v12.16b, v4.16b, v27.16b				//AES block 8k+12 - result
1938.inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low
1939	mov	v0.16b, v20.16b					//CTR block 8k+16
1940
1941.inst	0xce036d63	//eor3 v3.16b, v11.16b, v3.16b, v27.16b				//AES block 8k+11 - result
1942	cmp	x0, x5				//.LOOP CONTROL
1943	stp	q2, q3, [x2], #32			//AES block 8k+10, 8k+11 - store result
1944
1945.inst	0xce056da5	//eor3 v5.16b, v13.16b, v5.16b, v27.16b				//AES block 8k+13 - result
1946	mov	v2.16b, v23.16b					//CTR block 8k+18
1947
1948	stp	q4, q5, [x2], #32			//AES block 8k+12, 8k+13 - store result
1949	rev32	v4.16b, v30.16b				//CTR block 8k+20
1950	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+20
1951
1952	stp	q6, q7, [x2], #32			//AES block 8k+14, 8k+15 - store result
1953	mov	v3.16b, v25.16b					//CTR block 8k+19
1954	b.lt	.L128_dec_main_loop
1955
// .L128_dec_prepretail
// Section between the main loop's closing b.lt and .L128_dec_tail. It reads
// no new input from x0 and stores nothing to x2; it interleaves two
// independent streams of work:
//   * GHASH: the eight blocks held in v8-v15 are byte-reversed (rev64), the
//     running hash in v19 is XORed into v8 (PRE 0 / PRE 1), and the blocks
//     are multiplied (pmull/pmull2) by the hash-key values loaded from x3
//     into v20-v25. High / mid / low partial products accumulate in
//     v17 / v18 / v19 and are then reduced using the constant loaded from
//     [x10] (the MODULO steps); the reduced value ends up in v19.
//   * AES: counter blocks v0-v7 go through rounds 0-9 with round keys
//     rk0-rk9 loaded from x8. v5, v6 and v7 are generated here from the
//     counter in v30; v0-v4 are expected to be set up before entry.
//     rk10 is only loaded (into v27); it is applied by the eor3
//     instructions in the tail.
// The .inst words are pre-encoded eor3 (three-input XOR) instructions; the
// intended mnemonic and operands are given in each trailing comment.
// Execution falls through into .L128_dec_tail.
1956.L128_dec_prepretail:	//PREPRETAIL
1957	rev64	v11.16b, v11.16b						//GHASH block 8k+3
1958	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
1959	rev64	v8.16b, v8.16b						//GHASH block 8k
1960
1961	rev64	v10.16b, v10.16b						//GHASH block 8k+2
1962	rev32	v5.16b, v30.16b				//CTR block 8k+13
1963	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
1964
1965	ldr	q23, [x3, #176]				//load h7l | h7h
1966	ext	v23.16b, v23.16b, v23.16b, #8
1967	ldr	q25, [x3, #208]				//load h8l | h8h
1968	ext	v25.16b, v25.16b, v25.16b, #8
1969	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1
1970	rev64	v9.16b, v9.16b						//GHASH block 8k+1
1971
1972	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
1973	ldr	q20, [x3, #128]				//load h5l | h5h
1974	ext	v20.16b, v20.16b, v20.16b, #8
1975	ldr	q22, [x3, #160]				//load h6l | h6h
1976	ext	v22.16b, v22.16b, v22.16b, #8
1977	rev64	v13.16b, v13.16b						//GHASH block 8k+5
1978
1979	rev64	v12.16b, v12.16b						//GHASH block 8k+4
1980
1981	rev64	v14.16b, v14.16b						//GHASH block 8k+6
1982
1983	ldr	q21, [x3, #144]				//load h6k | h5k
1984	ldr	q24, [x3, #192]				//load h8k | h7k
1985	rev32	v6.16b, v30.16b				//CTR block 8k+14
1986	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
1987
1988	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
1989	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
1990	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
1991
1992	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
1993	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
1994	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
1995
1996	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
1997	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
1998	aese	v0.16b, v26.16b
1999	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
2000
2001	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
2002	aese	v4.16b, v26.16b
2003	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
2004	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
2005
2006	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
2007	rev32	v7.16b, v30.16b				//CTR block 8k+15
2008	aese	v3.16b, v26.16b
2009	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
2010
2011.inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
2012	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
2013	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
2014
2015	aese	v2.16b, v26.16b
2016	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
2017	aese	v1.16b, v26.16b
2018	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
2019	aese	v5.16b, v26.16b
2020	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
2021
2022	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k - mid
2023	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
2024	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
2025
2026	aese	v2.16b, v27.16b
2027	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
2028	aese	v7.16b, v26.16b
2029	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
2030	aese	v6.16b, v26.16b
2031	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
2032
2033	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
2034	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
2035	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
2036
2037	aese	v6.16b, v27.16b
2038	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
2039	aese	v4.16b, v27.16b
2040	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
2041	aese	v5.16b, v27.16b
2042	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
2043
2044	ldp	q28, q26, [x8, #32]				//load rk2, rk3
2045.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
2046	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
2047
2048	ldr	q23, [x3, #80]				//load h3l | h3h
2049	ext	v23.16b, v23.16b, v23.16b, #8
2050	ldr	q25, [x3, #112]				//load h4l | h4h
2051	ext	v25.16b, v25.16b, v25.16b, #8
2052	aese	v1.16b, v27.16b
2053	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
2054	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
2055
2056	aese	v3.16b, v27.16b
2057	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
2058	aese	v7.16b, v27.16b
2059	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
2060	aese	v0.16b, v27.16b
2061	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
2062
2063	ldr	q20, [x3, #32]				//load h1l | h1h
2064	ext	v20.16b, v20.16b, v20.16b, #8
2065	ldr	q22, [x3, #64]				//load h2l | h2h
2066	ext	v22.16b, v22.16b, v22.16b, #8
2067.inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
2068
2069	aese	v0.16b, v28.16b
2070	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
2071	aese	v6.16b, v28.16b
2072	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
2073	aese	v2.16b, v28.16b
2074	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
2075
2076	aese	v4.16b, v28.16b
2077	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
2078	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
2079	aese	v7.16b, v28.16b
2080	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
2081
2082	aese	v1.16b, v28.16b
2083	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
2084	aese	v5.16b, v28.16b
2085	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
2086	aese	v3.16b, v28.16b
2087	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
2088
2089	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
2090	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
2091	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
2092
2093	ldp	q27, q28, [x8, #64]				//load rk4, rk5
2094	rev64	v15.16b, v15.16b						//GHASH block 8k+7
2095	aese	v6.16b, v26.16b
2096	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
2097
2098	ldr	q21, [x3, #48]				//load h2k | h1k
2099	ldr	q24, [x3, #96]				//load h4k | h3k
2100	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
2101	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
2102
2103	aese	v2.16b, v26.16b
2104	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
2105	aese	v0.16b, v26.16b
2106	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
2107	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
2108
2109	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
2110	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
2111	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
2112
2113	aese	v4.16b, v26.16b
2114	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
2115	aese	v3.16b, v26.16b
2116	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
2117	aese	v7.16b, v26.16b
2118	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
2119
2120	aese	v1.16b, v26.16b
2121	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
2122	aese	v5.16b, v26.16b
2123	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
2124	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
2125
2126.inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
2127	aese	v0.16b, v27.16b
2128	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
2129	aese	v2.16b, v27.16b
2130	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
2131
2132	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
2133	aese	v5.16b, v27.16b
2134	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
2135	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
2136
2137	aese	v1.16b, v27.16b
2138	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
2139	aese	v6.16b, v27.16b
2140	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
2141	aese	v4.16b, v27.16b
2142	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
2143
2144	aese	v7.16b, v27.16b
2145	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
2146	aese	v3.16b, v27.16b
2147	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
2148	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
2149
2150	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
2151	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
2152	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
2153
2154	ldp	q26, q27, [x8, #96]				//load rk6, rk7
2155.inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
2156	aese	v6.16b, v28.16b
2157	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
2158
2159	ldr	d16, [x10]			//MODULO - load modulo constant
2160	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
2161.inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
2162
2163	aese	v0.16b, v28.16b
2164	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
2165	aese	v2.16b, v28.16b
2166	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
2167	aese	v4.16b, v28.16b
2168	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
2169
2170	aese	v3.16b, v28.16b
2171	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
2172	aese	v1.16b, v28.16b
2173	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
2174	aese	v5.16b, v28.16b
2175	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
2176
2177	aese	v7.16b, v28.16b
2178	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
2179.inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
2180.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
2181
2182	aese	v4.16b, v26.16b
2183	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
2184	aese	v1.16b, v26.16b
2185	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
2186	aese	v2.16b, v26.16b
2187	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
2188
2189.inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
2190	aese	v5.16b, v26.16b
2191	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
2192	aese	v0.16b, v26.16b
2193	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
2194
2195	aese	v3.16b, v26.16b
2196	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
2197	aese	v6.16b, v26.16b
2198	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
2199	aese	v7.16b, v26.16b
2200	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
2201
2202	aese	v4.16b, v27.16b
2203	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
2204.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
2205	ldp	q28, q26, [x8, #128]				//load rk8, rk9
2206
2207	pmull	v29.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
2208	aese	v3.16b, v27.16b
2209	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
2210	ext	v21.16b, v17.16b, v17.16b, #8			 	//MODULO - other top alignment
2211
2212	aese	v5.16b, v27.16b
2213	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
2214	aese	v6.16b, v27.16b
2215	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
2216	aese	v0.16b, v27.16b
2217	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
2218
2219	aese	v7.16b, v27.16b
2220	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
2221	aese	v1.16b, v27.16b
2222	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
2223	aese	v2.16b, v27.16b
2224	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
2225
2226.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
2227	ldr	q27, [x8, #160]					//load rk10
2228
2229	aese	v3.16b, v28.16b
2230	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
2231	aese	v0.16b, v28.16b
2232	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
2233
2234	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
2235	aese	v6.16b, v28.16b
2236	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
2237	ext	v21.16b, v18.16b, v18.16b, #8			 	//MODULO - other mid alignment
2238
2239	aese	v2.16b, v28.16b
2240	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
2241	aese	v1.16b, v28.16b
2242	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
2243	aese	v7.16b, v28.16b
2244	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
2245
// Round 9 is a bare aese with rk9 (v26) and no aesmc. The rk10 value loaded
// into v27 above is not applied in this section.
2246	aese	v6.16b, v26.16b						//AES block 8k+14 - round 9
2247	aese	v5.16b, v28.16b
2248	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
2249	aese	v4.16b, v28.16b
2250	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
2251
2252.inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low
2253	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
2254	aese	v2.16b, v26.16b						//AES block 8k+10 - round 9
2255
2256	aese	v3.16b, v26.16b						//AES block 8k+11 - round 9
2257	aese	v5.16b, v26.16b						//AES block 8k+13 - round 9
2258	aese	v0.16b, v26.16b						//AES block 8k+8 - round 9
2259
2260	aese	v4.16b, v26.16b						//AES block 8k+12 - round 9
2261	aese	v1.16b, v26.16b						//AES block 8k+9 - round 9
2262	aese	v7.16b, v26.16b						//AES block 8k+15 - round 9
2263
// .L128_dec_tail
// Dispatch for the remaining input, using the keystream blocks in v0-v7.
//   * v27 (rk10) is copied to v29 because v27 is reused as a scratch
//     register by the sections that follow.
//   * x5 = x4 - x0 is the number of bytes still to process. The flags set by
//     each cmp survive to the matching b.gt because the loads, ext, mov,
//     movi, sub (vector) and eor3 in between do not write NZCV.
//   * The first remaining ciphertext block is always loaded into v9 and
//     combined with keystream v0 and v29 to form v12; v16 receives the
//     lane-swapped running hash from v19 ("partial tag").
//   * x5 is compared against 112, 96, 80, 64, 48, 32 and 16. When a
//     threshold is exceeded, control enters the matching
//     .L128_dec_blocks_more_than_N section. Otherwise the keystream
//     registers are moved up by one (so the later sections can keep using
//     fixed register numbers, ending with v7 for the final block) and v30 is
//     stepped back by v31, the value added per CTR block elsewhere.
//   * On the first fall-through v17/v18/v19 are zeroed, because the skipped
//     more_than_7 section is the one that would otherwise initialise them.
//   * v24 and v21 are reloaded before the jumps to more_than_2 and
//     less_than_1 because the sections that normally load them are skipped.
2264.L128_dec_tail:	//TAIL
2265
2266	mov	v29.16b, v27.16b					//keep rk10 in v29; v27 is used as scratch below
2267	sub	x5, x4, x0 	//main_end_input_ptr is number of bytes left to process
2268
2269	cmp	x5, #112
2270
2271	ldp	q24, q25, [x3, #192]			//load h8k | h7k (v24) and h8l | h8h (v25)
2272	ext	v25.16b, v25.16b, v25.16b, #8
2273	ldr	q9, [x0], #16				//AES block 8k+8 - load ciphertext
2274
2275	ldp	q20, q21, [x3, #128]			//load h5l | h5h (v20) and h6k | h5k (v21)
2276	ext	v20.16b, v20.16b, v20.16b, #8
2277	ext	v16.16b, v19.16b, v19.16b, #8				//prepare final partial tag
2278
2279	ldp	q22, q23, [x3, #160]			//load h6l | h6h (v22) and h7l | h7h (v23)
2280	ext	v22.16b, v22.16b, v22.16b, #8
2281	ext	v23.16b, v23.16b, v23.16b, #8
2282
2283.inst	0xce00752c	//eor3 v12.16b, v9.16b, v0.16b, v29.16b				//AES block 8k+8 - result
2284	b.gt	.L128_dec_blocks_more_than_7
2285
2286	cmp	x5, #96
2287	mov	v7.16b, v6.16b
2288	movi	v19.8b, #0
2289
2290	movi	v17.8b, #0
2291	mov	v6.16b, v5.16b
2292	mov	v5.16b, v4.16b
2293
2294	mov	v4.16b, v3.16b
2295	mov	v3.16b, v2.16b
2296	mov	v2.16b, v1.16b
2297
2298	movi	v18.8b, #0
2299	sub	v30.4s, v30.4s, v31.4s
2300	b.gt	.L128_dec_blocks_more_than_6
2301
2302	cmp	x5, #80
2303	sub	v30.4s, v30.4s, v31.4s
2304
2305	mov	v7.16b, v6.16b
2306	mov	v6.16b, v5.16b
2307	mov	v5.16b, v4.16b
2308
2309	mov	v4.16b, v3.16b
2310	mov	v3.16b, v1.16b
2311	b.gt	.L128_dec_blocks_more_than_5
2312
2313	cmp	x5, #64
2314
2315	mov	v7.16b, v6.16b
2316	mov	v6.16b, v5.16b
2317	mov	v5.16b, v4.16b
2318
2319	mov	v4.16b, v1.16b
2320	sub	v30.4s, v30.4s, v31.4s
2321	b.gt	.L128_dec_blocks_more_than_4
2322
2323	sub	v30.4s, v30.4s, v31.4s
2324	mov	v7.16b, v6.16b
2325	mov	v6.16b, v5.16b
2326
2327	mov	v5.16b, v1.16b
2328	cmp	x5, #48
2329	b.gt	.L128_dec_blocks_more_than_3
2330
2331	sub	v30.4s, v30.4s, v31.4s
2332	mov	v7.16b, v6.16b
2333	cmp	x5, #32
2334
2335	ldr	q24, [x3, #96]				//load h4k | h3k
2336	mov	v6.16b, v1.16b
2337	b.gt	.L128_dec_blocks_more_than_2
2338
2339	cmp	x5, #16
2340
2341	mov	v7.16b, v1.16b
2342	sub	v30.4s, v30.4s, v31.4s
2343	b.gt	.L128_dec_blocks_more_than_1
2344
2345	sub	v30.4s, v30.4s, v31.4s
2346	ldr	q21, [x3, #48]				//load h2k | h1k
2347	b	.L128_dec_blocks_less_than_1
// .L128_dec_blocks_more_than_7
// Entered on x5 > 112. On entry v9 holds the ciphertext block just loaded and
// v12 its decrypted result. This section:
//   * starts the GHASH accumulators: v19 (low), v17 (high) and v18 (mid) are
//     written directly by pmull/pmull2 rather than XOR-accumulated, using
//     v25 and the upper half of v24 as multipliers;
//   * XORs the partial tag v16 into the block and then zeroes v16 so later
//     sections do not feed it in again;
//   * stores v12 to [x2] (post-increment), loads the next ciphertext block
//     from [x0] (post-increment) and forms its result from keystream v1.
// v27 is scratch here. Falls through into .L128_dec_blocks_more_than_6.
2348.L128_dec_blocks_more_than_7:	//blocks	left >  7
2349	rev64	v8.16b, v9.16b						//GHASH final-7 block
2350
2351	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
2352
2353	ins	v18.d[0], v24.d[1]					//GHASH final-7 block - mid
2354
2355	pmull	v19.1q, v8.1d, v25.1d				//GHASH final-7 block - low
2356	ins	v27.d[0], v8.d[1]					//GHASH final-7 block - mid
2357
2358	movi	v16.8b, #0						//suppress further partial tag feed in
2359	ldr	q9, [x0], #16				//AES final-6 block - load ciphertext
2360
2361	eor	v27.8b, v27.8b, v8.8b				//GHASH final-7 block - mid
2362
2363	pmull2	v17.1q, v8.2d, v25.2d				//GHASH final-7 block - high
2364	st1	{ v12.16b}, [x2], #16			 	//AES final-7 block  - store result
2365.inst	0xce01752c	//eor3 v12.16b, v9.16b, v1.16b, v29.16b				//AES final-6 block - result
2366
2367	pmull	v18.1q, v27.1d, v18.1d			 	//GHASH final-7 block - mid
// .L128_dec_blocks_more_than_6
// Reached by fall-through or on x5 > 96. Hashes the ciphertext block in v9:
// the low/high products with v23 and the mid product with the lower half of
// v24 are XOR-accumulated into v19/v17/v18. The partial tag v16 is fed in and
// then cleared. v12 (the previous result) is stored to [x2], the next
// ciphertext block is loaded from [x0], and its result is formed from
// keystream v2. v26, v27 and v28 are scratch.
// Falls through into .L128_dec_blocks_more_than_5.
2368.L128_dec_blocks_more_than_6:	//blocks	left >  6
2369
2370	rev64	v8.16b, v9.16b						//GHASH final-6 block
2371
2372	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
2373
2374	ins	v27.d[0], v8.d[1]					//GHASH final-6 block - mid
2375
2376	eor	v27.8b, v27.8b, v8.8b				//GHASH final-6 block - mid
2377
2378	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-6 block - low
2379	ldr	q9, [x0], #16				//AES final-5 block - load ciphertext
2380	movi	v16.8b, #0						//suppress further partial tag feed in
2381
2382	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-6 block - mid
2383	st1	{ v12.16b}, [x2], #16			 	//AES final-6 block - store result
2384	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-6 block - high
2385
2386	eor	v19.16b, v19.16b, v26.16b					//GHASH final-6 block - low
2387	eor	v17.16b, v17.16b, v28.16b					//GHASH final-6 block - high
2388
2389	eor	v18.16b, v18.16b, v27.16b				//GHASH final-6 block - mid
2390.inst	0xce02752c	//eor3 v12.16b, v9.16b, v2.16b, v29.16b				//AES final-5 block - result
// .L128_dec_blocks_more_than_5
// Reached by fall-through or on x5 > 80. Hashes the ciphertext block in v9
// with v22 (low/high) and the upper half of v21 (mid). The folded mid value
// is copied into v27.d[1] so that pmull2 can pair it with v21's upper half.
// Products are XOR-accumulated into v19/v17/v18. v12 is stored to [x2], the
// next ciphertext block is loaded from [x0], and its result is formed from
// keystream v3. Falls through into .L128_dec_blocks_more_than_4.
2391.L128_dec_blocks_more_than_5:	//blocks	left >  5
2392
2393	rev64	v8.16b, v9.16b						//GHASH final-5 block
2394
2395	ldr	q9, [x0], #16				//AES final-4 block - load ciphertext
2396	st1	{ v12.16b}, [x2], #16			 	//AES final-5 block - store result
2397
2398	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
2399
2400	ins	v27.d[0], v8.d[1]					//GHASH final-5 block - mid
2401
2402.inst	0xce03752c	//eor3 v12.16b, v9.16b, v3.16b, v29.16b				//AES final-4 block - result
2403
2404	eor	v27.8b, v27.8b, v8.8b				//GHASH final-5 block - mid
2405
2406	ins	v27.d[1], v27.d[0]					//GHASH final-5 block - mid
2407	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-5 block - low
2408	movi	v16.8b, #0						//suppress further partial tag feed in
2409
2410	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-5 block - mid
2411	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-5 block - high
2412	eor	v19.16b, v19.16b, v26.16b					//GHASH final-5 block - low
2413
2414	eor	v18.16b, v18.16b, v27.16b				//GHASH final-5 block - mid
2415	eor	v17.16b, v17.16b, v28.16b					//GHASH final-5 block - high
// .L128_dec_blocks_more_than_4
// Reached by fall-through or on x5 > 64. Hashes the ciphertext block in v9
// with v20 (low/high) and the lower half of v21 (mid), XOR-accumulating into
// v19/v17/v18. v12 is stored to [x2], the next ciphertext block is loaded
// from [x0], and its result is formed from keystream v4.
// Falls through into .L128_dec_blocks_more_than_3.
2416.L128_dec_blocks_more_than_4:	//blocks	left >  4
2417
2418	rev64	v8.16b, v9.16b						//GHASH final-4 block
2419
2420	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
2421	ldr	q9, [x0], #16				//AES final-3 block - load ciphertext
2422
2423	ins	v27.d[0], v8.d[1]					//GHASH final-4 block - mid
2424	movi	v16.8b, #0						//suppress further partial tag feed in
2425	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final-4 block - high
2426
2427	pmull	v26.1q, v8.1d, v20.1d				//GHASH final-4 block - low
2428
2429	eor	v17.16b, v17.16b, v28.16b					//GHASH final-4 block - high
2430
2431	st1	{ v12.16b}, [x2], #16			 	//AES final-4 block - store result
2432	eor	v27.8b, v27.8b, v8.8b				//GHASH final-4 block - mid
2433
2434.inst	0xce04752c	//eor3 v12.16b, v9.16b, v4.16b, v29.16b				//AES final-3 block - result
2435	eor	v19.16b, v19.16b, v26.16b					//GHASH final-4 block - low
2436
2437	pmull	v27.1q, v27.1d, v21.1d				//GHASH final-4 block - mid
2438
2439	eor	v18.16b, v18.16b, v27.16b				//GHASH final-4 block - mid
// .L128_dec_blocks_more_than_3
// Reached by fall-through or on x5 > 48. Reloads v25 from [x3, #112] and v24
// from [x3, #96], then hashes the ciphertext block in v9 with v25 (low/high)
// and the upper half of v24 (mid, via pmull2 on the duplicated fold in v27),
// XOR-accumulating into v19/v17/v18. v12 is stored to [x2], the next
// ciphertext block is loaded from [x0], and its result is formed from
// keystream v5. Falls through into .L128_dec_blocks_more_than_2.
2440.L128_dec_blocks_more_than_3:	//blocks	left >  3
2441
2442	st1	{ v12.16b}, [x2], #16			 	//AES final-3 block - store result
2443	rev64	v8.16b, v9.16b						//GHASH final-3 block
2444
2445	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
2446
2447	ins	v27.d[0], v8.d[1]					//GHASH final-3 block - mid
2448
2449	ldr	q25, [x3, #112]				//load h4l | h4h
2450	ext	v25.16b, v25.16b, v25.16b, #8
2451	ldr	q24, [x3, #96]				//load h4k | h3k
2452
2453	eor	v27.8b, v27.8b, v8.8b				//GHASH final-3 block - mid
2454
2455	ldr	q9, [x0], #16				//AES final-2 block - load ciphertext
2456
2457	ins	v27.d[1], v27.d[0]					//GHASH final-3 block - mid
2458	pmull	v26.1q, v8.1d, v25.1d				//GHASH final-3 block - low
2459	pmull2	v28.1q, v8.2d, v25.2d				//GHASH final-3 block - high
2460
2461	movi	v16.8b, #0						//suppress further partial tag feed in
2462.inst	0xce05752c	//eor3 v12.16b, v9.16b, v5.16b, v29.16b				//AES final-2 block - result
2463	eor	v19.16b, v19.16b, v26.16b					//GHASH final-3 block - low
2464
2465	pmull2	v27.1q, v27.2d, v24.2d				//GHASH final-3 block - mid
2466
2467	eor	v17.16b, v17.16b, v28.16b					//GHASH final-3 block - high
2468	eor	v18.16b, v18.16b, v27.16b				//GHASH final-3 block - mid
// .L128_dec_blocks_more_than_2
// Reached by fall-through or on x5 > 32. Reloads v23 from [x3, #80], then
// hashes the ciphertext block in v9 with v23 (low/high) and the lower half
// of v24 (mid), XOR-accumulating into v19/v17/v18. v24 is expected to hold
// the value from [x3, #96], loaded either by the preceding section or by the
// tail dispatch before it branches here. v12 is stored to [x2], the next
// ciphertext block is loaded from [x0], and its result is formed from
// keystream v6. Falls through into .L128_dec_blocks_more_than_1.
2469.L128_dec_blocks_more_than_2:	//blocks	left >  2
2470
2471	rev64	v8.16b, v9.16b						//GHASH final-2 block
2472
2473	st1	{ v12.16b}, [x2], #16			 	//AES final-2 block - store result
2474
2475	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
2476	ldr	q23, [x3, #80]				//load h3l | h3h
2477	ext	v23.16b, v23.16b, v23.16b, #8
2478	movi	v16.8b, #0						//suppress further partial tag feed in
2479
2480	ins	v27.d[0], v8.d[1]					//GHASH final-2 block - mid
2481
2482	eor	v27.8b, v27.8b, v8.8b				//GHASH final-2 block - mid
2483
2484	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-2 block - low
2485
2486	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-2 block - high
2487	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-2 block - mid
2488	ldr	q9, [x0], #16				//AES final-1 block - load ciphertext
2489
2490	eor	v18.16b, v18.16b, v27.16b				//GHASH final-2 block - mid
2491
2492	eor	v19.16b, v19.16b, v26.16b					//GHASH final-2 block - low
2493
2494.inst	0xce06752c	//eor3 v12.16b, v9.16b, v6.16b, v29.16b				//AES final-1 block - result
2495	eor	v17.16b, v17.16b, v28.16b					//GHASH final-2 block - high
// .L128_dec_blocks_more_than_1
// Reached by fall-through or on x5 > 16. Reloads v22 from [x3, #64] and v21
// from [x3, #48], then hashes the ciphertext block in v9 with v22 (low/high)
// and the upper half of v21 (mid, via pmull2 on the duplicated fold in v27),
// XOR-accumulating into v19/v17/v18. v12 is stored to [x2] and the final
// ciphertext block is loaded from [x0]. Its result is formed in v12 from
// keystream v7 but is not stored in this section.
// Falls through into .L128_dec_blocks_less_than_1.
2496.L128_dec_blocks_more_than_1:	//blocks	left >  1
2497
2498	st1	{ v12.16b}, [x2], #16			 	//AES final-1 block - store result
2499	rev64	v8.16b, v9.16b						//GHASH final-1 block
2500
2501	ldr	q22, [x3, #64]				//load h2l | h2h
2502	ext	v22.16b, v22.16b, v22.16b, #8
2503
2504	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
2505
2506	movi	v16.8b, #0						//suppress further partial tag feed in
2507
2508	ins	v27.d[0], v8.d[1]					//GHASH final-1 block - mid
2509
2510	ldr	q9, [x0], #16				//AES final block - load ciphertext
2511	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-1 block - high
2512
2513	eor	v27.8b, v27.8b, v8.8b				//GHASH final-1 block - mid
2514	eor	v17.16b, v17.16b, v28.16b					//GHASH final-1 block - high
2515	ldr	q21, [x3, #48]				//load h2k | h1k
2516
2517	ins	v27.d[1], v27.d[0]					//GHASH final-1 block - mid
2518.inst	0xce07752c	//eor3 v12.16b, v9.16b, v7.16b, v29.16b				//AES final block - result
2519
2520	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-1 block - low
2521
2522	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-1 block - mid
2523
2524	eor	v19.16b, v19.16b, v26.16b					//GHASH final-1 block - low
2525
2526	eor	v18.16b, v18.16b, v27.16b				//GHASH final-1 block - mid
2527.L128_dec_blocks_less_than_1:	//blocks	left <= 1
2528
2529	and	x1, x1, #127				//bit_length %= 128
2530
2531	sub	x1, x1, #128				//bit_length -= 128
2532
2533	neg	x1, x1				//bit_length = 128 - #bits in input (in range [1,128])
2534
2535	mvn	x6, xzr						//temp0_x = 0xffffffffffffffff
2536	and	x1, x1, #127				//bit_length %= 128
2537
2538	lsr	x6, x6, x1				//temp0_x is mask for top 64b of last block
2539	cmp	x1, #64
2540	mvn	x7, xzr						//temp1_x = 0xffffffffffffffff
2541
2542	csel	x13, x7, x6, lt
2543	csel	x14, x6, xzr, lt
2544
2545	mov	v0.d[1], x14
2546	mov	v0.d[0], x13					//ctr0b is mask for last block
2547
2548	ldr	q20, [x3, #32]				//load h1l | h1h
2549	ext	v20.16b, v20.16b, v20.16b, #8
2550	ld1	{ v26.16b}, [x2]					//load existing bytes where the possibly partial last block is to be stored
2551
2552	and	v9.16b, v9.16b, v0.16b					//possibly partial last block has zeroes in highest bits
2553
2554	rev64	v8.16b, v9.16b						//GHASH final block
2555
2556	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
2557
2558	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final block - high
2559	ins	v16.d[0], v8.d[1]					//GHASH final block - mid
2560
2561	eor	v17.16b, v17.16b, v28.16b					//GHASH final block - high
2562	eor	v16.8b, v16.8b, v8.8b				//GHASH final block - mid
2563
2564	bif	v12.16b, v26.16b, v0.16b					//insert existing bytes in top end of result before storing
2565
2566	pmull	v16.1q, v16.1d, v21.1d				//GHASH final block - mid
2567	st1	{ v12.16b}, [x2]				//store all 16B
2568
2569	pmull	v26.1q, v8.1d, v20.1d				//GHASH final block - low
2570
2571	eor	v18.16b, v18.16b, v16.16b				//GHASH final block - mid
2572	ldr	d16, [x10]			//MODULO - load modulo constant
2573
2574	eor	v19.16b, v19.16b, v26.16b					//GHASH final block - low
2575
2576	eor	v14.16b, v17.16b, v19.16b				//MODULO - karatsuba tidy up
2577
2578	pmull	v21.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
2579	ext	v17.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment
2580
2581	eor	v18.16b, v18.16b, v14.16b				//MODULO - karatsuba tidy up
2582
2583.inst	0xce115652	//eor3 v18.16b, v18.16b, v17.16b, v21.16b			//MODULO - fold into mid
2584
2585	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
2586	ext	v18.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment
2587
2588.inst	0xce124673	//eor3 v19.16b, v19.16b, v18.16b, v17.16b			//MODULO - fold into low
2589	ext	v19.16b, v19.16b, v19.16b, #8
2590	rev64	v19.16b, v19.16b
2591	st1	{ v19.16b }, [x3]
2592	rev32	v30.16b, v30.16b
2593
2594	str	q30, [x16]					//store the updated counter
2595
2596	mov	x0, x9
2597
2598	ldp	d10, d11, [sp, #16]
2599	ldp	d12, d13, [sp, #32]
2600	ldp	d14, d15, [sp, #48]
2601	ldp	d8, d9, [sp], #80
2602	ret
2603.L128_dec_ret:
2604	mov	w0, #0x0
2605	ret
2606.size	unroll8_eor3_aes_gcm_dec_128_kernel,.-unroll8_eor3_aes_gcm_dec_128_kernel
2607.globl	unroll8_eor3_aes_gcm_enc_192_kernel
2608.type	unroll8_eor3_aes_gcm_enc_192_kernel,%function
2609.align	4
2610unroll8_eor3_aes_gcm_enc_192_kernel:
2611	AARCH64_VALID_CALL_TARGET
2612	cbz	x1, .L192_enc_ret
2613	stp	d8, d9, [sp, #-80]!
2614	lsr	x9, x1, #3
2615	mov	x16, x4
2616	mov	x8, x5
2617	stp	d10, d11, [sp, #16]
2618	stp	d12, d13, [sp, #32]
2619	stp	d14, d15, [sp, #48]
2620	mov	x5, #0xc200000000000000
2621	stp	x5, xzr, [sp, #64]
2622	add	x10, sp, #64
2623
2624	mov	x5, x9
2625	ld1	{ v0.16b}, [x16]					//CTR block 0
2626
2627	mov	x15, #0x100000000				//set up counter increment
2628	movi	v31.16b, #0x0
2629	mov	v31.d[1], x15
2630
2631	rev32	v30.16b, v0.16b				//set up reversed counter
2632
2633	add	v30.4s, v30.4s, v31.4s		//CTR block 0
2634
2635	rev32	v1.16b, v30.16b				//CTR block 1
2636	add	v30.4s, v30.4s, v31.4s		//CTR block 1
2637
2638	rev32	v2.16b, v30.16b				//CTR block 2
2639	add	v30.4s, v30.4s, v31.4s		//CTR block 2
2640
2641	rev32	v3.16b, v30.16b				//CTR block 3
2642	add	v30.4s, v30.4s, v31.4s		//CTR block 3
2643
2644	rev32	v4.16b, v30.16b				//CTR block 4
2645	add	v30.4s, v30.4s, v31.4s		//CTR block 4
2646	sub	x5, x5, #1		//byte_len - 1
2647
2648	and	x5, x5, #0xffffffffffffff80	//number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
2649
2650	rev32	v5.16b, v30.16b				//CTR block 5
2651	add	v30.4s, v30.4s, v31.4s		//CTR block 5
2652	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
2653
2654	add	x5, x5, x0
2655
2656	rev32	v6.16b, v30.16b				//CTR block 6
2657	add	v30.4s, v30.4s, v31.4s		//CTR block 6
2658
2659	rev32	v7.16b, v30.16b				//CTR block 7
2660
2661	aese	v5.16b, v26.16b
2662	aesmc	v5.16b, v5.16b			//AES block 5 - round 0
2663	aese	v4.16b, v26.16b
2664	aesmc	v4.16b, v4.16b			//AES block 4 - round 0
2665	aese	v3.16b, v26.16b
2666	aesmc	v3.16b, v3.16b			//AES block 3 - round 0
2667
2668	aese	v0.16b, v26.16b
2669	aesmc	v0.16b, v0.16b			//AES block 0 - round 0
2670	aese	v1.16b, v26.16b
2671	aesmc	v1.16b, v1.16b			//AES block 1 - round 0
2672	aese	v7.16b, v26.16b
2673	aesmc	v7.16b, v7.16b			//AES block 7 - round 0
2674
2675	aese	v6.16b, v26.16b
2676	aesmc	v6.16b, v6.16b			//AES block 6 - round 0
2677	aese	v2.16b, v26.16b
2678	aesmc	v2.16b, v2.16b			//AES block 2 - round 0
2679	ldp	q28, q26, [x8, #32]				//load rk2, rk3
2680
2681	aese	v5.16b, v27.16b
2682	aesmc	v5.16b, v5.16b			//AES block 5 - round 1
2683	aese	v7.16b, v27.16b
2684	aesmc	v7.16b, v7.16b			//AES block 7 - round 1
2685
2686	aese	v2.16b, v27.16b
2687	aesmc	v2.16b, v2.16b			//AES block 2 - round 1
2688	aese	v3.16b, v27.16b
2689	aesmc	v3.16b, v3.16b			//AES block 3 - round 1
2690	aese	v6.16b, v27.16b
2691	aesmc	v6.16b, v6.16b			//AES block 6 - round 1
2692
2693	aese	v5.16b, v28.16b
2694	aesmc	v5.16b, v5.16b			//AES block 5 - round 2
2695	aese	v4.16b, v27.16b
2696	aesmc	v4.16b, v4.16b			//AES block 4 - round 1
2697	aese	v0.16b, v27.16b
2698	aesmc	v0.16b, v0.16b			//AES block 0 - round 1
2699
2700	aese	v1.16b, v27.16b
2701	aesmc	v1.16b, v1.16b			//AES block 1 - round 1
2702	aese	v7.16b, v28.16b
2703	aesmc	v7.16b, v7.16b			//AES block 7 - round 2
2704	aese	v3.16b, v28.16b
2705	aesmc	v3.16b, v3.16b			//AES block 3 - round 2
2706
2707	aese	v2.16b, v28.16b
2708	aesmc	v2.16b, v2.16b			//AES block 2 - round 2
2709	aese	v0.16b, v28.16b
2710	aesmc	v0.16b, v0.16b			//AES block 0 - round 2
2711
2712	aese	v1.16b, v28.16b
2713	aesmc	v1.16b, v1.16b			//AES block 1 - round 2
2714	aese	v4.16b, v28.16b
2715	aesmc	v4.16b, v4.16b			//AES block 4 - round 2
2716	aese	v6.16b, v28.16b
2717	aesmc	v6.16b, v6.16b			//AES block 6 - round 2
2718
2719	ldp	q27, q28, [x8, #64]				//load rk4, rk5
2720	aese	v4.16b, v26.16b
2721	aesmc	v4.16b, v4.16b			//AES block 4 - round 3
2722
2723	aese	v7.16b, v26.16b
2724	aesmc	v7.16b, v7.16b			//AES block 7 - round 3
2725	aese	v3.16b, v26.16b
2726	aesmc	v3.16b, v3.16b			//AES block 3 - round 3
2727	aese	v2.16b, v26.16b
2728	aesmc	v2.16b, v2.16b			//AES block 2 - round 3
2729
2730	aese	v1.16b, v26.16b
2731	aesmc	v1.16b, v1.16b			//AES block 1 - round 3
2732
2733	aese	v0.16b, v26.16b
2734	aesmc	v0.16b, v0.16b			//AES block 0 - round 3
2735
2736	aese	v6.16b, v26.16b
2737	aesmc	v6.16b, v6.16b			//AES block 6 - round 3
2738
2739	aese	v0.16b, v27.16b
2740	aesmc	v0.16b, v0.16b			//AES block 0 - round 4
2741	aese	v1.16b, v27.16b
2742	aesmc	v1.16b, v1.16b			//AES block 1 - round 4
2743	aese	v5.16b, v26.16b
2744	aesmc	v5.16b, v5.16b			//AES block 5 - round 3
2745
2746	aese	v3.16b, v27.16b
2747	aesmc	v3.16b, v3.16b			//AES block 3 - round 4
2748	aese	v2.16b, v27.16b
2749	aesmc	v2.16b, v2.16b			//AES block 2 - round 4
2750	aese	v4.16b, v27.16b
2751	aesmc	v4.16b, v4.16b			//AES block 4 - round 4
2752
2753	aese	v6.16b, v27.16b
2754	aesmc	v6.16b, v6.16b			//AES block 6 - round 4
2755	aese	v7.16b, v27.16b
2756	aesmc	v7.16b, v7.16b			//AES block 7 - round 4
2757	aese	v5.16b, v27.16b
2758	aesmc	v5.16b, v5.16b			//AES block 5 - round 4
2759
2760	aese	v1.16b, v28.16b
2761	aesmc	v1.16b, v1.16b			//AES block 1 - round 5
2762	ldp	q26, q27, [x8, #96]				//load rk6, rk7
2763	aese	v2.16b, v28.16b
2764	aesmc	v2.16b, v2.16b			//AES block 2 - round 5
2765
2766	aese	v4.16b, v28.16b
2767	aesmc	v4.16b, v4.16b			//AES block 4 - round 5
2768	aese	v7.16b, v28.16b
2769	aesmc	v7.16b, v7.16b			//AES block 7 - round 5
2770	aese	v0.16b, v28.16b
2771	aesmc	v0.16b, v0.16b			//AES block 0 - round 5
2772
2773	aese	v5.16b, v28.16b
2774	aesmc	v5.16b, v5.16b			//AES block 5 - round 5
2775	aese	v6.16b, v28.16b
2776	aesmc	v6.16b, v6.16b			//AES block 6 - round 5
2777	aese	v3.16b, v28.16b
2778	aesmc	v3.16b, v3.16b			//AES block 3 - round 5
2779
2780	add	v30.4s, v30.4s, v31.4s		//CTR block 7
2781
2782	aese	v5.16b, v26.16b
2783	aesmc	v5.16b, v5.16b			//AES block 5 - round 6
2784	aese	v4.16b, v26.16b
2785	aesmc	v4.16b, v4.16b			//AES block 4 - round 6
2786	aese	v3.16b, v26.16b
2787	aesmc	v3.16b, v3.16b			//AES block 3 - round 6
2788
2789	aese	v2.16b, v26.16b
2790	aesmc	v2.16b, v2.16b			//AES block 2 - round 6
2791	aese	v6.16b, v26.16b
2792	aesmc	v6.16b, v6.16b			//AES block 6 - round 6
2793	aese	v1.16b, v26.16b
2794	aesmc	v1.16b, v1.16b			//AES block 1 - round 6
2795
2796	aese	v0.16b, v26.16b
2797	aesmc	v0.16b, v0.16b			//AES block 0 - round 6
2798	aese	v7.16b, v26.16b
2799	aesmc	v7.16b, v7.16b			//AES block 7 - round 6
2800	ldp	q28, q26, [x8, #128]				//load rk8, rk9
2801
2802	aese	v6.16b, v27.16b
2803	aesmc	v6.16b, v6.16b			//AES block 6 - round 7
2804	aese	v3.16b, v27.16b
2805	aesmc	v3.16b, v3.16b			//AES block 3 - round 7
2806
2807	aese	v4.16b, v27.16b
2808	aesmc	v4.16b, v4.16b			//AES block 4 - round 7
2809	aese	v0.16b, v27.16b
2810	aesmc	v0.16b, v0.16b			//AES block 0 - round 7
2811
2812	aese	v7.16b, v27.16b
2813	aesmc	v7.16b, v7.16b			//AES block 7 - round 7
2814	aese	v1.16b, v27.16b
2815	aesmc	v1.16b, v1.16b			//AES block 1 - round 7
2816
2817	aese	v2.16b, v27.16b
2818	aesmc	v2.16b, v2.16b			//AES block 2 - round 7
2819	aese	v5.16b, v27.16b
2820	aesmc	v5.16b, v5.16b			//AES block 5 - round 7
2821
2822	aese	v7.16b, v28.16b
2823	aesmc	v7.16b, v7.16b			//AES block 7 - round 8
2824	aese	v0.16b, v28.16b
2825	aesmc	v0.16b, v0.16b			//AES block 0 - round 8
2826
2827	aese	v4.16b, v28.16b
2828	aesmc	v4.16b, v4.16b			//AES block 4 - round 8
2829	aese	v3.16b, v28.16b
2830	aesmc	v3.16b, v3.16b			//AES block 3 - round 8
2831	aese	v5.16b, v28.16b
2832	aesmc	v5.16b, v5.16b			//AES block 5 - round 8
2833
2834	aese	v2.16b, v28.16b
2835	aesmc	v2.16b, v2.16b			//AES block 2 - round 8
2836	aese	v1.16b, v28.16b
2837	aesmc	v1.16b, v1.16b			//AES block 1 - round 8
2838	aese	v6.16b, v28.16b
2839	aesmc	v6.16b, v6.16b			//AES block 6 - round 8
2840
2841	add	x4, x0, x1, lsr #3		//end_input_ptr
2842	cmp	x0, x5				//check if we have <= 8 blocks
2843	aese	v3.16b, v26.16b
2844	aesmc	v3.16b, v3.16b			//AES block 3 - round 9
2845
2846	ld1	{ v19.16b}, [x3]
2847	ext	v19.16b, v19.16b, v19.16b, #8
2848	rev64	v19.16b, v19.16b
2849	ldp	q27, q28, [x8, #160]				//load rk10, rk11
2850
2851	aese	v6.16b, v26.16b
2852	aesmc	v6.16b, v6.16b			//AES block 6 - round 9
2853	aese	v1.16b, v26.16b
2854	aesmc	v1.16b, v1.16b			//AES block 1 - round 9
2855
2856	aese	v5.16b, v26.16b
2857	aesmc	v5.16b, v5.16b			//AES block 5 - round 9
2858	aese	v2.16b, v26.16b
2859	aesmc	v2.16b, v2.16b			//AES block 2 - round 9
2860
2861	aese	v0.16b, v26.16b
2862	aesmc	v0.16b, v0.16b			//AES block 0 - round 9
2863	aese	v4.16b, v26.16b
2864	aesmc	v4.16b, v4.16b			//AES block 4 - round 9
2865
2866	aese	v6.16b, v27.16b
2867	aesmc	v6.16b, v6.16b			//AES block 14 - round 10
2868	aese	v7.16b, v26.16b
2869	aesmc	v7.16b, v7.16b			//AES block 7 - round 9
2870	aese	v3.16b, v27.16b
2871	aesmc	v3.16b, v3.16b			//AES block 11 - round 10
2872
2873	aese	v1.16b, v27.16b
2874	aesmc	v1.16b, v1.16b			//AES block 9 - round 10
2875	aese	v5.16b, v27.16b
2876	aesmc	v5.16b, v5.16b			//AES block 13 - round 10
2877	aese	v4.16b, v27.16b
2878	aesmc	v4.16b, v4.16b			//AES block 12 - round 10
2879
2880	aese	v0.16b, v27.16b
2881	aesmc	v0.16b, v0.16b			//AES block 8 - round 10
2882	aese	v2.16b, v27.16b
2883	aesmc	v2.16b, v2.16b			//AES block 10 - round 10
2884	aese	v7.16b, v27.16b
2885	aesmc	v7.16b, v7.16b			//AES block 15 - round 10
2886
2887	aese	v6.16b, v28.16b						//AES block 14 - round 11
2888	aese	v3.16b, v28.16b						//AES block 11 - round 11
2889
2890	aese	v4.16b, v28.16b						//AES block 12 - round 11
2891	aese	v7.16b, v28.16b						//AES block 15 - round 11
2892	ldr	q26, [x8, #192]					//load rk12
2893
2894	aese	v1.16b, v28.16b						//AES block 9 - round 11
2895	aese	v5.16b, v28.16b						//AES block 13 - round 11
2896
2897	aese	v2.16b, v28.16b						//AES block 10 - round 11
2898	aese	v0.16b, v28.16b						//AES block 8 - round 11
2899	b.ge	.L192_enc_tail						//handle tail
2900
2901	ldp	q8, q9, [x0], #32			//AES block 0, 1 - load plaintext
2902
2903	ldp	q10, q11, [x0], #32			//AES block 2, 3 - load plaintext
2904
2905	ldp	q12, q13, [x0], #32			//AES block 4, 5 - load plaintext
2906
2907	ldp	q14, q15, [x0], #32			//AES block 6, 7 - load plaintext
2908
2909.inst	0xce006908	//eor3 v8.16b, v8.16b, v0.16b, v26.16b				//AES block 0 - result
2910	rev32	v0.16b, v30.16b				//CTR block 8
2911	add	v30.4s, v30.4s, v31.4s		//CTR block 8
2912
2913.inst	0xce03696b	//eor3 v11.16b, v11.16b, v3.16b, v26.16b				//AES block 3 - result
2914.inst	0xce016929	//eor3 v9.16b, v9.16b, v1.16b, v26.16b				//AES block 1 - result
2915
2916	rev32	v1.16b, v30.16b				//CTR block 9
2917	add	v30.4s, v30.4s, v31.4s		//CTR block 9
2918.inst	0xce04698c	//eor3 v12.16b, v12.16b, v4.16b, v26.16b				//AES block 4 - result
2919
2920.inst	0xce0569ad	//eor3 v13.16b, v13.16b, v5.16b, v26.16b				//AES block 5 - result
2921.inst	0xce0769ef	//eor3 v15.16b, v15.16b, v7.16b, v26.16b				//AES block 7 - result
2922	stp	q8, q9, [x2], #32			//AES block 0, 1 - store result
2923
2924.inst	0xce02694a	//eor3 v10.16b, v10.16b, v2.16b, v26.16b				//AES block 2 - result
2925	rev32	v2.16b, v30.16b				//CTR block 10
2926	add	v30.4s, v30.4s, v31.4s		//CTR block 10
2927
2928	stp	q10, q11, [x2], #32			//AES block 2, 3 - store result
2929	cmp	x0, x5				//check if we have <= 8 blocks
2930
2931	rev32	v3.16b, v30.16b				//CTR block 11
2932	add	v30.4s, v30.4s, v31.4s		//CTR block 11
2933.inst	0xce0669ce	//eor3 v14.16b, v14.16b, v6.16b, v26.16b				//AES block 6 - result
2934
2935	stp	q12, q13, [x2], #32			//AES block 4, 5 - store result
2936
2937	rev32	v4.16b, v30.16b				//CTR block 12
2938	stp	q14, q15, [x2], #32			//AES block 6, 7 - store result
2939	add	v30.4s, v30.4s, v31.4s		//CTR block 12
2940
2941	b.ge	.L192_enc_prepretail					//do prepretail
2942
2943.L192_enc_main_loop:	//main	loop start
2944	rev64	v12.16b, v12.16b						//GHASH block 8k+4 (t0, t1, and t2 free)
2945	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
2946	rev64	v10.16b, v10.16b						//GHASH block 8k+2
2947
2948	rev32	v5.16b, v30.16b				//CTR block 8k+13
2949	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
2950	ldr	q23, [x3, #176]				//load h7l | h7h
2951	ext	v23.16b, v23.16b, v23.16b, #8
2952	ldr	q25, [x3, #208]				//load h8l | h8h
2953	ext	v25.16b, v25.16b, v25.16b, #8
2954
2955	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
2956	rev64	v8.16b, v8.16b						//GHASH block 8k
2957	ldr	q20, [x3, #128]				//load h5l | h5h
2958	ext	v20.16b, v20.16b, v20.16b, #8
2959	ldr	q22, [x3, #160]				//load h6l | h6h
2960	ext	v22.16b, v22.16b, v22.16b, #8
2961
2962	rev64	v9.16b, v9.16b						//GHASH block 8k+1
2963	rev32	v6.16b, v30.16b				//CTR block 8k+14
2964	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
2965
2966	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1
2967	rev64	v11.16b, v11.16b						//GHASH block 8k+3
2968	rev64	v13.16b, v13.16b						//GHASH block 8k+5 (t0, t1, t2 and t3 free)
2969
2970	aese	v0.16b, v26.16b
2971	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
2972	rev32	v7.16b, v30.16b				//CTR block 8k+15
2973	aese	v1.16b, v26.16b
2974	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
2975
2976	aese	v3.16b, v26.16b
2977	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
2978	aese	v5.16b, v26.16b
2979	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
2980	aese	v2.16b, v26.16b
2981	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
2982
2983	aese	v7.16b, v26.16b
2984	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
2985	aese	v4.16b, v26.16b
2986	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
2987	aese	v6.16b, v26.16b
2988	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
2989
2990	ldp	q28, q26, [x8, #32]				//load rk2, rk3
2991	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
2992	aese	v0.16b, v27.16b
2993	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
2994
2995	aese	v4.16b, v27.16b
2996	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
2997	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
2998	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
2999
3000	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
3001	aese	v3.16b, v27.16b
3002	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
3003	ldr	q21, [x3, #144]				//load h6k | h5k
3004	ldr	q24, [x3, #192]				//load h8k | h7k
3005
3006	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
3007	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
3008	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
3009
3010	aese	v1.16b, v27.16b
3011	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
3012	aese	v2.16b, v27.16b
3013	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
3014	aese	v5.16b, v27.16b
3015	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
3016
3017	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
3018	aese	v6.16b, v27.16b
3019	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
3020	aese	v7.16b, v27.16b
3021	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
3022
3023	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
3024	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
3025	aese	v1.16b, v28.16b
3026	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
3027
3028	aese	v3.16b, v28.16b
3029	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
3030	aese	v4.16b, v28.16b
3031	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
3032	aese	v6.16b, v28.16b
3033	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
3034
3035	aese	v5.16b, v28.16b
3036	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
3037	aese	v1.16b, v26.16b
3038	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
3039.inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
3040
3041	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
3042	aese	v7.16b, v28.16b
3043	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
3044	aese	v4.16b, v26.16b
3045	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
3046
3047	aese	v2.16b, v28.16b
3048	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
3049	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
3050	aese	v0.16b, v28.16b
3051	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
3052
3053	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
3054	aese	v3.16b, v26.16b
3055	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
3056	ldp	q27, q28, [x8, #64]				//load rk4, rk5
3057
3058	aese	v0.16b, v26.16b
3059	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
3060	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
3061	ldr	q23, [x3, #80]				//load h3l | h3h
3062	ext	v23.16b, v23.16b, v23.16b, #8
3063	ldr	q25, [x3, #112]				//load h4l | h4h
3064	ext	v25.16b, v25.16b, v25.16b, #8
3065
3066	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k - mid
3067	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
3068	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
3069
3070	aese	v5.16b, v26.16b
3071	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
3072	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
3073	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
3074
3075	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
3076	aese	v6.16b, v26.16b
3077	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
3078.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
3079
3080	aese	v1.16b, v27.16b
3081	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
3082	aese	v3.16b, v27.16b
3083	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
3084	aese	v7.16b, v26.16b
3085	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
3086
3087	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
3088	aese	v6.16b, v27.16b
3089	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
3090	aese	v2.16b, v26.16b
3091	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
3092
3093	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
3094	aese	v0.16b, v27.16b
3095	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
3096	aese	v4.16b, v27.16b
3097	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
3098
3099	aese	v2.16b, v27.16b
3100	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
3101	aese	v5.16b, v27.16b
3102	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
3103	aese	v7.16b, v27.16b
3104	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
3105
3106.inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
3107	aese	v4.16b, v28.16b
3108	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
3109	ldr	q20, [x3, #32]				//load h1l | h1h
3110	ext	v20.16b, v20.16b, v20.16b, #8
3111	ldr	q22, [x3, #64]				//load h2l | h2h
3112	ext	v22.16b, v22.16b, v22.16b, #8
3113
3114	ldp	q26, q27, [x8, #96]				//load rk6, rk7
3115	aese	v2.16b, v28.16b
3116	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
3117	rev64	v15.16b, v15.16b						//GHASH block 8k+7 (t0, t1, t2 and t3 free)
3118
3119	rev64	v14.16b, v14.16b						//GHASH block 8k+6 (t0, t1, and t2 free)
3120	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
3121	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
3122
3123	aese	v5.16b, v28.16b
3124	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
3125	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
3126
3127	aese	v6.16b, v28.16b
3128	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
3129	ldr	q21, [x3, #48]				//load h2k | h1k
3130	ldr	q24, [x3, #96]				//load h4k | h3k
3131
3132	aese	v1.16b, v28.16b
3133	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
3134	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
3135	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
3136
3137	aese	v3.16b, v28.16b
3138	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
3139	aese	v7.16b, v28.16b
3140	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
3141	aese	v0.16b, v28.16b
3142	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
3143
3144	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
3145	aese	v4.16b, v26.16b
3146	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
3147	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
3148
3149	aese	v0.16b, v26.16b
3150	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
3151	aese	v3.16b, v26.16b
3152	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
3153	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
3154
3155	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
3156	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
3157	aese	v2.16b, v26.16b
3158	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
3159
3160	aese	v6.16b, v26.16b
3161	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
3162	aese	v5.16b, v26.16b
3163	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
3164
3165	aese	v7.16b, v26.16b
3166	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
3167	aese	v2.16b, v27.16b
3168	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
3169	aese	v1.16b, v26.16b
3170	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
3171
3172	aese	v6.16b, v27.16b
3173	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
3174	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
3175
3176	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
3177	ldp	q28, q26, [x8, #128]				//load rk8, rk9
3178	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
3179
3180	aese	v4.16b, v27.16b
3181	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
3182	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
3183	aese	v5.16b, v27.16b
3184	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
3185
3186.inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
3187	aese	v7.16b, v27.16b
3188	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
3189	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
3190
3191	ldr	d16, [x10]			//MODULO - load modulo constant
3192.inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
3193	aese	v0.16b, v27.16b
3194	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
3195
3196	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
3197	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
3198	aese	v3.16b, v27.16b
3199	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
3200
3201	aese	v5.16b, v28.16b
3202	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
3203	aese	v4.16b, v28.16b
3204	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
3205	aese	v0.16b, v28.16b
3206	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
3207
3208	aese	v6.16b, v28.16b
3209	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
3210.inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
3211	aese	v1.16b, v27.16b
3212	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
3213
3214	aese	v7.16b, v28.16b
3215	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
3216	aese	v2.16b, v28.16b
3217	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
3218	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
3219
3220	aese	v1.16b, v28.16b
3221	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
3222	aese	v3.16b, v28.16b
3223	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
3224	ldp	q27, q28, [x8, #160]				//load rk10, rk11
3225
3226.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
3227	rev32	v20.16b, v30.16b					//CTR block 8k+16
3228	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+16
3229
3230	aese	v2.16b, v26.16b
3231	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 9
3232.inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
3233.inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
3234
3235	aese	v6.16b, v26.16b
3236	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 9
3237	aese	v3.16b, v26.16b
3238	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 9
3239	ldp	q8, q9, [x0], #32			//AES block 8k+8, 8k+9 - load plaintext
3240
3241	pmull	v21.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
3242	rev32	v22.16b, v30.16b					//CTR block 8k+17
3243	aese	v0.16b, v26.16b
3244	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 9
3245
3246	aese	v4.16b, v26.16b
3247	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 9
3248	aese	v1.16b, v26.16b
3249	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 9
3250	aese	v7.16b, v26.16b
3251	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 9
3252
3253.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
3254	aese	v5.16b, v26.16b
3255	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 9
3256	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+17
3257
3258	aese	v2.16b, v27.16b
3259	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 10
3260	aese	v4.16b, v27.16b
3261	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 10
3262	ldr	q26, [x8, #192]					//load rk12
3263	ext	v29.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment
3264
3265	aese	v0.16b, v27.16b
3266	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 10
3267	aese	v7.16b, v27.16b
3268	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 10
3269	ldp	q10, q11, [x0], #32			//AES block 8k+10, 8k+11 - load plaintext
3270
3271	aese	v4.16b, v28.16b						//AES block 8k+12 - round 11
3272.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
3273	ldp	q12, q13, [x0], #32			//AES block 8k+12, 8k+13 - load plaintext
3274
3275	ldp	q14, q15, [x0], #32			//AES block 8k+14, 8k+15 - load plaintext
3276	aese	v2.16b, v28.16b						//AES block 8k+10 - round 11
3277	aese	v1.16b, v27.16b
3278	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 10
3279
3280	rev32	v23.16b, v30.16b					//CTR block 8k+18
3281	aese	v5.16b, v27.16b
3282	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 10
3283
3284	aese	v3.16b, v27.16b
3285	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 10
3286	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
3287
3288	aese	v6.16b, v27.16b
3289	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 10
3290	aese	v5.16b, v28.16b						//AES block 8k+13 - round 11
3291	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+18
3292
3293	aese	v7.16b, v28.16b						//AES block 8k+15 - round 11
3294	aese	v0.16b, v28.16b						//AES block 8k+8 - round 11
3295.inst	0xce04698c	//eor3 v12.16b, v12.16b, v4.16b, v26.16b				//AES block 4 - result
3296
3297	aese	v6.16b, v28.16b						//AES block 8k+14 - round 11
3298	aese	v3.16b, v28.16b						//AES block 8k+11 - round 11
3299	aese	v1.16b, v28.16b						//AES block 8k+9 - round 11
3300
3301	rev32	v25.16b, v30.16b					//CTR block 8k+19
3302	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+19
3303.inst	0xce0769ef	//eor3 v15.16b, v15.16b, v7.16b, v26.16b				//AES block 7 - result
3304
3305.inst	0xce02694a	//eor3 v10.16b, v10.16b, v2.16b, v26.16b				//AES block 8k+10 - result
3306.inst	0xce006908	//eor3 v8.16b, v8.16b, v0.16b, v26.16b				//AES block 8k+8 - result
3307	mov	v2.16b, v23.16b					//CTR block 8k+18
3308
3309.inst	0xce016929	//eor3 v9.16b, v9.16b, v1.16b, v26.16b				//AES block 8k+9 - result
3310	mov	v1.16b, v22.16b					//CTR block 8k+17
3311	stp	q8, q9, [x2], #32			//AES block 8k+8, 8k+9 - store result
3312	ext	v21.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment
3313
3314.inst	0xce0669ce	//eor3 v14.16b, v14.16b, v6.16b, v26.16b				//AES block 6 - result
3315	mov	v0.16b, v20.16b					//CTR block 8k+16
3316	rev32	v4.16b, v30.16b				//CTR block 8k+20
3317
3318	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+20
3319.inst	0xce0569ad	//eor3 v13.16b, v13.16b, v5.16b, v26.16b				//AES block 5 - result
3320.inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low
3321
3322.inst	0xce03696b	//eor3 v11.16b, v11.16b, v3.16b, v26.16b				//AES block 8k+11 - result
3323	mov	v3.16b, v25.16b					//CTR block 8k+19
3324
3325	stp	q10, q11, [x2], #32			//AES block 8k+10, 8k+11 - store result
3326
3327	stp	q12, q13, [x2], #32			//AES block 8k+12, 8k+13 - store result
3328
3329	cmp	x0, x5				//.LOOP CONTROL
3330	stp	q14, q15, [x2], #32			//AES block 8k+14, 8k+15 - store result
3331	b.lt	.L192_enc_main_loop
3332
3333.L192_enc_prepretail:	//PREPRETAIL
3334	rev32	v5.16b, v30.16b				//CTR block 8k+13
3335	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
3336	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
3337
3338	ldr	q23, [x3, #176]				//load h7l | h7h
3339	ext	v23.16b, v23.16b, v23.16b, #8
3340	ldr	q25, [x3, #208]				//load h8l | h8h
3341	ext	v25.16b, v25.16b, v25.16b, #8
3342	rev64	v8.16b, v8.16b						//GHASH block 8k
3343	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
3344
3345	rev32	v6.16b, v30.16b				//CTR block 8k+14
3346	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
3347	ldr	q21, [x3, #144]				//load h6k | h5k
3348	ldr	q24, [x3, #192]				//load h8k | h7k
3349
3350	rev64	v11.16b, v11.16b						//GHASH block 8k+3
3351	rev64	v10.16b, v10.16b						//GHASH block 8k+2
3352	ldr	q20, [x3, #128]				//load h5l | h5h
3353	ext	v20.16b, v20.16b, v20.16b, #8
3354	ldr	q22, [x3, #160]				//load h6l | h6h
3355	ext	v22.16b, v22.16b, v22.16b, #8
3356
3357	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1
3358	rev32	v7.16b, v30.16b				//CTR block 8k+15
3359	rev64	v9.16b, v9.16b						//GHASH block 8k+1
3360
3361	aese	v5.16b, v26.16b
3362	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
3363	aese	v2.16b, v26.16b
3364	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
3365	aese	v3.16b, v26.16b
3366	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
3367
3368	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
3369	aese	v0.16b, v26.16b
3370	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
3371	aese	v6.16b, v26.16b
3372	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
3373
3374	aese	v1.16b, v26.16b
3375	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
3376	aese	v4.16b, v26.16b
3377	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
3378	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
3379
3380	aese	v6.16b, v27.16b
3381	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
3382	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
3383	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
3384
3385	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
3386	aese	v7.16b, v26.16b
3387	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
3388	ldp	q28, q26, [x8, #32]				//load rk2, rk3
3389
3390	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
3391	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
3392	aese	v2.16b, v27.16b
3393	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
3394
3395	aese	v5.16b, v27.16b
3396	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
3397	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
3398	aese	v1.16b, v27.16b
3399	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
3400
3401	aese	v7.16b, v27.16b
3402	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
3403	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
3404	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
3405
3406	aese	v3.16b, v27.16b
3407	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
3408	aese	v0.16b, v27.16b
3409	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
3410	aese	v4.16b, v27.16b
3411	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
3412
3413	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
3414	aese	v5.16b, v28.16b
3415	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
3416	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
3417
3418	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
3419	aese	v7.16b, v28.16b
3420	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
3421.inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
3422
3423	aese	v5.16b, v26.16b
3424	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
3425	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
3426	aese	v6.16b, v28.16b
3427	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
3428
3429	aese	v0.16b, v28.16b
3430	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
3431	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
3432	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
3433
3434	aese	v3.16b, v28.16b
3435	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
3436	rev64	v13.16b, v13.16b						//GHASH block 8k+5 (t0, t1, t2 and t3 free)
3437	rev64	v14.16b, v14.16b						//GHASH block 8k+6 (t0, t1, and t2 free)
3438
3439	aese	v2.16b, v28.16b
3440	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
3441	aese	v1.16b, v28.16b
3442	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
3443	aese	v4.16b, v28.16b
3444	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
3445
3446	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
3447	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
3448	ldp	q27, q28, [x8, #64]				//load rk4, rk5
3449
3450	aese	v1.16b, v26.16b
3451	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
3452	aese	v6.16b, v26.16b
3453	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
3454	aese	v2.16b, v26.16b
3455	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
3456
3457	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
3458.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
3459	aese	v7.16b, v26.16b
3460	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
3461
3462	ldr	q23, [x3, #80]				//load h3l | h3h
3463	ext	v23.16b, v23.16b, v23.16b, #8
3464	ldr	q25, [x3, #112]				//load h4l | h4h
3465	ext	v25.16b, v25.16b, v25.16b, #8
3466	aese	v3.16b, v26.16b
3467	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
3468	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
3469
3470	ldr	q20, [x3, #32]				//load h1l | h1h
3471	ext	v20.16b, v20.16b, v20.16b, #8
3472	ldr	q22, [x3, #64]				//load h2l | h2h
3473	ext	v22.16b, v22.16b, v22.16b, #8
3474	aese	v4.16b, v26.16b
3475	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
3476	rev64	v12.16b, v12.16b						//GHASH block 8k+4 (t0, t1, and t2 free)
3477
3478	aese	v0.16b, v26.16b
3479	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
3480	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
3481	aese	v6.16b, v27.16b
3482	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
3483
3484	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
3485	aese	v7.16b, v27.16b
3486	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
3487	aese	v5.16b, v27.16b
3488	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
3489
3490.inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
3491	aese	v3.16b, v27.16b
3492	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
3493	aese	v0.16b, v27.16b
3494	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
3495
3496	aese	v1.16b, v27.16b
3497	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
3498	aese	v4.16b, v27.16b
3499	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
3500	aese	v2.16b, v27.16b
3501	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
3502
3503	aese	v0.16b, v28.16b
3504	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
3505	rev64	v15.16b, v15.16b						//GHASH block 8k+7 (t0, t1, t2 and t3 free)
3506	ldr	q21, [x3, #48]				//load h2k | h1k
3507	ldr	q24, [x3, #96]				//load h4k | h3k
3508
3509	aese	v1.16b, v28.16b
3510	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
3511	aese	v2.16b, v28.16b
3512	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
3513	ldp	q26, q27, [x8, #96]				//load rk6, rk7
3514
3515	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
3516	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
3517	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
3518
3519	aese	v4.16b, v28.16b
3520	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
3521	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
3522
3523	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
3524	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
3525	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
3526
3527	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
3528	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
3529	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
3530
3531	aese	v5.16b, v28.16b
3532	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
3533	aese	v1.16b, v26.16b
3534	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
3535	aese	v7.16b, v28.16b
3536	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
3537
3538	aese	v6.16b, v28.16b
3539	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
3540	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
3541	aese	v3.16b, v28.16b
3542	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
3543
3544	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
3545	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
3546
3547	aese	v4.16b, v26.16b
3548	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
3549	aese	v5.16b, v26.16b
3550	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
3551	aese	v1.16b, v27.16b
3552	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
3553
3554	aese	v0.16b, v26.16b
3555	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
3556	aese	v7.16b, v26.16b
3557	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
3558.inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
3559
3560	aese	v2.16b, v26.16b
3561	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
3562.inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
3563	aese	v5.16b, v27.16b
3564	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
3565
3566	aese	v6.16b, v26.16b
3567	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
3568	ldr	d16, [x10]			//MODULO - load modulo constant
3569	aese	v3.16b, v26.16b
3570	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
3571
3572	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
3573	aese	v0.16b, v27.16b
3574	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
3575.inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
3576
3577	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
3578	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
3579	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
3580
3581	aese	v4.16b, v27.16b
3582	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
3583	aese	v2.16b, v27.16b
3584	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
3585	ldp	q28, q26, [x8, #128]				//load rk8, rk9
3586
3587	aese	v3.16b, v27.16b
3588	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
3589.inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
3590
3591.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
3592.inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
3593
3594.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
3595	ext	v29.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment
3596	aese	v7.16b, v27.16b
3597	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
3598	pmull	v21.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
3599
3600	aese	v5.16b, v28.16b
3601	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
3602	aese	v1.16b, v28.16b
3603	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
3604
3605	aese	v6.16b, v27.16b
3606	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
3607	aese	v2.16b, v28.16b
3608	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
3609.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
3610
3611	aese	v3.16b, v28.16b
3612	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
3613	aese	v5.16b, v26.16b
3614	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 9
3615	aese	v4.16b, v28.16b
3616	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
3617
3618	aese	v0.16b, v28.16b
3619	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
3620	aese	v7.16b, v28.16b
3621	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
3622	aese	v6.16b, v28.16b
3623	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
3624
3625	aese	v3.16b, v26.16b
3626	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 9
3627	ldp	q27, q28, [x8, #160]				//load rk10, rk11
3628	aese	v4.16b, v26.16b
3629	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 9
3630
3631	aese	v2.16b, v26.16b
3632	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 9
3633	aese	v7.16b, v26.16b
3634	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 9
3635
3636	ext	v21.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment
3637	aese	v6.16b, v26.16b
3638	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 9
3639	aese	v0.16b, v26.16b
3640	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 9
3641	aese	v1.16b, v26.16b
3642	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 9
3643
3644	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
3645	ldr	q26, [x8, #192]					//load rk12
3646
3647	aese	v7.16b, v27.16b
3648	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 10
3649	aese	v1.16b, v27.16b
3650	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 10
3651	aese	v2.16b, v27.16b
3652	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 10
3653
3654.inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low
3655	aese	v0.16b, v27.16b
3656	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 10
3657	aese	v3.16b, v27.16b
3658	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 10
3659
3660	aese	v1.16b, v28.16b						//AES block 8k+9 - round 11
3661	aese	v7.16b, v28.16b						//AES block 8k+15 - round 11
3662
3663	aese	v4.16b, v27.16b
3664	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 10
3665	aese	v3.16b, v28.16b						//AES block 8k+11 - round 11
3666
3667	aese	v5.16b, v27.16b
3668	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 10
3669	aese	v6.16b, v27.16b
3670	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 10
3671
3672	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
3673	aese	v2.16b, v28.16b						//AES block 8k+10 - round 11
3674	aese	v0.16b, v28.16b						//AES block 8k+8 - round 11
3675
3676	aese	v6.16b, v28.16b						//AES block 8k+14 - round 11
3677	aese	v4.16b, v28.16b						//AES block 8k+12 - round 11
3678	aese	v5.16b, v28.16b						//AES block 8k+13 - round 11
3679
//TAIL: set up the hash keys for the leftover blocks, produce the first leftover result in v9
//and pick the entry point .L192_enc_blocks_more_than_N that matches the bytes left in x5.
//When falling through from PREPRETAIL, v0-v7 hold the AES states after round 11 and v26 holds rk12.
//NOTE(review): x4 is assumed to be the end-of-input pointer set up earlier in this kernel - confirm.
//Each ladder step that is not taken moves the not yet consumed states up by one register, so that
//the first register read by the chosen entry point holds the state that was in v1, and it steps
//the counter in v30 back by the increment in v31.
3680.L192_enc_tail:	//TAIL
3681
3682	ldp	q20, q21, [x3, #128]			//load h5l | h5h (q20) and h6k | h5k (q21)
3683	ext	v20.16b, v20.16b, v20.16b, #8		//swap the 64-bit halves of h5
3684	sub	x5, x4, x0 	//x5 (was main_end_input_ptr) = x4 - x0, the number of bytes left to process
3685
3686	ldr	q8, [x0], #16				//AES block 8k+8 - load plaintext
3687
3688	ldp	q24, q25, [x3, #192]			//load h8k | h7k (q24) and h8l | h8h (q25)
3689	ext	v25.16b, v25.16b, v25.16b, #8		//swap the 64-bit halves of h8
3690
3691	mov	v29.16b, v26.16b		//keep the final round key in v29; v26 is reused as a GHASH temporary in the blocks below
3692
3693	ldp	q22, q23, [x3, #160]			//load h6l | h6h (q22) and h7l | h7h (q23)
3694	ext	v22.16b, v22.16b, v22.16b, #8		//swap the 64-bit halves of h6
3695	ext	v23.16b, v23.16b, v23.16b, #8		//swap the 64-bit halves of h7
3696	cmp	x5, #112				//more than 7 blocks (7 * 16 bytes) left?
3697
3698.inst	0xce007509	//eor3 v9.16b, v8.16b, v0.16b, v29.16b			//AES block 8k+8 - result
3699	ext	v16.16b, v19.16b, v19.16b, #8				//prepare final partial tag
3700	b.gt	.L192_enc_blocks_more_than_7
3701
3702	cmp	x5, #96				//more than 6 blocks left?
3703	mov	v7.16b, v6.16b
3704	movi	v17.8b, #0			//clear GHASH high accumulator (the skipped final-7 block would have initialised it)
3705
3706	mov	v6.16b, v5.16b
3707	movi	v19.8b, #0			//clear GHASH low accumulator (running value was already copied to v16)
3708	sub	v30.4s, v30.4s, v31.4s		//one block fewer than prepared: step the counter back
3709
3710	mov	v5.16b, v4.16b
3711	mov	v4.16b, v3.16b
3712	mov	v3.16b, v2.16b
3713
3714	mov	v2.16b, v1.16b
3715	movi	v18.8b, #0			//clear GHASH mid accumulator
3716	b.gt	.L192_enc_blocks_more_than_6
3717
3718	mov	v7.16b, v6.16b
3719	cmp	x5, #80				//more than 5 blocks left?
3720
3721	mov	v6.16b, v5.16b
3722	mov	v5.16b, v4.16b
3723	mov	v4.16b, v3.16b
3724
3725	mov	v3.16b, v1.16b
3726	sub	v30.4s, v30.4s, v31.4s		//step the counter back by one more block
3727	b.gt	.L192_enc_blocks_more_than_5
3728
3729	cmp	x5, #64				//more than 4 blocks left?
3730	sub	v30.4s, v30.4s, v31.4s		//step the counter back by one more block
3731
3732	mov	v7.16b, v6.16b
3733	mov	v6.16b, v5.16b
3734	mov	v5.16b, v4.16b
3735
3736	mov	v4.16b, v1.16b
3737	b.gt	.L192_enc_blocks_more_than_4
3738
3739	mov	v7.16b, v6.16b
3740	mov	v6.16b, v5.16b
3741	mov	v5.16b, v1.16b
3742
3743	sub	v30.4s, v30.4s, v31.4s		//step the counter back by one more block
3744	cmp	x5, #48				//more than 3 blocks left?
3745	b.gt	.L192_enc_blocks_more_than_3
3746
3747	mov	v7.16b, v6.16b
3748	mov	v6.16b, v1.16b
3749	sub	v30.4s, v30.4s, v31.4s		//step the counter back by one more block
3750
3751	ldr	q24, [x3, #96]				//load h4k | h3k (the skipped final-3 block would have loaded it)
3752	cmp	x5, #32				//more than 2 blocks left?
3753	b.gt	.L192_enc_blocks_more_than_2
3754
3755	sub	v30.4s, v30.4s, v31.4s		//step the counter back by one more block
3756
3757	cmp	x5, #16				//more than 1 block left?
3758	mov	v7.16b, v1.16b
3759	b.gt	.L192_enc_blocks_more_than_1
3760
3761	sub	v30.4s, v30.4s, v31.4s		//step the counter back by one more block
3762	ldr	q21, [x3, #48]				//load h2k | h1k (the skipped final-1 block would have loaded it)
3763	b	.L192_enc_blocks_less_than_1
3764.L192_enc_blocks_more_than_7:	//blocks	left >  7
3765	st1	{ v9.16b}, [x2], #16			 	//AES final-7 block  - store result
3766
3767	rev64	v8.16b, v9.16b						//GHASH final-7 block
3768	ins	v18.d[0], v24.d[1]					//GHASH final-7 block - mid
3769
3770	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
3771
3772	ins	v27.d[0], v8.d[1]					//GHASH final-7 block - mid
3773
3774	ldr	q9, [x0], #16				//AES final-6 block - load plaintext
3775
3776	eor	v27.8b, v27.8b, v8.8b				//GHASH final-7 block - mid
3777	movi	v16.8b, #0						//suppress further partial tag feed in
3778	pmull	v19.1q, v8.1d, v25.1d				//GHASH final-7 block - low
3779
3780	pmull2	v17.1q, v8.2d, v25.2d				//GHASH final-7 block - high
3781
3782	pmull	v18.1q, v27.1d, v18.1d			 	//GHASH final-7 block - mid
3783.inst	0xce017529	//eor3 v9.16b, v9.16b, v1.16b, v29.16b			//AES final-6 block - result
3784.L192_enc_blocks_more_than_6:	//blocks	left >  6
3785
3786	st1	{ v9.16b}, [x2], #16			 	//AES final-6 block - store result
3787
3788	rev64	v8.16b, v9.16b						//GHASH final-6 block
3789
3790	ldr	q9, [x0], #16				//AES final-5 block - load plaintext
3791
3792	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
3793
3794	ins	v27.d[0], v8.d[1]					//GHASH final-6 block - mid
3795
3796	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-6 block - low
3797.inst	0xce027529	//eor3 v9.16b, v9.16b, v2.16b, v29.16b			//AES final-5 block - result
3798
3799	movi	v16.8b, #0						//suppress further partial tag feed in
3800	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-6 block - high
3801	eor	v27.8b, v27.8b, v8.8b				//GHASH final-6 block - mid
3802
3803	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-6 block - mid
3804
3805	eor	v17.16b, v17.16b, v28.16b					//GHASH final-6 block - high
3806	eor	v19.16b, v19.16b, v26.16b					//GHASH final-6 block - low
3807
3808	eor	v18.16b, v18.16b, v27.16b				//GHASH final-6 block - mid
3809.L192_enc_blocks_more_than_5:	//blocks	left >  5
3810
3811	st1	{ v9.16b}, [x2], #16			 	//AES final-5 block - store result
3812
3813	rev64	v8.16b, v9.16b						//GHASH final-5 block
3814
3815	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
3816
3817	ins	v27.d[0], v8.d[1]					//GHASH final-5 block - mid
3818
3819	ldr	q9, [x0], #16				//AES final-4 block - load plaintext
3820	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-5 block - high
3821
3822	eor	v27.8b, v27.8b, v8.8b				//GHASH final-5 block - mid
3823	eor	v17.16b, v17.16b, v28.16b					//GHASH final-5 block - high
3824
3825	ins	v27.d[1], v27.d[0]					//GHASH final-5 block - mid
3826	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-5 block - low
3827
3828	eor	v19.16b, v19.16b, v26.16b					//GHASH final-5 block - low
3829	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-5 block - mid
3830
3831.inst	0xce037529	//eor3 v9.16b, v9.16b, v3.16b, v29.16b			//AES final-4 block - result
3832	movi	v16.8b, #0						//suppress further partial tag feed in
3833
3834	eor	v18.16b, v18.16b, v27.16b				//GHASH final-5 block - mid
3835.L192_enc_blocks_more_than_4:	//blocks	left >  4
3836
3837	st1	{ v9.16b}, [x2], #16				//AES final-4 block - store result
3838
3839	rev64	v8.16b, v9.16b						//GHASH final-4 block
3840
3841	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
3842
3843	ldr	q9, [x0], #16				//AES final-3 block - load plaintext
3844	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final-4 block - high
3845	ins	v27.d[0], v8.d[1]					//GHASH final-4 block - mid
3846
3847	pmull	v26.1q, v8.1d, v20.1d				//GHASH final-4 block - low
3848	eor	v17.16b, v17.16b, v28.16b					//GHASH final-4 block - high
3849
3850	eor	v27.8b, v27.8b, v8.8b				//GHASH final-4 block - mid
3851
3852	movi	v16.8b, #0						//suppress further partial tag feed in
3853	eor	v19.16b, v19.16b, v26.16b					//GHASH final-4 block - low
3854
3855	pmull	v27.1q, v27.1d, v21.1d				//GHASH final-4 block - mid
3856
3857	eor	v18.16b, v18.16b, v27.16b				//GHASH final-4 block - mid
3858.inst	0xce047529	//eor3 v9.16b, v9.16b, v4.16b, v29.16b			//AES final-3 block - result
3859.L192_enc_blocks_more_than_3:	//blocks	left >  3
3860
3861	ldr	q24, [x3, #96]				//load h4k | h3k
3862	st1	{ v9.16b}, [x2], #16			 	//AES final-3 block - store result
3863
3864	rev64	v8.16b, v9.16b						//GHASH final-3 block
3865
3866	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
3867	movi	v16.8b, #0						//suppress further partial tag feed in
3868
3869	ldr	q9, [x0], #16				//AES final-2 block - load plaintext
3870	ldr	q25, [x3, #112]				//load h4l | h4h
3871	ext	v25.16b, v25.16b, v25.16b, #8
3872
3873	ins	v27.d[0], v8.d[1]					//GHASH final-3 block - mid
3874
3875.inst	0xce057529	//eor3 v9.16b, v9.16b, v5.16b, v29.16b			//AES final-2 block - result
3876	eor	v27.8b, v27.8b, v8.8b				//GHASH final-3 block - mid
3877
3878	ins	v27.d[1], v27.d[0]					//GHASH final-3 block - mid
3879	pmull	v26.1q, v8.1d, v25.1d				//GHASH final-3 block - low
3880
3881	pmull2	v28.1q, v8.2d, v25.2d				//GHASH final-3 block - high
3882	pmull2	v27.1q, v27.2d, v24.2d				//GHASH final-3 block - mid
3883
3884	eor	v19.16b, v19.16b, v26.16b					//GHASH final-3 block - low
3885
3886	eor	v18.16b, v18.16b, v27.16b				//GHASH final-3 block - mid
3887	eor	v17.16b, v17.16b, v28.16b					//GHASH final-3 block - high
3888.L192_enc_blocks_more_than_2:	//blocks	left >  2
3889
3890	st1	{ v9.16b}, [x2], #16			 	//AES final-2 block - store result
3891
3892	rev64	v8.16b, v9.16b						//GHASH final-2 block
3893	ldr	q23, [x3, #80]				//load h3l | h3h
3894	ext	v23.16b, v23.16b, v23.16b, #8
3895
3896	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
3897
3898	ldr	q9, [x0], #16				//AES final-1 block - load plaintext
3899	ins	v27.d[0], v8.d[1]					//GHASH final-2 block - mid
3900
3901	eor	v27.8b, v27.8b, v8.8b				//GHASH final-2 block - mid
3902
3903	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-2 block - low
3904	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-2 block - high
3905	movi	v16.8b, #0						//suppress further partial tag feed in
3906
3907	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-2 block - mid
3908
3909	eor	v19.16b, v19.16b, v26.16b					//GHASH final-2 block - low
3910	eor	v17.16b, v17.16b, v28.16b					//GHASH final-2 block - high
3911
3912	eor	v18.16b, v18.16b, v27.16b				//GHASH final-2 block - mid
3913.inst	0xce067529	//eor3 v9.16b, v9.16b, v6.16b, v29.16b			//AES final-1 block - result
//Second-to-last block: store the result held in v9, fold it into the GHASH accumulators
//(v19 low, v17 high, v18 mid) with h2 from [x3, #64] and the upper half of h2k | h1k, then load
//the last plaintext block and form its result from v7 and v29. Falls through to the final block.
3914.L192_enc_blocks_more_than_1:	//blocks	left >  1
3915
3916	ldr	q22, [x3, #64]				//load h2l | h2h
3917	ext	v22.16b, v22.16b, v22.16b, #8		//swap the 64-bit halves of h2
3918	st1	{ v9.16b}, [x2], #16			 	//AES final-1 block - store result
3919
3920	rev64	v8.16b, v9.16b						//GHASH final-1 block
3921
3922	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
3923
3924	ins	v27.d[0], v8.d[1]					//GHASH final-1 block - mid
3925	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-1 block - low
3926
3927	eor	v19.16b, v19.16b, v26.16b					//GHASH final-1 block - low
3928	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-1 block - high
3929	eor	v27.8b, v27.8b, v8.8b				//GHASH final-1 block - mid
3930
3931	ldr	q9, [x0], #16				//AES final block - load plaintext
3932	ldr	q21, [x3, #48]				//load h2k | h1k
3933
3934	ins	v27.d[1], v27.d[0]					//GHASH final-1 block - mid (copy to the upper lane read by pmull2)
3935
3936.inst	0xce077529	//eor3 v9.16b, v9.16b, v7.16b, v29.16b			//AES final block - result
3937	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-1 block - mid
3938
3939	movi	v16.8b, #0						//suppress further partial tag feed in
3940
3941	eor	v18.16b, v18.16b, v27.16b				//GHASH final-1 block - mid
3942	eor	v17.16b, v17.16b, v28.16b					//GHASH final-1 block - high
//Final (possibly partial) block: build a 128-bit mask in v0 from the bit length in x1, zero the
//bits of the result in v9 that lie beyond the input, merge them with the bytes already at [x2] and
//store 16 bytes. The masked block is then hashed, the GHASH accumulators are reduced with the
//constant at [x10], the counter is written to [x16] and the GHASH value to [x3].
//The epilogue restores d8-d15 and releases the 80-byte stack frame.
3943.L192_enc_blocks_less_than_1:	//blocks	left <= 1
3944
3945	mvn	x6, xzr						//temp0_x = 0xffffffffffffffff
3946	and	x1, x1, #127				//bit_length %= 128
3947
3948	sub	x1, x1, #128				//bit_length -= 128
3949
3950	neg	x1, x1				//bit_length = 128 - #bits in input (in range [1,128])
3951
3952	and	x1, x1, #127				//bit_length %= 128, now the number of bits to mask off (in range [0,127])
3953
3954	lsr	x6, x6, x1				//temp0_x = all ones shifted right by (bit_length mod 64): the mask for the partly used 64b half
3955	cmp	x1, #64
3956	mvn	x7, xzr						//temp1_x = 0xffffffffffffffff
3957
3958	csel	x13, x7, x6, lt			//low 64b mask: all ones if fewer than 64 bits are masked off, else temp0_x
3959	csel	x14, x6, xzr, lt		//high 64b mask: temp0_x if fewer than 64 bits are masked off, else zero
3960
3961	mov	v0.d[1], x14
3962	ldr	q20, [x3, #32]				//load h1l | h1h
3963	ext	v20.16b, v20.16b, v20.16b, #8		//swap the 64-bit halves of h1
3964
3965	ld1	{ v26.16b}, [x2]					//load existing bytes where the possibly partial last block is to be stored
3966	mov	v0.d[0], x13					//ctr0b is mask for last block
3967
3968	and	v9.16b, v9.16b, v0.16b					//possibly partial last block has zeroes in highest bits
3969
3970	rev64	v8.16b, v9.16b						//GHASH final block
3971	bif	v9.16b, v26.16b, v0.16b					//insert existing bytes in top end of result before storing
3972
3973	st1	{ v9.16b}, [x2]				//store all 16B
3974
3975	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
3976
3977	ins	v16.d[0], v8.d[1]					//GHASH final block - mid
3978	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final block - high
3979
3980	eor	v17.16b, v17.16b, v28.16b					//GHASH final block - high
3981	pmull	v26.1q, v8.1d, v20.1d				//GHASH final block - low
3982
3983	eor	v16.8b, v16.8b, v8.8b				//GHASH final block - mid
3984
3985	pmull	v16.1q, v16.1d, v21.1d				//GHASH final block - mid
3986
3987	eor	v18.16b, v18.16b, v16.16b				//GHASH final block - mid
3988	ldr	d16, [x10]			//MODULO - load modulo constant
3989
3990	eor	v19.16b, v19.16b, v26.16b					//GHASH final block - low
3991	ext	v21.16b, v17.16b, v17.16b, #8			 	//MODULO - other top alignment
3992
3993	rev32	v30.16b, v30.16b			//byte-reverse each 32-bit counter word back before storing
3994
3995	str	q30, [x16]					//store the updated counter
3996.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
3997
3998	pmull	v29.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
3999
4000.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
4001
4002	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
4003	ext	v21.16b, v18.16b, v18.16b, #8			 	//MODULO - other mid alignment
4004
4005.inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low
4006	ext	v19.16b, v19.16b, v19.16b, #8		//swap the 64-bit halves ...
4007	rev64	v19.16b, v19.16b			//... and reverse the bytes in each half: full 16-byte reversal
4008	st1	{ v19.16b }, [x3]			//write the reduced GHASH value back to [x3]
4009
4010	mov	x0, x9					//return sizes (NOTE(review): x9 is presumably the byte length set at kernel entry - confirm)
4011
4012	ldp	d10, d11, [sp, #16]			//restore callee-saved d10-d15
4013	ldp	d12, d13, [sp, #32]
4014	ldp	d14, d15, [sp, #48]
4015	ldp	d8, d9, [sp], #80			//restore d8, d9 and release the 80-byte frame
4016	ret
4017
4018.L192_enc_ret:
4019	mov	w0, #0x0
4020	ret
4021.size	unroll8_eor3_aes_gcm_enc_192_kernel,.-unroll8_eor3_aes_gcm_enc_192_kernel
4022.globl	unroll8_eor3_aes_gcm_dec_192_kernel
4023.type	unroll8_eor3_aes_gcm_dec_192_kernel,%function
4024.align	4
4025unroll8_eor3_aes_gcm_dec_192_kernel:
4026	AARCH64_VALID_CALL_TARGET
4027	cbz	x1, .L192_dec_ret
4028	stp	d8, d9, [sp, #-80]!
4029	lsr	x9, x1, #3
4030	mov	x16, x4
4031	mov	x8, x5
4032	stp	d10, d11, [sp, #16]
4033	stp	d12, d13, [sp, #32]
4034	stp	d14, d15, [sp, #48]
4035	mov	x5, #0xc200000000000000
4036	stp	x5, xzr, [sp, #64]
4037	add	x10, sp, #64
4038
4039	mov	x5, x9
4040	ld1	{ v0.16b}, [x16]					//CTR block 0
4041	ld1	{ v19.16b}, [x3]
4042
4043	mov	x15, #0x100000000			//set up counter increment
4044	movi	v31.16b, #0x0
4045	mov	v31.d[1], x15
4046
4047	rev32	v30.16b, v0.16b				//set up reversed counter
4048
4049	add	v30.4s, v30.4s, v31.4s		//CTR block 0
4050
4051	rev32	v1.16b, v30.16b				//CTR block 1
4052	add	v30.4s, v30.4s, v31.4s		//CTR block 1
4053
4054	rev32	v2.16b, v30.16b				//CTR block 2
4055	add	v30.4s, v30.4s, v31.4s		//CTR block 2
4056
4057	rev32	v3.16b, v30.16b				//CTR block 3
4058	add	v30.4s, v30.4s, v31.4s		//CTR block 3
4059
4060	rev32	v4.16b, v30.16b				//CTR block 4
4061	add	v30.4s, v30.4s, v31.4s		//CTR block 4
4062
4063	rev32	v5.16b, v30.16b				//CTR block 5
4064	add	v30.4s, v30.4s, v31.4s		//CTR block 5
4065	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
4066
4067	rev32	v6.16b, v30.16b				//CTR block 6
4068	add	v30.4s, v30.4s, v31.4s		//CTR block 6
4069
4070	rev32	v7.16b, v30.16b				//CTR block 7
4071
4072	aese	v3.16b, v26.16b
4073	aesmc	v3.16b, v3.16b			//AES block 3 - round 0
4074	aese	v6.16b, v26.16b
4075	aesmc	v6.16b, v6.16b			//AES block 6 - round 0
4076	aese	v5.16b, v26.16b
4077	aesmc	v5.16b, v5.16b			//AES block 5 - round 0
4078
4079	aese	v0.16b, v26.16b
4080	aesmc	v0.16b, v0.16b			//AES block 0 - round 0
4081	aese	v1.16b, v26.16b
4082	aesmc	v1.16b, v1.16b			//AES block 1 - round 0
4083	aese	v7.16b, v26.16b
4084	aesmc	v7.16b, v7.16b			//AES block 7 - round 0
4085
4086	aese	v2.16b, v26.16b
4087	aesmc	v2.16b, v2.16b			//AES block 2 - round 0
4088	aese	v4.16b, v26.16b
4089	aesmc	v4.16b, v4.16b			//AES block 4 - round 0
4090	ldp	q28, q26, [x8, #32]				//load rk2, rk3
4091
4092	aese	v1.16b, v27.16b
4093	aesmc	v1.16b, v1.16b			//AES block 1 - round 1
4094
4095	aese	v2.16b, v27.16b
4096	aesmc	v2.16b, v2.16b			//AES block 2 - round 1
4097
4098	aese	v0.16b, v27.16b
4099	aesmc	v0.16b, v0.16b			//AES block 0 - round 1
4100	aese	v3.16b, v27.16b
4101	aesmc	v3.16b, v3.16b			//AES block 3 - round 1
4102	aese	v7.16b, v27.16b
4103	aesmc	v7.16b, v7.16b			//AES block 7 - round 1
4104
4105	aese	v5.16b, v27.16b
4106	aesmc	v5.16b, v5.16b			//AES block 5 - round 1
4107	aese	v6.16b, v27.16b
4108	aesmc	v6.16b, v6.16b			//AES block 6 - round 1
4109
4110	aese	v7.16b, v28.16b
4111	aesmc	v7.16b, v7.16b			//AES block 7 - round 2
4112	aese	v0.16b, v28.16b
4113	aesmc	v0.16b, v0.16b			//AES block 0 - round 2
4114	aese	v4.16b, v27.16b
4115	aesmc	v4.16b, v4.16b			//AES block 4 - round 1
4116
4117	aese	v5.16b, v28.16b
4118	aesmc	v5.16b, v5.16b			//AES block 5 - round 2
4119	aese	v1.16b, v28.16b
4120	aesmc	v1.16b, v1.16b			//AES block 1 - round 2
4121	aese	v2.16b, v28.16b
4122	aesmc	v2.16b, v2.16b			//AES block 2 - round 2
4123
4124	aese	v3.16b, v28.16b
4125	aesmc	v3.16b, v3.16b			//AES block 3 - round 2
4126	aese	v4.16b, v28.16b
4127	aesmc	v4.16b, v4.16b			//AES block 4 - round 2
4128	aese	v6.16b, v28.16b
4129	aesmc	v6.16b, v6.16b			//AES block 6 - round 2
4130
4131	aese	v7.16b, v26.16b
4132	aesmc	v7.16b, v7.16b			//AES block 7 - round 3
4133
4134	ldp	q27, q28, [x8, #64]				//load rk4, rk5
4135	aese	v2.16b, v26.16b
4136	aesmc	v2.16b, v2.16b			//AES block 2 - round 3
4137	aese	v5.16b, v26.16b
4138	aesmc	v5.16b, v5.16b			//AES block 5 - round 3
4139
4140	aese	v0.16b, v26.16b
4141	aesmc	v0.16b, v0.16b			//AES block 0 - round 3
4142	aese	v3.16b, v26.16b
4143	aesmc	v3.16b, v3.16b			//AES block 3 - round 3
4144
4145	aese	v4.16b, v26.16b
4146	aesmc	v4.16b, v4.16b			//AES block 4 - round 3
4147	aese	v1.16b, v26.16b
4148	aesmc	v1.16b, v1.16b			//AES block 1 - round 3
4149	aese	v6.16b, v26.16b
4150	aesmc	v6.16b, v6.16b			//AES block 6 - round 3
4151
4152	aese	v3.16b, v27.16b
4153	aesmc	v3.16b, v3.16b			//AES block 3 - round 4
4154	aese	v2.16b, v27.16b
4155	aesmc	v2.16b, v2.16b			//AES block 2 - round 4
4156	aese	v5.16b, v27.16b
4157	aesmc	v5.16b, v5.16b			//AES block 5 - round 4
4158
4159	aese	v1.16b, v27.16b
4160	aesmc	v1.16b, v1.16b			//AES block 1 - round 4
4161	aese	v7.16b, v27.16b
4162	aesmc	v7.16b, v7.16b			//AES block 7 - round 4
4163	aese	v6.16b, v27.16b
4164	aesmc	v6.16b, v6.16b			//AES block 6 - round 4
4165
4166	aese	v0.16b, v27.16b
4167	aesmc	v0.16b, v0.16b			//AES block 0 - round 4
4168	aese	v5.16b, v28.16b
4169	aesmc	v5.16b, v5.16b			//AES block 5 - round 5
4170	aese	v4.16b, v27.16b
4171	aesmc	v4.16b, v4.16b			//AES block 4 - round 4
4172
4173	aese	v6.16b, v28.16b
4174	aesmc	v6.16b, v6.16b			//AES block 6 - round 5
4175	ldp	q26, q27, [x8, #96]				//load rk6, rk7
4176
4177	aese	v0.16b, v28.16b
4178	aesmc	v0.16b, v0.16b			//AES block 0 - round 5
4179	aese	v4.16b, v28.16b
4180	aesmc	v4.16b, v4.16b			//AES block 4 - round 5
4181	aese	v1.16b, v28.16b
4182	aesmc	v1.16b, v1.16b			//AES block 1 - round 5
4183
4184	aese	v3.16b, v28.16b
4185	aesmc	v3.16b, v3.16b			//AES block 3 - round 5
4186	aese	v2.16b, v28.16b
4187	aesmc	v2.16b, v2.16b			//AES block 2 - round 5
4188	aese	v7.16b, v28.16b
4189	aesmc	v7.16b, v7.16b			//AES block 7 - round 5
4190
4191	sub	x5, x5, #1		//byte_len - 1
4192
4193	aese	v4.16b, v26.16b
4194	aesmc	v4.16b, v4.16b			//AES block 4 - round 6
4195	aese	v5.16b, v26.16b
4196	aesmc	v5.16b, v5.16b			//AES block 5 - round 6
4197	aese	v1.16b, v26.16b
4198	aesmc	v1.16b, v1.16b			//AES block 1 - round 6
4199
4200	aese	v0.16b, v26.16b
4201	aesmc	v0.16b, v0.16b			//AES block 0 - round 6
4202	aese	v3.16b, v26.16b
4203	aesmc	v3.16b, v3.16b			//AES block 3 - round 6
4204	aese	v6.16b, v26.16b
4205	aesmc	v6.16b, v6.16b			//AES block 6 - round 6
4206
4207	aese	v7.16b, v26.16b
4208	aesmc	v7.16b, v7.16b			//AES block 7 - round 6
4209	aese	v2.16b, v26.16b
4210	aesmc	v2.16b, v2.16b			//AES block 2 - round 6
4211	ldp	q28, q26, [x8, #128]				//load rk8, rk9
4212
4213	add	v30.4s, v30.4s, v31.4s		//CTR block 7
4214
4215	aese	v3.16b, v27.16b
4216	aesmc	v3.16b, v3.16b			//AES block 3 - round 7
4217	aese	v7.16b, v27.16b
4218	aesmc	v7.16b, v7.16b			//AES block 7 - round 7
4219
4220	aese	v2.16b, v27.16b
4221	aesmc	v2.16b, v2.16b			//AES block 2 - round 7
4222	aese	v1.16b, v27.16b
4223	aesmc	v1.16b, v1.16b			//AES block 1 - round 7
4224	aese	v4.16b, v27.16b
4225	aesmc	v4.16b, v4.16b			//AES block 4 - round 7
4226
4227	aese	v6.16b, v27.16b
4228	aesmc	v6.16b, v6.16b			//AES block 6 - round 7
4229	aese	v0.16b, v27.16b
4230	aesmc	v0.16b, v0.16b			//AES block 0 - round 7
4231	aese	v5.16b, v27.16b
4232	aesmc	v5.16b, v5.16b			//AES block 5 - round 7
4233
4234	aese	v1.16b, v28.16b
4235	aesmc	v1.16b, v1.16b			//AES block 1 - round 8
4236	aese	v2.16b, v28.16b
4237	aesmc	v2.16b, v2.16b			//AES block 2 - round 8
4238	and	x5, x5, #0xffffffffffffff80	//number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
4239
4240	aese	v7.16b, v28.16b
4241	aesmc	v7.16b, v7.16b			//AES block 7 - round 8
4242	aese	v6.16b, v28.16b
4243	aesmc	v6.16b, v6.16b			//AES block 6 - round 8
4244	aese	v5.16b, v28.16b
4245	aesmc	v5.16b, v5.16b			//AES block 5 - round 8
4246
4247	aese	v4.16b, v28.16b
4248	aesmc	v4.16b, v4.16b			//AES block 4 - round 8
4249	aese	v3.16b, v28.16b
4250	aesmc	v3.16b, v3.16b			//AES block 3 - round 8
4251	aese	v0.16b, v28.16b
4252	aesmc	v0.16b, v0.16b			//AES block 0 - round 8
4253
4254	add	x4, x0, x1, lsr #3		//end_input_ptr
4255	aese	v6.16b, v26.16b
4256	aesmc	v6.16b, v6.16b			//AES block 6 - round 9
4257
4258	ld1	{ v19.16b}, [x3]
4259	ext	v19.16b, v19.16b, v19.16b, #8
4260	rev64	v19.16b, v19.16b
4261
4262	ldp	q27, q28, [x8, #160]				//load rk10, rk11
4263
4264	aese	v0.16b, v26.16b
4265	aesmc	v0.16b, v0.16b			//AES block 0 - round 9
4266	add	x5, x5, x0
4267
4268	aese	v1.16b, v26.16b
4269	aesmc	v1.16b, v1.16b			//AES block 1 - round 9
4270	aese	v7.16b, v26.16b
4271	aesmc	v7.16b, v7.16b			//AES block 7 - round 9
4272	aese	v4.16b, v26.16b
4273	aesmc	v4.16b, v4.16b			//AES block 4 - round 9
4274
4275	cmp	x0, x5				//check if we have <= 8 blocks
4276	aese	v3.16b, v26.16b
4277	aesmc	v3.16b, v3.16b			//AES block 3 - round 9
4278
4279	aese	v5.16b, v26.16b
4280	aesmc	v5.16b, v5.16b			//AES block 5 - round 9
4281	aese	v2.16b, v26.16b
4282	aesmc	v2.16b, v2.16b			//AES block 2 - round 9
4283
4284	aese	v3.16b, v27.16b
4285	aesmc	v3.16b, v3.16b			//AES block 3 - round 10
4286	aese	v1.16b, v27.16b
4287	aesmc	v1.16b, v1.16b			//AES block 1 - round 10
4288	aese	v7.16b, v27.16b
4289	aesmc	v7.16b, v7.16b			//AES block 7 - round 10
4290
4291	aese	v4.16b, v27.16b
4292	aesmc	v4.16b, v4.16b			//AES block 4 - round 10
4293	aese	v0.16b, v27.16b
4294	aesmc	v0.16b, v0.16b			//AES block 0 - round 10
4295	aese	v2.16b, v27.16b
4296	aesmc	v2.16b, v2.16b			//AES block 2 - round 10
4297
4298	aese	v6.16b, v27.16b
4299	aesmc	v6.16b, v6.16b			//AES block 6 - round 10
4300	aese	v5.16b, v27.16b
4301	aesmc	v5.16b, v5.16b			//AES block 5 - round 10
4302	ldr	q26, [x8, #192]					//load rk12
4303
4304	aese	v0.16b, v28.16b						//AES block 0 - round 11
4305	aese	v1.16b, v28.16b						//AES block 1 - round 11
4306	aese	v4.16b, v28.16b						//AES block 4 - round 11
4307
4308	aese	v6.16b, v28.16b						//AES block 6 - round 11
4309	aese	v5.16b, v28.16b						//AES block 5 - round 11
4310	aese	v7.16b, v28.16b						//AES block 7 - round 11
4311
4312	aese	v2.16b, v28.16b						//AES block 2 - round 11
4313	aese	v3.16b, v28.16b						//AES block 3 - round 11
4314	b.ge	.L192_dec_tail						//handle tail
4315
4316	ldp	q8, q9, [x0], #32			//AES block 0, 1 - load ciphertext
4317
4318	ldp	q10, q11, [x0], #32			//AES block 2, 3 - load ciphertext
4319
4320	ldp	q12, q13, [x0], #32			//AES block 4, 5 - load ciphertext
4321
4322.inst	0xce016921	//eor3 v1.16b, v9.16b, v1.16b, v26.16b				//AES block 1 - result
4323.inst	0xce006900	//eor3 v0.16b, v8.16b, v0.16b, v26.16b				//AES block 0 - result
4324	stp	q0, q1, [x2], #32			//AES block 0, 1 - store result
4325
4326	rev32	v0.16b, v30.16b				//CTR block 8
4327	add	v30.4s, v30.4s, v31.4s		//CTR block 8
4328
4329	rev32	v1.16b, v30.16b				//CTR block 9
4330	add	v30.4s, v30.4s, v31.4s		//CTR block 9
4331.inst	0xce036963	//eor3 v3.16b, v11.16b, v3.16b, v26.16b				//AES block 3 - result
4332
4333.inst	0xce026942	//eor3 v2.16b, v10.16b, v2.16b, v26.16b				//AES block 2 - result
4334	stp	q2, q3, [x2], #32			//AES block 2, 3 - store result
4335	ldp	q14, q15, [x0], #32			//AES block 6, 7 - load ciphertext
4336
4337	rev32	v2.16b, v30.16b				//CTR block 10
4338	add	v30.4s, v30.4s, v31.4s		//CTR block 10
4339
4340.inst	0xce046984	//eor3 v4.16b, v12.16b, v4.16b, v26.16b				//AES block 4 - result
4341
4342	rev32	v3.16b, v30.16b				//CTR block 11
4343	add	v30.4s, v30.4s, v31.4s		//CTR block 11
4344
4345.inst	0xce0569a5	//eor3 v5.16b, v13.16b, v5.16b, v26.16b				//AES block 5 - result
4346	stp	q4, q5, [x2], #32			//AES block 4, 5 - store result
4347	cmp	x0, x5				//check if we have <= 8 blocks
4348
4349.inst	0xce0669c6	//eor3 v6.16b, v14.16b, v6.16b, v26.16b				//AES block 6 - result
4350.inst	0xce0769e7	//eor3 v7.16b, v15.16b, v7.16b, v26.16b				//AES block 7 - result
4351	rev32	v4.16b, v30.16b				//CTR block 12
4352
4353	add	v30.4s, v30.4s, v31.4s		//CTR block 12
4354	stp	q6, q7, [x2], #32			//AES block 6, 7 - store result
4355	b.ge	.L192_dec_prepretail					//do prepretail
4356
// Main loop of the 192-bit-key decryption path: handles 8 blocks (128 bytes)
// per pass.  AES, GHASH and load/store instructions are interleaved, so the
// order of the instructions below matters and must not be changed.
//
// Work done in one pass:
//   - AES rounds 0-11 on the eight counter blocks in v0-v7, using round keys
//     loaded from [x8] into the rotating registers v26/v27/v28; rk12 is
//     applied by the final eor3 together with the ciphertext.
//   - GHASH of the eight blocks already held in v8-v15 (loaded before the
//     loop or by the previous pass), using values loaded from [x3]
//     (h1..h8 and the "k" entries named in the load comments).  Partial
//     products are collected in v17 (high), v18 (mid) and v19 (low), then
//     reduced with the constant loaded from [x10]; the result stays in v19.
//   - The next 128 bytes are loaded from [x0] into v8-v15, combined with the
//     AES output and stored to [x2]; both pointers post-increment.
//   - Counter blocks for the next pass are derived from v30 (stepped by v31):
//     v5-v7 at the top of the loop, v0-v4 at the bottom (v0-v3 via
//     v20/v22/v23/v25).
// The loop repeats while x0 < x5.
4357.L192_dec_main_loop:	//main	loop start
4358	rev64	v9.16b, v9.16b						//GHASH block 8k+1
4359	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
4360	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
4361
4362	rev64	v8.16b, v8.16b						//GHASH block 8k
4363	rev32	v5.16b, v30.16b				//CTR block 8k+13
4364	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
4365
4366	ldr	q23, [x3, #176]				//load h7l | h7h
4367	ext	v23.16b, v23.16b, v23.16b, #8
4368	ldr	q25, [x3, #208]				//load h8l | h8h
4369	ext	v25.16b, v25.16b, v25.16b, #8
4370	rev64	v12.16b, v12.16b						//GHASH block 8k+4
4371	rev64	v11.16b, v11.16b						//GHASH block 8k+3
4372
4373	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1
4374	rev32	v6.16b, v30.16b				//CTR block 8k+14
4375	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
4376
4377	rev64	v13.16b, v13.16b						//GHASH block 8k+5
4378
4379	rev32	v7.16b, v30.16b				//CTR block 8k+15
4380	aese	v1.16b, v26.16b
4381	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
4382	aese	v6.16b, v26.16b
4383	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
4384
4385	aese	v5.16b, v26.16b
4386	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
4387	aese	v4.16b, v26.16b
4388	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
4389	aese	v0.16b, v26.16b
4390	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
4391
4392	aese	v7.16b, v26.16b
4393	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
4394	aese	v2.16b, v26.16b
4395	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
4396	aese	v3.16b, v26.16b
4397	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
4398
4399	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
4400	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
4401	ldp	q28, q26, [x8, #32]				//load rk2, rk3
4402
4403	aese	v6.16b, v27.16b
4404	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
4405	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
4406	ldr	q20, [x3, #128]				//load h5l | h5h
4407	ext	v20.16b, v20.16b, v20.16b, #8
4408	ldr	q22, [x3, #160]				//load h6l | h6h
4409	ext	v22.16b, v22.16b, v22.16b, #8
4410
4411	aese	v0.16b, v27.16b
4412	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
4413	aese	v3.16b, v27.16b
4414	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
4415	aese	v7.16b, v27.16b
4416	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
4417
4418	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
4419	aese	v2.16b, v27.16b
4420	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
4421	aese	v4.16b, v27.16b
4422	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
4423
4424	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
4425	rev64	v10.16b, v10.16b						//GHASH block 8k+2
4426	aese	v1.16b, v27.16b
4427	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
4428
4429	aese	v5.16b, v27.16b
4430	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
4431	ldr	q21, [x3, #144]				//load h6k | h5k
4432	ldr	q24, [x3, #192]				//load h8k | h7k
4433	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
4434
4435	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
4436	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
4437	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
4438
4439	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
4440	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
4441	aese	v6.16b, v28.16b
4442	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
4443
4444	aese	v2.16b, v28.16b
4445	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
4446	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
4447.inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
4448
4449	aese	v1.16b, v28.16b
4450	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
4451	aese	v6.16b, v26.16b
4452	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
4453	aese	v4.16b, v28.16b
4454	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
4455
4456	aese	v0.16b, v28.16b
4457	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
4458	aese	v7.16b, v28.16b
4459	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
4460	aese	v3.16b, v28.16b
4461	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
4462
4463	ldr	q23, [x3, #80]				//load h3l | h3h
4464	ext	v23.16b, v23.16b, v23.16b, #8
4465	ldr	q25, [x3, #112]				//load h4l | h4h
4466	ext	v25.16b, v25.16b, v25.16b, #8
4467	aese	v5.16b, v28.16b
4468	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
4469	aese	v2.16b, v26.16b
4470	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
4471
4472	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
4473	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
4474	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
4475
4476	aese	v3.16b, v26.16b
4477	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
4478	aese	v4.16b, v26.16b
4479	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
4480
4481	aese	v0.16b, v26.16b
4482	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
4483	aese	v7.16b, v26.16b
4484	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
4485	ldp	q27, q28, [x8, #64]				//load rk4, rk5
4486
4487	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
4488.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
4489	aese	v1.16b, v26.16b
4490	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
4491
4492	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
4493	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
4494
4495	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
4496	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
4497	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
4498
4499	aese	v5.16b, v26.16b
4500	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
4501	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
4502	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
4503
4504	aese	v4.16b, v27.16b
4505	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
4506	aese	v6.16b, v27.16b
4507	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
4508	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
4509
4510	aese	v5.16b, v27.16b
4511	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
4512	aese	v1.16b, v27.16b
4513	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
4514	aese	v3.16b, v27.16b
4515	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
4516
4517	aese	v2.16b, v27.16b
4518	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
4519	aese	v0.16b, v27.16b
4520	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
4521	aese	v7.16b, v27.16b
4522	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
4523
4524	ldr	q20, [x3, #32]				//load h1l | h1h
4525	ext	v20.16b, v20.16b, v20.16b, #8
4526	ldr	q22, [x3, #64]				//load h2l | h2h
4527	ext	v22.16b, v22.16b, v22.16b, #8
4528	aese	v3.16b, v28.16b
4529	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
4530	aese	v5.16b, v28.16b
4531	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
4532
4533	ldp	q26, q27, [x8, #96]				//load rk6, rk7
4534	aese	v7.16b, v28.16b
4535	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
4536	rev64	v15.16b, v15.16b						//GHASH block 8k+7
4537
4538	aese	v4.16b, v28.16b
4539	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
4540.inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
4541	aese	v1.16b, v28.16b
4542	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
4543
4544	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
4545	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
4546	aese	v2.16b, v28.16b
4547	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
4548
4549	aese	v6.16b, v28.16b
4550	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
4551	aese	v0.16b, v28.16b
4552	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
4553	rev64	v14.16b, v14.16b						//GHASH block 8k+6
4554
4555	ldr	q21, [x3, #48]				//load h2k | h1k
4556	ldr	q24, [x3, #96]				//load h4k | h3k
4557	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
4558	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
4559
4560	aese	v0.16b, v26.16b
4561	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
4562	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
4563	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
4564
4565	aese	v7.16b, v26.16b
4566	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
4567	aese	v2.16b, v26.16b
4568	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
4569	aese	v6.16b, v26.16b
4570	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
4571
4572	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
4573	aese	v3.16b, v26.16b
4574	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
4575	aese	v1.16b, v26.16b
4576	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
4577
4578	aese	v2.16b, v27.16b
4579	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
4580	aese	v6.16b, v27.16b
4581	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
4582	aese	v5.16b, v26.16b
4583	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
4584
4585	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
4586.inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
4587.inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
4588
4589	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
4590	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
4591	aese	v4.16b, v26.16b
4592	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
4593
4594	aese	v5.16b, v27.16b
4595	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
4596	ldp	q28, q26, [x8, #128]				//load rk8, rk9
4597	aese	v3.16b, v27.16b
4598	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
4599
4600	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
4601	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
4602	aese	v1.16b, v27.16b
4603	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
4604
4605	aese	v4.16b, v27.16b
4606	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
4607	aese	v0.16b, v27.16b
4608	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
4609	aese	v7.16b, v27.16b
4610	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
4611
4612.inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
4613	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
4614	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
4615
4616	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
4617	ldr	d16, [x10]			//MODULO - load modulo constant
4618	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
4619
4620	aese	v2.16b, v28.16b
4621	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
4622	aese	v5.16b, v28.16b
4623	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
4624	aese	v7.16b, v28.16b
4625	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
4626
4627	aese	v0.16b, v28.16b
4628	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
4629	aese	v3.16b, v28.16b
4630	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
4631.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
4632
4633	aese	v4.16b, v28.16b
4634	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
4635	aese	v1.16b, v28.16b
4636	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
4637	aese	v6.16b, v28.16b
4638	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
4639
4640.inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
4641	rev32	v20.16b, v30.16b					//CTR block 8k+16
4642	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+16
4643
4644	aese	v5.16b, v26.16b
4645	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 9
4646.inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
4647	aese	v1.16b, v26.16b
4648	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 9
4649
4650	aese	v3.16b, v26.16b
4651	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 9
4652	aese	v7.16b, v26.16b
4653	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 9
4654	ldp	q27, q28, [x8, #160]				//load rk10, rk11
4655
4656.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
4657	ldp	q8, q9, [x0], #32			//AES block 8k+8, 8k+9 - load ciphertext
4658
4659	aese	v2.16b, v26.16b
4660	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 9
4661	aese	v0.16b, v26.16b
4662	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 9
4663	ldp	q10, q11, [x0], #32			//AES block 8k+10, 8k+11 - load ciphertext
4664
4665	rev32	v22.16b, v30.16b					//CTR block 8k+17
4666	pmull	v29.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
4667	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+17
4668
4669	aese	v6.16b, v26.16b
4670	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 9
4671	aese	v4.16b, v26.16b
4672	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 9
4673	ext	v21.16b, v17.16b, v17.16b, #8			 	//MODULO - other top alignment
4674
4675	aese	v3.16b, v27.16b
4676	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 10
4677	aese	v7.16b, v27.16b
4678	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 10
4679	ldp	q12, q13, [x0], #32			//AES block 8k+12, 8k+13 - load ciphertext
4680
4681	rev32	v23.16b, v30.16b					//CTR block 8k+18
4682	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+18
4683.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
4684
4685	aese	v0.16b, v27.16b
4686	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 10
4687	aese	v1.16b, v27.16b
4688	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 10
4689	ldr	q26, [x8, #192]					//load rk12
4690
4691	ldp	q14, q15, [x0], #32			//AES block 8k+14, 8k+15 - load ciphertext
4692	aese	v4.16b, v27.16b
4693	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 10
4694	aese	v6.16b, v27.16b
4695	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 10
4696
4697	aese	v0.16b, v28.16b						//AES block 8k+8 - round 11
4698	ext	v21.16b, v18.16b, v18.16b, #8			 	//MODULO - other mid alignment
4699	aese	v1.16b, v28.16b						//AES block 8k+9 - round 11
4700
4701	aese	v2.16b, v27.16b
4702	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 10
4703	aese	v6.16b, v28.16b						//AES block 8k+14 - round 11
4704	aese	v3.16b, v28.16b						//AES block 8k+11 - round 11
4705
4706.inst	0xce006900	//eor3 v0.16b, v8.16b, v0.16b, v26.16b				//AES block 8k+8 - result
4707	rev32	v25.16b, v30.16b					//CTR block 8k+19
4708	aese	v5.16b, v27.16b
4709	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 10
4710
4711	aese	v4.16b, v28.16b						//AES block 8k+12 - round 11
4712	aese	v2.16b, v28.16b						//AES block 8k+10 - round 11
4713	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+19
4714
4715	aese	v7.16b, v28.16b						//AES block 8k+15 - round 11
4716	aese	v5.16b, v28.16b						//AES block 8k+13 - round 11
4717	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
4718
4719.inst	0xce016921	//eor3 v1.16b, v9.16b, v1.16b, v26.16b				//AES block 8k+9 - result
4720	stp	q0, q1, [x2], #32			//AES block 8k+8, 8k+9 - store result
4721.inst	0xce036963	//eor3 v3.16b, v11.16b, v3.16b, v26.16b				//AES block 8k+11 - result
4722
4723.inst	0xce026942	//eor3 v2.16b, v10.16b, v2.16b, v26.16b				//AES block 8k+10 - result
4724.inst	0xce0769e7	//eor3 v7.16b, v15.16b, v7.16b, v26.16b				//AES block 8k+15 - result
4725	stp	q2, q3, [x2], #32			//AES block 8k+10, 8k+11 - store result
4726
4727.inst	0xce0569a5	//eor3 v5.16b, v13.16b, v5.16b, v26.16b				//AES block 8k+13 - result
4728.inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low
4729	mov	v3.16b, v25.16b					//CTR block 8k+19
4730
4731.inst	0xce046984	//eor3 v4.16b, v12.16b, v4.16b, v26.16b				//AES block 8k+12 - result
4732	stp	q4, q5, [x2], #32			//AES block 8k+12, 8k+13 - store result
4733	cmp	x0, x5				//LOOP CONTROL - compare input pointer (x0) with the bound in x5
4734
4735.inst	0xce0669c6	//eor3 v6.16b, v14.16b, v6.16b, v26.16b				//AES block 8k+14 - result
4736	stp	q6, q7, [x2], #32			//AES block 8k+14, 8k+15 - store result
4737	mov	v0.16b, v20.16b					//CTR block 8k+16
4738
4739	mov	v1.16b, v22.16b					//CTR block 8k+17
4740	mov	v2.16b, v23.16b					//CTR block 8k+18
4741
4742	rev32	v4.16b, v30.16b				//CTR block 8k+20
4743	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+20
4744	b.lt	.L192_dec_main_loop			//flags from the cmp above; the vector ops in between do not change them
4745
4746.L192_dec_prepretail:	//PREPRETAIL
4747	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
4748	rev32	v5.16b, v30.16b				//CTR block 8k+13
4749	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
4750
4751	ldr	q23, [x3, #176]				//load h7l | h7h
4752	ext	v23.16b, v23.16b, v23.16b, #8
4753	ldr	q25, [x3, #208]				//load h8l | h8h
4754	ext	v25.16b, v25.16b, v25.16b, #8
4755	rev64	v8.16b, v8.16b						//GHASH block 8k
4756	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
4757
4758	rev64	v11.16b, v11.16b						//GHASH block 8k+3
4759	rev32	v6.16b, v30.16b				//CTR block 8k+14
4760	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
4761
4762	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1
4763	rev64	v10.16b, v10.16b						//GHASH block 8k+2
4764	rev64	v9.16b, v9.16b						//GHASH block 8k+1
4765
4766	ldr	q20, [x3, #128]				//load h5l | h5h
4767	ext	v20.16b, v20.16b, v20.16b, #8
4768	ldr	q22, [x3, #160]				//load h6l | h6h
4769	ext	v22.16b, v22.16b, v22.16b, #8
4770	rev32	v7.16b, v30.16b				//CTR block 8k+15
4771
4772	aese	v0.16b, v26.16b
4773	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
4774	aese	v6.16b, v26.16b
4775	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
4776	aese	v5.16b, v26.16b
4777	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
4778
4779	aese	v3.16b, v26.16b
4780	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
4781	aese	v2.16b, v26.16b
4782	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
4783	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
4784
4785	aese	v4.16b, v26.16b
4786	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
4787	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
4788	aese	v1.16b, v26.16b
4789	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
4790
4791	aese	v6.16b, v27.16b
4792	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
4793	aese	v7.16b, v26.16b
4794	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
4795	ldp	q28, q26, [x8, #32]				//load rk2, rk3
4796
4797	aese	v4.16b, v27.16b
4798	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
4799	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
4800	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
4801
4802	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
4803	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
4804	aese	v3.16b, v27.16b
4805	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
4806
4807	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
4808	aese	v7.16b, v27.16b
4809	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
4810	aese	v0.16b, v27.16b
4811	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
4812
4813	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
4814	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
4815	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
4816
4817	aese	v2.16b, v27.16b
4818	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
4819	aese	v1.16b, v27.16b
4820	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
4821	aese	v5.16b, v27.16b
4822	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
4823
4824	ldr	q21, [x3, #144]				//load h6k | h5k
4825	ldr	q24, [x3, #192]				//load h8k | h7k
4826	aese	v3.16b, v28.16b
4827	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
4828	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
4829
4830	aese	v6.16b, v28.16b
4831	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
4832	rev64	v13.16b, v13.16b						//GHASH block 8k+5
4833	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
4834
4835.inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
4836	aese	v4.16b, v28.16b
4837	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
4838	aese	v5.16b, v28.16b
4839	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
4840
4841	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
4842	aese	v3.16b, v26.16b
4843	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
4844	aese	v7.16b, v28.16b
4845	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
4846
4847	aese	v0.16b, v28.16b
4848	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
4849	aese	v2.16b, v28.16b
4850	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
4851	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
4852
4853	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
4854	aese	v1.16b, v28.16b
4855	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
4856	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
4857
4858	aese	v5.16b, v26.16b
4859	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
4860	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
4861	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
4862
4863	aese	v7.16b, v26.16b
4864	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
4865	aese	v6.16b, v26.16b
4866	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
4867	aese	v4.16b, v26.16b
4868	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
4869
4870.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
4871	ldp	q27, q28, [x8, #64]				//load rk4, rk5
4872	aese	v0.16b, v26.16b
4873	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
4874
4875	ldr	q23, [x3, #80]				//load h3l | h3h
4876	ext	v23.16b, v23.16b, v23.16b, #8
4877	ldr	q25, [x3, #112]				//load h4l | h4h
4878	ext	v25.16b, v25.16b, v25.16b, #8
4879	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
4880	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
4881
4882	ldr	q20, [x3, #32]				//load h1l | h1h
4883	ext	v20.16b, v20.16b, v20.16b, #8
4884	ldr	q22, [x3, #64]				//load h2l | h2h
4885	ext	v22.16b, v22.16b, v22.16b, #8
4886	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
4887	aese	v2.16b, v26.16b
4888	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
4889
4890	rev64	v15.16b, v15.16b						//GHASH block 8k+7
4891
4892.inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
4893	rev64	v12.16b, v12.16b						//GHASH block 8k+4
4894
4895	aese	v5.16b, v27.16b
4896	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
4897	aese	v4.16b, v27.16b
4898	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
4899	aese	v1.16b, v26.16b
4900	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
4901
4902	aese	v2.16b, v27.16b
4903	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
4904	aese	v0.16b, v27.16b
4905	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
4906	aese	v3.16b, v27.16b
4907	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
4908
4909	aese	v1.16b, v27.16b
4910	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
4911	aese	v6.16b, v27.16b
4912	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
4913	aese	v7.16b, v27.16b
4914	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
4915
4916	rev64	v14.16b, v14.16b						//GHASH block 8k+6
4917	ldr	q21, [x3, #48]				//load h2k | h1k
4918	ldr	q24, [x3, #96]				//load h4k | h3k
4919	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
4920
4921	aese	v7.16b, v28.16b
4922	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
4923	aese	v1.16b, v28.16b
4924	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
4925	aese	v2.16b, v28.16b
4926	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
4927
4928	ldp	q26, q27, [x8, #96]				//load rk6, rk7
4929	aese	v6.16b, v28.16b
4930	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
4931	aese	v5.16b, v28.16b
4932	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
4933
4934	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
4935	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
4936	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
4937
4938	aese	v4.16b, v28.16b
4939	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
4940
4941	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
4942	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
4943	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
4944
4945	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
4946	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
4947	aese	v0.16b, v28.16b
4948	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
4949
4950	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
4951	aese	v3.16b, v28.16b
4952	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
4953	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
4954
4955	aese	v4.16b, v26.16b
4956	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
4957	aese	v2.16b, v26.16b
4958	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
4959
4960	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
4961	aese	v1.16b, v26.16b
4962	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
4963	aese	v7.16b, v26.16b
4964	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
4965
4966	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
4967	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
4968	aese	v0.16b, v26.16b
4969	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
4970
4971	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
4972	aese	v5.16b, v26.16b
4973	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
4974	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
4975
4976.inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
4977	aese	v4.16b, v27.16b
4978	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
4979.inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
4980
4981	aese	v3.16b, v26.16b
4982	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
4983	aese	v6.16b, v26.16b
4984	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
4985	aese	v5.16b, v27.16b
4986	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
4987
4988	ldp	q28, q26, [x8, #128]				//load rk8, rk9
4989	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
4990	aese	v2.16b, v27.16b
4991	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
4992
4993	ldr	d16, [x10]			//MODULO - load modulo constant
4994.inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
4995	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
4996
4997	aese	v1.16b, v27.16b
4998	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
4999	aese	v7.16b, v27.16b
5000	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
5001	aese	v6.16b, v27.16b
5002	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
5003
5004.inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
5005.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
5006.inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
5007
5008	aese	v0.16b, v27.16b
5009	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
5010	aese	v3.16b, v27.16b
5011	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
5012
5013.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
5014	ext	v21.16b, v17.16b, v17.16b, #8			 	//MODULO - other top alignment
5015	aese	v2.16b, v28.16b
5016	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
5017
5018	aese	v6.16b, v28.16b
5019	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
5020	aese	v7.16b, v28.16b
5021	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
5022	aese	v1.16b, v28.16b
5023	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
5024
5025	aese	v3.16b, v28.16b
5026	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
5027	pmull	v29.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
5028	aese	v0.16b, v28.16b
5029	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
5030
5031	aese	v5.16b, v28.16b
5032	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
5033	aese	v4.16b, v28.16b
5034	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
5035	ldp	q27, q28, [x8, #160]				//load rk10, rk11
5036
5037.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
5038	aese	v7.16b, v26.16b
5039	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 9
5040	aese	v6.16b, v26.16b
5041	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 9
5042
5043	aese	v5.16b, v26.16b
5044	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 9
5045	aese	v2.16b, v26.16b
5046	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 9
5047	aese	v3.16b, v26.16b
5048	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 9
5049
5050	aese	v0.16b, v26.16b
5051	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 9
5052	aese	v1.16b, v26.16b
5053	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 9
5054	aese	v4.16b, v26.16b
5055	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 9
5056
5057	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
5058	ldr	q26, [x8, #192]					//load rk12
5059	ext	v21.16b, v18.16b, v18.16b, #8			 	//MODULO - other mid alignment
5060
5061	aese	v2.16b, v27.16b
5062	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 10
5063	aese	v5.16b, v27.16b
5064	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 10
5065	aese	v0.16b, v27.16b
5066	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 10
5067
5068	aese	v4.16b, v27.16b
5069	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 10
5070	aese	v6.16b, v27.16b
5071	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 10
5072	aese	v7.16b, v27.16b
5073	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 10
5074
5075	aese	v0.16b, v28.16b						//AES block 8k+8 - round 11
5076.inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low
5077	aese	v5.16b, v28.16b						//AES block 8k+13 - round 11
5078
5079	aese	v2.16b, v28.16b						//AES block 8k+10 - round 11
5080	aese	v3.16b, v27.16b
5081	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 10
5082	aese	v1.16b, v27.16b
5083	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 10
5084
5085	aese	v6.16b, v28.16b						//AES block 8k+14 - round 11
5086	aese	v4.16b, v28.16b						//AES block 8k+12 - round 11
5087	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
5088
5089	aese	v3.16b, v28.16b						//AES block 8k+11 - round 11
5090	aese	v1.16b, v28.16b						//AES block 8k+9 - round 11
5091	aese	v7.16b, v28.16b						//AES block 8k+15 - round 11
5092
// Tail entry of the 192-bit decrypt kernel: processes the input that remains between x0 and x4.
// It loads hash-key table entries from x3, produces the first output block in v12, and then
// selects the .L192_dec_blocks_more_than_N entry matching the number of bytes left (x5).
// Every comparison that is NOT taken shifts the still-unused keystream registers up by one
// (so the block held in v1 lands in the register the chosen entry point consumes) and
// subtracts the increment v31 from the counter v30, since one fewer keystream block is used.
// NOTE(review): x4 (presumably end_input_ptr, as computed in the enc_256 prologue later in
// this file) and the contents of v0-v7 / v19 / v26 are set up before this label - confirm
// for any entry path other than the fall-through from the code directly above.
5093.L192_dec_tail:	//TAIL
5094
5095	sub	x5, x4, x0 	//x5 (previously main_end_input_ptr) = x4 - x0 = number of bytes left to process
5096
5097	ldp	q20, q21, [x3, #128]			//load h5l | h5h (q20) and h6k | h5k (q21)
5098	ext	v20.16b, v20.16b, v20.16b, #8
5099	ldr	q9, [x0], #16				//AES block 8k+8 - load ciphertext
5100
5101	ldp	q24, q25, [x3, #192]			//load h8k | h7k (q24) and h8l | h8h (q25)
5102	ext	v25.16b, v25.16b, v25.16b, #8
5103
	//v29 = copy of v26; every tail eor3 below XORs v29 into the result together with a keystream block
	//(on the fall-through path above v26 was loaded with rk12)
5104	mov	v29.16b, v26.16b
5105
5106	ldp	q22, q23, [x3, #160]			//load h6l | h6h (q22) and h7l | h7h (q23)
5107	ext	v22.16b, v22.16b, v22.16b, #8
5108	ext	v23.16b, v23.16b, v23.16b, #8
5109	ext	v16.16b, v19.16b, v19.16b, #8				//prepare final partial tag
5110
5111.inst	0xce00752c	//eor3 v12.16b, v9.16b, v0.16b, v29.16b				//AES block 8k+8 - result
5112	cmp	x5, #112
5113	b.gt	.L192_dec_blocks_more_than_7
5114
	//112 bytes or fewer left: the GHASH accumulators v17 (high), v18 (mid), v19 (low) start from zero here;
	//the more_than_7 entry instead initialises them directly from its pmull results
5115	mov	v7.16b, v6.16b
5116	movi	v17.8b, #0
5117	sub	v30.4s, v30.4s, v31.4s
5118
5119	mov	v6.16b, v5.16b
5120	mov	v5.16b, v4.16b
5121	mov	v4.16b, v3.16b
5122
5123	cmp	x5, #96
5124	movi	v19.8b, #0
5125	mov	v3.16b, v2.16b
5126
5127	mov	v2.16b, v1.16b
5128	movi	v18.8b, #0
5129	b.gt	.L192_dec_blocks_more_than_6
5130
	//96 bytes or fewer left: shift again; v1 is never overwritten, so it is copied into the next entry register
5131	mov	v7.16b, v6.16b
5132	mov	v6.16b, v5.16b
5133	mov	v5.16b, v4.16b
5134
5135	mov	v4.16b, v3.16b
5136	mov	v3.16b, v1.16b
5137
5138	sub	v30.4s, v30.4s, v31.4s
5139	cmp	x5, #80
5140	b.gt	.L192_dec_blocks_more_than_5
5141
5142	mov	v7.16b, v6.16b
5143	mov	v6.16b, v5.16b
5144
5145	mov	v5.16b, v4.16b
5146	mov	v4.16b, v1.16b
5147	cmp	x5, #64
5148
5149	sub	v30.4s, v30.4s, v31.4s
5150	b.gt	.L192_dec_blocks_more_than_4
5151
5152	sub	v30.4s, v30.4s, v31.4s
5153	mov	v7.16b, v6.16b
5154	mov	v6.16b, v5.16b
5155
5156	mov	v5.16b, v1.16b
5157	cmp	x5, #48
5158	b.gt	.L192_dec_blocks_more_than_3
5159
5160	sub	v30.4s, v30.4s, v31.4s
5161	mov	v7.16b, v6.16b
5162	cmp	x5, #32
5163
5164	mov	v6.16b, v1.16b
5165	ldr	q24, [x3, #96]				//load h4k | h3k (needed by more_than_2, which skips the reload done in more_than_3)
5166	b.gt	.L192_dec_blocks_more_than_2
5167
5168	sub	v30.4s, v30.4s, v31.4s
5169
5170	mov	v7.16b, v1.16b
5171	cmp	x5, #16
5172	b.gt	.L192_dec_blocks_more_than_1
5173
5174	sub	v30.4s, v30.4s, v31.4s
5175	ldr	q21, [x3, #48]				//load h2k | h1k (needed by less_than_1, which skips the reload done in more_than_1)
5176	b	.L192_dec_blocks_less_than_1
// More than 112 bytes left: GHASH the ciphertext block in v9 ("final-7") with v25 (low/high)
// and v24.d[1] (mid), writing the accumulators v17 (high), v18 (mid), v19 (low) directly
// rather than XOR-accumulating. Also stores the finished output block v12, loads the next
// ciphertext block and forms the next result from keystream v1. Falls through to more_than_6.
5177.L192_dec_blocks_more_than_7:	//blocks	left >  7
5178	rev64	v8.16b, v9.16b						//GHASH final-7 block
5179
5180	ins	v18.d[0], v24.d[1]					//GHASH final-7 block - mid
5181	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
5182
5183	pmull2	v17.1q, v8.2d, v25.2d				//GHASH final-7 block - high
5184	ins	v27.d[0], v8.d[1]					//GHASH final-7 block - mid
5185	ldr	q9, [x0], #16				//AES final-6 block - load ciphertext
5186
5187	pmull	v19.1q, v8.1d, v25.1d				//GHASH final-7 block - low
5188
5189	eor	v27.8b, v27.8b, v8.8b				//GHASH final-7 block - mid
5190	st1	{ v12.16b}, [x2], #16			 	//AES final-7 block  - store result
5191
5192.inst	0xce01752c	//eor3 v12.16b, v9.16b, v1.16b, v29.16b				//AES final-6 block - result
5193
5194	pmull	v18.1q, v27.1d, v18.1d			 	//GHASH final-7 block - mid
5195	movi	v16.8b, #0						//suppress further partial tag feed in
// More than 96 bytes left: GHASH the ciphertext block in v9 ("final-6") with v23 (low/high)
// and the low half of v24 (mid), XOR-accumulating into v17/v18/v19. Stores the finished
// output block v12, loads the next ciphertext block and forms the next result from
// keystream v2. v16 is zero unless this is the first block hashed in the tail.
// Falls through to more_than_5.
5196.L192_dec_blocks_more_than_6:	//blocks	left >  6
5197
5198	rev64	v8.16b, v9.16b						//GHASH final-6 block
5199
5200	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
5201
5202	ldr	q9, [x0], #16				//AES final-5 block - load ciphertext
5203	ins	v27.d[0], v8.d[1]					//GHASH final-6 block - mid
5204
5205	eor	v27.8b, v27.8b, v8.8b				//GHASH final-6 block - mid
5206	movi	v16.8b, #0						//suppress further partial tag feed in
5207	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-6 block - high
5208
5209	st1	{ v12.16b}, [x2], #16			 	//AES final-6 block - store result
5210.inst	0xce02752c	//eor3 v12.16b, v9.16b, v2.16b, v29.16b				//AES final-5 block - result
5211
5212	eor	v17.16b, v17.16b, v28.16b					//GHASH final-6 block - high
5213	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-6 block - mid
5214	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-6 block - low
5215
5216	eor	v18.16b, v18.16b, v27.16b				//GHASH final-6 block - mid
5217	eor	v19.16b, v19.16b, v26.16b					//GHASH final-6 block - low
// More than 80 bytes left: GHASH the ciphertext block in v9 ("final-5") with v22 (low/high)
// and the high half of v21 (mid, via pmull2 after duplicating the folded value into
// v27.d[1]), XOR-accumulating into v17/v18/v19. Stores the finished output block v12, loads
// the next ciphertext block and forms the next result from keystream v3.
// Falls through to more_than_4.
5218.L192_dec_blocks_more_than_5:	//blocks	left >  5
5219
5220	rev64	v8.16b, v9.16b						//GHASH final-5 block
5221
5222	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
5223
5224	ins	v27.d[0], v8.d[1]					//GHASH final-5 block - mid
5225
5226	eor	v27.8b, v27.8b, v8.8b				//GHASH final-5 block - mid
5227
5228	ins	v27.d[1], v27.d[0]					//GHASH final-5 block - mid
5229	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-5 block - high
5230
5231	ldr	q9, [x0], #16				//AES final-4 block - load ciphertext
5232
5233	eor	v17.16b, v17.16b, v28.16b					//GHASH final-5 block - high
5234	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-5 block - low
5235
5236	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-5 block - mid
5237
5238	eor	v19.16b, v19.16b, v26.16b					//GHASH final-5 block - low
5239	movi	v16.8b, #0						//suppress further partial tag feed in
5240	st1	{ v12.16b}, [x2], #16			 	//AES final-5 block - store result
5241
5242	eor	v18.16b, v18.16b, v27.16b				//GHASH final-5 block - mid
5243.inst	0xce03752c	//eor3 v12.16b, v9.16b, v3.16b, v29.16b				//AES final-4 block - result
// More than 64 bytes left: GHASH the ciphertext block in v9 ("final-4") with v20 (low/high)
// and the low half of v21 (mid), XOR-accumulating into v17/v18/v19. Stores the finished
// output block v12, loads the next ciphertext block and forms the next result from
// keystream v4. Falls through to more_than_3.
5244.L192_dec_blocks_more_than_4:	//blocks	left >  4
5245
5246	rev64	v8.16b, v9.16b						//GHASH final-4 block
5247
5248	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
5249	movi	v16.8b, #0						//suppress further partial tag feed in
5250
5251	ldr	q9, [x0], #16				//AES final-3 block - load ciphertext
5252	ins	v27.d[0], v8.d[1]					//GHASH final-4 block - mid
5253	pmull	v26.1q, v8.1d, v20.1d				//GHASH final-4 block - low
5254
5255	eor	v27.8b, v27.8b, v8.8b				//GHASH final-4 block - mid
5256
5257	eor	v19.16b, v19.16b, v26.16b					//GHASH final-4 block - low
5258
5259	pmull	v27.1q, v27.1d, v21.1d				//GHASH final-4 block - mid
5260	st1	{ v12.16b}, [x2], #16			 	//AES final-4 block - store result
5261	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final-4 block - high
5262
5263.inst	0xce04752c	//eor3 v12.16b, v9.16b, v4.16b, v29.16b				//AES final-3 block - result
5264
5265	eor	v18.16b, v18.16b, v27.16b				//GHASH final-4 block - mid
5266	eor	v17.16b, v17.16b, v28.16b					//GHASH final-4 block - high
// More than 48 bytes left: reloads v25 from [x3, #112] and v24 from [x3, #96] (overwriting the
// values loaded at the tail entry), then GHASHes the ciphertext block in v9 ("final-3") with
// v25 (low/high) and the high half of v24 (mid, via pmull2), XOR-accumulating into
// v17/v18/v19. Stores the finished output block v12, loads the next ciphertext block and
// forms the next result from keystream v5. Falls through to more_than_2.
5267.L192_dec_blocks_more_than_3:	//blocks	left >  3
5268
5269	ldr	q25, [x3, #112]				//load h4l | h4h
5270	ext	v25.16b, v25.16b, v25.16b, #8
5271	rev64	v8.16b, v9.16b						//GHASH final-3 block
5272	ldr	q9, [x0], #16				//AES final-2 block - load ciphertext
5273
5274	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
5275
5276	ins	v27.d[0], v8.d[1]					//GHASH final-3 block - mid
5277	pmull2	v28.1q, v8.2d, v25.2d				//GHASH final-3 block - high
5278
5279	eor	v17.16b, v17.16b, v28.16b					//GHASH final-3 block - high
5280	movi	v16.8b, #0						//suppress further partial tag feed in
5281	pmull	v26.1q, v8.1d, v25.1d				//GHASH final-3 block - low
5282
5283	st1	{ v12.16b}, [x2], #16			 	//AES final-3 block - store result
5284	eor	v27.8b, v27.8b, v8.8b				//GHASH final-3 block - mid
5285.inst	0xce05752c	//eor3 v12.16b, v9.16b, v5.16b, v29.16b				//AES final-2 block - result
5286
5287	eor	v19.16b, v19.16b, v26.16b					//GHASH final-3 block - low
5288	ldr	q24, [x3, #96]				//load h4k | h3k
5289
5290	ins	v27.d[1], v27.d[0]					//GHASH final-3 block - mid
5291
5292	pmull2	v27.1q, v27.2d, v24.2d				//GHASH final-3 block - mid
5293
5294	eor	v18.16b, v18.16b, v27.16b				//GHASH final-3 block - mid
// More than 32 bytes left: loads v23 from [x3, #80], then GHASHes the ciphertext block in v9
// ("final-2") with v23 (low/high) and the low half of v24 (mid; v24 = [x3, #96], loaded either
// in more_than_3 or by the tail dispatch), XOR-accumulating into v17/v18/v19. Stores the
// finished output block v12, loads the next ciphertext block and forms the next result from
// keystream v6. Falls through to more_than_1.
5295.L192_dec_blocks_more_than_2:	//blocks	left >  2
5296
5297	rev64	v8.16b, v9.16b						//GHASH final-2 block
5298	ldr	q23, [x3, #80]				//load h3l | h3h
5299	ext	v23.16b, v23.16b, v23.16b, #8
5300
5301	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
5302
5303	ins	v27.d[0], v8.d[1]					//GHASH final-2 block - mid
5304	ldr	q9, [x0], #16				//AES final-1 block - load ciphertext
5305
5306	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-2 block - high
5307
5308	eor	v27.8b, v27.8b, v8.8b				//GHASH final-2 block - mid
5309
5310	eor	v17.16b, v17.16b, v28.16b					//GHASH final-2 block - high
5311	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-2 block - low
5312
5313	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-2 block - mid
5314	movi	v16.8b, #0						//suppress further partial tag feed in
5315
5316	eor	v19.16b, v19.16b, v26.16b					//GHASH final-2 block - low
5317	st1	{ v12.16b}, [x2], #16			 	//AES final-2 block - store result
5318
5319	eor	v18.16b, v18.16b, v27.16b				//GHASH final-2 block - mid
5320.inst	0xce06752c	//eor3 v12.16b, v9.16b, v6.16b, v29.16b				//AES final-1 block - result
// More than 16 bytes left: loads v22 from [x3, #64] and v21 from [x3, #48], then GHASHes the
// ciphertext block in v9 ("final-1") with v22 (low/high) and the high half of v21 (mid, via
// pmull2), XOR-accumulating into v17/v18/v19. Stores the finished output block v12, loads the
// last (possibly partial) ciphertext block as a full 16 bytes and forms its result from
// keystream v7. Falls through to less_than_1, which masks and stores that last block.
5321.L192_dec_blocks_more_than_1:	//blocks	left >  1
5322
5323	rev64	v8.16b, v9.16b						//GHASH final-1 block
5324	ldr	q9, [x0], #16				//AES final block - load ciphertext
5325	ldr	q22, [x3, #64]				//load h2l | h2h (comment previously read "h1l | h1h"; h1 is loaded from #32 and h3 from #80 elsewhere in this file, so #64 is presumably h2)
5326	ext	v22.16b, v22.16b, v22.16b, #8
5327
5328	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
5329	movi	v16.8b, #0						//suppress further partial tag feed in
5330	ldr	q21, [x3, #48]				//load h2k | h1k
5331
5332	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-1 block - low
5333	ins	v27.d[0], v8.d[1]					//GHASH final-1 block - mid
5334	st1	{ v12.16b}, [x2], #16			 	//AES final-1 block - store result
5335
5336	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-1 block - high
5337
5338.inst	0xce07752c	//eor3 v12.16b, v9.16b, v7.16b, v29.16b				//AES final block - result
5339
5340	eor	v27.8b, v27.8b, v8.8b				//GHASH final-1 block - mid
5341
5342	ins	v27.d[1], v27.d[0]					//GHASH final-1 block - mid
5343
5344	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-1 block - mid
5345
5346	eor	v19.16b, v19.16b, v26.16b					//GHASH final-1 block - low
5347
5348	eor	v18.16b, v18.16b, v27.16b				//GHASH final-1 block - mid
5349	eor	v17.16b, v17.16b, v28.16b					//GHASH final-1 block - high
// Last block (possibly partial) plus function epilogue:
//  1. writes the counter v30 (converted back with rev32) to [x16];
//  2. builds a 128-bit mask in v0 whose set bits cover the valid bits of the last block,
//     derived from x1 (bit length) modulo 128;
//  3. masks the ciphertext block v9 for hashing, and merges the output block v12 with the
//     16 bytes already present at [x2] so bytes past the valid length are written back unchanged;
//  4. GHASHes the masked block with v20 (= [x3, #32]) and the low half of v21, reduces the
//     accumulators v17/v18/v19 using the constant loaded from [x10], and stores the result to [x3];
//  5. returns x9 in x0, restores d8-d15 and releases the 80-byte stack frame.
// NOTE(review): the ld1/st1 at [x2] always touch a full 16 bytes even when the last block is
// shorter, and the preceding ldr q9 from the input is likewise a full 16-byte load - confirm
// the callers guarantee that memory is accessible.
// NOTE(review): x9 (presumably the byte length), x10 (presumably the address of the stacked
// 0xc200000000000000 constant) and x16 (presumably the counter pointer) are set up before this
// view begins for the dec_192 kernel; the enc_256 prologue later in this file sets them that way.
5350.L192_dec_blocks_less_than_1:	//blocks	left <= 1
5351
5352	rev32	v30.16b, v30.16b
5353	and	x1, x1, #127				//bit_length %= 128
5354
5355	sub	x1, x1, #128				//bit_length -= 128
5356	str	q30, [x16]					//store the updated counter
5357
5358	neg	x1, x1				//bit_length = 128 - #bits in input (in range [1,128])
5359	mvn	x6, xzr						//temp0_x = 0xffffffffffffffff
5360
5361	and	x1, x1, #127				//bit_length %= 128
5362
5363	mvn	x7, xzr						//temp1_x = 0xffffffffffffffff
5364	lsr	x6, x6, x1				//temp0_x is mask for top 64b of last block
5365	cmp	x1, #64
5366
	//x1 < 64: low half of the mask is all ones, high half is x6; otherwise low half is x6 and high half is zero
5367	csel	x13, x7, x6, lt
5368	csel	x14, x6, xzr, lt
5369	ldr	q20, [x3, #32]				//load h1l | h1h
5370	ext	v20.16b, v20.16b, v20.16b, #8
5371
5372	mov	v0.d[1], x14
5373	ld1	{ v26.16b}, [x2]					//load existing bytes where the possibly partial last block is to be stored
5374
5375	mov	v0.d[0], x13					//ctr0b is mask for last block
5376
5377	and	v9.16b, v9.16b, v0.16b					//possibly partial last block has zeroes in highest bits
5378	bif	v12.16b, v26.16b, v0.16b					//insert existing bytes in top end of result before storing
5379
5380	rev64	v8.16b, v9.16b						//GHASH final block
5381
5382	st1	{ v12.16b}, [x2]				//store all 16B
5383
5384	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
5385
5386	ins	v16.d[0], v8.d[1]					//GHASH final block - mid
5387	pmull	v26.1q, v8.1d, v20.1d				//GHASH final block - low
5388
5389	eor	v16.8b, v16.8b, v8.8b				//GHASH final block - mid
5390	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final block - high
5391	eor	v19.16b, v19.16b, v26.16b					//GHASH final block - low
5392
5393	pmull	v16.1q, v16.1d, v21.1d				//GHASH final block - mid
5394	eor	v17.16b, v17.16b, v28.16b					//GHASH final block - high
5395
5396	eor	v14.16b, v17.16b, v19.16b				//MODULO - karatsuba tidy up
5397	eor	v18.16b, v18.16b, v16.16b				//GHASH final block - mid
5398	ldr	d16, [x10]			//MODULO - load modulo constant
5399
5400	pmull	v21.1q, v17.1d, v16.1d			//MODULO - top 64b align with mid
5401	ext	v17.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment
5402
5403	eor	v18.16b, v18.16b, v14.16b				//MODULO - karatsuba tidy up
5404
5405.inst	0xce115652	//eor3 v18.16b, v18.16b, v17.16b, v21.16b			//MODULO - fold into mid
5406
5407	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
5408	ext	v18.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment
5409
5410.inst	0xce124673	//eor3 v19.16b, v19.16b, v18.16b, v17.16b			//MODULO - fold into low
	//swap the 64-bit halves and byte-reverse each half before writing the reduced value back to [x3]
5411	ext	v19.16b, v19.16b, v19.16b, #8
5412	rev64	v19.16b, v19.16b
5413	st1	{ v19.16b }, [x3]
5414
5415	mov	x0, x9				//return value = x9
5416
	//restore the registers saved in the 80-byte frame and pop it
5417	ldp	d10, d11, [sp, #16]
5418	ldp	d12, d13, [sp, #32]
5419	ldp	d14, d15, [sp, #48]
5420	ldp	d8, d9, [sp], #80
5421	ret
5422
// Early-exit path: returns 0 without touching the stack or any vector register.
// NOTE(review): the branch to this label lies outside this view; presumably it is the entry
// check on a zero bit length, mirroring "cbz x1, .L256_enc_ret" in the enc_256 kernel below.
5423.L192_dec_ret:
5424	mov	w0, #0x0
5425	ret
5426.size	unroll8_eor3_aes_gcm_dec_192_kernel,.-unroll8_eor3_aes_gcm_dec_192_kernel
5427.globl	unroll8_eor3_aes_gcm_enc_256_kernel
5428.type	unroll8_eor3_aes_gcm_enc_256_kernel,%function
5429.align	4
5430unroll8_eor3_aes_gcm_enc_256_kernel:
5431	AARCH64_VALID_CALL_TARGET
5432	cbz	x1, .L256_enc_ret
5433	stp	d8, d9, [sp, #-80]!
5434	lsr	x9, x1, #3
5435	mov	x16, x4
5436	mov	x8, x5
5437	stp	d10, d11, [sp, #16]
5438	stp	d12, d13, [sp, #32]
5439	stp	d14, d15, [sp, #48]
5440	mov	x5, #0xc200000000000000
5441	stp	x5, xzr, [sp, #64]
5442	add	x10, sp, #64
5443
5444	ld1	{ v0.16b}, [x16]					//CTR block 0
5445
5446	mov	x5, x9
5447
5448	mov	x15, #0x100000000			//set up counter increment
5449	movi	v31.16b, #0x0
5450	mov	v31.d[1], x15
5451	sub	x5, x5, #1		//byte_len - 1
5452
5453	and	x5, x5, #0xffffffffffffff80	//number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
5454
5455	add	x5, x5, x0
5456
5457	rev32	v30.16b, v0.16b				//set up reversed counter
5458
5459	add	v30.4s, v30.4s, v31.4s		//CTR block 0
5460
5461	rev32	v1.16b, v30.16b				//CTR block 1
5462	add	v30.4s, v30.4s, v31.4s		//CTR block 1
5463
5464	rev32	v2.16b, v30.16b				//CTR block 2
5465	add	v30.4s, v30.4s, v31.4s		//CTR block 2
5466
5467	rev32	v3.16b, v30.16b				//CTR block 3
5468	add	v30.4s, v30.4s, v31.4s		//CTR block 3
5469
5470	rev32	v4.16b, v30.16b				//CTR block 4
5471	add	v30.4s, v30.4s, v31.4s		//CTR block 4
5472
5473	rev32	v5.16b, v30.16b				//CTR block 5
5474	add	v30.4s, v30.4s, v31.4s		//CTR block 5
5475	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
5476
5477	rev32	v6.16b, v30.16b				//CTR block 6
5478	add	v30.4s, v30.4s, v31.4s		//CTR block 6
5479
5480	rev32	v7.16b, v30.16b				//CTR block 7
5481
5482	aese	v3.16b, v26.16b
5483	aesmc	v3.16b, v3.16b			//AES block 3 - round 0
5484	aese	v4.16b, v26.16b
5485	aesmc	v4.16b, v4.16b			//AES block 4 - round 0
5486	aese	v2.16b, v26.16b
5487	aesmc	v2.16b, v2.16b			//AES block 2 - round 0
5488
5489	aese	v0.16b, v26.16b
5490	aesmc	v0.16b, v0.16b			//AES block 0 - round 0
5491	aese	v1.16b, v26.16b
5492	aesmc	v1.16b, v1.16b			//AES block 1 - round 0
5493	aese	v6.16b, v26.16b
5494	aesmc	v6.16b, v6.16b			//AES block 6 - round 0
5495
5496	aese	v5.16b, v26.16b
5497	aesmc	v5.16b, v5.16b			//AES block 5 - round 0
5498	aese	v7.16b, v26.16b
5499	aesmc	v7.16b, v7.16b			//AES block 7 - round 0
5500	ldp	q28, q26, [x8, #32]				//load rk2, rk3
5501
5502	aese	v4.16b, v27.16b
5503	aesmc	v4.16b, v4.16b			//AES block 4 - round 1
5504	aese	v1.16b, v27.16b
5505	aesmc	v1.16b, v1.16b			//AES block 1 - round 1
5506	aese	v3.16b, v27.16b
5507	aesmc	v3.16b, v3.16b			//AES block 3 - round 1
5508
5509	aese	v6.16b, v27.16b
5510	aesmc	v6.16b, v6.16b			//AES block 6 - round 1
5511	aese	v5.16b, v27.16b
5512	aesmc	v5.16b, v5.16b			//AES block 5 - round 1
5513
5514	aese	v2.16b, v27.16b
5515	aesmc	v2.16b, v2.16b			//AES block 2 - round 1
5516
5517	aese	v7.16b, v27.16b
5518	aesmc	v7.16b, v7.16b			//AES block 7 - round 1
5519
5520	aese	v2.16b, v28.16b
5521	aesmc	v2.16b, v2.16b			//AES block 2 - round 2
5522	aese	v3.16b, v28.16b
5523	aesmc	v3.16b, v3.16b			//AES block 3 - round 2
5524	aese	v0.16b, v27.16b
5525	aesmc	v0.16b, v0.16b			//AES block 0 - round 1
5526
5527	aese	v7.16b, v28.16b
5528	aesmc	v7.16b, v7.16b			//AES block 7 - round 2
5529	aese	v6.16b, v28.16b
5530	aesmc	v6.16b, v6.16b			//AES block 6 - round 2
5531	aese	v5.16b, v28.16b
5532	aesmc	v5.16b, v5.16b			//AES block 5 - round 2
5533
5534	aese	v4.16b, v28.16b
5535	aesmc	v4.16b, v4.16b			//AES block 4 - round 2
5536	aese	v0.16b, v28.16b
5537	aesmc	v0.16b, v0.16b			//AES block 0 - round 2
5538	aese	v1.16b, v28.16b
5539	aesmc	v1.16b, v1.16b			//AES block 1 - round 2
5540
5541	aese	v5.16b, v26.16b
5542	aesmc	v5.16b, v5.16b			//AES block 5 - round 3
5543	aese	v3.16b, v26.16b
5544	aesmc	v3.16b, v3.16b			//AES block 3 - round 3
5545	ldp	q27, q28, [x8, #64]				//load rk4, rk5
5546
5547	aese	v4.16b, v26.16b
5548	aesmc	v4.16b, v4.16b			//AES block 4 - round 3
5549
5550	aese	v1.16b, v26.16b
5551	aesmc	v1.16b, v1.16b			//AES block 1 - round 3
5552	aese	v6.16b, v26.16b
5553	aesmc	v6.16b, v6.16b			//AES block 6 - round 3
5554	aese	v7.16b, v26.16b
5555	aesmc	v7.16b, v7.16b			//AES block 7 - round 3
5556
5557	aese	v2.16b, v26.16b
5558	aesmc	v2.16b, v2.16b			//AES block 2 - round 3
5559	aese	v0.16b, v26.16b
5560	aesmc	v0.16b, v0.16b			//AES block 0 - round 3
5561
5562	aese	v4.16b, v27.16b
5563	aesmc	v4.16b, v4.16b			//AES block 4 - round 4
5564	aese	v6.16b, v27.16b
5565	aesmc	v6.16b, v6.16b			//AES block 6 - round 4
5566	aese	v1.16b, v27.16b
5567	aesmc	v1.16b, v1.16b			//AES block 1 - round 4
5568
5569	aese	v2.16b, v27.16b
5570	aesmc	v2.16b, v2.16b			//AES block 2 - round 4
5571	aese	v0.16b, v27.16b
5572	aesmc	v0.16b, v0.16b			//AES block 0 - round 4
5573
5574	aese	v3.16b, v27.16b
5575	aesmc	v3.16b, v3.16b			//AES block 3 - round 4
5576	aese	v7.16b, v27.16b
5577	aesmc	v7.16b, v7.16b			//AES block 7 - round 4
5578	aese	v5.16b, v27.16b
5579	aesmc	v5.16b, v5.16b			//AES block 5 - round 4
5580
5581	aese	v0.16b, v28.16b
5582	aesmc	v0.16b, v0.16b			//AES block 0 - round 5
5583	aese	v2.16b, v28.16b
5584	aesmc	v2.16b, v2.16b			//AES block 2 - round 5
5585	ldp	q26, q27, [x8, #96]				//load rk6, rk7
5586
5587	aese	v1.16b, v28.16b
5588	aesmc	v1.16b, v1.16b			//AES block 1 - round 5
5589	aese	v4.16b, v28.16b
5590	aesmc	v4.16b, v4.16b			//AES block 4 - round 5
5591	aese	v5.16b, v28.16b
5592	aesmc	v5.16b, v5.16b			//AES block 5 - round 5
5593
5594	aese	v3.16b, v28.16b
5595	aesmc	v3.16b, v3.16b			//AES block 3 - round 5
5596	aese	v6.16b, v28.16b
5597	aesmc	v6.16b, v6.16b			//AES block 6 - round 5
5598	aese	v7.16b, v28.16b
5599	aesmc	v7.16b, v7.16b			//AES block 7 - round 5
5600
5601	aese	v1.16b, v26.16b
5602	aesmc	v1.16b, v1.16b			//AES block 1 - round 6
5603	aese	v5.16b, v26.16b
5604	aesmc	v5.16b, v5.16b			//AES block 5 - round 6
5605	aese	v4.16b, v26.16b
5606	aesmc	v4.16b, v4.16b			//AES block 4 - round 6
5607
5608	aese	v2.16b, v26.16b
5609	aesmc	v2.16b, v2.16b			//AES block 2 - round 6
5610	aese	v6.16b, v26.16b
5611	aesmc	v6.16b, v6.16b			//AES block 6 - round 6
5612	aese	v0.16b, v26.16b
5613	aesmc	v0.16b, v0.16b			//AES block 0 - round 6
5614
5615	aese	v7.16b, v26.16b
5616	aesmc	v7.16b, v7.16b			//AES block 7 - round 6
5617	aese	v3.16b, v26.16b
5618	aesmc	v3.16b, v3.16b			//AES block 3 - round 6
5619	ldp	q28, q26, [x8, #128]				//load rk8, rk9
5620
5621	aese	v2.16b, v27.16b
5622	aesmc	v2.16b, v2.16b			//AES block 2 - round 7
5623	aese	v0.16b, v27.16b
5624	aesmc	v0.16b, v0.16b			//AES block 0 - round 7
5625
5626	aese	v7.16b, v27.16b
5627	aesmc	v7.16b, v7.16b			//AES block 7 - round 7
5628	aese	v6.16b, v27.16b
5629	aesmc	v6.16b, v6.16b			//AES block 6 - round 7
5630	aese	v1.16b, v27.16b
5631	aesmc	v1.16b, v1.16b			//AES block 1 - round 7
5632
5633	aese	v5.16b, v27.16b
5634	aesmc	v5.16b, v5.16b			//AES block 5 - round 7
5635	aese	v3.16b, v27.16b
5636	aesmc	v3.16b, v3.16b			//AES block 3 - round 7
5637
5638	aese	v4.16b, v27.16b
5639	aesmc	v4.16b, v4.16b			//AES block 4 - round 7
5640
5641	aese	v6.16b, v28.16b
5642	aesmc	v6.16b, v6.16b			//AES block 6 - round 8
5643	aese	v1.16b, v28.16b
5644	aesmc	v1.16b, v1.16b			//AES block 1 - round 8
5645
5646	aese	v3.16b, v28.16b
5647	aesmc	v3.16b, v3.16b			//AES block 3 - round 8
5648	aese	v0.16b, v28.16b
5649	aesmc	v0.16b, v0.16b			//AES block 0 - round 8
5650	aese	v7.16b, v28.16b
5651	aesmc	v7.16b, v7.16b			//AES block 7 - round 8
5652
5653	aese	v5.16b, v28.16b
5654	aesmc	v5.16b, v5.16b			//AES block 5 - round 8
5655	aese	v4.16b, v28.16b
5656	aesmc	v4.16b, v4.16b			//AES block 4 - round 8
5657	aese	v2.16b, v28.16b
5658	aesmc	v2.16b, v2.16b			//AES block 2 - round 8
5659
5660	ld1	{ v19.16b}, [x3]
5661	ext	v19.16b, v19.16b, v19.16b, #8
5662	rev64	v19.16b, v19.16b
5663	ldp	q27, q28, [x8, #160]				//load rk10, rk11
5664
5665	aese	v6.16b, v26.16b
5666	aesmc	v6.16b, v6.16b			//AES block 6 - round 9
5667	aese	v7.16b, v26.16b
5668	aesmc	v7.16b, v7.16b			//AES block 7 - round 9
5669	aese	v3.16b, v26.16b
5670	aesmc	v3.16b, v3.16b			//AES block 3 - round 9
5671
5672	aese	v4.16b, v26.16b
5673	aesmc	v4.16b, v4.16b			//AES block 4 - round 9
5674	aese	v5.16b, v26.16b
5675	aesmc	v5.16b, v5.16b			//AES block 5 - round 9
5676	aese	v2.16b, v26.16b
5677	aesmc	v2.16b, v2.16b			//AES block 2 - round 9
5678
5679	aese	v1.16b, v26.16b
5680	aesmc	v1.16b, v1.16b			//AES block 1 - round 9
5681
5682	aese	v7.16b, v27.16b
5683	aesmc	v7.16b, v7.16b			//AES block 7 - round 10
5684	aese	v4.16b, v27.16b
5685	aesmc	v4.16b, v4.16b			//AES block 4 - round 10
5686	aese	v0.16b, v26.16b
5687	aesmc	v0.16b, v0.16b			//AES block 0 - round 9
5688
5689	aese	v1.16b, v27.16b
5690	aesmc	v1.16b, v1.16b			//AES block 1 - round 10
5691	aese	v5.16b, v27.16b
5692	aesmc	v5.16b, v5.16b			//AES block 5 - round 10
5693	aese	v3.16b, v27.16b
5694	aesmc	v3.16b, v3.16b			//AES block 3 - round 10
5695
5696	aese	v2.16b, v27.16b
5697	aesmc	v2.16b, v2.16b			//AES block 2 - round 10
5698	aese	v0.16b, v27.16b
5699	aesmc	v0.16b, v0.16b			//AES block 0 - round 10
5700	aese	v6.16b, v27.16b
5701	aesmc	v6.16b, v6.16b			//AES block 6 - round 10
5702
5703	aese	v4.16b, v28.16b
5704	aesmc	v4.16b, v4.16b			//AES block 4 - round 11
5705	ldp	q26, q27, [x8, #192]				//load rk12, rk13
5706	aese	v5.16b, v28.16b
5707	aesmc	v5.16b, v5.16b			//AES block 5 - round 11
5708
5709	aese	v2.16b, v28.16b
5710	aesmc	v2.16b, v2.16b			//AES block 2 - round 11
5711	aese	v6.16b, v28.16b
5712	aesmc	v6.16b, v6.16b			//AES block 6 - round 11
5713	aese	v1.16b, v28.16b
5714	aesmc	v1.16b, v1.16b			//AES block 1 - round 11
5715
5716	aese	v0.16b, v28.16b
5717	aesmc	v0.16b, v0.16b			//AES block 0 - round 11
5718	aese	v3.16b, v28.16b
5719	aesmc	v3.16b, v3.16b			//AES block 3 - round 11
5720	aese	v7.16b, v28.16b
5721	aesmc	v7.16b, v7.16b			//AES block 7 - round 11
5722
5723	add	v30.4s, v30.4s, v31.4s		//CTR block 7
5724	ldr	q28, [x8, #224]					//load rk14
5725
5726	aese	v4.16b, v26.16b
5727	aesmc	v4.16b, v4.16b			//AES block 4 - round 12
5728	aese	v2.16b, v26.16b
5729	aesmc	v2.16b, v2.16b			//AES block 2 - round 12
5730	aese	v1.16b, v26.16b
5731	aesmc	v1.16b, v1.16b			//AES block 1 - round 12
5732
5733	aese	v0.16b, v26.16b
5734	aesmc	v0.16b, v0.16b			//AES block 0 - round 12
5735	aese	v5.16b, v26.16b
5736	aesmc	v5.16b, v5.16b			//AES block 5 - round 12
5737	aese	v3.16b, v26.16b
5738	aesmc	v3.16b, v3.16b			//AES block 3 - round 12
5739
5740	aese	v2.16b, v27.16b						//AES block 2 - round 13
5741	aese	v1.16b, v27.16b						//AES block 1 - round 13
5742	aese	v4.16b, v27.16b						//AES block 4 - round 13
5743
5744	aese	v6.16b, v26.16b
5745	aesmc	v6.16b, v6.16b			//AES block 6 - round 12
5746	aese	v7.16b, v26.16b
5747	aesmc	v7.16b, v7.16b			//AES block 7 - round 12
5748
5749	aese	v0.16b, v27.16b						//AES block 0 - round 13
5750	aese	v5.16b, v27.16b						//AES block 5 - round 13
5751
5752	aese	v6.16b, v27.16b						//AES block 6 - round 13
5753	aese	v7.16b, v27.16b						//AES block 7 - round 13
5754	aese	v3.16b, v27.16b						//AES block 3 - round 13
5755
5756	add	x4, x0, x1, lsr #3		//end_input_ptr
5757	cmp	x0, x5				//check if we have <= 8 blocks
5758	b.ge	.L256_enc_tail						//handle tail
5759
5760	ldp	q8, q9, [x0], #32			//AES block 0, 1 - load plaintext
5761
5762	ldp	q10, q11, [x0], #32			//AES block 2, 3 - load plaintext
5763
5764.inst	0xce007108	//eor3 v8.16b, v8.16b, v0.16b, v28.16b				//AES block 0 - result
5765	rev32	v0.16b, v30.16b				//CTR block 8
5766	add	v30.4s, v30.4s, v31.4s		//CTR block 8
5767
5768.inst	0xce017129	//eor3 v9.16b, v9.16b, v1.16b, v28.16b				//AES block 1 - result
5769.inst	0xce03716b	//eor3 v11.16b, v11.16b, v3.16b, v28.16b				//AES block 3 - result
5770
5771	rev32	v1.16b, v30.16b				//CTR block 9
5772	add	v30.4s, v30.4s, v31.4s		//CTR block 9
5773	ldp	q12, q13, [x0], #32			//AES block 4, 5 - load plaintext
5774
5775	ldp	q14, q15, [x0], #32			//AES block 6, 7 - load plaintext
5776.inst	0xce02714a	//eor3 v10.16b, v10.16b, v2.16b, v28.16b				//AES block 2 - result
5777	cmp	x0, x5				//check if we have <= 8 blocks
5778
5779	rev32	v2.16b, v30.16b				//CTR block 10
5780	add	v30.4s, v30.4s, v31.4s		//CTR block 10
5781	stp	q8, q9, [x2], #32			//AES block 0, 1 - store result
5782
5783	stp	q10, q11, [x2], #32			//AES block 2, 3 - store result
5784
5785	rev32	v3.16b, v30.16b				//CTR block 11
5786	add	v30.4s, v30.4s, v31.4s		//CTR block 11
5787
5788.inst	0xce04718c	//eor3 v12.16b, v12.16b, v4.16b, v28.16b				//AES block 4 - result
5789
5790.inst	0xce0771ef	//eor3 v15.16b, v15.16b, v7.16b, v28.16b				//AES block 7 - result
5791.inst	0xce0671ce	//eor3 v14.16b, v14.16b, v6.16b, v28.16b				//AES block 6 - result
5792.inst	0xce0571ad	//eor3 v13.16b, v13.16b, v5.16b, v28.16b				//AES block 5 - result
5793
5794	stp	q12, q13, [x2], #32			//AES block 4, 5 - store result
5795	rev32	v4.16b, v30.16b				//CTR block 12
5796
5797	stp	q14, q15, [x2], #32			//AES block 6, 7 - store result
5798	add	v30.4s, v30.4s, v31.4s		//CTR block 12
5799	b.ge	.L256_enc_prepretail					//do prepretail
5800
5801.L256_enc_main_loop:	//main	loop start
5802	ldp	q26, q27, [x8, #0]					//load rk0, rk1
5803
5804	rev32	v5.16b, v30.16b				//CTR block 8k+13
5805	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
5806	ldr	q21, [x3, #144]				//load h6k | h5k
5807	ldr	q24, [x3, #192]				//load h8k | h7k
5808
5809	rev64	v11.16b, v11.16b						//GHASH block 8k+3
5810	ldr	q20, [x3, #128]				//load h5l | h5h
5811	ext	v20.16b, v20.16b, v20.16b, #8
5812	ldr	q22, [x3, #160]				//load h6l | h6h
5813	ext	v22.16b, v22.16b, v22.16b, #8
5814	rev64	v9.16b, v9.16b						//GHASH block 8k+1
5815
5816	rev32	v6.16b, v30.16b				//CTR block 8k+14
5817	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
5818	rev64	v8.16b, v8.16b						//GHASH block 8k
5819
5820	rev64	v12.16b, v12.16b						//GHASH block 8k+4
5821	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
5822	ldr	q23, [x3, #176]				//load h7l | h7h
5823	ext	v23.16b, v23.16b, v23.16b, #8
5824	ldr	q25, [x3, #208]				//load h8l | h8h
5825	ext	v25.16b, v25.16b, v25.16b, #8
5826
5827	aese	v3.16b, v26.16b
5828	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
5829	aese	v5.16b, v26.16b
5830	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
5831	rev32	v7.16b, v30.16b				//CTR block 8k+15
5832
5833	aese	v0.16b, v26.16b
5834	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
5835	aese	v1.16b, v26.16b
5836	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
5837	aese	v6.16b, v26.16b
5838	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
5839
5840	aese	v7.16b, v26.16b
5841	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
5842	aese	v2.16b, v26.16b
5843	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
5844	aese	v4.16b, v26.16b
5845	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
5846
5847	ldp	q28, q26, [x8, #32]				//load rk2, rk3
5848	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1
5849	aese	v6.16b, v27.16b
5850	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
5851
5852	aese	v2.16b, v27.16b
5853	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
5854	aese	v1.16b, v27.16b
5855	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
5856	aese	v0.16b, v27.16b
5857	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
5858
5859	aese	v4.16b, v27.16b
5860	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
5861	aese	v3.16b, v27.16b
5862	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
5863	aese	v5.16b, v27.16b
5864	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
5865
5866	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
5867	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
5868	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
5869
5870	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
5871	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
5872	aese	v7.16b, v27.16b
5873	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
5874
5875	aese	v1.16b, v28.16b
5876	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
5877	aese	v5.16b, v28.16b
5878	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
5879	aese	v6.16b, v28.16b
5880	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
5881
5882	aese	v2.16b, v28.16b
5883	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
5884	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
5885	aese	v4.16b, v28.16b
5886	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
5887
5888	aese	v5.16b, v26.16b
5889	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
5890	aese	v6.16b, v26.16b
5891	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
5892	aese	v0.16b, v28.16b
5893	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
5894
5895	aese	v1.16b, v26.16b
5896	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
5897	aese	v7.16b, v28.16b
5898	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
5899	aese	v3.16b, v28.16b
5900	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
5901
5902	aese	v4.16b, v26.16b
5903	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
5904	rev64	v14.16b, v14.16b						//GHASH block 8k+6
5905	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
5906
5907	aese	v3.16b, v26.16b
5908	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
5909	ldp	q27, q28, [x8, #64]				//load rk4, rk5
5910	rev64	v10.16b, v10.16b						//GHASH block 8k+2
5911
5912	aese	v2.16b, v26.16b
5913	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
5914	aese	v7.16b, v26.16b
5915	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
5916	aese	v0.16b, v26.16b
5917	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
5918
5919	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
5920	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
5921	rev64	v13.16b, v13.16b						//GHASH block 8k+5
5922
5923	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
5924	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
5925	ldr	q23, [x3, #80]				//load h3l | h3h
5926	ext	v23.16b, v23.16b, v23.16b, #8
5927	ldr	q25, [x3, #112]				//load h4l | h4h
5928	ext	v25.16b, v25.16b, v25.16b, #8
5929
5930	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
5931.inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
5932	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
5933
5934	aese	v4.16b, v27.16b
5935	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
5936	aese	v1.16b, v27.16b
5937	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
5938	aese	v5.16b, v27.16b
5939	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
5940
5941	aese	v7.16b, v27.16b
5942	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
5943	aese	v3.16b, v27.16b
5944	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
5945	aese	v2.16b, v27.16b
5946	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
5947
5948	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
5949	aese	v6.16b, v27.16b
5950	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
5951	aese	v0.16b, v27.16b
5952	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
5953
5954	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
5955	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
5956	ldp	q26, q27, [x8, #96]				//load rk6, rk7
5957
5958	aese	v5.16b, v28.16b
5959	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
5960	aese	v7.16b, v28.16b
5961	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
5962	aese	v4.16b, v28.16b
5963	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
5964
5965	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
5966	aese	v2.16b, v28.16b
5967	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
5968	rev64	v15.16b, v15.16b						//GHASH block 8k+7
5969
5970	aese	v3.16b, v28.16b
5971	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
5972	aese	v6.16b, v28.16b
5973	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
5974	aese	v1.16b, v28.16b
5975	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
5976
5977	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
5978	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
5979	aese	v0.16b, v28.16b
5980	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
5981
5982	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
5983	aese	v4.16b, v26.16b
5984	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
5985	aese	v2.16b, v26.16b
5986	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
5987
5988	aese	v6.16b, v26.16b
5989	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
5990	aese	v1.16b, v26.16b
5991	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
5992	aese	v7.16b, v26.16b
5993	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
5994
5995	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
5996	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
5997	aese	v5.16b, v26.16b
5998	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
5999
6000.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
6001	aese	v3.16b, v26.16b
6002	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
6003	aese	v0.16b, v26.16b
6004	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
6005
6006	ldp	q28, q26, [x8, #128]				//load rk8, rk9
6007	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
6008	aese	v5.16b, v27.16b
6009	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
6010
6011	ldr	q20, [x3, #32]				//load h1l | h1h
6012	ext	v20.16b, v20.16b, v20.16b, #8
6013	ldr	q22, [x3, #64]				//load h2l | h2h
6014	ext	v22.16b, v22.16b, v22.16b, #8
6015	aese	v2.16b, v27.16b
6016	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
6017.inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
6018
6019	ldr	q21, [x3, #48]				//load h2k | h1k
6020	ldr	q24, [x3, #96]				//load h4k | h3k
6021	aese	v6.16b, v27.16b
6022	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
6023	aese	v3.16b, v27.16b
6024	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
6025
6026	aese	v0.16b, v27.16b
6027	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
6028	aese	v7.16b, v27.16b
6029	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
6030	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
6031
6032	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
6033	aese	v4.16b, v27.16b
6034	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
6035	aese	v1.16b, v27.16b
6036	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
6037
6038	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
6039	aese	v7.16b, v28.16b
6040	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
6041	aese	v0.16b, v28.16b
6042	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
6043
6044	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
6045	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
6046	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
6047
6048	aese	v3.16b, v28.16b
6049	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
6050	aese	v0.16b, v26.16b
6051	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 9
6052	aese	v1.16b, v28.16b
6053	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
6054
6055	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
6056	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
6057	aese	v2.16b, v28.16b
6058	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
6059
6060	aese	v5.16b, v28.16b
6061	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
6062	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
6063	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
6064
6065	aese	v6.16b, v28.16b
6066	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
6067	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
6068	aese	v4.16b, v28.16b
6069	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
6070
6071.inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
6072	aese	v7.16b, v26.16b
6073	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 9
6074	aese	v5.16b, v26.16b
6075	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 9
6076
6077	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
6078	aese	v6.16b, v26.16b
6079	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 9
6080	aese	v4.16b, v26.16b
6081	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 9
6082
6083	ldp	q27, q28, [x8, #160]				//load rk10, rk11
6084	aese	v2.16b, v26.16b
6085	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 9
6086	aese	v3.16b, v26.16b
6087	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 9
6088
6089	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
6090.inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
6091	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
6092
6093	ldr	d16, [x10]			//MODULO - load modulo constant
6094	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
6095	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
6096
6097	aese	v1.16b, v26.16b
6098	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 9
6099
6100.inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
6101.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
6102.inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
6103
6104	aese	v4.16b, v27.16b
6105	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 10
6106	aese	v3.16b, v27.16b
6107	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 10
6108	aese	v5.16b, v27.16b
6109	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 10
6110
6111	aese	v0.16b, v27.16b
6112	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 10
6113	aese	v2.16b, v27.16b
6114	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 10
6115	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
6116
6117	aese	v1.16b, v27.16b
6118	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 10
6119	aese	v7.16b, v27.16b
6120	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 10
6121	aese	v6.16b, v27.16b
6122	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 10
6123
6124.inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
6125
6126	ldp	q26, q27, [x8, #192]				//load rk12, rk13
6127	rev32	v20.16b, v30.16b					//CTR block 8k+16
6128
6129	ext	v21.16b, v17.16b, v17.16b, #8			 	//MODULO - other top alignment
6130	ldp	q8, q9, [x0], #32			//AES block 8k+8, 8k+9 - load plaintext
6131	aese	v2.16b, v28.16b
6132	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 11
6133
6134	aese	v6.16b, v28.16b
6135	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 11
6136	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+16
6137	aese	v3.16b, v28.16b
6138	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 11
6139
6140	aese	v0.16b, v28.16b
6141	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 11
6142	aese	v7.16b, v28.16b
6143	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 11
6144
6145	pmull	v29.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
6146	aese	v1.16b, v28.16b
6147	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 11
6148
6149	aese	v7.16b, v26.16b
6150	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 12
6151	aese	v5.16b, v28.16b
6152	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 11
6153
6154	aese	v3.16b, v26.16b
6155	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 12
6156	aese	v6.16b, v26.16b
6157	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 12
6158	rev32	v22.16b, v30.16b					//CTR block 8k+17
6159
6160	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+17
6161	aese	v4.16b, v28.16b
6162	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 11
6163.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
6164
6165	aese	v5.16b, v26.16b
6166	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 12
6167	ldr	q28, [x8, #224]					//load rk14
6168	aese	v7.16b, v27.16b						//AES block 8k+15 - round 13
6169
6170	ldp	q10, q11, [x0], #32			//AES block 8k+10, 8k+11 - load plaintext
6171	aese	v2.16b, v26.16b
6172	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 12
6173	aese	v4.16b, v26.16b
6174	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 12
6175
6176.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
6177	aese	v1.16b, v26.16b
6178	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 12
6179	ldp	q12, q13, [x0], #32			//AES block 4, 5 - load plaintext
6180
6181	ldp	q14, q15, [x0], #32			//AES block 6, 7 - load plaintext
6182	aese	v2.16b, v27.16b						//AES block 8k+10 - round 13
6183	aese	v4.16b, v27.16b						//AES block 8k+12 - round 13
6184
6185	rev32	v23.16b, v30.16b					//CTR block 8k+18
6186	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+18
6187	aese	v5.16b, v27.16b						//AES block 8k+13 - round 13
6188
6189	aese	v0.16b, v26.16b
6190	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 12
6191	aese	v3.16b, v27.16b						//AES block 8k+11 - round 13
6192	cmp	x0, x5				//.LOOP CONTROL
6193
6194.inst	0xce02714a	//eor3 v10.16b, v10.16b, v2.16b, v28.16b				//AES block 8k+10 - result
6195	rev32	v25.16b, v30.16b					//CTR block 8k+19
6196	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+19
6197
6198	aese	v0.16b, v27.16b						//AES block 8k+8 - round 13
6199	aese	v6.16b, v27.16b						//AES block 8k+14 - round 13
6200.inst	0xce0571ad	//eor3 v13.16b, v13.16b, v5.16b, v28.16b				//AES block 5 - result
6201
6202	ext	v21.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment
6203	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
6204	aese	v1.16b, v27.16b						//AES block 8k+9 - round 13
6205
6206.inst	0xce04718c	//eor3 v12.16b, v12.16b, v4.16b, v28.16b				//AES block 4 - result
6207	rev32	v4.16b, v30.16b				//CTR block 8k+20
6208.inst	0xce03716b	//eor3 v11.16b, v11.16b, v3.16b, v28.16b				//AES block 8k+11 - result
6209
6210	mov	v3.16b, v25.16b					//CTR block 8k+19
6211.inst	0xce017129	//eor3 v9.16b, v9.16b, v1.16b, v28.16b				//AES block 8k+9 - result
6212.inst	0xce007108	//eor3 v8.16b, v8.16b, v0.16b, v28.16b				//AES block 8k+8 - result
6213
6214	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+20
6215	stp	q8, q9, [x2], #32			//AES block 8k+8, 8k+9 - store result
6216	mov	v2.16b, v23.16b					//CTR block 8k+18
6217
6218.inst	0xce0771ef	//eor3 v15.16b, v15.16b, v7.16b, v28.16b				//AES block 7 - result
6219.inst	0xce154673	//eor3 v19.16b, v19.16b, v21.16b, v17.16b		 	//MODULO - fold into low
6220	stp	q10, q11, [x2], #32			//AES block 8k+10, 8k+11 - store result
6221
6222.inst	0xce0671ce	//eor3 v14.16b, v14.16b, v6.16b, v28.16b				//AES block 6 - result
6223	mov	v1.16b, v22.16b					//CTR block 8k+17
6224	stp	q12, q13, [x2], #32			//AES block 4, 5 - store result
6225
6226	stp	q14, q15, [x2], #32			//AES block 6, 7 - store result
6227	mov	v0.16b, v20.16b					//CTR block 8k+16
6228	b.lt	.L256_enc_main_loop
6229
6230.L256_enc_prepretail:	//PREPRETAIL
	// PREPRETAIL: no plaintext is loaded and nothing is stored in this section.
	//  - The remaining counter blocks v5..v7 are derived from v30 (v31 is added
	//    to v30 to step the counter); v0..v4 are used as found on entry.
	//  - AES rounds 0-13 are applied to v0..v7 with rk0..rk13 loaded from [x8].
	//    rk14 is loaded into v28 but the final xor with it is not done here.
	//  - The eight blocks held in v8..v15 are byte-reversed (rev64) and GHASHed
	//    against the hash-key powers loaded from [x3]; the running value in v19
	//    is folded into block 8k first (PRE 0 / PRE 1). High/mid/low partial
	//    products accumulate in v17/v18/v19 and are reduced into v19 using the
	//    modulo constant loaded from [x10].
	// Execution then falls through into .L256_enc_tail.
6231	rev32	v5.16b, v30.16b				//CTR block 8k+13
6232	ldp	q26, q27, [x8, #0]					//load rk0, rk1
6233	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
6234
6235	rev64	v10.16b, v10.16b						//GHASH block 8k+2
6236
6237	rev32	v6.16b, v30.16b				//CTR block 8k+14
6238	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
6239
6240	rev64	v13.16b, v13.16b						//GHASH block 8k+5
6241	ldr	q21, [x3, #144]				//load h6k | h5k
6242	ldr	q24, [x3, #192]				//load h8k | h7k
6243
6244	rev32	v7.16b, v30.16b				//CTR block 8k+15
6245
6246	aese	v6.16b, v26.16b
6247	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
6248	aese	v4.16b, v26.16b
6249	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
6250	aese	v1.16b, v26.16b
6251	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
6252
6253	aese	v5.16b, v26.16b
6254	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
6255	aese	v0.16b, v26.16b
6256	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
6257
6258	aese	v2.16b, v26.16b
6259	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
6260	aese	v7.16b, v26.16b
6261	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
6262	aese	v3.16b, v26.16b
6263	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
6264
6265	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
6266	rev64	v8.16b, v8.16b						//GHASH block 8k
6267	aese	v1.16b, v27.16b
6268	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
6269
6270	rev64	v9.16b, v9.16b						//GHASH block 8k+1
6271	ldp	q28, q26, [x8, #32]				//load rk2, rk3
6272	aese	v3.16b, v27.16b
6273	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
6274
6275	ldr	q23, [x3, #176]				//load h7l | h7h
6276	ext	v23.16b, v23.16b, v23.16b, #8
6277	ldr	q25, [x3, #208]				//load h8l | h8h
6278	ext	v25.16b, v25.16b, v25.16b, #8
6279	aese	v2.16b, v27.16b
6280	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
6281
6282	ldr	q20, [x3, #128]				//load h5l | h5h
6283	ext	v20.16b, v20.16b, v20.16b, #8
6284	ldr	q22, [x3, #160]				//load h6l | h6h
6285	ext	v22.16b, v22.16b, v22.16b, #8
6286	aese	v0.16b, v27.16b
6287	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
6288	aese	v5.16b, v27.16b
6289	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
6290
6291	aese	v4.16b, v27.16b
6292	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
6293	eor	v8.16b, v8.16b, v19.16b					//PRE 1
6294
6295	rev64	v11.16b, v11.16b						//GHASH block 8k+3
6296	aese	v6.16b, v27.16b
6297	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
6298
6299	aese	v1.16b, v28.16b
6300	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
6301	aese	v2.16b, v28.16b
6302	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
6303	aese	v7.16b, v27.16b
6304	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
6305
6306	aese	v4.16b, v28.16b
6307	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
6308	aese	v0.16b, v28.16b
6309	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
6310	aese	v6.16b, v28.16b
6311	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
6312
6313	aese	v5.16b, v28.16b
6314	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
6315	aese	v7.16b, v28.16b
6316	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
6317	aese	v3.16b, v28.16b
6318	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
6319
6320	ldp	q27, q28, [x8, #64]				//load rk4, rk5
6321	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
6322	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
6323
6324	rev64	v14.16b, v14.16b						//GHASH block 8k+6
6325	aese	v4.16b, v26.16b
6326	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
6327	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
6328
6329	aese	v7.16b, v26.16b
6330	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
6331	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
6332	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
6333
6334	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
6335	aese	v6.16b, v26.16b
6336	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
6337
6338	aese	v2.16b, v26.16b
6339	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
6340	aese	v3.16b, v26.16b
6341	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
6342	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
6343
6344	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
6345	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
6346	aese	v1.16b, v26.16b
6347	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
6348
6349	aese	v0.16b, v26.16b
6350	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
6351	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
6352	aese	v5.16b, v26.16b
6353	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
6354
6355	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
6356	aese	v1.16b, v27.16b
6357	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
6358	aese	v6.16b, v27.16b
6359	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
6360
6361	aese	v0.16b, v27.16b
6362	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
6363	aese	v2.16b, v27.16b
6364	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
6365	aese	v4.16b, v27.16b
6366	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
6367
6368	aese	v6.16b, v28.16b
6369	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
6370	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
6371.inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
6372
6373	aese	v7.16b, v27.16b
6374	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
6375	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
6376	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
6377
6378	aese	v5.16b, v27.16b
6379	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
6380	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
6381	aese	v3.16b, v27.16b
6382	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
6383
6384	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
6385	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
6386	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
6387
6388	rev64	v12.16b, v12.16b						//GHASH block 8k+4
6389	aese	v1.16b, v28.16b
6390	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
6391	aese	v0.16b, v28.16b
6392	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
6393
6394	aese	v7.16b, v28.16b
6395	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
6396	aese	v4.16b, v28.16b
6397	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
6398	ldp	q26, q27, [x8, #96]				//load rk6, rk7
6399
6400	ldr	q23, [x3, #80]				//load h3l | h3h
6401	ext	v23.16b, v23.16b, v23.16b, #8
6402	ldr	q25, [x3, #112]				//load h4l | h4h
6403	ext	v25.16b, v25.16b, v25.16b, #8
6404	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
6405	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
6406
6407.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
6408	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
6409
6410	aese	v5.16b, v28.16b
6411	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
6412	rev64	v15.16b, v15.16b						//GHASH block 8k+7
6413	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
6414
6415	aese	v3.16b, v28.16b
6416	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
6417	aese	v2.16b, v28.16b
6418	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
6419.inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
6420
6421	aese	v7.16b, v26.16b
6422	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
6423	aese	v4.16b, v26.16b
6424	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
6425	aese	v6.16b, v26.16b
6426	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
6427
6428	ldr	q21, [x3, #48]				//load h2k | h1k
6429	ldr	q24, [x3, #96]				//load h4k | h3k
6430	aese	v5.16b, v26.16b
6431	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
6432	aese	v3.16b, v26.16b
6433	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
6434
6435	aese	v0.16b, v26.16b
6436	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
6437	aese	v1.16b, v26.16b
6438	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
6439	aese	v2.16b, v26.16b
6440	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
6441
6442	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
6443	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
6444	ldr	q20, [x3, #32]				//load h1l | h1h
6445	ext	v20.16b, v20.16b, v20.16b, #8
6446	ldr	q22, [x3, #64]				//load h2l | h2h
6447	ext	v22.16b, v22.16b, v22.16b, #8
6448
6449	ldp	q28, q26, [x8, #128]				//load rk8, rk9
6450	aese	v1.16b, v27.16b
6451	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
6452	aese	v4.16b, v27.16b
6453	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
6454
6455	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
6456	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
6457
6458	aese	v5.16b, v27.16b
6459	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
6460	aese	v6.16b, v27.16b
6461	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
6462	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
6463
6464	aese	v7.16b, v27.16b
6465	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
6466	aese	v3.16b, v27.16b
6467	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
6468	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
6469
6470	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
6471	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
6472	aese	v2.16b, v27.16b
6473	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
6474
6475	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
6476	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
6477	aese	v0.16b, v27.16b
6478	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
6479
6480	aese	v7.16b, v28.16b
6481	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
6482.inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
6483	aese	v2.16b, v28.16b
6484	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
6485
6486	aese	v6.16b, v28.16b
6487	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
6488	aese	v4.16b, v28.16b
6489	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
6490	aese	v3.16b, v28.16b
6491	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
6492
6493	aese	v5.16b, v28.16b
6494	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
6495	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
6496	aese	v0.16b, v28.16b
6497	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
6498
6499	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
6500	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
6501	aese	v1.16b, v28.16b
6502	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
6503
6504	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
6505	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
6506	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
6507
6508	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
6509.inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
6510.inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
6511
6512	ldp	q27, q28, [x8, #160]				//load rk10, rk11
6513	aese	v1.16b, v26.16b
6514	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 9
6515	aese	v0.16b, v26.16b
6516	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 9
6517
6518.inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
6519.inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
6520	ldr	d16, [x10]			//MODULO - load modulo constant
6521
6522.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
6523
6524	aese	v3.16b, v26.16b
6525	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 9
6526	aese	v7.16b, v26.16b
6527	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 9
6528	aese	v5.16b, v26.16b
6529	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 9
6530
6531	aese	v2.16b, v26.16b
6532	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 9
6533	aese	v6.16b, v26.16b
6534	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 9
6535
6536	aese	v5.16b, v27.16b
6537	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 10
6538	aese	v1.16b, v27.16b
6539	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 10
6540	aese	v4.16b, v26.16b
6541	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 9
6542
6543	aese	v7.16b, v27.16b
6544	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 10
6545	aese	v6.16b, v27.16b
6546	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 10
6547	aese	v3.16b, v27.16b
6548	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 10
6549
6550	aese	v4.16b, v27.16b
6551	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 10
6552	aese	v0.16b, v27.16b
6553	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 10
6554	aese	v2.16b, v27.16b
6555	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 10
6556
6557	pmull	v29.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
6558.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
6559	aese	v7.16b, v28.16b
6560	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 11
6561
6562	ldp	q26, q27, [x8, #192]				//load rk12, rk13
6563	ext	v21.16b, v17.16b, v17.16b, #8			 	//MODULO - other top alignment
6564	aese	v2.16b, v28.16b
6565	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 11
6566
6567.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
6568	aese	v1.16b, v28.16b
6569	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 11
6570	aese	v6.16b, v28.16b
6571	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 11
6572
6573	aese	v0.16b, v28.16b
6574	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 11
6575	aese	v4.16b, v28.16b
6576	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 11
6577	aese	v5.16b, v28.16b
6578	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 11
6579
6580	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
6581	aese	v3.16b, v28.16b
6582	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 11
6583	ldr	q28, [x8, #224]					//load rk14
6584
6585	aese	v1.16b, v26.16b
6586	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 12
6587	aese	v2.16b, v26.16b
6588	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 12
6589	aese	v0.16b, v26.16b
6590	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 12
6591
6592	aese	v6.16b, v26.16b
6593	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 12
6594	aese	v5.16b, v26.16b
6595	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 12
6596	ext	v21.16b, v18.16b, v18.16b, #8			 	//MODULO - other mid alignment
6597
6598	aese	v4.16b, v26.16b
6599	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 12
6600	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
6601
6602	aese	v3.16b, v26.16b
6603	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 12
6604	aese	v7.16b, v26.16b
6605	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 12
6606	aese	v0.16b, v27.16b						//AES block 8k+8 - round 13
6607
6608.inst	0xce154673	//eor3 v19.16b, v19.16b, v21.16b, v17.16b		 	//MODULO - fold into low
6609	aese	v5.16b, v27.16b						//AES block 8k+13 - round 13
6610	aese	v1.16b, v27.16b						//AES block 8k+9 - round 13
6611
6612	aese	v3.16b, v27.16b						//AES block 8k+11 - round 13
6613	aese	v4.16b, v27.16b						//AES block 8k+12 - round 13
6614	aese	v7.16b, v27.16b						//AES block 8k+15 - round 13
6615
6616	aese	v2.16b, v27.16b						//AES block 8k+10 - round 13
6617	aese	v6.16b, v27.16b						//AES block 8k+14 - round 13
6618.L256_enc_tail:	//TAIL
	// TAIL dispatch: x5 is recomputed as the number of bytes left (x4 - x0) and
	// compared against 112, 96, ... 16 to pick the entry point for the remaining
	// blocks. The first remaining block is always loaded and combined with v0
	// before the dispatch. Each entry point .L256_enc_blocks_more_than_N
	// consumes a fixed keystream register for the block after it (v1, v2, v3
	// for N = 7, 6, 5 as visible below; the later entry points are outside this
	// view - presumably they continue the pattern). For every entry point that
	// is skipped, the keystream blocks are therefore slid up one register and
	// the counter v30 is stepped back by one increment (v31).
6619
6620	ldp	q24, q25, [x3, #192]			//load h8k | h7k (q24) and h8l | h8h (q25)
6621	ext	v25.16b, v25.16b, v25.16b, #8
6622	sub	x5, x4, x0		//main_end_input_ptr is number of bytes left to process
6623
6624	ldr	q8, [x0], #16				//AES block 8k+8 - load plaintext
6625
6626	ldp	q20, q21, [x3, #128]			//load h5l | h5h (q20) and h6k | h5k (q21)
6627	ext	v20.16b, v20.16b, v20.16b, #8
6628
6629	ext	v16.16b, v19.16b, v19.16b, #8				//prepare final partial tag
6630	ldp	q22, q23, [x3, #160]			//load h6l | h6h (q22) and h7l | h7h (q23)
6631	ext	v22.16b, v22.16b, v22.16b, #8
6632	ext	v23.16b, v23.16b, v23.16b, #8
6633	mov	v29.16b, v28.16b					//v29 = copy of v28 (rk14 when arriving from PREPRETAIL); third operand of every eor3 below
6634
6635	cmp	x5, #112
6636.inst	0xce007509	//eor3 v9.16b, v8.16b, v0.16b, v29.16b				//AES block 8k+8 - result
6637	b.gt	.L256_enc_blocks_more_than_7
6638
	// At most 7 blocks left: .L256_enc_blocks_more_than_7 is skipped, so zero
	// the low/high/mid accumulators it would have initialised (v19/v17/v18)
	// and slide keystream v1..v6 up into v2..v7.
6639	movi	v19.8b, #0
6640	mov	v7.16b, v6.16b
6641	movi	v17.8b, #0
6642
6643	mov	v6.16b, v5.16b
6644	mov	v5.16b, v4.16b
6645	mov	v4.16b, v3.16b
6646
6647	mov	v3.16b, v2.16b
6648	sub	v30.4s, v30.4s, v31.4s					//one keystream block unused: step counter back
6649	mov	v2.16b, v1.16b
6650
6651	movi	v18.8b, #0
6652	cmp	x5, #96
6653	b.gt	.L256_enc_blocks_more_than_6
6654
	// At most 6 blocks left: slide up once more. v1 still holds the same
	// value as v2 after the previous shift, so it is the source for v3 here.
6655	mov	v7.16b, v6.16b
6656	mov	v6.16b, v5.16b
6657	cmp	x5, #80
6658
6659	mov	v5.16b, v4.16b
6660	mov	v4.16b, v3.16b
6661	mov	v3.16b, v1.16b
6662
6663	sub	v30.4s, v30.4s, v31.4s
6664	b.gt	.L256_enc_blocks_more_than_5
6665
	// At most 5 blocks left.
6666	mov	v7.16b, v6.16b
6667	sub	v30.4s, v30.4s, v31.4s
6668
6669	mov	v6.16b, v5.16b
6670	mov	v5.16b, v4.16b
6671
6672	cmp	x5, #64
6673	mov	v4.16b, v1.16b
6674	b.gt	.L256_enc_blocks_more_than_4
6675
	// At most 4 blocks left.
6676	cmp	x5, #48
6677	mov	v7.16b, v6.16b
6678	mov	v6.16b, v5.16b
6679
6680	mov	v5.16b, v1.16b
6681	sub	v30.4s, v30.4s, v31.4s
6682	b.gt	.L256_enc_blocks_more_than_3
6683
	// At most 3 blocks left: v24 is reloaded with the h4k | h3k pair.
6684	cmp	x5, #32
6685	mov	v7.16b, v6.16b
6686	ldr	q24, [x3, #96]				//load h4k | h3k
6687
6688	mov	v6.16b, v1.16b
6689	sub	v30.4s, v30.4s, v31.4s
6690	b.gt	.L256_enc_blocks_more_than_2
6691
	// At most 2 blocks left.
6692	mov	v7.16b, v1.16b
6693
6694	sub	v30.4s, v30.4s, v31.4s
6695	cmp	x5, #16
6696	b.gt	.L256_enc_blocks_more_than_1
6697
	// At most 16 bytes left: v21 is reloaded with the h2k | h1k pair.
6698	sub	v30.4s, v30.4s, v31.4s
6699	ldr	q21, [x3, #48]				//load h2k | h1k
6700	b	.L256_enc_blocks_less_than_1
6701.L256_enc_blocks_more_than_7:	//blocks	left >  7
	// Stores the result block already in v9, GHASHes it (after xoring in the
	// partial tag v16) with v25 for high/low and the upper half of v24 for
	// mid, and produces the next result block in v9 from v1. This step writes
	// v17/v18/v19 directly rather than accumulating into them, and clears v16
	// so the partial tag is fed in only once.
6702	st1	{ v9.16b}, [x2], #16				//AES final-7 block  - store result
6703
6704	rev64	v8.16b, v9.16b						//GHASH final-7 block
6705
6706	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
6707
6708	ldr	q9, [x0], #16				//AES final-6 block - load plaintext
6709
6710	pmull2	v17.1q, v8.2d, v25.2d				//GHASH final-7 block - high
6711	ins	v27.d[0], v8.d[1]					//GHASH final-7 block - mid
6712	ins	v18.d[0], v24.d[1]					//GHASH final-7 block - mid
6713
6714	movi	v16.8b, #0						//suppress further partial tag feed in
6715
6716	eor	v27.8b, v27.8b, v8.8b				//GHASH final-7 block - mid (v27.d[0] = v8.d[1] ^ v8.d[0])
6717.inst	0xce017529	//eor3 v9.16b, v9.16b, v1.16b, v29.16b			//AES final-6 block - result
6718
6719	pmull	v18.1q, v27.1d, v18.1d				//GHASH final-7 block - mid
6720	pmull	v19.1q, v8.1d, v25.1d				//GHASH final-7 block - low
6721.L256_enc_blocks_more_than_6:	//blocks	left >  6
	// Stores the result block in v9, GHASHes it with v23 for high/low and the
	// lower half of v24 for mid, accumulating into v17/v18/v19, and produces
	// the next result block in v9 from v2. v16 is the partial tag on the
	// first step executed and zero afterwards.
6722
6723	st1	{ v9.16b}, [x2], #16				//AES final-6 block - store result
6724
6725	rev64	v8.16b, v9.16b						//GHASH final-6 block
6726
6727	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
6728
6729	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-6 block - low
6730	ins	v27.d[0], v8.d[1]					//GHASH final-6 block - mid
6731	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-6 block - high
6732
6733	ldr	q9, [x0], #16				//AES final-5 block - load plaintext
6734
6735	eor	v19.16b, v19.16b, v26.16b					//GHASH final-6 block - low
6736
6737	eor	v27.8b, v27.8b, v8.8b				//GHASH final-6 block - mid (v27.d[0] = v8.d[1] ^ v8.d[0])
6738
6739	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-6 block - mid
6740.inst	0xce027529	//eor3 v9.16b, v9.16b, v2.16b, v29.16b			//AES final-5 block - result
6741
6742	movi	v16.8b, #0						//suppress further partial tag feed in
6743
6744	eor	v18.16b, v18.16b, v27.16b				//GHASH final-6 block - mid
6745	eor	v17.16b, v17.16b, v28.16b					//GHASH final-6 block - high
// Encrypt tail step for "more than 5 blocks left".
// Same pattern as the preceding tail steps: store v9, GHASH it, prepare the
// next ciphertext block (plaintext ^ v3 ^ v29).
// High/low products use v22; the middle product uses the upper 64 bits of v21,
// which is why the (upper ^ lower) term is first copied into v27.d[1] so that
// pmull2 can be used.
6746.L256_enc_blocks_more_than_5:	//blocks	left >  5
6747
6748	st1	{ v9.16b}, [x2], #16				//AES final-5 block - store result
6749
6750	rev64	v8.16b, v9.16b						//GHASH final-5 block
6751
6752	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
6753
6754	ins	v27.d[0], v8.d[1]					//GHASH final-5 block - mid
6755
6756	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-5 block - high
6757
6758	eor	v17.16b, v17.16b, v28.16b					//GHASH final-5 block - high
6759	eor	v27.8b, v27.8b, v8.8b				//GHASH final-5 block - mid
6760
6761	ins	v27.d[1], v27.d[0]					//GHASH final-5 block - mid
6762
6763	ldr	q9, [x0], #16				//AES final-4 block - load plaintext
6764	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-5 block - low
6765
6766	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-5 block - mid
6767	movi	v16.8b, #0						//suppress further partial tag feed in
6768	eor	v19.16b, v19.16b, v26.16b					//GHASH final-5 block - low
6769
6770	eor	v18.16b, v18.16b, v27.16b				//GHASH final-5 block - mid
6771.inst	0xce037529	//eor3 v9.16b, v9.16b, v3.16b, v29.16b			//AES final-4 block - result
// Encrypt tail step for "more than 4 blocks left".
// Stores v9, accumulates its GHASH contribution (high/low with v20, middle with
// the lower 64 bits of v21) into v17/v18/v19, and forms the next ciphertext
// block as plaintext ^ v4 ^ v29.
6772.L256_enc_blocks_more_than_4:	//blocks	left >  4
6773
6774	st1	{ v9.16b}, [x2], #16				//AES final-4 block - store result
6775
6776	rev64	v8.16b, v9.16b						//GHASH final-4 block
6777
6778	ldr	q9, [x0], #16				//AES final-3 block - load plaintext
6779
6780	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
6781
6782	ins	v27.d[0], v8.d[1]					//GHASH final-4 block - mid
6783	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final-4 block - high
6784
6785.inst	0xce047529	//eor3 v9.16b, v9.16b, v4.16b, v29.16b			//AES final-3 block - result
6786	pmull	v26.1q, v8.1d, v20.1d				//GHASH final-4 block - low
6787
6788	eor	v27.8b, v27.8b, v8.8b				//GHASH final-4 block - mid
6789	eor	v19.16b, v19.16b, v26.16b					//GHASH final-4 block - low
6790
6791	pmull	v27.1q, v27.1d, v21.1d				//GHASH final-4 block - mid
6792
6793	movi	v16.8b, #0						//suppress further partial tag feed in
6794
6795	eor	v18.16b, v18.16b, v27.16b				//GHASH final-4 block - mid
6796	eor	v17.16b, v17.16b, v28.16b					//GHASH final-4 block - high
// Encrypt tail step for "more than 3 blocks left".
// From this step on the hash-key material is reloaded from the table at x3
// instead of being assumed live: v25 comes from [x3, #112] with its two 64-bit
// halves swapped by ext, and v24 from [x3, #96].
// The block in v9 is stored and hashed (high/low with v25, middle with the
// upper 64 bits of v24 via pmull2), and the next ciphertext block is formed as
// plaintext ^ v5 ^ v29.
6797.L256_enc_blocks_more_than_3:	//blocks	left >  3
6798
6799	st1	{ v9.16b}, [x2], #16				//AES final-3 block - store result
6800
6801	ldr	q25, [x3, #112]				//load h4l | h4h
6802	ext	v25.16b, v25.16b, v25.16b, #8
6803	rev64	v8.16b, v9.16b						//GHASH final-3 block
6804
6805	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
6806
6807	ins	v27.d[0], v8.d[1]					//GHASH final-3 block - mid
6808	pmull2	v28.1q, v8.2d, v25.2d				//GHASH final-3 block - high
6809
6810	eor	v17.16b, v17.16b, v28.16b					//GHASH final-3 block - high
6811	eor	v27.8b, v27.8b, v8.8b				//GHASH final-3 block - mid
6812	ldr	q24, [x3, #96]				//load h4k | h3k
6813
6814	ins	v27.d[1], v27.d[0]					//GHASH final-3 block - mid
6815	ldr	q9, [x0], #16				//AES final-2 block - load plaintext
6816
6817	pmull2	v27.1q, v27.2d, v24.2d				//GHASH final-3 block - mid
6818	pmull	v26.1q, v8.1d, v25.1d				//GHASH final-3 block - low
6819
6820.inst	0xce057529	//eor3 v9.16b, v9.16b, v5.16b, v29.16b			//AES final-2 block - result
6821	movi	v16.8b, #0						//suppress further partial tag feed in
6822
6823	eor	v18.16b, v18.16b, v27.16b				//GHASH final-3 block - mid
6824	eor	v19.16b, v19.16b, v26.16b					//GHASH final-3 block - low
// Encrypt tail step for "more than 2 blocks left".
// Loads v23 from [x3, #80] (halves swapped by ext) for the high/low products
// and uses the lower 64 bits of v24 for the middle product. v24 is not loaded
// in this step: it was loaded either by the preceding tail step or by the
// dispatch code just before it branches here.
// Stores v9, accumulates GHASH into v17/v18/v19, and forms the next ciphertext
// block as plaintext ^ v6 ^ v29.
6825.L256_enc_blocks_more_than_2:	//blocks	left >  2
6826
6827	ldr	q23, [x3, #80]				//load h3l | h3h
6828	ext	v23.16b, v23.16b, v23.16b, #8
6829
6830	st1	{ v9.16b}, [x2], #16			 	//AES final-2 block - store result
6831
6832	rev64	v8.16b, v9.16b						//GHASH final-2 block
6833	ldr	q9, [x0], #16				//AES final-1 block - load plaintext
6834
6835	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
6836
6837	ins	v27.d[0], v8.d[1]					//GHASH final-2 block - mid
6838
6839	movi	v16.8b, #0						//suppress further partial tag feed in
6840
6841	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-2 block - high
6842.inst	0xce067529	//eor3 v9.16b, v9.16b, v6.16b, v29.16b			//AES final-1 block - result
6843
6844	eor	v27.8b, v27.8b, v8.8b				//GHASH final-2 block - mid
6845
6846	eor	v17.16b, v17.16b, v28.16b					//GHASH final-2 block - high
6847
6848	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-2 block - mid
6849	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-2 block - low
6850
6851	eor	v18.16b, v18.16b, v27.16b				//GHASH final-2 block - mid
6852	eor	v19.16b, v19.16b, v26.16b					//GHASH final-2 block - low
// Encrypt tail step for "more than 1 block left".
// Loads v22 from [x3, #64] (halves swapped by ext) and v21 from [x3, #48].
// Stores v9, accumulates its GHASH contribution (high/low with v22, middle with
// the upper 64 bits of v21 via pmull2) and forms the final ciphertext block as
// plaintext ^ v7 ^ v29. That final block is left in v9 for the masking code
// that follows; it is not stored here.
6853.L256_enc_blocks_more_than_1:	//blocks	left >  1
6854
6855	st1	{ v9.16b}, [x2], #16				//AES final-1 block - store result
6856
6857	ldr	q22, [x3, #64]				//load h2l | h2h
6858	ext	v22.16b, v22.16b, v22.16b, #8
6859	rev64	v8.16b, v9.16b						//GHASH final-1 block
6860	ldr	q9, [x0], #16				//AES final block - load plaintext
6861
6862	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
6863	movi	v16.8b, #0						//suppress further partial tag feed in
6864
6865	ins	v27.d[0], v8.d[1]					//GHASH final-1 block - mid
6866	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-1 block - high
6867
6868.inst	0xce077529	//eor3 v9.16b, v9.16b, v7.16b, v29.16b			//AES final block - result
6869	eor	v17.16b, v17.16b, v28.16b					//GHASH final-1 block - high
6870
6871	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-1 block - low
6872	eor	v27.8b, v27.8b, v8.8b				//GHASH final-1 block - mid
6873
6874	ldr	q21, [x3, #48]				//load h2k | h1k
6875
6876	eor	v19.16b, v19.16b, v26.16b					//GHASH final-1 block - low
6877	ins	v27.d[1], v27.d[0]					//GHASH final-1 block - mid
6878
6879	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-1 block - mid
6880
6881	eor	v18.16b, v18.16b, v27.16b				//GHASH final-1 block - mid
// Final (possibly partial) block of the encrypt kernel, GHASH reduction and
// function epilogue.
//
// 1. Build a 128-bit byte mask in v0 from the bit length in x1: the number of
//    bits to discard is (128 - (x1 % 128)) % 128. x13 becomes the mask for the
//    lower 64 bits of the block and x14 the mask for the upper 64 bits.
// 2. Mask the last ciphertext block (v9), merge it with the 16 bytes already
//    present at the output pointer so that bytes beyond the message are
//    written back unchanged, and store all 16 bytes.
// 3. Store the updated counter to [x16].
// 4. Hash the masked block, combine the high/middle/low partial products and
//    reduce them using the 64-bit constant loaded from [x10].
// 5. Store the resulting hash state to [x3], return x9 in x0 and restore
//    d8-d15 from the 80-byte frame.
// NOTE(review): x9, x10 and x16 are set by the function prologue, which is
// outside this excerpt; the decrypt kernel's prologue sets x9 = x1 >> 3,
// x10 = sp + 64 (address of 0xc200000000000000) and x16 = x4 - presumably the
// encrypt prologue does the same, confirm.
6882.L256_enc_blocks_less_than_1:	//blocks	left <= 1
6883
6884	and	x1, x1, #127				//bit_length %= 128
6885
6886	sub	x1, x1, #128				//bit_length -= 128
6887
6888	neg	x1, x1				//bit_length = 128 - #bits in input (in range [1,128])
6889
6890	mvn	x6, xzr						//temp0_x = 0xffffffffffffffff
6891	and	x1, x1, #127				//bit_length %= 128
6892
// lsr by register only uses the shift amount modulo 64, so for x1 >= 64 the
// value in x6 is all-ones shifted right by (x1 - 64).
6893	lsr	x6, x6, x1				//temp0_x is mask for top 64b of last block
6894	cmp	x1, #64
6895	mvn	x7, xzr						//temp1_x = 0xffffffffffffffff
6896
// x1 <  64: lower half mask = all ones, upper half mask = x6
// x1 >= 64: lower half mask = x6,       upper half mask = 0
6897	csel	x14, x6, xzr, lt
6898	csel	x13, x7, x6, lt
6899
6900	mov	v0.d[0], x13					//ctr0b is mask for last block
6901	ldr	q20, [x3, #32]				//load h1l | h1h
6902	ext	v20.16b, v20.16b, v20.16b, #8
6903
6904	ld1	{ v26.16b}, [x2]					//load existing bytes where the possibly partial last block is to be stored
6905	mov	v0.d[1], x14
6906
6907	and	v9.16b, v9.16b, v0.16b					//possibly partial last block has zeroes in highest bits
6908
6909	rev64	v8.16b, v9.16b						//GHASH final block
6910
6911	rev32	v30.16b, v30.16b
// bif copies a bit from v26 into v9 wherever the mask bit in v0 is 0, i.e. the
// bytes past the end of the message keep the value already in memory.
6912	bif	v9.16b, v26.16b, v0.16b					//insert existing bytes in top end of result before storing
6913	str	q30, [x16]					//store the updated counter
6914
6915	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
6916	st1	{ v9.16b}, [x2]				//store all 16B
6917
// v16 has been consumed above and is reused as scratch for the middle term.
6918	ins	v16.d[0], v8.d[1]					//GHASH final block - mid
6919	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final block - high
6920	pmull	v26.1q, v8.1d, v20.1d				//GHASH final block - low
6921
6922	eor	v17.16b, v17.16b, v28.16b					//GHASH final block - high
6923	eor	v19.16b, v19.16b, v26.16b					//GHASH final block - low
6924
6925	eor	v16.8b, v16.8b, v8.8b				//GHASH final block - mid
6926
6927	pmull	v16.1q, v16.1d, v21.1d				//GHASH final block - mid
6928
6929	eor	v18.16b, v18.16b, v16.16b				//GHASH final block - mid
// Reduction: v17 = high, v18 = middle, v19 = low partial products.
6930	ldr	d16, [x10]			//MODULO - load modulo constant
6931
6932	ext	v21.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment
6933
6934.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
6935	pmull	v29.1q, v17.1d, v16.1d			//MODULO - top 64b align with mid
6936
6937.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
6938
6939	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
6940	ext	v21.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment
6941
6942.inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low
// ext #8 followed by rev64 reverses all 16 bytes of v19 before it is stored.
6943	ext	v19.16b, v19.16b, v19.16b, #8
6944	rev64	v19.16b, v19.16b
6945	st1	{ v19.16b }, [x3]
6946	mov	x0, x9					//return sizes
6947
// Epilogue: restore d8-d15 and release the 80-byte frame.
6948	ldp	d10, d11, [sp, #16]
6949	ldp	d12, d13, [sp, #32]
6950	ldp	d14, d15, [sp, #48]
6951	ldp	d8, d9, [sp], #80
6952	ret
6953
// Early-exit path of the encrypt kernel: returns 0. Writing w0 also clears the
// upper 32 bits of x0. No stack frame is released here, so this label must
// only be reached before the prologue has pushed one.
// NOTE(review): presumably reached from a zero-length check at function entry
// (as in the decrypt kernel's "cbz x1") - confirm.
6954.L256_enc_ret:
6955	mov	w0, #0x0
6956	ret
6957.size	unroll8_eor3_aes_gcm_enc_256_kernel,.-unroll8_eor3_aes_gcm_enc_256_kernel
// unroll8_eor3_aes_gcm_dec_256_kernel - entry, pipeline priming and first
// eight blocks. The main loop, prepretail, tail and return labels follow this
// section.
//
// Register use established by the code below:
//   x0  input (ciphertext) pointer, post-incremented as blocks are loaded
//   x1  length in bits; x9 = x1 >> 3 is kept as the byte length
//   x2  output pointer, post-incremented as blocks are stored
//   x3  pointer to the 16-byte hash state loaded into v19
//   x4  pointer to the 16-byte counter block (copied to x16); x4 is then
//       reused as end_input_ptr = x0 + byte length
//   x5  pointer to the round keys (copied to x8); x5 is then reused as
//       x0 + ((byte length - 1) & ~0x7f), the end of the input handled in
//       128-byte groups
//
// Stack frame (80 bytes): d8-d15 at [sp, #0..#63]; the 64-bit constant
// 0xc200000000000000 followed by a zero doubleword at [sp, #64], addressed
// through x10.
//
// Eight counter blocks (v0-v7) are run through AES rounds 0-13 using rk0-rk13;
// round 13 has no aesmc. rk14 is kept in v28 and applied together with the
// ciphertext by eor3 when the result is formed.
6958.globl	unroll8_eor3_aes_gcm_dec_256_kernel
6959.type	unroll8_eor3_aes_gcm_dec_256_kernel,%function
6960.align	4
6961unroll8_eor3_aes_gcm_dec_256_kernel:
6962	AARCH64_VALID_CALL_TARGET
// Nothing to do for a zero bit length; branch taken before any stack use.
6963	cbz	x1, .L256_dec_ret
6964	stp	d8, d9, [sp, #-80]!
6965	lsr	x9, x1, #3
6966	mov	x16, x4
6967	mov	x8, x5
6968	stp	d10, d11, [sp, #16]
6969	stp	d12, d13, [sp, #32]
6970	stp	d14, d15, [sp, #48]
6971	mov	x5, #0xc200000000000000
6972	stp	x5, xzr, [sp, #64]
6973	add	x10, sp, #64
6974
6975	ld1	{ v0.16b}, [x16]					//CTR block 0
6976
// v31 = { 0, 0, 0, 1 } as 32-bit lanes: adding it bumps only the last word of
// the byte-swapped counter held in v30.
6977	mov	x15, #0x100000000			//set up counter increment
6978	movi	v31.16b, #0x0
6979	mov	v31.d[1], x15
6980	mov	x5, x9
6981
6982	sub	x5, x5, #1		//byte_len - 1
6983
6984	rev32	v30.16b, v0.16b				//set up reversed counter
6985
6986	add	v30.4s, v30.4s, v31.4s		//CTR block 0
6987
6988	rev32	v1.16b, v30.16b				//CTR block 1
6989	add	v30.4s, v30.4s, v31.4s		//CTR block 1
6990
6991	rev32	v2.16b, v30.16b				//CTR block 2
6992	add	v30.4s, v30.4s, v31.4s		//CTR block 2
6993	ldp	q26, q27, [x8, #0]				  	//load rk0, rk1
6994
6995	rev32	v3.16b, v30.16b				//CTR block 3
6996	add	v30.4s, v30.4s, v31.4s		//CTR block 3
6997
6998	rev32	v4.16b, v30.16b				//CTR block 4
6999	add	v30.4s, v30.4s, v31.4s		//CTR block 4
7000
7001	aese	v0.16b, v26.16b
7002	aesmc	v0.16b, v0.16b			//AES block 0 - round 0
7003
7004	rev32	v5.16b, v30.16b				//CTR block 5
7005	add	v30.4s, v30.4s, v31.4s		//CTR block 5
7006
7007	aese	v1.16b, v26.16b
7008	aesmc	v1.16b, v1.16b			//AES block 1 - round 0
7009	aese	v2.16b, v26.16b
7010	aesmc	v2.16b, v2.16b			//AES block 2 - round 0
7011
7012	rev32	v6.16b, v30.16b				//CTR block 6
7013	add	v30.4s, v30.4s, v31.4s		//CTR block 6
7014
7015	rev32	v7.16b, v30.16b				//CTR block 7
7016	aese	v4.16b, v26.16b
7017	aesmc	v4.16b, v4.16b			//AES block 4 - round 0
7018
7019	aese	v6.16b, v26.16b
7020	aesmc	v6.16b, v6.16b		        //AES block 6 - round 0
7021	aese	v5.16b, v26.16b
7022	aesmc	v5.16b, v5.16b			//AES block 5 - round 0
7023
7024	aese	v3.16b, v26.16b
7025	aesmc	v3.16b, v3.16b			//AES block 3 - round 0
7026	aese	v7.16b, v26.16b
7027	aesmc	v7.16b, v7.16b		        //AES block 7 - round 0
7028	ldp	q28, q26, [x8, #32]				//load rk2, rk3
7029
7030	aese	v6.16b, v27.16b
7031	aesmc	v6.16b, v6.16b		        //AES block 6 - round 1
7032	aese	v4.16b, v27.16b
7033	aesmc	v4.16b, v4.16b		        //AES block 4 - round 1
7034	aese	v0.16b, v27.16b
7035	aesmc	v0.16b, v0.16b		        //AES block 0 - round 1
7036
7037	aese	v5.16b, v27.16b
7038	aesmc	v5.16b, v5.16b			//AES block 5 - round 1
7039	aese	v7.16b, v27.16b
7040	aesmc	v7.16b, v7.16b			//AES block 7 - round 1
7041	aese	v1.16b, v27.16b
7042	aesmc	v1.16b, v1.16b			//AES block 1 - round 1
7043
7044	aese	v2.16b, v27.16b
7045	aesmc	v2.16b, v2.16b			//AES block 2 - round 1
7046	aese	v3.16b, v27.16b
7047	aesmc	v3.16b, v3.16b			//AES block 3 - round 1
7048
7049	aese	v3.16b, v28.16b
7050	aesmc	v3.16b, v3.16b			//AES block 3 - round 2
7051	aese	v2.16b, v28.16b
7052	aesmc	v2.16b, v2.16b			//AES block 2 - round 2
7053	aese	v6.16b, v28.16b
7054	aesmc	v6.16b, v6.16b			//AES block 6 - round 2
7055
7056	aese	v1.16b, v28.16b
7057	aesmc	v1.16b, v1.16b			//AES block 1 - round 2
7058	aese	v7.16b, v28.16b
7059	aesmc	v7.16b, v7.16b			//AES block 7 - round 2
7060	aese	v5.16b, v28.16b
7061	aesmc	v5.16b, v5.16b			//AES block 5 - round 2
7062
7063	aese	v0.16b, v28.16b
7064	aesmc	v0.16b, v0.16b			//AES block 0 - round 2
7065	aese	v4.16b, v28.16b
7066	aesmc	v4.16b, v4.16b			//AES block 4 - round 2
7067	ldp	q27, q28, [x8, #64]				//load rk4, rk5
7068
7069	aese	v1.16b, v26.16b
7070	aesmc	v1.16b, v1.16b			//AES block 1 - round 3
7071	aese	v2.16b, v26.16b
7072	aesmc	v2.16b, v2.16b			//AES block 2 - round 3
7073
7074	aese	v3.16b, v26.16b
7075	aesmc	v3.16b, v3.16b			//AES block 3 - round 3
7076	aese	v4.16b, v26.16b
7077	aesmc	v4.16b, v4.16b			//AES block 4 - round 3
7078
7079	aese	v5.16b, v26.16b
7080	aesmc	v5.16b, v5.16b			//AES block 5 - round 3
7081	aese	v7.16b, v26.16b
7082	aesmc	v7.16b, v7.16b			//AES block 7 - round 3
7083	aese	v0.16b, v26.16b
7084	aesmc	v0.16b, v0.16b			//AES block 0 - round 3
7085
7086	aese	v6.16b, v26.16b
7087	aesmc	v6.16b, v6.16b			//AES block 6 - round 3
7088
7089	aese	v7.16b, v27.16b
7090	aesmc	v7.16b, v7.16b			//AES block 7 - round 4
7091	aese	v3.16b, v27.16b
7092	aesmc	v3.16b, v3.16b			//AES block 3 - round 4
7093
7094	aese	v6.16b, v27.16b
7095	aesmc	v6.16b, v6.16b			//AES block 6 - round 4
7096	aese	v2.16b, v27.16b
7097	aesmc	v2.16b, v2.16b			//AES block 2 - round 4
7098	aese	v0.16b, v27.16b
7099	aesmc	v0.16b, v0.16b			//AES block 0 - round 4
7100
7101	aese	v4.16b, v27.16b
7102	aesmc	v4.16b, v4.16b			//AES block 4 - round 4
7103	aese	v1.16b, v27.16b
7104	aesmc	v1.16b, v1.16b			//AES block 1 - round 4
7105	aese	v5.16b, v27.16b
7106	aesmc	v5.16b, v5.16b			//AES block 5 - round 4
7107
7108	aese	v0.16b, v28.16b
7109	aesmc	v0.16b, v0.16b			//AES block 0 - round 5
7110	aese	v6.16b, v28.16b
7111	aesmc	v6.16b, v6.16b			//AES block 6 - round 5
7112
7113	ldp	q26, q27, [x8, #96]				//load rk6, rk7
7114	aese	v4.16b, v28.16b
7115	aesmc	v4.16b, v4.16b			//AES block 4 - round 5
7116	aese	v7.16b, v28.16b
7117	aesmc	v7.16b, v7.16b			//AES block 7 - round 5
7118
7119	aese	v5.16b, v28.16b
7120	aesmc	v5.16b, v5.16b			//AES block 5 - round 5
7121
7122	aese	v2.16b, v28.16b
7123	aesmc	v2.16b, v2.16b			//AES block 2 - round 5
7124	aese	v3.16b, v28.16b
7125	aesmc	v3.16b, v3.16b			//AES block 3 - round 5
7126
7127	aese	v1.16b, v28.16b
7128	aesmc	v1.16b, v1.16b			//AES block 1 - round 5
7129
7130	aese	v4.16b, v26.16b
7131	aesmc	v4.16b, v4.16b			//AES block 4 - round 6
7132	aese	v3.16b, v26.16b
7133	aesmc	v3.16b, v3.16b			//AES block 3 - round 6
7134	aese	v7.16b, v26.16b
7135	aesmc	v7.16b, v7.16b			//AES block 7 - round 6
7136
7137	aese	v6.16b, v26.16b
7138	aesmc	v6.16b, v6.16b			//AES block 6 - round 6
7139	aese	v0.16b, v26.16b
7140	aesmc	v0.16b, v0.16b			//AES block 0 - round 6
7141	aese	v5.16b, v26.16b
7142	aesmc	v5.16b, v5.16b			//AES block 5 - round 6
7143
7144	aese	v2.16b, v26.16b
7145	aesmc	v2.16b, v2.16b			//AES block 2 - round 6
7146	aese	v1.16b, v26.16b
7147	aesmc	v1.16b, v1.16b			//AES block 1 - round 6
7148	ldp	q28, q26, [x8, #128]				//load rk8, rk9
7149
7150	aese	v5.16b, v27.16b
7151	aesmc	v5.16b, v5.16b			//AES block 5 - round 7
7152	aese	v0.16b, v27.16b
7153	aesmc	v0.16b, v0.16b			//AES block 0 - round 7
7154
7155	aese	v3.16b, v27.16b
7156	aesmc	v3.16b, v3.16b			//AES block 3 - round 7
7157	aese	v2.16b, v27.16b
7158	aesmc	v2.16b, v2.16b			//AES block 2 - round 7
7159	aese	v7.16b, v27.16b
7160	aesmc	v7.16b, v7.16b			//AES block 7 - round 7
7161
7162	aese	v4.16b, v27.16b
7163	aesmc	v4.16b, v4.16b			//AES block 4 - round 7
7164	aese	v1.16b, v27.16b
7165	aesmc	v1.16b, v1.16b			//AES block 1 - round 7
7166	aese	v6.16b, v27.16b
7167	aesmc	v6.16b, v6.16b			//AES block 6 - round 7
7168
7169	and	x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
7170	aese	v7.16b, v28.16b
7171	aesmc	v7.16b, v7.16b			//AES block 7 - round 8
7172	aese	v5.16b, v28.16b
7173	aesmc	v5.16b, v5.16b			//AES block 5 - round 8
7174
7175	aese	v0.16b, v28.16b
7176	aesmc	v0.16b, v0.16b			//AES block 0 - round 8
7177	aese	v1.16b, v28.16b
7178	aesmc	v1.16b, v1.16b			//AES block 1 - round 8
7179	aese	v2.16b, v28.16b
7180	aesmc	v2.16b, v2.16b			//AES block 2 - round 8
7181
7182	aese	v4.16b, v28.16b
7183	aesmc	v4.16b, v4.16b			//AES block 4 - round 8
7184	aese	v3.16b, v28.16b
7185	aesmc	v3.16b, v3.16b			//AES block 3 - round 8
7186	aese	v6.16b, v28.16b
7187	aesmc	v6.16b, v6.16b			//AES block 6 - round 8
7188
7189	aese	v2.16b, v26.16b
7190	aesmc	v2.16b, v2.16b			//AES block 2 - round 9
7191
// Load the current hash state and reverse all 16 bytes (ext #8 + rev64).
7192	ld1	{ v19.16b}, [x3]
7193	ext	v19.16b, v19.16b, v19.16b, #8
7194	rev64	v19.16b, v19.16b
7195	ldp	q27, q28, [x8, #160]				//load rk10, rk11
7196	add	x4, x0, x1, lsr #3 //end_input_ptr
// x5 becomes an address: input start + size of the 128-byte-group region.
7197	add	x5, x5, x0
7198
7199	aese	v3.16b, v26.16b
7200	aesmc	v3.16b, v3.16b			//AES block 3 - round 9
7201	aese	v6.16b, v26.16b
7202	aesmc	v6.16b, v6.16b			//AES block 6 - round 9
7203
7204	aese	v4.16b, v26.16b
7205	aesmc	v4.16b, v4.16b			//AES block 4 - round 9
7206	aese	v5.16b, v26.16b
7207	aesmc	v5.16b, v5.16b			//AES block 5 - round 9
7208
7209	aese	v7.16b, v26.16b
7210	aesmc	v7.16b, v7.16b			//AES block 7 - round 9
7211
7212	aese	v0.16b, v26.16b
7213	aesmc	v0.16b, v0.16b			//AES block 0 - round 9
7214	aese	v1.16b, v26.16b
7215	aesmc	v1.16b, v1.16b			//AES block 1 - round 9
7216
7217	aese	v4.16b, v27.16b
7218	aesmc	v4.16b, v4.16b			//AES block 4 - round 10
7219	aese	v7.16b, v27.16b
7220	aesmc	v7.16b, v7.16b			//AES block 7 - round 10
7221	aese	v5.16b, v27.16b
7222	aesmc	v5.16b, v5.16b			//AES block 5 - round 10
7223
7224	aese	v1.16b, v27.16b
7225	aesmc	v1.16b, v1.16b			//AES block 1 - round 10
7226	aese	v2.16b, v27.16b
7227	aesmc	v2.16b, v2.16b			//AES block 2 - round 10
7228	aese	v0.16b, v27.16b
7229	aesmc	v0.16b, v0.16b			//AES block 0 - round 10
7230
7231	aese	v6.16b, v27.16b
7232	aesmc	v6.16b, v6.16b			//AES block 6 - round 10
7233	aese	v3.16b, v27.16b
7234	aesmc	v3.16b, v3.16b			//AES block 3 - round 10
7235	ldp	q26, q27, [x8, #192]				//load rk12, rk13
7236
7237	aese	v0.16b, v28.16b
7238	aesmc	v0.16b, v0.16b			//AES block 0 - round 11
// Counter block 7 was already taken from v30; this advances v30 to the value
// used for block 8.
7239	add	v30.4s, v30.4s, v31.4s //CTR block 7
7240
7241	aese	v7.16b, v28.16b
7242	aesmc	v7.16b, v7.16b			//AES block 7 - round 11
7243	aese	v3.16b, v28.16b
7244	aesmc	v3.16b, v3.16b			//AES block 3 - round 11
7245	aese	v1.16b, v28.16b
7246	aesmc	v1.16b, v1.16b			//AES block 1 - round 11
7247
7248	aese	v5.16b, v28.16b
7249	aesmc	v5.16b, v5.16b			//AES block 5 - round 11
7250	aese	v4.16b, v28.16b
7251	aesmc	v4.16b, v4.16b			//AES block 4 - round 11
7252	aese	v2.16b, v28.16b
7253	aesmc	v2.16b, v2.16b			//AES block 2 - round 11
7254
7255	aese	v6.16b, v28.16b
7256	aesmc	v6.16b, v6.16b			//AES block 6 - round 11
7257	ldr	q28, [x8, #224]					//load rk14
7258
7259	aese	v1.16b, v26.16b
7260	aesmc	v1.16b, v1.16b			//AES block 1 - round 12
7261	aese	v4.16b, v26.16b
7262	aesmc	v4.16b, v4.16b			//AES block 4 - round 12
7263	aese	v5.16b, v26.16b
7264	aesmc	v5.16b, v5.16b			//AES block 5 - round 12
7265
// Flags set here are consumed by the b.ge after round 13; the AES
// instructions in between do not modify the condition flags.
7266	cmp	x0, x5				//check if we have <= 8 blocks
7267	aese	v3.16b, v26.16b
7268	aesmc	v3.16b, v3.16b			//AES block 3 - round 12
7269	aese	v2.16b, v26.16b
7270	aesmc	v2.16b, v2.16b			//AES block 2 - round 12
7271
7272	aese	v6.16b, v26.16b
7273	aesmc	v6.16b, v6.16b			//AES block 6 - round 12
7274	aese	v0.16b, v26.16b
7275	aesmc	v0.16b, v0.16b			//AES block 0 - round 12
7276	aese	v7.16b, v26.16b
7277	aesmc	v7.16b, v7.16b			//AES block 7 - round 12
7278
7279	aese	v5.16b, v27.16b						//AES block 5 - round 13
7280	aese	v1.16b, v27.16b						//AES block 1 - round 13
7281	aese	v2.16b, v27.16b						//AES block 2 - round 13
7282
7283	aese	v0.16b, v27.16b						//AES block 0 - round 13
7284	aese	v4.16b, v27.16b						//AES block 4 - round 13
7285	aese	v6.16b, v27.16b						//AES block 6 - round 13
7286
7287	aese	v3.16b, v27.16b						//AES block 3 - round 13
7288	aese	v7.16b, v27.16b						//AES block 7 - round 13
// NOTE(review): b.ge is a signed comparison of two addresses; it matches the
// unsigned result as long as x0 and x5 lie in the same half of the address
// space.
7289	b.ge	.L256_dec_tail						//handle tail
7290
// First full group of eight blocks: load the ciphertext into v8-v15, form the
// plaintext as ciphertext ^ keystream ^ rk14 (v28) and store it. The
// ciphertext stays in v8-v15, and v0-v4 are refilled with counter blocks 8-12.
7291	ldp	q8, q9, [x0], #32			//AES block 0, 1 - load ciphertext
7292
7293	ldp	q10, q11, [x0], #32			//AES block 2, 3 - load ciphertext
7294
7295	ldp	q12, q13, [x0], #32			//AES block 4, 5 - load ciphertext
7296
7297	ldp	q14, q15, [x0], #32			//AES block 6, 7 - load ciphertext
7298	cmp	x0, x5				//check if we have <= 8 blocks
7299
7300.inst	0xce017121	//eor3 v1.16b, v9.16b, v1.16b, v28.16b				//AES block 1 - result
7301.inst	0xce007100	//eor3 v0.16b, v8.16b, v0.16b, v28.16b				//AES block 0 - result
7302	stp	q0, q1, [x2], #32			//AES block 0, 1 - store result
7303
7304	rev32	v0.16b, v30.16b				//CTR block 8
7305	add	v30.4s, v30.4s, v31.4s		//CTR block 8
7306.inst	0xce037163	//eor3 v3.16b, v11.16b, v3.16b, v28.16b				//AES block 3 - result
7307
7308.inst	0xce0571a5	//eor3 v5.16b, v13.16b, v5.16b, v28.16b				//AES block 5 - result
7309
7310.inst	0xce047184	//eor3 v4.16b, v12.16b, v4.16b, v28.16b				//AES block 4 - result
7311	rev32	v1.16b, v30.16b				//CTR block 9
7312	add	v30.4s, v30.4s, v31.4s		//CTR block 9
7313
7314.inst	0xce027142	//eor3 v2.16b, v10.16b, v2.16b, v28.16b				//AES block 2 - result
7315	stp	q2, q3, [x2], #32			//AES block 2, 3 - store result
7316
7317	rev32	v2.16b, v30.16b				//CTR block 10
7318	add	v30.4s, v30.4s, v31.4s		//CTR block 10
7319
7320.inst	0xce0671c6	//eor3 v6.16b, v14.16b, v6.16b, v28.16b				//AES block 6 - result
7321
7322	rev32	v3.16b, v30.16b				//CTR block 11
7323	add	v30.4s, v30.4s, v31.4s		//CTR block 11
7324	stp	q4, q5, [x2], #32			//AES block 4, 5 - store result
7325
7326.inst	0xce0771e7	//eor3 v7.16b, v15.16b, v7.16b, v28.16b				//AES block 7 - result
7327	stp	q6, q7, [x2], #32			//AES block 6, 7 - store result
7328
7329	rev32	v4.16b, v30.16b				//CTR block 12
7330	add	v30.4s, v30.4s, v31.4s		//CTR block 12
// Uses the flags from the cmp issued right after the ciphertext loads.
7331	b.ge	.L256_dec_prepretail					//do prepretail
7332
7333.L256_dec_main_loop:	//main	loop start
7334	rev32	v5.16b, v30.16b				//CTR block 8k+13
7335	ldp	q26, q27, [x8, #0]					//load rk0, rk1
7336	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
7337
7338	rev64	v9.16b, v9.16b						//GHASH block 8k+1
7339	ldr	q23, [x3, #176]				//load h7l | h7h
7340	ext	v23.16b, v23.16b, v23.16b, #8
7341	ldr	q25, [x3, #208]				//load h8l | h8h
7342	ext	v25.16b, v25.16b, v25.16b, #8
7343
7344	rev32	v6.16b, v30.16b				//CTR block 8k+14
7345	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
7346	rev64	v8.16b, v8.16b						//GHASH block 8k
7347
7348	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
7349	rev64	v12.16b, v12.16b						//GHASH block 8k+4
7350	rev64	v11.16b, v11.16b						//GHASH block 8k+3
7351
7352	rev32	v7.16b, v30.16b				//CTR block 8k+15
7353	rev64	v15.16b, v15.16b						//GHASH block 8k+7
7354
7355	aese	v3.16b, v26.16b
7356	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
7357	aese	v6.16b, v26.16b
7358	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
7359	aese	v2.16b, v26.16b
7360	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
7361
7362	aese	v7.16b, v26.16b
7363	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
7364	aese	v0.16b, v26.16b
7365	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
7366	aese	v5.16b, v26.16b
7367	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
7368
7369	aese	v4.16b, v26.16b
7370	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
7371	aese	v1.16b, v26.16b
7372	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
7373	ldp	q28, q26, [x8, #32]				//load rk2, rk3
7374
7375	eor	v8.16b, v8.16b, v19.16b					//PRE 1
7376	ldr	q20, [x3, #128]				//load h5l | h5h
7377	ext	v20.16b, v20.16b, v20.16b, #8
7378	ldr	q22, [x3, #160]				//load h6l | h6h
7379	ext	v22.16b, v22.16b, v22.16b, #8
7380	aese	v6.16b, v27.16b
7381	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
7382
7383	aese	v4.16b, v27.16b
7384	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
7385	rev64	v10.16b, v10.16b						//GHASH block 8k+2
7386	aese	v3.16b, v27.16b
7387	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
7388
7389	aese	v0.16b, v27.16b
7390	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
7391	aese	v5.16b, v27.16b
7392	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
7393	aese	v2.16b, v27.16b
7394	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
7395
7396	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
7397	aese	v7.16b, v27.16b
7398	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
7399	aese	v1.16b, v27.16b
7400	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
7401
7402	aese	v4.16b, v28.16b
7403	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
7404	aese	v0.16b, v28.16b
7405	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
7406	aese	v3.16b, v28.16b
7407	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
7408
7409	aese	v6.16b, v28.16b
7410	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
7411	aese	v7.16b, v28.16b
7412	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
7413	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
7414
7415	aese	v5.16b, v28.16b
7416	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
7417	aese	v2.16b, v28.16b
7418	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
7419	aese	v1.16b, v28.16b
7420	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
7421
7422	ldp	q27, q28, [x8, #64]				//load rk4, rk5
7423	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
7424	aese	v3.16b, v26.16b
7425	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
7426
7427	aese	v0.16b, v26.16b
7428	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
7429	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
7430	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
7431
7432	aese	v5.16b, v26.16b
7433	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
7434	aese	v6.16b, v26.16b
7435	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
7436	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
7437
7438	aese	v4.16b, v26.16b
7439	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
7440	aese	v1.16b, v26.16b
7441	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
7442	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
7443
7444	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
7445	aese	v2.16b, v26.16b
7446	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
7447	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
7448
7449	aese	v5.16b, v27.16b
7450	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
7451	aese	v7.16b, v26.16b
7452	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
7453	aese	v3.16b, v27.16b
7454	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
7455
7456	aese	v2.16b, v27.16b
7457	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
7458	aese	v0.16b, v27.16b
7459	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
7460	aese	v1.16b, v27.16b
7461	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
7462
7463	aese	v6.16b, v27.16b
7464	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
7465	aese	v7.16b, v27.16b
7466	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
7467	aese	v4.16b, v27.16b
7468	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
7469
7470	ldr	q21, [x3, #144]				//load h6k | h5k
7471	ldr	q24, [x3, #192]				//load h8k | h7k
7472	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
7473	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
7474
7475	ldp	q26, q27, [x8, #96]				//load rk6, rk7
7476	aese	v5.16b, v28.16b
7477	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
7478	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
7479
7480	aese	v0.16b, v28.16b
7481	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
7482	aese	v3.16b, v28.16b
7483	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
7484	aese	v7.16b, v28.16b
7485	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
7486
7487	aese	v1.16b, v28.16b
7488	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
7489	aese	v2.16b, v28.16b
7490	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
7491	aese	v6.16b, v28.16b
7492	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
7493
7494.inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
7495	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
7496	rev64	v13.16b, v13.16b						//GHASH block 8k+5
7497
7498	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
7499	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
7500	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
7501
7502	aese	v3.16b, v26.16b
7503	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
7504	aese	v0.16b, v26.16b
7505	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
7506	aese	v4.16b, v28.16b
7507	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
7508
7509	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
7510	aese	v1.16b, v26.16b
7511	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
7512	aese	v6.16b, v26.16b
7513	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
7514
7515	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
7516	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
7517	aese	v4.16b, v26.16b
7518	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
7519
7520	aese	v2.16b, v26.16b
7521	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
7522	aese	v5.16b, v26.16b
7523	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
7524	aese	v7.16b, v26.16b
7525	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
7526
7527	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
7528	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
7529.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
7530
7531	ldr	q23, [x3, #80]				//load h3l | h3h
7532	ext	v23.16b, v23.16b, v23.16b, #8
7533	ldr	q25, [x3, #112]				//load h4l | h4h
7534	ext	v25.16b, v25.16b, v25.16b, #8
7535	rev64	v14.16b, v14.16b						//GHASH block 8k+6
7536	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
7537
7538	aese	v2.16b, v27.16b
7539	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
7540	aese	v5.16b, v27.16b
7541	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
7542	ldp	q28, q26, [x8, #128]				//load rk8, rk9
7543
7544	ldr	q20, [x3, #32]				//load h1l | h1h
7545	ext	v20.16b, v20.16b, v20.16b, #8
7546	ldr	q22, [x3, #64]				//load h2l | h2h
7547	ext	v22.16b, v22.16b, v22.16b, #8
7548.inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
7549	aese	v7.16b, v27.16b
7550	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
7551
7552	aese	v1.16b, v27.16b
7553	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
7554	aese	v3.16b, v27.16b
7555	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
7556	aese	v6.16b, v27.16b
7557	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
7558
7559	ldr	q21, [x3, #48]				//load h2k | h1k
7560	ldr	q24, [x3, #96]				//load h4k | h3k
7561	aese	v0.16b, v27.16b
7562	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
7563	aese	v4.16b, v27.16b
7564	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
7565
7566	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
7567	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
7568	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
7569
7570	aese	v5.16b, v28.16b
7571	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
7572	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
7573	aese	v2.16b, v28.16b
7574	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
7575
7576	aese	v6.16b, v28.16b
7577	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
7578	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
7579	aese	v1.16b, v28.16b
7580	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
7581
7582	aese	v4.16b, v28.16b
7583	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
7584	aese	v0.16b, v28.16b
7585	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
7586	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
7587
7588	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
7589	aese	v3.16b, v28.16b
7590	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
7591	aese	v7.16b, v28.16b
7592	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
7593
7594	ldp	q27, q28, [x8, #160]				//load rk10, rk11
7595	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
7596	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
7597
7598	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
7599.inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
7600	aese	v3.16b, v26.16b
7601	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 9
7602
7603	aese	v6.16b, v26.16b
7604	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 9
7605	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
7606	aese	v5.16b, v26.16b
7607	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 9
7608
7609	ldp	q8, q9, [x0], #32			//AES block 8k+8, 8k+9 - load ciphertext
7610	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
7611	aese	v7.16b, v26.16b
7612	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 9
7613
7614	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
7615	aese	v2.16b, v26.16b
7616	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 9
7617	aese	v1.16b, v26.16b
7618	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 9
7619
7620	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
7621	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
7622	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
7623
7624	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
7625	aese	v3.16b, v27.16b
7626	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 10
7627	aese	v6.16b, v27.16b
7628	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 10
7629
7630	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
7631	aese	v0.16b, v26.16b
7632	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 9
7633.inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
7634
7635	aese	v4.16b, v26.16b
7636	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 9
7637.inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
7638.inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
7639
7640	aese	v2.16b, v27.16b
7641	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 10
7642	aese	v5.16b, v27.16b
7643	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 10
7644	aese	v7.16b, v27.16b
7645	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 10
7646
7647	aese	v1.16b, v27.16b
7648	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 10
7649	aese	v0.16b, v27.16b
7650	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 10
7651	aese	v4.16b, v27.16b
7652	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 10
7653
7654.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
7655	rev32	v20.16b, v30.16b					//CTR block 8k+16
7656	ldr	d16, [x10]			//MODULO - load modulo constant
7657
7658	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+16
7659	aese	v1.16b, v28.16b
7660	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 11
7661	ldp	q26, q27, [x8, #192]				//load rk12, rk13
7662
7663	aese	v0.16b, v28.16b
7664	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 11
7665	aese	v6.16b, v28.16b
7666	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 11
7667
7668.inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
7669	rev32	v22.16b, v30.16b					//CTR block 8k+17
7670	aese	v2.16b, v28.16b
7671	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 11
7672
7673	ldp	q10, q11, [x0], #32			//AES block 8k+10, 8k+11 - load ciphertext
7674	aese	v7.16b, v28.16b
7675	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 11
7676	ext	v21.16b, v17.16b, v17.16b, #8				 //MODULO - other top alignment
7677
7678	aese	v5.16b, v28.16b
7679	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 11
7680	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+17
7681	aese	v3.16b, v28.16b
7682	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 11
7683
7684	aese	v2.16b, v26.16b
7685	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 12
7686	aese	v7.16b, v26.16b
7687	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 12
7688	aese	v6.16b, v26.16b
7689	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 12
7690
7691	rev32	v23.16b, v30.16b					//CTR block 8k+18
7692	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+18
7693	pmull	v29.1q, v17.1d, v16.1d			//MODULO - top 64b align with mid
7694
7695.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
7696	aese	v1.16b, v26.16b
7697	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 12
7698	aese	v4.16b, v28.16b
7699	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 11
7700
7701	ldr	q28, [x8, #224]					//load rk14
7702	aese	v5.16b, v26.16b
7703	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 12
7704	aese	v3.16b, v26.16b
7705	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 12
7706
7707.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
7708	aese	v0.16b, v26.16b
7709	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 12
7710	aese	v4.16b, v26.16b
7711	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 12
7712
7713	ldp	q12, q13, [x0], #32			//AES block 8k+12, 8k+13 - load ciphertext
7714	aese	v1.16b, v27.16b						//AES block 8k+9 - round 13
7715	aese	v2.16b, v27.16b						//AES block 8k+10 - round 13
7716
7717	ldp	q14, q15, [x0], #32			//AES block 8k+14, 8k+15 - load ciphertext
7718	aese	v0.16b, v27.16b						//AES block 8k+8 - round 13
7719	aese	v5.16b, v27.16b						//AES block 8k+13 - round 13
7720
7721	rev32	v25.16b, v30.16b					//CTR block 8k+19
7722.inst	0xce027142	//eor3 v2.16b, v10.16b, v2.16b, v28.16b				//AES block 8k+10 - result
7723.inst	0xce017121	//eor3 v1.16b, v9.16b, v1.16b, v28.16b				//AES block 8k+9 - result
7724
7725	ext	v21.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment
7726	aese	v7.16b, v27.16b						//AES block 8k+15 - round 13
7727
7728	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+19
7729	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
7730	aese	v4.16b, v27.16b						//AES block 8k+12 - round 13
7731
7732.inst	0xce0571a5	//eor3 v5.16b, v13.16b, v5.16b, v28.16b				//AES block 8k+13 - result
7733.inst	0xce007100	//eor3 v0.16b, v8.16b, v0.16b, v28.16b				//AES block 8k+8 - result
7734	aese	v3.16b, v27.16b						//AES block 8k+11 - round 13
7735
7736	stp	q0, q1, [x2], #32			//AES block 8k+8, 8k+9 - store result
7737	mov	v0.16b, v20.16b					//CTR block 8k+16
7738.inst	0xce047184	//eor3 v4.16b, v12.16b, v4.16b, v28.16b				//AES block 8k+12 - result
7739
7740.inst	0xce154673	//eor3 v19.16b, v19.16b, v21.16b, v17.16b		 	//MODULO - fold into low
7741.inst	0xce037163	//eor3 v3.16b, v11.16b, v3.16b, v28.16b				//AES block 8k+11 - result
7742	stp	q2, q3, [x2], #32			//AES block 8k+10, 8k+11 - store result
7743
7744	mov	v3.16b, v25.16b					//CTR block 8k+19
7745	mov	v2.16b, v23.16b					//CTR block 8k+18
7746	aese	v6.16b, v27.16b						//AES block 8k+14 - round 13
7747
7748	mov	v1.16b, v22.16b					//CTR block 8k+17
7749	stp	q4, q5, [x2], #32			//AES block 8k+12, 8k+13 - store result
7750.inst	0xce0771e7	//eor3 v7.16b, v15.16b, v7.16b, v28.16b				//AES block 8k+15 - result
7751
7752.inst	0xce0671c6	//eor3 v6.16b, v14.16b, v6.16b, v28.16b				//AES block 8k+14 - result
7753	rev32	v4.16b, v30.16b				//CTR block 8k+20
7754	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+20
7755
7756	cmp	x0, x5				//.LOOP CONTROL
7757	stp	q6, q7, [x2], #32			//AES block 8k+14, 8k+15 - store result
7758	b.lt	.L256_dec_main_loop
7759
// PREPRETAIL - entered by falling out of the main loop above (the b.lt is not
// taken); it may also be branched to from code outside this excerpt.
// Two jobs are interleaved here and no ciphertext is loaded or stored:
//   AES:   v5-v7 are built from the running counter v30, then rounds 0-13
//          (rk0-rk13, fetched pairwise from [x8]) are applied to v0-v7.
//          v0-v4 are expected to hold counter blocks already on entry.
//          On exit rk14 is in v28 and v30 has been stepped once more after
//          v7 was taken from it.
//   GHASH: v8-v15 (loaded as ciphertext by the main loop above) are
//          byte-reversed and multiplied by the hash-key values read from
//          [x3], accumulating into v17 (high), v18 (mid) and v19 (low).
//          The MODULO steps then fold v17 and v18 down using the constant
//          at [x10]; the result is left in v19.
// Each ldp/ldr of round keys reuses v26-v28, so it must stay after the last
// aese that reads the key it overwrites.
7760.L256_dec_prepretail:	//PREPRETAIL
7761	ldp	q26, q27, [x8, #0]					//load rk0, rk1
7762	rev32	v5.16b, v30.16b				//CTR block 8k+13
7763	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
7764
7765	rev64	v12.16b, v12.16b						//GHASH block 8k+4
7766	ldr	q21, [x3, #144]				//load h6k | h5k
7767	ldr	q24, [x3, #192]				//load h8k | h7k
7768
7769	rev32	v6.16b, v30.16b				//CTR block 8k+14
7770	rev64	v8.16b, v8.16b						//GHASH block 8k
7771	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
7772
7773	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
7774	ldr	q23, [x3, #176]				//load h7l | h7h
7775	ext	v23.16b, v23.16b, v23.16b, #8
7776	ldr	q25, [x3, #208]				//load h8l | h8h
7777	ext	v25.16b, v25.16b, v25.16b, #8
7778	rev64	v9.16b, v9.16b						//GHASH block 8k+1
7779
7780	rev32	v7.16b, v30.16b				//CTR block 8k+15
7781	rev64	v10.16b, v10.16b						//GHASH block 8k+2
7782	ldr	q20, [x3, #128]				//load h5l | h5h
7783	ext	v20.16b, v20.16b, v20.16b, #8
7784	ldr	q22, [x3, #160]				//load h6l | h6h
7785	ext	v22.16b, v22.16b, v22.16b, #8
7786
7787	aese	v0.16b, v26.16b
7788	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
7789	aese	v1.16b, v26.16b
7790	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
7791	aese	v4.16b, v26.16b
7792	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
7793
7794	aese	v3.16b, v26.16b
7795	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
7796	aese	v5.16b, v26.16b
7797	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
7798	aese	v6.16b, v26.16b
7799	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
7800
7801	aese	v4.16b, v27.16b
7802	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
7803	aese	v7.16b, v26.16b
7804	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
7805	aese	v2.16b, v26.16b
7806	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
7807
7808	ldp	q28, q26, [x8, #32]				//load rk2, rk3
7809	aese	v0.16b, v27.16b
7810	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
7811	eor	v8.16b, v8.16b, v19.16b					//PRE 1
7812
7813	aese	v7.16b, v27.16b
7814	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
7815	aese	v6.16b, v27.16b
7816	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
7817	aese	v2.16b, v27.16b
7818	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
7819
7820	aese	v3.16b, v27.16b
7821	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
7822	aese	v1.16b, v27.16b
7823	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
7824	aese	v5.16b, v27.16b
7825	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
7826
7827	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
7828	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
7829	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
7830
7831	rev64	v11.16b, v11.16b						//GHASH block 8k+3
7832	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
7833
7834	aese	v5.16b, v28.16b
7835	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
7836	aese	v7.16b, v28.16b
7837	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
7838	aese	v1.16b, v28.16b
7839	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
7840
7841	aese	v3.16b, v28.16b
7842	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
7843	aese	v6.16b, v28.16b
7844	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
7845	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
7846
7847	aese	v0.16b, v28.16b
7848	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
7849	aese	v7.16b, v26.16b
7850	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
7851
7852	aese	v5.16b, v26.16b
7853	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
7854	rev64	v14.16b, v14.16b						//GHASH block 8k+6
7855
7856	aese	v0.16b, v26.16b
7857	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
7858	aese	v2.16b, v28.16b
7859	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
7860	aese	v6.16b, v26.16b
7861	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
7862
7863	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
7864	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
7865	aese	v4.16b, v28.16b
7866	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
7867
7868	ldp	q27, q28, [x8, #64]				//load rk4, rk5
7869	aese	v1.16b, v26.16b
7870	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
7871	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
7872
7873	aese	v2.16b, v26.16b
7874	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
7875	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
7876	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
7877
7878	aese	v4.16b, v26.16b
7879	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
7880	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
7881	aese	v3.16b, v26.16b
7882	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
7883
7884.inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
7885	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
7886	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
7887
7888	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
7889	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
7890	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
7891
7892	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
7893	aese	v5.16b, v27.16b
7894	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
7895	aese	v0.16b, v27.16b
7896	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
7897
7898.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
7899	ldr	q20, [x3, #32]				//load h1l | h1h
7900	ext	v20.16b, v20.16b, v20.16b, #8
7901	ldr	q22, [x3, #64]				//load h2l | h2h
7902	ext	v22.16b, v22.16b, v22.16b, #8
7903	aese	v7.16b, v27.16b
7904	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
7905
7906	aese	v2.16b, v27.16b
7907	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
7908	aese	v6.16b, v27.16b
7909	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
7910	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
7911
7912	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
7913	aese	v7.16b, v28.16b
7914	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
7915	aese	v1.16b, v27.16b
7916	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
7917
7918	aese	v2.16b, v28.16b
7919	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
7920	aese	v3.16b, v27.16b
7921	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
7922	aese	v4.16b, v27.16b
7923	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
7924
7925	aese	v1.16b, v28.16b
7926	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
7927	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
7928	aese	v6.16b, v28.16b
7929	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
7930
7931	aese	v4.16b, v28.16b
7932	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
7933	aese	v3.16b, v28.16b
7934	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
7935	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
7936
7937	aese	v0.16b, v28.16b
7938	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
7939	aese	v5.16b, v28.16b
7940	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
7941	ldp	q26, q27, [x8, #96]				//load rk6, rk7
7942
7943	ldr	q23, [x3, #80]				//load h3l | h3h
7944	ext	v23.16b, v23.16b, v23.16b, #8
7945	ldr	q25, [x3, #112]				//load h4l | h4h
7946	ext	v25.16b, v25.16b, v25.16b, #8
7947	rev64	v15.16b, v15.16b						//GHASH block 8k+7
7948	rev64	v13.16b, v13.16b						//GHASH block 8k+5
7949
7950.inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
7951
7952	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
7953
7954	aese	v0.16b, v26.16b
7955	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
7956	ldr	q21, [x3, #48]				//load h2k | h1k
7957	ldr	q24, [x3, #96]				//load h4k | h3k
7958	aese	v6.16b, v26.16b
7959	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
7960
7961	aese	v5.16b, v26.16b
7962	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
7963	aese	v7.16b, v26.16b
7964	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
7965
7966	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
7967	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
7968	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
7969
7970	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
7971	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
7972	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
7973
7974	aese	v7.16b, v27.16b
7975	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
7976	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
7977	aese	v1.16b, v26.16b
7978	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
7979
7980	aese	v2.16b, v26.16b
7981	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
7982	aese	v3.16b, v26.16b
7983	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
7984	aese	v4.16b, v26.16b
7985	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
7986
7987	ldp	q28, q26, [x8, #128]				//load rk8, rk9
7988	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
7989	aese	v5.16b, v27.16b
7990	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
7991
7992	aese	v1.16b, v27.16b
7993	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
7994	aese	v4.16b, v27.16b
7995	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
7996
7997	aese	v6.16b, v27.16b
7998	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
7999	aese	v2.16b, v27.16b
8000	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
8001.inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
8002
8003	aese	v0.16b, v27.16b
8004	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
8005	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
8006	aese	v3.16b, v27.16b
8007	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
8008
8009	aese	v0.16b, v28.16b
8010	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
8011	aese	v7.16b, v28.16b
8012	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
8013	aese	v4.16b, v28.16b
8014	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
8015
8016	aese	v1.16b, v28.16b
8017	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
8018	aese	v5.16b, v28.16b
8019	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
8020	aese	v6.16b, v28.16b
8021	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
8022
8023	aese	v3.16b, v28.16b
8024	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
8025	aese	v4.16b, v26.16b
8026	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 9
8027	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
8028
8029	aese	v0.16b, v26.16b
8030	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 9
8031	aese	v1.16b, v26.16b
8032	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 9
8033	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
8034
8035	aese	v6.16b, v26.16b
8036	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 9
8037	aese	v7.16b, v26.16b
8038	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 9
8039	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
8040
8041	aese	v2.16b, v28.16b
8042	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
8043	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
8044	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
8045
8046	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
8047	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
8048	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
8049
8050	ldp	q27, q28, [x8, #160]				//load rk10, rk11
8051.inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
8052.inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
8053
8054	aese	v2.16b, v26.16b
8055	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 9
8056	aese	v3.16b, v26.16b
8057	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 9
8058	aese	v5.16b, v26.16b
8059	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 9
8060
8061.inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
8062.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
8063	ldr	d16, [x10]			//MODULO - load modulo constant
8064
8065.inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
8066
8067	aese	v4.16b, v27.16b
8068	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 10
8069	aese	v6.16b, v27.16b
8070	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 10
8071	aese	v5.16b, v27.16b
8072	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 10
8073
8074	aese	v0.16b, v27.16b
8075	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 10
8076	aese	v2.16b, v27.16b
8077	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 10
8078	aese	v3.16b, v27.16b
8079	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 10
8080
8081.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
8082
8083	aese	v7.16b, v27.16b
8084	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 10
8085	aese	v1.16b, v27.16b
8086	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 10
8087	ldp	q26, q27, [x8, #192]				//load rk12, rk13
8088
8089	ext	v21.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment
8090
8091	aese	v2.16b, v28.16b
8092	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 11
8093	aese	v1.16b, v28.16b
8094	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 11
8095	aese	v0.16b, v28.16b
8096	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 11
8097
8098	pmull	v29.1q, v17.1d, v16.1d			//MODULO - top 64b align with mid
8099	aese	v3.16b, v28.16b
8100	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 11
8101
8102	aese	v7.16b, v28.16b
8103	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 11
8104	aese	v6.16b, v28.16b
8105	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 11
8106	aese	v4.16b, v28.16b
8107	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 11
8108
8109	aese	v5.16b, v28.16b
8110	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 11
8111	aese	v3.16b, v26.16b
8112	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 12
8113
8114.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
8115
8116	aese	v3.16b, v27.16b						//AES block 8k+11 - round 13
8117	aese	v2.16b, v26.16b
8118	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 12
8119	aese	v6.16b, v26.16b
8120	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 12
8121
8122	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
8123	aese	v4.16b, v26.16b
8124	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 12
8125	aese	v7.16b, v26.16b
8126	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 12
8127
8128	aese	v0.16b, v26.16b
8129	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 12
8130	ldr	q28, [x8, #224]					//load rk14
8131	aese	v1.16b, v26.16b
8132	aesmc	v1.16b, v1.16b	        	//AES block 8k+9 - round 12
8133
8134	aese	v4.16b, v27.16b						//AES block 8k+12 - round 13
8135	ext	v21.16b, v18.16b, v18.16b, #8			 	//MODULO - other mid alignment
8136	aese	v5.16b, v26.16b
8137	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 12
8138
8139	aese	v6.16b, v27.16b						//AES block 8k+14 - round 13
8140	aese	v2.16b, v27.16b						//AES block 8k+10 - round 13
8141	aese	v1.16b, v27.16b						//AES block 8k+9 - round 13
8142
8143	aese	v5.16b, v27.16b						//AES block 8k+13 - round 13
8144.inst	0xce154673	//eor3 v19.16b, v19.16b, v21.16b, v17.16b		 	//MODULO - fold into low
8145	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
8146
8147	aese	v7.16b, v27.16b						//AES block 8k+15 - round 13
8148	aese	v0.16b, v27.16b						//AES block 8k+8 - round 13
// TAIL - decrypts the first remaining block and picks the entry rung of the
// more_than_N ladder below according to how many bytes are left.
//   x5  = x4 - x0, the number of input bytes still to process
//   v16 = v19 with its two 64-bit halves swapped; it is XORed into the first
//         block hashed in the tail and zeroed afterwards
//   v29 = copy of v28 (rk14); every eor3 below XORs it into the result
//   v12 = ciphertext v9 ^ v0 ^ v29, the first output block
// The first b.gt consumes the flags of "cmp x5, #112"; the loads, ext, mov and
// eor3 in between do not write flags.
// Each rung that is skipped (fall-through past a b.gt) moves the still-unused
// AES outputs up by one register, so the old v1 lands in the register the
// ladder consumes next, and takes one increment (v31) back off counter v30.
// With 7 or fewer blocks left v17/v18/v19 are zeroed here because the ladder
// XOR-accumulates into them; the more_than_7 rung writes them directly.
// NOTE(review): presumably at most 8 blocks remain here, since only v0-v7
// hold AES outputs - confirm against the code that sets x4 and x5.
8149.L256_dec_tail:	//TAIL
8150
8151	ext	v16.16b, v19.16b, v19.16b, #8				//prepare final partial tag
8152	sub	x5, x4, x0		//x5 (main_end_input_ptr) now holds the number of bytes left to process
8153	cmp	x5, #112
8154
8155	ldr	q9, [x0], #16				//AES block 8k+8 - load ciphertext
8156
8157	ldp	q24, q25, [x3, #192]			//load h8k | h7k (q24) and h8l | h8h (q25)
8158	ext	v25.16b, v25.16b, v25.16b, #8
8159	mov	v29.16b, v28.16b
8160
8161	ldp	q20, q21, [x3, #128]			//load h5l | h5h (q20) and h6k | h5k (q21)
8162	ext	v20.16b, v20.16b, v20.16b, #8
8163
8164.inst	0xce00752c	//eor3 v12.16b, v9.16b, v0.16b, v29.16b				//AES block 8k+8 - result
8165	ldp	q22, q23, [x3, #160]			//load h6l | h6h (q22) and h7l | h7h (q23)
8166	ext	v22.16b, v22.16b, v22.16b, #8
8167	ext	v23.16b, v23.16b, v23.16b, #8
8168	b.gt	.L256_dec_blocks_more_than_7
8169
8170	mov	v7.16b, v6.16b
8171	sub	v30.4s, v30.4s, v31.4s
8172	mov	v6.16b, v5.16b
8173
8174	mov	v5.16b, v4.16b
8175	mov	v4.16b, v3.16b
8176	movi	v19.8b, #0
8177
8178	movi	v17.8b, #0
8179	movi	v18.8b, #0
8180	mov	v3.16b, v2.16b
8181
8182	cmp	x5, #96
8183	mov	v2.16b, v1.16b
8184	b.gt	.L256_dec_blocks_more_than_6
8185
8186	mov	v7.16b, v6.16b
8187	mov	v6.16b, v5.16b
8188
8189	mov	v5.16b, v4.16b
8190	cmp	x5, #80
8191	sub	v30.4s, v30.4s, v31.4s
8192
8193	mov	v4.16b, v3.16b
8194	mov	v3.16b, v1.16b
8195	b.gt	.L256_dec_blocks_more_than_5
8196
8197	cmp	x5, #64
8198	mov	v7.16b, v6.16b
8199	sub	v30.4s, v30.4s, v31.4s
8200
8201	mov	v6.16b, v5.16b
8202
8203	mov	v5.16b, v4.16b
8204	mov	v4.16b, v1.16b
8205	b.gt	.L256_dec_blocks_more_than_4
8206
8207	sub	v30.4s, v30.4s, v31.4s
8208	mov	v7.16b, v6.16b
8209	cmp	x5, #48
8210
8211	mov	v6.16b, v5.16b
8212	mov	v5.16b, v1.16b
8213	b.gt	.L256_dec_blocks_more_than_3
8214
8215	ldr	q24, [x3, #96]				//load h4k | h3k
8216	sub	v30.4s, v30.4s, v31.4s
8217	mov	v7.16b, v6.16b
8218
8219	cmp	x5, #32
8220	mov	v6.16b, v1.16b
8221	b.gt	.L256_dec_blocks_more_than_2
8222
8223	sub	v30.4s, v30.4s, v31.4s
8224
8225	mov	v7.16b, v1.16b
8226	cmp	x5, #16
8227	b.gt	.L256_dec_blocks_more_than_1
8228
8229	sub	v30.4s, v30.4s, v31.4s
8230	ldr	q21, [x3, #48]				//load h2k | h1k
8231	b	.L256_dec_blocks_less_than_1
// Ladder rung "final-7": on entry v9 holds this block's ciphertext and v12 its
// plaintext. The ciphertext is byte-reversed into v8 before v9 is reloaded
// with the next block, v12 is stored, and the next plaintext is formed from
// v9 ^ v1 ^ v29. GHASH multiplies v8 (after XOR with the running tag v16) by
// v25 for high/low and by the upper half of v24 for the mid term; v27 holds
// the XOR of the two halves of v8. v17/v18/v19 are written, not accumulated.
8232.L256_dec_blocks_more_than_7:	//blocks	left >  7
8233	rev64	v8.16b, v9.16b						//GHASH final-7 block
8234	ldr	q9, [x0], #16				//AES final-6 block - load ciphertext
8235	st1	{ v12.16b}, [x2], #16				//AES final-7 block  - store result
8236
8237	ins	v18.d[0], v24.d[1]					//GHASH final-7 block - mid
8238
8239	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
8240
8241	ins	v27.d[0], v8.d[1]					//GHASH final-7 block - mid
8242.inst	0xce01752c	//eor3 v12.16b, v9.16b, v1.16b, v29.16b				//AES final-6 block - result
8243
8244	pmull2	v17.1q, v8.2d, v25.2d				//GHASH final-7 block - high
8245
8246	eor	v27.8b, v27.8b, v8.8b				//GHASH final-7 block - mid
8247	movi	v16.8b, #0						//suppress further partial tag feed in
8248
8249	pmull	v19.1q, v8.1d, v25.1d				//GHASH final-7 block - low
8250	pmull	v18.1q, v27.1d, v18.1d			 	//GHASH final-7 block - mid
// Ladder rung "final-6": hashes the ciphertext in v9 (XORed with v16, which
// is non-zero only for the first block hashed in the tail) using v23 for
// high/low and the low half of v24 for the mid term, accumulating into
// v17/v18/v19. Stores the plaintext in v12, loads the next ciphertext into
// v9 and forms the next plaintext as v9 ^ v2 ^ v29.
8251.L256_dec_blocks_more_than_6:	//blocks	left >  6
8252
8253	rev64	v8.16b, v9.16b						//GHASH final-6 block
8254
8255	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
8256	ldr	q9, [x0], #16				//AES final-5 block - load ciphertext
8257	movi	v16.8b, #0						//suppress further partial tag feed in
8258
8259	ins	v27.d[0], v8.d[1]					//GHASH final-6 block - mid
8260	st1	{ v12.16b}, [x2], #16				//AES final-6 block - store result
8261	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-6 block - high
8262
8263	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-6 block - low
8264
8265.inst	0xce02752c	//eor3 v12.16b, v9.16b, v2.16b, v29.16b				//AES final-5 block - result
8266	eor	v19.16b, v19.16b, v26.16b					//GHASH final-6 block - low
8267	eor	v27.8b, v27.8b, v8.8b				//GHASH final-6 block - mid
8268
8269	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-6 block - mid
8270
8271	eor	v18.16b, v18.16b, v27.16b				//GHASH final-6 block - mid
8272	eor	v17.16b, v17.16b, v28.16b					//GHASH final-6 block - high
// Ladder rung "final-5": hashes the ciphertext in v9 (XORed with v16) using
// v22 for high/low and the upper half of v21 for the mid term; the folded
// halves of v8 are copied into v27.d[1] so pmull2 can be used. Results are
// accumulated into v17/v18/v19. Stores the plaintext in v12, loads the next
// ciphertext into v9 and forms the next plaintext as v9 ^ v3 ^ v29.
8273.L256_dec_blocks_more_than_5:	//blocks	left >  5
8274
8275	rev64	v8.16b, v9.16b						//GHASH final-5 block
8276
8277	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
8278
8279	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-5 block - high
8280	ins	v27.d[0], v8.d[1]					//GHASH final-5 block - mid
8281
8282	ldr	q9, [x0], #16				//AES final-4 block - load ciphertext
8283
8284	eor	v27.8b, v27.8b, v8.8b				//GHASH final-5 block - mid
8285	st1	{ v12.16b}, [x2], #16			  	//AES final-5 block - store result
8286
8287	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-5 block - low
8288	ins	v27.d[1], v27.d[0]					//GHASH final-5 block - mid
8289
8290	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-5 block - mid
8291
8292	eor	v17.16b, v17.16b, v28.16b					//GHASH final-5 block - high
8293.inst	0xce03752c	//eor3 v12.16b, v9.16b, v3.16b, v29.16b				//AES final-4 block - result
8294	eor	v19.16b, v19.16b, v26.16b					//GHASH final-5 block - low
8295
8296	eor	v18.16b, v18.16b, v27.16b				//GHASH final-5 block - mid
8297	movi	v16.8b, #0						//suppress further partial tag feed in
// Ladder rung "final-4": hashes the ciphertext in v9 (XORed with v16) using
// v20 for high/low and the low half of v21 for the mid term, accumulating
// into v17/v18/v19. Loads the next ciphertext into v9, stores the plaintext
// in v12 and forms the next plaintext as v9 ^ v4 ^ v29.
8298.L256_dec_blocks_more_than_4:	//blocks	left >  4
8299
8300	rev64	v8.16b, v9.16b						//GHASH final-4 block
8301
8302	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
8303
8304	ins	v27.d[0], v8.d[1]					//GHASH final-4 block - mid
8305	ldr	q9, [x0], #16				//AES final-3 block - load ciphertext
8306
8307	movi	v16.8b, #0						//suppress further partial tag feed in
8308
8309	pmull	v26.1q, v8.1d, v20.1d				//GHASH final-4 block - low
8310	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final-4 block - high
8311
8312	eor	v27.8b, v27.8b, v8.8b				//GHASH final-4 block - mid
8313
8314	eor	v17.16b, v17.16b, v28.16b					//GHASH final-4 block - high
8315
8316	pmull	v27.1q, v27.1d, v21.1d				//GHASH final-4 block - mid
8317
8318	eor	v19.16b, v19.16b, v26.16b					//GHASH final-4 block - low
8319	st1	{ v12.16b}, [x2], #16			 	//AES final-4 block - store result
8320
8321	eor	v18.16b, v18.16b, v27.16b				//GHASH final-4 block - mid
8322.inst	0xce04752c	//eor3 v12.16b, v9.16b, v4.16b, v29.16b				//AES final-3 block - result
// Ladder rung "final-3": reloads v25 from [x3, #112] and v24 from [x3, #96],
// replacing the values the earlier rungs used. Hashes the ciphertext in v9
// (XORed with v16) using v25 for high/low and the upper half of v24 for the
// mid term (via pmull2 on the duplicated v27), accumulating into v17/v18/v19.
// Stores the plaintext in v12, loads the next ciphertext into v9 and forms
// the next plaintext as v9 ^ v5 ^ v29.
8323.L256_dec_blocks_more_than_3:	//blocks	left >  3
8324
8325	ldr	q25, [x3, #112]				//load h4l | h4h
8326	ext	v25.16b, v25.16b, v25.16b, #8
8327	rev64	v8.16b, v9.16b						//GHASH final-3 block
8328
8329	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
8330	ldr	q9, [x0], #16				//AES final-2 block - load ciphertext
8331	ldr	q24, [x3, #96]				//load h4k | h3k
8332
8333	ins	v27.d[0], v8.d[1]					//GHASH final-3 block - mid
8334	st1	{ v12.16b}, [x2], #16			 	//AES final-3 block - store result
8335
8336.inst	0xce05752c	//eor3 v12.16b, v9.16b, v5.16b, v29.16b				//AES final-2 block - result
8337
8338	eor	v27.8b, v27.8b, v8.8b				//GHASH final-3 block - mid
8339
8340	ins	v27.d[1], v27.d[0]					//GHASH final-3 block - mid
8341	pmull	v26.1q, v8.1d, v25.1d				//GHASH final-3 block - low
8342	pmull2	v28.1q, v8.2d, v25.2d				//GHASH final-3 block - high
8343
8344	movi	v16.8b, #0						//suppress further partial tag feed in
8345	pmull2	v27.1q, v27.2d, v24.2d				//GHASH final-3 block - mid
8346	eor	v19.16b, v19.16b, v26.16b					//GHASH final-3 block - low
8347
8348	eor	v17.16b, v17.16b, v28.16b					//GHASH final-3 block - high
8349
8350	eor	v18.16b, v18.16b, v27.16b				//GHASH final-3 block - mid
// Ladder rung "final-2": reloads v23 from [x3, #80]; v24 must already hold
// the value from [x3, #96] (loaded by the previous rung, or by the tail
// dispatch when this rung is entered directly). Hashes the ciphertext in v9
// (XORed with v16) using v23 for high/low and the low half of v24 for the mid
// term, accumulating into v17/v18/v19. Stores the plaintext in v12, loads the
// next ciphertext into v9 and forms the next plaintext as v9 ^ v6 ^ v29.
8351.L256_dec_blocks_more_than_2:	//blocks	left >  2
8352
8353	rev64	v8.16b, v9.16b						//GHASH final-2 block
8354
8355	ldr	q23, [x3, #80]				//load h3l | h3h
8356	ext	v23.16b, v23.16b, v23.16b, #8
8357	ldr	q9, [x0], #16				//AES final-1 block - load ciphertext
8358
8359	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
8360
8361	ins	v27.d[0], v8.d[1]					//GHASH final-2 block - mid
8362
8363	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-2 block - low
8364	st1	{ v12.16b}, [x2], #16			  	//AES final-2 block - store result
8365.inst	0xce06752c	//eor3 v12.16b, v9.16b, v6.16b, v29.16b				//AES final-1 block - result
8366
8367	eor	v27.8b, v27.8b, v8.8b				//GHASH final-2 block - mid
8368	eor	v19.16b, v19.16b, v26.16b					//GHASH final-2 block - low
8369	movi	v16.8b, #0						//suppress further partial tag feed in
8370
8371	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-2 block - mid
8372	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-2 block - high
8373
8374	eor	v18.16b, v18.16b, v27.16b				//GHASH final-2 block - mid
8375	eor	v17.16b, v17.16b, v28.16b					//GHASH final-2 block - high
// Ladder rung "final-1": reloads v22 from [x3, #64] and v21 from [x3, #48].
// Hashes the ciphertext in v9 (XORed with v16) using v22 for high/low and the
// upper half of v21 for the mid term (via pmull2 on the duplicated v27),
// accumulating into v17/v18/v19. Stores the plaintext in v12, loads the
// final ciphertext block into v9 and forms its plaintext as v9 ^ v7 ^ v29;
// execution then falls through to the label that follows.
8376.L256_dec_blocks_more_than_1:	//blocks	left >  1
8377
8378	rev64	v8.16b, v9.16b						//GHASH final-1 block
8379
8380	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
8381
8382	ins	v27.d[0], v8.d[1]					//GHASH final-1 block - mid
8383	ldr	q22, [x3, #64]				//load h2l | h2h
8384	ext	v22.16b, v22.16b, v22.16b, #8
8385
8386	eor	v27.8b, v27.8b, v8.8b				//GHASH final-1 block - mid
8387	ldr	q9, [x0], #16				//AES final block - load ciphertext
8388	st1	{ v12.16b}, [x2], #16			 	//AES final-1 block - store result
8389
8390	ldr	q21, [x3, #48]				//load h2k | h1k
8391	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-1 block - low
8392
8393	ins	v27.d[1], v27.d[0]					//GHASH final-1 block - mid
8394
8395	eor	v19.16b, v19.16b, v26.16b					//GHASH final-1 block - low
8396
8397.inst	0xce07752c	//eor3 v12.16b, v9.16b, v7.16b, v29.16b				//AES final block - result
8398	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-1 block - high
8399
8400	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-1 block - mid
8401
8402	movi	v16.8b, #0						//suppress further partial tag feed in
8403	eor	v17.16b, v17.16b, v28.16b					//GHASH final-1 block - high
8404
8405	eor	v18.16b, v18.16b, v27.16b				//GHASH final-1 block - mid
8406.L256_dec_blocks_less_than_1:	//blocks	left <= 1
8407
8408	ld1	{ v26.16b}, [x2]					//load existing bytes where the possibly partial last block is to be stored
8409	mvn	x6, xzr						//temp0_x = 0xffffffffffffffff
8410	and	x1, x1, #127				//bit_length %= 128
8411
8412	sub	x1, x1, #128				//bit_length -= 128
8413	rev32	v30.16b, v30.16b
8414	str	q30, [x16]					//store the updated counter
8415
8416	neg	x1, x1				//bit_length = 128 - #bits in input (in range [1,128])
8417
8418	and	x1, x1, #127			 	//bit_length %= 128
8419
8420	lsr	x6, x6, x1				//temp0_x is mask for top 64b of last block
8421	cmp	x1, #64
8422	mvn	x7, xzr						//temp1_x = 0xffffffffffffffff
8423
8424	csel	x14, x6, xzr, lt
8425	csel	x13, x7, x6, lt
8426
8427	mov	v0.d[0], x13					//ctr0b is mask for last block
8428	mov	v0.d[1], x14
8429
8430	and	v9.16b, v9.16b, v0.16b					//possibly partial last block has zeroes in highest bits
8431	ldr	q20, [x3, #32]				//load h1l | h1h
8432	ext	v20.16b, v20.16b, v20.16b, #8
8433	bif	v12.16b, v26.16b, v0.16b					//insert existing bytes in top end of result before storing
8434
8435	rev64	v8.16b, v9.16b						//GHASH final block
8436
8437	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
8438
8439	ins	v16.d[0], v8.d[1]					//GHASH final block - mid
8440	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final block - high
8441
8442	eor	v16.8b, v16.8b, v8.8b				//GHASH final block - mid
8443
8444	pmull	v26.1q, v8.1d, v20.1d				//GHASH final block - low
8445	eor	v17.16b, v17.16b, v28.16b					//GHASH final block - high
8446
8447	pmull	v16.1q, v16.1d, v21.1d				//GHASH final block - mid
8448
8449	eor	v18.16b, v18.16b, v16.16b				//GHASH final block - mid
8450	ldr	d16, [x10]			//MODULO - load modulo constant
8451	eor	v19.16b, v19.16b, v26.16b					//GHASH final block - low
8452
8453	pmull	v21.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
8454	eor	v14.16b, v17.16b, v19.16b				//MODULO - karatsuba tidy up
8455
8456	ext	v17.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment
8457	st1	{ v12.16b}, [x2]				//store all 16B
8458
8459	eor	v18.16b, v18.16b, v14.16b				//MODULO - karatsuba tidy up
8460
8461	eor	v21.16b, v17.16b, v21.16b				//MODULO - fold into mid
8462	eor	v18.16b, v18.16b, v21.16b				//MODULO - fold into mid
8463
8464	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
8465
8466	ext	v18.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment
8467	eor	v19.16b, v19.16b, v17.16b				//MODULO - fold into low
8468
8469	eor	v19.16b, v19.16b, v18.16b				//MODULO - fold into low
8470	ext	v19.16b, v19.16b, v19.16b, #8
8471	rev64	v19.16b, v19.16b
8472	st1	{ v19.16b }, [x3]
8473	mov	x0, x9
8474
8475	ldp	d10, d11, [sp, #16]
8476	ldp	d12, d13, [sp, #32]
8477	ldp	d14, d15, [sp, #48]
8478	ldp	d8, d9, [sp], #80
8479	ret
8480
8481.L256_dec_ret:
8482	mov	w0, #0x0
8483	ret
8484.size	unroll8_eor3_aes_gcm_dec_256_kernel,.-unroll8_eor3_aes_gcm_dec_256_kernel
8485.byte	65,69,83,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,65,82,77,118,56,44,32,83,80,68,88,32,66,83,68,45,51,45,67,108,97,117,115,101,32,98,121,32,60,120,105,97,111,107,97,110,103,46,113,105,97,110,64,97,114,109,46,99,111,109,62,0
8486.align	2
8487.align	2
8488#endif
8489